commit 7972af0697c09bd96e80fe0b922324aa8b630498 Author: Jiang Bo Date: Mon Jan 11 18:16:03 2021 +0800 Initial Commit for VERSION 1.1.28 Signed-off-by: Jiang Bo diff --git a/BUILD.bazel b/BUILD.bazel new file mode 100644 index 0000000..17f39de --- /dev/null +++ b/BUILD.bazel @@ -0,0 +1,51 @@ +package( + default_visibility = ["//visibility:public"], + features = ["-parse_headers"], +) + +cc_library( + name = "tim-vx_interface", + copts = ["-std=c++14", "-Werror", "-fvisibility=default"], + includes = [ + "include", + "src/tim/vx", + ], + hdrs = [ + "include/tim/vx/context.h", + "include/tim/vx/graph.h", + "include/tim/vx/operation.h", + "include/tim/vx/tensor.h", + "include/tim/vx/types.h", + ] + glob([ + "include/tim/vx/ops/*.h" + ]), + srcs = [ + "src/tim/vx/context_private.h", + "src/tim/vx/context.cc", + "src/tim/vx/graph_private.h", + "src/tim/vx/graph.cc", + "src/tim/vx/operation.cc", + "src/tim/vx/operation_private.h", + "src/tim/vx/tensor.cc", + "src/tim/vx/tensor_private.h", + "src/tim/vx/type_utils.h", + "src/tim/vx/type_utils.cc", + ] + glob([ + "src/tim/vx/ops/*.cc" + ]), + deps = [ + "//src/tim/vx/internal:ovxlibimpl", + ], + linkstatic = True, + strip_include_prefix = "include", +) + +cc_binary( + name = "libtim-vx.so", + linkshared = True, + linkstatic = False, + deps = [ + "tim-vx_interface", + ], +) + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c4d7a54 --- /dev/null +++ b/LICENSE @@ -0,0 +1,23 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..cc9a29d --- /dev/null +++ b/README.md @@ -0,0 +1,36 @@ +# TIM-VX - Tensor Interface Module for OpenVX + +TIM-VX is a software integration module provided by VeriSilicon to facilitate deployment of Neural-Networks on OpenVX enabled ML accelerators. It serves as the backend binding for runtime frameworks such as Android NN, Tensorflow-Lite, MLIR, TVM and more. 
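+
+A minimal usage sketch of the C++ binding declared under `include/tim/vx/` (the `Relu` layer, tensor shape and data values below are hypothetical and chosen only for illustration):
+
+```cpp
+#include <vector>
+
+#include "tim/vx/context.h"
+#include "tim/vx/graph.h"
+#include "tim/vx/ops/activations.h"
+#include "tim/vx/tensor.h"
+
+int main() {
+  auto context = tim::vx::Context::Create();
+  auto graph = context->CreateGraph();
+
+  // Describe a small float tensor used for both the graph input and output.
+  tim::vx::ShapeType shape({4, 1});
+  tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32, shape,
+                                 tim::vx::TensorAttribute::INPUT);
+  tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32, shape,
+                                  tim::vx::TensorAttribute::OUTPUT);
+  auto input = graph->CreateTensor(input_spec);
+  auto output = graph->CreateTensor(output_spec);
+
+  // Build a single-operator graph: output = Relu(input).
+  auto relu = graph->CreateOperation<tim::vx::ops::Relu>();
+  relu->BindInput(input).BindOutput(output);
+
+  std::vector<float> in_data = {-1.0f, 0.0f, 2.0f, -3.0f};
+  std::vector<float> out_data(in_data.size());
+  if (graph->Compile() && input->CopyDataToTensor(in_data.data()) &&
+      graph->Run() && output->CopyDataFromTensor(out_data.data())) {
+    // out_data now holds {0, 0, 2, 0}.
+  }
+  return 0;
+}
+```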
+
+Main Features
+ - Over 130 internal operators with rich format support for both quantized and floating-point data
+ - Simplified binding API calls to create Tensors and Operations
+ - Dynamic graph construction with shape inference support
+ - Built-in custom layer extensions
+ - A set of utility functions for debugging
+
+## Roadmap
+
+The TIM-VX roadmap will be published here in the future.
+
+## Get started
+
+### Build and Run
+TIM-VX uses the [bazel](https://bazel.build) build system by default. [Install bazel](https://docs.bazel.build/versions/master/install.html) first to get started.
+
+TIM-VX needs to be compiled and linked against the VeriSilicon OpenVX SDK, which provides the related header files and pre-compiled libraries. A default linux-x86_64 SDK containing a PC simulation environment is provided. Platform-specific SDKs can be obtained from the respective SoC vendors.
+
+To build TIM-VX:
+```shell
+bazel build libtim-vx.so
+```
+
+To run the LeNet sample:
+```shell
+bazel build //samples/lenet:lenet_asymu8_cc
+bazel run //samples/lenet:lenet_asymu8_cc
+```
+
+### Get familiar with the OpenVX spec
+To develop with TIM-VX, you first need to get familiar with the [OpenVX API](https://www.khronos.org/openvx/) and the [OpenVX NN Extension API](https://www.khronos.org/registry/vx). Please head over to [Khronos](https://www.khronos.org/) to read the spec.
+
diff --git a/VERSION b/VERSION
new file mode 100644
index 0000000..9090603
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+1.1.28
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100644
index 0000000..06fb238
--- /dev/null
+++ b/WORKSPACE
@@ -0,0 +1,9 @@
+workspace(name = "TIM_VX")
+
+local_repository(
+    name = 'TOOLCHAINS',
+    path = 'toolchains',
+)
+
+load("@TOOLCHAINS//:toolchains.bzl", "init_toolchains")
+init_toolchains()
diff --git a/include/tim/vx/context.h b/include/tim/vx/context.h
new file mode 100644
index 0000000..897bfc8
--- /dev/null
+++ b/include/tim/vx/context.h
@@ -0,0 +1,44 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef TIM_VX_CONTEXT_H_
+#define TIM_VX_CONTEXT_H_
+
+#include <memory>
+
+#include "tim/vx/graph.h"
+
+namespace tim {
+namespace vx {
+
+class Context {
+ public:
+  virtual ~Context() {}
+  static std::shared_ptr<Context> Create();
+  virtual std::shared_ptr<Graph> CreateGraph() = 0;
+};
+
+}  // namespace vx
+}  // namespace tim
+
+#endif /* TIM_VX_CONTEXT_H_ */
\ No newline at end of file
diff --git a/include/tim/vx/graph.h b/include/tim/vx/graph.h
new file mode 100644
index 0000000..8ff8e5e
--- /dev/null
+++ b/include/tim/vx/graph.h
@@ -0,0 +1,61 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef TIM_VX_GRAPH_H_
+#define TIM_VX_GRAPH_H_
+
+#include <memory>
+#include <vector>
+
+#include "tim/vx/tensor.h"
+
+namespace tim {
+namespace vx {
+
+class Graph {
+ public:
+  virtual ~Graph() {}
+
+  /// Create a tensor with the given `TensorSpec`
+  virtual std::shared_ptr<Tensor> CreateTensor(const TensorSpec& spec,
+                                               const void* data = nullptr) = 0;
+
+  /// Create a placeholder tensor for optional inputs of operations
+  virtual std::shared_ptr<Tensor> CreateTensorPlaceHolder() = 0;
+
+  /// Freeze the graph
+  virtual bool Compile() = 0;
+
+  /// Process the compiled graph
+  virtual bool Run() = 0;
+
+  template <typename OpType, typename... Params>
+  std::shared_ptr<OpType> CreateOperation(Params... parameters) {
+    return std::make_shared<OpType>(this, parameters...);
+  }
+};
+
+}  // namespace vx
+}  // namespace tim
+
+#endif /* TIM_VX_GRAPH_H_ */
\ No newline at end of file
diff --git a/include/tim/vx/operation.h b/include/tim/vx/operation.h
new file mode 100644
index 0000000..4249e9c
--- /dev/null
+++ b/include/tim/vx/operation.h
@@ -0,0 +1,59 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef TIM_VX_OPERATION_H_
+#define TIM_VX_OPERATION_H_
+
+#include "tim/vx/graph.h"
+#include "tim/vx/tensor.h"
+
+namespace tim {
+namespace vx {
+
+class OperationImpl;
+
+class Operation {
+ public:
+  Operation(Graph* graph, uint32_t operation_id, int input_cnt = 0,
+            int output_cnt = 0);
+  virtual ~Operation();
+  Operation& BindInput(const std::shared_ptr<Tensor>& tensor);
+  Operation& BindOutput(const std::shared_ptr<Tensor>& tensor);
+  Operation& BindInputs(const std::vector<std::shared_ptr<Tensor>>& tensors);
+  Operation& BindOutputs(const std::vector<std::shared_ptr<Tensor>>& tensors);
+  Operation& SetRoundingPolicy(
+      OverflowPolicy overflow_policy = OverflowPolicy::SATURATE,
+      RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO,
+      DownScaleSizeRounding down_scale_size_rounding =
+          DownScaleSizeRounding::FLOOR,
+      uint32_t accumulator_bits = 0);
+  std::unique_ptr<OperationImpl>& impl();
+
+ protected:
+  std::unique_ptr<OperationImpl> impl_;
+};
+
+}  // namespace vx
+}  // namespace tim
+
+#endif /* TIM_VX_OPERATION_H_ */
\ No newline at end of file
diff --git a/include/tim/vx/ops/activations.h b/include/tim/vx/ops/activations.h
new file mode 100644
index 0000000..9fb9dda
--- /dev/null
+++ b/include/tim/vx/ops/activations.h
@@ -0,0 +1,60 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_OPS_ACTIVATIONS_H_ +#define TIM_VX_OPS_ACTIVATIONS_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { + +#define DECLARE_NO_PARAMETER_ACTIVATION(NAME) \ + class NAME : public Operation { \ + public: \ + NAME(Graph* graph); \ + }; + +DECLARE_NO_PARAMETER_ACTIVATION(Relu); +DECLARE_NO_PARAMETER_ACTIVATION(Relu1); +DECLARE_NO_PARAMETER_ACTIVATION(Relu6); +DECLARE_NO_PARAMETER_ACTIVATION(Elu); +DECLARE_NO_PARAMETER_ACTIVATION(Tanh); +DECLARE_NO_PARAMETER_ACTIVATION(Sigmoid); +DECLARE_NO_PARAMETER_ACTIVATION(HardSwish); + +#undef DEFINE_NO_PARAMETER_ACTIVATION + +class Prelu : public Operation { + public: + Prelu(Graph* graph, int axis); + + protected: + int axis_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_ACTIVATIONS_H_ */ diff --git a/include/tim/vx/ops/addn.h b/include/tim/vx/ops/addn.h new file mode 100644 index 0000000..e504597 --- /dev/null +++ b/include/tim/vx/ops/addn.h @@ -0,0 +1,41 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef TIM_VX_OPS_ADDN_H_ +#define TIM_VX_OPS_ADDN_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { + +class AddN : public Operation { + public: + AddN(Graph* graph, uint32_t num_inputs); +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_ADDN_H_ */ diff --git a/include/tim/vx/ops/batch2space.h b/include/tim/vx/ops/batch2space.h new file mode 100644 index 0000000..be4211e --- /dev/null +++ b/include/tim/vx/ops/batch2space.h @@ -0,0 +1,49 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_OPS_BATCH2SPACE_H_ +#define TIM_VX_OPS_BATCH2SPACE_H_ + +#include + +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { + +class Batch2Space : public Operation { + public: + Batch2Space(Graph* graph, const std::vector& block_size, + const std::vector& crop); + + protected: + std::vector block_size_; + std::vector crop_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif \ No newline at end of file diff --git a/include/tim/vx/ops/concat.h b/include/tim/vx/ops/concat.h new file mode 100644 index 0000000..7a293e9 --- /dev/null +++ b/include/tim/vx/ops/concat.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_OPS_CONCAT_H_ +#define TIM_VX_OPS_CONCAT_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { + +class Concat : public Operation { + public: + Concat(Graph* graph, uint32_t axis, int input_cnt); + + protected: + uint32_t axis_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_CONCAT_H_ */ \ No newline at end of file diff --git a/include/tim/vx/ops/conv2d.h b/include/tim/vx/ops/conv2d.h new file mode 100644 index 0000000..f63a8c7 --- /dev/null +++ b/include/tim/vx/ops/conv2d.h @@ -0,0 +1,61 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef TIM_VX_OPS_CONV2D_H_ +#define TIM_VX_OPS_CONV2D_H_ + +#include + +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { + +class Conv2d : public Operation { + public: + Conv2d(Graph* graph, int32_t weights, PadType padding, + const std::array& ksize, + const std::array& stride, + const std::array& dilation, int32_t multiplier = 0); + Conv2d(Graph* graph, int32_t weights, PadType padding, + const std::array& ksize, + const std::array& stride, + const std::array& dilation, + const std::array& pad, int32_t multiplier = 0); + + protected: + const uint32_t weights_; + const PadType padding_; + const std::array ksize_; + const std::array stride_; + const std::array dilation_; + const std::array pad_; + const int32_t multiplier_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_CONV2D_H_ */ \ No newline at end of file diff --git a/include/tim/vx/ops/depth2space.h b/include/tim/vx/ops/depth2space.h new file mode 100644 index 0000000..6aea69b --- /dev/null +++ b/include/tim/vx/ops/depth2space.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef TIM_VX_OPS_DEPTH2SPACE_H_ +#define TIM_VX_OPS_DEPTH2SPACE_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { + +namespace ops { + +class DepthToSpace : public Operation { + public: + DepthToSpace(Graph* Graph, int block_size); + + protected: + int block_size_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_DEPTH2SPACE_H_ */ diff --git a/include/tim/vx/ops/elementwise.h b/include/tim/vx/ops/elementwise.h new file mode 100644 index 0000000..21e2807 --- /dev/null +++ b/include/tim/vx/ops/elementwise.h @@ -0,0 +1,63 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef TIM_VX_OPS_ELEMENTWISE_H_ +#define TIM_VX_OPS_ELEMENTWISE_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { + +#define DELCATE_ELEMENTWISE_OP(NAME) \ + class NAME : public Operation { \ + public: \ + NAME(Graph* graph); \ + }; + +DELCATE_ELEMENTWISE_OP(Abs); +DELCATE_ELEMENTWISE_OP(Sin); +// TODO(jiangbo): enable it when ovxlib supports `Cos` +//DELCATE_ELEMENTWISE_OP(Cos); +DELCATE_ELEMENTWISE_OP(Exp); +DELCATE_ELEMENTWISE_OP(Log); +DELCATE_ELEMENTWISE_OP(Sqrt); +DELCATE_ELEMENTWISE_OP(Rsqrt); +DELCATE_ELEMENTWISE_OP(Square); +DELCATE_ELEMENTWISE_OP(LogicalNot); + +DELCATE_ELEMENTWISE_OP(Minimum); +DELCATE_ELEMENTWISE_OP(Maximum); +DELCATE_ELEMENTWISE_OP(Add); +DELCATE_ELEMENTWISE_OP(Sub); +DELCATE_ELEMENTWISE_OP(Div); +DELCATE_ELEMENTWISE_OP(Multiply); +DELCATE_ELEMENTWISE_OP(Pow); + +#undef DELCATE_ELEMENTWISE_OP + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_ELEMENTWISE_H_ */ \ No newline at end of file diff --git a/include/tim/vx/ops/fullyconnected.h b/include/tim/vx/ops/fullyconnected.h new file mode 100644 index 0000000..877982a --- /dev/null +++ b/include/tim/vx/ops/fullyconnected.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef TIM_VX_OPS_FULLYCONNECTED_H_ +#define TIM_VX_OPS_FULLYCONNECTED_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { +class FullyConnected : public Operation { + public: + FullyConnected(Graph* graph, uint32_t axis, uint32_t weights); + + protected: + uint32_t axis_; + uint32_t weights_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_FULLYCONNECTED_H_ */ \ No newline at end of file diff --git a/include/tim/vx/ops/gather.h b/include/tim/vx/ops/gather.h new file mode 100644 index 0000000..b819447 --- /dev/null +++ b/include/tim/vx/ops/gather.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_OPS_GATHER_H_ +#define TIM_VX_OPS_GATHER_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { + +namespace ops { + +class Gather : public Operation { + public: + Gather(Graph* Graph, int axis); + + protected: + int axis_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_GATHER_H_ */ diff --git a/include/tim/vx/ops/l2normalization.h b/include/tim/vx/ops/l2normalization.h new file mode 100644 index 0000000..8455f91 --- /dev/null +++ b/include/tim/vx/ops/l2normalization.h @@ -0,0 +1,42 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_OPS_L2NOMALIZATION_H_ +#define TIM_VX_OPS_L2NOMALIZATION_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { +class L2Normalization : public Operation { + public: + L2Normalization(Graph* graph, int32_t axis); + + protected: + int32_t axis_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim +#endif \ No newline at end of file diff --git a/include/tim/vx/ops/localresponsenormalization.h b/include/tim/vx/ops/localresponsenormalization.h new file mode 100644 index 0000000..4465b20 --- /dev/null +++ b/include/tim/vx/ops/localresponsenormalization.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef TIM_VX_OPS_LOCALRESPONSENORMALIZATION_H_ +#define TIM_VX_OPS_LOCALRESPONSENORMALIZATION_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { +class LocalResponseNormalization : public Operation { + public: + LocalResponseNormalization(Graph* graph, uint32_t size, float alpha, + float beta, float bias, int32_t axis); + + protected: + uint32_t size_; + float alpha_; + float beta_; + float bias_; + int32_t axis_; +}; +} // namespace ops +} // namespace vx +} // namespace tim + +#endif \ No newline at end of file diff --git a/include/tim/vx/ops/pad.h b/include/tim/vx/ops/pad.h new file mode 100644 index 0000000..1111acc --- /dev/null +++ b/include/tim/vx/ops/pad.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_OPERATION_PAD_H_ +#define TIM_VX_OPERATION_PAD_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { +class Pad : public Operation { + public: + Pad(Graph* graph, const std::vector& front_size, + const std::vector& back_size, int32_t const_val); + + protected: + std::vector front_size_; + std::vector back_size_; + int32_t const_val_; +}; +} // namespace ops +} // namespace vx +} // namespace tim +#endif \ No newline at end of file diff --git a/include/tim/vx/ops/permute.h b/include/tim/vx/ops/permute.h new file mode 100644 index 0000000..fe33e68 --- /dev/null +++ b/include/tim/vx/ops/permute.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_OPS_PERMUTE_H_ +#define TIM_VX_OPS_PERMUTE_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { + +class Permute : public Operation { + public: + Permute(Graph* graph, const std::vector& perm); + + protected: + std::vector perm_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_PERMUTE_H_ */ \ No newline at end of file diff --git a/include/tim/vx/ops/pool2d.h b/include/tim/vx/ops/pool2d.h new file mode 100644 index 0000000..f4d0bc7 --- /dev/null +++ b/include/tim/vx/ops/pool2d.h @@ -0,0 +1,55 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef TIM_VX_OPS_POOL2D_H_ +#define TIM_VX_OPS_POOL2D_H_ + +#include + +#include "tim/vx/operation.h" +#include "tim/vx/types.h" + +namespace tim { +namespace vx { +namespace ops { + +class Pool2d : public Operation { + public: + Pool2d(Graph* graph, PoolType type, PadType padding, + const std::array& ksize, + const std::array& stride, + RoundType round_type = RoundType::CEILING); + + protected: + const PoolType type_; + const PadType padding_; + const std::array ksize_; + const std::array stride_; + const RoundType round_type_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_POOL2D_H_ */ \ No newline at end of file diff --git a/include/tim/vx/ops/reduce.h b/include/tim/vx/ops/reduce.h new file mode 100644 index 0000000..1885733 --- /dev/null +++ b/include/tim/vx/ops/reduce.h @@ -0,0 +1,55 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef TIM_VX_OPS_REDUCE_H_ +#define TIM_VX_OPS_REDUCE_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { + +#define DELCATE_REDUCE_OP(NAME) \ + class Reduce##NAME : public Operation { \ + public: \ + Reduce##NAME(Graph* graph, const std::vector& axis, \ + bool keep_dims); \ + \ + protected: \ + std::vector axis_; \ + bool keep_dims_; \ + }; + +DELCATE_REDUCE_OP(Min); +DELCATE_REDUCE_OP(Max); +DELCATE_REDUCE_OP(Any); +DELCATE_REDUCE_OP(Prod); +DELCATE_REDUCE_OP(Mean); + +#undef DEFINE_REDUCE_OP + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_ACTIVATIONS_H_ */ diff --git a/include/tim/vx/ops/reshape.h b/include/tim/vx/ops/reshape.h new file mode 100644 index 0000000..11eab62 --- /dev/null +++ b/include/tim/vx/ops/reshape.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_OPS_RESHAPE_H_ +#define TIM_VX_OPS_RESHAPE_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { + +class Reshape : public Operation { + public: + Reshape(Graph* graph, const std::vector& perm); + + protected: + std::vector size_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_RESHAPE_H_ */ \ No newline at end of file diff --git a/include/tim/vx/ops/resize.h b/include/tim/vx/ops/resize.h new file mode 100644 index 0000000..25d41a6 --- /dev/null +++ b/include/tim/vx/ops/resize.h @@ -0,0 +1,50 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_OPS_RESIZE_H_ +#define TIM_VX_OPS_RESIZE_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { + +class Resize : public Operation { + public: + Resize(Graph* graph, ResizeType type, float factor, bool align_corners, + bool half_pixel_centers, int target_height, int target_width); + + protected: + const ResizeType type_; + const float factor_; + const bool align_corners_; + const bool half_pixel_centers_; + const int target_height_; + const int target_width_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_RESIZE_H_ */ diff --git a/include/tim/vx/ops/simple_operations.h b/include/tim/vx/ops/simple_operations.h new file mode 100644 index 0000000..2be9802 --- /dev/null +++ b/include/tim/vx/ops/simple_operations.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef TIM_VX_OPS_SIMPLE_OPERATIONS_H_ +#define TIM_VX_OPS_SIMPLE_OPERATIONS_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { + +#define DECLARE_SIMPLE_OP(NAME) \ + class NAME : public Operation { \ + public: \ + NAME(Graph* graph); \ + }; + +DECLARE_SIMPLE_OP(DataConvert) +DECLARE_SIMPLE_OP(Neg) + +#undef DECLARE_SIMPLE_OP + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_SIMPLE_OPERATIONS_H_ */ \ No newline at end of file diff --git a/include/tim/vx/ops/softmax.h b/include/tim/vx/ops/softmax.h new file mode 100644 index 0000000..049b1f2 --- /dev/null +++ b/include/tim/vx/ops/softmax.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_OPS_SOFTMAX_H_ +#define TIM_VX_OPS_SOFTMAX_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { + +class Softmax : public Operation { + public: + Softmax(Graph* graph, float beta, int32_t axis); + + protected: + float beta_; + int32_t axis_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_SOFTMAX_H_ */ \ No newline at end of file diff --git a/include/tim/vx/ops/space2batch.h b/include/tim/vx/ops/space2batch.h new file mode 100644 index 0000000..81e2f9d --- /dev/null +++ b/include/tim/vx/ops/space2batch.h @@ -0,0 +1,49 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_OPS_SPACE2BATCH_H_ +#define TIM_VX_OPS_SPACE2BATCH_H_ + +#include + +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { + +class Space2Batch : public Operation { + public: + Space2Batch(Graph* graph, const std::vector& block_size, + const std::vector& pad); + + protected: + std::vector block_size_; + std::vector pad_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_SPACE2BATCH_H_ */ \ No newline at end of file diff --git a/include/tim/vx/ops/space2depth.h b/include/tim/vx/ops/space2depth.h new file mode 100644 index 0000000..e86dc01 --- /dev/null +++ b/include/tim/vx/ops/space2depth.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef TIM_VX_OPS_SPACE2DEPTH_H_ +#define TIM_VX_OPS_SPACE2DEPTH_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { + +namespace ops { + +class SpaceToDepth : public Operation { + public: + SpaceToDepth(Graph* graph, std::vector block_size); + + protected: + std::vector block_size_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_SPACE2DEPTH_H_ */ diff --git a/include/tim/vx/ops/split.h b/include/tim/vx/ops/split.h new file mode 100644 index 0000000..5d70a65 --- /dev/null +++ b/include/tim/vx/ops/split.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_OPS_SPLIT_H_ +#define TIM_VX_OPS_SPLIT_H_ +#include + +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { + +class Split : public Operation { + public: + Split(Graph* graph, uint32_t axis, std::vector slices); + + protected: + uint32_t axis_; + std::vector slices_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_SPLIT_H_ */ diff --git a/include/tim/vx/ops/stridedslice.h b/include/tim/vx/ops/stridedslice.h new file mode 100644 index 0000000..10c78d6 --- /dev/null +++ b/include/tim/vx/ops/stridedslice.h @@ -0,0 +1,52 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_OPS_STRIDEDSLICE_H_ +#define TIM_VX_OPS_STRIDEDSLICE_H_ +#include "tim/vx/operation.h" + +namespace tim { +namespace vx { +namespace ops { + +class StridedSlice : public Operation { + public: + StridedSlice(Graph* graph, const std::vector begin_dims, + const std::vector end_dims, + const std::vector stride_dims, int32_t begin_mask, + int32_t end_mask, int32_t shrink_axis_mask); + + protected: + std::vector begin_dims_; + std::vector end_dims_; + std::vector stride_dims_; + int32_t begin_mask_; + int32_t end_mask_; + int32_t shrink_axis_mask_; +}; + +} // namespace ops +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPS_STRIDEDSLICE_H_ */ \ No newline at end of file diff --git a/include/tim/vx/tensor.h b/include/tim/vx/tensor.h new file mode 100644 index 0000000..a27802e --- /dev/null +++ b/include/tim/vx/tensor.h @@ -0,0 +1,141 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
diff --git a/include/tim/vx/tensor.h b/include/tim/vx/tensor.h
new file mode 100644
index 0000000..a27802e
--- /dev/null
+++ b/include/tim/vx/tensor.h
@@ -0,0 +1,141 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef TIM_VX_TENSOR_H_
+#define TIM_VX_TENSOR_H_
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "tim/vx/types.h"
+
+namespace tim {
+namespace vx {
+
+using ShapeType = std::vector<uint32_t>;
+
+class Quantization {
+ public:
+  Quantization() : type_(QuantType::NONE) {}
+  Quantization(QuantType type, float scale, int32_t zero_point)
+      : type_(type), scales_({scale}), zero_points_({zero_point}) {}
+  Quantization(QuantType type, int32_t channel_dim, std::vector<float> scales,
+               std::vector<int32_t> zero_points)
+      : type_(type),
+        channel_dim_(channel_dim),
+        scales_(std::move(scales)),
+        zero_points_(std::move(zero_points)) {}
+
+  QuantType Type() { return type_; }
+  Quantization& SetType(QuantType type) {
+    this->type_ = type;
+    return *this;
+  }
+
+  int32_t ChannelDim() { return this->channel_dim_; }
+  Quantization& SetChannelDim(int32_t channel_dim) {
+    this->channel_dim_ = channel_dim;
+    return *this;
+  }
+
+  std::vector<float>& Scales() { return this->scales_; }
+  Quantization& SetScales(std::vector<float> scales) {
+    this->scales_ = scales;
+    return *this;
+  }
+
+  std::vector<int32_t>& ZeroPoints() { return this->zero_points_; }
+  Quantization& SetZeroPoints(std::vector<int32_t> zero_points) {
+    this->zero_points_ = zero_points;
+    return *this;
+  }
+
+ protected:
+  QuantType type_{QuantType::NONE};
+  int32_t channel_dim_;
+  std::vector<float> scales_;
+  std::vector<int32_t> zero_points_;
+};
+
+struct TensorSpec {
+  TensorSpec() {}
+  TensorSpec(DataType datatype, const ShapeType& shape, TensorAttribute attr)
+      : datatype_(datatype), shape_(shape), attr_(attr) {}
+
+  TensorSpec(DataType datatype, const ShapeType& shape, TensorAttribute attr,
+             const Quantization& quantization)
+      : TensorSpec(datatype, shape, attr) {
+    this->quantization_ = quantization;
+  }
+
+  TensorSpec& SetDataType(DataType datatype) {
+    this->datatype_ = datatype;
+    return *this;
+  }
+
+  TensorSpec& SetShape(ShapeType& shape) {
+    this->shape_ = shape;
+    return *this;
+  }
+
+  TensorSpec& SetAttribute(TensorAttribute attr) {
+    this->attr_ = attr;
+    return *this;
+  }
+
+  TensorSpec& SetQuantization(Quantization& quantization) {
+    this->quantization_ = quantization;
+    return *this;
+  }
+
+  TensorSpec AsTransientSpec() const {
+    return TensorSpec(this->datatype_, ShapeType({}),
+                      TensorAttribute::TRANSIENT, this->quantization_);
+  }
+
+  DataType datatype_;
+  ShapeType shape_;
+  TensorAttribute attr_;
+  Quantization quantization_;
+};
+
+class Tensor {
+ public:
+  virtual ~Tensor() {}
+  virtual const ShapeType& GetShape() = 0;
+  virtual DataType GetDataType() = 0;
+  virtual const Quantization& GetQuantization() = 0;
+  virtual const TensorSpec& GetSpec() = 0;
+  virtual uint32_t GetId() = 0;
+  virtual bool CopyDataToTensor(const void* data, uint32_t size = 0) = 0;
+  virtual bool CopyDataFromTensor(void* data) = 0;
+  virtual bool IsPlaceHolder() = 0;
+  virtual bool IsConstTensor() = 0;
+};
+
+} // namespace vx
+} // namespace tim
+
+#endif /* TIM_VX_TENSOR_H_ */
\ No newline at end of file
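To tie the pieces above together, a brief sketch of describing and filling an asymmetric-quantized uint8 tensor with `Quantization` and `TensorSpec`. `CopyDataToTensor` is the method declared in `tensor.h` above; `Graph::CreateTensor` is again an assumption about the graph API elsewhere in this commit.

```cpp
// Hypothetical usage sketch, not part of this commit. graph->CreateTensor()
// is an assumed Graph factory; CopyDataToTensor() is declared in tensor.h.
#include <cstdint>
#include <memory>
#include <vector>

#include "tim/vx/graph.h"
#include "tim/vx/tensor.h"

std::shared_ptr<tim::vx::Tensor> MakeQuantizedInput(
    const std::shared_ptr<tim::vx::Graph>& graph) {
  // uint8 values q interpreted as real = (q - 128) * 0.5f.
  tim::vx::Quantization quant(tim::vx::QuantType::ASYMMETRIC,
                              /*scale=*/0.5f, /*zero_point=*/128);
  tim::vx::TensorSpec spec(tim::vx::DataType::UINT8,
                           tim::vx::ShapeType({28, 28, 1, 1}),
                           tim::vx::TensorAttribute::INPUT, quant);
  auto tensor = graph->CreateTensor(spec);  // assumed factory
  std::vector<uint8_t> pixels(28 * 28, 0);
  tensor->CopyDataToTensor(pixels.data(),
                           static_cast<uint32_t>(pixels.size()));
  return tensor;
}
```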
diff --git a/include/tim/vx/types.h b/include/tim/vx/types.h
new file mode 100644
index 0000000..4948207
--- /dev/null
+++ b/include/tim/vx/types.h
@@ -0,0 +1,73 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef TIM_VX_TYPES_H_
+#define TIM_VX_TYPES_H_
+
+namespace tim {
+namespace vx {
+
+enum class DataType {
+  UNKNOWN,
+  INT8,
+  UINT8,
+  INT16,
+  UINT16,
+  INT32,
+  UINT32,
+  FLOAT16,
+  FLOAT32
+};
+
+enum class QuantType { NONE, ASYMMETRIC, SYMMETRIC_PER_CHANNEL };
+
+enum class TensorAttribute { CONSTANT, TRANSIENT, VARIABLE, INPUT, OUTPUT };
+
+enum class PadType { NONE = -1, AUTO, VALID, SAME };
+
+enum class PoolType { MAX, AVG, L2, AVG_ANDROID };
+
+enum class RoundType { CEILING, FLOOR };
+
+enum class OverflowPolicy { WRAP, SATURATE };
+
+enum class RoundingPolicy { TO_ZERO, RTNE };
+
+enum class DownScaleSizeRounding { FLOOR, CEILING };
+
+enum class ActivationType {
+  NONE,
+  RELU,
+  RELU1,
+  RELU6,
+  TANH,
+  //SIGNBIT,
+  SIGMOID
+};
+
+enum class ResizeType { NEAREST_NEIGHBOR, BILINEAR, AREA };
+
+} // namespace vx
+} // namespace tim
+
+#endif /* TIM_VX_TYPES_H_ */
diff --git a/prebuilt-sdk/BUILD b/prebuilt-sdk/BUILD
new file mode 100644
index 0000000..c85f2e8
--- /dev/null
+++ b/prebuilt-sdk/BUILD
@@ -0,0 +1,13 @@
+package(default_visibility = ["//visibility:public"])
+
+config_setting(
+    name = "x86_64_linux",
+    values = {"define": "target_device=x86_64_linux"},
+)
+
+cc_library(
+    name = "VIV_SDK_LIB",
+    deps = select({
+        "//conditions:default": ["//prebuilt-sdk/x86_64_linux:VIV_SDK_LIB"],
+    }),
+)
diff --git a/prebuilt-sdk/x86_64_linux/BUILD b/prebuilt-sdk/x86_64_linux/BUILD
new file mode 100644
index 0000000..e49b679
--- /dev/null
+++ b/prebuilt-sdk/x86_64_linux/BUILD
@@ -0,0 +1,21 @@
+package(default_visibility = ["//visibility:public"])
+
+filegroup(
+    name = "libs",
+    srcs = glob([
+        "lib/*.so",
+        "lib/*.so.*",
+    ]),
+)
+
+cc_library(
+    name = "VIV_SDK_LIB",
+    hdrs = glob([
+        "include/**/*.h"
+    ]),
+    srcs = select({
+        "//conditions:default": [":libs"],
+    }),
+    strip_include_prefix = "include",
+)
+
diff --git a/prebuilt-sdk/x86_64_linux/VERSION b/prebuilt-sdk/x86_64_linux/VERSION
new file mode 100644
index 0000000..75d46d8
--- /dev/null
+++ b/prebuilt-sdk/x86_64_linux/VERSION
@@ -0,0 +1 @@
+D312513_A294074_R311680_T312233_O312045
diff --git a/prebuilt-sdk/x86_64_linux/include/CL/cl_viv_vx_ext.h b/prebuilt-sdk/x86_64_linux/include/CL/cl_viv_vx_ext.h
new file mode 100644
index 0000000..f5e2df1
--- /dev/null
+++ b/prebuilt-sdk/x86_64_linux/include/CL/cl_viv_vx_ext.h
@@ -0,0 +1,1684 @@
+/****************************************************************************
+*
+* Copyright 2016 - 2020 Vivante Corporation, Santa Clara, California.
+* All Rights Reserved.
+* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* 'Software'), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sub license, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject +* to the following conditions: +* +* The above copyright notice and this permission notice (including the +* next paragraph) shall be included in all copies or substantial +* portions of the Software. +* +* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +* IN NO EVENT SHALL VIVANTE AND/OR ITS SUPPLIERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _GC_VX_H +#define _GC_VX_H 1 + +#ifdef _VIV_VX_EXTENSION + +#pragma OPENCL EXTENSION CL_VIV_asm : enable + +#ifndef VX_VERSION +#define VX_VERSION 1 /* change to 2 if VX 2 APIs are implemented below */ +#endif + +typedef enum _VXC_FilterMode +{ + VXC_FM_BOX = 0, + VXC_FM_Guassian = 1, + VXC_FM_SobelX = 2, + VXC_FM_SobelY = 3, + VXC_FM_ScharrX = 4, + VXC_FM_ScharrY = 5, + VXC_FM_Max = 8, + VXC_FM_Min = 9, + VXC_FM_Median = 10 +} vxc_filter_mode; + +typedef enum _VXC_RoundMode +{ + VXC_RM_Truncate = 0, /* for integer truncation */ + VXC_RM_TowardZero = 0, /* for floats round to zero */ + VXC_RM_TowardInf = 1, + VXC_RM_ToNearestEven = 2 +} vxc_round_mode; + +typedef enum _VXC_ScatteredOffsetType +{ + VXC_OFFSET_UNSIGNED32 = 0, + VXC_OFFSET_SIGNED32 = 1, + VXC_OFFSET_UNSIGNED16 = 2, + VXC_OFFSET_SIGNED16 = 3, + VXC_OFFSET_UNSIGNED8 = 4, + VXC_OFFSET_SIGNED8 = 5, +} VXC_ScatteredOffsetType; + +typedef enum _VXC_AtomicOp +{ + VXC_ATOMIC_OP_ADD = 0, + VXC_ATOMIC_OP_MIN = 1, + VXC_ATOMIC_OP_MAX = 2, + VXC_ATOMIC_OP_OR = 3, + VXC_ATOMIC_OP_AND = 4, + VXC_ATOMIC_OP_XOR = 5, + VXC_ATOMIC_OP_XCHG = 6, +}VXC_AtomicOpType; + +#define VXC_CLAMP_BITMASK 0x00400000 /* shift 22 */ +#define VXC_PREADJ_BITMASK 0x00200000 /* shift 21 */ +#define VXC_RANGEPI_BITMASK 0x00100000 /* shift 20 */ +#define VXC_FILTER_BITMASK 0x000F0000 /* shift 16 */ +#define VXC_START_BIN_BITMASK 0x0000F000 /* shift 12 */ +#define VXC_END_BIN_BITMASK 0x00000F00 /* shift 8 */ +#define VXC_SOURCE_BIN_BITMASK 0x000000F0 /* shift 4 */ +#define VXC_ROUNDING_MODE_BITMASK 0x0000000C /* shift 2 */ +#define VXC_ENABLEBOOL_BITMASK 0x00000002 /* shift 1 */ +#define VXC_SIGNEXT_BITMASK 0x00000001 /* shift 0 */ + +/* overload FILTER bits, bits in [16:18] for scattered offset type. */ +#define VXC_OFFSET_TYPE_BITMASK 0x00070000 /* shift 16 */ +#define VXC_OFFSET_TYPE_SHIFT 16 /* shift 16 */ + +/* overload FILTER, PREADJ and RANGEPI, bits in [19:21] for scattered offset type. 
*/ +#define VXC_ATOM_OP_BITMASK 0x00380000 /* shift 19 */ +#define VXC_ATOM_OP_SHIFT 19 /* shift 19 */ + +#define VXC_MODIFIER(StartBin, EndBin, SourceBin, RoundingMode, Clamp) \ + ( \ + (((Clamp) << 22)&VXC_CLAMP_BITMASK) | \ + (((StartBin) << 12)&VXC_START_BIN_BITMASK) | \ + (((EndBin) << 8)&VXC_END_BIN_BITMASK) | \ + (((SourceBin) << 4)&VXC_SOURCE_BIN_BITMASK) | \ + ((RoundingMode << 2)&VXC_ROUNDING_MODE_BITMASK) \ + ) + +#define VXC_MODIFIER_SIGNEXT(StartBin, EndBin, SourceBin, SignExt, Clamp) \ + ( \ + (((Clamp) << 22)&VXC_CLAMP_BITMASK) | \ + ((SignExt)&VXC_SIGNEXT_BITMASK) | \ + (((StartBin) << 12)&VXC_START_BIN_BITMASK) | \ + (((EndBin) << 8)&VXC_END_BIN_BITMASK) | \ + (((SourceBin) << 4)&VXC_SOURCE_BIN_BITMASK) \ + ) + +#define VXC_MODIFIER_MAGPHASE(StartBin, EndBin, SourceBin, NoPreAdjust, RangePi) \ + ( \ + (VXC_CLAMP_BITMASK) | \ + (((RangePi) << 20)&VXC_RANGEPI_BITMASK) | \ + (((NoPreAdjust) << 21)&VXC_PREADJ_BITMASK) | \ + (((StartBin) << 12)&VXC_START_BIN_BITMASK) | \ + (((EndBin) << 8)&VXC_END_BIN_BITMASK) | \ + (((SourceBin) << 4)&VXC_SOURCE_BIN_BITMASK) \ + ) + +#define VXC_MODIFIER_CLAMP(StartBin, EndBin, SourceBin, EnableBool) \ + ( \ + (((EnableBool) << 1)&VXC_ENABLEBOOL_BITMASK) | \ + (((StartBin) << 12)&VXC_START_BIN_BITMASK) | \ + (((EndBin) << 8)&VXC_END_BIN_BITMASK) | \ + (((SourceBin) << 4)&VXC_SOURCE_BIN_BITMASK) \ + ) + +#define VXC_MODIFIER_FILTER(StartBin, EndBin, SourceBin, Filter, Clamp) \ + ( \ + (((Clamp) << 22)&VXC_CLAMP_BITMASK) | \ + (((Filter) << 16)&VXC_FILTER_BITMASK) | \ + (((StartBin) << 12)&VXC_START_BIN_BITMASK) | \ + (((EndBin) << 8)&VXC_END_BIN_BITMASK) | \ + (((SourceBin) << 4)&VXC_SOURCE_BIN_BITMASK) \ + ) + +#define VXC_MODIFIER_BIN(StartBin, EndBin, Clamp) \ + ( \ + (((Clamp) << 22)&VXC_CLAMP_BITMASK) | \ + (((StartBin) << 12)&VXC_START_BIN_BITMASK) | \ + (((EndBin) << 8)&VXC_END_BIN_BITMASK) \ + ) + +#define VXC_MODIFIER_GATHER(StartBin, EndBin, SourceBin, OffsetType) \ + ( \ + (((OffsetType) << VXC_OFFSET_TYPE_SHIFT)&VXC_OFFSET_TYPE_BITMASK) | \ + (((StartBin) << 12)&VXC_START_BIN_BITMASK) | \ + (((EndBin) << 8)&VXC_END_BIN_BITMASK) | \ + (((SourceBin) << 4)&VXC_SOURCE_BIN_BITMASK) \ + ) + +#define VXC_MODIFIER_SCATTER(StartBin, EndBin, SourceBin, OffsetType) \ + ( \ + (((OffsetType) << VXC_OFFSET_TYPE_SHIFT)&VXC_OFFSET_TYPE_BITMASK) | \ + (((StartBin) << 12)&VXC_START_BIN_BITMASK) | \ + (((EndBin) << 8)&VXC_END_BIN_BITMASK) | \ + (((SourceBin) << 4)&VXC_SOURCE_BIN_BITMASK) \ + ) + +#define VXC_MODIFIER_ATOMIC_S(StartBin, EndBin, SourceBin, OffsetType, AtomOp) \ + ( \ + (((OffsetType) << VXC_OFFSET_TYPE_SHIFT)&VXC_OFFSET_TYPE_BITMASK) | \ + (((AtomOp) << VXC_ATOM_OP_SHIFT)&VXC_ATOM_OP_BITMASK) | \ + (((StartBin) << 12)&VXC_START_BIN_BITMASK) | \ + (((EndBin) << 8)&VXC_END_BIN_BITMASK) | \ + (((SourceBin) << 4)&VXC_SOURCE_BIN_BITMASK) \ + ) + +/* + * Clamp : 0 ==> result is truncated to fit result type + * 1 ==> result is clamp to fit result type + */ +#define VXC_MODIFIER_SetDestClamp(VxModifier, Clamp) ((VxModifier) | (((Clamp) << 22)&VXC_CLAMP_BITMASK)) + +#define VXC_DEFAULT_MODIFIER (-1) + +typedef unsigned int vxc_modifier; + +/* packed char vector 2/4/8/16 */ +typedef _viv_char2_packed vxc_char2; +typedef _viv_char4_packed vxc_char4; +typedef _viv_char8_packed vxc_char8; +typedef _viv_char16_packed vxc_char16; +typedef struct _vxc_char32 +{ + vxc_char16 hi; + vxc_char16 lo; +} vxc_char32; + +/* packed uchar vector 2/4/8/16 */ +typedef _viv_uchar2_packed vxc_uchar2; +typedef _viv_uchar4_packed vxc_uchar4; +typedef _viv_uchar8_packed 
vxc_uchar8; +typedef _viv_uchar16_packed vxc_uchar16; +typedef struct _vxc_uchar32 +{ + vxc_uchar16 hi; + vxc_uchar16 lo; +} vxc_uchar32; + +/* packed short vector 2/4/8 */ +typedef _viv_short2_packed vxc_short2; +typedef _viv_short4_packed vxc_short4; +typedef _viv_short8_packed vxc_short8; +typedef struct _vxc_short16 +{ + vxc_short8 hi; + vxc_short8 lo; +} vxc_short16; + +/* packed ushort vector 2/4/8 */ +typedef _viv_ushort2_packed vxc_ushort2; +typedef _viv_ushort4_packed vxc_ushort4; +typedef _viv_ushort8_packed vxc_ushort8; +typedef struct _vxc_ushort16 +{ + vxc_ushort8 hi; + vxc_ushort8 lo; +} vxc_ushort16; + +/* int vector 2/4/8/16 */ +typedef int vxc_int; +typedef int2 vxc_int2; +typedef int4 vxc_int4; +typedef int8 vxc_int8; +typedef int16 vxc_int16; + +/* uint vector 2/4/8/16 */ +typedef uint vxc_uint; +typedef uint2 vxc_uint2; +typedef uint4 vxc_uint4; +typedef uint8 vxc_uint8; +typedef uint16 vxc_uint16; + +/* float vector 2/4/8/16 */ +typedef float vxc_float; +typedef float2 vxc_float2; +typedef float4 vxc_float4; +typedef float8 vxc_float8; +typedef float16 vxc_float16; + +/* half (float16) vector 2/4/8/16 */ +typedef half vxc_half; +typedef _viv_half2_packed vxc_half2; +typedef _viv_half4_packed vxc_half4; +typedef _viv_half8_packed vxc_half8; +typedef struct _vxc_half16 +{ + vxc_half8 hi; + vxc_half8 lo; +} vxc_half16; + +typedef uint16 vxc_512bits; +typedef uint4 vxc_128bits; + +typedef vxc_512bits VXC_512Bits; +typedef vxc_128bits VXC_128Bits; +typedef vxc_modifier VXC_Modifier_t ; +typedef vxc_round_mode VXC_RoundMode; +typedef vxc_filter_mode VXC_FilterMode; + +#ifndef VX_USE_INTRINSIC +#define VX_USE_INTRINSIC 0 /* default to use macro style interface */ +#endif + +enum VXC_OP { + VXC_OP_abs_diff = 3, /* it must be the same value as VIR_IK_abs_diff */ + VXC_OP_iadd, + VXC_OP_iacc_sq, + VXC_OP_lerp, + VXC_OP_filter, + VXC_OP_mag_phase, + VXC_OP_mul_shift, + VXC_OP_dp16x1, + VXC_OP_dp8x2, + VXC_OP_dp4x4, + VXC_OP_dp2x8, + VXC_OP_clamp, + VXC_OP_bi_linear, + VXC_OP_select_add, + VXC_OP_atomic_add, + VXC_OP_bit_extract, + VXC_OP_bit_replace, + VXC_OP_dp32x1, + VXC_OP_dp16x2, + VXC_OP_dp8x4, + VXC_OP_dp4x8, + VXC_OP_dp2x16, + VXC_OP_dp32x1_b, + VXC_OP_dp16x2_b, + VXC_OP_dp8x4_b, + VXC_OP_dp4x8_b, + VXC_OP_dp2x16_b, + VXC_OP_img_load, + VXC_OP_img_load_3d, + VXC_OP_img_store, + VXC_OP_img_store_3d, + VXC_OP_vload2, + VXC_OP_vload3, + VXC_OP_vload4, + VXC_OP_vload8, + VXC_OP_vload16, + VXC_OP_vstore2, + VXC_OP_vstore3, + VXC_OP_vstore4, + VXC_OP_vstore8, + VXC_OP_vstore16, + VXC_OP_index_add, + VXC_OP_vert_min3, + VXC_OP_vert_max3, + VXC_OP_vert_med3, + VXC_OP_horz_min3, + VXC_OP_horz_max3, + VXC_OP_horz_med3, + VXC_OP_error, + OP_bit_extract, + VXC_OP_dp16x1_b, + VXC_OP_dp8x2_b, + VXC_OP_dp4x4_b, + VXC_OP_dp2x8_b, + VXC_OP_gather, + VXC_OP_gather_b, + VXC_OP_scatter, + VXC_OP_scatter_b, + VXC_OP_atomic_s, + VXC_OP_atomic_s_b, +}; + +enum eVXC_ERROR +{ + ERROR_DP2x16_NOT_SUPPORTED, + ERROR_IADD_NOT_SUPPORTED, + ERROR_SELECTADD_NOT_SUPPORTED, + ERROR_BITREPLACE_NOT_SUPPORTED +}; + +#define VXC_OP1(Op, Dest, Src0) _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, Src0) + +#define VXC_OP2(Op, Dest, Src0, Src1) \ + do { \ + int _t1; \ + _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ + _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t1); \ + } while(0) + +#define VXC_OP3(Op, Dest, Src0, Src1, Src2) \ + do { \ + int _t1, _t2; \ + _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ + _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ + _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t2); \ + } while(0) + +#define VXC_OP3_NoDest(Op, 
Src0, Src1, Src2) \ + do { \ + int _t1, _t2, _t3; \ + _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ + _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ + _viv_asm(INTRINSIC_ST, _t3, VXC_OP_##Op, _t2); \ + } while(0) + + +#define VXC_OP4(Op, Dest, Src0, Src1, Src2, Src3) \ + do { \ + int _t1, _t2, _t3; \ + _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ + _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ + _viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \ + _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t3); \ + } while(0) + +#define VXC_OP4_NoDest(Op, Src0, Src1, Src2, Src3) \ + do { \ + int _t1, _t2, _t3, _t4; \ + _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ + _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ + _viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \ + _viv_asm(INTRINSIC_ST, _t4, VXC_OP_##Op, _t3); \ + } while(0) + +#define VXC_OP4_ST(Op, Dest, Src0, Src1, Src2, Src3) \ + do { \ + int _t1, _t2, _t3; \ + _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ + _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ + _viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \ + _viv_asm(INTRINSIC_ST, Dest, VXC_OP_##Op, _t3);\ + } while(0) + +#define VXC_OP5(Op, Dest, Src0, Src1, Src2, Src3, Src4) \ + do { \ + int _t1, _t2, _t3, _t4; \ + _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ + _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ + _viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \ + _viv_asm(PARAM_CHAIN, _t4, _t3, Src4); \ + _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t4); \ + } while(0) + +#define VXC_OP5_NoDest(Op, Src0, Src1, Src2, Src3, Src4) \ + do { \ + int _t1, _t2, _t3, _t4, _t5; \ + _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ + _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ + _viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \ + _viv_asm(PARAM_CHAIN, _t4, _t3, Src4); \ + _viv_asm(INTRINSIC_ST, _t5, VXC_OP_##Op, _t4); \ + } while(0) + +/* make sure the immediate value offsetX and offsetY are in range of [-16, 15] */ +#define VXC_5BITOFFSET_XY(offsetX, offsetY) ((((offsetY) & 0x1F) << 5) | ((offsetX) & 0x1F)) + +/* Non-packed type image support in VX extension: + * For VX1, following image types are supported : + * image1d_t, image1d_array_t, image2d_t + * For VX2 all image types are supported + * + * OCL image builtins can be used in VX kernel: + * all samplerless image read/write builtins for supported image types + * all image query functions for supported image types + */ + +#if !VX_USE_INTRINSIC /* Use macro and inline asm */ +#define VXC_AbsDiff(Dest, Src0, Src1, Info) VXC_OP3(abs_diff, Dest, Src0, Src1, Info) +#define VXC_IAccSq(Dest, Src0, Src1, Imm, Info) VXC_OP4(iacc_sq, Dest, Src0, Src1, Imm, Info) +#define VXC_Lerp(Dest, Src0, Src1, Src2, Info) VXC_OP4(lerp, Dest, Src0, Src1, Src2, Info) +/* MulShift: Multiples two 8- or 16-bit integers and shifts + * + * Syntax: + * r = MulShift(a, b, Imm) ; // Imm must be an immediate value + * + * Semantics: + * r[i] = (a[i] * b[i]) >> Imm ; i E [0, elem(r) ) + */ +#define VXC_MulShift(Dest, Src0, Src1, Imm, Info) VXC_OP4(mul_shift, Dest, Src0, Src1, Imm, Info) +#define VXC_Clamp(Dest, Src0, Src1, Src2, Info) VXC_OP4(clamp, Dest, Src0, Src1, Src2, Info) +#define VXC_AtomicAdd(Dest, Base, Offset, Data, Info) VXC_OP4_ST(atomic_add, Dest, Base, Offset, Data, Info) +#define VXC_BitExtract(Dest, Src0, Src1, Src2, Info) VXC_OP4(bit_extract, Dest, Src0, Src1, Src2, Info) + +#define VXC_DP16x1(Dest, Src0, Src1, Info, U512) VXC_OP4(dp16x1, Dest, Src0, Src1, Info, U512) +#define VXC_DP8x2(Dest, Src0, Src1, Info, U512) VXC_OP4(dp8x2, Dest, Src0, Src1, Info, U512) +#define VXC_DP4x4(Dest, Src0, Src1, Info, U512) VXC_OP4(dp4x4, Dest, Src0, Src1, Info, U512) +#define VXC_DP2x8(Dest, Src0, 
Src1, Info, U512) VXC_OP4(dp2x8, Dest, Src0, Src1, Info, U512) + +#define VXC_DP32x1(Dest, Src0, Src1, Info, U512) VXC_OP4(dp32x1, Dest, Src0, Src1, Info, U512) +#define VXC_DP16x2(Dest, Src0, Src1, Info, U512) VXC_OP4(dp16x2, Dest, Src0, Src1, Info, U512) +#define VXC_DP8x4(Dest, Src0, Src1, Info, U512) VXC_OP4(dp8x4, Dest, Src0, Src1, Info, U512) +#define VXC_DP4x8(Dest, Src0, Src1, Info, U512) VXC_OP4(dp4x8, Dest, Src0, Src1, Info, U512) +#if (VX_VERSION >= 2) +#define VXC_DP2x16(Dest, Src0, Src1, Info, U512) VXC_OP1(error, ERROR_DP2x16_NOT_SUPPORTED) +#else +#define VXC_DP2x16(Dest, Src0, Src1, Info, U512) VXC_OP4(dp2x16, Dest, Src0, Src1, Info, U512) +#endif + +#if (VX_VERSION >= 2) +/* DP16 dot c + * vxc_char32 a; + * vxc_char16 b; + * vxc_int result; + * VXC_DP16x1_b(result, a.hi, a.lo, b, modifier, u); + * + * Src0 must be hi part of 256 bit value, Src1 must be lo part + * + */ +#define VXC_DP16x1_b(Dest, Src0, Src1, Src2, Info, U512) VXC_OP5(dp16x1_b, Dest, Src0, Src1, Src2, Info, U512) +#define VXC_DP8x2_b(Dest, Src0, Src1, Src2, Info, U512) VXC_OP5(dp8x2_b, Dest, Src0, Src1, Src2, Info, U512) +#define VXC_DP4x4_b(Dest, Src0, Src1, Src2, Info, U512) VXC_OP5(dp4x4_b, Dest, Src0, Src1, Src2, Info, U512) +#define VXC_DP2x8_b(Dest, Src0, Src1, Src2, Info, U512) VXC_OP5(dp2x8_b, Dest, Src0, Src1, Src2, Info, U512) +#endif + +/* DP32 dot c + * vxc_char32 a; + * vxc_char16 b; + * vxc_int result; + * VXC_DP32x1_b(result, a.hi, a.lo, b, modifier, u); + * + * Src0 must be hi part of 256 bit value, Src1 must be lo part + * + */ +#define VXC_DP32x1_b(Dest, Src0, Src1, Src2, Info, U512) VXC_OP5(dp32x1_b, Dest, Src0, Src1, Src2, Info, U512) +#define VXC_DP16x2_b(Dest, Src0, Src1, Src2, Info, U512) VXC_OP5(dp16x2_b, Dest, Src0, Src1, Src2, Info, U512) +#define VXC_DP8x4_b(Dest, Src0, Src1, Src2, Info, U512) VXC_OP5(dp8x4_b, Dest, Src0, Src1, Src2, Info, U512) +#define VXC_DP4x8_b(Dest, Src0, Src1, Src2, Info, U512) VXC_OP5(dp4x8_b, Dest, Src0, Src1, Src2, Info, U512) +#define VXC_DP2x16_b(Dest, Src0, Src1, Src2, Info, U512) VXC_OP5(dp2x16_b, Dest, Src0, Src1, Src2, Info, U512) + +#define VXC_Gather(Dest, BaseAddr, Offsets, GatherInfo) VXC_OP3(gather, Dest, BaseAddr, Offsets, GatherInfo) +#define VXC_Gather_b(Dest, BaseAddr, Offsets, Offsets_b, GatherInfo) VXC_OP4(gather_b, Dest, BaseAddr, Offsets, Offsets_b, GatherInfo) + +#define VXC_Scatter(BaseAddr, Offsets, Data, ScatterInfo) VXC_OP4_NoDest(scatter, BaseAddr, Offsets, Data, ScatterInfo) +#define VXC_Scatter_b(BaseAddr, Offsets, Offsets_b, Data, ScatterInfo) VXC_OP5_NoDest(scatter_b, BaseAddr, Offsets, Offsets_b, Data, ScatterInfo) + +#define VXC_AtomicS(Dest, BaseAddr, Offsets, Data, AtomicSInfo) VXC_OP4(atomic_s, Dest, BaseAddr, Offsets, Data, AtomicSInfo) +#define VXC_AtomicS_b(Dest, BaseAddr, Offsets, Offsets_b, Data, AtomicSInfo) VXC_OP5(atomic_s_b, Dest, BaseAddr, Offsets, Offsets_b, Data, AtomicSInfo) + +/* packed type image data read/write: supported types are packed 8-bit/16bit integer, 16bit float */ +/* image read/write for image1d_t/image1d_array/image2d_t, + * offset should be composed by using VXC_5BITOFFSET_XY(x, y) */ +#define VXC_ReadImage(Dest, Image, Coord, Offset, Info) VXC_OP4(img_load, Dest, Image, Coord, Offset, Info) +#define VXC_WriteImage(Image, Coord, Color, Info) VXC_OP4_NoDest(img_store, Image, Coord, Color, Info) + +/* image load/store for image2d_array_t, + * Image is a vec8 image descriptor + * Offset should be composed by using VXC_5BITOFFSET_XY(x, y) + * Coord must be type of int4 or float4 + */ +#define 
VXC_ReadImage2DArray(Dest, Image, Coord, Offset, Info) \ + do { \ + int8 desc; \ + _viv_asm(COPY, desc, Image, sizeof(desc)); \ + _viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \ + int baseAddr = (int)(Coord).w *desc.s4 + desc.s0; \ + _viv_asm(MOV, (Coord).w, baseAddr); \ + VXC_OP4(img_load_3d, Dest, Image, (Coord).xyww, Offset, Info); \ + } while (0) +#define VXC_WriteImage2DArray(Image, Coord, Color, Info) \ + do { \ + int8 desc; \ + _viv_asm(COPY, desc, Image, sizeof(desc)); \ + _viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \ + int baseAddr = (int)(Coord).w *(desc).s4 + desc.s0; \ + _viv_asm(MOV, (Coord).w, baseAddr); \ + VXC_OP4_NoDest(img_store_3d, Image, (Coord).xyww, Color, Info); \ + } while (0) + +/* image load/store for image3d_t, + * offset should be composed by using VXC_5BITOFFSET_XY(x, y) + * Coord must be type of int4 or float4 + */ +#define VXC_ReadImage3D(Dest, Image, Coord, Offset, Info) VXC_OP4(img_load_3d, Dest, Image, Coord, Offset, Info) +#define VXC_WriteImage3D(Image, Coord, Color, Info) VXC_OP4_NoDest(img_store_3d, Image, Coord, Color, Info) + +#define VXC_Vload2(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload2, Dest, Pointer, byteOffset); } while(0) +#define VXC_Vload4(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload4, Dest, Pointer, byteOffset); } while(0) +#define VXC_Vload8(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload8, Dest, Pointer, byteOffset); } while(0) +#define VXC_Vload16(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload16, Dest, Pointer, byteOffset); } while(0) + +#define VXC_Vstore2(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore2, Pointer, byteOffset, Data); } while(0) +#define VXC_Vstore4(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore4, Pointer, byteOffset, Data); } while(0) +#define VXC_Vstore8(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore8, Pointer, byteOffset, Data); } while(0) +#define VXC_Vstore16(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore16, Pointer, byteOffset, Data); } while(0) + +/* VX2 only instructions*/ +#define VXC_IndexAdd(Dest, Src0, Src1, Src2, Info) VXC_OP4(index_add, Dest, Src0, Src1, Src2, Info) +#define VXC_VertMin3(Dest, Src0, Src1, Src2, Info) VXC_OP4(vert_min3, Dest, Src0, Src1, Src2, Info) +#define VXC_VertMax3(Dest, Src0, Src1, Src2, Info) VXC_OP4(vert_max3, Dest, Src0, Src1, Src2, Info) +#define VXC_VertMed3(Dest, Src0, Src1, Src2, Info) VXC_OP4(vert_med3, Dest, Src0, Src1, Src2, Info) +#define VXC_HorzMin3(Dest, Src0, Info) VXC_OP2(horz_min3, Dest, Src0, Info) +#define VXC_HorzMax3(Dest, Src0, Info) VXC_OP2(horz_max3, Dest, Src0, Info) +#define VXC_HorzMed3(Dest, Src0, Info) VXC_OP2(horz_med3, Dest, Src0, Info) + +#if (VX_VERSION == 2) +#define VXC_BiLinear(Dest, Src0, Src1, Src2, Info) \ + do { \ + int endBin = ((Info) & VXC_END_BIN_BITMASK) >> 8; \ + int roundMode = ((Info) & VXC_ROUNDING_MODE_BITMASK) >> 2; \ + int clamp = ((Info) & VXC_CLAMP_BITMASK) >> 22; \ + int mod1 = VXC_MODIFIER(0, endBin + 1, 0, roundMode, clamp); \ + int4 bitMask = { 0x00000000, 0x00000008, 0x00000010, 0x00000018}; \ + typeof (Dest) bi1; \ + uint4 bi2; \ + int bi3, bi4; \ + VXC_Lerp(bi1, Src0, Src1, (Src2).y, mod1); \ + _viv_asm(PARAM_CHAIN, bi3, 
bi1.x!, bitMask); \ + _viv_asm(PARAM_CHAIN, bi4, bi3, 8); \ + _viv_asm(INTRINSIC, bi2, OP_bit_extract, bi4); \ + VXC_Lerp(Dest, bi2!, bi2.y!, (Src2).x, Info); \ + } while (0) + +#define VXC_BitReplace(Dest, Src0, Src1, Src2, Info) /* BitReplace definition here */ +#define VXC_IAdd(Dest, Src0, Src1, Src2, Info) /* IAdd definition here */ +#define VXC_MagPhase(Dest, Src0, Src1, Info) /* MagPhase definition here */ +#define VXC_SelectAdd(Dest, Src0, Src1, U512, Info) VXC_OP1(error, ERROR_SELECTADD_NOT_SUPPORTED) + +#define VXC_Filter_Box(Dest, Src0, Src1, Src2, Info) /* box filter definition here */ +#define VXC_Filter_Guassian(Dest, Src0, Src1, Src2, Info) /* Guassian filter definition here */ +#define VXC_Filter_SobelX(Dest, Src0, Src1, Src2, Info) /* SobelX filter definition here */ +#define VXC_Filter_SobelY(Dest, Src0, Src1, Src2, Info) /* SobelY filter definition here */ +#define VXC_Filter_ScharrX(Dest, Src0, Src1, Src2, Info) /* ScharrX filter definition here */ +#define VXC_Filter_ScharrY(Dest, Src0, Src1, Src2, Info) /* ScharrY filter definition here */ +#define VXC_Filter_Max(Dest, Src0, Src1, Src2, Info) /* Max filter definition here */ +#define VXC_Filter_Min(Dest, Src0, Src1, Src2, Info) /* Min filter definition here */ +#define VXC_Filter_Median(Dest, Src0, Src1, Src2, Info) /* Median filter definition here */ +#define VXC_Filter(Dest, Src0, Src1, Src2, Info) do { \ + int filter = (((Info) >> 16)&0x0F); \ + if (filter == VXC_FM_BOX) { VXC_Filter_Box(Dest, Src0, Src1, Src2, Info); } \ + if (filter == VXC_FM_Guassian) { VXC_Filter_Guassian(Dest, Src0, Src1, Src2, Info); } \ + if (filter == VXC_FM_SobelX) { VXC_Filter_SobelX(Dest, Src0, Src1, Src2, Info); } \ + if (filter == VXC_FM_SobelY) { VXC_Filter_SobelY(Dest, Src0, Src1, Src2, Info); } \ + if (filter == VXC_FM_ScharrX) { VXC_Filter_ScharrX(Dest, Src0, Src1, Src2, Info); } \ + if (filter == VXC_FM_ScharrY) { VXC_Filter_ScharrY(Dest, Src0, Src1, Src2, Info); } \ + if (filter == VXC_FM_Max) { VXC_Filter_Max(Dest, Src0, Src1, Src2, Info); } \ + if (filter == VXC_FM_Min) { VXC_Filter_Min(Dest, Src0, Src1, Src2, Info); } \ + if (filter == VXC_FM_Median) { VXC_Filter_Median(Dest, Src0, Src1, Src2, Info); } \ + } while (0) + +#else /* VX1 */ + +#define VXC_BiLinear(Dest, Src0, Src1, Src2, Info) VXC_OP4(bi_linear, Dest, Src0, Src1, Src2, Info) +#define VXC_BitReplace(Dest, Src0, Src1, Src2, Info) VXC_OP4(bit_replace, Dest, Src0, Src1, Src2, Info) +#define VXC_IAdd(Dest, Src0, Src1, Src2, Info) VXC_OP4(iadd, Dest, Src0, Src1, Src2, Info) +#define VXC_MagPhase(Dest, Src0, Src1, Info) VXC_OP3(mag_phase, Dest, Src0, Src1, Info) +#define VXC_SelectAdd(Dest, Src0, Src1, U512, Info) VXC_OP4(select_add, Dest, Src0, Src1, U512, Info) +#define VXC_Filter(Dest, Src0, Src1, Src2, Info) VXC_OP4(filter, Dest, Src0, Src1, Src2, Info) +#endif + +#else + +#ifdef __cplusplus +extern "c" { +#endif + +#define viv_vx_api_only 0 + +#if viv_vx_api_only +#define _RET0_ ; +#define _RET_ ; +#define _EXT_ extern +#else +#define _RET0_ { return (0); } +#define _RET_ { return ; } +#define _EXT_ +#endif + +/* implicit cast for vx_inst parameter*/ +/* uchar */ +_EXT_ vxc_uchar16 viv_intrinsic_vx_icastP_uc(vxc_char16 a) _RET0_ +_EXT_ vxc_uchar8 viv_intrinsic_vx_icastP_uc(vxc_char8 a) _RET0_ + +_EXT_ vxc_uchar16 viv_intrinsic_vx_icastP_uc(vxc_short8 a) _RET0_ +_EXT_ vxc_uchar8 viv_intrinsic_vx_icastP_uc(vxc_short4 a) _RET0_ + +_EXT_ vxc_uchar16 viv_intrinsic_vx_icastP_uc(vxc_ushort8 a) _RET0_ +_EXT_ vxc_uchar8 viv_intrinsic_vx_icastP_uc(vxc_ushort4 a) _RET0_ + 
+_EXT_ vxc_uchar16 viv_intrinsic_vx_icastP_uc(vxc_half8 a) _RET0_ +_EXT_ vxc_uchar8 viv_intrinsic_vx_icastP_uc(vxc_half4 a) _RET0_ + +/* char */ +_EXT_ vxc_char16 viv_intrinsic_vx_icastP_c(vxc_uchar16 a) _RET0_ +_EXT_ vxc_char8 viv_intrinsic_vx_icastP_c(vxc_uchar8 a) _RET0_ + +_EXT_ vxc_char16 viv_intrinsic_vx_icastP_c(vxc_short8 a) _RET0_ +_EXT_ vxc_char8 viv_intrinsic_vx_icastP_c(vxc_short4 a) _RET0_ + +_EXT_ vxc_char16 viv_intrinsic_vx_icastP_c(vxc_ushort8 a) _RET0_ +_EXT_ vxc_char8 viv_intrinsic_vx_icastP_c(vxc_ushort4 a) _RET0_ + +_EXT_ vxc_char16 viv_intrinsic_vx_icastP_c(vxc_half8 a) _RET0_ +_EXT_ vxc_char8 viv_intrinsic_vx_icastP_c(vxc_half4 a) _RET0_ + +/* ushort */ +_EXT_ vxc_ushort8 viv_intrinsic_vx_icastP_us(vxc_uchar16 a) _RET0_ +_EXT_ vxc_ushort4 viv_intrinsic_vx_icastP_us(vxc_uchar8 a) _RET0_ + +_EXT_ vxc_ushort8 viv_intrinsic_vx_icastP_us(vxc_char16 a) _RET0_ +_EXT_ vxc_ushort4 viv_intrinsic_vx_icastP_us(vxc_char8 a) _RET0_ + +_EXT_ vxc_ushort8 viv_intrinsic_vx_icastP_us(vxc_short8 a) _RET0_ +_EXT_ vxc_ushort4 viv_intrinsic_vx_icastP_us(vxc_short4 a) _RET0_ + +_EXT_ vxc_ushort8 viv_intrinsic_vx_icastP_us(vxc_half8 a) _RET0_ +_EXT_ vxc_ushort4 viv_intrinsic_vx_icastP_us(vxc_half4 a) _RET0_ + +/* short */ +_EXT_ vxc_short8 viv_intrinsic_vx_icastP_s(vxc_uchar16 a) _RET0_ +_EXT_ vxc_short4 viv_intrinsic_vx_icastP_s(vxc_uchar8 a) _RET0_ + +_EXT_ vxc_short8 viv_intrinsic_vx_icastP_s(vxc_char16 a) _RET0_ +_EXT_ vxc_short4 viv_intrinsic_vx_icastP_s(vxc_char8 a) _RET0_ + +_EXT_ vxc_short8 viv_intrinsic_vx_icastP_s(vxc_ushort8 a) _RET0_ +_EXT_ vxc_short4 viv_intrinsic_vx_icastP_s(vxc_ushort4 a) _RET0_ + +_EXT_ vxc_short8 viv_intrinsic_vx_icastP_s(vxc_half8 a) _RET0_ +_EXT_ vxc_short4 viv_intrinsic_vx_icastP_s(vxc_half4 a) _RET0_ + +/* half */ +_EXT_ vxc_half8 viv_intrinsic_vx_icastP_h(vxc_uchar16 a) _RET0_ +_EXT_ vxc_half4 viv_intrinsic_vx_icastP_h(vxc_uchar8 a) _RET0_ + +_EXT_ vxc_half8 viv_intrinsic_vx_icastP_h(vxc_char16 a) _RET0_ +_EXT_ vxc_half4 viv_intrinsic_vx_icastP_h(vxc_char8 a) _RET0_ + +_EXT_ vxc_half8 viv_intrinsic_vx_icastP_h(vxc_ushort8 a) _RET0_ +_EXT_ vxc_half4 viv_intrinsic_vx_icastP_h(vxc_ushort4 a) _RET0_ + +_EXT_ vxc_half8 viv_intrinsic_vx_icastP_h(vxc_short8 a) _RET0_ +_EXT_ vxc_half4 viv_intrinsic_vx_icastP_h(vxc_short4 a) _RET0_ + + +/* implicit cast for vx_inst dest */ +/* uchar */ +_EXT_ vxc_uchar16 viv_intrinsic_vx_icastD_uc(vxc_char16 a) _RET0_ +_EXT_ vxc_uchar8 viv_intrinsic_vx_icastD_uc(vxc_char8 a) _RET0_ + +_EXT_ vxc_uchar8 viv_intrinsic_vx_icastD_uc(vxc_short8 a) _RET0_ +_EXT_ vxc_uchar4 viv_intrinsic_vx_icastD_uc(vxc_short4 a) _RET0_ + +_EXT_ vxc_uchar8 viv_intrinsic_vx_icastD_uc(vxc_ushort8 a) _RET0_ +_EXT_ vxc_uchar4 viv_intrinsic_vx_icastD_uc(vxc_ushort4 a) _RET0_ + +_EXT_ vxc_uchar8 viv_intrinsic_vx_icastD_uc(vxc_half8 a) _RET0_ +_EXT_ vxc_uchar4 viv_intrinsic_vx_icastD_uc(vxc_half4 a) _RET0_ + +_EXT_ vxc_uchar4 viv_intrinsic_vx_icastD_uc(vxc_int4 a) _RET0_ +_EXT_ vxc_uchar2 viv_intrinsic_vx_icastD_uc(vxc_int2 a) _RET0_ + +_EXT_ vxc_uchar4 viv_intrinsic_vx_icastD_uc(vxc_uint4 a) _RET0_ +_EXT_ vxc_uchar2 viv_intrinsic_vx_icastD_uc(vxc_uint2 a) _RET0_ + +/* char */ +_EXT_ vxc_char16 viv_intrinsic_vx_icastD_c(vxc_uchar16 a) _RET0_ +_EXT_ vxc_char8 viv_intrinsic_vx_icastD_c(vxc_uchar8 a) _RET0_ + +_EXT_ vxc_char8 viv_intrinsic_vx_icastD_c(vxc_short8 a) _RET0_ +_EXT_ vxc_char4 viv_intrinsic_vx_icastD_c(vxc_short4 a) _RET0_ + +_EXT_ vxc_char8 viv_intrinsic_vx_icastD_c(vxc_ushort8 a) _RET0_ +_EXT_ vxc_char4 viv_intrinsic_vx_icastD_c(vxc_ushort4 a) 
_RET0_ + +_EXT_ vxc_char8 viv_intrinsic_vx_icastD_c(vxc_half8 a) _RET0_ +_EXT_ vxc_char4 viv_intrinsic_vx_icastD_c(vxc_half4 a) _RET0_ + +_EXT_ vxc_char4 viv_intrinsic_vx_icastD_c(vxc_int4 a) _RET0_ +_EXT_ vxc_char2 viv_intrinsic_vx_icastD_c(vxc_int2 a) _RET0_ + +_EXT_ vxc_char4 viv_intrinsic_vx_icastD_c(vxc_uint4 a) _RET0_ +_EXT_ vxc_char2 viv_intrinsic_vx_icastD_c(vxc_uint2 a) _RET0_ + +/* ushort */ +_EXT_ vxc_ushort8 viv_intrinsic_vx_icastD_us(vxc_uchar8 a) _RET0_ +_EXT_ vxc_ushort4 viv_intrinsic_vx_icastD_us(vxc_uchar4 a) _RET0_ + +_EXT_ vxc_ushort8 viv_intrinsic_vx_icastD_us(vxc_char8 a) _RET0_ +_EXT_ vxc_ushort4 viv_intrinsic_vx_icastD_us(vxc_char4 a) _RET0_ + +_EXT_ vxc_ushort8 viv_intrinsic_vx_icastD_us(vxc_short8 a) _RET0_ +_EXT_ vxc_ushort4 viv_intrinsic_vx_icastD_us(vxc_short4 a) _RET0_ + +_EXT_ vxc_ushort8 viv_intrinsic_vx_icastD_us(vxc_half8 a) _RET0_ +_EXT_ vxc_ushort4 viv_intrinsic_vx_icastD_us(vxc_half4 a) _RET0_ + +_EXT_ vxc_ushort4 viv_intrinsic_vx_icastD_us(vxc_int4 a) _RET0_ +_EXT_ vxc_ushort2 viv_intrinsic_vx_icastD_us(vxc_int2 a) _RET0_ + +_EXT_ vxc_ushort4 viv_intrinsic_vx_icastD_us(vxc_uint4 a) _RET0_ +_EXT_ vxc_ushort2 viv_intrinsic_vx_icastD_us(vxc_uint2 a) _RET0_ + +/* short */ +_EXT_ vxc_short8 viv_intrinsic_vx_icastD_s(vxc_uchar8 a) _RET0_ +_EXT_ vxc_short4 viv_intrinsic_vx_icastD_s(vxc_uchar4 a) _RET0_ + +_EXT_ vxc_short8 viv_intrinsic_vx_icastD_s(vxc_char8 a) _RET0_ +_EXT_ vxc_short4 viv_intrinsic_vx_icastD_s(vxc_char4 a) _RET0_ + +_EXT_ vxc_short8 viv_intrinsic_vx_icastD_s(vxc_ushort8 a) _RET0_ +_EXT_ vxc_short4 viv_intrinsic_vx_icastD_s(vxc_ushort4 a) _RET0_ + +_EXT_ vxc_short8 viv_intrinsic_vx_icastD_s(vxc_half8 a) _RET0_ +_EXT_ vxc_short4 viv_intrinsic_vx_icastD_s(vxc_half4 a) _RET0_ + +_EXT_ vxc_short4 viv_intrinsic_vx_icastD_s(vxc_int4 a) _RET0_ +_EXT_ vxc_short2 viv_intrinsic_vx_icastD_s(vxc_int2 a) _RET0_ + +_EXT_ vxc_short4 viv_intrinsic_vx_icastD_s(vxc_uint4 a) _RET0_ +_EXT_ vxc_short2 viv_intrinsic_vx_icastD_s(vxc_uint2 a) _RET0_ + +/* half */ +_EXT_ vxc_half8 viv_intrinsic_vx_icastD_h(vxc_uchar8 a) _RET0_ +_EXT_ vxc_half4 viv_intrinsic_vx_icastD_h(vxc_uchar4 a) _RET0_ + +_EXT_ vxc_half8 viv_intrinsic_vx_icastD_h(vxc_char8 a) _RET0_ +_EXT_ vxc_half4 viv_intrinsic_vx_icastD_h(vxc_char4 a) _RET0_ + +_EXT_ vxc_half8 viv_intrinsic_vx_icastD_h(vxc_ushort8 a) _RET0_ +_EXT_ vxc_half4 viv_intrinsic_vx_icastD_h(vxc_ushort4 a) _RET0_ + +_EXT_ vxc_half8 viv_intrinsic_vx_icastD_h(vxc_short8 a) _RET0_ +_EXT_ vxc_half4 viv_intrinsic_vx_icastD_h(vxc_short4 a) _RET0_ + +_EXT_ vxc_half4 viv_intrinsic_vx_icastD_h(vxc_int4 a) _RET0_ +_EXT_ vxc_half2 viv_intrinsic_vx_icastD_h(vxc_int2 a) _RET0_ + +_EXT_ vxc_half4 viv_intrinsic_vx_icastD_h(vxc_uint4 a) _RET0_ +_EXT_ vxc_half2 viv_intrinsic_vx_icastD_h(vxc_uint2 a) _RET0_ + +/* int32 */ +_EXT_ vxc_int4 viv_intrinsic_vx_icastD_i(vxc_char4 a) _RET0_ +_EXT_ vxc_int4 viv_intrinsic_vx_icastD_i(vxc_uchar4 a) _RET0_ +_EXT_ vxc_int4 viv_intrinsic_vx_icastD_i(vxc_short4 a) _RET0_ +_EXT_ vxc_int4 viv_intrinsic_vx_icastD_i(vxc_ushort4 a) _RET0_ +_EXT_ vxc_int4 viv_intrinsic_vx_icastD_i(vxc_half4 a) _RET0_ +_EXT_ vxc_int4 viv_intrinsic_vx_icastD_i(vxc_uint4 a) _RET0_ +_EXT_ vxc_int4 viv_intrinsic_vx_icastD_i(vxc_float4 a) _RET0_ + +_EXT_ vxc_int2 viv_intrinsic_vx_icastD_i(vxc_char2 a) _RET0_ +_EXT_ vxc_int2 viv_intrinsic_vx_icastD_i(vxc_uchar2 a) _RET0_ +_EXT_ vxc_int2 viv_intrinsic_vx_icastD_i(vxc_short2 a) _RET0_ +_EXT_ vxc_int2 viv_intrinsic_vx_icastD_i(vxc_ushort2 a) _RET0_ +_EXT_ vxc_int2 viv_intrinsic_vx_icastD_i(vxc_half2 a) 
_RET0_ +_EXT_ vxc_int2 viv_intrinsic_vx_icastD_i(vxc_uint2 a) _RET0_ +_EXT_ vxc_int2 viv_intrinsic_vx_icastD_i(vxc_float2 a) _RET0_ + +/* uint32 */ +_EXT_ vxc_uint4 viv_intrinsic_vx_icastD_ui(vxc_char4 a) _RET0_ +_EXT_ vxc_uint4 viv_intrinsic_vx_icastD_ui(vxc_uchar4 a) _RET0_ +_EXT_ vxc_uint4 viv_intrinsic_vx_icastD_ui(vxc_short4 a) _RET0_ +_EXT_ vxc_uint4 viv_intrinsic_vx_icastD_ui(vxc_ushort4 a) _RET0_ +_EXT_ vxc_uint4 viv_intrinsic_vx_icastD_ui(vxc_half4 a) _RET0_ +_EXT_ vxc_uint4 viv_intrinsic_vx_icastD_ui(vxc_int4 a) _RET0_ +_EXT_ vxc_uint4 viv_intrinsic_vx_icastD_ui(vxc_float4 a) _RET0_ + +_EXT_ vxc_uint2 viv_intrinsic_vx_icastD_ui(vxc_char2 a) _RET0_ +_EXT_ vxc_uint2 viv_intrinsic_vx_icastD_ui(vxc_uchar2 a) _RET0_ +_EXT_ vxc_uint2 viv_intrinsic_vx_icastD_ui(vxc_short2 a) _RET0_ +_EXT_ vxc_uint2 viv_intrinsic_vx_icastD_ui(vxc_ushort2 a) _RET0_ +_EXT_ vxc_uint2 viv_intrinsic_vx_icastD_ui(vxc_half2 a) _RET0_ +_EXT_ vxc_uint2 viv_intrinsic_vx_icastD_ui(vxc_int2 a) _RET0_ +_EXT_ vxc_uint2 viv_intrinsic_vx_icastD_ui(vxc_float2 a) _RET0_ + +/* float32 */ +_EXT_ vxc_float4 viv_intrinsic_vx_icastD_f(vxc_char4 a) _RET0_ +_EXT_ vxc_float4 viv_intrinsic_vx_icastD_f(vxc_uchar4 a) _RET0_ +_EXT_ vxc_float4 viv_intrinsic_vx_icastD_f(vxc_short4 a) _RET0_ +_EXT_ vxc_float4 viv_intrinsic_vx_icastD_f(vxc_ushort4 a) _RET0_ +_EXT_ vxc_float4 viv_intrinsic_vx_icastD_f(vxc_half4 a) _RET0_ +_EXT_ vxc_float4 viv_intrinsic_vx_icastD_f(vxc_int4 a) _RET0_ + +_EXT_ vxc_float2 viv_intrinsic_vx_icastD_f(vxc_char2 a) _RET0_ +_EXT_ vxc_float2 viv_intrinsic_vx_icastD_f(vxc_uchar2 a) _RET0_ +_EXT_ vxc_float2 viv_intrinsic_vx_icastD_f(vxc_short2 a) _RET0_ +_EXT_ vxc_float2 viv_intrinsic_vx_icastD_f(vxc_ushort2 a) _RET0_ +_EXT_ vxc_float2 viv_intrinsic_vx_icastD_f(vxc_half2 a) _RET0_ +_EXT_ vxc_float2 viv_intrinsic_vx_icastD_f(vxc_int2 a) _RET0_ + +/* data selection */ +/* swizzle: + * 16 elements: i E [0-15], if mask[i] == '1 then result[i] = a[ swzl[i*4 : i*4 + 3] ]; + * 8 elements: i E [0-7], if mask[2*i : 2*i+1] == '11 then result[i] = a[ swzl[i*4 : i*4 + 3] & 0x7 ]; + */ +#define VXC_SWIZZLE_MASK8_ALL() 0xFFFF +#define VXC_SWIZZLE_MASK8(E0, E1, E2, E3, E4, E5, E6, E7) \ + (((E0) * 0x3) | ((E1) * (0x3 << 2)) | ((E2) * (0x3 << 4)) | \ + ((E3) * (0x3 << 6)) | ((E4) * (0x3 << 8)) | ((E5) * (0x3 << 10)) | \ + ((E6) * (0x3 << 12)) | ((E7) * (0x3 << 14)) ) + +#define VXC_SWIZZLE_MASK16_ALL() 0xFFFF +#define VXC_SWIZZLE_MASK16(E0, E1, E2, E3, E4, E5, E6, E7, E9, E10, E11, E12, E13, E14, E15) \ + (((E0) * 0x1) | ((E1) * (0x1 << 1)) | ((E2) * (0x1 << 2)) | \ + ((E3) * (0x1 << 1)) | ((E4) * (0x1 << 4)) | ((E5) * (0x1 << 5)) | \ + ((E6) * (0x1 << 6)) | ((E7) * (0x1 << 7)) | ((E8) * (0x1 << 8)) | \ + ((E9) * (0x1 << 9)) | ((E10) * (0x1 << 10)) | ((E11) * (0x1 << 11)) | \ + ((E12) * (0x1 << 12)) | ((E13) * (0x1 << 13)) | ((E14) * (0x1 << 14)) | \ + ((E15) * (0x1 << 15)) ) +/* E0 - E15 must be 0 or 1 */ +#define VXC_SWIZZLE8(S0, S1, S2, S3, S4, S5, S6, S7) \ + (uint)((S0) << 0 | (S1) << 4 | (S2) << 8 | (S3) << 12 | \ + (S4) << 16 | (S5) << 20 | (S6) << 24 | (S7) << 28 ) + + +_EXT_ vxc_char16 viv_intrinsic_vx_read_imagec (image2d_t image, int2 coord) _RET0_ +_EXT_ vxc_uchar16 viv_intrinsic_vx_read_imageuc (image2d_t image, int2 coord) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vx_read_images (image2d_t image, int2 coord) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vx_read_imageus (image2d_t image, int2 coord) _RET0_ +_EXT_ vxc_half8 viv_intrinsic_vx_read_imageh (image2d_t image, int2 coord) _RET0_ + +_EXT_ vxc_char16 
viv_intrinsic_vx_read_imagec (image1d_t image, int coord) _RET0_ +_EXT_ vxc_uchar16 viv_intrinsic_vx_read_imageuc (image1d_t image, int coord) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vx_read_images (image1d_t image, int coord) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vx_read_imageus (image1d_t image, int coord) _RET0_ +_EXT_ vxc_half8 viv_intrinsic_vx_read_imageh (image1d_t image, int coord) _RET0_ + +_EXT_ vxc_char16 viv_intrinsic_vx_read_imagec (image1d_array_t image, int2 coord) _RET0_ +_EXT_ vxc_uchar16 viv_intrinsic_vx_read_imageuc (image1d_array_t image, int2 coord) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vx_read_images (image1d_array_t image, int2 coord) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vx_read_imageus (image1d_array_t image, int2 coord) _RET0_ +_EXT_ vxc_half8 viv_intrinsic_vx_read_imageh (image1d_array_t image, int2 coord) _RET0_ + +/* image write */ +_EXT_ void viv_intrinsic_vx_write_imagec (image2d_t image, int2 coord, vxc_char16 color) _RET_ +_EXT_ void viv_intrinsic_vx_write_imageuc (image2d_t image, int2 coord, vxc_uchar16 color) _RET_ +_EXT_ void viv_intrinsic_vx_write_images (image2d_t image, int2 coord, vxc_short8 color) _RET_ +_EXT_ void viv_intrinsic_vx_write_imageus (image2d_t image, int2 coord, vxc_ushort8 color) _RET_ +_EXT_ void viv_intrinsic_vx_write_imageh (image2d_t image, int2 coord, vxc_half8 color) _RET_ + +_EXT_ void viv_intrinsic_vx_write_imagec (image1d_t image, int coord, vxc_char16 color) _RET_ +_EXT_ void viv_intrinsic_vx_write_imageuc (image1d_t image, int coord, vxc_uchar16 color) _RET_ +_EXT_ void viv_intrinsic_vx_write_images (image1d_t image, int coord, vxc_short8 color) _RET_ +_EXT_ void viv_intrinsic_vx_write_imageus (image1d_t image, int coord, vxc_ushort8 color) _RET_ +_EXT_ void viv_intrinsic_vx_write_imageh (image1d_t image, int coord, vxc_half8 color) _RET_ + +_EXT_ void viv_intrinsic_vx_write_imagec (image1d_array_t image, int2 coord, vxc_char16 color) _RET_ +_EXT_ void viv_intrinsic_vx_write_imageuc (image1d_array_t image, int2 coord, vxc_uchar16 color) _RET_ +_EXT_ void viv_intrinsic_vx_write_images (image1d_array_t image, int2 coord, vxc_short8 color) _RET_ +_EXT_ void viv_intrinsic_vx_write_imageus (image1d_array_t image, int2 coord, vxc_ushort8 color) _RET_ +_EXT_ void viv_intrinsic_vx_write_imageh (image1d_array_t image, int2 coord, vxc_half8 color) _RET_ + +/* AbsDiff + * + * Syntax: + * r = AbsDiff(a, b) ; + * + * Semantics: + * r[i] = |a[i] - b[i]| ; i E [0, elem(r) ) + */ +_EXT_ vxc_uchar16 viv_intrinsic_vx_AbsDiff_uc(vxc_uchar16 a, vxc_uchar16 b) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vx_AbsDiff_c(vxc_char16 a, vxc_char16 b) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vx_AbsDiff_s(vxc_short8 a, vxc_short8 b) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vx_AbsDiff_us(vxc_ushort8 a, vxc_ushort8 b) _RET0_ + +/* IAdd + * + * Syntax: + * r = IAdd(a, b, c) ; + * + * Semantics: + * r[i] = a[i] + b[i] + c[i] ; i E [0, elem(r) ) + */ +_EXT_ vxc_uchar16 viv_intrinsic_vx_IAdd_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_uchar16 c) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vx_IAdd_c(vxc_char16 a, vxc_char16 b, vxc_char16 c) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vx_IAdd_s(vxc_short8 a, vxc_short8 b, vxc_short8 c) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vx_IAdd_us(vxc_ushort8 a, vxc_ushort8 b, vxc_ushort8 c) _RET0_ + +/* IAccSq: squares a value and adds it to an accumulator + * + * Syntax: + * r = IAccSq(a, b, Imm) ; // Imm must be an immediate value + * + * Semantics: + * r[i] = a[i] + (b[i]^2 >> Imm) ; i E [0, elem(r) ) + */ +_EXT_ vxc_uchar16 
viv_intrinsic_vx_IAccSq_uc(vxc_uchar16 a, vxc_uchar16 b, uint Imm) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vx_IAccSq_c(vxc_char16 a, vxc_char16 b, uint Imm) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vx_IAccSq_s(vxc_short8 a, vxc_short8 b, uint Imm) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vx_IAccSq_us(vxc_ushort8 a, vxc_ushort8 b, uint Imm) _RET0_ + +/* Lerp: linear interpolation between two values + * + * Syntax: + * r = Lerp(a, b, c) ; + * + * Semantics: + * r[i] = (1.0 - c) * a[i] + c * b[i] ; i E [0, elem(r) ) + */ +_EXT_ vxc_uchar16 viv_intrinsic_vx_Lerp_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_float c) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vx_Lerp_c(vxc_char16 a, vxc_char16 b, vxc_float c) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vx_Lerp_s(vxc_short8 a, vxc_short8 b, vxc_float c) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vx_Lerp_us(vxc_ushort8 a, vxc_ushort8 b, vxc_float c) _RET0_ + +/* Filter: performs a specific filter on a 3x3 pixel block + * + * Syntax: + * r = Filter(a, b, c, f) ; f E { BOX, Guassian, SobelX, SobelY, + * ScharrX, ScharrY, Max, Min, Median } + * + * Semantics: + * S[i] = { {a[i], a[i+1], a[i+2]}, + {b[i], b[i+1], b[i+2]}, + {c[i], c[i+1], c[i+2]} } ; + * r[i] = (*f)(U, S[i]) ; i E [0, elem(r) - 2 ) + */ +_EXT_ vxc_uchar16 viv_intrinsic_vx_Filter_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_uchar16 c, vxc_filter_mode f) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vx_Filter_c(vxc_char16 a, vxc_char16 b, vxc_char16 c, vxc_filter_mode f) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vx_Filter_s(vxc_short8 a, vxc_short8 b, vxc_short8 c, vxc_filter_mode f) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vx_Filter_us(vxc_ushort8 a, vxc_ushort8 b, vxc_ushort8 c, vxc_filter_mode f) _RET0_ + +/* MagPhase: computes the magnitude and phase of two incoming values. + * + * Syntax: + * r = MagPhase(a, b) ; + * + * Semantics: + * r[i] = sqrt(a[i]^2 + b[i]^2) ; i E [0, 3] + * r[4+i] = arctan(b[i] / a[i]) ; + */ +_EXT_ vxc_uchar16 viv_intrinsic_vx_MagPhase_uc(vxc_uchar16 a, vxc_uchar16 b) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vx_MagPhase_c(vxc_char16 a, vxc_char16 b) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vx_MagPhase_s(vxc_short8 a, vxc_short8 b) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vx_MagPhase_us(vxc_ushort8 a, vxc_ushort8 b) _RET0_ + +/* MulShift: Multiples two 8- or 16-bit integers and shifts + * + * Syntax: + * r = MulShift(a, b, Imm) ; // Imm must be an immediate value + * + * Semantics: + * r[i] = (a[i] * b[i]) >> Imm ; i E [0, elem(r) ) + */ +_EXT_ vxc_uchar16 viv_intrinsic_vx_MulShift_uc(vxc_uchar16 a, vxc_uchar16 b, uint Imm) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vx_MulShift_c(vxc_char16 a, vxc_char16 b, uint Imm) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vx_MulShift_s(vxc_short8 a, vxc_short8 b, uint Imm) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vx_MulShift_us(vxc_ushort8 a, vxc_ushort8 b, uint Imm) _RET0_ + +/* Clamp: clamps up to 16 values to a min and.or max value + * + * Syntax: + * r = Clamp(a, b, c) ; + * r = ClampBoolean(a, b, c) ; // boolean mode + * Semantics: + * r[i] = clamp(a[i], b[i], c[i]) ; i E [0, elem(r) ) + * + * In boolean mode it will write a 0 in the result if the value + * is inside the specified min/max range, otherwise all 1\92s will + * be written to the result. 
+ */ +_EXT_ vxc_uchar16 viv_intrinsic_vx_Clamp_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_uchar16 c) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vx_Clamp_c(vxc_char16 a, vxc_char16 b, vxc_char16 c) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vx_Clamp_s(vxc_short8 a, vxc_short8 b, vxc_short8 c) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vx_Clamp_us(vxc_ushort8 a, vxc_ushort8 b, vxc_ushort8 c) _RET0_ + +_EXT_ vxc_uchar16 viv_intrinsic_vx_ClampBoolean_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_uchar16 c) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vx_ClampBoolean_c(vxc_char16 a, vxc_char16 b, vxc_char16 c) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vx_ClampBoolean_s(vxc_short8 a, vxc_short8 b, vxc_short8 c) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vx_ClampBoolean_us(vxc_ushort8 a, vxc_ushort8 b, vxc_ushort8 c) _RET0_ + +/* BiLinear: computes a bi-linear interpolation of 4 pixel values. + * + * Syntax: + * r = BiLinear(a, b, c) ; + * Semantics: + * r[i] = a[i] * (1 ? c.x) * (1 ? c.y) + * + a[i+1] * c.x * (1 ? c.y) + * + b[i] * (1 ? c.x) * c.y + * + b[i+1] * c.x * c.y + */ +_EXT_ vxc_uchar16 viv_intrinsic_vx_BiLinear_uc(vxc_uchar16 a, vxc_uchar16 b, float2 c) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vx_BiLinear_c(vxc_char16 a, vxc_char16 b, float2 c) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vx_BiLinear_s(vxc_short8 a, vxc_short8 b, float2 c) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vx_BiLinear_us(vxc_ushort8 a, vxc_ushort8 b, float2 c) _RET0_ + + +/* SelectAdd: either adds the pixel value or increments a counter + * inside a number of distribution (histogram) bins + * Syntax: + * r = SelectAdd(a, b, c, r) ; + * Semantics: + * r[i] = a[c[i]] + b[c[i]] ; i E [0, 7] + */ +_EXT_ vxc_uchar16 viv_intrinsic_vx_SelectAdd_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_512bits c) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vx_SelectAdd_c(vxc_char16 a, vxc_char16 b, vxc_512bits c) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vx_SelectAdd_s(vxc_short8 a, vxc_short8 b, vxc_512bits c) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vx_SelectAdd_us(vxc_ushort8 a, vxc_ushort8 b, vxc_512bits c) _RET0_ + +/* AtomicAdd: adds a valid atomically to a given address. + * It is infact a read/modify/write instruction + * that executes atomically + * + * Syntax: + * AtomicAdd(a, b, c) ; // a -> base, b -> offset, c -> add value + * Semantics: + * (a + offset)[i] = (a + offset)[i] + c[j]; i E [0, 7] + */ +_EXT_ vxc_uchar16 viv_intrinsic_vx_AtomicAdd_uc(vxc_uchar16 * a, vxc_int offset, vxc_uchar16 c) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vx_AtomicAdd_c(vxc_char16 * a, vxc_int offset, vxc_char16 c) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vx_AtomicAdd_s(vxc_short8 * a, vxc_int offset, vxc_short8 c) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vx_AtomicAdd_us(vxc_ushort8 * a, vxc_int offset, vxc_ushort8 c) _RET0_ + +/* BitExtract: extracts up to 8 bitfields from a packed data stream. + * The input is is a 256-bit blob of data. + * + * Syntax: + * r = BitExtract(a, b, c) ; + * Semantics: + * r[i] = ({b, a} >> c[i]) ^ ((1 << c[i+8]) ? 1) ; i E [0, 7] + */ +_EXT_ vxc_ushort8 viv_intrinsic_vx_BitExtract_us(vxc_ushort8 a, vxc_ushort8 b, vxc_uchar16 c) _RET0_ +_EXT_ vxc_uchar8 viv_intrinsic_vx_BitExtract_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_uchar16 c) _RET0_ + +/* BitReplace: replaces up to 8 bitfields inside a packed data stream. 
+ * Syntax: + * r = BitReplace(a, b, c) ; + * Semantics: + */ +_EXT_ vxc_ushort8 viv_intrinsic_vx_BitReplace_us(vxc_ushort8 a, vxc_ushort8 b, vxc_uchar16 c) _RET0_ +_EXT_ vxc_uchar16 viv_intrinsic_vx_BitReplace_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_uchar16 c) _RET0_ + +/* direct mapping to machine code, with bin and rounding mode info */ +/* samplerless image read */ + +/* offsetXY should be composed by using VXC_5BITOFFSET_XY(x, y) */ +/* offsetXY [ 4: 0] S05 relative x offset + * [ 9: 5] S05 relative y offset + */ +_EXT_ vxc_char16 viv_intrinsic_vxmc_read_imagec (image2d_t image, int2 coord, int offsetXY, vxc_modifier modifier) _RET0_ +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_read_imageuc (image2d_t image, int2 coord, int offsetXY, vxc_modifier modifier) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_read_images (image2d_t image, int2 coord, int offsetXY, vxc_modifier modifier) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_read_imageus (image2d_t image, int2 coord, int offsetXY, vxc_modifier modifier) _RET0_ +_EXT_ vxc_half8 viv_intrinsic_vxmc_read_imageh (image2d_t image, int2 coord, int offsetXY, vxc_modifier modifier) _RET0_ + +_EXT_ vxc_char16 viv_intrinsic_vxmc_read_imagec (image1d_t image, int coord, int offsetX, vxc_modifier modifier) _RET0_ +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_read_imageuc (image1d_t image, int coord, int offsetX, vxc_modifier modifier) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_read_images (image1d_t image, int coord, int offsetX, vxc_modifier modifier) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_read_imageus (image1d_t image, int coord, int offsetX, vxc_modifier modifier) _RET0_ +_EXT_ vxc_half8 viv_intrinsic_vxmc_read_imageh (image1d_t image, int coord, int offsetX, vxc_modifier modifier) _RET0_ + +_EXT_ vxc_char16 viv_intrinsic_vxmc_read_imagec (image1d_array_t image, int2 coord, int offsetXY, vxc_modifier modifier) _RET0_ +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_read_imageuc (image1d_array_t image, int2 coord, int offsetXY, vxc_modifier modifier) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_read_images (image1d_array_t image, int2 coord, int offsetXY, vxc_modifier modifier) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_read_imageus (image1d_array_t image, int2 coord, int offsetXY, vxc_modifier modifier) _RET0_ +_EXT_ vxc_half8 viv_intrinsic_vxmc_read_imageh (image1d_array_t image, int2 coord, int offsetXY, vxc_modifier modifier) _RET0_ + +/* image write */ +_EXT_ void viv_intrinsic_vxmc_write_imagec (image2d_t image, int2 coord, vxc_char16 color, vxc_modifier modifier) _RET_ +_EXT_ void viv_intrinsic_vxmc_write_imageuc (image2d_t image, int2 coord, vxc_uchar16 color, vxc_modifier modifier) _RET_ +_EXT_ void viv_intrinsic_vxmc_write_images (image2d_t image, int2 coord, vxc_short8 color, vxc_modifier modifier) _RET_ +_EXT_ void viv_intrinsic_vxmc_write_imageus (image2d_t image, int2 coord, vxc_ushort8 color, vxc_modifier modifier) _RET_ +_EXT_ void viv_intrinsic_vxmc_write_imageh (image2d_t image, int2 coord, vxc_half8 color, vxc_modifier modifier) _RET_ + +_EXT_ void viv_intrinsic_vxmc_write_imagec (image1d_t image, int coord, vxc_char16 color, vxc_modifier modifier) _RET_ +_EXT_ void viv_intrinsic_vxmc_write_imageuc (image1d_t image, int coord, vxc_uchar16 color, vxc_modifier modifier) _RET_ +_EXT_ void viv_intrinsic_vxmc_write_images (image1d_t image, int coord, vxc_short8 color, vxc_modifier modifier) _RET_ +_EXT_ void viv_intrinsic_vxmc_write_imageus (image1d_t image, int coord, vxc_ushort8 color, vxc_modifier modifier) _RET_ +_EXT_ void 
viv_intrinsic_vxmc_write_imageh (image1d_t image, int coord, vxc_half8 color, vxc_modifier modifier) _RET_ + +_EXT_ void viv_intrinsic_vxmc_write_imagec (image1d_array_t image, int2 coord, vxc_char16 color, vxc_modifier modifier) _RET_ +_EXT_ void viv_intrinsic_vxmc_write_imageuc (image1d_array_t image, int2 coord, vxc_uchar16 color, vxc_modifier modifier) _RET_ +_EXT_ void viv_intrinsic_vxmc_write_images (image1d_array_t image, int2 coord, vxc_short8 color, vxc_modifier modifier) _RET_ +_EXT_ void viv_intrinsic_vxmc_write_imageus (image1d_array_t image, int2 coord, vxc_ushort8 color, vxc_modifier modifier) _RET_ +_EXT_ void viv_intrinsic_vxmc_write_imageh (image1d_array_t image, int2 coord, vxc_half8 color, vxc_modifier modifier) _RET_ + +/* AbsDiff */ +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_AbsDiff_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_modifier modifier) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vxmc_AbsDiff_c(vxc_char16 a, vxc_char16 b, vxc_modifier modifier) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_AbsDiff_s(vxc_short8 a, vxc_short8 b, vxc_modifier modifier) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_AbsDiff_us(vxc_ushort8 a, vxc_ushort8 b, vxc_modifier modifier) _RET0_ + +/* IAdd */ +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_IAdd_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_uchar16 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vxmc_IAdd_c(vxc_char16 a, vxc_char16 b, vxc_char16 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_IAdd_s(vxc_short8 a, vxc_short8 b, vxc_short8 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_IAdd_us(vxc_ushort8 a, vxc_ushort8 b, vxc_ushort8 c, vxc_modifier modifier) _RET0_ + +/* IAccSq */ +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_IAccSq_uc(vxc_uchar16 a, vxc_uchar16 b, uint Imm, vxc_modifier modifier) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vxmc_IAccSq_c(vxc_char16 a, vxc_char16 b, uint Imm, vxc_modifier modifier) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_IAccSq_s(vxc_short8 a, vxc_short8 b, uint Imm, vxc_modifier modifier) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_IAccSq_us(vxc_ushort8 a, vxc_ushort8 b, uint Imm, vxc_modifier modifier) _RET0_ + +/* Lerp */ +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_Lerp_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_float c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vxmc_Lerp_c(vxc_char16 a, vxc_char16 b, vxc_float c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_Lerp_s(vxc_short8 a, vxc_short8 b, vxc_float c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_Lerp_us(vxc_ushort8 a, vxc_ushort8 b, vxc_float c, vxc_modifier modifier) _RET0_ + +/* Filter */ +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_Filter_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_uchar16 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vxmc_Filter_c(vxc_char16 a, vxc_char16 b, vxc_char16 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_Filter_s(vxc_short8 a, vxc_short8 b, vxc_short8 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_Filter_us(vxc_ushort8 a, vxc_ushort8 b, vxc_ushort8 c, vxc_modifier modifier) _RET0_ + +/* MagPhase */ +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_MagPhase_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_modifier modifier) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vxmc_MagPhase_c(vxc_char16 a, vxc_char16 b, vxc_modifier modifier) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_MagPhase_s(vxc_short8 a, vxc_short8 b, vxc_modifier modifier) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_MagPhase_us(vxc_ushort8 a, 
vxc_ushort8 b, vxc_modifier modifier) _RET0_ + +/* MulShift */ +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_MulShift_uc(vxc_uchar16 a, vxc_uchar16 b, uint Imm, vxc_modifier modifier) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vxmc_MulShift_c(vxc_char16 a, vxc_char16 b, uint Imm, vxc_modifier modifier) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_MulShift_s(vxc_short8 a, vxc_short8 b, uint Imm, vxc_modifier modifier) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_MulShift_us(vxc_ushort8 a, vxc_ushort8 b, uint Imm, vxc_modifier modifier) _RET0_ + +/* DP16x1: performs a dot-product of two 16-component values + * + * Syntax: + * r = DP16x1(a, b); + * + * Semantics: + * r = Sigma(a[i] * b[i]); i E [0, 15] + */ +_EXT_ vxc_uint viv_intrinsic_vxmc_DP16x1(vxc_uchar16 a, vxc_uchar16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_int viv_intrinsic_vxmc_DP16x1(vxc_char16 a, vxc_char16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_float viv_intrinsic_vxmc_DP16x1(vxc_half8 a, vxc_half8 b, vxc_modifier modifier, vxc_512bits u) _RET0_ + +/* DP8x2: performs two dot-product of two 8-component values. + * + * Syntax: + * r = DP8x2(a, b); + * + * Semantics: + * r[0] = Sigma(a[i] * b[i]); i E [0, 7] + * r[1] = Sigma(a[i] * b[i]); i E [8, 15] + */ +_EXT_ vxc_uint2 viv_intrinsic_vxmc_DP8x2(vxc_uchar16 a, vxc_uchar16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_int2 viv_intrinsic_vxmc_DP8x2(vxc_char16 a, vxc_char16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_float2 viv_intrinsic_vxmc_DP8x2(vxc_half8 a, vxc_half8 b, vxc_modifier modifier, vxc_512bits u) _RET0_ + +/* DP4x4: performs four dot-product of two 4-component values. + * + * Syntax: + * r = DP4x4(a, b); + * + * Semantics: + * r[0] = Sigma(a[i] * b[i]); i E [0, 3] + * r[1] = Sigma(a[i] * b[i]); i E [4, 7] + * r[2] = Sigma(a[i] * b[i]); i E [8, 11] + * r[3] = Sigma(a[i] * b[i]); i E [12, 15] + */ +_EXT_ vxc_uint4 viv_intrinsic_vxmc_DP4x4(vxc_uchar16 a, vxc_uchar16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_int4 viv_intrinsic_vxmc_DP4x4(vxc_char16 a, vxc_char16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_float4 viv_intrinsic_vxmc_DP4x4(vxc_half8 a, vxc_half8 b, vxc_modifier modifier, vxc_512bits u) _RET0_ + +/* DP2x8: performs eight dot-product of two 2-component values. 
+ * + * Syntax: + * r = DP2x8(a, b); + * + * Semantics: + * r[0] = Sigma(a[i] * b[i]); i E [0, 1] + * r[1] = Sigma(a[i] * b[i]); i E [2, 3] + * r[2] = Sigma(a[i] * b[i]); i E [4, 5] + * r[3] = Sigma(a[i] * b[i]); i E [6, 7] + * r[4] = Sigma(a[i] * b[i]); i E [8, 9] + * r[5] = Sigma(a[i] * b[i]); i E [10, 11] + * r[6] = Sigma(a[i] * b[i]); i E [12, 13] + * r[7] = Sigma(a[i] * b[i]); i E [14, 15] + */ +_EXT_ vxc_uchar8 viv_intrinsic_vxmc_DP2x8(vxc_uchar16 a, vxc_uchar16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_char8 viv_intrinsic_vxmc_DP2x8(vxc_char16 a, vxc_char16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_half8 viv_intrinsic_vxmc_DP2x8(vxc_half8 a, vxc_half8 b, vxc_modifier modifier, vxc_512bits u) _RET0_ + +/* DP32 dot constant */ +_EXT_ vxc_uint viv_intrinsic_vxmc_DP32x1(vxc_uchar16 a, vxc_uchar16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_int viv_intrinsic_vxmc_DP32x1(vxc_char16 a, vxc_char16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ + +_EXT_ vxc_uint2 viv_intrinsic_vxmc_DP16x2(vxc_uchar16 a, vxc_uchar16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_int2 viv_intrinsic_vxmc_DP16x2(vxc_char16 a, vxc_char16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ + +_EXT_ vxc_uint4 viv_intrinsic_vxmc_DP8x4(vxc_uchar16 a, vxc_uchar16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_int4 viv_intrinsic_vxmc_DP8x4(vxc_char16 a, vxc_char16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ + +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_DP4x8(vxc_uchar16 a, vxc_uchar16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_DP4x8(vxc_char16 a, vxc_char16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ + +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_DP2x16(vxc_uchar16 a, vxc_uchar16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vxmc_DP2x16(vxc_char16 a, vxc_char16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ + +/* DP32 dot c + * vxc_char32 a; + * vxc_char16 b; + * vxc_int result = viv_intrinsic_vxmc_DP32x1_b(a.hi, a.lo, b, modifier, u); + */ +_EXT_ vxc_uint viv_intrinsic_vxmc_DP32x1_b(vxc_uchar16 a_hi, vxc_uchar16 a_lo, vxc_uchar16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_int viv_intrinsic_vxmc_DP32x1_b(vxc_char16 a_hi, vxc_char16 a_lo, vxc_char16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ + +_EXT_ vxc_uint2 viv_intrinsic_vxmc_DP16x2_b(vxc_uchar16 a_hi, vxc_uchar16 a_lo, vxc_uchar16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_int2 viv_intrinsic_vxmc_DP16x2_b(vxc_char16 a_hi, vxc_char16 a_lo, vxc_char16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ + +_EXT_ vxc_uint4 viv_intrinsic_vxmc_DP8x4_b(vxc_uchar16 a_hi, vxc_uchar16 a_lo, vxc_uchar16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_int4 viv_intrinsic_vxmc_DP8x4_b(vxc_char16 a_hi, vxc_char16 a_lo, vxc_char16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ + +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_DP4x8_b(vxc_uchar16 a_hi, vxc_uchar16 a_lo, vxc_uchar16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_DP4x8_b(vxc_char16 a_hi, vxc_char16 a_lo, vxc_char16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ + +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_DP2x16_b(vxc_uchar16 a_hi, vxc_uchar16 a_lo, vxc_uchar16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vxmc_DP2x16_b(vxc_char16 a_hi, vxc_char16 a_lo, vxc_char16 b, vxc_modifier modifier, vxc_512bits u) _RET0_ + +/* Clamp */ +_EXT_ vxc_uchar16 
viv_intrinsic_vxmc_Clamp_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_uchar16 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vxmc_Clamp_c(vxc_char16 a, vxc_char16 b, vxc_char16 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_Clamp_s(vxc_short8 a, vxc_short8 b, vxc_short8 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_Clamp_us(vxc_ushort8 a, vxc_ushort8 b, vxc_ushort8 c, vxc_modifier modifier) _RET0_ + +/* BiLinear */ +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_BiLinear_uc(vxc_uchar16 a, vxc_uchar16 b, float2 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vxmc_BiLinear_c(vxc_char16 a, vxc_char16 b, float2 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_BiLinear_s(vxc_short8 a, vxc_short8 b, float2 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_BiLinear_us(vxc_ushort8 a, vxc_ushort8 b, float2 c, vxc_modifier modifier) _RET0_ + +/* SelectAdd */ +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_SelectAdd_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_512bits c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vxmc_SelectAdd_c(vxc_char16 a, vxc_char16 b, vxc_512bits c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_SelectAdd_s(vxc_short8 a, vxc_short8 b, vxc_512bits c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_SelectAdd_us(vxc_ushort8 a, vxc_ushort8 b, vxc_512bits c, vxc_modifier modifier) _RET0_ + +/* AtomicAdd */ +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_AtomicAdd_uc(vxc_uchar16 * a, vxc_int offset, vxc_uchar16 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_char16 viv_intrinsic_vxmc_AtomicAdd_c(vxc_char16 * a, vxc_int offset, vxc_char16 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_short8 viv_intrinsic_vxmc_AtomicAdd_s(vxc_short8 * a, vxc_int offset, vxc_short8 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_AtomicAdd_us(vxc_ushort8 * a, vxc_int offset, vxc_ushort8 c, vxc_modifier modifier) _RET0_ + +/* BitExtract */ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_BitExtract_us(vxc_ushort8 a, vxc_ushort8 b, vxc_uchar16 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_uchar8 viv_intrinsic_vxmc_BitExtract_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_uchar16 c, vxc_modifier modifier) _RET0_ + +/* BitReplace */ +_EXT_ vxc_ushort8 viv_intrinsic_vxmc_BitReplace_us(vxc_ushort8 a, vxc_ushort8 b, vxc_uchar16 c, vxc_modifier modifier) _RET0_ +_EXT_ vxc_uchar16 viv_intrinsic_vxmc_BitReplace_uc(vxc_uchar16 a, vxc_uchar16 b, vxc_uchar16 c, vxc_modifier modifier) _RET0_ + +/* vloadn: read packed vector type from memory as packed in register + where n takes the value of 2, 3, 4, 8, 16 + * Syntax: + * _viv_gentypen_packed dest; + * dest = vloadn(offset, gentype *p); + * Semantics: + * gentype is the generic type to indicate the built-in data types + * char, uchar, short, ushort. + * Return sizeof(_viv_gentypen) bytes of data read + * from address (p + n * offset). 
The + * address computed as (p + n * offset) must + * be 8-bit aligned if gentype is char, uchar; + * 16-bit aligned if gentype is short, ushort, half; + */ +vxc_char2 viv_intrinsic_vx_vload2(size_t Offset, char *Pointer) { + vxc_char2 dest; + VXC_OP2(vload2, dest, Pointer, Offset * sizeof(vxc_char2)); + return dest; +} + +vxc_char4 viv_intrinsic_vx_vload4(size_t Offset, char *Pointer) { + vxc_char4 dest; + VXC_OP2(vload4, dest, Pointer, Offset * sizeof(vxc_char4)); + return dest; +} + +vxc_char8 viv_intrinsic_vx_vload8(size_t Offset, char *Pointer) { + vxc_char8 dest; + VXC_OP2(vload8, dest, Pointer, Offset * sizeof(vxc_char8)); + return dest; +} + +vxc_char16 viv_intrinsic_vx_vload16(size_t Offset, char *Pointer) { + vxc_char16 dest; + VXC_OP2(vload16, dest, Pointer, Offset * sizeof(vxc_char16)); + return dest; +} + +vxc_uchar2 viv_intrinsic_vx_vload2(size_t Offset, uchar *Pointer) { + vxc_uchar2 dest; + VXC_OP2(vload2, dest, Pointer, Offset * sizeof(vxc_uchar2)); + return dest; +} + +vxc_uchar4 viv_intrinsic_vx_vload4(size_t Offset, uchar *Pointer) { + vxc_uchar4 dest; + VXC_OP2(vload4, dest, Pointer, Offset * sizeof(vxc_uchar4)); + return dest; +} + +vxc_uchar8 viv_intrinsic_vx_vload8(size_t Offset, uchar *Pointer) { + vxc_uchar8 dest; + VXC_OP2(vload8, dest, Pointer, Offset * sizeof(vxc_uchar8)); + return dest; +} + +vxc_uchar16 viv_intrinsic_vx_vload16(size_t Offset, uchar *Pointer) { + vxc_uchar16 dest; + VXC_OP2(vload16, dest, Pointer, Offset * sizeof(vxc_uchar16)); + return dest; +} + +vxc_short2 viv_intrinsic_vx_vload2(size_t Offset, short *Pointer) { + vxc_short2 dest; + VXC_OP2(vload2, dest, Pointer, Offset * sizeof(vxc_short2)); + return dest; +} + +vxc_short4 viv_intrinsic_vx_vload4(size_t Offset, short *Pointer) { + vxc_short4 dest; + VXC_OP2(vload4, dest, Pointer, Offset * sizeof(vxc_short4)); + return dest; +} + +vxc_short8 viv_intrinsic_vx_vload8(size_t Offset, short *Pointer) { + vxc_short8 dest; + VXC_OP2(vload8, dest, Pointer, Offset * sizeof(vxc_short8)); + return dest; +} + +vxc_short16 viv_intrinsic_vx_vload16(size_t Offset, short *Pointer) { + vxc_short16 dest; + VXC_OP2(vload16, dest, Pointer, Offset * sizeof(vxc_short16)); + return dest; +} + +vxc_ushort2 viv_intrinsic_vx_vload2(size_t Offset, ushort *Pointer) { + vxc_ushort2 dest; + VXC_OP2(vload2, dest, Pointer, Offset * sizeof(vxc_ushort2)); + return dest; +} + +vxc_ushort4 viv_intrinsic_vx_vload4(size_t Offset, ushort *Pointer) { + vxc_ushort4 dest; + VXC_OP2(vload4, dest, Pointer, Offset * sizeof(vxc_ushort4)); + return dest; +} + +vxc_ushort8 viv_intrinsic_vx_vload8(size_t Offset, ushort *Pointer) { + vxc_ushort8 dest; + VXC_OP2(vload8, dest, Pointer, Offset * sizeof(vxc_ushort8)); + return dest; +} + +vxc_ushort16 viv_intrinsic_vx_vload16(size_t Offset, ushort *Pointer) { + vxc_ushort16 dest; + VXC_OP2(vload16, dest, Pointer, Offset * sizeof(vxc_ushort16)); + return dest; +} + +vxc_half2 viv_intrinsic_vx_vload2(size_t Offset, half *Pointer) { + vxc_half2 dest; + VXC_OP2(vload2, dest, Pointer, Offset * sizeof(vxc_half2)); + return dest; +} + +vxc_half4 viv_intrinsic_vx_vload4(size_t Offset, half *Pointer) { + vxc_half4 dest; + VXC_OP2(vload4, dest, Pointer, Offset * sizeof(vxc_half4)); + return dest; +} + +vxc_half8 viv_intrinsic_vx_vload8(size_t Offset, half *Pointer) { + vxc_half8 dest; + VXC_OP2(vload8, dest, Pointer, Offset * sizeof(vxc_half8)); + return dest; +} + +vxc_half16 viv_intrinsic_vx_vload16(size_t Offset, half *Pointer) { + vxc_half16 dest; + VXC_OP2(vload16, dest, Pointer, Offset * 
sizeof(vxc_half16)); + return dest; +} + +/* storen: write packed vector type to memory + where n takes the value of 2, 3, 4, 8, 16 +* Syntax: +* _viv_gentypen_packed dest; +* void vstoren(_viv_gentypen_packed data, int offet, gentype *p); +* Semantics: +* gentype is the generic type to indicate the built-in data types +* char, uchar, short, ushort. +* Write sizeof (_viv_gentypen_packed) bytes given by +* data to address (p + n * offset). The +* address computed as (p + n * offset) must +* be 8-bit aligned if gentype is char, uchar; +* 16-bit aligned if gentype is short, ushort, half; +*/ +void viv_intrinsic_vx_vstore2(vxc_char2 Data, size_t Offset, char * Pointer) { + VXC_OP3_NoDest(vstore2, Pointer, Offset * sizeof(vxc_char2), Data); +} + +void viv_intrinsic_vx_vstore4(vxc_char4 Data, size_t Offset, char * Pointer) { + VXC_OP3_NoDest(vstore4, Pointer, Offset * sizeof(vxc_char4), Data); +} + +void viv_intrinsic_vx_vstore8(vxc_char8 Data, size_t Offset, char * Pointer) { + VXC_OP3_NoDest(vstore8, Pointer, Offset * sizeof(vxc_char8), Data); +} + +void viv_intrinsic_vx_vstore16(vxc_char16 Data, size_t Offset, char * Pointer) { + VXC_OP3_NoDest(vstore16, Pointer, Offset * sizeof(vxc_char16), Data); +} + +void viv_intrinsic_vx_vstore2(vxc_uchar2 Data, size_t Offset, uchar * Pointer) { + VXC_OP3_NoDest(vstore2, Pointer, Offset * sizeof(vxc_uchar2), Data); +} + +void viv_intrinsic_vx_vstore4(vxc_uchar4 Data, size_t Offset, uchar * Pointer) { + VXC_OP3_NoDest(vstore4, Pointer, Offset * sizeof(vxc_uchar4), Data); +} + +void viv_intrinsic_vx_vstore8(vxc_uchar8 Data, size_t Offset, uchar * Pointer) { + VXC_OP3_NoDest(vstore8, Pointer, Offset * sizeof(vxc_uchar8), Data); +} + +void viv_intrinsic_vx_vstore16(vxc_uchar16 Data, size_t Offset, uchar * Pointer) { + VXC_OP3_NoDest(vstore16, Pointer, Offset * sizeof(vxc_uchar16), Data); +} + +void viv_intrinsic_vx_vstore2(vxc_short2 Data, size_t Offset, short * Pointer) { + VXC_OP3_NoDest(vstore2, Pointer, Offset * sizeof(vxc_short2), Data); +} + +void viv_intrinsic_vx_vstore4(vxc_short4 Data, size_t Offset, short * Pointer) { + VXC_OP3_NoDest(vstore4, Pointer, Offset * sizeof(vxc_short4), Data); +} + +void viv_intrinsic_vx_vstore8(vxc_short8 Data, size_t Offset, short * Pointer) { + VXC_OP3_NoDest(vstore8, Pointer, Offset * sizeof(vxc_short8), Data); +} + +void viv_intrinsic_vx_vstore16(vxc_short16 Data, size_t Offset, short * Pointer) { + VXC_OP3_NoDest(vstore16, Pointer, Offset * sizeof(vxc_short16), Data); +} + +void viv_intrinsic_vx_vstore2(vxc_ushort2 Data, size_t Offset, ushort * Pointer) { + VXC_OP3_NoDest(vstore2, Pointer, Offset * sizeof(vxc_ushort2), Data); +} + +void viv_intrinsic_vx_vstore4(vxc_ushort4 Data, size_t Offset, ushort * Pointer) { + VXC_OP3_NoDest(vstore4, Pointer, Offset * sizeof(vxc_ushort4), Data); +} + +void viv_intrinsic_vx_vstore8(vxc_ushort8 Data, size_t Offset, ushort * Pointer) { + VXC_OP3_NoDest(vstore8, Pointer, Offset * sizeof(vxc_ushort8), Data); +} + +void viv_intrinsic_vx_vstore16(vxc_ushort16 Data, size_t Offset, ushort * Pointer) { + VXC_OP3_NoDest(vstore16, Pointer, Offset * sizeof(vxc_ushort16), Data); +} + +void viv_intrinsic_vx_vstore2(vxc_half2 Data, size_t Offset, half * Pointer) { + VXC_OP3_NoDest(vstore2, Pointer, Offset * sizeof(vxc_half2), Data); +} + +void viv_intrinsic_vx_vstore4(vxc_half4 Data, size_t Offset, half * Pointer) { + VXC_OP3_NoDest(vstore4, Pointer, Offset * sizeof(vxc_half4), Data); +} + +void viv_intrinsic_vx_vstore8(vxc_half8 Data, size_t Offset, half * Pointer) { + 
VXC_OP3_NoDest(vstore8, Pointer, Offset * sizeof(vxc_half8), Data); +} + +void viv_intrinsic_vx_vstore16(vxc_half16 Data, size_t Offset, half * Pointer) { + VXC_OP3_NoDest(vstore16, Pointer, Offset * sizeof(vxc_half16), Data); +} + +#undef _RET0_ +#undef _RET_ +#undef _EXT_ + +#ifdef __cplusplus +} +#endif + +#endif +typedef struct +{ + size_t size; + global char* item; +} vx_array_char; + +typedef struct +{ + size_t size; + global unsigned char* item; +} vx_array_uchar; + +typedef struct +{ + size_t size; + global short* item; +} vx_array_short; + +typedef struct +{ + size_t size; + global unsigned short* item; +} vx_array_ushort; + +typedef struct +{ + size_t size; + global int* item; +} vx_array_int; + +typedef struct +{ + size_t size; + global unsigned int* item; +} vx_array_uint; + + +typedef struct +{ + size_t size; + global float * item; +} vx_array_float; + +typedef struct +{ + size_t size; + global unsigned char* item; +} vx_lut_uchar; + +typedef struct +{ + size_t size; + global unsigned short* item; +} vx_lut_ushort; + +typedef struct +{ + size_t columns; + size_t rows; + global short* matrix; + uint scale; +} vx_convolution; + +typedef struct +{ + size_t columns; + size_t rows; + global char* matrix; +} vx_matrix_char; + +typedef struct +{ + size_t columns; + size_t rows; + global unsigned char* matrix; +} vx_matrix_uchar; + +typedef struct +{ + size_t columns; + size_t rows; + global short* matrix; +} vx_matrix_short; + +typedef struct +{ + size_t columns; + size_t rows; + global unsigned short* matrix; +} vx_matrix_ushort; + +typedef struct +{ + size_t columns; + size_t rows; + global int* matrix; +} vx_matrix_int; + +typedef struct +{ + size_t columns; + size_t rows; + global unsigned int* matrix; +} vx_matrix_uint; + +typedef struct +{ + size_t columns; + size_t rows; + global float* matrix; +} vx_matrix_float; + +typedef struct +{ + int type; + uint value; + uint lower; + uint upper; + uint trueValue; + uint falseValue; +} vx_threshold; + +typedef struct { + int dst_width; + int dst_height; + global float* ptr; +} vx_remap; + +typedef struct +{ + int bins; + int rang; + int offset; + float window_r; + global int* ptr; +} vx_distribution; + +typedef struct _vxc_pyramid +{ + float scale; + uint width; + uint height; + uint format; + uint levelCount; + _viv_image2d_array_t imageArray; +} vxc_pyramid; + +#endif /* _VIV_VX_EXTENSION */ + +#endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/viv_nn_compatibility.h b/prebuilt-sdk/x86_64_linux/include/VX/viv_nn_compatibility.h new file mode 100644 index 0000000..6c1e9f5 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/viv_nn_compatibility.h @@ -0,0 +1,216 @@ +/**************************************************************************** +* +* Copyright 2017 - 2020 Vivante Corporation, Santa Clara, California. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* 'Software'), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sub license, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject +* to the following conditions: +* +* The above copyright notice and this permission notice (including the +* next paragraph) shall be included in all copies or substantial +* portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +* IN NO EVENT SHALL VIVANTE AND/OR ITS SUPPLIERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VIV_NN_COMPATIBILITY_H_ +#define _VIV_NN_COMPATIBILITY_H_ + +#include +#include + +/* keep the backward compatibility with spec 1.1 for standard nn kernels */ +#define VX_KERNEL_NN_SOFTMAX_LAYER VX_KERNEL_SOFTMAX_LAYER +#define VX_KERNEL_NN_NORMALIZATION_LAYER VX_KERNEL_NORMALIZATION_LAYER +#define VX_KERNEL_NN_POOLING_LAYER VX_KERNEL_POOLING_LAYER +#define VX_KERNEL_NN_FULLY_CONNECTED_LAYER VX_KERNEL_FULLY_CONNECTED_LAYER +#define VX_KERNEL_NN_ACTIVATION_LAYER VX_KERNEL_ACTIVATION_LAYER +#define VX_KERNEL_NN_ROIPOOL VX_KERNEL_ROI_POOLING_LAYER +#define VX_KERNEL_NN_CONVOLUTION_LAYER VX_KERNEL_CONVOLUTION_LAYER +#define VX_KERNEL_NN_DECONVOLUTION_LAYER VX_KERNEL_DECONVOLUTION_LAYER + +/* keep the backward compatibility with spec 1.1 for vx_tensor_attribute_e */ +#define VX_TENSOR_NUM_OF_DIMS VX_TENSOR_NUMBER_OF_DIMS +#define VX_TENSOR_FIXED_POINT_POS VX_TENSOR_FIXED_POINT_POSITION + +/* keep the backward compatibility with spec 1.1 from vx_convolutional_network_rounding_type_e to vx_nn_rounding_type_e */ +typedef enum vx_nn_rounding_type_e vx_convolutional_network_rounding_type_e; +#define VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR VX_NN_DS_SIZE_ROUNDING_FLOOR +#define VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_CEILING VX_NN_DS_SIZE_ROUNDING_CEILING + +/* keep the backward compatibility with spec 1.1 from vx_convolutional_network_pooling_type_e to vx_nn_pooling_type_e */ +typedef enum vx_nn_pooling_type_e vx_convolutional_network_pooling_type_e; +#define VX_CONVOLUTIONAL_NETWORK_POOLING_MAX VX_NN_POOLING_MAX +#define VX_CONVOLUTIONAL_NETWORK_POOLING_AVG VX_NN_POOLING_AVG +#define VX_CONVOLUTIONAL_NETWORK_POOLING_L2 VX_NN_POOLING_L2 +#define VX_CONVOLUTIONAL_NETWORK_POOLING_AVG_ANDROID VX_NN_POOLING_AVG_ANDROID + +/* keep the backward compatibility with spec 1.1 from vx_convolutional_network_norm_type_e to vx_nn_norm_type_e */ +typedef enum vx_nn_norm_type_e vx_convolutional_network_norm_type_e; +#define VX_CONVOLUTIONAL_NETWORK_NORM_SAME_MAP VX_NN_NORMALIZATION_SAME_MAP +#define VX_CONVOLUTIONAL_NETWORK_NORM_ACROSS_MAPS VX_NN_NORMALIZATION_ACROSS_MAPS + +/* keep the backward compatibility with spec 1.1 from vx_convolutional_network_layer_type_e to vx_nn_layer_type_e */ +typedef enum vx_nn_layer_type_e vx_convolutional_network_layer_type_e; +#define VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER VX_NN_CONVOLUTION_LAYER +#define VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER VX_NN_FULLYCONNECTED_LAYER + +/* keep the backward compatibility with spec 1.1 from vx_convolutional_network_activation_func_e to vx_nn_activation_function_e */ +typedef enum vx_nn_activation_function_e vx_convolutional_network_activation_func_e; +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LOGISTIC VX_NN_ACTIVATION_LOGISTIC +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HYPERBOLIC_TAN VX_NN_ACTIVATION_HYPERBOLIC_TAN +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU VX_NN_ACTIVATION_RELU +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_BRELU 
VX_NN_ACTIVATION_BRELU +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SOFTRELU VX_NN_ACTIVATION_SOFTRELU +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_ABS VX_NN_ACTIVATION_ABS +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SQUARE VX_NN_ACTIVATION_SQUARE +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SQRT VX_NN_ACTIVATION_SQRT +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LINEAR VX_NN_ACTIVATION_LINEAR +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LEAKYRELU VX_NN_ACTIVATION_LEAKYRELU +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU6 VX_NN_ACTIVATION_RELU6 +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU1 VX_NN_ACTIVATION_RELU1 +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RSQRT VX_NN_ACTIVATION_RSQRT +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LEAKYRELU_MAX_POOLING VX_NN_ACTIVATION_LEAKYRELU_MAX_POOLING +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_NONE VX_NN_ACTIVATION_NONE + +#ifdef __cplusplus +extern "C" { +#endif + +/* keep the backward compatibility with spec 1.1 for vxCreateTensor */ +VX_API_ENTRY vx_tensor VX_API_CALL +vxCreateTensor_11( + vx_context context, + vx_uint32 num_of_dims, + vx_uint32 *sizes, + vx_enum data_format, + vx_int8 fixed_point_pos + ); +#define vxCreateTensor vxCreateTensor_11 + +/* keep the backward compatibility with spec 1.1 for vxCreateVirtualTensor */ +VX_API_ENTRY vx_tensor VX_API_CALL +vxCreateVirtualTensor_11( + vx_graph graph, + vx_uint32 num_of_dims, + vx_uint32 *sizes, + vx_enum data_format, + vx_int8 fixed_point_pos +); +#define vxCreateVirtualTensor vxCreateVirtualTensor_11 + +/* keep the backward compatibility with spec 1.1 for vxCreateTensorFromView */ +VX_API_ENTRY vx_tensor VX_API_CALL +vxCreateTensorFromView_11( + vx_tensor tensor, + vx_tensor_view view +); +#define vxCreateTensorFromView vxCreateTensorFromView_11 + +/* keep the backward compatibility with spec 1.1 for vxCopyTensorPatch */ +VX_API_ENTRY vx_status VX_API_CALL +vxCopyTensorPatch_11( + vx_tensor tensor, + vx_tensor_view view, + vx_tensor_addressing user_addr, + void *user_ptr, + vx_enum usage, + vx_enum user_mem_type +); +#define vxCopyTensorPatch vxCopyTensorPatch_11 + +/* keep the backward compatibility with spec 1.1 for vxCreateImageObjectArrayFromTensor */ +VX_API_ENTRY vx_object_array VX_API_CALL +vxCreateImageObjectArrayFromTensor_11( + vx_tensor tensor, + vx_rectangle_t rect, + vx_uint32 array_size, + vx_uint32 stride, + vx_df_image image_format +); +#define vxCreateImageObjectArrayFromTensor vxCreateImageObjectArrayFromTensor_11 + +/* keep the backward compatibility with spec 1.1 for vxFullyConnectedLayer */ +VX_API_ENTRY vx_node VX_API_CALL +vxFullyConnectedLayer_11( + vx_graph graph, + vx_tensor inputs, + vx_tensor weights, + vx_tensor biases, + vx_uint32 pad, + vx_uint8 accumulator_bits, + vx_enum overflow_policy, + vx_enum rounding_policy, + vx_enum down_scale_size_rounding, + vx_tensor outputs +); +#define vxFullyConnectedLayer vxFullyConnectedLayer_11 + +/* keep the backward compatibility with spec 1.1 for vxActivationLayer */ +VX_API_ENTRY vx_node VX_API_CALL +vxActivationLayer_11( + vx_graph graph, + vx_tensor inputs, + vx_enum func, + vx_int32 a, + vx_int32 b, + vx_tensor outputs +); +#define vxActivationLayer vxActivationLayer_11 + +/* keep the backward compatibility with spec 1.1 for vxPoolingLayer */ +VX_API_ENTRY vx_node VX_API_CALL +vxPoolingLayer_11( + vx_graph graph, + vx_tensor inputs, + vx_enum pool_type, + vx_uint32 pool_size_x, + vx_uint32 pool_size_y, + vx_uint32 pool_pad_x, + vx_uint32 pool_pad_y, + vx_enum rounding, + vx_tensor outputs +); +#define 
vxPoolingLayer vxPoolingLayer_11 + +/* keep the backward compatibility with spec 1.1 for vxNormalizationLayer */ +VX_API_ENTRY vx_node VX_API_CALL +vxNormalizationLayer_11( + vx_graph graph, + vx_tensor inputs, + vx_enum type, + vx_uint32 norm_size, + vx_float32 alpha, + vx_float32 beta, + vx_tensor outputs +); +#define vxNormalizationLayer vxNormalizationLayer_11 + +/* keep the backward compatibility with spec 1.1 for vxTensorTransposeNode */ +VX_API_ENTRY vx_node VX_API_CALL +vxTensorTransposeNode_11( + vx_graph graph, + vx_tensor inputs, + vx_tensor outputs, + vx_uint32 dim1, + vx_uint32 dim2 +); +#define vxTensorTransposeNode vxTensorTransposeNode_11 + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx.h b/prebuilt-sdk/x86_64_linux/include/VX/vx.h new file mode 100644 index 0000000..27181ce --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2012-2020 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: Some safety-critical environments may enforce software development + * guidelines (for example MISRA C:2012) to facilitate code quality, + * safety, security, portability and reliability. In order to meet + * such guidelines, developers may modify OpenVX standard header files + * without deviating from the OpenVX specification. + */ + +#ifndef _OPENVX_H_ +#define _OPENVX_H_ + +/*! + * \file + * \brief The top level OpenVX Header. + */ + +/*! \brief Defines the length of the implementation name string, including the trailing zero. + * \ingroup group_context + */ +#define VX_MAX_IMPLEMENTATION_NAME (64) + +/*! \brief Defines the length of a kernel name string to be added to OpenVX, including the trailing zero. + * \ingroup group_kernel + */ +#define VX_MAX_KERNEL_NAME (256) + +/*! \brief Defines the length of a message buffer to copy from the log, including the trailing zero. + * \ingroup group_basic_features + */ +#define VX_MAX_LOG_MESSAGE_LEN (1024) + +/*! \brief Defines the length of the reference name string, including the trailing zero. + * \ingroup group_reference + * \see vxSetReferenceName + */ +#define VX_MAX_REFERENCE_NAME (64) + +#include +#include +#include +#include +#include + +/*! \brief Defines the major version number macro. + * \ingroup group_basic_features + */ +#define VX_VERSION_MAJOR(x) ((x & 0xFFU) << 8) + +/*! \brief Defines the minor version number macro. + * \ingroup group_basic_features + */ +#define VX_VERSION_MINOR(x) ((x & 0xFFU) << 0) + +/*! \brief Defines the predefined version number for 1.0. + * \ingroup group_basic_features + */ +#define VX_VERSION_1_0 (VX_VERSION_MAJOR(1) | VX_VERSION_MINOR(0)) + +/*! \brief Defines the predefined version number for 1.1. + * \ingroup group_basic_features + */ +#define VX_VERSION_1_1 (VX_VERSION_MAJOR(1) | VX_VERSION_MINOR(1)) + +/*! \brief Defines the predefined version number for 1.2. 
+ * \ingroup group_basic_features + */ +#define VX_VERSION_1_2 (VX_VERSION_MAJOR(1) | VX_VERSION_MINOR(2)) + +/*! \brief Defines the predefined version number for 1.3. + * \ingroup group_basic_features + */ +#define VX_VERSION_1_3 (VX_VERSION_MAJOR(1) | VX_VERSION_MINOR(3)) + +/*! \brief Defines the OpenVX Version Number. + * \ingroup group_basic_features + */ +#ifndef VX_VERSION +#define VX_VERSION (VX_VERSION_1_3) +#endif + +#endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h new file mode 100644 index 0000000..4d8aa0b --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h @@ -0,0 +1,3435 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _OPENVX_API_H_ +#define _OPENVX_API_H_ + +/*! + * \file + * \brief The API definition for OpenVX. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================== + CONTEXT + =============================================================================*/ + +/*! \brief Creates a \ref vx_context. + * \details This creates a top-level object context for OpenVX. + * \note This is required to do anything else. + * \returns The reference to the implementation context \ref vx_context. Any possible errors + * preventing a successful creation should be checked using \ref vxGetStatus. + * \ingroup group_context + * \post \ref vxReleaseContext + */ +VX_API_ENTRY vx_context VX_API_CALL vxCreateContext(void); + +/*! \brief Releases the OpenVX object context. + * \details All reference counted objects are garbage-collected by the return of this call. + * No calls are possible using the parameter context after the context has been + * released until a new reference from \ref vxCreateContext is returned. + * All outstanding references to OpenVX objects from this context are invalid + * after this call. + * \param [in] context The pointer to the reference to the context. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE context is not a valid \ref vx_context reference. + * \ingroup group_context + * \pre \ref vxCreateContext + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseContext(vx_context *context); + +/*! \brief Retrieves the context from any reference from within a context. + * \param [in] reference The reference from which to extract the context. + * \ingroup group_context + * \return The overall context that created the particular + * reference. Any possible errors preventing a successful completion of this function + * should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_context VX_API_CALL vxGetContext(vx_reference reference); + +/*! \brief Queries the context for some specific information. + * \param [in] context The reference to the context. 
+ * \param [in] attribute The attribute to query. Use a \ref vx_context_attribute_e. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE context is not a valid \ref vx_context reference. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * \retval VX_ERROR_NOT_SUPPORTED If the attribute is not supported on this implementation. + * \ingroup group_context + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryContext(vx_context context, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Sets an attribute on the context. + * \param [in] context The handle to the overall context. + * \param [in] attribute The attribute to set from \ref vx_context_attribute_e. + * \param [in] ptr The pointer to the data to which to set the attribute. + * \param [in] size The size in bytes of the data to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE context is not a valid \ref vx_context reference. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * \retval VX_ERROR_NOT_SUPPORTED If the attribute is not settable. + * \ingroup group_context + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetContextAttribute(vx_context context, vx_enum attribute, const void *ptr, vx_size size); + +/*! \brief Provides a generic API to give platform-specific hints to the implementation. + * \param [in] reference The reference to the object to hint at. + * This could be \ref vx_context, \ref vx_graph, \ref vx_node, \ref vx_image, \ref vx_array, or any other reference. + * \param [in] hint A \ref vx_hint_e \a hint to give to a \ref vx_context. This is a platform-specific optimization or implementation mechanism. + * \param [in] data Optional vendor specific data. + * \param [in] data_size Size of the data structure \p data. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE reference is not a valid \ref vx_reference reference. + * \retval VX_ERROR_NOT_SUPPORTED If the hint is not supported. + * \ingroup group_hint + */ +VX_API_ENTRY vx_status VX_API_CALL vxHint(vx_reference reference, vx_enum hint, const void* data, vx_size data_size); + +/*! \brief Provides a generic API to give platform-specific directives to the implementations. + * \param [in] reference The reference to the object to set the directive on. + * This could be \ref vx_context, \ref vx_graph, \ref vx_node, \ref vx_image, \ref vx_array, or any other reference. + * \param [in] directive The directive to set. See \ref vx_directive_e. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE reference is not a valid \ref vx_reference reference. + * \retval VX_ERROR_NOT_SUPPORTED If the directive is not supported. + * \note The performance counter directives are only available for the reference \ref vx_context. + * Error VX_ERROR_NOT_SUPPORTED is returned when used with any other reference. + * \ingroup group_directive + */ +VX_API_ENTRY vx_status VX_API_CALL vxDirective(vx_reference reference, vx_enum directive); + +/*! 
\brief Provides a generic API to return status values from Object constructors if they
+ * fail.
+ * \note Users do not need to strictly check every object creator as the errors
+ * should properly propagate and be detected during verification time or run-time.
+ * \code
+ * vx_image img = vxCreateImage(context, 639, 480, VX_DF_IMAGE_UYVY);
+ * vx_status status = vxGetStatus((vx_reference)img);
+ * // status == VX_ERROR_INVALID_DIMENSIONS
+ * vxReleaseImage(&img);
+ * \endcode
+ * \pre Appropriate Object Creator function.
+ * \post Appropriate Object Release function.
+ * \param [in] reference The reference to check for construction errors.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS No errors; any other value indicates failure.
+ * \retval * Some error occurred, please check enumeration list and constructor.
+ * \ingroup group_basic_features
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxGetStatus(vx_reference reference);
+
+/*!
+ * \brief Registers user-defined structures to the context.
+ * \param [in] context The reference to the implementation context.
+ * \param [in] size The size of user struct in bytes.
+ * \return A \ref vx_enum value that is a type given to the User
+ * to refer to their custom structure when declaring a \ref vx_array
+ * of that structure.
+ * \retval VX_TYPE_INVALID If the namespace of types has been exhausted.
+ * \note This call should only be used once within the lifetime of a context for
+ * a specific structure.
+ * \ingroup group_adv_array
+ */
+VX_API_ENTRY vx_enum VX_API_CALL vxRegisterUserStruct(vx_context context, vx_size size);
+
+/*!
+ * \brief Registers user-defined structures to the context, and associates a name to it.
+ * \param [in] context The reference to the implementation context.
+ * \param [in] size The size of user struct in bytes.
+ * \param [in] *type_name Pointer to the '\0' terminated string that identifies the
+ * user struct type. The string is copied by the function so
+ * that it stays the property of the caller. NULL means that
+ * the user struct is not named. The length of the string
+ * shall be lower than VX_MAX_REFERENCE_NAME bytes.
+ * \return A \ref vx_enum value that is a type given to the User
+ * to refer to their custom structure when declaring a \ref vx_array
+ * of that structure.
+ * \retval VX_TYPE_INVALID If the namespace of types has been exhausted.
+ * \note This call should only be used once within the lifetime of a context for
+ * a specific structure.
+ * \ingroup group_adv_array
+ */
+VX_API_ENTRY vx_enum VX_API_CALL vxRegisterUserStructWithName(vx_context context, vx_size size, const vx_char* type_name);
+
+/*!
+ * \brief Returns the name of the user-defined structure associated with the enumeration given.
+ * \param [in] context The reference to the implementation context.
+ * \param [in] user_struct_type The enumeration value of the user struct
+ * \param [out] type_name Name of the user struct
+ * \param [in] name_size The size of the allocated buffer pointed to by type_name
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS user_struct_type was valid, and name was found and returned
+ * \retval VX_ERROR_INVALID_PARAMETERS user_struct_type was not a valid user struct enumeration.
+ * \retval VX_ERROR_NO_MEMORY name_size is too small to hold the name of the user struct type.
+ * \retval VX_FAILURE user_struct_type does not have an associated type name.
+ * \pre \ref vxRegisterUserStructWithName should be called for this user struct.
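+ *
+ * A minimal usage sketch (illustrative only; the struct layout and variable names
+ * here are hypothetical, and error checking is omitted):
+ * \code
+ * typedef struct { vx_float32 x, y; } my_point_t;
+ * vx_enum point_type = vxRegisterUserStructWithName(context, sizeof(my_point_t), "my_point_t");
+ * vx_char name[VX_MAX_REFERENCE_NAME];
+ * vx_status status = vxGetUserStructNameByEnum(context, point_type, name, sizeof(name));
+ * // on VX_SUCCESS, name now holds "my_point_t"
+ * \endcode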
+ * \ingroup group_adv_array + */ +VX_API_ENTRY vx_status VX_API_CALL vxGetUserStructNameByEnum(vx_context context, vx_enum user_struct_type, vx_char* type_name, vx_size name_size); + +/*! + * \brief Returns the enum of the user-defined structure associated with the name given + * \param [in] context The reference to the implementation context. + * \param [in] type_name Pointer to the '\0' terminated string that identifies the user + * struct type. The length of the string shall be lower than VX_MAX_REFERENCE_NAME bytes. + * \param [out] user_struct_type The enumeration value of the user struct + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS type_name was valid, and enumeration was found and returned + * \retval VX_FAILURE type_name does not match any user struct enumeration. +* \pre \ref vxRegisterUserStructWithName should be called for this user struct. + * \ingroup group_adv_array + */ +VX_API_ENTRY vx_status VX_API_CALL vxGetUserStructEnumByName(vx_context context, const vx_char* type_name, vx_enum *user_struct_type); + +/*! + * \brief Allocates and registers user-defined kernel enumeration to a context. + * The allocated enumeration is from available pool of 4096 enumerations reserved + * for dynamic allocation from VX_KERNEL_BASE(VX_ID_USER,0). + * \param [in] context The reference to the implementation context. + * \param [out] pKernelEnumId pointer to return \ref vx_enum for user-defined kernel. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE If the context is not a valid \ref vx_context reference. + * \retval VX_ERROR_NO_RESOURCES The enumerations has been exhausted. + * \ingroup group_user_kernels + */ +VX_API_ENTRY vx_status VX_API_CALL vxAllocateUserKernelId(vx_context context, vx_enum * pKernelEnumId); + +/*! + * \brief Allocates and registers user-defined kernel library ID to a context. + * + * The allocated library ID is from available pool of library IDs (1..255) + * reserved for dynamic allocation. The returned libraryId can be used by + * user-kernel library developer to specify individual kernel enum IDs in + * a header file, shown below: + * \code + * #define MY_KERNEL_ID1(libraryId) (VX_KERNEL_BASE(VX_ID_USER,libraryId) + 0); + * #define MY_KERNEL_ID2(libraryId) (VX_KERNEL_BASE(VX_ID_USER,libraryId) + 1); + * #define MY_KERNEL_ID3(libraryId) (VX_KERNEL_BASE(VX_ID_USER,libraryId) + 2); + * \endcode + * \param [in] context The reference to the implementation context. + * \param [out] pLibraryId pointer to \ref vx_enum for user-kernel libraryId. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_NO_RESOURCES The enumerations has been exhausted. + * \ingroup group_user_kernels + */ +VX_API_ENTRY vx_status VX_API_CALL vxAllocateUserKernelLibraryId(vx_context context, vx_enum * pLibraryId); + +/*! \brief Sets the default target of the immediate mode. Upon successful execution of this + * function any future execution of immediate mode function is attempted on the new default + * target of the context. + * \param [in] context The reference to the implementation context. + * \param [in] target_enum The default immediate mode target enum to be set + * to the \ref vx_context object. Use a \ref vx_target_e. + * \param [in] target_string The target name ASCII string. This contains a valid value + * when target_enum is set to \ref VX_TARGET_STRING, otherwise it is ignored. + * \ingroup group_context + * \return A \ref vx_status_e enumeration. 
+ * \retval VX_SUCCESS Default target set; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE If the context is not a valid \ref vx_context reference. + * \retval VX_ERROR_NOT_SUPPORTED If the specified target is not supported in this context. + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetImmediateModeTarget(vx_context context, vx_enum target_enum, const char* target_string); + +/*============================================================================== + IMAGE + =============================================================================*/ + +/*! \brief Creates an opaque reference to an image buffer. + * \details Not guaranteed to exist until the \ref vx_graph containing it has been verified. + * \param [in] context The reference to the implementation context. + * \param [in] width The image width in pixels. The image in the formats of + * \ref VX_DF_IMAGE_NV12, \ref VX_DF_IMAGE_NV21, \ref VX_DF_IMAGE_IYUV, + * \ref VX_DF_IMAGE_UYVY, \ref VX_DF_IMAGE_YUYV must have even width. + * \param [in] height The image height in pixels. The image in the formats of + * \ref VX_DF_IMAGE_NV12, \ref VX_DF_IMAGE_NV21, \ref VX_DF_IMAGE_IYUV + * must have even height. + * \param [in] color The VX_DF_IMAGE (\ref vx_df_image_e) code that represents the format + * of the image and the color space. + * \returns An image reference \ref vx_image. Any possible errors preventing a successful + * creation should be checked using \ref vxGetStatus. + * \see vxMapImagePatch to obtain direct memory access to the image data. + * \ingroup group_image + */ +VX_API_ENTRY vx_image VX_API_CALL vxCreateImage(vx_context context, vx_uint32 width, vx_uint32 height, vx_df_image color); + +/*! \brief Creates an image from another image given a rectangle. This second + * reference refers to the data in the original image. Updates to this image + * updates the parent image. The rectangle must be defined within the pixel space + * of the parent image. + * \param [in] img The reference to the parent image. + * \param [in] rect The region of interest rectangle. Must contain points within + * the parent image pixel space. + * \returns An image reference \ref vx_image to the sub-image. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_image + */ +VX_API_ENTRY vx_image VX_API_CALL vxCreateImageFromROI(vx_image img, const vx_rectangle_t *rect); + +/*! \brief Creates a reference to an image object that has a singular, + * uniform value in all pixels. The uniform image created is read-only. + * \param [in] context The reference to the implementation context. + * \param [in] width The image width in pixels. The image in the formats of + * \ref VX_DF_IMAGE_NV12, \ref VX_DF_IMAGE_NV21, \ref VX_DF_IMAGE_IYUV, + * \ref VX_DF_IMAGE_UYVY, \ref VX_DF_IMAGE_YUYV must have even width. + * \param [in] height The image height in pixels. The image in the formats of + * \ref VX_DF_IMAGE_NV12, \ref VX_DF_IMAGE_NV21, + * \ref VX_DF_IMAGE_IYUV must have even height. + * \param [in] color The VX_DF_IMAGE (\ref vx_df_image_e) code that represents the format of the image and the color space. + * \param [in] value The pointer to the pixel value to which to set all pixels. See \ref vx_pixel_value_t. + * \returns An image reference \ref vx_image. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \see vxMapImagePatch to obtain direct memory access to the image data. 
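+ *
+ * A minimal creation sketch (illustrative only; assumes a valid context and omits
+ * error checking):
+ * \code
+ * vx_pixel_value_t gray;
+ * gray.U8 = 128;
+ * vx_image uniform = vxCreateUniformImage(context, 640, 480, VX_DF_IMAGE_U8, &gray);
+ * \endcode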
+ * \note \ref vxMapImagePatch and \ref vxUnmapImagePatch may be called with + * a uniform image reference. + * \ingroup group_image + */ +VX_API_ENTRY vx_image VX_API_CALL vxCreateUniformImage(vx_context context, vx_uint32 width, vx_uint32 height, vx_df_image color, const vx_pixel_value_t *value); + +/*! \brief Creates an opaque reference to an image buffer with no direct + * user access. This function allows setting the image width, height, or format. + * \details Virtual data objects allow users to connect various nodes within a + * graph via data references without access to that data, but they also permit the + * implementation to take maximum advantage of possible optimizations. Use this + * API to create a data reference to link two or more nodes together when the + * intermediate data are not required to be accessed by outside entities. This API + * in particular allows the user to define the image format of the data without + * requiring the exact dimensions. Virtual objects are scoped within the graph + * they are declared a part of, and can't be shared outside of this scope. + * All of the following constructions of virtual images are valid. + * \code + * vx_context context = vxCreateContext(); + * vx_graph graph = vxCreateGraph(context); + * vx_image virt[] = { + * vxCreateVirtualImage(graph, 0, 0, VX_DF_IMAGE_U8), // no specified dimension + * vxCreateVirtualImage(graph, 320, 240, VX_DF_IMAGE_VIRT), // no specified format + * vxCreateVirtualImage(graph, 640, 480, VX_DF_IMAGE_U8), // no user access + * }; + * \endcode + * \param [in] graph The reference to the parent graph. + * \param [in] width The width of the image in pixels. A value of zero informs the interface + * that the value is unspecified. The image in the formats of \ref VX_DF_IMAGE_NV12, + * \ref VX_DF_IMAGE_NV21, \ref VX_DF_IMAGE_IYUV, \ref VX_DF_IMAGE_UYVY, + * \ref VX_DF_IMAGE_YUYV must have even width. + * \param [in] height The height of the image in pixels. A value of zero informs the interface + * that the value is unspecified. The image in the formats of \ref VX_DF_IMAGE_NV12, + * \ref VX_DF_IMAGE_NV21, \ref VX_DF_IMAGE_IYUV must have even height. + * \param [in] color The VX_DF_IMAGE (\ref vx_df_image_e) code that represents the format + * of the image and the color space. A value of \ref VX_DF_IMAGE_VIRT informs the + * interface that the format is unspecified. + * \returns An image reference \ref vx_image. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \note Passing this reference to \ref vxMapImagePatch will return an error. + * \ingroup group_image + */ +VX_API_ENTRY vx_image VX_API_CALL vxCreateVirtualImage(vx_graph graph, vx_uint32 width, vx_uint32 height, vx_df_image color); + +/*! \brief Creates a reference to an image object that was externally allocated. + * \param [in] context The reference to the implementation context. + * \param [in] color See the \ref vx_df_image_e codes. This mandates the + * number of planes needed to be valid in the \a addrs and \a ptrs arrays based on the format given. + * \param [in] addrs[] The array of image patch addressing structures that + * define the dimension and stride of the array of pointers. See note below. + * \param [in] ptrs[] The array of platform-defined references to each plane. See note below. + * \param [in] memory_type \ref vx_memory_type_e. When giving \ref VX_MEMORY_TYPE_HOST + * the \a ptrs array is assumed to be HOST accessible pointers to memory. 
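+ *
+ * A minimal import sketch for a single-plane \ref VX_DF_IMAGE_U8 image (illustrative
+ * only; the buffer and its dimensions are hypothetical, and error checking is omitted):
+ * \code
+ * vx_uint8 buffer[480 * 640];
+ * vx_imagepatch_addressing_t addr = { 0 };
+ * addr.dim_x    = 640;
+ * addr.dim_y    = 480;
+ * addr.stride_x = 1;    // bytes per U8 pixel
+ * addr.stride_y = 640;  // bytes per row, >= stride_x * dim_x
+ * void *ptrs[] = { buffer };
+ * vx_image img = vxCreateImageFromHandle(context, VX_DF_IMAGE_U8, &addr, ptrs, VX_MEMORY_TYPE_HOST);
+ * \endcode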
+ * \returns An image reference \ref vx_image. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \note The user must call vxMapImagePatch prior to accessing the pixels of an image, even if the + * image was created via \ref vxCreateImageFromHandle. Reads or writes to memory referenced + * by ptrs[ ] after calling \ref vxCreateImageFromHandle without first calling + * \ref vxMapImagePatch will result in undefined behavior. + * The property of addr[] and ptrs[] arrays is kept by the caller (It means that the implementation will + * make an internal copy of the provided information. \a addr and \a ptrs can then simply be application's + * local variables). + * Only \a dim_x, \a dim_y, \a stride_x and \a stride_y fields of the \ref vx_imagepatch_addressing_t need to be + * provided by the application. Other fields (\a step_x, \a step_y, \a scale_x & \a scale_y) are ignored by this function. + * The layout of the imported memory must follow a row-major order. In other words, \a stride_x should be + * sufficiently large so that there is no overlap between data elements corresponding to different + * pixels, and \a stride_y >= \a stride_x * \a dim_x. + * + * In order to release the image back to the application we should use \ref vxSwapImageHandle. + * + * Import type of the created image is available via the image attribute \ref vx_image_attribute_e parameter. + * + * \ingroup group_image + */ +VX_API_ENTRY vx_image VX_API_CALL vxCreateImageFromHandle(vx_context context, vx_df_image color, const vx_imagepatch_addressing_t addrs[], void *const ptrs[], vx_enum memory_type); + +/*! \brief Swaps the image handle of an image previously created from handle. + * + * This function sets the new image handle (i.e. pointer to all image planes) + * and returns the previous one. + * + * Once this function call has completed, the application gets back the + * ownership of the memory referenced by the previous handle. This memory + * contains up-to-date pixel data, and the application can safely reuse or + * release it. + * + * The memory referenced by the new handle must have been allocated + * consistently with the image properties since the import type, + * memory layout and dimensions are unchanged (see addrs, color, and + * memory_type in \ref vxCreateImageFromHandle). + * + * All images created from ROI or channel with this image as parent or ancestor + * will automatically use the memory referenced by the new handle. + * + * The behavior of \ref vxSwapImageHandle when called from a user node is undefined. + * \param [in] image The reference to an image created from handle + * \param [in] new_ptrs[] pointer to a caller owned array that contains + * the new image handle (image plane pointers) + * \arg new_ptrs is non NULL. new_ptrs[i] must be non NULL for each i such as + * 0 < i < nbPlanes, otherwise, this is an error. The address of the storage memory + * for image plane i is set to new_ptrs[i] + * \arg new_ptrs is NULL: the previous image storage memory is reclaimed by the + * caller, while no new handle is provided. + * \param [out] prev_ptrs[] pointer to a caller owned array in which + * the application returns the previous image handle + * \arg prev_ptrs is non NULL. prev_ptrs must have at least as many + * elements as the number of image planes. For each i such as + * 0 < i < nbPlanes , prev_ptrs[i] is set to the address of the previous storage + * memory for plane i. + * \arg prev_ptrs NULL: the previous handle is not returned. 
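+ *
+ * A reclaim sketch (illustrative only; assumes img was created by
+ * \ref vxCreateImageFromHandle with a single plane, error checking omitted):
+ * \code
+ * void *prev_ptrs[1] = { NULL };
+ * vxSwapImageHandle(img, NULL, prev_ptrs, 1);
+ * // prev_ptrs[0] now references the previous storage, owned again by the application
+ * \endcode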
+ * \param [in] num_planes Number of planes in the image. This must be set equal to the number of planes of the input image.
+ * The number of elements in new_ptrs and prev_ptrs arrays must be equal to or greater than num_planes.
+ * If either array has more than num_planes elements, the extra elements are ignored. If either array is smaller
+ * than num_planes, the results are undefined.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS No errors.
+ * \retval VX_ERROR_INVALID_REFERENCE image is not a valid \ref vx_image reference.
+ * \retval VX_ERROR_INVALID_PARAMETERS The image was not created from handle or
+ * the content of new_ptrs is not valid.
+ * \retval VX_FAILURE The image was already being accessed.
+ * \ingroup group_image
+ */
+
+VX_API_ENTRY vx_status VX_API_CALL vxSwapImageHandle(vx_image image, void* const new_ptrs[], void* prev_ptrs[], vx_size num_planes);
+
+
+/*! \brief Swaps two images created from handle.
+ * \details This function swaps the logical and physical addresses of the two images.
+ * The two images must have the same properties except for their memory-related content.
+ * \attention The application must ensure cache and memory coherence before the first call to vxSwapImage.
+ * \version 0.4
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxSwapImage(vx_image image0, vx_image image1);
+
+
+/*! \brief Retrieves various attributes of an image.
+ * \param [in] image The reference to the image to query.
+ * \param [in] attribute The attribute to query. Use a \ref vx_image_attribute_e.
+ * \param [out] ptr The location at which to store the resulting value.
+ * \param [in] size The size in bytes of the container to which \a ptr points.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS No errors; any other value indicates failure.
+ * \retval VX_ERROR_INVALID_REFERENCE image is not a valid \ref vx_image reference.
+ * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect.
+ * \retval VX_ERROR_NOT_SUPPORTED If the attribute is not supported on this implementation.
+ * \ingroup group_image
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxQueryImage(vx_image image, vx_enum attribute, void *ptr, vx_size size);
+
+/*! \brief Allows setting attributes on the image.
+ * \param [in] image The reference to the image on which to set the attribute.
+ * \param [in] attribute The attribute to set. Use a \ref vx_image_attribute_e enumeration.
+ * \param [in] ptr The pointer to the location from which to read the value.
+ * \param [in] size The size in bytes of the object pointed to by \a ptr.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS No errors; any other value indicates failure.
+ * \retval VX_ERROR_INVALID_REFERENCE image is not a valid \ref vx_image reference.
+ * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect.
+ * \ingroup group_image
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxSetImageAttribute(vx_image image, vx_enum attribute, const void *ptr, vx_size size);
+
+/*! \brief Initializes an image with the given pixel value.
+ * \param [in] image The reference to the image to initialize.
+ * \param [in] pixel_value The pointer to the constant pixel value to initialize all image pixels. See \ref vx_pixel_value_t.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS No errors.
+ * \retval VX_ERROR_INVALID_REFERENCE If the image is a uniform image, a virtual image, or not a \ref vx_image.
+ * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect.
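+ *
+ * A minimal sketch (illustrative only; assumes img is a mutable, non-virtual
+ * \ref VX_DF_IMAGE_U8 image):
+ * \code
+ * vx_pixel_value_t zero;
+ * zero.U8 = 0;
+ * vxSetImagePixelValues(img, &zero);
+ * \endcode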
+ * \note All pixels of the entire image are initialized to the indicated pixel value, independently from the valid region. + * The valid region of the image is unaffected by this function. The image remains mutable after the call to this function, + * so its pixels and mutable attributes may be changed by subsequent functions. + * \ingroup group_image + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetImagePixelValues(vx_image image, const vx_pixel_value_t *pixel_value); + +/*! \brief Releases a reference to an image object. + * The object may not be garbage collected until its total reference count is zero. + * + * An implementation may defer the actual object destruction after its total + * reference count is zero (potentially until context destruction). Thus, + * releasing an image created from handle + * (see \ref vxCreateImageFromHandle) and all others objects that may + * reference it (nodes, ROI, or channel for instance) are not sufficient to get back the + * ownership of the memory referenced by the current image handle. The only way + * for this is to call \ref vxSwapImageHandle) before releasing the + * image. + * + * \param [in] image The pointer to the image to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE image is not a valid \ref vx_image reference. + * \ingroup group_image + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseImage(vx_image *image); + +/*! + * \brief Accesses a specific indexed pixel in an image patch. + * \param [in] ptr The base pointer of the patch as returned from \ref vxMapImagePatch. + * \param [in] index The 0 based index of the pixel count in the patch. Indexes increase horizontally by 1 then wrap around to the next row. + * \param [in] addr The pointer to the addressing mode information returned from \ref vxMapImagePatch. + * \return void * Returns the pointer to the specified pixel. + * \pre \ref vxMapImagePatch + * \ingroup group_image + */ +VX_API_ENTRY void * VX_API_CALL vxFormatImagePatchAddress1d(void *ptr, vx_uint32 index, const vx_imagepatch_addressing_t *addr); + +/*! + * \brief Accesses a specific pixel at a 2d coordinate in an image patch. + * \param [in] ptr The base pointer of the patch as returned from \ref vxMapImagePatch. + * \param [in] x The x dimension within the patch. + * \param [in] y The y dimension within the patch. + * \param [in] addr The pointer to the addressing mode information returned from \ref vxMapImagePatch. + * \return void * Returns the pointer to the specified pixel. + * \pre \ref vxMapImagePatch + * \ingroup group_image + */ +VX_API_ENTRY void * VX_API_CALL vxFormatImagePatchAddress2d(void *ptr, vx_uint32 x, vx_uint32 y, const vx_imagepatch_addressing_t *addr); + +/*! \brief Retrieves the valid region of the image as a rectangle. + * \param [in] image The image from which to retrieve the valid region. + * \param [out] rect The destination rectangle. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE image is not a valid \ref vx_image reference. + * \retval VX_ERROR_INVALID_PARAMETERS Invalid rect. + * \note This rectangle can be passed directly to \ref vxMapImagePatch to get + * the full valid region of the image. 
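+ *
+ * A hedged usage sketch of the pattern described in the note above (plane 0 of a
+ * single-plane image; VX_READ_ONLY and VX_MEMORY_TYPE_HOST are the standard enums,
+ * flags is left as 0; error checking omitted):
+ * \code
+ * vx_rectangle_t rect;
+ * vx_map_id map_id;
+ * vx_imagepatch_addressing_t addr;
+ * void *base = NULL;
+ * vxGetValidRegionImage(image, &rect);
+ * vxMapImagePatch(image, &rect, 0, &map_id, &addr, &base,
+ *                 VX_READ_ONLY, VX_MEMORY_TYPE_HOST, 0);
+ * // pixels can now be read via vxFormatImagePatchAddress2d(base, x, y, &addr)
+ * vxUnmapImagePatch(image, map_id);
+ * \endcode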
+ * \ingroup group_image + */ +VX_API_ENTRY vx_status VX_API_CALL vxGetValidRegionImage(vx_image image, vx_rectangle_t *rect); + +/*! \brief Allows the application to copy a rectangular patch from/into an image object plane. + * \param [in] image The reference to the image object that is the source or the + * destination of the copy. + * \param [in] image_rect The coordinates of the image patch. The patch must be within + * the bounds of the image. (start_x, start_y) gives the coordinates of the topleft + * pixel inside the patch, while (end_x, end_y) gives the coordinates of the bottomright + * element out of the patch. Must be 0 <= start < end <= number of pixels in the image dimension. + * \param [in] image_plane_index The plane index of the image object that is the source or the + * destination of the patch copy. + * \param [in] user_addr The address of a structure describing the layout of the + * user memory location pointed by user_ptr. In the structure, only dim_x, dim_y, + * stride_x and stride_y fields must be provided, other fields are ignored by the function. + * The layout of the user memory must follow a row major order: + * stride_x >= pixel size in bytes, and stride_y >= stride_x * dim_x. + * \param [in] user_ptr The address of the memory location where to store the requested data + * if the copy was requested in read mode, or from where to get the data to store into the image + * object if the copy was requested in write mode. The accessible memory must be large enough + * to contain the specified patch with the specified layout: + * accessible memory in bytes >= (end_y - start_y) * stride_y. + * \param [in] usage This declares the effect of the copy with regard to the image object + * using the \ref vx_accessor_e enumeration. For uniform images, only VX_READ_ONLY + * is supported. For other images, Only \ref VX_READ_ONLY and \ref VX_WRITE_ONLY are supported: + * \arg \ref VX_READ_ONLY means that data is copied from the image object into the application memory + * \arg \ref VX_WRITE_ONLY means that data is copied into the image object from the application memory + * \param [in] user_mem_type A \ref vx_memory_type_e enumeration that specifies + * the memory type of the memory referenced by the user_addr. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_OPTIMIZED_AWAY This is a reference to a virtual image that cannot be + * accessed by the application. + * \retval VX_ERROR_INVALID_REFERENCE image is not a valid \ref vx_image reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \note The application may ask for data outside the bounds of the valid region, but + * such data has an undefined value. + * \ingroup group_image + */ +VX_API_ENTRY vx_status VX_API_CALL vxCopyImagePatch(vx_image image, const vx_rectangle_t *image_rect, vx_uint32 image_plane_index, const vx_imagepatch_addressing_t *user_addr, void * user_ptr, vx_enum usage, vx_enum user_mem_type); + + +/*! \brief Allows the application to get direct access to a rectangular patch of an image object plane. + * \param [in] image The reference to the image object that contains the patch to map. + * \param [in] rect The coordinates of image patch. The patch must be within the + * bounds of the image. (start_x, start_y) gives the coordinate of the topleft + * element inside the patch, while (end_x, end_y) give the coordinate of + * the bottomright element out of the patch. Must be 0 <= start < end. 
+ * \param [in] plane_index The plane index of the image object to be accessed.
+ * \param [out] map_id The address of a \ref vx_map_id variable where the function
+ * returns a map identifier.
+ * \arg (*map_id) must eventually be provided as the map_id parameter of a call to
+ * \ref vxUnmapImagePatch.
+ * \param [out] addr The address of a \ref vx_imagepatch_addressing_t structure
+ * describing the memory layout of the image patch to access. The function fills the
+ * structure pointed by addr with the layout information that the application must
+ * consult to access the pixel data at address (*ptr). The layout of the mapped memory
+ * follows a row-major order: stride_x>0, stride_y>0 and stride_y >= stride_x * dim_x.
+ * An exception is for \ref VX_DF_IMAGE_U1 where \a stride_x == 0,
+ * _stride_x_bits_ > 0 and _stride_y_ >= (_stride_x_bits_ * _dim_x_ + 7) / 8
+ * (i.e., at least the number of bytes needed to hold _dim_x_ pixels).
+ * If the image object being accessed was created via
+ * \ref vxCreateImageFromHandle, then the returned memory layout will be
+ * identical to that of the addressing structure provided when
+ * \ref vxCreateImageFromHandle was called.
+ * \param [out] ptr The address of a pointer that the function sets to the
+ * address where the requested data can be accessed. This returned (*ptr) address
+ * is only valid between the call to this function and the corresponding call to
+ * \ref vxUnmapImagePatch.
+ * If image was created via \ref vxCreateImageFromHandle then the returned
+ * address (*ptr) will be the address of the patch in the original pixel buffer
+ * provided when image was created.
+ * \param [in] usage This declares the access mode for the image patch, using
+ * the \ref vx_accessor_e enumeration. For uniform images, only VX_READ_ONLY
+ * is supported.
+ * \arg \ref VX_READ_ONLY: after the function call, the content of the memory location
+ * pointed by (*ptr) contains the image patch data. Writing into this memory location
+ * is forbidden and its behavior is undefined.
+ * \arg \ref VX_READ_AND_WRITE: after the function call, the content of the memory
+ * location pointed by (*ptr) contains the image patch data; writing into this memory
+ * is allowed only at the locations of pixels and will result in a modification
+ * of the written pixels in the image object once the patch is unmapped. Writing into
+ * a gap between pixels (when addr->stride_x > pixel size in bytes or addr->stride_y > addr->stride_x*addr->dim_x)
+ * is forbidden and its behavior is undefined.
+ * \arg \ref VX_WRITE_ONLY: after the function call, the memory location pointed by (*ptr)
+ * contains undefined data; writing each pixel of the patch is required prior to
+ * unmapping. Pixels not written by the application before unmap will become
+ * undefined after unmap, even if they were well defined before map. Like for
+ * VX_READ_AND_WRITE, writing into a gap between pixels is forbidden and its behavior
+ * is undefined.
+ * \param [in] mem_type A \ref vx_memory_type_e enumeration that
+ * specifies the type of the memory where the image patch is requested to be mapped.
+ * \param [in] flags An integer that allows passing options to the map operation.
+ * Use the \ref vx_map_flag_e enumeration.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS No errors; any other value indicates failure.
+ * \retval VX_ERROR_OPTIMIZED_AWAY This is a reference to a virtual image that cannot be
+ * accessed by the application.
+ * \retval VX_ERROR_INVALID_REFERENCE image is not a valid \ref vx_image reference.
+ * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect.
+ * \note The user may ask for data outside the bounds of the valid region, but
+ * such data has an undefined value.
+ * \ingroup group_image
+ * \post \ref vxUnmapImagePatch with same (*map_id) value.
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxMapImagePatch(vx_image image, const vx_rectangle_t *rect, vx_uint32 plane_index, vx_map_id *map_id, vx_imagepatch_addressing_t *addr, void **ptr, vx_enum usage, vx_enum mem_type, vx_uint32 flags);
+
+
+/*! \brief Unmap and commit potential changes to an image object patch that were previously mapped.
+ * Unmapping an image patch invalidates the memory location from which the patch could
+ * be accessed by the application. Accessing this memory location after the unmap function
+ * completes has an undefined behavior.
+ * \param [in] image The reference to the image object to unmap.
+ * \param [out] map_id The unique map identifier that was returned by \ref vxMapImagePatch.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS No errors; any other value indicates failure.
+ * \retval VX_ERROR_INVALID_REFERENCE image is not a valid \ref vx_image reference.
+ * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect.
+ * \ingroup group_image
+ * \pre \ref vxMapImagePatch with same map_id value
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxUnmapImagePatch(vx_image image, vx_map_id map_id);
+
+/*! \brief Creates a sub-image from a single plane channel of another image.
+ *
+ * The sub-image refers to the data in the original image. Updates to this image
+ * update the parent image and vice versa.
+ *
+ * The function supports only channels that occupy an entire plane of a multi-planar
+ * image, as listed below. Other cases are not supported.
+ * VX_CHANNEL_Y from YUV4, IYUV, NV12, NV21
+ * VX_CHANNEL_U from YUV4, IYUV
+ * VX_CHANNEL_V from YUV4, IYUV
+ *
+ * \param [in] img The reference to the parent image.
+ * \param [in] channel The \ref vx_channel_e channel to use.
+
+ * \returns An image reference \ref vx_image to the sub-image. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_image
+ */
+VX_API_ENTRY vx_image VX_API_CALL vxCreateImageFromChannel(vx_image img, vx_enum channel);
+
+
+/*! \brief Sets the valid rectangle for an image according to a supplied rectangle.
+ * \note Setting or changing the valid region from within a user node by means other than the call-back, for
+ * example by calling \ref vxSetImageValidRectangle, might result in an incorrect valid region calculation
+ * by the framework.
+ * \param [in] image The reference to the image.
+ * \param [in] rect The value to be set to the image valid rectangle. A NULL indicates that the valid region is the entire image.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS No errors; any other value indicates failure.
+ * \retval VX_ERROR_INVALID_REFERENCE image is not a valid \ref vx_image reference.
+ * \retval VX_ERROR_INVALID_PARAMETERS The rect does not define a proper valid rectangle.
+ * \ingroup group_image
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxSetImageValidRectangle(vx_image image, const vx_rectangle_t *rect);
+
+
+
+/*==============================================================================
+ KERNEL
+ =============================================================================*/
+
+ /*!
\brief Registers a module with kernels in a context. + * \details This function registers the appropriate publish and unpublish functions + * with the module name if the module is not a dynamic library, so \ref vxLoadKernels and + * \ref vxUnloadKernels can be called. + * \param [in] context The reference to the context the kernels must be added to. + * \param [in] module The short name of the module to load. + * \param [in] publish must add kernels to the context by calling \ref vxAddUserKernel + * for each new kernel. It is called by \ref vxLoadKernels. + * \param [in] unpublish must remove kernels from the context by calling \ref vxRemoveKernel + * for each kernel the vxPublishKernels has added. It is called by \ref vxUnloadKernels. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE context is not a valid \ref vx_context reference. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * \ingroup group_user_kernels + * \see vxLoadKernels + */ +VX_API_ENTRY vx_status VX_API_CALL vxRegisterKernelLibrary(vx_context context, const vx_char *module, vx_publish_kernels_f publish, vx_unpublish_kernels_f unpublish); + + +/*! \brief Loads a library of kernels, called module, into a context. + * + * The module must be a dynamic library with by convention, two exported functions + * named vxPublishKernels and vxUnpublishKernels. + * + * vxPublishKernels must have type \ref vx_publish_kernels_f, + * and must add kernels to the context by calling \ref vxAddUserKernel + * for each new kernel. vxPublishKernels is called by \ref vxLoadKernels. + * + * vxUnpublishKernels must have type \ref vx_unpublish_kernels_f, + * and must remove kernels from the context by calling \ref vxRemoveKernel + * for each kernel the vxPublishKernels has added. + * vxUnpublishKernels is called by \ref vxUnloadKernels. + * + * \note When all references to loaded kernels are released, the module + * may be automatically unloaded. + * \param [in] context The reference to the context the kernels must be added to. + * \param [in] module The short name of the module to load. On systems where + * there are specific naming conventions for modules, the name passed + * should ignore such conventions. For example: \c libxyz.so should be + * passed as just \c xyz and the implementation will do the right thing that + * the platform requires. + * \note This API uses the system pre-defined paths for modules. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE context is not a valid \ref vx_context reference. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * \ingroup group_user_kernels + * \see vxGetKernelByName + */ +VX_API_ENTRY vx_status VX_API_CALL vxLoadKernels(vx_context context, const vx_char *module); + +/*! \brief Unloads all kernels from the OpenVX context that had been loaded from + * the module using the \ref vxLoadKernels function. + * + * The kernel unloading is performed by calling the vxUnpublishKernels + * exported function of the module. + * \note vxUnpublishKernels is defined in the description of + * \ref vxLoadKernels. + * + * \param [in] context The reference to the context the kernels must be removed from. + * \param [in] module The short name of the module to unload. 
On systems where + * there are specific naming conventions for modules, the name passed + * should ignore such conventions. For example: \c libxyz.so should be + * passed as just \c xyz and the implementation will do the right thing + * that the platform requires. + * \note This API uses the system pre-defined paths for modules. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE context is not a valid \ref vx_context reference. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are + incorrect. + * \ingroup group_user_kernels + * \see vxLoadKernels + */ +VX_API_ENTRY vx_status VX_API_CALL vxUnloadKernels(vx_context context, const vx_char *module); + +/*! \brief Obtains a reference to a kernel using a string to specify the name. + * \details User Kernels follow a "dotted" heirarchical syntax. For example: + * "com.company.example.xyz". The following are strings specifying the kernel names: + + * org.khronos.openvx.color_convert + + * org.khronos.openvx.channel_extract + + * org.khronos.openvx.channel_combine + + * org.khronos.openvx.sobel_3x3 + + * org.khronos.openvx.magnitude + + * org.khronos.openvx.phase + + * org.khronos.openvx.scale_image + + * org.khronos.openvx.table_lookup + + * org.khronos.openvx.histogram + + * org.khronos.openvx.equalize_histogram + + * org.khronos.openvx.absdiff + + * org.khronos.openvx.mean_stddev + + * org.khronos.openvx.threshold + + * org.khronos.openvx.integral_image + + * org.khronos.openvx.dilate_3x3 + + * org.khronos.openvx.erode_3x3 + + * org.khronos.openvx.median_3x3 + + * org.khronos.openvx.box_3x3 + + * org.khronos.openvx.gaussian_3x3 + + * org.khronos.openvx.custom_convolution + + * org.khronos.openvx.gaussian_pyramid + + * org.khronos.openvx.accumulate + + * org.khronos.openvx.accumulate_weighted + + * org.khronos.openvx.accumulate_square + + * org.khronos.openvx.minmaxloc + + * org.khronos.openvx.convertdepth + + * org.khronos.openvx.canny_edge_detector + + * org.khronos.openvx.and + + * org.khronos.openvx.or + + * org.khronos.openvx.xor + + * org.khronos.openvx.not + + * org.khronos.openvx.multiply + + * org.khronos.openvx.add + + * org.khronos.openvx.subtract + + * org.khronos.openvx.warp_affine + + * org.khronos.openvx.warp_perspective + + * org.khronos.openvx.harris_corners + + * org.khronos.openvx.fast_corners + + * org.khronos.openvx.optical_flow_pyr_lk + + * org.khronos.openvx.remap + + * org.khronos.openvx.halfscale_gaussian + + * org.khronos.openvx.laplacian_pyramid + + * org.khronos.openvx.laplacian_reconstruct + + * org.khronos.openvx.non_linear_filter + + * org.khronos.openvx.match_template + + * org.khronos.openvx.lbp + + * org.khronos.openvx.hough_lines_p + + * org.khronos.openvx.tensor_multiply + + * org.khronos.openvx.tensor_add + + * org.khronos.openvx.tensor_subtract + + * org.khronos.openvx.tensor_table_lookup + + * org.khronos.openvx.tensor_transpose + + * org.khronos.openvx.tensor_convert_depth + + * org.khronos.openvx.tensor_matrix_multiply + + * org.khronos.openvx.copy + + * org.khronos.openvx.non_max_suppression + + * org.khronos.openvx.scalar_operation + + * org.khronos.openvx.hog_features + + * org.khronos.openvx.hog_cells + + * org.khronos.openvx.bilateral_filter + + * org.khronos.openvx.select + + * org.khronos.openvx.min + + * org.khronos.openvx.max + + * \param [in] context The reference to the implementation context. + * \param [in] name The string of the name of the kernel to get. 
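+ *
+ * A brief illustrative sketch (the kernel name is one of the standard names listed
+ * above; \a context and \a graph are assumed to exist already):
+ * \code
+ * vx_kernel kernel = vxGetKernelByName(context, "org.khronos.openvx.sobel_3x3");
+ * vx_node   node   = vxCreateGenericNode(graph, kernel);
+ * vxReleaseKernel(&kernel);
+ * \endcode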
+ * \return A kernel reference. Any possible errors preventing a successful + * completion of the function should be checked using \ref vxGetStatus. + * \ingroup group_kernel + * \pre \ref vxLoadKernels if the kernel is not provided by the + * OpenVX implementation. + * \note User Kernels should follow a "dotted" hierarchical syntax. For example: + * "com.company.example.xyz". + */ +VX_API_ENTRY vx_kernel VX_API_CALL vxGetKernelByName(vx_context context, const vx_char *name); + +/*! \brief Obtains a reference to the kernel using the \ref vx_kernel_e enumeration. + * \details Enum values above the standard set are assumed to apply to + * loaded libraries. + * \param [in] context The reference to the implementation context. + * \param [in] kernel A value from \ref vx_kernel_e or a vendor or client-defined value. + * \return A \ref vx_kernel reference. Any possible errors preventing a successful completion + * of the function should be checked using \ref vxGetStatus. + * \ingroup group_kernel + * \pre \ref vxLoadKernels if the kernel is not provided by the + * OpenVX implementation. + */ +VX_API_ENTRY vx_kernel VX_API_CALL vxGetKernelByEnum(vx_context context, vx_enum kernel); + +/*! \brief This allows the client to query the kernel to get information about + * the number of parameters, enum values, etc. + * \param [in] kernel The kernel reference to query. + * \param [in] attribute The attribute to query. Use a \ref vx_kernel_attribute_e. + * \param [out] ptr The pointer to the location at which to store the resulting value. + * \param [in] size The size of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE kernel is not a valid \ref vx_kernel reference. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * \retval VX_ERROR_NOT_SUPPORTED If the attribute value is not supported in this implementation. + * \ingroup group_kernel + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryKernel(vx_kernel kernel, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Release the reference to the kernel. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] kernel The pointer to the kernel reference to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE kernel is not a valid \ref vx_kernel reference. + * \ingroup group_kernel + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseKernel(vx_kernel *kernel); + +/*! \brief Allows users to add custom kernels to a context at run-time. + * \param [in] context The reference to the context the kernel must be added to. + * \param [in] name The string to use to match the kernel. + * \param [in] enumeration The enumerated value of the kernel to be used by clients. + * \param [in] func_ptr The process-local function pointer to be invoked. + * \param [in] numParams The number of parameters for this kernel. + * \param [in] validate The pointer to \ref vx_kernel_validate_f, which validates + * parameters to this kernel. + * \param [in] init The kernel initialization function. + * \param [in] deinit The kernel de-initialization function. + * \return A \ref vx_kernel reference. Any possible errors + * preventing a successful creation should be checked using \ref vxGetStatus. 
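+ *
+ * A hedged registration sketch (MY_KERNEL_ENUM, my_kernel_func, my_validate,
+ * my_init and my_deinit are hypothetical application symbols; VX_TYPE_IMAGE and
+ * VX_PARAMETER_STATE_REQUIRED are the standard enum values):
+ * \code
+ * vx_kernel kernel = vxAddUserKernel(context, "com.company.example.xyz",
+ *                                    MY_KERNEL_ENUM, my_kernel_func, 2,
+ *                                    my_validate, my_init, my_deinit);
+ * vxAddParameterToKernel(kernel, 0, VX_INPUT,  VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED);
+ * vxAddParameterToKernel(kernel, 1, VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED);
+ * vxFinalizeKernel(kernel);
+ * \endcode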
+ * \ingroup group_user_kernels + */ +VX_API_ENTRY vx_kernel VX_API_CALL vxAddUserKernel(vx_context context, + const vx_char name[VX_MAX_KERNEL_NAME], + vx_enum enumeration, + vx_kernel_f func_ptr, + vx_uint32 numParams, + vx_kernel_validate_f validate, + vx_kernel_initialize_f init, + vx_kernel_deinitialize_f deinit); + +/*! \brief This API is called after all parameters have been added to the + * kernel and the kernel is \e ready to be used. Notice that the reference to the kernel created + * by vxAddUserKernel is still valid after the call to vxFinalizeKernel. + * If an error occurs, the kernel is not available for usage by the clients of OpenVX. Typically + * this is due to a mismatch between the number of parameters requested and given. + * \param [in] kernel The reference to the loaded kernel from \ref vxAddUserKernel. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE kernel is not a valid \ref vx_kernel reference. + * \pre \ref vxAddUserKernel and \ref vxAddParameterToKernel + * \ingroup group_user_kernels + */ +VX_API_ENTRY vx_status VX_API_CALL vxFinalizeKernel(vx_kernel kernel); + +/*! \brief Allows users to set the signatures of the custom kernel. + * \param [in] kernel The reference to the kernel added with \ref vxAddUserKernel. + * \param [in] index The index of the parameter to add. + * \param [in] dir The direction of the parameter. This must be either \ref VX_INPUT or + * \ref VX_OUTPUT. \ref VX_BIDIRECTIONAL is not supported for this function. + * \param [in] data_type The type of parameter. This must be a value from \ref vx_type_e. + * \param [in] state The state of the parameter (required or not). This must be a value from \ref vx_parameter_state_e. + * \return A \ref vx_status_e enumerated value. + * \retval VX_SUCCESS Parameter is successfully set on kernel; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE kernel is not a valid \ref vx_kernel reference. + * \retval VX_ERROR_INVALID_PARAMETERS If the parameter is not valid for any reason. + * \pre \ref vxAddUserKernel + * \ingroup group_user_kernels + */ +VX_API_ENTRY vx_status VX_API_CALL vxAddParameterToKernel(vx_kernel kernel, vx_uint32 index, vx_enum dir, vx_enum data_type, vx_enum state); + +/*! \brief Removes a custom kernel from its context and releases it. + * \param [in] kernel The reference to the kernel to remove. Returned from \ref vxAddUserKernel. + * \note Any kernel enumerated in the base standard + * cannot be removed; only kernels added through \ref vxAddUserKernel can + * be removed. + * \return A \ref vx_status_e enumeration. The function returns to the + * application full control over the memory resources provided at the kernel creation time. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE kernel is not a valid \ref vx_kernel reference. + * \retval VX_ERROR_INVALID_PARAMETERS If a base kernel is passed in. + * \retval VX_FAILURE If the application has not released all references to the kernel + * object OR if the application has not released all references to a node that is using + * this kernel OR if the application has not released all references to a graph which + * has nodes that is using this kernel. + * \ingroup group_user_kernels + */ +VX_API_ENTRY vx_status VX_API_CALL vxRemoveKernel(vx_kernel kernel); + +/*! \brief Sets kernel attributes. + * \param [in] kernel The reference to the kernel. 
+ * \param [in] attribute The enumeration of the attributes. See \ref vx_kernel_attribute_e. + * \param [in] ptr The pointer to the location from which to read the attribute. + * \param [in] size The size in bytes of the data area indicated by \a ptr in bytes. + * \note After a kernel has been passed to \ref vxFinalizeKernel, no attributes + * can be altered. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE kernel is not a valid \ref vx_kernel reference. + * \ingroup group_user_kernels + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetKernelAttribute(vx_kernel kernel, vx_enum attribute, const void *ptr, vx_size size); + +/*! \brief Retrieves a \ref vx_parameter from a \ref vx_kernel. + * \param [in] kernel The reference to the kernel. + * \param [in] index The index of the parameter. + * \return A \ref vx_parameter reference. Any possible errors preventing a + * successful completion of the function should be checked using \ref vxGetStatus. + * \ingroup group_parameter + */ +VX_API_ENTRY vx_parameter VX_API_CALL vxGetKernelParameterByIndex(vx_kernel kernel, vx_uint32 index); + +/*============================================================================== + GRAPH + =============================================================================*/ + +/*! \brief Creates an empty graph. + * \param [in] context The reference to the implementation context. + * \returns A graph reference \ref vx_graph. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_graph + */ +VX_API_ENTRY vx_graph VX_API_CALL vxCreateGraph(vx_context context); + +/*! \brief Releases a reference to a graph. + * The object may not be garbage collected until its total reference count is zero. + * Once the reference count is zero, all node references in the graph are automatically + * released as well. Releasing the graph will only release the nodes if the nodes were + * not previously released by the application. Data referenced by those nodes may not + * be released as the user may still have references to the data. + * \param [in] graph The pointer to the graph to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE graph is not a valid \ref vx_graph reference. + * \ingroup group_graph + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseGraph(vx_graph *graph); + +/*! \brief Verifies the state of the graph before it is executed. + * This is useful to catch programmer errors and contract errors. If not verified, + * the graph verifies before being processed. + * \pre Memory for data objects is not guarenteed to exist before + * this call. \post After this call data objects exist unless + * the implementation optimized them out. + * \param [in] graph The reference to the graph to verify. + * \return A status code for graphs with more than one error; it is + * undefined which error will be returned. Register a log callback using \ref vxRegisterLogCallback + * to receive each specific error in the graph. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE graph is not a valid \ref vx_graph reference. 
+ * \retval VX_ERROR_MULTIPLE_WRITERS If the graph contains more than one writer
+ * to any data object.
+ * \retval VX_ERROR_INVALID_NODE If a node in the graph is invalid or failed to be created.
+ * \retval VX_ERROR_INVALID_GRAPH If the graph contains cycles or some other invalid topology.
+ * \retval VX_ERROR_INVALID_TYPE If any parameter on a node is given the wrong type.
+ * \retval VX_ERROR_INVALID_VALUE If any value of any parameter is out of bounds of specification.
+ * \retval VX_ERROR_INVALID_FORMAT If the image format is not compatible.
+ * \ingroup group_graph
+ * \see vxProcessGraph
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxVerifyGraph(vx_graph graph);
+
+/*! \brief This function causes the synchronous processing of a graph. If the graph
+ * has not been verified, then the implementation verifies the graph
+ * immediately. If verification fails this function returns a status
+ * identical to what \ref vxVerifyGraph would return. After
+ * the graph verifies successfully then processing occurs. If the graph was
+ * previously verified via \ref vxVerifyGraph or \ref vxProcessGraph
+ * then the graph is processed. This function blocks until the graph is completed.
+ * \param [in] graph The graph to execute.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS Graph has been processed; any other value indicates failure.
+ * \retval VX_ERROR_INVALID_REFERENCE graph is not a valid \ref vx_graph reference.
+ * \retval VX_FAILURE A catastrophic error occurred during processing.
+ * \ingroup group_graph
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxProcessGraph(vx_graph graph);
+
+/*! \brief Schedules a graph for future execution. If the graph
+ * has not been verified, then the implementation verifies the graph
+ * immediately. If verification fails this function returns a status
+ * identical to what \ref vxVerifyGraph would return. After
+ * the graph verifies successfully then processing occurs. If the graph was
+ * previously verified via \ref vxVerifyGraph or \ref vxProcessGraph
+ * then the graph is processed.
+ * \param [in] graph The graph to schedule.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS The graph has been scheduled; any other value indicates failure.
+ * \retval VX_ERROR_INVALID_REFERENCE graph is not a valid \ref vx_graph reference.
+ * \retval VX_ERROR_NO_RESOURCES The graph cannot be scheduled now.
+ * \retval VX_ERROR_NOT_SUFFICIENT The graph is not verified and has failed
+ * forced verification.
+ * \ingroup group_graph
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxScheduleGraph(vx_graph graph);
+
+/*! \brief Waits for a specific graph to complete. If the graph has been scheduled multiple
+ * times since the last call to vxWaitGraph, then vxWaitGraph returns only when the last
+ * scheduled execution completes.
+ * \param [in] graph The graph to wait on.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS The graph has successfully completed execution and its outputs are the
+ * valid results of the most recent execution; any other value indicates failure.
+ * \retval VX_ERROR_INVALID_REFERENCE graph is not a valid \ref vx_graph reference.
+ * \retval VX_FAILURE An error occurred or the graph was never scheduled. Output data of the
+ * graph is undefined.
+ * \pre \ref vxScheduleGraph
+ * \ingroup group_graph
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxWaitGraph(vx_graph graph);
+
+/*! \brief Allows the user to query attributes of the Graph.
+ * \param [in] graph The reference to the created graph.
+ * \param [in] attribute The \ref vx_graph_attribute_e type needed. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE graph is not a valid \ref vx_graph reference. + * \ingroup group_graph + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryGraph(vx_graph graph, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Allows the attributes of the Graph to be set to the provided value. + * \param [in] graph The reference to the graph. + * \param [in] attribute The \ref vx_graph_attribute_e type needed. + * \param [in] ptr The location from which to read the value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE graph is not a valid \ref vx_graph reference. + * \ingroup group_graph + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetGraphAttribute(vx_graph graph, vx_enum attribute, const void *ptr, vx_size size); + +/*! \brief Adds the given parameter extracted from a \ref vx_node to the graph. + * \param [in] graph The graph reference that contains the node. + * \param [in] parameter The parameter reference to add to the graph from the node. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Parameter added to Graph; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE graph is not a valid \ref vx_graph reference or parameter is not a valid \ref vx_parameter reference. + * \retval VX_ERROR_INVALID_PARAMETERS The parameter is of a node not in this + * graph. + * \ingroup group_graph_parameters + */ +VX_API_ENTRY vx_status VX_API_CALL vxAddParameterToGraph(vx_graph graph, vx_parameter parameter); + +/*! \brief Sets a reference to the parameter on the graph. The implementation + * must set this parameter on the originating node as well. + * \param [in] graph The graph reference. + * \param [in] index The parameter index. + * \param [in] value The reference to set to the parameter. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Parameter set to Graph; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE graph is not a valid \ref vx_graph reference or + * value is not a valid \ref vx_reference. + * \retval VX_ERROR_INVALID_PARAMETERS The parameter index is out of bounds or the + * dir parameter is incorrect. + * \ingroup group_graph_parameters + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetGraphParameterByIndex(vx_graph graph, vx_uint32 index, vx_reference value); + +/*! \brief Retrieves a \ref vx_parameter from a \ref vx_graph. + * \param [in] graph The graph. + * \param [in] index The index of the parameter. + * \return \ref vx_parameter reference. Any possible errors preventing a successful + * function completion should be checked using \ref vxGetStatus. + * \ingroup group_graph_parameters + */ +VX_API_ENTRY vx_parameter VX_API_CALL vxGetGraphParameterByIndex(vx_graph graph, vx_uint32 index); + +/*! \brief Returns a Boolean to indicate the state of graph verification. + * \param [in] graph The reference to the graph to check. + * \return A \ref vx_bool value. + * \retval vx_true_e The graph is verified. + * \retval vx_false_e The graph is not verified. 
It must be verified before
+ * execution either through \ref vxVerifyGraph or automatically through
+ * \ref vxProcessGraph or \ref vxScheduleGraph.
+ * \ingroup group_graph
+ */
+VX_API_ENTRY vx_bool VX_API_CALL vxIsGraphVerified(vx_graph graph);
+
+/*! \brief Explicitly specifies the inputs and outputs of a graph.
+ * \param [in] graph The graph.
+ * \param [in] num_of_inputs The number of input references.
+ * \param [in] inputs The array of input references.
+ * \param [in] num_of_outputs The number of output references.
+ * \param [in] outputs The array of output references.
+ * \return A \ref vx_status value.
+ * \ingroup group_graph
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxIdentifyGraphInputsAndOutputs(vx_graph graph,
+                                                                   vx_uint32 num_of_inputs,
+                                                                   vx_reference *inputs,
+                                                                   vx_uint32 num_of_outputs,
+                                                                   vx_reference *outputs);
+
+/*! \brief Gets the size of the binary graph (NBG) or generates the binary graph into a buffer.
+ * \param [in] graph The graph.
+ * \param [in] buffer The buffer that receives the generated binary graph when *size is set to the actual NBG size; pass NULL to query the size only.
+ * \param [in] size Returns the size of the binary graph when buffer is NULL; otherwise it must hold the size of buffer.
+ * \return A \ref vx_status value.
+ * \ingroup group_graph
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxGenerateNBG(vx_graph graph, void *buffer, vx_size *size);
+
+/*==============================================================================
+ NODE
+ =============================================================================*/
+
+/*! \brief Creates a reference to a node object for a given kernel.
+ * \details This node has no references assigned as parameters after completion.
+ * The client is then required to set these parameters manually by \ref vxSetParameterByIndex.
+ * When clients supply their own node creation functions (for use with User Kernels), this is the API
+ * to use along with the parameter setting API.
+ * \param [in] graph The reference to the graph in which this node exists.
+ * \param [in] kernel The kernel reference to associate with this new node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \note A call to this API sets all parameters to NULL.
+ * \ingroup group_adv_node
+ * \post Call \ref vxSetParameterByIndex for as many parameters as needed to be set.
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxCreateGenericNode(vx_graph graph, vx_kernel kernel);
+
+/*! \brief Allows a user to query information out of a node.
+ * \param [in] node The reference to the node to query.
+ * \param [in] attribute Use \ref vx_node_attribute_e value to query for information.
+ * \param [out] ptr The location at which to store the resulting value.
+ * \param [in] size The size in bytes of the container to which \a ptr points.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS No errors; any other value indicates failure.
+ * \retval VX_ERROR_INVALID_REFERENCE node is not a valid \ref vx_node reference.
+ * \retval VX_ERROR_INVALID_PARAMETERS The type or size is incorrect.
+ * \ingroup group_node
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxQueryNode(vx_node node, vx_enum attribute, void *ptr, vx_size size);
+
+/*! \brief Allows a user to set attribute of a node before Graph Validation.
+ * \param [in] node The reference to the node to set.
+ * \param [in] attribute Use \ref vx_node_attribute_e value to set the desired attribute.
+ * \param [in] ptr The pointer to the desired value of the attribute.
+ * \param [in] size The size in bytes of the objects to which \a ptr points.
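+ *
+ * A small illustrative sketch (see the note below; \ref VX_NODE_LOCAL_DATA_SIZE
+ * is one of the attributes that may be overridden this way):
+ * \code
+ * vx_size local_size = 1024;
+ * vxSetNodeAttribute(node, VX_NODE_LOCAL_DATA_SIZE, &local_size, sizeof(local_size));
+ * \endcode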
+ * \note Some attributes are inherited from the \ref vx_kernel, which was used + * to create the node. Some of these can be overridden using this API, notably + * \ref VX_NODE_LOCAL_DATA_SIZE and \ref VX_NODE_LOCAL_DATA_PTR. + * \ingroup group_node + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS The attribute was set; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE node is not a valid \ref vx_node reference. + * \retval VX_ERROR_INVALID_PARAMETERS size is not correct for the type needed. + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetNodeAttribute(vx_node node, vx_enum attribute, const void *ptr, vx_size size); + +/*! \brief Releases a reference to a Node object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] node The pointer to the reference of the node to release. + * \ingroup group_node + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE node is not a valid \ref vx_node reference. + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseNode(vx_node *node); + +/*! \brief Removes a Node from its parent Graph and releases it. + * \param [in] node The pointer to the node to remove and release. + * \ingroup group_node + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE node is not a valid \ref vx_node reference. + */ +VX_API_ENTRY vx_status VX_API_CALL vxRemoveNode(vx_node *node); + +/*! \brief Assigns a callback to a node. + * If a callback already exists in this node, this function must return an error + * and the user may clear the callback by passing a NULL pointer as the callback. + * \param [in] node The reference to the node. + * \param [in] callback The callback to associate with completion of this + * specific node. + * \warning This must be used with extreme caution as it can \e ruin + * optimizations in the power/performance efficiency of a graph. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Callback assigned; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE node is not a valid \ref vx_node reference. + * \ingroup group_node_callback + */ +VX_API_ENTRY vx_status VX_API_CALL vxAssignNodeCallback(vx_node node, vx_nodecomplete_f callback); + +/*! \brief Retrieves the current node callback function pointer set on the node. + * \param [in] node The reference to the \ref vx_node object. + * \ingroup group_node_callback + * \return vx_nodecomplete_f The pointer to the callback function. + * \retval NULL No callback is set. + * \retval * The node callback function. + */ +VX_API_ENTRY vx_nodecomplete_f VX_API_CALL vxRetrieveNodeCallback(vx_node node); + +/*! \brief Sets the node target to the provided value. A success invalidates the graph + * that the node belongs to (\ref vxVerifyGraph must be called before the next execution) + * \param [in] node The reference to the \ref vx_node object. + * \param [in] target_enum The target enum to be set to the \ref vx_node object. + * Use a \ref vx_target_e. + * \param [in] target_string The target name ASCII string. This contains a valid value + * when target_enum is set to \ref VX_TARGET_STRING, otherwise it is ignored. 
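+ *
+ * For example (a hedged sketch; the target string "any" is only illustrative and
+ * must be a name known to the implementation):
+ * \code
+ * vx_status status = vxSetNodeTarget(node, VX_TARGET_STRING, "any");
+ * \endcode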
+ * \ingroup group_node + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Node target set; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE node is not a valid \ref vx_node reference. + * \retval VX_ERROR_NOT_SUPPORTED If the node kernel is not supported by the specified target. + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetNodeTarget(vx_node node, vx_enum target_enum, const char* target_string); + +/*! \brief Creates replicas of the same node first_node to process a set of objects + * stored in \ref vx_pyramid or \ref vx_object_array. + * first_node needs to have as parameter levels 0 of a \ref vx_pyramid or the index 0 of a \ref vx_object_array. + * Replica nodes are not accessible by the application through any means. An application request for removal of + * first_node from the graph will result in removal of all replicas. Any change of parameter or attribute of + * first_node will be propagated to the replicas. \ref vxVerifyGraph shall enforce consistency of parameters and attributes + * in the replicas. + * \param [in] graph The reference to the graph. + * \param [in] first_node The reference to the node in the graph that will be replicated. + * \param [in] replicate an array of size equal to the number of node parameters, vx_true_e for the parameters + * that should be iterated over (should be a reference to a vx_pyramid or a vx_object_array), + * vx_false_e for the parameters that should be the same across replicated nodes and for optional + * parameters that are not used. Should be vx_true_e for all output and bidirectional parameters. + * \param [in] number_of_parameters number of elements in the replicate array + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE graph is not a valid \ref vx_graph reference, or first_node is not a + * valid \ref vx_node reference. + * \retval VX_ERROR_NOT_COMPATIBLE At least one of replicated parameters is not of level 0 of a pyramid or at index 0 of an object array. + * \retval VX_FAILURE If the node does not belong to the graph, or the number of objects in the parent objects of inputs and output are not the same. + * \ingroup group_node + */ +VX_API_ENTRY vx_status VX_API_CALL vxReplicateNode(vx_graph graph, vx_node first_node, vx_bool replicate[], vx_uint32 number_of_parameters); + +/*============================================================================== + PARAMETER + =============================================================================*/ + +/*! \brief Retrieves a \ref vx_parameter from a \ref vx_node. + * \param [in] node The node from which to extract the parameter. + * \param [in] index The index of the parameter to which to get a reference. + * \return A parameter reference \ref vx_parameter. Any possible errors preventing a successful + * completion of the function should be checked using \ref vxGetStatus. + * \ingroup group_parameter + */ +VX_API_ENTRY vx_parameter VX_API_CALL vxGetParameterByIndex(vx_node node, vx_uint32 index); + +/*! \brief Releases a reference to a parameter object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] param The pointer to the parameter to release. + * \ingroup group_parameter + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. 
+ * \retval VX_ERROR_INVALID_REFERENCE param is not a valid \ref vx_parameter reference. + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseParameter(vx_parameter *param); + +/*! \brief Sets the specified parameter data for a kernel on the node. + * \param [in] node The node that contains the kernel. + * \param [in] index The index of the parameter desired. + * \param [in] value The desired value of the parameter. + * \note A user may not provide a NULL value for a mandatory parameter of this API. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE node is not a valid \ref vx_node reference, or value + * is not a valid \ref vx_reference reference. + * \ingroup group_parameter + * \see vxSetParameterByReference + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetParameterByIndex(vx_node node, vx_uint32 index, vx_reference value); + +/*! \brief Associates a parameter reference and a data reference with a kernel + * on a node. + * \param [in] parameter The reference to the kernel parameter. + * \param [in] value The value to associate with the kernel parameter. + * \note A user may not provide a NULL value for a mandatory parameter of this API. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE parameter is not a valid \ref vx_parameter reference, + * or value is not a valid \ref vx_reference reference.. + * \ingroup group_parameter + * \see vxGetParameterByIndex + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetParameterByReference(vx_parameter parameter, vx_reference value); + +/*! \brief Allows the client to query a parameter to determine its meta-information. + * \param [in] parameter The reference to the parameter. + * \param [in] attribute The attribute to query. Use a \ref vx_parameter_attribute_e. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE parameter is not a valid \ref vx_parameter reference. + * \ingroup group_parameter + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryParameter(vx_parameter parameter, vx_enum attribute, void *ptr, vx_size size); + +/*============================================================================== + SCALAR + =============================================================================*/ + +/*! \brief Creates a reference to a scalar object. Also see \ref sub_node_parameters. + * \param [in] context The reference to the system context. + * \param [in] data_type The type of data to hold. Must be greater than + * \ref VX_TYPE_INVALID and less than or equal to \ref VX_TYPE_VENDOR_STRUCT_END. + * Or must be a \ref vx_enum returned from \ref vxRegisterUserStruct. + * \param [in] ptr The pointer to the initial value of the scalar. + * \ingroup group_scalar + * \returns A scalar reference \ref vx_scalar. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_scalar VX_API_CALL vxCreateScalar(vx_context context, vx_enum data_type, const void *ptr); + +/*! \brief Creates a reference to a scalar object. Also see \ref sub_node_parameters. + * \param [in] context The reference to the system context. + * \param [in] data_type The type of data to hold. 
Must be greater than + * \ref VX_TYPE_INVALID and less than or equal to \ref VX_TYPE_VENDOR_STRUCT_END. + * Or must be a \ref vx_enum returned from \ref vxRegisterUserStruct. + * \param [in] ptr The pointer to the initial value of the scalar. + * \param [in] size Size of data at ptr in bytes. + * \ingroup group_scalar + * \returns A scalar reference \ref vx_scalar. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_scalar VX_API_CALL vxCreateScalarWithSize(vx_context context, vx_enum data_type, const void *ptr, vx_size size); + +/*! \brief Creates an opaque reference to a scalar object with no direct user access. + * \param [in] graph The reference to the parent graph. + * \param [in] data_type The type of data to hold. Must be greater than + * \ref VX_TYPE_INVALID and less than or equal to \ref VX_TYPE_VENDOR_STRUCT_END. + * Or must be a \ref vx_enum returned from \ref vxRegisterUserStruct. + * \see \ref vxCreateScalar + * \ingroup group_scalar + * \returns A scalar reference \ref vx_scalar. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_scalar VX_API_CALL vxCreateVirtualScalar(vx_graph graph, vx_enum data_type); + +/*! \brief Releases a reference to a scalar object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] scalar The pointer to the scalar to release. + * \ingroup group_scalar + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE scalar is not a valid \ref vx_scalar reference. + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseScalar(vx_scalar *scalar); + +/*! \brief Queries attributes from a scalar. + * \param [in] scalar The scalar object. + * \param [in] attribute The enumeration to query. Use a \ref vx_scalar_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE scalar is not a valid \ref vx_scalar reference. + * \ingroup group_scalar + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryScalar(vx_scalar scalar, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Allows the application to copy from/into a scalar object. + * \param [in] scalar The reference to the scalar object that is the source or the + * destination of the copy. + * \param [in] user_ptr The address of the memory location where to store the requested data + * if the copy was requested in read mode, or from where to get the data to store into the + * scalar object if the copy was requested in write mode. In the user memory, the scalar is + * a variable of the type corresponding to \ref VX_SCALAR_TYPE. + * The accessible memory must be large enough to contain this variable. + * \param [in] usage This declares the effect of the copy with regard to the scalar object + * using the \ref vx_accessor_e enumeration. Only \ref VX_READ_ONLY and \ref VX_WRITE_ONLY + * are supported: + * \arg \ref VX_READ_ONLY means that data are copied from the scalar object into the user memory. + * \arg \ref VX_WRITE_ONLY means that data are copied into the scalar object from the user memory. 
+ * \param [in] user_mem_type A \ref vx_memory_type_e enumeration that specifies + * the memory type of the memory referenced by the user_addr. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE scalar is not a valid \ref vx_scalar reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_scalar + */ +VX_API_ENTRY vx_status VX_API_CALL vxCopyScalar(vx_scalar scalar, void *user_ptr, vx_enum usage, vx_enum user_mem_type); + +/*! \brief Allows the application to copy from/into a scalar object with size. + * \param [in] scalar The reference to the scalar object that is the source or the + * destination of the copy. + * \param [in] size The size in bytes of the container to which \a user_ptr points. + * \param [in] user_ptr The address of the memory location where to store the requested data + * if the copy was requested in read mode, or from where to get the data to store into the + * scalar object if the copy was requested in write mode. In the user memory, the scalar is + * a variable of the type corresponding to \ref VX_SCALAR_TYPE. + * The accessible memory must be large enough to contain this variable. + * \param [in] usage This declares the effect of the copy with regard to the scalar object + * using the \ref vx_accessor_e enumeration. Only \ref VX_READ_ONLY and \ref VX_WRITE_ONLY + * are supported: + * \arg \ref VX_READ_ONLY means that data are copied from the scalar object into the user memory. + * \arg \ref VX_WRITE_ONLY means that data are copied into the scalar object from the user memory. + * \param [in] user_mem_type A \ref vx_memory_type_e enumeration that specifies + * the memory type of the memory referenced by the user_addr. + * \return A \ref vx_status_e enumeration. + * \retval VX_ERROR_INVALID_REFERENCE The scalar reference is not actually a scalar reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_scalar + */ +VX_API_ENTRY vx_status VX_API_CALL vxCopyScalarWithSize(vx_scalar scalar, vx_size size, void *user_ptr, vx_enum usage, vx_enum user_mem_type); + +/*============================================================================== + REFERENCE + =============================================================================*/ + +/*! \brief Queries any reference type for some basic information like count or type. + * \param [in] ref The reference to query. + * \param [in] attribute The value for which to query. Use \ref vx_reference_attribute_e. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE ref is not a valid \ref vx_reference reference. + * \ingroup group_reference + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryReference(vx_reference ref, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Releases a reference. The reference may potentially refer to multiple OpenVX objects of different types. + * This function can be used instead of calling a specific release function for each individual object type + * (e.g. vxRelease). The object will not be destroyed until its total reference count is zero. + * \note After returning from this function the reference is zeroed. 
+ * \param [in] ref_ptr The pointer to the reference of the object to release. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE ref_ptr is not a valid \ref vx_reference reference. + * \ingroup group_reference + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseReference(vx_reference* ref_ptr); + +/*! + * \brief Increments the reference counter of an object + * This function is used to express the fact that the OpenVX object is referenced + * multiple times by an application. Each time this function is called for + * an object, the application will need to release the object one additional + * time before it can be destructed + * \param [in] ref The reference to retain. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE ref is not a valid \ref vx_reference reference. + * \ingroup group_reference + */ +VX_API_ENTRY vx_status VX_API_CALL vxRetainReference(vx_reference ref); + +/*! \brief Name a reference + * \ingroup group_reference + * + * This function is used to associate a name to a referenced object. This name + * can be used by the OpenVX implementation in log messages and any + * other reporting mechanisms. + * + * The OpenVX implementation will not check if the name is unique in + * the reference scope (context or graph). Several references can then + * have the same name. + * + * \param [in] ref The reference to the object to be named. + * \param [in] name Pointer to the '\0' terminated string that identifies + * the referenced object. + * The string is copied by the function so that it + * stays the property of the caller. + * NULL means that the reference is not named. + * The length of the string shall be lower than VX_MAX_REFERENCE_NAME bytes. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE ref is not a valid \ref vx_reference reference. + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetReferenceName(vx_reference ref, const vx_char *name); + +/*============================================================================== + DELAY + =============================================================================*/ + +/*! \brief Queries a \ref vx_delay object attribute. + * \param [in] delay The reference to a delay object. + * \param [in] attribute The attribute to query. Use a \ref vx_delay_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE delay is not a valid \ref vx_delay reference. + * \ingroup group_delay + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryDelay(vx_delay delay, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Releases a reference to a delay object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] delay The pointer to the delay object reference to release. + * \post After returning from this function the reference is zeroed. + * \ingroup group_delay + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE delay is not a valid \ref vx_delay reference. 
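+ *
+ * A minimal release sketch (the context and exemplar are assumed to have
+ * been created elsewhere; error checks omitted):
+ * \code
+ * vx_delay delay = vxCreateDelay(context, (vx_reference)exemplar, 2);
+ * // ... use the delay in a graph ...
+ * vxReleaseDelay(&delay); // on success the reference is zeroed
+ * \endcode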
+ */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseDelay(vx_delay *delay); + + +/*! \brief Creates a Delay object. + * \details This function creates a delay object with \p num_slots slots. Each slot + * contains a clone of the exemplar. The clones only inherit the metadata of the + * exemplar. The data content of the exemplar is ignored and the clones have their + * data undefined at delay creation time. + * The function does not alter the exemplar. Also, it doesn't retain or release the + * reference to the exemplar. + * \note For the definition of metadata attributes see \ref vxSetMetaFormatAttribute. + * \param [in] context The reference to the context. + * \param [in] exemplar The exemplar object. Supported exemplar object types are:
+ * \arg \ref VX_TYPE_ARRAY + * \arg \ref VX_TYPE_CONVOLUTION + * \arg \ref VX_TYPE_DISTRIBUTION + * \arg \ref VX_TYPE_IMAGE + * \arg \ref VX_TYPE_LUT + * \arg \ref VX_TYPE_MATRIX + * \arg \ref VX_TYPE_OBJECT_ARRAY + * \arg \ref VX_TYPE_PYRAMID + * \arg \ref VX_TYPE_REMAP + * \arg \ref VX_TYPE_SCALAR + * \arg \ref VX_TYPE_THRESHOLD + * \arg \ref VX_TYPE_TENSOR + * \param [in] num_slots The number of objects in the delay. This value must be greater than zero. + * \returns A delay reference \ref vx_delay. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_delay + */ +VX_API_ENTRY vx_delay VX_API_CALL vxCreateDelay(vx_context context, + vx_reference exemplar, + vx_size num_slots); + +/*! \brief Retrieves a reference to a delay slot object. + * \param [in] delay The reference to the delay object. + * \param [in] index The index of the delay slot from which to extract the object reference. + * \return \ref vx_reference. Any possible errors preventing a successful + * completion of the function should be checked using \ref vxGetStatus. + * \note The delay index is in the range \f$ [-count+1,0] \f$. 0 is always the + * \e current object. + * \ingroup group_delay + * \note A reference retrieved with this function must not be given to its associated + * release API (e.g. \ref vxReleaseImage) unless \ref vxRetainReference is used. + */ +VX_API_ENTRY vx_reference VX_API_CALL vxGetReferenceFromDelay(vx_delay delay, vx_int32 index); + +/*! \brief Shifts the internal delay ring by one. + * + * This function performs a shift of the internal delay ring by one. This means that, + * the data originally at index 0 move to index -1 and so forth until index + * \f$ -count+1 \f$. The data originally at index \f$ -count+1 \f$ move to index 0. + * Here \f$ count \f$ is the number of slots in delay ring. + * When a delay is aged, any graph making use of this delay (delay object itself or data + * objects in delay slots) gets its data automatically updated accordingly. + * \param [in] delay + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Delay was aged; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE delay is not a valid \ref vx_delay reference. + * \ingroup group_delay + */ +VX_API_ENTRY vx_status VX_API_CALL vxAgeDelay(vx_delay delay); + +/*! \brief Register a delay for auto-aging. + * + * This function registers a delay object to be auto-aged by the graph. + * This delay object will be automatically aged after each successful completion of + * this graph. Aging of a delay object cannot be called during graph execution. + * A graph abandoned due to a node callback will trigger an auto-aging. + * + * If a delay is registered for auto-aging multiple times in a same graph, + * the delay will be only aged a single time at each graph completion. + * If a delay is registered for auto-aging in multiple graphs, this delay will + * aged automatically after each successful completion of any of these graphs. + * + * \param [in] graph The graph to which the delay is registered for auto-aging. + * \param [in] delay The delay to automatically age. + * + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE graph is not a valid \ref vx_graph reference, or + * delay is not a valid \ref vx_delay reference. 
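+ *
+ * A minimal registration sketch (graph and delay are assumed to have been
+ * created and verified elsewhere; error checks omitted):
+ * \code
+ * vxRegisterAutoAging(graph, delay); // age the delay after every run
+ * vxProcessGraph(graph);             // delay slots rotate automatically
+ * \endcode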
+ * \ingroup group_graph + */ +VX_API_ENTRY vx_status VX_API_CALL vxRegisterAutoAging(vx_graph graph, vx_delay delay); + +/*============================================================================== + LOGGING + =============================================================================*/ + +/*! \brief Adds a line to the log. + * \param [in] ref The reference to add the log entry against. Some valid value must be provided. + * \param [in] status The status code. \ref VX_SUCCESS status entries are ignored and not added. + * \param [in] message The human readable message to add to the log. + * \param [in] ... a list of variable arguments to the message. + * \note Messages may not exceed \ref VX_MAX_LOG_MESSAGE_LEN bytes and will be truncated in the log if they exceed this limit. + * \ingroup group_log + */ +VX_API_ENTRY void VX_API_CALL vxAddLogEntry(vx_reference ref, vx_status status, const char *message, ...); + +/*! \brief Registers a callback facility to the OpenVX implementation to receive error logs. + * \param [in] context The overall context to OpenVX. + * \param [in] callback The callback function. If NULL, the previous callback is removed. + * \param [in] reentrant If reentrancy flag is \ref vx_true_e, then the callback may be entered from multiple + * simultaneous tasks or threads (if the host OS supports this). + * \ingroup group_log + */ +VX_API_ENTRY void VX_API_CALL vxRegisterLogCallback(vx_context context, vx_log_callback_f callback, vx_bool reentrant); + +/*============================================================================== + LUT + =============================================================================*/ + +/*! \brief Creates LUT object of a given type. The value of \ref VX_LUT_OFFSET is equal to 0 + * for data_type = \ref VX_TYPE_UINT8, and (vx_uint32)(count/2) for \ref VX_TYPE_INT16. + * \param [in] context The reference to the context. + * \param [in] data_type The type of data stored in the LUT. + * \param [in] count The number of entries desired. + * \note data_type can only be \ref VX_TYPE_UINT8 or \ref VX_TYPE_INT16. If data_type + * is \ref VX_TYPE_UINT8, count should be not greater than 256. If data_type is \ref VX_TYPE_INT16, + * count should not be greater than 65536. + * \returns An LUT reference \ref vx_lut. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus. + * \ingroup group_lut + */ +VX_API_ENTRY vx_lut VX_API_CALL vxCreateLUT(vx_context context, vx_enum data_type, vx_size count); + +/*! \brief Creates an opaque reference to a LUT object with no direct user access. + * \param [in] graph The reference to the parent graph. + * \param [in] data_type The type of data stored in the LUT. + * \param [in] count The number of entries desired. + * \see \ref vxCreateLUT + * \note data_type can only be \ref VX_TYPE_UINT8 or \ref VX_TYPE_INT16. If data_type + * is \ref VX_TYPE_UINT8, count should be not greater than 256. If data_type is \ref VX_TYPE_INT16, + * count should not be greater than 65536. + * \returns An LUT reference \ref vx_lut. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus. + * \ingroup group_lut + */ +VX_API_ENTRY vx_lut VX_API_CALL vxCreateVirtualLUT(vx_graph graph, vx_enum data_type, vx_size count); + +/*! \brief Releases a reference to a LUT object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] lut The pointer to the LUT to release. 
+ * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE lut is not a valid \ref vx_lut reference. + * \ingroup group_lut + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseLUT(vx_lut *lut); + +/*! \brief Queries attributes from a LUT. + * \param [in] lut The LUT to query. + * \param [in] attribute The attribute to query. Use a \ref vx_lut_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE lut is not a valid \ref vx_lut reference. + * \ingroup group_lut + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryLUT(vx_lut lut, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Allows the application to copy from/into a LUT object. + * \param [in] lut The reference to the LUT object that is the source or the + * destination of the copy. + * \param [in] user_ptr The address of the memory location where to store the requested data + * if the copy was requested in read mode, or from where to get the data to store into the LUT + * object if the copy was requested in write mode. In the user memory, the LUT is + * represented as a array with elements of the type corresponding to + * \ref VX_LUT_TYPE, and with a number of elements equal to the value + * returned via \ref VX_LUT_COUNT. The accessible memory must be large enough + * to contain this array: + * accessible memory in bytes >= sizeof(data_element) * count. + * \param [in] usage This declares the effect of the copy with regard to the LUT object + * using the \ref vx_accessor_e enumeration. Only \ref VX_READ_ONLY and \ref VX_WRITE_ONLY + * are supported: + * \arg \ref VX_READ_ONLY means that data are copied from the LUT object into the user memory. + * \arg \ref VX_WRITE_ONLY means that data are copied into the LUT object from the user memory. + * \param [in] user_mem_type A \ref vx_memory_type_e enumeration that specifies + * the memory type of the memory referenced by the user_addr. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE lut is not a valid \ref vx_lut reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_lut + */ +VX_API_ENTRY vx_status VX_API_CALL vxCopyLUT(vx_lut lut, void *user_ptr, vx_enum usage, vx_enum user_mem_type); + +/*! \brief Allows the application to get direct access to LUT object. + * \param [in] lut The reference to the LUT object to map. + * \param [out] map_id The address of a \ref vx_map_id variable where the function + * returns a map identifier. + * \arg (*map_id) must eventually be provided as the map_id parameter of a call to + * \ref vxUnmapLUT. + * \param [out] ptr The address of a pointer that the function sets to the + * address where the requested data can be accessed. In the mapped memory area, + * the LUT data are structured as an array with elements of the type corresponding + * to \ref VX_LUT_TYPE, with a number of elements equal to + * the value returned via \ref VX_LUT_COUNT. Accessing the + * memory out of the bound of this array is forbidden and has an undefined behavior. 
+ * The returned (*ptr) address is only valid between the call to the function and + * the corresponding call to \ref vxUnmapLUT. + * \param [in] usage This declares the access mode for the LUT, using + * the \ref vx_accessor_e enumeration. + * \arg \ref VX_READ_ONLY: after the function call, the content of the memory location + * pointed by (*ptr) contains the LUT data. Writing into this memory location + * is forbidden and its behavior is undefined. + * \arg \ref VX_READ_AND_WRITE: after the function call, the content of the memory + * location pointed by (*ptr) contains the LUT data; writing into this memory + * is allowed only for the location of entries and will result in a modification + * of the affected entries in the LUT object once the LUT is unmapped. + * \arg \ref VX_WRITE_ONLY: after the function call, the memory location pointed by(*ptr) + * contains undefined data; writing each entry of LUT is required prior to + * unmapping. Entries not written by the application before unmap will become + * undefined after unmap, even if they were well defined before map. + * \param [in] mem_type A \ref vx_memory_type_e enumeration that + * specifies the type of the memory where the LUT is requested to be mapped. + * \param [in] flags An integer that allows passing options to the map operation. + * Use 0 for this option. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE lut is not a valid \ref vx_lut reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_lut + * \post \ref vxUnmapLUT with same (*map_id) value. + */ +VX_API_ENTRY vx_status VX_API_CALL vxMapLUT(vx_lut lut, vx_map_id *map_id, void **ptr, vx_enum usage, vx_enum mem_type, vx_bitfield flags); + +/*! \brief Unmap and commit potential changes to LUT object that was previously mapped. + * Unmapping a LUT invalidates the memory location from which the LUT data could + * be accessed by the application. Accessing this memory location after the unmap function + * completes has an undefined behavior. + * \param [in] lut The reference to the LUT object to unmap. + * \param [out] map_id The unique map identifier that was returned when calling + * \ref vxMapLUT . + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE lut is not a valid \ref vx_lut reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_lut + * \pre \ref vxMapLUT returning the same map_id value + */ +VX_API_ENTRY vx_status VX_API_CALL vxUnmapLUT(vx_lut lut, vx_map_id map_id); + +/*============================================================================== + DISTRIBUTION + =============================================================================*/ + +/*! \brief Creates a reference to a 1D Distribution of a consecutive interval [offset, offset + range - 1] + * defined by a start offset and valid range, divided equally into numBins parts. + * \param [in] context The reference to the overall context. + * \param [in] numBins The number of bins in the distribution. + * \param [in] offset The start offset into the range value that marks the begining of the 1D Distribution. + * \param [in] range The total number of the consecutive values of the distribution interval. + * \returns A distribution reference \ref vx_distribution. 
Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_distribution + */ +VX_API_ENTRY vx_distribution VX_API_CALL vxCreateDistribution(vx_context context, vx_size numBins, vx_int32 offset, vx_uint32 range); + +/*! \brief Creates an opaque reference to a 1D Distribution object without direct user access. + * \param [in] graph The reference to the parent graph. + * \param [in] numBins The number of bins in the distribution. + * \param [in] offset The start offset into the range value that marks the begining of the 1D Distribution. + * \param [in] range The total number of the consecutive values of the distribution interval. + * \see \ref vxCreateDistribution + * \returns A distribution reference \ref vx_distribution. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_distribution + */ +VX_API_ENTRY vx_distribution VX_API_CALL vxCreateVirtualDistribution(vx_graph graph, vx_size numBins, vx_int32 offset, vx_uint32 range); + +/*! \brief Releases a reference to a distribution object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] distribution The reference to the distribution to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE distribution is not a valid \ref vx_distribution reference. + * \ingroup group_distribution + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseDistribution(vx_distribution *distribution); + +/*! \brief Queries a Distribution object. + * \param [in] distribution The reference to the distribution to query. + * \param [in] attribute The attribute to query. Use a \ref vx_distribution_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE distribution is not a valid \ref vx_distribution reference. + * \ingroup group_distribution + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryDistribution(vx_distribution distribution, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Allows the application to copy from/into a distribution object. + * \param [in] distribution The reference to the distribution object that is the source or the + * destination of the copy. + * \param [in] user_ptr The address of the memory location where to store the requested data + * if the copy was requested in read mode, or from where to get the data to store into the distribution + * object if the copy was requested in write mode. In the user memory, the distribution is + * represented as a \ref vx_uint32 array with a number of elements equal to the value returned via + * \ref VX_DISTRIBUTION_BINS. The accessible memory must be large enough + * to contain this vx_uint32 array: + * accessible memory in bytes >= sizeof(vx_uint32) * num_bins. + * \param [in] usage This declares the effect of the copy with regard to the distribution object + * using the \ref vx_accessor_e enumeration. Only \ref VX_READ_ONLY and \ref VX_WRITE_ONLY + * are supported: + * \arg \ref VX_READ_ONLY means that data are copied from the distribution object into the user memory. 
+ * \arg \ref VX_WRITE_ONLY means that data are copied into the distribution object from the user memory. + * \param [in] user_mem_type A \ref vx_memory_type_e enumeration that specifies + * the memory type of the memory referenced by the user_addr. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE distribution is not a valid \ref vx_distribution reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_distribution + */ +VX_API_ENTRY vx_status VX_API_CALL vxCopyDistribution(vx_distribution distribution, void *user_ptr, vx_enum usage, vx_enum user_mem_type); + +/*! \brief Allows the application to get direct access to distribution object. + * \param [in] distribution The reference to the distribution object to map. + * \param [out] map_id The address of a \ref vx_map_id variable where the function + * returns a map identifier. + * \arg (*map_id) must eventually be provided as the map_id parameter of a call to + * \ref vxUnmapDistribution. + * \param [out] ptr The address of a pointer that the function sets to the + * address where the requested data can be accessed. In the mapped memory area, + * data are structured as a vx_uint32 array with a number of elements equal to + * the value returned via \ref VX_DISTRIBUTION_BINS. Each + * element of this array corresponds to a bin of the distribution, with a range-major + * ordering. Accessing the memory out of the bound of this array + * is forbidden and has an undefined behavior. The returned (*ptr) address + * is only valid between the call to the function and the corresponding call to + * \ref vxUnmapDistribution. + * \param [in] usage This declares the access mode for the distribution, using + * the \ref vx_accessor_e enumeration. + * \arg \ref VX_READ_ONLY: after the function call, the content of the memory location + * pointed by (*ptr) contains the distribution data. Writing into this memory location + * is forbidden and its behavior is undefined. + * \arg \ref VX_READ_AND_WRITE: after the function call, the content of the memory + * location pointed by (*ptr) contains the distribution data; writing into this memory + * is allowed only for the location of bins and will result in a modification of the + * affected bins in the distribution object once the distribution is unmapped. + * \arg \ref VX_WRITE_ONLY: after the function call, the memory location pointed by (*ptr) + * contains undefined data; writing each bin of distribution is required prior to + * unmapping. Bins not written by the application before unmap will become + * undefined after unmap, even if they were well defined before map. + * \param [in] mem_type A \ref vx_memory_type_e enumeration that + * specifies the type of the memory where the distribution is requested to be mapped. + * \param [in] flags An integer that allows passing options to the map operation. + * Use 0 for this option. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE distribution is not a valid \ref vx_distribution reference. + * reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_distribution + * \post \ref vxUnmapDistribution with same (*map_id) value. 
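+ *
+ * A minimal read-only mapping sketch (the distribution is assumed to have
+ * been created elsewhere; error checks omitted):
+ * \code
+ * vx_map_id map_id;
+ * vx_uint32 *bins = NULL;
+ * vxMapDistribution(distribution, &map_id, (void **)&bins,
+ *                   VX_READ_ONLY, VX_MEMORY_TYPE_HOST, 0);
+ * // bins[i] holds the count of bin i; the number of bins is the value
+ * // of the VX_DISTRIBUTION_BINS attribute
+ * vxUnmapDistribution(distribution, map_id);
+ * \endcode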
+ */ +VX_API_ENTRY vx_status VX_API_CALL vxMapDistribution(vx_distribution distribution, vx_map_id *map_id, void **ptr, vx_enum usage, vx_enum mem_type, vx_bitfield flags); + +/*! \brief Unmap and commit potential changes to distribution object that was previously mapped. + * Unmapping a distribution invalidates the memory location from which the distribution data + * could be accessed by the application. Accessing this memory location after the unmap + * function completes has an undefined behavior. + * \param [in] distribution The reference to the distribution object to unmap. + * \param [out] map_id The unique map identifier that was returned when calling + * \ref vxMapDistribution . + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE distribution is not a valid \ref vx_distribution reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_distribution + * \pre \ref vxMapDistribution returning the same map_id value + */ +VX_API_ENTRY vx_status VX_API_CALL vxUnmapDistribution(vx_distribution distribution, vx_map_id map_id); + + +/*============================================================================== + THRESHOLD + =============================================================================*/ + +/*! \brief Creates a threshold object and returns a reference to it. + * + * The threshold object defines the parameters of a thresholding operation + * to an input image, that generates an output image that can have a different + * format. The thresholding 'false' or 'true' output values are specified per + * pixel channels of the output format and can be modified with + * \ref vxCopyThresholdOutput. The default 'false' output value of + * pixels channels should be 0, and the default 'true' value should be non-zero. + * For standard image formats, default output pixel values are defined as + * following: + * \arg \ref VX_DF_IMAGE_RGB : false={0, 0, 0}, true={255,255,255} + * \arg \ref VX_DF_IMAGE_RGBX : false={0, 0, 0, 0}, true={255,255,255,255} + * \arg \ref VX_DF_IMAGE_NV12 : false={0, 0, 0}, true={255,255,255} + * \arg \ref VX_DF_IMAGE_NV21 : false={0, 0, 0}, true={255,255,255} + * \arg \ref VX_DF_IMAGE_UYVY : false={0, 0, 0}, true={255,255,255} + * \arg \ref VX_DF_IMAGE_YUYV : false={0, 0, 0}, true={255,255,255} + * \arg \ref VX_DF_IMAGE_IYUV : false={0, 0, 0}, true={255,255,255} + * \arg \ref VX_DF_IMAGE_YUV4 : false={0, 0, 0}, true={255,255,255} + * \arg \ref VX_DF_IMAGE_U8 : false=0, true=0xFF + * \arg \ref VX_DF_IMAGE_S16 : false=0, true=-1 + * \arg \ref VX_DF_IMAGE_U16 : false=0, true=0xFFFF + * \arg \ref VX_DF_IMAGE_S32 : false=0, true=-1 + * \arg \ref VX_DF_IMAGE_U32 : false=0, true=0xFFFFFFFF + * \param [in] context The reference to the context in which the object is + * created. + * \param [in] thresh_type The type of thresholding operation. + * \param [in] input_format The format of images that will be used as input of + * the thresholding operation. + * \param [in] output_format The format of images that will be generated by the + * thresholding operation. + * \returns A threshold reference \ref vx_threshold. Any possible + * errors preventing a successful creation should be checked using + * \ref vxGetStatus. + * \ingroup group_threshold + */ +VX_API_ENTRY vx_threshold VX_API_CALL vxCreateThresholdForImage(vx_context context, + vx_enum thresh_type, + vx_df_image input_format, + vx_df_image output_format); + +/*! 
\brief Creates an opaque reference to a threshold object without direct user access. + * + * \param [in] graph The reference to the parent graph. + * \param [in] thresh_type The type of thresholding operation. + * \param [in] input_format The format of images that will be used as input of + * the thresholding operation. + * \param [in] output_format The format of images that will be generated by the + * thresholding operation. + * \see \ref vxCreateThresholdForImage + * \returns A threshold reference \ref vx_threshold. Any possible + * errors preventing a successful creation should be checked using + * \ref vxGetStatus. + * \ingroup group_threshold + */ +VX_API_ENTRY vx_threshold VX_API_CALL vxCreateVirtualThresholdForImage(vx_graph graph, + vx_enum thresh_type, + vx_df_image input_format, + vx_df_image output_format); + +/*! \brief Allows the application to copy the thresholding value from/into a + * threshold object with type \ref VX_THRESHOLD_TYPE_BINARY. + * \param [in] thresh The reference to the threshold object that is the source + * or the destination of the copy. + * \param [in,out] value_ptr The address of the memory location where to store + * the thresholding value if the copy was requested in read mode, or from where + * to get the thresholding value to store into the threshold object if the copy + * was requested in write mode. + * \param [in] usage This declares the effect of the copy with regard to the + * threshold object using the \ref vx_accessor_e enumeration. Only + * \ref VX_READ_ONLY and \ref VX_WRITE_ONLY are supported: + * \arg \ref VX_READ_ONLY means that the thresholding value is copied + * from the threshold object into the user memory. After the copy, only the + * field of the (*value_ptr) union that corresponds to the input image format + * of the threshold object is meaningful. + * \arg \ref VX_WRITE_ONLY means the field of the (*value_ptr) union + * corresponding to the input format of the threshold object is copied into + * the threshold object. + * \param [in] user_mem_type A \ref vx_memory_type_e enumeration that + * specifies the type of the memory referenced by \p value_ptr. + * \return A \ref vx_status_e enumeration. + * \retval VX_ERROR_INVALID_REFERENCE The threshold reference is not actually a + * threshold reference. + * \retval VX_ERROR_NOT_COMPATIBLE The threshold object doesn't have type + * \ref VX_THRESHOLD_TYPE_BINARY + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_threshold + */ +VX_API_ENTRY vx_status VX_API_CALL vxCopyThresholdValue(vx_threshold thresh, + vx_pixel_value_t * value_ptr, + vx_enum usage, + vx_enum user_mem_type + ); + +/*! \brief Allows the application to copy thresholding values from/into a + * threshold object with type \ref VX_THRESHOLD_TYPE_RANGE. + * \param [in] thresh The reference to the threshold object that is the source + * or the destination of the copy. + * \param [in,out] lower_value_ptr The address of the memory location where to + * store the lower thresholding value if the copy was requested in read mode, + * or from where to get the lower thresholding value to store into the threshold + * object if the copy was requested in write mode. + * \param [in,out] upper_value_ptr The address of the memory location where to + * store the upper thresholding value if the copy was requested in read mode, or + * from where to get the upper thresholding value to store into the threshold + * object if the copy was requested in write mode. 
+ * \param [in] usage This declares the effect of the copy with regard to the + * threshold object using the \ref vx_accessor_e enumeration. Only + * \ref VX_READ_ONLY and \ref VX_WRITE_ONLY are supported: + * \arg \ref VX_READ_ONLY means that thresholding values are copied + * from the threshold object into the user memory. After the copy, only the + * field of (*lower_value_ptr) and (*upper_value_ptr) unions that corresponds + * to the input image format of the threshold object is meaningful. + * \arg \ref VX_WRITE_ONLY means the field of the (*lower_value_ptr) + * and (*upper_value_ptr) unions corresponding to the input format of the + * threshold object is copied into the threshold object. + * \param [in] user_mem_type A \ref vx_memory_type_e enumeration that + * specifies the type of the memory referenced by \p lower_value_ptr and + * \p upper_value_ptr. + * \return A \ref vx_status_e enumeration. + * \retval VX_ERROR_INVALID_REFERENCE The threshold reference is not actually + * a threshold reference. + * \retval VX_ERROR_NOT_COMPATIBLE The threshold object doesn't have type + * \ref VX_THRESHOLD_TYPE_RANGE + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_threshold + */ +VX_API_ENTRY vx_status VX_API_CALL vxCopyThresholdRange(vx_threshold thresh, + vx_pixel_value_t * lower_value_ptr, + vx_pixel_value_t * upper_value_ptr, + vx_enum usage, + vx_enum user_mem_type); + +/*! \brief Allows the application to copy the true and false output values + * from/into a threshold object. + * \param [in] thresh The reference to the threshold object that is the source + * or the destination of the copy. + * \param [in,out] true_value_ptr The address of the memory location where to + * store the true output value if the copy was requested in read mode, + * or from where to get the true output value to store into the threshold + * object if the copy was requested in write mode. + * \param [in,out] false_value_ptr The address of the memory location where to + * store the false output value if the copy was requested in read mode, or + * from where to get the false output value to store into the threshold + * object if the copy was requested in write mode. + * \param [in] usage This declares the effect of the copy with regard to the + * threshold object using the \ref vx_accessor_e enumeration. Only + * \ref VX_READ_ONLY and \ref VX_WRITE_ONLY are supported: + * \arg \ref VX_READ_ONLY means that true and false output values + * are copied from the threshold object into the user memory. After the copy, + * only the field of (*true_value_ptr) and (*false_value_ptr) unions that + * corresponds to the output image format of the threshold object is meaningful. + * \arg \ref VX_WRITE_ONLY means the field of the (*true_value_ptr) + * and (*false_value_ptr) unions corresponding to the output format of the + * threshold object is copied into the threshold object. + * \param [in] user_mem_type A \ref vx_memory_type_e enumeration that + * specifies the type of the memory referenced by \p true_value_ptr and + * \p false_value_ptr. + * \return A \ref vx_status_e enumeration. + * \retval VX_ERROR_INVALID_REFERENCE The threshold reference is not actually + * a threshold reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. 
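+ *
+ * A minimal write sketch (thresh is assumed to have been created with a
+ * \ref VX_DF_IMAGE_U8 output format; error checks omitted):
+ * \code
+ * vx_pixel_value_t true_value;
+ * vx_pixel_value_t false_value;
+ * true_value.U8  = 255;
+ * false_value.U8 = 0;
+ * vxCopyThresholdOutput(thresh, &true_value, &false_value,
+ *                       VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST);
+ * \endcode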
+ * \ingroup group_threshold + */ +VX_API_ENTRY vx_status VX_API_CALL vxCopyThresholdOutput(vx_threshold thresh, + vx_pixel_value_t * true_value_ptr, + vx_pixel_value_t * false_value_ptr, + vx_enum usage, + vx_enum user_mem_type); + +/*! \brief Releases a reference to a threshold object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] thresh The pointer to the threshold to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE thresh is not a valid \ref vx_threshold reference. + * \ingroup group_threshold + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseThreshold(vx_threshold *thresh); + +/*! \brief Sets attributes on the threshold object. + * \param [in] thresh The threshold object to set. + * \param [in] attribute The attribute to modify. Use a \ref vx_threshold_attribute_e enumeration. + * \param [in] ptr The pointer to the value to which to set the attribute. + * \param [in] size The size of the data pointed to by \a ptr. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE thresh is not a valid \ref vx_threshold reference. + * \ingroup group_threshold + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetThresholdAttribute(vx_threshold thresh, vx_enum attribute, const void *ptr, vx_size size); + +/*! \brief Queries an attribute on the threshold object. + * \param [in] thresh The threshold object to set. + * \param [in] attribute The attribute to query. Use a \ref vx_threshold_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE thresh is not a valid \ref vx_threshold reference. + * \ingroup group_threshold + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryThreshold(vx_threshold thresh, vx_enum attribute, void *ptr, vx_size size); + +/*============================================================================== + MATRIX + =============================================================================*/ + +/*! \brief Creates a reference to a matrix object. + * \param [in] c The reference to the overall context. + * \param [in] data_type The unit format of the matrix. \ref VX_TYPE_UINT8 or \ref VX_TYPE_INT32 or \ref VX_TYPE_FLOAT32. + * \param [in] columns The first dimensionality. + * \param [in] rows The second dimensionality. + * \returns An matrix reference \ref vx_matrix. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_matrix + */ +VX_API_ENTRY vx_matrix VX_API_CALL vxCreateMatrix(vx_context c, vx_enum data_type, vx_size columns, vx_size rows); + +/*! \brief Creates an opaque reference to a matrix object without direct user access. + * \param [in] graph The reference to the parent graph. + * \param [in] data_type The unit format of the matrix. \ref VX_TYPE_UINT8 or \ref VX_TYPE_INT32 or \ref VX_TYPE_FLOAT32. + * \param [in] columns The first dimensionality. + * \param [in] rows The second dimensionality. + * \see \ref vxCreateMatrix + * \returns An matrix reference \ref vx_matrix. 
Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_matrix + */ +VX_API_ENTRY vx_matrix VX_API_CALL vxCreateVirtualMatrix(vx_graph graph, vx_enum data_type, vx_size columns, vx_size rows); + +/*! \brief Releases a reference to a matrix object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] mat The matrix reference to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE mat is not a valid \ref vx_matrix reference. + * \ingroup group_matrix + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseMatrix(vx_matrix *mat); + +/*! \brief Queries an attribute on the matrix object. + * \param [in] mat The matrix object to set. + * \param [in] attribute The attribute to query. Use a \ref vx_matrix_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE mat is not a valid \ref vx_matrix reference. + * \ingroup group_matrix + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryMatrix(vx_matrix mat, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Allows the application to copy from/into a matrix object. + * \param [in] matrix The reference to the matrix object that is the source or the + * destination of the copy. + * \param [in] user_ptr The address of the memory location where to store the requested data + * if the copy was requested in read mode, or from where to get the data to store into the matrix + * object if the copy was requested in write mode. In the user memory, the matrix is + * structured as a row-major 2D array with elements of the type corresponding to + * \ref VX_MATRIX_TYPE, with a number of rows corresponding to + * \ref VX_MATRIX_ROWS and a number of columns corresponding to + * \ref VX_MATRIX_COLUMNS. The accessible memory must be large + * enough to contain this 2D array: + * accessible memory in bytes >= sizeof(data_element) * rows * columns. + * \param [in] usage This declares the effect of the copy with regard to the matrix object + * using the \ref vx_accessor_e enumeration. Only \ref VX_READ_ONLY and \ref VX_WRITE_ONLY + * are supported: + * \arg \ref VX_READ_ONLY means that data are copied from the matrix object into the user memory. + * \arg \ref VX_WRITE_ONLY means that data are copied into the matrix object from the user memory. + * \param [in] user_mem_type A \ref vx_memory_type_e enumeration that specifies + * the memory type of the memory referenced by the user_addr. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE matrix is not a valid \ref vx_matrix reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_matrix + */ +VX_API_ENTRY vx_status VX_API_CALL vxCopyMatrix(vx_matrix matrix, void *user_ptr, vx_enum usage, vx_enum user_mem_type); + +/*! \brief Creates a reference to a matrix object from a boolean pattern. + * \see \ref vxCreateMatrixFromPatternAndOrigin for a description of the matrix patterns. 
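+ *
+ * A minimal creation sketch (a 3x3 cross-shaped pattern; the context is
+ * assumed to have been created elsewhere, error checks omitted):
+ * \code
+ * vx_matrix cross3x3 = vxCreateMatrixFromPattern(context, VX_PATTERN_CROSS, 3, 3);
+ * \endcode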
+ * \param [in] context The reference to the overall context. + * \param [in] pattern The pattern of the matrix. See \ref VX_MATRIX_PATTERN. + * \param [in] columns The first dimensionality. + * \param [in] rows The second dimensionality. + * \returns A matrix reference \ref vx_matrix of type \ref VX_TYPE_UINT8. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_matrix + */ +VX_API_ENTRY vx_matrix VX_API_CALL vxCreateMatrixFromPattern(vx_context context, vx_enum pattern, vx_size columns, vx_size rows); + +/*! \brief Creates a reference to a matrix object from a boolean pattern, with a user-specified origin. + * + * The matrix created by this function is of type \ref VX_TYPE_UINT8, with the value 0 representing False, + * and the value 255 representing True. It supports the patterns as described below: + * - VX_PATTERN_BOX is a matrix with dimensions equal to the given number of rows and columns, and all cells equal to 255. + * Dimensions of 3x3 and 5x5 must be supported. + * - VX_PATTERN_CROSS is a matrix with dimensions equal to the given number of rows and columns, which both must be odd numbers. + * All cells in the center row and center column are equal to 255, and the rest are equal to zero. + * Dimensions of 3x3 and 5x5 must be supported. + * - VX_PATTERN_DISK is a matrix with dimensions equal to the given number of rows (R) and columns (C), + * where R and C are odd and cell (c, r) is 255 if: \n + * (r-R/2 + 0.5)^2 / (R/2)^2 + (c-C/2 + 0.5)^2/(C/2)^2 is less than or equal to 1,\n and 0 otherwise. + * + * A matrix created from pattern is read-only. The behavior when attempting to modify such a matrix is undefined. + * + * \param [in] context The reference to the overall context. + * \param [in] pattern The pattern of the matrix. See \ref VX_MATRIX_PATTERN. + * \param [in] columns The first dimensionality. + * \param [in] rows The second dimensionality. + * \param [in] origin_col The origin (first dimensionality). + * \param [in] origin_row The origin (second dimensionality). + * \returns A matrix reference \ref vx_matrix of type \ref VX_TYPE_UINT8. Any possible errors + * preventing a successful creation should be checked using \ref vxGetStatus. + * \ingroup group_matrix + */ +VX_API_ENTRY vx_matrix VX_API_CALL vxCreateMatrixFromPatternAndOrigin(vx_context context, vx_enum pattern, vx_size columns, vx_size rows, vx_size origin_col, vx_size origin_row); + + +/*============================================================================== + CONVOLUTION + =============================================================================*/ + +/*! \brief Creates a reference to a convolution matrix object. + * \param [in] context The reference to the overall context. + * \param [in] columns The columns dimension of the convolution. + * Must be odd and greater than or equal to 3 and less than the value returned + * from \ref VX_CONTEXT_CONVOLUTION_MAX_DIMENSION. + * \param [in] rows The rows dimension of the convolution. + * Must be odd and greater than or equal to 3 and less than the value returned + * from \ref VX_CONTEXT_CONVOLUTION_MAX_DIMENSION. + * \returns A convolution reference \ref vx_convolution. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_convolution + */ +VX_API_ENTRY vx_convolution VX_API_CALL vxCreateConvolution(vx_context context, vx_size columns, vx_size rows); + +/*! 
\brief Creates an opaque reference to a convolution matrix object without direct user access. + * \param [in] graph The reference to the parent graph. + * \param [in] columns The columns dimension of the convolution. + * Must be odd and greater than or equal to 3 and less than the value returned + * from \ref VX_CONTEXT_CONVOLUTION_MAX_DIMENSION. + * \param [in] rows The rows dimension of the convolution. + * Must be odd and greater than or equal to 3 and less than the value returned + * from \ref VX_CONTEXT_CONVOLUTION_MAX_DIMENSION. + * \see \ref vxCreateConvolution + * \returns A convolution reference \ref vx_convolution. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_convolution + */ +VX_API_ENTRY vx_convolution VX_API_CALL vxCreateVirtualConvolution(vx_graph graph, vx_size columns, vx_size rows); + +/*! \brief Releases the reference to a convolution matrix. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] conv The pointer to the convolution matrix to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE conv is not a valid \ref vx_convolution reference. + * \ingroup group_convolution + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseConvolution(vx_convolution *conv); + +/*! \brief Queries an attribute on the convolution matrix object. + * \param [in] conv The convolution matrix object to set. + * \param [in] attribute The attribute to query. Use a \ref vx_convolution_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE conv is not a valid \ref vx_convolution reference. + * \ingroup group_convolution + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryConvolution(vx_convolution conv, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Sets attributes on the convolution object. + * \param [in] conv The coordinates object to set. + * \param [in] attribute The attribute to modify. Use a \ref vx_convolution_attribute_e enumeration. + * \param [in] ptr The pointer to the value to which to set the attribute. + * \param [in] size The size in bytes of the data pointed to by \a ptr. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE conv is not a valid \ref vx_convolution reference. + * \ingroup group_convolution + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetConvolutionAttribute(vx_convolution conv, vx_enum attribute, const void *ptr, vx_size size); + +/*! \brief Allows the application to copy coefficients from/into a convolution object. + * \param [in] conv The reference to the convolution object that is the source or the destination of the copy. + * \param [in] user_ptr The address of the memory location where to store the requested + * coefficient data if the copy was requested in read mode, or from where to get the + * coefficient data to store into the convolution object if the copy was requested in + * write mode. 
In the user memory, the convolution coefficient data is structured as a + * row-major 2D array with elements of the type corresponding + * to \ref VX_TYPE_CONVOLUTION, with a number of rows corresponding to + * \ref VX_CONVOLUTION_ROWS and a number of columns corresponding to + * \ref VX_CONVOLUTION_COLUMNS. The accessible memory must be large + * enough to contain this 2D array: + * accessible memory in bytes >= sizeof(data_element) * rows * columns. + * \param [in] usage This declares the effect of the copy with regard to the convolution object + * using the \ref vx_accessor_e enumeration. Only \ref VX_READ_ONLY and \ref VX_WRITE_ONLY + * are supported: + * \arg \ref VX_READ_ONLY means that data are copied from the convolution object into the user memory. + * \arg \ref VX_WRITE_ONLY means that data are copied into the convolution object from the user memory. + * \param [in] user_mem_type A \ref vx_memory_type_e enumeration that specifies + * the memory type of the memory referenced by the user_addr. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE conv is not a valid \ref vx_convolution reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_convolution + */ +VX_API_ENTRY vx_status VX_API_CALL vxCopyConvolutionCoefficients(vx_convolution conv, void *user_ptr, vx_enum usage, vx_enum user_mem_type); + + +/*============================================================================== + PYRAMID + =============================================================================*/ + +/*! \brief Creates a reference to a pyramid object of the supplied number of levels. + * \param [in] context The reference to the overall context. + * \param [in] levels The number of levels desired. This is required to be a non-zero value. + * \param [in] scale Used to indicate the scale between pyramid levels. This is required to be a non-zero positive value. + * \ref VX_SCALE_PYRAMID_HALF and \ref VX_SCALE_PYRAMID_ORB must be supported. + * \param [in] width The width of the 0th level image in pixels. + * \param [in] height The height of the 0th level image in pixels. + * \param [in] format The format of all images in the pyramid. NV12, NV21, IYUV, UYVY and YUYV formats are not supported. + * \returns A pyramid reference \ref vx_pyramid containing the sub-images. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_pyramid + */ +VX_API_ENTRY vx_pyramid VX_API_CALL vxCreatePyramid(vx_context context, vx_size levels, vx_float32 scale, vx_uint32 width, vx_uint32 height, vx_df_image format); + +/*! \brief Creates a reference to a virtual pyramid object of the supplied number of levels. + * \details Virtual Pyramids can be used to connect Nodes together when the contents of the pyramids will + * not be accessed by the user of the API. + * All of the following constructions are valid: + * \code + * vx_context context = vxCreateContext(); + * vx_graph graph = vxCreateGraph(context); + * vx_pyramid virt[] = { + * vxCreateVirtualPyramid(graph, 4, VX_SCALE_PYRAMID_HALF, 0, 0, VX_DF_IMAGE_VIRT), // no dimension and format specified for level 0 + * vxCreateVirtualPyramid(graph, 4, VX_SCALE_PYRAMID_HALF, 640, 480, VX_DF_IMAGE_VIRT), // no format specified. 
+ * vxCreateVirtualPyramid(graph, 4, VX_SCALE_PYRAMID_HALF, 640, 480, VX_DF_IMAGE_U8), // no access + * }; + * \endcode + * \param [in] graph The reference to the parent graph. + * \param [in] levels The number of levels desired. This is required to be a non-zero value. + * \param [in] scale Used to indicate the scale between pyramid levels. This is required to be a non-zero positive value. + * \ref VX_SCALE_PYRAMID_HALF and \ref VX_SCALE_PYRAMID_ORB must be supported. + * \param [in] width The width of the 0th level image in pixels. This may be set to zero to indicate to the interface that the value is unspecified. + * \param [in] height The height of the 0th level image in pixels. This may be set to zero to indicate to the interface that the value is unspecified. + * \param [in] format The format of all images in the pyramid. This may be set to \ref VX_DF_IMAGE_VIRT to indicate that the format is unspecified. + * \returns A pyramid reference \ref vx_pyramid. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \note Images extracted with \ref vxGetPyramidLevel behave as Virtual Images and + * cause \ref vxMapImagePatch to return errors. + * \ingroup group_pyramid + */ +VX_API_ENTRY vx_pyramid VX_API_CALL vxCreateVirtualPyramid(vx_graph graph, vx_size levels, vx_float32 scale, vx_uint32 width, vx_uint32 height, vx_df_image format); + + +/*! \brief Releases a reference to a pyramid object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] pyr The pointer to the pyramid to release. + * \ingroup group_pyramid + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE pyr is not a valid \ref vx_pyramid reference. + * \post After returning from this function the reference is zeroed. + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleasePyramid(vx_pyramid *pyr); + +/*! \brief Queries an attribute from an image pyramid. + * \param [in] pyr The pyramid to query. + * \param [in] attribute The attribute for which to query. Use a \ref vx_pyramid_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE pyr is not a valid \ref vx_pyramid reference. + * \ingroup group_pyramid + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryPyramid(vx_pyramid pyr, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Retrieves a level of the pyramid as a \ref vx_image, which can be used + * elsewhere in OpenVX. A call to vxReleaseImage is necessary to release an image for each + * call of vxGetPyramidLevel. + * \param [in] pyr The pyramid object. + * \param [in] index The index of the level, such that index is less than levels. + * \return A \ref vx_image reference. Any possible errors preventing a successful + * function completion should be checked using \ref vxGetStatus. + * \ingroup group_pyramid + */ +VX_API_ENTRY vx_image VX_API_CALL vxGetPyramidLevel(vx_pyramid pyr, vx_uint32 index); + +/*============================================================================== + REMAP + =============================================================================*/ + +/*! \brief Creates a remap table object. 
+ * \param [in] context The reference to the overall context. + * \param [in] src_width Width of the source image in pixel. + * \param [in] src_height Height of the source image in pixels. + * \param [in] dst_width Width of the destination image in pixels. + * \param [in] dst_height Height of the destination image in pixels. + * \ingroup group_remap + * \returns A remap reference \ref vx_remap. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_remap VX_API_CALL vxCreateRemap(vx_context context, + vx_uint32 src_width, + vx_uint32 src_height, + vx_uint32 dst_width, + vx_uint32 dst_height); + +/*! \brief Creates an opaque reference to a remap table object without direct user access. + * \param [in] graph The reference to the parent graph. + * \param [in] src_width Width of the source image in pixel. + * \param [in] src_height Height of the source image in pixels. + * \param [in] dst_width Width of the destination image in pixels. + * \param [in] dst_height Height of the destination image in pixels. + * \see \ref vxCreateRemap + * \ingroup group_remap + * \returns A remap reference \ref vx_remap. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_remap VX_API_CALL vxCreateVirtualRemap(vx_graph graph, + vx_uint32 src_width, + vx_uint32 src_height, + vx_uint32 dst_width, + vx_uint32 dst_height); + +/*! \brief Releases a reference to a remap table object. The object may not be + * garbage collected until its total reference count is zero. + * \param [in] table The pointer to the remap table to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE table is not a valid \ref vx_remap reference. + * \ingroup group_remap + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseRemap(vx_remap *table); + +/*! \brief Allows the application to get direct access to a rectangular patch of a remap object. + * + * The patch is specified within the destination dimensions and its + * data provide the corresponding coordinate within the source dimensions. + * The patch is mapped as a 2D array of elements of the type associated + * with the \p coordinate_type parameter (i.e., \ref vx_coordinates2df_t + * for \ref VX_TYPE_COORDINATES2DF). + * The memory layout of the mapped 2D array follows a row-major order where rows are + * compact (without any gap between elements), and where the potential + * padding after each lines is determined by (* \p stride_y). + * + * \param [in] remap The reference to the remap object that contains the + * patch to map. + * + * \param [in] rect The coordinates of remap patch. The patch must be specified + * within the bounds of the remap destination dimensions + * (\ref VX_REMAP_DESTINATION_WIDTH x \ref VX_REMAP_DESTINATION_HEIGHT). + * (start_x, start_y) gives the coordinate of the topleft element inside the patch, + * while (end_x, end_y) gives the coordinate of the bottomright element out of the patch. + * + * \param [out] map_id The address of a \ref vx_map_id variable + * where the function returns a map identifier. + * \arg (*map_id) must eventually be provided as the map_id parameter of a call + * to \ref vxUnmapRemapPatch. 
+ * + * \param [out] stride_y The address of a vx_size variable where the function + * returns the difference between the address of the first element of two + * successive lines in the mapped remap patch. The stride value follows the + * following rule : + * (*stride_y) >= sizeof() * (rect->end_x - rect->start_x) + * + * \param [out] ptr The address of a pointer where the function returns where + * remap patch data can be accessed. (*ptr) is the address of the the top-left + * element of the remap patch. + * The returned (*ptr) address is only valid between the call to this function + * and the corresponding call to \ref vxUnmapRemapPatch. + * + * \param [in] coordinate_type This declares the type of the source coordinate + * data that the application wants to access in the remap patch. + * It must be \ref VX_TYPE_COORDINATES2DF. + * + * \param [in] usage This declares the access mode for the remap patch, using + * the \ref vx_accessor_e enumeration. + * \arg \ref VX_READ_ONLY: after the function call, the content of the + * memory location pointed by (*ptr) contains the remap patch data. Writing into + * this memory location is forbidden and its behavior is undefined. + * \arg \ref VX_READ_AND_WRITE: after the function call, the content of + * the memory location pointed by (*ptr) contains the remap patch data; writing + * into this memory is allowed for the location of elements only and will + * result in a modification of the written elements in the remap object once the + * patch is unmapped. Writing into a gap between element lines + * (when (*stride_y) > sizeof() * (rect->end_x - rect->start_x)) + * is forbidden and its behavior is undefined. + * \arg \ref VX_WRITE_ONLY: after the function call, the memory location + * pointed by (*ptr) contains undefined data; writing each element of the patch is + * required prior to unmapping. Elements not written by the application before + * unmap will become undefined after unmap, even if they were well defined before + * map. Like for \ref VX_READ_AND_WRITE, writing into a gap between + * element lines is forbidden and its behavior is undefined. + * + * \param [in] mem_type A \ref vx_memory_type_e enumeration that + * specifies the type of the memory where the remap patch is requested to be mapped. + * + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE remap is not a valid \ref vx_remap reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * + * \ingroup group_remap + * \post \ref vxUnmapRemapPatch with same (*map_id) value. + */ +VX_API_ENTRY vx_status VX_API_CALL vxMapRemapPatch(vx_remap remap, + const vx_rectangle_t *rect, + vx_map_id *map_id, + vx_size *stride_y, + void **ptr, + vx_enum coordinate_type, + vx_enum usage, + vx_enum mem_type); + +/*! \brief Unmap and commit potential changes to a remap object patch that was previously mapped. + * + * Unmapping a remap patch invalidates the memory location from which the patch could + * be accessed by the application. Accessing this memory location after the unmap function + * completes has an undefined behavior. + * \param [in] remap The reference to the remap object to unmap. + * \param [out] map_id The unique map identifier that was returned by \ref vxMapRemapPatch . + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. 
+ * \retval VX_ERROR_INVALID_REFERENCE remap is not a valid \ref vx_remap reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_remap + * \pre \ref vxMapRemapPatch with same map_id value +*/ +VX_API_ENTRY vx_status VX_API_CALL vxUnmapRemapPatch(vx_remap remap, vx_map_id map_id); + +/*! \brief Allows the application to copy a rectangular patch from/into a remap object. + * + * The patch is specified within the destination dimensions and its + * data provide the corresponding coordinate within the source dimensions. + * The patch in user memory is a 2D array of elements of the type associated with the + * \p coordinate_type parameter (i.e., \ref vx_coordinates2df_t for + * \ref VX_TYPE_COORDINATES2DF). + * The memory layout of this array follows a row-major order where rows are + * compact (without any gap between elements), and where the potential padding + * after each line is determined by the \p user_stride_y parameter. + + * \param [in] remap The reference to the remap object that is the source or the + * destination of the patch copy. + * + * \param [in] rect The coordinates of remap patch. The patch must be specified + * within the bounds of the remap destination dimensions + * (\ref VX_REMAP_DESTINATION_WIDTH x \ref VX_REMAP_DESTINATION_HEIGHT). + * (start_x, start_y) gives the coordinate of the topleft element inside the patch, + * while (end_x, end_y) gives the coordinate of the bottomright element out of the patch. + * + * \param [in] user_stride_y The difference between the address of the first element + * of two successive lines of the remap patch in user memory (pointed by + * \p user_ptr). The layout of the user memory must follow a row major order and user_stride_y + * must follow the following rule : + * user_stride_y >= sizeof() * (rect->end_x - rect->start_x). + * + * \param [in] user_ptr The address of the user memory location where to store the requested + * remap data if the copy was requested in read mode, or from where to get the remap data to + * store into the remap object if the copy was requested in write mode. \p user_ptr is the + * address of the the top-left element of the remap patch. + * The accessible user memory must be large enough to contain the specified patch with + * the specified layout: + * accessible memory in bytes >= (rect->end_y - rect->start_y) * user_stride_y. + * + * \param [in] user_coordinate_type This declares the type of the source coordinate remap + * data in the user memory. It must be \ref VX_TYPE_COORDINATES2DF. + * + * \param [in] usage This declares the effect of the copy with regard to the remap object + * using the \ref vx_accessor_e enumeration. Only VX_READ_ONLY and VX_WRITE_ONLY are + * supported: + * \arg \ref VX_READ_ONLY means that data is copied from the remap object into the user + * memory pointer by \p user_ptr. The potential padding after each line in user + * memory will stay unchanged. + * \arg \ref VX_WRITE_ONLY means that data is copied into the remap object from + * the user memory. + * + * \param [in] user_mem_type A \ref vx_memory_type_e enumeration that specifies + * the type of the memory pointer by \p user_ptr. + * + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE remap is not a valid \ref vx_remap reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. 
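+ *
+ * A read-back sketch (illustrative only; the remap object is assumed to have a
+ * 64x48 destination, and the buffer sizing simply follows the rules above):
+ * \code
+ * vx_rectangle_t rect = { 0, 0, 64, 48 };        // whole destination patch
+ * vx_coordinates2df_t coords[48][64];            // one source coordinate per destination pixel
+ * vx_status status = vxCopyRemapPatch(remap, &rect, sizeof(coords[0]), coords,
+ *                                     VX_TYPE_COORDINATES2DF, VX_READ_ONLY,
+ *                                     VX_MEMORY_TYPE_HOST);
+ * \endcode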
+ * + * \ingroup group_remap +*/ +VX_API_ENTRY vx_status VX_API_CALL vxCopyRemapPatch(vx_remap remap, + const vx_rectangle_t *rect, + vx_size user_stride_y, + void * user_ptr, + vx_enum user_coordinate_type, + vx_enum usage, + vx_enum user_mem_type); + +/*! \brief Queries attributes from a Remap table. + * \param [in] table The remap to query. + * \param [in] attribute The attribute to query. Use a \ref vx_remap_attribute_e enumeration. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE table is not a valid \ref vx_remap reference. + * \ingroup group_remap + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryRemap(vx_remap table, vx_enum attribute, void *ptr, vx_size size); + +/*============================================================================== + ARRAY + =============================================================================*/ + +/*! + * \brief Creates a reference to an Array object. + * + * User must specify the Array capacity (i.e., the maximal number of items that the array can hold). + * + * \param [in] context The reference to the overall Context. + * \param [in] item_type The type of data to hold. Must be greater than + * \ref VX_TYPE_INVALID and less than or equal to \ref VX_TYPE_VENDOR_STRUCT_END. + * Or must be a \ref vx_enum returned from \ref vxRegisterUserStruct. + * \param [in] capacity The maximal number of items that the array can hold. This value must be greater than zero. + * + * \returns An array reference \ref vx_array. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * + * \ingroup group_array + */ +VX_API_ENTRY vx_array VX_API_CALL vxCreateArray(vx_context context, vx_enum item_type, vx_size capacity); + +/*! + * \brief Creates an opaque reference to a virtual Array with no direct user access. + * + * Virtual Arrays are useful when item type or capacity are unknown ahead of time + * and the Array is used as internal graph edge. Virtual arrays are scoped within the parent graph only. + * + * All of the following constructions are allowed. + * \code + * vx_context context = vxCreateContext(); + * vx_graph graph = vxCreateGraph(context); + * vx_array virt[] = { + * vxCreateVirtualArray(graph, 0, 0), // totally unspecified + * vxCreateVirtualArray(graph, VX_TYPE_KEYPOINT, 0), // unspecified capacity + * vxCreateVirtualArray(graph, VX_TYPE_KEYPOINT, 1000), // no access + * }; + * \endcode + * + * \param [in] graph The reference to the parent graph. + * \param [in] item_type The type of data to hold. Must be greater than + * \ref VX_TYPE_INVALID and less than or equal to \ref VX_TYPE_VENDOR_STRUCT_END. + * Or must be a \ref vx_enum returned from \ref vxRegisterUserStruct. + * This may to set to zero to indicate an unspecified item type. + * \param [in] capacity The maximal number of items that the array can hold. + * This may be to set to zero to indicate an unspecified capacity. + * \see vxCreateArray for a type list. + * \returns A array reference \ref vx_array. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * + * \ingroup group_array + */ +VX_API_ENTRY vx_array VX_API_CALL vxCreateVirtualArray(vx_graph graph, vx_enum item_type, vx_size capacity); + +/*! + * \brief Releases a reference of an Array object. 
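+ *
+ * A create/populate/release sketch (illustrative only; the keypoint value is a
+ * placeholder and a previously created context is assumed):
+ * \code
+ * vx_array arr = vxCreateArray(context, VX_TYPE_KEYPOINT, 100);
+ * vx_keypoint_t kp = { 0 };
+ * vxAddArrayItems(arr, 1, &kp, sizeof(vx_keypoint_t));
+ * vxReleaseArray(&arr);                          // arr is zeroed on return
+ * \endcode
+ *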
+ * The object may not be garbage collected until its total reference count is zero. + * After returning from this function the reference is zeroed. + * \param [in] arr The pointer to the Array to release. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE arr is not a valid \ref vx_array reference. + * \ingroup group_array + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseArray(vx_array *arr); + +/*! + * \brief Queries the Array for some specific information. + * + * \param [in] arr The reference to the Array. + * \param [in] attribute The attribute to query. Use a \ref vx_array_attribute_e. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE arr is not a valid \ref vx_array reference. + * \retval VX_ERROR_NOT_SUPPORTED If the \a attribute is not a value supported on this implementation. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * + * \ingroup group_array + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryArray(vx_array arr, vx_enum attribute, void *ptr, vx_size size); + +/*! + * \brief Adds items to the Array. + * + * This function increases the container size. + * + * By default, the function does not reallocate memory, + * so if the container is already full (number of elements is equal to capacity) + * or it doesn't have enough space, + * the function returns \ref VX_FAILURE error code. + * + * \param [in] arr The reference to the Array. + * \param [in] count The total number of elements to insert. + * \param [in] ptr The location from which to read the input values. + * \param [in] stride The number of bytes between the beginning of two consecutive elements. + * + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE arr is not a valid \ref vx_array reference. + * \retval VX_FAILURE If the Array is full. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * + * \ingroup group_array + */ +VX_API_ENTRY vx_status VX_API_CALL vxAddArrayItems(vx_array arr, vx_size count, const void *ptr, vx_size stride); + +/*! + * \brief Truncates an Array (remove items from the end). + * + * \param [in,out] arr The reference to the Array. + * \param [in] new_num_items The new number of items for the Array. + * + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE arr is not a valid \ref vx_array reference. + * \retval VX_ERROR_INVALID_PARAMETERS The \a new_size is greater than the current size. + * + * \ingroup group_array + */ +VX_API_ENTRY vx_status VX_API_CALL vxTruncateArray(vx_array arr, vx_size new_num_items); + +/*! \brief Allows the application to copy a range from/into an array object. + * \param [in] array The reference to the array object that is the source or the + * destination of the copy. + * \param [in] range_start The index of the first item of the array object to copy. + * \param [in] range_end The index of the item following the last item of the + * array object to copy. (range_end range_start) items are copied from index + * range_start included. 
The range must be within the bounds of the array: + * 0 <= range_start < range_end <= number of items in the array. + * \param [in] user_stride The number of bytes between the beginning of two consecutive + * items in the user memory pointed by user_ptr. The layout of the user memory must + * follow an item major order: + * user_stride >= element size in bytes. + * \param [in] user_ptr The address of the memory location where to store the requested data + * if the copy was requested in read mode, or from where to get the data to store into the array + * object if the copy was requested in write mode. The accessible memory must be large enough + * to contain the specified range with the specified stride: + * accessible memory in bytes >= (range_end range_start) * user_stride. + * \param [in] usage This declares the effect of the copy with regard to the array object + * using the \ref vx_accessor_e enumeration. Only \ref VX_READ_ONLY and \ref VX_WRITE_ONLY + * are supported: + * \arg \ref VX_READ_ONLY means that data are copied from the array object into the user memory. + * \arg \ref VX_WRITE_ONLY means that data are copied into the array object from the user memory. + * \param [in] user_mem_type A \ref vx_memory_type_e enumeration that specifies + * the memory type of the memory referenced by the user_addr. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_OPTIMIZED_AWAY This is a reference to a virtual array that cannot be + * accessed by the application. + * \retval VX_ERROR_INVALID_REFERENCE array is not a valid \ref vx_array reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_array + */ +VX_API_ENTRY vx_status VX_API_CALL vxCopyArrayRange(vx_array array, vx_size range_start, vx_size range_end, vx_size user_stride, void *user_ptr, vx_enum usage, vx_enum user_mem_type); + +/*! \brief Allows the application to get direct access to a range of an array object. + * \param [in] array The reference to the array object that contains the range to map. + * \param [in] range_start The index of the first item of the array object to map. + * \param [in] range_end The index of the item following the last item of the + * array object to map. (range_end range_start) items are mapped, starting from index + * range_start included. The range must be within the bounds of the array: + * Must be 0 <= range_start < range_end <= number of items. + * \param [out] map_id The address of a \ref vx_map_id variable where the function + * returns a map identifier. + * \arg (*map_id) must eventually be provided as the map_id parameter of a call to + * \ref vxUnmapArrayRange. + * \param [out] stride The address of a vx_size variable where the function + * returns the memory layout of the mapped array range. The function sets (*stride) + * to the number of bytes between the beginning of two consecutive items. + * The application must consult (*stride) to access the array items starting from + * address (*ptr). The layout of the mapped array follows an item major order: + * (*stride) >= item size in bytes. + * \param [out] ptr The address of a pointer that the function sets to the + * address where the requested data can be accessed. The returned (*ptr) address + * is only valid between the call to the function and the corresponding call to + * \ref vxUnmapArrayRange. + * \param [in] usage This declares the access mode for the array range, using + * the \ref vx_accessor_e enumeration. 
+ * \arg \ref VX_READ_ONLY: after the function call, the content of the memory location + * pointed by (*ptr) contains the array range data. Writing into this memory location + * is forbidden and its behavior is undefined. + * \arg \ref VX_READ_AND_WRITE: after the function call, the content of the memory + * location pointed by (*ptr) contains the array range data; writing into this memory + * is allowed only for the location of items and will result in a modification of the + * affected items in the array object once the range is unmapped. Writing into + * a gap between items (when (*stride) > item size in bytes) is forbidden and its + * behavior is undefined. + * \arg \ref VX_WRITE_ONLY: after the function call, the memory location pointed by (*ptr) + * contains undefined data; writing each item of the range is required prior to + * unmapping. Items not written by the application before unmap will become + * undefined after unmap, even if they were well defined before map. Like for + * VX_READ_AND_WRITE, writing into a gap between items is forbidden and its behavior + * is undefined. + * \param [in] mem_type A \ref vx_memory_type_e enumeration that + * specifies the type of the memory where the array range is requested to be mapped. + * \param [in] flags An integer that allows passing options to the map operation. + * Use the \ref vx_map_flag_e enumeration. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_OPTIMIZED_AWAY This is a reference to a virtual array that cannot be + * accessed by the application. + * \retval VX_ERROR_INVALID_REFERENCE array is not a valid \ref vx_array reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_array + * \post \ref vxUnmapArrayRange with same (*map_id) value. + */ +VX_API_ENTRY vx_status VX_API_CALL vxMapArrayRange(vx_array array, vx_size range_start, vx_size range_end, vx_map_id *map_id, vx_size *stride, void **ptr, vx_enum usage, vx_enum mem_type, vx_uint32 flags); + +/*! \brief Unmap and commit potential changes to an array object range that was previously mapped. + * Unmapping an array range invalidates the memory location from which the range could + * be accessed by the application. Accessing this memory location after the unmap function + * completes has an undefined behavior. + * \param [in] array The reference to the array object to unmap. + * \param [out] map_id The unique map identifier that was returned when calling + * \ref vxMapArrayRange . + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE array is not a valid \ref vx_array reference. + * \retval VX_ERROR_INVALID_PARAMETERS An other parameter is incorrect. + * \ingroup group_array + * \pre \ref vxMapArrayRange returning the same map_id value + */ +VX_API_ENTRY vx_status VX_API_CALL vxUnmapArrayRange(vx_array array, vx_map_id map_id); + +/*! + * \brief Accesses a specific indexed element in an array. + * \param [in] ptr The base pointer for the array range. + * \param [in] index The index of the element, not byte, to access. + * \param [in] stride The 'number of bytes' between the beginning of two consecutive elements. + * \ingroup group_array + */ +#define vxFormatArrayPointer(ptr, index, stride) \ + (&(((vx_uint8*)(ptr))[(index) * (stride)])) + +/*! + * \brief Allows access to an array item as a typecast pointer deference. 
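+ *
+ * An access sketch (illustrative only; assumes an existing array arr of
+ * vx_keypoint_t items that has already been populated):
+ * \code
+ * vx_size i, stride = 0, num_items = 0;
+ * vx_map_id map_id;
+ * void *base = NULL;
+ * vxQueryArray(arr, VX_ARRAY_NUMITEMS, &num_items, sizeof(num_items));
+ * vxMapArrayRange(arr, 0, num_items, &map_id, &stride, &base,
+ *                 VX_READ_ONLY, VX_MEMORY_TYPE_HOST, 0);
+ * for (i = 0; i < num_items; i++)
+ * {
+ *     vx_keypoint_t kp = vxArrayItem(vx_keypoint_t, base, i, stride);
+ *     // ... use kp ...
+ * }
+ * vxUnmapArrayRange(arr, map_id);
+ * \endcode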
+ * \param [in] type The type of the item to access. + * \param [in] ptr The base pointer for the array range. + * \param [in] index The index of the element, not byte, to access. + * \param [in] stride The 'number of bytes' between the beginning of two consecutive elements. + * \ingroup group_array + */ +#define vxArrayItem(type, ptr, index, stride) \ + (*(type *)(vxFormatArrayPointer((ptr), (index), (stride)))) + + +/*============================================================================== + OBJECT ARRAY + =============================================================================*/ +/*! + * \brief Creates a reference to an ObjectArray of count objects. + * + * It uses the metadata of the exemplar to determine the object attributes, + * ignoring the object data. It does not alter the exemplar or keep or release + * the reference to the exemplar. For the definition of supported attributes see + * \ref vxSetMetaFormatAttribute. In case the exemplar is a virtual object + * it must be of immutable metadata, thus it is not allowed to be dimensionless or formatless. + * + * \param [in] context The reference to the overall Context. + * \param [in] exemplar The exemplar object that defines the metadata of the created objects in the ObjectArray. + * \param [in] count Number of Objects to create in the ObjectArray. This value must be greater than zero. + * + * \returns An ObjectArray reference \ref vx_object_array. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. Data objects are not initialized by this function. + * + * \ingroup group_object_array + */ +VX_API_ENTRY vx_object_array VX_API_CALL vxCreateObjectArray(vx_context context, vx_reference exemplar, vx_size count); + +/*! + * \brief Creates an opaque reference to a virtual ObjectArray with no direct user access. + * + * This function creates an ObjectArray of count objects with similar behavior as + * \ref vxCreateObjectArray. The only difference is that the objects that are + * created are virtual in the given graph. + * + * \param [in] graph Reference to the graph where to create the virtual ObjectArray. + * \param [in] exemplar The exemplar object that defines the type of object in the ObjectArray. + * Only exemplar type of \ref vx_image, \ref vx_array and + * \ref vx_pyramid are allowed. + * \param [in] count Number of Objects to create in the ObjectArray. + * \returns A ObjectArray reference \ref vx_object_array. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_object_array + */ +VX_API_ENTRY vx_object_array VX_API_CALL vxCreateVirtualObjectArray(vx_graph graph, vx_reference exemplar, vx_size count); + +/*! + * \brief Retrieves the reference to the OpenVX Object in location index of the ObjectArray. + * + * This is a vx_reference, which can be used elsewhere in OpenVX. A call to vxRelease or \ref vxReleaseReference + * is necessary to release the Object for each call to this function. + * + * \param [in] arr The ObjectArray. + * \param [in] index The index of the object in the ObjectArray. + * \return A reference to an OpenVX data object. Any possible errors preventing a successful + * completion of the function should be checked using \ref vxGetStatus. + * \ingroup group_object_array + */ +VX_API_ENTRY vx_reference VX_API_CALL vxGetObjectArrayItem(vx_object_array arr, vx_uint32 index); + +/*! + * \brief Releases a reference of an ObjectArray object. 
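+ *
+ * A create/use/release sketch (illustrative only; the exemplar image and the
+ * count of 4 are placeholders and a previously created context is assumed):
+ * \code
+ * vx_image exemplar = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);
+ * vx_object_array objs = vxCreateObjectArray(context, (vx_reference)exemplar, 4);
+ * vx_image first = (vx_image)vxGetObjectArrayItem(objs, 0);
+ * // ... use first ...
+ * vxReleaseImage(&first);
+ * vxReleaseImage(&exemplar);
+ * vxReleaseObjectArray(&objs);
+ * \endcode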
+ * + * The object may not be garbage collected until its total reference and its contained objects + * count is zero. After returning from this function the reference is zeroed/cleared. + * + * \param [in] arr The pointer to the ObjectArray to release. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE arr is not a valid \ref vx_object_array reference. + * \ingroup group_object_array + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseObjectArray(vx_object_array *arr); + +/*! + * \brief Queries an atribute from the ObjectArray. + * + * \param [in] arr The reference to the ObjectArray. + * \param [in] attribute The attribute to query. Use a \ref vx_object_array_attribute_e. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE arr is not a valid \ref vx_object_array reference. + * \retval VX_ERROR_NOT_SUPPORTED If the \a attribute is not a value supported on this implementation. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * + * \ingroup group_object_array + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryObjectArray(vx_object_array arr, vx_enum attribute, void *ptr, vx_size size); + + +/*============================================================================== + META FORMAT + =============================================================================*/ + +/*! \brief This function allows a user to set the attributes of a \ref vx_meta_format object in a kernel output validator. + * + * The \ref vx_meta_format object contains two types of information: data object meta data and + * some specific information that defines how the valid region of an image changes + * + * The meta data attributes that can be set are identified by this list: + * - \ref vx_image : \ref VX_IMAGE_FORMAT, \ref VX_IMAGE_HEIGHT, \ref VX_IMAGE_WIDTH + * - \ref vx_array : \ref VX_ARRAY_CAPACITY, \ref VX_ARRAY_ITEMTYPE + * - \ref vx_pyramid : \ref VX_PYRAMID_FORMAT, \ref VX_PYRAMID_HEIGHT, \ref VX_PYRAMID_WIDTH, \ref VX_PYRAMID_LEVELS, \ref VX_PYRAMID_SCALE + * - \ref vx_scalar : \ref VX_SCALAR_TYPE + * - \ref vx_matrix : \ref VX_MATRIX_TYPE, \ref VX_MATRIX_ROWS, \ref VX_MATRIX_COLUMNS + * - \ref vx_distribution : \ref VX_DISTRIBUTION_BINS, \ref VX_DISTRIBUTION_OFFSET, \ref VX_DISTRIBUTION_RANGE + * - \ref vx_remap : \ref VX_REMAP_SOURCE_WIDTH, \ref VX_REMAP_SOURCE_HEIGHT, \ref VX_REMAP_DESTINATION_WIDTH, \ref VX_REMAP_DESTINATION_HEIGHT + * - \ref vx_lut : \ref VX_LUT_TYPE, \ref VX_LUT_COUNT + * - \ref vx_threshold : \ref VX_THRESHOLD_TYPE, \ref VX_THRESHOLD_INPUT_FORMAT, \ref VX_THRESHOLD_INPUT_FORMAT + * - \ref vx_object_array : \ref VX_OBJECT_ARRAY_NUMITEMS, \ref VX_OBJECT_ARRAY_ITEMTYPE + * - \ref vx_tensor : \ref VX_TENSOR_NUMBER_OF_DIMS, \ref VX_TENSOR_DIMS, \ref VX_TENSOR_DATA_TYPE, \ref VX_TENSOR_FIXED_POINT_POSITION + * - \ref VX_VALID_RECT_CALLBACK + * \note For vx_image, a specific attribute can be used to specify the valid region evolution. This information is not a meta data. + * + * \param [in] meta The reference to the \ref vx_meta_format struct to set + * \param [in] attribute Use the subset of data object attributes that define the meta data of this object or attributes from \ref vx_meta_format. 
+ * \param [in] ptr The input pointer of the value to set on the meta format object. + * \param [in] size The size in bytes of the object to which \a ptr points. + * \ingroup group_user_kernels + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS The attribute was set; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE meta is not a valid \ref vx_meta_format reference. + * \retval VX_ERROR_INVALID_PARAMETERS size was not correct for the type needed. + * \retval VX_ERROR_NOT_SUPPORTED the object attribute was not supported on the meta format object. + * \retval VX_ERROR_INVALID_TYPE attribute type did not match known meta format type. + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetMetaFormatAttribute(vx_meta_format meta, vx_enum attribute, const void *ptr, vx_size size); + +/*! \brief Set a meta format object from an exemplar data object reference + * + * This function sets a \ref vx_meta_format object from the meta data of the exemplar + * + * \param [in] meta The meta format object to set + * \param [in] exemplar The exemplar data object. + * \ingroup group_user_kernels + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS The meta format was correctly set; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE meta is not a valid \ref vx_meta_format reference, + * or exemplar is not a valid \ref vx_reference reference. + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetMetaFormatFromReference(vx_meta_format meta, vx_reference exemplar); + +/*! \brief This function allows a user to query the attributes of a \ref vx_meta_format object in a kernel parameter. + * + * The \ref vx_meta_format object contains two types of information: data object meta data and + * some specific information that defines how the valid region of an image changes + * + * The meta data attributes that can be queried are identified by this list: + * - \ref vx_image : \ref VX_IMAGE_FORMAT, \ref VX_IMAGE_HEIGHT, \ref VX_IMAGE_WIDTH + * - \ref vx_array : \ref VX_ARRAY_CAPACITY, \ref VX_ARRAY_ITEMTYPE + * - \ref vx_pyramid : \ref VX_PYRAMID_FORMAT, \ref VX_PYRAMID_HEIGHT, \ref VX_PYRAMID_WIDTH, \ref VX_PYRAMID_LEVELS, \ref VX_PYRAMID_SCALE + * - \ref vx_scalar : \ref VX_SCALAR_TYPE + * - \ref vx_matrix : \ref VX_MATRIX_TYPE, \ref VX_MATRIX_ROWS, \ref VX_MATRIX_COLUMNS + * - \ref vx_distribution : \ref VX_DISTRIBUTION_BINS, \ref VX_DISTRIBUTION_OFFSET, \ref VX_DISTRIBUTION_RANGE + * - \ref vx_remap : \ref VX_REMAP_SOURCE_WIDTH, \ref VX_REMAP_SOURCE_HEIGHT, \ref VX_REMAP_DESTINATION_WIDTH, \ref VX_REMAP_DESTINATION_HEIGHT + * - \ref vx_lut : \ref VX_LUT_TYPE, \ref VX_LUT_COUNT + * - \ref vx_threshold : \ref VX_THRESHOLD_TYPE, \ref VX_THRESHOLD_INPUT_FORMAT, \ref VX_THRESHOLD_INPUT_FORMAT + * - \ref vx_object_array : \ref VX_OBJECT_ARRAY_NUMITEMS, \ref VX_OBJECT_ARRAY_ITEMTYPE + * - \ref vx_tensor : \ref VX_TENSOR_NUMBER_OF_DIMS, \ref VX_TENSOR_DIMS, \ref VX_TENSOR_DATA_TYPE, \ref VX_TENSOR_FIXED_POINT_POSITION + * - \ref VX_VALID_RECT_CALLBACK + * \note For vx_image, a specific attribute can be used to query the valid region evolution. This information is not a meta data. + * + * \param [in] meta The reference to the \ref vx_meta_format struct to query + * \param [in] attribute Use the subset of data object attributes that define the meta data of this object or attributes from \ref vx_meta_format. + * \param [out] ptr The output pointer of the value to query on the meta format object. + * \param [in] size The size in bytes of the object to which \a ptr points. 
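+ *
+ * A query sketch (illustrative only; assumes the meta format object describes a
+ * vx_image parameter, for example inside a kernel validator callback):
+ * \code
+ * vx_uint32 width = 0, height = 0;
+ * vx_df_image format = VX_DF_IMAGE_VIRT;
+ * vxQueryMetaFormatAttribute(meta, VX_IMAGE_WIDTH, &width, sizeof(width));
+ * vxQueryMetaFormatAttribute(meta, VX_IMAGE_HEIGHT, &height, sizeof(height));
+ * vxQueryMetaFormatAttribute(meta, VX_IMAGE_FORMAT, &format, sizeof(format));
+ * \endcode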
+ * \ingroup group_import_kernel
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS The attribute was returned; any other value indicates failure.
+ * \retval VX_ERROR_INVALID_REFERENCE meta is not a valid \ref vx_meta_format reference.
+ * \retval VX_ERROR_INVALID_PARAMETERS size was not correct for the type needed.
+ * \retval VX_ERROR_NOT_SUPPORTED the object attribute was not supported on the meta format object.
+ * \retval VX_ERROR_INVALID_TYPE attribute type did not match known meta format type.
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxQueryMetaFormatAttribute(vx_meta_format meta, vx_enum attribute, void *ptr, vx_size size);
+
+VX_API_ENTRY vx_status VX_API_CALL
+vxConfigTarget(
+ vx_context context,
+ vx_int32 dp_amount,
+ vx_int32 mad_per_core,
+ vx_int32 conv_cores,
+ vx_int32 in_buffer_depth,
+ vx_int32 accum_buffer_height,
+ vx_int32 l2_cache_size,
+ vx_int32 tp_cores
+);
+
+/*==============================================================================
+ TENSOR DATA FUNCTIONS
+=============================================================================*/
+/*! \brief Creates an opaque reference to a tensor data buffer.
+ * \details Not guaranteed to exist until the \ref vx_graph containing it has been verified.
+ * Since functions using tensors need to understand the context of each dimension, we describe a layout of the dimensions in each function using tensors.
+ * That layout is not mandatory. It is done specifically to explain the functions and not to mandate layout. Different implementations may have different layouts.
+ * Therefore the layout description is logical and not physical. It refers to the order of dimensions given in this function.
+ * \param [in] context The reference to the implementation context.
+ * \param [in] number_of_dims The number of dimensions.
+ * \param [in] dims Dimensions sizes in elements.
+ * \param [in] data_type The \ref vx_type_e that represents the data type of the tensor data elements.
+ * \param [in] fixed_point_position Specifies the fixed point position when the input element type is integer. If 0, calculations are performed in integer math.
+ * \return A tensor data reference. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_object_tensor
+ */
+VX_API_ENTRY vx_tensor VX_API_CALL vxCreateTensor(vx_context context, vx_size number_of_dims, const vx_size * dims, vx_enum data_type, vx_int8 fixed_point_position);
+
+/*! \brief Creates an array of images that view the multi-dimensional tensor data; these can be adjacent 2D images or not, depending on the stride value.
+ * The stride value represents bytes in the third dimension.
+ * Each OpenVX image object points into the three-dimensional data and accesses it as a 2D image.
+ * The images are taken from a portion of the third-lowest dimension, and the stride corresponds to that third dimension.
+ * The returned object array is an array of images whose image data points to specific memory inside the input tensor.
+ * \param [in] tensor The tensor data from which to extract the images. Must be a 3D tensor.
+ * \param [in] rect Image coordinates within tensor data.
+ * \param [in] array_size Number of images to extract.
+ * \param [in] jump Delta between two images in the array.
+ * \param [in] image_format The requested image format. Should match the tensor data's data type.
+ * \return An array of images pointing to the tensor data's data.
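+ *
+ * A slicing sketch (illustrative only; the dimensions, rectangle and jump value
+ * are arbitrary and only intended to show the call shape):
+ * \code
+ * vx_size dims[3] = { 640, 480, 10 };            // width, height, 10 images
+ * vx_tensor t = vxCreateTensor(context, 3, dims, VX_TYPE_UINT8, 0);
+ * vx_rectangle_t rect = { 0, 0, 640, 480 };
+ * vx_object_array imgs =
+ *     vxCreateImageObjectArrayFromTensor(t, &rect, 10, 1, VX_DF_IMAGE_U8);
+ * \endcode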
+ * \ingroup group_object_tensor
+ */
+VX_API_ENTRY vx_object_array VX_API_CALL vxCreateImageObjectArrayFromTensor(vx_tensor tensor, const vx_rectangle_t *rect, vx_size array_size, vx_size jump, vx_df_image image_format);
+
+/*! \brief Creates a tensor data from another tensor data given a view. This second
+ * reference refers to the data in the original tensor data. Updates to this tensor data
+ * update the parent tensor data. The view must be defined within the dimensions
+ * of the parent tensor data.
+ * \param [in] tensor The reference to the parent tensor data.
+ * \param [in] number_of_dims Number of dimensions in the view. Error return if 0 or greater than number of
+ * tensor dimensions. If smaller than number of tensor dimensions, the lower dimensions are assumed.
+ * \param [in] view_start View start coordinates.
+ * \param [in] view_end View end coordinates.
+ * \return The reference to the sub-tensor. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_object_tensor
+ */
+VX_API_ENTRY vx_tensor VX_API_CALL vxCreateTensorFromView(vx_tensor tensor, vx_size number_of_dims, const vx_size * view_start, const vx_size * view_end);
+
+/*! \brief Creates an opaque reference to a tensor data buffer with no direct
+ * user access. This function allows setting the tensor data dimensions or data format.
+ * \details Virtual data objects allow users to connect various nodes within a
+ * graph via data references without access to that data, but they also permit the
+ * implementation to take maximum advantage of possible optimizations. Use this
+ * API to create a data reference to link two or more nodes together when the
+ * intermediate data are not required to be accessed by outside entities. This API
+ * in particular allows the user to define the tensor data format of the data without
+ * requiring the exact dimensions. Virtual objects are scoped within the graph
+ * they are declared a part of, and can't be shared outside of this scope.
+ * Since functions using tensors need to understand the context of each dimension, we describe a layout of the dimensions in each function.
+ * That layout is not mandated. It is done specifically to explain the functions and not to mandate layout. Different implementations may have different layouts.
+ * Therefore the layout description is logical and not physical. It refers to the order of dimensions given in \ref vxCreateTensor and \ref vxCreateVirtualTensor.
+ * \param [in] graph The reference to the parent graph.
+ * \param [in] number_of_dims The number of dimensions.
+ * \param [in] dims Dimensions sizes in elements.
+ * \param [in] data_type The \ref vx_type_e that represents the data type of the tensor data elements.
+ * \param [in] fixed_point_position Specifies the fixed point position when the input element type is integer. If 0, calculations are performed in integer math.
+ * \return A tensor data reference. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \note Passing this reference to \ref vxCopyTensorPatch will return an error.
+ * \ingroup group_object_tensor
+ */
+VX_API_ENTRY vx_tensor VX_API_CALL vxCreateVirtualTensor(vx_graph graph, vx_size number_of_dims, const vx_size *dims, vx_enum data_type, vx_int8 fixed_point_position);
+
+
+/*! \brief Creates a reference to a tensor object that was externally allocated.
+ * \param [in] context The reference to the implementation context.
+ * \param [in] number_of_dims The number of dimensions.
+ * \param [in] dims Dimensions sizes in elements.
+ * \param [in] data_type The \ref vx_type_e that represents the data type of the tensor data elements.
+ * \param [in] fixed_point_position Specifies the fixed point position when the input element type is integer. If 0, calculations are performed in integer math.
+ * \param [in] stride An array of stride in all dimensions in bytes. The stride value at index 0 must be the size of the tensor data element type.
+ * \param [in] ptr The platform-defined reference to tensor. See note below.
+ * \param [in] memory_type \ref vx_memory_type_e. When giving \ref VX_MEMORY_TYPE_HOST
+ * the \a ptr is assumed to be a HOST accessible pointer to memory.
+ * \return A tensor data reference. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \note The user must call vxMapTensorPatch prior to accessing the elements of a tensor, even if the
+ * tensor was created via \ref vxCreateTensorFromHandle. Reads or writes to memory referenced
+ * by ptr after calling \ref vxCreateTensorFromHandle without first calling
+ * \ref vxMapTensorPatch will result in undefined behavior.
+ * The property of stride[] and ptr is kept by the caller (this means that the implementation will
+ * make an internal copy of the provided information; \a stride and \a ptr can then simply be the application's
+ * local variables).
+ *
+ * In order to release the tensor back to the application, use \ref vxSwapTensorHandle.
+ *
+ * \ingroup group_object_tensor
+ */
+VX_API_ENTRY vx_tensor VX_API_CALL vxCreateTensorFromHandle(vx_context context, vx_size number_of_dims, const vx_size *dims, vx_enum data_type, vx_int8 fixed_point_position, const vx_size * stride, void * ptr, vx_enum memory_type);
+
+/*! \brief Swaps the tensor handle of a tensor previously created from handle.
+ *
+ * This function sets the new tensor handle
+ * and returns the previous one.
+ *
+ * Once this function call has completed, the application gets back the
+ * ownership of the memory referenced by the previous handle. This memory
+ * contains up-to-date tensor data, and the application can safely reuse or
+ * release it.
+ *
+ * The memory referenced by the new handle must have been allocated
+ * consistently with the tensor properties since the import type,
+ * memory layout and dimensions are unchanged (see stride and
+ * memory_type in \ref vxCreateTensorFromHandle).
+ *
+ * All tensors created from view with this tensor as parent or ancestor
+ * will automatically use the memory referenced by the new handle.
+ *
+ * The behavior of \ref vxSwapTensorHandle when called from a user node is undefined.
+ * \param [in] tensor The reference to a tensor created from handle.
+ * \param [in] new_ptr New tensor handle.
+ * If new_ptr is NULL, the previous tensor storage memory is reclaimed by the
+ * caller, while no new handle is provided.
+ * \param [out] prev_ptr Pointer to return the previous tensor handle.
+ * If prev_ptr is NULL, the previous handle is not returned.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS No errors.
+ * \retval VX_ERROR_INVALID_REFERENCE tensor is not a valid \ref vx_tensor reference.
+ * \retval VX_ERROR_INVALID_PARAMETERS The tensor was not created from handle or
+ * the content of new_ptr is not valid.
+ * \retval VX_FAILURE The tensor was already being accessed.
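+ *
+ * A handle-swap sketch (illustrative only; tensor and new_buffer are assumed to
+ * exist and new_buffer must match the layout given at creation from handle):
+ * \code
+ * void *prev_ptr = NULL;
+ * vx_status status = vxSwapTensorHandle(tensor, new_buffer, &prev_ptr);
+ * if (status == VX_SUCCESS)
+ * {
+ *     // prev_ptr now points to the old storage; the application owns it again
+ * }
+ * \endcode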
+ * \ingroup group_tensor
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxSwapTensorHandle(vx_tensor tensor, void* new_ptr, void** prev_ptr);
+
+/*! \brief Allows the application to copy a view patch from/into a tensor object.
+ * \param [in] tensor The reference to the tensor object that is the source or the
+ * destination of the copy.
+ * \param [in] number_of_dims Number of patch dimensions. Error return if 0 or greater than number of
+ * tensor dimensions. If smaller than number of tensor dimensions, the lower dimensions are assumed.
+ * \param [in] view_start Array of patch start points in each dimension.
+ * \param [in] view_end Array of patch end points in each dimension.
+ * \param [in] user_stride Array of user memory strides in each dimension.
+ * \param [in] user_ptr The address of the memory location where to store the requested data
+ * if the copy was requested in read mode, or from where to get the data to store into the tensor
+ * object if the copy was requested in write mode. The accessible memory must be large enough
+ * to contain the specified patch with the specified layout:\n
+ * accessible memory in bytes >= (end[last_dimension] - start[last_dimension]) * stride[last_dimension].\n
+ * The layout of the user memory must follow a row major order.
+ * \param [in] usage This declares the effect of the copy with regard to the tensor object
+ * using the \ref vx_accessor_e enumeration. Only \ref VX_READ_ONLY and \ref VX_WRITE_ONLY are supported:
+ * \arg \ref VX_READ_ONLY means that data is copied from the tensor object into the application memory
+ * \arg \ref VX_WRITE_ONLY means that data is copied into the tensor object from the application memory
+ * \param [in] user_memory_type A \ref vx_memory_type_e enumeration that specifies
+ * the memory type of the memory referenced by the user_addr.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_ERROR_OPTIMIZED_AWAY This is a reference to a virtual tensor that cannot be
+ * accessed by the application.
+ * \retval VX_ERROR_INVALID_REFERENCE The tensor reference is not actually a tensor reference.
+ * \retval VX_ERROR_INVALID_PARAMETERS Another parameter is incorrect.
+ * \ingroup group_object_tensor
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxCopyTensorPatch(vx_tensor tensor, vx_size number_of_dims, const vx_size * view_start, const vx_size * view_end,
+ const vx_size * user_stride, void * user_ptr, vx_enum usage, vx_enum user_memory_type);
+
+/*! \brief Allows the application to get direct access to a patch of a tensor object.
+ * \param [in] tensor The reference to the tensor object that is the source or the
+ * destination for direct access.
+ * \param [in] number_of_dims The number of dimensions. Must be the same as the tensor number_of_dims.
+ * \param [in] view_start Array of patch start points in each dimension. This is an optional parameter and will be zero when NULL.
+ * \param [in] view_end Array of patch end points in each dimension. This is an optional parameter and will be dims[] of the tensor when NULL.
+ * \param [out] map_id The address of a vx_map_id variable where the function returns a map identifier.
+ * \arg (*map_id) must eventually be provided as the map_id parameter of a call to \ref vxUnmapTensorPatch.
+ * \param [out] stride An array of stride in all dimensions in bytes. The stride value at index 0 must be the size of the tensor data element type.
+ * \param [out] ptr The address of a pointer that the function sets to the
+ * address where the requested data can be accessed.
The returned (*ptr) address
+ * is only valid between the call to the function and the corresponding call to
+ * \ref vxUnmapTensorPatch.
+ * \param [in] usage This declares the access mode for the tensor patch, using
+ * the \ref vx_accessor_e enumeration.
+ * \arg VX_READ_ONLY: after the function call, the content of the memory location
+ * pointed by (*ptr) contains the tensor patch data. Writing into this memory location
+ * is forbidden and its behavior is undefined.
+ * \arg VX_READ_AND_WRITE: after the function call, the content of the memory
+ * location pointed by (*ptr) contains the tensor patch data; writing into this memory
+ * is allowed only for the location of items and will result in a modification of the
+ * affected items in the tensor object once the range is unmapped. Writing into
+ * a gap between items (when (*stride) > item size in bytes) is forbidden and its
+ * behavior is undefined.
+ * \arg VX_WRITE_ONLY: after the function call, the memory location pointed by (*ptr)
+ * contains undefined data; writing each item of the range is required prior to
+ * unmapping. Items not written by the application before unmap will become
+ * undefined after unmap, even if they were well defined before map. Like for
+ * VX_READ_AND_WRITE, writing into a gap between items is forbidden and its behavior
+ * is undefined.
+ * \param [in] mem_type A \ref vx_memory_type_e enumeration that
+ * specifies the type of the memory where the tensor patch is requested to be mapped.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_ERROR_OPTIMIZED_AWAY This is a reference to a virtual tensor that cannot be accessed by the application.
+ * \retval VX_ERROR_INVALID_REFERENCE The tensor reference is not actually a tensor reference.
+ * \retval VX_ERROR_INVALID_PARAMETERS Another parameter is incorrect.
+ * \retval VX_ERROR_NO_MEMORY Internal memory allocation failed.
+ * \ingroup group_tensor
+ * \post \ref vxUnmapTensorPatch with same (*map_id) value.
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxMapTensorPatch(vx_tensor tensor, vx_size number_of_dims, const vx_size * view_start, const vx_size * view_end, vx_map_id * map_id, vx_size * stride, void ** ptr, vx_enum usage, vx_enum mem_type);
+
+/*! \brief Unmap and commit potential changes to a tensor object patch that was previously mapped.
+ * Unmapping a tensor patch invalidates the memory location from which the patch could
+ * be accessed by the application. Accessing this memory location after the unmap function
+ * completes has an undefined behavior.
+ * \param [in] tensor The reference to the tensor object to unmap.
+ * \param [in] map_id The unique map identifier that was returned when calling
+ * \ref vxMapTensorPatch.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_ERROR_INVALID_REFERENCE The tensor reference is not actually a tensor reference.
+ * \retval VX_ERROR_INVALID_PARAMETERS Another parameter is incorrect.
+ * \ingroup group_tensor
+ * \pre \ref vxMapTensorPatch returning the same map_id value.
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxUnmapTensorPatch(vx_tensor tensor, const vx_map_id map_id);
+
+/*! \brief Retrieves various attributes of a tensor data.
+ * \param [in] tensor The reference to the tensor data to query.
+ * \param [in] attribute The attribute to query. Use a \ref vx_tensor_attribute_e.
+ * \param [out] ptr The location at which to store the resulting value.
+ * \param [in] size The size of the container to which \a ptr points.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If data is not a \ref vx_tensor. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * \ingroup group_object_tensor + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryTensor(vx_tensor tensor, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Releases a reference to a tensor data object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] tensor The pointer to the tensor data to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; all other values indicate failure + * \retval * An error occurred. See \ref vx_status_e. + * \ingroup group_object_tensor + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseTensor(vx_tensor *tensor); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_compatibility.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_compatibility.h new file mode 100644 index 0000000..293fde4 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_compatibility.h @@ -0,0 +1,253 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef VX_1_0_1_NAMING_COMPATIBILITY +#define VX_1_0_1_NAMING_COMPATIBILITY + +#define VX_TYPE_SCALAR_MAX (VX_TYPE_BOOL + 1) + +#define vx_border_mode_e vx_border_e +#define vx_border_mode_policy_e vx_border_policy_e +#define _vx_border_mode_t _vx_border_t +#define vx_border_mode_t vx_border_t + +#define VX_ENUM_BORDER_MODE VX_ENUM_BORDER +#define VX_BORDER_MODE_POLICY VX_BORDER_POLICY +#define VX_BORDER_MODE_UNDEFINED VX_BORDER_UNDEFINED +#define VX_BORDER_MODE_CONSTANT VX_BORDER_CONSTANT +#define VX_BORDER_MODE_REPLICATE VX_BORDER_REPLICATE +#define VX_BORDER_MODE_UNSUPPORTED_POLICY_DEFAULT_TO_UNDEFINED VX_BORDER_POLICY_DEFAULT_TO_UNDEFINED +#define VX_BORDER_MODE_UNSUPPORTED_POLICY_RETURN_ERROR VX_BORDER_POLICY_RETURN_ERROR + +#define VX_REF_ATTRIBUTE_COUNT VX_REFERENCE_COUNT +#define VX_REF_ATTRIBUTE_TYPE VX_REFERENCE_TYPE +#define VX_REF_ATTRIBUTE_NAME VX_REFERENCE_NAME + +#define VX_CONTEXT_ATTRIBUTE_VENDOR_ID VX_CONTEXT_VENDOR_ID +#define VX_CONTEXT_ATTRIBUTE_VERSION VX_CONTEXT_VERSION +#define VX_CONTEXT_ATTRIBUTE_UNIQUE_KERNELS VX_CONTEXT_UNIQUE_KERNELS +#define VX_CONTEXT_ATTRIBUTE_MODULES VX_CONTEXT_MODULES +#define VX_CONTEXT_ATTRIBUTE_REFERENCES VX_CONTEXT_REFERENCES +#define VX_CONTEXT_ATTRIBUTE_IMPLEMENTATION VX_CONTEXT_IMPLEMENTATION +#define VX_CONTEXT_ATTRIBUTE_EXTENSIONS_SIZE VX_CONTEXT_EXTENSIONS_SIZE +#define VX_CONTEXT_ATTRIBUTE_EXTENSIONS VX_CONTEXT_EXTENSIONS +#define VX_CONTEXT_ATTRIBUTE_CONVOLUTION_MAXIMUM_DIMENSION VX_CONTEXT_CONVOLUTION_MAX_DIMENSION +#define VX_CONTEXT_ATTRIBUTE_OPTICAL_FLOW_WINDOW_MAXIMUM_DIMENSION VX_CONTEXT_OPTICAL_FLOW_MAX_WINDOW_DIMENSION +#define VX_CONTEXT_ATTRIBUTE_IMMEDIATE_BORDER_MODE VX_CONTEXT_IMMEDIATE_BORDER +#define VX_CONTEXT_ATTRIBUTE_UNIQUE_KERNEL_TABLE VX_CONTEXT_UNIQUE_KERNEL_TABLE + +#define VX_KERNEL_ATTRIBUTE_PARAMETERS VX_KERNEL_PARAMETERS +#define VX_KERNEL_ATTRIBUTE_NAME VX_KERNEL_NAME +#define VX_KERNEL_ATTRIBUTE_ENUM VX_KERNEL_ENUM +#define VX_KERNEL_ATTRIBUTE_LOCAL_DATA_SIZE VX_KERNEL_LOCAL_DATA_SIZE +#define VX_KERNEL_ATTRIBUTE_LOCAL_DATA_PTR (VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0x4) + +#define VX_NODE_ATTRIBUTE_STATUS VX_NODE_STATUS +#define VX_NODE_ATTRIBUTE_PERFORMANCE VX_NODE_PERFORMANCE +#define VX_NODE_ATTRIBUTE_BORDER_MODE VX_NODE_BORDER +#define VX_NODE_ATTRIBUTE_LOCAL_DATA_SIZE VX_NODE_LOCAL_DATA_SIZE +#define VX_NODE_ATTRIBUTE_LOCAL_DATA_PTR VX_NODE_LOCAL_DATA_PTR + +#define VX_PARAMETER_ATTRIBUTE_INDEX VX_PARAMETER_INDEX +#define VX_PARAMETER_ATTRIBUTE_DIRECTION VX_PARAMETER_DIRECTION +#define VX_PARAMETER_ATTRIBUTE_TYPE VX_PARAMETER_TYPE +#define VX_PARAMETER_ATTRIBUTE_STATE VX_PARAMETER_STATE +#define VX_PARAMETER_ATTRIBUTE_REF VX_PARAMETER_REF + +#define VX_IMAGE_ATTRIBUTE_WIDTH VX_IMAGE_WIDTH +#define VX_IMAGE_ATTRIBUTE_HEIGHT VX_IMAGE_HEIGHT +#define VX_IMAGE_ATTRIBUTE_FORMAT VX_IMAGE_FORMAT +#define VX_IMAGE_ATTRIBUTE_PLANES VX_IMAGE_PLANES +#define VX_IMAGE_ATTRIBUTE_SPACE VX_IMAGE_SPACE +#define VX_IMAGE_ATTRIBUTE_RANGE VX_IMAGE_RANGE +#define VX_IMAGE_ATTRIBUTE_SIZE VX_IMAGE_SIZE + +#define VX_SCALAR_ATTRIBUTE_TYPE VX_SCALAR_TYPE + +#define VX_GRAPH_ATTRIBUTE_NUMNODES VX_GRAPH_NUMNODES +#define VX_GRAPH_ATTRIBUTE_STATUS (VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_GRAPH) + 0x1) +#define VX_GRAPH_ATTRIBUTE_PERFORMANCE VX_GRAPH_PERFORMANCE +#define VX_GRAPH_ATTRIBUTE_NUMPARAMETERS VX_GRAPH_NUMPARAMETERS + +#define VX_LUT_ATTRIBUTE_TYPE VX_LUT_TYPE +#define VX_LUT_ATTRIBUTE_COUNT VX_LUT_COUNT +#define VX_LUT_ATTRIBUTE_SIZE VX_LUT_SIZE + +#define 
VX_DISTRIBUTION_ATTRIBUTE_DIMENSIONS VX_DISTRIBUTION_DIMENSIONS +#define VX_DISTRIBUTION_ATTRIBUTE_OFFSET VX_DISTRIBUTION_OFFSET +#define VX_DISTRIBUTION_ATTRIBUTE_RANGE VX_DISTRIBUTION_RANGE +#define VX_DISTRIBUTION_ATTRIBUTE_BINS VX_DISTRIBUTION_BINS +#define VX_DISTRIBUTION_ATTRIBUTE_WINDOW VX_DISTRIBUTION_WINDOW +#define VX_DISTRIBUTION_ATTRIBUTE_SIZE VX_DISTRIBUTION_SIZE + +#define VX_THRESHOLD_ATTRIBUTE_TYPE VX_THRESHOLD_TYPE +#define VX_THRESHOLD_ATTRIBUTE_THRESHOLD_VALUE VX_THRESHOLD_THRESHOLD_VALUE +#define VX_THRESHOLD_ATTRIBUTE_THRESHOLD_LOWER VX_THRESHOLD_THRESHOLD_LOWER +#define VX_THRESHOLD_ATTRIBUTE_THRESHOLD_UPPER VX_THRESHOLD_THRESHOLD_UPPER +#define VX_THRESHOLD_ATTRIBUTE_TRUE_VALUE VX_THRESHOLD_TRUE_VALUE +#define VX_THRESHOLD_ATTRIBUTE_FALSE_VALUE VX_THRESHOLD_FALSE_VALUE +#define VX_THRESHOLD_ATTRIBUTE_DATA_TYPE VX_THRESHOLD_DATA_TYPE + +#define VX_MATRIX_ATTRIBUTE_TYPE VX_MATRIX_TYPE +#define VX_MATRIX_ATTRIBUTE_ROWS VX_MATRIX_ROWS +#define VX_MATRIX_ATTRIBUTE_COLUMNS VX_MATRIX_COLUMNS +#define VX_MATRIX_ATTRIBUTE_SIZE VX_MATRIX_SIZE + +#define VX_CONVOLUTION_ATTRIBUTE_ROWS VX_CONVOLUTION_ROWS +#define VX_CONVOLUTION_ATTRIBUTE_COLUMNS VX_CONVOLUTION_COLUMNS +#define VX_CONVOLUTION_ATTRIBUTE_SCALE VX_CONVOLUTION_SCALE +#define VX_CONVOLUTION_ATTRIBUTE_SIZE VX_CONVOLUTION_SIZE + +#define VX_PYRAMID_ATTRIBUTE_LEVELS VX_PYRAMID_LEVELS +#define VX_PYRAMID_ATTRIBUTE_SCALE VX_PYRAMID_SCALE +#define VX_PYRAMID_ATTRIBUTE_WIDTH VX_PYRAMID_WIDTH +#define VX_PYRAMID_ATTRIBUTE_HEIGHT VX_PYRAMID_HEIGHT +#define VX_PYRAMID_ATTRIBUTE_FORMAT VX_PYRAMID_FORMAT + +#define VX_REMAP_ATTRIBUTE_SOURCE_WIDTH VX_REMAP_SOURCE_WIDTH +#define VX_REMAP_ATTRIBUTE_SOURCE_HEIGHT VX_REMAP_SOURCE_HEIGHT +#define VX_REMAP_ATTRIBUTE_DESTINATION_WIDTH VX_REMAP_DESTINATION_WIDTH +#define VX_REMAP_ATTRIBUTE_DESTINATION_HEIGHT VX_REMAP_DESTINATION_HEIGHT + +#define VX_ARRAY_ATTRIBUTE_ITEMTYPE VX_ARRAY_ITEMTYPE +#define VX_ARRAY_ATTRIBUTE_NUMITEMS VX_ARRAY_NUMITEMS +#define VX_ARRAY_ATTRIBUTE_CAPACITY VX_ARRAY_CAPACITY +#define VX_ARRAY_ATTRIBUTE_ITEMSIZE VX_ARRAY_ITEMSIZE + +#define VX_DELAY_ATTRIBUTE_TYPE VX_DELAY_TYPE +#define VX_DELAY_ATTRIBUTE_SLOTS VX_DELAY_SLOTS + +#define VX_INTERPOLATION_TYPE_AREA VX_INTERPOLATION_AREA +#define VX_INTERPOLATION_TYPE_BILINEAR VX_INTERPOLATION_BILINEAR +#define VX_INTERPOLATION_TYPE_NEAREST_NEIGHBOR VX_INTERPOLATION_NEAREST_NEIGHBOR + +#define VX_IMAGE_SIZE (VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x6) + +#define VX_META_FORMAT_ATTRIBUTE_DELTA_RECTANGLE (VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_META_FORMAT) + 0x0) +#define VX_HINT_SERIALIZE (VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_HINT) + 0x0) + +#define vx_import_type_e vx_memory_type_e +#define VX_ENUM_IMPORT_MEM VX_ENUM_MEMORY_TYPE +#define VX_IMPORT_TYPE_NONE VX_MEMORY_TYPE_NONE +#define VX_IMPORT_TYPE_HOST VX_MEMORY_TYPE_HOST + +#define VX_TYPE_OBJECT_MAX (VX_TYPE_WEIGHTS_BIASES_PARAMETER_BASE + 1) /*TODO: check it for OpenVX 1.2*/ +#define VX_TYPE_STRUCT_MAX VX_TYPE_KHRONOS_STRUCT_MAX + +#define VX_KERNEL_INVALID (VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x0) +#define VX_KERNEL_ACCUMULATE (VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x16) +#define VX_KERNEL_ACCUMULATE_WEIGHTED (VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x17) +#define VX_KERNEL_ACCUMULATE_SQUARE (VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x18) + +#define VX_THRESHOLD_THRESHOLD_VALUE (VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x1) +#define VX_THRESHOLD_THRESHOLD_LOWER 
(VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x2) +#define VX_THRESHOLD_THRESHOLD_UPPER (VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x3) +#define VX_THRESHOLD_TRUE_VALUE (VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x4) +#define VX_THRESHOLD_FALSE_VALUE (VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x5) +#define VX_THRESHOLD_DATA_TYPE (VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x6) + +#define VX_BIDIRECTIONAL (VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_DIRECTION) + 0x2) + +typedef vx_status(VX_CALLBACK *vx_kernel_input_validate_f)(vx_node node, vx_uint32 index); + +typedef vx_status(VX_CALLBACK *vx_kernel_output_validate_f)(vx_node node, vx_uint32 index, vx_meta_format meta); + +typedef struct _vx_delta_rectangle_t { + vx_int32 delta_start_x; /*!< \brief The change in the start x. */ + vx_int32 delta_start_y; /*!< \brief The change in the start y. */ + vx_int32 delta_end_x; /*!< \brief The change in the end x. */ + vx_int32 delta_end_y; /*!< \brief The change in the end y. */ +} vx_delta_rectangle_t; + +#ifdef __cplusplus +extern "C" { +#endif + +VX_API_ENTRY vx_kernel VX_API_CALL vxAddKernel(vx_context context, + const vx_char name[VX_MAX_KERNEL_NAME], + vx_enum enumeration, + vx_kernel_f func_ptr, + vx_uint32 numParams, + vx_kernel_input_validate_f input, + vx_kernel_output_validate_f output, + vx_kernel_initialize_f init, + vx_kernel_deinitialize_f deinit); + +VX_API_ENTRY vx_size VX_API_CALL vxComputeImagePatchSize(vx_image image, + const vx_rectangle_t *rect, + vx_uint32 plane_index); + +VX_API_ENTRY vx_status VX_API_CALL vxAccessImagePatch(vx_image image, + const vx_rectangle_t *rect, + vx_uint32 plane_index, + vx_imagepatch_addressing_t *addr, + void **ptr, + vx_enum usage); + +VX_API_ENTRY vx_status VX_API_CALL vxCommitImagePatch(vx_image image, + const vx_rectangle_t *rect, + vx_uint32 plane_index, + const vx_imagepatch_addressing_t *addr, + const void *ptr); + +VX_API_ENTRY vx_status VX_API_CALL vxAccessArrayRange(vx_array arr, vx_size start, vx_size end, vx_size *stride, void **ptr, vx_enum usage); + +VX_API_ENTRY vx_status VX_API_CALL vxCommitArrayRange(vx_array arr, vx_size start, vx_size end, const void *ptr); + +VX_API_ENTRY vx_status VX_API_CALL vxAccessDistribution(vx_distribution distribution, void **ptr, vx_enum usage); + +VX_API_ENTRY vx_status VX_API_CALL vxCommitDistribution(vx_distribution distribution, const void * ptr); + +VX_API_ENTRY vx_status VX_API_CALL vxAccessLUT(vx_lut lut, void **ptr, vx_enum usage); + +VX_API_ENTRY vx_status VX_API_CALL vxCommitLUT(vx_lut lut, const void *ptr); + +VX_API_ENTRY vx_status VX_API_CALL vxReadMatrix(vx_matrix mat, void *array); + +VX_API_ENTRY vx_status VX_API_CALL vxWriteMatrix(vx_matrix mat, const void *array); + +VX_API_ENTRY vx_status VX_API_CALL vxReadConvolutionCoefficients(vx_convolution conv, vx_int16 *array); + +VX_API_ENTRY vx_status VX_API_CALL vxWriteConvolutionCoefficients(vx_convolution conv, const vx_int16 *array); + +VX_API_ENTRY vx_status VX_API_CALL vxReadScalarValue(vx_scalar ref, void *ptr); + +VX_API_ENTRY vx_status VX_API_CALL vxWriteScalarValue(vx_scalar ref, const void *ptr); + +VX_API_ENTRY vx_status VX_API_CALL vxSetRemapPoint(vx_remap table, vx_uint32 dst_x, vx_uint32 dst_y, vx_float32 src_x,vx_float32 src_y); + +VX_API_ENTRY vx_status VX_API_CALL vxGetRemapPoint(vx_remap table, vx_uint32 dst_x, vx_uint32 dst_y, vx_float32 *src_x, vx_float32 *src_y); + +VX_API_ENTRY vx_threshold VX_API_CALL vxCreateThreshold(vx_context c, vx_enum thresh_type, vx_enum 
data_type); + +VX_API_ENTRY vx_node VX_API_CALL vxAccumulateImageNode(vx_graph graph, vx_image input, vx_image accum); + +VX_API_ENTRY vx_node VX_API_CALL vxAccumulateWeightedImageNode(vx_graph graph, vx_image input, vx_scalar alpha, vx_image accum); + +VX_API_ENTRY vx_node VX_API_CALL vxAccumulateSquareImageNode(vx_graph graph, vx_image input, vx_scalar shift, vx_image accum); + +VX_API_ENTRY vx_status VX_API_CALL vxuAccumulateImage(vx_context context, vx_image input, vx_image accum); + +VX_API_ENTRY vx_status VX_API_CALL vxuAccumulateWeightedImage(vx_context context, vx_image input, vx_scalar alpha, vx_image accum); + +VX_API_ENTRY vx_status VX_API_CALL vxuAccumulateSquareImage(vx_context context, vx_image input, vx_scalar shift, vx_image accum); + +#ifdef __cplusplus +} +#endif + +#endif /* VX_1_0_1_NAMING_COMPATIBILITY */ diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_ext_program.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_ext_program.h new file mode 100644 index 0000000..5c07070 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_ext_program.h @@ -0,0 +1,195 @@ +/**************************************************************************** +* +* Copyright 2017 - 2020 Vivante Corporation, Santa Clara, California. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* 'Software'), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sub license, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject +* to the following conditions: +* +* The above copyright notice and this permission notice (including the +* next paragraph) shall be included in all copies or substantial +* portions of the Software. +* +* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +* IN NO EVENT SHALL VIVANTE AND/OR ITS SUPPLIERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VX_EXT_PROGRAM_H_ +#define _VX_EXT_PROGRAM_H_ + +#include + +/***********************************************************************************/ + +#define VX_512BITS_DISABLE 0 +#define VX_512BITS_ADD 0x1 +#define VX_512BITS_SUBTRACT 0x2 +#define VX_512BITS_ACCUMULATOR 0x3 + +#define VX_512BITS_TYPE_FLOAT32 0x0 +#define VX_512BITS_TYPE_FLOAT16 0x1 +#define VX_512BITS_TYPE_SIGNED32 0x2 +#define VX_512BITS_TYPE_SIGNED16 0x3 +#define VX_512BITS_TYPE_SIGNED8 0x4 +#define VX_512BITS_TYPE_UNSIGNED32 0x5 +#define VX_512BITS_TYPE_UNSIGNED16 0x6 +#define VX_512BITS_TYPE_UNSIGNED8 0x7 + +#define VX_512BITS_SELECT_SRC0 0 +#define VX_512BITS_SELECT_SRC1 1 +#define VX_512BITS_SELECT_CONSTANTS 2 + +typedef union _vx_512bits_bin_t +{ + vx_uint8 bin8[16]; + vx_uint16 bin16[8]; + vx_uint32 bin32[4]; +} +vx_512bits_bin_t; + +typedef union _vx_512bits_config_t +{ + struct + { + vx_uint32 flag0 :2; + vx_uint32 flag1 :2; + vx_uint32 flag2 :2; + vx_uint32 flag3 :2; + vx_uint32 flag4 :2; + vx_uint32 flag5 :2; + vx_uint32 flag6 :2; + vx_uint32 flag7 :2; + vx_uint32 flag8 :2; + vx_uint32 flag9 :2; + vx_uint32 flag10:2; + vx_uint32 flag11:2; + vx_uint32 flag12:2; + vx_uint32 flag13:2; + vx_uint32 flag14:2; + vx_uint32 flag15:2; + } + bin2; + + struct + { + vx_uint32 flag0 :4; + vx_uint32 flag1 :4; + vx_uint32 flag2 :4; + vx_uint32 flag3 :4; + vx_uint32 flag4 :4; + vx_uint32 flag5 :4; + vx_uint32 flag6 :4; + vx_uint32 flag7 :4; + } + bin4; +} +vx_512bits_config_t; + +typedef struct _vx_512bits_miscconfig_t +{ + vx_uint32 post_shift :5; /*[0:4]*/ + vx_uint32 resolve1 :3; /*[5:7]*/ + vx_uint32 constant_type :3; /*[8:10]*/ + vx_uint32 resolve2 :1; /*[11:11]*/ + vx_uint32 accu_type :3; /*[12:14]*/ + vx_uint32 resolve3 :17;/*[15:31]*/ +} +vx_512bits_miscconfig_t; + +typedef struct _vx_512bits_t +{ + vx_512bits_config_t termConfig; + vx_512bits_config_t aSelect; + vx_512bits_config_t aBin[2]; + vx_512bits_config_t bSelect; + vx_512bits_config_t bBin[2]; + vx_512bits_miscconfig_t miscConfig; + vx_512bits_bin_t bins[2]; +} +vx_512bits_t; + +/***********************************************************************************/ + +typedef enum vx_ext_program_type_e +{ + VX_TYPE_PROGRAM = 0x900 +} +vx_ext_program_type_e; + +typedef enum vx_program_attribute_e +{ + VX_PROGRAM_ATTRIBUTE_BUILD_LOG = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_PROGRAM) + 0x0, +} +vx_program_attribute_e; + +typedef enum vx_ext_node_attribute_e +{ + VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_NODE) + 0x0, +} +vx_ext_node_attribute_e; + +#define VX_MAX_WORK_ITEM_DIMENSIONS 3 + +typedef struct _vx_kernel_execution_parameters { + vx_uint32 workDim; + vx_size globalWorkOffset[VX_MAX_WORK_ITEM_DIMENSIONS]; + vx_size globalWorkScale[VX_MAX_WORK_ITEM_DIMENSIONS]; + vx_size localWorkSize[VX_MAX_WORK_ITEM_DIMENSIONS]; + vx_size globalWorkSize[VX_MAX_WORK_ITEM_DIMENSIONS]; +} vx_kernel_execution_parameters_t; + +typedef struct _vx_program * vx_program; + +#define VX_BUILD_SUCCESS 0 +#define VX_BUILD_NONE -1 +#define VX_BUILD_ERROR -2 +#define VX_BUILD_IN_PROGRESS -3 + +#if defined(__cplusplus) +extern "C" { +#endif + + +VX_API_ENTRY vx_program VX_API_CALL vxCreateProgramWithSource( + vx_context context, vx_uint32 count, const vx_char * strings[], vx_size lengths[]); + +VX_API_ENTRY vx_program VX_API_CALL vxCreateProgramWithBinary( + vx_context context, const vx_uint8 * binary, vx_size size); + +VX_API_ENTRY vx_status 
VX_API_CALL vxReleaseProgram(vx_program *program); + +VX_API_ENTRY vx_status VX_API_CALL vxBuildProgram(vx_program program, const vx_char * options); + + +VX_API_ENTRY vx_status VX_API_CALL vxQueryProgram(vx_program program, vx_enum attribute, void *ptr, vx_size size); + +VX_API_ENTRY vx_kernel VX_API_CALL vxAddKernelInProgram( + vx_program program, vx_char name[VX_MAX_KERNEL_NAME], vx_enum enumeration, vx_uint32 num_params, vx_kernel_validate_f validate, + vx_kernel_initialize_f initialize, vx_kernel_deinitialize_f deinitialize); + +VX_API_ENTRY vx_status VX_API_CALL vxSetNodeUniform(vx_node node, const vx_char * name, vx_size count, void * value); + +VX_API_ENTRY vx_status VX_API_CALL vxSetChildGraphOfNode(vx_node node, vx_graph graph); + +VX_API_ENTRY vx_graph VX_API_CALL vxGetChildGraphOfNode(vx_node node); + +VX_API_ENTRY vx_status VX_API_CALL vxSetArrayAttribute(vx_array array, vx_enum attribute, void *ptr, vx_size size); + +VX_API_ENTRY vx_status VX_API_CALL vxSelectKernelSubname(vx_node node, const vx_char * subname); + +#if defined(__cplusplus) +} +#endif + +#endif /* __GC_VX_PROGRAM_H__ */ diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_ext_target.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_ext_target.h new file mode 100644 index 0000000..d5d420f --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_ext_target.h @@ -0,0 +1,135 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _VX_EXT_TARGET_H_ +#define _VX_EXT_TARGET_H_ + +#include + +/*! \file + * \brief The OpenVX Target API Definition + */ + +/*! \brief The extension name. + * \ingroup group_target + */ +#define OPENVX_EXT_TARGET "vx_ext_target" + +/*! \brief Defines the maximum number of characters in a target string. + * \ingroup group_target + */ +#define VX_MAX_TARGET_NAME (64) + +enum vx_ext_target_context_attribute_e { + /*! \brief Used to query the context for the number of active targets. Use a \ref vx_uint32 parameter. */ + VX_CONTEXT_TARGETS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0xE, +}; + +/*! \brief An abstract handle to a target. + * \ingroup group_target + */ +typedef struct _vx_target *vx_target; + +/*! \brief The target attributes list + * \ingroup group_target + */ +enum vx_target_attribute_e { + /*! \brief Returns the index of the given target. Use a \ref vx_uint32 parameter.*/ + VX_TARGET_ATTRIBUTE_INDEX = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_TARGET) + 0x0, + /*! \brief Returns the name of the given target in the format "vendor.vendor_string". + * Use a \ref vx_char[\ref VX_MAX_TARGET_NAME] array + */ + VX_TARGET_ATTRIBUTE_NAME = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_TARGET) + 0x1, + /*! \brief Returns the number of kernels that the target is capable of processing. + * This is then used to allocate a table which is then filled when \ref vxQueryTarget + * is called with \ref VX_TARGET_ATTRIBUTE_KERNELTABLE. + * Use a \ref vx_uint32 parameter. 
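+ *
+ * An illustrative two-step query (a sketch only; it assumes a valid vx_target obtained from
+ * \ref vxGetTargetByIndex or \ref vxGetTargetByName, uses the core vx_kernel_info_t type,
+ * and omits error checking):
+ * \code
+ * vx_uint32 num_kernels = 0;
+ * vxQueryTarget(target, VX_TARGET_ATTRIBUTE_NUMKERNELS, &num_kernels, sizeof(num_kernels));
+ * vx_kernel_info_t *table = (vx_kernel_info_t *)calloc(num_kernels, sizeof(vx_kernel_info_t));
+ * if (table != NULL)
+ * {
+ *     vxQueryTarget(target, VX_TARGET_ATTRIBUTE_KERNELTABLE, table,
+ *                   num_kernels * sizeof(vx_kernel_info_t));
+ *     free(table);
+ * }
+ * \endcode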
+ */ + VX_TARGET_ATTRIBUTE_NUMKERNELS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_TARGET) + 0x2, + /*! \brief Returns the table of all the kernels that a given target can execute. + * Use a vx_kernel_info_t array. + * \pre You must call \ref vxQueryTarget with \ref VX_TARGET_ATTRIBUTE_NUMKERNELS + * to compute the necessary size of the array. + */ + VX_TARGET_ATTRIBUTE_KERNELTABLE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_TARGET) + 0x3, +}; + +#if defined(__cplusplus) +extern "C" { +#endif + +/*! \brief Used to retrieve a target reference by the index of the target. + * \param [in] context The reference to the overall context. + * \param [in] index The index of the target to get a reference to. + * \return \ref vx_target + * \retval 0 Invalid index. + * \retval * A target reference. + * \note Use \ref vxQueryContext with \ref VX_CONTEXT_NUMTARGETS to retrieve the upper limit of targets. + * \ingroup group_target + */ +VX_API_ENTRY vx_target VX_API_CALL vxGetTargetByIndex(vx_context context, vx_uint32 index); + +/*! \brief Used to get a reference to named target when the name is known beforehand. + * \param [in] context The reference to the overall context. + * \param [in] name The target string name. + * \return \ref vx_target + * \retval 0 Invalid index. + * \retval * A target reference. + * \ingroup group_target + */ +VX_API_ENTRY vx_target VX_API_CALL vxGetTargetByName(vx_context context, const vx_char *name); + +/*! \brief Releases a reference to a target object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] target The pointer to the target to release. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If target is not a \ref vx_target. + * \note After returning from this function the reference will be zeroed. + * \ingroup group_target + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseTarget(vx_target *target); + +/*! \brief Used to query the target about it's properties. + * \param [in] target The reference to the target. + * \param [in] attribute The \ref vx_target_attribute_e value to query for. + * \param [out] ptr The location at which the resulting value will be stored. + * \param [in] size The size of the container to which ptr points. + * \return A \ref vx_status_e enumeration. + * \pre \ref vxGetTargetByName or \ref vxGetTargetByIndex + * \ingroup group_target + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryTarget(vx_target target, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Used to assign target affinity to a node. + * \note This assignment overrides implementation chosen behavior. + * \param [in] node The node reference to assign affinity to. + * \param [in] target The reference to the target to execute the Node on. + * \pre \ref vxGetTargetByName or \ref vxGetTargetByIndex + * \return A \ref vx_status_e enumeration. + * \ingroup group_target + * \pre vxCreateGenericNode or some other node creation function. + * \retval VX_ERROR_INVALID_REFERENCE Either node or target was not a valid reference. + * \retval VX_ERROR_NOT_SUPPORTED The node can not be executed on that target. 
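+ *
+ * A minimal usage sketch (target index 0 is illustrative only; a real application would
+ * select the target by name or by inspecting its attributes):
+ * \code
+ * vx_target target = vxGetTargetByIndex(context, 0);
+ * if (vxGetStatus((vx_reference)target) == VX_SUCCESS)
+ * {
+ *     vxAssignNodeAffinity(node, target);
+ *     vxReleaseTarget(&target);
+ * }
+ * \endcode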
+ */ +VX_API_ENTRY vx_status VX_API_CALL vxAssignNodeAffinity(vx_node node, vx_target target); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_helper.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_helper.h new file mode 100644 index 0000000..33f7307 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_helper.h @@ -0,0 +1,293 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _VX_HELPER_H_ +#define _VX_HELPER_H_ + +#include + +/*! \file + * \brief The OpenVX Helper Library Interface. + * + * \defgroup group_helper OpenVX Helper + * \brief The helper is an non-standardized set of convenience constructs for OpenVX. + * \details These functions use only the OpenVX API in order to implement their + * functionality. As such structures, objects, defines, typedefs and functions + * defined herein are not part of the OpenVX standard, and are + * included as EXAMPLE code only. + */ + +/*! \brief A definition for TAU, or 2*PI. + * \ingroup group_helper + */ +#define VX_TAU 6.28318530717958647692 + +/*! \brief Maximum number of supported entries. + * \ingroup group_helper + */ +#ifndef VX_MAX_LOG_NUM_ENTRIES +#define VX_MAX_LOG_NUM_ENTRIES (1024) +#endif + +#ifndef dimof +/*! \brief A helper macro to determine the number of elements in an array. + * \ingroup group_helper + */ +#define dimof(x) (sizeof(x)/sizeof(x[0])) +#endif + +/*! \brief Contains everything needed to abstractly describe a parameter to a kernel. This is used to + * declare kernel parameters at compile time. + * \ingroup group_helper + */ +typedef struct _vx_param_description_t { + vx_enum direction; /*!< \brief From \ref vx_direction_e */ + vx_enum data_type; /*!< \brief From \ref vx_type_e */ + vx_enum state; /*!< \brief From \ref vx_parameter_state_e */ +} vx_param_description_t; + + +/*! \brief Contains everything needed to abstractly describe a kernel. + * This is used to declare kernels at compile time. + * \ingroup group_helper + */ +typedef struct _vx_kernel_description_t { + /*! \brief The vx_kernel_e enum */ + vx_enum enumeration; + /*! \brief The name that kernel will be used with \ref vxGetKernelByName. */ + vx_char name[VX_MAX_KERNEL_NAME]; + /*! \brief The pointer to the function to execute the kernel */ + vx_kernel_f function; + /*! \brief The pointer to the array of parameter descriptors */ + vx_param_description_t *parameters; + /*! \brief The number of paraemeters in the array. */ + vx_uint32 numParams; + /*! \brief The parameters validator */ + vx_kernel_validate_f validate; + /*! \brief The input validator (deprecated in openvx 1.1) */ + void* input_validate; + /*! \brief The output validator (deprecated in openvx 1.1) */ + void* output_validate; + /*! \brief The initialization function */ + vx_kernel_initialize_f initialize; + /*! \brief The deinitialization function */ + vx_kernel_deinitialize_f deinitialize; +} vx_kernel_description_t; + +/*! 
\brief A log entry contains the graph reference, a status and a message. + * \ingroup group_helper + */ +typedef struct _vx_log_entry_t { + /*! \brief The status code */ + vx_status status; + /*! \brief The reference to which the message and status pertains. */ + vx_reference reference; + /*! \brief This indicates if the log entry is valid/active or not. */ + vx_enum active; + /*! \brief The message given to the log from OpenVX. This may be an empty string. */ + char message[VX_MAX_LOG_MESSAGE_LEN]; +} vx_log_entry_t; + +/*! \brief The log of a graph + * \ingroup group_helper + */ +typedef struct _vx_log_t { + vx_int32 first; /*!< Inclusive */ + vx_int32 last; /*!< Exclusive */ + vx_uint32 count; /*!< == VX_MAX_LOG_NUM_ENTRIES */ + /*! \brief The set of all log entries. */ + vx_log_entry_t entries[VX_MAX_LOG_NUM_ENTRIES]; +} vx_log_t; + +#define FGETS(str, fh) \ +{ \ + char* success = fgets(str, sizeof(str), fh); \ + if (!success) \ + { \ + printf("fgets failed\n"); \ + } \ +} + +#ifdef __cplusplus +extern "C" { +#endif + +uint32_t math_gcd(uint32_t a, uint32_t b); + + +/*! \brief Returns the previous entry of the log. When called consecutively it + * will return the entire log. The log will be cleared by reading it. + * \param [in] ref The reference to filter the log entries against. + * If the context is given, the next entry will be returned. + * \param [out] message A predefined location to store a copy of the log's + * message value. + * This must point to at least \ref VX_MAX_LOG_MESSAGE_LEN bytes of characters. + * \return Returns the status of the log entry from \ref vx_status_e. + * \ingroup group_helper + * \note The API returns errors oldest to newest order. + * When VX_SUCCESS is returned, the log reading is complete. + */ +vx_status vxGetLogEntry(vx_reference ref, char message[VX_MAX_LOG_MESSAGE_LEN]); + +/*! \brief This enables the helper library logging feature to take over the error + * log callback and keep a database of previous log entries. + * \ingroup group_helper + */ +void vxRegisterHelperAsLogReader(vx_context context); + +/*! + * \brief A method to construct a node via arbitrary parameters and an enum. + * \param [in] graph The handle to desired graph to add the node to. + * \param [in] kernelenum The \ref vx_kernel_e enum value used to create a node. + * \param [in] params The array of parameter information. + * \param [in] num The number of elements in params. + * \return vx_node + * \retval 0 Indicates a failure. + * \ingroup group_helper + */ +vx_node vxCreateNodeByStructure(vx_graph graph, + vx_enum kernelenum, + vx_reference params[], + vx_uint32 num); + +/*! \brief A method to clear out the log for a particular reference, such as a graph. + * \param [in] ref The reference to remove from the log. + * \ingroup group_helper + */ +void vxClearLog(vx_reference ref); + +/*! \brief This is used to connect one node parameter to another node parameter + * when the original handles to the data objects are already lost. + * The context determines if a buffer is necessary or can be optimized out. + * \param [in] a The first parameter + * \param [in] b The second parameter + * \note a or b must be an output parameter and other other an input. + * \return Returns a status code. + * \ingroup group_helper + */ +vx_status vxLinkParametersByReference(vx_parameter a, vx_parameter b); + +/*! \brief This is used to connect one parameter to another parameter by + * explicity indexing when the handles to the data objects are lost. 
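+ *
+ * Illustrative sketch (the parameter indices here are hypothetical and depend on the kernels
+ * involved; as with \ref vxLinkParametersByReference, one index should refer to an output
+ * parameter and the other to an input):
+ * \code
+ * vx_status status = vxLinkParametersByIndex(producer_node, 2, consumer_node, 0);
+ * \endcode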
+ * \param [in] node_a The source node to link from. + * \param [in] index_a The index of the \ref vx_parameter to link from. + * \param [in] node_b The sink node to link to. + * \param [in] index_b The index of the \ref vx_parameter to link to. + * \return Returns a status code. + * \ingroup group_helper + */ +vx_status vxLinkParametersByIndex(vx_node node_a, vx_uint32 index_a, vx_node node_b, vx_uint32 index_b); + +/*! \brief This helper is used to easily set the affine matrix to a rotation and scale. + * \param [in] matrix The handle to the matrix. + * \param [in] angle The rotation angle in degrees. + * \param [in] scale The scaling value. Values less than one are enlarging. + * \param [in] center_x The center pixel in the x direction. + * \param [in] center_y The center pixel in the y direction. + * \return Returns a \ref vx_status_e enumeration. + * \ingroup group_helper + */ +vx_status vxSetAffineRotationMatrix(vx_matrix matrix, + vx_float32 angle, + vx_float32 scale, + vx_float32 center_x, + vx_float32 center_y); + +/*! \brief [Helper] This function changes the points of a rectangle by some + * delta value per coordinate. + * \param [in] rect The rectangle to modify. + * \param [in] dsx The start x delta. + * \param [in] dsy The start y delta. + * \param [in] dex The end x delta. + * \param [in] dey The end y delta. + * \return vx_status + * \retval VX_SUCCESS Modified rectangle. + * \retval VX_ERROR_INVALID_REFERENCE Not a valid rectangle. + * \ingroup group_helper + */ +vx_status vxAlterRectangle(vx_rectangle_t *rect, + vx_int32 dsx, + vx_int32 dsy, + vx_int32 dex, + vx_int32 dey); + +/*! \brief Adds a parameter to a graph by indicating the source node, and the + * index of the parameter on the node. + * \param [in] g The graph handle. + * \param [in] n The node handle. + * \param [in] index The index of the parameter on the node. + * \return Returns a \ref vx_status_e enumeration. + * \ingroup group_helper + */ +vx_status vxAddParameterToGraphByIndex(vx_graph g, vx_node n, vx_uint32 index); + +#if defined(EXPERIMENTAL_USE_TARGET) +/*! \brief Finds all targets which report that they implement a particular kernel by name. + * \param [in] context The overall context. + * \param [in] kname The name of the kernel to find. + * \param [in,out] targets The array of pointers to character arrays. Each index will + * be modified. If the kernel does not exist on the target, the name will be zeroed. + * If the kernel does exist on the target, the name of the target will be filled in. + * \pre targets must be a preallocated array of vx_char pointers to + * \ref VX_MAX_TARGET_NAME characters with number of elements equal to + * the number of targets in the implementation. + * \ingroup group_helper + */ +vx_bool vxFindAllTargetsOfKernelsByName(vx_context context, vx_char kname[VX_MAX_KERNEL_NAME], vx_char *targets[]); + +/*! \brief Allocates and returns a list of all available targets in a context. + * \param [in] context The overall context. + * \param [out] targets A pointer to variable to hold the array of target strings. + * \param [out] num_targets A pointer to a variable to hold the number of targets found. + * \ingroup group_helper + */ +vx_bool vxCreateListOfAllTargets(vx_context context, vx_char **targets[], vx_uint32 *num_targets); + +/*! \brief Free the array of target name strings. + * \param [in,out] targets The pointer to the variable that holds the array of strings. This variable will be set + * to NULL after this call. 
+ * \param [in] num_targets The number of targets in the system. + * \ingroup group_helper + */ +void vxDestroyListOfAllTargets(vx_char **targets[], vx_uint32 num_targets); + +#endif + +/*! \brief Find the overlapping rectange between two rectangles. + * \ingroup group_helper + */ +vx_bool vxFindOverlapRectangle(vx_rectangle_t *rect_a, vx_rectangle_t *rect_b, vx_rectangle_t *rect_res); + +/*! \brief Read a rectangle-shaped section of an image into a 2D array. + * \ingroup group_helper + */ +void vxReadRectangle(const void *base, + const vx_imagepatch_addressing_t *addr, + const vx_border_t *borders, + vx_df_image type, + vx_uint32 center_x, + vx_uint32 center_y, + vx_uint32 radius_x, + vx_uint32 radius_y, + void *destination); + +#ifdef __cplusplus +} +#endif + +#endif /* _VX_HELPER_H_ */ + diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h new file mode 100644 index 0000000..e982437 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h @@ -0,0 +1,498 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _OPENVX_KERNELS_H_ +#define _OPENVX_KERNELS_H_ + +/*! + * \file + * \brief The list of supported kernels in the OpenVX standard. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * \brief The standard list of available libraries + * \ingroup group_kernel + */ +enum vx_library_e { + /*! \brief The base set of kernels as defined by Khronos. */ + VX_LIBRARY_KHR_BASE = 0x0, +}; + +/*! + * \brief The standard list of available vision kernels. + * + * Each kernel listed here can be used with the \ref vxGetKernelByEnum call. + * When programming the parameters, use + * \arg \ref VX_INPUT for [in] + * \arg \ref VX_OUTPUT for [out] + * \arg \ref VX_BIDIRECTIONAL for [in,out] + * + * When programming the parameters, use + * \arg \ref VX_TYPE_IMAGE for a \ref vx_image in the size field of \ref vxGetParameterByIndex or \ref vxSetParameterByIndex * \arg \ref VX_TYPE_ARRAY for a \ref vx_array in the size field of \ref vxGetParameterByIndex or \ref vxSetParameterByIndex * \arg or other appropriate types in \ref vx_type_e. + * \ingroup group_kernel + */ +enum vx_kernel_e { + + /*! + * \brief The Color Space conversion kernel. + * \details The conversions are based on the \ref vx_df_image_e code in the images. + * \see group_vision_function_colorconvert + */ + VX_KERNEL_COLOR_CONVERT = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x1, + + /*! + * \brief The Generic Channel Extraction Kernel. + * \details This kernel can remove individual color channels from an interleaved + * or semi-planar, planar, sub-sampled planar image. A client could extract + * a red channel from an interleaved RGB image or do a Luma extract from a + * YUV format. + * \see group_vision_function_channelextract + */ + VX_KERNEL_CHANNEL_EXTRACT = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x2, + + /*! + * \brief The Generic Channel Combine Kernel. 
+ * \details This kernel combine multiple individual planes into a single + * multiplanar image of the type specified in the output image. + * \see group_vision_function_channelcombine + */ + VX_KERNEL_CHANNEL_COMBINE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x3, + + /*! \brief The Sobel 3x3 Filter Kernel. + * \see group_vision_function_sobel3x3 + */ + VX_KERNEL_SOBEL_3x3 = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x4, + + /*! + * \brief The Magnitude Kernel. + * \details This kernel produces a magnitude plane from two input gradients. + * \see group_vision_function_magnitude + */ + VX_KERNEL_MAGNITUDE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x5, + + /*! + * \brief The Phase Kernel. + * \details This kernel produces a phase plane from two input gradients. + * \see group_vision_function_phase + */ + VX_KERNEL_PHASE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x6, + + /*! + * \brief The Scale Image Kernel. + * \details This kernel provides resizing of an input image to an output image. + * The scaling factor is determined but the relative sizes of the input and + * output. + * \see group_vision_function_scale_image + */ + VX_KERNEL_SCALE_IMAGE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x7, + + /*! \brief The Table Lookup kernel + * \see group_vision_function_lut + */ + VX_KERNEL_TABLE_LOOKUP = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x8, + + /*! \brief The Histogram Kernel. + * \see group_vision_function_histogram + */ + VX_KERNEL_HISTOGRAM = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x9, + + /*! \brief The Histogram Equalization Kernel. + * \see group_vision_function_equalize_hist + */ + VX_KERNEL_EQUALIZE_HISTOGRAM = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0xA, + + /*! \brief The Absolute Difference Kernel. + * \see group_vision_function_absdiff + */ + VX_KERNEL_ABSDIFF = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0xB, + + /*! \brief The Mean and Standard Deviation Kernel. + * \see group_vision_function_meanstddev + */ + VX_KERNEL_MEAN_STDDEV = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0xC, + + /*! \brief The Threshold Kernel. + * \see group_vision_function_threshold + */ + VX_KERNEL_THRESHOLD = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0xD, + + /*! \brief The Integral Image Kernel. + * \see group_vision_function_integral_image + */ + VX_KERNEL_INTEGRAL_IMAGE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0xE, + + /*! \brief The dilate kernel. + * \see group_vision_function_dilate_image + */ + VX_KERNEL_DILATE_3x3 = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0xF, + + /*! \brief The erode kernel. + * \see group_vision_function_erode_image + */ + VX_KERNEL_ERODE_3x3 = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x10, + + /*! \brief The median image filter. + * \see group_vision_function_median_image + */ + VX_KERNEL_MEDIAN_3x3 = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x11, + + /*! \brief The box filter kernel. + * \see group_vision_function_box_image + */ + VX_KERNEL_BOX_3x3 = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x12, + + /*! \brief The gaussian filter kernel. + * \see group_vision_function_gaussian_image + */ + VX_KERNEL_GAUSSIAN_3x3 = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x13, + + /*! \brief The custom convolution kernel. + * \see group_vision_function_custom_convolution + */ + VX_KERNEL_CUSTOM_CONVOLUTION = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x14, + + /*! 
\brief The gaussian image pyramid kernel. + * \see group_vision_function_gaussian_pyramid + */ + VX_KERNEL_GAUSSIAN_PYRAMID = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x15, + + /*! \brief The min and max location kernel. + * \see group_vision_function_minmaxloc + */ + VX_KERNEL_MINMAXLOC = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x19, + + /*! \brief The bit-depth conversion kernel. + * \see group_vision_function_convertdepth + */ + VX_KERNEL_CONVERTDEPTH = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x1A, + + /*! \brief The Canny Edge Detector. + * \see group_vision_function_canny + */ + VX_KERNEL_CANNY_EDGE_DETECTOR = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x1B, + + /*! \brief The Bitwise And Kernel. + * \see group_vision_function_and + */ + VX_KERNEL_AND = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x1C, + + /*! \brief The Bitwise Inclusive Or Kernel. + * \see group_vision_function_or + */ + VX_KERNEL_OR = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x1D, + + /*! \brief The Bitwise Exclusive Or Kernel. + * \see group_vision_function_xor + */ + VX_KERNEL_XOR = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x1E, + + /*! \brief The Bitwise Not Kernel. + * \see group_vision_function_not + */ + VX_KERNEL_NOT = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x1F, + + /*! \brief The Pixelwise Multiplication Kernel. + * \see group_vision_function_mult + */ + VX_KERNEL_MULTIPLY = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x20, + + /*! \brief The Addition Kernel. + * \see group_vision_function_add + */ + VX_KERNEL_ADD = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x21, + + /*! \brief The Subtraction Kernel. + * \see group_vision_function_sub + */ + VX_KERNEL_SUBTRACT = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x22, + + /*! \brief The Warp Affine Kernel. + * \see group_vision_function_warp_affine + */ + VX_KERNEL_WARP_AFFINE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x23, + + /*! \brief The Warp Perspective Kernel. + * \see group_vision_function_warp_perspective + */ + VX_KERNEL_WARP_PERSPECTIVE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x24, + + /*! \brief The Harris Corners Kernel. + * \see group_vision_function_harris + */ + VX_KERNEL_HARRIS_CORNERS = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x25, + + /*! \brief The FAST Corners Kernel. + * \see group_vision_function_fast + */ + VX_KERNEL_FAST_CORNERS = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x26, + + /*! \brief The Optical Flow Pyramid (LK) Kernel. + * \see group_vision_function_opticalflowpyrlk + */ + VX_KERNEL_OPTICAL_FLOW_PYR_LK = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x27, + + /*! \brief The Remap Kernel. + * \see group_vision_function_remap + */ + VX_KERNEL_REMAP = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x28, + + /*! \brief The Half Scale Gaussian Kernel. + * \see group_vision_function_scale_image + */ + VX_KERNEL_HALFSCALE_GAUSSIAN = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x29, + + VX_KERNEL_MAX_1_0, /*!< \internal Used for VX1.0 bounds checking in the conformance test. */ + + /* kernel added in OpenVX 1.1 */ + + /*! \brief The Laplacian Image Pyramid Kernel. + * \see group_vision_function_laplacian_pyramid + */ + VX_KERNEL_LAPLACIAN_PYRAMID = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x2A, + + /*! \brief The Laplacian Pyramid Reconstruct Kernel. 
+ * \see group_vision_function_laplacian_pyramid + */ + VX_KERNEL_LAPLACIAN_RECONSTRUCT = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x2B, + + /*! \brief The Non Linear Filter Kernel. + * \see group_vision_function_nonlinear_filter + */ + VX_KERNEL_NON_LINEAR_FILTER = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x2C, + + VX_KERNEL_MAX_1_1, /*!< \internal Used for VX1.1 bounds checking in the conformance test. */ + + /* kernel added in OpenVX 1.2 */ + + /*! \brief The Match Template Kernel. + * \see group_vision_match_template + */ + VX_KERNEL_MATCH_TEMPLATE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x2D, + + /*! \brief The LBP Kernel. + * \see group_lbp + */ + VX_KERNEL_LBP = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x2E, + + /*! \brief The hough lines probability Kernel. + * \see group_vision_hough_lines_p + */ + VX_KERNEL_HOUGH_LINES_P = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x2F, + + /*! \brief The tensor multiply Kernel. + * \see group_vision_function_tensor_multiply + */ + VX_KERNEL_TENSOR_MULTIPLY = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x30, + + /*! \brief The tensor add Kernel. + * \see group_vision_function_tensor_add + */ + VX_KERNEL_TENSOR_ADD = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x31, + + /*! \brief The tensor subtract Kernel. + * \see group_vision_function_tensor_subtract + */ + VX_KERNEL_TENSOR_SUBTRACT = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x32, + + /*! \brief The tensor table look up Kernel. + * \see group_vision_function_tensor_tablelookup + */ + VX_KERNEL_TENSOR_TABLE_LOOKUP = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x33, + + /*! \brief The tensor transpose Kernel. + * \see group_vision_function_tensor_transpose + */ + VX_KERNEL_TENSOR_TRANSPOSE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x34, + + /*! \brief The tensor convert depth Kernel. + * \see group_vision_function_tensor_convert_depth + */ + VX_KERNEL_TENSOR_CONVERT_DEPTH = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x35, + + /*! \brief The tensor matrix multiply Kernel. + * \see group_vision_function_tensor_matrix_multiply + */ + VX_KERNEL_TENSOR_MATRIX_MULTIPLY = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x36, + + /*! \brief The data object copy kernel. + * \see group_vision_function_copy + */ + VX_KERNEL_COPY = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x37, + + /*! \brief The non-max suppression kernel. + * \see group_vision_function_nms + */ + VX_KERNEL_NON_MAX_SUPPRESSION = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x38, + + /*! \brief The scalar operation kernel. + * \see group_control_flow + */ + VX_KERNEL_SCALAR_OPERATION = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x39, + + /*! \brief The HOG features kernel. + * \see group_vision_function_hog + */ + VX_KERNEL_HOG_FEATURES = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x3A, + + /*! \brief The HOG Cells kernel. + * \see group_vision_function_hog + */ + VX_KERNEL_HOG_CELLS = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x3B, + + /*! \brief The bilateral filter kernel. + * \see group_vision_function_bilateral_filter + */ + VX_KERNEL_BILATERAL_FILTER = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x3C, + + /*! \brief The select kernel. + * \see group_control_flow + */ + VX_KERNEL_SELECT = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x3D, + + /* insert new kernels here */ + + /*! \brief The max kernel. 
+ * \see group_vision_function_max + */ + VX_KERNEL_MAX = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x3E, + /*! \brief The min kernel. + * \see group_vision_function_min + */ + VX_KERNEL_MIN = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x3F, + + /*! \brief The weigthed average kernel. + * \see group_vision_function_weighted_average + */ + VX_KERNEL_WEIGHTED_AVERAGE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_BASE) + 0x40, + + /* insert new kernels here */ + VX_KERNEL_NN_CONVOLUTION_RELU_POOLING_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x0, + + VX_KERNEL_NN_CONVOLUTION_RELU_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1, + + VX_KERNEL_NN_FULLY_CONNECTED_RELU_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x2, + + //VX_KERNEL_NN_SOFTMAX_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x3, + + //VX_KERNEL_NN_NORMALIZATION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x4, + + VX_KERNEL_NN_LRN_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x3, + + //VX_KERNEL_NN_NORMALIZE_IMAGE_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x4, + + //VX_KERNEL_NN_POOLING_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x7, + + //VX_KERNEL_NN_ACTIVATION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x9, + + VX_KERNEL_NN_LEAKY = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x4, + + VX_KERNEL_NN_BATCH_NORM = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x5, + + VX_KERNEL_NN_RPN = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x6, + + //VX_KERNEL_NN_ROIPOOL = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xD, + + VX_KERNEL_NN_CONCAT2_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x7, + + //VX_KERNEL_NN_CONVOLUTION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xF, + + VX_KERNEL_NN_CONCATINDEFINITE_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x8, + + VX_KERNEL_NN_REORG_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x9, + + //VX_KERNEL_NN_DECONVOLUTION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x12, + + VX_KERNEL_NN_TENSOR_DIV = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xA, + + VX_KERNEL_NN_L2NORMALIZE_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xB, + + VX_KERNEL_NN_TENSOR_COPY = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xC, + + VX_KERNEL_NN_CONVOLUTION_RELU_POOLING_LAYER2 = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xD, + + VX_KERNEL_NN_POOLING_LAYER2 = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xE, + + VX_KERNEL_NN_TENSOR_REDUCE_SUM = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0xF, + + VX_KERNEL_NN_TENSOR_PAD = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x10, + + VX_KERNEL_NN_LSTM_UNIT = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x11, + + VX_KERNEL_NN_LSTM_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x12, + + VX_KERNEL_NN_REORG2_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x13, + + VX_KERNEL_NN_TENSOR_ROUNDING = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x14, + + VX_KERNEL_NN_HASH_LUT_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x15, + + VX_KERNEL_NN_LSH_PROJECTION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x16, + + VX_KERNEL_NN_TENSOR_RESHPE = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x17, + + VX_KERNEL_NN_LUT2_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, 
VX_LIBRARY_KHR_BASE) + 0x18, + + VX_KERNEL_NN_TENSOR_SCALE = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x19, + + VX_KERNEL_NN_RNN_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1A, + + VX_KERNEL_NN_SOFTMAX2_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1B, + + VX_KERNEL_NN_SVDF_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1C, + + VX_KERNEL_NN_NORMALIZATION_LAYER2 = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1D, + + VX_KERNEL_NN_TENSOR_REVERSE = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1E, + + VX_KERNEL_NN_TENSOR_TRANSPOSE = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x1F, + + VX_KERNEL_NN_TENSOR_MEAN = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x20, + + VX_KERNEL_NN_TENSOR_SQUEEZE = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x21, + + VX_KERNEL_NN_TENSOR_STRIDE_SLICE = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x22, + + VX_KERNEL_NN_TENSOR_PAD2 = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x23, + + VX_KERNEL_NN_YUV2RGB_SCALE = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x24, + + VX_KERNEL_NN_PRELU = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x25, + + VX_KERNEL_NN_GRU_UNIT_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x26, + + VX_KERNEL_NN_GRU_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x27, + + VX_KERNEL_NN_CONV_LSTM_UNIT_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x28, + + VX_KERNEL_NN_CONV_LSTM_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x29, + + VX_KERNEL_NN_FULLY_CONNECTED_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x2A, + + VX_KERNEL_NN_L2NORMALIZE_LAYER2 = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x2B, + + VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _OPEN_VISION_LIBRARY_KERNELS_H_ */ diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_cnn.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_cnn.h new file mode 100644 index 0000000..17ac2e9 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_cnn.h @@ -0,0 +1,35 @@ +/**************************************************************************** +* +* Copyright 2017 - 2020 Vivante Corporation, Santa Clara, California. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* 'Software'), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sub license, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject +* to the following conditions: +* +* The above copyright notice and this permission notice (including the +* next paragraph) shall be included in all copies or substantial +* portions of the Software. +* +* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +* IN NO EVENT SHALL VIVANTE AND/OR ITS SUPPLIERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VX_KHR_CNN_H_ +#define _VX_KHR_CNN_H_ + +#define OPENVX_KHR_CNN "vx_khr_cnn" + +#include + +#endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h new file mode 100644 index 0000000..353e915 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h @@ -0,0 +1,75 @@ +/**************************************************************************** +* +* Copyright 2017 - 2020 Vivante Corporation, Santa Clara, California. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* 'Software'), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sub license, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject +* to the following conditions: +* +* The above copyright notice and this permission notice (including the +* next paragraph) shall be included in all copies or substantial +* portions of the Software. +* +* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +* IN NO EVENT SHALL VIVANTE AND/OR ITS SUPPLIERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef __VX_KHR_COMPATIBLE_H__ +#define __VX_KHR_COMPATIBLE_H__ +/* + VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS is used to distingush deconvolution weight layout + [value] + 0: weight_layout is whnc + 1: weight_layout is whcn +*/ +#define VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS 1 +/* + VX_CONVERT_POLICY_WRAP_ENABLE is used to differentiate two overflow_policys(VX_CONVERT_POLICY_WRAP and VX_CONVERT_POLICY_SAT) + [value] + 0: both overflow_policys considered as VX_CONVERT_POLICY_SAT + 1: overflow_policy is determined by arguments. 
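+
+ Illustrative compile-time guard in client code (a sketch, not part of this header): when
+ the macro is 0, a request for VX_CONVERT_POLICY_WRAP is treated as VX_CONVERT_POLICY_SAT,
+ so a caller that depends on wrap semantics can fail the build early:
+
+   #if !VX_CONVERT_POLICY_WRAP_ENABLE
+   #error "wrap overflow semantics are required by this application"
+   #endif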
+*/ +#define VX_CONVERT_POLICY_WRAP_ENABLE 1 + +#define VX_13_NN_COMPATIBLITY 1 +/* + VX_L2NORM_AXIS_PARAMETER_SUPPORT is used to declare that L2NORMALIZE can support axis parameter + [value] + 0: not support + 1: support +*/ +#define VX_L2NORM_AXIS_PARAMETER_SUPPORT 1 +/* + VX_SOFTMAX_AXIS_PARAMETER_SUPPORT is used to declare that SOFTAMX can support axis parameter + [value] + 0: not support + 1: support +*/ +#define VX_SOFTMAX_AXIS_PARAMETER_SUPPORT 1 +/* + VX_NORMALIZATION_AXIS_PARAMETER_SUPPORT is used to declare that NORMALIZATION can support axis parameter + [value] + 0: not support + 1: support +*/ +#define VX_NORMALIZATION_AXIS_PARAMETER_SUPPORT 1 +/* + VX_ACTIVATION_EXT_SUPPORT is used to declare that ACTIVATION can support swish and hswish + [value] + 0: not support + 1: support +*/ +#define VX_ACTIVATION_EXT_SUPPORT 1 + +#endif /* __VX_KHR_COMPATIBLE_H__ */ diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_dot.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_dot.h new file mode 100644 index 0000000..cf54dfd --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_dot.h @@ -0,0 +1,42 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _VX_KHR_DOT_H_ +#define _VX_KHR_DOT_H_ + +#define OPENVX_KHR_DOT "vx_khr_dot" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief Exports a single graph to a dotfile. + * \param [in] graph The graph to export. + * \param [in] dotfile The name of the file to write to. + * \param [in] showData If true, data objects will be listed in the graph too. + * \see http://www.graphviz.com + */ +vx_status vxExportGraphToDot(vx_graph g, vx_char dotfile[], vx_bool showData); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_icd.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_icd.h new file mode 100644 index 0000000..fc44049 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_icd.h @@ -0,0 +1,80 @@ +/* + + * Copyright (c) 2017-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file + * \defgroup group_icd OpenVX ICD Loader API + * \brief The OpenVX Installable Client Driver (ICD) Loader API. + * \details The vx_khr_icd extension provides a mechanism for vendors to implement Installable Client Driver (ICD) for OpenVX. The OpenVX ICD Loader API provides a mechanism for applications to access these vendor implementations. 
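+ *
+ * A minimal usage sketch (assumes a single platform is sufficient and keeps error checking
+ * short):
+ * \code
+ * vx_platform platforms[1];
+ * vx_size num_platforms = 0;
+ * if (vxIcdGetPlatforms(1, platforms, &num_platforms) == VX_SUCCESS && num_platforms > 0)
+ * {
+ *     vx_context context = vxCreateContextFromPlatform(platforms[0]);
+ *     if (vxGetStatus((vx_reference)context) == VX_SUCCESS)
+ *     {
+ *         vxReleaseContext(&context);
+ *     }
+ * }
+ * \endcode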
+ */ + +#ifndef _VX_KHR_ICD_H_ +#define _VX_KHR_ICD_H_ + +#include +#include + +/*! \brief Platform handle of an implementation. + * \ingroup group_icd + */ +typedef struct _vx_platform * vx_platform; + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief Queries list of available platforms. + * \param [in] capacity Maximum number of items that platform[] can hold. + * \param [out] platform[] List of platform handles. + * \param [out] pNumItems Number of platform handles returned. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_FAILURE If no platforms are found. + * \ingroup group_icd + */ +vx_status VX_API_CALL vxIcdGetPlatforms(vx_size capacity, vx_platform platform[], vx_size * pNumItems); + +/*! \brief Queries the platform for some specific information. + * \param [in] platform The platform handle. + * \param [in] attribute The attribute to query. Use one of the following: + * \ref VX_CONTEXT_VENDOR_ID, + * \ref VX_CONTEXT_VERSION, + * \ref VX_CONTEXT_EXTENSIONS_SIZE, + * \ref VX_CONTEXT_EXTENSIONS. + * \param [out] ptr The location at which to store the resulting value. + * \param [in] size The size in bytes of the container to which \a ptr points. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If the platform is not a \ref vx_platform. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * \retval VX_ERROR_NOT_SUPPORTED If the attribute is not supported on this implementation. + * \ingroup group_icd + */ +vx_status VX_API_CALL vxQueryPlatform(vx_platform platform, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Creates a \ref vx_context from a \ref vx_platform. + * \details This creates a top-level object context for OpenVX from a platform handle. + * \returns The reference to the implementation context \ref vx_context. Any possible errors + * preventing a successful creation should be checked using \ref vxGetStatus. + * \ingroup group_icd + */ +vx_context VX_API_CALL vxCreateContextFromPlatform(vx_platform platform); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_import_kernel.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_import_kernel.h new file mode 100644 index 0000000..373065e --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_import_kernel.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2012-2018 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. 
THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + */ + +#ifndef _OPENVX_IMPORT_KERNEL_H_ +#define _OPENVX_IMPORT_KERNEL_H_ + +#include + +/*! + * \file + * \brief The OpenVX import kernel extension API. + */ +#define OPENVX_KHR_IMPORT_KERNEL "vx_khr_import_kernel" + +/*! \brief The import kernel extension library set + * \ingroup group_import_kernel + */ +#define VX_LIBRARY_KHR_IMPORT_KERNEL_EXTENSION (0x5) + +/* +define type for vxImportKernelFromURL() function +*/ +#define VX_VIVANTE_IMPORT_KERNEL_FROM_FILE "vx_vivante_file" +#define VX_VIVANTE_IMPORT_KERNEL_FROM_FOLDER "vx_vivante_folder" +#define VX_VIVANTE_IMPORT_KERNEL_FROM_LABEL "vx_vivante_label" +#define VX_VIVANTE_IMPORT_KERNEL_FROM_POINTER "vx_vivante_pointer" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief Import a kernel from binary specified by URL. + * + * The name of kernel parameters can be queried using the vxQueryReference API + * with vx_parameter as ref and VX_REFERENCE_NAME as attribute. + * + * \param context [in] The OpenVX context + * \param type [in] Vendor-specific identifier that indicates to the implementation + * how to interpret the url. For example, if an implementation can interpret the url + * as a file, a folder a symbolic label, or a pointer, then a vendor may choose + * to use "vx__file", "vx__folder", "vx__label", and + * "vx__pointer", respectively for this field. Container types starting + * with "vx_khr_" are reserved. Refer to vendor documentation for list of + * container types supported + * \param url [in] URL to binary container. + * + * \retval On success, a valid vx_kernel object. Calling vxGetStatus with the return value + * as a parameter will return VX_SUCCESS if the function was successful. + * + * \ingroup group_import_kernel + */ +VX_API_ENTRY vx_kernel VX_API_CALL vxImportKernelFromURL( + vx_context context, + const vx_char * type, + const vx_char * url + ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_interp.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_interp.h new file mode 100644 index 0000000..f3e7b95 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_interp.h @@ -0,0 +1,38 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _VX_KHR_INTERP_H_ +#define _VX_KHR_INTERP_H_ + +/*! \brief The Interpolation Type Query Extension. 
+ * \file + */ + +#define OPENVX_KHR_INTERP "vx_khr_interpolation" + +#include + +/*! \brief Additional interpolation types */ +enum vx_interpolation_type_ext_e { + /*! \brief Bicubic interpolation method */ + VX_INTERPOLATION_BICUBIC = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x3, + /*! \brief Mipmapping interpolation method */ + VX_INTERPOLATION_MIPMAP = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x4, +}; + +#endif + diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h new file mode 100644 index 0000000..fb15140 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h @@ -0,0 +1,2101 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _VX_KHR_NN_H_ +#define _VX_KHR_NN_H_ + +/*! + * \file + * \brief The Khronos Extension for Deep Convolutional Networks Functions. + * + * \defgroup group_cnn Extension: Deep Convolutional Networks API + * \brief Convolutional Network Nodes. + */ + +#define OPENVX_KHR_NN "vx_khr_nn" + +#include +#include + + +#ifdef __cplusplus +extern "C" { +#endif + +/*TODO: check it for OpenVX 1.2*/ +//#if defined(OPENVX_CNN_1_0) +//#undef OPENVX_CNN_1_1 +//#endif + +enum vx_context_attribute_internal_type_e +{ + VX_CONTEXT_DEVICE_COUNT_VIV = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_CONTEXT) + 0x0, +}; + +enum vx_graph_attribute_internal_type_e +{ + VX_GRAPH_DEVICE_INDEX_VIV = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x0, +}; + +/*! \brief Size Alignment of User Memory + * \0x40 64Byte Align + * \0x1000 4k Align + */ +#define VX_WRAP_USER_MEMORY_SIZE_ALIGNMENT (0x40) + +/*! \brief OpenVX Version Compatibility set*/ +#define VX_KHR_COMPATIBILITY (0x1) + +/*============================================================================== +CONVOLUTIONAL_NETWORK structs and enums +=============================================================================*/ +/*! \brief The Neural Network Extension Library Set + * \ingroup group_cnn + */ +#define VX_LIBRARY_KHR_NN_EXTENSION (0x1) + +/*! \brief The list of Neural Network Extension Kernels. + * \ingroup group_cnn + */ +enum vx_kernel_nn_ext_e { + /*! \brief The Neural Network Extension convolution Kernel. + * \see group_cnn + */ + VX_KERNEL_CONVOLUTION_LAYER = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_NN_EXTENSION) + 0x0, + /*! \brief The Neural Network Extension fully connected Kernel. + * \see group_cnn + */ + VX_KERNEL_FULLY_CONNECTED_LAYER = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_NN_EXTENSION) + 0x1, + /*! \brief The Neural Network Extension pooling Kernel. + * \see group_cnn + */ + VX_KERNEL_POOLING_LAYER = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_NN_EXTENSION) + 0x2, + /*! \brief The Neural Network Extension softmax Kernel. + * \see group_cnn + */ + VX_KERNEL_SOFTMAX_LAYER = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_NN_EXTENSION) + 0x3, + /*! \brief The Neural Network Extension normalization Kernel. 
+ * \see group_cnn + */ + VX_KERNEL_NORMALIZATION_LAYER = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_NN_EXTENSION) + 0x4, + /*! \brief The Neural Network Extension activation Kernel. + * \see group_cnn + */ + VX_KERNEL_ACTIVATION_LAYER = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_NN_EXTENSION) + 0x5, + /*! \brief The Neural Network POI Pooling Kernel. + * \see group_cnn + */ + VX_KERNEL_ROI_POOLING_LAYER = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_NN_EXTENSION) + 0x6, + /*! \brief The Neural Network Extension Deconvolution Kernel. + * \see group_cnn + */ + VX_KERNEL_DECONVOLUTION_LAYER = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_NN_EXTENSION) + 0x7, + /*! \brief The Neural Network Extension local response normalization Kernel (with bias). + * \see group_cnn + */ + VX_KERNEL_LOCAL_RESPONSE_NORMALIZATION_LAYER = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_NN_EXTENSION) + 0x8, +}; + +/*! \brief NN extension type enums. + * \ingroup group_cnn + */ +enum vx_nn_enum_e +{ + VX_ENUM_NN_ROUNDING_TYPE = 0x1A, + VX_ENUM_NN_POOLING_TYPE = 0x1B, + VX_ENUM_NN_NORMALIZATION_TYPE = 0x1C, + VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE = 0x1D, + /* 0x1E, 0x1F and 0x20 are reserved for VX_ENUM_CLASSIFIER_MODEL, VX_ENUM_IX_USE and VX_ENUM_SCALAR_OPERATION*/ + VX_ENUM_NN_LAYER_TYPE = 0x21, +}; + +/*! \brief down scale rounding. + * \details Due to different scheme of downscale size calculation in the various training frameworks. Implementation must support 2 rounding methods for down scale calculation. + * The floor and the ceiling. In convolution and pooling functions. + * Relevant when input size is even. + * \ingroup group_cnn + */ +enum vx_nn_rounding_type_e +{ + /*! \brief floor rounding */ + VX_NN_DS_SIZE_ROUNDING_FLOOR = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_ROUNDING_TYPE) + 0x0, + /*! \brief ceil rounding */ + VX_NN_DS_SIZE_ROUNDING_CEILING = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_ROUNDING_TYPE) + 0x1 +}; + + +/*! \brief The Neural Network pooling type list. + * \details kind of pooling done in pooling function + * \ingroup group_cnn + */ +enum vx_nn_pooling_type_e +{ + /*! \brief max pooling*/ + VX_NN_POOLING_MAX = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_POOLING_TYPE) + 0x0, + /*! \brief average pooling*/ + VX_NN_POOLING_AVG = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_POOLING_TYPE) + 0x1, + /*! \brief l2 pooling*/ + VX_NN_POOLING_L2 = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_POOLING_TYPE) + 0x0, + /*! \brief average pooling for android*/ + VX_NN_POOLING_AVG_ANDROID = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_POOLING_TYPE) + 0x1, +}; + + +/*! \brief The Neural Network normalization type list. + * \ingroup group_cnn + */ +enum vx_nn_norm_type_e +{ + /*! \brief normalization is done on same IFM*/ + VX_NN_NORMALIZATION_SAME_MAP = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_NORMALIZATION_TYPE) + 0x0, + /*! \brief Normalization is done across different IFMs*/ + VX_NN_NORMALIZATION_ACROSS_MAPS = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_NORMALIZATION_TYPE) + 0x1, +}; + + +/*! \brief The Neural Network activation functions list. + * \details + * + *
+ *   Function name      | Mathematical definition            | Parameters | Parameters type
+ *   ------------------ | ---------------------------------- | ---------- | ---------------
+ *   logistic           | \f$f(x)=1/(1+e^{-x})\f$            |            |
+ *   hyperbolic tangent | \f$f(x)=a\cdot tanh(b\cdot x)\f$   | a, b       | VX_FLOAT32
+ *   relu               | \f$f(x)=max(0,x)\f$                |            |
+ *   bounded relu       | \f$f(x)=min(a,max(0,x))\f$         | a          | VX_FLOAT32
+ *   soft relu          | \f$f(x)=log(1+e^{x})\f$            |            |
+ *   abs                | \f$f(x)=\mid x\mid\f$              |            |
+ *   square             | \f$f(x)=x^2\f$                     |            |
+ *   square root        | \f$f(x)=\sqrt{x}\f$                |            |
+ *   linear             | \f$f(x)=ax+b\f$                    | a, b       | VX_FLOAT32
+ * \ingroup group_cnn + */ +enum vx_nn_activation_function_e +{ + VX_NN_ACTIVATION_LOGISTIC = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x0, + VX_NN_ACTIVATION_HYPERBOLIC_TAN = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x1, + VX_NN_ACTIVATION_RELU = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x2, + VX_NN_ACTIVATION_BRELU = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x3, + VX_NN_ACTIVATION_SOFTRELU = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x4, + VX_NN_ACTIVATION_ABS = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x5, + VX_NN_ACTIVATION_SQUARE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x6, + VX_NN_ACTIVATION_SQRT = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7, + VX_NN_ACTIVATION_LINEAR = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x8, + VX_NN_ACTIVATION_LEAKYRELU = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x0, + VX_NN_ACTIVATION_RELU6 = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x1, + VX_NN_ACTIVATION_RELU1 = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x2, + VX_NN_ACTIVATION_RSQRT = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x3, + VX_NN_ACTIVATION_LEAKYRELU_MAX_POOLING = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x4, + VX_NN_ACTIVATION_NONE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x5, + VX_NN_ACTIVATION_SWISH = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x6, + VX_NN_ACTIVATION_HSWISH = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7, +}; + +/*! \brief The Convolutional network type + * \ingroup group_cnn + */ +enum vx_nn_layer_type_e +{ + /*! \brief convolution layer */ + VX_NN_CONVOLUTION_LAYER = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_LAYER_TYPE) + 0x0, + /*! \brief fully connected layer */ + VX_NN_FULLYCONNECTED_LAYER = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NN_LAYER_TYPE) + 0x1, +}; + +/*! \brief The pad mode list. + * \ingroup group_cnn + * \version 0.3 + */ +enum vx_pad_mode_e { + /*! \brief For nodes that support this behavior, a constant value is + * \e filled-in when accessing padding pixels. + * eg. [1,2,3,4]->C,C,[1,2,3,4]C,C + */ + VX_PAD_CONSTANT = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_BORDER) + 0x0, + + /*! \brief For nodes that support this behavior, a relicateion of the nearest + * edge pixels value is given for padding pixels. + * eg. [1,2,3,4]->1,1,[1,2,3,4],4,4 + */ + VX_PAD_REPLICATE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_BORDER) + 0x1, + + /*! \brief For nodes that support this behavior, a mirror of the nearest + * edge pixels value is given for padding pixels. ege is duplicate. + * eg. [1,2,3,4]->2,1,[1,2,3,4],4,3 + */ + VX_PAD_MIRROR_SYMMETRIC = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_BORDER) + 0x2, + + /*! \brief For nodes that support this behavior, a mirror of the nearest + * edge pixels value is given for padding pixels. ege is not duplicate. + * eg. [1,2,3,4]->3,2,[1,2,3,4],3,2 + */ + VX_PAD_MIRROR_REFLECT = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_BORDER) + 0x3, +}; + +/*! \brief The Quantized format list. + * \ingroup group_tensor + * \version 0.3 + */ +enum vx_quantized_format_e +{ + /*! \brief Non-quantized data. */ + VX_QUANT_NONE = 0x0, + /*! \brief A quantization data type which specifies the fixed point position. 
*/ + VX_QUANT_DYNAMIC_FIXED_POINT = 0x1, + /*! \brief A quantization data type which has scale value and zero point to match with TF and Android NN API */ + VX_QUANT_AFFINE_SCALE = 0x2, + + VX_QUANT_AFFINE_SCALE_PER_CHANNEL = 0x3, + +}; + +/*! \brief The rank mode of tensor memory. + * \ingroup group_tensor + * \version 0.4 + */ +enum vx_tensor_rank_type_e +{ + /*! \brief rank with weight,height,channel,batch */ + VX_TENSOR_RANK_WHCN = 0, + + /*! \brief rank with channel,weight,height,batch */ + VX_TENSOR_RANK_CWHN, + + /*! \brief rank with size, batch */ + VX_TENSOR_RANK_SN, +}; + +/*! \brief The precision of tensor. + * \ingroup group_tensor + * \version 0.4 + */ +enum vx_tensor_precision_type_e +{ + /*! \brief auto adapter precision */ + VX_TENSOR_PRECISION_AUTO = 0, + + /*! \brief high precision */ + VX_TENSOR_PRECISION_HIGH, +}; + +/*! \brief Specifies a static or dynamic tensor. + * \ingroup group_tensor + * \version 0.4 + */ +enum vx_tensor_lifetime_type_e +{ + /*! \brief static tensor */ + VX_TENSOR_LIFE_TIME_STATIC = 0, + + /*! \brief dynamic tensor */ + VX_TENSOR_LIFE_TIME_DYNAMIC, +}; + +/*============================================================================== + TENSOR DATA FUNCTIONS +=============================================================================*/ + +/*! \brief The multi dimensional view data structure. +* \details Used to split tensors into several views. Or concatenate several view into one tensor. +* \see vxCreateTensorFromView +* \ingroup group_tensor +*/ +typedef struct _vx_tensor_view_t * vx_tensor_view; + +/*! \brief The addressing of a tensor view patch structure is used by the Host only +* to address elements in a tensor view patch. +* \see \ref vxCopyTensorPatch +* \ingroup group_tensor +*/ +typedef struct _vx_tensor_addressing_t * vx_tensor_addressing; + +/*! \brief Create an opaque reference to a tensor view object. + * \details Not guaranteed to exist until the vx_graph containing it has been verified. + * \param [in] context The reference to the implementation context. + * \param [in] view_array_start a vx_uint32 array of start values of the view. + * \param [in] view_array_end a vx_uint32 array of end values of the view. + * \param [in] numViewDimensions number of dimensions of view_array_start and view_array_end. + * \return A tensor data view reference or zero when an error is encountered. + * \ingroup group_tensor + */ +VX_API_ENTRY vx_tensor_view VX_API_CALL vxCreateTensorView(vx_context context, vx_uint32 *view_array_start, vx_uint32 * view_array_end, vx_uint8 numViewDimensions); + +/*! \brief Releases a reference to a tensor data view object. +* The object may not be garbage collected until its total reference count is zero. +* \param [in] tensor_view The pointer to the tensor data view to release. +* \post After returning from this function the reference is zeroed. +* \return A vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_SUCCESS Success +* \retval * An error occurred. See vx_status_e. +* \ingroup group_tensor +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseTensorView(vx_tensor_view *tensor_view); + +/*! \brief Create an opaque reference to a tensor addressing object. +* \details Not guaranteed to exist until the vx_graph containing it has been verified. +* \param [in] context The reference to the implementation context. +* \param [in] addressing_array_dimension a vx_uint32 array of sLength of patch in all dimensions in elements. 
+* \param [in] addressing_array_stride a vx_uint32 arrayStride in all dimensions in bytes. +* \param [in] numViewDimensions number of dimensions of view_array_start and view_array_end. +* \return A tensor data view reference or zero when an error is encountered. +* \ingroup group_tensor +*/ +VX_API_ENTRY vx_tensor_addressing VX_API_CALL vxCreateTensorAddressing(vx_context context, vx_uint32 *addressing_array_dimension, vx_uint32 * addressing_array_stride, vx_uint8 numViewDimensions); + +/*! \brief Releases a reference to a tensor data addressing object. +* The object may not be garbage collected until its total reference count is zero. +* \param [in] tensor_addr The pointer to the tensor data addressing to release. +* \post After returning from this function the reference is zeroed. +* \return A vx_status_e enumeration. +* \retval VX_SUCCESS No errors. +* \retval VX_SUCCESS Success +* \retval * An error occurred. See vx_status_e. +* \ingroup group_tensor +*/ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseTensorAddressing(vx_tensor_addressing *tensor_addr); + +/* vxCopyTensorPatchForNN11 is for back compatibility with spec 1.1, which is used in nn*/ +VX_API_ENTRY vx_status VX_API_CALL vxCopyTensorPatchForNN11( + vx_tensor tensor, + vx_tensor_view view, + vx_tensor_addressing user_addr, + void *user_ptr, + vx_enum usage, + vx_enum user_mem_type + ); + +/* vxCreateTensorForNN11 is for back compatibility with spec 1.1, which is used in nn*/ +VX_API_ENTRY vx_tensor VX_API_CALL +vxCreateTensorForNN11( + vx_context context, + vx_uint32 num_of_dims, + vx_uint32 *sizes, + vx_enum data_format, + vx_int8 fixed_point_pos + ); + +/*! \brief Creates an array of tensors + * \param [in] context The reference to the overall Context. + * \param [in] count Number of Objects to create in the ObjectArray. + * \param [in] tensor* The tensors array that need add to the ObjectArray. + * + * \returns An ObjectArray reference \ref vx_object_array. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. Data objects are not initialized by this function. + * + * \ingroup group_object_array + */ +VX_API_ENTRY vx_object_array VX_API_CALL vxCreateTensorObjectArray(vx_context context, vx_uint32 count, vx_tensor* tensor); + +typedef union _vx_tensor_quant_param +{ + struct + { + vx_int8 fixed_point_pos; /*!< \brief Specifies the fixed point position when the input element type is int16/int8, if 0 calculations are performed in integer math */ + } dfp; + + struct + { + vx_float32 scale; /*!< \brief Scale vaule for the quantized value */ + vx_int32 zeroPoint; /*!< \brief A 32 bit integer, in range [0, 255] */ + } affine; + + struct + { + vx_uint32 channelDim; /*!< \brief a 32 bit unsigned integer indicating channel dimension */ + vx_uint32 scaleCount; /*!< \brief the size of the scale array, must be equal to size[channelDim] */ + vx_float32 * scales; /*!< \brief an array of positive 32 bit floating point value. The size of the scales array must be equal to size[channelDim] */ + vx_uint32 zeroPointCount; /*!< \brief the size of the zero point array, must be equal to 0 or size[channelDim] */ + vx_int32 * zeroPoint; /*!< \brief A 32 bit integer, in range [0, 255] */ + } affinePerChannel; +}vx_tensor_quant_param; + +/*! 
\brief Input parameters for \ref vxCreateTensor2 + * \ingroup group_tensor + * \version 0.3 + */ +typedef struct _vx_tensor_create_params_t +{ + vx_uint32 num_of_dims; /*!< \brief The number of dimensions specified in *sizes*/ + vx_uint32 * sizes; /*!< \brief The pointer to an array of dimension sizes */ + vx_enum data_format; /*!< \brief Data format for the tensor */ + vx_enum quant_format; /*!< \brief Quantized format \ref vx_quantized_format_e . */ + vx_tensor_quant_param quant_data; +} vx_tensor_create_params_t; + + +/*! \brief Creates an opaque reference to a tensor data buffer. + * \details Not guaranteed to exist until the vx_graph containing it has been verified. + * \param [in] context The reference to the implementation context. + * \param [in] tensor_create_params A pointer to the tensor create parameter \ref vx_tensor_create_params_t + * \param [in] size_of_create_params Byte size of the parameter structure + * \return A tensor data reference or zero when an error is encountered. + * \ingroup group_tensor + * \version 0.3 + */ +VX_API_ENTRY vx_tensor VX_API_CALL vxCreateTensor2(vx_context context, const vx_tensor_create_params_t* tensor_create_params, vx_size size_of_create_params); + +/*! \brief Creates an opaque reference to a tensor data buffer with no direct + * user access. This function allows setting the tensor data dimensions or data format. + * \details Virtual data objects allow users to connect various nodes within a + * graph via data references without access to that data, but they also permit the + * implementation to take maximum advantage of possible optimizations. Use this + * API to create a data reference to link two or more nodes together when the + * intermediate data are not required to be accessed by outside entities. This API + * in particular allows the user to define the tensor data format of the data without + * requiring the exact dimensions. Virtual objects are scoped within the graph + * they are declared a part of, and can't be shared outside of this scope. + * \param [in] graph The reference to the parent graph. + * \param [in] tensor_create_params A pointer to the tensor create parameter \ref vx_tensor_create_params_t + * \param [in] size_of_create_params Byte size of the parameter structure + * \return A tensor data reference or zero when an error is encountered. + * \note Passing this reference to \ref vxCopyTensorPatch will return an error. + * \ingroup group_tensor + * \version 0.3 + */ +VX_API_ENTRY vx_tensor VX_API_CALL vxCreateVirtualTensor2(vx_graph graph, const vx_tensor_create_params_t* tensor_create_params, vx_size size_of_create_params); + +/*! \brief Swaps two tensors created from handle. + * \details This function swaps the tensors' logical and physical addresses. + * These tensors must have the same properties except for memory-related content. + * Attention: the application must make sure the cache and memory are coherent before the first call to vxSwapTensor. + * \version 0.4 + */ +VX_API_ENTRY vx_status VX_API_CALL vxSwapTensor(vx_tensor tensor0, vx_tensor tensor1); + +/*! \brief Creates a reference to a tensor object that was externally allocated. + * \param [in] context The reference to the implementation context. + * \param [in] tensor_create_params The \ref vx_tensor_create_params_t that points to a parameter structure. + * \param [in] size_of_create_params Size of parameter structure. + * \param [in] addrs The tensor patch addressing structures that define the dimension and stride of pointers. See note below.
+ * \param [in] ptr The logical pointer of platform-defined references to tensor data. + * \param [in] import_type \ref vx_memory_type_e. When giving \ref VX_MEMORY_TYPE_HOST + * the \a ptr is assumed to be a HOST accessible pointer to memory. + * \returns An tensor reference \ref vx_tensor. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * + * In order to release the image back to the application we should use \ref vxSwapTensorHandle. + * + * \ingroup group_tensor + *\version 0.4 + */ +VX_API_ENTRY vx_tensor VX_API_CALL vxCreateTensorFromHandle2( + vx_context context, const vx_tensor_create_params_t* tensor_create_params, vx_size size_of_create_params, const vx_tensor_addressing addrs, + void * const ptr, vx_enum import_type); + +/* +*\ vxo_flushHandle used to support vxo_createTensorFromHandle/vxo_createImageFromHandle +*\once app change the content of tensor/image, app can call vxo_flushHandle to make the cache cohenrence and will get better performance; +*\ Or driver will handle it default, but worst perforamnce. +*/ +VX_API_ENTRY vx_status VX_API_CALL vxFlushHandle(vx_reference ref); + + +/*! \brief Return a new tensor referencing the same memory location but with different shape. +* \param [in] tensor The input tensor data to reshape. +* \param [in] num_of_dims Size of each dimension. If one component is special value -1, +* the size of that dimension is computed so that the total size remains the same as input tensor. +* If is is [-1], then flatten is performed which turns tensor into 1-D. +* \param [in] sizes The size of the container to which \a num_of_dims points. +* \return a vx_tensor that has shaped. +* \return VX_NULL if an error occurred. +* \ingroup group_tensor +*/ +VX_API_ENTRY vx_tensor VX_API_CALL vxReshapeTensor(vx_tensor tensor, vx_int32* num_of_dims, vx_uint32 sizes); + +/*! \brief Allows setting attributes on the tensor. + * \param [in] tensor The reference to the tensor on which to set the attribute. + * \param [in] attribute The attribute to set. Use a \ref vx_tensor_attribute_e enumeration. + * \param [in] ptr The pointer to the location from which to read the value. + * \param [in] size The size in bytes of the object pointed to by \a ptr. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If the tensor is not a \ref vx_tensor. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * \ingroup group_tensor + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetTensorAttribute(vx_tensor tensor, vx_enum attribute, const void *ptr, vx_size size); + + +/*! \brief The type enumeration lists all NN extension types. + * \ingroup group_cnn + */ +enum vx_nn_type_e { + VX_TYPE_NN_CONVOLUTION_PARAMS = 0x025,/*!< \brief A \ref vx_nn_convolution_params_t. */ + VX_TYPE_NN_DECONVOLUTION_PARAMS = 0x026,/*!< \brief A \ref vx_nn_deconvolution_params_t. */ + VX_TYPE_NN_ROI_POOL_PARAMS = 0x027,/*!< \brief A \ref vx_nn_roi_pool_params_t. */ +}; + +/*! \brief Input parameters for a convolution operation. + * \ingroup group_cnn + */ +typedef struct _vx_nn_convolution_params_t +{ + vx_size padding_x; /*!< \brief Number of elements added at each side in the x dimension of the input. */ + vx_size padding_y; /*!< \brief Number of elements added at each side in the y dimension of the input. */ + vx_enum overflow_policy; /*!< \brief A VX_TYPE_ENUM of the vx_convert_policy_e enumeration. 
*/ + vx_enum rounding_policy; /*!< \brief A VX_TYPE_ENUM of the vx_round_policy_e enumeration. */ + vx_enum down_scale_size_rounding; /*!< \brief Rounding method for calculating output dimensions. See \ref vx_nn_rounding_type_e */ + vx_size dilation_x; /*!< \brief "inflate" the kernel by inserting zeros between the kernel elements in the x direction. The value is the number of zeros to insert.*/ + vx_size dilation_y; /*!< \brief "inflate" the kernel by inserting zeros between the kernel elements in the y direction. The value is the number of zeros to insert.*/ +} vx_nn_convolution_params_t; + +/*! \brief Extended input parameter structure for convolution layer + * \ingroup group_cnn + */ +typedef struct _vx_nn_convolution_params_ext_t +{ + vx_nn_convolution_params_t khr; /*!< \brief Khronos standard structure head */ + vx_size padding_x_right; /*!< \brief Number of elements added at each side in the right of x dimension of the input, + "padding_x" is for the left */ + vx_size padding_y_bottom; /*!< \brief Number of elements added at each side in the bottom of y dimension of the input. + "padding_y" is for the top */ + vx_enum pad_mode; /*!< \brief A VX_TYPE_ENUM of the \ref vx_pad_mode_e enumeration. */ + vx_scalar pad_const; /*!< \brief pad const value if setting pad mode to const, the const value is base value, not quantized value. */ +} vx_nn_convolution_params_ext_t; + +/*! \brief Input parameters for a deconvolution operation. + * \ingroup group_cnn + */ +typedef struct _vx_nn_deconvolution_params_t +{ + vx_size padding_x; /*!< \brief Number of elements subtracted at each side in the x dimension of the output. */ + vx_size padding_y; /*!< \brief Number of elements subtracted at each side in the y dimension of the output. */ + vx_enum overflow_policy; /*!< \brief A VX_TYPE_ENUM of the vx_convert_policy_e enumeration. */ + vx_enum rounding_policy; /*!< \brief A VX_TYPE_ENUM of the vx_round_policy_e enumeration. */ + vx_size a_x; /*!< \brief user-specified quantity used to distinguish between the \f$upscale_x\f$ different possible output sizes. */ + vx_size a_y; /*!< \brief user-specified quantity used to distinguish between the \f$upscale_y\f$ different possible output sizes. */ +} vx_nn_deconvolution_params_t; + +/*! \brief Extended input parameter for a deconvolution operation. + * \ingroup group_cnn + */ +typedef struct _vx_nn_deconvolution_params_ext_t +{ + vx_nn_deconvolution_params_t khr; /*!< \brief Khronos standard structure head \ref vx_nn_deconvolution_params_t */ + vx_size padding_x_right; /*!< \brief Number of elements subtracted at each side in the right of x dimension of the input."padding_x" is for the left */ + vx_size padding_y_bottom; /*!< \brief Number of elements subtracted at each side in the bottom of y dimension of the input. "padding_y" is for the top */ + vx_int32 channel_group; /*!< \brief Number of separate groups for deconvolution (Range: 0 <= groups <= size of z dimension of input; size of z dimension of input can be divided by groups) */ + vx_enum pad_mode; /*!< \brief A VX_TYPE_ENUM of the \ref vx_pad_mode_e enumeration. */ + vx_scalar pad_const; /*!< \brief The pad const value if setting pad mode to const, the const value is base value, not quantized value. */ +} vx_nn_deconvolution_params_ext_t; + +typedef struct _vx_nn_deconvolution_params_ext2_t +{ + vx_nn_deconvolution_params_ext_t ext; /*!< \brief Deconvolution extension structure head */ + vx_uint32 stride_x; /*!< \brief skip x jump for down scale. 
*/ + vx_uint32 stride_y; /*!< \brief skip y jump for down scale. */ + vx_enum down_scale_size_rounding; /*!< \brief Rounding method for calculating output dimensions. See \ref vx_nn_rounding_type_e */ +} vx_nn_deconvolution_params_ext2_t; + +/*! \brief Input parameters for ROI pooling operation. + * \ingroup group_cnn + */ +typedef struct _vx_nn_roi_pool_params_t +{ + vx_enum pool_type; /*!< \brief Of type \ref vx_nn_pooling_type_e. Only \ref VX_NN_POOLING_MAX pooling is supported. */ +} vx_nn_roi_pool_params_t; + +/*! \brief Extended input parameters for ROI pooling operation. + * \ingroup group_cnn + */ +typedef struct _vx_nn_roi_pool_params_ext_t +{ + vx_nn_roi_pool_params_t khr; /*!< \brief Khronos standard structure head \ref vx_nn_roi_pool_params_t */ + vx_float32 spatial_scale; /*!< \brief The ratio of image to feature map (Range: 0 < spatial_scale <= 1) */ + vx_int32 pooled_height; /*!< \brief The height of roi pooling (Range: 0 < pool_height <= height of input_data) */ + vx_int32 pooled_width; /*!< \brief The width of roi pooling(Range: 0 < pool_height <= width of input_data) */ +} vx_nn_roi_pool_params_ext_t; + +typedef struct _vx_nn_convolution_params_ext2_t +{ + vx_nn_convolution_params_ext_t ext; /*!< \brief Convolution extension structure head */ + + vx_uint32 stride_x; /*!< \brief skip x jump for down scale. */ + vx_uint32 stride_y; /*!< \brief skip y jump for down scale. */ + + vx_int32 depth_multiplier; /*!< \brief depthwise multiplier value, if 0, means convolution, elsewise(>=1), the convolution is depthwiseconvolution. */ +} vx_nn_convolution_params_ext2_t; +/*============================================================================== + NN Nodes +=============================================================================*/ +/*! \brief [Graph] Creates a Convolutional Network Convolution Layer Node. + * \details This function implement Convolutional Network Convolution layer. + * For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of the accumulator bits are implementation defined, + * and should be at least 16.\n + * round: rounding according the vx_round_policy_e enumeration. \n + * saturate: A saturation according the vx_convert_policy_e enumeration. + * The following equation is implemented: \n + * \f$ outputs[j,k,i] = saturate(round(\sum_{l} (\sum_{m,n} inputs[j+m,k+n,l] \times weights[m,n,l,i])+biasses[j,k,i])) \f$\n + * Where \f$m,n\f$ are indexes on the convolution matrices. \f$ l\f$ is an index on all the convolutions per input.\f$ i\f$ is an index per output. + * \f$ j,k \f$ are the inputs/outputs spatial indexes. + * Convolution is done on the width and height dimensions of the \ref vx_tensor. Therefore, we use here the term x for index along the width dimension and y for index along the height dimension.\n + * before the Convolution is done, a padding with zeros of the width and height input dimensions is performed. + * Then down scale is done by picking the results according to a skip jump. The skip in the x and y is determined by the output size dimensions. + * The relation between input to output is as follows: \n + * \f$ width_{output} = round(\frac{(width_{input} + 2 * padding_x - kernel_x - (kernel_x -1) * dilation_x)}{skip_x} + 1) \f$\n + * and \n + * \f$ height_{output} = round(\frac{(height + 2 * padding_y - kernel_y - (kernel_y -1) * dilation_y)}{skip_y} + 1) \f$\n + * where \f$width\f$ is the size of the input width dimension. 
\f$height\f$ is the size of the input height dimension. + * \f$width_{output}\f$ is the size of the output width dimension. \f$height_{output}\f$ is the size of the output height dimension. + * \f$kernel_x\f$ and \f$kernel_y\f$ are the convolution sizes in width and height dimensions. + * skip is calculated by the relation between input and output. In case of ambiguity in the inverse calculation of the skip. The minimum solution is chosen. Skip must be a positive non zero integer. + * rounding is done according to \ref vx_convolutional_network_rounding_type_e. + * Notice that this node creation function has more parameters than the corresponding kernel. Numbering of kernel parameters (required if you create this node using the generic interface) is explicitly specified here. + * \param [in] graph The handle to the graph. + * \param [in] inputs The input tensor data. 3 lower dimensions represent a single input, all following dimensions represent number of batches, possibly nested. + * The dimension order is [width, height, #IFM, #batches].\n + * \param [in] weights [*static] Weights are 4d tensor with dimensions [kernel_x, kernel_y, #IFM, #OFM]. see \ref vxCreateTensor and \ref vxCreateVirtualTensor \n Weights data type must match the data type of the inputs. (Kernel parameter #1) + * \param [in] biases [*static] Optional, ignored if NULL. The biases, which may be shared (one per ofm) or unshared (one per ofm * output location). The possible layouts are + * either [#OFM] or [width, height, #OFM]. Biases data type must match the data type of the inputs. + * \param [in] convolution_params [static] Pointer to parameters of type \ref vx_nn_convolution_params_t. + * \param [in] size_of_convolution_params [static] Size in bytes of convolution_params. Note that this parameter is not counted as one of the kernel parameters. + * \param [out] outputs The output tensor data. Output will have the same number and structure of dimensions as input. Output tensor data type must be same as the inputs. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxConvolutionLayer(vx_graph graph, vx_tensor inputs, vx_tensor weights, vx_tensor biases, const vx_nn_convolution_params_t *convolution_params, vx_size size_of_convolution_params, vx_tensor outputs); + +/*! \brief [Graph] Creates a Fully connected Convolutional Network Layer Node. +* \details This function implement Fully connected Convolutional Network layers. +* In case the input and output \ref vx_tensor are signed 16. A fixed point calculation is performed with round and saturate according to the number of accumulator bits. \n +* round: rounding according the vx_round_policy_e enumeration. \n +* saturate: A saturation according the vx_convert_policy_e enumeration. +* The saturation is done based on the accumulator_bits parameter. +* According the accumulator_bits, the saturation might not be performed every operation. +* But every a specified amount of operations, +* that are suspected to saturate the accumulation bits\n +* The equation for Fully connected layer:\n +* \f$ outputs[i] = ( \sum_{j} saturate(round(inputs[j] \times weights[j,i])))+biasses[i] \f$\n +* Where \f$j\f$ is a index on the input feature and \f$i\f$ is a index on the output. +* before the fully connected is done, a padding of the input is performed. 
+* Then down scale is done by picking the results according to a skip jump. The skip is determined by the output size dimensions. +* The relation between input to output is as follows: +* \f$ size_{output} = round(\frac{(size_{input} + 2 * pad)}{skip} + 1) \f$\n +* where \f$size_{input}\f$ is the size of the input dimension. +* \f$size_{output}\f$ is the size of the output dimension. +* skip is calculated by the relation between input and output. +* rounding is done according to \ref vx_convolutional_network_rounding_type_e. +* \param [in] graph The handle to the graph. +* \param [in] inputs The input tensor data. There two possible input layouts: +* 1. [#IFM, #batches]. See \ref vxCreateTensor and \ref vxCreateVirtualTensor. +* 2. [width, height, #IFM, #batches]. See \ref vxCreateTensor and \ref vxCreateVirtualTensor\n +* In both cases number of batches are optional and may be multidimensional. +* The second option is a special case to deal with convolution layer followed by fully connected. +* The dimension order is [#IFM, #batches]. See \ref vxCreateTensor and \ref vxCreateVirtualTensor. Note that batch may be multidimensional. +* \param [in] weights [*static] Number of dimensions equals dim(single input)+1. Single input dims are [width, height, #IFM], with height and #IFM being optional.\n +* \param [in] biases [*static]The biases, which may be shared (one per ofm) or unshared (one per ofm * output location). +* \param [in] pad [static] Number of elements added at each side in the input. +* \param [in] accumulator_bits [static] Is the total number of bits used during intermediate accumulation. +* \param [in] overflow_policy [static] A VX_TYPE_ENUM of the vx_convert_policy_e enumeration. +* \param [in] rounding_policy [static] A VX_TYPE_ENUM of the vx_round_policy_e enumeration. +* \param [in] down_scale_size_rounding [static] Rounding method for calculating output dimensions. See \ref vx_convolutional_network_rounding_type_e +* \param [out] outputs The output tensor data. Output dimension layout is [#OFM,#batches]. See \ref vxCreateTensor and \ref vxCreateVirtualTensor, where #batches may be multidimensional. +* \return vx_node. +* \retval 0 Node could not be created. +* \retval * Node handle. +* \ingroup group_cnn +*/ +VX_API_ENTRY vx_node VX_API_CALL vxFullyConnectedLayer(vx_graph graph, vx_tensor inputs, vx_tensor weights, vx_tensor biases, vx_enum overflow_policy, vx_enum rounding_policy, vx_tensor outputs); + + +/*! \brief [Graph] Creates a Convolutional Network Pooling Layer Node. + * \details Pooling is done on the first 2 dimensions or the \ref vx_tensor. Therefore, we use here the term x for the first dimension and y for the second.\n + * Pooling operation is a function operation over a rectangle size and then a nearest neighbour down scale. + * Here we use pool_size_x and pool_size_y to specify the rectangle size on which the operation + * is performed. \n + * before the operation is done (average or maximum value). the data is padded in the first 2D with zeros. + * The down scale is done by picking the results according to a skip jump. The skip in the x and y dimension is determined by the output size dimensions. +* \param [in] graph The handle to the graph. +* \param [in] inputs The input tensor data. 3 lower dimensions represent a single input, 4th dimension for batch of inputs is optional.Dimension layout is [width, height, #IFM, #batches]. 
+* See \ref vxCreateTensor and \ref vxCreateVirtualTensor +* \param [in] pool_type [static] Either max pooling or average pooling (see \ref vx_convolutional_network_pooling_type_e). +* \param [in] pool_size_x [static] Size of the pooling region in the x dimension +* \param [in] pool_size_y [static] Size of the pooling region in the y dimension. +* \param [in] pool_pad_x [static] Padding size in the x dimension. +* \param [in] pool_pad_y [static] Padding size in the y dimension. +* \param [in] rounding [static] The rounding method for calculating output dimensions. See \ref vx_convolutional_network_rounding_type_e +* \param [out] outputs The output tensor data. Output will have the same number of dimensions as input. +* \return vx_node. +* \retval 0 Node could not be created. +* \retval * Node handle. +* \ingroup group_cnn +*/ +VX_API_ENTRY vx_node VX_API_CALL vxPoolingLayer(vx_graph graph, vx_tensor inputs, vx_enum pooling_type, + vx_size pooling_size_x, + vx_size pooling_size_y, + vx_size pooling_padding_x, + vx_size pooling_padding_y, + vx_enum rounding, + vx_tensor outputs); + +/*! \brief [Graph] Creates a Convolutional Network Softmax Layer Node. + * \details the softmax function, is a generalization of the logistic function that "squashes" a K-dimensional vector \f$ z \f$ of arbitrary real values to a K-dimensional vector + * \f$ \sigma(z) \f$ of real values in the range (0, 1) that add up to 1. The function is given by: + * \f$ \sigma(z) = \frac{\exp^z}{\sum_i \exp^{z_i}} \f$ + * \param [in] graph The handle to the graph. + * \param [in] inputs The input tensor, with the number of dimensions according to the following scheme. + * In case IFM dimension is 1. Softmax is be calculated on that dimension. + * In case IFM dimension is 2. Softmax is be calculated on the first dimension. The second dimension is batching. + * In case IFM dimension is 3. Dimensions are [Width, Height, Classes]. And Softmax is calculated on the third dimension. + * In case IFM dimension is 4. Dimensions are [Width, Height, Classes, batching]. Softmax is calculated on the third dimension. + * Regarding the layout specification, see \ref vxCreateTensor and \ref vxCreateVirtualTensor. + * \param [out] outputs The output tensor. Output will have the same number of dimensions as input. Output tensor data type must be same as the inputs. + * \ingroup group_cnn + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_node VX_API_CALL vxSoftmaxLayer(vx_graph graph, vx_tensor inputs, vx_tensor outputs); + +/*! \brief [Graph] Creates a Convolutional Network Normalization Layer Node. +* \details Normalizing over local input regions. Each input value is divided by \f$ (1+\frac{\alpha}{n}\sum_i x^2_i)^\beta \f$ , where n is the number of elements to normalize across. +* and the sum is taken over the region centred at that value (zero padding is added where necessary). +* \param [in] graph The handle to the graph. +* \param [in] inputs The input tensor data. 3 lower dimensions represent a single input, 4th dimension for batch of inputs is optional.Dimension layout is [width, height, IFM, #batches]. +* See \ref vxCreateTensor and \ref vxCreateVirtualTensor. +* \param [in] type [static] Either same map or across maps (see vx_convolutional_network_norm_type_e). +* \param [in] norm_size [static] Number of elements to normalize across. 
+* \param [in] alpha [static] Alpha parameter in the normalization equation. +* \param [in] beta [static ] Beta parameter in the normalization equation. +* \param [out] outputs The output tensor data. Output will have the same number of dimensions as input. +* \ingroup group_cnn +* \return vx_node. +* \retval 0 Node could not be created. +* \retval * Node handle. +*/ +VX_API_ENTRY vx_node VX_API_CALL vxNormalizationLayer(vx_graph graph, vx_tensor inputs, vx_enum type, + vx_size normalization_size, + vx_float32 alpha, + vx_float32 beta, + vx_tensor outputs); + +/*! \brief [Graph] Creates a Convolutional Network Local Response Normalization Layer Node. This function is optional for 8-bit extension with the extension string 'KHR_NN_8'. + * \details Normalizing over local input regions. Each input value is divided by \f$ (\bias+\frac{\alpha}{n}\sum_i x^2_i)^\beta \f$ , where n is the number of elements to normalize across. + * and the sum is taken over a rectangle region centred at that value (zero padding is added where necessary). + * \param [in] graph The handle to the graph. + * \param [in] inputs The input tensor data. 3 lower dimensions represent a single input, 4th dimension for batch of inputs is optional. Dimension layout is [width, height, IFM, #batches]. + * See \ref vxCreateTensor and \ref vxCreateVirtualTensor. + * Implementations must support input tensor data types indicated by the extension strings 'KHR_NN_8 KHR_NN_16'. + * Since this function is optional for 'KHR_NN_8', so implementations only must support VX_TYPE_INT16 with fixed_point_position 8. + * \param [in] type [static] Either same map or across maps (see \ref vx_nn_norm_type_e). + * \param [in] normalization_size [static] Number of elements to normalize across. Must be a positive odd number with maximum size of 7 and minimum of 3. + * \param [in] alpha [static] Alpha parameter in the local response normalization equation. must be positive. + * \param [in] beta [static] Beta parameter in the local response normalization equation. must be positive. + * \param [in] bias [static] Bias parameter in the local response normalization equation. must be positive. + * \param [out] outputs The output tensor data. Output will have the same number of dimensions as input. + * \ingroup group_cnn + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_node VX_API_CALL vxLocalResponseNormalizationLayer(vx_graph graph, vx_tensor inputs, vx_enum type, + vx_size normalization_size, + vx_float32 alpha, + vx_float32 beta, + vx_float32 bias, + vx_tensor outputs); + +/*! \brief Input parameter for normalization layer2 +* \ingroup group_cnn +*\version 0.4 +*/ +typedef struct _vx_nn_normalization_params_t +{ + vx_enum type; /*!< \brief Either same map or across maps \refvx_convolutional_network_norm_type_e */ + vx_uint32 norm_size; /*!< \brief Number of elements to normalize across */ + vx_float32 alpha; /*!< \brief Alpha parameter in the normalization equation */ + vx_float32 beta; /*!< \brief Beta parameter in the normalization equation */ + vx_float32 bias; /*!< \brief Bias parameter, must not be zero */ +} vx_nn_normalization_params_t; + +/*! \brief extenstion parameters for normalization layer2. 
+ * \ingroup group_cnn + *\version 0.5 + */ +typedef struct _vx_nn_normalization_params_ext_t +{ + vx_nn_normalization_params_t base; /*!< \brief Khronos standard structure head \ref vx_nn_normalization_params_t */ + vx_int32 axis; +} vx_nn_normalization_params_ext_t; + +/*! \brief Input parameters for tensor transpose layer2 +* \ingroup group_cnn +*\version 0.5 +*/ +typedef struct _vx_nn_transpose_params_t +{ + vx_int32* dims; /*!< \brief The array of perm dims */ + vx_uint32 dims_num; /*!< \brief Number of dims */ +} vx_nn_transpose_params_t; + +/*! \brief Input parameters for tensor mean layer +* \ingroup group_cnn +*\version 0.5 +*/ +typedef struct _vx_nn_mean_params_t +{ + vx_tensor axis; /*!< \brief 1D axis tensor of reduce dims */ + vx_int32 keep_dims; /*!< \brief Keep dims; if positive, retains reduced dims with length 1 */ +} vx_nn_mean_params_t; + +/*! \brief Input parameters for tensor squeeze layer +* \ingroup group_cnn +*\version 0.5 +*/ +typedef struct _vx_nn_squeeze_params_t +{ + vx_tensor squeeze_dims; /*!< \brief [Optional] 1D tensor of squeeze dims; if specified, only the dimensions listed are squeezed, otherwise all are squeezed */ +} vx_nn_squeeze_params_t; + +/*! \brief Input parameters for tensor stride slice layer +* \ingroup group_cnn +*\version 0.5 +*/ +typedef struct _vx_nn_stride_slice_params_t +{ + vx_tensor begin_dims; /*!< \brief 1D tensor of int32, the starts of the dims of the input tensor to be sliced. The length must be rank(input) */ + vx_tensor end_dims; /*!< \brief 1D tensor of int32, the ends of the dims of the input tensor to be sliced. The length must be rank(input) */ + vx_tensor stride_dims; /*!< \brief 1D tensor of int32, the stride of the dims of the input tensor to be sliced. The length must be rank(input); note that a stride can be negative, which causes a reverse slice */ + vx_int32 begin_mask; /*!< \brief begin mask, if the ith bit of begin mask is set, begin[i] is ignored and the fullest possible range in that dim is used instead. */ + vx_int32 end_mask; /*!< \brief end mask, if the ith bit of end mask is set, end[i] is ignored and the fullest possible range in that dim is used instead. */ + vx_int32 shrink_axis_mask; /*!< \brief An int32 mask, if the ith bit of shrink axis mask is set, the ith dimension is shrunk to a scalar and removed from the output. */ +} vx_nn_stride_slice_params_t; + +/*! \brief [Graph] Creates a Convolutional Network Normalization Layer Node. +* \details Normalizing over local input regions. Each input value is divided by \f$ (bias+\frac{\alpha}{n}\sum_i x^2_i)^\beta \f$ , where n is the number of elements to normalize across, +* and the sum is taken over the region centred at that value (zero padding is added where necessary). +* \param [in] graph The handle to the graph. +* \param [in] inputs The input tensor data. 3 lower dimensions represent a single input, 4th dimension for batch of inputs is optional. Dimension layout is [width, height, IFM, #batches]. +* See \ref vxCreateTensor and \ref vxCreateVirtualTensor. +* \param [in] normalization_params [static] Pointer to \ref vx_nn_normalization_params_t parameter structure. +* \param [in] size_of_normalization_param [static] The size of the parameter structure. +* \param [out] outputs The output tensor data. Output will have the same number of dimensions as input. +* \ingroup group_cnn +* \version 0.4 +* \return vx_node.
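+* \par Example
+* A minimal sketch of filling the parameter structure and creating the node (the graph, tensor
+* handles and parameter values below are illustrative assumptions only):
+* \code
+* vx_nn_normalization_params_t params;
+* params.type      = VX_NN_NORMALIZATION_ACROSS_MAPS; // normalize across feature maps
+* params.norm_size = 5;                               // number of elements to normalize across
+* params.alpha     = 0.0001f;
+* params.beta      = 0.75f;
+* params.bias      = 1.0f;                            // must not be zero
+* vx_node node = vxNormalizationLayer2(graph, input, &params, sizeof(params), output);
+* \endcode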
+*/ +VX_API_ENTRY vx_node VX_API_CALL vxNormalizationLayer2(vx_graph graph, vx_tensor inputs, const vx_nn_normalization_params_t *normalization_params, + vx_size size_of_normalization_param, vx_tensor outputs); + +/*! \brief [Graph] Creates a Convolutional Network Activation Layer Node. + * The function operate a specific function (Specified in \ref vx_nn_activation_function_e), On the input data. + * the equation for the layer is: + * \f$ outputs(i,j,k,l) = function(inputs(i,j,k,l), a, b) \f$ for all i,j,k,l. + * \param [in] graph The handle to the graph. + * \param [in] inputs The input tensor data. + * \param [in] function [static] Non-linear function (see \ref vx_convolutional_network_activation_func_e). Implementations must support \ref VX_NN_ACTIVATION_LOGISTIC, \ref VX_NN_ACTIVATION_HYPERBOLIC_TAN and \ref VX_NN_ACTIVATION_RELU + * \param [in] a [static] Function parameters a. must be positive. + * \param [in] b [static] Function parameters b. must be positive. + * \param [out] outputs The output tensor data. Output will have the same number of dimensions as input. + * \ingroup group_cnn + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_node VX_API_CALL vxActivationLayer(vx_graph graph, vx_tensor inputs, vx_enum function, vx_float32 a,vx_float32 b, vx_tensor outputs); + +/*! \brief [Graph] Creates a Convolutional Network ROI pooling node + * \details Pooling is done on the width and height dimensions of the \ref vx_tensor. The ROI Pooling get an array of roi rectangles, and an input tensor. + * The kernel crop the width and height dimensions of the input tensor with the ROI rectangles and down scale the result to the size of the output tensor. The output tensor width and height are the pooled width and pooled height. + * The down scale method is determined by the pool_type. + * Notice that this node creation function has more parameters than the corresponding kernel. Numbering of kernel parameters (required if you create this node using the generic interface) is explicitly specified here. + * \param [in] graph The handle to the graph. + * \param [in] inputs The input tensor data. 3 lower dimensions represent a single input, 4th dimension for batch of inputs is optional. Dimension layout is [width, height, #IFM, #batches]. + * See \ref vxCreateTensor and \ref vxCreateVirtualTensor. + * Implementations must support input tensor data types indicated by the extension strings 'KHR_NN_8' or 'KHR_NN_8 KHR_NN_16'. (Kernel parameter #0) + * \param [in] inputs_rois The roi array tensor. ROI array with dimensions [4, roi_count, #batches] where the first dimension represents 4 coordinates of the top left and bottom right corners of the roi rectangles, based on the input tensor width and height. + * #batches is optional and must be the same as in inputs. roi_count is the number of ROI rectangles. (Kernel parameter #1) + * \param [in] pool_type [static] Of type \ref vx_nn_pooling_type_e. Only \ref VX_NN_POOLING_MAX pooling is supported. (Kernel parameter #2) + * \param [in] size_of_roi_params [static] Size in bytes of roi_pool_params. Note that this parameter is not counted as one of the kernel parameters. + * \param [out] output_arr The output tensor. Output will have [output_width, output_height, #IFM, #batches] dimensions. #batches is optional and must be the same as in inputs. (Kernel parameter #3) + * \ingroup group_cnn + * \return vx_node. 
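 + * \par Example
 + * A minimal creation sketch (the graph and tensor handles below are illustrative assumptions only):
 + * \code
 + * vx_nn_roi_pool_params_t roi_params;
 + * roi_params.pool_type = VX_NN_POOLING_MAX;   // only max pooling is supported
 + * vx_node node = vxROIPoolingLayer(graph, features, rois, &roi_params, sizeof(roi_params), pooled);
 + * \endcode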
+ * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_node VX_API_CALL vxROIPoolingLayer(vx_graph graph, vx_tensor input_data, vx_tensor input_rois, const vx_nn_roi_pool_params_t *roi_pool_params, vx_size size_of_roi_params, vx_tensor output_arr); + + +/*! \brief [Graph] Creates a Convolutional Network Deconvolution Layer Node. + * \details Deconvolution denotes a sort of reverse convolution, which importantly and confusingly is not actually a proper mathematical deconvolution. + * Convolutional Network Deconvolution is up-sampling of an image by learned Deconvolution coefficients. + * The operation is similar to convolution but can be implemented by up-sampling the inputs with zero insertions between the inputs, + * and convolving the Deconvolution kernels on the up-sampled result. + * For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of accumulator bits is implementation defined, + * and should be at least 16.\n + * round: rounding according to the vx_round_policy_e enumeration. \n + * saturate: A saturation according to the vx_convert_policy_e enumeration. + * The following equation is implemented: \n + * \f$ outputs[j,k,i] = saturate(round(\sum_{l} \sum_{m,n}(inputs_{upscaled}[j+m,k+n,l] \times weights[m,n,l,i])+biases[j,k,i])) \f$\n + * Where \f$m,n\f$ are indexes on the convolution matrices. \f$ l\f$ is an index on all the convolutions per input. \f$ i\f$ is an index per output. + * \f$ j,k \f$ are the inputs/outputs spatial indexes. + * Deconvolution is done on the width and height dimensions of the \ref vx_tensor. Therefore, we use here the term x for the width dimension and y for the height dimension.\n + * Before the Deconvolution is done, up-scaling the width and height dimensions with zeros is performed. + * The relation between input and output is as follows: \n + * \f$ width_{output} = (width_{input} -1) * upscale_x - 2 * padding_x + kernel_x + a_x \f$\n + * and \n + * \f$ height_{output} = (height_{input} - 1) * upscale_y - 2 * padding_y + kernel_y + a_y \f$\n + * where \f$width_{input}\f$ is the size of the input width dimension. \f$height_{input}\f$ is the size of the input height dimension. + * \f$width_{output}\f$ is the size of the output width dimension. \f$height_{output}\f$ is the size of the output height dimension. + * \f$kernel_x\f$ and \f$kernel_y\f$ are the convolution sizes in width and height. \f$a_x\f$ and \f$a_y\f$ are user-specified quantities used to distinguish between the \f$upscale_x\f$ and \f$upscale_y\f$ different possible output sizes. + * \f$upscale_x\f$ and \f$upscale_y\f$ are calculated by the relation between input and output. + * \f$a_x\f$ and \f$a_y\f$ must be positive and smaller than \f$upscale_x\f$ and \f$upscale_y\f$ respectively. + * Since the padding parameter is on the output, the effective input padding is: \n + * \f$ padding_{input_x} = kernel_x -padding_x -1\f$ \n + * \f$ padding_{input_y} = kernel_y -padding_y -1\f$ \n + * Therefore the following constraints apply: \f$kernel_x >= padding_x - 1\f$ and \f$kernel_y >= padding_y - 1\f$. + * Rounding is done according to \ref vx_nn_rounding_type_e. + * Notice that this node creation function has more parameters than the corresponding kernel. Numbering of kernel parameters (required if you create this node using the generic interface) is explicitly specified here. A minimal creation sketch is shown below; the individual parameters are described after it.
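 + * \par Example
 + * A minimal sketch of filling the parameter structure and creating the node (the graph, tensor
 + * handles and parameter values below are illustrative assumptions only):
 + * \code
 + * vx_nn_deconvolution_params_t params;
 + * params.padding_x       = 1;                          // output padding in x
 + * params.padding_y       = 1;                          // output padding in y
 + * params.overflow_policy = VX_CONVERT_POLICY_SATURATE; // standard vx_convert_policy_e value
 + * params.rounding_policy = VX_ROUND_POLICY_TO_ZERO;    // standard vx_round_policy_e value
 + * params.a_x             = 0;                          // user-specified adjustment, see a_x above
 + * params.a_y             = 0;                          // user-specified adjustment, see a_y above
 + * vx_node node = vxDeconvolutionLayer(graph, input, weights, biases, &params, sizeof(params), output);
 + * \endcode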
+ * \param [in] graph The handle to the graph.
+ * \param [in] inputs The input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Dimension layout is [width, height, #IFM, #batches].
+ * See \ref vxCreateTensor and \ref vxCreateVirtualTensor.
+ * Implementations must support input tensor data types indicated by the extension strings 'KHR_NN_8' or 'KHR_NN_8 KHR_NN_16'. (Kernel parameter #0)
+ * \param [in] weights [static] The 4d weights with dimensions [width, height, #IFM, #OFM]. See \ref vxCreateTensor and \ref vxCreateVirtualTensor. (Kernel parameter #1)
+ * \param [in] biases [static] Optional, ignored if NULL. The biases have one dimension [#OFM]. Implementations must support input tensor data type same as the inputs. (Kernel parameter #2)
+ * \param [in] deconvolution_params [static] Pointer to parameters of type \ref vx_nn_deconvolution_params_t (Kernel parameter #3)
+ * \param [in] size_of_deconv_params [static] Size in bytes of deconvolution_params. Note that this parameter is not counted as one of the kernel parameters.
+ * \param [out] outputs The output tensor. The output has the same number of dimensions as the input. (Kernel parameter #4)
+ * \ingroup group_cnn
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxDeconvolutionLayer(vx_graph graph, vx_tensor inputs, vx_tensor weights, vx_tensor biases, const vx_nn_deconvolution_params_t *deconvolution_params, vx_size size_of_deconv_params, vx_tensor outputs);
+
+/*! \brief [Graph] Creates a LeakyRELU Layer Node.
+ * \details Activates the layer with the leakyRELU algorithm. Given an input value x, the leakyRELU layer computes the output as x if x > 0 and negative_slope * x if x <= 0.
+ * \param [in] graph The reference to the parent graph.
+ * \param [in] inputs The input tensor data.
+ * \param [in] negative_slope [static] The slope value used to leak the negative part by multiplying it with this value rather than setting it to 0.
+ * \param [out] outputs The output tensor data. Output will have the same number of dimensions as the input.
+ * \return vx_node.
+ * \retval 0 Node could not be created.
+ * \retval * Node handle.
+ * \ingroup group_cnn
+*/
+VX_API_ENTRY vx_node VX_API_CALL vxLeakyReluLayer(
+    vx_graph graph,
+    vx_tensor inputs,
+    vx_float32 negative_slope,
+    vx_tensor outputs
+    );
+
+/*! \brief [Graph] Creates a PRelu Layer Node.
+ * \details Activates the layer with the parametric RELU algorithm. Given an input value x, the PRelu layer computes the output as x if x > 0 and alpha * x if x <= 0.
+ * \param [in] graph The reference to the parent graph.
+ * \param [in] inputs The input tensor data.
+ * \param [in] alpha The per-channel alpha tensor used to leak the negative part by multiplying it with the alpha value.
+ * \param [out] outputs The output tensor data. Output will have the same number of dimensions as the input.
+ * \return vx_node.
+ * \retval 0 Node could not be created.
+ * \retval * Node handle.
+ * \ingroup group_cnn
+ * \version 0.5
+*/
+VX_API_ENTRY vx_node VX_API_CALL vxPReluLayer(
+    vx_graph graph,
+    vx_tensor inputs,
+    vx_tensor alpha,
+    vx_tensor outputs
+    );
+
+/*! \brief [Graph] Creates a Batch Normalization Node.
+ * \details Normalizes the activations of the previous layer at each batch, i.e.
applies a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1.
+ * \param [in] graph The handle to the graph.
+ * \param [in] eps [static] Float 32. Small value to add to the variance estimate so that we don't divide by zero. (Default is 1e-5.)
+ * \param [in] mean [static] A mean tensor data.
+ * \param [in] variance [static] A variance tensor data.
+ * \param [in] gamma [static] A scale tensor data, often denoted gamma in equations.
+ * \param [in] beta [static] An offset tensor data, often denoted beta in equations.
+ * \param [in] input The input tensor.
+ * \param [out] output The output tensor.
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxBatchNormalizationLayer(
+    vx_graph graph,
+    vx_float32 eps,
+    vx_tensor mean,
+    vx_tensor variance,
+    vx_tensor gamma,
+    vx_tensor beta,
+    vx_tensor input,
+    vx_tensor output
+    );
+
+/*! \brief [Graph] Creates a concat Node.
+ * \details Concatenates two input tensors into one output tensor.
+ * \param [in] graph The handle to the graph.
+ * \param [in] in0 The input 0 tensor to be combined.
+ * \param [in] in1 The input 1 tensor to be combined.
+ * \param [out] out The output tensor.
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxConcat2Layer(
+    vx_graph graph,
+    vx_tensor in0,
+    vx_tensor in1,
+    vx_tensor out
+    );
+
+/*! \brief parameter for vxConcatIndefiniteLayer
+ * \ingroup group_cnn
+ * \version 0.4
+ */
+typedef struct _vx_nn_concat_params_t
+{
+    vx_uint32 axis; /*!< \brief The axis along which to concatenate. */
+} vx_nn_concat_params_t;
+
+/*! \brief [Graph] Creates a concat layer for an indefinite number of tensors.
+ * \param [in] graph The handle to the graph
+ * \param [in] in Pointer to a list of tensors
+ * \param [in] concat_params [static] Pointer to parameters of type \ref vx_nn_concat_params_t
+ * \param [in] size_of_concat_params [static] Size in bytes of vx_nn_concat_params_t.
+ * \param [out] out The output tensor after concat
+ * \returns vx_node.
+ * \retval 0 Node could not be created.
+ * \retval * Node handle.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxConcatIndefiniteLayer(
+    vx_graph graph,
+    vx_object_array in,
+    const vx_nn_concat_params_t* concat_params,
+    vx_size size_of_concat_params,
+    vx_tensor out
+    );
+
+/*! \brief [Graph] Creates a Reorganization Layer Node.
+ * \details Reorganizes the layer, picking up pixels from the input tensor according to the rule \n
+ * dimension 1: i * stride + (k / out_c) % stride \n
+ * dimension 2: j * stride + (k / out_c) / stride \n
+ * dimension 3: k % out_c \n
+ * out_c = input_c / (stride * stride), i is in range (0, input_w-1), j is in range (0, input_h-1), k is in range (0, input_c-1)
+ * Output values are in sequential order.
+ * \param [in] graph The reference to the parent graph.
+ * \param [in] inputs The input tensor data to reorg.
+ * \param [in] stride [static] Delta size of two pixels in each dimension for the reorg operation.
+ * \param [out] outputs The output tensor data. The output dimensions will differ from the input dimensions.
+ * \returns vx_node.
+ * \retval 0 Node could not be created.
+ * \retval * Node handle.
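+ * \par Example
+ * An illustrative sketch only (the graph and tensors are assumed to have been
+ * created elsewhere; the names and the stride value of 2 are placeholders):
+ * \code
+ * vx_node node = vxReorgLayer(graph, inputs, 2, outputs);
+ * vx_status status = vxGetStatus((vx_reference)node); // check that creation succeeded
+ * \endcode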
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxReorgLayer(
+    vx_graph graph,
+    vx_tensor inputs,
+    vx_uint32 stride,
+    vx_tensor outputs
+    );
+
+/*! \brief The type list of reorganization.
+ * \ingroup group_cnn
+ * \version 0.4
+ */
+enum vx_reorg_type_e
+{
+    /*! \brief Reorganization from depth to space. */
+    VX_REORG_DEPTH_TO_SPACE = 0,
+
+    /*! \brief Reorganization from space to depth. */
+    VX_REORG_SPACE_TO_DEPTH = 1,
+
+    /*! \brief Reorganization from batch to space. */
+    VX_REORG_BATCH_TO_SPACE_ND,
+
+    /*! \brief Reorganization from space to batch. */
+    VX_REORG_SPACE_TO_BATCH_ND,
+
+    /*! \brief Reorganization by channel shuffle. */
+    VX_REORG_SHUFFLE_CHANNEL,
+};
+
+/*! \brief Input parameter for reorg layer
+ *\ingroup group_cnn
+ *\version 0.4
+ */
+typedef struct _vx_nn_reorg_params_t
+{
+    vx_tensor block_size; /*!< \brief The block sizes (int32) for each spatial dimension of the input to do a reorg operation, all values must be > 1 */
+    vx_enum type; /*!< \brief The type of Reorganization, \ref vx_reorg_type_e */
+} vx_nn_reorg_params_t, * vx_nn_reorg_params;
+
+/*! \brief Extension parameters for reorg layer.
+ * \ingroup group_cnn
+ *\version 0.5
+ */
+typedef struct _vx_nn_reorg_params_ext_t
+{
+    vx_nn_reorg_params_t base; /*!< \brief vx_nn_reorg_params \ref vx_nn_reorg_params_t */
+    vx_tensor pad; /*!< \brief [Optional] Only for SPACE2BATCH, 2D tensor for paddings for each spatial dim of the input tensor(rank(input), 2), all values must be >=0. */
+} vx_nn_reorg_params_ext_t;
+
+/*! \brief The 2nd version of extension parameters for reorg layer.
+ * \ingroup group_cnn
+ */
+typedef struct _vx_nn_reorg_params_ext2_t
+{
+    vx_nn_reorg_params_t base; /*!< \brief vx_nn_reorg_params \ref vx_nn_reorg_params_t */
+    vx_int32 *num_group; /*!< \brief Pointer to the number of groups, used for channel shuffle. */
+    vx_int32 *axis; /*!< \brief Pointer to the axis along which to reorganize. */
+} vx_nn_reorg_params_ext2_t;
+
+/*! \brief [Graph] Creates a Reorganization Layer Node. Enhancement of vxReorgLayer, supporting both DEPTH to SPACE and SPACE to DEPTH.
+ * \param [in] graph The reference to the parent graph.
+ * \param [in] input The input tensor data to reorg.
+ * \param [in] reorg_params [static] Pointer to parameters of type \ref vx_nn_reorg_params
+ * \param [in] size_of_reorg_params [static] Size in bytes of vx_nn_reorg_params.
+ * \param [out] output The output tensor data. The output dimensions will differ from the input dimensions.
+ * \returns vx_node.
+ * \retval 0 Node could not be created.
+ * \retval * Node handle.
+ * \ingroup group_cnn
+ * \version 0.4
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxReorgLayer2(
+    vx_graph graph,
+    vx_tensor input,
+    const vx_nn_reorg_params reorg_params,
+    vx_size size_of_reorg_params,
+    vx_tensor output
+    );
+
+/*! \brief Input parameter for TensorRoundingLayer
+ * \ingroup group_tensor
+ * \version 0.4
+ */
+typedef struct _vx_nn_rounding_params_t
+{
+    vx_enum mode; /*!< \brief Rounding method for calculating tensor data (VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR or VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_CEILING). See \ref vx_convolutional_network_rounding_type_e */
+} vx_nn_rounding_params_t, * vx_nn_rounding_params;
+
+/*! \brief [Graph] Creates a Rounding Layer Node, supporting FLOOR and CEIL.
+ * \param [in] graph The reference to the parent graph.
+ * \param [in] input The input tensor data to round.
+ * \param [in] rounding_params [static] Pointer to parameters of type \ref vx_nn_rounding_params
+ * \param [in] size_of_rounding_params [static] Size in bytes of vx_nn_rounding_params.
+ * \param [out] output The output tensor data.
+ * \returns vx_node.
+ * \retval 0 Node could not be created.
+ * \retval * Node handle.
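+ * \par Example
+ * An illustrative sketch only (graph and tensors are assumed to exist already;
+ * the names are placeholders):
+ * \code
+ * vx_nn_rounding_params_t params;
+ * params.mode = VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR; // round down
+ * vx_node node = vxTensorRoundingNode(graph, input, &params, sizeof(params), output);
+ * vx_status status = vxGetStatus((vx_reference)node); // check that creation succeeded
+ * \endcode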
+ * \ingroup group_tensor + * \version 0.4 + */ +VX_API_ENTRY vx_node VX_API_CALL vxTensorRoundingNode( + vx_graph graph, + vx_tensor input, + const vx_nn_rounding_params rounding_params, + vx_size size_of_rounding_params, + vx_tensor output + ); + +/*! \brief Input parameter for hashTableLookupLayer + *\ingroup group_cnn + *\version 0.4 + */ +typedef struct _vx_nn_hashlut_params_t +{ + vx_tensor keys; /*!< \brief A 1-D tensor with shape [ n ]; */ + vx_tensor values; /*!< \brief A tensor with shape of [ n, ?]; i.e., the first dimension must be n. */ +} vx_nn_hashlut_params_t, * vx_nn_hashlut_params; + +/*! \brief [Graph] Creates a hash lookup table Layer Node. + * \details Keys and Values pair represent a map, i.e., the ith element + * in Keys (Keys[i]) is the key to select the ith sub-tensor + * in Values (Values[i]), where 0 <= i <= n-1. + * Keys tensor *MUST* be sorted in ascending order. + * \param [in] graph The reference to the parent graph. + * \param [in] input 1-D tensor with shape [ k ]. + * \param [in] hashlut_params Pointer to parameters of type \ref vx_nn_hashlut_params_t + * \param [in] size_of_hashlut_params [static] Size in bytes of vx_nn_hashlut_params. + * \param [out] hits A boolean tensor with shape [ k ] indicates whether the lookup hits (True) or not (False). + * \param [out] output The output tensor data, tensor with shape [ k, ?] + * \returns vx_node. + * \retval 0 Node could not be created. + * \retval * Node handle. + * \ingroup group_cnn + * \version 0.4 + */ +VX_API_ENTRY vx_node VX_API_CALL vxHashTableLookupLayer( + vx_graph graph, + vx_tensor input, + const vx_nn_hashlut_params hashlut_params, + vx_size size_of_hashlut_params, + vx_tensor hits, + vx_tensor output + ); + +/*! \brief LSH project type list + *\ingroup group_cnn + *\version 0.4 + */ +enum vx_lshproj_type_e { + /*! \brief Computed bit vector is considered to be sparse. */ + VX_LSH_PROJ_SPARSE = 1, + + /*! \brief Computed bit vector is considered to be dense. */ + VX_LSH_PROJ_DENSE = 2, +}; + +/*! \brief Input parameter to LSH projection layer + *\ingroup group_cnn + *\version 0.4 + */ +typedef struct _vx_nn_lshproj_params_t +{ + vx_tensor hash_func; /*!< \brief Tensor of hash function. Dim size is 2, .Dim[0]: Number of hash functions. Dim[1]: Number of seeds per hash functions. Dim[1] <= 32 in sparse case. */ + vx_tensor weights; /*!< \brief Optional. Dim.size == 1, If not set, each input element is considered to have the same weight of 1.0. */ + vx_tensor type; /*!< \brief The type of LSH projection, support VX_LSH_PROJ_SPARSE and VX_LSH_PROJ_DENSE; */ +} vx_nn_lshproj_params_t, * vx_nn_lshproj_params; + +/*! \brief [Graph] Creates a LSH projection Layer Node. + * \details Projects an input to a bit vector via locality senstive hashing. + * Sparse: Value VX_LSH_PROJ_SPARSE(=1). + * Computed bit vector is considered to be sparse. + * Each output element is an int32 made up of multiple bits computed from + * hash functions. + * Dense: Value VX_LSH_PROJ_DENSE(=2). + * Computed bit vector is considered to be dense. Each output element + * represents a bit and can take the value of either 0 or 1. + * + * \param [in] graph The reference to the parent graph. + * \param [in] input input tensor data, Dim size must >= 1. + * \param [in] lshproj_params Pointer to parameters of type \ref vx_nn_lshproj_params + * \param [in] size_of_lshproj_params [static] Size in bytes of vx_nn_lshproj_params. + * \param [out] output The output tensor data. 
+ * If the projection type is sparse: + * Output.Dim == { Tensor[0].Dim[0] } + * A tensor that represents hash signatures. + * If the projection type is Dense: + * Output.Dim == { Tensor[0].Dim[0] * Tensor[0].Dim[1] } + * A flattened tensor that represents projected bit vectors. + * \returns vx_node. + * \retval 0 Node could not be created. + * \retval * Node handle. + * \ingroup group_cnn + * \version 0.4 + */ +VX_API_ENTRY vx_node VX_API_CALL vxLSHProjectionLayer( + vx_graph graph, + vx_tensor input, + const vx_nn_lshproj_params lshproj_params, + vx_size size_of_lshproj_params, + vx_tensor output + ); + +/*! \brief Input parameter for Reshape layer + *\ingroup group_cnn + *\version 0.4 + */ +typedef struct _vx_nn_reshape_params_t +{ + vx_tensor dims; /*!< \brief dimension. */ +} vx_nn_reshape_params_t, * vx_nn_reshape_params; + +/*! \brief [Graph] Creates a Reshape Layer Node. + * \param [in] graph The reference to the parent graph. + * \param [in] input The input tensor data to reshape. + * \param [in] reshape_params Pointer to parameters of type \ref vx_nn_reshape_params + * \param [in] size_of_reshape_params [static] Size in bytes of vx_nn_reshape_params. + * \param [out] output The output tensor data. + * \returns vx_node. + * \retval 0 Node could not be created. + * \retval * Node handle. + * \ingroup group_tensor + * \version 0.4 + */ +VX_API_ENTRY vx_node VX_API_CALL vxTensorReshapeNode( + vx_graph graph, + vx_tensor input, + const vx_nn_reshape_params reshape_params, + vx_size size_of_reshape_params, + vx_tensor output + ); + +/*! \brief Input parameter for Scale layer + *\ingroup group_cnn + *\version 0.4 + */ +typedef struct _vx_nn_scale_params_t +{ + vx_enum type; /*!< \brief The interpolation type, only support VX_INTERPOLATION_BILINEAR. */ +} vx_nn_scale_params_t, * vx_nn_scale_params; + +/*! \brief [Graph] Creates a scale Layer Node. + * \param [in] graph The reference to the parent graph. + * \param [in] input The input tensor data to scale. + * \param [in] scale_params [static] Pointer to parameters of type \ref vx_nn_scale_params + * \param [in] size_of_scale_params [static] Size in bytes of vx_nn_scale_params. + * \param [out] output The output tensor data. + * \returns vx_node. + * \retval 0 Node could not be created. + * \retval * Node handle. + * \ingroup group_tensor + * \version 0.4 + */ +VX_API_ENTRY vx_node VX_API_CALL vxTensorScaleNode( + vx_graph graph, + vx_tensor input, + const vx_nn_scale_params scale_params, + vx_size size_of_scale_params, + vx_tensor output + ); + +/*! \brief Input parameter for YUV to RGB scale layer + *\ingroup group_cnn + *\version 0.5 + */ +typedef struct _vx_nn_yuv2rgb_scale_params_t +{ + vx_rectangle_t rect; /*!< \brief The rectangle region of input image to do yuv2rgb scale. If it is set to 0, region is full input image; */ + vx_float32 mean_r; /*!< \brief Mean coefficient for output r channel; */ + vx_float32 mean_g; /*!< \brief Mean coefficient for output g channel; */ + vx_float32 mean_b; /*!< \brief Mean coefficient for output b channel; */ + vx_float32 scale_rgb; /*!< \brief Scale coefficient value for output rgb; Not the scale ratio; */ + vx_bool y_only; /*!< \brief YUV mode, Y only or normal YUV. */ + vx_bool output_rgb; /*!< \brief Output mode, BGR or RGB. */ +} vx_nn_yuv2rgb_scale_params_t, * vx_nn_yuv2rgb_scale_params; + +/*! \brief [Graph] Creates a scale Layer Node. + * \param [in] graph The reference to the parent graph. + * \param [in] input The input tensor data to scale. 
+ * \param [in] scale_params [static] Pointer to parameters of type \ref vx_nn_scale_params + * \param [in] size_of_scale_params [static] Size in bytes of vx_nn_scale_params. + * \param [out] output The output tensor data. + * \returns vx_node. + * \retval 0 Node could not be created. + * \retval * Node handle. + * \ingroup group_tensor + * \version 0.5 + */ +VX_API_ENTRY vx_node VX_API_CALL vxYUV2RGBScaleNode( + vx_graph graph, + vx_image input, + const vx_nn_yuv2rgb_scale_params yuv2rgb_scale_params, + vx_size size_of_yuv2rgb_scale_param, + vx_tensor output + ); + +/*! \brief Input parameter for RNN layer + *\ingroup group_cnn + *\version 0.4 + */ +typedef struct _vx_nn_rnn_params_t +{ + vx_tensor weights; /*!< \brief 2-D recurrent weights tensor, of shape [num_units, input_size], where "num_units" corresponds to the number of units. */ + vx_tensor recurrent_weights; /*!< \brief 2-D tensor, of shape [num_units, num_units], with columns corresponding to the weights from each unit. */ + vx_tensor bias; /*!< \brief 1-D tensor, of shape [num_units]. */ + vx_tensor state_in; /*!< \brief 2-D tensor, of shape [batch_size, num_units]. */ + vx_tensor activation; /*!< \brief Optional, indicating the activation function. If "NONE" is specified then it results in a linear activation. */ +} vx_nn_rnn_params_t, * vx_nn_rnn_params; + +/*! \brief [Graph] Creates a RNN Layer Node. + * \details A basic recurrent neural network layer. + * This layer implements the operation: + * outputs = state = activation(inputs * input_weights + state * recurrent_weights + bias) + * + * Where: + * "input_weights" is a weight matrix that multiplies the inputs; + * "recurrent_weights" is a weight matrix that multiplies the current + * "state" which itself is the output from the previous time step + * computation; + * "bias" is a bias vector (added to each output vector in the batch); + * "activation" is the function passed as the "activation_function" + * argument (if not "NONE"). + * \param [in] graph The reference to the parent graph. + * \param [in] input The input tensor data to rnn, 2-D tensor, of shape [input_size, batch_size], where "batch_size" corresponds to the batching dimension, and "input_size" is the size of the input. + * \param [in] rnn_params Pointer to parameters of type \ref vx_nn_rnn_params + * \param [in] size_of_rnn_params [static] Size in bytes of vx_nn_rnn_params. + * \param [out] state_out The output tensor data, A 2-D tensor, of shape [batch_size, num_units]. + * \param [out] output The output tensor data, 2-D tensor, of shape [batch_size, num_units]. This is effectively the same as the current state value.. + * \returns vx_node. + * \retval 0 Node could not be created. + * \retval * Node handle. + * \ingroup group_cnn + * \version 0.4 + */ +VX_API_ENTRY vx_node VX_API_CALL vxRNNLayer( + vx_graph graph, + vx_tensor input, + const vx_nn_rnn_params rnn_params, + vx_size size_of_rnn_params, + vx_tensor state_out, + vx_tensor output + ); + +/*! \brief Input parameter for softmax layer2 + *\ingroup group_cnn + *\version 0.4 + */ +typedef struct _vx_nn_softmax_params_t +{ + vx_float32 beta; /*!< \brief A FLOAT32 value, specifying the positive scaling factor for the exponent, beta. */ +} vx_nn_softmax_params_t, * vx_nn_softmax_params; + +/*! \brief extenstion parameters for softmax layer2. 
+ * \ingroup group_cnn + *\version 0.5 + */ +typedef struct _vx_nn_softmax_params_ext_t +{ + vx_nn_softmax_params_t base; /*!< \brief Khronos standard structure head \ref vx_nn_softmax_params_t */ + vx_int32 axis; +} vx_nn_softmax_params_ext_t; + +/*! \brief [Graph] Creates a softmax Layer Node. + * \param [in] graph The reference to the parent graph. + * \param [in] input The input tensor data, with number of dimensions equals dim(input batch) + 1. Softmax will be calculated per IFM.. + * \param [in] softmax_params [static] Pointer to parameters of type \ref vx_nn_softmax_params + * \param [in] size_of_softmax_params [static] Size in bytes of vx_nn_softmax_params. + * \param [out] output The output tensor data, Outputs will have the same number of dimensions as input.. + * \returns vx_node. + * \retval 0 Node could not be created. + * \retval * Node handle. + * \ingroup group_cnn + * \version 0.4 + */ +VX_API_ENTRY vx_node VX_API_CALL vxSoftmaxLayer2( + vx_graph graph, + vx_tensor input, + const vx_nn_softmax_params softmax_params, + vx_size size_of_softmax_params, + vx_tensor output + ); + +/*! \brief Input parameter for SVDF layer + *\ingroup group_cnn + *\version 0.4 + */ +typedef struct _vx_nn_svdf_params_t +{ + vx_tensor weights_feature; /*!< \brief A 2-D tensor, of shape [num_units, input_size], where "num_units" corresponds to the number of units. */ + vx_tensor recurrent_time; /*!< \brief A 2-D tensor, of shape [num_units, memory_size], where "memory_size" corresponds to the fixed-size of the memory. */ + vx_tensor bias; /*!< \brief Optional, 1-D tensor of type T, of shape [num_units]. */ + vx_tensor state_in; /*!< \brief A 2-D tensor, of shape [(memory_size - 1) * num_units * rank, batch_size] */ + vx_tensor rank; /*!< \brief The rank of the SVD approximation. */ + vx_tensor activation; /*!< \brief Indicating the activation function, specify linear activation for default */ +} vx_nn_svdf_params_t, * vx_nn_svdf_params; + +/*! \brief [Graph] Creates a svdf Layer Node. + * \details SVDF op is a kind of stateful layer derived from the notion that a + * densely connected layer that's processing a sequence of input frames can + * be approximated by using a singular value decomposition of each of its + * nodes. The implementation is based on: + * + * https://research.google.com/pubs/archive/43813.pdf + * + * P. Nakkiran, R. Alvarez, R. Prabhavalkar, C. Parada. + * "Compressing Deep Neural Networks using a Rank-Constrained Topology". + * INTERSPEECH, 2015. + * + * It processes the incoming input using a 2-stage filtering mechanism: + * stage 1 performs filtering on the "features" dimension, whose outputs get + * pushed into a memory of fixed-size memory_size. + * stage 2 performs filtering on the "time" dimension of the memory_size + * memoized outputs of stage 1. 
+ * + * Specifically, for rank 1, this layer implements the operation: + * + * memory = push(conv1d(inputs, weights_feature, feature_dim, + * "PADDING_VALID")); + * outputs = activation(memory * weights_time + bias); + * + * Where: + * "weights_feature" is a weights matrix that processes the inputs (by + * convolving the input with every "feature filter"), and whose outputs get + * pushed, stacked in order, into the fixed-size "memory" (the oldest entry + * gets dropped); + * "weights_time" is a weights matrix that processes the "memory" (by a + * batched matrix multiplication on the num_units); + * "bias" is an optional bias vector (added to each output vector in the + * batch); and + * "activation" is the function passed as the "fused_activation_function" + * argument (if not "NONE"). + * + * Each rank adds a dimension to the weights matrices by means of stacking + * the filters. + * \param [in] graph The reference to the parent graph. + * \param [in] input The input tensor data, A 2-D tensor of type T, of shape [input_size, batch_size], where + * "batch_size" corresponds to the batching dimension, and "input_size" is + * the size of the input. + * \param [in] svdf_params Pointer to parameters of type \ref vx_nn_svdf_params + * \param [in] size_of_svdf_params [static] Size in bytes of vx_nn_svdf_params. + * \param [out] state_out A 2-D tensor, of shape [(memory_size - 1) * num_units * rank, batch_size]. + * \param [out] output The output tensor data, Outputs will have the same number of dimensions as input. + * \returns vx_node. + * \retval 0 Node could not be created. + * \retval * Node handle. + * \ingroup group_cnn + * \version 0.4 + */ +VX_API_ENTRY vx_node VX_API_CALL vxSVDFLayer( + vx_graph graph, + vx_tensor input, + const vx_nn_svdf_params svdf_params, + vx_size size_of_svdf_params, + vx_tensor state_out, + vx_tensor output + ); + +/*! \brief Input parameter for Pooling layer2 + * \ingroup group_cnn + */ +typedef struct _vx_nn_pooling_params_t +{ + vx_enum pool_type; /*!< \brief either max pooling or average pooling, see \ref vx_convolutional_network_pooling_type_e. */ + vx_uint32 pool_size_x; /*!< \brief Size of the pooling region in the x dimension. */ + vx_uint32 pool_size_y; /*!< \brief Size of the pooling region in the y dimension. */ + vx_uint32 pool_pad_x_left; /*!< \brief Padding size in the left of x dimension. */ + vx_uint32 pool_pad_x_right; /*!< \brief Padding size in the right of x dimension. */ + vx_uint32 pool_pad_y_top; /*!< \brief Padding size in the top of y dimension. */ + vx_uint32 pool_pad_y_bottom; /*!< \brief Padding size in the bottom of y dimension. */ + vx_enum rounding; /*!< \brief Rounding method for calculating output dimensions. See \ref vx_convolutional_network_rounding_type_e */ +} vx_nn_pooling_params_t; + + +/*! \brief Extended input parameter for Pooling layer2 + * \ingroup group_cnn + * \version 0.4 + */ +typedef struct _vx_nn_pooling_params_ext_t +{ + vx_nn_pooling_params_t base; /*!< \brief The base definition.\ref vx_nn_pooling_params_t */ + vx_uint32 stride_x; /*!< \brief Skip x jump for down scale. */ + vx_uint32 stride_y; /*!< \brief Skip y jump for down scale. */ +} vx_nn_pooling_params_ext_t; + + +/*! \brief [Graph] Creates a Convolutional Network Pooling Layer Node, this function can support uneven padding. + * \details Pooling is done on the first 2 dimensions or the \ref vx_tensor. 
Therefore, we use here the term x for the first dimension and y for the second.\n
+ * Pooling operation is a function operation over a rectangle size and then a nearest neighbour down-scale.
+ * Here we use pool_size_x and pool_size_y to specify the rectangle size on which the operation
+ * is performed. \n
+ * Before the operation (average or maximum value) is done, the data is padded in the first 2 dimensions with zeros.
+ * The down-scale is done by picking the results according to a skip jump. The skip in the x and y dimension is determined by the output size dimensions.
+ * \param [in] graph The handle to the graph.
+ * \param [in] inputs The input tensor data. 3 lower dimensions represent a single input, 4th dimension for batch of inputs is optional. Dimension layout is [width, height, #IFM, #batches].
+ * See \ref vxCreateTensor and \ref vxCreateVirtualTensor
+ * \param [in] pooling_params [static] Pointer to parameters of type \ref vx_nn_pooling_params_t
+ * \param [in] size_of_pooling_params [static] Size in bytes of pooling_params.
+ * \param [out] outputs The output tensor data. Output will have the same number of dimensions as input.
+ * \return vx_node.
+ * \retval 0 Node could not be created.
+ * \retval * Node handle.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxPoolingLayer2(
+    vx_graph graph,
+    vx_tensor inputs,
+    const vx_nn_pooling_params_t * pooling_params,
+    vx_size size_of_pooling_params,
+    vx_tensor outputs);
+
+/*! \brief [Graph] Performs element-wise division on element values in the input tensor data.
+ * \param [in] graph The handle to the graph.
+ * \param [in] in1 Input tensor data.
+ * \param [in] in2 Input tensor data. The inputs must be equal in dimensions;
+ * otherwise, if one of the input dimensions is 1,
+ * that dimension is treated as a constant over all terms of that dimension
+ * and the operation behaves as if the values were duplicated along it.
+ * After this expansion, the dimensions are equal.
+ * \param [in] scale [static] The scale value.
+ * \param [in] overflow_policy [static] A vx_convert_policy_e enumeration.
+ * \param [in] rounding_policy [static] A vx_round_policy_e enumeration.
+ * \param [out] out The output tensor data with the same dimensions as the input tensor data.
+ * \ingroup group_tensor
+ * \return vx_node.
+ * \retval 0 Node could not be created.
+ * \retval * Node handle.
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxTensorDivideNode(vx_graph graph, vx_tensor in1, vx_tensor in2, vx_scalar scale, vx_enum overflow_policy, vx_enum rounding_policy, vx_tensor out);
+
+/*! \brief [Graph] Performs a table lookup (LUT) on element values in the input tensor data.
+ * \param [in] graph The handle to the graph.
+ * \param [in] in1 Input tensor data.
+ * \param [in] lut LUT tensor data.
+ * \param [out] out The output tensor data with the same dimensions as the input tensor data.
+ * \ingroup group_tensor
+ * \return vx_node.
+ * \retval 0 Node could not be created.
+ * \retval * Node handle.
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxTensorTableLookupNode2(vx_graph graph, vx_tensor in1, vx_tensor lut, vx_tensor out);
+
+/*! \brief [Graph] Performs a dimension permutation (transpose) on the input tensor.
+* The node transposes the tensor according to the permutation that perm gives.
+* \param [in] graph The handle to the graph.
+* \param [in] in Input tensor data.
+* \param [out] out Output tensor data.
+* \param [in] perm [static] The permutation of the dimensions. If not given, a full reversed transpose is done according to the input tensor dimensions.
+* \param [in] sizes_of_perm [static] The number of elements in perm.
+* \ingroup group_tensor
+* \return vx_node.
+* \retval 0 Node could not be created.
+* \retval * Node handle.
+*/
+VX_API_ENTRY vx_node VX_API_CALL vxTensorPermuteNode(vx_graph graph, vx_tensor in, vx_tensor out, vx_uint32* perm, vx_uint32 sizes_of_perm);
+
+/*! \brief [Graph] Computes the sum of elements across dimensions of the input tensor.
+* \param [in] graph The handle to the graph.
+* \param [in] in Input tensor data.
+* \param [out] out Output tensor data.
+* \param [in] reduce_dim [static] Used to determine which dimension(s) to sum across (dimension 0 means width, etc.). If not given, compute the sum across all dimensions.
+* \param [in] dim_size [static] Used to specify the array size of reduce_dim.
+* \param [in] keep_dim [static] Specifies whether to keep the dimension count.
+* \ingroup group_tensor
+* \return vx_node.
+* \retval 0 Node could not be created.
+* \retval * Node handle.
+* \version 0.3
+*/
+VX_API_ENTRY vx_node VX_API_CALL vxTensorReduceSumNode(vx_graph graph, vx_tensor in, vx_tensor out, vx_uint32* reduce_dim, vx_int32 dim_size, vx_bool keep_dim);
+
+
+/*! \brief Input parameter structure for TensorPadNode
+ * \ingroup group_tensor
+ * \version 0.3
+ */
+typedef struct _vx_nn_pad_params_t
+{
+    vx_int32 * pad_front_array; /*!< \brief An array of values which specify how many values are added on the front (left, top, etc.) of a tensor. */
+    vx_int32 * pad_back_array; /*!< \brief An array of values which specify how many values are added on the back (right, bottom, etc.) of a tensor. */
+    vx_uint8 numViewDimensions; /*!< \brief The size of the two arrays. */
+    vx_enum pad_mode; /*!< \brief A VX_TYPE_ENUM of the \ref vx_pad_mode_e enumeration. */
+    vx_scalar pad_const; /*!< \brief The constant value used when the pad mode is set to const; the value is the base value, not the quantized value. */
+
+} vx_nn_pad_params_t, * vx_nn_pad_params;
+
+
+/*! \brief [Graph] Performs padding on the input tensor with different pad modes.
+* \param [in] graph The handle to the graph.
+* \param [in] in Input tensor data.
+* \param [out] out Output tensor data.
+* \param [in] pad_params [static] Contains pad left, right, top, bottom, pad mode, const value, etc.
+* \param [in] size_of_pad_params [static] The size of pad_params.
+* \ingroup group_tensor
+* \return vx_node.
+* \retval 0 Node could not be created.
+* \retval * Node handle.
+* \version 0.3
+*/
+VX_API_ENTRY vx_node VX_API_CALL vxTensorPadNode(vx_graph graph, vx_tensor in, vx_tensor out, const vx_nn_pad_params pad_params, vx_size size_of_pad_params);
+
+/*! \brief [Graph] Performs a copy from the source tensor to the destination tensor.
+*\details This copy function also performs format conversion if the src tensor and dst tensor have different formats.
+* Dequantization can be done by this function.
+* \param [in] graph The handle to the graph.
+* \param [in] src Input tensor data.
+* \param [out] dst Output tensor data.
+* \note The copy size is min(srcSize, dstSize).
+* \return vx_node.
+* \retval 0 Node could not be created.
+* \retval * Node handle.
+* \ingroup group_tensor
+*/
+VX_API_ENTRY vx_node VX_API_CALL vxTensorCopyNode(vx_graph graph, vx_tensor src, vx_tensor dst);
+
+/*! \brief Input parameter for vxTensorReverse
+ * \ingroup group_cnn
+ */
+typedef struct _vx_nn_tensor_reverse_params_t
+{
+    vx_int32 *axis; /*!< \brief Array of axes. */
+    vx_uint32 numberOfAxis; /*!< \brief Number of axes; the max value is 4. */
+}
+vx_nn_tensor_reverse_params_t;
+
+/*! \brief [Graph] Performs reverse on the input tensor.
+* \param [in] graph The handle to the graph.
+* \param [in] inputs Input tensor data.
+* \param [in] tensor_reverse_params [static] Pointer to parameters of type \ref vx_nn_tensor_reverse_params_t.
+* \param [in] size_of_tensor_reverse_params [static] The size of tensor_reverse_params.
+* \param [out] outputs Output tensor data.
+* \ingroup group_tensor
+* \return vx_node.
+* \retval 0 Node could not be created.
+* \retval * Node handle.
+*/
+VX_API_ENTRY vx_node VX_API_CALL vxTensorReverse(vx_graph graph, vx_tensor inputs, const vx_nn_tensor_reverse_params_t * tensor_reverse_params, vx_size size_of_tensor_reverse_params, vx_tensor outputs);
+
+/*! \brief Input parameter for L2Normalize layer2
+ *\ingroup group_cnn
+ *\version 0.4
+ */
+typedef struct _vx_nn_l2norm_params_t
+{
+    vx_int32 axis; /*!< \brief The axis along which to apply L2 normalization. */
+} vx_nn_l2norm_params_t;
+
+/*! \brief [Graph] Creates a Convolutional Network L2Normalize Layer Node.
+* \param [in] graph The handle to the graph.
+* \param [in] inputs The input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Dimension layout is [width, height, #IFM, #batches].
+ * See \ref vxCreateTensor and \ref vxCreateVirtualTensor.
+* \param [out] outputs The output tensor data. Output will have the same number of dimensions as input.
+* \ingroup group_cnn
+* \return vx_node.
+* \retval 0 Node could not be created.
+* \retval * Node handle.
+*/
+VX_API_ENTRY vx_node VX_API_CALL vxL2NormalizeLayer(vx_graph graph, vx_tensor inputs, vx_tensor outputs);
+
+/*! \brief [Graph] Creates a Convolutional Network L2Normalize Layer2 Node.
+ * \param [in] graph The handle to the graph.
+* \param [in] inputs The input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Dimension layout is [width, height, #IFM, #batches].
+ * See \ref vxCreateTensor and \ref vxCreateVirtualTensor.
+* \param [in] l2norm_params [static] Pointer to parameters of type \ref vx_nn_l2norm_params
+* \param [in] size_of_l2norm_params [static] Size in bytes of vx_nn_l2norm_params.
+* \param [out] outputs The output tensor data. Output will have the same number of dimensions as input.
+* \ingroup group_cnn
+* \return vx_node.
+* \retval 0 Node could not be created.
+* \retval * Node handle.
+*/
+VX_API_ENTRY vx_node VX_API_CALL vxL2NormalizeLayer2(
+    vx_graph graph,
+    vx_tensor inputs,
+    const vx_nn_l2norm_params_t * l2norm_params,
+    vx_size size_of_l2norm_params,
+    vx_tensor outputs);
+
+/*! \brief Input parameter structure for RPNLayer
+ *\ingroup group_cnn
+ */
+typedef struct _vx_nn_rpn_params_t
+{
+    vx_uint32 feature_stride; /*!< \brief Image feature stride. */
+    vx_uint32 min_size; /*!< \brief The smallest rectangular box size. */
+    vx_uint32 pre_nms_topn; /*!< \brief Before NMS, take pre_nms_topn rectangles for NMS. */
+    vx_uint32 post_nms_topn; /*!< \brief After NMS, take post_nms_topn rectangles for the proposals output. */
+    vx_float32 nms_thresh; /*!< \brief The IOU threshold. */
+} vx_nn_rpn_params_t;
+
+/*! \brief [Graph] Creates a Region Proposal Networks Layer Node.
+ * \details A Region Proposal Network (RPN) takes an image (of any size) as input and outputs a set of rectangular object proposals,
+ * each with an objectness score.
+ * \param [in] graph The handle to the graph.
+ * \param [in] score The score tensor data. It has 2 types of values: foreground and background. Only foreground objects are needed.
+ * \param [in] bbox The bounding box regressor tensor data. Used for bounding box regression.
+ * \param [in] anchors The anchor box tensor data. A set of rectangles generated by scale and aspect ratio. + * \param [in] img_info [static] The image information tensor data. 4 elements: image width, image height, image width scale, image height scale. + * \param [in] rpn_params [static] Pointer to parameters of type \ref vx_nn_rpn_params_t + * \param [in] size_of_rpn_params [static] Size in bytes of vx_nn_rpn_params. + * \param [in] roi_output The output tensor. The proposals output tensor data. This information used by ROI pooling + * \param [in] score_output The output tensor. The proposals score output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxRPNLayer( + vx_graph graph, + vx_tensor score, + vx_tensor bbox, + vx_tensor anchors, + vx_tensor img_info, + const vx_nn_rpn_params_t * rpn_params, + vx_size size_of_rpn_params, + vx_tensor roi_output, + vx_tensor score_output + ); + +/*! \brief Input parameters for a lstm operation. + * \ingroup group_cnn + * \version 0.3 + */ +typedef struct _vx_nn_lstm_params_t +{ + vx_tensor input2input_weight; /*!< \brief Optional A 2-D tensor of type T, of shape [num_units, input_size]. where "num_units" corresponds to the number of cell units.*/ + vx_tensor input2forget_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, input_size].*/ + vx_tensor input2cell_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, input_size].*/ + vx_tensor input2output_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, input_size].*/ + + vx_tensor recurrent2input_weight; /*!< \brief Optional A 2-D tensor of type T, of shape [num_units, output_size]. where "output_size" corresponds to either the number of cell units (i.e., "num_units"), or the second dimension of the "projection_weights", if defined.*/ + vx_tensor recurrent2forget_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, output_size].*/ + vx_tensor recurrent2cell_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, output_size].*/ + vx_tensor recurrent2output_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, output_size].*/ + + vx_tensor cell2input_weight; /*!< \brief Optional A 1-D tensor of type T, of shape [num_units].*/ + vx_tensor cell2forget_weight; /*!< \brief Optional A 1-D tensor of type T, of shape [num_units].*/ + vx_tensor cell2output_weight; /*!< \brief Optional A 1-D tensor of type T, of shape [num_units].*/ + + vx_tensor input_gate_bias; /*!< \brief Optional A 1-D tensor of type T, of shape [num_units].*/ + vx_tensor forget_gate_bias; /*!< \brief A 1-D tensor of type T, of shape [num_units].*/ + vx_tensor cell_bias; /*!< \brief A 1-D tensor of type T, of shape [num_units].*/ + vx_tensor output_gate_bias; /*!< \brief A 1-D tensor of type T, of shape [num_units].*/ + + vx_tensor projection_weight; /*!< \brief Optional A 2-D tensor of type T, of shape [output_size, num_units].*/ + vx_tensor projection_bias; /*!< \brief Optional A 1-D tensor of type T, of shape [output_size].*/ + + vx_tensor activation; /*!< \brief Optional. An ActivationFunctionType indicating the activation function. 
If "NONE" is specified then it results in a linear activation.*/
+    vx_tensor cell_clip; /*!< \brief A clipping threshold for the cell state, such that values are bound within [-cell_clip, cell_clip]. If set to 0.0 then clipping is disabled.*/
+    vx_tensor proj_clip; /*!< \brief A clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.*/
+} vx_nn_lstm_params_t;
+
+/*! \brief Extension parameters for an LSTM unit operation.
+ * \ingroup group_cnn
+ */
+typedef struct _vx_nn_lstm_params_ext_t
+{
+    vx_nn_lstm_params_t base; /*!< \brief standard structure head.*/
+    vx_tensor forget_bias; /*!< \brief A bias (float 32) for the forget gate. If set to 0.0f (the default) then the bias is ignored.*/
+
+    vx_float32 norm_gain; /*!< \brief Float32[static] The layer normalization gain initial value (default is 1.0f).*/
+    vx_float32 norm_shift; /*!< \brief Float32[static] The layer normalization shift initial value (default is 0.0f).*/
+
+    vx_tensor sequence_length; /*!< \brief Optional[static] Specifies the length of each sequence in inputs. An `int32` (tensor) of size `[batch_size]`, values in `[0, time_len)` or None (by default).*/
+
+    /* Since Android NN API level 29 there are additional inputs to this op: */
+    vx_tensor layernorm2input_weight; /*!< \brief [Optional] The input layer normalization weights. A 1-D tensor of shape [num_units]. Used to rescale normalized inputs to activation at the input gate.*/
+    vx_tensor layernorm2forget_weight; /*!< \brief [Optional] The forget layer normalization weights. A 1-D tensor of shape [num_units]. Used to rescale normalized inputs to activation at the forget gate.*/
+    vx_tensor layernorm2cell_weight; /*!< \brief [Optional] The cell layer normalization weights. A 1-D tensor of shape [num_units]. Used to rescale normalized inputs to activation at the cell gate.*/
+    vx_tensor layernorm2output_weight; /*!< \brief [Optional] The output layer normalization weights. A 1-D tensor of shape [num_units]. Used to rescale normalized inputs to activation at the output gate.*/
+} vx_nn_lstm_params_ext_t;
+
+/*! \brief Input parameters for an LSTM layer operation.
+ * \ingroup group_cnn
+ */
+typedef struct _vx_nn_lstm_layer_params_t
+{
+    vx_nn_lstm_params_t lstm_param; /*!< \brief lstm input param \ref vx_nn_lstm_params_t.*/
+    vx_enum lstm_layer_type; /*!< \brief lstm layer type.*/
+} vx_nn_lstm_layer_params_t;
+
+/*! \brief Extended input parameters for an LSTM layer operation.
+ * \ingroup group_cnn
+ */
+typedef struct _vx_nn_lstm_layer_params_ext_t
+{
+    vx_nn_lstm_params_ext_t lstm_param; /*!< \brief lstm input param \ref vx_nn_lstm_params_ext_t.*/
+    vx_enum lstm_layer_type; /*!< \brief lstm layer type.*/
+} vx_nn_lstm_layer_params_ext_t;
+
+/*! \brief [Graph] Creates a Long Short-Term Memory (LSTM) Unit Networks Layer Node.
+ * \details
+ *  The default non-peephole implementation is based on:
+ *  http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
+ *  S. Hochreiter and J. Schmidhuber. "Long Short-Term Memory". Neural
+ *  Computation, 9(8):1735-1780, 1997.
+ *
+ *  The peephole implementation is based on:
+ *  https://research.google.com/pubs/archive/43905.pdf
+ *  Hasim Sak, Andrew Senior, and Francoise Beaufays. "Long short-term memory
+ *  recurrent neural network architectures for large scale acoustic modeling."
+ *  INTERSPEECH, 2014.
+ * + * The coupling of input and forget gate (CIFG) is based on: + * http://arxiv.org/pdf/1503.04069.pdf + * Greff et al. "LSTM: A Search Space Odyssey" + * + * The class has the following independently optional inputs: + * * If input gate (if CIFG): "input_to_forget_weights", + * "recurrent_to_input_weights", "cell_to_input_weights", "input_gate_bias". + * * If no peephole connections: "cell_to_input_weights", + * "cell_to_forget_weights", "cell_to_output_weights". + * * If no projection layer: "projection_weights" and "projection_bias". + * * If no projection bias: "projection_bias". + * + * \param [in] graph The handle to the graph. + * \param [in] input A 2-D tensor of type T, of shape [input_size, batch_size], where + * "batch_size" corresponds to the batching dimension, and "input_size" + * is the size of the input. + * \param [in] output_state_in A 2-D tensor of type T, of shape [output_size, batch_size]. + * \param [in] cell_state_in A 2-D tensor of type T, of shape [num_units, batch_size]. + * \param [in] lstm_params LSTM paraments \ref vx_nn_lstm_params_t . + * \param [in] size_of_lstm_params [static] The size of the lstm_params. + * \param [out] scratch A 3-D tensor of type T, of shape [num_cell, 4, batch_size]. + * \param [out] output_state_out A 2-D tensor of type T, of shape [output_size, batch_size]. + * \param [out] cell_state_out A 2-D tensor of type T, of shape [num_units, batch_size]. + * \param [out] output A 2-D tensor of type T, of shape [output_size, batch_size]. + * This is effectively the same as the current "output_state" value. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + * \version 0.3 + */ +VX_API_ENTRY vx_node VX_API_CALL vxLstmUnitLayer( + vx_graph graph, + vx_tensor input, + vx_tensor output_state_in, + vx_tensor cell_state_in, + const vx_nn_lstm_params_t * lstm_params, + vx_size size_of_lstm_params, + vx_tensor scratch, + vx_tensor output_state_out, + vx_tensor cell_state_out, + vx_tensor output); + +/*! \brief [Graph] Creates a Long short-term memory layer (LSTM) Networks Layer Node. + * \details + * + * \param [in] graph The handle to the graph. + * \param [in] input A 3-D tensor of type T, of shape [input_size, batch_size, time_step], where + * "input_size" corresponds to the size of the input, and "batch_size" + * is the batching dimension, time_step means time length actually used by the input. + * \param [in] static_input optional, A 2-D tensor of type T, of shape [input_size, batch_size], where + * "input_size" corresponds to the size of the input, and "batch_size" + * is the batching dimension. + * \param [in] cont optional, A 2-D tensor of type T, of shape [input_size, batch_size], where + * "input_size" corresponds to the size of the input, and "batch_size" + * is the batching dimension. + * \param [in] lstm_layer_params LSTM paraments \ref vx_nn_lstm_layer_params_t . + * \param [in] size_of_lstm_layer_params [static] The size of the lstm_layer_params. + * \param [out] output A 2-D/3D tensor of type T, of shape [output_size, batch_size] or [output_size, batch_size, time]. + * This is effectively the same as the current "output_state" value. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. 
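+ * \par Example
+ * An illustrative sketch only (the graph, tensors and all weight/bias tensors
+ * inside the parameter structure are assumed to have been created elsewhere;
+ * the optional static_input and cont inputs are passed as NULL here):
+ * \code
+ * vx_nn_lstm_layer_params_t params;
+ * // fill params.lstm_param with the weight, bias and state tensors,
+ * // and set params.lstm_layer_type as required
+ * vx_node node = vxLstmLayer(graph, input, NULL, NULL, &params, sizeof(params), output);
+ * vx_status status = vxGetStatus((vx_reference)node); // check that creation succeeded
+ * \endcode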
+ * \ingroup group_cnn + * \version 0.3 + */ +VX_API_ENTRY vx_node VX_API_CALL vxLstmLayer( + vx_graph graph, + vx_tensor input, + vx_tensor static_input, + vx_tensor cont, + const vx_nn_lstm_layer_params_t * lstm_layer_params, + vx_size size_of_lstm_layer_params, + vx_tensor output + ); + +/*! \brief [Graph] Creates transpose layer node. +* \details +* Transposes the input tensor, permuting the dimensions according to perm tensor. +* +* \param [in] graph The handle to the graph. +* \param [in] input A n-D tensor, specifying the tensor to be transposed. +* \param [in] transpose_params paraments \ref vx_nn_transpose_params_t . +* \param [in] size_of_transpose_param [static] The size of the vx_nn_transpose_params_t. +* \param [out] output A n-D tensor of the same type as input. +* \return vx_node. +* \returns A node reference \ref vx_node. Any possible errors preventing a +* successful creation should be checked using \ref vxGetStatus. +* \ingroup group_tensor +* \version 0.5 +*/ +VX_API_ENTRY vx_node VX_API_CALL vxTensorTransposeNode2( + vx_graph graph, + vx_tensor inputs, + const vx_nn_transpose_params_t *transpose_params, + vx_size size_of_transpose_param, + vx_tensor outputs); + +/*! \brief [Graph] Creates mean layer node. +* \details +* Computes the mean of elements across dimensions of a tensor. +* +* \param [in] graph The handle to the graph. +* \param [in] input A n-D tensor, specifying the input. +* \param [in] mean_params paraments \ref vx_nn_mean_params_t . +* \param [in] size_of_mean_param [static] The size of the vx_nn_mean_params_t. +* \param [out] output A n-D tensor of the same type as input. +* \return vx_node. +* \returns A node reference \ref vx_node. Any possible errors preventing a +* successful creation should be checked using \ref vxGetStatus. +* \ingroup group_tensor +* \version 0.5 +*/ +VX_API_ENTRY vx_node VX_API_CALL vxTensorMeanNode( + vx_graph graph, + vx_tensor inputs, + const vx_nn_mean_params_t *mean_params, + vx_size size_of_mean_param, + vx_tensor outputs); + +/*! \brief [Graph] Creates squeeze layer node. +* \details +* Remove dimensions of size 1 from the input tensor. +* +* \param [in] graph The handle to the graph. +* \param [in] input A n-D tensor, specifying the tensor to be squeezed. +* \param [in] squeeze_params paraments \ref vx_nn_squeeze_params_t . +* \param [in] size_of_squeeze_param [static] The size of the vx_nn_squeeze_params_t. +* \param [out] output A n-D tensor of the same type as input. Contains the same data as input, +* but has one or more dimensions of size 1 removed. +* \return vx_node. +* \returns A node reference \ref vx_node. Any possible errors preventing a +* successful creation should be checked using \ref vxGetStatus. +* \ingroup group_tensor +* \version 0.5 +*/ +VX_API_ENTRY vx_node VX_API_CALL vxTensorSqueezeNode( + vx_graph graph, + vx_tensor inputs, + const vx_nn_squeeze_params_t *squeeze_params, + vx_size size_of_squeeze_param, + vx_tensor outputs); + +/*! \brief [Graph] Creates stride slice layer node. +* \details +* Extracts a stride slice of a tensor. +* +* \param [in] graph The handle to the graph. +* \param [in] input A n-D tensor, specifying the tensor to be sliced. +* \param [in] stride_slice_params paraments \ref vx_nn_stride_slice_params_t . +* \param [in] size_of_stride_slice_param [static] The size of the vx_nn_stride_slice_params_t. +* \param [out] output A n-D tensor of the same type as input. +* \return vx_node. +* \returns A node reference \ref vx_node. 
Any possible errors preventing a +* successful creation should be checked using \ref vxGetStatus. +* \ingroup group_tensor +* \version 0.5 +*/ +VX_API_ENTRY vx_node VX_API_CALL vxTensorStrideSliceNode( + vx_graph graph, + vx_tensor inputs, + const vx_nn_stride_slice_params_t *stride_slice_params, + vx_size size_of_stride_slice_param, + vx_tensor outputs); + +/*! \brief Input parameters for query hardware caps. + * \ingroup group_context + */ +typedef struct _vx_hardware_caps_params_t +{ + vx_uint32 ecoID; /*!< \brief hardware eco ID.*/ + vx_uint32 customerID; /*!< \brief hardware custmoer ID. ecoID and custmomerID can identify a unique hardware.*/ + vx_bool evis1; /*!< \brief evs1 If true, hardware support evis1.*/ + vx_bool evis2; /*!< \brief evs2 If true, hardware support evis2.*/ +} vx_hardware_caps_params_t; + +/*! \brief Queries hardware caps information. + * \param [in] context The reference to the context. + * \param [in] hardware_caps_params \ref vx_hardware_caps_params_t . + * \param [in] size_of_hardware_caps_param [static] Size in bytes of hardware_caps_params. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE context is not a valid \ref vx_context reference. + * \retval VX_ERROR_INVALID_PARAMETERS If any of the other parameters are incorrect. + * \ingroup group_context + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryHardwareCaps( + vx_context context, + const vx_hardware_caps_params_t * hardware_caps_params, + vx_size size_of_hardware_caps_param + ); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h new file mode 100644 index 0000000..e27a37a --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h @@ -0,0 +1,658 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _VX_KHR_NN_INTERNAL_H_ +#define _VX_KHR_NN_INTERNAL_H_ + +/*! + * \file + * \brief The Khronos Extension for Deep Convolutional Networks Functions. + * + * \defgroup group_cnn Extension: Deep Convolutional Networks API + * \brief Convolutional Network Nodes. + */ + +#define OPENVX_KHR_NN_INTERNAL "vx_khr_nn_internal" + +#include + + +#ifdef __cplusplus +extern "C" { +#endif + +/*TODO: check it for OpenVX 1.2*/ +//#if defined(OPENVX_CNN_1_0) +//#undef OPENVX_CNN_1_1 +//#endif + +/*! \brief [Graph] Creates a Convolutional Network Convolution and Activation(Relu) and pooling Layer Node. +* \details This function implement Convolutional Network Convolution and Activation(Relu) and pooling layer. +* \param [in] graph The handle to the graph. +* \param [in] inputs The input tensor data. 3 lower dimensions represent a single input, all following dimensions represent number of batches, possibly nested. +* The dimension order is [width, height, #IFM, #batches]. 
\n +* \param [in] weights_biases [static] Point to WeightBiasesParameter data, vx_weights_biases_parameter is an opaque reference.\n +* \param [in] pad_x [static] Number of elements added at each side in the x dimension of the input. +* \param [in] pad_y [static] Number of elements added at each side in the y dimension of the input. In fully connected layers this input is ignored. +* \param [in] accumulator_bits [static] Is the total number of bits used during intermediate accumulation. +* \param [in] overflow_policy [static] A VX_TYPE_ENUM of the vx_convert_policy_e enumeration. +* \param [in] rounding_policy [static] A VX_TYPE_ENUM of the vx_round_policy_e enumeration. +* \param [in] down_scale_size_rounding [static] Rounding method for calculating output dimensions. See \ref vx_convolutional_network_rounding_type_e +* \param [in] enable_relu [static] If true, enable vxActivationLayer's relu function +* \param [in] pool_type [static] if neither max pooling nor average pooling, disable pooling function. (see \ref vx_convolutional_network_pooling_type_e). +* \param [in] pool_size_x [static] Size of the pooling region in the x dimension +* \param [in] pool_size_y [static] Size of the pooling region in the y dimension. +* \param [out] outputs The output tensor data. Output will have the same number and structure of dimensions as input. +* \return vx_node. +* \retval 0 Node could not be created. +* \retval * Node handle. +* \ingroup group_cnn +*/ +VX_API_ENTRY vx_node VX_API_CALL vxConvolutionReluPoolingLayer( + vx_graph graph, + vx_tensor inputs, + vx_weights_biases_parameter weights_biases, + vx_uint32 pad_x, + vx_uint32 pad_y, + vx_uint8 accumulator_bits, + vx_enum overflow_policy, + vx_enum rounding_policy, + vx_enum down_scale_size_rounding, + vx_bool enable_relu, + vx_enum pool_type, + vx_uint32 pool_size_x, + vx_uint32 pool_size_y, + vx_tensor outputs + ); + +/*! \brief [Graph] Creates a Convolutional Network Convolution and Activation(Relu) Layer Node. +* \details This function implement Convolutional Network Convolution and Activation(Relu) layer. +* \param [in] graph The handle to the graph. +* \param [in] inputs The input tensor data. 3 lower dimensions represent a single input, all following dimensions represent number of batches, possibly nested. + * The dimension order is [width, height, #IFM, #batches]. \n +* \param [in] weights_biases [static] Point to WeightBiasesParameter data, vx_weights_biases_parameter is an opaque reference. +* \param [in] pad_x [static] Number of elements added at each side in the x dimension of the input. +* \param [in] pad_y [static] Number of elements added at each side in the y dimension of the input. In fully connected layers this input is ignored. +* \param [in] accumulator_bits [static] Is the total number of bits used during intermediate accumulation. +* \param [in] overflow_policy [static] A VX_TYPE_ENUM of the vx_convert_policy_e enumeration. +* \param [in] rounding_policy [static] A VX_TYPE_ENUM of the vx_round_policy_e enumeration. +* \param [in] down_scale_size_rounding [static] Rounding method for calculating output dimensions. See \ref vx_convolutional_network_rounding_type_e +* \param [in] enable_relu [static] If true, enable vxActivationLayer's relu function. +* \param [out] outputs The output tensor data. Output will have the same number and structure of dimensions as input. +* \return vx_node. +* \retval 0 Node could not be created. +* \retval * Node handle. 
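+* \par Example
+* An illustrative sketch only (the graph, tensors and the opaque weights_biases
+* object are assumed to have been created elsewhere; the pad and accumulator
+* values below are placeholders):
+* \code
+* vx_node node = vxConvolutionReluLayer(graph, inputs, weights_biases,
+*     1, 1,                                            // pad_x, pad_y
+*     16,                                              // accumulator_bits (placeholder)
+*     VX_CONVERT_POLICY_SATURATE,
+*     VX_ROUND_POLICY_TO_ZERO,
+*     VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR,
+*     vx_true_e,                                       // enable_relu
+*     outputs);
+* vx_status status = vxGetStatus((vx_reference)node);  // check that creation succeeded
+* \endcode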
+* \ingroup group_cnn +*/ + +VX_API_ENTRY vx_node VX_API_CALL vxConvolutionReluLayer( + vx_graph graph, + vx_tensor inputs, + vx_weights_biases_parameter weights_biases, + vx_uint32 pad_x, + vx_uint32 pad_y, + vx_uint8 accumulator_bits, + vx_enum overflow_policy, + vx_enum rounding_policy, + vx_enum down_scale_size_rounding, + vx_bool enable_relu, + vx_tensor outputs + ); + +/*! \brief [Graph] Creates a Fully connected and Activation(Relu) Convolutional Network Layer Node. +* \details This function implement Fully connected and Activation(Relu) Convolutional Network layers. +* \param [in] graph The handle to the graph. +* \param [in] inputs The input tensor data. There two possible input layouts: +* 1. [#IFM, #batches]. See \ref vxCreateTensor and \ref vxCreateVirtualTensor. +* 2. [width, height, #IFM, #batches]. See \ref vxCreateTensor and \ref vxCreateVirtualTensor\n +* In both cases number of batches are optional and may be multidimensional. +* The second option is a special case to deal with convolution layer followed by fully connected. +* The dimension order is [#IFM, #batches]. See \ref vxCreateTensor and \ref vxCreateVirtualTensor. Note that batch may be multidimensional. +* \param [in] weights_biases [static] Point to WeightBiasesParameter data, vx_weights_biases_parameter is an opaque reference.\n +* \param [in] pad [static] Number of elements added at each side in the input. +* \param [in] accumulator_bits [static] Is the total number of bits used during intermediate accumulation. +* \param [in] overflow_policy [static] A VX_TYPE_ENUM of the vx_convert_policy_e enumeration. +* \param [in] rounding_policy [static] A VX_TYPE_ENUM of the vx_round_policy_e enumeration. +* \param [in] down_scale_size_rounding [static] Rounding method for calculating output dimensions. See \ref vx_convolutional_network_rounding_type_e +* \param [in] enable_relu [static] If true, enable vxActivationLayer's relu function. +* \param [out] outputs The output tensor data. Output dimension layout is [#OFM,#batches]. See \ref vxCreateTensor and \ref vxCreateVirtualTensor, where #batches may be multidimensional. +* \return vx_node. +* \retval 0 Node could not be created. +* \retval * Node handle. +* \ingroup group_cnn +*/ +VX_API_ENTRY vx_node VX_API_CALL vxFullyConnectedReluLayer( + vx_graph graph, + vx_tensor inputs, + vx_weights_biases_parameter weights_biases, + vx_uint32 pad, + vx_uint8 accumulator_bits, + vx_enum overflow_policy, + vx_enum rounding_policy, + vx_enum down_scale_size_rounding, + vx_bool enable_relu, + vx_tensor outputs + ); + +/*! \brief Input parameter for convolutionReluPooling2 + * \ingroup group_cnn + */ +typedef struct _vx_nn_convolution_relu_pooling_params_t +{ + vx_size dilation_x; /*!< \brief "inflate" the kernel by inserting zeros between the kernel elements in the x direction. + The value is the number of zeros to insert. */ + vx_size dilation_y; /*!< \brief "inflate" the kernel by inserting zeros between the kernel elements in the y direction. + The value is the number of zeros to insert. */ + vx_uint32 pad_x_left; /*!< \brief Number of elements added at each side in the left of x dimension of the input. */ + vx_uint32 pad_x_right; /*!< \brief Number of elements added at each side in the right of x dimension of the input. */ + vx_uint32 pad_y_top; /*!< \brief Number of elements added at each side in the top of y dimension of the input. */ + vx_uint32 pad_y_bottom; /*!< \brief Number of elements added at each side in the bottom of y dimension of the input. 
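
As a usage sketch of the fused fully-connected + ReLU node declared above: the weights/biases object and the rounding enumerant are assumed to be supplied by the caller, and the accumulator width of 0 (let the implementation choose) is an assumption; this only illustrates the call shape.

```c
#include <VX/vx.h>

/* Add a fully-connected + ReLU node; all referenced objects are created by the caller. */
vx_node add_fc_relu(vx_graph graph, vx_tensor input,
                    vx_weights_biases_parameter weights_biases,
                    vx_enum down_scale_size_rounding, /* a vx_convolutional_network_rounding_type_e value */
                    vx_tensor output)
{
    return vxFullyConnectedReluLayer(
        graph,
        input,
        weights_biases,
        0,                           /* pad */
        0,                           /* accumulator_bits: 0 lets the implementation choose (assumption) */
        VX_CONVERT_POLICY_SATURATE,  /* overflow_policy */
        VX_ROUND_POLICY_TO_ZERO,     /* rounding_policy */
        down_scale_size_rounding,
        vx_true_e,                   /* enable_relu */
        output);
}
```
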
*/ + vx_uint8 accumulator_bits; /*!< \brief Is the total number of bits used during intermediate accumulation. */ + vx_enum overflow_policy; /*!< \brief A VX_TYPE_ENUM of the vx_convert_policy_e enumeration. */ + vx_enum rounding_policy; /*!< \brief A VX_TYPE_ENUM of the vx_round_policy_e enumeration. */ + vx_enum down_scale_size_rounding; /*!< \brief Rounding method for calculating output dimensions. See vx_convolutional_network_rounding_type_e */ + vx_bool enable_relu; /*!< \brief Enable Relu layer function or not. */ + vx_enum pool_type; /*!< \brief neither max pooling nor average pooling, disable pooling function (see vx_convolutional_network_pooling_type_e). */ + vx_uint32 pool_size_x; /*!< \brief Size of the pooling region in the x dimension */ + vx_uint32 pool_size_y; /*!< \brief Size of the pooling region in the y dimension. */ + vx_enum pad_mode; /*!< \brief A VX_TYPE_ENUM of the \ref vx_pad_mode_e enumeration. */ + vx_scalar pad_const; /*!< \brief The order const value if setting pad mode to const, the const value is base value, not quantized value. */ +} vx_nn_convolution_relu_pooling_params_t, * vx_nn_convolution_relu_pooling_params; + +/*! \brief Extended input parameter for a convolutionReluPooling2 operation. + * \ingroup group_cnn + *\version 0.3 + */ +typedef struct _vx_nn_convolution_relu_pooling_params_ext_t +{ + vx_nn_convolution_relu_pooling_params_t base; /*!< \brief convolution relu pooling params \ref vx_nn_convolution_relu_pooling_params_t */ + vx_uint32 stride_x; /*!< \brief skip x jump for down scale. */ + vx_uint32 stride_y; /*!< \brief skip y jump for down scale. */ +} vx_nn_convolution_relu_pooling_params_ext_t, * vx_nn_convolution_relu_pooling_params_ext; + +/*! \brief The 2nd version of extended input parameter for a convolutionReluPooling2 operation. + *\ingroup group_cnn + *\version 0.4 + */ +typedef struct _vx_nn_convolution_relu_pooling_params_ext2_t +{ + vx_nn_convolution_relu_pooling_params_ext_t ext; /*!< \brief convolution relu pooling params \ref vx_nn_convolution_relu_pooling_params__ext_t */ + vx_int32 depth_multiplier; /*!< \brief specifying the depthwise multiplier for depthwise convolution. */ + vx_enum src_rank_mode; /*!< \brief source rank mode A VX_TYPE_ENUM of the \ref vx_tensor_rank_type_e enumeration. */ + vx_enum convert_dst_format; /*!< \brief The convert target format. */ +} vx_nn_convolution_relu_pooling_params_ext2_t, * vx_nn_convolution_relu_pooling_params_ext2; + +#define MERGED_NODE_COUNT_MAX 4 + +typedef struct _vx_nn_convolution_relu_pooling_params_ext3_t +{ + vx_nn_convolution_relu_pooling_params_ext2_t ext2; /*!< \brief convolution relu pooling params \ref vx_nn_convolution_relu_pooling_params__ext_t */ + vx_uint32 mergedNodeCount; + vx_float32* interScale; /*!< \brief specifying the depthwise multiplier for depthwise convolution. */ + vx_int32* interZeroPoint; + vx_enum* interDataType; +} vx_nn_convolution_relu_pooling_params_ext3_t, * vx_nn_convolution_relu_pooling_params_ext3; + +/*! \brief [Graph] Creates a Convolutional Network Convolution and Activation(Relu) and Pooling Layer Node, this fucntion match kronos NN Extension 1.2 verion. + * \details This function implement Convolutional Network Convolution and Activation(Relu) and Pooling layer. + * For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. 
The number of the accumulator bits are implementation defined, + * and should be at least 16.\n + * round: rounding according the vx_round_policy_e enumeration. \n + * saturate: A saturation according the vx_convert_policy_e enumeration. + * The following equation is implemented: \n + * \f$ outputs[j,k,i] = saturate(round(\sum_{l} (\sum_{m,n} inputs[j-m,k-n,l] \times weights[m,n,l,i])+biasses[j,k,i])) \f$\n + * Where \f$m,n\f$ are indexes on the convolution matrices. \f$ l\f$ is an index on all the convolutions per input.\f$ i\f$ is an index per output. + * \f$ j,k \f$ are the inputs/outputs spatial indexes. + * Convolution is done on the width and height dimensions of the \ref vx_tensor. Therefore, we use here the term x for index along the width dimension and y for index along the height dimension.\n + * before the Convolution is done, a padding with zeros of the width and height input dimensions is performed. + * Then down scale is done by picking the results according to a skip jump. The skip in the x and y is determined by the output size dimensions. + * The relation between input to output is as follows: \n + * \f$ width_{output} = round(\frac{(width_{input} + paddingleft_x + paddingright_x - kernel_x - (kernel_x -1) * dilation_x)}{skip_x} + 1) \f$\n + * and \n + * \f$ height_{output} = round(\frac{(height + paddingtop_y + paddingbottom_y - kernel_y - (kernel_y -1) * dilation_y)}{skip_y} + 1) \f$\n + * where \f$width\f$ is the size of the input width dimension. \f$height\f$ is the size of the input height dimension. + * \f$width_{output}\f$ is the size of the output width dimension. \f$height_{output}\f$ is the size of the output height dimension. + * \f$kernel_x\f$ and \f$kernel_y\f$ are the convolution sizes in width and height dimensions. + * skip is calculated by the relation between input and output. + * rounding is done according to \ref vx_convolutional_network_rounding_type_e. + * \param [in] graph The handle to the graph. + * \param [in] inputs The input tensor data. 3 lower dimensions represent a single input, all following dimensions represent number of batches, possibly nested. + * The dimension order is [width, height, #IFM, #batches]. \n + * \param [in] weights_biases [static] Point to WeightBiasesParameter data, vx_weights_biases_parameter is an opaque reference. + * \param [in] convolution_relu_pooling_params [static] Pointer to parameters of type \ref vx_nn_convolution_relu_pooling_params_t + * \param [in] size_of_convolution_relu_pooling_params [static] Size in bytes of convolution_relu_pooling_params. + * \param [out] outputs The output tensor data. Output will have the same number and structure of dimensions as input. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxConvolutionReluPoolingLayer2( + vx_graph graph, + vx_tensor inputs, + vx_weights_biases_parameter weights_biases, + const vx_nn_convolution_relu_pooling_params_t * convolution_relu_pooling_params, + vx_size size_of_convolution_relu_pooling_params, + vx_tensor outputs); + +/*! \brief The optimization direvative for weights_biases_parameter create. + * \ingroup group_cnn + */ +typedef struct _vx_weights_biases_parameter_optimizations_t { + vx_int8 zrl; /*!< \brief The zero run length. Set negtive value to disable*/ + vx_enum outputFormat; /*!< \brief The output format. 
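
The extended parameter struct is typically zero-initialized and only the relevant fields filled before creating the node. Passing sizeof the extended struct through the size argument is assumed to be how the implementation distinguishes the base, _ext and _ext2 variants; pooling and rounding enumerants are left to the caller. A hedged sketch:

```c
#include <string.h>
#include <VX/vx.h>

/* Fused 3x3 convolution (stride 2, pad 1) + ReLU + 2x2 pooling. */
vx_node add_conv_relu_pool(vx_graph graph, vx_tensor input,
                           vx_weights_biases_parameter wb, vx_tensor output,
                           vx_enum pool_type,  /* a vx_convolutional_network_pooling_type_e value */
                           vx_enum rounding)   /* a vx_convolutional_network_rounding_type_e value */
{
    vx_nn_convolution_relu_pooling_params_ext_t p;
    memset(&p, 0, sizeof(p));

    p.base.pad_x_left  = p.base.pad_x_right  = 1;
    p.base.pad_y_top   = p.base.pad_y_bottom = 1;
    p.base.overflow_policy          = VX_CONVERT_POLICY_SATURATE;
    p.base.rounding_policy          = VX_ROUND_POLICY_TO_ZERO;
    p.base.down_scale_size_rounding = rounding;
    p.base.enable_relu              = vx_true_e;
    p.base.pool_type                = pool_type;
    p.base.pool_size_x              = 2;
    p.base.pool_size_y              = 2;
    p.stride_x = 2;                  /* the _ext fields add the convolution stride */
    p.stride_y = 2;

    return vxConvolutionReluPoolingLayer2(graph, input, wb,
                                          &p.base, sizeof(p), output);
}
```
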
*/ + vx_int32 inputZeroPoint; /*!< \brief zero point of input. A 32 bit integer, in range [0, 255], Set zero value to disable */ +} vx_weights_biases_parameter_optimizations_t; + +typedef struct _vx_weights_biases_parameter_optimizations_ext_t { + vx_int8 zrl; /*!< \brief The zero run length. Set negtive value to disable*/ + vx_enum outputFormat; /*!< \brief The output format. */ + vx_int32 inputZeroPoint; /*!< \brief zero point of input. A 32 bit integer, in range [0, 255], Set zero value to disable */ + vx_uint32 num_of_input_dims; /*< \brief The input dimesion number*/ + vx_uint32 num_of_output_dims; /*!< \brief The output dimesion number*/ +} vx_weights_biases_parameter_optimizations_ext_t; + + +typedef struct _vx_weights_biases_parameter_optimizations_ext2_t { + vx_weights_biases_parameter_optimizations_ext_t ext; + vx_float32 inputScale; + vx_float32 outputScale; + vx_enum inputFormat; + vx_int32 output_ZP_dw; /*depthwise conv output ZP*/ + vx_float32 output_scale_dw; /*depthwise conv output scale*/ + vx_int8 output_fpp_dw; /*depthwise conv output fix-point*/ +} vx_weights_biases_parameter_optimizations_ext2_t; + +/*! + * \brief Creates a reference to a vx_weights_biases_parameter opaque object. + * + * \param [in] layer_type The network type of objects to hold. Types allowed are: + * \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer. + * \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer. + * \param [in] num_of_dims The dimention number of input & output image tensor. + * \param [in] inputs_dims The input tensor's dimension size. + * \param [in] pad_x The number of elements subtracted at each side in the x dimension of the input. + * \param [in] pad_y The number of elements subtracted at each side in the y dimension of the input. + * \param [in] pooling_size_x The size of the pooling region in the x dimension, 0 means no pooling operation. + * \param [in] pooling_size_y The size of the pooling region in the y dimension, 0 means no pooling operation. + * \param [in] down_scale_size_rounding A VX_TYPE_ENUM of the vx_round_policy_e enumeration. + * \param [in] convolution_outputs_dims The output's dimension size after covolution operation. + * \param [in] pool_outputs_dims The output's dimension size after pooling operation. + * \param [in] optimizations A optional param for \ref vx_weights_biases_parameter_optimizations_t. + * \param [in] weights The weights tensor which need be compressed. + * \param [in] biases The biases tensor which need be compressed. + * + * \returns An opaque vx_weights_biases_parameter reference with compressed kernel data. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * + * \ingroup group_cnn + */ +VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL +vxCreateWeightsBiasesParameterFromTensors( + vx_enum layer_type, + vx_uint32 num_of_dims, + vx_uint32 * inputs_dims, + vx_uint32 pad_x, + vx_uint32 pad_y, + vx_uint32 pooling_size_x, + vx_uint32 pooling_size_y, + vx_enum down_scale_size_rounding, + vx_uint32 * convolution_outputs_dims, + vx_uint32 * pool_outputs_dims, + vx_weights_biases_parameter_optimizations_t *optimizations, + vx_tensor weights, + vx_tensor biases); + +/*! + * \brief Creates a reference to an opaque vx_weights_biases_parameter object. + * + * \param [in] layer_type The network type of objects to hold. Types allowed are: + * \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer. 
+ * \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer. + * \param [in] num_of_dims The dimention number of input & output image tensor. + * \param [in] inputs_dims The input tensor's dimension size. + * \param [in] convolution_outputs_dims The output's dimension size after covolution operation. + * \param [in] pool_outputs_dims The output's dimension size after pooling operation. + * \param [in] output_format The output tensor element type. + * \param [in] convolution_relu_pooling_params The convolution_relu_pooling_params Pointer to parameters of type \ref vx_nn_convolution_relu_pooling_params_t + * \param [in] size_of_convolution_relu_pooling_params The size in bytes of convolution_relu_pooling_params. + * \param [in] optimizations A optional param for \ref vx_weights_biases_parameter_optimizations_t. + * \param [in] weights The weights tensor which need be compressed. + * \param [in] biases The biases tensor which need be compressed. + * + * \returns An opaque vx_weights_biases_parameter reference with compressed kernel data. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * + * \ingroup group_cnn + */ +VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL vxCreateWeightsBiasesParameterFromTensors2( + vx_enum layer_type, + vx_uint32 num_of_dims, + vx_uint32 * inputs_dims, + vx_uint32 * convolution_outputs_dims, + vx_uint32 * pool_outputs_dims, + vx_enum output_format, + const vx_nn_convolution_relu_pooling_params convolution_relu_pooling_params, + vx_size size_of_convolution_relu_pooling_params, + vx_weights_biases_parameter_optimizations_t *optimizations, + vx_tensor weights, + vx_tensor biases); + +/*! + * \brief Creates a reference to an opaque vx_weights_biases_parameter object. + * + * \param [in] layer_type The network type of objects to hold. Types allowed are: + * \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer. + * \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer. + * \param [in] inputs_dims The input tensor's dimension size. + * \param [in] convolution_outputs_dims The output's dimension size after covolution operation. + * \param [in] pool_outputs_dims The output's dimension size after pooling operation. + * \param [in] convolution_relu_pooling_params The convolution_relu_pooling_params Pointer to parameters of type \ref vx_nn_convolution_relu_pooling_params_t + * \param [in] size_of_convolution_relu_pooling_params The size in bytes of convolution_relu_pooling_params. + * \param [in] optimizations A optional param for \ref vx_weights_biases_parameter_optimizations_t. + * \param [in] size_of_optimizations The size in bytes of optimizations. + * \param [in] weights The weights tensor which need be compressed. + * \param [in] biases The biases tensor which need be compressed. + * + * \returns An opaque vx_weights_biases_parameter reference with compressed kernel data. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. 
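
A companion sketch for creating the compressed weights/biases object consumed by vxConvolutionReluPoolingLayer2. The dimension arrays and parameter struct are assumed to be prepared by the caller; per the description above, the optimizations pointer is optional and is passed as NULL here.

```c
#include <VX/vx.h>

/* Compress separate weight/bias tensors into an opaque vx_weights_biases_parameter. */
vx_weights_biases_parameter make_conv_wb(vx_uint32 input_dims[4],
                                         vx_uint32 conv_out_dims[4],
                                         vx_uint32 pool_out_dims[4],
                                         vx_enum output_format,
                                         const vx_nn_convolution_relu_pooling_params_t *params,
                                         vx_size params_size,
                                         vx_tensor weights, vx_tensor biases)
{
    vx_weights_biases_parameter wb = vxCreateWeightsBiasesParameterFromTensors2(
        VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER,
        4,                  /* num_of_dims: [width, height, #IFM, #batches] */
        input_dims,
        conv_out_dims,
        pool_out_dims,
        output_format,
        params, params_size,
        NULL,               /* optimizations: optional */
        weights, biases);

    /* Errors surface on the returned reference; check with vxGetStatus((vx_reference)wb). */
    return wb;
}
```
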
+ * + * \ingroup group_cnn + */ +VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL vxCreateWeightsBiasesParameterFromTensors3( + vx_enum layer_type, + vx_uint32 * inputs_dims, + vx_uint32 * convolution_outputs_dims, + vx_uint32 * pool_outputs_dims, + const vx_nn_convolution_relu_pooling_params convolution_relu_pooling_params, + vx_size size_of_convolution_relu_pooling_params, + vx_weights_biases_parameter_optimizations_t *optimizations, + vx_size size_of_optimizations, + vx_tensor weights, + vx_tensor biases); + +/*! \brief Releases the OpenVX object vx_weights_biases_parameter. + * \param [in] weights_bias The pointer to the reference to the vx_weights_biases_parameter. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If weights_bias is not a vx_weights_biases_parameter. + * \pre \ref vxCreateWeightsBiasesParameterFromTensors / vxCreateWeightsBiasesParameterFromTensors2/ vxCreateWeightsBiasesParameter / vxCreateWeightsBiasesParameterFromStream + * \ingroup group_cnn + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseWeightsBiasesParameter(vx_weights_biases_parameter *weights_bias); + +/*! + * \brief Creates a reference to an vx_weights_biases_parameter object. + * \param [in] context The OpenVX context object. + * \param [in] layer_type The network type of objects to hold. Types allowed are: + * \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer. + * \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer. + * \param [in] num_of_dims The dimention number of input & output image tensor. + * \param [in] inputs_dims The input tensor's dimension size. + * \param [in] pad_x The number of elements subtracted at each side in the x dimension of the input. + * \param [in] pad_y The number of elements subtracted at each side in the y dimension of the input. + * \param [in] pooling_size_x The size of the pooling region in the x dimension, 0 means no pooling operation. + * \param [in] pooling_size_y The size of the pooling region in the y dimension, 0 means no pooling operation. + * \param [in] down_scale_size_rounding A VX_TYPE_ENUM of the vx_round_policy_e enumeration. + * \param [in] convolution_outputs_dims The output's dimension size after covolution operation. + * \param [in] pool_outputs_dims The output's dimension size after pooling operation. + * \param [in] weights_num_of_dims The dimention number of weights tensor. + * \param [in] weights_dims The dimention size of weights tensor. + * \param [in] weights_data_format The format of weights tensor. + * \param [in] weights_fixed_point_pos The fixed point position when the weights element type is int16/int8, if 0 calculations are performed in integer math. + * \param [in] biases_num_of_dims The dimention number of biases tensor. + * \param [in] biases_dims The dimention size of biases tensor. + * \param [in] biases_data_format The format of biases tensor. + * \param [in] biases_fixed_point_pos The fixed point position when the biases element type is int16/int8, if 0 calculations are performed in integer math. + * \param [in] raw_data_size The data size of compressed data. + * + * \returns A weightsbiases reference without compressed kernel data vx_weights_biases_parameter. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. 
+ * + * \ingroup group_cnn + */ +VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL +vxCreateWeightsBiasesParameter( + vx_context context, + vx_enum layer_type, + vx_uint32 num_of_dims, + vx_uint32 * inputs_dims, + vx_uint32 pad_x, + vx_uint32 pad_y, + vx_uint32 pooling_size_x, + vx_uint32 pooling_size_y, + vx_enum down_scale_size_rounding, + vx_uint32 * convolution_outputs_dims, + vx_uint32 * pool_outputs_dims, + vx_uint32 weights_num_of_dims, + vx_uint32 * weights_dims, + vx_enum weights_data_format, + vx_int8 weights_fixed_point_pos, + vx_uint32 biases_num_of_dims, + vx_uint32 * biases_dims, + vx_enum biases_data_format, + vx_int8 biases_fixed_point_pos, + vx_uint32 raw_data_size + ); + +/*! \brief Input parameters for a gru operation. + * \ingroup group_cnn + * \version 0.5 + */ +typedef struct _vx_nn_gru_params_t +{ + vx_tensor reset2input_weights; /*!< \brief [static] Weight matrix for the reset gate with input. A 2-D tensor of type T, of shape [input_size, cell_size]. where "cell_size" corresponds to the number of cell units.*/ + vx_tensor update2input_weights; /*!< \brief [static] Weight matrix for the update gate with input. A 2-D tensor of type T, of shape [input_size, cell_size]. */ + vx_tensor reset2recurrent_weights; /*!< \brief [static] Weight matrix for the reset gate with recurrent(h_prev). A 2-D tensor of type T, of shape [cell_size, cell_size]. */ + vx_tensor update2recurrent_weights; /*!< \brief [static] Weight matrix for the update gate with recurrent(h_prev). A 2-D tensor of type T, of shape [cell_size, cell_size]. */ + + vx_tensor connection2input_weights; /*!< \brief [static] Weight matrix for the cell connection gate with input. A 2-D tensor of type T, of shape [input_size, cell_size]. */ + vx_tensor connection2recurrent_weights; /*!< \brief [static] Weight matrix for the cell connection gate with recurrent(h_prev). A 2-D tensor of type T, of shape [cell_size, cell_size]. */ + + vx_tensor gate_input_bias; /*!< \brief [static] Bias vector for the reset and update gate for input. A 1-D tensor of type T, of shape [cell_size].*/ + vx_tensor gate_recurrent_bias; /*!< \brief [static] Bias vector for the reset and update gate for recurrent. A 1-D tensor of type T, of shape [cell_size].*/ + + vx_tensor connection_bias; /*!< \brief [static] Bias vector for the cell connection gate. A 1-D tensor of type T, of shape [cell_size].*/ + +} vx_nn_gru_params_t; + + +/*! \brief [Graph] Creates a Long short-term memory unit (gru) Unit Networks Layer Node. not implement yet. + * \details + * The implementation is based on: http://arxiv.org/abs/1406.1078 + * Computes the GRU cell forward propagation for 1 time step. + * This kernel op implements the following mathematical equations: + * Biases are initialized with: + * * `b_ru` - constant_initializer(1.0) + * * `b_c` - constant_initializer(0.0) + * + * x_h_prev = [x, h_prev] + * [r_bar u_bar] = x_h_prev * w_ru + b_ru + * r = sigmoid(r_bar) + * u = sigmoid(u_bar) + * h_prevr = h_prev x r + * x_h_prevr = [x h_prevr] + * c_bar = x_h_prevr * w_c + b_c + * c = tanh(c_bar) + * h = (1-u) x c + u x h_prev + * + * \param [in] graph The handle to the graph. + * \param [in] input A 2-D tensor of type T, of shape [input_size, batch_size], where + * "batch_size" corresponds to the batching dimension, and "input_size" + * is the size of the input. + * \param [in] h_prev A 2-D tensor of type T, of shape [cell_size, batch_size]. + * \param [in] gru_params gru paraments \ref vx_nn_gru_params_t . 
+ * \param [in] size_of_gru_params [static] The size of the gru_params. + * \param [out] output A 2-D tensor of type T, of shape [cell_size, batch_size]. + * This is effectively the same as the current "output_state" value. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + * \version 0.5 + */ +VX_API_ENTRY vx_node VX_API_CALL vxGRUUnitLayer( + vx_graph graph, + vx_tensor input, + vx_tensor h_prev, + const vx_nn_gru_params_t * gru_params, + vx_size size_of_gru_params, + vx_tensor output); + +/*! \brief [Graph] Creates a Long short-term memory layer (gru) Networks Layer Node. not implement yet. + * \details + * + * \param [in] graph The handle to the graph. + * \param [in] input A 3-D tensor of type T, of shape [input_size, batch_size, time_step], where + * "input_size" corresponds to the size of the input, and "batch_size" + * is the batching dimension, time_step means time length actually used by the input. + * \param [in] h_prev optional, A 2-D tensor of type T, of shape [cell_size, batch_size], where + * "input_size" corresponds to the size of the input, and "batch_size" + * is the batching dimension. + * \param [in] vx_nn_gru_params gru paraments \ref vx_nn_gru_params_t . + * \param [in] size_of_gru_layer_params [static] The size of the vx_nn_gru_params. + * \param [out] output A 2-D tensor of type T, of shape [cell_size, batch_size]. + * This is effectively the same as the current "output_state" value. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + * \version 0.5 + */ +VX_API_ENTRY vx_node VX_API_CALL vxGRULayer( + vx_graph graph, + vx_tensor input, + vx_tensor h_prev, + const vx_nn_gru_params_t * gru_layer_params, + vx_size size_of_gru_layer_params, + vx_tensor output + ); + + +/*! \brief Input parameters for a convolution lstm operation. + * \ingroup group_cnn + * \version 0.5 + */ +typedef struct _vx_nn_convlstm_params_t +{ + vx_tensor input2input_weight; /*!< \brief Optional A 2-D tensor of type T, of shape [num_units, input_size]. where "num_units" corresponds to the number of cell units.*/ + vx_tensor input2forget_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, input_size].*/ + vx_tensor input2cell_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, input_size].*/ + vx_tensor input2output_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, input_size].*/ + + vx_tensor recurrent2input_weight; /*!< \brief Optional A 2-D tensor of type T, of shape [num_units, output_size]. 
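
The header marks the GRU nodes as not yet implemented, so the following is only a sketch of the intended call shape: the parameter struct is filled with weight and bias tensors created elsewhere and passed to the node by size.

```c
#include <string.h>
#include <VX/vx.h>

/* Wire one GRU cell step into a graph (API marked "not implemented yet" above). */
vx_node add_gru_unit(vx_graph graph, vx_tensor input, vx_tensor h_prev, vx_tensor output,
                     vx_tensor r2i, vx_tensor u2i, vx_tensor r2r, vx_tensor u2r,
                     vx_tensor c2i, vx_tensor c2r,
                     vx_tensor gate_in_bias, vx_tensor gate_rec_bias, vx_tensor conn_bias)
{
    vx_nn_gru_params_t p;
    memset(&p, 0, sizeof(p));
    p.reset2input_weights          = r2i;
    p.update2input_weights         = u2i;
    p.reset2recurrent_weights      = r2r;
    p.update2recurrent_weights     = u2r;
    p.connection2input_weights     = c2i;
    p.connection2recurrent_weights = c2r;
    p.gate_input_bias              = gate_in_bias;
    p.gate_recurrent_bias          = gate_rec_bias;
    p.connection_bias              = conn_bias;

    return vxGRUUnitLayer(graph, input, h_prev, &p, sizeof(p), output);
}
```
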
where "output_size" corresponds to either the number of cell units (i.e., "num_units"), or the second dimension of the "projection_weights", if defined.*/ + vx_tensor recurrent2forget_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, output_size].*/ + vx_tensor recurrent2cell_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, output_size].*/ + vx_tensor recurrent2output_weight; /*!< \brief A 2-D tensor of type T, of shape [num_units, output_size].*/ + + vx_tensor input_gate_bias; /*!< \brief Optional A 1-D tensor of type T, of shape [num_units].*/ + vx_tensor forget_gate_bias; /*!< \brief A 1-D tensor of type T, of shape [num_units].*/ + vx_tensor cell_bias; /*!< \brief A 1-D tensor of type T, of shape [num_units].*/ + vx_tensor output_gate_bias; /*!< \brief A 1-D tensor of type T, of shape [num_units].*/ + + vx_tensor activation; /*!< \brief Optional. An ActivationFunctionType indicating the activation function. If "NONE" is specified then it results in a linear activation.If "NONE" is specified then it results in a linear activation.*/ + + vx_float32 forget_bias; /*!< \brief Float32[static] A bias for the forget gate. If set to 0.0f(by default) then bias is ignored.*/ + vx_bool skip_connection; /*< \brief If set to `vx_true_e`, concatenate the input to the output of the conv LSTM. Default: `vx_false_e`.*/ + +} vx_nn_convlstm_params_t; + +/*! \brief input parameters for a convolution lstm layer operation. + * \ingroup group_cnn + */ +typedef struct _vx_nn_convlstm_layer_params_t +{ + vx_nn_convlstm_params_t convlstm_param; /*!< \brief convolution lstm input param \ref vx_nn_convlstm_params_t.*/ + vx_enum convlstm_layer_type; /*!< \brief convolution lstm layer type.*/ +} vx_nn_convlstm_layer_params_t; + + +/*! \brief [Graph] Creates a Convolution Long short-term memory unit (ConvLSTM) Unit Networks Layer Node. not implement yet. + * \details + * + * https://arxiv.org/pdf/1506.04214v1.pdf + * + * \param [in] graph The handle to the graph. + * \param [in] input A 2-D tensor of type T, of shape [input_size, batch_size], where + * "batch_size" corresponds to the batching dimension, and "input_size" + * is the size of the input. + * \param [in] output_state_in A 2-D tensor of type T, of shape [output_size, batch_size]. + * \param [in] cell_state_in A 2-D tensor of type T, of shape [num_units, batch_size]. + * \param [in] convlstm_params LSTM paraments \ref vx_nn_convlstm_params_t . + * \param [in] size_of_convlstm_params [static] The size of the convlstm_params. + * \param [out] scratch A 3-D tensor of type T, of shape [num_cell, 4, batch_size]. + * \param [out] output_state_out A 2-D tensor of type T, of shape [output_size, batch_size]. + * \param [out] cell_state_out A 2-D tensor of type T, of shape [num_units, batch_size]. + * \param [out] output A 2-D tensor of type T, of shape [output_size, batch_size]. + * This is effectively the same as the current "output_state" value. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + * \version 0.5 + */ +VX_API_ENTRY vx_node VX_API_CALL vxConvLSTMUnitLayer( + vx_graph graph, + vx_tensor input, + vx_tensor output_state_in, + vx_tensor cell_state_in, + const vx_nn_convlstm_params_t * convlstm_params, + vx_size size_of_convlstm_params, + vx_tensor output_state_out, + vx_tensor cell_state_out, + vx_tensor output); + +/*! 
\brief [Graph] Creates a Long short-term memory layer (LSTM) Networks Layer Node. not implement yet. + * \details + * + * \param [in] graph The handle to the graph. + * \param [in] input A 3-D tensor of type T, of shape [input_size, batch_size, time_step], where + * "input_size" corresponds to the size of the input, and "batch_size" + * is the batching dimension, time_step means time length actually used by the input. + * \param [in] static_input optional, A 2-D tensor of type T, of shape [input_size, batch_size], where + * "input_size" corresponds to the size of the input, and "batch_size" + * is the batching dimension. + * \param [in] cont optional, A 2-D tensor of type T, of shape [input_size, batch_size], where + * "input_size" corresponds to the size of the input, and "batch_size" + * is the batching dimension. + * \param [in] convlstm_layer_params LSTM paraments \ref vx_nn_convlstm_layer_params_t . + * \param [in] size_of_convlstm_layer_params [static] The size of the convlstm_layer_params. + * \param [out] output A 2-D tensor of type T, of shape [output_size, batch_size]. + * This is effectively the same as the current "output_state" value. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + * \version 0.5 + */ +VX_API_ENTRY vx_node VX_API_CALL vxConvLSTMLayer( + vx_graph graph, + vx_tensor input, + vx_tensor static_input, + vx_tensor cont, + const vx_nn_convlstm_layer_params_t * convlstm_layer_params, + vx_size size_of_convlstm_layer_params, + vx_tensor output + ); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_node_memory.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_node_memory.h new file mode 100644 index 0000000..e9c4807 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_node_memory.h @@ -0,0 +1,61 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _VX_KHR_NODE_MEMORY_H_ +#define _VX_KHR_NODE_MEMORY_H_ + +/*! \brief The Node Memory Extension. + * \file + */ + +#define OPENVX_KHR_NODE_MEMORY "vx_khr_node_memory" + +#include + +/*! \brief The kernel object attributes for global and local memory. + * \ingroup group_kernel + */ +enum vx_kernel_attribute_memory_e { + /*! \brief The global data pointer size to be shared across all instances of + * the kernel (nodes are instances of kernels). + * Use a \ref vx_size parameter. + * \note If not set it will default to zero. + */ + VX_KERNEL_GLOBAL_DATA_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0x5, + /*! \brief The global data pointer to the shared across all the instances of + * the kernel (nodes are instances of the kernels). + * Use a \ref void * parameter. + */ + VX_KERNEL_GLOBAL_DATA_PTR = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0x6, +}; + +/*! \brief The node object attributes for global and local memory. 
+ * \ingroup group_node + */ +enum vx_node_attribute_memory_e { + /*! \brief Used to indicate the size of the shared kernel global memory area. + * Use a \ref vx_size parameter. + */ + VX_NODE_GLOBAL_DATA_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x9, + /*! \brief Used to indicate the pointer to the shared kernel global memory area. + * Use a void * parameter. + */ + VX_NODE_GLOBAL_DATA_PTR = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0xA, +}; + +#endif + diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_opencl.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_opencl.h new file mode 100644 index 0000000..7e09af5 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_opencl.h @@ -0,0 +1,268 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _VX_KHR_OPENCL_H_ +#define _VX_KHR_OPENCL_H_ + +#include +#include + +/*! \file + * \brief The OpenVX to OpenCL Inter-op Extension Header. + * + * \defgroup group_cl_api API + * \brief The API used by Clients to add OpenCL Kernels as vx_kernel. + * \details + * + * \defgroup group_cl_def Extension Defines + * \brief The Extension defines and constants. + * + * \defgroup group_cl_image Images + * \brief OpenVX Images + * \details Depending on whether the OpenCL implementation supports images, vx_image + * may map to an image2d_t or a OpenCL buffer. + * + * \defgroup group_cl_array Arrays + * \brief OpenVX Arrays + * + * \defgroup group_cl_convolution Convolutions + * \brief OpenVX Convolutions + * + * \defgroup group_cl_distribution Distributions + * \brief OpenVX Distributions + * + * \defgroup group_cl_matrix Matricies + * \brief OpenVX Matrix + * + * \defgroup group_cl_types OpenVX to OpenCL Atomic Types + * \brief Atomic Types + * \details OpenVX types map to OpenCL types through this table: + * | VX | OpenCL| + * |:---------|:------| + * |vx_uint8 |uchar | + * |vx_int8 |char | + * |vx_uint16 |ushort | + * |vx_int16 |short | + * |vx_uint32 |uint | + * |vx_int32 |int | + * |vx_uint64 |ulong | + * |vx_int64 |long | + * |vx_float32|float | + * |vx_float64|double | + * |vx_size |size_t | + * + * \note size_t can not be used as a parameter to a __kernel. + */ + +#ifndef VX_SCALE_UNITY +#define VX_SCALE_UNITY (1024) +#endif + +/*!\brief The maximum number of planes an image may have which is compatible across both + * API. + * \ingroup group_cl_def + */ +#define VX_CL_MAX_PLANES (4) + +#if defined(VX_CL_DOCUMENTATION) || !defined(VX_CL_KERNEL) + +#if defined(__APPLE__) || defined(DARWIN) +#include +#else +#include +#endif + +#if (!defined(__APPLE__)) && defined(CL_USE_LUMINANCE) +#define CL_USE_IMAGES +#endif + +/*! \brief The string name of this extension to match for in the extensions list + * \ingroup group_cl_def + */ +#define OPENVX_KHR_OPENCL "vx_khr_opencl" + +/*! \brief Adds an OpenCL Kernel as source code into the OpenVX implementation. + * \param [in] context The OpenVX Context. 
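
A brief sketch of the node-memory attributes defined above: a user kernel publishes a shared global data size before finalization, and any node instance can later retrieve the pointer. vxSetKernelAttribute and vxQueryNode are standard OpenVX calls; the helper names are illustrative.

```c
#include <VX/vx.h>

/* Reserve a shared global data area on a user kernel (call before vxFinalizeKernel). */
vx_status set_kernel_global_size(vx_kernel kernel, vx_size bytes)
{
    return vxSetKernelAttribute(kernel, VX_KERNEL_GLOBAL_DATA_SIZE, &bytes, sizeof(bytes));
}

/* Retrieve the shared area from one of the kernel's node instances. */
vx_status get_node_global_data(vx_node node, void **ptr, vx_size *size)
{
    vx_status status = vxQueryNode(node, VX_NODE_GLOBAL_DATA_SIZE, size, sizeof(*size));
    if (status == VX_SUCCESS)
        status = vxQueryNode(node, VX_NODE_GLOBAL_DATA_PTR, ptr, sizeof(*ptr));
    return status;
}
```
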
+ * \param [in] name The name of the kernel in OpenVX nomenclature. + * \param [in] enumeration The OpenVX kernel enumeration used to identify this kernel. + * \param [in] source The array of source line pointers. + * \param [in] line_lengths The array of lines lengths for each line of source. + * \param [in] num_lines the number of lines in both the sources array and line_lengths array. + * \param [in] symbol_name The name of the kernel to call in the program. + * \param [in] numParams The number of parameters to the OpenVX kernel. + * \param [in] input The input validator. + * \param [in] output The output validator. + * \see vxAddParameterToKernel to configure the specific parameter attributes. + * \ingroup group_cl_api + */ +VX_API_ENTRY vx_kernel VX_API_CALL vxAddOpenCLAsSourceKernel(vx_context context, + vx_char name[VX_MAX_KERNEL_NAME], + vx_enum enumeration, + char *source[], + size_t line_lengths[], + size_t num_lines, + char symbol_name[], + vx_uint32 numParams, + vx_kernel_input_validate_f input, + vx_kernel_output_validate_f output); + +/*! \brief Adds an OpenCL Kernel as binary program into the OpenVX implementation. + * \param [in] context The OpenVX Context. + * \param [in] name The name of the kernel in OpenVX nomenclature. + * \param [in] enumeration The OpenVX kernel enumeration used to identify this kernel. + * \param [in] program The OpenCL Program which contains the kernel (either pre-compiled or compiled by user). + * \param [in] symbol_name The name of the kernel to call in the program. + * \param [in] numParams The number of parameters to the OpenVX kernel. + * \param [in] input The input validator. + * \param [in] output The output validator. + * \see vxAddParameterToKernel to configure the specific parameter attributes. + * \ingroup group_cl_api + */ +VX_API_ENTRY vx_kernel VX_API_CALL vxAddOpenCLAsBinaryKernel(vx_context context, + vx_char name[VX_MAX_KERNEL_NAME], + vx_enum enumeration, + cl_program program, + char symbol_name[], + vx_uint32 numParams, + vx_kernel_input_validate_f input, + vx_kernel_output_validate_f output); + +#endif // External API + +#if defined(VX_CL_DOCUMENTATION) || defined(VX_CL_KERNEL) + +#if defined(__IMAGE_SUPPORT__) && defined(CL_USE_LUMINANCE) +#define CL_USE_IMAGES +#endif + +/*! \brief Allows access to an image pixel as a typecast pointer deference. + * \param type The OpenCL single element type + * \param ptr The __global pointer to the base of the image. + * \param x The x coordinate. + * \param y The y coordinate. + * \param sx The x stride. + * \param sy The y stride. + * \ingroup group_cl_image + */ +#define vxImagePixel(type, ptr, x, y, sx, sy) \ + (*(type *)(&((uchar *)ptr)[((y) * sy) + ((x) * sx)])) + +/*! + * \brief Allows access to an array item as a typecast pointer deference. + * \param type The OpenCL single element type or structure type. + * \param ptr The __global pointer to the base of the array. + * \param index The index of the element to access. + * \param stride The stride in bytes between two adjacent elements. + * \ingroup group_cl_array + */ +#define vxArrayItem(type, ptr, index, stride) \ + (*(type *)(&((uchar *)ptr)[index*stride])) + +/*! \brief Allows access to a matrix element \f$ M_{ij} \f$ where i is the column and j is the row. + * \param type The OpenCL single element type of the matrix. + * \param ptr The __global pointer to the base of the array. + * \param columns The number of columns in the matrix. 
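
A sketch of registering a one-line OpenCL kernel through vxAddOpenCLAsSourceKernel. The kernel name, enumeration and no-op validators are placeholders, and the validator signatures are assumed to follow the vx_kernel_input_validate_f / vx_kernel_output_validate_f typedefs from this SDK's VX/vx_api.h.

```c
#include <string.h>
#include <VX/vx.h>
#include <VX/vx_khr_opencl.h>

static vx_status VX_CALLBACK in_ok(vx_node node, vx_uint32 index)
{
    (void)node; (void)index;
    return VX_SUCCESS;   /* placeholder: accept any input */
}

static vx_status VX_CALLBACK out_ok(vx_node node, vx_uint32 index, vx_meta_format meta)
{
    (void)node; (void)index; (void)meta;
    return VX_SUCCESS;   /* placeholder: accept any output */
}

/* Register a trivial element-wise copy kernel written in OpenCL C. */
vx_kernel register_cl_copy(vx_context context, vx_enum kernel_enum)
{
    static char line0[] = "__kernel void copy_u8(__global uchar *dst, __global uchar *src)\n";
    static char line1[] = "{ size_t i = get_global_id(0); dst[i] = src[i]; }\n";
    char  *source[]       = { line0, line1 };
    size_t line_lengths[] = { strlen(line0), strlen(line1) };

    return vxAddOpenCLAsSourceKernel(context, "com.example.cl.copy_u8", kernel_enum,
                                     source, line_lengths, 2, "copy_u8",
                                     2, in_ok, out_ok);
}
```
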
+ * \param i The column index + * \param j The row index + * \ingroup group_cl_matrix + */ +#define vxMatrixElement(type, ptr, columns, i, j) (((type *)ptr)[columns*j + i]) + +/*! \brief Allows access to a convolution element \f$ C_{ij} \f$ where i is the column and j is the row. + * \note Convolution elements are always of type short. + * \param ptr The __global pointer to the base of the array. + * \param columns The number of columns in the matrix. + * \param i The column index + * \param j The row index + * \ingroup group_cl_convolution + */ +#define vxConvolveElement(ptr, columns, i, j) (((short *)ptr)[columns*j + i]) + +/*! \brief Allows access to a distribution frequency counter. + * \param ptr The __global pointer to the base of the distribution. + * \param value The value to retrive the frequency count for. + * \param offset The offset within the input domain. + * \param range The total range within the domain starting from offset. + * \param window_size The window size of the bin. + * \ingroup group_cl_distribution + */ +#define vxGetFrequency(ptr, value, offset, range, window_size) \ + ((offset <= value) && (value <= (range+offset)) ? ptr[(value-offset)/window_size] : 0) + +/*! \brief Increments a distribution frequency counter for a value. + * \param ptr The __global pointer to the base of the distribution. + * \param value The value to increment the frequency count for. + * \param offset The offset within the input domain. + * \param range The total range within the domain starting from offset. + * \param window_size The window size of the bin. + * \ingroup group_cl_distribution + */ +#define vxIncFrequency(ptr, value, offset, range, window_size) \ + ((offset <= value) && (value <= (range+offset)) ? ++ptr[(value-offset)/window_size] : 0) + +/*! \brief Decrements a distribution frequency counter for a value. + * \param ptr The __global pointer to the base of the distribution. + * \param value The value to decrement the frequency count for. + * \param offset The offset within the input domain. + * \param range The total range within the domain starting from offset. + * \param window_size The window size of the bin. + * \ingroup group_cl_distribution + */ +#define vxDecFrequency(ptr, value, offset, range, window_size) \ + ((offset <= value) && (value <= (range+offset)) ? --ptr[(value-offset)/window_size] : 0) + +#if defined(VX_VERSION_1_1) && (VX_VERSION >= VX_VERSION_1_1) + +/*! \brief Allows access to a distribution frequency counter. + * \param ptr The __global pointer to the base of the distribution. + * \param value The value to retrive the frequency count for. + * \param offset The offset within the input domain. + * \param range The total range within the domain starting from offset. + * \param num_bins The number of bins in the domain range. + * \ingroup group_cl_distribution + */ +#define vxGetFrequency2(ptr, value, offset, range, num_bins) \ + ((offset <= value) && (value <= (range+offset)) ? ptr[(value-offset)*num_bins/range] : 0) + +/*! \brief Increments a distribution frequency counter for a value. + * \param ptr The __global pointer to the base of the distribution. + * \param value The value to increment the frequency count for. + * \param offset The offset within the input domain. + * \param range The total range within the domain starting from offset. + * \param num_bins The number of bins in the domain range. + * \ingroup group_cl_distribution + */ +#define vxIncFrequency2(ptr, value, offset, range, num_bins) \ + ((offset <= value) && (value <= (range+offset)) ? 
++ptr[(value-offset)*num_bins/range] : 0) + +/*! \brief Decrements a distribution frequency counter for a value. + * \param ptr The __global pointer to the base of the distribution. + * \param value The value to decrement the frequency count for. + * \param offset The offset within the input domain. + * \param range The total range within the domain starting from offset. + * \param num_bins The number of bins in the domain range. + * \ingroup group_cl_distribution + */ +#define vxDecFrequency2(ptr, value, offset, range, num_bins) \ + ((offset <= value) && (value <= (range+offset)) ? --ptr[(value-offset)*num_bins/range] : 0) + +#endif /*VX_VERSION_1_1*/ + +#endif + +#endif + diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_tiling.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_tiling.h new file mode 100644 index 0000000..49c99f0 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_tiling.h @@ -0,0 +1,376 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _VX_KHR_TILING_H_ +#define _VX_KHR_TILING_H_ + +/*! + * \file + * \brief The Khronos Extension for User Tiling Functions. + * + * \defgroup group_tiling Extension: User Tiling API + * \brief The Khronos Extension for User Tiling Functions. + */ + +#define OPENVX_KHR_TILING "vx_khr_tiling" + +#if defined(OPENVX_TILING_1_0) +#undef OPENVX_TILING_1_1 +#endif + +#include +/* For vx_kernel_input_validate_f and vx_kernel_output_validate_f: */ +#include + + +/*! \def VX_RESTRICT + * \brief A platform wrapper for the restrict keyword. + * \ingroup group_tiling + */ +#if defined(_WIN32) +#define VX_RESTRICT +#else +#if defined(__cplusplus) || defined(ANDROID) +#define VX_RESTRICT __restrict +#elif defined(__linux__) +#define VX_RESTRICT +#elif defined __QNXNTO__ +#define VX_RESTRICT +#else +#define VX_RESTRICT restrict +#endif +#endif + +/*! \brief The User Tiling Function tile block size declaration. + * \details The author of a User Tiling Kernel will use this structure to define + * the dimensionality of the tile block. + * \ingroup group_tiling + */ +typedef struct _vx_tile_block_size_t { + vx_int32 width; /*!< \brief Tile block width in pixels. */ + vx_int32 height; /*!< \brief Tile block height in pixels. */ +} vx_tile_block_size_t; + +/*! \brief The User Tiling Function Neighborhood declaration. + * \details The author of a User Tiling Kernel will use this structure to define + * the neighborhood surrounding the tile block. + * \ingroup group_tiling + */ +typedef struct _vx_neighborhood_size_t { + vx_int32 left; /*!< \brief Left of the tile block. */ + vx_int32 right; /*!< \brief Right of the tile block. */ + vx_int32 top; /*!< \brief Top of the tile block. */ + vx_int32 bottom; /*!< \brief Bottom of the tile block. */ +} vx_neighborhood_size_t; + +/*! \brief A structure which describes the tile's parent image. 
+ * \ingroup group_tiling + */ +typedef struct _vx_image_description_t { + vx_uint32 width; /*!< \brief Width of the image */ + vx_uint32 height; /*!< \brief Height of the image */ + vx_df_image format; /*!< \brief The \ref vx_df_image_e of the image */ + vx_uint32 planes; /*!< \brief The number of planes in the image */ + vx_enum range; /*!< \brief The \ref vx_channel_range_e enumeration. */ + vx_enum space; /*!< \brief The \ref vx_color_space_e enumeration. */ +} vx_image_description_t; + +/*! \brief The maximum number of planes in a tiled image. + * \ingroup group_tiling + */ +#define VX_MAX_TILING_PLANES (4) + +/*! \brief The tile structure declaration. + * \ingroup group_tiling + */ +typedef struct _vx_tile_t { + /*! \brief The array of pointers to the tile's image plane. */ + vx_uint8 * VX_RESTRICT base[VX_MAX_TILING_PLANES]; + /*! \brief The top left X pixel index within the width dimension of the image. */ + vx_uint32 tile_x; + /*! \brief The top left Y pixel index within the height dimension of the image. */ + vx_uint32 tile_y; + /*! \brief The array of addressing structure to describe each plane. */ + vx_imagepatch_addressing_t addr[VX_MAX_TILING_PLANES]; + /*! \brief The output block size structure. */ + vx_tile_block_size_t tile_block; + /*! \brief The neighborhood definition. */ + vx_neighborhood_size_t neighborhood; + /*! \brief The description and attributes of the image. */ + vx_image_description_t image; +} vx_tile_t; + +#ifndef VX_TILE_ATTRIBUTES_DEFINITIONS + +/*! + * \brief The full height of the tile's parent image in pixels. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \ingroup group_tiling + */ +#define vxImageHeight(ptile) ((ptile))->image.height) + +/*! + * \brief The full width of the tile's parent image in pixels. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \ingroup group_tiling + */ +#define vxImageWidth(ptile) ((ptile))->image.width) + +/*! + * \brief The offset between the left edge of the image and the left edge of the tile, in pixels. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \ingroup group_tiling + */ +#define vxTileX(ptile) ((ptile)->tile_x) + +/*! + * \brief The offset between the top edge of the image and the top edge of the tile, in pixels. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \ingroup group_tiling + */ +#define vxTileY(ptile) ((ptile)->tile_y) + +/*! + * \brief The width of the tile in pixels. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \param [in] index The plane index. + * \ingroup group_tiling + */ +#define vxTileWidth(ptile, index) ((ptile)->addr[index].dim_x) + +/*! + * \brief The height of the tile in pixels. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \param [in] index The plane index. + * \ingroup group_tiling + */ +#define vxTileHeight(ptile, index) ((ptile)->addr[index].dim_y) + +/*! + * \brief The tile block height. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \ingroup group_tiling + */ +#define vxTileBlockHeight(ptile) ((ptile)->tile_block.height) + +/*! + * \brief The tile block width. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \ingroup group_tiling + */ +#define vxTileBlockWidth(ptile) ((ptile)->tile_block.width) + +/*! + * \brief The simple wrapper to access each image's neighborhood -X value. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. 
+ * \ingroup group_tiling + */ +#define vxNeighborhoodLeft(ptile) ((ptile)->neighborhood.left) + +/*! + * \brief The simple wrapper to access each image's neighborhood +X value. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \ingroup group_tiling + */ +#define vxNeighborhoodRight(ptile) ((ptile)->neighborhood.right) + +/*! + * \brief The simple wrapper to access each image's neighborhood -Y value. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \ingroup group_tiling + */ +#define vxNeighborhoodTop(ptile) ((ptile)->neighborhood.top) + +/*! + * \brief The simple wrapper to access each image's neighborhood +Y value. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \ingroup group_tiling + */ +#define vxNeighborhoodBottom(ptile) ((ptile)->neighborhood.bottom) + +#if 0 +/*! + * \brief The simple wrapper to access each image's stride X value. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \ingroup group_tiling + */ +#define vxStrideSizeX(ptile, index) ((ptile)->addr[index].stride_x) + +/*! + * \brief The simple wrapper to access each image's stride Y value. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \ingroup group_tiling + */ +#define vxStrideSizeY(ptile, index) ((ptile)->addr[index].stride_y) + +/*! + * \brief The simple wrapper to access each image's step X value. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \ingroup group_tiling + */ +#define vxStepSizeX(ptile, index) ((ptile)->addr[index].step_x) + +/*! + * \brief The simple wrapper to access each image's step Y value. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \ingroup group_tiling + */ +#define vxStepSizeY(ptile, index) ((ptile)->addr[index].step_y) +#endif + +#endif + +/*! \brief The User Kernel Tiling Attributes. + * \ingroup group_tiling + */ +enum vx_kernel_attribute_tiling_e { + /*! \brief This allows a tiling mode kernel to set its input neighborhood. */ + VX_KERNEL_INPUT_NEIGHBORHOOD = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0x7, + /*! \brief This allows a tiling mode kernel to set its output tile block size. */ + VX_KERNEL_OUTPUT_TILE_BLOCK_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0x8, + /*! \brief This allows the author to set the border mode on the tiling kernel. */ + VX_KERNEL_BORDER = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0x9, + /*! \brief This determines the per tile memory allocation. */ + VX_KERNEL_TILE_MEMORY_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0xA, +#if defined(OPENVX_TILING_1_1) + /*! \brief This allows a tiling mode kernel to set its input tile block size. */ + VX_KERNEL_INPUT_TILE_BLOCK_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0xB, + /*! \brief This allows a tiling mode kernel to set its output neighborhood. */ + VX_KERNEL_OUTPUT_NEIGHBORHOOD = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0xC, +#endif +}; + +/*! \brief The User Node Tiling Attributes. + * \note These are largely unusable by the tiling function, as it doesn't give you the node reference! + * \ingroup group_tiling + */ +enum vx_node_attribute_tiling_e { + /*! \brief This allows a tiling mode node to get its input neighborhood. */ + VX_NODE_INPUT_NEIGHBORHOOD = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0xB, + /*! \brief This allows a tiling mode node to get its output tile block size. */ + VX_NODE_OUTPUT_TILE_BLOCK_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0xC, + /*! 
\brief This is the size of the tile local memory area. */ + VX_NODE_TILE_MEMORY_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0xD, +#if defined(OPENVX_TILING_1_1) + /*! \brief This allows a tiling mode node to get its input tile block size. */ + VX_NODE_INPUT_TILE_BLOCK_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0xE, + /*! \brief This allows a tiling mode node to get its output neighborhood. */ + VX_NODE_OUTPUT_NEIGHBORHOOD = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0xF, +#endif +}; + +/*! \brief The tiling border mode extensions + * \ingroup group_tiling + */ +enum vx_border_tiling_e { + /*! \brief This value indicates that the author of the tiling kernel wrote + * code to handle border conditions into the kernel itself. If this mode + * is set, it can not be overriden by a call to the \ref vxSetNodeAttribute + * with \ref VX_NODE_BORDER. + */ + VX_BORDER_MODE_SELF = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x3, +}; + +/*! \typedef vx_tiling_kernel_f + * \brief Tiling Kernel function typedef for User Tiling Kernels. + * \note Tiles may come in any dimension and are not guaranteed to be delivered in + * any particular order. + * \param [in] parameters The array abstract pointers to parameters. + * \param [in] tile_memory The local tile memory pointer if requested, otherwise NULL. + * \param [in] tile_memory_size The size of the local tile memory, if not requested, 0. + * \ingroup group_tiling + */ +#ifdef __cplusplus +typedef void (*vx_tiling_kernel_f)(void * VX_RESTRICT parameters[], + void * VX_RESTRICT tile_memory, + vx_size tile_memory_size); +#else +typedef void (*vx_tiling_kernel_f)(void * VX_RESTRICT parameters[VX_RESTRICT], + void * VX_RESTRICT tile_memory, + vx_size tile_memory_size); +#endif + +#ifndef VX_IMAGE_PIXEL_DEFINITION + +/*! \def vxImageOffset + * \brief Computes the offset within an image. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \param [in] i The plane index. + * \param [in] x The Width Coordinates. + * \param [in] y The Height Coordinates. + * \param [in] ox The X offset. + * \param [in] oy The Y offset. + * \ingroup group_tiling + */ +#define vxImageOffset(ptile, i, x, y, ox, oy) \ + ((ptile)->addr[i].stride_y * (vx_int32)(((vx_int32)((oy)+(y)) * (vx_int32)(ptile)->addr[i].scale_y)/(vx_int32)VX_SCALE_UNITY)) + \ + ((ptile)->addr[i].stride_x * (vx_int32)(((vx_int32)((ox)+(x)) * (vx_int32)(ptile)->addr[i].scale_x)/(vx_int32)VX_SCALE_UNITY)) + + +/*! \def vxImagePixel + * \brief Accesses an image pixel as a type-cast indexed pointer dereference. + * \param [in] type The type of the image pixel. Example values are \ref vx_uint8, \ref vx_uint16, \ref vx_uint32, etc. + * \param [in] ptile The pointer to the \ref vx_tile_t structure. + * \param [in] i The plane index. + * \param [in] x The Center Pixel in Width Coordinates. + * \param [in] y The Center Pixel in Height Coordinates. + * \param [in] ox The X offset. + * \param [in] oy The Y offset. + * \ingroup group_tiling + */ +#define vxImagePixel(type, ptile, i, x, y, ox, oy) \ + *((type *)(&((vx_uint8 *)(ptile)->base[i])[vxImageOffset(ptile, i, x, y, ox, oy)])) + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief Allows a user to add a tile-able kernel to the OpenVX system. + * \param [in] context The handle to the implementation context. + * \param [in] name The string to be used to match the kernel. + * \param [in] enumeration The enumerated value of the kernel to be used by clients. 
+ * \param [in] flexible_func_ptr The process-local flexible function pointer to be invoked. + * \param [in] fast_func_ptr The process-local fast function pointer to be invoked. + * \param [in] num_params The number of parameters for this kernel. + * \param [in] input The pointer to a function which will validate the + * input parameters to this kernel. + * \param [in] output The pointer to a function which will validate the + * output parameters to this kernel. + * \note Tiling Kernels do not have access to any of the normal node attributes listed + * in \ref vx_node_attribute_e. + * \post Call \ref vxAddParameterToKernel for as many parameters as the function has, + * then call \ref vxFinalizeKernel. + * \retval 0 Indicates that an error occurred when adding the kernel. + * Note that the fast or flexible formula, but not both, can be NULL. + * \ingroup group_tiling + */ +VX_API_ENTRY vx_kernel VX_API_CALL vxAddTilingKernel(vx_context context, + vx_char name[VX_MAX_KERNEL_NAME], + vx_enum enumeration, + vx_tiling_kernel_f flexible_func_ptr, + vx_tiling_kernel_f fast_func_ptr, + vx_uint32 num_params, + vx_kernel_input_validate_f input, + vx_kernel_output_validate_f output); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_variants.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_variants.h new file mode 100644 index 0000000..47ca6c7 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_variants.h @@ -0,0 +1,96 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _VX_KHR_VARIANT_H_ +#define _VX_KHR_VARIANT_H_ + +/*! + * \file + * \brief The Khronos Extension for Kernel Variants. + * + * \defgroup group_variants Extension: Kernel Variants + * \brief The Khronos Extension for Kernel Variants. + * \details Kernel Variants allow the Client-Defined Functions to create several + * kernels on the same target with the same name, but with slight variations + * between them. Frequently these variants are expected to employ different + * algorithms or methodologies. + * + * All target specific kernels and target variants must conform to the same OpenVX + * specification of the OpenVX Kernel in order to use the string name and enumeration. + * For example, a vendor may supply multiple targets, + * and implement the same functionality on each. Futhermore the same + * vendor may offer a variant on some specific target which offers some differentiation but + * still conforms to the definition of the OpenVX Kernel. + * In this example there are 3 implementations of the same computer vision function, "Sobel3x3". + * \arg On "CPU" a "Sobel3x3" which is "faster". A variant which may produce slightly less accurate but still conformant results. + * \arg On "CPU" a "Sobel3x3" which is more "accurate". A variant which may run slower but produces bit exact results. + * \arg On "GPU" a "Sobel3x3" \e default variant which may run on a remote core and produce bit exact results. 
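To make the tiling interface concrete, the sketch below shows a hypothetical tile function and its registration through the vxAddTilingKernel entry point declared above. This is only an illustration: the kernel name, enumeration value and the two validators are placeholders, and the vxTileWidth/vxTileHeight accessors are assumed to be the tile accessors defined earlier in this same tiling header.

```c
/* A hypothetical "plus one" tiling kernel: parameters[0] and parameters[1] are
 * vx_tile_t views of a VX_DF_IMAGE_U8 input and output image. */
void example_plus_one_fast(void * VX_RESTRICT parameters[],
                           void * VX_RESTRICT tile_memory,
                           vx_size tile_memory_size)
{
    vx_tile_t *in  = (vx_tile_t *)parameters[0];
    vx_tile_t *out = (vx_tile_t *)parameters[1];
    vx_uint32 x, y;
    (void)tile_memory;
    (void)tile_memory_size;
    for (y = 0u; y < vxTileHeight(out, 0); y++) {
        for (x = 0u; x < vxTileWidth(out, 0); x++) {
            /* vxImagePixel expands to a strided pointer dereference (see the macro above). */
            vxImagePixel(vx_uint8, out, 0, x, y, 0, 0) =
                (vx_uint8)(vxImagePixel(vx_uint8, in, 0, x, y, 0, 0) + 1);
        }
    }
}

/* Registration sketch: the name, enum and validators below are placeholders. */
vx_kernel example_register_plus_one(vx_context context)
{
    vx_kernel kernel = vxAddTilingKernel(context,
                                         "com.example.tiling.plus_one",  /* placeholder name      */
                                         EXAMPLE_KERNEL_PLUS_ONE,        /* placeholder enum      */
                                         NULL,                           /* no flexible function  */
                                         example_plus_one_fast,
                                         2,
                                         example_input_validator,        /* placeholder validator */
                                         example_output_validator);      /* placeholder validator */
    if (vxGetStatus((vx_reference)kernel) == VX_SUCCESS)
    {
        vxAddParameterToKernel(kernel, 0, VX_INPUT,  VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED);
        vxAddParameterToKernel(kernel, 1, VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED);
        vxFinalizeKernel(kernel);
    }
    return kernel;
}
```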
+ *
+ * In each of these cases a client of OpenVX could request the kernels in nearly
+ * the same manner. There are two main approaches, which depend on the
+ * method a client calls to get the kernel reference. The first uses enumerations.
+ * This method allows the client to attempt to find other targets and variants, but if
+ * these are not present, the default node would still have been constructed.
+ * The second method depends on using fully qualified strings to get the kernel reference.
+ * This second method is more compact but does not permit fail-safing to default versions.
+ *
+ * As part of this extension, the function vxGetKernelByName will now accept more
+ * qualifications to the string naming scheme. Kernel names can be additionally
+ * qualified in 2 separate ways, by target and by variant. A "fully" qualified name is in the format of
+ * target:kernel:variant.
+ * Both \e target and \e variant may be omitted (for an unqualified name).
+ * In this case, the implementation will assume the "default" value of these
+ * names (which could literally be "default"). Names may also be fully
+ * qualified with target included.
+ * Examples:
+ * \arg "khronos.c_model:org.khronos.openvx.sobel3x3:default" - fully qualified
+ * \arg "org.khronos.openvx.sobel3x3:default" (missing target) - partially qualified
+ * \arg "khronos.c_model:org.khronos.openvx.sobel3x3" (missing variant) - partially qualified.
+ * \arg "org.khronos.openvx.sobel3x3" - unqualified.
+ *
+ */
+
+/*! \brief The string name of the extension.
+ * \ingroup group_variants
+ */
+#define OPENVX_KHR_VARIANTS "vx_khr_variants"
+
+/*! \brief Defines the maximum number of characters in a variant string.
+ * \ingroup group_variants
+ */
+#define VX_MAX_VARIANT_NAME (64)
+
+#include 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! \brief Used to choose a variant of a kernel for execution on a particular node.
+ * \param [in] node The reference to the node.
+ * \param [in] variantName The name of the variant to choose.
+ * \return A \ref vx_status_e enumeration.
+ * \ingroup group_variants
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxChooseKernelVariant(vx_node node, vx_char variantName[VX_MAX_VARIANT_NAME]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_xml.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_xml.h
new file mode 100644
index 0000000..cb78245
--- /dev/null
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_xml.h
@@ -0,0 +1,156 @@
+/*
+
+ * Copyright (c) 2012-2017 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _VX_KHR_XML_H_
+#define _VX_KHR_XML_H_
+
+/*! \file
+ * \brief The OpenVX XML Schema Extension Header.
+ *
+ * \defgroup group_xml Extension: XML API
+ * \brief The Khronos Extension for OpenVX XML Import and Export Support.
+ */
+
+#define OPENVX_KHR_XML "vx_khr_xml"
+
+#include 
+
+/*! \brief The Object Type Enumeration for Imports.
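A rough usage sketch of the two lookup styles described above. The variant string "faster", and the surrounding context, graph and error handling, are illustrative only:

```c
/* Look up a kernel by a fully qualified name, fall back to the unqualified
 * (default) name, then pin a specific variant on one node. */
vx_kernel kernel = vxGetKernelByName(context, "khronos.c_model:org.khronos.openvx.sobel3x3:faster");
if (vxGetStatus((vx_reference)kernel) != VX_SUCCESS) {
    /* Unqualified: the implementation picks the default target and variant. */
    kernel = vxGetKernelByName(context, "org.khronos.openvx.sobel3x3");
}

vx_node node = vxCreateGenericNode(graph, kernel);
/* Alternatively, switch an existing node to a named variant. */
vxChooseKernelVariant(node, "faster");
```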
+ * \ingroup group_xml
+ */
+enum vx_ext_import_type_e {
+    VX_TYPE_IMPORT = 0x814,/*!< \brief A \ref vx_import */
+};
+
+/*! \brief The import type enumeration.
+ * \ingroup group_xml
+ * \see VX_IMPORT_ATTRIBUTE_TYPE
+ */
+enum vx_ext_import_types_e {
+    VX_IMPORT_TYPE_XML = 0,/*!< \brief The XML import type */
+};
+
+/*! \brief The import attributes list
+ * \ingroup group_xml
+ * \see vxQueryImport
+ */
+enum vx_import_attribute_e {
+    /*! \brief Returns the number of references in the import object. Use a \ref vx_uint32 parameter.*/
+    VX_IMPORT_ATTRIBUTE_COUNT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMPORT) + 0x0,
+    /*! \brief Returns the type of import. Use a \ref vx_ext_import_types_e parameter */
+    VX_IMPORT_ATTRIBUTE_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMPORT) + 0x1,
+};
+
+/*! \brief An abstract handle to an import object.
+ * \ingroup group_xml
+ * \extends vx_reference
+ */
+typedef struct _vx_import *vx_import;
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! \brief Exports all objects in the context to an XML file which uses the OpenVX
+ * XML Schema.
+ * \param [in] context The context to export.
+ * \param [in] xmlfile The file name to write the XML into.
+ * \note The reference numbers contained in the xml file can appear in any order but
+ * should be inclusive from index number 0 to [number of references - 1]. For example,
+ * if there are 20 references in the xml file, none of the reference indices should be >= 20.
+ * \return A \ref vx_status_e enumeration.
+ * \see https://www.khronos.org/registry/vx/schema/openvx-1-1.xsd
+ * \ingroup group_xml
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxExportToXML(vx_context context, vx_char xmlfile[]);
+
+
+/*! \brief Imports all framework and data objects from an XML file into the given context.
+ * \param [in] context The context to import into.
+ * \param [in] xmlfile The XML file to read.
+ * \note The reference indices in the import object correspond with the reference numbers in the
+ * XML file. It is assumed that the program has some means to know which references to use from
+ * the imported list (either by name: \ref vxGetImportReferenceByName, or by index from looking at the XML
+ * file (debug use case): \ref vxGetImportReferenceByIndex). Alternatively, the program can use
+ * \ref vxGetImportReferenceByIndex in a loop and query each one to understand what was imported. After
+ * all references of interest have been retrieved, the import object should be released using
+ * \ref vxReleaseImport.
+ * \return \ref vx_import object containing references to the imported objects in the context.
+ * \see https://www.khronos.org/registry/vx/schema/openvx-1-1.xsd
+ * \ingroup group_xml
+ */
+VX_API_ENTRY vx_import VX_API_CALL vxImportFromXML(vx_context context, vx_char xmlfile[]);
+
+/*! \brief Used to retrieve a reference by name from the import when the name is known beforehand. If
+ * multiple references have the same name, then *any* one of them may be returned.
+ * \param [in] import The reference to the import object.
+ * \param [in] name The reference string name.
+ * \return \ref vx_reference
+ * \retval 0 Invalid import object or name does not match a reference in the import object.
+ * \retval * The reference matching the requested name.
+ * \note Use \ref vxReleaseReference to release the reference before releasing the context.
+ * \pre \ref vxImportFromXML
+ * \ingroup group_xml
+ */
+VX_API_ENTRY vx_reference VX_API_CALL vxGetImportReferenceByName(vx_import import, const vx_char *name);
+
+/*!
\brief Used to retrieve a reference by the index from the import. + * \param [in] import The reference to the import object. + * \param [in] index The index of the reference in the import object to return. + * \return \ref vx_reference + * \retval 0 Invalid import object or index. + * \retval * The reference at the requested index number. + * \note Use \ref vxQueryImport with \ref VX_IMPORT_ATTRIBUTE_COUNT to retrieve + * the upper limit of references in the import. + * \note Use \ref vxReleaseReference to release the reference before releasing the context. + * \pre \ref vxImportFromXML + * \ingroup group_xml + */ +VX_API_ENTRY vx_reference VX_API_CALL vxGetImportReferenceByIndex(vx_import import, vx_uint32 index); + +/*! \brief Used to query the import about its properties. + * \param [in] import The reference to the import object. + * \param [in] attribute The \ref vx_import_attribute_e value to query for. + * \param [out] ptr The location at which the resulting value will be stored. + * \param [in] size The size of the container to which ptr points. + * \return A \ref vx_status_e enumeration. + * \pre \ref vxImportFromXML + * \ingroup group_xml + */ +VX_API_ENTRY vx_status VX_API_CALL vxQueryImport(vx_import import, vx_enum attribute, void *ptr, vx_size size); + +/*! \brief Releases a reference to an import object. + * Also internally releases its references to its imported objects. These + * imported objects may not be garbage collected until their total reference + * counts are zero. + * \param [in] import The pointer to the import object to release. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If import is not a \ref vx_import. + * \note After returning from this function the reference will be zeroed. + * \pre \ref vxImportFromXML + * \ingroup group_xml + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseImport(vx_import *import); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_lib_debug.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_lib_debug.h new file mode 100644 index 0000000..1ed1a61 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_lib_debug.h @@ -0,0 +1,385 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _OPENVX_EXT_DEBUG_H_ +#define _OPENVX_EXT_DEBUG_H_ + +#include + +/*! + * \file + * \brief The OpenVX Debugging Extension. 
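A minimal round-trip sketch of the XML export/import calls above; the file name is arbitrary and most error checks are omitted:

```c
/* Export everything in the context, then import it back and walk the references. */
vxExportToXML(context, "graph_dump.xml");

vx_import import = vxImportFromXML(context, "graph_dump.xml");
if (vxGetStatus((vx_reference)import) == VX_SUCCESS) {
    vx_uint32 count = 0;
    vxQueryImport(import, VX_IMPORT_ATTRIBUTE_COUNT, &count, sizeof(count));

    for (vx_uint32 i = 0; i < count; i++) {
        vx_reference ref = vxGetImportReferenceByIndex(import, i);
        vx_enum type = VX_TYPE_INVALID;
        vxQueryReference(ref, VX_REFERENCE_TYPE, &type, sizeof(type));
        /* ... dispatch on 'type' as needed ... */
        vxReleaseReference(&ref);
    }
    vxReleaseImport(&import);
}
```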
+ * \defgroup group_debug_ext Debugging Extension + * \defgroup group_vision_function_copy_image Kernel: Copy Image + * \defgroup group_vision_function_copy_array Kernel: Copy Array + * \defgroup group_vision_function_fwrite_image Kernel: File Write Image + * \defgroup group_vision_function_fwrite_array Kernel: File Write Array + * \defgroup group_vision_function_plus1 Kernel: Plus One Image + * \defgroup group_vision_function_fill_image Kernel: Fill Image + * \defgroup group_vision_function_check_image Kernel: Check Image + * \defgroup group_vision_function_check_array Kernel: Check Array + * \defgroup group_vision_function_compare_images Kernel: Compare Images + */ + +/*! \brief The maximum filepath name length. + * \ingroup group_debug_ext + */ +#define VX_MAX_FILE_NAME (256) + +/*! \brief The library value for the extension + * \ingroup group_debug_ext + */ +#define VX_LIBRARY_KHR_DEBUG (0xFF) + +/*! \brief The list of extensions to OpenVX from the Sample Implementation. + * \ingroup group_debug_ext + */ +enum vx_kernel_debug_ext_e { + + /*! + * \brief The Copy kernel. Output = Input. + * \param [in] vx_image The input image. + * \param [out] vx_image The output image. + * \see group_vision_function_copy_image + */ + VX_KERNEL_DEBUG_COPY_IMAGE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_DEBUG) + 0x0, + + /*! + * \brief The Copy Kernel, Output = Input. + * \param [in] vx_array The input array. + * \param [out] vx_array The output array. + * \see group_vision_function_copy_array + */ + VX_KERNEL_DEBUG_COPY_ARRAY = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_DEBUG) + 0x1, + + /*! + * \brief The File Writing Kernel for Images. + * \param [in] vx_image The input image. + * \param [in] vx_array The name of the file. + * \see group_vision_function_fwrite_image + */ + VX_KERNEL_DEBUG_FWRITE_IMAGE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_DEBUG) + 0x2, + + /*! + * \brief The File Writing Kernel for Arrays + * \param [in] vx_array The input array. + * \param [in] vx_array The name of the file. + * \see group_vision_function_fwrite_array + */ + VX_KERNEL_DEBUG_FWRITE_ARRAY = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_DEBUG) + 0x3, + + /*! + * \brief The File Reading Kernel for images. + * \param [in] vx_array The name of the file to read. + * \param [out] vx_image The output image. + * \see group_vision_function_fread_image + */ + VX_KERNEL_DEBUG_FREAD_IMAGE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_DEBUG) + 0x4, + + /*! + * \brief The File Reading Kernel for Arrays. + * \param [in] vx_array The name of the file to read. + * \param [out] vx_image The output image. + * \see group_vision_function_fread_array + */ + VX_KERNEL_DEBUG_FREAD_ARRAY = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_DEBUG) + 0x5, + + /*! + * \brief Fills the image with a given value. + * \param [in] vx_uint32 + * \param [out] vx_image + * \ingroup group_vision_function_fill_image + */ + VX_KERNEL_FILL_IMAGE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_DEBUG) + 0x6, + + /*! + * \brief Checks an image against a known value and returns a number of + * errors. + * \param [in] vx_image + * \param [in] vx_uint32 + * \param [out] vx_scalar + * \ingroup group_vision_function_check_image + */ + VX_KERNEL_CHECK_IMAGE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_DEBUG) + 0x7, + + /*! + * \brief Checks an array against a known value and returns a number of + * errors. 
+     * \param [in] vx_array
+     * \param [in] vx_uint8
+     * \param [out] vx_scalar
+     * \ingroup group_vision_function_check_array
+     */
+    VX_KERNEL_CHECK_ARRAY = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_DEBUG) + 0x8,
+
+    /*!
+     * \brief Compares two images and returns the number of differences.
+     * \param [in] vx_image
+     * \param [in] vx_image
+     * \param [out] vx_scalar
+     * \ingroup group_vision_function_compare_image
+     */
+    VX_KERNEL_COMPARE_IMAGE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_DEBUG) + 0x9,
+
+    /*!
+     * \brief Copies an image from a memory area.
+     * \param [in] void *
+     * \param [out] vx_image
+     * \see group_vision_function_copy_ptr
+     */
+    VX_KERNEL_COPY_IMAGE_FROM_PTR = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_DEBUG) + 0xA,
+};
+
+/******************************************************************************/
+// GRAPH MODE FUNCTIONS
+/******************************************************************************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!
+ * \brief [Graph] Creates a Copy Image Node.
+ * \param [in] graph The handle to the graph.
+ * \param [in] input The input image.
+ * \param [out] output The output image.
+ * \see VX_KERNEL_COPY_IMAGE
+ * \note Graph Mode Function.
+ * \ingroup group_vision_function_copy_image
+ */
+vx_node vxCopyImageNode(vx_graph graph, vx_image input, vx_image output);
+
+/*!
+ * \brief [Graph] Creates a Copy Array Node.
+ * \param [in] graph The handle to the graph.
+ * \param [in] input The input array.
+ * \param [out] output The output array.
+ * \see VX_KERNEL_COPY_ARRAY
+ * \note Graph Mode Function.
+ * \ingroup group_vision_function_copy_array
+ */
+vx_node vxCopyArrayNode(vx_graph graph, vx_array input, vx_array output);
+
+/*! \brief [Graph] Writes the source image to the file.
+ * \param [in] graph The handle to the graph.
+ * \param [in] image The input image.
+ * \param [in] name The name of the file.
+ * \note Graph Mode Function.
+ * \ingroup group_vision_function_fwrite_image
+ */
+vx_node vxFWriteImageNode(vx_graph graph, vx_image image, vx_char name[VX_MAX_FILE_NAME]);
+
+/*! \brief [Graph] Writes the source array to the file.
+ * \param [in] graph The handle to the graph.
+ * \param [in] array The input array.
+ * \param [in] name The name of the file.
+ * \note Graph Mode Function.
+ * \ingroup group_vision_function_fwrite_array
+ */
+vx_node vxFWriteArrayNode(vx_graph graph, vx_array array, vx_char name[VX_MAX_FILE_NAME]);
+
+/*! \brief [Graph] Reads the source image from the file.
+ * \param [in] graph The handle to the graph.
+ * \param [in] name The name of the file.
+ * \param [out] image The output image.
+ * \note Graph Mode Function.
+ * \ingroup group_vision_function_fread_image
+ */
+vx_node vxFReadImageNode(vx_graph graph, vx_char name[VX_MAX_FILE_NAME], vx_image image);
+
+/*! \brief [Graph] Reads the source array from the file.
+ * \param [in] graph The handle to the graph.
+ * \param [in] name The name of the file.
+ * \param [out] array The output array.
+ * \note Graph Mode Function.
+ * \ingroup group_vision_function_fread_array
+ */
+vx_node vxFReadArrayNode(vx_graph graph, vx_char name[VX_MAX_FILE_NAME], vx_array array);
+
+/*! \brief [Graph] Adds 1 to each uint8 pixel. This will clamp at 255.
+ * \param [in] graph The handle to the graph.
+ * \param [in,out] image The image to increment.
+ * \note Graph Mode Function
+ * \ingroup group_vision_function_plus1
+ */
+vx_node vxPlusOneNode(vx_graph graph, vx_image image);
+
+/*!
+ * \brief [Graph] Fills an image with a known value.
+ * \param [in] graph The handle to the graph. + * \param [in] value The known value to fill the image with. + * \param [out] output The image to fill. + * \note Graph Mode Function + * \ingroup group_vision_function_fill_image + */ +vx_node vxFillImageNode(vx_graph graph, vx_uint32 value, vx_image output); + +/*! + * \brief [Graph] Checks an image against a known value. + * \param [in] graph The handle to the graph. + * \param [in] input The image to check. + * \param [in] value The known value to check the image against. + * \param [out] errs The handle to the number of errors found. + * \note Graph Mode Function + * \ingroup group_vision_function_check_image + */ +vx_node vxCheckImageNode(vx_graph graph, vx_image input, vx_uint32 value, vx_scalar errs); + +/*! + * \brief [Graph] Checks a array for a known value. + * \param [in] graph The handle to the graph. + * \param [in] input The array to check. + * \param [in] value The known value to check against. + * \param [out] errs An output of the number of errors. + * \note Graph Mode Function + * \ingroup group_vision_function_check_array + */ +vx_node vxCheckArrayNode(vx_graph graph, vx_array input, vx_uint8 value, vx_scalar errs); + +/*! + * \brief [Graph] Compares two images and returns the number of pixel sub-channels + * which are different. + * \param [in] graph The handle to the graph. + * \param [in] a The first image. + * \param [in] b The second image. + * \param [out] diffs The handle to scalar to hold the number of differences. + * \note Graph Mode Function + * \ingroup group_vision_function_compare_image + */ +vx_node vxCompareImagesNode(vx_graph graph, vx_image a, vx_image b, vx_scalar diffs); + +/*! \brief [Graph] Copies a HOST memory area into an image. + * \param [in] graph The handle to the graph. + * \param [in] ptr The input pointer to the memory area to copy. + * \param [out] output The output image. + * \note Graph Mode Function + * \ingroup group_vision_function_copy_ptr + */ +vx_node vxCopyImageFromPtrNode(vx_graph graph, void *ptr, vx_image output); + +/******************************************************************************/ +// IMMEDIATE MODE FUNCTION +/******************************************************************************/ + +/*! \brief [Immediate] Copies the source image to the destination image. + * \param [in] src The input image. + * \param [in] dst The output image. + * \note Immediate Mode Function. + * \ingroup group_vision_function_copy_image + */ +vx_status vxuCopyImage(vx_context context, vx_image src, vx_image dst); + +/*! \brief [Immediate] Copies the source array to the destination array. + * \param [in] src The input array. + * \param [in] dst The output array. + * \note Immediate Mode Function. + * \ingroup group_vision_function_copy_array + */ +vx_status vxuCopyArray(vx_context context, vx_array src, vx_array dst); + +/*! \brief [Immediate] Writes the source image to the file. + * \param [in] image The input array. + * \param [in] name The name of the file. + * \note Immediate Mode Function. + * \ingroup group_vision_function_fwrite_image + */ +vx_status vxuFWriteImage(vx_context context, vx_image image, vx_char name[VX_MAX_FILE_NAME]); + +/*! \brief [Immediate] Writes the source array to the file. + * \param [in] array The input array. + * \param [in] name The name of the file. + * \note Immediate Mode Function. + * \ingroup group_vision_function_fwrite_array + */ +vx_status vxuFWriteArray(vx_context context, vx_array array, vx_char name[VX_MAX_FILE_NAME]); + +/*! 
\brief [Immediate] Reads the source image from the file.
+ * \param [in] name The name of the file.
+ * \param [out] image The output image.
+ * \note Immediate Mode Function.
+ * \ingroup group_vision_function_fread_image
+ */
+vx_status vxuFReadImage(vx_context context, vx_char name[VX_MAX_FILE_NAME], vx_image image);
+
+/*! \brief [Immediate] Reads the source array from the file.
+ * \param [in] name The name of the file.
+ * \param [out] array The output array.
+ * \note Immediate Mode Function.
+ * \ingroup group_vision_function_fread_array
+ */
+vx_status vxuFReadArray(vx_context context, vx_char name[VX_MAX_FILE_NAME], vx_array array);
+
+/*! \brief [Immediate] Adds 1 to each uint8 pixel. This will clamp at 255.
+ * \param [in,out] image The image to increment.
+ * \note Immediate Mode Function
+ * \ingroup group_vision_function_plus1
+ */
+vx_node vxuPlusOneNode(vx_context context, vx_image image);
+
+/*!
+ * \brief [Immediate] Fills an image with a known value.
+ * \param [in] value The known value to fill the image with.
+ * \param [out] output The image to fill.
+ * \note Immediate Mode Function
+ * \ingroup group_vision_function_fill_image
+ */
+vx_status vxuFillImage(vx_context context, vx_uint32 value, vx_image output);
+
+/*!
+ * \brief [Immediate] Checks an image against a known value.
+ * \param [in] input The image to check.
+ * \param [in] value The known value to check the image against.
+ * \param [out] numErrors The handle to the number of errors found.
+ * \note Immediate Mode Function
+ * \ingroup group_vision_function_check_image
+ */
+vx_status vxuCheckImage(vx_context context, vx_image input, vx_uint32 value, vx_uint32 *numErrors);
+
+/*!
+ * \brief [Immediate] Checks an array for a known value.
+ * \param [in] input The array to check.
+ * \param [in] value The known value to check against.
+ * \param [out] numErrors An output of the number of errors.
+ * \note Immediate Mode Function
+ * \ingroup group_vision_function_check_array
+ */
+vx_status vxuCheckArray(vx_context context, vx_array input, vx_uint8 value, vx_uint32 *numErrors);
+
+/*!
+ * \brief [Immediate] Compares two images and returns the number of pixel sub-channels
+ * which are different.
+ * \param [in] a The first image.
+ * \param [in] b The second image.
+ * \param [out] numDiffs The handle to scalar to hold the number of differences.
+ * \note Immediate Mode Function
+ * \ingroup group_vision_function_compare_image
+ */
+vx_status vxuCompareImages(vx_context context, vx_image a, vx_image b, vx_uint32 *numDiffs);
+
+/*! \brief [Immediate] Copies a HOST memory area into an image.
+ * \param [in] ptr The input pointer to the memory area to copy.
+ * \param [out] output The output image.
+ * \note Immediate Mode Function
+ * \ingroup group_vision_function_copy_ptr
+ */
+vx_status vxuCopyImageFromPtr(vx_context context, void *ptr, vx_image output);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_lib_extras.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_lib_extras.h
new file mode 100644
index 0000000..b697c63
--- /dev/null
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_lib_extras.h
@@ -0,0 +1,252 @@
+/*
+
+ * Copyright (c) 2012-2017 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
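For orientation, a short sketch chaining a few of the debug extension's immediate-mode calls declared above. It assumes a VX_DF_IMAGE_U8 image created elsewhere; the fill value and the output file name are arbitrary:

```c
/* Fill an image with a constant, bump every pixel by one, then verify. */
vx_uint32 errors = 0;

vxuFillImage(context, 0x55, image);            /* image becomes all 0x55          */
vxuPlusOneNode(context, image);                /* each pixel saturating-adds to 0x56 */
vxuCheckImage(context, image, 0x56, &errors);  /* errors == 0 on success          */

vxuFWriteImage(context, image, "dump.raw");    /* write it out for inspection     */
```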
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _VX_EXT_EXTRAS_H_
+#define _VX_EXT_EXTRAS_H_
+
+/*! \file
+ * \brief Extras Extension.
+ *
+ * \defgroup group_extras_ext Khronos Extras Extension.
+ * \brief A Set of Kernels which extend OpenVX.
+ *
+ * \defgroup group_vision_function_laplacian_image Kernel: Laplacian Filter
+ * \brief Computes a Laplacian filter over a window of the input image.
+ * \details This filter uses the following convolution matrix:
+ \f[
+ \mathbf{K}_{gaussian} = \begin{vmatrix}
+   1 & 1 & 1\\
+   1 &-8 & 1\\
+   1 & 1 & 1
+ \end{vmatrix} * \frac{1}{1}
+ \f]
+ *
+ * \defgroup group_vision_function_scharr3x3 Kernel: Scharr 3x3
+ * \brief The Scharr Image Filter Kernel
+ * \details This kernel produces two output planes (one can be omitted)
+ * in the x and y plane. The Scharr operators \f$G_x, G_y\f$ are defined as:
+ \f[
+ \mathbf{G}_x=\begin{vmatrix}
+   -3 & 0 & +3\\
+   -10& 0 & +10\\
+   -3 & 0 & +3
+ \end{vmatrix}
+ ,
+ \mathbf{G}_y=\begin{vmatrix}
+   -3 & -10 & -3 \\
+    0 &  0  &  0 \\
+   +3 & +10 & +3
+ \end{vmatrix}
+ \f]
+ *
+ */
+
+/*! \brief The Khronos Extras Library
+ * \ingroup group_extras_ext
+ */
+#define VX_LIBRARY_KHR_EXTRAS (0xFE)
+
+/*! \brief The Khronos Extras Kernels.
+ * \ingroup group_extras_ext
+ */
+enum vx_kernel_extras_ext_e {
+    /*! \brief The Non-Maximum Suppression Kernel for Canny.
+     * \note Use "org.khronos.extra.nonmaximasuppression" to \ref vxGetKernelByName.
+     * \param [in] vx_image The magnitude image in VX_DF_IMAGE_U8.
+     * \param [in] vx_image The phase image in VX_DF_IMAGE_U8.
+     * \param [out] vx_image The edge image in VX_DF_IMAGE_U8.
+     * \ingroup group_vision_function_nonmaxsuppression
+     */
+    VX_KERNEL_EXTRAS_NONMAXSUPPRESSION_CANNY = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_EXTRAS) + 0x0,
+
+    /*! \brief The laplacian filter kernel.
+     * \note Use "org.khronos.extras.laplacian3x3" to \ref vxGetKernelByName.
+     * \param [in] vx_image The VX_DF_IMAGE_U8 input image.
+     * \param [out] vx_image The VX_DF_IMAGE_U8 output image.
+     * \see group_vision_function_laplacian_image
+     */
+    VX_KERNEL_EXTRAS_LAPLACIAN_3x3 = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_EXTRAS) + 0x1,
+
+    /*! \brief The scharr filter kernel.
+     * \note Use "org.khronos.extras.scharr3x3" to \ref vxGetKernelByName.
+     * \param [in] vx_image The VX_DF_IMAGE_U8 input image.
+     * \param [out] vx_image The VX_DF_IMAGE_S16 output gradient x image.
+     * \param [out] vx_image The VX_DF_IMAGE_S16 output gradient y image.
+     * \see group_vision_function_scharr3x3
+     */
+    VX_KERNEL_EXTRAS_SCHARR_3x3 = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_EXTRAS) + 0x2,
+
+    /*! \brief The Harris Score Kernel.
+     * \note use "org.khronos.extras.harris_score".
+     * \param [in] vx_image A VX_DF_IMAGE_S16 X Gradient
+     * \param [in] vx_image A VX_DF_IMAGE_S16 Y Gradient
+     * \param [in] vx_scalar A block size.
+     * \param [out] vx_image A VX_DF_IMAGE_S32 corner score per pixel.
+     * \ingroup group_vision_function_harris_score
+     */
+    VX_KERNEL_EXTRAS_HARRIS_SCORE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_EXTRAS) + 0x3,
+
+    /*! \brief The Sobel MxN kernel.
+ * \note Use "org.khronos.extras.sobelMxN" to \ref vxGetKernelByName. + * \param [in] vx_image The VX_DF_IMAGE_U8 input image. + * \param [in] vx_scalar Window Size (3,5,7) + * \param [out] vx_image The VX_DF_IMAGE_S16 output gradient x image. + * \param [out] vx_image The VX_DF_IMAGE_S16 output gradient y image. + * \see group_vision_function_sobelmxn + */ + VX_KERNEL_EXTRAS_SOBEL_MxN = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_EXTRAS) + 0x4, + + /*! \brief The image to list converter. + * \param [in] vx_image The VX_DF_IMAGE_U8 or VX_DF_IMAGE_S32 image. + * \param [out] vx_array The array of output + * \param [out] vx_scalar The total number of non zero points in image (optional) + * \ingroup group_vision_function_image_lister + */ + VX_KERNEL_EXTRAS_IMAGE_LISTER = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_EXTRAS) + 0x5, + + /*! \brief The Euclidean Non-Maximum Suppression Kernel for Harris Corners. + * \param [in] vx_image The VX_DF_IMAGE_F32 image. + * \param [in] vx_scalar The minimum threshold + * \param [in] vx_scalar The euclidean distance from the considered pixel. + * \param [out] vx_image The VX_DF_IMAGE_F32 image. + * \ingroup group_vision_function_euclidean_nonmax + */ + VX_KERNEL_EXTRAS_EUCLIDEAN_NONMAXSUPPRESSION_HARRIS = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_EXTRAS) + 0x6, + + /*! \brief Elementwise binary norm kernel. + * \param [in] vx_image Left image (VX_DF_IMAGE_S16). + * \param [in] vx_image Right image (VX_DF_IMAGE_S16). + * \param [in] vx_scalar Norm type (vx_norm_type_e). + * \param [in] vx_image Output image (VX_DF_IMAGE_U16). + */ + VX_KERNEL_EXTRAS_ELEMENTWISE_NORM = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_EXTRAS) + 0x7, + + /*! \brief Edge tracing kernel. + * \param [in] vx_image Norm image (VX_DF_IMAGE_U16). + * \param [in] vx_image Phase image (VX_DF_IMAGE_U8). + * \param [in] vx_threshold Threshold (VX_THRESHOLD_TYPE_RANGE). + * \param [out] vx_image Output binary image (VX_DF_IMAGE_U8). + */ + VX_KERNEL_EXTRAS_EDGE_TRACE = VX_KERNEL_BASE(VX_ID_KHRONOS, VX_LIBRARY_KHR_EXTRAS) + 0x8 +}; + +/*! \brief Extra VX_DF_IMAGE codes supported by this extension. */ +enum _vx_extra_df_image { + /*! \brief A single plane of 32 bit float data. + * The range of the data is not specified. + */ + VX_DF_IMAGE_F32 = VX_DF_IMAGE('F','0','3','2'), +}; + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief [Graph] Creates a Non Max Suppress Node. + * \param [in] graph The handle to the graph. + * \param [in] input The input image in VX_DF_IMAGE_U8 format. + * \param [out] output The output image in VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_laplacian_image + */ +vx_node vxNonMaxSuppressionCannyNode(vx_graph graph, vx_image mag, vx_image phase, vx_image edge); + +/*! \brief [Immediate] Creates a Non Max Suppress Node. + * \param [in] graph The handle to the graph. + * \param [in] input The input image in VX_DF_IMAGE_U8 format. + * \param [out] output The output image in VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_laplacian_image + */ +vx_status vxuNonMaxSuppressionCanny(vx_context context, vx_image mag, vx_image phase, vx_image edge); + +/*! \brief [Graph] Creates a Laplacian Filter Node. + * \param [in] graph The handle to the graph. + * \param [in] input The input image in VX_DF_IMAGE_U8 format. + * \param [out] output The output image in VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_laplacian_image + */ +vx_node vxLaplacian3x3Node(vx_graph graph, vx_image input, vx_image output); + +/*! 
\brief [Immediate] Computes a laplacian filter on the image by a 3x3 window. + * \param [in] input The input image in VX_DF_IMAGE_U8 format. + * \param [out] output The output image in VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_laplacian_image + */ +vx_status vxuLaplacian3x3(vx_context context, vx_image input, vx_image output); + +/*! \brief [Graph] Creates a Scharr Filter Node. + * \param [in] graph The handle to the graph. + * \param [in] input The input image in VX_DF_IMAGE_U8 format. + * \param [out] output The output image in VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_laplacian_image + */ +vx_node vxScharr3x3Node(vx_graph graph, vx_image input, vx_image output1, vx_image output2); + +/*! \brief [Immediate] Computes a Scharr filter on the image by a 3x3 window. + * \param [in] input The input image in VX_DF_IMAGE_U8 format. + * \param [out] output The output image in VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_laplacian_image + */ +vx_status vxuScharr3x3(vx_context context, vx_image input, vx_image output1, vx_image output2); + +vx_node vxSobelMxNNode(vx_graph graph, vx_image input, vx_scalar win, vx_image gx, vx_image gy); + +vx_status vxuSobelMxN(vx_context context, vx_image input, vx_scalar win, vx_image gx, vx_image gy); + +vx_node vxHarrisScoreNode(vx_graph graph, + vx_image gx, + vx_image gy, + vx_scalar sensitivity, + vx_scalar grad_size, + vx_scalar block_size, + vx_scalar shift, + vx_image score); + +vx_status vxuHarrisScore(vx_context context, vx_image gx, + vx_image gy, + vx_scalar sensitivity, + vx_scalar grad_size, + vx_scalar block_size, + vx_scalar shift, + vx_image score); + +vx_node vxEuclideanNonMaxHarrisNode(vx_graph graph, + vx_image input, + vx_scalar strength_thresh, + vx_scalar min_distance, + vx_image output); + +vx_status vxuEuclideanNonMaxHarris(vx_context context, vx_image input, + vx_scalar strength_thresh, + vx_scalar min_distance, + vx_image output); + +vx_node vxImageListerNode(vx_graph graph, vx_image input, vx_array arr, vx_scalar num_points); + +vx_status vxuImageLister(vx_context context, vx_image input, + vx_array arr, vx_scalar num_points); + +vx_node vxElementwiseNormNode(vx_graph graph, vx_image input_x, vx_image input_y, vx_scalar norm_type, vx_image output); + +vx_node vxEdgeTraceNode(vx_graph graph, vx_image norm, vx_threshold threshold, vx_image output); + +#ifdef __cplusplus +} +#endif + +#endif /* _VX_EXT_EXTRAS_H_ */ + diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_lib_xyz.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_lib_xyz.h new file mode 100644 index 0000000..bbfaeea --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_lib_xyz.h @@ -0,0 +1,109 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#ifndef _OPENVX_EXT_XYZ_H_ +#define _OPENVX_EXT_XYZ_H_ + +/*! + * \file + * \brief An example of how to wrap a User Extension Kernel. 
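A brief sketch of how these extras entry points combine with core object creation; the image sizes are arbitrary, the source image is assumed to be populated elsewhere, and release calls are omitted:

```c
/* Immediate mode: 3x3 Laplacian on a U8 image. */
vx_image src = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);
vx_image dst = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);
vxuLaplacian3x3(context, src, dst);

/* Graph mode: Scharr gradients into two S16 virtual images. */
vx_graph graph = vxCreateGraph(context);
vx_image gx = vxCreateVirtualImage(graph, 640, 480, VX_DF_IMAGE_S16);
vx_image gy = vxCreateVirtualImage(graph, 640, 480, VX_DF_IMAGE_S16);
vxScharr3x3Node(graph, src, gx, gy);
```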
+ * + * \defgroup group_xyz_ext The Example User Kernel Extension + * + */ + +#include + +/*! + * \file vx_ext_xyz.h + * \brief The example header for how to write a user mode extension to OpenVX. + */ + +/*! \brief The XYZ Data area in bytes + * \ingroup group_xyz_ext + */ +#define XYZ_DATA_AREA (1024) + +/*! \brief The required number of items in the temp array + * \ingroup group_xyz_ext + */ +#define XYZ_TEMP_NUMITEMS (374) + +/*! \brief The minimum value of the scalar for the XYZ Kernel. + * \ingroup group_xyz_ext + */ +#define XYZ_VALUE_MIN (-10) + +/*! \brief The maximum value of the scalar for the XYZ Kernel. + * \ingroup group_xyz_ext + */ +#define XYZ_VALUE_MAX (10) + +//! [KERNEL ENUM] +#define VX_KERNEL_NAME_KHR_XYZ "org.khronos.example.xyz" +/*! \brief The XYZ Example Library Set + * \ingroup group_xyz_ext + */ +#define VX_LIBRARY_XYZ (0x3) // assigned from Khronos, vendors control their own + +/*! \brief The list of XYZ Kernels. + * \ingroup group_xyz_ext + */ +enum vx_kernel_xyz_ext_e { + /*! \brief The Example User Defined Kernel */ + VX_KERNEL_KHR_XYZ = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_XYZ) + 0x0, + // up to 0xFFF kernel enums can be created. +}; +//! [KERNEL ENUM] + +#ifdef __cplusplus +extern "C" { +#endif + +//! [node] +/*! \brief [Graph] This is an example ISV or OEM provided node which executes + * in the Graph to call the XYZ kernel. + * \param [in] graph The handle to the graph in which to instantiate the node. + * \param [in] input The input image. + * \param [in] value The input scalar value + * \param [out] output The output image. + * \param [in,out] temp A temp array for some data which is needed for + * every iteration. + * \ingroup group_example_kernel + */ +vx_node vxXYZNode(vx_graph graph, vx_image input, vx_uint32 value, vx_image output, vx_array temp); +//! [node] + +//! [vxu] +/*! \brief [Immediate] This is an example of an immediate mode version of the XYZ node. + * \param [in] context The overall context of the implementation. + * \param [in] input The input image. + * \param [in] value The input scalar value + * \param [out] output The output image. + * \param [in,out] temp A temp array for some data which is needed for + * every iteration. + * \ingroup group_example_kernel + */ +vx_status vxuXYZ(vx_context context, vx_image input, vx_uint32 value, vx_image output, vx_array temp); +//! [vxu] + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_nodes.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_nodes.h new file mode 100644 index 0000000..3bfb7f2 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_nodes.h @@ -0,0 +1,947 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _OPENVX_NODES_H_ +#define _OPENVX_NODES_H_ + +/*! + * \file vx_nodes.h + * \brief The "Simple" API interface for OpenVX. These APIs are just + * wrappers around the more verbose functions defined in \ref vx_api.h. 
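The example extension is driven like any other user kernel. A sketch follows; the array item type, the image sizes, and the scalar value 5 are illustrative assumptions, and a graph created elsewhere is assumed for the graph-mode call:

```c
/* The temp array must hold XYZ_TEMP_NUMITEMS items, per the constants above
 * (the item type here is an assumption). */
vx_array temp = vxCreateArray(context, VX_TYPE_UINT8, XYZ_TEMP_NUMITEMS);
vx_image in   = vxCreateImage(context, 320, 240, VX_DF_IMAGE_U8);
vx_image out  = vxCreateImage(context, 320, 240, VX_DF_IMAGE_U8);

/* Graph mode ... */
vx_node n = vxXYZNode(graph, in, 5 /* within [XYZ_VALUE_MIN, XYZ_VALUE_MAX] */, out, temp);

/* ... or immediate mode. */
vxuXYZ(context, in, 5, out, temp);
```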
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief [Graph] Creates a color conversion node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image from which to convert. + * \param [out] output The output image to which to convert, which must have the same dimensions as the input image. + * \see VX_KERNEL_COLOR_CONVERT + * \ingroup group_vision_function_colorconvert + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxColorConvertNode(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates a channel extract node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image. Must be one of the defined \ref vx_df_image_e multi-channel formats. + * \param [in] channel The \ref vx_channel_e channel to extract. + * \param [out] output The output image. Must be \ref VX_DF_IMAGE_U8, and must have the same dimensions as the input image. + * \see VX_KERNEL_CHANNEL_EXTRACT + * \ingroup group_vision_function_channelextract + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxChannelExtractNode(vx_graph graph, + vx_image input, + vx_enum channel, + vx_image output); + +/*! \brief [Graph] Creates a channel combine node. + * \param [in] graph The graph reference. + * \param [in] plane0 The plane that forms channel 0. Must be \ref VX_DF_IMAGE_U8. + * \param [in] plane1 The plane that forms channel 1. Must be \ref VX_DF_IMAGE_U8. + * \param [in] plane2 [optional] The plane that forms channel 2. Must be \ref VX_DF_IMAGE_U8. + * \param [in] plane3 [optional] The plane that forms channel 3. Must be \ref VX_DF_IMAGE_U8. + * \param [out] output The output image. The format of the image must be defined, even if the image is virtual. Must have the same dimensions as the input images + * \see VX_KERNEL_CHANNEL_COMBINE + * \ingroup group_vision_function_channelcombine + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxChannelCombineNode(vx_graph graph, + vx_image plane0, + vx_image plane1, + vx_image plane2, + vx_image plane3, + vx_image output); + +/*! \brief [Graph] Creates a Phase node. + * \param [in] graph The reference to the graph. + * \param [in] grad_x The input x image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [in] grad_y The input y image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [out] orientation The phase image. This is in \ref VX_DF_IMAGE_U8 format, and must have the same dimensions as the input images. + * \see VX_KERNEL_PHASE + * \ingroup group_vision_function_phase + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxPhaseNode(vx_graph graph, vx_image grad_x, vx_image grad_y, vx_image orientation); + +/*! \brief [Graph] Creates a Sobel3x3 node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output_x [optional] The output gradient in the x direction in \ref VX_DF_IMAGE_S16. 
Must have the same dimensions as the input image. + * \param [out] output_y [optional] The output gradient in the y direction in \ref VX_DF_IMAGE_S16. Must have the same dimensions as the input image. + * \see VX_KERNEL_SOBEL_3x3 + * \ingroup group_vision_function_sobel3x3 + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxSobel3x3Node(vx_graph graph, vx_image input, vx_image output_x, vx_image output_y); + + +/*! \brief [Graph] Create a Magnitude node. + * \param [in] graph The reference to the graph. + * \param [in] grad_x The input x image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [in] grad_y The input y image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [out] mag The magnitude image. This is in \ref VX_DF_IMAGE_S16 format. Must have the same dimensions as the input image. + * \see VX_KERNEL_MAGNITUDE + * \ingroup group_vision_function_magnitude + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxMagnitudeNode(vx_graph graph, vx_image grad_x, vx_image grad_y, vx_image mag); + +/*! \brief [Graph] Creates a Scale Image Node. + * \param [in] graph The reference to the graph. + * \param [in] src The source image of type \ref VX_DF_IMAGE_U8. + * \param [out] dst The destination image of type \ref VX_DF_IMAGE_U8. + * \param [in] type The interpolation type to use. \see vx_interpolation_type_e. + * \ingroup group_vision_function_scale_image + * \note The destination image must have a defined size and format. The border modes + * \ref VX_NODE_BORDER value \ref VX_BORDER_UNDEFINED, + * \ref VX_BORDER_REPLICATE and \ref VX_BORDER_CONSTANT are supported. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxScaleImageNode(vx_graph graph, vx_image src, vx_image dst, vx_enum type); + +/*! \brief [Graph] Creates a Table Lookup node. If a value from the input image is not present in the lookup table, the result is undefined. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [in] lut The LUT which is of type \ref VX_TYPE_UINT8 if input image is \ref VX_DF_IMAGE_U8 or \ref VX_TYPE_INT16 if input image is \ref VX_DF_IMAGE_S16. + * \param [out] output The output image of the same type and size as the input image. + * \ingroup group_vision_function_lut + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_node VX_API_CALL vxTableLookupNode(vx_graph graph, vx_image input, vx_lut lut, vx_image output); + +/*! \brief [Graph] Creates a Histogram node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8. + * \param [out] distribution The output distribution. + * \ingroup group_vision_function_histogram + * \return \ref vx_node. + * \retval vx_node A node reference. 
Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxHistogramNode(vx_graph graph, vx_image input, vx_distribution distribution); + +/*! \brief [Graph] Creates a Histogram Equalization node. + * \param [in] graph The reference to the graph. + * \param [in] input The grayscale input image in \ref VX_DF_IMAGE_U8. + * \param [out] output The grayscale output image of type \ref VX_DF_IMAGE_U8 with equalized brightness and contrast and same size as the input image. + * \ingroup group_vision_function_equalize_hist + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxEqualizeHistNode(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates an AbsDiff node. + * \param [in] graph The reference to the graph. + * \param [in] in1 An input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [in] in2 An input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [out] out The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format, which must have the same dimensions as the input image. + * \ingroup group_vision_function_absdiff + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxAbsDiffNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Graph] Creates a mean value and optionally, a standard deviation node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image. \ref VX_DF_IMAGE_U8 is supported. + * \param [out] mean The \ref VX_TYPE_FLOAT32 average pixel value. + * \param [out] stddev [optional] The \ref VX_TYPE_FLOAT32 standard deviation of the pixel values. + * \ingroup group_vision_function_meanstddev + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxMeanStdDevNode(vx_graph graph, vx_image input, vx_scalar mean, vx_scalar stddev); + +/*! \brief [Graph] Creates a Threshold node and returns a reference to it. + * \param [in] graph The reference to the graph in which the node is created. + * \param [in] input The input image. Only images with format \ref VX_DF_IMAGE_U8 + * and \ref VX_DF_IMAGE_S16 are supported. + * \param [in] thresh The thresholding object that defines the parameters of + * the operation. The \ref VX_THRESHOLD_INPUT_FORMAT must be the same as the input image format and + * the \ref VX_THRESHOLD_OUTPUT_FORMAT must be the same as the output image format. + * \param [out] output The output image, that will contain as pixel value + * true and false values defined by \p thresh. Only images with format + * \ref VX_DF_IMAGE_U8 are supported. The dimensions are the same as the input image. + * \ingroup group_vision_function_threshold + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation + * should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxThresholdNode(vx_graph graph, vx_image input, vx_threshold thresh, vx_image output); + +/*! \brief [Graph] Creates a Non-Maxima Suppression node. + * \param [in] graph The reference to the graph. 
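These node creators all follow the same pattern: pass the graph plus the data objects, check the returned vx_node with vxGetStatus, then verify and process the graph. A condensed sketch using the Sobel, Magnitude and Phase nodes from this header (dimensions arbitrary, error handling trimmed):

```c
vx_graph graph = vxCreateGraph(context);
vx_image input = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);
vx_image gx    = vxCreateVirtualImage(graph, 640, 480, VX_DF_IMAGE_S16);
vx_image gy    = vxCreateVirtualImage(graph, 640, 480, VX_DF_IMAGE_S16);
vx_image mag   = vxCreateImage(context, 640, 480, VX_DF_IMAGE_S16);
vx_image phase = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);

vxSobel3x3Node(graph, input, gx, gy);
vxMagnitudeNode(graph, gx, gy, mag);
vxPhaseNode(graph, gx, gy, phase);

if (vxVerifyGraph(graph) == VX_SUCCESS)
    vxProcessGraph(graph);
```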
+ * \param [in] input The input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [in] mask [optional] Constrict suppression to a ROI. The mask image is of type \ref VX_DF_IMAGE_U8 and must be the same dimensions as the input image. + * \param [in] win_size The size of window over which to perform the localized non-maxima suppression. Must be odd, and less than or equal to the smallest dimension of the input image. + * \param [out] output The output image, of the same type and size as the input, that has been non-maxima suppressed. + * \ingroup group_vision_function_nms + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxNonMaxSuppressionNode(vx_graph graph, vx_image input, vx_image mask, vx_int32 win_size, vx_image output); + +/*! \brief [Graph] Creates an Integral Image Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U32 format, which must have the same dimensions as the input image. + * \ingroup group_vision_function_integral_image + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxIntegralImageNode(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates an Erosion Image Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format, which must have the same dimensions as the input image. + * \ingroup group_vision_function_erode_image + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxErode3x3Node(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates a Dilation Image Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format, which must have the same dimensions as the input image. + * \ingroup group_vision_function_dilate_image + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxDilate3x3Node(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates a Median Image Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format, which must have the same dimensions as the input image. + * \ingroup group_vision_function_median_image + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxMedian3x3Node(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates a Box Filter Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. 
+ * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format, which must have the same dimensions as the input image. + * \ingroup group_vision_function_box_image + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxBox3x3Node(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates a Gaussian Filter Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format, which must have the same dimensions as the input image. + * \ingroup group_vision_function_gaussian_image + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxGaussian3x3Node(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates a Non-linear Filter Node. + * \param [in] graph The reference to the graph. + * \param [in] function The non-linear filter function. See \ref vx_non_linear_filter_e. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [in] mask The mask to be applied to the Non-linear function. \ref VX_MATRIX_ORIGIN attribute is used + * to place the mask appropriately when computing the resulting image. See \ref vxCreateMatrixFromPattern. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format, which must have the same dimensions as the input image. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + * \ingroup group_vision_function_nonlinear_filter + */ +VX_API_ENTRY vx_node VX_API_CALL vxNonLinearFilterNode(vx_graph graph, vx_enum function, vx_image input, vx_matrix mask, vx_image output); + +/*! \brief [Graph] Creates a custom convolution node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [in] conv The \ref vx_int16 convolution matrix. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format, which must have the same dimensions as the input image. + * \ingroup group_vision_function_custom_convolution + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxConvolveNode(vx_graph graph, vx_image input, vx_convolution conv, vx_image output); + +/*! \brief [Graph] Creates a node for a Gaussian Image Pyramid. + * \param [in] graph The reference to the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] gaussian The Gaussian pyramid with \ref VX_DF_IMAGE_U8 to construct. + * \ingroup group_vision_function_gaussian_pyramid + * \see group_pyramid + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxGaussianPyramidNode(vx_graph graph, vx_image input, vx_pyramid gaussian); + +/*! \brief [Graph] Creates a node for a Laplacian Image Pyramid. + * \param [in] graph The reference to the graph. 
+ * \param [in] input The input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [out] laplacian The Laplacian pyramid with \ref VX_DF_IMAGE_S16 to construct. + * \param [out] output The lowest resolution image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format necessary to reconstruct the input image from the pyramid. The output image format should be same as input image format. + * \ingroup group_vision_function_laplacian_pyramid + * \see group_pyramid + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxLaplacianPyramidNode(vx_graph graph, vx_image input, + vx_pyramid laplacian, vx_image output); + +/*! \brief [Graph] Reconstructs an image from a Laplacian Image pyramid. + * \param [in] graph The reference to the graph. + * \param [in] laplacian The Laplacian pyramid with \ref VX_DF_IMAGE_S16 format. + * \param [in] input The lowest resolution image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format for the Laplacian pyramid. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format with the highest possible resolution reconstructed from the Laplacian pyramid. The output image format should be same as input image format. + * \ingroup group_vision_function_laplacian_reconstruct + * \see group_pyramid + * \return \ref vx_node. + * \retval 0 Node could not be created. + * \retval * Node handle. + */ +VX_API_ENTRY vx_node VX_API_CALL vxLaplacianReconstructNode(vx_graph graph, vx_pyramid laplacian, vx_image input, + vx_image output); +/*! \brief [Graph] Creates a image weighted average node. + * \param [in] graph The reference to the graph. + * \param [in] img1 The first input \ref VX_DF_IMAGE_U8 image. + * \param [in] alpha The input \ref VX_TYPE_FLOAT32 scalar value with a value in the range of \f$ 0.0 \le \alpha \le 1.0 \f$. + * \param [in] img2 The second \ref VX_DF_IMAGE_U8 image, which must have the same dimensions as the img1. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image, which must have the same dimensions as the img1. + * \ingroup group_vision_function_weighted_average + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxWeightedAverageNode(vx_graph graph, vx_image img1, vx_scalar alpha, vx_image img2, vx_image output); +/*! \brief [Graph] Creates a min,max,loc node. + * \param [in] graph The reference to create the graph. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [out] minVal The minimum value in the image, which corresponds to the type of the input. + * \param [out] maxVal The maximum value in the image, which corresponds to the type of the input. + * \param [out] minLoc [optional] The minimum \ref VX_TYPE_COORDINATES2D locations. If the input image has several minimums, the kernel will return up to the capacity of the array. + * \param [out] maxLoc [optional] The maximum \ref VX_TYPE_COORDINATES2D locations. If the input image has several maximums, the kernel will return up to the capacity of the array. + * \param [out] minCount [optional] The total number of detected minimums in image. Use a \ref VX_TYPE_SIZE scalar. + * \param [out] maxCount [optional] The total number of detected maximums in image. Use a \ref VX_TYPE_SIZE scalar. 
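A hypothetical helper showing how the weighted-average node documented above takes its alpha as a VX_TYPE_FLOAT32 scalar (context, graph and images are assumed to exist):

```c
#include <VX/vx.h>

/* Editor's sketch: blend two U8 images with alpha = 0.25. */
vx_node add_blend(vx_context ctx, vx_graph graph, vx_image img1, vx_image img2, vx_image out)
{
    vx_float32 a     = 0.25f;                                    /* 0.0 <= alpha <= 1.0 */
    vx_scalar  alpha = vxCreateScalar(ctx, VX_TYPE_FLOAT32, &a);
    vx_node    node  = vxWeightedAverageNode(graph, img1, alpha, img2, out);
    vxReleaseScalar(&alpha);
    return node;
}
```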
+ * \ingroup group_vision_function_minmaxloc + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxMinMaxLocNode(vx_graph graph, + vx_image input, + vx_scalar minVal, vx_scalar maxVal, + vx_array minLoc, vx_array maxLoc, + vx_scalar minCount, vx_scalar maxCount); + +/*! \brief [Graph] Creates a pixel-wise minimum kernel. + * \param [in] graph The reference to the graph where to create the node. + * \param [in] in1 The first input image. Must be of type \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [in] in2 The second input image. Must be of type \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [out] out The output image which will hold the result of min and will have the same type and dimensions as the input images. + * \ingroup group_vision_function_min + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxMinNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Graph] Creates a pixel-wise maximum kernel. + * \param [in] graph The reference to the graph where to create the node. + * \param [in] in1 The first input image. Must be of type \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [in] in2 The second input image. Must be of type \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [out] out The output image which will hold the result of max and will have the same type and dimensions as the input images. + * \ingroup group_vision_function_max + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxMaxNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Graph] Creates a bitwise AND node. + * \param [in] graph The reference to the graph. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 input image. + * \param [in] in2 A \ref VX_DF_IMAGE_U8 input image. + * \param [out] out The \ref VX_DF_IMAGE_U8 output image, which must have the same dimensions as the input images. + * \ingroup group_vision_function_and + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxAndNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Graph] Creates a bitwise INCLUSIVE OR node. + * \param [in] graph The reference to the graph. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 input image. + * \param [in] in2 A \ref VX_DF_IMAGE_U8 input image. + * \param [out] out The \ref VX_DF_IMAGE_U8 output image, which must have the same dimensions as the input images. + * \ingroup group_vision_function_or + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxOrNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Graph] Creates a bitwise EXCLUSIVE OR node. + * \param [in] graph The reference to the graph. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 input image. + * \param [in] in2 A \ref VX_DF_IMAGE_U8 input image.
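The min/max-location node above returns its optional outputs only when the caller wires them; a sketch (hypothetical helper, existing context/graph/image assumed, and passing NULL for the unused optional scalars) that requests locations but skips the counts:

```c
#include <VX/vx.h>

/* Editor's sketch: extrema of a U8 image; counts are optional and left out here. */
void add_minmaxloc(vx_context ctx, vx_graph graph, vx_image in)
{
    vx_uint8  init   = 0;
    vx_scalar minVal = vxCreateScalar(ctx, VX_TYPE_UINT8, &init);    /* scalar type matches the U8 input */
    vx_scalar maxVal = vxCreateScalar(ctx, VX_TYPE_UINT8, &init);
    vx_array  minLoc = vxCreateArray(ctx, VX_TYPE_COORDINATES2D, 16); /* capacity caps reported locations */
    vx_array  maxLoc = vxCreateArray(ctx, VX_TYPE_COORDINATES2D, 16);

    vxMinMaxLocNode(graph, in, minVal, maxVal, minLoc, maxLoc, NULL, NULL);
}
```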
+ * \param [out] out The \ref VX_DF_IMAGE_U8 output image, which must have the same dimensions as the input images. + * \ingroup group_vision_function_xor + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxXorNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Graph] Creates a bitwise NOT node. + * \param [in] graph The reference to the graph. + * \param [in] input A \ref VX_DF_IMAGE_U8 input image. + * \param [out] output The \ref VX_DF_IMAGE_U8 output image, which must have the same dimensions as the input image. + * \ingroup group_vision_function_not + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxNotNode(vx_graph graph, vx_image input, vx_image output); + +/*! \brief [Graph] Creates a scalar operation node. + * \param [in] graph The reference to the graph. + * \param [in] scalar_operation A \ref VX_TYPE_ENUM of the \ref vx_scalar_operation_e enumeration. + * \param [in] a First scalar operand. + * \param [in] b Second scalar operand. + * \param [out] output Result of the scalar operation. + * \ingroup group_control_flow + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxScalarOperationNode(vx_graph graph, vx_enum scalar_operation, vx_scalar a, vx_scalar b, vx_scalar output); + +/*! \brief [Graph] Selects one of two data objects depending on the value of a condition (boolean scalar), and copies its data into another data object. + * \details This node supports predicated execution flow within a graph. All the data objects passed to this kernel shall + * have the same object type and meta data. It is important to note that an implementation may optimize away the select and copy when virtual data + * objects are used.\n + * If there is a kernel node that contributes only into virtual data objects during the graph execution due to a certain data path being eliminated by the + * not-taken argument of the select node, then the OpenVX implementation guarantees that there will not be any side effects to graph execution and node state.\n + * If the path to a select node contains non-virtual objects, user nodes, or nodes with completion callbacks, then that path may not be "optimized out" + * because the callback must be executed and the non-virtual objects must be modified. + * \param [in] graph The reference to the graph. + * \param [in] condition \ref VX_TYPE_BOOL predicate variable. + * \param [in] true_value Data object for true. + * \param [in] false_value Data object for false. + * \param [out] output Output data object. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + * \ingroup group_control_flow + */ +VX_API_ENTRY vx_node VX_API_CALL vxSelectNode(vx_graph graph, vx_scalar condition, vx_reference true_value, vx_reference false_value, vx_reference output); + +/*! \brief [Graph] Creates a pixelwise-multiplication node. + * \param [in] graph The reference to the graph. + * \param [in] in1 An input image, \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16.
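A sketch of the select node documented above, used for a predicated copy between two images of identical meta-data (the helper name and the constant condition are illustrative only):

```c
#include <VX/vx.h>

/* Editor's sketch: predicated copy of one of two images into the output. */
vx_node add_select(vx_context ctx, vx_graph graph, vx_image if_true, vx_image if_false, vx_image out)
{
    vx_bool   flag = vx_true_e;
    vx_scalar cond = vxCreateScalar(ctx, VX_TYPE_BOOL, &flag);
    /* All three data objects must share the same object type and meta-data. */
    vx_node node = vxSelectNode(graph, cond,
                                (vx_reference)if_true,
                                (vx_reference)if_false,
                                (vx_reference)out);
    vxReleaseScalar(&cond);
    return node;
}
```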
+ * \param [in] in2 An input image, \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [in] scale A non-negative \ref VX_TYPE_FLOAT32 multiplied to each product before overflow handling. + * \param [in] overflow_policy A \ref VX_TYPE_ENUM of the \ref vx_convert_policy_e enumeration. + * \param [in] rounding_policy A \ref VX_TYPE_ENUM of the \ref vx_round_policy_e enumeration. + * \param [out] out The output image, a \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 image. Must have the same type and dimensions of the imput images. + * \ingroup group_vision_function_mult + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxMultiplyNode(vx_graph graph, + vx_image in1, vx_image in2, + vx_scalar scale, + vx_enum overflow_policy, + vx_enum rounding_policy, + vx_image out); + +/*! \brief [Graph] Creates an arithmetic addition node. + * \param [in] graph The reference to the graph. + * \param [in] in1 An input image, \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [in] in2 An input image, \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [in] policy A \ref VX_TYPE_ENUM of the \ref vx_convert_policy_e enumeration. + * \param [out] out The output image, a \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 image, which must have the same dimensions as the input images. + * \ingroup group_vision_function_add + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxAddNode(vx_graph graph, + vx_image in1, vx_image in2, + vx_enum policy, + vx_image out); + +/*! \brief [Graph] Creates an arithmetic subtraction node. + * \param [in] graph The reference to the graph. + * \param [in] in1 An input image, \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16, the minuend. + * \param [in] in2 An input image, \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16, the subtrahend. + * \param [in] policy A \ref VX_TYPE_ENUM of the \ref vx_convert_policy_e enumeration. + * \param [out] out The output image, a \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 image, which must have the same dimensions as the input images. + * \ingroup group_vision_function_sub + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxSubtractNode(vx_graph graph, + vx_image in1, vx_image in2, + vx_enum policy, + vx_image out); + +/*! \brief [Graph] Creates a bit-depth conversion node. + * \param [in] graph The reference to the graph. + * \param [in] input The input image. + * \param [out] output The output image with the same dimensions of the input image. + * \param [in] policy A \ref VX_TYPE_ENUM of the \ref vx_convert_policy_e enumeration. + * \param [in] shift A scalar containing a \ref VX_TYPE_INT32 of the shift value. + * \ingroup group_vision_function_convertdepth + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxConvertDepthNode(vx_graph graph, vx_image input, vx_image output, vx_enum policy, vx_scalar shift); + +/*! \brief [Graph] Creates a Canny Edge Detection Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. 
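A short sketch contrasting the overflow policies of the addition and subtraction nodes above (graph and S16 images assumed to exist):

```c
#include <VX/vx.h>

/* Editor's sketch: same input pair, two overflow policies. */
void add_arithmetic(vx_graph graph, vx_image a, vx_image b, vx_image sum, vx_image diff)
{
    vxAddNode(graph, a, b, VX_CONVERT_POLICY_SATURATE, sum);   /* clamp on overflow   */
    vxSubtractNode(graph, a, b, VX_CONVERT_POLICY_WRAP, diff); /* modular wrap-around */
}
```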
+ * \param [in] hyst The double threshold for hysteresis. The \ref VX_THRESHOLD_INPUT_FORMAT shall be either + * \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. The \ref VX_THRESHOLD_OUTPUT_FORMAT is ignored. + * \param [in] gradient_size The size of the Sobel filter window, must support at least 3, 5, and 7. + * \param [in] norm_type A flag indicating the norm used to compute the gradient, \ref VX_NORM_L1 or \ref VX_NORM_L2. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format with values either 0 or 255. + * \ingroup group_vision_function_canny + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxCannyEdgeDetectorNode(vx_graph graph, vx_image input, vx_threshold hyst, + vx_int32 gradient_size, vx_enum norm_type, + vx_image output); + +/*! \brief [Graph] Creates an Affine Warp Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] matrix The affine matrix. Must be 2x3 of type \ref VX_TYPE_FLOAT32. + * \param [in] type The interpolation type from \ref vx_interpolation_type_e. + * \ref VX_INTERPOLATION_AREA is not supported. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image and the same dimensions as the input image. + * \ingroup group_vision_function_warp_affine + * \note The border modes \ref VX_NODE_BORDER value \ref VX_BORDER_UNDEFINED and + * \ref VX_BORDER_CONSTANT are supported. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxWarpAffineNode(vx_graph graph, vx_image input, vx_matrix matrix, vx_enum type, vx_image output); + +/*! \brief [Graph] Creates a Perspective Warp Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] matrix The perspective matrix. Must be 3x3 of type \ref VX_TYPE_FLOAT32. + * \param [in] type The interpolation type from \ref vx_interpolation_type_e. + * \ref VX_INTERPOLATION_AREA is not supported. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image with the same dimensions as the input image. + * \ingroup group_vision_function_warp_perspective + * \note The border modes \ref VX_NODE_BORDER value \ref VX_BORDER_UNDEFINED and + * \ref VX_BORDER_CONSTANT are supported. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxWarpPerspectiveNode(vx_graph graph, vx_image input, vx_matrix matrix, vx_enum type, vx_image output); + +/*! \brief [Graph] Creates a Harris Corners Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] strength_thresh The \ref VX_TYPE_FLOAT32 minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel). + * \param [in] min_distance The \ref VX_TYPE_FLOAT32 radial Euclidean distance for non-maximum suppression. + * \param [in] sensitivity The \ref VX_TYPE_FLOAT32 scalar sensitivity threshold \f$ k \f$ from the Harris-Stephens equation. + * \param [in] gradient_size The gradient window size to use on the input. The + * implementation must support at least 3, 5, and 7. 
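An illustrative translation built with the affine-warp node above; the coefficient ordering below is an assumption and should be checked against the warp-affine section of the specification:

```c
#include <VX/vx.h>

/* Editor's sketch: translate an image by (tx, ty) with the 2x3 affine matrix. */
vx_node add_translate(vx_context ctx, vx_graph graph, vx_image in, vx_image out,
                      vx_float32 tx, vx_float32 ty)
{
    vx_float32 coeffs[3][2] = {
        {1.0f, 0.0f},   /* identity part of the transform          */
        {0.0f, 1.0f},
        {tx,   ty},     /* translation terms (layout assumed here)  */
    };
    vx_matrix m = vxCreateMatrix(ctx, VX_TYPE_FLOAT32, 2, 3);  /* 2 columns, 3 rows */
    vxCopyMatrix(m, coeffs, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST);
    vx_node node = vxWarpAffineNode(graph, in, m, VX_INTERPOLATION_NEAREST_NEIGHBOR, out);
    vxReleaseMatrix(&m);
    return node;
}
```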
+ * \param [in] block_size The block window size used to compute the Harris Corner score. + * The implementation must support at least 3, 5, and 7. + * \param [out] corners The array of \ref VX_TYPE_KEYPOINT objects. The order of the keypoints in this array is implementation dependent. + * \param [out] num_corners [optional] The total number of detected corners in image. Use a \ref VX_TYPE_SIZE scalar. + * \ingroup group_vision_function_harris + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxHarrisCornersNode(vx_graph graph, + vx_image input, + vx_scalar strength_thresh, + vx_scalar min_distance, + vx_scalar sensitivity, + vx_int32 gradient_size, + vx_int32 block_size, + vx_array corners, + vx_scalar num_corners); + +/*! \brief [Graph] Creates a FAST Corners Node. + * \param [in] graph The reference to the graph. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] strength_thresh Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle + * of radius 3 (\ref VX_TYPE_FLOAT32 scalar), with a value in the range of 0.0 \f$\le\f$ strength_thresh < 256.0. + * Any fractional value will be truncated to an integer. + * \param [in] nonmax_suppression If true, non-maximum suppression is applied to + * detected corners before being placed in the \ref vx_array of \ref VX_TYPE_KEYPOINT objects. + * \param [out] corners Output corner \ref vx_array of \ref VX_TYPE_KEYPOINT. The order of the + * keypoints in this array is implementation dependent. + * \param [out] num_corners [optional] The total number of detected corners in image. Use a \ref VX_TYPE_SIZE scalar. + * \ingroup group_vision_function_fast + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxFastCornersNode(vx_graph graph, vx_image input, vx_scalar strength_thresh, vx_bool nonmax_suppression, vx_array corners, vx_scalar num_corners); + +/*! \brief [Graph] Creates a Lucas Kanade Tracking Node. + * \param [in] graph The reference to the graph. + * \param [in] old_images Input of first (old) image pyramid in \ref VX_DF_IMAGE_U8. + * \param [in] new_images Input of destination (new) image pyramid \ref VX_DF_IMAGE_U8. + * \param [in] old_points An array of key points in a \ref vx_array of \ref VX_TYPE_KEYPOINT; those key points are defined at + * the \a old_images high resolution pyramid. + * \param [in] new_points_estimates An array of estimation on what is the output key points in a \ref vx_array of + * \ref VX_TYPE_KEYPOINT; those keypoints are defined at the \a new_images high resolution pyramid. + * \param [out] new_points An output array of key points in a \ref vx_array of \ref VX_TYPE_KEYPOINT; those key points are + * defined at the \a new_images high resolution pyramid. + * \param [in] termination The termination can be \ref VX_TERM_CRITERIA_ITERATIONS or \ref VX_TERM_CRITERIA_EPSILON or + * \ref VX_TERM_CRITERIA_BOTH. + * \param [in] epsilon The \ref vx_float32 error for terminating the algorithm. + * \param [in] num_iterations The number of iterations. Use a \ref VX_TYPE_UINT32 scalar. + * \param [in] use_initial_estimate Use a \ref VX_TYPE_BOOL scalar. + * \param [in] window_dimension The size of the window on which to perform the algorithm. 
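A sketch of the FAST-corners node documented above; the threshold value, array capacity and helper name are arbitrary choices for illustration:

```c
#include <VX/vx.h>

/* Editor's sketch: FAST corners with non-maximum suppression enabled. */
void add_fast_corners(vx_context ctx, vx_graph graph, vx_image in)
{
    vx_float32 t        = 30.0f;  /* 0.0 <= strength_thresh < 256.0 */
    vx_scalar  strength = vxCreateScalar(ctx, VX_TYPE_FLOAT32, &t);
    vx_array   corners  = vxCreateArray(ctx, VX_TYPE_KEYPOINT, 2048); /* capacity bounds the output */
    vx_size    n        = 0;
    vx_scalar  count    = vxCreateScalar(ctx, VX_TYPE_SIZE, &n);

    vxFastCornersNode(graph, in, strength, vx_true_e, corners, count);
}
```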
See + * \ref VX_CONTEXT_OPTICAL_FLOW_MAX_WINDOW_DIMENSION + * \ingroup group_vision_function_opticalflowpyrlk + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxOpticalFlowPyrLKNode(vx_graph graph, + vx_pyramid old_images, + vx_pyramid new_images, + vx_array old_points, + vx_array new_points_estimates, + vx_array new_points, + vx_enum termination, + vx_scalar epsilon, + vx_scalar num_iterations, + vx_scalar use_initial_estimate, + vx_size window_dimension); + +/*! \brief [Graph] Creates a Remap Node. + * \param [in] graph The reference to the graph that will contain the node. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] table The remap table object. + * \param [in] policy An interpolation type from \ref vx_interpolation_type_e. + * \ref VX_INTERPOLATION_AREA is not supported. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image with the same dimensions as the input image. + * \note The border modes \ref VX_NODE_BORDER value \ref VX_BORDER_UNDEFINED and + * \ref VX_BORDER_CONSTANT are supported. + * \return A \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + * \ingroup group_vision_function_remap + */ +VX_API_ENTRY vx_node VX_API_CALL vxRemapNode(vx_graph graph, + vx_image input, + vx_remap table, + vx_enum policy, + vx_image output); + +/*! \brief [Graph] Performs a Gaussian Blur on an image then half-scales it. The interpolation mode used is nearest-neighbor. + * \details The output image size is determined by: + * \f[ + * W_{output} = \frac{W_{input} + 1}{2} \\ + * , + * H_{output} = \frac{H_{input} + 1}{2} + * \f] + * \param [in] graph The reference to the graph. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image. + * \param [in] kernel_size The input size of the Gaussian filter. Supported values are 1, 3 and 5. + * \ingroup group_vision_function_scale_image + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxHalfScaleGaussianNode(vx_graph graph, vx_image input, vx_image output, vx_int32 kernel_size); + +VX_API_ENTRY vx_node VX_API_CALL vxCensus3x3Node(vx_graph graph, vx_image src, vx_image dst); + +/*! \brief [Graph] The Node Compares an image template against overlapped image regions. + * \details The detailed equation to the matching can be found in \ref vx_comp_metric_e. + * The output of the template matching node is a comparison map as described in \ref vx_comp_metric_e. + * The Node have a limitation on the template image size (width*height). It should not be larger then 65535. + * If the valid region of the template image is smaller than the entire template image, the result in the destination image is implementation-dependent. + * \param [in] graph The reference to the graph. + * \param [in] src The input image of type \ref VX_DF_IMAGE_U8. + * \param [in] templateImage Searched template of type \ref VX_DF_IMAGE_U8. + * \param [in] matchingMethod attribute specifying the comparison method \ref vx_comp_metric_e. This function support only \ref VX_COMPARE_CCORR_NORM and \ref VX_COMPARE_L2. + * \param [out] output Map of comparison results. 
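A sketch of the half-scale Gaussian node above, sizing the output with the (W+1)/2 x (H+1)/2 rule from its description (context, graph and a 640x480 U8 input assumed):

```c
#include <VX/vx.h>

/* Editor's sketch: blur and halve a 640x480 U8 image. */
void add_half_scale(vx_context ctx, vx_graph graph, vx_image in /* 640x480, U8 */)
{
    vx_image half = vxCreateImage(ctx, (640 + 1) / 2, (480 + 1) / 2, VX_DF_IMAGE_U8);
    vxHalfScaleGaussianNode(graph, in, half, 5);  /* 5x5 Gaussian before decimation */
}
```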
The output is an image of type VX_DF_IMAGE_S16 + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + * \ingroup group_vision_function_match_template + */ + VX_API_ENTRY vx_node VX_API_CALL vxMatchTemplateNode(vx_graph graph, vx_image src, vx_image templateImage, vx_enum matchingMethod, vx_image output); + + /*! \brief [Graph] Creates a node that extracts LBP image from an input image +* \param [in] graph The reference to the graph. +* \param [in] in An input image in vx_image. Or \f$ SrcImg\f$ in the equations. the image is of type \ref VX_DF_IMAGE_U8 +* \param [in] format A variation of LBP like original LBP and mLBP. see \ref vx_lbp_format_e +* \param [in] kernel_size Kernel size. Only size of 3 and 5 are supported +* \param [out] out An output image in vx_image.Or \f$ DstImg\f$ in the equations. the image is of type \ref VX_DF_IMAGE_U8 with the same dimensions as the input image. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus +* \ingroup group_vision_function_lbp +*/ +VX_API_ENTRY vx_node VX_API_CALL vxLBPNode(vx_graph graph, vx_image in, vx_enum format, vx_int8 kernel_size, vx_image out); + +/*! \brief [Graph] Performs cell calculations for the average gradient magnitude and gradient orientation histograms. + * \details Firstly, the gradient magnitude and gradient orientation are computed for each pixel in the input image. + * Two 1-D centred, point discrete derivative masks are applied to the input image in the horizontal and vertical directions. + * \f[ M_h = [-1, 0, 1] \f] and \f[ M_v = [-1, 0, 1]^T \f] + * \f$G_v\f$ is the result of applying mask \f$M_v\f$ to the input image, and \f$G_h\f$ is the result of applying mask \f$M_h\f$ to the input image. + * The border mode used for the gradient calculation is implementation dependent. Its behavior should be similar to \ref VX_BORDER_UNDEFINED. + * The gradient magnitudes and gradient orientations for each pixel are then calculated in the following manner. + * \f[ G(x,y) = \sqrt{G_v(x,y)^2 + G_h(x,y)^2} \f] + * \f[ \theta(x,y) = arctan(G_v(x,y), G_h(x,y)) \f] + * where \f$arctan(v, h)\f$ + * is \f$ tan^{-1}(v/h)\f$ when \f$h!=0\f$, + * + * \f$ -pi/2 \f$ if \f$v<0\f$ and \f$h==0\f$, + * + * \f$ pi/2 \f$ if \f$v>0\f$ and \f$h==0\f$ + * + * and \f$ 0 \f$ if \f$v==0\f$ and \f$h==0\f$ + * + * Secondly, the gradient magnitudes and orientations are used to compute the bins output tensor and optional magnitudes output tensor. + * These tensors are computed on a cell level where the cells are rectangular in shape. + * The magnitudes tensor contains the average gradient magnitude for each cell. + * \f[magnitudes(c) = \frac{1}{(cell\_width * cell\_height)}\sum\limits_{w=0}^{cell\_width} \sum\limits_{h=0}^{cell\_height} G_c(w,h)\f] + * where \f$G_c\f$ is the gradient magnitudes related to cell \f$c\f$. + * The bins tensor contains histograms of gradient orientations for each cell. + * The gradient orientations at each pixel range from 0 to 360 degrees. These are quantised into a set of histogram bins based on the num_bins parameter. + * Each pixel votes for a specific cell histogram bin based on its gradient orientation. The vote itself is the pixel's gradient magnitude. 
+ * \f[bins(c, n) = \sum\limits_{w=0}^{cell\_width} \sum\limits_{h=0}^{cell\_height} G_c(w,h) * 1[B_c(w, h, num\_bins) == n]\f] + * where \f$B_c\f$ produces the histogram bin number based on the gradient orientation of the pixel at location (\f$w\f$, \f$h\f$) in cell \f$c\f$ based on + * the \f$num\_bins\f$ and \f[1[B_c(w, h, num\_bins) == n]\f] is a delta-function with value 1 when \f$B_c(w, h, num\_bins) == n\f$ or 0 otherwise. + * \param [in] graph The reference to the graph. + * \param [in] input The input image of type \ref VX_DF_IMAGE_U8. + * \param [in] cell_width The histogram cell width of type \ref VX_TYPE_INT32. + * \param [in] cell_height The histogram cell height of type \ref VX_TYPE_INT32. + * \param [in] num_bins The histogram size of type \ref VX_TYPE_INT32. + * \param [out] magnitudes (Optional) The output average gradient magnitudes per cell of \ref vx_tensor of type \ref VX_TYPE_INT16 of size \f$ [floor(image_{width}/cell_{width}) ,floor(image_{height}/cell_{height}) ] \f$. + * \param [out] bins The output gradient orientation histograms per cell of \ref vx_tensor of type \ref VX_TYPE_INT16 of size \f$ [floor(image_{width}/cell_{width}) ,floor(image_{height}/cell_{height}), num_{bins}] \f$. + * \return \ref vx_node. + * \retval 0 Node could not be created. + * \retval * Node handle. + * \ingroup group_vision_function_hog + */ +VX_API_ENTRY vx_node VX_API_CALL vxHOGCellsNode(vx_graph graph, vx_image input, vx_int32 cell_width, vx_int32 cell_height, vx_int32 num_bins, vx_tensor magnitudes, vx_tensor bins); + +/*! \brief [Graph] The node produces HOG features for the W1xW2 window in a sliding window fashion over the whole input image. Each position produces a HOG feature vector. + * \details Firstly if a magnitudes tensor is provided the cell histograms in the bins tensor are normalised by the average cell gradient magnitudes. + \f[bins(c,n) = \frac{bins(c,n)}{magnitudes(c)}\f] + * To account for changes in illumination and contrast the cell histograms must be locally normalized which requires grouping the cell histograms together into larger spatially connected blocks. + * Blocks are rectangular grids represented by three parameters: the number of cells per block, the number of pixels per cell, and the number of bins per cell histogram. + * These blocks typically overlap, meaning that each cell histogram contributes more than once to the final descriptor. + * To normalize a block its cell histograms \f$h\f$ are grouped together to form a vector \f$v = [h_1, h_2, h_3, ... , h_n]\f$. + * This vector is normalised using L2-Hys which means performing L2-norm on this vector; clipping the result (by limiting the maximum values of v to be threshold) and renormalizing again. If the threshold is equal to zero then L2-Hys normalization is not performed. + * \f[L2norm(v) = \frac{v}{\sqrt{\|v\|_2^2 + \epsilon^2}}\f] + * where \f$ \|v\|_k \f$ be its k-norm for k=1, 2, and \f$ \epsilon \f$ be a small constant. + * For a specific window its HOG descriptor is then the concatenated vector of the components of the normalized cell histograms from all of the block regions contained in the window. + * The W1xW2 window starting position is at coordinates 0x0. + * If the input image has dimensions that are not an integer multiple of W1xW2 blocks with the specified stride, then the last positions that contain only a partial W1xW2 window + * will be calculated with the remaining part of the W1xW2 window padded with zeroes. 
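A sketch of the HOG-cells node declared above with 8x8 cells and 9 bins; the Q7.8 fixed-point position chosen for the INT16 output tensors is an assumption, not something this header mandates:

```c
#include <VX/vx.h>

/* Editor's sketch: 8x8 cells, 9 orientation bins, 640x480 U8 input.
 * Tensor sizes follow the floor(image/cell) rule above; the Q7.8
 * fixed-point position for the INT16 outputs is an assumption. */
void add_hog_cells(vx_context ctx, vx_graph graph, vx_image in)
{
    vx_size   mag_dims[2]  = {640 / 8, 480 / 8};
    vx_size   bins_dims[3] = {640 / 8, 480 / 8, 9};
    vx_tensor magnitudes   = vxCreateTensor(ctx, 2, mag_dims, VX_TYPE_INT16, 8);
    vx_tensor bins         = vxCreateTensor(ctx, 3, bins_dims, VX_TYPE_INT16, 8);

    vxHOGCellsNode(graph, in, 8, 8, 9, magnitudes, bins);
}
```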
+ * The Window W1xW2 must also have a size so that it contains an integer number of cells, otherwise the node is not well-defined. + * The final output tensor will contain HOG descriptors equal to the number of windows in the input image. + * The output features tensor has 3 dimensions, given by:\n + * \f[[ (floor((image_{width}-window_{width})/window_{stride}) + 1),\f] + * \f[ (floor((image_{height}-window_{height})/window_{stride}) + 1),\f] + * \f[ floor((window_{width} - block_{width})/block_{stride} + 1) * floor((window_{height} - block_{height})/block_{stride} + 1) *\f] + * \f[ (((block_{width} * block_{height}) / (cell_{width} * cell_{height})) * num_{bins})] \f] + * See \ref vxCreateTensor and \ref vxCreateVirtualTensor. + * We recommend the output tensors always be *virtual* objects, with this node connected directly to the classifier. + * The output tensor will be very large, and using non-virtual tensors will result in a poorly optimized implementation. + * Merging of this node with a classifier node such as that described in the classifier extension will result in better performance. + * Notice that this node creation function has more parameters than the corresponding kernel. Numbering of kernel parameters (required if you create this node using the generic interface) is explicitly specified here. + * \param [in] graph The reference to the graph. + * \param [in] input The input image of type \ref VX_DF_IMAGE_U8. (Kernel parameter #0) + * \param [in] magnitudes (Optional) The gradient magnitudes per cell of \ref vx_tensor of type \ref VX_TYPE_INT16. It is the output of \ref vxHOGCellsNode. (Kernel parameter #1) + * \param [in] bins The gradient orientation histograms per cell of \ref vx_tensor of type \ref VX_TYPE_INT16. It is the output of \ref vxHOGCellsNode. (Kernel parameter #2) + * \param [in] params The parameters of type \ref vx_hog_t. (Kernel parameter #3) + * \param [in] hog_param_size Size of \ref vx_hog_t in bytes. Note that this parameter is not counted as one of the kernel parameters. + * \param [out] features The output HOG features of \ref vx_tensor of type \ref VX_TYPE_INT16. (Kernel parameter #4) + * \return \ref vx_node. + * \retval 0 Node could not be created. + * \retval * Node handle. + * \ingroup group_vision_function_hog + */ +VX_API_ENTRY vx_node VX_API_CALL vxHOGFeaturesNode(vx_graph graph, vx_image input, vx_tensor magnitudes, vx_tensor bins, const vx_hog_t *params, vx_size hog_param_size, vx_tensor features); + +/*! \brief [Graph] Finds the Probabilistic Hough Lines detected in the input binary image, each line is stored in the output array as a set of points (x1, y1, x2, y2) . + * \details Some implementations of the algorithm may have a random or non-deterministic element. If the target application is in a safety-critical environment this + * should be borne in mind and steps taken in the implementation, the application or both to achieve the level of determinism required by the system design. + * \param [in] graph graph handle + * \param [in] input 8 bit, single channel binary source image + * \param [in] params parameters of the struct \ref vx_hough_lines_p_t + * \param [out] lines_array lines_array contains array of lines, see \ref vx_line2d_t The order of lines in implementation dependent + * \param [out] num_lines [optional] The total number of detected lines in image. Use a VX_TYPE_SIZE scalar + * \return \ref vx_node. + * \retval vx_node A node reference. 
+ Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + * \ingroup group_vision_function_hough_lines_p + */ +VX_API_ENTRY vx_node VX_API_CALL vxHoughLinesPNode(vx_graph graph, vx_image input, const vx_hough_lines_p_t *params, vx_array lines_array, vx_scalar num_lines); + +/*! \brief [Graph] The function applies bilateral filtering to the input tensor. +* \param [in] graph The reference to the graph. +* \param [in] src The input data, a \ref vx_tensor with a minimum of 2 and a maximum of 3 dimensions. The tensor is of type \ref VX_TYPE_UINT8 or \ref VX_TYPE_INT16. +* Dimensions are [radiometric, width, height] or [width, height]. See \ref vxCreateTensor and \ref vxCreateVirtualTensor. +* \param [in] diameter The diameter of each pixel neighbourhood that is used during filtering. Values of diameter must be odd, bigger than 3 and smaller than 10. +* \param [in] sigmaValues Filter sigma in the radiometric space. Supported values are bigger than 0 and smaller than or equal to 20. +* \param [in] sigmaSpace Filter sigma in the spatial space. Supported values are bigger than 0 and smaller than or equal to 20. +* \param [out] dst The output data, a \ref vx_tensor of type \ref VX_TYPE_UINT8 or \ref VX_TYPE_INT16, which must have the same type and size as the input. +* \note The border modes +* \ref VX_NODE_BORDER value +* \ref VX_BORDER_REPLICATE and \ref VX_BORDER_CONSTANT are supported. +* \return vx_node. +* \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using vxGetStatus +* \ingroup group_vision_function_bilateral_filter +*/ +VX_API_ENTRY vx_node VX_API_CALL vxBilateralFilterNode(vx_graph graph, vx_tensor src, vx_int32 diameter, vx_float32 sigmaSpace, vx_float32 sigmaValues, vx_tensor dst); + +/*! \brief [Graph] Performs element-wise multiplication on element values in the input tensor data with a scale. + * \param [in] graph The handle to the graph. + * \param [in] input1 Input tensor data. Implementations must support input tensor data type \ref VX_TYPE_INT16 with fixed_point_position 8, + * and tensor data types \ref VX_TYPE_UINT8 and \ref VX_TYPE_INT8, with fixed_point_position 0. + * \param [in] input2 Input tensor data. The dimensions and sizes of input2 match those of input1, unless the size of one or more dimensions in input2 is 1. + * In this case, those dimensions are treated as if this tensor was expanded to match the size of the corresponding dimension of input1, + * and data was duplicated on all terms in that dimension. After this expansion, the dimensions will be equal. + * The data type must match the data type of Input1. + * \param [in] scale A non-negative \ref VX_TYPE_FLOAT32 multiplied to each product before overflow handling. + * \param [in] overflow_policy A \ref vx_convert_policy_e enumeration. + * \param [in] rounding_policy A \ref vx_round_policy_e enumeration. + * \param [out] output The output tensor data with the same dimensions as the input tensor data. + * \ingroup group_vision_function_tensor_multiply + * \return \ref vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_node VX_API_CALL vxTensorMultiplyNode(vx_graph graph, vx_tensor input1, vx_tensor input2, vx_scalar scale, vx_enum overflow_policy, + vx_enum rounding_policy, vx_tensor output); + +/*! \brief [Graph] Performs arithmetic addition on element values in the input tensor data. + * \param [in] graph The handle to the graph.
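A sketch of the element-wise tensor multiply declared above, with the scale passed as a VX_TYPE_FLOAT32 scalar and explicit overflow/rounding policies (tensors are assumed to exist):

```c
#include <VX/vx.h>

/* Editor's sketch: element-wise tensor multiply with a 0.5 scale. */
void add_tensor_multiply(vx_context ctx, vx_graph graph, vx_tensor a, vx_tensor b, vx_tensor out)
{
    vx_float32 s     = 0.5f;
    vx_scalar  scale = vxCreateScalar(ctx, VX_TYPE_FLOAT32, &s);

    vxTensorMultiplyNode(graph, a, b, scale,
                         VX_CONVERT_POLICY_SATURATE,
                         VX_ROUND_POLICY_TO_NEAREST_EVEN,
                         out);
    vxReleaseScalar(&scale);
}
```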
+ * \param [in] input1 Input tensor data. Implementations must support input tensor data type \ref VX_TYPE_INT16 with fixed_point_position 8, + * and tensor data types \ref VX_TYPE_UINT8 and \ref VX_TYPE_INT8, with fixed_point_position 0. + * \param [in] input2 Input tensor data. The dimensions and sizes of input2 match those of input1, unless the vx_tensor of one or more dimensions in input2 is 1. + * In this case, those dimensions are treated as if this tensor was expanded to match the size of the corresponding dimension of input1, + * and data was duplicated on all terms in that dimension. After this expansion, the dimensions will be equal. + * The data type must match the data type of Input1. + * \param [in] policy A \ref vx_convert_policy_e enumeration. + * \param [out] output The output tensor data with the same dimensions as the input tensor data. + * \ingroup group_vision_function_tensor_add + * \return \ref vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_node VX_API_CALL vxTensorAddNode(vx_graph graph, vx_tensor input1, vx_tensor input2, vx_enum policy, vx_tensor output); + +/*! \brief [Graph] Performs arithmetic subtraction on element values in the input tensor data. + * \param [in] graph The handle to the graph. + * \param [in] input1 Input tensor data. Implementations must support input tensor data type \ref VX_TYPE_INT16 with fixed_point_position 8, + * and tensor data types \ref VX_TYPE_UINT8 and \ref VX_TYPE_INT8, with fixed_point_position 0. + * \param [in] input2 Input tensor data. The dimensions and sizes of input2 match those of input1, unless the vx_tensor of one or more dimensions in input2 is 1. + * In this case, those dimensions are treated as if this tensor was expanded to match the size of the corresponding dimension of input1, + * and data was duplicated on all terms in that dimension. After this expansion, the dimensions will be equal. + * The data type must match the data type of Input1. + * \param [in] policy A \ref vx_convert_policy_e enumeration. + * \param [out] output The output tensor data with the same dimensions as the input tensor data. + * \ingroup group_vision_function_tensor_subtract + * \return \ref vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_node VX_API_CALL vxTensorSubtractNode(vx_graph graph, vx_tensor input1, vx_tensor input2, vx_enum policy, vx_tensor output); + +/*! \brief [Graph] Performs LUT on element values in the input tensor data. + * \param [in] graph The handle to the graph. + * \param [in] input1 Input tensor data. Implementations must support input tensor data type \ref VX_TYPE_INT16 with fixed_point_position 8, + * and tensor data types \ref VX_TYPE_UINT8, with fixed_point_position 0. + * \param [in] lut The look-up table to use, of type \ref vx_lut. + * The elements of input1 are treated as unsigned integers to determine an index into the look-up table. + * The data type of the items in the look-up table must match that of the output tensor. + * \param [out] output The output tensor data with the same dimensions as the input tensor data. + * \ingroup group_vision_function_tensor_tablelookup + * \return \ref vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. 
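The tensor add and subtract nodes above take the same broadcast-capable inputs; a two-line sketch (tensors assumed to exist):

```c
#include <VX/vx.h>

/* Editor's sketch: b may carry size-1 dimensions that broadcast against a. */
void add_tensor_sum_diff(vx_graph graph, vx_tensor a, vx_tensor b, vx_tensor sum, vx_tensor diff)
{
    vxTensorAddNode(graph, a, b, VX_CONVERT_POLICY_SATURATE, sum);
    vxTensorSubtractNode(graph, a, b, VX_CONVERT_POLICY_SATURATE, diff);
}
```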
+ */ +VX_API_ENTRY vx_node VX_API_CALL vxTensorTableLookupNode(vx_graph graph, vx_tensor input1, vx_lut lut, vx_tensor output); + +/*! \brief [Graph] Performs transpose on the input tensor. + * The node transpose the tensor according to a specified 2 indexes in the tensor (0-based indexing) + * \param [in] graph The handle to the graph. + * \param [in] input Input tensor data, Implementations must support input tensor data type \ref VX_TYPE_INT16 with fixed_point_position 8, + * and tensor data types \ref VX_TYPE_UINT8 and \ref VX_TYPE_INT8, with fixed_point_position 0. + * \param [out] output output tensor data, + * \param [in] dimension1 Dimension index that is transposed with dim 2. + * \param [in] dimension2 Dimension index that is transposed with dim 1. + * \ingroup group_vision_function_tensor_transpose + * \return \ref vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + */ +VX_API_ENTRY vx_node VX_API_CALL vxTensorTransposeNode(vx_graph graph, vx_tensor input, vx_tensor output, vx_size dimension1, vx_size dimension2); +/*! \brief [Graph] Creates a bit-depth conversion node. + * \param [in] graph The reference to the graph. + * \param [in] input The input tensor. Implementations must support input tensor data type \ref VX_TYPE_INT16 with fixed_point_position 8, + * and tensor data types \ref VX_TYPE_UINT8 and \ref VX_TYPE_INT8, with fixed_point_position 0. + * \param [in] policy A \ref VX_TYPE_ENUM of the \ref vx_convert_policy_e enumeration. + * \param [in] norm A scalar containing a \ref VX_TYPE_FLOAT32 of the normalization value. + * \param [in] offset A scalar containing a \ref VX_TYPE_FLOAT32 of the offset value subtracted before normalization. + * \param [out] output The output tensor. Implementations must support input tensor data type \ref VX_TYPE_INT16. with fixed_point_position 8. + * And \ref VX_TYPE_UINT8 with fixed_point_position 0. + * \ingroup group_vision_function_tensor_convert_depth + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation should be checked using \ref vxGetStatus + */ +VX_API_ENTRY vx_node VX_API_CALL vxTensorConvertDepthNode(vx_graph graph, vx_tensor input, vx_enum policy, vx_scalar norm, vx_scalar offset, vx_tensor output); + +/*! \brief [Graph] Creates a generalized matrix multiplication node. + * \param [in] graph The reference to the graph. + * \param [in] input1 The first input 2D tensor of type \ref VX_TYPE_INT16 with fixed_point_pos 8, or tensor data types \ref VX_TYPE_UINT8 or \ref VX_TYPE_INT8, with fixed_point_pos 0. + * \param [in] input2 The second 2D tensor. Must be in the same data type as input1. + * \param [in] input3 The third 2D tensor. Must be in the same data type as input1. [optional]. + * \param [in] matrix_multiply_params Matrix multiply parameters, see \ref vx_tensor_matrix_multiply_params_t . + * \param [out] output The output 2D tensor. Must be in the same data type as input1. Output dimension must agree the formula in the description. + * \ingroup group_vision_function_tensor_matrix_multiply + * \return \ref vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. 
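A sketch of the tensor-transpose node declared above, exchanging the first two dimensions of a 64x32 Q7.8 tensor (the concrete shape is an arbitrary choice for illustration):

```c
#include <VX/vx.h>

/* Editor's sketch: transpose a 64x32 Q7.8 tensor into a 32x64 one. */
void add_tensor_transpose(vx_context ctx, vx_graph graph, vx_tensor in /* dims {64, 32} */)
{
    vx_size   out_dims[2] = {32, 64};
    vx_tensor out = vxCreateTensor(ctx, 2, out_dims, VX_TYPE_INT16, 8);
    vxTensorTransposeNode(graph, in, out, 0, 1);  /* dimensions 0 and 1 are exchanged */
}
```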
+ */ +VX_API_ENTRY vx_node VX_API_CALL vxTensorMatrixMultiplyNode(vx_graph graph, vx_tensor input1, vx_tensor input2, vx_tensor input3, + const vx_tensor_matrix_multiply_params_t *matrix_multiply_params, vx_tensor output); + +/*! \brief Copy data from one object to another. + * \note An implementation may optimize away the copy when virtual data objects are used. + * \param [in] graph The reference to the graph. + * \param [in] input The input data object. + * \param [out] output The output data object with meta-data identical to the input data object. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation + * should be checked using \ref vxGetStatus + * \ingroup group_vision_function_copy + */ +VX_API_ENTRY vx_node VX_API_CALL vxCopyNode(vx_graph graph, vx_reference input, vx_reference output); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h new file mode 100644 index 0000000..4c7fdea --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h @@ -0,0 +1,1915 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _OPENVX_TYPES_H_ +#define _OPENVX_TYPES_H_ + +/*! + * \file vx_types.h + * \brief The type definitions required by OpenVX Library. + */ + +#include +#include +#include + +/*! + * \internal + * \def VX_API_ENTRY + * \brief This is a tag used to identify exported, public API functions as + * distinct from internal functions, helpers, and other non-public interfaces. + * It can optionally be defined in the make system according the the compiler and intent. + * \ingroup group_basic_features + */ +#ifndef VX_API_ENTRY +#if defined(_WIN32) +#define VX_API_ENTRY __declspec(dllexport) +#else +#define VX_API_ENTRY __attribute__((visibility("default"))) +#endif +#endif +#ifndef VX_INTERNAL_ENTRY +#if defined(_WIN32) +#define VX_INTERNAL_ENTRY __declspec(dllexport) +#else +#define VX_INTERNAL_ENTRY __attribute__((visibility("default"))) +#endif +#endif +#ifndef VX_API_CALL +#if defined(_WIN32) +#define VX_API_CALL __stdcall +#else +#define VX_API_CALL +#endif +#endif +#ifndef VX_INTERNAL_CALL +#if defined(_WIN32) +#define VX_INTERNAL_CALL __stdcall +#else +#define VX_INTERNAL_CALL +#endif +#endif +#ifndef VX_CALLBACK +#if defined(_WIN32) +#define VX_CALLBACK __stdcall +#else +#define VX_CALLBACK +#endif + +#endif + +/*! \brief An 8 bit ASCII character. + * \ingroup group_basic_features + */ +typedef char vx_char; + +/*! \brief An 8-bit unsigned value. + * \ingroup group_basic_features + */ +typedef uint8_t vx_uint8; + +/*! \brief A 16-bit unsigned value. + * \ingroup group_basic_features + */ +typedef uint16_t vx_uint16; + +/*! \brief A 32-bit unsigned value. + * \ingroup group_basic_features + */ +typedef uint32_t vx_uint32; + +/*! \brief A 64-bit unsigned value. + * \ingroup group_basic_features + */ +typedef uint64_t vx_uint64; + +/*! 
\brief An 8-bit signed value. + * \ingroup group_basic_features + */ +typedef int8_t vx_int8; + +/*! \brief A 16-bit signed value. + * \ingroup group_basic_features + */ +typedef int16_t vx_int16; + +/*! \brief A 32-bit signed value. + * \ingroup group_basic_features + */ +typedef int32_t vx_int32; + +/*! \brief A 64-bit signed value. + * \ingroup group_basic_features + */ +typedef int64_t vx_int64; + +typedef uint32_t vx_bitfield; + +#if defined(EXPERIMENTAL_PLATFORM_SUPPORTS_16_FLOAT) + +/*! \brief A 16-bit float value. + * \ingroup group_basic_features + */ +typedef hfloat vx_float16; +#endif + +/*! \brief A 32-bit float value. + * \ingroup group_basic_features + */ +typedef float vx_float32; + +/*! \brief A 64-bit float value (aka double). + * \ingroup group_basic_features + */ +typedef double vx_float64; + +/*! \brief A generic opaque reference to any object within OpenVX. + * \details A user of OpenVX should not assume that this can be cast directly to anything; + * however, any object in OpenVX can be cast back to this for the purposes of + * querying attributes of the object or for passing the object as a parameter to + * functions that take a \ref vx_reference type. + * If the API does not take that specific type but may take others, an + * error may be returned from the API. + * \ingroup group_reference + */ +typedef struct _vx_reference *vx_reference; + +/*! \brief Sets the standard enumeration type size to be a fixed quantity. + * \details All enumerable fields must use this type as the container to + * enforce enumeration ranges and sizeof() operations. + * \ingroup group_basic_features + */ +typedef int32_t vx_enum; + +/*! \brief A wrapper of size_t to keep the naming convention uniform. + * \ingroup group_basic_features + */ +typedef size_t vx_size; + +/*! \brief Used to hold a VX_DF_IMAGE code to describe the pixel format and color space. + * \ingroup group_basic_features + */ +typedef uint32_t vx_df_image; + +/*! \brief Holds the address of a variable where the map/unmap functions return a map identifier. + * \ingroup group_image + */ +typedef uintptr_t vx_map_id; + +/*! \brief An opaque reference to a scalar. + * \details A scalar can be up to 64 bits wide. + * \see vxCreateScalar + * \ingroup group_scalar + * \extends vx_reference + */ +typedef struct _vx_scalar *vx_scalar; + +/*! \brief An opaque reference to an image. + * \see vxCreateImage + * \ingroup group_image + * \extends vx_reference + */ +typedef struct _vx_image *vx_image; + +/*! \brief An opaque reference to the descriptor of a kernel. + * \see vxGetKernelByName + * \see vxGetKernelByEnum + * \ingroup group_kernel + * \extends vx_reference + */ +typedef struct _vx_kernel *vx_kernel; + +/*! \brief An opaque reference to a single parameter. + * \see vxGetParameterByIndex + * \ingroup group_parameter + * \extends vx_reference + */ +typedef struct _vx_parameter *vx_parameter; + +/*! \brief An opaque reference to a kernel node. + * \see vxCreateGenericNode + * \ingroup group_node + * \extends vx_reference + */ +typedef struct _vx_node *vx_node; + +/*! \brief An opaque reference to a graph + * \see vxCreateGraph + * \ingroup group_graph + * \extends vx_reference + */ +typedef struct _vx_graph *vx_graph; + +/*! \brief An opaque reference to the implementation context. + * \see vxCreateContext + * \ingroup group_context + * \extends vx_reference + */ +typedef struct _vx_context *vx_context; + +/*! \brief The delay object. This is like a ring buffer of objects that is + * maintained by the OpenVX implementation. 
+ * \see vxCreateDelay + * \extends vx_reference + * \ingroup group_delay + */ +typedef struct _vx_delay *vx_delay; + +/*! \brief The Look-Up Table (LUT) Object. + * \extends vx_reference + * \ingroup group_lut + */ +typedef struct _vx_lut *vx_lut; + +/*! \brief The Distribution object. This has a user-defined number of bins over + * a user-defined range (within a uint32_t range). + * \extends vx_reference + * \ingroup group_distribution + */ +typedef struct _vx_distribution *vx_distribution; + +/*! \brief The Matrix Object. An MxN matrix of some unit type. + * \extends vx_reference + * \ingroup group_matrix + */ +typedef struct _vx_matrix *vx_matrix; + +/*! \brief The Image Pyramid object. A set of scaled images. + * \extends vx_reference + * \ingroup group_pyramid + */ +typedef struct _vx_pyramid *vx_pyramid; + +/*! \brief The Threshold Object. A thresholding object contains the types and + * limit values of the thresholding required. + * \extends vx_reference + * \ingroup group_threshold + */ +typedef struct _vx_threshold *vx_threshold; + +/*! \brief The Convolution Object. A user-defined convolution kernel of MxM elements. + * \extends vx_reference + * \ingroup group_convolution + */ +typedef struct _vx_convolution *vx_convolution; + +/*! \brief The remap table Object. A remap table contains per-pixel mapping of + * output pixels to input pixels. + * \ingroup group_remap + */ +typedef struct _vx_remap *vx_remap; + +/*! \brief The Array Object. Array is a strongly-typed container for other data structures. + * \ingroup group_array + */ +typedef struct _vx_array *vx_array; + +/*! \brief The ObjectArray Object. ObjectArray is a strongly-typed container of OpenVX data-objects. + * \ingroup group_object_array + */ +typedef struct _vx_object_array *vx_object_array; + + /*! \brief The multidimensional data object (Tensor). + * \see vxCreateTensor + * \ingroup group_object_tensor + * \extends vx_reference + */ +typedef struct _vx_tensor_t * vx_tensor; + +/*! \brief The weight bias parameter for fused layers + * \ingroup group_cnn + */ +typedef struct _vx_weights_biases_parameter_s * vx_weights_biases_parameter; + + +/*! \brief A Boolean value. + * This allows 0 to be FALSE, as it is in C, and any non-zero to be TRUE. + * \code + * vx_bool ret = vx_true_e; + * if (ret) printf("true!\n"); + * ret = vx_false_e; + * if (!ret) printf("false!\n"); + * \endcode + * This would print both strings. + * \see vx_bool + * \ingroup group_basic_features + */ +typedef enum _vx_bool_e { + /*! \brief The "false" value. */ + vx_false_e = 0, + /*! \brief The "true" value. */ + vx_true_e, +} vx_bool_e; + +/*! \brief A formal boolean type with known fixed size. + * \see vx_bool_e + * \ingroup group_basic_features + */ +typedef vx_enum vx_bool; + +/*! + * \brief This object is used by output validation functions to specify the meta data + * of the expected output data object. + * \note When the actual output object of the user node is virtual, the information + * given through the vx_meta_format object allows the OpenVX framework to automatically + * create the data object when meta data were not specified by the application at object + * creation time. + * \ingroup group_user_kernels + */ +typedef struct _vx_meta_format* vx_meta_format; + +/*! \brief The type enumeration lists all the known types in OpenVX. + * \ingroup group_basic_features + */ +enum vx_type_e { + VX_TYPE_INVALID = 0x000,/*!< \brief An invalid type value. When passed an error must be returned. */ + VX_TYPE_CHAR = 0x001,/*!< \brief A \ref vx_char. 
*/ + VX_TYPE_INT8 = 0x002,/*!< \brief A \ref vx_int8. */ + VX_TYPE_UINT8 = 0x003,/*!< \brief A \ref vx_uint8. */ + VX_TYPE_INT16 = 0x004,/*!< \brief A \ref vx_int16. */ + VX_TYPE_UINT16 = 0x005,/*!< \brief A \ref vx_uint16. */ + VX_TYPE_INT32 = 0x006,/*!< \brief A \ref vx_int32. */ + VX_TYPE_UINT32 = 0x007,/*!< \brief A \ref vx_uint32. */ + VX_TYPE_INT64 = 0x008,/*!< \brief A \ref vx_int64. */ + VX_TYPE_UINT64 = 0x009,/*!< \brief A \ref vx_uint64. */ + VX_TYPE_FLOAT32 = 0x00A,/*!< \brief A \ref vx_float32. */ + VX_TYPE_FLOAT64 = 0x00B,/*!< \brief A \ref vx_float64. */ + VX_TYPE_ENUM = 0x00C,/*!< \brief A \ref vx_enum. Equivalent in size to a \ref vx_int32. */ + VX_TYPE_SIZE = 0x00D,/*!< \brief A \ref vx_size. */ + VX_TYPE_DF_IMAGE = 0x00E,/*!< \brief A \ref vx_df_image. */ + VX_TYPE_FLOAT16 = 0x00F,/*!< \brief A \ref vx_float16. */ + VX_TYPE_BOOL = 0x010,/*!< \brief A \ref vx_bool. */ + VX_TYPE_BOOL8 = 0x011,/*!< \brief A \ref vx_bool8. */ + + VX_TYPE_RECTANGLE = 0x020,/*!< \brief A \ref vx_rectangle_t. */ + VX_TYPE_KEYPOINT = 0x021,/*!< \brief A \ref vx_keypoint_t. */ + VX_TYPE_COORDINATES2D = 0x022,/*!< \brief A \ref vx_coordinates2d_t. */ + VX_TYPE_COORDINATES3D = 0x023,/*!< \brief A \ref vx_coordinates3d_t. */ + VX_TYPE_COORDINATES2DF = 0x024,/*!< \brief A \ref vx_coordinates2df_t. */ + + /* Reserve enums that are defined in khronos extensions + NN extensions: + VX_TYPE_NN_CONVOLUTION_PARAMS = 0x025, + VX_TYPE_NN_DECONVOLUTION_PARAMS = 0x026, + VX_TYPE_NN_ROI_POOL_PARAMS = 0x027, + Classifier extension: + VX_TYPE_CLASSIFER_MODEL = 0x02C, + */ + VX_TYPE_HOG_PARAMS = 0x028, /*!< \brief A \ref vx_hog_t. */ + VX_TYPE_HOUGH_LINES_PARAMS = 0x029, /*!< \brief A \ref vx_hough_lines_p_t. */ + VX_TYPE_LINE_2D = 0x02A, /*!< \brief A \ref vx_line2d_t. */ + VX_TYPE_TENSOR_MATRIX_MULTIPLY_PARAMS = 0x02B, /*!< \brief A \ref vx_tensor_matrix_multiply_params_t. */ + + + VX_TYPE_USER_STRUCT_START = 0x100,/*!< \brief A user-defined struct base index.*/ + VX_TYPE_VENDOR_STRUCT_START = 0x400,/*!< \brief A vendor-defined struct base index.*/ + VX_TYPE_KHRONOS_OBJECT_START = 0x800,/*!< \brief A Khronos defined object base index. */ + VX_TYPE_VENDOR_OBJECT_START = 0xC00,/*!< \brief A vendor defined object base index. */ + + VX_TYPE_WEIGHTS_BIASES_PARAMETER = VX_TYPE_VENDOR_OBJECT_START, + VX_TYPE_WEIGHTS_BIASES_PARAMETER_BASE = VX_TYPE_VENDOR_OBJECT_START+1, + + VX_TYPE_KHRONOS_STRUCT_MAX = VX_TYPE_USER_STRUCT_START - 1,/*!< \brief A value for comparison between Khronos defined structs and user structs. */ + + VX_TYPE_USER_STRUCT_END = VX_TYPE_VENDOR_STRUCT_START - 1,/*!< \brief A value for comparison between user structs and vendor structs. */ + VX_TYPE_VENDOR_STRUCT_END = VX_TYPE_KHRONOS_OBJECT_START - 1,/*!< \brief A value for comparison between vendor structs and Khronos defined objects. */ + VX_TYPE_KHRONOS_OBJECT_END = VX_TYPE_VENDOR_OBJECT_START - 1,/*!< \brief A value for comparison between Khronos defined objects and vendor structs. */ + VX_TYPE_VENDOR_OBJECT_END = 0xFFF,/*!< \brief A value used for bound checking of vendor objects */ + + + VX_TYPE_REFERENCE = 0x800,/*!< \brief A \ref vx_reference. */ + VX_TYPE_CONTEXT = 0x801,/*!< \brief A \ref vx_context. */ + VX_TYPE_GRAPH = 0x802,/*!< \brief A \ref vx_graph. */ + VX_TYPE_NODE = 0x803,/*!< \brief A \ref vx_node. */ + VX_TYPE_KERNEL = 0x804,/*!< \brief A \ref vx_kernel. */ + VX_TYPE_PARAMETER = 0x805,/*!< \brief A \ref vx_parameter. */ + VX_TYPE_DELAY = 0x806,/*!< \brief A \ref vx_delay. */ + VX_TYPE_LUT = 0x807,/*!< \brief A \ref vx_lut. 
*/ + VX_TYPE_DISTRIBUTION = 0x808,/*!< \brief A \ref vx_distribution. */ + VX_TYPE_PYRAMID = 0x809,/*!< \brief A \ref vx_pyramid. */ + VX_TYPE_THRESHOLD = 0x80A,/*!< \brief A \ref vx_threshold. */ + VX_TYPE_MATRIX = 0x80B,/*!< \brief A \ref vx_matrix. */ + VX_TYPE_CONVOLUTION = 0x80C,/*!< \brief A \ref vx_convolution. */ + VX_TYPE_SCALAR = 0x80D,/*!< \brief A \ref vx_scalar. when needed to be completely generic for kernel validation. */ + VX_TYPE_ARRAY = 0x80E,/*!< \brief A \ref vx_array. */ + VX_TYPE_IMAGE = 0x80F,/*!< \brief A \ref vx_image. */ + VX_TYPE_REMAP = 0x810,/*!< \brief A \ref vx_remap. */ + VX_TYPE_ERROR = 0x811,/*!< \brief An error object which has no type. */ + VX_TYPE_META_FORMAT = 0x812,/*!< \brief A \ref vx_meta_format. */ + VX_TYPE_OBJECT_ARRAY = 0x813,/*!< \brief A \ref vx_object_array. */ + /* Reserved for IX and XML extensions */ + /* VX_TYPE_IMPORT = 0x814, !< \brief A \ref vx_import. */ + VX_TYPE_TENSOR = 0x815,/*!< \brief A \ref vx_tensor. */ + /* Reserved for VX_TYPE_TARGET extensions*/ + VX_TYPE_TARGET = 0x816,/*!< \brief A \ref vx_target */ + VX_TYPE_TENSOR_VIEW = 0x817,/*!< \brief A \ref vx_tensor_view. */ + VX_TYPE_TENSOR_ADDRESS = 0x818,/*!< \brief A \ref vx_tensor_addressing. */ + VX_TYPE_TENSOR_MEM = 0x819,/*!< \brief A \ref vx_tensor_alloc_info. */ + + /* \todo add new object types here */ + VX_TYPE_BFLOAT16 = 0x81A,/*!< \brief A \ref vx_bfloat16. */ + +}; + +/*! \brief The enumeration of all status codes. + * \see vx_status. + * \ingroup group_basic_features + */ +enum vx_status_e { + VX_STATUS_MIN = -25,/*!< \brief Indicates the lower bound of status codes in VX. Used for bounds checks only. */ + /* add new codes here */ + VX_ERROR_REFERENCE_NONZERO = -24,/*!< \brief Indicates that an operation did not complete due to a reference count being non-zero. */ + VX_ERROR_MULTIPLE_WRITERS = -23,/*!< \brief Indicates that the graph has more than one node outputting to the same data object. This is an invalid graph structure. */ + VX_ERROR_GRAPH_ABANDONED = -22,/*!< \brief Indicates that the graph is stopped due to an error or a callback that abandoned execution. */ + VX_ERROR_GRAPH_SCHEDULED = -21,/*!< \brief Indicates that the supplied graph already has been scheduled and may be currently executing. */ + VX_ERROR_INVALID_SCOPE = -20,/*!< \brief Indicates that the supplied parameter is from another scope and cannot be used in the current scope. */ + VX_ERROR_INVALID_NODE = -19,/*!< \brief Indicates that the supplied node could not be created.*/ + VX_ERROR_INVALID_GRAPH = -18,/*!< \brief Indicates that the supplied graph has invalid connections (cycles). */ + VX_ERROR_INVALID_TYPE = -17,/*!< \brief Indicates that the supplied type parameter is incorrect. */ + VX_ERROR_INVALID_VALUE = -16,/*!< \brief Indicates that the supplied parameter has an incorrect value. */ + VX_ERROR_INVALID_DIMENSION = -15,/*!< \brief Indicates that the supplied parameter is too big or too small in dimension. */ + VX_ERROR_INVALID_FORMAT = -14,/*!< \brief Indicates that the supplied parameter is in an invalid format. */ + VX_ERROR_INVALID_LINK = -13,/*!< \brief Indicates that the link is not possible as specified. The parameters are incompatible. */ + VX_ERROR_INVALID_REFERENCE = -12,/*!< \brief Indicates that the reference provided is not valid. */ + VX_ERROR_INVALID_MODULE = -11,/*!< \brief This is returned from \ref vxLoadKernels when the module does not contain the entry point. 
*/
+    VX_ERROR_INVALID_PARAMETERS = -10,/*!< \brief Indicates that the supplied parameter information does not match the kernel contract. */
+    VX_ERROR_OPTIMIZED_AWAY = -9,/*!< \brief Indicates that the object referred to has been optimized out of existence. */
+    VX_ERROR_NO_MEMORY = -8,/*!< \brief Indicates that an internal or implicit allocation failed. Typically catastrophic. After detection, deconstruct the context. \see vxVerifyGraph. */
+    VX_ERROR_NO_RESOURCES = -7,/*!< \brief Indicates that an internal or implicit resource cannot be acquired (not memory). This is typically catastrophic. After detection, deconstruct the context. \see vxVerifyGraph. */
+    VX_ERROR_NOT_COMPATIBLE = -6,/*!< \brief Indicates that the attempt to link two parameters together failed due to type incompatibility. */
+    VX_ERROR_NOT_ALLOCATED = -5,/*!< \brief Indicates to the system that the parameter must be allocated by the system. */
+    VX_ERROR_NOT_SUFFICIENT = -4,/*!< \brief Indicates that the given graph has failed verification due to an insufficient number of required parameters, which cannot be automatically created. Typically this indicates required atomic parameters. \see vxVerifyGraph. */
+    VX_ERROR_NOT_SUPPORTED = -3,/*!< \brief Indicates that the requested set of parameters produce a configuration that cannot be supported. Refer to the supplied documentation on the configured kernels. \see vx_kernel_e. This is also returned if a function to set an attribute is called on a Read-only attribute.*/
+    VX_ERROR_NOT_IMPLEMENTED = -2,/*!< \brief Indicates that the requested kernel is missing. \see vx_kernel_e vxGetKernelByName. */
+    VX_FAILURE = -1,/*!< \brief Indicates a generic error code, used when no other status code describes the error. */
+    VX_SUCCESS = 0,/*!< \brief No error. */
+};
+
+/*! \brief A formal status type with known fixed size.
+ * \see vx_status_e
+ * \ingroup group_basic_features
+ */
+typedef vx_enum vx_status;
+
+/*! \brief The formal typedef of the response from the callback.
+ * \see vx_action_e
+ * \ingroup group_node_callback
+ */
+typedef vx_enum vx_action;
+
+/*! \brief A callback to the client after a particular node has completed.
+ * \see vx_action
+ * \see vxAssignNodeCallback
+ * \param [in] node The node to which the callback was attached.
+ * \return An action code from \ref vx_action_e.
+ * \ingroup group_node_callback
+ */
+typedef vx_action (VX_CALLBACK *vx_nodecomplete_f)(vx_node node);
+
+/*! \brief Vendor IDs are 3 nibbles in size and are located in the upper 12 bits
+ * of the 4 bytes of an enumeration.
+ * \ingroup group_basic_features
+ */
+#define VX_VENDOR_MASK (0xFFF00000)
+
+/*! \brief A type mask removes the scalar/object type from the attribute.
+ * It is 3 nibbles in size and is contained between the third and second byte.
+ * \see vx_type_e
+ * \ingroup group_basic_features
+ */
+#define VX_TYPE_MASK (0x000FFF00)
+
+/*! \brief A library is a set of vision kernels with its own ID supplied by a vendor.
+ * The vendor defines the library ID. The range is \f$ [0,2^{8}-1] \f$ inclusive.
+ * \ingroup group_basic_features
+ */
+#define VX_LIBRARY_MASK (0x000FF000)
+
+/*! \brief An individual kernel in a library has its own unique ID within \f$ [0,2^{12}-1] \f$ (inclusive).
+ * \ingroup group_basic_features
+ */
+#define VX_KERNEL_MASK (0x00000FFF)
+
+/*! \brief An object's attribute ID is within the range of \f$ [0,2^{8}-1] \f$ (inclusive).
+ * \ingroup group_basic_features
+ */
+#define VX_ATTRIBUTE_ID_MASK (0x000000FF)
+
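+/*
+ * Informative example (not normative): the masks above partition a 32-bit
+ * enumerated value into vendor, enumeration-type and identifier fields. All
+ * names below come from this header; the extraction helpers are defined
+ * further down in the file.
+ *
+ *     vx_enum e = VX_ACTION_CONTINUE;                 // == VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_ACTION) + 0x0
+ *     vx_uint32 vendor = VX_VENDOR(e);                // VX_ID_KHRONOS, taken from the upper 12 bits
+ *     vx_uint32 etype  = VX_ENUM_TYPE(e);             // VX_ENUM_ACTION
+ *     vx_uint32 id     = (vx_uint32)e & VX_ENUM_MASK; // 0x0
+ */
+
+/*! \brief A type of enumeration.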
The valid range is between \f$ [0,2^{8}-1] \f$ (inclusive). + * \ingroup group_basic_features + */ +#define VX_ENUM_TYPE_MASK (0x000FF000) + +/*! \brief A generic enumeration list can have values between \f$ [0,2^{12}-1] \f$ (inclusive). + * \ingroup group_basic_features + */ +#define VX_ENUM_MASK (0x00000FFF) + +/*! \brief A macro to extract the vendor ID from the enumerated value. + * \ingroup group_basic_features + */ +#define VX_VENDOR(e) (((vx_uint32)e & VX_VENDOR_MASK) >> 20) + +/*! \brief A macro to extract the type from an enumerated attribute value. + * \ingroup group_basic_features + */ +#define VX_TYPE(e) (((vx_uint32)e & VX_TYPE_MASK) >> 8) + +/*! \brief A macro to extract the enum type from an enumerated value. + * \ingroup group_basic_features + */ +#define VX_ENUM_TYPE(e) (((vx_uint32)e & VX_ENUM_TYPE_MASK) >> 12) + +/*! \brief A macro to extract the kernel library enumeration from a enumerated kernel value. + * \ingroup group_basic_features + */ +#define VX_LIBRARY(e) (((vx_uint32)e & VX_LIBRARY_MASK) >> 12) + +#if defined(_LITTLE_ENDIAN_) || (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(_WIN32) +#define VX_DF_IMAGE(a,b,c,d) ((a) | (b << 8) | (c << 16) | (d << 24)) +#define VX_ATTRIBUTE_BASE(vendor, object) (((vendor) << 20) | (object << 8)) +#define VX_KERNEL_BASE(vendor, lib) (((vendor) << 20) | (lib << 12)) +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) +#elif defined(_BIG_ENDIAN_) || (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define VX_DF_IMAGE(a,b,c,d) ((d) | (c << 8) | (b << 16) | (a << 24)) +#define VX_ATTRIBUTE_BASE(vendor, object) ((vendor) | (object << 12)) +#define VX_KERNEL_BASE(vendor, lib) ((vendor) | (lib << 12)) +#define VX_ENUM_BASE(vendor, id) ((vendor) | (id << 12)) +#else +#error "Endian-ness must be defined!" +#endif + +/*! \def VX_DF_IMAGE + * \brief Converts a set of four chars into a \c uint32_t container of a VX_DF_IMAGE code. + * \note Use a \ref vx_df_image variable to hold the value. + * \ingroup group_basic_features + */ +#define VX_DF_IMAGE(a,b,c,d) ((a) | (b << 8) | (c << 16) | (d << 24)) + +/*! \def VX_ATTRIBUTE_BASE + * \brief Defines the manner in which to combine the Vendor and Object IDs to get + * the base value of the enumeration. + * \ingroup group_basic_features + */ +#define VX_ATTRIBUTE_BASE(vendor, object) (((vendor) << 20) | (object << 8)) + +/*! \def VX_KERNEL_BASE + * \brief Defines the manner in which to combine the Vendor and Library IDs to get + * the base value of the enumeration. + * \ingroup group_basic_features + */ +#define VX_KERNEL_BASE(vendor, lib) (((vendor) << 20) | (lib << 12)) + +/*! \def VX_ENUM_BASE + * \brief Defines the manner in which to combine the Vendor and Object IDs to get + * the base value of the enumeration. + * \details From any enumerated value (with exceptions), the vendor, and enumeration + * type should be extractable. Those types that are exceptions are + * \ref vx_vendor_id_e, \ref vx_type_e, \ref vx_enum_e, \ref vx_df_image_e, and \c vx_bool. + * \ingroup group_basic_features + */ +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +/*! \brief The set of supported enumerations in OpenVX. + * \details These can be extracted from enumerated values using \ref VX_ENUM_TYPE. + * \ingroup group_basic_features + */ +enum vx_enum_e { + VX_ENUM_DIRECTION = 0x00, /*!< \brief Parameter Direction. */ + VX_ENUM_ACTION = 0x01, /*!< \brief Action Codes. */ + VX_ENUM_HINT = 0x02, /*!< \brief Hint Values. */ + VX_ENUM_DIRECTIVE = 0x03, /*!< \brief Directive Values. 
*/ + VX_ENUM_INTERPOLATION = 0x04, /*!< \brief Interpolation Types. */ + VX_ENUM_OVERFLOW = 0x05, /*!< \brief Overflow Policies. */ + VX_ENUM_COLOR_SPACE = 0x06, /*!< \brief Color Space. */ + VX_ENUM_COLOR_RANGE = 0x07, /*!< \brief Color Space Range. */ + VX_ENUM_PARAMETER_STATE = 0x08, /*!< \brief Parameter State. */ + VX_ENUM_CHANNEL = 0x09, /*!< \brief Channel Name. */ + VX_ENUM_CONVERT_POLICY = 0x0A, /*!< \brief Convert Policy. */ + VX_ENUM_THRESHOLD_TYPE = 0x0B, /*!< \brief Threshold Type List. */ + VX_ENUM_BORDER = 0x0C, /*!< \brief Border Mode List. */ + VX_ENUM_COMPARISON = 0x0D, /*!< \brief Comparison Values. */ + VX_ENUM_MEMORY_TYPE = 0x0E, /*!< \brief The memory type enumeration. */ + VX_ENUM_TERM_CRITERIA = 0x0F, /*!< \brief A termination criteria. */ + VX_ENUM_NORM_TYPE = 0x10, /*!< \brief A norm type. */ + VX_ENUM_ACCESSOR = 0x11, /*!< \brief An accessor flag type. */ + VX_ENUM_ROUND_POLICY = 0x12, /*!< \brief Rounding Policy. */ + VX_ENUM_TARGET = 0x13, /*!< \brief Target. */ + VX_ENUM_BORDER_POLICY = 0x14, /*!< \brief Unsupported Border Mode Policy List. */ + VX_ENUM_GRAPH_STATE = 0x15, /*!< \brief Graph attribute states. */ + VX_ENUM_NONLINEAR = 0x16, /*!< \brief Non-linear function list. */ + VX_ENUM_PATTERN = 0x17, /*!< \brief Matrix pattern enumeration. */ + VX_ENUM_LBP_FORMAT = 0x18, /*!< \brief Lbp format. */ + VX_ENUM_COMP_METRIC = 0x19, /*!< \brief Compare metric. */ + +/* NN extension + VX_ENUM_NN_ROUNDING_TYPE = 0x1A, + VX_ENUM_NN_POOLING_TYPE = 0x1B, + VX_ENUM_NN_NORMALIZATION_TYPE = 0x1C, + VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE = 0x1D, +*/ + +/* Classifier extension + VX_ENUM_CLASSIFIER_MODEL= 0x1E, +*/ +/* IX extension + VX_ENUM_IX_USE = 0x1F, !< \brief How to use references in import and export. */ + VX_ENUM_SCALAR_OPERATION= 0X20 /*!< \brief Scalar operation list. */ + }; + +/*! \brief A return code enumeration from a \ref vx_nodecomplete_f during execution. + * \see vxAssignNodeCallback + * \ingroup group_node_callback + */ +enum vx_action_e { + /*! \brief Continue executing the graph with no changes. */ + VX_ACTION_CONTINUE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_ACTION) + 0x0, + /*! \brief Stop executing the graph. */ + VX_ACTION_ABANDON = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_ACTION) + 0x1, +}; + +/*! \brief An indication of how a kernel will treat the given parameter. + * \ingroup group_parameter + */ +enum vx_direction_e { + /*! \brief The parameter is an input only. */ + VX_INPUT = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_DIRECTION) + 0x0, + /*! \brief The parameter is an output only. */ + VX_OUTPUT = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_DIRECTION) + 0x1, +}; + +/*! \brief These enumerations are given to the \ref vxHint API to enable/disable platform + * optimizations and/or features. Hints are optional and usually are vendor-specific. + * \see \ref vxHint + * \ingroup group_hint + */ +enum vx_hint_e { + /*! \brief Indicates to the implementation that user do not apply any specific + * requirements for performance. + */ + VX_HINT_PERFORMANCE_DEFAULT = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_HINT) + 0x1, + /*! \brief Indicates the user preference is low power consumption versus + * highest performance. + */ + VX_HINT_PERFORMANCE_LOW_POWER = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_HINT) + 0x2, + /*! \brief Indicates the user preference for highest performance over + * low power consumption. + */ + VX_HINT_PERFORMANCE_HIGH_SPEED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_HINT) + 0x3, +}; + +/*! 
\brief These enumerations are given to the \c vxDirective API to enable/disable + * platform optimizations and/or features. Directives are not optional and + * usually are vendor-specific, by defining a vendor range of directives and + * starting their enumeration from there. + * \see vxDirective + * \ingroup group_directive + */ +enum vx_directive_e { + /*! \brief Disables recording information for graph debugging. */ + VX_DIRECTIVE_DISABLE_LOGGING = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_DIRECTIVE) + 0x0, + /*! \brief Enables recording information for graph debugging. */ + VX_DIRECTIVE_ENABLE_LOGGING = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_DIRECTIVE) + 0x1, + /*! \brief Disables performance counters for the context. By default performance counters are disabled */ + VX_DIRECTIVE_DISABLE_PERFORMANCE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_DIRECTIVE) + 0x2, + /*! \brief Enables performance counters for the context. */ + VX_DIRECTIVE_ENABLE_PERFORMANCE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_DIRECTIVE) + 0x3, +}; + +/*! \brief The Graph State Enumeration. + * \ingroup group_graph + */ +enum vx_graph_state_e { + /*! \brief The graph should be verified before execution */ + VX_GRAPH_STATE_UNVERIFIED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_GRAPH_STATE) + 0x0, + /*! \brief The graph has been verified and has not been executed or scheduled for execution yet */ + VX_GRAPH_STATE_VERIFIED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_GRAPH_STATE) + 0x1, + /*! \brief The graph either has been scheduled and not completed, or is being executed */ + VX_GRAPH_STATE_RUNNING = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_GRAPH_STATE) + 0x2, + /*! \brief The graph execution was abandoned */ + VX_GRAPH_STATE_ABANDONED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_GRAPH_STATE) + 0x3, + /*! \brief The graph execution is completed and the graph is not scheduled for execution */ + VX_GRAPH_STATE_COMPLETED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_GRAPH_STATE) + 0x4, +}; + +/*! \brief The graph attributes list. + * \ingroup group_graph + */ +enum vx_graph_attribute_e { + /*! \brief Returns the number of nodes in a graph. Read-only. Use a \ref vx_uint32 parameter.*/ + VX_GRAPH_NUMNODES = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_GRAPH) + 0x0, + /*! \brief Returns the overall performance of the graph. Read-only. Use a \ref vx_perf_t parameter. + * The accuracy of timing information is platform dependent. + * \note Performance tracking must have been enabled. See \ref vx_directive_e + */ + VX_GRAPH_PERFORMANCE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_GRAPH) + 0x2, + /*! \brief Returns the number of explicitly declared parameters on the graph. Read-only. Use a \ref vx_uint32 parameter. */ + VX_GRAPH_NUMPARAMETERS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_GRAPH) + 0x3, + /*! \brief Returns the state of the graph. See \ref vx_graph_state_e enum. */ + VX_GRAPH_STATE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_GRAPH) + 0x4, +}; + +/*! \brief The Conversion Policy Enumeration. + * \ingroup group_basic_features + */ +enum vx_convert_policy_e { + /*! \brief Results are the least significant bits of the output operand, as if + * stored in two's complement binary format in the size of its bit-depth. + */ + VX_CONVERT_POLICY_WRAP = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CONVERT_POLICY) + 0x0, + /*! \brief Results are saturated to the bit depth of the output operand. */ + VX_CONVERT_POLICY_SATURATE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CONVERT_POLICY) + 0x1, +}; + +/*! \brief Based on the VX_DF_IMAGE definition. + * \note Use \ref vx_df_image to contain these values. 
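+ * \note Informative example: each code below is simply the four characters
+ * packed by \ref VX_DF_IMAGE, so a format value can also be constructed or
+ * compared directly, e.g.
+ * \code
+ * vx_df_image fmt = VX_DF_IMAGE('U','0','0','8');  // same value as VX_DF_IMAGE_U8
+ * \endcode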
+ * \ingroup group_basic_features
+ */
+enum vx_df_image_e {
+    /*! \brief A virtual image of no defined type. */
+    VX_DF_IMAGE_VIRT = VX_DF_IMAGE('V','I','R','T'),
+    /*! \brief A single plane of 24-bit pixel as 3 interleaved 8-bit units of
+     * R then G then B data. This uses the BT709 full range by default.
+     */
+    VX_DF_IMAGE_RGB = VX_DF_IMAGE('R','G','B','2'),
+    /*! \brief A single plane of 32-bit pixel as 4 interleaved 8-bit units of
+     * R then G then B data, then a don't care byte.
+     * This uses the BT709 full range by default.
+     */
+    VX_DF_IMAGE_RGBX = VX_DF_IMAGE('R','G','B','A'),
+    /*! \brief A 2-plane YUV format of Luma (Y) and interleaved UV data at
+     * 4:2:0 sampling. This uses the BT709 full range by default.
+     */
+    VX_DF_IMAGE_NV12 = VX_DF_IMAGE('N','V','1','2'),
+    /*! \brief A 2-plane YUV format of Luma (Y) and interleaved VU data at
+     * 4:2:0 sampling. This uses the BT709 full range by default.
+     */
+    VX_DF_IMAGE_NV21 = VX_DF_IMAGE('N','V','2','1'),
+    /*! \brief A single plane of 32-bit macro pixel of U0, Y0, V0, Y1 bytes.
+     * This uses the BT709 full range by default.
+     */
+    VX_DF_IMAGE_UYVY = VX_DF_IMAGE('U','Y','V','Y'),
+    /*! \brief A single plane of 32-bit macro pixel of Y0, U0, Y1, V0 bytes.
+     * This uses the BT709 full range by default.
+     */
+    VX_DF_IMAGE_YUYV = VX_DF_IMAGE('Y','U','Y','V'),
+    /*! \brief A 3 plane of 8-bit 4:2:0 sampled Y, U, V planes.
+     * This uses the BT709 full range by default.
+     */
+    VX_DF_IMAGE_IYUV = VX_DF_IMAGE('I','Y','U','V'),
+    /*! \brief A 3 plane of 8 bit 4:4:4 sampled Y, U, V planes.
+     * This uses the BT709 full range by default.
+     */
+    VX_DF_IMAGE_YUV4 = VX_DF_IMAGE('Y','U','V','4'),
+    /*! \brief A single plane of unsigned 1-bit data packed eight pixels per byte.
+     * The least significant bit is the first pixel in each byte.
+     * See \ref vx_imagepatch_addressing_t for more details.
+     */
+    VX_DF_IMAGE_U1 = VX_DF_IMAGE('U','0','0','1'),
+    /*! \brief A single plane of unsigned 8-bit data.
+     * The range of data is not specified, as it may be extracted from a YUV or
+     * generated.
+     */
+    VX_DF_IMAGE_U8 = VX_DF_IMAGE('U','0','0','8'),
+    /*! \brief A single plane of unsigned 16-bit data.
+     * The range of data is not specified, as it may be extracted from a YUV or
+     * generated.
+     */
+    VX_DF_IMAGE_U16 = VX_DF_IMAGE('U','0','1','6'),
+    /*! \brief A single plane of signed 16-bit data.
+     * The range of data is not specified, as it may be extracted from a YUV or
+     * generated.
+     */
+    VX_DF_IMAGE_S16 = VX_DF_IMAGE('S','0','1','6'),
+    /*! \brief A single plane of unsigned 32-bit data.
+     * The range of data is not specified, as it may be extracted from a YUV or
+     * generated.
+     */
+    VX_DF_IMAGE_U32 = VX_DF_IMAGE('U','0','3','2'),
+    /*! \brief A single plane of signed 32-bit data.
+     * The range of data is not specified, as it may be extracted from a YUV or
+     * generated.
+     */
+    VX_DF_IMAGE_S32 = VX_DF_IMAGE('S','0','3','2'),
+};
+
+/*! \brief The Target Enumeration.
+ * \ingroup group_basic_features
+ */
+enum vx_target_e {
+    /*! \brief Any available target. An OpenVX implementation must support at least one target associated with this value */
+    VX_TARGET_ANY = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_TARGET) + 0x0000,
+    /*! \brief Target, explicitly specified by its (case-insensitive) name string. */
+    VX_TARGET_STRING = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_TARGET) + 0x0001,
+    /*! \brief Start of Vendor specific target enumerates. */
+    VX_TARGET_VENDOR_BEGIN = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_TARGET) + 0x1000,
+};
+
+/*!
\brief The reference attributes list. + * \ingroup group_reference + */ +enum vx_reference_attribute_e { + /*! \brief Returns the reference count of the object. Read-only. Use a \ref vx_uint32 parameter. */ + VX_REFERENCE_COUNT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_REFERENCE) + 0x0, + /*! \brief Returns the \ref vx_type_e of the reference. Read-only. Use a \ref vx_enum parameter. */ + VX_REFERENCE_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_REFERENCE) + 0x1, + /*! \brief Used to query the reference for its name. Read-write. Use a *\ref vx_char parameter. */ + VX_REFERENCE_NAME = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_REFERENCE) + 0x2, +}; + +/*! \brief A list of context attributes. + * \ingroup group_context + */ +enum vx_context_attribute_e { + /*! \brief Queries the unique vendor ID. Read-only. Use a \ref vx_uint16. */ + VX_CONTEXT_VENDOR_ID = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x0, + /*! \brief Queries the OpenVX Version Number. Read-only. Use a \ref vx_uint16 */ + VX_CONTEXT_VERSION = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x1, + /*! \brief Queries the context for the number of \e unique kernels. Read-only. Use a \ref vx_uint32 parameter. */ + VX_CONTEXT_UNIQUE_KERNELS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x2, + /*! \brief Queries the context for the number of active modules. Read-only. Use a \ref vx_uint32 parameter. */ + VX_CONTEXT_MODULES = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x3, + /*! \brief Queries the context for the number of active references. Read-only. Use a \ref vx_uint32 parameter. */ + VX_CONTEXT_REFERENCES = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x4, + /*! \brief Queries the context for it's implementation name. Read-only. Use a \ref vx_char[\ref VX_MAX_IMPLEMENTATION_NAME] array */ + VX_CONTEXT_IMPLEMENTATION = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x5, + /*! \brief Queries the number of bytes in the extensions string. Read-only. Use a \ref vx_size parameter. */ + VX_CONTEXT_EXTENSIONS_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x6, + /*! \brief Retrieves the extensions string. Read-only. + * This is a space-separated string of extension names. Each OpenVX official extension has a unique identifier, + * comprised of capital letters, numbers and the underscore character, prefixed with "KHR_", for example "KHR_NEW_FEATURE". + * Use a \ref vx_char pointer allocated to the size returned from \ref VX_CONTEXT_EXTENSIONS_SIZE. + */ + VX_CONTEXT_EXTENSIONS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x7, + /*! \brief The maximum width or height of a convolution matrix. + * Read-only. Use a \ref vx_size parameter. + * Each vendor must support centered kernels of size w X h, where both w + * and h are odd numbers, 3 <= w <= n and 3 <= h <= n, where n is the value of the + * \ref VX_CONTEXT_CONVOLUTION_MAX_DIMENSION attribute. n is an odd + * number that should not be smaller than 9. w and h may or may not be equal to + * each other. All combinations of w and h meeting the conditions above must be + * supported. The behavior of \ref vxCreateConvolution is undefined for values + * larger than the value returned by this attribute. + */ + VX_CONTEXT_CONVOLUTION_MAX_DIMENSION = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x8, + /*! \brief The maximum window dimension of the OpticalFlowPyrLK kernel. The value of this attribute shall be equal to or greater than '9'. + * \see \ref VX_KERNEL_OPTICAL_FLOW_PYR_LK. Read-only. Use a \ref vx_size parameter. 
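+     * An informative usage sketch (here \c context stands for any valid \ref vx_context):
+     * \code
+     * vx_size max_window = 0;
+     * vxQueryContext(context, VX_CONTEXT_OPTICAL_FLOW_MAX_WINDOW_DIMENSION, &max_window, sizeof(max_window));
+     * \endcode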
+ */ + VX_CONTEXT_OPTICAL_FLOW_MAX_WINDOW_DIMENSION = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0x9, + /*! \brief The border mode for immediate mode functions. + * \details Graph mode functions are unaffected by this attribute. Read-write. Use a pointer to a \ref vx_border_t structure as parameter. + * \note The assumed default value for immediate mode functions is \ref VX_BORDER_UNDEFINED. + */ + VX_CONTEXT_IMMEDIATE_BORDER = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0xA, + /*! \brief Returns the table of all unique the kernels that exist in the context. + * Read-only. Use a \ref vx_kernel_info_t array. + * \pre You must call \ref vxQueryContext with \ref VX_CONTEXT_UNIQUE_KERNELS + * to compute the necessary size of the array. + */ + VX_CONTEXT_UNIQUE_KERNEL_TABLE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0xB, + /*! \brief The unsupported border mode policy for immediate mode functions. Read-Write. + * \details Graph mode functions are unaffected by this attribute. Use a \ref vx_enum as parameter. Will contain a \ref vx_border_policy_e. + * \note The assumed default value for immediate mode functions is \ref VX_BORDER_POLICY_DEFAULT_TO_UNDEFINED. Users should refer to the documentation of their implementation to determine what border modes are supported by each kernel. + */ + VX_CONTEXT_IMMEDIATE_BORDER_POLICY = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0xC, + /*! \brief The dimension of the largest nonlinear filter supported. See \ref vxNonLinearFilterNode. + * \details The implementation must support all dimensions (height or width, not necessarily the same) + * up to the value of this attribute. The lowest value that must be supported for this attribute is 9. + * Read-only. Use a \ref vx_size parameter. + */ + VX_CONTEXT_NONLINEAR_MAX_DIMENSION = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0xd, + /*! \brief tensor Data maximal number of dimensions supported by the implementation. */ + VX_CONTEXT_MAX_TENSOR_DIMS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONTEXT) + 0xE, +}; + +/*! \brief The kernel attributes list + * \ingroup group_kernel + */ +enum vx_kernel_attribute_e { + /*! \brief Queries a kernel for the number of parameters the kernel + * supports. Read-only. Use a \ref vx_uint32 parameter. + */ + VX_KERNEL_PARAMETERS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0x0, + /*! \brief Queries the name of the kernel. Not settable. + * Read-only. Use a \ref vx_char[\ref VX_MAX_KERNEL_NAME] array (not a \ref vx_array). + */ + VX_KERNEL_NAME = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0x1, + /*! \brief Queries the enum of the kernel. Not settable. + * Read-only. Use a \ref vx_enum parameter. + */ + VX_KERNEL_ENUM = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0x2, + /*! \brief The local data area allocated with each kernel when it becomes a + * node. Read-write. Can be written only before user-kernel finalization. + * Use a \ref vx_size parameter. + * \note If not set it will default to zero. + */ + VX_KERNEL_LOCAL_DATA_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_KERNEL) + 0x3, +}; + +/*! \brief The node attributes list. + * \ingroup group_node + */ +enum vx_node_attribute_e { + /*! \brief Queries the status of node execution. Read-only. Use a \ref vx_status parameter. */ + VX_NODE_STATUS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x0, + /*! \brief Queries the performance of the node execution. + * The accuracy of timing information is platform dependent and also depends on the graph + * optimizations. 
Read-only. + * \note Performance tracking must have been enabled. See \ref vx_directive_e. + */ + VX_NODE_PERFORMANCE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x1, + /*! \brief Gets or sets the border mode of the node. + * Read-write. Use a \ref vx_border_t structure with a default value of VX_BORDER_UNDEFINED. + */ + VX_NODE_BORDER = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x2, + /*! \brief Indicates the size of the kernel local memory area. + * Read-only. Can be written only at user-node (de)initialization if VX_KERNEL_LOCAL_DATA_SIZE==0. + * Use a \ref vx_size parameter. + */ + VX_NODE_LOCAL_DATA_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x3, + /*! \brief Indicates the pointer kernel local memory area. + * Read-Write. Can be written only at user-node (de)initialization if VX_KERNEL_LOCAL_DATA_SIZE==0. + * Use a void * parameter. + */ + VX_NODE_LOCAL_DATA_PTR = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x4, + /*! \brief Indicates the number of node parameters, including optional parameters that are not passed. + * Read-only. Use a \ref vx_uint32 parameter. + */ + VX_NODE_PARAMETERS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x5, + /*! \brief Indicates whether the node is replicated. Read-only. + * Use a \ref vx_bool parameter. + */ + VX_NODE_IS_REPLICATED = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x6, + /*! \brief Indicates the replicated parameters. Read-only. + * Use a \ref vx_bool* parameter. + */ + VX_NODE_REPLICATE_FLAGS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x7, + /*! \brief Indicates the behavior with respect to the valid rectangle. Read-only. + * Use a \ref vx_bool parameter. + */ + VX_NODE_VALID_RECT_RESET = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x8, +}; + +/*! \brief The parameter attributes list + * \ingroup group_parameter + */ +enum vx_parameter_attribute_e { + /*! \brief Queries a parameter for its index value on the kernel with which it is associated. Read-only. Use a \ref vx_uint32 parameter. */ + VX_PARAMETER_INDEX = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PARAMETER) + 0x0, + /*! \brief Queries a parameter for its direction value on the kernel with which it is associated. Read-only. Use a \ref vx_enum parameter. */ + VX_PARAMETER_DIRECTION = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PARAMETER) + 0x1, + /*! \brief Queries a parameter for its type, \ref vx_type_e is returned. Read-only. The size of the parameter is implied for plain data objects. For opaque data objects like images and arrays a query to their attributes has to be called to determine the size. */ + VX_PARAMETER_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PARAMETER) + 0x2, + /*! \brief Queries a parameter for its state. A value in \ref vx_parameter_state_e is returned. Read-only. Use a \ref vx_enum parameter. */ + VX_PARAMETER_STATE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PARAMETER) + 0x3, + /*! \brief Use to extract the reference contained in the parameter. Read-only. Use a \ref vx_reference parameter. */ + VX_PARAMETER_REF = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PARAMETER) + 0x4, + /*! \brief Use to extract the meta format contained in the parameter. Read-only. Use a \ref vx_meta_format parameter. */ + VX_PARAMETER_META_FORMAT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PARAMETER) + 0x5, +}; + +/*! \brief The image attributes list. + * \ingroup group_image + */ +enum vx_image_attribute_e { + /*! \brief Queries an image for its width. Read-only. Use a \ref vx_uint32 parameter. 
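+     * An informative usage sketch (here \c image stands for any valid \ref vx_image):
+     * \code
+     * vx_uint32 width = 0;
+     * vxQueryImage(image, VX_IMAGE_WIDTH, &width, sizeof(width));
+     * \endcode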
*/ + VX_IMAGE_WIDTH = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x0, + /*! \brief Queries an image for its height. Read-only. Use a \ref vx_uint32 parameter. */ + VX_IMAGE_HEIGHT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x1, + /*! \brief Queries an image for its format. Read-only. Use a \ref vx_df_image parameter. */ + VX_IMAGE_FORMAT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x2, + /*! \brief Queries an image for its number of planes. Read-only. Use a \ref vx_size parameter. */ + VX_IMAGE_PLANES = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x3, + /*! \brief Queries an image for its color space (see \ref vx_color_space_e). Read-write. Use a \ref vx_enum parameter. */ + VX_IMAGE_SPACE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x4, + /*! \brief Queries an image for its channel range (see \ref vx_channel_range_e). Read-only. Use a \ref vx_enum parameter. */ + VX_IMAGE_RANGE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x5, + /*! \brief Queries memory type if created using vxCreateImageFromHandle. If vx_image was not created using + vxCreateImageFromHandle, VX_MEMORY_TYPE_NONE is returned. Use a \ref vx_memory_type_e parameter. */ + VX_IMAGE_MEMORY_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x7, + /*! \brief Queries if an image is uniform. Read-only. Use a \ref vx_bool parameter */ + VX_IMAGE_IS_UNIFORM = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x8, + /*! \brief Queries the image uniform value if any. Read-only. Use a \ref vx_pixel_value_t parameter. */ + VX_IMAGE_UNIFORM_VALUE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_IMAGE) + 0x9, +}; + +/*! \brief The scalar attributes list. + * \ingroup group_scalar + */ +enum vx_scalar_attribute_e { + /*! \brief Queries the type of atomic that is contained in the scalar. Read-only. Use a \ref vx_enum parameter.*/ + VX_SCALAR_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_SCALAR) + 0x0, +}; + +/*! \brief A type of operation in which both operands are scalars. + * \see group_scalar + * \ingroup group_scalar + */ +enum vx_scalar_operation_e { + /*! \brief logical and. */ + VX_SCALAR_OP_AND = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0x0, + /*! \brief logical or. */ + VX_SCALAR_OP_OR = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0x1, + /*! \brief logical exclusive or. */ + VX_SCALAR_OP_XOR = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0x2, + /*! \brief logical nand. */ + VX_SCALAR_OP_NAND = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0x3, + /*! \brief comparison (equal). */ + VX_SCALAR_OP_EQUAL = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0x4, + /*! \brief comparison (not equal). */ + VX_SCALAR_OP_NOTEQUAL = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0x5, + /*! \brief comparison (less than). */ + VX_SCALAR_OP_LESS = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0x6, + /*! \brief comparison (less than or equal to). */ + VX_SCALAR_OP_LESSEQ = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0x7, + /*! \brief comparison (greater than). */ + VX_SCALAR_OP_GREATER = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0x8, + /*! \brief comparison (greater than or equal to). */ + VX_SCALAR_OP_GREATEREQ = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0x9, + /*! \brief arithmetic addition. */ + VX_SCALAR_OP_ADD = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0xA, + /*! \brief arithmetic subtraction. 
*/ + VX_SCALAR_OP_SUBTRACT = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0xB, + /*! \brief arithmetic multiplication. */ + VX_SCALAR_OP_MULTIPLY = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0xC, + /*! \brief arithmetic division. */ + VX_SCALAR_OP_DIVIDE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0xD, + /*! \brief arithmetic (modulo operator). */ + VX_SCALAR_OP_MODULUS = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0xE, + /*! \brief minimum of two scalars. */ + VX_SCALAR_OP_MIN = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0xF, + /*! \brief maximum of two scalars. */ + VX_SCALAR_OP_MAX = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_SCALAR_OPERATION) + 0x10, +}; + +/*! \brief The Look-Up Table (LUT) attribute list. + * \ingroup group_lut + */ +enum vx_lut_attribute_e { + /*! \brief Indicates the value type of the LUT. Read-only. Use a \ref vx_enum. */ + VX_LUT_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS,VX_TYPE_LUT) + 0x0, + /*! \brief Indicates the number of elements in the LUT. Read-only. Use a \ref vx_size. */ + VX_LUT_COUNT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS,VX_TYPE_LUT) + 0x1, + /*! \brief Indicates the total size of the LUT in bytes. Read-only. Uses a \ref vx_size. */ + VX_LUT_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS,VX_TYPE_LUT) + 0x2, + /*! \brief Indicates the index of the input value = 0. Read-only. Uses a \ref vx_uint32. */ + VX_LUT_OFFSET = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS,VX_TYPE_LUT) + 0x3, +}; + +/*! \brief The distribution attribute list. + * \ingroup group_distribution + */ +enum vx_distribution_attribute_e { + /*! \brief Indicates the number of dimensions in the distribution. Read-only. Use a \ref vx_size parameter. */ + VX_DISTRIBUTION_DIMENSIONS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DISTRIBUTION) + 0x0, + /*! \brief Indicates the start of the values to use (inclusive). Read-only. Use a \ref vx_int32 parameter. */ + VX_DISTRIBUTION_OFFSET = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DISTRIBUTION) + 0x1, + /*! \brief Indicates the total number of the consecutive values of the distribution interval. */ + VX_DISTRIBUTION_RANGE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DISTRIBUTION) + 0x2, + /*! \brief Indicates the number of bins. Read-only. Use a \ref vx_size parameter. */ + VX_DISTRIBUTION_BINS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DISTRIBUTION) + 0x3, + /*! \brief Indicates the width of a bin. Equal to the range divided by the number of bins. If the range is not a + * multiple of the number of bins, it is not valid. Read-only. Use a \ref vx_uint32 parameter. */ + VX_DISTRIBUTION_WINDOW = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DISTRIBUTION) + 0x4, + /*! \brief Indicates the total size of the distribution in bytes. Read-only. Use a \ref vx_size parameter. */ + VX_DISTRIBUTION_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DISTRIBUTION) + 0x5, +}; + +/*! \brief The Threshold types. + * \ingroup group_threshold + */ +enum vx_threshold_type_e { + /*! \brief A threshold with only 1 value. */ + VX_THRESHOLD_TYPE_BINARY = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_THRESHOLD_TYPE) + 0x0, + /*! \brief A threshold with 2 values (upper/lower). Use with Canny Edge Detection. */ + VX_THRESHOLD_TYPE_RANGE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_THRESHOLD_TYPE) + 0x1, +}; + +/*! \brief The threshold attributes. + * \ingroup group_threshold + */ +enum vx_threshold_attribute_e { + /*! \brief The value type of the threshold. Read-only. Use a \ref vx_enum parameter. Will contain a \ref vx_threshold_type_e. 
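+     * An informative usage sketch (here \c thresh stands for any valid \ref vx_threshold):
+     * \code
+     * vx_enum thresh_type = 0;
+     * vxQueryThreshold(thresh, VX_THRESHOLD_TYPE, &thresh_type, sizeof(thresh_type));
+     * \endcode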
*/ + VX_THRESHOLD_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x0, + /*! \brief The input image format the threshold was created for. Read-only. Use a \ref vx_enum parameter. Will contain a \ref vx_df_image_e.*/ + VX_THRESHOLD_INPUT_FORMAT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x7, + /*! \brief The output image format the threshold was created for. Read-only. Use a \ref vx_enum parameter. Will contain a \ref vx_df_image_e.*/ + VX_THRESHOLD_OUTPUT_FORMAT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_THRESHOLD) + 0x8 +}; + +/*! \brief The matrix attributes. + * \ingroup group_matrix + */ +enum vx_matrix_attribute_e { + /*! \brief The value type of the matrix. Read-only. Use a \ref vx_enum parameter. */ + VX_MATRIX_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_MATRIX) + 0x0, + /*! \brief The M dimension of the matrix. Read-only. Use a \ref vx_size parameter. */ + VX_MATRIX_ROWS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_MATRIX) + 0x1, + /*! \brief The N dimension of the matrix. Read-only. Use a \ref vx_size parameter. */ + VX_MATRIX_COLUMNS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_MATRIX) + 0x2, + /*! \brief The total size of the matrix in bytes. Read-only. Use a \ref vx_size parameter. */ + VX_MATRIX_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_MATRIX) + 0x3, + /*! \brief The origin of the matrix with a default value of [floor(VX_MATRIX_COLUMNS/2), + floor(VX_MATRIX_ROWS/2)]. Read-only. Use a \ref vx_coordinates2d_t parameter. */ + VX_MATRIX_ORIGIN = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_MATRIX) + 0x4, + /*! \brief The pattern of the matrix. See \ref vx_pattern_e . Read-only. Use a + * \ref vx_enum parameter. If the matrix was created via \ref vxCreateMatrixFromPattern + * or \ref vxCreateMatrixFromPatternAndOrigin, the attribute corresponds to the given pattern. + * Otherwise the attribute is \ref VX_PATTERN_OTHER. */ + VX_MATRIX_PATTERN = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_MATRIX) + 0x5, +}; + +/*! \brief The convolution attributes. + * \ingroup group_convolution + */ +enum vx_convolution_attribute_e { + /*! \brief The number of rows of the convolution matrix. Read-only. Use a \ref vx_size parameter. */ + VX_CONVOLUTION_ROWS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONVOLUTION) + 0x0, + /*! \brief The number of columns of the convolution matrix. Read-only. Use a \ref vx_size parameter. */ + VX_CONVOLUTION_COLUMNS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONVOLUTION) + 0x1, + /*! \brief The scale of the convolution matrix. Read-write. Use a \ref vx_uint32 parameter. + * \if OPENVX_STRICT_1_0 + * \note For 1.0, only powers of 2 are supported up to 2^31. + * \endif + */ + VX_CONVOLUTION_SCALE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONVOLUTION) + 0x2, + /*! \brief The total size of the convolution matrix in bytes. Read-only. Use a \ref vx_size parameter. */ + VX_CONVOLUTION_SIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_CONVOLUTION) + 0x3, +}; + +/*! \brief The pyramid object attributes. + * \ingroup group_pyramid + */ +enum vx_pyramid_attribute_e { + /*! \brief The number of levels of the pyramid. Read-only. Use a \ref vx_size parameter. */ + VX_PYRAMID_LEVELS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PYRAMID) + 0x0, + /*! \brief The scale factor between each level of the pyramid. Read-only. Use a \ref vx_float32 parameter. */ + VX_PYRAMID_SCALE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PYRAMID) + 0x1, + /*! \brief The width of the 0th image in pixels. Read-only. Use a \ref vx_uint32 parameter. 
*/ + VX_PYRAMID_WIDTH = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PYRAMID) + 0x2, + /*! \brief The height of the 0th image in pixels. Read-only. Use a \ref vx_uint32 parameter. */ + VX_PYRAMID_HEIGHT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PYRAMID) + 0x3, + /*! \brief The \ref vx_df_image_e format of the image. Read-only. Use a \ref vx_df_image parameter. */ + VX_PYRAMID_FORMAT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_PYRAMID) + 0x4, +}; + +/*! \brief The remap object attributes. + * \ingroup group_remap + */ +enum vx_remap_attribute_e { + /*! \brief The source width. Read-only. Use a \ref vx_uint32 parameter. */ + VX_REMAP_SOURCE_WIDTH = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_REMAP) + 0x0, + /*! \brief The source height. Read-only. Use a \ref vx_uint32 parameter. */ + VX_REMAP_SOURCE_HEIGHT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_REMAP) + 0x1, + /*! \brief The destination width. Read-only. Use a \ref vx_uint32 parameter. */ + VX_REMAP_DESTINATION_WIDTH = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_REMAP) + 0x2, + /*! \brief The destination height. Read-only. Use a \ref vx_uint32 parameter. */ + VX_REMAP_DESTINATION_HEIGHT = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_REMAP) + 0x3, +}; + +/*! \brief The array object attributes. + * \ingroup group_array + */ +enum vx_array_attribute_e { + /*! \brief The type of the Array items. Read-only. Use a \ref vx_enum parameter. */ + VX_ARRAY_ITEMTYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_ARRAY) + 0x0, + /*! \brief The number of items in the Array. Read-only. Use a \ref vx_size parameter. */ + VX_ARRAY_NUMITEMS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_ARRAY) + 0x1, + /*! \brief The maximal number of items that the Array can hold. Read-only. Use a \ref vx_size parameter. */ + VX_ARRAY_CAPACITY = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_ARRAY) + 0x2, + /*! \brief Queries an array item size. Read-only. Use a \ref vx_size parameter. */ + VX_ARRAY_ITEMSIZE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_ARRAY) + 0x3, +}; + +/*! \brief The ObjectArray object attributes. + * \ingroup group_object_array + */ +enum vx_object_array_attribute_e { + /*! \brief The type of the ObjectArray items. Read-only. Use a \ref vx_enum parameter. */ + VX_OBJECT_ARRAY_ITEMTYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_OBJECT_ARRAY) + 0x0, + /*! \brief The number of items in the ObjectArray. Read-only. Use a \ref vx_size parameter. */ + VX_OBJECT_ARRAY_NUMITEMS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_OBJECT_ARRAY) + 0x1, +}; +/*! \brief tensor Data attributes. + * \ingroup group_object_tensor + */ +enum vx_tensor_attribute_e +{ + /*! \brief Number of dimensions. */ + VX_TENSOR_NUMBER_OF_DIMS = VX_ATTRIBUTE_BASE( VX_ID_KHRONOS, VX_TYPE_TENSOR ) + 0x0, + /*! \brief Dimension sizes. */ + VX_TENSOR_DIMS = VX_ATTRIBUTE_BASE( VX_ID_KHRONOS, VX_TYPE_TENSOR ) + 0x1, + /*! \brief tensor Data element data type. vx_type_e */ + VX_TENSOR_DATA_TYPE = VX_ATTRIBUTE_BASE( VX_ID_KHRONOS, VX_TYPE_TENSOR ) + 0x2, + /*! \brief fixed point position when the input element type is integer. */ + VX_TENSOR_FIXED_POINT_POSITION = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_TENSOR) + 0x3, + /*! \brief tensor quantization data type. */ + VX_TENSOR_QUANT_FORMAT = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x0, + /*! \brief tensor quantization zero point. */ + VX_TENSOR_ZERO_POINT = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x1, + /*! \brief tensor quantization scale value. */ + VX_TENSOR_SCALE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x2, + /*! \brief the rank of tensor. 
*/ + VX_TENSOR_RANK = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x3, + /*! \brief the precision of tensor. */ + VX_TENSOR_PRECISION = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x4, + /*! \brief the data lifetime of tensor. */ + VX_TENSOR_LIFETIME = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x5, + /*! \brief the value status of tensor. */ + VX_TENSOR_VALUE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x6, +}; + +/*! \brief The meta valid rectangle attributes. + * \ingroup group_user_kernels + */ +enum vx_meta_valid_rect_attribute_e { + /*! \brief Valid rectangle callback during output parameter validation. Write-only. */ + VX_VALID_RECT_CALLBACK = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_META_FORMAT) + 0x1, +}; + +/*! \brief The channel enumerations for channel extractions. + * \see vxChannelExtractNode + * \see vxuChannelExtract + * \see VX_KERNEL_CHANNEL_EXTRACT + * \ingroup group_basic_features + */ +enum vx_channel_e { + /*! \brief Used by formats with unknown channel types. */ + VX_CHANNEL_0 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x0, + /*! \brief Used by formats with unknown channel types. */ + VX_CHANNEL_1 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x1, + /*! \brief Used by formats with unknown channel types. */ + VX_CHANNEL_2 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x2, + /*! \brief Used by formats with unknown channel types. */ + VX_CHANNEL_3 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x3, + + /*! \brief Use to extract the RED channel, no matter the byte or packing order. */ + VX_CHANNEL_R = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x10, + /*! \brief Use to extract the GREEN channel, no matter the byte or packing order. */ + VX_CHANNEL_G = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x11, + /*! \brief Use to extract the BLUE channel, no matter the byte or packing order. */ + VX_CHANNEL_B = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x12, + /*! \brief Use to extract the ALPHA channel, no matter the byte or packing order. */ + VX_CHANNEL_A = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x13, + /*! \brief Use to extract the LUMA channel, no matter the byte or packing order. */ + VX_CHANNEL_Y = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x14, + /*! \brief Use to extract the Cb/U channel, no matter the byte or packing order. */ + VX_CHANNEL_U = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x15, + /*! \brief Use to extract the Cr/V/Value channel, no matter the byte or packing order. */ + VX_CHANNEL_V = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x16, +}; + +/*! \brief An enumeration of memory import types. + * \ingroup group_context + */ +enum vx_memory_type_e { + /*! \brief For memory allocated through OpenVX, this is the import type. */ + VX_MEMORY_TYPE_NONE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_MEMORY_TYPE) + 0x0, + + /*! \brief The default memory type to import from the Host. */ + VX_MEMORY_TYPE_HOST = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_MEMORY_TYPE) + 0x1, + + VX_MEMORY_TYPE_DMABUF = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_MEMORY_TYPE) + 0x0, + + VX_MEMORY_TYPE_INTERNAL = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_MEMORY_TYPE) + 0x1, + + VX_MEMORY_TYPE_HOST_UNCACHED = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_MEMORY_TYPE) + 0x2, +}; + +/*! \brief The image reconstruction filters supported by image resampling operations. + * + * The edge of a pixel is interpreted as being aligned to the edge of the image. + * The value for an output pixel is evaluated at the center of that pixel. 
+ * + * This means, for example, that an even enlargement of a factor of two in nearest-neighbor + * interpolation will replicate every source pixel into a 2x2 quad in the destination, and that + * an even shrink by a factor of two in bilinear interpolation will create each destination pixel + * by average a 2x2 quad of source pixels. + * + * Samples that cross the boundary of the source image have values determined by the border + * mode - see \ref vx_border_e and \ref VX_NODE_BORDER. + * \see vxuScaleImage + * \see vxScaleImageNode + * \see VX_KERNEL_SCALE_IMAGE + * \see vxuWarpAffine + * \see vxWarpAffineNode + * \see VX_KERNEL_WARP_AFFINE + * \see vxuWarpPerspective + * \see vxWarpPerspectiveNode + * \see VX_KERNEL_WARP_PERSPECTIVE + * \ingroup group_basic_features + */ +enum vx_interpolation_type_e { + /*! \brief Output values are defined to match the source pixel whose center is nearest to the sample position. */ + VX_INTERPOLATION_NEAREST_NEIGHBOR = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x0, + /*! \brief Output values are defined by bilinear interpolation between the pixels whose centers are closest + * to the sample position, weighted linearly by the distance of the sample from the pixel centers. */ + VX_INTERPOLATION_BILINEAR = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x1, + /*! \brief Output values are determined by averaging the source pixels whose areas fall under the + * area of the destination pixel, projected onto the source image. */ + VX_INTERPOLATION_AREA = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x2, +}; + +/*! \brief An enumeration of non-linear filter functions. + * \ingroup group_basic_features + */ +enum vx_non_linear_filter_e { + /*! \brief Nonlinear median filter. */ + VX_NONLINEAR_FILTER_MEDIAN = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NONLINEAR) + 0x0, + /*! \brief Nonlinear Erode. */ + VX_NONLINEAR_FILTER_MIN = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NONLINEAR) + 0x1 , + /*! \brief Nonlinear Dilate. */ + VX_NONLINEAR_FILTER_MAX = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NONLINEAR) + 0x2, +}; + +/*! \brief An enumeration of matrix patterns. See \ref vxCreateMatrixFromPattern + * and \ref vxCreateMatrixFromPatternAndOrigin + * \ingroup group_basic_features + */ +enum vx_pattern_e { + /*! \brief Box pattern matrix */ + VX_PATTERN_BOX = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_PATTERN) + 0x0, + /*! \brief Cross pattern matrix */ + VX_PATTERN_CROSS = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_PATTERN) + 0x1 , + /*! \brief A square matrix (rows = columns = size) */ + VX_PATTERN_DISK = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_PATTERN) + 0x2, + /*! \brief Matrix with any pattern other than above. */ + VX_PATTERN_OTHER = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_PATTERN) + 0x3, +}; + +/*! \brief The image color space list used by the \ref VX_IMAGE_SPACE attribute of a \ref vx_image. + * \ingroup group_image + */ +enum vx_color_space_e { + /*! \brief Use to indicate that no color space is used. */ + VX_COLOR_SPACE_NONE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_COLOR_SPACE) + 0x0, + /*! \brief Use to indicate that the BT.601 coefficients and SMPTE C primaries are used for conversions. */ + VX_COLOR_SPACE_BT601_525 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_COLOR_SPACE) + 0x1, + /*! \brief Use to indicate that the BT.601 coefficients and BTU primaries are used for conversions. */ + VX_COLOR_SPACE_BT601_625 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_COLOR_SPACE) + 0x2, + /*! \brief Use to indicate that the BT.709 coefficients are used for conversions. 
*/
+    VX_COLOR_SPACE_BT709 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_COLOR_SPACE) + 0x3,
+
+    /*! \brief All images in VX are by default BT.709 */
+    VX_COLOR_SPACE_DEFAULT = VX_COLOR_SPACE_BT709,
+};
+
+/*! \brief The image channel range list used by the \ref VX_IMAGE_RANGE attribute of a \ref vx_image.
+ * \ingroup group_image
+ */
+enum vx_channel_range_e {
+    /*! \brief Full range of the unit of the channel */
+    VX_CHANNEL_RANGE_FULL = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_COLOR_RANGE) + 0x0,
+    /*! \brief Restricted range of the unit of the channel based on the space given */
+    VX_CHANNEL_RANGE_RESTRICTED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_COLOR_RANGE) + 0x1,
+};
+
+/*! \brief The parameter state type.
+ * \ingroup group_parameter
+ */
+enum vx_parameter_state_e {
+    /*! \brief Default. The parameter must be supplied. If not set, during
+     * Verify, an error is returned.
+     */
+    VX_PARAMETER_STATE_REQUIRED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_PARAMETER_STATE) + 0x0,
+    /*! \brief The parameter may be unspecified. The kernel takes care not
+     * to dereference optional parameters until it is certain they are valid.
+     */
+    VX_PARAMETER_STATE_OPTIONAL = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_PARAMETER_STATE) + 0x1,
+
+    VX_NODE_ATTRIBUTE_WEIGHT_BIAS_CACHE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_PARAMETER_STATE) + 0x2,
+};
+
+/*! \brief The border mode list.
+ * \ingroup group_borders
+ */
+enum vx_border_e {
+    /*! \brief No defined border mode behavior is given. */
+    VX_BORDER_UNDEFINED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0,
+    /*! \brief For nodes that support this behavior, a constant value is
+     * \e filled-in when accessing out-of-bounds pixels.
+     */
+    VX_BORDER_CONSTANT = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1,
+    /*! \brief For nodes that support this behavior, a replication of the nearest
+     * edge pixels value is given for out-of-bounds pixels.
+     */
+    VX_BORDER_REPLICATE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2,
+};
+
+/*! \brief The unsupported border mode policy list.
+ * \ingroup group_borders
+ */
+enum vx_border_policy_e {
+    /*! \brief Use VX_BORDER_UNDEFINED instead of unsupported border modes. */
+    VX_BORDER_POLICY_DEFAULT_TO_UNDEFINED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER_POLICY) + 0x0,
+    /*! \brief Return VX_ERROR_NOT_SUPPORTED for unsupported border modes. */
+    VX_BORDER_POLICY_RETURN_ERROR = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER_POLICY) + 0x1,
+};
+
+/*! \brief The termination criteria list.
+ * \see group_vision_function_opticalflowpyrlk
+ * \ingroup group_context
+ */
+enum vx_termination_criteria_e {
+    /*! \brief Indicates a termination after a set number of iterations. */
+    VX_TERM_CRITERIA_ITERATIONS = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_TERM_CRITERIA) + 0x0,
+    /*! \brief Indicates a termination after matching against the value of epsilon provided to the function. */
+    VX_TERM_CRITERIA_EPSILON = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_TERM_CRITERIA) + 0x1,
+    /*! \brief Indicates that both the iterations and epsilon methods are employed. Whichever one matches first
+     * causes the termination.
+     */
+    VX_TERM_CRITERIA_BOTH = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_TERM_CRITERIA) + 0x2,
+};
+
+/*! \brief A normalization type.
+ * \see group_vision_function_canny
+ * \ingroup group_vision_function_canny
+ */
+enum vx_norm_type_e {
+    /*! \brief The L1 normalization. */
+    VX_NORM_L1 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NORM_TYPE) + 0x0,
+    /*! \brief The L2 normalization. */
+    VX_NORM_L2 = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NORM_TYPE) + 0x1,
+};
+
+/*!
\brief The delay attribute list. + * \ingroup group_delay + */ +enum vx_delay_attribute_e { + /*! \brief The type of objects in the delay. Read-only. Use a \ref vx_enum parameter. */ + VX_DELAY_TYPE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DELAY) + 0x0, + /*! \brief The number of items in the delay. Read-only. Use a \ref vx_size parameter.*/ + VX_DELAY_SLOTS = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_DELAY) + 0x1, +}; + +/*! \brief The memory accessor hint flags. + * These enumeration values are used to indicate desired \e system behavior, + * not the \b User intent. For example: these can be interpretted as hints to the + * system about cache operations or marshalling operations. + * \ingroup group_context + */ +enum vx_accessor_e { + /*! \brief The memory shall be treated by the system as if it were read-only. + * If the User writes to this memory, the results are implementation defined. + */ + VX_READ_ONLY = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_ACCESSOR) + 0x1, + /*! \brief The memory shall be treated by the system as if it were write-only. + * If the User reads from this memory, the results are implementation defined. + */ + VX_WRITE_ONLY = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_ACCESSOR) + 0x2, + /*! \brief The memory shall be treated by the system as if it were readable and writeable. + */ + VX_READ_AND_WRITE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_ACCESSOR) + 0x3, +}; + +/*! \brief The Round Policy Enumeration. + * \ingroup group_context + */ +enum vx_round_policy_e { + /*! \brief When scaling, this truncates the least significant values that are lost in operations. */ + VX_ROUND_POLICY_TO_ZERO = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_ROUND_POLICY) + 0x1, + /*! \brief When scaling, this rounds to nearest even output value. */ + VX_ROUND_POLICY_TO_NEAREST_EVEN = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_ROUND_POLICY) + 0x2, +}; + +/*! \brief Local binary pattern supported. + * \ingroup group_vision_function_lbp + */ +enum vx_lbp_format_e +{ + /*! \brief local binary pattern + */ + VX_LBP = VX_ENUM_BASE( VX_ID_KHRONOS, VX_ENUM_LBP_FORMAT ) + 0x0, + /*! \brief Modified Local Binary Patterns + */ + + VX_MLBP = VX_ENUM_BASE( VX_ID_KHRONOS, VX_ENUM_LBP_FORMAT ) + 0x1, + /*! \brief Uniform local binary pattern + */ + VX_ULBP = VX_ENUM_BASE( VX_ID_KHRONOS, VX_ENUM_LBP_FORMAT ) + 0x2 +}; + +/*! \brief comparing metrics. + * \details In all the equations below w and h are width and height of the template image respectively. + * \f$ R \f$ is the compare map. \f$ T \f$ is the template image.\f$ I \f$ is the image on which the template is searched. + * \ingroup group_vision_function_match_template + */ +enum vx_comp_metric_e +{ + /*! \brief hamming distance \f$ R(x,y) = \frac{1}{w*h}\sum_{\grave{x},\grave{y}}^{w,h} XOR(T(\grave{x},\grave{y}),I(x+\grave{x},y+\grave{y}))\f$ */ + VX_COMPARE_HAMMING = VX_ENUM_BASE( VX_ID_KHRONOS, VX_ENUM_COMP_METRIC ) + 0x0, + /*! \brief L1 distance \f$ R(x,y) = \frac{1}{w*h}\sum_{\grave{x},\grave{y}}^{w,h} ABS(T(\grave{x},\grave{y}) - I(x+\grave{x},y+\grave{y}))\f$ */ + VX_COMPARE_L1 = VX_ENUM_BASE( VX_ID_KHRONOS, VX_ENUM_COMP_METRIC ) + 0x1, + /*! \brief L2 distance normalized by image size \f$ R(x,y) = \frac{1}{w*h}\sum_{\grave{x},\grave{y}}^{w,h} (T(\grave{x},\grave{y}) - I(x+\grave{x},y+\grave{y}))^2\f$*/ + VX_COMPARE_L2 = VX_ENUM_BASE( VX_ID_KHRONOS, VX_ENUM_COMP_METRIC ) + 0x2, + /*! 
\brief cross correlation distance \f$ R(x,y) = \frac{1}{w*h}\sum_{\grave{x},\grave{y}}^{w,h} (T(\grave{x},\grave{y})*I(x+\grave{x},y+\grave{y}))\f$*/ + VX_COMPARE_CCORR = VX_ENUM_BASE( VX_ID_KHRONOS, VX_ENUM_COMP_METRIC ) + 0x3, + /*! \brief L2 normalized distance \f$ R(x,y) = \frac{\sum_{\grave{x},\grave{y}}^{w,h} (T(\grave{x},\grave{y}) - I(x+\grave{x},y+\grave{y}))^2} + * {\sqrt{\sum_{\grave{x},\grave{y}}^{w,h} T(\grave{x},\grave{y})^2 * I(x+\grave{x},y+\grave{y})^2}} \f$*/ + VX_COMPARE_L2_NORM = VX_ENUM_BASE( VX_ID_KHRONOS, VX_ENUM_COMP_METRIC ) + 0x4, + /*! \brief cross correlation normalized distance \f$ R(x,y) = \frac{\sum_{\grave{x},\grave{y}}^{w,h} T(\grave{x},\grave{y}) * I(x+\grave{x},y+\grave{y})*2^{15}} + * {\sqrt{\sum_{\grave{x},\grave{y}}^{w,h} T(\grave{x},\grave{y})^2 * I(x+\grave{x},y+\grave{y})^2}} \f$*/ + VX_COMPARE_CCORR_NORM = VX_ENUM_BASE( VX_ID_KHRONOS, VX_ENUM_COMP_METRIC ) + 0x5 +}; + +#if defined(_WIN32) || defined(UNDER_CE) +#if defined(_WIN64) +/*! \brief Use to aid in debugging values in OpenVX. + * \ingroup group_basic_features + */ +#define VX_FMT_REF "%I64u" +/*! \brief Use to aid in debugging values in OpenVX. + * \ingroup group_basic_features + */ +#define VX_FMT_SIZE "%I64u" +#else +/*! \brief Use to aid in debugging values in OpenVX. + * \ingroup group_basic_features + */ +#define VX_FMT_REF "%lu" +/*! \brief Use to aid in debugging values in OpenVX. + * \ingroup group_basic_features + */ +#define VX_FMT_SIZE "%lu" +#endif +#else +/*! \brief Use to aid in debugging values in OpenVX. + * \ingroup group_basic_features + */ +#define VX_FMT_REF "%p" +/*! \brief Use to aid in debugging values in OpenVX. + * \ingroup group_basic_features + */ +#define VX_FMT_SIZE "%zu" +#endif +/*! \brief Use to indicate the 1:1 ratio in Q22.10 format. + * \ingroup group_basic_features + */ +#define VX_SCALE_UNITY (1024u) + +/*! + * \brief The addressing image patch structure is used by the Host only + * to address pixels in an image patch. The fields of the structure are defined as: + * \arg dim - The dimensions of the image in logical pixel units in the x & y direction. + * \arg stride - The physical byte distance from a logical pixel to the next + * logically adjacent pixel in the positive x or y direction. + * \arg scale - The relationship of scaling from the primary plane (typically + * the zero indexed plane) to this plane. An integer down-scaling factor of \f$ f \f$ shall be + * set to a value equal to \f$ scale = \frac{unity}{f} \f$ and an integer up-scaling factor of \f$ f \f$ + * shall be set to a value of \f$ scale = unity * f \f$. \f$ unity \f$ is defined as \ref VX_SCALE_UNITY. + * \arg step - The step is the number of logical pixel units to skip to + * arrive at the next physically unique pixel. For example, on a plane that is + * half-scaled in a dimension, the step in that dimension is 2 to indicate that + * every other pixel in that dimension is an alias. This is useful in situations + * where iteration over unique pixels is required, such as in serializing + * or de-serializing the image patch information. + * \see \ref vxMapImagePatch + * \ingroup group_image + */ +typedef struct _vx_imagepatch_addressing_t { + vx_uint32 dim_x; /*!< \brief Width of patch in X dimension in pixels. */ + vx_uint32 dim_y; /*!< \brief Height of patch in Y dimension in pixels. */ + vx_int32 stride_x; /*!< \brief Stride in X dimension in bytes. */ + vx_int32 stride_y; /*!< \brief Stride in Y dimension in bytes. */ + vx_uint32 scale_x; /*!< \brief Scale of X dimension. 
For sub-sampled planes this is the scaling factor of the dimension of the plane in relation to the zero plane. Use \ref VX_SCALE_UNITY in the numerator. */ + vx_uint32 scale_y; /*!< \brief Scale of Y dimension. For sub-sampled planes this is the scaling factor of the dimension of the plane in relation to the zero plane. Use \ref VX_SCALE_UNITY in the numerator. */ + vx_uint32 step_x; /*!< \brief Step of X dimension in pixels. */ + vx_uint32 step_y; /*!< \brief Step of Y dimension in pixels. */ + vx_uint16 stride_x_bits; /*!< \brief Stride in X dimension in bits. Used when stride_x is not an integer number of bytes. */ +} vx_imagepatch_addressing_t; + +/*! \brief Use to initialize a \ref vx_imagepatch_addressing_t structure on the stack. + * \ingroup group_image + */ +#define VX_IMAGEPATCH_ADDR_INIT {0u, 0u, 0, 0, 0u, 0u, 0u, 0u, 0u} + +/*! \brief The performance measurement structure. The time or durations are in units of nano seconds. + * \ingroup group_performance + */ +typedef struct _vx_perf_t { + vx_uint64 tmp; /*!< \brief Holds the last measurement. */ + vx_uint64 beg; /*!< \brief Holds the first measurement in a set. */ + vx_uint64 end; /*!< \brief Holds the last measurement in a set. */ + vx_uint64 sum; /*!< \brief Holds the summation of durations. */ + vx_uint64 avg; /*!< \brief Holds the average of the durations. */ + vx_uint64 min; /*!< \brief Holds the minimum of the durations. */ + vx_uint64 num; /*!< \brief Holds the number of measurements. */ + vx_uint64 max; /*!< \brief Holds the maximum of the durations. */ +} vx_perf_t; + +/*! \brief Hough lines probability parameters. + * \ingroup group_vision_function_hough_lines_p + */ +typedef struct _vx_hough_lines_p_t +{ + /*! \brief Distance resolution of the parameter in pixels. */ + vx_float32 rho; + /*! \brief Angle resolution of the parameter in radians. */ + vx_float32 theta; + /*! \brief The minimum number of intersections to detect a line. */ + vx_int32 threshold; + /*! \brief The minimum number of points that can form a line. Line segments shorter than that are rejected. */ + vx_int32 line_length; + /*! \brief The maximum allowed gap between points on the same line to link them. */ + vx_int32 line_gap; + /*! \brief Optional restriction on theta. The max allowed value. */ + vx_float32 theta_max; + /*! \brief Optional restriction on theta. The min allowed value. */ + vx_float32 theta_min; +} vx_hough_lines_p_t; + +/*! \brief line struct + * \ingroup group_basic_features + */ +typedef struct _vx_line2d_t +{ + /*! \brief x index of line start */ + vx_float32 start_x; + /*! \brief y index of line start */ + vx_float32 start_y; + /*! \brief x index of line end*/ + vx_float32 end_x; + /*! \brief y index of line end*/ + vx_float32 end_y; +} vx_line2d_t; + +/*! \brief Matrix Multiply Parameters + * + * transpose_input1/input2/input3 : if True the matrix is transposed before the operation, otherwise the matrix is used as is. \n + * \ingroup group_vision_function_tensor_matrix_multiply + */ +typedef struct _vx_tensor_matrix_multiply_params_t{ + /*! \brief if True the matrix is transposed before the operation, otherwise the matrix is used as is*/ + vx_bool transpose_input1; + /*! \brief if True the matrix is transposed before the operation, otherwise the matrix is used as is*/ + vx_bool transpose_input2; + /*! \brief if True the matrix is transposed before the operation, otherwise the matrix is used as is*/ + vx_bool transpose_input3; +} vx_tensor_matrix_multiply_params_t; + +/*! \brief Initializes a \ref vx_perf_t on the stack. 
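+ * \par Example
+ * A minimal usage sketch (illustrative only; it relies on C's zero-filling of the
+ * remaining members):
+ * \code
+ * vx_perf_t perf = VX_PERF_INIT;
+ * \endcode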
+ * \ingroup group performance + */ +#define VX_PERF_INIT {0ul, 0ul, 0ul, 0ul, 0ul, 0ul} + +/*! \brief The Kernel Information Structure. This is returned by the Context + * to indicate which kernels are available in the OpenVX implementation. + * \ingroup group_kernel + */ +typedef struct _vx_kernel_info_t { + /*! \brief The kernel enumeration value from \ref vx_kernel_e (or an + * extension thereof). + * \see vxGetKernelByEnum + */ + vx_enum enumeration; + + /*! \brief The kernel name in dotted hierarchical format. + * e.g. "org.khronos.openvx.sobel_3x3" + * \see vxGetKernelByName + */ + vx_char name[VX_MAX_KERNEL_NAME]; +} vx_kernel_info_t; + +/*! \brief Use to indicate a half-scale pyramid. + * \ingroup group_pyramid + */ +#define VX_SCALE_PYRAMID_HALF (0.5f) + +/*! \brief Use to indicate a ORB scaled pyramid whose scaling factor is \f$ \frac{1}{\root 4 \of {2}} \f$. + * \ingroup group_pyramid + */ +#define VX_SCALE_PYRAMID_ORB ((vx_float32)0.8408964f) + +/*! \brief The keypoint data structure. + * \ingroup group_basic_features + */ +typedef struct _vx_keypoint_t { + vx_int32 x; /*!< \brief The x coordinate. */ + vx_int32 y; /*!< \brief The y coordinate. */ + vx_float32 strength; /*!< \brief The strength of the keypoint. Its definition is specific to the corner detector. */ + vx_float32 scale; /*!< \brief Initialized to 0 by corner detectors. */ + vx_float32 orientation; /*!< \brief Initialized to 0 by corner detectors. */ + vx_int32 tracking_status; /*!< \brief A zero indicates a lost point. Initialized to 1 by corner detectors. */ + vx_float32 error; /*!< \brief A tracking method specific error. Initialized to 0 by corner detectors. */ +} vx_keypoint_t; + +/*! \brief The rectangle data structure that is shared with the users. The area of the rectangle can be computed as (end_x-start_x)*(end_y-start_y). + * \ingroup group_basic_features + */ +typedef struct _vx_rectangle_t { + vx_uint32 start_x; /*!< \brief The Start X coordinate. */ + vx_uint32 start_y; /*!< \brief The Start Y coordinate. */ + vx_uint32 end_x; /*!< \brief The End X coordinate. */ + vx_uint32 end_y; /*!< \brief The End Y coordinate. */ +} vx_rectangle_t; + +/*! \brief The 2D Coordinates structure. + * \ingroup group_basic_features + */ +typedef struct _vx_coordinates2d_t { + vx_uint32 x; /*!< \brief The X coordinate. */ + vx_uint32 y; /*!< \brief The Y coordinate. */ +} vx_coordinates2d_t; + +/*! \brief The floating-point 2D Coordinates structure. + * \ingroup group_basic_features + */ +typedef struct _vx_coordinates2df_t { + vx_float32 x; /*!< \brief The X coordinate. */ + vx_float32 y; /*!< \brief The Y coordinate. */ +} vx_coordinates2df_t; + +/*! \brief The 3D Coordinates structure. + * \ingroup group_basic_features + */ +typedef struct _vx_coordinates3d_t { + vx_uint32 x; /*!< \brief The X coordinate. */ + vx_uint32 y; /*!< \brief The Y coordinate. */ + vx_uint32 z; /*!< \brief The Z coordinate. */ +} vx_coordinates3d_t; + +/*! \brief Union that describes the value of a pixel for any image format. Use the field +* corresponding to the image format. 
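+* \par Example
+* An illustrative sketch: pick the member that matches the image format, for
+* instance when preparing a constant border value for a \ref VX_DF_IMAGE_U8 image.
+* \code
+* vx_pixel_value_t pixel;
+* pixel.U8 = 255;
+* \endcode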
+* \ingroup group_image +*/ +typedef union _vx_pixel_value_t { + vx_uint8 RGB[3]; /*!< \brief \ref VX_DF_IMAGE_RGB format in the R,G,B order */ + vx_uint8 RGBX[4]; /*!< \brief \ref VX_DF_IMAGE_RGBX format in the R,G,B,X order */ + vx_uint8 YUV[3]; /*!< \brief All YUV formats in the Y,U,V order */ + vx_bool U1; /*!< \brief \ref VX_DF_IMAGE_U1 */ + vx_uint8 U8; /*!< \brief \ref VX_DF_IMAGE_U8 */ + vx_uint16 U16; /*!< \brief \ref VX_DF_IMAGE_U16 */ + vx_int16 S16; /*!< \brief \ref VX_DF_IMAGE_S16 */ + vx_uint32 U32; /*!< \brief \ref VX_DF_IMAGE_U32 */ + vx_int32 S32; /*!< \brief \ref VX_DF_IMAGE_S32 */ + vx_uint8 reserved[16]; +} vx_pixel_value_t; + +/*! \brief The HOG descriptor structure. + * \ingroup group_vision_function_hog + */ +typedef struct { + /*! \brief The histogram cell width of type \ref VX_TYPE_INT32.*/ + vx_int32 cell_width; + /*! \brief The histogram cell height of type \ref VX_TYPE_INT32.*/ + vx_int32 cell_height; + /*! \brief The histogram block width of type \ref VX_TYPE_INT32. Must be divisible by cell_width. */ + vx_int32 block_width; + /*! \brief The histogram block height of type \ref VX_TYPE_INT32. Must be divisible by cell_height. */ + vx_int32 block_height; + /*! \brief The histogram block stride within the window of type \ref VX_TYPE_INT32. Must be an integral number of cell_width and cell_height.*/ + vx_int32 block_stride; + /*! \brief The histogram size of type \ref VX_TYPE_INT32.*/ + vx_int32 num_bins; + /*! \brief The feature descriptor window width of type \ref VX_TYPE_INT32*/ + vx_int32 window_width; + /*! \brief The feature descriptor window height of type \ref VX_TYPE_INT32*/ + vx_int32 window_height; + /*! \brief The feature descriptor window stride of type \ref VX_TYPE_INT32*/ + vx_int32 window_stride; + /*! \brief The threshold for the maximum L2-norm value for a histogram bin. It is used as part of block normalization. It defaults to 0.2. */ + vx_float32 threshold; +} vx_hog_t; + +/*! \brief Use with the enumeration \ref VX_NODE_BORDER to set the +* border mode behavior of a node that supports borders. +* +* If the indicated border mode is not supported, an error \ref VX_ERROR_NOT_SUPPORTED will be reported +* either at the time the \ref VX_NODE_BORDER is set or at the time of graph verification. +* \ingroup group_borders +*/ +typedef struct _vx_border_t { + /*! \brief See \ref vx_border_e. */ + vx_enum mode; + /*! \brief For the mode \ref VX_BORDER_CONSTANT, this union contains the + * value of out-of-bound pixels. + */ + vx_pixel_value_t constant_value; +} vx_border_t; + +/*! +* \brief The type of the vxPublishKernels entry function of modules loaded +* by \ref vxLoadKernels and unloaded by \ref vxUnloadKernels. +* \param [in] context The reference to the context kernels must be added to. +* \ingroup group_user_kernels +*/ +typedef vx_status(VX_API_CALL *vx_publish_kernels_f)(vx_context context); + +/*! +* \brief The type of the vxUnpublishKernels entry function of modules loaded +* by \ref vxLoadKernels and unloaded by \ref vxUnloadKernels. +* \param [in] context The reference to the context kernels have been added to. +* \ingroup group_user_kernels +*/ +typedef vx_status(VX_API_CALL *vx_unpublish_kernels_f)(vx_context context); + +/*! +* \brief The pointer to the Host side kernel. +* \param [in] node The handle to the node that contains this kernel. +* \param [in] parameters The array of parameter references. +* \param [in] num The number of parameters. 
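+* \par Example
+* A hypothetical host-side kernel function matching this signature (sketch only;
+* the name and the trivial body are illustrative, not part of the API):
+* \code
+* static vx_status VX_CALLBACK my_kernel(vx_node node, const vx_reference *parameters, vx_uint32 num)
+* {
+*     (void)node; (void)parameters; (void)num; /* a real kernel would process its parameters here */
+*     return VX_SUCCESS;
+* }
+* \endcode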
+* \ingroup group_user_kernels +*/ +typedef vx_status(VX_CALLBACK *vx_kernel_f)(vx_node node, const vx_reference *parameters, vx_uint32 num); + +/*! +* \brief The pointer to the kernel initializer. If the host code requires a call +* to initialize data once all the parameters have been validated, this function is called +* if not NULL. +* \param [in] node The handle to the node that contains this kernel. +* \param [in] parameters The array of parameter references. +* \param [in] num The number of parameters. +* \ingroup group_user_kernels +*/ +typedef vx_status(VX_CALLBACK *vx_kernel_initialize_f)(vx_node node, const vx_reference *parameters, vx_uint32 num); + +/*! +* \brief The pointer to the kernel deinitializer. If the host code requires a call +* to deinitialize data during a node garbage collection, this function is called +* if not NULL. +* \param [in] node The handle to the node that contains this kernel. +* \param [in] parameters The array of parameter references. +* \param [in] num The number of parameters. +* \ingroup group_user_kernels +*/ +typedef vx_status(VX_CALLBACK *vx_kernel_deinitialize_f)(vx_node node, const vx_reference *parameters, vx_uint32 num); + +/*! +* \brief The user-defined kernel node parameters validation function. The function only +* needs to fill in the meta data structure(s). +* \note This function is called once for whole set of parameters. +* \param [in] node The handle to the node that is being validated. +* \param [in] parameters The array of parameters to be validated. +* \param [in] num Number of parameters to be validated. +* \param [in] metas A pointer to a pre-allocated array of structure references that the system holds. +* The system pre-allocates a number of vx_meta_format structures for the output parameters only, +* indexed by the same indices as parameters[]. The validation function fills in the correct type, format, +* and dimensionality for the system to use either to create memory or to check against existing memory. +* \return An error code describing the validation status on parameters. +* \ingroup group_user_kernels +*/ +typedef vx_status(VX_CALLBACK *vx_kernel_validate_f)(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[]); + +/*! +* \brief A user-defined callback function to set the valid rectangle of an output image. +* +* The \ref VX_VALID_RECT_CALLBACK attribute in the \ref vx_meta_format object should be +* set to the desired callback during user node's output validator. The callback must not call +* \ref vxGetValidRegionImage or \ref vxSetImageValidRectangle. Instead, an array of the +* valid rectangles of all the input images is supplied to the callback to calculate the output +* valid rectangle. The output of the user node may be a pyramid, or just an image. If it is just an +* image, the 'Out' array associated with that output only has one element. If the output is a +* pyramid, the array size is equal to the number of pyramid levels. Notice that the array memory +* allocation passed to the callback is managed by the framework, the application must not allocate or +* deallocate those pointers. +* +* The behavior of the callback function vx_kernel_image_valid_rectangle_f is undefined +* if one of the following is true: +* - One of the input arguments of a user node is a pyramid or an array of images. +* - Either input or output argument of a user node is an array of pyramids. +* +* \param [in,out] node The handle to the node that is being validated. 
+* \param [in] index The index of the output parameter for which a valid region should be set. +* \param [in] input_valid A pointer to an array of valid regions of input images or images +* contained in image container (e.g. pyramids). They are provided in same order as the parameter +* list of the kernel's declaration. +* \param [out] output_valid An array of valid regions that should be set for the output images or +* image containers (e.g. pyramid) after graph processing. The length of the array should be equal +* to the size of the image container (e.g. number of levels in the pyramid). For a simple output +* image the array size is always one. Each rectangle supplies the valid region for one image. The +* array memory allocation is managed by the framework. +* \return An error code describing the validation status on parameters. +* \ingroup group_user_kernels +*/ +typedef vx_status(VX_CALLBACK *vx_kernel_image_valid_rectangle_f)(vx_node node, vx_uint32 index, const vx_rectangle_t* const input_valid[], vx_rectangle_t* const output_valid[]); + +/*! \brief The log callback function. + * \ingroup group_log + */ +typedef void (VX_CALLBACK *vx_log_callback_f)(vx_context context, + vx_reference ref, + vx_status status, + const vx_char string[]); + +/*! \brief The Map/Unmap operation enumeration. + * \ingroup group_image + */ +enum vx_map_flag_e { + VX_NOGAP_X = 1, /*!< \brief No Gap. */ +}; + +#endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_vendors.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_vendors.h new file mode 100644 index 0000000..9d49f95 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_vendors.h @@ -0,0 +1,67 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _OPENVX_VENDORS_H_ +#define _OPENVX_VENDORS_H_ + +/*! + * \file + * \brief The Vendor ID list for OpenVX. + */ + +/*! \brief The Vendor ID of the Implementation. As new vendors submit their + * implementations, this enumeration will grow. + * \ingroup group_basic_features + */ +enum vx_vendor_id_e { + VX_ID_KHRONOS = 0x000, /*!< \brief The Khronos Group */ + VX_ID_TI = 0x001, /*!< \brief Texas Instruments, Inc. */ + VX_ID_QUALCOMM = 0x002, /*!< \brief Qualcomm, Inc. */ + VX_ID_NVIDIA = 0x003, /*!< \brief NVIDIA Corporation */ + VX_ID_ARM = 0x004, /*!< \brief ARM Ltd. */ + VX_ID_BDTI = 0x005, /*!< \brief Berkley Design Technology, Inc. */ + VX_ID_RENESAS = 0x006, /*!< \brief Renasas Electronics */ + VX_ID_VIVANTE = 0x007, /*!< \brief Vivante Corporation */ + VX_ID_XILINX = 0x008, /*!< \brief Xilinx Inc. */ + VX_ID_AXIS = 0x009, /*!< \brief Axis Communications */ + VX_ID_MOVIDIUS = 0x00A, /*!< \brief Movidius Ltd. 
*/ + VX_ID_SAMSUNG = 0x00B, /*!< \brief Samsung Electronics */ + VX_ID_FREESCALE = 0x00C, /*!< \brief Freescale Semiconductor */ + VX_ID_AMD = 0x00D, /*!< \brief Advanced Micro Devices */ + VX_ID_BROADCOM = 0x00E, /*!< \brief Broadcom Corporation */ + VX_ID_INTEL = 0x00F, /*!< \brief Intel Corporation */ + VX_ID_MARVELL = 0x010, /*!< \brief Marvell Technology Group Ltd. */ + VX_ID_MEDIATEK = 0x011, /*!< \brief MediaTek, Inc. */ + VX_ID_ST = 0x012, /*!< \brief STMicroelectronics */ + VX_ID_CEVA = 0x013, /*!< \brief CEVA DSP */ + VX_ID_ITSEEZ = 0x014, /*!< \brief Itseez, Inc. */ + VX_ID_IMAGINATION=0x015, /*!< \brief Imagination Technologies */ + VX_ID_NXP = 0x016, /*!< \brief NXP Semiconductors */ + VX_ID_VIDEANTIS = 0x017, /*!< \brief Videantis */ + VX_ID_SYNOPSYS = 0x018, /*!< \brief Synopsys */ + VX_ID_CADENCE = 0x019, /*!< \brief Cadence */ + VX_ID_HUAWEI = 0x01A, /*!< \brief Huawei */ + VX_ID_SOCIONEXT = 0x01B, /*!< \brief Socionext */ + /* Add new vendor code above this line */ + VX_ID_USER = 0xFFE, /*!< \brief For use by vxAllocateUserKernelId and vxAllocateUserKernelLibraryId */ + VX_ID_MAX = 0xFFF, + /*! \brief For use by all Kernel authors until they can obtain an assigned ID. */ + VX_ID_DEFAULT = VX_ID_MAX, +}; + +#endif + diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_viv_sys.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_viv_sys.h new file mode 100644 index 0000000..f97512f --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_viv_sys.h @@ -0,0 +1,62 @@ +/**************************************************************************** +* +* Copyright 2017 - 2020 Vivante Corporation, Santa Clara, California. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* 'Software'), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sub license, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject +* to the following conditions: +* +* The above copyright notice and this permission notice (including the +* next paragraph) shall be included in all copies or substantial +* portions of the Software. +* +* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +* IN NO EVENT SHALL VIVANTE AND/OR ITS SUPPLIERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VX_VIV_SYS_H_ +#define _VX_VIV_SYS_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief set clock fscale value to change core and shader frequency. + * \param [in] coreIndex Global core index to set the specific core clock frequency. + * If the value is 0xFFFFFFFF, all the cores will be set. + * \param [in] vipFscaleValue Set core frequency scale size. Value can be 64, 32, 16, 8, 4, 2, 1. + * 64 means 64/64 full frequency, 1 means 1/64 frequency. + * \param [in] shaderFscaleValue Set shader frequency scale size. Value can be 64, 32, 16, 8, 4, 2, 1. + * 64 means 64/64 full frequency, 1 means 1/64 frequency. 
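+ * \par Example
+ * An illustrative call (the scale values below are example choices, not
+ * recommendations): run all cores at full core frequency and half shader frequency.
+ * \code
+ * vx_status status = vxSysSetVipFrequency(0xFFFFFFFF, 64, 32);
+ * \endcode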
+ * + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; + * \retval VX_ERROR_INVAID_PARAMETERS Invalid frequency scale values. + * \retval VX_FAILURE Failed to change core and shader frequency. + */ +VX_API_ENTRY vx_status VX_API_CALL vxSysSetVipFrequency( + vx_uint32 coreIndex, + vx_uint32 vipFscaleValue, + vx_uint32 shaderFscaleValue + ); + +#ifdef __cplusplus +} +#endif + + +#endif + diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vxu.h b/prebuilt-sdk/x86_64_linux/include/VX/vxu.h new file mode 100644 index 0000000..3daf6df --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vxu.h @@ -0,0 +1,924 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _OPENVX_UTILITY_H_ +#define _OPENVX_UTILITY_H_ + +/*! + * \file + * \brief The OpenVX Utility Library. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief [Immediate] Invokes an immediate Color Conversion. + * \param [in] context The reference to the overall context. + * \param [in] input The input image. + * \param [out] output The output image. + * \ingroup group_vision_function_colorconvert + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuColorConvert(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Invokes an immediate Channel Extract. + * \param [in] context The reference to the overall context. + * \param [in] input The input image. Must be one of the defined \ref vx_df_image_e multi-channel formats. + * \param [in] channel The \ref vx_channel_e enumeration to extract. + * \param [out] output The output image. Must be \ref VX_DF_IMAGE_U8. + * \ingroup group_vision_function_channelextract + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuChannelExtract(vx_context context, vx_image input, vx_enum channel, vx_image output); + +/*! \brief [Immediate] Invokes an immediate Channel Combine. + * \param [in] context The reference to the overall context. + * \param [in] plane0 The plane that forms channel 0. Must be \ref VX_DF_IMAGE_U8. + * \param [in] plane1 The plane that forms channel 1. Must be \ref VX_DF_IMAGE_U8. + * \param [in] plane2 [optional] The plane that forms channel 2. Must be \ref VX_DF_IMAGE_U8. + * \param [in] plane3 [optional] The plane that forms channel 3. Must be \ref VX_DF_IMAGE_U8. + * \param [out] output The output image. + * \ingroup group_vision_function_channelcombine + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuChannelCombine(vx_context context, vx_image plane0, vx_image plane1, vx_image plane2, vx_image plane3, vx_image output); + +/*! \brief [Immediate] Invokes an immediate Sobel 3x3. 
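+ * \par Example
+ * An illustrative immediate-mode sketch (assumes \c context, \c input, \c grad_x,
+ * \c grad_y and \c mag were created elsewhere with the formats described below and
+ * in \ref vxuMagnitude):
+ * \code
+ * vx_status status = vxuSobel3x3(context, input, grad_x, grad_y);
+ * if (status == VX_SUCCESS)
+ *     status = vxuMagnitude(context, grad_x, grad_y, mag);
+ * \endcode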
+ * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output_x [optional] The output gradient in the x direction in \ref VX_DF_IMAGE_S16. + * \param [out] output_y [optional] The output gradient in the y direction in \ref VX_DF_IMAGE_S16. + * \ingroup group_vision_function_sobel3x3 + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuSobel3x3(vx_context context, vx_image input, vx_image output_x, vx_image output_y); + +/*! \brief [Immediate] Invokes an immediate Magnitude. + * \param [in] context The reference to the overall context. + * \param [in] grad_x The input x image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [in] grad_y The input y image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [out] mag The magnitude image. This will be in \ref VX_DF_IMAGE_S16 format. + * \ingroup group_vision_function_magnitude + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuMagnitude(vx_context context, vx_image grad_x, vx_image grad_y, vx_image mag); + +/*! \brief [Immediate] Invokes an immediate Phase. + * \param [in] context The reference to the overall context. + * \param [in] grad_x The input x image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [in] grad_y The input y image. This must be in \ref VX_DF_IMAGE_S16 format. + * \param [out] orientation The phase image. This will be in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_phase + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuPhase(vx_context context, vx_image grad_x, vx_image grad_y, vx_image orientation); + +/*! \brief [Immediate] Scales an input image to an output image. + * \param [in] context The reference to the overall context. + * \param [in] src The source image of type \ref VX_DF_IMAGE_U8. + * \param [out] dst The destintation image of type \ref VX_DF_IMAGE_U8. + * \param [in] type The interpolation type. \see vx_interpolation_type_e. + * \ingroup group_vision_function_scale_image + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuScaleImage(vx_context context, vx_image src, vx_image dst, vx_enum type); + +/*! \brief [Immediate] Processes the image through the LUT. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [in] lut The LUT which is of type \ref VX_TYPE_UINT8 if input image is \ref VX_DF_IMAGE_U8 or \ref VX_TYPE_INT16 if input image is \ref VX_DF_IMAGE_S16. + * \param [out] output The output image of the same type as the input image. + * \ingroup group_vision_function_lut + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuTableLookup(vx_context context, vx_image input, vx_lut lut, vx_image output); + +/*! \brief [Immediate] Generates a distribution from an image. + * \param [in] context The reference to the overall context. 
+ * \param [in] input The input image in \ref VX_DF_IMAGE_U8 + * \param [out] distribution The output distribution. + * \ingroup group_vision_function_histogram + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuHistogram(vx_context context, vx_image input, vx_distribution distribution); + +/*! \brief [Immediate] Equalizes the Histogram of a grayscale image. + * \param [in] context The reference to the overall context. + * \param [in] input The grayscale input image in \ref VX_DF_IMAGE_U8 + * \param [out] output The grayscale output image of type \ref VX_DF_IMAGE_U8 with equalized brightness and contrast. + * \ingroup group_vision_function_equalize_hist + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuEqualizeHist(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Computes the absolute difference between two images. + * \param [in] context The reference to the overall context. + * \param [in] in1 An input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [in] in2 An input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [out] out The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \ingroup group_vision_function_absdiff + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuAbsDiff(vx_context context, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Immediate] Computes the mean value and optionally the standard deviation. + * \param [in] context The reference to the overall context. + * \param [in] input The input image. \ref VX_DF_IMAGE_U8 is supported. + * \param [out] mean The average pixel value. + * \param [out] stddev [optional] The standard deviation of the pixel values. + * \ingroup group_vision_function_meanstddev + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuMeanStdDev(vx_context context, vx_image input, vx_float32 *mean, vx_float32 *stddev); + +/*! \brief [Immediate] Threshold's an input image and produces a \ref VX_DF_IMAGE_U8 boolean image. + * \param [in] context The reference to the overall context. + * \param [in] input The input image. Only images with format \ref VX_DF_IMAGE_U8 + * and \ref VX_DF_IMAGE_S16 are supported. + * \param [in] thresh The thresholding object that defines the parameters of + * the operation. The \ref VX_THRESHOLD_INPUT_FORMAT must be the same as the input image format and + * the \ref VX_THRESHOLD_OUTPUT_FORMAT must be the same as the output image format. + * \param [out] output The output image, that will contain as pixel value + * true and false values defined by \p thresh. Only images with format + * \ref VX_DF_IMAGE_U8 are supported. + * \ingroup group_vision_function_threshold + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuThreshold(vx_context context, vx_image input, vx_threshold thresh, vx_image output); + +/*! \brief [Immediate] Performs Non-Maxima Suppression on an image, producing an image of the same type. 
+ * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [in] mask [optional] Constrict suppression to a ROI. The mask image is of type \ref VX_DF_IMAGE_U8 and must be the same dimensions as the input image. + * \param [in] win_size The size of window over which to perform the localized non-maxima suppression. Must be odd, and less than or equal to the smallest dimension of the input image. + * \param [out] output The output image, of the same type as the input, that has been non-maxima suppressed. + * \ingroup group_vision_function_nms + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuNonMaxSuppression(vx_context context, vx_image input, vx_image mask, vx_int32 win_size, vx_image output); + +/*! \brief [Immediate] Computes the integral image of the input. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U32 format. + * \ingroup group_vision_function_integral_image + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuIntegralImage(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Erodes an image by a 3x3 window. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_erode_image + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuErode3x3(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Dilates an image by a 3x3 window. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_dilate_image + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuDilate3x3(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Computes a median filter on the image by a 3x3 window. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_median_image + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuMedian3x3(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Computes a box filter on the image by a 3x3 window. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_box_image + * \return A \ref vx_status_e enumeration. 
+ * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuBox3x3(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Computes a gaussian filter on the image by a 3x3 window. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \ingroup group_vision_function_gaussian_image + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuGaussian3x3(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Performs Non-linear Filtering. + * \param [in] context The reference to the overall context. + * \param [in] function The non-linear filter function. See \ref vx_non_linear_filter_e. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [in] mask The mask to be applied to the Non-linear function. \ref VX_MATRIX_ORIGIN attribute is used + * to place the mask appropriately when computing the resulting image. See \ref vxCreateMatrixFromPattern and \ref vxCreateMatrixFromPatternAndOrigin. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + * \ingroup group_vision_function_nonlinear_filter + */ +VX_API_ENTRY vx_status VX_API_CALL vxuNonLinearFilter(vx_context context, vx_enum function, vx_image input, vx_matrix mask, vx_image output); + + +/*! \brief [Immediate] Computes a convolution on the input image with the supplied + * matrix. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 format. + * \param [in] conv The \ref vx_int16 convolution matrix. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \ingroup group_vision_function_custom_convolution + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuConvolve(vx_context context, vx_image input, vx_convolution conv, vx_image output); + +/*! \brief [Immediate] Computes a Gaussian pyramid from an input image. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 + * \param [out] gaussian The Gaussian pyramid with \ref VX_DF_IMAGE_U8 to construct. + * \ingroup group_vision_function_gaussian_pyramid + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuGaussianPyramid(vx_context context, vx_image input, vx_pyramid gaussian); + +/*! \brief [Immediate] Computes a Laplacian pyramid from an input image. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [out] laplacian The Laplacian pyramid with \ref VX_DF_IMAGE_S16 to construct. + * \param [out] output The lowest resolution image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format necessary to reconstruct the input image from the pyramid. The output image format should be same as input image format. 
+ * \ingroup group_vision_function_laplacian_pyramid + * \see group_pyramid + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success. + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuLaplacianPyramid(vx_context context, vx_image input, vx_pyramid laplacian, vx_image output); + +/*! \brief [Immediate] Reconstructs an image from a Laplacian Image pyramid. + * \param [in] context The reference to the overall context. + * \param [in] laplacian The Laplacian pyramid with \ref VX_DF_IMAGE_S16 format. + * \param [in] input The lowest resolution image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format for the Laplacian pyramid. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format with the highest possible resolution reconstructed from the Laplacian pyramid. The output image format should be same as input image format. + * \ingroup group_vision_function_laplacian_reconstruct + * \see group_pyramid + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success. + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuLaplacianReconstruct(vx_context context, vx_pyramid laplacian, vx_image input, + vx_image output); + +/*! \brief [Immediate] Computes a weighted average image. + * \param [in] context The reference to the overall context. + * \param [in] img1 The first \ref VX_DF_IMAGE_U8 image. + * \param [in] alpha A \ref VX_TYPE_FLOAT32 type, the input value with the range \f$ 0.0 \le \alpha \le 1.0 \f$. + * \param [in] img2 The second \ref VX_DF_IMAGE_U8 image. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image. + * \ingroup group_vision_function_weighted_average + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuWeightedAverage(vx_context context, vx_image img1, vx_scalar alpha, vx_image img2, vx_image output); + +/*! \brief [Immediate] Computes the minimum and maximum values of the image. + * \param [in] context The reference to the overall context. + * \param [in] input The input image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \param [out] minVal The minimum value in the image, which corresponds to the type of the input. + * \param [out] maxVal The maximum value in the image, which corresponds to the type of the input. + * \param [out] minLoc [optional] The minimum \ref VX_TYPE_COORDINATES2D locations. If the input image has several minimums, the kernel will return up to the capacity of the array. + * \param [out] maxLoc [optional] The maximum \ref VX_TYPE_COORDINATES2D locations. If the input image has several maximums, the kernel will return up to the capacity of the array. + * \param [out] minCount [optional] The total number of detected minimums in image. Use a \ref VX_TYPE_SIZE scalar. + * \param [out] maxCount [optional] The total number of detected maximums in image. Use a \ref VX_TYPE_SIZE scalar. + * \ingroup group_vision_function_minmaxloc + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuMinMaxLoc(vx_context context, vx_image input, + vx_scalar minVal, vx_scalar maxVal, + vx_array minLoc, vx_array maxLoc, + vx_scalar minCount, vx_scalar maxCount); + +/*! \brief [Immediate] Computes pixel-wise minimum values between two images. 
+ * \param [in] context The reference to the overall context. + * \param [in] in1 The first input image. Must be of type \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [in] in2 The second input image. Must be of type \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [out] out The output image which will hold the result of min. + * \ingroup group_vision_function_min + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuMin(vx_context context, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Immediate] Computes pixel-wise maximum values between two images. + * \param [in] context The reference to the overall context. + * \param [in] in1 The first input image. Must be of type \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [in] in2 The second input image. Must be of type \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. + * \param [out] out The output image which will hold the result of max. + * \ingroup group_vision_function_max + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuMax(vx_context context, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Immediate] Converts the input images bit-depth into the output image. + * \param [in] context The reference to the overall context. + * \param [in] input The input image. + * \param [out] output The output image. + * \param [in] policy A \ref VX_TYPE_ENUM of the \ref vx_convert_policy_e enumeration. + * \param [in] shift A scalar containing a \ref VX_TYPE_INT32 of the shift value. + * \ingroup group_vision_function_convertdepth + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e.. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuConvertDepth(vx_context context, vx_image input, vx_image output, vx_enum policy, vx_int32 shift); + +/*! \brief [Immediate] Computes Canny Edges on the input image into the output image. + * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] hyst The double threshold for hysteresis. The \ref VX_THRESHOLD_INPUT_FORMAT shall be either + * \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16. The \ref VX_THRESHOLD_OUTPUT_FORMAT is ignored. + * \param [in] gradient_size The size of the Sobel filter window, must support at least 3, 5 and 7. + * \param [in] norm_type A flag indicating the norm used to compute the gradient, \ref VX_NORM_L1 or \ref VX_NORM_L2. + * \param [out] output The output image in \ref VX_DF_IMAGE_U8 format with values either 0 or 255. + * \ingroup group_vision_function_canny + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuCannyEdgeDetector(vx_context context, vx_image input, vx_threshold hyst, + vx_int32 gradient_size, vx_enum norm_type, + vx_image output); + +/*! \brief [Immediate] Performs a Gaussian Blur on an image then half-scales it. The interpolation mode used is nearest-neighbor. + * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image. + * \param [in] kernel_size The input size of the Gaussian filter. Supported values are 1, 3 and 5. 
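+ * \par Example
+ * An illustrative call (assumes \c context, \c input and \c output are valid and
+ * \c output has half the input dimensions):
+ * \code
+ * vx_status status = vxuHalfScaleGaussian(context, input, output, 5);
+ * \endcode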
+ * \ingroup group_vision_function_scale_image + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuHalfScaleGaussian(vx_context context, vx_image input, vx_image output, vx_int32 kernel_size); + +/*! \brief [Immediate] Computes the bitwise and between two images. + * \param [in] context The reference to the overall context. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 input image + * \param [in] in2 A \ref VX_DF_IMAGE_U8 input image + * \param [out] out The \ref VX_DF_IMAGE_U8 output image. + * \ingroup group_vision_function_and + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuAnd(vx_context context, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Immediate] Computes the bitwise inclusive-or between two images. + * \param [in] context The reference to the overall context. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 input image + * \param [in] in2 A \ref VX_DF_IMAGE_U8 input image + * \param [out] out The \ref VX_DF_IMAGE_U8 output image. + * \ingroup group_vision_function_or + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuOr(vx_context context, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Immediate] Computes the bitwise exclusive-or between two images. + * \param [in] context The reference to the overall context. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 input image + * \param [in] in2 A \ref VX_DF_IMAGE_U8 input image + * \param [out] out The \ref VX_DF_IMAGE_U8 output image. + * \ingroup group_vision_function_xor + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuXor(vx_context context, vx_image in1, vx_image in2, vx_image out); + +/*! \brief [Immediate] Computes the bitwise not of an image. + * \param [in] context The reference to the overall context. + * \param [in] input The \ref VX_DF_IMAGE_U8 input image + * \param [out] output The \ref VX_DF_IMAGE_U8 output image. + * \ingroup group_vision_function_not + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuNot(vx_context context, vx_image input, vx_image output); + +/*! \brief [Immediate] Performs elementwise multiplications on pixel values in the input images and a scale. + * \param [in] context The reference to the overall context. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 input image. + * \param [in] in2 A \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 input image. + * \param [in] scale A non-negative \ref VX_TYPE_FLOAT32 multiplied to each product before overflow handling. + * \param [in] overflow_policy A \ref vx_convert_policy_e enumeration. + * \param [in] rounding_policy A \ref vx_round_policy_e enumeration. + * \param [out] out The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \ingroup group_vision_function_mult + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. 
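+ * \par Example
+ * An illustrative call with a unit scale; the policy values shown are one possible choice.
+ * \code
+ * vx_status status = vxuMultiply(context, in1, in2, 1.0f,
+ *                                VX_CONVERT_POLICY_SATURATE, VX_ROUND_POLICY_TO_ZERO, out);
+ * \endcode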
+ */ +VX_API_ENTRY vx_status VX_API_CALL vxuMultiply(vx_context context, vx_image in1, vx_image in2, vx_float32 scale, vx_enum overflow_policy, vx_enum rounding_policy, vx_image out); + +/*! \brief [Immediate] Performs arithmetic addition on pixel values in the input images. + * \param [in] context The reference to the overall context. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 input image. + * \param [in] in2 A \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 input image. + * \param [in] policy A \ref vx_convert_policy_e enumeration. + * \param [out] out The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \ingroup group_vision_function_add + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuAdd(vx_context context, vx_image in1, vx_image in2, vx_enum policy, vx_image out); + +/*! \brief [Immediate] Performs arithmetic subtraction on pixel values in the input images. + * \param [in] context The reference to the overall context. + * \param [in] in1 A \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 input image, the minuend. + * \param [in] in2 A \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 input image, the subtrahend. + * \param [in] policy A \ref vx_convert_policy_e enumeration. + * \param [out] out The output image in \ref VX_DF_IMAGE_U8 or \ref VX_DF_IMAGE_S16 format. + * \ingroup group_vision_function_sub + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuSubtract(vx_context context, vx_image in1, vx_image in2, vx_enum policy, vx_image out); + +/*! \brief [Immediate] Performs an Affine warp on an image. + * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] matrix The affine matrix. Must be 2x3 of type \ref VX_TYPE_FLOAT32. + * \param [in] type The interpolation type from \ref vx_interpolation_type_e. + * \ref VX_INTERPOLATION_AREA is not supported. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image. + * \ingroup group_vision_function_warp_affine + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuWarpAffine(vx_context context, vx_image input, vx_matrix matrix, vx_enum type, vx_image output); + +/*! \brief [Immediate] Performs an Perspective warp on an image. + * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] matrix The perspective matrix. Must be 3x3 of type \ref VX_TYPE_FLOAT32. + * \param [in] type The interpolation type from \ref vx_interpolation_type_e. + * \ref VX_INTERPOLATION_AREA is not supported. + * \param [out] output The output \ref VX_DF_IMAGE_U8 image. + * \ingroup group_vision_function_warp_perspective + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuWarpPerspective(vx_context context, vx_image input, vx_matrix matrix, vx_enum type, vx_image output); + +/*! \brief [Immediate] Computes the Harris Corners over an image and produces the array of scored points. + * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. 
+ * \param [in] strength_thresh The \ref VX_TYPE_FLOAT32 minimum threshold which to eliminate Harris Corner scores (computed using the normalized Sobel kernel). + * \param [in] min_distance The \ref VX_TYPE_FLOAT32 radial Euclidean distance for non-maximum suppression. + * \param [in] sensitivity The \ref VX_TYPE_FLOAT32 scalar sensitivity threshold \f$ k \f$ from the Harris-Stephens equation. + * \param [in] gradient_size The gradient window size to use on the input. The + * implementation must support at least 3, 5, and 7. + * \param [in] block_size The block window size used to compute the harris corner score. + * The implementation must support at least 3, 5, and 7. + * \param [out] corners The array of \ref VX_TYPE_KEYPOINT structs. The order of the keypoints in this array is implementation dependent. + * \param [out] num_corners [optional] The total number of detected corners in image. Use a \ref VX_TYPE_SIZE scalar + * \ingroup group_vision_function_harris + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuHarrisCorners(vx_context context, + vx_image input, + vx_scalar strength_thresh, + vx_scalar min_distance, + vx_scalar sensitivity, + vx_int32 gradient_size, + vx_int32 block_size, + vx_array corners, + vx_scalar num_corners); + + +/*! \brief [Immediate] Computes corners on an image using FAST algorithm and produces the array of feature points. + * \param [in] context The reference to the overall context. + * \param [in] input The input \ref VX_DF_IMAGE_U8 image. + * \param [in] strength_thresh Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle + * of radius 3 (\ref VX_TYPE_FLOAT32 scalar), with a value in the range of 0.0 \f$\le\f$ strength_thresh < 256.0. + * Any fractional value will be truncated to an integer. + * \param [in] nonmax_suppression If true, non-maximum suppression is applied to + * detected corners before being places in the \ref vx_array of \ref VX_TYPE_KEYPOINT structs. + * \param [out] corners Output corner \ref vx_array of \ref VX_TYPE_KEYPOINT. The order of the keypoints in this array is implementation dependent. + * \param [out] num_corners [optional] The total number of detected corners in image. Use a \ref VX_TYPE_SIZE scalar. + * \ingroup group_vision_function_fast + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuFastCorners(vx_context context, vx_image input, vx_scalar strength_thresh, vx_bool nonmax_suppression, vx_array corners, vx_scalar num_corners); + +/*! \brief [Immediate] Computes an optical flow on two images. + * \param [in] context The reference to the overall context. + * \param [in] old_images Input of first (old) image pyramid in \ref VX_DF_IMAGE_U8. 
+ * \param [in] new_images Input of destination (new) image pyramid in \ref VX_DF_IMAGE_U8
+ * \param [in] old_points An array of key points in a vx_array of \ref VX_TYPE_KEYPOINT; those key points are defined at
+ * the old_images high resolution pyramid
+ * \param [in] new_points_estimates An array of estimates of the output key points in a \ref vx_array of
+ * \ref VX_TYPE_KEYPOINT; those keypoints are defined at the new_images high resolution pyramid
+ * \param [out] new_points An output array of key points in a \ref vx_array of \ref VX_TYPE_KEYPOINT; those key points are
+ * defined at the new_images high resolution pyramid
+ * \param [in] termination The termination criterion; can be \ref VX_TERM_CRITERIA_ITERATIONS or \ref VX_TERM_CRITERIA_EPSILON or
+ * \ref VX_TERM_CRITERIA_BOTH
+ * \param [in] epsilon The \ref vx_float32 error for terminating the algorithm
+ * \param [in] num_iterations The number of iterations. Use a \ref VX_TYPE_UINT32 scalar.
+ * \param [in] use_initial_estimate Can be set to either \ref vx_false_e or \ref vx_true_e.
+ * \param [in] window_dimension The size of the window on which to perform the algorithm. See
+ * \ref VX_CONTEXT_OPTICAL_FLOW_MAX_WINDOW_DIMENSION
+ *
+ * \ingroup group_vision_function_opticalflowpyrlk
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS Success
+ * \retval * An error occurred. See \ref vx_status_e.
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxuOpticalFlowPyrLK(vx_context context,
+                                                       vx_pyramid old_images,
+                                                       vx_pyramid new_images,
+                                                       vx_array old_points,
+                                                       vx_array new_points_estimates,
+                                                       vx_array new_points,
+                                                       vx_enum termination,
+                                                       vx_scalar epsilon,
+                                                       vx_scalar num_iterations,
+                                                       vx_scalar use_initial_estimate,
+                                                       vx_size window_dimension);
+
+/*! \brief [Immediate] The function compares an image template against overlapped image regions.
+ * \details The detailed equation of the matching can be found in \ref vx_comp_metric_e.
+ * The output of the template matching node is a comparison map as described in \ref vx_comp_metric_e.
+ * The node has a limitation on the template image size (width*height): it should not be larger than 65535.
+ * If the valid region of the template image is smaller than the entire template image, the result in the destination image is implementation-dependent.
+ * \param [in] context The reference to the overall context.
+ * \param [in] src The input image of type \ref VX_DF_IMAGE_U8.
+ * \param [in] templateImage Searched template of type \ref VX_DF_IMAGE_U8.
+ * \param [in] matchingMethod Attribute specifying the comparison method \ref vx_comp_metric_e. This function supports only \ref VX_COMPARE_CCORR_NORM and \ref VX_COMPARE_L2.
+ * \param [out] output Map of comparison results. The output is an image of type \ref VX_DF_IMAGE_S16.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS Success
+ * \retval * An error occurred. See \ref vx_status_e.
+ * \ingroup group_vision_function_match_template
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxuMatchTemplate(vx_context context, vx_image src, vx_image templateImage, vx_enum matchingMethod, vx_image output);
+
+/*! \brief [Immediate] The function extracts an LBP image from an input image.
+ * \param [in] context The reference to the overall context.
+ * \param [in] in An input image in vx_image, or \f$ SrcImg\f$ in the equations. The image is of type \ref VX_DF_IMAGE_U8
+ * \param [in] format A variation of LBP, such as original LBP and mLBP; see \ref vx_lbp_format_e
+ * \param [in] kernel_size Kernel size.
Only size of 3 and 5 are supported + * \param [out] out An output image in vx_image.Or \f$ DstImg\f$ in the equations. the image is of type \ref VX_DF_IMAGE_U8 + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + * \ingroup group_vision_function_lbp + */ +VX_API_ENTRY vx_status VX_API_CALL vxuLBP(vx_context context, + vx_image in, vx_enum format, vx_int8 kernel_size, vx_image out); + +/*! \brief [Immediate] Performs cell calculations for the average gradient magnitude and gradient orientation histograms. + * \details Firstly, the gradient magnitude and gradient orientation are computed for each pixel in the input image. + * Two 1-D centred, point discrete derivative masks are applied to the input image in the horizontal and vertical directions. + * \f[ M_h = [-1, 0, 1] \f] and \f[ M_v = [-1, 0, 1]^T \f] + * \f$G_v\f$ is the result of applying mask \f$M_v\f$ to the input image, and \f$G_h\f$ is the result of applying mask \f$M_h\f$ to the input image. + * The border mode used for the gradient calculation is implementation dependent. Its behavior should be similar to \ref VX_BORDER_UNDEFINED. + * The gradient magnitudes and gradient orientations for each pixel are then calculated in the following manner. + * \f[ G(x,y) = \sqrt{G_v(x,y)^2 + G_h(x,y)^2} \f] + * \f[ \theta(x,y) = arctan(G_v(x,y), G_h(x,y)) \f] + * where \f$arctan(v, h)\f$ + * is \f$ tan^{-1}(v/h)\f$ when \f$h!=0\f$, + * + * \f$ -pi/2 \f$ if \f$v<0\f$ and \f$h==0\f$, + * + * \f$ pi/2 \f$ if \f$v>0\f$ and \f$h==0\f$ + * + * and \f$ 0 \f$ if \f$v==0\f$ and \f$h==0\f$ + * + * Secondly, the gradient magnitudes and orientations are used to compute the bins output tensor and optional magnitudes output tensor. + * These tensors are computed on a cell level where the cells are rectangular in shape. + * The magnitudes tensor contains the average gradient magnitude for each cell. + * \f[magnitudes(c) = \frac{1}{(cell\_width * cell\_height)}\sum\limits_{w=0}^{cell\_width} \sum\limits_{h=0}^{cell\_height} G_c(w,h)\f] + * where \f$G_c\f$ is the gradient magnitudes related to cell \f$c\f$. + * The bins tensor contains histograms of gradient orientations for each cell. + * The gradient orientations at each pixel range from 0 to 360 degrees. These are quantised into a set of histogram bins based on the num_bins parameter. + * Each pixel votes for a specific cell histogram bin based on its gradient orientation. The vote itself is the pixel's gradient magnitude. + * \f[bins(c, n) = \sum\limits_{w=0}^{cell\_width} \sum\limits_{h=0}^{cell\_height} G_c(w,h) * 1[B_c(w, h, num\_bins) == n]\f] + * where \f$B_c\f$ produces the histogram bin number based on the gradient orientation of the pixel at location (\f$w\f$, \f$h\f$) in cell \f$c\f$ based on + * the \f$num\_bins\f$ and \f[1[B_c(w, h, num\_bins) == n]\f] is a delta-function with value 1 when \f$B_c(w, h, num\_bins) == n\f$ or 0 otherwise. + * \param [in] context The reference to the overall context. + * \param [in] input The input image of type \ref VX_DF_IMAGE_U8. + * \param [in] cell_width The histogram cell width of type \ref VX_TYPE_INT32. + * \param [in] cell_height The histogram cell height of type \ref VX_TYPE_INT32. + * \param [in] num_bins The histogram size of type \ref VX_TYPE_INT32. + * \param [out] magnitudes The output average gradient magnitudes per cell of \ref vx_tensor of type \ref VX_TYPE_INT16 of size \f$ [floor(image_{width}/cell_{width}) ,floor(image_{height}/cell_{height}) ] \f$. 
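+ *
+ * A minimal usage sketch (editorial addition; image size, cell size, bin count,
+ * and tensor fixed-point positions below are illustrative assumptions):
+ * \code
+ * vx_context context = vxCreateContext();
+ * vx_image input = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);
+ * vx_int32 cell_width = 8, cell_height = 8, num_bins = 9;
+ * vx_size mag_dims[2] = {640 / 8, 480 / 8};
+ * vx_size bin_dims[3] = {640 / 8, 480 / 8, 9};
+ * vx_tensor magnitudes = vxCreateTensor(context, 2, mag_dims, VX_TYPE_INT16, 0);
+ * vx_tensor bins = vxCreateTensor(context, 3, bin_dims, VX_TYPE_INT16, 0);
+ * vx_status status = vxuHOGCells(context, input, cell_width, cell_height, num_bins,
+ *                                magnitudes, bins);
+ * \endcode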
+ * \param [out] bins The output gradient orientation histograms per cell of \ref vx_tensor of type \ref VX_TYPE_INT16 of size \f$ [floor(image_{width}/cell_{width}) ,floor(image_{height}/cell_{height}), num_{bins}] \f$. + * + * \ingroup group_vision_function_hog + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuHOGCells(vx_context context, vx_image input, vx_int32 cell_width, vx_int32 cell_height, vx_int32 num_bins, vx_tensor magnitudes, vx_tensor bins); + +/*! \brief [Immediate] Computes Histogram of Oriented Gradients features for the W1xW2 window in a sliding window fashion over the whole input image. + * \details Firstly if a magnitudes tensor is provided the cell histograms in the bins tensor are normalised by the average cell gradient magnitudes. + \f[bins(c,n) = \frac{bins(c,n)}{magnitudes(c)}\f] + * To account for changes in illumination and contrast the cell histograms must be locally normalized which requires grouping the cell histograms together into larger spatially connected blocks. + * Blocks are rectangular grids represented by three parameters: the number of cells per block, the number of pixels per cell, and the number of bins per cell histogram. + * These blocks typically overlap, meaning that each cell histogram contributes more than once to the final descriptor. + * To normalize a block its cell histograms \f$h\f$ are grouped together to form a vector \f$v = [h_1, h_2, h_3, ... , h_n]\f$. + * This vector is normalised using L2-Hys which means performing L2-norm on this vector; clipping the result (by limiting the maximum values of v to be threshold) and renormalizing again. If the threshold is equal to zero then L2-Hys normalization is not performed. + * \f[L2norm(v) = \frac{v}{\sqrt{\|v\|_2^2 + \epsilon^2}}\f] + * where \f$ \|v\|_k \f$ be its k-norm for k=1, 2, and \f$ \epsilon \f$ be a small constant. + * For a specific window its HOG descriptor is then the concatenated vector of the components of the normalized cell histograms from all of the block regions contained in the window. + * The W1xW2 window starting position is at coordinates 0x0. + * If the input image has dimensions that are not an integer multiple of W1xW2 blocks with the specified stride, then the last positions that contain only a partial W1xW2 window + * will be calculated with the remaining part of the W1xW2 window padded with zeroes. + * The Window W1xW2 must also have a size so that it contains an integer number of cells, otherwise the node is not well-defined. + * The final output tensor will contain HOG descriptors equal to the number of windows in the input image. + * The output features tensor has 3 dimensions, given by:\n + * \f[[ (floor((image_{width}-window_{width})/window_{stride}) + 1),\f] + * \f[ (floor((image_{height}-window_{height})/window_{stride}) + 1),\f] + * \f[ floor((window_{width} - block_{width})/block_{stride} + 1) * floor((window_{height} - block_{height})/block_{stride} + 1) *\f] +* \f[ (((block_{width} * block_{height}) / (cell_{width} * cell_{height})) * num_{bins})] \f] + * See \ref vxCreateTensor and \ref vxCreateVirtualTensor. + * The output tensor from this function may be very large. For this reason, is it not recommended that this "immediate mode" version of the function be used. + * The preferred method to perform this function is as graph node with a virtual tensor as the output. + * \param [in] context The reference to the overall context. 
+ * \param [in] input The input image of type \ref VX_DF_IMAGE_U8.
+ * \param [in] magnitudes The average gradient magnitudes per cell of \ref vx_tensor of type \ref VX_TYPE_INT16. It is the output of \ref vxuHOGCells.
+ * \param [in] bins The gradient orientation histogram per cell of \ref vx_tensor of type \ref VX_TYPE_INT16. It is the output of \ref vxuHOGCells.
+ * \param [in] params The parameters of type \ref vx_hog_t.
+ * \param [in] hog_param_size Size of \ref vx_hog_t in bytes.
+ * \param [out] features The output HOG features of \ref vx_tensor of type \ref VX_TYPE_INT16.
+ *
+ * \ingroup group_vision_function_hog
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS Success
+ * \retval * An error occurred. See \ref vx_status_e.
+ */
+
+VX_API_ENTRY vx_status VX_API_CALL vxuHOGFeatures(vx_context context, vx_image input, vx_tensor magnitudes, vx_tensor bins, const vx_hog_t *params, vx_size hog_param_size, vx_tensor features);
+
+/*! \brief [Immediate] Finds the probabilistic Hough lines detected in the input binary image; each line is stored in the output array as a set of points (x1, y1, x2, y2).
+ * \details Some implementations of the algorithm may have a random or non-deterministic element. If the target application is in a safety-critical environment this
+ * should be borne in mind and steps taken in the implementation, the application or both to achieve the level of determinism required by the system design.
+ * \param [in] context The reference to the overall context.
+ * \param [in] input 8-bit, single-channel binary source image
+ * \param [in] params Parameters of the struct \ref vx_hough_lines_p_t
+ * \param [out] lines_array Contains the array of detected lines; see \ref vx_line2d_t. The order of lines is implementation dependent
+ * \param [out] num_lines [optional] The total number of detected lines in the image. Use a VX_TYPE_SIZE scalar
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS Success
+ * \retval * An error occurred. See \ref vx_status_e.
+ * \ingroup group_vision_function_hough_lines_p
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxuHoughLinesP(vx_context context, vx_image input, const vx_hough_lines_p_t *params, vx_array lines_array, vx_scalar num_lines);
+
+/*! \brief [Immediate] Remaps an output image from an input image.
+ * \param [in] context The reference to the overall context.
+ * \param [in] input The input \ref VX_DF_IMAGE_U8 image.
+ * \param [in] table The remap table object.
+ * \param [in] policy The interpolation policy from \ref vx_interpolation_type_e.
+ * \ref VX_INTERPOLATION_AREA is not supported.
+ * \param [out] output The output \ref VX_DF_IMAGE_U8 image.
+ * \return A \ref vx_status_e enumeration.
+ * \ingroup group_vision_function_remap
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxuRemap(vx_context context,
+                                            vx_image input,
+                                            vx_remap table,
+                                            vx_enum policy,
+                                            vx_image output);
+
+/*! \brief [Immediate] The function applies bilateral filtering to the input tensor.
+* \param [in] context The reference to the overall context.
+* \param [in] src The input data, a \ref vx_tensor with a maximum of 3 dimensions and a minimum of 2. The tensor is of type \ref VX_TYPE_UINT8 or \ref VX_TYPE_INT16.
+* Dimensions are [radiometric, width, height] or [width, height].
+* \param [in] diameter The diameter of each pixel neighbourhood that is used during filtering. Values of diameter must be odd, larger than 3 and smaller than 10.
+* \param [in] sigmaValues Filter sigma in the radiometric space. Supported values are larger than 0 and smaller than or equal to 20.
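+*
+* A minimal usage sketch (editorial addition; tensor size, diameter, and sigma
+* values are illustrative assumptions; note that sigmaSpace precedes sigmaValues
+* in the parameter list of the declaration below):
+* \code
+* vx_context context = vxCreateContext();
+* vx_size dims[2] = {640, 480};
+* vx_tensor src = vxCreateTensor(context, 2, dims, VX_TYPE_UINT8, 0);
+* vx_tensor dst = vxCreateTensor(context, 2, dims, VX_TYPE_UINT8, 0);
+* vx_status status = vxuBilateralFilter(context, src, 5, 10.0f, 10.0f, dst);
+* \endcode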
+* \param [in] sigmaSpace Filter sigma in the spatial space. Supported values are larger than 0 and smaller than or equal to 20.
+* \param [out] dst The output data, a \ref vx_tensor of type \ref VX_TYPE_UINT8 or \ref VX_TYPE_INT16, which must be of the same type and size as the input.
+* \note The \ref VX_NODE_BORDER border modes \ref VX_BORDER_REPLICATE and \ref VX_BORDER_CONSTANT are supported.
+* \return A \ref vx_status_e enumeration.
+* \retval VX_SUCCESS Success
+* \retval * An error occurred. See \ref vx_status_e.
+* \ingroup group_vision_function_bilateral_filter
+*/
+VX_API_ENTRY vx_status VX_API_CALL vxuBilateralFilter(vx_context context, vx_tensor src, vx_int32 diameter, vx_float32 sigmaSpace, vx_float32 sigmaValues, vx_tensor dst);
+
+/*! \brief [Immediate] Performs element-wise multiplication on element values in the input tensor data, with a scale.
+ * \param [in] context The reference to the overall context.
+ * \param [in] input1 Input tensor data. Implementations must support input tensor data type \ref VX_TYPE_INT16 with fixed_point_position 8,
+ * and tensor data types \ref VX_TYPE_UINT8 and \ref VX_TYPE_INT8, with fixed_point_position 0.
+ * \param [in] input2 Input tensor data. The dimensions and sizes of input2 match those of input1, unless the size of one or more dimensions in input2 is 1.
+ * In this case, those dimensions are treated as if this tensor was expanded to match the size of the corresponding dimension of input1,
+ * and data was duplicated on all terms in that dimension. After this expansion, the dimensions will be equal.
+ * The data type must match the data type of input1.
+ * \param [in] scale A non-negative \ref VX_TYPE_FLOAT32 multiplied to each product before overflow handling.
+ * \param [in] overflow_policy A \ref vx_convert_policy_e enumeration.
+ * \param [in] rounding_policy A \ref vx_round_policy_e enumeration.
+ * \param [out] output The output tensor data with the same dimensions as the input tensor data.
+ * \ingroup group_vision_function_tensor_multiply
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS Success
+ * \retval * An error occurred. See \ref vx_status_e.
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxuTensorMultiply(vx_context context, vx_tensor input1, vx_tensor input2, vx_scalar scale, vx_enum overflow_policy,
+                                                     vx_enum rounding_policy, vx_tensor output);
+
+/*! \brief [Immediate] Performs arithmetic addition on element values in the input tensor data.
+ * \param [in] context The reference to the overall context.
+ * \param [in] input1 Input tensor data. Implementations must support input tensor data type \ref VX_TYPE_INT16 with fixed_point_position 8,
+ * and tensor data types \ref VX_TYPE_UINT8 and \ref VX_TYPE_INT8, with fixed_point_position 0.
+ * \param [in] input2 Input tensor data. The dimensions and sizes of input2 match those of input1, unless the size of one or more dimensions in input2 is 1.
+ * In this case, those dimensions are treated as if this tensor was expanded to match the size of the corresponding dimension of input1,
+ * and data was duplicated on all terms in that dimension. After this expansion, the dimensions will be equal.
+ * The data type must match the data type of input1.
+ * \param [in] policy A \ref vx_convert_policy_e enumeration.
+ * \param [out] output The output tensor data with the same dimensions as the input tensor data.
+ * \ingroup group_vision_function_tensor_add
+ * \return A \ref vx_status_e enumeration.
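+ *
+ * A minimal usage sketch (editorial addition; tensor shape and fixed-point
+ * position are illustrative assumptions):
+ * \code
+ * vx_context context = vxCreateContext();
+ * vx_size dims[2] = {4, 4};
+ * vx_tensor a = vxCreateTensor(context, 2, dims, VX_TYPE_INT16, 8);
+ * vx_tensor b = vxCreateTensor(context, 2, dims, VX_TYPE_INT16, 8);
+ * vx_tensor sum = vxCreateTensor(context, 2, dims, VX_TYPE_INT16, 8);
+ * vx_status status = vxuTensorAdd(context, a, b, VX_CONVERT_POLICY_SATURATE, sum);
+ * \endcode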
+ * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuTensorAdd(vx_context context, vx_tensor input1, vx_tensor input2, vx_enum policy, vx_tensor output); + +/*! \brief [Immediate] Performs arithmetic subtraction on element values in the input tensor data. + * \param [in] context The reference to the overall context. + * \param [in] input1 Input tensor data. Implementations must support input tensor data type \ref VX_TYPE_INT16 with fixed_point_position 8, + * and tensor data types \ref VX_TYPE_UINT8 and \ref VX_TYPE_INT8, with fixed_point_position 0. + * \param [in] input2 Input tensor data. The dimensions and sizes of input2 match those of input1, unless the vx_tensor of one or more dimensions in input2 is 1. + * In this case, those dimensions are treated as if this tensor was expanded to match the size of the corresponding dimension of input1, + * and data was duplicated on all terms in that dimension. After this expansion, the dimensions will be equal. + * The data type must match the data type of Input1. + * \param [in] policy A \ref vx_convert_policy_e enumeration. + * \param [out] output The output tensor data with the same dimensions as the input tensor data. + * \ingroup group_vision_function_tensor_subtract + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuTensorSubtract(vx_context context, vx_tensor input1, vx_tensor input2, vx_enum policy, vx_tensor output); + +/*! \brief [Immediate] Performs LUT on element values in the input tensor data. + * \param [in] context The reference to the overall context. + * \param [in] input1 Input tensor data. Implementations must support input tensor data type \ref VX_TYPE_INT16 with fixed_point_position 8, + * and tensor data types \ref VX_TYPE_UINT8, with fixed_point_position 0. + * \param [in] lut The look-up table to use, of type \ref vx_lut. + * The elements of input1 are treated as unsigned integers to determine an index into the look-up table. + * The data type of the items in the look-up table must match that of the output tensor. + * \param [out] output The output tensor data with the same dimensions as the input tensor data. + * \ingroup group_vision_function_tensor_tablelookup + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuTensorTableLookup(vx_context context, vx_tensor input1, vx_lut lut, vx_tensor output); + +/*! \brief [Immediate] Performs transpose on the input tensor. + * The tensor is transposed according to a specified 2 indexes in the tensor (0-based indexing) + * \param [in] context The reference to the overall context. + * \param [in] input Input tensor data, Implementations must support input tensor data type \ref VX_TYPE_INT16 with fixed_point_position 8, + * and tensor data types \ref VX_TYPE_UINT8 and \ref VX_TYPE_INT8, with fixed_point_position 0. + * \param [out] output output tensor data, + * \param [in] dimension1 Dimension index that is transposed with dim 2. + * \param [in] dimension2 Dimension index that is transposed with dim 1. + * \ingroup group_vision_function_tensor_transpose + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. 
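+ *
+ * A minimal usage sketch (editorial addition; tensor shapes and fixed-point
+ * position are illustrative assumptions):
+ * \code
+ * vx_context context = vxCreateContext();
+ * vx_size in_dims[2] = {640, 480};
+ * vx_size out_dims[2] = {480, 640};
+ * vx_tensor input = vxCreateTensor(context, 2, in_dims, VX_TYPE_INT16, 8);
+ * vx_tensor output = vxCreateTensor(context, 2, out_dims, VX_TYPE_INT16, 8);
+ * vx_status status = vxuTensorTranspose(context, input, output, 0, 1);
+ * \endcode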
+ */ +VX_API_ENTRY vx_status VX_API_CALL vxuTensorTranspose(vx_context context, vx_tensor input, vx_tensor output, vx_size dimension1, vx_size dimension2); + +/*! \brief [Immediate] Performs a bit-depth conversion. + * \param [in] context The reference to the overall context. + * \param [in] input The input tensor. Implementations must support input tensor data type \ref VX_TYPE_INT16 with fixed_point_position 8, + * and tensor data types \ref VX_TYPE_UINT8 and \ref VX_TYPE_INT8, with fixed_point_position 0. + * \param [in] policy A \ref VX_TYPE_ENUM of the \ref vx_convert_policy_e enumeration. + * \param [in] norm A scalar containing a \ref VX_TYPE_FLOAT32 of the normalization value. + * \param [in] offset A scalar containing a \ref VX_TYPE_FLOAT32 of the offset value subtracted before normalization. + * \param [out] output The output tensor. Implementations must support input tensor data type VX_TYPE_INT16. with fixed_point_position 8. + * And VX_TYPE_UINT8 with fixed_point_position 0. + * \ingroup group_vision_function_tensor_convert_depth + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuTensorConvertDepth(vx_context context, vx_tensor input, vx_enum policy, vx_scalar norm, vx_scalar offset, vx_tensor output); + +/*! \brief [Immediate] Performs a generalized matrix multiplication. + * \param [in] context The reference to the overall context. + * \param [in] input1 The first input 2D tensor of type \ref VX_TYPE_INT16 with fixed_point_pos 8, or tensor data types \ref VX_TYPE_UINT8 or \ref VX_TYPE_INT8, with fixed_point_pos 0. + * \param [in] input2 The second 2D tensor. Must be in the same data type as input1. + * \param [in] input3 The third 2D tensor. Must be in the same data type as input1. [optional]. + * \param [in] matrix_multiply_params Matrix multiply parameters, see \ref vx_tensor_matrix_multiply_params_t . + * \param [out] output The output 2D tensor. Must be in the same data type as input1. Output dimension must agree the formula in the description. + * \ingroup group_vision_function_tensor_matrix_multiply + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. + */ +VX_API_ENTRY vx_status VX_API_CALL vxuTensorMatrixMultiply(vx_context context, vx_tensor input1, vx_tensor input2, vx_tensor input3, + const vx_tensor_matrix_multiply_params_t *matrix_multiply_params, vx_tensor output); + + +/*! \brief [Immediate] Copy data from one object to another. + * \param [in] context The reference to the overall context. + * \param [in] input The input data object. + * \param [out] output The output data object. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Success + * \retval * An error occurred. See \ref vx_status_e. 
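+ *
+ * A minimal usage sketch (editorial addition; the image size is an
+ * illustrative assumption):
+ * \code
+ * vx_context context = vxCreateContext();
+ * vx_image src = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);
+ * vx_image dst = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);
+ * vx_status status = vxuCopy(context, (vx_reference)src, (vx_reference)dst);
+ * \endcode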
+ * \ingroup group_vision_function_copy + */ +VX_API_ENTRY vx_status VX_API_CALL vxuCopy(vx_context context, vx_reference input, vx_reference output); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so new file mode 100755 index 0000000..bbf9dc3 Binary files /dev/null and b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libCLC.so b/prebuilt-sdk/x86_64_linux/lib/libCLC.so new file mode 100755 index 0000000..62d2831 Binary files /dev/null and b/prebuilt-sdk/x86_64_linux/lib/libCLC.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so new file mode 100755 index 0000000..3c0d94c Binary files /dev/null and b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libGAL.so b/prebuilt-sdk/x86_64_linux/lib/libGAL.so new file mode 100755 index 0000000..b22f3d8 Binary files /dev/null and b/prebuilt-sdk/x86_64_linux/lib/libGAL.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so new file mode 100755 index 0000000..653e9bb Binary files /dev/null and b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so new file mode 120000 index 0000000..664ae82 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so @@ -0,0 +1 @@ +libOpenVX.so.1.3.0 \ No newline at end of file diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1 new file mode 120000 index 0000000..664ae82 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1 @@ -0,0 +1 @@ +libOpenVX.so.1.3.0 \ No newline at end of file diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 new file mode 100755 index 0000000..e4c1ad5 Binary files /dev/null and b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so new file mode 100755 index 0000000..34b84d8 Binary files /dev/null and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so new file mode 100755 index 0000000..4af4d51 Binary files /dev/null and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libVSC.so b/prebuilt-sdk/x86_64_linux/lib/libVSC.so new file mode 100755 index 0000000..5c283a0 Binary files /dev/null and b/prebuilt-sdk/x86_64_linux/lib/libVSC.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libvdtproxy.so b/prebuilt-sdk/x86_64_linux/lib/libvdtproxy.so new file mode 100644 index 0000000..628f663 Binary files /dev/null and b/prebuilt-sdk/x86_64_linux/lib/libvdtproxy.so differ diff --git a/samples/lenet/BUILD b/samples/lenet/BUILD new file mode 100644 index 0000000..46db564 --- /dev/null +++ b/samples/lenet/BUILD @@ -0,0 +1,13 @@ +cc_test( + name = "lenet_asymu8_cc", + copts = [ + "-Werror", "-std=c++14" + ], + srcs = [ + "lenet_asymu8.cc", + "lenet_asymu8_weights.h" + ], + deps = [ + "//:tim-vx_interface" + ], +) diff --git a/samples/lenet/lenet_asymu8.cc b/samples/lenet/lenet_asymu8.cc new file mode 100644 index 0000000..0edf539 --- /dev/null +++ b/samples/lenet/lenet_asymu8.cc @@ -0,0 +1,316 @@ 
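+// Editorial note: the sample below builds the LeNet topology with the TIM-VX
+// C++ API as a chain of asymmetric-uint8 quantized layers:
+//   input(28x28x1) -> Conv2d(5x5, 20) -> MaxPool(2x2) -> Conv2d(5x5, 50)
+//   -> MaxPool(2x2) -> FullyConnected(500) -> Relu -> FullyConnected(10) -> Softmax
+// Weights and biases come from lenet_weights[] in lenet_asymu8_weights.h at the
+// offsets passed to CreateTensor; the graph is compiled, fed the 28x28 digit in
+// input_data, run, and the top-5 class scores are printed with printTopN.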
+/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include +#include +#include + +#include "lenet_asymu8_weights.h" +#include "tim/vx/context.h" +#include "tim/vx/graph.h" +#include "tim/vx/operation.h" +#include "tim/vx/ops/activations.h" +#include "tim/vx/ops/conv2d.h" +#include "tim/vx/ops/fullyconnected.h" +#include "tim/vx/ops/pool2d.h" +#include "tim/vx/ops/softmax.h" +#include "tim/vx/tensor.h" + +std::vector input_data = { + 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 2, 0, 0, 8, 0, + 3, 0, 7, 0, 2, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 3, 1, 1, 0, 14, 0, 0, 3, 0, + 2, 4, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 4, 3, 0, 0, 0, 5, 0, 4, 0, 0, + 0, 0, 10, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 6, 5, 0, 2, 0, 9, 0, 12, 2, 0, 5, 1, 0, + 0, 2, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 3, 0, 33, 0, 0, 155, 186, 55, 17, 22, 0, 0, 3, 9, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 167, 253, 255, 235, 255, 240, 134, 36, 0, 6, 1, 4, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 87, + 240, 251, 254, 254, 237, 255, 252, 191, 27, 0, 0, 5, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 19, 226, 255, 235, + 255, 255, 254, 242, 255, 255, 68, 12, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 4, 1, 58, 254, 255, 158, 0, 2, + 47, 173, 253, 247, 255, 65, 4, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 162, 240, 248, 92, 8, 0, 13, 0, + 88, 249, 244, 148, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 17, 64, 244, 255, 210, 0, 0, 1, 2, 0, 52, 223, + 255, 223, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 144, 245, 255, 142, 0, 4, 9, 0, 6, 0, 37, 222, 226, + 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 73, + 255, 243, 104, 0, 0, 0, 0, 11, 0, 0, 0, 235, 242, 101, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 133, 245, 226, + 12, 4, 15, 0, 0, 0, 0, 24, 0, 235, 246, 41, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 236, 245, 152, 0, 10, + 0, 0, 0, 0, 6, 0, 28, 227, 239, 1, 6, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 227, 240, 53, 4, 0, 0, 24, + 0, 1, 0, 8, 181, 249, 177, 0, 2, 0, 0, 0, 0, 4, 0, + 6, 1, 5, 0, 0, 87, 246, 219, 14, 0, 0, 2, 0, 10, 7, + 0, 134, 255, 249, 104, 4, 0, 0, 0, 0, 0, 8, 0, 3, 0, + 0, 0, 4, 89, 255, 228, 0, 11, 0, 8, 14, 0, 0, 100, 250, + 248, 236, 0, 0, 8, 0, 0, 0, 0, 5, 0, 2, 0, 0, 2, + 6, 68, 250, 228, 6, 6, 0, 0, 1, 0, 140, 240, 253, 238, 
51, + 31, 0, 3, 0, 0, 0, 0, 0, 0, 5, 0, 0, 2, 0, 26, + 215, 255, 119, 0, 21, 1, 40, 156, 233, 244, 239, 103, 0, 6, 6, + 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 225, 251, + 240, 141, 118, 139, 222, 244, 255, 249, 112, 17, 0, 0, 8, 3, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84, 245, 255, 247, + 255, 249, 255, 255, 249, 132, 11, 0, 9, 3, 1, 1, 0, 0, 0, + 0, 2, 0, 0, 1, 0, 0, 6, 1, 0, 166, 236, 255, 255, 248, + 249, 248, 72, 0, 0, 16, 0, 16, 0, 4, 0, 0, 0, 0, 0, + 0, 0, 6, 0, 0, 4, 0, 0, 20, 106, 126, 188, 190, 112, 28, + 0, 21, 0, 1, 2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, +}; + +template +static void printTopN(const T* prob, int outputCount, int topNum) { + std::vector> data; + + for (int i = 0; i < outputCount; i++) { + data.push_back(std::make_tuple(i, prob[i])); + } + + std::sort(data.begin(), data.end(), + [](auto& a, auto& b) { return std::get<1>(a) > std::get<1>(b); }); + + std::cout << " --- Top" << topNum << " ---" << std::endl; + for (int i = 0; i < topNum; i++) { + std::cout << std::setw(3) << std::get<0>(data[i]) << ": " << std::fixed + << std::setprecision(6) << std::get<1>(data[i]) << std::endl; + } +} + +int main(int argc, char** argv) { + auto context = tim::vx::Context::Create(); + auto graph = context->CreateGraph(); + + tim::vx::ShapeType input_shape({28, 28, 1, 1}); + tim::vx::Quantization input_quant(tim::vx::QuantType::ASYMMETRIC, 0.00390625f, + 0); + tim::vx::TensorSpec input_spec(tim::vx::DataType::UINT8, input_shape, + tim::vx::TensorAttribute::INPUT, input_quant); + auto input = graph->CreateTensor(input_spec); + + tim::vx::ShapeType conv1_weight_shape({5, 5, 1, 20}); + tim::vx::Quantization conv1_weighteight_quant(tim::vx::QuantType::ASYMMETRIC, + 0.00336234f, 119); + tim::vx::TensorSpec conv1_weighteight_spec( + tim::vx::DataType::UINT8, conv1_weight_shape, + tim::vx::TensorAttribute::CONSTANT, conv1_weighteight_quant); + auto conv1_weight = + graph->CreateTensor(conv1_weighteight_spec, &lenet_weights[0]); + + tim::vx::ShapeType conv1_bias_shape({20}); + tim::vx::Quantization conv1_bias_quant(tim::vx::QuantType::ASYMMETRIC, + 1.313e-05f, 0); + tim::vx::TensorSpec conv1_bias_spec( + tim::vx::DataType::INT32, conv1_bias_shape, + tim::vx::TensorAttribute::CONSTANT, conv1_bias_quant); + auto conv1_bias = graph->CreateTensor(conv1_bias_spec, &lenet_weights[500]); + + tim::vx::Quantization conv1_output_quant(tim::vx::QuantType::ASYMMETRIC, + 0.01928069f, 140); + tim::vx::TensorSpec conv1_output_spec(tim::vx::DataType::UINT8, {}, + tim::vx::TensorAttribute::TRANSIENT, + conv1_output_quant); + auto conv1_output = graph->CreateTensor(conv1_output_spec); + + tim::vx::Quantization pool1_output_quant(tim::vx::QuantType::ASYMMETRIC, + 0.01928069f, 140); + tim::vx::TensorSpec pool1_output_spec(tim::vx::DataType::UINT8, {}, + tim::vx::TensorAttribute::TRANSIENT, + pool1_output_quant); + auto pool1_output = graph->CreateTensor(pool1_output_spec); + + tim::vx::ShapeType conv2_weight_shape({5, 5, 20, 50}); + tim::vx::Quantization conv2_weight_quant(tim::vx::QuantType::ASYMMETRIC, + 0.0011482f, 128); + tim::vx::TensorSpec conv2_weight_spec( + tim::vx::DataType::UINT8, conv2_weight_shape, + tim::vx::TensorAttribute::CONSTANT, conv2_weight_quant); + auto 
conv2_weight = + graph->CreateTensor(conv2_weight_spec, &lenet_weights[580]); + + tim::vx::ShapeType conv2_bias_shape({50}); + tim::vx::Quantization conv2_bias_quant(tim::vx::QuantType::ASYMMETRIC, + 2.214e-05f, 0); + tim::vx::TensorSpec conv2_bias_spec( + tim::vx::DataType::INT32, conv2_bias_shape, + tim::vx::TensorAttribute::CONSTANT, conv2_bias_quant); + auto conv2_bias = graph->CreateTensor(conv2_bias_spec, &lenet_weights[25580]); + + tim::vx::Quantization conv2_output_quant(tim::vx::QuantType::ASYMMETRIC, + 0.04075872f, 141); + tim::vx::TensorSpec conv2_output_spec(tim::vx::DataType::UINT8, {}, + tim::vx::TensorAttribute::TRANSIENT, + conv2_output_quant); + auto conv2_output = graph->CreateTensor(conv2_output_spec); + + tim::vx::Quantization pool2_output_quant(tim::vx::QuantType::ASYMMETRIC, + 0.04075872f, 141); + tim::vx::TensorSpec pool2_output_spec(tim::vx::DataType::UINT8, {}, + tim::vx::TensorAttribute::TRANSIENT, + pool2_output_quant); + auto pool2_output = graph->CreateTensor(pool2_output_spec); + + tim::vx::ShapeType fc3_weight_shape({800, 500}); + tim::vx::Quantization fc3_weight_quant(tim::vx::QuantType::ASYMMETRIC, + 0.00073548f, 130); + tim::vx::TensorSpec fc3_weight_spec( + tim::vx::DataType::UINT8, fc3_weight_shape, + tim::vx::TensorAttribute::CONSTANT, fc3_weight_quant); + auto fc3_weight = graph->CreateTensor(fc3_weight_spec, &lenet_weights[25780]); + + tim::vx::ShapeType fc3_bias_shape({500}); + tim::vx::Quantization fc3_bias_quant(tim::vx::QuantType::ASYMMETRIC, + 2.998e-05f, 0); + tim::vx::TensorSpec fc3_bias_spec(tim::vx::DataType::INT32, fc3_bias_shape, + tim::vx::TensorAttribute::CONSTANT, + fc3_bias_quant); + auto fc3_bias = graph->CreateTensor(fc3_bias_spec, &lenet_weights[425780]); + + tim::vx::Quantization fc3_output_quant(tim::vx::QuantType::ASYMMETRIC, + 0.01992089f, 0); + tim::vx::TensorSpec fc3_output_spec(tim::vx::DataType::UINT8, {}, + tim::vx::TensorAttribute::TRANSIENT, + fc3_output_quant); + auto fc3_output = graph->CreateTensor(fc3_output_spec); + + tim::vx::Quantization relu_output_quant(tim::vx::QuantType::ASYMMETRIC, + 0.01992089f, 0); + tim::vx::TensorSpec relu_output_spec(tim::vx::DataType::UINT8, {}, + tim::vx::TensorAttribute::TRANSIENT, + relu_output_quant); + auto relu_output = graph->CreateTensor(relu_output_spec); + + tim::vx::ShapeType fc4_weight_shape({500, 10}); + tim::vx::Quantization fc4_weight_quant(tim::vx::QuantType::ASYMMETRIC, + 0.00158043f, 135); + tim::vx::TensorSpec fc4_weight_spec( + tim::vx::DataType::UINT8, fc4_weight_shape, + tim::vx::TensorAttribute::CONSTANT, fc4_weight_quant); + auto fc4_weight = + graph->CreateTensor(fc4_weight_spec, &lenet_weights[427780]); + + tim::vx::ShapeType fc4_bias_shape({10}); + tim::vx::Quantization fc4_bias_quant(tim::vx::QuantType::ASYMMETRIC, + 3.148e-05f, 0); + tim::vx::TensorSpec fc4_bias_spec(tim::vx::DataType::INT32, fc4_bias_shape, + tim::vx::TensorAttribute::CONSTANT, + fc4_bias_quant); + auto fc4_bias = graph->CreateTensor(fc4_bias_spec, &lenet_weights[432780]); + + tim::vx::Quantization fc4_output_quant(tim::vx::QuantType::ASYMMETRIC, + 0.06251489f, 80); + tim::vx::TensorSpec fc4_output_spec(tim::vx::DataType::UINT8, {}, + tim::vx::TensorAttribute::TRANSIENT, + fc4_output_quant); + auto fc4_output = graph->CreateTensor(fc4_output_spec); + + tim::vx::ShapeType output_shape({10, 1}); + tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32, output_shape, + tim::vx::TensorAttribute::OUTPUT); + auto output = graph->CreateTensor(output_spec); + + auto conv1 = 
graph->CreateOperation<tim::vx::ops::Conv2d>(
+      conv1_weight_shape[3], tim::vx::PadType::VALID,
+      std::array<uint32_t, 2>({5, 5}), std::array<uint32_t, 2>({1, 1}),
+      std::array<uint32_t, 2>({1, 1}));
+  (*conv1)
+      .BindInputs({input, conv1_weight, conv1_bias})
+      .BindOutputs({conv1_output});
+
+  auto pool1 = graph->CreateOperation<tim::vx::ops::Pool2d>(
+      tim::vx::PoolType::MAX, tim::vx::PadType::NONE,
+      std::array<uint32_t, 2>({2, 2}), std::array<uint32_t, 2>({2, 2}));
+  (*pool1).BindInputs({conv1_output}).BindOutputs({pool1_output});
+
+  auto conv2 = graph->CreateOperation<tim::vx::ops::Conv2d>(
+      conv2_weight_shape[3], tim::vx::PadType::VALID,
+      std::array<uint32_t, 2>({5, 5}), std::array<uint32_t, 2>({1, 1}),
+      std::array<uint32_t, 2>({1, 1}));
+  (*conv2)
+      .BindInputs({pool1_output, conv2_weight, conv2_bias})
+      .BindOutputs({conv2_output});
+
+  auto pool2 = graph->CreateOperation<tim::vx::ops::Pool2d>(
+      tim::vx::PoolType::MAX, tim::vx::PadType::NONE,
+      std::array<uint32_t, 2>({2, 2}), std::array<uint32_t, 2>({2, 2}));
+  (*pool2).BindInputs({conv2_output}).BindOutputs({pool2_output});
+
+  auto fc3 = graph->CreateOperation<tim::vx::ops::FullyConnected>(
+      2, fc3_weight_shape[1]);
+  (*fc3)
+      .BindInputs({pool2_output, fc3_weight, fc3_bias})
+      .BindOutputs({fc3_output});
+
+  auto relu = graph->CreateOperation<tim::vx::ops::Relu>();
+  (*relu).BindInput(fc3_output).BindOutput(relu_output);
+
+  auto fc4 = graph->CreateOperation<tim::vx::ops::FullyConnected>(
+      0, fc4_weight_shape[1]);
+  (*fc4)
+      .BindInputs({relu_output, fc4_weight, fc4_bias})
+      .BindOutputs({fc4_output});
+
+  auto softmax = graph->CreateOperation<tim::vx::ops::Softmax>(1.0f, 0);
+  (*softmax).BindInput(fc4_output).BindOutput(output);
+
+  if (!graph->Compile()) {
+    std::cout << "Compile graph fail." << std::endl;
+    return -1;
+  }
+
+  if (!input->CopyDataToTensor(input_data.data(), input_data.size())) {
+    std::cout << "Copy input data fail." << std::endl;
+    return -1;
+  }
+
+  if (!graph->Run()) {
+    std::cout << "Run graph fail." << std::endl;
+    return -1;
+  }
+
+  std::vector<float> output_data;
+  output_data.resize(1 * 10);
+  if (!output->CopyDataFromTensor(output_data.data())) {
+    std::cout << "Copy output data fail." << std::endl;
+    return -1;
+  }
+
+  printTopN(output_data.data(), output_data.size(), 5);
+
+  return 0;
+}
diff --git a/samples/lenet/lenet_asymu8_weights.h b/samples/lenet/lenet_asymu8_weights.h
new file mode 100644
index 0000000..68d43a7
--- /dev/null
+++ b/samples/lenet/lenet_asymu8_weights.h
@@ -0,0 +1,36094 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+* +*****************************************************************************/ +unsigned char lenet_weights[432820] = { + 0xc4, 0x37, 0x19, 0xb5, 0xa8, 0xa8, 0x2f, 0x8c, 0xc8, 0xcd, 0xab, 0x19, + 0xc8, 0xe7, 0xf5, 0xa2, 0x85, 0xc7, 0x9b, 0xc1, 0xcb, 0x38, 0xab, 0x1e, + 0x2e, 0x4c, 0xd4, 0x7b, 0xff, 0xde, 0x35, 0x5d, 0xc5, 0x4b, 0xa5, 0x69, + 0x5, 0xc, 0x13, 0x9a, 0x9c, 0x2f, 0xb1, 0x2b, 0x7d, 0x1a, 0x27, 0x79, + 0x3, 0xf, 0xda, 0x7f, 0x6c, 0xd6, 0xb9, 0xd9, 0xbf, 0x89, 0xd0, 0x4e, + 0xea, 0x69, 0x97, 0x1d, 0x56, 0x59, 0xc8, 0xd5, 0xc4, 0x70, 0xca, 0x21, + 0x2a, 0x1e, 0x74, 0x62, 0x9, 0x2c, 0x54, 0xb1, 0x3d, 0x1c, 0x76, 0x54, + 0x51, 0x96, 0xb7, 0x8f, 0x25, 0x41, 0x9f, 0x96, 0x1f, 0x7a, 0xab, 0xd8, + 0xc4, 0x42, 0x3b, 0xac, 0xe1, 0xd6, 0xc0, 0x92, 0x69, 0x97, 0x1c, 0x55, + 0xe3, 0x71, 0x28, 0xc4, 0xdd, 0x63, 0x2c, 0x81, 0x6a, 0x62, 0x9c, 0x5e, + 0xa6, 0x57, 0x49, 0xd0, 0xaa, 0x80, 0x48, 0x39, 0xca, 0x12, 0xab, 0xa0, + 0x7c, 0x23, 0x74, 0x6b, 0xc6, 0x1e, 0xe, 0xe3, 0xc0, 0x2e, 0x52, 0x8e, + 0x95, 0x4a, 0xd0, 0x1c, 0x9b, 0x8d, 0xbb, 0x3d, 0x2c, 0x20, 0x53, 0x1a, + 0x6f, 0x22, 0xad, 0x3a, 0xe5, 0x85, 0x40, 0xd8, 0x8d, 0xb3, 0x97, 0x38, + 0x9f, 0x57, 0xde, 0x73, 0x42, 0x7b, 0x6e, 0xc1, 0x61, 0x5c, 0x71, 0xe0, + 0xaf, 0x39, 0x5d, 0x38, 0x79, 0xbe, 0xd2, 0x92, 0xe, 0xa7, 0x98, 0x6f, + 0x18, 0x39, 0x3f, 0x91, 0xe1, 0x8e, 0x15, 0x60, 0x99, 0xb5, 0x49, 0x9f, + 0xbf, 0x4d, 0xec, 0xaa, 0x28, 0xa6, 0xd6, 0x82, 0xac, 0x87, 0x43, 0xaf, + 0x67, 0x85, 0xb, 0x57, 0xa9, 0x5c, 0x63, 0x32, 0xa, 0x8d, 0x8c, 0x93, + 0x96, 0x4b, 0xa1, 0x12, 0xa5, 0x28, 0x34, 0xbf, 0x71, 0x2f, 0x12, 0xb0, + 0x98, 0xd1, 0xbd, 0x45, 0x2a, 0x1f, 0xda, 0xa1, 0x6c, 0x2e, 0x56, 0x4d, + 0xb, 0x3f, 0x6b, 0x4c, 0x31, 0x54, 0x57, 0x9c, 0x6e, 0x20, 0xab, 0x43, + 0x95, 0x9d, 0xa5, 0x73, 0xe0, 0xd5, 0xa5, 0xe1, 0xf1, 0xfc, 0x37, 0xc0, + 0x63, 0xaa, 0x46, 0x74, 0xd2, 0x7b, 0xa1, 0x7, 0x1b, 0x4d, 0x39, 0x9c, + 0xe, 0xaa, 0xc3, 0x23, 0x2, 0x84, 0xe, 0xc1, 0x41, 0x8c, 0x49, 0xd9, + 0x5d, 0x25, 0x60, 0xc3, 0x7, 0x83, 0x6c, 0xac, 0x3f, 0x13, 0x8d, 0x8a, + 0xcb, 0xec, 0x5f, 0xdd, 0xe4, 0x92, 0xdc, 0xa6, 0x27, 0x8f, 0x43, 0x5a, + 0x89, 0x8f, 0x65, 0x74, 0x7a, 0xbc, 0x54, 0xae, 0xf, 0xf, 0xa4, 0x8d, + 0, 0x15, 0x63, 0xb6, 0x55, 0xae, 0x6, 0x21, 0xe9, 0x8b, 0x1a, 0x5b, + 0x3f, 0xf6, 0x2e, 0x82, 0x11, 0x10, 0x49, 0x8f, 0x27, 0x25, 0x72, 0x31, + 0x28, 0xa3, 0x6c, 0x35, 0x5a, 0x4a, 0x31, 0x12, 0x8d, 0x5d, 0x89, 0x34, + 0xb7, 0x25, 0x71, 0x42, 0x96, 0xbc, 0xab, 0x13, 0x13, 0x37, 0x28, 0xbd, + 0x36, 0xbe, 0x82, 0x21, 0x18, 0x6e, 0x54, 0x5e, 0xbf, 0xb3, 0xbe, 0xbe, + 0xac, 0xd2, 0x2f, 0xcc, 0x92, 0x8a, 0xbb, 0xae, 0xe1, 0x47, 0x8c, 0xa6, + 0x39, 0xd5, 0x80, 0xca, 0x6e, 0x97, 0x90, 0xc, 0x7, 0xc4, 0x45, 0x2a, + 0x7f, 0x32, 0x1f, 0x9d, 0xac, 0x85, 0x41, 0x9f, 0x97, 0x62, 0xd6, 0x26, + 0xc4, 0x9e, 0x53, 0x70, 0xd6, 0xda, 0xea, 0xc7, 0xd5, 0x3b, 0x28, 0x19, + 0x74, 0x8c, 0x65, 0xc3, 0x39, 0x49, 0x18, 0x3f, 0xd2, 0x82, 0x3b, 0x5e, + 0x7b, 0xce, 0x92, 0x84, 0x81, 0x19, 0x9a, 0x83, 0x73, 0xc7, 0xa, 0x81, + 0x1a, 0xe, 0x68, 0x86, 0x38, 0x13, 0xae, 0x37, 0xb0, 0xec, 0x7d, 0x4c, + 0x27, 0x33, 0xca, 0xb1, 0x7b, 0x8a, 0x4c, 0x83, 0x9f, 0x81, 0x2e, 0x47, + 0x1a, 0x7b, 0x8b, 0x58, 0x73, 0xa, 0x84, 0xe, 0x8f, 0xa4, 0xff, 0xff, + 0xaf, 0xe6, 0xff, 0xff, 0xda, 0xa6, 0xff, 0xff, 0x8d, 0xc, 0, 0, + 0xae, 0xd5, 0xff, 0xff, 0x47, 0x6, 0, 0, 0xfc, 0xc, 0, 0, + 0x40, 0xf8, 0xff, 0xff, 0x1f, 0xd2, 0xff, 0xff, 0xec, 0xcf, 0xff, 0xff, + 0x6e, 0xd9, 0xff, 0xff, 0x61, 0xdf, 0xff, 0xff, 0x92, 0xe3, 0xff, 0xff, + 0x47, 0x23, 0, 0, 0x5a, 0x4e, 0, 0, 0xe3, 0xd4, 0xff, 0xff, + 0x77, 
0xdd, 0xff, 0xff, 0x38, 0xcf, 0xff, 0xff, 0x4a, 0x11, 0, 0, + 0x82, 0xf, 0, 0, 0x77, 0x97, 0x75, 0x74, 0x85, 0xb1, 0x44, 0xa1, + 0x49, 0x9b, 0x78, 0x83, 0x75, 0x95, 0x82, 0x67, 0x69, 0x6b, 0x94, 0xa7, + 0xa9, 0x7a, 0x46, 0x45, 0x39, 0x42, 0x5d, 0x98, 0xab, 0x76, 0xaf, 0xcb, + 0x8d, 0x3b, 0xa2, 0xc2, 0xbe, 0x8f, 0x43, 0x5c, 0x94, 0x5d, 0xbd, 0x59, + 0x96, 0xad, 0x5f, 0x6a, 0x7b, 0x87, 0x49, 0xa6, 0xaf, 0x7d, 0x97, 0x83, + 0xa3, 0x47, 0x50, 0x23, 0x5b, 0xa8, 0x57, 0x75, 0xa3, 0x93, 0x2e, 0x55, + 0xbf, 0x5b, 0x92, 0x9e, 0xa6, 0x40, 0x87, 0x86, 0x9f, 0x71, 0x94, 0x76, + 0x59, 0x39, 0x75, 0xa6, 0x89, 0x6e, 0x98, 0x92, 0x54, 0x60, 0x98, 0x8a, + 0xa6, 0x50, 0x71, 0x71, 0x8e, 0x68, 0xa3, 0x95, 0xab, 0xac, 0x6b, 0x99, + 0xb4, 0x36, 0xb0, 0x3f, 0x79, 0x6b, 0x6c, 0x80, 0x80, 0x4e, 0x9d, 0x92, + 0xa7, 0x7c, 0x62, 0x86, 0x94, 0xa0, 0x48, 0x57, 0xb9, 0x56, 0xb5, 0x46, + 0x5a, 0x8a, 0x53, 0x4c, 0x76, 0x62, 0xa0, 0x67, 0x55, 0xbd, 0x92, 0x62, + 0x57, 0x93, 0x56, 0x84, 0x88, 0x5c, 0x48, 0x73, 0x51, 0xb5, 0x62, 0xa0, + 0xae, 0x55, 0xc1, 0x82, 0x4a, 0x4c, 0x60, 0x51, 0x5e, 0xb3, 0x52, 0x97, + 0xb5, 0x64, 0xc0, 0xab, 0x60, 0x5d, 0x86, 0xb4, 0x97, 0x96, 0x97, 0x9e, + 0xb3, 0x93, 0x84, 0xac, 0xb2, 0x77, 0x4a, 0xaa, 0x3f, 0x50, 0xbe, 0x3e, + 0xb1, 0x95, 0x7a, 0x3f, 0xbf, 0x96, 0x8b, 0xa6, 0xbc, 0x91, 0xa6, 0xb7, + 0x6e, 0x94, 0x9c, 0xae, 0x49, 0xae, 0x63, 0x8c, 0x45, 0xa4, 0x75, 0x58, + 0x3e, 0x5a, 0x7c, 0xa0, 0x78, 0x72, 0xa6, 0x6b, 0x5b, 0xbf, 0x79, 0xa1, + 0x48, 0x66, 0xa3, 0xc6, 0xa2, 0xa5, 0x9a, 0x58, 0xa8, 0x4c, 0xae, 0xc7, + 0xac, 0x71, 0x8e, 0x89, 0xa1, 0x45, 0x85, 0x9c, 0x6d, 0x83, 0x6d, 0x97, + 0xc1, 0xc3, 0x57, 0x45, 0x70, 0x9e, 0x72, 0x9a, 0x3d, 0xb1, 0x81, 0x9a, + 0x59, 0x65, 0x98, 0x6d, 0x52, 0xd0, 0x66, 0xc6, 0x8a, 0x2f, 0x6d, 0x9c, + 0x50, 0x6f, 0x5d, 0x70, 0x79, 0x4c, 0xd2, 0x6f, 0xc8, 0xa5, 0x4c, 0x42, + 0x8b, 0xb8, 0x6e, 0x8b, 0xa8, 0x79, 0x92, 0xb2, 0x5b, 0xb7, 0xb2, 0x44, + 0x55, 0xbf, 0x6d, 0x90, 0x5e, 0x89, 0x6f, 0x83, 0x3c, 0x69, 0x7b, 0x81, + 0x6c, 0x93, 0xb6, 0x55, 0x96, 0xb9, 0xd1, 0x9d, 0x69, 0x87, 0x42, 0x96, + 0xc3, 0xb6, 0x6f, 0x50, 0x4a, 0x90, 0x89, 0x7f, 0x56, 0x82, 0x59, 0xb4, + 0x4b, 0x6e, 0x45, 0xa4, 0xb1, 0x70, 0x6c, 0x98, 0xab, 0x5e, 0x9f, 0x58, + 0x5c, 0x66, 0x2b, 0x8b, 0x7f, 0xa9, 0x5b, 0x4c, 0xa2, 0xb7, 0x88, 0x88, + 0x4f, 0x95, 0xc2, 0x66, 0xb6, 0x85, 0x6f, 0xa1, 0x4b, 0xa4, 0x51, 0x45, + 0x87, 0x9b, 0x6b, 0x6c, 0x9b, 0xb8, 0x58, 0x9c, 0x8f, 0x5c, 0x4c, 0x66, + 0x89, 0x96, 0x43, 0xa6, 0xd2, 0x73, 0xa5, 0x66, 0x5f, 0x90, 0x6f, 0x9d, + 0xba, 0xb3, 0xb4, 0x46, 0x6b, 0x9d, 0xb1, 0x6f, 0x72, 0xad, 0x43, 0x6a, + 0x9d, 0x47, 0x63, 0x4a, 0x73, 0x8a, 0x59, 0x72, 0x47, 0x95, 0x85, 0x6b, + 0x6c, 0x66, 0x73, 0xbe, 0x8c, 0x4c, 0x5d, 0xa9, 0xa2, 0x69, 0x94, 0x6c, + 0x48, 0x6d, 0x3f, 0x8d, 0x6d, 0x7f, 0xa0, 0x6d, 0x71, 0x60, 0x8e, 0xb1, + 0x55, 0xad, 0x7d, 0x5b, 0xa0, 0xb7, 0x74, 0xbe, 0x67, 0x83, 0x9e, 0x7d, + 0x9d, 0x76, 0x88, 0x70, 0x99, 0xb0, 0xa1, 0x48, 0xb9, 0x6d, 0x72, 0xac, + 0x79, 0x62, 0x5f, 0x92, 0xc0, 0x50, 0x83, 0x99, 0xc1, 0x8f, 0x6a, 0x70, + 0x8f, 0x51, 0x7c, 0x91, 0x82, 0xc3, 0xc6, 0x3c, 0xa1, 0x86, 0xad, 0x63, + 0x7f, 0x65, 0x3b, 0x5e, 0x54, 0x9b, 0x71, 0x91, 0x65, 0x36, 0x3f, 0x78, + 0x49, 0x64, 0xc4, 0xa3, 0xc0, 0x8d, 0x58, 0x70, 0x76, 0xb2, 0x74, 0x90, + 0x61, 0x7b, 0x4a, 0x64, 0xab, 0x85, 0x8a, 0x44, 0xa2, 0x9b, 0x7b, 0x5a, + 0x8b, 0x46, 0x9d, 0x7b, 0xa3, 0xb2, 0x65, 0x94, 0xb4, 0x4e, 0x98, 0x82, + 0xaf, 0x7c, 0xc8, 0x7c, 0xc0, 0xbd, 0xab, 0x58, 0x8f, 0x72, 0x60, 0x6f, + 0x57, 0x67, 0x68, 0x36, 0x76, 0xaa, 0x93, 0x62, 0x36, 0x92, 0x87, 0xb6, + 0x4e, 0xa0, 0xac, 
0xbb, 0xc8, 0x65, 0x99, 0x84, 0x86, 0x7f, 0x3a, 0x72, + 0x44, 0xba, 0x77, 0xbc, 0xbc, 0x91, 0x77, 0xb0, 0x59, 0x80, 0xae, 0xaf, + 0xa6, 0x9e, 0x54, 0x5d, 0x7f, 0xa2, 0x48, 0xba, 0xa0, 0x5c, 0xa0, 0x78, + 0x81, 0x3a, 0xae, 0xa5, 0xab, 0x58, 0x79, 0x70, 0x75, 0x4a, 0xa5, 0x4a, + 0xc3, 0xb5, 0xa4, 0x9c, 0x7a, 0x52, 0x91, 0x7a, 0x73, 0x69, 0x4e, 0x89, + 0xab, 0x75, 0xc1, 0x79, 0xa9, 0xa1, 0x57, 0x5a, 0x80, 0x8d, 0x9a, 0xb5, + 0x96, 0xaa, 0xba, 0x50, 0x99, 0xb8, 0x82, 0x5b, 0xb8, 0x4b, 0x81, 0xaa, + 0x5f, 0x8f, 0x65, 0x83, 0x6a, 0xa2, 0x7d, 0x69, 0x79, 0x3f, 0xa0, 0xb9, + 0x6d, 0x7d, 0x96, 0x9a, 0xaf, 0x51, 0x4c, 0x56, 0xb5, 0x61, 0x3c, 0xbf, + 0xa0, 0x70, 0x6f, 0x8b, 0xbd, 0xca, 0x9f, 0x61, 0xbc, 0x48, 0xb1, 0x7d, + 0x65, 0xb8, 0x6e, 0xa2, 0x57, 0x80, 0x45, 0xae, 0x70, 0x87, 0x57, 0x45, + 0x86, 0x58, 0x62, 0x60, 0x80, 0x95, 0x5a, 0x59, 0xbe, 0x9e, 0x9a, 0xb5, + 0x50, 0xb1, 0x5f, 0xaa, 0xb8, 0x82, 0x58, 0x96, 0x74, 0xc8, 0x64, 0x79, + 0x9b, 0xb1, 0xb0, 0x7f, 0xac, 0xb2, 0xb1, 0x45, 0xa0, 0x60, 0xb3, 0x47, + 0x45, 0x52, 0x85, 0xa7, 0x51, 0x7b, 0x9c, 0x37, 0xa7, 0x4e, 0xc7, 0x9f, + 0x3d, 0xaf, 0xc3, 0xbd, 0xca, 0x77, 0x89, 0x86, 0x96, 0xbe, 0x78, 0x5f, + 0x61, 0xb5, 0xc1, 0xbe, 0x9f, 0x51, 0x91, 0xb3, 0x8a, 0x62, 0x4b, 0x40, + 0x27, 0x5f, 0x30, 0x31, 0xb8, 0x98, 0x3f, 0x45, 0x2f, 0x48, 0x94, 0x49, + 0x6c, 0xa5, 0x90, 0x50, 0x6f, 0x57, 0x6b, 0x8c, 0x8b, 0x4f, 0xb5, 0x99, + 0xa1, 0x8f, 0x52, 0x8d, 0x81, 0x5e, 0x80, 0x50, 0xac, 0x63, 0x6b, 0x44, + 0x59, 0x8f, 0x37, 0x72, 0x47, 0x95, 0x4d, 0x46, 0xc1, 0x61, 0x79, 0xa8, + 0xa9, 0x53, 0x84, 0x60, 0x41, 0x82, 0x4c, 0x94, 0x91, 0x62, 0x82, 0x56, + 0x82, 0x7f, 0x8a, 0x46, 0x9f, 0xab, 0x6a, 0x7c, 0x8b, 0x8f, 0x9d, 0xbc, + 0x6a, 0x81, 0x63, 0xa6, 0xac, 0x93, 0x81, 0xa9, 0x8d, 0x71, 0xb8, 0x5a, + 0x87, 0x97, 0x99, 0x9a, 0x43, 0x97, 0x98, 0x56, 0x7c, 0x9d, 0x4a, 0x9d, + 0x98, 0x3e, 0x73, 0x56, 0x94, 0x51, 0x45, 0x84, 0x64, 0x8d, 0x73, 0xac, + 0xbc, 0x78, 0x9f, 0xac, 0x57, 0x5d, 0x96, 0x94, 0xa2, 0x9b, 0xca, 0xa1, + 0x52, 0x7f, 0x7d, 0xb0, 0x40, 0x51, 0xa0, 0x9b, 0x5d, 0x7d, 0x5a, 0x88, + 0xa0, 0xb8, 0xa3, 0x95, 0xa2, 0x7a, 0xc4, 0xbe, 0x4a, 0xad, 0x9c, 0xb5, + 0x6b, 0x4a, 0x9f, 0x53, 0xab, 0x9d, 0x4e, 0xba, 0x93, 0x9f, 0x71, 0xaa, + 0xb7, 0x7d, 0x3f, 0x89, 0xa9, 0xb9, 0x5d, 0x9f, 0x48, 0x7a, 0x3d, 0x6e, + 0xad, 0x6b, 0xc3, 0x48, 0x93, 0xd2, 0xce, 0xc0, 0x66, 0x97, 0xa5, 0xa8, + 0x37, 0x89, 0x4e, 0x9f, 0x5f, 0x86, 0xa9, 0xad, 0xc1, 0x66, 0x57, 0x6b, + 0x6b, 0x5d, 0x3e, 0xc4, 0xb9, 0x67, 0x61, 0x7f, 0xc8, 0xab, 0x7e, 0x68, + 0x56, 0x59, 0x6f, 0x63, 0x83, 0x61, 0x78, 0x44, 0x71, 0x97, 0x9d, 0x56, + 0xa3, 0x5e, 0x84, 0x82, 0x93, 0x7c, 0x7a, 0xa9, 0x98, 0x88, 0x6d, 0x86, + 0x93, 0x5d, 0x77, 0xaa, 0xcb, 0x80, 0x91, 0xa3, 0x85, 0x7d, 0x99, 0x30, + 0x24, 0x6f, 0xad, 0x39, 0x4d, 0xa5, 0xb9, 0x79, 0x9e, 0x6b, 0x8c, 0xa7, + 0x38, 0x35, 0xb9, 0x8b, 0x56, 0x67, 0x4a, 0xc2, 0x4d, 0xbf, 0x63, 0x6a, + 0x94, 0x97, 0x9e, 0x58, 0x8c, 0x78, 0x9d, 0x9c, 0xaa, 0x59, 0xb1, 0x7a, + 0x94, 0x49, 0x4e, 0x53, 0x75, 0x86, 0xbb, 0x50, 0xb3, 0x61, 0x67, 0x6b, + 0xaf, 0xb3, 0x56, 0x80, 0xc7, 0xbe, 0x74, 0x80, 0x9c, 0x80, 0x6b, 0x9e, + 0x75, 0x8a, 0x73, 0x7e, 0x8b, 0x69, 0x8a, 0x90, 0x44, 0xb4, 0x7f, 0x6a, + 0x66, 0x46, 0x99, 0x75, 0x83, 0x46, 0xb8, 0x76, 0x82, 0xa2, 0xaf, 0x8c, + 0x6d, 0x5b, 0x86, 0x5a, 0x57, 0x87, 0x91, 0x90, 0x75, 0xb6, 0x73, 0x80, + 0x87, 0x77, 0x89, 0x4f, 0x98, 0x77, 0x4c, 0x6e, 0x3c, 0x4d, 0x62, 0x88, + 0x4d, 0x3b, 0x6b, 0x77, 0x6d, 0x52, 0x6c, 0x9b, 0x5a, 0x84, 0x8c, 0xa6, + 0x5d, 0x7b, 0x34, 0xa7, 0x9b, 0x3c, 0xa0, 0x8a, 0xa3, 0x9c, 0x41, 0xb1, + 0x53, 0x88, 0x75, 
0x2e, 0x6e, 0x65, 0xc1, 0x70, 0x6e, 0xa8, 0x4c, 0x5c, + 0x64, 0x49, 0x81, 0x4f, 0x4b, 0x4d, 0x4d, 0xae, 0x65, 0x72, 0x72, 0x69, + 0x8b, 0xab, 0xa2, 0x57, 0x54, 0x4f, 0x86, 0x65, 0x8d, 0x3e, 0x57, 0x9d, + 0x50, 0x72, 0x50, 0xb9, 0x9b, 0x5d, 0x51, 0x82, 0x46, 0x70, 0x4f, 0xa5, + 0x60, 0xab, 0x6c, 0x95, 0x9b, 0x37, 0x48, 0x43, 0x9c, 0x6a, 0x7a, 0x43, + 0x40, 0x77, 0xa5, 0xb0, 0x4d, 0x93, 0xd0, 0xb2, 0x48, 0x4f, 0xa0, 0xcd, + 0x7c, 0x5e, 0xb9, 0x57, 0x8e, 0x3d, 0x3a, 0xa1, 0x68, 0xc5, 0xa0, 0x86, + 0x93, 0x76, 0xa2, 0xab, 0x3d, 0x91, 0x6f, 0x4a, 0x90, 0xb4, 0x6d, 0x5f, + 0x99, 0x95, 0x44, 0x3f, 0x8c, 0x47, 0x89, 0xa8, 0x87, 0xaa, 0x91, 0x72, + 0x56, 0x5c, 0x50, 0x7c, 0xae, 0x48, 0xbc, 0x59, 0x75, 0x43, 0xb3, 0x94, + 0xa9, 0x90, 0x86, 0x9e, 0x72, 0xab, 0x63, 0x98, 0xa8, 0x3d, 0x4c, 0x5c, + 0x75, 0x56, 0xae, 0x94, 0xb1, 0x71, 0x62, 0x3c, 0x72, 0xbc, 0x66, 0x49, + 0x7d, 0x4e, 0x76, 0xb7, 0x44, 0x93, 0x47, 0x4c, 0x59, 0x96, 0x50, 0xa3, + 0x44, 0xbf, 0x3a, 0xc0, 0x72, 0x5d, 0x80, 0xa5, 0x3e, 0xac, 0x93, 0xb0, + 0x47, 0xb1, 0x58, 0x60, 0x64, 0x6b, 0x52, 0x46, 0x42, 0x3e, 0xa3, 0x42, + 0xa2, 0x39, 0x8d, 0x46, 0x99, 0xb2, 0xb1, 0xa0, 0xbc, 0x7c, 0x8e, 0x8a, + 0x58, 0x76, 0x46, 0x85, 0x62, 0xb1, 0x7c, 0x6b, 0x6e, 0x91, 0xb2, 0x3b, + 0xa3, 0x6a, 0x9a, 0x89, 0x72, 0x64, 0x86, 0x8c, 0x98, 0x4e, 0x8e, 0xae, + 0x4a, 0xc2, 0x9f, 0x4c, 0x73, 0x9e, 0xca, 0x43, 0x58, 0xa9, 0xb0, 0x8b, + 0xa2, 0xb6, 0xae, 0xb6, 0x6c, 0xaf, 0xca, 0x6e, 0xbf, 0xba, 0x71, 0xb1, + 0x52, 0x8e, 0xae, 0xb6, 0x8f, 0x86, 0x63, 0x7b, 0xb4, 0x4b, 0x94, 0x5d, + 0x44, 0x79, 0xb1, 0x81, 0xaf, 0x76, 0x8e, 0x51, 0x65, 0x6f, 0x44, 0x63, + 0xa5, 0xb1, 0x5a, 0xa8, 0x8e, 0x82, 0xb1, 0x7c, 0x5d, 0xae, 0xbd, 0x66, + 0x5a, 0x50, 0x7b, 0xa5, 0x51, 0x60, 0x91, 0x9c, 0xb5, 0xbf, 0x3c, 0xa9, + 0x58, 0x91, 0x84, 0xc3, 0x43, 0x6c, 0x6e, 0xb1, 0xaa, 0x31, 0x87, 0x68, + 0x58, 0xa0, 0x63, 0x87, 0x61, 0xb7, 0x7b, 0x40, 0x5b, 0x85, 0x38, 0x33, + 0xaf, 0x9e, 0xb9, 0x79, 0x61, 0xb2, 0x89, 0x86, 0x3e, 0x80, 0x62, 0x9e, + 0x87, 0x65, 0xa1, 0xa3, 0x39, 0x7c, 0x7b, 0xb4, 0xaa, 0x8f, 0x84, 0x9f, + 0x81, 0x6d, 0x70, 0xaf, 0x64, 0xa3, 0x7f, 0xac, 0x81, 0xb2, 0x7e, 0x4d, + 0x44, 0x5d, 0x4f, 0x45, 0x69, 0x7d, 0x55, 0x3e, 0x67, 0x4a, 0x98, 0x45, + 0x87, 0x46, 0x79, 0x6e, 0x3e, 0x3f, 0x51, 0xb4, 0x5b, 0x61, 0x47, 0x64, + 0x8b, 0x83, 0x92, 0x52, 0xb8, 0x9d, 0x60, 0x7f, 0xc8, 0xba, 0x81, 0x7d, + 0xb4, 0xae, 0x98, 0x55, 0x8e, 0x66, 0x5d, 0xb3, 0x65, 0x7f, 0x77, 0x50, + 0x93, 0x5b, 0xcd, 0xb5, 0x6a, 0x8a, 0x64, 0x90, 0xd6, 0x7f, 0x8b, 0x8e, + 0x50, 0xd4, 0xe3, 0x7e, 0x97, 0xb1, 0x98, 0x90, 0x6b, 0x8e, 0xa0, 0x41, + 0x5c, 0x5d, 0x7f, 0x4e, 0x62, 0x67, 0x67, 0x99, 0xa8, 0x8b, 0x85, 0x9b, + 0x59, 0x69, 0x65, 0x70, 0x56, 0x92, 0xaf, 0xa2, 0x89, 0x68, 0xa7, 0x8e, + 0x4d, 0x65, 0x8a, 0x4f, 0x9f, 0x94, 0x5d, 0x59, 0x6e, 0x6a, 0xa7, 0x5d, + 0x2f, 0x8c, 0xa5, 0x9c, 0x9f, 0x69, 0x69, 0xa4, 0xbf, 0x4e, 0x85, 0x38, + 0x8c, 0x85, 0x49, 0x95, 0x48, 0xa6, 0x5c, 0xb4, 0xb1, 0x64, 0x92, 0xb4, + 0xbf, 0x46, 0xb2, 0x52, 0x80, 0x40, 0xa1, 0xc4, 0xb4, 0x46, 0x4f, 0x5b, + 0x4c, 0x55, 0x9b, 0xaf, 0x6d, 0x3e, 0xa5, 0x9a, 0x53, 0x5a, 0x9f, 0x46, + 0x8d, 0x89, 0x85, 0x94, 0x3b, 0x74, 0x94, 0x66, 0x7e, 0x87, 0x74, 0x51, + 0x79, 0x9e, 0xb0, 0x67, 0x81, 0x4b, 0xa2, 0x42, 0x67, 0x4e, 0x56, 0x4c, + 0x97, 0x9b, 0x70, 0x89, 0x82, 0x94, 0xa6, 0x46, 0x78, 0x99, 0x9a, 0x65, + 0x74, 0x7b, 0x77, 0x5d, 0x78, 0xbc, 0xa2, 0x9e, 0x4e, 0x5a, 0x7f, 0x59, + 0x50, 0x80, 0x9e, 0xb6, 0x49, 0x63, 0x88, 0x85, 0xab, 0xb7, 0x78, 0x66, + 0x6e, 0xb4, 0x43, 0x9d, 0x3d, 0x82, 0x52, 0x95, 0x8f, 0x58, 0x78, 0x53, + 0x99, 0x58, 0x43, 
0x66, 0x67, 0x66, 0x9b, 0x85, 0xb5, 0x87, 0x75, 0x85, + 0x6d, 0x79, 0x74, 0x61, 0xb8, 0xa7, 0x94, 0xcc, 0xba, 0x48, 0x7f, 0x81, + 0x77, 0x79, 0x63, 0x97, 0x5a, 0x43, 0xa5, 0x5a, 0xa1, 0x99, 0x82, 0xbe, + 0xb9, 0x72, 0x80, 0x7e, 0x50, 0x59, 0x54, 0x5a, 0xcb, 0xa6, 0x37, 0x38, + 0x9c, 0x90, 0xb1, 0xe8, 0xe7, 0x3d, 0x51, 0x64, 0xa2, 0x6a, 0x8d, 0xa8, + 0x77, 0x49, 0x44, 0x97, 0x6c, 0x6e, 0xa3, 0xac, 0x4a, 0x68, 0xca, 0xba, + 0x8f, 0x5d, 0x7d, 0x4e, 0xbe, 0x84, 0x74, 0x7a, 0x63, 0x72, 0x67, 0xbb, + 0xa6, 0xb8, 0x4e, 0x94, 0x88, 0xaf, 0x86, 0xc8, 0x88, 0xc3, 0x91, 0x39, + 0x79, 0x75, 0xa7, 0xad, 0x7e, 0x8d, 0xd1, 0x7f, 0x5a, 0x9e, 0x5a, 0x86, + 0x6a, 0x63, 0x35, 0x41, 0x39, 0x7e, 0x6f, 0x7e, 0x4b, 0x51, 0x7c, 0xb5, + 0x9d, 0x84, 0x6f, 0x61, 0x64, 0x57, 0x89, 0x86, 0x4c, 0x95, 0x64, 0x83, + 0x87, 0xba, 0x9a, 0xb4, 0x7e, 0x9b, 0x6b, 0xad, 0x78, 0xa8, 0x72, 0x3a, + 0x59, 0xb9, 0x9c, 0xbf, 0x3d, 0x2f, 0x9b, 0x4f, 0x9d, 0x6c, 0x9a, 0x93, + 0x2e, 0xaa, 0xa9, 0x60, 0x79, 0xb2, 0x6e, 0x70, 0xa9, 0x85, 0x7f, 0x59, + 0x9a, 0x96, 0xa4, 0x6f, 0xad, 0xc0, 0xc6, 0xde, 0xc3, 0x88, 0x61, 0x94, + 0x6a, 0x8a, 0x4e, 0xac, 0x5c, 0x9e, 0x90, 0x42, 0x48, 0x65, 0x94, 0x88, + 0x48, 0x4f, 0x99, 0x6d, 0x5e, 0x61, 0x58, 0xa3, 0x48, 0x91, 0x6d, 0x73, + 0xbe, 0xb2, 0x68, 0xac, 0x9c, 0xaa, 0x65, 0x77, 0xca, 0x7d, 0x67, 0xb3, + 0xa4, 0xbd, 0xb3, 0x89, 0x40, 0x6c, 0x49, 0xa1, 0xaa, 0x8f, 0xb1, 0x72, + 0xb9, 0xb2, 0x55, 0xda, 0x79, 0x9a, 0x7e, 0x5d, 0x77, 0x95, 0x46, 0x51, + 0x89, 0x83, 0x7c, 0x83, 0xa0, 0x9b, 0x7f, 0xa5, 0xa2, 0x4e, 0xa1, 0x35, + 0xb8, 0x87, 0x8e, 0x8e, 0x7f, 0xb8, 0x76, 0xa9, 0x62, 0xa0, 0xae, 0x5d, + 0x71, 0xa1, 0xb3, 0xbf, 0x5e, 0x8a, 0x99, 0xa5, 0xb2, 0xa4, 0x68, 0x42, + 0x99, 0x95, 0xe0, 0xcb, 0xc0, 0x8a, 0x50, 0x4d, 0x56, 0x9c, 0x87, 0xb7, + 0xd1, 0x43, 0x79, 0xac, 0x58, 0x8c, 0xae, 0x5f, 0xae, 0x93, 0x88, 0x42, + 0x74, 0x6d, 0x74, 0x6b, 0x90, 0xb9, 0xa4, 0x47, 0x36, 0x54, 0x6a, 0x79, + 0x64, 0xae, 0x7f, 0x5d, 0x7f, 0xbe, 0xa0, 0x4d, 0x9a, 0x66, 0x65, 0x95, + 0x56, 0x61, 0x5f, 0x96, 0x9f, 0x83, 0x6c, 0x89, 0x37, 0xc1, 0x90, 0x55, + 0x47, 0x49, 0xb7, 0xa3, 0xba, 0x1e, 0x34, 0x4a, 0xcc, 0x5a, 0x49, 0x98, + 0x90, 0x84, 0x65, 0x29, 0xb8, 0x8e, 0x52, 0x76, 0xa6, 0x7c, 0x38, 0x3b, + 0xb0, 0xa8, 0x54, 0x87, 0xb9, 0x71, 0xbf, 0x81, 0x33, 0x88, 0x81, 0xad, + 0x74, 0x39, 0x90, 0x61, 0x78, 0x6f, 0x44, 0x4e, 0xa2, 0xaa, 0x60, 0x7f, + 0x64, 0x41, 0x6f, 0x43, 0xc8, 0xc5, 0x77, 0x3f, 0x97, 0x72, 0x83, 0x80, + 0x3e, 0x6c, 0x54, 0x93, 0xa2, 0x3c, 0x80, 0x94, 0xc3, 0x3f, 0x68, 0x57, + 0x7f, 0x53, 0x6d, 0x43, 0x6e, 0x51, 0xac, 0x8b, 0x4c, 0xa0, 0x73, 0x71, + 0xaa, 0xa4, 0x42, 0x3d, 0x52, 0xa3, 0xb5, 0xb6, 0x5a, 0xb5, 0xb2, 0x74, + 0xa8, 0x45, 0x52, 0x9d, 0x7b, 0x7d, 0x77, 0xa9, 0x8f, 0x9d, 0x9e, 0x39, + 0x99, 0x97, 0x5c, 0x91, 0x65, 0x78, 0x6a, 0x70, 0xa0, 0x8a, 0x71, 0x83, + 0x9d, 0x93, 0x4b, 0x7e, 0x85, 0xb0, 0x7d, 0x49, 0xc6, 0xa0, 0x9a, 0x8f, + 0x43, 0xa0, 0x9d, 0x8b, 0x64, 0x91, 0xb0, 0xa0, 0x66, 0x9a, 0x99, 0xc4, + 0x5c, 0x86, 0x82, 0x3d, 0x4d, 0x55, 0xaa, 0x87, 0x7f, 0x95, 0xca, 0x2e, + 0x64, 0x5d, 0x8e, 0x4e, 0x9b, 0xa8, 0x34, 0xc1, 0x97, 0x9b, 0xc9, 0x80, + 0x84, 0x4c, 0x6a, 0xa2, 0x6a, 0x56, 0x36, 0x4d, 0xbf, 0x71, 0x86, 0x91, + 0x83, 0xa3, 0x3c, 0x50, 0x73, 0x35, 0xb2, 0xaf, 0x54, 0x83, 0x5e, 0xb8, + 0xac, 0x84, 0x55, 0x4a, 0x57, 0xb7, 0xac, 0x6f, 0xb5, 0xa2, 0x61, 0xa4, + 0x7b, 0x8e, 0x42, 0x66, 0x6d, 0x87, 0x67, 0x83, 0xad, 0x3c, 0x3f, 0xbe, + 0x89, 0x6f, 0x24, 0x69, 0xc4, 0x7d, 0x53, 0x3f, 0x81, 0x62, 0xa3, 0x39, + 0x3c, 0x7c, 0x6d, 0x93, 0x44, 0x65, 0xbe, 0xb4, 0x9e, 0x82, 0xbf, 0x77, + 0x61, 0xa2, 0x3f, 
0xc0, 0x90, 0x34, 0x89, 0xb1, 0xc0, 0xa2, 0x6b, 0xcb, + 0x8a, 0x65, 0x7d, 0x3a, 0x7d, 0x74, 0x78, 0x75, 0x8e, 0x8f, 0x46, 0xc3, + 0x40, 0x5b, 0x79, 0x46, 0x9a, 0x41, 0xa1, 0x71, 0x51, 0x5b, 0x74, 0xa2, + 0x93, 0x9a, 0x3b, 0x8c, 0xa4, 0xa0, 0x62, 0x6b, 0x8f, 0xd1, 0x4a, 0x78, + 0x78, 0xde, 0x88, 0x58, 0x42, 0x6c, 0xc0, 0x51, 0x47, 0x79, 0xc4, 0x5b, + 0xb7, 0x60, 0x2e, 0x37, 0xa3, 0xd2, 0xae, 0x3b, 0x41, 0xd0, 0x53, 0x99, + 0x8f, 0x4c, 0xa3, 0x74, 0xaf, 0x7c, 0x8f, 0x48, 0x97, 0x52, 0xc8, 0xc3, + 0xc0, 0x61, 0x9c, 0x87, 0x6a, 0x9d, 0x80, 0x75, 0x7f, 0xa3, 0x30, 0x98, + 0xb6, 0xba, 0xaa, 0x75, 0x33, 0x80, 0x81, 0xa8, 0x3a, 0x95, 0x57, 0x42, + 0x32, 0x4e, 0xa4, 0x81, 0x9a, 0x8e, 0x94, 0xa0, 0xb4, 0xbf, 0x5e, 0x6f, + 0xc0, 0x7c, 0x7f, 0x70, 0xa8, 0xa9, 0x87, 0x6a, 0x7c, 0xbb, 0x75, 0x85, + 0x57, 0x42, 0x7b, 0xac, 0x99, 0xa2, 0xa7, 0x50, 0x33, 0x8a, 0x43, 0xaa, + 0x4f, 0x42, 0x81, 0x44, 0xa0, 0x97, 0x3b, 0x53, 0x81, 0x59, 0x6c, 0x80, + 0x75, 0x6b, 0x8a, 0x39, 0x70, 0x80, 0x61, 0x54, 0x94, 0x4b, 0xbb, 0x44, + 0x81, 0x5e, 0xb0, 0xad, 0x8a, 0x7d, 0x70, 0x69, 0xad, 0x4a, 0x66, 0x9f, + 0x76, 0xc9, 0x48, 0x4a, 0x95, 0xb6, 0x39, 0x4a, 0x59, 0x90, 0x9c, 0x97, + 0x50, 0x47, 0xb7, 0x98, 0xa1, 0xb7, 0x59, 0x53, 0x4b, 0xab, 0x63, 0x89, + 0x99, 0x8b, 0xa7, 0x96, 0x60, 0x6a, 0x6c, 0xaa, 0x4d, 0xa4, 0x88, 0x3b, + 0xa2, 0x44, 0x94, 0x80, 0xaa, 0xa6, 0x50, 0x88, 0x92, 0x78, 0x93, 0xb0, + 0x4c, 0x9e, 0x9a, 0x4f, 0x65, 0x5e, 0xac, 0x2e, 0x64, 0x62, 0x3d, 0xaa, + 0x6a, 0xbb, 0x48, 0x69, 0xae, 0x7a, 0x7b, 0x53, 0x7f, 0x9e, 0xb6, 0x7e, + 0xbc, 0x8e, 0x62, 0x9d, 0x99, 0x56, 0x8c, 0x99, 0xa7, 0x60, 0x8d, 0xc5, + 0x72, 0x31, 0x29, 0xb9, 0xae, 0x9d, 0x4e, 0x4a, 0x54, 0x50, 0xb6, 0xcd, + 0x7b, 0x83, 0x94, 0xce, 0xac, 0x87, 0x52, 0xbf, 0x49, 0x3e, 0x6c, 0x35, + 0xc1, 0xb1, 0xa8, 0x4e, 0x46, 0xa8, 0xb7, 0x8a, 0xac, 0x5f, 0xb2, 0x40, + 0x73, 0x76, 0xac, 0x6c, 0xb9, 0x77, 0x90, 0x71, 0x4c, 0x74, 0xb2, 0x68, + 0x8f, 0x8f, 0x70, 0xa4, 0x84, 0x9a, 0x45, 0x3d, 0x8f, 0x70, 0x68, 0x79, + 0x4d, 0x91, 0x99, 0x7d, 0x52, 0xab, 0x90, 0x9b, 0x91, 0x87, 0x4e, 0x79, + 0x7a, 0xb6, 0xc3, 0x41, 0x82, 0x75, 0x91, 0xaa, 0xa9, 0x97, 0x61, 0x40, + 0x8f, 0x60, 0xac, 0x6c, 0x68, 0x9a, 0x92, 0x7b, 0x5d, 0x4d, 0x41, 0x85, + 0x56, 0x60, 0x4c, 0xc7, 0x46, 0x6b, 0x44, 0x54, 0xc9, 0x52, 0xae, 0xd3, + 0x54, 0xc7, 0x8d, 0x5c, 0x87, 0x81, 0x85, 0x5f, 0x7b, 0xb2, 0xbf, 0x76, + 0x96, 0xba, 0x55, 0x59, 0x7f, 0x85, 0xb1, 0x75, 0xb5, 0xad, 0x9d, 0x44, + 0xb5, 0x5d, 0xb6, 0x82, 0x9f, 0x80, 0xad, 0x75, 0xab, 0x9b, 0x5f, 0x46, + 0xb9, 0x92, 0x80, 0x8d, 0xbf, 0xb8, 0x3d, 0xa8, 0x34, 0x42, 0x64, 0x61, + 0x84, 0x55, 0x53, 0x8b, 0x59, 0x56, 0x86, 0xa5, 0x92, 0x50, 0x60, 0xbc, + 0x65, 0xb4, 0x79, 0x3a, 0x46, 0x95, 0x60, 0x57, 0x66, 0xb3, 0x95, 0x68, + 0x7a, 0xa0, 0x96, 0x4b, 0x88, 0x76, 0x8a, 0x48, 0xa1, 0x65, 0x80, 0xb7, + 0x61, 0x69, 0xb8, 0x81, 0xac, 0x99, 0x92, 0x4e, 0x39, 0x4f, 0x83, 0x4f, + 0xb6, 0xb8, 0xc4, 0x97, 0x4c, 0x57, 0xa5, 0xaf, 0xaf, 0x3a, 0xb6, 0x6a, + 0x2c, 0x71, 0x49, 0xbc, 0xc2, 0xc3, 0xaa, 0xc2, 0xac, 0xb3, 0xbb, 0x72, + 0x5e, 0xc1, 0x5b, 0x8a, 0x79, 0xb6, 0x8a, 0x95, 0x9f, 0x4b, 0x84, 0x42, + 0x94, 0x99, 0xd1, 0x51, 0x7d, 0x8f, 0xb3, 0x42, 0x6f, 0x54, 0x50, 0x59, + 0x68, 0x45, 0x46, 0xa0, 0x92, 0xcb, 0x6d, 0xc6, 0x9b, 0xc0, 0xa6, 0xb1, + 0x71, 0x6d, 0x50, 0x50, 0xaa, 0xc6, 0xbc, 0xab, 0x63, 0x4a, 0x63, 0xa9, + 0x97, 0xb3, 0x90, 0x51, 0x76, 0x92, 0x40, 0x4c, 0x84, 0x2a, 0x74, 0xbc, + 0x5c, 0x4e, 0x8b, 0x8f, 0xa0, 0x8c, 0x87, 0x64, 0x49, 0x4c, 0x8b, 0xbd, + 0xba, 0x90, 0x6b, 0x3d, 0x96, 0x8c, 0x8a, 0x85, 0x5b, 0x38, 0x82, 0x7f, + 0x9d, 0xb9, 0xb9, 
0x9d, 0x75, 0xbb, 0x81, 0x86, 0xbb, 0xc8, 0x89, 0x8d, + 0x8e, 0xb0, 0xa9, 0x69, 0x5f, 0x67, 0x54, 0xab, 0x84, 0x66, 0x96, 0x81, + 0x89, 0x41, 0x7a, 0xb7, 0x71, 0x4d, 0xb2, 0x9f, 0x8b, 0xb2, 0xad, 0xc7, + 0x7f, 0x35, 0xac, 0x83, 0x67, 0xa2, 0x70, 0x4e, 0x72, 0x90, 0xa9, 0x6e, + 0x5c, 0xc1, 0x6b, 0x6f, 0xab, 0xa3, 0xc1, 0x6a, 0x9e, 0x41, 0x58, 0x66, + 0x75, 0x82, 0xae, 0x40, 0x3e, 0x4e, 0x55, 0x41, 0x36, 0x67, 0x55, 0x90, + 0x4e, 0xd2, 0x89, 0xa2, 0x95, 0x9b, 0x5f, 0x55, 0x78, 0x56, 0x7e, 0xa9, + 0x8b, 0x86, 0x4c, 0x45, 0x74, 0x99, 0x88, 0x63, 0x92, 0xac, 0x6d, 0x5a, + 0x5d, 0x85, 0x8b, 0xa2, 0x51, 0x8f, 0xa3, 0x51, 0x50, 0x9a, 0x44, 0x6e, + 0x6b, 0xba, 0x7e, 0xbe, 0x63, 0x75, 0x6d, 0x3f, 0xa5, 0x77, 0x54, 0x9c, + 0xb1, 0x53, 0xc4, 0x76, 0xc6, 0x50, 0x67, 0xca, 0xc7, 0x98, 0xbb, 0x5e, + 0xbd, 0x97, 0x88, 0x40, 0x53, 0x39, 0x30, 0x57, 0xae, 0x8c, 0x5b, 0x9c, + 0x88, 0x95, 0xc9, 0x88, 0x4e, 0x3b, 0x9d, 0x58, 0x93, 0x7f, 0xa9, 0x54, + 0xaa, 0xa8, 0x5f, 0xda, 0x7a, 0xa6, 0xbe, 0x7f, 0x79, 0xb1, 0x4f, 0xc7, + 0x58, 0x53, 0x7c, 0x75, 0x4e, 0x5f, 0x6c, 0x95, 0x55, 0x82, 0x5b, 0x84, + 0x4d, 0x59, 0x3b, 0x72, 0x48, 0x63, 0x62, 0x7f, 0x90, 0x37, 0x9a, 0x7d, + 0xae, 0x74, 0x97, 0x3e, 0xa8, 0xa4, 0x56, 0x60, 0x65, 0x52, 0x83, 0x95, + 0x80, 0x3a, 0x85, 0xbe, 0xa5, 0xc8, 0x3f, 0x6b, 0xb0, 0xba, 0x4b, 0x7b, + 0x58, 0xbf, 0xb3, 0x55, 0x99, 0x42, 0x4a, 0xa0, 0x67, 0x58, 0x5f, 0x5b, + 0x8b, 0xd5, 0xc0, 0x48, 0x5f, 0xae, 0x9c, 0x67, 0x91, 0x36, 0x74, 0x39, + 0x79, 0xb2, 0x9d, 0x8a, 0x2a, 0x5c, 0x9e, 0x4f, 0x29, 0x4a, 0x5e, 0xc6, + 0xbf, 0x86, 0x2f, 0x86, 0x6e, 0xa0, 0xb2, 0x7f, 0x66, 0xba, 0x62, 0x76, + 0xc2, 0x9a, 0xa2, 0xb1, 0x90, 0xc4, 0xb2, 0x3b, 0x7e, 0x72, 0x91, 0x8d, + 0x60, 0x65, 0xca, 0x89, 0x6b, 0x97, 0xab, 0xa8, 0xb5, 0xb4, 0x64, 0xb9, + 0x68, 0x53, 0xac, 0xa1, 0x78, 0x54, 0xa8, 0x3a, 0x87, 0xbc, 0xba, 0x84, + 0x5e, 0x7a, 0x5b, 0x6a, 0x66, 0x55, 0xae, 0xb8, 0x93, 0x5b, 0xab, 0x83, + 0x8a, 0x84, 0x5a, 0x6c, 0x6c, 0xbe, 0x7e, 0x64, 0x48, 0x9e, 0x9b, 0x8c, + 0x8a, 0x87, 0x57, 0x8d, 0xd0, 0x6e, 0x9e, 0x8c, 0x62, 0x70, 0x45, 0x91, + 0x7f, 0xa6, 0x78, 0x84, 0x74, 0x39, 0x82, 0x67, 0x72, 0xad, 0x8b, 0x5d, + 0x7f, 0xb9, 0x3c, 0x70, 0x57, 0xbd, 0x41, 0x64, 0x5f, 0x98, 0x77, 0x98, + 0x3e, 0xb2, 0x8e, 0x53, 0x99, 0x89, 0xaf, 0xa8, 0x66, 0xa7, 0x53, 0x4a, + 0xc7, 0x98, 0x86, 0x59, 0x39, 0x98, 0x8a, 0xab, 0xab, 0x49, 0x8d, 0x5c, + 0x82, 0x95, 0x62, 0xa3, 0x64, 0x49, 0x74, 0x77, 0x68, 0x75, 0xac, 0x67, + 0x86, 0xba, 0x5f, 0x6c, 0x9b, 0xb5, 0x98, 0xcf, 0x8c, 0xc4, 0x8a, 0xd5, + 0x8f, 0x85, 0x65, 0x6f, 0xac, 0xb4, 0x4e, 0x67, 0x9f, 0xc3, 0x7e, 0xc8, + 0x6b, 0x9c, 0x48, 0x85, 0x6d, 0x77, 0x5d, 0xb4, 0x8a, 0xcc, 0x6f, 0x7e, + 0x8d, 0x95, 0x79, 0x43, 0x3d, 0x9f, 0x85, 0x59, 0x34, 0xaf, 0x8e, 0x31, + 0x72, 0x48, 0x8d, 0xca, 0x83, 0x40, 0x63, 0x6d, 0x67, 0x65, 0x7a, 0xae, + 0x78, 0x5b, 0x8e, 0x84, 0x87, 0x7e, 0xbc, 0x4c, 0xa8, 0xd3, 0x8b, 0x8e, + 0xd6, 0xbb, 0xb8, 0x9a, 0x7d, 0x59, 0x65, 0xa6, 0x7f, 0x57, 0x55, 0x5b, + 0x6f, 0x9a, 0x66, 0x45, 0x64, 0x98, 0xa1, 0xc3, 0x90, 0xb4, 0x6a, 0x79, + 0x80, 0x62, 0x90, 0x68, 0x79, 0x8d, 0xdc, 0x95, 0xad, 0x76, 0x37, 0xac, + 0x8b, 0x9e, 0x54, 0x2f, 0x4f, 0x90, 0x84, 0x3e, 0x90, 0x45, 0x9b, 0x4e, + 0xc8, 0x88, 0x9f, 0x95, 0x6a, 0x64, 0xa0, 0x40, 0xb6, 0x58, 0x9c, 0x8e, + 0xa5, 0x46, 0x79, 0xbf, 0x68, 0x6f, 0x73, 0x63, 0x34, 0x97, 0x57, 0x93, + 0x8a, 0x46, 0xac, 0xb1, 0xb1, 0x8e, 0x5b, 0x5a, 0x82, 0x66, 0xb5, 0x93, + 0x76, 0x36, 0x88, 0x9e, 0x62, 0x8d, 0x6c, 0x4b, 0x81, 0x7c, 0x8f, 0x75, + 0x92, 0x39, 0x8f, 0x81, 0x82, 0x79, 0x8d, 0x6c, 0x51, 0x99, 0x7b, 0x86, + 0x98, 0x80, 0x86, 
0xb6, 0xa0, 0x8b, 0x8f, 0xa3, 0x8c, 0x4b, 0x74, 0x71, + 0x64, 0x4b, 0x59, 0x7a, 0xc2, 0xb5, 0x73, 0x9b, 0x95, 0x45, 0x46, 0xcb, + 0x74, 0xad, 0xa5, 0x6f, 0xc5, 0xc8, 0x93, 0x5a, 0x7b, 0x75, 0x9d, 0x4c, + 0x65, 0x59, 0xa2, 0x9a, 0x59, 0x4d, 0x69, 0x8d, 0x58, 0x8d, 0x54, 0x90, + 0x4b, 0x47, 0x90, 0xc1, 0x40, 0x5b, 0x9f, 0xb9, 0x64, 0x80, 0x4c, 0x4e, + 0x85, 0x95, 0x59, 0x8c, 0x75, 0x2c, 0xab, 0xb8, 0xbd, 0x5a, 0x52, 0xe2, + 0xd1, 0xbb, 0xb0, 0x79, 0x88, 0x83, 0x55, 0x5d, 0x83, 0x3e, 0x93, 0xb6, + 0x72, 0x64, 0x9b, 0x5f, 0x59, 0x99, 0x43, 0xa1, 0x72, 0xb1, 0x5b, 0xb3, + 0x45, 0x6e, 0x95, 0x7f, 0x6f, 0xa8, 0x56, 0x54, 0x8e, 0x6c, 0x4d, 0x62, + 0x99, 0xaa, 0x58, 0x83, 0x6e, 0x98, 0xa1, 0x8f, 0x70, 0xab, 0xcb, 0x9d, + 0x3d, 0x45, 0xc6, 0x7a, 0x4c, 0x3f, 0xbe, 0xa1, 0x75, 0x59, 0x5a, 0x81, + 0x8c, 0x4a, 0x69, 0x98, 0x66, 0xa7, 0xbb, 0x6a, 0xc4, 0x88, 0x75, 0x68, + 0x79, 0x76, 0x94, 0x3a, 0x66, 0x75, 0x77, 0x7c, 0x87, 0xa9, 0x55, 0x72, + 0x5f, 0xa5, 0xbe, 0xa5, 0x2e, 0x55, 0x50, 0x7f, 0x74, 0x65, 0x73, 0x48, + 0x94, 0x8c, 0x6b, 0xaf, 0x5d, 0x5a, 0xa3, 0x4e, 0x3e, 0x97, 0x5f, 0x7b, + 0x44, 0xa8, 0x46, 0x9c, 0x5a, 0x92, 0x89, 0xa6, 0x4e, 0x8a, 0x84, 0x8d, + 0x54, 0xb2, 0x6f, 0x68, 0x8f, 0x7c, 0x67, 0x58, 0x81, 0xa3, 0x9f, 0xa8, + 0x95, 0x71, 0x4c, 0x71, 0x84, 0x3e, 0x5d, 0x70, 0x98, 0x7e, 0x70, 0x9f, + 0x47, 0x45, 0xa9, 0xb9, 0xae, 0x6b, 0x67, 0x7d, 0x7b, 0x41, 0x8b, 0x71, + 0x48, 0x70, 0x41, 0xa7, 0x72, 0x9d, 0x65, 0xa3, 0x77, 0x3c, 0x71, 0xa3, + 0x85, 0x34, 0x4d, 0x77, 0x8d, 0x57, 0x90, 0x81, 0x5f, 0xba, 0x97, 0x8e, + 0x72, 0x4f, 0xb7, 0x84, 0x88, 0x46, 0x4b, 0x3e, 0x5e, 0x9c, 0x4d, 0x87, + 0x74, 0xb0, 0x73, 0x6d, 0xd6, 0xa5, 0xbd, 0x52, 0xd3, 0xd5, 0x81, 0x8e, + 0x66, 0x90, 0xa3, 0xad, 0x5a, 0x41, 0x82, 0xae, 0x5c, 0x84, 0x79, 0x7e, + 0x7a, 0xa1, 0x88, 0x4b, 0x97, 0x7f, 0x43, 0x66, 0x58, 0x96, 0xae, 0x55, + 0xb0, 0x9a, 0xc4, 0x6b, 0xb1, 0x71, 0x6e, 0x67, 0x43, 0xa4, 0x40, 0x4b, + 0xdb, 0x80, 0x4e, 0x7e, 0xb1, 0x8d, 0x7b, 0x4a, 0x38, 0x69, 0x48, 0x6a, + 0xaf, 0x9c, 0x5f, 0x36, 0xa5, 0xae, 0x61, 0x64, 0xb6, 0x9c, 0xa5, 0x60, + 0xc9, 0x9b, 0xc9, 0xc1, 0xd0, 0x91, 0x68, 0x67, 0x4c, 0x94, 0x4e, 0xba, + 0xb2, 0xa0, 0x98, 0x78, 0xaa, 0xc3, 0xa3, 0x8e, 0x57, 0xc2, 0x40, 0x74, + 0x64, 0x94, 0x98, 0x62, 0x80, 0x5c, 0x58, 0xa2, 0x52, 0x50, 0x9b, 0x6e, + 0x4e, 0x8b, 0xaa, 0x73, 0xa5, 0x7b, 0x94, 0x44, 0x9d, 0x6a, 0xb1, 0xbe, + 0xcb, 0xae, 0xd2, 0xb4, 0x8c, 0x57, 0x83, 0xba, 0xac, 0x6c, 0x3d, 0x87, + 0x6a, 0x3b, 0x43, 0x88, 0xdb, 0x48, 0x4c, 0xac, 0x4b, 0x9d, 0x66, 0x74, + 0x35, 0x66, 0x88, 0xc8, 0x80, 0x3a, 0x64, 0x83, 0x51, 0x72, 0x73, 0x72, + 0x9c, 0x98, 0x3d, 0x9e, 0x50, 0xc3, 0x54, 0x65, 0x9c, 0xc4, 0xc4, 0x7a, + 0x9f, 0xac, 0xd3, 0x91, 0xc1, 0x6d, 0x97, 0x91, 0xac, 0xa0, 0x93, 0x9b, + 0x7f, 0x4b, 0x94, 0x96, 0x4a, 0x50, 0x7b, 0x7a, 0x90, 0x68, 0x62, 0xb5, + 0x68, 0xa2, 0xc6, 0xa6, 0x32, 0x71, 0x68, 0x60, 0x37, 0x36, 0x42, 0x6b, + 0x9d, 0x72, 0xa0, 0x64, 0x79, 0x89, 0xac, 0x7f, 0x50, 0x89, 0x6d, 0x5e, + 0x5b, 0x50, 0x70, 0x92, 0x60, 0x32, 0x6d, 0x8e, 0x52, 0xc3, 0x95, 0x3e, + 0x96, 0xb8, 0x60, 0xa6, 0x42, 0xca, 0xa6, 0x54, 0x3c, 0x43, 0xb3, 0x41, + 0x7e, 0x7e, 0x8c, 0x6f, 0xb7, 0x73, 0xc1, 0x4e, 0x9a, 0x7b, 0x5c, 0x82, + 0x9d, 0xc4, 0xb7, 0x84, 0x84, 0x52, 0x88, 0x55, 0x62, 0x3f, 0x58, 0x81, + 0xb0, 0xb1, 0xa6, 0x75, 0x92, 0x5b, 0x69, 0x58, 0xa0, 0x63, 0x97, 0x95, + 0x9e, 0xaf, 0x92, 0x4a, 0xad, 0x3c, 0x9c, 0x73, 0x7d, 0x9c, 0x74, 0xb0, + 0x89, 0x47, 0x9f, 0x48, 0x79, 0x78, 0xae, 0x60, 0x4a, 0x41, 0x30, 0x3c, + 0x54, 0x49, 0x79, 0, 0x56, 0x78, 0x37, 0x2b, 0x24, 0xce, 0x8d, 0x52, + 0x47, 0xa0, 0xbd, 0x49, 
0xb7, 0x62, 0x6c, 0xcb, 0x69, 0x9a, 0xa1, 0xbd, + 0xae, 0x9a, 0x9b, 0xa1, 0xb4, 0xa5, 0x4a, 0x7f, 0x6d, 0x66, 0xa0, 0x35, + 0x6b, 0x45, 0x52, 0x7e, 0x40, 0x88, 0x4e, 0x4d, 0x84, 0x74, 0x89, 0x4a, + 0xa3, 0x75, 0x98, 0xb8, 0x4c, 0x31, 0x14, 0x23, 0xaa, 0xab, 0x27, 0x26, + 0x9a, 0xae, 0x71, 0x52, 0x5b, 0xeb, 0x81, 0xa6, 0x66, 0xa3, 0x93, 0x5d, + 0x8e, 0xc2, 0x60, 0x35, 0x3e, 0x3f, 0x9a, 0x83, 0x45, 0xc4, 0xac, 0x5c, + 0x46, 0xae, 0x8f, 0xc4, 0x47, 0x81, 0x8f, 0xa8, 0x74, 0x90, 0x5f, 0x70, + 0x44, 0x58, 0x3b, 0x35, 0x85, 0xae, 0x47, 0x56, 0xa3, 0x8f, 0x62, 0x54, + 0x44, 0x6f, 0xaa, 0x42, 0x64, 0x31, 0x89, 0x9a, 0x99, 0x92, 0x94, 0xba, + 0x7c, 0x9a, 0x71, 0x95, 0x62, 0x49, 0x60, 0x5e, 0x53, 0xc7, 0x53, 0x44, + 0x92, 0x71, 0x5f, 0x6a, 0x52, 0x5c, 0xac, 0x96, 0x47, 0xa7, 0xb0, 0x7e, + 0x67, 0x40, 0x5e, 0x52, 0x59, 0x65, 0x50, 0x4f, 0x8b, 0x95, 0x56, 0x96, + 0x82, 0x92, 0x4f, 0xbb, 0x78, 0x7f, 0x9c, 0x84, 0xa6, 0x6f, 0x54, 0x93, + 0x62, 0x53, 0x97, 0xb9, 0xa8, 0xab, 0x9f, 0x8c, 0x56, 0x50, 0x73, 0x2a, + 0x9b, 0x5c, 0x71, 0x82, 0x84, 0x2e, 0x48, 0x3c, 0x33, 0x55, 0x94, 0x97, + 0x70, 0x81, 0x98, 0x7e, 0x94, 0x67, 0x58, 0x9f, 0x54, 0x53, 0xba, 0x97, + 0x1d, 0x3b, 0xba, 0xa7, 0x7f, 0x45, 0xba, 0x5a, 0x89, 0x4b, 0x61, 0xa1, + 0x51, 0x54, 0x86, 0xba, 0xb6, 0x5c, 0x81, 0x9c, 0x30, 0x6d, 0x48, 0xad, + 0x7e, 0x66, 0x8a, 0x7d, 0xaa, 0x9b, 0x63, 0x5d, 0x91, 0x84, 0x31, 0x5d, + 0xa4, 0x9b, 0x39, 0x9b, 0x51, 0x7c, 0x4f, 0x69, 0x35, 0x4f, 0x5d, 0x44, + 0x5a, 0xa3, 0xb6, 0x90, 0x8c, 0x65, 0x66, 0xea, 0xae, 0xb2, 0x81, 0x7e, + 0xb8, 0x65, 0x6d, 0xb2, 0x6e, 0x63, 0x30, 0x66, 0x5b, 0x4c, 0x85, 0x8b, + 0x51, 0x8b, 0x94, 0xa6, 0x5c, 0x59, 0x83, 0x69, 0xa0, 0x7e, 0x7a, 0x88, + 0xa9, 0x5f, 0x65, 0x88, 0xaf, 0x3f, 0x3f, 0x62, 0x9c, 0x52, 0xa2, 0x27, + 0x8e, 0x56, 0x69, 0x7d, 0x2a, 0x88, 0x45, 0x4f, 0x7c, 0x49, 0x76, 0xba, + 0xae, 0x9e, 0xba, 0xa4, 0x73, 0x5f, 0x6d, 0x90, 0x93, 0x3a, 0x6a, 0xb2, + 0xbd, 0x97, 0x8c, 0x88, 0x68, 0x6d, 0x8f, 0x91, 0xb7, 0xa6, 0x6a, 0x3e, + 0x7f, 0x45, 0xba, 0x37, 0x22, 0x3a, 0xa9, 0x5d, 0x44, 0x52, 0xa9, 0x70, + 0xcb, 0x76, 0x74, 0xa3, 0x5f, 0x7e, 0xdc, 0x4b, 0x97, 0xd7, 0xcd, 0x7e, + 0xa3, 0x55, 0xb1, 0xcf, 0x84, 0x93, 0x55, 0x51, 0x81, 0x6c, 0xb6, 0x7f, + 0xa3, 0x53, 0x75, 0x4e, 0x5c, 0x82, 0x57, 0x59, 0x68, 0x57, 0xa3, 0xa7, + 0x46, 0x4e, 0x7c, 0x48, 0xb7, 0xae, 0x77, 0x7b, 0x88, 0xad, 0x4d, 0x58, + 0x75, 0x6c, 0x82, 0xba, 0xa6, 0x69, 0x99, 0x57, 0xa1, 0x77, 0x89, 0x9b, + 0x33, 0x83, 0x75, 0x3d, 0x5a, 0x49, 0x49, 0x4e, 0x3d, 0x99, 0xc4, 0xbc, + 0x77, 0x79, 0x7f, 0x4a, 0xa5, 0x4a, 0x5f, 0x97, 0x99, 0x6a, 0x9b, 0x84, + 0x5b, 0x3f, 0x82, 0xb9, 0xb9, 0x46, 0x83, 0x63, 0xc6, 0x43, 0x9a, 0xa7, + 0x89, 0x6f, 0xac, 0x62, 0x78, 0xb8, 0xb8, 0x9a, 0xc1, 0xaa, 0x7c, 0x47, + 0x77, 0x4e, 0x7a, 0x69, 0x7f, 0x94, 0x80, 0x45, 0xa6, 0x62, 0x90, 0x34, + 0x9e, 0x51, 0x68, 0xc0, 0x7f, 0x58, 0xc7, 0x9a, 0x6d, 0x9e, 0xa2, 0x73, + 0x55, 0x91, 0x72, 0x75, 0x67, 0x57, 0x35, 0x5a, 0x59, 0x6f, 0x7e, 0x54, + 0x5b, 0x50, 0x4d, 0xad, 0x93, 0x40, 0x2a, 0x5d, 0x3b, 0x7d, 0x32, 0x15, + 0x9d, 0x87, 0xaa, 0xb7, 0x6e, 0x89, 0x5f, 0xe0, 0xc8, 0x56, 0x75, 0x58, + 0x53, 0x5a, 0x85, 0x3d, 0x69, 0xb2, 0x7a, 0xaa, 0xb8, 0x6c, 0x4f, 0xc8, + 0x56, 0x74, 0x64, 0x41, 0x80, 0x45, 0x6e, 0x57, 0x7b, 0x81, 0xdb, 0x88, + 0xc7, 0xd6, 0xf2, 0x93, 0xa6, 0xb5, 0x73, 0x48, 0x7c, 0x8a, 0x73, 0x70, + 0x78, 0x64, 0x95, 0x4a, 0x9a, 0x67, 0x72, 0x70, 0x97, 0xbd, 0x72, 0xcf, + 0xae, 0xa7, 0x78, 0x51, 0x45, 0x90, 0x9e, 0x9e, 0xbd, 0x8d, 0x49, 0x9d, + 0x82, 0xb3, 0xc1, 0x70, 0xa0, 0x86, 0xbe, 0x6b, 0x56, 0xbe, 0x6f, 0x31, + 0x41, 0x3d, 0x39, 0x47, 
0x9f, 0xb8, 0x82, 0x7a, 0x37, 0xad, 0x92, 0x61, + 0x6a, 0x49, 0x89, 0x81, 0x4e, 0x77, 0x5e, 0x69, 0x82, 0xa8, 0x4a, 0xae, + 0xc7, 0xb3, 0xa9, 0x68, 0xb0, 0x66, 0x56, 0xa3, 0xb6, 0x82, 0x54, 0x50, + 0x96, 0xa0, 0x8e, 0xb5, 0x90, 0x67, 0x6a, 0x78, 0x6c, 0x88, 0x88, 0x86, + 0xa5, 0xa4, 0x74, 0xad, 0x54, 0x4f, 0x7f, 0x56, 0x53, 0xa6, 0x89, 0x49, + 0x60, 0xa8, 0x8c, 0x83, 0x3d, 0xb6, 0x70, 0x63, 0xb0, 0x77, 0x77, 0x56, + 0xa0, 0xc0, 0x8c, 0x67, 0xab, 0x4d, 0x5f, 0x8f, 0x3b, 0x4c, 0x53, 0x71, + 0xb0, 0x89, 0x33, 0x41, 0x85, 0x48, 0x6f, 0x76, 0x6f, 0x42, 0x5f, 0x8f, + 0x59, 0x42, 0x49, 0x53, 0x72, 0x43, 0x78, 0x70, 0x92, 0xa4, 0x6c, 0x43, + 0xc3, 0xc8, 0x73, 0x5d, 0x64, 0x60, 0x85, 0x3a, 0x33, 0x77, 0x3a, 0x76, + 0x45, 0x69, 0xa7, 0xcb, 0x5d, 0x59, 0xbd, 0x76, 0x63, 0xa6, 0x65, 0x9c, + 0x56, 0x5a, 0x51, 0x70, 0x7f, 0x6d, 0x8f, 0x65, 0x86, 0x7c, 0x69, 0x8f, + 0xa6, 0xb2, 0xc4, 0xbd, 0x77, 0x6e, 0xa4, 0x85, 0xc0, 0x92, 0x70, 0x90, + 0xb1, 0x46, 0x66, 0x27, 0x3e, 0x89, 0xbe, 0x6c, 0x88, 0xd7, 0x9c, 0xca, + 0xc1, 0xed, 0x83, 0x82, 0xa5, 0x99, 0x61, 0x36, 0x3c, 0x51, 0x15, 0x6b, + 0x22, 0x8d, 0x7f, 0x61, 0x73, 0x74, 0x50, 0x7a, 0xa0, 0xb2, 0xc3, 0x92, + 0xbd, 0x46, 0x31, 0x6e, 0x5c, 0x51, 0x94, 0x9b, 0x68, 0x87, 0xa9, 0x50, + 0x7f, 0x4c, 0x57, 0x5e, 0x9e, 0x50, 0x7d, 0x3b, 0x8c, 0x5b, 0x3f, 0x4c, + 0x83, 0x62, 0x69, 0xe0, 0xd6, 0xc2, 0xc0, 0x77, 0xca, 0x84, 0xb4, 0xc0, + 0x96, 0x3d, 0x41, 0xa7, 0x3a, 0x4c, 0x73, 0x46, 0x85, 0x84, 0x59, 0x62, + 0x75, 0x84, 0xa8, 0x74, 0x34, 0x3f, 0x62, 0x84, 0x2f, 0x8d, 0x6d, 0x32, + 0xa0, 0x7c, 0xb1, 0x70, 0x6a, 0x7d, 0xb2, 0x7f, 0x55, 0xbb, 0x64, 0x96, + 0x61, 0x79, 0xd1, 0x5b, 0x4a, 0xbf, 0x7a, 0x92, 0x68, 0x3c, 0x33, 0x2d, + 0x55, 0x9d, 0x77, 0x7b, 0x5c, 0xa3, 0x48, 0x43, 0x54, 0x74, 0x6c, 0x8f, + 0xb8, 0x6f, 0xc9, 0xad, 0x8b, 0xae, 0xcb, 0x68, 0x7e, 0xc0, 0x7d, 0x61, + 0x43, 0xbe, 0x5b, 0xb5, 0x54, 0xa6, 0x78, 0x6a, 0x5d, 0x92, 0x87, 0xab, + 0xab, 0xaa, 0x8d, 0x4a, 0x8d, 0xb9, 0x9b, 0x4d, 0x6c, 0xaa, 0x86, 0xaf, + 0xc0, 0xba, 0xd1, 0x8b, 0x70, 0xb8, 0xb4, 0xce, 0x63, 0xc1, 0x76, 0x9a, + 0x49, 0x2e, 0x38, 0x63, 0x8f, 0x5f, 0x74, 0x99, 0x6d, 0xcc, 0xb3, 0x7f, + 0xa6, 0x71, 0xb4, 0xc4, 0x82, 0x4b, 0x65, 0x3d, 0x76, 0x50, 0x7c, 0xc3, + 0x9c, 0x7e, 0xa8, 0x85, 0xbf, 0x84, 0x8e, 0x45, 0x8e, 0x42, 0x71, 0x4b, + 0x56, 0xb9, 0x49, 0x80, 0x85, 0xa0, 0xac, 0xc2, 0x41, 0x60, 0x59, 0xbc, + 0x9e, 0xb8, 0x59, 0x80, 0x63, 0x69, 0xa8, 0x46, 0x74, 0x55, 0x54, 0x48, + 0x7f, 0x62, 0x5a, 0x7b, 0x65, 0x7f, 0xb4, 0x60, 0x98, 0x56, 0x95, 0x94, + 0x9a, 0x64, 0xc8, 0xbb, 0x6d, 0xb1, 0xa7, 0x6e, 0xaf, 0x53, 0x8e, 0x8f, + 0x32, 0x79, 0x35, 0xb6, 0x56, 0x90, 0x6a, 0x44, 0x2f, 0x2c, 0x91, 0x60, + 0x35, 0xa8, 0x79, 0xa1, 0x67, 0xad, 0x62, 0x3b, 0x5a, 0x8e, 0x9e, 0x6a, + 0x40, 0xb4, 0x7a, 0xa4, 0x63, 0x93, 0x54, 0x96, 0x63, 0x7b, 0x7d, 0x8f, + 0xcc, 0x84, 0x99, 0xa1, 0x82, 0x88, 0xc4, 0x91, 0x7d, 0xd4, 0xa5, 0xc1, + 0x55, 0x80, 0x60, 0xac, 0x5d, 0x75, 0xa4, 0x76, 0x9c, 0x6d, 0x90, 0x68, + 0x43, 0x43, 0x3a, 0x94, 0xb0, 0xbd, 0x99, 0x85, 0x8e, 0x8f, 0x76, 0x93, + 0x63, 0x4c, 0x6f, 0x71, 0xb4, 0x7d, 0x89, 0xa4, 0xba, 0x3f, 0x99, 0x77, + 0x90, 0xa9, 0x8a, 0xae, 0x76, 0x93, 0x6a, 0x55, 0x95, 0x8c, 0xa8, 0x6f, + 0x5c, 0xaa, 0x9c, 0x82, 0x3d, 0x82, 0x97, 0x7e, 0xbe, 0x66, 0x79, 0x35, + 0xc3, 0x7f, 0x6f, 0x35, 0x6c, 0x9e, 0xaa, 0x4f, 0x44, 0x3c, 0x5b, 0x3e, + 0x89, 0x89, 0x3b, 0x3a, 0x3a, 0x94, 0x89, 0x50, 0xb5, 0xc5, 0xad, 0x48, + 0x96, 0x48, 0xc6, 0x80, 0x58, 0xab, 0x4f, 0x89, 0x4d, 0x65, 0x61, 0x6e, + 0xa0, 0x7b, 0x94, 0x32, 0x3a, 0x60, 0x6c, 0x41, 0x81, 0x45, 0x74, 0x8f, + 0x74, 0x9a, 0x3f, 0x61, 
0x7b, 0xb5, 0x96, 0x69, 0x50, 0x8e, 0xb0, 0x6d, + 0x81, 0x6a, 0xd3, 0x7c, 0xb3, 0x90, 0xbb, 0x6d, 0x64, 0x54, 0x76, 0x5c, + 0x53, 0xa1, 0x8d, 0x8e, 0x65, 0xbd, 0x3e, 0x68, 0x81, 0x7f, 0x80, 0x99, + 0x4f, 0x6d, 0x70, 0xb5, 0x70, 0x4b, 0x99, 0xce, 0x89, 0x72, 0x8a, 0x95, + 0xbb, 0x7d, 0x21, 0x47, 0x95, 0x9a, 0x68, 0x95, 0xa4, 0x3d, 0x98, 0x4e, + 0x45, 0xa5, 0xa6, 0xbb, 0x4e, 0x55, 0xa9, 0x7a, 0x99, 0xb8, 0x5c, 0x74, + 0x6b, 0xd5, 0x4e, 0x7c, 0x51, 0x9b, 0xae, 0x45, 0x9e, 0x78, 0xae, 0x97, + 0x86, 0x98, 0xb2, 0xa9, 0x9b, 0x4c, 0x8d, 0xba, 0x49, 0x89, 0x5f, 0x6e, + 0x8c, 0x89, 0x67, 0x94, 0x7d, 0xa1, 0x53, 0x4f, 0xab, 0xb1, 0x5b, 0x9b, + 0x9c, 0x4e, 0xa5, 0x7d, 0xa5, 0x94, 0xae, 0x64, 0x7e, 0x62, 0x95, 0xb1, + 0xc1, 0xac, 0x4a, 0xc0, 0x5a, 0x41, 0x45, 0x3e, 0xca, 0xc0, 0x40, 0xa5, + 0x3a, 0x79, 0x46, 0x64, 0x89, 0x52, 0xba, 0x5d, 0x8b, 0x4a, 0x81, 0xa9, + 0xaa, 0x5d, 0x59, 0x64, 0x98, 0x50, 0x62, 0x56, 0xa1, 0x49, 0x50, 0x5b, + 0x5f, 0x5e, 0x4d, 0x3e, 0x49, 0x7f, 0x86, 0xae, 0x9c, 0x82, 0x76, 0xb0, + 0x98, 0x7e, 0x65, 0xa4, 0x47, 0xb0, 0x6d, 0x98, 0x6f, 0x41, 0xb7, 0x84, + 0x7d, 0x92, 0x6b, 0x48, 0x72, 0x50, 0x37, 0x5a, 0x3e, 0x66, 0x91, 0xa9, + 0xa9, 0x82, 0x90, 0xb6, 0x58, 0x6a, 0x49, 0xcc, 0xc6, 0xc4, 0x40, 0x5d, + 0x90, 0x7e, 0x5a, 0xc2, 0x7a, 0x9c, 0x83, 0x59, 0x84, 0x76, 0x78, 0xb2, + 0x62, 0x3e, 0x72, 0x8e, 0x9b, 0x5e, 0xaf, 0x4c, 0x3c, 0x44, 0x74, 0xb4, + 0x6c, 0xb6, 0x64, 0x4f, 0x96, 0x95, 0xac, 0x41, 0x70, 0xa4, 0x84, 0xb4, + 0x8d, 0x75, 0x73, 0x38, 0x6e, 0x44, 0x71, 0x4b, 0x86, 0x6c, 0x6f, 0xaf, + 0x77, 0x3e, 0x33, 0x49, 0x6e, 0x6b, 0x4d, 0x9c, 0x99, 0x9f, 0x9b, 0x6d, + 0x47, 0x81, 0xc0, 0x7b, 0x99, 0x78, 0x3b, 0x4a, 0xb4, 0xb9, 0x5f, 0xa7, + 0x7d, 0x9b, 0x83, 0x82, 0x7d, 0x5d, 0x4c, 0x5a, 0x92, 0xb6, 0x47, 0xa5, + 0x88, 0x71, 0x44, 0x9f, 0x5c, 0x5e, 0xac, 0x98, 0x40, 0xb1, 0x5a, 0x62, + 0xa0, 0x8a, 0x4d, 0x53, 0x73, 0x94, 0x6e, 0x87, 0x8d, 0x92, 0x76, 0x54, + 0xa7, 0x82, 0x66, 0xb1, 0x99, 0x8e, 0xb0, 0x5d, 0x97, 0x53, 0x61, 0x4c, + 0x82, 0x94, 0x8b, 0xaa, 0x3c, 0xaa, 0x78, 0x52, 0x52, 0x45, 0x4d, 0x60, + 0x78, 0x32, 0x69, 0xa4, 0xa5, 0xc8, 0x8e, 0xb4, 0xd6, 0x6e, 0xc3, 0xbc, + 0x76, 0x4e, 0x2c, 0x78, 0x50, 0x6b, 0x76, 0x22, 0x35, 0x88, 0x83, 0x4d, + 0x92, 0xb9, 0x8b, 0xbc, 0x38, 0x6c, 0x9a, 0x8b, 0x8c, 0x68, 0x91, 0x7a, + 0x85, 0xdb, 0xc2, 0xdb, 0x86, 0x8d, 0x6c, 0x5c, 0x72, 0xb2, 0xa3, 0x3a, + 0x97, 0x53, 0xb6, 0x93, 0x43, 0x84, 0x8f, 0x76, 0x84, 0x7a, 0x6b, 0x9e, + 0x68, 0xbb, 0x89, 0xb0, 0x75, 0xc6, 0xab, 0x82, 0x4a, 0x76, 0x18, 0xb8, + 0x6c, 0x4b, 0xb0, 0x7f, 0xc5, 0x94, 0x8c, 0x47, 0xa3, 0x6d, 0x46, 0x79, + 0x77, 0x38, 0xb1, 0x87, 0x63, 0x75, 0x7c, 0x5f, 0x67, 0x6d, 0x52, 0x4a, + 0x9f, 0x4d, 0x91, 0x5b, 0x88, 0xb2, 0xb7, 0x6f, 0x91, 0xa9, 0x6c, 0x60, + 0x41, 0x5f, 0x59, 0x76, 0x88, 0xbd, 0x8f, 0xa0, 0x73, 0x5d, 0x3e, 0x9c, + 0x98, 0x77, 0x4c, 0x7f, 0x77, 0x79, 0x46, 0x42, 0x92, 0x76, 0x96, 0xa8, + 0x7c, 0x48, 0x8b, 0x9d, 0x36, 0x63, 0xab, 0x2f, 0xa6, 0x49, 0x7d, 0x8b, + 0x4c, 0x60, 0xbc, 0x62, 0x9d, 0xab, 0x62, 0x3f, 0x64, 0xbc, 0xae, 0x61, + 0x73, 0xc3, 0xb6, 0x78, 0xa1, 0x3b, 0x77, 0x5a, 0x54, 0x89, 0xa6, 0x55, + 0x46, 0x57, 0x83, 0xba, 0x7e, 0xb8, 0xc2, 0xa4, 0x80, 0x74, 0x66, 0xb9, + 0xa6, 0xb5, 0x99, 0xa2, 0x3f, 0x92, 0x71, 0x61, 0x56, 0x85, 0x64, 0x44, + 0x72, 0xac, 0x7b, 0x4b, 0xb7, 0x4b, 0x60, 0x8f, 0x45, 0xa5, 0x94, 0x4b, + 0x81, 0x52, 0x8c, 0x67, 0x5e, 0xa2, 0xa3, 0xad, 0x98, 0x4e, 0x95, 0xa8, + 0x43, 0x89, 0x97, 0x45, 0x7d, 0x94, 0x8a, 0x9e, 0x9a, 0x6f, 0xa0, 0x3d, + 0x8c, 0x86, 0x97, 0x96, 0x35, 0xb0, 0x8f, 0x75, 0x6e, 0xba, 0x78, 0xba, + 0xa6, 0xc1, 0xa3, 0x63, 
0x64, 0x89, 0x9e, 0x57, 0x66, 0xb2, 0x8d, 0x91, + 0x6b, 0xd3, 0x9f, 0xac, 0xc0, 0xa7, 0x59, 0x82, 0x72, 0x9f, 0x5b, 0x81, + 0x63, 0x6a, 0x85, 0x5f, 0x9c, 0x4e, 0x70, 0x6a, 0x61, 0x44, 0x43, 0x92, + 0xb0, 0x49, 0x50, 0xa7, 0xa3, 0x90, 0x45, 0x9c, 0x5e, 0xa5, 0x8f, 0xb9, + 0x65, 0x56, 0x97, 0x7b, 0x84, 0x75, 0x6c, 0x4b, 0x6b, 0x73, 0xb1, 0x7c, + 0x47, 0x8e, 0xb9, 0xc2, 0xb8, 0x95, 0x62, 0xb9, 0x6a, 0x8b, 0x9c, 0x79, + 0x48, 0x64, 0x60, 0x4c, 0x53, 0x60, 0x49, 0x43, 0x61, 0xc2, 0x96, 0xa9, + 0xaf, 0x48, 0x91, 0x80, 0x81, 0x59, 0x50, 0x3b, 0x77, 0x44, 0x89, 0x7e, + 0x85, 0x64, 0x98, 0xbf, 0x81, 0xb2, 0xad, 0x71, 0x4d, 0xbf, 0x51, 0x79, + 0x4b, 0xa9, 0x3d, 0xb1, 0x42, 0x4d, 0xaf, 0x82, 0x40, 0x96, 0x38, 0x4e, + 0xb2, 0x94, 0x64, 0x99, 0xb1, 0x80, 0x9f, 0x88, 0xb2, 0x7d, 0x79, 0x3b, + 0x7c, 0x84, 0x88, 0x48, 0x4d, 0xd6, 0x57, 0x7f, 0xac, 0x5e, 0x72, 0x6b, + 0x4d, 0x79, 0x48, 0x50, 0x80, 0x85, 0x3c, 0x9a, 0x5a, 0x88, 0xb5, 0xb3, + 0x62, 0x5c, 0x8b, 0xae, 0xc0, 0x76, 0x7d, 0x58, 0x8c, 0x7f, 0x4c, 0x4b, + 0xc4, 0xc9, 0xbf, 0x4b, 0x70, 0x5e, 0x72, 0xce, 0x62, 0x8d, 0x8b, 0x46, + 0x98, 0x67, 0x79, 0xab, 0x9a, 0x47, 0x5f, 0xac, 0x7c, 0x4a, 0x64, 0xa9, + 0x5c, 0x5a, 0x99, 0xac, 0xcf, 0x57, 0xb6, 0x57, 0x4b, 0x46, 0x3a, 0xa6, + 0x72, 0x4a, 0xa2, 0xab, 0xbd, 0xb9, 0xad, 0x51, 0x49, 0x37, 0xbc, 0xbd, + 0x64, 0x75, 0x6a, 0x7a, 0x45, 0xc3, 0xa6, 0xb7, 0x9f, 0x58, 0xaf, 0xb0, + 0x67, 0x66, 0x51, 0x89, 0x71, 0x6b, 0xb4, 0x86, 0x3b, 0x5e, 0x8e, 0xb4, + 0x53, 0x70, 0x75, 0x8f, 0x9e, 0xa6, 0x84, 0xc7, 0xbf, 0xb1, 0x6e, 0xa4, + 0x5a, 0x4e, 0xb3, 0x6c, 0x56, 0xa6, 0x25, 0x64, 0x97, 0xa4, 0x60, 0x49, + 0x47, 0x60, 0x8c, 0x48, 0xb4, 0x2c, 0x6e, 0x48, 0x88, 0x6e, 0x8b, 0x88, + 0x87, 0x47, 0x7b, 0x44, 0x5c, 0x7e, 0x46, 0x97, 0x67, 0xb7, 0x44, 0x64, + 0x86, 0x3f, 0x89, 0xc5, 0xa5, 0x69, 0x9b, 0xb8, 0x8e, 0xab, 0x5b, 0xb8, + 0x98, 0xb6, 0x8e, 0xaa, 0xad, 0x6d, 0xc2, 0x3e, 0x44, 0x39, 0x80, 0xbf, + 0xc8, 0x3e, 0xa4, 0x86, 0x76, 0xc5, 0x9d, 0x6a, 0x9a, 0x4c, 0x9f, 0x83, + 0xa7, 0x66, 0x81, 0x5b, 0x8e, 0x97, 0x95, 0x92, 0x5d, 0xa6, 0xb7, 0x99, + 0x5a, 0x3e, 0x9b, 0x8b, 0x86, 0x43, 0x35, 0x9d, 0xbf, 0x5b, 0x65, 0x64, + 0x64, 0x4d, 0x74, 0x90, 0x43, 0x77, 0x93, 0xaa, 0x4d, 0x56, 0x77, 0xa1, + 0x55, 0x8e, 0x72, 0x39, 0x99, 0x8a, 0x78, 0xaa, 0xbb, 0x92, 0x81, 0x8a, + 0x55, 0x79, 0x40, 0x45, 0x61, 0x89, 0xbe, 0x61, 0x89, 0xb4, 0x6b, 0x55, + 0x95, 0x62, 0x7a, 0x76, 0x8e, 0xbe, 0x4d, 0x42, 0x64, 0x99, 0x52, 0x92, + 0x43, 0xa0, 0x81, 0x8c, 0x5a, 0x4d, 0x77, 0x51, 0x64, 0xa4, 0xb3, 0x72, + 0xb5, 0x9c, 0x83, 0x41, 0xa9, 0x6d, 0xcc, 0x77, 0x42, 0x69, 0x3b, 0x91, + 0x79, 0x4e, 0x52, 0xac, 0x9c, 0xc5, 0x75, 0x62, 0x6c, 0x49, 0xac, 0x5b, + 0xab, 0x84, 0x9c, 0x57, 0xa9, 0x63, 0xa4, 0x8d, 0xab, 0x4e, 0x79, 0xad, + 0x82, 0x84, 0x6c, 0x68, 0x9b, 0x6c, 0x66, 0x64, 0x83, 0x58, 0x5d, 0x82, + 0x35, 0x31, 0x9b, 0x52, 0x93, 0x99, 0x75, 0x57, 0x66, 0x42, 0xa1, 0xaa, + 0xb4, 0xab, 0x49, 0x3f, 0xaa, 0x88, 0x6c, 0x40, 0x57, 0x45, 0x97, 0x83, + 0x59, 0xa6, 0x5e, 0x61, 0x9c, 0x3d, 0xa5, 0x48, 0xaa, 0x94, 0x9f, 0x99, + 0x60, 0xb2, 0xc7, 0x5b, 0x5c, 0x9b, 0x6c, 0xd2, 0x80, 0x7e, 0x46, 0xa9, + 0xc4, 0x7f, 0x5d, 0xbf, 0x8d, 0xc2, 0xb7, 0x3c, 0x57, 0xb0, 0xc6, 0x43, + 0x5c, 0x23, 0x36, 0x35, 0xa3, 0x57, 0x9a, 0x60, 0xac, 0x9b, 0x42, 0xa8, + 0x49, 0x87, 0x91, 0x76, 0xa7, 0x63, 0xac, 0x50, 0x6d, 0x6b, 0x67, 0x63, + 0x54, 0x9f, 0x9e, 0xad, 0x92, 0xb1, 0x59, 0xae, 0x63, 0x89, 0xa2, 0x93, + 0x8f, 0x8e, 0xbe, 0x75, 0xc3, 0xd1, 0x51, 0x45, 0x7e, 0xcd, 0xcf, 0xc0, + 0x7d, 0x7e, 0x3f, 0x7c, 0x65, 0x5a, 0xad, 0x77, 0x89, 0x8d, 0x59, 0xb5, + 0x94, 0x7b, 0x4b, 0x7a, 
0xa3, 0xa1, 0x69, 0x41, 0x9a, 0x8c, 0x33, 0x5a, + 0x76, 0x36, 0xc7, 0x82, 0x8e, 0x64, 0xaf, 0xcf, 0x80, 0x63, 0x7a, 0x8e, + 0xbd, 0x8e, 0x4c, 0x8a, 0x6b, 0xc6, 0xa4, 0x8b, 0x6b, 0x84, 0x98, 0x39, + 0x46, 0x42, 0xbe, 0x85, 0x79, 0xab, 0x93, 0x67, 0xb7, 0xa8, 0x3e, 0x6e, + 0x74, 0x6e, 0xaf, 0xb3, 0x53, 0x92, 0x5c, 0xd5, 0x9a, 0x5a, 0x95, 0x5b, + 0x93, 0x5f, 0x4c, 0x9b, 0x6a, 0x4e, 0xab, 0xbb, 0x4a, 0xb4, 0x81, 0x7e, + 0x3d, 0x47, 0x9b, 0x4c, 0x80, 0x5d, 0x8e, 0x5f, 0x79, 0x92, 0x36, 0x77, + 0x6b, 0x87, 0x91, 0x84, 0x6f, 0x53, 0xad, 0xc5, 0x81, 0xb2, 0x45, 0x55, + 0xc9, 0x78, 0x77, 0x91, 0x71, 0x74, 0xc2, 0xb4, 0x6a, 0xb0, 0x59, 0x60, + 0x72, 0x3d, 0xbe, 0x7e, 0xb7, 0xab, 0x68, 0x67, 0x8a, 0xad, 0x8d, 0x9d, + 0x3f, 0x74, 0xb0, 0x5d, 0x5a, 0x4f, 0x5a, 0xb4, 0x7b, 0x8a, 0x8c, 0xb9, + 0x4d, 0x87, 0x7a, 0xba, 0xac, 0x39, 0x60, 0x55, 0xbe, 0x9f, 0x68, 0x68, + 0x9d, 0x40, 0x4b, 0x53, 0x88, 0x8f, 0x82, 0xc1, 0x5c, 0x47, 0xa8, 0x4f, + 0xae, 0x95, 0x79, 0x66, 0x66, 0xb1, 0x7b, 0x5e, 0x43, 0x89, 0x92, 0x47, + 0x61, 0xb8, 0x61, 0xb7, 0x61, 0x88, 0xa3, 0xbb, 0x92, 0xc8, 0x86, 0x8a, + 0x47, 0x82, 0xab, 0xa6, 0x37, 0x93, 0x4a, 0x6a, 0x70, 0x6b, 0x2c, 0x9b, + 0x4d, 0xc7, 0xa2, 0x65, 0x62, 0x69, 0x63, 0xab, 0x60, 0x4f, 0xbb, 0x94, + 0x5f, 0x9f, 0x68, 0xb0, 0x66, 0x62, 0x80, 0xa1, 0xbe, 0x9a, 0x2e, 0xb7, + 0xca, 0x61, 0x98, 0x59, 0x9e, 0xb1, 0x97, 0x6e, 0x3e, 0x7e, 0x87, 0xaf, + 0xad, 0x4a, 0x5c, 0xa9, 0xa3, 0x95, 0x72, 0x97, 0x88, 0x54, 0x5d, 0x63, + 0x55, 0x8c, 0x71, 0x51, 0xc9, 0xa6, 0x9f, 0x53, 0x64, 0x7e, 0x6d, 0xa0, + 0x38, 0x77, 0xc1, 0xb8, 0x73, 0x89, 0x64, 0x7a, 0x53, 0x5f, 0x47, 0x7c, + 0x65, 0xc8, 0xa2, 0x65, 0x79, 0x8b, 0x8a, 0xc3, 0xa3, 0x87, 0x64, 0x65, + 0x77, 0xaf, 0xc2, 0x92, 0x8e, 0x86, 0x94, 0x85, 0x83, 0x3a, 0x85, 0x59, + 0x65, 0xc3, 0x3f, 0x71, 0x83, 0x7d, 0xd0, 0x92, 0x5c, 0x86, 0x93, 0x97, + 0x90, 0x76, 0x32, 0x71, 0x87, 0x69, 0x47, 0x78, 0x4c, 0x9c, 0x51, 0x3b, + 0x8c, 0x3d, 0x7a, 0x9a, 0x52, 0xc4, 0xa6, 0xa4, 0x49, 0x81, 0xc1, 0x83, + 0x9c, 0xba, 0x51, 0x43, 0x58, 0x8b, 0x87, 0x78, 0xac, 0x7a, 0xb0, 0xac, + 0x43, 0x8a, 0x51, 0xc4, 0xac, 0x8d, 0x5c, 0x50, 0x81, 0xae, 0x7b, 0x8e, + 0x9b, 0x57, 0x7f, 0x65, 0x84, 0x80, 0x4f, 0x66, 0x88, 0x60, 0x37, 0xa8, + 0x6f, 0x99, 0x9b, 0xa6, 0x75, 0x94, 0x98, 0x89, 0xac, 0x66, 0x78, 0x9a, + 0xb7, 0xa0, 0x45, 0xc2, 0x70, 0x3e, 0xa5, 0x57, 0xa0, 0x34, 0x41, 0xa2, + 0x47, 0x61, 0xa8, 0x8b, 0x72, 0x5e, 0x7a, 0xae, 0xc0, 0x9a, 0x55, 0x5e, + 0x57, 0x7d, 0x4a, 0xc0, 0xba, 0x78, 0x3e, 0x89, 0x77, 0xbc, 0xa4, 0x7f, + 0xbe, 0x8b, 0x49, 0xb4, 0xb9, 0x81, 0xbf, 0xbc, 0x8f, 0x88, 0x78, 0x7e, + 0x51, 0xb7, 0xb0, 0x50, 0xa1, 0x9c, 0x75, 0xa9, 0x55, 0xaa, 0x58, 0x8b, + 0xb7, 0x97, 0x54, 0x41, 0x52, 0x94, 0x72, 0x50, 0x62, 0x93, 0x37, 0x78, + 0x7d, 0x97, 0x67, 0xba, 0x70, 0x72, 0x4c, 0xc1, 0x7c, 0x7c, 0x5f, 0xb6, + 0x3d, 0x53, 0x3f, 0x3f, 0xbf, 0x7f, 0x69, 0x51, 0x8c, 0xc0, 0x7f, 0x93, + 0x4f, 0x42, 0x51, 0x97, 0xa7, 0x6c, 0xa5, 0xb2, 0xc0, 0xb4, 0x6c, 0x7b, + 0xa6, 0x6c, 0x98, 0xac, 0x95, 0x6a, 0x6b, 0x73, 0xa4, 0x5d, 0x76, 0x4d, + 0x37, 0x85, 0x6a, 0xbd, 0x6b, 0x39, 0x66, 0x50, 0xa5, 0x52, 0x6d, 0x51, + 0x6b, 0x6e, 0x62, 0x44, 0x8c, 0x9f, 0x8e, 0x80, 0x94, 0x80, 0x2e, 0x65, + 0x3d, 0x87, 0x94, 0x51, 0x63, 0x5e, 0x58, 0xb0, 0xac, 0x49, 0xac, 0xb7, + 0xc9, 0x8b, 0x47, 0x37, 0x85, 0x93, 0x40, 0x77, 0x52, 0x9f, 0x22, 0x44, + 0x6d, 0xbf, 0xad, 0x62, 0x21, 0x59, 0xbe, 0xa7, 0x8a, 0x66, 0xba, 0x9d, + 0x7f, 0xb5, 0x55, 0x8a, 0x71, 0x7f, 0xba, 0xc2, 0xb0, 0xa9, 0xb2, 0x4d, + 0xa6, 0x9a, 0x92, 0xa4, 0xa8, 0x7b, 0x9b, 0x57, 0xad, 0x63, 0x64, 0x76, + 0x84, 0x74, 0x59, 0x8d, 
0x9c, 0x52, 0xa6, 0x5b, 0x85, 0x47, 0x5a, 0xd3, + 0xa3, 0xc3, 0x7c, 0x5f, 0xc4, 0x9f, 0xb5, 0x9c, 0xa4, 0xaa, 0x53, 0x66, + 0xb3, 0x6c, 0x99, 0xa9, 0x5d, 0x4e, 0xb4, 0xad, 0x4a, 0x52, 0x69, 0xa6, + 0x7a, 0x54, 0xc1, 0x72, 0x66, 0x5e, 0x46, 0x4a, 0x7a, 0x92, 0x92, 0x92, + 0x6a, 0x8a, 0x52, 0xa3, 0x61, 0xb5, 0x57, 0x94, 0x85, 0x66, 0x3f, 0xbb, + 0x63, 0x58, 0x5f, 0x39, 0x87, 0xad, 0x7c, 0x7f, 0x54, 0x45, 0x99, 0x64, + 0xb2, 0x7b, 0x8c, 0x51, 0x5d, 0xa4, 0x60, 0xae, 0x79, 0xbc, 0x68, 0x72, + 0xb2, 0xc4, 0xb6, 0x6f, 0xb7, 0xc1, 0x9c, 0x44, 0x98, 0xb2, 0x8b, 0x4f, + 0x6d, 0x88, 0x8e, 0x92, 0xa6, 0x78, 0x84, 0x66, 0xb6, 0x3e, 0xb0, 0x8a, + 0xb4, 0x6f, 0x65, 0xaa, 0x45, 0x43, 0x3d, 0x3a, 0xa0, 0x89, 0x47, 0xa4, + 0x92, 0x5a, 0x4e, 0xb3, 0xb8, 0x8b, 0x76, 0xcd, 0x5a, 0x87, 0x73, 0x47, + 0x88, 0x82, 0x5d, 0xa1, 0x7a, 0xac, 0x8a, 0x4b, 0x67, 0xae, 0x62, 0x47, + 0x41, 0x68, 0xb5, 0xba, 0xac, 0x5a, 0x95, 0x8f, 0x6f, 0xa3, 0x9e, 0xbe, + 0x9f, 0xb3, 0xb2, 0xc1, 0xb5, 0x7e, 0xa8, 0x39, 0x83, 0x51, 0x63, 0x74, + 0xbb, 0x5d, 0x52, 0x5a, 0x9b, 0x82, 0xa2, 0x8b, 0x5f, 0x95, 0x7c, 0x58, + 0x85, 0x5b, 0x95, 0xad, 0x7b, 0xa8, 0x83, 0xa2, 0x83, 0x80, 0x9d, 0x89, + 0xac, 0x8b, 0xac, 0x7a, 0x44, 0x9f, 0x5f, 0x7d, 0xac, 0x5a, 0xbf, 0xa7, + 0x81, 0x3b, 0x68, 0x4b, 0x84, 0x65, 0xa5, 0xae, 0x3f, 0x6c, 0x7a, 0x8d, + 0x4b, 0x9c, 0x69, 0x5b, 0x7d, 0xa2, 0x85, 0x9e, 0x4e, 0xba, 0x8f, 0x67, + 0x79, 0x7f, 0xb0, 0x98, 0x60, 0x5e, 0x45, 0x4f, 0x57, 0x98, 0xbb, 0xc2, + 0x86, 0x5a, 0x53, 0x8e, 0x84, 0xbd, 0x72, 0x40, 0x68, 0xa9, 0x47, 0x91, + 0x95, 0x55, 0x48, 0x74, 0xae, 0x7c, 0x56, 0x88, 0xba, 0x44, 0x93, 0x3e, + 0x82, 0x67, 0x77, 0x48, 0x73, 0x75, 0x94, 0x60, 0x4e, 0x60, 0xaf, 0xc0, + 0x99, 0x4a, 0x69, 0x65, 0x98, 0xa9, 0x65, 0x79, 0x64, 0x4f, 0x4d, 0x79, + 0x9d, 0x96, 0x39, 0xa0, 0x4c, 0x72, 0x57, 0x75, 0x4a, 0x74, 0x5f, 0x45, + 0xbc, 0x94, 0xb3, 0xb0, 0xb9, 0x7b, 0x3d, 0x6a, 0x5f, 0xb6, 0x76, 0x75, + 0x84, 0xa6, 0xb0, 0xb2, 0xb9, 0x7f, 0x4c, 0x81, 0x9a, 0x4f, 0x76, 0x49, + 0x95, 0x8e, 0xa2, 0x4f, 0xc4, 0xb8, 0x70, 0x3c, 0x74, 0x66, 0x46, 0x51, + 0x9b, 0x4b, 0xbb, 0x94, 0x79, 0xa8, 0x53, 0x7a, 0x76, 0x7f, 0xab, 0xad, + 0x81, 0x95, 0x6e, 0xaf, 0x63, 0x7d, 0xbc, 0xac, 0x4e, 0xb1, 0x80, 0x7f, + 0x3d, 0x80, 0xbe, 0x63, 0x41, 0x69, 0xbf, 0xc7, 0xb5, 0xad, 0x78, 0x7f, + 0xa3, 0xbd, 0xc4, 0xc2, 0x78, 0x7b, 0x3f, 0xc3, 0xa7, 0x91, 0x42, 0x85, + 0x97, 0x65, 0x96, 0x96, 0xb9, 0x7a, 0x84, 0x44, 0xbf, 0x6f, 0x7f, 0x59, + 0x66, 0x71, 0xaa, 0x4c, 0x6e, 0x9e, 0x62, 0xc3, 0xaa, 0x72, 0xa6, 0x61, + 0x8f, 0x60, 0x85, 0x5d, 0x5e, 0x63, 0x63, 0x64, 0xb7, 0x95, 0x90, 0xb3, + 0x9a, 0x48, 0x71, 0x5f, 0x4a, 0x6e, 0x9a, 0x45, 0xaa, 0x9f, 0x98, 0x6c, + 0x45, 0x6b, 0xa4, 0xa4, 0x5f, 0x82, 0x97, 0x4c, 0xa1, 0x94, 0xa3, 0xb2, + 0xb1, 0xaa, 0xae, 0x6e, 0xa7, 0x4a, 0x84, 0x50, 0x62, 0x75, 0x93, 0x74, + 0x96, 0xb8, 0xa4, 0x5a, 0xb7, 0x56, 0x73, 0x47, 0x53, 0x75, 0x5c, 0x81, + 0x5b, 0x79, 0xb6, 0x47, 0x4b, 0x96, 0xc0, 0x82, 0xae, 0xa9, 0x91, 0x72, + 0x77, 0x94, 0xbb, 0x78, 0xa6, 0x7b, 0x6f, 0x5e, 0xc5, 0x8c, 0xc1, 0xb6, + 0x9d, 0x84, 0x3c, 0xa4, 0x4a, 0xae, 0xad, 0x5b, 0x5c, 0xc0, 0x67, 0x9a, + 0xac, 0xa0, 0x90, 0x5e, 0xa6, 0x8e, 0x48, 0x78, 0xb5, 0x85, 0x77, 0x98, + 0x44, 0x44, 0x51, 0x8d, 0x5d, 0xc6, 0x8c, 0x96, 0x7e, 0x76, 0xbb, 0x63, + 0x74, 0x60, 0x7b, 0x57, 0x53, 0x8c, 0x9f, 0x4f, 0x63, 0x4c, 0x45, 0x50, + 0x4f, 0xb9, 0x8e, 0x74, 0x40, 0xaa, 0x96, 0x6e, 0xca, 0x5f, 0x6f, 0x91, + 0x53, 0x63, 0x5d, 0xae, 0x98, 0x5b, 0x9e, 0xa7, 0x92, 0x66, 0x81, 0x9d, + 0x7f, 0xa0, 0x42, 0xbd, 0x47, 0x57, 0x83, 0x5c, 0x6e, 0x8a, 0x3d, 0x94, + 0x42, 0xa6, 0x72, 0x6d, 
0x89, 0x6a, 0xad, 0x91, 0x9e, 0x45, 0x62, 0xab, + 0x90, 0x51, 0x9a, 0x4c, 0x94, 0xa2, 0x95, 0xb8, 0x89, 0xae, 0xbd, 0x53, + 0xb7, 0x6d, 0x7c, 0x64, 0x69, 0xb8, 0xa6, 0x53, 0xa6, 0x43, 0x54, 0x92, + 0x6f, 0x80, 0x98, 0x48, 0x77, 0x72, 0xac, 0x5c, 0x75, 0xa9, 0xb9, 0x47, + 0xc4, 0x57, 0x8b, 0x6c, 0x6f, 0x90, 0x80, 0x57, 0x7a, 0xa0, 0xa6, 0x7c, + 0x6c, 0x69, 0x78, 0x57, 0x88, 0x81, 0x9c, 0x50, 0x6f, 0x74, 0xa0, 0x6e, + 0xc2, 0x95, 0xac, 0x49, 0xd4, 0x8b, 0x31, 0x94, 0x70, 0x52, 0xa9, 0x9f, + 0x92, 0xcb, 0x8c, 0x4d, 0x7a, 0xaf, 0x79, 0x97, 0xa1, 0x88, 0x4f, 0x44, + 0x53, 0x8c, 0x94, 0xb0, 0x65, 0x4d, 0x5a, 0x44, 0x9b, 0x9a, 0xba, 0x51, + 0xba, 0x44, 0xc6, 0xc0, 0x95, 0x7f, 0x77, 0x95, 0x3e, 0x93, 0x49, 0xa2, + 0x64, 0xb6, 0x53, 0x4e, 0x9c, 0x6c, 0x46, 0x4d, 0x51, 0x66, 0x96, 0x84, + 0x9f, 0x6d, 0xa9, 0xa6, 0x66, 0x9d, 0x6f, 0xc0, 0x9f, 0x7f, 0xab, 0xaf, + 0xc0, 0x49, 0x54, 0xa5, 0xa6, 0xa8, 0x56, 0x9f, 0xb7, 0x8a, 0xaf, 0x52, + 0xb3, 0x89, 0x4e, 0x84, 0xb5, 0x86, 0xa4, 0x8c, 0x52, 0xc1, 0x87, 0x50, + 0x81, 0x6e, 0x6f, 0x8b, 0x76, 0x4f, 0xb1, 0x94, 0x41, 0xa0, 0xaf, 0x80, + 0xa3, 0x4c, 0x5c, 0x4b, 0x71, 0xb7, 0x62, 0xba, 0xb4, 0x8e, 0xb2, 0x55, + 0x7c, 0x6e, 0x8c, 0xb2, 0x9c, 0x59, 0x44, 0x68, 0x53, 0x8d, 0x6a, 0x90, + 0xa9, 0x7a, 0x75, 0x99, 0x9d, 0x84, 0xa6, 0xa2, 0x43, 0xaa, 0xc6, 0xd0, + 0xb0, 0x68, 0x48, 0xc9, 0x78, 0x5d, 0x3d, 0x5d, 0xc3, 0x91, 0x7f, 0x65, + 0x83, 0xaa, 0x82, 0x45, 0xb1, 0x52, 0x7a, 0x78, 0xa1, 0xa1, 0x42, 0x57, + 0xa7, 0x95, 0xb3, 0x75, 0xa7, 0x83, 0x81, 0x6e, 0xb2, 0x53, 0x84, 0x9d, + 0x50, 0x87, 0x4c, 0x99, 0x84, 0x88, 0x58, 0xb6, 0x78, 0xae, 0x46, 0x6d, + 0x54, 0x66, 0xb8, 0x65, 0x48, 0xa3, 0x71, 0x5b, 0xb5, 0x48, 0xc6, 0x6c, + 0x4e, 0xa1, 0x45, 0x86, 0x85, 0x81, 0x58, 0x6b, 0x8b, 0x4d, 0x88, 0x59, + 0x8b, 0x65, 0x25, 0x64, 0x83, 0x54, 0x94, 0x40, 0xa5, 0x48, 0x91, 0x46, + 0x7a, 0xb1, 0x85, 0xbb, 0x84, 0xbe, 0xb9, 0x5b, 0x8e, 0x3a, 0x50, 0xa4, + 0x8a, 0xa0, 0x49, 0x75, 0x7d, 0x66, 0x39, 0x8f, 0xbe, 0x71, 0x7e, 0xb0, + 0xc3, 0xae, 0x71, 0x42, 0x4d, 0x6c, 0x54, 0x76, 0x6d, 0xb6, 0xa6, 0x59, + 0x6f, 0x3e, 0x46, 0x50, 0x5c, 0xad, 0x6e, 0x83, 0xa8, 0x80, 0x84, 0x85, + 0xb9, 0x8e, 0x6d, 0xb7, 0x8a, 0x58, 0xab, 0x70, 0x6a, 0x4d, 0xc8, 0x3c, + 0x96, 0x9c, 0x93, 0x7d, 0x76, 0xa5, 0xb7, 0x46, 0x6e, 0x46, 0xb2, 0x54, + 0x90, 0x7c, 0x7d, 0x75, 0xbe, 0x4b, 0x74, 0xb7, 0x8b, 0xa7, 0x6f, 0x85, + 0x72, 0xb3, 0x78, 0x87, 0xb6, 0x60, 0x7c, 0xa1, 0x50, 0x8c, 0x80, 0x90, + 0xad, 0xbe, 0x5b, 0x8f, 0xd2, 0x65, 0xa6, 0x9e, 0x88, 0x44, 0x9d, 0x42, + 0x77, 0x8d, 0x54, 0x78, 0x73, 0x41, 0x94, 0x65, 0x74, 0x52, 0x78, 0x70, + 0x3a, 0x62, 0x74, 0x64, 0xb5, 0x64, 0xad, 0x5e, 0x80, 0xa5, 0x47, 0x95, + 0xbb, 0x49, 0xa3, 0x5e, 0x67, 0xa1, 0x57, 0x92, 0x5a, 0x7f, 0xaf, 0x45, + 0x93, 0x76, 0xbd, 0xbd, 0x53, 0x6a, 0x90, 0xa3, 0x78, 0x65, 0x86, 0x65, + 0x68, 0x81, 0x8b, 0x91, 0x97, 0x6c, 0x46, 0x63, 0xa5, 0x95, 0x6b, 0xaf, + 0xa2, 0xc0, 0x42, 0x5b, 0x55, 0x45, 0x50, 0xaa, 0x3f, 0x4f, 0x96, 0xac, + 0xc2, 0xbf, 0x9c, 0x6c, 0x89, 0xc1, 0x83, 0xa9, 0x56, 0x7f, 0xc4, 0x40, + 0xa4, 0xba, 0x87, 0x96, 0x76, 0x45, 0xba, 0x47, 0x85, 0x5c, 0xae, 0x5a, + 0x4f, 0x7a, 0xa1, 0x6a, 0x63, 0x73, 0xb1, 0x70, 0x45, 0x6e, 0x71, 0xab, + 0x54, 0x9a, 0x67, 0x49, 0xa7, 0xc7, 0x96, 0xa6, 0x43, 0xa4, 0xc9, 0x99, + 0x40, 0x95, 0x8f, 0x71, 0x81, 0x77, 0x41, 0x95, 0x27, 0x7c, 0xbd, 0xb2, + 0xae, 0x5e, 0x6d, 0xd2, 0xc8, 0x77, 0x54, 0x7b, 0x5a, 0x4f, 0x32, 0x70, + 0xab, 0x93, 0x71, 0x68, 0xb7, 0xbd, 0x85, 0x66, 0x84, 0xa8, 0x9e, 0x75, + 0x9a, 0x81, 0x92, 0x61, 0x69, 0x83, 0x96, 0x87, 0xbc, 0x73, 0x4a, 0x37, + 0x2d, 0x89, 0x57, 0x73, 
0x40, 0x2a, 0x40, 0x56, 0x3d, 0x8a, 0x85, 0x97, + 0x93, 0x81, 0x86, 0x6e, 0x52, 0x7f, 0x9d, 0x59, 0x58, 0x9c, 0xcb, 0x6e, + 0x8a, 0x59, 0x56, 0xb7, 0xba, 0x8e, 0x76, 0x9c, 0x58, 0x45, 0x74, 0xc5, + 0x53, 0x50, 0x83, 0xbd, 0x96, 0xb3, 0x9a, 0x65, 0x45, 0xac, 0xbe, 0x5c, + 0x3e, 0x5b, 0x54, 0x8c, 0x48, 0xb2, 0xb4, 0x67, 0x50, 0xae, 0xa8, 0x91, + 0x74, 0xa3, 0x64, 0xb5, 0x6e, 0x2c, 0x2f, 0x72, 0xc9, 0x90, 0x92, 0x82, + 0xab, 0x72, 0xa1, 0xac, 0x4d, 0x46, 0xa6, 0x88, 0x7d, 0x8e, 0x55, 0xb8, + 0x58, 0x65, 0x54, 0xc9, 0x6f, 0xb1, 0x88, 0x98, 0xa1, 0xb0, 0xaf, 0x7e, + 0xba, 0xb8, 0x89, 0xbc, 0x6d, 0x97, 0x51, 0x84, 0x84, 0x4e, 0xbb, 0x68, + 0xa3, 0x47, 0x82, 0x69, 0xa8, 0x6f, 0xb4, 0x7b, 0x70, 0x94, 0x9e, 0x49, + 0xba, 0xa5, 0x3f, 0x3f, 0xa0, 0x53, 0x8a, 0x9a, 0x7d, 0xaa, 0xae, 0x56, + 0x5c, 0x45, 0xb6, 0x6e, 0x79, 0x59, 0x62, 0x68, 0x97, 0xb8, 0x74, 0x73, + 0x8d, 0x92, 0x57, 0x96, 0x4a, 0xae, 0xa4, 0x4d, 0x94, 0x92, 0x3c, 0x71, + 0x6f, 0x4c, 0x78, 0x69, 0x97, 0xad, 0xa9, 0xc2, 0xb6, 0x97, 0x8a, 0xb5, + 0xc7, 0x9d, 0x5b, 0x68, 0x74, 0x57, 0xb6, 0x73, 0x8f, 0x9b, 0xc9, 0x8c, + 0xc4, 0x64, 0x8b, 0x95, 0xba, 0xb3, 0x85, 0xa9, 0x4c, 0xc3, 0x77, 0x9a, + 0xaa, 0x90, 0x5f, 0xd2, 0x99, 0x48, 0x43, 0x6c, 0x82, 0x92, 0x4c, 0x39, + 0x85, 0x8d, 0x6e, 0x83, 0xb8, 0xb6, 0x8e, 0x72, 0xbc, 0xc7, 0x92, 0x5b, + 0x7f, 0xe8, 0x8d, 0x82, 0x3a, 0x9c, 0x3c, 0x6d, 0xca, 0x9b, 0x8b, 0x57, + 0x79, 0x54, 0x5f, 0x41, 0xa7, 0x7a, 0x74, 0x60, 0x94, 0x78, 0x67, 0x89, + 0xc1, 0x93, 0x66, 0x63, 0x95, 0x95, 0x4e, 0xcc, 0x53, 0xcb, 0xa2, 0xa8, + 0x84, 0xc0, 0x4e, 0x5d, 0x54, 0xad, 0x8f, 0x65, 0x8b, 0x4a, 0x4d, 0xc7, + 0x64, 0x76, 0x3e, 0xb8, 0x62, 0xa7, 0x3d, 0x9b, 0x76, 0x4b, 0x5f, 0xbe, + 0x62, 0x53, 0x70, 0x8e, 0x80, 0x77, 0x7e, 0x89, 0x8e, 0x44, 0x55, 0x96, + 0x4d, 0xae, 0x65, 0x72, 0x7c, 0x61, 0x9b, 0x43, 0xc4, 0x48, 0x89, 0xb1, + 0x80, 0xc6, 0x6a, 0x92, 0x99, 0xc9, 0xd4, 0x60, 0x8d, 0x95, 0xcd, 0xad, + 0x9d, 0xa1, 0x53, 0xc2, 0xbb, 0xaf, 0xa4, 0xb4, 0x5b, 0x8e, 0x44, 0x8a, + 0x95, 0x4b, 0xa4, 0xb8, 0x44, 0xa4, 0x5b, 0x36, 0x84, 0xc3, 0x7b, 0x3a, + 0x39, 0xa7, 0x62, 0xc3, 0xa4, 0x3e, 0x94, 0xba, 0xaa, 0xa2, 0x83, 0x4b, + 0xa9, 0xb1, 0xb0, 0xc3, 0x7b, 0x41, 0x5c, 0x73, 0x5f, 0xc5, 0x5e, 0x89, + 0x95, 0x5d, 0x85, 0xa0, 0x85, 0x55, 0x8e, 0x89, 0x70, 0x51, 0x53, 0x52, + 0x76, 0x52, 0x3f, 0x65, 0x8f, 0x4e, 0x84, 0xab, 0x45, 0x3e, 0x4f, 0xb6, + 0x49, 0xae, 0x58, 0x86, 0x4a, 0x8a, 0x84, 0x2b, 0x70, 0x7c, 0xc1, 0x71, + 0x67, 0xc7, 0x86, 0xb5, 0x74, 0x5f, 0xb5, 0x55, 0x4e, 0xb6, 0x77, 0x9a, + 0x51, 0xba, 0xab, 0xc2, 0x51, 0x61, 0xc0, 0xa8, 0x60, 0x79, 0x9e, 0xc1, + 0x66, 0x9b, 0xa3, 0x93, 0x40, 0x70, 0xb0, 0xa5, 0x59, 0x75, 0xb0, 0x8f, + 0xb1, 0xa6, 0x87, 0xb7, 0x75, 0x75, 0x9e, 0x6f, 0xbd, 0x68, 0x37, 0xac, + 0x94, 0x5a, 0x43, 0x66, 0x58, 0x72, 0x50, 0x5b, 0xbd, 0x95, 0x49, 0x95, + 0x53, 0x44, 0x49, 0x7c, 0x57, 0x50, 0x83, 0x6b, 0x6e, 0x6e, 0x3d, 0x88, + 0x9b, 0x5c, 0x9a, 0xae, 0x5f, 0x47, 0xb1, 0xb8, 0x66, 0x96, 0x8b, 0x66, + 0x9f, 0x50, 0x78, 0xa6, 0xab, 0xaf, 0x5d, 0x68, 0x69, 0x5d, 0x7b, 0x5a, + 0xb6, 0x6a, 0x5d, 0x69, 0x75, 0x57, 0xa8, 0x70, 0x80, 0x88, 0x8f, 0xa9, + 0x71, 0x6b, 0x82, 0xbe, 0x77, 0x74, 0x3c, 0x91, 0x79, 0x53, 0x32, 0x93, + 0x96, 0x78, 0x87, 0x3f, 0x47, 0xbd, 0xad, 0x97, 0x4b, 0x84, 0xa0, 0x75, + 0xa1, 0xb4, 0x4b, 0x56, 0x5b, 0x6c, 0x6f, 0x8b, 0x99, 0xb5, 0x64, 0xa9, + 0x58, 0x84, 0x5e, 0x59, 0xbb, 0x75, 0xac, 0x69, 0x87, 0x6a, 0x68, 0x67, + 0xc4, 0xa2, 0x93, 0xb0, 0x48, 0x4b, 0x6b, 0xa1, 0x9a, 0x84, 0x74, 0xb8, + 0x74, 0x5e, 0x8e, 0x50, 0x9a, 0xa3, 0xa8, 0x5c, 0x45, 0x84, 0xa6, 0x65, + 0x50, 0x59, 0x5f, 0x47, 
0x55, 0xb3, 0x4c, 0x7b, 0xc9, 0x4e, 0x54, 0x8a, + 0xaa, 0x93, 0x86, 0x69, 0x88, 0x88, 0x85, 0x4b, 0x96, 0x86, 0x7b, 0x9d, + 0xa1, 0x97, 0xa7, 0x8b, 0x4a, 0x9e, 0x69, 0x84, 0x48, 0x7a, 0x95, 0xbf, + 0x47, 0xae, 0x90, 0x7d, 0x86, 0x9e, 0xb2, 0x76, 0x63, 0x57, 0xa1, 0x99, + 0x45, 0x85, 0x66, 0xbc, 0x62, 0x4b, 0x45, 0x69, 0x91, 0x95, 0x81, 0x32, + 0x76, 0x70, 0x9b, 0x7e, 0x2e, 0x76, 0x5c, 0xc4, 0x82, 0x79, 0xcb, 0x82, + 0xb1, 0xa3, 0x6c, 0x67, 0xb3, 0x86, 0x73, 0x58, 0x65, 0x42, 0xb5, 0x5e, + 0x33, 0x50, 0x6a, 0x77, 0x49, 0x34, 0x76, 0x73, 0x60, 0x5a, 0x99, 0xc1, + 0x45, 0x79, 0x2e, 0x48, 0xa5, 0xb3, 0x75, 0xa0, 0x5e, 0x60, 0x76, 0x83, + 0xa7, 0x8d, 0xac, 0xa8, 0xa4, 0x94, 0x51, 0xa8, 0x72, 0x9e, 0xb1, 0x38, + 0xb1, 0xbe, 0xba, 0x6d, 0x69, 0x81, 0x4c, 0xa4, 0x53, 0x5c, 0x9e, 0x79, + 0x77, 0x71, 0xc0, 0x4e, 0x3d, 0x4c, 0x83, 0x83, 0x5b, 0x93, 0x34, 0x8a, + 0x5e, 0x53, 0x81, 0xa4, 0xbc, 0x7e, 0x50, 0x5e, 0x5f, 0x5a, 0x70, 0x5a, + 0x58, 0xab, 0xbd, 0xa5, 0xb1, 0x44, 0xad, 0x60, 0xb0, 0x5c, 0x4c, 0x5f, + 0x5f, 0x44, 0x58, 0x85, 0x87, 0x71, 0x70, 0x7d, 0x51, 0x5e, 0x5d, 0x97, + 0x39, 0x99, 0x7d, 0xa8, 0x75, 0x94, 0xc5, 0xa8, 0xa9, 0x57, 0x9c, 0xa2, + 0xaa, 0x72, 0x38, 0x46, 0x91, 0xab, 0x59, 0xb6, 0xa0, 0x4a, 0x85, 0x53, + 0x66, 0xab, 0xb3, 0xa7, 0xc6, 0xaf, 0x92, 0xbe, 0x7d, 0xcc, 0x96, 0x63, + 0x97, 0x82, 0x77, 0x85, 0x70, 0xaa, 0xd2, 0x88, 0xb2, 0x6e, 0x74, 0x6a, + 0x8d, 0x87, 0xa5, 0x6d, 0x7f, 0x82, 0x7a, 0x91, 0x4b, 0x71, 0x84, 0xa9, + 0x91, 0x6f, 0x5a, 0xc8, 0x83, 0x70, 0x4a, 0x9d, 0x84, 0x74, 0xa4, 0x5a, + 0xbb, 0x5a, 0x91, 0xb1, 0x4f, 0x52, 0x72, 0x63, 0x5a, 0x8b, 0x50, 0x51, + 0xa9, 0x83, 0xb4, 0x6f, 0x7d, 0x69, 0x6c, 0x53, 0x94, 0x6d, 0x5a, 0x41, + 0x86, 0xba, 0x46, 0x4d, 0x8c, 0xb4, 0xb4, 0xa1, 0xa5, 0x3a, 0x8f, 0xa8, + 0x4b, 0xb7, 0x4b, 0xb8, 0x73, 0x4c, 0x72, 0x68, 0x71, 0x3e, 0xbe, 0x8f, + 0x68, 0x43, 0xa9, 0x9c, 0x37, 0x67, 0x8e, 0x65, 0x66, 0x67, 0x99, 0xa4, + 0x95, 0x67, 0x9d, 0x82, 0x91, 0x3d, 0x33, 0x8e, 0xc2, 0x51, 0xa4, 0x47, + 0x56, 0xbc, 0x3d, 0xa1, 0x93, 0x96, 0x4f, 0x54, 0x92, 0x9c, 0x53, 0xad, + 0x9b, 0x46, 0x99, 0x9d, 0xa8, 0xac, 0x96, 0x51, 0x3e, 0xb3, 0xa3, 0x85, + 0xaa, 0x45, 0xa5, 0x84, 0x84, 0x96, 0x6d, 0x7d, 0x9a, 0x86, 0x6c, 0x46, + 0x44, 0xc7, 0x6c, 0x4d, 0x62, 0xb2, 0x97, 0xa5, 0x5f, 0x8d, 0x84, 0xa5, + 0x6f, 0x60, 0x62, 0x69, 0x52, 0xc9, 0xb8, 0x88, 0x95, 0x55, 0xa6, 0x3f, + 0x6f, 0x6a, 0x78, 0x9b, 0x30, 0x90, 0x66, 0x4f, 0x4c, 0x5d, 0x57, 0x93, + 0x84, 0x4c, 0x70, 0x95, 0x57, 0x9d, 0x57, 0x8c, 0xa7, 0x63, 0x90, 0x61, + 0x83, 0x97, 0x92, 0x57, 0xa5, 0x8c, 0x87, 0x58, 0xa0, 0x8f, 0xb3, 0x72, + 0x98, 0x9f, 0x9f, 0x78, 0x6a, 0x66, 0x39, 0xa7, 0xb4, 0xaa, 0x8c, 0x79, + 0x87, 0x43, 0x5b, 0x37, 0xb0, 0x55, 0x39, 0x49, 0x86, 0x85, 0xa4, 0x50, + 0x46, 0x8b, 0x79, 0x64, 0xbb, 0xae, 0x39, 0x89, 0xb9, 0x73, 0x80, 0xa4, + 0xb2, 0x66, 0x84, 0x9d, 0x65, 0x95, 0x37, 0x55, 0x72, 0x76, 0x85, 0x9f, + 0x59, 0x7c, 0x97, 0xba, 0x80, 0x8e, 0x48, 0xc0, 0xae, 0x88, 0x42, 0xaf, + 0x95, 0x91, 0x9a, 0x85, 0xb3, 0x88, 0x67, 0xa5, 0x8b, 0x9b, 0x68, 0x66, + 0x7f, 0x41, 0x36, 0x3f, 0x8d, 0x4e, 0x6e, 0x88, 0x94, 0x8c, 0x55, 0x88, + 0x58, 0x94, 0x74, 0x7b, 0x41, 0x84, 0x9f, 0x48, 0x91, 0x4a, 0x62, 0x46, + 0xb0, 0x9c, 0x89, 0x97, 0x94, 0x6e, 0x75, 0xa9, 0xaa, 0x7b, 0x82, 0xb0, + 0xbd, 0x62, 0x73, 0xc7, 0x94, 0x42, 0x63, 0x49, 0x44, 0x4e, 0xa7, 0x70, + 0xb6, 0x53, 0x71, 0x51, 0x9d, 0xb6, 0x8c, 0x4d, 0x84, 0xb8, 0x7c, 0x55, + 0xba, 0x8a, 0x89, 0x92, 0xaf, 0x7c, 0x77, 0x4b, 0xb3, 0xaa, 0xb4, 0x6c, + 0x3d, 0xa0, 0x8f, 0x7b, 0x7f, 0x89, 0xb3, 0x8f, 0x4b, 0x5d, 0x5a, 0x8c, + 0x88, 0x9e, 0xbd, 0x97, 
0x42, 0x84, 0xa2, 0x4f, 0x92, 0x9b, 0x83, 0x41, + 0x72, 0x6a, 0x7c, 0x4c, 0x5a, 0x84, 0x6f, 0xa2, 0x79, 0xb6, 0xb4, 0xb0, + 0x51, 0xb8, 0x83, 0xb6, 0x4a, 0x95, 0x62, 0x4c, 0x91, 0x66, 0xa2, 0x81, + 0x6d, 0x5b, 0x90, 0x6f, 0x44, 0x76, 0x47, 0x9c, 0xc2, 0x7a, 0x53, 0x79, + 0x87, 0x80, 0xba, 0x99, 0xc6, 0x50, 0x90, 0x9c, 0x33, 0x33, 0x95, 0x36, + 0x27, 0x90, 0x93, 0x41, 0x8f, 0xb6, 0x9d, 0xa2, 0xb2, 0xa6, 0x44, 0xa7, + 0xaa, 0xa8, 0x96, 0x8d, 0x62, 0x60, 0xba, 0xbc, 0x88, 0x7e, 0x41, 0x8a, + 0x4e, 0xa7, 0x9b, 0x75, 0x51, 0x83, 0x89, 0xa6, 0x75, 0x90, 0xa5, 0x77, + 0x87, 0x6c, 0x8b, 0x93, 0x83, 0x83, 0x99, 0xb9, 0x53, 0x9e, 0x82, 0x87, + 0x6c, 0x52, 0x68, 0x44, 0x52, 0x61, 0x82, 0x88, 0x97, 0xac, 0x4f, 0xa6, + 0x55, 0x98, 0x8c, 0x9c, 0x8b, 0xa8, 0x94, 0x89, 0x8c, 0x5f, 0x9e, 0x62, + 0x53, 0xa9, 0x65, 0x58, 0x83, 0x54, 0x63, 0x4d, 0xb9, 0x57, 0x92, 0x53, + 0x94, 0x72, 0xa1, 0x47, 0x9f, 0x92, 0x88, 0x8e, 0x53, 0x44, 0x78, 0x7d, + 0x90, 0x84, 0x5d, 0x80, 0xc0, 0xab, 0xc6, 0x94, 0x65, 0xa0, 0x77, 0x60, + 0xbf, 0x54, 0xb6, 0x8b, 0x64, 0x9e, 0xc0, 0x52, 0xa5, 0xad, 0x59, 0x37, + 0x6d, 0x4e, 0xb8, 0x51, 0x8d, 0x4d, 0x8e, 0xa2, 0x42, 0x6c, 0xa5, 0xa3, + 0x66, 0xa2, 0x42, 0x5b, 0x6f, 0x9d, 0x49, 0xa1, 0x7e, 0xab, 0x48, 0x5f, + 0x6c, 0x74, 0x81, 0x9c, 0x8c, 0x64, 0x4f, 0x60, 0xaf, 0x96, 0x40, 0x83, + 0x9e, 0x92, 0xa5, 0xa1, 0x9e, 0x9f, 0xba, 0x41, 0x7b, 0x83, 0xa3, 0xb4, + 0xba, 0x87, 0x8c, 0xb4, 0x3e, 0x42, 0x6e, 0x47, 0x40, 0x49, 0x6c, 0x86, + 0xaf, 0x52, 0x49, 0x70, 0x5e, 0x98, 0x4e, 0x57, 0x85, 0x62, 0x76, 0x8b, + 0x4f, 0x72, 0xb5, 0x80, 0x64, 0xa8, 0x92, 0x76, 0xa3, 0x78, 0x78, 0x86, + 0xc0, 0xa1, 0xbc, 0x78, 0x50, 0x58, 0xaf, 0x7b, 0x9c, 0x36, 0x70, 0x98, + 0xac, 0xa8, 0x5c, 0x87, 0x93, 0x5c, 0x5b, 0x71, 0x4c, 0x59, 0x66, 0x87, + 0xbb, 0x3e, 0x92, 0x9a, 0xb6, 0x83, 0x92, 0xb7, 0x93, 0x57, 0x3b, 0x7d, + 0xbe, 0x4a, 0x90, 0x74, 0x9b, 0xb3, 0x85, 0x83, 0x55, 0xca, 0x7e, 0x44, + 0xa9, 0x4d, 0xb3, 0x3c, 0x49, 0xbb, 0xae, 0x53, 0x35, 0x39, 0xbf, 0x9a, + 0xa1, 0x4e, 0xac, 0xaf, 0x58, 0x65, 0xbd, 0xa5, 0x7e, 0xa0, 0xae, 0x99, + 0x3b, 0x50, 0xb3, 0x64, 0x7c, 0x5b, 0xb4, 0x66, 0x88, 0x5a, 0xba, 0xb4, + 0xc3, 0x88, 0x49, 0x75, 0x41, 0x80, 0xaf, 0x6d, 0x69, 0x6a, 0x76, 0xdc, + 0xaa, 0x83, 0x34, 0x4b, 0xa3, 0xad, 0x9f, 0x2f, 0x9f, 0x5f, 0x95, 0x9b, + 0x53, 0x6a, 0x9b, 0x6e, 0x86, 0x6a, 0xab, 0x8d, 0x6a, 0xb2, 0xae, 0x31, + 0x5a, 0x6c, 0x51, 0xbf, 0xb7, 0x98, 0x87, 0x80, 0x7a, 0x60, 0x5b, 0x5b, + 0xb5, 0x5a, 0xb9, 0x75, 0x91, 0x45, 0xc0, 0x99, 0x99, 0xc0, 0x4f, 0x6a, + 0x70, 0x5f, 0x71, 0xb6, 0x7b, 0x8a, 0x74, 0x75, 0x80, 0xaa, 0xc1, 0x7d, + 0x78, 0x94, 0x4f, 0x8a, 0x7c, 0x8d, 0x80, 0x4f, 0x57, 0x7b, 0x9f, 0x94, + 0xc3, 0x3a, 0x82, 0xb4, 0x6f, 0xa4, 0x95, 0x93, 0xa4, 0xab, 0xb0, 0xaa, + 0xc2, 0xac, 0x62, 0x84, 0xa4, 0xa1, 0x5b, 0x9a, 0x49, 0x69, 0xce, 0x72, + 0x71, 0x6f, 0x87, 0xb5, 0x7a, 0xb9, 0x8d, 0x8d, 0xa7, 0x96, 0x99, 0x35, + 0x4b, 0x77, 0x68, 0x8f, 0x9d, 0x64, 0x72, 0x51, 0x67, 0x80, 0x48, 0xd7, + 0x9f, 0xcc, 0x52, 0x7a, 0xa1, 0x79, 0x8d, 0x7a, 0x45, 0xb0, 0xaf, 0x97, + 0x81, 0x89, 0x6c, 0x4f, 0x78, 0x5e, 0xa6, 0x84, 0x95, 0x46, 0x59, 0x69, + 0x68, 0x60, 0xaf, 0x70, 0x4e, 0x96, 0x8d, 0x89, 0x42, 0x4b, 0x7a, 0xc0, + 0x68, 0xaf, 0xae, 0x9c, 0x55, 0x7d, 0x3d, 0xc6, 0xb9, 0xa3, 0xa7, 0x40, + 0xb1, 0x65, 0x99, 0xb1, 0x5d, 0x7f, 0xba, 0xb8, 0x61, 0x55, 0xa8, 0xa2, + 0x8e, 0x99, 0x50, 0x64, 0x6e, 0xab, 0x7e, 0x8c, 0x73, 0x82, 0x6b, 0x9e, + 0xa1, 0x91, 0xbb, 0xb6, 0x61, 0x6a, 0x5f, 0x52, 0x64, 0x64, 0x6e, 0x4c, + 0x3b, 0x6c, 0x7c, 0x52, 0x60, 0xa6, 0x37, 0x8e, 0x58, 0x97, 0x6c, 0x57, + 0xc9, 0x8e, 0x5f, 0x97, 
0x57, 0x98, 0x79, 0xaf, 0x8a, 0x41, 0xaf, 0xa2, + 0x8f, 0x9b, 0xbe, 0x85, 0xa7, 0xa2, 0x66, 0x83, 0x66, 0x8e, 0xa7, 0x4d, + 0x85, 0x84, 0x6e, 0x84, 0xb2, 0xb7, 0x5b, 0x42, 0xb3, 0xb2, 0x5a, 0x6c, + 0x86, 0xa7, 0x7b, 0x9c, 0x41, 0x50, 0xaa, 0x8f, 0x69, 0x4c, 0x70, 0x42, + 0x4a, 0xb6, 0x6c, 0x99, 0xac, 0xbe, 0x38, 0xa0, 0xcc, 0x88, 0xc0, 0x2a, + 0xc2, 0xcc, 0x3e, 0x51, 0x35, 0x78, 0x80, 0x8f, 0x88, 0xaa, 0xb2, 0x94, + 0x6a, 0x9e, 0x5b, 0x73, 0x82, 0xaf, 0x59, 0xb4, 0x7d, 0x87, 0x7d, 0xb2, + 0x75, 0x68, 0x7b, 0x31, 0x47, 0x81, 0x67, 0x3f, 0x39, 0x5e, 0xba, 0xa4, + 0xbd, 0xb0, 0xa1, 0xb9, 0xa2, 0x8f, 0xba, 0x4d, 0x57, 0x41, 0x3a, 0x84, + 0x3a, 0x6d, 0x65, 0x3a, 0x68, 0x9d, 0xb6, 0x57, 0x97, 0x4f, 0x40, 0x6a, + 0x3d, 0x81, 0x81, 0x4f, 0x3e, 0x52, 0x6e, 0xb8, 0x75, 0x9a, 0x5f, 0x5b, + 0x71, 0x55, 0x3e, 0x4f, 0x6c, 0x7a, 0x45, 0x70, 0xb3, 0x85, 0xab, 0x6d, + 0x83, 0x3e, 0x41, 0x83, 0x7e, 0x72, 0xa3, 0x4c, 0x6e, 0xa5, 0x61, 0x3c, + 0x96, 0x77, 0x84, 0x4c, 0x4f, 0x46, 0xbb, 0x9e, 0x68, 0x82, 0x98, 0x64, + 0x80, 0x3d, 0x41, 0x92, 0xaf, 0x69, 0x45, 0xa9, 0x5d, 0xa7, 0xb2, 0x7f, + 0xaa, 0x78, 0xb1, 0x81, 0x74, 0x96, 0x9b, 0x61, 0xa1, 0x75, 0x71, 0x48, + 0xc6, 0x55, 0x48, 0x6c, 0x67, 0x88, 0xbe, 0x88, 0x6a, 0x50, 0xaf, 0x68, + 0x84, 0xb1, 0xb8, 0x46, 0x64, 0xb8, 0x6f, 0x85, 0x68, 0x8a, 0x77, 0x60, + 0x40, 0xb6, 0xae, 0x73, 0xb9, 0xa1, 0xa4, 0xb0, 0x5c, 0x6d, 0x8e, 0x81, + 0x71, 0x7f, 0xc6, 0x85, 0xb4, 0xa5, 0x36, 0xc9, 0xc3, 0x47, 0x89, 0x8c, + 0x94, 0x5c, 0x8e, 0x33, 0x90, 0x42, 0x25, 0x82, 0x30, 0x74, 0x6e, 0x6e, + 0xaf, 0x5d, 0xb7, 0x74, 0x3d, 0x94, 0xce, 0x4f, 0x47, 0x60, 0xac, 0x52, + 0x56, 0x61, 0x9f, 0x93, 0x68, 0x6b, 0x76, 0x77, 0x55, 0xd6, 0x66, 0x54, + 0x66, 0x71, 0xa8, 0x33, 0x65, 0x96, 0xf4, 0x60, 0x2d, 0x4d, 0xd0, 0xa4, + 0x58, 0x55, 0x99, 0xae, 0x7a, 0x31, 0x8d, 0x81, 0x65, 0x7a, 0x6f, 0xb0, + 0x55, 0x6d, 0xbf, 0x7c, 0x74, 0x99, 0x67, 0x5b, 0x80, 0xa6, 0xb2, 0xa0, + 0x88, 0x89, 0x9b, 0x60, 0x58, 0x6d, 0x8f, 0xad, 0x4a, 0x4d, 0x98, 0x66, + 0x62, 0x85, 0x4e, 0x98, 0xba, 0x4c, 0x39, 0xbe, 0x88, 0x6f, 0x66, 0x61, + 0xa3, 0x85, 0xa7, 0x92, 0x63, 0xa1, 0xb3, 0x5c, 0x9f, 0x53, 0x6c, 0x8b, + 0x45, 0x7d, 0xcd, 0xaf, 0x3f, 0x79, 0x54, 0x9d, 0xb0, 0x9f, 0x92, 0x98, + 0x90, 0xa8, 0x79, 0x3f, 0x67, 0x99, 0x46, 0x53, 0x56, 0x50, 0x9b, 0xb2, + 0x6c, 0xa2, 0x78, 0x37, 0x4b, 0x66, 0x48, 0x8e, 0x52, 0x91, 0x8e, 0x99, + 0x6c, 0x53, 0xab, 0x98, 0xa2, 0x80, 0x45, 0x5f, 0xbd, 0x91, 0xac, 0x58, + 0xa4, 0x59, 0x5d, 0x9b, 0x97, 0x64, 0xcb, 0x80, 0x39, 0x7f, 0x56, 0xa6, + 0x7f, 0x31, 0x5a, 0x8d, 0xa3, 0xb2, 0x78, 0xb4, 0x6c, 0xcb, 0x95, 0xb4, + 0x63, 0xc6, 0x6c, 0xab, 0xa1, 0x44, 0x34, 0xd9, 0xa1, 0x8f, 0x25, 0xde, + 0xff, 0xb4, 0x49, 0xa7, 0xf2, 0xbc, 0x8d, 0x91, 0x79, 0xcf, 0xaf, 0x7e, + 0x94, 0x6b, 0x8c, 0x42, 0xb0, 0x47, 0x4b, 0x61, 0xa7, 0x44, 0x9d, 0x63, + 0xb1, 0xb0, 0xc0, 0x9d, 0x8b, 0xdf, 0xd3, 0x49, 0x80, 0x94, 0xc9, 0x8b, + 0x89, 0x95, 0xa0, 0x6e, 0x63, 0x6f, 0x88, 0x7d, 0xe6, 0x58, 0x99, 0x40, + 0xa7, 0xd0, 0x2c, 0x39, 0x3e, 0xa4, 0x4b, 0x66, 0x64, 0x5f, 0xaf, 0x43, + 0x52, 0x79, 0x8a, 0x9b, 0x8c, 0x41, 0x5d, 0x8a, 0x91, 0x46, 0x8a, 0x88, + 0xce, 0xbe, 0x2a, 0x6a, 0xb0, 0x6e, 0x74, 0x8f, 0xb3, 0xa9, 0x68, 0x4a, + 0xaf, 0xbc, 0x57, 0x61, 0x85, 0x56, 0xa4, 0x91, 0x7d, 0x44, 0x71, 0xb5, + 0x6e, 0x61, 0x4d, 0x47, 0xcb, 0x36, 0x19, 0x86, 0xbf, 0x53, 0x3f, 0x4d, + 0xa4, 0x53, 0x88, 0x42, 0x77, 0xc0, 0x9a, 0x53, 0x8a, 0x87, 0x9c, 0x7c, + 0x32, 0x69, 0x62, 0x97, 0x7c, 0x75, 0x63, 0xba, 0xc0, 0x24, 0xb0, 0xad, + 0x75, 0xa6, 0x7e, 0xcc, 0x78, 0x63, 0x67, 0x3e, 0xa3, 0x8e, 0xad, 0x6e, + 0x8f, 0x2c, 0xb5, 0xc5, 
0x77, 0x93, 0x3e, 0xab, 0xb0, 0x34, 0x21, 0xc0, + 0xa8, 0x89, 0x3c, 0x81, 0x7f, 0x9e, 0x83, 0x59, 0xbf, 0x66, 0x98, 0x51, + 0x47, 0x82, 0x91, 0x73, 0xa2, 0x9c, 0x5d, 0x83, 0x4d, 0x56, 0x9e, 0x69, + 0x91, 0x71, 0x8b, 0x80, 0x59, 0x85, 0x55, 0x49, 0x7e, 0x70, 0x45, 0x7f, + 0xa6, 0x8e, 0x69, 0x5a, 0x5c, 0x4b, 0x7c, 0x5c, 0x7a, 0x8a, 0x55, 0xac, + 0x6e, 0x3c, 0x66, 0x87, 0xb4, 0x48, 0x93, 0x6f, 0x95, 0x87, 0x65, 0x78, + 0x4d, 0x5d, 0x2f, 0x44, 0x8e, 0xce, 0x86, 0x5c, 0x55, 0xdb, 0x8e, 0x49, + 0x48, 0x8f, 0x96, 0x88, 0x99, 0x8a, 0xa5, 0x82, 0x57, 0x60, 0xc5, 0x89, + 0x7c, 0x37, 0xcc, 0x8a, 0x6a, 0x7d, 0xb2, 0x7a, 0x4e, 0x75, 0x7f, 0x79, + 0x4c, 0x4a, 0xa7, 0xbb, 0xb2, 0x92, 0x4f, 0x70, 0xb4, 0x52, 0xa9, 0x61, + 0x76, 0x75, 0x79, 0x87, 0xa9, 0x83, 0x49, 0x66, 0xb5, 0x3e, 0x94, 0x7a, + 0x8c, 0x43, 0x8e, 0x4e, 0x78, 0xb4, 0x48, 0xa9, 0x9e, 0x8d, 0x5d, 0x93, + 0x77, 0x64, 0xb5, 0xd1, 0x6e, 0x4c, 0xda, 0xd5, 0xae, 0x6e, 0x7b, 0x94, + 0x9f, 0x7f, 0x7c, 0x6b, 0xb6, 0xcb, 0xbf, 0x1d, 0x84, 0x7b, 0x1e, 0x41, + 0x43, 0x37, 0x52, 0x2e, 0x1c, 0x43, 0x6a, 0x52, 0x71, 0x62, 0x55, 0x4f, + 0x95, 0xa3, 0x9d, 0xc3, 0x8f, 0x4b, 0xc5, 0x84, 0xc0, 0x96, 0x8b, 0xe7, + 0x87, 0x9f, 0x5f, 0xad, 0x92, 0x81, 0x90, 0xa4, 0xb1, 0x78, 0x77, 0x8c, + 0xbb, 0x9c, 0x60, 0x90, 0x64, 0xad, 0x87, 0x91, 0xb5, 0x6a, 0x46, 0x51, + 0x72, 0xb2, 0x2b, 0x9f, 0x58, 0x44, 0x60, 0x82, 0xae, 0x58, 0x4c, 0x3b, + 0x8f, 0x99, 0x76, 0x3a, 0x86, 0x77, 0x38, 0x38, 0x77, 0x61, 0xc1, 0x53, + 0x38, 0x5a, 0x9a, 0xbf, 0x6e, 0x84, 0xbc, 0x9d, 0x81, 0x83, 0x7c, 0x72, + 0x90, 0xab, 0x70, 0x5a, 0xcd, 0xbb, 0x59, 0x51, 0x8e, 0x94, 0x41, 0x8b, + 0x68, 0x91, 0x61, 0xa9, 0x68, 0x8d, 0x2f, 0x2b, 0x56, 0x66, 0xc3, 0x52, + 0x64, 0x8c, 0x71, 0x72, 0xb7, 0x88, 0x44, 0x84, 0xa2, 0xae, 0x98, 0x3f, + 0xa8, 0xaf, 0x48, 0x37, 0x47, 0x9b, 0x64, 0x7a, 0x69, 0x47, 0x7d, 0xa4, + 0x56, 0x8a, 0x95, 0xb3, 0xc3, 0x8e, 0x62, 0x7e, 0x6f, 0x83, 0x7c, 0x6d, + 0x79, 0x3d, 0x49, 0x97, 0x4f, 0xaa, 0x9d, 0xb5, 0xc2, 0x78, 0x93, 0x97, + 0x86, 0x7b, 0x86, 0xb1, 0x77, 0xaa, 0x71, 0x56, 0x58, 0x73, 0x7b, 0x74, + 0xad, 0x58, 0x68, 0x62, 0x9c, 0x74, 0x7e, 0x8d, 0x71, 0x44, 0x64, 0x47, + 0xbc, 0x9b, 0x75, 0xb7, 0x92, 0x4b, 0x82, 0x63, 0x8c, 0xa1, 0x5e, 0x95, + 0x7e, 0x60, 0xc8, 0x89, 0x9a, 0xa2, 0x9a, 0xa7, 0x67, 0x89, 0x7f, 0x8a, + 0x8a, 0x6a, 0x45, 0x8d, 0xb5, 0x5e, 0x75, 0xb3, 0x5f, 0x56, 0x99, 0x4d, + 0x55, 0x7c, 0xc5, 0x52, 0xad, 0x87, 0xb3, 0x94, 0x61, 0xa3, 0x9f, 0x58, + 0x91, 0xa0, 0x5f, 0x4f, 0x4d, 0xb0, 0x5d, 0x41, 0xb7, 0xd4, 0xbe, 0xa3, + 0x89, 0x8c, 0x34, 0x4c, 0x6a, 0x40, 0x38, 0x2d, 0x83, 0x8d, 0x91, 0x78, + 0x60, 0x76, 0x58, 0x65, 0xb2, 0x52, 0x75, 0x56, 0x43, 0xab, 0xaf, 0xb6, + 0x8e, 0x99, 0x65, 0x5a, 0x5e, 0x83, 0x87, 0xb8, 0xb8, 0x63, 0x86, 0xbc, + 0xaf, 0x71, 0x7d, 0x8e, 0x7a, 0x82, 0x51, 0xaf, 0x57, 0x69, 0x75, 0x57, + 0xcc, 0xcb, 0x5a, 0x7f, 0x74, 0xa6, 0x57, 0x80, 0x9a, 0x6b, 0x2d, 0x97, + 0x9b, 0x7c, 0xa6, 0x2a, 0x83, 0x36, 0xc3, 0x5a, 0x3c, 0xb1, 0xb2, 0x9b, + 0xa2, 0x5f, 0x4f, 0xa1, 0x75, 0x7e, 0x38, 0x72, 0xab, 0x5f, 0x8f, 0x9f, + 0x56, 0x70, 0x86, 0x89, 0x57, 0x70, 0x8c, 0x4e, 0x67, 0x7b, 0x46, 0x51, + 0x87, 0x6d, 0x7c, 0x74, 0xb8, 0x79, 0x45, 0x43, 0x6e, 0x90, 0xc8, 0x86, + 0x5b, 0x6f, 0x6d, 0xbf, 0xec, 0x93, 0xc7, 0x8b, 0x98, 0x9b, 0x7c, 0x78, + 0xba, 0xa0, 0x6e, 0xce, 0x80, 0x71, 0x85, 0xaa, 0x59, 0x80, 0x45, 0xa3, + 0xb5, 0x96, 0x5e, 0xbf, 0x55, 0x93, 0x67, 0x87, 0x5b, 0x3e, 0x4b, 0x8e, + 0x56, 0x7a, 0x7b, 0xbf, 0x98, 0x35, 0xac, 0x7d, 0xa2, 0x7e, 0xbd, 0xb7, + 0x79, 0x6e, 0xa6, 0x54, 0x4f, 0x50, 0xa1, 0x8b, 0x62, 0x9b, 0x84, 0x78, + 0xca, 0x85, 0xa5, 0x5d, 
0x74, 0x73, 0x9b, 0xac, 0x56, 0x99, 0x65, 0x78, + 0x6f, 0x85, 0x40, 0x9b, 0x2d, 0x42, 0x52, 0x2f, 0x46, 0x78, 0x56, 0x5d, + 0x56, 0x3e, 0x9f, 0x9b, 0x51, 0x45, 0x62, 0x39, 0x66, 0xb5, 0xa9, 0xb8, + 0x7d, 0x97, 0x61, 0x96, 0xa9, 0x89, 0x89, 0xaa, 0xac, 0xa4, 0x5a, 0x93, + 0x5c, 0x9f, 0xb1, 0x96, 0x63, 0x8a, 0xbb, 0x45, 0x3b, 0xa5, 0xae, 0xa0, + 0xad, 0x53, 0xc2, 0x71, 0xa7, 0xc5, 0xb6, 0x88, 0xbf, 0xba, 0xbe, 0xc7, + 0x2f, 0x2b, 0xa5, 0x8c, 0x69, 0x49, 0x65, 0x85, 0x1b, 0xc, 0x51, 0x51, + 0x4f, 0x3c, 0xb5, 0x8d, 0xc2, 0xc6, 0xb4, 0x95, 0x75, 0x69, 0xbc, 0x60, + 0x84, 0x8b, 0x4c, 0x68, 0x99, 0x91, 0xba, 0x8a, 0x60, 0xce, 0x9d, 0x5e, + 0xae, 0x79, 0x49, 0x7f, 0x5c, 0x86, 0x97, 0x69, 0x85, 0x9c, 0x56, 0xd8, + 0xa3, 0xd3, 0xa2, 0x4c, 0xa8, 0x82, 0xa7, 0x7f, 0x9a, 0xb0, 0x61, 0x91, + 0x8f, 0x69, 0x91, 0x85, 0x39, 0x58, 0xb1, 0x6a, 0xcf, 0xd2, 0x7b, 0x97, + 0x5a, 0x7e, 0xa3, 0x75, 0xb7, 0x5c, 0x6b, 0x44, 0x83, 0x9d, 0x76, 0xd4, + 0xa6, 0x3d, 0xac, 0xb2, 0x81, 0x6f, 0x6c, 0x58, 0xb1, 0x57, 0x4d, 0xa7, + 0xa7, 0x48, 0xb7, 0x6b, 0x95, 0xad, 0x52, 0x2a, 0x68, 0x9e, 0x6e, 0x5c, + 0x7d, 0x75, 0x81, 0xb9, 0x80, 0x7a, 0xad, 0x71, 0x89, 0xb4, 0x86, 0x99, + 0x68, 0x70, 0x47, 0x62, 0x6a, 0xa4, 0x4a, 0x4a, 0x7b, 0x4e, 0x48, 0x55, + 0x67, 0x9b, 0x78, 0x77, 0x78, 0x90, 0x50, 0x89, 0xaf, 0x6a, 0x67, 0x7d, + 0x56, 0x3b, 0x6e, 0x46, 0x94, 0x63, 0x55, 0x81, 0xac, 0xa0, 0x9c, 0x42, + 0xad, 0xc8, 0xd5, 0x4f, 0x62, 0x64, 0xc9, 0xa9, 0xd3, 0x7f, 0x4d, 0x71, + 0x47, 0x50, 0x64, 0x70, 0x47, 0x88, 0xb1, 0x76, 0x3c, 0xa6, 0x7d, 0x87, + 0xa4, 0x51, 0xa4, 0xac, 0x5e, 0xa9, 0x3e, 0x72, 0x73, 0xaf, 0x5a, 0x80, + 0x83, 0x51, 0xbe, 0x68, 0xbd, 0x9f, 0x96, 0xa6, 0x7f, 0x6b, 0x56, 0x8f, + 0x9f, 0x9e, 0x68, 0x68, 0x40, 0x47, 0x8d, 0x7d, 0x5f, 0x46, 0x8f, 0xb5, + 0x8c, 0x8c, 0xbf, 0xb3, 0xa3, 0x71, 0x8d, 0xa8, 0xc3, 0x54, 0xb9, 0x95, + 0xa9, 0x3c, 0x3c, 0x67, 0xc4, 0x61, 0x9a, 0x99, 0x65, 0x90, 0xbc, 0xcb, + 0xa3, 0xb3, 0xc4, 0x5c, 0xb6, 0x96, 0x5a, 0xc3, 0x6d, 0xb7, 0xbf, 0x52, + 0x9f, 0x9f, 0x53, 0xa6, 0xa2, 0x5d, 0x79, 0xb2, 0x4c, 0xad, 0x9f, 0x9b, + 0xc6, 0xb2, 0xa0, 0x93, 0x74, 0x96, 0x86, 0x92, 0x65, 0x40, 0xad, 0xc0, + 0xbb, 0x63, 0x4c, 0x65, 0x7e, 0x52, 0xaf, 0xcd, 0xc0, 0x3c, 0x8d, 0x4d, + 0x93, 0xb0, 0x4f, 0x97, 0x89, 0x41, 0x89, 0x9b, 0x79, 0x61, 0x84, 0x9c, + 0xac, 0x57, 0x35, 0x62, 0xa4, 0x8b, 0x77, 0x65, 0x9b, 0x79, 0x84, 0x9d, + 0xc4, 0x8c, 0x74, 0x96, 0x7f, 0x7a, 0x97, 0x5d, 0x96, 0x89, 0x7d, 0x7e, + 0x57, 0x73, 0x93, 0x44, 0xa7, 0xb7, 0x67, 0xa2, 0x42, 0x48, 0x9e, 0x6c, + 0x33, 0x55, 0x78, 0xaa, 0x42, 0x6d, 0x4c, 0x4b, 0x57, 0x67, 0x8d, 0x97, + 0x47, 0xa7, 0xd0, 0x5c, 0xac, 0xbf, 0xd5, 0xbc, 0x85, 0xbb, 0x83, 0x97, + 0x70, 0x77, 0x34, 0xa3, 0x5d, 0x9a, 0xa7, 0x7c, 0x66, 0x52, 0x6d, 0xa0, + 0x95, 0x7e, 0x67, 0x90, 0x61, 0x41, 0xa6, 0x91, 0xa3, 0xb0, 0x79, 0x6b, + 0x98, 0x46, 0x84, 0x5f, 0x6b, 0x8e, 0x4e, 0x9b, 0xa3, 0x87, 0x86, 0x70, + 0xb0, 0xb3, 0xab, 0x8f, 0x6e, 0x39, 0x5b, 0x79, 0x65, 0x9d, 0x49, 0x3f, + 0x96, 0x43, 0xa2, 0x6f, 0xb8, 0x90, 0x3b, 0x82, 0x90, 0xc4, 0x97, 0x6f, + 0x5b, 0xa5, 0xc4, 0xb0, 0xa0, 0x88, 0xb2, 0x6c, 0xad, 0x3d, 0xb6, 0x56, + 0x47, 0x56, 0x93, 0x6f, 0x63, 0x80, 0x3b, 0x78, 0x8d, 0xbe, 0x65, 0xa0, + 0xc7, 0x5e, 0x50, 0xa2, 0xbd, 0x6f, 0x42, 0x48, 0x96, 0xbf, 0xb3, 0xab, + 0x6e, 0x77, 0x6e, 0x9f, 0x7f, 0x62, 0x62, 0x4f, 0x3f, 0x6e, 0x9e, 0x50, + 0x84, 0x90, 0x5a, 0x59, 0xcb, 0x87, 0x7f, 0x5a, 0x90, 0x81, 0x87, 0xa6, + 0x40, 0x75, 0x45, 0x7e, 0xbb, 0x41, 0x4d, 0x77, 0x5d, 0x76, 0x43, 0x51, + 0x9d, 0xce, 0x89, 0x6a, 0x9d, 0x84, 0xbe, 0x70, 0x2e, 0xa3, 0x3f, 0x3d, + 0x68, 0x4e, 0xc5, 0x9c, 
0x30, 0x9f, 0xb6, 0x84, 0x5e, 0x3b, 0xc3, 0x9e, + 0x63, 0x8c, 0x90, 0x9d, 0x83, 0xa8, 0x74, 0xe3, 0x4d, 0x53, 0x8e, 0x6b, + 0x56, 0x6f, 0xb6, 0x7d, 0x4e, 0xba, 0xa9, 0xa5, 0x7f, 0x80, 0x40, 0x72, + 0x87, 0x6a, 0x53, 0x6a, 0x6d, 0x73, 0x40, 0x8a, 0x23, 0x4d, 0x74, 0xbc, + 0x56, 0x69, 0x4b, 0x7a, 0xa0, 0x59, 0x85, 0x3a, 0x4d, 0x65, 0x41, 0x43, + 0x46, 0x56, 0x4a, 0x6f, 0x79, 0x5d, 0x60, 0x3d, 0x51, 0xb5, 0xa9, 0x4d, + 0x63, 0x92, 0xc3, 0xbb, 0x92, 0x8d, 0xa5, 0x45, 0xa3, 0x44, 0x87, 0x51, + 0xb4, 0x70, 0x4d, 0xc8, 0x99, 0x93, 0x49, 0x44, 0x64, 0xc7, 0x69, 0x50, + 0x8c, 0x3e, 0xab, 0x76, 0x47, 0xaf, 0xab, 0x5b, 0x51, 0x4e, 0x65, 0xaf, + 0x69, 0x56, 0x71, 0x7d, 0x91, 0x77, 0xb4, 0x4e, 0x5d, 0x47, 0x4f, 0x5b, + 0x4d, 0x55, 0x92, 0x8c, 0x5e, 0x71, 0x7b, 0xa2, 0x59, 0x60, 0xb0, 0x86, + 0x79, 0x72, 0xbb, 0xa4, 0x47, 0xb2, 0xb6, 0x5c, 0x66, 0x7a, 0xc2, 0x5c, + 0x55, 0x70, 0x78, 0xc6, 0x61, 0x9d, 0x44, 0x92, 0xb9, 0x67, 0x69, 0x5e, + 0x34, 0xbf, 0x44, 0x46, 0x9c, 0x4f, 0xbc, 0x5a, 0xae, 0x59, 0x64, 0xae, + 0x90, 0x4a, 0xc1, 0x8c, 0x65, 0xae, 0xb7, 0x77, 0x92, 0x5b, 0x3b, 0x8b, + 0x98, 0x54, 0xcb, 0x5f, 0x92, 0xa0, 0x4f, 0x92, 0x5b, 0x71, 0x9f, 0xb1, + 0x9a, 0x6f, 0x7c, 0x9e, 0x87, 0x64, 0x96, 0xaa, 0x83, 0x58, 0x8d, 0x47, + 0x5f, 0x3c, 0x90, 0xc9, 0x6c, 0x4b, 0xba, 0xbc, 0x5e, 0xa6, 0x7d, 0xd5, + 0x8b, 0x98, 0x2f, 0x79, 0xbe, 0xc7, 0x84, 0x4a, 0x83, 0x68, 0xa8, 0x8e, + 0x3d, 0x6c, 0x61, 0xa6, 0x88, 0x92, 0x9b, 0x98, 0xab, 0xbc, 0x84, 0x7b, + 0x73, 0xb7, 0xb6, 0x8a, 0x9e, 0x86, 0x9d, 0xa9, 0x87, 0x57, 0x71, 0x93, + 0x4c, 0x74, 0x4d, 0x83, 0x97, 0x6e, 0x7c, 0x94, 0xbf, 0x7e, 0x61, 0x77, + 0xcd, 0x47, 0x40, 0xb8, 0xcc, 0x8c, 0xb1, 0x69, 0xa8, 0x8e, 0x74, 0xa9, + 0xb9, 0x77, 0x9e, 0x73, 0x86, 0x89, 0x61, 0x98, 0x51, 0xb9, 0x76, 0xc4, + 0x4f, 0x7a, 0x85, 0xb9, 0x78, 0x4a, 0x3c, 0x5a, 0x9b, 0x9c, 0x9d, 0xbf, + 0x5e, 0x93, 0x58, 0xa2, 0x90, 0xb1, 0x3a, 0x94, 0xb3, 0x63, 0x5f, 0x78, + 0x6b, 0x30, 0x41, 0x84, 0x66, 0x62, 0x66, 0x76, 0xa3, 0x3c, 0xaa, 0x61, + 0x60, 0x41, 0x59, 0xa7, 0xab, 0xbe, 0xa7, 0x62, 0xc7, 0x55, 0x89, 0x67, + 0x96, 0xcb, 0x92, 0x6f, 0x49, 0x7f, 0xad, 0xab, 0x3e, 0x36, 0x6d, 0xcd, + 0x83, 0x9c, 0x79, 0x56, 0xa5, 0x85, 0x6a, 0x78, 0x96, 0x4f, 0x52, 0x93, + 0xa3, 0x6e, 0x56, 0x4f, 0x5a, 0x73, 0x75, 0x84, 0x46, 0x7b, 0x60, 0x7b, + 0x41, 0x73, 0x96, 0x9d, 0x47, 0x44, 0x7f, 0x92, 0x9a, 0x8e, 0x3c, 0xad, + 0xb2, 0xb5, 0x46, 0x39, 0x4c, 0xb6, 0x39, 0xae, 0x7e, 0xc7, 0xc1, 0x72, + 0x59, 0x49, 0x80, 0xa3, 0x60, 0x7c, 0x9e, 0xb6, 0x6e, 0x91, 0x49, 0x6d, + 0xa6, 0xb2, 0x5c, 0xad, 0x3e, 0x6b, 0x81, 0x65, 0x82, 0xa8, 0x9b, 0x93, + 0x77, 0x39, 0x44, 0x74, 0x93, 0x87, 0x7f, 0x68, 0xaf, 0xa9, 0x63, 0x7c, + 0x96, 0x6e, 0x86, 0x4b, 0x48, 0x99, 0x38, 0xaa, 0x7f, 0x5e, 0x68, 0x3c, + 0x92, 0x9c, 0x89, 0x8b, 0xb0, 0x5c, 0xa9, 0x5d, 0x54, 0x58, 0x6f, 0x8e, + 0xbf, 0x89, 0x86, 0x61, 0x89, 0x66, 0x8a, 0x6b, 0x78, 0xb8, 0x74, 0x51, + 0x3c, 0xa8, 0x45, 0x70, 0xb1, 0x9f, 0x52, 0x9c, 0x86, 0x60, 0x78, 0xbb, + 0x6a, 0x3c, 0x6e, 0x6b, 0x91, 0x5b, 0x8e, 0x95, 0x6c, 0x91, 0x53, 0xa5, + 0x3a, 0x56, 0x62, 0x32, 0x85, 0x98, 0x74, 0x70, 0x96, 0x93, 0x43, 0x8d, + 0x73, 0x49, 0x68, 0xbf, 0xc3, 0xba, 0x73, 0x64, 0x52, 0x78, 0x64, 0x49, + 0xac, 0xb1, 0x90, 0x51, 0x42, 0xb4, 0x9a, 0x9a, 0x60, 0xb3, 0x75, 0x61, + 0xb5, 0x7d, 0x7c, 0x4c, 0x75, 0x8a, 0xa7, 0xbd, 0x88, 0x5c, 0x97, 0x54, + 0x9d, 0x8d, 0x4f, 0x60, 0xa2, 0x5b, 0x72, 0x91, 0x97, 0x80, 0x8e, 0xa5, + 0xbb, 0x92, 0xa4, 0x4f, 0x83, 0x46, 0x4d, 0x5c, 0xad, 0x8e, 0x9b, 0x50, + 0xb9, 0x4a, 0x51, 0x45, 0x8f, 0x59, 0x34, 0x61, 0x5c, 0x9e, 0x9c, 0xc9, + 0x88, 0xa1, 0x42, 0x6e, 
0x7a, 0xa0, 0xae, 0x6d, 0xbd, 0x47, 0x84, 0x9f, + 0x5b, 0x9b, 0x58, 0x5e, 0xac, 0xa9, 0xb4, 0xad, 0xa3, 0xa2, 0xc4, 0x3f, + 0x58, 0x6c, 0x5d, 0x89, 0x54, 0x5f, 0xb9, 0x88, 0x78, 0x50, 0x79, 0x80, + 0x3e, 0x9c, 0x5c, 0xa2, 0x7e, 0x8a, 0xa8, 0x5b, 0x35, 0x9f, 0xaf, 0x89, + 0x7f, 0x98, 0x9d, 0xbc, 0x9c, 0x48, 0x95, 0xb5, 0xa2, 0x5e, 0x66, 0xc1, + 0x66, 0xb7, 0x7b, 0x93, 0x3d, 0x75, 0xb0, 0x61, 0x61, 0x79, 0x66, 0x4d, + 0x90, 0xc3, 0x52, 0x5b, 0x67, 0x5f, 0xa8, 0xad, 0x4e, 0x5f, 0x7e, 0xa1, + 0xa9, 0x6d, 0xb6, 0x57, 0x96, 0x3f, 0xaf, 0x4b, 0x6e, 0x6e, 0x44, 0xb9, + 0xc6, 0xb7, 0xa7, 0x90, 0x5c, 0xba, 0x6b, 0x4d, 0x63, 0x94, 0x96, 0x6e, + 0x4b, 0x8d, 0xae, 0x75, 0x74, 0x7d, 0x50, 0x47, 0x43, 0x90, 0xb4, 0x68, + 0x8c, 0x51, 0x72, 0xb3, 0x47, 0xb8, 0x56, 0x91, 0x78, 0x96, 0x65, 0x83, + 0x83, 0x8c, 0x5a, 0x42, 0x58, 0x5a, 0x6f, 0x68, 0x7d, 0x94, 0x59, 0x5a, + 0xb0, 0x47, 0x49, 0x6d, 0xbc, 0x9a, 0x6b, 0x53, 0x72, 0x41, 0x8e, 0xa7, + 0x51, 0x52, 0xbb, 0x80, 0x8d, 0x4d, 0x40, 0x54, 0x4c, 0x77, 0xae, 0x4a, + 0x7c, 0x94, 0x79, 0xa4, 0x69, 0xb5, 0x5e, 0xad, 0xa1, 0xb4, 0x55, 0x8b, + 0x9e, 0xc1, 0xba, 0x7c, 0xab, 0xb9, 0x62, 0x6b, 0x63, 0x38, 0x9f, 0x6a, + 0xae, 0x81, 0xb6, 0xaf, 0xa5, 0x9b, 0x6c, 0x75, 0x75, 0x48, 0x48, 0x77, + 0x8d, 0xae, 0x57, 0x9c, 0x8d, 0x47, 0x8c, 0x73, 0xaa, 0x7d, 0x86, 0x47, + 0xa4, 0xa5, 0x5f, 0xb3, 0x7a, 0xb0, 0xb4, 0x9d, 0x95, 0x73, 0xab, 0xb3, + 0x60, 0xa7, 0x46, 0x73, 0x66, 0x4e, 0x92, 0x4c, 0x9e, 0x4b, 0x76, 0x48, + 0xb6, 0xb8, 0x60, 0x53, 0x53, 0x95, 0x70, 0x77, 0x6f, 0x5f, 0x36, 0x5e, + 0x64, 0x4f, 0x6e, 0x6a, 0x98, 0x9a, 0x84, 0x6e, 0x7b, 0x69, 0x9d, 0xbb, + 0x68, 0x72, 0x49, 0x91, 0xbe, 0x5a, 0x82, 0xad, 0x79, 0x89, 0xc4, 0x58, + 0x4b, 0x68, 0x39, 0x50, 0xb6, 0x6b, 0x45, 0x5f, 0x73, 0x8d, 0x7f, 0xc6, + 0xcb, 0x4a, 0xb3, 0xb2, 0x67, 0x9a, 0xc6, 0x57, 0xb6, 0x8d, 0x55, 0x82, + 0x91, 0xba, 0x77, 0x5c, 0x6f, 0xa3, 0x93, 0x61, 0xa3, 0xb8, 0x50, 0x45, + 0x7e, 0xbd, 0x6a, 0x68, 0x79, 0x77, 0x67, 0x76, 0x6e, 0x6d, 0xa3, 0x6e, + 0x4b, 0xbc, 0x54, 0xac, 0xb7, 0xab, 0x8f, 0x45, 0x76, 0x8c, 0x48, 0x98, + 0x56, 0x8e, 0x8c, 0xa0, 0x50, 0x99, 0x56, 0xae, 0x74, 0x7b, 0x98, 0x7c, + 0x7d, 0xb2, 0x50, 0x87, 0x9d, 0x96, 0x5a, 0x80, 0x4f, 0x88, 0x95, 0xa7, + 0x83, 0x7c, 0x6e, 0xab, 0x9f, 0x42, 0xb0, 0x48, 0xa1, 0xab, 0xb8, 0xa6, + 0xb9, 0x5b, 0x4c, 0x42, 0x8a, 0x7d, 0x89, 0x44, 0x59, 0x56, 0xad, 0x9f, + 0x57, 0x99, 0xbe, 0xb0, 0x79, 0xab, 0x40, 0x76, 0x35, 0xa2, 0x7c, 0x55, + 0xc6, 0xa6, 0x74, 0x64, 0x96, 0xab, 0x6a, 0xb2, 0x93, 0xc9, 0xbe, 0x4c, + 0x96, 0x5f, 0x54, 0x8e, 0x50, 0x5c, 0x46, 0x4b, 0x87, 0xa3, 0x53, 0x70, + 0x60, 0xbb, 0x9d, 0x88, 0x54, 0xc3, 0x5d, 0x95, 0x45, 0xb0, 0x7a, 0x8b, + 0x9b, 0xbd, 0x85, 0x4b, 0x6a, 0x4f, 0x7f, 0xb9, 0x5b, 0x3d, 0x33, 0x97, + 0xae, 0xbc, 0xbc, 0xa1, 0x77, 0xb4, 0x6e, 0xb2, 0x6a, 0xa5, 0xad, 0x91, + 0x92, 0xb3, 0x95, 0x8c, 0xb4, 0xbc, 0xc9, 0xdb, 0xc9, 0xa5, 0x9a, 0x93, + 0xca, 0x92, 0xad, 0x54, 0x6a, 0xce, 0x70, 0x60, 0x6e, 0x67, 0xad, 0xba, + 0x4b, 0x4d, 0x5c, 0xb0, 0x86, 0x57, 0x9a, 0x4b, 0x98, 0x54, 0x59, 0x42, + 0x6e, 0x76, 0x68, 0xa7, 0x86, 0x5c, 0x9d, 0x6b, 0x41, 0x50, 0xb6, 0xbf, + 0x6a, 0x89, 0x8f, 0x7a, 0x69, 0x97, 0x59, 0x82, 0x41, 0x7c, 0x6d, 0xb1, + 0xcd, 0xbf, 0xc2, 0x90, 0x83, 0x98, 0xae, 0xab, 0xb2, 0x4f, 0xc8, 0xa8, + 0x8d, 0x6f, 0xa8, 0x75, 0x89, 0xa1, 0xaa, 0x8d, 0x40, 0x5f, 0x9b, 0x52, + 0xac, 0x70, 0x7d, 0x5f, 0x89, 0x46, 0x82, 0x67, 0x5a, 0xba, 0x4a, 0x85, + 0x8c, 0x7f, 0x49, 0x8c, 0xb3, 0x70, 0x84, 0x8a, 0x59, 0x65, 0x6f, 0xab, + 0x8d, 0x43, 0x5a, 0xa2, 0x46, 0xa6, 0x64, 0x97, 0x55, 0x40, 0x87, 0xbb, + 0xb8, 0x57, 0x4d, 0x92, 
0xb3, 0x9f, 0x9b, 0x5c, 0x80, 0x9a, 0xa2, 0x78, + 0x76, 0x5c, 0x97, 0xbc, 0x9c, 0x61, 0x8b, 0x7d, 0x58, 0x6d, 0xbb, 0x88, + 0x98, 0x76, 0x92, 0xa0, 0x6c, 0xa2, 0x4a, 0x84, 0x61, 0x4f, 0xc9, 0x9e, + 0xc7, 0x95, 0x89, 0x9a, 0x4a, 0x97, 0x66, 0x7b, 0xb8, 0x92, 0x88, 0x8d, + 0x73, 0x4f, 0x57, 0x66, 0x3d, 0x6a, 0xc1, 0xbe, 0x3c, 0xc0, 0x4d, 0x90, + 0x8a, 0x6a, 0xb6, 0x65, 0x5a, 0x40, 0x79, 0xb1, 0x3f, 0x86, 0x66, 0x83, + 0x53, 0xae, 0x9e, 0x56, 0x70, 0xc1, 0xaf, 0x78, 0x85, 0x89, 0x6f, 0x9c, + 0x5c, 0x92, 0x6d, 0x78, 0x55, 0xac, 0xab, 0x5d, 0xaa, 0xaa, 0xa1, 0xa1, + 0x59, 0xa7, 0x6a, 0x41, 0x4b, 0xbe, 0x92, 0x54, 0x3b, 0x35, 0xb7, 0x8f, + 0x8e, 0x93, 0x44, 0x63, 0x87, 0x5b, 0x4f, 0x87, 0x57, 0x7f, 0x6f, 0xa7, + 0x74, 0x5d, 0x48, 0x8c, 0x46, 0x99, 0x7f, 0xac, 0xac, 0x7b, 0x6a, 0x3a, + 0x3a, 0x67, 0xa2, 0x64, 0x94, 0x77, 0x6b, 0x7c, 0x93, 0x81, 0x94, 0x87, + 0x64, 0x9b, 0x6e, 0x83, 0xac, 0x5b, 0x79, 0x6e, 0xb7, 0xa2, 0x95, 0x6f, + 0xba, 0xe3, 0xa2, 0x55, 0x44, 0x47, 0xa8, 0x45, 0x94, 0x4c, 0x67, 0xad, + 0x95, 0xb5, 0x5a, 0xc9, 0x6d, 0xb8, 0xa3, 0x49, 0xa6, 0xa5, 0xa8, 0x53, + 0x41, 0x35, 0xa8, 0x3a, 0x56, 0x44, 0x93, 0x4e, 0x75, 0x49, 0x7b, 0x6a, + 0x91, 0x7d, 0x4c, 0x82, 0xb8, 0xb5, 0xb2, 0x5c, 0x5f, 0x6f, 0x9c, 0xa3, + 0x40, 0x45, 0x9b, 0x33, 0x3d, 0xad, 0xc0, 0xaf, 0x72, 0xba, 0x9f, 0x5c, + 0xa0, 0xcb, 0x62, 0x90, 0xa8, 0x82, 0x8d, 0x4e, 0xa7, 0xa8, 0x8f, 0xc7, + 0xbc, 0x60, 0x3f, 0x80, 0xb7, 0x76, 0x9a, 0x6b, 0x70, 0x9b, 0x72, 0x79, + 0x77, 0xac, 0x57, 0xb5, 0x54, 0x5d, 0x8a, 0xa0, 0x54, 0xb8, 0x6b, 0x8a, + 0xac, 0x4d, 0x45, 0x83, 0x8d, 0xa0, 0xbe, 0xaf, 0x6e, 0x5e, 0xb9, 0x87, + 0x63, 0x78, 0xb0, 0x74, 0xba, 0x93, 0x6a, 0xc5, 0xc6, 0x7c, 0x55, 0x5c, + 0xa4, 0x54, 0x80, 0x6d, 0xbb, 0xc4, 0x78, 0xb2, 0x72, 0x60, 0x6c, 0x61, + 0xa9, 0x98, 0x3f, 0x78, 0x9d, 0x49, 0x3b, 0xa7, 0xad, 0x85, 0x62, 0xad, + 0x91, 0x56, 0x62, 0xa1, 0x50, 0x64, 0x3c, 0x65, 0xa9, 0x9f, 0xc6, 0x9f, + 0xa2, 0xbe, 0x7a, 0x7d, 0xaf, 0x79, 0x42, 0x60, 0xbb, 0x60, 0x7d, 0x4c, + 0xbc, 0x48, 0xa7, 0x71, 0x87, 0x4b, 0xbb, 0x9a, 0x95, 0x96, 0x69, 0xc7, + 0x92, 0xc1, 0xc2, 0x5e, 0x44, 0xb6, 0x80, 0xc0, 0x82, 0x9e, 0xb6, 0xb1, + 0xbc, 0x69, 0x5d, 0x76, 0x93, 0x27, 0x58, 0x62, 0xad, 0x33, 0x95, 0x6d, + 0x5e, 0x56, 0x49, 0x4f, 0x49, 0xcd, 0xbd, 0x42, 0x4a, 0x68, 0x7d, 0xaa, + 0x89, 0x80, 0x79, 0x49, 0xa0, 0x98, 0x4c, 0xa4, 0x45, 0x29, 0x8c, 0x87, + 0x7c, 0x95, 0x79, 0xa2, 0x62, 0x65, 0x5d, 0x78, 0x6b, 0xbd, 0x3c, 0x46, + 0x61, 0x4d, 0x83, 0x29, 0x5f, 0x4b, 0x9c, 0x6c, 0x2b, 0x96, 0xbd, 0x8c, + 0xa5, 0x4a, 0xb7, 0x7d, 0x5b, 0x74, 0x34, 0x99, 0x72, 0xa4, 0x43, 0x60, + 0xa7, 0x7e, 0x71, 0x7a, 0xa1, 0x92, 0x4d, 0x74, 0xbf, 0xbc, 0xaa, 0x4d, + 0x9a, 0xb5, 0xa3, 0x45, 0x45, 0x83, 0x76, 0x68, 0x36, 0x83, 0x51, 0x88, + 0xa9, 0x8c, 0x9a, 0x96, 0x65, 0x3a, 0x40, 0x5b, 0x8c, 0x99, 0x6e, 0xa7, + 0xb5, 0x8c, 0x7f, 0x2f, 0x55, 0x74, 0x96, 0x56, 0x90, 0xba, 0x61, 0xb2, + 0x6e, 0x79, 0xb4, 0xae, 0x9b, 0x87, 0xb2, 0x8f, 0x3e, 0xa5, 0xbb, 0x96, + 0x87, 0x59, 0x91, 0x57, 0x71, 0x5d, 0x41, 0x6c, 0xba, 0x91, 0x78, 0xad, + 0x5c, 0xa5, 0x5e, 0xbe, 0x4d, 0xbb, 0x6f, 0x61, 0x67, 0x91, 0x8f, 0x7b, + 0x54, 0x33, 0x3f, 0x6e, 0xb1, 0x3e, 0x55, 0xb2, 0xb7, 0x96, 0x63, 0xa1, + 0xb1, 0xb3, 0x5c, 0x6b, 0x8f, 0x45, 0x4f, 0xb9, 0x70, 0x66, 0xb5, 0x6c, + 0xaa, 0x51, 0x8e, 0x7c, 0x83, 0x49, 0x73, 0x2c, 0x98, 0xa3, 0x83, 0x65, + 0xa8, 0x9f, 0xa7, 0x88, 0x81, 0x37, 0xb5, 0xaa, 0x5d, 0x81, 0x3b, 0xb3, + 0xe0, 0xcc, 0x3e, 0x87, 0xb4, 0x66, 0xa4, 0x86, 0x66, 0x97, 0x8e, 0xa0, + 0xb1, 0x92, 0xa8, 0x94, 0x74, 0xa6, 0x5a, 0x8c, 0xc6, 0x8c, 0x67, 0x91, + 0xa1, 0xc6, 0x92, 0x4c, 
0xb0, 0x97, 0xa0, 0x4c, 0x84, 0x73, 0xae, 0xd4, + 0x91, 0x7d, 0x74, 0xb7, 0x7c, 0x8a, 0x98, 0x4d, 0x9a, 0x78, 0x78, 0x4a, + 0xa8, 0xc3, 0xa4, 0x2f, 0xbc, 0x64, 0x89, 0x40, 0x58, 0x69, 0x5a, 0x67, + 0x40, 0x43, 0x5a, 0x6e, 0x5b, 0x83, 0x8d, 0x80, 0x3f, 0x78, 0x8f, 0xac, + 0x5f, 0xa9, 0x60, 0x94, 0xdb, 0xb3, 0x76, 0x9f, 0x6c, 0x75, 0x87, 0x58, + 0xb7, 0xca, 0x90, 0x63, 0x58, 0xc1, 0xcb, 0x60, 0x7d, 0xa7, 0x7a, 0x77, + 0x99, 0x9c, 0xac, 0xa1, 0x86, 0x95, 0x47, 0xa3, 0x7c, 0x38, 0x92, 0x91, + 0x71, 0xbe, 0x62, 0x6a, 0x69, 0xb3, 0x73, 0x96, 0x7c, 0xb3, 0xb4, 0x78, + 0xab, 0x41, 0x87, 0xa1, 0x9e, 0x44, 0xc9, 0xb9, 0xc5, 0x79, 0x59, 0x72, + 0xce, 0xb4, 0x9c, 0x96, 0x7e, 0xd1, 0xb1, 0x3f, 0xb0, 0x76, 0xcc, 0x8f, + 0x46, 0xa0, 0x94, 0x4f, 0x57, 0x78, 0x80, 0xbb, 0xb2, 0x6e, 0x58, 0x5d, + 0xcf, 0xb3, 0x62, 0x9e, 0x71, 0xcb, 0x89, 0x42, 0x8a, 0xbb, 0xb5, 0x7f, + 0x7f, 0x4c, 0x80, 0x59, 0x51, 0x44, 0x69, 0x7d, 0x89, 0x7f, 0x83, 0x45, + 0x7a, 0x9d, 0xb7, 0x40, 0xb4, 0xa7, 0x64, 0x5c, 0x66, 0xaf, 0x33, 0xb1, + 0x50, 0x92, 0x97, 0x4b, 0x96, 0x6b, 0x79, 0x59, 0x6d, 0x33, 0x46, 0x4e, + 0x53, 0x51, 0x2b, 0x9c, 0x7a, 0xa0, 0x2b, 0x4a, 0x3f, 0xa3, 0x5f, 0x46, + 0xb0, 0x31, 0x80, 0x63, 0x5f, 0xb4, 0x42, 0x6f, 0xbb, 0x57, 0x40, 0x6b, + 0x3a, 0x53, 0xb3, 0xa4, 0x3f, 0x4a, 0x7e, 0x4f, 0x82, 0x44, 0x97, 0x47, + 0x86, 0x83, 0x53, 0xb4, 0x6e, 0x4c, 0x98, 0x58, 0x80, 0x59, 0x71, 0x68, + 0x5e, 0x8e, 0x66, 0x50, 0xb1, 0x74, 0x51, 0x42, 0x79, 0x5e, 0x92, 0xbc, + 0x54, 0x9b, 0xad, 0xa5, 0x9a, 0xa6, 0x60, 0x67, 0x94, 0xa0, 0x66, 0x34, + 0x90, 0x6c, 0x46, 0x47, 0x64, 0x67, 0x77, 0x99, 0xae, 0x3a, 0x6f, 0x79, + 0x88, 0x70, 0x7b, 0x6a, 0x5a, 0x62, 0x83, 0x60, 0x9f, 0x67, 0x9a, 0x6c, + 0x91, 0xe0, 0xb7, 0x90, 0xbf, 0xd5, 0xb5, 0x9b, 0x63, 0x8f, 0x15, 0x34, + 0x58, 0x84, 0x46, 0x52, 0x76, 0x9b, 0xa3, 0x9d, 0x9e, 0x46, 0x96, 0xa9, + 0x57, 0x8f, 0x56, 0xc2, 0xad, 0xa8, 0xc9, 0x9b, 0x92, 0x73, 0x77, 0x96, + 0xd7, 0x9c, 0x54, 0x6c, 0xa2, 0x91, 0x90, 0x50, 0x3e, 0x88, 0x79, 0x6e, + 0x7c, 0x71, 0x4e, 0xc4, 0x95, 0x8c, 0xc7, 0xce, 0x9c, 0x90, 0x90, 0x7e, + 0x8b, 0x46, 0xab, 0x99, 0x39, 0x41, 0x19, 0x9c, 0x77, 0xc1, 0x4f, 0xa9, + 0xbf, 0x6f, 0x86, 0xa1, 0x83, 0x3d, 0x7a, 0xb1, 0x9f, 0x85, 0x46, 0x9f, + 0xa9, 0x4a, 0x34, 0xad, 0x7c, 0x91, 0xc9, 0xa3, 0x84, 0x6c, 0x50, 0x3d, + 0xa4, 0xa1, 0xbb, 0x5f, 0xb7, 0x80, 0x3f, 0x8f, 0x84, 0x6b, 0x8b, 0x69, + 0xb4, 0x42, 0x94, 0x5d, 0x3b, 0x50, 0x4f, 0x49, 0x6f, 0x7f, 0x77, 0x9d, + 0x41, 0x6b, 0x8d, 0x90, 0x9d, 0x9c, 0x76, 0xae, 0xa8, 0x84, 0x4e, 0xa6, + 0x64, 0xb1, 0x50, 0x6b, 0xad, 0x95, 0xa8, 0xce, 0x70, 0x88, 0x82, 0xb0, + 0xbc, 0xb9, 0x87, 0x8b, 0x89, 0x73, 0x9d, 0x5b, 0x57, 0x4b, 0xba, 0x48, + 0x5b, 0x76, 0x55, 0x57, 0x99, 0x7a, 0xaa, 0xaa, 0x80, 0x73, 0x85, 0x49, + 0x61, 0x50, 0x57, 0x44, 0x79, 0x54, 0x9b, 0x66, 0x5f, 0x44, 0xbd, 0x49, + 0x59, 0x6e, 0xaf, 0xd3, 0xaf, 0x55, 0x8c, 0x51, 0xbf, 0xa5, 0x5f, 0x3b, + 0x60, 0x7c, 0x5a, 0x91, 0x6c, 0x71, 0x51, 0x5f, 0x46, 0x58, 0xcd, 0x92, + 0x74, 0x76, 0x82, 0x97, 0x62, 0x9e, 0xba, 0x6e, 0xbe, 0xb5, 0x7d, 0x8b, + 0x6e, 0xa7, 0xa4, 0x56, 0x84, 0x60, 0x97, 0x45, 0x64, 0x78, 0xb0, 0x9b, + 0xa3, 0x3c, 0x33, 0x46, 0x79, 0x77, 0x5e, 0x72, 0x40, 0x8a, 0x57, 0x89, + 0xbe, 0x9a, 0x89, 0x66, 0x4e, 0x9f, 0x94, 0x6b, 0x52, 0xd9, 0xb0, 0xb8, + 0x6f, 0x66, 0x7e, 0x3b, 0x31, 0x97, 0x83, 0x63, 0x24, 0x4f, 0x52, 0xa9, + 0xa6, 0x6c, 0x8b, 0xc1, 0x5e, 0xa4, 0x58, 0xb1, 0x66, 0x59, 0x6e, 0xab, + 0x4c, 0x87, 0xb6, 0x59, 0x4f, 0x7e, 0x99, 0xb8, 0x76, 0xaa, 0xc7, 0x6b, + 0x86, 0xc8, 0xc2, 0x83, 0x6d, 0x5e, 0x4d, 0x81, 0x63, 0xbb, 0xb4, 0x90, + 0x96, 0x9d, 0xa7, 0x90, 
0x7f, 0xc0, 0x72, 0x74, 0x4b, 0x9c, 0x44, 0x55, + 0xaa, 0x84, 0x53, 0x8a, 0x5e, 0x89, 0xa3, 0x7a, 0x9c, 0x78, 0x3f, 0x42, + 0x84, 0x48, 0xa3, 0x65, 0x5b, 0x8c, 0x4b, 0x3c, 0x64, 0x4a, 0x9b, 0x62, + 0xb3, 0x84, 0xae, 0x57, 0xa3, 0x7a, 0x61, 0x99, 0xb6, 0x6b, 0x44, 0xb9, + 0x8b, 0x55, 0x8f, 0x3d, 0x7b, 0x6b, 0x6f, 0x3c, 0x7e, 0x92, 0x53, 0xb4, + 0x91, 0x73, 0xb8, 0xaf, 0xe1, 0x98, 0xba, 0x71, 0xbe, 0x6e, 0x9c, 0x51, + 0x7a, 0xcb, 0xce, 0xa3, 0xc4, 0x89, 0x9a, 0x55, 0x82, 0x3d, 0x79, 0x55, + 0x48, 0x82, 0x4e, 0x8f, 0x6d, 0xb1, 0x6c, 0xcf, 0x8e, 0x85, 0x51, 0xb1, + 0x5c, 0x39, 0x6b, 0x56, 0x95, 0x58, 0x7d, 0xc2, 0x47, 0xa6, 0xd7, 0x8b, + 0xb4, 0x5f, 0xc4, 0x9c, 0x73, 0x45, 0xa0, 0x83, 0x23, 0x99, 0x9a, 0xaf, + 0x9d, 0x99, 0x67, 0x46, 0xb2, 0xbd, 0x5e, 0x53, 0xc3, 0x76, 0x87, 0xb4, + 0x8d, 0x49, 0x4e, 0x75, 0x58, 0x9e, 0x6c, 0x5b, 0x4a, 0x9f, 0x66, 0x8a, + 0xa9, 0x50, 0x77, 0xa4, 0x50, 0x4f, 0x61, 0xa3, 0x9a, 0x7b, 0x88, 0x53, + 0xa8, 0x8d, 0x48, 0xba, 0x57, 0x8f, 0xac, 0x94, 0x5f, 0x52, 0x6f, 0x75, + 0x77, 0xb7, 0x94, 0x79, 0x9e, 0x80, 0x5c, 0x7f, 0x43, 0x4f, 0xb2, 0xa2, + 0xba, 0x82, 0xae, 0xa5, 0x9d, 0x4c, 0xa2, 0x91, 0x3a, 0xa4, 0xa3, 0x64, + 0x74, 0x66, 0xa3, 0x71, 0x64, 0x83, 0xa5, 0x59, 0xb6, 0x2d, 0x39, 0x66, + 0xb5, 0xa8, 0x4f, 0xa3, 0x48, 0x86, 0x65, 0xb0, 0x49, 0x62, 0x9d, 0x9d, + 0x6f, 0xc4, 0x66, 0x4f, 0x32, 0x86, 0xa2, 0xce, 0x38, 0x6e, 0x79, 0x8a, + 0x9f, 0x4e, 0x46, 0x3d, 0x9d, 0xc8, 0x54, 0x7f, 0x78, 0x57, 0x71, 0xd3, + 0x6e, 0xc3, 0x67, 0x4e, 0x95, 0xa7, 0xa8, 0x9b, 0x87, 0x9f, 0x84, 0x8e, + 0x69, 0x41, 0x3d, 0xaf, 0x8c, 0x68, 0xa5, 0xa7, 0xa3, 0xbb, 0x8e, 0xa2, + 0x57, 0x98, 0x7d, 0x85, 0xb1, 0x6f, 0xab, 0x6e, 0x3f, 0xab, 0x62, 0x63, + 0x87, 0x70, 0xa8, 0xa0, 0x34, 0xc0, 0x6a, 0xbc, 0x76, 0x58, 0x8c, 0x54, + 0x73, 0x62, 0xaa, 0x44, 0x71, 0xc5, 0x9f, 0x66, 0x45, 0xa8, 0x82, 0x85, + 0x9e, 0x3b, 0x82, 0xad, 0x7d, 0x67, 0x45, 0x33, 0x81, 0x3e, 0xba, 0x88, + 0x64, 0xae, 0x8b, 0x52, 0x88, 0xab, 0x85, 0x5d, 0x7d, 0xc6, 0x47, 0x58, + 0x8b, 0xba, 0x64, 0x5b, 0x61, 0x52, 0x92, 0x6b, 0xbf, 0x98, 0xa4, 0xb4, + 0x53, 0x60, 0x69, 0x42, 0xa9, 0x97, 0x4f, 0x82, 0xa3, 0x79, 0x61, 0x5b, + 0x74, 0x62, 0x75, 0xa7, 0x89, 0x6b, 0x89, 0x91, 0x73, 0x55, 0x55, 0x8e, + 0x71, 0xb6, 0x4b, 0xb1, 0x87, 0x9f, 0x5c, 0x96, 0x37, 0xa2, 0xce, 0xb5, + 0xb8, 0x6a, 0x81, 0x8c, 0x84, 0x5a, 0x4f, 0xb0, 0xc0, 0x78, 0x73, 0xab, + 0x8f, 0x83, 0x93, 0xba, 0x67, 0x4b, 0x4a, 0x65, 0x95, 0xa6, 0xc7, 0x7c, + 0x6c, 0x65, 0x70, 0xc1, 0x67, 0x4a, 0x3f, 0x5c, 0x4c, 0xbc, 0x83, 0x54, + 0x53, 0xab, 0xb7, 0x9b, 0x94, 0x50, 0x8d, 0xaf, 0x4f, 0xb2, 0x50, 0xad, + 0x8d, 0x6d, 0x53, 0x86, 0xa9, 0xba, 0x7d, 0xb9, 0x3a, 0xb8, 0xa7, 0xab, + 0x65, 0x7a, 0x67, 0x63, 0x85, 0xa9, 0x95, 0x83, 0xb1, 0x90, 0x4c, 0x5c, + 0x8a, 0xa0, 0x53, 0x6a, 0x96, 0x94, 0x7a, 0x73, 0x7e, 0xa0, 0x3a, 0x3f, + 0x5f, 0xba, 0xb2, 0x82, 0xb0, 0x75, 0xa2, 0x66, 0x8d, 0x45, 0x83, 0x40, + 0xad, 0x4e, 0xd6, 0xb5, 0x85, 0x78, 0xac, 0x89, 0x89, 0x48, 0x68, 0xac, + 0x8f, 0x58, 0xb1, 0x67, 0xa7, 0xa5, 0x98, 0x6a, 0x81, 0xc5, 0x95, 0xa7, + 0xb9, 0x7e, 0x6f, 0x7c, 0x87, 0x5e, 0x9a, 0x61, 0x5f, 0x66, 0x5c, 0x69, + 0x6f, 0x81, 0x5c, 0xab, 0x63, 0x5b, 0x45, 0x69, 0x7b, 0xa3, 0x5b, 0x33, + 0x43, 0xa9, 0x96, 0x76, 0xb7, 0xa5, 0xbc, 0xda, 0x78, 0x7f, 0x3e, 0x57, + 0x8e, 0xba, 0xae, 0x41, 0x93, 0x89, 0xc7, 0x55, 0x8d, 0xb7, 0xbe, 0xb9, + 0xbb, 0x8b, 0x59, 0x9c, 0x5b, 0xbd, 0xaf, 0xbe, 0x4c, 0x35, 0x4f, 0xb4, + 0x9f, 0x99, 0x61, 0x8e, 0x6b, 0x61, 0x96, 0x90, 0x8a, 0x8b, 0x91, 0x86, + 0x8b, 0x85, 0x55, 0x48, 0x9a, 0x90, 0x8a, 0xb1, 0x94, 0xa6, 0x8f, 0x4d, + 0x99, 0x6a, 0x40, 0x71, 
0x85, 0xba, 0xb2, 0xb9, 0x69, 0x43, 0xb2, 0x60, + 0x91, 0x8b, 0xa6, 0x73, 0xa7, 0x91, 0x8a, 0x54, 0x69, 0xa8, 0xcc, 0x94, + 0x94, 0x8e, 0x79, 0x6b, 0xab, 0xa9, 0x35, 0x90, 0x67, 0x7f, 0x90, 0x94, + 0x4f, 0x49, 0x4b, 0x72, 0x40, 0x69, 0x4a, 0x69, 0x7d, 0x6b, 0x71, 0x88, + 0x40, 0x93, 0x85, 0x3b, 0x4e, 0xb2, 0x4f, 0x73, 0x7a, 0xbb, 0xb4, 0x7b, + 0x97, 0x71, 0x88, 0x42, 0x8b, 0x96, 0x8a, 0x8c, 0x84, 0x8b, 0x46, 0x52, + 0xb2, 0x82, 0x96, 0xa1, 0x78, 0x67, 0x67, 0x9e, 0x4a, 0x6e, 0x58, 0x8f, + 0x8c, 0x99, 0xa6, 0x74, 0x51, 0x58, 0x90, 0x64, 0xa5, 0x80, 0x40, 0x6f, + 0xc1, 0xa6, 0x77, 0x50, 0x4f, 0xb3, 0x98, 0x76, 0x9d, 0x4d, 0xbf, 0x88, + 0x67, 0x99, 0xb7, 0x85, 0x70, 0xc2, 0x8e, 0xc4, 0x99, 0xa1, 0x5e, 0x36, + 0xb2, 0x7c, 0x8e, 0xac, 0x4c, 0x4a, 0x6c, 0x5f, 0xa9, 0x72, 0x83, 0x64, + 0x31, 0x35, 0x94, 0xbd, 0xaf, 0xb6, 0xb3, 0xcd, 0xb2, 0xb0, 0x9f, 0xa9, + 0x4e, 0x55, 0x6b, 0xbf, 0xa3, 0x5d, 0xac, 0xa5, 0x75, 0xba, 0x79, 0x86, + 0x67, 0x4e, 0x8e, 0x9e, 0x94, 0x66, 0x8c, 0xaa, 0xc0, 0xb9, 0x4e, 0x90, + 0x9e, 0x8e, 0xa8, 0x9d, 0x60, 0x89, 0x84, 0x3b, 0x6b, 0x58, 0x81, 0x53, + 0x82, 0x55, 0x31, 0xa3, 0x6f, 0x82, 0xb9, 0xb5, 0x78, 0x5c, 0x84, 0x66, + 0x66, 0x59, 0xca, 0xcc, 0xb3, 0x64, 0x67, 0xbb, 0x9d, 0x75, 0x7e, 0xa3, + 0xa8, 0x91, 0x65, 0xa6, 0x8a, 0xb0, 0xa4, 0x69, 0xb7, 0x72, 0x66, 0xa8, + 0xb5, 0xb3, 0x98, 0xb5, 0xa8, 0x59, 0x83, 0x4d, 0x87, 0x88, 0x3e, 0x67, + 0x7d, 0x96, 0xbf, 0x76, 0x9f, 0x70, 0x6a, 0x57, 0xb3, 0xa1, 0x3f, 0x51, + 0xaa, 0x83, 0xaa, 0x7e, 0x83, 0xa2, 0x6f, 0x88, 0xa7, 0xb6, 0x96, 0xb2, + 0x47, 0xc0, 0x7d, 0xb7, 0x9a, 0x9f, 0x72, 0x7d, 0xb6, 0xae, 0x69, 0x7a, + 0xb4, 0x53, 0xb3, 0x96, 0x49, 0x4f, 0xb7, 0x4c, 0x99, 0x63, 0x91, 0x9e, + 0x4c, 0xbf, 0x7c, 0x44, 0xa3, 0xaa, 0x87, 0x4f, 0xbf, 0x6b, 0x57, 0x47, + 0x8e, 0xbe, 0xa5, 0x58, 0x9d, 0xa8, 0x6d, 0x42, 0x51, 0x96, 0x75, 0x70, + 0x92, 0x72, 0x5a, 0xbe, 0x8e, 0x9d, 0xac, 0xac, 0x67, 0x56, 0x7d, 0xa9, + 0x48, 0xc3, 0x52, 0x54, 0x77, 0x71, 0xa0, 0x40, 0xb1, 0x59, 0x6d, 0x67, + 0x96, 0xa1, 0x52, 0x9f, 0x88, 0x4e, 0x53, 0x88, 0xb1, 0x58, 0x88, 0x94, + 0xb6, 0x73, 0x85, 0x8a, 0xb2, 0xa8, 0x75, 0x54, 0xc0, 0x68, 0x84, 0xb6, + 0x5c, 0x50, 0xb9, 0x54, 0x76, 0x99, 0x8c, 0x97, 0xb2, 0x6d, 0xb4, 0xab, + 0x4b, 0x5f, 0x9b, 0xb5, 0xbe, 0x65, 0xa8, 0x90, 0x9c, 0xbe, 0xb9, 0x8e, + 0xae, 0x68, 0xb5, 0xa1, 0x5f, 0xc8, 0x8b, 0x40, 0x9e, 0x86, 0x6b, 0x85, + 0x5f, 0x65, 0xb4, 0xb0, 0x60, 0x55, 0x3d, 0x53, 0x9f, 0xa7, 0xb7, 0x75, + 0x6c, 0x52, 0x54, 0x3f, 0xa1, 0x4a, 0x5e, 0xaf, 0x79, 0x65, 0xa5, 0xad, + 0x98, 0x87, 0x91, 0xac, 0x6a, 0x6f, 0x4f, 0x8b, 0xad, 0xa7, 0xbd, 0xa7, + 0xaf, 0x64, 0x78, 0x6a, 0x93, 0xac, 0x8d, 0x7c, 0x6d, 0xbb, 0xa0, 0x57, + 0xb3, 0x44, 0x9b, 0x49, 0x64, 0x97, 0x76, 0xb8, 0x9a, 0x53, 0x8f, 0x69, + 0x9d, 0x9b, 0x82, 0x6b, 0xb0, 0x47, 0xa9, 0x5e, 0x43, 0x9a, 0x55, 0x79, + 0xbf, 0x97, 0x57, 0xa1, 0x6b, 0x5c, 0x61, 0x77, 0x52, 0x81, 0x4b, 0x69, + 0x83, 0xab, 0x60, 0x89, 0x5f, 0x60, 0x78, 0x6d, 0xc3, 0x5e, 0xbe, 0xbf, + 0x9a, 0x56, 0xb0, 0x86, 0x61, 0x4c, 0x72, 0x83, 0xb7, 0x50, 0xb2, 0x56, + 0x99, 0x92, 0x47, 0x79, 0xab, 0xb8, 0x76, 0x9f, 0x48, 0xb7, 0x8b, 0x9f, + 0x9b, 0xa5, 0x50, 0x62, 0x67, 0x5e, 0x80, 0x98, 0xb8, 0xad, 0x91, 0x54, + 0xb2, 0xc3, 0x7f, 0xb7, 0x72, 0x43, 0xb6, 0x97, 0x41, 0x6c, 0x5f, 0x66, + 0x81, 0x61, 0x64, 0xab, 0x56, 0x70, 0x6e, 0x55, 0x5f, 0x47, 0x87, 0x43, + 0x6d, 0x5a, 0x50, 0x3e, 0x92, 0xb7, 0xa3, 0x57, 0x3e, 0x99, 0x42, 0x59, + 0x8f, 0x66, 0xa2, 0x5e, 0x63, 0x5a, 0x81, 0x65, 0xa0, 0x65, 0x39, 0x90, + 0xb7, 0x9d, 0xa6, 0x63, 0xa0, 0x85, 0xc2, 0xae, 0x85, 0x97, 0x76, 0x73, + 0x9d, 0x9c, 0x60, 0xc7, 
0x9b, 0x98, 0xb1, 0x9c, 0xaf, 0x66, 0x49, 0x44, + 0xba, 0x58, 0x67, 0xbc, 0xce, 0xc2, 0x74, 0xbb, 0x64, 0x83, 0x7c, 0x57, + 0x52, 0xb0, 0xbd, 0xa7, 0x55, 0x71, 0x72, 0x73, 0x9f, 0x5b, 0x4a, 0x7e, + 0x9d, 0x67, 0xbe, 0x9d, 0xb7, 0xcb, 0x94, 0x83, 0x61, 0x70, 0x4f, 0xac, + 0x55, 0x49, 0x2c, 0x4d, 0xa6, 0x3b, 0xa1, 0x9f, 0x87, 0x5c, 0x4e, 0x9c, + 0x90, 0x66, 0x7f, 0x54, 0x91, 0xab, 0x70, 0x2f, 0x41, 0xc6, 0xe0, 0xb3, + 0x71, 0x73, 0x50, 0xb0, 0x8a, 0xc2, 0xc0, 0xb6, 0xa6, 0x42, 0x6f, 0xcd, + 0x74, 0x66, 0xd1, 0xc2, 0xb4, 0x59, 0x87, 0xa2, 0x7a, 0x99, 0x53, 0x94, + 0x8a, 0x4d, 0xb9, 0xa0, 0x91, 0x86, 0x8c, 0x9e, 0xc3, 0xa8, 0x9d, 0x42, + 0x50, 0xa3, 0xc0, 0xc6, 0x7f, 0x9a, 0x74, 0x53, 0x50, 0x4c, 0x91, 0x69, + 0xc0, 0xb2, 0x40, 0xa4, 0x59, 0xce, 0x79, 0xb5, 0x9f, 0xa9, 0x7f, 0x6e, + 0xc6, 0x5a, 0xa7, 0x3f, 0x77, 0x84, 0x98, 0x4e, 0xb7, 0x4a, 0x9e, 0x4f, + 0x35, 0x90, 0x98, 0xce, 0xd2, 0x6c, 0x73, 0x6e, 0x3e, 0xc7, 0x75, 0x93, + 0x3c, 0x91, 0xaa, 0x84, 0x7b, 0xb6, 0x60, 0x6e, 0x89, 0x62, 0x42, 0xa1, + 0x7a, 0x8c, 0x88, 0x42, 0x70, 0x3e, 0x59, 0x58, 0x6c, 0x60, 0x54, 0xc2, + 0x82, 0xa2, 0x8c, 0xba, 0x9f, 0x56, 0x68, 0xcb, 0xb6, 0x8d, 0x6e, 0xa5, + 0x99, 0x6f, 0xd3, 0x89, 0x73, 0x69, 0x9b, 0x6e, 0x84, 0x88, 0x68, 0xb4, + 0x45, 0x8f, 0x47, 0xaf, 0x49, 0xa0, 0x40, 0x35, 0x77, 0x74, 0xb8, 0xa3, + 0x30, 0x76, 0xaf, 0xa5, 0x5f, 0xbc, 0x86, 0xa3, 0x5b, 0x70, 0x6c, 0xa4, + 0x5c, 0xc5, 0x50, 0x47, 0x50, 0x4f, 0xac, 0x45, 0x86, 0xa3, 0xa4, 0x9d, + 0xa6, 0xad, 0x66, 0xa2, 0x4b, 0x94, 0x70, 0xab, 0x86, 0x48, 0x3c, 0x3f, + 0x73, 0x6f, 0x86, 0x8e, 0xae, 0x6f, 0x7f, 0x90, 0x7e, 0x8e, 0x9a, 0x57, + 0x98, 0x88, 0x6e, 0x76, 0xaf, 0x59, 0x98, 0x9a, 0x72, 0x6a, 0xcc, 0x5d, + 0xa8, 0x68, 0x8b, 0xaf, 0x57, 0x73, 0x6a, 0xa4, 0x62, 0x6e, 0x54, 0x6a, + 0x8d, 0xad, 0x48, 0x73, 0x63, 0xd7, 0xd1, 0xa4, 0x73, 0x83, 0x45, 0xa8, + 0x98, 0x74, 0xd7, 0x79, 0x74, 0x8d, 0x5e, 0xc2, 0x5a, 0x55, 0x7a, 0x45, + 0x6f, 0x83, 0xa6, 0xb3, 0x4f, 0x3d, 0x4f, 0x6c, 0xad, 0x6f, 0x9f, 0x76, + 0x66, 0x7e, 0x79, 0x4a, 0x67, 0xc7, 0x65, 0x7f, 0x67, 0xa3, 0xa8, 0x91, + 0xa6, 0x5e, 0x4a, 0x4f, 0x84, 0x6e, 0xd6, 0xe2, 0x77, 0x61, 0x4e, 0x56, + 0x4e, 0xba, 0x91, 0x67, 0x49, 0xaf, 0x8e, 0xc0, 0x83, 0x92, 0xb2, 0x9d, + 0x67, 0x7d, 0x91, 0x7a, 0x70, 0x55, 0x7b, 0x30, 0xa7, 0x9e, 0x46, 0x86, + 0x7d, 0x78, 0xa8, 0x65, 0xae, 0xd1, 0x92, 0xb2, 0x91, 0xa7, 0x7f, 0x68, + 0x83, 0x5b, 0x50, 0x59, 0x82, 0x73, 0x99, 0x52, 0x7d, 0x7e, 0x75, 0xa4, + 0x68, 0x55, 0x67, 0xc2, 0xaa, 0x80, 0xa6, 0x5f, 0xa2, 0xb0, 0xb4, 0x5c, + 0xb3, 0xb1, 0xb5, 0x61, 0xca, 0x5d, 0x62, 0x74, 0xb0, 0x80, 0xb6, 0xd4, + 0x7d, 0x71, 0x73, 0x74, 0x98, 0xc3, 0xc7, 0x9d, 0x51, 0xa6, 0xa4, 0x77, + 0x87, 0xa7, 0x8e, 0x4b, 0x4b, 0xa9, 0x70, 0xc1, 0x60, 0xc9, 0xa4, 0x9e, + 0xb8, 0xdf, 0x70, 0x90, 0x68, 0x9f, 0x6c, 0x54, 0x9f, 0x61, 0xb1, 0x3f, + 0xa6, 0x73, 0x4a, 0x51, 0xc1, 0xac, 0xb1, 0x56, 0x61, 0x61, 0xae, 0x8b, + 0x6c, 0x4c, 0x42, 0x91, 0xc2, 0x42, 0x5b, 0x69, 0x97, 0x81, 0x72, 0xa9, + 0xa5, 0x60, 0xa1, 0x92, 0x37, 0xc6, 0x6d, 0x5e, 0x6a, 0x49, 0x94, 0x74, + 0x4c, 0x6b, 0x4a, 0x9e, 0x88, 0xb8, 0x86, 0x48, 0x7d, 0xa0, 0xaa, 0x4a, + 0x87, 0x50, 0xb6, 0x88, 0xc4, 0x74, 0xa9, 0x64, 0x8e, 0xb5, 0x55, 0x81, + 0xaf, 0x5c, 0x83, 0x64, 0x78, 0xa3, 0xa7, 0xbb, 0x86, 0x82, 0x79, 0x74, + 0x8f, 0x4b, 0x51, 0x73, 0x8f, 0x4e, 0xc1, 0x59, 0xa1, 0x8a, 0x96, 0x5f, + 0xc3, 0x9d, 0x8a, 0x54, 0x4b, 0x54, 0xb1, 0xbe, 0xb7, 0xb4, 0x27, 0x49, + 0x61, 0x69, 0xb1, 0x81, 0x32, 0x4b, 0x5d, 0x9e, 0xb0, 0x80, 0x3b, 0x84, + 0x96, 0x46, 0x67, 0x69, 0xb0, 0x98, 0xb1, 0xde, 0x68, 0xb0, 0x6b, 0xa3, + 0xf2, 0xc8, 0x61, 0x59, 
0x6e, 0x49, 0xd5, 0xd3, 0x70, 0xb0, 0x5e, 0x5c, + 0xa7, 0xb4, 0x6b, 0x90, 0xb6, 0x8a, 0x9c, 0xb1, 0x6a, 0x97, 0xc7, 0x8b, + 0x8a, 0x56, 0x3d, 0xbc, 0x6d, 0x4f, 0x74, 0x73, 0x8c, 0x62, 0xb1, 0x7d, + 0x55, 0x4e, 0x93, 0xa5, 0xb4, 0x56, 0x92, 0x99, 0x9d, 0x76, 0x3f, 0xb0, + 0x60, 0x80, 0x93, 0x77, 0x7c, 0xb1, 0xa1, 0x7f, 0xd8, 0x81, 0x9c, 0x5f, + 0xc0, 0xc7, 0x4a, 0x4c, 0x3d, 0x99, 0x78, 0xa2, 0x4c, 0x4a, 0x7d, 0xa9, + 0xc1, 0x8f, 0x3c, 0x66, 0x4b, 0x5d, 0xa9, 0xaa, 0x30, 0x84, 0x8e, 0x4b, + 0x90, 0xa0, 0x8c, 0x8c, 0x67, 0xa1, 0xc3, 0x83, 0x79, 0xa1, 0x72, 0x8e, + 0xab, 0x81, 0x45, 0x4c, 0x6f, 0x6d, 0x9f, 0x4a, 0xaf, 0x97, 0xb8, 0x3d, + 0x5b, 0xc5, 0xa0, 0x51, 0x8b, 0xa8, 0x5d, 0x73, 0xbc, 0x96, 0x5b, 0xb1, + 0x95, 0x81, 0x90, 0x61, 0x65, 0x81, 0xa7, 0x38, 0x4c, 0x63, 0x8b, 0xbc, + 0x6a, 0xb6, 0xa1, 0x90, 0x6b, 0x49, 0xc0, 0x32, 0x89, 0xb7, 0xb2, 0x8a, + 0xbd, 0x86, 0x47, 0x89, 0x85, 0x78, 0xb9, 0xa0, 0x47, 0xae, 0x8e, 0x90, + 0x8b, 0xa3, 0x92, 0x54, 0x68, 0xaa, 0xa5, 0x70, 0x7b, 0x8c, 0x64, 0x80, + 0x42, 0x6f, 0xb0, 0x7a, 0x82, 0x99, 0x4d, 0x99, 0x8f, 0xb5, 0x73, 0xbc, + 0xa5, 0xb0, 0x8f, 0x60, 0xbf, 0x42, 0x67, 0x7f, 0x5c, 0xa2, 0x68, 0x77, + 0x90, 0xbe, 0x46, 0x4f, 0x8e, 0xab, 0xa7, 0x8a, 0xcb, 0x93, 0x97, 0x83, + 0x91, 0xab, 0x6e, 0x72, 0xae, 0x4d, 0x72, 0x57, 0x7f, 0x97, 0x6c, 0x94, + 0x82, 0xb4, 0xbb, 0x65, 0x2e, 0x47, 0x9a, 0xc4, 0x88, 0x99, 0x4d, 0x5e, + 0x30, 0x88, 0x55, 0x5e, 0x47, 0x6e, 0xb2, 0x6a, 0xa4, 0x5d, 0x51, 0x97, + 0xb3, 0x7c, 0xa3, 0x98, 0xa0, 0x45, 0x9f, 0x33, 0x7d, 0xb4, 0xc2, 0x8e, + 0xac, 0xa9, 0xc2, 0x7a, 0xad, 0x89, 0x7f, 0x66, 0x5d, 0x61, 0x68, 0x58, + 0x7c, 0xc9, 0xae, 0xac, 0x93, 0x5a, 0x80, 0xc4, 0xdb, 0x95, 0x37, 0x76, + 0x77, 0x77, 0x43, 0xbb, 0xa5, 0x55, 0x91, 0x61, 0x83, 0xa6, 0x70, 0xac, + 0xb3, 0x4f, 0x3a, 0x44, 0x54, 0x7d, 0x3f, 0x3d, 0x78, 0x58, 0x64, 0x54, + 0xa0, 0x64, 0x9d, 0xa7, 0x68, 0x80, 0x6b, 0x71, 0x7b, 0xb5, 0x50, 0x5c, + 0x4c, 0xad, 0x57, 0x5c, 0xbc, 0x66, 0x53, 0x47, 0x5d, 0x41, 0x85, 0xa6, + 0x79, 0x8e, 0x9a, 0x69, 0x8e, 0xc0, 0x9e, 0xaa, 0xa1, 0x7c, 0x85, 0x9e, + 0x7f, 0x6e, 0x81, 0xb9, 0xa4, 0xc3, 0xae, 0x88, 0x66, 0x59, 0x7d, 0x94, + 0x5e, 0x6d, 0x47, 0x44, 0x5e, 0x94, 0x56, 0x58, 0x9e, 0x48, 0x44, 0xa0, + 0x7a, 0x45, 0x85, 0xa3, 0x83, 0x79, 0x62, 0x54, 0x39, 0x53, 0xb5, 0xa7, + 0x95, 0x76, 0x5a, 0xaf, 0xda, 0xb7, 0xd0, 0x85, 0x9e, 0x83, 0x8f, 0x8e, + 0x9e, 0x5a, 0xb1, 0x46, 0xb5, 0x54, 0x95, 0x58, 0x78, 0x96, 0x62, 0xa8, + 0xbe, 0x61, 0x8f, 0x37, 0xa3, 0x4c, 0x58, 0xa4, 0x92, 0x46, 0x9c, 0x76, + 0x86, 0x6c, 0x84, 0xb2, 0xb2, 0x87, 0x98, 0x59, 0xbb, 0xa5, 0x56, 0xc6, + 0x9f, 0x8f, 0x53, 0x82, 0x6a, 0x94, 0x85, 0x54, 0x9a, 0x8d, 0x82, 0x94, + 0x79, 0x53, 0x67, 0x78, 0x6c, 0x8b, 0x58, 0x5d, 0x3b, 0x37, 0x5e, 0x41, + 0xb2, 0xce, 0x55, 0x65, 0x8f, 0x5a, 0xa1, 0x76, 0xc9, 0xbc, 0x90, 0x54, + 0xc3, 0x77, 0x4e, 0xb3, 0x4d, 0x6f, 0xa4, 0x9a, 0xac, 0xb6, 0x76, 0x54, + 0x85, 0xaf, 0xb2, 0x9f, 0xa2, 0xac, 0x4e, 0x95, 0x89, 0x56, 0x97, 0x52, + 0x4c, 0x37, 0xb8, 0x88, 0x6e, 0x49, 0xab, 0x90, 0x6c, 0xa4, 0xac, 0x82, + 0xb3, 0xbb, 0x79, 0x4e, 0xb6, 0xac, 0x55, 0x97, 0x68, 0x5b, 0x4b, 0x74, + 0x74, 0xb4, 0x83, 0x68, 0x46, 0xad, 0xca, 0x9e, 0x6b, 0xb4, 0x51, 0x63, + 0x78, 0x55, 0xaa, 0x4e, 0xa0, 0x4b, 0xaf, 0x87, 0x6a, 0x36, 0xa9, 0x8f, + 0x55, 0x54, 0x6f, 0x48, 0x90, 0x4f, 0x5c, 0x3f, 0x34, 0xb9, 0xc4, 0xbb, + 0x45, 0xb9, 0x92, 0xa6, 0x72, 0x3b, 0x8b, 0xb6, 0xcd, 0x51, 0x46, 0x4c, + 0x83, 0x64, 0x50, 0x81, 0x7e, 0x75, 0x56, 0x4c, 0x54, 0x44, 0x7d, 0x53, + 0x4c, 0xae, 0x82, 0x53, 0xaa, 0x8e, 0x4c, 0x82, 0x80, 0x4a, 0x79, 0x68, + 0x62, 0x60, 0x4b, 0x6e, 
0x8e, 0xbe, 0x7f, 0x99, 0x64, 0x9b, 0xb2, 0x91, + 0xab, 0x8e, 0x6c, 0x48, 0xba, 0x7f, 0x9f, 0xa5, 0x6f, 0x93, 0x8b, 0xba, + 0x49, 0x41, 0xb3, 0x9e, 0x69, 0x72, 0xc1, 0x98, 0x9a, 0x59, 0x7d, 0xae, + 0x4d, 0x9e, 0x8b, 0x9f, 0x5b, 0x48, 0xa8, 0x8f, 0xaf, 0x68, 0xaa, 0xa7, + 0x88, 0xc3, 0xb8, 0x5b, 0x72, 0x3f, 0x42, 0xb6, 0x52, 0xc8, 0x96, 0xb8, + 0x5f, 0x96, 0x78, 0x73, 0x6c, 0xb6, 0x61, 0x6c, 0x42, 0x57, 0x8e, 0xc8, + 0xb3, 0x93, 0x81, 0x6a, 0x6b, 0x99, 0x68, 0x93, 0xac, 0x67, 0x7f, 0xa8, + 0x9a, 0x7d, 0x41, 0x57, 0x61, 0x43, 0x44, 0xad, 0x5d, 0xb6, 0xba, 0x92, + 0x6a, 0x4a, 0x82, 0x42, 0x9a, 0x7f, 0x6d, 0x73, 0x97, 0xa5, 0x8d, 0xab, + 0x94, 0x75, 0x55, 0xa5, 0xbb, 0xa4, 0x9f, 0x9f, 0xbf, 0xa2, 0x5e, 0x70, + 0xa0, 0xd2, 0x7e, 0xae, 0xbc, 0x77, 0x83, 0xc0, 0x58, 0x6e, 0xa2, 0x86, + 0xcf, 0xba, 0x62, 0x7c, 0x9e, 0x55, 0xaf, 0x9b, 0x57, 0xc0, 0xbb, 0x6c, + 0xae, 0x7c, 0xb4, 0x8d, 0x96, 0xaf, 0xab, 0x5d, 0x47, 0x46, 0x9e, 0x6d, + 0x55, 0x93, 0x8e, 0x7c, 0x8c, 0x57, 0x62, 0x4f, 0x79, 0xbc, 0xc4, 0x7b, + 0x59, 0x4e, 0x4c, 0x5e, 0x6a, 0x86, 0xb1, 0x4f, 0x89, 0xcd, 0x79, 0x91, + 0x6c, 0xbf, 0x68, 0xac, 0x8a, 0x57, 0x64, 0x9f, 0x8f, 0xa8, 0x99, 0x71, + 0x44, 0x6f, 0x4b, 0x64, 0xbb, 0xc1, 0x81, 0xa3, 0x64, 0xc0, 0xa9, 0xb2, + 0x74, 0xb0, 0x9f, 0x8d, 0xa3, 0x32, 0x8c, 0xba, 0x63, 0x49, 0x49, 0x66, + 0x69, 0x5b, 0x8b, 0xb1, 0x57, 0x4c, 0x48, 0xa2, 0xc1, 0x4e, 0x84, 0x74, + 0xb8, 0x61, 0x79, 0x56, 0x57, 0xbc, 0x85, 0x77, 0x4f, 0x3f, 0x4a, 0x8e, + 0x5b, 0xa6, 0x61, 0xba, 0x55, 0x64, 0x9a, 0x70, 0x90, 0x87, 0x67, 0x8a, + 0x5a, 0xbb, 0x76, 0x4d, 0xa0, 0x55, 0xba, 0xc0, 0x90, 0x4b, 0x54, 0x61, + 0xbb, 0xc2, 0xa6, 0x6a, 0x92, 0x91, 0x45, 0x56, 0x7d, 0xb8, 0xaf, 0xc2, + 0x56, 0xc3, 0x8f, 0x55, 0xb9, 0x77, 0x47, 0x76, 0xb1, 0x96, 0x95, 0x93, + 0x45, 0x90, 0x70, 0x70, 0x5c, 0x6b, 0xad, 0x62, 0x3f, 0x92, 0x45, 0x71, + 0x5d, 0x85, 0xb9, 0x55, 0x50, 0x8b, 0xb5, 0x9f, 0xbc, 0x7c, 0xb2, 0x6f, + 0x61, 0x75, 0xa3, 0x87, 0x4f, 0x9c, 0xb6, 0x69, 0x67, 0xb0, 0xa8, 0x56, + 0x7a, 0x81, 0x55, 0x47, 0x84, 0xb5, 0x8a, 0x7e, 0xb3, 0x69, 0xaa, 0x7e, + 0x5a, 0xb9, 0x42, 0x76, 0x74, 0xcb, 0x6b, 0x54, 0x4e, 0x64, 0x7a, 0x7a, + 0x71, 0x52, 0x55, 0xcb, 0x92, 0x8b, 0xaa, 0xa8, 0x6b, 0x7e, 0x94, 0x64, + 0x8f, 0x73, 0xb6, 0x83, 0x3d, 0x47, 0x7f, 0x6f, 0x52, 0x63, 0x49, 0x82, + 0xb8, 0x6a, 0xb5, 0x45, 0x9b, 0x4d, 0x59, 0xb3, 0x77, 0x91, 0x82, 0x79, + 0x63, 0x9a, 0xd5, 0x8f, 0x80, 0x6b, 0xb8, 0xb8, 0x47, 0x42, 0x86, 0x8f, + 0x88, 0x8b, 0x85, 0xa2, 0xc3, 0x73, 0x66, 0x47, 0x55, 0xc5, 0x9f, 0x99, + 0x30, 0xa7, 0x69, 0x97, 0x32, 0x65, 0x7d, 0x3e, 0x9e, 0xba, 0x8e, 0xae, + 0x63, 0x2a, 0xbb, 0x46, 0x94, 0x66, 0x68, 0x5e, 0xb3, 0x88, 0xcd, 0x44, + 0x44, 0x5f, 0x90, 0x9b, 0x64, 0xae, 0xba, 0x8d, 0x2d, 0xb8, 0x8b, 0x6f, + 0x6b, 0x7b, 0x5b, 0x67, 0xa3, 0x7b, 0x2a, 0x4c, 0xcf, 0x9b, 0x99, 0x36, + 0x75, 0x8b, 0x93, 0x8e, 0x88, 0x9f, 0x7d, 0x80, 0x4b, 0x9b, 0x7c, 0x66, + 0xac, 0xc2, 0x6c, 0x80, 0x99, 0x8d, 0x8d, 0x42, 0x5b, 0x71, 0xad, 0x52, + 0x4f, 0x5b, 0xb4, 0x56, 0xab, 0x41, 0x9d, 0x9d, 0x43, 0x4d, 0x41, 0x78, + 0x87, 0x99, 0x9b, 0x63, 0x5e, 0x46, 0x73, 0x4e, 0x65, 0x65, 0x50, 0x46, + 0x5d, 0x76, 0x9d, 0xae, 0x74, 0x58, 0xa2, 0x65, 0x8f, 0x8c, 0x7b, 0x7f, + 0x92, 0x73, 0xc1, 0xac, 0x75, 0xa9, 0x80, 0x71, 0x64, 0x64, 0x7d, 0xc8, + 0x6b, 0x62, 0xa3, 0x5a, 0x7e, 0x9e, 0x8a, 0x99, 0x89, 0x83, 0x8d, 0x6f, + 0x4c, 0x9f, 0x4a, 0xa3, 0xae, 0x84, 0xbc, 0xa4, 0x8b, 0x94, 0x3b, 0x86, + 0x9d, 0xb1, 0x55, 0x33, 0x6c, 0xc7, 0x71, 0x96, 0x4b, 0x3b, 0x70, 0xd1, + 0x73, 0x51, 0xcb, 0x60, 0xa2, 0x9f, 0x8e, 0xc9, 0x9c, 0x86, 0x55, 0x68, + 0xcf, 0xde, 0x97, 0x8b, 
0x6d, 0xba, 0xd9, 0x5b, 0x6c, 0xb6, 0x90, 0x59, + 0xa0, 0xaa, 0x34, 0x8e, 0x64, 0x89, 0x5f, 0x56, 0xb7, 0x99, 0x5d, 0x7d, + 0x59, 0xbb, 0x8b, 0x8c, 0x48, 0x49, 0x9b, 0x82, 0xb6, 0x8e, 0x56, 0xd1, + 0xa4, 0x6d, 0x9b, 0xb1, 0xce, 0x6d, 0x8c, 0x42, 0x7c, 0x84, 0xc1, 0xc5, + 0x96, 0x59, 0xc7, 0x69, 0x7c, 0x4f, 0xa6, 0x64, 0xa2, 0x49, 0x60, 0x35, + 0xa5, 0x8b, 0x73, 0x95, 0x9f, 0xa1, 0x5d, 0x98, 0x3b, 0x5c, 0x87, 0x38, + 0x62, 0x91, 0x5f, 0x4f, 0x8b, 0xa3, 0x95, 0x7b, 0x2b, 0x4e, 0x88, 0x89, + 0x9c, 0x83, 0x41, 0xa9, 0x62, 0x79, 0x62, 0xd6, 0xbe, 0x9e, 0x4f, 0xbf, + 0xca, 0xcf, 0x81, 0x87, 0x9e, 0x9e, 0xab, 0x99, 0x99, 0x51, 0x71, 0xc8, + 0x9f, 0x91, 0x6b, 0xc5, 0x9b, 0x57, 0x58, 0x6f, 0x91, 0x52, 0x63, 0x54, + 0x6d, 0x57, 0x5a, 0x4a, 0x9a, 0x8b, 0x75, 0x9d, 0x51, 0xa2, 0x95, 0x53, + 0xa7, 0x3f, 0xa6, 0x5c, 0x3d, 0x8a, 0xc0, 0x99, 0x8e, 0x90, 0x5d, 0xca, + 0xcd, 0x62, 0x88, 0x6e, 0xf8, 0x64, 0xa4, 0x8b, 0xb3, 0xae, 0xc4, 0x6a, + 0x4e, 0x9a, 0x93, 0xaf, 0xa4, 0xb1, 0x60, 0x7b, 0x50, 0x61, 0x9b, 0xd2, + 0xce, 0x7a, 0x3a, 0x70, 0x8e, 0x72, 0x44, 0x33, 0xc5, 0xcf, 0xb5, 0x8a, + 0x27, 0xb3, 0x8e, 0x77, 0x8a, 0x7c, 0x6d, 0x3d, 0x94, 0x3d, 0xa7, 0x93, + 0xb3, 0x93, 0x7c, 0x6a, 0xb7, 0x6a, 0xa4, 0xb8, 0x3d, 0x6f, 0x77, 0x45, + 0x76, 0x59, 0x70, 0x9f, 0x7f, 0xa3, 0xb6, 0x6f, 0x55, 0x55, 0x75, 0x86, + 0x75, 0x62, 0x48, 0xba, 0xad, 0x7c, 0x3b, 0x4a, 0x58, 0x80, 0x89, 0x5f, + 0x98, 0x70, 0x7b, 0x42, 0x3f, 0xb6, 0x93, 0xa2, 0x5e, 0x45, 0x72, 0x76, + 0xc1, 0x80, 0x43, 0x6e, 0xa2, 0x48, 0x54, 0xae, 0x3e, 0x9c, 0x4b, 0x6b, + 0x99, 0x57, 0x94, 0x8e, 0xb9, 0x3b, 0x2e, 0x6b, 0x61, 0xb8, 0xb8, 0x95, + 0x31, 0x56, 0x72, 0xa5, 0x65, 0x59, 0x69, 0xa0, 0xa4, 0x9f, 0x8f, 0x92, + 0x91, 0xaa, 0xab, 0xc7, 0x60, 0x65, 0xb8, 0x87, 0x9d, 0xbe, 0x69, 0x48, + 0x88, 0xad, 0x3d, 0xa6, 0x78, 0x4b, 0x9b, 0x65, 0x6f, 0x3a, 0xa2, 0xb2, + 0x3b, 0x94, 0x68, 0x7e, 0xc4, 0x59, 0x63, 0x96, 0x4a, 0x78, 0x69, 0x53, + 0x99, 0x7d, 0xad, 0x5a, 0x81, 0x6c, 0xaf, 0x7f, 0xb6, 0xa6, 0x8b, 0x68, + 0x5d, 0x4e, 0xc4, 0x6c, 0x55, 0x8b, 0xb8, 0x60, 0x6b, 0x58, 0x88, 0x63, + 0x44, 0x34, 0x4c, 0x6b, 0xb3, 0x42, 0x8f, 0x99, 0xac, 0xb2, 0x99, 0x97, + 0xae, 0x70, 0xd0, 0xb5, 0x84, 0x9f, 0x3b, 0xb3, 0xc4, 0x91, 0x30, 0x5d, + 0x71, 0x79, 0x7e, 0x99, 0x8e, 0x55, 0x54, 0xb4, 0x48, 0x4b, 0x86, 0x80, + 0x55, 0xa5, 0xb3, 0x81, 0xb4, 0xa1, 0x55, 0x6f, 0x6a, 0x85, 0x99, 0xae, + 0xc6, 0x53, 0x46, 0x7f, 0x56, 0x81, 0xa9, 0x9d, 0x65, 0xbb, 0x8f, 0xa8, + 0x91, 0x87, 0x49, 0x8c, 0x70, 0x83, 0x9c, 0x95, 0x55, 0x51, 0x70, 0x79, + 0x3e, 0x5c, 0x9b, 0xa8, 0x58, 0x85, 0x6b, 0x38, 0xa3, 0xbd, 0x6b, 0x39, + 0x60, 0xad, 0xae, 0x7d, 0x80, 0xa5, 0x67, 0x7d, 0x80, 0xb1, 0x49, 0xa0, + 0x67, 0x66, 0xb6, 0x99, 0x82, 0x4f, 0x86, 0xc6, 0x9a, 0x83, 0x62, 0x88, + 0x97, 0xa6, 0x80, 0xa9, 0x98, 0xbb, 0x97, 0x8d, 0x6d, 0x6c, 0x9f, 0x9a, + 0xa2, 0x9e, 0x45, 0x69, 0x99, 0x80, 0x5f, 0xc5, 0xc2, 0x63, 0x69, 0x40, + 0x54, 0xb3, 0x4f, 0x85, 0xad, 0x88, 0x67, 0x3a, 0x55, 0x43, 0x69, 0x69, + 0xb3, 0x3f, 0xad, 0x56, 0xa8, 0x99, 0x65, 0x93, 0x6e, 0xbb, 0x82, 0x9b, + 0xb1, 0x82, 0x5e, 0xac, 0x81, 0xaf, 0xb5, 0x62, 0x7b, 0x55, 0x9c, 0x96, + 0xba, 0xb2, 0xae, 0xba, 0x4c, 0xa0, 0xb4, 0x80, 0xc9, 0x9c, 0x4a, 0x99, + 0x59, 0x65, 0x7d, 0x9f, 0x82, 0x89, 0x59, 0x9c, 0x87, 0x52, 0x5c, 0xb2, + 0x43, 0x7b, 0x51, 0x86, 0x37, 0x8e, 0x65, 0x3f, 0x5b, 0x80, 0x88, 0x8e, + 0xb5, 0x50, 0x66, 0xb2, 0xa1, 0x89, 0x73, 0xae, 0x9a, 0xab, 0x63, 0x62, + 0xbb, 0xc3, 0xbe, 0xae, 0x8a, 0x91, 0x5c, 0x79, 0x8f, 0x4b, 0x83, 0x31, + 0x96, 0x97, 0xa0, 0x73, 0x3e, 0x36, 0x97, 0x64, 0x87, 0xb5, 0xb2, 0x9c, + 0x3e, 0x53, 0x6a, 0x74, 
0x61, 0x67, 0xbf, 0x52, 0x5f, 0x85, 0x9e, 0x87, + 0x7e, 0x84, 0x39, 0x97, 0x9a, 0x51, 0x72, 0x55, 0x86, 0x43, 0xb7, 0xae, + 0x69, 0xa9, 0xbc, 0xa5, 0xbf, 0xa4, 0xbc, 0xa1, 0xae, 0x51, 0x86, 0x8b, + 0x97, 0xb1, 0x65, 0x55, 0x5a, 0x83, 0x62, 0x5c, 0x77, 0x9d, 0x4c, 0x9f, + 0x2e, 0x66, 0x51, 0xa5, 0x6d, 0x4f, 0x7c, 0x45, 0xad, 0x77, 0xb4, 0xba, + 0x9c, 0x8d, 0xb0, 0xd0, 0xa2, 0x93, 0x85, 0x57, 0x8f, 0xa6, 0x43, 0x57, + 0x8b, 0xc8, 0x6a, 0xad, 0x5d, 0x55, 0x9c, 0x76, 0x5c, 0x8d, 0x90, 0x3e, + 0x5f, 0x79, 0x53, 0x78, 0x80, 0x93, 0xa9, 0x2e, 0x47, 0xb5, 0x3f, 0x90, + 0x37, 0xb0, 0x9a, 0x6d, 0x83, 0x74, 0x71, 0x83, 0x51, 0xb6, 0x8b, 0x2f, + 0x88, 0x5d, 0x9f, 0x57, 0xb0, 0x56, 0x64, 0xaa, 0xb2, 0x5f, 0x80, 0xa2, + 0x7c, 0x60, 0xae, 0x69, 0xc0, 0x4c, 0x8e, 0x51, 0x70, 0xb8, 0x9a, 0xaf, + 0xb9, 0x67, 0xae, 0xa6, 0x74, 0x87, 0xa6, 0x6c, 0x5b, 0x51, 0x45, 0x8b, + 0xb3, 0x73, 0x46, 0x42, 0x40, 0x8f, 0xcb, 0x9b, 0x82, 0x92, 0xb5, 0xaf, + 0x53, 0xa2, 0xa9, 0x4a, 0x41, 0x7b, 0xb7, 0x57, 0x50, 0xc5, 0xb8, 0x90, + 0x69, 0x52, 0x47, 0xb2, 0x4d, 0x92, 0x52, 0x91, 0xa7, 0x3d, 0x63, 0x55, + 0x50, 0x80, 0xb3, 0x78, 0xc6, 0x43, 0x8f, 0xa1, 0x69, 0xac, 0x88, 0x9f, + 0x86, 0x73, 0x9a, 0x3f, 0x55, 0xa6, 0x74, 0x67, 0x8c, 0x74, 0xb6, 0x3d, + 0x74, 0xaf, 0xa7, 0x96, 0x4b, 0x38, 0x87, 0xb2, 0x8e, 0x86, 0x63, 0x46, + 0xb8, 0x4a, 0x78, 0x8d, 0x85, 0x51, 0x4b, 0xbc, 0xc1, 0x75, 0xb4, 0xb9, + 0x41, 0x6f, 0xa5, 0x4b, 0x5c, 0x99, 0x4c, 0x72, 0xbf, 0xc2, 0xfc, 0xe2, + 0x7a, 0xb3, 0x7c, 0x42, 0x30, 0x8e, 0xe, 0x56, 0x11, 0x94, 0x6c, 0x34, + 0x55, 0x70, 0x4b, 0x98, 0xa0, 0x83, 0xab, 0x42, 0x4b, 0x9c, 0x73, 0x91, + 0xcd, 0x4c, 0x92, 0xdc, 0xe8, 0x9c, 0x68, 0x7a, 0xbf, 0xbb, 0x87, 0x61, + 0xac, 0x56, 0x81, 0xb5, 0x47, 0x63, 0x56, 0x64, 0xbc, 0xac, 0x97, 0x4e, + 0xcc, 0xac, 0xba, 0xc7, 0x7d, 0x78, 0x91, 0x8f, 0x70, 0x30, 0x50, 0x68, + 0x73, 0x89, 0x53, 0x53, 0x70, 0xa6, 0x47, 0x7e, 0x87, 0x83, 0x9c, 0x4f, + 0x4f, 0x79, 0x54, 0xac, 0x73, 0x7d, 0x45, 0x3b, 0x2e, 0x47, 0x54, 0x55, + 0x94, 0x97, 0x76, 0x60, 0x9d, 0x5f, 0xce, 0x79, 0xa2, 0x63, 0x50, 0x9e, + 0x5d, 0x77, 0x64, 0xa7, 0x6a, 0xba, 0x5c, 0x81, 0xb7, 0x86, 0x66, 0x3e, + 0x82, 0x71, 0x94, 0x91, 0x2e, 0x1b, 0x85, 0x68, 0x47, 0x97, 0x53, 0x8e, + 0xb5, 0xb7, 0x66, 0x2c, 0x37, 0x42, 0x40, 0x42, 0x8d, 0x90, 0x57, 0x74, + 0xb4, 0x8b, 0x90, 0x7f, 0x5d, 0x5b, 0xa5, 0xa7, 0x45, 0x55, 0xa8, 0x44, + 0x62, 0x50, 0x51, 0x5f, 0xa4, 0xb0, 0x87, 0x99, 0x9c, 0x9e, 0x9b, 0x39, + 0x49, 0x69, 0xc4, 0x56, 0x4a, 0xab, 0xca, 0x6f, 0x7b, 0x75, 0xa4, 0xc3, + 0x4f, 0xb7, 0xb4, 0x48, 0x6d, 0x43, 0xbc, 0x42, 0x8c, 0x62, 0x72, 0x92, + 0x62, 0x6f, 0x52, 0x47, 0x21, 0x94, 0x7a, 0x87, 0x7f, 0x96, 0x59, 0x4b, + 0x53, 0x8d, 0x83, 0x45, 0x40, 0xbf, 0xb4, 0xca, 0xce, 0x75, 0xe0, 0xd4, + 0x7d, 0xb1, 0x85, 0x4c, 0x99, 0x54, 0x49, 0x65, 0x31, 0x9d, 0x9b, 0x6a, + 0x2f, 0xb5, 0xbb, 0x4f, 0x7b, 0xb4, 0x52, 0x94, 0x84, 0x6b, 0x8d, 0x5e, + 0x77, 0x69, 0xa6, 0x55, 0x60, 0x87, 0x9a, 0x7c, 0x9e, 0xa7, 0x68, 0x4a, + 0x7a, 0x5d, 0xb2, 0x58, 0x44, 0x82, 0x9d, 0x73, 0xd6, 0x4a, 0x88, 0x82, + 0x1b, 0x5e, 0x2b, 0x8e, 0x44, 0x76, 0x98, 0x82, 0xb3, 0x76, 0xb7, 0xcb, + 0xbb, 0x67, 0xd2, 0x7c, 0x66, 0x9b, 0x90, 0x76, 0x90, 0x5e, 0x63, 0x42, + 0x64, 0x9e, 0x63, 0xa2, 0xa0, 0xc5, 0x82, 0x71, 0x85, 0x4b, 0xa0, 0xbd, + 0x4a, 0x53, 0x8a, 0xb2, 0x4c, 0x94, 0xc0, 0xca, 0xc0, 0xc3, 0xf0, 0xc8, + 0xb1, 0x82, 0x32, 0x77, 0x51, 0x44, 0x6e, 0x2a, 0x97, 0x75, 0x4b, 0x8f, + 0x5c, 0x9e, 0xc7, 0x48, 0x8c, 0xad, 0x5d, 0x64, 0x9e, 0x41, 0x2b, 0x38, + 0x2f, 0x5b, 0x60, 0x2c, 0x5c, 0x61, 0xb2, 0x82, 0xbc, 0xc5, 0x7c, 0x5c, + 0xb0, 0x6e, 0x55, 0x9b, 
0x6e, 0x90, 0x9e, 0x7a, 0x8d, 0x54, 0x9d, 0x76, + 0x68, 0x26, 0x52, 0x5f, 0xb2, 0xb1, 0x78, 0x7f, 0x98, 0xac, 0x96, 0xec, + 0x9d, 0xa8, 0xe2, 0x74, 0x75, 0x72, 0x56, 0x70, 0xbc, 0x6f, 0xcc, 0x9f, + 0xa6, 0xb1, 0x44, 0x66, 0x88, 0x42, 0x85, 0xa4, 0xa8, 0x9b, 0xcf, 0x95, + 0xb1, 0xb1, 0x76, 0xb0, 0xc7, 0x86, 0xa0, 0x4b, 0x76, 0x34, 0xa2, 0x82, + 0xc2, 0x95, 0x92, 0xd5, 0x9a, 0x9e, 0xe5, 0x71, 0x9f, 0x83, 0x90, 0x55, + 0x47, 0x47, 0x80, 0x7a, 0x6c, 0x3e, 0x63, 0x93, 0xb6, 0xb9, 0x81, 0x7f, + 0xa1, 0xd5, 0xc6, 0xe6, 0x7d, 0xbf, 0xc6, 0xaa, 0x8e, 0x55, 0x53, 0x3a, + 0x9d, 0x95, 0x63, 0x85, 0x97, 0x3a, 0x3e, 0x4c, 0x8a, 0x47, 0x71, 0x60, + 0x53, 0x45, 0xa8, 0x46, 0x7c, 0xb4, 0x87, 0x60, 0xcd, 0xaf, 0x5d, 0x62, + 0xb0, 0xad, 0x92, 0x6c, 0x83, 0x45, 0x9e, 0x43, 0xba, 0x7d, 0x78, 0x4c, + 0xa0, 0xad, 0x97, 0x9a, 0xc4, 0x9c, 0xa7, 0xbc, 0x62, 0xbc, 0x7f, 0x7a, + 0x60, 0xcc, 0x61, 0x53, 0x55, 0xa9, 0x6a, 0x73, 0x6f, 0x86, 0x42, 0x75, + 0xba, 0xab, 0x5b, 0xbe, 0xd1, 0x9a, 0x92, 0xa8, 0xcd, 0x59, 0x6b, 0x88, + 0x6b, 0xa1, 0x7d, 0x58, 0x10, 0x65, 0x54, 0x45, 0x56, 0x23, 0xad, 0x5a, + 0x63, 0x8c, 0x3a, 0x81, 0x50, 0xba, 0x54, 0x69, 0xa2, 0x67, 0x79, 0x72, + 0xa9, 0x8e, 0xa5, 0x8f, 0xc7, 0x6a, 0x69, 0x9a, 0x3c, 0x9b, 0x47, 0x45, + 0x69, 0x6e, 0x44, 0x7a, 0x94, 0x54, 0x67, 0xb7, 0xe3, 0xab, 0xc5, 0xbd, + 0xb0, 0x5e, 0xb5, 0x47, 0x4b, 0x5d, 0x97, 0x83, 0x22, 0x5b, 0x5c, 0x69, + 0x27, 0x4c, 0xaf, 0x37, 0x61, 0x83, 0x9a, 0x3f, 0x53, 0x3f, 0x60, 0x8e, + 0x93, 0x5e, 0x2f, 0x9e, 0x2f, 0x77, 0xac, 0x5b, 0xbb, 0x4d, 0x9b, 0x8d, + 0xc4, 0xbf, 0xb4, 0x91, 0x9b, 0x88, 0x7b, 0x82, 0xaf, 0x6f, 0xc4, 0xb9, + 0x6d, 0x9e, 0x69, 0xb4, 0x6b, 0x91, 0xa9, 0x5f, 0x42, 0x77, 0xa8, 0x69, + 0x6a, 0x3b, 0x35, 0x4a, 0x9d, 0x49, 0x7c, 0x6b, 0x86, 0xa7, 0x4e, 0xb8, + 0x79, 0x58, 0x92, 0x5a, 0xb2, 0x70, 0xa0, 0x78, 0x96, 0x5c, 0x85, 0x4a, + 0x7c, 0xb5, 0x94, 0x69, 0x68, 0xa4, 0xb3, 0x9d, 0x57, 0x81, 0xab, 0x51, + 0x70, 0xa0, 0xbc, 0xae, 0x9d, 0x3d, 0x95, 0x7f, 0x62, 0x69, 0x8a, 0x3b, + 0x90, 0x5b, 0xb9, 0xc3, 0x68, 0x64, 0xa3, 0x58, 0x41, 0x9d, 0x95, 0x9b, + 0x49, 0x71, 0x47, 0x7a, 0x9c, 0x9d, 0x63, 0xa1, 0x3f, 0xb5, 0xad, 0x9e, + 0x5f, 0x9e, 0x88, 0x98, 0x85, 0x61, 0x9f, 0xb6, 0x58, 0x92, 0xbb, 0x90, + 0x61, 0xc4, 0xc2, 0x54, 0x98, 0xa8, 0x76, 0xac, 0x87, 0x8d, 0xa8, 0x4e, + 0x49, 0x8c, 0x42, 0x67, 0x97, 0x71, 0x70, 0x8a, 0x6f, 0x36, 0x45, 0x94, + 0x93, 0x52, 0x5b, 0x9a, 0xa1, 0x74, 0x78, 0x50, 0x66, 0x8e, 0x8b, 0xba, + 0x8a, 0xa6, 0x84, 0x5b, 0x4f, 0x78, 0x90, 0x90, 0x75, 0x66, 0x5a, 0x9a, + 0x68, 0xb4, 0xa9, 0xb8, 0xaa, 0x9e, 0x8c, 0x2f, 0x7f, 0x2c, 0x3f, 0x29, + 0xaf, 0x65, 0x9e, 0xa2, 0x78, 0x81, 0x89, 0xc7, 0xd6, 0x7d, 0xad, 0xad, + 0x9c, 0x83, 0x2f, 0x53, 0x81, 0x95, 0x93, 0xb6, 0x90, 0xb4, 0xbe, 0xa9, + 0x74, 0x90, 0xc0, 0xc3, 0x5b, 0xc2, 0xab, 0x82, 0x78, 0x67, 0x83, 0xab, + 0x55, 0x92, 0x75, 0x93, 0xbf, 0x62, 0x52, 0xac, 0x9b, 0x6c, 0x46, 0x7f, + 0x32, 0x9d, 0x70, 0x7c, 0x5d, 0x59, 0x37, 0x68, 0x46, 0xb1, 0x73, 0x4c, + 0x76, 0x99, 0x9c, 0x52, 0xa7, 0x81, 0x9d, 0x75, 0x87, 0x82, 0x99, 0x7a, + 0x74, 0x83, 0x86, 0x4b, 0x59, 0x4a, 0x75, 0xdc, 0xaf, 0x80, 0x82, 0x6c, + 0x64, 0x88, 0x6f, 0x50, 0xa5, 0x94, 0x53, 0x45, 0x32, 0x47, 0x35, 0x8b, + 0xab, 0xa4, 0xa9, 0x81, 0x6b, 0x5e, 0xcf, 0xbe, 0xdc, 0x89, 0xd7, 0xdf, + 0xc9, 0xbd, 0x8b, 0x82, 0x72, 0x88, 0x81, 0x94, 0xa1, 0x8f, 0x90, 0x69, + 0x5f, 0x9c, 0xa2, 0x39, 0x5c, 0x94, 0x69, 0xa2, 0x6e, 0x55, 0x6f, 0xc0, + 0x6f, 0x86, 0x5a, 0x84, 0x8b, 0x6d, 0x80, 0x67, 0x7c, 0xa5, 0xb1, 0x7c, + 0x76, 0x55, 0x53, 0xb2, 0x6c, 0xd6, 0x69, 0x58, 0xaa, 0x83, 0x7e, 0x2c, + 0xae, 0x9c, 0x49, 0x70, 
0x92, 0x58, 0x88, 0xb4, 0x84, 0x5d, 0xa9, 0xcb, + 0xa3, 0x75, 0xae, 0xa1, 0xb6, 0x60, 0x71, 0xae, 0x7c, 0x44, 0x84, 0x4a, + 0x9c, 0x3c, 0x44, 0x4f, 0x82, 0xb8, 0xae, 0x58, 0x64, 0x99, 0xa1, 0x40, + 0x3a, 0x47, 0x49, 0xce, 0x4c, 0xb4, 0x88, 0xc2, 0x5e, 0x5e, 0x7b, 0xa8, + 0x70, 0xa3, 0xaf, 0xb6, 0x40, 0x8c, 0x95, 0x7d, 0x4b, 0x6f, 0xa4, 0xa7, + 0x61, 0x4f, 0x5c, 0x4d, 0xdb, 0x8e, 0xc5, 0x78, 0x95, 0x6a, 0x73, 0x70, + 0x88, 0x96, 0x5d, 0x54, 0x6f, 0xa9, 0x44, 0x70, 0xb8, 0xad, 0x9f, 0xae, + 0xb3, 0x77, 0x4a, 0xda, 0x47, 0xa4, 0x75, 0x9f, 0xa9, 0x57, 0x1b, 0x3b, + 0x3d, 0xb6, 0x31, 0xa, 0x25, 0x33, 0x44, 0x6a, 0x6e, 0x94, 0x8f, 0x40, + 0x34, 0x5c, 0x74, 0x87, 0x78, 0x97, 0xb8, 0x55, 0x6b, 0x54, 0x96, 0xae, + 0xc0, 0x9b, 0x7e, 0xb1, 0x71, 0x91, 0x59, 0xa1, 0x43, 0x5b, 0x46, 0x3d, + 0x84, 0x5e, 0x9b, 0x9f, 0x74, 0xa3, 0xa8, 0x54, 0x71, 0x84, 0x7e, 0x93, + 0x33, 0x65, 0x39, 0x78, 0x9c, 0x4f, 0x22, 0x8b, 0x5c, 0x57, 0x4f, 0x79, + 0x7c, 0x81, 0x86, 0xa4, 0x79, 0xcb, 0xb9, 0x79, 0x78, 0x4a, 0x37, 0x81, + 0x36, 0x5e, 0x91, 0x52, 0x38, 0x80, 0xa0, 0xab, 0x98, 0xab, 0x58, 0x71, + 0xb1, 0x89, 0xb0, 0xb5, 0x56, 0x71, 0xa6, 0xa0, 0xbf, 0xa6, 0x86, 0x32, + 0x42, 0x78, 0x60, 0x85, 0x9a, 0x5d, 0x45, 0x42, 0x9e, 0x9e, 0x7b, 0x8c, + 0x51, 0x42, 0x6a, 0xb9, 0x4e, 0x84, 0xb2, 0x9b, 0x6c, 0x4b, 0x69, 0x75, + 0x4c, 0x77, 0x63, 0xa0, 0xa4, 0x4d, 0x38, 0x64, 0x52, 0xba, 0x4b, 0x46, + 0x40, 0xb9, 0x8b, 0x65, 0x4b, 0x5c, 0x80, 0x4e, 0x66, 0x6c, 0x6d, 0x6a, + 0x41, 0x6c, 0xb0, 0x39, 0x5a, 0x72, 0x87, 0x27, 0x68, 0x52, 0x93, 0x82, + 0x6a, 0xa3, 0x7a, 0x81, 0x49, 0xad, 0xb9, 0x5e, 0x8b, 0xad, 0x9d, 0xa5, + 0x6e, 0xa5, 0xa7, 0x37, 0xb0, 0x3c, 0x9c, 0x5f, 0x68, 0xae, 0x33, 0x40, + 0x58, 0x3b, 0x7a, 0x9f, 0x4c, 0x65, 0x69, 0x9b, 0x64, 0x53, 0x35, 0x35, + 0x88, 0x63, 0x54, 0xc2, 0x49, 0x8c, 0x4d, 0x49, 0x5e, 0x42, 0xac, 0x91, + 0x93, 0x6d, 0x4a, 0xad, 0x6a, 0x3c, 0xba, 0xa7, 0x5e, 0x8a, 0x91, 0x61, + 0x49, 0xc4, 0x4a, 0xb1, 0xac, 0x60, 0x65, 0x41, 0xaf, 0x36, 0x7a, 0x96, + 0x68, 0x3f, 0x68, 0x6c, 0xc5, 0x70, 0x61, 0x63, 0x73, 0x64, 0x6c, 0xbc, + 0xae, 0xb6, 0x85, 0x61, 0x41, 0x46, 0x3c, 0x81, 0x40, 0x47, 0x41, 0x49, + 0x15, 0xca, 0x86, 0xd9, 0x63, 0x7f, 0xa1, 0xcc, 0xa8, 0x7a, 0x5d, 0x51, + 0x61, 0x9a, 0x8c, 0x7e, 0x8c, 0x40, 0x65, 0x65, 0x5a, 0x3d, 0x64, 0xa5, + 0x38, 0x77, 0x88, 0xac, 0x54, 0x3b, 0x4c, 0xac, 0x54, 0x4d, 0xb2, 0xa8, + 0x93, 0x66, 0x42, 0x84, 0x87, 0xb1, 0xcc, 0x6c, 0x9e, 0x5b, 0x26, 0x85, + 0x5b, 0x56, 0x50, 0x9a, 0x71, 0x7e, 0x87, 0xa7, 0x48, 0x3e, 0x8f, 0x95, + 0x7c, 0x9f, 0x4f, 0x81, 0xb7, 0x46, 0x6c, 0xa5, 0x4f, 0x86, 0x4d, 0x5f, + 0xc1, 0xa6, 0xb7, 0x2e, 0x9b, 0xb0, 0xa5, 0xac, 0xb5, 0xc4, 0x8e, 0x8b, + 0x8a, 0x8e, 0x69, 0x5d, 0x91, 0x5d, 0x93, 0xb2, 0x80, 0x52, 0x96, 0x5a, + 0x8d, 0xa5, 0xbe, 0x71, 0x65, 0xa3, 0xc0, 0x7b, 0x76, 0xa4, 0xe4, 0x5e, + 0x5e, 0x9f, 0x66, 0x85, 0xc4, 0x78, 0x66, 0x5f, 0x41, 0x64, 0x4a, 0x4a, + 0x93, 0x7d, 0x48, 0xa6, 0x2f, 0x49, 0x88, 0x99, 0x6e, 0x85, 0x74, 0xb0, + 0x4b, 0x3f, 0xb4, 0x43, 0x4b, 0xb1, 0x3d, 0x91, 0x69, 0x59, 0x84, 0x82, + 0xc8, 0x7e, 0x8a, 0xc5, 0x5d, 0x6c, 0x99, 0x30, 0x30, 0x84, 0x5f, 0x97, + 0xa5, 0x4c, 0x2c, 0x99, 0x53, 0x97, 0x3b, 0xa8, 0x86, 0xad, 0x81, 0xa4, + 0x68, 0xb4, 0x8e, 0x32, 0x3f, 0x57, 0x6b, 0x69, 0x4e, 0x44, 0x71, 0x7a, + 0x95, 0x47, 0xa0, 0x68, 0xc2, 0x85, 0x6b, 0x52, 0x9c, 0xa6, 0x4c, 0xb4, + 0x88, 0x9d, 0xad, 0x87, 0xb0, 0xb2, 0x42, 0x58, 0x4c, 0x5c, 0xaf, 0x5c, + 0x93, 0x79, 0x9e, 0x99, 0x98, 0x90, 0xad, 0xbc, 0xb4, 0x57, 0x7b, 0x69, + 0x99, 0xa0, 0x55, 0x9e, 0x94, 0xc9, 0xa8, 0x83, 0x59, 0x66, 0x75, 0x5c, + 0x92, 0x74, 0xb3, 0x40, 
0x82, 0x3b, 0x32, 0x8c, 0x6a, 0xba, 0x96, 0x84, + 0xc6, 0x4d, 0x4e, 0x9e, 0x85, 0xc7, 0x58, 0x2a, 0x49, 0x55, 0x9c, 0x7d, + 0x86, 0x49, 0x4d, 0x26, 0x4b, 0x53, 0x51, 0xb4, 0x65, 0x59, 0x4a, 0x95, + 0x33, 0xa9, 0x6e, 0x68, 0x8c, 0xaa, 0x79, 0x65, 0xb6, 0xc6, 0xe1, 0xdc, + 0x8a, 0x57, 0xbd, 0x4f, 0x64, 0x34, 0x51, 0x6b, 0x3c, 0x8d, 0x79, 0x96, + 0x73, 0x69, 0x70, 0x98, 0xbb, 0x61, 0xb2, 0x78, 0x77, 0x51, 0xa2, 0x7f, + 0x77, 0x4a, 0x29, 0x4d, 0x25, 0x39, 0x39, 0x33, 0x98, 0xb3, 0x32, 0x9d, + 0x62, 0x51, 0x59, 0x88, 0x80, 0x3d, 0x42, 0x9f, 0x50, 0x5f, 0x8d, 0x36, + 0x3b, 0x4f, 0x7f, 0x76, 0x8f, 0xcc, 0xba, 0xb2, 0x62, 0x8b, 0xd0, 0x65, + 0xb6, 0x52, 0x90, 0x66, 0xb6, 0xb2, 0x7a, 0x91, 0x7c, 0x73, 0x59, 0x51, + 0x66, 0x88, 0xbf, 0x38, 0x3b, 0xa4, 0x39, 0x97, 0x87, 0xaf, 0x70, 0x79, + 0x31, 0x94, 0x64, 0xb5, 0x75, 0x6a, 0x7e, 0x52, 0x78, 0x9b, 0x91, 0x69, + 0x7a, 0x5f, 0x88, 0x90, 0xac, 0x9a, 0x72, 0xad, 0x81, 0xc5, 0xc7, 0xa7, + 0xaa, 0x87, 0x66, 0x67, 0x7a, 0x75, 0x77, 0x89, 0x55, 0x49, 0x88, 0x8e, + 0xa4, 0x47, 0x47, 0x40, 0x91, 0x4d, 0x48, 0xac, 0x5d, 0xb1, 0xab, 0x91, + 0xc6, 0xc7, 0x62, 0x76, 0x8f, 0xa8, 0x66, 0x49, 0x69, 0xbf, 0x79, 0x7a, + 0x93, 0xb7, 0x7f, 0x54, 0xb3, 0xa4, 0x4a, 0x65, 0x43, 0x69, 0x3a, 0xab, + 0x3f, 0xa7, 0x8b, 0xb4, 0xb3, 0x4a, 0x95, 0x95, 0x37, 0xb2, 0xc4, 0x76, + 0x55, 0xa5, 0x58, 0x47, 0x5d, 0xaa, 0xc0, 0x5a, 0xae, 0x79, 0x33, 0x89, + 0x90, 0x8d, 0x5e, 0x62, 0x74, 0x53, 0x88, 0x78, 0x46, 0xb4, 0x64, 0xb5, + 0x95, 0x71, 0x71, 0x64, 0xaf, 0x6f, 0x9f, 0x40, 0x8b, 0x55, 0xb8, 0x90, + 0xbd, 0x9d, 0x8a, 0x80, 0x8a, 0x9c, 0x31, 0x64, 0x45, 0x8e, 0xce, 0x5e, + 0x42, 0xb0, 0x67, 0x4a, 0x7f, 0x7a, 0x53, 0x6c, 0x5e, 0x98, 0x65, 0xb0, + 0x7a, 0x4f, 0x87, 0xbd, 0xe4, 0xb3, 0x74, 0x55, 0x55, 0x47, 0x63, 0x56, + 0x7d, 0x5c, 0x60, 0x53, 0x89, 0x52, 0xa4, 0x7c, 0x63, 0x98, 0xd7, 0x76, + 0x67, 0x9d, 0x63, 0xbe, 0x83, 0xaa, 0x4d, 0x4d, 0x3f, 0x4f, 0x63, 0x5f, + 0x4d, 0x94, 0x64, 0xa6, 0xc4, 0x97, 0x48, 0x71, 0xa1, 0x41, 0x83, 0x6a, + 0xac, 0x6c, 0xb9, 0x4d, 0x62, 0x86, 0xa6, 0xcc, 0x92, 0x4f, 0x4d, 0x9e, + 0xad, 0x7a, 0x75, 0xac, 0x43, 0x75, 0x4f, 0x45, 0x88, 0x5a, 0x6d, 0xb9, + 0x58, 0xae, 0xa9, 0x78, 0x86, 0xae, 0x85, 0x55, 0x8f, 0xac, 0x50, 0xba, + 0x4b, 0x89, 0x55, 0x88, 0xd0, 0xad, 0xa0, 0xad, 0x79, 0x9b, 0x96, 0x58, + 0xb0, 0xc8, 0xbe, 0xc4, 0xab, 0xc6, 0x90, 0x9f, 0x5d, 0x72, 0x84, 0x39, + 0x41, 0x6b, 0x63, 0x6f, 0x88, 0xae, 0x74, 0x37, 0x58, 0x4a, 0x94, 0x47, + 0x8a, 0x46, 0x86, 0x59, 0xa1, 0x80, 0x78, 0x51, 0xcd, 0xbc, 0x8a, 0x52, + 0x86, 0x73, 0x8a, 0x74, 0x80, 0x7a, 0x61, 0x6f, 0x8e, 0xb7, 0x51, 0xaa, + 0xc2, 0x90, 0x8e, 0x45, 0xa9, 0xa5, 0x4e, 0xae, 0x65, 0xa4, 0x41, 0x68, + 0x5e, 0x43, 0x6c, 0xad, 0x4d, 0x99, 0x79, 0x5d, 0x97, 0x6b, 0xb9, 0x60, + 0xaa, 0x53, 0x61, 0x4c, 0x34, 0x3c, 0x36, 0x59, 0x76, 0x85, 0xa2, 0xa0, + 0xca, 0xa8, 0x3f, 0x7d, 0x7a, 0x59, 0x86, 0x92, 0x99, 0x6f, 0xa2, 0x61, + 0xad, 0x70, 0xc1, 0x9b, 0xb2, 0x53, 0xac, 0xa4, 0x9f, 0xb6, 0x79, 0x9b, + 0x65, 0x71, 0x43, 0x81, 0x49, 0x74, 0x76, 0x2d, 0x5d, 0x88, 0x95, 0x9b, + 0xac, 0xa2, 0xce, 0xc9, 0x7f, 0x89, 0x5f, 0x54, 0x70, 0x8b, 0x96, 0x51, + 0x8d, 0x39, 0x91, 0x3e, 0xcc, 0xc1, 0x82, 0x93, 0x90, 0x45, 0x89, 0x6a, + 0x3d, 0xb7, 0xac, 0x67, 0xba, 0x47, 0x4f, 0x4c, 0xb0, 0x40, 0x87, 0x56, + 0xad, 0xbc, 0x4f, 0xbf, 0xa0, 0x47, 0x5c, 0x9d, 0xa1, 0x99, 0x42, 0xb4, + 0x87, 0xad, 0x65, 0x89, 0xba, 0x4e, 0x9c, 0xa3, 0xb7, 0x8a, 0xa4, 0x9c, + 0xad, 0x41, 0x94, 0x85, 0x43, 0xb7, 0x9a, 0xc3, 0xb8, 0x9b, 0x4e, 0x9c, + 0x93, 0x5b, 0x58, 0x5d, 0x86, 0x6e, 0x49, 0xae, 0x9b, 0x3e, 0xa4, 0xa7, + 0x95, 0x9e, 0x38, 0x5c, 
0x4d, 0x60, 0xad, 0x95, 0x3d, 0x70, 0xaa, 0xa1, + 0x9f, 0x61, 0xb0, 0x47, 0xb9, 0x9d, 0x42, 0x6a, 0x5f, 0xba, 0x8a, 0xbf, + 0x7e, 0x3d, 0xb8, 0x54, 0x62, 0x8e, 0xba, 0x9c, 0xa9, 0xa3, 0xbe, 0x9b, + 0xb3, 0x7c, 0x9b, 0x3b, 0xa6, 0x8f, 0x52, 0x4f, 0x3d, 0xb8, 0x67, 0x8f, + 0xa6, 0x42, 0x53, 0x7b, 0x3d, 0x70, 0x92, 0x84, 0xbf, 0x4c, 0x83, 0xca, + 0xad, 0x9f, 0xb4, 0x6a, 0x67, 0x52, 0x8b, 0xae, 0x9e, 0x5a, 0xa4, 0x63, + 0xc7, 0x8d, 0xb5, 0x68, 0x40, 0x43, 0xa6, 0x79, 0x61, 0x4f, 0x97, 0x68, + 0x79, 0x8a, 0x5a, 0x7d, 0x47, 0x96, 0x5c, 0x97, 0x3f, 0xaa, 0x94, 0x52, + 0x3a, 0xab, 0xb5, 0x85, 0xb3, 0x82, 0x8c, 0x89, 0xab, 0xac, 0x96, 0xb9, + 0x5b, 0x79, 0x72, 0x41, 0xa0, 0xb6, 0x45, 0xa7, 0x40, 0x6b, 0x90, 0xbb, + 0x48, 0x4f, 0xb5, 0x72, 0x61, 0x8f, 0x55, 0x64, 0x8e, 0x48, 0x92, 0x7c, + 0x59, 0xc5, 0xc5, 0x6f, 0x65, 0xcd, 0x60, 0x56, 0x7e, 0x5e, 0xc0, 0x70, + 0x58, 0x91, 0x61, 0x9c, 0x48, 0x3f, 0x96, 0xa6, 0x55, 0x40, 0xb8, 0x74, + 0xb4, 0x6a, 0x6b, 0x64, 0x96, 0x4f, 0x55, 0x9b, 0x9f, 0x68, 0x76, 0x7d, + 0xa1, 0xb7, 0x82, 0x56, 0x64, 0xa3, 0x74, 0x5d, 0x6d, 0xa8, 0xbd, 0x9d, + 0xbd, 0x52, 0x73, 0x3b, 0x93, 0x6b, 0x8c, 0x7d, 0x72, 0xc4, 0x65, 0xb7, + 0x85, 0x9b, 0xa4, 0xbe, 0xab, 0x52, 0x51, 0xa3, 0x8c, 0x7d, 0x7b, 0x90, + 0xbe, 0xa6, 0x4c, 0x8f, 0xbd, 0xb2, 0x94, 0xab, 0xbf, 0xb7, 0x59, 0xaa, + 0x76, 0x9e, 0x97, 0x6b, 0x70, 0x79, 0x5a, 0x52, 0xb0, 0x7f, 0x57, 0xa6, + 0x88, 0x8d, 0x88, 0x7a, 0x49, 0xb0, 0x74, 0x34, 0x46, 0x80, 0x8c, 0xb9, + 0x98, 0x9b, 0x92, 0x91, 0x53, 0x73, 0xac, 0xaf, 0x42, 0x3a, 0x85, 0xbd, + 0x84, 0x85, 0xa0, 0xb9, 0x6b, 0x78, 0xa4, 0xcb, 0x8d, 0xbc, 0xb2, 0xca, + 0xb0, 0x52, 0xc0, 0x4c, 0x74, 0x9e, 0x8f, 0x73, 0x6a, 0xb1, 0x5d, 0x6e, + 0x67, 0x5d, 0x76, 0x50, 0x60, 0x8d, 0x4c, 0x6a, 0xa6, 0x85, 0xab, 0x85, + 0x68, 0xbd, 0x63, 0xa6, 0x7c, 0x59, 0x5a, 0xa3, 0x76, 0x52, 0x5a, 0xb0, + 0x9e, 0x84, 0xa2, 0x9d, 0x67, 0x84, 0x84, 0x5d, 0xb2, 0xab, 0x3f, 0xb2, + 0x55, 0xa4, 0x4d, 0x84, 0xc0, 0x48, 0x99, 0xb5, 0x61, 0x65, 0x70, 0x53, + 0x47, 0x67, 0x92, 0x9a, 0x53, 0x5c, 0x41, 0x4c, 0x93, 0x6f, 0x57, 0x46, + 0x74, 0x9a, 0x9f, 0x55, 0x8a, 0xb0, 0x52, 0x3c, 0x7b, 0x73, 0x7a, 0x70, + 0xb9, 0x84, 0x73, 0x88, 0x82, 0x97, 0x4c, 0xbd, 0x85, 0xaf, 0xb9, 0x7c, + 0x96, 0x9f, 0xa4, 0x3e, 0x67, 0x60, 0x91, 0x72, 0x80, 0xbf, 0x59, 0x9a, + 0x9a, 0xa4, 0x43, 0x5f, 0x4c, 0x60, 0x65, 0x3c, 0x88, 0x81, 0x46, 0x97, + 0x8a, 0x42, 0x72, 0xa2, 0x95, 0x7c, 0xa0, 0xb4, 0x48, 0x68, 0xa6, 0x9a, + 0x50, 0x78, 0x75, 0x5c, 0x9d, 0x96, 0x85, 0x61, 0x4e, 0xa0, 0xbd, 0x5e, + 0x96, 0x8e, 0x85, 0x46, 0x68, 0x9b, 0x4d, 0x61, 0x79, 0x77, 0x7d, 0x85, + 0x9a, 0x6c, 0x96, 0x42, 0x5a, 0x95, 0x6d, 0x82, 0x56, 0x99, 0xb7, 0x66, + 0x70, 0x95, 0xb5, 0x51, 0x6c, 0x9b, 0x59, 0x92, 0x2f, 0x9b, 0x67, 0x9f, + 0x7b, 0xc2, 0x78, 0x99, 0x70, 0xa8, 0x63, 0x6e, 0x53, 0x51, 0x59, 0x49, + 0x52, 0xa7, 0x92, 0x7f, 0x99, 0x6a, 0x48, 0x68, 0xad, 0x70, 0x37, 0x7b, + 0x82, 0x6b, 0xb0, 0x93, 0x76, 0x80, 0xd8, 0xd1, 0x9a, 0x9e, 0xdd, 0xdf, + 0xae, 0xaf, 0x39, 0x47, 0xb5, 0x78, 0x52, 0x57, 0xb3, 0x51, 0x60, 0x83, + 0xb3, 0x69, 0x76, 0x63, 0x8c, 0x9b, 0xc3, 0xb2, 0xd1, 0xd3, 0xb9, 0x90, + 0x77, 0x56, 0x7d, 0x4c, 0x3a, 0x54, 0x84, 0x63, 0x8c, 0x68, 0x3a, 0x5d, + 0x47, 0xbd, 0x53, 0xac, 0x85, 0xc9, 0x59, 0xbd, 0x8f, 0xa1, 0x84, 0x92, + 0x96, 0xa8, 0xbf, 0x6f, 0x54, 0x3b, 0x34, 0x88, 0x8b, 0x95, 0x8c, 0x9c, + 0x75, 0xb0, 0x45, 0x97, 0x90, 0x5d, 0x70, 0x84, 0x77, 0xa2, 0x81, 0xb1, + 0x67, 0xb9, 0x4c, 0x45, 0x61, 0x8e, 0x38, 0x43, 0x65, 0x6a, 0x7c, 0x57, + 0x5f, 0x5f, 0x41, 0x3e, 0x8c, 0x7c, 0x5d, 0x54, 0xac, 0x84, 0x22, 0x3f, + 0x70, 0xab, 0x39, 0xa3, 
0x5f, 0x5b, 0x64, 0x92, 0x94, 0xa8, 0x5f, 0x97, + 0x5a, 0x9a, 0x53, 0x51, 0xa5, 0x71, 0xb6, 0xbc, 0x80, 0x83, 0xc1, 0x94, + 0x92, 0x2c, 0x46, 0x96, 0x9f, 0xa0, 0x80, 0x5b, 0x95, 0x94, 0x69, 0x61, + 0x48, 0x7e, 0x5b, 0x91, 0x8a, 0xa1, 0x66, 0x97, 0x5b, 0x75, 0x65, 0xb1, + 0xac, 0xaf, 0x75, 0x7c, 0x62, 0x74, 0x4d, 0xb9, 0x5c, 0x51, 0x96, 0xa1, + 0xbb, 0x74, 0x6d, 0x49, 0x76, 0x24, 0xa6, 0x80, 0x69, 0x58, 0xad, 0x63, + 0x9e, 0xd5, 0xa2, 0x73, 0xb4, 0x9f, 0x87, 0x89, 0xa9, 0x99, 0x71, 0x7f, + 0xab, 0x69, 0x59, 0x48, 0xa3, 0xa9, 0x60, 0x93, 0xba, 0x9b, 0x3d, 0x64, + 0x7c, 0x5c, 0x6c, 0x9a, 0xb0, 0x76, 0xbe, 0x8c, 0x71, 0x79, 0xae, 0xab, + 0x71, 0x1d, 0x2f, 0x55, 0x3f, 0xb9, 0xe2, 0xc6, 0xb1, 0xdb, 0xfa, 0xb1, + 0x85, 0x8f, 0x3a, 0x3e, 0x29, 0x3b, 0x9d, 0x89, 0x9d, 0x59, 0x98, 0xbf, + 0x7e, 0x53, 0x8f, 0xa7, 0x97, 0xd2, 0xc3, 0x83, 0xac, 0xa8, 0x8e, 0x82, + 0x32, 0x39, 0x79, 0x83, 0x3e, 0x7e, 0xb7, 0x9d, 0x69, 0x56, 0xb2, 0x77, + 0x91, 0x47, 0x6c, 0x49, 0x38, 0x3f, 0xa1, 0x5e, 0x82, 0xc7, 0xca, 0x69, + 0x73, 0xf6, 0xa3, 0x4a, 0x85, 0x38, 0x8e, 0x4b, 0x45, 0x4e, 0x40, 0x65, + 0x58, 0xbf, 0x68, 0x4a, 0x3b, 0x95, 0x6f, 0x54, 0x4c, 0x96, 0xa7, 0x64, + 0xa6, 0x3f, 0x14, 0x2d, 0x8d, 0x55, 0x27, 0x7e, 0x93, 0x5a, 0xbb, 0xa4, + 0x73, 0x91, 0xab, 0xa7, 0x5b, 0xb9, 0x79, 0x8d, 0x67, 0x7a, 0xb3, 0x57, + 0x51, 0x4f, 0xaa, 0x37, 0x4e, 0xa3, 0x59, 0x43, 0x77, 0x69, 0xd0, 0x66, + 0xb5, 0xd5, 0x6a, 0x55, 0x7d, 0x9d, 0x6f, 0x5f, 0x9c, 0x74, 0xc7, 0xda, + 0xcb, 0x8a, 0xca, 0xa7, 0xd0, 0xbe, 0x84, 0x84, 0xa9, 0x7f, 0x63, 0x9a, + 0xab, 0xcd, 0x86, 0x76, 0xa3, 0x81, 0x8b, 0x83, 0x87, 0xa4, 0x8d, 0x91, + 0x66, 0x85, 0x45, 0x69, 0x9d, 0x78, 0xca, 0x89, 0xb0, 0x85, 0xdc, 0xda, + 0xa7, 0xa4, 0x92, 0x89, 0x76, 0xaf, 0x52, 0x9a, 0x9d, 0x59, 0x61, 0x51, + 0x69, 0x99, 0x37, 0x96, 0x68, 0xa8, 0xad, 0xaf, 0x97, 0x81, 0xa7, 0x6c, + 0xc0, 0x7a, 0x77, 0x84, 0x62, 0x7b, 0x8f, 0x84, 0xcf, 0xd1, 0x60, 0xa6, + 0x70, 0x57, 0x65, 0x5b, 0x47, 0x64, 0x95, 0x5f, 0xa0, 0x61, 0x93, 0x64, + 0x58, 0xa5, 0x97, 0x81, 0x4b, 0x74, 0xc6, 0x77, 0x9d, 0x78, 0x52, 0x46, + 0xb3, 0x88, 0x8f, 0x53, 0x33, 0x7e, 0x87, 0x8a, 0xb7, 0x59, 0x86, 0x53, + 0x96, 0xc0, 0x68, 0xcc, 0xbe, 0x89, 0xbc, 0xb6, 0x91, 0xa2, 0xa9, 0xbf, + 0x4e, 0xa2, 0x2b, 0x89, 0x58, 0x88, 0xa8, 0x87, 0x65, 0xae, 0xb9, 0x65, + 0x5c, 0x9b, 0xd0, 0x4a, 0x3c, 0x68, 0xb8, 0xb4, 0x83, 0xb0, 0xa1, 0x88, + 0x9d, 0x68, 0x99, 0x51, 0xb1, 0xa8, 0x78, 0xb2, 0x91, 0x5c, 0x8e, 0x82, + 0xbb, 0x6d, 0x4c, 0xc0, 0x96, 0x4b, 0xbd, 0x45, 0x94, 0xc7, 0x59, 0x68, + 0xbe, 0x5a, 0xa9, 0x61, 0xa0, 0x49, 0x69, 0x9e, 0x34, 0x7f, 0x81, 0x62, + 0x9a, 0xb5, 0xbb, 0x5f, 0x4a, 0xb6, 0xac, 0x3c, 0x5d, 0x89, 0x7f, 0x7f, + 0x40, 0x56, 0x6c, 0x3a, 0xb1, 0x86, 0x79, 0x59, 0xb7, 0x52, 0x5e, 0xd2, + 0x5f, 0x61, 0x48, 0x42, 0xb5, 0x90, 0x5a, 0x5d, 0x60, 0xc5, 0xa8, 0xc6, + 0x69, 0x56, 0xad, 0x8b, 0xa0, 0x96, 0x5e, 0x4f, 0x8a, 0xa5, 0x85, 0x3a, + 0xa7, 0x48, 0x50, 0xc0, 0x77, 0x95, 0x5a, 0x88, 0xab, 0x68, 0x9e, 0x95, + 0xaa, 0xba, 0x90, 0xa9, 0xba, 0x73, 0x3f, 0x71, 0x8c, 0x5b, 0xbf, 0x4c, + 0x43, 0x56, 0xc7, 0x68, 0x3a, 0x5b, 0xa4, 0x57, 0x7f, 0x43, 0x85, 0x79, + 0x7f, 0x64, 0x95, 0x80, 0x9c, 0x57, 0x7b, 0x9b, 0x53, 0x96, 0xa5, 0x6b, + 0xb5, 0x6f, 0x70, 0x7e, 0x72, 0x59, 0xa0, 0xab, 0x92, 0x9d, 0x42, 0x74, + 0x54, 0x54, 0xbd, 0x41, 0x8d, 0x9c, 0x9f, 0x5e, 0x6a, 0xb9, 0x8d, 0x64, + 0x6c, 0x5c, 0x3c, 0xb6, 0xb1, 0x7c, 0xa7, 0x95, 0xb4, 0xc3, 0x71, 0x9b, + 0x90, 0x5d, 0x57, 0xc7, 0x65, 0xbf, 0x9a, 0x38, 0x99, 0x94, 0x3f, 0x3f, + 0x9c, 0x4d, 0x5f, 0x48, 0xb0, 0x7b, 0xc8, 0x73, 0x9c, 0xa0, 0x7d, 0x6f, + 0x7a, 0x4b, 0x89, 0x8a, 
0xc4, 0xb0, 0x96, 0x88, 0x9f, 0x9a, 0xa2, 0x5b, + 0x69, 0x95, 0x71, 0x40, 0x7e, 0x90, 0x48, 0x8f, 0x80, 0x50, 0x8d, 0x9a, + 0xbe, 0x8c, 0xa0, 0x55, 0xa7, 0xbc, 0x64, 0x8f, 0x41, 0xa1, 0x8f, 0x75, + 0x95, 0x9e, 0x39, 0x5c, 0x71, 0x44, 0xc3, 0x63, 0x8a, 0x66, 0x72, 0xc7, + 0xb2, 0x80, 0xa2, 0x45, 0x74, 0x5e, 0x7f, 0x6e, 0xba, 0x94, 0x68, 0xb9, + 0x76, 0xb4, 0x95, 0x68, 0x4b, 0x3b, 0x3e, 0x92, 0x99, 0xa4, 0x70, 0x7e, + 0x5b, 0x87, 0xae, 0x79, 0x84, 0x41, 0x8a, 0xd6, 0x3f, 0xb1, 0x39, 0x45, + 0x8e, 0x7d, 0x92, 0x4a, 0x77, 0x67, 0x5c, 0xb6, 0x8d, 0x7c, 0x89, 0x8a, + 0xab, 0xaf, 0xa1, 0x84, 0x42, 0x52, 0x4c, 0x8b, 0x92, 0xba, 0xb2, 0x69, + 0xa0, 0x59, 0x9a, 0x76, 0xad, 0x95, 0x41, 0x5c, 0x57, 0x87, 0xb1, 0x76, + 0x90, 0x7c, 0x41, 0x6c, 0x73, 0xb6, 0x76, 0x50, 0xae, 0x82, 0x96, 0x9e, + 0x37, 0x86, 0x43, 0xa9, 0xb2, 0xae, 0x4d, 0x72, 0xb9, 0x4b, 0x62, 0x6b, + 0x8d, 0x7e, 0x69, 0x55, 0xa8, 0x63, 0x7b, 0xbd, 0x83, 0x82, 0x6e, 0x49, + 0x67, 0xa7, 0x69, 0x65, 0xac, 0x70, 0x48, 0x94, 0x5f, 0x46, 0x51, 0xad, + 0xa8, 0x66, 0x71, 0x61, 0xb8, 0x81, 0x51, 0xc1, 0x46, 0x6d, 0xba, 0x49, + 0x5b, 0x66, 0xbf, 0x7a, 0x57, 0xa6, 0x42, 0x4b, 0x82, 0xb4, 0x49, 0x9f, + 0x9e, 0x96, 0x5a, 0x6f, 0x49, 0x4d, 0x41, 0x5f, 0x90, 0xa3, 0x3a, 0x73, + 0x8f, 0x5f, 0x8c, 0xa2, 0x58, 0x61, 0x82, 0x44, 0x3d, 0xa4, 0x7f, 0x66, + 0x61, 0x39, 0xb3, 0xac, 0x64, 0xa5, 0x80, 0x68, 0x91, 0xbc, 0xa7, 0x8d, + 0x72, 0xad, 0x7f, 0x76, 0xaf, 0x4a, 0x91, 0x5c, 0x4e, 0x97, 0x3d, 0x74, + 0x98, 0x47, 0xad, 0xa0, 0xad, 0x51, 0x9a, 0x98, 0x8b, 0xbc, 0x63, 0x81, + 0x9d, 0x69, 0x7b, 0x4a, 0x55, 0x98, 0x9f, 0xac, 0x9c, 0xb1, 0x5a, 0x54, + 0xa2, 0x74, 0xb6, 0x58, 0x92, 0xbd, 0x93, 0x79, 0x60, 0xb3, 0xa4, 0x4f, + 0x58, 0x66, 0x84, 0x6b, 0x25, 0x60, 0xd5, 0x91, 0x7e, 0x65, 0x97, 0x67, + 0x8b, 0x43, 0x7e, 0xb2, 0x84, 0x8a, 0x80, 0x89, 0xae, 0x9f, 0x74, 0x5a, + 0x92, 0x8d, 0x66, 0x90, 0xc3, 0xaa, 0x89, 0x63, 0x69, 0x9f, 0xaf, 0x6f, + 0x67, 0x6b, 0xde, 0x5d, 0x70, 0xa6, 0xb5, 0x87, 0x97, 0x99, 0x3b, 0x47, + 0x40, 0x6b, 0xad, 0x47, 0xa4, 0x3c, 0x7d, 0x30, 0x68, 0x6e, 0xc2, 0x77, + 0xa0, 0x51, 0x8f, 0x78, 0x70, 0xac, 0x98, 0x40, 0x35, 0x77, 0x5e, 0xb0, + 0x91, 0x98, 0x66, 0xc4, 0x9b, 0x9c, 0xbe, 0xb9, 0x87, 0x88, 0x78, 0x50, + 0x79, 0x57, 0x85, 0x3d, 0x61, 0xcb, 0xa9, 0x59, 0xb4, 0xaf, 0xaa, 0x92, + 0x9b, 0x7a, 0xb8, 0x5b, 0x5c, 0x98, 0xb4, 0x74, 0x30, 0x61, 0x50, 0x44, + 0x53, 0x38, 0x7f, 0x5a, 0x60, 0x8d, 0x3e, 0x75, 0x9c, 0xab, 0x6a, 0xae, + 0x75, 0x74, 0x4e, 0x53, 0x70, 0x8d, 0x49, 0xb1, 0xc2, 0x80, 0xbe, 0x6d, + 0x65, 0x53, 0x77, 0x79, 0x7e, 0x85, 0x7c, 0x9b, 0x68, 0x3e, 0x46, 0x62, + 0x49, 0xac, 0xc0, 0x52, 0xc3, 0x62, 0x7d, 0x7e, 0x7f, 0xa8, 0xa7, 0x93, + 0xc0, 0x8a, 0x62, 0x99, 0x87, 0x9d, 0xa3, 0x8e, 0x9a, 0x9e, 0x52, 0xad, + 0x6c, 0xc1, 0x66, 0x62, 0x86, 0x74, 0x7b, 0x74, 0x5d, 0xa5, 0x9a, 0x5a, + 0x99, 0x5b, 0x60, 0x4a, 0x77, 0xb0, 0x99, 0x48, 0x93, 0x86, 0x7b, 0x5f, + 0x41, 0xac, 0x61, 0x48, 0x68, 0xa9, 0x37, 0xb7, 0x9c, 0x89, 0xad, 0x76, + 0x60, 0x53, 0x75, 0x83, 0x75, 0x4c, 0x9c, 0xa4, 0xb8, 0x9a, 0x2b, 0xa8, + 0xba, 0x65, 0x37, 0x5b, 0x3f, 0x8a, 0xa9, 0x3e, 0xa5, 0xbf, 0x83, 0x98, + 0x51, 0x93, 0x60, 0x4a, 0xa0, 0xbb, 0x39, 0x6e, 0x9a, 0x56, 0x4e, 0x64, + 0xc6, 0xbd, 0x97, 0x96, 0x85, 0xa2, 0xb6, 0x93, 0x94, 0x92, 0x55, 0x74, + 0x3f, 0x64, 0x9f, 0x69, 0x8f, 0x51, 0xa3, 0xb6, 0xae, 0x3e, 0x33, 0x64, + 0x5f, 0x91, 0x6f, 0x97, 0x65, 0x6a, 0x88, 0xa7, 0xc6, 0xa2, 0x83, 0x67, + 0x85, 0xbb, 0xb4, 0xb3, 0x80, 0x88, 0x85, 0x84, 0x56, 0x57, 0x86, 0x83, + 0x75, 0x5b, 0xb1, 0x73, 0xa3, 0xb3, 0x6f, 0xb6, 0x88, 0xae, 0x47, 0x50, + 0x5c, 0x43, 0x3e, 0xa3, 
0xa4, 0x99, 0x97, 0x82, 0xc6, 0x3d, 0xaa, 0xa7, + 0x64, 0xa7, 0xa3, 0x80, 0x3c, 0x86, 0x3b, 0xaa, 0xcb, 0xae, 0x8d, 0x88, + 0xb3, 0x62, 0x9d, 0x8d, 0x74, 0x94, 0x7e, 0x87, 0x5a, 0x56, 0xbb, 0x6a, + 0x66, 0x9f, 0x68, 0x98, 0x62, 0x5d, 0x82, 0x3c, 0x7f, 0x9b, 0x72, 0x51, + 0x7c, 0x92, 0x5b, 0x63, 0x7b, 0x81, 0xc2, 0xcd, 0x67, 0x6c, 0x7e, 0x66, + 0x9b, 0xbb, 0xa7, 0xbb, 0xbf, 0xb0, 0xd6, 0x98, 0xba, 0xbc, 0x46, 0x7f, + 0x64, 0xb5, 0x57, 0x3b, 0x6e, 0x5d, 0x4f, 0xb8, 0x83, 0x5c, 0xbd, 0xba, + 0x58, 0x82, 0x34, 0x7d, 0xaa, 0xa9, 0x95, 0x54, 0xcb, 0xb3, 0x58, 0xc3, + 0x64, 0x97, 0x62, 0x6e, 0x6d, 0x57, 0x9a, 0xa4, 0xa4, 0x8a, 0xa5, 0xb2, + 0x66, 0x56, 0xc0, 0x53, 0x7e, 0xbf, 0x56, 0xaf, 0x7d, 0x43, 0xbf, 0x6c, + 0xad, 0x8f, 0xab, 0x3e, 0xa8, 0x78, 0x87, 0xab, 0x71, 0x54, 0x3b, 0xb6, + 0x5c, 0xb2, 0x5a, 0x7e, 0x96, 0xcb, 0x57, 0x62, 0x5f, 0x4d, 0x96, 0x52, + 0x7c, 0x9e, 0x86, 0x75, 0x3a, 0xad, 0x5e, 0x5b, 0xb4, 0x8e, 0x66, 0xb2, + 0x96, 0x82, 0xaf, 0x79, 0x65, 0xa7, 0xb6, 0xa9, 0xa8, 0x65, 0xa6, 0x8b, + 0xa7, 0x98, 0x6f, 0x55, 0x78, 0xab, 0x8b, 0x70, 0x94, 0x89, 0x43, 0xb1, + 0xb0, 0xb5, 0x49, 0x46, 0x6d, 0x9d, 0x86, 0x9b, 0xc0, 0x8b, 0xa6, 0x56, + 0x72, 0x61, 0x98, 0x67, 0x51, 0x76, 0x3f, 0xbe, 0x5c, 0x7a, 0x87, 0x3c, + 0x46, 0x36, 0x8f, 0xa4, 0xa2, 0x61, 0xbb, 0x91, 0xd5, 0x5d, 0xa9, 0xc0, + 0x97, 0xb5, 0xb2, 0x58, 0x24, 0x43, 0x5d, 0xa3, 0x68, 0x82, 0x8b, 0x2a, + 0x88, 0x5a, 0x51, 0xb3, 0x6f, 0xb2, 0x76, 0x7e, 0x6a, 0xa5, 0x51, 0x53, + 0xa1, 0xd6, 0x89, 0x77, 0xcd, 0x9a, 0x98, 0xbe, 0xb7, 0x80, 0x81, 0x78, + 0x7d, 0xb2, 0x76, 0x80, 0x74, 0x62, 0x79, 0x9a, 0xbd, 0x6a, 0x67, 0xc9, + 0x4c, 0x7c, 0x9f, 0x85, 0xa2, 0x6b, 0x93, 0x37, 0x2f, 0x41, 0x4d, 0x3d, + 0x8c, 0x84, 0x8c, 0x56, 0x74, 0x63, 0xad, 0x80, 0x60, 0x9c, 0x98, 0x57, + 0x63, 0xa5, 0x59, 0x8e, 0x2a, 0x80, 0x90, 0x3f, 0x92, 0xaf, 0x35, 0xac, + 0x8e, 0xa2, 0xc5, 0x74, 0x7f, 0xa8, 0x90, 0x6b, 0x76, 0x45, 0x43, 0x64, + 0xaa, 0xad, 0x74, 0x9a, 0xbb, 0x51, 0x8e, 0x43, 0xc2, 0xc7, 0xc1, 0x58, + 0x56, 0x76, 0x68, 0x7a, 0x42, 0x67, 0x7c, 0x56, 0xc4, 0x87, 0x60, 0x39, + 0x8a, 0x70, 0x7c, 0x7f, 0x8a, 0xa3, 0x74, 0xa4, 0xae, 0x64, 0x8d, 0x9d, + 0x5c, 0x71, 0x99, 0x62, 0x86, 0x68, 0x9b, 0xac, 0x38, 0x86, 0x6b, 0x5a, + 0xba, 0x70, 0x68, 0x9e, 0x93, 0x8f, 0xa8, 0xa6, 0x7e, 0x45, 0x89, 0x5f, + 0x3f, 0x4a, 0xb8, 0x83, 0xbe, 0x64, 0xb9, 0x58, 0x83, 0x61, 0x6d, 0x42, + 0x74, 0x5b, 0x66, 0x58, 0x4d, 0x8c, 0x6c, 0x64, 0x57, 0xaa, 0xad, 0x7d, + 0x7d, 0xad, 0x76, 0x50, 0x6e, 0x9a, 0x81, 0x93, 0xaa, 0x8e, 0x37, 0x79, + 0x9d, 0xa1, 0x78, 0x4c, 0x3a, 0x6f, 0x64, 0x5b, 0x6b, 0x92, 0x68, 0xb4, + 0x5a, 0x9d, 0x4b, 0x4b, 0xac, 0x44, 0x85, 0x5b, 0x90, 0x94, 0x60, 0x43, + 0x55, 0xa5, 0xb8, 0x6f, 0x86, 0xb9, 0x8c, 0x4a, 0x94, 0x7a, 0x4c, 0x7b, + 0x45, 0x62, 0x7e, 0xab, 0xa7, 0x95, 0x86, 0xb4, 0x5a, 0xa7, 0xa7, 0x96, + 0x9f, 0x5f, 0x86, 0xac, 0x87, 0x7b, 0x5e, 0x6f, 0x61, 0x1f, 0x5e, 0xa9, + 0x71, 0x39, 0xa5, 0x87, 0x40, 0x92, 0xc8, 0x85, 0xa4, 0x67, 0xa9, 0x9c, + 0x68, 0x56, 0xc0, 0x6b, 0xa6, 0xa8, 0x50, 0xb0, 0x7b, 0xa5, 0xbe, 0x82, + 0x96, 0x47, 0x9c, 0x84, 0x88, 0xae, 0x60, 0x40, 0x60, 0x3d, 0x3e, 0x82, + 0x8a, 0x52, 0x59, 0xb3, 0x42, 0x53, 0xb2, 0xc4, 0x97, 0xbd, 0xa5, 0xa1, + 0x80, 0xa1, 0x7d, 0xa6, 0x7a, 0x67, 0x65, 0xaa, 0x59, 0xa4, 0x5b, 0x62, + 0x9f, 0x89, 0x86, 0x5e, 0x4a, 0xab, 0xa1, 0x5c, 0x49, 0x7b, 0x85, 0x3a, + 0xa1, 0x4a, 0x82, 0x45, 0x81, 0x5a, 0xb8, 0xa8, 0xc0, 0x9e, 0x8c, 0x4e, + 0x50, 0x89, 0x3e, 0x50, 0x5a, 0x72, 0xa0, 0x7b, 0x5c, 0x8e, 0x6d, 0x5a, + 0xb0, 0x59, 0xb1, 0x2a, 0xad, 0x87, 0xa8, 0x60, 0x89, 0x78, 0xc0, 0x8d, + 0x5d, 0x6e, 0x99, 0xba, 
0x5b, 0x52, 0x49, 0x99, 0xce, 0x85, 0x4f, 0xcb, + 0x9d, 0x4f, 0x3f, 0x8c, 0x54, 0xa9, 0x87, 0x6e, 0x54, 0x8f, 0x86, 0x5d, + 0xab, 0xcf, 0x9c, 0x6d, 0x49, 0xb2, 0x37, 0x75, 0x64, 0x51, 0x89, 0xa1, + 0x8b, 0xa6, 0x90, 0xc4, 0xa2, 0xd9, 0x81, 0xb8, 0x4f, 0xc4, 0x88, 0xb3, + 0x3b, 0x33, 0x94, 0x40, 0x9d, 0x89, 0x7a, 0xb3, 0x59, 0x4f, 0x9a, 0x8d, + 0x5e, 0x6c, 0x75, 0x93, 0x93, 0xae, 0x8d, 0xa6, 0x74, 0x8f, 0x86, 0x80, + 0x7b, 0x75, 0x72, 0x93, 0x53, 0x49, 0x80, 0x39, 0x54, 0x4c, 0x96, 0x79, + 0x65, 0x7f, 0xaf, 0x7f, 0x94, 0x5f, 0xb9, 0x51, 0x87, 0xb8, 0xa5, 0x69, + 0x91, 0x7e, 0xc0, 0x99, 0x43, 0x84, 0xb2, 0x48, 0x77, 0x9f, 0x57, 0x61, + 0xb3, 0x9d, 0x99, 0x7d, 0x38, 0xbd, 0x56, 0xc5, 0x87, 0xc1, 0x4e, 0xb2, + 0xa3, 0xc4, 0x77, 0xac, 0x44, 0x3b, 0x9b, 0x4d, 0x8f, 0xd8, 0xc2, 0x81, + 0xa1, 0x58, 0xab, 0x84, 0x64, 0x8a, 0x5f, 0x19, 0x42, 0x9b, 0xac, 0x3b, + 0x46, 0x7f, 0x63, 0x70, 0x6c, 0x37, 0xa6, 0xad, 0x50, 0x85, 0x75, 0x6d, + 0xa7, 0xa1, 0x82, 0xd6, 0x92, 0x7d, 0x66, 0x8b, 0x6c, 0xca, 0xa3, 0x80, + 0x9d, 0x65, 0xb9, 0x78, 0x68, 0x4c, 0x65, 0xb3, 0x60, 0x5f, 0xb6, 0x8e, + 0x7a, 0x4e, 0x73, 0x9b, 0xd6, 0x72, 0xb5, 0x51, 0x53, 0x53, 0x90, 0x9c, + 0x63, 0x66, 0x72, 0xa9, 0x5d, 0x95, 0x85, 0x7f, 0xb9, 0x84, 0xb6, 0x76, + 0xa2, 0x9d, 0x67, 0x82, 0x5c, 0x4e, 0x97, 0x35, 0x44, 0x4b, 0x65, 0x7f, + 0xb0, 0x88, 0x7d, 0x4f, 0x80, 0xa4, 0x3c, 0x84, 0x9b, 0xa8, 0x52, 0x43, + 0xb8, 0x7a, 0x91, 0xa1, 0x82, 0x6f, 0xc7, 0x6c, 0xbb, 0x54, 0xb5, 0x73, + 0x37, 0x7b, 0x45, 0x7a, 0x4e, 0x8f, 0x78, 0x94, 0x49, 0x4d, 0x54, 0x6a, + 0x51, 0x5e, 0x73, 0x4c, 0xb1, 0xa9, 0x64, 0x6b, 0x9b, 0x8d, 0xb4, 0x7b, + 0x68, 0x8e, 0xac, 0xb3, 0x87, 0x66, 0xbe, 0x69, 0x8f, 0xa9, 0x50, 0xad, + 0x98, 0x7d, 0x80, 0x9f, 0x5c, 0xb4, 0x63, 0x4f, 0x6b, 0x3c, 0x2c, 0x40, + 0x7d, 0x65, 0xb9, 0x62, 0xb0, 0x88, 0x82, 0x4b, 0xc0, 0x6b, 0x4c, 0x77, + 0xb9, 0x45, 0x3f, 0x3a, 0x93, 0x3c, 0x8c, 0xba, 0x90, 0x68, 0xbd, 0x99, + 0x52, 0x69, 0x8f, 0x51, 0x52, 0xb5, 0xa0, 0x8b, 0x73, 0x77, 0x95, 0x73, + 0x8b, 0xbc, 0x65, 0xb4, 0x79, 0x87, 0x86, 0x8f, 0x97, 0x99, 0x83, 0xc3, + 0xa6, 0xac, 0x51, 0x55, 0x78, 0x2f, 0x60, 0x96, 0x96, 0x70, 0x49, 0xa0, + 0x3c, 0xc0, 0xac, 0x53, 0xb1, 0x3a, 0x6f, 0x63, 0x65, 0x95, 0xbd, 0x72, + 0x67, 0xb3, 0x97, 0x47, 0xbb, 0xa2, 0x68, 0xae, 0xa8, 0x49, 0x8d, 0x93, + 0x5c, 0x8c, 0xa7, 0x6f, 0x7a, 0x72, 0x6a, 0x84, 0x91, 0x4b, 0x3e, 0xb7, + 0x67, 0x40, 0x65, 0x39, 0xa3, 0xb0, 0x5f, 0x94, 0x4f, 0x9b, 0x5e, 0x69, + 0x79, 0x3b, 0x76, 0xb6, 0x86, 0xa7, 0x32, 0x41, 0x3a, 0x48, 0x9c, 0xb5, + 0x58, 0x9b, 0xbd, 0xb0, 0xb5, 0x56, 0xb5, 0x96, 0x42, 0xa5, 0xa1, 0x5b, + 0xb6, 0x60, 0x99, 0x40, 0xc6, 0x87, 0x79, 0xb7, 0xb9, 0xbf, 0xaa, 0xab, + 0x89, 0x90, 0x84, 0x81, 0x96, 0x4c, 0x55, 0x9b, 0x3e, 0x5d, 0x48, 0x77, + 0x64, 0x6d, 0x97, 0x90, 0xb1, 0x5c, 0x45, 0xb2, 0x9d, 0x50, 0x54, 0x82, + 0x79, 0xa2, 0x9b, 0x75, 0x58, 0x93, 0x97, 0x5a, 0x7a, 0x66, 0xdb, 0x6f, + 0xa4, 0x92, 0x90, 0x4d, 0xb6, 0x9f, 0x83, 0x6a, 0x95, 0x86, 0xb9, 0x74, + 0x57, 0x5d, 0x96, 0x6f, 0x73, 0x67, 0x6c, 0x95, 0xbb, 0x7e, 0x6c, 0xc6, + 0x54, 0xa0, 0x6e, 0xbf, 0xc6, 0x88, 0x65, 0x6c, 0x9a, 0x4c, 0xc6, 0x9d, + 0xb6, 0xad, 0xba, 0x64, 0x3f, 0x8e, 0x5d, 0x70, 0x81, 0x7d, 0x76, 0x55, + 0x50, 0x61, 0x82, 0xa4, 0x6e, 0x43, 0xa1, 0xae, 0x67, 0xae, 0x7c, 0x71, + 0x5e, 0x4a, 0x5d, 0xc4, 0x6b, 0x5a, 0x8e, 0xba, 0xc2, 0xa6, 0xaf, 0x63, + 0x3a, 0xa3, 0x6c, 0x95, 0x8c, 0x9b, 0x9d, 0x9d, 0x5e, 0xa4, 0x41, 0xae, + 0x9f, 0x90, 0x7a, 0x97, 0xc7, 0x7b, 0x6c, 0x6c, 0xc5, 0x65, 0xa7, 0x7a, + 0x6f, 0x9f, 0x39, 0x98, 0x59, 0x95, 0x92, 0x51, 0x81, 0x95, 0x90, 0x6f, + 0x9a, 0x63, 0x6b, 0x7b, 
0x76, 0xb1, 0x83, 0x4e, 0x49, 0x63, 0x6e, 0x4d, + 0xc1, 0x5f, 0x77, 0x5f, 0x64, 0xa4, 0xa3, 0x60, 0x76, 0x62, 0xbf, 0x6a, + 0x91, 0x9a, 0xba, 0xbb, 0xa1, 0x38, 0x4d, 0xb3, 0xa4, 0xc1, 0x60, 0x92, + 0xbe, 0x96, 0x79, 0x89, 0x8e, 0xa1, 0x4e, 0x5f, 0x8e, 0xa5, 0x45, 0x4f, + 0x7c, 0x5b, 0x4e, 0xc3, 0x34, 0x56, 0x56, 0x52, 0xa5, 0x39, 0xb2, 0x32, + 0x5c, 0x99, 0x3c, 0x5a, 0x96, 0x6f, 0xa0, 0xa5, 0x68, 0xbd, 0x8a, 0x31, + 0x6c, 0x96, 0x86, 0x77, 0xb8, 0x4d, 0x72, 0x4f, 0xa1, 0x50, 0x93, 0xb3, + 0x47, 0x8a, 0x8f, 0xbd, 0x62, 0xa4, 0xb2, 0xc1, 0x4e, 0x50, 0x99, 0x5e, + 0x7d, 0x30, 0x9a, 0x6b, 0xac, 0x5b, 0x78, 0x59, 0x5f, 0x97, 0xa3, 0xb1, + 0x68, 0xa3, 0x41, 0x92, 0x49, 0x51, 0x9a, 0xae, 0x9d, 0x69, 0x87, 0x7c, + 0x6d, 0x6c, 0x48, 0x8f, 0x89, 0x5b, 0x74, 0x5e, 0xa7, 0x50, 0xa2, 0x4c, + 0x87, 0xa6, 0x35, 0x47, 0x66, 0x92, 0x7d, 0x88, 0xbb, 0xc6, 0x8a, 0xab, + 0xbc, 0xb5, 0x69, 0x4c, 0x9a, 0xae, 0x60, 0x65, 0x3f, 0x51, 0x53, 0x41, + 0x44, 0x3c, 0x32, 0x8f, 0x4b, 0x36, 0xb7, 0xae, 0x97, 0x58, 0x6c, 0x9d, + 0xa6, 0x9a, 0x45, 0x9b, 0x4c, 0x6d, 0x96, 0x63, 0xaf, 0x51, 0xb0, 0x45, + 0x76, 0x6f, 0xb3, 0x87, 0x3e, 0x8e, 0x81, 0xa9, 0x90, 0x97, 0x6b, 0xc3, + 0x79, 0xc3, 0x7c, 0x70, 0xa4, 0xac, 0x5a, 0x74, 0xbc, 0x9e, 0xa5, 0x50, + 0x3d, 0xb3, 0x79, 0xb5, 0xa0, 0x3e, 0x98, 0xb3, 0x7b, 0x5e, 0x6f, 0xae, + 0x6c, 0x55, 0x5e, 0x79, 0xba, 0xa0, 0x80, 0x73, 0x42, 0x81, 0xc4, 0xaf, + 0xba, 0x72, 0xcf, 0x6b, 0x85, 0x7b, 0xa3, 0xc0, 0xa9, 0xbb, 0x80, 0x61, + 0x82, 0x81, 0x66, 0x71, 0xab, 0xa6, 0x6b, 0x94, 0x8f, 0xc4, 0x56, 0x9b, + 0xab, 0x39, 0x76, 0xb7, 0x3d, 0x5b, 0x41, 0x4b, 0xad, 0x82, 0x5b, 0x51, + 0x57, 0xae, 0xa5, 0x72, 0x66, 0x3f, 0x69, 0x63, 0x9c, 0x50, 0x67, 0xa7, + 0x6b, 0xa8, 0x90, 0x7d, 0xaf, 0x8a, 0x69, 0x78, 0xc2, 0xc2, 0x67, 0x6c, + 0xb3, 0xb3, 0x81, 0x7e, 0x7b, 0xa2, 0x9a, 0x69, 0xaa, 0x3e, 0x73, 0x76, + 0x5d, 0x75, 0x45, 0x8d, 0x62, 0x57, 0x50, 0x9b, 0x4b, 0x8f, 0x6f, 0xb2, + 0xa6, 0x4b, 0x3e, 0x44, 0x81, 0x5c, 0x81, 0xb3, 0x74, 0x7f, 0x89, 0xdb, + 0xb1, 0x61, 0xca, 0x87, 0x63, 0x31, 0x93, 0x61, 0xb9, 0x77, 0x65, 0x7f, + 0x83, 0x5f, 0x52, 0x4d, 0x5b, 0x56, 0x9e, 0x7f, 0x78, 0xad, 0x6a, 0x7c, + 0x51, 0x69, 0x5d, 0xc1, 0x7e, 0x96, 0xc6, 0x4f, 0x82, 0x60, 0x86, 0xa9, + 0x46, 0x3e, 0x3f, 0x59, 0xa9, 0x71, 0x3b, 0x95, 0xa7, 0x8f, 0x85, 0x51, + 0xb3, 0x8d, 0x90, 0x74, 0x51, 0xbb, 0x9d, 0x72, 0xc3, 0x53, 0x88, 0xab, + 0x3c, 0x94, 0x48, 0x7e, 0xd0, 0x82, 0x63, 0x83, 0x8c, 0x63, 0x82, 0x83, + 0xab, 0x48, 0x6e, 0x87, 0x87, 0x96, 0xa4, 0x4a, 0xaf, 0x59, 0x9a, 0xab, + 0x63, 0x64, 0xa0, 0x55, 0xb9, 0x65, 0x5e, 0xc0, 0x54, 0x72, 0x79, 0xa5, + 0xbb, 0x7d, 0x48, 0x50, 0x62, 0xb9, 0xb4, 0x40, 0x8d, 0x42, 0x7f, 0xab, + 0xb1, 0x7f, 0xb0, 0x5a, 0x71, 0x76, 0x6e, 0x58, 0x9e, 0x8f, 0xb3, 0xc3, + 0xc2, 0x4d, 0x80, 0xa3, 0xc6, 0xb1, 0x7c, 0x89, 0xae, 0xa8, 0x8f, 0x6e, + 0x52, 0x67, 0xc6, 0x65, 0x9a, 0x6c, 0x7f, 0xc0, 0x65, 0x52, 0x4b, 0x9a, + 0x51, 0x9d, 0x99, 0x84, 0xb7, 0xab, 0xb0, 0x93, 0xa0, 0x8c, 0x57, 0x92, + 0x5a, 0x83, 0x6b, 0x50, 0x57, 0x80, 0xab, 0x5f, 0x51, 0x7a, 0xbe, 0x73, + 0x98, 0x6c, 0x51, 0x75, 0x85, 0x4c, 0xc6, 0xad, 0x9d, 0xb5, 0xad, 0x47, + 0x9e, 0x49, 0x70, 0x8f, 0x94, 0x47, 0x80, 0xb3, 0x50, 0x60, 0x99, 0x57, + 0x86, 0x40, 0x50, 0x7c, 0x85, 0xa6, 0x87, 0x58, 0x9d, 0xb5, 0x59, 0x75, + 0x4f, 0x8e, 0xba, 0xb4, 0x8c, 0xc6, 0xcd, 0xa0, 0x90, 0x8e, 0x96, 0xb2, + 0x60, 0xb6, 0x60, 0x62, 0x6f, 0x7c, 0xb2, 0x46, 0x81, 0x99, 0xb6, 0x4f, + 0xa5, 0xb3, 0x8a, 0x8d, 0xb4, 0x5b, 0x61, 0xd0, 0xd3, 0xae, 0x7c, 0x54, + 0x9c, 0x7e, 0x71, 0x87, 0x3f, 0x32, 0x2d, 0x37, 0x67, 0x46, 0x71, 0x9c, + 0xbd, 0x9c, 0xbf, 0x6e, 
0x5a, 0xbd, 0xae, 0xc2, 0x99, 0x5e, 0x70, 0x7e, + 0xc1, 0xb3, 0xe0, 0x9a, 0x9d, 0xd3, 0x8d, 0xbe, 0xc6, 0x78, 0xa6, 0x6a, + 0x69, 0x4c, 0x38, 0x89, 0x83, 0x9e, 0xa3, 0x53, 0xbe, 0xa5, 0x51, 0x92, + 0x9d, 0x46, 0x91, 0x81, 0xd6, 0x70, 0x88, 0xb8, 0x8b, 0x30, 0xa3, 0x96, + 0x54, 0x67, 0x4e, 0x86, 0x42, 0x74, 0x4a, 0x51, 0x66, 0x5a, 0xa8, 0xb7, + 0xce, 0x81, 0xa0, 0xb8, 0xbd, 0x68, 0x81, 0xbc, 0xc5, 0xa7, 0xc3, 0x41, + 0x85, 0x55, 0xcf, 0xa2, 0x8b, 0x48, 0x67, 0x76, 0x83, 0x5f, 0xb0, 0x66, + 0xbf, 0xb5, 0x7e, 0xb6, 0xb0, 0xaa, 0xbe, 0xab, 0x60, 0x73, 0xad, 0x4e, + 0x58, 0x4b, 0x6f, 0x43, 0x80, 0x77, 0x42, 0x3d, 0x97, 0xa8, 0x8e, 0x8d, + 0xa2, 0x5c, 0xbd, 0x96, 0x5d, 0x64, 0x9c, 0x9a, 0x63, 0x7f, 0x97, 0x59, + 0xa2, 0x70, 0x98, 0x68, 0x55, 0xa9, 0x9a, 0x4a, 0x41, 0x9e, 0xa0, 0x65, + 0x64, 0x45, 0xc0, 0xbc, 0x88, 0xa3, 0x92, 0xae, 0x34, 0xbc, 0xa9, 0x7c, + 0x7a, 0x71, 0x5f, 0x48, 0xc9, 0xc8, 0x89, 0x66, 0x5e, 0xa7, 0x94, 0x6d, + 0x7b, 0x57, 0x7d, 0xb2, 0xa4, 0x5a, 0x4e, 0xc0, 0x7f, 0xb0, 0x89, 0x66, + 0x8b, 0x78, 0x5e, 0x66, 0xa1, 0x82, 0x48, 0x5d, 0xa3, 0x8e, 0xb3, 0x8a, + 0x87, 0xbd, 0x69, 0x6b, 0x6f, 0x88, 0x6c, 0x56, 0xa5, 0xc3, 0x54, 0x42, + 0x6a, 0xb8, 0x53, 0x8f, 0xb1, 0x8d, 0x4e, 0x5e, 0x66, 0x4f, 0x8d, 0x62, + 0x91, 0xcf, 0xbc, 0x9e, 0x75, 0x7a, 0xb8, 0xb0, 0x5d, 0x4a, 0x46, 0x91, + 0xca, 0x8b, 0x5d, 0x4f, 0x6f, 0x5d, 0x4a, 0x5c, 0x93, 0x7c, 0x7a, 0x57, + 0x5d, 0x7f, 0xd8, 0xab, 0xad, 0x67, 0xcb, 0xa3, 0x7c, 0x92, 0x8a, 0x28, + 0x7a, 0x5a, 0x53, 0x65, 0xb2, 0xc1, 0xab, 0x4c, 0xe8, 0xeb, 0x7f, 0x3a, + 0x6b, 0x6a, 0x5e, 0x6c, 0x7b, 0x61, 0x4e, 0x5a, 0x74, 0xb5, 0xbf, 0x3c, + 0x75, 0x9e, 0x68, 0x9a, 0x9e, 0xc3, 0xb4, 0xb4, 0x75, 0x94, 0xa8, 0x47, + 0x40, 0xa9, 0x4c, 0x98, 0x61, 0x96, 0x5d, 0xb4, 0xe5, 0x8e, 0x59, 0xab, + 0x54, 0x9b, 0x43, 0xae, 0x87, 0x40, 0x4a, 0x58, 0x53, 0x6f, 0x44, 0xbe, + 0xa4, 0x89, 0x4e, 0x4b, 0x78, 0xae, 0x62, 0x67, 0x80, 0x9f, 0x5b, 0x92, + 0xa7, 0x43, 0xa1, 0x75, 0x7f, 0x42, 0x3f, 0x96, 0xb2, 0x4d, 0x5e, 0x8e, + 0x7a, 0x75, 0xa0, 0xbd, 0x8d, 0x57, 0xb3, 0x67, 0xaa, 0x5f, 0x8a, 0x7c, + 0xae, 0xb4, 0xa4, 0x4a, 0x67, 0x89, 0xa1, 0xa4, 0xc2, 0x5c, 0x8c, 0x94, + 0x5c, 0xc1, 0xa1, 0x7d, 0x96, 0x53, 0xd8, 0x7b, 0x4a, 0x91, 0x6b, 0x69, + 0xbc, 0xb8, 0xbb, 0x80, 0xb0, 0x62, 0x44, 0x90, 0xb7, 0xcf, 0xa6, 0x48, + 0xc4, 0x59, 0xb8, 0x79, 0x82, 0x89, 0x85, 0x9f, 0x60, 0xa0, 0xaf, 0x84, + 0x94, 0x5c, 0x7d, 0x78, 0x6f, 0xdc, 0x72, 0x7f, 0xc3, 0xa0, 0x5b, 0x71, + 0x80, 0xac, 0xa7, 0x8a, 0x88, 0x8f, 0x92, 0x5a, 0x5c, 0x92, 0x5c, 0xbf, + 0x51, 0xce, 0xbc, 0xa7, 0xa6, 0xd0, 0xc4, 0xab, 0x81, 0xbd, 0x95, 0x67, + 0x5d, 0x4d, 0x62, 0x70, 0x3c, 0x45, 0x94, 0x65, 0xb9, 0xc0, 0x57, 0x94, + 0xbc, 0xa9, 0x98, 0xa5, 0x7a, 0x66, 0x86, 0xba, 0x7b, 0x52, 0xc9, 0xbe, + 0xa0, 0x7b, 0x74, 0x45, 0x86, 0x60, 0x8e, 0x59, 0x68, 0x87, 0x3d, 0x7c, + 0xb1, 0x56, 0xb4, 0x88, 0x80, 0x55, 0xb6, 0x64, 0xab, 0xbd, 0x6f, 0x48, + 0x8c, 0xb3, 0x95, 0x89, 0x61, 0x6c, 0x89, 0x72, 0x91, 0x96, 0x73, 0x34, + 0x53, 0x86, 0x55, 0x67, 0x7f, 0x19, 0x25, 0x4e, 0x60, 0x3a, 0xa3, 0x86, + 0x89, 0x70, 0xa5, 0x9f, 0xbd, 0x91, 0x95, 0xc2, 0x6d, 0x5c, 0x9d, 0x6e, + 0x8b, 0x96, 0xba, 0xd1, 0x68, 0x7f, 0x9a, 0x84, 0xaf, 0x65, 0xc1, 0x94, + 0xb1, 0x44, 0x21, 0x63, 0x81, 0x72, 0xa7, 0xa0, 0x8f, 0xa5, 0x93, 0x3c, + 0x7a, 0xb9, 0x89, 0x90, 0xa3, 0x6b, 0x2e, 0x22, 0x96, 0x61, 0x17, 0x92, + 0x82, 0x7a, 0xa6, 0x5b, 0xc2, 0xb8, 0x99, 0xd2, 0xa0, 0xd1, 0x86, 0x5b, + 0x96, 0xa4, 0x69, 0x85, 0x82, 0x74, 0x6c, 0xac, 0x4e, 0x82, 0x6b, 0xdc, + 0xa6, 0xd1, 0x7d, 0xbe, 0xc1, 0xac, 0x4f, 0xb8, 0x47, 0x9e, 0x90, 0x9a, + 0x98, 0x96, 0xa6, 0x5f, 
0x34, 0xa9, 0x75, 0xa6, 0x72, 0x7c, 0x65, 0xb3, + 0x68, 0x56, 0x4e, 0xbc, 0x76, 0x93, 0xb6, 0xab, 0x5d, 0x4d, 0xc9, 0x9b, + 0x99, 0x7a, 0x4a, 0xa5, 0x9a, 0x55, 0x9b, 0x49, 0x56, 0x91, 0x6e, 0x7a, + 0x61, 0x87, 0xb8, 0x41, 0x8f, 0xbd, 0xac, 0x93, 0x44, 0x6d, 0x75, 0x9f, + 0x7d, 0x42, 0x81, 0x7f, 0x71, 0x34, 0x9f, 0x45, 0xb8, 0x7a, 0xa6, 0x86, + 0xb4, 0xab, 0x5f, 0xab, 0x54, 0x7d, 0x8a, 0x6c, 0x7f, 0xbc, 0xa4, 0x8a, + 0x5f, 0x9e, 0x77, 0xa1, 0xb5, 0x47, 0x66, 0xa6, 0x74, 0x93, 0x82, 0x6b, + 0x65, 0x4e, 0x6f, 0x66, 0x27, 0x70, 0x3b, 0x56, 0x7e, 0x43, 0xbe, 0xb1, + 0x78, 0xa2, 0x49, 0xa9, 0x6c, 0x76, 0x3a, 0x5c, 0x49, 0xbb, 0x6f, 0xa0, + 0x66, 0x52, 0x6f, 0x84, 0x76, 0x99, 0x6d, 0x4b, 0x44, 0x72, 0x57, 0x5a, + 0x86, 0xbc, 0xce, 0xb6, 0x76, 0x98, 0x8b, 0x48, 0x4e, 0x8e, 0x64, 0xa4, + 0x81, 0x92, 0x83, 0x5a, 0xc3, 0x6e, 0x5f, 0x62, 0x84, 0x4f, 0x72, 0x8a, + 0x56, 0x66, 0x66, 0x51, 0x7f, 0xb5, 0x8e, 0x60, 0x46, 0x40, 0x64, 0x97, + 0x95, 0x2d, 0x6b, 0xa2, 0xb9, 0x7f, 0x9b, 0xc3, 0xd1, 0x7d, 0x72, 0xb8, + 0x98, 0x60, 0xa2, 0x9d, 0x41, 0x87, 0x43, 0xc9, 0x93, 0xa9, 0x96, 0x69, + 0x80, 0x7d, 0x83, 0xae, 0xb9, 0xa1, 0x6f, 0x56, 0xbc, 0x69, 0x64, 0x6a, + 0x74, 0xa1, 0xa0, 0x4f, 0x93, 0x62, 0xc0, 0x5e, 0x38, 0x7a, 0x69, 0x6d, + 0x42, 0xba, 0x46, 0x3a, 0x5e, 0x75, 0x99, 0x8b, 0x63, 0xab, 0xb1, 0x92, + 0xa3, 0x8c, 0x8a, 0x99, 0x4f, 0xc3, 0x7d, 0x96, 0x9d, 0x3f, 0xb0, 0x6c, + 0x7e, 0x96, 0x90, 0x85, 0x43, 0x42, 0xa4, 0x9b, 0x64, 0x9e, 0x51, 0x51, + 0x3c, 0x97, 0x6f, 0x58, 0x5e, 0x7d, 0x42, 0x68, 0x31, 0x9b, 0x9f, 0xbe, + 0x72, 0x8b, 0x5a, 0x73, 0xb1, 0xb6, 0x79, 0xb8, 0x56, 0xc4, 0x66, 0xc4, + 0x9e, 0x48, 0x7b, 0x69, 0x7b, 0x54, 0x5d, 0x6b, 0x8d, 0x51, 0x7d, 0xa4, + 0x6a, 0x65, 0x9a, 0x48, 0x6b, 0x88, 0x69, 0x7b, 0x9a, 0xa5, 0xd1, 0x94, + 0xc4, 0xac, 0x5a, 0x8b, 0x8a, 0x4d, 0x82, 0x6f, 0x3f, 0x7c, 0x7b, 0x93, + 0x6c, 0xa5, 0x6c, 0x5a, 0xc5, 0x6f, 0x63, 0x63, 0x80, 0xb9, 0x5d, 0x40, + 0x4a, 0x3f, 0x53, 0x6f, 0x4d, 0x44, 0x9a, 0x49, 0xb0, 0xab, 0x73, 0x93, + 0x42, 0x8f, 0xaa, 0x38, 0x47, 0xa8, 0x99, 0x67, 0xad, 0x8a, 0x68, 0xac, + 0x81, 0x64, 0xa3, 0x7d, 0xc6, 0xbf, 0x77, 0x9d, 0xc2, 0x93, 0xcf, 0xc0, + 0x75, 0xb5, 0x80, 0x96, 0xa9, 0x9e, 0xa9, 0x6e, 0x85, 0x7a, 0xcb, 0x52, + 0x54, 0xb7, 0x6a, 0xb2, 0xad, 0x80, 0x72, 0x7c, 0xa2, 0xb1, 0x6c, 0x6f, + 0x6b, 0x99, 0x44, 0x7d, 0x7d, 0x93, 0x7f, 0xc5, 0xb3, 0xb3, 0x90, 0xa3, + 0x6a, 0x6b, 0x8b, 0x71, 0x66, 0x90, 0x4b, 0xa9, 0x69, 0xa6, 0x93, 0xae, + 0x4a, 0x90, 0x44, 0x95, 0xd2, 0xd2, 0x61, 0x42, 0x7c, 0x68, 0xa2, 0x2e, + 0xa9, 0x8f, 0x4a, 0x2b, 0x69, 0x7a, 0x63, 0x7a, 0x11, 0x68, 0x63, 0x4d, + 0x88, 0x2c, 0x78, 0x5a, 0x53, 0x42, 0x69, 0x7b, 0x60, 0x6d, 0x55, 0x90, + 0x80, 0x72, 0x57, 0xb9, 0x8d, 0x61, 0xa4, 0x6f, 0x84, 0x5e, 0xaa, 0xab, + 0x43, 0x9b, 0x89, 0xb2, 0x4f, 0x67, 0x7c, 0xde, 0xad, 0xa4, 0x53, 0xd9, + 0xc0, 0x4e, 0x39, 0x48, 0xb5, 0x6e, 0x4c, 0x46, 0xb8, 0x57, 0x82, 0x19, + 0x89, 0xc9, 0x38, 0x4d, 0x38, 0xa7, 0xaf, 0x5d, 0x49, 0xa4, 0x5a, 0x74, + 0x88, 0x5c, 0x7c, 0x44, 0x5f, 0xa4, 0x66, 0xa3, 0x97, 0x62, 0x4c, 0x5b, + 0x78, 0x41, 0x6d, 0x7b, 0x9a, 0x4d, 0x8f, 0x43, 0x5c, 0x6e, 0x65, 0xa4, + 0x52, 0x8c, 0x75, 0x66, 0x7d, 0x83, 0xbc, 0x47, 0x4a, 0x7d, 0x56, 0x5a, + 0x1c, 0x4e, 0xb3, 0x65, 0x8f, 0x2d, 0x21, 0x5c, 0xab, 0xba, 0x52, 0x70, + 0xb4, 0x67, 0x75, 0x9e, 0x98, 0x6b, 0x69, 0x36, 0xba, 0xa7, 0xa6, 0xa2, + 0x9d, 0xd1, 0x48, 0x88, 0x96, 0xb1, 0x4f, 0x54, 0x75, 0xaa, 0x4c, 0xa3, + 0x44, 0x39, 0x51, 0x71, 0x91, 0x3f, 0x8b, 0x76, 0x9a, 0x7c, 0xb6, 0x8d, + 0x5a, 0x5d, 0x58, 0x7a, 0x7b, 0x8e, 0x51, 0x69, 0x53, 0x42, 0x78, 0x62, + 0x84, 0xc1, 0x9e, 0x99, 
0x9d, 0x59, 0x96, 0x26, 0x45, 0xb1, 0x4c, 0x78, + 0x69, 0x97, 0x36, 0xa4, 0x62, 0xa3, 0x5d, 0x4b, 0x8d, 0x66, 0x3f, 0x77, + 0xc4, 0x95, 0x94, 0x5d, 0x8b, 0x68, 0x91, 0x6c, 0x9d, 0x87, 0x5f, 0x65, + 0x54, 0x3a, 0x93, 0xa5, 0x84, 0x71, 0x74, 0xb7, 0x94, 0x42, 0x8f, 0xa7, + 0x93, 0x81, 0x7e, 0x69, 0x64, 0x2c, 0x8a, 0x96, 0x51, 0x54, 0x66, 0x80, + 0x8f, 0x63, 0x36, 0x9c, 0xae, 0x7b, 0xa3, 0x79, 0xd8, 0xb6, 0x6f, 0x30, + 0x69, 0xd2, 0x76, 0xb2, 0x29, 0x36, 0x56, 0x9c, 0x83, 0x62, 0xc1, 0x8a, + 0x78, 0x3a, 0x9a, 0xcd, 0x80, 0x5a, 0x90, 0x48, 0x99, 0x8c, 0x72, 0xb5, + 0x53, 0x5b, 0xac, 0xbb, 0xae, 0xaf, 0x7d, 0x83, 0xaf, 0xc2, 0xb9, 0x8a, + 0x61, 0x65, 0x68, 0xce, 0x2c, 0x39, 0x76, 0xb1, 0x88, 0x51, 0x8e, 0x6a, + 0x9c, 0x45, 0x4d, 0x65, 0xa0, 0xc6, 0xb7, 0x63, 0x8a, 0xae, 0x6a, 0x6f, + 0x61, 0xae, 0xa8, 0x42, 0x1f, 0x57, 0xa9, 0x52, 0x4f, 0x62, 0xac, 0x64, + 0x4d, 0x62, 0x75, 0xd0, 0x72, 0x4b, 0x46, 0x4d, 0xc4, 0xb5, 0x95, 0x5f, + 0xd3, 0xde, 0x75, 0x82, 0x91, 0xdc, 0x95, 0x4b, 0xa4, 0x7f, 0xc7, 0x9a, + 0x65, 0x40, 0x5e, 0xbb, 0x8f, 0x59, 0x57, 0x93, 0x65, 0x4b, 0x4f, 0x8d, + 0x68, 0xc8, 0x4a, 0x31, 0x7c, 0xe4, 0x86, 0x5b, 0x35, 0xc8, 0x89, 0xb6, + 0x8b, 0xb3, 0xb6, 0xbe, 0x85, 0x6d, 0x74, 0x5e, 0x67, 0x57, 0x91, 0xae, + 0x7f, 0x3a, 0xa9, 0x5a, 0x81, 0x65, 0x8b, 0xa5, 0x71, 0x81, 0x86, 0x83, + 0x28, 0x7f, 0x8f, 0x8c, 0x29, 0x51, 0x69, 0xad, 0xa3, 0x4f, 0xc0, 0x96, + 0xc9, 0xbe, 0x96, 0x43, 0x55, 0x55, 0xa9, 0x8d, 0x92, 0x4c, 0x9b, 0x8f, + 0x52, 0x71, 0xc4, 0x4e, 0x35, 0x23, 0x71, 0x4e, 0xa2, 0xb9, 0xd8, 0xbd, + 0xa9, 0xa3, 0x7c, 0xb8, 0x80, 0x46, 0xbe, 0xb7, 0x35, 0x27, 0x43, 0xab, + 0x4e, 0x30, 0x6a, 0x92, 0xc0, 0x64, 0x67, 0x64, 0x8c, 0x7b, 0x82, 0x9e, + 0xa9, 0x65, 0x81, 0x7b, 0x86, 0x6d, 0x52, 0x5e, 0x9c, 0x9e, 0x3f, 0x60, + 0xad, 0x54, 0x8d, 0x84, 0x59, 0xb2, 0x51, 0x72, 0xa5, 0xa5, 0x85, 0x6a, + 0x54, 0x85, 0x47, 0x5c, 0x39, 0x8a, 0x76, 0x91, 0x3f, 0xa7, 0x6c, 0x4e, + 0x79, 0x71, 0xb8, 0xa9, 0x87, 0x62, 0x67, 0xbe, 0xad, 0x6b, 0xb4, 0xb1, + 0x60, 0x4b, 0x4c, 0x49, 0x8d, 0x4e, 0x14, 0x86, 0x71, 0xb4, 0x52, 0x55, + 0x90, 0x95, 0xbd, 0x25, 0x43, 0x41, 0x7f, 0x6d, 0xb2, 0x92, 0x37, 0x6b, + 0x5b, 0x68, 0xc9, 0x68, 0x5f, 0x64, 0x7d, 0x87, 0x9f, 0x4e, 0x8a, 0x89, + 0x70, 0xa9, 0x88, 0x98, 0x72, 0x8a, 0x58, 0x85, 0x6a, 0x96, 0x95, 0x50, + 0x97, 0x49, 0x81, 0x78, 0xad, 0x5d, 0x7c, 0x61, 0x3c, 0x52, 0x5a, 0x5f, + 0x62, 0x80, 0x52, 0x50, 0x64, 0x3f, 0x60, 0xa9, 0x9d, 0x9f, 0x3a, 0x7d, + 0x75, 0x38, 0x48, 0x39, 0x89, 0x39, 0x78, 0xa5, 0xa9, 0x9a, 0x92, 0x4b, + 0x3f, 0x61, 0x54, 0x67, 0xa0, 0x7a, 0x74, 0x74, 0x5c, 0x63, 0x57, 0xa1, + 0xb0, 0xbc, 0x93, 0x8b, 0xaf, 0x4e, 0x79, 0xae, 0xc1, 0xb3, 0x79, 0x2f, + 0xa7, 0xa5, 0xa8, 0x27, 0x4d, 0xa1, 0x68, 0xa5, 0x5c, 0xb2, 0x43, 0x84, + 0xa0, 0x51, 0x8e, 0x51, 0x54, 0xb5, 0xa4, 0x5c, 0x89, 0x8e, 0x88, 0x59, + 0x4a, 0xa2, 0x8a, 0xa8, 0xcd, 0x98, 0x99, 0x62, 0xb3, 0x66, 0xa5, 0x82, + 0x45, 0x86, 0x7c, 0x5d, 0x7b, 0x44, 0x4a, 0xb5, 0x76, 0x55, 0x47, 0x6b, + 0x84, 0x52, 0x80, 0x55, 0x85, 0xba, 0x5b, 0x76, 0x44, 0xa1, 0xab, 0x27, + 0x56, 0xa3, 0x6e, 0xa5, 0x5d, 0xad, 0x7f, 0x94, 0xaf, 0x7c, 0x8d, 0x99, + 0x87, 0x67, 0x7d, 0xa0, 0x52, 0x64, 0x7d, 0x49, 0x41, 0x9a, 0x98, 0x61, + 0x94, 0x81, 0xa8, 0xb2, 0x7c, 0xb0, 0x50, 0x57, 0xc4, 0xaf, 0x50, 0x3d, + 0x5c, 0x88, 0x44, 0x3a, 0x7d, 0x5e, 0x94, 0x50, 0x64, 0xae, 0x71, 0x7b, + 0x55, 0x69, 0xb8, 0x8f, 0x3f, 0x3e, 0xb3, 0x6d, 0x6a, 0x68, 0x7f, 0x4a, + 0x40, 0xa9, 0xb1, 0x3d, 0x2c, 0x88, 0x4e, 0x52, 0x3e, 0x41, 0x63, 0x94, + 0x5c, 0x6b, 0x3f, 0x60, 0x57, 0x9a, 0x71, 0x6a, 0x5f, 0xa6, 0x3a, 0x24, + 0x37, 0x97, 0x88, 0x69, 
0x5f, 0x99, 0xa1, 0x8f, 0xa3, 0x63, 0xb2, 0xa3, + 0xac, 0x48, 0xa7, 0x57, 0x5e, 0x7b, 0xb4, 0xb2, 0x88, 0x8a, 0x68, 0x4e, + 0xa9, 0x6d, 0x54, 0x56, 0x59, 0x5d, 0x88, 0x78, 0xb2, 0xc2, 0x80, 0x7e, + 0x2c, 0x3a, 0xc4, 0x8e, 0x85, 0x7d, 0x51, 0x73, 0x72, 0xa8, 0x8c, 0xb8, + 0x53, 0x90, 0x6f, 0x77, 0xa3, 0x7b, 0xa4, 0x6e, 0x8b, 0x65, 0x5c, 0x97, + 0xa0, 0x7a, 0x9a, 0x40, 0x9e, 0x33, 0x40, 0xa5, 0xa9, 0xa1, 0x71, 0x5a, + 0x67, 0x95, 0x42, 0xbd, 0x77, 0x7f, 0x6b, 0x59, 0xab, 0xaa, 0x35, 0x4d, + 0xbb, 0x8f, 0x51, 0x61, 0xb6, 0x71, 0xab, 0x94, 0xc0, 0x56, 0x5b, 0x9c, + 0xac, 0x63, 0x68, 0x6e, 0xaa, 0xae, 0x9f, 0xbc, 0x86, 0x97, 0x4b, 0x83, + 0xd0, 0xc0, 0xc1, 0xa4, 0x96, 0xc3, 0x81, 0x44, 0xa2, 0x94, 0xc5, 0x64, + 0x61, 0xc7, 0x75, 0x9a, 0x7d, 0x3e, 0x5b, 0x73, 0xb5, 0xaa, 0x80, 0x87, + 0xa2, 0x5e, 0x60, 0x7e, 0x47, 0x6c, 0x7a, 0x5b, 0x8b, 0x90, 0x95, 0xa4, + 0x93, 0x82, 0x69, 0xbf, 0x7f, 0xa7, 0xa1, 0xc8, 0x4b, 0x42, 0x7f, 0x7b, + 0xb6, 0xa7, 0x70, 0x45, 0x94, 0x90, 0xb4, 0x32, 0xa3, 0x52, 0xaf, 0xb0, + 0x89, 0xad, 0xb8, 0x90, 0xaa, 0xcd, 0xac, 0x8b, 0x4d, 0xa0, 0xb5, 0x97, + 0x42, 0x6b, 0x66, 0x60, 0x78, 0xa1, 0x5d, 0x88, 0x4c, 0x4d, 0xbb, 0x54, + 0x43, 0x6b, 0x7b, 0x7a, 0x87, 0x57, 0x87, 0x43, 0x9e, 0x45, 0x86, 0x92, + 0x9f, 0x79, 0x44, 0x79, 0xb7, 0xab, 0x94, 0xb2, 0x86, 0x3e, 0xaf, 0x6f, + 0x81, 0x5d, 0x69, 0x2b, 0xb4, 0x76, 0x77, 0xb7, 0xb7, 0x99, 0x4c, 0x7b, + 0x6e, 0xb1, 0xb6, 0x9e, 0xa9, 0x44, 0x91, 0x64, 0x4b, 0x62, 0x3d, 0x56, + 0x5f, 0x98, 0x96, 0x8e, 0x9c, 0x8b, 0x67, 0x58, 0x86, 0xf9, 0xff, 0xff, + 0x77, 0xf5, 0xff, 0xff, 0x6d, 0x3, 0, 0, 0xa5, 0xf2, 0xff, 0xff, + 0xae, 0xfa, 0xff, 0xff, 0x91, 0xfc, 0xff, 0xff, 0x75, 0xfb, 0xff, 0xff, + 0x4f, 0xf3, 0xff, 0xff, 0xbe, 0, 0, 0, 0xe6, 0xe9, 0xff, 0xff, + 0x7e, 0xf5, 0xff, 0xff, 0x21, 0xfa, 0xff, 0xff, 0x4d, 0xf6, 0xff, 0xff, + 0x41, 0x1, 0, 0, 0xc2, 0xf9, 0xff, 0xff, 0x8, 0xfc, 0xff, 0xff, + 0x1a, 0xf9, 0xff, 0xff, 0x35, 0xfd, 0xff, 0xff, 0x82, 0xff, 0xff, 0xff, + 0xf6, 0xf3, 0xff, 0xff, 0x12, 0xeb, 0xff, 0xff, 0xd3, 0xff, 0xff, 0xff, + 0x87, 0xf7, 0xff, 0xff, 0xa0, 0xf9, 0xff, 0xff, 0x53, 0xf8, 0xff, 0xff, + 0xe, 0xfe, 0xff, 0xff, 0xc5, 0xfb, 0xff, 0xff, 0x62, 0xf6, 0xff, 0xff, + 0x9f, 0xf4, 0xff, 0xff, 0x97, 0xfc, 0xff, 0xff, 0xa4, 0xf1, 0xff, 0xff, + 0xb8, 0xf1, 0xff, 0xff, 0xb5, 0xf0, 0xff, 0xff, 0x97, 0xfc, 0xff, 0xff, + 0x3c, 0xff, 0xff, 0xff, 0x8b, 0xf5, 0xff, 0xff, 0xdb, 0xfc, 0xff, 0xff, + 0xde, 0x7, 0, 0, 0xa5, 0x2, 0, 0, 0x7a, 0xf9, 0xff, 0xff, + 0xc4, 0xf2, 0xff, 0xff, 0xf, 0xf5, 0xff, 0xff, 0x54, 0xf4, 0xff, 0xff, + 0x53, 0xf7, 0xff, 0xff, 0xf, 0xf7, 0xff, 0xff, 0x65, 0xfb, 0xff, 0xff, + 0xa0, 0xf0, 0xff, 0xff, 0x40, 0xf1, 0xff, 0xff, 0x95, 0xf6, 0xff, 0xff, + 0x9d, 0x2, 0, 0, 0x42, 0x9c, 0x34, 0x69, 0xcf, 0x4d, 0x9d, 0xb7, + 0x62, 0x76, 0xc3, 0x6c, 0xaa, 0x91, 0x98, 0xb3, 0x3a, 0x74, 0x8d, 0xcb, + 0xbf, 0x9a, 0x7c, 0xd6, 0x39, 0xa3, 0x7a, 0xa2, 0x85, 0x99, 0x78, 0x41, + 0xd0, 0x6c, 0x85, 0x83, 0xc4, 0xab, 0xb3, 0xb7, 0x72, 0x6c, 0x47, 0x76, + 0x9d, 0xba, 0xb9, 0x70, 0x58, 0x3e, 0x6d, 0x36, 0x47, 0x5c, 0x76, 0x73, + 0x60, 0xb8, 0x9b, 0x7c, 0x70, 0xb4, 0xbd, 0xbd, 0xa7, 0xad, 0x48, 0x96, + 0x71, 0x78, 0xc2, 0xa1, 0xc0, 0xba, 0x60, 0xbe, 0x5c, 0x2c, 0x33, 0x77, + 0x79, 0x59, 0x34, 0x42, 0x70, 0x7e, 0x8c, 0x4a, 0xb9, 0xb7, 0x92, 0xa5, + 0xa2, 0x81, 0x74, 0xc2, 0x32, 0xa1, 0xa8, 0x5d, 0x99, 0xbe, 0xbb, 0x4c, + 0x48, 0x66, 0x77, 0xb3, 0xab, 0xa4, 0xd1, 0xb2, 0x9b, 0x99, 0x49, 0xa1, + 0x3f, 0x37, 0x9a, 0xb0, 0x65, 0x69, 0xc9, 0x42, 0x3a, 0xa7, 0xc0, 0xca, + 0xa4, 0x4e, 0x5b, 0x65, 0x73, 0x81, 0xb8, 0x5a, 0x44, 0x3b, 0x5a, 0xd6, + 
0xa6, 0xdf, 0x9f, 0x9f, 0x84, 0xb3, 0x9e, 0x42, 0x32, 0xb3, 0x66, 0x58, + 0xa7, 0x9d, 0x5a, 0x4b, 0x5d, 0x35, 0x35, 0x30, 0x65, 0x6b, 0x42, 0x61, + 0xc3, 0x9b, 0x42, 0x7d, 0x49, 0xa1, 0xbb, 0x75, 0x8f, 0x36, 0x61, 0x9a, + 0x3e, 0x90, 0x69, 0xb7, 0x4e, 0xa7, 0xd0, 0x79, 0xb3, 0xc4, 0x4f, 0x3b, + 0xcc, 0x2b, 0x6a, 0x8e, 0xac, 0xb1, 0x51, 0xa5, 0x56, 0x2f, 0x7e, 0x48, + 0x36, 0x7a, 0xce, 0xb4, 0xc0, 0xc1, 0xa0, 0x40, 0xbc, 0x92, 0x80, 0x6c, + 0x6b, 0xb0, 0x3c, 0x69, 0x88, 0x6a, 0x4c, 0x5c, 0xc0, 0x86, 0xae, 0x60, + 0x66, 0x6a, 0xa3, 0x38, 0x96, 0x55, 0x48, 0xca, 0x4c, 0x86, 0x69, 0xb7, + 0x9b, 0x3b, 0x49, 0x44, 0xce, 0x8a, 0x5b, 0x6b, 0x5e, 0x71, 0x8d, 0x5a, + 0x9a, 0xb0, 0x7c, 0xbe, 0xa1, 0x57, 0x9c, 0x67, 0x7d, 0x85, 0x58, 0x5c, + 0x68, 0x5c, 0x9f, 0x77, 0x56, 0xa2, 0xcc, 0xbc, 0xac, 0xa9, 0xab, 0x5a, + 0xb4, 0x80, 0xb0, 0xa7, 0x34, 0x82, 0x56, 0x55, 0x5a, 0x6b, 0xc2, 0x77, + 0x8f, 0x99, 0x79, 0x4a, 0x63, 0xa0, 0x86, 0xc7, 0x5a, 0x56, 0x7d, 0xcb, + 0xa0, 0x3b, 0x59, 0xad, 0x8f, 0x38, 0xb6, 0x64, 0xc4, 0x6f, 0x6d, 0x62, + 0xd2, 0x74, 0xb7, 0xaa, 0x5a, 0x8d, 0x9f, 0x9f, 0x51, 0x3c, 0x39, 0x7e, + 0x40, 0x8a, 0x51, 0xd9, 0x3a, 0x74, 0xd6, 0x44, 0xb9, 0x97, 0x76, 0xb7, + 0x52, 0x63, 0xba, 0x27, 0x54, 0x3d, 0x9b, 0x59, 0xa2, 0x91, 0x5c, 0xbf, + 0xc3, 0x6e, 0xc6, 0xa6, 0x49, 0xa0, 0xa7, 0x2f, 0x35, 0x72, 0x53, 0xb0, + 0x7a, 0xc8, 0xa9, 0x7f, 0xb6, 0x5d, 0xac, 0x8c, 0xd8, 0x85, 0x80, 0x3e, + 0xd3, 0x43, 0x7f, 0xa3, 0x97, 0x57, 0x60, 0x91, 0x94, 0x4b, 0x69, 0x60, + 0x70, 0xcf, 0xbf, 0xac, 0x5b, 0x3e, 0x55, 0x90, 0xc9, 0x5f, 0x72, 0x5e, + 0x78, 0xc5, 0x83, 0x91, 0x56, 0xc6, 0x79, 0x53, 0x65, 0x9f, 0x8f, 0x6e, + 0xd8, 0xb9, 0xbf, 0x94, 0x8e, 0x9d, 0xa8, 0x3d, 0xbd, 0xb3, 0xb2, 0x4b, + 0x90, 0x97, 0x4d, 0x32, 0xb8, 0x8f, 0x46, 0xbb, 0x33, 0xbc, 0xa5, 0x65, + 0x84, 0x4b, 0xc3, 0x9b, 0x58, 0x4c, 0x43, 0x9c, 0x9b, 0x6f, 0xb2, 0x47, + 0xdb, 0x85, 0xd4, 0x88, 0x64, 0x59, 0x48, 0xad, 0x6c, 0x5c, 0x64, 0xb3, + 0x8d, 0x80, 0x5f, 0x3c, 0x58, 0xc6, 0x7c, 0x66, 0xbc, 0xa7, 0xc4, 0xcb, + 0xa4, 0x87, 0x95, 0xba, 0x8f, 0xa5, 0xab, 0x84, 0x47, 0xc6, 0xbd, 0xc7, + 0x9a, 0x67, 0xa9, 0x7f, 0xd3, 0xa8, 0x6e, 0xc7, 0x6d, 0x37, 0x97, 0x2d, + 0x78, 0xbf, 0x6f, 0x4f, 0xa8, 0xc4, 0x41, 0x8f, 0x5f, 0xc1, 0xc5, 0xb4, + 0xbb, 0x7c, 0x78, 0x8a, 0x70, 0xb0, 0x5a, 0x87, 0x53, 0x9d, 0x32, 0x9f, + 0x7b, 0x5e, 0xaf, 0xbd, 0xca, 0x4c, 0x5a, 0xae, 0xa3, 0x2f, 0x59, 0x6f, + 0xbb, 0xa6, 0xa8, 0xbd, 0x79, 0xd5, 0x83, 0x4a, 0x93, 0xc9, 0x69, 0x92, + 0x36, 0xcf, 0x8e, 0x9f, 0x57, 0x82, 0x6d, 0x75, 0x9c, 0xc0, 0x93, 0xcb, + 0x45, 0x36, 0x5b, 0x52, 0x69, 0xa5, 0x88, 0xce, 0xc2, 0xb0, 0xb6, 0x63, + 0x3b, 0x9f, 0xae, 0x54, 0x38, 0x79, 0x74, 0x89, 0x44, 0x83, 0x9e, 0xa3, + 0xcf, 0xca, 0x6e, 0xb9, 0x64, 0xaf, 0x5b, 0x64, 0xbe, 0xb8, 0x58, 0x36, + 0xbd, 0x94, 0x9e, 0x8d, 0xb7, 0xd0, 0xb4, 0x4f, 0x7e, 0x3d, 0x8b, 0xb8, + 0xb5, 0x89, 0x36, 0x38, 0xcf, 0xe5, 0x6b, 0xe4, 0x87, 0x4b, 0x63, 0xae, + 0x73, 0x8f, 0xb6, 0x9f, 0x2e, 0x98, 0x51, 0x4d, 0x71, 0x66, 0x71, 0xc7, + 0x92, 0x83, 0xc8, 0xb5, 0x60, 0x91, 0x33, 0x70, 0xa8, 0x37, 0xab, 0x94, + 0x55, 0xbf, 0x76, 0xb6, 0x6f, 0x57, 0x62, 0x71, 0x44, 0x93, 0x4e, 0x90, + 0xb8, 0x38, 0x59, 0x9c, 0x96, 0x4b, 0xbf, 0xc1, 0x66, 0x82, 0x87, 0x32, + 0x78, 0xb2, 0x70, 0x8e, 0x7c, 0x87, 0xa7, 0x91, 0xbd, 0x73, 0x92, 0xc2, + 0x8f, 0xd3, 0x97, 0x4e, 0x37, 0xb2, 0x4f, 0x7a, 0xa4, 0x48, 0x67, 0xc7, + 0xa1, 0x4b, 0x7b, 0xaa, 0x43, 0x41, 0xc5, 0x42, 0x8d, 0x91, 0x81, 0x99, + 0x61, 0x29, 0xa8, 0x67, 0xb4, 0x8e, 0x8d, 0x94, 0xa3, 0x33, 0x7b, 0x57, + 0xbd, 0xad, 0x37, 0x5f, 0xb4, 0x8c, 0x3d, 0xc7, 0x7e, 0xbf, 0x5e, 0x3d, + 
0xac, 0x5d, 0x9c, 0x44, 0x94, 0x96, 0x6b, 0xc6, 0x86, 0x46, 0x7a, 0xc6, + 0x60, 0xc9, 0xa1, 0x5f, 0x89, 0xa5, 0xd8, 0xc8, 0xc2, 0xae, 0x84, 0x65, + 0xc3, 0xb5, 0xca, 0x94, 0x7c, 0x42, 0xa1, 0x51, 0x5f, 0x4a, 0xbd, 0x84, + 0x3b, 0x9c, 0xb5, 0xa1, 0x70, 0x60, 0xb7, 0xb8, 0x61, 0x9b, 0x73, 0xc0, + 0xae, 0x98, 0xd1, 0x5b, 0xb4, 0x85, 0xc9, 0x71, 0x71, 0x4a, 0xb7, 0xc2, + 0xb7, 0x9f, 0x6b, 0x4b, 0x59, 0xcc, 0x7d, 0xab, 0x6c, 0xaa, 0xc4, 0xc7, + 0x4f, 0x5d, 0xc6, 0xcd, 0x9f, 0x7f, 0x51, 0x72, 0x4b, 0xab, 0x46, 0x96, + 0x7f, 0x4b, 0x95, 0xb0, 0x66, 0x68, 0x58, 0x3c, 0x82, 0x36, 0xb2, 0xbb, + 0xc9, 0xb9, 0x32, 0xcf, 0x7d, 0x91, 0x99, 0xb3, 0xa7, 0x99, 0xcd, 0x69, + 0x45, 0x45, 0xcd, 0xcf, 0x38, 0x37, 0x31, 0x6e, 0x84, 0x92, 0xc6, 0x63, + 0xc8, 0xad, 0x6d, 0xca, 0x52, 0x38, 0xba, 0x4f, 0x77, 0x70, 0x86, 0x92, + 0x34, 0xc8, 0x6a, 0xc1, 0x84, 0x46, 0x44, 0x5d, 0x7a, 0x82, 0xc8, 0xc2, + 0x69, 0x7f, 0xa0, 0x87, 0xc0, 0x72, 0x6f, 0x35, 0x6f, 0x81, 0x4c, 0xc6, + 0x73, 0x89, 0xcf, 0x46, 0x71, 0x68, 0x6e, 0x6f, 0xbf, 0x63, 0xbb, 0x3a, + 0xc8, 0x44, 0xbd, 0xa2, 0x5b, 0x56, 0x33, 0xd3, 0x91, 0x73, 0x94, 0x59, + 0x63, 0x34, 0x48, 0x49, 0xc9, 0x95, 0x57, 0xc7, 0x99, 0x4e, 0x43, 0x6c, + 0x6b, 0xd0, 0x7a, 0x95, 0x7c, 0x75, 0x56, 0x39, 0xba, 0x37, 0xc8, 0x93, + 0x77, 0xbd, 0xb2, 0xc1, 0xad, 0x6e, 0x98, 0xaf, 0x40, 0x79, 0x8f, 0xbc, + 0x30, 0x6e, 0x4e, 0xcb, 0x59, 0x32, 0x56, 0xca, 0x42, 0x62, 0x49, 0xc1, + 0x52, 0xbf, 0x94, 0x79, 0x8f, 0x4a, 0xba, 0xb6, 0x30, 0x4d, 0x58, 0x3c, + 0x5c, 0xac, 0x68, 0x9b, 0x40, 0xc3, 0x62, 0x49, 0xc1, 0xc6, 0x9d, 0x34, + 0x3f, 0x64, 0x68, 0x4e, 0x6d, 0xb7, 0x5d, 0x5a, 0x5a, 0x66, 0x53, 0x9f, + 0xcf, 0x81, 0xd0, 0x44, 0x8b, 0x5b, 0xb6, 0x60, 0x9f, 0x89, 0x4b, 0xc3, + 0x89, 0xcf, 0xba, 0x8c, 0x6f, 0xc6, 0x38, 0x53, 0xc3, 0xcd, 0x7a, 0x3b, + 0x9a, 0xa9, 0x56, 0xae, 0x96, 0x82, 0x80, 0x8b, 0xab, 0xa3, 0x6c, 0x38, + 0xa7, 0x86, 0x59, 0xb4, 0x6c, 0x56, 0xc0, 0x4a, 0x5b, 0xad, 0x8d, 0xb5, + 0x7a, 0x53, 0x37, 0x87, 0x51, 0x4c, 0x36, 0x30, 0x7e, 0xb5, 0xc7, 0x79, + 0x74, 0x53, 0x8c, 0x7a, 0xae, 0xad, 0x4d, 0x81, 0x9b, 0x9a, 0xb9, 0x5b, + 0x46, 0xc3, 0x8a, 0xba, 0x93, 0x70, 0x4a, 0x7e, 0x43, 0x5a, 0x4d, 0x44, + 0x42, 0x79, 0x5d, 0xb0, 0x60, 0xae, 0x9f, 0x9a, 0xce, 0xcd, 0x90, 0x67, + 0x44, 0xc4, 0x84, 0x52, 0x7a, 0x58, 0x3d, 0x4e, 0x61, 0x46, 0xc0, 0xbb, + 0x99, 0x71, 0x8c, 0x67, 0x51, 0x95, 0x78, 0x97, 0xac, 0x41, 0xa8, 0x7b, + 0x6c, 0x4e, 0xc7, 0x53, 0x39, 0x69, 0x82, 0x37, 0x39, 0xc0, 0x33, 0x59, + 0x62, 0x9e, 0xab, 0x55, 0x6d, 0x5d, 0x3a, 0x79, 0x96, 0x5e, 0x3e, 0x89, + 0x96, 0xb9, 0x5e, 0x89, 0x94, 0x36, 0xce, 0x98, 0x87, 0x84, 0x6e, 0x5a, + 0x9d, 0x3a, 0xd0, 0x3c, 0xbe, 0x4c, 0x41, 0x64, 0xba, 0x92, 0x88, 0xcf, + 0x69, 0x66, 0x6f, 0x48, 0x34, 0xc2, 0xcd, 0x70, 0xcf, 0xce, 0xb8, 0xb1, + 0x3a, 0xca, 0x66, 0x49, 0x5f, 0x9d, 0xa2, 0xa6, 0xb9, 0xba, 0x46, 0x84, + 0x36, 0x6d, 0xb1, 0x6b, 0x92, 0x48, 0x96, 0x4a, 0x70, 0x8f, 0xc3, 0x53, + 0xbc, 0xd0, 0xc0, 0xb4, 0x70, 0xa4, 0xb5, 0x35, 0x83, 0x41, 0x37, 0xc5, + 0x9a, 0xae, 0xbb, 0x3d, 0x61, 0xd1, 0x95, 0x86, 0xcd, 0xae, 0x98, 0x90, + 0xc9, 0xa5, 0x9f, 0xc3, 0x3b, 0x75, 0x99, 0x82, 0x83, 0xa8, 0x61, 0x7f, + 0x48, 0x9a, 0x77, 0xa4, 0x88, 0x44, 0xab, 0x92, 0xd2, 0x52, 0x72, 0x4f, + 0x73, 0x95, 0x59, 0x4a, 0x60, 0xc9, 0x8a, 0xa9, 0xb4, 0xa4, 0xc2, 0x3b, + 0x3e, 0x8d, 0xc1, 0x3d, 0x4e, 0xb9, 0xd2, 0xa2, 0xc1, 0x96, 0x92, 0x74, + 0x6d, 0x69, 0x56, 0xc3, 0x68, 0x96, 0x43, 0x96, 0x44, 0x7d, 0x5c, 0x80, + 0xcb, 0x5e, 0x33, 0x6b, 0x7b, 0xbc, 0x47, 0x90, 0x8d, 0xac, 0x94, 0x4c, + 0x36, 0x4c, 0x5e, 0xd1, 0xcd, 0xa6, 0x7b, 0x6a, 0xcc, 0x95, 0x53, 0x73, + 
0x87, 0x7d, 0x7e, 0x69, 0x31, 0x47, 0xd0, 0xba, 0x68, 0x8f, 0x82, 0x6f, + 0xc1, 0xc5, 0x91, 0x71, 0x3e, 0xb1, 0x9f, 0x80, 0xc2, 0x6a, 0xd1, 0xbb, + 0x8c, 0x93, 0x31, 0x42, 0x3d, 0x6a, 0x7a, 0x51, 0x8a, 0x4a, 0x66, 0x53, + 0x46, 0xbd, 0x46, 0xa4, 0x60, 0x4f, 0xb7, 0x40, 0xbb, 0x73, 0x89, 0x67, + 0xc1, 0x92, 0x89, 0x4d, 0xb9, 0xbf, 0x72, 0xbe, 0x63, 0x9a, 0x66, 0x88, + 0x9c, 0xbf, 0xc4, 0x9e, 0x62, 0x70, 0x8f, 0x45, 0xb7, 0x85, 0xa8, 0xc2, + 0xae, 0x34, 0xa4, 0x39, 0x6d, 0x58, 0x71, 0xb3, 0x3c, 0x63, 0x4a, 0x8b, + 0x8e, 0x7c, 0xce, 0x45, 0x9f, 0xc6, 0xcf, 0xb0, 0x60, 0xa6, 0x5b, 0xb1, + 0x61, 0x4d, 0x4c, 0x8f, 0x7f, 0xb2, 0x48, 0x4b, 0x77, 0xb3, 0x46, 0xc2, + 0x7c, 0x60, 0xc9, 0x6c, 0x8d, 0x40, 0xa0, 0x64, 0xa4, 0xc7, 0x93, 0x3c, + 0xa8, 0xa8, 0x45, 0xa2, 0x53, 0xb8, 0x5e, 0xae, 0xbe, 0xb2, 0x77, 0x6c, + 0xb4, 0x80, 0x74, 0x6c, 0xc0, 0x33, 0x34, 0xad, 0x5e, 0xaf, 0x52, 0x47, + 0x3d, 0x60, 0xd1, 0x37, 0x86, 0x8a, 0x5f, 0xae, 0x43, 0x3a, 0x59, 0x3a, + 0x40, 0x48, 0x9f, 0xcd, 0x9b, 0xd0, 0x75, 0xc9, 0xc4, 0x34, 0x84, 0x46, + 0x30, 0x85, 0x6b, 0x9f, 0xb1, 0xb8, 0xb2, 0x38, 0x5a, 0x30, 0x5d, 0x86, + 0x73, 0x93, 0xc5, 0x90, 0xba, 0x82, 0x34, 0x52, 0x71, 0x4b, 0x6d, 0x4b, + 0xa0, 0x74, 0x6e, 0x9c, 0x3b, 0x40, 0x3b, 0x87, 0x31, 0x33, 0x94, 0x7d, + 0x32, 0xca, 0x58, 0x92, 0x67, 0xbb, 0x8c, 0xc1, 0xa7, 0x35, 0x40, 0xce, + 0xc7, 0x75, 0x49, 0x64, 0x91, 0x36, 0xc4, 0xcd, 0x56, 0x8b, 0x90, 0xcb, + 0x6f, 0x74, 0x82, 0x59, 0x95, 0xb6, 0xaf, 0x64, 0x3f, 0x79, 0x36, 0x67, + 0xad, 0x3b, 0xae, 0x93, 0xcb, 0x3d, 0xa2, 0x59, 0xc7, 0x83, 0xd0, 0x5e, + 0xaf, 0x4b, 0x4b, 0x67, 0x5c, 0x68, 0x9f, 0xb5, 0x2e, 0x7f, 0xc3, 0x37, + 0xc2, 0xd0, 0x3a, 0x32, 0x77, 0x81, 0x63, 0x8d, 0xb5, 0x8b, 0x3b, 0xd2, + 0xa7, 0xab, 0x9d, 0x4c, 0xc8, 0x9e, 0x4a, 0xab, 0x64, 0x65, 0x98, 0x91, + 0xbc, 0x47, 0x4a, 0xbc, 0x66, 0xcb, 0x8d, 0x8d, 0xaf, 0xbe, 0x78, 0x48, + 0xa4, 0x8b, 0xa4, 0xa2, 0xae, 0x80, 0x8b, 0xae, 0xaa, 0xc5, 0x66, 0x7d, + 0xc5, 0x61, 0xa6, 0x7a, 0x91, 0x48, 0x87, 0x4f, 0xb3, 0x7b, 0x3b, 0xbb, + 0x7f, 0xc8, 0x66, 0x70, 0x5f, 0x80, 0x48, 0xc0, 0x87, 0xa9, 0x4e, 0x40, + 0x79, 0xb6, 0xc8, 0x4f, 0x59, 0x87, 0x85, 0x7a, 0xc4, 0x9d, 0x60, 0x95, + 0x3b, 0x50, 0x81, 0x3e, 0x5e, 0xbe, 0x3e, 0xaa, 0x5f, 0x9b, 0xbf, 0x47, + 0x9d, 0xca, 0x8b, 0x8e, 0xba, 0x92, 0x66, 0x71, 0xb0, 0x60, 0xbf, 0x6e, + 0x46, 0x5e, 0xcd, 0xd5, 0x72, 0x8d, 0x38, 0x89, 0x3f, 0x9a, 0x58, 0x6f, + 0xbc, 0x46, 0x49, 0xc8, 0xcf, 0xa7, 0x60, 0x82, 0x62, 0xcc, 0x39, 0xad, + 0x8d, 0x72, 0x4b, 0xa9, 0x58, 0x62, 0xd8, 0xc0, 0x81, 0xa9, 0x56, 0x3e, + 0xa2, 0x84, 0x5b, 0x40, 0xc8, 0x76, 0x98, 0xc6, 0x4e, 0x77, 0x67, 0xa2, + 0x89, 0xc6, 0x5c, 0x83, 0x2e, 0xa7, 0xc2, 0xc2, 0x63, 0x5e, 0xb0, 0x36, + 0x3c, 0x76, 0x35, 0x3b, 0xbe, 0x90, 0x43, 0x3d, 0x87, 0x82, 0xbc, 0x9f, + 0x87, 0xa5, 0x4b, 0x94, 0x69, 0x55, 0xd6, 0x7e, 0x5b, 0xc4, 0x67, 0x60, + 0x96, 0x84, 0x7c, 0xcd, 0xa9, 0x4a, 0x29, 0x6d, 0x62, 0x87, 0x72, 0x7d, + 0x6e, 0xa6, 0xe4, 0x4d, 0xb1, 0x8f, 0xc7, 0x48, 0xa4, 0x9d, 0xb7, 0xc1, + 0x44, 0x37, 0x77, 0x9a, 0xb3, 0xc3, 0x5c, 0x83, 0xcd, 0x6b, 0x98, 0x5e, + 0x4c, 0xb4, 0x3e, 0x41, 0xa5, 0x50, 0xbd, 0x4e, 0x32, 0x83, 0xa8, 0x5d, + 0x98, 0x4d, 0x95, 0x24, 0x43, 0xa5, 0x91, 0x68, 0xc8, 0x73, 0x2d, 0xa2, + 0xa1, 0xc4, 0xa3, 0x5e, 0x55, 0x94, 0xb1, 0xc3, 0x53, 0x68, 0xbe, 0x39, + 0xb4, 0x62, 0xaa, 0x3c, 0xbe, 0x8d, 0x93, 0xab, 0x56, 0x3a, 0x96, 0x38, + 0xb2, 0xc3, 0x57, 0xbc, 0x8d, 0x8c, 0x90, 0xaa, 0x92, 0x8d, 0x44, 0x4d, + 0x76, 0x84, 0xd2, 0x6b, 0x69, 0x9c, 0xa2, 0xb9, 0x74, 0xb8, 0x3a, 0xc9, + 0x7b, 0x51, 0x89, 0xba, 0x96, 0x38, 0x63, 0x98, 0x9c, 0xca, 0x4f, 0x78, + 
0x8d, 0x69, 0x55, 0xc3, 0x90, 0x56, 0x47, 0xac, 0x43, 0x80, 0x60, 0x67, + 0x9b, 0x7f, 0x43, 0x62, 0x7c, 0x93, 0x92, 0x4f, 0x64, 0x79, 0x3d, 0xc2, + 0xa6, 0x5b, 0xa9, 0xb5, 0x53, 0x9e, 0x48, 0x77, 0xb5, 0x7c, 0x5d, 0x50, + 0x7c, 0xbf, 0xb6, 0xa2, 0xb2, 0x79, 0xa0, 0xaf, 0xc1, 0x50, 0xb2, 0x5e, + 0xbb, 0x41, 0xa0, 0x30, 0x40, 0xb9, 0x9b, 0x52, 0x84, 0x4b, 0x78, 0x8f, + 0xcf, 0x3f, 0x21, 0x40, 0x37, 0x5c, 0x45, 0x96, 0xaf, 0xd3, 0x4f, 0xa3, + 0xa4, 0xbc, 0x57, 0x5d, 0x53, 0xb4, 0x77, 0xac, 0x72, 0x34, 0x8f, 0x88, + 0xac, 0x54, 0x49, 0xa8, 0x73, 0xb0, 0x88, 0xa3, 0x55, 0xb1, 0xba, 0xa3, + 0xb7, 0x4f, 0xb2, 0x7e, 0x6d, 0xc4, 0xc4, 0x4a, 0x78, 0x64, 0x51, 0x90, + 0x97, 0xcc, 0x3e, 0xc7, 0xc0, 0x99, 0xd7, 0x86, 0xae, 0xa8, 0x5d, 0x85, + 0x98, 0xd0, 0x4b, 0x96, 0x77, 0x61, 0x7d, 0xb4, 0x51, 0x62, 0x3d, 0x44, + 0x84, 0xb5, 0x95, 0x93, 0xce, 0x9d, 0x5e, 0x9f, 0x49, 0xd5, 0x51, 0x6a, + 0xbc, 0x9e, 0x5b, 0xa6, 0xc3, 0x90, 0xe4, 0xb9, 0x55, 0x49, 0x83, 0x5b, + 0x82, 0x55, 0x41, 0xb1, 0xae, 0xc9, 0xb2, 0x40, 0x5e, 0xc8, 0xa4, 0x7e, + 0xc2, 0xb2, 0xc4, 0x4c, 0xa1, 0x2a, 0x95, 0x9d, 0x41, 0x68, 0x3a, 0x36, + 0x58, 0x6d, 0x86, 0x39, 0x29, 0x98, 0x97, 0x34, 0x4c, 0x5f, 0xcd, 0x2b, + 0x3a, 0x9a, 0x98, 0xd5, 0xa9, 0x98, 0xd0, 0x9f, 0x3c, 0x41, 0x9f, 0xa7, + 0xcc, 0x77, 0x99, 0x3d, 0xa4, 0x72, 0x90, 0x50, 0x65, 0xb9, 0x51, 0xb6, + 0x92, 0xbd, 0xb0, 0xb4, 0xb4, 0x47, 0xb8, 0x82, 0x6c, 0x71, 0xbe, 0xb8, + 0x7f, 0x50, 0xb5, 0xbb, 0x5f, 0x8a, 0x41, 0x7c, 0x75, 0xa1, 0x81, 0x51, + 0x34, 0x52, 0xb0, 0xcb, 0xb1, 0x92, 0x34, 0x9d, 0x89, 0x78, 0x8e, 0x3c, + 0xc9, 0x34, 0xcb, 0x8e, 0x69, 0xc6, 0xab, 0x85, 0x57, 0xb9, 0x73, 0xac, + 0x8a, 0xc4, 0x53, 0x4a, 0xd7, 0xd1, 0x74, 0x56, 0x96, 0x86, 0x78, 0x59, + 0xa4, 0x4b, 0xda, 0x7d, 0xa8, 0xb2, 0x48, 0xc0, 0x7c, 0x3f, 0xa1, 0xcc, + 0x82, 0xc4, 0x48, 0x86, 0xa3, 0x81, 0x3b, 0xbc, 0x41, 0x74, 0x71, 0x89, + 0x39, 0xa0, 0x52, 0xc9, 0x7a, 0x84, 0x69, 0x28, 0x3e, 0x99, 0x85, 0x60, + 0xc5, 0x98, 0xcd, 0x88, 0x69, 0x45, 0xb8, 0x28, 0x69, 0x9d, 0x41, 0x8f, + 0xb5, 0xa1, 0x92, 0x6d, 0x53, 0xbd, 0x2f, 0xb3, 0x6b, 0xa5, 0x4c, 0x8c, + 0x38, 0xac, 0x65, 0x7b, 0x3b, 0x3b, 0x51, 0x7b, 0x38, 0x7a, 0x69, 0x63, + 0x69, 0xa4, 0xad, 0x44, 0x9d, 0x9c, 0x2e, 0x54, 0x3c, 0xc8, 0x35, 0xc2, + 0x8f, 0xa4, 0xcf, 0xce, 0xce, 0x77, 0xb2, 0xb4, 0xb8, 0x43, 0x30, 0xc2, + 0x8b, 0x85, 0xca, 0x64, 0xa8, 0x55, 0x83, 0x8f, 0x7a, 0xc0, 0x62, 0x2c, + 0x86, 0x66, 0xce, 0x9c, 0xa5, 0xca, 0x68, 0x75, 0x8b, 0xb6, 0x4d, 0x71, + 0x54, 0x38, 0x9f, 0x6b, 0x7f, 0xb4, 0x9e, 0x7d, 0x7e, 0x71, 0xb6, 0x53, + 0xa7, 0x39, 0x3c, 0x61, 0xce, 0x9f, 0x3e, 0x8d, 0x50, 0x44, 0xb2, 0x5b, + 0x7c, 0xb4, 0x59, 0x72, 0x8d, 0x9b, 0x60, 0x3b, 0x5d, 0xad, 0xcd, 0x86, + 0x9b, 0xc5, 0x5a, 0xb3, 0x56, 0xb4, 0x8f, 0x6c, 0xa3, 0xa0, 0x51, 0x97, + 0x95, 0x7b, 0x89, 0x86, 0xcd, 0x7e, 0xab, 0x44, 0x5b, 0xbc, 0xd0, 0xc4, + 0xbb, 0x6b, 0x3d, 0xb2, 0x6b, 0x92, 0xcd, 0xb1, 0xcb, 0xb3, 0x6a, 0x87, + 0x90, 0x45, 0xd1, 0x56, 0x61, 0x8d, 0x36, 0xc4, 0x3b, 0x4c, 0x30, 0x97, + 0x75, 0x98, 0xb3, 0x68, 0x7a, 0xbe, 0x66, 0xb3, 0x78, 0x99, 0x83, 0xa4, + 0x42, 0x3c, 0x73, 0x5f, 0xab, 0x60, 0x3d, 0x2e, 0x99, 0xcb, 0x9b, 0x8f, + 0x9a, 0x5c, 0x45, 0x6b, 0x40, 0xc8, 0x95, 0xbb, 0x68, 0x5e, 0x3f, 0xb0, + 0xc6, 0xb3, 0xc2, 0x73, 0x55, 0xc4, 0x31, 0xaa, 0x96, 0xc7, 0x4c, 0x81, + 0xac, 0x58, 0xc4, 0x3d, 0x7c, 0xa8, 0x66, 0x41, 0xcc, 0x4c, 0xac, 0x53, + 0x7b, 0x60, 0x54, 0x60, 0x8b, 0x9e, 0xc8, 0xbb, 0x9b, 0x40, 0xb4, 0xb9, + 0x3d, 0x7f, 0x56, 0xc5, 0xc4, 0xc7, 0x74, 0xdb, 0x59, 0x8e, 0xd7, 0xc1, + 0xc1, 0x51, 0x5c, 0x4e, 0x65, 0xd3, 0xaa, 0xc0, 0xb0, 0x6e, 0xbd, 0xb2, + 
0x93, 0xab, 0x2f, 0x98, 0x63, 0xc3, 0xa3, 0x6b, 0x9f, 0x8f, 0x84, 0x9f, + 0x4f, 0x82, 0x2a, 0x84, 0x7e, 0xda, 0x9e, 0xaf, 0xab, 0xb5, 0x82, 0x50, + 0x57, 0x3e, 0xb4, 0xcb, 0xbd, 0x70, 0x9f, 0x2b, 0x31, 0xc7, 0x93, 0xab, + 0xc1, 0x5b, 0x7b, 0x30, 0x7c, 0x86, 0xa6, 0x52, 0xb0, 0x76, 0xc2, 0xa1, + 0x48, 0x51, 0xd1, 0x80, 0x8b, 0x58, 0x8a, 0xb0, 0xcb, 0xb6, 0x6a, 0x6b, + 0x48, 0xb5, 0xc5, 0xc7, 0x79, 0x5a, 0x42, 0x8d, 0xa2, 0x9f, 0x50, 0x2c, + 0x37, 0xb8, 0x5a, 0x94, 0x7f, 0xb0, 0x71, 0xa3, 0x9c, 0x69, 0x71, 0x4b, + 0x6d, 0x47, 0x43, 0xaf, 0x95, 0xca, 0x8e, 0x5f, 0x63, 0x5b, 0x6f, 0x70, + 0x80, 0x88, 0x7e, 0x5e, 0x56, 0xbe, 0x3d, 0xb3, 0x86, 0x7f, 0x95, 0xa7, + 0xa7, 0xad, 0xbc, 0x66, 0x6b, 0x90, 0xdf, 0xa3, 0xb1, 0x58, 0xbf, 0x42, + 0x7d, 0x93, 0xd1, 0x79, 0x41, 0x62, 0x48, 0x3f, 0x74, 0xd5, 0x46, 0xc7, + 0xb8, 0x6b, 0xb5, 0x71, 0x85, 0x40, 0xa2, 0xcf, 0x9e, 0x6f, 0xa2, 0x7a, + 0x72, 0x45, 0x69, 0x92, 0x89, 0x59, 0xa0, 0x7d, 0xc6, 0x48, 0x8f, 0x84, + 0x71, 0x98, 0x6e, 0x8a, 0x94, 0x61, 0x79, 0x30, 0xa5, 0x76, 0x52, 0xa8, + 0x2c, 0xa0, 0xb9, 0xc2, 0x6f, 0xb3, 0x46, 0xb2, 0x4e, 0x30, 0x39, 0x55, + 0x99, 0x75, 0x95, 0x62, 0xbe, 0x61, 0xa6, 0xc8, 0x33, 0x34, 0x81, 0x44, + 0xb7, 0x70, 0x92, 0xa6, 0x36, 0x95, 0x68, 0xb5, 0x98, 0xbe, 0x68, 0x59, + 0x5b, 0xb1, 0x7c, 0xb6, 0xa6, 0xb9, 0xc9, 0x66, 0x70, 0x95, 0x70, 0x76, + 0x75, 0xb0, 0xbe, 0x6c, 0x8a, 0xb8, 0xcb, 0x2e, 0xb4, 0x60, 0xb5, 0xcd, + 0x5f, 0x67, 0x97, 0xb2, 0x97, 0x63, 0xaf, 0xb8, 0x51, 0xd7, 0x4a, 0x61, + 0xcd, 0x4d, 0x95, 0x2b, 0x72, 0x55, 0x50, 0x74, 0x7d, 0x3e, 0xc8, 0x3b, + 0xae, 0xaa, 0x60, 0x35, 0xbf, 0x52, 0xb3, 0x93, 0x53, 0x96, 0x46, 0x87, + 0xbf, 0x2d, 0x4f, 0x3b, 0x5a, 0xc5, 0x6d, 0x83, 0xd4, 0xba, 0x8b, 0x34, + 0x90, 0xda, 0x41, 0x4e, 0x70, 0xdc, 0xa4, 0x5b, 0x86, 0xae, 0xaf, 0xc3, + 0x4c, 0x9c, 0x56, 0xba, 0xb8, 0x2a, 0x91, 0x3a, 0xc1, 0x1f, 0x60, 0xb3, + 0xb0, 0x9d, 0x63, 0x6a, 0x52, 0x90, 0xe0, 0x8a, 0x95, 0x2e, 0x91, 0xc6, + 0xb2, 0xa8, 0x61, 0xd6, 0x58, 0x46, 0x63, 0x4a, 0x34, 0xce, 0x73, 0x4d, + 0xd8, 0x83, 0x92, 0x99, 0xb0, 0x9d, 0xa9, 0xb5, 0xcd, 0x90, 0x73, 0x2e, + 0x8c, 0x40, 0xb0, 0x49, 0x3d, 0x4d, 0x41, 0x7e, 0x99, 0x4b, 0x63, 0x80, + 0x7b, 0xaa, 0x54, 0x91, 0x89, 0x5d, 0xb4, 0xbf, 0xbc, 0x3a, 0xc1, 0x97, + 0xa8, 0x5d, 0x7e, 0xb8, 0x69, 0x1e, 0x40, 0x4f, 0x9d, 0x94, 0x3e, 0x8e, + 0x9b, 0xbf, 0x49, 0xc5, 0x45, 0xa0, 0x41, 0xb7, 0xcf, 0x93, 0x2d, 0x61, + 0x97, 0x7a, 0xad, 0xc6, 0xb3, 0xc9, 0xad, 0x8a, 0x4d, 0x98, 0x39, 0x30, + 0x33, 0x8b, 0x93, 0x6d, 0x5c, 0x9c, 0xb5, 0xc9, 0x5b, 0xce, 0x65, 0x68, + 0xcb, 0xaa, 0x9a, 0xcf, 0xd3, 0x70, 0x51, 0x8c, 0x79, 0x3c, 0x66, 0xbf, + 0x32, 0x75, 0x9a, 0x9e, 0x43, 0xb7, 0x79, 0xbf, 0xa2, 0x85, 0x7f, 0xcb, + 0x98, 0x9a, 0xc5, 0x9e, 0xad, 0x95, 0x83, 0x59, 0x41, 0xbb, 0x6c, 0xa0, + 0xb0, 0x46, 0x5a, 0x8a, 0x3b, 0x6f, 0x62, 0x86, 0x2d, 0x83, 0x95, 0x5c, + 0x84, 0x2d, 0x63, 0x6d, 0x65, 0xa8, 0x8e, 0x2d, 0x5d, 0x31, 0x4a, 0x92, + 0x37, 0x5c, 0xaf, 0xc0, 0xd2, 0xc4, 0x3d, 0x50, 0x8b, 0x65, 0x8d, 0xa1, + 0x7a, 0x46, 0x8c, 0x61, 0x3c, 0x82, 0x40, 0x40, 0x6d, 0x8f, 0xb5, 0x91, + 0x62, 0x8e, 0x4c, 0xc1, 0x49, 0xa1, 0xb7, 0xac, 0x55, 0xd7, 0x94, 0xce, + 0xa4, 0x38, 0x47, 0x2d, 0x6f, 0x50, 0xce, 0xc9, 0x9b, 0x99, 0xa8, 0x5b, + 0xc3, 0x6e, 0x84, 0x81, 0x2b, 0xad, 0xc9, 0x47, 0x9f, 0x4d, 0xbb, 0x7e, + 0xb0, 0x53, 0x55, 0xb1, 0xa7, 0x70, 0xba, 0x9e, 0x43, 0x93, 0x6e, 0x80, + 0x98, 0x75, 0xc9, 0x84, 0xb0, 0x68, 0x9f, 0xbe, 0x84, 0x90, 0x58, 0xc7, + 0x51, 0xcc, 0x7d, 0x4d, 0x69, 0x9e, 0x26, 0x37, 0x73, 0x52, 0xb2, 0xce, + 0x47, 0x7f, 0xba, 0x70, 0xa5, 0xb0, 0x6f, 0xa7, 0x47, 0x9a, 0xaa, 0x39, + 
0xa8, 0xb3, 0x65, 0xb9, 0x67, 0x81, 0x32, 0xa0, 0x68, 0x5f, 0x6d, 0x7b, + 0xc0, 0x8f, 0x97, 0x4c, 0x71, 0x83, 0x4e, 0x6c, 0x46, 0xc5, 0x7f, 0x41, + 0x67, 0x63, 0x8d, 0x66, 0x3d, 0xaf, 0x67, 0x6d, 0x96, 0x58, 0x5b, 0x93, + 0x8d, 0x96, 0x46, 0x83, 0x83, 0x72, 0xba, 0x72, 0x6a, 0xc2, 0x9b, 0xb5, + 0x78, 0x6b, 0x50, 0xd9, 0x77, 0x55, 0xa9, 0xbd, 0x78, 0xa3, 0x57, 0x37, + 0x95, 0xcb, 0x8e, 0xc9, 0x6d, 0xa9, 0x7f, 0x62, 0x55, 0x48, 0x89, 0x66, + 0xc5, 0x3d, 0xb0, 0x9e, 0x73, 0x66, 0xbf, 0xbe, 0xc7, 0x4d, 0x80, 0xab, + 0x4a, 0xab, 0xae, 0x79, 0x67, 0x90, 0x94, 0xb4, 0xa6, 0xb7, 0x5e, 0x69, + 0x64, 0x4f, 0x67, 0x94, 0xd3, 0x9f, 0xc6, 0x5b, 0x3b, 0xc1, 0x80, 0x88, + 0xad, 0xb4, 0x3e, 0x3a, 0x80, 0xcc, 0x4e, 0x36, 0xb9, 0x94, 0xb2, 0xad, + 0x5c, 0x4b, 0x3e, 0x40, 0x67, 0x58, 0xba, 0x62, 0x9f, 0x57, 0xb2, 0x5b, + 0x7e, 0x9f, 0x50, 0xc0, 0xc2, 0x84, 0x3f, 0x5a, 0x65, 0xb3, 0x9d, 0x61, + 0xbf, 0x4f, 0xd4, 0x49, 0xa4, 0x53, 0x9f, 0xbf, 0x4e, 0x8f, 0x87, 0xaa, + 0x7f, 0x92, 0x7f, 0xcc, 0xb3, 0xa9, 0xb3, 0x91, 0xa6, 0x69, 0xd8, 0x8e, + 0x29, 0xb3, 0xbb, 0x88, 0x43, 0x22, 0x40, 0x86, 0xba, 0x54, 0x68, 0x6c, + 0x8a, 0x7d, 0x3b, 0x9d, 0x9d, 0xbb, 0xc5, 0xb6, 0x58, 0x53, 0xb7, 0x76, + 0x84, 0xc8, 0xdb, 0xae, 0x7c, 0xc3, 0x86, 0xc1, 0x63, 0x73, 0x64, 0x34, + 0x72, 0xaa, 0xc1, 0x92, 0x8e, 0x99, 0x9b, 0x38, 0x51, 0xaf, 0xbc, 0x77, + 0x56, 0x65, 0xac, 0xc5, 0x4f, 0x85, 0xb5, 0x94, 0x7c, 0xb1, 0x7c, 0x63, + 0xb7, 0x6c, 0x4a, 0x91, 0xc7, 0xd2, 0xa9, 0xb3, 0x8d, 0xc7, 0xbe, 0x3b, + 0x3a, 0xda, 0x85, 0xb3, 0x38, 0xb7, 0xd6, 0x69, 0x89, 0xb7, 0x79, 0x75, + 0xb6, 0x91, 0xbe, 0xbf, 0xca, 0xc1, 0x84, 0x46, 0x44, 0xcd, 0x87, 0x70, + 0xa9, 0x63, 0x64, 0x99, 0x71, 0x3b, 0x67, 0xbf, 0x7b, 0x39, 0x6e, 0x59, + 0x3c, 0xbc, 0x95, 0x1e, 0x6f, 0x8e, 0x50, 0x5e, 0x37, 0x36, 0x61, 0x61, + 0x39, 0x4a, 0x59, 0x82, 0x76, 0x74, 0xaf, 0x8d, 0x61, 0xa6, 0x9c, 0x40, + 0x6b, 0xa7, 0x3c, 0x80, 0xaf, 0x94, 0x2b, 0xb5, 0x6d, 0x60, 0xc5, 0xb4, + 0xc7, 0x76, 0x6a, 0x46, 0x67, 0xa1, 0x70, 0xaa, 0x50, 0x43, 0x89, 0x31, + 0x83, 0x58, 0x1b, 0x8e, 0x65, 0xcc, 0x87, 0x45, 0x44, 0x40, 0xcc, 0xa1, + 0x44, 0x8a, 0x97, 0x8d, 0x69, 0xc0, 0x62, 0xc8, 0x4f, 0xa4, 0xae, 0xb1, + 0xca, 0x76, 0xcd, 0x91, 0xdf, 0x3c, 0x2b, 0x2b, 0x53, 0x9a, 0x9e, 0x92, + 0x82, 0xd7, 0x82, 0x66, 0x4f, 0xbd, 0x45, 0x9c, 0x54, 0xca, 0x3e, 0xc4, + 0xbd, 0x4b, 0xba, 0x5f, 0x7b, 0x44, 0x47, 0x80, 0x41, 0x6a, 0x5a, 0xcf, + 0x72, 0xae, 0x2f, 0x62, 0xbe, 0x52, 0xcb, 0xd1, 0x55, 0x93, 0x78, 0x95, + 0xb3, 0x61, 0xb7, 0x5b, 0xbb, 0x65, 0xda, 0x56, 0x82, 0xc6, 0x9f, 0x87, + 0xd0, 0x2d, 0x40, 0xba, 0x96, 0xa2, 0x6d, 0x88, 0x70, 0x49, 0x8c, 0x76, + 0x79, 0xb5, 0x6b, 0xd7, 0x43, 0xbf, 0xc6, 0xa5, 0xb1, 0x8a, 0x7c, 0xc5, + 0x51, 0xb3, 0x5f, 0xb0, 0x73, 0xd7, 0x9f, 0x29, 0x9f, 0xb6, 0xcf, 0x57, + 0x94, 0x52, 0x8a, 0x7a, 0x4f, 0xb3, 0x7d, 0x31, 0x9f, 0x82, 0x3e, 0x96, + 0xa0, 0xac, 0x4a, 0xc6, 0xd2, 0xab, 0x56, 0x6f, 0x88, 0x56, 0x42, 0x6e, + 0x46, 0x99, 0x8a, 0x91, 0x8d, 0x70, 0x76, 0x6c, 0xce, 0x5c, 0x8f, 0xd1, + 0x70, 0x8d, 0xba, 0x32, 0x4c, 0x44, 0x4f, 0xe0, 0x9f, 0xc5, 0x8e, 0x49, + 0x48, 0xac, 0xa0, 0x8a, 0xa1, 0x73, 0x41, 0x5f, 0xb5, 0x4f, 0xa0, 0xbd, + 0x9f, 0x39, 0xce, 0x6e, 0x68, 0x7f, 0x9f, 0x50, 0xd6, 0x93, 0x64, 0x3f, + 0x47, 0xb6, 0x3e, 0xb9, 0x76, 0xba, 0x82, 0x96, 0x6d, 0x4f, 0x4d, 0x3e, + 0xc5, 0x46, 0x6b, 0xc8, 0xbd, 0xa4, 0x5c, 0xba, 0x47, 0x66, 0x44, 0x36, + 0xa0, 0x6c, 0xbb, 0x39, 0x8c, 0x7f, 0x52, 0x62, 0x7f, 0xaf, 0xc8, 0x86, + 0x87, 0xa0, 0x4e, 0x39, 0x63, 0x39, 0xc2, 0x49, 0x58, 0x37, 0x72, 0x72, + 0x43, 0x3a, 0x71, 0xc1, 0x56, 0xbc, 0x87, 0x36, 0xa8, 0x6a, 0x3f, 0xa1, + 
0x63, 0xa7, 0x8b, 0x7f, 0x53, 0x63, 0x71, 0x3d, 0xa9, 0x95, 0xb0, 0x4d, + 0x79, 0x38, 0xd5, 0xa2, 0xc7, 0x33, 0xc7, 0xb6, 0x92, 0x99, 0x59, 0x63, + 0x3a, 0x9e, 0xd6, 0x51, 0xd8, 0x82, 0x75, 0x64, 0xc6, 0x97, 0x89, 0x7c, + 0x58, 0xb1, 0xa9, 0x64, 0x6b, 0x31, 0xb3, 0x6e, 0x4f, 0xd1, 0x6e, 0x51, + 0x41, 0x8c, 0xca, 0xab, 0xa5, 0xb8, 0x39, 0xc2, 0x50, 0xaa, 0x68, 0x79, + 0xb0, 0x9d, 0x98, 0x92, 0x82, 0xc1, 0x63, 0x4a, 0xa3, 0xb1, 0x85, 0x52, + 0xbb, 0x3a, 0x90, 0x49, 0xcf, 0x70, 0xb1, 0xb9, 0xbb, 0x76, 0x43, 0xc5, + 0x89, 0x3b, 0x95, 0x83, 0x74, 0x7d, 0x3f, 0x95, 0xb0, 0x53, 0xa5, 0xa4, + 0x3a, 0xbf, 0x94, 0xb4, 0x3c, 0x5e, 0x50, 0x8b, 0x91, 0x6e, 0x90, 0x65, + 0x31, 0x92, 0x60, 0xab, 0x70, 0x46, 0x63, 0xc3, 0xba, 0x7a, 0x5d, 0xd3, + 0x66, 0x8f, 0x9f, 0x9b, 0xd2, 0x7b, 0x37, 0xbb, 0x40, 0xb3, 0x96, 0x59, + 0xc3, 0xbf, 0x5d, 0x6d, 0xd3, 0x54, 0xb7, 0x89, 0xa2, 0x32, 0x34, 0x8f, + 0xc7, 0x89, 0xc3, 0xc0, 0x9a, 0xc5, 0x91, 0xa0, 0xa9, 0x99, 0x3e, 0xb4, + 0x70, 0x7e, 0x84, 0x9e, 0x43, 0xd5, 0xcf, 0x2e, 0x46, 0x5d, 0x8b, 0x92, + 0xb0, 0x1c, 0xa8, 0x57, 0xab, 0x67, 0x57, 0x55, 0x38, 0xc5, 0xa1, 0xa6, + 0x59, 0x9d, 0xa5, 0x80, 0xb9, 0x31, 0xb8, 0xd8, 0xab, 0xbe, 0xc4, 0x9c, + 0xb9, 0xa5, 0x91, 0x37, 0xd9, 0xb9, 0xa7, 0xb3, 0xc4, 0x42, 0xa3, 0x40, + 0x4f, 0x67, 0x4d, 0x3b, 0x33, 0x54, 0x94, 0x6a, 0x4e, 0xb9, 0xc0, 0x31, + 0x49, 0x40, 0x89, 0x6b, 0x38, 0x68, 0x77, 0xa2, 0x73, 0xcc, 0x49, 0x74, + 0xbc, 0x35, 0x5c, 0x97, 0x69, 0xb9, 0x78, 0x87, 0x29, 0xa9, 0x79, 0x5a, + 0xba, 0x3c, 0x9b, 0x65, 0x3c, 0x99, 0x3e, 0xac, 0xa3, 0x50, 0x6f, 0x8b, + 0x76, 0x7f, 0x7e, 0x4c, 0xd7, 0x7a, 0xc8, 0x5b, 0xc1, 0x4e, 0x9a, 0x9d, + 0xde, 0x90, 0x8c, 0xb6, 0x6c, 0x7c, 0xa6, 0xc1, 0x43, 0x52, 0x94, 0x8a, + 0xc6, 0x79, 0xb8, 0x4d, 0x88, 0xad, 0x78, 0x82, 0x3d, 0x82, 0x47, 0x6e, + 0x9f, 0x8e, 0xb0, 0x77, 0xbd, 0x9e, 0xaf, 0xb9, 0x47, 0x81, 0x68, 0x8f, + 0x75, 0x77, 0xbd, 0x36, 0xbc, 0x44, 0x36, 0xb4, 0x61, 0xa4, 0xbc, 0x60, + 0x9d, 0x26, 0x8b, 0x39, 0x6c, 0x46, 0x6f, 0x70, 0x97, 0x9e, 0xd0, 0xb1, + 0xc9, 0xb2, 0x73, 0x68, 0xcb, 0x76, 0xc7, 0xb9, 0x50, 0x50, 0xb0, 0xb8, + 0x6e, 0xc6, 0x84, 0x4b, 0xbd, 0x6e, 0x9f, 0x4c, 0x96, 0xc7, 0x93, 0x9e, + 0x8d, 0xbe, 0xb8, 0x92, 0xca, 0x8e, 0xd2, 0xbd, 0x4d, 0xab, 0x5d, 0x1c, + 0xd9, 0x61, 0xb0, 0x61, 0x99, 0x9e, 0x98, 0x80, 0x3e, 0x3a, 0xc1, 0x47, + 0xd2, 0x50, 0x43, 0x5b, 0xb0, 0x27, 0x67, 0x84, 0xb7, 0x5b, 0xbf, 0x9d, + 0xa8, 0x56, 0x72, 0x53, 0xc9, 0x9f, 0x65, 0x49, 0x75, 0x94, 0x5f, 0x62, + 0xca, 0x46, 0x87, 0x80, 0x6b, 0x4a, 0x8b, 0xb5, 0xbc, 0x6b, 0x9f, 0x81, + 0x82, 0xa3, 0x56, 0xbd, 0x8c, 0x69, 0x98, 0x73, 0xb6, 0x6e, 0x65, 0xc1, + 0xbd, 0x70, 0x46, 0x56, 0xaa, 0xbc, 0xdc, 0xab, 0xac, 0x58, 0x8b, 0x4d, + 0xb5, 0x7d, 0x92, 0x9d, 0x5c, 0x98, 0x59, 0x46, 0x3e, 0x6c, 0x42, 0xb2, + 0x73, 0xc5, 0xc5, 0xd0, 0xa5, 0x72, 0x80, 0x9e, 0x54, 0x6c, 0x2d, 0x5a, + 0x71, 0xc5, 0xb7, 0x90, 0x3f, 0xb9, 0x80, 0xcf, 0x81, 0xae, 0xa3, 0x4e, + 0xd0, 0xc4, 0xc5, 0x9c, 0x54, 0x3d, 0x64, 0xb1, 0xcf, 0xad, 0x91, 0xbc, + 0xad, 0x70, 0x49, 0xcc, 0x81, 0x46, 0x31, 0x7c, 0x62, 0x31, 0x84, 0x37, + 0x81, 0x92, 0x31, 0x37, 0x8c, 0x55, 0xc3, 0x37, 0x9e, 0x60, 0xb3, 0xed, + 0x88, 0xa4, 0x9f, 0x5f, 0x5e, 0x60, 0x81, 0x42, 0x9c, 0x9f, 0x80, 0xa1, + 0xae, 0xce, 0x53, 0x6c, 0x7a, 0x41, 0x5f, 0x33, 0x50, 0x89, 0x9a, 0xd1, + 0x8f, 0x4c, 0x8b, 0xb6, 0x82, 0xc8, 0x2a, 0x7e, 0x32, 0x76, 0xb7, 0x29, + 0x87, 0x9d, 0x6d, 0x84, 0x89, 0xbb, 0x55, 0x44, 0x6b, 0xab, 0x67, 0x66, + 0x9e, 0x23, 0x5a, 0x62, 0x32, 0xa1, 0xa5, 0x32, 0x6a, 0x91, 0x48, 0x4f, + 0x8f, 0x72, 0x35, 0xc5, 0x3c, 0x95, 0x8c, 0xbd, 0xa8, 0xbb, 0x5f, 0xcb, + 
0x3a, 0xd2, 0x8e, 0xd8, 0x87, 0x94, 0x4f, 0x4a, 0xc6, 0x7d, 0xcf, 0xae, + 0x7c, 0x4c, 0x38, 0xc4, 0x3a, 0x94, 0x9f, 0x9a, 0x99, 0x6d, 0xba, 0xc7, + 0x47, 0x46, 0x3c, 0x2b, 0xcc, 0x47, 0x40, 0x79, 0x7e, 0xb4, 0x6f, 0x3d, + 0x78, 0xcf, 0xae, 0xa6, 0x91, 0xb2, 0x63, 0xa5, 0xac, 0x4a, 0xdc, 0x30, + 0xb0, 0x87, 0x84, 0x6b, 0xad, 0xc9, 0x61, 0x9a, 0xc2, 0x39, 0x46, 0x9d, + 0x62, 0xbf, 0x86, 0x99, 0x6f, 0x63, 0xaa, 0x48, 0x50, 0xc5, 0xa3, 0xc7, + 0x9c, 0xd3, 0x88, 0x5a, 0x4e, 0x61, 0x8b, 0x6f, 0xce, 0x42, 0xa8, 0xa3, + 0x6c, 0x4c, 0x75, 0xb9, 0x95, 0x90, 0x65, 0xca, 0x34, 0x58, 0x88, 0x5b, + 0x3a, 0x32, 0x67, 0x99, 0x63, 0x98, 0x86, 0xd1, 0x3b, 0x33, 0x7b, 0x80, + 0x8d, 0xbc, 0x88, 0xd2, 0x69, 0x7b, 0xb9, 0x7e, 0x5d, 0x94, 0xc8, 0x38, + 0x67, 0x5a, 0xd1, 0x40, 0x3c, 0xbd, 0x9e, 0x3e, 0x6f, 0x36, 0x4e, 0x35, + 0x52, 0x77, 0x6c, 0xcc, 0x40, 0x91, 0x58, 0x98, 0xbc, 0x66, 0xcf, 0xb8, + 0xcd, 0x35, 0x81, 0xad, 0xc7, 0x5e, 0x90, 0xa4, 0xa3, 0x45, 0x59, 0x63, + 0x57, 0xcb, 0x7e, 0xeb, 0xd8, 0xb9, 0xcd, 0x9a, 0xb8, 0x37, 0xb3, 0x23, + 0x2c, 0x62, 0xcc, 0x89, 0x99, 0x99, 0xae, 0x96, 0x7f, 0x2d, 0xdb, 0x3f, + 0x79, 0xbf, 0xce, 0xb6, 0x60, 0x50, 0x82, 0xc7, 0x56, 0xc2, 0x3b, 0xb7, + 0x46, 0x9b, 0xa0, 0x7b, 0x4b, 0xd2, 0xca, 0x49, 0x9c, 0x87, 0x5a, 0x45, + 0xa7, 0x85, 0xba, 0x6f, 0x56, 0x6c, 0x58, 0xa0, 0x50, 0xa1, 0x36, 0x9c, + 0xc1, 0x79, 0x60, 0x7a, 0xae, 0x96, 0x86, 0x88, 0x8c, 0x59, 0x67, 0x71, + 0x66, 0x5e, 0x92, 0xa8, 0x90, 0x6c, 0x5e, 0xb9, 0xb4, 0x50, 0xb8, 0x85, + 0x3e, 0x6c, 0x9b, 0x5e, 0x4e, 0x74, 0x33, 0x63, 0xbe, 0xa0, 0x94, 0x6f, + 0x65, 0x3e, 0x7f, 0xd5, 0x61, 0x67, 0x46, 0xc0, 0x52, 0x64, 0x45, 0x83, + 0xb6, 0xb1, 0x2b, 0x36, 0x5c, 0x99, 0x8f, 0xca, 0xb8, 0x69, 0x77, 0x61, + 0x33, 0x3a, 0xa1, 0x8e, 0xcc, 0x64, 0x63, 0xc6, 0x6f, 0xc0, 0x71, 0x38, + 0x5c, 0xb1, 0x96, 0x4d, 0x57, 0x47, 0x6e, 0xb8, 0xd0, 0xb4, 0x6f, 0x98, + 0x9c, 0x3f, 0x8d, 0x3c, 0x57, 0xab, 0x42, 0x65, 0x55, 0x95, 0x7e, 0x4d, + 0x97, 0x35, 0x5f, 0x59, 0x5b, 0x44, 0xd2, 0x57, 0xaf, 0xb2, 0x86, 0x95, + 0x97, 0x89, 0xa7, 0xb5, 0x4b, 0x8b, 0xa8, 0x56, 0x70, 0xb1, 0xa9, 0x48, + 0x82, 0xc7, 0x8b, 0xce, 0x9f, 0x94, 0xbc, 0x71, 0x77, 0xa2, 0x3e, 0xc3, + 0xc9, 0xce, 0xce, 0x94, 0x71, 0x9b, 0x8f, 0x49, 0x7c, 0x39, 0xc8, 0xc7, + 0x55, 0x4a, 0xd9, 0x59, 0x72, 0xb3, 0x9e, 0xc4, 0x3d, 0x63, 0x47, 0xda, + 0x69, 0x80, 0x86, 0xbc, 0x93, 0x5f, 0x89, 0x40, 0x4c, 0x53, 0xbc, 0x77, + 0x64, 0x89, 0x81, 0x51, 0x63, 0xb2, 0x3e, 0x42, 0x2f, 0x91, 0xa6, 0x9e, + 0x3d, 0x64, 0xbc, 0x73, 0x56, 0x8d, 0x66, 0x5a, 0xca, 0x7f, 0x20, 0xbe, + 0x3c, 0x6f, 0x4c, 0x63, 0x66, 0x64, 0x6b, 0xa9, 0x9d, 0x57, 0x44, 0xc9, + 0xc9, 0x71, 0xa0, 0x46, 0x49, 0x5e, 0x3d, 0x9e, 0x3b, 0x3a, 0x88, 0x8f, + 0x91, 0xb1, 0x71, 0x42, 0x53, 0x8e, 0x2b, 0x42, 0x34, 0xaf, 0x1b, 0x88, + 0x65, 0xc0, 0x59, 0x78, 0x6d, 0xb5, 0x88, 0xb7, 0x60, 0x44, 0x3f, 0x72, + 0x99, 0x8a, 0xb2, 0xc5, 0x84, 0x95, 0xa4, 0x5b, 0xb1, 0x62, 0xc7, 0x49, + 0xa7, 0x9c, 0xb4, 0x2b, 0xb4, 0x36, 0x7b, 0x9c, 0x62, 0x3d, 0x9d, 0xc0, + 0x70, 0x59, 0x56, 0x3a, 0x9d, 0xc1, 0x86, 0xca, 0x46, 0xb1, 0xc8, 0x52, + 0xb3, 0xab, 0x34, 0xc4, 0xb1, 0x8e, 0x5b, 0xc0, 0xa6, 0x72, 0xa8, 0x75, + 0x9f, 0x38, 0xc6, 0x42, 0x4e, 0x73, 0x3a, 0x75, 0x71, 0xa3, 0x38, 0x8c, + 0x76, 0x7a, 0x3c, 0x8d, 0x65, 0x55, 0x7b, 0x44, 0x9b, 0xa9, 0x4e, 0xa9, + 0x53, 0xa7, 0xa5, 0x66, 0x67, 0xa0, 0x56, 0x92, 0x53, 0xbd, 0x75, 0x32, + 0xbb, 0x53, 0xcd, 0xce, 0xbd, 0x6d, 0x77, 0x8c, 0x3e, 0xce, 0xa4, 0xa1, + 0x6c, 0x3b, 0x61, 0x44, 0xb8, 0xc0, 0x81, 0x7c, 0xd6, 0xaa, 0x39, 0xb6, + 0x5c, 0xd2, 0xb7, 0x95, 0x5b, 0x3f, 0x64, 0x55, 0xcb, 0x41, 0xba, 0x76, + 
0xac, 0x48, 0x78, 0x4a, 0xd7, 0x2b, 0x8a, 0xc5, 0xac, 0x68, 0xa5, 0x4a, + 0x88, 0x3b, 0x54, 0x6b, 0x40, 0x69, 0xb4, 0xbb, 0x75, 0xb3, 0x5f, 0x39, + 0xac, 0x73, 0x45, 0xa7, 0x93, 0x64, 0x72, 0x57, 0xc9, 0xb3, 0xd9, 0xc8, + 0x3c, 0x9c, 0x85, 0x4b, 0xa5, 0x56, 0x26, 0xb3, 0x95, 0x3b, 0x65, 0x86, + 0x52, 0x87, 0xd9, 0x9e, 0x6c, 0x46, 0x90, 0x32, 0x58, 0x9f, 0xb0, 0xa0, + 0x41, 0x72, 0xb1, 0x3e, 0x3b, 0xcc, 0x36, 0x37, 0x67, 0x61, 0x73, 0x32, + 0xaa, 0x5e, 0xbf, 0xc6, 0xc5, 0x5a, 0x49, 0xac, 0x43, 0x9e, 0x76, 0x88, + 0x50, 0x57, 0x78, 0x6d, 0xb0, 0x68, 0x59, 0xa7, 0x55, 0x64, 0xa5, 0xc9, + 0xa0, 0xc9, 0x49, 0xc1, 0x6d, 0x81, 0x3b, 0x46, 0x72, 0x9e, 0xb9, 0x31, + 0x72, 0x7b, 0x82, 0x57, 0x91, 0xb1, 0x68, 0x8b, 0x7a, 0x8e, 0x6e, 0x6c, + 0x37, 0x44, 0xb3, 0x79, 0x84, 0x61, 0x6b, 0x54, 0x78, 0xca, 0xc5, 0x5a, + 0x49, 0x62, 0xa0, 0x9b, 0xaf, 0xc9, 0x5a, 0x6a, 0x40, 0xba, 0xc1, 0xa3, + 0xc7, 0xbf, 0x7f, 0xbf, 0x30, 0x76, 0xd0, 0x7b, 0x83, 0x47, 0x3d, 0x65, + 0x76, 0xb2, 0x52, 0x83, 0xcd, 0xa9, 0xb4, 0x4c, 0x72, 0x9f, 0x60, 0x60, + 0x7d, 0x95, 0x86, 0x52, 0x7c, 0x39, 0x5b, 0x43, 0x72, 0x6b, 0x8c, 0x8d, + 0x34, 0xa4, 0x42, 0x5d, 0xa1, 0x73, 0x36, 0x5f, 0xce, 0x8e, 0x6c, 0x74, + 0x87, 0x6a, 0x9a, 0x76, 0x9a, 0x88, 0x61, 0x6f, 0x46, 0x33, 0x3f, 0x41, + 0x84, 0x81, 0xae, 0xd1, 0x3b, 0x4d, 0x3f, 0xab, 0x3a, 0x4e, 0x56, 0x45, + 0x82, 0x8c, 0x43, 0x5f, 0x8d, 0xc3, 0x8a, 0x6f, 0x43, 0x4a, 0x6d, 0x34, + 0x3f, 0xab, 0x49, 0xc2, 0x37, 0xd1, 0x60, 0x8a, 0x3d, 0x43, 0x6a, 0x36, + 0x59, 0xbc, 0x64, 0x61, 0xbd, 0xcb, 0x72, 0x8c, 0x7f, 0x80, 0x55, 0xa0, + 0xa0, 0xa4, 0x88, 0xa6, 0x5f, 0xc2, 0x58, 0x7d, 0x4d, 0x52, 0xa0, 0x62, + 0x84, 0x4c, 0x5a, 0x9f, 0x93, 0xa3, 0xbf, 0x7c, 0x92, 0x4f, 0x34, 0x52, + 0x87, 0xce, 0x6b, 0x78, 0x6b, 0x9b, 0x73, 0x8e, 0x90, 0xb2, 0xab, 0xba, + 0x36, 0xbc, 0xc1, 0x89, 0x96, 0x61, 0x66, 0xb3, 0x65, 0x3d, 0xb0, 0x5c, + 0x85, 0x61, 0x47, 0x74, 0x52, 0x82, 0x4e, 0x6b, 0x5c, 0x56, 0x61, 0x58, + 0xd3, 0x9e, 0x69, 0x55, 0x4b, 0xc7, 0x43, 0x52, 0x3e, 0xc6, 0xc3, 0x39, + 0xbe, 0x7e, 0x81, 0xc6, 0xb4, 0x82, 0xcb, 0x4f, 0xb4, 0x75, 0xa8, 0x81, + 0xa6, 0x38, 0xb9, 0x75, 0xa7, 0x9b, 0x96, 0xc0, 0x72, 0x7e, 0xbc, 0x48, + 0x89, 0x40, 0x6d, 0x58, 0x3f, 0xa9, 0x60, 0x74, 0xcc, 0x43, 0x7c, 0x5e, + 0x35, 0xaf, 0x60, 0x60, 0xa8, 0x57, 0x52, 0x4a, 0xb9, 0xb3, 0x5a, 0x5a, + 0xbf, 0x4c, 0xb1, 0x5a, 0x33, 0xa4, 0x89, 0x54, 0x50, 0x51, 0x6e, 0x77, + 0xaa, 0xca, 0x33, 0xbf, 0xd7, 0x84, 0xb7, 0x3e, 0x9e, 0xcb, 0xa6, 0x8c, + 0xb0, 0x9b, 0xc7, 0xaa, 0x79, 0x4c, 0xa8, 0xc8, 0x69, 0x8a, 0xaf, 0x3d, + 0x5e, 0x4a, 0x55, 0x9b, 0x3d, 0x83, 0x90, 0xce, 0x47, 0x5b, 0x5d, 0x9b, + 0xa1, 0x95, 0xa1, 0x74, 0x65, 0xab, 0x3b, 0x85, 0x72, 0x71, 0x91, 0x44, + 0x77, 0x61, 0x5d, 0xba, 0x8f, 0xc5, 0xa8, 0xa7, 0x5a, 0x86, 0xbb, 0xc2, + 0x60, 0x7c, 0x95, 0x3c, 0xc0, 0x6c, 0xb2, 0x8d, 0x73, 0x84, 0x63, 0xc1, + 0x5a, 0x4b, 0xb9, 0x8d, 0x73, 0x5d, 0xac, 0x8b, 0xaf, 0x4a, 0x8b, 0x7e, + 0xbf, 0x36, 0x51, 0xa2, 0x65, 0xc0, 0x38, 0x51, 0x8d, 0x43, 0xd6, 0xb8, + 0x39, 0x74, 0x65, 0x45, 0xb1, 0x77, 0xac, 0x72, 0xb2, 0xbe, 0x72, 0x58, + 0x4c, 0x49, 0x3c, 0xaf, 0xa1, 0xc8, 0xa5, 0x41, 0xd2, 0x8b, 0x53, 0x8f, + 0x6b, 0xc7, 0xab, 0x89, 0x99, 0x97, 0x50, 0x5a, 0x62, 0xb2, 0x9a, 0x7b, + 0x9a, 0x9d, 0x45, 0x67, 0x7d, 0x7f, 0x50, 0x56, 0xbb, 0xaa, 0x3e, 0x48, + 0x65, 0xac, 0xc4, 0x7e, 0xb0, 0x66, 0x59, 0x99, 0x71, 0x3d, 0x3d, 0xc0, + 0x67, 0x7e, 0xad, 0x69, 0xad, 0x3d, 0xa8, 0x55, 0x74, 0xc3, 0xa4, 0xa0, + 0x60, 0x63, 0x38, 0x7b, 0x7e, 0x80, 0x8e, 0x49, 0x58, 0x97, 0xb9, 0x83, + 0xbc, 0xb4, 0x55, 0x96, 0x7c, 0x57, 0x66, 0x71, 0x33, 0x61, 0xa4, 0x45, + 
0x62, 0xa5, 0x77, 0x76, 0x30, 0xd2, 0x98, 0x37, 0x5a, 0x7b, 0x56, 0x4a, + 0x3d, 0x9d, 0xbb, 0x8a, 0x42, 0x97, 0x4b, 0x93, 0x3a, 0x43, 0x74, 0x6f, + 0x4b, 0x32, 0x51, 0x4a, 0x3d, 0x4a, 0xc0, 0x38, 0x9b, 0xb0, 0x9f, 0x68, + 0x9b, 0x66, 0x47, 0xba, 0x7f, 0x8e, 0x39, 0xc7, 0xb3, 0x65, 0x6c, 0x58, + 0xa6, 0x9b, 0x72, 0xce, 0xc4, 0xa1, 0xd3, 0x83, 0xa2, 0x7c, 0x88, 0x9b, + 0xa8, 0x8d, 0xd2, 0x88, 0x94, 0x47, 0x81, 0x78, 0xc4, 0x55, 0x90, 0x99, + 0x3f, 0x59, 0x80, 0xce, 0x9f, 0x6c, 0x94, 0x67, 0x4e, 0x3b, 0x53, 0x75, + 0x7a, 0xba, 0x52, 0xd1, 0x34, 0xc0, 0xbb, 0xa1, 0x98, 0x79, 0x47, 0x5b, + 0xc0, 0xc7, 0x39, 0x76, 0x4e, 0x4c, 0x4d, 0xac, 0x64, 0xa8, 0xbc, 0xc0, + 0x3b, 0x52, 0x71, 0xad, 0x57, 0xbc, 0xa6, 0x92, 0xd0, 0x58, 0x56, 0xb0, + 0xb8, 0x72, 0x5a, 0x7a, 0x4a, 0xad, 0x62, 0xa2, 0x8d, 0xae, 0x3a, 0xcc, + 0x53, 0x71, 0xb5, 0x33, 0x75, 0xbb, 0x62, 0x46, 0x45, 0xbc, 0x81, 0xa1, + 0x82, 0x96, 0x7d, 0x6d, 0xb3, 0x98, 0x31, 0xb1, 0xc5, 0xc4, 0x72, 0x5e, + 0xc9, 0xa1, 0x46, 0x94, 0x4e, 0xb4, 0x5f, 0x59, 0x36, 0xa2, 0xa8, 0x88, + 0x4b, 0x64, 0xce, 0xd2, 0x86, 0xba, 0x45, 0xb7, 0x66, 0x3d, 0xa4, 0x9a, + 0x9d, 0xc7, 0x36, 0x4f, 0xaa, 0xa3, 0x69, 0x57, 0xa4, 0xc4, 0x5f, 0x57, + 0x77, 0x6c, 0xaf, 0x60, 0x8a, 0x62, 0xa5, 0x45, 0x5d, 0x81, 0x64, 0xac, + 0x75, 0xbd, 0xb3, 0xba, 0xb9, 0x46, 0x44, 0xbe, 0xb1, 0x88, 0x3a, 0x77, + 0x47, 0x96, 0x54, 0x7f, 0x43, 0x47, 0x2e, 0xcb, 0x56, 0xab, 0xb9, 0xb0, + 0x69, 0x4a, 0x4b, 0xa3, 0xbc, 0x7b, 0x87, 0x92, 0xcf, 0x3f, 0x40, 0x4f, + 0x6e, 0x55, 0x4e, 0x7f, 0x79, 0x7f, 0x92, 0x6c, 0xc4, 0xc5, 0x4d, 0x8d, + 0xc3, 0x38, 0x5c, 0xbb, 0xce, 0xac, 0x8c, 0x98, 0xa4, 0x43, 0xa0, 0x53, + 0x67, 0xa1, 0x7f, 0x80, 0x80, 0x5c, 0x3a, 0xa7, 0xb4, 0xbc, 0x33, 0x88, + 0x9e, 0x56, 0x93, 0xca, 0x55, 0x3b, 0xa6, 0xcb, 0x9e, 0xd1, 0x9c, 0xb6, + 0x5a, 0x3c, 0x89, 0xa1, 0x8b, 0x79, 0x80, 0xd1, 0xc2, 0x67, 0x4c, 0xc6, + 0xaa, 0x7f, 0xaf, 0xbb, 0x9c, 0x52, 0xbf, 0x68, 0x7f, 0x72, 0x80, 0x5c, + 0xa3, 0x80, 0xc7, 0xca, 0x53, 0x53, 0x84, 0x87, 0xd1, 0x7b, 0x60, 0xbd, + 0x9a, 0xcd, 0xa0, 0x90, 0x9c, 0x40, 0x92, 0xd5, 0xd7, 0x3e, 0x75, 0x45, + 0xc7, 0x52, 0x8d, 0x81, 0x58, 0x8b, 0x4d, 0x3a, 0x79, 0xd3, 0x81, 0x20, + 0x42, 0xd3, 0x7d, 0x8e, 0x8d, 0x84, 0x8a, 0x90, 0xcf, 0x6e, 0x4c, 0xc6, + 0x97, 0x88, 0x9a, 0x83, 0x6b, 0x32, 0xac, 0xda, 0x84, 0x30, 0x42, 0x42, + 0xc2, 0x80, 0xa4, 0x41, 0xaa, 0x60, 0x93, 0x92, 0x66, 0x96, 0xc6, 0x94, + 0x74, 0x42, 0x6d, 0xc0, 0x9c, 0xb3, 0xcc, 0xca, 0x6b, 0x9f, 0x75, 0xad, + 0x66, 0xbc, 0x89, 0x8f, 0x9b, 0x77, 0x84, 0xcd, 0xb4, 0x3b, 0xb0, 0x71, + 0x87, 0xa1, 0x70, 0x9b, 0x82, 0x9d, 0x97, 0xb1, 0x64, 0xc9, 0x42, 0x89, + 0x6f, 0xa7, 0x6f, 0xdf, 0x92, 0x3c, 0x46, 0xa1, 0xaa, 0xd3, 0x42, 0xce, + 0xd7, 0x48, 0x91, 0x36, 0x6b, 0x7c, 0x8e, 0x3b, 0x36, 0xb9, 0xac, 0x45, + 0xa7, 0x4c, 0xa9, 0xd1, 0x54, 0xaf, 0x73, 0xa3, 0xd7, 0xc0, 0x8d, 0xc1, + 0xb5, 0xbd, 0x6f, 0x46, 0xd1, 0xa0, 0x66, 0xc0, 0xcc, 0x81, 0x9d, 0x3a, + 0xc7, 0x7d, 0x29, 0x6c, 0xcd, 0x4a, 0xa6, 0x9f, 0x7d, 0x37, 0xc8, 0x64, + 0x76, 0xc5, 0x5c, 0x88, 0xa1, 0x8c, 0x57, 0xc8, 0x44, 0x92, 0x81, 0x4b, + 0xb6, 0x91, 0xc5, 0x97, 0x76, 0xb7, 0x49, 0x36, 0x44, 0x96, 0x52, 0x54, + 0x6e, 0xaa, 0x64, 0x47, 0x7e, 0xa8, 0xaa, 0x74, 0x73, 0x9c, 0x81, 0xc4, + 0x9f, 0xb0, 0xd3, 0x75, 0xc9, 0xc9, 0x7a, 0x3e, 0x67, 0x93, 0x72, 0xcb, + 0x6c, 0x59, 0x53, 0x8a, 0xa5, 0xbe, 0x52, 0x81, 0x65, 0x6d, 0xb2, 0x5a, + 0x89, 0xa7, 0xac, 0x87, 0x9b, 0xba, 0xb3, 0xd0, 0xab, 0x89, 0x52, 0x30, + 0x49, 0xb2, 0x2d, 0x6c, 0xaa, 0x92, 0xca, 0x6a, 0xbc, 0xa5, 0x93, 0x5d, + 0x92, 0xae, 0xa8, 0xb8, 0x65, 0x71, 0x53, 0xcf, 0x3e, 0x5c, 0x40, 0x7b, + 
0x5b, 0x42, 0x90, 0xb5, 0x94, 0xbf, 0xc0, 0x45, 0xab, 0xc9, 0xda, 0x2d, + 0xad, 0x50, 0x3b, 0x2b, 0xa4, 0x8d, 0x36, 0x84, 0x73, 0x8d, 0xbf, 0x9c, + 0x8d, 0x89, 0x6f, 0x4b, 0x55, 0x4e, 0x3f, 0x80, 0x93, 0x42, 0xa4, 0x4e, + 0x9b, 0x2f, 0xad, 0x8e, 0x60, 0x8d, 0x6f, 0x9c, 0xc4, 0x5f, 0xa7, 0xb0, + 0x8f, 0x84, 0x50, 0x51, 0x91, 0x84, 0x50, 0x7f, 0x77, 0x29, 0x67, 0x69, + 0x8f, 0xce, 0x45, 0xbc, 0x7e, 0xbd, 0xc2, 0x48, 0xb8, 0x8c, 0x4b, 0x5c, + 0x7c, 0x50, 0x69, 0x67, 0x2b, 0xcf, 0x9e, 0x7a, 0x84, 0x7f, 0xa7, 0x47, + 0x77, 0x66, 0x4d, 0x9a, 0xb2, 0xa5, 0x59, 0x89, 0xa3, 0xcd, 0xc6, 0x30, + 0x9b, 0x36, 0xa8, 0x9e, 0xbe, 0xbe, 0xbb, 0x76, 0xc2, 0x6d, 0x7c, 0x78, + 0x9c, 0x3c, 0x8d, 0x5e, 0x4e, 0x96, 0x65, 0xb4, 0x43, 0x7c, 0x76, 0x39, + 0xa4, 0x5f, 0xb5, 0x93, 0x83, 0x9b, 0xb3, 0xa0, 0xbd, 0xc1, 0xd5, 0x87, + 0x61, 0x78, 0x95, 0xe2, 0x6f, 0x88, 0x5d, 0x94, 0x89, 0x28, 0x74, 0x5c, + 0xa9, 0xbc, 0x4c, 0x54, 0x3c, 0x4d, 0x6c, 0x69, 0x74, 0x66, 0xc4, 0x72, + 0x38, 0x93, 0x4b, 0x67, 0xb3, 0x95, 0x90, 0xc9, 0x57, 0x62, 0x67, 0x37, + 0x4e, 0xaf, 0x57, 0xa3, 0x93, 0xbb, 0xa0, 0x41, 0xab, 0x85, 0x64, 0x81, + 0x9d, 0x60, 0xcb, 0x41, 0x49, 0x7b, 0x33, 0x66, 0x50, 0x79, 0xc3, 0x58, + 0x9b, 0x80, 0xcb, 0xbd, 0x7d, 0xa6, 0x98, 0x6d, 0x63, 0x59, 0x87, 0x88, + 0x74, 0xda, 0xab, 0xb5, 0x65, 0x5b, 0x9c, 0x89, 0xd4, 0x95, 0xa8, 0xa8, + 0xb1, 0x9e, 0x73, 0x4e, 0x3b, 0x62, 0x60, 0x68, 0x74, 0x50, 0x44, 0xc1, + 0x6d, 0xbb, 0x6a, 0x2e, 0x82, 0x53, 0xa2, 0xa1, 0xa9, 0x72, 0x4b, 0x67, + 0x57, 0x8b, 0x7a, 0x38, 0x63, 0xb7, 0x38, 0x54, 0x8b, 0xa1, 0xbf, 0x51, + 0xb1, 0x6c, 0x39, 0x94, 0x2f, 0x3b, 0x6a, 0x94, 0x54, 0x25, 0x7f, 0xbb, + 0x5b, 0xbb, 0xb6, 0x7a, 0xbf, 0xb1, 0x75, 0xc1, 0xd5, 0x55, 0x57, 0x9b, + 0x5c, 0xab, 0xd0, 0x64, 0x72, 0x88, 0xcc, 0xf2, 0xc7, 0x3c, 0x4a, 0xb4, + 0x4c, 0x84, 0x57, 0x3b, 0x77, 0x81, 0x74, 0x89, 0x93, 0x84, 0x3e, 0xb5, + 0x87, 0x8d, 0x98, 0x4b, 0x8c, 0x41, 0x46, 0x84, 0xba, 0x59, 0x54, 0x7f, + 0x2b, 0x45, 0x61, 0x57, 0x7d, 0x49, 0xa5, 0x50, 0xaa, 0x6b, 0x88, 0x56, + 0x88, 0xd1, 0xa4, 0xa2, 0x90, 0xa7, 0x3b, 0x77, 0xb3, 0x4e, 0x6b, 0x5d, + 0x88, 0x9f, 0x7e, 0x78, 0x1c, 0x5c, 0x49, 0xd4, 0x59, 0x75, 0xba, 0x63, + 0x86, 0xb9, 0xc1, 0x82, 0xd6, 0x68, 0x74, 0xbc, 0x61, 0x61, 0x9e, 0xb4, + 0x84, 0xb3, 0x66, 0x92, 0x4d, 0x4d, 0x85, 0xac, 0xb0, 0xba, 0xa8, 0xa6, + 0xb9, 0xb8, 0xc8, 0xa1, 0x8d, 0x9e, 0x73, 0xe2, 0xcc, 0x74, 0xa4, 0x78, + 0x4b, 0x49, 0x8b, 0xc1, 0x3b, 0x3c, 0x4b, 0xb4, 0x30, 0xcf, 0x58, 0xca, + 0xd4, 0x89, 0xa1, 0x61, 0x51, 0x43, 0x79, 0x9b, 0x3b, 0x39, 0xb8, 0xa1, + 0x87, 0x59, 0x3c, 0xc7, 0x3a, 0x65, 0x5a, 0xad, 0x9a, 0xaa, 0x72, 0xa8, + 0xc6, 0x72, 0x5c, 0xb3, 0xc8, 0xc8, 0xd7, 0xac, 0x97, 0x5e, 0xca, 0xc0, + 0x7a, 0xc5, 0x9b, 0xa7, 0x6e, 0x52, 0x83, 0x9f, 0x60, 0x93, 0x56, 0x5e, + 0x4a, 0xbe, 0x93, 0x59, 0x9e, 0x8f, 0x49, 0x8d, 0x44, 0x7b, 0xe2, 0x9a, + 0x53, 0xda, 0x80, 0x39, 0xc5, 0x65, 0x9b, 0x80, 0x52, 0xa1, 0x38, 0x3e, + 0x8c, 0xc8, 0xd8, 0xdc, 0x79, 0x82, 0x1f, 0x71, 0x95, 0xcb, 0x48, 0x1c, + 0x32, 0xd7, 0x9e, 0xba, 0x8e, 0x6e, 0xd2, 0x53, 0xcf, 0xb0, 0x6a, 0xac, + 0xb0, 0x95, 0xba, 0x30, 0x91, 0x8f, 0x50, 0x7d, 0x9c, 0xc0, 0x3f, 0x9e, + 0x82, 0x9f, 0x7f, 0xc2, 0xd9, 0x66, 0x93, 0xcc, 0x65, 0x44, 0x7a, 0x85, + 0xbe, 0x8b, 0x7d, 0x78, 0x7e, 0x82, 0x7b, 0xac, 0x55, 0x71, 0x99, 0xae, + 0xcc, 0x57, 0xc6, 0x46, 0xa8, 0xa4, 0x9b, 0xc3, 0x82, 0x54, 0x48, 0x33, + 0x46, 0xa1, 0x92, 0xd6, 0xa3, 0xb5, 0xb6, 0x8b, 0x67, 0x6b, 0x6c, 0xb1, + 0x4e, 0x91, 0x8b, 0xc7, 0x81, 0x8e, 0x58, 0x97, 0x71, 0x6d, 0x92, 0xb8, + 0x94, 0xa1, 0x83, 0xae, 0x71, 0xba, 0x25, 0xb7, 0x5c, 0x68, 0xd9, 0xd2, + 
0xcb, 0xa7, 0xcd, 0x94, 0x50, 0xc8, 0x5f, 0x7f, 0x8a, 0x3b, 0x73, 0xc6, + 0x72, 0x55, 0x3f, 0x6b, 0x4f, 0x77, 0x67, 0xa3, 0x5d, 0x73, 0x7a, 0xb3, + 0x60, 0xb0, 0xa9, 0xbc, 0xca, 0xb8, 0x5e, 0x72, 0x3a, 0xcf, 0x57, 0x9b, + 0x80, 0x32, 0x5e, 0x30, 0xc7, 0x77, 0x86, 0x68, 0x4b, 0x68, 0x39, 0x49, + 0xb2, 0x3e, 0xe0, 0x55, 0x9e, 0x4f, 0x62, 0x2a, 0x6a, 0x62, 0xa0, 0x46, + 0x34, 0x49, 0xb5, 0x85, 0x4d, 0x92, 0xc5, 0xb9, 0x60, 0x95, 0xa0, 0xdd, + 0x78, 0x97, 0x44, 0x72, 0x63, 0x5f, 0xe3, 0x8f, 0x18, 0xbe, 0x40, 0x57, + 0x3f, 0xb5, 0x44, 0x9e, 0xb5, 0x99, 0xa9, 0x48, 0x5a, 0x4c, 0x46, 0x6e, + 0x7d, 0x61, 0x86, 0x96, 0x53, 0x90, 0x43, 0x78, 0x67, 0x36, 0x5d, 0xb1, + 0x79, 0x5b, 0x9e, 0x52, 0x5b, 0xc6, 0x43, 0x57, 0x8e, 0x82, 0xac, 0x4e, + 0x81, 0x8a, 0x85, 0xb2, 0x41, 0xd5, 0x80, 0x92, 0x52, 0x6a, 0x65, 0x7c, + 0x63, 0x77, 0xa5, 0x50, 0x9e, 0x86, 0xaf, 0x75, 0xb5, 0x6b, 0x76, 0x91, + 0x8d, 0xa8, 0x7b, 0xbf, 0xcc, 0x7a, 0x5a, 0x98, 0x63, 0x51, 0xc7, 0xad, + 0xa9, 0xbd, 0x97, 0x67, 0x8b, 0xb3, 0x8e, 0x7e, 0xa4, 0xc0, 0x33, 0x9e, + 0x91, 0xcd, 0x97, 0xdd, 0x88, 0x94, 0xae, 0x9c, 0x5c, 0x88, 0x4a, 0x47, + 0xc8, 0xbe, 0xca, 0x41, 0xc5, 0x55, 0x36, 0x6d, 0xb6, 0x80, 0x2c, 0x4e, + 0x97, 0x6a, 0x5c, 0x24, 0x59, 0x3d, 0xc0, 0x6c, 0x61, 0xbd, 0x94, 0xa4, + 0x90, 0xbe, 0x85, 0xc2, 0x68, 0xd0, 0xb5, 0x49, 0x48, 0xb9, 0x6c, 0x9d, + 0x76, 0xa1, 0x99, 0x7b, 0x83, 0xaa, 0x3c, 0xd4, 0x5b, 0xc4, 0x88, 0xac, + 0xaa, 0x78, 0x7b, 0x6a, 0xa9, 0xa3, 0xb5, 0xd6, 0xdf, 0x73, 0xe0, 0x44, + 0xa3, 0xb4, 0x9a, 0x75, 0x5f, 0xc0, 0xbf, 0x4a, 0xbc, 0xa4, 0x5e, 0xb5, + 0x4c, 0xb1, 0xbb, 0x8b, 0x1f, 0x50, 0x5e, 0x56, 0xbb, 0x35, 0x36, 0x40, + 0x34, 0xaa, 0xa6, 0xb2, 0xc6, 0x58, 0xd7, 0x70, 0xc2, 0x4d, 0x48, 0xc2, + 0xca, 0xb8, 0x95, 0x73, 0x7d, 0x3a, 0x53, 0x51, 0x86, 0x77, 0x53, 0x6d, + 0x4d, 0x23, 0xa8, 0x58, 0xa8, 0x92, 0xcf, 0xc6, 0x32, 0xac, 0x64, 0x73, + 0x8e, 0x47, 0x4b, 0x49, 0x91, 0x9e, 0xd4, 0xa0, 0x83, 0xc3, 0x4a, 0x66, + 0xb3, 0x8f, 0x6b, 0xa2, 0xc0, 0x88, 0xbd, 0x74, 0xc3, 0x50, 0xcb, 0xc2, + 0x60, 0x71, 0x75, 0x56, 0x36, 0x8d, 0x5e, 0x80, 0xe3, 0xb7, 0x68, 0x98, + 0x5d, 0x66, 0x9b, 0x9e, 0xa1, 0x6c, 0x80, 0x66, 0x97, 0x7a, 0x7a, 0x7e, + 0x74, 0x74, 0x42, 0x48, 0x85, 0xe3, 0xbc, 0x4c, 0x3c, 0xa8, 0xa0, 0x93, + 0x71, 0xca, 0x31, 0x2f, 0x85, 0xa1, 0xa1, 0x49, 0x5d, 0x37, 0x41, 0x6a, + 0x94, 0xb2, 0x57, 0x60, 0x64, 0xc3, 0x64, 0xbd, 0xa0, 0x3c, 0x9c, 0xbf, + 0xc6, 0x74, 0x96, 0x7b, 0x5c, 0x6e, 0x36, 0x59, 0x94, 0x88, 0xc0, 0x95, + 0x63, 0xc2, 0x68, 0xd2, 0xaf, 0xcf, 0xcc, 0x31, 0xc5, 0x50, 0x40, 0x2d, + 0xad, 0xaf, 0x5f, 0x7b, 0x62, 0x32, 0xa8, 0x58, 0x47, 0x63, 0x99, 0x8b, + 0xb2, 0x65, 0x2f, 0x90, 0x81, 0xbc, 0xc7, 0x92, 0x9a, 0x71, 0xb8, 0x55, + 0x93, 0xca, 0x70, 0x91, 0x71, 0xb2, 0x93, 0x44, 0x7e, 0x5a, 0xc5, 0x86, + 0xbe, 0x82, 0xc9, 0xa5, 0xa5, 0x77, 0x3b, 0x98, 0x76, 0x99, 0x97, 0xc6, + 0x40, 0x46, 0xb8, 0x87, 0x5d, 0x84, 0x56, 0xdc, 0x65, 0x34, 0xc1, 0x89, + 0x61, 0x3b, 0xb4, 0xa9, 0x8b, 0xb5, 0x95, 0x52, 0x7f, 0x8a, 0x8e, 0x99, + 0xa7, 0x7c, 0x50, 0x45, 0x4a, 0xb9, 0xca, 0x52, 0x64, 0x6d, 0x82, 0x35, + 0x50, 0xbf, 0xd9, 0x39, 0x6e, 0xe7, 0x4a, 0x39, 0xd0, 0xab, 0xb5, 0xce, + 0xc1, 0xa9, 0x4a, 0x73, 0x45, 0xab, 0x5a, 0xb1, 0x54, 0x94, 0xaf, 0x72, + 0xc9, 0x98, 0x9c, 0x8d, 0x39, 0x93, 0x55, 0x51, 0x62, 0xc6, 0x53, 0xbb, + 0xd5, 0xed, 0xbe, 0xaa, 0x93, 0x3a, 0x5f, 0x4e, 0x7b, 0xd4, 0x65, 0xad, + 0xa8, 0xc7, 0x91, 0xb0, 0x99, 0x80, 0x4e, 0x6c, 0xa6, 0xa7, 0x48, 0xa5, + 0x5b, 0x83, 0x87, 0x99, 0xb0, 0x92, 0x99, 0x75, 0xda, 0x83, 0x5e, 0x95, + 0xa7, 0xea, 0xc2, 0x68, 0x22, 0xbc, 0x8a, 0x60, 0xd2, 0xaf, 0x3f, 0x8a, + 
0x76, 0x78, 0x64, 0xb2, 0xcd, 0xc7, 0x3e, 0x9e, 0x70, 0x79, 0x6d, 0xbc, + 0xaa, 0x8a, 0x5d, 0xa7, 0x8d, 0x55, 0x99, 0x96, 0x5e, 0xc9, 0x84, 0x7d, + 0x80, 0x9b, 0x6f, 0x5e, 0xbd, 0x95, 0x5e, 0x32, 0x9e, 0xa2, 0x57, 0x5c, + 0x5b, 0x9e, 0x5c, 0xb3, 0xab, 0xb2, 0xac, 0xc3, 0x80, 0xa8, 0xb7, 0x8e, + 0x71, 0x3b, 0x79, 0xb7, 0xce, 0x60, 0x7e, 0x59, 0x7f, 0x6a, 0xd0, 0xcb, + 0x42, 0xa7, 0x6e, 0xc0, 0xb1, 0x74, 0x95, 0x3c, 0xbf, 0xc0, 0x44, 0xb5, + 0xc9, 0x53, 0x50, 0x59, 0xae, 0x8b, 0xcd, 0x37, 0x77, 0x79, 0xca, 0x69, + 0x7f, 0xb4, 0xa1, 0x60, 0x82, 0xb6, 0x5f, 0x33, 0xbb, 0xbc, 0x62, 0xbf, + 0xe8, 0x65, 0x40, 0x62, 0xad, 0x80, 0xc8, 0x3c, 0x49, 0xb0, 0x56, 0xc7, + 0x55, 0xc0, 0x43, 0x54, 0x38, 0xb3, 0x79, 0x9e, 0x98, 0xc5, 0x44, 0x5d, + 0xbe, 0x98, 0xa3, 0x76, 0x41, 0xb6, 0xb8, 0x89, 0xad, 0xb0, 0x65, 0x73, + 0x7f, 0x87, 0xc9, 0x91, 0x33, 0xd6, 0x71, 0x99, 0x90, 0xa9, 0x54, 0x63, + 0x65, 0xd2, 0xca, 0x78, 0x84, 0x8c, 0x8c, 0x69, 0xb4, 0x58, 0x46, 0xb7, + 0x3d, 0x3f, 0x3d, 0x3b, 0x9c, 0x95, 0x97, 0xcf, 0xb9, 0x61, 0x42, 0x7a, + 0xc8, 0x94, 0x92, 0x7f, 0xac, 0xc4, 0x92, 0x3c, 0x8a, 0x63, 0xa5, 0x89, + 0x86, 0x9d, 0x8b, 0xa7, 0xb3, 0x4b, 0x44, 0x9b, 0x6d, 0x50, 0xd3, 0xb8, + 0x56, 0x3b, 0xb0, 0x47, 0xd5, 0x81, 0x93, 0xc2, 0x33, 0x4c, 0xbf, 0x41, + 0xab, 0xb9, 0xae, 0x88, 0x8e, 0xa7, 0xb0, 0x74, 0x37, 0x6a, 0x48, 0x8e, + 0x80, 0xbd, 0x5b, 0xa3, 0x5c, 0xc5, 0xc9, 0x3f, 0x2c, 0x6e, 0x60, 0xb3, + 0x4a, 0x79, 0xb8, 0xa4, 0xb8, 0x5c, 0x6e, 0xd4, 0xc3, 0x7d, 0xb1, 0x94, + 0xc4, 0x3f, 0x6b, 0xc8, 0x97, 0x46, 0x36, 0xbf, 0x4d, 0x97, 0x45, 0x4b, + 0xa4, 0x4a, 0x9e, 0x64, 0xb8, 0x3f, 0xc7, 0xba, 0xaf, 0x7b, 0x48, 0xad, + 0xd9, 0xcf, 0x96, 0xc5, 0xd9, 0x8b, 0x54, 0xc6, 0xd3, 0x30, 0xa4, 0x50, + 0xa2, 0x48, 0xbd, 0x4b, 0xab, 0x98, 0x69, 0x63, 0xb1, 0x55, 0xd2, 0xc2, + 0x97, 0xc7, 0x54, 0x9c, 0x68, 0x8e, 0xe7, 0x91, 0xc1, 0x95, 0x7c, 0x3b, + 0xbf, 0xa0, 0x3d, 0xc9, 0x56, 0x56, 0x86, 0xd8, 0xca, 0xd1, 0x7e, 0x82, + 0x50, 0x82, 0xe3, 0x55, 0x8c, 0x98, 0xc0, 0x95, 0x8d, 0xcd, 0x52, 0x7f, + 0xba, 0xcf, 0xc7, 0x3f, 0x35, 0x6a, 0x77, 0xc3, 0x67, 0xc0, 0x96, 0x96, + 0x8c, 0xd7, 0x94, 0xc0, 0xa0, 0x7b, 0xaa, 0xb6, 0x45, 0x68, 0x90, 0x41, + 0x2d, 0xbd, 0x35, 0x29, 0x56, 0xab, 0x67, 0x63, 0xc0, 0x9c, 0x9f, 0x42, + 0x52, 0xb9, 0xac, 0x43, 0x4c, 0x82, 0x40, 0xb9, 0x82, 0xab, 0x82, 0xc1, + 0x7a, 0x5b, 0x95, 0x8e, 0x71, 0xa9, 0xc0, 0x6f, 0xc4, 0xcf, 0x7c, 0xb2, + 0x83, 0x42, 0x97, 0x60, 0x75, 0x5f, 0x87, 0xa5, 0x71, 0xcd, 0xdd, 0x8d, + 0xa5, 0x4f, 0x2f, 0xbf, 0x96, 0x45, 0x66, 0x8c, 0x2d, 0x58, 0x6d, 0x60, + 0xa3, 0x47, 0x93, 0x6e, 0x59, 0x8b, 0xa6, 0x84, 0x89, 0x98, 0x78, 0xa1, + 0x3e, 0x57, 0x6e, 0x75, 0x6e, 0xb0, 0x89, 0x7f, 0x30, 0xa5, 0xc2, 0x4e, + 0x74, 0x67, 0x9b, 0x5c, 0xba, 0x89, 0xa8, 0xb3, 0xd0, 0x96, 0x7b, 0x55, + 0xd1, 0x89, 0x9c, 0x2c, 0xc1, 0x68, 0x65, 0x47, 0x69, 0xc0, 0x99, 0xce, + 0x4e, 0x57, 0xbb, 0x40, 0xa1, 0x8c, 0x83, 0x57, 0x29, 0x99, 0x74, 0xb9, + 0x77, 0xa4, 0x4e, 0x7e, 0xcb, 0x67, 0x82, 0x90, 0x46, 0xa5, 0xad, 0x3e, + 0x58, 0xa4, 0xb9, 0xbb, 0x49, 0xe2, 0xa9, 0x49, 0xa6, 0x50, 0xab, 0xa7, + 0x6f, 0x78, 0x81, 0xb3, 0x6d, 0x71, 0x33, 0x57, 0xba, 0xc4, 0x5f, 0x52, + 0x7f, 0x20, 0xc3, 0x92, 0x29, 0xa7, 0x49, 0x6c, 0xc3, 0x62, 0x7e, 0x5a, + 0x6f, 0x46, 0x8d, 0xb9, 0x59, 0x51, 0x90, 0xd0, 0xb8, 0x95, 0x54, 0x5d, + 0xcd, 0x7c, 0x37, 0x58, 0x4c, 0x79, 0x44, 0xc6, 0x58, 0x55, 0x87, 0x86, + 0x34, 0x42, 0x86, 0x3f, 0x88, 0xa7, 0x6c, 0x65, 0xbd, 0x9e, 0x6d, 0xa8, + 0x94, 0x48, 0x97, 0xaa, 0x57, 0xaf, 0x43, 0x3f, 0x86, 0xbc, 0x44, 0xc5, + 0x47, 0xc6, 0xa9, 0x40, 0xc3, 0x92, 0xa5, 0x9b, 0xa1, 0x82, 0x3f, 0x96, + 
0xc5, 0x79, 0xcd, 0xa0, 0xb1, 0x9d, 0x61, 0x68, 0xbd, 0x3b, 0x81, 0x8e, + 0x6a, 0x31, 0xcc, 0x3d, 0x5b, 0x91, 0x49, 0x9e, 0x9a, 0x60, 0x34, 0xb8, + 0x70, 0x7d, 0xac, 0xb8, 0xbe, 0xb2, 0xaa, 0x9e, 0x70, 0xa0, 0x86, 0x6e, + 0x7c, 0x59, 0x3f, 0x99, 0x9e, 0xa7, 0xc3, 0x6e, 0xa3, 0xbf, 0x68, 0x9f, + 0x86, 0x53, 0x77, 0xc1, 0x8b, 0xc0, 0x52, 0x65, 0xb0, 0xdc, 0xa1, 0x78, + 0x4d, 0xba, 0x53, 0xd6, 0x49, 0x60, 0xad, 0x77, 0xa9, 0xb1, 0x45, 0x8e, + 0x57, 0x68, 0xe1, 0x3d, 0xb1, 0xb3, 0x75, 0x85, 0xba, 0x6a, 0x8e, 0x62, + 0x7d, 0x88, 0x9b, 0x89, 0xa4, 0x77, 0x37, 0x8c, 0xa1, 0x39, 0x78, 0x86, + 0x89, 0xa7, 0x8a, 0x9b, 0x5e, 0xd8, 0x65, 0x57, 0x57, 0xaa, 0xb5, 0x4e, + 0xa9, 0x81, 0x93, 0xc5, 0x8c, 0xc0, 0x36, 0x87, 0xc8, 0x2b, 0xc0, 0xa2, + 0x9c, 0xd5, 0xab, 0x85, 0xc8, 0x85, 0xc6, 0x77, 0x5f, 0x8a, 0xbc, 0x52, + 0x72, 0xba, 0xd0, 0x6f, 0xd7, 0xe2, 0xe3, 0xb8, 0xb7, 0xa1, 0x37, 0x2b, + 0x66, 0x95, 0x63, 0x9f, 0x73, 0xa2, 0x80, 0x6d, 0xbd, 0xe0, 0x8e, 0xc8, + 0xb4, 0xa7, 0x89, 0xac, 0x85, 0x7a, 0xc7, 0xab, 0xac, 0xda, 0x7f, 0x4f, + 0xcf, 0xc6, 0x6a, 0x94, 0x83, 0x5b, 0x79, 0x51, 0xa2, 0xaf, 0xbe, 0x52, + 0x4d, 0x41, 0x74, 0xb0, 0xe1, 0xbd, 0x7a, 0x78, 0xdf, 0x90, 0x6d, 0xa3, + 0x2f, 0x80, 0xcf, 0x54, 0x48, 0xa4, 0x8f, 0x39, 0x52, 0x7b, 0x3d, 0x8d, + 0x88, 0xd2, 0xb1, 0x48, 0x8e, 0xa9, 0x99, 0x5c, 0xa9, 0x66, 0xc6, 0x52, + 0x49, 0x59, 0xb8, 0x8c, 0x7c, 0x7d, 0x45, 0xbb, 0x87, 0xb9, 0xb8, 0x4a, + 0x64, 0x74, 0x79, 0x72, 0x4c, 0xbc, 0x24, 0x8c, 0xac, 0xa7, 0x77, 0x55, + 0x51, 0x83, 0x69, 0x86, 0x83, 0x56, 0x4f, 0xba, 0x4e, 0xd1, 0x7b, 0x9b, + 0x92, 0xc2, 0x5a, 0x3a, 0x6f, 0xd4, 0x90, 0xe3, 0xc1, 0x80, 0x8b, 0x48, + 0x66, 0x59, 0x98, 0x85, 0x3d, 0x6c, 0x73, 0x87, 0x63, 0x36, 0xab, 0x56, + 0xa4, 0xb1, 0x66, 0xa7, 0xc7, 0xb9, 0x6a, 0xad, 0x59, 0xae, 0x71, 0xa0, + 0x3b, 0x7a, 0xbd, 0x54, 0xb3, 0xc8, 0x88, 0x32, 0x48, 0xb7, 0xc9, 0xd0, + 0x81, 0xb1, 0xd2, 0xc7, 0x70, 0x9f, 0x36, 0x58, 0xe5, 0x65, 0x59, 0xb0, + 0x47, 0x73, 0x4e, 0x7c, 0x80, 0x67, 0x75, 0x6e, 0x3b, 0x5a, 0xa2, 0xa1, + 0x6c, 0xc7, 0xc7, 0xa2, 0x8e, 0x46, 0x46, 0x33, 0xc7, 0x76, 0x51, 0xd9, + 0xe0, 0x6e, 0x5e, 0x9d, 0x8e, 0x68, 0x5c, 0x43, 0x9a, 0x67, 0x6c, 0x8f, + 0xbf, 0x7c, 0x9f, 0x86, 0x52, 0x2b, 0x97, 0x37, 0x36, 0x75, 0x4c, 0x6a, + 0x60, 0x42, 0x8f, 0x9c, 0xdc, 0x2e, 0xd6, 0x80, 0x3c, 0x9e, 0xc0, 0xa7, + 0xaa, 0xb7, 0x43, 0x7d, 0xb2, 0xb9, 0x7f, 0x8e, 0x42, 0x6f, 0x33, 0x6d, + 0xc8, 0x93, 0x6b, 0xc1, 0x60, 0x9e, 0xa7, 0x82, 0xbb, 0xaf, 0x2d, 0x6c, + 0x79, 0xbe, 0x57, 0x36, 0xc2, 0x77, 0x7d, 0xb8, 0x43, 0x42, 0xc1, 0x7f, + 0xb3, 0x62, 0x58, 0xbb, 0x36, 0x91, 0x82, 0x4c, 0x94, 0x43, 0xa0, 0x38, + 0x8f, 0x63, 0x42, 0x55, 0xcb, 0x58, 0x6e, 0x97, 0x48, 0xbb, 0xa5, 0xa4, + 0x81, 0x88, 0x32, 0x45, 0x6c, 0xc8, 0x6d, 0x56, 0x5e, 0xa6, 0x9c, 0xcc, + 0x5e, 0x48, 0xa7, 0xc3, 0x42, 0x8b, 0x88, 0x3f, 0xa6, 0x90, 0x95, 0xa9, + 0x38, 0x86, 0x68, 0xa8, 0x44, 0xca, 0x94, 0x5d, 0x4e, 0x49, 0x50, 0xc0, + 0x49, 0x68, 0x43, 0xa3, 0x55, 0x74, 0x8d, 0x40, 0x95, 0xcd, 0x78, 0xcc, + 0xb2, 0x3e, 0xc1, 0x9f, 0xd1, 0x3a, 0xa8, 0x96, 0x41, 0x50, 0x7c, 0x72, + 0x7b, 0x54, 0x69, 0x97, 0x63, 0xc3, 0x3c, 0x9e, 0xc1, 0x9e, 0x95, 0xae, + 0xc7, 0xba, 0xa6, 0x57, 0x78, 0x99, 0x4e, 0x5c, 0x8a, 0xb7, 0xba, 0xb6, + 0x76, 0x6a, 0x63, 0x55, 0xba, 0x80, 0x34, 0x6c, 0x45, 0xa5, 0x4b, 0xa4, + 0x79, 0x5f, 0x9e, 0x38, 0x72, 0x3f, 0x82, 0x4c, 0x81, 0xa5, 0x94, 0x5c, + 0xb3, 0xc4, 0x6e, 0xb0, 0xa4, 0x90, 0xb2, 0x68, 0xb0, 0xc4, 0x4e, 0xa1, + 0xa7, 0x8c, 0xc1, 0xc6, 0x38, 0xa4, 0x72, 0x80, 0xc0, 0x62, 0xb0, 0x7f, + 0xac, 0x80, 0xa1, 0x3f, 0x3c, 0x41, 0xba, 0xca, 0x8c, 0x8e, 0xc0, 0x9d, + 
0x54, 0x43, 0x4a, 0xa7, 0x51, 0x34, 0xd3, 0xc9, 0x6b, 0x70, 0xbb, 0xb6, + 0x55, 0x91, 0x8e, 0x9f, 0x35, 0xc5, 0x7a, 0x48, 0x5b, 0x9c, 0x89, 0xcc, + 0x8e, 0x3a, 0xa6, 0xc2, 0x90, 0xc9, 0x30, 0x9f, 0x87, 0x6c, 0xae, 0x63, + 0x2f, 0x52, 0x7d, 0x42, 0x59, 0x32, 0xc8, 0xa1, 0x33, 0x93, 0xc5, 0x5c, + 0x55, 0x37, 0xb9, 0xb1, 0xc0, 0x8e, 0xcf, 0x3e, 0xcf, 0x69, 0xa1, 0x83, + 0x41, 0x49, 0x72, 0x4d, 0xc2, 0xa2, 0x34, 0x36, 0xc6, 0xa0, 0x68, 0x75, + 0x9a, 0xc7, 0x4f, 0x85, 0xa8, 0x78, 0x5f, 0x6f, 0x9c, 0x69, 0x5d, 0xa9, + 0xa1, 0x4c, 0x7a, 0xc2, 0xaa, 0x6c, 0x56, 0xb7, 0x7f, 0x80, 0x9f, 0xc8, + 0xc3, 0x51, 0x46, 0x8c, 0x37, 0xbc, 0xb7, 0xa3, 0x95, 0x69, 0x55, 0xd3, + 0x82, 0x79, 0x55, 0x55, 0xbd, 0x50, 0x71, 0xa2, 0x99, 0xbf, 0x9f, 0xd2, + 0x3e, 0x89, 0x61, 0x49, 0xaa, 0xcd, 0xd4, 0x85, 0xb7, 0x6b, 0x3f, 0x9d, + 0xd5, 0xe2, 0xb3, 0x56, 0x8d, 0x74, 0xc9, 0x95, 0x5e, 0xaf, 0x50, 0x4e, + 0x71, 0x7e, 0x82, 0xa3, 0x6c, 0x8f, 0x8a, 0xbc, 0x5a, 0xcb, 0x5a, 0x3c, + 0xc1, 0xa4, 0xa2, 0x84, 0x3e, 0x9a, 0x4c, 0x8c, 0x64, 0xb0, 0x97, 0x4d, + 0xb3, 0xa0, 0x82, 0x7f, 0x87, 0x7b, 0x7e, 0x66, 0x45, 0x84, 0xa1, 0xc4, + 0xae, 0x49, 0x89, 0xc9, 0x4e, 0xd1, 0x73, 0x5c, 0x46, 0xa5, 0x41, 0x7e, + 0x84, 0x40, 0x43, 0x8c, 0x5b, 0x9b, 0xd3, 0xa6, 0x8f, 0x46, 0xbb, 0xba, + 0x8e, 0x65, 0x41, 0xc9, 0x83, 0x55, 0x8e, 0x6a, 0x77, 0x64, 0x8b, 0x71, + 0x64, 0xae, 0xae, 0x52, 0x36, 0x3f, 0xc3, 0x86, 0x40, 0x96, 0xce, 0x6f, + 0xb7, 0xb0, 0xac, 0x64, 0x95, 0x60, 0x41, 0x58, 0x36, 0x42, 0x57, 0x8b, + 0x57, 0xd7, 0x57, 0xb6, 0xb8, 0x98, 0x8b, 0x91, 0x4d, 0xc1, 0x3d, 0x5a, + 0x8c, 0x6e, 0xd3, 0x4d, 0x42, 0x53, 0x97, 0xb1, 0x90, 0xca, 0x6a, 0x65, + 0x97, 0xc4, 0x7d, 0x67, 0x6e, 0x5d, 0xa0, 0xa4, 0x3b, 0xbe, 0x96, 0x84, + 0x86, 0xa3, 0x6c, 0x8c, 0x92, 0x42, 0x37, 0x51, 0x5b, 0x56, 0x7f, 0x63, + 0x63, 0xc7, 0x4e, 0xb0, 0x95, 0x33, 0x83, 0x43, 0x80, 0x93, 0xa6, 0x96, + 0x90, 0x63, 0x94, 0x47, 0xc1, 0x4f, 0x6e, 0x7c, 0x88, 0xc2, 0x62, 0x45, + 0xb9, 0xbe, 0xc7, 0xa6, 0xb7, 0xb1, 0x73, 0x87, 0x55, 0x8d, 0x7f, 0x88, + 0x44, 0xd8, 0x4f, 0xc9, 0x99, 0x29, 0x85, 0x3b, 0x7c, 0xba, 0x51, 0x68, + 0x7d, 0x7c, 0x8a, 0xc4, 0x70, 0x62, 0xc6, 0x82, 0xbc, 0x35, 0x64, 0x8a, + 0x58, 0x9b, 0x72, 0x4d, 0x3b, 0xcc, 0x93, 0xa4, 0x95, 0xad, 0xac, 0x38, + 0xa9, 0xae, 0x92, 0x45, 0x3f, 0xa5, 0x43, 0xad, 0x54, 0xc7, 0xa5, 0x5c, + 0x93, 0x2e, 0xa2, 0x8b, 0x97, 0x3d, 0xcb, 0x72, 0x9b, 0x68, 0x42, 0x7f, + 0x58, 0x85, 0x51, 0xc0, 0x4d, 0x67, 0x54, 0x97, 0xb1, 0xce, 0x69, 0x64, + 0x60, 0x84, 0x84, 0xa2, 0x70, 0x7a, 0x91, 0x8b, 0x59, 0xa2, 0x63, 0x6c, + 0xa8, 0x87, 0xcd, 0xc7, 0x59, 0x76, 0xbb, 0x7d, 0xbb, 0xcb, 0x40, 0xb4, + 0x6d, 0x76, 0x5d, 0xa8, 0x76, 0x33, 0xbc, 0x64, 0x85, 0xbd, 0xa5, 0xca, + 0x74, 0x38, 0x6a, 0x85, 0xc1, 0xd0, 0x8d, 0xb6, 0x50, 0x31, 0xca, 0xc6, + 0x89, 0x3b, 0x84, 0xb4, 0x5c, 0x61, 0xba, 0xcc, 0x8d, 0xb6, 0x9f, 0xc3, + 0x7e, 0x8e, 0x4a, 0x42, 0xa1, 0x50, 0x9b, 0xcf, 0x9f, 0xcc, 0x8d, 0x55, + 0x57, 0x67, 0x96, 0x83, 0xcc, 0x92, 0x97, 0x42, 0xc7, 0x86, 0x50, 0x79, + 0x89, 0x2f, 0xcb, 0x6d, 0xbc, 0xad, 0x74, 0xcc, 0x92, 0x81, 0x79, 0x8b, + 0x96, 0x3d, 0x45, 0x9d, 0x5a, 0xa6, 0xa5, 0xb8, 0x2e, 0x86, 0xc5, 0x79, + 0x68, 0xcb, 0x87, 0xc3, 0x6e, 0xb0, 0x5a, 0xce, 0x4c, 0x6d, 0x37, 0xc3, + 0x9a, 0x85, 0x90, 0x5c, 0x7f, 0x6b, 0x81, 0x48, 0x3c, 0xab, 0xb3, 0x83, + 0xb5, 0xb9, 0xb6, 0xc2, 0xb0, 0x57, 0x27, 0xbf, 0x75, 0x3a, 0x6c, 0xbf, + 0x66, 0x69, 0xb1, 0xbe, 0x48, 0xc2, 0xc5, 0x53, 0x88, 0x54, 0xa7, 0x67, + 0xc9, 0x37, 0xab, 0xb6, 0x36, 0x3b, 0x55, 0x95, 0x73, 0x3c, 0x5d, 0xa6, + 0x8c, 0x99, 0xb9, 0x5d, 0x75, 0xe2, 0xaa, 0x79, 0x66, 0xbf, 0x46, 0xc7, + 
0x67, 0x43, 0x71, 0xbf, 0x9e, 0xcc, 0xc2, 0xb4, 0xb7, 0x81, 0x9b, 0x98, + 0x9a, 0xd3, 0x7a, 0x6c, 0x4b, 0x99, 0xb3, 0x84, 0x39, 0x45, 0x36, 0x87, + 0x49, 0x44, 0xc8, 0x70, 0x39, 0x98, 0xa9, 0x47, 0xd2, 0x54, 0x91, 0xa0, + 0x5e, 0xd0, 0xb8, 0x95, 0x74, 0xb5, 0x83, 0xa9, 0xa3, 0x34, 0x9c, 0x5b, + 0xc7, 0x32, 0x6b, 0x61, 0xc8, 0x7a, 0xad, 0xc0, 0x45, 0x6f, 0x9f, 0x82, + 0x9c, 0x46, 0xca, 0x70, 0x95, 0x53, 0xb1, 0x7b, 0x32, 0xb9, 0x79, 0x33, + 0x39, 0x4c, 0x78, 0xbe, 0x6e, 0x77, 0x7c, 0x4c, 0x68, 0x5c, 0x64, 0x49, + 0x6f, 0xbe, 0x73, 0x6e, 0xa8, 0xba, 0x69, 0x39, 0x9a, 0x61, 0x9d, 0x69, + 0x9d, 0xaa, 0x72, 0x6f, 0x8f, 0x72, 0x84, 0x76, 0x79, 0x66, 0x82, 0x34, + 0xa1, 0xa2, 0xbb, 0x74, 0xc4, 0x6f, 0xa7, 0x53, 0x84, 0x4b, 0xa2, 0x32, + 0x98, 0xd2, 0x7f, 0x5b, 0x94, 0xd2, 0xb8, 0x3a, 0xb4, 0xca, 0x70, 0xa2, + 0x69, 0xa0, 0x92, 0xb6, 0x42, 0x67, 0x60, 0xa7, 0x47, 0x9c, 0x73, 0x42, + 0xc5, 0x8b, 0x43, 0x82, 0xcb, 0x52, 0x86, 0x73, 0x95, 0x78, 0xc5, 0x7e, + 0x7c, 0x4b, 0xba, 0x49, 0x8c, 0x9c, 0x41, 0x47, 0x3f, 0x70, 0xd1, 0xa8, + 0xb4, 0x49, 0x79, 0x49, 0x84, 0xa1, 0x98, 0xc3, 0xa5, 0x6b, 0xa6, 0xa7, + 0x71, 0x51, 0x6d, 0x3b, 0x32, 0x9e, 0x7f, 0x69, 0xc1, 0xa6, 0x8a, 0xc5, + 0x87, 0x95, 0xaf, 0xb6, 0x78, 0x38, 0x5e, 0x66, 0x7c, 0xc3, 0x36, 0x4f, + 0x90, 0x93, 0x86, 0x80, 0x98, 0x3c, 0xd1, 0xb0, 0x79, 0xc5, 0x68, 0x5f, + 0xa4, 0x4d, 0x47, 0x36, 0x54, 0x9d, 0x90, 0x90, 0x92, 0x4a, 0x5f, 0x67, + 0x60, 0x59, 0x4f, 0x9f, 0x77, 0x60, 0xc5, 0x42, 0xa9, 0x90, 0x81, 0x77, + 0x92, 0x7f, 0x93, 0x7f, 0x64, 0x68, 0x64, 0x98, 0x93, 0x84, 0x8d, 0x87, + 0xd0, 0x73, 0x77, 0xbb, 0x33, 0xb6, 0xa1, 0x71, 0x9b, 0xc4, 0x9a, 0x38, + 0x95, 0x3c, 0x5f, 0x51, 0x9f, 0x62, 0xcd, 0x53, 0x74, 0x3f, 0xa0, 0x87, + 0x57, 0x5b, 0x5e, 0x90, 0x97, 0x6e, 0x58, 0x6b, 0x48, 0x75, 0x9e, 0xc0, + 0x35, 0xb0, 0x5f, 0x5c, 0xa2, 0xaf, 0x8c, 0x90, 0xc0, 0xac, 0xd0, 0x33, + 0xd2, 0xb0, 0xc7, 0xaf, 0x5d, 0x95, 0x4d, 0x70, 0x82, 0x94, 0x44, 0xd0, + 0x6e, 0x9b, 0x65, 0xce, 0x97, 0x80, 0xb0, 0x73, 0xa6, 0xbe, 0x45, 0x61, + 0x6f, 0x9a, 0x36, 0x43, 0xcb, 0x87, 0xa1, 0x83, 0x57, 0x8d, 0x51, 0x71, + 0x78, 0x99, 0x9e, 0xc1, 0x3e, 0x80, 0x70, 0x6d, 0x9c, 0x58, 0xaf, 0x35, + 0xbf, 0x9b, 0x70, 0x3b, 0x5d, 0xbc, 0xcf, 0x6a, 0x59, 0xa3, 0x72, 0x51, + 0x60, 0x9c, 0x3b, 0x60, 0xb8, 0x82, 0x7e, 0x54, 0x7f, 0x6e, 0x4d, 0xb9, + 0x9c, 0x9c, 0x4c, 0x5d, 0x66, 0x70, 0xd3, 0x78, 0xd3, 0x83, 0x47, 0x6d, + 0x3f, 0x94, 0xc9, 0x3b, 0x3d, 0x38, 0x4d, 0x64, 0x65, 0x53, 0x67, 0x95, + 0x3f, 0x32, 0xa5, 0xab, 0x55, 0x8f, 0xa9, 0x5a, 0x3a, 0xba, 0xb1, 0x9f, + 0x4d, 0x92, 0xb3, 0xa4, 0x58, 0x40, 0x62, 0x66, 0x43, 0x97, 0xb9, 0x37, + 0x59, 0x92, 0x9a, 0x77, 0x8e, 0x73, 0xa9, 0x8f, 0x62, 0x43, 0x6c, 0xd0, + 0x99, 0x9f, 0x3a, 0xca, 0xc1, 0x3a, 0xa3, 0xac, 0xcd, 0xa4, 0x4a, 0xb6, + 0x91, 0x51, 0x7a, 0x94, 0x45, 0xb9, 0xbc, 0x47, 0x4b, 0x4b, 0x53, 0x61, + 0x41, 0x97, 0x8d, 0xce, 0x53, 0x6f, 0x99, 0x71, 0xa6, 0x3b, 0x79, 0xc2, + 0x9c, 0x75, 0x47, 0xab, 0x8b, 0x3f, 0x49, 0x56, 0x50, 0x4c, 0xb2, 0x3b, + 0xa1, 0x39, 0x47, 0xbf, 0xb6, 0x49, 0x82, 0x71, 0xb8, 0x6e, 0xb6, 0xa1, + 0x51, 0x86, 0x3b, 0x3c, 0x7c, 0x9e, 0x7f, 0x84, 0x6d, 0x7b, 0x54, 0x57, + 0xc5, 0x4f, 0x5d, 0x67, 0xc5, 0x9c, 0xb2, 0x5c, 0xba, 0x79, 0x4e, 0xc0, + 0x66, 0x9a, 0x9e, 0x70, 0x7f, 0x42, 0x8a, 0x4a, 0xaf, 0x4f, 0x45, 0x4a, + 0x34, 0x8c, 0x5c, 0x89, 0xa1, 0x91, 0x7c, 0x45, 0xa4, 0xa9, 0x91, 0x7b, + 0x76, 0x35, 0x40, 0xb5, 0xcd, 0x6a, 0x7f, 0x74, 0x7c, 0x92, 0x40, 0xb4, + 0x4a, 0x64, 0x31, 0x49, 0x44, 0xa5, 0x33, 0x5d, 0x48, 0xa3, 0xa6, 0x57, + 0xaf, 0xb2, 0xad, 0xba, 0xb5, 0xa7, 0xc0, 0x41, 0x67, 0x8c, 0x6e, 0x64, + 
0xa4, 0x4f, 0xa8, 0xbd, 0x68, 0x34, 0xb0, 0x8e, 0x4d, 0x88, 0x7f, 0xb9, + 0xc6, 0xbc, 0x32, 0x68, 0xc1, 0x4d, 0x68, 0x9c, 0x5a, 0x85, 0x72, 0xb9, + 0x62, 0xac, 0x35, 0xcb, 0xaa, 0x71, 0xa0, 0x7a, 0xb2, 0xca, 0x81, 0x3f, + 0x98, 0x55, 0x73, 0x3e, 0xa9, 0xb8, 0xb4, 0xbb, 0xc6, 0xbc, 0x4b, 0x71, + 0x53, 0x9c, 0x47, 0xa1, 0x3e, 0x63, 0xc9, 0x99, 0x5c, 0x5f, 0xaf, 0x8b, + 0x62, 0xa4, 0x94, 0x31, 0x94, 0x85, 0x44, 0x9e, 0x86, 0x4d, 0x45, 0x7d, + 0x9c, 0x9c, 0x3a, 0x63, 0xb3, 0x70, 0x72, 0x8e, 0xb9, 0x70, 0x7a, 0x51, + 0x59, 0x6f, 0x7c, 0xc6, 0x86, 0x6d, 0x6b, 0x3a, 0x84, 0x44, 0xb9, 0x37, + 0x55, 0xba, 0xd3, 0x84, 0x9d, 0x63, 0x73, 0x3f, 0xc1, 0xb1, 0x33, 0x84, + 0xad, 0xa7, 0xb4, 0x39, 0x34, 0x6f, 0x75, 0xbe, 0x4e, 0x92, 0x5d, 0x9a, + 0x9d, 0x8e, 0x41, 0x9e, 0xb8, 0xd3, 0x57, 0xb8, 0x64, 0x9d, 0x4e, 0xad, + 0xab, 0xd2, 0xc5, 0xa9, 0x3e, 0x68, 0x73, 0xaa, 0x38, 0x89, 0x4a, 0x47, + 0xc2, 0x3e, 0x7a, 0x62, 0xa6, 0xb5, 0xcf, 0x41, 0x6b, 0x71, 0x3a, 0xbe, + 0x76, 0x93, 0x93, 0xbb, 0x73, 0x63, 0x50, 0xa4, 0xaf, 0x6e, 0xae, 0xbb, + 0xb9, 0xcb, 0xbf, 0x34, 0x4a, 0x75, 0x3f, 0x38, 0x3a, 0x97, 0xb1, 0x35, + 0x37, 0x86, 0x8d, 0xa7, 0x47, 0x88, 0x86, 0x53, 0x7b, 0xa7, 0xa2, 0x90, + 0x90, 0xc3, 0x82, 0x44, 0x9a, 0xb4, 0x96, 0x35, 0x64, 0x4d, 0x8c, 0x65, + 0xa9, 0x47, 0x71, 0x69, 0xb1, 0xcd, 0x35, 0x76, 0xb1, 0x8c, 0x7d, 0xa7, + 0xae, 0x50, 0x7a, 0x99, 0x84, 0x76, 0x54, 0x3a, 0x4c, 0x9d, 0xc6, 0x95, + 0x38, 0x94, 0x4f, 0x58, 0x48, 0xa4, 0x6d, 0x9a, 0x4c, 0xa6, 0x34, 0xb2, + 0x41, 0x4c, 0xba, 0xbc, 0x52, 0x60, 0x4d, 0x8b, 0xd6, 0x33, 0xaf, 0x59, + 0x5c, 0x4a, 0xa7, 0x56, 0xa1, 0xa3, 0x61, 0x42, 0x5b, 0x54, 0xc4, 0x71, + 0xb2, 0xcd, 0x9b, 0x31, 0xd6, 0x89, 0x94, 0x7e, 0xb7, 0xb9, 0x7b, 0x6b, + 0xcf, 0x95, 0x8b, 0xaa, 0x80, 0x48, 0xc1, 0xc3, 0x60, 0x4e, 0x84, 0x5c, + 0x55, 0x61, 0xb2, 0x6c, 0x5c, 0xa0, 0xb6, 0x4f, 0x66, 0x97, 0xb5, 0x5f, + 0x97, 0xb4, 0x9b, 0x5c, 0x3c, 0x7c, 0x6c, 0xc8, 0x95, 0x8a, 0xbd, 0xd0, + 0xc0, 0x46, 0xcf, 0x35, 0x6c, 0x85, 0x7e, 0x39, 0x59, 0x50, 0xd3, 0x59, + 0x42, 0xa8, 0x56, 0xba, 0xbf, 0xd4, 0xa5, 0xa5, 0x54, 0xa3, 0x6a, 0xcf, + 0x50, 0x49, 0x35, 0x90, 0x5f, 0x63, 0xbe, 0x98, 0x50, 0xb1, 0xde, 0x77, + 0x5d, 0x7e, 0xa1, 0x91, 0x8f, 0xc0, 0x41, 0x3b, 0xcc, 0x42, 0x94, 0x7b, + 0x8c, 0xb0, 0x8e, 0x29, 0xb3, 0xa3, 0xa9, 0xb5, 0x7d, 0xb9, 0xc4, 0xd0, + 0x5b, 0x3e, 0x66, 0x56, 0xa4, 0xca, 0xc7, 0xa1, 0x66, 0x58, 0x73, 0x6f, + 0xcd, 0x33, 0xa2, 0x73, 0xc5, 0x4e, 0x97, 0x46, 0xa7, 0xad, 0x3b, 0xb2, + 0x5a, 0xbf, 0x70, 0x37, 0x9d, 0xc8, 0x77, 0x46, 0x9a, 0x62, 0x95, 0x4a, + 0x5a, 0xc9, 0xa7, 0x4d, 0xaf, 0x9e, 0x47, 0x3a, 0x41, 0x6e, 0x53, 0x46, + 0x83, 0x81, 0x43, 0x60, 0xa8, 0x9b, 0x7f, 0x71, 0x51, 0x7e, 0x7d, 0x74, + 0x73, 0x4b, 0xa7, 0xc0, 0x46, 0x85, 0x7b, 0x8c, 0x4a, 0x9a, 0xab, 0xb0, + 0x55, 0x9f, 0x3e, 0xb7, 0x5d, 0x65, 0xb5, 0x4a, 0x97, 0x6d, 0x37, 0x8e, + 0x83, 0x63, 0x8a, 0x6f, 0x5a, 0xc2, 0x5e, 0x40, 0x35, 0xa4, 0x73, 0x7b, + 0x4d, 0xd6, 0xb7, 0x96, 0x5e, 0xb9, 0xc8, 0x7a, 0x51, 0xc7, 0xcf, 0x70, + 0x4b, 0x77, 0x63, 0x8a, 0xa5, 0x58, 0x7a, 0x39, 0x55, 0x4e, 0x51, 0xaa, + 0x7f, 0x4e, 0xc1, 0x3a, 0xc8, 0x4d, 0xc0, 0x3b, 0x30, 0x53, 0x54, 0x7a, + 0x92, 0x5b, 0x78, 0x53, 0x7d, 0x47, 0xbc, 0x40, 0x77, 0xaf, 0xba, 0x76, + 0xcc, 0x5f, 0xa3, 0x8b, 0x9b, 0xc0, 0x40, 0xd2, 0x5e, 0x39, 0x93, 0xbe, + 0x62, 0xb8, 0x93, 0x4a, 0x48, 0x90, 0xd7, 0xb2, 0xb4, 0xc4, 0x3d, 0x9c, + 0xa4, 0xb8, 0xc4, 0x56, 0x63, 0x5a, 0xce, 0x47, 0x37, 0x5e, 0x51, 0x4b, + 0xaa, 0xce, 0x52, 0x7c, 0x9e, 0xc0, 0x51, 0x77, 0x72, 0xb2, 0x2d, 0xbe, + 0xb3, 0x74, 0x53, 0x8d, 0xad, 0x87, 0x3f, 0xa2, 0xac, 0x85, 0x64, 0xc1, + 
0xd2, 0x65, 0x5e, 0xcb, 0x3b, 0xa5, 0xb1, 0x9a, 0x94, 0x84, 0x61, 0x30, + 0x7c, 0x5d, 0x5b, 0x87, 0xc9, 0x5a, 0x6b, 0x88, 0x4d, 0xbd, 0x8f, 0x5b, + 0x62, 0x48, 0xa9, 0x65, 0x7b, 0xa0, 0xb4, 0xd8, 0x42, 0x57, 0xce, 0x66, + 0xcb, 0x72, 0x33, 0x45, 0xb4, 0x8d, 0x68, 0x41, 0x46, 0xa4, 0x69, 0x62, + 0x47, 0xc4, 0x3e, 0x9f, 0x59, 0xad, 0x92, 0x3f, 0x65, 0x76, 0x59, 0x57, + 0x35, 0x7f, 0x83, 0x94, 0x73, 0x74, 0xa7, 0x7d, 0x5e, 0x3d, 0x89, 0x63, + 0xca, 0x6a, 0x5c, 0xba, 0x54, 0x9a, 0xc1, 0xa8, 0x3a, 0x4f, 0x57, 0x43, + 0x41, 0x77, 0x3f, 0xa8, 0x48, 0xc8, 0x44, 0xbc, 0x95, 0x46, 0xbe, 0xb6, + 0xa6, 0xbf, 0xc6, 0x30, 0xc3, 0xb2, 0x87, 0x64, 0x58, 0x4f, 0xb1, 0x6f, + 0xba, 0x3c, 0x6e, 0x9b, 0x84, 0xa5, 0xbf, 0x40, 0xd3, 0x63, 0x96, 0x33, + 0x2e, 0x3e, 0xd1, 0x46, 0x88, 0x51, 0xcc, 0x2f, 0xd3, 0x42, 0xad, 0x3b, + 0xb6, 0xb3, 0x3b, 0x8c, 0x93, 0x37, 0x3e, 0x58, 0x53, 0x78, 0x5a, 0x7f, + 0xd5, 0xba, 0x44, 0xd2, 0x62, 0x6e, 0x44, 0x4b, 0x34, 0xb2, 0x4f, 0x3e, + 0x7c, 0x59, 0x41, 0x68, 0x78, 0x59, 0x74, 0xb7, 0xce, 0x48, 0x2d, 0x8c, + 0x65, 0xa1, 0xc6, 0x96, 0x95, 0x95, 0xb9, 0x49, 0x5b, 0xa6, 0x63, 0xcb, + 0x4c, 0x81, 0x7f, 0x81, 0x41, 0xce, 0x5a, 0x43, 0xbc, 0x63, 0xda, 0xcc, + 0x50, 0x42, 0xc0, 0xbb, 0x56, 0x80, 0xcc, 0xbb, 0x6d, 0x7b, 0x3a, 0xd5, + 0xa0, 0x91, 0x8b, 0x4c, 0x34, 0xbc, 0xaf, 0xa6, 0x3c, 0x3f, 0x3e, 0x5d, + 0xc0, 0x9c, 0x73, 0xb2, 0x47, 0x51, 0xa8, 0x56, 0xab, 0x8d, 0x7e, 0x83, + 0xca, 0xb0, 0x4b, 0x80, 0x7b, 0x53, 0x35, 0x6d, 0xca, 0x58, 0xa8, 0x6e, + 0x8d, 0x69, 0x74, 0xce, 0x8e, 0xd2, 0xce, 0xe5, 0x58, 0x2e, 0xcd, 0x52, + 0x3c, 0x4a, 0xc6, 0x37, 0xcc, 0xb5, 0xa5, 0x8c, 0x83, 0x8d, 0x49, 0x46, + 0x37, 0xbd, 0x80, 0xc0, 0x35, 0x56, 0x5a, 0x4b, 0x3f, 0x3d, 0xc4, 0x7d, + 0xa4, 0x3f, 0x80, 0x5f, 0x6b, 0x6c, 0xc2, 0xb1, 0x92, 0x5a, 0xc0, 0x34, + 0x39, 0xa9, 0xb8, 0xd6, 0x6c, 0x95, 0x73, 0xb6, 0x45, 0xb7, 0xc9, 0x7b, + 0x90, 0xa2, 0xad, 0xb1, 0x7c, 0x4c, 0x8d, 0xb3, 0x83, 0x84, 0x9d, 0x98, + 0x6a, 0x85, 0x88, 0xb3, 0xb8, 0x51, 0xa0, 0xd2, 0x8d, 0x68, 0xb1, 0xd1, + 0x67, 0x47, 0xd5, 0x8b, 0x52, 0x4b, 0x61, 0x97, 0x69, 0xa9, 0xa1, 0x6e, + 0xcf, 0x9a, 0x80, 0x9a, 0xce, 0x62, 0x46, 0xbf, 0x54, 0x60, 0xb5, 0x83, + 0xa3, 0xa2, 0x7a, 0x7a, 0xcc, 0x7c, 0x50, 0xbd, 0x68, 0x97, 0x5f, 0xc6, + 0x8f, 0x78, 0xce, 0x3e, 0x7c, 0xa7, 0x96, 0xc6, 0x9b, 0x7a, 0xcc, 0xdc, + 0xcb, 0x81, 0xc7, 0xaf, 0x51, 0xc7, 0x31, 0x81, 0x79, 0x94, 0xaf, 0x7b, + 0xc8, 0x75, 0x8c, 0xa3, 0x73, 0xd2, 0x88, 0xbd, 0x85, 0x40, 0x44, 0x54, + 0xbe, 0x78, 0xa9, 0x5e, 0x61, 0x3f, 0x77, 0x86, 0xbc, 0x4c, 0x86, 0x51, + 0x83, 0x4b, 0xa1, 0x99, 0x88, 0xc4, 0xb7, 0x88, 0xa8, 0xca, 0x7b, 0x7a, + 0x3e, 0x88, 0xbb, 0x88, 0xa1, 0x71, 0x97, 0xb5, 0xad, 0xc5, 0xb0, 0x94, + 0x9d, 0x93, 0x3a, 0x99, 0xa0, 0xda, 0x4c, 0xd8, 0x3e, 0xad, 0x82, 0x2b, + 0xc3, 0x39, 0xab, 0xd4, 0x73, 0x2d, 0xc5, 0x88, 0x9d, 0x9a, 0x69, 0x97, + 0xac, 0xb5, 0xb9, 0x4e, 0xbd, 0x42, 0x67, 0xce, 0x81, 0xc5, 0x67, 0x4e, + 0xc6, 0x92, 0xb6, 0xa9, 0xcf, 0x73, 0x41, 0x89, 0x90, 0x6b, 0x43, 0x56, + 0xb6, 0x37, 0x93, 0x7a, 0xc2, 0xca, 0x77, 0x79, 0xa2, 0x31, 0xc2, 0x84, + 0x88, 0x88, 0x3f, 0x9c, 0x3f, 0x56, 0xbb, 0x45, 0x3b, 0x8c, 0x7d, 0x4d, + 0x9e, 0x86, 0x4b, 0x5d, 0xbb, 0x80, 0x77, 0x91, 0x42, 0xa5, 0x6f, 0x38, + 0x84, 0xb2, 0x4c, 0xbe, 0x95, 0x9b, 0x79, 0x5e, 0xa2, 0xb7, 0x9f, 0x54, + 0x62, 0x9a, 0x96, 0x8d, 0xa5, 0xc4, 0x6b, 0x3a, 0xc1, 0x77, 0xb8, 0x61, + 0x8c, 0x31, 0x96, 0x34, 0xca, 0xb0, 0x41, 0x2e, 0x8b, 0xb5, 0x65, 0x49, + 0xa7, 0xd2, 0x8e, 0x76, 0x95, 0xc5, 0x3e, 0x9b, 0x5d, 0xc6, 0x4b, 0x89, + 0xbf, 0x9c, 0xd1, 0x5c, 0x5a, 0x57, 0x6e, 0x83, 0x94, 0x59, 0xcc, 0x44, + 
0x38, 0xaf, 0xa5, 0x75, 0xc4, 0x85, 0x36, 0xae, 0xa4, 0xa3, 0xa7, 0x8e, + 0xbc, 0x3f, 0x5e, 0xbb, 0x9b, 0x4d, 0xb7, 0x8b, 0xb1, 0x84, 0x53, 0x32, + 0x75, 0x45, 0x8a, 0x3d, 0x40, 0xb2, 0x83, 0xc8, 0xb6, 0x3f, 0xb3, 0xab, + 0xc3, 0x66, 0xbe, 0xc9, 0x41, 0xc6, 0x63, 0xb7, 0x48, 0x86, 0x35, 0x74, + 0x43, 0x52, 0x3c, 0xaa, 0x69, 0x76, 0x9e, 0xba, 0x64, 0x8e, 0x96, 0xd0, + 0x6d, 0x75, 0x74, 0xac, 0x5a, 0xa0, 0x6b, 0x48, 0xa2, 0x4d, 0xa8, 0xa3, + 0x5c, 0x95, 0x77, 0x53, 0x84, 0xb7, 0x48, 0x3e, 0x78, 0x4d, 0xab, 0xc2, + 0x59, 0xc2, 0x42, 0x96, 0x3c, 0xc4, 0xb6, 0x34, 0xa9, 0x6d, 0xaf, 0x39, + 0x5f, 0x9c, 0x73, 0xbd, 0xa3, 0xd1, 0x50, 0xaa, 0xd2, 0x92, 0x45, 0xb0, + 0xa0, 0x99, 0xd3, 0x9b, 0xa3, 0xab, 0xc3, 0xd2, 0x8e, 0x4e, 0xc4, 0x4f, + 0xa3, 0x6b, 0x4f, 0xa3, 0x50, 0x9e, 0xc5, 0x3e, 0xa5, 0xbb, 0xb1, 0x5c, + 0xd2, 0x7d, 0xcf, 0xad, 0x9a, 0x71, 0x82, 0x88, 0x8a, 0x7c, 0x96, 0x2f, + 0x3a, 0x56, 0x9d, 0x95, 0x73, 0x54, 0xa6, 0x74, 0x82, 0x9e, 0x55, 0x6c, + 0x6e, 0xab, 0xb3, 0x67, 0x97, 0x8c, 0x80, 0x66, 0x48, 0x7c, 0x37, 0xc7, + 0x7e, 0x4f, 0xc4, 0x42, 0x83, 0x91, 0xba, 0x9e, 0xa5, 0xa6, 0x67, 0x6b, + 0xbf, 0xcc, 0x6d, 0x9d, 0xc1, 0xa4, 0xc0, 0x3b, 0xb6, 0x8a, 0x94, 0xc5, + 0x44, 0xa3, 0xa4, 0xbc, 0x52, 0xad, 0x87, 0x4b, 0x9b, 0xb1, 0x92, 0x7b, + 0x57, 0x40, 0xb7, 0x48, 0xa9, 0xc3, 0x91, 0xae, 0x8b, 0xa3, 0x57, 0x6c, + 0x56, 0x74, 0x64, 0xcd, 0x6f, 0xa1, 0xb7, 0x36, 0x43, 0x3f, 0x59, 0x63, + 0x61, 0xd0, 0x3d, 0x9d, 0x73, 0xc0, 0x90, 0xd2, 0x95, 0xa1, 0x99, 0x9d, + 0x97, 0x96, 0x65, 0x51, 0xc7, 0xc2, 0x53, 0x7b, 0x59, 0x6b, 0xb4, 0x80, + 0x5f, 0x8c, 0x71, 0xa7, 0x86, 0xab, 0xaa, 0xcb, 0xa8, 0x4e, 0xc6, 0x88, + 0xa1, 0xc6, 0x35, 0xcc, 0xd1, 0x65, 0x52, 0x5e, 0xce, 0x91, 0x7c, 0xb6, + 0xcf, 0x49, 0xa1, 0x6d, 0xa6, 0xcc, 0x38, 0xa6, 0xc8, 0x87, 0x9d, 0xca, + 0xa3, 0x4c, 0xc3, 0x83, 0x3d, 0xbf, 0x68, 0xb2, 0x70, 0x36, 0xc0, 0xce, + 0x93, 0x73, 0xac, 0xb5, 0xb1, 0xaf, 0x81, 0x7c, 0x72, 0x37, 0x64, 0x76, + 0x87, 0xb6, 0xce, 0x4d, 0xca, 0x5a, 0x90, 0xc3, 0x35, 0x82, 0x95, 0xc3, + 0xd0, 0x9b, 0x51, 0x6b, 0x6e, 0xb5, 0xba, 0x44, 0x52, 0x35, 0xbc, 0xb9, + 0x7e, 0x66, 0x3f, 0x6f, 0x5e, 0x55, 0xb4, 0xca, 0xb5, 0x31, 0xc0, 0x84, + 0x86, 0xcb, 0x4c, 0x35, 0x9f, 0xc9, 0xbc, 0x3c, 0x80, 0x95, 0xad, 0x3b, + 0x41, 0x64, 0x3d, 0x7a, 0xc9, 0xce, 0x6a, 0xb9, 0x4c, 0x84, 0x4f, 0x7c, + 0xac, 0x7b, 0xb5, 0xab, 0x64, 0x3f, 0x41, 0x4a, 0x69, 0x72, 0x5a, 0xcb, + 0x9c, 0x84, 0x92, 0x70, 0x9d, 0xc8, 0xb3, 0xc1, 0x80, 0x95, 0x9f, 0x4f, + 0xa2, 0x9e, 0x92, 0x30, 0xad, 0xa9, 0x51, 0x36, 0xb9, 0xd1, 0xae, 0x3c, + 0x5f, 0x35, 0xb8, 0xa9, 0xbd, 0x72, 0x6f, 0x7c, 0x34, 0x9e, 0x3c, 0xd0, + 0x89, 0x55, 0x50, 0x3b, 0xa4, 0x33, 0x8e, 0x7d, 0x63, 0x83, 0x38, 0x45, + 0x57, 0x8d, 0x92, 0x5c, 0xd0, 0x68, 0x48, 0x40, 0xaf, 0x91, 0x43, 0x8e, + 0x61, 0x6d, 0x4d, 0x33, 0x7e, 0x65, 0x8a, 0x97, 0xcb, 0x36, 0x7f, 0xc3, + 0x55, 0x8a, 0xb2, 0xbf, 0x59, 0xc1, 0x58, 0x77, 0x4a, 0x78, 0x55, 0x9b, + 0xc9, 0xcb, 0xab, 0xa8, 0xb8, 0xcf, 0xa9, 0x36, 0x8c, 0x47, 0x49, 0xc8, + 0x6f, 0x37, 0x57, 0xbe, 0x62, 0xa1, 0xcc, 0x3f, 0x88, 0x4f, 0x94, 0xbb, + 0x7a, 0x6f, 0x3b, 0x65, 0x95, 0x7d, 0xa8, 0x7f, 0x90, 0x59, 0xc1, 0x8b, + 0x90, 0x9e, 0x42, 0xa9, 0x75, 0x63, 0x49, 0x6f, 0x86, 0x59, 0xd4, 0x7d, + 0x37, 0x71, 0x49, 0x81, 0x73, 0x6a, 0x31, 0xc8, 0x9a, 0x4d, 0x31, 0x8b, + 0x98, 0xb4, 0x68, 0x96, 0x59, 0x6e, 0x99, 0x51, 0xc0, 0x48, 0xa9, 0x5c, + 0xa7, 0x87, 0x8a, 0xad, 0xc5, 0x39, 0x5f, 0x54, 0x37, 0xa2, 0x8c, 0x38, + 0xa4, 0x97, 0x67, 0x42, 0x33, 0xc7, 0x44, 0xc7, 0xaf, 0xd0, 0x41, 0x74, + 0xc7, 0x41, 0x60, 0xc6, 0x42, 0x46, 0xa6, 0x97, 0x8b, 0xb1, 0x58, 0x9e, + 
0x9c, 0x8d, 0x5c, 0x39, 0xb9, 0x40, 0xc4, 0x8b, 0x4a, 0x7c, 0xab, 0x5a, + 0x75, 0xbc, 0x3f, 0x77, 0x2f, 0x6a, 0x59, 0xc6, 0xc7, 0x89, 0xa0, 0xd1, + 0xb7, 0x3b, 0xa1, 0x8c, 0x9d, 0x69, 0x3c, 0x50, 0xa5, 0x56, 0x49, 0x90, + 0x81, 0xb4, 0x97, 0x98, 0xb7, 0x3c, 0x65, 0xbc, 0x53, 0x80, 0x96, 0x38, + 0x51, 0x77, 0x42, 0x9a, 0xb4, 0xa9, 0x9c, 0xc8, 0x6d, 0x42, 0xb9, 0x51, + 0xb9, 0xd0, 0x5e, 0x85, 0x8a, 0x57, 0x58, 0x48, 0x8b, 0x99, 0x54, 0x81, + 0x3a, 0xac, 0x4b, 0xac, 0x39, 0x36, 0x35, 0x66, 0x98, 0xaa, 0x3f, 0x3f, + 0x54, 0x38, 0x5c, 0x71, 0x93, 0x43, 0x7d, 0x4a, 0x6f, 0xca, 0x9f, 0x49, + 0x92, 0x94, 0x76, 0x91, 0xd1, 0x79, 0x71, 0xb3, 0x65, 0xa6, 0x92, 0x64, + 0xaa, 0x5f, 0xad, 0x7c, 0xd2, 0xc4, 0x5b, 0x44, 0x84, 0x8e, 0x96, 0x64, + 0xb5, 0x47, 0x33, 0x86, 0x79, 0x98, 0xa5, 0xa3, 0xae, 0x84, 0x6c, 0x97, + 0x61, 0x4d, 0x60, 0x8e, 0x54, 0x76, 0x41, 0x30, 0x6e, 0x6c, 0xb0, 0x68, + 0x9a, 0x34, 0xc4, 0xa4, 0x48, 0xbc, 0x87, 0x5a, 0xaf, 0xad, 0x92, 0x93, + 0x50, 0xc7, 0x4c, 0x74, 0xb8, 0xb5, 0x9d, 0x38, 0x68, 0x5f, 0x5e, 0xb0, + 0x7b, 0x42, 0x87, 0x4d, 0x59, 0x7c, 0xc4, 0x41, 0xa1, 0x8b, 0x51, 0x8e, + 0x57, 0xcf, 0xc8, 0xb9, 0x6d, 0x55, 0xba, 0xbd, 0x80, 0x5b, 0x8f, 0x51, + 0x4a, 0xd3, 0x6a, 0x48, 0xc4, 0x36, 0xbb, 0x64, 0x48, 0x6a, 0xc4, 0x6a, + 0xa5, 0xac, 0x3b, 0xc4, 0x93, 0x88, 0xb4, 0x55, 0x5c, 0xb0, 0xc1, 0x34, + 0x65, 0xb2, 0xbb, 0x87, 0x66, 0x7d, 0x3f, 0xb2, 0xb8, 0x91, 0xd2, 0x50, + 0x83, 0xd1, 0x67, 0x6e, 0x52, 0x4f, 0x91, 0xc9, 0x89, 0xaa, 0x56, 0xc7, + 0x43, 0x48, 0x5f, 0x51, 0x60, 0xc3, 0x81, 0xcd, 0x90, 0x9b, 0x59, 0x6d, + 0x6b, 0xb5, 0x4e, 0xab, 0xa5, 0x83, 0x62, 0xcb, 0x32, 0xc7, 0x5b, 0x7c, + 0xa9, 0xa4, 0x8b, 0x32, 0x95, 0x61, 0x88, 0xad, 0xa2, 0x62, 0x8b, 0x4f, + 0x63, 0x7c, 0xa3, 0xb0, 0xc8, 0x54, 0x43, 0x94, 0x95, 0xa9, 0x43, 0x36, + 0x9a, 0x63, 0x9c, 0x43, 0x6c, 0x4e, 0x8f, 0x66, 0xc3, 0xa8, 0x60, 0x5a, + 0x8a, 0x43, 0x95, 0x59, 0x49, 0x50, 0xac, 0x90, 0x63, 0xbe, 0x68, 0x42, + 0x85, 0x45, 0x5f, 0xd0, 0xc0, 0x65, 0x8b, 0xce, 0x38, 0x8a, 0xc5, 0x71, + 0xab, 0xba, 0xc0, 0xcd, 0xbc, 0x31, 0x49, 0x40, 0xa8, 0x46, 0xad, 0xb1, + 0xb9, 0x9c, 0x4f, 0x3f, 0x9e, 0x6e, 0x6b, 0xbb, 0x9e, 0x35, 0xbf, 0xb2, + 0x3a, 0x5d, 0xb1, 0x50, 0x8e, 0x4f, 0xac, 0xc5, 0xa8, 0x4e, 0x9d, 0x56, + 0x55, 0x42, 0x44, 0x48, 0x69, 0x6f, 0x7f, 0x94, 0x96, 0xa5, 0xaf, 0x6e, + 0xb8, 0xae, 0x93, 0x9b, 0xaf, 0x84, 0xd2, 0x6d, 0xc9, 0x5d, 0xb5, 0xac, + 0x6d, 0x6c, 0x96, 0xc9, 0xb0, 0x62, 0x35, 0x85, 0x5f, 0x4c, 0x84, 0xa7, + 0x70, 0x71, 0x5d, 0x6b, 0x96, 0x56, 0xbd, 0x73, 0x98, 0x77, 0x48, 0xb4, + 0x33, 0xc3, 0x8e, 0x41, 0x9a, 0x96, 0xc9, 0xbf, 0x80, 0x97, 0xbd, 0x50, + 0x63, 0x31, 0xae, 0xb0, 0x9b, 0xbc, 0x6e, 0x59, 0x48, 0x58, 0x8a, 0x6c, + 0xa2, 0x6f, 0x76, 0x3c, 0xc0, 0x62, 0x69, 0x5f, 0xaa, 0x34, 0x5e, 0x85, + 0x95, 0x90, 0x49, 0x96, 0xc9, 0x62, 0x39, 0x70, 0x41, 0xcd, 0x38, 0x81, + 0x69, 0x32, 0xc2, 0x63, 0x4a, 0xa0, 0x6f, 0xb5, 0x56, 0xc0, 0xbf, 0x37, + 0x57, 0x57, 0x3e, 0xc6, 0xb4, 0x34, 0x78, 0x47, 0x7d, 0x55, 0x34, 0xc3, + 0x57, 0x44, 0x92, 0x8f, 0x48, 0xaf, 0x5b, 0xb4, 0x57, 0xad, 0xc4, 0xcb, + 0xb9, 0xa2, 0x67, 0x60, 0xc1, 0x66, 0x3b, 0xab, 0x92, 0xa3, 0xc4, 0x8b, + 0x72, 0x9c, 0x64, 0x59, 0x6f, 0xd3, 0xaf, 0x72, 0x5d, 0xc2, 0x67, 0xbf, + 0x8f, 0x4d, 0x6f, 0x5d, 0xb3, 0x9b, 0xb5, 0xc8, 0x68, 0xb3, 0x8c, 0x75, + 0x54, 0x97, 0x8e, 0x8c, 0xad, 0x61, 0x40, 0xa0, 0x41, 0xbf, 0xce, 0x5d, + 0x92, 0x3a, 0xc0, 0xc9, 0x6a, 0x79, 0x60, 0x5e, 0x38, 0x6c, 0x97, 0xc4, + 0x5c, 0x55, 0x84, 0x66, 0xd4, 0xc2, 0xa9, 0xb8, 0x8f, 0xae, 0x9a, 0xc4, + 0x70, 0x68, 0x74, 0x8b, 0x5a, 0x92, 0xbd, 0x51, 0x78, 0x3e, 0x97, 0x62, + 
0x7f, 0x82, 0xd3, 0x72, 0xa3, 0x39, 0x79, 0x9d, 0xc7, 0x8e, 0x38, 0x44, + 0x60, 0xa3, 0x36, 0x4e, 0xd3, 0x85, 0xb3, 0x77, 0x44, 0x77, 0xcb, 0xa8, + 0x97, 0x34, 0x46, 0x51, 0x5b, 0x45, 0x42, 0x3e, 0x9b, 0x98, 0x5c, 0x68, + 0x50, 0x9b, 0xd5, 0xc1, 0x35, 0x95, 0x3f, 0x7b, 0xb5, 0xae, 0xb7, 0x82, + 0x99, 0xc3, 0xa8, 0xac, 0x9e, 0x4f, 0xb6, 0x8f, 0x52, 0x72, 0x5d, 0x82, + 0x61, 0x61, 0x37, 0x5e, 0xa9, 0xb6, 0x95, 0x7e, 0x88, 0x62, 0x6c, 0xbb, + 0xa2, 0x37, 0x56, 0x42, 0x84, 0x4d, 0x91, 0xac, 0x43, 0x39, 0x95, 0x5a, + 0x69, 0x89, 0xac, 0xd0, 0x40, 0x49, 0x3a, 0x8f, 0xb7, 0x45, 0x3a, 0xa3, + 0x3b, 0x3d, 0xa1, 0xd0, 0x9b, 0x8a, 0xa5, 0x87, 0x75, 0x38, 0x75, 0x3c, + 0xcd, 0x88, 0xad, 0x9a, 0x3a, 0x56, 0x44, 0xb6, 0x6d, 0x92, 0x77, 0x3d, + 0xc3, 0x6b, 0xb1, 0xc6, 0x8b, 0x98, 0x6b, 0xc0, 0xba, 0x9a, 0x9f, 0x33, + 0x42, 0xc3, 0xce, 0xad, 0x7e, 0x88, 0xa5, 0x6b, 0x7a, 0x47, 0xc9, 0x81, + 0x64, 0x4a, 0x57, 0x93, 0x42, 0x49, 0x4c, 0xd2, 0xcb, 0xc8, 0xcb, 0x81, + 0x6a, 0x56, 0x52, 0x9c, 0xbd, 0x49, 0xc1, 0x99, 0xd3, 0x5d, 0xb8, 0x3a, + 0x70, 0x97, 0x62, 0x60, 0x76, 0x66, 0x93, 0x6e, 0xb8, 0xab, 0x34, 0x84, + 0x49, 0x3a, 0x3f, 0x58, 0x70, 0x40, 0xd2, 0x6b, 0xd3, 0x82, 0x37, 0x3e, + 0xa3, 0xc6, 0x39, 0xb5, 0x68, 0xa7, 0x44, 0xc5, 0xc2, 0x63, 0xab, 0x87, + 0x37, 0x4d, 0xba, 0xc6, 0xc9, 0xb8, 0x9f, 0xbf, 0x5b, 0xd0, 0x58, 0xbf, + 0xc0, 0xbf, 0x36, 0x67, 0x36, 0x3a, 0xa2, 0xbb, 0xa2, 0x79, 0x5a, 0xb8, + 0x40, 0x4a, 0x3e, 0x4d, 0x48, 0x6f, 0x7e, 0x89, 0x63, 0xa7, 0xb3, 0x36, + 0x88, 0x5c, 0xb6, 0x6a, 0x44, 0xb1, 0x3c, 0xae, 0x3b, 0xbd, 0xad, 0x58, + 0xca, 0x67, 0x5c, 0x50, 0x84, 0xc7, 0xa9, 0x35, 0x54, 0x97, 0x4f, 0x55, + 0x76, 0x78, 0x69, 0x64, 0x30, 0x95, 0x7e, 0x77, 0xc8, 0xb6, 0xc5, 0x88, + 0x58, 0xb7, 0x98, 0xa0, 0x8b, 0x34, 0x43, 0x3c, 0xb3, 0x66, 0x99, 0x34, + 0x67, 0x76, 0x9f, 0x44, 0xce, 0xa0, 0xa1, 0xac, 0xaa, 0x72, 0x8c, 0xbe, + 0x8e, 0xb0, 0x94, 0xbb, 0x3d, 0x4e, 0x9c, 0x71, 0xc8, 0x77, 0xcd, 0xa1, + 0xbc, 0x61, 0x45, 0x42, 0x8a, 0x38, 0xbf, 0x9a, 0xa2, 0x4c, 0x94, 0xa4, + 0x69, 0xc7, 0x9f, 0x5c, 0x87, 0x3a, 0x66, 0xa5, 0x51, 0xc2, 0x71, 0xb5, + 0x34, 0x44, 0x78, 0x5c, 0x54, 0x64, 0x34, 0xaa, 0x4b, 0xad, 0x4f, 0x40, + 0x96, 0xc9, 0x50, 0xb5, 0xbf, 0x7f, 0xba, 0xb8, 0xce, 0x67, 0x41, 0x3e, + 0xa1, 0x7b, 0x5a, 0x9c, 0x70, 0xbb, 0x49, 0x5f, 0x5b, 0x6f, 0x3f, 0x97, + 0x9c, 0x8e, 0xc9, 0xa4, 0x89, 0x98, 0xbf, 0x7a, 0x8e, 0xac, 0x3b, 0x56, + 0x47, 0x6a, 0x7a, 0xa4, 0x7e, 0xc4, 0x63, 0xad, 0x55, 0xbf, 0x8c, 0xb7, + 0x7b, 0x9d, 0x6d, 0x94, 0x65, 0x8a, 0x61, 0xc9, 0x3d, 0xa2, 0xa5, 0x9b, + 0x81, 0xdd, 0x9d, 0xc9, 0x50, 0xbf, 0x6f, 0x72, 0x5c, 0xa0, 0x63, 0x50, + 0x99, 0x75, 0x8f, 0xca, 0xab, 0xd1, 0x3f, 0xb4, 0xbd, 0xa0, 0x91, 0xb1, + 0x80, 0x41, 0x5a, 0x71, 0xc3, 0x88, 0x33, 0x8d, 0x56, 0x98, 0x3f, 0xa2, + 0x82, 0xb2, 0x74, 0x90, 0xc3, 0x38, 0xab, 0x98, 0x3c, 0xd8, 0x86, 0xb8, + 0x58, 0x8c, 0x67, 0x9b, 0x5b, 0x4f, 0x76, 0x55, 0x7e, 0x57, 0xb0, 0x8c, + 0xb2, 0x6d, 0xa0, 0x90, 0xbc, 0x72, 0xb2, 0xce, 0xbd, 0x6d, 0xcf, 0x2f, + 0x8b, 0x8a, 0x3e, 0x45, 0x97, 0x7f, 0xc4, 0x3e, 0x93, 0xba, 0xd8, 0x53, + 0x84, 0x97, 0x93, 0x3a, 0x73, 0x6b, 0xc2, 0xc3, 0x74, 0xca, 0x90, 0x59, + 0x9e, 0x3f, 0x9d, 0x3e, 0x85, 0xa0, 0x75, 0x4d, 0xc1, 0x39, 0xa7, 0xbe, + 0xa5, 0xc1, 0xd6, 0x8f, 0xbc, 0x8c, 0x50, 0x3c, 0x69, 0x3a, 0x6c, 0x64, + 0xac, 0x5a, 0x81, 0x77, 0x6d, 0x4c, 0xc6, 0x5a, 0x3e, 0x81, 0xcc, 0xac, + 0x48, 0x32, 0x3d, 0xc3, 0x35, 0xc1, 0x81, 0x7f, 0x99, 0x68, 0x7e, 0xa8, + 0x90, 0xc0, 0x53, 0xc0, 0x90, 0x86, 0x4b, 0x87, 0x6d, 0x77, 0xc6, 0x43, + 0x60, 0xb4, 0x31, 0xcb, 0x4a, 0xcf, 0x70, 0xcc, 0xc8, 0x6d, 0x6a, 0x7a, + 
0xbc, 0x6a, 0x3d, 0x83, 0x62, 0xd9, 0x94, 0x33, 0x36, 0xb5, 0x46, 0x88, + 0xb7, 0x41, 0xa8, 0x8f, 0x74, 0x55, 0x8e, 0xae, 0x64, 0x40, 0x60, 0x5b, + 0x7c, 0x88, 0x69, 0x68, 0x9c, 0x4b, 0xbd, 0x8c, 0xab, 0x49, 0x9e, 0x70, + 0xab, 0xd0, 0x34, 0xa3, 0x50, 0x95, 0x5f, 0x45, 0x34, 0x73, 0x84, 0x50, + 0xcb, 0x72, 0x6c, 0x69, 0x51, 0x51, 0x45, 0xb8, 0xc7, 0xca, 0x87, 0x54, + 0x89, 0xd3, 0x7f, 0x4a, 0xc2, 0xa9, 0xca, 0x5a, 0x8d, 0x3d, 0xc6, 0x35, + 0x59, 0x51, 0x92, 0x7a, 0x81, 0x3b, 0x6b, 0x82, 0x86, 0x72, 0xbb, 0x7e, + 0xb8, 0xba, 0xd4, 0xb0, 0xa3, 0x9a, 0x7e, 0x2a, 0x67, 0x2e, 0xbf, 0xaa, + 0x89, 0x37, 0xaa, 0x88, 0xad, 0xdf, 0x6b, 0x48, 0x7f, 0xac, 0x9b, 0xa3, + 0xa8, 0x96, 0x46, 0x62, 0x33, 0x9b, 0x95, 0x65, 0xa6, 0x54, 0x30, 0x77, + 0xb2, 0xb4, 0x3d, 0xc9, 0xaf, 0x88, 0xac, 0xad, 0x92, 0x66, 0x6a, 0x98, + 0x47, 0x69, 0x94, 0xa4, 0x6d, 0x8d, 0x45, 0x6e, 0x8f, 0x58, 0xbb, 0xca, + 0x67, 0xb8, 0x57, 0x49, 0x9d, 0xc0, 0x75, 0x73, 0x6e, 0xbd, 0xa0, 0xa0, + 0x33, 0xc4, 0x84, 0x3b, 0x94, 0xd2, 0x37, 0x8e, 0x6b, 0x64, 0xb5, 0x6c, + 0x9a, 0x6d, 0x7c, 0x79, 0xbc, 0x41, 0x83, 0x9a, 0xad, 0x3a, 0xb9, 0x63, + 0xbd, 0xca, 0x99, 0x8c, 0xa8, 0xc6, 0x54, 0x3a, 0x5d, 0x59, 0xb2, 0x67, + 0x88, 0x95, 0x4b, 0xaa, 0x4d, 0x7b, 0xb2, 0x72, 0x80, 0x72, 0x4d, 0x41, + 0x77, 0x6a, 0x5e, 0x6e, 0xc2, 0x62, 0xc5, 0x3b, 0x7b, 0xc9, 0x94, 0x2d, + 0xba, 0x8d, 0x66, 0x33, 0x62, 0xb4, 0x8c, 0xba, 0x7a, 0x89, 0x73, 0x71, + 0xaa, 0x9e, 0x8a, 0xb4, 0x87, 0xb4, 0x8e, 0x64, 0x8f, 0xc7, 0xbd, 0xd3, + 0x68, 0xa1, 0x8f, 0x80, 0xbd, 0x39, 0xa5, 0xa6, 0x6b, 0x9d, 0x84, 0xbb, + 0x3b, 0x3d, 0x9d, 0xb6, 0x53, 0x5c, 0xb0, 0x42, 0xc9, 0x87, 0xb5, 0xdd, + 0x69, 0xac, 0x74, 0x5b, 0xc7, 0xd0, 0xb8, 0x93, 0x92, 0x49, 0x67, 0x40, + 0x6f, 0x5e, 0x33, 0x93, 0xa5, 0x43, 0xcd, 0x82, 0x85, 0xc7, 0x5b, 0xba, + 0x6b, 0x65, 0x9d, 0x5b, 0x59, 0x4e, 0x45, 0x2f, 0xce, 0x8e, 0x4c, 0x55, + 0x61, 0x40, 0xdc, 0x42, 0xbd, 0x93, 0xaf, 0xd6, 0xb5, 0xc2, 0x3e, 0xbd, + 0x58, 0x8a, 0xbf, 0x99, 0xcc, 0xc1, 0xd6, 0x9c, 0x59, 0x48, 0x98, 0xc4, + 0x68, 0xc8, 0x49, 0xcc, 0xc5, 0xb1, 0x57, 0x8e, 0x54, 0xb9, 0x7a, 0x86, + 0x52, 0x70, 0x57, 0xa2, 0x65, 0x44, 0xbd, 0xec, 0x5c, 0xaa, 0x63, 0xc6, + 0x60, 0x9d, 0x3a, 0x28, 0xb1, 0xa3, 0xb1, 0x66, 0x5c, 0x5a, 0x6f, 0x4d, + 0x3a, 0x94, 0x8f, 0xd4, 0xbc, 0xcb, 0x6d, 0xac, 0x8e, 0xb7, 0x73, 0x69, + 0x52, 0x36, 0xd4, 0xad, 0xbb, 0x6e, 0xa4, 0x84, 0x8d, 0x3b, 0x6d, 0x60, + 0x6b, 0x7d, 0x41, 0x67, 0x76, 0x99, 0xbf, 0xaf, 0x8a, 0x71, 0x4b, 0xb2, + 0x7e, 0x42, 0x3a, 0x5f, 0x9f, 0xbc, 0xa3, 0xb3, 0xcf, 0x52, 0xa5, 0xb8, + 0x3c, 0x6c, 0x79, 0x3f, 0x6f, 0x4a, 0x6d, 0xac, 0xc4, 0xdb, 0xdb, 0xc2, + 0x71, 0xa9, 0x58, 0x32, 0xb1, 0x79, 0xc3, 0xc0, 0x36, 0x8c, 0x5a, 0x3e, + 0xa1, 0xda, 0xca, 0x90, 0x3d, 0xc5, 0x7e, 0x9d, 0x38, 0x9c, 0xaf, 0x4f, + 0x6a, 0x7b, 0xba, 0x6b, 0x91, 0x99, 0x96, 0xc4, 0x8a, 0x39, 0xb4, 0xcc, + 0x48, 0x4f, 0xb3, 0x76, 0x5b, 0x8f, 0x8f, 0x3c, 0xae, 0x4a, 0xb4, 0x87, + 0x6e, 0xc4, 0x4a, 0xe2, 0x98, 0x87, 0x8e, 0x89, 0x61, 0x59, 0x43, 0xd0, + 0x8f, 0x6f, 0x32, 0xd2, 0x5b, 0xce, 0x9b, 0x76, 0xbe, 0x9f, 0x93, 0x41, + 0x49, 0x93, 0x60, 0xae, 0xbc, 0x64, 0x7c, 0xb9, 0x5c, 0xb1, 0xc3, 0xb6, + 0x74, 0x5e, 0x86, 0x94, 0x8c, 0xd0, 0xbe, 0x6c, 0x46, 0x55, 0x77, 0xad, + 0x5e, 0xc1, 0x47, 0x60, 0x92, 0xb5, 0x43, 0x53, 0x66, 0x48, 0x8b, 0x85, + 0xa4, 0xca, 0x87, 0x8d, 0x77, 0x8a, 0xad, 0x7f, 0x33, 0xa6, 0xc9, 0x93, + 0x40, 0x6d, 0x4e, 0x4b, 0x42, 0x3f, 0x5a, 0x8f, 0xa5, 0xae, 0xbd, 0xd5, + 0xab, 0x35, 0x8a, 0x51, 0x7b, 0x53, 0xc5, 0xd5, 0x8a, 0x37, 0x81, 0xae, + 0xb7, 0xc1, 0x83, 0xaa, 0x60, 0x60, 0x6d, 0x4d, 0x6a, 0x4c, 0x5b, 0xc0, + 
0xaf, 0xa9, 0x65, 0xc4, 0x47, 0xa9, 0xc1, 0x5f, 0xa1, 0x99, 0x55, 0xcb, + 0x90, 0x85, 0x5b, 0xbd, 0x99, 0x33, 0x60, 0xa9, 0xbc, 0xc7, 0xbf, 0xb1, + 0x5d, 0xa5, 0x7b, 0x5c, 0x43, 0x69, 0x85, 0x89, 0x73, 0x50, 0x89, 0xc2, + 0x8a, 0xa8, 0xb3, 0xd6, 0x74, 0x72, 0xbd, 0x53, 0x7a, 0xdd, 0x61, 0x43, + 0x47, 0x62, 0x5b, 0x8b, 0x6d, 0xc5, 0x86, 0xa3, 0x37, 0x71, 0xc7, 0xbd, + 0x88, 0x72, 0xc3, 0xd7, 0xb0, 0x64, 0xac, 0x60, 0x32, 0x80, 0x52, 0xc8, + 0x36, 0xa1, 0xb3, 0x99, 0x3c, 0x75, 0x9c, 0x40, 0x89, 0x83, 0x87, 0x9c, + 0xbd, 0x83, 0x32, 0x90, 0x3b, 0xc7, 0x84, 0x88, 0xad, 0x96, 0x98, 0x84, + 0xa0, 0x8e, 0x5e, 0x94, 0x66, 0x87, 0x9b, 0x93, 0x7e, 0x54, 0x55, 0x7a, + 0x5f, 0xae, 0x78, 0xca, 0xb0, 0x37, 0xbf, 0x7f, 0xc9, 0x8b, 0xae, 0x88, + 0x99, 0x4c, 0x59, 0x66, 0x4c, 0x78, 0x66, 0xa9, 0x88, 0xb4, 0x96, 0xc3, + 0x26, 0x49, 0xe8, 0xc5, 0x92, 0x9d, 0x9f, 0xb4, 0xd3, 0x3f, 0xb8, 0x4b, + 0x8a, 0xd7, 0xb1, 0x7d, 0xc2, 0x59, 0x47, 0x42, 0x6f, 0xad, 0x7d, 0xc7, + 0xc7, 0xbc, 0x69, 0x8a, 0x90, 0x60, 0xa2, 0x54, 0x49, 0x66, 0xc9, 0xb1, + 0xbc, 0x74, 0x40, 0x36, 0x49, 0xb2, 0x7e, 0x97, 0x80, 0xc9, 0x30, 0x71, + 0x88, 0x31, 0x42, 0x38, 0xb4, 0x82, 0x3f, 0xc0, 0xa0, 0xb4, 0x46, 0x5f, + 0x9d, 0xb3, 0x76, 0x4a, 0xba, 0x8c, 0x6a, 0x44, 0xa3, 0x62, 0x58, 0x3e, + 0x57, 0x73, 0x9d, 0xad, 0x39, 0xc7, 0x4f, 0x8c, 0x67, 0x9d, 0xb6, 0x57, + 0xb9, 0x95, 0xbc, 0x35, 0x95, 0x47, 0x4e, 0x57, 0xa6, 0x85, 0xbb, 0xcf, + 0xc8, 0x89, 0x3d, 0x8d, 0xc9, 0x9b, 0xbf, 0xc8, 0x4f, 0x2b, 0xc9, 0x7e, + 0x9f, 0x51, 0x51, 0x8b, 0x51, 0x4e, 0x54, 0x41, 0xba, 0x43, 0xac, 0x30, + 0xb8, 0x41, 0xa1, 0x59, 0xb1, 0x9c, 0xae, 0x69, 0x3a, 0x6e, 0xba, 0x51, + 0x9f, 0x89, 0xc3, 0x3c, 0x5a, 0xc6, 0xd5, 0x48, 0x31, 0x5b, 0x83, 0x37, + 0x83, 0x67, 0x3f, 0xc8, 0xc1, 0x79, 0x58, 0xc6, 0xb9, 0x6a, 0x78, 0x64, + 0x8f, 0xb1, 0x6c, 0x57, 0xbf, 0x9b, 0x9b, 0x9e, 0xb0, 0x50, 0x52, 0x4a, + 0x9c, 0xc2, 0x45, 0xbe, 0x79, 0x7d, 0x8e, 0x83, 0x68, 0xc0, 0x65, 0x96, + 0x4c, 0xb2, 0x5c, 0x86, 0x9b, 0xa4, 0x3a, 0x81, 0x5e, 0x85, 0x8f, 0x88, + 0xb3, 0x82, 0xc6, 0x72, 0x5c, 0x37, 0x89, 0x70, 0x96, 0xaa, 0x82, 0xa1, + 0x5e, 0x4e, 0x48, 0x5d, 0x63, 0x6e, 0x87, 0xc5, 0xca, 0xb5, 0x64, 0x56, + 0x7e, 0x51, 0x81, 0x4c, 0xa2, 0x9e, 0x5c, 0x79, 0xcb, 0x3b, 0xcd, 0x3c, + 0xa9, 0x66, 0x95, 0x73, 0x38, 0x7a, 0x36, 0x52, 0x95, 0xc2, 0x4c, 0x91, + 0xc0, 0x80, 0x73, 0x9c, 0x72, 0x44, 0x72, 0x9d, 0x96, 0xce, 0x5b, 0x92, + 0x4a, 0xd1, 0x63, 0xca, 0xa6, 0x78, 0x3a, 0x63, 0x92, 0x48, 0xd3, 0x53, + 0x9c, 0xa2, 0x97, 0x79, 0xb8, 0xb4, 0x8a, 0x5f, 0x36, 0xa9, 0x6d, 0x48, + 0x78, 0x3c, 0xa5, 0x90, 0x58, 0xad, 0x7d, 0x59, 0xb0, 0x55, 0x78, 0x9a, + 0xbc, 0x4b, 0x96, 0x60, 0xa2, 0x68, 0x61, 0xb9, 0xd2, 0x89, 0x71, 0x8b, + 0xbd, 0xb3, 0x43, 0x93, 0x3a, 0xb3, 0x83, 0x84, 0x82, 0x6f, 0x3d, 0xbe, + 0xc9, 0xc0, 0x8c, 0x50, 0xc3, 0x85, 0x99, 0x5a, 0x4a, 0x37, 0x41, 0x92, + 0x5c, 0x79, 0x4c, 0x3b, 0x6d, 0xb2, 0xaf, 0xa4, 0x68, 0xce, 0xd4, 0x7e, + 0xa2, 0x68, 0x6f, 0x84, 0x5a, 0x63, 0x96, 0x8f, 0x50, 0x60, 0x5b, 0x9f, + 0xbf, 0x84, 0xa3, 0x49, 0x4a, 0x79, 0x5f, 0x4a, 0xab, 0xc0, 0x5e, 0xf0, + 0x8b, 0x8e, 0x49, 0xa8, 0x52, 0x59, 0x43, 0xa9, 0x84, 0x5f, 0x8f, 0x76, + 0x5f, 0x8c, 0x79, 0xa0, 0x86, 0xb7, 0x62, 0xb8, 0x4a, 0x78, 0x56, 0x59, + 0x61, 0x95, 0xaf, 0x4e, 0x73, 0x9f, 0x8b, 0xa9, 0x72, 0x61, 0x61, 0x72, + 0x69, 0x5b, 0x80, 0x4c, 0x62, 0x80, 0x87, 0x60, 0xc7, 0xa6, 0xa3, 0x5c, + 0xad, 0x40, 0x38, 0x89, 0x79, 0x9e, 0xb0, 0xa2, 0x64, 0x5d, 0x8a, 0xa0, + 0x7c, 0x4c, 0x4a, 0xc4, 0x7b, 0x5c, 0x70, 0x53, 0xda, 0xc6, 0x3f, 0x8e, + 0xb3, 0x5a, 0x9b, 0x86, 0xb7, 0x7f, 0x43, 0x94, 0x82, 0xc0, 0x76, 0x47, + 
0xb1, 0x5a, 0x50, 0x52, 0xbf, 0xa7, 0x58, 0x55, 0x44, 0x79, 0x7d, 0xaa, + 0x77, 0x4a, 0xce, 0xc7, 0xac, 0xb7, 0x7d, 0xa5, 0x60, 0xcd, 0x53, 0xae, + 0x9f, 0x9c, 0x68, 0x7a, 0xb6, 0xa7, 0xae, 0x63, 0xa0, 0xb7, 0x73, 0xbe, + 0x95, 0xa6, 0x3b, 0x76, 0x4a, 0x3d, 0x85, 0x75, 0x3f, 0xc6, 0xcc, 0x5c, + 0x8f, 0x91, 0x39, 0xc6, 0x62, 0xb7, 0xb8, 0xe7, 0x7c, 0x69, 0x9a, 0x4d, + 0xb3, 0xc4, 0x73, 0x4c, 0x9f, 0x66, 0x71, 0x8f, 0x62, 0xc0, 0x5d, 0x7d, + 0x5e, 0x77, 0xc8, 0x6b, 0x4f, 0xb8, 0x90, 0x76, 0xb3, 0x92, 0xca, 0xb7, + 0x59, 0xc2, 0x88, 0x7d, 0x2d, 0xba, 0xc7, 0x4b, 0x4b, 0xbf, 0x56, 0x7e, + 0xcb, 0x9c, 0x4b, 0x59, 0xa4, 0xb9, 0x77, 0x43, 0x91, 0x8d, 0x65, 0x76, + 0x54, 0xaa, 0x35, 0x6f, 0x72, 0x8c, 0xb4, 0xa7, 0x54, 0xcc, 0x85, 0xa6, + 0x68, 0x49, 0xd1, 0xb9, 0x8b, 0x7d, 0xa8, 0x56, 0x87, 0x4e, 0xb8, 0x45, + 0x49, 0x7a, 0x52, 0x61, 0xb0, 0x78, 0xa9, 0x6d, 0x68, 0xa1, 0xb3, 0x65, + 0x94, 0xbe, 0xb0, 0xa3, 0x5b, 0x2c, 0xa2, 0x52, 0xa5, 0xc5, 0x3c, 0xab, + 0xa6, 0x8c, 0x79, 0x44, 0x58, 0x83, 0xa2, 0x3d, 0x86, 0x97, 0xa5, 0x79, + 0xa2, 0x4c, 0x55, 0x57, 0xb5, 0xbd, 0xc3, 0x4b, 0xcc, 0x3f, 0xa4, 0x64, + 0x86, 0xd0, 0x5c, 0x6b, 0xcc, 0x70, 0x46, 0x96, 0xa0, 0x2e, 0xb2, 0x6f, + 0xa6, 0x7e, 0x47, 0xc6, 0xab, 0x62, 0xa0, 0x84, 0x2c, 0xc8, 0x74, 0xbb, + 0xc1, 0x6c, 0xa3, 0xb9, 0xb3, 0x73, 0xa9, 0x4f, 0x84, 0x32, 0x9f, 0xb8, + 0xa0, 0x68, 0x7a, 0xdd, 0x6c, 0x62, 0x84, 0x92, 0x2a, 0x2e, 0xba, 0x44, + 0x86, 0x8f, 0x60, 0x55, 0xb2, 0x91, 0xc2, 0xa1, 0x5d, 0x91, 0x3e, 0x9d, + 0xa8, 0x84, 0x8f, 0x57, 0x69, 0x2d, 0xb1, 0x69, 0x2e, 0xb8, 0x7c, 0x9c, + 0x81, 0xa0, 0xa9, 0x60, 0xa2, 0x44, 0xd3, 0x90, 0x8f, 0x93, 0x4b, 0x9d, + 0xa1, 0xcb, 0x59, 0xb6, 0x5d, 0xbc, 0xa4, 0xcd, 0x9f, 0x60, 0x47, 0xbb, + 0x8d, 0x7a, 0xc6, 0xb4, 0x61, 0x33, 0x7e, 0x3a, 0x81, 0x3d, 0x62, 0xaf, + 0xcb, 0x7e, 0xc2, 0x6c, 0x9a, 0x65, 0x32, 0x74, 0x55, 0x37, 0xb1, 0x30, + 0xac, 0x7a, 0x57, 0xa0, 0x6c, 0x65, 0x81, 0xb8, 0xbf, 0x94, 0xd8, 0xb5, + 0xa4, 0x55, 0xb1, 0x95, 0xa2, 0x57, 0xb4, 0x69, 0x8e, 0x98, 0xbc, 0x8c, + 0xd2, 0xc3, 0x40, 0xb0, 0x97, 0x3a, 0x6a, 0x70, 0xbd, 0x88, 0x91, 0x7a, + 0x50, 0xae, 0x35, 0x48, 0x36, 0x6d, 0x68, 0x6c, 0x9e, 0xa0, 0x4c, 0x85, + 0xcb, 0xb1, 0x78, 0x60, 0x4b, 0x96, 0xbb, 0x3b, 0x9b, 0x78, 0xc2, 0x50, + 0x7d, 0xd3, 0x3b, 0xc9, 0xaa, 0xa3, 0x9c, 0xba, 0x56, 0x5a, 0x57, 0xc3, + 0x69, 0x63, 0x4d, 0x56, 0xa1, 0x71, 0x49, 0xc6, 0x87, 0x74, 0x38, 0x95, + 0x9e, 0x99, 0x91, 0x55, 0x94, 0x60, 0x8e, 0x82, 0x74, 0x8a, 0x3b, 0xa3, + 0xca, 0x8e, 0xb5, 0x30, 0x62, 0x4e, 0xa5, 0x58, 0x47, 0x95, 0xa3, 0x3f, + 0xd9, 0x56, 0x72, 0x6a, 0x35, 0x53, 0xae, 0x3f, 0x41, 0xa3, 0xc3, 0x36, + 0x97, 0xa3, 0x2e, 0xa0, 0x72, 0x4f, 0x6c, 0x53, 0x4e, 0x6e, 0x54, 0xaf, + 0xa0, 0x3b, 0xc7, 0x99, 0x8b, 0x76, 0x7b, 0x33, 0x5e, 0x8b, 0x5e, 0x8f, + 0x9c, 0xab, 0x57, 0x8c, 0xc5, 0x2e, 0xc3, 0xca, 0xb5, 0xbc, 0x63, 0xc8, + 0x3c, 0x47, 0x4f, 0x9d, 0x7e, 0x65, 0xa2, 0x68, 0xab, 0xa2, 0x33, 0xd8, + 0x74, 0x3f, 0x5a, 0x80, 0x8b, 0x5b, 0x7c, 0x4f, 0x93, 0x30, 0x6d, 0xa6, + 0x72, 0x6c, 0x4e, 0x5e, 0xb4, 0x67, 0xc8, 0x76, 0x80, 0x5d, 0x56, 0x96, + 0x70, 0x94, 0xa1, 0xab, 0x97, 0x60, 0xa8, 0xbd, 0x91, 0x77, 0xbd, 0x36, + 0x40, 0xca, 0x7e, 0x9b, 0xc8, 0xb6, 0x8b, 0x9d, 0x7a, 0xcb, 0xb9, 0x44, + 0xb5, 0x7b, 0x56, 0x39, 0x99, 0x48, 0x4b, 0x3c, 0xbd, 0x4e, 0xa7, 0x35, + 0x42, 0xaf, 0x94, 0xa6, 0x6f, 0x24, 0x82, 0x9e, 0xca, 0x75, 0x6f, 0xb4, + 0x87, 0x48, 0x32, 0xa5, 0xb4, 0x9d, 0x99, 0xd9, 0x94, 0x7c, 0x3a, 0x8b, + 0x40, 0x61, 0x94, 0xae, 0x87, 0xd2, 0x51, 0x41, 0x36, 0x88, 0x3f, 0x40, + 0x8e, 0x47, 0x74, 0xa8, 0x93, 0xdb, 0x88, 0xc3, 0xc8, 0xb0, 0x50, 0x56, + 
0xd2, 0x68, 0x33, 0x75, 0x30, 0x3f, 0x6e, 0x85, 0xa9, 0x70, 0x3f, 0x38, + 0x65, 0x55, 0x72, 0x68, 0x8f, 0xa7, 0x60, 0x86, 0xc8, 0x43, 0xa7, 0xbc, + 0x76, 0xc7, 0x66, 0x6f, 0x40, 0x3e, 0x39, 0xbc, 0x7c, 0x64, 0xa8, 0x78, + 0x43, 0x3c, 0x80, 0x7b, 0xd9, 0x8f, 0x6a, 0x2e, 0x96, 0x43, 0xd6, 0x6e, + 0xa1, 0xb0, 0xab, 0x36, 0x72, 0x3e, 0xbc, 0x73, 0x82, 0x67, 0xb4, 0x7a, + 0x98, 0x3f, 0xb4, 0xae, 0x78, 0x68, 0xaf, 0x70, 0xda, 0x77, 0x91, 0x77, + 0x41, 0x6f, 0xc7, 0x8b, 0x87, 0xe7, 0x48, 0x66, 0x78, 0x5e, 0xc6, 0xb7, + 0xa9, 0x53, 0x8e, 0x54, 0x8c, 0x6f, 0x61, 0x50, 0x5b, 0x5c, 0xa3, 0x87, + 0x6f, 0x49, 0x9c, 0x50, 0x55, 0x5b, 0x3c, 0x7b, 0x92, 0x89, 0x41, 0x8f, + 0xb5, 0xb3, 0x6c, 0xce, 0x3f, 0xd3, 0xcb, 0x3f, 0xcb, 0x6d, 0x4b, 0x57, + 0x71, 0xbb, 0x96, 0x70, 0x9e, 0xac, 0xa8, 0x6a, 0x7b, 0x69, 0xce, 0x9a, + 0x8a, 0x71, 0xbd, 0xbb, 0xd0, 0x45, 0xd3, 0xc5, 0xc6, 0x7b, 0x40, 0x43, + 0x51, 0x93, 0x58, 0x63, 0x51, 0xd0, 0x82, 0x77, 0x89, 0x56, 0x51, 0xb8, + 0x7a, 0xa8, 0xd0, 0xaf, 0x92, 0x4e, 0x73, 0x49, 0x5d, 0x56, 0x40, 0x8c, + 0x1e, 0x8b, 0x5e, 0x6a, 0x5a, 0x41, 0x4d, 0x39, 0xa9, 0xab, 0x56, 0x84, + 0xbc, 0x48, 0x6d, 0xda, 0x78, 0x5b, 0x8d, 0xa8, 0x78, 0x35, 0x4b, 0xd5, + 0xc5, 0x29, 0x32, 0x44, 0xa9, 0xba, 0x81, 0xbb, 0xb1, 0xa5, 0x55, 0x9e, + 0xbc, 0x9c, 0x63, 0x48, 0x4f, 0x73, 0xcf, 0x9b, 0xca, 0xb7, 0x31, 0x98, + 0xb2, 0x7e, 0x5a, 0x69, 0xa2, 0x54, 0xc6, 0x70, 0xa1, 0x71, 0xa0, 0x5d, + 0x7a, 0x83, 0x44, 0xa9, 0x9b, 0x97, 0xa7, 0x7e, 0xa6, 0x53, 0x9a, 0x3e, + 0x8c, 0x86, 0x6b, 0x61, 0x38, 0x82, 0x66, 0x5e, 0xe1, 0xb7, 0x42, 0xab, + 0x40, 0xbd, 0x92, 0xbd, 0xa5, 0x92, 0x7a, 0x41, 0x9d, 0xb5, 0xb4, 0x7d, + 0xc8, 0x84, 0xc5, 0xb9, 0x83, 0x3d, 0x63, 0x73, 0x57, 0xba, 0x40, 0x43, + 0xcf, 0x9f, 0x5e, 0xae, 0xb6, 0xd2, 0xd4, 0x78, 0x80, 0x8a, 0x92, 0x5c, + 0x67, 0x3c, 0xa9, 0x93, 0xa5, 0x68, 0x85, 0xa2, 0x78, 0x89, 0x8c, 0x4f, + 0x3b, 0xb1, 0x3f, 0x5b, 0x51, 0x3e, 0xa3, 0xab, 0x37, 0xac, 0xc1, 0x3d, + 0x68, 0x31, 0x30, 0x8a, 0x78, 0x88, 0x3f, 0x4b, 0x81, 0xdd, 0x69, 0x56, + 0xb9, 0x6d, 0x73, 0x25, 0x4f, 0xac, 0x4b, 0xac, 0xa7, 0xa7, 0x57, 0xc0, + 0xcc, 0xb7, 0x5b, 0x9e, 0x73, 0xc1, 0x6a, 0x89, 0xb9, 0x9b, 0xa7, 0x2d, + 0x9a, 0x96, 0xd1, 0x34, 0x7c, 0xb0, 0x97, 0xb0, 0xad, 0x8f, 0x7e, 0xcb, + 0x3d, 0x89, 0x6c, 0xb2, 0xd1, 0x84, 0xcb, 0xa7, 0x9b, 0xad, 0x9f, 0x72, + 0x5a, 0x54, 0x5e, 0x58, 0xb3, 0xc0, 0x69, 0x91, 0x57, 0xcf, 0x62, 0x65, + 0x8d, 0x85, 0x5c, 0x35, 0xbd, 0xa0, 0xb3, 0x87, 0x50, 0x44, 0x31, 0x4f, + 0x6e, 0x6a, 0x9d, 0x55, 0x6e, 0x7f, 0xd8, 0x9b, 0x2c, 0x35, 0x38, 0xaa, + 0x8a, 0xce, 0x43, 0x61, 0xd5, 0xa9, 0x73, 0x8b, 0x2b, 0x35, 0x57, 0x47, + 0x4a, 0x33, 0x92, 0x6d, 0xc8, 0xac, 0x60, 0x7f, 0xdc, 0xb5, 0x3f, 0x33, + 0x73, 0x3d, 0x3e, 0x32, 0xb7, 0x87, 0x45, 0xbf, 0xb9, 0xd2, 0x82, 0x66, + 0xa6, 0x77, 0x37, 0x79, 0x47, 0x64, 0x84, 0x85, 0x70, 0x40, 0xbd, 0x53, + 0xb8, 0x53, 0x9b, 0x81, 0x46, 0x57, 0xa9, 0x85, 0x70, 0xb9, 0x58, 0x58, + 0x67, 0xc0, 0x5a, 0xa4, 0x4c, 0xc8, 0x61, 0xa1, 0xc0, 0xc9, 0x46, 0x4e, + 0xa9, 0xce, 0x7f, 0x81, 0x67, 0x8e, 0x7a, 0x62, 0x93, 0x38, 0xc9, 0xc8, + 0x8a, 0x6d, 0xb9, 0x44, 0x62, 0xc3, 0x84, 0x45, 0x82, 0x4d, 0x80, 0xda, + 0xaa, 0xd4, 0x7f, 0x71, 0x6a, 0xc1, 0x8d, 0x54, 0xb5, 0x76, 0x92, 0x8f, + 0x3b, 0xcb, 0x2e, 0x60, 0x3e, 0x89, 0xcb, 0x6e, 0x3f, 0xa7, 0x8a, 0xd2, + 0xad, 0x76, 0x72, 0x53, 0xa2, 0x52, 0x7e, 0xa6, 0x90, 0x58, 0x81, 0x57, + 0x7f, 0x42, 0x9e, 0xb2, 0xaa, 0x8c, 0xa9, 0x9f, 0x76, 0x52, 0x64, 0x4e, + 0x8a, 0x87, 0xa6, 0x63, 0x88, 0xd3, 0xcc, 0x5f, 0x56, 0x4b, 0xa5, 0xae, + 0x4e, 0x46, 0xc7, 0x6f, 0x65, 0x85, 0x6c, 0x4b, 0x3f, 0x75, 0xb0, 0x84, + 
0x67, 0xcc, 0x8f, 0x59, 0x30, 0x76, 0xc7, 0x71, 0xb9, 0x49, 0x58, 0x42, + 0xaa, 0x7a, 0x47, 0x41, 0x68, 0x34, 0x5d, 0x7d, 0xa0, 0x8f, 0x3a, 0x92, + 0x7c, 0xba, 0xd8, 0x8e, 0x7d, 0xc4, 0x5f, 0x29, 0x88, 0x36, 0xca, 0xc4, + 0x41, 0x9d, 0xb3, 0x7d, 0x50, 0x60, 0x71, 0x8d, 0x74, 0x53, 0x33, 0x5b, + 0x4c, 0x47, 0x43, 0xa1, 0xbf, 0xab, 0xd9, 0xa9, 0xb4, 0x9e, 0x52, 0x53, + 0xa7, 0x58, 0x60, 0x93, 0x5a, 0xb0, 0x9d, 0x6e, 0xaf, 0xdb, 0x6b, 0x67, + 0x61, 0xd0, 0x5e, 0xa3, 0x8f, 0xba, 0x8f, 0xb3, 0x75, 0x90, 0x76, 0x69, + 0x48, 0x36, 0x34, 0xa8, 0xc1, 0x55, 0xb2, 0x81, 0x99, 0xc6, 0xa2, 0x78, + 0xa0, 0xad, 0x69, 0x37, 0x91, 0xca, 0xc2, 0x8a, 0xaa, 0xbb, 0xbc, 0xc7, + 0x6f, 0xab, 0x74, 0x6c, 0x96, 0x9b, 0xc3, 0x66, 0x8e, 0xcc, 0x98, 0xba, + 0xad, 0xd5, 0xc7, 0x8e, 0x5b, 0xaf, 0x36, 0xbe, 0x91, 0xc3, 0x53, 0x6b, + 0x35, 0xc0, 0xd0, 0x90, 0x92, 0x55, 0xc9, 0x74, 0x95, 0x9d, 0x42, 0xa2, + 0x3c, 0xbf, 0x60, 0x3b, 0x33, 0xbe, 0xc9, 0x8d, 0xbd, 0xb7, 0x5d, 0x60, + 0x71, 0x8d, 0x4f, 0x79, 0x40, 0xb1, 0xc1, 0x94, 0x57, 0xca, 0x4e, 0x82, + 0x74, 0x7f, 0x48, 0x7d, 0x62, 0x4f, 0x52, 0x91, 0x74, 0x6e, 0x80, 0xae, + 0x77, 0x32, 0x57, 0x64, 0x83, 0xcb, 0x82, 0x8b, 0x3c, 0xe8, 0x88, 0xbb, + 0x51, 0x69, 0x97, 0xd5, 0xd5, 0x40, 0xc6, 0x3b, 0xc7, 0x61, 0x35, 0xd6, + 0x7f, 0x2b, 0xc8, 0xb3, 0x5e, 0xc4, 0xcb, 0x8b, 0xc4, 0x70, 0x9a, 0x9a, + 0x61, 0x3c, 0x39, 0x3b, 0x7b, 0xbe, 0x37, 0x72, 0x57, 0xaf, 0x94, 0x54, + 0x90, 0xa4, 0x9d, 0xa5, 0x5d, 0x65, 0x52, 0x80, 0x95, 0xa7, 0x40, 0xa4, + 0x95, 0xab, 0x55, 0x5e, 0x42, 0xb1, 0x83, 0x92, 0xb8, 0x30, 0xcd, 0xb3, + 0x6a, 0x64, 0x33, 0xb3, 0x4a, 0x9d, 0x6d, 0x76, 0x55, 0xa0, 0xae, 0x4b, + 0x9e, 0x87, 0x4c, 0x9b, 0x41, 0x2e, 0x5d, 0x4a, 0x94, 0x87, 0x86, 0x58, + 0x88, 0x4a, 0x8d, 0xc7, 0xc6, 0x5b, 0x39, 0x87, 0xad, 0xe0, 0x52, 0x9e, + 0x58, 0x68, 0xc7, 0xc5, 0x4e, 0xb6, 0x65, 0x72, 0x75, 0x72, 0xbb, 0x96, + 0xc5, 0x9c, 0xa7, 0xa5, 0x2d, 0xae, 0xca, 0x4e, 0x37, 0x7c, 0x4f, 0xba, + 0x48, 0x73, 0x4f, 0xc2, 0xc4, 0x81, 0x5a, 0x53, 0x89, 0x9d, 0x35, 0x6d, + 0x44, 0x3c, 0x92, 0x3e, 0x91, 0xce, 0x7e, 0x59, 0x8d, 0x80, 0x8c, 0x4d, + 0x89, 0x44, 0xa2, 0x7d, 0xb9, 0x6e, 0x9c, 0x82, 0x78, 0xbe, 0x6f, 0xd4, + 0x8b, 0x79, 0x80, 0x89, 0x34, 0x9b, 0x9b, 0xa5, 0x78, 0x7f, 0x79, 0x40, + 0x9b, 0x95, 0x70, 0x8d, 0x46, 0x89, 0x82, 0x88, 0x8f, 0x57, 0x40, 0xa6, + 0x5a, 0x92, 0x96, 0x8a, 0xc6, 0xbb, 0x8f, 0xb3, 0x5f, 0x60, 0x6c, 0x56, + 0xb8, 0x94, 0x50, 0xa9, 0x33, 0x55, 0x7e, 0xb0, 0xc9, 0xa6, 0x4c, 0xcd, + 0xae, 0xc3, 0x78, 0x55, 0xcb, 0x4f, 0xb6, 0x52, 0x43, 0xc9, 0x79, 0x80, + 0x4e, 0x5d, 0xb3, 0x6e, 0xc5, 0xbe, 0x44, 0x74, 0x9a, 0xc5, 0x40, 0x98, + 0x59, 0x64, 0x84, 0x44, 0xdd, 0xd0, 0xb1, 0x78, 0xae, 0x9f, 0x84, 0xb5, + 0x75, 0x92, 0x43, 0x91, 0x5b, 0x76, 0xc7, 0x6e, 0xaa, 0xc6, 0xd3, 0x37, + 0x3d, 0xbd, 0x9c, 0x6e, 0xad, 0x75, 0x7d, 0x72, 0x6d, 0x54, 0xa8, 0x62, + 0x8f, 0x44, 0x7a, 0xda, 0x96, 0xa8, 0x6c, 0xe2, 0x65, 0x5d, 0x50, 0x3a, + 0xa2, 0x55, 0x42, 0x80, 0xc5, 0xc4, 0xaa, 0xd1, 0x75, 0x7e, 0x40, 0x60, + 0x3a, 0x40, 0x8c, 0x76, 0x79, 0x71, 0x9e, 0x33, 0xc6, 0x46, 0x52, 0xcd, + 0x74, 0x99, 0xbb, 0x33, 0x34, 0x5f, 0x93, 0x82, 0x68, 0xb2, 0x70, 0x73, + 0xb3, 0x48, 0x6c, 0xc5, 0x6e, 0x9b, 0x6b, 0xbb, 0xc4, 0x61, 0x5b, 0xa4, + 0x5b, 0x7a, 0xb7, 0x47, 0x78, 0xce, 0xbc, 0x86, 0x77, 0xb0, 0xa7, 0x9c, + 0x87, 0x71, 0x66, 0x6c, 0xbb, 0x28, 0xbe, 0x84, 0x79, 0xb1, 0x96, 0x29, + 0x84, 0xbd, 0x5b, 0x6d, 0x66, 0xd1, 0xb2, 0x81, 0x32, 0xc0, 0x37, 0x72, + 0xa3, 0x59, 0x3a, 0x3d, 0xbb, 0x3f, 0x41, 0xaf, 0xca, 0x25, 0x35, 0x82, + 0xaa, 0x7b, 0xac, 0xcb, 0x7c, 0x53, 0xc7, 0x6b, 0xb5, 0xae, 0x51, 0x6e, + 
0xaf, 0xbd, 0xba, 0x8f, 0x9c, 0xa1, 0x59, 0x9b, 0xa1, 0xc5, 0x91, 0x71, + 0x8e, 0x48, 0x99, 0x78, 0x8b, 0x77, 0x55, 0x59, 0x91, 0x98, 0xd2, 0x64, + 0x32, 0x52, 0x99, 0x82, 0x63, 0x3c, 0x82, 0x7b, 0xaf, 0xc0, 0x96, 0x3b, + 0x63, 0x7d, 0xab, 0xb2, 0x85, 0x34, 0x4b, 0x86, 0x3d, 0x6b, 0x6a, 0x94, + 0x67, 0xd9, 0x89, 0xca, 0xc4, 0x7c, 0x62, 0x97, 0x9d, 0x71, 0xb1, 0xc5, + 0x79, 0xac, 0x87, 0x4d, 0x8d, 0x61, 0x83, 0xc2, 0xaa, 0xc0, 0x3e, 0xa5, + 0x97, 0xdc, 0x56, 0xb2, 0xaf, 0x63, 0xa1, 0x9e, 0x3d, 0xb2, 0x74, 0xbf, + 0x5c, 0x6c, 0x8e, 0xb4, 0x4e, 0x32, 0x50, 0x8d, 0xc0, 0x65, 0x9b, 0x4d, + 0x47, 0x8c, 0x44, 0x99, 0x9f, 0xcc, 0x9d, 0x47, 0x38, 0x81, 0x3e, 0x78, + 0x4a, 0xc2, 0x52, 0xa3, 0x78, 0x3f, 0x60, 0x31, 0x8c, 0xc3, 0x6d, 0xaf, + 0xa9, 0x5d, 0x42, 0x6a, 0x32, 0x6e, 0x59, 0xb0, 0xc3, 0xbe, 0x89, 0x8a, + 0x49, 0x44, 0x7b, 0x82, 0x51, 0xb1, 0x88, 0x80, 0x51, 0x55, 0xc3, 0xbb, + 0x9a, 0x2d, 0x4c, 0x9c, 0xab, 0x66, 0xc9, 0xb0, 0xaf, 0xb7, 0x30, 0xca, + 0x72, 0xbe, 0xa5, 0x69, 0x75, 0xcf, 0x82, 0xc6, 0xc1, 0xb8, 0x4b, 0x60, + 0x32, 0x95, 0x96, 0xb8, 0x65, 0x7b, 0x48, 0x91, 0x96, 0xb0, 0x74, 0x92, + 0xa8, 0x8e, 0x5d, 0xb5, 0xc1, 0xbc, 0x90, 0xa9, 0x76, 0x93, 0xa6, 0x3f, + 0x57, 0x41, 0xce, 0x5e, 0xce, 0x85, 0x85, 0xc3, 0xa8, 0x4d, 0x9a, 0x7e, + 0x54, 0x60, 0x46, 0x53, 0x3f, 0xc4, 0xb7, 0x52, 0xa4, 0xb3, 0x7f, 0x46, + 0x9a, 0x56, 0xb0, 0x99, 0x6a, 0x9f, 0x8d, 0xb0, 0x4d, 0xbe, 0x60, 0x4c, + 0xa6, 0x8d, 0xbc, 0x3a, 0x65, 0x63, 0x73, 0x8d, 0x56, 0x67, 0x52, 0xa7, + 0x42, 0x68, 0xaa, 0x6e, 0x93, 0x7b, 0xc8, 0xd5, 0xa9, 0x8c, 0x9b, 0x74, + 0x5c, 0x3b, 0x99, 0x87, 0x94, 0x4c, 0x3e, 0x5e, 0x9d, 0xd1, 0x72, 0x3f, + 0x55, 0xa3, 0x67, 0x98, 0x83, 0x8a, 0x46, 0xcc, 0xba, 0xa2, 0xb2, 0xa1, + 0x3a, 0x3b, 0x3d, 0x6c, 0x91, 0xae, 0x3b, 0x44, 0xa5, 0x6b, 0x2f, 0x67, + 0xc5, 0xbe, 0xd5, 0x9d, 0x9f, 0xcd, 0x8a, 0xa3, 0x34, 0x9c, 0xd0, 0x51, + 0x59, 0x9c, 0xa9, 0xa1, 0x7e, 0xa7, 0x8f, 0xb7, 0x6d, 0x78, 0x9d, 0x94, + 0x9d, 0x56, 0xa1, 0xb6, 0x65, 0x97, 0x6b, 0xa6, 0xa4, 0x75, 0x35, 0x5e, + 0x80, 0x7b, 0x50, 0x4b, 0xca, 0x69, 0x6c, 0x56, 0xb1, 0xd6, 0x73, 0x95, + 0x68, 0xa8, 0x7d, 0xc3, 0x85, 0x3c, 0xd4, 0xbb, 0x9f, 0x8f, 0xaf, 0x34, + 0x7d, 0xb6, 0xc3, 0xd5, 0xc5, 0xb6, 0xa0, 0xa4, 0xa0, 0x6c, 0x94, 0xb7, + 0x79, 0x49, 0x91, 0x5a, 0x6b, 0xda, 0x53, 0x69, 0x69, 0x63, 0x77, 0xb2, + 0x9d, 0xa0, 0x92, 0x52, 0x75, 0xbc, 0x51, 0xaa, 0xc1, 0xd5, 0xa6, 0xca, + 0x81, 0x7e, 0x8f, 0x87, 0x6e, 0x9a, 0x9f, 0x7b, 0xbe, 0x6d, 0xb7, 0x3e, + 0xb5, 0xc7, 0x3d, 0x37, 0xb8, 0xe0, 0xdf, 0x6a, 0x72, 0xcd, 0x6f, 0xac, + 0xc3, 0x37, 0x7b, 0x74, 0xb5, 0x1f, 0x6f, 0xd9, 0x5d, 0x8b, 0x69, 0x37, + 0xb5, 0xc5, 0x9f, 0x41, 0xb9, 0xa0, 0xe2, 0x6c, 0x99, 0xa0, 0xaa, 0x40, + 0x70, 0xa1, 0xa7, 0x4a, 0x4d, 0x89, 0x4e, 0x59, 0x55, 0x5d, 0x93, 0x81, + 0x4f, 0x45, 0xa5, 0x3f, 0x5c, 0x2f, 0x95, 0x31, 0xc3, 0x62, 0x51, 0x54, + 0x5e, 0x9c, 0x89, 0xd7, 0x6d, 0x9a, 0x4e, 0x86, 0x4a, 0x85, 0x90, 0xcc, + 0x32, 0x8c, 0xb6, 0xaf, 0xc7, 0xd5, 0xc9, 0x85, 0xa0, 0x95, 0x94, 0xba, + 0x3f, 0x54, 0x43, 0x85, 0x6d, 0x39, 0x95, 0x6b, 0xc2, 0x46, 0xd5, 0x68, + 0x90, 0xb4, 0x74, 0x34, 0x59, 0x5f, 0x69, 0x88, 0x47, 0x39, 0x6d, 0x64, + 0xac, 0x38, 0x7c, 0xc2, 0x9f, 0x34, 0xcd, 0x68, 0x83, 0xb3, 0xe2, 0xb7, + 0x60, 0x66, 0x9a, 0xc3, 0x41, 0x8e, 0x53, 0x32, 0xac, 0xd7, 0x4f, 0x53, + 0x6d, 0xb2, 0xbc, 0x89, 0xbb, 0x94, 0x80, 0x59, 0x81, 0x39, 0xce, 0x7f, + 0x8f, 0xc1, 0x78, 0x88, 0x80, 0xc2, 0x90, 0x5c, 0xa5, 0x83, 0x63, 0x74, + 0xb3, 0x3c, 0x4a, 0xb5, 0xd7, 0x7b, 0x8d, 0x4d, 0xbf, 0x9d, 0x83, 0xa4, + 0xd5, 0xd1, 0x32, 0x9e, 0x82, 0xb5, 0xc8, 0x92, 0x92, 0x67, 0x3a, 0x9a, + 
0x37, 0x60, 0x5d, 0x5f, 0x48, 0xce, 0xbb, 0x85, 0x76, 0x9f, 0x2f, 0x5a, + 0x4e, 0x9e, 0x7d, 0x8c, 0x47, 0xb5, 0xb7, 0x83, 0x72, 0x5e, 0x7c, 0xcc, + 0xc6, 0x56, 0x88, 0x4d, 0xc5, 0x45, 0xb3, 0x96, 0x8a, 0x66, 0xb1, 0xa1, + 0x8b, 0x3a, 0x7b, 0xc4, 0x56, 0xac, 0x47, 0x98, 0xc9, 0x45, 0xe4, 0x5a, + 0x6c, 0x58, 0xca, 0x5e, 0xa9, 0x40, 0x30, 0x49, 0xd0, 0x60, 0x55, 0x56, + 0xb8, 0xab, 0xaa, 0x7a, 0xb3, 0x2e, 0x7a, 0x72, 0xce, 0x38, 0xa3, 0xb6, + 0x4e, 0x6f, 0x6f, 0x4c, 0x49, 0x69, 0x86, 0xcd, 0xbc, 0xc3, 0x75, 0x41, + 0x35, 0x89, 0x62, 0x82, 0xac, 0x78, 0xc8, 0x6f, 0xd0, 0x63, 0x88, 0x4b, + 0x64, 0xcd, 0xc8, 0x92, 0xae, 0x3e, 0xa7, 0x2c, 0x67, 0x24, 0x78, 0xa1, + 0xc2, 0xc4, 0xc1, 0x59, 0x81, 0xc1, 0x44, 0x3d, 0x71, 0xa3, 0x27, 0xb7, + 0x28, 0x8e, 0x64, 0x61, 0x6b, 0x6d, 0xab, 0xd6, 0xd5, 0x4c, 0x6f, 0x72, + 0x7b, 0x7f, 0x96, 0xd1, 0x6f, 0x70, 0x9c, 0x59, 0xae, 0x97, 0x49, 0x82, + 0x7d, 0xc0, 0x85, 0xbb, 0x5d, 0x5c, 0x54, 0xb9, 0xa6, 0xb5, 0x82, 0x59, + 0x6a, 0xc5, 0xbf, 0x5d, 0x49, 0x31, 0xa2, 0x7e, 0x76, 0x47, 0xbd, 0x7f, + 0x3f, 0x9c, 0x68, 0xa5, 0x7c, 0xa4, 0x7d, 0x96, 0xad, 0xc5, 0x7b, 0x79, + 0x89, 0xb0, 0xa5, 0x59, 0x87, 0x44, 0xa3, 0x49, 0xbf, 0x9c, 0x5e, 0x69, + 0x64, 0xaf, 0x5b, 0x6e, 0x54, 0x50, 0xb2, 0x37, 0xcf, 0x38, 0x53, 0x3d, + 0xd5, 0x39, 0xa2, 0xbe, 0x32, 0x4f, 0xc0, 0xa2, 0xc1, 0xc9, 0x58, 0x64, + 0x50, 0xd3, 0x52, 0xc0, 0x53, 0x47, 0x4b, 0xb8, 0x44, 0x55, 0x86, 0xae, + 0x3a, 0x93, 0x60, 0xa0, 0x78, 0xb6, 0x93, 0x80, 0x40, 0x3c, 0xad, 0x6d, + 0x38, 0xd6, 0x77, 0x61, 0x44, 0xda, 0x4e, 0x9d, 0x87, 0xd1, 0x97, 0x58, + 0x6e, 0x37, 0xac, 0xa2, 0x7e, 0x3e, 0x39, 0xa0, 0xbb, 0x66, 0xae, 0xc0, + 0x64, 0xb1, 0xa1, 0x53, 0x2a, 0x62, 0x50, 0x4a, 0x78, 0x4f, 0x82, 0x3d, + 0x82, 0x65, 0x43, 0x34, 0xc6, 0x94, 0x78, 0x3b, 0x66, 0xae, 0x8d, 0xba, + 0x6e, 0xe2, 0xb2, 0xc4, 0x59, 0x9d, 0x48, 0x94, 0x83, 0x5a, 0x60, 0x71, + 0x86, 0x69, 0x9e, 0x60, 0xa8, 0x4d, 0x4e, 0x31, 0x4f, 0x99, 0xb9, 0x4c, + 0x88, 0xac, 0x5f, 0x7e, 0x76, 0xcf, 0x94, 0x39, 0x70, 0xcd, 0x97, 0xa9, + 0x79, 0xcd, 0x72, 0x41, 0x34, 0xb2, 0xd4, 0x6f, 0x54, 0x92, 0x7e, 0x45, + 0x55, 0x84, 0x71, 0x54, 0x95, 0xa2, 0xae, 0x39, 0xb4, 0x7c, 0x66, 0x61, + 0x76, 0xbe, 0x5a, 0x8e, 0xb1, 0x84, 0xa1, 0x52, 0x45, 0xb1, 0xb4, 0x4c, + 0x39, 0xb5, 0x84, 0x7f, 0x91, 0xc8, 0x4a, 0x4a, 0x60, 0x9e, 0x9c, 0x74, + 0x78, 0x87, 0xbf, 0x95, 0x7d, 0xb0, 0x63, 0x51, 0x35, 0xc1, 0x4d, 0x62, + 0x49, 0x99, 0x70, 0xc6, 0x53, 0xc8, 0x43, 0x8d, 0x49, 0x92, 0x99, 0xad, + 0x5e, 0x4e, 0x31, 0x88, 0xc7, 0xbd, 0xd3, 0x35, 0x82, 0xa4, 0x7b, 0xb0, + 0x7c, 0x5d, 0xbd, 0x74, 0xd5, 0xc7, 0x6c, 0x38, 0x44, 0x8d, 0x8a, 0xbb, + 0xb9, 0x64, 0x93, 0x69, 0x71, 0xcf, 0x9a, 0x8c, 0x7d, 0x35, 0x84, 0x80, + 0x5b, 0x56, 0x37, 0x79, 0xbd, 0x92, 0xcc, 0x8a, 0xce, 0xa4, 0x98, 0xcb, + 0x6b, 0x3c, 0xce, 0x97, 0x89, 0x30, 0xa9, 0xb1, 0xb4, 0x5c, 0x75, 0x65, + 0x9a, 0x6c, 0x55, 0xbc, 0x7e, 0xcb, 0x46, 0xad, 0x64, 0x4d, 0x96, 0x95, + 0x53, 0x9c, 0xcb, 0x76, 0x86, 0xd0, 0xb1, 0x87, 0x7f, 0xc2, 0x6d, 0x95, + 0x52, 0x3d, 0x76, 0x6f, 0x41, 0xc0, 0x9c, 0x6e, 0x80, 0x47, 0x9b, 0x9c, + 0x8c, 0x68, 0xcf, 0x98, 0xcf, 0x8a, 0xce, 0x97, 0x89, 0x5b, 0xc5, 0x6b, + 0xb4, 0xa1, 0x48, 0xa4, 0x75, 0x6c, 0x34, 0xab, 0x99, 0x62, 0x9e, 0x8d, + 0x55, 0x61, 0x91, 0x3c, 0xa3, 0x7a, 0x51, 0xa6, 0x52, 0xce, 0xbe, 0xc4, + 0x4e, 0x93, 0x4f, 0x9c, 0x47, 0xcf, 0x82, 0xcf, 0xbe, 0x51, 0x7f, 0x6e, + 0x9d, 0x4d, 0x5d, 0x8d, 0x5b, 0xa4, 0xa2, 0x68, 0x43, 0x7d, 0x63, 0x9a, + 0xce, 0x77, 0xbe, 0x3c, 0xb6, 0x7c, 0xb7, 0xa8, 0x48, 0x9e, 0xcf, 0xb7, + 0xcb, 0x55, 0x94, 0x3d, 0x98, 0xc8, 0x7b, 0x77, 0xaf, 0x6f, 0xbd, 0x73, + 
0x70, 0xb8, 0xc5, 0xaa, 0x9b, 0xce, 0xd0, 0x89, 0x95, 0xa3, 0x66, 0xcc, + 0x5f, 0xb0, 0x61, 0x72, 0x5d, 0x66, 0xa7, 0xa6, 0xb7, 0x68, 0x3c, 0x3f, + 0x32, 0x30, 0x74, 0x7d, 0x45, 0x5d, 0x6f, 0xc7, 0xb4, 0xa9, 0x76, 0xb6, + 0x62, 0x9f, 0x34, 0x56, 0xc7, 0x95, 0x6f, 0xcc, 0xab, 0x73, 0x61, 0x32, + 0x5d, 0xa7, 0xb4, 0x96, 0xb7, 0x88, 0xc0, 0x38, 0x80, 0xa7, 0xaf, 0x4b, + 0x60, 0x97, 0xcc, 0xaf, 0x49, 0x83, 0xd0, 0x3e, 0x65, 0x38, 0x6a, 0x60, + 0x65, 0x91, 0xcb, 0x66, 0x98, 0xc7, 0x50, 0x89, 0x35, 0x3f, 0x3a, 0xbc, + 0x9c, 0x95, 0xb7, 0x3a, 0x84, 0x72, 0xb2, 0xae, 0x4a, 0x5d, 0x38, 0x82, + 0xcf, 0x40, 0x3d, 0x99, 0x5d, 0x3d, 0x8f, 0x6c, 0x70, 0x99, 0x8b, 0xbc, + 0x9f, 0x32, 0xc5, 0x60, 0x70, 0x3c, 0x5d, 0x5b, 0x31, 0x7e, 0xa0, 0x57, + 0x4c, 0x54, 0xbe, 0xac, 0x65, 0x95, 0x61, 0x6c, 0x6a, 0xba, 0x79, 0xab, + 0x34, 0x73, 0xc2, 0x8a, 0x3e, 0x8f, 0x38, 0x91, 0x98, 0x47, 0x43, 0x54, + 0xae, 0x3f, 0x6b, 0xba, 0x92, 0x6d, 0x6b, 0x95, 0x5e, 0x91, 0x5c, 0x56, + 0x98, 0x5d, 0x7c, 0x7a, 0x9f, 0xd3, 0x5d, 0xa8, 0x82, 0x35, 0x3b, 0x43, + 0x99, 0x74, 0x54, 0x41, 0xae, 0xa5, 0x37, 0x3c, 0x85, 0x6a, 0x36, 0xa2, + 0x55, 0x74, 0x64, 0x49, 0x6c, 0xad, 0x65, 0x7c, 0xc1, 0x6d, 0x53, 0x66, + 0xc7, 0x8f, 0x47, 0xb0, 0x74, 0xae, 0x6d, 0x5b, 0x45, 0xa7, 0x33, 0x31, + 0x95, 0x45, 0x41, 0x4b, 0x57, 0x3f, 0x95, 0x4a, 0x88, 0x93, 0x37, 0x99, + 0x90, 0x8e, 0xba, 0x33, 0x94, 0x73, 0xba, 0xb2, 0xa7, 0x82, 0xc6, 0x7a, + 0x55, 0xcf, 0x31, 0xca, 0x89, 0x4b, 0xa8, 0x42, 0x86, 0xb8, 0x9a, 0x5c, + 0x45, 0x80, 0x31, 0xa8, 0x83, 0xa1, 0x58, 0x50, 0xbe, 0x5e, 0x93, 0x5e, + 0x6f, 0x9b, 0xa2, 0x92, 0x77, 0x50, 0x8b, 0x35, 0x8f, 0x76, 0x83, 0xcb, + 0xa0, 0x54, 0x7b, 0xad, 0xb0, 0x45, 0x40, 0x9d, 0x4d, 0x98, 0x42, 0xc6, + 0x9a, 0x96, 0x5f, 0x3d, 0xd2, 0x43, 0x8a, 0x48, 0x3d, 0xb7, 0x67, 0x42, + 0x50, 0x36, 0x46, 0xb5, 0x64, 0xc8, 0xd4, 0x38, 0x5d, 0x73, 0xb2, 0x87, + 0x54, 0x5b, 0xb8, 0x4b, 0x54, 0x52, 0x3b, 0xad, 0xc7, 0xb0, 0x93, 0xa5, + 0xab, 0x8d, 0xb8, 0x32, 0x5a, 0x38, 0x5a, 0x8e, 0x41, 0x97, 0x8c, 0x42, + 0x3a, 0x7c, 0x78, 0x49, 0xbf, 0xca, 0x8c, 0x8d, 0x66, 0xa8, 0x30, 0x7f, + 0x55, 0xcd, 0x35, 0x3f, 0xa4, 0x78, 0x46, 0xd5, 0xbe, 0x70, 0x62, 0x6d, + 0x31, 0xb9, 0x7c, 0xa2, 0x7e, 0x65, 0x52, 0x9e, 0x4a, 0xbd, 0x62, 0x55, + 0x40, 0xa0, 0x3a, 0x9d, 0x6d, 0xa3, 0x97, 0xb3, 0xaf, 0x60, 0x6e, 0xc5, + 0x8e, 0x73, 0x51, 0x31, 0xa7, 0x4f, 0x73, 0xa6, 0x51, 0x40, 0x80, 0xa2, + 0x62, 0xb3, 0x53, 0x93, 0xa2, 0x6b, 0x6a, 0x7d, 0xc4, 0x59, 0x60, 0x35, + 0x51, 0x56, 0xa8, 0x79, 0x9e, 0x85, 0x30, 0x6e, 0x3b, 0x87, 0xa4, 0x44, + 0x9f, 0x5c, 0x67, 0xac, 0x8f, 0x7a, 0x40, 0x42, 0x6a, 0x57, 0xa3, 0x62, + 0x49, 0x62, 0x80, 0x3d, 0xaa, 0x48, 0x84, 0xd2, 0x59, 0xb9, 0x34, 0x72, + 0xc7, 0xce, 0x9c, 0xa6, 0x8b, 0x51, 0xab, 0x39, 0x49, 0x9f, 0xb3, 0x4b, + 0xb3, 0x9d, 0x3b, 0x55, 0x38, 0x61, 0x82, 0x3b, 0x42, 0xa9, 0x35, 0xd4, + 0x32, 0xd0, 0x41, 0x51, 0xd0, 0x4e, 0x93, 0x8e, 0x63, 0x5e, 0x48, 0x7d, + 0x60, 0xba, 0xb8, 0xa5, 0x35, 0x71, 0x5d, 0x64, 0x44, 0xc5, 0x58, 0x66, + 0xaf, 0x7d, 0x9c, 0x81, 0x3a, 0x97, 0x5f, 0x48, 0x3b, 0x8d, 0x53, 0x98, + 0xa7, 0x55, 0x7f, 0x40, 0xa1, 0xb7, 0x45, 0x80, 0x62, 0x9e, 0xad, 0x70, + 0x86, 0xbf, 0x73, 0x68, 0x5e, 0x41, 0x8b, 0x86, 0x71, 0xc7, 0x6d, 0xc4, + 0xc7, 0x55, 0x47, 0x9b, 0x75, 0x92, 0xb0, 0x8a, 0xd0, 0xaa, 0x49, 0x67, + 0x8f, 0x89, 0xc5, 0x85, 0x97, 0x99, 0xb1, 0x8b, 0x9b, 0x5f, 0x9c, 0x9f, + 0x34, 0x73, 0x73, 0x64, 0x53, 0x6f, 0x34, 0x5d, 0xb2, 0x71, 0x5c, 0x55, + 0x66, 0x4d, 0x6c, 0x48, 0x77, 0xa6, 0xc0, 0xbc, 0x57, 0x87, 0xb5, 0x5a, + 0xa8, 0x36, 0xba, 0x36, 0x88, 0xc9, 0x45, 0x90, 0x54, 0xc6, 0x6b, 0x66, + 
0xbd, 0x82, 0x44, 0x69, 0x6c, 0xdc, 0xa3, 0xb1, 0x62, 0x67, 0x7f, 0x4f, + 0x4b, 0xaa, 0xbb, 0x77, 0xb2, 0xb4, 0x7e, 0x64, 0x65, 0xcb, 0xd8, 0x56, + 0xbc, 0x4b, 0x8d, 0xb8, 0xc2, 0x86, 0xd6, 0xbd, 0x59, 0xab, 0x60, 0x62, + 0x3a, 0x37, 0x84, 0x72, 0xc3, 0x5e, 0x2f, 0x3f, 0x6c, 0xd4, 0xa2, 0x92, + 0xb8, 0x37, 0x92, 0x70, 0x53, 0xac, 0x63, 0x62, 0xaa, 0xca, 0x66, 0x8b, + 0x27, 0xbd, 0x7d, 0x6a, 0x96, 0x3f, 0x71, 0xc2, 0x46, 0x45, 0x53, 0x93, + 0x84, 0x9b, 0x66, 0xd6, 0x5a, 0xc8, 0x94, 0x88, 0xb2, 0x70, 0x92, 0x53, + 0xb4, 0x70, 0xc1, 0x76, 0xc5, 0x88, 0xad, 0xb6, 0x9f, 0xd9, 0x9c, 0x79, + 0x7a, 0x88, 0x4d, 0x50, 0xc3, 0xa7, 0xd7, 0xe7, 0x6f, 0x56, 0x71, 0x68, + 0x9c, 0x7d, 0xab, 0xae, 0xa0, 0x2a, 0x2a, 0x6a, 0x81, 0x6c, 0xb2, 0x55, + 0xa2, 0x66, 0x52, 0x57, 0xcf, 0xd1, 0xc0, 0x5e, 0x34, 0x79, 0xab, 0x61, + 0xc8, 0x4e, 0xa2, 0xa7, 0x9b, 0xb7, 0xa3, 0xa2, 0x8c, 0x83, 0xa9, 0x3c, + 0x46, 0x5e, 0x88, 0x8a, 0x82, 0x95, 0x85, 0xb1, 0xb2, 0x90, 0x9c, 0x86, + 0x65, 0x58, 0x73, 0x3f, 0x52, 0x9c, 0x46, 0x61, 0xc3, 0xb3, 0xd2, 0xb8, + 0x55, 0xb5, 0xd6, 0xa9, 0x70, 0x72, 0x47, 0x61, 0x93, 0xbf, 0x77, 0xaf, + 0x88, 0x34, 0xae, 0xb6, 0xca, 0x8f, 0xce, 0xcb, 0xa5, 0x9a, 0x56, 0x84, + 0x6f, 0x3b, 0x9f, 0x41, 0x83, 0x3a, 0x45, 0x70, 0x47, 0x57, 0x63, 0xaa, + 0x37, 0x70, 0xb5, 0xa2, 0x6b, 0xc8, 0x87, 0xa2, 0xb4, 0x91, 0x6d, 0x8e, + 0xa5, 0x74, 0x44, 0xa5, 0x55, 0xab, 0x52, 0x3b, 0x7d, 0x94, 0x64, 0x92, + 0x88, 0xca, 0x6b, 0x87, 0x8a, 0x9a, 0x64, 0x5d, 0xd3, 0x96, 0xc8, 0x7f, + 0x43, 0x4d, 0x8a, 0x99, 0xa8, 0x85, 0xaf, 0x35, 0x7f, 0xd9, 0xa7, 0xbe, + 0x70, 0xca, 0xcf, 0x31, 0x75, 0x7b, 0x8e, 0x44, 0xda, 0x69, 0x4c, 0x94, + 0x98, 0x91, 0x99, 0x65, 0x93, 0x92, 0xb6, 0x31, 0x5d, 0x5f, 0x78, 0x3b, + 0x64, 0x6e, 0xbd, 0x99, 0x68, 0xd2, 0xbf, 0x5b, 0x7d, 0x3b, 0x64, 0xa5, + 0x59, 0x44, 0x95, 0x73, 0xda, 0x46, 0xcd, 0xd5, 0x76, 0xa1, 0x94, 0xc1, + 0x4a, 0x53, 0xc2, 0xa6, 0x3c, 0x48, 0x41, 0x44, 0x9f, 0x81, 0x97, 0x32, + 0x70, 0x4f, 0x4e, 0x2d, 0x41, 0xcf, 0x6b, 0x43, 0x54, 0x43, 0x5a, 0xc2, + 0x96, 0x68, 0xb8, 0x7c, 0x81, 0x90, 0x71, 0x45, 0x86, 0xab, 0x95, 0x82, + 0xb4, 0x52, 0x5e, 0x87, 0xaf, 0xc0, 0x5d, 0x74, 0x6d, 0xba, 0xc6, 0xb9, + 0x6a, 0x9f, 0xaa, 0x4a, 0x3b, 0x7a, 0xb9, 0xa7, 0x84, 0x4e, 0x87, 0x4d, + 0x63, 0xab, 0x4d, 0xb8, 0xd2, 0x67, 0x92, 0x32, 0x51, 0xb0, 0x8e, 0x49, + 0x51, 0x6a, 0xb2, 0x73, 0xab, 0x45, 0x6b, 0x59, 0x75, 0x53, 0x8a, 0x35, + 0x53, 0x3b, 0x94, 0x37, 0xb2, 0x71, 0xb3, 0xc0, 0x5b, 0x4e, 0xc6, 0xa3, + 0x59, 0x86, 0xc7, 0x7d, 0x97, 0x3f, 0xbd, 0x34, 0xcd, 0xcb, 0x91, 0x56, + 0x4e, 0x50, 0xca, 0x9c, 0x4c, 0xbe, 0x90, 0x3b, 0xa6, 0x80, 0xbe, 0xb8, + 0xc0, 0xc4, 0xaa, 0x9b, 0x5f, 0xb6, 0x3e, 0x61, 0xbd, 0xb3, 0x44, 0xd2, + 0x86, 0x3d, 0x61, 0x34, 0xa6, 0x38, 0x3b, 0x58, 0x5d, 0x81, 0xce, 0xbe, + 0x94, 0x41, 0x7d, 0xa8, 0x37, 0x53, 0x8d, 0x2b, 0x47, 0x5d, 0x29, 0x51, + 0x96, 0x48, 0x90, 0x7c, 0xaf, 0x3a, 0x41, 0xc0, 0x49, 0xc0, 0x44, 0xb1, + 0xc6, 0x32, 0xd3, 0xa6, 0x68, 0x8a, 0xb3, 0x66, 0x8f, 0x30, 0xaf, 0x74, + 0x64, 0xbf, 0x8e, 0x34, 0x6e, 0xb1, 0xce, 0xa8, 0x35, 0x61, 0x5b, 0x53, + 0x60, 0x79, 0x6d, 0x86, 0xa4, 0x74, 0xcf, 0x67, 0x5d, 0x3d, 0xbc, 0xb4, + 0x5c, 0xdd, 0x63, 0x8b, 0xa9, 0xc0, 0x65, 0x48, 0x9a, 0x5c, 0xc4, 0x54, + 0xc1, 0xb8, 0xa0, 0x57, 0x8d, 0x9b, 0xe4, 0x60, 0xcf, 0xad, 0x9e, 0x7b, + 0x59, 0x48, 0x31, 0x92, 0xa6, 0x78, 0x74, 0x84, 0xb8, 0x6b, 0x80, 0x7e, + 0x9e, 0x6c, 0x7a, 0x61, 0xb3, 0x9b, 0xab, 0xcb, 0x7e, 0x47, 0x9b, 0x44, + 0x76, 0x60, 0xa6, 0xa0, 0x39, 0xcd, 0xae, 0x4c, 0xa3, 0xc2, 0xbe, 0x8b, + 0xb0, 0xbe, 0xb1, 0x80, 0xa9, 0xb2, 0x7b, 0x85, 0xd6, 0xa1, 0xd3, 0x61, + 
0xa8, 0x44, 0x7f, 0xa1, 0x60, 0xb9, 0x40, 0x3c, 0x4e, 0x45, 0xcc, 0x47, + 0x39, 0x54, 0x41, 0xa5, 0x63, 0xd1, 0x71, 0x87, 0x7b, 0xcb, 0x5f, 0xa0, + 0x9a, 0x27, 0x25, 0x97, 0x97, 0x33, 0xaf, 0xcb, 0x45, 0x71, 0x97, 0xa9, + 0x93, 0x5c, 0x62, 0x7b, 0x92, 0x40, 0xc6, 0x82, 0x97, 0xd5, 0xc6, 0x69, + 0x6c, 0x8b, 0xb9, 0x9a, 0xc0, 0xa4, 0x60, 0xb1, 0x3d, 0x3e, 0x46, 0x4f, + 0x7e, 0x7e, 0x7e, 0xc5, 0xba, 0x45, 0x3d, 0xa2, 0xb7, 0x90, 0x56, 0x54, + 0x4e, 0xae, 0xc7, 0x57, 0xa6, 0xcb, 0x69, 0xbd, 0xbd, 0x44, 0x75, 0xcb, + 0x94, 0x31, 0xa0, 0x60, 0x66, 0x7f, 0xa1, 0xa1, 0x5a, 0x60, 0x43, 0x8b, + 0x9e, 0x6c, 0x95, 0x47, 0xa3, 0xb7, 0x35, 0x62, 0x50, 0x9e, 0x80, 0x49, + 0x49, 0x8f, 0xd6, 0x7d, 0x5a, 0x77, 0xc1, 0xce, 0x4b, 0x51, 0x5f, 0x3d, + 0xc9, 0x6a, 0x49, 0x8b, 0x38, 0x90, 0x4c, 0x6b, 0x4e, 0x6e, 0xb6, 0x2e, + 0x76, 0x5d, 0x6c, 0xb5, 0xa9, 0x64, 0x9f, 0xb4, 0x9a, 0x71, 0x96, 0xa8, + 0x32, 0x80, 0xaa, 0xcb, 0xbf, 0x7b, 0xd3, 0x57, 0x9b, 0xd5, 0x7f, 0x8a, + 0xc4, 0xc6, 0x50, 0x30, 0x4f, 0x5f, 0x87, 0x99, 0xaf, 0x5a, 0xcc, 0x80, + 0x33, 0x50, 0x77, 0x7a, 0xc6, 0x98, 0xbe, 0x55, 0x92, 0x91, 0x54, 0x78, + 0xc5, 0x85, 0xa4, 0x53, 0x35, 0xcd, 0x7b, 0x39, 0xc4, 0x53, 0xb7, 0x70, + 0x91, 0xc6, 0x67, 0xb3, 0x68, 0x6b, 0x7a, 0x68, 0xb0, 0x87, 0xa4, 0x70, + 0x70, 0x78, 0xa3, 0x6a, 0x7c, 0x4f, 0x61, 0x8f, 0xd0, 0x3a, 0x7b, 0x59, + 0x67, 0xbf, 0xa3, 0x6c, 0xb9, 0xa1, 0x5a, 0xbb, 0x8a, 0x57, 0x47, 0x8e, + 0x58, 0x6d, 0x76, 0xac, 0xc3, 0xae, 0x3f, 0x84, 0x62, 0xc2, 0x95, 0x36, + 0xad, 0xa0, 0x93, 0x57, 0x92, 0x73, 0x46, 0x4f, 0xb5, 0x82, 0xd2, 0x79, + 0x33, 0x5d, 0x85, 0x3c, 0x52, 0x9b, 0x77, 0x97, 0x8d, 0x72, 0x6c, 0xa5, + 0xae, 0x80, 0x91, 0x55, 0xb0, 0xc7, 0x96, 0xc4, 0x40, 0x61, 0x8a, 0x33, + 0x58, 0xa9, 0x86, 0x7e, 0x58, 0xc6, 0xaf, 0xc7, 0x51, 0x59, 0x86, 0x48, + 0x94, 0x9c, 0xa1, 0x6a, 0x64, 0x58, 0x39, 0x70, 0x6d, 0x66, 0xb5, 0x4b, + 0x6b, 0xc2, 0xaf, 0x47, 0xc7, 0x67, 0x67, 0x90, 0x66, 0x77, 0x58, 0x9f, + 0xd1, 0x94, 0xb3, 0xc3, 0x54, 0xc8, 0x3c, 0x81, 0x87, 0x31, 0x91, 0x53, + 0x93, 0x30, 0x29, 0x47, 0xa7, 0x74, 0x6f, 0x3d, 0x64, 0xb5, 0x8c, 0xb8, + 0x6f, 0x36, 0x5a, 0x7b, 0x84, 0x7e, 0x61, 0x40, 0x5a, 0xc8, 0xb1, 0xb2, + 0x63, 0xc2, 0x6c, 0xa8, 0xc3, 0x43, 0x82, 0x56, 0xb0, 0x3c, 0x5a, 0xce, + 0xa2, 0xd2, 0x5e, 0x52, 0x6b, 0xc7, 0x9f, 0x7d, 0xc8, 0x2e, 0x54, 0x80, + 0x47, 0xc0, 0x65, 0x4b, 0x49, 0x8c, 0x50, 0x89, 0xaa, 0x5e, 0x99, 0xd2, + 0x5a, 0x74, 0x94, 0x44, 0x40, 0x38, 0x42, 0x8d, 0x7d, 0xc9, 0x93, 0x68, + 0x66, 0x82, 0x4a, 0xc0, 0x98, 0xb2, 0x52, 0x40, 0x72, 0xcb, 0x7e, 0x78, + 0x9e, 0x39, 0xcc, 0xa9, 0x84, 0x2f, 0xb4, 0x60, 0x72, 0xac, 0x37, 0xbf, + 0x87, 0x61, 0x52, 0xc2, 0x8b, 0x46, 0xc1, 0x3b, 0xaf, 0x83, 0x93, 0x78, + 0x50, 0x49, 0x70, 0x48, 0xc4, 0xd1, 0x39, 0x8f, 0xbe, 0xaa, 0x99, 0x9a, + 0x6c, 0xb5, 0xb7, 0x71, 0x4e, 0x92, 0x9d, 0x3b, 0xbd, 0x50, 0xd9, 0xcc, + 0xcb, 0xbe, 0x9b, 0x42, 0x6b, 0xbb, 0x56, 0xc2, 0xaa, 0xce, 0x95, 0xbe, + 0xac, 0x80, 0x95, 0x47, 0x4c, 0x68, 0x70, 0xc4, 0x65, 0xa3, 0x64, 0xc3, + 0x7e, 0x6b, 0x8e, 0xb8, 0x60, 0x8b, 0x45, 0xc6, 0x5d, 0xa9, 0x96, 0x75, + 0xb6, 0x91, 0xc3, 0xac, 0x34, 0xaa, 0x68, 0x5c, 0xb9, 0x43, 0x9c, 0x98, + 0x9c, 0x94, 0x52, 0xa7, 0xc0, 0x54, 0x5f, 0xd1, 0x9d, 0xae, 0xc1, 0x84, + 0xb2, 0x34, 0xd7, 0x4a, 0x6c, 0x39, 0xc6, 0xa4, 0xaa, 0x6f, 0x90, 0x85, + 0xbf, 0xb6, 0x48, 0xcf, 0x47, 0x9a, 0x91, 0x54, 0xad, 0x6b, 0xa9, 0x8b, + 0x8f, 0x40, 0x61, 0x73, 0x97, 0x60, 0x63, 0xb5, 0xa8, 0x5c, 0x3f, 0xd5, + 0x52, 0x77, 0xac, 0xb4, 0xaa, 0xcd, 0xb4, 0x30, 0x61, 0x6b, 0xae, 0xa4, + 0x8b, 0xbe, 0x5d, 0x6d, 0xab, 0x46, 0xa1, 0x78, 0x49, 0x86, 0xb2, 0x4e, + 
0xc6, 0xa9, 0xb0, 0x6f, 0x7d, 0xa3, 0xc3, 0x3a, 0xc1, 0x98, 0x8f, 0x7f, + 0x8e, 0x4a, 0x63, 0x51, 0x43, 0x39, 0xa4, 0xa6, 0xd1, 0x67, 0xd4, 0x92, + 0x4b, 0x9a, 0x81, 0x8c, 0xb3, 0x6f, 0x53, 0xcd, 0x88, 0xce, 0xc0, 0x72, + 0xb5, 0xb9, 0x9a, 0x94, 0x8c, 0x52, 0x7d, 0x42, 0x36, 0x53, 0x88, 0x4f, + 0x68, 0x75, 0xca, 0xa9, 0x47, 0x6a, 0xb8, 0xb1, 0x9e, 0x72, 0x51, 0x42, + 0x3d, 0x77, 0xc0, 0x6e, 0x3b, 0x47, 0x53, 0x41, 0x68, 0xa1, 0x67, 0x83, + 0x71, 0x9e, 0x9b, 0x84, 0x64, 0x96, 0x5c, 0xd3, 0x67, 0x48, 0x4d, 0x83, + 0x8a, 0x8b, 0xb4, 0x82, 0xc4, 0xb9, 0xb6, 0x9a, 0x44, 0x92, 0x9d, 0x86, + 0xa2, 0xa1, 0xbe, 0x95, 0xad, 0xc9, 0x67, 0x54, 0x62, 0x9d, 0x2f, 0x68, + 0x5c, 0x96, 0xa8, 0xb7, 0x9e, 0x7a, 0x7e, 0xd4, 0xb4, 0x92, 0x7b, 0xc1, + 0x9d, 0xcf, 0x61, 0x89, 0x58, 0x99, 0x7f, 0x8b, 0x9e, 0x65, 0x30, 0x70, + 0x6f, 0xb1, 0xd0, 0xb6, 0x9b, 0xbd, 0x5c, 0xab, 0x9c, 0xb4, 0xa0, 0x63, + 0x49, 0x52, 0xa4, 0x56, 0x6a, 0x6c, 0x39, 0x2a, 0x3c, 0x64, 0x71, 0xac, + 0x69, 0x5e, 0x45, 0x75, 0x38, 0x9b, 0x9d, 0xa5, 0x39, 0x6e, 0x31, 0xc6, + 0x52, 0x8a, 0x30, 0x66, 0x99, 0xc6, 0x97, 0x98, 0xc9, 0x89, 0x92, 0x58, + 0x7f, 0x9d, 0xc4, 0x6c, 0x61, 0xae, 0x34, 0x5a, 0x89, 0x99, 0xc8, 0x67, + 0xbe, 0x77, 0xce, 0x8a, 0x38, 0x80, 0x6a, 0x49, 0x73, 0x8e, 0xd4, 0x55, + 0x68, 0x62, 0x98, 0x93, 0xc2, 0x94, 0x7c, 0x77, 0x85, 0x71, 0x73, 0xa6, + 0x62, 0xa7, 0x81, 0x6b, 0xab, 0x64, 0xc4, 0xcd, 0xc9, 0x9e, 0x51, 0x89, + 0x41, 0x93, 0xc9, 0xbd, 0x6e, 0x87, 0xb9, 0x60, 0x87, 0x55, 0x87, 0x3f, + 0x7f, 0xd5, 0x97, 0x48, 0xa6, 0x63, 0xba, 0x97, 0x91, 0x93, 0x8e, 0xbc, + 0x46, 0x48, 0x71, 0x48, 0x45, 0x34, 0x4b, 0xc6, 0xc0, 0x95, 0x8a, 0x62, + 0x78, 0x5c, 0x55, 0x4b, 0xcd, 0x82, 0xb2, 0x80, 0x5e, 0x6f, 0xae, 0xb8, + 0xa3, 0xb0, 0xa8, 0xa4, 0x54, 0x97, 0x43, 0xa5, 0xbf, 0x78, 0x62, 0x61, + 0xab, 0x30, 0x8b, 0xca, 0x97, 0xbc, 0x64, 0xd2, 0x30, 0xa0, 0x46, 0xc0, + 0x96, 0xce, 0x77, 0x8d, 0xc6, 0x33, 0x52, 0xcd, 0x66, 0x58, 0x3a, 0x7d, + 0xcb, 0xc5, 0x4f, 0xc5, 0x9c, 0xcf, 0x65, 0xbc, 0x7d, 0x3c, 0x49, 0x52, + 0x75, 0x53, 0x66, 0x38, 0x6f, 0x61, 0xce, 0x6e, 0x46, 0x52, 0xb3, 0x61, + 0xc5, 0xad, 0x8a, 0xc4, 0x4b, 0xab, 0xae, 0x6c, 0xbe, 0x31, 0x58, 0x87, + 0xbf, 0x76, 0x49, 0x9e, 0x3d, 0xae, 0x30, 0xbd, 0x7c, 0x5a, 0xb5, 0x9b, + 0x80, 0x4e, 0xba, 0x43, 0x54, 0xc8, 0xab, 0x87, 0x8a, 0x6e, 0x87, 0xae, + 0x4f, 0x5e, 0x8f, 0xd2, 0x6c, 0x7d, 0xdb, 0x98, 0x4a, 0x4a, 0xd4, 0xa8, + 0xbc, 0x8d, 0x74, 0xd8, 0x42, 0xc4, 0x55, 0xca, 0x4b, 0x42, 0x3b, 0x83, + 0x7c, 0xab, 0x9d, 0xcd, 0xc3, 0xce, 0x70, 0x7d, 0x5d, 0x5d, 0xac, 0xc3, + 0xa7, 0x67, 0x61, 0x6e, 0x78, 0x77, 0x49, 0x81, 0x46, 0x7e, 0xb9, 0x73, + 0xa8, 0xa9, 0xb1, 0xa7, 0xc9, 0xa8, 0x42, 0x64, 0x54, 0xdb, 0xaf, 0x79, + 0x79, 0x7d, 0x88, 0x8d, 0x4d, 0xb4, 0x84, 0x41, 0x59, 0xa5, 0x86, 0x4a, + 0x7e, 0x51, 0x74, 0xc9, 0xc0, 0x3a, 0x3f, 0xcc, 0x9d, 0x45, 0x9f, 0x88, + 0x52, 0x83, 0xc2, 0x4f, 0xb6, 0xa6, 0x4b, 0xc4, 0x5a, 0xa1, 0xc4, 0x92, + 0xc2, 0xb8, 0x92, 0x2f, 0xca, 0xa3, 0x5c, 0x98, 0x5b, 0x8b, 0xc2, 0x77, + 0x53, 0x68, 0x8a, 0x72, 0x67, 0xd2, 0xa3, 0x66, 0x37, 0x6e, 0x75, 0xce, + 0x54, 0x9c, 0x71, 0x2a, 0xa2, 0x5c, 0x74, 0x50, 0x74, 0xb3, 0x96, 0x5a, + 0x3f, 0xcd, 0x65, 0x52, 0x43, 0xda, 0xbf, 0x52, 0x92, 0xc1, 0x63, 0x54, + 0x89, 0xd1, 0xd6, 0x30, 0xa5, 0x82, 0x32, 0x9b, 0x42, 0xa3, 0x31, 0xa1, + 0x5f, 0x85, 0x39, 0xc5, 0xaa, 0x38, 0x82, 0x6f, 0x4e, 0x3b, 0x9e, 0x2b, + 0x7f, 0x85, 0x49, 0x53, 0x98, 0x59, 0x5a, 0xa2, 0x40, 0xb2, 0x44, 0x94, + 0x67, 0x72, 0xbe, 0xa8, 0x38, 0xc5, 0x45, 0x8f, 0x6e, 0x53, 0xab, 0x3e, + 0x33, 0x8d, 0xd6, 0x9b, 0x24, 0xa8, 0xc5, 0x72, 0x9d, 0xb5, 0x77, 0x53, + 
0x94, 0x74, 0xa6, 0xc3, 0x40, 0x57, 0x48, 0xd1, 0x7e, 0x98, 0x6e, 0x62, + 0x48, 0xd0, 0xac, 0x2e, 0x78, 0x33, 0xbf, 0x6c, 0x74, 0x2f, 0x79, 0x72, + 0x60, 0xa0, 0x71, 0xb8, 0x90, 0xbd, 0xc0, 0xc8, 0xbf, 0x83, 0xc6, 0xaa, + 0x6f, 0xcf, 0x25, 0x65, 0x5f, 0xa1, 0x30, 0xc0, 0xdf, 0x5f, 0xa6, 0xca, + 0x6b, 0xc7, 0x47, 0x9f, 0xad, 0x8d, 0xd4, 0x9f, 0x53, 0xc9, 0xae, 0x58, + 0x8a, 0x9d, 0x74, 0x84, 0x98, 0x50, 0x35, 0xd3, 0x88, 0xa2, 0x7a, 0x9a, + 0x49, 0x53, 0x5c, 0x3a, 0xa7, 0x9b, 0x39, 0xd8, 0xb0, 0xbc, 0xc7, 0x65, + 0x51, 0x9f, 0x8a, 0xb2, 0x74, 0x4d, 0x30, 0xc3, 0xb2, 0xcc, 0xa6, 0x48, + 0x5f, 0x86, 0xa6, 0x9c, 0x71, 0x61, 0x43, 0x57, 0xac, 0xb1, 0x7e, 0x73, + 0x50, 0x36, 0x99, 0x8a, 0x7a, 0x7c, 0x34, 0xc0, 0xa5, 0x6f, 0x3e, 0xcc, + 0x41, 0x29, 0x65, 0xb7, 0x5b, 0x6b, 0x28, 0xd6, 0x59, 0x89, 0x89, 0x5f, + 0xc3, 0xa0, 0xa7, 0x64, 0x66, 0x63, 0x37, 0xb9, 0xd9, 0x9a, 0x35, 0x8e, + 0x85, 0x44, 0x94, 0x45, 0x49, 0xd6, 0x55, 0x6d, 0x64, 0xca, 0x67, 0xc4, + 0xb6, 0x69, 0x99, 0x4a, 0x8a, 0x94, 0x3e, 0xa9, 0x25, 0x6a, 0x7e, 0x89, + 0xa2, 0x4d, 0xbc, 0x8a, 0x72, 0xb2, 0xb7, 0x9e, 0x7a, 0x47, 0xb5, 0xa1, + 0x6d, 0x79, 0x63, 0x98, 0x74, 0xd5, 0x50, 0x48, 0x9d, 0x30, 0xb8, 0x57, + 0x53, 0x9a, 0x9e, 0x99, 0xcc, 0x65, 0x66, 0x37, 0x4f, 0x68, 0x67, 0x7a, + 0xb1, 0x39, 0x67, 0x9f, 0x48, 0xae, 0xbf, 0x6b, 0xc8, 0x96, 0xb3, 0xac, + 0x6f, 0xa3, 0x88, 0xc9, 0x39, 0x73, 0x47, 0x98, 0xb5, 0xbb, 0xc6, 0x65, + 0x4c, 0x8e, 0xae, 0xd3, 0x44, 0x35, 0x66, 0x9e, 0x91, 0xba, 0xaa, 0x88, + 0x65, 0x58, 0x4e, 0x6e, 0xd5, 0x85, 0x94, 0x23, 0x3c, 0xbb, 0x75, 0x85, + 0xc4, 0xa4, 0x35, 0x93, 0x5a, 0x77, 0x61, 0xc2, 0x9c, 0xb4, 0xd9, 0x36, + 0xae, 0xb4, 0x48, 0x46, 0x4a, 0xc5, 0xc7, 0x73, 0x91, 0x3b, 0x98, 0x69, + 0x5b, 0x99, 0xc0, 0x7f, 0xc6, 0xaa, 0xb0, 0x72, 0x6f, 0xc2, 0x9e, 0x65, + 0x89, 0x72, 0x4b, 0x4a, 0x2f, 0x84, 0xd2, 0xcf, 0x86, 0x4f, 0xce, 0x88, + 0xb5, 0x39, 0x3f, 0x4c, 0x5e, 0x8d, 0x49, 0xa9, 0xc5, 0x8e, 0xaa, 0x8f, + 0xa7, 0xa7, 0xab, 0xb5, 0x94, 0x45, 0xcb, 0x99, 0x90, 0xb1, 0x46, 0xc9, + 0xc7, 0xb1, 0x9b, 0x6d, 0x5b, 0xb1, 0x81, 0xc2, 0x71, 0xd7, 0xa5, 0xc2, + 0x53, 0x88, 0x56, 0x4a, 0x3f, 0x49, 0x5e, 0x84, 0x97, 0xc5, 0x5f, 0xd1, + 0xb8, 0x8b, 0x81, 0xce, 0xca, 0xae, 0x8e, 0x31, 0x85, 0x82, 0x50, 0x33, + 0xb2, 0xc7, 0xce, 0x6f, 0x77, 0x8e, 0x5e, 0xcf, 0x5d, 0x82, 0xa0, 0xcf, + 0x3b, 0x96, 0xc2, 0x84, 0xa0, 0x7f, 0x64, 0x37, 0x4b, 0xbb, 0x50, 0x95, + 0x6c, 0xa3, 0x7e, 0x88, 0x62, 0xcc, 0x61, 0xc1, 0x4d, 0x69, 0x7a, 0xc3, + 0x67, 0x58, 0x79, 0x57, 0x8c, 0x54, 0x37, 0x2b, 0x85, 0x67, 0x6e, 0x99, + 0x78, 0xb4, 0x7e, 0x6d, 0x3a, 0x5d, 0x77, 0xbc, 0x32, 0x89, 0xcd, 0xca, + 0x89, 0x86, 0xa2, 0x41, 0xb3, 0xa0, 0x7b, 0x34, 0xac, 0x2d, 0xb9, 0x94, + 0xac, 0x46, 0x59, 0x28, 0x67, 0x77, 0x33, 0xc1, 0x45, 0x78, 0x9b, 0x85, + 0x9b, 0x87, 0x7d, 0x60, 0x5e, 0xa5, 0x46, 0xa5, 0x8f, 0x30, 0xc0, 0x5e, + 0x4c, 0x78, 0xc3, 0xca, 0x58, 0x68, 0x5e, 0x32, 0xbf, 0x8b, 0x62, 0xb3, + 0x2e, 0xa0, 0xc0, 0xca, 0xa4, 0x5d, 0xc5, 0xca, 0x96, 0x87, 0x64, 0x92, + 0x6b, 0x33, 0x76, 0x8a, 0x6e, 0x57, 0x83, 0x9d, 0x69, 0x80, 0xad, 0x37, + 0xaf, 0xc7, 0xb8, 0x8b, 0x3b, 0xbd, 0x30, 0x58, 0xbb, 0x9b, 0x4b, 0x51, + 0xb9, 0x62, 0xc4, 0x7f, 0x43, 0x9d, 0x2b, 0x78, 0xa1, 0x6b, 0x6d, 0x8c, + 0xc1, 0x66, 0x80, 0x49, 0x8b, 0xbf, 0x48, 0xa2, 0xbc, 0x35, 0xb4, 0x41, + 0x5f, 0xa6, 0x47, 0x86, 0x7e, 0x40, 0x94, 0xa2, 0x84, 0xc7, 0xbf, 0x40, + 0x8f, 0xb2, 0x52, 0x94, 0x9b, 0x96, 0x95, 0xb9, 0x55, 0xd1, 0x81, 0xa1, + 0x3d, 0x9a, 0xab, 0xb9, 0x50, 0x76, 0xc3, 0x90, 0x46, 0x6a, 0xd3, 0x5d, + 0x43, 0x8c, 0x91, 0x83, 0x5f, 0x3d, 0xbd, 0xde, 0xe3, 0xc7, 0x80, 0xcb, + 
0x56, 0x5f, 0xa5, 0x72, 0xc0, 0xc5, 0x6a, 0x89, 0x59, 0x9e, 0x79, 0xc0, + 0x7d, 0xbe, 0x90, 0x86, 0x97, 0x74, 0x40, 0x3e, 0xf1, 0xbc, 0x29, 0x2d, + 0x60, 0x78, 0x4b, 0x71, 0x50, 0xa1, 0x75, 0xaa, 0x4b, 0x98, 0xab, 0x33, + 0xb6, 0x60, 0x71, 0x9e, 0x7f, 0x46, 0x59, 0x79, 0x5c, 0xb5, 0xb0, 0x2b, + 0x43, 0x62, 0x25, 0xd2, 0x5f, 0xc4, 0x7e, 0x7d, 0xc3, 0x4e, 0x8f, 0xcb, + 0xc7, 0x5f, 0x6a, 0x32, 0xad, 0xa5, 0xb7, 0x7e, 0x54, 0xb1, 0x83, 0x93, + 0x36, 0x8c, 0xa6, 0x80, 0x41, 0xbe, 0x79, 0x4a, 0x48, 0x5f, 0x80, 0x71, + 0xa9, 0xb4, 0x5e, 0xc0, 0x9b, 0x85, 0x32, 0x4f, 0xc8, 0x54, 0xc5, 0xb9, + 0x93, 0x51, 0xbe, 0xc7, 0x9d, 0x36, 0xaa, 0x61, 0x89, 0x51, 0x93, 0xc7, + 0x59, 0x49, 0xa6, 0xca, 0x62, 0xb8, 0x57, 0x52, 0x6a, 0x64, 0xa8, 0x53, + 0x4c, 0x9c, 0xbc, 0x3d, 0x94, 0x90, 0x50, 0x63, 0x92, 0x65, 0x98, 0xae, + 0x2f, 0x85, 0x77, 0x8c, 0x97, 0xbf, 0xb6, 0xd1, 0xbd, 0x66, 0x3e, 0xad, + 0xc4, 0xa8, 0x6c, 0xb5, 0xd0, 0xd4, 0xc8, 0x4e, 0xc3, 0x5f, 0x49, 0xc9, + 0x70, 0xd8, 0xa6, 0x50, 0xcc, 0x9f, 0x4b, 0x4c, 0x3a, 0x8e, 0xcf, 0xd1, + 0xc4, 0x3f, 0xae, 0x2f, 0x81, 0xba, 0xc7, 0x94, 0x46, 0x7d, 0x88, 0x53, + 0x62, 0x36, 0x6c, 0x4a, 0x5e, 0x8f, 0x54, 0xb6, 0x7b, 0x4e, 0xd2, 0xc6, + 0xae, 0x9f, 0xcf, 0xb5, 0xcb, 0x5a, 0x73, 0x42, 0x56, 0x8f, 0x9a, 0xd7, + 0x49, 0x8e, 0x40, 0x7d, 0xa6, 0xcc, 0xc4, 0x60, 0xb4, 0x9a, 0x97, 0x5a, + 0x35, 0xb5, 0xa2, 0xba, 0x7f, 0xa3, 0x3a, 0x9c, 0x6c, 0x37, 0xa3, 0x84, + 0x3e, 0x39, 0x6b, 0xcb, 0x96, 0x35, 0xba, 0x91, 0xc9, 0x34, 0x7b, 0xa1, + 0xb7, 0x9c, 0xba, 0x9e, 0x3e, 0x48, 0x8a, 0x82, 0x7d, 0xc5, 0x64, 0x82, + 0x6e, 0xa9, 0x5c, 0xa6, 0x7e, 0x5c, 0xad, 0xc2, 0x9b, 0xac, 0x74, 0x39, + 0x88, 0xaa, 0x39, 0x3b, 0xc4, 0xb1, 0x75, 0x68, 0x76, 0x57, 0x77, 0x87, + 0x37, 0xb5, 0x82, 0x51, 0x3d, 0x6e, 0x98, 0xa1, 0x38, 0x9b, 0xbc, 0x5c, + 0x6f, 0x5e, 0x4c, 0x82, 0x99, 0xbf, 0x81, 0xc0, 0x66, 0x7b, 0xa8, 0x7a, + 0x39, 0x96, 0x9f, 0x43, 0x7a, 0xaf, 0xba, 0x81, 0x5a, 0x83, 0xa1, 0x8c, + 0x88, 0x5b, 0x4e, 0x33, 0x64, 0xbf, 0xa8, 0x74, 0x91, 0xa6, 0xa3, 0x8b, + 0x6f, 0xb9, 0xc2, 0x65, 0xa1, 0xd2, 0x97, 0xc3, 0x75, 0x65, 0xcc, 0xb5, + 0x8d, 0xc5, 0xc9, 0x8c, 0x4a, 0x45, 0xa9, 0xcc, 0xb5, 0x3c, 0x7b, 0x56, + 0x52, 0x59, 0xae, 0x30, 0x89, 0xc9, 0x45, 0x53, 0x6c, 0x4b, 0x63, 0x7b, + 0x9b, 0x87, 0x67, 0xc7, 0x8b, 0xbe, 0xd2, 0x66, 0x75, 0x97, 0x65, 0xb3, + 0x6e, 0x71, 0x9f, 0x97, 0x5e, 0xab, 0x96, 0x63, 0x55, 0x40, 0xc8, 0xbf, + 0x45, 0x43, 0x44, 0x87, 0x34, 0x66, 0xa9, 0x8e, 0x89, 0x3c, 0x83, 0xa4, + 0xca, 0x43, 0x59, 0x99, 0x6d, 0xa3, 0xaf, 0xd1, 0xca, 0xb3, 0xb5, 0xa0, + 0x52, 0x48, 0xb8, 0xcc, 0x61, 0x4e, 0x76, 0x79, 0x41, 0x52, 0xc9, 0x2b, + 0xc6, 0x6a, 0x4c, 0x9a, 0x84, 0x47, 0xd3, 0x3c, 0x84, 0x9e, 0x46, 0xaf, + 0x5d, 0x99, 0xa3, 0x4d, 0xae, 0x42, 0xa9, 0xbb, 0x3d, 0x9c, 0xd4, 0x9b, + 0xc3, 0x77, 0xa3, 0xc0, 0xb0, 0xb9, 0x85, 0xc0, 0x51, 0x44, 0x36, 0x4f, + 0x4f, 0xb4, 0xac, 0x4b, 0x36, 0x8a, 0x45, 0x3b, 0xd5, 0x53, 0xab, 0x78, + 0x60, 0x6a, 0x3a, 0x95, 0x4f, 0x90, 0x30, 0x5f, 0x70, 0xaf, 0x5c, 0xc9, + 0x62, 0x5a, 0xb4, 0x9a, 0x48, 0xc0, 0x51, 0x8a, 0x4a, 0x8f, 0x31, 0xaf, + 0xb1, 0xb2, 0x69, 0xb2, 0xcb, 0x80, 0xc2, 0x75, 0x3f, 0xa7, 0x90, 0x60, + 0x48, 0x84, 0x6d, 0x6e, 0xa4, 0x3c, 0x36, 0xc6, 0xd5, 0xa5, 0x3f, 0xa5, + 0x48, 0x7c, 0x5a, 0x45, 0x67, 0x94, 0x40, 0x93, 0xa9, 0xa9, 0xbf, 0x3a, + 0x40, 0x48, 0x9a, 0x39, 0xb8, 0x7a, 0x9a, 0x5c, 0xc0, 0xae, 0x5b, 0x5e, + 0xb2, 0x8b, 0x6e, 0x46, 0xca, 0x76, 0x6b, 0x62, 0x8c, 0x68, 0xd1, 0x71, + 0x46, 0xa1, 0x3a, 0x3d, 0x72, 0x57, 0x5e, 0x9b, 0xcb, 0x91, 0xbd, 0x46, + 0x67, 0x34, 0x96, 0x8c, 0xa8, 0x6a, 0xc9, 0xbd, 0x9b, 0x9a, 0x93, 0x50, + 
0xb1, 0x61, 0x46, 0xac, 0x73, 0x93, 0x45, 0xa0, 0xae, 0xcb, 0x5b, 0xb4, + 0x59, 0x77, 0x6a, 0xcb, 0xaa, 0xc5, 0x9d, 0x93, 0xc5, 0x9b, 0xa0, 0x36, + 0x5f, 0x54, 0x8a, 0x7b, 0xc9, 0x6f, 0x7b, 0xb4, 0x7c, 0xd4, 0x88, 0xcd, + 0xb6, 0x67, 0x44, 0xbc, 0xca, 0x8a, 0x75, 0x37, 0x81, 0x53, 0xbc, 0x76, + 0x93, 0x47, 0xb3, 0xd1, 0xb7, 0x72, 0xca, 0xbd, 0x5f, 0xaa, 0x3e, 0x5b, + 0x68, 0xb6, 0x2c, 0xa8, 0x48, 0x60, 0xbd, 0xd0, 0x65, 0xac, 0x78, 0xce, + 0x67, 0x8c, 0x7b, 0xa0, 0x53, 0xc6, 0xcc, 0x99, 0x9a, 0x43, 0x45, 0x85, + 0x6c, 0x9f, 0x6b, 0xc2, 0x77, 0x4d, 0x52, 0x35, 0x52, 0x78, 0xb9, 0x7f, + 0xac, 0x88, 0xa3, 0x82, 0x55, 0xcb, 0x4b, 0x6a, 0xc9, 0x72, 0xd5, 0xcd, + 0x48, 0xc7, 0x9e, 0x3b, 0x8c, 0xd0, 0xbc, 0x9c, 0x9a, 0xa7, 0xbd, 0x5e, + 0x71, 0x5b, 0xbb, 0xb6, 0x32, 0x80, 0x90, 0xbb, 0x8a, 0xc3, 0xbf, 0xc0, + 0x93, 0x38, 0x62, 0xae, 0xc9, 0x45, 0x3e, 0xcd, 0x9f, 0x70, 0x55, 0x41, + 0xa0, 0x54, 0x5a, 0xc6, 0x93, 0x60, 0x50, 0xb7, 0xc4, 0x78, 0x60, 0x64, + 0x97, 0xcf, 0x37, 0xab, 0x51, 0x6c, 0x45, 0x4e, 0xc8, 0x3f, 0xca, 0x86, + 0xc6, 0xbc, 0xb4, 0x8a, 0x8e, 0x9b, 0x54, 0x72, 0xbc, 0x52, 0x6f, 0x61, + 0x42, 0x6d, 0x46, 0x83, 0x8c, 0xa9, 0x62, 0xbe, 0xae, 0x60, 0xbf, 0xd3, + 0x33, 0xaf, 0x99, 0xb5, 0xa2, 0xa7, 0x49, 0xb7, 0x66, 0x51, 0x78, 0x93, + 0x52, 0x4b, 0x9a, 0x40, 0x4c, 0x93, 0xca, 0xca, 0x30, 0x9b, 0x84, 0x5e, + 0x6c, 0xa1, 0x83, 0xa6, 0x64, 0xad, 0xa9, 0x4a, 0x47, 0x43, 0x9c, 0x51, + 0x3d, 0x2c, 0x97, 0x85, 0xaf, 0x5a, 0x49, 0xac, 0x6f, 0x4a, 0x40, 0xc5, + 0xaa, 0x48, 0xc9, 0x82, 0xd2, 0x91, 0xc0, 0x3c, 0x6a, 0x76, 0x4f, 0x89, + 0xc5, 0xaf, 0xab, 0x43, 0x42, 0x46, 0xbc, 0xbc, 0xce, 0xa4, 0x6a, 0x60, + 0x39, 0x3a, 0xb3, 0xd6, 0x30, 0xc3, 0xbb, 0xb5, 0xb8, 0x3b, 0x62, 0x36, + 0x80, 0xc9, 0x86, 0x6c, 0xca, 0xc1, 0x40, 0xa1, 0x52, 0xbe, 0x32, 0xb3, + 0xb8, 0x48, 0x6a, 0xc6, 0xc7, 0x4b, 0xbd, 0xb8, 0x52, 0x4f, 0xca, 0x28, + 0x49, 0x7f, 0x6e, 0xbe, 0x41, 0xd6, 0x5e, 0x95, 0x77, 0xa1, 0x8b, 0x36, + 0x5a, 0x52, 0xd9, 0x58, 0x2f, 0x4b, 0x5e, 0x9e, 0x8b, 0x3b, 0x9a, 0x64, + 0xb2, 0x4a, 0x51, 0x71, 0xbd, 0x67, 0xa4, 0xa0, 0x95, 0xb8, 0x97, 0x75, + 0xb0, 0xa0, 0xcf, 0xb7, 0xad, 0xba, 0x86, 0xe2, 0xad, 0x3b, 0xcb, 0xb4, + 0x43, 0xc3, 0x94, 0x7b, 0x4e, 0xce, 0xca, 0x41, 0x61, 0x48, 0xa5, 0x8c, + 0xbe, 0x43, 0x9f, 0x8f, 0x5d, 0x3c, 0x73, 0xdf, 0x6f, 0x53, 0xd2, 0x5e, + 0x9f, 0x4f, 0x70, 0xbc, 0x45, 0x92, 0x9c, 0x5e, 0x8e, 0xc9, 0x9e, 0xbe, + 0x34, 0x2e, 0x73, 0xc0, 0x5e, 0x31, 0xb7, 0xc4, 0x5f, 0x7b, 0xbd, 0x9e, + 0x3d, 0xb7, 0x74, 0xd2, 0xba, 0x70, 0xc0, 0x98, 0xc1, 0x51, 0xd2, 0xb0, + 0x81, 0x84, 0x81, 0xd4, 0x50, 0x54, 0x84, 0xbd, 0x97, 0x6c, 0x31, 0x81, + 0x21, 0x61, 0xa5, 0x1e, 0xd0, 0x6a, 0x99, 0x6d, 0x51, 0xb3, 0xb9, 0xa6, + 0xaf, 0x79, 0xa0, 0x8c, 0x5f, 0x93, 0x38, 0xcf, 0x53, 0xae, 0x6b, 0xcf, + 0x30, 0x65, 0x5e, 0xa3, 0xb2, 0xa9, 0x51, 0x52, 0x57, 0xa9, 0xb9, 0x3a, + 0xa9, 0xcb, 0x8d, 0x4c, 0x95, 0xbc, 0x57, 0xba, 0x55, 0x4d, 0xbf, 0x70, + 0x8b, 0xb5, 0x74, 0x2c, 0x2d, 0x4a, 0x77, 0xa5, 0xd0, 0x62, 0xb8, 0x6a, + 0x94, 0x72, 0xa5, 0xbe, 0x3f, 0x8b, 0x70, 0x9f, 0x61, 0x69, 0x9f, 0x80, + 0x94, 0x39, 0x69, 0x37, 0x55, 0x46, 0xbc, 0xcb, 0x9f, 0x33, 0xd4, 0x42, + 0x8c, 0x90, 0x54, 0x35, 0x99, 0xb7, 0xb9, 0x3b, 0x3d, 0x7b, 0x61, 0xb7, + 0xb8, 0x36, 0x37, 0x39, 0xb6, 0x93, 0x93, 0xa4, 0x91, 0x8c, 0xb8, 0xc3, + 0x58, 0x53, 0xc8, 0x95, 0xbc, 0x93, 0x50, 0x25, 0x86, 0x9d, 0xd8, 0x4b, + 0xb0, 0x41, 0x94, 0x84, 0x4b, 0x7a, 0x46, 0x71, 0x92, 0xa8, 0xd6, 0xca, + 0x48, 0x50, 0x5a, 0x65, 0x8f, 0x7f, 0x37, 0x84, 0xa1, 0x42, 0x90, 0x59, + 0x4a, 0x7d, 0xbc, 0xa1, 0xa4, 0x7f, 0x89, 0x86, 0x37, 0xc4, 0xc2, 0xb3, + 
0xb1, 0xaf, 0xba, 0x7c, 0x61, 0x44, 0xb0, 0xe1, 0x3b, 0xb2, 0x61, 0x4e, + 0x83, 0x44, 0xc3, 0xc2, 0x42, 0x3d, 0x8d, 0xcc, 0x3a, 0x75, 0xc9, 0x38, + 0x5c, 0xa9, 0xb4, 0x63, 0xa0, 0x45, 0x6f, 0x87, 0x9e, 0x8d, 0x52, 0xa8, + 0xb6, 0x7c, 0xa4, 0x72, 0x3e, 0x3c, 0x74, 0x77, 0xbe, 0x61, 0x87, 0x8c, + 0x55, 0xd3, 0xaa, 0x86, 0x86, 0x45, 0x36, 0x5d, 0x4c, 0x3e, 0x9c, 0x29, + 0x58, 0x7b, 0x87, 0x89, 0x60, 0x3f, 0x65, 0xd0, 0x9e, 0xd5, 0x3f, 0xc8, + 0xd9, 0x76, 0xae, 0xb2, 0xa6, 0x4c, 0xce, 0xcb, 0x5b, 0x84, 0x77, 0x98, + 0xb2, 0x41, 0xc8, 0xbb, 0x90, 0x5b, 0x63, 0xa4, 0x74, 0x73, 0x45, 0x4c, + 0x6b, 0x9a, 0x9d, 0xb2, 0xc3, 0xba, 0x63, 0x74, 0x5d, 0x5c, 0xba, 0x8f, + 0x99, 0x3e, 0xcb, 0xc8, 0x77, 0x5e, 0xe1, 0xde, 0x55, 0x38, 0x5d, 0xae, + 0x89, 0xbf, 0xbc, 0xb3, 0x89, 0xb0, 0xc3, 0xb5, 0xad, 0xcd, 0x72, 0x4b, + 0x29, 0x3b, 0x54, 0xcd, 0xb0, 0x3d, 0x33, 0x57, 0x96, 0xc3, 0x9b, 0xcf, + 0x9d, 0xaa, 0x8a, 0x4c, 0x85, 0xc9, 0x8f, 0xd0, 0x3a, 0x73, 0x96, 0x50, + 0xc2, 0x27, 0x7a, 0xb4, 0x48, 0x66, 0xa2, 0x63, 0xab, 0x44, 0x6f, 0x38, + 0xc7, 0x95, 0xc1, 0xa2, 0xb3, 0xc3, 0x82, 0x5b, 0x3e, 0x87, 0xa0, 0x7f, + 0x3e, 0x64, 0x66, 0xb3, 0x5d, 0xd0, 0x9c, 0x89, 0x32, 0x6f, 0xb0, 0xce, + 0xb1, 0x3e, 0x4c, 0xa0, 0xbd, 0x87, 0x44, 0xca, 0xc1, 0x94, 0xa1, 0x82, + 0xa6, 0x7e, 0x9b, 0x6a, 0xc5, 0xbb, 0xa4, 0x95, 0x72, 0x63, 0xb2, 0x5e, + 0x42, 0x86, 0x99, 0x2f, 0xc5, 0x8f, 0x73, 0xcc, 0xa8, 0x60, 0x72, 0xb6, + 0x80, 0x81, 0x46, 0xc3, 0x44, 0x8a, 0xc8, 0x47, 0xac, 0xc7, 0x3e, 0x38, + 0xa0, 0xb7, 0x6c, 0x58, 0x5a, 0x85, 0x83, 0x48, 0x95, 0x89, 0x9a, 0x94, + 0xac, 0xb4, 0x7b, 0x6b, 0x73, 0xa6, 0x62, 0x76, 0xaa, 0x9f, 0xba, 0x3e, + 0x6f, 0x6c, 0xa4, 0x5c, 0x9d, 0x77, 0x8e, 0x8b, 0x42, 0xba, 0xb1, 0x69, + 0x6c, 0x64, 0xc0, 0xc1, 0x4e, 0x67, 0x9c, 0x43, 0xb4, 0x28, 0x62, 0xae, + 0xbf, 0x59, 0xa0, 0x67, 0xa6, 0xbe, 0x6a, 0x8b, 0x3d, 0xc0, 0xa5, 0xdd, + 0xb1, 0xc6, 0x73, 0x95, 0xaa, 0x95, 0x51, 0x59, 0x92, 0x7c, 0x98, 0x70, + 0xaf, 0xa7, 0xaa, 0xc3, 0x56, 0x54, 0x49, 0x9e, 0xb5, 0x45, 0xa2, 0xd0, + 0x7c, 0xbb, 0xa8, 0x5e, 0xd5, 0x53, 0xb1, 0xa5, 0x2f, 0x79, 0x6a, 0xc9, + 0x3c, 0x36, 0xaa, 0x4e, 0x93, 0x92, 0xa0, 0x79, 0x80, 0x85, 0x6e, 0x5d, + 0xc6, 0xcb, 0xce, 0x68, 0xc6, 0x72, 0x83, 0x7e, 0xc3, 0x69, 0x9c, 0x45, + 0x75, 0x6f, 0x66, 0x5f, 0xa1, 0xcc, 0x4e, 0x4c, 0x4e, 0x8f, 0x56, 0x3a, + 0x7f, 0xc3, 0x95, 0x75, 0x9a, 0x5f, 0x4c, 0xa0, 0xc6, 0x64, 0x72, 0x73, + 0xb0, 0x9d, 0x98, 0x7b, 0xab, 0xcb, 0x3b, 0x7a, 0x56, 0xaf, 0x40, 0x50, + 0xc6, 0x7f, 0x82, 0x3c, 0x8f, 0x62, 0x75, 0xd9, 0xd8, 0xd9, 0x6e, 0x3a, + 0x92, 0x7a, 0x9b, 0xb0, 0x7a, 0x57, 0x47, 0x51, 0x46, 0x90, 0x7a, 0xb9, + 0x5c, 0xba, 0xa1, 0x32, 0xd8, 0xb1, 0x81, 0xcb, 0x2e, 0xcd, 0x84, 0x5a, + 0xa5, 0xbc, 0x82, 0xb6, 0x7f, 0xbe, 0x58, 0x9e, 0xbf, 0x7f, 0x3c, 0x54, + 0x5e, 0x38, 0x66, 0x79, 0x4f, 0x39, 0x50, 0xa3, 0x97, 0x85, 0x45, 0xab, + 0x54, 0x48, 0x50, 0x85, 0xbb, 0x80, 0x88, 0x80, 0xc4, 0x55, 0x9c, 0x8d, + 0xbe, 0x82, 0x92, 0xcb, 0x3a, 0x77, 0x44, 0xb1, 0x5f, 0xa3, 0xbd, 0xb9, + 0x7a, 0xa2, 0x41, 0x6f, 0x87, 0xc4, 0xcd, 0x6a, 0x88, 0x67, 0xb2, 0x80, + 0x72, 0x50, 0xac, 0x7e, 0xc3, 0x4b, 0xcd, 0xab, 0x60, 0x67, 0x49, 0x5e, + 0x73, 0x71, 0x9f, 0x78, 0x6d, 0xb9, 0xac, 0xd0, 0x4a, 0x80, 0x6f, 0xba, + 0x70, 0x3d, 0xc1, 0x83, 0xcb, 0x9c, 0x42, 0xbd, 0x36, 0x86, 0x3e, 0x61, + 0xcd, 0x74, 0x3f, 0x48, 0xd0, 0xc6, 0x9e, 0xcf, 0xce, 0x96, 0x68, 0x68, + 0x9c, 0x7b, 0x71, 0xba, 0x52, 0x7a, 0x6b, 0x42, 0xa5, 0x9f, 0xc6, 0x5f, + 0xd0, 0x8e, 0x71, 0x3a, 0x9b, 0x65, 0xa3, 0xbd, 0xc7, 0x5a, 0x79, 0x76, + 0x6a, 0x93, 0xc5, 0x3d, 0x77, 0x58, 0x46, 0xc1, 0xbe, 0x7e, 0x66, 0x8a, + 
0x78, 0x41, 0x71, 0x49, 0x6d, 0x38, 0x69, 0xad, 0xd0, 0xa9, 0x5e, 0xbc, + 0x6f, 0x6c, 0x69, 0xa8, 0x91, 0x5c, 0x4e, 0x3a, 0x47, 0xc1, 0x7d, 0x85, + 0x4e, 0x70, 0x3e, 0x3e, 0x50, 0x5a, 0x4c, 0xc1, 0xcd, 0xc9, 0x97, 0x73, + 0xb7, 0x65, 0xcf, 0x64, 0x95, 0xae, 0xb2, 0x36, 0x44, 0x82, 0x55, 0x6d, + 0x9f, 0x98, 0xcd, 0x62, 0x5a, 0xc1, 0x50, 0x35, 0x95, 0x7e, 0x79, 0x90, + 0x4e, 0xcc, 0xa7, 0x8d, 0x62, 0x3b, 0xa8, 0xb4, 0xd4, 0x9a, 0xa9, 0x2f, + 0x33, 0xaa, 0x59, 0x5d, 0xa3, 0x67, 0xc9, 0x61, 0xd2, 0xad, 0x48, 0x76, + 0x86, 0x91, 0x43, 0x42, 0x6b, 0x78, 0x7f, 0x64, 0x60, 0xa6, 0x63, 0xb5, + 0x50, 0x64, 0x77, 0x51, 0x35, 0xb4, 0x56, 0x66, 0x32, 0x85, 0x52, 0x78, + 0x7d, 0x58, 0x72, 0x58, 0xc7, 0x6d, 0xb8, 0x86, 0xd1, 0x55, 0x72, 0x71, + 0x54, 0x6e, 0x4f, 0x84, 0xc4, 0x74, 0x86, 0x86, 0x6c, 0xd5, 0xa7, 0xd2, + 0x51, 0x93, 0x77, 0x6d, 0x68, 0x76, 0xb3, 0x6a, 0xa7, 0x2e, 0xb6, 0x3f, + 0x43, 0x7e, 0xc1, 0x7d, 0x4c, 0xad, 0xad, 0xb4, 0x74, 0x4c, 0x44, 0x8b, + 0x8f, 0xba, 0xc0, 0x98, 0x3c, 0xbd, 0x93, 0xa5, 0x61, 0x77, 0x52, 0x40, + 0x4c, 0x5e, 0xb1, 0x95, 0x67, 0x4e, 0x77, 0x51, 0xa1, 0x38, 0x40, 0x7a, + 0x45, 0x9b, 0xa3, 0x63, 0x7f, 0x67, 0xb7, 0x68, 0x74, 0xc1, 0x5b, 0x4e, + 0xb8, 0x61, 0x6b, 0x87, 0x61, 0x7d, 0x4c, 0x56, 0x8c, 0x92, 0x98, 0x74, + 0x9f, 0xb6, 0x55, 0x87, 0xad, 0x4d, 0x5a, 0x7a, 0x44, 0x36, 0x9d, 0xc2, + 0x9e, 0x8d, 0xa9, 0xd0, 0x3b, 0x7f, 0x9b, 0x46, 0xba, 0x91, 0xa8, 0x9e, + 0x4a, 0x60, 0x5f, 0x94, 0x6d, 0xbf, 0x55, 0x92, 0x60, 0x5d, 0x56, 0x89, + 0x3c, 0x82, 0xbd, 0xaa, 0xc9, 0x9a, 0xa8, 0x9f, 0x33, 0x73, 0x94, 0x9e, + 0x95, 0x5f, 0x66, 0x2e, 0x46, 0x3f, 0x38, 0x72, 0x6c, 0x64, 0x56, 0x5e, + 0x90, 0x7b, 0x79, 0x50, 0xaf, 0x43, 0xbb, 0x49, 0x4c, 0xd1, 0x5d, 0x99, + 0xb4, 0x5b, 0x60, 0xbf, 0x31, 0x87, 0x89, 0x55, 0x6f, 0x77, 0x63, 0x9d, + 0x88, 0x86, 0xa5, 0x3a, 0xb3, 0xd1, 0x9e, 0x7a, 0xc0, 0x99, 0xc7, 0x9d, + 0x95, 0x93, 0x8d, 0x58, 0xca, 0x67, 0x60, 0x76, 0xc5, 0x88, 0x82, 0x51, + 0xa5, 0xae, 0x41, 0xba, 0x75, 0x62, 0x95, 0xcb, 0x33, 0xb5, 0x97, 0x6f, + 0xc3, 0xd1, 0x70, 0x90, 0x4b, 0x84, 0x74, 0x9c, 0x43, 0xca, 0xcb, 0xae, + 0x7c, 0x9e, 0x64, 0xd0, 0x2e, 0xc9, 0xb7, 0xab, 0xac, 0xa9, 0x3f, 0xaa, + 0xc8, 0x3c, 0x65, 0x83, 0x52, 0x57, 0x95, 0xb9, 0x4c, 0xc3, 0xbc, 0xa5, + 0x4e, 0x9f, 0x88, 0x40, 0x40, 0x7c, 0x57, 0xa1, 0xd1, 0x46, 0x95, 0x2e, + 0x77, 0xab, 0x5c, 0x51, 0xab, 0x60, 0xd1, 0x3c, 0x77, 0xbb, 0x7f, 0xd2, + 0x86, 0x82, 0xb3, 0x5f, 0xa9, 0xb9, 0xa8, 0x6c, 0xac, 0xb1, 0x64, 0xb7, + 0xb2, 0x98, 0x58, 0xc1, 0xab, 0x77, 0xa7, 0x94, 0xd3, 0xa4, 0xc0, 0x77, + 0xc8, 0x37, 0x9a, 0x4e, 0x4e, 0x6a, 0x7c, 0x93, 0xc4, 0x36, 0xb7, 0x38, + 0xc5, 0x9d, 0xcf, 0xa8, 0xc5, 0xb0, 0x6f, 0x6c, 0x66, 0x62, 0x4f, 0x7e, + 0x63, 0x54, 0x38, 0xb2, 0xac, 0x67, 0xba, 0xae, 0x73, 0x9f, 0xd2, 0xd2, + 0x54, 0x3d, 0x58, 0x51, 0x49, 0xbd, 0x7c, 0x3a, 0xb0, 0x9f, 0x66, 0x5a, + 0x39, 0xa7, 0x7f, 0x80, 0x9f, 0xca, 0x46, 0x3d, 0xc7, 0xba, 0x8c, 0x86, + 0xa1, 0xc5, 0x92, 0x83, 0x4f, 0xa2, 0x9b, 0xd5, 0x48, 0x52, 0x68, 0x98, + 0x7b, 0xa6, 0x86, 0xc8, 0x70, 0x9c, 0x91, 0xd2, 0xc0, 0xad, 0x35, 0x3f, + 0xac, 0x5e, 0xc4, 0xc3, 0x35, 0xa7, 0xc8, 0x5a, 0x5d, 0x49, 0xcc, 0x8a, + 0x69, 0x7e, 0x5b, 0xc5, 0x75, 0xca, 0x7d, 0xcb, 0xbd, 0x6f, 0x82, 0x94, + 0x41, 0x7c, 0x64, 0xb3, 0x6f, 0x46, 0xc3, 0xc6, 0x3f, 0xc8, 0x6f, 0xca, + 0x30, 0xcb, 0x36, 0xb4, 0xa1, 0xac, 0x9b, 0x76, 0x4d, 0x47, 0xc7, 0x88, + 0x76, 0xce, 0x48, 0x66, 0x97, 0x6d, 0x32, 0x99, 0x68, 0x54, 0x6d, 0x88, + 0x70, 0x92, 0x98, 0xce, 0x88, 0xae, 0xa0, 0x83, 0xca, 0x47, 0xbc, 0x8f, + 0x56, 0x51, 0x54, 0x62, 0x8b, 0x39, 0x56, 0x81, 0x6c, 0x9d, 0xa3, 0x79, + 
0x7f, 0xa0, 0x59, 0x84, 0x67, 0x90, 0xac, 0x93, 0x61, 0xbc, 0xb8, 0x4e, + 0x63, 0x39, 0x46, 0x4f, 0x36, 0xc2, 0xa5, 0x3f, 0xcc, 0x42, 0x7f, 0x46, + 0xae, 0x77, 0x65, 0x70, 0x9f, 0xbe, 0xd8, 0xb0, 0x47, 0xaa, 0x6e, 0xba, + 0xa7, 0x9d, 0x61, 0x91, 0x69, 0xc7, 0xb5, 0x40, 0x90, 0x58, 0x82, 0x9f, + 0x60, 0x45, 0x42, 0x80, 0xa4, 0x74, 0xd2, 0xb9, 0x8a, 0x7b, 0xca, 0x6c, + 0xc8, 0x78, 0x4b, 0xb8, 0xb3, 0x35, 0xba, 0x4f, 0xa6, 0x62, 0x4a, 0x9d, + 0x6b, 0x84, 0x67, 0x6a, 0x44, 0x6c, 0xbc, 0xb9, 0xce, 0x8f, 0x7d, 0xb3, + 0xd5, 0x8d, 0xb4, 0x78, 0x63, 0xc9, 0xa7, 0x70, 0x62, 0x50, 0x8b, 0xa2, + 0xbb, 0xae, 0x2c, 0x95, 0x7c, 0x98, 0x6a, 0x56, 0x54, 0x9a, 0xa0, 0x50, + 0x84, 0x62, 0x88, 0x87, 0x7b, 0x2d, 0x7d, 0xcd, 0x4d, 0x3f, 0x72, 0x4f, + 0x85, 0xb1, 0xcf, 0xad, 0xa4, 0xad, 0x42, 0x49, 0x5a, 0x66, 0x39, 0x94, + 0xc9, 0xc3, 0x4e, 0x59, 0x8c, 0xc6, 0x53, 0xb9, 0x54, 0x5a, 0x79, 0x72, + 0xc6, 0xb4, 0xab, 0x4c, 0x99, 0x41, 0x54, 0x3d, 0xb9, 0x90, 0x43, 0x64, + 0x69, 0x72, 0xad, 0xc3, 0x58, 0x68, 0x77, 0x58, 0x76, 0x79, 0x48, 0xa2, + 0x87, 0x76, 0xc2, 0x34, 0x74, 0xad, 0xcb, 0x99, 0x72, 0x90, 0x6e, 0xb5, + 0xa6, 0x5e, 0x54, 0x9e, 0xa7, 0x64, 0x9a, 0xbd, 0x6d, 0x9f, 0x4b, 0x35, + 0x65, 0x50, 0x40, 0x89, 0xa2, 0x77, 0x51, 0x44, 0xc5, 0xcd, 0x8a, 0xbc, + 0x55, 0x5e, 0x85, 0x4f, 0x92, 0x58, 0xa3, 0x6d, 0x4b, 0xca, 0x53, 0xbd, + 0x49, 0x78, 0xa3, 0x87, 0x86, 0x9b, 0x41, 0x63, 0xba, 0xb0, 0x91, 0x9c, + 0x9d, 0x5e, 0x38, 0x66, 0x78, 0x96, 0x41, 0x61, 0xae, 0xa2, 0x77, 0x80, + 0x54, 0xc4, 0x9d, 0x6b, 0x4d, 0xd3, 0x4f, 0x47, 0x6b, 0x84, 0xb9, 0x50, + 0xb3, 0x94, 0xc2, 0x94, 0x50, 0x6b, 0x4c, 0x83, 0x51, 0xc0, 0xb2, 0x6a, + 0x76, 0x71, 0x9c, 0x7a, 0x89, 0x8a, 0xb9, 0xa3, 0xb3, 0x6a, 0x93, 0x8b, + 0x38, 0x63, 0xb8, 0xbb, 0x72, 0x64, 0x80, 0xb4, 0x56, 0xa5, 0xb9, 0x4c, + 0x6c, 0xa1, 0x9a, 0x69, 0x79, 0x65, 0x82, 0x9f, 0x87, 0x7f, 0xc5, 0x8e, + 0x8e, 0x36, 0x38, 0x90, 0x45, 0x56, 0x92, 0xb8, 0x8f, 0x7c, 0x9d, 0x5e, + 0x9d, 0x37, 0xd3, 0x7d, 0x56, 0x62, 0xb8, 0x93, 0x70, 0x44, 0x87, 0x80, + 0x74, 0x98, 0x66, 0x66, 0xcf, 0x86, 0xb9, 0x5a, 0x8e, 0x8a, 0x59, 0x96, + 0x77, 0x50, 0x7f, 0x65, 0x9e, 0x39, 0xc2, 0x5f, 0x47, 0xc4, 0x91, 0x8a, + 0xaf, 0x3f, 0x9d, 0x6a, 0x42, 0x5f, 0x60, 0xc8, 0xcb, 0x3a, 0xb2, 0xad, + 0xae, 0x89, 0x34, 0xb7, 0x8a, 0x82, 0xc2, 0x44, 0xce, 0x8e, 0x4b, 0x59, + 0xc3, 0xd4, 0x6e, 0x7f, 0x42, 0x87, 0x3e, 0xa0, 0x40, 0xd1, 0x91, 0x92, + 0x6e, 0x50, 0xb9, 0xc2, 0x8a, 0x41, 0x7b, 0x4e, 0x55, 0x6d, 0xcd, 0x43, + 0x7f, 0x69, 0xd0, 0x3b, 0x41, 0xbf, 0x5f, 0x34, 0xc4, 0x80, 0x91, 0xa3, + 0x8e, 0xb6, 0x66, 0x58, 0x42, 0x91, 0x65, 0xa4, 0x8c, 0x9f, 0x52, 0xac, + 0xbd, 0x53, 0x40, 0xd0, 0x9b, 0xb9, 0xa0, 0xaf, 0xc2, 0x54, 0x6e, 0xa1, + 0x54, 0x46, 0x92, 0x64, 0x5c, 0x6c, 0xcb, 0x52, 0xca, 0x60, 0xae, 0xd0, + 0x6a, 0x7c, 0x42, 0x7d, 0xd3, 0x84, 0xcb, 0x61, 0xab, 0xca, 0x56, 0x57, + 0x44, 0x94, 0x64, 0xcb, 0x91, 0x67, 0x51, 0x45, 0x45, 0x54, 0xd1, 0x58, + 0x94, 0x43, 0x6c, 0xab, 0xb2, 0x90, 0x7b, 0x78, 0x4c, 0x97, 0x56, 0xb4, + 0x51, 0x33, 0x91, 0x67, 0x71, 0xb3, 0x5f, 0x43, 0x9f, 0x57, 0xcd, 0x6b, + 0x5e, 0x8c, 0xbc, 0xa8, 0x5e, 0xcb, 0x8a, 0xc7, 0x8c, 0x32, 0xb4, 0x3f, + 0xd3, 0xb8, 0x7c, 0x7b, 0x53, 0xc7, 0x5f, 0x78, 0xa4, 0x6a, 0x5d, 0xb4, + 0x42, 0x61, 0x7c, 0x86, 0x76, 0x33, 0x6d, 0x3a, 0x99, 0x5c, 0x67, 0x68, + 0x99, 0x6d, 0x70, 0x48, 0xd1, 0xb7, 0xc8, 0x3a, 0x6f, 0x5f, 0x86, 0x5c, + 0xba, 0x6f, 0x9b, 0x45, 0x4c, 0xad, 0xb7, 0x72, 0xb3, 0x91, 0x44, 0xcf, + 0xc1, 0x7a, 0xd0, 0x6b, 0x34, 0x55, 0x76, 0x57, 0x4e, 0xbb, 0x4b, 0xc9, + 0x9b, 0x82, 0x9e, 0x70, 0x68, 0x4d, 0xb2, 0x34, 0x68, 0x79, 0x9e, 0x9d, + 
0x4d, 0xc5, 0xa3, 0xc3, 0x3c, 0x95, 0xae, 0x3f, 0x9e, 0x83, 0x5d, 0x78, + 0xae, 0x3d, 0xbe, 0xae, 0x5d, 0x4b, 0xc8, 0xce, 0xad, 0xca, 0x75, 0x3a, + 0xaf, 0x96, 0x3b, 0x6a, 0x3b, 0x4e, 0x9f, 0x40, 0xb0, 0x9c, 0x67, 0x41, + 0x51, 0xbd, 0x59, 0xc0, 0x86, 0x4b, 0xa6, 0x5e, 0x89, 0x52, 0x56, 0xce, + 0x4f, 0xbf, 0xaf, 0x6c, 0x63, 0x5a, 0x4a, 0x4d, 0x49, 0x5f, 0x2f, 0x95, + 0x61, 0x8c, 0x55, 0xa7, 0xd0, 0x34, 0x42, 0x4c, 0x80, 0x6d, 0xaa, 0x58, + 0x6b, 0x54, 0x8c, 0x74, 0xb9, 0x5c, 0x3a, 0xbf, 0x92, 0x61, 0x5c, 0xce, + 0xac, 0x67, 0xca, 0x77, 0xb0, 0x66, 0xc6, 0x4b, 0x3a, 0x7d, 0xa5, 0x57, + 0x3e, 0x47, 0x6a, 0x71, 0x61, 0x72, 0x64, 0x51, 0x9a, 0x9a, 0x2e, 0x52, + 0xb5, 0x95, 0x31, 0x8f, 0x9b, 0x54, 0x90, 0x66, 0xb0, 0x6c, 0x42, 0x8f, + 0xb4, 0xc8, 0xab, 0x4d, 0xcf, 0x8d, 0x5c, 0x36, 0x4a, 0x45, 0x74, 0xce, + 0x7c, 0x92, 0x87, 0xd2, 0x64, 0x7a, 0x7d, 0x5d, 0xb9, 0x60, 0xa9, 0x96, + 0x51, 0xbd, 0xa2, 0x63, 0xb9, 0xbc, 0xba, 0x9f, 0x6d, 0x48, 0x4b, 0xb4, + 0x6d, 0x39, 0x5c, 0x6b, 0x74, 0xb2, 0x94, 0x95, 0x99, 0x4b, 0x5a, 0xca, + 0x3d, 0x58, 0x80, 0xc8, 0x55, 0x6f, 0xbf, 0x9f, 0x49, 0xbc, 0x8f, 0x4e, + 0x74, 0x90, 0xaa, 0x6b, 0x9c, 0x44, 0xab, 0x9d, 0xcd, 0x89, 0x9f, 0x4c, + 0xa7, 0x88, 0x86, 0xd2, 0x44, 0x8c, 0x44, 0x61, 0xbd, 0x38, 0x62, 0x3d, + 0x39, 0x87, 0xaf, 0x3a, 0x95, 0xcd, 0xb4, 0x3a, 0x7c, 0x8d, 0x63, 0x66, + 0xc9, 0xc3, 0xaa, 0xc4, 0x34, 0x96, 0x63, 0x54, 0x70, 0x45, 0xc5, 0xc6, + 0x5f, 0x82, 0xad, 0x3f, 0x61, 0x94, 0xa9, 0x44, 0x41, 0x71, 0x7e, 0xaa, + 0xca, 0xc7, 0xa6, 0xc3, 0x64, 0x4b, 0xcb, 0x9d, 0x98, 0x43, 0x7b, 0x74, + 0x5c, 0x7d, 0x62, 0xaa, 0x70, 0xc8, 0x95, 0xc1, 0xb1, 0xc4, 0x49, 0xa2, + 0x38, 0x77, 0x7c, 0x61, 0x47, 0x7b, 0xb7, 0xc1, 0xc4, 0xa0, 0x96, 0x97, + 0x84, 0x54, 0xb9, 0x63, 0x92, 0x76, 0xc4, 0x39, 0x97, 0xa1, 0xaa, 0xc7, + 0xa3, 0xb1, 0xb2, 0x3c, 0x84, 0xba, 0x5c, 0x88, 0x7d, 0x9e, 0xc9, 0xb5, + 0x83, 0x71, 0x4a, 0xc4, 0xca, 0x67, 0x88, 0x73, 0x70, 0x6f, 0x6a, 0x66, + 0x61, 0x6a, 0xaf, 0xb9, 0xbe, 0x58, 0x98, 0x4b, 0x48, 0x9d, 0x5f, 0xd4, + 0xa2, 0x71, 0xcb, 0x61, 0x47, 0x72, 0x64, 0x4c, 0xc2, 0x48, 0x37, 0x40, + 0xa8, 0x84, 0x75, 0x7a, 0xc1, 0xab, 0x6a, 0x5c, 0x6f, 0x3e, 0x9b, 0x4a, + 0xc7, 0x5b, 0xb5, 0xca, 0x73, 0x9c, 0x6e, 0x53, 0x51, 0x79, 0xa3, 0xa8, + 0x53, 0xce, 0x37, 0x54, 0xa4, 0x5d, 0xad, 0x42, 0x8a, 0xb9, 0xa2, 0x93, + 0x7d, 0x93, 0x6c, 0xa6, 0x85, 0x46, 0x32, 0x59, 0x9b, 0xb1, 0x8b, 0xd6, + 0x37, 0x57, 0x7d, 0xb6, 0x6c, 0x56, 0x3f, 0x70, 0x6f, 0x9a, 0x43, 0x80, + 0xbc, 0x69, 0x56, 0x60, 0xc2, 0xad, 0xa7, 0xc2, 0x36, 0x5e, 0x42, 0xa2, + 0xd9, 0x4e, 0xd0, 0x9e, 0x67, 0xaa, 0xb7, 0x4b, 0x88, 0x62, 0x51, 0xcf, + 0x9f, 0x4d, 0x3d, 0x43, 0xad, 0xce, 0xcc, 0xb0, 0xc9, 0x59, 0x48, 0x7a, + 0xc5, 0x95, 0x92, 0x94, 0x9f, 0x69, 0xd8, 0xae, 0xc2, 0xcf, 0x80, 0xa6, + 0x35, 0x83, 0xdd, 0x7d, 0x4e, 0x28, 0x73, 0xa3, 0x31, 0x5d, 0xbc, 0xb0, + 0x8c, 0xd2, 0xce, 0xaf, 0xd2, 0x49, 0xb4, 0x80, 0x75, 0x96, 0x77, 0xd4, + 0x4c, 0x31, 0x95, 0xa5, 0x7c, 0x29, 0x4d, 0xd3, 0xcf, 0x43, 0xd7, 0xa3, + 0x9b, 0x83, 0xad, 0x72, 0x5a, 0x5e, 0x6f, 0x95, 0x2f, 0xbd, 0xa6, 0x7a, + 0x44, 0xbd, 0xc8, 0x5b, 0x60, 0x36, 0x97, 0x89, 0xb1, 0x83, 0x7d, 0xa8, + 0xb0, 0x96, 0x93, 0xad, 0x75, 0x79, 0xbc, 0x7e, 0x91, 0xc0, 0xe0, 0x36, + 0x5c, 0x47, 0x60, 0x54, 0x51, 0xb3, 0x58, 0x35, 0x3a, 0xbc, 0x41, 0x67, + 0x68, 0x8d, 0xdb, 0x56, 0x4b, 0x5f, 0xd5, 0xb7, 0xd1, 0x4b, 0xae, 0xd2, + 0x6c, 0xa6, 0x96, 0x51, 0xca, 0x70, 0x54, 0x86, 0x6e, 0x31, 0xca, 0xd6, + 0x8d, 0xb0, 0x67, 0xa3, 0x8b, 0x47, 0x85, 0x53, 0x9e, 0xc1, 0x94, 0x52, + 0xad, 0x7d, 0x98, 0x6c, 0xa6, 0x61, 0x62, 0x9b, 0x8c, 0x6b, 0xd6, 0x8d, + 
0xb4, 0x9c, 0x62, 0x72, 0xaa, 0xca, 0x79, 0x7a, 0xa7, 0x93, 0xc3, 0xdb, + 0xcc, 0x4b, 0x9f, 0xb4, 0xa3, 0x5f, 0xd1, 0x4f, 0x48, 0x57, 0x46, 0x41, + 0x5b, 0xa3, 0x30, 0x53, 0x6d, 0xa8, 0x31, 0x42, 0xaf, 0xd1, 0xb4, 0x82, + 0x8e, 0x59, 0x94, 0x5c, 0x93, 0x85, 0x4e, 0x8b, 0x7f, 0x5d, 0xb2, 0x99, + 0x79, 0xb8, 0xc3, 0x4c, 0xca, 0xca, 0xd1, 0x3a, 0xaa, 0xa4, 0x62, 0x60, + 0x7c, 0xa5, 0xa8, 0x65, 0x7e, 0xcc, 0xb6, 0x49, 0xb3, 0x39, 0x9f, 0xa3, + 0x5b, 0x8c, 0x97, 0xb1, 0x83, 0x61, 0x62, 0xb6, 0xce, 0xb1, 0x9b, 0xb2, + 0xb5, 0x7e, 0xb6, 0x4e, 0x5a, 0xa4, 0xcd, 0x67, 0x62, 0x6b, 0xb8, 0xc5, + 0x7a, 0x5e, 0xa3, 0x7d, 0x97, 0x7e, 0x4b, 0xbc, 0xa1, 0x9d, 0x28, 0xa6, + 0x55, 0x3a, 0xab, 0x49, 0x74, 0x77, 0x65, 0x99, 0xb1, 0x35, 0x7e, 0x46, + 0x4f, 0xb7, 0xbf, 0x2f, 0x38, 0x76, 0xc9, 0x26, 0x5a, 0x67, 0xd7, 0x41, + 0x51, 0xe2, 0xbe, 0x5e, 0xaa, 0xe8, 0x34, 0xd2, 0x8a, 0x9a, 0x3f, 0xc9, + 0x28, 0x5a, 0x6b, 0xa3, 0x2f, 0xbd, 0xb0, 0xbe, 0xb3, 0x6c, 0x25, 0xaa, + 0x2e, 0x5b, 0x93, 0xa0, 0xd6, 0x4e, 0x4b, 0xd5, 0x4a, 0x63, 0xa0, 0xcf, + 0x4a, 0x2e, 0xb3, 0x51, 0x8c, 0xb9, 0x9e, 0xa9, 0xcb, 0x66, 0xb5, 0x80, + 0x89, 0x79, 0xa3, 0x6d, 0x54, 0x9c, 0x66, 0xc0, 0xc7, 0xbf, 0x71, 0xb4, + 0x9c, 0xa6, 0x4e, 0xb8, 0xbe, 0xbe, 0x7d, 0x3a, 0x6c, 0xcf, 0x80, 0x73, + 0x3f, 0x91, 0x64, 0xcf, 0x5b, 0x51, 0xba, 0xab, 0x5a, 0xaf, 0xb6, 0x2f, + 0x95, 0x7f, 0xc3, 0x81, 0xb7, 0x82, 0x88, 0xa1, 0x97, 0x86, 0x40, 0xb3, + 0x9a, 0x4b, 0x87, 0x73, 0xa3, 0xa1, 0xb7, 0x53, 0xac, 0xb5, 0x67, 0x45, + 0x76, 0x88, 0x50, 0xc2, 0x61, 0xd6, 0x98, 0xac, 0x3e, 0x58, 0x59, 0x79, + 0x37, 0xae, 0xdb, 0x43, 0x52, 0x81, 0xb6, 0x35, 0x80, 0x66, 0xab, 0xc0, + 0x82, 0x54, 0x62, 0x7d, 0xa7, 0xab, 0xc6, 0x3a, 0xc5, 0xc6, 0x71, 0x5d, + 0xae, 0xba, 0x95, 0x76, 0x6f, 0x3d, 0x9f, 0x45, 0x94, 0xcd, 0x6f, 0xb8, + 0xb7, 0x70, 0xba, 0xc5, 0x5c, 0x8c, 0xc3, 0x7a, 0x6a, 0x37, 0x75, 0x6e, + 0xdd, 0xe0, 0xa2, 0xc4, 0xa9, 0xd6, 0x94, 0x59, 0x44, 0xb5, 0x3c, 0x6c, + 0xa7, 0xc5, 0xb8, 0x69, 0x7e, 0xca, 0x7e, 0x76, 0xa0, 0xc3, 0xa0, 0x5f, + 0x39, 0x9b, 0x96, 0x7e, 0x8e, 0x33, 0x36, 0x90, 0x2d, 0x95, 0xba, 0x45, + 0x70, 0xb0, 0x81, 0x9d, 0x48, 0xa4, 0xc4, 0xcd, 0xc1, 0x91, 0xaa, 0x4d, + 0x92, 0x4c, 0x3b, 0xce, 0x37, 0xa9, 0x47, 0xa7, 0x6c, 0x76, 0x41, 0xa4, + 0xc7, 0xa1, 0x9a, 0x5e, 0xba, 0x5f, 0x67, 0x77, 0x72, 0x8a, 0x5f, 0x8a, + 0xb7, 0x74, 0xd5, 0xaf, 0x40, 0xa0, 0x7b, 0xda, 0xc0, 0xad, 0xb3, 0x95, + 0xc4, 0x9c, 0x6c, 0xc1, 0xbe, 0xd2, 0xa1, 0xd8, 0xca, 0x9d, 0xb7, 0x8d, + 0xa0, 0xda, 0xe3, 0x36, 0x4c, 0xd8, 0xd8, 0x64, 0x53, 0xcf, 0x9d, 0x7e, + 0xaa, 0x50, 0x6c, 0x5a, 0x5a, 0x6c, 0x56, 0xad, 0x80, 0xbd, 0x8f, 0x93, + 0x85, 0x37, 0x89, 0xd2, 0xb3, 0x72, 0xaa, 0x9c, 0xba, 0x56, 0x5e, 0x6d, + 0x3f, 0xd3, 0xc1, 0x99, 0xbe, 0xb3, 0xb5, 0x4c, 0x92, 0xca, 0x55, 0xbd, + 0x9f, 0xca, 0xa1, 0xa7, 0x5d, 0x5d, 0xaf, 0x71, 0xa4, 0xb9, 0x75, 0x61, + 0xb6, 0x9b, 0xc6, 0x4e, 0x6b, 0x4c, 0x67, 0x9c, 0xaa, 0x95, 0x36, 0x58, + 0xc3, 0xc4, 0x7a, 0xaf, 0x4a, 0xa8, 0x8f, 0x53, 0xa4, 0xd0, 0xac, 0x99, + 0x6e, 0x53, 0xa5, 0x47, 0xbf, 0xc5, 0x8f, 0x40, 0x3c, 0xcc, 0xa9, 0x9a, + 0x63, 0x58, 0x8b, 0xc4, 0x6b, 0x8b, 0xc2, 0x5d, 0x74, 0x6c, 0x48, 0x7a, + 0x8f, 0xa0, 0xc1, 0x75, 0x8c, 0x95, 0xa0, 0x3f, 0x7d, 0xa0, 0x9f, 0x47, + 0xa0, 0xc6, 0x46, 0x88, 0x61, 0x38, 0x8d, 0x6d, 0x7a, 0x51, 0xdc, 0x87, + 0xba, 0xac, 0x97, 0x81, 0xdb, 0xaa, 0x7e, 0x47, 0x66, 0x5e, 0xc1, 0x49, + 0xb1, 0xbc, 0x58, 0xa0, 0x97, 0x42, 0x64, 0xac, 0x27, 0x79, 0x8c, 0x6e, + 0x84, 0x77, 0x56, 0x7c, 0xd0, 0x7b, 0xc9, 0xe4, 0x78, 0x51, 0x49, 0xa7, + 0xa3, 0x55, 0xc8, 0xba, 0x41, 0x49, 0xc7, 0x7b, 0xae, 0x3b, 0x60, 0x4b, + 
0x31, 0x4d, 0xb7, 0x72, 0xa7, 0x50, 0x6f, 0xad, 0x96, 0x88, 0x8a, 0xd3, + 0x68, 0x62, 0x6f, 0xa0, 0x3d, 0xb1, 0xa8, 0x8b, 0x7f, 0x82, 0x78, 0x5e, + 0x66, 0xb0, 0xb4, 0xc7, 0xbd, 0xb7, 0x7a, 0x47, 0x81, 0xce, 0xbe, 0x5b, + 0x83, 0x72, 0x37, 0x87, 0xaf, 0xa7, 0x92, 0xc9, 0x68, 0xd2, 0xb4, 0x4c, + 0xc3, 0x7b, 0x8f, 0xaa, 0xa7, 0xa8, 0x9f, 0x7c, 0x3b, 0x4a, 0x94, 0x96, + 0xd3, 0x3a, 0x33, 0x91, 0x4c, 0x6e, 0x8c, 0x75, 0x75, 0x67, 0xbe, 0x36, + 0xc8, 0x4a, 0x98, 0x3a, 0x7a, 0x4e, 0xa3, 0x4d, 0x39, 0x55, 0x9f, 0x52, + 0xaf, 0xbc, 0x70, 0x56, 0xb1, 0x95, 0x50, 0x3f, 0x91, 0x60, 0x9d, 0xba, + 0x87, 0x9f, 0x83, 0xcd, 0xb4, 0x80, 0x6f, 0x4e, 0x6f, 0x75, 0xb9, 0x88, + 0x69, 0x7f, 0x48, 0xd0, 0xa2, 0x5d, 0x85, 0x9c, 0x7d, 0x93, 0x5a, 0xb7, + 0x36, 0xd0, 0xd2, 0x76, 0x58, 0x3a, 0x97, 0xb8, 0x4b, 0x50, 0x5e, 0xc0, + 0x56, 0x29, 0xc1, 0x5d, 0xa3, 0x58, 0xa6, 0x75, 0x4a, 0x7a, 0x45, 0xc4, + 0x50, 0xd5, 0xcb, 0xbc, 0x7b, 0x63, 0x55, 0x51, 0xe0, 0xc9, 0x50, 0x63, + 0x73, 0x52, 0xa4, 0x88, 0x4c, 0x5e, 0x4a, 0x68, 0x78, 0x87, 0xba, 0x5d, + 0x67, 0x7d, 0x50, 0x5e, 0x70, 0xd1, 0x90, 0x8b, 0xbf, 0x71, 0x50, 0xc6, + 0x46, 0x64, 0x98, 0x42, 0x9f, 0xa2, 0x92, 0x49, 0x43, 0xcd, 0xad, 0x6c, + 0x9a, 0xa0, 0x70, 0x9f, 0xc2, 0x35, 0x2a, 0xbc, 0x6c, 0x86, 0xaa, 0xc4, + 0x63, 0x8c, 0xc6, 0xbc, 0xc4, 0xbd, 0xad, 0xa8, 0xcd, 0x96, 0x9c, 0x51, + 0x9a, 0x72, 0x45, 0xcc, 0x41, 0xb4, 0x43, 0x6e, 0xb9, 0xc0, 0x44, 0xb1, + 0x71, 0x4e, 0xc6, 0x84, 0xce, 0x58, 0xc3, 0x97, 0x8d, 0x59, 0xa8, 0xc1, + 0x61, 0x97, 0x99, 0xc4, 0x38, 0xd1, 0x50, 0x89, 0x73, 0x57, 0xcb, 0x74, + 0x8b, 0x40, 0x62, 0xcf, 0x99, 0x6a, 0x36, 0xab, 0x42, 0x94, 0x94, 0x91, + 0x80, 0x97, 0x49, 0xb6, 0x56, 0x57, 0xa6, 0x2f, 0xa2, 0x75, 0x9c, 0xa0, + 0xaa, 0x6e, 0xb4, 0xb1, 0x65, 0xd4, 0xb2, 0xc8, 0x80, 0xd7, 0x47, 0x7b, + 0x88, 0x54, 0x6e, 0xad, 0x5c, 0x4d, 0x78, 0xbd, 0xc6, 0x70, 0xad, 0x94, + 0xb8, 0x3e, 0xce, 0x88, 0x58, 0xc6, 0x42, 0xb7, 0xb7, 0xa7, 0x74, 0x69, + 0x99, 0xa7, 0x43, 0x3a, 0x35, 0x4a, 0xc0, 0x6d, 0x46, 0xbf, 0x9c, 0x99, + 0x65, 0x3a, 0x60, 0x3a, 0x63, 0x5d, 0xa6, 0x82, 0x3b, 0xb7, 0x76, 0xaf, + 0x89, 0xc2, 0x7e, 0x70, 0x61, 0x9c, 0x43, 0xcb, 0x8e, 0x78, 0xac, 0x88, + 0xaa, 0xb6, 0x6c, 0xbb, 0x83, 0x49, 0x94, 0xbd, 0xd9, 0x7b, 0x3f, 0x95, + 0x3e, 0x76, 0x97, 0x9e, 0x85, 0x3f, 0x7e, 0xad, 0x43, 0x9f, 0xc4, 0xa1, + 0x81, 0x95, 0x64, 0x32, 0x84, 0x40, 0xad, 0x66, 0xb0, 0xc2, 0xd0, 0x69, + 0xb9, 0xce, 0x62, 0x70, 0xa2, 0x4a, 0x58, 0xb4, 0xa4, 0xc5, 0x2d, 0x8b, + 0x71, 0x6b, 0xad, 0x5d, 0x2e, 0xa9, 0xba, 0xa2, 0xb7, 0x89, 0x7e, 0x4e, + 0x95, 0x95, 0x8e, 0x3f, 0x75, 0x38, 0xb1, 0x63, 0xa3, 0x5e, 0x89, 0x66, + 0xc5, 0xd1, 0x43, 0xc0, 0x42, 0xc5, 0xc6, 0xb1, 0x48, 0x50, 0x54, 0x76, + 0x53, 0x8e, 0x6c, 0x9d, 0xcc, 0x99, 0x53, 0x9c, 0x82, 0x2e, 0x4f, 0x36, + 0x9b, 0xb0, 0x8b, 0xa5, 0x83, 0xb3, 0x37, 0x87, 0x39, 0x5f, 0x89, 0x7a, + 0x38, 0x74, 0x40, 0x84, 0xbe, 0x51, 0x96, 0x8f, 0xc6, 0xa0, 0x5d, 0xd8, + 0x34, 0xb7, 0xca, 0x87, 0x91, 0xbf, 0xb3, 0x8e, 0x36, 0x97, 0x7c, 0x74, + 0x83, 0x98, 0x95, 0x90, 0x6e, 0xcc, 0xbf, 0xb1, 0x42, 0xa6, 0x5c, 0x9c, + 0x95, 0x3e, 0xb1, 0x5b, 0x99, 0xab, 0xb9, 0xd5, 0xc2, 0x45, 0xc0, 0xc8, + 0xb3, 0x79, 0xa0, 0x64, 0x6e, 0xb5, 0x2b, 0x50, 0x50, 0xa1, 0xc2, 0x84, + 0x94, 0x43, 0xac, 0x7e, 0x56, 0x99, 0x73, 0x92, 0xa2, 0x70, 0xa9, 0xc1, + 0xa3, 0x51, 0xa6, 0x9a, 0x89, 0xbc, 0x3f, 0x59, 0xa3, 0x94, 0x54, 0x90, + 0xc5, 0xa5, 0x67, 0x5c, 0xd1, 0x55, 0x98, 0x95, 0xc9, 0x6e, 0x5f, 0x76, + 0x69, 0xb6, 0x83, 0x41, 0x37, 0xa0, 0x2e, 0xb6, 0xdb, 0x6b, 0x85, 0x3f, + 0xc7, 0x6c, 0x56, 0x85, 0x43, 0xc6, 0x87, 0x8b, 0x61, 0x83, 0xbe, 0xbd, + 
0xaa, 0xb1, 0x8b, 0xcd, 0x7c, 0x5f, 0xc4, 0x9c, 0xc4, 0xa7, 0x46, 0xa3, + 0xb1, 0xb2, 0x38, 0xba, 0x44, 0x81, 0xcf, 0xe7, 0xd8, 0x71, 0x5c, 0xb1, + 0xac, 0x43, 0xaa, 0x5c, 0x63, 0xb0, 0x44, 0x3e, 0x41, 0x53, 0x83, 0x88, + 0x59, 0x40, 0xd1, 0x89, 0x4f, 0xcc, 0x75, 0x72, 0x44, 0x82, 0x44, 0x37, + 0x4f, 0x44, 0x51, 0x6c, 0xb0, 0x73, 0x46, 0x8a, 0xc3, 0x96, 0xbe, 0x4f, + 0x75, 0xbd, 0xc4, 0xba, 0x5f, 0xb5, 0x88, 0x4e, 0x31, 0x39, 0x4d, 0x81, + 0xd3, 0xc8, 0x7d, 0x44, 0x67, 0x53, 0x7a, 0xa0, 0xa6, 0x71, 0xc7, 0x62, + 0x9b, 0xb5, 0x7d, 0x60, 0xc4, 0x63, 0xa5, 0x84, 0x67, 0xac, 0x7a, 0xb3, + 0x2d, 0x84, 0x75, 0x69, 0x2e, 0xa0, 0x41, 0x49, 0x7b, 0xb0, 0x5e, 0xce, + 0x9e, 0x72, 0x8b, 0x72, 0x98, 0x9e, 0x51, 0x6d, 0xd3, 0x76, 0xbc, 0x65, + 0x7f, 0x2f, 0x91, 0xa0, 0x45, 0xc6, 0x34, 0xc9, 0xc4, 0x91, 0xab, 0x7c, + 0x9f, 0xce, 0x46, 0x70, 0x41, 0x88, 0x7b, 0x6d, 0x54, 0x83, 0xcb, 0x58, + 0x44, 0xb0, 0x64, 0xbf, 0x51, 0xbe, 0x5c, 0x73, 0x9c, 0xad, 0x88, 0x76, + 0x67, 0xd4, 0x9c, 0x80, 0x69, 0x9c, 0xc4, 0x4d, 0x92, 0x3c, 0x6f, 0x9d, + 0x6c, 0x96, 0xb6, 0x61, 0x87, 0x57, 0x86, 0x86, 0x80, 0xaf, 0x7e, 0x45, + 0xd5, 0x96, 0xaf, 0x85, 0x62, 0x8a, 0xaf, 0x59, 0x4b, 0xb0, 0xd8, 0x96, + 0x35, 0x8e, 0x9c, 0x66, 0x7f, 0x63, 0xbe, 0x55, 0x51, 0xa3, 0xc0, 0x31, + 0x99, 0x29, 0x5f, 0x55, 0x6e, 0x98, 0xa7, 0x5c, 0x78, 0xc4, 0x39, 0x91, + 0x91, 0x95, 0x88, 0x4d, 0xbd, 0x63, 0x53, 0xb6, 0xa8, 0xb8, 0x94, 0xb6, + 0x74, 0x58, 0x54, 0x44, 0xb2, 0x99, 0xd0, 0x68, 0xae, 0x6d, 0x40, 0x77, + 0x97, 0x55, 0xca, 0x79, 0xcd, 0x86, 0x64, 0x70, 0xd4, 0x62, 0x43, 0xc6, + 0xa1, 0x42, 0xd1, 0xab, 0x77, 0xa4, 0x69, 0x89, 0xcf, 0x80, 0x52, 0xa6, + 0x7b, 0xcc, 0x33, 0x4e, 0x34, 0xd0, 0xad, 0x6c, 0x91, 0xc2, 0x86, 0xb6, + 0x71, 0x94, 0x54, 0xb6, 0xbd, 0x5a, 0x86, 0xab, 0xab, 0x9b, 0xca, 0xa0, + 0x9f, 0x4a, 0xb4, 0xbd, 0x4b, 0x99, 0xbe, 0x46, 0xa3, 0x69, 0x2e, 0x72, + 0xcc, 0x6b, 0x87, 0xaf, 0x7c, 0xca, 0xcc, 0xab, 0x9b, 0x9e, 0xe1, 0xd7, + 0xc7, 0x9b, 0x9f, 0xae, 0x35, 0x7b, 0xd2, 0xc8, 0x88, 0x7e, 0x85, 0xaf, + 0xc2, 0x38, 0x68, 0xc8, 0x8f, 0x92, 0xa0, 0x88, 0x71, 0x56, 0x84, 0x49, + 0x4f, 0xc7, 0x77, 0x70, 0xa1, 0x52, 0xaa, 0xd1, 0x48, 0x3b, 0x72, 0x59, + 0xc9, 0x8b, 0x6a, 0x91, 0x42, 0x36, 0xb9, 0x9b, 0x70, 0x90, 0xa4, 0x5b, + 0x6d, 0x64, 0x55, 0x76, 0xa4, 0x36, 0x7c, 0xcd, 0xc3, 0x76, 0x50, 0x7c, + 0x2a, 0xcf, 0x3c, 0xaf, 0x8c, 0xae, 0x4c, 0xc5, 0xc7, 0xc4, 0xb6, 0x78, + 0xc8, 0x85, 0x37, 0xaa, 0x7e, 0x49, 0x87, 0xb8, 0x37, 0x68, 0xb8, 0x8d, + 0x3a, 0xaa, 0xa0, 0x83, 0x39, 0x6a, 0x8c, 0x3e, 0xc8, 0x5c, 0xa1, 0x76, + 0xb1, 0x63, 0xaf, 0x7f, 0xb3, 0xb7, 0xaf, 0x5a, 0x96, 0x45, 0x58, 0x5f, + 0xc3, 0x32, 0x8e, 0x58, 0x33, 0x4d, 0x91, 0x91, 0x95, 0x67, 0x6a, 0x96, + 0x33, 0xcc, 0xd0, 0x8b, 0xbb, 0x8e, 0x77, 0xa0, 0xc9, 0x50, 0xa6, 0x3c, + 0x50, 0xdb, 0x9d, 0x90, 0x88, 0x47, 0xc9, 0x82, 0xae, 0xd2, 0x6a, 0x8e, + 0xbc, 0xc8, 0x67, 0x6a, 0x47, 0x7c, 0x88, 0x9a, 0xd4, 0x39, 0x5f, 0x6d, + 0x4c, 0x4f, 0x4b, 0x4d, 0xa5, 0xa2, 0x58, 0x47, 0x65, 0xb2, 0xaa, 0x93, + 0x57, 0x4b, 0x98, 0x68, 0xa1, 0x6f, 0x67, 0xd7, 0x9d, 0xc4, 0x89, 0xb2, + 0xa6, 0xa8, 0x48, 0xcc, 0x43, 0xc0, 0xa6, 0x44, 0x5c, 0xb4, 0xa4, 0x6e, + 0x3a, 0x70, 0xb2, 0xb4, 0x2f, 0xc3, 0x35, 0x9b, 0x7e, 0x34, 0x79, 0xab, + 0xbc, 0x6f, 0x8b, 0x75, 0x3e, 0x55, 0xc3, 0x4f, 0x82, 0x5c, 0xc5, 0xb0, + 0xbb, 0xc6, 0x8f, 0x81, 0xaf, 0x7a, 0x87, 0x73, 0x7a, 0x98, 0xc1, 0xca, + 0x4a, 0x50, 0xba, 0x69, 0xa3, 0xb0, 0x5e, 0x6a, 0x33, 0x9d, 0x55, 0xd7, + 0x4e, 0xb9, 0x34, 0xd1, 0x50, 0xca, 0xc7, 0x90, 0x72, 0x43, 0x9e, 0x99, + 0x31, 0xa5, 0xaf, 0x7e, 0x8f, 0xc3, 0x74, 0x99, 0xd1, 0xa2, 0x95, 0x65, + 
0xcb, 0xb6, 0x3f, 0xb2, 0x50, 0x7b, 0x4a, 0x78, 0xad, 0x3c, 0x5e, 0x64, + 0x82, 0xb0, 0x3e, 0x58, 0xc8, 0xd2, 0xd0, 0x81, 0x71, 0x82, 0x4d, 0x52, + 0x41, 0xc1, 0x94, 0x7b, 0x6b, 0x68, 0xba, 0x71, 0x53, 0x44, 0x69, 0x6c, + 0x93, 0xbf, 0x77, 0x49, 0xb3, 0x3b, 0xca, 0x96, 0x53, 0x6a, 0xc6, 0x82, + 0x4f, 0x7b, 0xb4, 0x46, 0xa7, 0x38, 0xa7, 0x79, 0x91, 0x98, 0x9b, 0x5d, + 0xcc, 0x72, 0x4f, 0x81, 0x45, 0x38, 0x51, 0xa4, 0x41, 0x63, 0x81, 0x68, + 0xcd, 0x35, 0x9e, 0x42, 0xa3, 0x75, 0x88, 0xd3, 0x44, 0x64, 0x57, 0x90, + 0x54, 0x99, 0xb5, 0x9a, 0x93, 0x97, 0x6c, 0x95, 0x45, 0x9d, 0x4a, 0x48, + 0x6c, 0x6f, 0x70, 0xb6, 0x63, 0x9d, 0x46, 0xa2, 0x54, 0x57, 0x57, 0x76, + 0xd2, 0xa2, 0xa1, 0x5d, 0xb3, 0x35, 0x40, 0xca, 0x57, 0x97, 0xca, 0x68, + 0x86, 0x6d, 0x52, 0x4a, 0x41, 0x80, 0x41, 0x56, 0x43, 0xc3, 0xb6, 0x69, + 0x65, 0xd2, 0x75, 0x6a, 0x7c, 0xaf, 0x81, 0x85, 0x9e, 0x93, 0x5b, 0x4c, + 0x76, 0x4a, 0xc9, 0xcc, 0xd1, 0x7e, 0x6d, 0x98, 0x8b, 0x81, 0x4b, 0x8e, + 0xa6, 0xc0, 0x40, 0x6d, 0x7e, 0x71, 0x3d, 0x66, 0x3b, 0xaf, 0x80, 0x72, + 0xb5, 0xc4, 0x3b, 0x96, 0x38, 0x92, 0x45, 0xba, 0x4f, 0x5b, 0x68, 0x99, + 0x5e, 0x72, 0x65, 0x84, 0x6c, 0x8b, 0xad, 0x35, 0x49, 0x8e, 0x9f, 0x49, + 0xb3, 0xc0, 0x3d, 0x60, 0x71, 0xca, 0x4d, 0x99, 0x9c, 0x60, 0x3f, 0xae, + 0xa1, 0xa0, 0xb5, 0x3c, 0x4d, 0x8f, 0x77, 0x3e, 0xb4, 0x83, 0x96, 0x37, + 0x77, 0x39, 0x91, 0x3a, 0x9e, 0xab, 0x92, 0x94, 0x91, 0x31, 0x80, 0xb3, + 0xb3, 0x87, 0xb2, 0xb7, 0x70, 0xa0, 0x92, 0x66, 0x61, 0xca, 0x5e, 0x8e, + 0xcd, 0x55, 0xc2, 0x5d, 0xc9, 0x62, 0xa9, 0x3b, 0xce, 0x5b, 0x54, 0x3f, + 0x53, 0xab, 0x4a, 0x8c, 0xb8, 0x5d, 0x97, 0xa1, 0xc4, 0x7d, 0x8e, 0x81, + 0x72, 0x47, 0xa5, 0x8d, 0xa8, 0x33, 0xb1, 0x79, 0x69, 0xc9, 0x79, 0x30, + 0x81, 0x86, 0x75, 0x36, 0x76, 0xaf, 0x34, 0x37, 0xb7, 0xb6, 0x69, 0x64, + 0xbc, 0x9a, 0x7c, 0xc5, 0x9a, 0x72, 0x76, 0xad, 0x40, 0x4a, 0x8c, 0x7c, + 0x91, 0x63, 0x52, 0xa3, 0x6a, 0x2f, 0x3a, 0xb9, 0x57, 0x3b, 0x44, 0x81, + 0x67, 0xa9, 0x99, 0xb0, 0x62, 0x4c, 0xab, 0xc2, 0x3a, 0x88, 0x49, 0xb9, + 0x4b, 0xbc, 0x7a, 0x31, 0xa1, 0x6d, 0x52, 0x47, 0xca, 0x65, 0x3b, 0x8a, + 0x3e, 0x44, 0x6b, 0xd9, 0x4e, 0xbb, 0x8e, 0xb3, 0x85, 0xa5, 0xac, 0x7e, + 0x45, 0x47, 0xb8, 0x3d, 0x8d, 0x9c, 0xb1, 0xb4, 0x9d, 0xc1, 0x95, 0x5d, + 0x84, 0x56, 0xcf, 0x5e, 0x90, 0xb8, 0x33, 0xb9, 0x5c, 0x94, 0xcb, 0x64, + 0xd3, 0xbb, 0x3b, 0x49, 0xae, 0x7c, 0x4f, 0x52, 0x9c, 0x7c, 0x5c, 0x61, + 0x9b, 0x64, 0x92, 0x6a, 0x7e, 0xc7, 0x82, 0x73, 0x6d, 0x95, 0x7b, 0x38, + 0xb2, 0xa2, 0x7d, 0xc6, 0x4f, 0x6d, 0x41, 0xa9, 0xae, 0xb7, 0x93, 0x4c, + 0x45, 0x2c, 0x3e, 0xc3, 0x38, 0x39, 0x5d, 0x6a, 0x52, 0x80, 0xa6, 0xa8, + 0x7d, 0x3b, 0x5a, 0x2c, 0xa6, 0x58, 0x47, 0x92, 0x97, 0xa0, 0x3e, 0x59, + 0xc0, 0x9f, 0x6f, 0x8f, 0x8f, 0x72, 0x94, 0x74, 0x7c, 0xcf, 0x54, 0x50, + 0x45, 0x46, 0x8c, 0x6b, 0x4d, 0x8c, 0x55, 0xa7, 0x55, 0x8b, 0x33, 0x93, + 0x63, 0x8b, 0x7a, 0x3e, 0x5c, 0x77, 0xb9, 0x80, 0x73, 0x77, 0x64, 0x52, + 0x85, 0x5b, 0xa7, 0x3a, 0x85, 0x9f, 0xa8, 0x9f, 0x70, 0x35, 0x63, 0x4b, + 0x5b, 0xb9, 0x36, 0xa9, 0xb7, 0x4e, 0x60, 0x3e, 0xb9, 0x7d, 0x60, 0x3d, + 0x99, 0x40, 0x6e, 0x8b, 0x6a, 0x33, 0x7b, 0x51, 0x5d, 0x64, 0x98, 0x39, + 0x5f, 0xa5, 0x5c, 0x47, 0x90, 0x50, 0xd3, 0x53, 0x9d, 0xc7, 0xad, 0x32, + 0xba, 0xb8, 0x9f, 0x3a, 0xa9, 0xc0, 0xbc, 0xc8, 0x71, 0x4b, 0x45, 0x59, + 0x93, 0xa8, 0x7b, 0x40, 0x50, 0xa0, 0x9b, 0xc6, 0x8f, 0x59, 0x4e, 0x55, + 0x44, 0x48, 0x75, 0xc2, 0xa8, 0x3c, 0x7f, 0x81, 0x71, 0xcb, 0x64, 0xcf, + 0x85, 0x60, 0xb9, 0x37, 0xbc, 0x8c, 0x51, 0x98, 0x88, 0xa4, 0x93, 0xcd, + 0xb2, 0xbe, 0x9a, 0xc4, 0x4c, 0x72, 0x58, 0x56, 0x54, 0x73, 0x94, 0x6e, + 
0x7f, 0xc3, 0x47, 0x93, 0xa3, 0x68, 0xa6, 0x8e, 0x83, 0x49, 0x4b, 0x96, + 0x7c, 0x64, 0x76, 0x70, 0x98, 0x95, 0x94, 0x7d, 0x9d, 0xbe, 0x3a, 0x89, + 0x91, 0xb0, 0x36, 0x78, 0x53, 0xd2, 0x54, 0x83, 0xc6, 0x5d, 0xcb, 0xce, + 0x57, 0xbd, 0x34, 0xaf, 0x70, 0x6a, 0xb4, 0xb6, 0x46, 0x34, 0xc0, 0x46, + 0x9c, 0xcc, 0x90, 0xa3, 0xb5, 0x9a, 0x73, 0x9d, 0x6f, 0x53, 0xac, 0x72, + 0xad, 0x4c, 0xac, 0x30, 0x43, 0x8d, 0x53, 0xc8, 0x4e, 0xa7, 0xb1, 0x6c, + 0xcd, 0x60, 0x6a, 0x6e, 0x9e, 0x6f, 0xab, 0x9a, 0x39, 0xb3, 0x32, 0x45, + 0x63, 0x6c, 0x8e, 0x38, 0x98, 0x86, 0x70, 0x87, 0x6d, 0xa0, 0x6e, 0x7c, + 0xb1, 0x99, 0xb6, 0x7c, 0x75, 0xb3, 0x73, 0x66, 0x85, 0xae, 0x38, 0x78, + 0x39, 0x84, 0x94, 0x6c, 0xa7, 0x4e, 0xba, 0x99, 0x48, 0x96, 0xb3, 0x92, + 0x62, 0x4d, 0x94, 0x55, 0x91, 0x48, 0xa7, 0x7d, 0xc0, 0x4b, 0x3e, 0x62, + 0xae, 0x6d, 0x98, 0x85, 0x70, 0x5e, 0x55, 0xd0, 0x45, 0xa0, 0x3e, 0x6b, + 0xc9, 0x3c, 0x61, 0x6e, 0xa8, 0xc6, 0x44, 0x4f, 0x36, 0x8f, 0x3b, 0x4e, + 0xc2, 0x89, 0x44, 0x7d, 0x96, 0xd0, 0x41, 0xd0, 0xb3, 0xd1, 0x41, 0x5a, + 0x5b, 0x72, 0xc9, 0x43, 0x36, 0x9e, 0xb8, 0xad, 0x4d, 0x44, 0xb0, 0xcb, + 0x50, 0x7f, 0x3a, 0xcf, 0xb7, 0x5f, 0x8d, 0x9f, 0xc9, 0x4a, 0xc8, 0xb9, + 0x66, 0x54, 0x5c, 0x5d, 0x87, 0x9d, 0x69, 0xc2, 0xb7, 0xa4, 0x56, 0x3b, + 0x59, 0xc9, 0x69, 0x5f, 0x45, 0x45, 0x4b, 0x4f, 0xc8, 0x3d, 0x99, 0x48, + 0x31, 0x8d, 0x56, 0x90, 0x87, 0x2f, 0x94, 0x9d, 0xc4, 0x59, 0x58, 0x78, + 0xcf, 0x78, 0x58, 0x3b, 0x76, 0xa9, 0x4a, 0xcf, 0x8b, 0x61, 0x7e, 0xba, + 0xb8, 0x66, 0xc9, 0xc4, 0x40, 0xa1, 0x48, 0x74, 0xb9, 0x88, 0x6c, 0x44, + 0x72, 0xaf, 0x8b, 0x84, 0x3c, 0x81, 0xb9, 0xa3, 0x64, 0x9f, 0x80, 0x51, + 0x6d, 0x3b, 0xd1, 0x71, 0xc9, 0x6b, 0x99, 0x75, 0x6d, 0x40, 0x69, 0xbd, + 0x9e, 0x9c, 0xb7, 0xa8, 0x3e, 0x38, 0x50, 0xd3, 0x45, 0x48, 0x97, 0x39, + 0xb6, 0x62, 0x95, 0x54, 0x80, 0x9c, 0xd0, 0xae, 0xab, 0xcf, 0x5c, 0x42, + 0xa4, 0x72, 0xb9, 0x3d, 0x53, 0x4b, 0xab, 0xcb, 0x31, 0x5c, 0x47, 0xc0, + 0xc4, 0x9e, 0x61, 0xb9, 0x60, 0x71, 0xca, 0x60, 0x93, 0x7c, 0xad, 0x6f, + 0xc7, 0x65, 0x65, 0x4e, 0x7c, 0x67, 0x79, 0x8d, 0x3a, 0x72, 0x50, 0xcc, + 0x7b, 0x5e, 0x3e, 0xc5, 0xc6, 0x4a, 0x70, 0x44, 0x76, 0x91, 0x4c, 0x6b, + 0xae, 0x93, 0x39, 0x8c, 0x42, 0xcf, 0x88, 0xbc, 0x87, 0x50, 0x9c, 0xc2, + 0x72, 0xcd, 0xc5, 0x4d, 0xb3, 0xba, 0x80, 0xca, 0x54, 0x3a, 0x37, 0x74, + 0x31, 0x58, 0x6a, 0x66, 0x3a, 0xad, 0x99, 0x77, 0x94, 0x8b, 0x8f, 0x9c, + 0xb2, 0x33, 0x75, 0xa5, 0x53, 0x53, 0x95, 0xb3, 0xae, 0xc4, 0x9c, 0x4e, + 0x9e, 0x82, 0x9b, 0x8d, 0x8e, 0x62, 0x97, 0xa1, 0x7a, 0xcc, 0xbb, 0xbe, + 0xc0, 0x41, 0x79, 0x36, 0x84, 0x4e, 0x40, 0x3d, 0x3a, 0x65, 0x8c, 0x82, + 0x67, 0x67, 0x8f, 0x4a, 0x44, 0x8e, 0x93, 0x68, 0xb3, 0xc9, 0x64, 0xcc, + 0x5c, 0x39, 0xab, 0x36, 0x89, 0xc6, 0x39, 0xac, 0x6c, 0x2d, 0xaa, 0xab, + 0x5b, 0x83, 0x56, 0xbe, 0x63, 0x80, 0xbd, 0x85, 0x4e, 0xad, 0xb5, 0x87, + 0xbc, 0x73, 0x83, 0x97, 0xcc, 0x46, 0x3c, 0x34, 0x5c, 0x9a, 0xb0, 0x31, + 0xd0, 0x4f, 0x5c, 0xba, 0x35, 0x5f, 0x9e, 0xb1, 0x8d, 0x52, 0x9f, 0x97, + 0x31, 0x50, 0xa6, 0x51, 0xa2, 0xcf, 0x9c, 0xaf, 0x38, 0x99, 0x67, 0xbd, + 0xaa, 0x93, 0xb2, 0xaf, 0x37, 0x65, 0xd1, 0x78, 0xbf, 0x36, 0x34, 0xa6, + 0x77, 0xd3, 0x9f, 0x75, 0xad, 0x82, 0x88, 0xb9, 0x92, 0xba, 0x77, 0x53, + 0xc9, 0x35, 0x34, 0x97, 0x66, 0x3c, 0xa7, 0x81, 0xb9, 0xa0, 0x53, 0xcf, + 0xb6, 0x87, 0xc1, 0xab, 0x4b, 0x57, 0xb1, 0xaf, 0x64, 0xab, 0xb2, 0x46, + 0xca, 0x6b, 0xa6, 0x7d, 0xb5, 0x4a, 0x75, 0x53, 0x32, 0x9f, 0xaa, 0xc3, + 0xbd, 0xbc, 0xbf, 0x5f, 0xcf, 0x3e, 0x69, 0x78, 0xaf, 0x60, 0x83, 0x58, + 0xa4, 0xc3, 0x2d, 0xb9, 0x2d, 0x8f, 0x56, 0xb5, 0x77, 0xb1, 0xba, 0x78, + 
0x6f, 0xb1, 0x80, 0x8b, 0xd0, 0x9e, 0xcd, 0x96, 0x5f, 0x9b, 0xcc, 0x57, + 0x9f, 0x60, 0x88, 0x54, 0xae, 0x68, 0x5a, 0x51, 0x8c, 0x99, 0x72, 0x85, + 0xb8, 0x3b, 0x94, 0x32, 0x5e, 0x87, 0x7a, 0x75, 0x7a, 0x9f, 0x88, 0x31, + 0x92, 0xb8, 0x69, 0x69, 0x9b, 0x73, 0x4b, 0xaa, 0x67, 0x92, 0x69, 0x57, + 0x68, 0x74, 0x44, 0x9e, 0xb0, 0x6e, 0x46, 0x63, 0xcc, 0x71, 0xbd, 0xb1, + 0xaf, 0x54, 0x50, 0x4a, 0x7f, 0x89, 0xa8, 0xc1, 0xa3, 0x9a, 0x6e, 0x35, + 0xaa, 0x82, 0x51, 0x9f, 0x66, 0x44, 0xa9, 0x84, 0x89, 0xbc, 0x33, 0x44, + 0x9c, 0xb6, 0x68, 0xb7, 0x97, 0x7a, 0xb1, 0xc3, 0xa8, 0x4b, 0x9b, 0xb5, + 0x3a, 0x9b, 0x62, 0xc9, 0x42, 0x50, 0x85, 0xc8, 0x78, 0xc1, 0xb5, 0xcc, + 0x6e, 0x64, 0xc9, 0x57, 0x91, 0x61, 0x63, 0xa9, 0xca, 0xd0, 0xa4, 0x45, + 0x73, 0xca, 0x89, 0x5d, 0x87, 0x94, 0x89, 0xa0, 0xd4, 0xca, 0xb2, 0xb2, + 0x46, 0x64, 0x65, 0x3d, 0xd3, 0x68, 0xae, 0x46, 0x6c, 0x81, 0x67, 0xc0, + 0xc9, 0x92, 0xaa, 0x6b, 0xcf, 0xc5, 0xc8, 0x89, 0x66, 0x92, 0x4e, 0xca, + 0xc3, 0x97, 0x7a, 0x54, 0x65, 0x3b, 0x48, 0x99, 0x5f, 0xb9, 0xb8, 0x39, + 0xa9, 0xc0, 0x4f, 0x36, 0x67, 0xa8, 0x4f, 0xab, 0x53, 0x4c, 0x74, 0xb6, + 0x93, 0xdb, 0xbc, 0xac, 0x94, 0x63, 0x3f, 0x31, 0x81, 0x48, 0x92, 0x88, + 0xcb, 0x8f, 0xbe, 0x75, 0xbd, 0x7f, 0x77, 0x2d, 0x56, 0x75, 0x50, 0xa3, + 0xad, 0x8f, 0x8b, 0xca, 0x44, 0x7c, 0x67, 0x7e, 0xb8, 0x9f, 0x6a, 0xb5, + 0xc4, 0x71, 0x49, 0x9b, 0x8b, 0x6e, 0x77, 0xc1, 0x66, 0x7a, 0x7a, 0xae, + 0x7c, 0x9c, 0xc2, 0x4d, 0xa3, 0x4a, 0x83, 0x8e, 0x73, 0xac, 0x43, 0xbc, + 0x68, 0xce, 0x46, 0xa0, 0x74, 0xba, 0xa9, 0xae, 0x36, 0xbb, 0xbb, 0xac, + 0x65, 0x58, 0x5b, 0xb2, 0x6a, 0x78, 0x9b, 0x93, 0xa1, 0xaa, 0x74, 0x8d, + 0x81, 0x2c, 0x41, 0x87, 0x89, 0x75, 0x9d, 0x48, 0x3e, 0x9e, 0x7d, 0x6d, + 0xa6, 0xde, 0x9e, 0x73, 0x7c, 0xcb, 0x4f, 0x68, 0x49, 0xaf, 0x37, 0x96, + 0x9d, 0x34, 0x7f, 0xb3, 0x58, 0xc0, 0x53, 0x3d, 0xa1, 0xc7, 0x4c, 0xa2, + 0xb1, 0xa8, 0xc0, 0x8d, 0x4e, 0xc6, 0xa2, 0xd0, 0xad, 0x43, 0x58, 0x79, + 0x3c, 0x53, 0x6a, 0x76, 0x53, 0x65, 0x76, 0x6b, 0x47, 0xcb, 0xa2, 0xcc, + 0xc6, 0xc7, 0x59, 0x6b, 0x97, 0xa9, 0xcb, 0x34, 0x44, 0xae, 0x45, 0x5e, + 0x3b, 0x8e, 0x61, 0xc7, 0x71, 0xad, 0x77, 0xb5, 0x8a, 0xbd, 0x49, 0x78, + 0x65, 0x3c, 0x57, 0x3a, 0x7a, 0x9a, 0x96, 0x7d, 0x64, 0xb1, 0x7d, 0xb3, + 0xbf, 0x3a, 0x6d, 0x66, 0x46, 0x81, 0x69, 0x8b, 0x6e, 0x51, 0xbc, 0x55, + 0xd6, 0x7f, 0xb7, 0xc6, 0x54, 0xdd, 0xc4, 0xa4, 0x51, 0x75, 0x8b, 0x93, + 0xa9, 0xad, 0x66, 0x7f, 0x7d, 0x9e, 0x60, 0x4d, 0x4d, 0x55, 0x7c, 0xb6, + 0x41, 0x71, 0x64, 0xcb, 0xc9, 0x75, 0x32, 0x8d, 0x8b, 0x69, 0x5f, 0x61, + 0x5c, 0x5d, 0x7f, 0x63, 0x4f, 0xc9, 0xad, 0x45, 0xc2, 0x5e, 0xcb, 0x38, + 0x57, 0xd0, 0x6c, 0x47, 0x37, 0xc4, 0x53, 0x6e, 0xdd, 0x80, 0x68, 0x90, + 0xab, 0xb3, 0x41, 0xa9, 0xcd, 0x7e, 0xb9, 0xd6, 0x7e, 0xbf, 0xa5, 0x61, + 0x86, 0xba, 0xa8, 0xd4, 0x56, 0x67, 0x8d, 0x45, 0xa1, 0x4c, 0x48, 0x36, + 0x80, 0xaf, 0xaa, 0x68, 0x59, 0x3e, 0x36, 0x46, 0x9b, 0x87, 0xc5, 0x37, + 0x69, 0x3a, 0x43, 0x2d, 0xa5, 0x67, 0xda, 0xa2, 0x79, 0x85, 0xb7, 0x9e, + 0x72, 0x65, 0x6a, 0xa5, 0x85, 0x62, 0x70, 0xa0, 0x71, 0x70, 0xab, 0x58, + 0xd5, 0x84, 0x4e, 0x62, 0xad, 0x68, 0x31, 0x7e, 0x60, 0x6c, 0x72, 0x9e, + 0x45, 0x75, 0xcb, 0x8a, 0x6c, 0x36, 0xad, 0x99, 0x7c, 0x44, 0x3e, 0x6f, + 0x36, 0xc5, 0xd4, 0x76, 0x34, 0xc7, 0xd0, 0x6e, 0x31, 0x4d, 0xbe, 0x9e, + 0x85, 0x3b, 0x73, 0xbb, 0xaa, 0x46, 0xc9, 0x51, 0xd0, 0xc1, 0xca, 0xd3, + 0x5c, 0x46, 0xba, 0xc9, 0xc5, 0x63, 0x7c, 0x64, 0xa6, 0x58, 0xc0, 0x43, + 0x40, 0x61, 0xba, 0x3a, 0x72, 0x99, 0xd3, 0x75, 0x6e, 0x8c, 0x72, 0x8e, + 0x87, 0xb0, 0x33, 0x40, 0x50, 0xa0, 0xa8, 0x4f, 0x94, 0x68, 0x34, 0x3d, + 
0xcd, 0xb5, 0x58, 0x9f, 0xb0, 0x68, 0xcf, 0xa0, 0x9d, 0xba, 0x5b, 0x82, + 0xc1, 0x34, 0x8c, 0xa3, 0x6f, 0x68, 0xb7, 0x8b, 0x5f, 0x87, 0x9d, 0x79, + 0xa4, 0x42, 0x42, 0x4d, 0xd9, 0x3f, 0x70, 0x8e, 0x42, 0x64, 0xa5, 0x71, + 0x5f, 0x4a, 0x8d, 0x7e, 0x52, 0xa5, 0xc0, 0xc1, 0x94, 0x57, 0xc9, 0x55, + 0x6a, 0x94, 0x86, 0xa6, 0xd0, 0x44, 0x5c, 0x72, 0x90, 0x3e, 0x45, 0x8c, + 0x8b, 0xc5, 0xa5, 0xa9, 0x93, 0x9b, 0x4c, 0x5e, 0xbf, 0x66, 0x62, 0x76, + 0x95, 0x61, 0x46, 0x5b, 0x91, 0xae, 0x5c, 0xa8, 0x42, 0x60, 0xcf, 0x7d, + 0xca, 0x5c, 0xc0, 0x3b, 0x9c, 0x6f, 0xa2, 0x51, 0x55, 0x57, 0x8a, 0x4e, + 0xd0, 0x57, 0x4d, 0xa0, 0xb5, 0x94, 0x32, 0xdd, 0x98, 0x60, 0x5f, 0x61, + 0x70, 0x3a, 0xcd, 0x37, 0x92, 0x36, 0x4f, 0x39, 0x66, 0xbf, 0xa7, 0xab, + 0x65, 0xb1, 0xad, 0xa9, 0xb8, 0xaa, 0x8b, 0x32, 0xca, 0x92, 0x6a, 0x46, + 0xca, 0xaf, 0x9a, 0x79, 0x85, 0x8f, 0x6d, 0x5d, 0x7f, 0x29, 0x5e, 0xae, + 0xbb, 0xae, 0x61, 0x94, 0x37, 0x89, 0x87, 0xce, 0x45, 0x65, 0x77, 0x6f, + 0x43, 0x2d, 0x70, 0x4d, 0x3f, 0x5a, 0xb1, 0x7f, 0xab, 0x76, 0x8b, 0xb7, + 0x4c, 0xb4, 0xcf, 0x74, 0xd0, 0x45, 0x9a, 0x3f, 0x8f, 0x41, 0x56, 0x6d, + 0x4d, 0x2d, 0x86, 0xb6, 0x99, 0x41, 0xb8, 0x42, 0x55, 0xa6, 0xa0, 0xaf, + 0x8a, 0xaa, 0x6f, 0x9f, 0xce, 0x2f, 0x69, 0x69, 0x8d, 0x2e, 0x82, 0x86, + 0xd1, 0x41, 0xc9, 0x71, 0x36, 0x43, 0x9a, 0xc6, 0x3e, 0x82, 0xaf, 0x61, + 0x5b, 0x5d, 0xac, 0x6f, 0xad, 0xd9, 0xd2, 0x95, 0x69, 0x47, 0x70, 0x9c, + 0x76, 0xbe, 0x80, 0x7b, 0xbe, 0x98, 0x9c, 0x66, 0xba, 0x9f, 0xaf, 0xb9, + 0x60, 0x68, 0x98, 0x9d, 0x9f, 0x62, 0x68, 0x50, 0xa4, 0xc5, 0x2f, 0xce, + 0x82, 0x6c, 0x4a, 0xcd, 0x8d, 0xc7, 0xa6, 0x96, 0x9b, 0x8b, 0xbf, 0x4b, + 0x48, 0x65, 0x91, 0x3d, 0x98, 0x81, 0x9f, 0x77, 0xc6, 0xbc, 0xac, 0x9e, + 0x90, 0x61, 0x46, 0x5f, 0x5a, 0x91, 0x53, 0x36, 0x81, 0xb0, 0x9f, 0x73, + 0x6a, 0x45, 0xd0, 0x7c, 0xbf, 0xb8, 0x8f, 0x73, 0x8a, 0xb4, 0x9a, 0xcd, + 0x96, 0x79, 0xa5, 0xb8, 0x85, 0x9c, 0xcc, 0x2d, 0x48, 0x85, 0x87, 0xc5, + 0x78, 0x44, 0x87, 0xc5, 0xd0, 0x96, 0x4e, 0x6e, 0x9a, 0xbc, 0x3a, 0x4a, + 0x53, 0xb8, 0xd1, 0x63, 0x30, 0x3e, 0x97, 0x91, 0x6b, 0x2e, 0x93, 0x2d, + 0xc7, 0x6c, 0xa3, 0x5f, 0x57, 0x7d, 0x5b, 0xc0, 0x89, 0x9d, 0xae, 0x67, + 0x7a, 0xb0, 0x3a, 0xce, 0x3b, 0xa3, 0x8d, 0x87, 0x56, 0x87, 0x85, 0x75, + 0x73, 0x79, 0x4e, 0x3e, 0x64, 0xcd, 0x7d, 0x68, 0x3d, 0xcf, 0xa8, 0x71, + 0xb5, 0xcb, 0xac, 0x6d, 0x87, 0xb8, 0x3a, 0x51, 0x66, 0x98, 0xb4, 0x63, + 0xcd, 0x6c, 0x5e, 0x67, 0x68, 0xa4, 0x62, 0x77, 0x5c, 0x7a, 0x4e, 0x87, + 0xb8, 0x48, 0x6d, 0xb9, 0x3b, 0x54, 0x5e, 0x2c, 0x35, 0x89, 0xaa, 0x5c, + 0x8c, 0x52, 0x49, 0xd2, 0x5f, 0x4e, 0xd9, 0x77, 0x71, 0x6a, 0x80, 0xac, + 0x55, 0xd2, 0x74, 0x5e, 0x52, 0xb0, 0x4a, 0x86, 0x51, 0x42, 0x51, 0xc5, + 0x70, 0x7e, 0x70, 0xbf, 0xcd, 0x83, 0xb1, 0xc9, 0xce, 0xb2, 0x58, 0x80, + 0x8c, 0x30, 0x8e, 0xa3, 0xcd, 0x94, 0x40, 0x82, 0x40, 0x2d, 0x59, 0xaa, + 0x3d, 0xb6, 0x5b, 0x74, 0x43, 0x68, 0x64, 0x76, 0x77, 0xb9, 0xb2, 0x9f, + 0x55, 0x4b, 0xbf, 0x58, 0x3d, 0xbb, 0xcb, 0xc2, 0x43, 0xc5, 0x39, 0x75, + 0x9a, 0x41, 0x74, 0x62, 0xa5, 0x72, 0x86, 0x4d, 0x50, 0xa2, 0x73, 0x46, + 0xb3, 0x35, 0xc3, 0x98, 0x84, 0x5d, 0x2b, 0x4c, 0x38, 0x82, 0x68, 0x7c, + 0x7f, 0x95, 0x94, 0x67, 0x68, 0x84, 0x3e, 0x75, 0x83, 0xa6, 0xbe, 0xb9, + 0xa7, 0x4f, 0xa4, 0x3a, 0xad, 0x4c, 0xb6, 0xb8, 0x87, 0xb2, 0x3b, 0xb0, + 0x2f, 0xb9, 0x72, 0x88, 0x73, 0xa9, 0x59, 0x65, 0x3f, 0xc2, 0x4b, 0x62, + 0x98, 0x75, 0x64, 0x34, 0x64, 0x54, 0xa0, 0x7c, 0xb6, 0x87, 0xb9, 0xad, + 0x93, 0x4a, 0x69, 0x6d, 0xd4, 0xcb, 0x97, 0x66, 0x49, 0x6d, 0x4e, 0x4f, + 0x3c, 0x51, 0x40, 0x89, 0xd4, 0x80, 0x58, 0x40, 0x5b, 0xbc, 0x7c, 0x81, + 
0x3a, 0x70, 0x5c, 0x78, 0x6b, 0x9d, 0xae, 0xbd, 0x2e, 0x43, 0x92, 0x9b, + 0x48, 0x53, 0xb5, 0xc7, 0x44, 0x95, 0x3e, 0xb4, 0x73, 0xbc, 0x37, 0x3c, + 0xba, 0xa3, 0xa3, 0x7d, 0x65, 0x7a, 0x4f, 0x9e, 0xa4, 0x95, 0x39, 0x3f, + 0xaf, 0x91, 0xb2, 0x7a, 0x4f, 0x46, 0x62, 0xd1, 0x74, 0x87, 0xba, 0x59, + 0x9c, 0xc5, 0x70, 0xa4, 0x3d, 0x7e, 0x61, 0x3b, 0xa8, 0xb5, 0x9d, 0xb7, + 0x48, 0x90, 0x3e, 0x69, 0xb6, 0x61, 0x43, 0xbf, 0x8d, 0xa1, 0x8e, 0x77, + 0x87, 0xaf, 0xc6, 0xcc, 0xb7, 0x70, 0x6b, 0x5e, 0x6c, 0xde, 0x46, 0x77, + 0x52, 0x90, 0x58, 0xc5, 0x91, 0x8f, 0x7a, 0xac, 0xbc, 0xd6, 0x93, 0xa2, + 0xa4, 0xaf, 0xb8, 0xc8, 0xc2, 0xae, 0x8d, 0x55, 0x8e, 0x8b, 0x9a, 0x4a, + 0xbf, 0xbb, 0xc6, 0x51, 0x40, 0x7a, 0xcb, 0x9d, 0x91, 0xc5, 0x31, 0x56, + 0x34, 0x64, 0x9c, 0x7e, 0xab, 0xb7, 0x55, 0x70, 0xb3, 0x48, 0xb5, 0x39, + 0x2f, 0x32, 0xc2, 0xa2, 0xc3, 0xc9, 0x9e, 0xaf, 0x48, 0xc6, 0x4a, 0x8d, + 0x83, 0x9e, 0x9d, 0x37, 0xbc, 0xcf, 0xbe, 0x5f, 0x67, 0x82, 0x4e, 0x8a, + 0x47, 0x98, 0x9d, 0x76, 0xb9, 0x7b, 0x63, 0xad, 0xaa, 0x96, 0x90, 0x77, + 0x61, 0x66, 0x85, 0x82, 0x4a, 0xa4, 0x6a, 0x86, 0xb5, 0x9e, 0x3f, 0xd4, + 0x96, 0xa9, 0x77, 0x70, 0x76, 0xc0, 0xb5, 0xba, 0xd0, 0xc8, 0x49, 0xa6, + 0x6e, 0x5e, 0xd0, 0x7e, 0xc8, 0xbd, 0x7b, 0x68, 0xae, 0x70, 0x49, 0x82, + 0xb1, 0x67, 0xdf, 0x8d, 0x76, 0x2b, 0x45, 0x84, 0x7c, 0x69, 0x82, 0x6d, + 0xc4, 0xb9, 0xa7, 0xce, 0x8a, 0x2e, 0xa6, 0xb5, 0x9b, 0x7c, 0x68, 0x39, + 0x93, 0xd0, 0x38, 0x76, 0xcf, 0x43, 0x53, 0x77, 0xa3, 0xa1, 0x9b, 0x37, + 0xaf, 0x58, 0x4a, 0x31, 0x64, 0xa0, 0xab, 0x8a, 0x45, 0xb8, 0xb6, 0xad, + 0x41, 0xd5, 0xa9, 0xb5, 0x3c, 0x72, 0x2f, 0x8a, 0xc6, 0x6e, 0x7b, 0x76, + 0x7b, 0xab, 0xcf, 0x76, 0x64, 0x92, 0x98, 0xad, 0x9e, 0x87, 0xbd, 0x79, + 0x4f, 0xb8, 0xcb, 0xb9, 0x7c, 0x64, 0xa0, 0xb7, 0xc1, 0x67, 0x76, 0x51, + 0x32, 0xbf, 0x84, 0x75, 0xa7, 0x89, 0xa1, 0xd5, 0xba, 0x75, 0x4a, 0xa3, + 0x69, 0x53, 0xad, 0xa4, 0x89, 0x64, 0x4a, 0xaa, 0x6d, 0x66, 0x69, 0xb4, + 0x44, 0x34, 0x58, 0x7c, 0x5f, 0x73, 0xcf, 0x42, 0x4f, 0x93, 0xb7, 0x5b, + 0x58, 0x9c, 0x8a, 0x2f, 0xaf, 0xb9, 0x8b, 0x94, 0x85, 0x84, 0xad, 0x47, + 0x93, 0xbb, 0x64, 0x91, 0x60, 0x7c, 0xc9, 0x43, 0x43, 0x47, 0x30, 0x9a, + 0x7b, 0x64, 0xb9, 0xbf, 0x54, 0x97, 0x75, 0x95, 0x7c, 0x72, 0x36, 0x58, + 0xa8, 0x7b, 0x66, 0xc4, 0x9d, 0x6a, 0x72, 0x3b, 0x9d, 0x59, 0x40, 0xbe, + 0xb5, 0x46, 0x37, 0x77, 0x6c, 0x52, 0x39, 0x9b, 0x7e, 0x6c, 0x5e, 0x91, + 0x39, 0x59, 0x87, 0x4a, 0x7a, 0x57, 0xc2, 0x58, 0x45, 0x74, 0x2f, 0x45, + 0xa8, 0x36, 0xaa, 0xba, 0x62, 0x3f, 0xae, 0x9f, 0xb8, 0x66, 0xa0, 0xa4, + 0x2d, 0x95, 0x6f, 0x46, 0x57, 0x48, 0xd4, 0x85, 0x73, 0x32, 0x5a, 0x2d, + 0xb4, 0x74, 0x8e, 0x70, 0x67, 0xb9, 0xd7, 0xa4, 0x60, 0x66, 0x3e, 0x6c, + 0xb8, 0x8a, 0x8b, 0x99, 0x8b, 0x45, 0x63, 0x4b, 0x4a, 0xb3, 0x8b, 0xd4, + 0x78, 0xb7, 0x8f, 0x72, 0x95, 0x8d, 0x92, 0x92, 0xb7, 0x34, 0xa4, 0x3d, + 0x48, 0x88, 0x40, 0x87, 0x3f, 0x86, 0xb5, 0x60, 0x60, 0x3f, 0x9d, 0x77, + 0x3d, 0x4f, 0x97, 0xb4, 0x8d, 0x60, 0x76, 0x55, 0xd5, 0x81, 0x9c, 0xa3, + 0xd8, 0x59, 0x9c, 0xc6, 0xd0, 0x54, 0x45, 0x94, 0xbb, 0x66, 0xc9, 0x9a, + 0xa7, 0x47, 0x3a, 0x35, 0x8e, 0xa4, 0xc3, 0x5d, 0xb7, 0x80, 0x73, 0xbe, + 0x71, 0x92, 0x69, 0x43, 0x5f, 0x41, 0x80, 0x87, 0x2b, 0x37, 0x53, 0x35, + 0x4a, 0xa5, 0x71, 0x25, 0x7b, 0xa4, 0x85, 0x3f, 0xa1, 0x6f, 0x7b, 0x30, + 0xbc, 0x83, 0x80, 0xbd, 0xc1, 0xa4, 0x51, 0x7b, 0x9e, 0x85, 0x8a, 0xc8, + 0xd6, 0xd9, 0x6b, 0x67, 0x94, 0xd9, 0x9a, 0x3e, 0xcb, 0x75, 0x60, 0x75, + 0x70, 0x43, 0x94, 0x3e, 0xcb, 0x84, 0xd0, 0x92, 0x54, 0x8b, 0x92, 0x98, + 0x57, 0x3c, 0x5b, 0x6f, 0x59, 0x74, 0x48, 0xa2, 0x46, 0x82, 0xa9, 0x53, + 
0x9a, 0x45, 0x56, 0x51, 0x8c, 0xa5, 0x96, 0x97, 0x8a, 0x60, 0xc2, 0x62, + 0x54, 0x73, 0xb2, 0x9e, 0xba, 0x47, 0x96, 0xa1, 0x30, 0x84, 0xd0, 0xbd, + 0xb0, 0x74, 0xc7, 0xce, 0xa0, 0xb9, 0x42, 0x43, 0x89, 0xa7, 0x83, 0xa9, + 0x4f, 0x91, 0xac, 0x65, 0xcf, 0xb6, 0x32, 0xc4, 0x52, 0x66, 0x53, 0xc2, + 0x80, 0x63, 0x71, 0x99, 0x83, 0x55, 0x3f, 0xb4, 0x4c, 0x51, 0x54, 0x46, + 0x43, 0xcc, 0x72, 0xa3, 0x85, 0x4b, 0x35, 0x7b, 0xa0, 0xcd, 0x8f, 0xa5, + 0x5c, 0x42, 0x97, 0x5f, 0x8d, 0x76, 0xcb, 0xc2, 0xb1, 0x80, 0x72, 0xb4, + 0x81, 0xd2, 0x4c, 0xbf, 0x86, 0x6a, 0x4f, 0x88, 0x4e, 0x82, 0x35, 0xa7, + 0xc7, 0x50, 0x47, 0x53, 0x52, 0x86, 0x89, 0xa9, 0x67, 0xbf, 0xd8, 0x93, + 0x6d, 0xaf, 0x9a, 0x3c, 0x9c, 0x91, 0xc3, 0x59, 0x51, 0x61, 0xd7, 0x94, + 0xb9, 0x81, 0x52, 0x37, 0x72, 0x29, 0x87, 0x37, 0xbb, 0x77, 0x73, 0xaf, + 0x72, 0x9e, 0xd7, 0xc5, 0xcf, 0x79, 0x49, 0xb9, 0x40, 0x84, 0x88, 0x4a, + 0x6d, 0xb6, 0x55, 0xcc, 0x36, 0x3c, 0x85, 0xcd, 0x48, 0xbb, 0x50, 0x48, + 0xbe, 0x4a, 0x42, 0x8f, 0x5b, 0xcb, 0x78, 0x4b, 0x81, 0xa8, 0x57, 0xb4, + 0x48, 0x57, 0xb2, 0x2e, 0x45, 0x41, 0xb4, 0xc9, 0x9f, 0x8b, 0x51, 0x44, + 0x65, 0x59, 0x89, 0x96, 0x68, 0x49, 0xbb, 0x6c, 0x83, 0x52, 0x42, 0x39, + 0xd5, 0x79, 0x6b, 0xb0, 0xaf, 0x89, 0x86, 0xac, 0xbe, 0xa4, 0x82, 0x6a, + 0x88, 0xbe, 0x84, 0xc2, 0x38, 0x95, 0x9b, 0x66, 0x72, 0x8f, 0xaa, 0x49, + 0x6b, 0x7a, 0x80, 0x52, 0x3f, 0x68, 0x6e, 0xa6, 0x8a, 0x6f, 0x43, 0xb5, + 0xcd, 0x9c, 0x86, 0x63, 0x46, 0x86, 0xc9, 0x71, 0xb1, 0xba, 0xb3, 0x47, + 0x64, 0x55, 0x6a, 0xdc, 0xc0, 0xb4, 0x54, 0x74, 0x80, 0xbb, 0x87, 0x77, + 0xce, 0x83, 0x78, 0xc3, 0xa7, 0x44, 0xcf, 0xc2, 0x7b, 0xc6, 0x55, 0x4b, + 0xc4, 0x7f, 0x6b, 0xb7, 0x55, 0x78, 0x94, 0x5c, 0xd0, 0xcd, 0xc8, 0x4b, + 0xc9, 0x6a, 0x6c, 0x7e, 0x88, 0x52, 0x69, 0x4d, 0x46, 0x87, 0xb8, 0xc4, + 0x8e, 0xbe, 0xa4, 0x76, 0x39, 0x49, 0x50, 0x66, 0x85, 0x53, 0x48, 0x7c, + 0x8b, 0xbf, 0x39, 0x7c, 0x90, 0x6a, 0x39, 0x73, 0xbe, 0xbd, 0xb2, 0xd2, + 0x3b, 0x9e, 0x94, 0x92, 0xac, 0x68, 0x4c, 0x40, 0x4c, 0x8e, 0xae, 0xe4, + 0x70, 0x47, 0x6d, 0xa9, 0xca, 0xcf, 0xb1, 0x45, 0x51, 0x69, 0x84, 0x55, + 0x87, 0x77, 0xac, 0x5e, 0xdd, 0x38, 0xa3, 0x7f, 0x6b, 0xc9, 0x2a, 0x66, + 0x44, 0xd0, 0x96, 0x4f, 0xb6, 0xad, 0xbf, 0x56, 0x41, 0x4d, 0x73, 0x57, + 0xd9, 0x7f, 0x49, 0x40, 0x30, 0x36, 0x92, 0xb0, 0x34, 0x86, 0x82, 0xad, + 0x4d, 0x4f, 0x83, 0xcf, 0x9c, 0x6e, 0xa9, 0x67, 0xa8, 0x71, 0x7f, 0x5a, + 0x48, 0xb2, 0xa6, 0xb6, 0x93, 0xa0, 0xca, 0x33, 0x93, 0x74, 0x6e, 0xc6, + 0xa0, 0x68, 0xa1, 0xb3, 0x94, 0x63, 0x48, 0x34, 0x7a, 0x8d, 0x8f, 0x47, + 0x9b, 0x57, 0x64, 0xc2, 0x56, 0x7b, 0xaa, 0x58, 0xb0, 0x46, 0xad, 0x44, + 0xa9, 0x86, 0x50, 0xbd, 0x9c, 0x82, 0xab, 0x38, 0x94, 0xa7, 0xcf, 0xaa, + 0x7d, 0x83, 0x71, 0xc3, 0x95, 0x58, 0xb3, 0x8e, 0x63, 0x5b, 0x95, 0xa8, + 0x45, 0xbd, 0x2f, 0xaa, 0x88, 0x4b, 0x2f, 0x8a, 0x6d, 0xb9, 0x34, 0x8c, + 0xa3, 0xbc, 0x82, 0x39, 0x6d, 0x7d, 0x6c, 0x94, 0x97, 0xd0, 0xa3, 0xa9, + 0x89, 0x47, 0x5d, 0x9a, 0xd2, 0xc4, 0x4b, 0xa9, 0xc6, 0x79, 0xcb, 0x97, + 0x32, 0x69, 0x85, 0x56, 0xc9, 0x36, 0xa1, 0x75, 0x72, 0x8b, 0xa5, 0xcd, + 0x5d, 0x80, 0x7d, 0x3d, 0xbe, 0xcf, 0xc9, 0x53, 0x55, 0x36, 0xb0, 0x3e, + 0x8b, 0x88, 0xa8, 0xd9, 0x53, 0x5e, 0xc7, 0xdb, 0x5c, 0xa9, 0x89, 0xa5, + 0x58, 0x78, 0xa4, 0x39, 0xb2, 0x90, 0x55, 0xc6, 0xc9, 0xce, 0xb2, 0xa9, + 0x98, 0x7b, 0x65, 0x34, 0xb6, 0xb5, 0x2d, 0x7e, 0xcd, 0xe0, 0x6f, 0x44, + 0x87, 0xbe, 0x95, 0x52, 0x90, 0xcb, 0x87, 0x8d, 0x92, 0xc3, 0x9a, 0x6d, + 0x73, 0xae, 0x49, 0x49, 0x8b, 0x6a, 0x85, 0x97, 0x8c, 0x6e, 0x61, 0x32, + 0x4d, 0x92, 0x82, 0xbb, 0x9a, 0xaa, 0x63, 0xc2, 0xba, 0x62, 0x8e, 0xa1, + 
0xc6, 0xb8, 0x7f, 0x47, 0xb1, 0xa2, 0xcc, 0xc4, 0x4d, 0x52, 0x9d, 0x5f, + 0x56, 0xc0, 0x91, 0x47, 0x85, 0x94, 0xa0, 0x63, 0xd2, 0xbd, 0x32, 0x67, + 0x43, 0xac, 0x8d, 0xc9, 0xad, 0x97, 0x65, 0x97, 0xbe, 0x86, 0x93, 0xc2, + 0x70, 0xc8, 0x42, 0xad, 0x4a, 0x74, 0x64, 0x6c, 0x9e, 0xac, 0x58, 0x82, + 0xa3, 0x3d, 0x4b, 0xab, 0xab, 0x5d, 0xc3, 0x3c, 0x88, 0x79, 0x7f, 0x42, + 0x62, 0x52, 0x9e, 0x86, 0x7b, 0x52, 0xc3, 0x64, 0x7a, 0x64, 0x73, 0xc5, + 0x62, 0x8e, 0x82, 0x95, 0x58, 0x42, 0x7d, 0xd1, 0x89, 0xc0, 0xba, 0x7b, + 0xc4, 0xc1, 0x79, 0x62, 0xa0, 0x36, 0x43, 0xd4, 0xb9, 0x6b, 0x6b, 0x6f, + 0x78, 0x6c, 0x69, 0x63, 0x7f, 0xd3, 0x3b, 0xbe, 0xcd, 0x48, 0x48, 0xba, + 0x86, 0x93, 0x7b, 0x9a, 0x6c, 0x3d, 0x48, 0x76, 0x92, 0x81, 0x34, 0xc4, + 0x6b, 0x9f, 0xc0, 0x4e, 0xd8, 0xbd, 0xce, 0x83, 0xb4, 0x94, 0xb4, 0x95, + 0xb2, 0x95, 0x7d, 0xbd, 0x53, 0x31, 0x45, 0xb5, 0x58, 0x9f, 0x7f, 0x82, + 0xab, 0xc9, 0x6e, 0xc6, 0xa6, 0x99, 0xa3, 0x6a, 0x87, 0x4b, 0x6f, 0x44, + 0x87, 0x81, 0xb7, 0xa6, 0xce, 0xc4, 0x4e, 0x79, 0x5a, 0xa6, 0x56, 0x83, + 0xb5, 0x68, 0x9d, 0x71, 0x50, 0xc5, 0x82, 0x9b, 0xd0, 0x5c, 0xab, 0x45, + 0x4b, 0x7d, 0x5c, 0xbe, 0xc6, 0x68, 0x71, 0x84, 0xa2, 0x94, 0x9a, 0x3a, + 0x7a, 0x6a, 0x6c, 0x52, 0x3c, 0x3c, 0xa8, 0x82, 0x84, 0x67, 0x84, 0x32, + 0x8c, 0xd9, 0xa8, 0x43, 0xae, 0x99, 0x3b, 0x51, 0xba, 0xa1, 0x37, 0x86, + 0xb0, 0xa6, 0x5c, 0x94, 0x5f, 0x36, 0x50, 0x5f, 0x69, 0x67, 0x5e, 0xca, + 0x3b, 0x41, 0x69, 0x67, 0x41, 0xaa, 0xc4, 0x81, 0x74, 0x6b, 0x93, 0x5b, + 0xae, 0x6a, 0xc0, 0x79, 0x65, 0xa6, 0x66, 0x32, 0x73, 0xbf, 0xb9, 0xbc, + 0x8b, 0x63, 0x58, 0xc2, 0xce, 0x80, 0x53, 0x76, 0x7c, 0x69, 0x7a, 0xd3, + 0x6a, 0x71, 0x9e, 0x3d, 0x60, 0xbe, 0x9e, 0x97, 0x37, 0x9b, 0x88, 0x3c, + 0x61, 0x7a, 0xc9, 0x49, 0xc3, 0xcf, 0xae, 0x9d, 0x6f, 0x81, 0x60, 0xd9, + 0xaa, 0xbc, 0x48, 0x38, 0x3c, 0x31, 0x73, 0xc6, 0xc4, 0xa4, 0xb2, 0x49, + 0xbd, 0x7d, 0x83, 0x8e, 0xbe, 0x85, 0x6d, 0x7a, 0xc5, 0x3d, 0xa0, 0x7b, + 0x9a, 0xa8, 0x4a, 0xc4, 0x39, 0x90, 0x58, 0x50, 0xc8, 0x3b, 0xbe, 0x3a, + 0x6a, 0x7f, 0x5e, 0xc6, 0x58, 0xb8, 0x4d, 0x92, 0xc2, 0xcf, 0xc1, 0xd2, + 0xd4, 0x45, 0xc1, 0xcc, 0x5f, 0xd1, 0xe6, 0x3e, 0x41, 0x79, 0x7f, 0xbf, + 0x52, 0xaf, 0xcd, 0xc0, 0x7e, 0x89, 0x74, 0x88, 0xac, 0x44, 0x67, 0xa1, + 0x63, 0xc1, 0x79, 0xc7, 0x46, 0xd1, 0x54, 0x32, 0xb0, 0x84, 0x7d, 0xa7, + 0x55, 0x4f, 0xa7, 0x9a, 0xc0, 0x91, 0x87, 0x7d, 0x38, 0xc0, 0x55, 0x5e, + 0x9a, 0x74, 0x92, 0x85, 0xc5, 0x63, 0x67, 0x3f, 0xb6, 0xb0, 0x44, 0x45, + 0xaa, 0xc3, 0x8d, 0x46, 0x41, 0x85, 0x79, 0x6e, 0x90, 0x73, 0x42, 0x49, + 0x43, 0xc5, 0x88, 0x3d, 0xc0, 0x64, 0xa1, 0x9c, 0x6b, 0xb4, 0xbd, 0x55, + 0x90, 0x74, 0xc3, 0x4e, 0x62, 0x99, 0x5f, 0xaa, 0x78, 0x96, 0x49, 0x5a, + 0xd2, 0x84, 0xcb, 0xcf, 0xbe, 0xa7, 0xa7, 0xc0, 0x35, 0x90, 0xa4, 0x63, + 0x50, 0xc2, 0xcf, 0x38, 0x6d, 0x4d, 0x7f, 0x47, 0x37, 0xc3, 0x8d, 0x57, + 0xa9, 0xaf, 0xc4, 0x85, 0x38, 0x3b, 0x4d, 0xcb, 0xb7, 0x72, 0xbf, 0xc5, + 0x82, 0xc4, 0x2f, 0x63, 0xcd, 0x56, 0xbf, 0x7c, 0x92, 0xa0, 0x93, 0x95, + 0x33, 0x85, 0x4c, 0x81, 0xcb, 0xab, 0x4a, 0x32, 0x5e, 0x35, 0xcf, 0xc3, + 0xaf, 0x55, 0x4e, 0x35, 0xc9, 0x8a, 0xad, 0x4f, 0xba, 0x8c, 0x9c, 0x62, + 0x70, 0x58, 0x45, 0x98, 0x5d, 0xb8, 0x7a, 0xc8, 0x89, 0x5e, 0x8c, 0xcd, + 0x51, 0x5e, 0x87, 0x90, 0x51, 0x5e, 0xce, 0x8a, 0xd1, 0x65, 0xb3, 0x92, + 0xa8, 0x6a, 0x6e, 0x51, 0xce, 0xc3, 0x58, 0xc1, 0x70, 0xd5, 0xb7, 0xdc, + 0xc8, 0xd3, 0xa7, 0xb4, 0xc2, 0x46, 0x55, 0x88, 0x68, 0xb7, 0x49, 0x98, + 0x85, 0x56, 0x56, 0xb0, 0x47, 0x4a, 0xbe, 0xb7, 0xa8, 0xc8, 0x44, 0x92, + 0x51, 0x55, 0xbd, 0xae, 0x7c, 0x2c, 0x61, 0x7d, 0x94, 0xc8, 0x78, 0x50, + 
0xb9, 0x3c, 0x43, 0x37, 0xbf, 0xaf, 0xb8, 0xb9, 0x3b, 0x8d, 0x49, 0xb9, + 0xb3, 0x3d, 0x68, 0xb6, 0x48, 0x7d, 0xcc, 0x9c, 0x7a, 0x8d, 0x4c, 0xcd, + 0x37, 0x71, 0x3d, 0xc9, 0x4b, 0x83, 0x5c, 0x5c, 0x52, 0x54, 0x48, 0xa3, + 0xae, 0x9e, 0x60, 0x65, 0xc6, 0x7e, 0x87, 0x4d, 0x3b, 0xa3, 0x73, 0x5a, + 0x3a, 0x7f, 0x86, 0x78, 0xb9, 0x87, 0x57, 0x67, 0x4a, 0x76, 0x9e, 0x67, + 0xb6, 0x55, 0x6f, 0xa5, 0xb7, 0x4e, 0x2e, 0xbe, 0x30, 0x89, 0x4b, 0x7d, + 0x74, 0xb6, 0x98, 0xa4, 0x9d, 0x7b, 0xd2, 0x95, 0xca, 0x62, 0x48, 0x93, + 0x6a, 0x5a, 0x6a, 0x82, 0x41, 0x95, 0x60, 0x9c, 0x36, 0x38, 0x34, 0x8e, + 0x92, 0x3c, 0x8e, 0xbe, 0x43, 0x97, 0x71, 0x95, 0x73, 0x7b, 0x72, 0x4b, + 0x88, 0x3f, 0x70, 0xa3, 0xa4, 0xb7, 0xba, 0xbb, 0x65, 0x68, 0x56, 0xa7, + 0x9f, 0x3e, 0x31, 0x89, 0x2f, 0xaf, 0x83, 0x98, 0x7a, 0xab, 0x3e, 0x66, + 0x54, 0x52, 0x8c, 0x7a, 0x63, 0xc9, 0xa8, 0x9a, 0x5d, 0xb2, 0xa9, 0x3e, + 0x4e, 0x3b, 0x6b, 0x80, 0x40, 0x7f, 0xce, 0xad, 0x94, 0x85, 0x37, 0x5b, + 0xbe, 0x9e, 0xa3, 0x3e, 0xac, 0xcc, 0xae, 0xa1, 0x36, 0xcd, 0xb8, 0x5a, + 0x54, 0x96, 0xc4, 0x5a, 0xb2, 0x47, 0x99, 0xc2, 0x8e, 0xca, 0xa8, 0x55, + 0x4d, 0x81, 0xce, 0x9b, 0x63, 0xd5, 0x7a, 0x9c, 0x78, 0x89, 0x3c, 0x7f, + 0x98, 0x77, 0xab, 0x7b, 0x42, 0x4f, 0x64, 0x93, 0x44, 0xb2, 0x79, 0xba, + 0x85, 0x5b, 0x39, 0x5d, 0x80, 0x4d, 0x54, 0x64, 0x52, 0x82, 0x64, 0x49, + 0x38, 0xba, 0x9d, 0x9d, 0x39, 0x86, 0x38, 0x49, 0x87, 0x54, 0xa4, 0x57, + 0x37, 0x85, 0xb2, 0x85, 0x54, 0xbd, 0xa5, 0x3a, 0x46, 0x70, 0x5f, 0x9f, + 0x53, 0x63, 0x5b, 0xba, 0x88, 0xaa, 0x2b, 0x32, 0xad, 0xa7, 0x81, 0x50, + 0x6c, 0xb5, 0xb6, 0xc3, 0x69, 0x72, 0xcc, 0x39, 0x31, 0x9f, 0x59, 0x97, + 0xd3, 0xa6, 0xcc, 0xa1, 0x85, 0x58, 0xd1, 0x6b, 0x96, 0x66, 0x75, 0x8f, + 0x82, 0x39, 0x56, 0x81, 0x5f, 0x50, 0x83, 0xbb, 0x51, 0xb0, 0xb5, 0x97, + 0x6f, 0xb0, 0x63, 0x63, 0xac, 0x83, 0x64, 0xb5, 0x80, 0xc1, 0x91, 0x4c, + 0x5f, 0xb3, 0x96, 0x93, 0x7f, 0xb3, 0x45, 0xd0, 0x99, 0x51, 0xa1, 0x7e, + 0xd3, 0x3f, 0x91, 0x8a, 0x3a, 0xb6, 0x84, 0xa9, 0x8a, 0x4b, 0xc6, 0x9b, + 0xbd, 0x56, 0xa0, 0x4e, 0x5f, 0x6b, 0x8a, 0x5c, 0x66, 0xcc, 0x45, 0x36, + 0xb1, 0xc8, 0x6f, 0xbd, 0x6e, 0x33, 0x49, 0x4a, 0x4c, 0x69, 0x58, 0xba, + 0x7e, 0x31, 0xc0, 0x9b, 0xc5, 0x76, 0xd0, 0x8f, 0x95, 0x6d, 0x38, 0xbc, + 0x9c, 0x4f, 0x71, 0x7b, 0x82, 0x54, 0x49, 0xcf, 0xb6, 0x88, 0x33, 0x95, + 0xc3, 0x9a, 0x74, 0xb7, 0x4e, 0x50, 0xd0, 0x63, 0xcd, 0x88, 0x57, 0xcd, + 0x40, 0xac, 0x77, 0x74, 0x98, 0x58, 0xb9, 0x2e, 0x79, 0x66, 0x54, 0x4c, + 0xda, 0xb8, 0x60, 0xb9, 0x99, 0x89, 0x36, 0xc3, 0x37, 0xae, 0xe8, 0x66, + 0x62, 0xba, 0x3e, 0x7f, 0xcd, 0xab, 0x7f, 0x40, 0x6f, 0x96, 0xae, 0x8c, + 0xc3, 0x4f, 0xb9, 0xa4, 0x57, 0x42, 0xac, 0x68, 0x57, 0xca, 0x6e, 0x46, + 0x90, 0x64, 0xd1, 0x7a, 0x5e, 0x79, 0x50, 0xb4, 0xae, 0xc7, 0x4b, 0xbf, + 0xba, 0x6c, 0x69, 0x4e, 0x7a, 0xd1, 0xb7, 0x7f, 0xa3, 0x5f, 0x55, 0x3f, + 0x83, 0x42, 0xa5, 0xcd, 0xc6, 0x81, 0x8b, 0x85, 0x86, 0xbd, 0x4a, 0x63, + 0x95, 0xa2, 0xa9, 0x37, 0x6e, 0x63, 0x8b, 0xc5, 0x58, 0x6a, 0x4b, 0x96, + 0x55, 0x57, 0x9a, 0x3b, 0x72, 0x4b, 0xb1, 0xb0, 0x8b, 0x8e, 0x6f, 0x6b, + 0x75, 0xcb, 0xb5, 0x58, 0x9b, 0x5c, 0x4f, 0xbb, 0x5e, 0x73, 0xa4, 0xd3, + 0x76, 0xac, 0x39, 0xab, 0x5a, 0x8c, 0x4f, 0x9e, 0x42, 0x7c, 0xbb, 0x63, + 0xc1, 0x7d, 0x34, 0x7f, 0x63, 0x3f, 0xc3, 0x40, 0xae, 0x55, 0xd0, 0xbf, + 0x49, 0xb6, 0xc9, 0x81, 0xb7, 0x33, 0x65, 0x47, 0x33, 0x80, 0x41, 0xab, + 0x73, 0xb7, 0x6e, 0x7c, 0x39, 0x8a, 0x57, 0xc9, 0x9b, 0xb7, 0x6a, 0xc9, + 0x3e, 0xa1, 0x59, 0x51, 0xa9, 0x70, 0x96, 0x5e, 0xc3, 0xa8, 0x88, 0x60, + 0xa0, 0xa2, 0x7e, 0xbb, 0x66, 0x81, 0xcd, 0x8d, 0x83, 0xb3, 0xa7, 0x39, + 
0x41, 0xac, 0x3f, 0xaa, 0x84, 0x50, 0x4f, 0xbf, 0x33, 0x7b, 0x89, 0xb4, + 0x6d, 0x79, 0x97, 0x5a, 0xb2, 0x4d, 0xc6, 0x58, 0x62, 0x80, 0x4b, 0x84, + 0x3f, 0xb4, 0x79, 0xba, 0x4c, 0xb6, 0xc2, 0xc6, 0x6c, 0x6e, 0x47, 0x4f, + 0x92, 0xa1, 0x6b, 0x5c, 0x3a, 0x53, 0x44, 0x3a, 0xb0, 0x35, 0x55, 0x7a, + 0x8e, 0x88, 0x88, 0x7a, 0x68, 0x41, 0x6d, 0xa6, 0x8c, 0x9f, 0x67, 0x3a, + 0x5b, 0x49, 0xb5, 0x85, 0xcf, 0xc8, 0x81, 0x6c, 0x37, 0x35, 0x72, 0x63, + 0x67, 0x73, 0x67, 0x98, 0x84, 0x93, 0x50, 0x3a, 0xcd, 0xa3, 0x77, 0x37, + 0x95, 0x5a, 0x4a, 0x80, 0x9e, 0x4b, 0xb2, 0x8e, 0x7b, 0xaf, 0x5c, 0x40, + 0xbe, 0x9e, 0x9c, 0x32, 0x4c, 0xaa, 0x42, 0x64, 0x6d, 0x3b, 0x9e, 0x66, + 0x64, 0x43, 0x3e, 0x4d, 0xb8, 0x68, 0x3e, 0x88, 0x97, 0x55, 0x4e, 0x69, + 0x8a, 0x49, 0x48, 0x5a, 0x62, 0x8c, 0x57, 0x44, 0x82, 0x5c, 0x8c, 0x51, + 0x41, 0xb4, 0x63, 0x59, 0x3f, 0x60, 0x4e, 0x69, 0x86, 0x84, 0x47, 0x64, + 0x59, 0x63, 0x75, 0x64, 0xa1, 0xa6, 0xac, 0x61, 0xa4, 0x33, 0x35, 0xaf, + 0x7f, 0x60, 0x8f, 0x4c, 0xb9, 0xa8, 0x75, 0x45, 0x9d, 0x39, 0x78, 0xa6, + 0x66, 0x40, 0x52, 0xad, 0x76, 0xbd, 0x5d, 0x8e, 0x86, 0x70, 0x9e, 0x5b, + 0x95, 0x48, 0x5a, 0xb7, 0x4f, 0xc8, 0xb7, 0x34, 0x88, 0x91, 0x34, 0x68, + 0xad, 0x79, 0xcc, 0x44, 0xc4, 0xaf, 0xa3, 0x98, 0xa6, 0x9d, 0x69, 0xaf, + 0xab, 0x5b, 0xc7, 0x8c, 0x3c, 0x39, 0x36, 0xc5, 0xc3, 0x7a, 0x7f, 0xa7, + 0xc1, 0x5d, 0x71, 0xb8, 0xcb, 0xd0, 0xca, 0x77, 0xb6, 0x5c, 0x4d, 0x8a, + 0x36, 0x9c, 0xab, 0xc7, 0xa0, 0x79, 0x74, 0xc6, 0xbf, 0x99, 0x44, 0xcb, + 0x9a, 0x38, 0xd3, 0xcc, 0x98, 0x54, 0x95, 0xb4, 0x49, 0x90, 0xa0, 0xb7, + 0x31, 0xb8, 0x8a, 0xc0, 0xc8, 0xcb, 0x61, 0x7a, 0x3c, 0x55, 0x4f, 0x4e, + 0xc3, 0xbf, 0x4b, 0x6e, 0x84, 0x6a, 0x8b, 0xc6, 0x76, 0x65, 0xc8, 0xbb, + 0x6f, 0xc2, 0xb7, 0x75, 0xc7, 0xb8, 0x3d, 0xa8, 0x7d, 0xa8, 0x5e, 0x6f, + 0xd1, 0x6b, 0xc3, 0x5c, 0xce, 0x56, 0x58, 0x8f, 0x95, 0xa3, 0x51, 0x40, + 0xcf, 0x78, 0xca, 0x6a, 0x43, 0x33, 0xa4, 0x7a, 0x3a, 0xb4, 0x61, 0x94, + 0x3b, 0x41, 0x6f, 0x35, 0xb4, 0x50, 0xcf, 0x61, 0x47, 0x4f, 0x7f, 0x9f, + 0xa5, 0xcc, 0x61, 0x8b, 0x9a, 0x70, 0x62, 0x47, 0x76, 0xb8, 0x58, 0x62, + 0x7a, 0xcb, 0x7f, 0x66, 0xb4, 0x84, 0x87, 0xae, 0x64, 0x42, 0x66, 0x5d, + 0x72, 0xa9, 0x9a, 0x81, 0x8e, 0x49, 0x70, 0x77, 0x99, 0xab, 0x47, 0xc9, + 0x69, 0x39, 0x96, 0x5d, 0x94, 0x43, 0xbd, 0x3d, 0x99, 0x65, 0xa1, 0x43, + 0x80, 0xae, 0x6a, 0xb2, 0x86, 0x42, 0x95, 0xac, 0xc5, 0x66, 0x44, 0x6e, + 0x31, 0x69, 0x36, 0xc6, 0x57, 0xb3, 0x72, 0xd0, 0x61, 0x34, 0x54, 0xbc, + 0x9f, 0x96, 0x3a, 0x41, 0xcf, 0xa5, 0x85, 0x6c, 0x38, 0x98, 0x55, 0x93, + 0xc6, 0x38, 0xcd, 0x3f, 0xb9, 0xd0, 0x7f, 0x7c, 0x43, 0x5e, 0x50, 0xa6, + 0x5a, 0x77, 0x3d, 0xaa, 0x7c, 0xc4, 0x8e, 0x62, 0x83, 0x5c, 0x5d, 0xca, + 0x52, 0x39, 0x5c, 0xcd, 0xa1, 0xbc, 0xd1, 0xc3, 0xd3, 0x52, 0x97, 0x6e, + 0x83, 0x95, 0xc4, 0x44, 0xaa, 0x59, 0x42, 0x67, 0x60, 0x8d, 0x81, 0x93, + 0xc4, 0x95, 0xca, 0x9f, 0x3f, 0x35, 0xc7, 0x51, 0x89, 0x7a, 0x85, 0x57, + 0x75, 0xad, 0x72, 0x4d, 0xb6, 0xa5, 0x6d, 0x41, 0x65, 0x92, 0x63, 0xa9, + 0x81, 0x90, 0xbf, 0x8f, 0xb0, 0x6f, 0x5f, 0x94, 0xad, 0xbb, 0xb1, 0x57, + 0x9a, 0x9b, 0xa5, 0x56, 0x9c, 0x77, 0x67, 0x95, 0x48, 0xcf, 0xc0, 0xbb, + 0x5e, 0x3a, 0x59, 0x37, 0x72, 0x4d, 0xca, 0xa0, 0x39, 0x8f, 0x6c, 0x7f, + 0x7c, 0x44, 0x6f, 0x4e, 0xc1, 0x7a, 0x53, 0xc0, 0x82, 0x57, 0x87, 0xae, + 0x4c, 0x42, 0x55, 0x6d, 0x71, 0xb7, 0x9d, 0xa2, 0xc3, 0xb4, 0x7f, 0xc4, + 0xb2, 0xb8, 0x54, 0x7c, 0x37, 0x9d, 0x52, 0x9d, 0x74, 0x34, 0x49, 0x34, + 0x59, 0x66, 0xa0, 0xb2, 0xb4, 0xad, 0x6c, 0x4d, 0x75, 0x6a, 0xb3, 0xb2, + 0x62, 0x47, 0xa4, 0xc2, 0xca, 0x61, 0x72, 0x5a, 0x86, 0x7d, 0x51, 0x9e, + 
0x4d, 0xcb, 0x61, 0x3b, 0x8f, 0x59, 0x36, 0x93, 0x4a, 0xc8, 0x4b, 0x7d, + 0xba, 0x44, 0xbe, 0x87, 0x99, 0x9e, 0xc1, 0x89, 0x77, 0x41, 0x81, 0xa7, + 0x92, 0xc4, 0x56, 0x68, 0x8b, 0x80, 0x61, 0x96, 0x60, 0x4b, 0xc3, 0xc0, + 0xcf, 0x60, 0x3f, 0x89, 0x7e, 0x95, 0x31, 0x6c, 0xcb, 0x63, 0x64, 0x96, + 0x9a, 0x46, 0x48, 0x5c, 0x86, 0x54, 0x6a, 0x67, 0xa4, 0xb8, 0x8e, 0x86, + 0xaf, 0x82, 0x78, 0xa3, 0x4f, 0x3d, 0x50, 0xc4, 0xa3, 0x55, 0xa5, 0x98, + 0xa1, 0x84, 0x61, 0xb8, 0x8a, 0x5b, 0x76, 0x92, 0x48, 0xa7, 0x90, 0x8d, + 0x34, 0x6e, 0x5d, 0x65, 0xad, 0x61, 0x43, 0x6e, 0x62, 0x53, 0x3e, 0x46, + 0x39, 0x77, 0x4c, 0x66, 0x82, 0x38, 0xba, 0x62, 0xd3, 0x6c, 0x77, 0xba, + 0xc0, 0xca, 0x3a, 0x83, 0x5d, 0x56, 0xb3, 0x87, 0xa3, 0xa0, 0xba, 0x3e, + 0x31, 0x3c, 0xc4, 0x79, 0x8d, 0x9a, 0xac, 0x4b, 0xc4, 0xd1, 0xcf, 0x96, + 0xd1, 0xa2, 0x9e, 0x97, 0x99, 0xc3, 0xa7, 0x83, 0x7a, 0x67, 0x8c, 0x46, + 0x47, 0xb6, 0x5c, 0x80, 0x95, 0xaf, 0xbd, 0x6b, 0x5f, 0x33, 0x75, 0x66, + 0xb3, 0xcc, 0xaf, 0x63, 0xb0, 0xa9, 0x4e, 0x56, 0xce, 0x53, 0x8a, 0x6a, + 0xba, 0xcd, 0x3a, 0xc6, 0x6d, 0x41, 0xc5, 0xc1, 0x80, 0x7d, 0x5d, 0x82, + 0x92, 0x80, 0xc9, 0x81, 0x8a, 0x82, 0x66, 0x44, 0xa0, 0x8c, 0x5d, 0x45, + 0x84, 0x8b, 0x35, 0xcb, 0x6f, 0x4e, 0x72, 0xc7, 0x9a, 0x3b, 0x90, 0x88, + 0x62, 0xcb, 0x6a, 0xa3, 0x64, 0xac, 0x50, 0x7f, 0x81, 0xbc, 0x59, 0x49, + 0x68, 0x97, 0xd7, 0x6f, 0x88, 0x9a, 0x72, 0xc1, 0x2e, 0x66, 0xad, 0x4a, + 0x72, 0xa7, 0x64, 0x63, 0x63, 0x84, 0x84, 0xd2, 0xa8, 0xb0, 0x46, 0x89, + 0x9d, 0x59, 0x91, 0x67, 0xb2, 0x5c, 0x3c, 0x76, 0xb3, 0x67, 0xcd, 0xd0, + 0x55, 0xca, 0x42, 0x6a, 0x5f, 0x40, 0x34, 0xa1, 0x31, 0x72, 0x9b, 0x3d, + 0x67, 0xb3, 0x89, 0xbc, 0x95, 0x3e, 0x95, 0x8d, 0x88, 0x6e, 0x5f, 0xb9, + 0xba, 0x6a, 0x62, 0x8c, 0xcf, 0x45, 0x40, 0x52, 0x58, 0x33, 0x54, 0x3f, + 0xaa, 0xb3, 0x7b, 0xcc, 0x9b, 0xa6, 0x40, 0x77, 0xa2, 0x68, 0xaa, 0x95, + 0x71, 0xce, 0xab, 0xcc, 0xa0, 0x74, 0xb4, 0x3d, 0xcb, 0xa0, 0x3f, 0x40, + 0x66, 0x4f, 0x96, 0xc5, 0x39, 0x7e, 0x73, 0x75, 0x46, 0x44, 0xc8, 0x8e, + 0xae, 0x98, 0xb0, 0x68, 0x95, 0x71, 0x39, 0xc8, 0x40, 0xa1, 0x99, 0x3c, + 0xd3, 0x4a, 0x72, 0x45, 0x9b, 0x6a, 0x7a, 0xa0, 0x78, 0x82, 0xb4, 0x93, + 0x58, 0x77, 0xc0, 0x89, 0xbb, 0xb1, 0x93, 0x66, 0x54, 0x6d, 0x71, 0x53, + 0x92, 0x60, 0x7a, 0xc8, 0x5f, 0xd3, 0x66, 0x90, 0x5c, 0x8a, 0x8b, 0x51, + 0xd2, 0xbc, 0xc0, 0xd0, 0x95, 0x50, 0x7c, 0x2f, 0xaf, 0x93, 0x4e, 0xc3, + 0x8f, 0x6d, 0xce, 0xaf, 0xcf, 0x5c, 0x85, 0x42, 0x49, 0xc8, 0xcd, 0x6a, + 0x31, 0x41, 0x53, 0xb2, 0x53, 0x82, 0xd0, 0x8c, 0x70, 0x89, 0x57, 0x77, + 0x39, 0xa7, 0x9e, 0x50, 0x5b, 0x96, 0x53, 0x6b, 0x98, 0x4e, 0x42, 0xc8, + 0x89, 0x50, 0x35, 0xb4, 0xd2, 0x92, 0xa8, 0x50, 0x53, 0x8e, 0x92, 0x63, + 0x60, 0xad, 0xc5, 0xa0, 0xd1, 0x85, 0xb0, 0x54, 0x7f, 0xa6, 0x78, 0x61, + 0x67, 0x7a, 0x7f, 0x50, 0x4d, 0x36, 0x91, 0xa3, 0xc9, 0x38, 0x3a, 0xab, + 0x2f, 0xd1, 0x39, 0x83, 0xb5, 0x9b, 0x94, 0x3e, 0x40, 0x72, 0x99, 0x41, + 0x78, 0x93, 0x32, 0xb2, 0x3d, 0x38, 0xa0, 0x35, 0x73, 0x3e, 0x99, 0x92, + 0xbd, 0x37, 0x52, 0x74, 0x55, 0x9b, 0x72, 0xaf, 0x4b, 0x41, 0x86, 0x6e, + 0x53, 0xbc, 0x77, 0xb4, 0x8a, 0x9d, 0xc6, 0x6e, 0x60, 0x3f, 0x32, 0x98, + 0x90, 0x89, 0x9d, 0x46, 0x82, 0x4c, 0x84, 0xb8, 0x63, 0x91, 0xb7, 0xa3, + 0x7b, 0xc7, 0x31, 0x4d, 0x82, 0x6d, 0x64, 0x7f, 0xa4, 0x40, 0x60, 0x3e, + 0x8b, 0xd4, 0x5f, 0xa2, 0xbf, 0xa1, 0x71, 0xc1, 0x79, 0x53, 0x8b, 0xb1, + 0xd0, 0x60, 0x52, 0x8b, 0x65, 0x8b, 0x6a, 0x5c, 0x67, 0x9a, 0x49, 0x8e, + 0xca, 0x4a, 0x9a, 0x82, 0xa6, 0x50, 0x99, 0x84, 0x9f, 0xaa, 0x80, 0x81, + 0x3c, 0x93, 0xc6, 0x43, 0xb6, 0x74, 0x3f, 0x38, 0xcc, 0x37, 0x77, 0x86, + 
0xa3, 0xc1, 0x51, 0x5b, 0xb6, 0xa4, 0x3a, 0xbc, 0x55, 0x7a, 0x93, 0x4f, + 0xc9, 0x8d, 0x92, 0xc0, 0x8c, 0x57, 0xa0, 0xca, 0x53, 0x34, 0x79, 0x69, + 0x63, 0x5b, 0x40, 0x32, 0x5b, 0x76, 0x77, 0x52, 0x65, 0xcf, 0x5e, 0xa8, + 0x38, 0xa8, 0x74, 0x30, 0x93, 0xad, 0xb5, 0x72, 0x4e, 0xb7, 0xba, 0xb8, + 0x8d, 0x5c, 0x87, 0xd0, 0xd1, 0xa7, 0x72, 0xb8, 0x8f, 0x84, 0xb6, 0x31, + 0x3b, 0xa6, 0xcc, 0xce, 0xd4, 0xd3, 0xb2, 0x3f, 0xbf, 0x69, 0x6c, 0x42, + 0xcb, 0x85, 0x4a, 0xa4, 0x40, 0xc4, 0x64, 0x41, 0x87, 0xcd, 0x9a, 0x9b, + 0x70, 0x9d, 0x8a, 0x62, 0xa5, 0x90, 0x34, 0xbd, 0x60, 0x55, 0x3e, 0x95, + 0x37, 0xa5, 0xa4, 0x90, 0xa2, 0x94, 0x51, 0x63, 0x9f, 0xa8, 0xaa, 0x4a, + 0xa3, 0xb2, 0x6a, 0x4b, 0xab, 0x77, 0x59, 0xb9, 0x6f, 0x35, 0x48, 0xa6, + 0x79, 0xb4, 0xb4, 0x36, 0xae, 0x33, 0x55, 0x9d, 0xbc, 0x7a, 0x68, 0x8a, + 0x38, 0xcc, 0xc2, 0xc5, 0x9f, 0xc7, 0x98, 0x3f, 0x5c, 0x9a, 0x50, 0x33, + 0x67, 0x53, 0x91, 0xa4, 0xb3, 0x80, 0x96, 0xbf, 0x7c, 0x98, 0x53, 0x9e, + 0x3e, 0xbe, 0xa0, 0xbf, 0x69, 0x5c, 0xbf, 0x43, 0xce, 0xc2, 0x76, 0x5d, + 0x96, 0x5f, 0xa8, 0x91, 0x76, 0xb0, 0x5e, 0xb5, 0x85, 0xa0, 0x46, 0x64, + 0x31, 0x36, 0xcb, 0x8a, 0xc6, 0xab, 0x6c, 0x43, 0x61, 0x5c, 0x59, 0x8e, + 0x75, 0xb4, 0x7c, 0x9a, 0x4a, 0x53, 0x87, 0x3e, 0x6a, 0xc7, 0x64, 0xcb, + 0x42, 0xba, 0x78, 0x96, 0xcc, 0x7b, 0x69, 0x32, 0x78, 0x54, 0x56, 0x45, + 0x6a, 0xa5, 0x95, 0xa5, 0x7d, 0x4b, 0xce, 0x67, 0x3e, 0xce, 0x71, 0xad, + 0xbb, 0x4a, 0x3e, 0x71, 0x95, 0xcc, 0x9c, 0xc8, 0x98, 0x7f, 0x71, 0xb3, + 0x83, 0x6f, 0x9f, 0xb0, 0x94, 0x3d, 0x8f, 0x49, 0x87, 0xa3, 0xc7, 0x5f, + 0x61, 0xa4, 0x9f, 0xaa, 0xb9, 0x60, 0xcc, 0x48, 0x44, 0x6c, 0xaa, 0x5d, + 0xcb, 0x7f, 0x7a, 0x6b, 0x88, 0x55, 0xb6, 0x4a, 0x65, 0x42, 0x47, 0xb2, + 0x73, 0x69, 0xcb, 0xc2, 0xbf, 0xb3, 0x44, 0xb7, 0x7b, 0x5f, 0xc9, 0x86, + 0x40, 0xb2, 0xda, 0xc2, 0x88, 0xaf, 0x65, 0x5d, 0x45, 0xa7, 0xbf, 0x35, + 0x77, 0xb6, 0xc9, 0xa8, 0x96, 0xb7, 0x94, 0x97, 0x91, 0xc2, 0x4e, 0x63, + 0x6b, 0xbd, 0x78, 0x78, 0x8f, 0x60, 0x5e, 0x47, 0x56, 0x96, 0x50, 0xbe, + 0x9b, 0x8a, 0x77, 0x42, 0x73, 0x40, 0x65, 0x3b, 0x80, 0x99, 0xd2, 0xca, + 0xb9, 0x47, 0xd7, 0x8a, 0xcd, 0xc6, 0xab, 0xc1, 0xcd, 0xa9, 0xcd, 0x7e, + 0xae, 0xce, 0xb0, 0xb3, 0xc1, 0xb1, 0xc9, 0xc7, 0x7d, 0x67, 0xcc, 0xaf, + 0x40, 0xcd, 0x3f, 0x89, 0x54, 0x52, 0xac, 0x38, 0x69, 0x4c, 0xd2, 0x76, + 0x7b, 0xb3, 0xa1, 0x36, 0xc5, 0x59, 0x4b, 0x2d, 0x95, 0xca, 0x4e, 0xb6, + 0x42, 0xc0, 0x48, 0x61, 0xb9, 0x4d, 0x6b, 0x43, 0x7f, 0xa9, 0xab, 0x98, + 0x8f, 0x91, 0xb8, 0x50, 0x8e, 0xcf, 0x6a, 0x88, 0x6b, 0x7c, 0xca, 0xc9, + 0x75, 0xa1, 0xc8, 0x83, 0xa7, 0xae, 0x86, 0x8f, 0xb9, 0x8d, 0x93, 0x56, + 0x3e, 0x9a, 0x90, 0x60, 0xa7, 0x95, 0x86, 0x79, 0xae, 0x4f, 0x54, 0x2c, + 0x5b, 0x4b, 0xa1, 0xc7, 0x8d, 0xd4, 0xd3, 0xd1, 0xd4, 0xce, 0xc9, 0x47, + 0x62, 0xd7, 0x84, 0x81, 0x8b, 0xab, 0x7b, 0x6a, 0x57, 0x69, 0x78, 0x96, + 0xb3, 0x98, 0xb4, 0xbe, 0x6b, 0xbc, 0x61, 0xc7, 0xca, 0x62, 0x4f, 0x97, + 0x7a, 0xce, 0x5a, 0x7b, 0xd0, 0x5b, 0x74, 0x51, 0x7a, 0x2f, 0xbb, 0x56, + 0xb4, 0xb1, 0x4d, 0xbb, 0x4b, 0x8b, 0xc4, 0xba, 0xc2, 0xb8, 0xd0, 0x8a, + 0xb6, 0x7c, 0x7b, 0xaf, 0xc2, 0x96, 0x82, 0x63, 0xa1, 0x4f, 0xa3, 0x83, + 0x39, 0x53, 0x96, 0x7a, 0xaa, 0x6e, 0x44, 0xc0, 0xba, 0x4a, 0xb8, 0x4e, + 0x82, 0x7d, 0x44, 0x82, 0x5f, 0x4f, 0x3e, 0x55, 0x7c, 0x62, 0xb7, 0xcf, + 0x73, 0xcd, 0x39, 0x9d, 0x35, 0x48, 0x8b, 0x80, 0x68, 0x6a, 0x9e, 0x8a, + 0x44, 0x87, 0x61, 0xaa, 0xcf, 0xb8, 0x8c, 0x37, 0xb0, 0x99, 0x7b, 0x66, + 0x9b, 0x97, 0xcd, 0xc5, 0x3e, 0x67, 0x81, 0x87, 0x88, 0xb9, 0x7a, 0xa4, + 0xb5, 0x4c, 0x33, 0x5f, 0x9f, 0xbe, 0x4b, 0xbb, 0x6a, 0x57, 0xaa, 0xad, + 
0x84, 0x9e, 0xc9, 0x6c, 0x67, 0x7c, 0x89, 0x8e, 0x4a, 0xa0, 0x69, 0x4a, + 0x80, 0x70, 0xcd, 0x99, 0xc1, 0x3f, 0x54, 0x98, 0x5a, 0x65, 0xc1, 0x77, + 0x4d, 0xa1, 0xa3, 0x68, 0x6c, 0xc8, 0xa5, 0x6e, 0x37, 0x95, 0xce, 0x41, + 0x34, 0x8c, 0xb8, 0x6d, 0x3b, 0x4b, 0x7f, 0x97, 0xca, 0x55, 0xbb, 0x4f, + 0xbf, 0x34, 0x91, 0x95, 0x59, 0x9d, 0x5c, 0x4c, 0x3b, 0xcc, 0xc5, 0x75, + 0x67, 0x41, 0x64, 0xa9, 0x72, 0x9b, 0xc3, 0x6a, 0x51, 0x81, 0x41, 0x82, + 0x90, 0x81, 0x72, 0x57, 0xbf, 0x4d, 0xcd, 0xbc, 0xc5, 0xc3, 0xd6, 0x91, + 0x4e, 0x34, 0x6b, 0x93, 0xcc, 0x55, 0x8c, 0xc1, 0x30, 0x31, 0x36, 0x34, + 0x8e, 0x4a, 0x43, 0x80, 0x6c, 0x7b, 0xa4, 0x68, 0x4c, 0x44, 0xae, 0x95, + 0x86, 0x34, 0xb0, 0x7f, 0x77, 0x3e, 0xcd, 0x32, 0x40, 0x8b, 0x99, 0xbc, + 0x72, 0x3b, 0x9a, 0x42, 0xa9, 0x35, 0x64, 0x60, 0x7a, 0x55, 0xc9, 0x97, + 0x7a, 0x9c, 0x54, 0x73, 0x6e, 0xa0, 0xbd, 0x38, 0xa9, 0xc1, 0x56, 0x58, + 0xc4, 0x71, 0x53, 0x48, 0xcb, 0x49, 0x79, 0x8e, 0xa2, 0x68, 0x97, 0x98, + 0xc8, 0x8d, 0xcc, 0x59, 0x92, 0x44, 0xaf, 0x43, 0x75, 0xc0, 0x8d, 0xa5, + 0xbe, 0x61, 0x41, 0x80, 0x47, 0x7d, 0xa1, 0x4f, 0xd5, 0xba, 0x51, 0x77, + 0xc6, 0xab, 0x9d, 0xc3, 0x3c, 0x92, 0x7c, 0x8d, 0xad, 0x4c, 0x89, 0xc1, + 0x8f, 0x3f, 0x4f, 0xbf, 0xa6, 0xad, 0x38, 0x45, 0x89, 0x4e, 0x7c, 0x66, + 0x3a, 0xb9, 0x65, 0x74, 0xa3, 0x8a, 0x7c, 0x71, 0x71, 0x81, 0x38, 0xb2, + 0xa6, 0xa3, 0x85, 0x38, 0x9c, 0x91, 0x5b, 0xb6, 0x53, 0x64, 0x49, 0x8f, + 0x57, 0x70, 0x55, 0x99, 0x4b, 0x38, 0x30, 0x36, 0x43, 0x5f, 0x64, 0xc8, + 0xca, 0x33, 0x77, 0x4e, 0x50, 0x60, 0x70, 0xc6, 0x88, 0x59, 0x5e, 0xa2, + 0x95, 0x84, 0x62, 0xba, 0xb1, 0xb4, 0xaa, 0xca, 0x6f, 0x4f, 0x5d, 0xae, + 0xa7, 0x6c, 0xc5, 0xa4, 0x4b, 0x70, 0xaa, 0xb5, 0x7a, 0xca, 0x83, 0x38, + 0x98, 0xaa, 0xd1, 0xba, 0xb0, 0xaf, 0xd0, 0xc9, 0x91, 0x58, 0x8b, 0xca, + 0x56, 0x7f, 0x9d, 0x95, 0x73, 0xb7, 0xaa, 0x79, 0xce, 0xb2, 0x99, 0xc3, + 0xd3, 0x64, 0x84, 0x74, 0xa7, 0x4c, 0x4c, 0xc1, 0x6b, 0xd4, 0x7b, 0x82, + 0xc0, 0x99, 0x6c, 0x4e, 0x45, 0x4c, 0x53, 0x97, 0x87, 0xa9, 0xb2, 0xc6, + 0x8f, 0x64, 0xa9, 0xb5, 0xab, 0x9a, 0x63, 0x56, 0x9a, 0x8b, 0xbc, 0x3f, + 0x89, 0xab, 0xa7, 0xa8, 0x92, 0x4e, 0xa3, 0x7b, 0x69, 0x95, 0x48, 0x96, + 0x68, 0xc0, 0xcf, 0x3f, 0xc4, 0xb9, 0xc1, 0xd5, 0xac, 0x74, 0x98, 0x4e, + 0x38, 0x5b, 0x95, 0xcd, 0xba, 0xa7, 0x37, 0x7e, 0x40, 0xcd, 0x5d, 0x93, + 0x77, 0x8a, 0xb1, 0x6d, 0x46, 0x85, 0x5f, 0xd6, 0x74, 0xa0, 0x48, 0x5c, + 0x82, 0xa9, 0xc3, 0x64, 0x77, 0xab, 0xa1, 0x5c, 0x4f, 0xb9, 0x3e, 0x3a, + 0x59, 0x4b, 0x40, 0x5d, 0x72, 0x82, 0x5d, 0xc7, 0x91, 0xd9, 0x64, 0x71, + 0x5d, 0x30, 0x8d, 0x44, 0x93, 0x7f, 0xc2, 0xcc, 0xcd, 0x3b, 0x97, 0x36, + 0x78, 0x8d, 0x9e, 0x48, 0xc8, 0xac, 0x65, 0xa9, 0x46, 0x91, 0x34, 0x7e, + 0x4e, 0x68, 0xd9, 0x61, 0xbb, 0xcb, 0xa8, 0x69, 0xc1, 0x70, 0xa1, 0x98, + 0x90, 0x37, 0xc8, 0x82, 0x84, 0xc7, 0x64, 0x53, 0x40, 0x9b, 0x68, 0x84, + 0xac, 0x52, 0x66, 0x6d, 0x4d, 0xca, 0xae, 0x75, 0xc5, 0x4c, 0x68, 0x61, + 0x6d, 0x4d, 0x6f, 0x72, 0x3b, 0x63, 0xa5, 0xc4, 0x76, 0xb8, 0xc0, 0x68, + 0x5e, 0x57, 0x73, 0x6b, 0x93, 0x6e, 0xd1, 0x92, 0x8a, 0x3c, 0xc4, 0x5d, + 0xda, 0x97, 0x3e, 0x76, 0x43, 0x8a, 0x3c, 0xa5, 0x37, 0x55, 0x4d, 0xb2, + 0x4a, 0x63, 0xab, 0xa1, 0x5d, 0xbe, 0x64, 0xa6, 0x32, 0x47, 0xaf, 0xda, + 0xa9, 0x90, 0x78, 0x73, 0x40, 0xa6, 0xaa, 0x76, 0x92, 0x41, 0x69, 0x41, + 0x66, 0x6e, 0x2d, 0xcf, 0x5b, 0x7a, 0xbe, 0x4d, 0xa1, 0x7e, 0x59, 0x9b, + 0x43, 0xa1, 0xd0, 0x9e, 0x36, 0xbb, 0x6d, 0x8f, 0x82, 0x41, 0x86, 0xbf, + 0xd5, 0xd2, 0x5e, 0x3d, 0xc1, 0x6a, 0x68, 0x8b, 0x7b, 0xaa, 0x7b, 0xae, + 0x73, 0x77, 0x37, 0x3e, 0xc6, 0x76, 0x5c, 0x81, 0x34, 0x4b, 0x85, 0xd2, + 
0x51, 0x92, 0xbe, 0xb0, 0x76, 0x89, 0x55, 0x68, 0x35, 0x5a, 0x8b, 0x53, + 0x5d, 0xbd, 0x44, 0x5d, 0x9d, 0xc0, 0x4e, 0x61, 0x5e, 0xa3, 0x61, 0x7f, + 0xb9, 0x67, 0x3d, 0x34, 0x2a, 0x54, 0x7f, 0x67, 0x96, 0x32, 0x72, 0x73, + 0x31, 0xa1, 0x59, 0x9d, 0x64, 0x3d, 0x67, 0x5f, 0x80, 0x8c, 0x56, 0x49, + 0x69, 0x89, 0x31, 0x6d, 0xd0, 0x95, 0x96, 0xbe, 0x3f, 0x47, 0xb0, 0xcc, + 0x3a, 0x3a, 0x55, 0x9e, 0xa8, 0x7c, 0x60, 0x82, 0x85, 0x8f, 0x71, 0x69, + 0x6b, 0x99, 0xc5, 0x50, 0xc5, 0x84, 0x4a, 0x86, 0x45, 0x40, 0x9d, 0x7d, + 0x2b, 0x83, 0x95, 0x56, 0x56, 0xd2, 0x8c, 0x83, 0xb3, 0x5c, 0xa6, 0x56, + 0xd1, 0xcf, 0xbd, 0x68, 0xc0, 0x87, 0x43, 0x59, 0x85, 0x95, 0x74, 0x79, + 0x67, 0xc3, 0x6c, 0x7d, 0xc1, 0x49, 0xda, 0x9f, 0xce, 0x39, 0x4e, 0x3e, + 0xc6, 0xb3, 0xa3, 0xa3, 0x4c, 0x7a, 0x69, 0xb6, 0xbc, 0x50, 0x38, 0x8a, + 0x95, 0xc5, 0x91, 0xaf, 0x4b, 0x3e, 0x4b, 0xa1, 0x3f, 0x5b, 0x8a, 0x6e, + 0xa1, 0xc4, 0xb0, 0x81, 0x40, 0xa4, 0xb8, 0x42, 0x9f, 0xb9, 0x60, 0x59, + 0xb6, 0xa8, 0xd4, 0xa3, 0x64, 0x40, 0xaf, 0x83, 0x4c, 0xcb, 0x78, 0x7a, + 0x67, 0xb0, 0x70, 0x4e, 0xc6, 0x65, 0x99, 0xa9, 0x35, 0x81, 0x95, 0xb9, + 0x6b, 0x74, 0xbf, 0xbb, 0x82, 0xa5, 0xa2, 0x8e, 0xc4, 0x5c, 0x74, 0x59, + 0xa5, 0xc4, 0x4d, 0xc7, 0xcf, 0xb9, 0xcc, 0x63, 0x68, 0x4e, 0x60, 0x84, + 0xc9, 0x6f, 0xb0, 0x43, 0xaf, 0x32, 0x2b, 0x4d, 0x87, 0x5c, 0x55, 0x41, + 0xaa, 0x7e, 0x7f, 0x74, 0x61, 0x47, 0xde, 0x7a, 0x46, 0x77, 0x64, 0xce, + 0xa1, 0x68, 0xa0, 0x67, 0x34, 0x95, 0x66, 0x5d, 0x5b, 0xac, 0xad, 0x3e, + 0x76, 0x78, 0x41, 0x69, 0xa1, 0x84, 0x98, 0x7e, 0xa5, 0x5c, 0x65, 0x72, + 0xb7, 0xaa, 0x31, 0x5f, 0x8f, 0xd3, 0xaf, 0xbb, 0x37, 0x84, 0xba, 0x7e, + 0x7f, 0x9c, 0xa6, 0xa9, 0x38, 0x58, 0x58, 0x9f, 0x78, 0xa7, 0x80, 0x9a, + 0x80, 0xc8, 0x86, 0x68, 0xaa, 0xb6, 0xc8, 0x46, 0xa7, 0xb1, 0x9b, 0x6b, + 0xac, 0x83, 0x7a, 0xd3, 0x9e, 0x80, 0x7c, 0xc9, 0x74, 0xd8, 0x4c, 0x6f, + 0x3f, 0x96, 0xbc, 0x8c, 0xb6, 0xc8, 0xbc, 0x4c, 0x7a, 0xb1, 0x8b, 0x47, + 0xa7, 0x99, 0x45, 0xb1, 0xc3, 0xaa, 0xbe, 0x4a, 0xa0, 0xcc, 0xb9, 0x68, + 0x70, 0x81, 0x84, 0x39, 0x5f, 0x83, 0x4e, 0x80, 0x74, 0xb2, 0xb0, 0x76, + 0x66, 0xae, 0x58, 0xba, 0xc6, 0x9b, 0xb2, 0xa2, 0xc7, 0xcb, 0xa0, 0x50, + 0xb6, 0xd2, 0x67, 0xb2, 0x8a, 0x5e, 0xcf, 0xa9, 0x9c, 0xa1, 0x6f, 0x3c, + 0x4b, 0xc0, 0xce, 0x99, 0x5f, 0xad, 0x42, 0x37, 0x43, 0xa4, 0xb8, 0x49, + 0x5b, 0x65, 0x94, 0x95, 0x9d, 0x6e, 0x84, 0x57, 0x6f, 0xb8, 0x86, 0x53, + 0x7d, 0xbe, 0x77, 0x44, 0xcc, 0x79, 0x9b, 0x93, 0x9e, 0xb5, 0xa1, 0xa2, + 0x57, 0xc9, 0x3a, 0xbc, 0x48, 0xc8, 0x98, 0x6f, 0x8d, 0xbe, 0xbe, 0xc3, + 0xb2, 0xbc, 0xac, 0x51, 0x56, 0x99, 0xc5, 0x62, 0x78, 0x8d, 0x61, 0x37, + 0x41, 0x4f, 0x4c, 0xc4, 0x36, 0x5f, 0x37, 0x93, 0x7b, 0x8f, 0x95, 0xc8, + 0xc3, 0x46, 0xb5, 0xa2, 0x9d, 0x65, 0xb8, 0xb5, 0x63, 0x7d, 0xc7, 0xa9, + 0x89, 0xb3, 0xb3, 0xbe, 0x42, 0x4c, 0x4a, 0x55, 0x4b, 0x63, 0xdb, 0x84, + 0xb4, 0x2d, 0xb5, 0x56, 0xc3, 0xcb, 0x71, 0x3c, 0xce, 0xb1, 0x64, 0x7b, + 0xad, 0x50, 0x83, 0x47, 0x48, 0xa4, 0xa8, 0xbd, 0xc1, 0x60, 0x8b, 0xac, + 0xc4, 0xd5, 0x5f, 0x4f, 0x76, 0x40, 0xcb, 0x78, 0xc8, 0x99, 0x78, 0x6a, + 0x42, 0x73, 0x8a, 0x91, 0x4e, 0x8e, 0x37, 0x64, 0x5b, 0x53, 0x53, 0x3d, + 0x5e, 0x97, 0x63, 0xbc, 0xd0, 0xc2, 0x9e, 0x6f, 0x88, 0x9d, 0xc6, 0x70, + 0x92, 0x6a, 0x80, 0x53, 0x75, 0x73, 0x6f, 0x5d, 0xdb, 0xbc, 0xa1, 0x97, + 0xc1, 0x55, 0x64, 0x7c, 0x6e, 0xa7, 0x54, 0xe1, 0x85, 0x39, 0xc6, 0x5e, + 0x91, 0xaf, 0x72, 0xcc, 0x32, 0x5d, 0x60, 0x3d, 0xd3, 0xba, 0x56, 0xbd, + 0xc5, 0xb1, 0xb8, 0x65, 0x72, 0x66, 0xbb, 0xc1, 0x2e, 0x5e, 0xa5, 0xbe, + 0x79, 0x58, 0xc2, 0x5d, 0xab, 0xd1, 0x3d, 0x49, 0xae, 0x34, 0x84, 0xb1, + 
0x8b, 0x2b, 0x2e, 0x3f, 0x9f, 0x9e, 0xcf, 0x3c, 0x7d, 0x35, 0x8c, 0xba, + 0x7e, 0x72, 0x92, 0x69, 0xc0, 0x41, 0x98, 0xa4, 0xc5, 0xc7, 0x66, 0xd7, + 0xc5, 0xab, 0x8e, 0x49, 0x41, 0x71, 0x66, 0x45, 0x4b, 0x97, 0x48, 0xc7, + 0x7d, 0xa6, 0x92, 0x2d, 0x41, 0xba, 0xb2, 0xc3, 0x4f, 0x50, 0x60, 0x99, + 0x9e, 0x38, 0xb6, 0x91, 0x99, 0x33, 0xb8, 0x41, 0x4a, 0xb9, 0x6e, 0xa8, + 0x73, 0x79, 0xc9, 0x71, 0x71, 0xd1, 0x92, 0x3a, 0x5d, 0xc0, 0xcd, 0x6f, + 0x9d, 0x39, 0xc8, 0x38, 0x8d, 0xd7, 0x3b, 0x7b, 0x65, 0xd1, 0xae, 0x3b, + 0x51, 0x76, 0xa4, 0x67, 0x89, 0x81, 0xb1, 0xb4, 0xc1, 0x7b, 0x85, 0x62, + 0x2d, 0x5d, 0x54, 0xa2, 0x6d, 0x6b, 0xcc, 0x9a, 0xa0, 0x4a, 0x42, 0x7c, + 0xba, 0xa6, 0x33, 0xb8, 0x4c, 0xb8, 0x4d, 0x3e, 0xba, 0x60, 0x67, 0x75, + 0xb5, 0x45, 0x89, 0x37, 0x42, 0xd3, 0x47, 0x8b, 0x5b, 0x6f, 0x9a, 0x2f, + 0xc1, 0x67, 0x52, 0x7f, 0x4c, 0xbe, 0xd9, 0xb5, 0x8c, 0x60, 0xd9, 0xce, + 0xb3, 0x60, 0xb5, 0x58, 0x58, 0x9e, 0x81, 0xd7, 0x4c, 0xc9, 0xbd, 0x7d, + 0xbe, 0x3c, 0x51, 0x36, 0x9a, 0x9b, 0x46, 0x72, 0x56, 0xa4, 0xc0, 0x93, + 0xb7, 0x49, 0x51, 0xdc, 0xd7, 0x85, 0x7e, 0x7a, 0x42, 0x6f, 0x36, 0x27, + 0x50, 0xaf, 0x48, 0x91, 0x79, 0xd2, 0x7b, 0xc4, 0x8a, 0x47, 0x93, 0x5e, + 0x55, 0xa3, 0x32, 0xb8, 0xa5, 0xd5, 0xc7, 0x3a, 0x99, 0x7c, 0x45, 0xa7, + 0xbd, 0x9b, 0x46, 0x8d, 0x70, 0x8b, 0x5b, 0x97, 0x3d, 0xd3, 0xb5, 0xd7, + 0xd3, 0x60, 0xc3, 0x68, 0x65, 0x49, 0xba, 0x69, 0x4a, 0xb6, 0x40, 0xb9, + 0x29, 0x35, 0xbd, 0xa6, 0xbb, 0x38, 0x79, 0x9c, 0x69, 0x81, 0x54, 0x72, + 0xca, 0x81, 0xbc, 0x4d, 0xb8, 0x91, 0x41, 0x60, 0xcf, 0x67, 0x36, 0x24, + 0x86, 0xa8, 0xad, 0x44, 0x7a, 0xe0, 0x96, 0x84, 0x5b, 0xd0, 0xc5, 0x81, + 0xbc, 0xd2, 0xd9, 0xaf, 0x3f, 0x46, 0x48, 0x49, 0x9d, 0x66, 0x72, 0x61, + 0x50, 0x8d, 0x6f, 0xac, 0xa4, 0xd3, 0x2d, 0xc5, 0xbe, 0xb7, 0xb8, 0xb5, + 0x4b, 0x6e, 0x9c, 0x3f, 0x3a, 0xa7, 0x8f, 0xa7, 0xc3, 0x93, 0x8c, 0x82, + 0x64, 0x5a, 0x6c, 0xcb, 0x47, 0x43, 0x59, 0x6c, 0x44, 0xb3, 0x8b, 0xa6, + 0x3b, 0x44, 0x64, 0x57, 0x34, 0x73, 0x40, 0x66, 0x64, 0x3d, 0x91, 0x6f, + 0x32, 0x78, 0x3b, 0xb7, 0x89, 0x6b, 0x96, 0xc9, 0x50, 0x58, 0x72, 0xd1, + 0xa2, 0x60, 0x6b, 0xa8, 0x35, 0x50, 0x50, 0xc3, 0x4d, 0x8f, 0x8a, 0x95, + 0x36, 0x43, 0x39, 0x52, 0x4c, 0xa0, 0xc0, 0x8a, 0x33, 0x37, 0x38, 0x3a, + 0x76, 0x71, 0x5b, 0xb3, 0x6d, 0x3c, 0x58, 0x38, 0x54, 0x45, 0x46, 0x85, + 0x88, 0xb0, 0x57, 0x37, 0x94, 0x99, 0x67, 0x8e, 0xd4, 0x76, 0xb3, 0x82, + 0x2e, 0xcc, 0xab, 0xce, 0x63, 0x36, 0x86, 0x36, 0xaf, 0x6e, 0x64, 0x8c, + 0xc0, 0x8e, 0x87, 0xa0, 0x6a, 0x9e, 0x45, 0x6f, 0x74, 0x9c, 0xa4, 0x9d, + 0x92, 0xb4, 0xcb, 0x86, 0x83, 0x8c, 0x9b, 0x7e, 0xac, 0x3b, 0x43, 0x2f, + 0xc9, 0xe2, 0x3d, 0x32, 0x7c, 0xd0, 0x90, 0xb8, 0x55, 0x74, 0x47, 0x4d, + 0x51, 0x3d, 0x81, 0x4d, 0x71, 0x96, 0x7c, 0xa0, 0x7d, 0x65, 0x69, 0xb2, + 0x81, 0x39, 0x98, 0x72, 0x9c, 0x33, 0x38, 0xaa, 0xd8, 0x91, 0xbb, 0x5e, + 0x99, 0x62, 0x3b, 0x68, 0x89, 0xdc, 0x36, 0x60, 0x75, 0x64, 0xa4, 0xd2, + 0x57, 0x57, 0x53, 0x8d, 0x35, 0x43, 0x53, 0x69, 0x69, 0x5a, 0x49, 0x89, + 0x5d, 0xa0, 0x38, 0x86, 0x5e, 0x76, 0x72, 0x85, 0x60, 0xa6, 0x4f, 0x7d, + 0x5a, 0x8f, 0x52, 0x3d, 0xd0, 0xa0, 0x63, 0xcd, 0xd2, 0x95, 0x49, 0x55, + 0xc9, 0xa2, 0x9f, 0x77, 0x52, 0xc1, 0x89, 0x6b, 0x7d, 0xbe, 0x92, 0xb9, + 0xc4, 0x7e, 0xb3, 0x96, 0x54, 0xa1, 0x48, 0xc2, 0x50, 0x2e, 0xa8, 0x8d, + 0x9f, 0xb8, 0xaf, 0xc6, 0xb1, 0x84, 0x52, 0x2c, 0xbb, 0xbc, 0x55, 0x85, + 0x68, 0x96, 0xbf, 0x9b, 0xac, 0xb8, 0xac, 0xc1, 0x44, 0x58, 0x6a, 0x81, + 0x71, 0xa0, 0x9d, 0x2f, 0x53, 0xa6, 0x87, 0x61, 0x71, 0x50, 0xb3, 0x64, + 0x55, 0x3b, 0x41, 0xb1, 0x52, 0x74, 0xaf, 0x53, 0x3e, 0xbb, 0xd3, 0xca, + 
0x55, 0x9b, 0x7f, 0x74, 0x9f, 0xc2, 0x98, 0x9e, 0x38, 0x44, 0x55, 0x75, + 0x9e, 0x62, 0xb6, 0x76, 0xbe, 0x9f, 0xca, 0xbc, 0xd3, 0xa0, 0xaa, 0xa2, + 0xc8, 0x53, 0x2f, 0x37, 0xbd, 0x38, 0x4f, 0xb4, 0x4a, 0x80, 0xa8, 0xc6, + 0x6f, 0x52, 0xc3, 0xaf, 0x6a, 0x8e, 0xbb, 0x6d, 0xb0, 0xc0, 0x44, 0x7b, + 0xa5, 0x36, 0x6a, 0xbe, 0xc0, 0x80, 0x4e, 0x9c, 0x66, 0xb8, 0xa0, 0x5e, + 0x81, 0x88, 0xc5, 0x98, 0x80, 0xaf, 0x7b, 0x4c, 0x9c, 0xb2, 0xc9, 0x7d, + 0x45, 0xc1, 0x39, 0x35, 0x79, 0x54, 0x43, 0x87, 0xc0, 0x84, 0x58, 0x6f, + 0x48, 0x92, 0x8b, 0x3a, 0x51, 0xb2, 0x44, 0x41, 0x3d, 0x6c, 0x51, 0x72, + 0x84, 0xa8, 0xc4, 0x4f, 0x3e, 0x28, 0x7f, 0xb8, 0x47, 0x5b, 0x44, 0x77, + 0x89, 0xb2, 0x48, 0x69, 0x45, 0x4d, 0x5f, 0xb5, 0xd4, 0x67, 0x86, 0x9d, + 0x8d, 0x60, 0xca, 0x8f, 0x4f, 0x75, 0xa9, 0x86, 0x64, 0xce, 0x5d, 0x9d, + 0x6e, 0xa9, 0x72, 0x62, 0xd7, 0x46, 0xe3, 0x76, 0xc1, 0x65, 0x82, 0x8d, + 0x5a, 0xa1, 0x3c, 0x33, 0xbe, 0xaa, 0x4a, 0xaa, 0x37, 0x98, 0x58, 0xaa, + 0xb7, 0x45, 0xb5, 0xc4, 0xc0, 0x5d, 0x65, 0x4a, 0x52, 0x42, 0x70, 0xdd, + 0x9b, 0x41, 0x57, 0x4a, 0x37, 0xbf, 0x76, 0x5f, 0x3b, 0x7f, 0x4d, 0x81, + 0xb2, 0x4f, 0xbb, 0xd6, 0x7f, 0x5a, 0xcd, 0xa8, 0x43, 0x8e, 0xb3, 0x7e, + 0x8d, 0x9f, 0x7e, 0xa0, 0xbe, 0x57, 0x89, 0xa0, 0x58, 0xad, 0x6f, 0x66, + 0x65, 0xb7, 0x28, 0xbf, 0x9d, 0x73, 0x57, 0xc3, 0x5a, 0x48, 0x92, 0x71, + 0x5f, 0x8b, 0x8b, 0x4f, 0x80, 0x54, 0xc1, 0xc0, 0x3a, 0xc5, 0x6e, 0x73, + 0x8a, 0x62, 0x5b, 0x78, 0x67, 0x52, 0xa8, 0x6b, 0x58, 0xad, 0xb4, 0x7a, + 0xa9, 0x9f, 0xbc, 0xb4, 0x59, 0x74, 0x73, 0xd4, 0xa1, 0x63, 0xd2, 0x46, + 0xc9, 0x33, 0xc9, 0x82, 0x56, 0x81, 0xcf, 0x83, 0xac, 0x8f, 0x39, 0x79, + 0x8f, 0x99, 0x6a, 0xd4, 0x4f, 0x4b, 0x6f, 0x7f, 0xbf, 0x59, 0xb1, 0x86, + 0x66, 0xb6, 0xd8, 0x7e, 0x8a, 0x86, 0xc4, 0x42, 0x8c, 0xa9, 0x41, 0x8e, + 0x34, 0x4e, 0xc9, 0xa1, 0x57, 0x5b, 0x44, 0x55, 0x45, 0xbf, 0x5b, 0xc2, + 0x4f, 0x9d, 0x2f, 0x92, 0xbb, 0xbb, 0x92, 0x2e, 0x8b, 0xab, 0xb5, 0x91, + 0x36, 0x7a, 0x6d, 0xa2, 0x8a, 0x70, 0x49, 0x90, 0x9e, 0x94, 0xce, 0x39, + 0x9c, 0xc4, 0xbe, 0xb5, 0xa0, 0x83, 0xb5, 0x8d, 0xa5, 0x52, 0xa8, 0x31, + 0x3c, 0x75, 0x89, 0xae, 0xae, 0x96, 0xb2, 0xcd, 0x4c, 0x30, 0x58, 0x89, + 0x3f, 0xbf, 0x91, 0x6c, 0x6d, 0xac, 0x3f, 0x7b, 0x76, 0x94, 0x8e, 0x51, + 0x85, 0x7a, 0x5b, 0xae, 0x36, 0xab, 0x4f, 0x96, 0xc0, 0xae, 0x70, 0x41, + 0xa5, 0x4e, 0x61, 0x65, 0xba, 0x51, 0x56, 0x62, 0xc6, 0x33, 0xa0, 0x57, + 0x83, 0x3b, 0xb6, 0xd2, 0x8c, 0x4c, 0x9d, 0x54, 0x2c, 0x4b, 0x3f, 0xc3, + 0x7d, 0x67, 0x6d, 0x30, 0xcb, 0xae, 0x47, 0x39, 0x77, 0x65, 0xcd, 0x7e, + 0xcf, 0x94, 0x91, 0x35, 0x70, 0xbe, 0xb3, 0x58, 0xc4, 0x3a, 0x45, 0x36, + 0x2f, 0x37, 0x77, 0xb7, 0x3f, 0x6a, 0xa8, 0x97, 0xbb, 0x85, 0x6b, 0x3f, + 0xc1, 0xc1, 0x7b, 0xd2, 0x5c, 0x70, 0x4b, 0x9b, 0x8b, 0x45, 0x8a, 0x9a, + 0x62, 0x6c, 0xa4, 0x64, 0x88, 0xa8, 0x68, 0x8e, 0xc6, 0xd4, 0x92, 0x3d, + 0xa1, 0x48, 0x69, 0x32, 0x4a, 0x95, 0xb7, 0xcd, 0x99, 0x93, 0xac, 0x31, + 0x57, 0x31, 0x64, 0x40, 0xab, 0x58, 0x5f, 0x9e, 0x70, 0x50, 0x34, 0xcd, + 0xc5, 0x5e, 0x7c, 0xac, 0x3e, 0x75, 0x7d, 0xb6, 0x4a, 0x46, 0xd1, 0xa7, + 0x7c, 0x66, 0xa8, 0x98, 0x48, 0x2e, 0x40, 0xc3, 0x8b, 0xbc, 0xb0, 0x8e, + 0x60, 0x7a, 0x8e, 0x6b, 0xb4, 0x7f, 0x4a, 0x7f, 0xd7, 0xc3, 0xb9, 0x4d, + 0xca, 0x7c, 0x5c, 0xa3, 0xce, 0x7c, 0x7b, 0xb8, 0x82, 0x4c, 0x5c, 0x92, + 0xd9, 0x50, 0x56, 0x88, 0x8a, 0x62, 0x95, 0x79, 0x95, 0x63, 0xac, 0x5a, + 0x6c, 0x68, 0x48, 0xa1, 0x5b, 0xb5, 0xbd, 0x7b, 0xb8, 0xc2, 0xaa, 0x98, + 0x9a, 0xac, 0x86, 0xbd, 0x7e, 0xc1, 0x91, 0xe5, 0xbe, 0x4f, 0x36, 0xa9, + 0x80, 0x87, 0x83, 0x52, 0x76, 0x9b, 0xa7, 0x7d, 0x46, 0xc7, 0x46, 0x78, + 
0x8c, 0xc1, 0xdb, 0xce, 0xc4, 0x73, 0x65, 0x69, 0x65, 0x60, 0x70, 0x75, + 0xab, 0x92, 0xd9, 0x81, 0xc9, 0xb8, 0xc0, 0xbf, 0x35, 0x2f, 0x87, 0x40, + 0xc9, 0x4e, 0xa0, 0x79, 0x81, 0x75, 0x64, 0xad, 0x93, 0x97, 0x77, 0xc5, + 0xb4, 0x39, 0x9b, 0xb0, 0x47, 0x32, 0x72, 0x36, 0xbb, 0xa0, 0x66, 0x7c, + 0x32, 0x75, 0x66, 0xcc, 0xb4, 0x65, 0x39, 0x38, 0x6d, 0x71, 0x4b, 0xbc, + 0x48, 0xd3, 0xba, 0x7e, 0x3c, 0x3c, 0xd1, 0x8e, 0x3c, 0xa6, 0x96, 0x41, + 0x75, 0x54, 0xa6, 0x52, 0xa7, 0xa4, 0x36, 0x5a, 0x9d, 0x97, 0x88, 0x47, + 0xa0, 0x6e, 0x5f, 0xc3, 0x2f, 0xd0, 0x45, 0x94, 0x41, 0xa4, 0x70, 0x9c, + 0x38, 0x97, 0x46, 0x47, 0x96, 0xcb, 0x9c, 0x84, 0x89, 0xcf, 0x36, 0x79, + 0xd4, 0x35, 0x3e, 0xbc, 0x39, 0x66, 0x52, 0x65, 0x41, 0x75, 0x7a, 0x47, + 0xbc, 0xa9, 0x39, 0x92, 0x41, 0x7c, 0x44, 0x6e, 0x7e, 0x77, 0x8e, 0x57, + 0x73, 0x4c, 0x87, 0x96, 0x6b, 0x82, 0xac, 0xaa, 0x59, 0xd6, 0xcb, 0xd1, + 0xa5, 0xaf, 0xa9, 0xc4, 0xac, 0x5d, 0x9f, 0x46, 0x5c, 0x50, 0xb8, 0xad, + 0x66, 0x78, 0x4d, 0x6f, 0xe3, 0x54, 0x7d, 0x3e, 0x76, 0xcc, 0xc3, 0xa3, + 0xbf, 0x90, 0xab, 0xac, 0x47, 0xb8, 0xda, 0xdb, 0x34, 0x74, 0x50, 0xd8, + 0xa5, 0x8a, 0x6e, 0x50, 0x82, 0x6d, 0x33, 0x51, 0x4e, 0x48, 0x9d, 0x4a, + 0x81, 0x9f, 0xa0, 0xab, 0xa0, 0x4f, 0xc8, 0x76, 0x3b, 0x7f, 0xb6, 0xca, + 0x44, 0x7c, 0xbd, 0x5e, 0xca, 0x3c, 0x61, 0x35, 0x4c, 0xa6, 0xc2, 0xad, + 0xd2, 0xb2, 0x74, 0x5f, 0x90, 0xb6, 0x4d, 0xbf, 0xb8, 0x37, 0x51, 0x95, + 0x8d, 0xcb, 0x89, 0x99, 0x7e, 0x34, 0x5b, 0x95, 0x2a, 0x93, 0xa6, 0xcb, + 0xb6, 0x33, 0x5e, 0xa1, 0xa1, 0xca, 0x69, 0x5d, 0xa0, 0x33, 0xa5, 0x41, + 0xae, 0xd1, 0x7e, 0xb4, 0x35, 0x78, 0x84, 0x7d, 0x98, 0x3b, 0xb5, 0x3c, + 0x3a, 0x5a, 0xc9, 0x81, 0x4f, 0x8f, 0x63, 0xb1, 0xc4, 0x70, 0xcc, 0xc2, + 0xc6, 0x55, 0xa6, 0x3e, 0x6c, 0x81, 0x9e, 0x7f, 0x38, 0xb4, 0xc4, 0x37, + 0x7e, 0x2f, 0xa3, 0x45, 0x3c, 0x77, 0xc5, 0xc2, 0xb9, 0xc8, 0x51, 0x8c, + 0xc2, 0xbe, 0x7d, 0xa8, 0xce, 0xac, 0xa9, 0x7f, 0x6b, 0x66, 0x9d, 0xc3, + 0x5e, 0x8b, 0x63, 0x95, 0x8e, 0x3a, 0x6c, 0x7f, 0x9d, 0x37, 0x77, 0x5e, + 0x4c, 0x37, 0x9b, 0x56, 0xd1, 0x7c, 0x5f, 0x2a, 0x95, 0x74, 0x4e, 0x66, + 0x7c, 0xa3, 0x56, 0xd0, 0x33, 0x4b, 0xc4, 0x63, 0xb8, 0xbb, 0x7e, 0x40, + 0xa2, 0x4e, 0x70, 0x66, 0xcc, 0xac, 0x4c, 0xca, 0x47, 0x78, 0xbd, 0xcb, + 0x3a, 0xa3, 0xa4, 0x53, 0x5b, 0x83, 0x53, 0xcb, 0xa1, 0x62, 0x45, 0x44, + 0x41, 0x7a, 0x49, 0x97, 0x90, 0x56, 0xa3, 0x4a, 0x64, 0x32, 0x6d, 0x71, + 0x6b, 0x73, 0x8e, 0x4e, 0x53, 0x82, 0x86, 0x8f, 0x90, 0xc2, 0xc5, 0x4a, + 0xa1, 0xbd, 0xc8, 0x3f, 0x7b, 0x44, 0x9b, 0x6c, 0x6a, 0x4e, 0xae, 0xc9, + 0x8f, 0x9d, 0xb4, 0x31, 0x82, 0xc2, 0x34, 0xca, 0x79, 0xd7, 0x47, 0x67, + 0x8c, 0x79, 0x3e, 0xab, 0x4a, 0x43, 0x67, 0x70, 0x5e, 0x94, 0xb0, 0x39, + 0xc4, 0xc5, 0x79, 0x87, 0x5c, 0xcc, 0x5e, 0xd1, 0xbc, 0xbe, 0x66, 0x95, + 0x7b, 0x6a, 0x6f, 0x32, 0x7e, 0xa6, 0x9e, 0xac, 0xae, 0x9d, 0x50, 0xba, + 0x7b, 0x46, 0xcb, 0xd0, 0xb4, 0x58, 0x7a, 0x36, 0x33, 0x7b, 0x60, 0x6c, + 0x73, 0x90, 0x2e, 0x97, 0x71, 0x85, 0x39, 0xd0, 0x7b, 0x50, 0xa5, 0x6f, + 0x84, 0x7d, 0x95, 0xca, 0xce, 0x33, 0x9e, 0x37, 0xb5, 0x92, 0x9a, 0x83, + 0x7b, 0xcf, 0xb6, 0x3b, 0x67, 0x6c, 0x32, 0x45, 0xbf, 0xc7, 0x68, 0x4a, + 0xd0, 0x69, 0x6d, 0x57, 0x53, 0x52, 0x90, 0x52, 0x3f, 0xb9, 0xa1, 0x6b, + 0x89, 0x90, 0x52, 0x36, 0x5d, 0x68, 0x58, 0x48, 0x5d, 0x87, 0xdb, 0x58, + 0x7b, 0x72, 0x9f, 0x6b, 0x85, 0x81, 0x54, 0x66, 0x3b, 0x9b, 0xbc, 0x8d, + 0xd7, 0x5e, 0x96, 0xc1, 0x37, 0x8c, 0x8e, 0xb8, 0x6d, 0xb1, 0x72, 0x55, + 0x89, 0xb9, 0x59, 0xa8, 0x90, 0xb8, 0x48, 0x38, 0x5a, 0x3e, 0x9d, 0x89, + 0xbb, 0x63, 0x69, 0x67, 0xc9, 0x5d, 0x9b, 0xc3, 0xba, 0xcd, 0x60, 0x5f, + 
0x69, 0xd2, 0x9c, 0xc0, 0x83, 0x5f, 0x51, 0x52, 0xc9, 0x54, 0xb6, 0x71, + 0x84, 0x3b, 0x86, 0x84, 0xa8, 0x92, 0x4e, 0x3d, 0x4c, 0x40, 0x64, 0x51, + 0xc0, 0x40, 0x96, 0x8b, 0xcf, 0x70, 0x41, 0x48, 0xc4, 0xb1, 0x90, 0xcc, + 0x97, 0x9b, 0xc7, 0x9f, 0x7c, 0x9e, 0x59, 0x8c, 0xc2, 0x6a, 0x7a, 0xc6, + 0x8d, 0x65, 0x66, 0xad, 0x66, 0xa9, 0x53, 0x57, 0xab, 0x7e, 0xcf, 0xaa, + 0x91, 0xd2, 0xb7, 0x84, 0x33, 0x4e, 0x5b, 0xb8, 0x40, 0x3c, 0xc2, 0x8c, + 0xb1, 0x3c, 0x8e, 0x7c, 0x36, 0xc5, 0x6b, 0xa1, 0x52, 0xaa, 0x3d, 0x6a, + 0xa9, 0x5f, 0x52, 0x66, 0x8b, 0x83, 0xbb, 0xa5, 0xce, 0x8c, 0x81, 0xb0, + 0x78, 0x84, 0xa2, 0x71, 0xab, 0xb3, 0x37, 0xa2, 0x45, 0x64, 0x48, 0x37, + 0x6e, 0x47, 0x8a, 0x38, 0x4e, 0xb7, 0x93, 0xad, 0x64, 0x50, 0x8e, 0x33, + 0x9a, 0x47, 0xd2, 0x7a, 0x58, 0x71, 0xd3, 0x67, 0x32, 0x73, 0xaf, 0x5e, + 0x70, 0xb5, 0x77, 0xb1, 0x47, 0x46, 0x9e, 0x8f, 0x8a, 0x42, 0xac, 0x44, + 0xb5, 0x94, 0x3e, 0x5e, 0x49, 0x6d, 0x4c, 0x61, 0x59, 0x57, 0xc2, 0x93, + 0x8c, 0x74, 0x57, 0x5e, 0x52, 0x9d, 0xd0, 0x4f, 0x4e, 0x67, 0x36, 0x6c, + 0xbd, 0x7f, 0x6f, 0xb7, 0xc9, 0x2f, 0x99, 0xb9, 0x9f, 0x87, 0xc4, 0xa7, + 0xb1, 0x97, 0x40, 0x8b, 0xcb, 0xce, 0xa5, 0xb6, 0xc1, 0x42, 0xb6, 0x75, + 0xbf, 0x3a, 0x79, 0x85, 0x36, 0x46, 0x82, 0x73, 0x62, 0x9a, 0x37, 0x3c, + 0x84, 0x33, 0x8e, 0x4b, 0xc7, 0x80, 0x91, 0xad, 0x6f, 0x92, 0x35, 0x64, + 0x33, 0x67, 0xb7, 0x9f, 0x3f, 0x6b, 0x50, 0x6c, 0x4f, 0xac, 0x4b, 0x40, + 0xaf, 0xac, 0xa9, 0x40, 0xb1, 0xc9, 0x87, 0xb5, 0x9c, 0xb0, 0xd4, 0xc3, + 0xa7, 0x94, 0x3f, 0x63, 0x74, 0x5f, 0x95, 0x5b, 0x7a, 0x3c, 0xbf, 0xa5, + 0xbc, 0x55, 0xc2, 0x9b, 0xc1, 0x84, 0x4f, 0x6c, 0x3e, 0xcb, 0xb9, 0x66, + 0x57, 0x75, 0x60, 0x93, 0x5a, 0x89, 0x92, 0x58, 0x9d, 0xc5, 0x35, 0x7b, + 0x4d, 0x8f, 0x88, 0x96, 0x73, 0xa8, 0x72, 0x3b, 0x32, 0xbf, 0x80, 0x58, + 0x3d, 0x8b, 0x39, 0x85, 0x3e, 0xb6, 0x9e, 0xae, 0xa1, 0x79, 0x6b, 0xa6, + 0x59, 0x88, 0x62, 0x6c, 0xd3, 0xc7, 0x60, 0x87, 0x84, 0xbe, 0x6d, 0x80, + 0x40, 0x6b, 0x66, 0xba, 0xab, 0x72, 0xab, 0xa0, 0x63, 0x48, 0x34, 0x90, + 0x35, 0xab, 0xd1, 0x64, 0xc0, 0x33, 0x71, 0x79, 0xc5, 0x77, 0x86, 0x6f, + 0xb5, 0x74, 0x81, 0xbe, 0xcf, 0x44, 0x9a, 0xa5, 0x6d, 0x8c, 0x93, 0xbc, + 0x80, 0xc2, 0x7d, 0x59, 0xbf, 0xb4, 0xca, 0x82, 0xcf, 0xd1, 0x4a, 0x63, + 0x44, 0x56, 0x9e, 0x9c, 0x51, 0x53, 0x99, 0x3b, 0x86, 0x7e, 0x61, 0x7b, + 0x88, 0x33, 0xc6, 0xa7, 0x50, 0xbd, 0x8c, 0x76, 0xbd, 0x79, 0x58, 0xb0, + 0xcf, 0x8b, 0x64, 0x6f, 0x62, 0x83, 0x85, 0x62, 0x38, 0x4c, 0x8d, 0xd0, + 0xb4, 0x93, 0xb4, 0xa3, 0xc4, 0x3d, 0x7d, 0x41, 0x3d, 0x30, 0xc6, 0xbf, + 0xc7, 0x78, 0xc1, 0x9c, 0x92, 0x55, 0x41, 0x54, 0x4b, 0xa4, 0x3f, 0x59, + 0x84, 0xbd, 0x60, 0x3b, 0x7c, 0xaa, 0x39, 0xc2, 0x82, 0xb6, 0xa4, 0x42, + 0xc3, 0x33, 0xb6, 0x3c, 0xb7, 0x73, 0x93, 0x6a, 0xbf, 0x71, 0x42, 0xb5, + 0x46, 0xa9, 0x8c, 0x76, 0x86, 0x6a, 0x66, 0xb4, 0x36, 0x42, 0x88, 0xae, + 0x61, 0x5a, 0xa1, 0xd3, 0xb1, 0x96, 0x59, 0x64, 0x60, 0xb2, 0x9e, 0x5d, + 0x71, 0x7a, 0xac, 0xa8, 0xb4, 0x5d, 0xc9, 0xcd, 0x48, 0x33, 0xc4, 0x41, + 0x33, 0xaf, 0xb2, 0x7b, 0x64, 0x8f, 0x59, 0xc7, 0x51, 0x52, 0x68, 0x6f, + 0xb1, 0xcb, 0x6a, 0x83, 0x97, 0x85, 0x8a, 0x43, 0x8a, 0xb0, 0xc6, 0xa5, + 0xc4, 0x55, 0x9f, 0x44, 0x8a, 0x65, 0x6c, 0xc6, 0x59, 0xad, 0x3c, 0x5b, + 0xb1, 0x99, 0x86, 0x5a, 0x5e, 0x89, 0x34, 0x4e, 0xbc, 0x73, 0xc4, 0x7d, + 0x88, 0xc9, 0x4a, 0x37, 0x93, 0x41, 0x3e, 0x6f, 0xa6, 0x7d, 0xca, 0xa0, + 0x42, 0x6a, 0x38, 0x5d, 0x9f, 0x88, 0xba, 0xc8, 0xb7, 0xcd, 0x3e, 0x7b, + 0x85, 0x43, 0x9d, 0x57, 0xc1, 0x63, 0xbe, 0xad, 0xc9, 0xa8, 0x7b, 0x3c, + 0xc0, 0x58, 0xac, 0xc4, 0x5c, 0xba, 0x90, 0x52, 0x6f, 0x45, 0xbe, 0x48, + 
0x97, 0x85, 0x35, 0x56, 0xd0, 0x81, 0xac, 0x54, 0x5d, 0xbc, 0x41, 0xa2, + 0x33, 0xac, 0x8a, 0x7e, 0x82, 0x94, 0xd0, 0x6d, 0xcc, 0xd2, 0x79, 0xaf, + 0xb4, 0xa4, 0x81, 0xc2, 0xc2, 0x43, 0xc3, 0x76, 0x3f, 0x4f, 0x62, 0x31, + 0x5d, 0x8a, 0x5c, 0x95, 0x55, 0x82, 0x39, 0xbb, 0x73, 0x7b, 0x4c, 0x78, + 0x52, 0x4e, 0x59, 0xa4, 0x5d, 0x6f, 0xa2, 0xcb, 0xbb, 0xc8, 0x89, 0x6c, + 0x49, 0x58, 0x96, 0x88, 0x7e, 0xb4, 0x31, 0x37, 0x53, 0xb0, 0x78, 0x68, + 0x97, 0xb2, 0xa3, 0x41, 0xbf, 0x9f, 0x76, 0x3b, 0x74, 0xab, 0x70, 0x46, + 0x6a, 0x9b, 0xa8, 0xad, 0xbd, 0x6b, 0xc4, 0xad, 0xb1, 0x3b, 0x6e, 0xd0, + 0xaa, 0x8a, 0x8c, 0xa7, 0xb4, 0x88, 0x5d, 0x47, 0xbe, 0xb2, 0xca, 0x84, + 0xa6, 0x93, 0x3a, 0x59, 0x74, 0xd0, 0x33, 0x99, 0xbf, 0x87, 0x72, 0xc4, + 0x7b, 0xb2, 0xb9, 0x34, 0x3e, 0x54, 0xce, 0x36, 0x79, 0x59, 0x95, 0x56, + 0xa4, 0x50, 0x97, 0x5c, 0xac, 0xb6, 0x37, 0x4f, 0x4a, 0x54, 0xaa, 0x57, + 0xb3, 0x77, 0x70, 0xa4, 0xb4, 0x84, 0x3d, 0x5e, 0x94, 0xb9, 0x72, 0x36, + 0x6d, 0x74, 0x8d, 0x4b, 0x96, 0x96, 0x42, 0x30, 0x4f, 0xbe, 0xc7, 0x52, + 0x9d, 0xa7, 0x7f, 0x7f, 0x7d, 0xc8, 0x7b, 0x92, 0x9d, 0x4b, 0x5c, 0x81, + 0xbf, 0x5d, 0xcd, 0x64, 0x7f, 0xb8, 0x32, 0x32, 0x49, 0x78, 0x70, 0x6b, + 0x72, 0xb2, 0x85, 0x85, 0x62, 0x59, 0xab, 0xbc, 0xc6, 0xa6, 0xcf, 0x5f, + 0x6e, 0xcb, 0xb0, 0x45, 0xa7, 0x50, 0x8a, 0xc7, 0x9d, 0x44, 0x45, 0xa1, + 0xba, 0xc2, 0xbc, 0xcc, 0x80, 0x3c, 0x76, 0x45, 0x75, 0x48, 0x70, 0x95, + 0x66, 0x79, 0xac, 0x49, 0x50, 0x56, 0x41, 0x4a, 0xb9, 0x4a, 0xca, 0x4a, + 0xd1, 0xb6, 0x95, 0x5a, 0x98, 0x5e, 0xb8, 0xc1, 0x67, 0xd1, 0x95, 0x7f, + 0x45, 0x8a, 0x92, 0x92, 0x81, 0x72, 0x72, 0x5a, 0x73, 0x5a, 0x87, 0x9f, + 0x37, 0xbc, 0x6e, 0x9e, 0x82, 0x79, 0x89, 0xd1, 0xa6, 0x54, 0x67, 0x54, + 0x7d, 0x39, 0x94, 0x8e, 0x58, 0x9b, 0x6b, 0xb7, 0x63, 0xc8, 0x87, 0xa7, + 0x5e, 0x8b, 0x69, 0xbc, 0xbb, 0x5c, 0x56, 0x8c, 0xae, 0xc5, 0x47, 0xd3, + 0x6a, 0x50, 0x5e, 0xb0, 0x5e, 0xb9, 0xa6, 0x37, 0x9b, 0x58, 0xbb, 0xa1, + 0x54, 0xb4, 0xa6, 0x73, 0xce, 0xa4, 0xc8, 0xc1, 0x38, 0x57, 0x37, 0x40, + 0xa9, 0xb4, 0x82, 0xac, 0x3e, 0x58, 0x65, 0x31, 0x9a, 0x8d, 0xb1, 0x55, + 0x4c, 0xab, 0x53, 0x63, 0xba, 0xb9, 0xcb, 0x96, 0x76, 0xc8, 0x6f, 0x63, + 0x78, 0x78, 0x6c, 0x7e, 0x79, 0x8f, 0x3f, 0x68, 0xb5, 0x6f, 0xc1, 0xa8, + 0x35, 0x33, 0xa4, 0x94, 0x64, 0xc4, 0xcd, 0x7c, 0xc7, 0xab, 0x51, 0x77, + 0xc2, 0x4c, 0x3e, 0x4d, 0xa6, 0xb9, 0x58, 0x66, 0x58, 0xc6, 0x53, 0x91, + 0xbe, 0x88, 0x77, 0x63, 0xbd, 0x65, 0xa2, 0x8b, 0x68, 0x43, 0x43, 0xbc, + 0xc7, 0x98, 0x3b, 0xbe, 0x50, 0xb8, 0x42, 0xca, 0xae, 0x65, 0xce, 0x3f, + 0xce, 0x7e, 0x9e, 0xa7, 0x84, 0x67, 0xaa, 0x54, 0x7e, 0x5b, 0x8c, 0x89, + 0xac, 0x87, 0xc4, 0x5f, 0x35, 0x61, 0xc4, 0xc0, 0x38, 0x83, 0xac, 0x68, + 0x82, 0x4e, 0x85, 0x4e, 0x9d, 0xc8, 0x4b, 0x86, 0xc5, 0x88, 0x71, 0x7d, + 0xbb, 0xa1, 0x62, 0x6d, 0x4a, 0x74, 0xb8, 0x72, 0xac, 0xd3, 0x44, 0x53, + 0x3c, 0xa8, 0xb2, 0xbb, 0x67, 0xba, 0xa6, 0xca, 0x9f, 0x75, 0x8b, 0x58, + 0x4b, 0x76, 0x96, 0x7d, 0x49, 0x42, 0xbb, 0x69, 0x30, 0x34, 0xb8, 0xb3, + 0xb4, 0x35, 0x52, 0x3c, 0x85, 0x3b, 0x52, 0xba, 0x57, 0x8b, 0xd4, 0x3b, + 0x58, 0x98, 0x47, 0x5e, 0x65, 0x99, 0x41, 0x56, 0x3b, 0x73, 0x93, 0x39, + 0x4d, 0x8d, 0x9e, 0x71, 0x8e, 0x80, 0x70, 0x42, 0xc9, 0xa3, 0x48, 0xc0, + 0xcc, 0xd1, 0xc7, 0x9e, 0x8b, 0x3e, 0x4e, 0xba, 0x73, 0xc2, 0x45, 0x53, + 0x95, 0xd3, 0xd1, 0x52, 0x88, 0x65, 0x77, 0x38, 0x6d, 0x59, 0x34, 0xcf, + 0x73, 0xcd, 0x8f, 0x8f, 0x97, 0x88, 0x4f, 0x6c, 0x6b, 0xba, 0xb2, 0x3a, + 0x57, 0xc7, 0x4f, 0x63, 0x51, 0x88, 0xc4, 0xb1, 0xbc, 0x5b, 0x33, 0xb5, + 0xa1, 0xa7, 0x3f, 0x5e, 0xa9, 0x4a, 0x6d, 0x59, 0x49, 0x43, 0xd0, 0x4f, + 
0x44, 0x32, 0xaf, 0x3a, 0xcf, 0xb9, 0x37, 0x91, 0x6d, 0x8b, 0xb3, 0xc4, + 0xc2, 0x3f, 0x65, 0x89, 0x8a, 0xc9, 0x7b, 0x4c, 0x37, 0x7e, 0xae, 0x98, + 0x99, 0x62, 0x68, 0x76, 0x91, 0x65, 0x60, 0xba, 0x7d, 0x97, 0x7c, 0x92, + 0xba, 0x8e, 0x4a, 0x41, 0xc8, 0xc6, 0xbb, 0x5c, 0xa8, 0x61, 0xc6, 0x8a, + 0xa5, 0x4b, 0x98, 0x9e, 0x95, 0x3e, 0x47, 0x52, 0x84, 0x47, 0x52, 0xa4, + 0x5d, 0x5b, 0x44, 0x74, 0x5a, 0x9c, 0x84, 0x77, 0xd0, 0x7b, 0x34, 0x53, + 0xa3, 0xb0, 0xa8, 0x9b, 0x7e, 0x56, 0x87, 0x5b, 0x68, 0xcc, 0x54, 0x59, + 0x40, 0x8d, 0x63, 0x86, 0xad, 0xaf, 0x33, 0xc4, 0xbf, 0xa1, 0x6a, 0x6e, + 0xbc, 0x7e, 0xcc, 0xc4, 0x6b, 0x86, 0x6c, 0x9f, 0x54, 0x4b, 0x55, 0xa1, + 0x3e, 0xa5, 0x36, 0x4b, 0x6b, 0xb7, 0xbf, 0x53, 0xa1, 0x57, 0x3a, 0x38, + 0x75, 0xcf, 0x6a, 0xbd, 0x43, 0x4c, 0xbf, 0xd1, 0xb0, 0xb0, 0xc4, 0x6b, + 0xc4, 0xbc, 0x78, 0x76, 0xa5, 0x55, 0x57, 0x52, 0x92, 0x6c, 0x90, 0xbe, + 0x78, 0xa0, 0x70, 0x86, 0x46, 0x30, 0x60, 0xb1, 0x90, 0x3a, 0x8c, 0x56, + 0xad, 0xb4, 0x63, 0xd3, 0xb0, 0x59, 0x8e, 0xb4, 0x34, 0xc1, 0x4e, 0x95, + 0x3d, 0x6d, 0xa9, 0xab, 0x9a, 0x84, 0xb7, 0x5e, 0x7f, 0x56, 0xc5, 0x94, + 0x53, 0x90, 0x98, 0x8e, 0x5f, 0x75, 0xad, 0x92, 0xb3, 0xac, 0x48, 0x90, + 0x89, 0x49, 0x4a, 0x5a, 0x4f, 0x5f, 0x56, 0xab, 0x56, 0xcf, 0x3e, 0x50, + 0x89, 0x45, 0x4f, 0x56, 0x33, 0x63, 0x66, 0x47, 0xa2, 0x46, 0xc4, 0x8f, + 0x4a, 0x53, 0xc0, 0x36, 0x62, 0x6c, 0x7a, 0x66, 0x62, 0x7b, 0xcc, 0x8f, + 0xad, 0x6b, 0x9c, 0x5f, 0xb7, 0x6e, 0xd1, 0xb4, 0xd5, 0x63, 0xcc, 0x80, + 0x37, 0xb8, 0xa4, 0x5c, 0x7b, 0xb4, 0xc4, 0xcd, 0x89, 0x8f, 0x9b, 0x98, + 0x82, 0x8c, 0x3f, 0xc0, 0x3f, 0x8d, 0x77, 0x73, 0x43, 0xc7, 0x2f, 0x85, + 0x42, 0x35, 0xa8, 0x45, 0xa6, 0x46, 0x68, 0xad, 0x8b, 0x38, 0xb0, 0x48, + 0x9b, 0xc2, 0x86, 0x50, 0x72, 0xb6, 0x87, 0x3d, 0xa7, 0x8d, 0x95, 0xb7, + 0x5e, 0x45, 0x6d, 0x4d, 0xc3, 0x6a, 0x30, 0x8e, 0x7c, 0xce, 0x7f, 0x46, + 0xbb, 0x4b, 0x5e, 0xc2, 0x64, 0x55, 0xab, 0x43, 0x70, 0xa2, 0x3c, 0x80, + 0x57, 0x91, 0x69, 0xcd, 0x89, 0x5d, 0x98, 0xb0, 0x5b, 0xaa, 0x47, 0x7a, + 0xcf, 0x9f, 0x5b, 0x54, 0x47, 0x58, 0x46, 0x54, 0xcb, 0xa3, 0x4d, 0x83, + 0xcc, 0xa1, 0xd0, 0xbb, 0x5d, 0x8d, 0x9b, 0x4e, 0x3d, 0x3c, 0x61, 0xbd, + 0x3c, 0x36, 0x88, 0x7d, 0x62, 0x8b, 0x7b, 0x84, 0x9a, 0x52, 0x6c, 0x67, + 0x9a, 0x78, 0x32, 0x93, 0xab, 0x78, 0x71, 0x48, 0x9f, 0x55, 0x7e, 0x9b, + 0x39, 0xd2, 0x54, 0x47, 0x2f, 0x5d, 0x47, 0x50, 0xc8, 0xb3, 0xc2, 0xd4, + 0x5a, 0x9f, 0x6d, 0xcb, 0x3b, 0x59, 0xc3, 0x42, 0x58, 0x8c, 0x33, 0xb0, + 0x35, 0x71, 0xd1, 0x3f, 0x8f, 0x42, 0x9d, 0xb4, 0x7e, 0x83, 0x6c, 0x60, + 0xcc, 0x95, 0xc6, 0x6f, 0xae, 0x84, 0xba, 0x99, 0x45, 0x62, 0x53, 0x50, + 0x81, 0x81, 0x53, 0xbc, 0x41, 0x3f, 0xa1, 0xc9, 0x85, 0x40, 0x8f, 0xa7, + 0x59, 0xb4, 0xa4, 0x4f, 0x34, 0x4a, 0x85, 0xc0, 0xcf, 0xcf, 0xa5, 0x99, + 0x72, 0xc5, 0xcd, 0x9b, 0x66, 0x57, 0x4a, 0x55, 0x71, 0x63, 0x96, 0x83, + 0xa8, 0x45, 0x41, 0x69, 0x7d, 0xb4, 0x43, 0x66, 0x99, 0x47, 0x65, 0xae, + 0x9e, 0x92, 0xbe, 0x9a, 0xad, 0x9d, 0xad, 0xd2, 0x70, 0xb8, 0x9c, 0x75, + 0xc6, 0x70, 0x80, 0x36, 0x40, 0x74, 0x8e, 0x5a, 0xb7, 0x3c, 0x6b, 0x4d, + 0xb7, 0xa9, 0xb7, 0x60, 0xb1, 0xb7, 0x89, 0x66, 0x46, 0x5d, 0xa5, 0xa9, + 0x4c, 0x37, 0xcf, 0x68, 0x7b, 0xb9, 0xb1, 0xbd, 0xcb, 0x85, 0x8e, 0xcd, + 0x55, 0xc2, 0x59, 0xac, 0x6d, 0x95, 0x5e, 0x47, 0xc6, 0xb2, 0x87, 0x9c, + 0xb1, 0x76, 0xcb, 0x8d, 0xa9, 0x3c, 0x70, 0x60, 0x64, 0x61, 0xcd, 0x93, + 0x91, 0x7f, 0x4d, 0xb7, 0xc7, 0x8c, 0x31, 0x96, 0xa9, 0x98, 0xc1, 0x48, + 0x61, 0xc5, 0x81, 0x57, 0x53, 0xa6, 0xca, 0xcf, 0xc6, 0x6c, 0xc2, 0x80, + 0x45, 0x88, 0xa2, 0xad, 0x98, 0xb1, 0x64, 0x7c, 0x66, 0x76, 0x94, 0xc3, + 
0x5b, 0x44, 0xbd, 0x88, 0x37, 0xc7, 0x79, 0xc5, 0x8f, 0x2f, 0x72, 0x6c, + 0xab, 0x8b, 0x5d, 0x8a, 0x3c, 0xc6, 0x50, 0xd0, 0x87, 0x35, 0xa6, 0x45, + 0xb0, 0x61, 0x6b, 0x7a, 0x95, 0xa2, 0x87, 0x73, 0x41, 0xd0, 0xb0, 0x36, + 0xb7, 0x46, 0x6b, 0xa8, 0x38, 0x43, 0xad, 0x6a, 0x72, 0x9f, 0xad, 0x7e, + 0x6e, 0x64, 0x68, 0x49, 0x72, 0x67, 0x51, 0x9d, 0xc3, 0x8b, 0x4e, 0x8b, + 0x7b, 0x6c, 0x4f, 0x5b, 0x36, 0x80, 0xb9, 0x49, 0x51, 0xcf, 0xc8, 0x7d, + 0xbf, 0xa1, 0x66, 0x46, 0xbf, 0x98, 0x53, 0x38, 0xa6, 0x49, 0x70, 0x7c, + 0xbc, 0x9d, 0x8f, 0x60, 0xb5, 0x72, 0xb1, 0xc1, 0x58, 0x3e, 0x40, 0xce, + 0x91, 0x89, 0xcc, 0xdb, 0xab, 0x4a, 0x58, 0xa9, 0x4b, 0x85, 0x85, 0xc1, + 0x89, 0x5b, 0xd4, 0x96, 0x6d, 0x98, 0x2f, 0x31, 0x34, 0x86, 0x3c, 0x4e, + 0xb0, 0x57, 0x3f, 0x39, 0xa4, 0x97, 0xb4, 0x53, 0x38, 0x41, 0xd0, 0x8a, + 0x3f, 0xbf, 0x63, 0x9b, 0x9c, 0x61, 0x64, 0x4e, 0xb6, 0xd3, 0x6c, 0x87, + 0x9a, 0x3c, 0xc6, 0xa6, 0xd0, 0xaf, 0x69, 0x82, 0x4f, 0x6f, 0x38, 0x53, + 0xb8, 0xb9, 0x6f, 0xcb, 0x88, 0x87, 0x69, 0x82, 0x6b, 0x3b, 0x55, 0x82, + 0x5b, 0x72, 0xbe, 0xb9, 0x74, 0x94, 0xd6, 0xb5, 0x82, 0xc9, 0x90, 0x34, + 0x5f, 0xc1, 0xcf, 0xd8, 0x86, 0x59, 0x44, 0x58, 0xce, 0x9e, 0x35, 0xa0, + 0xd0, 0x93, 0xca, 0xc2, 0x90, 0xa1, 0xac, 0xbc, 0xb2, 0x70, 0x34, 0x36, + 0xa8, 0xd2, 0x90, 0xb9, 0x9c, 0x94, 0x61, 0x53, 0x41, 0x8b, 0x9d, 0x8b, + 0x6b, 0x3e, 0x7c, 0xcd, 0x7f, 0x7a, 0x9c, 0x49, 0x48, 0x46, 0xc2, 0xbb, + 0xa8, 0xae, 0x6e, 0xbe, 0x68, 0x3e, 0x3b, 0xba, 0xc7, 0x65, 0x99, 0x93, + 0x4c, 0x52, 0x47, 0x58, 0x8e, 0x4f, 0xa9, 0xcb, 0x4d, 0x60, 0x3e, 0x83, + 0xa1, 0x71, 0x89, 0xad, 0x50, 0xb4, 0xca, 0xba, 0x80, 0x49, 0xb4, 0x66, + 0xc4, 0x81, 0x6a, 0xaa, 0xb0, 0x48, 0xc7, 0x89, 0x3a, 0x41, 0xca, 0x7d, + 0x83, 0x82, 0x76, 0x97, 0x54, 0xbd, 0x8c, 0x5c, 0xce, 0xb5, 0x59, 0xae, + 0xc0, 0x9e, 0x79, 0x80, 0x86, 0x93, 0x87, 0xba, 0xcc, 0x51, 0xb1, 0x7c, + 0x91, 0x92, 0x81, 0x92, 0x5e, 0x5c, 0x7c, 0xbe, 0x31, 0x7f, 0x7b, 0xd2, + 0x65, 0x7f, 0x51, 0x6d, 0x57, 0xb9, 0x8d, 0x3c, 0x6b, 0x95, 0x77, 0xbf, + 0x52, 0x87, 0x74, 0x45, 0x9d, 0xba, 0x99, 0xb2, 0xab, 0xb3, 0x35, 0xcc, + 0x92, 0x38, 0x44, 0x7e, 0xb6, 0xa7, 0x9e, 0x77, 0x3d, 0x92, 0x57, 0x9f, + 0x39, 0xc1, 0x97, 0x99, 0x60, 0x83, 0xc9, 0x85, 0xcd, 0x45, 0x2d, 0x4c, + 0x96, 0x96, 0x61, 0x70, 0x5a, 0x8c, 0xa8, 0x92, 0x3f, 0x5b, 0x72, 0x54, + 0x38, 0xae, 0x78, 0x34, 0xc6, 0xae, 0x72, 0x64, 0xbe, 0x5e, 0x79, 0x7c, + 0x67, 0x9e, 0xb0, 0xad, 0xad, 0x8e, 0x75, 0x67, 0x76, 0x5a, 0xb2, 0x3a, + 0x6c, 0xb7, 0x3f, 0x99, 0x75, 0x9d, 0x41, 0x41, 0x61, 0x74, 0xb4, 0xa8, + 0x76, 0x81, 0x42, 0x77, 0xd1, 0xa1, 0x4d, 0x60, 0x96, 0x7c, 0x7e, 0x76, + 0x6e, 0x94, 0xc9, 0x34, 0x74, 0x72, 0xcb, 0x9c, 0x5d, 0x3f, 0x74, 0x84, + 0x61, 0xa6, 0x37, 0x91, 0x99, 0x9c, 0x89, 0x3e, 0x53, 0x77, 0x48, 0x6f, + 0x9d, 0x43, 0xc1, 0xc3, 0x97, 0x63, 0xb7, 0x5c, 0xcb, 0x42, 0x71, 0xd1, + 0xcc, 0x64, 0xbe, 0xb1, 0xab, 0xb3, 0x6d, 0xbb, 0xb5, 0xb8, 0x8a, 0xb1, + 0x48, 0xa0, 0x40, 0x77, 0xa3, 0xb3, 0x9f, 0x69, 0x63, 0xc2, 0x80, 0xc0, + 0x85, 0x76, 0x6d, 0x57, 0xc4, 0xaf, 0x35, 0x2d, 0x67, 0x49, 0x30, 0x6e, + 0xc2, 0x44, 0x8e, 0xb2, 0xb4, 0x94, 0xcf, 0xab, 0x35, 0xc9, 0x8c, 0x31, + 0x34, 0x68, 0xba, 0x33, 0x4d, 0x72, 0x5b, 0x8f, 0x94, 0xc8, 0x37, 0x56, + 0x5c, 0x8f, 0x47, 0x8b, 0xd3, 0x5c, 0xc5, 0xa6, 0x9b, 0xa1, 0x35, 0xae, + 0x6c, 0xaf, 0x58, 0x5e, 0x38, 0x6c, 0x93, 0xa7, 0x50, 0x9e, 0x80, 0x3f, + 0x3a, 0x48, 0xa4, 0x46, 0x75, 0xaf, 0x64, 0xc1, 0x7d, 0x6d, 0x79, 0xb1, + 0x59, 0xbf, 0xbd, 0xb0, 0x78, 0x34, 0x6c, 0xc2, 0x93, 0x5b, 0x45, 0x71, + 0x42, 0xbf, 0x93, 0x57, 0xb2, 0x5e, 0xca, 0x6f, 0x68, 0x90, 0x86, 0x77, + 
0xa9, 0x53, 0x2e, 0xc5, 0x72, 0xca, 0xa3, 0xc1, 0x52, 0x7f, 0x51, 0xb8, + 0x86, 0x7e, 0xb7, 0x37, 0x4c, 0x72, 0x32, 0x75, 0xbb, 0xcb, 0x62, 0x68, + 0x59, 0x6c, 0x6d, 0xcf, 0x34, 0x35, 0x57, 0xb6, 0xa0, 0xb8, 0x5e, 0x65, + 0x6d, 0x91, 0xb9, 0x9f, 0x81, 0xd1, 0x4d, 0x34, 0x76, 0xd4, 0x60, 0x77, + 0x32, 0x3f, 0xb0, 0x80, 0x7b, 0xa9, 0xa1, 0x80, 0x85, 0xb0, 0xbd, 0xa8, + 0x3a, 0x6d, 0x6c, 0xa5, 0x5e, 0x9f, 0xa0, 0xd6, 0x39, 0xc5, 0x90, 0xce, + 0xc4, 0xa4, 0x8d, 0x47, 0x8b, 0xca, 0x69, 0x30, 0x87, 0xbd, 0xbd, 0x60, + 0x50, 0x87, 0x53, 0xa9, 0xc1, 0x80, 0x57, 0x7d, 0xc4, 0xba, 0xb1, 0x64, + 0x65, 0x62, 0x68, 0x70, 0xb5, 0x45, 0xb5, 0x84, 0x43, 0x3a, 0x88, 0x69, + 0x8e, 0x5a, 0x43, 0x9b, 0x9e, 0x77, 0x5d, 0xb5, 0x4e, 0x41, 0x8c, 0x9d, + 0x49, 0x41, 0x5f, 0x2a, 0x57, 0x7c, 0x5b, 0x79, 0xb1, 0x5f, 0x38, 0x46, + 0x9d, 0x48, 0x53, 0x79, 0x47, 0xab, 0x65, 0x7a, 0x97, 0xac, 0xb3, 0x74, + 0x81, 0x85, 0xb4, 0xb4, 0xce, 0x9e, 0x9d, 0x78, 0x55, 0x41, 0xad, 0x32, + 0xad, 0x53, 0x9c, 0xd0, 0x63, 0xa7, 0x3a, 0xca, 0x66, 0x63, 0xc5, 0x7a, + 0xa9, 0xc5, 0xab, 0x39, 0xc8, 0xc2, 0x4c, 0x5a, 0x4c, 0x4d, 0x73, 0x72, + 0x58, 0x6d, 0xa6, 0x83, 0xb5, 0x62, 0x77, 0x3d, 0x32, 0x7a, 0x32, 0xb2, + 0xb4, 0x50, 0x63, 0x70, 0xa5, 0x9c, 0x36, 0x79, 0x43, 0x56, 0x63, 0x91, + 0x77, 0xcf, 0x78, 0x40, 0x59, 0x7f, 0x52, 0x48, 0x41, 0x46, 0xa4, 0x5e, + 0x5e, 0xae, 0x49, 0xc0, 0x89, 0x3e, 0x6b, 0xce, 0x38, 0x5b, 0x92, 0x5e, + 0xb1, 0xca, 0x54, 0x52, 0x43, 0x45, 0xac, 0x8c, 0xd0, 0xc7, 0x8a, 0x69, + 0x9f, 0x9a, 0x5a, 0x71, 0x39, 0x7a, 0x99, 0x83, 0xaa, 0xc1, 0x7b, 0x40, + 0x3d, 0xc0, 0x80, 0x93, 0x5d, 0x73, 0x45, 0xca, 0xa4, 0x30, 0x84, 0xb8, + 0x73, 0x42, 0x4a, 0x4f, 0x4b, 0xa0, 0x3a, 0x9b, 0x5c, 0xa5, 0x46, 0xb1, + 0xad, 0x4f, 0x9f, 0xc1, 0x89, 0xbb, 0xb1, 0x92, 0xbc, 0x3d, 0x94, 0x9f, + 0x71, 0xae, 0xad, 0x63, 0x7a, 0x6b, 0x53, 0x99, 0x99, 0x98, 0x34, 0x72, + 0xa7, 0xb3, 0xa9, 0x75, 0x5f, 0x52, 0x42, 0x30, 0xad, 0xa1, 0xa3, 0x3e, + 0x3d, 0xc8, 0x54, 0x31, 0xb5, 0xc0, 0x82, 0x73, 0xb5, 0x37, 0x82, 0xcb, + 0xc4, 0x7e, 0x62, 0x5a, 0xa3, 0x6f, 0xb7, 0x62, 0x97, 0x5d, 0x77, 0x83, + 0x97, 0x38, 0x5a, 0x5f, 0x79, 0x84, 0xa4, 0x37, 0x8e, 0x91, 0x45, 0x95, + 0x80, 0x61, 0x5b, 0x33, 0xcb, 0xac, 0x91, 0x94, 0xbb, 0x9f, 0xb4, 0x5e, + 0x9a, 0x3d, 0x85, 0xc6, 0x36, 0x7c, 0xbe, 0x79, 0xb8, 0x60, 0x43, 0x61, + 0x54, 0x7a, 0xa1, 0xb8, 0x73, 0x45, 0x88, 0xb4, 0x97, 0x64, 0x63, 0xb1, + 0x56, 0x90, 0xbc, 0xa9, 0x75, 0x87, 0x34, 0x81, 0xab, 0xa7, 0x7c, 0x3b, + 0x7b, 0x59, 0x9b, 0x9d, 0x36, 0x3d, 0xa4, 0x6a, 0xa1, 0x57, 0x66, 0x57, + 0x95, 0xce, 0x4e, 0x49, 0x44, 0x77, 0xa8, 0x8a, 0x71, 0xc2, 0xa5, 0x54, + 0x55, 0x98, 0x9a, 0xc2, 0x5d, 0xc3, 0xb4, 0x44, 0x49, 0x9d, 0x3c, 0xae, + 0xc5, 0x52, 0x80, 0x95, 0xc4, 0x55, 0x4f, 0x90, 0x39, 0x92, 0xb8, 0xbd, + 0xa3, 0xc7, 0x7d, 0xd2, 0x7e, 0x96, 0xbc, 0x6e, 0x6c, 0x74, 0xc6, 0xa4, + 0x50, 0x3c, 0xc2, 0x4c, 0x9f, 0x4f, 0x9a, 0xaf, 0x8d, 0x58, 0xc3, 0x7e, + 0x84, 0x8d, 0x73, 0x41, 0x57, 0x50, 0xc9, 0x70, 0x55, 0x6e, 0xd3, 0x57, + 0x45, 0x59, 0x3f, 0x76, 0xb9, 0xb6, 0xc2, 0x5d, 0x32, 0xc9, 0x32, 0x48, + 0x6a, 0x7b, 0x5f, 0xa7, 0xd3, 0x7a, 0xa3, 0x6e, 0xbf, 0x5a, 0x67, 0x8c, + 0x8e, 0xbe, 0x72, 0xaa, 0xbe, 0x6c, 0xc4, 0xb5, 0x84, 0x3d, 0x74, 0xc6, + 0x78, 0x36, 0x71, 0xa8, 0x69, 0xb6, 0x65, 0x3c, 0x35, 0x57, 0x57, 0x71, + 0x4e, 0x8d, 0xa4, 0x60, 0x4a, 0x60, 0x5a, 0x4c, 0x3c, 0x63, 0xaa, 0xab, + 0x35, 0x38, 0x35, 0xcb, 0x56, 0xc1, 0xa1, 0xb0, 0x97, 0x4c, 0xb6, 0xb6, + 0x8a, 0xb9, 0x87, 0x64, 0x56, 0x63, 0x45, 0x4c, 0x6b, 0x8a, 0xd0, 0xae, + 0x89, 0x72, 0x6a, 0x98, 0xa8, 0x4e, 0xc6, 0x4c, 0x80, 0x7a, 0x6d, 0x9a, + 
0x97, 0x3c, 0xc5, 0xa0, 0x98, 0x83, 0x85, 0x33, 0x70, 0x93, 0x4b, 0x8b, + 0x95, 0xbb, 0x50, 0x35, 0xa2, 0x98, 0x75, 0xb7, 0xc8, 0xb3, 0x36, 0xb2, + 0xa2, 0x8c, 0xb2, 0xc3, 0x69, 0x82, 0x43, 0xbb, 0x80, 0x86, 0x7a, 0xa6, + 0x67, 0xce, 0x8a, 0x80, 0x4e, 0xb6, 0xb0, 0xba, 0x6d, 0xb6, 0xa5, 0x8d, + 0x5a, 0x44, 0x61, 0xc2, 0x5d, 0x62, 0x86, 0x4c, 0x31, 0xc1, 0x3c, 0xc7, + 0x77, 0xae, 0x51, 0xc4, 0x62, 0x33, 0x76, 0xa4, 0x90, 0xa3, 0x72, 0xc1, + 0xb0, 0x4b, 0x44, 0x56, 0x74, 0xc1, 0xd0, 0x37, 0x9b, 0xa0, 0xc3, 0xb4, + 0xd1, 0x77, 0x65, 0xb1, 0x45, 0xc9, 0x90, 0x7c, 0x4d, 0x7a, 0x76, 0x93, + 0x54, 0xc5, 0x7d, 0xb2, 0x76, 0xb7, 0x84, 0x4a, 0x8c, 0xb3, 0xd2, 0x8a, + 0x61, 0x84, 0x90, 0xbc, 0x75, 0xbf, 0x72, 0x58, 0x38, 0x41, 0xa7, 0x6f, + 0x85, 0x87, 0x6c, 0xaf, 0x4f, 0x7d, 0x76, 0x9e, 0x9a, 0x56, 0x9b, 0x3f, + 0x4a, 0xb3, 0xc9, 0x69, 0x9b, 0x3c, 0x90, 0x77, 0x41, 0xb5, 0x3d, 0xba, + 0xcf, 0x66, 0xcf, 0x83, 0xcd, 0xae, 0x44, 0x7b, 0x68, 0x7f, 0x78, 0xb0, + 0xa1, 0x9d, 0x56, 0x8f, 0xb6, 0xcc, 0x6f, 0x44, 0x83, 0x53, 0xab, 0x35, + 0x63, 0xb5, 0x99, 0xca, 0x39, 0x64, 0xc8, 0x79, 0xa3, 0x76, 0x95, 0x59, + 0x5c, 0x53, 0xaa, 0x68, 0x89, 0x4d, 0xaa, 0xa9, 0x94, 0x7a, 0x82, 0x88, + 0xa5, 0x86, 0xad, 0xb4, 0xb3, 0xa6, 0xaf, 0x9a, 0xb3, 0x4a, 0xbe, 0xae, + 0x32, 0x9e, 0x53, 0xaa, 0xa6, 0x61, 0x95, 0x8c, 0xb1, 0x77, 0x59, 0x54, + 0x71, 0x84, 0x3e, 0xc3, 0x82, 0xa7, 0x45, 0xc3, 0x78, 0x78, 0xbc, 0x3d, + 0x9d, 0x39, 0xb3, 0x66, 0xd2, 0x72, 0x64, 0xba, 0x87, 0xad, 0x3a, 0xaf, + 0xbb, 0x74, 0x44, 0x34, 0xba, 0xc2, 0xcf, 0xa4, 0x8b, 0x47, 0xbe, 0x78, + 0x86, 0xb5, 0x63, 0x60, 0xa7, 0xa1, 0x93, 0xa3, 0xc9, 0x4e, 0x4b, 0x78, + 0x8b, 0x43, 0x77, 0xcb, 0xba, 0x85, 0x83, 0xa5, 0x5e, 0x58, 0x36, 0xc7, + 0xbd, 0xbf, 0x4f, 0x9a, 0x9f, 0xb6, 0x54, 0xcc, 0x39, 0x9a, 0x7e, 0x96, + 0x49, 0x44, 0xb4, 0x5e, 0x8e, 0x4f, 0x93, 0x8b, 0x74, 0x54, 0x75, 0x68, + 0x50, 0xad, 0x4d, 0x45, 0x65, 0x85, 0x9a, 0x8a, 0x87, 0xd2, 0xd0, 0x8f, + 0x9c, 0x39, 0xd0, 0x45, 0xae, 0x2f, 0x7d, 0x48, 0x53, 0xcb, 0xd2, 0x3d, + 0x79, 0x7d, 0x44, 0x5b, 0xb5, 0xc5, 0x60, 0x5c, 0x5c, 0x96, 0x67, 0x66, + 0x6f, 0x81, 0x7b, 0x42, 0x70, 0xce, 0x3a, 0xd3, 0x8d, 0x51, 0x86, 0x39, + 0x95, 0x7e, 0x89, 0x99, 0x93, 0x39, 0x7b, 0x8d, 0x43, 0xac, 0x51, 0xc5, + 0xa6, 0xbe, 0x93, 0x56, 0x52, 0xba, 0x9c, 0x92, 0x50, 0xae, 0x96, 0xc7, + 0x5b, 0x9e, 0x7e, 0xc5, 0x7d, 0xbe, 0x8c, 0x55, 0x70, 0x82, 0x90, 0x8b, + 0x48, 0x77, 0xb4, 0x46, 0xa8, 0x55, 0x73, 0xc6, 0xbf, 0x9f, 0x8d, 0xc7, + 0x35, 0xc3, 0x2f, 0x3d, 0x49, 0x4b, 0xd4, 0x4d, 0xa7, 0x7b, 0x6d, 0x49, + 0x78, 0x65, 0x74, 0x6f, 0x7a, 0x5f, 0x60, 0x9a, 0xbe, 0x6c, 0x86, 0x95, + 0x54, 0xb3, 0x75, 0x63, 0x90, 0x56, 0x62, 0xca, 0x56, 0x3b, 0x82, 0x52, + 0x63, 0x43, 0x89, 0x52, 0x4d, 0x72, 0x46, 0xb8, 0xcc, 0x45, 0x45, 0x71, + 0x93, 0x90, 0x3a, 0xa4, 0xbc, 0x9c, 0xc9, 0xc6, 0x72, 0x7a, 0x7c, 0x99, + 0x45, 0x4b, 0x38, 0x7d, 0x3a, 0xc3, 0xab, 0x5a, 0xc0, 0x52, 0x4f, 0x46, + 0x35, 0x87, 0x3c, 0xba, 0x87, 0xcb, 0x41, 0xca, 0xa8, 0x4e, 0x7e, 0x55, + 0xbc, 0xac, 0x97, 0x76, 0x31, 0x75, 0x7a, 0x9e, 0x5b, 0x37, 0x68, 0x66, + 0xd3, 0x80, 0x6d, 0xbd, 0x51, 0x56, 0x47, 0x54, 0x49, 0x6d, 0x49, 0x39, + 0x77, 0xb6, 0x35, 0x58, 0x6a, 0x9c, 0xaf, 0x38, 0x35, 0x5b, 0x42, 0x61, + 0x6f, 0xc6, 0xa8, 0xab, 0x2f, 0x37, 0x96, 0x6e, 0x72, 0xa7, 0x2d, 0xad, + 0x3c, 0x72, 0x77, 0x63, 0x8c, 0x86, 0x97, 0x64, 0xc5, 0xbb, 0xca, 0xc5, + 0xcb, 0x74, 0x2b, 0x81, 0x36, 0xbf, 0xcd, 0x9b, 0x7d, 0x74, 0x4e, 0x56, + 0x58, 0xa3, 0x7d, 0xd6, 0x3c, 0x97, 0x47, 0xc6, 0x51, 0x7a, 0x68, 0xc2, + 0xbd, 0xbd, 0xb8, 0x8f, 0xa8, 0x72, 0x74, 0x82, 0x37, 0x84, 0x7e, 0xcb, + 
0xa2, 0xaf, 0xc0, 0x3f, 0x63, 0x48, 0x76, 0x93, 0x8f, 0xa7, 0x68, 0xc8, + 0xba, 0x42, 0x6a, 0x56, 0x9f, 0x45, 0xab, 0xc0, 0x58, 0x6f, 0x4e, 0x82, + 0x85, 0xab, 0xaa, 0x5b, 0x7d, 0x7d, 0x5e, 0xce, 0x94, 0x93, 0x61, 0x3a, + 0x7e, 0xad, 0x5c, 0x3e, 0x9c, 0x6c, 0xc0, 0xb5, 0x93, 0x84, 0x67, 0xb8, + 0x39, 0xc2, 0x5a, 0x77, 0xa4, 0xb2, 0x4a, 0x4e, 0x9c, 0x7c, 0x66, 0x34, + 0x3a, 0x68, 0x5d, 0x7a, 0xa0, 0x30, 0x91, 0x44, 0x6b, 0x3f, 0x49, 0x97, + 0xb4, 0xb8, 0x30, 0x63, 0xb6, 0x83, 0xbc, 0xb1, 0xa5, 0x97, 0xbc, 0xc5, + 0x86, 0x6a, 0x64, 0x35, 0x38, 0x8c, 0xb2, 0x6d, 0x3a, 0xb5, 0xd5, 0x9d, + 0x46, 0x35, 0x87, 0xb4, 0x99, 0xc8, 0x39, 0x9a, 0xa2, 0x85, 0x96, 0x6a, + 0xcb, 0xbf, 0x8a, 0x79, 0x7b, 0x77, 0xc1, 0x7a, 0x92, 0x8e, 0x70, 0x8a, + 0x66, 0x71, 0x65, 0x5f, 0x3a, 0xc5, 0x53, 0xc4, 0x56, 0x8b, 0x85, 0x52, + 0x9c, 0x60, 0xb3, 0x54, 0x75, 0xba, 0xca, 0x47, 0x6c, 0xa1, 0x5f, 0xa4, + 0x3c, 0xa3, 0xb4, 0x5a, 0xae, 0x6b, 0xcb, 0x94, 0x9e, 0xcf, 0x5f, 0x70, + 0x95, 0x94, 0x38, 0xaf, 0x3e, 0xd9, 0x35, 0x3c, 0x5c, 0xa3, 0x2f, 0x57, + 0xa6, 0xc3, 0x7f, 0x85, 0x3b, 0x84, 0xb5, 0x75, 0x57, 0x7e, 0x6b, 0xa3, + 0x67, 0xc4, 0x70, 0x5c, 0xab, 0xd2, 0x8d, 0x5f, 0x68, 0xc0, 0x4a, 0x69, + 0x38, 0x79, 0x62, 0xbd, 0xbe, 0x75, 0x6e, 0x77, 0xd4, 0xa0, 0x56, 0xd7, + 0x69, 0x60, 0x3e, 0xc9, 0xb1, 0x5a, 0x61, 0x32, 0xaf, 0x57, 0x70, 0x88, + 0xcb, 0x4c, 0xad, 0x81, 0x54, 0x45, 0x46, 0x4e, 0x92, 0xb9, 0x44, 0xc1, + 0x9a, 0x73, 0xd3, 0x8b, 0xb4, 0xbc, 0x8b, 0xc7, 0x5b, 0x5a, 0x4e, 0x54, + 0x56, 0x55, 0xc8, 0xa6, 0xc3, 0x3e, 0x7a, 0x37, 0xc0, 0x69, 0x7a, 0x8b, + 0x68, 0x91, 0x84, 0x84, 0x89, 0x92, 0x54, 0x91, 0x87, 0x6a, 0xb1, 0xcd, + 0x4d, 0xc2, 0xc2, 0xa9, 0x4d, 0x56, 0xb6, 0x56, 0xb3, 0x76, 0x7f, 0x89, + 0x62, 0xc0, 0x66, 0x90, 0x8b, 0x5c, 0x96, 0x98, 0x78, 0x46, 0x5c, 0xa1, + 0x94, 0xd2, 0x97, 0x53, 0x78, 0x74, 0x71, 0x97, 0xc3, 0x70, 0x65, 0x55, + 0x5e, 0xc4, 0x9a, 0x93, 0x98, 0xbb, 0x8c, 0x4a, 0x9c, 0x76, 0x7f, 0x62, + 0xa7, 0x5e, 0x68, 0x82, 0x5e, 0x51, 0x72, 0x53, 0xcd, 0x4b, 0xb0, 0xb1, + 0x7c, 0x50, 0xa6, 0x64, 0x78, 0xc4, 0x66, 0x8b, 0xc3, 0xce, 0xa4, 0x67, + 0x77, 0x85, 0x7d, 0xac, 0x60, 0x70, 0xa9, 0x3a, 0xc5, 0xa5, 0xa4, 0xc5, + 0xc6, 0xac, 0xaf, 0x8a, 0x57, 0x5c, 0x76, 0x7a, 0x8f, 0xa0, 0x3a, 0x68, + 0x72, 0x97, 0x3e, 0x49, 0x74, 0xc1, 0x90, 0x5d, 0x3f, 0x36, 0x94, 0x99, + 0xbe, 0xc8, 0xad, 0x58, 0xa1, 0x27, 0x2d, 0xcc, 0x50, 0x41, 0x71, 0x94, + 0xac, 0x5a, 0xae, 0x47, 0x4b, 0xca, 0x4f, 0xb5, 0xd3, 0x5f, 0x2b, 0x8d, + 0xcb, 0x9b, 0x60, 0x6d, 0x8d, 0x8c, 0x6a, 0xaf, 0x82, 0x5d, 0x9b, 0xcc, + 0x42, 0xa5, 0x8e, 0x95, 0x71, 0x93, 0xd1, 0x98, 0xa4, 0x4f, 0xc6, 0xa0, + 0xaf, 0xd1, 0x6d, 0x86, 0x7d, 0xc2, 0x82, 0xd1, 0xbe, 0x62, 0xbd, 0x36, + 0x57, 0x57, 0x8f, 0x5d, 0xb5, 0x4d, 0x40, 0xb4, 0xb8, 0xb9, 0x55, 0xa0, + 0xbf, 0x91, 0xa9, 0x2f, 0xc8, 0x97, 0x47, 0x62, 0x7b, 0x8a, 0xcb, 0x66, + 0x8b, 0x3a, 0x8b, 0xbf, 0x8f, 0x3e, 0x48, 0x5d, 0x8e, 0x8c, 0xd2, 0x44, + 0x67, 0x78, 0x98, 0x7b, 0x44, 0xc1, 0xca, 0x6e, 0xa2, 0x89, 0xb1, 0x8d, + 0xbc, 0xad, 0x75, 0x88, 0xa2, 0x86, 0x48, 0x8f, 0x59, 0x40, 0x71, 0xda, + 0x7c, 0x88, 0xa0, 0xa5, 0x8a, 0x34, 0xa5, 0xc8, 0xa8, 0x9f, 0x48, 0xa6, + 0x4e, 0x9a, 0x64, 0xb2, 0xcb, 0xa2, 0x66, 0x41, 0x70, 0x53, 0x89, 0x47, + 0x4b, 0x49, 0x96, 0x4d, 0xd1, 0x45, 0x34, 0x82, 0xd9, 0xa9, 0xbc, 0xa3, + 0xb7, 0x90, 0xb9, 0x34, 0xbf, 0x89, 0x70, 0x93, 0x87, 0xd1, 0x68, 0x86, + 0x99, 0x5a, 0x7f, 0x76, 0xd1, 0xa3, 0xb0, 0x6e, 0x60, 0x42, 0x51, 0x8a, + 0xc9, 0x9a, 0x7a, 0x88, 0xaf, 0x3b, 0xba, 0x32, 0x38, 0x7d, 0xaa, 0x73, + 0xc9, 0x9d, 0xb3, 0x45, 0xa8, 0xc6, 0xa1, 0x9a, 0x6e, 0x51, 0x87, 0x37, + 
0x55, 0x48, 0x50, 0x74, 0x48, 0x9c, 0x8e, 0x2b, 0xd3, 0x8d, 0x51, 0x51, + 0x59, 0xb6, 0x76, 0x36, 0x2f, 0xa3, 0x9d, 0x37, 0x8f, 0x56, 0x58, 0x57, + 0x82, 0x6a, 0xb8, 0x4d, 0x83, 0x9c, 0xcf, 0x95, 0x4f, 0xce, 0xa1, 0xc3, + 0xc9, 0xb0, 0xb3, 0x6e, 0xa2, 0xca, 0xae, 0x80, 0x72, 0xcd, 0x56, 0xcd, + 0x44, 0x50, 0xb7, 0xc7, 0x41, 0xad, 0xb7, 0x49, 0xc3, 0x4a, 0x88, 0x8e, + 0x79, 0xac, 0x83, 0x4a, 0x50, 0xb3, 0x47, 0x4d, 0x7f, 0x6e, 0xcb, 0xcc, + 0xab, 0x41, 0xb5, 0x50, 0x92, 0xcb, 0xd2, 0x34, 0xba, 0x40, 0x95, 0xa0, + 0x5a, 0x69, 0x7c, 0x5b, 0xba, 0x3e, 0x36, 0x4d, 0x46, 0x82, 0x67, 0x98, + 0x8a, 0x63, 0x83, 0xbd, 0x64, 0x40, 0xc6, 0x4e, 0x3e, 0x99, 0x83, 0xd1, + 0xc9, 0xbe, 0xac, 0x49, 0x45, 0x8c, 0x75, 0xca, 0x81, 0x6e, 0x39, 0x80, + 0x7d, 0x7f, 0x8c, 0x67, 0x6b, 0xa1, 0x88, 0x82, 0x97, 0x56, 0x57, 0x7a, + 0x52, 0x6d, 0x5b, 0x4c, 0x3a, 0xad, 0x96, 0xc1, 0x55, 0x4f, 0x8a, 0xc8, + 0xb8, 0x75, 0x68, 0xc3, 0x44, 0x3a, 0xad, 0x2e, 0x89, 0x62, 0x7b, 0x50, + 0x4a, 0xbf, 0x90, 0xbe, 0x3e, 0x80, 0xc5, 0x78, 0x8b, 0x5d, 0x9b, 0x77, + 0x6c, 0x89, 0x46, 0x98, 0x3e, 0x73, 0x5c, 0x72, 0xa3, 0x81, 0x97, 0xbd, + 0x30, 0xba, 0xbe, 0x48, 0x51, 0x84, 0x41, 0x5e, 0xbd, 0x4e, 0x82, 0x87, + 0xd0, 0xae, 0x7a, 0xbe, 0x3d, 0xd0, 0x64, 0x37, 0x73, 0xc1, 0xc7, 0xa4, + 0xab, 0x8c, 0xaf, 0x75, 0x57, 0x84, 0x40, 0x90, 0x74, 0xc9, 0x38, 0x5f, + 0xaa, 0x95, 0x72, 0x3a, 0xa4, 0xb7, 0xd4, 0xb0, 0x81, 0x37, 0x45, 0x72, + 0x87, 0xac, 0x45, 0xd5, 0xcd, 0x5e, 0xb7, 0xc0, 0x5d, 0x4f, 0x4d, 0x4e, + 0x3d, 0x73, 0x69, 0x33, 0x4c, 0xc8, 0x86, 0xa0, 0xa9, 0xc8, 0xbe, 0xae, + 0xb2, 0x8e, 0x95, 0xb8, 0x74, 0xa2, 0x42, 0x60, 0x6b, 0x63, 0x82, 0x7c, + 0x33, 0xcb, 0x50, 0x98, 0x53, 0xac, 0x96, 0x78, 0x8e, 0xc8, 0x5f, 0x97, + 0xa1, 0x95, 0x4a, 0x60, 0x9c, 0xc3, 0xa6, 0x42, 0x46, 0x97, 0x36, 0x7a, + 0xa0, 0x55, 0x97, 0xb8, 0xb1, 0x83, 0x77, 0x6e, 0x47, 0x3d, 0xa6, 0x34, + 0x43, 0x72, 0x7b, 0xb6, 0x64, 0xaa, 0x62, 0x64, 0x5f, 0xc1, 0x33, 0x6f, + 0xcd, 0xae, 0xc4, 0xc2, 0x61, 0xc2, 0x90, 0x77, 0x7c, 0x7d, 0xb0, 0x43, + 0x7c, 0xce, 0x94, 0x43, 0x59, 0x6f, 0x71, 0x9d, 0x32, 0xbc, 0x5c, 0xa6, + 0x76, 0xc2, 0xba, 0x92, 0x54, 0x37, 0x36, 0x93, 0xa0, 0x71, 0x40, 0x5d, + 0xc3, 0xb2, 0x36, 0x3f, 0x6f, 0x4a, 0x83, 0xb8, 0xab, 0x66, 0xbb, 0x6b, + 0x3c, 0x7f, 0x7a, 0xa4, 0x34, 0x50, 0x96, 0xae, 0x71, 0x6d, 0x7f, 0xd2, + 0x5a, 0x9b, 0x40, 0xad, 0x67, 0x62, 0x52, 0xbf, 0x42, 0xc1, 0x95, 0xcd, + 0xd0, 0x68, 0x55, 0x7f, 0x51, 0x83, 0xb2, 0x55, 0x4c, 0x3f, 0x5e, 0xaa, + 0x48, 0x96, 0x79, 0xc9, 0xb8, 0xb5, 0x49, 0x7f, 0x3d, 0x40, 0x8b, 0xc3, + 0x50, 0x95, 0x4b, 0x49, 0x47, 0x62, 0x90, 0x52, 0xa3, 0x62, 0x64, 0xa6, + 0xc4, 0x51, 0x89, 0xb6, 0x7c, 0xae, 0x49, 0xcc, 0x57, 0x54, 0x34, 0x75, + 0xa3, 0x33, 0xca, 0x7d, 0x72, 0xbc, 0xae, 0x6a, 0x9f, 0x78, 0xa5, 0xbe, + 0x78, 0x56, 0x6f, 0x68, 0x7b, 0xb9, 0xc6, 0x52, 0x46, 0xcd, 0xc9, 0x65, + 0x9d, 0x7c, 0xbc, 0xb8, 0x71, 0x6c, 0x38, 0xab, 0xbd, 0x3b, 0x6c, 0xc8, + 0x80, 0x31, 0x43, 0x78, 0xbe, 0x99, 0x85, 0x63, 0xbb, 0x8b, 0xa1, 0x7c, + 0xca, 0x88, 0x33, 0x51, 0xaf, 0x9f, 0xce, 0x37, 0x76, 0x56, 0xcf, 0x3a, + 0x8a, 0x42, 0xb9, 0x98, 0x4b, 0x91, 0xcb, 0x7a, 0x39, 0x97, 0xa7, 0x32, + 0x59, 0x6a, 0x79, 0xd5, 0xd2, 0x6d, 0x88, 0xcc, 0xa3, 0x87, 0x59, 0x88, + 0x43, 0xd3, 0x7f, 0xb6, 0x81, 0x54, 0x47, 0x74, 0x5e, 0x67, 0x69, 0x48, + 0x72, 0x82, 0xb9, 0x99, 0x49, 0x35, 0xc7, 0xc4, 0x3b, 0xa2, 0x87, 0x89, + 0x53, 0x7b, 0x4a, 0x46, 0x33, 0x4f, 0x4e, 0x5f, 0x79, 0xa9, 0x6d, 0x5e, + 0x49, 0xb2, 0xc9, 0x99, 0x7e, 0x8d, 0x86, 0xac, 0xb7, 0x9b, 0x84, 0x60, + 0xc7, 0x69, 0x87, 0x63, 0xd0, 0x42, 0xd2, 0x35, 0x6b, 0x76, 0x67, 0x45, + 
0xad, 0x7a, 0x31, 0x55, 0xa1, 0xca, 0x48, 0x6d, 0x70, 0xac, 0x68, 0xc3, + 0x91, 0x50, 0x8f, 0x3f, 0x58, 0x86, 0x4f, 0x9d, 0x74, 0x4d, 0x5b, 0x44, + 0xbd, 0x7b, 0xbb, 0x52, 0x59, 0xa7, 0xc5, 0xbb, 0xa0, 0x9b, 0x69, 0x68, + 0x7a, 0x4f, 0xa1, 0xad, 0x8a, 0x34, 0x62, 0x62, 0x72, 0xcb, 0x9a, 0xc6, + 0xc4, 0x5b, 0xbd, 0x84, 0x61, 0x60, 0xc5, 0x85, 0x47, 0xbf, 0xcf, 0xd3, + 0x34, 0x9b, 0x82, 0xb1, 0x61, 0x3b, 0x6b, 0x7d, 0x43, 0x54, 0x66, 0xbd, + 0xa2, 0x44, 0x54, 0x95, 0x91, 0xca, 0xbe, 0x44, 0xa6, 0x6e, 0xcd, 0x7a, + 0x90, 0xa6, 0xd3, 0x6e, 0x92, 0x52, 0x34, 0x33, 0x3d, 0x85, 0x66, 0x3d, + 0x56, 0xd0, 0x4a, 0x58, 0x70, 0x92, 0x82, 0xc7, 0x58, 0x86, 0x77, 0x5f, + 0x7d, 0x41, 0x84, 0x40, 0xb6, 0x7c, 0x35, 0x38, 0x3f, 0x50, 0xbd, 0x51, + 0xd0, 0xc8, 0x5f, 0x3e, 0x41, 0xa5, 0x98, 0x57, 0xaa, 0x5c, 0x90, 0x3a, + 0x34, 0x7f, 0x89, 0x6b, 0x4b, 0xca, 0xcd, 0xa8, 0x6b, 0xc5, 0x35, 0xd4, + 0x45, 0x86, 0x64, 0x9e, 0xb1, 0xd2, 0x7c, 0x6c, 0x73, 0x4e, 0xcc, 0xca, + 0x4f, 0xc5, 0x9c, 0x6e, 0xd2, 0x42, 0x39, 0x53, 0x7e, 0xd1, 0x67, 0x70, + 0x6e, 0xb5, 0xca, 0x5f, 0x8f, 0x3f, 0x78, 0xb6, 0x94, 0x44, 0xa2, 0x9c, + 0x6f, 0x7e, 0x89, 0x81, 0x4e, 0xb3, 0x50, 0x68, 0x72, 0x5b, 0x38, 0xa6, + 0x32, 0xd2, 0x58, 0x98, 0x41, 0x89, 0xb0, 0xb3, 0xbb, 0x71, 0xcf, 0x61, + 0x8d, 0x86, 0x8a, 0x50, 0xa0, 0xd5, 0x60, 0x71, 0x6e, 0x85, 0xae, 0xbd, + 0x4b, 0x46, 0x78, 0x54, 0x4c, 0xa3, 0x68, 0xa0, 0xc4, 0xa0, 0x99, 0xc3, + 0x47, 0xb0, 0x63, 0x94, 0x40, 0x5e, 0x31, 0x71, 0x5e, 0xbb, 0xa1, 0x81, + 0xa9, 0x90, 0x3d, 0x94, 0x43, 0xb0, 0x32, 0x52, 0xa0, 0x91, 0x68, 0x8c, + 0xa3, 0x89, 0x3d, 0x88, 0xbd, 0x51, 0x3b, 0x7d, 0x97, 0xa8, 0xba, 0x5e, + 0x6e, 0x90, 0x41, 0x49, 0x7a, 0xc5, 0xdc, 0xd7, 0x61, 0xab, 0x3e, 0x6b, + 0x5b, 0xa7, 0x5a, 0xd1, 0x68, 0x4d, 0x3b, 0xc7, 0x4d, 0x61, 0x85, 0xa1, + 0x6d, 0xa4, 0x31, 0x8c, 0x4f, 0x85, 0xb2, 0xb7, 0x8e, 0x6a, 0xb8, 0x4b, + 0xa4, 0x7c, 0xae, 0xd1, 0x70, 0x51, 0x77, 0x53, 0x9d, 0xd1, 0xab, 0x50, + 0x74, 0xb0, 0x45, 0x3f, 0xac, 0xc5, 0x98, 0xaf, 0x96, 0x8b, 0x42, 0x2b, + 0x68, 0x4b, 0xa5, 0xb3, 0x3d, 0x4a, 0xbf, 0x42, 0x93, 0xb9, 0x89, 0x75, + 0xc6, 0x42, 0xa7, 0xb8, 0xac, 0xaa, 0x70, 0xbe, 0xa1, 0x48, 0xbe, 0xa2, + 0x78, 0x30, 0x3e, 0xcd, 0x5b, 0x44, 0x5a, 0x41, 0xc0, 0x72, 0x4b, 0xc1, + 0xb2, 0x62, 0x89, 0x92, 0x6f, 0x58, 0x83, 0x7a, 0x7a, 0xa6, 0x94, 0x88, + 0x9b, 0x95, 0x62, 0xb0, 0x4f, 0xa2, 0xd4, 0x3f, 0xa4, 0x53, 0xca, 0x37, + 0x3f, 0x40, 0x7a, 0xa4, 0x7d, 0x68, 0x8c, 0x29, 0x7f, 0x64, 0x5e, 0x68, + 0xb5, 0x7a, 0x8d, 0x99, 0x92, 0x40, 0x40, 0x55, 0x4b, 0x7e, 0x9b, 0x97, + 0x47, 0x2f, 0x34, 0xda, 0xbf, 0x7e, 0x46, 0x50, 0xa1, 0x7f, 0x2f, 0x4c, + 0x72, 0x3d, 0x78, 0x42, 0x92, 0x74, 0x5f, 0x4e, 0x96, 0x5b, 0x9f, 0x8e, + 0x47, 0xc3, 0xb7, 0x93, 0x88, 0xb1, 0x4e, 0x83, 0x37, 0x41, 0x69, 0x67, + 0xbc, 0x60, 0x8a, 0x5c, 0xd5, 0x48, 0x90, 0x9c, 0x85, 0x76, 0xcd, 0x47, + 0x72, 0xb2, 0x72, 0x5e, 0x80, 0x3f, 0x66, 0x83, 0x8a, 0x84, 0x3c, 0x96, + 0x69, 0x87, 0x92, 0x61, 0xa3, 0x82, 0x77, 0x4f, 0x7b, 0xc4, 0x76, 0xbc, + 0x55, 0x3b, 0xd7, 0x58, 0x94, 0x98, 0xce, 0xb9, 0xc8, 0x93, 0x98, 0xbf, + 0x4b, 0x41, 0x9b, 0xcf, 0x44, 0xd0, 0xb5, 0x3d, 0x9f, 0x56, 0x3e, 0x71, + 0x5f, 0xd3, 0x34, 0xcb, 0xb6, 0x9e, 0x56, 0x57, 0x44, 0x47, 0x73, 0xb3, + 0xc3, 0xc4, 0xb2, 0x66, 0x51, 0xba, 0x96, 0x65, 0x4b, 0x5f, 0x48, 0xd3, + 0x70, 0x96, 0x70, 0xd6, 0xa6, 0x92, 0x57, 0x6e, 0x51, 0x9b, 0x34, 0xb5, + 0x7c, 0x76, 0x43, 0x62, 0x34, 0x6b, 0xba, 0x71, 0xb0, 0xac, 0x9e, 0x60, + 0x84, 0xb5, 0xd5, 0xca, 0x43, 0xad, 0xcc, 0x89, 0x54, 0x6f, 0x76, 0xae, + 0x60, 0xbb, 0xb6, 0xa5, 0xc9, 0xb5, 0x43, 0x7a, 0x79, 0x83, 0x89, 0x66, + 
0x76, 0xac, 0x4d, 0x50, 0x35, 0x8e, 0x40, 0x57, 0xa3, 0xd1, 0xb2, 0x66, + 0x94, 0x85, 0x6e, 0x6b, 0xab, 0xa5, 0xba, 0x45, 0x8b, 0x76, 0xb0, 0x6a, + 0xbc, 0x30, 0x35, 0x3e, 0x3f, 0x51, 0x5f, 0xae, 0x78, 0xa7, 0xcd, 0x87, + 0x92, 0x48, 0x41, 0x3e, 0xad, 0xaf, 0x77, 0xbd, 0x36, 0xcc, 0x59, 0x3c, + 0x4f, 0xa2, 0x83, 0x6e, 0x68, 0xab, 0xc2, 0x36, 0x7c, 0x48, 0x7a, 0xbf, + 0xba, 0xce, 0x73, 0x85, 0x71, 0x90, 0x67, 0xb9, 0x54, 0x7b, 0x4f, 0xa5, + 0x81, 0x6b, 0x6e, 0x63, 0xc6, 0x9b, 0xb6, 0x46, 0x66, 0x67, 0x5f, 0x3c, + 0x8e, 0x66, 0x81, 0xac, 0xab, 0x55, 0xa5, 0x41, 0x84, 0x66, 0x44, 0x81, + 0x56, 0xb7, 0x43, 0xb5, 0x7f, 0xa7, 0xb5, 0x82, 0xb7, 0x39, 0x61, 0x34, + 0x3d, 0xd2, 0x7e, 0xcf, 0x73, 0x61, 0xad, 0x8b, 0x80, 0x64, 0xd3, 0x93, + 0x55, 0xd1, 0x64, 0x75, 0xb5, 0x7b, 0x58, 0x63, 0xbb, 0xc4, 0xbb, 0x7e, + 0x7b, 0xa1, 0x77, 0x62, 0x63, 0x49, 0xb8, 0x88, 0x69, 0xd8, 0xc3, 0xa9, + 0xd1, 0x96, 0xa2, 0x99, 0xbf, 0xaf, 0x68, 0x34, 0x8f, 0xd1, 0x93, 0x4a, + 0x5a, 0x91, 0xc0, 0x97, 0x59, 0x7e, 0x42, 0xd4, 0xa6, 0xbe, 0x5d, 0x37, + 0x8f, 0xb4, 0x3c, 0x31, 0x35, 0xa4, 0x72, 0x96, 0x62, 0x8f, 0x47, 0xac, + 0x8d, 0x3a, 0x54, 0x9c, 0xbf, 0x3f, 0x5d, 0xa8, 0x98, 0x93, 0x49, 0xa7, + 0xac, 0x95, 0xbc, 0xd8, 0x38, 0x82, 0x96, 0x83, 0xd1, 0x7b, 0x7a, 0xa0, + 0xb0, 0x32, 0x5e, 0xb9, 0x9b, 0x75, 0x6f, 0x5d, 0x6c, 0x6c, 0x5b, 0x54, + 0x5a, 0x6a, 0x47, 0x83, 0x3b, 0xcf, 0x43, 0xc0, 0x61, 0x8b, 0x61, 0x42, + 0x9d, 0xa5, 0x34, 0x6b, 0x67, 0xba, 0x8b, 0x30, 0x98, 0xd0, 0xc2, 0x46, + 0x37, 0x42, 0x77, 0x83, 0x71, 0xcb, 0x30, 0x45, 0xae, 0x84, 0x5b, 0x6b, + 0xcd, 0xc3, 0xae, 0xa8, 0xbc, 0x76, 0x74, 0x9d, 0x40, 0x9b, 0x85, 0x6b, + 0x71, 0xb0, 0x5f, 0x43, 0x45, 0xca, 0x9e, 0xac, 0x46, 0xc6, 0x9f, 0xa8, + 0x70, 0x36, 0x2f, 0x80, 0x76, 0x77, 0x5d, 0x3e, 0xce, 0x53, 0x40, 0x7f, + 0x2f, 0x52, 0xbf, 0x9f, 0xd1, 0xa8, 0x74, 0x9b, 0x9d, 0x57, 0x60, 0x9a, + 0xa1, 0x5a, 0xa7, 0x8d, 0x48, 0x54, 0x8c, 0xc0, 0x76, 0x37, 0x96, 0x46, + 0xb8, 0xb7, 0x45, 0xa3, 0xbf, 0xae, 0x8c, 0x63, 0x77, 0x9a, 0x76, 0xcc, + 0xb9, 0x9d, 0x66, 0x6c, 0xcf, 0xcf, 0x92, 0x41, 0x3d, 0x73, 0x70, 0x7c, + 0x81, 0xcd, 0x5e, 0x95, 0x9b, 0x9b, 0x97, 0x47, 0x5e, 0xdc, 0x6d, 0xb5, + 0xb8, 0x62, 0x64, 0x56, 0xc4, 0x83, 0x7d, 0x69, 0x35, 0xbd, 0xa4, 0xa3, + 0x54, 0xcc, 0x3a, 0x5e, 0x58, 0x71, 0x94, 0x4c, 0x45, 0x80, 0xa2, 0x5e, + 0x56, 0x52, 0xb1, 0x61, 0x88, 0x40, 0xa3, 0xb7, 0xc7, 0x49, 0x60, 0xcb, + 0x45, 0xb0, 0x67, 0xc0, 0xd0, 0x6d, 0x8d, 0x9c, 0xa6, 0xa2, 0x46, 0x54, + 0x7e, 0x5c, 0x91, 0x57, 0x43, 0x97, 0x3a, 0xcd, 0x90, 0x61, 0x7b, 0x65, + 0xde, 0xca, 0x9c, 0x77, 0x4d, 0xbd, 0x74, 0xb4, 0x65, 0x9e, 0x54, 0xa7, + 0x57, 0x66, 0x84, 0x86, 0x41, 0xad, 0xcd, 0x90, 0x86, 0xb2, 0x3a, 0x4a, + 0xcd, 0x3f, 0x82, 0x9d, 0xa9, 0xbc, 0x5e, 0x73, 0x9d, 0xaa, 0xba, 0x70, + 0x6f, 0x4f, 0x34, 0xc0, 0xcc, 0x8d, 0xc2, 0x47, 0xb9, 0x41, 0x66, 0x4b, + 0x47, 0x60, 0x2d, 0x99, 0xb9, 0x77, 0x9f, 0xbe, 0x51, 0xb1, 0x76, 0x4a, + 0x8a, 0xd3, 0xa6, 0xd1, 0x7a, 0x4a, 0xc7, 0x63, 0x92, 0x99, 0x48, 0x39, + 0xa4, 0x9d, 0xb7, 0x88, 0x7f, 0x88, 0xcd, 0x51, 0x4c, 0x5f, 0x8d, 0x46, + 0x86, 0x86, 0x6e, 0x62, 0xc9, 0xc5, 0x78, 0x8a, 0x81, 0x64, 0xce, 0x93, + 0x71, 0x6b, 0x8d, 0x8f, 0xa6, 0xda, 0x4c, 0x4b, 0x95, 0xdb, 0xce, 0x54, + 0x6a, 0x75, 0x40, 0x26, 0x75, 0x7e, 0xc3, 0x75, 0x40, 0x96, 0xca, 0xcc, + 0x95, 0x37, 0xce, 0x8a, 0xa3, 0xc0, 0x98, 0xc1, 0xab, 0x8b, 0x41, 0xc2, + 0x86, 0xa7, 0x68, 0x87, 0x85, 0xcb, 0xd8, 0x38, 0xbb, 0x94, 0x94, 0xb3, + 0x86, 0xb8, 0x85, 0x5e, 0x60, 0x89, 0x99, 0xa5, 0x77, 0x89, 0xad, 0xb9, + 0x58, 0x99, 0xd2, 0x73, 0x7d, 0x69, 0xa3, 0x5f, 0xc1, 0x99, 0xc1, 0x5f, + 
0xc0, 0x4a, 0x4a, 0x3f, 0x4a, 0xc4, 0x64, 0xbb, 0xb1, 0x72, 0xa0, 0x32, + 0x4d, 0xc3, 0xb2, 0x4a, 0x65, 0x9e, 0xb2, 0x78, 0x52, 0x85, 0x6a, 0xd2, + 0xaf, 0x43, 0x53, 0x8e, 0x9f, 0xd4, 0x6d, 0xee, 0x38, 0x83, 0x59, 0xb3, + 0xba, 0xa5, 0x4a, 0x36, 0x3b, 0x91, 0x7d, 0x52, 0x8a, 0xb2, 0x57, 0xb7, + 0x38, 0x8b, 0xc0, 0x72, 0x54, 0xce, 0xd1, 0x99, 0xbd, 0xc7, 0xb0, 0x57, + 0x74, 0x69, 0xaa, 0x40, 0x9c, 0xb2, 0x36, 0x33, 0x42, 0x71, 0x89, 0x8a, + 0x44, 0xad, 0xc8, 0x87, 0x46, 0x3c, 0x47, 0x50, 0xc7, 0x5f, 0xa4, 0x4a, + 0x79, 0x9b, 0xa1, 0x5e, 0x3e, 0x3f, 0x51, 0x76, 0xd3, 0x9a, 0xbb, 0x5e, + 0x65, 0xcc, 0x4b, 0x7a, 0x4e, 0xcd, 0x53, 0xdb, 0x48, 0x65, 0xb2, 0x57, + 0x7a, 0x54, 0xb3, 0x8d, 0xba, 0x76, 0xc4, 0x63, 0x6b, 0xa8, 0x5e, 0x67, + 0xc7, 0x64, 0x54, 0x9c, 0x5d, 0xc0, 0x7e, 0x4c, 0x9a, 0xb0, 0xa8, 0x7a, + 0xbf, 0xd0, 0x63, 0x9f, 0x40, 0x9c, 0x71, 0xc0, 0x8b, 0x7a, 0xd3, 0xa9, + 0xb3, 0x6c, 0xaf, 0xb2, 0x78, 0x8b, 0x93, 0x67, 0x64, 0x7c, 0xbb, 0xb0, + 0x3f, 0x7e, 0x90, 0x6c, 0x38, 0x58, 0x2c, 0x33, 0x45, 0xae, 0x6f, 0x92, + 0x7d, 0x7a, 0x7d, 0x73, 0x4c, 0xaf, 0x4a, 0xc0, 0x60, 0x85, 0xcb, 0x7c, + 0xc6, 0xce, 0xb3, 0xbf, 0xaf, 0x2f, 0x58, 0x4d, 0x32, 0x6a, 0xb7, 0x6d, + 0x56, 0xa1, 0xa4, 0xc2, 0xa4, 0x97, 0x9a, 0x98, 0xa5, 0xc8, 0x6d, 0x6f, + 0x7a, 0x5c, 0x6c, 0x95, 0x4a, 0x75, 0x99, 0xbb, 0x22, 0x78, 0xc7, 0xaa, + 0x3a, 0x7b, 0xd5, 0xcb, 0x51, 0xba, 0xb7, 0x29, 0xa6, 0xc5, 0xb8, 0xa9, + 0xaf, 0x5f, 0x6e, 0x7b, 0x84, 0x5c, 0x6a, 0x48, 0xc5, 0x4a, 0x59, 0x3f, + 0x56, 0x5c, 0x78, 0x90, 0x44, 0xc5, 0xa8, 0x9f, 0x92, 0x9f, 0x6e, 0x8b, + 0x54, 0x7c, 0x2a, 0xd3, 0xa5, 0x3f, 0xd5, 0xc0, 0x8b, 0x59, 0x74, 0x3f, + 0xbf, 0x8c, 0xc8, 0x3f, 0x89, 0xce, 0x3a, 0xd3, 0x5f, 0x69, 0xac, 0x73, + 0xa3, 0x65, 0x4d, 0x54, 0x3a, 0x48, 0xae, 0x7d, 0x3b, 0x59, 0x5f, 0x6d, + 0x59, 0xbf, 0x53, 0x75, 0x41, 0x6a, 0x3f, 0xd8, 0x5e, 0x8c, 0x37, 0x7d, + 0x49, 0x6b, 0x64, 0x80, 0xce, 0x6c, 0x97, 0xbd, 0x69, 0x77, 0x73, 0x56, + 0x7e, 0xe0, 0xd6, 0x4a, 0x5a, 0xb3, 0x64, 0x3f, 0x8f, 0x61, 0xc8, 0x35, + 0x34, 0x65, 0x90, 0xcc, 0xab, 0x8c, 0xab, 0x42, 0x56, 0xc8, 0x93, 0xc5, + 0xda, 0xaf, 0xdb, 0x79, 0xbd, 0x5e, 0x73, 0x63, 0x5f, 0xa0, 0x79, 0x72, + 0x6f, 0xa3, 0x56, 0x4b, 0x30, 0x81, 0x3b, 0x3d, 0x4c, 0xb6, 0x9c, 0xd2, + 0xb3, 0x30, 0xad, 0x87, 0xc8, 0x54, 0x6c, 0x55, 0xbc, 0x30, 0x6a, 0x60, + 0x34, 0x6c, 0x4f, 0x2d, 0xa3, 0x3d, 0x68, 0x4b, 0x6d, 0x4b, 0x9b, 0xd1, + 0xc8, 0x72, 0xa1, 0xac, 0x99, 0xc2, 0x96, 0x6f, 0x91, 0x4b, 0xaf, 0x7c, + 0x63, 0xb5, 0x41, 0x5a, 0x38, 0xa2, 0xda, 0xbe, 0x2b, 0x92, 0x95, 0x26, + 0xad, 0xaf, 0x45, 0x62, 0xd5, 0x60, 0x6c, 0xbe, 0xa8, 0x41, 0x79, 0xd3, + 0x63, 0xaf, 0x8a, 0xa9, 0x3e, 0x39, 0x99, 0xba, 0xb8, 0x65, 0x90, 0x67, + 0x91, 0xc6, 0xd0, 0x59, 0x68, 0x78, 0x78, 0x34, 0xaf, 0x3a, 0x38, 0xc4, + 0x44, 0x4f, 0x77, 0xc3, 0x67, 0xbe, 0x42, 0x63, 0x8b, 0xa0, 0x7d, 0x40, + 0x5c, 0x95, 0x92, 0x3c, 0x60, 0x81, 0x71, 0x64, 0xa3, 0x41, 0xcc, 0x98, + 0xc7, 0xa9, 0x68, 0x8a, 0xc5, 0x83, 0x38, 0xbd, 0x24, 0x56, 0x3d, 0x67, + 0xcb, 0x56, 0x3b, 0x93, 0x62, 0x98, 0x60, 0xae, 0x5c, 0xab, 0xc6, 0xb8, + 0x66, 0x97, 0x3c, 0x55, 0x4c, 0xb9, 0x3c, 0xb8, 0x73, 0xc5, 0x65, 0x6b, + 0x5c, 0xa7, 0x85, 0xc4, 0x5f, 0x75, 0xc9, 0xaf, 0x7b, 0x68, 0x68, 0x4f, + 0x84, 0x37, 0x74, 0x4d, 0xbd, 0x6b, 0x7a, 0xb8, 0x59, 0x90, 0xd3, 0x78, + 0xcd, 0x62, 0xd8, 0xcc, 0x9f, 0x33, 0x34, 0x74, 0xaf, 0x83, 0x48, 0x84, + 0xbb, 0xa0, 0x47, 0xcf, 0x9b, 0x91, 0x72, 0x41, 0x3e, 0x47, 0xa5, 0x73, + 0xcd, 0xa8, 0x41, 0x93, 0xb0, 0xae, 0x4d, 0x45, 0x6d, 0x55, 0x39, 0xdd, + 0xc8, 0x69, 0x6a, 0x63, 0xac, 0x98, 0x70, 0xd8, 0x64, 0xbe, 0xaf, 0x84, + 
0x3e, 0x54, 0xc2, 0xce, 0xa6, 0x6a, 0x2b, 0x7f, 0xba, 0x7b, 0xa1, 0x9d, + 0xac, 0x9e, 0x3f, 0xba, 0x48, 0x6a, 0xbe, 0xad, 0xb7, 0xc7, 0x92, 0xbe, + 0xbf, 0xb0, 0xc3, 0xa7, 0x88, 0x65, 0x71, 0x89, 0x83, 0x83, 0x3e, 0xab, + 0x81, 0x4d, 0x92, 0x6e, 0x41, 0x7b, 0xcf, 0x53, 0xb6, 0xa3, 0xb0, 0xa6, + 0x94, 0x9a, 0xce, 0x80, 0x26, 0x36, 0x2c, 0x40, 0x76, 0x8b, 0x29, 0x34, + 0x87, 0xad, 0x53, 0x70, 0xc2, 0xa1, 0x86, 0x4d, 0x9c, 0x8b, 0x75, 0xbc, + 0x57, 0x6b, 0x8b, 0x5b, 0x4b, 0x8a, 0xcd, 0xbc, 0x4b, 0x5d, 0x4b, 0xdc, + 0xc8, 0xd1, 0xb6, 0xd3, 0x73, 0x4a, 0x5e, 0x76, 0xc5, 0x88, 0x5c, 0x6b, + 0xbd, 0xb2, 0xad, 0x54, 0x4c, 0x53, 0x7d, 0xdc, 0x7e, 0xd4, 0xb5, 0x9e, + 0x3c, 0x74, 0x6d, 0x9a, 0x8d, 0x58, 0xaf, 0x52, 0x2c, 0x9d, 0x9b, 0xd1, + 0x87, 0x64, 0x3b, 0x75, 0x74, 0xc9, 0xcd, 0x8a, 0xb3, 0x91, 0x8c, 0xc2, + 0xb3, 0x9a, 0xc0, 0xa1, 0x32, 0xae, 0x75, 0x7d, 0x6e, 0xba, 0x6c, 0x57, + 0x9f, 0x64, 0x9b, 0xc1, 0xbe, 0x42, 0xbe, 0x38, 0x5e, 0x3c, 0xa7, 0x95, + 0x52, 0x8b, 0x83, 0xc3, 0x2c, 0x4f, 0xa3, 0x74, 0x5e, 0xc9, 0xa8, 0xd6, + 0x3e, 0x3f, 0x46, 0x95, 0x41, 0x5d, 0xc0, 0x40, 0x7d, 0x58, 0x59, 0x91, + 0xd1, 0x47, 0x44, 0x39, 0x7c, 0xc2, 0x80, 0xbc, 0x32, 0x93, 0x73, 0xbc, + 0x32, 0x33, 0x68, 0x50, 0x53, 0x4f, 0x3d, 0xca, 0x48, 0x7b, 0xb6, 0xd0, + 0x64, 0x70, 0xa6, 0x8b, 0x75, 0xbf, 0xbd, 0x9e, 0xb5, 0xb6, 0x8a, 0x52, + 0xb8, 0x4f, 0x58, 0x4c, 0xaa, 0x8f, 0x43, 0xda, 0x86, 0xba, 0xce, 0x41, + 0xb0, 0x9d, 0x96, 0x56, 0xa1, 0x8d, 0x44, 0xaf, 0xd2, 0x4f, 0x72, 0x65, + 0xa0, 0xc4, 0xbf, 0x42, 0xb4, 0xbc, 0xaa, 0x4f, 0x9a, 0xb1, 0x49, 0xcc, + 0xaa, 0x9c, 0x35, 0x88, 0x3a, 0x89, 0xa7, 0x9c, 0x9f, 0x47, 0xc4, 0xc5, + 0x8c, 0xd4, 0x53, 0xca, 0xbe, 0x4f, 0xb0, 0x66, 0x84, 0xdb, 0x6d, 0xa8, + 0xa2, 0x76, 0x40, 0xc1, 0x81, 0x41, 0xba, 0x5f, 0x5a, 0xb2, 0xb0, 0xbe, + 0xc3, 0x81, 0x68, 0xb8, 0xc9, 0x76, 0xc5, 0xa4, 0x4f, 0x65, 0xbe, 0x37, + 0x83, 0x4f, 0x6f, 0x78, 0x71, 0xb8, 0x5a, 0x71, 0x80, 0xc1, 0x82, 0x94, + 0x5d, 0xa1, 0x8c, 0x9f, 0x41, 0x32, 0x4a, 0x8c, 0xc8, 0x3b, 0x58, 0x4b, + 0x99, 0xb2, 0x69, 0xb3, 0xb9, 0xcf, 0x34, 0x34, 0x92, 0x93, 0xca, 0xb6, + 0xd7, 0x63, 0xd2, 0x9a, 0xa9, 0x6f, 0x70, 0xb0, 0x53, 0x4c, 0x90, 0x84, + 0x6d, 0x46, 0x46, 0x7d, 0x75, 0x78, 0x83, 0x48, 0x41, 0x90, 0xa1, 0x72, + 0x8e, 0x6d, 0x66, 0x8b, 0x5e, 0x5e, 0x69, 0x4b, 0xbd, 0x46, 0x4a, 0xcc, + 0xb5, 0x86, 0xb4, 0x70, 0xb4, 0x8c, 0xc6, 0x7c, 0x65, 0xca, 0x6e, 0x9e, + 0x51, 0x5c, 0xdf, 0xae, 0x94, 0xb1, 0xca, 0x42, 0xc8, 0x46, 0x74, 0x83, + 0xc0, 0x32, 0xb9, 0xb3, 0xc5, 0x54, 0xca, 0x61, 0x5e, 0xd1, 0x8c, 0x5a, + 0x4d, 0x89, 0xc2, 0xa5, 0x81, 0x84, 0x4f, 0x8e, 0xd2, 0xb2, 0xb0, 0x61, + 0x80, 0x5e, 0xa6, 0x59, 0x8c, 0x8f, 0x29, 0xb7, 0x47, 0x90, 0x77, 0x3d, + 0x44, 0x90, 0x6a, 0xbe, 0x43, 0xbe, 0x57, 0x7d, 0xb8, 0x7a, 0xa4, 0xaa, + 0x39, 0xc1, 0x32, 0xa5, 0x9a, 0x59, 0xd9, 0xc4, 0x7f, 0x46, 0xaa, 0xc9, + 0xce, 0x93, 0xc5, 0xd2, 0x6a, 0x4e, 0x65, 0x62, 0x76, 0x95, 0x34, 0x50, + 0x5b, 0x53, 0xb6, 0xcc, 0xa1, 0x95, 0x83, 0xae, 0x95, 0x4f, 0xc4, 0xcb, + 0x41, 0x68, 0x41, 0x55, 0xa1, 0x8c, 0xbc, 0x5a, 0xc8, 0x67, 0xa5, 0x7b, + 0x3c, 0x6b, 0x6d, 0x59, 0x3e, 0x6e, 0x97, 0xbd, 0x9b, 0x5d, 0x3c, 0x76, + 0xc7, 0x51, 0x5b, 0x37, 0x47, 0x84, 0x5e, 0x9d, 0x63, 0xc4, 0x66, 0x42, + 0x53, 0xa3, 0x8d, 0xa0, 0x46, 0x91, 0x32, 0x7e, 0x97, 0x5b, 0x94, 0xc4, + 0xb2, 0xcb, 0x44, 0x74, 0x3b, 0x56, 0x85, 0x61, 0x7b, 0xcd, 0x2f, 0xc9, + 0xcf, 0xbe, 0x88, 0xbe, 0x89, 0xaa, 0x90, 0xa3, 0x4a, 0x5b, 0x9e, 0x60, + 0xc4, 0xc7, 0xa4, 0x84, 0x65, 0xa3, 0x37, 0xb7, 0xbe, 0x44, 0xcc, 0x5b, + 0x97, 0xce, 0x9c, 0xb1, 0xca, 0x96, 0x8f, 0x7f, 0xb8, 0x7a, 0xb2, 0x9f, + 
0x48, 0x33, 0x7a, 0xa0, 0xce, 0xb2, 0x35, 0x5f, 0x93, 0x38, 0x94, 0x63, + 0x6e, 0x99, 0xc5, 0xa9, 0x36, 0x89, 0xbb, 0x43, 0x4f, 0x73, 0x3d, 0x7e, + 0x55, 0x3e, 0x59, 0x86, 0xa1, 0x8e, 0xb8, 0xd6, 0xd3, 0x54, 0x66, 0x96, + 0x97, 0xa8, 0x3a, 0x47, 0x3f, 0x9c, 0x45, 0xd2, 0x3f, 0x6d, 0xb7, 0xc8, + 0xad, 0x95, 0x40, 0x65, 0x91, 0xa2, 0x8a, 0x85, 0x3a, 0xc2, 0x5f, 0xaf, + 0x69, 0x8f, 0x67, 0xbc, 0xab, 0xbb, 0x4f, 0xa7, 0xa5, 0xda, 0x78, 0x67, + 0x44, 0xc5, 0x4a, 0xbd, 0x6d, 0x46, 0xba, 0x41, 0x77, 0xc6, 0x81, 0x64, + 0x78, 0x67, 0x9c, 0xa2, 0xaa, 0x7a, 0xd3, 0x4c, 0x72, 0xb0, 0x92, 0x5f, + 0x80, 0x72, 0x87, 0x88, 0x4c, 0x69, 0xb4, 0xe2, 0x44, 0x98, 0xc6, 0x7f, + 0x68, 0x63, 0x87, 0x94, 0xbe, 0xae, 0x5c, 0x7c, 0xa6, 0x78, 0x83, 0x52, + 0x41, 0x86, 0x7a, 0x7c, 0x47, 0xc5, 0x3e, 0xb3, 0x6b, 0x84, 0x82, 0xb5, + 0x56, 0x8a, 0xcd, 0xe2, 0x96, 0x4d, 0xab, 0x56, 0xb7, 0xa6, 0x8a, 0x30, + 0x55, 0x33, 0x80, 0xaf, 0x54, 0xc9, 0xe2, 0xbb, 0xab, 0x92, 0x6d, 0x6e, + 0x74, 0x2d, 0x9f, 0x42, 0xbe, 0x4c, 0x50, 0x90, 0x88, 0xc0, 0x3c, 0xcc, + 0x4f, 0x54, 0x50, 0x8a, 0xa5, 0x38, 0xcd, 0x41, 0xa6, 0x5d, 0xa3, 0x81, + 0xa2, 0x4c, 0xd5, 0xb5, 0x30, 0xa9, 0x91, 0xbc, 0x3f, 0xcd, 0x3c, 0x6e, + 0x79, 0x95, 0xa1, 0x5c, 0xa0, 0x68, 0x79, 0xc9, 0x98, 0x6b, 0x89, 0x95, + 0x6a, 0x7e, 0xc0, 0x52, 0x68, 0x52, 0x57, 0xb6, 0x7d, 0xa5, 0x5b, 0x70, + 0xb6, 0xb7, 0xce, 0x48, 0x71, 0x6c, 0x46, 0x7e, 0xc5, 0x56, 0x3d, 0x66, + 0x95, 0xad, 0x31, 0x5a, 0xc5, 0xa2, 0x88, 0xab, 0x63, 0x4a, 0x6e, 0x61, + 0x48, 0x55, 0x88, 0x5f, 0x6e, 0x3c, 0x8e, 0xca, 0xbd, 0xc6, 0x4b, 0x49, + 0x84, 0xbb, 0x9a, 0x51, 0xa6, 0x39, 0x44, 0xcf, 0x5a, 0x35, 0x3c, 0x61, + 0xbe, 0xd6, 0x94, 0x5e, 0xc0, 0x7d, 0xae, 0x89, 0xa4, 0x32, 0xcd, 0x49, + 0x5d, 0x5e, 0x9f, 0x68, 0x98, 0xc6, 0x80, 0xb5, 0x5c, 0x43, 0xb4, 0xc4, + 0x89, 0x7b, 0x59, 0x9c, 0xcc, 0x42, 0xca, 0x69, 0x95, 0x94, 0x6c, 0xce, + 0x36, 0x65, 0x3d, 0xc4, 0x7c, 0xac, 0x7d, 0x99, 0x7c, 0x9a, 0x81, 0x4d, + 0x74, 0x81, 0x9e, 0x63, 0xca, 0xc5, 0x70, 0x64, 0x76, 0xc7, 0x80, 0xb8, + 0x79, 0x88, 0x7f, 0x96, 0xcf, 0xc1, 0xcc, 0xd6, 0xc2, 0x80, 0x82, 0x59, + 0x7a, 0x7e, 0x3d, 0x62, 0x82, 0x9d, 0x49, 0xb9, 0x48, 0x61, 0x5b, 0xcb, + 0x50, 0x6e, 0x48, 0x7b, 0x6b, 0x82, 0x3d, 0x42, 0x61, 0x91, 0xc1, 0x4f, + 0x4a, 0x3a, 0x65, 0x8e, 0x4b, 0xb3, 0x93, 0x7b, 0x7c, 0x8f, 0xcf, 0xb1, + 0xa1, 0xb6, 0xac, 0x89, 0x37, 0x7b, 0x62, 0xa4, 0xb9, 0xcf, 0xba, 0x44, + 0x5f, 0x39, 0x36, 0x37, 0x33, 0xb3, 0x79, 0x83, 0xc5, 0x4d, 0xa3, 0x8d, + 0x96, 0x37, 0x94, 0x37, 0x6e, 0x42, 0x7c, 0x32, 0x8f, 0x52, 0x44, 0x4a, + 0xd0, 0x95, 0xd0, 0x83, 0x34, 0x5f, 0xb1, 0x81, 0x50, 0xbb, 0xbd, 0x3e, + 0x3c, 0x50, 0xa9, 0x31, 0xb2, 0x71, 0xb8, 0x78, 0x6f, 0xb0, 0x46, 0xbd, + 0xc3, 0xc9, 0x37, 0x97, 0x46, 0x63, 0x45, 0x9d, 0x9e, 0xca, 0xd8, 0xd7, + 0x93, 0x9a, 0xce, 0x7d, 0x90, 0x43, 0x98, 0x62, 0x79, 0x3c, 0x50, 0x41, + 0x68, 0x97, 0x2f, 0xe0, 0xae, 0xab, 0x98, 0xb4, 0x4c, 0x86, 0xb8, 0xbf, + 0x64, 0x65, 0x60, 0x90, 0xc1, 0x60, 0xd4, 0xb6, 0xb1, 0x45, 0x81, 0x2f, + 0xab, 0x72, 0x7e, 0x76, 0x77, 0xbb, 0x87, 0x9c, 0xca, 0x56, 0x3f, 0xa7, + 0xa8, 0x68, 0x67, 0xba, 0x80, 0x83, 0x3e, 0xa5, 0x6f, 0xc2, 0x66, 0x36, + 0x63, 0x68, 0xa7, 0xdd, 0x3e, 0x87, 0xba, 0xd7, 0xcd, 0x49, 0x9d, 0x86, + 0x6e, 0xd6, 0x44, 0x35, 0x42, 0xd3, 0xa7, 0x6f, 0x9a, 0xc5, 0xa7, 0x62, + 0x36, 0x81, 0x3a, 0x5e, 0x95, 0xcf, 0x7f, 0xd9, 0x65, 0x82, 0x8c, 0x91, + 0x61, 0xc1, 0x9d, 0x3b, 0x6e, 0xb0, 0x3f, 0x69, 0x53, 0xa1, 0xc5, 0xb0, + 0xb9, 0x45, 0x7f, 0x36, 0x73, 0x67, 0x46, 0x38, 0x3e, 0x60, 0xc7, 0x83, + 0x71, 0x3e, 0xbc, 0x2b, 0xa5, 0x86, 0x96, 0x94, 0x4f, 0x69, 0x91, 0xbb, + 
0xa2, 0x8a, 0x35, 0x53, 0x8e, 0x59, 0x97, 0xa6, 0x81, 0xd9, 0x75, 0xd1, + 0x74, 0x44, 0xad, 0x96, 0x51, 0xa1, 0x9c, 0x44, 0xb8, 0x79, 0xb7, 0x65, + 0xcf, 0x47, 0x3f, 0x80, 0x66, 0x9b, 0x34, 0xc2, 0xcd, 0x4d, 0x66, 0xc2, + 0xbc, 0xc6, 0xa8, 0x8d, 0xcd, 0x3f, 0x48, 0x8e, 0x38, 0x73, 0x64, 0x83, + 0x7b, 0x8a, 0x66, 0x6d, 0x9f, 0x6c, 0x59, 0x35, 0x39, 0x7a, 0x62, 0xc8, + 0x3d, 0x3a, 0x5c, 0xa5, 0xbc, 0xb3, 0xa8, 0xb5, 0x4d, 0x38, 0xc4, 0x91, + 0x4c, 0xb9, 0x5c, 0xc3, 0xc4, 0x51, 0xda, 0x48, 0x8b, 0x45, 0xbf, 0x85, + 0x45, 0x55, 0x77, 0x72, 0x98, 0x36, 0xa2, 0xce, 0xbc, 0x86, 0x4b, 0x5c, + 0x78, 0xb4, 0x63, 0x9b, 0x4c, 0x66, 0xc7, 0xb8, 0x63, 0x70, 0x42, 0x64, + 0x48, 0x9c, 0x59, 0x80, 0xa7, 0x31, 0xd2, 0xcc, 0x90, 0x86, 0xa5, 0x62, + 0xb3, 0xc5, 0xc8, 0x68, 0xc2, 0x56, 0x56, 0x31, 0x95, 0xa3, 0x6d, 0x65, + 0xd1, 0xad, 0xbe, 0xbe, 0x57, 0x90, 0x42, 0x6e, 0x45, 0xab, 0x48, 0x40, + 0x53, 0x57, 0xcc, 0xd4, 0x4b, 0xad, 0x55, 0x87, 0x80, 0x99, 0xa6, 0xcc, + 0xb6, 0x6e, 0x5e, 0x97, 0x70, 0x9a, 0x6e, 0x72, 0x81, 0x33, 0x62, 0xb7, + 0xa5, 0xd7, 0x55, 0xb6, 0xcf, 0x3e, 0x45, 0xa6, 0x7e, 0x46, 0x36, 0xd5, + 0x4e, 0xc2, 0xcc, 0xba, 0x4b, 0xa4, 0xce, 0x3c, 0x7a, 0xa7, 0x9c, 0xc9, + 0x81, 0x56, 0xc8, 0xa5, 0x7e, 0x65, 0x88, 0x44, 0x5a, 0x54, 0x45, 0x81, + 0x81, 0x89, 0x71, 0x52, 0x29, 0xa7, 0x99, 0x7f, 0xd1, 0xce, 0x40, 0xd5, + 0x97, 0x8d, 0x6e, 0x81, 0xd3, 0x76, 0xad, 0x4e, 0x37, 0x98, 0xa5, 0x87, + 0xb1, 0x48, 0x88, 0x35, 0xa4, 0xd9, 0x66, 0xbb, 0xca, 0xc7, 0x34, 0x5c, + 0x38, 0xaa, 0xa0, 0x8e, 0x5d, 0x50, 0xdf, 0x8c, 0x6e, 0x44, 0xda, 0xcc, + 0x71, 0x93, 0xcc, 0x36, 0xc6, 0xcb, 0xae, 0x45, 0x51, 0xb8, 0xdb, 0x9e, + 0x7c, 0x57, 0x76, 0xb5, 0xb9, 0x8d, 0x4e, 0x42, 0x41, 0xc9, 0x7d, 0x5f, + 0xd5, 0x9c, 0xa2, 0x8b, 0x63, 0xa9, 0x57, 0xb8, 0x4b, 0xb1, 0xab, 0xb1, + 0x5f, 0x96, 0xaa, 0x7a, 0x6d, 0xab, 0x8e, 0x6d, 0x41, 0xc3, 0xde, 0xc0, + 0x3d, 0x34, 0x67, 0xac, 0x93, 0x8d, 0x4d, 0x89, 0x79, 0xca, 0x4b, 0x44, + 0xa7, 0x8a, 0x72, 0x87, 0x44, 0x79, 0x4c, 0x7a, 0x7f, 0x7d, 0x4e, 0xc3, + 0x90, 0x97, 0x77, 0x98, 0x85, 0xc2, 0x54, 0x5e, 0x8f, 0x91, 0x95, 0xbb, + 0xbd, 0xac, 0x73, 0xc9, 0x8e, 0xb9, 0x34, 0x23, 0x33, 0x7b, 0x2f, 0xb5, + 0xc4, 0x9f, 0xab, 0xba, 0xc6, 0x88, 0x84, 0x67, 0x94, 0x3e, 0xba, 0xa7, + 0xc6, 0x77, 0x7d, 0xac, 0xcf, 0x55, 0xe1, 0x4c, 0x58, 0x66, 0x7d, 0x70, + 0xce, 0xac, 0x3c, 0x68, 0xcf, 0xc0, 0xa1, 0x66, 0x88, 0x3f, 0x9f, 0x5e, + 0x60, 0x72, 0xa3, 0xb4, 0x55, 0x88, 0xad, 0x80, 0xcd, 0x56, 0x74, 0x94, + 0xc1, 0x48, 0xc3, 0xbc, 0xb6, 0x39, 0xd4, 0x68, 0xd4, 0xc3, 0x3c, 0x38, + 0x71, 0x4f, 0x98, 0x35, 0x65, 0x54, 0x59, 0xdc, 0x76, 0xba, 0x52, 0xac, + 0xc1, 0x58, 0xc0, 0x76, 0x43, 0xd2, 0xb6, 0xc1, 0x54, 0x3b, 0xb2, 0xcf, + 0x5a, 0xaf, 0x9e, 0x90, 0x9e, 0x89, 0x42, 0xcc, 0xcf, 0xc4, 0x26, 0xce, + 0xc9, 0x3c, 0x66, 0xaa, 0x7e, 0xb7, 0xb3, 0x49, 0x98, 0xd6, 0x92, 0x7f, + 0x49, 0xbd, 0xc2, 0xb7, 0x38, 0x6a, 0x88, 0x83, 0x6c, 0x6e, 0x81, 0x60, + 0x44, 0x5f, 0x7b, 0xa3, 0x51, 0x56, 0x9a, 0xa2, 0xb9, 0x38, 0x5a, 0x89, + 0xd1, 0x3d, 0x88, 0xa2, 0x9c, 0xc6, 0x7d, 0x5d, 0x6e, 0x93, 0x4c, 0x52, + 0xc3, 0x35, 0x8a, 0x5d, 0x32, 0xc9, 0x92, 0x48, 0x52, 0x97, 0x57, 0x61, + 0x5e, 0x8c, 0x89, 0xa9, 0x77, 0x89, 0x5c, 0xc5, 0x9f, 0xb2, 0x7d, 0x74, + 0xc6, 0x94, 0x73, 0x7b, 0xa1, 0x56, 0x69, 0x63, 0x9c, 0xd3, 0x4a, 0x5c, + 0x69, 0x7f, 0x50, 0x42, 0x48, 0xa3, 0x52, 0x66, 0xd1, 0x97, 0x8d, 0x7c, + 0xa8, 0xa8, 0x7a, 0xc0, 0x66, 0x59, 0x3f, 0xa6, 0x83, 0xc7, 0x5d, 0x2d, + 0x58, 0x63, 0x30, 0x2f, 0xb8, 0x41, 0x4d, 0x86, 0x32, 0x59, 0x7d, 0xc6, + 0xd0, 0xb6, 0x53, 0x65, 0x7f, 0x62, 0x51, 0x55, 0xcb, 0x3e, 0x49, 0x6d, + 
0xa1, 0x8b, 0xc6, 0x93, 0x32, 0x4e, 0x42, 0x57, 0xc2, 0x9a, 0x3a, 0x95, + 0xd5, 0x6f, 0x7c, 0xba, 0x89, 0x58, 0x4b, 0x9c, 0xaa, 0xa8, 0x3d, 0x66, + 0x8f, 0x46, 0x62, 0x4f, 0x63, 0x83, 0x98, 0x61, 0x78, 0xb7, 0x58, 0x51, + 0x6e, 0x99, 0x9b, 0xb8, 0x8e, 0xc0, 0x6c, 0xb4, 0x92, 0x71, 0xc6, 0xca, + 0x46, 0x68, 0x9f, 0xab, 0x96, 0x8e, 0x85, 0x79, 0x95, 0xa2, 0x8d, 0x85, + 0xa0, 0xae, 0x6a, 0xcb, 0xa4, 0x55, 0xa3, 0x83, 0xa5, 0x9d, 0x68, 0xd0, + 0xcb, 0x95, 0x85, 0xbb, 0xa2, 0x57, 0x61, 0x3b, 0x36, 0x9d, 0x77, 0x7d, + 0xbd, 0x65, 0x4f, 0x59, 0xbc, 0xc5, 0xac, 0x70, 0xb3, 0x85, 0x52, 0x8a, + 0x9e, 0x35, 0x41, 0x36, 0x69, 0xc1, 0xbe, 0xb1, 0xa4, 0x58, 0xaa, 0x80, + 0x64, 0xb0, 0x4d, 0x90, 0xb7, 0x8e, 0x89, 0x57, 0x97, 0xbe, 0x72, 0x8d, + 0xb3, 0x32, 0x33, 0x57, 0x81, 0x42, 0xcf, 0x85, 0xcb, 0xd6, 0x64, 0x83, + 0x38, 0x77, 0xb8, 0x83, 0xcc, 0xa5, 0x92, 0x73, 0xc0, 0x73, 0xc4, 0xad, + 0x2d, 0x96, 0xc7, 0x5c, 0x76, 0xc7, 0xc3, 0x3a, 0x75, 0x61, 0x4a, 0xba, + 0xb1, 0x5d, 0x50, 0xc9, 0x80, 0x3a, 0xac, 0xa4, 0x72, 0x9a, 0x9d, 0x86, + 0xab, 0x4b, 0x41, 0xa0, 0xd2, 0x38, 0x72, 0x8d, 0x36, 0x87, 0xc3, 0x52, + 0xb5, 0x56, 0x78, 0x6c, 0x7e, 0xa3, 0x8e, 0xbb, 0x74, 0x3d, 0x53, 0x4e, + 0x6c, 0x75, 0x34, 0xc7, 0x59, 0x96, 0x7a, 0x53, 0xc1, 0x3d, 0x75, 0x49, + 0x6c, 0xcb, 0x79, 0x6c, 0xc2, 0x52, 0x3f, 0xac, 0x4f, 0x47, 0xc8, 0xb4, + 0xb7, 0x3b, 0x97, 0x77, 0x6c, 0xb6, 0x92, 0x8a, 0x71, 0x8a, 0x48, 0x9e, + 0x64, 0x7f, 0x9c, 0x37, 0x71, 0x52, 0x87, 0x61, 0xd0, 0xbb, 0xa0, 0x3a, + 0x9d, 0x6a, 0xb2, 0xbe, 0xa5, 0x85, 0x6c, 0x97, 0x46, 0x38, 0x6d, 0x36, + 0x77, 0xcc, 0xc1, 0x89, 0x62, 0xd3, 0xa5, 0x73, 0x90, 0xbd, 0xaa, 0x87, + 0x8a, 0x9e, 0x52, 0x44, 0x56, 0xaa, 0x91, 0x68, 0xc3, 0x9d, 0x55, 0x7a, + 0x89, 0x62, 0xc1, 0x7d, 0x62, 0x40, 0x35, 0xb1, 0x5c, 0xc4, 0x53, 0x86, + 0xb8, 0xc1, 0x3c, 0xa2, 0xbd, 0xa7, 0x52, 0x54, 0x75, 0x8d, 0xb3, 0x6d, + 0x7b, 0x4b, 0x3d, 0xb2, 0xcc, 0x66, 0x34, 0xb6, 0x4b, 0x9a, 0x5d, 0x54, + 0x67, 0x95, 0x81, 0x45, 0x37, 0xb1, 0x3a, 0x2f, 0x37, 0x9d, 0x4b, 0x3c, + 0x93, 0x59, 0xae, 0x80, 0xc3, 0x4a, 0x8e, 0x6e, 0x95, 0xd6, 0xa6, 0xc7, + 0x30, 0x40, 0x5b, 0x3c, 0x7a, 0x4b, 0x40, 0xaf, 0x99, 0x40, 0x64, 0x6d, + 0x54, 0x81, 0x6d, 0x3d, 0x43, 0xbb, 0xbf, 0x93, 0xa2, 0xc7, 0x5b, 0x45, + 0x35, 0x30, 0x5a, 0xa4, 0x47, 0x61, 0x91, 0x8c, 0x46, 0x87, 0xc7, 0x4d, + 0x36, 0x75, 0x87, 0xa8, 0x7e, 0x81, 0xc7, 0xa2, 0x49, 0x83, 0x71, 0x65, + 0xb1, 0xab, 0x34, 0x9c, 0xbd, 0x9e, 0xd6, 0xa6, 0xc9, 0xb7, 0xbd, 0x56, + 0x8a, 0x89, 0x5d, 0x66, 0xbf, 0x79, 0x71, 0x6d, 0x5d, 0x63, 0x43, 0x4b, + 0x34, 0x5d, 0x5e, 0x56, 0x9d, 0x91, 0xa1, 0xd6, 0xb4, 0x3e, 0x91, 0x67, + 0x98, 0xaf, 0xc0, 0x76, 0xc0, 0x99, 0xc4, 0x9d, 0xa3, 0xaa, 0x88, 0x3c, + 0xbe, 0x88, 0x96, 0xc4, 0x85, 0xbc, 0x96, 0x4f, 0x9d, 0xbf, 0x34, 0x4b, + 0xac, 0x7f, 0x46, 0x34, 0x6c, 0xb5, 0xbd, 0x9e, 0x56, 0x4f, 0x37, 0x61, + 0x75, 0x47, 0xb3, 0x4c, 0x4c, 0x49, 0xa0, 0x65, 0x99, 0xd5, 0x9c, 0x73, + 0x3a, 0x70, 0x3f, 0xb7, 0xaf, 0xb1, 0x40, 0x83, 0x94, 0xd5, 0x87, 0x4c, + 0x54, 0x81, 0xa4, 0x98, 0x73, 0xa5, 0x46, 0xd1, 0x8f, 0x89, 0x55, 0x6b, + 0xd7, 0xc6, 0x7e, 0x3e, 0x4f, 0xc0, 0x2f, 0x33, 0x47, 0x84, 0x52, 0x75, + 0x49, 0x52, 0x8c, 0x30, 0x9a, 0x8c, 0xad, 0x76, 0x87, 0x4d, 0x81, 0x4f, + 0x4d, 0x46, 0x86, 0xad, 0x4d, 0x58, 0x73, 0x7d, 0x67, 0x9e, 0x68, 0xc5, + 0x98, 0x87, 0xb4, 0x8d, 0x7b, 0xa2, 0x79, 0x9b, 0x3a, 0x2e, 0x75, 0x89, + 0x7b, 0x67, 0xb3, 0x95, 0xa3, 0x83, 0x73, 0xbf, 0x59, 0xad, 0xb9, 0x34, + 0x73, 0x5a, 0x7d, 0x54, 0x8d, 0x6f, 0xd6, 0x6d, 0x45, 0xde, 0x6d, 0x67, + 0x61, 0x5d, 0x42, 0x73, 0x73, 0x64, 0x98, 0x5b, 0x54, 0x73, 0x94, 0xbe, + 
0x75, 0x60, 0x9d, 0xbf, 0x67, 0x41, 0x4b, 0xbe, 0x47, 0xc4, 0xaf, 0xb9, + 0x46, 0x67, 0xcd, 0xc4, 0x96, 0x7a, 0xce, 0x56, 0xcd, 0x68, 0xbe, 0x9e, + 0x94, 0x40, 0x9b, 0x84, 0x4d, 0xca, 0x69, 0x4e, 0xa4, 0x68, 0x91, 0x59, + 0x9f, 0x83, 0x4d, 0x65, 0xac, 0xb5, 0x9e, 0x80, 0x7a, 0x4e, 0x50, 0x61, + 0x39, 0x42, 0xb9, 0x8d, 0x58, 0x89, 0xbc, 0x73, 0x9f, 0x71, 0x79, 0x6a, + 0xd0, 0xa5, 0xc0, 0xbb, 0xb1, 0x72, 0x29, 0x36, 0xa2, 0x38, 0x7a, 0xbf, + 0x49, 0x9c, 0x91, 0x47, 0x64, 0x64, 0x53, 0x8b, 0x46, 0x51, 0x98, 0x32, + 0x41, 0xa6, 0xcf, 0xae, 0x32, 0xb7, 0xbe, 0x71, 0x43, 0xbf, 0xb8, 0x90, + 0x65, 0x51, 0xbf, 0x64, 0x7c, 0x9d, 0xc6, 0x8e, 0x70, 0x38, 0x74, 0x97, + 0x7e, 0x43, 0x90, 0xa8, 0x86, 0xb2, 0x3f, 0x8a, 0xa0, 0x37, 0x98, 0x88, + 0x30, 0xc5, 0x46, 0xa9, 0x75, 0x51, 0xad, 0x93, 0x45, 0xa5, 0x57, 0xbf, + 0xca, 0x42, 0xad, 0xa0, 0xc9, 0x78, 0xa5, 0x67, 0x52, 0x33, 0x83, 0x64, + 0xba, 0x84, 0xa5, 0x7b, 0xb2, 0xad, 0x3f, 0x96, 0x8d, 0x6c, 0x5f, 0x5f, + 0xc9, 0x7f, 0x99, 0x71, 0x59, 0x66, 0xad, 0x6c, 0xa3, 0x3d, 0xc5, 0xa3, + 0xaa, 0x53, 0x7c, 0x4d, 0xaf, 0xce, 0x4f, 0x55, 0x36, 0xa3, 0x9a, 0x84, + 0x8a, 0xab, 0x77, 0x87, 0xcd, 0x6f, 0x66, 0x8f, 0x3d, 0x75, 0xab, 0x8c, + 0x46, 0x60, 0xc5, 0xb4, 0x68, 0x39, 0x42, 0x41, 0xcd, 0x65, 0x46, 0x3f, + 0x4f, 0x6d, 0x60, 0xda, 0xc2, 0xcb, 0xb2, 0x9d, 0xac, 0x31, 0x88, 0xb1, + 0x7e, 0x6b, 0xb9, 0x3b, 0x90, 0x84, 0x63, 0x85, 0x66, 0x73, 0x6d, 0x8f, + 0xaa, 0x84, 0xba, 0x76, 0xaf, 0xa3, 0x85, 0x63, 0x97, 0x9f, 0x68, 0xc1, + 0x83, 0xc8, 0x74, 0xd2, 0x32, 0x99, 0xb1, 0xaf, 0xbb, 0xa1, 0x42, 0x7f, + 0x54, 0x9a, 0xc1, 0x3b, 0xa0, 0x63, 0x51, 0x3e, 0x69, 0x98, 0x4a, 0x56, + 0x69, 0x98, 0x93, 0x3a, 0xd6, 0x93, 0x69, 0x2f, 0x49, 0x9f, 0xca, 0xaa, + 0x41, 0x80, 0xb9, 0x34, 0xb3, 0x3f, 0x69, 0x7e, 0x3e, 0xb9, 0x97, 0x57, + 0x41, 0x6b, 0x6d, 0x54, 0xa2, 0x31, 0x46, 0x73, 0x78, 0x6f, 0x5c, 0x91, + 0x57, 0xc7, 0x99, 0x7e, 0x92, 0x5d, 0x66, 0x46, 0x6a, 0x32, 0x86, 0x49, + 0xd3, 0xc8, 0x62, 0x88, 0xb1, 0xac, 0x36, 0x4d, 0x79, 0x91, 0xb1, 0xc5, + 0x50, 0x48, 0xd0, 0xa5, 0x9c, 0xc7, 0x34, 0x5c, 0x5c, 0x55, 0x43, 0xd3, + 0x9a, 0xac, 0x8e, 0x7d, 0x5d, 0xc3, 0x81, 0x6d, 0x8f, 0x57, 0x3b, 0x80, + 0x7f, 0x69, 0xc7, 0xb3, 0x5a, 0x6f, 0x5d, 0x4c, 0x8d, 0xc9, 0x60, 0x87, + 0xb1, 0x8a, 0xaa, 0xc3, 0xc1, 0x35, 0x3a, 0x8a, 0xc6, 0xb3, 0x79, 0x53, + 0xd4, 0xd3, 0x80, 0x4c, 0xa1, 0x39, 0x48, 0x42, 0xa3, 0x36, 0x47, 0xa2, + 0x50, 0xc1, 0x61, 0x59, 0xb9, 0x3f, 0x4a, 0xa6, 0xcd, 0xc4, 0xb7, 0x44, + 0xca, 0xb9, 0x53, 0x96, 0x77, 0x75, 0x89, 0xc3, 0x6d, 0x7d, 0xc6, 0xc4, + 0xc2, 0x79, 0x8c, 0x33, 0xcc, 0xac, 0x4e, 0xd1, 0x69, 0x83, 0xa9, 0x36, + 0x5f, 0x8f, 0x5f, 0xb8, 0xc3, 0x97, 0xb9, 0xcc, 0xb8, 0x88, 0x4f, 0xaa, + 0xb4, 0x56, 0x50, 0xa7, 0x54, 0xc8, 0xb5, 0x39, 0x50, 0x5f, 0xbb, 0x44, + 0x8e, 0xae, 0x6b, 0xc9, 0x7d, 0xc2, 0x34, 0x71, 0x92, 0x42, 0x56, 0x5e, + 0x42, 0xa1, 0xaa, 0x4b, 0x99, 0x48, 0x85, 0xce, 0x66, 0x7c, 0xbf, 0x4e, + 0x7c, 0xcf, 0x7b, 0x47, 0x5d, 0x5c, 0x2e, 0x74, 0xd1, 0xc1, 0x62, 0x33, + 0x46, 0x69, 0xcd, 0x64, 0x37, 0xc1, 0x6f, 0xb8, 0xc8, 0x35, 0x54, 0x44, + 0xb2, 0xae, 0x9a, 0xa8, 0xbb, 0xb8, 0xa1, 0x84, 0x33, 0xa4, 0x33, 0x96, + 0x4f, 0x4b, 0x88, 0xc1, 0xbe, 0x60, 0xae, 0x97, 0xcc, 0x74, 0x40, 0x6f, + 0xc5, 0x75, 0x4a, 0x83, 0xa8, 0x9d, 0x79, 0x89, 0xc2, 0x9c, 0x87, 0x6b, + 0x64, 0xc0, 0x46, 0xd3, 0x43, 0x41, 0x60, 0x8c, 0x4b, 0xc6, 0x7e, 0x37, + 0x40, 0xb2, 0x55, 0xc3, 0xcc, 0xbb, 0xd0, 0x4b, 0xbf, 0xcc, 0x84, 0xc6, + 0x3d, 0xd3, 0xd7, 0x55, 0x54, 0x76, 0x9d, 0x8a, 0x6a, 0xbd, 0x69, 0xc0, + 0x88, 0x7e, 0x64, 0x4d, 0x73, 0x69, 0x88, 0x94, 0xa6, 0x8e, 0x34, 0x61, + 
0x9d, 0x47, 0x67, 0x94, 0x8b, 0x35, 0x96, 0x60, 0xbd, 0x43, 0x39, 0x46, + 0x46, 0x5c, 0x8f, 0x5a, 0x63, 0xbf, 0xb7, 0x6c, 0x59, 0x51, 0x74, 0x66, + 0x72, 0x75, 0xa2, 0xc3, 0x61, 0xa1, 0xb4, 0xa6, 0x69, 0xd3, 0x87, 0xae, + 0xb1, 0x92, 0x6f, 0x52, 0xce, 0x49, 0xa8, 0x3a, 0xa9, 0x4a, 0xaf, 0x66, + 0x96, 0x5a, 0x4e, 0x58, 0xd0, 0xa2, 0x5e, 0x4f, 0xaf, 0x87, 0x91, 0x8d, + 0x8f, 0x56, 0x3a, 0x3b, 0x31, 0xce, 0x71, 0xd4, 0xa2, 0xd2, 0x44, 0x82, + 0x7d, 0x3b, 0x53, 0x77, 0xc1, 0x88, 0x32, 0x82, 0xa3, 0x98, 0x4c, 0xd6, + 0xcc, 0xa7, 0x8b, 0xba, 0x81, 0xa5, 0xb5, 0x49, 0xa8, 0x47, 0x83, 0x74, + 0x97, 0x70, 0xbf, 0x67, 0x50, 0x66, 0xd1, 0xb6, 0x8e, 0xbd, 0x48, 0x4c, + 0x8e, 0x4d, 0x71, 0x8d, 0x97, 0x77, 0x84, 0x39, 0x50, 0x62, 0xb1, 0x85, + 0x6b, 0xb9, 0xa0, 0x92, 0x6f, 0xb3, 0x9f, 0xca, 0x41, 0x5a, 0x9e, 0x5d, + 0x34, 0x7b, 0xb7, 0x62, 0x92, 0x8e, 0xab, 0x6b, 0x92, 0xb8, 0xcd, 0x72, + 0x47, 0x5a, 0x4e, 0xab, 0x8d, 0xd0, 0x3e, 0x66, 0x50, 0x8f, 0x70, 0x62, + 0x7d, 0x4f, 0x3a, 0xcc, 0x77, 0x58, 0x9c, 0xaa, 0xc1, 0x49, 0x40, 0x9f, + 0x42, 0x92, 0x59, 0x43, 0x2f, 0x73, 0x6b, 0x9f, 0x4f, 0x97, 0x70, 0xab, + 0xaa, 0x9b, 0xa0, 0x9e, 0xca, 0x45, 0xd4, 0xb5, 0x4b, 0x7c, 0xc0, 0x62, + 0xb6, 0x5d, 0x30, 0x8e, 0xb0, 0x8a, 0x41, 0xa2, 0x83, 0x7d, 0xa9, 0x78, + 0x41, 0xb2, 0x9b, 0x87, 0xb1, 0x9c, 0x85, 0x59, 0x8b, 0xb0, 0x7d, 0xca, + 0xb4, 0x6d, 0xd3, 0xa2, 0x44, 0x52, 0x59, 0x93, 0x76, 0xb1, 0x6a, 0x6b, + 0xad, 0x7a, 0x9d, 0x85, 0xaa, 0x9d, 0x58, 0xb2, 0x36, 0x96, 0x3e, 0x64, + 0x96, 0x35, 0x96, 0x60, 0xa2, 0xd0, 0x7b, 0x70, 0xba, 0x3a, 0x79, 0xbd, + 0x72, 0x9d, 0xbd, 0x52, 0x6a, 0x91, 0xa5, 0x76, 0x41, 0x4c, 0x68, 0xa7, + 0x5f, 0xaa, 0xc0, 0xb2, 0xa0, 0x52, 0xa4, 0x58, 0x48, 0x51, 0x78, 0x37, + 0x74, 0xc4, 0xcb, 0xb4, 0xd0, 0x97, 0x78, 0x5f, 0x87, 0x4d, 0x59, 0x67, + 0x35, 0xbf, 0x5d, 0x46, 0xb9, 0x99, 0x84, 0xcd, 0xce, 0xa4, 0x75, 0x7b, + 0x56, 0x71, 0x6a, 0xca, 0x45, 0xa2, 0x97, 0xaa, 0xca, 0x66, 0x44, 0x55, + 0x3e, 0x80, 0x77, 0xbd, 0x64, 0x9c, 0x8e, 0xc9, 0x38, 0x9f, 0x5a, 0x84, + 0x8d, 0xcc, 0x7f, 0x86, 0xd3, 0xcb, 0xb1, 0x7a, 0x4b, 0xad, 0x5a, 0x39, + 0x4a, 0xc3, 0x52, 0x6d, 0xbc, 0xb3, 0x83, 0xc8, 0x49, 0x9a, 0x3d, 0x33, + 0x3d, 0x9e, 0xd4, 0x97, 0x80, 0x53, 0x4d, 0x52, 0xcd, 0xc4, 0xb7, 0xb4, + 0x78, 0xae, 0x3a, 0x73, 0x78, 0x6a, 0xa2, 0x8f, 0x70, 0x55, 0xc5, 0xcf, + 0x7d, 0x39, 0x75, 0x41, 0x66, 0x6b, 0x8e, 0x60, 0x9d, 0xa1, 0x90, 0x47, + 0x64, 0x99, 0x50, 0x7e, 0x7c, 0x85, 0xb0, 0x5c, 0xaf, 0x63, 0xc6, 0x4a, + 0x4a, 0xcb, 0x9c, 0x42, 0x8c, 0x3c, 0x6d, 0x31, 0x8b, 0x88, 0xbf, 0xb6, + 0x40, 0x37, 0x9b, 0x94, 0x33, 0xc2, 0x9e, 0xae, 0x92, 0x82, 0x82, 0xb1, + 0x61, 0x77, 0x74, 0x80, 0x84, 0xc7, 0xae, 0x9b, 0x82, 0x47, 0x84, 0x64, + 0x89, 0x8d, 0x7f, 0x58, 0x7c, 0x38, 0x84, 0x59, 0x8d, 0xac, 0x87, 0xbe, + 0x41, 0x7e, 0xa4, 0x6b, 0x63, 0x57, 0xa0, 0xa2, 0xaf, 0x5c, 0x4f, 0x68, + 0x50, 0x36, 0xc0, 0x93, 0xc3, 0xb1, 0x4c, 0x80, 0x8e, 0x6a, 0x82, 0xca, + 0x52, 0xbf, 0x52, 0xb1, 0x54, 0xab, 0x8f, 0x76, 0x40, 0x47, 0x72, 0x38, + 0x60, 0xbc, 0x8c, 0xbc, 0x8f, 0xc7, 0x6b, 0xaf, 0x9c, 0x64, 0x73, 0x50, + 0xbe, 0x69, 0x5e, 0x74, 0xb9, 0x44, 0xde, 0xb3, 0xaa, 0x4f, 0xc4, 0xa1, + 0xb7, 0x7b, 0x85, 0xc4, 0x7b, 0x9f, 0x7e, 0x6d, 0x62, 0x6a, 0xa8, 0x5c, + 0x75, 0x49, 0x75, 0x5a, 0xa5, 0x47, 0x55, 0x99, 0x2c, 0x34, 0x64, 0x9a, + 0x4a, 0x2c, 0xbe, 0x67, 0x58, 0xb4, 0x96, 0xc5, 0x36, 0x84, 0xc3, 0xd1, + 0xb5, 0xb4, 0xad, 0x67, 0x45, 0xc2, 0xdb, 0x9d, 0x99, 0x86, 0x4d, 0x97, + 0xa1, 0x78, 0xa7, 0xb2, 0x68, 0x68, 0xc5, 0x4e, 0x6f, 0xcc, 0xad, 0x33, + 0x85, 0x96, 0x81, 0x7f, 0x20, 0x30, 0x3d, 0xbf, 0xe7, 0x66, 0x59, 0x8d, + 
0xa0, 0x79, 0x43, 0x65, 0xbb, 0xaf, 0x4b, 0x64, 0xca, 0x4e, 0x7c, 0x57, + 0xbd, 0x66, 0x7e, 0x92, 0xc9, 0x5e, 0x5d, 0xc2, 0xb1, 0xbf, 0x7c, 0x82, + 0x8e, 0x8a, 0xa8, 0x5a, 0x51, 0x81, 0x86, 0xbe, 0xc0, 0x92, 0x9f, 0xbc, + 0xbd, 0x7d, 0xad, 0xab, 0xcf, 0x63, 0x7c, 0x39, 0xcc, 0x6e, 0x71, 0x8a, + 0x50, 0xbd, 0x76, 0x55, 0x89, 0xac, 0x40, 0x71, 0xd8, 0x8b, 0xad, 0xbd, + 0x96, 0x97, 0x76, 0x3f, 0x8a, 0x5b, 0xd1, 0x60, 0xaa, 0xa1, 0x85, 0x52, + 0xaa, 0x72, 0x3d, 0xa1, 0x97, 0xac, 0xb3, 0x32, 0x3e, 0x88, 0xc1, 0x60, + 0xad, 0x36, 0xbc, 0x3e, 0x4b, 0xbb, 0x4e, 0x62, 0x86, 0x86, 0xa1, 0x58, + 0x98, 0xd1, 0xb5, 0x7e, 0xcd, 0x8a, 0x5f, 0x54, 0x78, 0xb2, 0xa6, 0x88, + 0x9f, 0x80, 0x59, 0x89, 0x68, 0x4a, 0x79, 0x5b, 0xa3, 0xb2, 0xa3, 0x64, + 0xc8, 0x9e, 0xa7, 0x62, 0x2c, 0x93, 0xe1, 0xac, 0x7b, 0xa4, 0x70, 0xcc, + 0xa0, 0x65, 0xcc, 0x99, 0x60, 0xb7, 0x41, 0xae, 0xd4, 0x62, 0x63, 0x8a, + 0x9e, 0xa3, 0x63, 0x4b, 0x4e, 0x67, 0x61, 0x63, 0x7f, 0x66, 0x63, 0x93, + 0x56, 0x7b, 0x82, 0xb1, 0x9d, 0x46, 0x35, 0xac, 0x86, 0x8c, 0x69, 0x78, + 0xa6, 0x97, 0x87, 0x85, 0x48, 0x3e, 0x5f, 0x88, 0x66, 0x4b, 0xa3, 0x8d, + 0x9b, 0xd8, 0x74, 0x49, 0x3f, 0x27, 0x6b, 0xb1, 0xac, 0x91, 0x8e, 0x79, + 0x60, 0xba, 0x93, 0x6b, 0xbf, 0x9a, 0x73, 0x53, 0xa8, 0x82, 0x8b, 0x50, + 0x29, 0x62, 0x34, 0x4b, 0xa0, 0x92, 0xc5, 0x32, 0xbc, 0x60, 0x64, 0x7d, + 0x39, 0xa4, 0x9d, 0x4f, 0x41, 0x9e, 0x5a, 0xaa, 0xb6, 0x62, 0xdc, 0x7f, + 0x64, 0x8a, 0x35, 0xbf, 0x59, 0x5a, 0xcf, 0x96, 0xad, 0xca, 0xc2, 0xc1, + 0x99, 0xb5, 0x64, 0x9f, 0x3b, 0x89, 0x78, 0x88, 0x3e, 0x37, 0xc3, 0xb9, + 0x59, 0xc8, 0xc9, 0x72, 0x41, 0x2c, 0x55, 0x9a, 0x5d, 0x81, 0x9b, 0x5c, + 0x83, 0x53, 0x44, 0x9c, 0x75, 0xce, 0x60, 0xce, 0x9b, 0x79, 0x41, 0xb0, + 0x6a, 0x7a, 0xbb, 0xcd, 0x3a, 0xbc, 0x3d, 0x45, 0xc9, 0xba, 0x4c, 0x92, + 0xc1, 0xc6, 0x9b, 0x37, 0x47, 0xa9, 0x3f, 0x53, 0x62, 0x3d, 0x91, 0xc7, + 0xae, 0x72, 0x51, 0x46, 0xcd, 0x48, 0xb3, 0x3d, 0x8e, 0x55, 0x3a, 0x47, + 0xd0, 0x5a, 0xc2, 0x66, 0x82, 0x4b, 0x8c, 0x99, 0x6a, 0xae, 0x6d, 0xa1, + 0x73, 0xb8, 0x4d, 0x81, 0x67, 0xc8, 0x5b, 0x41, 0x8a, 0xc6, 0x9a, 0x8b, + 0xa1, 0xd0, 0x77, 0x6f, 0x9c, 0xcc, 0x54, 0x5d, 0xac, 0x7b, 0x4e, 0x8a, + 0x75, 0x8f, 0xa0, 0x41, 0x9a, 0xb5, 0x35, 0xa6, 0x7a, 0x52, 0xcf, 0xc4, + 0x8e, 0x3d, 0xc1, 0x96, 0xaa, 0xb7, 0x94, 0xb0, 0x2e, 0xaf, 0x7c, 0x5a, + 0x3a, 0xaa, 0xa7, 0x88, 0x5f, 0x53, 0x3d, 0x42, 0x49, 0xb5, 0x76, 0xb7, + 0x56, 0x66, 0x9d, 0xaa, 0xd7, 0x7c, 0x9e, 0x98, 0xbf, 0x5f, 0x85, 0x4f, + 0x70, 0x40, 0x43, 0xd5, 0xc1, 0x8e, 0x94, 0x54, 0xc5, 0x5e, 0x99, 0x7e, + 0x3c, 0xbe, 0xcb, 0x8d, 0x9f, 0x55, 0x60, 0x49, 0xc2, 0x5e, 0x9c, 0x91, + 0x83, 0xb7, 0x44, 0xa3, 0x51, 0xb7, 0x84, 0x5b, 0x97, 0x7e, 0x2e, 0x66, + 0x76, 0x42, 0x4d, 0x3d, 0x55, 0x87, 0x95, 0xb1, 0x39, 0x75, 0x8a, 0x81, + 0xd8, 0x45, 0x4c, 0xa6, 0xa7, 0x28, 0xb5, 0x75, 0xc0, 0x43, 0x8c, 0x64, + 0xa1, 0x93, 0x3e, 0x77, 0x9e, 0xdf, 0x67, 0x4a, 0x84, 0x32, 0x76, 0x9e, + 0x34, 0x48, 0x4a, 0xc3, 0xa8, 0x34, 0x7d, 0x3c, 0x39, 0x8d, 0x97, 0xcd, + 0x31, 0x7a, 0x82, 0x64, 0x6e, 0x32, 0x7b, 0xae, 0x4e, 0x88, 0xc2, 0x67, + 0x80, 0x32, 0x97, 0xa2, 0xbc, 0x2d, 0x47, 0x62, 0x5a, 0xa5, 0x5d, 0x41, + 0xb0, 0x2d, 0x33, 0x86, 0x47, 0x3f, 0x90, 0x9b, 0x49, 0x64, 0xb8, 0x62, + 0xb0, 0x8e, 0x59, 0x8a, 0x7e, 0xd0, 0x54, 0x30, 0x5d, 0x5d, 0xc2, 0xaa, + 0xc9, 0x36, 0x63, 0x93, 0x5a, 0xc5, 0x43, 0xb7, 0x5d, 0x4f, 0x36, 0x51, + 0x3b, 0x90, 0xd6, 0x4a, 0xbd, 0x49, 0x4e, 0x63, 0x7d, 0x82, 0x3c, 0x43, + 0xc6, 0x93, 0xcc, 0xad, 0xa2, 0x4e, 0xc0, 0xd1, 0x3f, 0xc7, 0x99, 0x3e, + 0x7d, 0x81, 0x94, 0x47, 0x40, 0xc0, 0xb4, 0x43, 0x6e, 0x44, 0x9e, 0xc5, + 
0x7c, 0x85, 0x38, 0x6c, 0xa9, 0x88, 0x63, 0xa4, 0x8f, 0x3f, 0xb2, 0x48, + 0xc4, 0xa8, 0xd3, 0xb3, 0x62, 0x61, 0x99, 0xcf, 0xa3, 0xa6, 0xda, 0x6d, + 0x74, 0x91, 0x31, 0x84, 0x75, 0x81, 0xd0, 0xba, 0x89, 0x5f, 0x41, 0xbe, + 0x9b, 0x33, 0x5f, 0x6a, 0x30, 0xc5, 0xc3, 0x70, 0x92, 0x5c, 0xc2, 0x5a, + 0x99, 0x7b, 0x54, 0xb0, 0xcb, 0x9e, 0x55, 0x94, 0x3a, 0x99, 0x96, 0x9d, + 0x59, 0xd4, 0x8e, 0x3a, 0x4f, 0x6d, 0x4a, 0x7d, 0xcb, 0x2c, 0x90, 0x46, + 0x39, 0x71, 0x6f, 0xc1, 0xce, 0x5c, 0x4e, 0xc8, 0x95, 0x65, 0xa8, 0x82, + 0x4e, 0x4d, 0x9a, 0xc3, 0x4d, 0xb2, 0x49, 0xb0, 0xb8, 0xb4, 0xb8, 0xa4, + 0xd0, 0xc7, 0x89, 0xb1, 0x8b, 0x53, 0x38, 0xa1, 0xcf, 0xba, 0x61, 0x97, + 0x87, 0x68, 0xb4, 0xa6, 0xb2, 0x57, 0x93, 0xcc, 0x3d, 0x3d, 0x5a, 0xc3, + 0xa1, 0xb7, 0xc7, 0x6e, 0x9d, 0x40, 0x74, 0xb9, 0xc2, 0x8f, 0xb9, 0x8a, + 0x7d, 0x92, 0x79, 0xa3, 0x68, 0x60, 0x8c, 0x79, 0x4c, 0xaa, 0x53, 0xc4, + 0xa4, 0xd2, 0xbb, 0x7a, 0xba, 0x5d, 0xd7, 0x9b, 0xb4, 0xd1, 0xb6, 0x87, + 0x9c, 0x4f, 0x4f, 0xc8, 0x62, 0x66, 0x39, 0x51, 0xd2, 0xbd, 0xa1, 0x37, + 0xb6, 0x90, 0xd1, 0x88, 0x6f, 0x88, 0x9c, 0x3f, 0x64, 0x2c, 0xd0, 0xbc, + 0x84, 0x3c, 0x70, 0x67, 0xc2, 0x55, 0x94, 0x54, 0x4b, 0x89, 0x5f, 0x50, + 0x61, 0x2f, 0x71, 0xdc, 0x61, 0xc8, 0x53, 0x93, 0x80, 0xb5, 0xa2, 0x5f, + 0x9d, 0x5e, 0xba, 0xaf, 0x3f, 0xd3, 0x5f, 0x39, 0x74, 0x98, 0x2e, 0x9d, + 0x47, 0x98, 0xcf, 0x51, 0x84, 0x9e, 0xb3, 0xa3, 0x77, 0x33, 0x48, 0xba, + 0x4b, 0x46, 0xb3, 0x2d, 0xce, 0xa5, 0x97, 0xba, 0x6f, 0x68, 0x80, 0x8c, + 0x8c, 0x2e, 0xa6, 0xaa, 0x8e, 0xbb, 0x3b, 0x40, 0x65, 0x4b, 0x7b, 0x68, + 0xa6, 0x7a, 0x8b, 0x8c, 0x32, 0xcd, 0x31, 0x6d, 0xc6, 0x47, 0x82, 0x5d, + 0xa6, 0xc5, 0x64, 0x9a, 0xa4, 0x7c, 0x6d, 0xb1, 0xc3, 0x9a, 0x52, 0x3d, + 0x84, 0x68, 0x87, 0xa2, 0x9d, 0x85, 0x59, 0x81, 0xb2, 0x4a, 0x6b, 0x4d, + 0xbb, 0x42, 0xaa, 0x50, 0x50, 0x36, 0xb9, 0x3d, 0xae, 0x3b, 0xb6, 0x4b, + 0x81, 0x50, 0x22, 0xb3, 0x9f, 0xac, 0x74, 0xba, 0x54, 0xbc, 0x5e, 0x9d, + 0x49, 0x36, 0x79, 0x92, 0x35, 0x33, 0x82, 0x75, 0xb9, 0x58, 0x85, 0xb6, + 0x64, 0x9d, 0x7b, 0xac, 0xd0, 0x4a, 0xd2, 0xc8, 0x95, 0x3d, 0xcb, 0xad, + 0xb1, 0x5f, 0xa4, 0xd3, 0x9e, 0xae, 0xd1, 0x46, 0xa9, 0x71, 0x69, 0x91, + 0x66, 0x46, 0x61, 0xab, 0x78, 0xad, 0xc0, 0x5b, 0x4d, 0xbb, 0x8d, 0x49, + 0xb5, 0xa5, 0xda, 0x94, 0xd1, 0xa3, 0xc6, 0x6f, 0x71, 0x4a, 0xd3, 0x9b, + 0x32, 0x79, 0x6e, 0xa3, 0xce, 0xc2, 0x4b, 0xaf, 0x92, 0x5e, 0x83, 0x43, + 0x69, 0xa5, 0x52, 0xbe, 0x60, 0xcd, 0x4b, 0xc4, 0x54, 0xc8, 0x83, 0xb8, + 0x61, 0x61, 0x65, 0x65, 0x45, 0x9e, 0xc2, 0xb0, 0xca, 0x51, 0x3e, 0x87, + 0x9c, 0xba, 0x8a, 0x73, 0x80, 0x7d, 0x99, 0x52, 0x43, 0xcc, 0x77, 0xa4, + 0x55, 0xca, 0x3b, 0x89, 0x86, 0x63, 0x7f, 0xa8, 0xab, 0x5f, 0x28, 0x7e, + 0xc1, 0x36, 0xcb, 0x7b, 0xc0, 0x6c, 0x6e, 0x68, 0xcc, 0x57, 0xa3, 0x67, + 0xa5, 0xac, 0x57, 0x61, 0xac, 0x3f, 0x9b, 0x5c, 0x35, 0xa8, 0x59, 0x40, + 0x6b, 0xa1, 0xd3, 0x97, 0xcc, 0x84, 0xa4, 0xc9, 0x83, 0x84, 0x49, 0x35, + 0xcc, 0x78, 0x7b, 0xa1, 0x5c, 0xa8, 0x58, 0x63, 0x61, 0x86, 0xc1, 0x7e, + 0x8e, 0x59, 0x54, 0x50, 0xb1, 0x73, 0x64, 0x9a, 0xd6, 0x5b, 0x66, 0x75, + 0xbf, 0xaa, 0x2b, 0x41, 0x46, 0x86, 0x3e, 0x42, 0x5b, 0x9a, 0xb5, 0x95, + 0xba, 0x82, 0x8b, 0xb0, 0x61, 0x51, 0x54, 0x6e, 0x68, 0x8b, 0x7d, 0x35, + 0x91, 0x62, 0xa8, 0xad, 0x7e, 0xb2, 0x4e, 0xc2, 0x32, 0x3a, 0x8f, 0x46, + 0x7e, 0x84, 0x3b, 0x82, 0x98, 0x51, 0x7a, 0x6d, 0x9f, 0x41, 0x5d, 0x42, + 0xcf, 0xad, 0xda, 0x3f, 0x7f, 0xa3, 0xae, 0x7f, 0xc6, 0x76, 0x52, 0xc8, + 0x71, 0x5d, 0x8b, 0x75, 0xac, 0xad, 0x39, 0x48, 0x30, 0xa3, 0x89, 0xbc, + 0x3e, 0x89, 0x3d, 0xb3, 0x40, 0xd3, 0xcf, 0x37, 0xa9, 0xa5, 0xa9, 0xa3, + 
0x2c, 0x98, 0x8e, 0xaa, 0x49, 0x93, 0x99, 0x8d, 0x67, 0x87, 0x55, 0xae, + 0x5f, 0xb0, 0x6d, 0x6e, 0x59, 0x71, 0xba, 0x37, 0x57, 0xd6, 0xa5, 0x6c, + 0xb1, 0x3b, 0xa5, 0x66, 0x3d, 0x47, 0x7a, 0x6f, 0xc9, 0x55, 0x44, 0x8c, + 0xcc, 0xaa, 0x50, 0xcf, 0x6c, 0x7c, 0xbd, 0xc5, 0xad, 0xaa, 0x3e, 0xad, + 0x63, 0xa5, 0xaa, 0xb3, 0xcb, 0x70, 0xa5, 0x95, 0xcc, 0x87, 0xbd, 0x73, + 0x79, 0x30, 0x79, 0x9a, 0x50, 0xb1, 0x37, 0x7a, 0xd4, 0xa2, 0xa3, 0x65, + 0x66, 0x5e, 0x64, 0x9e, 0x96, 0xb3, 0xce, 0x53, 0x6c, 0xaf, 0x48, 0x53, + 0x40, 0xbf, 0xd2, 0x55, 0xa6, 0xc2, 0x58, 0x54, 0x3a, 0x78, 0x79, 0x5e, + 0x40, 0x54, 0x4a, 0x55, 0x38, 0x96, 0x5f, 0x86, 0x64, 0x9f, 0x50, 0xc9, + 0x67, 0x97, 0x6e, 0xb7, 0x9e, 0xbe, 0xc1, 0xb9, 0xb1, 0x89, 0x92, 0x46, + 0x4e, 0xa6, 0xbf, 0x44, 0x73, 0x36, 0x92, 0xad, 0x82, 0x44, 0x4e, 0x35, + 0x5f, 0x7d, 0x61, 0x52, 0xc6, 0x9e, 0x33, 0xa2, 0x61, 0x77, 0x38, 0x86, + 0x58, 0xb8, 0xc8, 0x60, 0x8a, 0xa8, 0x5f, 0x9e, 0x7f, 0x8a, 0xa5, 0x90, + 0xac, 0x32, 0x55, 0xc0, 0x5f, 0xd8, 0x72, 0x72, 0x55, 0x95, 0x41, 0xa2, + 0xbf, 0xd3, 0x7f, 0xcc, 0x5f, 0xb2, 0x98, 0xb1, 0x91, 0xcd, 0x74, 0x74, + 0x84, 0x51, 0x93, 0x5c, 0x3a, 0x4b, 0x94, 0xd0, 0x5c, 0xb0, 0xd1, 0x50, + 0x3f, 0xdf, 0x7f, 0xd4, 0x43, 0x4d, 0x6f, 0xa4, 0x5f, 0xcf, 0x7c, 0xbb, + 0x3b, 0x7f, 0x51, 0x86, 0x66, 0x52, 0x95, 0xca, 0x35, 0x68, 0xa2, 0x96, + 0x7d, 0xbd, 0x59, 0x75, 0xc3, 0x53, 0x79, 0x67, 0xc8, 0xa8, 0x5e, 0x3b, + 0x7c, 0x56, 0x7a, 0xbe, 0x6a, 0xcb, 0xaf, 0x41, 0x84, 0x33, 0x89, 0xa1, + 0x58, 0x7d, 0x31, 0x75, 0x68, 0x74, 0xa9, 0xbb, 0xca, 0x3f, 0xa9, 0x39, + 0x63, 0x88, 0xc9, 0x60, 0x8e, 0x5a, 0xa4, 0x90, 0x89, 0xa2, 0x6a, 0xcf, + 0xcd, 0x58, 0x9f, 0x59, 0x80, 0xb2, 0xbd, 0x57, 0x8b, 0x2f, 0xc1, 0xa0, + 0x86, 0x46, 0xa8, 0x5a, 0x38, 0xd2, 0x62, 0x7f, 0x68, 0xbd, 0x4e, 0x5e, + 0xbc, 0xbd, 0xaf, 0x9e, 0x75, 0x55, 0x35, 0x93, 0xd5, 0x5a, 0x76, 0x6f, + 0xbe, 0x92, 0xa9, 0x7f, 0x8b, 0x74, 0x9b, 0x2a, 0x43, 0x6c, 0x69, 0xb9, + 0x5c, 0x95, 0x72, 0xaa, 0xbc, 0x45, 0x6c, 0xcf, 0x7a, 0x4d, 0xcf, 0x51, + 0xad, 0xbc, 0xc6, 0x54, 0x6b, 0x63, 0xd2, 0x79, 0xc5, 0x51, 0xbf, 0xb1, + 0x46, 0x69, 0xd0, 0x48, 0xa5, 0x4b, 0x82, 0x36, 0xa1, 0x51, 0x36, 0x36, + 0x5b, 0x4b, 0xb3, 0x74, 0x8d, 0xac, 0x6c, 0xa1, 0x63, 0x8a, 0x6a, 0x6e, + 0x74, 0x6a, 0xaa, 0xa9, 0x3b, 0xa6, 0x31, 0x50, 0xb1, 0x85, 0x3b, 0xa3, + 0x85, 0xca, 0x97, 0xbb, 0x51, 0x48, 0x98, 0x4a, 0xb7, 0xba, 0x8f, 0xbc, + 0x65, 0x9f, 0x44, 0x45, 0x8a, 0x6e, 0xd1, 0x5e, 0xd1, 0xa6, 0x5a, 0x30, + 0x59, 0x34, 0x42, 0x51, 0xbd, 0xaf, 0x4b, 0x47, 0x89, 0x7c, 0x5a, 0x33, + 0xc2, 0xbb, 0x78, 0x7b, 0xc5, 0xaa, 0x81, 0x40, 0x85, 0xbe, 0x72, 0xad, + 0x43, 0x3e, 0xa3, 0x7b, 0x52, 0xb3, 0xbe, 0xce, 0xc8, 0xce, 0x2f, 0xb1, + 0xa6, 0xbe, 0xb5, 0x3b, 0x93, 0xb7, 0xc8, 0x63, 0x5a, 0x3c, 0x55, 0x8a, + 0xd5, 0x79, 0x38, 0xc5, 0x64, 0xcc, 0xa4, 0x52, 0x97, 0x3c, 0x7d, 0xa7, + 0xd4, 0x5c, 0xb8, 0x45, 0x6f, 0x84, 0xac, 0xc6, 0x57, 0x7d, 0x91, 0xd2, + 0x79, 0x86, 0x57, 0x30, 0x47, 0x4a, 0x4d, 0x97, 0x38, 0x82, 0xa2, 0xb6, + 0xa5, 0x40, 0xae, 0xbc, 0xa9, 0x36, 0x55, 0x6f, 0x3a, 0x4d, 0x44, 0xc7, + 0x6a, 0xc8, 0x59, 0xb8, 0x8b, 0xb1, 0x69, 0x5c, 0x59, 0x44, 0xa2, 0xb0, + 0xcd, 0x63, 0x5d, 0x83, 0xab, 0xa6, 0xd4, 0x79, 0xa6, 0xaa, 0xa2, 0x37, + 0x7f, 0xb5, 0xbf, 0x3d, 0x49, 0xac, 0xbc, 0xa8, 0x3c, 0xcd, 0x68, 0xbf, + 0xbe, 0x41, 0xcb, 0xc7, 0x94, 0xb4, 0x92, 0xb8, 0xb6, 0xc0, 0x5a, 0x69, + 0x9f, 0xba, 0x8e, 0x3d, 0xb0, 0xaf, 0xa4, 0xb1, 0xc1, 0xa7, 0x3a, 0xa4, + 0x2e, 0xb6, 0xb2, 0x54, 0xc0, 0x76, 0x39, 0x62, 0x4f, 0x4f, 0xae, 0x60, + 0x96, 0x9e, 0x8e, 0xbb, 0x94, 0xba, 0xb6, 0x68, 0x56, 0x3d, 0x93, 0x98, + 
0x95, 0x80, 0x4b, 0x41, 0x65, 0x65, 0xc4, 0x82, 0x4f, 0x82, 0x5d, 0x6d, + 0xd2, 0x4b, 0x76, 0x82, 0x87, 0xc8, 0x42, 0xc7, 0x6c, 0x70, 0x40, 0x5c, + 0x87, 0x40, 0x9b, 0x5d, 0x61, 0xd3, 0x36, 0x7f, 0x87, 0x46, 0x98, 0x5c, + 0x46, 0x51, 0xae, 0xc8, 0xa1, 0x40, 0x44, 0x9e, 0x86, 0x37, 0x35, 0x8a, + 0x74, 0xa5, 0x50, 0x7e, 0xbf, 0x8b, 0x3c, 0x8c, 0x61, 0x7a, 0x3e, 0x99, + 0xa8, 0xc6, 0x39, 0x49, 0xd5, 0xa0, 0x3a, 0xc0, 0x7c, 0x4a, 0xb7, 0x76, + 0x39, 0x79, 0x94, 0x3f, 0xc1, 0x83, 0xca, 0x78, 0x97, 0xb8, 0x6a, 0xb6, + 0xce, 0xc3, 0x67, 0x88, 0x98, 0xd0, 0x5a, 0x44, 0x81, 0x89, 0x80, 0x46, + 0x5f, 0x44, 0x78, 0x65, 0x7b, 0xd4, 0xce, 0x9f, 0x7c, 0xa5, 0xb7, 0xcf, + 0x90, 0xc3, 0x74, 0x37, 0x56, 0x91, 0x37, 0x75, 0x59, 0xca, 0xa3, 0x43, + 0xca, 0x6c, 0x38, 0xa5, 0xce, 0x49, 0x77, 0x7a, 0x9c, 0xd0, 0xc9, 0x60, + 0x6b, 0x48, 0x9e, 0x4a, 0x5b, 0x8e, 0x89, 0x93, 0x49, 0x6c, 0x72, 0x3d, + 0x62, 0x82, 0x4a, 0x7d, 0x65, 0x68, 0xb3, 0x35, 0x52, 0x36, 0x5b, 0x76, + 0xa1, 0xcc, 0xa6, 0x65, 0xa9, 0x4b, 0x94, 0x7f, 0xcb, 0x91, 0xbb, 0x4e, + 0xbf, 0x55, 0xcf, 0x9d, 0x78, 0xaa, 0x91, 0x8b, 0x37, 0x3c, 0x51, 0x86, + 0x50, 0x4c, 0x60, 0x71, 0x65, 0x93, 0x72, 0x81, 0xb5, 0x76, 0x9f, 0x78, + 0x41, 0x30, 0x6b, 0x5d, 0x31, 0x80, 0xb0, 0x52, 0x41, 0xad, 0x3f, 0x66, + 0x71, 0x53, 0x84, 0x4f, 0x7a, 0x56, 0x7c, 0x4c, 0xaf, 0xd0, 0x42, 0x6a, + 0xbd, 0x8d, 0x4d, 0x7f, 0xb1, 0x58, 0xbb, 0xbb, 0x6e, 0x83, 0xc4, 0x8d, + 0xb8, 0x8a, 0xd1, 0xb7, 0x64, 0x9b, 0xb0, 0x84, 0x6b, 0x7e, 0x3f, 0x47, + 0x9e, 0x52, 0x4f, 0xa7, 0x45, 0xb2, 0x98, 0x4e, 0x57, 0x44, 0x32, 0x79, + 0x84, 0xb5, 0x52, 0x54, 0x6d, 0x7c, 0x90, 0xb7, 0xa5, 0x7c, 0xc8, 0xbc, + 0x78, 0x78, 0xb7, 0xa5, 0x42, 0x3d, 0x34, 0xcc, 0x98, 0x59, 0xcb, 0x4e, + 0x70, 0x4a, 0x4c, 0xce, 0x37, 0x4c, 0x57, 0x3c, 0xbf, 0xc8, 0xd5, 0x5d, + 0x77, 0x50, 0x7a, 0x75, 0xd2, 0x41, 0xa2, 0xc6, 0xcb, 0x7f, 0x87, 0x6f, + 0x51, 0x87, 0x37, 0x3b, 0x2a, 0x7c, 0x9b, 0xce, 0x54, 0x6a, 0xa7, 0x5e, + 0x7a, 0xc4, 0x5f, 0x3a, 0x46, 0x57, 0x78, 0x35, 0xba, 0xb4, 0x64, 0x4d, + 0x45, 0xcf, 0xd0, 0x78, 0xbc, 0x67, 0xc4, 0xcb, 0x6e, 0x47, 0x87, 0x65, + 0x4d, 0xa1, 0xb7, 0x4c, 0x4b, 0x55, 0x54, 0x80, 0xc6, 0x88, 0x2c, 0x39, + 0xa6, 0xbe, 0x70, 0x5d, 0x38, 0xbf, 0x35, 0x59, 0x60, 0x98, 0x7e, 0x68, + 0x99, 0x74, 0xc2, 0x72, 0xd1, 0x89, 0x4c, 0xbe, 0x5e, 0xab, 0x9c, 0x9b, + 0x5c, 0x5c, 0x43, 0x49, 0xa7, 0xb2, 0x82, 0xc1, 0xc6, 0x52, 0xa4, 0x41, + 0x98, 0x63, 0xa3, 0x77, 0xbe, 0x6a, 0x7c, 0xc4, 0xc8, 0xc9, 0x42, 0x99, + 0x42, 0x53, 0x48, 0x3f, 0x8f, 0xc3, 0x53, 0x3c, 0x44, 0xcb, 0x95, 0x88, + 0x51, 0xbb, 0x93, 0x76, 0xcf, 0x57, 0xb2, 0x38, 0xd5, 0xcd, 0x86, 0x33, + 0x4c, 0x9c, 0x69, 0xbc, 0xd6, 0x8b, 0x93, 0x52, 0x33, 0xc4, 0x4d, 0xae, + 0xcc, 0xb8, 0x37, 0x5c, 0x9e, 0xa2, 0x90, 0xd3, 0x4b, 0x7c, 0x83, 0x54, + 0xb7, 0x88, 0xaa, 0x99, 0x75, 0x35, 0x40, 0x93, 0x91, 0x57, 0x60, 0xae, + 0xb5, 0x69, 0x40, 0xa7, 0xce, 0x76, 0x61, 0x5a, 0xa5, 0x46, 0x4c, 0x84, + 0x8f, 0x85, 0xb4, 0xae, 0x50, 0x99, 0x9e, 0x9a, 0x79, 0x67, 0x61, 0x61, + 0x63, 0x83, 0x4d, 0xb5, 0xd5, 0x65, 0xb2, 0x6e, 0x7f, 0x64, 0xd9, 0x49, + 0x83, 0x70, 0x89, 0xb6, 0xca, 0x48, 0x3c, 0xbf, 0x90, 0x84, 0x7f, 0x79, + 0xb3, 0x59, 0x5a, 0x4b, 0xcb, 0x89, 0xd3, 0x88, 0x8b, 0x9c, 0x48, 0x41, + 0xc8, 0xa3, 0xcc, 0x64, 0x5a, 0x8e, 0x63, 0x77, 0x8d, 0x78, 0x9b, 0x31, + 0x65, 0xbc, 0x95, 0x80, 0x54, 0xbe, 0x97, 0xa2, 0x92, 0xbc, 0xc3, 0x3e, + 0xc1, 0xaf, 0x8c, 0xc6, 0x9e, 0x40, 0xa9, 0x36, 0x66, 0x7b, 0xa9, 0xad, + 0x9b, 0x6d, 0xca, 0xc9, 0xa8, 0x53, 0xa2, 0xad, 0x73, 0x34, 0x8e, 0xbb, + 0x8e, 0x7a, 0x41, 0x38, 0x92, 0xd0, 0xcd, 0x51, 0x64, 0x3b, 0x9b, 0xb8, + 
0xae, 0x58, 0x36, 0x3d, 0xa1, 0x91, 0xa3, 0xd0, 0x70, 0xc2, 0x78, 0xb5, + 0x89, 0x51, 0x6e, 0x66, 0x88, 0x96, 0xc5, 0x39, 0x41, 0x5d, 0x49, 0x78, + 0x48, 0x61, 0xad, 0x89, 0x77, 0x94, 0x63, 0xa1, 0x74, 0x56, 0x53, 0xc2, + 0xa7, 0x5f, 0x73, 0xce, 0xca, 0x3a, 0x8b, 0x66, 0xcc, 0x65, 0x98, 0x7d, + 0x72, 0x87, 0x72, 0x54, 0x44, 0x6e, 0x5f, 0xdb, 0x6f, 0x3e, 0x3e, 0x5a, + 0x5a, 0xaa, 0xa0, 0x4d, 0x55, 0xbf, 0x5d, 0x9c, 0xa3, 0x7c, 0xa6, 0x8e, + 0xbb, 0x4e, 0x4d, 0xb8, 0xb0, 0xb9, 0xce, 0x5d, 0xb9, 0xac, 0x4a, 0xa6, + 0xa9, 0x33, 0x8a, 0xc9, 0x29, 0x90, 0x52, 0x7b, 0x72, 0x75, 0x58, 0x98, + 0x4c, 0x3f, 0x82, 0xbb, 0xbb, 0xd4, 0xd5, 0x69, 0x4e, 0xc8, 0x66, 0xc9, + 0x6b, 0x64, 0x82, 0xa0, 0x7b, 0x90, 0x78, 0x70, 0x72, 0x86, 0xb3, 0xa2, + 0x90, 0xbb, 0xd2, 0x66, 0x42, 0x9e, 0x6e, 0x99, 0x98, 0x2d, 0x9f, 0x4d, + 0x98, 0x6d, 0x9d, 0xb8, 0x9b, 0x36, 0x5b, 0xcc, 0xcf, 0x89, 0x49, 0x43, + 0x8a, 0x43, 0x46, 0x30, 0xba, 0xa7, 0x6b, 0xc9, 0x54, 0xa0, 0x7f, 0xad, + 0xca, 0xb6, 0x9b, 0x8a, 0x77, 0x3b, 0x9f, 0x3e, 0x78, 0x70, 0x64, 0x3a, + 0x44, 0xcd, 0x4e, 0x60, 0x91, 0xd8, 0xa9, 0x6c, 0xba, 0xc6, 0xa5, 0x40, + 0x88, 0x3b, 0x9d, 0x9a, 0x80, 0x3c, 0xa0, 0x58, 0xc2, 0x4c, 0xcb, 0x47, + 0xb7, 0xa5, 0xc6, 0xd2, 0xb7, 0x71, 0x5b, 0xb8, 0xd4, 0x75, 0x3a, 0x6b, + 0xbc, 0xb2, 0x73, 0x3d, 0x97, 0x79, 0x5a, 0x9f, 0x9f, 0x2f, 0x36, 0xb8, + 0xc1, 0xbc, 0x8b, 0x8e, 0x9c, 0x4e, 0x76, 0x38, 0x93, 0xbd, 0xa2, 0xc8, + 0xaa, 0x34, 0x40, 0x97, 0xa0, 0xd5, 0x54, 0x32, 0x50, 0xd0, 0x74, 0xc7, + 0x73, 0x93, 0x47, 0x88, 0xae, 0xa4, 0x51, 0xae, 0xaa, 0xb4, 0x5d, 0x7c, + 0x78, 0x5e, 0xca, 0xc4, 0xd5, 0xbc, 0x63, 0xd2, 0x6e, 0x81, 0x54, 0x79, + 0x71, 0x45, 0x61, 0x78, 0x9b, 0x58, 0x6c, 0xb3, 0xd2, 0x49, 0x64, 0xd2, + 0xcd, 0x78, 0x76, 0xc0, 0x84, 0x7b, 0x34, 0xb1, 0x3c, 0x46, 0x69, 0x85, + 0xb2, 0xc4, 0xa3, 0x87, 0xc4, 0x51, 0x72, 0x66, 0x8d, 0x6d, 0x6c, 0x49, + 0x58, 0xc4, 0x5d, 0x31, 0x96, 0x6d, 0x78, 0x4e, 0x76, 0xd6, 0xb1, 0x89, + 0xcf, 0x9d, 0x95, 0x69, 0x80, 0x5d, 0x69, 0x36, 0xa2, 0x48, 0x3a, 0x2b, + 0x78, 0xbf, 0xc1, 0xc2, 0x9f, 0x96, 0x88, 0x65, 0xad, 0xab, 0x40, 0x49, + 0x37, 0x71, 0x53, 0x77, 0x71, 0x56, 0x39, 0x8b, 0x3c, 0x9a, 0x37, 0xa4, + 0xb9, 0xad, 0xa6, 0xc0, 0x59, 0x8b, 0x6d, 0x4a, 0x38, 0xc9, 0x57, 0x9c, + 0x8c, 0xb3, 0x9a, 0xc7, 0xa4, 0xbf, 0x43, 0x39, 0x65, 0x90, 0x82, 0x71, + 0x68, 0x9c, 0x99, 0xbb, 0xdc, 0x49, 0x57, 0xbd, 0x73, 0xc5, 0xa1, 0xb1, + 0x8f, 0xc1, 0x79, 0x3f, 0x66, 0xd5, 0x74, 0xaa, 0x88, 0x5c, 0x77, 0x9d, + 0xb4, 0x4b, 0x6d, 0x86, 0xa4, 0x66, 0x33, 0xb1, 0x7b, 0xbb, 0xb7, 0x77, + 0xbd, 0x76, 0x5d, 0x36, 0x61, 0x86, 0x98, 0x3f, 0x61, 0xad, 0x4e, 0x6b, + 0x57, 0x33, 0x62, 0x35, 0x62, 0x92, 0x8f, 0x78, 0xb0, 0x9f, 0x55, 0xc3, + 0x71, 0xc3, 0x98, 0x47, 0xa8, 0x9d, 0x4b, 0xc5, 0xc1, 0x43, 0xbc, 0xce, + 0x72, 0xb6, 0x42, 0xa2, 0x91, 0x5a, 0x51, 0x2d, 0x8b, 0x50, 0xb7, 0xb3, + 0x90, 0x7c, 0x86, 0x46, 0x85, 0x5e, 0x50, 0xb8, 0xa6, 0x73, 0x9e, 0x9d, + 0xca, 0x5a, 0x71, 0x75, 0x5c, 0x74, 0x6d, 0x7d, 0x83, 0x49, 0xc0, 0x33, + 0xb7, 0xba, 0xce, 0x6d, 0x62, 0x78, 0x46, 0x4c, 0x80, 0xcb, 0x5b, 0xa2, + 0xb7, 0x66, 0x66, 0x7a, 0x98, 0x63, 0x3f, 0xcb, 0x37, 0x4d, 0xc6, 0x5c, + 0xb1, 0x8c, 0xc9, 0x32, 0x48, 0xb2, 0x8d, 0x7b, 0xc8, 0xac, 0x76, 0x6b, + 0x82, 0xae, 0xbe, 0x61, 0x4e, 0x61, 0x60, 0x53, 0x8c, 0x9d, 0xab, 0x36, + 0x5b, 0xbb, 0x9d, 0x8b, 0x70, 0x6c, 0x57, 0x40, 0x3e, 0x82, 0xda, 0x6b, + 0x5e, 0x45, 0xbe, 0xd0, 0x7b, 0xc6, 0x6b, 0xbc, 0xa4, 0x58, 0x77, 0x8d, + 0x3d, 0x8d, 0x33, 0x5e, 0x66, 0x56, 0x56, 0xae, 0x6d, 0x8a, 0xab, 0xb1, + 0x7b, 0x74, 0x45, 0xae, 0x8e, 0xba, 0x32, 0x4f, 0x6c, 0x46, 0xb4, 0x65, + 
0x5b, 0x60, 0xc9, 0x93, 0x75, 0x4b, 0x5b, 0x78, 0x60, 0x80, 0xcb, 0xc4, + 0x65, 0x54, 0x52, 0x5c, 0x4e, 0x7f, 0xa1, 0x64, 0x86, 0xd4, 0x6d, 0x7c, + 0x47, 0xb7, 0x4b, 0xd8, 0x60, 0xbe, 0xa1, 0x33, 0x99, 0xbe, 0x89, 0x2a, + 0x50, 0xa2, 0xc1, 0xd5, 0x58, 0x3a, 0xc4, 0x94, 0xb5, 0x7d, 0xc7, 0x76, + 0x78, 0x70, 0x90, 0x8f, 0x56, 0xc5, 0xd9, 0x83, 0x77, 0xa1, 0x85, 0x7f, + 0x3c, 0xac, 0x36, 0x74, 0x33, 0x41, 0x77, 0xa1, 0x5b, 0x4d, 0x71, 0x9a, + 0x39, 0x6f, 0x8d, 0xc5, 0x77, 0x84, 0xcc, 0xae, 0x5b, 0xa4, 0x98, 0x4d, + 0x8e, 0x9a, 0x7b, 0x79, 0x65, 0x91, 0x9e, 0x51, 0x2e, 0x74, 0xd3, 0x51, + 0x60, 0xb6, 0xa6, 0xc5, 0xa3, 0x8e, 0x47, 0xd2, 0x79, 0x58, 0x73, 0x83, + 0x53, 0x6a, 0x49, 0x3a, 0x73, 0x57, 0x78, 0x87, 0x6f, 0x3f, 0xa6, 0x83, + 0x39, 0x4a, 0x63, 0x53, 0xde, 0x5c, 0x58, 0x75, 0xdb, 0x4f, 0xb0, 0x28, + 0xcb, 0x8c, 0xa7, 0x80, 0x66, 0xb1, 0xcc, 0xab, 0x70, 0xcb, 0x6a, 0x73, + 0x81, 0xa7, 0x42, 0x47, 0x7c, 0x6c, 0x40, 0x3c, 0x5a, 0xa7, 0x51, 0xbb, + 0x9b, 0x3b, 0x41, 0x56, 0x93, 0x89, 0xcb, 0x53, 0x57, 0x87, 0x8a, 0xa5, + 0x57, 0x65, 0xcc, 0xc4, 0x66, 0xcb, 0xc9, 0x30, 0x8d, 0x8b, 0x74, 0xc5, + 0x48, 0x7a, 0x3e, 0xa7, 0x67, 0x30, 0x5e, 0x60, 0xcf, 0x81, 0x75, 0xb9, + 0xca, 0x72, 0xbf, 0x3e, 0x8e, 0x73, 0x4a, 0x77, 0xa4, 0x8c, 0x34, 0x69, + 0x90, 0x37, 0xc3, 0x79, 0x65, 0x7b, 0xa4, 0x9c, 0x3d, 0xae, 0x44, 0x8d, + 0x67, 0x8e, 0xd1, 0xc2, 0x6c, 0xc0, 0x78, 0xca, 0xce, 0xa1, 0xb7, 0x8c, + 0x8a, 0x5e, 0x81, 0xca, 0x35, 0xcc, 0x7b, 0xca, 0x75, 0xcf, 0xb1, 0x33, + 0x89, 0x35, 0x83, 0x4d, 0xaa, 0x95, 0x4a, 0x5d, 0xc2, 0x8a, 0x93, 0x7b, + 0x95, 0x67, 0x6b, 0xba, 0xcd, 0x35, 0x7d, 0xac, 0x91, 0xa9, 0x64, 0xc6, + 0xad, 0x55, 0x33, 0xd3, 0xa4, 0x73, 0x72, 0x58, 0x50, 0x96, 0xcf, 0x81, + 0x4b, 0x54, 0xb4, 0x7a, 0x48, 0x37, 0xae, 0xc6, 0x85, 0xa1, 0x9f, 0x83, + 0xc6, 0xa3, 0xab, 0xb1, 0x90, 0x40, 0xc1, 0x3e, 0x6c, 0x47, 0xbd, 0xc7, + 0x84, 0x38, 0xaf, 0x9f, 0x3c, 0x58, 0xb0, 0xac, 0x8f, 0x4f, 0x5c, 0x90, + 0x9a, 0x45, 0x58, 0xb6, 0x65, 0xc4, 0x94, 0xa9, 0x54, 0x36, 0x9f, 0xae, + 0x7d, 0xc0, 0x52, 0xb7, 0x5d, 0xc9, 0xbb, 0x52, 0xba, 0x34, 0xb3, 0x88, + 0xbc, 0x4a, 0xa2, 0xd5, 0xcc, 0xd6, 0x57, 0x96, 0x33, 0xbd, 0x34, 0x79, + 0x74, 0x35, 0xc5, 0xb0, 0x39, 0x97, 0xac, 0xa6, 0xae, 0xcc, 0xb7, 0x95, + 0xaa, 0x48, 0x61, 0x7b, 0x6f, 0x92, 0x5e, 0x46, 0x35, 0x94, 0xb6, 0x7d, + 0xcf, 0xbb, 0x41, 0x4d, 0x53, 0x3b, 0x8a, 0x55, 0xb4, 0x9f, 0xba, 0xa6, + 0x4b, 0x45, 0x37, 0xad, 0x7c, 0x5f, 0x96, 0x51, 0x92, 0x4a, 0x68, 0xa3, + 0xcf, 0xb6, 0x4d, 0x6c, 0x97, 0xcf, 0xba, 0x77, 0x70, 0x81, 0xa3, 0x6e, + 0x35, 0xcf, 0x6d, 0x8c, 0x91, 0x81, 0x7c, 0x4b, 0x54, 0x3c, 0x86, 0x9a, + 0x55, 0xd1, 0xb6, 0xab, 0xc9, 0x80, 0x61, 0x73, 0x3d, 0x9c, 0x48, 0x55, + 0x37, 0xa8, 0xbe, 0x6e, 0x6f, 0xab, 0xb9, 0x47, 0xb0, 0x3d, 0x81, 0xc2, + 0x9c, 0xaa, 0x61, 0x65, 0x64, 0xaa, 0x39, 0x69, 0x53, 0x6c, 0xbd, 0x54, + 0xcd, 0x39, 0x72, 0x59, 0x60, 0x4f, 0x3a, 0x3f, 0x70, 0x38, 0x4e, 0x58, + 0x88, 0x9d, 0x6f, 0x78, 0xc4, 0x3a, 0x3c, 0xa5, 0x86, 0x3d, 0x7a, 0x7a, + 0xc8, 0x56, 0x3b, 0x5f, 0xa5, 0x3f, 0xc6, 0xbc, 0x7f, 0x32, 0xc7, 0xb7, + 0x38, 0x78, 0xc6, 0x55, 0x52, 0x88, 0xba, 0x90, 0x6a, 0x70, 0x58, 0xac, + 0x34, 0x84, 0x5f, 0x8f, 0x4d, 0xb9, 0x5e, 0x93, 0xab, 0x6f, 0x8f, 0xb3, + 0x35, 0xba, 0x67, 0xd6, 0x6a, 0x35, 0xb1, 0x4c, 0x59, 0x4f, 0xa1, 0x73, + 0x97, 0x83, 0x92, 0x68, 0xbf, 0x3f, 0x98, 0x64, 0x87, 0x3e, 0x52, 0xc2, + 0x9a, 0x82, 0xcf, 0x5d, 0x69, 0x84, 0x71, 0x5d, 0xaa, 0x7f, 0x9d, 0xa8, + 0x7f, 0xd0, 0x82, 0x55, 0x91, 0xac, 0x50, 0x4d, 0xa5, 0x43, 0xc0, 0x9c, + 0x69, 0xa6, 0xc8, 0x52, 0x91, 0xcc, 0x87, 0xa8, 0xa4, 0xc4, 0x9f, 0x88, + 
0x50, 0x6a, 0x52, 0xbc, 0xa1, 0x8a, 0xb2, 0x64, 0xb7, 0x6a, 0x60, 0x7d, + 0x95, 0x9f, 0xa4, 0xbe, 0x6b, 0x96, 0x84, 0xa6, 0xb2, 0x86, 0x6e, 0x31, + 0x57, 0x59, 0x5a, 0x6d, 0x65, 0xbe, 0x4f, 0x40, 0xb5, 0x71, 0x3a, 0x56, + 0xcb, 0xbf, 0x3f, 0xcc, 0x51, 0x47, 0x6d, 0x5b, 0x40, 0xbd, 0x44, 0x99, + 0xbc, 0x9a, 0x92, 0x86, 0x43, 0x57, 0x59, 0x6a, 0x8c, 0x49, 0x37, 0x35, + 0x92, 0xcd, 0x71, 0x58, 0x53, 0xb2, 0x9c, 0xaa, 0xbe, 0x70, 0x88, 0x3b, + 0x7a, 0x69, 0xcb, 0x74, 0xc6, 0xac, 0xbf, 0xbc, 0x43, 0xcb, 0xc3, 0x76, + 0x92, 0x72, 0x4e, 0x4e, 0xca, 0xd2, 0x7c, 0x4b, 0x69, 0x51, 0xbd, 0x45, + 0x6e, 0x69, 0x75, 0xca, 0xba, 0x38, 0x40, 0xa4, 0xc3, 0x62, 0x3d, 0x61, + 0x8c, 0x2b, 0x3c, 0xcc, 0x61, 0x55, 0x79, 0xa4, 0x6b, 0xae, 0x64, 0x6a, + 0xad, 0xba, 0xc1, 0x5e, 0x42, 0x45, 0x5b, 0x8c, 0x6f, 0xb4, 0x6b, 0x6c, + 0x9f, 0x61, 0x3b, 0x83, 0x33, 0x6a, 0x94, 0x35, 0x40, 0x5a, 0xc2, 0x95, + 0x71, 0x45, 0x3e, 0x92, 0xb4, 0xd3, 0x68, 0x35, 0x8d, 0x8a, 0x82, 0x76, + 0x77, 0x8a, 0x4f, 0xcd, 0x61, 0x50, 0x39, 0x96, 0xce, 0x5f, 0xa7, 0x7a, + 0x42, 0x77, 0x86, 0x70, 0x9d, 0x8d, 0x3b, 0xb6, 0x35, 0x84, 0x31, 0xb7, + 0x56, 0x42, 0x4b, 0x40, 0xd6, 0xb4, 0x85, 0x9f, 0x99, 0x74, 0x74, 0xae, + 0x3b, 0x86, 0x57, 0x6b, 0xa2, 0x47, 0xa0, 0x82, 0xb1, 0xb9, 0xa7, 0xa9, + 0x78, 0x82, 0x60, 0x40, 0xb5, 0x4d, 0x54, 0x7b, 0x6e, 0x4f, 0xbe, 0xc0, + 0x48, 0x53, 0x99, 0x52, 0x7e, 0x85, 0xa2, 0x5e, 0x51, 0x4f, 0x45, 0x4a, + 0x82, 0x77, 0xad, 0xa6, 0x94, 0xc6, 0x4f, 0x3b, 0x7a, 0x78, 0x8f, 0x70, + 0x90, 0x4b, 0x4d, 0x62, 0x6b, 0xa2, 0xa9, 0x97, 0xbd, 0xb5, 0x7e, 0xa8, + 0x6d, 0x5c, 0x54, 0xd5, 0x86, 0x3a, 0x9b, 0x3e, 0x89, 0x39, 0x35, 0x77, + 0x57, 0x7c, 0x96, 0x5e, 0x49, 0x5b, 0x7b, 0x36, 0x60, 0x81, 0x4d, 0x66, + 0xc4, 0xb0, 0xae, 0x31, 0x3f, 0xb4, 0xc7, 0xb0, 0x61, 0x95, 0x96, 0xbb, + 0x42, 0x82, 0x57, 0x60, 0x3c, 0xab, 0xc4, 0x46, 0x65, 0x94, 0xb6, 0xcc, + 0xa2, 0x5f, 0xb8, 0xba, 0xb2, 0x3b, 0x96, 0x9c, 0x49, 0xb8, 0x5e, 0x4f, + 0x3e, 0x58, 0x6d, 0x62, 0x4b, 0x84, 0x88, 0x50, 0x88, 0xbc, 0x40, 0x32, + 0x5c, 0x6a, 0x63, 0x58, 0x92, 0x39, 0xba, 0xb3, 0x6e, 0x33, 0xc0, 0x54, + 0x34, 0x38, 0x98, 0x48, 0x40, 0x98, 0xaa, 0xa3, 0x9b, 0xca, 0x47, 0x97, + 0x8c, 0xc9, 0x57, 0xa3, 0x3a, 0x73, 0x80, 0xd3, 0x4f, 0xc2, 0xca, 0xbc, + 0x43, 0x62, 0x98, 0xc7, 0xb6, 0x70, 0x92, 0xb4, 0x3e, 0x40, 0xc6, 0x47, + 0x5f, 0x96, 0x45, 0xad, 0x7f, 0x93, 0x3b, 0xbc, 0x3e, 0xac, 0x57, 0x41, + 0x5f, 0x6c, 0x93, 0xbc, 0xca, 0xb9, 0x53, 0x83, 0xb7, 0x37, 0xd5, 0xcf, + 0x97, 0xbc, 0x91, 0x5c, 0x9c, 0xc9, 0xad, 0x3d, 0x40, 0xcf, 0x5b, 0x2d, + 0xc4, 0xcc, 0x39, 0xc8, 0x8f, 0x85, 0x75, 0xbd, 0xc4, 0xd0, 0x9f, 0x68, + 0xb7, 0x4f, 0xa8, 0x5b, 0x6b, 0x51, 0x5f, 0x42, 0xba, 0x56, 0x40, 0x4a, + 0xc9, 0x41, 0x76, 0x6e, 0x48, 0x95, 0x34, 0x3e, 0x96, 0xa4, 0xa7, 0x5f, + 0xb7, 0x5a, 0x6c, 0xbc, 0x91, 0x5a, 0x62, 0x83, 0x57, 0x28, 0x65, 0x46, + 0x7a, 0x6a, 0x2b, 0xbc, 0xb2, 0xb9, 0x3b, 0x6e, 0x4a, 0x86, 0x2d, 0x35, + 0x5f, 0x97, 0x49, 0xa6, 0xc4, 0x4c, 0x57, 0x9c, 0x6e, 0x9f, 0xc8, 0xc6, + 0xad, 0xb6, 0x81, 0x4c, 0x96, 0xb1, 0x8f, 0x9b, 0x9d, 0x46, 0xc4, 0xa0, + 0x8c, 0x63, 0x5c, 0x3e, 0x5e, 0x51, 0x8d, 0x58, 0xb1, 0x68, 0x67, 0x9c, + 0x97, 0xae, 0x2d, 0x75, 0xdd, 0xd6, 0x49, 0xbe, 0xab, 0xac, 0x6d, 0x4d, + 0xa2, 0xc8, 0x72, 0x46, 0x92, 0x5b, 0x44, 0x88, 0x69, 0x6e, 0xc0, 0x4d, + 0x67, 0xc0, 0x40, 0xba, 0x71, 0xad, 0x2b, 0x7b, 0x69, 0x8c, 0x45, 0x40, + 0xa1, 0xa7, 0x63, 0xbe, 0x99, 0xcf, 0x7a, 0xcc, 0xb0, 0x6c, 0x5a, 0x88, + 0xd1, 0xab, 0xa6, 0x91, 0xac, 0x51, 0x69, 0xaf, 0x70, 0x93, 0x55, 0x9b, + 0xa1, 0x5e, 0xc6, 0x6e, 0x74, 0x8f, 0x86, 0xc3, 0xd2, 0x91, 0x91, 0xad, + 
0x35, 0x81, 0x83, 0x67, 0x7d, 0xd0, 0x99, 0xb5, 0x75, 0xb7, 0x3a, 0xc6, + 0x4f, 0xb6, 0x7e, 0x59, 0xbd, 0x55, 0xa9, 0x57, 0xb5, 0xc1, 0xa0, 0xbb, + 0x6d, 0x5a, 0x6b, 0x56, 0x8c, 0x60, 0x9e, 0xa2, 0x45, 0x5c, 0x47, 0x8b, + 0x58, 0xc1, 0xa1, 0x75, 0x9c, 0x89, 0x6c, 0x87, 0x5f, 0xb9, 0xb1, 0xd7, + 0x45, 0xa3, 0xc7, 0x9d, 0xbf, 0xbc, 0xa4, 0x9a, 0xcf, 0xc8, 0x4f, 0x84, + 0xbc, 0x91, 0x36, 0xac, 0x49, 0x65, 0x5f, 0x57, 0x6e, 0xa9, 0x48, 0x87, + 0xb4, 0xbe, 0x71, 0x7a, 0x4e, 0x40, 0x74, 0x48, 0x9f, 0x50, 0xa4, 0xc4, + 0x52, 0xc0, 0xb2, 0x68, 0x9b, 0xc4, 0x53, 0x49, 0xad, 0xbc, 0xac, 0xbc, + 0x65, 0xa2, 0xac, 0x62, 0xaa, 0x85, 0x5c, 0x95, 0xb9, 0xbb, 0x50, 0xce, + 0xc3, 0x9f, 0x85, 0x83, 0x9a, 0xa0, 0x92, 0xc7, 0xd6, 0x6c, 0x8f, 0xaf, + 0xc4, 0xa6, 0xa1, 0x8e, 0x8b, 0x5d, 0xbc, 0xa1, 0x58, 0x75, 0x96, 0x46, + 0xb3, 0xb1, 0x7e, 0x87, 0xc5, 0x9e, 0x43, 0x7e, 0x34, 0x5e, 0x60, 0x60, + 0xb4, 0x83, 0x58, 0xdb, 0xb0, 0x3c, 0xbc, 0xb0, 0x80, 0xb3, 0xb6, 0xb2, + 0xc2, 0x83, 0x70, 0xa0, 0x78, 0x76, 0xd2, 0xc9, 0x59, 0xd7, 0xb6, 0xa5, + 0xcc, 0xc4, 0x40, 0x44, 0x3c, 0x66, 0xae, 0x47, 0x31, 0x77, 0x40, 0x3f, + 0x66, 0x68, 0x49, 0xc3, 0x93, 0xce, 0x46, 0x4a, 0x80, 0xb5, 0xba, 0x93, + 0xbc, 0xd1, 0xbc, 0x3e, 0x67, 0xca, 0x7d, 0x5d, 0x87, 0x88, 0xc4, 0x45, + 0x91, 0xad, 0x64, 0x85, 0x6a, 0xdc, 0x65, 0x5b, 0x93, 0x67, 0xb8, 0x3a, + 0xb9, 0x65, 0x41, 0x55, 0x6e, 0x85, 0x8e, 0x76, 0x62, 0xaf, 0x45, 0x8b, + 0x5b, 0x7a, 0x5b, 0xb7, 0x72, 0xd8, 0xa5, 0x6a, 0xd1, 0x8b, 0x79, 0x32, + 0x94, 0x71, 0x7f, 0x5c, 0x2e, 0xa2, 0x87, 0x65, 0x90, 0x5e, 0x99, 0x98, + 0xa2, 0xa1, 0x87, 0x56, 0x57, 0x98, 0xb0, 0x5d, 0x7b, 0xc4, 0x54, 0x49, + 0xb1, 0x57, 0xb7, 0xaa, 0xab, 0x77, 0xa9, 0xd3, 0xbb, 0xab, 0x3a, 0x46, + 0xae, 0xdc, 0x62, 0xb8, 0xa0, 0x46, 0x7b, 0xd7, 0x58, 0x7a, 0xc6, 0xaa, + 0x73, 0x86, 0x7d, 0x39, 0x35, 0xd0, 0x4d, 0xaf, 0x84, 0x46, 0x86, 0x68, + 0x50, 0xa0, 0x7f, 0x75, 0x64, 0xc6, 0xd0, 0x4f, 0x39, 0x99, 0xc7, 0xb6, + 0x7a, 0x7b, 0x75, 0xbb, 0x85, 0x76, 0x71, 0x44, 0x6f, 0xab, 0xc0, 0x63, + 0xb0, 0x4d, 0xc6, 0x48, 0xc2, 0x7e, 0xb9, 0x82, 0xc2, 0x4f, 0x53, 0x89, + 0xac, 0xbf, 0xc9, 0x4a, 0x69, 0xa9, 0x4e, 0x88, 0xc5, 0x8d, 0x9c, 0x55, + 0x2f, 0x7f, 0xc7, 0x41, 0xa1, 0x2a, 0x5a, 0x4f, 0xb1, 0x94, 0xc8, 0xbf, + 0x28, 0x57, 0xb1, 0x45, 0x5a, 0xd3, 0x44, 0xce, 0x1e, 0x8b, 0x61, 0x72, + 0x97, 0xad, 0x8d, 0x6d, 0xa6, 0x85, 0x43, 0x69, 0xcc, 0x7e, 0x8d, 0x6c, + 0x56, 0x8d, 0x9e, 0xb0, 0xac, 0x9b, 0xcb, 0x98, 0xca, 0xc2, 0x7d, 0x63, + 0x6e, 0x6a, 0x93, 0xb7, 0xc0, 0x60, 0x98, 0x64, 0x42, 0x8f, 0x68, 0xc2, + 0x80, 0xac, 0xbf, 0x2d, 0x7f, 0xd2, 0xb4, 0x79, 0x52, 0x9c, 0x46, 0x8e, + 0x40, 0xcd, 0xd4, 0x3a, 0x78, 0x82, 0xba, 0xb4, 0x8e, 0x76, 0x7f, 0x72, + 0x8a, 0xd5, 0x53, 0xc1, 0xd6, 0x3b, 0x37, 0x51, 0x87, 0x7a, 0xc4, 0x9c, + 0xe3, 0xa3, 0xd9, 0x3d, 0x41, 0x9a, 0xa3, 0x91, 0x77, 0x9c, 0x93, 0x63, + 0xd4, 0xc3, 0x7e, 0x78, 0x3a, 0x84, 0x99, 0xd4, 0x5d, 0x6d, 0x6a, 0x39, + 0xaa, 0x75, 0x8c, 0x79, 0x7f, 0xb4, 0x6d, 0x51, 0xa1, 0x6a, 0x54, 0x45, + 0x6f, 0xbf, 0x5a, 0xb2, 0xac, 0xb1, 0xb4, 0x7d, 0x8b, 0x66, 0xcd, 0xad, + 0xc8, 0x9e, 0x4c, 0x8f, 0x3c, 0xb8, 0x98, 0xa2, 0x6e, 0x56, 0x4b, 0x9b, + 0x7a, 0x74, 0xb2, 0xb7, 0x71, 0xb9, 0x38, 0xb5, 0x5e, 0xae, 0x97, 0x77, + 0xc3, 0x5f, 0xc9, 0x8b, 0x47, 0xc1, 0x94, 0xd6, 0xcc, 0x68, 0x2e, 0xbf, + 0xa3, 0x2e, 0x53, 0xbf, 0x59, 0x38, 0xbd, 0xaa, 0x85, 0xa8, 0x3e, 0x88, + 0x6f, 0x95, 0x88, 0xa8, 0x9b, 0x87, 0x9d, 0xd2, 0x54, 0x76, 0x62, 0x3b, + 0x6d, 0x99, 0x6d, 0x5a, 0x60, 0xdb, 0xa2, 0xca, 0x3e, 0x3b, 0xab, 0xa3, + 0xbb, 0xcd, 0xcd, 0xaa, 0x93, 0x99, 0x87, 0x88, 0xcd, 0xae, 0xb1, 0x46, + 
0x68, 0x86, 0xc3, 0xc9, 0x2e, 0x83, 0x42, 0x54, 0x86, 0x3e, 0x2a, 0xad, + 0x77, 0x86, 0x4f, 0x85, 0x9d, 0xd0, 0x6d, 0x2a, 0x33, 0x65, 0x86, 0x64, + 0x8f, 0xae, 0x4c, 0xc4, 0xa5, 0x8c, 0x87, 0x4e, 0x94, 0x5d, 0x9e, 0xa1, + 0x87, 0xd3, 0x54, 0x9e, 0xb2, 0x98, 0xb9, 0x8b, 0x61, 0x6f, 0x8f, 0xc5, + 0xaa, 0xa1, 0xae, 0x5b, 0x60, 0x56, 0xa6, 0x5a, 0x3c, 0x59, 0x2e, 0x49, + 0x9b, 0x97, 0xaf, 0xb7, 0x48, 0x94, 0x54, 0x57, 0x63, 0x68, 0x35, 0xba, + 0x5b, 0xa3, 0xaa, 0x80, 0x8d, 0xa3, 0x8f, 0x39, 0x4f, 0x67, 0xd7, 0xcd, + 0xad, 0x6f, 0x8c, 0x4c, 0x37, 0x55, 0xc5, 0xcc, 0x8a, 0x9a, 0x44, 0xc4, + 0x8c, 0x55, 0x39, 0x7d, 0x77, 0x35, 0x9e, 0x43, 0x4c, 0x6d, 0xaf, 0x53, + 0xa1, 0xbf, 0x99, 0xb6, 0x99, 0x74, 0x90, 0x64, 0x5d, 0x8c, 0xb1, 0x9c, + 0x5b, 0x94, 0x56, 0x57, 0xbf, 0x7d, 0xad, 0x6b, 0x5c, 0xba, 0x45, 0x47, + 0x56, 0xab, 0x66, 0x85, 0xb1, 0x51, 0xd2, 0x70, 0x61, 0x80, 0x49, 0x4c, + 0x9a, 0xc3, 0xd3, 0x3c, 0xb9, 0x67, 0x9d, 0x48, 0x35, 0xcb, 0xc0, 0xb5, + 0x7d, 0x88, 0x6c, 0xbd, 0x54, 0x61, 0x70, 0x8f, 0x89, 0xb0, 0x5a, 0x87, + 0x37, 0x7c, 0xb3, 0x8f, 0x7a, 0xbc, 0x41, 0x55, 0xd4, 0x74, 0x6c, 0x8e, + 0x8d, 0x42, 0xa0, 0x50, 0xa1, 0xdc, 0x53, 0x97, 0x62, 0x69, 0xbe, 0x58, + 0x8b, 0xc2, 0xc8, 0x5d, 0xd5, 0xac, 0x74, 0xb5, 0x97, 0x9c, 0x69, 0x68, + 0x92, 0x72, 0x5d, 0x59, 0x65, 0x53, 0x57, 0xae, 0x96, 0xab, 0x61, 0x5c, + 0x80, 0xcd, 0x3a, 0x43, 0xaf, 0x50, 0x47, 0xb8, 0x4e, 0x3d, 0x4b, 0xad, + 0xa3, 0x67, 0x53, 0x81, 0x4f, 0x38, 0x44, 0xa2, 0x5b, 0x68, 0xad, 0x4d, + 0x72, 0x47, 0x38, 0x7d, 0x4f, 0x50, 0xd0, 0x3a, 0x91, 0x7f, 0xbc, 0x75, + 0x98, 0x32, 0x73, 0xaf, 0xcc, 0x6a, 0x79, 0x46, 0x63, 0x6c, 0x75, 0x3b, + 0x57, 0x8b, 0x8c, 0x97, 0x3b, 0x37, 0x4b, 0xb4, 0x50, 0x67, 0xc7, 0x3a, + 0x81, 0x8f, 0x91, 0xd0, 0xc3, 0x71, 0x6c, 0x85, 0x97, 0x61, 0x79, 0xaf, + 0x40, 0x8e, 0x9d, 0x94, 0xae, 0x9c, 0x65, 0xb1, 0x51, 0x8b, 0x98, 0xb6, + 0x57, 0x93, 0xb9, 0x66, 0xbc, 0x74, 0x8e, 0x3e, 0x7d, 0xa7, 0xd2, 0x3f, + 0x43, 0xcd, 0x39, 0x9b, 0x6c, 0x5a, 0x3d, 0x43, 0x82, 0xa0, 0xaf, 0x9a, + 0x60, 0x6c, 0x9f, 0xa9, 0x57, 0x48, 0x93, 0x52, 0x3b, 0x76, 0x73, 0x52, + 0x87, 0x58, 0x55, 0x51, 0x41, 0xcb, 0xc6, 0x83, 0x99, 0x53, 0xa0, 0x47, + 0x89, 0x33, 0x3f, 0xd4, 0x7c, 0xce, 0x5c, 0xb7, 0x8b, 0x90, 0x3d, 0x60, + 0xc8, 0xbf, 0x7c, 0xc6, 0x84, 0xa1, 0x8e, 0x81, 0x57, 0x77, 0xb8, 0xa8, + 0x5b, 0x2f, 0x92, 0x8f, 0x86, 0x37, 0x69, 0xd2, 0x8b, 0x3c, 0x39, 0x52, + 0x84, 0x32, 0x40, 0x93, 0x7b, 0x4c, 0x5a, 0x51, 0xbf, 0x7f, 0x50, 0x8a, + 0x42, 0x7f, 0x39, 0x60, 0xc1, 0x69, 0xc9, 0x55, 0xac, 0x71, 0x98, 0xc3, + 0xab, 0x7a, 0x33, 0xb3, 0x41, 0x33, 0xc0, 0x37, 0xb0, 0x9a, 0xd1, 0x5c, + 0xb3, 0xbc, 0xd7, 0xc4, 0x86, 0x7d, 0x47, 0xce, 0x57, 0xac, 0xc0, 0x39, + 0x95, 0x5f, 0x96, 0x9c, 0xc8, 0x9f, 0xa1, 0x8a, 0xc3, 0xa5, 0x8f, 0xc3, + 0x43, 0x5d, 0x61, 0xb2, 0x75, 0xb5, 0xa7, 0x56, 0x35, 0x60, 0x88, 0x37, + 0xae, 0x51, 0xa6, 0xcb, 0xce, 0x70, 0x6d, 0xc8, 0x7b, 0xad, 0x95, 0xa2, + 0x52, 0x54, 0x90, 0x54, 0xc7, 0x9d, 0xb6, 0x3c, 0x60, 0x9e, 0x44, 0x76, + 0x9d, 0x8a, 0x54, 0x93, 0xbb, 0xc3, 0x37, 0x86, 0xb5, 0x83, 0x7a, 0xce, + 0x30, 0x8d, 0x83, 0xc7, 0xd0, 0x3b, 0x45, 0x76, 0xa9, 0xad, 0x50, 0xad, + 0xa0, 0x50, 0xcd, 0xba, 0x73, 0x94, 0xd0, 0x83, 0x7b, 0x54, 0x40, 0x44, + 0x91, 0x81, 0x73, 0x52, 0x43, 0x85, 0x9d, 0x3e, 0x33, 0xcf, 0x3d, 0x68, + 0xb4, 0x59, 0x55, 0x7f, 0x5b, 0x83, 0x6a, 0x4e, 0x74, 0xc2, 0x3f, 0x52, + 0x4c, 0xb0, 0x68, 0xbf, 0x94, 0x88, 0x7b, 0xb4, 0x51, 0x30, 0x9b, 0x76, + 0x7a, 0xa7, 0xa3, 0x8a, 0xbc, 0x3d, 0xb2, 0x9e, 0xb7, 0x77, 0x9f, 0xb0, + 0xac, 0xa2, 0x64, 0xd2, 0x7f, 0x83, 0x79, 0x41, 0x5e, 0x30, 0xb5, 0xcd, + 
0xd1, 0x90, 0x9f, 0xa5, 0x58, 0xa3, 0xd4, 0xb7, 0x3c, 0xa6, 0xab, 0xca, + 0x93, 0x9b, 0xc6, 0xb0, 0x62, 0xab, 0x96, 0x98, 0x6a, 0x90, 0x45, 0x41, + 0x86, 0x54, 0x6d, 0x8f, 0x7b, 0x77, 0xc9, 0x3a, 0x72, 0x9f, 0x84, 0x5a, + 0x7c, 0x38, 0xc1, 0xbb, 0x38, 0xab, 0x40, 0xc4, 0x65, 0x88, 0x89, 0xcf, + 0x9a, 0x51, 0x56, 0xa4, 0x5e, 0xd4, 0x79, 0x93, 0x85, 0x92, 0x47, 0xa6, + 0xa4, 0x53, 0xba, 0xb6, 0xc5, 0x55, 0x9d, 0x6e, 0x32, 0x89, 0x85, 0x71, + 0x64, 0xc2, 0x38, 0x78, 0xb2, 0xca, 0xc5, 0x89, 0x42, 0x65, 0x60, 0x44, + 0x3c, 0xb9, 0x70, 0x32, 0x93, 0x7e, 0x99, 0xbf, 0x83, 0xd4, 0xc8, 0x71, + 0x4b, 0x4c, 0xce, 0x8f, 0x5c, 0x98, 0xa4, 0xb2, 0xbb, 0x6b, 0x37, 0x33, + 0xb3, 0x8e, 0xb6, 0x91, 0x3e, 0x34, 0xb3, 0xd0, 0x92, 0xae, 0xd2, 0x4d, + 0xa7, 0xb4, 0xa1, 0xa2, 0x5e, 0xce, 0x76, 0x6d, 0x95, 0x37, 0x74, 0xc5, + 0xc6, 0x38, 0xb2, 0x75, 0x47, 0x32, 0x3a, 0x69, 0xb7, 0xcd, 0x5a, 0xbf, + 0xa7, 0x92, 0x4a, 0x63, 0x79, 0xc1, 0x82, 0x68, 0x65, 0xbe, 0x4f, 0x83, + 0x45, 0xc6, 0x4b, 0x98, 0x3b, 0x88, 0x51, 0x39, 0x81, 0x81, 0x9c, 0x4c, + 0x7f, 0xce, 0x73, 0x5b, 0x4c, 0x6e, 0x77, 0x7d, 0xca, 0xc4, 0x4d, 0x98, + 0x3e, 0x76, 0x7e, 0x72, 0x50, 0xaa, 0x83, 0x87, 0x89, 0x66, 0x66, 0x63, + 0x94, 0x76, 0x8b, 0xce, 0xa3, 0x43, 0xce, 0x4b, 0x8b, 0x47, 0x5b, 0x80, + 0x53, 0x70, 0x6b, 0x79, 0x67, 0x46, 0xbe, 0xc7, 0x7a, 0x53, 0xac, 0x83, + 0xa1, 0x3c, 0xa4, 0x5c, 0xbe, 0xad, 0x84, 0x77, 0x8b, 0x7c, 0x96, 0xc6, + 0x67, 0x3b, 0x61, 0x40, 0xb4, 0x92, 0x8f, 0x3d, 0x9b, 0x62, 0x75, 0x83, + 0xa0, 0xd0, 0x42, 0xb0, 0x81, 0x9c, 0x58, 0x6a, 0x4f, 0xce, 0x79, 0x78, + 0x65, 0x42, 0xbf, 0xa8, 0x58, 0x6b, 0x67, 0xb2, 0xc0, 0x84, 0x86, 0xbe, + 0x88, 0x73, 0x5e, 0xc2, 0x86, 0x76, 0x41, 0x32, 0x53, 0xc7, 0x50, 0x80, + 0x5a, 0x5b, 0x49, 0x4c, 0x73, 0x4e, 0x4b, 0xc3, 0x4e, 0xaf, 0x96, 0x57, + 0xb4, 0xb7, 0x7c, 0xd1, 0x57, 0xa7, 0x4c, 0xbd, 0xc2, 0x34, 0xc2, 0x99, + 0x9f, 0x50, 0x5e, 0x7c, 0x55, 0x38, 0x74, 0xbc, 0x99, 0x7c, 0x83, 0xc8, + 0x65, 0xb8, 0xb4, 0xc0, 0xbf, 0x8a, 0x62, 0xa7, 0x39, 0xb9, 0x45, 0x94, + 0x7b, 0xbc, 0x6d, 0x7f, 0x41, 0x2c, 0x57, 0xaa, 0xad, 0x30, 0x78, 0x9c, + 0xa7, 0xce, 0xda, 0xa0, 0x4c, 0x63, 0xae, 0x99, 0x7f, 0x38, 0x39, 0x54, + 0xb6, 0xc0, 0x35, 0xcd, 0x48, 0xca, 0xaf, 0x8f, 0x71, 0x35, 0xbc, 0xa6, + 0x63, 0x8d, 0x52, 0x90, 0x40, 0x50, 0x86, 0x6d, 0x3f, 0x4d, 0xc1, 0xcc, + 0x58, 0xa2, 0x45, 0xa1, 0xc9, 0xcb, 0x80, 0x4d, 0x4f, 0xd6, 0x59, 0x9c, + 0x82, 0x59, 0x4f, 0x97, 0x55, 0xbc, 0x30, 0x55, 0x83, 0xce, 0x48, 0x56, + 0x36, 0xd2, 0xca, 0xaa, 0x53, 0xa8, 0xc7, 0x4d, 0x7d, 0x53, 0x2f, 0x44, + 0x67, 0x62, 0x9a, 0x7c, 0x3c, 0x7c, 0xab, 0x61, 0xc2, 0xb4, 0xb0, 0xd1, + 0x8d, 0xab, 0x7e, 0x7c, 0x97, 0xb3, 0xae, 0x51, 0x6c, 0x7a, 0x49, 0x90, + 0xbb, 0x86, 0xba, 0x91, 0xb0, 0x5d, 0x5b, 0x70, 0x61, 0x8b, 0x6a, 0x5d, + 0x99, 0x7b, 0x79, 0x6d, 0x92, 0x98, 0x5d, 0x6d, 0x6b, 0xc6, 0xb2, 0xbe, + 0xc3, 0x54, 0x72, 0x8c, 0x84, 0x71, 0x94, 0x4a, 0x69, 0xbb, 0x89, 0xcd, + 0x5f, 0x9f, 0x3e, 0xb6, 0xbd, 0x65, 0x63, 0x96, 0x35, 0x43, 0x84, 0x3d, + 0x64, 0xa5, 0x35, 0x70, 0x5c, 0xab, 0xc0, 0x52, 0x26, 0xb2, 0x8b, 0x7e, + 0x6c, 0x22, 0x8f, 0x9c, 0x95, 0x28, 0x51, 0x75, 0x4f, 0x5a, 0x7d, 0x9d, + 0xaf, 0x9b, 0x5e, 0x9a, 0x32, 0x39, 0x56, 0xce, 0x96, 0xb1, 0x91, 0xaf, + 0x60, 0xba, 0x46, 0xce, 0xae, 0x3d, 0x74, 0xb5, 0xa5, 0x6a, 0xb3, 0x5e, + 0x47, 0xbf, 0x60, 0x33, 0x3e, 0xa5, 0x9c, 0xcf, 0x84, 0x50, 0x2f, 0x77, + 0xb9, 0x42, 0x96, 0x58, 0x89, 0x39, 0x69, 0x9d, 0x6f, 0xbb, 0x70, 0x7c, + 0xb5, 0xc9, 0xc2, 0xa8, 0xc7, 0x7d, 0x77, 0x41, 0x3e, 0x44, 0x4f, 0x4c, + 0x4f, 0xc7, 0x66, 0x87, 0xa3, 0x7f, 0x98, 0x87, 0x89, 0x4a, 0x6e, 0x6f, + 
0x96, 0x52, 0xa4, 0xc4, 0xa8, 0x59, 0x41, 0xa9, 0x48, 0x92, 0xd6, 0x61, + 0xc8, 0x91, 0xc1, 0x48, 0x6b, 0xb8, 0x5f, 0x7f, 0x62, 0xdc, 0x86, 0x53, + 0xbd, 0x6d, 0xa6, 0x68, 0x6b, 0x48, 0x68, 0xd3, 0xa5, 0x81, 0x4c, 0xc3, + 0x58, 0xb9, 0x49, 0x77, 0x38, 0x90, 0x5b, 0x50, 0x66, 0x32, 0x93, 0x6b, + 0xd2, 0x47, 0xc8, 0xbf, 0x65, 0x92, 0x5b, 0x96, 0x70, 0xb6, 0xa7, 0xd1, + 0xa2, 0x7e, 0x5b, 0x6a, 0xbc, 0xb4, 0xc1, 0x63, 0xc0, 0xad, 0x47, 0x7b, + 0xd3, 0x58, 0xb2, 0x6e, 0x98, 0x93, 0x7e, 0x6f, 0x80, 0xd0, 0xb8, 0x4c, + 0x44, 0x99, 0x8e, 0xb1, 0x87, 0x55, 0x60, 0x84, 0x89, 0x72, 0x5f, 0x78, + 0xd3, 0xbb, 0xa9, 0x6c, 0x4c, 0x36, 0x53, 0x9e, 0x62, 0xbd, 0xba, 0x7f, + 0x6d, 0x26, 0x6e, 0xa8, 0x53, 0x78, 0x9c, 0xb0, 0x6f, 0xc4, 0x5a, 0xc3, + 0x41, 0xa3, 0x50, 0x8f, 0x62, 0xae, 0x9c, 0x4c, 0x36, 0xc1, 0x8e, 0x2d, + 0x36, 0xa8, 0x55, 0x5d, 0x7a, 0x34, 0x50, 0x71, 0x33, 0x9e, 0xcb, 0x99, + 0xcc, 0x5c, 0x67, 0xdb, 0x82, 0xc9, 0xae, 0x7b, 0x93, 0x5e, 0xa6, 0xc5, + 0x4f, 0x32, 0x98, 0x7c, 0xb3, 0xaf, 0x91, 0x7b, 0x4a, 0xb6, 0xc7, 0x4f, + 0x40, 0x8e, 0x74, 0x43, 0x7a, 0xb1, 0x4a, 0x59, 0xb0, 0x74, 0x5d, 0x91, + 0x7d, 0x96, 0x3b, 0x48, 0x5a, 0x51, 0x9b, 0x43, 0x58, 0xbc, 0x41, 0xb1, + 0xcf, 0x84, 0xa3, 0xcb, 0x93, 0x4c, 0xc5, 0xb7, 0xaa, 0xb1, 0xa3, 0x40, + 0x52, 0x32, 0x65, 0x51, 0xa4, 0xa2, 0x82, 0x44, 0x41, 0x6c, 0xaf, 0x97, + 0xac, 0x9c, 0x35, 0x4e, 0xc0, 0xb8, 0xb4, 0x44, 0x62, 0xa7, 0xc5, 0x93, + 0x49, 0x4d, 0xcb, 0x71, 0xb3, 0x67, 0xa8, 0xdb, 0xd3, 0x74, 0xbe, 0x72, + 0x4f, 0x9a, 0x39, 0x92, 0x55, 0xc0, 0x5e, 0x77, 0x52, 0xba, 0xcd, 0x8f, + 0x3f, 0x54, 0x6c, 0x9e, 0x94, 0xdc, 0x3b, 0x65, 0xa8, 0x9e, 0x61, 0x88, + 0x76, 0x3b, 0xbc, 0xa2, 0x9d, 0xcb, 0xdf, 0x7c, 0x9b, 0x45, 0x97, 0x99, + 0x82, 0x4b, 0x54, 0x9e, 0x62, 0xb5, 0x44, 0x93, 0x71, 0xb1, 0x56, 0x91, + 0x69, 0xa6, 0x62, 0xd4, 0x3f, 0xb5, 0x86, 0x3e, 0x2e, 0x75, 0xad, 0x44, + 0x7f, 0x16, 0x3e, 0x7a, 0x76, 0xba, 0x6a, 0xba, 0xc9, 0x4f, 0xcd, 0x61, + 0x89, 0xd0, 0xa6, 0x56, 0x16, 0xa0, 0x36, 0x9d, 0x5c, 0xc8, 0x47, 0xd7, + 0x87, 0x97, 0xa7, 0x6d, 0xbd, 0xb9, 0x74, 0x73, 0x8c, 0xb7, 0xa7, 0x63, + 0x32, 0x94, 0xa0, 0x5f, 0x39, 0x9a, 0x4c, 0x54, 0x91, 0x9a, 0x4e, 0xaf, + 0x60, 0x61, 0x8f, 0x55, 0x7b, 0xa9, 0x5b, 0x88, 0x84, 0x79, 0xaf, 0x6d, + 0xc7, 0x5b, 0x5e, 0x56, 0xc4, 0x75, 0xa5, 0xd0, 0x67, 0x7f, 0x7f, 0x98, + 0x46, 0xac, 0xc5, 0xc2, 0xc9, 0x90, 0xa9, 0x93, 0xbd, 0x62, 0xce, 0x69, + 0x88, 0x92, 0x86, 0xda, 0x42, 0x57, 0x84, 0x5d, 0x29, 0x69, 0x5f, 0x95, + 0x77, 0xa1, 0xa4, 0x5a, 0xd1, 0xcf, 0x53, 0x6b, 0xcc, 0x6a, 0xc6, 0x41, + 0x7a, 0x64, 0x5b, 0x6a, 0xa0, 0x4d, 0x55, 0xa9, 0xac, 0xca, 0x66, 0xa9, + 0xa1, 0x92, 0x5d, 0xaa, 0x83, 0x53, 0x74, 0xcb, 0xb4, 0x4f, 0xb1, 0x48, + 0x8d, 0x84, 0x48, 0x60, 0x34, 0x56, 0x56, 0x91, 0x32, 0xc9, 0xd4, 0xa6, + 0x62, 0xad, 0xb0, 0xaf, 0xb4, 0x53, 0xd9, 0x9b, 0x5b, 0x5d, 0x43, 0xc7, + 0xa4, 0xa4, 0x7d, 0xc6, 0x63, 0xd1, 0xba, 0x74, 0x74, 0x8e, 0x5d, 0x7a, + 0x62, 0xda, 0xa6, 0xc6, 0x68, 0xaa, 0x63, 0xbc, 0xb9, 0x80, 0x85, 0x5a, + 0xa0, 0x62, 0x4e, 0x3f, 0xc6, 0xa6, 0xa0, 0xca, 0x9d, 0x55, 0x51, 0x64, + 0x5f, 0x9a, 0x8c, 0xae, 0x46, 0xb4, 0x72, 0x9e, 0xab, 0x34, 0xa1, 0xc1, + 0x92, 0x87, 0xb0, 0x85, 0x9c, 0x61, 0x4f, 0x92, 0x97, 0x95, 0x8b, 0x54, + 0xc7, 0x71, 0x84, 0xcc, 0xa2, 0x80, 0x96, 0xb8, 0xcb, 0x3b, 0x68, 0x65, + 0x81, 0x55, 0x97, 0x6d, 0xcb, 0x96, 0x35, 0x80, 0x8f, 0x61, 0x9d, 0x42, + 0xc6, 0xa1, 0x57, 0xab, 0x7a, 0xb2, 0x73, 0x6f, 0x3d, 0x7d, 0x91, 0x42, + 0x8d, 0x9c, 0x5d, 0x65, 0xa8, 0x38, 0x92, 0x9a, 0x3f, 0xc8, 0x9c, 0x94, + 0x7e, 0x8b, 0x3c, 0xb5, 0x91, 0xb1, 0xa6, 0x7d, 0x31, 0x94, 0x47, 0xbc, + 
0xb1, 0x7d, 0x33, 0x4d, 0x5d, 0x66, 0x65, 0x60, 0xbd, 0x4a, 0x4c, 0x58, + 0xcd, 0x9f, 0xaf, 0x9a, 0x93, 0xcd, 0xca, 0x3b, 0x67, 0xa0, 0xaa, 0x4b, + 0x9f, 0x63, 0x6c, 0x6e, 0xb3, 0x51, 0x74, 0x43, 0xab, 0xb2, 0x3a, 0xbb, + 0x8a, 0x6b, 0x91, 0x36, 0x65, 0x47, 0xa4, 0x9c, 0x41, 0xb8, 0x5b, 0x61, + 0x47, 0xca, 0x9b, 0xc9, 0x32, 0x7f, 0x53, 0xa5, 0x95, 0x93, 0xc5, 0x38, + 0x7e, 0x6d, 0x34, 0x4b, 0x80, 0x4e, 0x5a, 0x97, 0xc1, 0xb8, 0x81, 0xac, + 0x82, 0x4d, 0xc3, 0x55, 0x64, 0xa9, 0x36, 0x8e, 0x5c, 0x55, 0x6d, 0x42, + 0x92, 0x6e, 0xc5, 0x35, 0x49, 0x82, 0x46, 0x4e, 0x9a, 0xbb, 0x6b, 0x4d, + 0x5a, 0x5d, 0xa4, 0xb4, 0x9d, 0xb2, 0x3c, 0x77, 0x74, 0x40, 0x5b, 0x6e, + 0xa5, 0x8a, 0x55, 0x8b, 0x40, 0x80, 0x94, 0x40, 0x5b, 0x7f, 0x88, 0x36, + 0x9e, 0xa1, 0xcc, 0x57, 0xcb, 0xb5, 0x45, 0x4a, 0x49, 0x45, 0x6b, 0xa1, + 0x45, 0xc0, 0xb9, 0x9c, 0x91, 0x79, 0x54, 0x87, 0x43, 0xb3, 0x62, 0xaf, + 0x39, 0x88, 0x4b, 0x8f, 0x8f, 0xd0, 0xaa, 0x57, 0x81, 0x8c, 0xb8, 0x77, + 0xbb, 0x7b, 0xc5, 0x9b, 0xb1, 0x39, 0x96, 0xc0, 0x9b, 0x61, 0xc6, 0x6e, + 0x7e, 0x60, 0x9d, 0x74, 0x51, 0x5e, 0x3c, 0x91, 0x36, 0x62, 0xc6, 0x8d, + 0x7d, 0x82, 0x56, 0xd3, 0x6c, 0xc2, 0x3c, 0x4a, 0xcd, 0xa2, 0x58, 0xbe, + 0x7c, 0x74, 0x46, 0xa8, 0xb6, 0x5d, 0xce, 0x6f, 0xbb, 0xc8, 0x85, 0xb2, + 0xb0, 0x62, 0x3f, 0x97, 0x5d, 0x54, 0x84, 0xa1, 0x5c, 0x98, 0x4e, 0x53, + 0xce, 0x83, 0xa3, 0x58, 0x97, 0x62, 0x7c, 0x6a, 0x49, 0x60, 0x94, 0x94, + 0x87, 0xab, 0xd0, 0x92, 0x49, 0xa7, 0x7c, 0xaa, 0x4c, 0x3d, 0xaa, 0xc7, + 0x36, 0x98, 0x88, 0x51, 0xad, 0x9e, 0x92, 0xb6, 0x62, 0x88, 0x4a, 0x7c, + 0xad, 0xbc, 0x7e, 0xa3, 0xb1, 0x4e, 0x7f, 0x88, 0x3b, 0x4b, 0x95, 0xc5, + 0xcd, 0x93, 0x4a, 0xbf, 0x34, 0x63, 0x6b, 0x86, 0xa3, 0x45, 0x44, 0x6b, + 0x83, 0x61, 0x7b, 0x30, 0x92, 0xba, 0x73, 0x4e, 0x7f, 0x7e, 0x9f, 0xb2, + 0x85, 0x78, 0x30, 0xba, 0x35, 0x98, 0x48, 0x36, 0x7b, 0xc2, 0x80, 0x54, + 0xa2, 0x34, 0x75, 0x3e, 0x6d, 0xaf, 0x3a, 0x9b, 0x76, 0x56, 0x47, 0x7b, + 0xc1, 0xca, 0x37, 0xa5, 0x76, 0xc8, 0x47, 0x31, 0xd1, 0xc6, 0x7c, 0x9b, + 0x48, 0x6d, 0x4d, 0x58, 0x88, 0x60, 0x74, 0x97, 0x90, 0x4d, 0x40, 0x4a, + 0x98, 0x3e, 0xa0, 0x44, 0x3b, 0xc4, 0x9d, 0x84, 0x60, 0x7b, 0x85, 0x34, + 0x91, 0x54, 0xb9, 0x31, 0xc3, 0x61, 0xca, 0x4c, 0xba, 0xa4, 0x5b, 0x69, + 0x4e, 0xc4, 0x3e, 0xb0, 0x88, 0x51, 0x85, 0x68, 0x55, 0xce, 0xb5, 0x8e, + 0x91, 0xc5, 0x8e, 0x7a, 0x99, 0x81, 0xc1, 0x7e, 0x99, 0x4b, 0x73, 0x78, + 0x8a, 0x8d, 0x9d, 0x96, 0x96, 0x54, 0x7d, 0x6f, 0x95, 0xc4, 0x65, 0x42, + 0x3e, 0x88, 0x9c, 0xd0, 0xd2, 0xd2, 0x31, 0xcd, 0xca, 0x34, 0x8e, 0x92, + 0xb1, 0x84, 0x30, 0x59, 0x41, 0x33, 0xbc, 0x48, 0x4f, 0x7b, 0x98, 0x44, + 0x3b, 0xaf, 0xc6, 0x77, 0x7a, 0xa8, 0xbc, 0x71, 0x30, 0xa7, 0x34, 0x4b, + 0x85, 0xb0, 0x7e, 0x96, 0x70, 0x58, 0x9f, 0xba, 0x96, 0x62, 0x4f, 0x4d, + 0x33, 0x35, 0x5e, 0xc4, 0x3c, 0xcf, 0x34, 0x35, 0x90, 0x86, 0x9b, 0xad, + 0x53, 0x8e, 0x9c, 0x59, 0x45, 0x7e, 0xa9, 0x38, 0xa3, 0x99, 0x42, 0x4b, + 0x39, 0x71, 0x94, 0x44, 0x86, 0xae, 0xb1, 0x57, 0x3b, 0x7c, 0x6d, 0x56, + 0xa0, 0xba, 0xd1, 0xca, 0xa5, 0x87, 0x63, 0x34, 0x56, 0x99, 0x64, 0x70, + 0xcb, 0x79, 0xaf, 0x6d, 0x5c, 0x92, 0x89, 0x75, 0x53, 0x33, 0xaa, 0xa4, + 0x6e, 0x4b, 0x6f, 0x91, 0x97, 0xa1, 0x34, 0xcb, 0x72, 0xca, 0x4a, 0xa1, + 0xa0, 0xce, 0x46, 0xbc, 0x84, 0xcd, 0xab, 0x72, 0x80, 0x50, 0xb3, 0x3f, + 0xcc, 0x58, 0xa1, 0x7c, 0x3c, 0x47, 0x31, 0x6f, 0x5b, 0x3f, 0x8e, 0x3f, + 0x66, 0xb1, 0x88, 0x4a, 0x9d, 0xa1, 0x80, 0xb5, 0x57, 0x99, 0x32, 0x3e, + 0x9a, 0xa1, 0x98, 0x53, 0x6e, 0x88, 0xc1, 0xb0, 0xa5, 0x4f, 0x48, 0x57, + 0x80, 0x42, 0x92, 0xd3, 0x6b, 0x46, 0x63, 0x4d, 0xbb, 0x47, 0xbb, 0xc0, + 
0x3c, 0x9c, 0x43, 0x35, 0xa2, 0xb9, 0x59, 0x47, 0xc4, 0x67, 0x32, 0x85, + 0x46, 0x96, 0x9d, 0x42, 0x39, 0x5f, 0x8e, 0x47, 0x5b, 0x3b, 0x78, 0x5e, + 0x92, 0x5f, 0x69, 0x42, 0x7b, 0x85, 0x91, 0xc4, 0x66, 0x69, 0x99, 0x31, + 0xb4, 0x8f, 0x5a, 0xac, 0x79, 0xc3, 0xbe, 0xb2, 0x9b, 0x99, 0xa2, 0x4e, + 0x5f, 0xcc, 0xb9, 0x90, 0xcf, 0x53, 0xb7, 0x4f, 0x6e, 0xc3, 0xbe, 0xa5, + 0x8a, 0xab, 0x79, 0xa7, 0xa2, 0x5b, 0x55, 0xbb, 0xb1, 0x52, 0x91, 0x41, + 0x4b, 0xc4, 0xb6, 0xbc, 0x99, 0x7d, 0x90, 0xae, 0x92, 0x65, 0x5a, 0xb7, + 0x98, 0xb2, 0x78, 0x51, 0x5b, 0x36, 0x78, 0x7a, 0x41, 0xa0, 0x82, 0x8c, + 0xbd, 0xc6, 0x77, 0xbe, 0x89, 0x8c, 0x61, 0x55, 0x5d, 0xb8, 0x64, 0x87, + 0xd1, 0x66, 0x60, 0xd1, 0xb3, 0x42, 0x92, 0xa2, 0xc6, 0x4a, 0x88, 0xac, + 0x40, 0x99, 0xd1, 0x8c, 0x3b, 0x6d, 0x31, 0xc3, 0x7a, 0x5d, 0x67, 0x39, + 0x46, 0x59, 0xa5, 0x9b, 0x6b, 0xb0, 0x41, 0x8c, 0xcd, 0x3f, 0xa4, 0xc2, + 0x7a, 0x9c, 0xaa, 0x9f, 0xa6, 0xca, 0x6c, 0xa4, 0x9d, 0xab, 0xc0, 0xd8, + 0x5d, 0xb8, 0x3d, 0x41, 0xa8, 0xb1, 0x40, 0x63, 0xb9, 0xa8, 0x69, 0x4a, + 0x53, 0x93, 0x83, 0x39, 0xc0, 0xd4, 0x4f, 0x48, 0x40, 0x5b, 0x42, 0x83, + 0x58, 0x45, 0x69, 0xc0, 0xc9, 0x4b, 0x6c, 0xb6, 0x3e, 0xa0, 0xb2, 0x55, + 0x3c, 0x4c, 0x7f, 0x95, 0xb6, 0x36, 0x8d, 0x61, 0xbc, 0xc4, 0x4a, 0x4d, + 0xb8, 0x8b, 0x68, 0x6d, 0xa9, 0xda, 0x57, 0xda, 0x64, 0xca, 0x87, 0x89, + 0xc0, 0x7a, 0x66, 0xa2, 0xa9, 0x32, 0x65, 0xc5, 0x5f, 0x8c, 0x60, 0xb5, + 0x83, 0x34, 0x91, 0xc0, 0xd2, 0xac, 0x3d, 0x62, 0x85, 0xb4, 0xaa, 0x8c, + 0x77, 0xc1, 0xb7, 0x57, 0x84, 0xb8, 0x93, 0x71, 0xba, 0xa8, 0x71, 0x45, + 0xbd, 0xaf, 0x5c, 0x8a, 0x77, 0x5c, 0x76, 0xb9, 0x59, 0x42, 0x40, 0x9c, + 0x8f, 0x32, 0x77, 0x5e, 0x80, 0x65, 0x52, 0xbf, 0xa3, 0x9c, 0xcc, 0x9c, + 0xce, 0xc4, 0x4b, 0x64, 0xa6, 0x48, 0x97, 0x9f, 0x87, 0x40, 0x9f, 0x62, + 0xad, 0x9a, 0x3c, 0x33, 0x4f, 0x78, 0x5f, 0x9e, 0xce, 0x53, 0x6d, 0x79, + 0xb7, 0x70, 0xba, 0xc9, 0x3a, 0x87, 0x91, 0xc8, 0x47, 0xcb, 0x57, 0x3b, + 0x64, 0x64, 0x81, 0x9e, 0x6b, 0x91, 0x8b, 0x7d, 0x4b, 0x7a, 0x39, 0x4e, + 0x8f, 0x35, 0x5b, 0xc4, 0x5f, 0x6d, 0xb7, 0x45, 0x5e, 0xb1, 0x67, 0xa5, + 0x97, 0x91, 0x7b, 0x3c, 0xd0, 0xb4, 0x7b, 0x97, 0x48, 0x85, 0x33, 0xcf, + 0xc9, 0x3b, 0xcc, 0x4e, 0x8c, 0x6e, 0x73, 0x73, 0xc3, 0xc6, 0xd4, 0x9f, + 0x49, 0x8f, 0x5d, 0x83, 0x78, 0xbb, 0x43, 0x3f, 0x49, 0xa3, 0xbe, 0x44, + 0x87, 0xbd, 0x5d, 0x3c, 0xa6, 0x4c, 0x32, 0xbd, 0x99, 0x4a, 0xc4, 0xb8, + 0x7d, 0x4f, 0xc3, 0x64, 0x3d, 0x86, 0xab, 0x7f, 0x8f, 0xb8, 0xc6, 0xa4, + 0x66, 0x8c, 0xa4, 0xc8, 0x43, 0xc4, 0xb6, 0x5f, 0x6a, 0xaa, 0x6f, 0x5f, + 0xc0, 0x88, 0x9c, 0xde, 0x66, 0x9c, 0x54, 0x94, 0x63, 0x91, 0x61, 0x55, + 0x45, 0xc2, 0xb1, 0x49, 0x34, 0x48, 0xa5, 0x5d, 0x4b, 0x34, 0xca, 0xc7, + 0x82, 0x71, 0xd3, 0x51, 0x8e, 0x4a, 0x39, 0xd1, 0x82, 0x53, 0x42, 0xb8, + 0xac, 0x60, 0x6b, 0x8a, 0x30, 0x4d, 0x7d, 0x4d, 0x84, 0x6f, 0xb3, 0x59, + 0xcd, 0xb7, 0xbe, 0xa8, 0x9c, 0x97, 0x8b, 0x9d, 0x89, 0xaf, 0xa3, 0x9b, + 0x68, 0xa6, 0x65, 0x9e, 0xbe, 0xad, 0xcf, 0xd6, 0xbf, 0x37, 0xa9, 0x68, + 0x70, 0xb7, 0x89, 0x93, 0xa3, 0xa2, 0x81, 0x61, 0xd1, 0x95, 0x4a, 0xbc, + 0x94, 0xa3, 0xc7, 0xc9, 0x88, 0x75, 0xa7, 0x8c, 0x8b, 0x33, 0x42, 0xba, + 0x5b, 0xc7, 0xb8, 0x34, 0xc9, 0x96, 0x83, 0x37, 0x8d, 0xc5, 0xd8, 0xa9, + 0x82, 0x30, 0xd1, 0xb2, 0xbf, 0x9b, 0x7b, 0x71, 0x90, 0x86, 0xb6, 0x7a, + 0x5a, 0x73, 0x38, 0x7b, 0x6b, 0x4a, 0x85, 0x31, 0x85, 0x6a, 0xb7, 0xa5, + 0x54, 0x75, 0x65, 0xbc, 0x76, 0xbd, 0x93, 0x92, 0xb0, 0x66, 0x2c, 0xa1, + 0xd8, 0x55, 0x5f, 0x3f, 0x36, 0xbb, 0x88, 0xb7, 0x57, 0x50, 0x56, 0xc6, + 0xa1, 0x4c, 0x2f, 0x5d, 0x9c, 0xaa, 0xb4, 0x5c, 0xba, 0x4c, 0x2d, 0x75, + 
0xa6, 0x97, 0x31, 0x40, 0x65, 0x67, 0x34, 0xa1, 0x6e, 0xb0, 0x4a, 0x3b, + 0x81, 0xc9, 0xd7, 0x74, 0x75, 0x32, 0xb6, 0xc7, 0x5a, 0x92, 0x96, 0x87, + 0x7b, 0x41, 0x7b, 0x78, 0x63, 0xa3, 0xdb, 0xa5, 0x76, 0x90, 0xbc, 0xd0, + 0xc4, 0xaf, 0x86, 0x64, 0x8c, 0x54, 0xad, 0x46, 0xca, 0x91, 0x6e, 0x70, + 0x66, 0xd4, 0x92, 0xc7, 0x57, 0xbc, 0x9e, 0x97, 0xa8, 0xa4, 0xd3, 0xc4, + 0x41, 0xb5, 0x80, 0x3d, 0x5d, 0x4c, 0xce, 0xd3, 0x80, 0xd2, 0xad, 0x4b, + 0xbc, 0x5e, 0x89, 0x98, 0xcc, 0x48, 0x65, 0x45, 0x56, 0x50, 0x84, 0xb0, + 0x40, 0xcc, 0x9a, 0x3b, 0x70, 0x45, 0x6b, 0x51, 0x38, 0xac, 0x4e, 0x63, + 0xb3, 0xb2, 0xc0, 0x98, 0xca, 0x6b, 0xc8, 0x6b, 0xaf, 0x44, 0x6d, 0x9d, + 0xb2, 0x5c, 0x9b, 0x45, 0x7b, 0x4a, 0x39, 0xb5, 0x5a, 0x35, 0x4c, 0x79, + 0x8b, 0x3c, 0xb2, 0xd2, 0x59, 0x52, 0x7f, 0x76, 0x41, 0xb4, 0x5e, 0x59, + 0x7e, 0xae, 0x9c, 0x89, 0x54, 0xb5, 0x35, 0xbe, 0x7e, 0x70, 0xd6, 0x46, + 0xaf, 0xce, 0x3f, 0xa9, 0x69, 0x82, 0x3f, 0x9e, 0x63, 0x5d, 0x7e, 0x91, + 0xa6, 0x51, 0x6d, 0xac, 0x41, 0x57, 0x47, 0x76, 0x91, 0x96, 0x9e, 0x37, + 0xc9, 0x77, 0xaf, 0x48, 0x79, 0x64, 0x81, 0x38, 0xbc, 0x7e, 0xb3, 0x2e, + 0x68, 0x47, 0x30, 0xa5, 0x3d, 0x95, 0x97, 0xb9, 0x52, 0x40, 0xb5, 0xaf, + 0xc2, 0xae, 0x2d, 0x37, 0x9e, 0x67, 0x51, 0x30, 0x75, 0x8b, 0xba, 0x59, + 0x88, 0xb6, 0x6e, 0x98, 0x92, 0x41, 0x74, 0xa6, 0xc2, 0x39, 0x42, 0xa8, + 0xa3, 0xbd, 0xa9, 0xa3, 0xca, 0x7a, 0x34, 0x43, 0x64, 0x5e, 0x99, 0x6d, + 0xaa, 0x5b, 0x5c, 0x6f, 0x43, 0x88, 0xb0, 0x67, 0x45, 0x3f, 0x99, 0x77, + 0xb8, 0x44, 0x3d, 0x83, 0x87, 0x66, 0x6a, 0x5c, 0x98, 0xc8, 0xbc, 0xa9, + 0x9e, 0x7f, 0xcb, 0xb4, 0x84, 0x62, 0x6f, 0xd2, 0x3e, 0x4c, 0x81, 0x3f, + 0x96, 0xc9, 0x79, 0x48, 0xbb, 0x51, 0x6f, 0x9d, 0x5a, 0xbf, 0x78, 0xb5, + 0x3a, 0x6b, 0x53, 0xb1, 0x5c, 0x58, 0xb0, 0x54, 0xd9, 0x67, 0xc3, 0xd9, + 0xb8, 0xa8, 0xae, 0x39, 0x99, 0x59, 0x8c, 0xb9, 0x7a, 0x3f, 0xd2, 0x5f, + 0x42, 0xa8, 0xbf, 0xaf, 0xc5, 0xc3, 0xa7, 0xae, 0x5d, 0x53, 0x42, 0xbe, + 0x4f, 0x53, 0x92, 0xa6, 0x78, 0xa5, 0xd9, 0x56, 0xb4, 0xb9, 0x57, 0x48, + 0x66, 0xa4, 0x2e, 0x69, 0x36, 0xac, 0xd1, 0x99, 0xa1, 0x4b, 0x5c, 0x8a, + 0x8a, 0x72, 0x64, 0xa8, 0x83, 0x85, 0x67, 0xcd, 0x70, 0x75, 0xb7, 0x8e, + 0xb4, 0x7b, 0x46, 0xca, 0x4d, 0xc9, 0xd0, 0x98, 0xa2, 0x5e, 0x4a, 0x2b, + 0x5f, 0x68, 0x97, 0x67, 0x32, 0x35, 0x87, 0x96, 0xbe, 0x3c, 0x53, 0x5f, + 0x5c, 0x32, 0xb2, 0x66, 0xa4, 0x2e, 0x81, 0x48, 0x8c, 0x8c, 0xa5, 0x6d, + 0x5f, 0x7c, 0x6c, 0x75, 0x97, 0x46, 0x59, 0xc9, 0x37, 0x71, 0xac, 0xa7, + 0x6d, 0x40, 0xb8, 0x6f, 0x3c, 0xac, 0xd0, 0xc1, 0xb2, 0xb7, 0x6a, 0x69, + 0xc6, 0x76, 0x8a, 0xc6, 0x47, 0x83, 0x5d, 0xb5, 0xdd, 0xcc, 0x65, 0xb5, + 0x9e, 0x51, 0x92, 0x49, 0xca, 0xca, 0x40, 0xba, 0x6b, 0x39, 0x99, 0x9c, + 0xa5, 0x7c, 0x7f, 0xb2, 0x64, 0x5f, 0x8c, 0x6e, 0x59, 0xac, 0xb9, 0x8e, + 0x3c, 0xe1, 0x69, 0x3f, 0xc6, 0x9b, 0x91, 0x66, 0x8e, 0x5d, 0x5d, 0x5e, + 0xa3, 0x54, 0x61, 0xc0, 0xb0, 0xc7, 0x44, 0x9d, 0x4b, 0xba, 0x9c, 0x49, + 0x5c, 0x86, 0x97, 0xcf, 0xca, 0xb7, 0x93, 0x9a, 0x92, 0x27, 0xc7, 0x75, + 0x6d, 0xc3, 0x87, 0x3f, 0xb3, 0x27, 0xcc, 0x60, 0xba, 0x4b, 0x46, 0xd2, + 0x48, 0x7f, 0x9f, 0xd0, 0x8a, 0x4e, 0x9f, 0x93, 0x4e, 0x3d, 0xd5, 0x7e, + 0xee, 0x67, 0x71, 0xa8, 0xa9, 0x52, 0xaa, 0x90, 0x9f, 0x63, 0x49, 0x2e, + 0x53, 0x8b, 0x60, 0x3e, 0x70, 0xa6, 0x82, 0x88, 0x41, 0xb8, 0xbd, 0x4b, + 0x79, 0xab, 0x3e, 0xb7, 0x3d, 0x96, 0x3d, 0x92, 0x6b, 0xb5, 0x73, 0x34, + 0x9d, 0x56, 0xbb, 0x92, 0x50, 0x92, 0x48, 0x73, 0x49, 0xb8, 0xb4, 0x87, + 0x6a, 0x2e, 0x76, 0x86, 0x5a, 0x6b, 0xb9, 0xd3, 0x45, 0x69, 0x97, 0xa4, + 0xae, 0xac, 0x9e, 0x33, 0x93, 0x4e, 0xcc, 0xad, 0x4b, 0xd0, 0x8f, 0xc6, + 
0x72, 0x46, 0xbe, 0x62, 0x90, 0xb2, 0xb2, 0xce, 0x5c, 0x3c, 0x7f, 0xa5, + 0x5a, 0xa2, 0x5c, 0xa5, 0x8f, 0x73, 0x4d, 0x94, 0x7b, 0xab, 0xc3, 0x53, + 0x48, 0xb0, 0xae, 0xbd, 0x41, 0x39, 0x8b, 0x7e, 0x95, 0x92, 0xcc, 0xb4, + 0x3a, 0x60, 0xad, 0xe1, 0x33, 0x38, 0x65, 0xac, 0x72, 0xb8, 0xb3, 0x42, + 0x5e, 0x63, 0x50, 0x77, 0xc4, 0xd5, 0x3f, 0xc0, 0x79, 0x93, 0xaa, 0x41, + 0xba, 0x3c, 0x47, 0xc2, 0x6d, 0x42, 0x83, 0x4d, 0xab, 0x78, 0xc0, 0xb1, + 0x45, 0xb2, 0xaf, 0xb0, 0x51, 0xd7, 0x58, 0x7a, 0xc2, 0x71, 0xae, 0xcc, + 0xc6, 0x99, 0x47, 0xca, 0x96, 0x4d, 0x62, 0xc3, 0x9f, 0x3c, 0x7a, 0x88, + 0xd6, 0x62, 0x35, 0xc8, 0xbf, 0x6c, 0x8b, 0x47, 0x89, 0x79, 0x85, 0x9c, + 0x55, 0xc5, 0x42, 0x75, 0x62, 0xc2, 0x4c, 0xa5, 0x47, 0x6b, 0x8a, 0x5f, + 0x3a, 0x91, 0xba, 0x3b, 0x4e, 0xb2, 0x72, 0x83, 0x93, 0xc9, 0xbd, 0x97, + 0xbc, 0xaa, 0x43, 0xa6, 0x28, 0x43, 0xa5, 0x90, 0xd9, 0x70, 0x6e, 0x3e, + 0x9e, 0x9a, 0xdd, 0x6c, 0x52, 0x4d, 0x63, 0x48, 0x99, 0x61, 0x88, 0x4d, + 0x7f, 0xbd, 0x9f, 0x62, 0x7b, 0x85, 0x98, 0x9e, 0x62, 0xcb, 0x62, 0xcc, + 0xe5, 0xb2, 0xbf, 0xda, 0xc8, 0xdd, 0x5f, 0x8a, 0x7c, 0x3d, 0x33, 0x3b, + 0xbd, 0xb4, 0xb3, 0xad, 0x59, 0xca, 0x99, 0x2b, 0x7e, 0x68, 0xc2, 0x8a, + 0x80, 0x5d, 0xba, 0x8c, 0xbc, 0xc0, 0x83, 0x49, 0x74, 0xac, 0x68, 0x85, + 0xb9, 0x92, 0x71, 0x4e, 0x9a, 0xb5, 0x73, 0x66, 0x55, 0x50, 0x79, 0xc1, + 0x56, 0xcf, 0x62, 0x96, 0x8d, 0x61, 0xaf, 0x82, 0x56, 0xc1, 0xd7, 0x4a, + 0x96, 0x8e, 0x64, 0x64, 0x49, 0x42, 0xa2, 0xa6, 0x9c, 0x46, 0x8d, 0xbf, + 0xa6, 0x4c, 0xca, 0x59, 0x4b, 0xd8, 0xc1, 0x97, 0x7a, 0xd1, 0xbf, 0xd3, + 0x8c, 0x31, 0xa4, 0x3f, 0xa0, 0x57, 0x7b, 0x79, 0xb2, 0x45, 0x32, 0xc7, + 0xb4, 0xc7, 0x7e, 0x6e, 0x69, 0xae, 0x74, 0xa2, 0x50, 0x8f, 0x74, 0xaf, + 0x70, 0x76, 0x86, 0x6a, 0xa0, 0x7b, 0xbe, 0x35, 0x4d, 0x68, 0x3b, 0x83, + 0x89, 0x8f, 0x66, 0x5b, 0x79, 0x3f, 0x46, 0x70, 0x87, 0x73, 0x56, 0xb0, + 0x71, 0x6f, 0x3f, 0x9a, 0xc1, 0x47, 0x82, 0xae, 0x89, 0x59, 0x3c, 0x61, + 0xc6, 0xca, 0x4e, 0x76, 0x9a, 0x94, 0x44, 0x5b, 0x5f, 0xb6, 0xc6, 0xd9, + 0x4f, 0xd9, 0x7a, 0xa5, 0xbf, 0x48, 0xd3, 0xc5, 0x36, 0xc1, 0x79, 0x84, + 0x35, 0xc2, 0x73, 0x80, 0x9b, 0x75, 0x76, 0xcc, 0xb6, 0xb5, 0x36, 0x6d, + 0x8a, 0x80, 0x6f, 0x92, 0x5a, 0x72, 0x8a, 0x82, 0x94, 0x50, 0x25, 0xc6, + 0xe4, 0x9f, 0x7c, 0x2f, 0x76, 0x64, 0x2c, 0x9c, 0xb2, 0x91, 0xd4, 0x77, + 0xb8, 0x68, 0x62, 0x7e, 0x7b, 0xc3, 0x9c, 0x5d, 0x24, 0x55, 0x9c, 0xb4, + 0x54, 0x60, 0x3c, 0xa2, 0x96, 0xdf, 0x76, 0x51, 0x93, 0x9e, 0x76, 0x50, + 0x43, 0xc5, 0x37, 0xc9, 0xbd, 0x72, 0x51, 0x32, 0x9b, 0x73, 0x5b, 0x48, + 0x6d, 0x69, 0xb7, 0xbf, 0x5d, 0x5c, 0xad, 0xbd, 0x45, 0x77, 0x97, 0x3d, + 0xc4, 0x77, 0x6f, 0xd8, 0xcc, 0x43, 0xc5, 0x76, 0xab, 0x58, 0xcb, 0x7f, + 0x4b, 0x91, 0x67, 0x85, 0x53, 0x32, 0x3b, 0x81, 0xd0, 0x6a, 0x85, 0x92, + 0x6e, 0x36, 0x44, 0x8e, 0xbd, 0x99, 0x94, 0xc1, 0x8d, 0xc5, 0xd5, 0xa1, + 0xc9, 0xc7, 0x48, 0xc1, 0x70, 0xd3, 0x8a, 0x8c, 0x51, 0xd4, 0xa4, 0x7c, + 0xae, 0x65, 0x6d, 0x52, 0xa0, 0x93, 0xc8, 0x6b, 0x2f, 0x6c, 0x54, 0x5a, + 0x59, 0xc2, 0xbc, 0xcb, 0xaf, 0xd7, 0xa2, 0x36, 0x71, 0x60, 0x65, 0x8b, + 0x58, 0xaf, 0x95, 0x9a, 0x7c, 0x9f, 0x7b, 0xb0, 0x39, 0x4e, 0xd3, 0x82, + 0x3a, 0xb0, 0xa0, 0xd6, 0xbc, 0xd2, 0x27, 0xc3, 0x5e, 0xb3, 0x38, 0xc3, + 0x45, 0x9c, 0xd4, 0x8d, 0x89, 0x6f, 0xb2, 0xcb, 0xb9, 0xb9, 0x3a, 0xd3, + 0x4e, 0x96, 0x91, 0x54, 0x61, 0x8e, 0x1d, 0x72, 0x5d, 0x9c, 0x27, 0x6b, + 0x74, 0x6f, 0x68, 0xb5, 0xa1, 0x3c, 0x9a, 0xe0, 0x4e, 0x30, 0x7b, 0xa4, + 0xcf, 0x9d, 0x37, 0x98, 0xa4, 0xc7, 0x45, 0xc0, 0xd3, 0x98, 0x6e, 0xb9, + 0x59, 0x50, 0x4d, 0x44, 0xae, 0x8c, 0xdf, 0x34, 0x50, 0xde, 0x63, 0x94, + 
0x98, 0x9a, 0x6a, 0x6d, 0x40, 0x5d, 0x48, 0xe1, 0x61, 0xc4, 0x32, 0x82, + 0x85, 0x45, 0x4b, 0x7e, 0x88, 0xef, 0xcf, 0x57, 0x84, 0x4d, 0xc7, 0x94, + 0xaa, 0x89, 0x4a, 0x40, 0xb5, 0xb6, 0x38, 0xad, 0x81, 0x9d, 0x87, 0xcc, + 0x79, 0x85, 0x84, 0x69, 0xbe, 0x31, 0x33, 0x77, 0x71, 0x36, 0x61, 0x6c, + 0x91, 0x5b, 0x7d, 0xc7, 0xb5, 0x53, 0x5b, 0xb3, 0x6c, 0x9d, 0xb2, 0x38, + 0xa4, 0x95, 0x68, 0x65, 0x90, 0x98, 0xb1, 0xac, 0x56, 0xa9, 0x36, 0x6a, + 0x61, 0x38, 0x8b, 0x3f, 0xd2, 0xbf, 0x56, 0xb8, 0xca, 0x95, 0x31, 0x87, + 0xaa, 0x47, 0x7d, 0x39, 0x51, 0x33, 0x88, 0xbc, 0xa6, 0x51, 0xc0, 0x49, + 0x70, 0xae, 0x9e, 0xc0, 0x52, 0xa8, 0xbe, 0x8c, 0xbd, 0x7e, 0x8a, 0x5b, + 0xc4, 0x40, 0x74, 0x6f, 0xca, 0x67, 0x85, 0xcb, 0xaf, 0x8f, 0x4f, 0xc9, + 0x76, 0x70, 0x49, 0xa5, 0x83, 0x41, 0x6b, 0x3e, 0x6d, 0x88, 0xaf, 0x6b, + 0x64, 0xa8, 0x91, 0x6c, 0x8c, 0x3f, 0x99, 0x47, 0x98, 0x46, 0xa3, 0xb3, + 0xc4, 0x52, 0x97, 0xb0, 0x75, 0x44, 0x48, 0xca, 0xc6, 0xc0, 0x77, 0x5a, + 0x90, 0xba, 0x9d, 0x7e, 0x52, 0x6f, 0xbb, 0x82, 0x52, 0x9f, 0x90, 0x44, + 0xb7, 0x67, 0xcc, 0x94, 0x55, 0x35, 0x63, 0x79, 0x98, 0xcc, 0x41, 0x88, + 0x79, 0xd3, 0xb2, 0x70, 0x8d, 0x89, 0x92, 0x6d, 0xa2, 0x6b, 0x8e, 0xa8, + 0x38, 0x61, 0x55, 0xad, 0x4d, 0xc5, 0xcd, 0xd0, 0x56, 0xd0, 0x63, 0xa1, + 0x5a, 0x58, 0xa8, 0xab, 0xcb, 0x58, 0x7e, 0x8e, 0xd2, 0xbd, 0x47, 0xca, + 0x93, 0x99, 0xca, 0xa8, 0xa2, 0xbb, 0xbb, 0xb0, 0x36, 0x3d, 0x4d, 0x34, + 0xb7, 0x35, 0x7a, 0x5f, 0x5c, 0x36, 0x36, 0xc9, 0xb1, 0x3f, 0x5f, 0xb8, + 0x63, 0x5d, 0xbf, 0x5c, 0x99, 0x4c, 0x5f, 0x75, 0xb0, 0x78, 0x5d, 0x8b, + 0x60, 0x8b, 0x87, 0x6b, 0x35, 0xa9, 0x84, 0xbf, 0x42, 0x3d, 0xd1, 0x49, + 0x54, 0x39, 0x40, 0x3a, 0xae, 0xb0, 0xbc, 0x69, 0x82, 0xc7, 0x4e, 0x99, + 0xaa, 0x97, 0x6c, 0x30, 0xad, 0xcf, 0x58, 0x6a, 0x97, 0x73, 0x5f, 0x51, + 0x54, 0xac, 0x3d, 0xa7, 0x81, 0x5a, 0x5e, 0xa7, 0x4c, 0xae, 0x97, 0xc9, + 0x6a, 0xa8, 0xba, 0xc4, 0x5e, 0x92, 0x51, 0x3a, 0x70, 0x5d, 0x69, 0x78, + 0x9b, 0xc7, 0x57, 0x42, 0xb7, 0xb0, 0x3d, 0x97, 0x59, 0xa7, 0x59, 0xcd, + 0x30, 0x7f, 0x54, 0x6d, 0x95, 0xb0, 0xba, 0xb2, 0xa4, 0xc6, 0x8a, 0x8d, + 0x6f, 0x8c, 0x5f, 0xbf, 0x88, 0xc9, 0x4f, 0x96, 0xb9, 0xae, 0x52, 0x39, + 0x50, 0x69, 0xbe, 0x37, 0xc1, 0x8a, 0x99, 0xa9, 0xaf, 0x3b, 0x88, 0x64, + 0x30, 0x82, 0x76, 0x81, 0x60, 0xbf, 0x39, 0xca, 0x9e, 0x7c, 0x49, 0x72, + 0x6a, 0x6f, 0x83, 0xc5, 0xaf, 0x33, 0x5f, 0x6b, 0x6e, 0x4d, 0xc0, 0xc9, + 0x4e, 0x66, 0x6b, 0xaf, 0xce, 0x33, 0xb1, 0xa3, 0x32, 0x90, 0xc8, 0xac, + 0xa7, 0x8b, 0xa5, 0x78, 0x98, 0x68, 0x3f, 0x57, 0xac, 0x84, 0xc8, 0x31, + 0x82, 0x33, 0xc2, 0x6f, 0x37, 0x37, 0x95, 0x7c, 0x87, 0xc8, 0x5b, 0xae, + 0xb0, 0x34, 0xb2, 0x83, 0x9d, 0x98, 0x9a, 0x53, 0x36, 0x8f, 0x89, 0x87, + 0xa8, 0x54, 0x61, 0x3e, 0x52, 0x54, 0x4b, 0x67, 0xa2, 0x93, 0xa5, 0x75, + 0x55, 0x67, 0x55, 0xc2, 0x4f, 0xbd, 0xce, 0x6c, 0x9e, 0x8e, 0x72, 0x36, + 0x4a, 0x58, 0xa4, 0x32, 0x4a, 0x44, 0xbe, 0xc4, 0xcb, 0x80, 0x41, 0x55, + 0xbe, 0xac, 0x38, 0x68, 0x58, 0x70, 0xae, 0x91, 0x8b, 0x62, 0x35, 0x49, + 0x6b, 0x81, 0x66, 0xbc, 0xa6, 0xc7, 0x54, 0xa7, 0x32, 0x3c, 0x6b, 0x88, + 0xae, 0x34, 0x98, 0x7b, 0xa3, 0x30, 0x33, 0xa9, 0x7f, 0x5b, 0x8d, 0xbb, + 0xb1, 0x80, 0x40, 0x6f, 0xc5, 0x32, 0x74, 0x77, 0x65, 0x6e, 0x7b, 0x35, + 0x4a, 0x96, 0x4a, 0x8d, 0xb6, 0x83, 0x38, 0x85, 0x43, 0x86, 0x6a, 0xc2, + 0x91, 0x8c, 0xc3, 0xb2, 0x53, 0x86, 0x7f, 0xaf, 0x53, 0xa2, 0x48, 0x50, + 0x88, 0x7b, 0xcd, 0x3d, 0xd2, 0xcd, 0x73, 0x86, 0x47, 0xa1, 0x4e, 0xb0, + 0x41, 0x3c, 0xb8, 0x54, 0xa6, 0x9d, 0xae, 0x42, 0xc5, 0x72, 0x49, 0x7d, + 0x4f, 0x84, 0x3f, 0x88, 0x53, 0x48, 0x97, 0x3e, 0xd5, 0x8d, 0x44, 0xbb, + 
0xc3, 0xa8, 0xb2, 0xca, 0x5b, 0x57, 0xc4, 0x65, 0x72, 0xc1, 0xb3, 0xc7, + 0xa1, 0x92, 0x98, 0x7f, 0x79, 0x7d, 0x57, 0x78, 0xcd, 0x3e, 0x78, 0x65, + 0xcb, 0x50, 0x8d, 0x9f, 0xa1, 0xae, 0x3b, 0x33, 0x6e, 0x8d, 0x85, 0x6f, + 0x62, 0x4c, 0x71, 0xb7, 0x9d, 0xd0, 0x97, 0xac, 0x92, 0x42, 0x6a, 0x8f, + 0xa2, 0x85, 0x79, 0xa1, 0x5c, 0x42, 0x9b, 0x67, 0x62, 0x3c, 0x34, 0x69, + 0x55, 0xa4, 0xc4, 0x3b, 0x88, 0x87, 0xa5, 0x89, 0x9a, 0xb2, 0xa2, 0xa7, + 0xae, 0x68, 0xac, 0x99, 0x95, 0x5f, 0x51, 0x8d, 0xc4, 0x94, 0x87, 0x79, + 0xb3, 0x93, 0x69, 0x36, 0x60, 0x3a, 0x4f, 0x8c, 0x70, 0xa2, 0x9d, 0x35, + 0x44, 0x99, 0x7a, 0x6a, 0x49, 0x60, 0x52, 0x61, 0x81, 0xc5, 0x5f, 0x6a, + 0x6d, 0xca, 0x92, 0x40, 0x8c, 0x7d, 0xbf, 0x5e, 0x8d, 0x57, 0x3b, 0xb4, + 0xc4, 0xa4, 0x61, 0x4e, 0x47, 0x66, 0x74, 0xa9, 0x60, 0x77, 0x37, 0x6b, + 0x2e, 0x9b, 0xcd, 0x9c, 0x5f, 0x7c, 0xbc, 0xca, 0xb5, 0x8a, 0x57, 0x99, + 0xa0, 0x91, 0x7b, 0xb7, 0xa2, 0x93, 0xc0, 0x8a, 0xb9, 0x3e, 0x62, 0x58, + 0x34, 0x73, 0xa5, 0x7b, 0x4d, 0x53, 0x90, 0xa3, 0x94, 0xa8, 0xaa, 0x78, + 0x90, 0x64, 0xcd, 0x3c, 0x88, 0x9d, 0xd1, 0x85, 0x68, 0x70, 0x5d, 0xcc, + 0x85, 0x5d, 0x4c, 0x42, 0x58, 0x87, 0x9b, 0x4a, 0x84, 0x68, 0x75, 0xd1, + 0xa6, 0x81, 0x88, 0x7d, 0x9d, 0x4f, 0xb5, 0x39, 0xa7, 0x87, 0xca, 0xc2, + 0x7c, 0xa4, 0x7d, 0xc8, 0x5f, 0x83, 0x87, 0xc9, 0xc1, 0x92, 0x5a, 0x70, + 0xa1, 0x5f, 0xaf, 0xbe, 0x38, 0xcc, 0xa1, 0xa2, 0xa3, 0x49, 0x6f, 0xa2, + 0xce, 0xc6, 0xc9, 0x42, 0x81, 0x50, 0xc3, 0xc0, 0x6d, 0x87, 0x62, 0x8b, + 0xad, 0x8f, 0xcf, 0xa7, 0x7c, 0xc3, 0x33, 0x9d, 0x4a, 0xc8, 0x47, 0x90, + 0xa3, 0x7d, 0x42, 0xa3, 0x5d, 0x30, 0xd3, 0x68, 0xd3, 0x6b, 0x42, 0x63, + 0xbc, 0x49, 0xd0, 0x74, 0x57, 0xa5, 0xca, 0x93, 0xb9, 0x6a, 0x42, 0x77, + 0x9d, 0xc1, 0x6e, 0x60, 0x37, 0xab, 0x6a, 0xb2, 0x53, 0x5f, 0xbd, 0x7a, + 0x5a, 0x63, 0x50, 0x4b, 0x91, 0x63, 0x45, 0xb6, 0x7c, 0x9d, 0x68, 0x86, + 0xb2, 0x43, 0x9b, 0xbc, 0xbe, 0x3d, 0xa0, 0x43, 0x50, 0x5a, 0xc9, 0xbc, + 0x5f, 0x5c, 0xac, 0x66, 0x83, 0x6c, 0x9e, 0x7b, 0x59, 0xcd, 0xd2, 0x59, + 0x96, 0xb9, 0xaa, 0x3f, 0x9f, 0x8f, 0x45, 0x4b, 0x8c, 0x90, 0x42, 0xc3, + 0x96, 0x54, 0x9a, 0x85, 0xac, 0x4d, 0xcd, 0xc3, 0x78, 0x9a, 0x91, 0x8d, + 0x3a, 0xb9, 0x75, 0x59, 0x46, 0x28, 0xb8, 0x9d, 0x4b, 0xbd, 0x89, 0x9d, + 0x87, 0xc3, 0x4a, 0xa2, 0xad, 0xa9, 0x93, 0x7b, 0x50, 0x45, 0x6e, 0x89, + 0x75, 0x3a, 0x3c, 0x32, 0x65, 0x93, 0x56, 0xc4, 0x93, 0x3f, 0x80, 0x53, + 0x3f, 0xa9, 0x39, 0x52, 0x4c, 0xc7, 0x78, 0x51, 0x72, 0xa8, 0x3d, 0x70, + 0xaa, 0xb4, 0x64, 0x7a, 0x71, 0x81, 0xb0, 0xa5, 0x5c, 0x54, 0x97, 0x8c, + 0x81, 0x69, 0x3f, 0x8b, 0xa8, 0x64, 0x9a, 0xd0, 0xa0, 0x88, 0xb2, 0xc5, + 0x6b, 0xbf, 0x4d, 0xba, 0xc4, 0x36, 0x8e, 0x8a, 0xa3, 0x92, 0x4a, 0x8b, + 0x3c, 0xaf, 0x5a, 0x3a, 0x46, 0x5e, 0xb4, 0x69, 0xb5, 0x64, 0x67, 0x76, + 0xb1, 0x72, 0xc6, 0x4b, 0x54, 0xb3, 0x34, 0x77, 0xbe, 0xad, 0xb0, 0xd5, + 0x64, 0x4f, 0x7e, 0x42, 0x7e, 0x8c, 0x55, 0xab, 0x78, 0x89, 0x6a, 0xb8, + 0x44, 0x60, 0x4c, 0x9e, 0x66, 0xcb, 0xc0, 0xaa, 0xc4, 0x86, 0x4e, 0xd4, + 0x3b, 0x63, 0x57, 0x9c, 0x3e, 0xb4, 0x60, 0x68, 0x5a, 0x9d, 0x71, 0xc8, + 0x5d, 0x6f, 0x5c, 0x87, 0xae, 0x51, 0x91, 0xca, 0xd0, 0xa3, 0xb4, 0xdc, + 0xb3, 0x6a, 0xab, 0xd3, 0x7f, 0x5f, 0xcb, 0x64, 0x70, 0x9a, 0x4f, 0xac, + 0xbd, 0x8c, 0xb7, 0x41, 0x65, 0xb4, 0xa8, 0x5a, 0x67, 0x60, 0xc8, 0xcf, + 0xcd, 0x81, 0x9c, 0x58, 0x7e, 0x9e, 0x5e, 0x75, 0x4f, 0x5f, 0x98, 0x3a, + 0xad, 0xb1, 0x52, 0xbf, 0x7e, 0x78, 0x9c, 0x50, 0x4a, 0xb6, 0x72, 0x36, + 0xbc, 0x4d, 0x88, 0xbd, 0xcd, 0xac, 0x32, 0x68, 0xb7, 0x81, 0x9d, 0xa8, + 0xb6, 0x40, 0xca, 0xc7, 0x4e, 0xa9, 0x72, 0x8d, 0x8a, 0xdc, 0x5c, 0xa0, + 
0xb8, 0x7a, 0x6c, 0x7b, 0xa6, 0x9a, 0x3b, 0xcb, 0xce, 0x55, 0xc2, 0xc0, + 0xba, 0x85, 0x6e, 0x89, 0x60, 0xa4, 0xd6, 0xbb, 0xac, 0x55, 0xa1, 0x84, + 0x3a, 0xd5, 0x72, 0xb2, 0xb6, 0x6c, 0x3a, 0x41, 0x91, 0x95, 0x8d, 0x5d, + 0x34, 0x74, 0x53, 0x46, 0x75, 0xc1, 0x6b, 0x3c, 0x97, 0xc2, 0x41, 0x6f, + 0xcb, 0x9e, 0xcb, 0xa5, 0x32, 0xae, 0x85, 0x90, 0xb9, 0xbe, 0x73, 0xb1, + 0xbd, 0x89, 0xb2, 0x89, 0x9c, 0x47, 0xb5, 0xaa, 0xda, 0xbf, 0xcd, 0x35, + 0x43, 0x99, 0x2d, 0x9d, 0x93, 0x4c, 0x83, 0xd3, 0x5c, 0x68, 0x6b, 0x53, + 0x42, 0xcb, 0x60, 0x38, 0x2f, 0x61, 0xbc, 0xaa, 0xd4, 0xa5, 0x5e, 0x94, + 0xde, 0x8f, 0xaf, 0xa2, 0x94, 0xaf, 0x6e, 0x32, 0xd3, 0x34, 0x7c, 0x87, + 0x5f, 0x85, 0xd8, 0x8e, 0x4a, 0x6d, 0x46, 0xb6, 0xc8, 0xcd, 0x6a, 0xb4, + 0x5e, 0x97, 0xb3, 0x5a, 0x2e, 0x5a, 0xe1, 0x7e, 0x79, 0x67, 0xd1, 0xa9, + 0x46, 0xd0, 0x5e, 0xbb, 0x67, 0x62, 0x50, 0x63, 0x63, 0x5b, 0x4e, 0x7f, + 0x73, 0xd4, 0x56, 0x8d, 0xa2, 0xc3, 0x83, 0x8c, 0x79, 0xc1, 0x34, 0x8b, + 0x4e, 0x46, 0x98, 0x2e, 0xbc, 0xca, 0xa1, 0x3a, 0x7b, 0x62, 0x9f, 0xae, + 0xb1, 0x52, 0xba, 0x3f, 0xc1, 0x52, 0xb4, 0x61, 0x8f, 0xb3, 0x9e, 0x74, + 0xb6, 0x4e, 0xac, 0xc5, 0x82, 0x95, 0x85, 0x6a, 0x5a, 0xbe, 0x71, 0x54, + 0x4f, 0x50, 0x88, 0x71, 0x3c, 0x8a, 0xbb, 0x5e, 0x8e, 0x89, 0xc9, 0x64, + 0xcb, 0x6a, 0xa7, 0x41, 0xa6, 0x9f, 0xd5, 0xa3, 0xa7, 0xb6, 0x63, 0xb7, + 0xb7, 0xac, 0x59, 0xb8, 0xc9, 0x73, 0xa5, 0x46, 0x9b, 0x6d, 0x71, 0x69, + 0xb8, 0xce, 0xcf, 0xab, 0x95, 0xbf, 0x61, 0x76, 0x8b, 0xce, 0x5a, 0x58, + 0xa4, 0x99, 0xcb, 0x36, 0x9e, 0xa5, 0x4e, 0x36, 0x5b, 0x96, 0x2e, 0x3f, + 0xc5, 0x84, 0xa9, 0x40, 0xa7, 0x86, 0x3c, 0x95, 0xb0, 0x96, 0x48, 0x9e, + 0x66, 0x88, 0xb9, 0x59, 0x72, 0x4b, 0x36, 0x44, 0x9e, 0x6b, 0xbe, 0xc0, + 0x57, 0x4f, 0x92, 0xac, 0xcf, 0xaf, 0x51, 0x4d, 0x9d, 0x56, 0x77, 0xc8, + 0xb1, 0x70, 0x9a, 0x5f, 0xc7, 0x74, 0x46, 0x35, 0xb0, 0xc8, 0x4f, 0x9d, + 0x83, 0x8c, 0x6e, 0x66, 0x95, 0x9e, 0xce, 0xd6, 0xd8, 0x77, 0x88, 0x29, + 0xa8, 0xaa, 0xbb, 0x77, 0x7c, 0xb0, 0x49, 0xa9, 0xaf, 0x8c, 0x47, 0xc2, + 0xaf, 0x2e, 0x47, 0x9b, 0x82, 0x5e, 0xdc, 0x45, 0xa8, 0xaa, 0x42, 0x3e, + 0x98, 0x3d, 0x5a, 0x3c, 0xbc, 0x84, 0xb4, 0x97, 0x72, 0x3a, 0xc3, 0x99, + 0x89, 0x3a, 0x42, 0x4b, 0x73, 0x54, 0xb2, 0x61, 0x5c, 0x8c, 0x78, 0xa7, + 0xbc, 0x86, 0x93, 0x43, 0x90, 0x44, 0xb9, 0x64, 0x91, 0x8b, 0x5c, 0x8e, + 0x75, 0x6a, 0x7c, 0x83, 0x51, 0xb1, 0x5e, 0x7f, 0xc9, 0xaf, 0xcd, 0x5b, + 0x70, 0xcb, 0x4e, 0x60, 0x75, 0x9a, 0x49, 0x70, 0xbd, 0xbe, 0xc5, 0x57, + 0x6d, 0x51, 0x5f, 0x7a, 0x92, 0x3d, 0x36, 0x70, 0x99, 0x75, 0x8f, 0xc8, + 0x7d, 0x9f, 0x93, 0x7e, 0x59, 0x41, 0xd1, 0x5f, 0x40, 0xa4, 0x69, 0x7a, + 0x6c, 0xb1, 0x8d, 0x9e, 0x3b, 0x84, 0xe0, 0xa7, 0x39, 0x8b, 0xb7, 0x73, + 0x72, 0x2a, 0x6d, 0x58, 0x6e, 0x88, 0xa7, 0x97, 0xc6, 0x76, 0x98, 0x8a, + 0x8e, 0xd8, 0x42, 0xb3, 0x89, 0x6c, 0xa4, 0x30, 0xc6, 0x53, 0xa7, 0x58, + 0x62, 0x51, 0x75, 0x93, 0xc6, 0x4e, 0xa0, 0x90, 0x5f, 0x63, 0x34, 0x87, + 0x9b, 0xc9, 0xa8, 0xcd, 0xbb, 0x48, 0xba, 0x54, 0xa1, 0x6f, 0xbf, 0x4d, + 0xd3, 0xc0, 0x7c, 0xb9, 0xaf, 0x57, 0xce, 0xc0, 0xbb, 0xab, 0x87, 0xad, + 0x3e, 0x97, 0x47, 0x3c, 0x7d, 0x56, 0x5b, 0xc2, 0x79, 0x5e, 0x7f, 0x3e, + 0x8a, 0x5c, 0x87, 0xa3, 0xcc, 0x6a, 0xc5, 0xbf, 0xcb, 0x32, 0xa5, 0xac, + 0x64, 0x41, 0x34, 0x30, 0x86, 0xc1, 0xbd, 0xa0, 0xa6, 0xcd, 0x74, 0x34, + 0x6a, 0x85, 0xd0, 0x4c, 0x9b, 0x9e, 0x74, 0xd1, 0x66, 0x3e, 0xd2, 0x9c, + 0xbd, 0x73, 0xad, 0x57, 0x9b, 0x63, 0xc4, 0x57, 0x63, 0x98, 0x34, 0x34, + 0x75, 0xc7, 0x49, 0x75, 0xca, 0x3c, 0x75, 0xb0, 0xae, 0xae, 0xb0, 0x5d, + 0x74, 0x7c, 0x70, 0x2d, 0x70, 0xb9, 0x38, 0xa2, 0xd4, 0x83, 0x71, 0x7a, + 
0xa1, 0x44, 0x97, 0x99, 0xa0, 0x75, 0xa0, 0x6f, 0x85, 0xbe, 0x59, 0x91, + 0x52, 0x30, 0x75, 0x9a, 0xb3, 0xba, 0x92, 0xce, 0xa8, 0x8d, 0xb9, 0xba, + 0x72, 0xa5, 0x6c, 0x88, 0x9b, 0x7c, 0x63, 0x57, 0xa1, 0xcd, 0x8e, 0xb2, + 0x87, 0x2f, 0xc7, 0x99, 0x7e, 0x70, 0x97, 0x6f, 0xcd, 0x4d, 0x4d, 0x83, + 0x46, 0x8f, 0x56, 0x8c, 0x87, 0x72, 0x9f, 0xb6, 0x95, 0x5e, 0xb7, 0x52, + 0x76, 0x62, 0xa2, 0xab, 0x51, 0xb3, 0x3e, 0xbf, 0x6b, 0xb0, 0x33, 0x96, + 0x47, 0x65, 0x36, 0xca, 0x76, 0xa6, 0x71, 0x3f, 0x3f, 0xab, 0xb3, 0xb6, + 0x79, 0xa4, 0xc2, 0xa2, 0xac, 0x84, 0x57, 0xa6, 0x9d, 0x51, 0x61, 0x6a, + 0x86, 0x6c, 0xb2, 0x4d, 0xba, 0x8e, 0x9c, 0x9e, 0x8f, 0x41, 0x63, 0x73, + 0x80, 0x7f, 0x60, 0x5d, 0x42, 0x5c, 0x46, 0x90, 0xc7, 0xce, 0x57, 0x9e, + 0x45, 0x57, 0x76, 0xa4, 0xc9, 0xc6, 0x4b, 0x81, 0x81, 0xa8, 0x36, 0x83, + 0x6f, 0x39, 0x8b, 0xb9, 0x82, 0x86, 0x8a, 0xa3, 0x94, 0x5f, 0x55, 0xac, + 0xa7, 0x80, 0x76, 0x73, 0x5c, 0xae, 0xb3, 0x5d, 0x42, 0x92, 0xa4, 0x59, + 0x87, 0x85, 0xa7, 0x73, 0xa1, 0x5f, 0x4a, 0x8d, 0x53, 0xc5, 0x7f, 0xab, + 0x6c, 0x3a, 0x82, 0x80, 0x41, 0x80, 0x92, 0x85, 0xb0, 0xa7, 0x96, 0x65, + 0x97, 0xbf, 0xa8, 0xbf, 0x35, 0x83, 0xb9, 0x58, 0x68, 0xba, 0x63, 0x30, + 0x58, 0xa1, 0x6f, 0x99, 0xb7, 0x70, 0xc3, 0xb5, 0xb6, 0x82, 0xb4, 0x91, + 0x74, 0x7d, 0x49, 0xb0, 0x35, 0xa3, 0xd3, 0x6f, 0x4f, 0x38, 0x9a, 0x5b, + 0x8f, 0x6d, 0x84, 0xb4, 0x76, 0xbe, 0xbb, 0x9d, 0x4a, 0xad, 0x4c, 0x59, + 0xd0, 0xd1, 0xa1, 0x6f, 0xaa, 0x73, 0x47, 0xa1, 0x48, 0x32, 0x68, 0x71, + 0x51, 0xb3, 0x3c, 0xcd, 0x9c, 0xb9, 0x96, 0xa2, 0x93, 0x77, 0x5a, 0x2c, + 0xcd, 0x6a, 0x78, 0xb0, 0x95, 0x51, 0x99, 0xab, 0xc5, 0x4f, 0x42, 0xb9, + 0x72, 0x42, 0x42, 0xc2, 0xd2, 0xcc, 0x50, 0x91, 0x68, 0xa5, 0x41, 0xa7, + 0x8f, 0x82, 0xac, 0xa8, 0x51, 0x8c, 0xa5, 0xc2, 0x46, 0x9c, 0x37, 0xbc, + 0x47, 0x62, 0x75, 0x98, 0xc6, 0x51, 0xc0, 0x58, 0x65, 0x68, 0xa5, 0xb8, + 0xaa, 0xc4, 0xb0, 0x7b, 0x56, 0xa0, 0xad, 0xd1, 0x8d, 0x47, 0x94, 0x4b, + 0x7f, 0xca, 0x7f, 0xc3, 0xc8, 0xba, 0x61, 0x82, 0xd0, 0x68, 0x8d, 0x5c, + 0x8e, 0xbb, 0xc0, 0x44, 0xd1, 0x61, 0x71, 0xb0, 0xb5, 0x90, 0x78, 0x33, + 0x91, 0xb6, 0x90, 0x84, 0xcd, 0x48, 0x5c, 0x41, 0xab, 0x88, 0x83, 0x76, + 0x76, 0x90, 0x57, 0xaf, 0x6d, 0xc5, 0xa7, 0x3d, 0x56, 0x87, 0x94, 0xad, + 0x9e, 0x4d, 0x3f, 0x47, 0x47, 0xc4, 0x68, 0x75, 0x50, 0x3b, 0x36, 0x34, + 0xb9, 0xb3, 0xa5, 0xaa, 0x94, 0x86, 0x9f, 0xc3, 0xa9, 0x91, 0x61, 0x71, + 0x3a, 0x45, 0x5e, 0x76, 0xc2, 0xce, 0x67, 0x68, 0xa6, 0x81, 0x84, 0x83, + 0xbf, 0xcf, 0x9b, 0x37, 0x58, 0xa2, 0xb0, 0x9f, 0x48, 0x7c, 0xb4, 0xbb, + 0x9c, 0x46, 0x93, 0x95, 0xb2, 0x75, 0x85, 0xbd, 0x49, 0xd4, 0x81, 0x4d, + 0x94, 0xbe, 0x36, 0x8b, 0xac, 0xba, 0xd4, 0xb7, 0x49, 0xcb, 0x6e, 0x52, + 0x92, 0x43, 0xc7, 0xab, 0x9b, 0xb5, 0x98, 0xc0, 0x81, 0x4d, 0x99, 0x52, + 0x90, 0x7d, 0x72, 0x80, 0x9a, 0xb7, 0x6a, 0x56, 0x83, 0xd2, 0x74, 0xc6, + 0x65, 0x7a, 0xa6, 0x35, 0x3c, 0x7d, 0x82, 0x70, 0x7c, 0x81, 0xb5, 0x8c, + 0x40, 0x96, 0x4b, 0x4c, 0x7c, 0xbb, 0x74, 0x48, 0x77, 0xb1, 0x50, 0x3b, + 0xcc, 0x7f, 0xb0, 0x43, 0xae, 0x58, 0x79, 0x48, 0x9f, 0x8a, 0x94, 0xb9, + 0x4d, 0x60, 0x6f, 0x53, 0x4b, 0x87, 0x9f, 0x35, 0xa9, 0x55, 0x60, 0x3c, + 0x79, 0x65, 0x91, 0x92, 0x3b, 0xcb, 0xc2, 0x64, 0x5f, 0xc0, 0x26, 0x7e, + 0xb5, 0x5a, 0xcd, 0x83, 0x39, 0x38, 0x3b, 0x99, 0x85, 0x30, 0x8a, 0x57, + 0xa7, 0x9c, 0x2f, 0x32, 0x32, 0x98, 0x72, 0xb6, 0x8d, 0x40, 0x2f, 0xc6, + 0x99, 0x68, 0xc9, 0xc7, 0xbb, 0x4d, 0xd0, 0x40, 0xaa, 0x3b, 0x69, 0x9d, + 0x8d, 0xc4, 0xd1, 0x78, 0x34, 0xb8, 0x9a, 0xaa, 0x7d, 0x93, 0xd3, 0x39, + 0x95, 0x73, 0x6a, 0x3e, 0x94, 0x91, 0x7e, 0x5a, 0x82, 0x93, 0xca, 0x9b, + 
0x59, 0x34, 0x43, 0x40, 0x37, 0x60, 0x7b, 0xcc, 0x58, 0x66, 0x55, 0x79, + 0x5b, 0x65, 0x45, 0x49, 0x7d, 0x6e, 0x85, 0xbf, 0x71, 0x6f, 0x5a, 0xb7, + 0x62, 0xab, 0x64, 0x8f, 0xa8, 0x75, 0xc2, 0x75, 0x76, 0x53, 0x84, 0x45, + 0x36, 0x49, 0x5b, 0x90, 0x47, 0xd5, 0x7f, 0x60, 0xd4, 0xc8, 0x92, 0x9b, + 0xc4, 0xaa, 0xb9, 0x77, 0x3a, 0x9e, 0x9f, 0x7b, 0x7a, 0xc1, 0xa6, 0x4a, + 0x85, 0xb7, 0x95, 0x35, 0xc5, 0x50, 0x59, 0xb4, 0x3c, 0xa8, 0xa1, 0x45, + 0xbf, 0x52, 0x87, 0x95, 0x69, 0x51, 0xa5, 0xc2, 0xd1, 0xad, 0x3c, 0x69, + 0xa1, 0xb3, 0x7d, 0x5a, 0x56, 0xb4, 0x65, 0x73, 0x5b, 0xd3, 0xc9, 0x8f, + 0xc4, 0xc2, 0x67, 0x65, 0x64, 0xbc, 0xbc, 0x98, 0x71, 0x42, 0x5d, 0xcf, + 0xa0, 0xb9, 0xca, 0x76, 0x84, 0xbe, 0x75, 0x87, 0x8c, 0xbd, 0x40, 0x5b, + 0x63, 0xb4, 0xbe, 0xbe, 0x5a, 0x5a, 0x95, 0xc8, 0xbd, 0x90, 0x57, 0x76, + 0x7f, 0x5a, 0x4c, 0xc2, 0x88, 0x70, 0x53, 0x6d, 0x62, 0x6c, 0x45, 0x51, + 0x60, 0x49, 0x2c, 0x83, 0x6a, 0x86, 0x93, 0xb8, 0x9c, 0x96, 0x31, 0x6e, + 0x6a, 0xc9, 0xcd, 0xac, 0x3a, 0x8b, 0x7c, 0xb1, 0x6c, 0x9a, 0x4d, 0xc6, + 0xa9, 0xba, 0x92, 0xc4, 0xca, 0xab, 0xcc, 0xd1, 0xd2, 0x9b, 0xb0, 0xbb, + 0xc6, 0x59, 0x3b, 0xaa, 0x68, 0xb0, 0xcf, 0x72, 0xb8, 0x5e, 0x63, 0x5f, + 0x51, 0xbc, 0xc4, 0xa7, 0x32, 0xb8, 0x40, 0x3c, 0x8e, 0x96, 0xd1, 0x72, + 0xcd, 0x3f, 0xc6, 0xb1, 0x62, 0x85, 0x79, 0x91, 0x3d, 0x70, 0x51, 0x8f, + 0x3c, 0xc4, 0x62, 0x95, 0xc2, 0x8e, 0xc2, 0xa5, 0x38, 0xa1, 0xde, 0x9b, + 0x82, 0x74, 0x91, 0x88, 0x40, 0x67, 0xad, 0x4e, 0x66, 0xbc, 0x6c, 0x5e, + 0x53, 0x64, 0x83, 0x74, 0x47, 0x6a, 0x73, 0x67, 0x4e, 0x5a, 0x81, 0x8e, + 0x95, 0x6b, 0x3b, 0xa9, 0x9c, 0x7b, 0xc2, 0x79, 0xa7, 0x93, 0x54, 0x5b, + 0x53, 0xa8, 0x89, 0x6a, 0x9f, 0x4c, 0x5b, 0x9d, 0x9b, 0x74, 0xd5, 0xb7, + 0x8c, 0xcc, 0x75, 0xbf, 0xac, 0x36, 0x42, 0xac, 0x66, 0x90, 0x5c, 0x47, + 0x3b, 0xc8, 0xac, 0x87, 0x65, 0x85, 0xac, 0x4e, 0x44, 0x5a, 0x8a, 0x3d, + 0xb9, 0x8c, 0x77, 0xb1, 0xaa, 0x61, 0xa3, 0xc3, 0xb9, 0x8a, 0xbf, 0xcd, + 0xcf, 0xae, 0x95, 0xa8, 0x74, 0x99, 0x47, 0x38, 0x84, 0x8c, 0x50, 0x62, + 0x6e, 0x8d, 0x77, 0x7c, 0x90, 0x94, 0xae, 0x55, 0x65, 0xac, 0x85, 0xcb, + 0x9d, 0x8d, 0x72, 0x6e, 0x3a, 0xa7, 0x4d, 0x70, 0x36, 0x42, 0x8b, 0xb8, + 0x82, 0x5a, 0x98, 0xc6, 0xcd, 0x66, 0x71, 0x44, 0x43, 0x51, 0xb8, 0x7d, + 0x93, 0x67, 0x8d, 0xc1, 0x98, 0x90, 0xbb, 0x7c, 0x3c, 0x38, 0x57, 0x67, + 0x39, 0xb6, 0x72, 0x3c, 0x6a, 0x66, 0xc9, 0x75, 0xae, 0xa1, 0xc2, 0x7f, + 0xbc, 0xa3, 0x82, 0x4d, 0x69, 0xa7, 0x71, 0x9f, 0x84, 0x69, 0x3d, 0x41, + 0x75, 0x63, 0x90, 0xbb, 0xcb, 0xb2, 0x71, 0x3a, 0xcc, 0x4c, 0xc5, 0x57, + 0x84, 0xd3, 0xb7, 0x91, 0x35, 0x97, 0xac, 0x9d, 0xa2, 0x43, 0x7f, 0x96, + 0xd4, 0xb8, 0x8c, 0x33, 0x46, 0x61, 0xcd, 0xa9, 0x8e, 0x8d, 0xb5, 0x37, + 0x8e, 0x91, 0x6a, 0x9c, 0xac, 0x75, 0x84, 0x5b, 0x92, 0xc8, 0x7b, 0x86, + 0xae, 0x6c, 0x58, 0x84, 0x69, 0xc4, 0xba, 0xb6, 0xc7, 0x49, 0xc5, 0xc9, + 0x4c, 0x62, 0x63, 0x43, 0xb1, 0x9f, 0x3f, 0xaa, 0x39, 0x87, 0x9b, 0x8c, + 0xc5, 0x5b, 0xb2, 0x59, 0x57, 0x7e, 0x86, 0x91, 0x6b, 0xc3, 0x4c, 0xa8, + 0x3a, 0xb0, 0xd5, 0x6b, 0x82, 0x40, 0x78, 0x57, 0x49, 0x40, 0x4a, 0x37, + 0x96, 0x9f, 0x6d, 0x9c, 0xb5, 0x9e, 0xba, 0x91, 0x50, 0x99, 0xd5, 0xaa, + 0x83, 0x84, 0xba, 0x64, 0x82, 0x82, 0xc1, 0xbe, 0x39, 0xb8, 0xb9, 0x42, + 0x39, 0x60, 0x87, 0x8d, 0x49, 0x9a, 0x59, 0x9c, 0x54, 0x49, 0x3c, 0x3f, + 0x97, 0x49, 0x29, 0x39, 0xc7, 0x6b, 0x39, 0x3f, 0xaa, 0x8e, 0x9d, 0x3b, + 0x60, 0x50, 0x79, 0x6d, 0x8c, 0x6e, 0x53, 0x31, 0x8b, 0x63, 0x89, 0x89, + 0x46, 0x32, 0x4d, 0x9c, 0x4b, 0x90, 0x84, 0x36, 0x9e, 0x8b, 0x34, 0x42, + 0x5f, 0x83, 0x46, 0x92, 0xca, 0x5c, 0xa8, 0xa9, 0xa2, 0xb7, 0xab, 0x7c, + 
0xae, 0x7b, 0xab, 0xa1, 0x67, 0x50, 0x3b, 0x82, 0x58, 0x43, 0xc5, 0xa6, + 0x9a, 0xcd, 0x75, 0x76, 0x7c, 0x5a, 0x62, 0x4e, 0xc9, 0x47, 0x95, 0x92, + 0xbc, 0x39, 0x57, 0x94, 0x7f, 0x65, 0x9e, 0xc0, 0xc4, 0x7c, 0x73, 0xcf, + 0x3f, 0x68, 0x98, 0xc4, 0x41, 0x9b, 0x46, 0x9e, 0x9f, 0xb2, 0x8b, 0x49, + 0x61, 0x56, 0x57, 0x5b, 0xaa, 0x97, 0xb6, 0xc1, 0x81, 0xbc, 0x86, 0x48, + 0x79, 0xb1, 0x40, 0x85, 0xb6, 0xb4, 0x58, 0x4a, 0xb7, 0x4a, 0x34, 0xc0, + 0xd1, 0x60, 0x3b, 0x7a, 0x32, 0xc5, 0x65, 0x88, 0x91, 0x84, 0x93, 0x77, + 0x38, 0x7f, 0x92, 0x9c, 0x56, 0x77, 0x93, 0x3b, 0xc6, 0x56, 0x8d, 0xa7, + 0x96, 0x4c, 0x38, 0x5e, 0xa8, 0x89, 0x4a, 0x69, 0x92, 0x9f, 0x3c, 0x93, + 0x48, 0x57, 0x4e, 0x6c, 0x90, 0x49, 0x6b, 0x3a, 0x7b, 0x65, 0x84, 0x6d, + 0x6d, 0xa6, 0xbf, 0x9d, 0x77, 0x37, 0x9c, 0x9e, 0x3e, 0x3d, 0xae, 0x71, + 0x91, 0x80, 0x3c, 0x8a, 0xaa, 0x38, 0xa4, 0x72, 0x5d, 0x73, 0x81, 0x48, + 0xcb, 0x3b, 0xac, 0x5e, 0xaf, 0xc5, 0xae, 0x8a, 0x5a, 0x7d, 0x42, 0x8c, + 0x72, 0xc4, 0x3d, 0xac, 0x35, 0x4c, 0x7b, 0x2d, 0x4a, 0xa8, 0x6c, 0x98, + 0xbc, 0xbc, 0x39, 0x96, 0x37, 0x4d, 0x31, 0xcd, 0x4d, 0xad, 0xa2, 0x7a, + 0xac, 0x8f, 0x62, 0x9f, 0x9e, 0x4e, 0xd2, 0x62, 0x99, 0x8e, 0x45, 0xb2, + 0x78, 0x7e, 0x63, 0x8f, 0x98, 0xa0, 0xbe, 0x6c, 0x4a, 0xb4, 0x3e, 0xb8, + 0x38, 0xab, 0x4d, 0xba, 0x68, 0x57, 0xd5, 0x39, 0x48, 0x7b, 0xb1, 0x32, + 0x5f, 0x6a, 0x82, 0xc0, 0x6a, 0x94, 0x77, 0x7e, 0x90, 0x8a, 0x8d, 0x67, + 0xa9, 0xcd, 0x83, 0xa9, 0x83, 0xa3, 0x72, 0x9d, 0x8d, 0xbc, 0x8e, 0x77, + 0x7d, 0x7b, 0x97, 0x66, 0xaf, 0x91, 0x5b, 0x48, 0x88, 0xb1, 0xa7, 0xbb, + 0x8f, 0xd5, 0x9f, 0x7d, 0x91, 0x48, 0xc7, 0x58, 0x59, 0x3a, 0x61, 0xbb, + 0x31, 0xa5, 0x63, 0x7c, 0x82, 0x9f, 0xb7, 0x46, 0x6e, 0x58, 0x38, 0x7f, + 0x93, 0x40, 0xc4, 0xb7, 0xab, 0x6b, 0x74, 0x9b, 0xce, 0x8b, 0x88, 0x45, + 0xb8, 0x5d, 0x55, 0x52, 0x59, 0x75, 0x4f, 0x32, 0xbc, 0x51, 0xc1, 0x8f, + 0xd2, 0x58, 0x7e, 0xe1, 0x41, 0x76, 0xa3, 0xd3, 0xb2, 0x46, 0x91, 0x64, + 0x47, 0x98, 0xbb, 0x32, 0x63, 0xae, 0x98, 0x3e, 0x6e, 0xa1, 0xa5, 0x7f, + 0xba, 0xb1, 0x8c, 0x82, 0xb8, 0x35, 0xb9, 0xd0, 0xc3, 0x8c, 0x2d, 0xbe, + 0xab, 0xc5, 0x98, 0x94, 0x3e, 0x40, 0xae, 0x7b, 0x5a, 0x62, 0xc6, 0x54, + 0x57, 0x60, 0xa0, 0x9a, 0xcf, 0x64, 0x95, 0x9d, 0x7c, 0x63, 0x8e, 0xbc, + 0x3d, 0x71, 0x9f, 0x3c, 0x9a, 0x35, 0x33, 0xa6, 0x76, 0x7f, 0x46, 0x34, + 0x94, 0x90, 0x3f, 0xc5, 0x42, 0x92, 0x7a, 0x41, 0x93, 0x5a, 0x52, 0xca, + 0xad, 0x9e, 0x7a, 0x3b, 0xc0, 0x69, 0xb5, 0x57, 0x9e, 0x94, 0x81, 0x9b, + 0x7f, 0xb1, 0x5c, 0x6c, 0x5f, 0x61, 0x81, 0x3d, 0xc9, 0x9c, 0x47, 0x9b, + 0xb8, 0xce, 0xc9, 0x5f, 0xd3, 0x5e, 0x78, 0xa2, 0xab, 0xb1, 0x71, 0xb7, + 0x86, 0x32, 0x70, 0x9f, 0x6a, 0x64, 0xc5, 0x74, 0xca, 0xa9, 0x60, 0xae, + 0x98, 0x37, 0x33, 0x73, 0x96, 0x68, 0x34, 0xc1, 0x3a, 0xa4, 0x7d, 0xa4, + 0x8c, 0x7c, 0x5e, 0x5f, 0x5a, 0x4b, 0x34, 0x35, 0xa1, 0xb1, 0x79, 0x41, + 0xa7, 0x7b, 0x3f, 0xbf, 0x9e, 0x3c, 0xad, 0x9c, 0xcb, 0x89, 0x8e, 0x96, + 0xbe, 0xc2, 0x5a, 0x78, 0x70, 0xba, 0x74, 0x47, 0xae, 0x2e, 0x6c, 0xc5, + 0x2f, 0xc8, 0xd1, 0x63, 0x71, 0x51, 0xa9, 0xbe, 0x7d, 0x97, 0x8a, 0x8c, + 0x7b, 0x33, 0x91, 0x86, 0x4c, 0x3b, 0xd1, 0xbc, 0x52, 0x3c, 0x57, 0x73, + 0x55, 0x68, 0x9d, 0x68, 0x5e, 0x93, 0x63, 0xb4, 0xa8, 0x90, 0xa2, 0xa8, + 0xc0, 0xc4, 0x3d, 0x4f, 0x30, 0xc0, 0x5c, 0xa1, 0xa0, 0x51, 0xc2, 0xd6, + 0x90, 0x8e, 0xb9, 0x5e, 0xc5, 0xb9, 0xb6, 0xc8, 0x30, 0xac, 0x4a, 0xc8, + 0x99, 0x82, 0xcc, 0xd0, 0x36, 0x42, 0x60, 0x42, 0xbf, 0x4f, 0xc4, 0x74, + 0xd4, 0xbb, 0x5e, 0x95, 0x5b, 0x5c, 0x42, 0x4f, 0xa7, 0x3a, 0x6f, 0x9b, + 0xcc, 0x9b, 0x77, 0x6b, 0xcd, 0x44, 0x84, 0x3c, 0x2e, 0x5b, 0x38, 0xca, + 
0xc3, 0xc9, 0x64, 0x78, 0x84, 0x42, 0xb0, 0x33, 0x98, 0xb7, 0x6c, 0x64, + 0x6b, 0x91, 0x55, 0x4c, 0x51, 0x7a, 0x45, 0x9d, 0xb8, 0x94, 0x99, 0x43, + 0xd2, 0xcc, 0x92, 0xa1, 0x65, 0xc5, 0x66, 0xab, 0x65, 0x58, 0x9d, 0xae, + 0x69, 0xcd, 0x89, 0x97, 0xbe, 0x8d, 0x43, 0xc5, 0xcc, 0xae, 0x8e, 0x83, + 0xa3, 0x8b, 0x4e, 0x6e, 0x42, 0x91, 0xb6, 0x43, 0x7d, 0x6b, 0xcf, 0x75, + 0x88, 0xa4, 0x69, 0x74, 0xc3, 0x95, 0x71, 0xd1, 0x68, 0x7d, 0x5f, 0x88, + 0xc2, 0x3f, 0xae, 0x59, 0x5a, 0xcc, 0xb2, 0x65, 0x62, 0xac, 0xcb, 0x75, + 0x9b, 0x64, 0xb0, 0x90, 0xca, 0xa4, 0x5f, 0x2f, 0x4a, 0x81, 0x41, 0xbc, + 0xcf, 0x98, 0x9f, 0xb1, 0x6a, 0x3e, 0x70, 0x69, 0x3e, 0x56, 0x6b, 0x3d, + 0xaa, 0x81, 0x56, 0x85, 0x73, 0x55, 0x4c, 0xbd, 0x48, 0xb3, 0x2e, 0x7e, + 0x4a, 0x3f, 0xa0, 0x45, 0x3c, 0x47, 0x6f, 0x3b, 0x5b, 0x37, 0xcd, 0x66, + 0xbf, 0xcc, 0x80, 0xcf, 0xb8, 0x7c, 0x60, 0x4d, 0x87, 0x33, 0x7a, 0xb2, + 0x8b, 0x6d, 0x7f, 0xbf, 0x81, 0x92, 0x54, 0x7a, 0xb1, 0x67, 0x86, 0xa7, + 0xbe, 0x70, 0x64, 0x64, 0xc9, 0xbc, 0x31, 0x33, 0xb3, 0x56, 0x4c, 0x93, + 0x81, 0xd5, 0xab, 0x6d, 0x66, 0x62, 0xa3, 0x71, 0xa2, 0xca, 0xa3, 0xcc, + 0x7b, 0xa7, 0x6e, 0x56, 0x58, 0xba, 0xd0, 0x74, 0x35, 0x5d, 0xbd, 0x68, + 0x53, 0x9a, 0x9b, 0xaf, 0xa3, 0x6c, 0xc3, 0x5d, 0x9d, 0x6b, 0xae, 0x30, + 0xc1, 0x6a, 0xd1, 0xce, 0x56, 0x6f, 0x4c, 0x6c, 0x52, 0xcb, 0x62, 0xbe, + 0xca, 0xc6, 0x36, 0xc2, 0xc9, 0x91, 0xca, 0xbb, 0x66, 0xc1, 0xa4, 0x7c, + 0x86, 0x76, 0x47, 0x5a, 0xa6, 0xb5, 0x50, 0x55, 0xce, 0xd0, 0x8b, 0x6d, + 0xc2, 0xc2, 0x7d, 0xa0, 0x39, 0x4e, 0x59, 0x6a, 0x88, 0x55, 0xb4, 0x3d, + 0x8d, 0x61, 0x3c, 0x64, 0x59, 0xa4, 0x58, 0x71, 0xb9, 0xa5, 0x93, 0xab, + 0x55, 0x8d, 0x5e, 0x29, 0x33, 0x64, 0xb0, 0x75, 0x2f, 0x52, 0x3e, 0x40, + 0x77, 0x35, 0xa4, 0x4e, 0x69, 0x56, 0x5b, 0xcb, 0xcc, 0xac, 0xb4, 0x5e, + 0xa5, 0x97, 0xc5, 0xc3, 0x64, 0xce, 0x4a, 0xc8, 0x95, 0x9f, 0x43, 0xd6, + 0x93, 0x8c, 0x47, 0x78, 0xa0, 0xbc, 0x64, 0x5f, 0x9d, 0x72, 0xb0, 0xb3, + 0x9d, 0xce, 0xc6, 0xa6, 0x9f, 0x4c, 0xbb, 0x98, 0xa9, 0xb1, 0x60, 0x3f, + 0x66, 0x57, 0x6b, 0x45, 0xa9, 0x74, 0xb7, 0xad, 0xab, 0x55, 0x5c, 0xac, + 0x80, 0x53, 0x42, 0xa2, 0xa5, 0x42, 0xab, 0x9c, 0x81, 0xaa, 0x75, 0x45, + 0xb6, 0x7a, 0x44, 0x78, 0x71, 0x45, 0x3d, 0x44, 0xb8, 0x87, 0x93, 0x87, + 0xb2, 0x4f, 0x56, 0xd5, 0x4f, 0xd0, 0x97, 0x9e, 0xa2, 0xc5, 0xb9, 0x7d, + 0x77, 0x91, 0xbd, 0x8d, 0x2f, 0x73, 0x58, 0x7e, 0x6f, 0xa9, 0xa9, 0x79, + 0xac, 0xcd, 0xc7, 0x9b, 0x7d, 0x43, 0x88, 0x5b, 0x59, 0x54, 0x99, 0xd0, + 0x56, 0x50, 0x6d, 0x7b, 0x37, 0x6b, 0x3f, 0x35, 0x43, 0x79, 0x32, 0x9a, + 0x3b, 0xc7, 0x9e, 0x67, 0xc2, 0x76, 0x57, 0x65, 0x3c, 0x54, 0x4f, 0x39, + 0x4c, 0xd1, 0x9a, 0x6e, 0xca, 0x89, 0x4e, 0x64, 0xcf, 0xbe, 0x8a, 0xd6, + 0xce, 0xc3, 0x81, 0xbd, 0x49, 0x34, 0x7b, 0x36, 0xb9, 0x3b, 0x87, 0x78, + 0x5b, 0x4c, 0x6a, 0x3c, 0x9c, 0x9e, 0x95, 0x39, 0xae, 0x6c, 0x66, 0x4f, + 0x5f, 0x5e, 0x69, 0x31, 0x88, 0xbc, 0x77, 0x7b, 0xa2, 0xbb, 0x78, 0x72, + 0x50, 0x60, 0xb7, 0x53, 0x8f, 0x50, 0x55, 0xc2, 0x36, 0x41, 0x40, 0x81, + 0xb3, 0x84, 0x46, 0xba, 0x91, 0x6c, 0x50, 0x3d, 0xd2, 0x73, 0x6c, 0x6f, + 0x62, 0x5c, 0x38, 0x64, 0x52, 0x35, 0x7d, 0x55, 0x6e, 0x9f, 0x3a, 0xa5, + 0x88, 0xad, 0x6e, 0x62, 0x6a, 0x57, 0x6a, 0x53, 0xa4, 0x99, 0x3f, 0x97, + 0xae, 0x32, 0x31, 0x6f, 0xbd, 0x6f, 0x53, 0x72, 0x5a, 0xa4, 0x3c, 0x5a, + 0x53, 0xce, 0x4d, 0xc3, 0x32, 0xd5, 0x5c, 0xa0, 0xa1, 0x9f, 0x88, 0xa2, + 0x7e, 0xa4, 0x87, 0x73, 0x4c, 0x35, 0x4f, 0x80, 0x7a, 0x69, 0x5e, 0xaa, + 0xc2, 0xd2, 0xc2, 0x85, 0x2d, 0x42, 0x59, 0x7d, 0xcc, 0xa6, 0x93, 0xc7, + 0x37, 0x35, 0x8a, 0x8c, 0xac, 0xbe, 0x4b, 0x89, 0x62, 0x97, 0xc8, 0x77, + 
0x55, 0x52, 0x50, 0x99, 0xc5, 0x6a, 0x8c, 0x84, 0x4d, 0xb6, 0x82, 0x9e, + 0x75, 0x43, 0x4c, 0xbb, 0xa6, 0x34, 0x42, 0x6f, 0xaf, 0xb1, 0xb5, 0xae, + 0x5f, 0x4b, 0x6c, 0x7c, 0xd1, 0x52, 0x72, 0x4d, 0x36, 0x6e, 0x48, 0x61, + 0x6b, 0x3a, 0xb7, 0x46, 0xb5, 0x49, 0x68, 0xb4, 0x3f, 0x70, 0xb6, 0xcd, + 0x43, 0x73, 0xb6, 0x8c, 0xb9, 0x5f, 0x91, 0xa6, 0x61, 0x88, 0x41, 0xb5, + 0xa2, 0x3e, 0x5d, 0xa5, 0x63, 0xc3, 0x5b, 0x32, 0x9f, 0x56, 0x33, 0xba, + 0xb8, 0x5b, 0x47, 0x3a, 0x99, 0x3b, 0x77, 0x5d, 0x4b, 0x46, 0x6a, 0xa3, + 0x61, 0x65, 0x89, 0xc9, 0xcc, 0xab, 0x93, 0x60, 0x59, 0x80, 0xba, 0x4a, + 0x4e, 0x34, 0xc3, 0xbd, 0xce, 0x97, 0x45, 0x3f, 0x61, 0x77, 0x67, 0xc5, + 0x7b, 0xad, 0xaa, 0x3c, 0xa9, 0x47, 0x92, 0x3c, 0x3c, 0x3d, 0x6b, 0x5a, + 0xd1, 0xbd, 0x61, 0x8b, 0xa5, 0x63, 0x72, 0x5a, 0x8c, 0x64, 0x9a, 0x60, + 0x95, 0x51, 0xb9, 0x7d, 0x34, 0x90, 0x8d, 0x5a, 0x66, 0x58, 0x9c, 0x89, + 0xac, 0x54, 0x86, 0x81, 0x80, 0x63, 0x7e, 0x7d, 0xc4, 0x9f, 0xa0, 0xb4, + 0x39, 0x8d, 0x4c, 0x38, 0x4b, 0x70, 0x81, 0xc1, 0xd1, 0x4d, 0x5d, 0x87, + 0xa4, 0x4c, 0x3a, 0x40, 0xc5, 0x52, 0x3d, 0xad, 0x3e, 0xae, 0x3f, 0xb3, + 0x61, 0x78, 0xc2, 0xc3, 0x69, 0x5d, 0xce, 0x94, 0x7a, 0x54, 0x5a, 0xa7, + 0x91, 0xcc, 0x67, 0x59, 0x5f, 0x8d, 0x46, 0x41, 0x5e, 0xbf, 0x86, 0x9f, + 0xb7, 0x86, 0x32, 0xb2, 0x72, 0x8a, 0xc4, 0x31, 0xa6, 0x67, 0x44, 0x6e, + 0x5c, 0xb2, 0x78, 0xcc, 0x8c, 0x6f, 0x68, 0x53, 0x64, 0x85, 0xba, 0x50, + 0x5e, 0x96, 0xb2, 0x46, 0x35, 0x45, 0xbe, 0x6c, 0x89, 0x90, 0x82, 0xbf, + 0xc5, 0x40, 0x60, 0xc0, 0x83, 0xa3, 0x9c, 0x57, 0xc4, 0x88, 0x6f, 0x32, + 0xb5, 0x74, 0xb7, 0x63, 0xc9, 0x72, 0x52, 0x9e, 0xc3, 0x8b, 0x36, 0x3a, + 0x3f, 0x5a, 0x98, 0x38, 0x5d, 0x7a, 0xb8, 0x56, 0x43, 0x6d, 0x58, 0x32, + 0x7d, 0x41, 0x96, 0xcd, 0x73, 0x38, 0x51, 0x38, 0x5a, 0x4e, 0x82, 0x53, + 0x9d, 0xbb, 0x79, 0x37, 0x76, 0x48, 0x97, 0x79, 0x42, 0xa8, 0x46, 0xb1, + 0x35, 0xad, 0x59, 0x9e, 0x67, 0xb6, 0x9c, 0xa4, 0x9b, 0xc3, 0xc9, 0x35, + 0xc9, 0xb8, 0xae, 0xc5, 0x49, 0x91, 0x9b, 0x67, 0x4b, 0xb3, 0xce, 0x57, + 0x40, 0x39, 0xa7, 0x57, 0x52, 0x69, 0x89, 0x9c, 0x9d, 0x5e, 0x87, 0x4a, + 0x87, 0x7a, 0x3a, 0x64, 0xa8, 0x5e, 0x40, 0x59, 0x4e, 0x63, 0x9d, 0x96, + 0x94, 0x9c, 0x37, 0xa2, 0xc5, 0xbe, 0xd6, 0x8e, 0x84, 0x8a, 0xc0, 0x5e, + 0x59, 0x9f, 0x66, 0xab, 0x83, 0x49, 0x3a, 0x84, 0x38, 0x44, 0x73, 0x9e, + 0x5a, 0xcc, 0x88, 0xd1, 0x98, 0x8a, 0x9e, 0x4b, 0x59, 0x32, 0xcc, 0xbf, + 0x4f, 0x9f, 0xb7, 0xa0, 0x6e, 0x34, 0x39, 0x91, 0xb5, 0x31, 0xbb, 0x8e, + 0x7c, 0x96, 0x35, 0x49, 0x72, 0x74, 0xc0, 0xab, 0x5c, 0x93, 0x6b, 0x6a, + 0x9f, 0x68, 0x49, 0xb9, 0xb6, 0x8e, 0x95, 0x5d, 0xa8, 0x89, 0x5d, 0xc2, + 0xb0, 0xc0, 0x7e, 0x43, 0x4d, 0x7d, 0x59, 0x5a, 0x31, 0x7b, 0x9e, 0x5c, + 0x92, 0x65, 0xcc, 0xc3, 0xce, 0x3f, 0xb3, 0xa9, 0x66, 0x78, 0x69, 0x67, + 0x78, 0x68, 0x57, 0xa0, 0xa5, 0x89, 0x9d, 0x53, 0x7b, 0xa3, 0x3d, 0x55, + 0x37, 0x8e, 0x4f, 0x44, 0xce, 0x47, 0x4d, 0x97, 0x89, 0x76, 0xb5, 0x5b, + 0x7b, 0x8a, 0x5a, 0x64, 0x54, 0x3d, 0x76, 0x92, 0x3b, 0x83, 0x36, 0x31, + 0x57, 0x43, 0x3c, 0x8d, 0x4b, 0x6b, 0x4c, 0xd4, 0xc3, 0x48, 0x63, 0x54, + 0xba, 0xc8, 0x92, 0x63, 0x56, 0x96, 0x97, 0x75, 0x44, 0x64, 0x9a, 0x65, + 0xb3, 0x93, 0x44, 0xa4, 0xb5, 0xa5, 0x5c, 0x91, 0x3b, 0xbd, 0x5b, 0x48, + 0x85, 0xcb, 0x8a, 0x7a, 0x45, 0xa3, 0x54, 0x5b, 0x4d, 0x2f, 0x2f, 0x4b, + 0xce, 0x9e, 0xcc, 0x87, 0x92, 0x9b, 0x77, 0x60, 0x5b, 0xce, 0x67, 0x99, + 0x8a, 0x7d, 0x47, 0x52, 0x6f, 0x76, 0x88, 0xc7, 0x49, 0x73, 0xa3, 0x5f, + 0xc1, 0x7a, 0x6b, 0x44, 0x7c, 0x58, 0x9d, 0x63, 0x75, 0x9c, 0x55, 0x5f, + 0x87, 0x4e, 0xbb, 0xbf, 0x90, 0xd5, 0x56, 0xc1, 0x33, 0x93, 0x9a, 0x58, + 
0x88, 0x58, 0x4d, 0x3d, 0x42, 0x82, 0x8c, 0x65, 0x56, 0xbe, 0x66, 0xbe, + 0x62, 0x67, 0x9e, 0x57, 0xc5, 0x4e, 0x70, 0x94, 0x70, 0xc8, 0xbb, 0xa7, + 0x39, 0x7e, 0x81, 0x5e, 0x8c, 0xa9, 0x94, 0x9f, 0xa3, 0x6f, 0x90, 0x51, + 0x76, 0x9a, 0xb3, 0x76, 0x94, 0x81, 0x4a, 0x34, 0x3c, 0xd3, 0xa3, 0x74, + 0xaf, 0x60, 0xa6, 0xa7, 0x3e, 0x3b, 0x84, 0x41, 0x80, 0x6c, 0x88, 0x52, + 0x9d, 0x8b, 0x81, 0xa0, 0x54, 0xbb, 0x4e, 0x91, 0x73, 0xbb, 0x36, 0x85, + 0x50, 0x4e, 0x4e, 0x36, 0xcc, 0xb6, 0x49, 0xc7, 0xc3, 0x73, 0x43, 0xaf, + 0xa9, 0x70, 0x59, 0x6e, 0xcd, 0x83, 0xb7, 0xb2, 0xa9, 0x37, 0x60, 0x3a, + 0x8b, 0xc5, 0x77, 0x74, 0x62, 0x3f, 0x7e, 0x50, 0x68, 0xa4, 0x94, 0x9f, + 0x38, 0x58, 0x74, 0xa0, 0xd1, 0x50, 0x49, 0x33, 0x7d, 0x38, 0x57, 0xbf, + 0x59, 0xb4, 0x6f, 0x7c, 0x63, 0xaa, 0x5b, 0x36, 0x5e, 0x89, 0x94, 0x34, + 0x3f, 0xa0, 0x41, 0xd0, 0xac, 0x4c, 0x7b, 0x86, 0xc3, 0xbf, 0x75, 0xbd, + 0x91, 0xc6, 0xc7, 0x54, 0x85, 0xd6, 0x7f, 0xc8, 0xbb, 0x67, 0x3e, 0x40, + 0x55, 0x63, 0x90, 0xcc, 0x6c, 0x95, 0x52, 0x71, 0xc2, 0xc7, 0xb9, 0x4a, + 0x8c, 0x54, 0xa3, 0x2c, 0x3d, 0x86, 0xc8, 0xc7, 0x4f, 0xcd, 0xd4, 0xb2, + 0x84, 0xc8, 0xa6, 0xcc, 0x52, 0x9a, 0x99, 0x76, 0x9b, 0x8c, 0x8a, 0xcd, + 0x42, 0xcd, 0xa1, 0x82, 0x81, 0xb6, 0x66, 0xa2, 0x84, 0x92, 0xa6, 0x41, + 0xc8, 0xb2, 0xd4, 0x47, 0xb8, 0xa3, 0xc3, 0xc5, 0x63, 0x42, 0x81, 0xc7, + 0xa2, 0x69, 0x53, 0x95, 0xaf, 0x6b, 0x88, 0x53, 0x3d, 0x39, 0xa9, 0x7c, + 0x9c, 0xb4, 0x6c, 0x92, 0xac, 0x97, 0x9b, 0xba, 0x2f, 0x4b, 0x77, 0x4a, + 0x4d, 0x5d, 0x54, 0x83, 0xc9, 0x73, 0x67, 0x44, 0x4c, 0x94, 0xb0, 0x82, + 0xaa, 0xad, 0x47, 0xba, 0xc8, 0x5d, 0x90, 0x92, 0x67, 0x88, 0xaa, 0x81, + 0x52, 0xb7, 0xa2, 0x4e, 0x34, 0x4f, 0x37, 0x95, 0x69, 0x99, 0x53, 0x5a, + 0x9f, 0x74, 0x99, 0xd1, 0x66, 0xc4, 0xac, 0x9c, 0x83, 0x3f, 0x57, 0x39, + 0x43, 0x35, 0x47, 0xb2, 0x83, 0xca, 0x30, 0xb8, 0xab, 0x68, 0x39, 0xc0, + 0x52, 0x7c, 0x5c, 0xc9, 0x6d, 0x96, 0xbd, 0x7f, 0xc9, 0x57, 0xc3, 0x4d, + 0xcf, 0x5d, 0x2b, 0xb9, 0x66, 0x55, 0x34, 0xc4, 0xbb, 0x89, 0x57, 0x62, + 0xc1, 0x57, 0x34, 0x32, 0x3d, 0x4c, 0x9e, 0xb0, 0x9e, 0xcb, 0x57, 0x9d, + 0x2b, 0x84, 0xc9, 0x35, 0x7d, 0x7f, 0x4a, 0x88, 0xa9, 0x43, 0x4e, 0xa8, + 0xb5, 0xab, 0xde, 0x80, 0xe6, 0xa0, 0xd1, 0x65, 0xa0, 0x4d, 0x44, 0x62, + 0xce, 0xe0, 0x70, 0xb7, 0x56, 0xa2, 0x6e, 0x94, 0x59, 0xd2, 0x7c, 0x94, + 0x80, 0xbc, 0xa2, 0x98, 0x45, 0xa7, 0xad, 0xa3, 0x52, 0x8c, 0xc2, 0x8c, + 0x45, 0x69, 0x53, 0x65, 0x79, 0x58, 0x38, 0x5e, 0xaa, 0x7d, 0x9b, 0x46, + 0xcd, 0x3b, 0x9c, 0x40, 0x7f, 0xd3, 0x5d, 0x89, 0xc6, 0xa8, 0x7f, 0xd2, + 0x93, 0x63, 0x69, 0x74, 0xb0, 0xc0, 0x31, 0xbc, 0x4c, 0x5f, 0x8e, 0x4a, + 0x5e, 0xb8, 0x97, 0x87, 0xa3, 0x68, 0x6e, 0x66, 0x2d, 0x58, 0xa8, 0x53, + 0x57, 0x49, 0x3f, 0xb6, 0x80, 0x5d, 0xb2, 0x7d, 0x87, 0xa4, 0x54, 0x9d, + 0x7b, 0x83, 0xa2, 0x32, 0x94, 0x78, 0x71, 0x65, 0xa8, 0x82, 0x4b, 0x86, + 0x43, 0xca, 0x4a, 0x57, 0x9d, 0xc7, 0xbb, 0x3f, 0x9e, 0xd0, 0x89, 0xb8, + 0xb4, 0x3e, 0xd4, 0xcf, 0x7f, 0x9e, 0x69, 0xb5, 0x57, 0xc4, 0x57, 0xc7, + 0x98, 0x52, 0xae, 0xa4, 0xb2, 0xaf, 0xbc, 0x74, 0x75, 0x79, 0x9c, 0x8f, + 0xba, 0x9f, 0x54, 0x6a, 0x87, 0x55, 0x46, 0x7b, 0x47, 0x80, 0x7a, 0x36, + 0xb5, 0xa4, 0x76, 0x39, 0x5e, 0xc7, 0x54, 0x8f, 0x6a, 0x6e, 0xcb, 0xa7, + 0x94, 0x46, 0x30, 0xcb, 0x8e, 0xc3, 0x8d, 0x5d, 0x4a, 0xcc, 0x95, 0xd2, + 0xab, 0xb1, 0x71, 0x46, 0xb6, 0xb0, 0x6c, 0xa5, 0x7f, 0xa4, 0xd6, 0x62, + 0x9a, 0x9c, 0xd7, 0x9f, 0x46, 0xcc, 0x3a, 0x85, 0x91, 0xe0, 0x47, 0xa0, + 0xb8, 0x4e, 0xcc, 0x94, 0x9a, 0xd3, 0xb7, 0x7f, 0x29, 0xb7, 0x3f, 0xcd, + 0x5c, 0x67, 0x30, 0x3c, 0x7b, 0xa5, 0x3c, 0x79, 0xac, 0xa5, 0xa7, 0x91, + 
0x50, 0x71, 0x72, 0x41, 0xc1, 0xab, 0x80, 0x75, 0x51, 0x8d, 0x9a, 0x5e, + 0xb7, 0xaa, 0x51, 0x94, 0x9e, 0xb3, 0x38, 0xa9, 0xab, 0x69, 0x59, 0x59, + 0xcb, 0xc8, 0x65, 0x3f, 0xb1, 0x56, 0x3b, 0x7b, 0x4f, 0xb5, 0x6c, 0xad, + 0xa8, 0x86, 0x60, 0xad, 0x45, 0x94, 0x6c, 0x33, 0x4f, 0xaa, 0x9e, 0x6e, + 0xa3, 0x3a, 0x3c, 0x4a, 0x91, 0x57, 0xaa, 0x7b, 0xb4, 0x5c, 0x3a, 0x52, + 0x6f, 0xa3, 0xbf, 0x3c, 0x9d, 0xda, 0x58, 0x50, 0xbe, 0x79, 0x9c, 0xba, + 0xb7, 0x8c, 0x8b, 0x98, 0x62, 0xbf, 0x92, 0x3f, 0x4c, 0xcb, 0xd5, 0x8b, + 0xbb, 0x6a, 0xc5, 0xa6, 0xaa, 0x67, 0x53, 0x5a, 0x76, 0x71, 0x7a, 0x54, + 0x70, 0x94, 0x8c, 0x4d, 0x7d, 0x73, 0x3c, 0xb1, 0x88, 0xca, 0x76, 0xcc, + 0xae, 0x2d, 0x73, 0xcb, 0x44, 0xb4, 0xc8, 0x49, 0xc6, 0x6b, 0x4c, 0x80, + 0x60, 0xaa, 0x84, 0x5f, 0xa5, 0x9d, 0xc3, 0x48, 0x2a, 0x84, 0x5a, 0xa0, + 0xce, 0x3e, 0x49, 0x58, 0x38, 0x49, 0x5d, 0xab, 0x3c, 0xb4, 0xcf, 0x48, + 0x5b, 0x9e, 0x3c, 0x53, 0xbd, 0x8a, 0xbc, 0x3f, 0x6a, 0xcc, 0xa9, 0xd5, + 0xb8, 0x96, 0xc7, 0x50, 0x2f, 0x9e, 0xa7, 0x59, 0xc2, 0x3f, 0x41, 0xbe, + 0x29, 0xab, 0x77, 0xc0, 0x5e, 0x93, 0xc0, 0x93, 0x35, 0x59, 0x50, 0xc0, + 0x9a, 0x6a, 0x53, 0xa1, 0xb8, 0x4e, 0x6a, 0xbe, 0xb3, 0x40, 0x47, 0xb5, + 0xa3, 0xcd, 0x36, 0x93, 0x8e, 0x9d, 0xaa, 0x6b, 0x56, 0x7e, 0xa0, 0xb6, + 0xb4, 0x7a, 0xd7, 0x49, 0x31, 0x7a, 0x70, 0x5a, 0x4f, 0x67, 0x8f, 0x3e, + 0xbc, 0x9a, 0x56, 0x43, 0x41, 0x92, 0xbf, 0x4d, 0x6f, 0x9d, 0xaf, 0x97, + 0xa6, 0x7d, 0x74, 0x82, 0x24, 0x4b, 0xaa, 0xb9, 0xac, 0x7b, 0x42, 0x94, + 0xb7, 0x58, 0xc2, 0x86, 0x8c, 0xd3, 0x43, 0x3d, 0x45, 0x57, 0x7a, 0xc6, + 0x99, 0x7a, 0x76, 0x9d, 0x93, 0x4f, 0x8f, 0x64, 0x5b, 0x6b, 0x43, 0x8f, + 0x48, 0xc1, 0x4e, 0x70, 0xa4, 0x4f, 0xaf, 0xd3, 0xac, 0x64, 0xbf, 0xbc, + 0x59, 0x81, 0x67, 0xa5, 0xcb, 0xd0, 0xd6, 0xbb, 0xc9, 0x7f, 0xa1, 0x4e, + 0xa8, 0x6e, 0xac, 0x71, 0xbc, 0x53, 0xb7, 0x4a, 0x61, 0x7c, 0x45, 0x3d, + 0x4e, 0x47, 0x75, 0xc5, 0xa7, 0xc9, 0x42, 0xac, 0xc5, 0xac, 0x46, 0xb8, + 0x3d, 0x5f, 0x72, 0x9a, 0xc1, 0xa7, 0xbd, 0xa9, 0x72, 0x72, 0x8a, 0x8b, + 0x8f, 0xce, 0x7f, 0x36, 0x49, 0x42, 0xcd, 0x83, 0xc8, 0x97, 0x9a, 0x43, + 0x6e, 0x8f, 0xd2, 0x92, 0x4f, 0x4e, 0x8b, 0x53, 0xa4, 0x44, 0xdc, 0xc3, + 0x38, 0x6e, 0xc5, 0xbb, 0x2e, 0xc2, 0x80, 0xbe, 0x70, 0x6f, 0x3b, 0xa7, + 0x9d, 0x2c, 0x76, 0xc5, 0x97, 0xa8, 0xc3, 0xa2, 0x4f, 0x58, 0x5c, 0x42, + 0x51, 0xb1, 0x41, 0x77, 0x67, 0xb1, 0xd5, 0xc0, 0xa3, 0x93, 0x6c, 0xc9, + 0x5f, 0x3a, 0x71, 0xb3, 0x8d, 0x92, 0x51, 0x97, 0xc6, 0x47, 0x35, 0x81, + 0x69, 0x59, 0x75, 0x89, 0xcb, 0x6e, 0x7e, 0x60, 0x49, 0x79, 0xb7, 0xb0, + 0xa8, 0xa4, 0xb3, 0xce, 0x2b, 0x55, 0xd2, 0x78, 0x51, 0x65, 0x68, 0x9a, + 0x4a, 0x9a, 0x54, 0x7f, 0x3f, 0x50, 0xb9, 0xd4, 0x76, 0x89, 0xac, 0xc3, + 0xb3, 0x67, 0x71, 0xc7, 0x6b, 0x5a, 0xa5, 0x8b, 0x92, 0x3f, 0xd0, 0x9b, + 0xb7, 0xc2, 0x9e, 0xa4, 0x3f, 0x40, 0xb6, 0x32, 0x39, 0x40, 0x46, 0x3c, + 0x30, 0xc1, 0x4a, 0xcb, 0x98, 0x67, 0x38, 0x46, 0x4b, 0xbe, 0x2d, 0x9f, + 0x59, 0xd0, 0x91, 0xb6, 0xa5, 0x3d, 0x5d, 0x36, 0xd1, 0x53, 0xd4, 0x5c, + 0x95, 0x5d, 0x55, 0x4f, 0xab, 0x53, 0x6e, 0x7d, 0x41, 0x83, 0xcf, 0xc9, + 0x6e, 0xc8, 0x8c, 0x60, 0xc3, 0x9a, 0x31, 0x49, 0x82, 0xc4, 0x4b, 0xbc, + 0x7a, 0xd1, 0x54, 0x54, 0x88, 0x97, 0x69, 0x9a, 0x58, 0x64, 0x97, 0xb4, + 0xa0, 0x46, 0xca, 0x52, 0x85, 0x86, 0x93, 0xca, 0x7f, 0x73, 0x59, 0xc7, + 0xb2, 0x34, 0xac, 0x8e, 0x32, 0xca, 0x88, 0x7e, 0x8a, 0xa9, 0x8a, 0x7e, + 0x64, 0xca, 0x64, 0x51, 0x48, 0xca, 0x88, 0x37, 0x6b, 0xb9, 0x52, 0xd6, + 0x55, 0x59, 0x39, 0x9a, 0x37, 0x89, 0x78, 0xb8, 0x66, 0xd5, 0x77, 0x98, + 0x55, 0x6e, 0xa3, 0x88, 0x6d, 0xa6, 0x64, 0xa0, 0xbc, 0xb5, 0x8d, 0xc2, + 
0x42, 0x4e, 0x96, 0x55, 0x5a, 0x3e, 0x48, 0x69, 0xb0, 0x5b, 0xc2, 0xc4, + 0x9e, 0x50, 0xc0, 0x33, 0x56, 0xc6, 0x9e, 0x4e, 0x6a, 0x50, 0x60, 0x40, + 0xc2, 0x5b, 0xa2, 0x34, 0x74, 0x64, 0x95, 0x52, 0x3f, 0x9c, 0x66, 0x6d, + 0x8a, 0x41, 0x53, 0xa1, 0x8a, 0xb2, 0x45, 0xa9, 0x7e, 0x76, 0xa1, 0x99, + 0x69, 0xd0, 0x8f, 0x9c, 0x4a, 0x9b, 0x58, 0xbe, 0xbf, 0xc9, 0x55, 0x6d, + 0x4b, 0x8d, 0x66, 0xa6, 0x44, 0x4f, 0x50, 0x47, 0x33, 0x75, 0x68, 0x3d, + 0x86, 0x4a, 0xaf, 0x38, 0x78, 0xba, 0x51, 0xc6, 0x52, 0x4a, 0x91, 0xcd, + 0x67, 0x50, 0xa7, 0x56, 0xa5, 0x3f, 0xc4, 0xad, 0x61, 0x67, 0x70, 0x41, + 0xaa, 0x56, 0xcc, 0x84, 0xb1, 0x68, 0x4e, 0x58, 0x7a, 0xa2, 0x39, 0x7a, + 0xc8, 0x8c, 0xa7, 0xb4, 0x96, 0xa0, 0xc1, 0xc8, 0x3f, 0x35, 0xb8, 0x39, + 0x8b, 0xb0, 0xad, 0x64, 0xd0, 0xd4, 0xa9, 0x6f, 0x93, 0x5c, 0x70, 0x70, + 0x54, 0x4d, 0xb0, 0xb8, 0x89, 0x9f, 0xc4, 0x49, 0x7d, 0xd4, 0x32, 0xa8, + 0x9d, 0x4d, 0x91, 0x9e, 0x5a, 0xc1, 0x67, 0x5c, 0x57, 0x67, 0x73, 0xd3, + 0xb1, 0x8e, 0x5e, 0xc3, 0x9c, 0x90, 0xc9, 0x43, 0xaf, 0x92, 0xb0, 0xbc, + 0x6f, 0x5e, 0x45, 0x42, 0x7c, 0xcd, 0x42, 0x48, 0x70, 0xc9, 0x97, 0x89, + 0xa6, 0x4e, 0x45, 0x9b, 0x8f, 0x7f, 0xbc, 0x3b, 0xcd, 0xcf, 0x90, 0xa1, + 0x7e, 0x6b, 0x4c, 0xa6, 0x31, 0xb6, 0xb2, 0xc6, 0x93, 0x2f, 0x79, 0x93, + 0x84, 0xa4, 0xaa, 0xd3, 0x42, 0x53, 0x56, 0xc8, 0x43, 0x5c, 0x86, 0x52, + 0x7a, 0x65, 0x96, 0x32, 0xab, 0x92, 0x6b, 0x96, 0x85, 0x97, 0xb6, 0x72, + 0x96, 0x5d, 0x54, 0x87, 0x89, 0xbd, 0x5d, 0x85, 0x6a, 0x57, 0x31, 0x62, + 0x59, 0x39, 0x96, 0xb0, 0xa1, 0x3a, 0xa4, 0x80, 0x3f, 0x87, 0x9d, 0x9e, + 0x44, 0xcb, 0x68, 0x8e, 0x74, 0x9f, 0x76, 0x58, 0xbb, 0xc0, 0x39, 0xa1, + 0xab, 0x7c, 0x68, 0x74, 0x80, 0x93, 0x96, 0x65, 0x6c, 0x4c, 0x56, 0xb2, + 0x95, 0xa9, 0xd2, 0x44, 0x7b, 0x61, 0x7f, 0xbf, 0x7a, 0xb1, 0xa4, 0x89, + 0xa6, 0x93, 0x8c, 0xcf, 0x33, 0xcd, 0xb7, 0xa5, 0x61, 0x60, 0x8a, 0xb5, + 0xa2, 0xbc, 0x46, 0x43, 0x57, 0x78, 0x33, 0xa9, 0x9c, 0x32, 0x8b, 0xc8, + 0x8d, 0xb1, 0x4d, 0x98, 0x54, 0x5a, 0xbf, 0x49, 0xc5, 0x6c, 0x5e, 0xaf, + 0x3e, 0xa9, 0x9e, 0x3e, 0xc2, 0x30, 0x78, 0xcf, 0xaf, 0x4f, 0x77, 0x4b, + 0x8d, 0x43, 0xb5, 0x6e, 0x3c, 0x9b, 0xaa, 0x94, 0xb8, 0xa4, 0x47, 0x81, + 0x65, 0x3c, 0x74, 0x35, 0x34, 0x72, 0x4e, 0x3b, 0x9d, 0x73, 0xc9, 0x7c, + 0x48, 0x53, 0xa0, 0xaa, 0xcd, 0x7d, 0xb1, 0x8d, 0x9e, 0x3f, 0x30, 0x59, + 0xc6, 0xa0, 0x9b, 0x8f, 0x6f, 0x79, 0x57, 0x81, 0xc7, 0x87, 0x4f, 0xc1, + 0x30, 0x9d, 0xb1, 0x80, 0xab, 0xad, 0xa9, 0x89, 0x4f, 0xc3, 0x65, 0xb9, + 0x7c, 0xb0, 0x5a, 0xcd, 0xaa, 0xbb, 0xa6, 0x6a, 0x35, 0x8f, 0x60, 0xb6, + 0x7a, 0xb3, 0x90, 0x55, 0xaa, 0x59, 0x36, 0xa0, 0x56, 0x67, 0x3f, 0xad, + 0xae, 0xa8, 0x41, 0xa8, 0x37, 0x58, 0x5f, 0x8e, 0xb6, 0x9d, 0x65, 0x31, + 0xa1, 0x95, 0xa2, 0xa9, 0x59, 0x46, 0x7d, 0x32, 0xa6, 0x5d, 0x4d, 0x47, + 0xc6, 0x45, 0x4f, 0xcf, 0x64, 0xb5, 0xa3, 0xa0, 0x8a, 0x9c, 0x6c, 0xcf, + 0x53, 0x82, 0x98, 0x58, 0x96, 0x60, 0x91, 0x7b, 0x80, 0x33, 0x61, 0x3b, + 0x8e, 0xcf, 0x63, 0x7a, 0x43, 0x6c, 0xc4, 0x7a, 0x72, 0xd1, 0xc1, 0x5a, + 0x67, 0xd0, 0xab, 0x62, 0x81, 0xce, 0x7a, 0xa6, 0x45, 0x3c, 0xa2, 0x74, + 0xbe, 0xb2, 0x34, 0x8a, 0xc9, 0x7c, 0xaf, 0x95, 0x90, 0xca, 0x4f, 0xab, + 0x48, 0xca, 0x3f, 0xa4, 0xab, 0x8c, 0xc8, 0xb0, 0xc3, 0x66, 0x65, 0x4b, + 0x78, 0xd0, 0xc0, 0x9d, 0x87, 0x5f, 0x3e, 0x49, 0xb4, 0x82, 0xa4, 0x6e, + 0x75, 0x5e, 0x39, 0xb4, 0xb0, 0x8f, 0x9b, 0xb2, 0xb1, 0x98, 0xb3, 0xc4, + 0xbe, 0x50, 0x9f, 0x35, 0x50, 0x94, 0xcc, 0x8e, 0x81, 0x9e, 0x42, 0x58, + 0x87, 0xba, 0x62, 0x48, 0x5b, 0x9a, 0x9a, 0x43, 0x48, 0x5b, 0xa5, 0x98, + 0x9f, 0x97, 0xa6, 0xc5, 0xb4, 0x9e, 0x97, 0x83, 0xaa, 0x4c, 0x5d, 0x89, + 
0x5a, 0x4b, 0x36, 0x7a, 0x94, 0x50, 0x50, 0x50, 0x96, 0x55, 0x61, 0x83, + 0x40, 0xc3, 0x34, 0x8f, 0x72, 0x92, 0x7f, 0xc5, 0x3d, 0x7c, 0x77, 0x9d, + 0x64, 0x7b, 0x7e, 0x68, 0x36, 0xc0, 0xbb, 0x60, 0x4a, 0x3d, 0x8a, 0xb2, + 0xa8, 0xaf, 0x4b, 0x5c, 0xaa, 0xba, 0x51, 0xb7, 0x53, 0x8f, 0x4d, 0xa9, + 0xab, 0x84, 0xc4, 0xb2, 0x6f, 0x56, 0xbc, 0x35, 0x83, 0x47, 0xc7, 0x6b, + 0x3b, 0xc6, 0xc3, 0xd4, 0x40, 0x94, 0xc5, 0x47, 0x61, 0xbb, 0xbf, 0x9d, + 0x78, 0xc2, 0x9f, 0x75, 0x5b, 0x4b, 0x51, 0x42, 0xb9, 0xb1, 0xb3, 0xd4, + 0x83, 0xbd, 0x94, 0xd7, 0x81, 0x9c, 0x72, 0xc1, 0x83, 0x6c, 0xa0, 0x9a, + 0xb3, 0x59, 0xcc, 0x7b, 0x45, 0xa5, 0xd9, 0x77, 0x93, 0xa3, 0x50, 0x36, + 0x7f, 0xbd, 0x5b, 0x6a, 0x58, 0xba, 0x85, 0xb3, 0x93, 0xb7, 0x41, 0x55, + 0x85, 0x53, 0x53, 0x79, 0x79, 0xb3, 0x8a, 0xc1, 0x6e, 0x3d, 0x5c, 0x90, + 0x69, 0x55, 0x79, 0x7d, 0x37, 0xc3, 0x9a, 0x60, 0xbc, 0xaf, 0xcb, 0xc0, + 0xd2, 0xc4, 0x70, 0xa6, 0x9e, 0xd6, 0x22, 0x54, 0x8d, 0xae, 0x60, 0x67, + 0x5f, 0x4d, 0x7a, 0x67, 0x57, 0xd7, 0x39, 0x98, 0x95, 0x60, 0x50, 0x91, + 0x60, 0xa9, 0xca, 0x4f, 0x8f, 0x6d, 0x46, 0x5e, 0x2f, 0x93, 0x6d, 0xb6, + 0x63, 0x64, 0x8a, 0xb6, 0x6d, 0xc2, 0xb0, 0x5a, 0x62, 0x46, 0x88, 0x83, + 0x5a, 0x87, 0xaa, 0x98, 0x38, 0xa6, 0xd1, 0x4d, 0xc5, 0xc4, 0x50, 0x44, + 0xbc, 0x42, 0x32, 0x6c, 0x83, 0x4a, 0xb1, 0x5a, 0xde, 0xac, 0xd1, 0xa5, + 0x82, 0xb0, 0x70, 0x4e, 0x46, 0x5e, 0x60, 0xa4, 0xb4, 0xa9, 0x50, 0xf1, + 0x37, 0xd3, 0xf6, 0xe2, 0xca, 0xa1, 0x75, 0xbe, 0x8e, 0x2f, 0x4e, 0x44, + 0xb9, 0x76, 0xcf, 0xc7, 0x34, 0xd5, 0xa6, 0xbd, 0xa1, 0xc2, 0x68, 0x2c, + 0xc5, 0x90, 0x98, 0xa8, 0x88, 0xb2, 0x5f, 0xcc, 0xa8, 0x9f, 0xcd, 0xc7, + 0x64, 0x6e, 0x6c, 0x6a, 0x4b, 0xc9, 0x40, 0x7f, 0xa4, 0xc3, 0x44, 0xa3, + 0x7a, 0x3e, 0x7a, 0xb8, 0x3c, 0xb8, 0x85, 0xba, 0x9f, 0x67, 0x76, 0x80, + 0xac, 0x4d, 0x28, 0xb6, 0xa3, 0x28, 0x4b, 0x66, 0xcb, 0x60, 0xcd, 0x8e, + 0xc5, 0x4c, 0xa5, 0x8d, 0xc5, 0x22, 0x93, 0xc2, 0xa8, 0xb5, 0xd3, 0xc0, + 0x5c, 0xb2, 0xbf, 0x3b, 0x30, 0x77, 0x98, 0x8e, 0x63, 0x2e, 0x79, 0x40, + 0xa5, 0x6d, 0xb1, 0x75, 0xcf, 0xb9, 0x84, 0x72, 0xc3, 0xab, 0x47, 0x38, + 0xd7, 0x4b, 0x61, 0x57, 0x9f, 0x40, 0x86, 0x28, 0x4f, 0x27, 0x35, 0x63, + 0x58, 0x91, 0x4e, 0x7e, 0xc3, 0x6d, 0x64, 0x56, 0xbf, 0xaf, 0x79, 0x86, + 0x5c, 0x77, 0x32, 0x73, 0x70, 0x89, 0xc9, 0x68, 0xbb, 0x72, 0x48, 0xc0, + 0x53, 0x88, 0x64, 0x85, 0x61, 0xa7, 0x5e, 0x55, 0x87, 0xc9, 0x7a, 0xa0, + 0x45, 0x3f, 0x90, 0x8c, 0x62, 0x60, 0xb9, 0x20, 0x6c, 0xbe, 0x81, 0x7c, + 0xd4, 0x98, 0xa3, 0x60, 0xa3, 0x47, 0x92, 0x52, 0xb4, 0x7e, 0x59, 0x83, + 0xa0, 0xc3, 0x57, 0x54, 0xc6, 0x91, 0x56, 0x9f, 0xba, 0x89, 0x96, 0xc4, + 0xa8, 0xbc, 0xce, 0x8d, 0x6c, 0x33, 0xc9, 0x74, 0xa3, 0xc9, 0x81, 0x77, + 0x54, 0x89, 0x61, 0xa7, 0x4d, 0xc8, 0xbf, 0xb8, 0xc5, 0xa6, 0x78, 0x5d, + 0x66, 0x1c, 0x88, 0xb0, 0xab, 0x56, 0x4f, 0x8b, 0xd2, 0xa5, 0x3b, 0x3d, + 0x39, 0x4c, 0x55, 0x45, 0x92, 0x7c, 0x91, 0xa6, 0x31, 0xad, 0xcc, 0xa3, + 0xbb, 0x2f, 0x6a, 0x4b, 0x70, 0x75, 0x73, 0x7a, 0x32, 0x5d, 0xa9, 0x6d, + 0x45, 0x88, 0xbf, 0x50, 0x81, 0x3c, 0x6d, 0x9f, 0x2f, 0x78, 0x91, 0xdc, + 0xb0, 0x9a, 0x37, 0xdc, 0x7c, 0x5e, 0x29, 0x54, 0xa5, 0xc5, 0xb7, 0xa6, + 0x33, 0x77, 0x52, 0x26, 0xb7, 0xa7, 0x89, 0xc9, 0x30, 0xba, 0x5d, 0x6d, + 0x5e, 0xd1, 0xbd, 0xaa, 0x71, 0x5f, 0xa0, 0xac, 0x6e, 0x6a, 0x43, 0xbd, + 0x7f, 0x30, 0x28, 0xb5, 0x72, 0x49, 0xc3, 0xba, 0x8f, 0x83, 0xa5, 0xa4, + 0xad, 0x86, 0x5e, 0xa8, 0xca, 0xd0, 0xa0, 0x5f, 0x8c, 0x38, 0x6f, 0x48, + 0xa2, 0x98, 0xcd, 0x53, 0x94, 0x80, 0xc2, 0xa9, 0x4c, 0x48, 0xa9, 0x4c, + 0x81, 0x5f, 0x81, 0x54, 0x2b, 0xba, 0x43, 0xd1, 0x8c, 0x44, 0xc8, 0x50, + 
0x60, 0x68, 0x38, 0xac, 0x53, 0x89, 0x53, 0x8a, 0x49, 0x7b, 0x78, 0x77, + 0x4b, 0x49, 0x77, 0xb0, 0x7b, 0x9e, 0x7a, 0x37, 0x92, 0x35, 0x3b, 0xa8, + 0x99, 0x97, 0x98, 0x55, 0x4f, 0x9f, 0xcc, 0x90, 0x51, 0xd4, 0x38, 0x3d, + 0x40, 0x97, 0x2d, 0x78, 0xa1, 0x50, 0x87, 0xa5, 0x87, 0xa4, 0x61, 0x8d, + 0xc1, 0xa9, 0x80, 0xbb, 0xc4, 0x4c, 0xc9, 0x3c, 0x4a, 0x73, 0xb9, 0x7a, + 0x81, 0x7a, 0x9d, 0xbc, 0x4d, 0x8a, 0x81, 0x56, 0x5f, 0x2d, 0x6e, 0xcc, + 0x9a, 0x5e, 0x99, 0xd8, 0x37, 0x6f, 0x72, 0x50, 0x70, 0xa9, 0x4a, 0x85, + 0xb8, 0xc3, 0x89, 0xa3, 0x35, 0x96, 0xdb, 0x5c, 0x54, 0xa5, 0x8a, 0x55, + 0x66, 0x6f, 0xdd, 0xbf, 0xc0, 0x4a, 0x86, 0x93, 0x51, 0x75, 0x7b, 0xc0, + 0x5f, 0x86, 0x7d, 0x5a, 0x7a, 0x39, 0xad, 0x90, 0x89, 0x6e, 0xcd, 0x41, + 0x3b, 0x2c, 0x42, 0x74, 0xbf, 0x7e, 0x77, 0x97, 0xa0, 0x35, 0x82, 0x55, + 0xb6, 0x9d, 0x8c, 0x56, 0x9a, 0x3c, 0xc2, 0x92, 0xa8, 0x4f, 0x48, 0x9e, + 0xab, 0x65, 0xb8, 0x6d, 0xa1, 0x5d, 0xc4, 0xbf, 0x3f, 0x89, 0x5b, 0x88, + 0xa9, 0xa5, 0x88, 0x48, 0xb3, 0x67, 0xa3, 0x7b, 0xae, 0x69, 0x82, 0xc5, + 0x92, 0x6c, 0xc5, 0xb6, 0x97, 0x83, 0x4d, 0x8d, 0x9e, 0x83, 0x56, 0xd0, + 0x4a, 0xb8, 0x87, 0xb2, 0x2f, 0x74, 0xc6, 0xa9, 0xa4, 0x5a, 0x53, 0x49, + 0xcd, 0x51, 0x94, 0xa7, 0x6a, 0x7c, 0x4d, 0x61, 0x74, 0xa1, 0x91, 0x83, + 0x92, 0x62, 0x3a, 0x81, 0xac, 0x52, 0x94, 0x67, 0x37, 0x93, 0x45, 0xca, + 0x5e, 0xb3, 0x7d, 0xcc, 0x43, 0x86, 0x4b, 0x63, 0x53, 0x38, 0x5c, 0x7f, + 0xcf, 0xa8, 0x63, 0xad, 0x95, 0x94, 0xb0, 0x4f, 0x79, 0x43, 0xa4, 0xae, + 0x88, 0x7d, 0xca, 0x83, 0x4c, 0x67, 0x5d, 0x9a, 0x96, 0xa1, 0x7b, 0xb2, + 0x5f, 0x9f, 0x47, 0x5e, 0x4d, 0x98, 0xa7, 0x59, 0xa9, 0x69, 0x5f, 0xe4, + 0xcb, 0xd0, 0x40, 0x96, 0x6a, 0x7b, 0xac, 0xa2, 0xb1, 0x7e, 0x44, 0x8a, + 0x9e, 0xba, 0x5f, 0x61, 0x40, 0x3c, 0x95, 0xb4, 0xa6, 0x82, 0x8c, 0x8d, + 0x8b, 0x58, 0x44, 0x8a, 0x75, 0xb0, 0x94, 0x52, 0xab, 0xbd, 0x75, 0x48, + 0xa2, 0x24, 0x97, 0x3d, 0x9b, 0x1f, 0x5f, 0xba, 0xcf, 0x74, 0xa7, 0x52, + 0x87, 0x88, 0x3a, 0x76, 0x6d, 0xc4, 0x67, 0xa1, 0x4a, 0x81, 0xbd, 0x45, + 0x5e, 0x2e, 0x6f, 0xdc, 0x2e, 0x38, 0x91, 0x95, 0x9f, 0xb2, 0xd8, 0x79, + 0x6b, 0x9a, 0x98, 0x58, 0x63, 0x4f, 0xd9, 0x3b, 0x80, 0xb8, 0x66, 0x63, + 0xcf, 0xc6, 0xa6, 0xcc, 0x6e, 0x97, 0x69, 0x9e, 0x40, 0x61, 0xd0, 0x4f, + 0xc1, 0x57, 0x47, 0xbc, 0x83, 0xa1, 0xaf, 0x94, 0xc0, 0x2a, 0x83, 0x2b, + 0x65, 0x98, 0x53, 0x52, 0xad, 0x56, 0x3f, 0x54, 0x5d, 0x6c, 0x37, 0x7c, + 0x7e, 0x50, 0xbb, 0x3e, 0xa5, 0x6e, 0x92, 0x55, 0xc2, 0x55, 0x7a, 0x46, + 0x54, 0x8a, 0x55, 0x9e, 0xb9, 0x43, 0x90, 0x8e, 0x74, 0x40, 0x72, 0xaa, + 0xbb, 0x57, 0x36, 0x92, 0x75, 0x64, 0x69, 0x31, 0x4a, 0x79, 0x9a, 0x67, + 0x70, 0xa0, 0xc2, 0xbb, 0x60, 0xc9, 0xbc, 0xd4, 0xcf, 0xb5, 0x63, 0xc5, + 0x33, 0x86, 0xa9, 0x6f, 0x49, 0x91, 0x58, 0x43, 0x41, 0xdb, 0x7d, 0x20, + 0x39, 0xb6, 0xab, 0x55, 0xb1, 0xac, 0xb4, 0x52, 0x89, 0x6b, 0x4f, 0xb9, + 0x28, 0xa1, 0xd7, 0x54, 0xac, 0x42, 0x83, 0x74, 0xca, 0x36, 0x4e, 0x53, + 0x5b, 0x6b, 0x23, 0xc6, 0x57, 0x4a, 0x98, 0xd9, 0x8f, 0x6d, 0xc8, 0x84, + 0x89, 0xb9, 0xcb, 0x95, 0x65, 0x68, 0x8d, 0x40, 0xb8, 0x4c, 0xa2, 0x89, + 0x53, 0x3f, 0xaf, 0x7a, 0x54, 0x8b, 0x69, 0x6f, 0xc3, 0x55, 0x62, 0x74, + 0x61, 0x58, 0xc2, 0x9f, 0x49, 0x97, 0x66, 0x78, 0x52, 0x62, 0x97, 0x6c, + 0x78, 0x4c, 0x92, 0x36, 0xb0, 0x8a, 0x3d, 0x6b, 0x8c, 0x6c, 0xaa, 0xbf, + 0xc9, 0x53, 0x6b, 0xae, 0x48, 0x4e, 0x88, 0xc1, 0x8a, 0x9c, 0x6d, 0x46, + 0x5a, 0x9c, 0x89, 0x68, 0x42, 0x64, 0xb7, 0xa5, 0xa9, 0x9f, 0x64, 0x7e, + 0x82, 0x83, 0xd4, 0x8d, 0xb9, 0x47, 0xb4, 0x51, 0x75, 0x4d, 0x98, 0x45, + 0x54, 0x43, 0x54, 0xa0, 0xa1, 0xb0, 0xb5, 0x9e, 0xd5, 0x73, 0x31, 0x70, + 
0x83, 0x94, 0x46, 0xaa, 0xc4, 0x59, 0x46, 0xc0, 0xc8, 0xca, 0x62, 0xa7, + 0x97, 0x9b, 0x38, 0x62, 0xc8, 0xb2, 0xaf, 0xb7, 0x33, 0xac, 0xa6, 0xcc, + 0x7b, 0x41, 0x8f, 0x81, 0x6c, 0x6a, 0x5f, 0xb8, 0xaa, 0x33, 0xa4, 0xad, + 0x4f, 0xd1, 0x7b, 0xb2, 0xae, 0x7a, 0x52, 0x96, 0x3b, 0x59, 0x50, 0x51, + 0x46, 0xca, 0xa9, 0xa0, 0x7e, 0x46, 0x92, 0x5c, 0xa2, 0xb4, 0xaa, 0x8d, + 0x33, 0x55, 0xdc, 0xa7, 0x2b, 0xc7, 0x5d, 0x6e, 0x4d, 0xa1, 0xa6, 0x3f, + 0x9f, 0x6a, 0x7f, 0x63, 0xca, 0x74, 0x36, 0xd1, 0xa3, 0x7a, 0x47, 0x51, + 0x68, 0x90, 0x9a, 0x6b, 0x55, 0x3f, 0x51, 0xaf, 0x70, 0x44, 0x5e, 0x4f, + 0xd5, 0x46, 0x6b, 0x7a, 0x6a, 0xa1, 0x9f, 0xcf, 0x43, 0x7b, 0xcb, 0xbc, + 0x6f, 0x82, 0x54, 0x5c, 0x42, 0x8f, 0xc8, 0xd2, 0x76, 0xa2, 0x48, 0x5b, + 0x82, 0x96, 0xc4, 0x76, 0xa7, 0x6c, 0x8b, 0xbf, 0xca, 0xa3, 0xd3, 0x90, + 0xa9, 0xc7, 0x51, 0x9b, 0x9c, 0x5a, 0x36, 0x79, 0x8c, 0x54, 0x9a, 0xa7, + 0x7a, 0x45, 0xb5, 0x46, 0x6d, 0x72, 0x71, 0x80, 0x92, 0x9c, 0x3b, 0x7e, + 0x45, 0x53, 0xa5, 0x39, 0xcf, 0x98, 0x8c, 0x9a, 0x6e, 0x94, 0x9d, 0x58, + 0x5c, 0x49, 0xd4, 0x8f, 0x9d, 0x9a, 0xc4, 0xcc, 0x58, 0x84, 0x70, 0x48, + 0xb9, 0x7f, 0xb5, 0xa7, 0x84, 0x63, 0xa0, 0x8a, 0x67, 0x5f, 0x5e, 0x43, + 0x54, 0x43, 0xb8, 0x9c, 0x48, 0x7e, 0xab, 0x4d, 0x67, 0xcb, 0xd2, 0xc2, + 0xaf, 0x94, 0xa2, 0x69, 0xb0, 0x5e, 0x8d, 0x7a, 0x4f, 0x6a, 0xc7, 0x62, + 0xc2, 0xd6, 0x3b, 0xd3, 0x98, 0x70, 0x8c, 0xc8, 0x4f, 0x96, 0x6f, 0x29, + 0x92, 0xc1, 0xd3, 0xc9, 0x85, 0xc2, 0x48, 0x9c, 0xa9, 0x5f, 0xbf, 0x60, + 0x37, 0x98, 0xa8, 0x74, 0xcc, 0x73, 0x5a, 0xb0, 0x36, 0x93, 0x8d, 0x9f, + 0x93, 0x3e, 0x55, 0x31, 0x61, 0x52, 0x40, 0x79, 0xc9, 0xa3, 0x65, 0x66, + 0x51, 0x9b, 0x63, 0x3b, 0x8f, 0x84, 0xb3, 0x42, 0xb2, 0xcd, 0x4d, 0x5e, + 0x9d, 0x3f, 0x7e, 0xb6, 0x49, 0x93, 0x8d, 0xa9, 0xad, 0x53, 0x66, 0x44, + 0x6e, 0x3e, 0xac, 0x97, 0xc5, 0xb0, 0xa1, 0x31, 0x72, 0x9a, 0xa0, 0x54, + 0xc6, 0xc4, 0x92, 0x77, 0x91, 0xc7, 0xac, 0x64, 0x41, 0xd7, 0xc6, 0x7a, + 0x51, 0x5a, 0x3c, 0xa9, 0x36, 0xb6, 0x97, 0xc3, 0x54, 0x9d, 0xcc, 0xaf, + 0x9b, 0x69, 0x96, 0x7b, 0x66, 0xaf, 0xac, 0x42, 0x7c, 0x76, 0x82, 0x47, + 0x60, 0x97, 0x7e, 0xb2, 0xa8, 0x43, 0x38, 0x9b, 0x83, 0x83, 0x56, 0x90, + 0xd7, 0x89, 0x5c, 0xad, 0xae, 0x59, 0x84, 0x66, 0x39, 0x3f, 0x81, 0x4a, + 0x85, 0x52, 0xba, 0x55, 0xc9, 0x87, 0x81, 0x5e, 0x82, 0x8a, 0x59, 0x8a, + 0x79, 0x60, 0xc2, 0x9c, 0x5b, 0xaa, 0x4e, 0x4d, 0xae, 0x5b, 0xa8, 0xc5, + 0x91, 0x46, 0x7a, 0xa9, 0x31, 0x73, 0x81, 0x90, 0xae, 0x3c, 0x68, 0x5a, + 0x7e, 0x3d, 0xcc, 0x2d, 0x51, 0x8a, 0x37, 0xb4, 0x48, 0x38, 0x6e, 0x86, + 0x76, 0x84, 0x65, 0xd9, 0x60, 0x94, 0xb1, 0x67, 0xd5, 0x78, 0x48, 0x97, + 0x3c, 0x59, 0x56, 0x3d, 0xa7, 0x36, 0xb0, 0xb8, 0xc5, 0xb6, 0xa4, 0x5b, + 0x71, 0x6c, 0x97, 0x39, 0x3d, 0x40, 0xb4, 0x7b, 0xcc, 0x3a, 0x82, 0x7a, + 0x83, 0xb3, 0x56, 0x56, 0x96, 0x82, 0xbb, 0x7b, 0xac, 0x7a, 0x64, 0x56, + 0x83, 0x75, 0xaf, 0x87, 0x7b, 0x46, 0xc0, 0x6c, 0xd0, 0xd0, 0x41, 0x4a, + 0x7e, 0x7c, 0x58, 0x41, 0x57, 0x9c, 0x94, 0xd8, 0xa6, 0x4b, 0x71, 0x66, + 0x7e, 0x64, 0xa8, 0x32, 0x41, 0xa1, 0xc9, 0x37, 0x3b, 0xc4, 0xce, 0x40, + 0x92, 0x31, 0xbc, 0x93, 0xcf, 0x54, 0x6e, 0x4f, 0xb3, 0xa2, 0xca, 0x72, + 0x97, 0x4f, 0x49, 0xa6, 0x6b, 0x94, 0x8c, 0x6a, 0x26, 0x36, 0x50, 0x5d, + 0x59, 0x5e, 0xa2, 0x63, 0xb1, 0x4a, 0x8b, 0x85, 0x7c, 0x6e, 0x5b, 0x5e, + 0x3b, 0x86, 0x8d, 0x7f, 0xc9, 0x3e, 0x55, 0xab, 0x80, 0x31, 0xad, 0x4b, + 0x55, 0x78, 0xaf, 0xb1, 0xb9, 0x40, 0xc4, 0xb6, 0x8d, 0x83, 0x98, 0xbd, + 0x4c, 0x97, 0x47, 0x61, 0x79, 0x3f, 0xc0, 0x94, 0xb6, 0xc0, 0x4c, 0x46, + 0x6b, 0xac, 0xc7, 0x66, 0x8f, 0x46, 0x78, 0xb5, 0x38, 0x56, 0x9d, 0x61, + 
0x9e, 0x5d, 0x84, 0xcb, 0xb8, 0x5a, 0x50, 0x48, 0x3f, 0xd0, 0xc4, 0xba, + 0x78, 0x32, 0x3d, 0xa3, 0x94, 0xaa, 0x86, 0x33, 0x8a, 0x81, 0xc9, 0x41, + 0xce, 0xc3, 0x6d, 0x37, 0x36, 0x7b, 0x65, 0x8e, 0xcb, 0xcf, 0x63, 0x6e, + 0x3c, 0x99, 0xc1, 0xae, 0x5c, 0xca, 0x4e, 0xc7, 0x57, 0x8e, 0x8b, 0x92, + 0xd3, 0xb6, 0x34, 0xbe, 0x3f, 0xa1, 0x9a, 0xb9, 0xb5, 0x7f, 0x40, 0x5a, + 0x96, 0x9f, 0x50, 0x8f, 0xd1, 0x86, 0x2b, 0x58, 0xcf, 0x86, 0xa7, 0x4f, + 0x63, 0xcc, 0x95, 0xb2, 0x90, 0x62, 0x8c, 0xb8, 0x9b, 0xc4, 0xcb, 0x34, + 0x89, 0xac, 0xc8, 0xb9, 0x97, 0xd1, 0x53, 0xac, 0x7d, 0x83, 0xad, 0xb0, + 0xa4, 0x60, 0x48, 0xc2, 0x64, 0xc2, 0x8e, 0x50, 0xab, 0x7d, 0x8d, 0x66, + 0xab, 0x66, 0x5e, 0xa9, 0x63, 0x42, 0x66, 0x5c, 0x75, 0xb8, 0x9d, 0xd2, + 0xa3, 0x4c, 0x99, 0x47, 0x4b, 0x62, 0x95, 0xa1, 0xca, 0x77, 0x60, 0xd7, + 0xad, 0xbf, 0x53, 0x3f, 0x8c, 0xc8, 0x36, 0x97, 0xb5, 0x96, 0x38, 0x8e, + 0x8d, 0x4a, 0x66, 0x44, 0x36, 0xb5, 0x4d, 0x40, 0xc1, 0x58, 0xbe, 0x69, + 0xb1, 0x8e, 0x41, 0x88, 0x82, 0xa2, 0x94, 0xa2, 0x30, 0xd1, 0x8f, 0x76, + 0x61, 0x96, 0xaf, 0x87, 0x30, 0x93, 0xd1, 0x93, 0x4b, 0x68, 0xc4, 0xa5, + 0xc4, 0x96, 0x59, 0x74, 0xca, 0x3d, 0xb6, 0x94, 0x6d, 0xac, 0x5e, 0xa3, + 0xb6, 0xaf, 0x52, 0x89, 0x5f, 0xa7, 0x33, 0x99, 0x9a, 0x86, 0x8b, 0xa9, + 0xac, 0xbb, 0xcc, 0xd3, 0x38, 0xbe, 0x49, 0x69, 0x68, 0x43, 0xc6, 0xc0, + 0x42, 0xbe, 0x69, 0xc5, 0xd3, 0x85, 0x53, 0x52, 0xc2, 0x4a, 0xa0, 0x77, + 0x31, 0x43, 0x84, 0xd1, 0x4d, 0x6d, 0xb5, 0x94, 0xc2, 0xbe, 0x30, 0x94, + 0x66, 0x77, 0xca, 0x8a, 0xb8, 0xbd, 0x64, 0xac, 0x79, 0x42, 0x59, 0x86, + 0xaa, 0x76, 0x56, 0x43, 0xab, 0x7b, 0x3f, 0x7c, 0x75, 0x37, 0x98, 0x79, + 0x88, 0x47, 0x63, 0x5c, 0xb5, 0xd1, 0xcd, 0x7e, 0x48, 0x9e, 0xbb, 0x9f, + 0x74, 0x3a, 0x68, 0xa3, 0x74, 0x45, 0x8f, 0xba, 0x73, 0x96, 0x95, 0xad, + 0x4a, 0xca, 0xd1, 0x90, 0x4e, 0x41, 0x61, 0x9c, 0x98, 0xb1, 0xb6, 0xbb, + 0x98, 0xca, 0xa5, 0x9e, 0x2e, 0x8f, 0x62, 0x5a, 0x89, 0x4c, 0x36, 0x49, + 0x42, 0x70, 0x63, 0xcb, 0x40, 0xb7, 0x94, 0x42, 0x9a, 0xbb, 0xba, 0xb5, + 0xa6, 0x2e, 0x7d, 0x3e, 0xb7, 0xc1, 0xc2, 0xca, 0x62, 0x2e, 0x98, 0x90, + 0xcd, 0x77, 0x62, 0x7f, 0x71, 0x3f, 0x4c, 0xa7, 0x64, 0x3a, 0x4d, 0x37, + 0x6d, 0xbb, 0x8f, 0x87, 0x50, 0x40, 0x31, 0x92, 0xb6, 0x7a, 0x4b, 0xa3, + 0xa3, 0x65, 0x8f, 0x96, 0xd7, 0x85, 0xaa, 0xc8, 0x6a, 0x9c, 0x5b, 0x6d, + 0x78, 0x7d, 0xa7, 0x38, 0x8d, 0xbe, 0x93, 0x8b, 0x9e, 0x6f, 0xa7, 0x4b, + 0x3b, 0xd8, 0xc8, 0xad, 0xb4, 0xdb, 0x60, 0x74, 0xc7, 0x3c, 0x7a, 0x31, + 0x75, 0xda, 0x70, 0x72, 0x29, 0xab, 0x88, 0x67, 0x45, 0xb3, 0xb8, 0x34, + 0x3d, 0xb6, 0x6e, 0x37, 0xbf, 0xc3, 0xab, 0x59, 0x9b, 0xc5, 0xa9, 0xbf, + 0xbc, 0x8e, 0x84, 0xb5, 0x79, 0xb4, 0xcb, 0x88, 0xa6, 0x52, 0xb4, 0xce, + 0xc3, 0x94, 0x86, 0x3a, 0x39, 0x58, 0xbc, 0xcc, 0x57, 0x49, 0xdc, 0x63, + 0xd0, 0xc5, 0xcb, 0x7b, 0xa3, 0x96, 0x8e, 0x55, 0xac, 0x3f, 0x4e, 0x9d, + 0x97, 0x52, 0x37, 0xaf, 0xba, 0x8f, 0xcd, 0x9e, 0x51, 0x80, 0x77, 0x9e, + 0x7d, 0x81, 0xb6, 0xba, 0x54, 0xce, 0x80, 0xa1, 0x6a, 0xbd, 0xcc, 0x43, + 0xb3, 0x53, 0x5c, 0x8e, 0xcb, 0xb9, 0xbd, 0x7b, 0xb2, 0x3e, 0x5c, 0x3c, + 0x50, 0x41, 0xa9, 0x30, 0x8e, 0x2c, 0xda, 0x6f, 0xba, 0x5c, 0x5c, 0xa9, + 0xaa, 0x52, 0x5d, 0x7d, 0x59, 0x93, 0x36, 0x65, 0x8f, 0xc2, 0xb2, 0xc2, + 0x86, 0xb6, 0xb6, 0x5d, 0x5f, 0x79, 0x63, 0x55, 0xd4, 0xae, 0x92, 0x49, + 0x44, 0x5a, 0x54, 0x88, 0xd2, 0x3a, 0xb3, 0x92, 0xc1, 0xc9, 0x95, 0x43, + 0x9b, 0x98, 0x5f, 0x29, 0x73, 0x52, 0x6c, 0xcb, 0x77, 0xa1, 0x4a, 0x8b, + 0x3c, 0x39, 0x3e, 0xad, 0x32, 0x90, 0xab, 0xa4, 0xc4, 0x8d, 0xa2, 0x4b, + 0xc4, 0x8c, 0xb7, 0x8f, 0x70, 0x33, 0xb2, 0xb4, 0x9e, 0x71, 0x53, 0xd1, + 
0x8c, 0x5d, 0xcb, 0x5d, 0x2f, 0x3f, 0xb9, 0xd4, 0xb5, 0x9d, 0x9f, 0x6f, + 0x91, 0x86, 0x64, 0x63, 0x75, 0x67, 0x7f, 0x7e, 0x4f, 0xaf, 0x9f, 0x75, + 0x4b, 0x6d, 0xcb, 0x94, 0x92, 0xa5, 0xd0, 0xd8, 0x55, 0x52, 0x7e, 0x89, + 0x64, 0x78, 0x73, 0xc4, 0x72, 0xb1, 0xb4, 0x8e, 0xb0, 0xbb, 0x6d, 0x83, + 0x66, 0x69, 0xac, 0x7f, 0x87, 0xae, 0x71, 0xc1, 0x74, 0x58, 0x9b, 0x8e, + 0xa0, 0xb6, 0x6d, 0xcd, 0x6e, 0x68, 0x97, 0x8c, 0x55, 0xcd, 0x35, 0x7b, + 0xd3, 0x50, 0x33, 0x8d, 0x4d, 0x77, 0x67, 0x50, 0x6d, 0x6d, 0x4c, 0xce, + 0x64, 0x7f, 0x2a, 0x66, 0x9d, 0x81, 0x46, 0x68, 0x95, 0x3c, 0x7f, 0xab, + 0x6b, 0xb2, 0x6e, 0x45, 0xb1, 0xc5, 0x5e, 0x9a, 0xc9, 0x27, 0x83, 0xb1, + 0x77, 0x8a, 0x5d, 0x73, 0xa8, 0xbf, 0xd7, 0xb2, 0xe7, 0x5c, 0xad, 0x68, + 0x60, 0x50, 0x46, 0x35, 0x49, 0x65, 0x89, 0x5e, 0x91, 0xc3, 0x94, 0x8f, + 0xb3, 0x36, 0x59, 0x7e, 0xc4, 0x49, 0x94, 0x47, 0xbb, 0x68, 0xc0, 0xc1, + 0x54, 0x77, 0xaa, 0x5e, 0x56, 0x79, 0x38, 0x69, 0xc0, 0xd8, 0x54, 0x5c, + 0x40, 0x90, 0xc9, 0xd2, 0x9b, 0x8c, 0xa2, 0x4e, 0x61, 0x4e, 0x57, 0x6f, + 0x7e, 0x4d, 0x75, 0x3d, 0x74, 0x9d, 0x6f, 0x42, 0xc6, 0x5c, 0x4f, 0x69, + 0x95, 0xc6, 0x54, 0xbb, 0x94, 0xa4, 0x71, 0x45, 0x78, 0x7c, 0x3f, 0x79, + 0xcb, 0x6e, 0x95, 0xa7, 0x89, 0xa6, 0x96, 0x63, 0x3b, 0x66, 0x6c, 0x43, + 0xca, 0x92, 0x97, 0x75, 0x65, 0xb6, 0x88, 0x44, 0x94, 0x67, 0x65, 0x4f, + 0x42, 0x95, 0x66, 0xb3, 0xa2, 0x7e, 0xd1, 0x36, 0x64, 0x76, 0x6c, 0x82, + 0x7b, 0x85, 0x6a, 0x66, 0x4b, 0x58, 0x3d, 0x8a, 0x69, 0x82, 0x91, 0x8e, + 0x90, 0x95, 0x77, 0x41, 0x84, 0x72, 0xa2, 0x9a, 0x44, 0x40, 0x3c, 0x9f, + 0x9c, 0x93, 0x82, 0x59, 0xb2, 0x8d, 0x76, 0x85, 0x4d, 0xd6, 0x3a, 0x97, + 0x71, 0x41, 0x7c, 0xad, 0xac, 0xb7, 0xcf, 0x82, 0xa5, 0x49, 0x80, 0x7b, + 0xa9, 0x8c, 0x97, 0xa5, 0x42, 0x78, 0xd9, 0xac, 0x6f, 0x97, 0x53, 0x86, + 0xb3, 0x30, 0xbf, 0x5d, 0xa5, 0x5a, 0x40, 0xad, 0x66, 0x7c, 0x50, 0x79, + 0x94, 0x60, 0x6d, 0xcc, 0xb5, 0xc5, 0x5a, 0x5b, 0xb1, 0x59, 0xac, 0x99, + 0x58, 0x80, 0xb3, 0x5d, 0x48, 0x5c, 0x7c, 0x74, 0x7d, 0x4b, 0x8c, 0xa4, + 0xb3, 0x60, 0xb3, 0x48, 0x99, 0x8a, 0x8f, 0x7f, 0x91, 0x3f, 0x8a, 0x93, + 0xae, 0x3e, 0x7f, 0x82, 0xd0, 0xcf, 0xb6, 0x2c, 0x99, 0x46, 0x72, 0xa1, + 0x64, 0xa4, 0xa9, 0x57, 0xd3, 0xcc, 0xba, 0xb6, 0xa9, 0x3d, 0xc9, 0x7b, + 0x50, 0x5b, 0x60, 0x4f, 0x3b, 0x7a, 0xd0, 0x88, 0xc1, 0xb6, 0x94, 0x91, + 0x3c, 0x83, 0xd1, 0x65, 0x81, 0x49, 0x8b, 0x4d, 0x9a, 0x80, 0x52, 0x3b, + 0x5c, 0x3f, 0xa7, 0x51, 0xb5, 0x50, 0xc5, 0xcd, 0x5b, 0x8c, 0x7d, 0x6d, + 0xab, 0x46, 0x38, 0xc2, 0x40, 0xca, 0x51, 0x8b, 0xa6, 0x5b, 0x9e, 0xa0, + 0xd8, 0xcc, 0x4b, 0x3f, 0x88, 0x59, 0x6d, 0x4d, 0x6b, 0x8d, 0x81, 0xa3, + 0x46, 0x9c, 0xa8, 0x55, 0x90, 0x51, 0xd4, 0x8d, 0xd3, 0x65, 0x95, 0xbd, + 0xda, 0x6a, 0x40, 0x56, 0xd2, 0x52, 0x7a, 0x72, 0x56, 0x7b, 0x4c, 0x68, + 0x61, 0xd4, 0x41, 0x32, 0xad, 0x35, 0x95, 0x68, 0x83, 0x84, 0x9a, 0xa6, + 0x7d, 0x4f, 0xa6, 0x58, 0x49, 0x37, 0x31, 0x3f, 0x34, 0x56, 0xbf, 0xb2, + 0x83, 0xd9, 0x39, 0x95, 0x56, 0xb6, 0x43, 0x66, 0x48, 0xcb, 0x67, 0xa0, + 0x4d, 0xb5, 0x5e, 0xc5, 0x43, 0x31, 0x4f, 0xb2, 0x67, 0x76, 0x93, 0x6e, + 0x8c, 0x81, 0x3a, 0x7b, 0x9b, 0xce, 0xcf, 0x79, 0x34, 0x6f, 0x84, 0x30, + 0xc2, 0xa0, 0xac, 0xc5, 0xb5, 0x77, 0xc9, 0x94, 0x8e, 0x7b, 0x8b, 0xa6, + 0x6d, 0x84, 0x3e, 0xc7, 0x6c, 0x71, 0x30, 0x4c, 0xbb, 0x40, 0x4e, 0x6d, + 0x70, 0xa7, 0xa2, 0xd0, 0xa6, 0x89, 0x4f, 0xd4, 0x68, 0xb1, 0xbd, 0xb5, + 0xae, 0x6b, 0xc9, 0x61, 0xcd, 0x47, 0x8a, 0xc5, 0xb8, 0x7a, 0xa9, 0x71, + 0xcc, 0x91, 0x93, 0xb3, 0x47, 0xbb, 0xcf, 0xa0, 0xa9, 0x7c, 0x62, 0x51, + 0xa3, 0xb6, 0x7e, 0x7f, 0x98, 0x7e, 0xa0, 0x50, 0xb7, 0x58, 0x48, 0x55, + 
0x8a, 0x81, 0xa0, 0x90, 0x85, 0x7a, 0x5a, 0xd8, 0x51, 0x6b, 0x7f, 0xcc, + 0x50, 0x34, 0x4e, 0x9d, 0x53, 0x7a, 0x9d, 0xbd, 0xd3, 0x98, 0xba, 0x64, + 0x63, 0x5b, 0x41, 0x89, 0x4d, 0x49, 0xcf, 0x96, 0x8a, 0x9e, 0x69, 0x69, + 0x88, 0x81, 0x53, 0x64, 0x9d, 0xab, 0xa9, 0xd6, 0xa7, 0x94, 0xa3, 0x3a, + 0xad, 0x3e, 0xa9, 0x5c, 0x43, 0xc9, 0x9d, 0xc8, 0x37, 0x59, 0x33, 0x44, + 0xc5, 0xc7, 0xca, 0xcc, 0x7f, 0x79, 0x6d, 0x97, 0x67, 0xd0, 0x9a, 0x78, + 0xaf, 0xb9, 0xcd, 0x51, 0x72, 0xaa, 0xa1, 0xb1, 0x68, 0x5c, 0x76, 0xc0, + 0x56, 0xd9, 0xd9, 0x98, 0xbd, 0x5b, 0xb2, 0x8a, 0x7a, 0x66, 0xcd, 0xd1, + 0x83, 0x63, 0x40, 0xb2, 0x5d, 0x52, 0xc1, 0x7e, 0x50, 0xaf, 0xd7, 0xc4, + 0x81, 0x73, 0x66, 0x63, 0x5e, 0x7b, 0xd2, 0x43, 0xaf, 0x35, 0xc7, 0x31, + 0xac, 0xad, 0x78, 0x55, 0xa7, 0xd0, 0x6c, 0x94, 0x48, 0x85, 0xb3, 0xab, + 0xa1, 0x69, 0x3f, 0x6f, 0x54, 0x6e, 0xbe, 0x58, 0xa6, 0x3b, 0x4d, 0x3f, + 0xb2, 0x46, 0x94, 0x85, 0x81, 0xa4, 0x5a, 0x7c, 0x83, 0x39, 0x8d, 0x44, + 0x3e, 0x8c, 0x82, 0xa6, 0x97, 0x9e, 0xac, 0x93, 0x53, 0xd8, 0xba, 0x7d, + 0x41, 0x57, 0x5b, 0x78, 0xb7, 0x33, 0xb6, 0xa2, 0xc2, 0x96, 0x7d, 0x3f, + 0xca, 0xc9, 0x68, 0x60, 0x77, 0xb7, 0x9a, 0x62, 0x88, 0xb9, 0x30, 0xdb, + 0x5a, 0xd5, 0xcf, 0x4c, 0xa9, 0x4e, 0xba, 0xce, 0x96, 0x6d, 0x8e, 0x7c, + 0x6b, 0x8d, 0x85, 0xd2, 0x77, 0xb0, 0x43, 0x97, 0xc7, 0x7a, 0x4a, 0x7b, + 0xb4, 0xae, 0x69, 0x40, 0x6b, 0x31, 0x48, 0x3d, 0xcb, 0x3a, 0x9f, 0x71, + 0x41, 0x93, 0x31, 0xb5, 0x78, 0x97, 0xad, 0xd2, 0x64, 0x31, 0xcb, 0x6c, + 0x7e, 0x6f, 0x57, 0x70, 0x9c, 0xbd, 0xa7, 0xa4, 0x3e, 0xc5, 0xb7, 0xb5, + 0x9f, 0x83, 0x7e, 0x4d, 0x6c, 0xd6, 0x9b, 0xd7, 0x3c, 0x45, 0x3f, 0x9b, + 0x9c, 0x5a, 0x55, 0x9b, 0xa7, 0x3a, 0xae, 0x60, 0x7b, 0x80, 0x41, 0xbb, + 0xc9, 0x9a, 0x7c, 0x67, 0x87, 0x5b, 0x70, 0xba, 0x8c, 0x9a, 0x6c, 0x93, + 0x9d, 0xbc, 0x54, 0x31, 0x9d, 0x53, 0x43, 0xc4, 0x8c, 0x6a, 0x6f, 0xa2, + 0xac, 0xc1, 0x63, 0x5f, 0x48, 0x99, 0x77, 0x73, 0x5f, 0xa2, 0x8f, 0x4b, + 0x33, 0x41, 0x41, 0xc8, 0x48, 0x74, 0x48, 0xcf, 0x82, 0x56, 0xa7, 0x3d, + 0x84, 0x9e, 0x72, 0x86, 0x48, 0xc5, 0x95, 0xba, 0x9c, 0x9a, 0x44, 0xb3, + 0x57, 0x49, 0x60, 0x53, 0xcf, 0x5a, 0x4b, 0x9f, 0xab, 0xd0, 0x63, 0x79, + 0xc1, 0x97, 0x93, 0x52, 0x43, 0x72, 0x58, 0xa8, 0xc9, 0x99, 0xbe, 0x35, + 0xd4, 0x7b, 0xc5, 0x92, 0x79, 0xb3, 0xd7, 0xa1, 0x53, 0x62, 0x45, 0xa7, + 0x74, 0xaf, 0x5e, 0x4a, 0x67, 0xa3, 0x3f, 0xc9, 0xac, 0xb3, 0x70, 0x63, + 0x64, 0x3a, 0x70, 0xa6, 0x82, 0xba, 0xd3, 0x9d, 0x56, 0x3e, 0xc7, 0xab, + 0x8c, 0x6e, 0xac, 0x73, 0xae, 0x43, 0x5a, 0x8b, 0xb0, 0xc0, 0x7f, 0x6e, + 0x84, 0xc1, 0x80, 0xb6, 0xb2, 0x7c, 0x38, 0x49, 0x80, 0x3b, 0x5d, 0x66, + 0x8e, 0x62, 0xa6, 0x85, 0x89, 0x50, 0x86, 0x79, 0x53, 0x73, 0x36, 0x96, + 0x7c, 0x5e, 0x75, 0x93, 0x85, 0x35, 0x9b, 0x53, 0x55, 0x8d, 0x8d, 0x91, + 0x46, 0x4d, 0x73, 0x9b, 0x4c, 0xa8, 0x9f, 0x78, 0x73, 0x8d, 0x7d, 0x4f, + 0x5f, 0x8c, 0x50, 0xc6, 0x94, 0x36, 0x66, 0x5e, 0x76, 0xcd, 0xae, 0x81, + 0x3d, 0x5b, 0x7c, 0xa7, 0x74, 0xaa, 0xcf, 0x88, 0x65, 0x7a, 0x98, 0x45, + 0x6c, 0x30, 0x90, 0x74, 0xa1, 0xb9, 0x4b, 0xc2, 0xa8, 0x33, 0xa9, 0x7d, + 0xa0, 0xa6, 0x8b, 0x67, 0x5b, 0x53, 0xbe, 0x6a, 0x9b, 0x5c, 0x51, 0x44, + 0x91, 0x3b, 0x4e, 0x9d, 0x90, 0x5e, 0xb8, 0x77, 0xc6, 0x78, 0x48, 0xb1, + 0x4f, 0x48, 0x5d, 0xcb, 0x42, 0xc3, 0x45, 0x63, 0x54, 0xa5, 0xb7, 0xc7, + 0xb1, 0x9e, 0x54, 0x40, 0x4c, 0x5d, 0x3e, 0x55, 0x64, 0x49, 0x38, 0x6d, + 0x3e, 0x3e, 0xbd, 0x92, 0x40, 0xaf, 0x6e, 0x42, 0xa6, 0x53, 0xb4, 0x62, + 0x92, 0xaa, 0xa2, 0x7b, 0xb7, 0x4c, 0x81, 0x73, 0x3a, 0x40, 0x65, 0x93, + 0x7f, 0x5a, 0x67, 0x8e, 0x4f, 0xb2, 0xa9, 0x3c, 0xd1, 0xad, 0x68, 0xa0, + 
0x9a, 0x7c, 0x90, 0xa5, 0xa5, 0xa1, 0x6a, 0xa3, 0x3c, 0xb2, 0xcf, 0xbb, + 0x5d, 0x57, 0x9b, 0xb9, 0x58, 0xa1, 0x93, 0x37, 0xd3, 0x7f, 0x44, 0xc3, + 0x40, 0x77, 0x89, 0xb8, 0x7e, 0x74, 0xa6, 0xb9, 0x9a, 0x86, 0xad, 0xb2, + 0x3c, 0xca, 0x4d, 0xab, 0x81, 0x55, 0x4c, 0x87, 0x52, 0xcd, 0xbf, 0x7a, + 0x8e, 0x30, 0xb8, 0x6e, 0x65, 0x3e, 0x95, 0x33, 0x9e, 0x4b, 0x5b, 0xba, + 0x82, 0xaa, 0xce, 0xbd, 0x7c, 0x72, 0x63, 0xcf, 0x9f, 0x56, 0xcc, 0x4e, + 0xab, 0xd0, 0x7b, 0x3d, 0x8b, 0x72, 0x3a, 0x3b, 0x4a, 0x7a, 0xae, 0x30, + 0x73, 0x71, 0x56, 0x5b, 0xc1, 0x76, 0xb4, 0xc2, 0x3e, 0x92, 0xce, 0xcb, + 0x90, 0x8d, 0xc7, 0x79, 0x73, 0x6d, 0xbf, 0xc1, 0x38, 0x7a, 0x7c, 0x5f, + 0x85, 0x32, 0x8f, 0xc0, 0x34, 0xc6, 0x82, 0x9d, 0xc5, 0x99, 0x4a, 0x74, + 0x90, 0x45, 0x8b, 0x97, 0x4f, 0x44, 0x68, 0x90, 0xc8, 0x83, 0x5c, 0xad, + 0x7c, 0x7f, 0xc5, 0x7f, 0x5e, 0x48, 0x9b, 0xba, 0xca, 0x53, 0x6c, 0x7b, + 0x8a, 0x45, 0xae, 0x64, 0x40, 0xae, 0x47, 0x72, 0xcb, 0x7c, 0x91, 0x44, + 0xbe, 0x79, 0x4c, 0x77, 0x34, 0x8c, 0x53, 0x7c, 0x5a, 0xa1, 0x97, 0xd3, + 0x73, 0xcf, 0x96, 0x82, 0xc6, 0xcf, 0x55, 0xa3, 0x49, 0xbe, 0xb8, 0x59, + 0x4a, 0x40, 0xb0, 0xcb, 0xbb, 0x9d, 0x40, 0xbc, 0xb2, 0x51, 0xc4, 0x39, + 0x6f, 0x81, 0xb2, 0x5b, 0x65, 0x72, 0x4f, 0xa9, 0xaa, 0x8e, 0x54, 0xc5, + 0x39, 0x42, 0x5e, 0x60, 0xc4, 0x34, 0x40, 0x9e, 0x4c, 0x59, 0x3c, 0x4b, + 0x58, 0x86, 0xad, 0x87, 0x8f, 0x41, 0xc0, 0x3a, 0x77, 0x98, 0x55, 0x72, + 0x46, 0x86, 0x57, 0xad, 0x59, 0x82, 0x69, 0xc8, 0xad, 0xa9, 0xaf, 0xc5, + 0x83, 0x5d, 0xc7, 0x35, 0xce, 0x5a, 0x60, 0xa3, 0x60, 0x94, 0xb5, 0x74, + 0x33, 0x92, 0xbe, 0x3b, 0x43, 0x8c, 0x63, 0x5c, 0xbc, 0xc4, 0x99, 0xc9, + 0x5d, 0x8c, 0xac, 0xca, 0xcd, 0x32, 0xb6, 0x95, 0x4a, 0xbc, 0x7b, 0x65, + 0x99, 0x9c, 0xbd, 0x87, 0xb0, 0x6f, 0xab, 0x8f, 0x5b, 0x33, 0x73, 0x6e, + 0x92, 0x85, 0x8c, 0xc6, 0x9f, 0x47, 0xb5, 0x9b, 0x7b, 0x5b, 0x44, 0xcc, + 0xc0, 0x47, 0xb6, 0xcb, 0x7d, 0x36, 0x70, 0x65, 0x3b, 0x40, 0x83, 0x51, + 0x58, 0x5c, 0x4b, 0xc6, 0x7f, 0x69, 0xc8, 0x82, 0x81, 0x38, 0xcc, 0x84, + 0x78, 0x90, 0xbe, 0x95, 0x70, 0x84, 0xc5, 0x5b, 0x97, 0x41, 0x48, 0x37, + 0xba, 0xb2, 0x7c, 0x47, 0xae, 0x47, 0x4e, 0x8f, 0x40, 0x8d, 0xad, 0x45, + 0xb0, 0x6a, 0x58, 0x32, 0x4a, 0xd1, 0x7b, 0xc4, 0x9d, 0x35, 0xaa, 0x9b, + 0x87, 0x46, 0xab, 0x91, 0x49, 0x68, 0xa8, 0x3e, 0xb2, 0x80, 0x5d, 0x49, + 0xb1, 0xcd, 0x5b, 0x55, 0x37, 0x6f, 0xcd, 0xa4, 0xc1, 0xa0, 0x64, 0xb7, + 0x82, 0x71, 0xd1, 0x7c, 0xc1, 0x6a, 0xb4, 0xb0, 0xd4, 0xa4, 0xae, 0x57, + 0x7e, 0xa3, 0x3d, 0x51, 0x5c, 0x48, 0x33, 0x64, 0xc6, 0x7f, 0xc8, 0x99, + 0x8e, 0xa1, 0x85, 0x44, 0x8f, 0xb8, 0xc5, 0x6b, 0x66, 0x87, 0x8e, 0x60, + 0x42, 0x7c, 0xa9, 0x3a, 0x70, 0xcc, 0xbf, 0x85, 0x88, 0x5c, 0x35, 0x6d, + 0xc5, 0xc9, 0x41, 0x5f, 0x9a, 0x77, 0xad, 0x9a, 0x8b, 0x52, 0xca, 0x5a, + 0x9c, 0x9b, 0x37, 0x77, 0x33, 0x69, 0x4e, 0xcf, 0xa5, 0xb4, 0x75, 0xae, + 0xba, 0x9d, 0x45, 0xb5, 0x95, 0x72, 0x44, 0xc0, 0xb4, 0x36, 0x36, 0x93, + 0x67, 0x5b, 0x35, 0x7d, 0x4c, 0x7e, 0x4b, 0x8d, 0x9b, 0x41, 0x5c, 0x52, + 0x37, 0x5d, 0xa7, 0x44, 0xc8, 0x72, 0x8c, 0x60, 0xb7, 0xa8, 0x88, 0xaa, + 0x87, 0x71, 0xb4, 0x5e, 0x55, 0x7b, 0x74, 0xcf, 0x99, 0xc6, 0x75, 0xc4, + 0x86, 0x64, 0x73, 0x54, 0xc0, 0x47, 0x90, 0xa0, 0x8d, 0x6f, 0x8d, 0xad, + 0x62, 0x41, 0x56, 0xb6, 0xd1, 0x98, 0xa0, 0x83, 0xa4, 0x78, 0x6c, 0x34, + 0x7b, 0x4a, 0x56, 0xc4, 0x52, 0xbf, 0x39, 0xad, 0x7c, 0x63, 0x5e, 0x91, + 0x9b, 0x54, 0x66, 0xb8, 0x39, 0xa4, 0x4b, 0x69, 0xa0, 0xb1, 0x63, 0x70, + 0x5a, 0x51, 0x79, 0x54, 0xba, 0x61, 0x3c, 0xc7, 0xb0, 0x84, 0xaa, 0x47, + 0x99, 0xc4, 0x61, 0x9e, 0x81, 0x5b, 0xa6, 0x4c, 0x87, 0xbb, 0x3b, 0xc0, + 
0x69, 0xd1, 0xa2, 0x8f, 0xa4, 0x4b, 0xb4, 0xb7, 0x8a, 0x90, 0x86, 0x58, + 0x80, 0x5c, 0x76, 0x9c, 0xa1, 0x96, 0x8b, 0x3c, 0x87, 0x51, 0x6a, 0x55, + 0x6e, 0x59, 0xa9, 0xa2, 0x96, 0x81, 0x31, 0x8f, 0x94, 0x92, 0x4c, 0x38, + 0x3c, 0xc3, 0xae, 0x7a, 0xb6, 0x36, 0xa3, 0x6c, 0x50, 0xbb, 0x78, 0xb5, + 0xb8, 0xb6, 0x70, 0x8a, 0x64, 0xad, 0xa2, 0xbb, 0xb4, 0x5c, 0x71, 0x67, + 0x8d, 0x5d, 0x4c, 0x79, 0x9f, 0x51, 0x8a, 0x65, 0xbc, 0x33, 0xbd, 0xaa, + 0x5a, 0x5f, 0x89, 0x97, 0x48, 0x65, 0x6d, 0x4f, 0x4f, 0xcd, 0x92, 0xb3, + 0x40, 0x88, 0x91, 0x7d, 0x97, 0x70, 0x56, 0x8a, 0x85, 0x8f, 0x53, 0x3a, + 0x51, 0x8c, 0x3c, 0x45, 0xad, 0xa4, 0x67, 0x43, 0xa1, 0x83, 0x4d, 0x50, + 0x8e, 0x3f, 0xba, 0x72, 0x37, 0x62, 0x60, 0x27, 0x63, 0xac, 0x3f, 0x65, + 0x90, 0x83, 0x24, 0x75, 0x9b, 0x91, 0xbf, 0x64, 0x40, 0x36, 0x6b, 0xa8, + 0x99, 0x57, 0x78, 0x69, 0xc6, 0xbe, 0x43, 0x2c, 0x85, 0x9e, 0xd0, 0x45, + 0x47, 0x57, 0x68, 0xaa, 0x5f, 0x47, 0x35, 0xbc, 0xdb, 0x59, 0xad, 0xb2, + 0x74, 0x8c, 0x89, 0x35, 0xcf, 0x52, 0x75, 0xc1, 0x49, 0x31, 0x57, 0xa9, + 0x87, 0xcb, 0x77, 0x74, 0xbf, 0xa5, 0x6b, 0x6f, 0x59, 0xce, 0xd1, 0x83, + 0x8b, 0xad, 0x56, 0x7d, 0x69, 0x65, 0xc3, 0x7c, 0xa0, 0x26, 0x69, 0x2e, + 0xbf, 0x60, 0x58, 0xbc, 0xa8, 0x64, 0x57, 0x9f, 0x73, 0xab, 0xdf, 0x7c, + 0x61, 0x32, 0xc3, 0x9d, 0x5c, 0x34, 0xb1, 0x88, 0xa5, 0x89, 0xa6, 0xc3, + 0x36, 0xbe, 0xd9, 0x77, 0x88, 0x79, 0x7b, 0x69, 0x63, 0x8f, 0x57, 0x5b, + 0x2d, 0xb9, 0x68, 0x97, 0x30, 0x9b, 0x9c, 0x73, 0x34, 0xae, 0x98, 0x90, + 0xbb, 0x48, 0x6d, 0x79, 0xb3, 0x63, 0x41, 0x94, 0xc3, 0x87, 0xb4, 0x8a, + 0x4b, 0x47, 0x7e, 0x4e, 0x86, 0xa8, 0x3c, 0x3b, 0x87, 0x9c, 0x39, 0x4d, + 0x4b, 0x62, 0x6f, 0x51, 0x8e, 0x45, 0x81, 0x82, 0x7f, 0x9c, 0xa8, 0xc5, + 0x39, 0x64, 0xbb, 0x76, 0xc9, 0xd0, 0x85, 0x40, 0x53, 0xb8, 0x92, 0xca, + 0x94, 0xb8, 0x5a, 0x70, 0x53, 0x36, 0x85, 0x53, 0xd1, 0x4e, 0xab, 0x53, + 0x95, 0xca, 0x99, 0x9f, 0x34, 0x6c, 0x53, 0x53, 0xc1, 0x7c, 0xcf, 0x45, + 0x78, 0x72, 0x73, 0x7a, 0xa3, 0x9e, 0x50, 0x41, 0x64, 0xc1, 0x81, 0x7a, + 0x6a, 0xa4, 0x3f, 0x43, 0x6d, 0x3b, 0x84, 0x82, 0x45, 0xa0, 0x8e, 0x7a, + 0xcc, 0x76, 0x64, 0x6a, 0x8e, 0x66, 0x59, 0x51, 0x63, 0xbd, 0x9e, 0x5a, + 0x81, 0x3a, 0x64, 0x8d, 0x90, 0x8d, 0x62, 0x82, 0x76, 0x39, 0xb5, 0xb6, + 0x32, 0xa7, 0x3d, 0x8a, 0xc6, 0x6d, 0x4c, 0x4d, 0x81, 0x52, 0x35, 0x48, + 0x3c, 0x4e, 0x75, 0xb0, 0xc9, 0x62, 0xa4, 0x7b, 0xcf, 0x6e, 0x9c, 0xc5, + 0x47, 0xa3, 0x53, 0x93, 0x2e, 0x54, 0x7e, 0x42, 0xa6, 0x5f, 0x72, 0xb2, + 0x33, 0x85, 0xc5, 0xcf, 0x8a, 0x97, 0xad, 0x40, 0x95, 0x93, 0x9b, 0xa8, + 0x90, 0xcf, 0x4e, 0xc1, 0x57, 0x47, 0x36, 0xac, 0x49, 0x7e, 0x43, 0xb9, + 0xad, 0xd6, 0x73, 0xd4, 0xa5, 0x42, 0xa6, 0xa5, 0xab, 0x9f, 0xae, 0x98, + 0x7a, 0x8a, 0x34, 0xa8, 0xb6, 0x94, 0x85, 0xcf, 0x91, 0x9d, 0x7f, 0xa8, + 0xab, 0x7f, 0x7d, 0xcd, 0x7a, 0x9d, 0x41, 0x5c, 0x58, 0x35, 0x78, 0x50, + 0xb5, 0x64, 0x4d, 0x45, 0x3f, 0x3e, 0x7e, 0x92, 0x59, 0x42, 0x7d, 0xc2, + 0x4d, 0x36, 0xb2, 0xb6, 0x4f, 0x34, 0xbd, 0x75, 0x69, 0x44, 0x44, 0x8b, + 0xbc, 0x79, 0xbc, 0xc2, 0x6d, 0x9a, 0x3d, 0x7d, 0x78, 0xc8, 0x9a, 0x71, + 0x56, 0x80, 0x5d, 0x56, 0xba, 0x5c, 0x8f, 0x49, 0x34, 0x59, 0xa2, 0x9e, + 0x9d, 0x6e, 0x3a, 0xb8, 0xb0, 0x9b, 0xaa, 0x8f, 0x4b, 0xac, 0x42, 0x71, + 0x63, 0xae, 0xd5, 0x6f, 0x66, 0x4c, 0x77, 0xad, 0x87, 0xce, 0x89, 0x3e, + 0x47, 0xa8, 0xcc, 0xcc, 0x46, 0x5e, 0x62, 0x84, 0xa4, 0x69, 0x9f, 0x3f, + 0xb6, 0x86, 0x56, 0x9f, 0x98, 0x78, 0x3f, 0x3f, 0x82, 0x93, 0x30, 0x8a, + 0xb4, 0xbb, 0x3b, 0x6b, 0xba, 0x38, 0x9d, 0x3c, 0x6e, 0x36, 0x75, 0x77, + 0x5d, 0xc1, 0xa5, 0xbf, 0x8d, 0x47, 0x98, 0x46, 0x84, 0x52, 0x8a, 0x5c, + 
0xba, 0x64, 0xcf, 0x5f, 0x86, 0xbc, 0x83, 0x92, 0x97, 0x50, 0xaa, 0x75, + 0xa0, 0x95, 0x78, 0x9d, 0xb5, 0x4c, 0x48, 0x82, 0xb1, 0x85, 0x5d, 0x4e, + 0x88, 0x82, 0xba, 0x8f, 0xce, 0x52, 0x58, 0xa2, 0xb7, 0x5f, 0xcf, 0x61, + 0x8a, 0xd6, 0x86, 0x87, 0xb4, 0x2d, 0xa7, 0x98, 0x6b, 0x79, 0xb5, 0x8a, + 0xaa, 0x5c, 0x67, 0xd6, 0x58, 0x77, 0x84, 0x9e, 0x29, 0x65, 0x92, 0xb9, + 0x9b, 0x73, 0x62, 0x9a, 0x58, 0x53, 0x50, 0x6a, 0xa4, 0xa5, 0x93, 0x4a, + 0xaa, 0x96, 0x8f, 0x72, 0x84, 0x70, 0x5c, 0x6b, 0x75, 0x8c, 0xd0, 0x82, + 0xc6, 0x75, 0xcf, 0x4e, 0x4b, 0x7b, 0x40, 0x95, 0x3b, 0xbb, 0xc2, 0x2f, + 0x90, 0x4b, 0x82, 0xb5, 0xb2, 0xb0, 0x6a, 0xb4, 0xa3, 0x6e, 0x56, 0x3d, + 0x9e, 0x58, 0xae, 0x94, 0x75, 0xc8, 0x4a, 0x48, 0xa8, 0x63, 0x73, 0x84, + 0x7d, 0x89, 0xa2, 0x91, 0xad, 0x48, 0x70, 0x8f, 0x5f, 0xb5, 0xa2, 0x3a, + 0xc0, 0xa0, 0x9a, 0xc3, 0xab, 0xa8, 0xd7, 0xae, 0x47, 0x93, 0xd0, 0x38, + 0x4c, 0x8d, 0x65, 0xc1, 0xa8, 0x58, 0xb5, 0x88, 0xb9, 0xd1, 0xa2, 0x2f, + 0xd2, 0xc5, 0x68, 0x77, 0x6b, 0xc1, 0x4e, 0x32, 0xaf, 0xb3, 0xc4, 0x2c, + 0xa4, 0xa4, 0xad, 0x4c, 0x79, 0x95, 0x9c, 0x9b, 0xca, 0x6b, 0x64, 0x4b, + 0x5f, 0x80, 0x9c, 0x8a, 0xd1, 0xb4, 0x60, 0xc5, 0x6e, 0x36, 0x8d, 0x78, + 0xcd, 0x8b, 0x45, 0x81, 0x92, 0xc3, 0xbf, 0xab, 0x6e, 0xac, 0xc0, 0x9f, + 0x48, 0x9e, 0x8c, 0x9f, 0xcf, 0xd0, 0x86, 0x58, 0x85, 0xc2, 0x4b, 0x44, + 0xad, 0x3b, 0xb4, 0x9f, 0xa9, 0xae, 0x88, 0x47, 0x38, 0x73, 0x9f, 0x3f, + 0x9b, 0xab, 0x69, 0x9c, 0xac, 0x74, 0x6d, 0x86, 0xaf, 0xa3, 0x88, 0x84, + 0x59, 0xcc, 0xa3, 0xa4, 0x4c, 0x79, 0xd0, 0x71, 0x9f, 0x79, 0x90, 0x52, + 0x64, 0x5f, 0xc8, 0xaa, 0x77, 0x66, 0x4c, 0x80, 0x8c, 0x43, 0x5c, 0x5e, + 0x56, 0x6a, 0xc8, 0x4e, 0xae, 0x7b, 0x32, 0xbd, 0x6a, 0xc8, 0xa9, 0xb8, + 0x63, 0x9d, 0x6c, 0xd3, 0xd4, 0x75, 0x45, 0x99, 0xb4, 0xa2, 0x69, 0xb5, + 0x78, 0x62, 0xb6, 0x68, 0x90, 0xac, 0x7c, 0x85, 0x3e, 0xc5, 0xa5, 0xbe, + 0x6f, 0x5e, 0x8b, 0x34, 0x49, 0x98, 0xbf, 0x51, 0x8e, 0x8b, 0xb2, 0x3e, + 0x3a, 0x39, 0x69, 0x59, 0x7e, 0x7c, 0x56, 0xbb, 0x8b, 0x31, 0xbe, 0xa0, + 0x40, 0x66, 0x4b, 0x9d, 0x38, 0x6c, 0x69, 0xc9, 0xb2, 0xac, 0x66, 0x8a, + 0xc3, 0xc1, 0xd5, 0xd3, 0x93, 0x46, 0x94, 0xca, 0xad, 0x7e, 0x84, 0x86, + 0x8c, 0x43, 0x7f, 0xbd, 0xd4, 0x9e, 0x6c, 0x7f, 0x6f, 0x71, 0x71, 0x83, + 0x55, 0x90, 0xd5, 0x8b, 0xb1, 0x5d, 0x7b, 0xb8, 0xc9, 0x3b, 0x6f, 0xd4, + 0xd5, 0x4f, 0xbc, 0xbc, 0x50, 0x99, 0x6a, 0x8d, 0xb7, 0x89, 0x40, 0x63, + 0x83, 0x5a, 0x62, 0x78, 0x36, 0x57, 0x70, 0x6a, 0x98, 0x3b, 0x63, 0x69, + 0xa8, 0xcd, 0x32, 0x7b, 0xa9, 0xa6, 0x3d, 0x49, 0x88, 0xb9, 0xca, 0x57, + 0xca, 0x68, 0x6a, 0x69, 0xb7, 0xc9, 0x66, 0xb5, 0x37, 0x7c, 0x6a, 0x89, + 0xbb, 0x73, 0x5d, 0x86, 0xc6, 0x4b, 0xb0, 0xbb, 0xcc, 0x7d, 0x89, 0x7a, + 0x8e, 0x61, 0xda, 0x82, 0xbf, 0xb2, 0xaa, 0x98, 0x93, 0x88, 0xd6, 0x8c, + 0x3c, 0xca, 0xae, 0x4a, 0x43, 0x3a, 0x91, 0x56, 0x68, 0xb5, 0x33, 0x75, + 0x8d, 0x85, 0x2e, 0xad, 0xbc, 0x58, 0x56, 0x8b, 0x75, 0xce, 0xd5, 0xc3, + 0xb1, 0x81, 0xc9, 0x6a, 0x68, 0x9a, 0x9f, 0x6f, 0x5a, 0x64, 0xcf, 0xa4, + 0x4f, 0x59, 0x7f, 0xa5, 0xb5, 0x7d, 0xd5, 0x8b, 0x4d, 0x50, 0x73, 0x8e, + 0x61, 0xc4, 0x8e, 0xa3, 0xb9, 0x7e, 0x6a, 0xb8, 0x77, 0x64, 0x6c, 0x2f, + 0xcf, 0xc9, 0xa1, 0x3a, 0x5a, 0xd6, 0x55, 0xc3, 0xa1, 0x82, 0xca, 0x7f, + 0x74, 0x67, 0xa6, 0x8d, 0x93, 0xbc, 0xa4, 0xb2, 0x64, 0x59, 0x56, 0x55, + 0x70, 0xae, 0x8d, 0xca, 0x50, 0x8b, 0x8a, 0x84, 0xa3, 0xa2, 0x49, 0x3b, + 0x9b, 0xab, 0xbe, 0xaa, 0x42, 0x53, 0x9e, 0x5c, 0x88, 0x5c, 0x93, 0x65, + 0x5c, 0xac, 0x52, 0x96, 0x87, 0x70, 0x6d, 0xbe, 0xc5, 0xc8, 0x88, 0x6c, + 0x6a, 0x41, 0x64, 0x4c, 0x42, 0x8c, 0x7f, 0x88, 0x9f, 0x87, 0x73, 0x55, + 
0x58, 0xa1, 0x81, 0x8e, 0x48, 0x87, 0xc1, 0x5e, 0xb6, 0x3e, 0x8f, 0x37, + 0x85, 0x6a, 0xc1, 0x2f, 0x80, 0x86, 0x34, 0x82, 0x93, 0x9f, 0xb4, 0x66, + 0x37, 0x69, 0x58, 0x62, 0x8a, 0xae, 0x37, 0xa9, 0x6e, 0x5c, 0xc4, 0x3d, + 0xcd, 0xcd, 0x80, 0x53, 0x8c, 0x79, 0xbe, 0x54, 0x1b, 0xa1, 0x46, 0x31, + 0x68, 0x2c, 0x4b, 0x68, 0x85, 0xc9, 0x53, 0xa1, 0x6a, 0xa0, 0x96, 0x71, + 0x59, 0x69, 0xcd, 0xc8, 0xad, 0x71, 0x3f, 0x31, 0x37, 0x66, 0x3a, 0x32, + 0x68, 0x66, 0xb2, 0x6f, 0x80, 0xcc, 0x37, 0x49, 0x36, 0x80, 0x62, 0x39, + 0x74, 0x46, 0x7e, 0x97, 0x66, 0x88, 0xce, 0x4f, 0x5d, 0xdb, 0x53, 0x7e, + 0xaf, 0x44, 0xb6, 0x95, 0x4f, 0xbc, 0x89, 0x59, 0x43, 0x5d, 0x79, 0xe3, + 0x59, 0x84, 0x69, 0x73, 0x8c, 0x5b, 0x77, 0x61, 0x3b, 0x6b, 0xc7, 0x8a, + 0x45, 0x90, 0x72, 0x65, 0xb8, 0x57, 0xb2, 0x81, 0x84, 0x5d, 0x64, 0x34, + 0x5f, 0x75, 0xbd, 0x6d, 0x5e, 0x6e, 0x7c, 0xca, 0xce, 0x7a, 0xc5, 0x36, + 0x5e, 0x26, 0xab, 0x9b, 0x48, 0xa6, 0x41, 0x70, 0x67, 0xb9, 0x96, 0x98, + 0x8e, 0x9f, 0x65, 0xab, 0x7d, 0x7d, 0x4e, 0x60, 0xab, 0xb3, 0xce, 0x7b, + 0xa4, 0x55, 0xdd, 0xa4, 0x5b, 0xd7, 0xa5, 0x77, 0xc5, 0x78, 0x83, 0x72, + 0xd4, 0xa6, 0x49, 0x66, 0x31, 0xcc, 0x53, 0x9e, 0x4c, 0xd9, 0x58, 0xb5, + 0xd2, 0x97, 0x7a, 0x5e, 0x46, 0x8b, 0xa9, 0x9b, 0x3f, 0x69, 0x5d, 0x6d, + 0x38, 0x5a, 0xcd, 0x9a, 0x97, 0x58, 0x2e, 0x62, 0xaa, 0xc3, 0xcb, 0xc1, + 0xba, 0x81, 0x6d, 0x68, 0x7b, 0x45, 0x85, 0x42, 0xdc, 0x75, 0xbd, 0x81, + 0x7d, 0xac, 0x48, 0x50, 0x7c, 0xc8, 0xa3, 0xcd, 0x32, 0xc2, 0x83, 0x62, + 0xa2, 0xc6, 0xb5, 0x81, 0xcc, 0xbe, 0xbe, 0x3b, 0x9d, 0xb6, 0x46, 0x5d, + 0xaf, 0x34, 0x66, 0xa5, 0x97, 0x9f, 0x79, 0xaa, 0x63, 0x41, 0x6f, 0x4f, + 0xa1, 0xaf, 0x62, 0x9d, 0x9b, 0x90, 0x60, 0x51, 0x91, 0x3b, 0x62, 0x39, + 0x81, 0xae, 0x5d, 0x45, 0x98, 0x8d, 0xcf, 0xe7, 0xd6, 0x3c, 0x9e, 0x75, + 0x63, 0xc7, 0x69, 0x49, 0x5c, 0x3d, 0x3a, 0xa7, 0x51, 0xab, 0x4c, 0xd1, + 0xb2, 0x72, 0x75, 0xdb, 0xa8, 0x34, 0xb1, 0x95, 0xbd, 0xa7, 0x2a, 0x73, + 0x6a, 0x7f, 0x86, 0x7d, 0x38, 0xa8, 0x9e, 0x43, 0x46, 0x42, 0x21, 0x42, + 0xa5, 0x54, 0x3f, 0x4f, 0x3c, 0x7d, 0xa4, 0x97, 0x7c, 0x8f, 0xcc, 0x7e, + 0xbc, 0x7d, 0x92, 0x88, 0x42, 0x99, 0x79, 0xa2, 0x9e, 0xbe, 0x4b, 0x8c, + 0x32, 0x71, 0x3a, 0xb7, 0x49, 0x7c, 0x4e, 0x4a, 0x41, 0xc3, 0x94, 0x85, + 0xd7, 0xc7, 0xc8, 0xa1, 0x84, 0x99, 0xa4, 0xc9, 0xb8, 0xda, 0xbb, 0x63, + 0x83, 0xa6, 0x60, 0xa9, 0xae, 0x39, 0x82, 0xbc, 0x44, 0x6c, 0x4e, 0xce, + 0xdf, 0x92, 0xaf, 0xc9, 0xa9, 0x80, 0x5b, 0x38, 0xbb, 0x80, 0xd8, 0x6c, + 0xc9, 0x98, 0x55, 0x7f, 0xa9, 0x4a, 0x55, 0x7c, 0xb8, 0xaa, 0xbe, 0x9d, + 0x37, 0x45, 0x75, 0x71, 0xa2, 0x94, 0xcc, 0x5e, 0x35, 0xc1, 0x60, 0x2a, + 0x56, 0x62, 0xa0, 0x61, 0x94, 0x80, 0xb1, 0x38, 0xb8, 0xb2, 0x59, 0xa4, + 0x85, 0x7f, 0xc3, 0x8f, 0xd4, 0x53, 0x4d, 0xc3, 0xaf, 0xc3, 0x5e, 0xcf, + 0xb8, 0x94, 0xa5, 0xa7, 0xa7, 0x3e, 0x6d, 0xcf, 0x57, 0x87, 0x5f, 0x9c, + 0xb7, 0xc0, 0x45, 0xb4, 0x90, 0x6f, 0x9d, 0xb7, 0x50, 0xd2, 0x5e, 0xbd, + 0x9f, 0xbd, 0x3f, 0x83, 0xa5, 0x4f, 0xad, 0x7e, 0x74, 0x3e, 0x5c, 0x51, + 0x3b, 0x3e, 0x2a, 0xa1, 0x7e, 0x5d, 0xb5, 0x9c, 0x4f, 0x4f, 0x59, 0x3f, + 0x55, 0x4b, 0x3b, 0x86, 0x71, 0x9d, 0x21, 0x9a, 0xbd, 0x3c, 0x97, 0x74, + 0x5c, 0x42, 0x45, 0x95, 0x5f, 0xaf, 0x4a, 0x59, 0xc1, 0xd1, 0x53, 0xa3, + 0x4a, 0x47, 0xaf, 0x7c, 0x7e, 0x73, 0x48, 0xab, 0xa1, 0x65, 0xb6, 0x99, + 0x9b, 0xb8, 0xc8, 0x38, 0xcd, 0x6f, 0x61, 0x3a, 0x66, 0x35, 0xc7, 0xb2, + 0x7f, 0xbc, 0x3b, 0x77, 0x89, 0xac, 0x93, 0xc0, 0xb3, 0x33, 0x5f, 0x97, + 0x72, 0x5d, 0x71, 0x88, 0x3b, 0xbc, 0xc2, 0x3d, 0x3f, 0x72, 0x44, 0x6c, + 0xa8, 0x9f, 0x3c, 0x7d, 0x97, 0x4a, 0xaf, 0xc9, 0xce, 0x8c, 0x59, 0xa0, + 
0x57, 0x84, 0xaa, 0x66, 0x75, 0xaf, 0x89, 0xb3, 0xc9, 0x6d, 0xcf, 0x8a, + 0xa4, 0x8e, 0x93, 0x74, 0x5f, 0x5a, 0xb0, 0xa2, 0x82, 0x3a, 0x4d, 0x30, + 0x9c, 0x50, 0x30, 0x8a, 0x95, 0x40, 0x9b, 0xcb, 0xd5, 0x35, 0xbf, 0x8c, + 0xb3, 0xa7, 0x8e, 0xcb, 0x40, 0x44, 0x51, 0xcf, 0x96, 0x73, 0xac, 0x57, + 0x40, 0x34, 0xc4, 0x4c, 0x98, 0xc5, 0xbe, 0xc2, 0x39, 0x91, 0x53, 0x47, + 0xca, 0x50, 0xbe, 0x4f, 0x83, 0x60, 0x93, 0xd3, 0x9b, 0x7d, 0x80, 0x8a, + 0x39, 0xa2, 0xb2, 0xcd, 0x64, 0x3b, 0x9d, 0x6b, 0x4b, 0xb0, 0x38, 0x8b, + 0x70, 0xaf, 0xa4, 0x8a, 0xce, 0xb5, 0xb0, 0x85, 0x79, 0x69, 0xae, 0xa0, + 0x77, 0xa7, 0xbb, 0x95, 0x65, 0xb1, 0x5f, 0x7d, 0x70, 0x9e, 0x65, 0xca, + 0x38, 0xcf, 0x4a, 0xb4, 0x53, 0x65, 0x4f, 0xa6, 0x63, 0xc3, 0x5d, 0x2d, + 0xa8, 0x6a, 0xb0, 0xc2, 0xd1, 0xa8, 0x7f, 0x78, 0xa6, 0xa1, 0x78, 0x80, + 0x82, 0x69, 0x66, 0xb7, 0x59, 0x33, 0x80, 0x72, 0x42, 0x86, 0x8e, 0x4c, + 0x7a, 0x79, 0x74, 0xae, 0x32, 0x48, 0x50, 0xaf, 0x75, 0x85, 0xae, 0x88, + 0xb1, 0x30, 0xb8, 0x5b, 0x39, 0x93, 0xb9, 0x6b, 0x43, 0xa7, 0x3f, 0x7d, + 0xcf, 0x52, 0xcd, 0x90, 0x9b, 0x33, 0x5c, 0x5f, 0xb3, 0xc6, 0x93, 0x8f, + 0x62, 0x2e, 0x8c, 0x7f, 0x80, 0xb3, 0xb8, 0x63, 0x9b, 0x9f, 0xa0, 0xbe, + 0x7e, 0xa0, 0xd3, 0x98, 0x54, 0x4e, 0x5e, 0x42, 0x64, 0xba, 0x41, 0x80, + 0xd2, 0x50, 0x94, 0xbf, 0x44, 0xc6, 0x54, 0x46, 0x4d, 0x41, 0x72, 0x9b, + 0xd3, 0x39, 0xc0, 0x75, 0x30, 0xc8, 0x79, 0x53, 0xba, 0x9c, 0xb9, 0xbf, + 0x92, 0x7a, 0xa7, 0xca, 0x62, 0xc8, 0xc2, 0xb3, 0x84, 0x61, 0xa3, 0x9c, + 0x95, 0xbc, 0xa3, 0x87, 0x80, 0x7c, 0x82, 0x95, 0xb9, 0xab, 0x59, 0x36, + 0x43, 0xa3, 0x72, 0x58, 0x4d, 0x8b, 0x9c, 0x60, 0xb8, 0x60, 0x3d, 0x70, + 0xcc, 0xcf, 0x5e, 0xd7, 0x2f, 0x49, 0x37, 0xa0, 0xbc, 0x5e, 0x4f, 0x77, + 0xca, 0x42, 0xa4, 0x77, 0x6e, 0xb3, 0xb5, 0x8a, 0x41, 0x74, 0x3e, 0x7d, + 0x71, 0x71, 0xa6, 0x8a, 0x74, 0x90, 0x6d, 0x7c, 0xaf, 0xc4, 0x55, 0xb4, + 0x87, 0xbe, 0x84, 0x9d, 0x50, 0x65, 0x34, 0xb7, 0x71, 0x46, 0xa8, 0x66, + 0x54, 0x4c, 0x45, 0x5f, 0x56, 0x44, 0xad, 0x65, 0x42, 0xb7, 0xa9, 0x7d, + 0x48, 0x47, 0xc1, 0x3a, 0x31, 0x87, 0x63, 0xad, 0x44, 0xa3, 0xcb, 0xb9, + 0x59, 0xb7, 0xb0, 0x7a, 0x63, 0x42, 0x88, 0x9f, 0x80, 0x4d, 0x64, 0xa2, + 0xa9, 0x75, 0x72, 0x7b, 0xc2, 0x4c, 0x3f, 0xc7, 0xc9, 0x77, 0xb4, 0x74, + 0xb1, 0x72, 0xba, 0x9a, 0x6b, 0xa7, 0x4f, 0x91, 0xc9, 0x89, 0x80, 0x8e, + 0xbc, 0x89, 0x6d, 0xac, 0xa8, 0xaf, 0x64, 0xab, 0x6d, 0xbc, 0xc9, 0x5c, + 0x81, 0xba, 0xaf, 0x9b, 0x42, 0x98, 0xb2, 0x94, 0x58, 0x57, 0x60, 0xb1, + 0x5f, 0x74, 0x35, 0xae, 0xbf, 0x56, 0x55, 0xb6, 0x57, 0x5f, 0x6f, 0x98, + 0xce, 0x47, 0x6a, 0x4d, 0xa7, 0x8f, 0x76, 0x51, 0x36, 0x4a, 0xcf, 0xba, + 0xa2, 0x67, 0x7e, 0x67, 0x6c, 0x3d, 0xa2, 0xcb, 0x9b, 0x64, 0xa1, 0x34, + 0x4b, 0x69, 0xd0, 0xa8, 0x6e, 0x89, 0x84, 0x58, 0x45, 0xad, 0x8e, 0x3e, + 0x5e, 0xa0, 0xa8, 0x69, 0x6a, 0x78, 0x3a, 0x30, 0x9c, 0x93, 0xca, 0x40, + 0x58, 0x3d, 0x6f, 0x5e, 0x74, 0x7e, 0x3c, 0x43, 0x65, 0x72, 0x9d, 0x93, + 0x69, 0xa8, 0x84, 0xb3, 0x95, 0xb6, 0x82, 0x9f, 0x44, 0x48, 0x9b, 0x49, + 0x79, 0xa3, 0xaf, 0x7e, 0xb8, 0x8b, 0xcc, 0x6f, 0x6c, 0x32, 0x3c, 0x90, + 0x5d, 0x39, 0x5b, 0xaa, 0x46, 0xbe, 0x6c, 0x63, 0xcc, 0x83, 0xaf, 0x5f, + 0x9d, 0x87, 0x4e, 0xd3, 0x37, 0xb0, 0x86, 0xa4, 0xc4, 0x89, 0xc9, 0x71, + 0xca, 0x89, 0x33, 0x9d, 0x39, 0x6b, 0x48, 0x60, 0x44, 0x68, 0x52, 0xa0, + 0x71, 0x33, 0x60, 0x4b, 0xa5, 0xd0, 0x5c, 0x67, 0xc9, 0xbb, 0x4e, 0xc2, + 0xbb, 0x8a, 0xaa, 0x3c, 0x3e, 0x64, 0xcd, 0xb4, 0xa1, 0x77, 0xc5, 0x84, + 0x8c, 0x52, 0x3a, 0xa7, 0x70, 0x82, 0x37, 0x65, 0x67, 0x73, 0x82, 0x69, + 0xb6, 0xa8, 0x65, 0x60, 0xc1, 0x30, 0x5f, 0xd1, 0x3a, 0x9f, 0xd0, 0x90, + 
0xc3, 0x71, 0x43, 0x53, 0x30, 0xb9, 0xb4, 0x54, 0x37, 0x7c, 0x6f, 0x93, + 0x77, 0xb5, 0x6d, 0x3d, 0xc9, 0x66, 0x3c, 0xaa, 0x97, 0x5e, 0x7d, 0xc2, + 0x8d, 0xc0, 0x8a, 0x67, 0xc3, 0x8c, 0x9a, 0x71, 0x7b, 0x62, 0x67, 0x62, + 0x87, 0x4a, 0xbe, 0xa2, 0x75, 0x4f, 0xb6, 0xad, 0xca, 0xc4, 0xaf, 0x84, + 0xca, 0x5f, 0xac, 0x7b, 0x7f, 0x61, 0x36, 0x8e, 0x89, 0x8d, 0xac, 0xca, + 0x72, 0x53, 0x57, 0x6a, 0xc6, 0xa7, 0x3e, 0xc7, 0xb4, 0xb9, 0x81, 0x73, + 0x86, 0x90, 0x76, 0x88, 0xce, 0xbd, 0x65, 0x6c, 0xac, 0x50, 0xc8, 0x4e, + 0x67, 0x88, 0x5e, 0xca, 0x6a, 0x6c, 0x3e, 0x86, 0xbe, 0x91, 0x89, 0x41, + 0x2f, 0x5c, 0xa1, 0x9f, 0x5c, 0x7e, 0xb0, 0x56, 0xc8, 0x5f, 0x58, 0x76, + 0x37, 0x97, 0xa8, 0x79, 0x6a, 0x7b, 0xb6, 0xbd, 0x34, 0x51, 0x3b, 0x6a, + 0xd3, 0x93, 0x6c, 0x66, 0x30, 0x94, 0xbb, 0xd3, 0x40, 0x7e, 0xac, 0x5f, + 0xa4, 0x8f, 0x8e, 0x75, 0x87, 0xbb, 0x76, 0xc9, 0x70, 0xcd, 0x58, 0x7b, + 0x3c, 0xb1, 0x73, 0xc1, 0x49, 0x37, 0x32, 0x9c, 0x58, 0x77, 0x83, 0xab, + 0x4b, 0xd0, 0x4c, 0x50, 0x8a, 0xb1, 0x5d, 0xb4, 0x8b, 0x8a, 0x7f, 0x87, + 0x6f, 0x54, 0x2f, 0x67, 0xbe, 0x9d, 0x4c, 0x6e, 0x4d, 0xae, 0xa0, 0xcb, + 0xbe, 0x5d, 0x93, 0x75, 0xb4, 0xd8, 0x5d, 0xb2, 0xa2, 0xd2, 0x79, 0x36, + 0xb7, 0x8a, 0x58, 0x6d, 0xde, 0x47, 0x9c, 0x83, 0xcc, 0xa0, 0x46, 0x4b, + 0x6d, 0xac, 0x74, 0x4c, 0x58, 0xbb, 0xd0, 0x4e, 0xaa, 0x5f, 0xdc, 0xd0, + 0xcc, 0xaa, 0xcc, 0x4f, 0x53, 0x4b, 0x55, 0x7b, 0x45, 0x8e, 0xc2, 0x63, + 0x9f, 0x4c, 0x52, 0x9b, 0x82, 0x40, 0x41, 0xd5, 0x42, 0x46, 0x43, 0xd8, + 0x66, 0x39, 0x7a, 0x85, 0xd1, 0x64, 0xcf, 0x89, 0x82, 0x80, 0x92, 0x68, + 0x9c, 0xb8, 0xa6, 0x5d, 0x36, 0x78, 0xa2, 0xbe, 0x35, 0xb0, 0xb6, 0x4f, + 0xc6, 0x79, 0x76, 0x7c, 0x85, 0xa0, 0xbb, 0x6a, 0xc1, 0xad, 0xda, 0x78, + 0x86, 0x53, 0xfe, 0x73, 0xc0, 0xbd, 0x67, 0xa4, 0x5b, 0x81, 0xab, 0xb7, + 0xac, 0xd2, 0x40, 0x28, 0xc1, 0x86, 0x8a, 0x3f, 0x3e, 0xa5, 0x36, 0xcc, + 0xd2, 0x75, 0x59, 0x9e, 0x79, 0x4f, 0x81, 0xc0, 0xc0, 0x57, 0x4d, 0xbc, + 0x53, 0xbb, 0x45, 0x91, 0xc7, 0x5a, 0x63, 0x6b, 0x4d, 0x58, 0xb0, 0x62, + 0xcc, 0xc9, 0x95, 0x98, 0x99, 0xb9, 0xcd, 0x28, 0x74, 0xa6, 0xd3, 0xaa, + 0x67, 0x3b, 0x92, 0x58, 0x5c, 0x96, 0x87, 0x60, 0x52, 0x6f, 0xca, 0x98, + 0xa2, 0xa4, 0x7b, 0x83, 0x6d, 0x2c, 0xbe, 0x6c, 0x43, 0x36, 0xac, 0x44, + 0x86, 0xd1, 0x99, 0x53, 0x9d, 0x6b, 0x9e, 0x97, 0x9b, 0xb5, 0xa1, 0x85, + 0xc8, 0x85, 0x6a, 0x7b, 0xca, 0x96, 0xb6, 0x47, 0x85, 0xb3, 0x7a, 0x61, + 0x44, 0x88, 0xc1, 0x7c, 0x58, 0x98, 0x85, 0x45, 0x7a, 0x48, 0x3c, 0xa3, + 0x4e, 0xa9, 0xae, 0x72, 0xb2, 0xc6, 0xdd, 0xc9, 0x8e, 0xb9, 0x85, 0x9d, + 0xb6, 0xc7, 0x2a, 0x3a, 0x58, 0x65, 0x58, 0x7b, 0xca, 0x73, 0x8c, 0x2f, + 0xac, 0xc8, 0xc6, 0x42, 0xb1, 0x41, 0x4d, 0x46, 0xc4, 0xa9, 0xba, 0x63, + 0xc7, 0x76, 0x94, 0xb7, 0x2e, 0x46, 0x8e, 0x67, 0x63, 0x6e, 0x8e, 0x96, + 0x68, 0x61, 0xa5, 0x98, 0xd9, 0x46, 0xb4, 0xbe, 0x80, 0x78, 0xab, 0xb8, + 0x84, 0x9d, 0x63, 0x8b, 0x86, 0xaa, 0x6c, 0x69, 0x5b, 0x99, 0xa0, 0x4d, + 0xca, 0xcb, 0x99, 0x5e, 0x92, 0x57, 0xa1, 0x73, 0x88, 0xca, 0x94, 0xce, + 0x98, 0xa4, 0xef, 0x2f, 0xae, 0xb5, 0x92, 0x57, 0x70, 0x8c, 0x79, 0x6e, + 0x55, 0xb2, 0xc2, 0x78, 0x82, 0x86, 0x8c, 0x86, 0x79, 0xa5, 0x67, 0x3e, + 0x9f, 0x56, 0xac, 0x94, 0xa6, 0x70, 0x92, 0x7e, 0xaf, 0x64, 0x8c, 0x74, + 0x99, 0x70, 0x98, 0x75, 0xb9, 0x37, 0x60, 0x98, 0x83, 0x65, 0xcc, 0x6e, + 0x80, 0x47, 0x90, 0x71, 0x75, 0xab, 0xa9, 0x3d, 0x3c, 0xd5, 0x83, 0x5d, + 0x54, 0x58, 0xad, 0x57, 0xba, 0xab, 0xbc, 0x53, 0x6c, 0x5a, 0x88, 0xbc, + 0x91, 0x2d, 0xb2, 0x6b, 0x82, 0xd3, 0xa9, 0x8c, 0x48, 0x5a, 0x6d, 0xc0, + 0x9a, 0x73, 0x40, 0x94, 0x5b, 0xa9, 0xd4, 0xc9, 0x4b, 0xa2, 0x79, 0x2e, + 
0xc9, 0xb0, 0xdc, 0xc5, 0x3c, 0x53, 0x8d, 0x90, 0x6f, 0x4e, 0x7e, 0x7c, + 0x54, 0x34, 0xc3, 0xd0, 0xbc, 0xb1, 0xa2, 0xc5, 0x5d, 0x4c, 0x62, 0x9c, + 0xd3, 0x62, 0x8e, 0x5e, 0x64, 0x84, 0x8d, 0xcb, 0xab, 0x7c, 0xd8, 0x9e, + 0x36, 0xa8, 0x9b, 0x4e, 0xa9, 0x72, 0xb6, 0xcc, 0xa1, 0x82, 0x8a, 0xb9, + 0x86, 0xc3, 0x65, 0x52, 0xab, 0x6f, 0xc6, 0xce, 0x48, 0xa6, 0xbe, 0x97, + 0xb8, 0xaa, 0x60, 0x98, 0xb1, 0x43, 0x38, 0x70, 0x6a, 0x8d, 0x8f, 0x81, + 0x8c, 0x89, 0x7b, 0x40, 0x6f, 0xb5, 0x72, 0x61, 0xcd, 0x5d, 0xa2, 0x65, + 0xaf, 0x7b, 0x2a, 0x78, 0x6b, 0x3a, 0x83, 0x90, 0x38, 0x85, 0x3a, 0x7f, + 0x72, 0x3a, 0xab, 0xd0, 0xad, 0x76, 0xd5, 0xd0, 0x2f, 0xa5, 0x68, 0xd0, + 0xa7, 0xa1, 0x6c, 0x77, 0xa6, 0x65, 0xb4, 0x71, 0x51, 0x42, 0x9f, 0xad, + 0x37, 0xc3, 0xb5, 0x9b, 0x58, 0x6f, 0x74, 0x3b, 0x99, 0x8f, 0x4c, 0xbd, + 0xb2, 0x7d, 0x80, 0x68, 0x47, 0x39, 0xa3, 0x61, 0x81, 0x8c, 0xb7, 0x5e, + 0x7a, 0x86, 0x47, 0xda, 0x68, 0x89, 0x9d, 0x6e, 0xca, 0xb1, 0xa4, 0x65, + 0xd3, 0x84, 0x52, 0xbc, 0xbf, 0x9a, 0x5c, 0x6b, 0x8d, 0xb1, 0x4a, 0xd5, + 0xd1, 0x52, 0x7d, 0x81, 0xd3, 0xbb, 0x83, 0x63, 0x7d, 0xba, 0x96, 0xd3, + 0x94, 0x65, 0x49, 0x98, 0x48, 0x71, 0xc0, 0x53, 0x9c, 0x9e, 0xa3, 0xd1, + 0x53, 0xd2, 0x7a, 0xa8, 0x8f, 0xc0, 0x9e, 0x87, 0xbe, 0x6c, 0xb5, 0xd3, + 0x6a, 0x8b, 0x71, 0xa6, 0x6d, 0xad, 0x38, 0xbd, 0xa4, 0x1d, 0x4b, 0x49, + 0xa9, 0x2e, 0x52, 0xab, 0x8e, 0x6f, 0x83, 0x57, 0x70, 0x91, 0x60, 0xa9, + 0xac, 0x5d, 0x57, 0x67, 0x60, 0xc9, 0x7d, 0xd6, 0xc6, 0x43, 0xbc, 0x41, + 0xb3, 0x76, 0x78, 0x7e, 0x7e, 0xd1, 0xd4, 0x91, 0x71, 0x5b, 0x45, 0xdf, + 0x5d, 0x56, 0xbc, 0x89, 0xbc, 0x3d, 0x70, 0x81, 0x4d, 0x42, 0x42, 0xd3, + 0x90, 0xcf, 0xb1, 0x7f, 0xca, 0x97, 0x69, 0x85, 0xc9, 0x57, 0x94, 0xad, + 0x97, 0x7a, 0xce, 0x6a, 0x61, 0x95, 0x53, 0xa3, 0x8a, 0x7a, 0xc1, 0x36, + 0xa2, 0x4e, 0x67, 0xba, 0x99, 0xc3, 0xa3, 0x44, 0xb8, 0x67, 0x5d, 0xa7, + 0x4d, 0xcb, 0xbf, 0x5f, 0x72, 0x39, 0x8b, 0x82, 0x4b, 0x57, 0x36, 0xc1, + 0x39, 0xbb, 0x80, 0x31, 0xad, 0x85, 0x33, 0xb1, 0x26, 0xa8, 0x8e, 0x9c, + 0x48, 0x85, 0xdb, 0x5e, 0x85, 0x9c, 0x5f, 0x95, 0x93, 0xac, 0x8a, 0x84, + 0x63, 0x8a, 0x1b, 0x81, 0x31, 0xb8, 0x5c, 0x93, 0x93, 0xad, 0xd6, 0x46, + 0x84, 0xa9, 0x8d, 0x86, 0xa1, 0xbc, 0x90, 0xc4, 0x85, 0x9f, 0x69, 0x5d, + 0x73, 0x7f, 0x77, 0x39, 0x62, 0xd9, 0x93, 0x8a, 0xc1, 0x82, 0xc7, 0x72, + 0x5d, 0xb0, 0x47, 0x4a, 0x70, 0x83, 0x87, 0x9f, 0x71, 0xac, 0x85, 0x93, + 0xce, 0x7f, 0xd0, 0x7a, 0x34, 0x62, 0x92, 0x47, 0xae, 0x43, 0x4e, 0x3e, + 0x61, 0xb9, 0xdf, 0x63, 0x53, 0x39, 0x90, 0x9d, 0x89, 0xb4, 0x5a, 0x3e, + 0x5f, 0x4b, 0xc7, 0x45, 0x6b, 0x9f, 0x5e, 0xba, 0x3b, 0x3a, 0x4a, 0x61, + 0x94, 0xc1, 0x37, 0x78, 0xca, 0x71, 0xc4, 0x74, 0x66, 0xbe, 0xb1, 0x55, + 0x96, 0x96, 0x8d, 0xa4, 0x95, 0xa8, 0x62, 0x50, 0xd4, 0x46, 0x90, 0x8e, + 0x83, 0xd1, 0x7e, 0x48, 0x5b, 0xb0, 0x40, 0x5a, 0x51, 0xc7, 0xaf, 0x3d, + 0xaa, 0x84, 0xb5, 0xc6, 0x2c, 0x7a, 0x85, 0x6b, 0xa9, 0x8c, 0x44, 0x77, + 0x55, 0x33, 0x67, 0x9d, 0x6a, 0xd0, 0xc5, 0xba, 0x68, 0xcf, 0x84, 0x83, + 0xd1, 0x52, 0x69, 0x95, 0x9f, 0x8e, 0xc3, 0x8d, 0xa7, 0x84, 0xc5, 0xa6, + 0x4c, 0xa3, 0x4a, 0x53, 0x90, 0x7f, 0x51, 0x4a, 0x8a, 0x39, 0xb6, 0x89, + 0x9d, 0xc0, 0x7c, 0x80, 0x7c, 0x85, 0x9e, 0x55, 0x76, 0x9e, 0x99, 0x78, + 0x97, 0x8a, 0x77, 0x62, 0x83, 0xa4, 0xaa, 0x27, 0x63, 0x79, 0xd1, 0x89, + 0x27, 0x3c, 0x6b, 0xcc, 0x43, 0xb4, 0x46, 0x92, 0xb6, 0x8e, 0x43, 0xbd, + 0x5f, 0x8a, 0x5a, 0x7f, 0x65, 0xa8, 0xd4, 0x74, 0x7c, 0xd2, 0x41, 0x5d, + 0x8e, 0xa9, 0x9e, 0xb6, 0x75, 0x8f, 0x4e, 0xaa, 0x89, 0x8f, 0x98, 0x6b, + 0x38, 0xab, 0x3c, 0x65, 0xd0, 0x78, 0xcd, 0x66, 0x3d, 0xaf, 0x62, 0x36, + 
0x3f, 0x57, 0xad, 0x9c, 0x62, 0x4e, 0x7f, 0x66, 0xd2, 0x4e, 0xd2, 0x8e, + 0x42, 0xa2, 0x50, 0x95, 0x5d, 0x78, 0xb5, 0xaf, 0x56, 0x81, 0x69, 0xc0, + 0x91, 0x67, 0xaa, 0xb8, 0xd1, 0x5a, 0x8b, 0x9f, 0x62, 0x73, 0x70, 0x4c, + 0x4f, 0xb4, 0x6f, 0x6e, 0x70, 0x9a, 0x68, 0x66, 0x37, 0xae, 0x48, 0x49, + 0xb5, 0x50, 0x7c, 0xcb, 0x43, 0xb5, 0xbb, 0xa1, 0x3d, 0x32, 0x7e, 0x81, + 0xa9, 0x96, 0x3e, 0xbf, 0xba, 0xa5, 0xd1, 0x90, 0x5c, 0x7e, 0x4f, 0x81, + 0x58, 0x3c, 0xd1, 0x87, 0x56, 0x32, 0x56, 0xd2, 0xd2, 0xab, 0xcd, 0xb2, + 0x94, 0x4c, 0x45, 0x52, 0x69, 0x79, 0x77, 0xaf, 0xb7, 0x96, 0x8d, 0x8b, + 0x8d, 0x56, 0xac, 0xa0, 0x69, 0xa6, 0x51, 0x8e, 0x82, 0xae, 0x57, 0x8c, + 0x35, 0x90, 0x75, 0x7c, 0xb1, 0x54, 0x92, 0xcf, 0x56, 0xa8, 0xb5, 0x86, + 0xa1, 0x6e, 0x99, 0xb7, 0x79, 0x52, 0xb2, 0xda, 0x97, 0xde, 0xa5, 0x1e, + 0x4b, 0x4b, 0x72, 0xb8, 0xca, 0xae, 0xb6, 0x48, 0xb0, 0x7d, 0xba, 0x47, + 0xbb, 0x3e, 0x65, 0x57, 0x67, 0xbd, 0x83, 0x57, 0x3f, 0xc7, 0x65, 0x5b, + 0x63, 0xa4, 0x45, 0x4c, 0x85, 0x9b, 0x94, 0x57, 0x79, 0x34, 0x7c, 0xdb, + 0x6c, 0xa1, 0xaf, 0xa8, 0x61, 0x74, 0xa6, 0x81, 0xa0, 0x67, 0xbc, 0x66, + 0xa5, 0xac, 0xa8, 0xc8, 0xa6, 0xb5, 0x9f, 0xbb, 0x55, 0x6c, 0xb1, 0xb2, + 0x93, 0x5a, 0x64, 0x7c, 0xbb, 0x4f, 0x31, 0x60, 0xaf, 0xa9, 0x3f, 0x60, + 0x30, 0x4a, 0x50, 0x9a, 0x3d, 0x78, 0x85, 0x96, 0x72, 0x62, 0x64, 0x71, + 0x9d, 0x2f, 0x81, 0x4c, 0x30, 0x51, 0xb3, 0xcb, 0xb8, 0xc8, 0x89, 0x49, + 0xa5, 0x47, 0x3a, 0xa6, 0xb7, 0xca, 0x55, 0x2c, 0x6d, 0xa4, 0xca, 0x62, + 0x44, 0x30, 0x66, 0x5f, 0xb0, 0xc1, 0x7a, 0x94, 0x85, 0xc5, 0xbc, 0xdc, + 0x5e, 0x6d, 0x61, 0xcc, 0x38, 0x87, 0x7e, 0xca, 0x34, 0x4b, 0xa8, 0x34, + 0x36, 0x7b, 0xb7, 0x71, 0x70, 0xa6, 0xad, 0x41, 0x8b, 0x53, 0x96, 0x86, + 0xb9, 0x65, 0x8d, 0x40, 0x7c, 0x50, 0xcd, 0x7c, 0x9f, 0x76, 0x3d, 0xbb, + 0x48, 0x52, 0x79, 0xd1, 0x6a, 0x6f, 0xb7, 0x6e, 0x5a, 0x4e, 0x93, 0x43, + 0xc9, 0xc6, 0x49, 0x4c, 0x85, 0xc7, 0x58, 0x54, 0x71, 0x6d, 0xb3, 0x94, + 0xb7, 0xd3, 0x79, 0x6c, 0x9d, 0x6a, 0xc2, 0x9f, 0xa1, 0x5a, 0x82, 0x92, + 0x43, 0x54, 0xdb, 0x46, 0x8a, 0x73, 0x97, 0xa8, 0x60, 0x59, 0x79, 0xc8, + 0xa1, 0x72, 0xc9, 0x45, 0x80, 0x95, 0x4b, 0x98, 0xa6, 0x85, 0x35, 0x39, + 0xa0, 0xaf, 0x7d, 0x6a, 0xb6, 0xb8, 0xd4, 0x62, 0x84, 0x3c, 0x7c, 0x88, + 0xbe, 0x6b, 0x4a, 0x6c, 0x4b, 0x7b, 0x6b, 0x9d, 0x79, 0x34, 0x68, 0x9f, + 0xce, 0xa4, 0x83, 0x7d, 0xd3, 0x46, 0x8f, 0x5b, 0x9c, 0xa3, 0x6e, 0xd5, + 0xcd, 0x37, 0xbf, 0xca, 0x6e, 0x8d, 0x56, 0xbd, 0x55, 0x55, 0x70, 0x54, + 0x44, 0x8b, 0x4d, 0x85, 0x76, 0xaa, 0xab, 0x57, 0x36, 0xa0, 0x91, 0x3e, + 0x98, 0x5a, 0xb5, 0xb5, 0x5e, 0x25, 0xc2, 0xc8, 0x78, 0x71, 0x56, 0xc8, + 0x4f, 0x31, 0xac, 0x8a, 0x3b, 0xc6, 0x87, 0x9a, 0x38, 0x7d, 0x68, 0x5d, + 0xb6, 0xbb, 0x57, 0x99, 0x57, 0xbf, 0x7a, 0x70, 0xa8, 0x45, 0xb8, 0x80, + 0x74, 0x96, 0x3c, 0x50, 0x99, 0xb0, 0x7d, 0xa4, 0xa0, 0x7c, 0x6e, 0xa0, + 0x6a, 0x72, 0xc3, 0x73, 0xd8, 0xd4, 0x55, 0x74, 0xa9, 0xce, 0x34, 0x99, + 0x6a, 0xcf, 0xaa, 0x3d, 0xd7, 0x8f, 0x3c, 0xc5, 0x90, 0xce, 0x56, 0x39, + 0xb8, 0x64, 0x69, 0xa4, 0x9d, 0x4e, 0x4f, 0xb3, 0x39, 0x8e, 0xad, 0xc7, + 0x76, 0x79, 0x5e, 0x8a, 0xab, 0x92, 0xbf, 0x7d, 0x89, 0x79, 0xcf, 0xb4, + 0x91, 0xa1, 0xa9, 0x76, 0x9c, 0x58, 0x87, 0x88, 0xbd, 0xd2, 0x6f, 0x73, + 0x4e, 0x81, 0x3c, 0x5f, 0x92, 0x99, 0x93, 0x79, 0x58, 0x76, 0x64, 0xa6, + 0xae, 0xb6, 0x55, 0x35, 0x92, 0x96, 0x59, 0x80, 0x6a, 0xa5, 0x34, 0x9c, + 0xb2, 0x60, 0x9d, 0x3d, 0x73, 0xb8, 0xc8, 0xb3, 0x5f, 0x74, 0x71, 0xc9, + 0x45, 0x7e, 0xb7, 0x48, 0xbc, 0xa8, 0x6b, 0x6b, 0x45, 0x36, 0x7a, 0x95, + 0xc0, 0x68, 0x3d, 0x43, 0x53, 0x38, 0xce, 0x87, 0x79, 0xb3, 0x64, 0x74, + 
0xa5, 0x6e, 0xbe, 0x84, 0x4b, 0xb4, 0x36, 0x67, 0x67, 0x63, 0xba, 0x55, + 0x8b, 0x7b, 0x7d, 0x95, 0x8e, 0x9d, 0x4c, 0x4b, 0x9c, 0xb4, 0x9a, 0x59, + 0x5f, 0x98, 0x9e, 0x92, 0x7d, 0x79, 0x66, 0xa9, 0x3a, 0xcd, 0x9d, 0xad, + 0xbe, 0x6f, 0x96, 0x4d, 0x51, 0x64, 0x33, 0xc6, 0x8a, 0x60, 0x7f, 0x8f, + 0x84, 0xa6, 0xaf, 0x6c, 0x55, 0x55, 0x35, 0xad, 0x67, 0x66, 0x6f, 0x93, + 0x47, 0x3c, 0x4a, 0x92, 0xae, 0x91, 0x68, 0x92, 0xc5, 0xc8, 0x62, 0x3c, + 0x75, 0xb6, 0x4e, 0x89, 0x51, 0xb8, 0xb5, 0x45, 0x95, 0x29, 0x86, 0x4d, + 0x5f, 0x78, 0x2e, 0xb1, 0x45, 0x96, 0x8f, 0xc8, 0x40, 0xc1, 0xcb, 0xb6, + 0xc2, 0x90, 0x49, 0xa9, 0xa9, 0xc4, 0x76, 0xac, 0x4b, 0x5f, 0x9a, 0x5e, + 0x62, 0x64, 0x47, 0x6c, 0x63, 0x86, 0x37, 0x31, 0x7d, 0xc6, 0xc6, 0x31, + 0xc0, 0xc9, 0xa4, 0xa0, 0xac, 0x39, 0x63, 0xb0, 0x6a, 0x9c, 0x69, 0x97, + 0x4d, 0x5c, 0xb4, 0x4d, 0x3f, 0x7d, 0x84, 0x44, 0x9f, 0x7d, 0x77, 0x4d, + 0xd3, 0x52, 0xca, 0xba, 0x53, 0x3d, 0x71, 0xad, 0x61, 0x3e, 0x77, 0x7b, + 0x5e, 0xa8, 0x9f, 0x8f, 0x8b, 0x94, 0x77, 0x92, 0x5c, 0x94, 0xc1, 0x97, + 0xcd, 0xbb, 0xc8, 0xbe, 0xd3, 0x53, 0xa5, 0xa8, 0x83, 0x7e, 0x9d, 0x9f, + 0x39, 0x6f, 0x5a, 0x59, 0x93, 0x3a, 0x4f, 0xd3, 0x51, 0x9a, 0xb0, 0x4c, + 0x82, 0x4c, 0x66, 0x91, 0x59, 0x74, 0x56, 0xa3, 0x8b, 0x3e, 0x82, 0x88, + 0x55, 0x8c, 0x9b, 0x38, 0x68, 0x2d, 0x71, 0x89, 0xbe, 0xb3, 0xac, 0x74, + 0x68, 0x56, 0x49, 0xd6, 0x8f, 0x68, 0x9a, 0x78, 0x36, 0x98, 0x9a, 0xd6, + 0x58, 0x40, 0x47, 0x47, 0x75, 0x66, 0xb5, 0x91, 0xb4, 0x9f, 0xb5, 0x34, + 0xd2, 0x78, 0x6e, 0x60, 0x71, 0xb6, 0x37, 0xb8, 0x84, 0x97, 0x63, 0x45, + 0xd3, 0xb4, 0x80, 0xbd, 0x32, 0xa5, 0xcd, 0x73, 0x9c, 0x68, 0x88, 0x88, + 0x41, 0x9a, 0x39, 0x7c, 0x38, 0xac, 0x5e, 0xcb, 0x67, 0x65, 0xc3, 0x5e, + 0xcd, 0x42, 0x55, 0xb7, 0x82, 0x8f, 0x4c, 0x89, 0xc7, 0x80, 0x7b, 0x75, + 0x42, 0x32, 0xad, 0xb1, 0x39, 0xc3, 0x3f, 0x87, 0x90, 0xae, 0x8a, 0x3c, + 0xc8, 0x46, 0x7f, 0xb6, 0xd0, 0x76, 0x8f, 0xab, 0x4a, 0x64, 0xa3, 0x71, + 0x65, 0x8b, 0xa1, 0x54, 0x60, 0xb7, 0xd0, 0xc9, 0xab, 0x6a, 0x50, 0xbb, + 0x7e, 0xab, 0xa0, 0x6b, 0x3f, 0x50, 0x95, 0x67, 0x6f, 0xc0, 0xb6, 0x8e, + 0x98, 0xcc, 0x8a, 0x56, 0x5e, 0x74, 0x44, 0x8f, 0x7f, 0x92, 0x84, 0x60, + 0x54, 0x3b, 0xb1, 0xb8, 0x67, 0x7d, 0x5d, 0xa5, 0x46, 0x68, 0xa3, 0x6c, + 0xda, 0x33, 0xb8, 0x7b, 0x5c, 0xbe, 0x74, 0x62, 0xad, 0x7a, 0xbc, 0x48, + 0xc4, 0xbf, 0x94, 0xa4, 0x40, 0x8e, 0xa0, 0xb9, 0x93, 0xc1, 0x74, 0x91, + 0x67, 0x3c, 0x84, 0x71, 0x62, 0xbd, 0xac, 0x8f, 0x56, 0x53, 0x7c, 0xc9, + 0xb7, 0x96, 0x5e, 0x5c, 0xcf, 0x62, 0x39, 0xa2, 0xa6, 0x90, 0xb6, 0x57, + 0x7c, 0xb2, 0x83, 0x51, 0x5b, 0x4a, 0x4c, 0x77, 0x8d, 0x4b, 0x42, 0xc1, + 0xb2, 0x38, 0x3e, 0xbc, 0x5d, 0xac, 0x6f, 0xc8, 0x66, 0x40, 0xa0, 0x9c, + 0x3c, 0x9d, 0x2c, 0x69, 0x68, 0xc4, 0xb8, 0x83, 0x34, 0x8d, 0x34, 0xb8, + 0xb4, 0xaf, 0xcf, 0x43, 0x3c, 0x3c, 0x3a, 0x46, 0xb8, 0x88, 0x91, 0xc6, + 0xc7, 0x35, 0xc1, 0x72, 0x7f, 0x3c, 0x75, 0x99, 0xad, 0x3b, 0xa2, 0x72, + 0x6a, 0xa3, 0x82, 0xd2, 0x37, 0xc9, 0x52, 0xcc, 0x99, 0x66, 0x97, 0x80, + 0x4c, 0xaa, 0x43, 0xae, 0x92, 0xa4, 0xb5, 0x5f, 0x55, 0x75, 0x70, 0xc0, + 0xae, 0x5b, 0x6f, 0xcf, 0x74, 0xac, 0x39, 0xa2, 0x40, 0x73, 0x7b, 0xd1, + 0x5b, 0x5b, 0xa6, 0x9f, 0x9d, 0x7b, 0xc6, 0x8b, 0xb6, 0x44, 0x67, 0x8e, + 0xa1, 0x39, 0x32, 0xbd, 0xb1, 0x6a, 0xbe, 0xb6, 0xb4, 0xc0, 0x39, 0x9e, + 0x58, 0xca, 0x8e, 0x41, 0xa6, 0x5c, 0x97, 0x81, 0x71, 0x5d, 0x68, 0x60, + 0x83, 0x3c, 0x58, 0x91, 0x6e, 0x6e, 0xa2, 0xc0, 0x66, 0xc9, 0x5b, 0x64, + 0x84, 0xac, 0x62, 0x9a, 0x96, 0x68, 0x5a, 0x37, 0x65, 0x62, 0x8f, 0x90, + 0xc9, 0xa2, 0x7f, 0x99, 0x65, 0x7e, 0x49, 0x37, 0x5e, 0x7b, 0x51, 0x62, + 
0x3f, 0x52, 0xc6, 0x6e, 0x7b, 0x3e, 0x34, 0x4b, 0x9f, 0x48, 0x6e, 0x4b, + 0x6f, 0xcc, 0xc1, 0xcf, 0xb4, 0x77, 0x51, 0xbf, 0x9e, 0x51, 0x47, 0xba, + 0x8b, 0x7e, 0x43, 0xbe, 0x7a, 0x81, 0x60, 0x47, 0x7f, 0x8e, 0xaf, 0xae, + 0xc0, 0x89, 0x66, 0x4e, 0x3e, 0xa3, 0x99, 0xc0, 0xab, 0xb9, 0x48, 0x4c, + 0x97, 0x95, 0xc0, 0x56, 0x90, 0x45, 0x9e, 0x92, 0x4c, 0x3e, 0x36, 0x7a, + 0xcc, 0xc5, 0x6c, 0xbb, 0x40, 0xb6, 0x3e, 0xaa, 0x53, 0x9c, 0x78, 0x9d, + 0x6e, 0xa9, 0x68, 0x83, 0xbd, 0x89, 0xa4, 0xaf, 0x46, 0xd4, 0xac, 0xb5, + 0x7c, 0xd2, 0xc7, 0x5e, 0xd7, 0x8a, 0xcb, 0x39, 0xbb, 0x3e, 0x71, 0x87, + 0x76, 0x94, 0x9a, 0x9a, 0x74, 0x40, 0x8a, 0x52, 0x81, 0x31, 0x3d, 0xac, + 0x2e, 0x36, 0x91, 0x73, 0xa9, 0xae, 0x67, 0x98, 0x3e, 0x87, 0x68, 0xd0, + 0xa4, 0x59, 0xa0, 0x90, 0x36, 0x46, 0x4f, 0x87, 0x4f, 0xd4, 0xcc, 0x79, + 0xa6, 0x8e, 0xb7, 0x81, 0xd5, 0x68, 0x5f, 0xa0, 0x3e, 0x8b, 0x56, 0x46, + 0x50, 0x62, 0xc6, 0x4d, 0x7d, 0x7b, 0xcc, 0x3e, 0x50, 0xcd, 0x9f, 0xa4, + 0xd9, 0x6c, 0x66, 0xa4, 0x73, 0x4a, 0x5d, 0x9b, 0x6a, 0x44, 0x5d, 0xa0, + 0x96, 0xb6, 0x90, 0x64, 0x5a, 0x83, 0x8f, 0x69, 0x65, 0xa9, 0x7e, 0x84, + 0x5e, 0x76, 0x7b, 0x68, 0xac, 0x6d, 0x87, 0x6f, 0x96, 0x42, 0x7f, 0x35, + 0x76, 0x37, 0x60, 0x5b, 0x4e, 0x6a, 0xc0, 0x48, 0x78, 0x55, 0xb9, 0x9a, + 0x57, 0x83, 0xd3, 0x9f, 0x60, 0xcb, 0x6c, 0x35, 0x5e, 0x5a, 0x8c, 0xcf, + 0x63, 0x8a, 0x92, 0x97, 0xb7, 0x5e, 0x81, 0x2c, 0xd3, 0xbd, 0xb3, 0xc5, + 0x39, 0x89, 0x7a, 0xbc, 0x47, 0x35, 0xa2, 0x94, 0x45, 0x53, 0x50, 0x83, + 0x9a, 0xc1, 0xc5, 0x41, 0xb0, 0x75, 0x66, 0x45, 0x58, 0x79, 0x85, 0x4d, + 0xb0, 0x55, 0x6f, 0x3d, 0x6f, 0x61, 0x97, 0xc7, 0x4b, 0xcf, 0x7f, 0x8c, + 0x88, 0x6f, 0x4f, 0x6d, 0xc7, 0x85, 0x6f, 0x4e, 0xa0, 0x9c, 0xb5, 0x3a, + 0x78, 0x39, 0x87, 0x37, 0x61, 0x82, 0xa6, 0xb6, 0x47, 0x46, 0x6a, 0x74, + 0x9d, 0x36, 0x8c, 0x8d, 0xa7, 0x9a, 0x8c, 0x50, 0x6c, 0x41, 0x4e, 0xc5, + 0x3c, 0x68, 0xa3, 0x8e, 0x7d, 0x4e, 0x5c, 0xa3, 0x8a, 0xb4, 0x3e, 0x95, + 0x3e, 0x81, 0x75, 0x93, 0x99, 0xcc, 0x50, 0x42, 0x68, 0xd9, 0x78, 0x96, + 0x33, 0x81, 0xd5, 0xc2, 0x4c, 0x57, 0x62, 0x7d, 0x77, 0x71, 0xc2, 0x53, + 0x8a, 0x8b, 0xc0, 0x67, 0xaf, 0x48, 0xad, 0xbd, 0xc0, 0xc8, 0x67, 0xab, + 0x71, 0xdb, 0xa8, 0x83, 0x59, 0xb4, 0x8b, 0x75, 0x8e, 0x97, 0x77, 0xda, + 0x82, 0x77, 0xc3, 0x73, 0xb0, 0xb7, 0xd9, 0xd5, 0xb3, 0x76, 0xa2, 0x45, + 0x75, 0x36, 0x96, 0x8e, 0x41, 0x84, 0x76, 0xc2, 0x76, 0x96, 0xbf, 0x57, + 0x48, 0x9a, 0x59, 0xae, 0xb8, 0x35, 0x98, 0x4d, 0xcb, 0xaf, 0x3a, 0x91, + 0xa0, 0xa6, 0xa2, 0x60, 0xb2, 0xbb, 0xb2, 0x35, 0x4f, 0xc2, 0xad, 0x65, + 0xc7, 0x53, 0x74, 0xa6, 0x69, 0x79, 0xb3, 0x86, 0x4d, 0x4b, 0xb1, 0x81, + 0x44, 0x8c, 0x96, 0x6b, 0x48, 0x4f, 0x97, 0x75, 0x7a, 0x94, 0x3b, 0x49, + 0xb7, 0x4f, 0x93, 0x71, 0xac, 0x47, 0xbd, 0x77, 0x65, 0x3a, 0x55, 0x39, + 0xc3, 0x80, 0x9f, 0xbf, 0x4d, 0x36, 0x82, 0xc6, 0x40, 0x62, 0x71, 0xb2, + 0x77, 0xad, 0x4f, 0x6d, 0x6a, 0x58, 0x68, 0x6a, 0x75, 0x9b, 0x70, 0x4f, + 0x7d, 0x79, 0x77, 0x87, 0xaf, 0x58, 0xd1, 0x6b, 0x40, 0x71, 0x9e, 0xb3, + 0x78, 0x9e, 0xad, 0x56, 0x6c, 0x5e, 0x70, 0x73, 0xa1, 0x55, 0x5f, 0x85, + 0x42, 0x87, 0x98, 0x57, 0x76, 0x8b, 0x7f, 0x3a, 0x50, 0xba, 0x2c, 0x45, + 0x60, 0x3f, 0xa2, 0xc7, 0x4c, 0x5f, 0xa1, 0x7f, 0x62, 0x56, 0x9b, 0x75, + 0x95, 0x6f, 0xd0, 0x6b, 0x88, 0x4e, 0xbd, 0x99, 0x68, 0x88, 0x5b, 0x9d, + 0xcd, 0x9c, 0xa6, 0x8a, 0xce, 0x9f, 0xbc, 0x8d, 0x9f, 0x43, 0x3e, 0x38, + 0xa6, 0x6b, 0xb3, 0xaa, 0x35, 0x33, 0x71, 0x7e, 0xd5, 0xb2, 0x81, 0xa0, + 0xa8, 0x9b, 0x47, 0xb5, 0xa0, 0x83, 0xd4, 0xba, 0xb2, 0xc5, 0x6d, 0x3f, + 0xcf, 0xb1, 0xa3, 0xd3, 0x8f, 0x6c, 0x48, 0xb2, 0xaf, 0x25, 0xa6, 0x42, + 
0x5e, 0xb3, 0x77, 0xa4, 0x59, 0xb5, 0x3d, 0x95, 0x3b, 0x5d, 0xd5, 0xac, + 0xbe, 0x94, 0xa4, 0x8d, 0x7d, 0xc4, 0x99, 0xa9, 0x49, 0xc0, 0x51, 0x40, + 0x78, 0x43, 0xc5, 0x97, 0xa4, 0xdd, 0x8c, 0x56, 0x44, 0x34, 0x85, 0x88, + 0x6c, 0x84, 0x81, 0xbc, 0x6e, 0x6f, 0xd2, 0x3b, 0x57, 0x7b, 0x69, 0x62, + 0x53, 0x91, 0x42, 0x38, 0x48, 0xc2, 0x84, 0x4a, 0x54, 0x53, 0x8b, 0x97, + 0xce, 0xb2, 0xaf, 0x85, 0xbc, 0x7c, 0x82, 0xb1, 0x6d, 0xb9, 0x4c, 0xa7, + 0xaf, 0x6d, 0x67, 0x6f, 0x65, 0x33, 0x44, 0x53, 0x78, 0x7e, 0x65, 0x67, + 0x9e, 0x76, 0xaf, 0x51, 0x59, 0x84, 0x50, 0x70, 0xbc, 0x8c, 0xd3, 0x50, + 0x6e, 0x69, 0xc5, 0x6e, 0x72, 0x83, 0x81, 0x5b, 0x74, 0x6f, 0x56, 0xbd, + 0x6b, 0xcc, 0xd5, 0x83, 0x42, 0x4c, 0x56, 0x50, 0x8e, 0xab, 0x4c, 0x5f, + 0x59, 0x5a, 0x90, 0x5c, 0x8a, 0x61, 0x62, 0xc8, 0x6a, 0x60, 0xc6, 0x54, + 0x9e, 0x98, 0x86, 0xab, 0xaa, 0xa3, 0x57, 0x44, 0xa9, 0x63, 0x6a, 0xb8, + 0x97, 0xa2, 0x92, 0x97, 0x69, 0x60, 0x5c, 0x45, 0xa7, 0x71, 0x53, 0x5f, + 0xbc, 0x78, 0xce, 0xc0, 0xbd, 0x4c, 0x6b, 0x55, 0xd2, 0x74, 0x7e, 0x83, + 0x33, 0xd4, 0x87, 0xb6, 0x60, 0x9e, 0x9e, 0xa4, 0x62, 0xcc, 0xc1, 0xcb, + 0x66, 0x37, 0x7a, 0x85, 0x69, 0x62, 0x92, 0xc7, 0x7e, 0xbf, 0x48, 0x7c, + 0xd0, 0x4d, 0xb7, 0x6f, 0x74, 0x40, 0x66, 0x7d, 0xdb, 0x66, 0x8b, 0x8b, + 0x76, 0x75, 0xbc, 0xc6, 0xcb, 0x39, 0x9c, 0xa8, 0x66, 0xc9, 0xcf, 0x45, + 0x84, 0x48, 0x44, 0x45, 0xb8, 0x5d, 0xb2, 0x4c, 0xbe, 0x70, 0x4e, 0xb1, + 0x7f, 0xbe, 0x86, 0x74, 0x40, 0x7f, 0x8f, 0xba, 0x7d, 0x70, 0xb8, 0xc0, + 0xbb, 0xc7, 0x56, 0x60, 0x47, 0x79, 0x4e, 0x3a, 0x6a, 0x46, 0x8f, 0x96, + 0x88, 0x91, 0xd4, 0x3e, 0xc9, 0x51, 0xa1, 0x64, 0xc2, 0x7d, 0x51, 0xd0, + 0x8c, 0x8c, 0xaa, 0x50, 0xab, 0x59, 0x50, 0x53, 0x94, 0x97, 0x89, 0x5c, + 0x56, 0x5a, 0x9c, 0x37, 0x34, 0x65, 0x86, 0x42, 0x81, 0x49, 0xcc, 0xc7, + 0x58, 0x87, 0xb7, 0x5d, 0x7c, 0x61, 0x3b, 0x88, 0x5b, 0x74, 0x9c, 0x4f, + 0x77, 0x7e, 0x8b, 0x55, 0x83, 0xc6, 0x7e, 0x8c, 0xb0, 0xc3, 0x59, 0x57, + 0x9b, 0x49, 0x40, 0xcd, 0x51, 0xd1, 0x61, 0x84, 0x7c, 0x77, 0x9d, 0x91, + 0x8e, 0x35, 0x4b, 0x35, 0x3a, 0xa9, 0x9d, 0x40, 0x33, 0xb8, 0x7f, 0xd1, + 0xbe, 0x35, 0x38, 0x9d, 0x98, 0xcb, 0x78, 0xbe, 0x9d, 0x6b, 0x50, 0xac, + 0x82, 0x76, 0x89, 0xb0, 0x93, 0x5b, 0x63, 0x3e, 0x6a, 0x48, 0xb4, 0xbe, + 0xc4, 0xb7, 0x8d, 0x69, 0x46, 0xb5, 0xc6, 0xd6, 0xc0, 0x41, 0x89, 0x6a, + 0x8d, 0x62, 0x7b, 0x67, 0x70, 0x30, 0x8a, 0x7a, 0xb8, 0x73, 0x9d, 0x94, + 0x95, 0x85, 0x3c, 0xb2, 0x8d, 0x4c, 0x4f, 0x85, 0x42, 0x9d, 0x42, 0x44, + 0x84, 0x60, 0x62, 0x76, 0x3c, 0xbf, 0x36, 0xa9, 0xa9, 0xad, 0x63, 0x69, + 0x65, 0x3c, 0x3d, 0xc3, 0xcc, 0x90, 0xce, 0x7c, 0xa8, 0x91, 0xc4, 0xc3, + 0x76, 0xad, 0xbc, 0x7d, 0x42, 0xd6, 0x61, 0x45, 0xab, 0x9d, 0xbb, 0x93, + 0x9d, 0x94, 0x9f, 0x4e, 0xd9, 0x86, 0xbc, 0xcc, 0x4d, 0x71, 0x6d, 0x43, + 0x7d, 0x68, 0xd4, 0x9a, 0xb2, 0x6d, 0xbb, 0xa8, 0x76, 0xc4, 0xc2, 0x41, + 0x49, 0xa4, 0x84, 0xbe, 0xcc, 0x97, 0xb1, 0xaa, 0xc1, 0xd2, 0x8e, 0xc4, + 0x69, 0xba, 0x6f, 0xcc, 0x38, 0x4a, 0x43, 0x48, 0x58, 0xae, 0x6c, 0x53, + 0x66, 0x64, 0xcc, 0x6e, 0x43, 0x2c, 0x69, 0x72, 0x90, 0xb2, 0x77, 0x7c, + 0x8c, 0x6c, 0xc2, 0x78, 0xc4, 0xa2, 0x76, 0x41, 0x4b, 0x9a, 0xcd, 0x78, + 0x82, 0xc6, 0xb2, 0x3f, 0x4b, 0x4e, 0x8d, 0x79, 0xd4, 0x82, 0x69, 0xa4, + 0xb1, 0x74, 0x70, 0x54, 0x8a, 0xa2, 0x7d, 0xc2, 0xaa, 0xbf, 0x96, 0x33, + 0x83, 0x39, 0x3c, 0x6d, 0x9d, 0x95, 0xc9, 0x89, 0x6a, 0x75, 0x54, 0x58, + 0x3c, 0xa5, 0x4c, 0x52, 0x5f, 0x7b, 0x82, 0xb3, 0x8c, 0xaf, 0xba, 0x44, + 0x98, 0xac, 0x7d, 0x64, 0x48, 0xc1, 0x3e, 0x95, 0x6a, 0xb8, 0xb7, 0xc3, + 0x52, 0x30, 0x57, 0xb4, 0x59, 0xb4, 0xb9, 0x62, 0x6f, 0x82, 0x61, 0x58, + 
0x84, 0x3d, 0xa6, 0x2f, 0x81, 0x98, 0x89, 0x4f, 0x4d, 0x45, 0x38, 0xb1, + 0x6d, 0x6f, 0x70, 0x94, 0x73, 0xb8, 0x7d, 0x84, 0x3d, 0x4f, 0x9a, 0x4b, + 0x80, 0x7d, 0x58, 0x96, 0xa4, 0x9f, 0x3b, 0xb4, 0x82, 0x52, 0x4a, 0xb2, + 0x8f, 0x5a, 0x4d, 0x75, 0x49, 0x87, 0x80, 0x7d, 0x78, 0x6f, 0x6a, 0x8a, + 0x6c, 0x44, 0x74, 0x4f, 0xb1, 0x52, 0xac, 0xab, 0x5d, 0x5d, 0xc6, 0xb7, + 0x5d, 0xa3, 0xa5, 0x4d, 0xb9, 0x3d, 0xcb, 0x4e, 0x3f, 0x55, 0x5e, 0x6f, + 0x5b, 0x31, 0xbf, 0xbb, 0x9b, 0x33, 0xcd, 0xc7, 0xd0, 0x49, 0x8e, 0x6e, + 0x86, 0x8f, 0x3b, 0xb6, 0x67, 0x6b, 0x9f, 0x7f, 0xc1, 0x5f, 0xc6, 0x59, + 0x7b, 0x69, 0x57, 0x53, 0xca, 0x6d, 0x8e, 0xc6, 0xbd, 0x90, 0x8f, 0x52, + 0x4f, 0x35, 0x5e, 0xce, 0xb5, 0xc0, 0x7e, 0x4d, 0x35, 0xba, 0x77, 0x47, + 0x4d, 0xb4, 0xba, 0xb8, 0xbb, 0xba, 0x50, 0x7e, 0x55, 0x6d, 0x44, 0x5e, + 0x50, 0x5d, 0x7a, 0x7b, 0x98, 0x69, 0x74, 0xa7, 0x74, 0x62, 0xc3, 0xae, + 0x89, 0x57, 0x46, 0x3d, 0x53, 0xc3, 0x4b, 0xa1, 0xaf, 0x81, 0x65, 0x2c, + 0xa1, 0x56, 0x44, 0x84, 0xc3, 0xbe, 0x87, 0x8e, 0x77, 0x56, 0x5d, 0x46, + 0xa6, 0x53, 0x59, 0x5b, 0x93, 0x57, 0x47, 0x8a, 0x6e, 0xae, 0xa3, 0x32, + 0x4e, 0xae, 0xbe, 0x32, 0xaf, 0x74, 0x97, 0x49, 0x7d, 0x7f, 0x55, 0x54, + 0xb2, 0x46, 0xae, 0x8f, 0x51, 0x75, 0x60, 0x5e, 0x91, 0x70, 0xaf, 0xcc, + 0xab, 0x5b, 0x38, 0x8b, 0xa3, 0x76, 0x43, 0x36, 0x65, 0xa1, 0xd6, 0x97, + 0xbb, 0x31, 0xa5, 0x72, 0xad, 0xc6, 0xc3, 0x4d, 0x4a, 0xc8, 0x38, 0x85, + 0xcf, 0x4c, 0x64, 0xcd, 0x5d, 0xa4, 0x70, 0x3a, 0x73, 0x94, 0x5a, 0x56, + 0x6e, 0x3d, 0x8d, 0xc6, 0xb6, 0x4f, 0x9c, 0x91, 0x59, 0x4c, 0x42, 0xbb, + 0x45, 0x96, 0x77, 0x58, 0x49, 0xa3, 0x5f, 0x62, 0xac, 0x3d, 0x53, 0xd2, + 0x47, 0x6d, 0x3e, 0xa3, 0xd1, 0x3b, 0x94, 0x5c, 0x3f, 0x68, 0x6f, 0xb5, + 0x3b, 0x9e, 0xb6, 0xbf, 0xcc, 0x78, 0x5b, 0x81, 0x6a, 0x62, 0x84, 0x82, + 0xae, 0x64, 0xa0, 0x90, 0x52, 0x70, 0x5a, 0x93, 0x7f, 0x3e, 0xcd, 0x8e, + 0x38, 0xa1, 0x87, 0xa7, 0x6c, 0xc3, 0x8a, 0xbd, 0x6c, 0x87, 0xba, 0x7a, + 0x9b, 0x3c, 0x77, 0x95, 0xd1, 0x77, 0xc6, 0x80, 0x78, 0xc1, 0x6d, 0x63, + 0x4f, 0xca, 0x92, 0x41, 0xad, 0xb8, 0xa0, 0xbb, 0xc3, 0x6b, 0x63, 0x6b, + 0x3a, 0xbd, 0x63, 0xa4, 0x2e, 0x3b, 0x72, 0xc6, 0x8b, 0x8e, 0xd8, 0xc3, + 0x39, 0x33, 0x56, 0x62, 0x9d, 0x5d, 0xcc, 0x61, 0x4a, 0xb1, 0x74, 0x3e, + 0xc2, 0xa4, 0x3b, 0x91, 0x5e, 0x97, 0x7a, 0x75, 0x4a, 0x54, 0x74, 0x86, + 0xca, 0xb8, 0x9a, 0x95, 0xa0, 0x4b, 0x68, 0x64, 0xb0, 0x56, 0x70, 0x76, + 0x46, 0xc7, 0x7b, 0xc3, 0x95, 0xb3, 0x34, 0x8c, 0xa1, 0x60, 0x4e, 0x3d, + 0x9e, 0x6b, 0x78, 0x6b, 0x6e, 0x38, 0xc5, 0xa7, 0x2e, 0x7a, 0x8f, 0xc7, + 0x72, 0x63, 0x80, 0x9c, 0x41, 0x4a, 0x89, 0x84, 0x31, 0x36, 0x51, 0x47, + 0x58, 0x40, 0x79, 0xb1, 0x8a, 0x68, 0x83, 0xbf, 0x6d, 0xc3, 0x8d, 0xd4, + 0xbf, 0x5d, 0x6a, 0x58, 0xaa, 0xbb, 0x73, 0x62, 0x54, 0x6a, 0xad, 0xcf, + 0x62, 0x9a, 0x84, 0x32, 0x5b, 0xb5, 0x70, 0x87, 0x3c, 0x9a, 0x9f, 0xb0, + 0x3f, 0xa2, 0x68, 0xa7, 0xaf, 0x7a, 0x83, 0xc4, 0x7c, 0x99, 0x6b, 0x58, + 0x8c, 0xaa, 0x6e, 0xb1, 0x88, 0x5b, 0xb5, 0x6b, 0x43, 0xcb, 0x8e, 0xca, + 0x96, 0xb5, 0x43, 0x32, 0x63, 0x95, 0x6e, 0x9d, 0xb4, 0x84, 0x4b, 0x67, + 0xb6, 0x62, 0x3f, 0xa2, 0xa8, 0x95, 0x6a, 0x41, 0xa1, 0x3e, 0x36, 0x8f, + 0xb9, 0x8f, 0x50, 0xa1, 0x71, 0xa3, 0x9a, 0x8e, 0xa7, 0xc2, 0x3d, 0x6f, + 0x70, 0xa3, 0xcb, 0x52, 0x92, 0xa0, 0x5e, 0xb4, 0x3d, 0x52, 0x99, 0x97, + 0xb2, 0x9d, 0xbe, 0x45, 0x99, 0x67, 0xa1, 0x60, 0x61, 0xb4, 0x9b, 0x6b, + 0x42, 0x2e, 0x81, 0x54, 0x65, 0x42, 0xbd, 0xb3, 0x69, 0x66, 0x7a, 0xbd, + 0x89, 0x41, 0x60, 0x7b, 0x4b, 0x9b, 0x52, 0x46, 0x66, 0xa9, 0x6e, 0xbe, + 0x56, 0x77, 0xbe, 0x9b, 0xc9, 0x8e, 0x3b, 0xa1, 0xcb, 0x5d, 0x2f, 0xbb, + 
0x8a, 0x51, 0x7f, 0x4f, 0x4c, 0xc4, 0xa8, 0x67, 0xa9, 0x53, 0x90, 0x7d, + 0x68, 0x92, 0xc3, 0x82, 0x61, 0x9d, 0x74, 0x67, 0x72, 0x8f, 0xc5, 0x62, + 0x3b, 0x32, 0x9c, 0xcf, 0x76, 0xa4, 0x6d, 0x30, 0xa1, 0x7f, 0x76, 0x68, + 0x46, 0x35, 0xcc, 0x38, 0xc5, 0x99, 0xbf, 0xb8, 0x5b, 0x3b, 0x96, 0xbe, + 0x77, 0xa8, 0xc6, 0x72, 0x8e, 0x82, 0x7e, 0x94, 0x6e, 0xa0, 0xbd, 0x91, + 0x7d, 0xcc, 0x9e, 0xc0, 0xb4, 0x3f, 0x4f, 0x52, 0x9c, 0xa3, 0x47, 0x57, + 0x36, 0x84, 0x76, 0x46, 0x87, 0xa7, 0x76, 0x54, 0x34, 0x9a, 0x3a, 0x62, + 0xc6, 0xbb, 0x49, 0xa4, 0xc3, 0x9c, 0x75, 0xb5, 0xb7, 0x56, 0x48, 0xc9, + 0xb3, 0xb7, 0x86, 0x85, 0x49, 0x3b, 0xad, 0xc7, 0x2d, 0x58, 0x58, 0x42, + 0x4d, 0xaf, 0x7e, 0x69, 0x9e, 0x4a, 0x3e, 0x4b, 0x74, 0xc8, 0x98, 0x42, + 0x9d, 0x43, 0xba, 0x34, 0x84, 0x96, 0xa6, 0x5c, 0xc3, 0xc7, 0x6e, 0x65, + 0x8f, 0x49, 0xa1, 0x65, 0x55, 0x7b, 0x4d, 0xdb, 0x2a, 0x51, 0x38, 0x64, + 0x9f, 0xcd, 0x7a, 0x38, 0xa9, 0x47, 0xa8, 0x32, 0xb0, 0xc3, 0x65, 0xa6, + 0x70, 0x31, 0xa0, 0x74, 0x6f, 0x80, 0x9f, 0x64, 0x6d, 0x58, 0xae, 0xa7, + 0x75, 0xc6, 0x9a, 0x80, 0x4e, 0x69, 0x7a, 0xc3, 0xb6, 0x8a, 0x91, 0x39, + 0x67, 0x53, 0x92, 0x45, 0x4d, 0x38, 0xc0, 0xaa, 0x8b, 0x86, 0xcc, 0xa2, + 0x7b, 0xcd, 0xcf, 0xc4, 0x90, 0x45, 0xd1, 0x77, 0x9c, 0x4e, 0x81, 0x64, + 0xb5, 0xcb, 0x49, 0xbf, 0xc4, 0xad, 0xb1, 0x9f, 0x85, 0x6e, 0x40, 0x49, + 0x40, 0x38, 0x5c, 0x99, 0xc3, 0xbc, 0x5a, 0xc1, 0x77, 0x78, 0xc0, 0xc9, + 0xaf, 0xbe, 0x5a, 0x76, 0x52, 0x7d, 0xbc, 0xaa, 0x85, 0x93, 0x6b, 0x89, + 0x90, 0x74, 0xbf, 0xc3, 0x88, 0xa7, 0x93, 0xa3, 0x75, 0xad, 0x53, 0x1d, + 0x5d, 0xc0, 0xcd, 0x2e, 0x98, 0x95, 0xc3, 0x56, 0x58, 0xb5, 0xa1, 0xb2, + 0xc4, 0xb1, 0xc8, 0xac, 0x8a, 0x50, 0xc3, 0x65, 0x58, 0x79, 0x4e, 0x67, + 0x6e, 0x89, 0xb0, 0x29, 0xcb, 0x79, 0xcf, 0x87, 0xc9, 0x57, 0x35, 0x44, + 0xbc, 0xa7, 0xa2, 0x7a, 0x45, 0xde, 0x46, 0x4f, 0x46, 0xb1, 0x94, 0xd4, + 0x58, 0xb3, 0xa5, 0xb5, 0xdc, 0xb4, 0x52, 0x25, 0x49, 0x34, 0x59, 0x85, + 0xca, 0x85, 0x8e, 0x87, 0x49, 0x4e, 0x67, 0x71, 0x5d, 0x5a, 0x47, 0xb0, + 0xbf, 0x3f, 0x6f, 0xb9, 0xbf, 0x7a, 0xd1, 0x66, 0x40, 0xe4, 0x52, 0xa4, + 0xca, 0x5f, 0x9e, 0xa3, 0xcf, 0x82, 0x32, 0x9b, 0x92, 0x94, 0x64, 0x5b, + 0xbd, 0xa1, 0x3b, 0xa0, 0x82, 0x4b, 0xbb, 0x36, 0xc2, 0x5d, 0x7c, 0x6b, + 0xd4, 0x7e, 0x4f, 0xcc, 0x81, 0x37, 0x6f, 0x90, 0x70, 0x64, 0xa1, 0xd1, + 0xb0, 0x60, 0x96, 0x2a, 0x86, 0xaf, 0x6d, 0x62, 0x5f, 0x57, 0x7d, 0x53, + 0xb8, 0xa9, 0xd7, 0x95, 0xc5, 0xca, 0x58, 0x48, 0xb7, 0x83, 0x8f, 0xca, + 0x47, 0xd7, 0x86, 0x7e, 0xcd, 0x31, 0x89, 0xa5, 0xa9, 0x33, 0x50, 0x8d, + 0x83, 0x4c, 0x7d, 0xad, 0xcc, 0x5e, 0xcb, 0x89, 0x6e, 0x5d, 0xd8, 0xb4, + 0xd0, 0x69, 0x4e, 0xc6, 0x88, 0x8f, 0xbd, 0xd0, 0x61, 0xaf, 0x56, 0xc0, + 0x83, 0xbd, 0x89, 0xad, 0x85, 0x90, 0x93, 0xa3, 0x9d, 0xbe, 0xac, 0x4c, + 0x48, 0x68, 0x83, 0x9c, 0xb5, 0x5c, 0x77, 0x32, 0xa9, 0x9c, 0xcf, 0x7d, + 0x87, 0xb8, 0xa1, 0xd9, 0xad, 0x55, 0x65, 0xd0, 0xa2, 0x53, 0x60, 0x46, + 0xbb, 0xa0, 0x77, 0x65, 0x3f, 0x85, 0x7a, 0x44, 0x46, 0xbc, 0xaf, 0x77, + 0x91, 0x86, 0x94, 0x71, 0xb2, 0x91, 0xb6, 0x3a, 0xcd, 0xab, 0x3a, 0x3a, + 0x4f, 0x2f, 0x57, 0x3d, 0xb8, 0x97, 0x6d, 0x78, 0xca, 0x5d, 0xd0, 0xd2, + 0xb5, 0x71, 0xa2, 0x90, 0xc2, 0x8c, 0x77, 0x86, 0xad, 0x80, 0xcb, 0x98, + 0x6c, 0x9a, 0x79, 0x6b, 0x5b, 0x64, 0xc0, 0x3f, 0xd7, 0x98, 0x82, 0x60, + 0x36, 0xb1, 0x39, 0x38, 0x2e, 0x9b, 0x6c, 0x4b, 0x38, 0x6f, 0x38, 0xb3, + 0x5f, 0xa4, 0xe0, 0x5f, 0x72, 0x7b, 0x8f, 0xd2, 0xad, 0x50, 0xc2, 0x4c, + 0x91, 0xb6, 0x99, 0xa6, 0x72, 0x70, 0x97, 0x8f, 0xa5, 0x4b, 0x49, 0x97, + 0xc4, 0x54, 0x6e, 0x21, 0xa8, 0xcc, 0x95, 0xb1, 0x75, 0xcf, 0x61, 0x57, + 
0x2f, 0x3a, 0x41, 0x4a, 0x5d, 0x45, 0x81, 0x88, 0x54, 0x59, 0xc0, 0x48, + 0x62, 0x89, 0x82, 0x56, 0x3e, 0x98, 0x4a, 0x92, 0x55, 0x8b, 0xca, 0x75, + 0x6d, 0x8f, 0xad, 0xd7, 0xaf, 0xc3, 0x86, 0xd2, 0x60, 0xc1, 0x38, 0xaf, + 0x6d, 0x8f, 0x98, 0x6e, 0x49, 0x87, 0xb5, 0xac, 0xd1, 0xac, 0xaf, 0x83, + 0x65, 0x8d, 0xb8, 0xb6, 0x62, 0x92, 0x89, 0x90, 0x3d, 0x8e, 0xc8, 0xc1, + 0x8b, 0xca, 0xb9, 0x9f, 0x56, 0x61, 0xc7, 0xad, 0x32, 0x6b, 0xbe, 0x31, + 0x72, 0x89, 0xad, 0x95, 0xd0, 0x6a, 0x98, 0x98, 0x59, 0x33, 0xd8, 0x8b, + 0x7c, 0xa1, 0x5a, 0x82, 0x81, 0x85, 0xba, 0x64, 0x7f, 0x3c, 0x48, 0x6b, + 0x64, 0x43, 0x48, 0xa5, 0x7a, 0x50, 0x82, 0x75, 0x8b, 0x6f, 0xb5, 0xd5, + 0x88, 0xc4, 0x5e, 0x8a, 0xd9, 0x98, 0x78, 0x45, 0x3d, 0x47, 0x93, 0x6c, + 0x5d, 0x56, 0x6e, 0xad, 0x54, 0x43, 0x56, 0xbc, 0x87, 0x83, 0x48, 0x66, + 0xc6, 0xb4, 0xcf, 0x3f, 0x6c, 0x48, 0x49, 0x88, 0x9b, 0xd2, 0x98, 0x40, + 0xdd, 0x8c, 0xf7, 0xb4, 0x55, 0xc2, 0xa7, 0x33, 0xd9, 0x8f, 0x5b, 0x4e, + 0x81, 0x6f, 0x68, 0x91, 0xa5, 0xa1, 0x65, 0xc5, 0x93, 0xbd, 0x61, 0x7b, + 0x41, 0x8b, 0x38, 0xa6, 0x9d, 0x47, 0x69, 0x36, 0x91, 0x7f, 0x54, 0x61, + 0xcf, 0x9f, 0xba, 0xd2, 0x7d, 0x7c, 0xd8, 0x9e, 0xe3, 0x6a, 0x91, 0x79, + 0x2f, 0xb5, 0x35, 0x46, 0x78, 0x38, 0x5c, 0xb5, 0x2f, 0xb8, 0x8e, 0xb4, + 0x5f, 0x44, 0xa9, 0x64, 0xab, 0xce, 0xc5, 0xa6, 0xcf, 0x4c, 0xe2, 0x37, + 0x3b, 0x6b, 0x3b, 0xce, 0xac, 0xc5, 0x63, 0x87, 0xd2, 0x89, 0x75, 0xb9, + 0x86, 0xd6, 0xb7, 0xb2, 0xcb, 0xd1, 0xbe, 0xd5, 0x72, 0x70, 0xb1, 0x39, + 0xcd, 0x38, 0xd2, 0x8f, 0x77, 0x87, 0xa7, 0x46, 0x46, 0x60, 0x4f, 0xe1, + 0x94, 0x65, 0xb3, 0x68, 0xa4, 0x7e, 0x3a, 0x5c, 0xab, 0x8f, 0x7a, 0x38, + 0xc4, 0x54, 0xbc, 0x39, 0xd7, 0x7c, 0x57, 0xc8, 0xa0, 0xb8, 0xd9, 0xaa, + 0x99, 0x52, 0xd3, 0x5b, 0xb9, 0xd6, 0xb1, 0x82, 0x94, 0x75, 0x4b, 0x60, + 0x62, 0xc1, 0xc7, 0x9f, 0x64, 0x58, 0x60, 0x91, 0x3c, 0x7d, 0x82, 0xb9, + 0xac, 0xc1, 0x57, 0xac, 0x50, 0xab, 0x4b, 0xaa, 0xbf, 0x56, 0x6c, 0x29, + 0x48, 0x4d, 0x44, 0x88, 0xb0, 0x7b, 0xce, 0x9d, 0xb9, 0x73, 0x7e, 0xda, + 0x64, 0x47, 0x54, 0xc2, 0x5b, 0xaa, 0x51, 0xa3, 0xc4, 0x56, 0x8d, 0xad, + 0xbb, 0xd1, 0x57, 0x83, 0x78, 0x7f, 0xa2, 0xc1, 0x63, 0xb8, 0xa7, 0xaa, + 0x98, 0x76, 0xac, 0xc7, 0x9f, 0x91, 0x63, 0x8c, 0xa3, 0xa3, 0x48, 0x4a, + 0x45, 0x98, 0x47, 0x38, 0x40, 0x6b, 0x49, 0x48, 0x6c, 0x7a, 0xc1, 0x51, + 0x5e, 0x30, 0x2b, 0xd1, 0x99, 0x34, 0x7e, 0x59, 0x47, 0x6a, 0x62, 0xd3, + 0x57, 0x3d, 0xa8, 0x95, 0x46, 0x30, 0x9b, 0xa7, 0x66, 0x45, 0xcb, 0xcd, + 0x7e, 0x58, 0x3f, 0x2f, 0x40, 0xad, 0x9d, 0x6f, 0x3e, 0x74, 0xb3, 0x85, + 0x9f, 0x4f, 0x51, 0x55, 0x77, 0x58, 0xa0, 0x5a, 0x9b, 0x8b, 0xc8, 0x6e, + 0x7a, 0xb2, 0x90, 0xa6, 0x96, 0x9c, 0x9c, 0x3d, 0xbd, 0x6e, 0xc4, 0x9d, + 0x75, 0x5d, 0x62, 0x87, 0x76, 0x8c, 0xc8, 0x65, 0x80, 0x69, 0x48, 0x67, + 0xb5, 0xc4, 0xc6, 0x79, 0x47, 0xc5, 0x40, 0xc0, 0xc1, 0x64, 0xad, 0x29, + 0x8d, 0x67, 0xc2, 0x55, 0xd4, 0xb3, 0xb8, 0x8b, 0x99, 0xb0, 0x5a, 0x39, + 0xd7, 0x79, 0x7d, 0x57, 0x5a, 0x3a, 0x4e, 0xc4, 0x64, 0x84, 0x46, 0x67, + 0xb9, 0xb0, 0xd3, 0x4e, 0x5c, 0xa3, 0x95, 0xaa, 0x6d, 0xbe, 0xa7, 0xe7, + 0x8d, 0x4c, 0xb1, 0x99, 0x55, 0x78, 0x93, 0xd3, 0x3c, 0x96, 0xd0, 0x2d, + 0x6f, 0x58, 0xd9, 0x9b, 0xce, 0x80, 0x3c, 0x89, 0xcf, 0x9a, 0x6a, 0x4d, + 0x2b, 0xc5, 0xd6, 0x97, 0xa0, 0x3c, 0x64, 0xa4, 0xaa, 0x56, 0xb9, 0x84, + 0x90, 0xd7, 0xad, 0x74, 0x39, 0x28, 0xa3, 0xb2, 0x3b, 0x32, 0x30, 0x81, + 0x76, 0x45, 0xbc, 0xca, 0xd4, 0x6b, 0x84, 0xc1, 0x6e, 0x60, 0x7c, 0xcf, + 0x74, 0x2f, 0x9d, 0x43, 0x71, 0x4c, 0x94, 0xb7, 0x81, 0x6d, 0xac, 0x49, + 0xce, 0xc0, 0x3c, 0x65, 0xc9, 0x44, 0x39, 0xc9, 0xa7, 0x84, 0x65, 0xd2, + 
0x54, 0x64, 0x75, 0xc3, 0xc0, 0xab, 0x56, 0xaf, 0x62, 0xb2, 0xab, 0x8a, + 0xc2, 0xc6, 0x49, 0x65, 0x7d, 0x76, 0x77, 0xaa, 0x66, 0x66, 0x55, 0x42, + 0x7d, 0x7a, 0x50, 0x96, 0x2d, 0x9b, 0xcd, 0x39, 0x39, 0x7f, 0xc7, 0x93, + 0x9b, 0x3e, 0x2b, 0x98, 0x54, 0xd0, 0x91, 0x97, 0x8d, 0xc7, 0x4a, 0x42, + 0x5e, 0x9a, 0x42, 0x99, 0xda, 0x45, 0xa0, 0xc4, 0x37, 0x4b, 0x68, 0x5d, + 0x9e, 0x4c, 0x3e, 0x71, 0xb6, 0xdc, 0x6b, 0x9f, 0x98, 0x65, 0xb2, 0x60, + 0x97, 0x7b, 0x54, 0x4f, 0x46, 0x3f, 0x34, 0x47, 0x57, 0x97, 0x6c, 0x98, + 0x6b, 0x84, 0x54, 0x14, 0xa3, 0x9a, 0x2e, 0xaa, 0xae, 0xbb, 0x7e, 0xb3, + 0x76, 0xa5, 0xb3, 0x2f, 0x39, 0x90, 0xc2, 0x99, 0x98, 0x8b, 0xa4, 0xb8, + 0x53, 0x70, 0x97, 0x51, 0xc5, 0x5c, 0xae, 0xac, 0x4f, 0xcc, 0xb0, 0x60, + 0x5a, 0x62, 0x60, 0x4b, 0x7e, 0x50, 0x9a, 0x8c, 0xc6, 0xb1, 0xb1, 0x53, + 0xa2, 0x87, 0x8b, 0x25, 0x32, 0x7b, 0x75, 0xa2, 0xa0, 0x7d, 0x94, 0xbb, + 0x5f, 0x90, 0x9a, 0xad, 0xce, 0x48, 0x77, 0x4e, 0xd2, 0x45, 0x8b, 0x85, + 0x6c, 0x47, 0xbd, 0xa6, 0xcb, 0x5d, 0x47, 0x97, 0xc3, 0xa0, 0x49, 0x86, + 0x65, 0x32, 0xad, 0xbc, 0x4a, 0x5b, 0xb9, 0xc4, 0x43, 0xb6, 0x6f, 0x5d, + 0x38, 0x69, 0xbf, 0x41, 0x49, 0x88, 0xc8, 0x5e, 0x3b, 0xb4, 0xa4, 0xba, + 0x3d, 0x9f, 0x9f, 0xc9, 0xa0, 0x8f, 0x93, 0x83, 0x59, 0x8d, 0xb4, 0x90, + 0x3c, 0xbc, 0xc2, 0xa2, 0xbb, 0x9e, 0x70, 0x43, 0x83, 0x84, 0xd9, 0x8d, + 0xb6, 0xd2, 0x7c, 0xa3, 0x54, 0x7b, 0x3e, 0x4c, 0x8d, 0x82, 0xa5, 0x42, + 0xc6, 0x4c, 0xca, 0x61, 0x3b, 0xd1, 0x57, 0x94, 0xb7, 0x50, 0x90, 0x6b, + 0xbf, 0x3f, 0xcd, 0xc6, 0x44, 0xc8, 0x86, 0xa2, 0xa5, 0xc8, 0x99, 0xdb, + 0x6b, 0xcd, 0x33, 0xd5, 0x6d, 0xbd, 0x99, 0x9c, 0xc2, 0xaa, 0x89, 0xb5, + 0x8f, 0x9a, 0xcb, 0x72, 0x50, 0xce, 0x72, 0xa9, 0xc5, 0x9f, 0xcd, 0x3f, + 0x7d, 0xd1, 0xc9, 0xbd, 0xb9, 0xc0, 0x31, 0x49, 0x52, 0x3b, 0x74, 0xb3, + 0x84, 0xc0, 0x9c, 0xa0, 0x9d, 0xc6, 0xc2, 0xbc, 0xba, 0x81, 0xbc, 0x99, + 0xbd, 0x6f, 0x39, 0x43, 0x87, 0x42, 0xce, 0x4c, 0xac, 0xab, 0x81, 0x7b, + 0x54, 0x61, 0x54, 0xc6, 0x72, 0x4e, 0x97, 0x59, 0x40, 0x6a, 0xa5, 0x4b, + 0x43, 0x82, 0x79, 0x63, 0x74, 0xa9, 0x53, 0xab, 0xa2, 0x4f, 0xa0, 0x96, + 0x47, 0xdb, 0xd3, 0x8d, 0x91, 0x65, 0x94, 0xb5, 0x6c, 0x89, 0xaa, 0xb0, + 0x5e, 0xaf, 0x43, 0x5e, 0x3c, 0x6c, 0x46, 0x91, 0x48, 0x32, 0x43, 0xcc, + 0xbe, 0x3e, 0xa3, 0x97, 0x75, 0x3f, 0x2d, 0xb8, 0x97, 0x3a, 0xaf, 0x9c, + 0x40, 0x74, 0x94, 0xbf, 0x93, 0x3a, 0x99, 0x58, 0x42, 0x9f, 0x51, 0xc4, + 0x40, 0x5e, 0x3a, 0x67, 0x77, 0xb0, 0x66, 0x86, 0x42, 0x5f, 0x87, 0x63, + 0xa2, 0x6a, 0x89, 0xb0, 0x69, 0x47, 0xd6, 0x4c, 0xbc, 0x4b, 0x94, 0x99, + 0x83, 0xba, 0xb4, 0x9f, 0xa7, 0xbe, 0x94, 0x35, 0x5d, 0x67, 0x4a, 0x85, + 0xcb, 0x87, 0x33, 0x81, 0x81, 0x41, 0x37, 0x99, 0x93, 0x79, 0x57, 0x7b, + 0x32, 0xc5, 0x76, 0xa0, 0x4e, 0xd5, 0x48, 0xb9, 0xce, 0xae, 0x49, 0xa4, + 0x91, 0x3b, 0x94, 0xd3, 0x97, 0xcc, 0xa0, 0x8f, 0x92, 0xb9, 0xcf, 0xcc, + 0xb5, 0x5c, 0xae, 0x80, 0x47, 0x43, 0x75, 0x8c, 0x5d, 0x6b, 0x8d, 0x5b, + 0x34, 0x60, 0x9d, 0x65, 0x55, 0x39, 0xc1, 0xbd, 0x6d, 0xd9, 0xb6, 0x47, + 0x67, 0x59, 0xa8, 0xb2, 0xa3, 0xc9, 0x7c, 0x36, 0xba, 0xac, 0x33, 0xba, + 0x9d, 0xab, 0xbb, 0x63, 0x4d, 0x97, 0x66, 0x6d, 0x54, 0xa2, 0x6c, 0x9f, + 0x57, 0x6b, 0x6c, 0x41, 0x75, 0x8d, 0x56, 0xaf, 0x97, 0x88, 0xba, 0x6c, + 0x4b, 0xc1, 0x97, 0x64, 0xc7, 0xaf, 0x8b, 0x96, 0x7a, 0x60, 0xa9, 0x38, + 0x39, 0x73, 0xd9, 0x4f, 0x89, 0x79, 0x84, 0x74, 0x39, 0x5a, 0x4b, 0x38, + 0x40, 0x4e, 0x87, 0x8b, 0xc5, 0x86, 0x52, 0x71, 0xd5, 0xd1, 0x5a, 0x32, + 0x74, 0x40, 0x3b, 0xc5, 0x6e, 0xb9, 0xbe, 0x78, 0x66, 0x82, 0xaa, 0x56, + 0x53, 0xad, 0x61, 0xb5, 0x93, 0xce, 0x67, 0x8c, 0x7d, 0x85, 0x88, 0x39, + 
0x57, 0x86, 0x98, 0x94, 0x95, 0x5d, 0x71, 0x64, 0xb2, 0x41, 0x38, 0x90, + 0x92, 0xa1, 0x3a, 0x62, 0x66, 0x5b, 0x3f, 0x79, 0x5c, 0xc3, 0x3f, 0x39, + 0x5d, 0xbb, 0x83, 0x7d, 0xb5, 0x65, 0xc4, 0x42, 0x7c, 0x85, 0xae, 0xc0, + 0x61, 0x7a, 0x66, 0x76, 0xc9, 0x96, 0xb7, 0xc9, 0xa0, 0x7d, 0x46, 0x6b, + 0xc5, 0xb3, 0x66, 0x76, 0x4f, 0x3e, 0x9c, 0x88, 0xa7, 0xb7, 0xc3, 0xb3, + 0x35, 0xa6, 0x65, 0x5d, 0xc6, 0x6f, 0xbd, 0xa8, 0xc4, 0x71, 0x70, 0x98, + 0xba, 0x73, 0xc6, 0xc2, 0x73, 0x4e, 0x95, 0x8f, 0x4c, 0x6a, 0xd2, 0x92, + 0xc1, 0x51, 0x6e, 0x58, 0x3d, 0x51, 0xbc, 0x75, 0xbd, 0x60, 0x40, 0x83, + 0x49, 0x4f, 0x9f, 0x6d, 0xa9, 0x52, 0x6b, 0x6d, 0x94, 0x36, 0xb1, 0xab, + 0x77, 0xb9, 0x41, 0xbe, 0x4a, 0xd0, 0x83, 0xa4, 0x5e, 0x78, 0x5d, 0xd4, + 0xc4, 0x34, 0xcc, 0x40, 0x62, 0xbb, 0x61, 0x5f, 0xc0, 0x80, 0xab, 0x97, + 0x8e, 0xd2, 0xaa, 0x58, 0x5c, 0x2a, 0xc4, 0x82, 0x87, 0x3e, 0x42, 0xc9, + 0x41, 0x81, 0xb0, 0x94, 0x46, 0x39, 0x3f, 0xa1, 0x93, 0x8d, 0xc8, 0x44, + 0xcb, 0xd1, 0x36, 0xcc, 0x48, 0xb8, 0x40, 0x5f, 0x4b, 0xc7, 0xd8, 0xce, + 0x3b, 0x8c, 0xbb, 0x63, 0x51, 0x49, 0x45, 0x6e, 0x9c, 0x53, 0xa7, 0x7e, + 0x6b, 0x87, 0x66, 0x5f, 0xb8, 0xc6, 0x97, 0x5f, 0x64, 0x6d, 0x99, 0x42, + 0xa4, 0x45, 0x83, 0x7e, 0x4b, 0xba, 0x71, 0x8d, 0xbf, 0x38, 0x9f, 0xc9, + 0x81, 0x70, 0x36, 0x50, 0x5a, 0x69, 0x34, 0x74, 0xc5, 0xbf, 0x5c, 0x30, + 0x99, 0x44, 0x84, 0x73, 0x65, 0x7d, 0x97, 0xc4, 0x8f, 0x5d, 0x31, 0x9c, + 0x58, 0xaa, 0x61, 0x75, 0xb7, 0x3a, 0x50, 0xbe, 0xb3, 0x98, 0x68, 0xcf, + 0x41, 0x85, 0x49, 0x94, 0x37, 0x80, 0xc3, 0x96, 0xae, 0xc3, 0x6b, 0xa6, + 0x76, 0xaf, 0xaf, 0xae, 0x47, 0xd2, 0x74, 0x81, 0x81, 0xce, 0xc9, 0x91, + 0x7e, 0xa1, 0x95, 0x36, 0xca, 0xbf, 0x7e, 0x33, 0x82, 0xb2, 0xa4, 0x3b, + 0xbe, 0x5d, 0x6e, 0x97, 0x6d, 0xaa, 0x50, 0x64, 0x6c, 0x8b, 0x7f, 0x77, + 0xa0, 0x45, 0x46, 0x82, 0x43, 0x80, 0x8f, 0x3a, 0xc7, 0x3d, 0x8c, 0x8a, + 0x3a, 0x67, 0x9e, 0xa0, 0x5e, 0xc9, 0x3c, 0x3d, 0x8c, 0x97, 0x69, 0x6b, + 0x8f, 0xcb, 0x3a, 0x84, 0xbe, 0x77, 0xb1, 0x7a, 0x45, 0xc0, 0x9e, 0x5e, + 0xc0, 0x95, 0x73, 0x65, 0x37, 0xa3, 0x38, 0x53, 0x7e, 0x85, 0x87, 0xba, + 0xa6, 0xa2, 0x92, 0xaf, 0x69, 0x46, 0x4d, 0xaf, 0xc7, 0x64, 0x3c, 0xa8, + 0x70, 0x59, 0x3c, 0x86, 0xc7, 0xa1, 0x43, 0xa6, 0x6b, 0x2f, 0x34, 0xbd, + 0x6f, 0x4d, 0x5e, 0xaf, 0x81, 0xb7, 0x7b, 0x65, 0x67, 0x4e, 0xc5, 0xa0, + 0x3a, 0xb0, 0x5a, 0xcf, 0x49, 0x9f, 0x77, 0xae, 0xc8, 0x81, 0x6d, 0xad, + 0x65, 0xa1, 0x8c, 0x47, 0xaa, 0x41, 0xb4, 0x80, 0x40, 0x75, 0x66, 0x5f, + 0xb9, 0x4f, 0x8a, 0xb6, 0xae, 0x80, 0x60, 0x8e, 0x95, 0x8d, 0x67, 0x4d, + 0xad, 0x63, 0x78, 0x79, 0xae, 0x36, 0x55, 0x5a, 0x39, 0x4f, 0x85, 0x4f, + 0x95, 0xb9, 0x42, 0x5b, 0xb1, 0xc7, 0xa4, 0x9b, 0x88, 0x40, 0x7c, 0x8e, + 0x52, 0x81, 0x68, 0x3a, 0x6a, 0x71, 0xa2, 0x42, 0x6b, 0x52, 0xa2, 0x43, + 0xbc, 0x7b, 0x7d, 0x6e, 0x50, 0xc3, 0x72, 0x4d, 0x52, 0x93, 0xb6, 0xd0, + 0x65, 0x80, 0xd0, 0x3c, 0xa6, 0x2b, 0xbb, 0xce, 0x9b, 0x3e, 0xcc, 0xb0, + 0x7c, 0xb8, 0x7b, 0x64, 0xb3, 0x61, 0x76, 0x85, 0x4b, 0x34, 0xbe, 0xa9, + 0xd0, 0xb5, 0x71, 0xc1, 0xaa, 0x46, 0xae, 0xa3, 0xa3, 0xa6, 0xc5, 0x5b, + 0xb5, 0xbc, 0xa0, 0xbb, 0xd3, 0x79, 0x42, 0x52, 0xb6, 0x6f, 0xac, 0x54, + 0xac, 0x5e, 0xb7, 0x37, 0xa0, 0x74, 0x56, 0x59, 0xc6, 0x80, 0xa7, 0x32, + 0x81, 0x7c, 0xc7, 0x65, 0x4a, 0x87, 0x56, 0x79, 0x74, 0x8a, 0x6e, 0x3a, + 0x4d, 0x86, 0xc0, 0xd2, 0x60, 0xb8, 0x6c, 0x50, 0x44, 0x76, 0x90, 0xbe, + 0x6f, 0x89, 0x5b, 0xbf, 0x7c, 0x96, 0x9f, 0x4b, 0xc2, 0x42, 0x5c, 0x68, + 0xae, 0x52, 0xa0, 0xd0, 0x54, 0x5d, 0x31, 0x81, 0x85, 0xa9, 0x98, 0xae, + 0xcd, 0x95, 0xa2, 0x76, 0xac, 0xb1, 0xd0, 0x3d, 0x45, 0x31, 0x62, 0xb8, + 
0xb4, 0x7a, 0xd2, 0x6d, 0x9a, 0x94, 0xa4, 0x4f, 0x6e, 0x32, 0x8f, 0x7e, + 0x5d, 0xa3, 0x4b, 0x75, 0x56, 0x68, 0xd0, 0x51, 0xb2, 0x5b, 0xac, 0x72, + 0xc2, 0x52, 0xbc, 0x7b, 0x33, 0x54, 0xca, 0x46, 0x53, 0x48, 0xc7, 0xab, + 0xc1, 0xac, 0xb0, 0x66, 0x83, 0xbd, 0xc8, 0x85, 0x3a, 0x9d, 0x44, 0x79, + 0x6b, 0x9e, 0x54, 0x8f, 0x58, 0x67, 0xcf, 0x5f, 0x4d, 0xbc, 0x8d, 0x60, + 0x5d, 0x4c, 0x8e, 0x3b, 0x52, 0x84, 0x53, 0xc4, 0xcf, 0xa5, 0x96, 0x4f, + 0xae, 0x85, 0x96, 0x97, 0x66, 0x76, 0x86, 0x4b, 0x85, 0xaa, 0x6d, 0x74, + 0x6b, 0x66, 0x48, 0x7b, 0x71, 0x7d, 0x39, 0x99, 0xc4, 0x4b, 0x69, 0x67, + 0xb5, 0x2e, 0x44, 0x58, 0xa6, 0xbc, 0x73, 0xcb, 0x2d, 0x9f, 0x53, 0x9f, + 0xbf, 0xa8, 0x3d, 0x9c, 0xab, 0x44, 0xce, 0x4c, 0x84, 0x49, 0x31, 0x9e, + 0xa7, 0x3c, 0x3a, 0x52, 0xba, 0x9f, 0x3b, 0x7e, 0xa2, 0x5c, 0x3b, 0x5a, + 0x77, 0x79, 0x3a, 0x86, 0x72, 0x41, 0xc7, 0x35, 0xc7, 0xaa, 0x5f, 0xbd, + 0x85, 0xc7, 0xab, 0x5a, 0x66, 0x3a, 0x97, 0x3c, 0x76, 0x96, 0x6f, 0x93, + 0xaa, 0x54, 0x58, 0x6a, 0xa0, 0x47, 0x47, 0x6f, 0xa8, 0x81, 0xbd, 0x71, + 0x73, 0xb0, 0xc8, 0x87, 0xb9, 0xa8, 0x9d, 0x4a, 0x70, 0x73, 0xb8, 0x30, + 0xa2, 0x67, 0xc5, 0x6f, 0xb0, 0x41, 0xd1, 0xc3, 0xbf, 0x40, 0xaa, 0x5d, + 0x82, 0x43, 0x34, 0x4c, 0xbe, 0xc3, 0xbb, 0x78, 0xa4, 0x36, 0x8a, 0xa0, + 0x5c, 0x69, 0xc1, 0x9b, 0xb9, 0x68, 0xa0, 0x3e, 0xb1, 0xd1, 0x97, 0xc5, + 0x56, 0x77, 0x35, 0x58, 0x75, 0x72, 0x7c, 0x4d, 0x81, 0x82, 0xa7, 0x7c, + 0x35, 0x7c, 0x9b, 0x65, 0xb2, 0xb3, 0x6e, 0x72, 0x7e, 0xc6, 0x43, 0xc3, + 0x3d, 0x75, 0xc2, 0x48, 0x8c, 0x52, 0xb3, 0x71, 0xc9, 0xcf, 0xbe, 0x60, + 0x8b, 0x7b, 0xae, 0x73, 0x54, 0x63, 0x94, 0xd6, 0xd6, 0xd5, 0x94, 0xa8, + 0x6e, 0x53, 0xaf, 0x49, 0x6d, 0xc3, 0x2c, 0xb2, 0x6c, 0xb7, 0xbe, 0xa0, + 0x73, 0x81, 0xbf, 0x41, 0xb4, 0x49, 0x40, 0xcf, 0xa2, 0xb0, 0xa9, 0xae, + 0x83, 0x85, 0xca, 0xce, 0x6c, 0x7b, 0x5f, 0x76, 0x40, 0x95, 0x9b, 0x95, + 0xb6, 0xb8, 0x4d, 0x33, 0x30, 0x39, 0x47, 0x32, 0x3b, 0x45, 0x32, 0xac, + 0xb6, 0x9a, 0x81, 0xa1, 0x80, 0x4f, 0xa8, 0x94, 0x95, 0x7d, 0x85, 0x41, + 0xb7, 0x39, 0xb8, 0x8d, 0x2f, 0xab, 0xb6, 0xca, 0xc0, 0x88, 0x6b, 0xbf, + 0xc0, 0xc2, 0xc2, 0xa9, 0x2b, 0x48, 0x4d, 0xba, 0x54, 0x79, 0x8e, 0x7c, + 0xac, 0xc6, 0xbb, 0x3a, 0xbe, 0xce, 0x57, 0xa1, 0x66, 0xaa, 0x60, 0xb2, + 0x93, 0xcb, 0xa5, 0x86, 0xc0, 0xbd, 0x62, 0x8f, 0x72, 0x13, 0x1a, 0x3e, + 0xb4, 0x5b, 0x42, 0xd1, 0x85, 0x8f, 0xa5, 0xad, 0x7c, 0x99, 0x76, 0xc8, + 0x58, 0x43, 0x51, 0xce, 0x9c, 0x51, 0x4c, 0xd3, 0x8d, 0xa0, 0x95, 0x5e, + 0xab, 0x99, 0x97, 0x93, 0x7f, 0x5e, 0x89, 0x9c, 0xba, 0x49, 0x97, 0x83, + 0x3c, 0xcc, 0x49, 0x5b, 0x89, 0x9e, 0x95, 0xc7, 0xae, 0x71, 0x95, 0x69, + 0x81, 0x8b, 0x8e, 0xd7, 0x67, 0x47, 0x98, 0xc8, 0xc7, 0xbe, 0x9d, 0xc6, + 0xa5, 0x9c, 0x7b, 0x70, 0x3c, 0x6c, 0x72, 0x8c, 0x94, 0x44, 0x5b, 0x31, + 0xbc, 0xb6, 0xcd, 0xa3, 0xd0, 0xd4, 0x43, 0x75, 0xcc, 0x33, 0x4f, 0x33, + 0xb8, 0x4b, 0x9b, 0xbd, 0x71, 0x74, 0xbb, 0xb6, 0x6c, 0x64, 0xc6, 0xb4, + 0xa3, 0x9a, 0x90, 0x7f, 0x80, 0x8d, 0x93, 0x8d, 0x51, 0x32, 0x28, 0xa7, + 0x6c, 0xc7, 0xae, 0x26, 0xbc, 0xc6, 0xb5, 0x86, 0x80, 0x64, 0x73, 0x52, + 0xcf, 0x76, 0x2f, 0x65, 0x95, 0x7b, 0x5f, 0xb7, 0x43, 0x95, 0x50, 0x68, + 0x3f, 0x2a, 0xcc, 0x39, 0x3d, 0x5f, 0x6d, 0xab, 0x48, 0x6d, 0x9b, 0xc1, + 0x56, 0x89, 0x69, 0x43, 0x85, 0xbb, 0xd5, 0x9d, 0x59, 0x49, 0x63, 0x4c, + 0x69, 0x99, 0x5f, 0x71, 0x4d, 0x9b, 0x54, 0x57, 0x99, 0x61, 0x35, 0x61, + 0xd0, 0xab, 0x99, 0x70, 0x7e, 0x70, 0xa8, 0x7b, 0x46, 0x42, 0x59, 0x6b, + 0x41, 0x47, 0x60, 0x93, 0x8f, 0x69, 0x71, 0xab, 0x5a, 0x3c, 0x55, 0x7c, + 0x7e, 0xa4, 0xaf, 0x42, 0xcf, 0x32, 0x5a, 0x81, 0x4e, 0x94, 0x4b, 0x72, + 
0x6d, 0x33, 0xb5, 0x2b, 0xca, 0x4a, 0x49, 0xd2, 0x5c, 0x75, 0xae, 0x77, + 0x9f, 0x40, 0x47, 0x43, 0xcd, 0xcc, 0x8f, 0x9c, 0xa8, 0x45, 0x62, 0x8d, + 0x7f, 0x33, 0x7e, 0x74, 0x34, 0x47, 0xc4, 0x51, 0x69, 0x68, 0xbd, 0x89, + 0x75, 0xcc, 0x7d, 0x9f, 0x61, 0x66, 0x61, 0x80, 0x4d, 0x7a, 0x7b, 0x5b, + 0x8c, 0x5f, 0xb4, 0xa9, 0x8b, 0x8c, 0x84, 0x75, 0xbc, 0x66, 0x3a, 0x59, + 0xaf, 0xcb, 0x8e, 0x38, 0x8b, 0x3d, 0x6d, 0x76, 0xb9, 0x95, 0x53, 0x52, + 0xb6, 0xb3, 0x63, 0xbc, 0x55, 0x9c, 0xdb, 0x71, 0x32, 0x74, 0xcb, 0xc9, + 0x62, 0x58, 0xac, 0x72, 0x34, 0xc4, 0x57, 0x8a, 0x46, 0x75, 0xb5, 0x3c, + 0x65, 0xc4, 0x90, 0xc9, 0x5d, 0xb6, 0x3f, 0xa6, 0x50, 0x58, 0x50, 0x98, + 0x85, 0x82, 0xb5, 0xc2, 0x31, 0x54, 0x3a, 0x45, 0x70, 0x68, 0xc1, 0xb0, + 0xb8, 0x44, 0x8f, 0x80, 0xa7, 0xb9, 0x6f, 0x8c, 0x42, 0x84, 0x23, 0x6e, + 0x60, 0xa1, 0x45, 0x8c, 0x66, 0x54, 0x6d, 0x71, 0x59, 0x7c, 0x40, 0xca, + 0xc4, 0x78, 0xc1, 0x70, 0xca, 0x40, 0x81, 0xc1, 0x58, 0x70, 0xb0, 0x5e, + 0xcb, 0xa1, 0xbc, 0x69, 0x89, 0x5e, 0x49, 0x9a, 0xc7, 0x77, 0x59, 0xc4, + 0x81, 0x3d, 0xb3, 0x85, 0x9e, 0xcc, 0x6a, 0x6e, 0x6a, 0x6a, 0xac, 0x7f, + 0x3b, 0x43, 0xaa, 0x75, 0x3f, 0x92, 0x8f, 0xc5, 0x35, 0x71, 0x85, 0xa5, + 0x9d, 0xa4, 0x7c, 0x9d, 0xa9, 0x9c, 0x49, 0xb8, 0x93, 0xb6, 0x4a, 0x30, + 0xa0, 0x7b, 0x7c, 0xa4, 0x39, 0xa5, 0xc1, 0x5f, 0x8a, 0x83, 0x44, 0x5c, + 0x69, 0x71, 0x54, 0xa4, 0xa2, 0x94, 0x8b, 0x86, 0x7f, 0xb7, 0x3d, 0xb2, + 0xc4, 0x54, 0x9f, 0xb8, 0x57, 0xa8, 0x7c, 0x7f, 0x84, 0xcb, 0x7c, 0x92, + 0x54, 0xc1, 0x62, 0xb7, 0xc2, 0x8b, 0x65, 0xaa, 0x80, 0x45, 0x33, 0x5e, + 0xc5, 0x74, 0x50, 0xd5, 0xb2, 0x8e, 0xb6, 0x55, 0x31, 0x58, 0x69, 0xae, + 0x6d, 0x3a, 0x4d, 0x73, 0x5e, 0xbc, 0x7b, 0x69, 0x54, 0x99, 0x4a, 0x77, + 0x8e, 0xc1, 0x5b, 0x6c, 0x33, 0x5b, 0x74, 0x9c, 0x3e, 0xad, 0x70, 0xb1, + 0x35, 0x4a, 0x2f, 0x5a, 0x49, 0xb7, 0x44, 0xb9, 0x6b, 0xb7, 0x6d, 0xd3, + 0x6a, 0xa4, 0x50, 0xc4, 0x61, 0xb7, 0x5a, 0x5c, 0xc2, 0x4a, 0x82, 0x30, + 0x8b, 0x8e, 0xc0, 0x5f, 0xbc, 0x30, 0xac, 0x95, 0x84, 0x4e, 0xc7, 0x55, + 0xa4, 0x99, 0x9b, 0x63, 0x2f, 0xd2, 0x4b, 0x2d, 0x5e, 0x70, 0x50, 0x4e, + 0x99, 0x5b, 0x8f, 0x38, 0x60, 0x87, 0xa2, 0xa9, 0x36, 0x6f, 0xa7, 0x3a, + 0xc0, 0x84, 0x52, 0x5e, 0x53, 0xbf, 0x56, 0x7d, 0x4f, 0xcd, 0x7a, 0xc3, + 0x84, 0x41, 0xb9, 0x6f, 0xae, 0x77, 0x34, 0xa5, 0x53, 0xbe, 0xcf, 0x60, + 0xc9, 0x8a, 0x43, 0x62, 0xba, 0x9f, 0x85, 0x8e, 0x9a, 0x7c, 0x3a, 0xa4, + 0x48, 0x74, 0x4c, 0x61, 0x81, 0xb8, 0x7e, 0xc0, 0x4a, 0x4a, 0xa9, 0x6c, + 0x75, 0xb4, 0x52, 0xb9, 0xae, 0x4d, 0x4f, 0x68, 0x59, 0xa4, 0x3f, 0x81, + 0x39, 0x54, 0xc3, 0xb8, 0x53, 0xa0, 0xb0, 0x47, 0xd1, 0xcb, 0xa2, 0xd1, + 0x4c, 0xce, 0x7a, 0xa2, 0x4c, 0xae, 0x53, 0x38, 0x84, 0x9f, 0x40, 0xd0, + 0xc0, 0x94, 0xc4, 0xd6, 0x7a, 0xa1, 0x84, 0xca, 0x3d, 0x65, 0xba, 0x8f, + 0x92, 0x86, 0x61, 0xa1, 0xa2, 0xad, 0x66, 0x67, 0x8d, 0x34, 0x3d, 0x4c, + 0x40, 0x7d, 0x7c, 0x9b, 0x97, 0xc1, 0x49, 0x83, 0xa3, 0x5f, 0x81, 0xc1, + 0x81, 0xbc, 0xc1, 0xa9, 0xb1, 0x6e, 0x8a, 0xb4, 0x8e, 0x43, 0x4d, 0xbb, + 0x59, 0x5b, 0x5d, 0x60, 0xa5, 0x31, 0x79, 0xc1, 0xb0, 0x6e, 0xb7, 0x75, + 0x58, 0x50, 0x84, 0xbe, 0x69, 0x4d, 0xc7, 0xbc, 0xd2, 0xc1, 0x56, 0x9b, + 0xb3, 0x4a, 0xa1, 0x9e, 0x52, 0x62, 0xc1, 0xa8, 0x95, 0x29, 0x47, 0x55, + 0x98, 0x5a, 0xad, 0x8e, 0x45, 0x3a, 0x33, 0xa9, 0x6f, 0x7e, 0xaf, 0x5b, + 0xd3, 0xb1, 0x33, 0xbb, 0x47, 0x92, 0x71, 0xa1, 0x57, 0xa4, 0x87, 0x63, + 0x9d, 0x6d, 0xd6, 0x94, 0x49, 0x41, 0x9c, 0xa0, 0x41, 0x71, 0x66, 0x55, + 0x88, 0x4d, 0x70, 0x5c, 0x94, 0x5e, 0x8e, 0x6d, 0x82, 0x96, 0xb5, 0xa7, + 0x83, 0xc2, 0x9b, 0xa3, 0xd4, 0x70, 0xb6, 0xb1, 0xa3, 0xab, 0x56, 0x63, + 
0x78, 0xb0, 0xa4, 0x4e, 0x8c, 0xc9, 0xb3, 0x70, 0x74, 0x76, 0xba, 0x8e, + 0x5a, 0xd1, 0x54, 0x4c, 0xca, 0xa0, 0xda, 0xdf, 0x9e, 0x41, 0x64, 0xd0, + 0xb5, 0x53, 0xb9, 0x68, 0x47, 0x83, 0x3f, 0x8a, 0xce, 0x69, 0x7d, 0x32, + 0x67, 0x50, 0x55, 0x55, 0x59, 0xc0, 0x3d, 0xad, 0x5e, 0xbe, 0x78, 0x29, + 0x9d, 0x67, 0xb5, 0xb0, 0xd2, 0x86, 0x3f, 0xde, 0x4b, 0x2e, 0x30, 0x8f, + 0x38, 0x74, 0x80, 0xc3, 0x85, 0x3c, 0x7b, 0xca, 0x61, 0x88, 0xac, 0x2b, + 0xb8, 0x3c, 0x42, 0x42, 0xb4, 0xbd, 0x78, 0xc7, 0x3f, 0x64, 0x67, 0x7c, + 0x5c, 0x88, 0x3b, 0xc1, 0x52, 0x57, 0x64, 0x43, 0x80, 0xcc, 0x53, 0x3b, + 0xca, 0xa5, 0x57, 0x99, 0x3c, 0x58, 0x99, 0x82, 0x41, 0x98, 0x9e, 0x31, + 0x9b, 0x2f, 0x7b, 0xa5, 0x97, 0x2c, 0xa8, 0xbf, 0xce, 0xa8, 0x71, 0x72, + 0x3c, 0x5d, 0xb8, 0xb4, 0xc5, 0x72, 0xc8, 0xa1, 0x87, 0x63, 0xac, 0xc2, + 0x81, 0x82, 0x51, 0x94, 0xab, 0xca, 0xa2, 0x42, 0x9a, 0x4a, 0xc6, 0x70, + 0x87, 0xc6, 0xc3, 0x9f, 0xb5, 0x6a, 0x58, 0x41, 0x62, 0x5e, 0x56, 0x52, + 0x4d, 0x44, 0x57, 0x31, 0x7b, 0x6a, 0xcd, 0x8b, 0xa3, 0xc1, 0x60, 0xbd, + 0x4e, 0x47, 0x3b, 0x3d, 0xa1, 0x72, 0x8e, 0xa9, 0xaf, 0x5f, 0xb1, 0xc2, + 0x5c, 0x41, 0x5c, 0x89, 0xa1, 0x85, 0x64, 0x5c, 0x6e, 0x58, 0x58, 0xb9, + 0x44, 0x61, 0xa8, 0x90, 0x4e, 0xa0, 0xa5, 0x63, 0x7d, 0xc2, 0x8a, 0xcf, + 0xcc, 0xd4, 0x8a, 0x8f, 0x4e, 0x84, 0x36, 0x74, 0x8d, 0x35, 0xc7, 0xa4, + 0x33, 0x57, 0x83, 0x37, 0xb8, 0x92, 0x4b, 0xdb, 0xe7, 0xac, 0xc6, 0x69, + 0xae, 0x80, 0x76, 0xc8, 0xad, 0x3a, 0xb5, 0x4e, 0x74, 0xa7, 0x7f, 0xc5, + 0x77, 0x88, 0xab, 0x35, 0x3d, 0xa9, 0x6d, 0x72, 0xae, 0x3a, 0x6e, 0x70, + 0x50, 0x6f, 0x76, 0x67, 0x81, 0x74, 0x86, 0x48, 0x9c, 0x5a, 0x50, 0xa4, + 0x88, 0x55, 0x92, 0xc6, 0x9e, 0x40, 0x3d, 0xb9, 0x2d, 0x3b, 0x67, 0x33, + 0x7b, 0x6a, 0x7d, 0x32, 0x53, 0x7a, 0x8f, 0x9d, 0xc6, 0x8c, 0xa9, 0x68, + 0x9b, 0x98, 0x5c, 0x5d, 0x7c, 0x92, 0xaa, 0x6a, 0x38, 0xd5, 0x39, 0x75, + 0x81, 0xb8, 0x84, 0x63, 0xe0, 0x78, 0x39, 0xb5, 0xcc, 0xcf, 0x3d, 0x9d, + 0xa7, 0x55, 0x2d, 0x59, 0xa7, 0xd2, 0x62, 0xba, 0x2e, 0xaa, 0xb1, 0xa5, + 0x8e, 0x43, 0xbb, 0x47, 0xbd, 0xc3, 0x40, 0xbc, 0x76, 0x54, 0x80, 0x3b, + 0x59, 0xbf, 0x62, 0xd3, 0xd1, 0xd6, 0xb8, 0x7f, 0x74, 0x5d, 0x72, 0xaa, + 0x93, 0x3e, 0x95, 0xc1, 0x4e, 0x87, 0x95, 0x51, 0xc4, 0x59, 0xd1, 0xbf, + 0x81, 0x47, 0xc9, 0x70, 0x80, 0xd8, 0x88, 0x9b, 0x46, 0x5a, 0x43, 0x94, + 0x4b, 0xb5, 0xd0, 0x38, 0x84, 0x6d, 0xbd, 0x3e, 0x4f, 0xbd, 0x48, 0xa9, + 0x66, 0x81, 0x3b, 0x84, 0x67, 0x9e, 0xb7, 0x8c, 0x54, 0x74, 0x70, 0xb7, + 0x8c, 0xbd, 0xb4, 0x9a, 0x34, 0x3f, 0x94, 0x4a, 0x6c, 0x50, 0xc6, 0x3c, + 0x40, 0x47, 0x61, 0xd0, 0xb7, 0x87, 0x94, 0x73, 0xd0, 0xb6, 0x9e, 0x9e, + 0x8d, 0xa6, 0x8b, 0x7d, 0x86, 0xa4, 0x81, 0x8f, 0xbc, 0xd2, 0xb6, 0x51, + 0x89, 0x63, 0x72, 0xba, 0xda, 0xd4, 0x5d, 0x91, 0x72, 0x4d, 0xaf, 0x5b, + 0x46, 0x34, 0xaf, 0x6b, 0x86, 0x42, 0x9e, 0x5a, 0x6d, 0x51, 0x7a, 0x59, + 0xcc, 0x68, 0xc6, 0xae, 0xa8, 0x82, 0x47, 0x98, 0xaf, 0x3d, 0x67, 0x3a, + 0x4e, 0x79, 0xd4, 0x3e, 0xa7, 0x44, 0x93, 0x67, 0xb1, 0x9f, 0x8c, 0x57, + 0x86, 0xb3, 0x68, 0x79, 0x80, 0xa6, 0xbb, 0xb7, 0x7a, 0xb3, 0x4f, 0xc7, + 0x93, 0x9e, 0xb0, 0x5b, 0xa6, 0x61, 0xa5, 0x4b, 0x9c, 0x83, 0x89, 0x87, + 0x57, 0xe4, 0xca, 0xa8, 0x44, 0x3f, 0x91, 0x50, 0x86, 0x37, 0xbf, 0x9d, + 0x57, 0x59, 0xaa, 0xaa, 0xab, 0xbc, 0xb0, 0xb4, 0x31, 0xc8, 0x62, 0x45, + 0x3e, 0x9c, 0x95, 0x79, 0x4e, 0x53, 0x63, 0xa6, 0x3c, 0x90, 0x8c, 0x56, + 0xa1, 0xdb, 0x72, 0xaa, 0xd6, 0x6b, 0x81, 0xa2, 0x95, 0x67, 0xd2, 0xae, + 0x3e, 0x8a, 0x50, 0x4c, 0x9b, 0xb7, 0xc2, 0x62, 0xc4, 0xb1, 0xc0, 0x89, + 0xb9, 0x6d, 0x3b, 0xb9, 0x8d, 0x98, 0xc2, 0x4f, 0x6d, 0x95, 0x95, 0x58, + 
0x32, 0xce, 0x9f, 0x2b, 0x5c, 0xa6, 0xbb, 0x46, 0x99, 0x49, 0x47, 0xbf, + 0x8e, 0xa9, 0x41, 0x5f, 0x63, 0xd2, 0x96, 0x5f, 0x86, 0xb0, 0xbb, 0x5b, + 0xab, 0x5a, 0xd0, 0x3b, 0xbb, 0x77, 0xae, 0xb6, 0xc8, 0x8a, 0x42, 0xba, + 0x87, 0x5d, 0x80, 0x4a, 0xc9, 0x80, 0x5d, 0x7d, 0xa6, 0xc8, 0xa2, 0xe0, + 0x59, 0xb8, 0x93, 0x70, 0x5a, 0x72, 0x3e, 0x63, 0x60, 0xb6, 0x61, 0x6c, + 0x78, 0x33, 0xa5, 0x62, 0x9d, 0x72, 0x4e, 0xc2, 0x79, 0xb6, 0x57, 0x97, + 0x98, 0x57, 0x42, 0x87, 0x5c, 0x8b, 0xb1, 0x67, 0xc3, 0x64, 0x57, 0xcd, + 0x69, 0xb7, 0x9e, 0x5c, 0x3e, 0xa1, 0x8b, 0x6a, 0xcd, 0x7a, 0x37, 0x54, + 0xb4, 0x9f, 0x6c, 0xce, 0x56, 0xb9, 0x9e, 0x93, 0x72, 0x9f, 0x44, 0x46, + 0xb6, 0xb7, 0x42, 0x41, 0xbb, 0x82, 0xaf, 0xb5, 0xb8, 0x51, 0x4b, 0x7b, + 0xbb, 0x39, 0xb9, 0x72, 0x6f, 0x41, 0x8d, 0xa0, 0x5d, 0xae, 0x52, 0x50, + 0x3d, 0xa5, 0x4b, 0x46, 0x92, 0x81, 0xd2, 0xc3, 0x5f, 0x9d, 0x5e, 0x70, + 0x40, 0xc5, 0x3b, 0xab, 0x9b, 0x81, 0x4f, 0x30, 0xc2, 0xb6, 0x57, 0xd0, + 0x6c, 0x3b, 0xbc, 0x7d, 0x55, 0x33, 0x37, 0x6c, 0x52, 0x43, 0xa4, 0x88, + 0x35, 0x8c, 0x73, 0x56, 0xba, 0xab, 0x6c, 0x69, 0x57, 0x8e, 0x50, 0x66, + 0x9c, 0x5a, 0x74, 0xd7, 0x33, 0x90, 0x71, 0xd6, 0x84, 0x59, 0x32, 0x82, + 0x91, 0xb2, 0x5a, 0x91, 0xbc, 0xbc, 0x60, 0x59, 0x93, 0xa3, 0x3e, 0x49, + 0x6d, 0x85, 0x87, 0x57, 0xb6, 0x73, 0xaa, 0x8d, 0x5c, 0x90, 0x53, 0x76, + 0xa1, 0x27, 0x83, 0xc2, 0x90, 0xb9, 0x90, 0x51, 0x6f, 0x9c, 0xcd, 0x4e, + 0x8d, 0xa0, 0x7d, 0x54, 0x9f, 0xb5, 0xb1, 0xbe, 0xc8, 0xaf, 0x9d, 0x72, + 0x90, 0xa8, 0x74, 0x4e, 0xa9, 0x6d, 0x7c, 0x6b, 0x39, 0x8b, 0x4c, 0x5e, + 0xaf, 0x2f, 0x4f, 0x82, 0x7c, 0xb0, 0x4a, 0x9b, 0x37, 0x80, 0x80, 0x41, + 0xc2, 0x80, 0x57, 0x55, 0x98, 0xa6, 0x7b, 0xa3, 0xbe, 0x90, 0x6d, 0x33, + 0xb2, 0x9e, 0x77, 0xb1, 0x3e, 0xc5, 0xab, 0x8b, 0xc2, 0x66, 0xd3, 0x9c, + 0xb2, 0xbf, 0x94, 0x43, 0xbd, 0x9f, 0x6b, 0xb5, 0x39, 0xa4, 0xa8, 0x5b, + 0xa2, 0x78, 0x36, 0x5d, 0xd5, 0xb2, 0xbb, 0x85, 0x6d, 0x70, 0x8c, 0x6e, + 0x72, 0x83, 0xc6, 0x33, 0xb4, 0x63, 0x93, 0xd2, 0x9c, 0xa6, 0xa6, 0xc4, + 0xcd, 0xcc, 0xa0, 0xbc, 0x75, 0x38, 0x47, 0x67, 0xad, 0x98, 0x40, 0x3d, + 0xac, 0xd3, 0x7c, 0xa1, 0x3e, 0xb4, 0xcb, 0x8c, 0xc0, 0xcb, 0x3a, 0x99, + 0x6d, 0x95, 0xd6, 0xa0, 0x60, 0xa9, 0xa8, 0xa2, 0xc2, 0xba, 0xd2, 0x56, + 0xc8, 0x4c, 0x63, 0x76, 0x52, 0xd6, 0x78, 0x47, 0x9f, 0x63, 0x6a, 0xa6, + 0x85, 0x71, 0x4d, 0x5d, 0x60, 0x4f, 0x3d, 0xb9, 0x3f, 0x8e, 0xd3, 0xd2, + 0x6e, 0x71, 0x5c, 0xc6, 0xa9, 0x7c, 0x72, 0xa3, 0x72, 0x67, 0xa2, 0x7e, + 0xc0, 0x38, 0xa6, 0x59, 0x79, 0x56, 0x78, 0xcc, 0xb5, 0xa2, 0x3f, 0x9f, + 0xaa, 0x86, 0x53, 0x5e, 0x30, 0x42, 0xcb, 0x6b, 0x45, 0x8c, 0x53, 0x3c, + 0x9e, 0xb7, 0x31, 0x64, 0xc1, 0x85, 0xcd, 0xca, 0xd7, 0xc7, 0x4c, 0x57, + 0x72, 0x67, 0x56, 0x4c, 0x3a, 0xa7, 0x40, 0x83, 0xc5, 0xbe, 0xd0, 0x76, + 0x36, 0x8c, 0xa3, 0xae, 0x97, 0x81, 0xb7, 0x83, 0x81, 0xad, 0x9b, 0xbf, + 0xa3, 0x56, 0x49, 0x4a, 0x95, 0xbb, 0x90, 0x88, 0xbd, 0xa6, 0x6d, 0x72, + 0x71, 0x43, 0x98, 0x84, 0x4b, 0x60, 0x61, 0xa2, 0x33, 0x86, 0xbd, 0x79, + 0xc8, 0x6b, 0x9e, 0xb6, 0xc6, 0xae, 0xa5, 0x46, 0xcf, 0x5f, 0x47, 0xaf, + 0xc2, 0x4d, 0x83, 0x78, 0xb8, 0x9a, 0x64, 0xc4, 0x7a, 0x71, 0x41, 0x95, + 0x52, 0x7f, 0x4f, 0xb6, 0x83, 0x67, 0x4e, 0x31, 0x84, 0x54, 0xd2, 0x3b, + 0x52, 0xa8, 0x6e, 0xba, 0x76, 0x6e, 0x9d, 0x92, 0x48, 0x41, 0x5c, 0xc6, + 0x50, 0xb8, 0x86, 0x3c, 0xa9, 0x37, 0x93, 0x93, 0x6f, 0x6a, 0x3b, 0x4f, + 0x96, 0x74, 0x56, 0x33, 0x97, 0x43, 0xba, 0x94, 0xb6, 0x37, 0x5b, 0x77, + 0x76, 0xc4, 0x48, 0x3e, 0xd6, 0x7b, 0x45, 0x86, 0x39, 0x63, 0x79, 0x52, + 0x40, 0x6b, 0x96, 0xab, 0x41, 0x4f, 0xa1, 0x55, 0xa0, 0xb8, 0x71, 0x3f, + 
0x39, 0x92, 0xb5, 0x82, 0xb1, 0xb9, 0x3b, 0x5e, 0xb7, 0xaa, 0x36, 0x34, + 0x80, 0x7e, 0x8d, 0xac, 0xbf, 0xad, 0x7e, 0x9a, 0xd3, 0xc1, 0x98, 0x5a, + 0x7c, 0x55, 0x8b, 0x50, 0x30, 0x87, 0x58, 0x48, 0x5e, 0x31, 0x8a, 0x3a, + 0x8e, 0x36, 0x67, 0x7b, 0xab, 0x8f, 0x50, 0x9f, 0x54, 0xd5, 0x5d, 0x85, + 0x5a, 0x37, 0x35, 0xcc, 0xc5, 0xa1, 0x71, 0x72, 0x98, 0x39, 0x7e, 0x75, + 0x51, 0x79, 0x47, 0xd0, 0x94, 0xa6, 0x81, 0x68, 0x5f, 0xca, 0xc4, 0x74, + 0x36, 0x70, 0x2e, 0x87, 0xd3, 0x94, 0x65, 0xc2, 0xac, 0x3d, 0x88, 0x61, + 0xb9, 0x64, 0x64, 0x5e, 0x31, 0x35, 0x63, 0xa5, 0xaf, 0xbf, 0x6f, 0xc3, + 0xa7, 0xb5, 0x81, 0xb5, 0x7a, 0x85, 0xb9, 0xb2, 0x71, 0x6c, 0xb8, 0xc8, + 0x87, 0x9f, 0xac, 0x9b, 0x6e, 0xb3, 0x7f, 0xc8, 0x91, 0x7e, 0x96, 0x59, + 0xba, 0x52, 0x53, 0xac, 0x9e, 0x66, 0x70, 0xb1, 0x3c, 0x3c, 0x80, 0xa1, + 0x6c, 0x69, 0x86, 0xcb, 0x9c, 0x50, 0x41, 0x4b, 0xcc, 0x59, 0x8f, 0x93, + 0x3a, 0xa6, 0xab, 0xc7, 0x43, 0x3a, 0xae, 0x70, 0xa0, 0x98, 0x8a, 0x78, + 0x67, 0x3c, 0x3e, 0x96, 0x94, 0x3c, 0x8a, 0xb6, 0xc2, 0x4f, 0x8c, 0x97, + 0x9a, 0x5b, 0x4d, 0x67, 0xc3, 0xd1, 0x36, 0x50, 0x78, 0x86, 0x4d, 0xb8, + 0x7d, 0x3e, 0xce, 0x9c, 0x4a, 0x97, 0xc4, 0x46, 0x38, 0xcf, 0x87, 0x89, + 0x6d, 0x83, 0xa4, 0x8f, 0x44, 0xa6, 0x7b, 0x66, 0x42, 0x8c, 0x57, 0x47, + 0x46, 0x68, 0x6c, 0x85, 0xa0, 0x6a, 0x92, 0x60, 0x8d, 0x71, 0x49, 0x7d, + 0x35, 0x9d, 0xb7, 0xbb, 0x71, 0x5e, 0xc7, 0x9b, 0x5e, 0xd4, 0x34, 0xa5, + 0x49, 0x85, 0x98, 0x73, 0x9c, 0x4d, 0x97, 0x78, 0xd1, 0x4c, 0x3d, 0xa5, + 0x56, 0x4b, 0xbf, 0xcb, 0x7d, 0x69, 0xc6, 0x83, 0x38, 0x67, 0x68, 0x31, + 0x60, 0xbe, 0x36, 0xc9, 0x9c, 0x56, 0x7d, 0x62, 0x52, 0x54, 0x70, 0xd5, + 0x60, 0x65, 0x61, 0xc6, 0xc4, 0x39, 0x32, 0x81, 0x6d, 0xb0, 0x9f, 0x68, + 0xd9, 0xb2, 0xae, 0x65, 0x62, 0xbc, 0xb2, 0x30, 0x57, 0xa0, 0x65, 0xa4, + 0xd8, 0xc2, 0xaf, 0xa4, 0x70, 0xa8, 0x65, 0x7d, 0x54, 0xbc, 0x31, 0x84, + 0x42, 0x71, 0x5f, 0x93, 0x7e, 0xd6, 0x6b, 0xaf, 0x98, 0x82, 0x5c, 0x3c, + 0x59, 0x8e, 0x62, 0x8a, 0x38, 0xc3, 0xab, 0x3e, 0xb1, 0x7e, 0x82, 0xb9, + 0xa2, 0x5b, 0x4c, 0xaa, 0xbb, 0x93, 0x58, 0xa0, 0x7d, 0x80, 0x8c, 0xd8, + 0x6a, 0xd3, 0x6c, 0x68, 0xb4, 0x42, 0x99, 0xc4, 0x3a, 0x4e, 0x8f, 0xbb, + 0x51, 0x40, 0x87, 0x9d, 0x78, 0x3d, 0x32, 0x91, 0xa9, 0x83, 0xba, 0x9e, + 0x60, 0x8c, 0x66, 0x57, 0x46, 0x62, 0xbc, 0xb6, 0x56, 0x94, 0xc3, 0x35, + 0x7a, 0x40, 0xa0, 0xc2, 0x81, 0x62, 0x2f, 0xb0, 0xbe, 0xb0, 0xba, 0xa5, + 0xaf, 0xc8, 0xab, 0x4c, 0x53, 0xcc, 0x9d, 0x60, 0x88, 0x42, 0x88, 0x68, + 0x78, 0x32, 0xc3, 0x38, 0x68, 0x80, 0x4c, 0x65, 0x96, 0x4b, 0x7a, 0xbf, + 0x3e, 0xb6, 0xd2, 0x7f, 0x57, 0xd0, 0x4e, 0x62, 0x32, 0xa5, 0xd7, 0x80, + 0x44, 0xd2, 0x6d, 0x3c, 0x7a, 0xa7, 0x46, 0x87, 0x67, 0x4b, 0x64, 0xbd, + 0xad, 0xc6, 0x3e, 0xb7, 0xcf, 0xc5, 0x55, 0x2f, 0x72, 0x74, 0x81, 0x30, + 0x5a, 0x30, 0xa3, 0x77, 0x4d, 0x4d, 0x4d, 0xd9, 0x62, 0x81, 0x3b, 0x94, + 0x85, 0x4e, 0x9a, 0x75, 0xce, 0x46, 0x5a, 0xa7, 0x67, 0x86, 0xd3, 0xb1, + 0x64, 0xb0, 0xae, 0xa9, 0x4b, 0xc1, 0xa0, 0x31, 0x8a, 0xc9, 0x69, 0x4d, + 0xbe, 0x95, 0x57, 0x46, 0x87, 0xc9, 0x4a, 0xa5, 0x57, 0xa4, 0x6b, 0x6b, + 0x67, 0x37, 0xaf, 0xc5, 0x47, 0x40, 0x8d, 0xba, 0x74, 0x36, 0xb4, 0xde, + 0x61, 0xb3, 0x84, 0xb0, 0x4f, 0x7d, 0xbb, 0xbb, 0x74, 0x8a, 0x6d, 0x7f, + 0x66, 0xc3, 0xc6, 0x2b, 0xc8, 0xa9, 0x63, 0xb2, 0xb3, 0x37, 0x93, 0x9e, + 0x81, 0x82, 0x43, 0xc1, 0x99, 0x8e, 0xcf, 0xa8, 0xb3, 0xaf, 0x5d, 0xcc, + 0xa0, 0xb0, 0xbc, 0x6a, 0xb8, 0x5f, 0x55, 0xd3, 0x4d, 0x8c, 0x8b, 0x2f, + 0x7e, 0xaa, 0x7f, 0x9f, 0x9e, 0x36, 0xbd, 0x65, 0x87, 0x8b, 0xd4, 0xad, + 0x36, 0xb8, 0x5a, 0x53, 0xc3, 0xca, 0xbe, 0x93, 0xa9, 0xb8, 0x9f, 0x35, + 
0x43, 0x29, 0x96, 0x92, 0x40, 0x7f, 0xb6, 0x5e, 0xa1, 0xa9, 0xc3, 0x87, + 0xc7, 0x76, 0x82, 0xce, 0x75, 0xc3, 0x6c, 0xcd, 0x88, 0xcc, 0xac, 0x84, + 0x7f, 0x54, 0xb9, 0x4e, 0x59, 0x9b, 0x6a, 0x7f, 0x8f, 0x8b, 0x55, 0xaf, + 0x66, 0x84, 0x64, 0x9e, 0xb8, 0x69, 0x94, 0x55, 0xb7, 0x73, 0xa2, 0x5d, + 0xaf, 0x6d, 0x52, 0x68, 0x7a, 0x93, 0x5a, 0xa4, 0xb3, 0xa1, 0x3c, 0x87, + 0x87, 0xd0, 0x6d, 0xa8, 0xb9, 0xb1, 0x68, 0xa5, 0x93, 0xb6, 0xb6, 0x43, + 0x37, 0x2e, 0x4c, 0x9c, 0xaa, 0x2b, 0xcb, 0x8b, 0x4d, 0xc5, 0x39, 0x5d, + 0x6b, 0xb3, 0x96, 0x5c, 0xbc, 0x91, 0x56, 0x5e, 0xb8, 0x5a, 0xc2, 0x3e, + 0x4c, 0x86, 0xbb, 0x3d, 0x67, 0xab, 0x92, 0x40, 0x66, 0x8c, 0x9e, 0x8b, + 0xaf, 0xd2, 0x6d, 0xa2, 0x85, 0xa2, 0x37, 0x79, 0x5a, 0x73, 0x88, 0x9e, + 0xba, 0xc5, 0xc1, 0x46, 0x91, 0x65, 0x5c, 0x91, 0x9d, 0x38, 0xd4, 0xd3, + 0xcf, 0x58, 0x8e, 0x36, 0x8d, 0x90, 0x8b, 0x38, 0x6e, 0x9f, 0x53, 0x9b, + 0xa6, 0x8b, 0x70, 0x6f, 0x36, 0x5a, 0x48, 0x7a, 0xa6, 0x37, 0xbb, 0x24, + 0xc2, 0xd4, 0xa1, 0x3c, 0xc1, 0xb7, 0x53, 0x25, 0x88, 0x51, 0x56, 0x79, + 0x49, 0x9b, 0x6a, 0x9b, 0x56, 0x3e, 0x8f, 0xd0, 0x92, 0x22, 0x2e, 0x8f, + 0x7b, 0x36, 0x6b, 0x66, 0xce, 0xc4, 0x8a, 0x71, 0x6e, 0xb7, 0x47, 0x45, + 0x5a, 0x4c, 0x6b, 0x7d, 0x32, 0xc8, 0xd3, 0x62, 0x94, 0xd3, 0xc4, 0x6a, + 0x41, 0x58, 0x92, 0xbc, 0x67, 0x85, 0x4a, 0xa9, 0x91, 0x61, 0x97, 0x47, + 0x94, 0xae, 0x62, 0xbe, 0xaf, 0x86, 0x63, 0x79, 0x43, 0x62, 0x6d, 0x42, + 0x4c, 0x42, 0xab, 0xd4, 0xb8, 0x59, 0xb4, 0x4f, 0x5e, 0xd1, 0xba, 0x8f, + 0x49, 0x56, 0xc7, 0x4d, 0xcd, 0x88, 0x76, 0xbd, 0x7a, 0x99, 0xb8, 0x6c, + 0x4a, 0xca, 0x99, 0x3b, 0xea, 0xab, 0x5d, 0xc0, 0x6a, 0x70, 0xa8, 0x6d, + 0x75, 0x8e, 0x42, 0xaa, 0xc8, 0x70, 0xc1, 0xc5, 0xd0, 0x4c, 0x5a, 0x5e, + 0x49, 0xd2, 0x3a, 0xcf, 0x39, 0x4c, 0x71, 0x89, 0x49, 0x7a, 0x82, 0x42, + 0x75, 0x68, 0xb5, 0xc3, 0x37, 0x94, 0xbb, 0x77, 0xa6, 0x45, 0x37, 0x87, + 0xc2, 0x97, 0x7c, 0x96, 0x99, 0x58, 0xb1, 0x3b, 0x35, 0xd1, 0x47, 0xa0, + 0x86, 0x66, 0xde, 0xb3, 0x99, 0xaf, 0x62, 0x77, 0xa0, 0x3a, 0x4d, 0x58, + 0x77, 0x55, 0x76, 0x56, 0x51, 0xd4, 0x44, 0x50, 0x83, 0xa7, 0x3a, 0xc4, + 0xb9, 0x8b, 0x87, 0x8f, 0x83, 0x54, 0x93, 0x54, 0x8c, 0x53, 0xa9, 0x7f, + 0xb7, 0x8c, 0x77, 0xa9, 0x7a, 0x64, 0x9a, 0x56, 0x6b, 0xc9, 0x33, 0x9f, + 0x40, 0x6c, 0x9d, 0xa5, 0xc3, 0xc2, 0x58, 0xcf, 0x56, 0x2c, 0x75, 0x4c, + 0x73, 0xa7, 0x6e, 0x7c, 0xc8, 0xa9, 0xc5, 0x7a, 0xbd, 0x67, 0x46, 0x40, + 0x68, 0x6d, 0x81, 0x52, 0xa8, 0x37, 0xbd, 0x5b, 0x83, 0x6c, 0x51, 0x76, + 0x77, 0x75, 0x32, 0x99, 0x8c, 0x6a, 0x72, 0xaa, 0x77, 0x53, 0x66, 0x4c, + 0x80, 0x49, 0xa4, 0x77, 0x5e, 0xbf, 0x8e, 0xab, 0x86, 0x63, 0xa7, 0xa5, + 0x43, 0x42, 0x5f, 0x49, 0x73, 0x48, 0x7d, 0xa8, 0x56, 0xb5, 0x8f, 0x6e, + 0x56, 0x64, 0x70, 0x53, 0x76, 0x52, 0x37, 0xb7, 0xb3, 0x40, 0x98, 0xaa, + 0x48, 0xb7, 0xc9, 0x8e, 0xa0, 0x65, 0x46, 0x7b, 0x5d, 0x66, 0x9a, 0x84, + 0xae, 0xb4, 0xa5, 0xca, 0xaf, 0x66, 0xa2, 0x90, 0xba, 0x46, 0x60, 0x5b, + 0x4c, 0xbb, 0x53, 0x53, 0x65, 0x77, 0xc6, 0x55, 0x3d, 0xca, 0xa7, 0x8b, + 0x65, 0x6a, 0xd2, 0x83, 0xa3, 0xaa, 0xb3, 0xca, 0x69, 0x4e, 0xd6, 0x8f, + 0xa3, 0x72, 0x3d, 0xc0, 0x99, 0x78, 0x5e, 0xce, 0xc5, 0x62, 0x57, 0xa6, + 0x48, 0x67, 0xbd, 0xaa, 0x71, 0x15, 0x46, 0x60, 0x72, 0x3c, 0xb0, 0x82, + 0xd7, 0xd2, 0xcd, 0x5c, 0x4c, 0x70, 0x33, 0xcc, 0x80, 0x32, 0x2b, 0x57, + 0x78, 0x3e, 0x5e, 0x7e, 0xc4, 0xcc, 0x63, 0xb1, 0xca, 0x58, 0xcd, 0xb7, + 0xc8, 0xa6, 0x84, 0xad, 0x70, 0x4d, 0x9a, 0x38, 0xc3, 0x82, 0xb9, 0xc4, + 0x42, 0x66, 0xa4, 0xc7, 0x7c, 0x77, 0x65, 0x73, 0x68, 0x83, 0xd2, 0xa5, + 0xca, 0x93, 0x9c, 0xc2, 0x60, 0xb7, 0x7e, 0xc0, 0x45, 0x5e, 0x97, 0x78, + 
0x6a, 0x84, 0xb6, 0xb7, 0x80, 0x9b, 0x87, 0x5e, 0xcc, 0x98, 0xad, 0x68, + 0x76, 0x51, 0xc6, 0x83, 0x61, 0x4e, 0x94, 0x8d, 0xac, 0xc8, 0x62, 0x3a, + 0x86, 0x41, 0x2b, 0x76, 0x88, 0xb2, 0x77, 0xb0, 0x84, 0x8a, 0x89, 0xc5, + 0xcf, 0xa0, 0xd8, 0x39, 0xad, 0x8c, 0x44, 0x2c, 0x61, 0x98, 0xa2, 0xb9, + 0x38, 0xc1, 0xc8, 0x52, 0x6f, 0x35, 0x98, 0x89, 0x8a, 0xae, 0xbe, 0x89, + 0x8a, 0xaa, 0x59, 0x7b, 0xba, 0xbc, 0x3a, 0xce, 0x96, 0x59, 0x3e, 0x94, + 0xc0, 0x86, 0x81, 0xc4, 0xc6, 0xd0, 0x4a, 0x30, 0xa5, 0x6b, 0xa0, 0xd4, + 0x7c, 0x9a, 0x51, 0x40, 0xb0, 0x93, 0x6d, 0x5f, 0x77, 0x5f, 0x80, 0x5d, + 0x4b, 0x64, 0x70, 0xc0, 0x51, 0x40, 0xce, 0xc6, 0x62, 0xa0, 0xa7, 0xbf, + 0x5d, 0x6c, 0xaf, 0xb0, 0xb1, 0xd4, 0x34, 0x8a, 0x4b, 0xca, 0x84, 0x5b, + 0x44, 0xcc, 0xa5, 0x76, 0x35, 0x7d, 0x60, 0xda, 0x35, 0x3a, 0x65, 0xa1, + 0x8f, 0xab, 0xc1, 0xc0, 0x36, 0x4a, 0xce, 0xc0, 0x97, 0x95, 0x91, 0x70, + 0x52, 0x5e, 0x47, 0x8d, 0x66, 0x35, 0x35, 0x43, 0x90, 0xa9, 0xa6, 0xb2, + 0xc5, 0xbf, 0xa0, 0xaf, 0x46, 0x9d, 0xbd, 0x84, 0x4b, 0x82, 0x50, 0xa3, + 0xb3, 0xc2, 0x72, 0xbf, 0x5b, 0xd7, 0xbf, 0x51, 0x48, 0xd0, 0x33, 0xa9, + 0x81, 0x4d, 0x7d, 0x96, 0x7c, 0x49, 0xa0, 0x5b, 0xb9, 0xb5, 0x2e, 0x8e, + 0x9e, 0x99, 0x39, 0xb8, 0x3f, 0xaf, 0x42, 0x6d, 0x94, 0x4b, 0x87, 0xb0, + 0x8e, 0x5c, 0xbb, 0xca, 0x3b, 0x91, 0xd2, 0x89, 0xae, 0x8c, 0x3c, 0xc4, + 0x6f, 0xb5, 0x45, 0xb6, 0x97, 0x90, 0x6a, 0x67, 0x69, 0x40, 0xc0, 0x39, + 0x87, 0x9d, 0x6b, 0x85, 0xc2, 0xcd, 0x7f, 0x70, 0x95, 0x93, 0x6d, 0xa3, + 0x66, 0x55, 0x80, 0x8f, 0x7b, 0xd6, 0x45, 0x37, 0x57, 0x89, 0xd7, 0x61, + 0xae, 0x7d, 0xc7, 0x74, 0x80, 0x7c, 0x83, 0xc1, 0x4d, 0x32, 0x63, 0xc3, + 0xb6, 0xcd, 0x93, 0xbc, 0x6b, 0xcc, 0x34, 0x9d, 0xb6, 0xa9, 0x7f, 0x66, + 0x57, 0x3c, 0x38, 0x54, 0x34, 0x62, 0x92, 0xb3, 0x60, 0x7a, 0x6d, 0x54, + 0x4d, 0x81, 0x8d, 0xcd, 0xb7, 0xd2, 0x97, 0x86, 0x58, 0xd8, 0x99, 0x80, + 0xa4, 0xd5, 0x90, 0xbd, 0x7e, 0x5c, 0x93, 0xc5, 0xb9, 0x74, 0xcf, 0x67, + 0x55, 0x80, 0x69, 0x68, 0x83, 0x38, 0xc4, 0x4c, 0x54, 0x5a, 0x1e, 0x6e, + 0xbd, 0x8c, 0xbf, 0x71, 0x86, 0x4f, 0xb1, 0x6c, 0x7b, 0x6a, 0x77, 0xbc, + 0xa7, 0x7a, 0x3e, 0xc0, 0x7d, 0x4c, 0x3d, 0x51, 0x4c, 0x46, 0xbd, 0x9a, + 0xc9, 0xb1, 0x2a, 0x37, 0x9c, 0x86, 0x3c, 0xb9, 0x5a, 0x84, 0x38, 0x8a, + 0x90, 0xb5, 0x5b, 0x44, 0x8f, 0x3e, 0x3c, 0xb2, 0x9a, 0x92, 0x91, 0x47, + 0xce, 0x69, 0x77, 0x42, 0xb2, 0x74, 0xa8, 0x9f, 0x47, 0x3e, 0x45, 0x44, + 0xcb, 0xcc, 0xd6, 0x81, 0x6d, 0x8f, 0xe3, 0x91, 0x4c, 0xb5, 0xce, 0x77, + 0x37, 0x3e, 0x2b, 0x35, 0xc5, 0x7e, 0xb3, 0x9c, 0x30, 0x40, 0xc7, 0x51, + 0x54, 0xa4, 0x45, 0xb9, 0xbd, 0x81, 0x58, 0xb5, 0xaa, 0x8c, 0x66, 0x8f, + 0xa7, 0x4a, 0x9e, 0x5b, 0x95, 0xca, 0xcb, 0x47, 0xb7, 0xc3, 0x94, 0x98, + 0x64, 0x66, 0x7a, 0x2d, 0xa5, 0x5d, 0xa0, 0xa3, 0x5f, 0x4e, 0xa7, 0x96, + 0x35, 0x6a, 0x4e, 0x72, 0x62, 0x67, 0x93, 0x54, 0xbd, 0x65, 0x51, 0x95, + 0xcc, 0x78, 0x57, 0x94, 0x76, 0x64, 0x92, 0x66, 0x76, 0x57, 0xc8, 0x62, + 0x39, 0xd4, 0x85, 0x3b, 0x55, 0x92, 0xa7, 0xce, 0x57, 0x36, 0xa9, 0x44, + 0xc4, 0x3a, 0xae, 0x7e, 0x76, 0xaf, 0xc1, 0xbd, 0x3c, 0x86, 0x80, 0xb9, + 0xc1, 0xcb, 0xb5, 0x87, 0xa7, 0x71, 0x5e, 0x45, 0xc9, 0x4e, 0x6d, 0x8c, + 0x9f, 0xa0, 0x63, 0x59, 0x92, 0x97, 0xcf, 0x84, 0x98, 0x5d, 0xb3, 0x6f, + 0x67, 0x95, 0xc8, 0x66, 0x59, 0x43, 0x5e, 0x6c, 0xbf, 0xb1, 0x50, 0xcf, + 0x86, 0x37, 0x5b, 0x9e, 0xd3, 0x32, 0x32, 0x57, 0x59, 0xb8, 0x74, 0x72, + 0x88, 0x2d, 0x60, 0x3b, 0xab, 0x46, 0x5d, 0x84, 0xb4, 0x9a, 0xd9, 0xae, + 0xbc, 0x8e, 0x5e, 0x96, 0xc6, 0x5e, 0x6d, 0xa1, 0xc9, 0x9b, 0xd6, 0x6c, + 0x8f, 0xa8, 0x62, 0x73, 0x54, 0x91, 0x99, 0xc9, 0xca, 0x46, 0xab, 0x92, + 
0xb4, 0xc4, 0x45, 0x45, 0xbf, 0x9e, 0xcc, 0x34, 0x98, 0xc2, 0xa0, 0x84, + 0x62, 0x98, 0xb3, 0xba, 0x7d, 0x31, 0x83, 0x77, 0xa9, 0xd6, 0xd9, 0x3a, + 0x49, 0x57, 0xe0, 0xd0, 0x4e, 0x4f, 0x98, 0x33, 0x52, 0x5a, 0x2a, 0x78, + 0xcb, 0x82, 0x89, 0xac, 0xc9, 0x6b, 0x59, 0xa0, 0x80, 0x79, 0x44, 0x40, + 0xcb, 0x47, 0x8b, 0xbd, 0xaa, 0x60, 0xb5, 0x78, 0xa6, 0xc2, 0xcd, 0xa7, + 0xb8, 0xa8, 0xbc, 0xa4, 0x55, 0x9e, 0x51, 0x7f, 0x2e, 0x65, 0xbb, 0x51, + 0xa8, 0x86, 0x44, 0xb2, 0x9e, 0x46, 0x42, 0x57, 0xc4, 0x63, 0x90, 0x41, + 0xb0, 0x8f, 0xca, 0xae, 0xa1, 0xc1, 0xae, 0x70, 0xa0, 0x67, 0x40, 0xb0, + 0x93, 0x7d, 0x90, 0xb6, 0x66, 0x40, 0x6b, 0x50, 0x8c, 0x69, 0x72, 0x7d, + 0x9b, 0xb1, 0xc9, 0x55, 0x8a, 0x37, 0x65, 0x6e, 0x7c, 0xac, 0x4c, 0x47, + 0xc4, 0x6e, 0xc0, 0x55, 0x47, 0x9d, 0x9b, 0xbe, 0x4a, 0x62, 0x54, 0x31, + 0xb5, 0x7e, 0x8a, 0x5b, 0x94, 0x70, 0x76, 0xc2, 0x52, 0x87, 0x4f, 0x56, + 0xcf, 0x41, 0x8b, 0x95, 0x8a, 0x48, 0x41, 0xb5, 0x36, 0x3e, 0x8c, 0xc2, + 0x45, 0xb3, 0x41, 0x4b, 0x79, 0xc0, 0x55, 0x5f, 0x42, 0xbb, 0x39, 0x69, + 0x75, 0x3b, 0x5f, 0x37, 0x3b, 0xb7, 0x57, 0x94, 0xb3, 0x67, 0xcb, 0xce, + 0x75, 0xc2, 0x33, 0x6b, 0x54, 0x9b, 0x9a, 0x3a, 0x44, 0x52, 0x58, 0x94, + 0xc8, 0x91, 0x85, 0x4b, 0x66, 0x96, 0x46, 0xa2, 0x35, 0x39, 0x5d, 0xb5, + 0x3f, 0xb7, 0x6e, 0x56, 0x32, 0x61, 0x49, 0x43, 0x72, 0xab, 0x39, 0x4c, + 0x98, 0x6c, 0xce, 0xae, 0xc7, 0x59, 0x8a, 0x7d, 0x9f, 0x60, 0x5c, 0x88, + 0xd0, 0xcf, 0xc4, 0xc4, 0x97, 0xa8, 0x2e, 0x69, 0x66, 0x99, 0x7b, 0xb9, + 0x8c, 0xb8, 0x36, 0x4d, 0x5d, 0x99, 0xa6, 0x7a, 0x81, 0x55, 0xb4, 0xcf, + 0xaf, 0xa9, 0xa7, 0xb0, 0xbe, 0x5c, 0xa4, 0x66, 0x84, 0xa7, 0x41, 0xcc, + 0xaf, 0x7f, 0x96, 0x9c, 0x41, 0xae, 0x53, 0x42, 0x8a, 0xcf, 0x60, 0xc6, + 0x92, 0x84, 0xc1, 0x8b, 0x4c, 0x33, 0xa3, 0xb0, 0x87, 0x8d, 0x6d, 0x5d, + 0xba, 0x80, 0x90, 0xcc, 0x89, 0x8b, 0x90, 0x8f, 0xb2, 0x7b, 0x63, 0x43, + 0x3b, 0xb2, 0x69, 0xbf, 0xa5, 0x71, 0x3a, 0xac, 0xcd, 0x4b, 0x98, 0x92, + 0x83, 0xd1, 0xb6, 0xba, 0x3f, 0xb1, 0xcd, 0x65, 0xa9, 0x8e, 0x8e, 0xb0, + 0x99, 0x92, 0x91, 0x8c, 0x5f, 0xc1, 0x49, 0x35, 0xc8, 0x62, 0x97, 0x31, + 0x3e, 0x95, 0x45, 0xb8, 0x51, 0xa0, 0xab, 0x6d, 0xb6, 0xbe, 0xb0, 0xbb, + 0x4d, 0x86, 0x5d, 0x50, 0x5e, 0x95, 0x6a, 0x8f, 0x54, 0x53, 0x60, 0x77, + 0x7a, 0x74, 0x8c, 0xa1, 0x7f, 0xa3, 0x81, 0x86, 0x6a, 0x72, 0xaa, 0xb6, + 0x96, 0x87, 0xca, 0xc8, 0xbc, 0x9d, 0xd0, 0x6e, 0x98, 0x3f, 0xc1, 0x65, + 0x34, 0x9b, 0xab, 0x90, 0xbb, 0x39, 0xab, 0x59, 0xab, 0x9f, 0x39, 0x3d, + 0x7f, 0xba, 0xba, 0x71, 0x30, 0x36, 0x7a, 0x70, 0x3f, 0x9e, 0x9d, 0x81, + 0x53, 0x63, 0x5a, 0x6a, 0xc2, 0x87, 0x69, 0x43, 0x54, 0xd0, 0xb8, 0xc1, + 0xba, 0x62, 0x6b, 0x70, 0x45, 0x7d, 0x3b, 0x4d, 0x38, 0x4c, 0x7e, 0x9a, + 0x56, 0x92, 0xa4, 0xa9, 0x76, 0x49, 0x9c, 0x37, 0x53, 0x72, 0x74, 0xb1, + 0x7b, 0x44, 0xc1, 0xa7, 0x6a, 0x5e, 0xa3, 0x47, 0x63, 0xa1, 0x41, 0xb2, + 0x91, 0x9c, 0x9d, 0x35, 0xce, 0x95, 0xcf, 0x4d, 0x33, 0x4d, 0x61, 0xad, + 0xcd, 0xa3, 0x62, 0x69, 0x40, 0x8a, 0x9b, 0xd0, 0x76, 0x8c, 0xa7, 0x96, + 0xc2, 0x9e, 0x9c, 0x5e, 0x7b, 0xb5, 0xd4, 0x37, 0xcd, 0x94, 0xc4, 0x39, + 0xac, 0x59, 0x68, 0x9d, 0xa3, 0x9e, 0x63, 0x3b, 0x32, 0x66, 0x7b, 0xb7, + 0x45, 0x6d, 0x5f, 0x8f, 0xcb, 0xbd, 0x3c, 0xc7, 0x8a, 0x64, 0x3a, 0x61, + 0x3d, 0x5c, 0xc3, 0x5f, 0x49, 0x3b, 0x86, 0xa3, 0xcf, 0x6a, 0x8a, 0xa7, + 0x44, 0x60, 0x44, 0x35, 0x62, 0x44, 0x8e, 0xc2, 0xca, 0x6f, 0xbb, 0x70, + 0x4d, 0x5e, 0x97, 0x96, 0xca, 0x3e, 0x6c, 0x3f, 0x31, 0x6b, 0x62, 0x94, + 0xc5, 0x99, 0x6a, 0x5c, 0xb4, 0xc4, 0x96, 0xd3, 0xaf, 0x43, 0xa4, 0x68, + 0x68, 0x84, 0xa6, 0x84, 0xc9, 0x34, 0x8c, 0x38, 0x39, 0xac, 0x48, 0xc4, + 
0x7b, 0x62, 0x3c, 0xcf, 0x50, 0xd3, 0xb4, 0xae, 0x8f, 0x9b, 0x6d, 0x53, + 0xb8, 0x92, 0xd2, 0xbc, 0x77, 0x3a, 0xaa, 0x90, 0x5a, 0xce, 0x99, 0x9b, + 0x5d, 0x4b, 0x8a, 0x66, 0x5f, 0xa7, 0x67, 0x36, 0xb5, 0x6c, 0x82, 0x54, + 0x8d, 0x5a, 0x8f, 0xcf, 0x3c, 0x64, 0x68, 0x43, 0x59, 0x91, 0xbf, 0x72, + 0x45, 0x8c, 0x39, 0xbc, 0x3d, 0x57, 0x9f, 0x4d, 0xca, 0xc6, 0x30, 0xd1, + 0xc6, 0x31, 0xa7, 0x5d, 0x9e, 0xac, 0xba, 0xce, 0xa4, 0x71, 0xc6, 0x53, + 0xc0, 0x82, 0xa5, 0x76, 0xbe, 0x3f, 0x3b, 0x94, 0x3c, 0x8c, 0x5c, 0x9a, + 0x32, 0x43, 0x38, 0x98, 0x35, 0x97, 0x7b, 0x53, 0x8b, 0x44, 0x67, 0x46, + 0xa2, 0x47, 0x81, 0x97, 0x77, 0x2e, 0xb6, 0x5f, 0x51, 0x91, 0xa0, 0x9f, + 0x47, 0x2c, 0x95, 0x9c, 0x59, 0x56, 0xbf, 0x37, 0xa7, 0xc6, 0x3e, 0x74, + 0x80, 0x97, 0x9e, 0x4d, 0x38, 0x4e, 0x54, 0x6f, 0x3a, 0x91, 0xcc, 0x80, + 0xc2, 0x92, 0x49, 0x36, 0x95, 0x3a, 0x6a, 0x8c, 0x94, 0x6c, 0xc0, 0xc3, + 0x47, 0x51, 0x4d, 0x41, 0x6c, 0x53, 0x70, 0xa9, 0xb5, 0x5d, 0x7e, 0x8d, + 0x55, 0x4c, 0x84, 0x6e, 0xc4, 0xc3, 0x9f, 0x9f, 0x44, 0x59, 0x59, 0x3e, + 0xc0, 0x6c, 0xbd, 0x34, 0xd4, 0xb7, 0x5c, 0xb9, 0xc6, 0x9f, 0xaf, 0x65, + 0x98, 0xa5, 0x6a, 0xa9, 0x50, 0xcf, 0x62, 0xb5, 0xd3, 0x6a, 0x5a, 0x66, + 0x98, 0xa6, 0xa3, 0x53, 0x81, 0xb3, 0x64, 0x52, 0x6c, 0x5a, 0xaf, 0xb5, + 0xb8, 0x49, 0x62, 0x4c, 0x4e, 0xab, 0x6a, 0x3c, 0xa7, 0xc4, 0x3a, 0x9d, + 0xa0, 0x5b, 0x40, 0x7d, 0xb6, 0x3f, 0x80, 0x96, 0xba, 0x93, 0x96, 0xb6, + 0x73, 0x71, 0xaf, 0x78, 0x46, 0xcd, 0xa7, 0xb2, 0x5f, 0x8a, 0x61, 0x3e, + 0x45, 0x5d, 0x93, 0x7c, 0x4a, 0xa5, 0x64, 0x64, 0x4a, 0x42, 0x57, 0x77, + 0x4f, 0x51, 0x57, 0x8d, 0x3f, 0x8e, 0x84, 0x9b, 0xb4, 0xba, 0xcb, 0x95, + 0x8c, 0x77, 0x36, 0x99, 0xb0, 0x6f, 0x9c, 0x8f, 0xcd, 0x92, 0x54, 0x53, + 0x44, 0x49, 0x7c, 0x8a, 0xb1, 0xbe, 0x9b, 0x8f, 0x36, 0x51, 0x94, 0xc8, + 0x7b, 0x3c, 0x72, 0x6a, 0xad, 0xb6, 0x5a, 0xcd, 0x98, 0x41, 0x95, 0x55, + 0x9a, 0x3a, 0xb4, 0x3e, 0xac, 0x66, 0x4f, 0x3c, 0x3d, 0x67, 0xd0, 0x7a, + 0xc0, 0xca, 0x3c, 0x33, 0xa3, 0x8f, 0x4b, 0x3d, 0xbb, 0xc8, 0x64, 0x73, + 0xab, 0x38, 0x6d, 0x4d, 0xa7, 0xc0, 0x8f, 0x88, 0xbe, 0xbc, 0x42, 0x89, + 0x93, 0x40, 0xc1, 0xc5, 0x5b, 0xc2, 0x9b, 0xa0, 0x6e, 0x41, 0xc2, 0xa1, + 0xaf, 0x73, 0xd7, 0x2f, 0x7f, 0xa0, 0x85, 0x52, 0x66, 0x98, 0xcb, 0x9d, + 0x6b, 0x8b, 0xbf, 0x64, 0x79, 0xc3, 0x31, 0x8b, 0x42, 0x47, 0xa6, 0x42, + 0x5f, 0x99, 0x54, 0x4f, 0xbd, 0x56, 0x39, 0x75, 0x65, 0xa4, 0xd2, 0x96, + 0x68, 0x84, 0x6d, 0x39, 0x58, 0x4f, 0x79, 0x49, 0x70, 0x7f, 0xad, 0xc0, + 0x3d, 0x83, 0xd0, 0xb6, 0xaf, 0xb8, 0xd3, 0xa3, 0xa4, 0x39, 0x54, 0x43, + 0x5e, 0xcd, 0x37, 0x58, 0x93, 0x80, 0x96, 0x45, 0xb9, 0x64, 0x8b, 0xcc, + 0x85, 0xca, 0x91, 0xaa, 0x6b, 0xbf, 0xdd, 0x40, 0x3f, 0x80, 0x72, 0xb4, + 0xb6, 0x3b, 0x9a, 0xc3, 0x44, 0x96, 0x97, 0xa1, 0x6d, 0x48, 0xbb, 0x41, + 0xcb, 0xc3, 0x7e, 0x71, 0x6e, 0xc9, 0xcc, 0xd7, 0x7a, 0xa5, 0x48, 0x67, + 0xbf, 0xd5, 0x4d, 0xa6, 0xae, 0xa9, 0x5c, 0xc8, 0xcd, 0x5e, 0x97, 0x8f, + 0x9e, 0x91, 0x43, 0x8e, 0xab, 0x69, 0x9e, 0xab, 0x60, 0x4d, 0x78, 0x75, + 0xa1, 0xb3, 0xb0, 0x65, 0xa3, 0x39, 0x84, 0x55, 0xa4, 0xa1, 0x60, 0x5a, + 0x93, 0xa0, 0xb3, 0x64, 0xc9, 0x5c, 0x3b, 0x77, 0x81, 0x96, 0x93, 0x9f, + 0x43, 0x3d, 0x6a, 0x8e, 0x46, 0x54, 0x8e, 0x7a, 0x52, 0x84, 0x51, 0x79, + 0x91, 0xc1, 0x86, 0xa7, 0x4c, 0x92, 0x78, 0x82, 0x66, 0x6b, 0xa3, 0x62, + 0xbf, 0x49, 0x76, 0x7e, 0x38, 0xc8, 0x77, 0x49, 0xbb, 0x9e, 0x58, 0xa5, + 0x84, 0xd4, 0x9f, 0x34, 0xac, 0x9b, 0x3d, 0xad, 0xd2, 0xc9, 0x3f, 0x64, + 0x2b, 0xbc, 0x4c, 0x4e, 0x9e, 0x35, 0x9d, 0x50, 0xd5, 0xda, 0x64, 0x97, + 0x8a, 0x5c, 0x66, 0xdb, 0x54, 0x54, 0x83, 0x64, 0x50, 0x74, 0xa3, 0x3e, + 
0x72, 0xbd, 0x80, 0xb6, 0x70, 0xb8, 0x59, 0x63, 0x3a, 0xb1, 0x79, 0x71, + 0x85, 0xc3, 0x70, 0xcb, 0x39, 0xc9, 0x3c, 0xad, 0xb6, 0x73, 0x91, 0xcf, + 0x95, 0x41, 0x38, 0x45, 0x44, 0xb7, 0x61, 0xbd, 0x73, 0xc9, 0xc8, 0x54, + 0x9a, 0x4c, 0xb6, 0x41, 0xc7, 0x39, 0x31, 0xca, 0xa7, 0x65, 0x72, 0xab, + 0x58, 0xaf, 0xb4, 0xa8, 0x78, 0x60, 0xb9, 0x85, 0x53, 0x6b, 0x6b, 0x49, + 0x85, 0xa7, 0x5e, 0xaf, 0x3a, 0x36, 0x2f, 0xc7, 0x7f, 0x9a, 0x59, 0x8a, + 0x3b, 0x8d, 0x47, 0x8e, 0x96, 0x30, 0x9e, 0x94, 0x55, 0x3f, 0x65, 0xb9, + 0xae, 0x42, 0x49, 0x89, 0xa9, 0x79, 0x47, 0x7a, 0x90, 0x7b, 0x54, 0x78, + 0xa5, 0x3b, 0xd5, 0xbe, 0x44, 0x3e, 0x41, 0x9b, 0xac, 0xc5, 0x6b, 0x97, + 0x88, 0xc8, 0x94, 0xaf, 0x38, 0x51, 0x68, 0xa7, 0x32, 0x6a, 0xc1, 0x34, + 0xca, 0x6c, 0xc6, 0x8b, 0xb5, 0x46, 0x58, 0x65, 0xd1, 0x4d, 0xaf, 0x86, + 0xaa, 0x9e, 0x43, 0x8d, 0x6b, 0x75, 0xc8, 0x59, 0xb5, 0x9c, 0xc2, 0x5c, + 0x83, 0xc2, 0x6e, 0xcc, 0x61, 0x61, 0x58, 0x93, 0x59, 0x55, 0x3d, 0x56, + 0x6a, 0x89, 0x50, 0x62, 0x64, 0x77, 0x8e, 0xdb, 0x76, 0x71, 0xab, 0x76, + 0x60, 0xc3, 0x93, 0xbd, 0x44, 0x92, 0x80, 0xcb, 0x8a, 0x5e, 0x7f, 0x73, + 0xa0, 0x7a, 0x3b, 0xb2, 0x87, 0x67, 0xbb, 0x7a, 0x37, 0x43, 0x3c, 0x66, + 0xac, 0xba, 0x87, 0xc8, 0xba, 0xc9, 0xbc, 0x73, 0xa4, 0x8b, 0x52, 0xc3, + 0xc3, 0xba, 0xcc, 0xc3, 0x6c, 0xad, 0xa8, 0xd5, 0x82, 0x64, 0x9a, 0x3a, + 0xc9, 0x7a, 0x6f, 0x81, 0xa8, 0xa0, 0xd4, 0x37, 0x49, 0x8b, 0x5a, 0x53, + 0x49, 0x77, 0x7d, 0x89, 0xda, 0x71, 0xcb, 0x50, 0x5d, 0x62, 0x68, 0x42, + 0x8a, 0xbb, 0xaf, 0x4d, 0x73, 0x42, 0x58, 0x6e, 0x49, 0x37, 0x89, 0xb0, + 0x7d, 0xbc, 0x7a, 0x6c, 0x9f, 0x57, 0x86, 0x7c, 0xa2, 0x46, 0x4e, 0xa4, + 0xa7, 0xd1, 0xb2, 0xcd, 0x82, 0x88, 0x7d, 0xc4, 0x77, 0xc5, 0xcd, 0xbc, + 0x6f, 0x56, 0xcc, 0xb6, 0x98, 0x9c, 0x7f, 0x6d, 0x4c, 0x79, 0xdd, 0x42, + 0x6b, 0x81, 0xbc, 0xbc, 0x93, 0xb1, 0x7f, 0x79, 0xd6, 0x5a, 0x95, 0x41, + 0x77, 0xac, 0x99, 0xc0, 0xb0, 0x55, 0xa9, 0x67, 0xc2, 0x66, 0x90, 0x56, + 0x8d, 0x47, 0x99, 0x3d, 0x37, 0x47, 0x4b, 0x3f, 0x46, 0x5b, 0x66, 0xbf, + 0xbc, 0xc9, 0xb2, 0x90, 0xcc, 0x5c, 0xb5, 0x75, 0x8f, 0x8c, 0xd2, 0x78, + 0xaa, 0x76, 0xbb, 0x66, 0x42, 0x64, 0x53, 0x7d, 0xb3, 0x6a, 0x58, 0x82, + 0x37, 0x84, 0x81, 0xc8, 0x69, 0x86, 0x4e, 0x88, 0x2e, 0x3c, 0x4d, 0x61, + 0x50, 0x5c, 0x9d, 0x9c, 0x5b, 0xb4, 0x36, 0x5e, 0xb0, 0x37, 0x45, 0x90, + 0x3a, 0x56, 0xb1, 0x50, 0x70, 0x69, 0x84, 0x48, 0xb6, 0xba, 0x53, 0x9b, + 0x57, 0xb5, 0x38, 0x6f, 0xc6, 0x80, 0x80, 0x37, 0x56, 0x79, 0x54, 0xd6, + 0x75, 0xb1, 0x65, 0x7c, 0x8e, 0xa6, 0x31, 0x8e, 0xc3, 0x3b, 0xa3, 0x80, + 0x5e, 0x67, 0xc9, 0xcb, 0x5d, 0x77, 0x67, 0x39, 0x66, 0x57, 0x90, 0xac, + 0x4b, 0xc4, 0x3a, 0xb0, 0x9e, 0x79, 0x64, 0x57, 0xc5, 0xe0, 0x61, 0x64, + 0xa7, 0x64, 0x79, 0xb7, 0xbe, 0xca, 0x49, 0xcf, 0x45, 0x48, 0x5a, 0x96, + 0x9a, 0xc0, 0x63, 0x61, 0x7e, 0x64, 0x55, 0x3a, 0x3a, 0x67, 0xc2, 0x92, + 0x43, 0xd1, 0x65, 0x80, 0x5a, 0xb8, 0x83, 0x3c, 0x60, 0x3c, 0xcf, 0x5a, + 0x8f, 0xa6, 0x9e, 0x4d, 0xcf, 0xd4, 0x7a, 0x56, 0x48, 0x32, 0x9f, 0x62, + 0x51, 0x68, 0xc7, 0xb0, 0x72, 0xba, 0x87, 0x36, 0x74, 0x4b, 0x5e, 0x40, + 0x42, 0x60, 0x7e, 0xb0, 0xb4, 0xd5, 0xcb, 0xba, 0x50, 0x60, 0xa9, 0x43, + 0xce, 0x70, 0x69, 0x65, 0xc2, 0x6d, 0x90, 0x5c, 0x3b, 0xbe, 0xbf, 0xb2, + 0x37, 0xb1, 0x67, 0xae, 0x9b, 0x9d, 0x3d, 0xb3, 0x9f, 0xd8, 0x42, 0x53, + 0xa6, 0xbe, 0xbe, 0x61, 0x6d, 0x31, 0x76, 0xcf, 0xd6, 0x55, 0xa3, 0xc0, + 0x54, 0x8d, 0xad, 0x3b, 0x76, 0x69, 0x86, 0x76, 0xa9, 0xc9, 0x51, 0x68, + 0xc0, 0xbe, 0x5d, 0x4b, 0x9b, 0x55, 0x80, 0x38, 0xbc, 0x49, 0xb1, 0xd7, + 0x64, 0xa2, 0xb1, 0x9d, 0xa1, 0xd1, 0x49, 0xd1, 0xb1, 0x7b, 0x41, 0x59, + 
0xc3, 0xa6, 0x40, 0xc6, 0x7a, 0x3d, 0xac, 0x7a, 0x55, 0xa7, 0x67, 0x9f, + 0xd7, 0x74, 0x33, 0xaf, 0x97, 0xbf, 0x59, 0x83, 0x64, 0xc3, 0x44, 0xab, + 0x8a, 0x48, 0x73, 0x68, 0x56, 0x6e, 0x94, 0xc5, 0xba, 0x9e, 0x81, 0x93, + 0x49, 0x97, 0x75, 0xa3, 0x5f, 0x83, 0x56, 0x7a, 0xc7, 0xa2, 0x48, 0x86, + 0x4d, 0x6a, 0xae, 0x9c, 0xa3, 0xa1, 0x3e, 0x86, 0x88, 0x56, 0x4b, 0x8e, + 0x7c, 0xb7, 0x91, 0xb9, 0x4c, 0x98, 0x9e, 0xa5, 0x7a, 0x73, 0xcf, 0x6e, + 0x83, 0xb3, 0x5f, 0x46, 0x91, 0x98, 0xaa, 0xa5, 0x76, 0x79, 0x77, 0xa8, + 0xc7, 0x36, 0xa8, 0x7c, 0xb9, 0x63, 0xa6, 0x91, 0xbb, 0xb0, 0xc2, 0x59, + 0xce, 0xbb, 0xbb, 0x9f, 0x8b, 0x7f, 0xab, 0x8c, 0xb9, 0xa4, 0x43, 0x98, + 0xa3, 0x5e, 0x39, 0x53, 0xb1, 0x40, 0x6b, 0xa8, 0xbe, 0xad, 0xca, 0x4f, + 0x33, 0xb2, 0x76, 0xc9, 0x57, 0xa2, 0xa4, 0xcb, 0x36, 0x9a, 0xa6, 0x5a, + 0xbf, 0x8b, 0x8a, 0xac, 0xad, 0x37, 0x64, 0xbd, 0x4c, 0x74, 0x92, 0x35, + 0xc8, 0x4b, 0xba, 0xc2, 0x66, 0xb1, 0x89, 0x38, 0x8d, 0x65, 0x7b, 0x83, + 0x97, 0xcb, 0x5e, 0x61, 0x43, 0x3f, 0xaf, 0x5d, 0x84, 0x5a, 0x68, 0xaa, + 0xb2, 0xb5, 0x4d, 0xc6, 0x41, 0x9e, 0x5b, 0xb8, 0x48, 0x8c, 0x64, 0xa5, + 0x60, 0xa5, 0xbd, 0x52, 0x6f, 0x3d, 0xa6, 0x83, 0x60, 0x4a, 0xa2, 0x71, + 0x71, 0x4a, 0x78, 0xd3, 0x5a, 0x68, 0x3a, 0xc3, 0x51, 0x9d, 0x8b, 0xca, + 0x4e, 0x6e, 0x8a, 0xc5, 0x66, 0x68, 0x4c, 0x93, 0x69, 0x6c, 0x59, 0x5b, + 0x39, 0x90, 0xc4, 0x98, 0xa1, 0xc8, 0xc5, 0xd1, 0x4a, 0x42, 0xab, 0x74, + 0xce, 0x51, 0x40, 0xbf, 0x81, 0xb8, 0x64, 0x6e, 0x73, 0x32, 0x8a, 0x42, + 0x9f, 0xa4, 0x71, 0x58, 0xc8, 0x5d, 0x9b, 0x34, 0x91, 0x6f, 0x6c, 0x74, + 0xac, 0xc5, 0x8c, 0x2f, 0x33, 0x44, 0x6c, 0x43, 0xaa, 0xa2, 0x80, 0xb0, + 0x50, 0xa4, 0x8a, 0xd6, 0x86, 0x6f, 0xbc, 0x71, 0x9c, 0x65, 0x4b, 0x6b, + 0x64, 0xb3, 0xa8, 0xa5, 0x55, 0x84, 0x7e, 0x71, 0xaf, 0x98, 0x64, 0x55, + 0x98, 0x53, 0x8d, 0x4c, 0x8d, 0xa7, 0xce, 0x4b, 0x92, 0xaf, 0x8b, 0x34, + 0xc0, 0x4d, 0xb5, 0x6b, 0x5f, 0xd1, 0xd3, 0x59, 0x68, 0x46, 0x7b, 0x83, + 0x81, 0x5b, 0xa6, 0x2e, 0x40, 0x90, 0xa9, 0x4a, 0x35, 0x6a, 0xc0, 0x65, + 0xcf, 0xa6, 0x87, 0x4e, 0x36, 0x72, 0x88, 0x56, 0x3d, 0x55, 0xb5, 0x6c, + 0xac, 0x78, 0x8d, 0xc6, 0x6f, 0x7a, 0x78, 0x71, 0x60, 0xaa, 0xc5, 0x7f, + 0x33, 0x63, 0xc1, 0x6f, 0x4c, 0x77, 0x30, 0x9f, 0x4e, 0xb8, 0x76, 0x54, + 0xc8, 0x57, 0xc2, 0x38, 0xb6, 0x88, 0x83, 0x7d, 0xc4, 0x9a, 0xaa, 0x39, + 0x76, 0xc6, 0x8f, 0x6b, 0x6b, 0xd4, 0x3c, 0x77, 0xbb, 0x68, 0xb6, 0xbe, + 0xb2, 0x56, 0x8c, 0x7f, 0xa3, 0x31, 0x52, 0x3e, 0x88, 0x98, 0x6d, 0xc7, + 0x36, 0xbf, 0x7a, 0x49, 0x8f, 0x63, 0x70, 0xb0, 0x7c, 0xca, 0x4a, 0xa0, + 0xd1, 0x72, 0x54, 0x62, 0x91, 0x93, 0x36, 0xc4, 0x95, 0x31, 0x55, 0x6d, + 0x8f, 0x96, 0x59, 0xb9, 0x67, 0x91, 0x76, 0x43, 0x65, 0x73, 0xc8, 0xa8, + 0x6d, 0x3c, 0xa4, 0x77, 0xd2, 0x87, 0x7c, 0x50, 0x3a, 0x5f, 0x5f, 0xcf, + 0x34, 0xd3, 0x83, 0xb9, 0xaa, 0x7b, 0x64, 0x8a, 0x4e, 0x6b, 0x35, 0x54, + 0x84, 0xc8, 0x8f, 0xd1, 0x43, 0xa4, 0xd1, 0x70, 0xaa, 0xb4, 0x70, 0x45, + 0x85, 0xce, 0x5e, 0x9c, 0x86, 0x64, 0x72, 0xc2, 0x77, 0x59, 0x41, 0xbf, + 0x76, 0xb7, 0xc1, 0xc7, 0x82, 0xaf, 0x6b, 0x3b, 0xd0, 0x6e, 0x39, 0x5f, + 0xb7, 0xc2, 0xb8, 0x53, 0xa3, 0x8b, 0x86, 0x3c, 0xb5, 0x3e, 0x49, 0x5e, + 0x4f, 0xb1, 0x31, 0x9c, 0x77, 0x36, 0xad, 0x74, 0x43, 0x9b, 0xc1, 0x88, + 0x38, 0x59, 0x36, 0x81, 0x45, 0x38, 0x7c, 0x4d, 0x3a, 0x38, 0x92, 0x31, + 0x6f, 0x6f, 0x31, 0x9a, 0x45, 0xd4, 0xac, 0xae, 0x9c, 0x87, 0x86, 0x48, + 0x75, 0x87, 0x44, 0x60, 0x52, 0x55, 0xc9, 0x36, 0xba, 0x46, 0xca, 0x36, + 0x55, 0x53, 0x71, 0x39, 0x89, 0xbf, 0xb9, 0x47, 0xce, 0xbc, 0x97, 0xa4, + 0x45, 0x39, 0x97, 0xc9, 0xc1, 0x4f, 0x56, 0x50, 0x94, 0x74, 0xb8, 0x50, + 
0xc1, 0xa9, 0x57, 0x60, 0x61, 0xa4, 0x88, 0xb1, 0x9c, 0x5a, 0xa6, 0xa7, + 0x31, 0x85, 0xb7, 0x61, 0x61, 0x79, 0x2d, 0x87, 0xc3, 0x58, 0x34, 0xc7, + 0x61, 0x70, 0x46, 0x7c, 0x47, 0xae, 0x3f, 0x53, 0x78, 0x61, 0x8c, 0x32, + 0x9e, 0x3d, 0x34, 0x6a, 0x57, 0xc9, 0x86, 0xbf, 0xc5, 0x4d, 0xad, 0xc6, + 0x3b, 0x79, 0xc0, 0xc6, 0x41, 0xbf, 0xa3, 0x4d, 0x9c, 0x81, 0x8f, 0x80, + 0xc6, 0x9a, 0x43, 0x3b, 0x8d, 0xc1, 0xb0, 0x3f, 0x58, 0xb2, 0xaa, 0x7f, + 0xae, 0xa5, 0x8b, 0xb2, 0x6e, 0x34, 0xbf, 0xbd, 0xa8, 0x41, 0xa4, 0x47, + 0xcf, 0xca, 0x8f, 0x3c, 0x59, 0x34, 0xcc, 0x56, 0xcc, 0x6a, 0x72, 0x6b, + 0xb5, 0xb4, 0x55, 0xbb, 0x94, 0x91, 0xb0, 0xb7, 0xaa, 0x71, 0x7f, 0x43, + 0x81, 0x48, 0xcd, 0x6a, 0x9a, 0xa7, 0x79, 0xc8, 0xa8, 0x6a, 0x52, 0x47, + 0xd0, 0xc3, 0x3a, 0xcd, 0x66, 0x33, 0xd5, 0x4a, 0xc3, 0x72, 0x4b, 0x59, + 0x3f, 0x5d, 0x6e, 0xaf, 0xab, 0x43, 0x86, 0x95, 0x95, 0x7a, 0xa6, 0x8f, + 0xbc, 0x54, 0x9c, 0x3f, 0x88, 0x70, 0xa1, 0x35, 0x40, 0xa4, 0xaf, 0x39, + 0x68, 0x69, 0x34, 0xd2, 0x4c, 0xa8, 0x79, 0xc8, 0xc9, 0x9a, 0x42, 0x3a, + 0xa2, 0x7d, 0x5e, 0x37, 0x54, 0x3b, 0xd1, 0xad, 0x8a, 0x87, 0x66, 0xa3, + 0x85, 0xcd, 0x62, 0x78, 0xb2, 0x5d, 0xb6, 0x7d, 0xb8, 0x69, 0x85, 0xae, + 0xb5, 0x4b, 0xb4, 0x73, 0xcd, 0xae, 0x93, 0x78, 0xbd, 0x5e, 0x9f, 0xc4, + 0x8a, 0xc4, 0x35, 0x96, 0xbc, 0x33, 0xc7, 0x6d, 0x9e, 0x56, 0x3a, 0xb1, + 0x48, 0x75, 0x4f, 0x3d, 0x36, 0xcb, 0xaf, 0x81, 0x3d, 0xc6, 0xd6, 0x56, + 0x4f, 0xd0, 0xaf, 0x3e, 0x83, 0x59, 0x84, 0xd4, 0x31, 0xca, 0xc3, 0xb6, + 0x92, 0x6c, 0xae, 0x56, 0x97, 0xa4, 0xd5, 0xaf, 0x8a, 0x4b, 0x6c, 0x7d, + 0x7f, 0x95, 0x85, 0x98, 0x5a, 0x98, 0x8d, 0xbb, 0x8e, 0x5c, 0x4e, 0x4d, + 0x80, 0x62, 0x9f, 0x5f, 0x7a, 0x84, 0xae, 0x77, 0xb0, 0x5f, 0xc2, 0x5c, + 0x46, 0x3a, 0x33, 0xb9, 0x8f, 0x4a, 0x61, 0x94, 0xc4, 0x56, 0x7e, 0x97, + 0xba, 0x53, 0x77, 0x48, 0xcb, 0x9c, 0xb8, 0xa9, 0xbc, 0xa9, 0xb0, 0x41, + 0xac, 0x6a, 0x7e, 0x3a, 0xb4, 0x87, 0x69, 0xab, 0x3f, 0xb7, 0x4b, 0x32, + 0x43, 0x49, 0x78, 0x5a, 0xd0, 0xa3, 0x39, 0x68, 0x57, 0xcb, 0x8f, 0xa7, + 0x75, 0x44, 0x3f, 0x51, 0x2f, 0xc0, 0x9e, 0x74, 0x34, 0x48, 0x5a, 0xae, + 0xb8, 0x83, 0x93, 0x8f, 0xb2, 0x3e, 0x3c, 0x55, 0x77, 0x45, 0x79, 0xa4, + 0x3e, 0x3f, 0x48, 0xa4, 0x50, 0xa9, 0x9e, 0xb2, 0xcf, 0x35, 0x3d, 0xd2, + 0x9a, 0x72, 0x59, 0xa9, 0x98, 0xc9, 0xc3, 0xac, 0xa7, 0x45, 0x84, 0xa4, + 0x94, 0x5f, 0x74, 0x63, 0x74, 0x59, 0x90, 0x6f, 0x8f, 0xb1, 0x51, 0xae, + 0x43, 0x34, 0x8f, 0x8b, 0x4b, 0x6d, 0x9e, 0x86, 0x72, 0x88, 0x52, 0x85, + 0x87, 0x63, 0xa5, 0xb0, 0x37, 0x96, 0x42, 0x7a, 0xb2, 0x7e, 0xcf, 0x7f, + 0xc7, 0x65, 0x43, 0xab, 0x92, 0x45, 0x4f, 0x62, 0xcc, 0x51, 0x60, 0x91, + 0x8b, 0x3f, 0x8c, 0x3a, 0xaa, 0x83, 0x3e, 0x9d, 0x47, 0x3b, 0xaa, 0x3a, + 0x3f, 0x40, 0x7e, 0xc2, 0x64, 0x3f, 0x8e, 0x67, 0x78, 0x53, 0x90, 0x4f, + 0x2c, 0x6f, 0xbc, 0x6b, 0xa1, 0x7d, 0x40, 0x4d, 0x90, 0x8b, 0x54, 0xbe, + 0xc7, 0x6b, 0xc1, 0x49, 0x38, 0xa4, 0x4a, 0x9c, 0x41, 0x77, 0x3f, 0x95, + 0x9a, 0x86, 0x58, 0x34, 0xb2, 0xc9, 0x7a, 0xbc, 0x86, 0x49, 0x58, 0xa4, + 0x3c, 0x8c, 0x46, 0xc9, 0xa0, 0xa9, 0x34, 0x8b, 0xd4, 0x43, 0x48, 0x58, + 0x9d, 0x80, 0xc8, 0x5b, 0x45, 0x4d, 0xc2, 0x7b, 0x94, 0xca, 0x6a, 0x9b, + 0x4e, 0x64, 0x9d, 0xa9, 0x80, 0xb8, 0x78, 0x9c, 0xcb, 0x5d, 0x9b, 0xaa, + 0xa7, 0x67, 0x86, 0x41, 0x44, 0xa2, 0x39, 0xd2, 0x71, 0x70, 0xc1, 0xad, + 0xcd, 0x69, 0xb2, 0x4d, 0x79, 0xa7, 0xca, 0x61, 0x53, 0xb6, 0x8a, 0x6a, + 0x98, 0x59, 0xc1, 0x41, 0x4d, 0xb9, 0x69, 0xae, 0xa6, 0x47, 0x8a, 0x6b, + 0x85, 0xa5, 0x45, 0x57, 0x3e, 0x3d, 0xc8, 0x76, 0x81, 0x42, 0x49, 0x8a, + 0x8d, 0x32, 0x62, 0x5d, 0x93, 0xcc, 0x36, 0xc0, 0xa9, 0xb7, 0x30, 0x8f, + 
0xd3, 0x70, 0x62, 0x81, 0xbc, 0xad, 0x61, 0x4d, 0x4e, 0x8f, 0x3d, 0x39, + 0x40, 0xcf, 0xbe, 0x7f, 0xd5, 0xa2, 0x72, 0x6c, 0x6a, 0x98, 0x7c, 0xc8, + 0xa1, 0x98, 0x46, 0x9a, 0x8f, 0x44, 0xa2, 0x4f, 0x95, 0x3c, 0x86, 0x54, + 0x39, 0x3c, 0x63, 0x46, 0x9b, 0x34, 0x49, 0x80, 0xbc, 0x55, 0xbb, 0xa4, + 0x61, 0x64, 0x8e, 0x72, 0x35, 0x56, 0x7d, 0x70, 0x7f, 0xb1, 0x6a, 0x6f, + 0x90, 0x94, 0x5e, 0x40, 0x7d, 0x5d, 0x91, 0xbf, 0x85, 0x79, 0x9b, 0xc2, + 0x97, 0x6e, 0x6a, 0x4e, 0x88, 0x80, 0xbf, 0x86, 0x79, 0x48, 0xb9, 0x90, + 0xa6, 0x94, 0xae, 0x3e, 0x80, 0x8c, 0xbb, 0xbe, 0x94, 0x75, 0x7c, 0x72, + 0x47, 0x82, 0x36, 0x5f, 0x48, 0x49, 0x4c, 0xc4, 0x32, 0x98, 0x99, 0x74, + 0xc4, 0x39, 0xc7, 0xc5, 0x74, 0x49, 0x39, 0x40, 0x9e, 0x45, 0xac, 0xa0, + 0x80, 0xce, 0x46, 0x87, 0x68, 0xb8, 0xb1, 0x87, 0xa5, 0x41, 0x79, 0x40, + 0xc7, 0x45, 0x78, 0x9b, 0x58, 0x32, 0x93, 0x89, 0x68, 0x78, 0x42, 0x8f, + 0x87, 0x89, 0x32, 0x83, 0x91, 0xbf, 0x3a, 0x49, 0x86, 0x9a, 0x6d, 0x93, + 0x45, 0xad, 0xbd, 0xa2, 0x58, 0xa8, 0x97, 0xc1, 0x95, 0x49, 0x6c, 0x58, + 0xb3, 0x7b, 0x57, 0x60, 0x36, 0xbc, 0x6f, 0x47, 0xc7, 0xc7, 0x6b, 0x7c, + 0x8e, 0x80, 0xd0, 0x6e, 0x72, 0xa4, 0x3c, 0x54, 0x61, 0x7f, 0x89, 0xa9, + 0x65, 0x32, 0x41, 0x55, 0xc0, 0x42, 0x96, 0x46, 0x9e, 0xc0, 0x98, 0x46, + 0xb4, 0xc6, 0x3d, 0x7b, 0x49, 0xc2, 0x60, 0x4d, 0x62, 0xc2, 0xb6, 0xb4, + 0x3b, 0xaf, 0x3a, 0xae, 0x60, 0xb1, 0xca, 0xb2, 0x78, 0xba, 0x5a, 0x8c, + 0x6f, 0x7a, 0xa7, 0x97, 0x8b, 0xbd, 0x86, 0x85, 0x43, 0x58, 0x97, 0xa4, + 0xcb, 0xc0, 0xaf, 0x70, 0x41, 0x50, 0x83, 0xbc, 0xb8, 0x6e, 0x33, 0x98, + 0x33, 0x7d, 0x73, 0xa2, 0xa7, 0x84, 0x4b, 0xcd, 0x96, 0xc7, 0xa2, 0xcc, + 0x82, 0xc6, 0x3a, 0xd1, 0x32, 0x2f, 0x54, 0x4a, 0x38, 0x40, 0xbd, 0x87, + 0x60, 0x8c, 0x59, 0x87, 0x34, 0x78, 0x86, 0x8a, 0x84, 0x91, 0x9f, 0x6c, + 0xa9, 0x3e, 0xa6, 0xce, 0x3f, 0x71, 0xb5, 0xc9, 0x71, 0x83, 0x75, 0x8c, + 0xd3, 0x4b, 0x5f, 0x31, 0x78, 0xc9, 0x88, 0x54, 0xcf, 0xb7, 0x89, 0x85, + 0x48, 0x4b, 0xb9, 0x39, 0x61, 0x3f, 0x37, 0x4b, 0xbd, 0x9e, 0xcf, 0xaa, + 0x90, 0x57, 0x96, 0xb0, 0x32, 0xce, 0x56, 0xa4, 0xb2, 0xb0, 0x59, 0x5c, + 0x52, 0xc1, 0x7d, 0x61, 0x6e, 0x77, 0x90, 0xb2, 0x5a, 0x4f, 0x4d, 0x9d, + 0x78, 0x7a, 0x43, 0xb2, 0x82, 0xa3, 0x42, 0x50, 0x8b, 0x8c, 0x76, 0x91, + 0x9b, 0x62, 0x3e, 0xa2, 0x88, 0x94, 0xa4, 0x9f, 0xb6, 0xb8, 0xb3, 0x4a, + 0x57, 0x65, 0xbb, 0xc4, 0x56, 0xb5, 0x9f, 0x98, 0x70, 0x9a, 0x6a, 0xa1, + 0x7b, 0xaa, 0x3a, 0xba, 0x2e, 0xcf, 0xb3, 0x7d, 0xc0, 0x64, 0x73, 0xbe, + 0xa8, 0x3e, 0xc7, 0x3c, 0xd2, 0x87, 0x35, 0xa9, 0x90, 0x88, 0xb0, 0x58, + 0x79, 0x60, 0x6a, 0x87, 0x89, 0xbf, 0xc4, 0x82, 0xcc, 0x3e, 0x72, 0x9c, + 0xc7, 0xc5, 0x61, 0x3c, 0xc1, 0x8d, 0x93, 0x64, 0xb6, 0xb2, 0x7f, 0x54, + 0x98, 0xa9, 0x85, 0x68, 0xab, 0xa8, 0x71, 0x5e, 0x71, 0x3c, 0x6b, 0x98, + 0xbc, 0xb8, 0xa7, 0xd2, 0xb2, 0xcb, 0x4f, 0x99, 0x37, 0x3f, 0xb7, 0xb9, + 0x57, 0x72, 0x60, 0x46, 0x41, 0x94, 0xcd, 0x49, 0x8a, 0xc7, 0xa4, 0x3a, + 0x7f, 0x95, 0x77, 0x9f, 0x74, 0xa2, 0x9c, 0x5e, 0x3f, 0x55, 0x49, 0x84, + 0x3f, 0xc3, 0x79, 0x3e, 0xd5, 0xd4, 0x64, 0x76, 0x86, 0x9f, 0xc1, 0xa8, + 0x67, 0x9d, 0xa2, 0xb4, 0xd5, 0xa9, 0xab, 0x4c, 0x3f, 0xc9, 0x38, 0x99, + 0xbb, 0xd0, 0x46, 0xa5, 0x51, 0xb9, 0xcd, 0xa6, 0x8d, 0x63, 0x60, 0x9d, + 0x3a, 0x9a, 0x36, 0x8b, 0x35, 0xcd, 0xb4, 0xca, 0x6b, 0xc7, 0x86, 0x7b, + 0x95, 0xbc, 0xd3, 0xb8, 0xd3, 0xbe, 0xa6, 0x82, 0x3f, 0x84, 0x2e, 0xc2, + 0x9c, 0xb3, 0x51, 0xcb, 0x6c, 0x51, 0xc5, 0x41, 0x74, 0x61, 0x43, 0xbf, + 0xb4, 0x3c, 0xbd, 0xb8, 0x63, 0xcd, 0x6d, 0x3d, 0x8e, 0xb0, 0x7a, 0x44, + 0x81, 0x95, 0x8a, 0xa7, 0x42, 0x5d, 0x41, 0xc8, 0x3b, 0x83, 0x96, 0x4e, + 
0xd1, 0x6f, 0x8e, 0x8e, 0xcc, 0x61, 0xc9, 0xd2, 0x5d, 0x8f, 0x5a, 0x7a, + 0x65, 0x48, 0x8a, 0xac, 0x50, 0xc2, 0x3d, 0x92, 0xcf, 0xd3, 0x9e, 0x3c, + 0xd2, 0xb6, 0x8f, 0x66, 0x68, 0x40, 0xd0, 0x6f, 0x84, 0x5c, 0x8a, 0x98, + 0x9c, 0xd4, 0x38, 0x45, 0x38, 0x59, 0x40, 0x9e, 0x64, 0x96, 0x58, 0xce, + 0xc9, 0x3f, 0x3b, 0xa0, 0xb4, 0x46, 0x9e, 0x83, 0xc7, 0x6a, 0x87, 0xc9, + 0x62, 0x48, 0x9a, 0xa5, 0xa1, 0x78, 0x46, 0xad, 0x5a, 0x7b, 0x78, 0x50, + 0xbe, 0x70, 0x71, 0x84, 0x53, 0xc7, 0x76, 0xd3, 0x4f, 0x71, 0xdd, 0x45, + 0xb6, 0x74, 0x51, 0x98, 0xad, 0x8d, 0x5b, 0x87, 0xcb, 0xc3, 0x4b, 0x98, + 0x48, 0x62, 0x62, 0xc3, 0x62, 0x6d, 0xb5, 0xa4, 0x66, 0xb0, 0xad, 0xb7, + 0x50, 0x51, 0x7a, 0x94, 0xba, 0xb4, 0x6b, 0x45, 0x80, 0x52, 0x4c, 0x77, + 0x4e, 0x93, 0x56, 0x8d, 0xbb, 0x84, 0x5e, 0x75, 0x9a, 0xaa, 0x71, 0x7f, + 0xd0, 0x7c, 0x7f, 0x8d, 0xa6, 0x74, 0x68, 0x6a, 0xba, 0xbd, 0x8c, 0xd4, + 0xb7, 0x8d, 0x72, 0x40, 0x5e, 0x98, 0x73, 0x3d, 0x45, 0x6a, 0x4f, 0xbc, + 0xc5, 0x54, 0xa9, 0x7a, 0xca, 0x6e, 0x65, 0xaf, 0xb4, 0x3b, 0x4c, 0x66, + 0xa0, 0x56, 0x69, 0x3d, 0xc3, 0x63, 0x3d, 0x88, 0xa3, 0x8d, 0x88, 0xcd, + 0x85, 0xb4, 0x8f, 0x6c, 0xbf, 0xa0, 0x35, 0x61, 0x37, 0xa0, 0x5a, 0x37, + 0x47, 0x9c, 0xa1, 0x7b, 0x43, 0xba, 0x30, 0x38, 0x89, 0x7e, 0x8a, 0xab, + 0x91, 0x5f, 0x8c, 0x49, 0xbe, 0xc6, 0xce, 0x8e, 0xbd, 0x51, 0xaf, 0x6b, + 0x60, 0xb7, 0x52, 0xa0, 0x76, 0x9c, 0x91, 0x65, 0xb7, 0x58, 0xc2, 0x36, + 0x99, 0xc0, 0x7a, 0xb5, 0xc4, 0x44, 0x52, 0x51, 0xd7, 0x86, 0x7f, 0x7b, + 0xd4, 0xc4, 0xbb, 0x5f, 0xb7, 0x6e, 0xdc, 0x7d, 0xc1, 0x5e, 0x54, 0x5f, + 0xd2, 0xa1, 0x98, 0xc0, 0xad, 0x70, 0x3f, 0xae, 0x34, 0xb2, 0x66, 0x5c, + 0xcf, 0x9a, 0x7a, 0xa2, 0x57, 0xc2, 0x5a, 0xb7, 0x6f, 0x6c, 0x61, 0xb9, + 0x79, 0x68, 0x32, 0x31, 0x9c, 0x8f, 0x7a, 0x6e, 0xa8, 0xb9, 0x5a, 0x75, + 0x65, 0xa9, 0x80, 0x69, 0x7e, 0x3f, 0xd5, 0x83, 0xb2, 0x78, 0x9a, 0x85, + 0x58, 0xc5, 0x88, 0x33, 0x60, 0x91, 0x7d, 0x5c, 0x92, 0x5a, 0x50, 0x5d, + 0x66, 0x39, 0xc3, 0x60, 0x66, 0x8b, 0xb8, 0x4c, 0x58, 0x67, 0x7d, 0x56, + 0x9b, 0x6b, 0x57, 0xcd, 0x58, 0x76, 0x67, 0x9b, 0x33, 0x43, 0xd9, 0x85, + 0x71, 0x36, 0x9b, 0x9a, 0xc5, 0x44, 0xb5, 0xbb, 0x45, 0x44, 0x38, 0x49, + 0x50, 0x9c, 0x64, 0xb4, 0x86, 0x91, 0x86, 0x5c, 0x5e, 0x9c, 0x6d, 0x7d, + 0x62, 0xba, 0xcd, 0x57, 0xcc, 0xc2, 0xb6, 0x8e, 0x3f, 0x29, 0xd8, 0x7e, + 0x5f, 0xaf, 0xb5, 0x79, 0xa1, 0x3b, 0xa3, 0xcd, 0xb1, 0x7d, 0x8c, 0xc3, + 0x61, 0xba, 0x60, 0xdb, 0x3c, 0x4c, 0xa1, 0xb8, 0x9b, 0xd0, 0xa5, 0x96, + 0x8a, 0x59, 0x61, 0xba, 0x54, 0xa0, 0x90, 0x91, 0xb1, 0xc6, 0x8f, 0x60, + 0x3f, 0x3d, 0x74, 0x51, 0xa2, 0x36, 0x3f, 0xbd, 0x4e, 0x7f, 0x4d, 0x51, + 0xcc, 0xa8, 0xb9, 0x57, 0x75, 0x51, 0x94, 0x89, 0x85, 0x33, 0xce, 0x87, + 0xb0, 0x9c, 0x31, 0xc8, 0xb5, 0x8e, 0x5c, 0x60, 0x4f, 0x7d, 0x8e, 0x8d, + 0xb2, 0x8b, 0x54, 0x9c, 0x83, 0xba, 0xd0, 0x79, 0x60, 0x43, 0x87, 0xb2, + 0x5e, 0xb7, 0x56, 0xa0, 0xcc, 0x77, 0xbf, 0x5a, 0x5f, 0x78, 0x78, 0xad, + 0x50, 0xb7, 0x56, 0x7c, 0x88, 0x42, 0x84, 0xb3, 0xa4, 0x84, 0xc7, 0x48, + 0xc5, 0x6c, 0x67, 0x45, 0x45, 0x58, 0x9b, 0x67, 0x78, 0x38, 0x34, 0xca, + 0xbd, 0xa0, 0x7c, 0x96, 0xd2, 0x31, 0x53, 0x89, 0x34, 0x85, 0x9c, 0x5a, + 0xb0, 0x65, 0x3f, 0x43, 0x3a, 0x46, 0x98, 0x5c, 0xb9, 0x8d, 0x3c, 0x93, + 0x5a, 0x88, 0x46, 0x88, 0x42, 0x53, 0x99, 0x30, 0x58, 0x52, 0x49, 0xc6, + 0x4a, 0xb0, 0xbe, 0xc5, 0x63, 0x2a, 0xc1, 0x70, 0x2e, 0x7d, 0xcb, 0x5a, + 0x3b, 0xba, 0xad, 0x32, 0x96, 0x69, 0x4d, 0x87, 0x64, 0xce, 0x90, 0xc9, + 0xa7, 0x37, 0x89, 0x71, 0xab, 0x92, 0xa1, 0x65, 0x5e, 0xb7, 0x9d, 0xa4, + 0xc4, 0x63, 0x32, 0x74, 0x8b, 0x7a, 0xae, 0x7d, 0x46, 0x50, 0x44, 0xbe, + 
0xc4, 0x6a, 0x62, 0x9a, 0x89, 0x8d, 0xb9, 0xca, 0xa5, 0xa8, 0x89, 0x5c, + 0xca, 0xcb, 0xa8, 0x40, 0x96, 0x62, 0xa7, 0x69, 0x63, 0xd6, 0x87, 0xa5, + 0xa2, 0xa9, 0x94, 0xc1, 0xcc, 0x41, 0xc8, 0xc2, 0x6f, 0x5b, 0x9f, 0x4b, + 0x67, 0xbc, 0x58, 0x3b, 0xbd, 0x46, 0xb4, 0xc0, 0x3a, 0xb3, 0x30, 0xca, + 0xcc, 0xb6, 0x53, 0x68, 0x38, 0xa2, 0xc9, 0x47, 0x66, 0x5f, 0x2e, 0x66, + 0xaf, 0xc9, 0x96, 0x91, 0xce, 0x6d, 0xbf, 0xa8, 0x51, 0xb3, 0x8c, 0x42, + 0x94, 0x3f, 0x69, 0xac, 0x97, 0xbd, 0xc3, 0x37, 0xc3, 0x6b, 0xac, 0x59, + 0x78, 0x62, 0x6f, 0xc5, 0xbf, 0xac, 0xa8, 0x61, 0xbe, 0x82, 0xb3, 0x50, + 0xc2, 0x5b, 0x8e, 0x51, 0x49, 0x3b, 0x96, 0x83, 0xc6, 0xaf, 0x8b, 0xa9, + 0xad, 0xce, 0xcf, 0x45, 0x47, 0x77, 0x8f, 0x50, 0xbb, 0xaf, 0x9a, 0xaa, + 0xae, 0x61, 0x8e, 0xd1, 0xc5, 0x5f, 0x51, 0x35, 0x8d, 0x60, 0x94, 0xa1, + 0xb6, 0xb2, 0x99, 0x43, 0x32, 0x79, 0x71, 0xcd, 0x3b, 0x51, 0x8a, 0x61, + 0xce, 0x67, 0x87, 0x47, 0x7a, 0x3f, 0xa3, 0x4a, 0x3d, 0xb5, 0x70, 0x84, + 0x7b, 0xb4, 0xc5, 0x39, 0x64, 0xb0, 0x5c, 0xa3, 0x9e, 0xa5, 0xad, 0xb4, + 0x60, 0x31, 0x5d, 0x80, 0xcb, 0x5f, 0x47, 0xd8, 0x34, 0xba, 0xd6, 0xab, + 0x3a, 0xaf, 0x2c, 0x9a, 0x4a, 0x3c, 0xc2, 0x34, 0x5a, 0xab, 0x72, 0xc0, + 0x6e, 0x4f, 0xb1, 0x8e, 0x7c, 0xbb, 0x4b, 0x68, 0x88, 0xb0, 0x9c, 0x70, + 0x96, 0x7e, 0xcb, 0xb6, 0x93, 0xb2, 0x4b, 0xa8, 0xa1, 0x9e, 0xb0, 0xc1, + 0x9b, 0x43, 0x46, 0xa2, 0x53, 0xde, 0xa4, 0xc8, 0x64, 0x84, 0x66, 0xa5, + 0x7f, 0x5e, 0xbf, 0x59, 0x69, 0x98, 0x3a, 0x62, 0x98, 0xbf, 0x51, 0xa0, + 0x3f, 0xaf, 0x66, 0xa0, 0x9c, 0xab, 0x69, 0xae, 0x96, 0x82, 0x76, 0xc3, + 0x79, 0xc3, 0x32, 0x94, 0xb8, 0xcd, 0xa0, 0x91, 0x5d, 0x80, 0x79, 0xce, + 0xae, 0xc7, 0xc1, 0x80, 0x7a, 0x9b, 0xab, 0xd1, 0x5f, 0xcf, 0x9d, 0xd6, + 0xb8, 0xbb, 0x82, 0x31, 0xd0, 0x2a, 0xc2, 0x7b, 0x20, 0x2b, 0x91, 0x6f, + 0x52, 0x99, 0xc0, 0xad, 0x2f, 0x65, 0x47, 0x96, 0xd4, 0x4b, 0xa9, 0x44, + 0x9d, 0x93, 0xc6, 0x8f, 0x8f, 0x7b, 0xb9, 0x94, 0xbd, 0xaf, 0x99, 0xbe, + 0x87, 0xc3, 0x4c, 0xd1, 0xc3, 0xd0, 0x50, 0x91, 0xc8, 0x3c, 0x4f, 0xbe, + 0x3c, 0x5b, 0x53, 0x42, 0xc3, 0x81, 0x54, 0x36, 0x45, 0x6d, 0x9d, 0xb7, + 0x3e, 0xa6, 0xb5, 0x56, 0x49, 0x76, 0x9a, 0x97, 0x68, 0x5d, 0x8c, 0x53, + 0x66, 0x70, 0x4e, 0xb2, 0xb2, 0x95, 0x8e, 0x65, 0xa7, 0x87, 0x8d, 0x46, + 0xbb, 0x37, 0xcf, 0xd0, 0xb8, 0x6c, 0xcb, 0xa2, 0xc4, 0x8c, 0x49, 0x38, + 0x70, 0x93, 0x6d, 0x39, 0x70, 0x83, 0xa4, 0x8f, 0xa2, 0xc3, 0x73, 0x37, + 0x35, 0x79, 0xbb, 0x3d, 0x68, 0xba, 0xc7, 0x46, 0xbb, 0xa8, 0x49, 0x6e, + 0x98, 0xac, 0x90, 0x93, 0x9e, 0x4b, 0x7b, 0x49, 0x50, 0xca, 0xb7, 0xd9, + 0x39, 0x3d, 0x6b, 0x6c, 0xbd, 0x60, 0x9f, 0x66, 0x8c, 0x70, 0x43, 0x67, + 0x44, 0x46, 0x77, 0x35, 0x50, 0xa9, 0xc9, 0x78, 0x76, 0xa6, 0xb8, 0xbe, + 0x36, 0x74, 0xca, 0x99, 0x4b, 0xbd, 0x28, 0x2a, 0x53, 0x2d, 0x65, 0x84, + 0x9d, 0x89, 0x47, 0x5c, 0xcb, 0x94, 0x9e, 0xdb, 0xc5, 0x9a, 0xd6, 0x9d, + 0x5a, 0x95, 0x89, 0xbc, 0x43, 0x9b, 0xc6, 0x7c, 0x5c, 0xca, 0xce, 0x35, + 0x93, 0x61, 0xc5, 0x63, 0x6c, 0x66, 0x5e, 0x52, 0x9e, 0x64, 0x3a, 0xa7, + 0x5b, 0x5e, 0xc1, 0x58, 0x99, 0x4f, 0x5e, 0xc0, 0x73, 0x8d, 0x63, 0x73, + 0x73, 0x2b, 0x6e, 0x9e, 0x69, 0x6f, 0x79, 0x57, 0xc8, 0x97, 0x3d, 0x4d, + 0x4f, 0x27, 0x57, 0xa2, 0xb6, 0xb2, 0x48, 0x4c, 0x80, 0x62, 0x82, 0x55, + 0xca, 0x36, 0xb2, 0xd3, 0x51, 0x85, 0x63, 0xa9, 0xb5, 0x7e, 0x6f, 0x95, + 0x91, 0xad, 0x48, 0x81, 0xb6, 0x5a, 0x9c, 0x73, 0xb7, 0xc8, 0x4d, 0x7b, + 0x8c, 0x2f, 0x8a, 0xa6, 0xb8, 0x8f, 0x35, 0xa7, 0x41, 0x45, 0xce, 0x87, + 0x8e, 0x5f, 0x89, 0x47, 0x68, 0xa8, 0xbe, 0x39, 0x53, 0xae, 0x5d, 0xd0, + 0x35, 0xa0, 0x87, 0x4c, 0x8b, 0x9d, 0x9a, 0x54, 0x61, 0xad, 0x59, 0xb9, + 
0xa9, 0x91, 0x91, 0xb2, 0x98, 0xbc, 0x6c, 0x8b, 0xa0, 0x3d, 0x86, 0xb9, + 0xbe, 0x8c, 0x87, 0x7b, 0xb6, 0x6c, 0x62, 0xab, 0xb1, 0x63, 0x4b, 0x7f, + 0x39, 0xa1, 0x81, 0xad, 0xb3, 0x35, 0xa2, 0xb2, 0x8a, 0x6f, 0x97, 0x57, + 0xc8, 0x5b, 0xcf, 0x98, 0xbf, 0xa8, 0xd7, 0xc0, 0xa5, 0x31, 0x59, 0x94, + 0xc7, 0xa0, 0x99, 0x88, 0x94, 0x9f, 0x83, 0x7c, 0x75, 0xa3, 0x87, 0x89, + 0x4b, 0x78, 0x7c, 0x78, 0xa0, 0x43, 0xc5, 0x4c, 0x54, 0x51, 0xb3, 0x93, + 0x68, 0x4c, 0xb3, 0x7e, 0xd3, 0xb0, 0x4f, 0x54, 0x7b, 0x79, 0x9f, 0x45, + 0x91, 0x66, 0x81, 0xba, 0xcd, 0x8a, 0x39, 0x73, 0xc7, 0xa4, 0x25, 0x4b, + 0x3a, 0x81, 0x4e, 0x67, 0x62, 0x3e, 0x9e, 0x44, 0x78, 0xc2, 0xc0, 0x82, + 0x7b, 0x86, 0x94, 0x80, 0x7e, 0x97, 0xd1, 0x4d, 0x43, 0x45, 0x74, 0x82, + 0x94, 0x7b, 0xa3, 0xc2, 0xa5, 0x42, 0x5d, 0xbe, 0xad, 0xca, 0x84, 0x36, + 0xa6, 0x8f, 0x82, 0xa0, 0x96, 0x5c, 0xce, 0xca, 0x77, 0x8a, 0x4d, 0xbb, + 0xb4, 0x94, 0x32, 0xb1, 0x6c, 0xc7, 0xba, 0x3d, 0x5b, 0x32, 0x75, 0x53, + 0x98, 0x41, 0x6a, 0xbf, 0x58, 0xa4, 0x6f, 0x74, 0x45, 0x8f, 0xba, 0x6b, + 0x7a, 0x4b, 0x8e, 0x44, 0x71, 0x7a, 0x59, 0xa6, 0xce, 0x6c, 0xb4, 0xa7, + 0x8d, 0xcc, 0x44, 0x99, 0xa0, 0x37, 0x32, 0x2c, 0x44, 0x86, 0xc7, 0x83, + 0x4d, 0x3f, 0x9d, 0xc0, 0x52, 0x57, 0x57, 0x77, 0x47, 0x32, 0x5b, 0xc6, + 0x58, 0xba, 0xb8, 0xc9, 0xa6, 0xbe, 0x8a, 0x5f, 0xa1, 0x42, 0x74, 0xc7, + 0x50, 0x6e, 0x58, 0x5f, 0x91, 0xa2, 0xa6, 0x49, 0xd4, 0x44, 0x57, 0xa1, + 0xc2, 0x69, 0xb3, 0x7b, 0xc4, 0xaa, 0x60, 0xb9, 0x67, 0x69, 0x44, 0x59, + 0x36, 0x75, 0x72, 0x7f, 0xc4, 0xb4, 0x7c, 0x6c, 0x90, 0x58, 0x6e, 0x50, + 0x69, 0xcc, 0x50, 0xcd, 0x4d, 0x8f, 0x67, 0x99, 0xae, 0xb5, 0x5b, 0x48, + 0x30, 0xca, 0x38, 0xc1, 0x6e, 0x3f, 0xbf, 0x60, 0x61, 0xc9, 0x9b, 0x43, + 0xad, 0xa6, 0x5f, 0x84, 0x6d, 0x4b, 0xce, 0x70, 0x75, 0xc1, 0x3d, 0x52, + 0x8a, 0x36, 0x73, 0xd1, 0xae, 0xa7, 0x93, 0x55, 0x95, 0x52, 0xc3, 0xa2, + 0xbb, 0x58, 0x98, 0x8d, 0x3a, 0x7f, 0xad, 0x66, 0xbd, 0x69, 0x6f, 0x7a, + 0x90, 0xa4, 0x50, 0x96, 0xad, 0x33, 0xa7, 0xb7, 0x49, 0x74, 0x9f, 0x73, + 0x73, 0x86, 0xc5, 0x49, 0x8e, 0x83, 0x72, 0xa4, 0x9d, 0xcd, 0x80, 0x3f, + 0x3f, 0x96, 0x2d, 0xc0, 0x9f, 0x42, 0x8f, 0xc2, 0x5c, 0xa9, 0x56, 0x38, + 0x55, 0x89, 0xc1, 0x73, 0x46, 0xa7, 0xd5, 0x53, 0x3f, 0xb3, 0x7a, 0x36, + 0xb2, 0xc6, 0x5c, 0x20, 0x35, 0xa6, 0xbb, 0x44, 0xa0, 0x49, 0x39, 0x73, + 0x8b, 0x90, 0xb3, 0x35, 0x82, 0x46, 0xb1, 0xbb, 0x97, 0x45, 0xc0, 0x54, + 0xa7, 0x8b, 0x40, 0xce, 0x5e, 0x9a, 0x50, 0x94, 0x92, 0x5c, 0xac, 0xc9, + 0xbe, 0x5d, 0x3b, 0xcf, 0xb5, 0x66, 0xb2, 0x42, 0xc1, 0x5c, 0xc1, 0x58, + 0x68, 0x64, 0x35, 0x51, 0x88, 0xc8, 0x5b, 0x9f, 0xb3, 0x66, 0x9d, 0x8a, + 0x31, 0x41, 0x62, 0x46, 0x57, 0xb6, 0x5f, 0xa6, 0x47, 0x58, 0xaf, 0x37, + 0x82, 0xa5, 0x64, 0x2a, 0x68, 0xa4, 0xa9, 0x4e, 0xa7, 0xa0, 0x3d, 0xd7, + 0x3a, 0x52, 0x43, 0xa0, 0x8b, 0xa5, 0x50, 0x98, 0x9a, 0xb3, 0xb4, 0x4a, + 0x78, 0x67, 0x91, 0x90, 0xc2, 0x5a, 0x73, 0x7f, 0xc2, 0xaf, 0x53, 0x3b, + 0xb5, 0x4d, 0xa1, 0x84, 0xbb, 0x96, 0x7a, 0x89, 0x68, 0x38, 0x80, 0x8c, + 0x67, 0xa1, 0x82, 0x59, 0x46, 0x4c, 0x45, 0x93, 0xc3, 0xc4, 0x58, 0x5c, + 0x6a, 0x9c, 0x63, 0x8d, 0x69, 0x7d, 0x6f, 0x3c, 0xca, 0x69, 0x22, 0x3e, + 0x2c, 0x53, 0xa6, 0x89, 0x8d, 0x3c, 0xcb, 0xc7, 0x98, 0xca, 0x5b, 0x7e, + 0xb2, 0x3a, 0x73, 0x75, 0x9f, 0x92, 0x4a, 0x63, 0x2b, 0x88, 0x40, 0x87, + 0x95, 0x9d, 0x7c, 0x4a, 0x45, 0xcc, 0x2e, 0x47, 0xc2, 0xa6, 0xc9, 0xbb, + 0x3a, 0x87, 0xa8, 0x81, 0xb5, 0xc5, 0xa6, 0x5b, 0xaf, 0x59, 0x91, 0x5e, + 0x8f, 0x82, 0x81, 0xa9, 0xab, 0x46, 0x60, 0x83, 0x53, 0x5e, 0x93, 0x49, + 0xa6, 0x81, 0xbf, 0x4f, 0x69, 0x8b, 0x9b, 0x51, 0x84, 0x40, 0x62, 0x63, + 
0x42, 0xae, 0x77, 0x7b, 0x77, 0x43, 0x46, 0x35, 0x6f, 0x64, 0x77, 0x3d, + 0x7d, 0x66, 0x5e, 0x55, 0xc5, 0x54, 0x9d, 0xc2, 0x94, 0x5b, 0x64, 0x72, + 0x3e, 0xcb, 0x51, 0x98, 0x7d, 0x9c, 0x91, 0x3b, 0x6b, 0xad, 0x34, 0x7e, + 0x7f, 0xba, 0x5e, 0x3c, 0x72, 0x3b, 0x60, 0x4a, 0x52, 0xc8, 0x5b, 0x5f, + 0x5e, 0x3d, 0xae, 0x69, 0xc0, 0x59, 0xc1, 0xa3, 0xb6, 0xc2, 0xbc, 0xab, + 0xad, 0xb1, 0x71, 0x7d, 0x41, 0x47, 0x64, 0x8f, 0xc7, 0x50, 0x4e, 0x89, + 0x55, 0x62, 0xa6, 0x7c, 0x5e, 0xd3, 0xd0, 0xa4, 0xa7, 0x6a, 0xce, 0x95, + 0x8e, 0x74, 0x80, 0x70, 0x6c, 0x8d, 0x4b, 0x3b, 0x7f, 0x75, 0x53, 0x90, + 0x3a, 0xcc, 0xc2, 0xd4, 0x52, 0xc5, 0x42, 0x38, 0x69, 0x3e, 0x90, 0x5f, + 0xb2, 0x34, 0xca, 0xa1, 0x66, 0x3d, 0xcd, 0xb6, 0xbc, 0x4c, 0xaf, 0x50, + 0xdd, 0x4c, 0x35, 0x81, 0x4c, 0xa0, 0x8d, 0x48, 0x5c, 0x90, 0xcc, 0x20, + 0x37, 0x6a, 0x49, 0x33, 0x8a, 0x80, 0x53, 0xb7, 0xc6, 0x7b, 0xa1, 0xc1, + 0x34, 0xa7, 0x82, 0x64, 0xc8, 0xbb, 0xd0, 0xb4, 0x85, 0x96, 0x6e, 0xb4, + 0xca, 0x95, 0x61, 0x99, 0x5d, 0x80, 0x71, 0x87, 0x7a, 0x7e, 0x57, 0x52, + 0x35, 0x85, 0xcc, 0x93, 0x78, 0xb9, 0x8c, 0x44, 0xd5, 0x5b, 0x7f, 0xa8, + 0x61, 0xc6, 0x4b, 0xc3, 0x9c, 0xb7, 0x58, 0x9f, 0xc3, 0x98, 0x3a, 0x5a, + 0x5b, 0x33, 0x85, 0xb9, 0x6f, 0x58, 0xcb, 0x3a, 0xcb, 0x67, 0xbf, 0x54, + 0xd9, 0x63, 0xb1, 0xbe, 0x56, 0x96, 0x66, 0x83, 0xb5, 0x2d, 0x69, 0xac, + 0xa9, 0x78, 0xbd, 0x59, 0x5e, 0x9b, 0xbe, 0x3d, 0x37, 0xc3, 0x5a, 0x9f, + 0x41, 0x52, 0x4c, 0x66, 0xc9, 0x8e, 0x4b, 0xca, 0xb4, 0xb7, 0x45, 0xb7, + 0xd1, 0x98, 0x51, 0x8e, 0xb8, 0x41, 0xb8, 0xa8, 0x4e, 0x6c, 0x33, 0xc8, + 0xa2, 0xa3, 0x90, 0x86, 0x66, 0x4c, 0x7c, 0xa3, 0x77, 0x60, 0xb9, 0xa5, + 0x87, 0xc4, 0xa2, 0x79, 0x66, 0xc1, 0x4a, 0x97, 0x82, 0x8a, 0xa8, 0x4d, + 0x88, 0x4e, 0x6e, 0x4f, 0x2e, 0x44, 0xd2, 0x79, 0x7b, 0xa6, 0x45, 0x58, + 0x9d, 0xba, 0x39, 0x92, 0xbb, 0xc1, 0x77, 0x67, 0x41, 0x68, 0x81, 0x80, + 0xa0, 0xbd, 0x7b, 0x32, 0x61, 0xd9, 0xd2, 0x36, 0x35, 0x68, 0xc1, 0x3d, + 0xd1, 0x8a, 0x9c, 0x57, 0x5d, 0xcc, 0xbb, 0x9b, 0xb6, 0x31, 0x27, 0xb1, + 0x86, 0x49, 0x6f, 0x5b, 0xcd, 0xc1, 0x81, 0x67, 0x45, 0x94, 0xd5, 0xa2, + 0xc5, 0x5e, 0x84, 0xb3, 0x41, 0xb4, 0x66, 0x79, 0xa8, 0x86, 0xa4, 0xbc, + 0x51, 0xa9, 0x86, 0x60, 0xb5, 0x8f, 0x8e, 0x64, 0x93, 0xa3, 0x46, 0x70, + 0xbf, 0x90, 0xc6, 0x49, 0x6c, 0xcf, 0xb2, 0xab, 0xaa, 0xd2, 0xc4, 0x8e, + 0xa6, 0x3e, 0x84, 0x5d, 0xc4, 0x4e, 0x91, 0x8e, 0x81, 0x4c, 0x43, 0xd1, + 0x4b, 0xbd, 0x57, 0xbd, 0x44, 0x2f, 0x56, 0x52, 0x7e, 0x27, 0x34, 0xac, + 0xa7, 0x8f, 0x83, 0x64, 0x99, 0x5f, 0x79, 0x5b, 0xad, 0xa5, 0x99, 0xb6, + 0x50, 0xb3, 0x28, 0xa5, 0x48, 0xe0, 0x79, 0x3b, 0xaa, 0xac, 0x5e, 0x45, + 0xcb, 0x56, 0xc5, 0xb0, 0x8e, 0x3b, 0x80, 0x42, 0x95, 0x6c, 0x97, 0x64, + 0xca, 0x5e, 0x7a, 0x3d, 0x87, 0xe4, 0x69, 0x77, 0x71, 0x99, 0x7e, 0x74, + 0xd7, 0x74, 0xa4, 0xba, 0x48, 0xc3, 0x9d, 0x88, 0x84, 0x4b, 0xbf, 0xc7, + 0x8e, 0x78, 0xc2, 0x2d, 0xb3, 0x78, 0x2d, 0x33, 0x64, 0x90, 0x8e, 0x3e, + 0x5a, 0xc8, 0x6d, 0x89, 0x3e, 0x67, 0x48, 0x7a, 0x41, 0x6c, 0x3a, 0x53, + 0xaa, 0x8a, 0x69, 0x69, 0xa1, 0x5c, 0x6c, 0x96, 0x99, 0x72, 0x6e, 0x66, + 0xbd, 0x83, 0xa9, 0x51, 0xab, 0x48, 0xca, 0x30, 0x3b, 0x6f, 0x66, 0xad, + 0x67, 0x71, 0x42, 0x31, 0x2d, 0x60, 0x8c, 0x95, 0x95, 0x44, 0xba, 0x92, + 0xa2, 0xaf, 0x80, 0x9d, 0x43, 0xaf, 0x8f, 0x2a, 0x5e, 0xd1, 0x70, 0x56, + 0xc8, 0x67, 0x48, 0x48, 0x84, 0x4f, 0x8a, 0x81, 0x32, 0x84, 0x78, 0xb7, + 0x7b, 0x79, 0x59, 0xc1, 0xaf, 0xac, 0x5d, 0x85, 0xc5, 0xca, 0xc8, 0x7e, + 0xa3, 0x4d, 0x80, 0x4c, 0x2f, 0x9a, 0x59, 0x71, 0x4b, 0x77, 0xbe, 0xc4, + 0x93, 0xa2, 0x66, 0x3a, 0x68, 0x92, 0xc0, 0x81, 0x70, 0x7a, 0xce, 0xc1, + 
0x2d, 0x75, 0x55, 0xa7, 0xa8, 0x68, 0xa5, 0x55, 0x44, 0x8d, 0x4e, 0x33, + 0x74, 0x25, 0x90, 0x91, 0x93, 0x6d, 0xb4, 0xdc, 0xaa, 0x63, 0x7f, 0x6c, + 0x5b, 0xc1, 0x6c, 0x32, 0x67, 0x90, 0xb7, 0x69, 0x57, 0x70, 0x59, 0xc8, + 0x41, 0x84, 0x40, 0x45, 0x88, 0x43, 0xbf, 0x90, 0x79, 0xc1, 0x4b, 0x3b, + 0x3d, 0x6a, 0xcf, 0x8b, 0x44, 0x73, 0x89, 0xc9, 0xcc, 0xa1, 0x7b, 0x5d, + 0x7f, 0x45, 0xa9, 0x33, 0xa3, 0xc1, 0x9d, 0xcb, 0x81, 0x40, 0xaf, 0x65, + 0x40, 0x80, 0x90, 0x6b, 0xc5, 0xc6, 0x31, 0x5c, 0xbe, 0x90, 0x36, 0xaa, + 0x5f, 0xa1, 0x88, 0x93, 0x70, 0xc9, 0x7d, 0xd4, 0x3b, 0x80, 0xd4, 0xd2, + 0xc1, 0xca, 0xa9, 0x7e, 0xb0, 0x55, 0x8d, 0x6c, 0xb3, 0x4b, 0x6e, 0x85, + 0x42, 0xa3, 0x9a, 0xab, 0x54, 0x34, 0xc1, 0xac, 0x5a, 0xcf, 0x4b, 0xb1, + 0x73, 0xd3, 0x4e, 0x6f, 0x5e, 0x4f, 0x81, 0xc6, 0xd4, 0x71, 0xc3, 0x89, + 0x3f, 0x7b, 0xbe, 0x51, 0x69, 0x72, 0x57, 0xcd, 0x87, 0xcb, 0xa2, 0x64, + 0x57, 0x83, 0x6d, 0xa1, 0x48, 0x87, 0xb6, 0xa0, 0x84, 0x55, 0x78, 0x9b, + 0xd2, 0x6b, 0x47, 0x31, 0x36, 0xd0, 0x7f, 0x40, 0x95, 0xc1, 0xb1, 0x50, + 0x83, 0xa6, 0x68, 0x70, 0x9b, 0x5b, 0xb6, 0x51, 0xbd, 0x87, 0x4a, 0x7e, + 0x62, 0xa3, 0x63, 0x69, 0xc5, 0x54, 0xac, 0xa9, 0xc4, 0x71, 0x5f, 0xce, + 0x43, 0x6e, 0xae, 0x3b, 0xc4, 0xc6, 0x95, 0x73, 0x60, 0xa0, 0xcf, 0x37, + 0x3c, 0x3f, 0x4e, 0x7b, 0x2b, 0x36, 0x77, 0xc5, 0xaa, 0x9f, 0x34, 0x33, + 0x7d, 0x66, 0xcb, 0xa3, 0x50, 0x93, 0xb4, 0xa1, 0x42, 0xb7, 0x2f, 0x9f, + 0x4a, 0x6d, 0xa9, 0xb7, 0x3f, 0x77, 0x61, 0x8d, 0x6c, 0xc0, 0x48, 0x33, + 0x4a, 0x57, 0x93, 0xb2, 0xad, 0x7a, 0x8d, 0x8b, 0x3e, 0xc4, 0x7f, 0x5e, + 0x9c, 0x48, 0x4e, 0x77, 0xa5, 0x6f, 0x73, 0x51, 0x62, 0x83, 0x8b, 0xa1, + 0x65, 0xa7, 0xc1, 0x46, 0x7c, 0x63, 0x77, 0x4a, 0x5b, 0x5c, 0x8f, 0xb5, + 0xb2, 0xab, 0x59, 0x98, 0x3f, 0x68, 0xba, 0xa5, 0x59, 0x63, 0x40, 0x36, + 0xc6, 0x41, 0x4d, 0x4b, 0x95, 0x76, 0x59, 0x8f, 0x4b, 0x7e, 0x4c, 0x43, + 0xa0, 0xa2, 0x82, 0x69, 0xab, 0x38, 0x8d, 0xc7, 0xc5, 0x9d, 0x88, 0x90, + 0x2f, 0x79, 0x41, 0x4d, 0xaf, 0xb7, 0x50, 0x31, 0xa6, 0x77, 0x9c, 0x5a, + 0x7a, 0x3f, 0xc5, 0xb8, 0xc4, 0xa7, 0x7e, 0x38, 0x72, 0x7b, 0x98, 0x55, + 0x91, 0x3e, 0xa3, 0x5d, 0x76, 0x83, 0x90, 0x8b, 0xb3, 0xc0, 0x80, 0x6b, + 0x77, 0xda, 0xcf, 0x99, 0x5c, 0xd5, 0x8d, 0xb3, 0xbd, 0x59, 0x81, 0x50, + 0xc1, 0xad, 0xaf, 0xbb, 0x5b, 0x9d, 0x88, 0xc1, 0xbf, 0x3f, 0xc9, 0x54, + 0x73, 0x91, 0x6a, 0x72, 0x6a, 0x3b, 0x6c, 0x4b, 0x38, 0x4f, 0xa1, 0xc8, + 0x55, 0xc7, 0xa7, 0x52, 0x43, 0x78, 0x95, 0x72, 0xd5, 0xa4, 0x78, 0x87, + 0xad, 0xb7, 0x58, 0xb7, 0x5c, 0xcb, 0xc4, 0xb0, 0x5d, 0x6a, 0x94, 0x84, + 0x3c, 0xba, 0x46, 0xba, 0xc7, 0xb8, 0x75, 0x3a, 0xaf, 0x7f, 0x68, 0xbc, + 0x52, 0xc4, 0x3c, 0x76, 0x3d, 0xc5, 0xcc, 0x8c, 0xb0, 0x4b, 0x34, 0x96, + 0xbe, 0x8d, 0x50, 0xc4, 0xae, 0x57, 0x87, 0xc2, 0x92, 0x75, 0x58, 0x6b, + 0x6d, 0x8b, 0x9b, 0x33, 0x59, 0x93, 0xaa, 0x60, 0x52, 0x55, 0x6a, 0x45, + 0xa9, 0x47, 0x6b, 0x57, 0xad, 0xc7, 0xc9, 0x94, 0x9b, 0x4c, 0x3d, 0x7f, + 0x96, 0x8f, 0xac, 0xd7, 0x32, 0x52, 0xd8, 0xa3, 0x7a, 0xb6, 0xc3, 0x60, + 0x4b, 0x83, 0x57, 0x79, 0x93, 0xc2, 0x33, 0xbf, 0xb1, 0x7f, 0x99, 0x8d, + 0xce, 0xc2, 0xa5, 0x75, 0x73, 0xb6, 0x6b, 0x5c, 0xba, 0x54, 0x93, 0x5e, + 0xcb, 0x3f, 0xb5, 0x59, 0x91, 0xbc, 0x5d, 0x9f, 0x87, 0xa9, 0xc2, 0xbb, + 0x66, 0xa4, 0x55, 0x80, 0xbb, 0x70, 0x64, 0xa2, 0x5c, 0x9e, 0x76, 0x82, + 0x94, 0x68, 0x7e, 0x78, 0x31, 0x3f, 0x91, 0x73, 0x44, 0x3d, 0xc4, 0x3d, + 0x56, 0x76, 0x94, 0x37, 0x58, 0xac, 0xad, 0x97, 0x58, 0x88, 0x39, 0x65, + 0x69, 0xd0, 0x7c, 0x34, 0xaf, 0x9f, 0x88, 0x61, 0xcb, 0x52, 0xc1, 0xc1, + 0xaa, 0x45, 0x36, 0x5a, 0x5a, 0xab, 0x6c, 0x8d, 0x37, 0xcd, 0x4c, 0x68, + 
0x8d, 0xcd, 0x76, 0x6a, 0x8f, 0x36, 0xa7, 0xa2, 0x7a, 0x99, 0x50, 0xbf, + 0x55, 0x35, 0xc7, 0xd0, 0xd0, 0x83, 0xc3, 0xc4, 0xc7, 0x6d, 0x3b, 0x82, + 0x5b, 0x4c, 0x55, 0x4c, 0x53, 0x55, 0xa7, 0x40, 0x34, 0xa4, 0x49, 0x6c, + 0x5d, 0x78, 0x5d, 0x8b, 0x89, 0xb2, 0x3c, 0x55, 0x64, 0x53, 0xc6, 0x9b, + 0x7b, 0x69, 0x6b, 0x6b, 0x56, 0xc0, 0x45, 0x98, 0xaf, 0x43, 0x2d, 0x3d, + 0x44, 0x4a, 0x80, 0x7f, 0x3c, 0x94, 0x93, 0x78, 0x43, 0x36, 0x8d, 0x56, + 0x47, 0x92, 0x3f, 0x3e, 0x81, 0x61, 0x70, 0x5e, 0xbb, 0x68, 0xac, 0xcc, + 0x31, 0x69, 0x3e, 0x5b, 0x92, 0x38, 0xab, 0x5a, 0xa7, 0x58, 0x7b, 0x75, + 0x87, 0x9a, 0xbb, 0x5f, 0x87, 0xd6, 0xc5, 0x8e, 0xc4, 0xb7, 0x87, 0x41, + 0xbd, 0x80, 0x76, 0xa0, 0x70, 0x70, 0x46, 0x5e, 0x65, 0x43, 0xa1, 0xc2, + 0xba, 0x80, 0x52, 0x8a, 0xba, 0x39, 0x37, 0x33, 0x8b, 0xa7, 0xa1, 0x48, + 0x45, 0x4f, 0xcd, 0x73, 0x89, 0x34, 0x50, 0x57, 0x34, 0x38, 0x73, 0x9d, + 0x45, 0xbf, 0x5a, 0xab, 0x59, 0x76, 0x58, 0xb2, 0xb2, 0xb2, 0xac, 0x5c, + 0x4a, 0x85, 0xaf, 0x30, 0xb9, 0x73, 0xaf, 0xcf, 0x3d, 0xab, 0xb9, 0x8a, + 0xa3, 0x3a, 0x40, 0x66, 0x6e, 0x84, 0x75, 0xd0, 0x5b, 0x48, 0xcf, 0xb4, + 0xae, 0x97, 0xb7, 0x51, 0x5c, 0x6e, 0x5e, 0x9f, 0x44, 0xb7, 0x41, 0xc2, + 0x3e, 0x7d, 0x8c, 0x5f, 0x64, 0x84, 0x6b, 0xc0, 0xce, 0x97, 0x6f, 0x78, + 0xbb, 0xb8, 0x87, 0x47, 0x56, 0xbe, 0x39, 0x45, 0xb3, 0xd3, 0xbc, 0x87, + 0x72, 0x70, 0x5a, 0x80, 0x7f, 0xc1, 0x4f, 0x6f, 0x89, 0xd8, 0x53, 0x47, + 0xd2, 0x8e, 0xa9, 0xa5, 0x5f, 0xcb, 0x76, 0xb3, 0x3c, 0xd2, 0xaa, 0xa8, + 0x3c, 0x4b, 0xce, 0x55, 0xb0, 0x65, 0xc7, 0xa2, 0x9b, 0x66, 0xa8, 0x7f, + 0x80, 0x72, 0x58, 0x34, 0x54, 0x74, 0x81, 0x7a, 0x9d, 0xbc, 0xd2, 0x3e, + 0xb3, 0xb9, 0x83, 0xad, 0x45, 0xc1, 0xcc, 0x5b, 0xca, 0xba, 0x9a, 0xd2, + 0x8e, 0x76, 0x7d, 0x8c, 0x3b, 0x58, 0x99, 0xa0, 0xbf, 0x41, 0x7d, 0x70, + 0x87, 0x98, 0x3e, 0x61, 0x99, 0x31, 0xc6, 0xb4, 0x8c, 0x77, 0x97, 0x87, + 0x39, 0x8e, 0x5c, 0x4f, 0x72, 0x7d, 0xab, 0xd4, 0xb5, 0x8a, 0x92, 0x9b, + 0xbd, 0x56, 0x6e, 0x69, 0x8c, 0x2d, 0xc1, 0xa8, 0xa3, 0x88, 0x5d, 0x66, + 0x9b, 0x49, 0x5f, 0x87, 0xa3, 0x38, 0xaf, 0x97, 0x45, 0xab, 0x4f, 0x39, + 0xcc, 0x73, 0xb6, 0x8f, 0x97, 0x4d, 0xc6, 0x58, 0x7f, 0x44, 0xa7, 0x69, + 0x6e, 0x36, 0xd2, 0xbf, 0x77, 0x6e, 0x6e, 0x4c, 0x45, 0xa1, 0x76, 0x82, + 0x44, 0xa2, 0x9d, 0x3e, 0x33, 0x98, 0xc3, 0x89, 0x53, 0x5e, 0x9f, 0x5b, + 0x40, 0x82, 0x30, 0x4f, 0x2b, 0x9b, 0x7c, 0x68, 0xb0, 0xce, 0x9b, 0x6f, + 0x6f, 0xb8, 0x44, 0x3e, 0x85, 0xd9, 0xd1, 0x30, 0xbb, 0x4a, 0x73, 0xc4, + 0x55, 0x4c, 0x52, 0x8e, 0xe2, 0x77, 0x7e, 0xb0, 0xc1, 0x69, 0xbf, 0x5b, + 0x44, 0xcd, 0x70, 0xbb, 0xa7, 0xac, 0x6e, 0x36, 0xb9, 0x9a, 0x96, 0x89, + 0x86, 0x4d, 0x82, 0x9c, 0x7a, 0x65, 0xc2, 0x44, 0x35, 0x3e, 0x8a, 0x45, + 0x73, 0x3d, 0x57, 0xd2, 0x7f, 0xd2, 0xc6, 0x7f, 0x34, 0xcc, 0xd9, 0x43, + 0xb7, 0xaf, 0xad, 0x9f, 0x68, 0x8f, 0x62, 0xaa, 0x74, 0xb2, 0x46, 0xa1, + 0xc1, 0xa7, 0x56, 0x99, 0x40, 0x9d, 0x8e, 0x61, 0x9f, 0x6c, 0x7a, 0xb5, + 0xc3, 0x8c, 0xbb, 0x91, 0x3d, 0x3f, 0xba, 0x51, 0xb5, 0x43, 0x5e, 0x82, + 0x7e, 0x63, 0x34, 0x9c, 0xbe, 0xad, 0xa7, 0x85, 0x4d, 0xad, 0x79, 0x3c, + 0x56, 0xc9, 0xb5, 0x63, 0x50, 0x53, 0xa6, 0x76, 0xa4, 0xc8, 0xc8, 0x37, + 0x35, 0x79, 0xa3, 0x87, 0x8a, 0xa5, 0x84, 0xb8, 0xaf, 0xb7, 0x74, 0x5d, + 0x71, 0x80, 0xa5, 0xce, 0xa6, 0x3d, 0x61, 0x65, 0xb6, 0xce, 0x72, 0xa6, + 0xbd, 0x79, 0x4a, 0x61, 0x3d, 0x65, 0xc6, 0xc0, 0x80, 0xd0, 0x9f, 0x89, + 0xd2, 0x5f, 0x79, 0x96, 0xb7, 0x50, 0x41, 0x9b, 0xc5, 0x40, 0x62, 0x40, + 0xcd, 0xc9, 0xbc, 0x4f, 0x8d, 0x54, 0x63, 0xa8, 0x32, 0xc6, 0xa1, 0x8a, + 0x84, 0x77, 0x4c, 0x2f, 0x5c, 0x5b, 0xad, 0x27, 0x82, 0x43, 0x34, 0x9d, + 
0x3e, 0x3d, 0xbf, 0x8f, 0x96, 0x77, 0x3b, 0xc5, 0x55, 0xe2, 0xa7, 0x49, + 0x63, 0x8e, 0xb8, 0x41, 0x47, 0x9c, 0x84, 0xa8, 0x7f, 0x53, 0xb8, 0xa1, + 0x51, 0x83, 0x56, 0x45, 0xd0, 0xa4, 0x3b, 0xaa, 0xa0, 0x81, 0xc1, 0xb7, + 0x43, 0xbf, 0x4c, 0x6c, 0xa1, 0x4c, 0x3e, 0x3b, 0x50, 0x3a, 0x56, 0x92, + 0x9f, 0x83, 0x56, 0xc7, 0x68, 0x7a, 0x5a, 0xac, 0x4f, 0x3f, 0x43, 0x66, + 0xc9, 0x8f, 0x3f, 0xa6, 0xa0, 0x88, 0x5f, 0x86, 0x45, 0x43, 0x63, 0xa9, + 0x8c, 0x89, 0xbd, 0x78, 0xca, 0x54, 0xcf, 0x5e, 0xb1, 0x68, 0xa3, 0x46, + 0x8d, 0xab, 0xd2, 0x4e, 0x7e, 0x79, 0x51, 0xa3, 0x87, 0xb9, 0x6d, 0x9c, + 0x3b, 0x54, 0x54, 0x9b, 0x42, 0x34, 0x68, 0x52, 0xd6, 0xda, 0x3b, 0x8b, + 0x6f, 0x67, 0x41, 0x2e, 0x8c, 0xbb, 0xbb, 0x4e, 0xc9, 0xce, 0x72, 0x91, + 0xca, 0xc0, 0x9d, 0xa8, 0x3b, 0x71, 0x50, 0x84, 0xad, 0xc3, 0x96, 0x95, + 0xc6, 0x58, 0x43, 0x99, 0x4e, 0xa8, 0x9c, 0xa0, 0x4b, 0x8a, 0x5b, 0xa3, + 0xbc, 0x7b, 0x70, 0xbc, 0x34, 0xbd, 0xa8, 0x8c, 0x93, 0x66, 0xb0, 0xb0, + 0x90, 0x64, 0x56, 0x34, 0x44, 0xa0, 0xbc, 0x6b, 0xc2, 0x3d, 0x9a, 0x44, + 0x71, 0xc8, 0x42, 0x49, 0x9f, 0xd7, 0xb5, 0x2f, 0x83, 0x63, 0x54, 0xcd, + 0x90, 0x47, 0x5f, 0x40, 0x73, 0xa8, 0x8b, 0x6c, 0xad, 0x53, 0x90, 0x85, + 0x4f, 0x96, 0xd6, 0x58, 0x44, 0xc8, 0x35, 0x7b, 0xcc, 0x6e, 0x89, 0x40, + 0x71, 0xd1, 0x47, 0xcd, 0x46, 0xb9, 0x74, 0x45, 0x9c, 0xc9, 0x90, 0x51, + 0xb2, 0x88, 0x96, 0xbc, 0x42, 0x54, 0xaa, 0xa6, 0xa1, 0x52, 0xc8, 0x56, + 0x32, 0xd3, 0x7a, 0xcb, 0x67, 0x64, 0xb1, 0x8b, 0xdd, 0xc3, 0xaf, 0x42, + 0xb7, 0x6d, 0x36, 0xa5, 0xa3, 0xab, 0x53, 0x49, 0x8e, 0xc0, 0x3b, 0x8b, + 0xd9, 0xb6, 0xbf, 0x2c, 0x4e, 0x9d, 0xa0, 0xa4, 0x46, 0x39, 0x9d, 0xba, + 0x69, 0x7f, 0x3e, 0x45, 0x62, 0x64, 0xd0, 0xa8, 0x8e, 0xaf, 0x61, 0x78, + 0x7b, 0x6c, 0x9f, 0x3e, 0xcc, 0x69, 0xbd, 0x81, 0xac, 0xb7, 0x3b, 0xa2, + 0xab, 0xc3, 0x4e, 0x5b, 0x81, 0xa5, 0x77, 0x74, 0xca, 0xa9, 0x86, 0x4c, + 0x4b, 0x72, 0x34, 0xd4, 0x42, 0x72, 0x31, 0x65, 0xc9, 0xb1, 0x7a, 0xa2, + 0x5f, 0x72, 0x80, 0xb5, 0x72, 0x7b, 0x3b, 0x96, 0xa3, 0x57, 0xd0, 0xd2, + 0x6c, 0xd3, 0xb3, 0x3f, 0xcd, 0x7a, 0x37, 0x9b, 0x7f, 0x49, 0x9a, 0x8c, + 0x9d, 0x5b, 0x6b, 0xd0, 0x3f, 0xb4, 0xbc, 0xb4, 0x48, 0x3c, 0xbf, 0x96, + 0xc1, 0x44, 0x3d, 0xa6, 0xb5, 0x9e, 0xaa, 0x36, 0x4c, 0x98, 0xbd, 0xb3, + 0x6f, 0xa2, 0xbb, 0xb5, 0xbd, 0x81, 0xb6, 0xcb, 0x49, 0x71, 0x5f, 0x72, + 0x6f, 0x68, 0x43, 0x60, 0x58, 0x3e, 0x5e, 0x44, 0xae, 0xc0, 0x7b, 0xa6, + 0xc9, 0x91, 0x63, 0x32, 0x50, 0xa7, 0x9c, 0xb7, 0xc4, 0x43, 0xb8, 0x7b, + 0xd3, 0xb3, 0x4a, 0x96, 0xb0, 0x70, 0x8e, 0xc0, 0x42, 0x77, 0x38, 0x46, + 0x70, 0xa8, 0x4a, 0x2f, 0x7e, 0xa7, 0xa5, 0x82, 0x43, 0x43, 0x79, 0xcf, + 0x60, 0x96, 0x30, 0x71, 0x45, 0x36, 0x40, 0x81, 0x7d, 0x80, 0x50, 0x60, + 0xbe, 0x69, 0x64, 0xcb, 0xa3, 0x47, 0x63, 0x6c, 0x4b, 0xd1, 0x3b, 0x41, + 0x54, 0x5d, 0x7e, 0x71, 0xa2, 0x9b, 0x62, 0x82, 0x7d, 0x73, 0xa9, 0x9d, + 0x9a, 0x91, 0x49, 0xa5, 0xd8, 0x57, 0xc0, 0x46, 0xc0, 0x38, 0x9e, 0xc0, + 0x3b, 0xa1, 0x56, 0x76, 0x46, 0x3f, 0xb5, 0x39, 0x65, 0xbd, 0xa7, 0x3e, + 0x69, 0x71, 0x41, 0x7d, 0x85, 0x52, 0xc5, 0x75, 0x35, 0x81, 0x5e, 0xae, + 0xa6, 0x5a, 0x9d, 0xcc, 0x39, 0x32, 0x6e, 0x8d, 0x2c, 0x75, 0x51, 0xb2, + 0xce, 0x54, 0xa7, 0x54, 0x51, 0x46, 0x6a, 0x40, 0x68, 0xce, 0x92, 0x46, + 0x80, 0x82, 0x5f, 0xb6, 0xcf, 0x61, 0x8d, 0x52, 0x77, 0x94, 0x9f, 0xc7, + 0xbc, 0xba, 0x74, 0xc5, 0x38, 0x5a, 0x51, 0x4c, 0xba, 0xa3, 0x90, 0x53, + 0xbc, 0x99, 0x8d, 0x52, 0x7f, 0x53, 0x58, 0x4d, 0x7d, 0x39, 0x6c, 0x41, + 0x8a, 0x5b, 0x7c, 0x5d, 0xdb, 0x81, 0x53, 0x87, 0xbf, 0x40, 0xa3, 0xc2, + 0x47, 0x8c, 0x88, 0xce, 0x95, 0x5b, 0xc5, 0x69, 0xa0, 0xc1, 0x97, 0xbd, + 
0x49, 0xbf, 0xb4, 0x4b, 0xc1, 0xd3, 0x57, 0x85, 0x9f, 0x79, 0x6e, 0xb5, + 0x53, 0x72, 0x44, 0xc6, 0xb3, 0x8e, 0x84, 0xaa, 0x30, 0xa5, 0x97, 0x56, + 0x81, 0x7f, 0x7c, 0x34, 0x5e, 0xb6, 0x61, 0x8f, 0xa6, 0x6d, 0xb0, 0x58, + 0x67, 0xda, 0xc2, 0xbe, 0x44, 0x40, 0x35, 0x88, 0x3f, 0x6a, 0x50, 0xc8, + 0xb3, 0xa2, 0x64, 0x4f, 0xd4, 0x86, 0xae, 0x95, 0x99, 0x40, 0xc0, 0xa0, + 0x96, 0x89, 0xa5, 0xbf, 0x4d, 0xb1, 0x99, 0x77, 0x7f, 0x95, 0xd5, 0xb2, + 0x5b, 0x6b, 0xc3, 0x32, 0x78, 0x52, 0x5e, 0x6c, 0x50, 0x4a, 0x80, 0x6b, + 0x47, 0xdc, 0x75, 0x41, 0xac, 0xd6, 0xca, 0xa7, 0xab, 0x84, 0x39, 0x31, + 0x7d, 0x41, 0x3c, 0xa3, 0xa3, 0xc4, 0xa1, 0xb6, 0x80, 0xda, 0xd9, 0x8f, + 0x71, 0x59, 0x5d, 0xca, 0x57, 0x94, 0xbc, 0x88, 0x6d, 0x67, 0xbd, 0x96, + 0x56, 0xd7, 0xb5, 0x5e, 0x55, 0x36, 0x7c, 0xb3, 0x58, 0x9d, 0x76, 0x50, + 0x9e, 0xbc, 0x4d, 0x8a, 0x77, 0x52, 0xac, 0x53, 0xc6, 0x9e, 0xc8, 0x70, + 0x9d, 0x3e, 0xaa, 0x8a, 0x22, 0x40, 0x39, 0xce, 0x5e, 0x58, 0x8c, 0x88, + 0xa8, 0x52, 0x54, 0x6b, 0x70, 0xd1, 0xc1, 0x96, 0xb1, 0xc9, 0x43, 0x66, + 0xa9, 0xc7, 0xc2, 0x42, 0x47, 0x80, 0xd8, 0x8a, 0x73, 0x35, 0x3a, 0xa8, + 0x3f, 0x9d, 0xac, 0xad, 0xdf, 0xb1, 0x45, 0x47, 0x5b, 0xc7, 0x6a, 0xbc, + 0xbb, 0xc3, 0xb3, 0xa1, 0x76, 0x80, 0x4e, 0xbd, 0x94, 0x74, 0x3f, 0x9a, + 0xb8, 0xcb, 0xb3, 0x92, 0x32, 0x44, 0x96, 0x83, 0xa7, 0xb1, 0x80, 0x33, + 0x9d, 0x9e, 0xcd, 0x4c, 0xa4, 0xc1, 0x4e, 0xcf, 0x65, 0x33, 0xcb, 0x92, + 0x65, 0x7e, 0x60, 0x49, 0xbe, 0x44, 0xd1, 0xb1, 0x4c, 0xa2, 0x87, 0x71, + 0x43, 0x87, 0x26, 0x39, 0x56, 0x6b, 0xb3, 0xa5, 0x56, 0xa4, 0xc7, 0xc3, + 0xad, 0x8b, 0xa9, 0xae, 0xd5, 0xc5, 0xc5, 0x43, 0x55, 0xc5, 0x63, 0x6c, + 0xc8, 0xc3, 0x5e, 0xbf, 0x7e, 0x32, 0xa6, 0x4a, 0x63, 0x79, 0x36, 0x7a, + 0x5b, 0x74, 0xae, 0x3b, 0xbf, 0x51, 0xd6, 0x3f, 0xbe, 0x3f, 0xa5, 0x38, + 0xb9, 0xd6, 0x53, 0xb8, 0x63, 0xcd, 0x4c, 0xc6, 0x89, 0x7d, 0x89, 0x66, + 0x94, 0x4b, 0xd5, 0x3c, 0x45, 0x36, 0x9e, 0xc0, 0xad, 0x81, 0x75, 0x79, + 0x24, 0x60, 0x40, 0x4e, 0x8d, 0xab, 0xbd, 0xb6, 0x4f, 0x75, 0xd3, 0x62, + 0xc8, 0x8f, 0xa6, 0x9b, 0xc0, 0x86, 0x4f, 0x84, 0x8b, 0xde, 0x59, 0x5c, + 0x81, 0x88, 0xa3, 0xb1, 0x7a, 0xb7, 0x78, 0x5b, 0x95, 0x32, 0xb3, 0xb6, + 0xac, 0xd0, 0x40, 0xa2, 0x1a, 0x37, 0xd0, 0xcd, 0x7b, 0xb0, 0x53, 0x4a, + 0xbe, 0x81, 0x7c, 0x44, 0x8a, 0x7c, 0x3b, 0x59, 0x38, 0xa9, 0xb3, 0xa1, + 0x9a, 0x77, 0x67, 0x8b, 0x8e, 0xe0, 0xce, 0xcf, 0xbb, 0x2a, 0xb4, 0x60, + 0x8f, 0x8d, 0x62, 0xc7, 0x92, 0x95, 0x95, 0x7e, 0xbf, 0x95, 0x8c, 0x81, + 0xc1, 0x86, 0xb3, 0x5b, 0x8f, 0xc0, 0xc9, 0x9a, 0x6a, 0xc5, 0x76, 0x5c, + 0x5c, 0xbd, 0x93, 0xa9, 0xa6, 0x84, 0x34, 0x53, 0xb0, 0x50, 0x40, 0xba, + 0x43, 0x48, 0x80, 0xb9, 0x58, 0x87, 0x8e, 0xcc, 0x2f, 0xb9, 0x41, 0x66, + 0x50, 0x48, 0x6a, 0x4d, 0x9f, 0x2b, 0x85, 0xbd, 0x87, 0xcf, 0xac, 0xc6, + 0x5e, 0x8c, 0x9a, 0xa0, 0x4c, 0xc3, 0x95, 0xc0, 0xa1, 0x9e, 0xcc, 0x37, + 0x82, 0x9a, 0x68, 0xca, 0xa2, 0x8e, 0x32, 0x96, 0x95, 0xbd, 0x61, 0x7e, + 0x65, 0x6d, 0xb8, 0xaa, 0xd9, 0xa1, 0xd6, 0x4f, 0xd1, 0x81, 0x55, 0x68, + 0x6b, 0x8e, 0xc3, 0x73, 0xbe, 0xc8, 0xc4, 0xa4, 0x72, 0xc0, 0xd3, 0x47, + 0x3f, 0x91, 0x72, 0x5b, 0x34, 0x45, 0x82, 0xb7, 0xca, 0x98, 0x9c, 0x83, + 0xd0, 0xad, 0x4a, 0x87, 0x81, 0x49, 0x9e, 0x70, 0x49, 0xa0, 0x4b, 0x69, + 0x8e, 0x9e, 0x98, 0x9e, 0x87, 0x81, 0xce, 0x3e, 0x41, 0x9e, 0x3f, 0x79, + 0xa1, 0x95, 0x43, 0x89, 0xbf, 0x90, 0x8f, 0xc5, 0x63, 0xb0, 0xb6, 0x3e, + 0xd1, 0x57, 0xc6, 0x64, 0x99, 0xca, 0xad, 0xbc, 0x45, 0x44, 0xd1, 0x8b, + 0x91, 0x91, 0x82, 0x36, 0xae, 0x67, 0x57, 0x56, 0x55, 0x94, 0x6f, 0xc1, + 0xc7, 0x6e, 0x34, 0xa4, 0xaa, 0x55, 0x3f, 0x9f, 0x60, 0xcd, 0xd5, 0x6c, + 
0xdd, 0x6e, 0xbe, 0x8c, 0xd4, 0x53, 0xc3, 0x44, 0x5a, 0x45, 0x8b, 0xb8, + 0xbf, 0x4f, 0x4f, 0x66, 0x3b, 0x4c, 0xb1, 0x84, 0x7d, 0x8a, 0xc8, 0x33, + 0x6d, 0xcb, 0x50, 0xa3, 0x52, 0x3c, 0xaf, 0xba, 0x71, 0x8b, 0xc5, 0xb0, + 0xa4, 0x81, 0x74, 0x4a, 0x98, 0xbd, 0x59, 0x63, 0x52, 0x50, 0xa8, 0x56, + 0x5a, 0x7c, 0x52, 0x2d, 0x94, 0xb6, 0x97, 0x35, 0x5a, 0x5d, 0x82, 0xd2, + 0xd1, 0x9f, 0xa6, 0x86, 0x76, 0x64, 0x40, 0x9d, 0x3b, 0x48, 0x80, 0x31, + 0xdb, 0x42, 0x62, 0xb7, 0xca, 0xbf, 0xa5, 0xd8, 0xe4, 0x8c, 0x33, 0xc6, + 0xc7, 0x72, 0x4c, 0x40, 0xe8, 0x94, 0x52, 0x94, 0x80, 0xae, 0x53, 0xa2, + 0x9c, 0x78, 0x77, 0xc7, 0x97, 0xb3, 0x83, 0x2e, 0xc5, 0x83, 0x8c, 0x4d, + 0xe0, 0x74, 0x57, 0x40, 0x56, 0xca, 0xc7, 0x4a, 0x71, 0x7a, 0xaf, 0x7b, + 0x72, 0x7c, 0x49, 0x4b, 0x8b, 0x78, 0xac, 0x37, 0x57, 0x59, 0x8c, 0x82, + 0x69, 0x4b, 0x51, 0x47, 0x95, 0x5e, 0xa8, 0xa6, 0x49, 0x51, 0x7c, 0xd7, + 0x76, 0x85, 0x90, 0x4d, 0x3e, 0x7b, 0xd2, 0x48, 0xa6, 0xcc, 0x8b, 0xbe, + 0xc6, 0x89, 0xa8, 0x9f, 0x62, 0xcd, 0x65, 0x64, 0x4d, 0x3a, 0x73, 0xa1, + 0xcf, 0x80, 0x8f, 0x3c, 0xbe, 0x5f, 0xb8, 0x9d, 0x99, 0x41, 0xc1, 0x93, + 0x39, 0x75, 0x52, 0x6a, 0xc3, 0x71, 0x65, 0x3a, 0x3b, 0x90, 0x4c, 0x7b, + 0xa0, 0x8c, 0xa7, 0x5a, 0xb3, 0xc5, 0x76, 0x9e, 0x4c, 0x98, 0x3e, 0xb3, + 0xa1, 0x6b, 0x47, 0xcf, 0xc8, 0x50, 0x4f, 0x92, 0x7a, 0xad, 0xc7, 0x82, + 0xda, 0xcb, 0x40, 0x8b, 0xdd, 0xaa, 0x69, 0x92, 0x85, 0x71, 0x64, 0x8e, + 0x53, 0xbb, 0x79, 0xbd, 0xc4, 0x33, 0xaf, 0xad, 0x5c, 0xa4, 0xc7, 0x62, + 0x74, 0x4d, 0x62, 0x5c, 0xaa, 0x4f, 0x94, 0x51, 0xa3, 0xb6, 0x43, 0x53, + 0xbb, 0x99, 0x72, 0x42, 0x58, 0xbc, 0x3f, 0xbe, 0x72, 0x86, 0x6e, 0x5b, + 0x60, 0x7d, 0xbc, 0x76, 0x42, 0xd4, 0xa4, 0xd1, 0x8b, 0x7c, 0xc2, 0x5b, + 0x50, 0x64, 0x53, 0x63, 0x48, 0xbc, 0x74, 0xc4, 0x81, 0xbf, 0x49, 0x4a, + 0xb7, 0xb1, 0x59, 0xbe, 0xb9, 0xae, 0x7f, 0xc2, 0x5e, 0x56, 0xbd, 0x84, + 0xab, 0xc0, 0x6d, 0x92, 0x42, 0xca, 0xc7, 0xc2, 0x3e, 0x56, 0x37, 0xb0, + 0xcb, 0xc3, 0x5b, 0x50, 0x9f, 0xb0, 0x33, 0x90, 0x76, 0x6a, 0x43, 0x3a, + 0xae, 0x74, 0xc0, 0x74, 0x98, 0x2a, 0x6f, 0x3d, 0x40, 0x84, 0xba, 0x8a, + 0x9a, 0x9f, 0xb0, 0x55, 0x5c, 0x76, 0xaf, 0xbb, 0x93, 0x6d, 0x6d, 0x66, + 0xa0, 0x90, 0x73, 0x66, 0xcc, 0x8a, 0x76, 0x39, 0x94, 0x5b, 0xaf, 0x9f, + 0xb7, 0xc2, 0x7e, 0x40, 0xaf, 0x52, 0x95, 0xb7, 0x9d, 0xbb, 0x81, 0xc5, + 0x82, 0x93, 0x7d, 0xb4, 0xcf, 0xd2, 0x50, 0xb5, 0x6e, 0xd7, 0xb6, 0x87, + 0x62, 0x81, 0xb8, 0xab, 0x4f, 0x32, 0x66, 0xab, 0x64, 0x7b, 0xb3, 0xca, + 0x35, 0x5c, 0x4f, 0x63, 0x71, 0x52, 0x6d, 0xa8, 0xbe, 0x66, 0x8c, 0xc6, + 0x71, 0x33, 0x90, 0xdb, 0xc3, 0x91, 0x9b, 0x37, 0xb7, 0x7f, 0xb5, 0x72, + 0x5d, 0x68, 0x79, 0xd4, 0xb7, 0xb3, 0xcf, 0xa3, 0x87, 0xa7, 0x56, 0xb8, + 0x3b, 0x4c, 0xa4, 0xaf, 0x7f, 0x3e, 0xcc, 0x6f, 0xcf, 0x70, 0xb0, 0xc0, + 0x7d, 0xa0, 0xc2, 0x84, 0x90, 0x65, 0x4a, 0x75, 0x88, 0x56, 0xac, 0x74, + 0x5a, 0x54, 0xbd, 0x92, 0x2f, 0x6b, 0xdc, 0x46, 0x92, 0x5d, 0x8b, 0x38, + 0x6c, 0x84, 0xc0, 0x47, 0xc9, 0x68, 0x5e, 0x45, 0xa2, 0xce, 0x81, 0xbe, + 0x86, 0x8c, 0xa6, 0x93, 0x51, 0x70, 0xad, 0x4e, 0x6d, 0x79, 0x9f, 0x92, + 0x35, 0xb2, 0xad, 0xb1, 0x8d, 0xad, 0x39, 0x7d, 0x53, 0x3f, 0xc6, 0x47, + 0x7c, 0x70, 0x95, 0x40, 0x81, 0x66, 0xc8, 0xca, 0x3a, 0x87, 0x57, 0x5c, + 0x56, 0xa2, 0x99, 0x78, 0x60, 0xa6, 0xb3, 0xab, 0x33, 0x56, 0x68, 0xb3, + 0x65, 0x47, 0x37, 0x38, 0xaa, 0x8c, 0xaa, 0xad, 0x71, 0x5e, 0xbd, 0x57, + 0x91, 0x62, 0x5c, 0xb5, 0x6c, 0xb8, 0x9e, 0x50, 0x4f, 0xc2, 0x50, 0x27, + 0x69, 0x37, 0xcd, 0xa4, 0xa4, 0x6f, 0xbc, 0x7b, 0xb9, 0x61, 0xca, 0xaa, + 0xa7, 0x6c, 0xb1, 0xd5, 0x71, 0x45, 0x4e, 0xa5, 0x8b, 0xa8, 0x9b, 0xc0, + 
0x65, 0x78, 0xc1, 0xa0, 0x60, 0x73, 0xc5, 0x8e, 0x41, 0x81, 0x8b, 0xc2, + 0x9b, 0x7d, 0xc5, 0xc3, 0x67, 0xc1, 0x56, 0x54, 0x74, 0x57, 0x7d, 0x97, + 0x67, 0xa4, 0x55, 0xb2, 0x4f, 0x87, 0x8a, 0x73, 0xba, 0xc6, 0xcc, 0x3f, + 0x83, 0x57, 0x54, 0xa8, 0x2f, 0xbd, 0x75, 0xbd, 0xa8, 0xd6, 0x48, 0x42, + 0x8e, 0x8c, 0x35, 0xb1, 0xbe, 0x63, 0xa0, 0xc7, 0x79, 0x34, 0x92, 0xb9, + 0xb7, 0xc5, 0x94, 0xd6, 0x70, 0x99, 0x94, 0x94, 0xa8, 0xa1, 0x9c, 0x49, + 0x8b, 0xcb, 0x97, 0xd2, 0x38, 0x2d, 0x81, 0x8f, 0xcb, 0xa2, 0x3f, 0x87, + 0xb6, 0x71, 0x69, 0xcc, 0x45, 0x9a, 0x57, 0x3a, 0x44, 0x4d, 0x80, 0x49, + 0x81, 0xb7, 0x44, 0x4f, 0xcb, 0x3b, 0xbf, 0xba, 0x85, 0x8b, 0x39, 0x76, + 0xa3, 0x95, 0x7c, 0x5d, 0xa6, 0x92, 0x56, 0x66, 0x82, 0x43, 0x55, 0x42, + 0xd0, 0xcf, 0xcb, 0x3e, 0x61, 0x6c, 0x67, 0x67, 0x86, 0xb5, 0xba, 0x35, + 0x42, 0x53, 0x4b, 0x8b, 0x46, 0x65, 0x96, 0x42, 0xb1, 0xa8, 0xb1, 0x9e, + 0x6c, 0x52, 0xa1, 0x76, 0x4a, 0x89, 0x76, 0x38, 0x4d, 0x38, 0x7f, 0xa4, + 0x9f, 0x7b, 0x92, 0xca, 0xae, 0x5b, 0x97, 0xc8, 0x5f, 0x49, 0x9d, 0x2f, + 0x6c, 0x57, 0x9d, 0xb1, 0x41, 0x38, 0x42, 0xab, 0x98, 0x61, 0x5d, 0xc2, + 0x3e, 0x34, 0x3e, 0x9e, 0xaf, 0xb5, 0x31, 0x51, 0x47, 0x51, 0xb1, 0xb6, + 0x49, 0x3a, 0x6d, 0x64, 0x4c, 0x5b, 0x3c, 0xc4, 0x66, 0x8b, 0x7d, 0xb5, + 0x47, 0x5a, 0x59, 0x50, 0x9e, 0x40, 0xd1, 0x98, 0x62, 0x64, 0x8f, 0x44, + 0x42, 0x5f, 0x78, 0x4b, 0x7f, 0x86, 0x2f, 0xd1, 0xc0, 0xcb, 0xa4, 0x42, + 0xbf, 0x5f, 0x76, 0x6a, 0x60, 0x47, 0x59, 0x8e, 0x81, 0xa4, 0x9a, 0xaa, + 0xc8, 0x34, 0x8d, 0x70, 0x7f, 0xbd, 0xbf, 0x46, 0x9c, 0x4c, 0x72, 0xb6, + 0x94, 0x8e, 0x75, 0xca, 0x52, 0x41, 0x75, 0x88, 0xc1, 0xd8, 0x7d, 0x6a, + 0x6c, 0x9c, 0xa0, 0x75, 0x7c, 0x68, 0x97, 0x58, 0x4e, 0x52, 0x4e, 0x3e, + 0x6d, 0x86, 0xc5, 0x37, 0x61, 0x95, 0x88, 0x5a, 0x5a, 0x50, 0x52, 0x36, + 0x51, 0x5d, 0xa6, 0x46, 0x6e, 0x3f, 0x7c, 0xb4, 0xb5, 0xc2, 0x38, 0x4d, + 0x68, 0xca, 0xb0, 0xa8, 0x9c, 0x9b, 0x5d, 0x65, 0x84, 0xa5, 0xe2, 0x78, + 0xc6, 0x49, 0xb2, 0xbd, 0x3d, 0x30, 0xc6, 0x62, 0x62, 0xa4, 0x94, 0x57, + 0x74, 0xa4, 0x51, 0x7e, 0xa4, 0x36, 0x5c, 0xa0, 0x5e, 0x5f, 0x71, 0xaf, + 0x84, 0x55, 0x6a, 0x8c, 0x97, 0x4e, 0x8e, 0x67, 0x82, 0xa5, 0xae, 0x2d, + 0x37, 0xb9, 0x9a, 0x55, 0x3c, 0x90, 0x2d, 0x4b, 0xb1, 0xa5, 0x84, 0x50, + 0xbf, 0x47, 0x42, 0x7b, 0x7b, 0x68, 0x9a, 0xbb, 0x72, 0x70, 0xa5, 0x73, + 0x7b, 0x9f, 0x76, 0x57, 0x54, 0xba, 0x71, 0x84, 0xac, 0x84, 0x5a, 0xc7, + 0x61, 0x97, 0xd4, 0xb8, 0x7e, 0xb8, 0xb9, 0xbe, 0xa6, 0xc8, 0x8b, 0x4e, + 0x70, 0x88, 0x5c, 0x77, 0x57, 0x39, 0x4d, 0xb8, 0x9d, 0x4c, 0x4e, 0x77, + 0x8a, 0xcd, 0x78, 0x76, 0x91, 0xd1, 0xab, 0x64, 0xba, 0x61, 0x96, 0x73, + 0x3c, 0x88, 0x43, 0xaa, 0x73, 0x51, 0xbf, 0x4a, 0xa7, 0x42, 0xa9, 0x90, + 0x36, 0xcb, 0xbd, 0x41, 0x46, 0xa7, 0x33, 0x45, 0x56, 0x9c, 0x33, 0x3f, + 0x56, 0x54, 0x3f, 0xce, 0xc1, 0xb0, 0x3f, 0xb9, 0xaa, 0x85, 0x8a, 0x6b, + 0x35, 0x69, 0x7d, 0x49, 0x5a, 0x3c, 0x84, 0x6b, 0xc5, 0x61, 0x55, 0x61, + 0x68, 0xa3, 0xc8, 0x62, 0x62, 0x4b, 0xb3, 0x6f, 0x79, 0x63, 0x7a, 0x3f, + 0x6c, 0x5d, 0x55, 0x5c, 0x5f, 0x45, 0x8d, 0x7c, 0x55, 0x79, 0x38, 0x6e, + 0x6f, 0xc1, 0xb0, 0xc7, 0x9d, 0x55, 0x68, 0x5c, 0x71, 0x48, 0x96, 0x8e, + 0xc0, 0x34, 0x62, 0xc7, 0x72, 0xc5, 0x3e, 0x46, 0x97, 0x99, 0x66, 0x78, + 0xb7, 0x7c, 0xb3, 0x61, 0xa1, 0x83, 0x81, 0xb4, 0x8f, 0xa2, 0xac, 0x83, + 0x40, 0x45, 0x50, 0x2e, 0x5e, 0xbe, 0xcf, 0xa7, 0x85, 0x74, 0xc6, 0x69, + 0x8f, 0x89, 0x53, 0xb0, 0x32, 0xaf, 0xa5, 0xa0, 0x79, 0xc6, 0x63, 0x59, + 0x46, 0x4a, 0xb0, 0x6f, 0x30, 0xa6, 0x3e, 0x59, 0x38, 0x62, 0xd1, 0x3d, + 0x3c, 0x87, 0x52, 0x50, 0x59, 0x33, 0x3e, 0xbc, 0x5c, 0x55, 0x86, 0xc8, + 
0x46, 0xaf, 0x99, 0xae, 0xa7, 0x87, 0xc0, 0x7e, 0x3c, 0x4b, 0x3e, 0x87, + 0xcf, 0x5f, 0x86, 0x76, 0x89, 0x7c, 0x4f, 0xad, 0xbd, 0x91, 0x56, 0x2e, + 0x4d, 0x4d, 0x30, 0x30, 0xc3, 0x8c, 0x90, 0x78, 0x39, 0x6c, 0x72, 0x74, + 0x4f, 0x83, 0x42, 0x50, 0x49, 0xb5, 0x76, 0x9a, 0xa7, 0xc7, 0x52, 0x80, + 0x3e, 0x72, 0xbf, 0x4f, 0xad, 0x43, 0x49, 0x90, 0x59, 0x52, 0x85, 0x43, + 0x40, 0x3a, 0x6e, 0xcf, 0xbb, 0x6d, 0x9b, 0xc4, 0x83, 0x40, 0x45, 0xa5, + 0x6c, 0xc9, 0x68, 0xd1, 0x4c, 0xc0, 0xb9, 0x84, 0x48, 0xcb, 0x7f, 0xa0, + 0x52, 0xcc, 0x39, 0x58, 0xb3, 0xc8, 0x66, 0x48, 0x43, 0xc2, 0x79, 0x58, + 0xc0, 0xaa, 0xb6, 0x7f, 0x5f, 0x44, 0x7a, 0xb3, 0x43, 0xbd, 0xba, 0x34, + 0x4f, 0xad, 0x39, 0xbd, 0x7b, 0x53, 0x7e, 0x3d, 0x6f, 0x47, 0x50, 0xb4, + 0x85, 0x4f, 0x42, 0x8c, 0xc0, 0x84, 0x4a, 0xd0, 0x85, 0x57, 0x8d, 0x3b, + 0x63, 0xa0, 0x86, 0xc5, 0xce, 0xce, 0x33, 0xcc, 0xc4, 0x4d, 0x85, 0xb2, + 0x3f, 0x70, 0x98, 0x54, 0xac, 0xa5, 0x86, 0x59, 0xb6, 0x64, 0x4d, 0x77, + 0x87, 0x4c, 0x62, 0xc3, 0xc2, 0x78, 0x61, 0x5c, 0xbe, 0x39, 0x50, 0x34, + 0x54, 0x44, 0x3c, 0x92, 0xc7, 0x4e, 0x3d, 0xbc, 0x7d, 0xae, 0x6b, 0x68, + 0xa0, 0xc3, 0x60, 0xc9, 0xa2, 0xc7, 0x95, 0xa3, 0x40, 0xc3, 0x70, 0x3c, + 0xc5, 0x94, 0xd4, 0x56, 0x66, 0x4c, 0x6e, 0xca, 0x67, 0x5c, 0xbc, 0xae, + 0xac, 0x5b, 0x45, 0xc0, 0xa0, 0xa0, 0x32, 0x52, 0x9f, 0x46, 0x42, 0x98, + 0x9a, 0xad, 0x96, 0xa6, 0xcc, 0xba, 0x79, 0x4b, 0x48, 0x32, 0x3c, 0x43, + 0x7a, 0xa8, 0xa3, 0x59, 0x61, 0x80, 0x6d, 0x79, 0x9c, 0x96, 0x4e, 0x3d, + 0x8f, 0x54, 0xad, 0x7c, 0xd1, 0x4e, 0xb2, 0x8c, 0x8a, 0xbb, 0x56, 0x4f, + 0x44, 0xcd, 0x63, 0xae, 0xcd, 0xc6, 0x82, 0x9b, 0x83, 0x5d, 0x85, 0x6a, + 0xc9, 0x6a, 0x4f, 0x29, 0x94, 0x60, 0xac, 0x8b, 0xbc, 0x2d, 0xc8, 0xaa, + 0x65, 0x82, 0xc9, 0x89, 0x88, 0xac, 0x34, 0x3e, 0x99, 0x81, 0x69, 0x91, + 0x42, 0x2d, 0x2e, 0x75, 0x56, 0xc1, 0x71, 0xb8, 0xaf, 0x98, 0x95, 0x63, + 0x83, 0x96, 0x43, 0xc5, 0xa1, 0x8b, 0x9f, 0x6c, 0x4e, 0x55, 0x79, 0xcc, + 0x44, 0x67, 0xc7, 0x2c, 0xb3, 0x8c, 0x70, 0xad, 0x7e, 0x9b, 0xcd, 0xa1, + 0x90, 0x47, 0x7c, 0x9c, 0x7c, 0x86, 0x87, 0x8e, 0xbc, 0x7d, 0x88, 0x96, + 0x55, 0x44, 0xc3, 0xc7, 0xad, 0x47, 0xd0, 0x8b, 0x41, 0x78, 0x7d, 0xa2, + 0x8a, 0x71, 0xb1, 0xb5, 0x5e, 0x62, 0x3e, 0x41, 0x4a, 0x69, 0x6a, 0x3e, + 0x77, 0x71, 0xc7, 0x49, 0xa2, 0xcb, 0xcc, 0xce, 0xa3, 0xae, 0xa0, 0xd5, + 0xbf, 0xbc, 0x45, 0x73, 0xc5, 0x51, 0x68, 0x4c, 0xb6, 0x39, 0xc3, 0x6f, + 0x90, 0x70, 0x35, 0x8e, 0x3d, 0xa3, 0xbe, 0x4b, 0x6d, 0xac, 0xc8, 0xbb, + 0x38, 0x4d, 0x50, 0xb4, 0xb3, 0x43, 0x5f, 0x9d, 0xbb, 0xc6, 0xb0, 0x3e, + 0x45, 0x62, 0xa7, 0xcd, 0xb8, 0x33, 0x3a, 0x5b, 0x59, 0xac, 0xcd, 0xa9, + 0xaf, 0x98, 0x5c, 0x64, 0x38, 0xb7, 0xa4, 0x6f, 0x66, 0x98, 0x80, 0xa1, + 0x48, 0x3a, 0x95, 0x33, 0xbe, 0x70, 0x33, 0x5e, 0x40, 0x62, 0x4f, 0x5d, + 0x8f, 0xc9, 0x87, 0x7b, 0xae, 0x3a, 0x81, 0x42, 0x59, 0x47, 0x86, 0x52, + 0x79, 0x40, 0x39, 0xb7, 0xc0, 0x7b, 0x9f, 0x8d, 0x3f, 0x49, 0x42, 0x4f, + 0x81, 0x94, 0x89, 0x46, 0x5a, 0x65, 0x42, 0xbb, 0x88, 0x6a, 0x34, 0xd0, + 0xa7, 0xc4, 0x92, 0x62, 0x6c, 0x84, 0x36, 0xd0, 0x86, 0xc4, 0x6e, 0x76, + 0x8e, 0x41, 0x63, 0x62, 0x77, 0x82, 0x35, 0x38, 0x36, 0xb9, 0x31, 0x52, + 0x88, 0x9b, 0xba, 0x3c, 0xba, 0x61, 0xc8, 0x38, 0xcc, 0xbc, 0x8f, 0x86, + 0xb5, 0x65, 0x59, 0x60, 0xa0, 0x5d, 0x97, 0x53, 0x62, 0x5f, 0x79, 0x54, + 0x44, 0xcb, 0x63, 0xc0, 0x81, 0xb8, 0xbe, 0x96, 0xa9, 0x2f, 0xd0, 0x92, + 0xae, 0x91, 0x50, 0xa4, 0x9b, 0x35, 0x71, 0xc8, 0x97, 0x88, 0xa4, 0xb4, + 0xa6, 0x43, 0x46, 0xb6, 0x6c, 0xb5, 0xba, 0x92, 0x3f, 0xad, 0xaf, 0x7a, + 0xa2, 0x8e, 0x67, 0x4c, 0x3e, 0x59, 0xcd, 0xcf, 0x81, 0x7a, 0x84, 0x44, + 
0x80, 0x4d, 0x6d, 0xd1, 0x77, 0xce, 0xbc, 0x85, 0xa3, 0x60, 0x30, 0x93, + 0xa8, 0x5a, 0x56, 0xd1, 0x67, 0x4c, 0x52, 0x43, 0xcd, 0x88, 0x79, 0xb5, + 0xb0, 0xc1, 0x94, 0xc1, 0x95, 0x43, 0x72, 0x3e, 0x36, 0xb0, 0x57, 0xa9, + 0x55, 0x67, 0x50, 0xb3, 0xd4, 0x55, 0x60, 0x43, 0x5f, 0x5f, 0x34, 0xa3, + 0x7c, 0x5e, 0xcb, 0x39, 0x3e, 0x54, 0x98, 0x88, 0x36, 0x69, 0x99, 0x55, + 0x3a, 0xa8, 0xcd, 0x47, 0x83, 0x90, 0x40, 0x6f, 0x47, 0x57, 0xbc, 0x9c, + 0xc5, 0x9b, 0x5e, 0x69, 0xc6, 0xc1, 0xd3, 0xc4, 0xce, 0x4e, 0x7c, 0x45, + 0x5b, 0x58, 0x47, 0x76, 0x6d, 0x52, 0x85, 0x3f, 0xc2, 0x3e, 0x60, 0x9d, + 0xb4, 0x53, 0x45, 0x4b, 0xd3, 0x5c, 0x7e, 0x82, 0x37, 0x8a, 0xbc, 0x9f, + 0x87, 0x75, 0xc9, 0x98, 0x73, 0x84, 0x83, 0xc0, 0x4d, 0x72, 0x96, 0xd8, + 0x99, 0x8d, 0x55, 0xad, 0x55, 0x88, 0x87, 0x56, 0x68, 0x63, 0x37, 0x59, + 0x6a, 0x6e, 0x48, 0x64, 0xd2, 0xb6, 0x92, 0xd3, 0xbb, 0x46, 0xcb, 0xad, + 0xc1, 0xb0, 0x5a, 0x78, 0xc1, 0xc4, 0x6b, 0x9e, 0x44, 0x79, 0x9c, 0xbb, + 0x58, 0x4e, 0x7c, 0x7b, 0x6c, 0x76, 0x4c, 0x41, 0xc5, 0x6a, 0x8c, 0x69, + 0x56, 0x52, 0xc3, 0x84, 0x59, 0xc4, 0x6b, 0x75, 0x6b, 0xb9, 0xd5, 0xc2, + 0xb8, 0x83, 0x3f, 0xc5, 0x7a, 0xb4, 0xc6, 0x67, 0xcb, 0xa9, 0x58, 0x30, + 0x56, 0xc5, 0xba, 0xc6, 0x6d, 0xa2, 0x68, 0x76, 0xcd, 0x81, 0x79, 0x48, + 0x7b, 0x97, 0x6d, 0xcc, 0x7b, 0x5a, 0x80, 0x80, 0xcc, 0xc0, 0x91, 0xba, + 0x9d, 0x7a, 0xa6, 0x37, 0x8a, 0x88, 0x9a, 0x59, 0xb4, 0x45, 0xc9, 0x47, + 0x48, 0x66, 0x5c, 0x61, 0x80, 0x85, 0xa0, 0xb1, 0x41, 0x6e, 0x61, 0x4d, + 0x8d, 0x7a, 0xb8, 0x61, 0x64, 0x63, 0x87, 0x54, 0xad, 0xa9, 0x81, 0xc9, + 0xcc, 0x67, 0xbf, 0x6a, 0xc2, 0x85, 0x90, 0x72, 0x79, 0x78, 0xa3, 0x98, + 0x3b, 0x41, 0x5c, 0xa6, 0x7d, 0x3c, 0x3d, 0x89, 0x79, 0x75, 0x39, 0x39, + 0x87, 0x87, 0x4d, 0xb3, 0xc0, 0x3c, 0x8f, 0x5d, 0x7e, 0x59, 0x73, 0x95, + 0xb9, 0xb8, 0x75, 0x8e, 0xab, 0x57, 0x53, 0x62, 0x83, 0xa3, 0xa0, 0x7b, + 0xc8, 0x76, 0xc9, 0x9f, 0x96, 0xad, 0x3d, 0xc7, 0x6b, 0x3f, 0xc7, 0x8c, + 0xc5, 0x36, 0x7e, 0x63, 0xab, 0x9f, 0x7e, 0x4e, 0xc7, 0x5c, 0xbd, 0xb0, + 0xc2, 0xc3, 0x55, 0xc8, 0xc1, 0xcf, 0x36, 0xa4, 0x62, 0x9d, 0x7f, 0x5e, + 0xd3, 0x50, 0x59, 0x56, 0xc9, 0xc0, 0x98, 0x62, 0x62, 0x5a, 0x92, 0xb9, + 0xaf, 0x9b, 0x43, 0x9c, 0x31, 0x96, 0x5d, 0x92, 0x78, 0x32, 0x76, 0x64, + 0x98, 0x8a, 0x3f, 0x7b, 0xaa, 0x37, 0x9a, 0x31, 0x54, 0xad, 0x73, 0xae, + 0x82, 0x9a, 0xad, 0xbd, 0xa5, 0x59, 0x30, 0x78, 0xab, 0x31, 0x3f, 0x82, + 0xcf, 0xb7, 0x62, 0x43, 0xc0, 0x87, 0x3d, 0xd7, 0x70, 0x81, 0x36, 0xbd, + 0x97, 0xb8, 0x99, 0xd1, 0x61, 0xaa, 0xa5, 0x5a, 0xac, 0xa6, 0x5a, 0x91, + 0x59, 0xd3, 0xa2, 0xc3, 0x6b, 0xa9, 0x4e, 0x7c, 0xcc, 0x91, 0x38, 0x81, + 0xaf, 0xa0, 0x88, 0x46, 0x3b, 0xa6, 0x3b, 0x4e, 0x81, 0x62, 0x83, 0x37, + 0x3a, 0x65, 0x4a, 0xd4, 0xbb, 0x4f, 0xa0, 0x53, 0x40, 0x53, 0x5e, 0x76, + 0x5d, 0xb6, 0x5a, 0x91, 0xc7, 0x75, 0x8c, 0x32, 0x48, 0x6c, 0x36, 0x98, + 0x8a, 0x6e, 0x56, 0x71, 0x40, 0x9d, 0xaf, 0x3d, 0x62, 0x3b, 0x6f, 0x78, + 0x9f, 0x49, 0x78, 0xb3, 0xb3, 0x58, 0x2f, 0xba, 0x87, 0x81, 0x53, 0x3e, + 0x56, 0x85, 0xbe, 0x89, 0xb6, 0x98, 0x9c, 0x94, 0x84, 0x6c, 0xcb, 0x84, + 0x58, 0xbb, 0x5d, 0x97, 0x94, 0x49, 0x96, 0xb3, 0x94, 0x94, 0x8f, 0x4c, + 0x9d, 0x44, 0x6d, 0x7e, 0x6a, 0x35, 0x76, 0x49, 0x76, 0x60, 0x4a, 0xaf, + 0x4e, 0x51, 0x71, 0x43, 0x70, 0x75, 0x79, 0xb5, 0x52, 0x81, 0x9b, 0xc0, + 0x5f, 0x61, 0x2d, 0x3c, 0xc4, 0xb7, 0x58, 0x4a, 0x8a, 0xbd, 0x73, 0xbc, + 0xa1, 0xc3, 0x4f, 0x4f, 0xb5, 0x5d, 0x3e, 0xb0, 0x9f, 0x67, 0xd0, 0xd1, + 0x6b, 0x7d, 0xb1, 0x42, 0x30, 0x77, 0x6b, 0x7b, 0x58, 0x45, 0xb6, 0x8e, + 0x3b, 0xa5, 0x97, 0x61, 0x74, 0x62, 0x97, 0x3f, 0x3a, 0x39, 0x85, 0x41, + 
0xcb, 0xd0, 0xc5, 0x8a, 0x39, 0x77, 0xdc, 0x6f, 0x97, 0x5c, 0x83, 0xb9, + 0xcb, 0x66, 0xc7, 0x80, 0xc8, 0xd1, 0x61, 0x98, 0x4e, 0x67, 0xca, 0xbe, + 0xb3, 0x81, 0x4c, 0x39, 0x34, 0x43, 0x81, 0xbd, 0x4b, 0xc3, 0xd6, 0x55, + 0x87, 0xa3, 0x58, 0x90, 0xb9, 0x52, 0x7c, 0x76, 0xcb, 0xd0, 0x8f, 0xb9, + 0x49, 0xb2, 0x42, 0xbf, 0xac, 0xcf, 0x9f, 0x6c, 0x45, 0x5c, 0x4b, 0xc8, + 0x47, 0xb3, 0x9f, 0x91, 0x42, 0xc9, 0xb1, 0x6d, 0xc9, 0xc5, 0x7f, 0x63, + 0x72, 0x57, 0x34, 0xa7, 0x5e, 0x2e, 0x6d, 0x44, 0xb9, 0xbd, 0x5c, 0x9b, + 0x50, 0xcc, 0xc1, 0xcc, 0xbd, 0x79, 0xbc, 0xa5, 0x71, 0xad, 0x79, 0x84, + 0xae, 0xb6, 0x3a, 0x6c, 0x33, 0x95, 0x7d, 0xb0, 0xbc, 0x59, 0x38, 0x52, + 0xc5, 0xa0, 0x5d, 0x69, 0x75, 0xc2, 0xcb, 0xa8, 0xc3, 0x6f, 0x81, 0xa7, + 0x69, 0x54, 0x35, 0x94, 0x86, 0x45, 0xb5, 0xc4, 0xc6, 0x72, 0x84, 0x3f, + 0xb4, 0xc7, 0x8d, 0xd2, 0x50, 0x6d, 0x87, 0xd1, 0x8e, 0xc3, 0xad, 0x88, + 0xb4, 0x9b, 0xcf, 0xad, 0x58, 0x65, 0x7a, 0x5d, 0x63, 0x40, 0x44, 0x3f, + 0x71, 0x51, 0xb4, 0x88, 0xba, 0x50, 0xa9, 0x7a, 0xb0, 0xae, 0xb5, 0xc7, + 0x41, 0x57, 0x37, 0x65, 0x86, 0xad, 0x6f, 0x4f, 0xa6, 0x61, 0x83, 0x55, + 0x76, 0xb6, 0xcf, 0x64, 0x97, 0x9d, 0xa7, 0x78, 0x93, 0xbe, 0xa7, 0xc7, + 0xce, 0x5c, 0x35, 0x5a, 0x60, 0x5f, 0x5c, 0x64, 0xc6, 0xb2, 0x9a, 0x4a, + 0xcc, 0xb9, 0x76, 0x5b, 0xc8, 0xc7, 0x91, 0x52, 0x48, 0xa6, 0xa5, 0x35, + 0xb3, 0x8e, 0xc2, 0x65, 0x37, 0x98, 0x84, 0x8e, 0x74, 0x51, 0xab, 0x72, + 0x40, 0x4e, 0xae, 0x6c, 0xbe, 0x37, 0xac, 0x9e, 0x58, 0xb5, 0x68, 0x96, + 0xb5, 0xad, 0x85, 0x5e, 0x96, 0x7a, 0x5e, 0x5a, 0xcc, 0x9c, 0x41, 0x89, + 0xbc, 0x68, 0xa7, 0xc8, 0x34, 0xd2, 0x54, 0xb7, 0x7d, 0x85, 0xca, 0xc7, + 0xac, 0x9d, 0xc7, 0x6a, 0x58, 0x75, 0x78, 0x8b, 0x70, 0x3f, 0x5f, 0x3d, + 0xa1, 0x6d, 0x4d, 0xbf, 0x44, 0xa2, 0x86, 0xbf, 0x3b, 0xc0, 0xae, 0x81, + 0x64, 0x4f, 0xca, 0x57, 0xae, 0xc0, 0xa9, 0xb3, 0xaf, 0x7e, 0xb0, 0xb3, + 0xac, 0xa7, 0x94, 0xc4, 0x56, 0x42, 0x2f, 0xb3, 0x7e, 0xad, 0xb7, 0x60, + 0xba, 0x95, 0xb9, 0x4a, 0x97, 0x42, 0xb3, 0x60, 0x45, 0x7b, 0x45, 0x86, + 0x8c, 0x33, 0x34, 0x49, 0x40, 0xa7, 0xa2, 0xa7, 0x7a, 0x55, 0x68, 0x97, + 0xd1, 0xbc, 0x90, 0xc4, 0x69, 0x6f, 0x5a, 0x7b, 0xd2, 0x99, 0xce, 0xa4, + 0x3b, 0xa2, 0x67, 0x50, 0x79, 0xa1, 0x87, 0xa6, 0x5f, 0x48, 0x42, 0xc4, + 0x6c, 0x8d, 0xb7, 0x3f, 0x69, 0x9b, 0xb6, 0xde, 0x6d, 0x45, 0x97, 0xc1, + 0x98, 0x92, 0x36, 0xab, 0xa9, 0x4e, 0x40, 0xa0, 0xa9, 0x8e, 0x43, 0x7c, + 0xce, 0xd9, 0xbf, 0x8a, 0x34, 0xa0, 0x45, 0xb9, 0x46, 0xb8, 0x70, 0xcc, + 0x97, 0xa7, 0xbe, 0x78, 0xb0, 0x6e, 0x6e, 0x6c, 0x7a, 0x65, 0x74, 0x4d, + 0x98, 0x4a, 0xb3, 0x5f, 0x5e, 0x35, 0x44, 0x59, 0xb6, 0xbb, 0xd3, 0x6d, + 0x67, 0xb3, 0xce, 0xc0, 0x47, 0xb3, 0x54, 0x65, 0x84, 0x7c, 0x55, 0x82, + 0x75, 0xb0, 0x9e, 0xa9, 0x7b, 0xc7, 0xbb, 0xcc, 0xc1, 0x41, 0x9d, 0x4c, + 0x44, 0x8e, 0xa8, 0x94, 0x4b, 0x58, 0x93, 0x98, 0x94, 0x50, 0x89, 0x46, + 0x3c, 0x72, 0xba, 0xc3, 0x9c, 0x70, 0x80, 0x88, 0xcd, 0xa0, 0xab, 0x90, + 0x72, 0x8e, 0x9c, 0x6d, 0xb3, 0x77, 0xb2, 0x53, 0x34, 0x37, 0x9d, 0xca, + 0x3f, 0xb8, 0x57, 0xa9, 0x75, 0x7e, 0x72, 0x63, 0x62, 0xaa, 0x92, 0xcf, + 0x48, 0xce, 0x8d, 0x43, 0x50, 0xb0, 0x86, 0xb6, 0x87, 0x7a, 0x42, 0xa1, + 0xc3, 0x8c, 0xd1, 0x4f, 0x4b, 0xaa, 0x87, 0x2e, 0xce, 0x78, 0xd2, 0x73, + 0x58, 0x59, 0x58, 0xbb, 0x57, 0x53, 0x4e, 0x7d, 0x79, 0xa4, 0xd1, 0x60, + 0xaa, 0x87, 0x66, 0x93, 0x6f, 0x54, 0x42, 0x71, 0x35, 0x90, 0x91, 0x79, + 0xa4, 0xc1, 0x52, 0x3f, 0xc4, 0x67, 0x38, 0x57, 0xb8, 0x65, 0x6d, 0x86, + 0x36, 0x3d, 0xc3, 0x86, 0xb1, 0xaa, 0xd0, 0x49, 0x89, 0x7a, 0x8b, 0xcf, + 0x79, 0x76, 0x36, 0x76, 0x85, 0x80, 0xb8, 0xd2, 0x9f, 0x6d, 0x80, 0x7d, + 
0x9a, 0x7c, 0x8e, 0x4d, 0x48, 0x66, 0x64, 0x5f, 0x54, 0xa6, 0x42, 0xc1, + 0x4c, 0x57, 0x62, 0x65, 0x70, 0x71, 0x5a, 0x38, 0xac, 0xa0, 0x39, 0x59, + 0x70, 0x81, 0x4b, 0x63, 0x84, 0x7f, 0xbc, 0x5d, 0x68, 0x52, 0x3d, 0x8e, + 0xce, 0x60, 0x8a, 0x3c, 0xc1, 0xd0, 0xa8, 0xd0, 0x91, 0x70, 0xcf, 0xc9, + 0x96, 0x72, 0x4b, 0xa9, 0x30, 0x65, 0x4d, 0xc8, 0x8a, 0x5d, 0x9f, 0x64, + 0x82, 0x3e, 0xa0, 0xb6, 0xbd, 0xc8, 0x5e, 0xa3, 0x64, 0x3b, 0x49, 0xb6, + 0xb5, 0x3e, 0xb3, 0x67, 0xb8, 0x55, 0x84, 0x9c, 0x77, 0xac, 0xd3, 0x9e, + 0x82, 0x7f, 0x6f, 0x44, 0x66, 0x78, 0xa3, 0x62, 0x62, 0x79, 0x7d, 0xbf, + 0xbb, 0xb7, 0x86, 0xae, 0x47, 0x39, 0x75, 0xca, 0x6a, 0xce, 0xbe, 0x3e, + 0x5c, 0x32, 0x90, 0xcd, 0x8a, 0x9b, 0x8e, 0x4c, 0xb1, 0x41, 0xcd, 0xb2, + 0x90, 0xab, 0x78, 0x66, 0x99, 0xbf, 0xab, 0x7e, 0xcf, 0x8f, 0x52, 0xa3, + 0x76, 0xc1, 0xcb, 0xb8, 0x76, 0xc7, 0x3f, 0x6f, 0xcb, 0x63, 0x54, 0x93, + 0xb8, 0x84, 0x65, 0x99, 0x9a, 0x6d, 0x38, 0xb4, 0x88, 0x49, 0x64, 0x37, + 0x6e, 0x6d, 0x8a, 0x8c, 0x99, 0xad, 0xa0, 0x61, 0x9b, 0x69, 0xc0, 0x62, + 0x2c, 0xa5, 0xaf, 0x7e, 0x28, 0x9a, 0x37, 0xbe, 0xb1, 0x4a, 0x7a, 0x6a, + 0x7a, 0xa9, 0x4f, 0x7a, 0x3b, 0x33, 0xb9, 0x91, 0xc2, 0x33, 0x5c, 0xbe, + 0x4e, 0x64, 0xb8, 0xb2, 0x87, 0x34, 0x71, 0x52, 0xca, 0x36, 0x3d, 0x39, + 0xcc, 0xd1, 0x58, 0x38, 0x44, 0x4d, 0xb4, 0x9f, 0xb1, 0x64, 0x73, 0xa7, + 0x5f, 0x3d, 0x77, 0x74, 0xb7, 0xce, 0xbe, 0x3d, 0xbd, 0x96, 0x3b, 0xa4, + 0xbf, 0x5c, 0xa1, 0x61, 0xc0, 0x80, 0x97, 0x7e, 0x43, 0x46, 0x6c, 0x70, + 0x64, 0x95, 0x57, 0x6c, 0xa8, 0xc9, 0x90, 0xae, 0x39, 0xc5, 0x32, 0x88, + 0x8d, 0x54, 0x60, 0x7c, 0x37, 0xbb, 0x7d, 0x6d, 0xab, 0xb6, 0xb4, 0x51, + 0x6f, 0x6c, 0xca, 0xb8, 0x57, 0x94, 0x4d, 0x60, 0x8e, 0x33, 0x32, 0x85, + 0x61, 0x92, 0xc1, 0x82, 0x6c, 0xbc, 0x3c, 0xcd, 0x9c, 0xc1, 0x4f, 0xc3, + 0x67, 0x8a, 0x6a, 0x5b, 0x84, 0x98, 0x79, 0x46, 0x4d, 0x7a, 0x49, 0x50, + 0x33, 0xc6, 0x49, 0x3f, 0xc3, 0x97, 0x3e, 0x8e, 0x91, 0xb4, 0x3b, 0x8c, + 0x99, 0x89, 0x4f, 0xda, 0xb1, 0xad, 0xa6, 0x60, 0x39, 0xb8, 0x59, 0x53, + 0xa2, 0x8c, 0x5b, 0x40, 0xc5, 0xaa, 0x98, 0x83, 0x53, 0xce, 0xbc, 0xb1, + 0xd2, 0xd9, 0x63, 0x92, 0x44, 0x41, 0x3d, 0xb4, 0xbd, 0x5e, 0x60, 0xa8, + 0x44, 0x84, 0x61, 0x36, 0x6c, 0x49, 0xe0, 0xc5, 0x6f, 0x9c, 0x40, 0x8a, + 0xbb, 0x89, 0x92, 0x44, 0x9e, 0x53, 0x81, 0x44, 0xc4, 0x37, 0x74, 0x9a, + 0x4e, 0x34, 0x7a, 0x5b, 0x84, 0xba, 0x95, 0x83, 0xac, 0x41, 0xa2, 0x4c, + 0x75, 0xac, 0xc1, 0xb7, 0x67, 0x8c, 0x56, 0xbc, 0xb3, 0x83, 0xad, 0x9b, + 0xbc, 0xd8, 0x56, 0x83, 0x33, 0xb9, 0x74, 0x90, 0x30, 0x49, 0x5c, 0xc6, + 0x93, 0xb3, 0x50, 0x30, 0x71, 0x57, 0x63, 0xbc, 0x54, 0xae, 0xd9, 0xa2, + 0xbc, 0x83, 0x92, 0xb7, 0x5a, 0x65, 0x51, 0xb9, 0x6e, 0x33, 0xbb, 0x3d, + 0xa6, 0x63, 0x50, 0x41, 0xa9, 0xb7, 0x7e, 0x79, 0xa0, 0x61, 0x8c, 0x7a, + 0xcf, 0x54, 0x82, 0x90, 0x66, 0x74, 0x85, 0x99, 0xa4, 0x64, 0x83, 0x3c, + 0x8a, 0x6d, 0x52, 0xbc, 0x7d, 0x57, 0x39, 0x45, 0x79, 0xb8, 0xcf, 0x52, + 0x7a, 0x43, 0x8c, 0x3c, 0x76, 0x33, 0x86, 0x38, 0x75, 0x34, 0xa3, 0xa7, + 0x98, 0xb6, 0x5f, 0x7e, 0xc3, 0xcb, 0xc5, 0x5b, 0x39, 0x3d, 0x92, 0xca, + 0xad, 0xa2, 0x94, 0x3c, 0x85, 0xaf, 0x4e, 0xcd, 0xa5, 0xab, 0xc1, 0x5d, + 0x99, 0xa2, 0x93, 0x73, 0xca, 0x4d, 0x55, 0x4b, 0x6f, 0x49, 0x8e, 0x68, + 0x82, 0x93, 0x3f, 0x79, 0x42, 0x75, 0x44, 0x9a, 0x68, 0x3d, 0xcc, 0x4b, + 0x9f, 0x69, 0x94, 0x6a, 0x5d, 0x8c, 0xc2, 0x61, 0x94, 0xcd, 0x4a, 0x48, + 0x67, 0xa3, 0x49, 0x51, 0x59, 0xa3, 0x7c, 0x6d, 0x55, 0xab, 0xb0, 0x7d, + 0xa0, 0x9f, 0xa5, 0x79, 0x44, 0x75, 0x79, 0xa8, 0xb7, 0xd0, 0x51, 0x79, + 0x8a, 0x64, 0x76, 0x95, 0x89, 0xc2, 0x3e, 0x46, 0xac, 0x7f, 0x65, 0x71, + 
0xb4, 0xbc, 0x4e, 0x6e, 0x97, 0x7a, 0x57, 0xc7, 0xc1, 0x7e, 0x2d, 0xb6, + 0x3f, 0xa3, 0x82, 0xb7, 0x51, 0x47, 0xcb, 0x97, 0xc5, 0x33, 0xac, 0x3d, + 0x77, 0xa7, 0xbe, 0x42, 0x8a, 0x56, 0xcf, 0x3e, 0x76, 0xc7, 0x46, 0x40, + 0xa8, 0x63, 0x87, 0xb4, 0x4e, 0xbc, 0xb6, 0xba, 0xa0, 0x39, 0x68, 0x56, + 0x96, 0x36, 0xac, 0x90, 0x9d, 0xc7, 0x51, 0x2f, 0x72, 0x82, 0x7b, 0x4b, + 0xbd, 0xba, 0x9f, 0xbf, 0x48, 0xa3, 0xb6, 0x9e, 0xad, 0x9f, 0x43, 0x37, + 0xd1, 0xb3, 0x97, 0x54, 0xc9, 0x4a, 0x70, 0x7e, 0x4d, 0x64, 0xa7, 0x38, + 0xcf, 0xbb, 0x61, 0xa5, 0xca, 0xaa, 0x9c, 0x3b, 0x6e, 0x91, 0xb8, 0xbe, + 0x63, 0x6e, 0x6d, 0x38, 0x96, 0xa0, 0x94, 0xb8, 0x33, 0x37, 0x85, 0x45, + 0xca, 0x7e, 0x98, 0xb6, 0x68, 0x64, 0xcf, 0x86, 0xc9, 0xc4, 0x7e, 0x60, + 0xc2, 0xd4, 0x88, 0x2e, 0x81, 0x85, 0xcf, 0x82, 0xbf, 0xbb, 0x8d, 0xc6, + 0xb1, 0x96, 0x5c, 0xa3, 0x69, 0x5c, 0xb8, 0xaf, 0x9e, 0xc8, 0xbc, 0x6a, + 0x73, 0x9d, 0x70, 0x56, 0x48, 0xc4, 0x5e, 0x97, 0xc2, 0xb7, 0x7f, 0x55, + 0xc4, 0x87, 0x71, 0x9e, 0xae, 0xb4, 0xa9, 0x3c, 0xa5, 0x32, 0x44, 0xce, + 0x6c, 0xc4, 0x7a, 0x6a, 0xad, 0x3d, 0x75, 0x36, 0x42, 0x87, 0x7c, 0x33, + 0x43, 0x64, 0x39, 0x67, 0x87, 0x91, 0x3d, 0x8d, 0xa6, 0x8e, 0x80, 0x39, + 0xab, 0x53, 0x92, 0xa9, 0x96, 0x9b, 0xb8, 0x4e, 0x9c, 0xca, 0xb8, 0x61, + 0x5c, 0x36, 0x7f, 0x6a, 0x95, 0x54, 0xbd, 0x4d, 0x9f, 0x92, 0x55, 0x34, + 0x9a, 0x8a, 0xc7, 0x56, 0xc0, 0x4c, 0xa7, 0x63, 0xcc, 0x61, 0x3e, 0x39, + 0xca, 0x39, 0x75, 0xc5, 0xad, 0xd1, 0xa3, 0x65, 0x53, 0x6f, 0x39, 0x84, + 0x7b, 0x88, 0x65, 0xcb, 0x56, 0xba, 0x5a, 0x86, 0x9f, 0x83, 0x88, 0xb4, + 0x61, 0xa7, 0x41, 0x33, 0x35, 0xb1, 0x86, 0x3b, 0x4c, 0x6f, 0x70, 0x6a, + 0x4c, 0x35, 0x96, 0xaa, 0x45, 0xa4, 0x34, 0x66, 0xca, 0x4b, 0xc2, 0x32, + 0xcc, 0x98, 0xd1, 0x5f, 0x5e, 0xbd, 0xd2, 0x9c, 0x85, 0xd2, 0x7b, 0x60, + 0x70, 0x9d, 0x5c, 0x7d, 0x56, 0x79, 0x3a, 0x98, 0x3d, 0x9a, 0xc8, 0x4c, + 0x7a, 0x97, 0x90, 0x6d, 0xd0, 0x44, 0xa7, 0x46, 0xc1, 0x9d, 0x80, 0x39, + 0x3e, 0x74, 0xc8, 0x6f, 0x6e, 0x79, 0x6b, 0xbc, 0x54, 0x47, 0x58, 0xb4, + 0x61, 0x60, 0x4d, 0x51, 0x32, 0xb4, 0xa4, 0x6c, 0xbe, 0xcd, 0x46, 0xa9, + 0x47, 0x46, 0x79, 0x60, 0x41, 0x9c, 0xad, 0x5c, 0x64, 0x73, 0x33, 0x75, + 0x36, 0x48, 0x42, 0xc2, 0xa2, 0x5a, 0x31, 0x5e, 0x9f, 0x40, 0x9c, 0x4d, + 0xc4, 0x3c, 0x65, 0x35, 0xb2, 0xac, 0x38, 0xa1, 0x5b, 0xb0, 0x34, 0x88, + 0x95, 0x5e, 0x39, 0xa8, 0x95, 0xba, 0x65, 0x44, 0x5d, 0xa9, 0x40, 0xbd, + 0xc9, 0x3c, 0x99, 0x91, 0xc4, 0xb0, 0xaf, 0x85, 0x35, 0x75, 0x39, 0x89, + 0xcd, 0xcb, 0xc2, 0xb2, 0x7a, 0x7f, 0x63, 0x61, 0xaa, 0x72, 0x83, 0x5e, + 0x46, 0x41, 0x59, 0x7a, 0xca, 0x47, 0xbc, 0x46, 0xb9, 0x7e, 0x4f, 0x86, + 0x62, 0xd0, 0x41, 0xab, 0xcd, 0x6f, 0x77, 0xc4, 0xae, 0x61, 0xb5, 0x5e, + 0xa8, 0xbf, 0x88, 0xcb, 0xbc, 0x64, 0x49, 0xb7, 0x4a, 0x9f, 0xbd, 0xb6, + 0x5e, 0xb2, 0x5a, 0xb5, 0x4f, 0x95, 0xc4, 0xa3, 0xc0, 0x75, 0x59, 0xc1, + 0x46, 0xbe, 0x65, 0x58, 0xbe, 0xab, 0xcd, 0x41, 0xb1, 0xc1, 0x92, 0xbe, + 0x82, 0xb7, 0x7b, 0xcb, 0x69, 0x5e, 0x5c, 0xa2, 0x9f, 0xa1, 0x85, 0x8b, + 0xc7, 0x81, 0x83, 0x89, 0x62, 0x9c, 0x8e, 0xbc, 0xa4, 0x61, 0x2f, 0x8e, + 0x6e, 0x35, 0x34, 0x57, 0x8a, 0x62, 0x66, 0x8f, 0xad, 0xc7, 0xb4, 0x43, + 0x5b, 0x51, 0x58, 0x8b, 0x64, 0x60, 0x69, 0x69, 0x57, 0x4b, 0x34, 0xc5, + 0x5c, 0x52, 0x80, 0xb9, 0xad, 0x63, 0xc6, 0xc5, 0xca, 0xb9, 0x4f, 0x77, + 0x6e, 0x95, 0xcc, 0x39, 0x3b, 0xb2, 0x60, 0x88, 0xb7, 0x4c, 0x92, 0xb6, + 0x90, 0x9a, 0x81, 0x3f, 0x61, 0x85, 0x8b, 0xba, 0xcf, 0x85, 0x4c, 0xc6, + 0xc9, 0xa1, 0x58, 0x86, 0x83, 0xcb, 0x5a, 0x86, 0x4b, 0x3b, 0xb3, 0xa8, + 0x8f, 0x47, 0x64, 0x5f, 0xa0, 0xa7, 0x38, 0x68, 0x86, 0xa1, 0x7d, 0x3f, + 
0x57, 0x84, 0x89, 0x4c, 0x47, 0x8a, 0x8f, 0x69, 0x33, 0xa8, 0x70, 0x8f, + 0xcb, 0xb1, 0x49, 0x5c, 0x88, 0x95, 0x81, 0x4c, 0xc8, 0xb0, 0xca, 0x93, + 0x93, 0x97, 0x3e, 0x57, 0x65, 0xa9, 0xc2, 0x59, 0x31, 0x53, 0xa8, 0x83, + 0x7f, 0x2f, 0x30, 0x65, 0xb8, 0x74, 0x77, 0x47, 0xc7, 0x57, 0x54, 0xa2, + 0x4a, 0x3a, 0x31, 0x90, 0x81, 0xa9, 0xce, 0x3f, 0x51, 0x97, 0xb3, 0x5d, + 0x62, 0x54, 0x59, 0x61, 0xbd, 0xad, 0x31, 0x66, 0x74, 0xbc, 0xab, 0xa5, + 0x80, 0xd0, 0x66, 0xa7, 0x3b, 0x71, 0xa4, 0x7d, 0x54, 0xcb, 0x9a, 0x9a, + 0x71, 0x75, 0x7f, 0xc3, 0x6d, 0xd5, 0x32, 0xb4, 0xc1, 0x2f, 0x9b, 0x3b, + 0x39, 0x56, 0x7e, 0x8f, 0xd2, 0x5a, 0xbe, 0x5d, 0x35, 0x48, 0x73, 0x67, + 0x73, 0x7d, 0x52, 0x6d, 0x4d, 0xa5, 0xa1, 0x53, 0x67, 0xa0, 0xc3, 0x83, + 0x94, 0x30, 0x51, 0x58, 0x97, 0x83, 0x43, 0x45, 0x43, 0x77, 0x59, 0xcc, + 0x84, 0x57, 0x4b, 0x45, 0x2e, 0x8f, 0xc5, 0x4e, 0x94, 0x42, 0x4c, 0x46, + 0x4b, 0xa5, 0x3f, 0x88, 0xb5, 0xb2, 0xd2, 0xd1, 0x51, 0xc5, 0x75, 0x72, + 0x71, 0xc1, 0x44, 0x81, 0x88, 0xcc, 0x7e, 0xab, 0xaf, 0x73, 0xb9, 0x58, + 0xa5, 0xa1, 0x69, 0xc4, 0x48, 0x89, 0xc0, 0x39, 0x7c, 0x85, 0x85, 0xa3, + 0x30, 0x5d, 0x48, 0xb8, 0xc4, 0x35, 0x4d, 0x7c, 0x80, 0x63, 0x4f, 0x4b, + 0x74, 0xae, 0x8d, 0xb0, 0xb0, 0x7b, 0x52, 0x70, 0x9f, 0x8d, 0x46, 0x7a, + 0x92, 0x79, 0x6c, 0x8f, 0x71, 0xcb, 0x46, 0x54, 0xd1, 0x80, 0x58, 0x91, + 0xc2, 0x68, 0x99, 0x4d, 0x9f, 0x91, 0x7e, 0x50, 0x4d, 0x86, 0xa8, 0xb0, + 0xc2, 0xa1, 0x79, 0x3c, 0x75, 0x90, 0x61, 0xc0, 0x96, 0x89, 0x5d, 0x32, + 0x77, 0xa0, 0x7a, 0x39, 0x3c, 0xb6, 0x88, 0xc0, 0xc7, 0xc6, 0xc8, 0x87, + 0x62, 0x77, 0xbd, 0x62, 0xc0, 0xa0, 0x48, 0x43, 0x4d, 0x63, 0x66, 0x65, + 0xd2, 0x3c, 0xc1, 0xb0, 0xc9, 0x5a, 0x46, 0x55, 0x59, 0x6f, 0xb8, 0x6b, + 0xc7, 0x62, 0x81, 0x49, 0xc2, 0x4d, 0xa1, 0x8d, 0x63, 0x77, 0xd9, 0x6f, + 0xc9, 0xa3, 0x4e, 0x8d, 0xc2, 0x4e, 0x78, 0xae, 0x7e, 0xc5, 0x86, 0xc8, + 0x4e, 0x6f, 0xb9, 0x42, 0x70, 0x75, 0xb5, 0xa5, 0x91, 0x52, 0x91, 0xc5, + 0x3e, 0xc6, 0x4f, 0x48, 0x34, 0x7f, 0x6e, 0xc7, 0xa9, 0xa8, 0xbd, 0x3f, + 0xbf, 0x77, 0x5f, 0x86, 0xc2, 0x43, 0x42, 0x89, 0x4d, 0x64, 0xc4, 0x71, + 0x66, 0xb2, 0x3b, 0x5e, 0xae, 0xa9, 0x70, 0xab, 0xb6, 0x6d, 0xcd, 0x88, + 0x6d, 0x85, 0x5f, 0x8a, 0xcd, 0xc8, 0x4a, 0x67, 0xbb, 0x54, 0xa2, 0xa1, + 0x58, 0xc2, 0x58, 0x78, 0xc2, 0xcf, 0xbe, 0x3c, 0x65, 0xb7, 0xa0, 0x77, + 0x6e, 0x49, 0x4c, 0x43, 0x4d, 0x3c, 0xc7, 0xab, 0xbf, 0xc7, 0xbf, 0x92, + 0x89, 0x38, 0xab, 0x79, 0xa5, 0xb7, 0x3c, 0x88, 0x8a, 0x96, 0x5c, 0x49, + 0xb2, 0xa4, 0xc1, 0x7c, 0x8a, 0xa2, 0x4a, 0x68, 0x48, 0x77, 0x87, 0xb4, + 0x96, 0x49, 0x5a, 0xd5, 0x38, 0x3e, 0x83, 0x48, 0x37, 0xa1, 0x38, 0xa8, + 0x49, 0x71, 0x96, 0x47, 0xba, 0xac, 0xb6, 0x94, 0xc3, 0xd7, 0xbf, 0x5a, + 0xbb, 0xaf, 0x93, 0x53, 0xb6, 0x86, 0x57, 0x3a, 0xb6, 0x4a, 0x87, 0x93, + 0xb3, 0xc7, 0x41, 0x3e, 0x5f, 0x3c, 0xa1, 0x5b, 0x81, 0xc4, 0x90, 0xbe, + 0x36, 0x35, 0x86, 0x40, 0xbd, 0x48, 0x41, 0x8e, 0x6b, 0x99, 0x7c, 0x3a, + 0xd1, 0x70, 0x4c, 0x6a, 0xc5, 0x7a, 0x5b, 0xa4, 0x96, 0x95, 0x52, 0x7f, + 0xbe, 0x56, 0xc6, 0xb7, 0x8f, 0x6b, 0x6a, 0xb3, 0x9c, 0x7e, 0x84, 0x8d, + 0x8a, 0x4c, 0xc7, 0xa1, 0x6e, 0x6f, 0x52, 0x97, 0x3a, 0xbe, 0xca, 0xc0, + 0x9a, 0x87, 0xa6, 0x64, 0x45, 0xcc, 0x7d, 0xa8, 0xa0, 0x8f, 0x66, 0xae, + 0xd7, 0x63, 0xc8, 0xcc, 0xb2, 0x8d, 0x49, 0x3a, 0x56, 0x4f, 0xa2, 0xc0, + 0xb8, 0xcc, 0xaf, 0xc2, 0xa9, 0x9a, 0xd2, 0x48, 0xcb, 0x61, 0x54, 0x9d, + 0x37, 0x5e, 0xc8, 0x33, 0x9d, 0x7b, 0xb4, 0x3c, 0xa9, 0x47, 0x6b, 0x36, + 0xab, 0xc0, 0x51, 0xc3, 0xcd, 0x75, 0x63, 0x48, 0x55, 0x74, 0x3b, 0x4c, + 0x72, 0x6c, 0xcc, 0x53, 0xa6, 0x88, 0x42, 0x3a, 0xb7, 0x88, 0xd2, 0x8b, + 
0xc6, 0xc3, 0xaf, 0x5d, 0x84, 0x55, 0x81, 0xcf, 0x55, 0x80, 0x2f, 0x4b, + 0x8b, 0xbe, 0x8f, 0x99, 0x8f, 0x37, 0xa6, 0xb9, 0x46, 0x6d, 0xd4, 0x5e, + 0x7b, 0x96, 0xaa, 0x75, 0xbf, 0xbf, 0x5a, 0x50, 0x54, 0x76, 0xa4, 0x91, + 0xbd, 0x7f, 0xb8, 0x8f, 0x9e, 0x66, 0x3c, 0x81, 0x5c, 0xa8, 0x92, 0x52, + 0x66, 0xb6, 0x83, 0x62, 0x37, 0x67, 0x82, 0x61, 0x5f, 0x5b, 0xa6, 0x81, + 0xba, 0xaf, 0x6b, 0xc4, 0x5e, 0x67, 0x9d, 0x54, 0xac, 0x8c, 0x62, 0x47, + 0xbb, 0x49, 0x3a, 0x55, 0xc0, 0xc0, 0xa3, 0x7e, 0xb7, 0xb2, 0x53, 0x78, + 0xae, 0x49, 0x5a, 0x69, 0xb6, 0xc5, 0x9f, 0xae, 0x3f, 0x8e, 0x52, 0xaa, + 0xcf, 0x5e, 0xbd, 0x6b, 0x4e, 0x6e, 0x9e, 0x4b, 0x66, 0xd1, 0x39, 0x5f, + 0x58, 0x82, 0x61, 0x53, 0xa2, 0x35, 0x7d, 0x76, 0x99, 0xa4, 0xc3, 0x96, + 0x80, 0x3c, 0x3e, 0x8b, 0x68, 0x81, 0x68, 0xcb, 0x4c, 0x4f, 0x8c, 0x3c, + 0x5e, 0x67, 0x81, 0xb2, 0x39, 0x40, 0x94, 0xcf, 0x51, 0x3f, 0xcf, 0xcf, + 0x6b, 0x6d, 0x98, 0x44, 0x4f, 0xce, 0xb3, 0x45, 0x41, 0xc4, 0x66, 0x85, + 0x4e, 0x5c, 0x2b, 0x27, 0x5a, 0x53, 0xd8, 0xc6, 0x73, 0xc5, 0x82, 0xb6, + 0x41, 0xc5, 0x6f, 0x4b, 0x8b, 0xc7, 0x65, 0x4a, 0x81, 0x62, 0x3a, 0xbd, + 0x9f, 0x7e, 0x3e, 0xb0, 0x7b, 0x3b, 0x48, 0x4f, 0xa1, 0x49, 0x8d, 0x9a, + 0xd3, 0x66, 0xcf, 0x3f, 0x53, 0xbb, 0x62, 0x86, 0xac, 0x4c, 0x3b, 0x9d, + 0xb2, 0x58, 0x8e, 0xa5, 0x77, 0x9f, 0x61, 0xb1, 0x63, 0x80, 0xbd, 0xae, + 0x5e, 0xb3, 0x4b, 0x8f, 0x92, 0x47, 0xa2, 0x97, 0x76, 0xa6, 0x6a, 0xb8, + 0xb5, 0x62, 0xac, 0x54, 0xcd, 0xb7, 0x4d, 0x91, 0xac, 0x7d, 0x54, 0xaf, + 0x47, 0x99, 0x5b, 0xc5, 0x93, 0xc6, 0x4e, 0x82, 0x5d, 0x67, 0x5b, 0xc2, + 0x7a, 0x41, 0xca, 0x80, 0x8f, 0x47, 0xb0, 0x49, 0xcd, 0x68, 0x5a, 0xc3, + 0x9d, 0xa7, 0x49, 0x50, 0x7b, 0x76, 0x9e, 0xd7, 0xcc, 0x50, 0x51, 0xb9, + 0x39, 0xd7, 0x66, 0xae, 0x50, 0xbd, 0x8c, 0x83, 0x6d, 0xb5, 0xc6, 0xb7, + 0x39, 0xa8, 0x9d, 0x2e, 0x48, 0xd2, 0x55, 0x82, 0x9b, 0x7f, 0x37, 0x9e, + 0x8c, 0xc9, 0x87, 0x5c, 0x81, 0xbc, 0xd0, 0x5e, 0xa1, 0x80, 0x6e, 0xbf, + 0xb6, 0x5c, 0x73, 0x34, 0xbc, 0xa2, 0x34, 0xb3, 0x43, 0x9f, 0x74, 0x41, + 0x7d, 0x3a, 0xbc, 0x73, 0xb8, 0xac, 0x70, 0xca, 0xc4, 0x95, 0x9f, 0xa8, + 0x9e, 0x62, 0xa4, 0x49, 0x6b, 0xcc, 0x8a, 0x77, 0xcd, 0x77, 0xcd, 0x56, + 0xba, 0x9e, 0x9b, 0xd0, 0x9c, 0x90, 0x59, 0xa8, 0x5f, 0x66, 0xb1, 0x64, + 0xa5, 0xca, 0xa9, 0xbb, 0xa6, 0xc1, 0x3c, 0x74, 0x59, 0x97, 0xce, 0x40, + 0x56, 0x43, 0x95, 0x47, 0xcb, 0xbe, 0x44, 0xa6, 0x77, 0x41, 0x5e, 0xb6, + 0x4b, 0xb1, 0xcc, 0x73, 0x69, 0x6a, 0xa1, 0xb9, 0xc6, 0x34, 0x42, 0xa5, + 0x76, 0x76, 0x68, 0x36, 0x73, 0x4d, 0xcd, 0xb4, 0x5f, 0x74, 0x5c, 0xa3, + 0xa8, 0x5c, 0x58, 0x67, 0x4f, 0x3a, 0x90, 0x93, 0x64, 0x35, 0x87, 0xc8, + 0xd3, 0x8f, 0x63, 0x5b, 0x96, 0x3e, 0x6b, 0x98, 0x3c, 0x3e, 0x77, 0x64, + 0x42, 0x8a, 0x4e, 0xa5, 0xce, 0xd1, 0x3b, 0x65, 0x4c, 0xc9, 0x9b, 0x29, + 0x4f, 0x58, 0xad, 0x4c, 0x92, 0xae, 0x7a, 0x38, 0x6c, 0xc1, 0x72, 0x76, + 0x83, 0x75, 0xa9, 0x48, 0x95, 0x79, 0x38, 0x58, 0xa9, 0x94, 0xbc, 0x96, + 0x4f, 0xcb, 0x79, 0x35, 0x3b, 0xcb, 0x79, 0x9f, 0x64, 0x6b, 0x9a, 0x79, + 0x66, 0x55, 0xc8, 0x35, 0x86, 0x6a, 0x3f, 0x3b, 0x48, 0x83, 0xcf, 0x87, + 0x78, 0x5f, 0x92, 0x43, 0x55, 0x47, 0x9e, 0xab, 0x47, 0x75, 0x93, 0xc6, + 0x65, 0x7e, 0x7c, 0xd7, 0xc7, 0x48, 0x9e, 0x54, 0x4e, 0x7a, 0x57, 0xac, + 0xa0, 0x55, 0x37, 0xc4, 0x94, 0xb4, 0x34, 0x97, 0x80, 0x6a, 0x90, 0x40, + 0x86, 0xb1, 0xcf, 0xa2, 0x9d, 0x52, 0xa9, 0xc9, 0xcf, 0x56, 0x62, 0x32, + 0x3b, 0x89, 0xc3, 0x6f, 0x4b, 0x30, 0x61, 0x41, 0xa5, 0xa3, 0xbe, 0x6c, + 0x74, 0xbc, 0x94, 0x97, 0x82, 0xc6, 0xd0, 0x8a, 0x50, 0x3c, 0x74, 0x84, + 0x7d, 0x49, 0xaa, 0xb5, 0x81, 0xb3, 0xc8, 0x5b, 0x37, 0x54, 0x89, 0x53, + 
0x46, 0x90, 0x6b, 0x99, 0xb5, 0x6e, 0x62, 0x4b, 0x31, 0x73, 0x61, 0x47, + 0xce, 0x7b, 0x9d, 0x68, 0x7c, 0x5e, 0x6d, 0xbd, 0xb3, 0x50, 0x9b, 0x5d, + 0x81, 0xba, 0x65, 0x9e, 0x97, 0x44, 0x81, 0xb6, 0x7d, 0x9d, 0xd5, 0x77, + 0x67, 0xa7, 0x71, 0x55, 0x7a, 0x55, 0x51, 0x9c, 0x40, 0x48, 0x72, 0xac, + 0x36, 0x65, 0x55, 0x6e, 0xa5, 0xa0, 0xb8, 0x63, 0x64, 0xc1, 0x91, 0x61, + 0x53, 0x7a, 0x84, 0x68, 0x5b, 0x94, 0x4e, 0x38, 0x30, 0xbc, 0x44, 0x3c, + 0x90, 0x93, 0x9e, 0xb5, 0xaf, 0x43, 0x68, 0x42, 0x97, 0x52, 0x8e, 0x50, + 0x72, 0x53, 0xd6, 0xad, 0x70, 0xb6, 0xb2, 0x8c, 0x8d, 0x4e, 0xae, 0x79, + 0x5d, 0x50, 0x4f, 0xd3, 0xbc, 0xc8, 0x44, 0xb3, 0x8b, 0x98, 0x8a, 0x6d, + 0x45, 0x78, 0x86, 0x8c, 0x39, 0x66, 0x62, 0x4d, 0x3d, 0x4b, 0x4a, 0xb0, + 0x6b, 0xba, 0x94, 0xc2, 0x6a, 0x38, 0xc3, 0xb5, 0x37, 0x5e, 0x37, 0x94, + 0x5a, 0x5a, 0x42, 0x8a, 0xbd, 0x7c, 0x7c, 0x48, 0x5e, 0x66, 0x74, 0x4c, + 0x49, 0xcd, 0x83, 0x7f, 0x39, 0xad, 0xb7, 0x97, 0x3e, 0x52, 0xaf, 0x5b, + 0x9b, 0x47, 0xa0, 0x56, 0xcb, 0x71, 0x4a, 0xc1, 0x83, 0x6f, 0x96, 0xb6, + 0x46, 0x3e, 0x34, 0x59, 0x93, 0xd5, 0x4b, 0x88, 0x92, 0x40, 0x8d, 0x65, + 0x5c, 0xbf, 0x74, 0x3a, 0xc3, 0x40, 0xce, 0x4e, 0x8f, 0x9f, 0xb5, 0x3c, + 0x46, 0x88, 0x3d, 0xa8, 0xb7, 0x73, 0xc3, 0x9d, 0x5f, 0x40, 0x93, 0xc0, + 0x53, 0x8f, 0x62, 0x37, 0xb6, 0xd3, 0x37, 0x91, 0x8e, 0xa5, 0x51, 0x47, + 0x3b, 0x9f, 0x47, 0x65, 0x65, 0x3c, 0x6d, 0xb3, 0xa9, 0x74, 0x55, 0x61, + 0x86, 0xb0, 0x4c, 0x7c, 0x6d, 0xbf, 0x79, 0x47, 0x9f, 0x76, 0x65, 0xb6, + 0x94, 0xcc, 0x4c, 0xc2, 0x5e, 0xd0, 0xae, 0xc8, 0x39, 0x79, 0xbb, 0xac, + 0xba, 0x4b, 0x87, 0x88, 0x84, 0xad, 0xcf, 0xd0, 0xb9, 0x5b, 0x2e, 0x39, + 0xad, 0xc9, 0x8e, 0x94, 0xcb, 0x40, 0xcf, 0xa1, 0xaf, 0x74, 0xd2, 0x8d, + 0xb5, 0xc1, 0xcf, 0x4b, 0x7a, 0xad, 0xb0, 0x57, 0xa2, 0xa9, 0xa9, 0x6b, + 0xad, 0x36, 0xa1, 0x9b, 0x4e, 0x69, 0xbf, 0x32, 0x6c, 0x9e, 0x79, 0xc9, + 0x5a, 0x37, 0x52, 0xce, 0x8c, 0xaa, 0xa4, 0x9d, 0xd1, 0x4d, 0x9d, 0x5f, + 0x7d, 0x96, 0x60, 0x43, 0x3a, 0xbd, 0x98, 0x88, 0x40, 0x9a, 0x89, 0xa0, + 0xa8, 0x6c, 0x73, 0x33, 0x74, 0x3a, 0xae, 0x7a, 0xb7, 0xb0, 0x3d, 0x9a, + 0xa5, 0x49, 0xc1, 0xb4, 0x87, 0xbb, 0x51, 0xa4, 0xa5, 0xb6, 0xcb, 0xd2, + 0x86, 0xc8, 0xc5, 0x81, 0xc1, 0x8e, 0x86, 0x94, 0x64, 0xba, 0xb4, 0x2f, + 0x93, 0x98, 0x48, 0x80, 0xca, 0xb3, 0xa2, 0xd1, 0x7e, 0xad, 0xd8, 0x4a, + 0xa7, 0xa6, 0xbf, 0x8d, 0x85, 0xa2, 0x5b, 0x7c, 0x8c, 0x71, 0xaf, 0x36, + 0x3c, 0x86, 0x54, 0xbd, 0x2f, 0x45, 0x62, 0x77, 0x9a, 0xab, 0xbc, 0x3e, + 0x5b, 0x72, 0x81, 0x90, 0x37, 0x74, 0xc4, 0x77, 0x53, 0x3e, 0x54, 0x89, + 0x88, 0x89, 0x59, 0x2d, 0x31, 0xc1, 0xd0, 0xbd, 0x38, 0x9d, 0xa6, 0xb5, + 0xb6, 0x4e, 0x46, 0x63, 0xa0, 0xb7, 0x48, 0x85, 0x37, 0x4c, 0xa4, 0x6b, + 0x98, 0x3d, 0x5a, 0x74, 0x4e, 0x35, 0x56, 0x42, 0x3e, 0x98, 0xcf, 0x5d, + 0x44, 0xb4, 0x80, 0x9c, 0x56, 0xa5, 0x53, 0x60, 0x3c, 0x64, 0x71, 0x68, + 0xce, 0x65, 0x6c, 0x38, 0x98, 0x45, 0x87, 0x38, 0xbd, 0x87, 0xac, 0x9a, + 0x99, 0x86, 0xb2, 0x9f, 0x34, 0xa1, 0x9f, 0x82, 0x6b, 0x90, 0xcb, 0x99, + 0x60, 0x6e, 0x3e, 0x7a, 0xc9, 0x9b, 0xc4, 0xca, 0x9a, 0x55, 0x8a, 0x3b, + 0xa5, 0x73, 0x8b, 0x6b, 0x68, 0xb3, 0x40, 0x8e, 0xce, 0x33, 0x6a, 0x2d, + 0x5d, 0x62, 0xcb, 0xae, 0x8d, 0xc0, 0x34, 0x7c, 0xaf, 0x9a, 0x61, 0x87, + 0x65, 0xb6, 0xc0, 0x33, 0x67, 0x8c, 0xb4, 0xb5, 0x47, 0x7b, 0x80, 0x61, + 0xc5, 0x84, 0xa3, 0xbe, 0x6e, 0x44, 0x5f, 0xc7, 0x6e, 0x33, 0x61, 0xc2, + 0x46, 0xc6, 0xa9, 0xaa, 0x82, 0xab, 0x42, 0x81, 0xb3, 0x5e, 0x56, 0x4f, + 0xb7, 0x3c, 0x9f, 0xa0, 0x37, 0x6a, 0x5c, 0xc4, 0xa0, 0x80, 0x91, 0x86, + 0xd3, 0x4a, 0x95, 0x3e, 0x5e, 0x7f, 0x3b, 0x76, 0x3f, 0xba, 0x9b, 0x74, + 
0x7d, 0xdf, 0x44, 0x64, 0x96, 0x91, 0x3c, 0x9f, 0x85, 0xc2, 0x89, 0x4f, + 0x77, 0x86, 0x7e, 0x73, 0x57, 0x2d, 0xa9, 0x42, 0x52, 0x79, 0x42, 0x8c, + 0xae, 0x99, 0xb8, 0xc0, 0x48, 0x6a, 0x59, 0xac, 0x40, 0xcc, 0x60, 0x47, + 0x35, 0x67, 0x33, 0x91, 0xac, 0x91, 0x3a, 0x6d, 0xbc, 0x34, 0x43, 0x87, + 0x69, 0xa1, 0x8a, 0x3c, 0x91, 0x65, 0xbd, 0xbf, 0x58, 0xcb, 0x36, 0xae, + 0xb1, 0x2f, 0x50, 0x90, 0x6e, 0x88, 0x33, 0xaa, 0xaf, 0x55, 0x74, 0x4c, + 0xb5, 0x95, 0xb9, 0xa1, 0xba, 0x62, 0x7f, 0x35, 0x2e, 0xd4, 0x7f, 0x52, + 0x9a, 0x97, 0xc3, 0x40, 0x78, 0x94, 0x7b, 0xae, 0x3c, 0x93, 0x7c, 0x51, + 0x79, 0x96, 0x6a, 0xb2, 0x90, 0xa5, 0x92, 0x5d, 0x52, 0x33, 0x99, 0xa2, + 0x49, 0xa9, 0x93, 0x7e, 0x5a, 0xc2, 0x8b, 0x8b, 0x99, 0x65, 0x5e, 0xaf, + 0x79, 0xab, 0x75, 0xd0, 0x83, 0xbe, 0x78, 0x75, 0x81, 0xd1, 0xbf, 0x96, + 0xbc, 0x58, 0x5a, 0x5b, 0x71, 0x8f, 0x68, 0xbf, 0xa6, 0x72, 0x87, 0x3e, + 0x90, 0xa7, 0x51, 0x4e, 0x51, 0x57, 0x3e, 0x3f, 0x6d, 0x9b, 0xbc, 0x2f, + 0x93, 0x73, 0x81, 0x95, 0x8d, 0xc6, 0x99, 0x87, 0x80, 0x93, 0xc9, 0x79, + 0x5b, 0x9b, 0x29, 0x72, 0x90, 0x51, 0x6f, 0x4b, 0xd3, 0xcc, 0x85, 0xaf, + 0x69, 0x8c, 0x70, 0x8a, 0xcc, 0x7c, 0x3e, 0x8c, 0x4c, 0x4d, 0xaa, 0x38, + 0x8b, 0xab, 0x69, 0xce, 0x62, 0x88, 0xb3, 0x85, 0xa0, 0xae, 0xc1, 0xc2, + 0xc8, 0x89, 0x3c, 0xc0, 0x92, 0x95, 0xcc, 0x72, 0x86, 0xd5, 0x43, 0xd0, + 0x6d, 0x8b, 0x5c, 0x55, 0x72, 0x71, 0x9b, 0x5e, 0x81, 0x53, 0x9a, 0x5c, + 0x7c, 0xb9, 0x99, 0xa1, 0xa8, 0x72, 0xb5, 0x85, 0x4e, 0x76, 0x5d, 0x83, + 0x3a, 0x60, 0x62, 0x9f, 0x31, 0x8d, 0x7c, 0x34, 0x8e, 0x77, 0x53, 0xcb, + 0x49, 0x80, 0xd2, 0x2b, 0x4b, 0xa6, 0x59, 0xa7, 0x70, 0x75, 0xbc, 0xa7, + 0x41, 0xc4, 0x51, 0x9a, 0x96, 0x9c, 0xd9, 0x55, 0x8a, 0x2b, 0xc6, 0x8f, + 0x94, 0x83, 0x9d, 0x69, 0x68, 0x61, 0xc6, 0x9b, 0x7d, 0x47, 0x97, 0xc5, + 0xc0, 0x90, 0x78, 0x79, 0x4b, 0x3a, 0x76, 0xd4, 0x88, 0x82, 0xdb, 0x51, + 0xd3, 0xd9, 0xb5, 0xa5, 0x6c, 0x8a, 0xb3, 0x4b, 0x48, 0xc9, 0xb8, 0x78, + 0x79, 0xb4, 0xb7, 0x76, 0x7d, 0xc7, 0x82, 0xa1, 0x5f, 0x45, 0xa8, 0xd0, + 0x8c, 0x78, 0x90, 0x7f, 0x60, 0x59, 0x3a, 0x82, 0x6d, 0x31, 0x7b, 0x9e, + 0x9a, 0x96, 0x39, 0xc8, 0x3b, 0xa1, 0x40, 0x9d, 0xa8, 0xd0, 0xb0, 0x70, + 0x2b, 0x35, 0x4d, 0xa6, 0x75, 0x52, 0xb9, 0x35, 0xb9, 0x8d, 0x9d, 0x74, + 0xc6, 0x6b, 0x7b, 0x97, 0x73, 0x97, 0x93, 0x29, 0x51, 0xb8, 0x34, 0xc4, + 0x59, 0x6d, 0xbb, 0xb0, 0x56, 0x53, 0x39, 0xda, 0x65, 0x4e, 0xd7, 0xc8, + 0x91, 0x54, 0x75, 0xb3, 0x5e, 0x81, 0xb3, 0xc1, 0x90, 0xcd, 0x48, 0xb0, + 0xc1, 0xc4, 0x93, 0x98, 0x42, 0x7e, 0xa4, 0x76, 0xa3, 0x3b, 0x7f, 0x61, + 0x50, 0x7d, 0x73, 0x56, 0xb0, 0x65, 0xd8, 0x4b, 0x43, 0x6a, 0x8a, 0x33, + 0x5b, 0x39, 0x9f, 0x9d, 0xce, 0x8b, 0x82, 0x4a, 0x82, 0x41, 0x8a, 0x35, + 0x9e, 0xcb, 0x45, 0xc0, 0x66, 0x41, 0xb4, 0x41, 0x62, 0x7c, 0xb5, 0x83, + 0x4d, 0x77, 0xc1, 0x33, 0xa2, 0xcd, 0xb8, 0xb7, 0xd3, 0x76, 0x30, 0x65, + 0x2c, 0x42, 0x2e, 0x80, 0x4c, 0x6e, 0x7c, 0x93, 0x4e, 0x6c, 0x45, 0xb5, + 0x54, 0x38, 0xc6, 0xc9, 0xc8, 0x63, 0xca, 0x9e, 0x4b, 0x5f, 0xc2, 0x75, + 0x9d, 0x47, 0xc6, 0xa9, 0x77, 0xe1, 0x60, 0xbd, 0xa9, 0x8d, 0xdf, 0xbe, + 0x41, 0x5e, 0xa7, 0xcb, 0x4a, 0x6d, 0x52, 0xd6, 0x86, 0xa0, 0x6e, 0xd5, + 0x9b, 0xce, 0x84, 0x5d, 0x77, 0x3e, 0x4f, 0x3b, 0xc1, 0x8a, 0xc6, 0x5e, + 0x72, 0x6a, 0x22, 0xb7, 0x9a, 0xca, 0xa3, 0xb5, 0x3e, 0xb6, 0x72, 0xb1, + 0xb2, 0xb7, 0xcd, 0x6c, 0xbc, 0x6c, 0x2a, 0x8a, 0xb1, 0x58, 0x9e, 0xc7, + 0xa8, 0xc7, 0xc9, 0xad, 0xd4, 0x79, 0x5e, 0xa5, 0x75, 0xb7, 0x69, 0xaa, + 0xcb, 0x49, 0x54, 0x6f, 0x58, 0x55, 0x47, 0x3d, 0xc3, 0xf1, 0x96, 0xcd, + 0x54, 0x34, 0xbf, 0x56, 0x5a, 0x86, 0xb0, 0xb3, 0x66, 0x97, 0xc7, 0xc9, + 
0xd5, 0xb3, 0x9e, 0xa8, 0x62, 0x7a, 0x6f, 0x4a, 0x4f, 0x4c, 0xd3, 0x6c, + 0x6c, 0x8b, 0x37, 0xcd, 0x87, 0x43, 0xc8, 0x6d, 0x7c, 0x62, 0xa1, 0x3e, + 0xc8, 0x36, 0x76, 0x97, 0xcc, 0x99, 0x4a, 0xa8, 0x26, 0x91, 0x4a, 0x71, + 0x94, 0xbd, 0x64, 0x94, 0x68, 0x8a, 0x3c, 0x66, 0x7b, 0x87, 0x3b, 0x4a, + 0xc2, 0xb6, 0x40, 0x44, 0xbe, 0x5f, 0x9c, 0x69, 0x47, 0x39, 0x8c, 0x57, + 0x94, 0xc3, 0x6f, 0x9a, 0xae, 0x73, 0x7c, 0x84, 0xb6, 0x50, 0x98, 0x40, + 0x36, 0xce, 0x9f, 0x4f, 0x90, 0xd3, 0x7b, 0x63, 0xae, 0xd4, 0x84, 0x93, + 0x91, 0x34, 0x2c, 0xbc, 0xa9, 0x68, 0x4d, 0xb9, 0x52, 0x9b, 0xa3, 0xca, + 0x8c, 0xd1, 0xaf, 0xa9, 0x6e, 0xae, 0x77, 0x75, 0x40, 0x43, 0x4f, 0x86, + 0x4d, 0xaa, 0x5b, 0xc0, 0x39, 0xb1, 0x9d, 0x8a, 0x68, 0xb9, 0x82, 0xb2, + 0xb9, 0xc0, 0x8d, 0x80, 0xb1, 0x78, 0x2f, 0x87, 0x4a, 0x9c, 0x63, 0xbc, + 0x29, 0x71, 0x89, 0x8a, 0x87, 0x7b, 0x53, 0xc4, 0x93, 0xb9, 0xb9, 0x4a, + 0x56, 0x84, 0xb5, 0x30, 0xa4, 0xb7, 0x5c, 0x81, 0xd9, 0x52, 0x63, 0xbb, + 0x4d, 0x93, 0xc6, 0x91, 0xa4, 0xc7, 0xc2, 0x9a, 0x76, 0xc4, 0x53, 0xb4, + 0x6d, 0xb9, 0xa0, 0xbb, 0x41, 0x3b, 0xc9, 0xb3, 0xb1, 0xb7, 0xa4, 0x38, + 0x9f, 0x3d, 0xbe, 0xbf, 0xa8, 0x51, 0x71, 0xc7, 0xae, 0xb6, 0xc7, 0x87, + 0x77, 0x8c, 0xc5, 0x5e, 0x98, 0xa9, 0xd7, 0x85, 0xa3, 0x44, 0x63, 0x3c, + 0x4a, 0x65, 0x4e, 0xb0, 0xc8, 0x37, 0x61, 0xcc, 0xab, 0xca, 0xc5, 0x6d, + 0x89, 0x8e, 0x51, 0x36, 0x73, 0x69, 0x41, 0xbf, 0x88, 0xb3, 0x69, 0xcb, + 0x8a, 0xb7, 0x49, 0xc0, 0xaa, 0xbd, 0xbc, 0x62, 0xbf, 0x73, 0xc7, 0x7d, + 0x8a, 0x39, 0x7a, 0x7e, 0x89, 0x9e, 0xb0, 0x81, 0x52, 0xbd, 0x30, 0xc4, + 0xa9, 0x93, 0x80, 0xc3, 0xb0, 0x7a, 0xb4, 0x56, 0xb7, 0xad, 0x45, 0x39, + 0x9a, 0x73, 0xc2, 0x4a, 0x71, 0xa9, 0x9f, 0x35, 0xa2, 0xac, 0x6e, 0x33, + 0x92, 0x3f, 0x86, 0x8c, 0xde, 0xc9, 0xc3, 0x9a, 0x97, 0x4e, 0x87, 0xc5, + 0x5d, 0xc7, 0x96, 0xb5, 0xa3, 0xc0, 0x6a, 0xcc, 0xae, 0x4f, 0x81, 0x5d, + 0x96, 0x6a, 0xae, 0xb4, 0xb3, 0xa0, 0xb6, 0x44, 0xb3, 0x9d, 0x80, 0xa0, + 0x9d, 0xd3, 0xaa, 0xa0, 0x30, 0x46, 0x4b, 0xc2, 0x59, 0x8f, 0xc6, 0xbc, + 0x3a, 0xcd, 0x87, 0xca, 0xbf, 0x74, 0xa5, 0x37, 0xd6, 0x44, 0x71, 0xa1, + 0x7c, 0x44, 0xc4, 0x5f, 0x95, 0x6e, 0xb7, 0x96, 0x58, 0x62, 0x49, 0x69, + 0xb7, 0xbe, 0x99, 0x4a, 0x54, 0xdd, 0xb4, 0x30, 0x98, 0x94, 0x6e, 0x3f, + 0xad, 0x86, 0x95, 0xb9, 0xa5, 0xbc, 0xae, 0x61, 0xd1, 0xd2, 0xc8, 0x8e, + 0xab, 0x29, 0x9f, 0x8c, 0x3a, 0x44, 0xc6, 0x6d, 0xb7, 0x52, 0x8b, 0xab, + 0x7b, 0xda, 0x5c, 0xa2, 0x3f, 0x53, 0x5a, 0xb9, 0x48, 0x8c, 0x57, 0x5a, + 0x39, 0x50, 0x80, 0xaf, 0x8e, 0x6d, 0xb9, 0x4d, 0xcc, 0xcc, 0xa7, 0x5a, + 0x7f, 0x68, 0x35, 0x83, 0xa9, 0x33, 0xcb, 0x42, 0x82, 0x7d, 0xc3, 0xd0, + 0x6f, 0x55, 0x63, 0x89, 0xb1, 0xd1, 0x98, 0x90, 0xbb, 0x71, 0xc7, 0x66, + 0x4d, 0x58, 0x37, 0x32, 0xac, 0x33, 0x7c, 0x5f, 0x88, 0x50, 0x4e, 0xa6, + 0x4d, 0xcc, 0x69, 0x8f, 0xc0, 0x9a, 0x6f, 0x3b, 0x49, 0xc2, 0x8b, 0x82, + 0x76, 0x33, 0x60, 0xd0, 0x4a, 0x61, 0x90, 0xbc, 0x59, 0xb2, 0x53, 0x88, + 0x60, 0x87, 0x76, 0xbd, 0x59, 0xaf, 0xa9, 0x46, 0x8e, 0x79, 0xb3, 0xad, + 0x58, 0x31, 0xbe, 0x4e, 0xc1, 0x4e, 0xde, 0x3f, 0x65, 0x2d, 0xd4, 0x48, + 0x5f, 0x32, 0x77, 0xbd, 0xa7, 0xbb, 0xb5, 0xb9, 0x61, 0x5d, 0x69, 0xa0, + 0xd5, 0x4c, 0x92, 0x68, 0x6a, 0x4f, 0x6a, 0xad, 0xd1, 0xc6, 0x66, 0x99, + 0x4c, 0x95, 0x60, 0x71, 0x2a, 0x58, 0x41, 0xb7, 0xb1, 0xc1, 0x71, 0x91, + 0x8b, 0x4d, 0x92, 0xc4, 0x8a, 0x5e, 0xb0, 0xc9, 0xc2, 0x31, 0x2f, 0x99, + 0x9c, 0x59, 0xb5, 0x7c, 0x51, 0x7e, 0xb8, 0x40, 0xbe, 0x2c, 0xbc, 0x99, + 0x73, 0x69, 0xa2, 0xb1, 0x8b, 0x80, 0x38, 0xb5, 0x81, 0x6c, 0xb5, 0x5e, + 0x40, 0x9c, 0x61, 0x32, 0x83, 0x85, 0x76, 0x3c, 0x77, 0x42, 0xb0, 0xce, + 
0x90, 0x98, 0x86, 0x63, 0x7e, 0x61, 0x61, 0xcb, 0xa5, 0x44, 0xbc, 0x5e, + 0x3d, 0x6d, 0x34, 0x87, 0x5e, 0xc9, 0x8f, 0x82, 0xd5, 0x57, 0x8e, 0x84, + 0xa8, 0x87, 0x86, 0xcd, 0xbf, 0x35, 0x77, 0x76, 0x67, 0x42, 0x9b, 0xb2, + 0x66, 0x48, 0xd3, 0xb4, 0x57, 0xa1, 0xb9, 0x8e, 0x54, 0x45, 0x77, 0xd9, + 0xc9, 0x56, 0xb7, 0xa8, 0xc0, 0x5c, 0x92, 0x7a, 0x5e, 0xb3, 0x60, 0xa7, + 0x46, 0x8f, 0xa6, 0x8b, 0x6f, 0x5a, 0x7d, 0x44, 0x75, 0xb3, 0x66, 0x43, + 0x77, 0xa3, 0x4c, 0xbe, 0x83, 0x57, 0x64, 0xa9, 0x35, 0xbb, 0x84, 0x86, + 0x65, 0x60, 0xca, 0xa4, 0xc6, 0x36, 0x69, 0xb4, 0xbf, 0x4c, 0x4d, 0xc5, + 0x8d, 0x80, 0xb3, 0xc4, 0x93, 0x3a, 0xab, 0xb2, 0xa3, 0x63, 0x37, 0xca, + 0xa7, 0x54, 0x33, 0x9b, 0x3f, 0xd2, 0x89, 0x77, 0xb9, 0x62, 0x99, 0x5c, + 0xa7, 0xa0, 0x6f, 0xbf, 0x60, 0xad, 0x2d, 0x39, 0x5d, 0xc1, 0x45, 0xc9, + 0xcc, 0xb7, 0x9b, 0x8c, 0x7f, 0x83, 0xca, 0xdb, 0xc7, 0x37, 0x61, 0x61, + 0x8e, 0x57, 0x33, 0xb8, 0x66, 0xd4, 0x53, 0x5e, 0x97, 0x73, 0x80, 0x3b, + 0x6a, 0x3a, 0x51, 0x64, 0x42, 0xbc, 0x5c, 0x74, 0x43, 0xd2, 0xa7, 0x4a, + 0x4f, 0x35, 0x5e, 0x74, 0x7c, 0x6f, 0x6a, 0xab, 0x29, 0xaa, 0xc3, 0x4a, + 0x4b, 0x64, 0x3a, 0x1f, 0x7d, 0xc6, 0x69, 0xab, 0x5e, 0xd4, 0x47, 0x40, + 0x96, 0xac, 0x51, 0x7c, 0xa2, 0x5d, 0xae, 0x44, 0x43, 0xa7, 0xc6, 0xd9, + 0x35, 0x34, 0xc6, 0x76, 0x83, 0x89, 0xb9, 0x81, 0x8a, 0x5a, 0xa6, 0xa2, + 0x8f, 0x9f, 0xa4, 0xad, 0x27, 0x58, 0xc4, 0x9f, 0x63, 0x88, 0xa3, 0x99, + 0xbf, 0x7d, 0x89, 0x98, 0xbf, 0x55, 0x51, 0x48, 0x69, 0xb4, 0x6b, 0x87, + 0x31, 0xa9, 0x31, 0xcb, 0xcb, 0x47, 0x5c, 0xa1, 0x5e, 0x80, 0x59, 0xc0, + 0x71, 0x99, 0x9b, 0xaa, 0x47, 0xa3, 0xb2, 0xb2, 0x97, 0x8a, 0x43, 0x3c, + 0x97, 0x37, 0x53, 0x83, 0x79, 0x33, 0x6f, 0x92, 0x57, 0xa7, 0xba, 0x35, + 0x4c, 0x4e, 0x70, 0xab, 0xb0, 0x6b, 0x3d, 0x33, 0xd9, 0xae, 0x3d, 0x7b, + 0xad, 0x84, 0x4e, 0x9d, 0xa0, 0xa2, 0xd0, 0x31, 0x5c, 0x87, 0x90, 0xd2, + 0xae, 0x8d, 0x96, 0x84, 0xa0, 0x80, 0xaf, 0x4c, 0xa8, 0xd6, 0x9d, 0x40, + 0xbe, 0x81, 0xa5, 0x42, 0x9e, 0xb8, 0x49, 0x46, 0x8e, 0x72, 0x86, 0x48, + 0x7a, 0x31, 0x9f, 0x80, 0x4c, 0x62, 0x3c, 0xbc, 0x44, 0x70, 0x9d, 0x8a, + 0x96, 0x2d, 0x9a, 0xaf, 0x7c, 0xbb, 0xb9, 0x38, 0x34, 0x58, 0xa5, 0x39, + 0x3e, 0xa6, 0xb1, 0xa7, 0x77, 0x6e, 0x8e, 0xb7, 0x63, 0x48, 0x47, 0x7f, + 0x80, 0xa5, 0x4b, 0x8d, 0xa7, 0x4f, 0xc3, 0x46, 0x9c, 0xac, 0x98, 0x63, + 0xb4, 0x3a, 0x4b, 0x2e, 0x73, 0x9e, 0xcf, 0x90, 0x7a, 0x68, 0xc6, 0x81, + 0x3f, 0x4d, 0x99, 0xa9, 0x46, 0x81, 0xa4, 0x7f, 0x33, 0xaa, 0x7b, 0x8d, + 0xab, 0x59, 0xa4, 0xc1, 0xa0, 0xaa, 0x81, 0x6e, 0x47, 0x99, 0x92, 0x50, + 0x98, 0x3f, 0x49, 0xac, 0xc7, 0x75, 0x47, 0xaf, 0x71, 0x6a, 0x8d, 0x6f, + 0x3f, 0xa1, 0xb0, 0xb6, 0x4e, 0x34, 0x67, 0xbb, 0x6e, 0x58, 0x69, 0x53, + 0xbe, 0xb9, 0x59, 0x39, 0xbe, 0x71, 0x44, 0xb7, 0xa0, 0x78, 0x94, 0x56, + 0x6b, 0x7d, 0xa7, 0x70, 0x99, 0x53, 0xca, 0x7d, 0x88, 0x71, 0x62, 0x91, + 0xa3, 0x94, 0x52, 0x75, 0x8a, 0x83, 0x62, 0x69, 0x44, 0x57, 0x7b, 0xbb, + 0xb7, 0xb6, 0x92, 0xd3, 0x4f, 0xca, 0x3a, 0x38, 0x61, 0x9e, 0x36, 0xa5, + 0x4a, 0x5f, 0xb5, 0x4f, 0x49, 0x75, 0xb5, 0xb5, 0x39, 0x65, 0xb7, 0x42, + 0x9a, 0xb0, 0xcc, 0xd1, 0x44, 0x5c, 0x54, 0x2d, 0x4a, 0xb9, 0x6e, 0x86, + 0x5b, 0x47, 0xd7, 0x87, 0x32, 0x36, 0xaa, 0x76, 0xc0, 0x56, 0x7a, 0x73, + 0x91, 0x9e, 0x71, 0xaf, 0x6f, 0x56, 0x5b, 0x5d, 0xad, 0x37, 0x88, 0xa0, + 0x76, 0x7a, 0x8c, 0x79, 0x43, 0x8c, 0x92, 0xd0, 0x60, 0xc2, 0xd6, 0xd1, + 0x73, 0x67, 0x97, 0xb2, 0x8a, 0x6e, 0x79, 0x73, 0x74, 0x88, 0x35, 0x33, + 0x47, 0x9e, 0x81, 0x8b, 0xab, 0x6f, 0x5b, 0x92, 0xce, 0x75, 0xc8, 0xb9, + 0x65, 0x78, 0x73, 0xca, 0x4a, 0x32, 0x8e, 0x71, 0xbb, 0x70, 0x4b, 0xc7, + 
0x96, 0x52, 0xb9, 0xc7, 0x39, 0x6c, 0x6a, 0x3e, 0x9d, 0x53, 0x8a, 0x8b, + 0x5a, 0x3c, 0x7f, 0xad, 0xc6, 0x9b, 0x6e, 0xba, 0x9c, 0x30, 0x8d, 0x69, + 0xcd, 0x99, 0x37, 0xa5, 0x73, 0x84, 0x85, 0x55, 0x6e, 0x79, 0x6e, 0xce, + 0xbc, 0x83, 0x98, 0x66, 0x2f, 0x3d, 0x40, 0x2e, 0xaf, 0x30, 0x41, 0xbb, + 0x6e, 0xa4, 0x46, 0x5e, 0x49, 0xab, 0xa1, 0x89, 0x39, 0x53, 0xc6, 0x98, + 0x4a, 0xcf, 0x5a, 0x89, 0x8c, 0x71, 0xba, 0x7a, 0xc3, 0x76, 0x7d, 0x42, + 0x84, 0x80, 0xa8, 0x3d, 0xb1, 0xc9, 0xb2, 0x57, 0x3b, 0x37, 0xbf, 0xbe, + 0xbb, 0x51, 0x95, 0x3d, 0x65, 0x49, 0xcf, 0x50, 0x3c, 0xc4, 0xb0, 0xac, + 0x76, 0x8f, 0x73, 0xab, 0x73, 0x46, 0x6a, 0x61, 0xa8, 0x71, 0xce, 0x3b, + 0x53, 0xc2, 0x7a, 0x98, 0xc9, 0x74, 0x91, 0xcd, 0x61, 0xb9, 0x75, 0xd3, + 0x64, 0x98, 0x4f, 0x2e, 0xa2, 0x5c, 0x95, 0x3a, 0x6b, 0x71, 0xa7, 0xaa, + 0x63, 0x5a, 0x73, 0x6a, 0x66, 0x7f, 0xc2, 0x7d, 0x40, 0x6e, 0x5b, 0x9f, + 0x4c, 0x32, 0x92, 0x75, 0xac, 0x9d, 0x5e, 0x91, 0x65, 0x8a, 0xac, 0x66, + 0x85, 0xc3, 0x95, 0x41, 0x7d, 0x3f, 0xb8, 0x88, 0x60, 0x83, 0x89, 0x80, + 0x4f, 0x44, 0x30, 0x76, 0xcb, 0x54, 0xa3, 0x77, 0x59, 0xca, 0x63, 0x7b, + 0x5c, 0x31, 0x75, 0x60, 0xd3, 0x42, 0xb1, 0xca, 0xbc, 0x83, 0xbf, 0x83, + 0x36, 0xc7, 0xb4, 0x36, 0x46, 0x36, 0x54, 0xa8, 0x63, 0x66, 0xcc, 0x53, + 0xad, 0x9a, 0x7b, 0x86, 0xa4, 0xc5, 0xa9, 0xab, 0xad, 0xa5, 0xac, 0x3a, + 0x92, 0x38, 0xab, 0xcc, 0x53, 0x7b, 0xa1, 0x86, 0xd4, 0x57, 0xca, 0x9a, + 0x6d, 0x37, 0xb6, 0x8b, 0x4d, 0xa6, 0x7b, 0x3b, 0x45, 0x62, 0x8b, 0xbc, + 0x47, 0xb8, 0x97, 0x82, 0x9b, 0x9a, 0x51, 0x4f, 0x49, 0x42, 0x57, 0x83, + 0x66, 0x3f, 0x7a, 0x5a, 0x3e, 0x90, 0x53, 0xac, 0x5b, 0x60, 0x2b, 0x84, + 0xac, 0x8f, 0x76, 0x9e, 0x43, 0xb9, 0x89, 0xa1, 0x89, 0xca, 0xa0, 0x49, + 0xc5, 0x58, 0x8e, 0x92, 0xbc, 0x2d, 0x53, 0x7d, 0x55, 0x83, 0x6c, 0x9b, + 0x70, 0x3c, 0xb7, 0x5d, 0x86, 0x42, 0xa7, 0xab, 0x37, 0x81, 0xb4, 0x62, + 0x98, 0xbf, 0xbb, 0x3a, 0x44, 0x96, 0x81, 0x9d, 0xa5, 0x48, 0x96, 0x70, + 0x5d, 0x63, 0xa0, 0x95, 0x8e, 0x7e, 0xa6, 0x7f, 0x75, 0xac, 0x8f, 0x50, + 0x6a, 0xb3, 0x2d, 0xa4, 0x60, 0x4c, 0xa1, 0xa0, 0xbf, 0x46, 0x91, 0xa8, + 0xc9, 0x54, 0xca, 0xa6, 0x5a, 0xb5, 0x87, 0x58, 0x51, 0xc0, 0x33, 0xad, + 0x9c, 0xa7, 0x40, 0xab, 0x4e, 0x59, 0x4b, 0x5d, 0x83, 0x67, 0xa7, 0x5a, + 0x30, 0x96, 0x60, 0x8b, 0x68, 0x81, 0xbe, 0x93, 0x37, 0xb4, 0x50, 0x3b, + 0x75, 0xc5, 0xd2, 0xab, 0x85, 0x65, 0x4b, 0x95, 0xce, 0x44, 0x49, 0x46, + 0x4d, 0xd3, 0x92, 0xab, 0xbd, 0x42, 0xc8, 0xbb, 0x3b, 0x7e, 0x63, 0xac, + 0xd5, 0xa6, 0x57, 0x4b, 0x7c, 0x38, 0x93, 0x7e, 0x6f, 0x54, 0x89, 0xc4, + 0x5f, 0x6e, 0x46, 0x95, 0xd0, 0x7e, 0x6c, 0x3b, 0xb2, 0x71, 0x79, 0xc6, + 0x97, 0x35, 0x42, 0x7f, 0x82, 0x33, 0x91, 0x48, 0xa1, 0x95, 0xa6, 0x2e, + 0x96, 0xb9, 0x75, 0xb6, 0xcb, 0xc3, 0xc5, 0xcb, 0x7e, 0xbd, 0x7b, 0x38, + 0x63, 0x5d, 0x60, 0xb2, 0x99, 0xc0, 0xb2, 0x3e, 0x69, 0x6d, 0x46, 0xb2, + 0x93, 0x98, 0x46, 0x7f, 0x7c, 0x95, 0xb2, 0xaa, 0xc1, 0x47, 0x37, 0x9e, + 0x48, 0xcc, 0x2f, 0xb2, 0xac, 0xcf, 0x8e, 0x4a, 0x4e, 0xd6, 0x4c, 0xcb, + 0xb5, 0x84, 0xc9, 0x89, 0x3f, 0x50, 0x98, 0x53, 0x9f, 0x59, 0xb1, 0x37, + 0x99, 0x63, 0xd4, 0xa1, 0x40, 0xc7, 0x72, 0x52, 0x77, 0x96, 0x3e, 0xb6, + 0x7d, 0xaf, 0x3f, 0x97, 0xc3, 0x95, 0x85, 0x53, 0x8b, 0x92, 0x94, 0xc8, + 0xc6, 0x55, 0xc4, 0x9f, 0xc2, 0xc5, 0xbe, 0xbd, 0xa6, 0xaa, 0x9f, 0xbf, + 0x58, 0xbc, 0xa3, 0xc4, 0xa2, 0xb2, 0xba, 0x7f, 0x57, 0x30, 0xb3, 0x5c, + 0x63, 0x79, 0x40, 0x52, 0xbe, 0xc5, 0xa4, 0x7b, 0x65, 0xcd, 0x90, 0x42, + 0x57, 0xcf, 0xa9, 0x47, 0x86, 0xbc, 0x94, 0xb4, 0x99, 0xba, 0x39, 0xce, + 0x4e, 0xbd, 0xd2, 0x81, 0x47, 0x9b, 0xca, 0x8b, 0x6d, 0x78, 0x87, 0x98, + 
0x95, 0x5f, 0x86, 0x6a, 0x3f, 0xcf, 0xb0, 0xd5, 0x84, 0x88, 0x3d, 0x84, + 0x69, 0x35, 0xc3, 0x85, 0x7d, 0x51, 0x44, 0xa2, 0x8b, 0x6a, 0xdb, 0xcc, + 0x53, 0x68, 0x61, 0xc6, 0xb2, 0x83, 0x78, 0x8b, 0xd9, 0xcf, 0x9b, 0x8d, + 0x69, 0x81, 0xd7, 0x78, 0xb1, 0x5b, 0x3d, 0x96, 0x80, 0x6f, 0xbf, 0x2d, + 0xc0, 0xc8, 0xa1, 0xba, 0x49, 0x4a, 0x64, 0x43, 0x69, 0x98, 0xc5, 0x9a, + 0x7e, 0xaf, 0xa9, 0x35, 0x8a, 0xce, 0x90, 0x80, 0x8e, 0xa6, 0xc1, 0x9f, + 0xb3, 0xa4, 0x93, 0x57, 0xbf, 0x8d, 0xc9, 0xad, 0x82, 0x4e, 0x40, 0x63, + 0xce, 0x52, 0xce, 0xbb, 0x42, 0xa5, 0xc5, 0x46, 0x6e, 0x40, 0xa1, 0x69, + 0x77, 0x63, 0x40, 0xd2, 0x3b, 0xbf, 0x7f, 0x99, 0x88, 0xd2, 0x4f, 0x88, + 0x6a, 0x63, 0x65, 0x87, 0xca, 0x3d, 0x4a, 0x33, 0x84, 0xb1, 0x9f, 0xb7, + 0x9c, 0xaa, 0xd7, 0x74, 0x63, 0xe2, 0x5f, 0x6a, 0x90, 0xa6, 0x68, 0x2e, + 0x71, 0x7f, 0xb2, 0xaa, 0x79, 0x91, 0x8a, 0xaf, 0x7b, 0x3b, 0xbc, 0x6f, + 0x2d, 0x5e, 0x44, 0xcb, 0xb8, 0x4d, 0x88, 0xc6, 0x6f, 0x53, 0x40, 0xc4, + 0x5c, 0x65, 0xcc, 0x48, 0x5a, 0x6a, 0xa7, 0x3d, 0x2c, 0x55, 0xa1, 0x32, + 0x80, 0x64, 0xc9, 0xbb, 0x3f, 0x68, 0x9f, 0x67, 0xa8, 0x29, 0x5f, 0x7f, + 0xc2, 0x6f, 0x74, 0x99, 0xbc, 0x7c, 0x56, 0xa7, 0xa3, 0x93, 0xda, 0x43, + 0x22, 0x7a, 0x60, 0xbc, 0xac, 0xb5, 0xca, 0x6c, 0x62, 0x91, 0xa4, 0x94, + 0x41, 0x53, 0x67, 0xba, 0x88, 0x72, 0x88, 0xc3, 0x45, 0xd5, 0x89, 0x23, + 0x58, 0x50, 0x90, 0xb6, 0x67, 0x9f, 0xbe, 0x58, 0x6d, 0x62, 0x3f, 0xde, + 0x69, 0xac, 0x7d, 0xb0, 0x7d, 0xbf, 0x95, 0x82, 0x90, 0x44, 0x4d, 0xc6, + 0x32, 0x36, 0xad, 0x79, 0x54, 0x5a, 0x34, 0xc5, 0xb1, 0xab, 0x98, 0xca, + 0xaa, 0x5d, 0xa2, 0xc7, 0x6a, 0x6a, 0x3a, 0x53, 0x3e, 0xa0, 0x86, 0x6c, + 0x9a, 0x9d, 0xdc, 0xcf, 0x45, 0x68, 0x9e, 0xb8, 0x82, 0xad, 0xce, 0x3f, + 0x62, 0x8a, 0xa8, 0x86, 0x69, 0x74, 0xbd, 0xb2, 0x5c, 0xcb, 0x4b, 0x61, + 0x39, 0xbe, 0x3b, 0x76, 0xaa, 0xd7, 0xb0, 0x5e, 0x36, 0xbe, 0x70, 0x7c, + 0x39, 0xd5, 0xc4, 0xa1, 0x7e, 0x5c, 0x7b, 0x54, 0xb3, 0xa4, 0xcf, 0xa1, + 0xc7, 0x51, 0x35, 0x88, 0x4d, 0x8d, 0x5b, 0x43, 0x39, 0x67, 0xa3, 0x4e, + 0xcc, 0xae, 0xc1, 0x62, 0xcc, 0x3a, 0x38, 0x89, 0x94, 0xd9, 0xa1, 0x35, + 0xc0, 0xdc, 0x91, 0x67, 0x69, 0x7a, 0xb8, 0xc4, 0x74, 0x24, 0xcf, 0x60, + 0xb5, 0x34, 0xcf, 0x6d, 0x4b, 0xc1, 0xa5, 0x9f, 0xbc, 0x38, 0xad, 0x8b, + 0xc2, 0xbf, 0x95, 0x38, 0x98, 0xbc, 0xc8, 0xba, 0xb5, 0xae, 0xcf, 0x49, + 0x66, 0xcb, 0x30, 0x9e, 0x4e, 0x8a, 0x44, 0x59, 0xe2, 0xce, 0xbc, 0x61, + 0x8a, 0x59, 0xce, 0xc5, 0x5a, 0x53, 0x93, 0x70, 0x88, 0x9e, 0x66, 0x98, + 0x71, 0x4c, 0xbe, 0x40, 0x99, 0x4a, 0x54, 0x57, 0x6f, 0x66, 0xaa, 0x40, + 0x52, 0xba, 0x56, 0xb9, 0x7b, 0x83, 0x44, 0x66, 0x2d, 0xc7, 0x6b, 0x42, + 0x42, 0x6f, 0x6a, 0xc7, 0xc6, 0xbf, 0xbb, 0x95, 0xb9, 0xbd, 0xbe, 0x78, + 0x8a, 0xa6, 0xcd, 0x6a, 0xaa, 0x57, 0x38, 0x8f, 0xd0, 0xc7, 0x46, 0x71, + 0x39, 0x79, 0x4d, 0x80, 0x84, 0x96, 0x5a, 0xbd, 0x9f, 0x6c, 0xd1, 0x44, + 0x8a, 0x81, 0xce, 0x69, 0xa8, 0x3c, 0xaa, 0xcd, 0xcf, 0xc8, 0x55, 0x3f, + 0x54, 0x72, 0xcd, 0x49, 0x83, 0xdc, 0x3c, 0x56, 0x60, 0x40, 0x3d, 0x56, + 0xcf, 0x6d, 0xd1, 0x71, 0x6b, 0x6a, 0x87, 0x37, 0xaf, 0xc9, 0xcd, 0xc3, + 0x8b, 0x70, 0xad, 0xc5, 0x8d, 0x63, 0x5a, 0x7e, 0x53, 0x67, 0x30, 0x93, + 0x9c, 0x7f, 0x83, 0x3f, 0x4e, 0x84, 0x7e, 0x44, 0x8a, 0x7c, 0x96, 0xd0, + 0x87, 0x7c, 0x46, 0x82, 0xb9, 0xa1, 0xb5, 0x51, 0xd2, 0x87, 0x48, 0x4b, + 0x73, 0x80, 0x7e, 0x80, 0x8b, 0x60, 0x4d, 0x75, 0xa3, 0x3c, 0xb0, 0x64, + 0x5c, 0x6f, 0x53, 0x93, 0x8d, 0x45, 0x97, 0x7b, 0x96, 0x5b, 0x43, 0xc7, + 0x43, 0xb1, 0xb8, 0xcc, 0x6c, 0x86, 0x67, 0xc9, 0x35, 0x62, 0x7f, 0x96, + 0xce, 0xb6, 0x37, 0x83, 0xb2, 0x8e, 0x83, 0x92, 0xd5, 0x80, 0x88, 0xad, + 
0xad, 0x61, 0x6b, 0xc0, 0x99, 0x70, 0x30, 0x72, 0x9f, 0x91, 0xad, 0x4e, + 0xd4, 0x80, 0xb3, 0xb8, 0xd3, 0x9c, 0xd3, 0x76, 0x9b, 0x38, 0x67, 0x97, + 0xb7, 0x75, 0x70, 0x29, 0x41, 0x75, 0x4b, 0x73, 0x8a, 0x57, 0x45, 0x70, + 0x97, 0x93, 0xd5, 0x3c, 0x52, 0x97, 0x7f, 0x96, 0x65, 0x9f, 0xcb, 0x69, + 0x4e, 0x9d, 0x71, 0x8e, 0x6f, 0x54, 0xda, 0xc5, 0xb6, 0x37, 0x8e, 0xc5, + 0x46, 0xc6, 0xa8, 0x88, 0x8d, 0xd1, 0x52, 0x87, 0x9a, 0x69, 0x4f, 0x71, + 0x54, 0x56, 0xb8, 0x8f, 0x54, 0x80, 0x50, 0x57, 0x46, 0x5a, 0x60, 0x4e, + 0x4f, 0xd7, 0xdd, 0xcd, 0x47, 0x9c, 0x91, 0x9d, 0x91, 0x56, 0xd6, 0xcd, + 0x45, 0x9c, 0x56, 0xca, 0x6a, 0x44, 0x82, 0xc2, 0x3a, 0x39, 0xca, 0xad, + 0x9b, 0xad, 0x96, 0x92, 0x38, 0x80, 0xcb, 0x82, 0x3d, 0x45, 0xbe, 0x6e, + 0x4d, 0x47, 0x6b, 0x78, 0x70, 0x36, 0x62, 0x82, 0xd4, 0xb4, 0xcc, 0xb6, + 0xbf, 0x66, 0x58, 0x75, 0x67, 0xc9, 0x5b, 0xba, 0x7f, 0xbb, 0x8f, 0x9b, + 0x9a, 0x9c, 0x78, 0x4d, 0x6c, 0xd7, 0x7e, 0xbc, 0x48, 0x91, 0x4e, 0x88, + 0x96, 0x34, 0xa2, 0xa5, 0x7c, 0x51, 0x4a, 0x69, 0x94, 0xa4, 0x90, 0x3b, + 0x58, 0x85, 0x3d, 0xa9, 0x48, 0x61, 0x5b, 0x39, 0x56, 0x54, 0x4c, 0xb2, + 0x37, 0xbc, 0x44, 0x87, 0x50, 0x37, 0x6a, 0x60, 0x6f, 0x99, 0x38, 0xb4, + 0x8c, 0xa8, 0xa0, 0x83, 0xcf, 0x6e, 0x7e, 0xb6, 0xa7, 0x5f, 0x33, 0x2d, + 0xcd, 0x71, 0x93, 0x9c, 0xc4, 0x52, 0x50, 0xbd, 0x74, 0x28, 0x77, 0x9d, + 0x2f, 0x78, 0x9c, 0x30, 0x5f, 0x52, 0x97, 0x5f, 0x96, 0x65, 0x89, 0x68, + 0x5b, 0xb6, 0x44, 0x53, 0xa9, 0x31, 0x73, 0x89, 0x23, 0x49, 0x73, 0x86, + 0x56, 0xce, 0x9b, 0xa2, 0x9e, 0x7e, 0x9d, 0xbd, 0x38, 0x68, 0x9c, 0x53, + 0x5c, 0x8e, 0x8b, 0x4e, 0x96, 0xe1, 0x85, 0x55, 0x82, 0x33, 0x7d, 0xae, + 0x2a, 0x32, 0xc2, 0x2b, 0xbb, 0x48, 0x57, 0x5c, 0xcd, 0xb8, 0x88, 0x4a, + 0x73, 0x64, 0xd0, 0x5e, 0x88, 0x3f, 0xc6, 0x77, 0x38, 0xc8, 0xb5, 0xd2, + 0x6c, 0xa7, 0xb8, 0x62, 0xb5, 0x41, 0xd2, 0x38, 0x4b, 0x93, 0xc2, 0xb8, + 0xc9, 0xcb, 0x72, 0xc1, 0xa0, 0x4f, 0x49, 0xb4, 0x73, 0xbf, 0x99, 0x96, + 0x52, 0xc8, 0x45, 0x3d, 0xd3, 0xb4, 0xb8, 0xa7, 0xd3, 0xc7, 0x3c, 0x86, + 0x93, 0xd6, 0x5a, 0x84, 0xa9, 0x8a, 0x60, 0xce, 0x2e, 0x7a, 0x8e, 0xb5, + 0x4d, 0x57, 0x71, 0x62, 0x84, 0x92, 0x96, 0xc6, 0x70, 0x40, 0x7e, 0x80, + 0xa5, 0x5c, 0x8c, 0x67, 0xbd, 0x63, 0xb4, 0x43, 0x52, 0xc1, 0xcd, 0x94, + 0x4c, 0x91, 0x7c, 0x84, 0x42, 0x9b, 0xa9, 0x62, 0x81, 0x53, 0xaa, 0x76, + 0x55, 0xc3, 0xc1, 0x77, 0xaa, 0xc3, 0xc1, 0x5b, 0xcf, 0x39, 0x50, 0xb2, + 0xc0, 0x4b, 0x50, 0xb0, 0xc9, 0x78, 0xe1, 0x5a, 0x3b, 0x65, 0xc7, 0x49, + 0x45, 0x45, 0x47, 0x66, 0x48, 0xa6, 0x75, 0x97, 0x7f, 0xa5, 0xc7, 0x91, + 0xd2, 0xd5, 0xac, 0xc3, 0x53, 0x8f, 0x98, 0xb7, 0x45, 0xe3, 0x97, 0x6e, + 0x84, 0x42, 0x5b, 0x68, 0x4c, 0x70, 0xbb, 0x8c, 0x76, 0x9c, 0xb0, 0xa3, + 0x60, 0x80, 0xb6, 0x90, 0x83, 0x4b, 0x84, 0xb0, 0x65, 0x5a, 0x7e, 0x5c, + 0xbf, 0x6a, 0x95, 0x75, 0xb5, 0x4b, 0xa1, 0xbe, 0x80, 0xc7, 0x5d, 0xbd, + 0xab, 0xb8, 0xbc, 0x72, 0x7d, 0x9d, 0x51, 0xd3, 0x94, 0x8b, 0x86, 0xc4, + 0x80, 0x9e, 0xbf, 0xb2, 0x9c, 0x9d, 0xe2, 0x99, 0x89, 0x32, 0xa7, 0x8e, + 0x4f, 0x28, 0x57, 0xae, 0xbc, 0x4f, 0x2f, 0x51, 0x40, 0xd2, 0x33, 0xbd, + 0xbc, 0x61, 0x7e, 0xc2, 0x5a, 0x5f, 0x59, 0x46, 0xc5, 0x4c, 0x85, 0xac, + 0x53, 0xa6, 0x7e, 0xa9, 0xa9, 0xb8, 0xba, 0xa3, 0x9b, 0x67, 0xbf, 0x92, + 0x94, 0xb7, 0x6c, 0x97, 0xca, 0x46, 0x70, 0xb8, 0x49, 0xa5, 0x5d, 0x63, + 0x2d, 0x88, 0x98, 0x9a, 0xb5, 0xa5, 0x84, 0xa5, 0x3b, 0x65, 0xc4, 0x5a, + 0x8d, 0x4f, 0x7d, 0xd3, 0x67, 0x3d, 0x4f, 0xcc, 0x69, 0x90, 0xab, 0x80, + 0x85, 0x3b, 0x96, 0x5c, 0x41, 0x80, 0xd4, 0x4d, 0x86, 0x8a, 0xa3, 0x7b, + 0x95, 0xb9, 0x4b, 0xa0, 0x93, 0xa1, 0xaf, 0xa5, 0xb1, 0x74, 0xcc, 0x81, + 
0x66, 0xb9, 0xba, 0x3f, 0x58, 0x99, 0x59, 0x90, 0x94, 0x72, 0xa8, 0xce, + 0x7a, 0x50, 0xba, 0xa2, 0x90, 0x44, 0xc0, 0x50, 0x6f, 0xa1, 0xd2, 0xb3, + 0x2c, 0xd0, 0xa3, 0x72, 0x48, 0xc4, 0x84, 0x69, 0x6c, 0xbd, 0x40, 0xc0, + 0xa6, 0x92, 0xb4, 0xc6, 0x70, 0xac, 0x71, 0xae, 0x98, 0xd3, 0x7d, 0xd4, + 0xb2, 0x92, 0x78, 0xd3, 0xac, 0x9d, 0x98, 0x4b, 0x6a, 0xcd, 0xa3, 0xbe, + 0x9c, 0xc5, 0xcf, 0x90, 0xcb, 0x51, 0x5c, 0x95, 0x71, 0xd8, 0xc9, 0x7e, + 0xb7, 0xc1, 0x7f, 0x7f, 0x81, 0x8c, 0xab, 0x44, 0xa3, 0x86, 0xad, 0x31, + 0x35, 0x45, 0x6a, 0x36, 0x8f, 0x8b, 0x83, 0x88, 0x47, 0x34, 0xbe, 0x91, + 0xae, 0xd6, 0x85, 0x69, 0x6e, 0x46, 0xca, 0x94, 0x3f, 0x4d, 0xca, 0xbe, + 0x5e, 0x9a, 0x49, 0x7e, 0x60, 0x92, 0x41, 0x88, 0x59, 0x35, 0x67, 0x78, + 0x9d, 0xcd, 0xd1, 0xcd, 0x66, 0x4b, 0x65, 0xbc, 0x41, 0x77, 0x57, 0xaf, + 0x51, 0x6d, 0xc0, 0x89, 0x55, 0xa8, 0xba, 0xad, 0x61, 0xbf, 0x75, 0xad, + 0x65, 0xd8, 0xa1, 0xaf, 0xa2, 0x80, 0xb4, 0xc3, 0x48, 0xc3, 0x3f, 0x45, + 0x37, 0x3d, 0xd4, 0x5a, 0xaa, 0xc7, 0xc4, 0x7f, 0x9a, 0xb1, 0xc2, 0x50, + 0x82, 0x4d, 0xc4, 0x94, 0x5b, 0xaa, 0x3a, 0xaf, 0x2d, 0xd7, 0x60, 0xcd, + 0x53, 0x8e, 0x53, 0xaa, 0xa2, 0x4e, 0x7b, 0xaf, 0x70, 0x63, 0x4b, 0x9d, + 0x59, 0x5b, 0x79, 0x62, 0xb7, 0xae, 0xb8, 0x30, 0x5b, 0xaa, 0x7e, 0xae, + 0xb4, 0x9a, 0x47, 0x6c, 0x5e, 0xb8, 0xd1, 0x47, 0x6e, 0x74, 0x9f, 0x77, + 0x38, 0xb0, 0x43, 0x68, 0xb9, 0x72, 0x8a, 0x95, 0x57, 0x92, 0x81, 0x8d, + 0x66, 0x98, 0x4d, 0xa2, 0x33, 0x9c, 0x81, 0x46, 0x7a, 0x4c, 0xc6, 0xcb, + 0x43, 0x7e, 0x9f, 0x93, 0x72, 0x37, 0x8d, 0xae, 0x43, 0x3d, 0xbd, 0x98, + 0x43, 0x58, 0x80, 0xa7, 0x44, 0xbe, 0x64, 0x3c, 0x40, 0x4b, 0x5a, 0xac, + 0xa6, 0x89, 0x38, 0xaf, 0xac, 0x70, 0x51, 0xb3, 0x6d, 0x73, 0xaa, 0xc4, + 0x4d, 0xcb, 0x4c, 0x6b, 0xa2, 0x37, 0x9c, 0x56, 0xa2, 0xb8, 0xce, 0x3b, + 0x5b, 0xbe, 0x7a, 0x8c, 0x48, 0xd2, 0xe0, 0x7c, 0x46, 0x69, 0xd3, 0x2e, + 0x5b, 0x99, 0x6a, 0x4a, 0xce, 0xa5, 0x9e, 0x7e, 0x71, 0xd0, 0xa8, 0x47, + 0x43, 0x66, 0x71, 0x7b, 0x6f, 0x5a, 0xc1, 0x79, 0x47, 0x74, 0x8c, 0x4d, + 0x2f, 0x6c, 0x6b, 0x8b, 0x7b, 0xbf, 0xc9, 0x8c, 0x95, 0x6f, 0x8e, 0xab, + 0xbe, 0x9d, 0xc9, 0x99, 0xb2, 0x53, 0x72, 0x5b, 0x9f, 0x9f, 0xcf, 0xcd, + 0x36, 0x97, 0xab, 0xd0, 0x6c, 0x84, 0xa9, 0x48, 0x74, 0xa0, 0xaf, 0xb7, + 0x76, 0x95, 0x44, 0xbc, 0xa3, 0xbb, 0x55, 0x32, 0x72, 0x5b, 0x68, 0x32, + 0x66, 0xd3, 0x75, 0xc4, 0x95, 0x8c, 0x5f, 0xa8, 0x84, 0x9a, 0xbd, 0x5b, + 0x49, 0x3b, 0xa0, 0x45, 0xab, 0xa8, 0x98, 0x91, 0x9b, 0x8a, 0x5e, 0xc4, + 0xc2, 0x60, 0x61, 0xd2, 0xb3, 0x79, 0x50, 0xa0, 0x97, 0xcf, 0x8b, 0x96, + 0x82, 0x6a, 0xd3, 0xaf, 0xb9, 0x78, 0x70, 0x85, 0x7d, 0xc7, 0x54, 0xc0, + 0xd3, 0xbd, 0x4b, 0x8d, 0x49, 0x92, 0xb4, 0xaa, 0xa4, 0xbc, 0xac, 0xac, + 0x4e, 0xcf, 0xb0, 0x68, 0x57, 0x3f, 0xc5, 0x61, 0xc9, 0x46, 0xb7, 0xbd, + 0x44, 0xb6, 0x7b, 0xb1, 0x93, 0x42, 0xc4, 0x66, 0x50, 0x71, 0x6c, 0xb5, + 0xd2, 0xaf, 0x55, 0x6a, 0x98, 0x4a, 0x95, 0x73, 0x87, 0x4c, 0xb3, 0xab, + 0x31, 0xb9, 0xaf, 0xb4, 0x71, 0x51, 0x7b, 0x41, 0x5d, 0xd7, 0xd8, 0x63, + 0x61, 0xa2, 0x5f, 0xc3, 0xa8, 0xc7, 0x56, 0xa7, 0x82, 0x45, 0x8c, 0x45, + 0xc4, 0x80, 0xac, 0x5e, 0x9c, 0xaa, 0xa1, 0x58, 0x6b, 0x9d, 0x51, 0xb3, + 0xc4, 0xbc, 0xc0, 0xb0, 0x90, 0x9b, 0xd4, 0x74, 0x96, 0x98, 0x3d, 0xb5, + 0xab, 0x74, 0x3c, 0x6f, 0xb9, 0xb3, 0x96, 0x5a, 0xb9, 0x97, 0xba, 0xbf, + 0x52, 0x77, 0x5e, 0x4d, 0x88, 0x4d, 0xa1, 0xc5, 0x54, 0xab, 0x77, 0x7d, + 0x62, 0x6f, 0x93, 0xb0, 0x7e, 0x75, 0xb5, 0x9e, 0x84, 0x8d, 0x93, 0x53, + 0x70, 0xa5, 0x75, 0x99, 0x48, 0x36, 0x6f, 0x42, 0x73, 0xcd, 0xaa, 0x48, + 0x48, 0x59, 0xb5, 0x6c, 0xc1, 0x81, 0x5c, 0x34, 0xb9, 0x58, 0xbd, 0x4d, + 
0xab, 0x3a, 0x49, 0x40, 0xbc, 0x73, 0x89, 0xd0, 0xbf, 0x62, 0x39, 0xad, + 0x95, 0xd0, 0x8a, 0x80, 0x6d, 0x76, 0x56, 0xb6, 0xb2, 0xbe, 0xb9, 0xcc, + 0x72, 0x5a, 0x54, 0x39, 0x9a, 0xaa, 0x6c, 0x32, 0x80, 0x66, 0x5f, 0xaf, + 0x63, 0xbc, 0x87, 0x89, 0x42, 0x76, 0xc5, 0x4b, 0xcd, 0x75, 0x93, 0x82, + 0x5f, 0xc6, 0x79, 0x3a, 0x4d, 0xc1, 0x69, 0x61, 0xa5, 0x6c, 0x60, 0x5d, + 0x55, 0xc9, 0x40, 0x8d, 0x2f, 0xbe, 0x72, 0xbf, 0x9b, 0x94, 0x8d, 0x4c, + 0xbf, 0xcc, 0x4b, 0xba, 0x66, 0x7c, 0xb5, 0x82, 0xd2, 0x8c, 0x94, 0xa9, + 0x8b, 0x80, 0x68, 0x8e, 0x83, 0x4e, 0x9c, 0xb7, 0x81, 0xa9, 0x66, 0x3e, + 0x8e, 0xd4, 0x71, 0x52, 0x88, 0xc2, 0x34, 0xa7, 0x7c, 0xae, 0x51, 0x7c, + 0x59, 0xd0, 0x65, 0x90, 0x6a, 0x71, 0xa1, 0x6e, 0x5c, 0x47, 0x5a, 0x96, + 0x62, 0x50, 0x74, 0x47, 0x51, 0x88, 0xcb, 0x5a, 0x7a, 0x4e, 0x76, 0xd0, + 0x9e, 0xba, 0x54, 0x7c, 0x6d, 0x33, 0xc8, 0x42, 0x36, 0x7a, 0x5f, 0x7e, + 0xc9, 0x8c, 0x9c, 0x93, 0x55, 0x8e, 0x53, 0xc4, 0x71, 0xd2, 0x44, 0xcd, + 0x66, 0xc8, 0x3b, 0x87, 0x5f, 0x2f, 0x75, 0x8d, 0x34, 0x5e, 0xa5, 0x8e, + 0x96, 0x64, 0x72, 0x60, 0x4b, 0x7f, 0x47, 0x69, 0x5f, 0x33, 0x74, 0x96, + 0x75, 0x63, 0x45, 0x66, 0xd2, 0xd3, 0x34, 0x65, 0x4d, 0x96, 0x79, 0x81, + 0x9a, 0x60, 0x7f, 0x45, 0xbf, 0x56, 0x5a, 0xc9, 0xb4, 0x59, 0x54, 0x4a, + 0x4d, 0x7d, 0x3d, 0x7a, 0x41, 0x5c, 0xc6, 0x4b, 0x83, 0x3f, 0x69, 0xb3, + 0x90, 0xab, 0x65, 0xbc, 0x50, 0x64, 0xb9, 0xb9, 0xaf, 0x5f, 0x7d, 0xd6, + 0x5f, 0xa6, 0x54, 0x63, 0x68, 0x80, 0xaa, 0xa1, 0x9c, 0x92, 0xc6, 0x4e, + 0x81, 0xcb, 0x34, 0xc5, 0x7f, 0x6b, 0x6a, 0x65, 0xa7, 0xc2, 0x5f, 0x68, + 0x8c, 0xcd, 0x60, 0x3c, 0xd0, 0xb2, 0x77, 0x32, 0x64, 0x86, 0xb7, 0xcc, + 0x56, 0xbe, 0x83, 0x63, 0x94, 0x32, 0xc3, 0xc0, 0x49, 0x9d, 0x83, 0x44, + 0x5d, 0xbd, 0xb9, 0xb6, 0x57, 0x3a, 0xa2, 0xd6, 0x68, 0xab, 0xc5, 0xb9, + 0x43, 0x53, 0x40, 0xbb, 0xa3, 0xc1, 0x79, 0xad, 0xd1, 0x53, 0x98, 0x89, + 0x52, 0xba, 0xad, 0x9c, 0x57, 0xc2, 0x5d, 0x61, 0x40, 0xc7, 0x99, 0x3c, + 0xa9, 0xa2, 0x77, 0x38, 0xba, 0x41, 0x49, 0x5a, 0x99, 0x9c, 0x47, 0x3f, + 0x4b, 0x99, 0xd1, 0x83, 0x8b, 0x6a, 0x4b, 0x9e, 0x66, 0x80, 0x97, 0x5d, + 0x82, 0xa2, 0x78, 0x98, 0x4b, 0xb3, 0x56, 0x73, 0xac, 0x88, 0x57, 0xcc, + 0x75, 0x62, 0x5b, 0xcc, 0xc0, 0x6f, 0x93, 0xb7, 0x8b, 0x4b, 0xa6, 0x45, + 0x7a, 0x7a, 0x30, 0xd1, 0x70, 0x40, 0xc7, 0xa0, 0x56, 0x83, 0x89, 0x7d, + 0xb8, 0x67, 0x60, 0x5a, 0x9c, 0x3c, 0x95, 0xce, 0x88, 0x89, 0x6d, 0xab, + 0x53, 0xd3, 0x7b, 0x7b, 0xd1, 0x90, 0xca, 0xc7, 0x7d, 0x63, 0xa4, 0xa4, + 0x84, 0xd3, 0xc1, 0xcb, 0xa6, 0x4e, 0x98, 0x64, 0x77, 0x47, 0xa5, 0xa4, + 0x86, 0x75, 0x7b, 0x9c, 0x9c, 0xbe, 0x5e, 0x5e, 0xb4, 0xce, 0x8b, 0x3d, + 0xa6, 0x56, 0xa6, 0x3d, 0x72, 0x73, 0x6c, 0x39, 0xbf, 0x9f, 0x6c, 0xa8, + 0x80, 0x49, 0x9d, 0x6f, 0xa8, 0x6b, 0x65, 0x9e, 0x4e, 0xa1, 0x59, 0xc9, + 0x5b, 0xbf, 0x7d, 0x63, 0xa8, 0xd3, 0x83, 0x89, 0xa0, 0x53, 0x72, 0x72, + 0x61, 0xd6, 0xb3, 0x80, 0x88, 0x4d, 0xba, 0x34, 0xa3, 0xac, 0xb0, 0x9c, + 0xa8, 0x75, 0xb5, 0xc8, 0x77, 0x31, 0x88, 0xa5, 0xbd, 0x40, 0xc6, 0x3d, + 0x5f, 0x3d, 0x50, 0xca, 0x6a, 0x38, 0xaa, 0x97, 0x45, 0xb3, 0x57, 0x33, + 0x44, 0x60, 0x7e, 0x61, 0x80, 0xb6, 0x6f, 0x81, 0x61, 0x5f, 0x9c, 0x86, + 0x8d, 0x95, 0x3d, 0x37, 0xaf, 0x79, 0x75, 0x89, 0x42, 0x39, 0xb2, 0xc9, + 0x6f, 0x59, 0xc2, 0x57, 0xca, 0x94, 0x49, 0x63, 0x2e, 0x58, 0x7d, 0x8a, + 0x94, 0x7b, 0x9b, 0xb1, 0x49, 0xb5, 0x7a, 0xb9, 0xc3, 0xa8, 0xd4, 0x6d, + 0x3a, 0x90, 0xb3, 0x57, 0x35, 0x60, 0x6d, 0xd4, 0x40, 0x89, 0x3d, 0x3a, + 0x52, 0xa0, 0x4b, 0xc8, 0x94, 0xcd, 0x8c, 0x33, 0x3c, 0x69, 0x54, 0xc1, + 0x61, 0xaf, 0x67, 0x7e, 0x4b, 0xc1, 0x9c, 0x9e, 0x43, 0x89, 0x66, 0xbe, + 
0x76, 0x4c, 0xc8, 0x32, 0x7c, 0x93, 0xbf, 0x78, 0xc9, 0x98, 0xb4, 0x9e, + 0x9a, 0x4f, 0x86, 0x5c, 0x9c, 0xba, 0x33, 0xc0, 0x92, 0x95, 0xb5, 0x57, + 0x3f, 0xab, 0x8e, 0x36, 0xb8, 0xb5, 0xb8, 0xba, 0x5d, 0xb5, 0x68, 0x37, + 0x4b, 0xb2, 0x66, 0xcc, 0xb4, 0x4a, 0x53, 0x8c, 0x3f, 0x4b, 0x49, 0xc0, + 0xc2, 0xb7, 0x7e, 0x55, 0x8a, 0xb8, 0x5b, 0x4d, 0x37, 0x6a, 0x42, 0x31, + 0x88, 0x6c, 0x83, 0x33, 0x9a, 0x48, 0x38, 0xa3, 0x47, 0xbe, 0xc2, 0x39, + 0x87, 0x48, 0x45, 0x41, 0xd2, 0xc6, 0xab, 0x91, 0xa2, 0xa9, 0x9c, 0x35, + 0x8a, 0x41, 0xb0, 0x8a, 0x54, 0x3c, 0x42, 0x46, 0x99, 0xc4, 0xbe, 0xc5, + 0x8e, 0x7c, 0x95, 0xcd, 0x61, 0x3b, 0xbe, 0x4d, 0x4a, 0x67, 0xce, 0xaf, + 0x47, 0xb8, 0xbe, 0x64, 0x92, 0x73, 0x63, 0x63, 0x43, 0xa7, 0xa1, 0x7a, + 0xbe, 0xcd, 0x8b, 0xab, 0xc9, 0x75, 0x3d, 0x3d, 0x93, 0xae, 0xc6, 0x63, + 0x63, 0x7e, 0xd6, 0xcb, 0x58, 0x63, 0x9f, 0x6a, 0x32, 0x74, 0x71, 0x3c, + 0xbb, 0x73, 0x6a, 0x9e, 0xae, 0x67, 0x55, 0x4b, 0xb0, 0x46, 0xb9, 0xc8, + 0xbf, 0x78, 0xc5, 0x41, 0xb6, 0x58, 0xd1, 0xa3, 0xbf, 0xa7, 0x63, 0x8b, + 0x66, 0xa9, 0xb4, 0x61, 0xb4, 0x5f, 0x4b, 0x5c, 0x40, 0x82, 0x51, 0xbc, + 0xcf, 0x76, 0x64, 0x4d, 0xb8, 0x43, 0x81, 0x99, 0x4f, 0x27, 0x72, 0x44, + 0x81, 0x8d, 0xcd, 0x6f, 0x7a, 0x39, 0x6d, 0x8a, 0xc8, 0x92, 0xce, 0xd8, + 0x62, 0xcc, 0xa6, 0x8f, 0xdd, 0xc0, 0xba, 0xa5, 0x8a, 0xa0, 0x39, 0x50, + 0x2a, 0x4f, 0x64, 0x6c, 0x75, 0x51, 0x9b, 0x9f, 0xcb, 0x2e, 0x41, 0x80, + 0x60, 0x47, 0x9a, 0x65, 0x8a, 0xb9, 0x95, 0x4e, 0xc4, 0xc1, 0xb7, 0xa9, + 0xd1, 0x85, 0x7e, 0x9c, 0x94, 0xaf, 0x69, 0x93, 0x56, 0xa5, 0xbd, 0xb4, + 0x88, 0x5b, 0x33, 0x40, 0xb8, 0x8e, 0x60, 0x4a, 0x9c, 0xcb, 0x98, 0x3d, + 0x99, 0x77, 0xb0, 0x76, 0x8c, 0x75, 0xc4, 0x55, 0x42, 0x40, 0xc1, 0x5e, + 0x47, 0x60, 0x9c, 0xc1, 0x7e, 0x4e, 0x2a, 0x81, 0xb5, 0x52, 0x99, 0x65, + 0x56, 0x57, 0xc8, 0xbb, 0x91, 0x53, 0x69, 0x63, 0x8b, 0x94, 0xd5, 0x86, + 0x50, 0x61, 0x55, 0x90, 0xcb, 0xd9, 0x87, 0xb9, 0x96, 0x53, 0x65, 0x91, + 0x42, 0x2c, 0xdc, 0x91, 0xc4, 0xb2, 0xa0, 0x80, 0x9f, 0x7b, 0x9b, 0x56, + 0x36, 0x6b, 0xb4, 0x97, 0x75, 0xb5, 0x3e, 0x3f, 0x58, 0x70, 0x6b, 0x39, + 0x93, 0x88, 0xda, 0xb7, 0x4f, 0x70, 0x89, 0xae, 0x6d, 0x57, 0x2f, 0x6f, + 0xa8, 0xbc, 0x50, 0x35, 0xb1, 0xbd, 0xa0, 0xa6, 0xb8, 0x85, 0x46, 0x63, + 0xdf, 0x77, 0x49, 0x9b, 0x78, 0xd8, 0x87, 0xab, 0x74, 0xad, 0x8c, 0xca, + 0x87, 0x9a, 0x64, 0x6d, 0x51, 0xac, 0x2f, 0xd1, 0x38, 0x3d, 0x46, 0xa7, + 0xae, 0xb3, 0x8b, 0x8c, 0x4d, 0x5b, 0x78, 0x38, 0x83, 0x71, 0x7b, 0xd5, + 0x69, 0x52, 0x46, 0x44, 0x58, 0x5a, 0xa3, 0xc5, 0x39, 0xb6, 0x4c, 0x78, + 0x8e, 0x81, 0x49, 0xaa, 0x88, 0x27, 0x45, 0x46, 0xbc, 0x8e, 0xb5, 0xc3, + 0xa3, 0x56, 0x84, 0x46, 0x2e, 0x59, 0x63, 0x70, 0x7e, 0x86, 0x8e, 0x45, + 0x8f, 0xb0, 0x81, 0xb6, 0xce, 0x8e, 0x42, 0x3e, 0xb6, 0x55, 0xaa, 0x77, + 0xb1, 0x64, 0xb4, 0x6d, 0x3b, 0xa8, 0x35, 0xbd, 0x7e, 0xa4, 0x8a, 0x6c, + 0x9b, 0xd7, 0x7f, 0x9e, 0x57, 0xc6, 0x83, 0xbf, 0x6a, 0x97, 0x91, 0x4c, + 0x9c, 0x6b, 0x8b, 0x7a, 0xa5, 0x96, 0xcf, 0xb6, 0x74, 0x33, 0x7c, 0x99, + 0x95, 0x54, 0x88, 0x87, 0x5f, 0x45, 0x25, 0x45, 0xd9, 0x90, 0xc1, 0xa7, + 0x56, 0x5b, 0xcf, 0x63, 0x4b, 0x50, 0xaa, 0xcc, 0x52, 0xc2, 0xb1, 0xa1, + 0xa3, 0x90, 0xa2, 0x4a, 0x73, 0x4f, 0xcb, 0x4f, 0x8f, 0xb0, 0x92, 0xb4, + 0x9b, 0x57, 0x47, 0x3e, 0x5d, 0xab, 0xa2, 0x5b, 0xc5, 0x8d, 0xb0, 0x84, + 0x82, 0xd7, 0x3c, 0x88, 0x74, 0xa5, 0xc4, 0x47, 0x86, 0xaa, 0x52, 0x90, + 0x50, 0x9b, 0xc4, 0x72, 0x6f, 0x3e, 0x83, 0x42, 0xc7, 0xae, 0x73, 0xc8, + 0xd2, 0x7b, 0x61, 0x81, 0x56, 0x7f, 0x43, 0xc2, 0x6e, 0xc0, 0x31, 0x73, + 0xc0, 0x29, 0xae, 0xa3, 0xab, 0xa6, 0xab, 0x31, 0x55, 0x4b, 0xa1, 0x7e, + 
0x95, 0x5f, 0x52, 0x91, 0x91, 0xb4, 0x51, 0x88, 0x74, 0x50, 0x58, 0x81, + 0x87, 0x47, 0x51, 0x57, 0x68, 0xd6, 0x59, 0x81, 0x4d, 0x9b, 0x59, 0x4c, + 0x4e, 0x9d, 0x65, 0xa4, 0x8b, 0x80, 0xa0, 0x35, 0x79, 0x30, 0xcf, 0xbb, + 0x41, 0x6b, 0x45, 0x8f, 0x78, 0x79, 0xd7, 0x96, 0xa6, 0xb8, 0x43, 0x3d, + 0xac, 0xaf, 0x52, 0xaa, 0x8f, 0xa2, 0x80, 0x71, 0x9a, 0x5f, 0xcc, 0x9d, + 0x32, 0x4f, 0xc6, 0x3a, 0xbc, 0xb6, 0x56, 0xaa, 0x7e, 0x4d, 0x49, 0xd0, + 0x46, 0x8c, 0xd8, 0x6e, 0xbe, 0x54, 0xa0, 0xb5, 0xa1, 0x6d, 0x4f, 0x85, + 0x4c, 0xe2, 0x78, 0xa5, 0xc8, 0xbb, 0xbc, 0x7c, 0xb2, 0xa3, 0xab, 0x7a, + 0x6d, 0x42, 0x94, 0x8e, 0x39, 0x95, 0x32, 0x6e, 0xc3, 0xcb, 0x7c, 0x3e, + 0x54, 0x48, 0xa8, 0x80, 0xbc, 0x3b, 0x7a, 0x4f, 0x5d, 0x85, 0x9d, 0x42, + 0x87, 0x3c, 0x77, 0x4d, 0x4e, 0x33, 0x43, 0x4c, 0x30, 0xaf, 0x68, 0xa6, + 0xae, 0xca, 0x57, 0xc5, 0x45, 0x77, 0x43, 0xc7, 0x4d, 0x3d, 0x44, 0xa7, + 0x8e, 0xc4, 0xc5, 0x4e, 0x35, 0x75, 0x6b, 0x97, 0x7b, 0x8e, 0x8b, 0xd3, + 0x5b, 0x40, 0x9d, 0x7b, 0xab, 0x64, 0xca, 0x2f, 0x98, 0xb8, 0x65, 0xb4, + 0x3b, 0xc2, 0x4b, 0x46, 0xa1, 0xd3, 0xe0, 0x71, 0x53, 0x5f, 0x74, 0xd2, + 0x32, 0x5d, 0x25, 0x9a, 0x98, 0x4b, 0x93, 0xbd, 0xaa, 0xdc, 0x9d, 0x4e, + 0x76, 0x57, 0x9b, 0x9f, 0x4a, 0x4f, 0x66, 0x82, 0x57, 0x9b, 0x46, 0x50, + 0x81, 0x72, 0x5f, 0x55, 0x66, 0x82, 0x7c, 0xac, 0x99, 0x42, 0x94, 0x4b, + 0xd4, 0x95, 0x91, 0x6d, 0x4d, 0xd1, 0xc9, 0x6f, 0xad, 0xdc, 0xbd, 0x8d, + 0x86, 0x60, 0x82, 0xc9, 0xc7, 0xa6, 0xb8, 0x33, 0xc5, 0xc4, 0x81, 0x6c, + 0xa7, 0x4f, 0xb1, 0x2d, 0x88, 0x6d, 0x6b, 0x3b, 0x70, 0x4e, 0x71, 0x54, + 0x7c, 0x93, 0x62, 0x3a, 0xb7, 0x57, 0xc8, 0xb1, 0x90, 0x92, 0xb1, 0x98, + 0xdd, 0x3e, 0x9d, 0x75, 0x63, 0xab, 0x96, 0x96, 0x34, 0x5d, 0x4f, 0xc9, + 0x84, 0x83, 0xa7, 0x36, 0xae, 0x8d, 0xbe, 0xc3, 0x32, 0x51, 0xd9, 0x4f, + 0xba, 0x76, 0x8d, 0x49, 0x77, 0x6d, 0x6a, 0xaa, 0x8f, 0x85, 0x5d, 0x4b, + 0x79, 0xaa, 0x60, 0x8f, 0x34, 0xcb, 0x90, 0xc1, 0x3f, 0x9d, 0x6a, 0x60, + 0x8c, 0xb0, 0xca, 0xc4, 0x96, 0xa0, 0x8e, 0x51, 0xb4, 0x64, 0x6d, 0x2f, + 0xa1, 0xc4, 0x8f, 0x49, 0xaa, 0x3e, 0x5e, 0xc4, 0xd9, 0x89, 0x6b, 0xb5, + 0x85, 0xed, 0xb5, 0x6a, 0xad, 0xaf, 0x8a, 0x1a, 0xaa, 0xd0, 0xb1, 0x6e, + 0x25, 0x2f, 0xb7, 0x75, 0xa6, 0x55, 0x6a, 0xbd, 0xa0, 0x78, 0xb2, 0x43, + 0x8a, 0x6c, 0x58, 0xab, 0x61, 0xb5, 0x40, 0x4b, 0x1e, 0xaa, 0x91, 0x75, + 0x54, 0xcd, 0x4d, 0x6f, 0x9f, 0x73, 0x83, 0x99, 0x68, 0xc9, 0x59, 0x91, + 0x51, 0x53, 0xce, 0xa7, 0x85, 0xb1, 0xae, 0xdc, 0xc0, 0x86, 0x7b, 0x49, + 0x86, 0xb3, 0x99, 0x7d, 0x42, 0xb3, 0x8a, 0x3b, 0xc5, 0x8b, 0xa9, 0x6c, + 0x53, 0xbf, 0x91, 0xa4, 0x85, 0xb8, 0x8f, 0x4e, 0xb0, 0x43, 0x94, 0x9e, + 0xcb, 0x66, 0x50, 0x38, 0xc1, 0x9a, 0xba, 0x56, 0x6a, 0x8c, 0x9b, 0x6d, + 0x55, 0x8e, 0x58, 0xd8, 0x55, 0x49, 0x3a, 0x8e, 0x32, 0x59, 0x4e, 0xaf, + 0x71, 0xba, 0x7d, 0x64, 0x44, 0x93, 0x6b, 0xad, 0x89, 0x4c, 0xad, 0x36, + 0x4d, 0x4e, 0x36, 0xbb, 0x57, 0x50, 0xbd, 0x51, 0xb2, 0x20, 0xbe, 0xab, + 0x73, 0x34, 0xd1, 0x7a, 0x7c, 0x79, 0x9c, 0xa9, 0x44, 0x5c, 0xe3, 0xce, + 0x8c, 0x4a, 0xd8, 0x4a, 0x70, 0x82, 0x7d, 0x3a, 0x94, 0x3c, 0xb0, 0x9f, + 0x57, 0x3a, 0xaf, 0xa0, 0xbb, 0xb1, 0xcb, 0x8c, 0x47, 0xa5, 0x74, 0x49, + 0x42, 0xaf, 0xc3, 0x59, 0xaa, 0xc0, 0xd1, 0x8b, 0xbf, 0x94, 0x5c, 0x4d, + 0x87, 0xb0, 0x5d, 0x50, 0x54, 0x74, 0x59, 0xb9, 0xad, 0xd0, 0x88, 0x87, + 0x39, 0x91, 0x93, 0xa4, 0x5f, 0x53, 0xc1, 0xb2, 0x9b, 0x49, 0x80, 0x53, + 0x76, 0x36, 0x47, 0xc2, 0xbe, 0x90, 0x9c, 0x81, 0x53, 0xb2, 0x7e, 0xb6, + 0x8b, 0x41, 0x59, 0x4c, 0x39, 0x67, 0x3b, 0x5d, 0x6d, 0x5b, 0xb1, 0xa9, + 0xbe, 0x6f, 0xc4, 0x75, 0xbf, 0xce, 0x81, 0xb7, 0x52, 0x96, 0xb9, 0x59, + 
0x35, 0x8e, 0x4c, 0x93, 0xc6, 0xaa, 0x9a, 0xbc, 0xa2, 0xcf, 0xbb, 0xae, + 0x97, 0xcc, 0x70, 0x43, 0x5f, 0x96, 0xbc, 0x9e, 0xc8, 0x70, 0xc0, 0x5f, + 0xba, 0x85, 0x7f, 0xa2, 0xce, 0x81, 0x9f, 0x3c, 0x98, 0x42, 0xae, 0xa4, + 0x9a, 0xd1, 0x46, 0x77, 0xbc, 0x32, 0x5d, 0x8d, 0x41, 0x50, 0xc0, 0x89, + 0x4e, 0x5c, 0x2b, 0xa3, 0x46, 0x8a, 0x8a, 0x53, 0x6e, 0x60, 0x4d, 0xc4, + 0x48, 0xa0, 0xc1, 0x8d, 0xb1, 0x3d, 0x7c, 0x67, 0xa5, 0x7f, 0xc3, 0x65, + 0x24, 0x53, 0xb0, 0x3f, 0x4d, 0x60, 0x88, 0xc3, 0xb7, 0x74, 0x55, 0xc1, + 0xb8, 0x84, 0x8e, 0x3c, 0x7c, 0x71, 0x47, 0xb1, 0xac, 0x86, 0x65, 0x7e, + 0xc7, 0x45, 0x36, 0x24, 0xd9, 0x92, 0x7b, 0xd9, 0x6f, 0x92, 0xb9, 0x3b, + 0x52, 0x4b, 0x61, 0x61, 0xad, 0xb0, 0xb8, 0x54, 0x29, 0xbd, 0xac, 0x4e, + 0x43, 0x36, 0x68, 0x80, 0x65, 0x70, 0xdc, 0x7c, 0xa9, 0xaf, 0x6d, 0xc2, + 0xb2, 0x59, 0xdc, 0x76, 0x9c, 0x7e, 0x95, 0xb4, 0x47, 0x63, 0x48, 0x81, + 0x38, 0x44, 0xb3, 0x8b, 0x90, 0x48, 0x44, 0xc2, 0xa2, 0x28, 0x86, 0x3f, + 0x99, 0xe0, 0x98, 0x6d, 0xa9, 0xb9, 0xcd, 0x90, 0x38, 0xb7, 0x3c, 0x8a, + 0x49, 0xca, 0xc4, 0xab, 0xde, 0x84, 0x49, 0x7a, 0x7d, 0x47, 0x8c, 0xd5, + 0x77, 0x8d, 0x6b, 0x48, 0x97, 0x8a, 0x95, 0xc4, 0x72, 0x4b, 0x4a, 0x98, + 0x47, 0x58, 0x49, 0x77, 0x66, 0x47, 0x81, 0x7e, 0xa0, 0x7d, 0xb6, 0xd2, + 0xbb, 0xa4, 0x4f, 0x8a, 0x64, 0x92, 0xb9, 0xb8, 0x67, 0x44, 0x41, 0x4e, + 0x8f, 0x78, 0xd1, 0x46, 0x4a, 0x96, 0xbb, 0x77, 0x47, 0x3a, 0xa6, 0xa6, + 0xb5, 0xdc, 0xac, 0x58, 0x57, 0x7b, 0xae, 0x4f, 0x2f, 0xc2, 0x26, 0x56, + 0xc4, 0x4a, 0x6c, 0x9c, 0xbe, 0x5d, 0x87, 0x86, 0x76, 0xae, 0x78, 0xc2, + 0x42, 0xa1, 0x88, 0x85, 0xaa, 0xd7, 0x97, 0x9f, 0x34, 0xb8, 0x43, 0xc4, + 0xc2, 0x4f, 0x68, 0xd8, 0x3e, 0x50, 0x59, 0xcd, 0xc8, 0xba, 0x93, 0x93, + 0xbf, 0x87, 0x6c, 0x4a, 0x93, 0x97, 0x92, 0xb8, 0x96, 0x91, 0x95, 0x8c, + 0xb2, 0x61, 0x8b, 0x6f, 0xc5, 0x4e, 0x5c, 0xa1, 0x8c, 0xd0, 0xa1, 0x4a, + 0x4d, 0x5a, 0xb4, 0xc7, 0x6e, 0x51, 0x72, 0xa6, 0x40, 0x8b, 0x4d, 0xd6, + 0xc7, 0x9d, 0xc2, 0xc4, 0x73, 0x5e, 0x56, 0x6c, 0x75, 0xaf, 0x92, 0x7a, + 0x5f, 0xb0, 0xa3, 0x69, 0x88, 0x4d, 0x75, 0xad, 0x75, 0x6c, 0x3e, 0x84, + 0x61, 0x4c, 0x8d, 0xad, 0x61, 0x83, 0xb6, 0xc0, 0x8e, 0x7b, 0x5e, 0xd7, + 0x49, 0xbf, 0xcf, 0x98, 0x74, 0xbc, 0x83, 0xb5, 0x3a, 0x6f, 0xc4, 0xc8, + 0xb4, 0x85, 0xd7, 0x3c, 0xc2, 0x97, 0x29, 0xc0, 0xa9, 0x32, 0x77, 0x79, + 0x63, 0xaf, 0x6b, 0x9e, 0x93, 0x97, 0x41, 0x5d, 0x71, 0x30, 0x77, 0x56, + 0x46, 0x64, 0x62, 0x80, 0x45, 0x54, 0x4f, 0x59, 0x42, 0x7b, 0xa6, 0xb5, + 0xbc, 0xd5, 0x33, 0xa2, 0x89, 0x7b, 0x59, 0x49, 0xac, 0x2d, 0x8a, 0x34, + 0x9a, 0x96, 0x63, 0x95, 0x3b, 0x62, 0x4a, 0x78, 0x93, 0x4f, 0x9c, 0x61, + 0xca, 0x57, 0xb2, 0x78, 0x77, 0x9f, 0x48, 0xc2, 0x9f, 0xad, 0x8b, 0x80, + 0xc8, 0x3b, 0x3a, 0x43, 0x6a, 0x58, 0x9e, 0xc7, 0x66, 0x25, 0xac, 0x9b, + 0x54, 0x54, 0xba, 0x48, 0x8c, 0xba, 0x72, 0x87, 0x3e, 0x9f, 0xd3, 0x91, + 0x31, 0x9d, 0xb5, 0x5a, 0x93, 0x90, 0x8f, 0x99, 0xa7, 0xa2, 0xcb, 0x65, + 0xb2, 0x9b, 0x9f, 0x42, 0x84, 0xb5, 0x96, 0xae, 0x25, 0x36, 0x5d, 0x60, + 0xd3, 0xe6, 0x63, 0x78, 0xcc, 0xbf, 0x40, 0x41, 0xb4, 0x57, 0x3c, 0x49, + 0xcd, 0x99, 0x8f, 0x38, 0x40, 0x97, 0x39, 0x53, 0x98, 0x5f, 0xb3, 0xd3, + 0xa5, 0x8d, 0xae, 0x6e, 0x87, 0x7e, 0x86, 0x86, 0x69, 0x89, 0x4f, 0x43, + 0x2f, 0x61, 0xa9, 0xa2, 0xbe, 0x68, 0x68, 0x9f, 0x45, 0xa9, 0xd5, 0x48, + 0x51, 0x6b, 0xab, 0xc7, 0x76, 0xb9, 0x65, 0xd5, 0x9b, 0xc7, 0xcd, 0x92, + 0x74, 0x71, 0xc5, 0x3c, 0x33, 0x91, 0x64, 0x9d, 0x7d, 0x71, 0x96, 0x62, + 0x69, 0x82, 0xbf, 0x2d, 0xb6, 0x45, 0x42, 0x78, 0x54, 0x88, 0xb3, 0xc8, + 0x39, 0x4d, 0x97, 0xac, 0x2c, 0x63, 0xa4, 0x96, 0xb2, 0x80, 0x54, 0x54, + 
0xd2, 0x63, 0x19, 0x9b, 0xaa, 0x71, 0x8c, 0xa6, 0xa2, 0x82, 0x4d, 0x3d, + 0xb9, 0xe5, 0x73, 0x57, 0x76, 0x7f, 0x9b, 0x83, 0x54, 0x88, 0xbc, 0x5a, + 0x60, 0xb4, 0x5f, 0x58, 0x49, 0xba, 0x67, 0x47, 0x63, 0x68, 0x6f, 0x2f, + 0x95, 0xd0, 0xa4, 0xc3, 0x99, 0xb9, 0x32, 0xcc, 0x6e, 0xc0, 0x68, 0xc2, + 0x7f, 0x37, 0x75, 0x4e, 0x69, 0x5d, 0x8d, 0xba, 0xc1, 0xbc, 0x8d, 0x3c, + 0x55, 0xc4, 0xab, 0x9d, 0xb4, 0x79, 0xb6, 0xa0, 0x64, 0x91, 0x90, 0xb2, + 0x37, 0xad, 0x6d, 0x92, 0x51, 0x69, 0x52, 0xa9, 0x9a, 0x93, 0x8e, 0xd0, + 0xc9, 0x4e, 0x67, 0x4f, 0x8e, 0x3b, 0x46, 0x8b, 0x43, 0xa0, 0x8f, 0x94, + 0x3a, 0x72, 0x36, 0x7a, 0x87, 0x7f, 0x88, 0xc6, 0x5d, 0x55, 0x63, 0x7d, + 0x56, 0x61, 0x5c, 0x91, 0x9f, 0x85, 0xb1, 0x8d, 0xa9, 0xa3, 0x92, 0x56, + 0x93, 0x74, 0xaf, 0xa3, 0x58, 0x46, 0xbc, 0x59, 0xa4, 0x79, 0x84, 0x43, + 0xa8, 0x95, 0x63, 0x6b, 0x9d, 0x72, 0x54, 0x4c, 0xa9, 0x3b, 0xcb, 0x4b, + 0x81, 0x75, 0x6b, 0xc8, 0x5c, 0x8e, 0xba, 0x82, 0xc2, 0xb9, 0xbd, 0x67, + 0x63, 0x9f, 0x3b, 0x3a, 0x79, 0x22, 0x9e, 0x91, 0x82, 0x3e, 0x6b, 0x88, + 0x3f, 0x98, 0x32, 0xb9, 0x52, 0x49, 0x72, 0xa4, 0x39, 0xa4, 0xae, 0x3e, + 0xc9, 0x58, 0xcd, 0x9c, 0x99, 0xd9, 0xa8, 0x4e, 0x3c, 0x96, 0x4c, 0x5d, + 0xc7, 0xb3, 0xbf, 0x9a, 0xc5, 0xab, 0x8b, 0x96, 0x46, 0x79, 0x7a, 0x4f, + 0xae, 0xb7, 0x53, 0xa8, 0x4c, 0x8a, 0x8f, 0xaf, 0x8d, 0x4b, 0xa2, 0x97, + 0x6d, 0x91, 0x72, 0x99, 0xd3, 0xc9, 0xca, 0xa7, 0x69, 0x47, 0xbb, 0xbd, + 0x58, 0x47, 0x62, 0x4e, 0x52, 0x73, 0x98, 0x5a, 0x89, 0x9f, 0x6d, 0xba, + 0x8b, 0x6a, 0x62, 0x48, 0xcd, 0x3a, 0xca, 0x41, 0x8e, 0xaa, 0xad, 0x7e, + 0xbf, 0x7d, 0x68, 0x7f, 0x79, 0x30, 0xa6, 0xa6, 0x81, 0x56, 0x7a, 0x9d, + 0x38, 0x65, 0xab, 0x95, 0x71, 0xd2, 0x97, 0x47, 0x95, 0x73, 0x34, 0x5e, + 0x7c, 0xb6, 0xa0, 0x6c, 0x93, 0x43, 0xb7, 0x32, 0x9d, 0x64, 0x3a, 0x4a, + 0xb4, 0x80, 0x75, 0x49, 0xcb, 0x9d, 0x5e, 0xba, 0x52, 0x3b, 0x70, 0xa5, + 0xc9, 0x62, 0xb3, 0x82, 0x3a, 0xbd, 0xc4, 0xb4, 0x3b, 0x34, 0xbf, 0x41, + 0xcd, 0xb4, 0x79, 0x5b, 0x5a, 0xc9, 0x8f, 0xaf, 0xc2, 0xb8, 0x40, 0x70, + 0xae, 0x96, 0x8e, 0xd1, 0x5b, 0x48, 0x44, 0x62, 0x52, 0x31, 0x93, 0x59, + 0x86, 0xd2, 0xc6, 0x35, 0x9c, 0xa6, 0x53, 0xb6, 0x77, 0x4c, 0x97, 0xc6, + 0x91, 0x93, 0xba, 0x46, 0xa6, 0x9c, 0xc0, 0x8f, 0xce, 0xc6, 0xba, 0xbe, + 0xa0, 0xb4, 0xb3, 0x64, 0xc8, 0x7e, 0x99, 0x75, 0xc6, 0x3d, 0x85, 0xb1, + 0x6a, 0x4d, 0x6d, 0x7e, 0x4e, 0x42, 0x5c, 0xb2, 0x74, 0xbe, 0xd1, 0xc9, + 0x6f, 0xbd, 0x7f, 0x2a, 0x6f, 0x9a, 0xc7, 0x87, 0x6e, 0x72, 0xa8, 0xb9, + 0x8b, 0x5c, 0x38, 0x6b, 0x5a, 0xa2, 0x61, 0x49, 0x56, 0x2e, 0xaa, 0xaf, + 0x85, 0x94, 0x7b, 0xa7, 0x8e, 0xcf, 0xc2, 0x80, 0x64, 0x70, 0x5e, 0x71, + 0x3c, 0x93, 0x56, 0x82, 0x86, 0x34, 0x6f, 0xc6, 0xaf, 0x7d, 0x75, 0x73, + 0xcd, 0x95, 0xc9, 0x9d, 0x3d, 0x9e, 0x44, 0x57, 0x91, 0x49, 0x5b, 0x5e, + 0xc5, 0x6c, 0x9d, 0x38, 0x39, 0xc5, 0xc2, 0x6a, 0x9c, 0xb8, 0x82, 0x38, + 0xbf, 0x5e, 0x38, 0x5b, 0xc5, 0x9a, 0x9c, 0x6b, 0x58, 0xb1, 0x6c, 0x79, + 0xb1, 0x6f, 0xd6, 0xcf, 0xaa, 0x90, 0xa0, 0x52, 0xcd, 0xc3, 0x6d, 0x4a, + 0xbd, 0x83, 0x77, 0x83, 0x51, 0x97, 0x40, 0x69, 0x53, 0xb8, 0x46, 0xd1, + 0x97, 0xc6, 0xaa, 0x76, 0x92, 0xa6, 0x5f, 0xd1, 0x4c, 0xca, 0x4e, 0xac, + 0x86, 0x91, 0x27, 0xa1, 0xb0, 0xa0, 0x51, 0xb8, 0xc1, 0xce, 0x33, 0xbb, + 0x69, 0x80, 0x6f, 0x50, 0xab, 0x8b, 0xa0, 0x5a, 0x79, 0x55, 0x41, 0x86, + 0xce, 0xab, 0xae, 0x38, 0xcf, 0x3b, 0xc7, 0x48, 0x7e, 0x66, 0xc7, 0x4d, + 0xd4, 0xb7, 0x68, 0xc6, 0xc5, 0xc8, 0xc2, 0xa2, 0xba, 0xa4, 0x57, 0x51, + 0x6b, 0xab, 0x84, 0x50, 0x51, 0x71, 0xba, 0x35, 0x85, 0xc6, 0x46, 0x35, + 0x4f, 0x41, 0x92, 0x9b, 0x46, 0x65, 0xa7, 0x83, 0xc5, 0x94, 0x49, 0x46, + 
0xb6, 0x6b, 0x4b, 0xca, 0x60, 0x7f, 0xd5, 0xa3, 0x39, 0x6c, 0xc1, 0xc5, + 0x57, 0xb2, 0x38, 0xbb, 0x6c, 0x91, 0x38, 0x61, 0x3b, 0xc1, 0x94, 0xbb, + 0x81, 0x73, 0x94, 0x37, 0x49, 0x94, 0xc9, 0x64, 0x68, 0x5e, 0xbc, 0x5e, + 0x38, 0xce, 0xa2, 0x99, 0x53, 0x7d, 0x86, 0xa9, 0x6a, 0xb3, 0x7e, 0x8d, + 0x83, 0x53, 0xaf, 0x5e, 0x9a, 0x3d, 0x9b, 0x52, 0x53, 0x86, 0x5e, 0x37, + 0x3c, 0x8d, 0x92, 0x8a, 0x82, 0xc3, 0x8f, 0x30, 0x57, 0x82, 0x8a, 0x86, + 0xb6, 0x8d, 0x99, 0xbe, 0xb8, 0x77, 0xcd, 0xc3, 0x5f, 0x80, 0x4e, 0x7e, + 0x5c, 0x79, 0x49, 0x8f, 0x77, 0x34, 0xa7, 0x38, 0x5a, 0x88, 0xa6, 0x4d, + 0x4f, 0xa4, 0xcb, 0xa6, 0x49, 0x8d, 0x9a, 0xb0, 0x3e, 0x65, 0x69, 0xcd, + 0x89, 0x83, 0x93, 0x3e, 0xa8, 0x34, 0xba, 0x41, 0x3d, 0xb4, 0x49, 0x5f, + 0xc8, 0x5d, 0x53, 0x53, 0xa7, 0x51, 0x69, 0x3a, 0x35, 0xac, 0x55, 0xc7, + 0x90, 0xb5, 0x37, 0x3e, 0xa0, 0x32, 0xbc, 0x46, 0x98, 0x6c, 0x79, 0xc1, + 0x78, 0xb9, 0x57, 0x63, 0x72, 0x41, 0xcf, 0x5d, 0x91, 0x49, 0x32, 0xa3, + 0xc4, 0xb5, 0xb1, 0xcc, 0x81, 0x55, 0xb8, 0xc3, 0x7c, 0x5a, 0x71, 0x47, + 0x97, 0x42, 0x94, 0xbb, 0x74, 0x7d, 0xb3, 0x4e, 0xcd, 0x7e, 0x6d, 0x99, + 0xd1, 0x75, 0x68, 0x54, 0xbf, 0x83, 0x3b, 0x70, 0x41, 0x2a, 0xab, 0x98, + 0x83, 0x44, 0x6e, 0x97, 0xcd, 0x34, 0xa9, 0x7c, 0x4b, 0x4f, 0xd6, 0x64, + 0xd2, 0x32, 0xb3, 0x4c, 0x9b, 0xce, 0x9c, 0x32, 0xcd, 0x5f, 0x49, 0xb3, + 0x78, 0x2e, 0x47, 0x4d, 0xc3, 0x93, 0xa5, 0xbc, 0xaa, 0x4d, 0xa5, 0xa4, + 0x58, 0x3a, 0x96, 0x6f, 0x98, 0x8d, 0x40, 0x88, 0x2c, 0x61, 0x6c, 0x3b, + 0x74, 0x95, 0x57, 0x7c, 0x63, 0x7d, 0xcb, 0xb7, 0xa8, 0x8c, 0x8a, 0xaa, + 0x62, 0x81, 0x32, 0x79, 0x6b, 0xa6, 0x3a, 0x9e, 0x3f, 0x8e, 0xa3, 0x38, + 0x2c, 0xc7, 0x40, 0x97, 0x2e, 0xab, 0x7f, 0x83, 0x5d, 0xb9, 0x73, 0x74, + 0x59, 0x98, 0xd0, 0x54, 0x4e, 0x49, 0x5c, 0xa9, 0xd5, 0x81, 0x97, 0x81, + 0x38, 0xcc, 0xaa, 0x9c, 0xc1, 0xc9, 0x75, 0xac, 0x87, 0x8d, 0x4a, 0x38, + 0x71, 0xd7, 0xab, 0xb3, 0x94, 0x56, 0x35, 0x2e, 0xc9, 0x87, 0x68, 0x96, + 0x71, 0x59, 0xc8, 0xb2, 0x5d, 0x97, 0x6a, 0x69, 0x43, 0x99, 0xd3, 0x78, + 0x60, 0x85, 0x93, 0x69, 0x92, 0xa7, 0xbd, 0x49, 0x72, 0x52, 0x4e, 0x30, + 0xcd, 0xca, 0x41, 0x41, 0x3b, 0x64, 0xb3, 0x3b, 0x8f, 0x72, 0x5a, 0x59, + 0x5c, 0x50, 0x87, 0x7f, 0x62, 0x55, 0x7c, 0x9d, 0x30, 0x3d, 0x80, 0xcb, + 0x40, 0xd1, 0xd0, 0xa9, 0x50, 0xa0, 0x60, 0x7c, 0xba, 0xab, 0xd1, 0x44, + 0x2a, 0x5d, 0x63, 0xb3, 0xbc, 0x6f, 0x7b, 0x64, 0xb0, 0x73, 0x66, 0x36, + 0x75, 0x9b, 0x64, 0xc7, 0x21, 0xa1, 0x9a, 0x57, 0x64, 0xbf, 0x6f, 0x89, + 0x9b, 0x9d, 0xb1, 0x23, 0x43, 0x90, 0x6e, 0x40, 0x73, 0x71, 0x97, 0x7d, + 0x69, 0x49, 0xc2, 0x3e, 0x99, 0xbd, 0x8c, 0xc1, 0xaa, 0x98, 0x51, 0x8a, + 0x79, 0x9c, 0x42, 0xb7, 0xc3, 0x79, 0x8c, 0x9c, 0x8f, 0x4d, 0x8f, 0x86, + 0x56, 0x49, 0xbc, 0x5b, 0xc6, 0x47, 0xb4, 0xd0, 0xcd, 0x6e, 0x71, 0xa7, + 0x39, 0x70, 0x6d, 0x46, 0x33, 0x73, 0x99, 0x9c, 0x3c, 0xa0, 0x31, 0xa1, + 0x70, 0xab, 0x72, 0x55, 0xa3, 0x80, 0x63, 0xa7, 0xc1, 0x3e, 0xc4, 0x72, + 0x7a, 0x47, 0xc5, 0x53, 0xcd, 0x5a, 0xba, 0xd2, 0x76, 0x89, 0x74, 0x98, + 0x7e, 0x8a, 0x7c, 0xc0, 0xc1, 0xa0, 0xa4, 0xc2, 0xbf, 0xb5, 0x5f, 0x7b, + 0xca, 0xc4, 0x2f, 0xb1, 0xa2, 0x51, 0xad, 0x4e, 0x3a, 0x4d, 0x79, 0x91, + 0x39, 0xb3, 0x33, 0x60, 0x6f, 0xb2, 0xc1, 0x9a, 0xce, 0x67, 0x73, 0x55, + 0x7e, 0xc3, 0x4c, 0x8a, 0xcb, 0x67, 0x64, 0x52, 0x89, 0x3e, 0x65, 0x5f, + 0x2f, 0x61, 0x78, 0xd4, 0x8f, 0xbc, 0x54, 0x4e, 0x47, 0x70, 0xc9, 0x62, + 0x70, 0xc8, 0x57, 0x3b, 0xbb, 0x83, 0x8e, 0xbc, 0x80, 0xa8, 0x8c, 0xba, + 0x99, 0x95, 0xa7, 0x40, 0x89, 0x40, 0x82, 0x49, 0xa1, 0x5f, 0x2e, 0x6a, + 0x61, 0x45, 0xc3, 0x8f, 0xaa, 0xa3, 0x8f, 0x60, 0x35, 0xa2, 0x9c, 0xb8, + 
0xab, 0xa7, 0x86, 0x38, 0x32, 0xa0, 0xc8, 0xcd, 0x33, 0x95, 0xcb, 0x44, + 0x6c, 0x68, 0xa9, 0x53, 0x65, 0x9a, 0x9c, 0x64, 0x96, 0xd0, 0x6b, 0x68, + 0xa0, 0x3c, 0x9a, 0x93, 0x9f, 0xc0, 0xa4, 0x2f, 0x65, 0x68, 0x7d, 0x8b, + 0xbb, 0x93, 0xd0, 0x6f, 0xb2, 0xb9, 0x4f, 0x76, 0xa6, 0xcf, 0x2b, 0x35, + 0x93, 0xae, 0x64, 0x92, 0xc7, 0xab, 0x89, 0x37, 0xcd, 0xcb, 0xc8, 0x73, + 0x54, 0xa5, 0xc1, 0x5c, 0x4f, 0x81, 0x47, 0xca, 0xc4, 0x64, 0xd3, 0xbd, + 0x6f, 0x3b, 0x96, 0xcc, 0xb6, 0xc2, 0x52, 0x81, 0x9a, 0x62, 0xbe, 0x85, + 0x65, 0x88, 0xd4, 0x4c, 0xad, 0x4a, 0xbe, 0xb7, 0x3a, 0x8e, 0xc8, 0xa4, + 0x98, 0xdc, 0xa7, 0xbe, 0xa6, 0x89, 0x81, 0x48, 0xa2, 0x99, 0xa1, 0x44, + 0xc7, 0x3f, 0x9f, 0x2e, 0x30, 0xce, 0xcf, 0x64, 0x45, 0x9a, 0x9e, 0x86, + 0x6a, 0x7a, 0xc3, 0x68, 0xaa, 0x52, 0xbd, 0x31, 0x30, 0x86, 0xb9, 0xc0, + 0x4a, 0xc4, 0x45, 0x6b, 0x3d, 0xca, 0x64, 0x73, 0x4b, 0x84, 0x4a, 0x46, + 0x72, 0xc9, 0xd3, 0x3b, 0x61, 0x6a, 0x3b, 0xb2, 0x85, 0x91, 0xad, 0x91, + 0xac, 0x71, 0xba, 0x3b, 0xaf, 0x5e, 0x64, 0x59, 0x45, 0x54, 0x86, 0x76, + 0x87, 0x58, 0x43, 0xca, 0x92, 0x98, 0xac, 0xb5, 0x87, 0x5d, 0x98, 0x70, + 0x68, 0xd0, 0x40, 0x99, 0x47, 0xc8, 0x75, 0x38, 0x80, 0x2e, 0x4f, 0x2e, + 0xc0, 0xb6, 0x23, 0x43, 0x58, 0xcb, 0x6a, 0x6f, 0x79, 0x7f, 0x67, 0x64, + 0xbc, 0x7a, 0xa1, 0x43, 0x99, 0x37, 0x59, 0x74, 0xcc, 0xc1, 0xba, 0xa7, + 0x8b, 0xaf, 0xd1, 0x65, 0xd5, 0xa8, 0x40, 0x52, 0xa7, 0xa0, 0xa7, 0xb0, + 0x7e, 0x71, 0xbf, 0x56, 0xa3, 0xa6, 0x81, 0x61, 0x65, 0x41, 0x8c, 0x7e, + 0x6f, 0x35, 0x28, 0x7d, 0x3d, 0x64, 0x57, 0x4d, 0x47, 0xa5, 0x7b, 0x7a, + 0xa3, 0xb5, 0xa3, 0xc8, 0xb1, 0x8e, 0x4e, 0x7d, 0xa4, 0x74, 0x96, 0x81, + 0x63, 0xa7, 0xbd, 0xb8, 0x59, 0xbe, 0x7e, 0xa6, 0x88, 0x6d, 0x6b, 0x6b, + 0x8b, 0x8b, 0xb6, 0x69, 0x91, 0x9c, 0x9d, 0xa6, 0xa9, 0xd9, 0x95, 0xa9, + 0x41, 0x65, 0xc7, 0xa5, 0x5a, 0x41, 0xa0, 0x7a, 0x73, 0xa4, 0x93, 0xa4, + 0x87, 0x56, 0x96, 0x61, 0xa7, 0x92, 0x8c, 0xc0, 0x58, 0x76, 0x5e, 0x5b, + 0x24, 0xbe, 0x52, 0xb8, 0x32, 0x9a, 0xad, 0xb8, 0xc2, 0x37, 0x7e, 0x25, + 0x98, 0x6a, 0xd0, 0x40, 0x37, 0xad, 0x70, 0x95, 0x64, 0x3b, 0x52, 0xa7, + 0x92, 0xaf, 0xad, 0x93, 0xc4, 0x93, 0x80, 0xcb, 0x86, 0x98, 0xb7, 0xc1, + 0xa5, 0xba, 0x95, 0x24, 0x88, 0x9f, 0x41, 0x4a, 0xa1, 0xb4, 0xd5, 0x34, + 0x57, 0x9f, 0x81, 0x66, 0xae, 0x33, 0x8a, 0x79, 0x76, 0xa0, 0x2a, 0x9c, + 0x40, 0x32, 0x73, 0xad, 0x7c, 0x5c, 0x43, 0x93, 0x2e, 0xa0, 0x8d, 0xbb, + 0xc1, 0xc9, 0x41, 0x82, 0x72, 0x5a, 0x66, 0xbb, 0xac, 0xc5, 0xb3, 0x91, + 0xd3, 0xa8, 0x76, 0x4f, 0xb8, 0x79, 0x39, 0xa7, 0x43, 0xa4, 0x8c, 0x8f, + 0x8a, 0xb4, 0x9c, 0xcd, 0x78, 0x97, 0xba, 0xb2, 0xbe, 0x80, 0x9c, 0xb8, + 0x67, 0x47, 0x72, 0x54, 0x41, 0x60, 0xbe, 0x7f, 0xb7, 0x56, 0x29, 0xab, + 0xa8, 0x7f, 0x35, 0x89, 0x66, 0x70, 0x9c, 0x89, 0xb1, 0x8e, 0xc4, 0xd1, + 0x38, 0x6b, 0x5e, 0x56, 0x78, 0x67, 0x2b, 0x9d, 0xc1, 0x5e, 0x3d, 0x94, + 0xc0, 0x5c, 0x69, 0xbf, 0x8d, 0xd5, 0x3b, 0x69, 0x50, 0x9e, 0xc0, 0x6a, + 0x8a, 0x4a, 0x5b, 0x40, 0x78, 0xd1, 0x68, 0x70, 0xb5, 0x6d, 0x55, 0xa1, + 0x50, 0x95, 0x3e, 0x30, 0x3f, 0xd4, 0x75, 0x36, 0x46, 0x79, 0xe1, 0x86, + 0x73, 0x60, 0x2e, 0x9f, 0xd3, 0xb9, 0x96, 0x74, 0xd2, 0x4a, 0x54, 0xba, + 0x53, 0x54, 0x36, 0x7e, 0x52, 0x92, 0x50, 0xb6, 0x90, 0x5e, 0x86, 0xa3, + 0x2e, 0xab, 0x98, 0xb0, 0x80, 0xd3, 0xc1, 0x96, 0x59, 0x63, 0x6a, 0x5f, + 0xaf, 0x9a, 0xa6, 0xc6, 0x41, 0x5d, 0x69, 0x3b, 0x77, 0x72, 0xa6, 0x54, + 0x97, 0xbc, 0x96, 0x95, 0x86, 0xbf, 0xa9, 0x47, 0x56, 0x78, 0xc2, 0x62, + 0x53, 0xa9, 0x6f, 0x49, 0xc6, 0x74, 0xbf, 0x5c, 0xcd, 0xae, 0x2f, 0x3e, + 0x66, 0x51, 0x71, 0x73, 0x3e, 0x25, 0x53, 0x39, 0x63, 0xc9, 0x8f, 0xcf, + 
0x58, 0x48, 0x6a, 0x87, 0x36, 0xc9, 0x9c, 0x56, 0xb4, 0xbd, 0xb8, 0x82, + 0x9a, 0xa3, 0xcc, 0xbc, 0xb9, 0xb6, 0x94, 0x9d, 0xa8, 0x91, 0x68, 0x55, + 0x6d, 0xbf, 0x8d, 0x3a, 0xc4, 0x52, 0x50, 0x93, 0x83, 0x8a, 0x3f, 0xc0, + 0x92, 0x5a, 0x5a, 0xb0, 0x64, 0x3b, 0x94, 0x86, 0x82, 0xaf, 0xc6, 0x82, + 0x40, 0x6a, 0x70, 0x1e, 0x58, 0xab, 0x91, 0x74, 0x59, 0x7b, 0xa4, 0xc6, + 0x59, 0x36, 0x83, 0x96, 0xac, 0x3a, 0x97, 0xd0, 0x61, 0x4d, 0x65, 0x63, + 0xcc, 0x7a, 0xba, 0xce, 0x72, 0x4a, 0xbe, 0x47, 0xc5, 0x7b, 0x61, 0x72, + 0x90, 0x9a, 0x66, 0x73, 0x39, 0x29, 0x52, 0x3e, 0x3a, 0xb9, 0x3a, 0x85, + 0xbb, 0xc0, 0x5c, 0x7d, 0x96, 0xb8, 0x61, 0x26, 0x7b, 0x4f, 0x7c, 0x77, + 0x85, 0xcd, 0x5c, 0x6a, 0xa7, 0x56, 0xa1, 0xbd, 0x5f, 0xa3, 0x4b, 0x8a, + 0x70, 0x7a, 0xc5, 0x87, 0x82, 0xb4, 0x46, 0xaf, 0x34, 0x9e, 0xac, 0x70, + 0xa4, 0x9a, 0x42, 0xb1, 0xd5, 0xa6, 0xc6, 0x9c, 0xbb, 0x60, 0x84, 0x67, + 0x88, 0xab, 0x9d, 0xbf, 0xa0, 0x33, 0xd2, 0x73, 0x97, 0x71, 0x46, 0x45, + 0xd1, 0x4f, 0xab, 0x9e, 0xc5, 0x95, 0xa9, 0xdd, 0x97, 0x3a, 0x56, 0x44, + 0x67, 0xd3, 0xaf, 0x9f, 0x29, 0x5d, 0xcd, 0x9d, 0xb8, 0x94, 0x9f, 0x6c, + 0x73, 0xc6, 0xac, 0x90, 0x3d, 0x79, 0x44, 0x4c, 0xab, 0x4d, 0xc5, 0xce, + 0xc6, 0xa4, 0xb9, 0x54, 0xc8, 0xaa, 0x6e, 0x5b, 0x65, 0x7d, 0x29, 0xcd, + 0xbf, 0x5a, 0x49, 0x41, 0xc0, 0x5c, 0xb1, 0xb1, 0x4e, 0x83, 0x73, 0xa6, + 0x8c, 0x2f, 0x9c, 0xa6, 0x85, 0x51, 0xa0, 0xbb, 0x4b, 0x9b, 0x8c, 0x91, + 0x63, 0x9c, 0x4d, 0xc2, 0xd5, 0x70, 0xbc, 0x90, 0x4a, 0x4d, 0x45, 0x7f, + 0xc2, 0x5e, 0x85, 0x55, 0x6d, 0x57, 0x83, 0xbe, 0x40, 0x8c, 0x4f, 0x7d, + 0xbd, 0x83, 0xc4, 0xa7, 0x66, 0x38, 0xa9, 0x4b, 0x92, 0x5f, 0x39, 0x8a, + 0x28, 0xa1, 0x9a, 0x85, 0x51, 0x64, 0x72, 0x5f, 0xcb, 0x87, 0xab, 0xa3, + 0x5f, 0x2f, 0xcf, 0x7c, 0xa4, 0x8e, 0xa6, 0xc3, 0x7a, 0x98, 0x47, 0xa2, + 0x65, 0x76, 0xa4, 0xb8, 0x9b, 0xb1, 0x8d, 0xa5, 0xb1, 0xdd, 0x82, 0x8a, + 0x9c, 0x59, 0x83, 0x78, 0xd8, 0xb5, 0x5c, 0x63, 0x5f, 0x86, 0x93, 0xa1, + 0xb3, 0x4d, 0x74, 0x73, 0x8e, 0xb4, 0x5a, 0x8a, 0x59, 0x39, 0x8a, 0x49, + 0x93, 0xd1, 0xb5, 0x92, 0xa2, 0xb5, 0x5d, 0xa5, 0xa7, 0x61, 0xa9, 0xa4, + 0x7f, 0x6e, 0x91, 0x48, 0x41, 0x6e, 0x80, 0x52, 0xba, 0x7d, 0x97, 0x6e, + 0x5f, 0x43, 0xa9, 0x96, 0xd2, 0x91, 0xa8, 0x4f, 0x61, 0x8b, 0xa9, 0x40, + 0x48, 0x91, 0x6c, 0x4a, 0x6e, 0xc3, 0x91, 0x36, 0x6a, 0x6a, 0xaf, 0xc9, + 0x90, 0xd2, 0x47, 0xb8, 0x34, 0xcd, 0x71, 0x53, 0x36, 0x88, 0x84, 0x42, + 0xbb, 0x4e, 0xd0, 0xbe, 0x96, 0x9e, 0x56, 0x63, 0x6f, 0x6b, 0x45, 0x57, + 0x43, 0x9c, 0xc4, 0x38, 0xca, 0x61, 0x53, 0xa8, 0x47, 0xab, 0xc4, 0x3f, + 0xcb, 0x58, 0xd6, 0xb7, 0x6d, 0x72, 0xc0, 0x59, 0x6d, 0xcc, 0x34, 0x8a, + 0x91, 0xc6, 0x65, 0x60, 0x61, 0x43, 0xb1, 0xa9, 0x56, 0xb0, 0x40, 0xca, + 0xae, 0xc8, 0x6f, 0x50, 0xb0, 0xb4, 0xc1, 0xcc, 0x5f, 0x81, 0x47, 0xd3, + 0xd8, 0xb3, 0x59, 0xa0, 0xb1, 0x59, 0x6e, 0x41, 0xa8, 0xa0, 0x94, 0x45, + 0x2c, 0x85, 0x8d, 0x79, 0xaf, 0x47, 0x8c, 0xaf, 0x8b, 0xc1, 0xc0, 0x8f, + 0x89, 0x53, 0xce, 0xbd, 0x6b, 0x54, 0xca, 0xcf, 0x76, 0x61, 0x9b, 0x7a, + 0xa6, 0xcc, 0x94, 0x88, 0x3f, 0x7a, 0x70, 0xaa, 0x96, 0xa7, 0xb1, 0x57, + 0x93, 0x43, 0x94, 0x61, 0xa2, 0xc8, 0x72, 0x5a, 0x7b, 0x3c, 0xcd, 0x44, + 0x42, 0x77, 0x2f, 0x4b, 0xb6, 0x92, 0xc1, 0xcd, 0xc5, 0xc4, 0xae, 0xc7, + 0x8b, 0x98, 0x6c, 0x9d, 0xbe, 0x88, 0xbd, 0x85, 0x96, 0x39, 0x92, 0x91, + 0xa8, 0xaf, 0x6a, 0x5c, 0x72, 0x74, 0xda, 0xd0, 0xa9, 0xcc, 0xd0, 0x5e, + 0x93, 0x89, 0x88, 0x40, 0xae, 0x42, 0x5a, 0x43, 0x90, 0xb5, 0x83, 0xba, + 0xb4, 0xcc, 0xb4, 0x80, 0x41, 0x62, 0xab, 0xaa, 0x64, 0x8d, 0x86, 0x9b, + 0xb8, 0xa2, 0xa7, 0x9d, 0x63, 0x52, 0x49, 0x96, 0x8a, 0x9b, 0x4f, 0x8b, + 
0xd3, 0x5e, 0x6c, 0x8c, 0x40, 0x84, 0x7b, 0xcb, 0x9e, 0x81, 0x89, 0x7d, + 0xb9, 0x87, 0xb4, 0x36, 0x9a, 0xb9, 0x5d, 0xa4, 0x39, 0x94, 0x92, 0x55, + 0x9c, 0x61, 0x6c, 0xc7, 0xa6, 0x88, 0xdc, 0x87, 0x85, 0x84, 0x8a, 0x51, + 0x43, 0xbe, 0x5a, 0x39, 0x5a, 0x3c, 0xbf, 0x6b, 0x59, 0xce, 0x72, 0x78, + 0x77, 0x6f, 0x88, 0x73, 0xab, 0xa4, 0x4f, 0x6a, 0x83, 0x8c, 0xbd, 0x9a, + 0x6f, 0x46, 0xb6, 0x5b, 0xc0, 0x8a, 0x60, 0x34, 0xbf, 0xc6, 0x5b, 0xc8, + 0x60, 0x35, 0x4c, 0x52, 0x3a, 0x5f, 0xb8, 0x52, 0xcb, 0xc8, 0x95, 0x71, + 0x8a, 0x40, 0x36, 0x3b, 0x74, 0xaa, 0x68, 0x56, 0x47, 0x58, 0x9b, 0xbc, + 0x58, 0xb7, 0x9f, 0x67, 0x3e, 0xad, 0x3d, 0x9e, 0x49, 0xc8, 0x9d, 0xc0, + 0x8e, 0x80, 0xbc, 0x9d, 0x44, 0x90, 0x4c, 0x32, 0x31, 0x41, 0x63, 0x6c, + 0x51, 0xad, 0x39, 0x8c, 0x4b, 0xc5, 0x60, 0x80, 0xa8, 0x35, 0x7e, 0x38, + 0xa3, 0xba, 0xd3, 0x74, 0x94, 0x8c, 0x43, 0xa6, 0x80, 0x6b, 0xaa, 0xd2, + 0x8e, 0xc7, 0x52, 0x4d, 0xa3, 0x91, 0x69, 0x51, 0x6d, 0xa1, 0xa5, 0x5c, + 0x3b, 0xb4, 0xc9, 0x66, 0x59, 0x47, 0xa8, 0xa6, 0xd3, 0x80, 0x9c, 0xce, + 0x69, 0x3e, 0x69, 0xa5, 0x7d, 0xad, 0xa5, 0x54, 0xb0, 0x6d, 0x7b, 0xa2, + 0x37, 0xc7, 0x47, 0xa0, 0xd0, 0xa6, 0xc0, 0x97, 0x9e, 0xc5, 0xd1, 0xad, + 0x96, 0x9d, 0xcc, 0x6e, 0x7b, 0xbb, 0xca, 0xc9, 0xae, 0x7f, 0x7f, 0xa2, + 0xc5, 0x34, 0x6e, 0xc2, 0xd5, 0x64, 0xb5, 0xa5, 0x71, 0xac, 0x4f, 0x7a, + 0xaa, 0x91, 0x99, 0x8a, 0xba, 0x9f, 0x3b, 0x69, 0x5e, 0xad, 0x40, 0xa0, + 0x71, 0x50, 0x4b, 0x87, 0x4d, 0x54, 0x5c, 0xbe, 0x6a, 0x93, 0xbd, 0xc6, + 0x80, 0x50, 0x8f, 0xd3, 0x8a, 0x41, 0x61, 0x77, 0xa2, 0x76, 0xb3, 0xa4, + 0xba, 0x8d, 0xc0, 0x3b, 0xb7, 0xd5, 0x6d, 0x8c, 0xb9, 0xce, 0x88, 0x5a, + 0x7d, 0xc6, 0x80, 0x49, 0x80, 0xc0, 0x36, 0x31, 0x7b, 0x88, 0xab, 0x6c, + 0x9b, 0x77, 0xb3, 0x85, 0x50, 0xa8, 0x8c, 0xd2, 0xba, 0xcc, 0xa2, 0xda, + 0x71, 0xc6, 0x86, 0xda, 0xad, 0xd0, 0x95, 0x36, 0x60, 0x47, 0x43, 0x91, + 0xbe, 0x92, 0xa5, 0x7e, 0x57, 0x9f, 0xbc, 0xaf, 0x78, 0xaf, 0x8f, 0x4f, + 0x54, 0x46, 0xb0, 0x84, 0xb3, 0xb2, 0xb8, 0x9c, 0x8d, 0xc4, 0x41, 0x4a, + 0x81, 0x74, 0x2f, 0x8c, 0x62, 0xac, 0x9d, 0xa7, 0x4c, 0x91, 0xb0, 0xaf, + 0x79, 0x66, 0x3d, 0x9f, 0xcc, 0x4b, 0x9e, 0xc1, 0x60, 0xcc, 0xad, 0xd4, + 0x36, 0xa5, 0x4e, 0x8b, 0xc5, 0x35, 0xa5, 0x4a, 0xa1, 0x48, 0x81, 0xbe, + 0xb1, 0xd1, 0x5c, 0xb6, 0xa3, 0xa1, 0xce, 0x4b, 0x4b, 0x9f, 0xc3, 0xd2, + 0x8c, 0xa5, 0x99, 0x86, 0x98, 0x30, 0x9d, 0xbe, 0x92, 0xc5, 0x94, 0x3e, + 0x51, 0x9c, 0x6f, 0x99, 0xb1, 0x7e, 0x73, 0xc5, 0x78, 0xbb, 0x31, 0xa5, + 0xad, 0xcb, 0x5f, 0x36, 0xaf, 0xbf, 0x6a, 0x41, 0x87, 0x98, 0xb7, 0x5e, + 0x7d, 0x9d, 0x3c, 0xca, 0x99, 0x74, 0xcd, 0xa3, 0x67, 0xb7, 0x8c, 0x77, + 0x58, 0x42, 0x84, 0x71, 0x7a, 0x65, 0x5d, 0x9c, 0x81, 0xba, 0xa5, 0xc5, + 0x4d, 0x6d, 0x96, 0xc4, 0x31, 0xb7, 0x5e, 0x72, 0xcc, 0xce, 0x62, 0x5a, + 0x5a, 0x6b, 0x90, 0x7e, 0x8c, 0x86, 0x47, 0x34, 0xb9, 0x66, 0x5c, 0x7e, + 0x91, 0xae, 0x2c, 0x49, 0xb5, 0xb9, 0xd4, 0x55, 0xc7, 0x44, 0x46, 0xb8, + 0x78, 0x51, 0x92, 0xb7, 0xbc, 0x41, 0xab, 0x8d, 0xc1, 0x6a, 0x31, 0xc4, + 0x8d, 0xc6, 0x6c, 0x68, 0x4e, 0xcd, 0xa6, 0xa4, 0x93, 0xab, 0x3a, 0x3f, + 0xbe, 0xc2, 0x3b, 0xc8, 0xaf, 0x5f, 0x3f, 0x3c, 0x7b, 0xa0, 0x87, 0x68, + 0xcc, 0x91, 0xbd, 0x8d, 0xa0, 0xa9, 0x63, 0x48, 0x70, 0xa3, 0x64, 0xc9, + 0x55, 0xb3, 0x60, 0x6b, 0xbb, 0x77, 0xc1, 0xd0, 0x56, 0x45, 0x76, 0x52, + 0x64, 0x40, 0x8b, 0x6f, 0x8b, 0xc1, 0x3a, 0x79, 0x50, 0xd3, 0x44, 0xa5, + 0x69, 0xc9, 0x6f, 0xc2, 0xd0, 0x7a, 0x58, 0x8f, 0xbb, 0x9e, 0xa7, 0xa9, + 0x6b, 0xa7, 0x3a, 0xd5, 0xb4, 0xd1, 0x8f, 0xb3, 0x50, 0xc4, 0xbd, 0xc1, + 0x82, 0x4f, 0x5f, 0x69, 0x9b, 0xc9, 0x36, 0x77, 0x71, 0xd0, 0x73, 0x34, + 
0xb0, 0xb4, 0xbc, 0xb8, 0x32, 0x5e, 0xb9, 0x63, 0x60, 0xcf, 0x48, 0x44, + 0x5e, 0xa1, 0xc4, 0xd1, 0x92, 0x92, 0x78, 0xb4, 0x87, 0xb8, 0xc5, 0x4b, + 0xce, 0x97, 0x2d, 0x50, 0x37, 0xc1, 0xc4, 0x5a, 0x5b, 0xcd, 0x78, 0xb0, + 0x89, 0x4f, 0x8b, 0x8d, 0x3f, 0xc6, 0x90, 0x42, 0xaf, 0x92, 0x86, 0xb1, + 0xae, 0x81, 0xd0, 0x62, 0xa9, 0x69, 0xbb, 0x8b, 0x8e, 0x94, 0x9b, 0x97, + 0x7e, 0xbb, 0x99, 0x3e, 0x6a, 0xa9, 0xa6, 0x92, 0x74, 0x80, 0x43, 0x73, + 0x78, 0xa6, 0x80, 0x56, 0x5e, 0x78, 0x3d, 0x82, 0xb4, 0x86, 0xae, 0x8d, + 0x38, 0x64, 0x81, 0x92, 0x93, 0x92, 0x54, 0xd0, 0xbe, 0x7d, 0xce, 0xad, + 0x66, 0x6b, 0x77, 0x8f, 0x3b, 0x7d, 0x87, 0x9c, 0xc2, 0x81, 0xb1, 0xb4, + 0x48, 0xa6, 0x9b, 0x2e, 0xd3, 0x3f, 0x3a, 0x4b, 0x8e, 0x52, 0x4f, 0x93, + 0x4b, 0xa4, 0xd5, 0x3e, 0x79, 0x5d, 0x98, 0x7b, 0xd6, 0xd8, 0x46, 0x57, + 0x88, 0x3e, 0xce, 0xc4, 0xb5, 0x3e, 0xad, 0x4a, 0xa5, 0xb0, 0x2b, 0xc4, + 0x93, 0xb3, 0x98, 0xa2, 0x7c, 0xac, 0x9a, 0x3b, 0xaf, 0xb2, 0x2d, 0x7e, + 0xad, 0x6c, 0xaa, 0x84, 0x4d, 0xb3, 0x65, 0x7e, 0xa6, 0x55, 0x4c, 0x7f, + 0x81, 0x94, 0xb8, 0x66, 0xb1, 0x57, 0x61, 0x3f, 0x65, 0x37, 0xd8, 0xae, + 0x7e, 0x7e, 0xd0, 0x75, 0x61, 0xc9, 0x43, 0x9c, 0xc4, 0x48, 0x4f, 0x37, + 0x68, 0xab, 0x7a, 0x9e, 0x98, 0x9e, 0x38, 0x87, 0x54, 0x54, 0x84, 0xab, + 0x57, 0xc7, 0x97, 0x3e, 0x9a, 0xcf, 0x6f, 0x8d, 0x9e, 0xb8, 0xb3, 0x59, + 0x92, 0x60, 0x7e, 0x94, 0x5b, 0x28, 0x8b, 0x60, 0x76, 0x33, 0xa9, 0x77, + 0x66, 0x98, 0x6c, 0x5f, 0x9a, 0xc6, 0xc9, 0x39, 0xb9, 0x45, 0x56, 0x6b, + 0x3d, 0x3c, 0x66, 0x41, 0x7e, 0xc0, 0xbe, 0x6a, 0xaf, 0x7d, 0xd0, 0xcc, + 0x61, 0x59, 0x9d, 0x8c, 0x3c, 0x9a, 0xb2, 0x81, 0xa6, 0xb6, 0x49, 0x86, + 0x3f, 0x3b, 0x4d, 0x51, 0xc0, 0x7b, 0x39, 0x6f, 0x9b, 0x91, 0xa5, 0x77, + 0x2d, 0x38, 0xad, 0x7a, 0x83, 0x40, 0xb1, 0xa1, 0x6d, 0x44, 0x41, 0x9c, + 0x40, 0x39, 0xaa, 0xa2, 0xb1, 0x6f, 0x3a, 0x52, 0x89, 0x5f, 0x7c, 0xb7, + 0x8c, 0x61, 0x45, 0x6f, 0x5a, 0xa8, 0x8f, 0x4b, 0x55, 0x87, 0xa4, 0x56, + 0x58, 0x32, 0x57, 0x4c, 0x53, 0xcb, 0x31, 0x38, 0xce, 0x7a, 0x55, 0xd3, + 0x34, 0x90, 0x50, 0xae, 0xa3, 0xb2, 0x93, 0xae, 0x62, 0x4f, 0xb8, 0xac, + 0x66, 0x54, 0x99, 0xdb, 0xaa, 0x6b, 0x4f, 0xa9, 0xcb, 0x48, 0x54, 0x6a, + 0x6f, 0x54, 0x7b, 0xa7, 0xa5, 0xbd, 0x7f, 0x92, 0x39, 0x93, 0xac, 0x53, + 0x2a, 0x3c, 0xc3, 0xc9, 0x53, 0xa3, 0x99, 0x2f, 0xcf, 0x5c, 0x9a, 0x74, + 0x91, 0x67, 0xb6, 0x64, 0xb9, 0x76, 0x7f, 0x80, 0x9e, 0x7e, 0x69, 0x5c, + 0xb0, 0x37, 0xaa, 0x52, 0x8a, 0x49, 0x45, 0x53, 0xc8, 0x6f, 0x62, 0x68, + 0x44, 0x51, 0x82, 0x4f, 0x9e, 0x41, 0x63, 0xb5, 0x52, 0x9a, 0x80, 0x6d, + 0x45, 0xa3, 0xd3, 0x71, 0xb1, 0x36, 0x91, 0x6c, 0xc1, 0x94, 0x4c, 0x9d, + 0x7a, 0x8b, 0xc7, 0x60, 0xb3, 0xd9, 0xae, 0x5e, 0x4a, 0x76, 0x83, 0x3b, + 0x44, 0xa6, 0x9a, 0x65, 0x96, 0x6a, 0x99, 0xb9, 0x4a, 0x9e, 0x71, 0xbe, + 0x95, 0x6d, 0xb0, 0x7e, 0xb5, 0x73, 0xc4, 0xa1, 0x6c, 0x80, 0x84, 0x5f, + 0x49, 0x87, 0x74, 0xb8, 0x1f, 0xbb, 0x41, 0xb3, 0x9a, 0x46, 0x95, 0x3a, + 0x3b, 0x75, 0xae, 0xaf, 0x8d, 0xd1, 0x52, 0x46, 0x7e, 0x6b, 0x61, 0x74, + 0xcf, 0x67, 0xa0, 0x6e, 0xb9, 0x32, 0x9a, 0xa0, 0x4a, 0x9d, 0x94, 0x5b, + 0x35, 0x4a, 0x5d, 0xa4, 0xeb, 0xcd, 0x99, 0xc8, 0x98, 0xb9, 0x62, 0x62, + 0xd6, 0x48, 0x5f, 0xb8, 0x8a, 0x4e, 0x85, 0xcd, 0x71, 0x64, 0x94, 0x94, + 0x6e, 0xae, 0xba, 0xb9, 0x61, 0x9f, 0x78, 0x93, 0x53, 0xaa, 0x4d, 0x84, + 0x9b, 0xcf, 0x57, 0xc4, 0xc8, 0x89, 0x64, 0x4c, 0x97, 0xc0, 0xd9, 0xcd, + 0x88, 0x34, 0xa1, 0xb1, 0xdd, 0xab, 0xcf, 0x98, 0xa7, 0xcd, 0x85, 0x6c, + 0xbb, 0xae, 0x7f, 0x3d, 0x6c, 0xd2, 0x8a, 0x94, 0xcb, 0xbb, 0xcd, 0x69, + 0xc9, 0xc8, 0x78, 0x77, 0x9b, 0x49, 0x95, 0x79, 0xb4, 0x3a, 0x38, 0x78, + 
0x7a, 0x62, 0x54, 0x42, 0xc4, 0x8f, 0x46, 0x6e, 0xce, 0x47, 0x8d, 0x4e, + 0x6b, 0x51, 0x83, 0xaf, 0xc9, 0x6c, 0x43, 0x65, 0xac, 0x65, 0x74, 0x8f, + 0xc4, 0xc3, 0x67, 0x23, 0x77, 0x76, 0x8e, 0x7b, 0x7c, 0x84, 0xdc, 0xcd, + 0x4c, 0xaa, 0x98, 0x7c, 0xa1, 0x46, 0x40, 0x4e, 0x40, 0xbc, 0x7c, 0x6f, + 0xc4, 0x8f, 0x57, 0x94, 0xc0, 0x3d, 0xb9, 0x41, 0x61, 0xb5, 0x63, 0x5e, + 0x88, 0x3a, 0xc4, 0x77, 0x84, 0x79, 0xcb, 0xb2, 0x4c, 0x7a, 0x59, 0x98, + 0x5e, 0xc0, 0xc0, 0x6b, 0x5f, 0x6d, 0x34, 0xa0, 0xaa, 0x9b, 0x48, 0x65, + 0xa4, 0x84, 0x92, 0x4a, 0xbb, 0xc0, 0x7d, 0x8e, 0xa7, 0x87, 0xb7, 0x6a, + 0xb1, 0x8d, 0x67, 0x88, 0x8c, 0xb0, 0xa0, 0x33, 0x6c, 0x4c, 0x8b, 0x44, + 0xaf, 0x88, 0x88, 0xb4, 0x4f, 0x9a, 0xda, 0x5b, 0x82, 0x76, 0x9c, 0x79, + 0xc6, 0xa8, 0x7b, 0x7b, 0x57, 0x9b, 0x97, 0x6d, 0x90, 0x51, 0xc8, 0x78, + 0x9a, 0xa9, 0x49, 0xcc, 0xcc, 0x9c, 0xc7, 0xa4, 0x41, 0x45, 0xb9, 0xd3, + 0xda, 0x37, 0xa8, 0x3a, 0x7c, 0x96, 0x80, 0xbd, 0x9b, 0x6c, 0x4b, 0x33, + 0xaa, 0x9b, 0xba, 0x98, 0x54, 0x73, 0x85, 0xae, 0xc4, 0x47, 0x33, 0x3f, + 0x3f, 0xe0, 0x90, 0x48, 0xc5, 0x95, 0x46, 0x85, 0x76, 0xbd, 0xd4, 0xc8, + 0x3c, 0x6a, 0xaf, 0x92, 0x89, 0xce, 0x47, 0x81, 0x2d, 0xa0, 0x42, 0x40, + 0xa2, 0xa3, 0xc7, 0x4f, 0xce, 0x86, 0x95, 0x8a, 0xc9, 0x83, 0x56, 0x68, + 0x91, 0x84, 0xb4, 0x60, 0xa5, 0xc8, 0xa8, 0x40, 0x3d, 0xaa, 0xc9, 0xc1, + 0x58, 0x52, 0x89, 0x53, 0x48, 0x75, 0x8a, 0x48, 0x69, 0x7f, 0xb2, 0x9b, + 0x47, 0x6f, 0x3a, 0x44, 0xc2, 0x6d, 0xa8, 0x58, 0xaa, 0x73, 0xc5, 0x3f, + 0x4e, 0x64, 0x6b, 0x2d, 0x68, 0x34, 0x2f, 0xa0, 0x59, 0x52, 0x62, 0x8f, + 0x5e, 0x5f, 0xc9, 0xbd, 0x4e, 0xa4, 0xbd, 0x7b, 0xa3, 0x2c, 0x5d, 0x3b, + 0x8d, 0x29, 0x87, 0x65, 0x6b, 0x98, 0xae, 0x4e, 0xbc, 0x55, 0x39, 0xba, + 0x32, 0x65, 0xc1, 0x47, 0x6e, 0x55, 0xd7, 0x45, 0xa9, 0x3a, 0x3a, 0x75, + 0xce, 0x54, 0x40, 0x46, 0xc6, 0xd5, 0xbd, 0x68, 0x65, 0x99, 0xbd, 0x84, + 0x37, 0xa9, 0x78, 0xd1, 0xbf, 0x31, 0x46, 0x69, 0xb1, 0xc0, 0x47, 0x91, + 0x41, 0x95, 0x45, 0xc1, 0xca, 0x5a, 0xbf, 0xb9, 0x5b, 0xaa, 0x48, 0x44, + 0x40, 0xcd, 0x73, 0xc0, 0x7d, 0x54, 0x49, 0x40, 0x6e, 0xcc, 0x97, 0x3f, + 0x2f, 0x71, 0xb2, 0xcc, 0x93, 0xbf, 0xb6, 0x90, 0xb8, 0x65, 0x34, 0x91, + 0x47, 0x3f, 0x3b, 0xd0, 0x58, 0xb1, 0x9e, 0x58, 0xb2, 0x6e, 0xe0, 0x83, + 0xbf, 0xcf, 0x3d, 0xc7, 0x7e, 0xbb, 0xbf, 0x91, 0x5b, 0xc7, 0x99, 0x99, + 0x85, 0x95, 0xb0, 0x9e, 0xcd, 0x94, 0xb5, 0x95, 0x6d, 0x68, 0x4d, 0x61, + 0x31, 0xb7, 0x9f, 0xb4, 0x98, 0x9d, 0x44, 0x64, 0x43, 0x86, 0x6f, 0x70, + 0xb4, 0x44, 0x90, 0x43, 0x93, 0x45, 0x38, 0xa6, 0x59, 0x3f, 0x92, 0xd0, + 0x3f, 0xa9, 0xaf, 0xca, 0x42, 0x40, 0xb5, 0x9b, 0x6f, 0xcd, 0xc2, 0x4f, + 0x82, 0x65, 0x6e, 0x3c, 0x43, 0x5d, 0x2c, 0xb7, 0x83, 0x7a, 0x64, 0xb9, + 0x5f, 0x56, 0x54, 0x59, 0x99, 0xa7, 0xa6, 0x4b, 0xb4, 0x80, 0x31, 0x76, + 0xa1, 0xda, 0x5b, 0x6a, 0x5b, 0x52, 0xcd, 0xcd, 0x91, 0x3e, 0x69, 0x31, + 0xce, 0xc0, 0xa6, 0xa8, 0x4a, 0xc8, 0x2f, 0x47, 0x38, 0x74, 0x9e, 0x70, + 0x71, 0x50, 0x6e, 0xa2, 0x8a, 0x3f, 0x8a, 0xbe, 0x27, 0x91, 0x9f, 0x8f, + 0x45, 0xc1, 0x53, 0x33, 0xd8, 0x96, 0xac, 0xb7, 0xbf, 0xa1, 0x7e, 0xb3, + 0xb8, 0x96, 0xa1, 0xcf, 0x45, 0x53, 0x8f, 0x6c, 0xc6, 0x45, 0x49, 0xc4, + 0x3d, 0x8b, 0xdb, 0x83, 0x69, 0x9f, 0x70, 0x68, 0xb2, 0x92, 0x43, 0x7d, + 0x28, 0xc8, 0x42, 0x52, 0xab, 0x9a, 0x82, 0x53, 0xa3, 0xcd, 0xcb, 0x68, + 0xe5, 0x9a, 0x79, 0xaf, 0x74, 0x51, 0x7c, 0x4e, 0xc0, 0x73, 0xb1, 0x6a, + 0x30, 0x8c, 0x95, 0xc1, 0x4a, 0x58, 0x70, 0xa0, 0x6f, 0xa7, 0x55, 0x42, + 0x66, 0x63, 0x7d, 0xa6, 0xb0, 0xb0, 0xca, 0xce, 0x74, 0x4c, 0x58, 0xce, + 0x5f, 0x4f, 0x4a, 0x41, 0x74, 0xa0, 0xae, 0xa9, 0x7a, 0x5c, 0xa9, 0x56, + 
0x9a, 0x6c, 0xbb, 0x43, 0xa4, 0x6d, 0x85, 0xb5, 0x6f, 0x93, 0x5e, 0x57, + 0xb9, 0xcd, 0xb1, 0x3f, 0xaf, 0xc3, 0x89, 0x71, 0xb5, 0x6e, 0x5e, 0x71, + 0x78, 0x87, 0x78, 0x37, 0xa2, 0xb7, 0x62, 0x62, 0xb7, 0x58, 0xa3, 0x2f, + 0xba, 0xb2, 0x90, 0xad, 0x54, 0x62, 0x57, 0x90, 0x5b, 0x8a, 0x68, 0x53, + 0x71, 0x68, 0x62, 0x70, 0x44, 0x8f, 0x5f, 0xc0, 0x91, 0xb0, 0x75, 0x40, + 0x68, 0x34, 0xd1, 0xc4, 0x40, 0x64, 0x66, 0xc2, 0x4d, 0xcb, 0x68, 0x7e, + 0xa8, 0x4e, 0x6f, 0x43, 0x58, 0xb6, 0x90, 0xd2, 0xc9, 0xb5, 0x7f, 0x37, + 0x7d, 0x74, 0xcd, 0x3a, 0x4e, 0x7f, 0x61, 0x4d, 0xd3, 0x36, 0x3a, 0x6d, + 0x4c, 0x6f, 0x9a, 0xa6, 0x3d, 0xa9, 0xa0, 0xac, 0xbd, 0x44, 0xc2, 0x9d, + 0x55, 0x77, 0xb3, 0x61, 0xa7, 0x98, 0xb5, 0xc2, 0x82, 0xd0, 0x70, 0x64, + 0xb0, 0x71, 0xaf, 0x49, 0xae, 0x81, 0x7b, 0xdb, 0x7c, 0x55, 0x7d, 0x86, + 0x47, 0xbb, 0x37, 0x95, 0x95, 0x90, 0xd3, 0x63, 0x8c, 0xd0, 0x86, 0x93, + 0x6a, 0xb6, 0xba, 0x90, 0xc5, 0xb2, 0xbe, 0x4d, 0x5f, 0x6d, 0x7c, 0x85, + 0xd7, 0xc6, 0x9b, 0x2b, 0xb3, 0x48, 0xbf, 0x76, 0x9b, 0x4c, 0x59, 0xc5, + 0xbc, 0xa4, 0xc2, 0xb0, 0xc7, 0x9b, 0xd4, 0x9e, 0x9b, 0x61, 0x45, 0xb6, + 0x4b, 0xa3, 0xb0, 0x7c, 0xc7, 0x3f, 0x6f, 0x6b, 0x30, 0xa7, 0xab, 0xa0, + 0x6b, 0x76, 0x3e, 0x66, 0xb3, 0xbf, 0xc1, 0x71, 0x3e, 0x8f, 0x89, 0x73, + 0xc1, 0x86, 0x5a, 0x44, 0x39, 0x2b, 0x45, 0x9a, 0x94, 0x74, 0x71, 0xa1, + 0x90, 0x5f, 0x3b, 0x83, 0x33, 0x58, 0x46, 0xc7, 0x73, 0x81, 0x99, 0x82, + 0xc6, 0x43, 0x93, 0x97, 0x88, 0xbc, 0x4f, 0x3a, 0x72, 0x5b, 0x70, 0x7c, + 0x53, 0xb5, 0x51, 0x7d, 0x71, 0x7b, 0xa3, 0x85, 0x71, 0xba, 0xc2, 0x6d, + 0x7a, 0xce, 0x79, 0x70, 0x70, 0xd2, 0x3d, 0x49, 0x5f, 0xb4, 0x3b, 0x9d, + 0x87, 0xb8, 0x56, 0x6e, 0xc5, 0x88, 0x73, 0xc7, 0xa5, 0x8c, 0x41, 0x97, + 0xd5, 0x6f, 0x93, 0x87, 0xae, 0x3b, 0x73, 0x72, 0xbc, 0x2e, 0x6c, 0x56, + 0x9b, 0x64, 0xb4, 0x75, 0x9c, 0x4b, 0xbc, 0x71, 0x5b, 0x87, 0x56, 0xcd, + 0xcd, 0x7d, 0xb6, 0x41, 0x5f, 0x44, 0x99, 0xc4, 0xbb, 0x55, 0x6f, 0xae, + 0x52, 0x3b, 0x6b, 0x8b, 0x59, 0xa6, 0x6a, 0x31, 0xc8, 0x81, 0xa7, 0xbf, + 0x45, 0x4d, 0x8b, 0x38, 0x4e, 0x64, 0x89, 0x89, 0x55, 0xbc, 0x68, 0x93, + 0xa9, 0x50, 0xa1, 0xcc, 0xc6, 0x2e, 0x7b, 0x6d, 0x42, 0xd6, 0x43, 0x73, + 0x6a, 0x9c, 0x7a, 0xca, 0xb0, 0x8d, 0x4f, 0x81, 0x89, 0x45, 0x5b, 0x44, + 0x89, 0xa8, 0xb2, 0xbd, 0xac, 0x55, 0x9b, 0x8d, 0x3f, 0x32, 0x36, 0xb9, + 0x81, 0x39, 0x57, 0xad, 0x32, 0xc0, 0xd0, 0xa5, 0xb2, 0x49, 0x64, 0x82, + 0x6e, 0xa5, 0x84, 0x35, 0x5a, 0x6f, 0x6a, 0x6d, 0x2e, 0x5d, 0x4b, 0xc5, + 0x6b, 0x57, 0xd3, 0x83, 0xa1, 0x3a, 0x72, 0x52, 0x6e, 0x83, 0xce, 0xc4, + 0x9f, 0x59, 0x59, 0x99, 0x59, 0x6a, 0x9c, 0xac, 0x62, 0x6a, 0xb5, 0x52, + 0x42, 0x5c, 0x2f, 0x4a, 0x48, 0x7e, 0x54, 0x73, 0x31, 0x46, 0x84, 0x98, + 0x84, 0x40, 0x40, 0x6c, 0xcc, 0x42, 0xa9, 0x81, 0x30, 0x4c, 0xdd, 0x5b, + 0xc8, 0x3d, 0x6e, 0xba, 0xab, 0x8a, 0xc3, 0xbe, 0xd2, 0xc7, 0x68, 0xba, + 0x95, 0x38, 0x66, 0xcd, 0xc9, 0xaf, 0x49, 0x74, 0xcf, 0x88, 0x35, 0x3e, + 0xc7, 0x7d, 0x68, 0x89, 0x98, 0x89, 0x76, 0x35, 0x38, 0xc6, 0x4c, 0x71, + 0x6b, 0x7a, 0xa6, 0xa9, 0x56, 0x62, 0x5b, 0x56, 0x4e, 0x3c, 0xb7, 0xaf, + 0x7d, 0x33, 0x73, 0x92, 0x49, 0x85, 0xb7, 0xcb, 0x54, 0xc2, 0x77, 0xb9, + 0xb3, 0x69, 0x43, 0x56, 0xaf, 0x84, 0x7f, 0xa1, 0x91, 0x87, 0x3f, 0x73, + 0x8b, 0x58, 0xa8, 0x49, 0x57, 0xb0, 0xb9, 0xbc, 0x30, 0x74, 0xa9, 0x96, + 0x9f, 0x7f, 0x41, 0x8d, 0x6f, 0xcc, 0xc3, 0xad, 0x39, 0x53, 0x75, 0xb3, + 0x45, 0xc0, 0x8d, 0x40, 0xc6, 0x6c, 0xcd, 0x82, 0x3f, 0x91, 0x5b, 0x6e, + 0xca, 0x3d, 0x7c, 0x40, 0xaf, 0x9f, 0x3b, 0x52, 0x99, 0xb8, 0x49, 0x72, + 0x62, 0xcd, 0x6e, 0x4f, 0x71, 0x8a, 0x99, 0xcd, 0x4e, 0xc4, 0x7c, 0x8e, + 
0x30, 0xdf, 0x95, 0xc0, 0x68, 0xbc, 0xa1, 0x5d, 0x53, 0xb1, 0x68, 0x34, + 0xac, 0x73, 0x52, 0x7e, 0x6d, 0x83, 0x3f, 0x74, 0x53, 0x97, 0x66, 0x85, + 0xbb, 0xb5, 0x64, 0x82, 0x5d, 0xcd, 0x4f, 0xc1, 0xad, 0xb2, 0x90, 0xc7, + 0x8b, 0x7a, 0x7c, 0x9d, 0xc3, 0x83, 0x79, 0xb7, 0x8b, 0xce, 0xb2, 0x68, + 0x99, 0xd3, 0x42, 0x48, 0x5a, 0x44, 0x5e, 0x76, 0xc7, 0x50, 0x6a, 0x89, + 0x34, 0x85, 0xc9, 0xdc, 0x81, 0x73, 0xb4, 0x9f, 0xaa, 0x63, 0x8d, 0x9a, + 0x67, 0x4f, 0x3d, 0xca, 0x7a, 0x78, 0x41, 0x6b, 0x44, 0xc3, 0x51, 0xb9, + 0x73, 0x68, 0x70, 0x45, 0x77, 0x9a, 0x77, 0xb8, 0x96, 0x55, 0x78, 0xb9, + 0x43, 0x82, 0x51, 0x3c, 0x91, 0x8e, 0x60, 0xb4, 0x4c, 0xa8, 0xb9, 0x72, + 0xb5, 0x52, 0x90, 0x46, 0xce, 0x33, 0x76, 0x82, 0x82, 0x7d, 0xa7, 0x72, + 0x86, 0x9f, 0x8f, 0x64, 0x51, 0xba, 0x90, 0x5e, 0x99, 0x83, 0x65, 0x35, + 0xc8, 0xd2, 0xb5, 0x9c, 0x60, 0xa0, 0x7d, 0x5d, 0x65, 0x57, 0xc9, 0xcc, + 0x80, 0xba, 0x5a, 0x3e, 0x91, 0xae, 0xd1, 0x34, 0xc8, 0x82, 0xd2, 0x97, + 0xb6, 0xd6, 0xb9, 0x7e, 0xaa, 0x43, 0x77, 0xbc, 0x3d, 0x31, 0x3c, 0x8a, + 0x5c, 0xb4, 0x73, 0x66, 0x5d, 0x77, 0xd1, 0xb4, 0x59, 0xc1, 0x49, 0x8f, + 0xbd, 0x8d, 0xa0, 0x4b, 0x7a, 0x87, 0xaa, 0x69, 0x48, 0x93, 0x91, 0x67, + 0xba, 0x80, 0xc7, 0x50, 0xa5, 0xac, 0x3a, 0x4d, 0x93, 0x45, 0x76, 0x96, + 0x64, 0xa1, 0x9f, 0x52, 0xa9, 0x56, 0x66, 0xbb, 0xda, 0x45, 0xba, 0x68, + 0x5d, 0xaa, 0x97, 0xa4, 0xb8, 0x6a, 0x7c, 0x3e, 0x3a, 0x80, 0x8f, 0xb5, + 0x4a, 0xb7, 0x51, 0x44, 0x45, 0x4f, 0xc0, 0xc9, 0x4e, 0x6e, 0x41, 0x86, + 0x59, 0x3f, 0x3a, 0x6c, 0x8b, 0x5a, 0x80, 0xbf, 0x5b, 0x38, 0x81, 0xa4, + 0x9e, 0xc2, 0x36, 0x5e, 0xae, 0xab, 0x91, 0x59, 0x53, 0x75, 0xbe, 0x4f, + 0x40, 0x3c, 0x78, 0x94, 0x63, 0x64, 0x39, 0x63, 0x3a, 0x94, 0x97, 0x36, + 0x5a, 0x80, 0x3a, 0xb7, 0x64, 0x60, 0x67, 0x7f, 0xd0, 0x45, 0x6a, 0x94, + 0xc8, 0x36, 0x50, 0x39, 0xc1, 0x87, 0x66, 0xb3, 0xa9, 0x5f, 0x6e, 0xbe, + 0x8f, 0x8b, 0x8c, 0x82, 0xbc, 0xa3, 0x90, 0xb1, 0xb8, 0x22, 0x43, 0xc6, + 0xc2, 0xae, 0xc1, 0x3f, 0x74, 0x3f, 0x8e, 0x49, 0x8d, 0x53, 0xc7, 0x6f, + 0x53, 0x5e, 0x89, 0xaf, 0x74, 0x46, 0x7b, 0x94, 0x47, 0xae, 0xbf, 0xa9, + 0x42, 0x81, 0x87, 0x34, 0x3c, 0xb4, 0x2d, 0x3c, 0x7a, 0xc2, 0x48, 0x94, + 0x46, 0x49, 0xb3, 0x94, 0xaf, 0x2d, 0x49, 0x7e, 0xb7, 0x82, 0x49, 0x36, + 0x3d, 0x69, 0x2f, 0x73, 0x59, 0x92, 0xc3, 0x4e, 0x30, 0xa0, 0x89, 0x4c, + 0x4c, 0xbf, 0xb5, 0x85, 0xca, 0xc0, 0x4b, 0x86, 0x2d, 0x54, 0x9d, 0xd1, + 0x62, 0x64, 0x91, 0xd1, 0x8b, 0x56, 0x50, 0x2d, 0x54, 0x25, 0x67, 0xc4, + 0x83, 0xc8, 0xc6, 0x8f, 0x64, 0x9e, 0x56, 0x8e, 0xbf, 0x42, 0x80, 0xe1, + 0xc1, 0x92, 0xb5, 0x6c, 0x9f, 0xa6, 0x7f, 0x63, 0xc3, 0x75, 0x72, 0x94, + 0x6d, 0x63, 0x77, 0xa9, 0x85, 0x5e, 0xdd, 0xae, 0x58, 0x71, 0xaf, 0x63, + 0xb8, 0x4d, 0x8d, 0x7a, 0x8d, 0x3e, 0x76, 0xc4, 0x4e, 0x57, 0xa2, 0xaf, + 0xa5, 0x8d, 0xb0, 0xc7, 0x5b, 0x2f, 0x4b, 0xac, 0x46, 0xc7, 0xbc, 0x5b, + 0x38, 0x4b, 0xbf, 0xd3, 0xae, 0xc4, 0x3e, 0x4a, 0xc2, 0x79, 0x56, 0xc9, + 0x3b, 0xbb, 0x8e, 0xa7, 0xca, 0xa9, 0xb0, 0x3c, 0x8f, 0xab, 0xa9, 0x60, + 0xa9, 0x34, 0xb9, 0x50, 0xb7, 0x3b, 0x68, 0x3d, 0xc4, 0x51, 0xc6, 0xd0, + 0x7c, 0xc8, 0x3a, 0x7c, 0x6e, 0x7e, 0xad, 0x65, 0x80, 0xc0, 0x70, 0xbe, + 0x6f, 0x61, 0x5f, 0x86, 0xba, 0x7f, 0xab, 0xa5, 0x6e, 0xbc, 0xb2, 0x5f, + 0x9d, 0xc9, 0x72, 0x7e, 0xa6, 0x6b, 0xc0, 0xbb, 0x67, 0x3d, 0x3b, 0x64, + 0xc3, 0x74, 0xb3, 0xa7, 0xc0, 0x75, 0x30, 0x47, 0x68, 0x5c, 0xc5, 0x88, + 0x92, 0x9f, 0xa4, 0x85, 0x3f, 0x58, 0x55, 0xb1, 0xa4, 0x96, 0x30, 0x31, + 0x69, 0x7b, 0x8f, 0x8f, 0xb7, 0x4f, 0x56, 0x3e, 0xad, 0x3a, 0x96, 0xc6, + 0x5e, 0x75, 0xd1, 0xc2, 0x33, 0x8c, 0x4c, 0xb2, 0x65, 0x79, 0x36, 0x42, + 
0xa0, 0xd5, 0x6c, 0xa2, 0xa7, 0xbc, 0xa0, 0xd6, 0x70, 0x51, 0x48, 0x41, + 0x81, 0x3d, 0xaa, 0x5d, 0x5e, 0xb1, 0x65, 0x9b, 0xc8, 0x48, 0x40, 0x78, + 0x50, 0xcf, 0x98, 0x47, 0x49, 0x78, 0xa9, 0x84, 0xbc, 0x7f, 0xd8, 0x99, + 0x68, 0x79, 0x60, 0xa7, 0x51, 0xc4, 0xc4, 0x9a, 0x34, 0x2b, 0xb7, 0x59, + 0x80, 0x75, 0x6d, 0x99, 0xbb, 0x95, 0x63, 0x91, 0xb5, 0x41, 0x61, 0x74, + 0x38, 0x8b, 0x41, 0x48, 0x3c, 0x5e, 0x30, 0x82, 0x80, 0xb9, 0x36, 0xae, + 0x57, 0x7d, 0x77, 0xb3, 0x47, 0xbf, 0x4f, 0x99, 0x9b, 0x77, 0x4b, 0xae, + 0x77, 0x4b, 0xb9, 0xcf, 0x4e, 0x7a, 0xde, 0xa0, 0x9d, 0x47, 0x93, 0x67, + 0xe0, 0x97, 0x8d, 0xa8, 0x89, 0x86, 0x98, 0x83, 0x5d, 0x52, 0x6e, 0x55, + 0xa3, 0xb2, 0x69, 0x8e, 0x5c, 0xbb, 0xb7, 0x5b, 0x31, 0x7b, 0x83, 0x43, + 0x43, 0xc2, 0xb7, 0xce, 0xd6, 0x9c, 0x7c, 0x57, 0x9e, 0x40, 0xab, 0x70, + 0x75, 0xd1, 0x8c, 0xd2, 0x50, 0x95, 0xc8, 0x54, 0x63, 0x82, 0xb9, 0xd0, + 0x8f, 0x91, 0x39, 0xaf, 0x3d, 0x4b, 0x5c, 0xa0, 0xd0, 0xb3, 0x4c, 0xd7, + 0xcd, 0x5d, 0xa5, 0xb1, 0x4d, 0x43, 0xca, 0x83, 0xaa, 0x61, 0xa2, 0x9e, + 0xd3, 0x88, 0x96, 0xbc, 0x36, 0x87, 0xda, 0xca, 0xd7, 0xc6, 0x5f, 0x3d, + 0x66, 0x64, 0xaf, 0x84, 0x66, 0x8d, 0xcf, 0x46, 0xa0, 0x54, 0xa5, 0xc8, + 0xc2, 0xd2, 0x97, 0x9e, 0x4f, 0x91, 0x8b, 0xb4, 0x39, 0xc9, 0x99, 0xa9, + 0xb9, 0xad, 0x57, 0xac, 0x97, 0x91, 0x4c, 0xc3, 0x87, 0x59, 0x35, 0x53, + 0x53, 0xbf, 0x58, 0x46, 0xc5, 0x32, 0xa4, 0xce, 0x98, 0x62, 0x58, 0xc5, + 0x83, 0x9a, 0x3e, 0x29, 0x83, 0x58, 0xb6, 0x2f, 0x72, 0x2f, 0x63, 0x46, + 0x49, 0x37, 0x79, 0xc2, 0xcf, 0xd6, 0x3f, 0x6b, 0xa9, 0x5f, 0x45, 0x5e, + 0xad, 0x83, 0x52, 0x90, 0x8f, 0x63, 0x5d, 0x73, 0x82, 0x3e, 0x6a, 0xc6, + 0x5b, 0x6a, 0x8d, 0x30, 0x63, 0x9b, 0xbb, 0x8d, 0x49, 0x8a, 0x82, 0x31, + 0xa7, 0x4e, 0x42, 0x5d, 0x6a, 0xac, 0x92, 0x4b, 0x33, 0x8a, 0x3a, 0xc7, + 0x5b, 0x99, 0x35, 0x5c, 0x7b, 0x73, 0x45, 0x72, 0x34, 0xb9, 0x8f, 0x40, + 0xaf, 0xb8, 0xb0, 0x92, 0x8a, 0xa9, 0xa3, 0xd2, 0x8e, 0x9e, 0x8a, 0x6c, + 0xc3, 0xb3, 0x80, 0xb4, 0x79, 0xca, 0x5b, 0xd4, 0xd3, 0xe2, 0x43, 0x3d, + 0x41, 0x78, 0x40, 0xc1, 0x6c, 0x94, 0x34, 0x62, 0x37, 0xb9, 0xd0, 0x74, + 0x54, 0xa7, 0x6a, 0x82, 0x76, 0x75, 0xc3, 0x7f, 0x8d, 0x96, 0x93, 0x9f, + 0xa8, 0x88, 0xa0, 0x8e, 0xcb, 0x40, 0x6c, 0x33, 0xa6, 0xbe, 0x3d, 0x5e, + 0x35, 0x69, 0x51, 0x70, 0x3e, 0x82, 0x4f, 0x98, 0x68, 0xba, 0xa4, 0x42, + 0xcb, 0xb6, 0x57, 0xa9, 0x3b, 0xb3, 0x58, 0xb2, 0xc0, 0x94, 0x5b, 0xaf, + 0x3d, 0x73, 0xd7, 0x6b, 0xa1, 0x80, 0x32, 0x9d, 0xc6, 0x85, 0x77, 0xb9, + 0x3b, 0x36, 0x94, 0x5d, 0xb6, 0x76, 0x5d, 0x6f, 0x91, 0x3c, 0x36, 0x9f, + 0x81, 0x66, 0xbc, 0xa6, 0xc7, 0x4f, 0xa5, 0x8e, 0xcd, 0xab, 0x85, 0x57, + 0x49, 0x56, 0x79, 0x3b, 0x26, 0xb6, 0x74, 0x28, 0x42, 0xa2, 0x6b, 0x3f, + 0xa9, 0x38, 0xc9, 0xb5, 0x84, 0x81, 0x7b, 0x31, 0x9b, 0xaa, 0x9f, 0xa8, + 0x99, 0xaa, 0x56, 0x8e, 0xc4, 0x25, 0xc3, 0x75, 0xcf, 0x72, 0x6b, 0x6f, + 0xcd, 0x4f, 0x43, 0x84, 0x30, 0xb5, 0x6c, 0x34, 0x53, 0x75, 0xaf, 0x6a, + 0x4a, 0xca, 0x5b, 0xaa, 0xa7, 0x91, 0x8d, 0xa2, 0x73, 0xbd, 0x7a, 0xa7, + 0xd1, 0xca, 0x9c, 0x8e, 0x5a, 0x75, 0x2d, 0xa7, 0xad, 0x80, 0xbc, 0x34, + 0xc6, 0x98, 0x72, 0x87, 0x6f, 0x87, 0x96, 0xc6, 0xb6, 0x4a, 0x3a, 0x67, + 0x3e, 0x54, 0xb6, 0x85, 0x3e, 0x7b, 0x80, 0x64, 0x7e, 0x5f, 0x98, 0x91, + 0x55, 0xc6, 0x91, 0x9b, 0x9b, 0x37, 0x36, 0x70, 0x52, 0xbc, 0x56, 0xcc, + 0x5d, 0xb5, 0x71, 0xb0, 0xc6, 0x32, 0x4a, 0x7f, 0x50, 0x91, 0xcf, 0xba, + 0x93, 0x79, 0xca, 0x46, 0x76, 0xba, 0x51, 0x90, 0x7c, 0x87, 0xb0, 0xc7, + 0xa9, 0xbe, 0x8e, 0x48, 0x59, 0x44, 0x86, 0x5b, 0x91, 0x82, 0x55, 0xa6, + 0x6c, 0x94, 0xa1, 0xc4, 0x7d, 0xb4, 0x45, 0x3a, 0x5e, 0xc7, 0x50, 0x9c, + 
0x7e, 0xa2, 0x71, 0x4d, 0x8e, 0xb8, 0x80, 0x23, 0x77, 0x34, 0x95, 0x47, + 0x9b, 0x5d, 0x9d, 0x8b, 0x9b, 0x6a, 0x84, 0x82, 0xd0, 0xcc, 0x62, 0x4d, + 0x5b, 0x65, 0xb1, 0xbd, 0x9c, 0x48, 0x68, 0x2c, 0x4b, 0x32, 0x7c, 0xbc, + 0x96, 0x35, 0x9b, 0xc2, 0x80, 0x98, 0x3c, 0xb3, 0xc8, 0x5b, 0xad, 0x8c, + 0x4f, 0x38, 0x59, 0xba, 0x90, 0xa5, 0x87, 0x9f, 0x5b, 0x79, 0x34, 0x2a, + 0xa0, 0xb5, 0x32, 0xae, 0x74, 0xa6, 0x59, 0x93, 0x77, 0x7f, 0xc0, 0x3c, + 0x64, 0x5a, 0x7f, 0xb8, 0x9f, 0x3f, 0xc5, 0x78, 0x39, 0x41, 0x69, 0xd1, + 0x33, 0x94, 0x58, 0xb1, 0xa2, 0x71, 0x49, 0x5c, 0x31, 0x38, 0xab, 0xbd, + 0x45, 0x3e, 0xaa, 0xa3, 0x3c, 0x31, 0xa3, 0xa2, 0xc4, 0x7c, 0xb7, 0x38, + 0xc8, 0xc4, 0xb9, 0xb6, 0xa4, 0x9f, 0xcb, 0xa6, 0x79, 0x73, 0xc6, 0x2e, + 0xcc, 0x30, 0xc2, 0xc9, 0x38, 0x78, 0xac, 0x7d, 0x87, 0x9d, 0x87, 0x95, + 0xc1, 0x72, 0x74, 0x46, 0x34, 0x72, 0x67, 0x9d, 0xa2, 0xac, 0x63, 0x3c, + 0x67, 0x43, 0xa0, 0x50, 0x46, 0x8c, 0x80, 0xd4, 0xce, 0xcc, 0xac, 0x40, + 0x9c, 0xc4, 0xc2, 0x5a, 0x61, 0x47, 0x60, 0x62, 0x7c, 0x81, 0xcf, 0x75, + 0x52, 0x68, 0xc0, 0x8a, 0x68, 0x9d, 0xba, 0x60, 0xcc, 0xce, 0x8f, 0xa2, + 0x6b, 0x34, 0x68, 0x78, 0x77, 0x74, 0x97, 0x54, 0x7d, 0xde, 0x31, 0x3d, + 0x8b, 0xa8, 0x75, 0x95, 0x9c, 0x3f, 0x35, 0x58, 0x9b, 0xb1, 0x9c, 0x7b, + 0x73, 0x36, 0x39, 0x81, 0x9f, 0xb2, 0xa8, 0x4e, 0xb3, 0x84, 0xa3, 0xb7, + 0xb1, 0x99, 0x2e, 0xc9, 0x30, 0xa9, 0x8f, 0x5f, 0xb9, 0x83, 0x58, 0x91, + 0xc6, 0xac, 0x55, 0x87, 0x99, 0x8a, 0x5a, 0x81, 0xcc, 0x73, 0x84, 0x2f, + 0xc1, 0x98, 0x5d, 0xbc, 0xc8, 0x64, 0xab, 0x4c, 0xcd, 0x5c, 0x2f, 0x80, + 0x50, 0xb7, 0x58, 0x45, 0x28, 0x8a, 0xb5, 0x61, 0xa0, 0xc3, 0x3c, 0x30, + 0x55, 0x43, 0xbc, 0x8c, 0xc2, 0x67, 0x92, 0x46, 0x64, 0xa9, 0x86, 0x54, + 0xa4, 0xb7, 0x84, 0x99, 0xac, 0x70, 0x9e, 0xaf, 0xbc, 0x85, 0x7d, 0xb4, + 0x9c, 0xb5, 0x8a, 0x84, 0x34, 0x64, 0x89, 0x3f, 0x48, 0x9f, 0x54, 0x86, + 0x33, 0x9f, 0x8a, 0x48, 0x86, 0x8f, 0x78, 0xc8, 0xae, 0xca, 0xb8, 0x69, + 0xa5, 0xa8, 0xc0, 0x50, 0xcd, 0xcc, 0x94, 0x7e, 0xcc, 0x4b, 0x71, 0x6c, + 0x58, 0x75, 0xd0, 0x34, 0x42, 0x91, 0xb3, 0x4a, 0x63, 0xab, 0x76, 0x8b, + 0x7f, 0xba, 0x31, 0x9c, 0xb7, 0x93, 0x60, 0x61, 0x4f, 0x6a, 0x35, 0xb2, + 0x71, 0xc1, 0x87, 0x8f, 0xc1, 0x67, 0x52, 0x33, 0x3f, 0x5d, 0x3e, 0x4c, + 0x8e, 0x89, 0x47, 0x4a, 0x6b, 0x2e, 0x6f, 0x59, 0x71, 0xa3, 0x39, 0x7a, + 0x47, 0x78, 0x45, 0x69, 0x8f, 0x9d, 0x66, 0xbf, 0xa2, 0x7d, 0x8b, 0xa0, + 0xc1, 0x60, 0x89, 0x74, 0xc5, 0x87, 0x87, 0xbb, 0x83, 0xd4, 0x89, 0x38, + 0x88, 0x31, 0xb1, 0x80, 0x50, 0x93, 0x9d, 0xbb, 0xb7, 0xc3, 0xc1, 0x4b, + 0xbe, 0x58, 0x3f, 0x89, 0xa7, 0x55, 0x9e, 0x99, 0x72, 0x3c, 0x97, 0xc7, + 0x2e, 0x95, 0x75, 0x59, 0x96, 0x93, 0x64, 0xa6, 0x96, 0xa3, 0x85, 0x9b, + 0xb6, 0x47, 0xb1, 0xa4, 0x4e, 0x4c, 0x33, 0x93, 0x69, 0x93, 0xa1, 0xac, + 0x53, 0x48, 0x7b, 0xb8, 0x68, 0x54, 0x73, 0x4a, 0x61, 0x70, 0x4d, 0x70, + 0xb3, 0xab, 0x4b, 0x79, 0xd3, 0x5b, 0x4e, 0xc7, 0xaf, 0x54, 0x44, 0x87, + 0xcb, 0x3a, 0x4c, 0x41, 0x74, 0x3d, 0xbb, 0xcb, 0xad, 0xa2, 0x93, 0x44, + 0x30, 0x3e, 0xba, 0x78, 0x8d, 0x65, 0xbc, 0x5b, 0x95, 0x6a, 0x74, 0x70, + 0x87, 0x76, 0x9a, 0x57, 0x90, 0x41, 0x6b, 0x57, 0x7d, 0xa5, 0x2f, 0xa9, + 0x56, 0x35, 0xb9, 0xcc, 0xba, 0x85, 0x5b, 0x39, 0x2d, 0x50, 0xcd, 0x6f, + 0xb3, 0xcb, 0x46, 0x2d, 0x35, 0x36, 0xcb, 0xa4, 0xc0, 0x3c, 0xac, 0x3c, + 0x87, 0xbb, 0x48, 0x5a, 0x39, 0x57, 0x36, 0x4c, 0x74, 0x76, 0xbd, 0x80, + 0x27, 0x24, 0xb9, 0x62, 0xc8, 0x8f, 0xc5, 0xae, 0x7b, 0x6b, 0x66, 0x81, + 0x64, 0xad, 0x8b, 0x32, 0x85, 0x83, 0x51, 0x78, 0x51, 0xb3, 0x56, 0x58, + 0x42, 0x4c, 0x64, 0x86, 0xc2, 0x53, 0xa4, 0x59, 0x54, 0x6f, 0x58, 0x80, + 
0x54, 0xa6, 0xba, 0x62, 0x99, 0x69, 0x37, 0xa7, 0x54, 0xb4, 0xc5, 0x97, + 0x3b, 0xbd, 0x6d, 0x85, 0x9d, 0x4f, 0x47, 0xb6, 0x4c, 0xce, 0x62, 0x6d, + 0x5e, 0xa5, 0xae, 0x83, 0x9b, 0x8d, 0x8a, 0x3b, 0xaf, 0x83, 0x42, 0x68, + 0x30, 0xac, 0xb8, 0x73, 0x36, 0x7f, 0xa5, 0xaf, 0x87, 0x77, 0x2e, 0xba, + 0x8d, 0x4c, 0xcf, 0x77, 0x6d, 0x79, 0xc2, 0x9d, 0x7e, 0xcd, 0x7d, 0x5c, + 0x87, 0xc6, 0x72, 0xb3, 0xb1, 0x7e, 0x5b, 0xaa, 0x86, 0x57, 0x7c, 0x31, + 0xcc, 0x71, 0x9b, 0x55, 0x75, 0xd1, 0x97, 0xa7, 0xa1, 0x95, 0x42, 0x83, + 0xc2, 0xaf, 0xb3, 0xaf, 0x76, 0x6e, 0x5d, 0xc9, 0x86, 0xaa, 0x80, 0xb8, + 0x92, 0xa7, 0xd4, 0x80, 0x94, 0x6b, 0x95, 0x72, 0x88, 0x82, 0xa4, 0x55, + 0xbf, 0x55, 0x7a, 0x9a, 0xca, 0x4e, 0x6e, 0x66, 0xa0, 0x71, 0xbb, 0xa1, + 0xaa, 0x4e, 0x30, 0xb6, 0x97, 0x70, 0x32, 0xc6, 0x36, 0x98, 0xa8, 0x38, + 0x90, 0xcc, 0x3a, 0x3f, 0x61, 0x5c, 0xbe, 0xa6, 0x48, 0x8d, 0x5c, 0xd1, + 0x8d, 0xd0, 0xb8, 0xca, 0xd1, 0x39, 0x93, 0xb3, 0xbd, 0xaf, 0xa6, 0x70, + 0x7f, 0xa0, 0xce, 0xab, 0x48, 0x88, 0x60, 0x61, 0xc1, 0x83, 0x43, 0xc5, + 0xc1, 0xc2, 0x81, 0x63, 0x4e, 0x3c, 0xc5, 0xd5, 0xb5, 0x64, 0x8d, 0x36, + 0x39, 0xaf, 0x37, 0x84, 0x90, 0xa6, 0x44, 0x9c, 0x7c, 0x33, 0xcc, 0x7e, + 0xcd, 0xaa, 0x93, 0xa4, 0xaf, 0x45, 0x48, 0x78, 0x79, 0x63, 0xc0, 0x40, + 0x6d, 0x8e, 0x4e, 0x9b, 0x89, 0xc9, 0x61, 0x53, 0xbf, 0x70, 0x6d, 0x65, + 0xa8, 0x6a, 0x64, 0x4f, 0x50, 0x39, 0x4e, 0x3b, 0xc1, 0xa6, 0x6d, 0x3c, + 0x4e, 0x90, 0x59, 0x91, 0x7b, 0xb8, 0x91, 0xc6, 0x66, 0x8c, 0x7b, 0x90, + 0x8c, 0x46, 0x97, 0x86, 0x9b, 0x91, 0x8a, 0xb8, 0x6c, 0x31, 0x9d, 0x48, + 0xcb, 0xa8, 0x34, 0x38, 0x56, 0xb0, 0x91, 0x4e, 0x7a, 0x36, 0x80, 0x71, + 0x98, 0xb5, 0xa7, 0x65, 0x3a, 0x6c, 0x79, 0xac, 0x5b, 0x3b, 0xae, 0x5c, + 0x47, 0x6b, 0xd2, 0x74, 0xb6, 0x2d, 0x9a, 0x6b, 0x99, 0x4b, 0xae, 0xbe, + 0xa6, 0x42, 0x67, 0xd5, 0x7d, 0x9e, 0x6b, 0x6d, 0x72, 0x54, 0x37, 0xa4, + 0x99, 0x80, 0xb6, 0xb1, 0x37, 0x91, 0x80, 0x7d, 0xcb, 0x9f, 0x6a, 0x5e, + 0x49, 0x6f, 0xa4, 0x92, 0x7d, 0x98, 0x72, 0x6f, 0xd2, 0x39, 0x46, 0x5b, + 0x91, 0x9b, 0x43, 0x4f, 0xd5, 0x7f, 0xc0, 0xb7, 0x6f, 0x59, 0x9b, 0xc0, + 0xcf, 0x7c, 0x83, 0xc6, 0xbd, 0xc8, 0x4f, 0x49, 0xcf, 0xbb, 0xc2, 0x66, + 0xa8, 0x45, 0x4c, 0xce, 0x68, 0x88, 0x61, 0xc2, 0x87, 0xae, 0xc0, 0x6b, + 0x33, 0xb7, 0x45, 0x72, 0x7e, 0x84, 0x3c, 0xc7, 0x6e, 0x8f, 0x5e, 0x9a, + 0x47, 0x32, 0xa5, 0xca, 0x47, 0x86, 0xb7, 0x55, 0xa9, 0x57, 0xc1, 0x89, + 0xa6, 0x40, 0xaf, 0xbf, 0xc8, 0x9d, 0xb5, 0xc4, 0x92, 0x66, 0x9e, 0x4f, + 0x2d, 0xb0, 0x58, 0x64, 0x76, 0x6c, 0x87, 0x60, 0x56, 0x96, 0x75, 0x86, + 0x9a, 0x7d, 0xcc, 0xcf, 0x86, 0xa4, 0xbc, 0xb4, 0x61, 0x61, 0x87, 0xdb, + 0x73, 0xc6, 0xc8, 0x4c, 0x3a, 0xab, 0xc6, 0xa8, 0xae, 0xcb, 0x71, 0x52, + 0xd0, 0x39, 0x72, 0x8b, 0x9d, 0x3c, 0x4c, 0xb3, 0x38, 0xc7, 0xb4, 0x7c, + 0x52, 0x42, 0xdf, 0xa8, 0x64, 0x7d, 0xbe, 0xa8, 0xb3, 0x76, 0xbb, 0xcb, + 0x52, 0xa1, 0x88, 0xb4, 0x35, 0x36, 0x4e, 0x40, 0x3e, 0xd3, 0x34, 0xae, + 0x7f, 0x51, 0x95, 0x4b, 0x7d, 0x51, 0x52, 0xa9, 0xde, 0x74, 0x44, 0x7a, + 0x32, 0x92, 0x8e, 0xc8, 0x65, 0xca, 0xaa, 0xa4, 0x65, 0x92, 0xc9, 0x54, + 0x39, 0xc2, 0x42, 0xc3, 0x4f, 0x9d, 0x3e, 0xab, 0x5b, 0x9c, 0xba, 0xb0, + 0x8c, 0x33, 0xc8, 0x79, 0xac, 0x95, 0xc3, 0x85, 0x90, 0x2c, 0x7c, 0x55, + 0x7b, 0x96, 0x6d, 0xc1, 0xcd, 0x89, 0xb4, 0x4a, 0x46, 0x9d, 0x42, 0x2c, + 0x5d, 0x9d, 0x68, 0x97, 0x73, 0xbb, 0x96, 0x91, 0x3b, 0x32, 0xae, 0xb8, + 0xb3, 0xc0, 0xb1, 0x30, 0x3c, 0x93, 0x6d, 0x4c, 0x5d, 0xb6, 0xb7, 0x4e, + 0xb8, 0x60, 0x96, 0x7f, 0xc0, 0x78, 0xc2, 0x4f, 0xc6, 0xc2, 0x50, 0x84, + 0x3b, 0x9d, 0x91, 0xc2, 0x79, 0xb0, 0x67, 0xab, 0x44, 0x49, 0x67, 0x9d, + 
0x8a, 0xd6, 0x4c, 0xb1, 0xc2, 0x6f, 0x5d, 0x6e, 0x71, 0x84, 0x97, 0x6f, + 0x62, 0x96, 0x9d, 0xdb, 0x8c, 0x60, 0xa6, 0x3c, 0xa2, 0x60, 0x7d, 0x95, + 0xcb, 0xda, 0x92, 0x79, 0x6a, 0xc2, 0x7e, 0x67, 0xa2, 0x71, 0xc6, 0xa0, + 0xb7, 0x4e, 0x62, 0x56, 0x5c, 0x60, 0x72, 0xa2, 0x31, 0xa2, 0x82, 0x6a, + 0xc8, 0x39, 0x80, 0x9e, 0x3d, 0xcd, 0xa6, 0x9d, 0x9d, 0x40, 0x71, 0x80, + 0x8f, 0xc4, 0x47, 0x97, 0x94, 0x9d, 0x2b, 0x99, 0x7e, 0x42, 0x6c, 0x67, + 0x87, 0xb3, 0x46, 0x9d, 0xc1, 0x31, 0x68, 0x4c, 0x51, 0x9f, 0xbe, 0x4c, + 0xb9, 0x95, 0x68, 0x2a, 0x75, 0x7c, 0x9b, 0x99, 0x9e, 0x77, 0x87, 0xa3, + 0x46, 0x58, 0x9e, 0x6e, 0x7f, 0x6a, 0xba, 0x37, 0x65, 0x8d, 0x46, 0x69, + 0x57, 0x40, 0x8f, 0x35, 0xa3, 0x4e, 0x69, 0x89, 0x8a, 0x3c, 0x1e, 0xa8, + 0x8f, 0x5a, 0x7a, 0x7b, 0xaf, 0xb7, 0x37, 0x66, 0x3d, 0x4c, 0x66, 0x56, + 0x87, 0x93, 0xc1, 0xcb, 0xbd, 0xa8, 0x56, 0x7f, 0x83, 0x66, 0x74, 0x57, + 0xa8, 0x92, 0x97, 0x2c, 0x6f, 0xbd, 0x67, 0xa7, 0x93, 0x6e, 0xc2, 0x48, + 0x45, 0x9a, 0xc9, 0xc6, 0xc9, 0xc8, 0x43, 0x97, 0x53, 0xbe, 0x69, 0x44, + 0x4e, 0xd3, 0xe3, 0x46, 0x9d, 0x32, 0x71, 0x7d, 0xb0, 0xd0, 0xd9, 0xb2, + 0x7b, 0xc4, 0x58, 0x56, 0x4d, 0x30, 0xa6, 0x40, 0xa0, 0xc9, 0xcf, 0xcb, + 0x8e, 0xd0, 0x2e, 0x9b, 0x4c, 0x9c, 0xa4, 0x27, 0x51, 0x9e, 0xa3, 0xb8, + 0x3c, 0xd9, 0x7a, 0xba, 0x70, 0x8e, 0x85, 0xa1, 0x4f, 0xa8, 0x8e, 0x79, + 0x52, 0x62, 0x99, 0x72, 0xb5, 0x9b, 0x67, 0xc1, 0x47, 0x33, 0x52, 0x6f, + 0xc2, 0xb6, 0xc1, 0x39, 0xc7, 0x6f, 0x53, 0xad, 0x72, 0x4a, 0x92, 0x85, + 0x86, 0x96, 0x7f, 0xbf, 0xc9, 0xc9, 0x44, 0xb2, 0x80, 0xaf, 0x60, 0x9c, + 0x42, 0xc8, 0xcc, 0x33, 0x46, 0x93, 0x46, 0xb0, 0x3c, 0xa7, 0x66, 0xb9, + 0xbd, 0x67, 0xcb, 0x9d, 0xbd, 0x46, 0x3b, 0x6e, 0xd1, 0xd7, 0x9f, 0xb3, + 0x4e, 0x80, 0x3e, 0x82, 0x5c, 0x87, 0x8c, 0xc6, 0xa6, 0x65, 0x53, 0x52, + 0xca, 0x7f, 0x84, 0x5b, 0x77, 0x3c, 0x81, 0x75, 0x3e, 0x5f, 0x7c, 0x7d, + 0xc0, 0x55, 0x1a, 0x2a, 0xa8, 0x9b, 0x64, 0x76, 0xc3, 0x3f, 0x6e, 0xb9, + 0xb1, 0x52, 0x7a, 0x2e, 0xba, 0x62, 0x66, 0xd0, 0xc1, 0x9e, 0x6e, 0x86, + 0x41, 0x89, 0xad, 0xbd, 0xa6, 0x65, 0x36, 0x62, 0x42, 0xb3, 0x47, 0x8e, + 0x67, 0x61, 0x6f, 0x31, 0x8f, 0x80, 0x86, 0x32, 0x64, 0x46, 0xca, 0x7a, + 0xa9, 0xc2, 0xaf, 0x56, 0x90, 0x7e, 0x4e, 0x32, 0xab, 0x42, 0x63, 0x95, + 0x67, 0xd7, 0x3b, 0xcc, 0x60, 0xa4, 0x6f, 0x63, 0xc1, 0x96, 0x55, 0xc3, + 0x57, 0xc9, 0xa9, 0xa8, 0x91, 0xb2, 0x33, 0xb1, 0x46, 0x88, 0x36, 0xad, + 0x57, 0x9d, 0x65, 0x35, 0x40, 0x60, 0xa3, 0x94, 0x97, 0x54, 0x97, 0x93, + 0x9a, 0x8f, 0x5c, 0x74, 0x95, 0xc2, 0x9f, 0x48, 0x49, 0x82, 0x2c, 0x70, + 0xb8, 0x9d, 0xad, 0xae, 0x3d, 0x78, 0xcf, 0xa5, 0x56, 0x9c, 0x88, 0x4a, + 0xa7, 0xac, 0x97, 0xe7, 0xa3, 0x55, 0x6b, 0x83, 0xca, 0x58, 0xbe, 0x91, + 0xc2, 0x9e, 0x68, 0x32, 0xd4, 0x71, 0xcd, 0x84, 0x35, 0x33, 0x72, 0x93, + 0xb3, 0x73, 0x7d, 0x40, 0x48, 0x93, 0x9f, 0x9d, 0x68, 0x89, 0x69, 0x61, + 0xc1, 0x86, 0xc9, 0x81, 0x45, 0xc5, 0x62, 0xb3, 0x68, 0x65, 0x88, 0xad, + 0x48, 0xb7, 0x95, 0x67, 0x6a, 0xba, 0xa7, 0x96, 0x6a, 0x5b, 0xd2, 0x8b, + 0x89, 0x99, 0x7e, 0x3e, 0x37, 0x67, 0x5c, 0x71, 0xc3, 0x83, 0xb6, 0xa5, + 0x7d, 0x66, 0xac, 0x40, 0x54, 0x96, 0x40, 0x36, 0xca, 0x8d, 0xc3, 0xae, + 0x59, 0x3c, 0x4a, 0x91, 0x9a, 0x75, 0x88, 0x60, 0xb7, 0xc0, 0x48, 0x44, + 0xbc, 0x27, 0xc4, 0x78, 0x8c, 0x39, 0x4b, 0xbc, 0xc4, 0xbc, 0x87, 0x8b, + 0x8c, 0x43, 0x56, 0x46, 0x33, 0x57, 0xb6, 0x3c, 0x95, 0x7b, 0x7d, 0xab, + 0x40, 0x41, 0x54, 0x81, 0x64, 0x6e, 0xa2, 0x7e, 0x93, 0xbb, 0xbe, 0x95, + 0x7c, 0x71, 0xcb, 0x6a, 0x2e, 0x2e, 0x57, 0x2e, 0x89, 0x2e, 0xca, 0xb4, + 0x82, 0xc1, 0x49, 0x5a, 0x33, 0xb1, 0x8b, 0x80, 0x9d, 0x52, 0x90, 0x6e, + 
0x84, 0xc4, 0x5a, 0xb9, 0xbe, 0xb2, 0xd8, 0xa7, 0x78, 0x7b, 0x4e, 0x95, + 0x49, 0x88, 0x84, 0x93, 0x54, 0xc7, 0x89, 0xba, 0x30, 0x9d, 0x48, 0x56, + 0x9c, 0x8f, 0x4f, 0xd1, 0xd4, 0x75, 0x4d, 0x4e, 0xb9, 0xbd, 0x7e, 0x3d, + 0xcd, 0x9f, 0x67, 0x5f, 0x43, 0x61, 0xd9, 0x31, 0xbd, 0xa7, 0x94, 0xbd, + 0x92, 0x82, 0x83, 0x75, 0xb7, 0xb8, 0x44, 0x9b, 0x4e, 0x9e, 0x74, 0xcd, + 0xb5, 0x70, 0xae, 0x71, 0x45, 0x65, 0x4b, 0x40, 0x34, 0x7b, 0x76, 0x45, + 0x8d, 0x4c, 0x99, 0xc8, 0x99, 0x34, 0x8b, 0x73, 0x49, 0x4d, 0x5d, 0x61, + 0x30, 0x65, 0xcf, 0x69, 0x63, 0xbf, 0x6b, 0xb6, 0x6b, 0xc6, 0xb0, 0x70, + 0x72, 0x92, 0xd1, 0xc9, 0x7d, 0x60, 0xd3, 0x64, 0xa7, 0x55, 0x5e, 0x88, + 0x6a, 0x79, 0x9d, 0x95, 0x4d, 0x6b, 0x7c, 0x5f, 0x6e, 0x3f, 0x32, 0x83, + 0xb8, 0x41, 0xa8, 0x46, 0xc4, 0x7c, 0xa0, 0x6d, 0x46, 0x37, 0x2d, 0xba, + 0x72, 0x4f, 0x95, 0xc6, 0x41, 0xbe, 0x52, 0x56, 0x9e, 0x90, 0x82, 0x8f, + 0x43, 0xb4, 0x43, 0xa3, 0x8c, 0x4b, 0x4b, 0xb4, 0xc7, 0x67, 0x72, 0xcb, + 0xbd, 0x81, 0xa4, 0x7b, 0x99, 0x94, 0x4a, 0xc1, 0xbb, 0xa4, 0x3e, 0x80, + 0x3b, 0x96, 0x70, 0x7a, 0xce, 0x98, 0x4a, 0xa0, 0xa0, 0x54, 0x55, 0x40, + 0x65, 0x97, 0x87, 0xca, 0x49, 0x43, 0x55, 0x89, 0xa3, 0x58, 0x83, 0xc5, + 0x89, 0x42, 0x3b, 0x40, 0x81, 0x74, 0x3d, 0x59, 0x3c, 0x98, 0xac, 0x32, + 0x5f, 0xa5, 0x85, 0xca, 0xb3, 0xb4, 0xc9, 0xb1, 0xbd, 0x48, 0x5a, 0xc5, + 0x61, 0xb5, 0x49, 0x86, 0xd2, 0x7c, 0x59, 0xa2, 0x33, 0xc7, 0xaf, 0x3f, + 0x5d, 0x32, 0x6a, 0x47, 0xad, 0x56, 0x3f, 0x84, 0x39, 0xbb, 0xbf, 0xb3, + 0xa5, 0x5a, 0xc2, 0x57, 0xcc, 0x8e, 0x33, 0x5c, 0xb2, 0x8a, 0x75, 0x45, + 0x30, 0xc7, 0x68, 0x7d, 0xa2, 0xb7, 0x8a, 0x51, 0xc6, 0xa8, 0x9a, 0x6c, + 0x8f, 0xad, 0x62, 0x9e, 0x8d, 0x9f, 0x36, 0x54, 0x3b, 0x7a, 0xaf, 0x43, + 0x6c, 0xc9, 0xb5, 0x84, 0x92, 0x95, 0x8f, 0x79, 0xc1, 0x39, 0x7e, 0x30, + 0x50, 0xc5, 0x9e, 0x89, 0xbd, 0x35, 0x39, 0x74, 0x59, 0x8c, 0xcd, 0x49, + 0x48, 0x67, 0xad, 0xc7, 0x7f, 0x57, 0xc8, 0x4b, 0x32, 0x5d, 0xce, 0x84, + 0x95, 0x7c, 0xbe, 0xa6, 0x48, 0x7f, 0xd4, 0x73, 0x95, 0x5e, 0x91, 0x4a, + 0xc3, 0x84, 0x8d, 0x9c, 0xa6, 0x32, 0x92, 0xa2, 0xaa, 0x9d, 0x40, 0x43, + 0x3a, 0x9d, 0xc4, 0x40, 0x44, 0x78, 0xb4, 0x4f, 0x50, 0x52, 0x75, 0xa8, + 0x4a, 0xbb, 0xbd, 0x67, 0x64, 0xb1, 0x65, 0x4c, 0xb5, 0x57, 0x65, 0xab, + 0xb9, 0xab, 0xda, 0xd2, 0x8a, 0x57, 0x83, 0xc4, 0xcf, 0x4e, 0x3d, 0x92, + 0xa2, 0x78, 0x79, 0x72, 0xaf, 0x5c, 0x87, 0x7b, 0x6b, 0x76, 0x64, 0xca, + 0x4a, 0xb0, 0x9c, 0xb0, 0xd3, 0x93, 0x92, 0x4d, 0x62, 0x8f, 0x70, 0x9b, + 0xad, 0x55, 0xc4, 0x8c, 0x50, 0xc5, 0xae, 0xbe, 0x5a, 0xa4, 0x8e, 0x9a, + 0x56, 0x3c, 0xad, 0xd1, 0x7e, 0x70, 0x7c, 0x7e, 0xad, 0xcc, 0x3e, 0x5c, + 0x66, 0xc8, 0x69, 0x57, 0x77, 0x8b, 0x4c, 0x3c, 0x57, 0xc3, 0xc2, 0x37, + 0xb6, 0xca, 0x89, 0x8a, 0x57, 0xb6, 0xcc, 0x42, 0x48, 0x2c, 0x3e, 0xaa, + 0xb8, 0x4d, 0xa9, 0xae, 0x55, 0x97, 0x7e, 0x97, 0x9b, 0x7c, 0xd2, 0x58, + 0x49, 0x8a, 0x85, 0x39, 0x64, 0x4e, 0x65, 0x8e, 0x3d, 0x4f, 0xcd, 0x43, + 0x86, 0x60, 0xb1, 0xc6, 0x7e, 0x70, 0x7b, 0xbd, 0xcd, 0x59, 0x97, 0x3c, + 0xac, 0x57, 0xbb, 0x92, 0x7e, 0xa0, 0x9c, 0xc2, 0x99, 0xc2, 0x95, 0x84, + 0x72, 0xc6, 0x5e, 0x86, 0xb2, 0x71, 0x7e, 0x63, 0x71, 0x58, 0xd9, 0x80, + 0x83, 0x1a, 0xaf, 0x65, 0xa8, 0xd9, 0x96, 0x62, 0x76, 0xcb, 0xca, 0x52, + 0x7d, 0x7c, 0x89, 0x55, 0x82, 0x88, 0x54, 0x9f, 0x61, 0x7f, 0xa6, 0x9b, + 0xb3, 0xa5, 0xaa, 0x33, 0xb2, 0x6c, 0x55, 0x6d, 0x72, 0x98, 0x48, 0x9c, + 0x6a, 0x4a, 0x54, 0x6b, 0x3a, 0x74, 0x85, 0x73, 0xbe, 0x5b, 0xb1, 0xd0, + 0x34, 0xc6, 0x92, 0xc7, 0xb3, 0x9f, 0xb6, 0x3e, 0x95, 0x6b, 0x7e, 0xc5, + 0xb9, 0x77, 0x77, 0x59, 0x8b, 0x52, 0xd1, 0xad, 0xcf, 0x83, 0xc8, 0x60, + 
0xc0, 0xc5, 0xc7, 0x91, 0x63, 0x49, 0xa6, 0xa6, 0x6f, 0x4b, 0x41, 0x5b, + 0xb4, 0x36, 0x57, 0x60, 0x95, 0x79, 0xba, 0xb6, 0xc3, 0xa7, 0x66, 0x46, + 0xaa, 0x56, 0x68, 0x5f, 0xb6, 0x49, 0x94, 0x7c, 0x56, 0x49, 0x7a, 0x5c, + 0x91, 0xb6, 0xcf, 0x51, 0x91, 0xcc, 0xad, 0x3a, 0xb0, 0xb9, 0xb8, 0xcf, + 0x36, 0xaf, 0xca, 0x7f, 0xb8, 0x4d, 0x5e, 0xa9, 0x34, 0xbe, 0xb2, 0xa9, + 0xc0, 0xca, 0x73, 0x94, 0x82, 0x4e, 0xb4, 0xd3, 0x8d, 0x53, 0x7e, 0x78, + 0x9f, 0x7c, 0xca, 0x73, 0xa9, 0xac, 0x36, 0x40, 0x6c, 0x3c, 0x37, 0xb7, + 0x88, 0xd8, 0xd2, 0xa7, 0x5e, 0x76, 0xd2, 0x36, 0xcd, 0xb9, 0x82, 0x68, + 0x8b, 0x96, 0xbd, 0x51, 0xbe, 0x61, 0x54, 0x39, 0x6f, 0xc4, 0xcd, 0x39, + 0x38, 0x8c, 0x42, 0xcb, 0xa1, 0x91, 0x4d, 0xa7, 0x7f, 0x77, 0xbf, 0xbc, + 0x81, 0x37, 0x55, 0xaf, 0x48, 0x81, 0x8d, 0x81, 0x84, 0x85, 0x9c, 0xad, + 0xbf, 0x30, 0xc9, 0x4d, 0x3d, 0xc8, 0xbc, 0xc3, 0x4c, 0xc1, 0xaf, 0x50, + 0xc6, 0x73, 0x39, 0xa2, 0x77, 0xc9, 0xb8, 0x94, 0x91, 0x9d, 0xa7, 0x71, + 0xc4, 0x59, 0x91, 0x3c, 0xbf, 0xba, 0x4b, 0x7c, 0x32, 0x99, 0x41, 0x77, + 0x44, 0x6f, 0xc0, 0x4a, 0x7d, 0xbb, 0x9b, 0x5d, 0x40, 0x5b, 0x60, 0xd0, + 0x52, 0x3b, 0x57, 0x83, 0xc4, 0xae, 0xb3, 0xc1, 0x6a, 0x35, 0x92, 0x35, + 0x82, 0x7b, 0x81, 0x8b, 0xa6, 0x91, 0x7e, 0x48, 0x6b, 0x98, 0x71, 0x99, + 0x53, 0xb8, 0x9e, 0x88, 0xaf, 0x44, 0x6a, 0x37, 0xa1, 0x54, 0x54, 0x62, + 0x60, 0x73, 0x82, 0x7b, 0x69, 0x7a, 0xa7, 0x3b, 0x9e, 0x7f, 0x76, 0xa7, + 0x3e, 0xd2, 0x6e, 0x7a, 0xa5, 0xaf, 0x64, 0x48, 0x68, 0xce, 0xcc, 0x48, + 0x4c, 0x6e, 0x4b, 0xce, 0x94, 0xac, 0x82, 0xa3, 0x78, 0x68, 0x39, 0xcd, + 0xc1, 0xac, 0x31, 0x6f, 0x75, 0xaa, 0x87, 0xce, 0x38, 0x96, 0x7f, 0xca, + 0x48, 0xb8, 0xa4, 0x53, 0x43, 0xcb, 0x8b, 0x5b, 0xab, 0x39, 0x6f, 0xa1, + 0x89, 0xc5, 0x7e, 0xc6, 0x4d, 0xcd, 0x9f, 0x60, 0x92, 0x48, 0xbe, 0x44, + 0xbd, 0x42, 0x3a, 0x9e, 0x7b, 0xaf, 0x42, 0xc0, 0x79, 0x4f, 0x76, 0x79, + 0xab, 0xb9, 0x48, 0xa8, 0x30, 0x84, 0xd0, 0x98, 0x46, 0x32, 0x92, 0x6c, + 0xcc, 0xaa, 0x52, 0xa1, 0xd0, 0xa3, 0x99, 0xb1, 0x8f, 0xad, 0x44, 0xba, + 0xb8, 0x5c, 0xc5, 0x44, 0x87, 0x5d, 0x9c, 0x3b, 0x4c, 0x81, 0x7c, 0x48, + 0x61, 0x58, 0xb9, 0x9e, 0x70, 0x6f, 0xb0, 0x92, 0x2f, 0x96, 0x4e, 0x88, + 0x96, 0xc0, 0x38, 0x46, 0xc4, 0x58, 0xcb, 0xbe, 0x61, 0xaf, 0xb1, 0xcf, + 0x67, 0x39, 0x9c, 0x3f, 0x9d, 0x4a, 0x6a, 0xa0, 0x39, 0xcb, 0x3d, 0x46, + 0x86, 0xcd, 0xc2, 0xb3, 0x38, 0xa0, 0xa0, 0x49, 0x6a, 0x8b, 0xca, 0x33, + 0x8c, 0x8c, 0x81, 0xc0, 0x56, 0x8c, 0x41, 0x3d, 0x82, 0x4c, 0xc4, 0x51, + 0x70, 0xaf, 0x9e, 0xcf, 0x67, 0x63, 0x6c, 0x34, 0x9e, 0x6d, 0x37, 0x37, + 0x84, 0xac, 0x43, 0x4f, 0x3c, 0x38, 0x88, 0x41, 0xc8, 0x87, 0x8e, 0x59, + 0x75, 0x54, 0x90, 0xbe, 0xcc, 0xac, 0x45, 0x3f, 0xca, 0xaa, 0x9d, 0x33, + 0x49, 0xb5, 0x4d, 0xcf, 0x6e, 0x7e, 0x4d, 0xa5, 0x7d, 0xcf, 0x7f, 0xc8, + 0x4e, 0x3b, 0x42, 0x6c, 0x5b, 0xc0, 0xbc, 0x2d, 0xcd, 0x66, 0xb5, 0x38, + 0xaf, 0x2e, 0xb9, 0x72, 0xaa, 0x30, 0x3d, 0x43, 0x4f, 0x55, 0x8a, 0xb1, + 0x98, 0xac, 0x97, 0x74, 0x51, 0x3e, 0x5b, 0x52, 0x41, 0xa0, 0x4a, 0x42, + 0xc0, 0x3f, 0xc3, 0x2e, 0x3c, 0x87, 0x36, 0xd0, 0x87, 0x76, 0x41, 0x99, + 0xb8, 0xa0, 0x53, 0xc4, 0x63, 0xaf, 0xb2, 0x4a, 0xbe, 0x88, 0xc9, 0x91, + 0x9d, 0xc7, 0x75, 0xa1, 0x4b, 0x4b, 0xa4, 0x51, 0xb8, 0x64, 0x8c, 0x7a, + 0x3b, 0xa8, 0x68, 0x98, 0xc4, 0xbd, 0x5e, 0x7f, 0xbc, 0x91, 0x5a, 0x81, + 0x3a, 0x4e, 0x8b, 0x34, 0x42, 0x54, 0x79, 0xbb, 0x99, 0x46, 0x6d, 0x88, + 0x49, 0x52, 0x4d, 0x47, 0x2f, 0xa6, 0x7d, 0xb6, 0xd1, 0x43, 0x67, 0x9a, + 0xb8, 0x7d, 0x50, 0x4e, 0xbe, 0x83, 0x54, 0xd7, 0x55, 0x76, 0x85, 0xc5, + 0x79, 0x61, 0x3d, 0xcb, 0xa8, 0x54, 0xcf, 0x4b, 0x52, 0x6f, 0x80, 0x61, + 
0x97, 0x8f, 0xab, 0x83, 0xc4, 0xa7, 0x85, 0x56, 0x59, 0x4a, 0xd0, 0x56, + 0xd5, 0xc6, 0xae, 0x6e, 0x7f, 0xc4, 0x4c, 0x70, 0x42, 0x2d, 0x6b, 0x9a, + 0x91, 0xc7, 0x66, 0xb0, 0x5a, 0x59, 0xcf, 0xb0, 0x3f, 0xb2, 0x35, 0x7b, + 0x7a, 0x79, 0x96, 0x8b, 0xae, 0x88, 0xa5, 0x3d, 0x42, 0x50, 0xcb, 0x35, + 0xaa, 0x7a, 0x47, 0x70, 0x31, 0xb3, 0x3a, 0x82, 0xb8, 0xa3, 0x4a, 0x72, + 0xcb, 0x60, 0x7a, 0x3c, 0x83, 0x47, 0x85, 0x49, 0xaa, 0xba, 0xb5, 0x77, + 0xcb, 0x7d, 0xaf, 0x77, 0x77, 0x49, 0xc1, 0xc4, 0x78, 0x7c, 0x5b, 0x99, + 0x43, 0xa5, 0x66, 0x76, 0x2f, 0x6e, 0xae, 0x59, 0x5d, 0xc4, 0x87, 0x3e, + 0x82, 0xb4, 0x8f, 0x5a, 0xc3, 0x9b, 0x8e, 0x95, 0x7e, 0x88, 0x88, 0x50, + 0xc5, 0x37, 0x81, 0xc9, 0x4f, 0x8a, 0x7d, 0x45, 0x92, 0x5b, 0x38, 0x3a, + 0x47, 0xa2, 0x86, 0xa9, 0xb9, 0x74, 0x99, 0x81, 0x46, 0x6d, 0x8e, 0xa8, + 0x41, 0xa8, 0xcf, 0x9a, 0x75, 0x5a, 0x5e, 0x7e, 0xd0, 0x97, 0x33, 0x31, + 0xbd, 0x75, 0xab, 0xb2, 0x86, 0x9d, 0x53, 0x7d, 0x5b, 0x44, 0xcb, 0xb6, + 0x56, 0xb5, 0x45, 0xa9, 0x93, 0x84, 0xb1, 0xd3, 0x36, 0x6b, 0x40, 0xa0, + 0x9c, 0x7a, 0xbd, 0x42, 0x8e, 0x46, 0x3d, 0xc7, 0xbe, 0xab, 0xa9, 0x3a, + 0x9a, 0x3b, 0x94, 0x32, 0xbc, 0x92, 0xbc, 0x69, 0x96, 0xa1, 0x69, 0xcc, + 0x8c, 0xbf, 0x8c, 0x3f, 0x81, 0x47, 0x50, 0xa7, 0xb2, 0x77, 0x8b, 0x46, + 0x51, 0x62, 0xaa, 0x67, 0xaf, 0x80, 0x78, 0x64, 0x6b, 0xa6, 0x43, 0xd5, + 0x91, 0x60, 0xa6, 0x44, 0xbf, 0xc2, 0x5b, 0xbe, 0x45, 0xcb, 0xa6, 0x34, + 0x3f, 0xcb, 0xa6, 0x6e, 0x94, 0xce, 0x80, 0x3d, 0x75, 0x5e, 0x3e, 0x96, + 0xa1, 0x5c, 0x8c, 0x56, 0xa1, 0x38, 0x78, 0x9d, 0x90, 0xc7, 0x3e, 0x76, + 0xa1, 0x9a, 0xdb, 0x89, 0xa5, 0xb0, 0x62, 0x3e, 0x38, 0x7c, 0x8e, 0x6f, + 0x38, 0x79, 0x42, 0x70, 0x47, 0xc6, 0x44, 0x52, 0x87, 0x5b, 0x42, 0x86, + 0x4b, 0x56, 0xd1, 0x4f, 0xba, 0x51, 0x55, 0xc8, 0x33, 0x83, 0xc2, 0x39, + 0xc8, 0xbb, 0xac, 0x70, 0x3f, 0x9e, 0x64, 0x96, 0x36, 0x51, 0xc8, 0x3d, + 0x40, 0x92, 0x7b, 0x69, 0x51, 0x94, 0x8c, 0x75, 0x51, 0xa7, 0xa2, 0xb6, + 0xc6, 0x97, 0xb5, 0x99, 0x3c, 0x81, 0x73, 0x7b, 0xe0, 0x52, 0xa6, 0x8c, + 0x9e, 0xa0, 0x5b, 0x36, 0x48, 0x9b, 0xcc, 0xc9, 0x68, 0xa0, 0x50, 0x94, + 0x97, 0xa2, 0x70, 0x7b, 0x5a, 0x39, 0x3f, 0x73, 0x65, 0x9f, 0x4b, 0x62, + 0x49, 0xa7, 0xdb, 0x8a, 0x3a, 0xbb, 0xd5, 0x34, 0x62, 0x7e, 0x9b, 0x85, + 0x94, 0x66, 0x45, 0x8e, 0x41, 0xa6, 0xa5, 0x8f, 0x60, 0xc4, 0x2d, 0x69, + 0x5b, 0xc9, 0x71, 0x5a, 0x4e, 0xc0, 0xaf, 0x48, 0x50, 0xc7, 0x77, 0xa6, + 0x83, 0xa9, 0xbf, 0x77, 0xa4, 0x5d, 0xb2, 0x64, 0x70, 0xc8, 0xae, 0xcf, + 0x5b, 0x76, 0x41, 0x3a, 0xb2, 0x69, 0x75, 0x3b, 0xb0, 0x79, 0xc2, 0x69, + 0x81, 0xa0, 0x6b, 0x9d, 0x82, 0x9a, 0x31, 0x3b, 0x33, 0x51, 0x57, 0x5a, + 0x42, 0xa1, 0xa1, 0x88, 0xc9, 0xc8, 0x6e, 0x36, 0xd1, 0x98, 0x9e, 0x7f, + 0xaa, 0x68, 0x8c, 0x51, 0x67, 0x56, 0x99, 0xa7, 0x52, 0x89, 0x36, 0x43, + 0x42, 0x39, 0x65, 0x95, 0xb3, 0x5c, 0xc3, 0x54, 0xba, 0x33, 0x42, 0x7d, + 0x5d, 0xad, 0xa5, 0x37, 0xc3, 0xad, 0xc3, 0xc6, 0x58, 0x9c, 0x61, 0x54, + 0xce, 0xb8, 0x54, 0x6f, 0xbd, 0xbe, 0x30, 0x4d, 0x64, 0x8b, 0x98, 0x3d, + 0x41, 0x77, 0x7b, 0x3a, 0xac, 0x45, 0xa4, 0xb1, 0x78, 0xbf, 0x9d, 0x86, + 0x4e, 0x56, 0x3f, 0xbf, 0x88, 0x5e, 0x4d, 0x4e, 0x56, 0x43, 0x64, 0xcf, + 0x6d, 0x74, 0x7d, 0x81, 0x59, 0x70, 0x75, 0x2c, 0xd0, 0x87, 0x3c, 0x73, + 0xa5, 0x6e, 0xd2, 0xb4, 0x6f, 0x41, 0xa0, 0x8b, 0x74, 0x4e, 0xc2, 0x3f, + 0x77, 0x80, 0x56, 0x81, 0x71, 0xb2, 0x3c, 0x4b, 0xa0, 0x5e, 0x64, 0x5c, + 0xac, 0xc5, 0x77, 0x35, 0xa3, 0x6e, 0x99, 0xc0, 0x9f, 0x6f, 0x8b, 0xa7, + 0xbb, 0x92, 0x9a, 0x9d, 0xcf, 0xab, 0x41, 0x63, 0x88, 0x45, 0xb2, 0x57, + 0x3c, 0x67, 0x51, 0x84, 0x9b, 0x7a, 0x46, 0x92, 0x8a, 0x83, 0xc3, 0x8c, + 
0x7a, 0x60, 0xae, 0xa2, 0x6b, 0x6e, 0xc7, 0x91, 0xbd, 0x42, 0x37, 0x7d, + 0xbd, 0x75, 0x55, 0xb8, 0xc4, 0x7b, 0xca, 0xaf, 0x9a, 0x57, 0x98, 0x64, + 0x50, 0x73, 0x68, 0x92, 0xbb, 0x6b, 0x7b, 0xbb, 0xba, 0x79, 0xc7, 0xbe, + 0x6d, 0x4c, 0x9f, 0x32, 0x58, 0x36, 0xd0, 0x90, 0x4b, 0xd5, 0x6d, 0xb2, + 0xc5, 0xba, 0xa8, 0x69, 0xca, 0x53, 0x96, 0x4f, 0x6c, 0x51, 0x85, 0x9e, + 0xc4, 0x70, 0xc7, 0x76, 0x9f, 0x6a, 0x6f, 0x8d, 0xcb, 0x52, 0x37, 0xb6, + 0xa3, 0xbe, 0x94, 0x4c, 0x95, 0x73, 0xbc, 0xce, 0x5b, 0x6b, 0x96, 0x77, + 0x9e, 0x53, 0x6b, 0x85, 0x89, 0x72, 0x73, 0xa3, 0x42, 0x54, 0x7f, 0xa1, + 0xb4, 0x39, 0x45, 0xad, 0x59, 0x5b, 0x75, 0x66, 0x5a, 0x7a, 0x9b, 0x77, + 0x39, 0xc0, 0x70, 0xc7, 0xc6, 0x3a, 0x3e, 0x33, 0x65, 0x4a, 0x9e, 0xcb, + 0x55, 0xd4, 0x5b, 0x5c, 0x48, 0x5a, 0x75, 0x90, 0xa4, 0x55, 0x61, 0x51, + 0x65, 0x93, 0xa3, 0xc4, 0x63, 0xb6, 0xc1, 0x7e, 0x64, 0x54, 0x4c, 0x70, + 0xb9, 0x39, 0x71, 0x7e, 0xa2, 0xb2, 0x94, 0x9f, 0x5e, 0x4b, 0x49, 0x61, + 0xab, 0xba, 0xc6, 0x62, 0x9b, 0xa9, 0xcd, 0x81, 0x70, 0x8d, 0x8b, 0xc3, + 0x5a, 0x55, 0x92, 0x69, 0x64, 0x41, 0x76, 0xd3, 0x3a, 0x4d, 0x65, 0x3a, + 0xb3, 0x85, 0x72, 0x58, 0x64, 0x66, 0x6d, 0xc7, 0xd4, 0x70, 0xdd, 0x99, + 0x77, 0xdf, 0xbb, 0x4f, 0xb3, 0xac, 0x9a, 0xb6, 0x4a, 0x46, 0xab, 0x7d, + 0x73, 0x6e, 0x42, 0xd9, 0x3e, 0xae, 0xc5, 0x7c, 0x66, 0x64, 0x7c, 0x48, + 0x63, 0x5e, 0x44, 0xa9, 0x68, 0xc7, 0x93, 0xa7, 0x90, 0x5e, 0xa7, 0xa9, + 0x4a, 0x74, 0x9e, 0x4a, 0x32, 0xb8, 0x79, 0x85, 0x8f, 0xa3, 0x83, 0x77, + 0xab, 0x55, 0xdf, 0x62, 0x4b, 0xce, 0xcd, 0xc0, 0x59, 0xa6, 0x3b, 0x9f, + 0x90, 0x30, 0x77, 0x6f, 0xc0, 0x86, 0xcb, 0x66, 0x9d, 0xbc, 0x62, 0x77, + 0xd4, 0x3b, 0x7d, 0xc7, 0xc0, 0xa6, 0xd1, 0x37, 0x97, 0x7f, 0x59, 0x40, + 0xc9, 0xd6, 0xa3, 0x85, 0xb4, 0x9d, 0x9e, 0xa8, 0x3b, 0xbe, 0x9c, 0x32, + 0x62, 0x9c, 0xd2, 0xc3, 0xc5, 0xc7, 0x70, 0x3a, 0x7a, 0x95, 0xa8, 0xb9, + 0x89, 0x87, 0xd2, 0x8c, 0x2f, 0x91, 0x8c, 0x64, 0x5d, 0x71, 0x43, 0x70, + 0xd3, 0x8f, 0x77, 0x8f, 0x4f, 0xce, 0xd5, 0x41, 0x7a, 0xc0, 0xd1, 0x6f, + 0xa2, 0x75, 0x76, 0x55, 0x6d, 0x88, 0x6e, 0xa0, 0x39, 0x5b, 0x37, 0x76, + 0x78, 0x7f, 0x63, 0x4e, 0xc2, 0x96, 0x36, 0x68, 0x5b, 0x75, 0xaa, 0x78, + 0x5d, 0x77, 0x51, 0x33, 0x7b, 0xb2, 0x72, 0xb3, 0xa7, 0xb0, 0x3d, 0xac, + 0x4a, 0x3e, 0x88, 0x3c, 0xb0, 0x36, 0x96, 0x7c, 0x5d, 0x6a, 0xb6, 0xa9, + 0x67, 0x6c, 0xda, 0xbe, 0x82, 0x5f, 0xb4, 0x8b, 0x7b, 0xaf, 0xab, 0xc1, + 0xd4, 0x82, 0x74, 0xc5, 0x83, 0xca, 0x31, 0x36, 0x38, 0x4c, 0xaf, 0x93, + 0x55, 0x54, 0x8d, 0x54, 0x99, 0x7f, 0x96, 0xb2, 0xc2, 0x9d, 0xa3, 0x94, + 0x87, 0x3a, 0xa4, 0x92, 0x97, 0x9a, 0x66, 0x57, 0x6e, 0x52, 0x32, 0x5b, + 0x4f, 0x6e, 0xb1, 0x65, 0x27, 0x69, 0x5f, 0xaf, 0xa2, 0xcc, 0x55, 0x8e, + 0x90, 0xcf, 0x3c, 0x95, 0x83, 0xb6, 0x68, 0xb9, 0x53, 0x3a, 0x4d, 0x79, + 0x81, 0x90, 0x5f, 0xbe, 0x4d, 0x8a, 0x56, 0xca, 0x5e, 0x88, 0x63, 0x70, + 0xb3, 0x7a, 0x9b, 0x6f, 0xa5, 0x7b, 0x4e, 0x45, 0x4a, 0xa1, 0x7d, 0xb3, + 0xd3, 0x85, 0xa7, 0xb6, 0x8c, 0xa8, 0xb5, 0x4e, 0x75, 0xc7, 0x3c, 0xba, + 0x3d, 0xc1, 0x75, 0x99, 0x45, 0x47, 0x40, 0x55, 0x8f, 0x3e, 0x45, 0x9d, + 0x85, 0x44, 0x8b, 0x57, 0x8f, 0x53, 0xa2, 0x6c, 0x5e, 0xbc, 0xa2, 0xa7, + 0xa3, 0x93, 0xc9, 0x73, 0x54, 0x5a, 0xcc, 0x33, 0x4e, 0x6c, 0x55, 0x83, + 0xa4, 0x58, 0x65, 0x4b, 0x51, 0x7a, 0x5b, 0x9f, 0xa5, 0x43, 0xab, 0x56, + 0x8c, 0xc9, 0x65, 0x70, 0x87, 0xdc, 0xa6, 0xb0, 0x47, 0x6a, 0x9f, 0x54, + 0x67, 0x9b, 0x50, 0xb4, 0x9e, 0x46, 0x92, 0xbd, 0x43, 0x39, 0xa7, 0xa3, + 0xa8, 0x53, 0x7a, 0x75, 0xc7, 0xd9, 0xb0, 0x8b, 0x7a, 0x4e, 0xc1, 0x41, + 0xa8, 0x43, 0x8b, 0xc2, 0xaf, 0x49, 0xbf, 0x5c, 0xad, 0x96, 0x2e, 0x69, + 
0x71, 0x50, 0x92, 0x8d, 0x60, 0xc2, 0xc3, 0x3d, 0x3c, 0x5f, 0x68, 0x60, + 0x7c, 0xc0, 0xa4, 0x74, 0x90, 0x84, 0x93, 0x85, 0xb7, 0x7c, 0x63, 0xa0, + 0xa1, 0xb0, 0x5d, 0x9f, 0x64, 0x6a, 0xa6, 0x63, 0x34, 0xa6, 0xcd, 0x83, + 0x56, 0xa3, 0x58, 0x60, 0x5b, 0x26, 0x5d, 0x3e, 0x70, 0xc8, 0xc1, 0x6e, + 0x65, 0x7b, 0x62, 0xcb, 0x79, 0x88, 0x69, 0xb8, 0x49, 0x6c, 0x88, 0x97, + 0xb7, 0x49, 0xae, 0x57, 0xa9, 0x91, 0xc0, 0xc6, 0x71, 0x36, 0x5f, 0x44, + 0x88, 0x38, 0x98, 0x9f, 0x3a, 0x2c, 0x85, 0xcb, 0x33, 0x82, 0xc8, 0x63, + 0x6a, 0xb9, 0xc8, 0x58, 0xa2, 0xbd, 0x65, 0xbc, 0x81, 0x52, 0x9e, 0xb7, + 0xbb, 0xaf, 0x38, 0x77, 0x83, 0x7a, 0x7c, 0x43, 0x7f, 0x69, 0x73, 0x3f, + 0x35, 0x2e, 0x35, 0x8b, 0x9a, 0xc2, 0x8f, 0x3f, 0xc2, 0x6f, 0x58, 0x8a, + 0x6d, 0x5f, 0x30, 0xd1, 0x6b, 0x32, 0x89, 0x9f, 0xa1, 0x93, 0x94, 0x9d, + 0x7d, 0x93, 0x4b, 0xb3, 0xca, 0x90, 0x72, 0x66, 0x8e, 0x89, 0x6e, 0x7b, + 0xcd, 0xab, 0x5c, 0x4c, 0x2c, 0xc2, 0x80, 0xda, 0xa5, 0xc2, 0xa5, 0x8f, + 0x85, 0x9a, 0xb1, 0x4f, 0x92, 0x31, 0x7b, 0xc4, 0x96, 0xcb, 0x40, 0xe3, + 0x4d, 0x93, 0xb6, 0xcf, 0xc1, 0x83, 0xb3, 0xa4, 0x54, 0xc4, 0x8a, 0xc1, + 0x36, 0x50, 0x84, 0x38, 0x49, 0x6e, 0xbf, 0x7a, 0x65, 0xde, 0x90, 0x8d, + 0x84, 0x5e, 0x49, 0x8a, 0x62, 0x4a, 0x3b, 0xb4, 0x8d, 0xcb, 0x5d, 0x6f, + 0x75, 0x5c, 0x8d, 0x6f, 0x90, 0x3c, 0x4c, 0x92, 0x2e, 0x5b, 0x3f, 0xd4, + 0x35, 0xa3, 0x74, 0x9d, 0x85, 0xca, 0x3a, 0x46, 0xc3, 0xc7, 0xb9, 0xca, + 0x83, 0x62, 0x55, 0x88, 0xa9, 0x6e, 0x87, 0xc3, 0xce, 0xaa, 0xd2, 0xc8, + 0x5c, 0xbd, 0x84, 0xd4, 0xb3, 0x59, 0xd4, 0x77, 0x69, 0x75, 0xaa, 0x8e, + 0xd1, 0x68, 0xc1, 0xb2, 0x45, 0x62, 0xa2, 0x84, 0x93, 0x61, 0x65, 0x86, + 0x98, 0x3e, 0x35, 0x93, 0x61, 0x78, 0xca, 0x8a, 0xb3, 0x7e, 0x78, 0x3f, + 0x3c, 0x88, 0xa9, 0x55, 0x76, 0xa1, 0x6e, 0xb4, 0xbf, 0x72, 0x55, 0xb9, + 0x46, 0x87, 0xcf, 0x3c, 0x51, 0x9f, 0x77, 0x7a, 0x46, 0x49, 0xa2, 0xab, + 0x88, 0xcb, 0x8e, 0x9a, 0x93, 0x8b, 0x70, 0x3c, 0xb5, 0xa9, 0x84, 0x52, + 0x43, 0xb9, 0xa0, 0x52, 0x97, 0x5c, 0x94, 0x47, 0x8e, 0x9f, 0x9f, 0x77, + 0x78, 0xca, 0x8f, 0x74, 0x5e, 0xa4, 0x62, 0xbf, 0x36, 0xa6, 0x85, 0x87, + 0x50, 0x43, 0x78, 0x82, 0x34, 0x3c, 0x68, 0xae, 0x2e, 0x6b, 0xa1, 0xbb, + 0x8a, 0xb8, 0xc4, 0x9f, 0x2d, 0x92, 0x82, 0x46, 0xae, 0x7c, 0xb4, 0xb0, + 0x3f, 0xbe, 0x41, 0x4e, 0x43, 0x8f, 0x9c, 0x9d, 0x7b, 0x36, 0xb3, 0x65, + 0x70, 0xc1, 0x45, 0xb3, 0x9c, 0xc8, 0x8a, 0x75, 0xc5, 0x54, 0x88, 0x6a, + 0x60, 0x95, 0x41, 0x8c, 0xc3, 0x89, 0xb4, 0xc6, 0x6b, 0x1f, 0x51, 0x7f, + 0xb7, 0x88, 0x61, 0xad, 0x8d, 0xd9, 0xd2, 0x85, 0x95, 0x59, 0x42, 0x65, + 0x96, 0x4c, 0xd1, 0x88, 0xac, 0x87, 0x61, 0x24, 0x94, 0x3f, 0x39, 0x60, + 0xa7, 0x83, 0x5b, 0x49, 0x43, 0x63, 0x86, 0x88, 0xa1, 0x2e, 0x48, 0x96, + 0x8b, 0x6e, 0xb5, 0x8e, 0x3e, 0x78, 0x4e, 0x77, 0x98, 0x7e, 0x60, 0x63, + 0x89, 0x92, 0x85, 0x67, 0x6e, 0x69, 0x64, 0x5b, 0xc1, 0xa1, 0xc2, 0x6a, + 0x65, 0x47, 0x98, 0x35, 0x6c, 0xbd, 0x3d, 0x3f, 0x54, 0xa3, 0x8a, 0x3a, + 0xb4, 0x24, 0xb7, 0x45, 0x93, 0x22, 0x7a, 0x8f, 0x3b, 0xdf, 0x96, 0x9e, + 0x88, 0x83, 0xa3, 0x44, 0xb1, 0x5b, 0xa3, 0x5d, 0xa8, 0x7e, 0x77, 0x81, + 0x42, 0x60, 0xd1, 0x42, 0x94, 0xc7, 0xa7, 0xcb, 0x57, 0x62, 0x5c, 0x54, + 0xb7, 0xc2, 0xaa, 0x73, 0x53, 0xc5, 0xa3, 0x50, 0xae, 0x4e, 0x63, 0x7d, + 0x8e, 0x2f, 0x72, 0x9a, 0x4e, 0x8a, 0x2b, 0x91, 0xb3, 0x89, 0x79, 0x68, + 0x48, 0x5e, 0xd6, 0x95, 0xb6, 0xd5, 0xc1, 0xcc, 0x3e, 0x31, 0x39, 0xa9, + 0x4f, 0x6d, 0x56, 0x66, 0x58, 0x4b, 0x53, 0x65, 0x49, 0x53, 0xc3, 0xbc, + 0x6a, 0x7c, 0xd2, 0xa2, 0x8f, 0xcf, 0xaa, 0x5c, 0x8a, 0x7b, 0x77, 0xa0, + 0x72, 0x46, 0x51, 0x5a, 0x3b, 0x9f, 0x6b, 0xa9, 0xb2, 0x79, 0xa4, 0x31, + 
0xbb, 0x68, 0x8d, 0xd2, 0xa7, 0x6c, 0x70, 0x74, 0x5b, 0xaa, 0xc5, 0x35, + 0x81, 0x92, 0xce, 0x3d, 0xb0, 0x64, 0xa7, 0x59, 0xb0, 0xc8, 0x7e, 0x56, + 0xb5, 0x97, 0x97, 0x75, 0x85, 0x80, 0x95, 0x2b, 0x8c, 0xa0, 0xd8, 0x69, + 0x9a, 0xa8, 0xda, 0x74, 0x45, 0xa3, 0x5b, 0x98, 0x35, 0x53, 0x64, 0xc5, + 0x58, 0x73, 0xd0, 0x74, 0xc2, 0x54, 0xbc, 0x72, 0x74, 0x94, 0x76, 0x46, + 0x86, 0x42, 0x93, 0x37, 0xc2, 0x7e, 0x88, 0xc2, 0x79, 0xb9, 0x83, 0x4b, + 0xae, 0x49, 0xb5, 0xc5, 0xa9, 0x7a, 0x53, 0x80, 0xb4, 0x57, 0xd4, 0xb2, + 0x8b, 0x48, 0x67, 0x4d, 0x55, 0x47, 0xd3, 0x58, 0xc2, 0x4b, 0xad, 0x75, + 0xc7, 0x45, 0x37, 0x78, 0xa9, 0xaf, 0xcf, 0xce, 0x52, 0x9a, 0xd3, 0xb4, + 0x8d, 0xb0, 0xc1, 0x51, 0xb0, 0xae, 0xbb, 0x32, 0xaa, 0x3d, 0x5f, 0x92, + 0xc5, 0x97, 0x4c, 0x96, 0xcd, 0x6c, 0xc6, 0x6e, 0x44, 0xc9, 0xb4, 0x47, + 0xa4, 0x68, 0x87, 0x8f, 0x76, 0x86, 0x3c, 0x8d, 0x34, 0xb0, 0x98, 0xaa, + 0xa3, 0x65, 0xac, 0x96, 0xc8, 0x6c, 0x46, 0x71, 0xc6, 0x52, 0x68, 0xa5, + 0xb3, 0x48, 0xc5, 0xa8, 0xbf, 0x37, 0x34, 0xaf, 0x39, 0x8b, 0x8b, 0x88, + 0x8f, 0x3a, 0xd4, 0x9e, 0x50, 0x92, 0x96, 0x4c, 0xa8, 0x4a, 0x70, 0x7d, + 0xa1, 0x73, 0x44, 0xc7, 0x4b, 0x56, 0xad, 0x52, 0xa7, 0x36, 0x52, 0x2e, + 0x4b, 0xa5, 0xa6, 0xc7, 0xd4, 0x80, 0xc0, 0x71, 0xb5, 0xb3, 0xb4, 0xc3, + 0xc1, 0x4f, 0x52, 0x96, 0x78, 0xb8, 0xab, 0x5a, 0xb7, 0x60, 0x55, 0xc4, + 0xc3, 0x53, 0x40, 0x86, 0xbb, 0x42, 0xd0, 0x89, 0x46, 0x8c, 0xcd, 0x60, + 0x58, 0x80, 0xa2, 0xb4, 0xcc, 0xd7, 0x57, 0x90, 0x66, 0x8d, 0xc5, 0xa4, + 0xa6, 0x9e, 0x86, 0x7a, 0x84, 0x77, 0xaf, 0x85, 0x74, 0xc4, 0xc9, 0x7b, + 0xcc, 0x4d, 0xc3, 0xc7, 0xad, 0x91, 0x40, 0x6e, 0x33, 0x6e, 0xb4, 0x5f, + 0x7a, 0x8d, 0xb6, 0xbe, 0xa0, 0xc8, 0x66, 0x88, 0x83, 0x70, 0x3e, 0x86, + 0x64, 0x7b, 0x5f, 0x61, 0x42, 0x48, 0x87, 0xc7, 0x42, 0x42, 0xc8, 0xa4, + 0x34, 0xb7, 0x4b, 0x4b, 0x8e, 0x86, 0x51, 0x93, 0x35, 0x94, 0xcd, 0x64, + 0x9e, 0xd1, 0x38, 0x9c, 0xce, 0xa3, 0x7d, 0x51, 0x86, 0xc9, 0x91, 0xa2, + 0x75, 0xc3, 0xa2, 0x52, 0x31, 0x84, 0xc0, 0x79, 0x8b, 0x4d, 0x7a, 0x3d, + 0xd9, 0x51, 0x50, 0x51, 0x8e, 0x83, 0xb7, 0x3f, 0xb7, 0x83, 0xb2, 0x96, + 0xa6, 0x4d, 0x3b, 0x71, 0x3b, 0xa4, 0x61, 0xb7, 0x68, 0x4f, 0xb7, 0x89, + 0x63, 0x75, 0xb9, 0x67, 0xae, 0x71, 0x4d, 0x8b, 0x57, 0x3a, 0x57, 0x4f, + 0x6d, 0x33, 0xba, 0x77, 0x8c, 0x4c, 0xba, 0x33, 0x36, 0xcd, 0xab, 0x53, + 0x71, 0x4f, 0xc1, 0x66, 0x92, 0x80, 0x3a, 0x34, 0x4c, 0x39, 0x4e, 0xa0, + 0xba, 0xa9, 0x5a, 0x66, 0x9f, 0x28, 0x4d, 0x72, 0x8c, 0x7e, 0xa9, 0xa2, + 0xac, 0xa6, 0xd0, 0x90, 0x6a, 0x71, 0x72, 0x3c, 0xd7, 0xa9, 0xaf, 0x81, + 0x4b, 0xba, 0x3c, 0xc7, 0x75, 0xca, 0xad, 0x91, 0x75, 0x5e, 0xcd, 0x94, + 0xc3, 0x4f, 0x99, 0xa1, 0xc6, 0x6e, 0x9d, 0x9f, 0x56, 0xcc, 0x5a, 0x54, + 0x5d, 0x6f, 0x33, 0x90, 0xc9, 0xb9, 0x2d, 0x96, 0xd3, 0xb6, 0xb8, 0x66, + 0x35, 0xb0, 0x55, 0x71, 0xc8, 0xa5, 0x99, 0x62, 0x84, 0xa1, 0xc8, 0x6f, + 0x72, 0xa9, 0x6a, 0x9b, 0x93, 0x62, 0xcf, 0x3b, 0xd2, 0x60, 0x38, 0x8d, + 0x9a, 0x6b, 0x45, 0x5d, 0x48, 0x95, 0x5b, 0x68, 0xa7, 0xa7, 0xbe, 0xb0, + 0xa7, 0xa6, 0xc8, 0x90, 0x45, 0x6b, 0x92, 0x72, 0xa0, 0xc0, 0xd1, 0x39, + 0xd2, 0xcd, 0xa6, 0x54, 0x36, 0x2a, 0x4a, 0xc7, 0xb9, 0x68, 0x41, 0x44, + 0x77, 0xb3, 0x41, 0x7b, 0x68, 0xc7, 0x64, 0xc9, 0xbf, 0x86, 0x7e, 0x47, + 0x6d, 0xc2, 0x59, 0x54, 0x72, 0xba, 0xcd, 0x61, 0xac, 0x45, 0xce, 0x99, + 0x64, 0xc2, 0xaa, 0x5a, 0xd0, 0xb4, 0x53, 0xb7, 0x3d, 0x8f, 0x43, 0x4e, + 0xa7, 0x6d, 0xa3, 0xc8, 0xa9, 0xc0, 0x82, 0x7d, 0x50, 0xb1, 0x7e, 0x6a, + 0xbc, 0x6a, 0x41, 0x59, 0xa4, 0x3a, 0x68, 0xcf, 0x3a, 0xb1, 0xa0, 0x9f, + 0xa4, 0x8d, 0xc5, 0x4b, 0x4d, 0x89, 0xc4, 0xcd, 0x69, 0x72, 0xca, 0xca, + 
0xa3, 0xa9, 0xa5, 0xb5, 0xbf, 0x5c, 0xa4, 0xd1, 0xc6, 0x71, 0x90, 0x70, + 0x4f, 0xb0, 0x4e, 0x4d, 0x43, 0x93, 0x75, 0x67, 0x61, 0x31, 0x35, 0x91, + 0xa0, 0x5b, 0x41, 0x72, 0x5a, 0xaf, 0x90, 0x91, 0x52, 0x56, 0xb6, 0x9e, + 0x3e, 0x29, 0xa9, 0xa8, 0x7b, 0xc5, 0x7f, 0x6b, 0x56, 0x82, 0x51, 0x36, + 0x61, 0x61, 0xb6, 0xa8, 0x9e, 0x68, 0xb0, 0x94, 0x61, 0x61, 0x9c, 0x7a, + 0x70, 0xce, 0x53, 0xa1, 0x9b, 0xc3, 0x36, 0x92, 0x54, 0x8b, 0xaf, 0x73, + 0x66, 0x5f, 0x52, 0xb3, 0xb6, 0x8b, 0x7d, 0xa7, 0x60, 0x7b, 0x9c, 0xbb, + 0x74, 0x46, 0x46, 0x32, 0x30, 0x66, 0x6c, 0x7a, 0x6a, 0x54, 0x54, 0x91, + 0x3b, 0x69, 0x6d, 0x60, 0x71, 0x74, 0x6a, 0x8a, 0x58, 0x83, 0x6b, 0x3a, + 0x5f, 0xa2, 0xcf, 0x2f, 0xbc, 0x79, 0xcf, 0xb7, 0x5c, 0x8f, 0xc7, 0xc8, + 0x74, 0x69, 0x9e, 0x86, 0x35, 0x84, 0x37, 0x9b, 0x88, 0x7d, 0x4b, 0x4e, + 0x43, 0x6f, 0x71, 0xba, 0x80, 0x51, 0x39, 0xa0, 0x63, 0x41, 0xc9, 0xa5, + 0xb1, 0xbb, 0xb1, 0xcf, 0x49, 0x46, 0x4a, 0x45, 0x87, 0x4b, 0xc5, 0xaa, + 0x6e, 0xa8, 0xaa, 0x89, 0x8a, 0x45, 0xc3, 0x3b, 0x49, 0xa7, 0x5b, 0x53, + 0xba, 0x36, 0x3b, 0x7e, 0xad, 0x86, 0x62, 0xd7, 0x88, 0x2d, 0x52, 0x72, + 0x9d, 0xbd, 0xaa, 0xbf, 0x3f, 0x95, 0x94, 0x8b, 0x86, 0x56, 0xa2, 0x85, + 0xce, 0x8f, 0xaf, 0x5e, 0xae, 0x76, 0x9a, 0x91, 0xc3, 0xac, 0x65, 0xad, + 0x8d, 0x54, 0xb2, 0xba, 0x8e, 0x99, 0x79, 0x79, 0x40, 0x7a, 0xc7, 0xad, + 0xb6, 0x42, 0xb1, 0xc7, 0xb3, 0x56, 0x43, 0xb1, 0x63, 0x54, 0xa2, 0x64, + 0x55, 0xa5, 0xad, 0x9e, 0x61, 0x68, 0xaa, 0x4a, 0xa0, 0x5c, 0x8a, 0x5f, + 0x53, 0x39, 0x33, 0x5f, 0x58, 0x93, 0x7f, 0x8f, 0x7d, 0x9b, 0xc4, 0x70, + 0x6e, 0x9a, 0xd2, 0x73, 0xbb, 0x8c, 0x67, 0xa7, 0x6d, 0xce, 0xac, 0x36, + 0x83, 0x4b, 0x9c, 0x7a, 0xa4, 0xb0, 0x3f, 0xc5, 0x6f, 0x6f, 0x44, 0x9d, + 0x4e, 0x76, 0x56, 0x34, 0x52, 0xa0, 0xca, 0x8d, 0xbf, 0x81, 0x80, 0xb9, + 0xda, 0x60, 0x9a, 0xaa, 0x43, 0x2e, 0xc4, 0x41, 0x77, 0xae, 0x63, 0x2a, + 0x4e, 0x44, 0x8c, 0x82, 0x59, 0x36, 0xb5, 0xda, 0xa7, 0xbf, 0xab, 0xc1, + 0x89, 0xd4, 0x9a, 0x5d, 0xae, 0x49, 0x9e, 0x51, 0xc7, 0xe3, 0x9b, 0xbf, + 0x86, 0x71, 0x98, 0x70, 0x9c, 0x72, 0x96, 0x70, 0x41, 0x38, 0xcd, 0x73, + 0x28, 0x4e, 0x44, 0xd3, 0xbc, 0xba, 0xe0, 0x89, 0xad, 0x90, 0x8f, 0x9c, + 0x5c, 0xc3, 0xc9, 0xb3, 0x83, 0x76, 0xbc, 0x57, 0x65, 0x51, 0x76, 0x9d, + 0x33, 0xcb, 0x8a, 0x61, 0x76, 0x60, 0x76, 0x5c, 0x77, 0xa2, 0x4d, 0x4d, + 0xc8, 0xd8, 0x9d, 0x82, 0x55, 0x92, 0xbd, 0xc4, 0x82, 0x8f, 0x55, 0x6a, + 0xcc, 0x68, 0x9c, 0x41, 0x80, 0x63, 0x4a, 0x83, 0x6f, 0xb5, 0x97, 0x91, + 0x37, 0x99, 0x55, 0x9f, 0xae, 0xa4, 0xba, 0x4a, 0xa2, 0xbb, 0x60, 0x4e, + 0xcd, 0xd6, 0xc3, 0xc4, 0x79, 0xd2, 0x3d, 0x72, 0xc3, 0x34, 0x62, 0xaf, + 0x4d, 0x8e, 0x98, 0x8c, 0x3c, 0x7c, 0x89, 0x7a, 0x6a, 0x97, 0xae, 0xc6, + 0x88, 0x1a, 0x7e, 0x61, 0x4b, 0x47, 0x7d, 0x49, 0xaf, 0x41, 0x5b, 0xbe, + 0x45, 0x77, 0x71, 0xa3, 0xc5, 0xa6, 0xac, 0x8c, 0x89, 0x61, 0x8b, 0xa4, + 0xbc, 0x38, 0x64, 0x62, 0x57, 0xdb, 0xc0, 0x4a, 0xbc, 0x7a, 0xb7, 0xca, + 0xbc, 0x83, 0x6f, 0x71, 0x5a, 0x81, 0xbc, 0xb1, 0x8f, 0x39, 0x6c, 0xdc, + 0xaf, 0xae, 0xe0, 0xd3, 0x90, 0xaf, 0x70, 0x79, 0x9f, 0xa6, 0x86, 0x89, + 0x49, 0xc6, 0x50, 0xb2, 0xb2, 0xce, 0x55, 0xd9, 0x96, 0x47, 0xa3, 0x62, + 0xa6, 0x92, 0x40, 0x88, 0x40, 0x43, 0x56, 0x5e, 0x6f, 0x78, 0x59, 0x32, + 0x54, 0x44, 0x60, 0x48, 0x52, 0x5a, 0x93, 0x79, 0x76, 0xa0, 0x37, 0x90, + 0xb7, 0x4e, 0xb5, 0x3d, 0x6c, 0x97, 0x96, 0x81, 0xc8, 0x81, 0x49, 0x44, + 0x3d, 0x5b, 0x44, 0xa3, 0x74, 0xaf, 0x2a, 0x26, 0x9d, 0xd1, 0x86, 0xc5, + 0xd2, 0xa5, 0x53, 0x86, 0x80, 0x40, 0x76, 0x3e, 0x7b, 0x9e, 0xa1, 0xb7, + 0xc0, 0x52, 0x43, 0x50, 0x5d, 0xb5, 0xb1, 0x53, 0x41, 0x6c, 0x85, 0x8b, + 
0xb7, 0x5f, 0x78, 0xa5, 0x35, 0xb5, 0x51, 0x89, 0x71, 0x9f, 0x6d, 0x4a, + 0xbc, 0x82, 0xb0, 0xbb, 0x50, 0xe2, 0x5f, 0xc7, 0xcb, 0x71, 0x76, 0xe0, + 0x5c, 0x36, 0xd1, 0x3c, 0xc1, 0x87, 0x91, 0x47, 0xd2, 0xd0, 0x32, 0x64, + 0x39, 0x5a, 0x4a, 0xa8, 0x65, 0x84, 0xa2, 0x5f, 0xb3, 0xca, 0x66, 0xb8, + 0x89, 0x52, 0xdf, 0x7f, 0x41, 0xa9, 0xc5, 0xad, 0xb9, 0xc0, 0x94, 0x69, + 0x9f, 0x94, 0x8f, 0xbf, 0x7b, 0x6c, 0x64, 0xc9, 0x9b, 0x31, 0x61, 0xa1, + 0x6e, 0xd2, 0x5b, 0x60, 0xc1, 0x9a, 0x5d, 0x4e, 0x89, 0x6a, 0x9b, 0xc4, + 0x69, 0x38, 0xa1, 0xa5, 0x63, 0x46, 0xc5, 0xa6, 0x3b, 0x7d, 0xaa, 0x38, + 0x84, 0x58, 0x9c, 0xc2, 0x62, 0x6c, 0x9d, 0x73, 0x46, 0x5d, 0x78, 0x3e, + 0x7e, 0x69, 0xcf, 0x70, 0x78, 0x5d, 0x43, 0x78, 0xd1, 0xe2, 0x68, 0xdc, + 0x5f, 0x60, 0xc6, 0x5e, 0x63, 0xc0, 0x53, 0x6e, 0xd9, 0xa9, 0x53, 0x57, + 0x94, 0x48, 0x83, 0x62, 0x9e, 0x6e, 0x76, 0x78, 0xa4, 0x49, 0xa9, 0xa1, + 0x48, 0x2f, 0x7b, 0x67, 0x3a, 0x6f, 0x82, 0x79, 0xb7, 0xb6, 0xaa, 0x67, + 0x64, 0xb4, 0xac, 0x4a, 0x89, 0xbf, 0x5b, 0x52, 0x4b, 0xbb, 0x58, 0x8d, + 0x2f, 0x7c, 0xb8, 0x92, 0x29, 0x5b, 0x43, 0x5e, 0xbe, 0xa3, 0xb4, 0x37, + 0xb8, 0x56, 0x56, 0x8a, 0x4f, 0xae, 0x26, 0x2d, 0x5d, 0xba, 0x59, 0x7a, + 0x65, 0x7d, 0x31, 0x39, 0x46, 0x94, 0x60, 0xa0, 0x40, 0xb1, 0x98, 0x5e, + 0x43, 0x76, 0xb7, 0x5f, 0xc8, 0x4a, 0x6f, 0x8d, 0x48, 0x79, 0x56, 0x80, + 0x75, 0x97, 0xc0, 0x7a, 0x92, 0xba, 0x6f, 0x83, 0x1d, 0x77, 0x6f, 0x72, + 0xb8, 0xa8, 0x6a, 0x55, 0x2a, 0x99, 0xb9, 0xc5, 0x9c, 0x4e, 0x6e, 0x3a, + 0x35, 0x65, 0xe3, 0xca, 0x8b, 0xbd, 0x86, 0x65, 0xa8, 0x92, 0xba, 0x7b, + 0x6b, 0x3f, 0x69, 0xbb, 0x6e, 0x80, 0x76, 0x64, 0x9f, 0x3b, 0x50, 0x85, + 0xcc, 0x6c, 0x18, 0x81, 0x82, 0xa2, 0x72, 0x98, 0x4b, 0xbf, 0x60, 0xbb, + 0x89, 0xb2, 0xa4, 0x67, 0x92, 0x38, 0x50, 0xb4, 0x46, 0x65, 0x73, 0xa3, + 0x91, 0xbc, 0x3b, 0xd5, 0x96, 0xc3, 0xd4, 0x50, 0x88, 0xae, 0x88, 0xbe, + 0x8c, 0x8d, 0xbb, 0x86, 0xd8, 0x50, 0x7b, 0x9a, 0xd2, 0xa7, 0xc8, 0xb4, + 0x61, 0x4f, 0x89, 0x89, 0x55, 0x89, 0x6d, 0x2d, 0x94, 0x9f, 0xab, 0x74, + 0x65, 0x7f, 0x97, 0x42, 0x35, 0x51, 0xa5, 0x59, 0x54, 0x74, 0xb3, 0xa0, + 0x5e, 0xa3, 0x8b, 0xd0, 0x87, 0x2c, 0x40, 0xa8, 0x5f, 0x91, 0x3f, 0x4e, + 0xbe, 0x97, 0x8d, 0xb1, 0xa9, 0x3d, 0xc3, 0x75, 0x2d, 0xcf, 0x67, 0x54, + 0x36, 0x82, 0xb8, 0xa5, 0x92, 0xd3, 0x8e, 0xb1, 0xae, 0x99, 0x7d, 0x8c, + 0xb9, 0x97, 0x93, 0x2b, 0x4f, 0x8d, 0xc3, 0x78, 0x8a, 0xb1, 0x42, 0x63, + 0x3c, 0x94, 0x74, 0x9d, 0x74, 0xb9, 0x41, 0x7c, 0x3b, 0x40, 0x9d, 0xba, + 0xa8, 0x7c, 0x70, 0xcf, 0x68, 0x47, 0xc3, 0x52, 0x80, 0x70, 0x6c, 0x5b, + 0xb6, 0x39, 0x5b, 0x94, 0x67, 0x51, 0x79, 0x2e, 0x50, 0xce, 0xd7, 0x5d, + 0xc6, 0x71, 0xe3, 0x4c, 0xce, 0xab, 0x8a, 0x78, 0x7b, 0x71, 0xa7, 0xa2, + 0xc6, 0x47, 0x80, 0xc6, 0x80, 0xb8, 0x51, 0x89, 0x4c, 0xb3, 0x62, 0x52, + 0x68, 0x2e, 0x4a, 0x4d, 0x79, 0x66, 0x58, 0xaf, 0x8a, 0x27, 0xc7, 0x8f, + 0xda, 0xc7, 0x9c, 0xaf, 0xc9, 0x50, 0xab, 0xcc, 0xa6, 0x28, 0xab, 0xd1, + 0xb1, 0xd4, 0xa8, 0xf2, 0x60, 0x95, 0x4d, 0xaa, 0x4c, 0x93, 0x46, 0xbd, + 0x9f, 0xb6, 0xbc, 0x8e, 0x36, 0x34, 0x64, 0xc4, 0x4e, 0x80, 0x80, 0x4b, + 0x65, 0x81, 0x5a, 0x3a, 0x59, 0xd9, 0xdc, 0x7a, 0x93, 0x7c, 0x3d, 0x4f, + 0x89, 0x76, 0xd1, 0xb9, 0x78, 0x3d, 0x68, 0x86, 0x69, 0x67, 0xab, 0xb4, + 0x44, 0x56, 0x38, 0x2f, 0xa8, 0x72, 0x66, 0xb2, 0x4f, 0x91, 0xb3, 0xa3, + 0xa4, 0xc7, 0x82, 0x40, 0xba, 0xb1, 0x52, 0x8f, 0xa9, 0xd0, 0xb0, 0x62, + 0xa4, 0x3f, 0x84, 0xd1, 0x77, 0x52, 0x62, 0x99, 0x67, 0xc3, 0xb6, 0xa5, + 0x5b, 0x32, 0x73, 0x9e, 0x83, 0x3b, 0xc4, 0x9a, 0x70, 0x61, 0xc1, 0x77, + 0x7c, 0x97, 0x6d, 0x39, 0xba, 0x8b, 0x38, 0xc3, 0xb8, 0x98, 0x55, 0x44, + 
0x33, 0x88, 0x34, 0x4f, 0xa0, 0x51, 0xbc, 0xb2, 0xb0, 0xcd, 0x8d, 0xad, + 0x8b, 0x6e, 0x43, 0x75, 0x35, 0x9e, 0xcb, 0x48, 0xb2, 0x31, 0x93, 0x40, + 0x4a, 0x3e, 0xab, 0x73, 0x91, 0xc8, 0x33, 0x44, 0x41, 0x96, 0xbe, 0x8d, + 0xba, 0x93, 0x55, 0x79, 0xb6, 0xb7, 0x56, 0x64, 0x56, 0xc6, 0x46, 0x79, + 0x3f, 0x48, 0x99, 0x5f, 0xc0, 0xd7, 0x3e, 0xa1, 0x89, 0x39, 0xad, 0xd3, + 0xc2, 0x36, 0x36, 0x51, 0x57, 0x5c, 0xa2, 0x38, 0x54, 0xb4, 0x3b, 0x7f, + 0x94, 0x50, 0x39, 0x4b, 0x3a, 0x46, 0xa1, 0xb8, 0x70, 0x8d, 0xa3, 0xc1, + 0x39, 0x6c, 0x52, 0xc3, 0x61, 0x51, 0x99, 0x46, 0xa3, 0x3d, 0x4c, 0xc7, + 0x87, 0x58, 0x98, 0x44, 0x61, 0xbc, 0x33, 0x71, 0x55, 0xad, 0x98, 0x64, + 0xb8, 0x73, 0x76, 0xa9, 0xc3, 0x43, 0x9a, 0xd1, 0x9f, 0x40, 0x52, 0x51, + 0x46, 0x6c, 0x9f, 0xb9, 0xc1, 0x75, 0xb4, 0x43, 0x8b, 0x38, 0x70, 0x48, + 0x9d, 0x72, 0x84, 0x57, 0x43, 0xcc, 0x55, 0x57, 0xb4, 0x5f, 0x98, 0xbb, + 0x3e, 0x48, 0x9a, 0xbd, 0x9f, 0xaa, 0x86, 0x47, 0x76, 0xa8, 0x5f, 0x44, + 0x39, 0xcd, 0x6c, 0x46, 0xb8, 0xac, 0xc3, 0x36, 0xd1, 0xc3, 0x4f, 0xac, + 0x79, 0x5c, 0x42, 0x6f, 0x67, 0x5f, 0xd3, 0xcf, 0x8c, 0xa9, 0xa9, 0x51, + 0x73, 0x62, 0x81, 0xa4, 0x99, 0x5a, 0x41, 0xd2, 0xad, 0xa2, 0xd3, 0x5f, + 0xa6, 0x8a, 0x8b, 0x7e, 0x5f, 0x5b, 0x58, 0xc2, 0xa5, 0x31, 0x48, 0x64, + 0x3f, 0x8d, 0xb6, 0x35, 0x46, 0x51, 0x9b, 0x37, 0x39, 0xa4, 0xb2, 0xb9, + 0xaa, 0xae, 0x57, 0xd4, 0x3d, 0x9f, 0x64, 0x4a, 0x8f, 0xc3, 0xac, 0x89, + 0xbf, 0x9a, 0x41, 0x44, 0xca, 0x80, 0x43, 0x7d, 0xcd, 0x67, 0xbc, 0x86, + 0x63, 0x78, 0x60, 0xd4, 0x77, 0x60, 0x3f, 0x5c, 0x85, 0x3b, 0xc1, 0x9d, + 0x7f, 0x96, 0x6b, 0x39, 0x7e, 0x44, 0x59, 0x8a, 0x53, 0xaf, 0x65, 0x3b, + 0xc6, 0x83, 0x66, 0x30, 0x71, 0xac, 0xa4, 0x73, 0x45, 0x6c, 0x44, 0xc0, + 0x9a, 0x8d, 0x54, 0x55, 0x99, 0xbe, 0x70, 0x87, 0x6d, 0x35, 0x48, 0x5f, + 0xa3, 0xbb, 0x82, 0xb4, 0x8d, 0xb0, 0xa0, 0x47, 0x8b, 0xa4, 0x66, 0xb0, + 0x8e, 0xa7, 0xb1, 0x65, 0xce, 0x5e, 0xd0, 0x9c, 0x43, 0x6e, 0x39, 0x42, + 0x79, 0x6b, 0x5a, 0xcd, 0x7b, 0x4d, 0x72, 0x50, 0x8e, 0x32, 0x70, 0x6c, + 0x8f, 0xb9, 0xbc, 0x45, 0x37, 0x62, 0x38, 0x43, 0x7d, 0x9f, 0x79, 0x5b, + 0x8c, 0x7f, 0x72, 0x7e, 0x73, 0x6d, 0xba, 0x84, 0xbf, 0x5b, 0x9f, 0x61, + 0x71, 0x39, 0x4f, 0x7e, 0x8d, 0x43, 0x8d, 0x52, 0xae, 0x3b, 0xd0, 0x42, + 0xa8, 0x6b, 0x61, 0x32, 0x9b, 0x33, 0x79, 0xa5, 0x39, 0xbd, 0x43, 0x49, + 0x9f, 0x4b, 0x7f, 0xa8, 0x5a, 0x6b, 0xbb, 0xbd, 0x8a, 0xcd, 0xa3, 0x9d, + 0xa9, 0xa2, 0x99, 0x67, 0x3c, 0x55, 0x85, 0x6d, 0xce, 0x43, 0xb9, 0x80, + 0xcb, 0xc0, 0x5e, 0x9a, 0xb5, 0x7e, 0xbe, 0x7b, 0xa8, 0xc2, 0x89, 0x52, + 0xc6, 0x93, 0xc6, 0xa4, 0x7e, 0x48, 0x3a, 0x54, 0x36, 0xbc, 0xb4, 0xae, + 0xa1, 0xcf, 0x3b, 0x5c, 0x63, 0x81, 0x52, 0xcd, 0x72, 0xb7, 0x3f, 0xd1, + 0x38, 0xb0, 0x69, 0x97, 0x41, 0x32, 0x72, 0xbf, 0x5d, 0x4d, 0x73, 0x46, + 0x55, 0x90, 0x45, 0x6a, 0x9a, 0x81, 0x3b, 0x7a, 0x39, 0x8d, 0x3f, 0xab, + 0x79, 0x6b, 0x8f, 0x4c, 0x7b, 0x31, 0xb1, 0xd5, 0x74, 0x46, 0x9f, 0x49, + 0x4f, 0x84, 0xc0, 0x6f, 0x72, 0xcf, 0x34, 0x34, 0x5e, 0xaf, 0x42, 0x85, + 0xab, 0xce, 0x98, 0x82, 0x5d, 0x40, 0xd0, 0x61, 0xbc, 0xad, 0x68, 0x58, + 0xc6, 0x3b, 0xd3, 0x92, 0x72, 0x65, 0xb8, 0x83, 0x51, 0x64, 0xb8, 0x3f, + 0x74, 0x55, 0x80, 0x30, 0xb3, 0x78, 0x99, 0xa9, 0x57, 0x58, 0x7f, 0xb0, + 0x35, 0x39, 0x5c, 0x70, 0x4c, 0xc8, 0x34, 0x4e, 0x8c, 0xbd, 0x5e, 0x97, + 0x86, 0xac, 0x7d, 0xb1, 0xb2, 0x69, 0x4f, 0x52, 0x35, 0x5c, 0x38, 0xae, + 0x76, 0xb2, 0x62, 0x3c, 0xa5, 0x5b, 0xa0, 0xca, 0x6d, 0x3d, 0xae, 0x71, + 0x49, 0x67, 0xc7, 0xa1, 0xa7, 0xb4, 0x43, 0x71, 0x84, 0x97, 0xd4, 0xc1, + 0x43, 0x82, 0x59, 0x8b, 0x4b, 0xaa, 0x48, 0x76, 0xba, 0xc2, 0x64, 0xa5, + 
0xb6, 0x82, 0x39, 0x33, 0xad, 0xa9, 0x4f, 0x8e, 0xaa, 0x84, 0x32, 0x9b, + 0xb7, 0x60, 0x37, 0x75, 0x45, 0x63, 0x4a, 0x35, 0x87, 0x80, 0x59, 0xc6, + 0xb3, 0x61, 0x6c, 0xb1, 0xc0, 0xc7, 0x8f, 0x59, 0x97, 0x6f, 0xa7, 0x7f, + 0x7a, 0x80, 0x5e, 0x6c, 0x44, 0x5a, 0xa1, 0xb4, 0x96, 0x76, 0xb8, 0xc0, + 0x7b, 0x44, 0x39, 0xbc, 0x47, 0x4d, 0x76, 0xb8, 0x75, 0x92, 0x7c, 0xa5, + 0x67, 0x36, 0x6d, 0x83, 0xb1, 0x98, 0x72, 0x90, 0x32, 0x97, 0xc8, 0x5d, + 0x33, 0xa9, 0xb4, 0x79, 0x6d, 0x4e, 0xcd, 0x76, 0x41, 0x36, 0x72, 0x96, + 0x88, 0xcb, 0x55, 0x6a, 0x93, 0xbe, 0xcc, 0xa7, 0xc7, 0x8e, 0xb7, 0xb4, + 0xd0, 0x7c, 0x47, 0x77, 0xb8, 0x7a, 0x5c, 0x5f, 0x9e, 0x31, 0x43, 0x59, + 0x51, 0x52, 0x54, 0x9c, 0x3e, 0x42, 0x49, 0xa0, 0x65, 0x42, 0x2d, 0xba, + 0x85, 0x9b, 0xb7, 0x46, 0xa0, 0xa6, 0x6c, 0x75, 0x50, 0x91, 0x88, 0xa6, + 0xab, 0xc6, 0xc8, 0xc0, 0x66, 0x57, 0x82, 0x49, 0x4e, 0x9a, 0xc4, 0x6b, + 0xac, 0xa7, 0x35, 0x71, 0x7e, 0xac, 0x44, 0x8f, 0x33, 0x3e, 0xc6, 0x79, + 0x8f, 0x38, 0x4f, 0x53, 0xac, 0x58, 0x94, 0x62, 0xcf, 0x88, 0x59, 0xcb, + 0x47, 0x40, 0x55, 0xa9, 0x69, 0xc9, 0x7e, 0x95, 0xb8, 0x56, 0x92, 0x5f, + 0x85, 0x41, 0xb2, 0xb0, 0x8c, 0x31, 0x42, 0xc2, 0xc5, 0x90, 0x55, 0x48, + 0x3c, 0xba, 0xa0, 0x4a, 0x47, 0x3a, 0xa8, 0x35, 0x7d, 0x65, 0x8c, 0x6a, + 0x83, 0x4b, 0x40, 0x59, 0x8c, 0x6d, 0x68, 0x4a, 0xb1, 0x68, 0x59, 0xb3, + 0x75, 0xb5, 0x57, 0x95, 0x34, 0x87, 0x80, 0x37, 0x4e, 0x92, 0x5e, 0x4e, + 0x3b, 0x92, 0x6b, 0xd2, 0x5b, 0x8d, 0x8e, 0x4b, 0x45, 0xbd, 0x36, 0xb9, + 0x59, 0x59, 0x31, 0xa5, 0xb2, 0x54, 0x53, 0x53, 0xb9, 0xa5, 0x60, 0x53, + 0x92, 0x31, 0xae, 0xa4, 0x87, 0xc6, 0xa4, 0x41, 0x94, 0x79, 0x50, 0x5d, + 0x49, 0x44, 0xcd, 0xbc, 0x8b, 0x34, 0xc5, 0x47, 0x89, 0x6d, 0xb9, 0x73, + 0x96, 0x3b, 0x8b, 0x9a, 0x95, 0x63, 0xa0, 0xa7, 0x8b, 0xac, 0x93, 0xac, + 0xb8, 0x49, 0x8e, 0x6f, 0xb7, 0x3b, 0x63, 0xb5, 0xc1, 0xc0, 0x9d, 0x48, + 0xd2, 0x43, 0xc9, 0x2f, 0x6c, 0xa4, 0xad, 0x7c, 0xb5, 0x4c, 0x3a, 0xa5, + 0x59, 0xb1, 0x67, 0xcc, 0xae, 0x69, 0x51, 0x30, 0x8b, 0xd2, 0xce, 0x54, + 0x90, 0x38, 0x8b, 0x44, 0x76, 0x40, 0xa5, 0x81, 0x6a, 0xb5, 0x9d, 0xa5, + 0x8c, 0x58, 0xcc, 0x6f, 0x9a, 0x3c, 0xbd, 0x7a, 0x32, 0x89, 0x70, 0x82, + 0x8c, 0x8a, 0xc3, 0x80, 0x5b, 0x8e, 0x78, 0x89, 0x86, 0xa1, 0x99, 0xa5, + 0xb5, 0x4a, 0xa2, 0xcc, 0x6f, 0x5e, 0xb4, 0x75, 0xbb, 0x88, 0xbf, 0xb9, + 0x5b, 0xa8, 0xc1, 0xa3, 0x63, 0x34, 0x90, 0x36, 0xa1, 0x69, 0x91, 0x61, + 0x83, 0x88, 0x8c, 0x45, 0x58, 0x87, 0x85, 0x69, 0x4d, 0x4f, 0x82, 0x76, + 0xc9, 0x3c, 0xb6, 0x6d, 0x3c, 0x92, 0x39, 0x93, 0x47, 0x97, 0xb4, 0x7a, + 0x3d, 0x68, 0x72, 0x64, 0xbb, 0xaf, 0xae, 0x6e, 0xcb, 0x70, 0xc7, 0x53, + 0x9e, 0x48, 0x59, 0xbb, 0x4d, 0x38, 0x93, 0x81, 0x56, 0x80, 0xcb, 0x8c, + 0x76, 0x57, 0x59, 0x53, 0xd1, 0xc6, 0xa8, 0xa4, 0x32, 0x56, 0x68, 0xab, + 0x56, 0xd5, 0xa3, 0xa8, 0x3a, 0x8b, 0x37, 0x8c, 0x47, 0xb1, 0x51, 0xc2, + 0x4b, 0x8e, 0xc7, 0x60, 0x8b, 0x9d, 0x8c, 0xb1, 0x61, 0x9b, 0xc0, 0x4f, + 0x6e, 0x34, 0x49, 0x85, 0xbc, 0xd1, 0xba, 0xce, 0xcb, 0xc6, 0x43, 0x5e, + 0x88, 0x5b, 0x33, 0x5c, 0x96, 0x9f, 0x8f, 0x73, 0xbf, 0x9e, 0x47, 0xb4, + 0x55, 0x51, 0x58, 0x64, 0x6e, 0xa3, 0x5d, 0x96, 0x91, 0x9f, 0xb7, 0x36, + 0xcb, 0xba, 0xc5, 0xb8, 0x57, 0x42, 0x3a, 0x51, 0x82, 0x2b, 0x61, 0x91, + 0x62, 0xa0, 0xbf, 0x61, 0x49, 0x38, 0x6f, 0x7b, 0x53, 0x54, 0x9d, 0x75, + 0x64, 0x3c, 0x7d, 0x35, 0x30, 0x46, 0x56, 0xbf, 0x9d, 0x7a, 0x5e, 0x76, + 0xb7, 0xa5, 0x65, 0xcb, 0x4c, 0xba, 0xaa, 0x84, 0x86, 0x35, 0x6c, 0x3e, + 0x79, 0x96, 0x4e, 0x67, 0x65, 0x61, 0xaf, 0x99, 0x95, 0x86, 0x3f, 0xc5, + 0x77, 0x97, 0xb6, 0x5f, 0x8c, 0x47, 0xd5, 0xa4, 0xca, 0x70, 0x94, 0x32, + 
0x34, 0xab, 0x48, 0x49, 0x68, 0xc2, 0xa9, 0x83, 0x49, 0x4a, 0x64, 0x9e, + 0xa6, 0x93, 0x3f, 0x4d, 0x4c, 0x58, 0x97, 0xa9, 0x52, 0x6c, 0x35, 0xc2, + 0x89, 0xcd, 0x4e, 0x3f, 0xd3, 0x5f, 0xaf, 0x4f, 0x8f, 0x98, 0x96, 0x3c, + 0x8c, 0x33, 0x44, 0x6a, 0xc4, 0x37, 0xa4, 0x7f, 0xa4, 0x90, 0xb7, 0x8a, + 0xa1, 0x5b, 0x58, 0xa4, 0x96, 0xc3, 0x59, 0x9d, 0x84, 0xc1, 0x9b, 0x43, + 0x77, 0xb7, 0x4d, 0xb2, 0xa7, 0x98, 0x71, 0xcb, 0x2f, 0xa6, 0x35, 0x93, + 0x53, 0x42, 0x6c, 0x48, 0x6f, 0xce, 0x45, 0x3f, 0xa1, 0x7d, 0x66, 0xb5, + 0x36, 0x43, 0x92, 0x8d, 0x52, 0x59, 0x78, 0xa4, 0x53, 0xd0, 0x95, 0x9d, + 0x3f, 0x82, 0xa2, 0x3a, 0x6e, 0xb3, 0xc5, 0x7a, 0x87, 0x4e, 0x5e, 0x7e, + 0xb0, 0x84, 0xab, 0x5d, 0x89, 0xb5, 0x8e, 0x51, 0x7b, 0x55, 0xd2, 0x75, + 0x3f, 0x7f, 0x8f, 0x78, 0x81, 0xce, 0x52, 0xc8, 0xaa, 0xcb, 0x67, 0x8c, + 0x6e, 0x4a, 0x83, 0x96, 0xc7, 0x65, 0x30, 0xc3, 0x74, 0x5b, 0xb6, 0x37, + 0xcf, 0xc0, 0xc5, 0x72, 0xd7, 0xbe, 0x4b, 0x51, 0xc3, 0xa3, 0xa8, 0x4c, + 0x9c, 0x7b, 0x8f, 0x3a, 0xae, 0x66, 0xb0, 0xa2, 0xb6, 0x45, 0x50, 0x80, + 0x76, 0x44, 0x3f, 0x88, 0x37, 0x7a, 0x4d, 0x7d, 0xa0, 0x73, 0x9f, 0xa5, + 0x82, 0x6f, 0xa8, 0xa5, 0x38, 0x92, 0xcb, 0x7c, 0xbc, 0xd0, 0x70, 0x5e, + 0x70, 0x61, 0x7b, 0x38, 0x95, 0x4c, 0xba, 0xb9, 0x9e, 0xb5, 0x63, 0x95, + 0x82, 0x35, 0xc4, 0xc5, 0xc7, 0x82, 0xd4, 0x59, 0xce, 0x94, 0xa2, 0x94, + 0x88, 0x8b, 0xc0, 0x7c, 0x8f, 0xc7, 0x6e, 0x90, 0xbb, 0x77, 0x5b, 0xbb, + 0xbd, 0x95, 0x75, 0x9f, 0xc1, 0xb1, 0x6c, 0x93, 0x4c, 0x6f, 0xbe, 0x56, + 0x9d, 0x3f, 0xb9, 0x54, 0x9a, 0xa3, 0x53, 0x36, 0x93, 0x40, 0x3b, 0xad, + 0xbc, 0x76, 0x97, 0x7e, 0xad, 0xbb, 0x43, 0x7e, 0xb3, 0xb9, 0x7e, 0xc6, + 0xbc, 0xbc, 0x3d, 0x33, 0xca, 0xd0, 0x36, 0x6b, 0xa7, 0x9c, 0x66, 0x60, + 0x48, 0xb6, 0xbb, 0x52, 0x5a, 0x63, 0x5c, 0xcb, 0x87, 0xc0, 0x92, 0x72, + 0x37, 0xa7, 0x92, 0x8b, 0x39, 0xb9, 0x3f, 0xa7, 0x82, 0x93, 0x68, 0x49, + 0x6b, 0x9f, 0x54, 0xae, 0x9c, 0x65, 0x54, 0x46, 0xc1, 0x61, 0xc8, 0x58, + 0x69, 0x45, 0x51, 0x3b, 0x6c, 0x55, 0x85, 0x41, 0xaa, 0x77, 0xa3, 0x4e, + 0xd6, 0xc4, 0x69, 0x57, 0x6f, 0xa6, 0x74, 0x9b, 0x6a, 0xb2, 0x8a, 0x53, + 0x94, 0x56, 0x6d, 0x71, 0x69, 0xa6, 0x58, 0x42, 0x49, 0xbe, 0x5c, 0x4c, + 0x63, 0x83, 0x61, 0x8a, 0x7e, 0x6c, 0xa4, 0xaf, 0x4b, 0xb1, 0x42, 0x6c, + 0x84, 0x51, 0x35, 0x42, 0x4b, 0x67, 0x80, 0xa4, 0x6b, 0xd3, 0xaf, 0xd3, + 0xd8, 0x60, 0x59, 0x8e, 0x87, 0x45, 0xd5, 0xd3, 0xc2, 0x61, 0x45, 0x69, + 0x57, 0x74, 0xa5, 0x86, 0x49, 0x53, 0x3b, 0xb9, 0x37, 0x8a, 0x36, 0xae, + 0x56, 0xba, 0x43, 0xbd, 0x9e, 0xc0, 0x56, 0xc3, 0x4e, 0x3e, 0x92, 0xb3, + 0xad, 0x7a, 0x5f, 0x37, 0xbf, 0x94, 0xc4, 0x78, 0x6c, 0x39, 0x6e, 0x80, + 0xa7, 0xab, 0x75, 0x99, 0x81, 0x5f, 0x4f, 0xa7, 0x4d, 0x7b, 0xc5, 0x46, + 0x54, 0x3d, 0x2d, 0x44, 0xd6, 0x4a, 0x90, 0x64, 0x55, 0x5e, 0x38, 0x59, + 0x75, 0x78, 0x53, 0xc3, 0xba, 0x91, 0x32, 0x80, 0x67, 0x51, 0x9b, 0x5f, + 0x40, 0x37, 0xa5, 0x3d, 0xbc, 0x8e, 0x38, 0x58, 0xaf, 0x8a, 0x7a, 0x41, + 0xa7, 0x76, 0x51, 0x8a, 0x4b, 0xdd, 0xc3, 0x6a, 0x85, 0x3a, 0x78, 0x6d, + 0xbf, 0xcb, 0xcb, 0xc4, 0x3e, 0x4f, 0x71, 0x6d, 0x8d, 0x4b, 0x51, 0xc6, + 0xc8, 0x45, 0x89, 0x58, 0xab, 0x95, 0x7a, 0x35, 0xdc, 0x82, 0x5b, 0x66, + 0xae, 0x4a, 0xdb, 0x40, 0xa3, 0x38, 0xc2, 0x47, 0x55, 0x3a, 0x73, 0xb3, + 0xaa, 0x7e, 0xa1, 0x7f, 0x42, 0xc3, 0x5e, 0xa5, 0x87, 0x6a, 0x69, 0x75, + 0x6f, 0x7f, 0x6d, 0x76, 0x8d, 0x5b, 0x51, 0x7c, 0x63, 0x6a, 0x47, 0xbe, + 0x81, 0x88, 0x70, 0x70, 0x94, 0xa9, 0x55, 0x3a, 0x2b, 0x94, 0xaf, 0x68, + 0x7c, 0xae, 0xc1, 0x96, 0x45, 0xc6, 0x96, 0x8f, 0x4d, 0x84, 0xa7, 0x8d, + 0xcb, 0x8a, 0x72, 0x8e, 0x32, 0x95, 0x94, 0x50, 0x7c, 0x55, 0xa7, 0xad, + 
0xcd, 0x4f, 0x8e, 0x44, 0xb1, 0x95, 0xbe, 0x3e, 0x6a, 0x9a, 0x41, 0xcf, + 0x3f, 0x58, 0x5c, 0x47, 0x85, 0xae, 0xc0, 0x6f, 0x5d, 0x67, 0xd0, 0xb7, + 0xbd, 0xa7, 0x80, 0x77, 0xba, 0x70, 0x4a, 0x99, 0xba, 0xb3, 0x9d, 0x9a, + 0x74, 0xcb, 0x89, 0xaf, 0x6b, 0x8c, 0x97, 0xb0, 0x9d, 0x72, 0x80, 0xaf, + 0x73, 0x3b, 0x4d, 0x8f, 0x7b, 0xc7, 0x3e, 0xa0, 0x9a, 0x8b, 0x93, 0xb6, + 0x5e, 0x84, 0x6d, 0xa2, 0x63, 0x59, 0x32, 0xc0, 0xb4, 0x7d, 0x3f, 0xd1, + 0x41, 0x94, 0x55, 0xbd, 0x6a, 0x5d, 0x47, 0x72, 0x4f, 0x4b, 0x7d, 0x34, + 0x88, 0x35, 0x4c, 0x77, 0x41, 0x6b, 0x64, 0xa5, 0x72, 0x90, 0x5f, 0xce, + 0xc3, 0x9a, 0x59, 0x5d, 0x75, 0xa6, 0xd7, 0xd1, 0xb6, 0x40, 0xb0, 0x41, + 0x9f, 0x80, 0x99, 0xa3, 0x96, 0x87, 0x4b, 0x90, 0x77, 0x43, 0xbf, 0xbd, + 0xc6, 0xa2, 0xc4, 0x4a, 0xd8, 0x41, 0xc0, 0x4c, 0xcf, 0x84, 0x36, 0x9a, + 0x6b, 0x84, 0xba, 0x6c, 0x57, 0x40, 0xaf, 0x34, 0x46, 0x5b, 0x50, 0x68, + 0xac, 0x68, 0xc2, 0x93, 0x3c, 0x59, 0x33, 0x49, 0x73, 0x5b, 0x46, 0xc0, + 0x55, 0xde, 0x72, 0x7c, 0x74, 0x5e, 0x39, 0xa9, 0xb3, 0x38, 0x91, 0x3a, + 0x34, 0xbd, 0x5e, 0x99, 0xbf, 0x7d, 0x6f, 0x81, 0xa9, 0x85, 0x6a, 0x9b, + 0xa7, 0xb9, 0xba, 0xa1, 0xbe, 0x4c, 0x35, 0x7b, 0x9e, 0xbf, 0x5b, 0x90, + 0x3f, 0x97, 0x7a, 0x5a, 0xa2, 0xb3, 0x5c, 0x30, 0x90, 0x74, 0x82, 0x86, + 0x5c, 0x8e, 0xc3, 0x76, 0x8f, 0x88, 0xcc, 0xb1, 0x73, 0x72, 0xa3, 0x97, + 0x5b, 0x50, 0x73, 0xb9, 0xb5, 0x8c, 0x6f, 0x96, 0xc2, 0x56, 0x53, 0xa7, + 0x5c, 0xaa, 0xc9, 0xb5, 0x85, 0xd2, 0x5a, 0x55, 0x65, 0xb7, 0x77, 0x8c, + 0xa2, 0xc9, 0xc4, 0x91, 0xa8, 0x49, 0xab, 0x64, 0x96, 0x54, 0x5a, 0xb7, + 0x81, 0x67, 0x98, 0xcd, 0xbd, 0x3c, 0xb7, 0x63, 0x45, 0x62, 0x40, 0x6c, + 0xc2, 0xb6, 0xc6, 0x83, 0x4d, 0x76, 0xbd, 0xb8, 0xb8, 0x7f, 0xc4, 0xb6, + 0x49, 0x72, 0x77, 0x95, 0x83, 0x75, 0x90, 0xd1, 0xcf, 0xd4, 0x4f, 0x38, + 0x6f, 0xcf, 0x44, 0x7a, 0x46, 0x8f, 0x5d, 0x9b, 0xb6, 0x46, 0x54, 0x48, + 0x82, 0x4c, 0xa6, 0x46, 0xac, 0x88, 0x5d, 0xa9, 0x6a, 0x4e, 0x48, 0xb8, + 0x8c, 0x99, 0x81, 0x98, 0xc5, 0x50, 0xd2, 0x99, 0x47, 0x41, 0x4f, 0xb4, + 0xb5, 0x48, 0x5f, 0xa5, 0x6e, 0x4c, 0xb9, 0x52, 0xa4, 0xa5, 0x5b, 0x4f, + 0x8d, 0x4e, 0xd6, 0xc0, 0x88, 0xcc, 0x83, 0x5f, 0x42, 0x5a, 0x6f, 0x7c, + 0x55, 0xac, 0x9a, 0x9b, 0x7c, 0x57, 0x37, 0xbd, 0x4e, 0x8d, 0x7c, 0x86, + 0x33, 0xb4, 0x63, 0xcc, 0x7f, 0xd4, 0xc6, 0x80, 0x7a, 0x53, 0xc2, 0x34, + 0x97, 0xab, 0xa7, 0x79, 0x55, 0x50, 0xc0, 0x5d, 0x38, 0xd8, 0x39, 0x7b, + 0xd4, 0xa5, 0x83, 0x76, 0x82, 0x81, 0x36, 0xa2, 0x60, 0x77, 0xb6, 0xbd, + 0x3f, 0x5d, 0x9d, 0xb2, 0x65, 0x7d, 0x4d, 0xb8, 0x6a, 0x93, 0x8b, 0x7e, + 0x5d, 0x56, 0x97, 0xa0, 0x7a, 0xa6, 0x60, 0x35, 0x5c, 0xb7, 0xcb, 0x32, + 0x7a, 0x83, 0xc7, 0x5f, 0x8b, 0x65, 0xb8, 0x84, 0xc0, 0x3b, 0x45, 0x71, + 0x3c, 0xa2, 0x69, 0xca, 0xcf, 0x6f, 0x39, 0x2d, 0x56, 0x5d, 0x96, 0xb4, + 0x87, 0x81, 0x2e, 0x90, 0xa6, 0xbb, 0x5d, 0x7b, 0x6b, 0xae, 0xad, 0x7f, + 0x5c, 0x69, 0x42, 0xbf, 0x6a, 0x6d, 0xad, 0x46, 0x45, 0xae, 0xb4, 0x6b, + 0x86, 0x76, 0x55, 0xbe, 0x7c, 0x9b, 0xa7, 0xba, 0x58, 0x63, 0x8e, 0xb9, + 0xd0, 0xb2, 0x45, 0xbb, 0x3d, 0x93, 0xc5, 0x47, 0x8f, 0x3b, 0x6f, 0xab, + 0x50, 0x74, 0xb5, 0xa8, 0xc3, 0x51, 0x41, 0x3b, 0x6d, 0x48, 0xae, 0xd3, + 0xae, 0x53, 0xcd, 0x5b, 0x88, 0x7b, 0xbe, 0x78, 0xda, 0xa1, 0x89, 0x4a, + 0xbf, 0x93, 0x83, 0x7b, 0x3b, 0x7b, 0x41, 0x7f, 0xb1, 0x60, 0x69, 0xd5, + 0x6d, 0x62, 0x40, 0xbe, 0x78, 0xb8, 0x56, 0x64, 0xcc, 0x92, 0x79, 0x63, + 0xc1, 0xab, 0x45, 0x68, 0xd9, 0x51, 0x80, 0x7b, 0xc1, 0xb6, 0x84, 0xb1, + 0x68, 0xc8, 0xc6, 0xad, 0xa3, 0x59, 0x6e, 0x57, 0x7d, 0x69, 0xbd, 0x91, + 0x63, 0x4c, 0xd6, 0x52, 0xaf, 0x47, 0x46, 0x51, 0x4e, 0x79, 0x46, 0x5e, + 
0x33, 0x56, 0x74, 0xc8, 0x64, 0xd7, 0xaf, 0x6f, 0xd0, 0xa5, 0xc3, 0x80, + 0x31, 0x92, 0x4a, 0x67, 0x53, 0x87, 0xbb, 0xba, 0x79, 0x99, 0x8c, 0x97, + 0x5e, 0x58, 0x3e, 0x58, 0x3f, 0x83, 0xac, 0xc3, 0xae, 0x53, 0x56, 0x54, + 0x5c, 0x68, 0x5e, 0x4d, 0xce, 0x8f, 0x8e, 0xa3, 0x8a, 0x53, 0x93, 0x95, + 0x65, 0xa3, 0xb0, 0x3c, 0xab, 0x36, 0xce, 0x50, 0xb8, 0x47, 0x33, 0x9e, + 0x5b, 0xc3, 0x5f, 0x37, 0x9e, 0xc5, 0x92, 0x2d, 0xc5, 0x4a, 0x49, 0x89, + 0xbc, 0x4d, 0xcb, 0x86, 0x98, 0x51, 0x95, 0x8f, 0xbd, 0x60, 0x43, 0xb6, + 0x74, 0xb3, 0xb1, 0x91, 0x4d, 0xae, 0x63, 0xc3, 0x77, 0x50, 0xab, 0xb2, + 0x68, 0xca, 0x79, 0xa2, 0x38, 0x2f, 0x31, 0x30, 0x9d, 0x34, 0xcf, 0x77, + 0x84, 0x95, 0xb3, 0x5c, 0x70, 0x44, 0x91, 0x68, 0x40, 0x5a, 0x9f, 0x5a, + 0x66, 0x91, 0x52, 0x67, 0xa3, 0xc3, 0x9c, 0xb7, 0xb4, 0x60, 0xbb, 0x39, + 0x42, 0x57, 0x7f, 0x4d, 0x91, 0x6d, 0x98, 0xa8, 0x67, 0x81, 0x38, 0x3c, + 0x81, 0x52, 0xa7, 0xb6, 0xa5, 0x74, 0xbb, 0xa0, 0xb5, 0x35, 0xd0, 0x60, + 0x57, 0xd7, 0xc1, 0x45, 0x97, 0x8a, 0x86, 0x4c, 0x51, 0xaf, 0x83, 0x40, + 0xc4, 0x79, 0x41, 0x25, 0x94, 0x76, 0x65, 0x51, 0xce, 0x72, 0x2d, 0xaa, + 0x60, 0x44, 0x4b, 0xc0, 0xce, 0x48, 0x4f, 0xaa, 0x58, 0x4a, 0xb0, 0xc1, + 0x49, 0xb5, 0xa3, 0x4a, 0x64, 0x77, 0x85, 0x68, 0x7d, 0xa6, 0xc9, 0x5f, + 0x98, 0xc3, 0x95, 0x3a, 0xbd, 0x38, 0x79, 0x34, 0xad, 0x98, 0xc7, 0x2f, + 0x4c, 0xa7, 0x55, 0x9d, 0x9a, 0x37, 0x49, 0x74, 0x66, 0x7c, 0xc8, 0x9d, + 0x6e, 0x60, 0xc2, 0x9b, 0x43, 0xcc, 0xb8, 0x38, 0x4c, 0x4c, 0x24, 0xb7, + 0x5d, 0x47, 0x68, 0xb5, 0x3a, 0xa8, 0xce, 0x7c, 0x5e, 0x68, 0x4b, 0xaf, + 0x8f, 0x92, 0x6d, 0xcb, 0xc8, 0x9c, 0x6c, 0x3c, 0x96, 0x30, 0x5c, 0x87, + 0x4d, 0x38, 0x52, 0x7a, 0x96, 0x66, 0xbd, 0x8b, 0x45, 0x85, 0x9d, 0xa7, + 0xb8, 0x9d, 0x88, 0x8e, 0x59, 0xb5, 0xbd, 0xc3, 0x72, 0x82, 0xb5, 0xb9, + 0xb2, 0xd3, 0x39, 0x2e, 0xb6, 0x78, 0xb8, 0xac, 0xae, 0x44, 0x5d, 0xa8, + 0xab, 0x62, 0x8c, 0x67, 0x9c, 0x73, 0xbd, 0xbb, 0x37, 0x65, 0x7d, 0x68, + 0x95, 0xa6, 0x6e, 0xc6, 0x52, 0x6f, 0xc9, 0xb4, 0x8c, 0xc9, 0xa2, 0x55, + 0xc7, 0x84, 0xd4, 0xcd, 0x5b, 0x4c, 0x40, 0x7d, 0x83, 0xa4, 0xa8, 0xcb, + 0x97, 0x7f, 0xb9, 0x4b, 0xa6, 0xa9, 0x93, 0x81, 0x2f, 0xae, 0x44, 0xa7, + 0xa9, 0x89, 0xb6, 0xc9, 0xd8, 0x4f, 0x61, 0xc5, 0xa6, 0x6b, 0x71, 0x4b, + 0x52, 0x7b, 0x31, 0xb5, 0x6b, 0x70, 0x4f, 0x5a, 0xbd, 0x5f, 0xae, 0x7b, + 0x66, 0xcf, 0x52, 0x2e, 0x57, 0x93, 0x3d, 0xb8, 0x95, 0xb0, 0x7b, 0x7a, + 0xaa, 0xaa, 0x3c, 0x5f, 0xae, 0xae, 0x36, 0x39, 0xd6, 0xa1, 0x9a, 0x69, + 0xa2, 0xb9, 0x36, 0x59, 0x95, 0x50, 0x5a, 0x8d, 0x33, 0xd5, 0x8c, 0x99, + 0x3b, 0xca, 0x53, 0x35, 0x89, 0x6b, 0x7f, 0x8d, 0xc7, 0x66, 0x4d, 0x35, + 0xb6, 0x6d, 0x97, 0x8d, 0x9d, 0x75, 0x42, 0x91, 0x8f, 0x54, 0x8f, 0x9d, + 0x3a, 0xd0, 0x4f, 0x64, 0x48, 0x5c, 0x8b, 0x93, 0x3f, 0x86, 0x72, 0xbb, + 0x5e, 0xaa, 0x3e, 0x6d, 0x5e, 0xa5, 0xae, 0xbc, 0x7e, 0x38, 0x58, 0x9e, + 0x8f, 0x80, 0x53, 0x5c, 0xb0, 0x5b, 0xb3, 0x46, 0xd0, 0xbf, 0x7e, 0xb7, + 0x95, 0xc1, 0xb3, 0x7f, 0xc5, 0xa2, 0x74, 0x4d, 0xbf, 0x57, 0x65, 0xab, + 0x43, 0x4f, 0x8e, 0x84, 0x9d, 0xd4, 0x72, 0x68, 0x75, 0x88, 0xb2, 0x51, + 0x8d, 0x5e, 0x71, 0x94, 0x47, 0xaf, 0x6b, 0x75, 0x51, 0xa7, 0x3f, 0x34, + 0x59, 0xc2, 0x88, 0x81, 0xa0, 0x4f, 0x69, 0x7c, 0x74, 0x5d, 0xcf, 0xb0, + 0x93, 0xcf, 0x94, 0x51, 0xa3, 0xca, 0xa8, 0x72, 0x83, 0x84, 0x55, 0x49, + 0x8d, 0x5b, 0x56, 0xaf, 0xc0, 0x8d, 0x49, 0x82, 0xaa, 0x92, 0xcf, 0x6f, + 0x96, 0xc2, 0x54, 0x59, 0x40, 0x75, 0x6e, 0x52, 0xb7, 0xa1, 0x3f, 0x9a, + 0x2f, 0x84, 0x44, 0x98, 0xbe, 0x3b, 0x99, 0x77, 0x72, 0x86, 0x77, 0xc4, + 0x40, 0x54, 0x32, 0x4f, 0xa5, 0xcd, 0x4a, 0xa7, 0xb1, 0x45, 0x4f, 0xb8, + 
0x57, 0x4f, 0x7b, 0xa5, 0x52, 0x49, 0x7d, 0x67, 0x5c, 0xa4, 0x86, 0x50, + 0x44, 0x5e, 0xba, 0xcb, 0x9c, 0x78, 0xd0, 0xba, 0x9f, 0xb9, 0x9a, 0x7b, + 0x51, 0xa7, 0x85, 0x35, 0x65, 0x33, 0xa7, 0xb4, 0xc3, 0x9d, 0x64, 0x54, + 0x7c, 0x5c, 0x92, 0x7b, 0x73, 0xc7, 0x9a, 0x86, 0x35, 0x4f, 0x43, 0x8a, + 0x41, 0xc5, 0x97, 0x48, 0x3f, 0x65, 0x4c, 0x72, 0x42, 0x40, 0x6c, 0xc2, + 0xac, 0x90, 0x94, 0x6d, 0x8e, 0xa2, 0x67, 0x66, 0xaa, 0x80, 0xcc, 0x3a, + 0x8e, 0x9b, 0x84, 0xbc, 0x35, 0xb8, 0x30, 0x47, 0x8f, 0x2d, 0xb2, 0x4b, + 0x47, 0x91, 0x3f, 0x6b, 0x4f, 0x56, 0xa8, 0x96, 0x5c, 0x73, 0x89, 0x84, + 0x49, 0x8b, 0xbf, 0x88, 0x63, 0xac, 0x92, 0x92, 0x70, 0x61, 0x7b, 0xad, + 0x7a, 0x41, 0xb8, 0x53, 0x54, 0x5c, 0x6c, 0xcb, 0x44, 0x72, 0x5e, 0x7a, + 0x5f, 0xae, 0x41, 0xd7, 0x63, 0xab, 0xd4, 0xa3, 0x86, 0x92, 0x92, 0x87, + 0x9f, 0x6a, 0x7c, 0x91, 0x5f, 0xa3, 0xa8, 0x98, 0x70, 0x75, 0xc5, 0x5b, + 0x3f, 0xa4, 0x8a, 0xa5, 0x87, 0x3a, 0x47, 0x5c, 0x66, 0x91, 0x33, 0x6e, + 0x7f, 0x7d, 0xc9, 0xc5, 0x70, 0xa8, 0x9e, 0x4a, 0x72, 0xd7, 0x54, 0x9d, + 0xbb, 0x98, 0x5b, 0xcf, 0xc4, 0x8d, 0x2f, 0xb0, 0x95, 0x5e, 0x82, 0xa8, + 0xcd, 0x67, 0x3a, 0x9c, 0x85, 0x33, 0x51, 0xa6, 0x50, 0x93, 0xc3, 0xc5, + 0x63, 0x39, 0xb0, 0x44, 0x5c, 0xa8, 0xa0, 0x89, 0x9a, 0xa0, 0xbc, 0xc5, + 0x86, 0xad, 0xb5, 0xd5, 0xbc, 0x4b, 0xc5, 0x96, 0x47, 0x67, 0xd0, 0x6d, + 0x7c, 0x71, 0x4c, 0xa2, 0x73, 0x3c, 0x33, 0x95, 0x63, 0x73, 0x66, 0x51, + 0x68, 0x62, 0x42, 0x8b, 0xd2, 0xa3, 0x72, 0x4c, 0x7f, 0x91, 0x95, 0x74, + 0xb4, 0xd0, 0xc3, 0x8b, 0xb4, 0x84, 0x59, 0x78, 0x3a, 0x3a, 0x7d, 0x8b, + 0x3e, 0x40, 0x33, 0x49, 0x83, 0xd1, 0x8f, 0x3e, 0xc8, 0x56, 0xc4, 0x6a, + 0x51, 0x8c, 0x7d, 0x6f, 0x67, 0xa5, 0x7b, 0x52, 0xa4, 0x9b, 0x42, 0x92, + 0x57, 0x7b, 0xb9, 0x7e, 0x5b, 0x8d, 0x76, 0x5a, 0x67, 0x49, 0x64, 0x7a, + 0x3f, 0x83, 0x46, 0xa6, 0xb5, 0x37, 0xc8, 0x5e, 0x90, 0x3e, 0x59, 0xaa, + 0xd2, 0x3f, 0x7e, 0x3d, 0x97, 0x3e, 0x37, 0x5b, 0x6a, 0x8a, 0x61, 0x4d, + 0x5f, 0xb5, 0xba, 0xad, 0xbd, 0x3c, 0xc5, 0xda, 0x88, 0x7d, 0xbd, 0xc7, + 0xbb, 0x8b, 0xd0, 0x78, 0x48, 0x5a, 0x30, 0xd2, 0x42, 0x47, 0xaf, 0xa5, + 0xbe, 0x8a, 0x8c, 0x6d, 0x5c, 0x73, 0xb2, 0x51, 0xbb, 0x60, 0xd2, 0xc4, + 0x6d, 0x53, 0xaa, 0x41, 0xc7, 0xa4, 0xbf, 0xd1, 0x5c, 0x7c, 0x5b, 0x4b, + 0xc0, 0x8d, 0x70, 0x7c, 0x5d, 0x91, 0x79, 0x42, 0x97, 0x52, 0x3a, 0x3c, + 0x86, 0x80, 0x8e, 0x70, 0x99, 0x44, 0x79, 0xa7, 0xc7, 0x5d, 0xb2, 0x67, + 0x7d, 0x8b, 0xa0, 0xd4, 0x6b, 0x9f, 0x86, 0x30, 0x47, 0x90, 0x31, 0xb8, + 0x48, 0x37, 0x84, 0x33, 0xb3, 0x9e, 0x2d, 0x43, 0x72, 0xbc, 0x7d, 0xd4, + 0xb0, 0xb8, 0x8d, 0xcf, 0x9b, 0xa2, 0xcd, 0xcc, 0xe0, 0x63, 0x8a, 0x69, + 0x87, 0x76, 0xc9, 0x62, 0x7f, 0x3f, 0x73, 0x7a, 0x69, 0x87, 0xd5, 0x55, + 0x98, 0x79, 0x38, 0x32, 0x62, 0xb8, 0xc5, 0xb2, 0x8d, 0x44, 0xc3, 0x64, + 0x93, 0x5d, 0x9c, 0x4c, 0x6e, 0x70, 0x7b, 0xc5, 0x37, 0xa8, 0x4e, 0x41, + 0x92, 0xc2, 0x3b, 0x37, 0x83, 0x63, 0x7d, 0x66, 0x8f, 0x56, 0x8e, 0x5e, + 0x63, 0x7a, 0x51, 0x8f, 0x50, 0xad, 0x9a, 0x83, 0x89, 0xb2, 0xb2, 0x6f, + 0x3a, 0x44, 0x43, 0x91, 0xb2, 0x33, 0x48, 0x6b, 0x67, 0xcc, 0xab, 0xcd, + 0xc8, 0x77, 0xca, 0xb0, 0xcb, 0x8f, 0x61, 0x6f, 0x68, 0xd2, 0x6a, 0x46, + 0x85, 0x61, 0x47, 0x4f, 0x51, 0x80, 0x98, 0xcc, 0x41, 0x48, 0x2b, 0x50, + 0x4e, 0x40, 0xcb, 0xa9, 0xb2, 0x43, 0x6b, 0x98, 0x71, 0xd5, 0xb5, 0x3f, + 0x98, 0x6b, 0xad, 0x76, 0x74, 0xab, 0x38, 0x3c, 0x35, 0x2c, 0x88, 0x7c, + 0x4b, 0x5c, 0x7f, 0xa7, 0x94, 0x56, 0xc0, 0xb5, 0x66, 0x60, 0x7d, 0x66, + 0x7a, 0xa3, 0x9d, 0x55, 0x39, 0x50, 0x78, 0x91, 0x31, 0x8b, 0x91, 0xb1, + 0x78, 0xa8, 0x54, 0x84, 0x35, 0x86, 0x4e, 0xca, 0xb8, 0x52, 0x3e, 0x93, + 
0x82, 0x49, 0x3f, 0x41, 0x96, 0x56, 0x95, 0x4d, 0x3a, 0x96, 0x5b, 0x3f, + 0x56, 0xb1, 0xa7, 0xc4, 0x99, 0xa3, 0x7e, 0x82, 0xc9, 0x3a, 0x4d, 0x8a, + 0x4b, 0x3b, 0x58, 0x78, 0xdc, 0xae, 0x87, 0x9c, 0x67, 0x70, 0x8b, 0x5d, + 0xb8, 0x3b, 0x8b, 0x82, 0xd9, 0xcd, 0x63, 0xca, 0xad, 0x2e, 0x5c, 0x69, + 0x57, 0x3f, 0x65, 0x70, 0x79, 0x57, 0x50, 0x97, 0x5d, 0x76, 0x58, 0xaa, + 0x51, 0xbc, 0x32, 0x8a, 0x80, 0xb2, 0x4c, 0x85, 0x85, 0xb2, 0x62, 0xc3, + 0x4d, 0x6a, 0x60, 0x66, 0x5f, 0x5a, 0x44, 0x56, 0x5b, 0x9e, 0xde, 0xb7, + 0x67, 0x7a, 0x89, 0xc0, 0x52, 0x34, 0x70, 0xc9, 0x85, 0xad, 0xa9, 0x62, + 0xa0, 0x6f, 0xc6, 0x82, 0xb3, 0x39, 0x2f, 0x97, 0x45, 0x43, 0x36, 0xc4, + 0xaf, 0xb3, 0x48, 0xa8, 0xa3, 0xb5, 0x43, 0x68, 0xbc, 0x9a, 0x9d, 0x8e, + 0x58, 0xbf, 0xb8, 0x4a, 0x3f, 0x45, 0x93, 0xba, 0xa7, 0xb7, 0xd7, 0x3e, + 0xcd, 0x50, 0x47, 0x95, 0x39, 0x57, 0x3f, 0x85, 0x90, 0x90, 0x66, 0xb6, + 0x59, 0xb9, 0xd1, 0x78, 0xae, 0xb8, 0x5a, 0x32, 0x3e, 0xbe, 0xa5, 0x71, + 0x72, 0x99, 0x8f, 0xa9, 0xbf, 0x75, 0xbd, 0x2f, 0x85, 0x55, 0x69, 0xad, + 0x68, 0x63, 0x8f, 0x96, 0x3e, 0x2e, 0xc0, 0x70, 0xb9, 0xad, 0xda, 0x7d, + 0x42, 0x35, 0x58, 0x39, 0x9c, 0xb4, 0xc1, 0x44, 0x80, 0x96, 0x7e, 0x9d, + 0x7a, 0x67, 0x4e, 0xb8, 0xc2, 0x43, 0xcd, 0x99, 0x50, 0x31, 0xda, 0xae, + 0x9d, 0x7d, 0x5a, 0xc8, 0xb4, 0x39, 0xc6, 0x9a, 0x66, 0xa1, 0x41, 0xaa, + 0xc1, 0x38, 0x5c, 0xb6, 0xa8, 0x3e, 0xb3, 0x87, 0xd2, 0xc1, 0xad, 0x38, + 0x6b, 0x6a, 0x48, 0xa3, 0x94, 0x8a, 0x3b, 0x82, 0x63, 0x57, 0x53, 0x80, + 0x95, 0xa7, 0x77, 0x51, 0x71, 0x80, 0xa7, 0x7b, 0x30, 0x87, 0x2e, 0xc5, + 0x36, 0xd4, 0x7e, 0x9c, 0xe0, 0xba, 0x7c, 0x72, 0xcb, 0x7f, 0xa1, 0x7e, + 0x8f, 0xaf, 0x8b, 0x69, 0xb6, 0x2c, 0x5f, 0x84, 0x39, 0xba, 0xd0, 0x8d, + 0x4e, 0x35, 0x55, 0xae, 0x53, 0x77, 0x9f, 0x4f, 0x6e, 0xba, 0x35, 0x7f, + 0x4d, 0x55, 0xda, 0x52, 0x54, 0x47, 0xcd, 0x3c, 0xbf, 0x71, 0x7c, 0x31, + 0x55, 0xad, 0xc8, 0xb6, 0x74, 0x4d, 0x50, 0x6e, 0x37, 0x90, 0x2b, 0x8c, + 0x56, 0x6d, 0xc6, 0x94, 0x74, 0xb8, 0xc8, 0x3e, 0x38, 0x64, 0xba, 0xa9, + 0xb5, 0xa4, 0x6a, 0xbc, 0x6b, 0xcd, 0xa9, 0x32, 0x48, 0xbe, 0x55, 0x3a, + 0x9e, 0x53, 0x41, 0x66, 0x59, 0x9f, 0xa7, 0xc0, 0x72, 0x75, 0x7d, 0x64, + 0x51, 0xbb, 0x71, 0x3f, 0x80, 0x1a, 0x32, 0x9e, 0x31, 0xac, 0x89, 0x41, + 0x93, 0x91, 0x42, 0x67, 0x61, 0x8b, 0x37, 0xc3, 0x64, 0xbb, 0xa2, 0xad, + 0x90, 0x67, 0xa5, 0x55, 0x7e, 0x3f, 0x53, 0x9b, 0x49, 0x82, 0x36, 0x84, + 0xbe, 0xac, 0x4b, 0x4d, 0x97, 0x9b, 0x5f, 0xcb, 0x4e, 0x75, 0xa5, 0xbf, + 0x31, 0xc6, 0x7d, 0x84, 0xb3, 0x8f, 0x49, 0xc1, 0xc5, 0xc7, 0x31, 0x85, + 0x34, 0x81, 0x79, 0xb6, 0xbf, 0x3a, 0x4e, 0xa7, 0xa7, 0x90, 0x69, 0x43, + 0xab, 0x4e, 0x63, 0xc7, 0x9f, 0x4d, 0x40, 0x69, 0x7a, 0xc3, 0xbf, 0xc1, + 0x68, 0x8c, 0x49, 0x61, 0x57, 0x92, 0x92, 0xa5, 0x59, 0x41, 0x82, 0xc7, + 0x88, 0x93, 0x90, 0x7e, 0x66, 0xa5, 0xcb, 0x97, 0x75, 0xb3, 0xa4, 0xd1, + 0x5a, 0x4b, 0x71, 0x7f, 0x98, 0x6d, 0x4a, 0xa9, 0x8a, 0x93, 0x4c, 0x47, + 0x46, 0x56, 0x60, 0xbc, 0x79, 0xb5, 0x79, 0xc8, 0xaf, 0x36, 0x57, 0x66, + 0xc6, 0xbe, 0x40, 0x7e, 0x39, 0x44, 0xc6, 0x7c, 0xa9, 0x33, 0x8c, 0x6c, + 0x71, 0xc1, 0x80, 0x79, 0x7d, 0x9a, 0x99, 0x4c, 0x6d, 0xd6, 0x5a, 0x87, + 0xb6, 0xc3, 0x7f, 0x5c, 0xad, 0xb1, 0xbe, 0xa3, 0xa1, 0x76, 0x83, 0xbd, + 0x57, 0x9c, 0xad, 0xa4, 0xc7, 0x68, 0x8c, 0xc6, 0xb4, 0x74, 0xa9, 0x5f, + 0x5f, 0x37, 0x77, 0x8a, 0xba, 0xcd, 0x58, 0x53, 0x7a, 0x6f, 0xb9, 0xd0, + 0x46, 0xd0, 0x3f, 0x89, 0x9b, 0x9a, 0x61, 0x5e, 0x4d, 0x78, 0xb8, 0x4b, + 0xcb, 0x3e, 0x3e, 0x40, 0x90, 0x6e, 0x88, 0x45, 0x58, 0xad, 0xc1, 0xcb, + 0xcf, 0x35, 0xb5, 0x92, 0x55, 0x5e, 0xb8, 0x43, 0x7d, 0xd7, 0x87, 0xa7, + 
0x66, 0xb2, 0xcb, 0x42, 0x4b, 0x42, 0x93, 0x7b, 0xc9, 0xb5, 0x9f, 0x93, + 0xac, 0x6c, 0x9e, 0xa0, 0x85, 0x6c, 0x8d, 0xb5, 0x70, 0xd2, 0x73, 0x3c, + 0x9c, 0x95, 0x47, 0x51, 0x4d, 0x81, 0xd6, 0xd3, 0x7d, 0x3a, 0x3c, 0xca, + 0xa7, 0x6b, 0x3b, 0x36, 0x45, 0x92, 0x74, 0x42, 0xb2, 0x44, 0x4d, 0x5e, + 0x39, 0x4f, 0xa8, 0x90, 0x84, 0x41, 0xa9, 0xce, 0x7f, 0x52, 0x91, 0xc3, + 0x4a, 0x91, 0x55, 0xcf, 0x5c, 0xca, 0x71, 0x7b, 0xb1, 0x53, 0x84, 0xca, + 0x5b, 0x4a, 0x3b, 0x69, 0x6a, 0xc0, 0xc3, 0xab, 0x3a, 0x97, 0x9d, 0x74, + 0xa3, 0xba, 0xaf, 0x69, 0xa1, 0x5b, 0x83, 0x74, 0x72, 0x83, 0x65, 0x65, + 0x53, 0x6b, 0xaf, 0xb3, 0x97, 0x53, 0x39, 0x7a, 0xbd, 0x64, 0x9c, 0x49, + 0x40, 0x4d, 0x93, 0xb2, 0x4a, 0xca, 0x58, 0x3a, 0x80, 0xb0, 0x8c, 0xba, + 0xa6, 0x74, 0xc0, 0x3c, 0x87, 0xc8, 0x94, 0xd5, 0x5e, 0xd1, 0x73, 0xab, + 0x6a, 0xc9, 0x3b, 0x53, 0xc0, 0x60, 0xa7, 0x54, 0x6f, 0x45, 0xd5, 0x92, + 0x7d, 0xd7, 0x8c, 0x7d, 0x41, 0x49, 0xa4, 0x81, 0xd0, 0x8d, 0xbb, 0xaf, + 0x58, 0x43, 0xb6, 0xac, 0x58, 0xaf, 0xae, 0x5a, 0x47, 0x3b, 0x8f, 0x88, + 0xc1, 0x8f, 0x90, 0xac, 0x50, 0x9a, 0x53, 0xb6, 0x9d, 0xbe, 0xaf, 0x86, + 0xc7, 0xc0, 0xdd, 0xb7, 0x3e, 0x34, 0x49, 0x9d, 0xac, 0x4f, 0x7b, 0x9a, + 0x65, 0xd2, 0xc4, 0x78, 0xbb, 0xb2, 0x85, 0x8a, 0xa7, 0xc3, 0x3e, 0x73, + 0xc1, 0x6d, 0x30, 0x8b, 0x3c, 0x3a, 0x58, 0x81, 0x9e, 0xc3, 0x9c, 0x84, + 0x71, 0x40, 0x4e, 0x3f, 0x74, 0xa6, 0x61, 0x3c, 0x72, 0xae, 0x48, 0x4d, + 0x33, 0x3c, 0x75, 0xc9, 0x46, 0xa4, 0x9d, 0x6b, 0xa3, 0x63, 0x3e, 0x54, + 0xab, 0x85, 0xd9, 0x58, 0xba, 0x5e, 0x3a, 0x9b, 0x5f, 0xc2, 0x3d, 0x8d, + 0xc3, 0xcd, 0x78, 0x78, 0x74, 0x4a, 0x9e, 0xdb, 0xa6, 0x5e, 0x69, 0x9b, + 0xcf, 0x8a, 0x88, 0x52, 0xa8, 0xb9, 0xc1, 0xbb, 0x94, 0x53, 0xbf, 0xca, + 0xc4, 0xb3, 0x31, 0xa7, 0xce, 0x63, 0xa5, 0xa2, 0x49, 0xa0, 0x6a, 0xa2, + 0x6d, 0x71, 0x4a, 0x6e, 0x49, 0x54, 0x62, 0x80, 0x71, 0xc2, 0x57, 0x5c, + 0x70, 0xaf, 0x73, 0x46, 0x50, 0xb3, 0x47, 0x6b, 0x53, 0xc0, 0xdd, 0x8c, + 0x9c, 0xc4, 0x7c, 0x87, 0x6d, 0x94, 0x4b, 0xb2, 0xc1, 0xba, 0x74, 0x45, + 0x76, 0xa1, 0x47, 0x96, 0x56, 0x8a, 0x9d, 0xc9, 0x6c, 0xd3, 0xab, 0x88, + 0xab, 0xca, 0x79, 0x5e, 0xac, 0x46, 0x5a, 0x53, 0x51, 0xcd, 0xc3, 0xb9, + 0x73, 0x58, 0x39, 0xcb, 0xcf, 0xc3, 0x59, 0x46, 0x4e, 0xa2, 0xc7, 0xa8, + 0xc7, 0x48, 0x3a, 0x68, 0x8a, 0x52, 0xcb, 0x7e, 0x8e, 0x3b, 0x6a, 0x53, + 0x69, 0x6c, 0xa6, 0x7f, 0x59, 0x84, 0x3f, 0x3b, 0x9c, 0x86, 0x64, 0xa5, + 0x5a, 0x8a, 0xc3, 0xa1, 0x9e, 0x8c, 0x8d, 0x96, 0x6e, 0x2e, 0x5b, 0x69, + 0x4a, 0xb9, 0xae, 0x8b, 0x49, 0x7d, 0x43, 0x3d, 0x80, 0x3b, 0xbd, 0x36, + 0x6b, 0x7e, 0x68, 0x3a, 0xb5, 0x72, 0x42, 0x55, 0x92, 0x30, 0x49, 0x75, + 0xd0, 0xc7, 0x39, 0x9a, 0x7e, 0xd2, 0xcd, 0x71, 0xa4, 0xae, 0x42, 0x42, + 0x71, 0x50, 0x43, 0x84, 0x5c, 0x4c, 0xad, 0xb8, 0x59, 0x39, 0x6a, 0xb3, + 0xc4, 0xb3, 0x72, 0x9e, 0x46, 0x8e, 0x69, 0x95, 0x5b, 0x9f, 0x3b, 0x38, + 0x6a, 0x59, 0x8a, 0x9d, 0xb1, 0x9a, 0xb3, 0x8b, 0x77, 0xbf, 0x46, 0x3c, + 0x8e, 0x51, 0xc8, 0x90, 0x33, 0xb6, 0xba, 0xbc, 0x23, 0x3d, 0xcb, 0x4d, + 0x6b, 0x80, 0xcb, 0x55, 0xc3, 0xca, 0xa1, 0x54, 0xc6, 0x4b, 0x68, 0xbf, + 0x49, 0x7d, 0xa5, 0x8c, 0xa0, 0x95, 0x6e, 0x48, 0x54, 0x9c, 0xac, 0xaf, + 0x5b, 0x45, 0x58, 0x6f, 0x82, 0x5d, 0x56, 0xce, 0x3a, 0xd2, 0x7b, 0x5f, + 0xbf, 0x38, 0xad, 0xcb, 0xa6, 0x94, 0x9e, 0x77, 0x4d, 0x65, 0x7a, 0x96, + 0xb9, 0x7f, 0x83, 0x59, 0x3d, 0x71, 0x6c, 0x42, 0x5d, 0xa7, 0x72, 0x8f, + 0xa4, 0xbf, 0x48, 0x9d, 0x72, 0x48, 0xbf, 0x43, 0x36, 0xcc, 0x78, 0xcc, + 0x6f, 0x52, 0xc0, 0xa0, 0xb9, 0x60, 0x93, 0x9f, 0x6b, 0x62, 0x46, 0xc9, + 0x65, 0x60, 0xcd, 0x72, 0x68, 0x54, 0x9e, 0x74, 0x43, 0xc5, 0xc8, 0xa2, + 
0x35, 0x8e, 0x4a, 0xc8, 0xca, 0x52, 0x58, 0xa2, 0x3b, 0xa7, 0x87, 0xbd, + 0x74, 0x73, 0x59, 0xba, 0xc5, 0x82, 0xdf, 0x92, 0x4f, 0xb6, 0x7d, 0x66, + 0x7e, 0x3e, 0x75, 0x44, 0xbb, 0x4b, 0x38, 0x92, 0xaa, 0xad, 0x97, 0x9e, + 0xae, 0x40, 0xa5, 0xcf, 0x99, 0x51, 0x4a, 0x91, 0x4e, 0x87, 0x61, 0x76, + 0x6e, 0x6c, 0x69, 0x3b, 0x46, 0xaf, 0x88, 0x79, 0x76, 0x54, 0x7b, 0x92, + 0x9a, 0xae, 0xd4, 0x47, 0x68, 0x36, 0x4a, 0x37, 0x68, 0xb0, 0xc1, 0x47, + 0x79, 0x97, 0x8b, 0x64, 0x3d, 0xa9, 0x6f, 0xb6, 0xbc, 0x6c, 0x9b, 0x7b, + 0x9d, 0x54, 0xda, 0x5b, 0x52, 0x9a, 0x3d, 0x6b, 0x89, 0x74, 0x7d, 0x8e, + 0x6f, 0x88, 0xa4, 0xbf, 0xb6, 0x70, 0x60, 0xbf, 0xc6, 0x69, 0x9b, 0xac, + 0x98, 0xc7, 0x92, 0x86, 0x72, 0x6b, 0xaa, 0x78, 0x6d, 0xb5, 0x68, 0x57, + 0x72, 0xd1, 0x93, 0xaf, 0x95, 0xb7, 0x67, 0x31, 0xbd, 0x87, 0x3a, 0x8b, + 0x3c, 0x35, 0xc1, 0x53, 0xad, 0x5b, 0x66, 0x4f, 0xa1, 0xb2, 0x96, 0x9a, + 0xcb, 0x2d, 0xb3, 0x7d, 0x73, 0x59, 0x3c, 0x84, 0x7c, 0x7c, 0x59, 0xac, + 0xcf, 0x87, 0x77, 0x3f, 0x74, 0xce, 0xba, 0x8d, 0x85, 0xba, 0x6e, 0x92, + 0xb4, 0x4c, 0x6b, 0xbc, 0x79, 0x47, 0x4e, 0x47, 0x5a, 0x2f, 0xbd, 0x74, + 0x97, 0xca, 0x4b, 0x34, 0xda, 0xb7, 0x63, 0xd7, 0xce, 0x69, 0x37, 0x64, + 0x4a, 0x94, 0x83, 0x89, 0x8a, 0x7f, 0x4a, 0xc4, 0x66, 0x64, 0x6f, 0xa0, + 0xb9, 0x55, 0x32, 0x72, 0xac, 0x8e, 0x57, 0x7c, 0xa0, 0xc3, 0x5b, 0xa5, + 0x4f, 0x35, 0x5f, 0x7a, 0x71, 0x7e, 0xa7, 0x1e, 0x7e, 0xa2, 0xa0, 0x68, + 0xbf, 0xaf, 0x71, 0x3d, 0xc0, 0x64, 0xd3, 0x45, 0xc2, 0xb4, 0x65, 0x59, + 0x65, 0x89, 0xa7, 0x6d, 0xaa, 0xc2, 0x90, 0x4b, 0x52, 0x64, 0xbb, 0x53, + 0xad, 0x8d, 0x99, 0x52, 0x42, 0x59, 0x2f, 0xa3, 0x33, 0xd5, 0xd3, 0xb4, + 0xc0, 0xb8, 0x8d, 0x95, 0x6e, 0x71, 0x65, 0x8e, 0x5d, 0x58, 0x9b, 0x9f, + 0x92, 0x54, 0x45, 0x39, 0x67, 0xb6, 0x6e, 0xbd, 0x4d, 0xc0, 0x5f, 0x6c, + 0xa3, 0xb7, 0x9b, 0xca, 0x7f, 0xa9, 0x40, 0x92, 0xa3, 0x81, 0xab, 0x6b, + 0x93, 0x3c, 0xc2, 0x4a, 0x81, 0x92, 0x69, 0x93, 0xbe, 0x20, 0x43, 0x4a, + 0xb2, 0x3b, 0x8f, 0xcf, 0xd7, 0x98, 0x9f, 0x99, 0x9a, 0x79, 0x9c, 0x8a, + 0xc0, 0x6b, 0xbd, 0x3f, 0xac, 0x81, 0xdc, 0x81, 0x48, 0x60, 0xaf, 0x3e, + 0x6e, 0x4d, 0xa1, 0x46, 0x45, 0x87, 0xb7, 0x9b, 0x5e, 0x44, 0x6f, 0xb6, + 0x64, 0xbf, 0xce, 0x93, 0xd0, 0x92, 0x60, 0x6c, 0x30, 0x75, 0xb7, 0x96, + 0x33, 0x64, 0x50, 0x5b, 0xa3, 0x96, 0xa1, 0xa3, 0x65, 0xa2, 0x99, 0x9b, + 0x90, 0x70, 0x84, 0x67, 0xbe, 0x45, 0xa6, 0x50, 0x8b, 0x87, 0x4b, 0x91, + 0xb8, 0xd5, 0xd0, 0x72, 0x7c, 0x3c, 0x65, 0xcd, 0x33, 0xa5, 0x40, 0x50, + 0xc6, 0xce, 0xcc, 0x45, 0x88, 0x93, 0x6f, 0x45, 0xa0, 0x89, 0x72, 0xc4, + 0x44, 0xa7, 0xd4, 0x83, 0xbe, 0x3b, 0x70, 0x84, 0xa4, 0xca, 0x87, 0x3a, + 0xc5, 0x83, 0x3a, 0x80, 0x5e, 0x61, 0x7c, 0xc3, 0x69, 0x6c, 0x5b, 0xb3, + 0x78, 0x65, 0x82, 0x94, 0xbb, 0xbd, 0xca, 0x91, 0x97, 0x5d, 0x47, 0xbe, + 0x54, 0xa7, 0xc9, 0xd8, 0x47, 0x31, 0x48, 0x7d, 0xa2, 0x34, 0x34, 0x3e, + 0xc6, 0x55, 0x65, 0x36, 0xc3, 0x83, 0x9a, 0xd6, 0x8e, 0x6c, 0xa5, 0x3a, + 0xb5, 0x48, 0x71, 0x79, 0x7e, 0x37, 0x46, 0xd1, 0xbf, 0xb9, 0x86, 0x95, + 0x3f, 0xa3, 0x7e, 0x9d, 0x5f, 0x37, 0x60, 0x72, 0x9e, 0x49, 0xa3, 0x48, + 0xbf, 0x38, 0xcf, 0x36, 0x7b, 0x7c, 0x72, 0xae, 0x64, 0x8a, 0x30, 0x3c, + 0x7b, 0x3b, 0xd5, 0xae, 0x58, 0x78, 0x96, 0xb0, 0xa2, 0x62, 0x81, 0x98, + 0x6a, 0xd2, 0x6f, 0x97, 0x7d, 0xa8, 0x3f, 0x45, 0x33, 0x83, 0x3d, 0x6a, + 0xb8, 0x60, 0x9b, 0x44, 0x73, 0xc1, 0x53, 0x73, 0x54, 0x67, 0x69, 0x7e, + 0x89, 0x51, 0x60, 0x93, 0x5d, 0x69, 0xa3, 0x69, 0x62, 0x96, 0x37, 0x73, + 0x56, 0x9a, 0x50, 0x90, 0x3d, 0x7a, 0x80, 0x36, 0xc4, 0x63, 0xa4, 0x51, + 0x7f, 0x3e, 0x61, 0x34, 0xcb, 0xbc, 0x97, 0xbd, 0x6b, 0x90, 0xb5, 0x86, + 
0x55, 0x35, 0xc0, 0xc6, 0x55, 0x9f, 0x94, 0x8b, 0x3a, 0x2e, 0xd2, 0x7d, + 0x41, 0x81, 0xbd, 0x52, 0xc7, 0xa8, 0xbc, 0x6e, 0xaa, 0xac, 0x49, 0x8f, + 0x5a, 0x97, 0xa2, 0x42, 0x6f, 0x94, 0xa8, 0x66, 0xad, 0x5f, 0xad, 0x48, + 0x54, 0xc3, 0x7a, 0xc4, 0xcc, 0xa9, 0x40, 0x54, 0xca, 0x2e, 0x75, 0x71, + 0x6a, 0x3e, 0x82, 0xd1, 0xbd, 0xbf, 0x6d, 0x42, 0x8c, 0xa5, 0x7f, 0x4a, + 0x3f, 0x53, 0x76, 0x4f, 0x33, 0xc7, 0x4e, 0xab, 0x68, 0x8e, 0x4e, 0x64, + 0x95, 0xb7, 0xa6, 0x3b, 0xb2, 0xb9, 0x67, 0xa3, 0x67, 0xd3, 0xb6, 0x6a, + 0xae, 0x73, 0x7f, 0xb9, 0x51, 0xa5, 0x30, 0x89, 0x6b, 0xb0, 0xb2, 0x75, + 0x79, 0x71, 0x62, 0x7d, 0xc9, 0x9f, 0x83, 0x9d, 0x89, 0x87, 0x78, 0x7c, + 0xaf, 0x90, 0xd3, 0x2f, 0x78, 0xb9, 0x5f, 0x45, 0x4f, 0x9b, 0x56, 0x3c, + 0xc1, 0x5d, 0x99, 0x53, 0x4f, 0x35, 0x56, 0x75, 0xad, 0x2e, 0xaf, 0xa5, + 0xb6, 0x4b, 0x79, 0x57, 0x7a, 0x7c, 0x86, 0x81, 0xab, 0x3b, 0x77, 0xc8, + 0xa1, 0xac, 0xbb, 0xc3, 0x74, 0x8c, 0xab, 0xac, 0x42, 0xa9, 0x32, 0x8c, + 0x4d, 0x44, 0x7b, 0x5c, 0x61, 0xbf, 0xd3, 0x4d, 0x8d, 0xc2, 0xcb, 0xca, + 0xa5, 0xc9, 0x41, 0x53, 0x62, 0xd4, 0x68, 0x95, 0xc1, 0xd4, 0xb8, 0x6f, + 0x44, 0x83, 0xc1, 0x48, 0x55, 0x7a, 0x7b, 0x3d, 0xaf, 0x58, 0x3d, 0xc7, + 0xa1, 0xb7, 0x96, 0xd1, 0x9e, 0xaa, 0x3d, 0xd2, 0xb8, 0x88, 0xc6, 0x31, + 0x87, 0xbf, 0x95, 0x3f, 0x56, 0x59, 0xc8, 0xb7, 0x8a, 0x6d, 0xcb, 0xae, + 0x37, 0x40, 0x5d, 0x54, 0xba, 0x9d, 0x5e, 0x90, 0x34, 0x66, 0xa8, 0x71, + 0x45, 0x9e, 0x6e, 0x5a, 0x84, 0x3e, 0x80, 0xc4, 0xc5, 0x78, 0xc2, 0xcf, + 0x81, 0x61, 0xc0, 0xd1, 0xa2, 0x89, 0x69, 0x8a, 0xd5, 0x82, 0x88, 0x69, + 0x4e, 0x73, 0x3c, 0xa5, 0x6a, 0x50, 0x83, 0x5d, 0x91, 0xc5, 0x46, 0x89, + 0xb1, 0x46, 0x3b, 0x93, 0x6e, 0x75, 0x95, 0xa4, 0x80, 0x76, 0x6e, 0x47, + 0x4a, 0x72, 0x64, 0x8e, 0xc1, 0x73, 0x60, 0x3a, 0x2d, 0x36, 0xba, 0xbc, + 0x41, 0x55, 0x3b, 0x71, 0xbd, 0xd5, 0xcb, 0xbc, 0x3e, 0x42, 0x5b, 0x8f, + 0x29, 0x51, 0x57, 0x6a, 0x67, 0x39, 0x7a, 0x5e, 0xaf, 0x82, 0x41, 0x6e, + 0x50, 0x8e, 0x6e, 0x49, 0xae, 0x85, 0xc6, 0x65, 0x66, 0x60, 0x9a, 0x77, + 0x45, 0x41, 0xa0, 0x38, 0xa6, 0x76, 0xa6, 0x8a, 0x84, 0xc7, 0xcd, 0xc1, + 0xb5, 0x9d, 0x6e, 0xde, 0x92, 0xc6, 0x41, 0xbd, 0x6c, 0xbd, 0x47, 0x74, + 0xab, 0x52, 0x3c, 0x5d, 0x82, 0x69, 0x31, 0x79, 0x84, 0x6b, 0x58, 0x66, + 0x89, 0xa1, 0xc8, 0xc3, 0xbb, 0x2b, 0x46, 0xc7, 0x6a, 0x6d, 0x8c, 0xc0, + 0x43, 0x4d, 0xb3, 0xb9, 0x8c, 0xa0, 0x9f, 0x84, 0x7d, 0xcd, 0xc5, 0x4c, + 0xc0, 0x48, 0xce, 0x74, 0x4e, 0x9c, 0xcb, 0x76, 0x47, 0xb9, 0x6a, 0xb4, + 0x8d, 0x50, 0xb0, 0x7c, 0x77, 0x37, 0x7b, 0x9b, 0x5b, 0x75, 0xd1, 0x6d, + 0xc4, 0xbd, 0x3b, 0x89, 0x6f, 0x88, 0xa9, 0xbb, 0x6f, 0x35, 0x8f, 0xb1, + 0x85, 0x9a, 0xc0, 0x74, 0x61, 0x53, 0x77, 0x9e, 0xa5, 0x95, 0x70, 0xb8, + 0x79, 0x35, 0x7b, 0xb2, 0xcd, 0xc2, 0xb5, 0xaa, 0x89, 0x37, 0xb7, 0x61, + 0xcd, 0x9a, 0x36, 0x3b, 0x37, 0x67, 0x9e, 0xc0, 0xad, 0x8c, 0x32, 0xab, + 0xbf, 0x5a, 0x99, 0x38, 0xb0, 0x63, 0x48, 0x4b, 0x65, 0x9e, 0xa1, 0x3a, + 0x67, 0x9e, 0x9f, 0x59, 0x75, 0x6e, 0xc3, 0x5f, 0x6e, 0x75, 0xa4, 0xb4, + 0x5f, 0x4c, 0x8a, 0x60, 0x85, 0x4c, 0x71, 0x76, 0x84, 0x89, 0x46, 0x6f, + 0x46, 0xa2, 0x82, 0x9c, 0x53, 0x7d, 0x71, 0x62, 0x4e, 0xc9, 0xcc, 0x64, + 0x4a, 0x7b, 0x66, 0x45, 0x8d, 0x94, 0x64, 0x79, 0xaa, 0x4a, 0x9f, 0x76, + 0x65, 0xca, 0xaa, 0x54, 0xbd, 0x60, 0xc6, 0x52, 0xba, 0xc4, 0x60, 0x76, + 0x85, 0x5c, 0x81, 0xcc, 0x87, 0x49, 0x9a, 0x6c, 0x99, 0x78, 0x40, 0x65, + 0x37, 0x42, 0x9a, 0x32, 0x70, 0x8f, 0xaf, 0x7e, 0x9a, 0x87, 0xc7, 0x55, + 0x7b, 0x4d, 0x9f, 0x5b, 0x73, 0x99, 0x7f, 0x8e, 0xac, 0x71, 0xc7, 0x52, + 0x3c, 0x9c, 0xc2, 0x67, 0x95, 0x9c, 0xb2, 0x9c, 0xa3, 0x55, 0x94, 0xd2, + 
0xc6, 0x67, 0xcf, 0xbc, 0x8f, 0x50, 0x53, 0xb1, 0xa5, 0x33, 0x3e, 0xb9, + 0x97, 0xb7, 0xa6, 0x49, 0x7b, 0xb9, 0xc0, 0x52, 0xc7, 0x9a, 0x97, 0xb0, + 0x84, 0x5a, 0x7a, 0x97, 0x52, 0x70, 0x43, 0x68, 0xad, 0xa9, 0x87, 0x73, + 0x51, 0xcc, 0x71, 0x31, 0xc2, 0x79, 0x65, 0x72, 0x46, 0x39, 0x37, 0x3b, + 0x88, 0x44, 0x8b, 0x5b, 0xbc, 0x8b, 0x4a, 0xbc, 0xad, 0x76, 0x61, 0x9b, + 0xc4, 0x8e, 0x86, 0x40, 0x8e, 0x6c, 0x6c, 0x3d, 0x3a, 0xa9, 0xc5, 0x85, + 0x6d, 0x57, 0xba, 0x85, 0x86, 0x8d, 0xb9, 0x88, 0x3a, 0x67, 0x47, 0x5d, + 0x7c, 0xac, 0xa1, 0xcd, 0x66, 0x71, 0xb9, 0x78, 0x3e, 0x36, 0xbb, 0x65, + 0x74, 0xb4, 0x5c, 0xc0, 0x3b, 0x68, 0xcc, 0xcf, 0xc7, 0x48, 0x4b, 0xac, + 0x4e, 0x9a, 0x9b, 0x4c, 0x40, 0x64, 0x7e, 0xc8, 0xc2, 0x99, 0x5f, 0x71, + 0x69, 0x73, 0x78, 0x5b, 0x32, 0x4e, 0x4c, 0x45, 0x8f, 0x59, 0x78, 0xcb, + 0xba, 0x97, 0x9b, 0x56, 0xbe, 0x6c, 0x63, 0xcf, 0x9e, 0xa9, 0xcf, 0x60, + 0x59, 0x3e, 0x5d, 0xcf, 0xc3, 0x65, 0x7a, 0x76, 0x64, 0x3f, 0x5b, 0xc7, + 0x88, 0x33, 0x82, 0xae, 0x64, 0xbb, 0xb0, 0xaa, 0xae, 0x32, 0x91, 0xd1, + 0x6f, 0x74, 0x5e, 0xd6, 0x46, 0xc6, 0xae, 0x64, 0x80, 0x96, 0x5a, 0x8c, + 0xce, 0x71, 0x8c, 0x54, 0x7e, 0x47, 0x7a, 0x4d, 0x94, 0x4c, 0x43, 0x36, + 0xce, 0x42, 0xb7, 0x6d, 0x69, 0xbe, 0x45, 0x7b, 0x82, 0x89, 0xd4, 0x3b, + 0x89, 0xa1, 0xae, 0x42, 0x54, 0xb6, 0x39, 0x45, 0xc4, 0xca, 0x3a, 0x3d, + 0x84, 0xb2, 0x53, 0x87, 0xbe, 0x2b, 0x61, 0xae, 0x90, 0xa8, 0x53, 0x64, + 0xad, 0x85, 0x8b, 0x80, 0x90, 0xca, 0xa5, 0xa0, 0x8b, 0xba, 0x53, 0x65, + 0x7a, 0x63, 0x6c, 0xbb, 0x88, 0xbf, 0xba, 0x42, 0xb7, 0x48, 0xbd, 0xad, + 0xcc, 0x72, 0x9a, 0xa5, 0x78, 0x70, 0xa0, 0xaa, 0xcd, 0x61, 0xc9, 0x36, + 0xa4, 0x8c, 0x31, 0x86, 0x88, 0xc9, 0xa6, 0x2d, 0x92, 0x2d, 0x82, 0x47, + 0x72, 0xc8, 0xb2, 0x32, 0x8c, 0x29, 0x74, 0xb5, 0x82, 0x40, 0x7e, 0x2d, + 0x88, 0x46, 0x35, 0xc1, 0xc9, 0xa7, 0x2f, 0x4f, 0xc3, 0x70, 0xa9, 0x82, + 0x2d, 0x91, 0xa5, 0x83, 0xa3, 0x61, 0x7f, 0x84, 0x81, 0x81, 0x80, 0x69, + 0x36, 0xa0, 0xaf, 0x35, 0xc3, 0x41, 0x55, 0x91, 0xce, 0xca, 0x38, 0x7a, + 0xc5, 0x8d, 0x72, 0xa7, 0x62, 0x85, 0x4b, 0x8e, 0x70, 0xaf, 0x8d, 0x68, + 0xa9, 0xb2, 0xa6, 0x7c, 0xc6, 0x50, 0x3f, 0x42, 0x9e, 0x38, 0x88, 0x7f, + 0x98, 0x75, 0x51, 0xc9, 0xa4, 0xa9, 0xc6, 0x6b, 0x34, 0x81, 0x4a, 0xaa, + 0x5b, 0xc6, 0xa9, 0x63, 0x52, 0x84, 0x62, 0xcb, 0x6b, 0xc3, 0x92, 0x5f, + 0x9f, 0xb6, 0x96, 0x8f, 0x9b, 0x74, 0x56, 0x3a, 0x4f, 0x40, 0x66, 0xb6, + 0x58, 0x97, 0x80, 0x88, 0x70, 0x4a, 0x3a, 0xad, 0x7a, 0x53, 0x80, 0x4a, + 0x6d, 0xa2, 0x93, 0xb1, 0xa9, 0x76, 0x88, 0x4a, 0x4f, 0xb4, 0x4b, 0x32, + 0x91, 0xa0, 0xc1, 0xca, 0xbe, 0x70, 0x4b, 0xaa, 0x40, 0x3d, 0x40, 0x84, + 0xca, 0x5e, 0x9e, 0x69, 0x92, 0xa2, 0x45, 0xaa, 0x72, 0x4f, 0x83, 0x8e, + 0x75, 0x66, 0xbe, 0xa5, 0x6d, 0x89, 0x97, 0x6b, 0x97, 0x3f, 0x90, 0x7c, + 0x53, 0x42, 0xcf, 0xbd, 0xa0, 0x64, 0x45, 0x49, 0x32, 0xa7, 0xb0, 0x7b, + 0x48, 0x83, 0x99, 0x60, 0x85, 0xaa, 0x73, 0x52, 0x5d, 0xa8, 0x76, 0x8b, + 0x90, 0x37, 0xc9, 0xab, 0x3c, 0x42, 0x51, 0x80, 0xab, 0x53, 0x8f, 0x7c, + 0xae, 0xc3, 0x59, 0x6f, 0xa2, 0x9d, 0x5e, 0xa5, 0x5a, 0xb6, 0x6c, 0x37, + 0xc9, 0x5b, 0x99, 0x8b, 0x41, 0xb3, 0x3a, 0x3d, 0x41, 0x7c, 0x5d, 0x84, + 0x40, 0x53, 0x96, 0x8a, 0xa6, 0x4c, 0xcb, 0xc3, 0xa0, 0xab, 0x4e, 0x3d, + 0xab, 0xa4, 0x3b, 0x46, 0xc5, 0xa7, 0xae, 0x30, 0xb8, 0x39, 0x84, 0xbd, + 0xce, 0xbf, 0xbb, 0x55, 0x2f, 0x54, 0xbf, 0xc4, 0x3d, 0xbb, 0x9a, 0x58, + 0xb0, 0x3f, 0xa1, 0xb9, 0xa8, 0x46, 0x40, 0x51, 0x49, 0x58, 0x87, 0xca, + 0x70, 0xb6, 0x4a, 0x92, 0xa1, 0xc2, 0x88, 0xbf, 0x92, 0x3c, 0x65, 0x41, + 0x59, 0x85, 0x5e, 0x9c, 0x48, 0x87, 0xbf, 0x9d, 0x90, 0xab, 0xbd, 0x9e, + 
0x75, 0x99, 0x4e, 0xa9, 0x44, 0x55, 0x71, 0x6c, 0x99, 0xce, 0xb2, 0x73, + 0x8a, 0x4e, 0x49, 0x62, 0x40, 0x44, 0x4d, 0xb7, 0x89, 0x53, 0x7d, 0x65, + 0x41, 0x45, 0x36, 0xa8, 0x4e, 0x5c, 0xb6, 0x45, 0x9d, 0xa0, 0xa1, 0x45, + 0x54, 0xce, 0x8c, 0xd2, 0x4b, 0x73, 0x59, 0xbd, 0xc0, 0x6e, 0x8e, 0x5b, + 0x47, 0x6d, 0x66, 0x92, 0x88, 0x9e, 0x9f, 0x8a, 0x71, 0x96, 0x49, 0x76, + 0xaf, 0x48, 0x79, 0x4e, 0x60, 0x42, 0x72, 0xa1, 0x43, 0x79, 0x74, 0x5e, + 0x59, 0x67, 0x39, 0x73, 0x61, 0xcc, 0x6e, 0x8f, 0x8c, 0x66, 0xc0, 0x92, + 0x61, 0xa1, 0x67, 0x5d, 0x8c, 0xb7, 0x96, 0x97, 0x7c, 0x2d, 0xb5, 0x58, + 0x82, 0x8f, 0xd6, 0x47, 0xce, 0x3a, 0x28, 0x53, 0xd0, 0x7e, 0x5c, 0x3e, + 0xb9, 0x83, 0x99, 0x73, 0x50, 0x4a, 0x3d, 0xa1, 0x53, 0x4b, 0x5c, 0x6b, + 0x65, 0xbc, 0x72, 0x27, 0x68, 0xba, 0xa5, 0xbc, 0xaa, 0xa0, 0x4a, 0x63, + 0x41, 0xd6, 0x56, 0xd4, 0x4d, 0x96, 0xc8, 0x4c, 0x71, 0x4f, 0x6b, 0x34, + 0x80, 0x83, 0x91, 0xb1, 0xcf, 0x4b, 0x69, 0x69, 0x97, 0x5c, 0x73, 0xe6, + 0x6f, 0x44, 0xb0, 0xac, 0xb6, 0x94, 0xc3, 0x5e, 0x55, 0x64, 0x4b, 0xac, + 0x35, 0x6b, 0x5c, 0xc7, 0x94, 0xd4, 0x8a, 0x3d, 0x71, 0x69, 0x40, 0x8d, + 0x65, 0x48, 0xe2, 0x82, 0x40, 0x86, 0xcb, 0x3f, 0xc1, 0xa0, 0x77, 0x86, + 0x61, 0x7d, 0x57, 0x49, 0xc6, 0xbf, 0x9e, 0x3c, 0x51, 0x62, 0x8a, 0x48, + 0x77, 0x91, 0x2e, 0x66, 0x2e, 0xbd, 0x41, 0x6d, 0x92, 0x3c, 0x2d, 0x99, + 0x69, 0x81, 0xc6, 0x6d, 0x63, 0x91, 0xbc, 0xa9, 0x45, 0x4e, 0x3f, 0x72, + 0x44, 0x82, 0x63, 0x8c, 0x5d, 0x4e, 0xbc, 0xb3, 0xb5, 0xa8, 0x9b, 0x78, + 0x65, 0x85, 0x41, 0x76, 0xae, 0x46, 0x78, 0xb6, 0xbd, 0x4c, 0x7d, 0xb8, + 0x7b, 0xcc, 0xb2, 0x73, 0x64, 0xa8, 0x54, 0x8a, 0x4b, 0x5c, 0x46, 0xb2, + 0x69, 0x98, 0x74, 0x98, 0x97, 0x71, 0x49, 0xc1, 0x67, 0x7f, 0x7c, 0x7d, + 0x6b, 0x89, 0x7a, 0x6d, 0x48, 0x4e, 0xd6, 0xc7, 0x85, 0xc2, 0x65, 0x3a, + 0x6b, 0xd5, 0x83, 0x4a, 0x80, 0x99, 0xcb, 0x2c, 0xb8, 0x9c, 0xbb, 0x73, + 0x7c, 0x32, 0x68, 0x76, 0x97, 0x71, 0x3e, 0xa3, 0x88, 0xcb, 0xc8, 0xbf, + 0x9a, 0x70, 0xca, 0x38, 0x9a, 0xb1, 0x82, 0x72, 0x53, 0x59, 0x48, 0x50, + 0xcc, 0xb5, 0x96, 0x3d, 0xe0, 0xaa, 0x3e, 0x4a, 0x4b, 0xac, 0xa0, 0x99, + 0x55, 0x55, 0x8b, 0x45, 0x5a, 0xa1, 0x50, 0xc1, 0xa4, 0x46, 0x53, 0xaf, + 0x51, 0x51, 0x70, 0x8b, 0x52, 0x82, 0xd2, 0x6e, 0x78, 0xb9, 0xa6, 0x84, + 0x79, 0x68, 0x7c, 0x69, 0x45, 0xab, 0xb3, 0xc5, 0x8f, 0x82, 0x3a, 0x68, + 0x92, 0x3c, 0x81, 0xc1, 0x97, 0x59, 0x7c, 0x63, 0xb4, 0xd6, 0xc2, 0xae, + 0x8f, 0x51, 0xd2, 0x71, 0xbc, 0x45, 0xa2, 0xa6, 0x62, 0xa4, 0x84, 0x3b, + 0x7e, 0xc7, 0xa1, 0x8a, 0x85, 0x56, 0x69, 0x4a, 0xaa, 0xaf, 0x91, 0x3d, + 0xb5, 0x50, 0x32, 0xa8, 0xa5, 0x46, 0x4f, 0x4c, 0xb0, 0xb4, 0x80, 0x9c, + 0xbb, 0x79, 0x60, 0x57, 0x90, 0x78, 0x85, 0x56, 0x65, 0x3f, 0x51, 0x4d, + 0x32, 0x25, 0xaa, 0xb3, 0xb9, 0xca, 0x79, 0x65, 0x54, 0xa8, 0x83, 0x5e, + 0xbc, 0xb8, 0x68, 0xd1, 0xa8, 0xaa, 0xc9, 0x5a, 0x70, 0x8c, 0x54, 0x65, + 0x3f, 0xcd, 0xd2, 0xb1, 0xa6, 0xcc, 0x5a, 0x52, 0xb6, 0x73, 0xc1, 0x8b, + 0x59, 0xc6, 0x9b, 0x9e, 0x3d, 0x57, 0xb4, 0x44, 0xa1, 0xa9, 0x60, 0x57, + 0xcb, 0x68, 0x45, 0xbc, 0x4b, 0x9c, 0x95, 0x56, 0x4b, 0x87, 0x51, 0xc6, + 0x8f, 0x86, 0x45, 0x59, 0xc0, 0x34, 0x89, 0xb2, 0x4b, 0x99, 0xa2, 0xbc, + 0xbf, 0x6b, 0xc5, 0xc2, 0x9a, 0x8c, 0xb1, 0x3b, 0x78, 0x9e, 0x8b, 0x50, + 0x69, 0xb9, 0x76, 0x77, 0x84, 0x8e, 0x75, 0x2d, 0x7f, 0xce, 0x60, 0xbb, + 0x58, 0x3c, 0x3c, 0x90, 0x6f, 0xbf, 0x90, 0x74, 0x3f, 0xb7, 0x86, 0x44, + 0x46, 0x9c, 0x8c, 0x9f, 0x68, 0x80, 0x4f, 0x92, 0x88, 0x76, 0x6c, 0x85, + 0xe8, 0x67, 0x98, 0x6a, 0xb8, 0xbd, 0x91, 0xa0, 0x7f, 0xae, 0x52, 0x53, + 0x34, 0xcc, 0x7e, 0x36, 0xbe, 0x9c, 0x9f, 0xc7, 0xce, 0xa4, 0x7c, 0x5c, + 
0xa5, 0x9c, 0x49, 0x80, 0x90, 0x99, 0x4f, 0x25, 0x8b, 0x83, 0x50, 0xd2, + 0xb1, 0x85, 0x90, 0xbd, 0xac, 0x30, 0xbd, 0x3a, 0x8f, 0x68, 0x27, 0x2e, + 0xd6, 0xb5, 0xd4, 0x54, 0x4c, 0x60, 0x90, 0xc5, 0x82, 0x42, 0x59, 0x3c, + 0x7b, 0xcb, 0x6e, 0x6b, 0x64, 0x85, 0xad, 0xb2, 0x7d, 0x8f, 0x64, 0xbc, + 0x5d, 0x6b, 0xd0, 0xc8, 0x4e, 0xb9, 0x4f, 0x6c, 0xc9, 0xc1, 0x67, 0xa0, + 0x47, 0x5f, 0x9b, 0xb2, 0x3a, 0xa1, 0x68, 0x9b, 0x87, 0x4b, 0x43, 0x46, + 0x5b, 0xc0, 0xba, 0x9f, 0x7c, 0x31, 0x89, 0x3e, 0x8e, 0xc7, 0x42, 0x67, + 0x2f, 0x87, 0x64, 0x77, 0xaf, 0x3a, 0x3e, 0xa5, 0xab, 0x69, 0x56, 0x80, + 0xbc, 0xf2, 0xc3, 0x61, 0x8b, 0xe2, 0x8b, 0xcb, 0xa1, 0x49, 0xbc, 0x72, + 0x94, 0xd0, 0x3b, 0xad, 0x4b, 0x8d, 0x96, 0x52, 0x65, 0x59, 0x7e, 0x77, + 0xa7, 0x5d, 0x74, 0x5f, 0x7f, 0x3f, 0xa5, 0x5a, 0xb1, 0xb5, 0x52, 0x39, + 0x86, 0x45, 0xc7, 0xc4, 0x35, 0xa9, 0x49, 0x42, 0x57, 0xa6, 0x91, 0xd0, + 0x8d, 0x7c, 0x89, 0x6e, 0x93, 0x87, 0xd8, 0x71, 0x62, 0x50, 0x48, 0x4e, + 0xdc, 0x62, 0x70, 0xcf, 0x5c, 0x57, 0x39, 0xca, 0xd3, 0x73, 0xb0, 0x70, + 0xc8, 0x5a, 0x84, 0x33, 0x7a, 0x95, 0x37, 0x51, 0x79, 0xac, 0xd9, 0xb4, + 0xd3, 0xbd, 0xaf, 0x45, 0xc5, 0x8e, 0xa4, 0xa3, 0xaf, 0x36, 0x99, 0x94, + 0x7c, 0x2e, 0x29, 0x62, 0xbe, 0x47, 0xb1, 0x73, 0xc9, 0x43, 0x8a, 0xb0, + 0x66, 0xb2, 0x92, 0x91, 0xc1, 0x6f, 0x5c, 0x4b, 0x9c, 0xa0, 0x9d, 0xbc, + 0x4f, 0x51, 0xcb, 0x57, 0xa5, 0xb3, 0xca, 0x59, 0xa4, 0x4c, 0x89, 0x91, + 0x7a, 0xcf, 0x84, 0x75, 0x8d, 0xd8, 0x33, 0x91, 0x81, 0x71, 0x84, 0xb0, + 0x56, 0xd2, 0x4a, 0xc4, 0x4d, 0x67, 0x98, 0x77, 0x6f, 0x89, 0xbd, 0x82, + 0x3d, 0x66, 0x53, 0x43, 0x36, 0xab, 0x49, 0xd0, 0x44, 0x87, 0x83, 0x3e, + 0x8e, 0xcb, 0x9a, 0x79, 0x92, 0x8c, 0xa3, 0x57, 0xca, 0xa4, 0x95, 0x81, + 0x94, 0x6b, 0x40, 0x7d, 0xc5, 0x9e, 0x4d, 0xa5, 0x88, 0xd7, 0xb1, 0xa8, + 0xb8, 0x7a, 0x58, 0xb1, 0x40, 0x7d, 0xa1, 0x9a, 0xbd, 0xa4, 0xc5, 0x3f, + 0xa0, 0x7d, 0x8c, 0x60, 0xca, 0xbb, 0xba, 0x68, 0x50, 0x42, 0xbb, 0x6e, + 0xbe, 0xa5, 0x89, 0x73, 0x74, 0xaf, 0x7a, 0xac, 0x2b, 0x4b, 0x7e, 0x7e, + 0x4d, 0x91, 0x81, 0x7c, 0x9e, 0xd8, 0x3f, 0xa0, 0x45, 0x4c, 0x65, 0xb7, + 0x97, 0x65, 0xca, 0x9a, 0xb5, 0x6c, 0xb6, 0xab, 0x85, 0x3d, 0x8b, 0x75, + 0xad, 0xc0, 0x2b, 0x61, 0x89, 0xac, 0x3d, 0x4b, 0xb6, 0x68, 0x53, 0xc6, + 0x4d, 0x3a, 0x7e, 0x89, 0xce, 0x69, 0x3f, 0x62, 0x9f, 0x84, 0xad, 0x98, + 0xb0, 0xcd, 0xe4, 0x8f, 0x7a, 0xc5, 0x58, 0xa5, 0x60, 0x37, 0x9d, 0xc7, + 0x69, 0x8c, 0xba, 0x78, 0xad, 0x62, 0xba, 0x59, 0x88, 0x64, 0x43, 0xbe, + 0x56, 0x84, 0x35, 0x31, 0x76, 0xcb, 0x6b, 0xb4, 0xb1, 0xa2, 0x5e, 0x33, + 0x94, 0x98, 0x80, 0x8e, 0x55, 0x78, 0x5a, 0xa8, 0x41, 0x9a, 0x5f, 0x77, + 0x53, 0xcd, 0x6a, 0x5b, 0x8c, 0x6e, 0x96, 0xd7, 0x2f, 0x63, 0x68, 0x4d, + 0x5b, 0x9b, 0xcb, 0x4a, 0xca, 0x53, 0xcf, 0x68, 0x7e, 0x90, 0x91, 0x79, + 0xc5, 0x94, 0x67, 0x8f, 0x8a, 0xa3, 0x50, 0x76, 0x6e, 0x8b, 0x80, 0x53, + 0xc1, 0xc0, 0x84, 0xa6, 0x6d, 0xbe, 0x68, 0x41, 0x63, 0xa8, 0xa7, 0x44, + 0x92, 0xcd, 0xc4, 0xbd, 0xbb, 0xd2, 0x57, 0x76, 0x56, 0xc0, 0xa5, 0x36, + 0x5f, 0x94, 0x4a, 0xab, 0x5d, 0x82, 0x49, 0xb2, 0x8a, 0xb2, 0xa3, 0xa3, + 0xc0, 0xc0, 0x4d, 0x6e, 0x44, 0xbf, 0xba, 0x86, 0x60, 0x6b, 0xa2, 0xc5, + 0x66, 0x64, 0x4f, 0xae, 0x98, 0x70, 0x49, 0xbb, 0x40, 0xb2, 0x83, 0x51, + 0xc5, 0x8b, 0x5c, 0x3f, 0xb6, 0xa5, 0xa4, 0x62, 0xa9, 0xa4, 0x89, 0x70, + 0xcb, 0x35, 0x91, 0x7d, 0xb0, 0xa6, 0xa0, 0x7e, 0x48, 0x63, 0x8a, 0x51, + 0xc9, 0x7f, 0x3b, 0x4a, 0x99, 0xa8, 0xc6, 0xb0, 0xb8, 0x3a, 0x6f, 0xb2, + 0xa3, 0xa2, 0x9e, 0x3e, 0xd3, 0x6b, 0x52, 0x52, 0xc5, 0xd6, 0xd6, 0xd1, + 0x25, 0x3c, 0x2c, 0x4c, 0x79, 0x70, 0x4f, 0x99, 0xc8, 0xc2, 0xc4, 0x8a, + 
0xd6, 0xd1, 0x35, 0x65, 0xd3, 0xa4, 0x9f, 0x7d, 0xbc, 0x7a, 0xa8, 0x4c, + 0x26, 0xb5, 0x67, 0x78, 0x44, 0xc4, 0x5c, 0xd0, 0xba, 0xcc, 0x63, 0xa4, + 0x6b, 0xcd, 0xc9, 0x97, 0x86, 0x51, 0x5a, 0x90, 0xd4, 0xcf, 0xc3, 0x82, + 0x60, 0x8d, 0x7e, 0x5c, 0x72, 0x5e, 0x30, 0xb1, 0xc5, 0x64, 0x72, 0xb0, + 0x9c, 0xbb, 0xd4, 0x7e, 0xd2, 0xae, 0xbc, 0x72, 0xb9, 0xd0, 0xb9, 0xc3, + 0x95, 0x6b, 0xbe, 0xc1, 0xad, 0x98, 0x4c, 0x3b, 0xbf, 0x38, 0x85, 0x97, + 0x45, 0x69, 0x47, 0xb5, 0xa1, 0x6c, 0x65, 0x77, 0x5a, 0x99, 0x92, 0xd0, + 0xa3, 0x8b, 0x8d, 0xc7, 0x42, 0xb5, 0xbf, 0x4f, 0xd3, 0x69, 0x9b, 0xab, + 0x82, 0x99, 0x7c, 0x4c, 0x9b, 0x6c, 0x50, 0x99, 0x70, 0xc7, 0x4d, 0x70, + 0x6c, 0x50, 0x51, 0x60, 0x53, 0x35, 0x78, 0x7e, 0xbf, 0x78, 0x68, 0x7c, + 0xbb, 0xc1, 0x4b, 0x59, 0x35, 0x95, 0x49, 0xa7, 0xd2, 0x55, 0x9a, 0x81, + 0x62, 0x65, 0x8c, 0x8a, 0x52, 0x1e, 0xab, 0x4f, 0xa9, 0xb8, 0x5b, 0x87, + 0xbd, 0x8c, 0x5d, 0x9c, 0x34, 0x98, 0x42, 0x2a, 0x94, 0xa4, 0x95, 0xa3, + 0x74, 0xc1, 0x53, 0xc7, 0x3d, 0x9c, 0xa9, 0x7e, 0x7c, 0x85, 0x3b, 0x43, + 0x8e, 0x71, 0x82, 0x5b, 0x58, 0x83, 0xdd, 0x69, 0xa8, 0x95, 0xd7, 0x50, + 0xc1, 0x8d, 0x91, 0x4f, 0x3b, 0x3b, 0x43, 0xa7, 0xc4, 0x84, 0x78, 0x62, + 0x88, 0x8f, 0x91, 0x5e, 0xc8, 0x52, 0xc1, 0xbe, 0x8d, 0x57, 0x6d, 0x8c, + 0x57, 0x4f, 0x5d, 0x62, 0xb6, 0xb7, 0x35, 0x6d, 0xc4, 0xb6, 0x6b, 0xe1, + 0xb8, 0x7d, 0xbd, 0x7f, 0x2b, 0x65, 0x9a, 0x75, 0x48, 0xcf, 0x3f, 0x6d, + 0xa8, 0x98, 0xdd, 0x59, 0xa2, 0x7c, 0x83, 0xb8, 0x86, 0x84, 0xcc, 0x97, + 0x9b, 0x93, 0x87, 0x46, 0x3a, 0x4c, 0xd8, 0xce, 0x88, 0x8d, 0xab, 0x6e, + 0x6d, 0xc4, 0x64, 0x46, 0x6b, 0x91, 0x4a, 0xaf, 0xd6, 0x44, 0xd6, 0x8e, + 0xa3, 0xab, 0x92, 0x36, 0x8b, 0x48, 0x3b, 0x9f, 0x83, 0x8f, 0xd2, 0x4a, + 0x3c, 0x80, 0x3e, 0x8a, 0x57, 0x72, 0x78, 0x3d, 0x89, 0x95, 0x58, 0x51, + 0x9c, 0x6d, 0xc6, 0x42, 0x4e, 0x5a, 0x3b, 0x8e, 0x42, 0x6f, 0x2a, 0xad, + 0x65, 0xc6, 0x66, 0x7d, 0x82, 0xa0, 0xb5, 0xc8, 0x76, 0x66, 0x77, 0xbe, + 0x3d, 0x54, 0x91, 0x57, 0x8b, 0xb1, 0x8f, 0x6f, 0xa3, 0xa7, 0x51, 0x2f, + 0x33, 0xad, 0xa6, 0xaf, 0x8d, 0x3d, 0x39, 0xac, 0x75, 0xd2, 0x92, 0x71, + 0x70, 0x4b, 0x65, 0xa3, 0xc2, 0xcf, 0x5b, 0x44, 0x4d, 0x90, 0xbd, 0x41, + 0x80, 0x63, 0x5b, 0x44, 0x47, 0x49, 0xcb, 0x90, 0xac, 0x59, 0x85, 0x99, + 0x89, 0x57, 0x7f, 0x59, 0xa7, 0x9a, 0x8b, 0x9d, 0xd7, 0x62, 0xa5, 0x9a, + 0x54, 0xa5, 0x6c, 0x76, 0x79, 0xae, 0x31, 0x72, 0x86, 0x99, 0x36, 0x98, + 0x6e, 0xa0, 0x5d, 0x81, 0x36, 0x82, 0x9c, 0xc7, 0x73, 0x86, 0xaa, 0xca, + 0x69, 0xb8, 0xb2, 0x3d, 0x5c, 0x88, 0xb3, 0xbc, 0x41, 0x9a, 0x5f, 0x84, + 0x3c, 0xb1, 0x66, 0xc9, 0x69, 0x5c, 0x60, 0x7c, 0xbe, 0xae, 0xd8, 0xae, + 0x7a, 0xcb, 0x80, 0x9e, 0x7e, 0x60, 0x95, 0x6a, 0xb7, 0x2a, 0xb9, 0x8b, + 0xab, 0x65, 0x73, 0xc9, 0x82, 0xc8, 0x38, 0xac, 0xb9, 0x40, 0xa1, 0x39, + 0x33, 0x53, 0x4d, 0x9a, 0x58, 0xc8, 0x92, 0x57, 0xbe, 0x46, 0xa0, 0x6e, + 0x62, 0x40, 0x9a, 0x33, 0x39, 0x7b, 0xbf, 0x4e, 0xa2, 0x73, 0x49, 0x43, + 0xb3, 0x81, 0x7e, 0x3a, 0x44, 0x5a, 0x60, 0xa9, 0x8b, 0x6a, 0xad, 0x90, + 0x9f, 0x6f, 0x62, 0x97, 0xbe, 0x6e, 0x36, 0x3d, 0x7f, 0x85, 0x6c, 0xd1, + 0x52, 0x52, 0x3d, 0xcf, 0x59, 0xac, 0x4a, 0x82, 0xa9, 0xc5, 0xbc, 0x8f, + 0x88, 0x71, 0xcf, 0x96, 0x9e, 0x4c, 0xa8, 0xad, 0x9d, 0xd0, 0x38, 0x4a, + 0xc0, 0xa0, 0xb4, 0x78, 0xa1, 0xb0, 0xa8, 0xa4, 0x71, 0xb8, 0x73, 0x66, + 0x81, 0xc6, 0xbd, 0x42, 0xcb, 0xb5, 0x7e, 0xa7, 0xb3, 0x38, 0x42, 0xa0, + 0xcf, 0x57, 0x4a, 0xa2, 0xb1, 0xb5, 0x3f, 0x71, 0xa6, 0xba, 0x33, 0x8a, + 0x9d, 0xa1, 0x95, 0x40, 0xa5, 0x70, 0x7c, 0xa3, 0x7f, 0xb8, 0x91, 0x5e, + 0xd1, 0x64, 0x95, 0x4a, 0xbd, 0xca, 0xa9, 0x72, 0xbc, 0x8f, 0x94, 0x52, + 
0x34, 0xb2, 0x9c, 0xd0, 0xb6, 0x51, 0x9e, 0x5f, 0x6f, 0xba, 0xc5, 0x48, + 0x9f, 0x50, 0xaf, 0x45, 0x58, 0x41, 0x75, 0x8a, 0xab, 0xc4, 0x73, 0x9a, + 0x3c, 0xb7, 0x9c, 0x47, 0xc8, 0xa5, 0x82, 0xad, 0xce, 0x8f, 0x7e, 0xbc, + 0x40, 0x57, 0x72, 0xa5, 0x81, 0x77, 0xad, 0x35, 0x3d, 0x45, 0xc0, 0xaa, + 0x9c, 0x55, 0x63, 0x82, 0x4b, 0x7f, 0x85, 0x80, 0x57, 0x74, 0x3c, 0x5b, + 0x43, 0x5e, 0xa3, 0x78, 0x8d, 0x7a, 0x69, 0x8c, 0xb5, 0x7b, 0x34, 0x70, + 0xa5, 0x8a, 0x6c, 0x4a, 0xbd, 0x9d, 0x41, 0xa8, 0xcd, 0x52, 0xb1, 0x69, + 0xad, 0x96, 0x8a, 0x91, 0xce, 0x54, 0x8d, 0x4a, 0x6d, 0x63, 0xd4, 0x8a, + 0x8a, 0x3b, 0x61, 0x6b, 0x3d, 0x46, 0x71, 0x8e, 0x58, 0xc3, 0x7f, 0xbb, + 0xc1, 0xa7, 0x5d, 0x9b, 0x66, 0xa6, 0xab, 0x9c, 0x49, 0x30, 0x42, 0x63, + 0x41, 0x80, 0x59, 0xb5, 0x45, 0xc1, 0xb4, 0x36, 0x3a, 0x9d, 0xc9, 0xcb, + 0xb1, 0xce, 0x3b, 0xb5, 0xc5, 0xc1, 0x5a, 0x39, 0x57, 0xa5, 0x47, 0x29, + 0xbf, 0xc8, 0xc0, 0x8e, 0x9c, 0xc2, 0x53, 0x8d, 0x61, 0xbd, 0xc4, 0x45, + 0x5f, 0x70, 0x4b, 0x36, 0x45, 0xb8, 0x40, 0xce, 0x5f, 0xa9, 0x54, 0x2d, + 0x6b, 0x63, 0xc2, 0x5e, 0xc2, 0x46, 0xcc, 0x51, 0x8e, 0x3d, 0xcb, 0x35, + 0x44, 0xbd, 0x36, 0x27, 0x4d, 0xae, 0x96, 0xb0, 0x8c, 0x9a, 0xa3, 0x9e, + 0xc7, 0x96, 0x3d, 0x42, 0x56, 0x89, 0x42, 0x92, 0x4d, 0x51, 0x59, 0x74, + 0x87, 0x39, 0x44, 0xa5, 0xd1, 0x90, 0x57, 0xaf, 0x7f, 0xc2, 0x8c, 0xd4, + 0xb0, 0x9c, 0xc1, 0x9d, 0x69, 0x79, 0x6e, 0x6d, 0x9c, 0x6f, 0x7e, 0x80, + 0x63, 0xa6, 0x47, 0xa3, 0x96, 0x33, 0xbe, 0xc7, 0x54, 0x4c, 0xc9, 0x53, + 0x36, 0x60, 0x90, 0x96, 0x75, 0x68, 0x6e, 0xac, 0x3c, 0x92, 0xc7, 0x7f, + 0x30, 0x82, 0x3c, 0xbe, 0x52, 0x32, 0xaa, 0xa3, 0x76, 0x5b, 0xa2, 0xdd, + 0x9c, 0x83, 0x4c, 0x5a, 0xbf, 0x66, 0xcf, 0xd1, 0xaf, 0x4d, 0x42, 0x74, + 0x9d, 0x77, 0x40, 0x80, 0x61, 0xbf, 0x9e, 0xba, 0xb9, 0x98, 0xce, 0x46, + 0x54, 0xa0, 0x83, 0xd1, 0x6f, 0x3a, 0xa6, 0x96, 0x2a, 0x36, 0x95, 0xcf, + 0xae, 0x30, 0xcf, 0xcf, 0x65, 0xae, 0x3a, 0x91, 0x56, 0x9d, 0x5c, 0x86, + 0x49, 0xbc, 0x95, 0x57, 0xc0, 0x7b, 0x41, 0x69, 0x9e, 0xb2, 0xa3, 0xa7, + 0x9e, 0x9a, 0xa5, 0x79, 0x60, 0x46, 0x72, 0x85, 0x9b, 0x66, 0x86, 0xb9, + 0xa4, 0xce, 0xa2, 0x5e, 0x5a, 0x8e, 0x9c, 0x68, 0x41, 0x8d, 0xbd, 0xd3, + 0x56, 0x6b, 0xc6, 0x90, 0x60, 0x3e, 0x94, 0xa1, 0x71, 0x65, 0x51, 0x48, + 0x94, 0xb9, 0x4c, 0xd9, 0xaa, 0x9d, 0x41, 0xbc, 0xb2, 0xa9, 0x59, 0x77, + 0xb8, 0x3c, 0x61, 0x6d, 0xc1, 0xaf, 0x7f, 0x7a, 0xa6, 0x7e, 0x68, 0x34, + 0xad, 0xa1, 0x28, 0x57, 0x81, 0x74, 0x25, 0x5e, 0xb5, 0x7f, 0x8d, 0x7c, + 0x6b, 0xcb, 0x6c, 0x4a, 0x99, 0x5c, 0x2c, 0x33, 0xad, 0x4e, 0x6e, 0x3a, + 0x2f, 0x63, 0x93, 0x73, 0x3a, 0xb5, 0x64, 0x96, 0xa1, 0xca, 0x54, 0x43, + 0x32, 0x35, 0x80, 0x9e, 0x5e, 0x6c, 0xd1, 0x7e, 0x88, 0x5f, 0xc8, 0x79, + 0xce, 0x8f, 0xa1, 0x97, 0xc6, 0x84, 0x53, 0x48, 0x40, 0x32, 0xd6, 0x41, + 0xa5, 0xb7, 0x95, 0xc8, 0x93, 0x64, 0x7d, 0xb3, 0x52, 0x83, 0x66, 0x5c, + 0x66, 0x63, 0xa0, 0x7c, 0x39, 0x44, 0x68, 0x99, 0xd0, 0x8c, 0x50, 0x82, + 0x31, 0x5d, 0x87, 0xd1, 0x50, 0x41, 0xbf, 0xd6, 0x9b, 0x82, 0x57, 0x7d, + 0x50, 0xae, 0x6f, 0x74, 0xae, 0x63, 0x85, 0xb4, 0x63, 0xa8, 0x71, 0xb7, + 0x7f, 0x90, 0x82, 0x3b, 0x37, 0x9b, 0x39, 0x57, 0x6a, 0xc4, 0x4a, 0xe4, + 0xaa, 0x59, 0xa2, 0x8b, 0xb8, 0xba, 0xa9, 0xb4, 0x4d, 0x58, 0x6e, 0xb8, + 0x49, 0x70, 0x5b, 0xd1, 0x64, 0x40, 0x71, 0x3d, 0x98, 0x49, 0x6a, 0x36, + 0x7f, 0x8d, 0xb8, 0x6a, 0x97, 0xd6, 0x35, 0x40, 0x6c, 0x75, 0x36, 0xa2, + 0xaf, 0x76, 0x90, 0x77, 0x9a, 0x59, 0x72, 0x35, 0x49, 0x70, 0x62, 0x54, + 0x8b, 0xc1, 0x52, 0x81, 0x45, 0x72, 0x52, 0x36, 0x84, 0x85, 0xae, 0x34, + 0x31, 0x45, 0x43, 0x68, 0x8c, 0xa2, 0x60, 0xd1, 0xc3, 0xd1, 0x7c, 0xac, + 
0x36, 0x4d, 0x34, 0x45, 0x83, 0xbb, 0x84, 0xc4, 0x38, 0x5f, 0xce, 0x8c, + 0x53, 0x91, 0xb7, 0x4f, 0xab, 0xc3, 0x65, 0x4e, 0x85, 0x8d, 0x59, 0xa9, + 0x83, 0x79, 0x36, 0xbf, 0x87, 0x90, 0x65, 0x92, 0x35, 0x92, 0x98, 0x7c, + 0x84, 0xcd, 0xcd, 0x73, 0x96, 0x38, 0x5b, 0x59, 0xaa, 0xce, 0xba, 0xa6, + 0x7b, 0xd1, 0x63, 0x57, 0x3f, 0xca, 0x45, 0x91, 0xad, 0xb4, 0xbf, 0x7b, + 0x2f, 0x88, 0xcd, 0x65, 0x6c, 0x34, 0x57, 0x4b, 0x2b, 0x66, 0x73, 0xc1, + 0x80, 0x5e, 0x7d, 0x9e, 0xc0, 0xc9, 0x96, 0xb5, 0x2e, 0x80, 0x7f, 0xad, + 0x35, 0xaf, 0x83, 0x65, 0x73, 0xc2, 0x93, 0x9a, 0x8b, 0xc1, 0x88, 0x61, + 0x9c, 0x67, 0x4b, 0xc8, 0x90, 0x52, 0xa6, 0x34, 0xc8, 0xa2, 0xb0, 0xb5, + 0x3a, 0x8f, 0x59, 0x7b, 0x57, 0xbc, 0x4e, 0xa8, 0x5b, 0xc6, 0xaf, 0xb4, + 0x5d, 0x71, 0x92, 0xae, 0x62, 0x46, 0x97, 0x4b, 0xa8, 0x50, 0xa1, 0xcc, + 0x98, 0xd4, 0x5c, 0x3e, 0x44, 0x61, 0xb9, 0x70, 0x88, 0x6e, 0x97, 0xa0, + 0x32, 0x4f, 0xc1, 0x8d, 0xbe, 0xd2, 0xb4, 0x64, 0x5e, 0x52, 0xcd, 0x65, + 0xba, 0xa9, 0x59, 0x48, 0xd0, 0xcc, 0xbe, 0x60, 0x5b, 0x50, 0x95, 0xa5, + 0xc8, 0xa5, 0xb3, 0x40, 0x55, 0xba, 0x88, 0xbc, 0xbc, 0xb0, 0x81, 0x62, + 0xc8, 0x58, 0xaf, 0x98, 0x2d, 0x84, 0xc8, 0x84, 0xcc, 0x58, 0x44, 0x7a, + 0xca, 0x85, 0xbc, 0x3f, 0x8b, 0x7a, 0x8a, 0x31, 0xc5, 0xaf, 0xd0, 0x3c, + 0x8b, 0x6b, 0xd0, 0xc9, 0xb0, 0x51, 0x9f, 0xaf, 0x77, 0xca, 0x68, 0xb2, + 0x9e, 0x95, 0x58, 0xc2, 0x44, 0x81, 0x67, 0x4f, 0x69, 0x76, 0xd2, 0x4e, + 0xb5, 0x52, 0xca, 0x25, 0xb5, 0x3e, 0x4d, 0x5c, 0xc4, 0x3c, 0x8a, 0xad, + 0x76, 0x47, 0x3f, 0x4e, 0xa6, 0xc1, 0x91, 0x33, 0xa7, 0x44, 0xd1, 0x3c, + 0xd2, 0x92, 0x8c, 0x8d, 0x80, 0x5c, 0x67, 0x5f, 0x90, 0x77, 0x4a, 0x38, + 0xc8, 0xa9, 0x2f, 0x71, 0xba, 0xba, 0x37, 0x3d, 0xa6, 0x75, 0xa8, 0xc6, + 0xd1, 0x37, 0x39, 0x32, 0xce, 0xaf, 0xcd, 0xc5, 0x9c, 0x90, 0x4f, 0x95, + 0xa7, 0xb3, 0x79, 0x86, 0x40, 0x75, 0x4b, 0xd4, 0x69, 0x7d, 0x8f, 0x93, + 0xa0, 0xc6, 0x83, 0x36, 0x39, 0xcc, 0x8f, 0x95, 0x8b, 0x3b, 0x9d, 0x55, + 0x91, 0x46, 0x3f, 0x65, 0xb2, 0x39, 0x80, 0xb7, 0x3a, 0x83, 0x52, 0x68, + 0xa7, 0x9d, 0x48, 0x9a, 0x52, 0x30, 0x72, 0x9f, 0x40, 0xbf, 0xc5, 0x36, + 0x6c, 0x8b, 0x88, 0xa9, 0xc4, 0xd8, 0xc1, 0x7a, 0x4e, 0x53, 0x61, 0x3f, + 0x5f, 0xad, 0x83, 0x81, 0xb2, 0x48, 0xb7, 0xc7, 0x5e, 0xa2, 0x84, 0xb0, + 0x4d, 0x51, 0x35, 0xce, 0x37, 0xc6, 0x58, 0x33, 0x99, 0xc0, 0x5f, 0xac, + 0x8c, 0x5f, 0x63, 0x55, 0xca, 0xd0, 0xad, 0xc6, 0xa4, 0x80, 0x72, 0xb6, + 0xb6, 0x32, 0x8a, 0xaf, 0x50, 0x7c, 0xac, 0xa6, 0xa6, 0x79, 0x38, 0x52, + 0x56, 0xc3, 0x96, 0xc4, 0xa5, 0x82, 0xa2, 0x93, 0xbf, 0x95, 0x69, 0x79, + 0xc8, 0x8e, 0x34, 0x49, 0x5e, 0xa2, 0x57, 0x3f, 0x38, 0x48, 0x89, 0x79, + 0x90, 0x69, 0x73, 0xc0, 0xb0, 0xc1, 0x50, 0x36, 0x38, 0x47, 0x5b, 0x9d, + 0x40, 0x8b, 0xd3, 0xd4, 0x62, 0x56, 0x8c, 0xb6, 0x98, 0x73, 0x49, 0x77, + 0x68, 0x7d, 0x63, 0x97, 0x4c, 0xb0, 0x3b, 0xbb, 0xbf, 0xbb, 0xb1, 0x6e, + 0xc7, 0x41, 0x62, 0xd0, 0x52, 0x75, 0x57, 0x6a, 0xc7, 0x42, 0x45, 0x92, + 0xc5, 0x4c, 0xb0, 0x85, 0x53, 0x50, 0x46, 0x9e, 0x77, 0xd0, 0xc2, 0xa5, + 0xb0, 0x44, 0x9e, 0x54, 0x9f, 0x77, 0xc2, 0x95, 0x7d, 0xbf, 0x39, 0x57, + 0xc3, 0x53, 0xbb, 0x9c, 0xa1, 0x43, 0x5b, 0xc6, 0x86, 0xc9, 0xda, 0x9c, + 0x8d, 0x9d, 0x7b, 0x76, 0x77, 0x4a, 0x89, 0xa7, 0xd0, 0x72, 0x54, 0x59, + 0x7d, 0x64, 0xb1, 0x34, 0x56, 0x5a, 0x36, 0x3e, 0x34, 0xb5, 0xc5, 0xa4, + 0x5c, 0x7d, 0xa2, 0x64, 0x4e, 0xb9, 0x64, 0x50, 0x73, 0x79, 0x5e, 0x62, + 0x5f, 0xcd, 0xce, 0x71, 0x36, 0xaa, 0x81, 0xd1, 0x9e, 0x40, 0x48, 0x58, + 0xce, 0x8e, 0x4d, 0xa1, 0x7f, 0x6d, 0x8b, 0x2e, 0x3d, 0x86, 0x3e, 0xbf, + 0x95, 0xb9, 0xba, 0xb1, 0xb2, 0x52, 0x4e, 0x51, 0x6c, 0xcb, 0x73, 0x63, + 
0x7f, 0x8e, 0x58, 0x70, 0x30, 0x7d, 0x35, 0xb7, 0xc6, 0xca, 0xbc, 0x4c, + 0xab, 0x92, 0xc4, 0x4f, 0x7e, 0x7e, 0x97, 0x96, 0x4f, 0x58, 0x30, 0x56, + 0x7d, 0xc5, 0x50, 0x64, 0xb7, 0xb5, 0xb4, 0x9f, 0xa1, 0xbf, 0x3d, 0x8e, + 0x89, 0xc4, 0x77, 0x68, 0xbf, 0x3f, 0x60, 0xc7, 0xad, 0xca, 0x3a, 0x9e, + 0x6c, 0xc5, 0x90, 0x9e, 0x2e, 0x82, 0x53, 0x52, 0x4a, 0xa4, 0xab, 0x3c, + 0x62, 0x52, 0x86, 0x2d, 0xd4, 0xa1, 0x4d, 0x97, 0xbb, 0xb4, 0x64, 0xb8, + 0xa5, 0x39, 0xb8, 0xb7, 0x5c, 0xce, 0x5c, 0xa3, 0xb1, 0x88, 0x59, 0x37, + 0xa4, 0x55, 0x60, 0x93, 0x5a, 0xc5, 0x8b, 0x91, 0xb7, 0xc6, 0x48, 0x6b, + 0xcc, 0x8c, 0x66, 0xa0, 0x50, 0xc4, 0xa9, 0x70, 0x9c, 0x8f, 0x53, 0x9f, + 0x8b, 0xd3, 0x97, 0x9e, 0xbc, 0x3c, 0xd4, 0xa9, 0x8b, 0x55, 0xa2, 0x6a, + 0xb9, 0x69, 0xac, 0x41, 0x77, 0xbf, 0x3a, 0xb8, 0x66, 0xc1, 0xa4, 0x72, + 0x42, 0x6e, 0x69, 0xad, 0x52, 0x40, 0x3e, 0xa7, 0x46, 0x6f, 0xb1, 0x3d, + 0x89, 0xbd, 0x41, 0x37, 0x71, 0x42, 0x8f, 0xc0, 0x90, 0x81, 0x4e, 0x3d, + 0x7b, 0x64, 0x6c, 0x68, 0x77, 0xbf, 0x63, 0x63, 0x99, 0x38, 0x80, 0xc3, + 0xcb, 0xaa, 0x65, 0x32, 0xb8, 0xaa, 0x44, 0x37, 0x31, 0xa9, 0x86, 0xba, + 0xd4, 0xb1, 0xaf, 0xb9, 0x46, 0x2e, 0xba, 0x35, 0x29, 0xc3, 0x96, 0x4f, + 0x78, 0x75, 0x7f, 0x62, 0xb9, 0x84, 0x55, 0xb7, 0x81, 0x5a, 0x76, 0xb0, + 0x54, 0xbd, 0xc2, 0x57, 0x71, 0xb8, 0x88, 0x71, 0x32, 0x64, 0x37, 0x99, + 0xa5, 0x8a, 0x8d, 0x80, 0xc4, 0x54, 0x81, 0x43, 0x60, 0x64, 0x80, 0x96, + 0x93, 0x9c, 0x81, 0xae, 0x40, 0x36, 0x79, 0x4d, 0x88, 0x68, 0x48, 0xaa, + 0x65, 0x49, 0x3a, 0xae, 0xaf, 0x34, 0x32, 0x64, 0xa8, 0x31, 0x38, 0x3d, + 0xb5, 0x4f, 0x64, 0x66, 0xc8, 0x3b, 0xd1, 0xb9, 0x42, 0x8b, 0xc7, 0x65, + 0xa7, 0xbe, 0xa7, 0x5a, 0x2d, 0xc2, 0x38, 0x3a, 0x2d, 0xae, 0x54, 0xbd, + 0x39, 0x5b, 0x2f, 0xb3, 0xdb, 0xc8, 0x99, 0xbf, 0x7d, 0xb4, 0x7d, 0x63, + 0xc1, 0x84, 0x69, 0x57, 0x92, 0xbb, 0x99, 0x88, 0x54, 0x36, 0x5d, 0xb1, + 0xbe, 0xa9, 0x91, 0x93, 0x70, 0x75, 0x4f, 0x5d, 0x91, 0x68, 0xc1, 0x84, + 0xb0, 0xb8, 0x82, 0xb8, 0xa5, 0x5f, 0x71, 0x4c, 0x97, 0x58, 0x8f, 0x81, + 0x76, 0x58, 0x5d, 0xd2, 0x41, 0x5d, 0x7a, 0xb8, 0xae, 0xb8, 0x50, 0xa3, + 0xc3, 0xc7, 0x3c, 0xbe, 0x39, 0x7f, 0x57, 0x96, 0x99, 0x79, 0xdb, 0xa9, + 0xa6, 0x5f, 0xcf, 0xb2, 0x81, 0xca, 0x7f, 0x87, 0xc1, 0x4d, 0xb2, 0x7a, + 0x36, 0x81, 0x5c, 0xb1, 0x6a, 0x66, 0xa9, 0x99, 0x99, 0xba, 0x4b, 0x4f, + 0x38, 0xd7, 0x47, 0xc3, 0x55, 0x38, 0xbf, 0x84, 0x93, 0xcf, 0xb4, 0xa3, + 0x67, 0xa1, 0x87, 0x88, 0x71, 0x70, 0x7f, 0x36, 0x69, 0xa2, 0x65, 0x92, + 0xb7, 0xad, 0xb8, 0xad, 0xb5, 0x42, 0x92, 0x62, 0x8b, 0x3b, 0xbb, 0x32, + 0x8a, 0xd1, 0x74, 0x9a, 0x8f, 0x3a, 0x90, 0xdc, 0x8d, 0xd5, 0xca, 0x7b, + 0xc9, 0x65, 0x72, 0xbd, 0x56, 0xb0, 0x8f, 0xc5, 0x83, 0x99, 0x86, 0x55, + 0x2f, 0x96, 0xc0, 0xa2, 0xa2, 0x48, 0x33, 0xcb, 0x55, 0xb2, 0x72, 0x55, + 0x3f, 0xcb, 0xc7, 0x63, 0xa9, 0xb5, 0xac, 0x8b, 0x46, 0x41, 0x59, 0x94, + 0x96, 0xa2, 0x97, 0x41, 0xc2, 0x63, 0x6c, 0x56, 0x9f, 0x9d, 0xa1, 0x79, + 0xc3, 0x68, 0x72, 0xa6, 0xa5, 0x77, 0xd9, 0x8f, 0x9c, 0x9b, 0x55, 0x65, + 0x97, 0x8e, 0x87, 0xca, 0x98, 0x9c, 0x32, 0x55, 0x6b, 0x48, 0x71, 0xc9, + 0x96, 0x75, 0xcb, 0x3e, 0x47, 0x5a, 0xc1, 0x5e, 0x69, 0xda, 0x59, 0x83, + 0xce, 0xbd, 0xbc, 0x32, 0x34, 0x53, 0x7b, 0x98, 0x7d, 0x86, 0x71, 0x73, + 0x92, 0xab, 0x90, 0xa1, 0x61, 0x2e, 0x62, 0x93, 0x67, 0x50, 0xb0, 0xae, + 0x7c, 0x8b, 0x54, 0x5f, 0xa1, 0xc2, 0x62, 0x88, 0x8e, 0xc3, 0xb6, 0x48, + 0x44, 0x34, 0x66, 0x65, 0x5a, 0x6f, 0x6c, 0x39, 0x6f, 0xb9, 0xbb, 0x90, + 0x62, 0x65, 0xd6, 0x5f, 0xb2, 0x37, 0x71, 0x41, 0x75, 0x42, 0x6b, 0xa8, + 0x94, 0x86, 0xb3, 0xba, 0xad, 0x71, 0x43, 0x32, 0xb4, 0x51, 0x5d, 0xd9, + 
0x5c, 0x97, 0x7f, 0x49, 0xad, 0xa7, 0xbf, 0x86, 0x9a, 0xb3, 0x47, 0xbc, + 0x6a, 0x8a, 0x61, 0x33, 0xcb, 0x66, 0x89, 0x9d, 0xa3, 0x48, 0x89, 0xc7, + 0x5b, 0x47, 0x61, 0x7c, 0x77, 0x94, 0x96, 0x73, 0x40, 0xac, 0x83, 0x73, + 0x86, 0x3b, 0x4f, 0x6c, 0xa5, 0x40, 0x3b, 0x6e, 0xb0, 0xb9, 0xa0, 0x79, + 0xb6, 0x5f, 0x9a, 0xc0, 0x6b, 0x72, 0xa0, 0x54, 0xce, 0xa7, 0x77, 0x69, + 0xad, 0x7c, 0xbe, 0xa7, 0x60, 0xa1, 0xcf, 0xb4, 0x89, 0x65, 0x4f, 0x89, + 0x66, 0x88, 0x44, 0x8a, 0xcf, 0x54, 0x79, 0x81, 0xca, 0x75, 0x59, 0x6b, + 0x50, 0x4d, 0x69, 0xb6, 0x74, 0x9c, 0x9b, 0x33, 0xbe, 0x82, 0xce, 0x6a, + 0x5b, 0x98, 0x8c, 0x44, 0x5a, 0x6a, 0x46, 0x97, 0x8f, 0x94, 0xa6, 0x8b, + 0xc0, 0xc8, 0xbe, 0x79, 0x64, 0x4c, 0xc5, 0xbe, 0x43, 0xcd, 0xc5, 0xa0, + 0x99, 0x38, 0x72, 0xa7, 0x7e, 0xc1, 0x66, 0x62, 0xc7, 0x65, 0xbd, 0x73, + 0x6f, 0x82, 0x7a, 0xd8, 0xca, 0x76, 0x96, 0xa6, 0x6a, 0xd3, 0x43, 0x3c, + 0x95, 0xa0, 0x67, 0xc0, 0x63, 0x8e, 0xdd, 0x51, 0x5d, 0x9b, 0x81, 0x90, + 0xd3, 0xaf, 0x66, 0x78, 0x75, 0x4c, 0x8e, 0x96, 0xb1, 0x7c, 0x80, 0xa5, + 0x4e, 0xab, 0x54, 0x6f, 0xa3, 0xa2, 0x5c, 0x6b, 0x70, 0x59, 0x88, 0xc3, + 0x8b, 0x5f, 0xb7, 0xb5, 0x36, 0x41, 0x93, 0x41, 0x58, 0xb8, 0x54, 0x84, + 0x54, 0xb5, 0xc8, 0xc5, 0x5a, 0x8c, 0x66, 0x3b, 0x8e, 0x39, 0x4f, 0x6b, + 0xb0, 0x95, 0xa0, 0x54, 0x30, 0x83, 0xc2, 0x95, 0x4e, 0xd6, 0xb0, 0x50, + 0x67, 0x71, 0x4c, 0x8d, 0x67, 0xab, 0xaa, 0xad, 0x7b, 0xbd, 0x4a, 0x76, + 0x5f, 0x3a, 0x48, 0xc7, 0x75, 0x36, 0x63, 0xce, 0x75, 0x4c, 0x8a, 0x9a, + 0x72, 0x42, 0x5f, 0x4e, 0x35, 0x62, 0x34, 0x4c, 0xb9, 0x9c, 0x77, 0x84, + 0x72, 0x56, 0x60, 0xce, 0x75, 0xbd, 0x97, 0x52, 0x56, 0x54, 0x82, 0x7c, + 0xce, 0x8c, 0x77, 0x70, 0x9d, 0xd7, 0x81, 0xae, 0x9b, 0xbf, 0x9e, 0xaa, + 0x82, 0x4c, 0x36, 0xd1, 0xcb, 0x94, 0xd6, 0x77, 0x59, 0x4e, 0x84, 0x66, + 0x85, 0xa4, 0x89, 0xbd, 0xb3, 0xc9, 0x74, 0xa7, 0x4e, 0x32, 0x66, 0x62, + 0x3d, 0x8b, 0x7b, 0x9d, 0xc3, 0x42, 0x39, 0x74, 0x78, 0x8f, 0xa0, 0x45, + 0xa1, 0x47, 0xa0, 0x51, 0x9c, 0xaf, 0x97, 0x40, 0x5d, 0xc4, 0x57, 0xb9, + 0x63, 0xb0, 0xd0, 0x91, 0x63, 0xb4, 0x8f, 0xd7, 0x6c, 0x48, 0x8c, 0x6f, + 0xcd, 0x43, 0xbd, 0xca, 0xd5, 0x44, 0x54, 0xb5, 0xaa, 0x6c, 0x42, 0xcb, + 0xd7, 0x8a, 0x94, 0xb8, 0x52, 0x33, 0x85, 0x97, 0x9d, 0x77, 0xd0, 0xd5, + 0x8d, 0x98, 0x38, 0x78, 0x3c, 0xa8, 0x6c, 0xa2, 0x6f, 0x35, 0x3d, 0xc5, + 0x69, 0x4d, 0x39, 0xbb, 0x3b, 0xb0, 0xa3, 0xc0, 0xbf, 0xd2, 0x75, 0x6f, + 0xa3, 0x9e, 0x8a, 0x69, 0x5e, 0xd2, 0x57, 0x6a, 0x81, 0xd6, 0x81, 0x42, + 0x56, 0x8e, 0x38, 0x56, 0x4d, 0xc8, 0x50, 0xa3, 0x88, 0x87, 0x57, 0xb0, + 0x86, 0x42, 0x49, 0x72, 0x58, 0xd4, 0x7b, 0x42, 0xcc, 0xd0, 0xab, 0xb6, + 0xca, 0x71, 0x9a, 0x8a, 0x32, 0x4f, 0x7f, 0xb6, 0x9a, 0x85, 0x87, 0x29, + 0x6d, 0x76, 0x83, 0xc8, 0x32, 0x5d, 0x79, 0x3e, 0xd4, 0xa7, 0xc0, 0xd1, + 0xbb, 0x69, 0xba, 0x5d, 0xa7, 0xc2, 0x51, 0x34, 0x5e, 0x5c, 0x66, 0x8e, + 0x79, 0xab, 0x41, 0x78, 0x5c, 0x3e, 0xa9, 0xcb, 0x7e, 0x57, 0xab, 0xab, + 0xa0, 0xac, 0x8f, 0x87, 0x3a, 0x3f, 0xa0, 0xb8, 0xbd, 0xbb, 0xc0, 0x93, + 0x5e, 0xb9, 0x3e, 0xc6, 0x8d, 0x40, 0x4e, 0x62, 0xc7, 0x41, 0x7a, 0xca, + 0x76, 0xa7, 0x75, 0xaa, 0x8b, 0x64, 0x4c, 0xc2, 0x85, 0x69, 0x73, 0xe7, + 0x86, 0x8e, 0x32, 0xb8, 0xbc, 0x3f, 0x7f, 0xc3, 0x7b, 0x9a, 0x79, 0xa8, + 0xa1, 0x63, 0x2e, 0xb1, 0x9f, 0x77, 0xd4, 0x94, 0x4b, 0x8f, 0x50, 0x5d, + 0xcc, 0x85, 0xa2, 0xb0, 0x81, 0x90, 0x6e, 0x5c, 0x68, 0x42, 0x6b, 0x67, + 0xb4, 0xa2, 0x66, 0x66, 0x7c, 0xca, 0x45, 0x91, 0xac, 0xc5, 0x8d, 0xd1, + 0x77, 0x6b, 0xb4, 0xa0, 0x68, 0x49, 0x9e, 0x3f, 0x61, 0x35, 0x51, 0x5f, + 0x73, 0x7d, 0x72, 0x99, 0x65, 0xc9, 0x6a, 0x63, 0xd3, 0xc4, 0x62, 0x76, + 
0xbf, 0xcb, 0xb7, 0xd1, 0x6c, 0xa6, 0x7e, 0x55, 0x33, 0x3c, 0x85, 0xca, + 0x98, 0x56, 0x5f, 0x6a, 0x34, 0x91, 0x62, 0x39, 0x87, 0x35, 0xb1, 0xbe, + 0x72, 0x7f, 0x4e, 0xa3, 0x76, 0x81, 0xd7, 0xa1, 0x82, 0x4a, 0x6a, 0x67, + 0x51, 0x6b, 0x3d, 0xc5, 0xc9, 0x3e, 0x8f, 0x90, 0x6a, 0x7c, 0xcb, 0xb2, + 0xa3, 0x3c, 0xa4, 0xd5, 0x95, 0xac, 0x3b, 0x98, 0xb6, 0x99, 0x4a, 0x71, + 0xb0, 0xd4, 0xaa, 0x5b, 0x45, 0x6f, 0xaf, 0xb3, 0x54, 0x91, 0xd3, 0x8d, + 0x7d, 0xcc, 0x5a, 0xa8, 0x95, 0xd7, 0xd7, 0x4d, 0x96, 0xad, 0x83, 0x8a, + 0x82, 0xd1, 0x72, 0x89, 0x35, 0x75, 0x86, 0xb8, 0xc6, 0xaa, 0x33, 0xae, + 0x9a, 0x65, 0x59, 0x92, 0x60, 0xb6, 0x84, 0x44, 0x9a, 0x49, 0xad, 0xb3, + 0x51, 0x33, 0x83, 0x46, 0x45, 0x7f, 0xab, 0xba, 0xc2, 0x8f, 0x6b, 0x6e, + 0xcf, 0x7e, 0xb6, 0xbc, 0x40, 0x7e, 0xd0, 0xbc, 0xcf, 0x41, 0x47, 0xca, + 0x78, 0x64, 0x62, 0xce, 0x7e, 0xaf, 0xb0, 0x43, 0x8d, 0x9e, 0xba, 0x59, + 0x80, 0x49, 0x2d, 0xd0, 0xc8, 0x74, 0x3d, 0x3f, 0x9a, 0xa3, 0x41, 0x74, + 0x86, 0xac, 0x93, 0x34, 0xab, 0xcb, 0x4e, 0x33, 0x85, 0xc9, 0xb2, 0x87, + 0xa1, 0x6d, 0xca, 0xac, 0x74, 0xb6, 0xce, 0x7f, 0x83, 0x71, 0xb9, 0x88, + 0xa3, 0xd0, 0x7a, 0x7b, 0xa4, 0x32, 0x79, 0x52, 0x65, 0x7b, 0xc0, 0x97, + 0x42, 0x56, 0x35, 0xae, 0x3d, 0x4d, 0xad, 0x45, 0x8e, 0x45, 0x71, 0x49, + 0x9b, 0x58, 0x6e, 0x8b, 0x78, 0xc4, 0x5e, 0x5f, 0x6b, 0x5c, 0x34, 0xbb, + 0xa8, 0x6b, 0x58, 0x42, 0xc3, 0xac, 0x89, 0x94, 0x51, 0xd2, 0x6b, 0x6c, + 0x72, 0x82, 0xc8, 0xa9, 0x78, 0x72, 0x59, 0x75, 0x49, 0xd5, 0x8c, 0xbf, + 0xb1, 0x73, 0xc1, 0xa8, 0xa4, 0x38, 0x86, 0x89, 0xb5, 0x55, 0xb6, 0x9a, + 0xa1, 0x91, 0x46, 0x69, 0x3a, 0xad, 0xaf, 0x90, 0xb7, 0x56, 0xa7, 0xb7, + 0x61, 0x40, 0xb3, 0x38, 0x97, 0xa1, 0xab, 0xc3, 0x74, 0xa5, 0x5a, 0x8a, + 0x8c, 0xc9, 0xcd, 0xd2, 0x70, 0x5a, 0xc0, 0x2f, 0x9c, 0xc1, 0x6f, 0x55, + 0x6a, 0xc1, 0xae, 0xba, 0xbb, 0xd0, 0x62, 0x35, 0x72, 0xbd, 0x58, 0x4b, + 0x41, 0x73, 0x84, 0xaa, 0x7d, 0xb3, 0x5d, 0x54, 0xbe, 0x82, 0xae, 0x6c, + 0xac, 0xba, 0x2e, 0x80, 0x5d, 0xb3, 0x32, 0x44, 0x4d, 0x75, 0x42, 0x53, + 0x96, 0x8b, 0xb2, 0xc2, 0x99, 0x9c, 0x60, 0xba, 0x4f, 0xa3, 0x62, 0xd5, + 0x46, 0x7c, 0x68, 0x4a, 0x7b, 0xb3, 0x46, 0x69, 0x40, 0xc6, 0xbe, 0x4d, + 0x74, 0x53, 0x65, 0x36, 0x7b, 0x35, 0x3a, 0xb9, 0xbf, 0xc7, 0x53, 0xb9, + 0xa7, 0xa9, 0x89, 0x3b, 0x63, 0x4b, 0xb1, 0x72, 0xc9, 0xa1, 0x38, 0x74, + 0x9d, 0x50, 0x68, 0xa5, 0x50, 0xa8, 0x66, 0xad, 0x66, 0x94, 0x5c, 0x31, + 0x5e, 0x60, 0x6c, 0x88, 0x74, 0x91, 0xb1, 0x45, 0x79, 0x47, 0xb4, 0x66, + 0x53, 0xaf, 0xb6, 0x8e, 0x67, 0x2a, 0x7f, 0x7e, 0x48, 0xb9, 0x64, 0x7b, + 0x3a, 0xc2, 0xad, 0x9e, 0x9c, 0xa4, 0xc4, 0xae, 0x45, 0x5e, 0xbb, 0x70, + 0x76, 0xa6, 0x5e, 0x6b, 0x77, 0xb0, 0xc8, 0x46, 0x66, 0x5e, 0x9e, 0x63, + 0x49, 0x78, 0xc2, 0x62, 0x72, 0xb9, 0x81, 0x54, 0x3d, 0xb8, 0x79, 0xae, + 0x66, 0xbb, 0xb3, 0xc3, 0x67, 0x65, 0x3a, 0x4c, 0x34, 0xa4, 0x72, 0x48, + 0x79, 0x80, 0x9d, 0xa2, 0xb5, 0xb8, 0x3e, 0x45, 0x8d, 0xb1, 0xc1, 0x31, + 0xc5, 0x50, 0x36, 0x75, 0x63, 0x57, 0x64, 0x45, 0xc8, 0xa9, 0xa8, 0x4d, + 0x36, 0x9b, 0x85, 0x88, 0xca, 0xaa, 0xb9, 0x6c, 0x3f, 0x80, 0x70, 0x38, + 0xd3, 0x70, 0xab, 0xbb, 0x71, 0xad, 0x52, 0x9d, 0xb3, 0x99, 0x44, 0x58, + 0xae, 0x59, 0xa3, 0xc6, 0x55, 0x3a, 0x69, 0x49, 0x34, 0x4f, 0x60, 0x69, + 0x54, 0x9c, 0x9a, 0xad, 0x8c, 0x88, 0xb4, 0xd2, 0x4d, 0x2c, 0xcd, 0x94, + 0x85, 0xbb, 0x98, 0xbe, 0x8f, 0x72, 0x6e, 0x62, 0x66, 0x6e, 0xd6, 0x88, + 0x92, 0x9c, 0xcc, 0xd7, 0x49, 0xc1, 0x83, 0xa1, 0x64, 0x91, 0x81, 0x55, + 0xce, 0x94, 0xd0, 0xa4, 0x55, 0x97, 0xb6, 0x5b, 0xa0, 0x79, 0x45, 0xbb, + 0xcf, 0x83, 0xb2, 0x64, 0x4f, 0x71, 0xcb, 0x8b, 0x8d, 0x6e, 0xcd, 0xba, + 
0x54, 0x46, 0x9f, 0x87, 0xb6, 0x3f, 0x5b, 0x83, 0x9b, 0x55, 0x77, 0x5c, + 0xcd, 0x62, 0x5b, 0xbb, 0xc8, 0xc4, 0xc8, 0x72, 0x93, 0xbb, 0xc9, 0x44, + 0xc5, 0x3f, 0x3e, 0x44, 0x9a, 0x7b, 0x45, 0x39, 0x70, 0x90, 0x4c, 0xa8, + 0x50, 0x8c, 0xd4, 0x73, 0x99, 0x79, 0xb9, 0x78, 0x86, 0x49, 0x50, 0x54, + 0x4c, 0x44, 0x4b, 0xc7, 0x4e, 0x3a, 0x75, 0xbe, 0x78, 0x81, 0x7c, 0x4e, + 0x2c, 0x70, 0x3d, 0x79, 0xa7, 0x65, 0xca, 0xa2, 0x3e, 0x59, 0x6a, 0x64, + 0x53, 0xc6, 0x98, 0x37, 0x68, 0x54, 0xb6, 0x4a, 0x91, 0x76, 0x60, 0x97, + 0xb4, 0x44, 0x5c, 0x9b, 0x47, 0x3e, 0xa3, 0x8d, 0xc4, 0x98, 0xc0, 0xbf, + 0xce, 0x89, 0x6d, 0x96, 0x58, 0x92, 0xa0, 0x54, 0xc6, 0xa4, 0x4f, 0x32, + 0xb4, 0x3f, 0xbd, 0xb2, 0x33, 0x3b, 0xc3, 0xbf, 0x58, 0x89, 0x9b, 0xc7, + 0x8a, 0xc1, 0xaa, 0x5a, 0x3e, 0x8b, 0x3a, 0x44, 0x3d, 0xa1, 0xc6, 0x9c, + 0xcd, 0xb9, 0x8f, 0x3b, 0xa7, 0x88, 0xa5, 0x73, 0xc1, 0x91, 0xc2, 0x75, + 0x37, 0xb1, 0x9b, 0xdb, 0x5e, 0x40, 0x38, 0xa5, 0x43, 0x43, 0x60, 0x45, + 0x70, 0x43, 0xd4, 0x42, 0x77, 0x9a, 0x6a, 0xca, 0x9f, 0x58, 0x51, 0x31, + 0x50, 0x6b, 0xa4, 0xbc, 0xa9, 0xc4, 0x51, 0x69, 0xa2, 0x89, 0x69, 0x92, + 0x83, 0x47, 0x6b, 0x40, 0xbf, 0xc6, 0xa2, 0x56, 0x6c, 0xc7, 0xaf, 0x56, + 0x46, 0x64, 0xb5, 0x7d, 0xb0, 0xbb, 0xb0, 0xba, 0xb7, 0x59, 0xc6, 0x5d, + 0xc0, 0xbf, 0xc6, 0x71, 0xb2, 0xc1, 0x44, 0xc8, 0x77, 0x3e, 0xcf, 0xa2, + 0xd1, 0x9c, 0xc7, 0x64, 0xae, 0x6f, 0x98, 0xb0, 0x47, 0xa7, 0xc1, 0x3a, + 0x6f, 0x73, 0xaf, 0x9f, 0x40, 0x72, 0xc4, 0x4d, 0x8e, 0xd2, 0xaf, 0xca, + 0x79, 0x63, 0xbd, 0x8f, 0x61, 0x8f, 0x8c, 0x70, 0xb5, 0x9e, 0xbf, 0x70, + 0x74, 0x71, 0xb9, 0x9d, 0x94, 0x76, 0xb3, 0x8b, 0xc1, 0x4d, 0xc3, 0x74, + 0x7a, 0x9a, 0xc1, 0xc8, 0x8d, 0x56, 0x7d, 0x88, 0x95, 0x75, 0x6e, 0x46, + 0xb2, 0x4c, 0xaf, 0xaf, 0x71, 0xac, 0x72, 0xd5, 0xb4, 0xc7, 0x7c, 0x66, + 0x66, 0x69, 0x48, 0x84, 0x6a, 0xaa, 0x8c, 0x74, 0xc1, 0x4c, 0x75, 0x3a, + 0xc5, 0xab, 0x8e, 0x6f, 0x60, 0x56, 0x96, 0xb3, 0x69, 0x6f, 0x86, 0xc9, + 0xcc, 0x57, 0xd1, 0x4d, 0x6c, 0xb2, 0x70, 0x95, 0x86, 0xcb, 0x90, 0x3c, + 0x81, 0xbe, 0x96, 0xb2, 0x36, 0x27, 0xa1, 0x5c, 0xb3, 0x4e, 0x93, 0xa8, + 0x85, 0x5b, 0x81, 0xa2, 0x86, 0xc8, 0xa8, 0xd8, 0xba, 0xcd, 0x7a, 0xac, + 0xae, 0x87, 0x34, 0x5c, 0x44, 0x8f, 0xa2, 0x61, 0x8e, 0x73, 0xb1, 0x99, + 0x2f, 0x76, 0xd4, 0x9c, 0xca, 0xb5, 0x96, 0x6d, 0xc5, 0x58, 0x7b, 0x4c, + 0x4c, 0xc4, 0x5e, 0x35, 0x91, 0x56, 0xba, 0x39, 0x40, 0x8c, 0x97, 0x4a, + 0x89, 0x84, 0xd2, 0xa4, 0x74, 0xcd, 0x7d, 0x9a, 0x9c, 0x89, 0x48, 0x37, + 0x64, 0x5d, 0x90, 0x99, 0xb4, 0x81, 0x90, 0x74, 0x53, 0x86, 0xb5, 0x98, + 0x73, 0x42, 0x47, 0xc7, 0x4f, 0x44, 0x52, 0xb3, 0xba, 0x72, 0xce, 0xcf, + 0x4d, 0x97, 0x6d, 0x70, 0xac, 0xb5, 0x48, 0x41, 0x96, 0x67, 0xae, 0xd8, + 0x9e, 0xc3, 0xa3, 0x38, 0x95, 0xb2, 0xc1, 0x9d, 0x47, 0x9a, 0xb6, 0x98, + 0x50, 0x91, 0x4b, 0xd7, 0x86, 0xb3, 0x6c, 0x35, 0x9c, 0x3f, 0xcd, 0xa3, + 0x56, 0x4e, 0x35, 0x6f, 0xb0, 0x41, 0x82, 0x33, 0xab, 0x8a, 0x5b, 0x77, + 0x94, 0xb8, 0x6a, 0x3c, 0x82, 0x9b, 0x45, 0xc6, 0xa6, 0xb9, 0xa0, 0xc4, + 0xa6, 0x45, 0x88, 0xab, 0x4b, 0x7d, 0x87, 0x80, 0x87, 0xca, 0xb6, 0x85, + 0xb3, 0xa3, 0xc5, 0x4f, 0x40, 0x92, 0x5c, 0xb2, 0x75, 0xd1, 0xd3, 0xd2, + 0x6a, 0xd0, 0xb7, 0xcb, 0xa7, 0x54, 0x71, 0xcf, 0x7e, 0x60, 0x83, 0x82, + 0x6b, 0x7c, 0xb3, 0x99, 0x76, 0x48, 0x78, 0x49, 0xc0, 0xca, 0x9e, 0xa2, + 0xaa, 0x86, 0x6c, 0x8e, 0xa8, 0x45, 0x54, 0x7f, 0x91, 0xcb, 0x99, 0x4e, + 0x52, 0x49, 0x7f, 0x5d, 0x40, 0x97, 0x30, 0x72, 0x97, 0xa3, 0x4e, 0x96, + 0x8a, 0xb9, 0x9e, 0x32, 0xbd, 0x77, 0x9c, 0x74, 0xab, 0x53, 0xbd, 0x53, + 0xa2, 0x79, 0x6e, 0x68, 0x8d, 0x61, 0xab, 0x42, 0x83, 0x46, 0x88, 0xbe, + 
0xb1, 0x84, 0x76, 0x5f, 0xbc, 0x9c, 0x3e, 0x95, 0x9f, 0x8c, 0x6c, 0x42, + 0x8c, 0x5b, 0x82, 0xa4, 0x29, 0xab, 0xca, 0x38, 0xb5, 0xbb, 0x36, 0xc2, + 0x9e, 0x5a, 0x99, 0x85, 0xad, 0xd1, 0xa7, 0x35, 0xac, 0xb2, 0x98, 0x80, + 0x5a, 0x3e, 0xa3, 0xa5, 0x5b, 0xc1, 0x52, 0x4d, 0xa2, 0x7d, 0x9b, 0x9c, + 0x81, 0x54, 0x4c, 0x8f, 0x80, 0x78, 0x4a, 0x7c, 0x59, 0x5e, 0xb4, 0x53, + 0x99, 0x6a, 0x8f, 0xa1, 0x2d, 0x47, 0xb6, 0xaa, 0x40, 0x49, 0x47, 0x93, + 0x54, 0xa8, 0xb3, 0x46, 0xcd, 0xbe, 0x7b, 0xa1, 0x4d, 0x95, 0x45, 0xc1, + 0x73, 0x3d, 0x66, 0x80, 0x69, 0x42, 0xb0, 0x91, 0x78, 0x44, 0x67, 0x4c, + 0x43, 0xd4, 0xc5, 0x86, 0xc0, 0x4a, 0xb3, 0xcc, 0xb1, 0xb6, 0x47, 0xb8, + 0xc9, 0x6d, 0x45, 0xbb, 0x97, 0x54, 0xcf, 0xab, 0xa8, 0xbc, 0x2f, 0x91, + 0x7d, 0x3e, 0x97, 0xb5, 0x78, 0x50, 0x3a, 0xcf, 0x97, 0x43, 0x2b, 0x3e, + 0x7f, 0x95, 0x8a, 0x63, 0x5e, 0xac, 0xa1, 0x57, 0x53, 0x84, 0x42, 0xb7, + 0x7a, 0x4f, 0x72, 0x39, 0x7e, 0xc2, 0x91, 0x5d, 0xa1, 0x2c, 0x65, 0xd3, + 0xce, 0xb0, 0x60, 0x7d, 0x82, 0x91, 0x76, 0x73, 0x3d, 0xc6, 0xbb, 0x40, + 0x9b, 0x6d, 0xbc, 0xc3, 0x44, 0xb3, 0x65, 0xcb, 0xe3, 0xb8, 0x80, 0x91, + 0xba, 0x44, 0x98, 0xb5, 0xcb, 0x3a, 0x36, 0x6d, 0xa7, 0x88, 0xac, 0x7d, + 0xe1, 0xc2, 0x3d, 0xd9, 0xbe, 0xcb, 0x86, 0x7a, 0x89, 0x3a, 0xa7, 0xa1, + 0xc3, 0xb8, 0x41, 0xa3, 0x97, 0xbc, 0xe5, 0x9b, 0xca, 0x41, 0x66, 0xb3, + 0x6f, 0xa8, 0x80, 0x9c, 0xc0, 0x6d, 0x5c, 0x5e, 0x55, 0xac, 0xde, 0x4b, + 0x6d, 0x3e, 0x3d, 0xb4, 0x67, 0x7d, 0xc2, 0x45, 0x9f, 0x91, 0x47, 0xb3, + 0x71, 0x67, 0xc3, 0xb7, 0xcd, 0x30, 0x6c, 0xbd, 0x70, 0xb4, 0x5c, 0x35, + 0x71, 0xcb, 0x55, 0x4f, 0x44, 0x6f, 0x32, 0x3d, 0x8a, 0x31, 0x39, 0x96, + 0x94, 0xbb, 0x6d, 0x79, 0x94, 0x7d, 0x92, 0xb1, 0x69, 0x3e, 0x67, 0x62, + 0xa3, 0x70, 0x4a, 0x99, 0x86, 0x68, 0x62, 0x64, 0x78, 0x2f, 0xc7, 0x8a, + 0xca, 0x4d, 0xcb, 0x33, 0x97, 0x9e, 0x4d, 0x3c, 0x3d, 0x61, 0x85, 0x9f, + 0x6a, 0x67, 0x31, 0xb5, 0x81, 0x9b, 0xc2, 0x52, 0xaf, 0xa8, 0xbe, 0x59, + 0x38, 0x85, 0x87, 0x8e, 0x87, 0x7f, 0x5a, 0xb1, 0x57, 0xc5, 0xba, 0x6d, + 0xc4, 0x3f, 0x3f, 0x65, 0x36, 0xb0, 0x7c, 0xb9, 0xad, 0x46, 0x7e, 0x8c, + 0x49, 0x5d, 0x62, 0x8b, 0x5b, 0xa4, 0x34, 0xd1, 0x93, 0x8e, 0x58, 0x55, + 0x85, 0x94, 0x6f, 0xb9, 0xb4, 0xd3, 0x78, 0x41, 0xa0, 0x4c, 0x78, 0x3d, + 0x6a, 0x7c, 0xa3, 0x82, 0xa5, 0xb1, 0x4d, 0x64, 0x87, 0x84, 0x2e, 0xa1, + 0x8c, 0x94, 0xcc, 0x64, 0x5e, 0x92, 0x4f, 0xab, 0x32, 0x66, 0x68, 0xc3, + 0x92, 0xbb, 0xa5, 0xa9, 0x2e, 0x38, 0x82, 0x45, 0xc6, 0x68, 0x40, 0x34, + 0x6a, 0x72, 0x5c, 0x85, 0xb5, 0xd6, 0xd4, 0x4c, 0x94, 0x91, 0x57, 0x70, + 0x65, 0x93, 0xa6, 0x86, 0x5d, 0x8c, 0xc3, 0xc8, 0xc3, 0xaf, 0x95, 0x3b, + 0x40, 0x53, 0x70, 0x7d, 0x3f, 0x62, 0x5e, 0xbb, 0x48, 0xad, 0x8d, 0x73, + 0x4e, 0x43, 0x84, 0x48, 0xc4, 0xb0, 0xcc, 0x49, 0x91, 0xbc, 0x51, 0x4f, + 0x83, 0x8f, 0x66, 0x74, 0x7c, 0x9f, 0x6f, 0x6f, 0xa0, 0x3b, 0xb7, 0xae, + 0x55, 0x46, 0xac, 0xab, 0xb3, 0xa5, 0x53, 0x8e, 0xb3, 0xc5, 0x70, 0xc2, + 0xbb, 0x9c, 0x57, 0xac, 0xc1, 0x6f, 0x99, 0x55, 0xd3, 0x88, 0x6f, 0xc7, + 0x3c, 0x7f, 0x67, 0xb6, 0xba, 0x7d, 0x7f, 0xa0, 0x65, 0x34, 0x4b, 0x6e, + 0x3c, 0x96, 0x6c, 0xa7, 0x45, 0xa5, 0x37, 0x48, 0x40, 0xbf, 0x92, 0x98, + 0xd1, 0xa3, 0xa4, 0x83, 0x67, 0xc8, 0xb5, 0xb6, 0x5b, 0x8a, 0x31, 0x6d, + 0xa5, 0x55, 0x77, 0x9d, 0xd7, 0xab, 0x7f, 0x4d, 0xa0, 0x45, 0x3e, 0x5d, + 0x60, 0x9d, 0x3c, 0x7e, 0x9e, 0x46, 0x84, 0x88, 0x89, 0x78, 0x67, 0x38, + 0x6e, 0x91, 0xaf, 0x63, 0x6f, 0x2e, 0x9f, 0x76, 0x49, 0xbd, 0x6e, 0xb3, + 0xbb, 0x80, 0xb6, 0x8d, 0x6b, 0xc3, 0x9c, 0xd5, 0x9e, 0xbc, 0x5c, 0x4c, + 0xc3, 0x63, 0xc7, 0x40, 0xb8, 0xc9, 0xb2, 0x67, 0xbb, 0x6b, 0x8c, 0xa5, + 
0x87, 0x4f, 0x6c, 0x4c, 0x55, 0x83, 0x62, 0x8e, 0xa8, 0xb8, 0x5e, 0x4a, + 0x81, 0x3a, 0x54, 0xcb, 0x3b, 0x7f, 0x5f, 0xa4, 0x46, 0x55, 0xae, 0xc1, + 0xb8, 0xa8, 0xc2, 0xaf, 0x60, 0xae, 0x89, 0xae, 0x8f, 0x3f, 0x3e, 0xac, + 0x5e, 0x3f, 0x75, 0x62, 0x8c, 0xaa, 0xb1, 0xa3, 0x81, 0x3f, 0x57, 0xc9, + 0xac, 0x6f, 0xad, 0x73, 0x99, 0x53, 0x58, 0xa3, 0x37, 0x59, 0x61, 0xcc, + 0xa6, 0x9a, 0x69, 0x68, 0xa5, 0x63, 0xb8, 0xa2, 0x76, 0xbd, 0x51, 0x46, + 0xa8, 0x82, 0x78, 0x91, 0x5e, 0xb7, 0x71, 0x9e, 0xbb, 0x79, 0x9f, 0x95, + 0xd5, 0x57, 0xae, 0x36, 0x54, 0xa3, 0x41, 0x91, 0x91, 0x56, 0x5c, 0xa9, + 0x89, 0x97, 0x5c, 0xcc, 0x70, 0x8f, 0x61, 0x53, 0xa4, 0x5c, 0xbf, 0x9f, + 0x8e, 0x39, 0x67, 0x3f, 0x93, 0x4f, 0x45, 0xbe, 0x64, 0x82, 0x7f, 0x44, + 0x56, 0x7a, 0x69, 0xb9, 0x8c, 0x6f, 0xbb, 0xc2, 0x5a, 0xc0, 0x67, 0x6f, + 0x39, 0x3d, 0xbb, 0x9d, 0x2e, 0xd2, 0xac, 0x5c, 0x38, 0xb6, 0xbd, 0x37, + 0x95, 0x94, 0x79, 0x5c, 0xa9, 0x4b, 0x70, 0x6b, 0xa3, 0x41, 0x65, 0x32, + 0x3f, 0xa1, 0x30, 0x64, 0xb4, 0x7f, 0x54, 0x87, 0xa1, 0x74, 0x6e, 0x83, + 0x5d, 0xa0, 0x6c, 0xb7, 0x8c, 0xcd, 0x49, 0x5c, 0x64, 0xbf, 0x41, 0x4b, + 0x5d, 0xb0, 0x7e, 0xad, 0xb2, 0x6a, 0xab, 0x67, 0xb3, 0x3d, 0x4d, 0x80, + 0x58, 0x3c, 0xc4, 0x55, 0x95, 0xa3, 0xbb, 0xb7, 0x97, 0xc2, 0x82, 0x8b, + 0xb3, 0x3a, 0x6c, 0xc4, 0xb5, 0x3a, 0x43, 0xc1, 0x6e, 0xae, 0x9f, 0x31, + 0xc5, 0x96, 0x71, 0x4f, 0x90, 0x53, 0xce, 0xcb, 0x4a, 0xc0, 0x93, 0x6a, + 0x95, 0x9f, 0x66, 0x88, 0xb5, 0xab, 0x52, 0xc5, 0x5a, 0x74, 0x6c, 0x44, + 0x76, 0xcc, 0x7d, 0x88, 0x5d, 0xcc, 0x85, 0x79, 0xd7, 0x48, 0x61, 0x3a, + 0x50, 0x58, 0x8f, 0x81, 0x6d, 0xcf, 0xb9, 0xc4, 0x7f, 0x92, 0xc3, 0x52, + 0x52, 0x97, 0x42, 0x40, 0x5f, 0x36, 0xb1, 0xa3, 0xcf, 0x4c, 0x67, 0xbc, + 0x44, 0x76, 0xc5, 0x5f, 0xa1, 0x79, 0x89, 0x73, 0x99, 0x7e, 0xa0, 0x6e, + 0xcb, 0x3e, 0x89, 0x3c, 0x41, 0x6e, 0x63, 0xb2, 0x52, 0x90, 0x65, 0xb2, + 0x60, 0xa7, 0x7e, 0xaa, 0xba, 0xb6, 0xa5, 0xd3, 0xcd, 0xc3, 0xc2, 0x56, + 0x58, 0x5e, 0x69, 0x6e, 0x3b, 0x96, 0x3f, 0x3b, 0xa2, 0x69, 0x9f, 0x3f, + 0xbd, 0x8a, 0x5b, 0x94, 0x89, 0xc3, 0x96, 0xcc, 0xa5, 0x37, 0xa0, 0x95, + 0xce, 0xb6, 0x8d, 0x65, 0x64, 0xa6, 0x49, 0x8a, 0x46, 0x3b, 0x93, 0xc1, + 0xc3, 0xb1, 0x3c, 0x80, 0x35, 0x3a, 0x8d, 0xb3, 0x99, 0x65, 0xbb, 0x9c, + 0xa5, 0x5d, 0x3a, 0x4d, 0x51, 0x31, 0x36, 0x61, 0x94, 0xa1, 0x3e, 0x42, + 0xa7, 0x46, 0xcd, 0xcb, 0x72, 0x9b, 0x9a, 0x9d, 0xc8, 0xc1, 0x83, 0x7c, + 0x9e, 0x94, 0x70, 0x87, 0xa7, 0xad, 0xd1, 0x7d, 0x43, 0x84, 0x7c, 0xb4, + 0x38, 0xba, 0xcd, 0x52, 0x47, 0x75, 0xcb, 0xa5, 0xaf, 0x68, 0x37, 0x9a, + 0x7d, 0x49, 0xbf, 0x66, 0x57, 0x9a, 0x7a, 0xc1, 0x57, 0x83, 0x4d, 0x38, + 0x55, 0x4b, 0xb7, 0x6a, 0x91, 0x66, 0x9e, 0x9f, 0xc2, 0x33, 0x7c, 0xa0, + 0x48, 0x3c, 0xc0, 0x94, 0x94, 0xad, 0xa8, 0xad, 0xa3, 0x54, 0x48, 0x7b, + 0xb2, 0xc4, 0x9b, 0x7f, 0x42, 0x9c, 0x50, 0x8a, 0xc2, 0xb9, 0x76, 0x85, + 0x58, 0xa2, 0x53, 0x4f, 0x84, 0xaa, 0xd1, 0x51, 0x64, 0x6a, 0x62, 0x7d, + 0x4c, 0x38, 0xc9, 0x9f, 0x71, 0x3d, 0x69, 0xb4, 0xb4, 0xa6, 0x46, 0x67, + 0xc7, 0xd0, 0x59, 0xce, 0xbb, 0xd2, 0xb2, 0x4d, 0xaf, 0x5b, 0xc0, 0x4f, + 0x4e, 0x3f, 0x80, 0xce, 0x4b, 0xba, 0x6a, 0x7e, 0x92, 0x42, 0x42, 0x4a, + 0xb5, 0x7c, 0x5a, 0x61, 0xa2, 0x6b, 0xa3, 0x36, 0x51, 0x67, 0x4a, 0x9d, + 0x82, 0x98, 0x99, 0xaa, 0x55, 0x3e, 0xa3, 0xba, 0x33, 0x8c, 0x40, 0x79, + 0x5b, 0x60, 0xc8, 0xcd, 0x6b, 0x37, 0x98, 0xa4, 0xbb, 0x86, 0xb6, 0xc2, + 0x30, 0xc1, 0x37, 0xc5, 0xc9, 0xb7, 0xb9, 0xc1, 0x37, 0x47, 0x55, 0x6a, + 0xc0, 0x6a, 0x75, 0x7b, 0xb4, 0x74, 0x59, 0xca, 0x76, 0xb5, 0x6d, 0xb1, + 0xbe, 0xb8, 0x53, 0x63, 0xa8, 0x96, 0xc1, 0xad, 0x80, 0x52, 0x70, 0x8f, + 
0xb2, 0x66, 0x67, 0x79, 0xab, 0x51, 0xa7, 0x70, 0x39, 0x47, 0x68, 0xbc, + 0xa7, 0x93, 0x7d, 0x90, 0x87, 0x8c, 0x47, 0x66, 0x3d, 0x50, 0x92, 0x7b, + 0x6f, 0xc5, 0x9e, 0x98, 0x4f, 0xae, 0x56, 0x50, 0x5f, 0xaf, 0x63, 0x7c, + 0x94, 0x37, 0x8a, 0x62, 0xb0, 0xac, 0xb6, 0xaf, 0x74, 0x9b, 0x7b, 0xa1, + 0xae, 0x9e, 0xac, 0xce, 0x5e, 0x4e, 0x7f, 0x7f, 0x61, 0x5c, 0x56, 0x89, + 0x6e, 0x3a, 0x5b, 0x7f, 0x92, 0x7b, 0xbe, 0xad, 0xa1, 0x88, 0x6d, 0x9c, + 0x7b, 0xa9, 0x28, 0x84, 0x7f, 0x7f, 0xb9, 0xb5, 0x3d, 0x3c, 0x68, 0x7f, + 0xb8, 0xc9, 0x53, 0xc4, 0x65, 0x76, 0x9a, 0xbb, 0x3b, 0xba, 0xc0, 0x3e, + 0x50, 0x9a, 0x7d, 0x2d, 0x4e, 0x77, 0xd6, 0xa4, 0x48, 0x99, 0x83, 0x7f, + 0x8d, 0xa4, 0x4f, 0xa1, 0x6d, 0x82, 0x42, 0xa1, 0x59, 0x31, 0x55, 0xd8, + 0xcf, 0xac, 0x97, 0x8e, 0x4b, 0x5e, 0x5a, 0x83, 0xbd, 0xca, 0x96, 0x58, + 0x76, 0x38, 0xa2, 0x77, 0x68, 0x43, 0x70, 0x8b, 0x5b, 0x7e, 0x5d, 0x8a, + 0x47, 0x57, 0x7e, 0x97, 0x44, 0xd2, 0x54, 0x7f, 0x76, 0xb4, 0x47, 0xa1, + 0x92, 0x8d, 0xd5, 0x4d, 0xb6, 0xad, 0xcf, 0xa1, 0x3e, 0x95, 0x8d, 0x5d, + 0xd9, 0x69, 0x92, 0xad, 0x98, 0x99, 0xae, 0xaa, 0x47, 0x74, 0x8d, 0xc6, + 0x99, 0xb0, 0xbd, 0x3f, 0x92, 0x52, 0x94, 0x3e, 0xd6, 0x93, 0xb7, 0x65, + 0xab, 0x92, 0xf2, 0xc2, 0xb1, 0xd6, 0x8d, 0xa4, 0xa8, 0x2f, 0x9d, 0x9d, + 0x49, 0xa5, 0x39, 0x34, 0xa7, 0x93, 0xd4, 0x99, 0x7c, 0xda, 0x8b, 0x8b, + 0xdf, 0xd3, 0x86, 0xdc, 0xd6, 0x7d, 0x6b, 0xca, 0x56, 0x5e, 0x82, 0x26, + 0xcc, 0x66, 0x7d, 0x77, 0x78, 0x64, 0xaf, 0x38, 0x75, 0x85, 0x4e, 0xad, + 0x3b, 0x74, 0x5b, 0x72, 0x95, 0xc4, 0xbc, 0xc7, 0x57, 0xd6, 0xa4, 0x63, + 0x46, 0x98, 0xb9, 0x73, 0xa9, 0x31, 0xb6, 0x9d, 0x47, 0x38, 0x86, 0xd5, + 0x5f, 0xae, 0x4e, 0xd1, 0x38, 0x4e, 0x93, 0x3d, 0x6c, 0x42, 0x5e, 0xa1, + 0x46, 0xcd, 0x6a, 0x4a, 0xa2, 0x56, 0x9c, 0x35, 0xb8, 0x58, 0xb0, 0x86, + 0x95, 0xc9, 0xbb, 0x77, 0x62, 0xb4, 0x63, 0x5a, 0xaf, 0x58, 0xd2, 0xb4, + 0xc3, 0xa6, 0x91, 0xc9, 0x90, 0x9c, 0x89, 0x65, 0x99, 0x72, 0x52, 0x94, + 0xc0, 0x8e, 0x82, 0x87, 0x45, 0x82, 0x85, 0xaa, 0x56, 0x85, 0x5e, 0xb1, + 0x6a, 0xd2, 0x49, 0x57, 0x42, 0xa3, 0x6d, 0xc6, 0xb5, 0xae, 0xd5, 0x94, + 0x9c, 0x82, 0x33, 0x7b, 0x80, 0x5f, 0xc8, 0xc9, 0xa3, 0x56, 0x4b, 0xb8, + 0x87, 0xa1, 0x9b, 0xc2, 0x8d, 0x6f, 0x3a, 0x4a, 0x47, 0xa5, 0xdb, 0xcd, + 0x36, 0xb3, 0xb3, 0xbb, 0x94, 0x65, 0x77, 0x3e, 0xa1, 0x4e, 0x65, 0x8d, + 0x96, 0x46, 0x7a, 0x87, 0x60, 0x9c, 0x39, 0x37, 0x8d, 0xcc, 0x88, 0x8f, + 0x86, 0x66, 0xd5, 0x72, 0x55, 0x96, 0x53, 0x5e, 0x57, 0xa7, 0x98, 0xc0, + 0xc5, 0x3f, 0x39, 0xa7, 0x31, 0x74, 0x77, 0x8e, 0x5c, 0xb8, 0x53, 0x74, + 0x4e, 0x27, 0xa9, 0x4f, 0x5f, 0x77, 0x74, 0xac, 0xa0, 0x4e, 0x8c, 0x4e, + 0x83, 0xbc, 0xbf, 0xb7, 0x3f, 0x35, 0x69, 0xc6, 0xaa, 0x53, 0x69, 0xa1, + 0xd7, 0x7c, 0x69, 0x7b, 0x6b, 0xd6, 0x58, 0xcb, 0x53, 0x54, 0x70, 0xac, + 0xb3, 0x65, 0x46, 0x88, 0x58, 0x68, 0x53, 0x6a, 0xb1, 0xbd, 0xb3, 0x36, + 0x92, 0x6b, 0xb6, 0xb7, 0x32, 0xaa, 0x4f, 0x6d, 0x98, 0x53, 0x67, 0x68, + 0xb7, 0x41, 0x5e, 0x55, 0xdb, 0x99, 0x80, 0xb0, 0x9a, 0xd4, 0x57, 0x49, + 0x91, 0xb9, 0xb4, 0xa6, 0x74, 0x88, 0x6a, 0x9f, 0xd5, 0x47, 0x2c, 0x76, + 0x9a, 0x41, 0x60, 0xbb, 0xc0, 0x86, 0x40, 0x98, 0x85, 0x56, 0xcd, 0xad, + 0xc3, 0x84, 0x66, 0x94, 0xd5, 0xa1, 0xcb, 0xce, 0xc3, 0xd4, 0xc0, 0x2f, + 0x9e, 0x37, 0xaa, 0x41, 0xbc, 0x93, 0x47, 0xc2, 0x7a, 0x6a, 0x48, 0xba, + 0x40, 0xbb, 0xb5, 0x77, 0x51, 0xc2, 0xd9, 0xc2, 0x76, 0x8f, 0x51, 0x65, + 0xb1, 0x7b, 0x34, 0xaa, 0x4d, 0x6a, 0x2b, 0x68, 0xa4, 0xbf, 0x6f, 0x7f, + 0x62, 0x61, 0xa8, 0x93, 0x82, 0xa0, 0x91, 0x33, 0x4f, 0x70, 0x74, 0x38, + 0x6b, 0x3c, 0x69, 0x6d, 0xcc, 0xa5, 0xb3, 0xb4, 0x54, 0x40, 0x96, 0xcd, + 
0xc3, 0xb5, 0xa9, 0x88, 0xbf, 0x5c, 0x61, 0xcf, 0x8a, 0x34, 0x82, 0x84, + 0x93, 0xa5, 0xcd, 0x35, 0x64, 0x96, 0x68, 0xc6, 0x39, 0x7b, 0x7b, 0x80, + 0x32, 0x83, 0xaa, 0x42, 0x9d, 0x7e, 0xcc, 0x9e, 0x82, 0x8b, 0x4d, 0x88, + 0x96, 0x73, 0x8f, 0x45, 0xb4, 0xdf, 0xe1, 0xa4, 0xc7, 0x53, 0x9b, 0xa0, + 0x49, 0x89, 0x76, 0x86, 0x51, 0xce, 0x41, 0x8c, 0x56, 0xa5, 0xb5, 0x99, + 0x6e, 0x44, 0x81, 0x7b, 0xa2, 0xae, 0x38, 0xc7, 0x88, 0x73, 0xc8, 0x7e, + 0xd7, 0xc6, 0xcf, 0x3d, 0x8c, 0x96, 0xae, 0x5f, 0x72, 0x67, 0x38, 0x67, + 0x8a, 0xc4, 0x58, 0x5d, 0x62, 0x7f, 0x6b, 0x53, 0x54, 0x89, 0xa9, 0xd2, + 0x27, 0xb1, 0xb7, 0x7d, 0x6a, 0xa0, 0x64, 0xb4, 0x62, 0x4f, 0x8b, 0x3c, + 0xd2, 0x7e, 0xc5, 0xa9, 0xc5, 0x69, 0x59, 0x73, 0x66, 0xde, 0xcd, 0x5a, + 0x9d, 0xa2, 0x58, 0x3b, 0x6f, 0x5c, 0x86, 0x34, 0xc3, 0x9b, 0x5c, 0xb6, + 0xcc, 0xb2, 0x73, 0x83, 0xa3, 0xa5, 0x4d, 0x63, 0x7e, 0xa0, 0xa3, 0x47, + 0x59, 0x9b, 0x4c, 0x86, 0x57, 0xb8, 0xaa, 0x7e, 0x46, 0xb8, 0x51, 0x9e, + 0xc4, 0x6b, 0x3e, 0x3b, 0x4a, 0x79, 0xd4, 0xc9, 0xa9, 0x41, 0x9e, 0xbe, + 0x89, 0x53, 0x67, 0xa7, 0x9f, 0xbb, 0xca, 0x5c, 0x35, 0xbe, 0x9c, 0x36, + 0x61, 0xc1, 0x95, 0xaa, 0x49, 0x8a, 0x81, 0x63, 0x78, 0x39, 0x2e, 0xa2, + 0xad, 0x82, 0x46, 0x52, 0x42, 0x97, 0x7d, 0xb2, 0x60, 0xbf, 0xc5, 0x8c, + 0xc0, 0xc6, 0xbf, 0x9d, 0x3a, 0x9b, 0xbf, 0xc0, 0x6b, 0xb4, 0x8d, 0xbe, + 0x80, 0x34, 0x3d, 0x3b, 0x47, 0x8d, 0x42, 0x98, 0x3a, 0x27, 0x3f, 0x41, + 0xbf, 0x89, 0xb1, 0x95, 0x91, 0x81, 0x91, 0x49, 0xa1, 0xaf, 0x93, 0xd2, + 0x6b, 0x96, 0xec, 0x92, 0x2f, 0xc4, 0x74, 0x4b, 0x3b, 0xcc, 0x8a, 0x85, + 0x6d, 0xc4, 0xe6, 0xcb, 0x53, 0x4a, 0xc1, 0x54, 0x50, 0xac, 0xa9, 0x77, + 0x65, 0xbf, 0xa6, 0x53, 0x58, 0x82, 0x64, 0xa2, 0x51, 0x97, 0x55, 0x68, + 0x6e, 0xd9, 0x45, 0xaf, 0xd2, 0x3e, 0xa1, 0x67, 0xb6, 0xbd, 0x3e, 0xbd, + 0x56, 0x98, 0x9b, 0xc4, 0x65, 0x2d, 0x84, 0xb5, 0x4b, 0x36, 0x90, 0x67, + 0x90, 0xaf, 0xdb, 0xcc, 0xba, 0x6d, 0x47, 0x4c, 0x74, 0x87, 0x78, 0xa1, + 0x9c, 0x47, 0x77, 0x7f, 0x72, 0x90, 0x86, 0xca, 0x81, 0x66, 0x69, 0xc4, + 0xbe, 0xd4, 0x2f, 0xb9, 0xcc, 0x94, 0xd5, 0x73, 0xb9, 0xa0, 0xb9, 0xbf, + 0x9c, 0x63, 0x57, 0x8b, 0xb9, 0x65, 0x62, 0xca, 0x95, 0x3e, 0x3b, 0xcb, + 0x82, 0x6f, 0x37, 0xb8, 0x56, 0xac, 0x88, 0x4f, 0xab, 0xb6, 0x9c, 0x66, + 0x52, 0xaa, 0x7a, 0x49, 0xcc, 0x91, 0xa7, 0xd7, 0xb4, 0xc1, 0xc8, 0x48, + 0xcc, 0x58, 0x95, 0xc8, 0x6a, 0x97, 0xb4, 0xb8, 0x58, 0x40, 0x65, 0x5b, + 0x98, 0x4b, 0x9a, 0x70, 0xd6, 0x86, 0x33, 0x66, 0xb9, 0x73, 0xbe, 0x94, + 0xde, 0x6a, 0xb7, 0xb5, 0xd2, 0x8a, 0x93, 0x45, 0x82, 0x33, 0x7f, 0xb2, + 0xc7, 0x43, 0x86, 0x61, 0xe2, 0x75, 0x96, 0x76, 0x69, 0x7a, 0xa6, 0x44, + 0x7c, 0x38, 0xa4, 0x3f, 0xe5, 0x8f, 0xa6, 0x3b, 0xe1, 0x9d, 0x65, 0x86, + 0xb4, 0xba, 0x53, 0xcd, 0xa7, 0xbe, 0x4d, 0x5c, 0x64, 0x37, 0xb4, 0xe2, + 0x4a, 0x9e, 0x69, 0xc8, 0x99, 0x4b, 0x74, 0xbb, 0x49, 0x2d, 0x27, 0x6e, + 0xa8, 0xc3, 0xb2, 0xa6, 0x30, 0x45, 0x83, 0xb5, 0xb0, 0xa6, 0x7f, 0x39, + 0xcd, 0x77, 0x95, 0xbb, 0x99, 0x7c, 0x7a, 0x53, 0x55, 0x3e, 0x3e, 0xb5, + 0x52, 0x62, 0x45, 0xc3, 0x9d, 0x83, 0xb2, 0x88, 0x5c, 0x4a, 0x3b, 0x72, + 0x6f, 0xba, 0x55, 0x93, 0x95, 0x58, 0xbf, 0xc7, 0x3a, 0x62, 0x89, 0x7b, + 0x6e, 0x61, 0x48, 0x95, 0x7f, 0x6b, 0xa6, 0xb3, 0xc9, 0x64, 0xcb, 0x9a, + 0x52, 0xc2, 0x85, 0x93, 0x38, 0x54, 0x9d, 0x6d, 0x62, 0x8f, 0x7c, 0x77, + 0x3e, 0x2e, 0x69, 0x3b, 0x52, 0x92, 0x96, 0x9d, 0x84, 0x82, 0xb5, 0xb0, + 0x5c, 0x90, 0x86, 0x4a, 0x6c, 0xbd, 0x6e, 0xa1, 0x92, 0xc9, 0x5d, 0x98, + 0xb5, 0x44, 0x3d, 0xa8, 0x52, 0x28, 0x58, 0x66, 0x74, 0x69, 0x82, 0x4f, + 0xac, 0x89, 0x86, 0x2b, 0x44, 0xb3, 0xc4, 0x9e, 0xbf, 0x9b, 0x4e, 0x47, + 
0x32, 0x6a, 0x8b, 0x55, 0xa5, 0x7f, 0xc0, 0xbc, 0x78, 0x7e, 0x7c, 0x51, + 0x4a, 0x6f, 0x57, 0x73, 0xa5, 0xcb, 0x75, 0x7a, 0x59, 0x3c, 0x66, 0xcd, + 0x37, 0xb3, 0xc5, 0x81, 0x98, 0x82, 0x77, 0xd6, 0x99, 0x7d, 0x3f, 0x9f, + 0x75, 0x98, 0x78, 0xc8, 0xaa, 0x6f, 0x7a, 0x43, 0x59, 0xae, 0xcb, 0xcf, + 0x8d, 0x5b, 0x6f, 0x4d, 0x4c, 0x98, 0x4e, 0x8c, 0xb6, 0xbd, 0x81, 0x94, + 0x8f, 0xb6, 0xa8, 0x80, 0x2d, 0x77, 0xda, 0xaa, 0x38, 0x5f, 0xca, 0x7d, + 0xd1, 0x6b, 0xc5, 0x63, 0x40, 0x8e, 0xce, 0x91, 0x2d, 0x6c, 0x91, 0x6c, + 0xd3, 0xc5, 0xa9, 0x57, 0xd7, 0x2e, 0x85, 0x73, 0x97, 0xb4, 0x51, 0x53, + 0x5a, 0x8f, 0x3f, 0x81, 0xb9, 0x7b, 0x56, 0xb2, 0x4d, 0x92, 0x8b, 0x48, + 0xc4, 0x2c, 0xac, 0xd9, 0x3e, 0x5d, 0x6e, 0x50, 0x83, 0x27, 0x86, 0x42, + 0x4e, 0x4c, 0x3d, 0xd2, 0x3f, 0xa8, 0x7d, 0x2c, 0xad, 0x3b, 0x21, 0x33, + 0xac, 0x77, 0xbd, 0x40, 0xc9, 0x35, 0x65, 0xb2, 0xca, 0xad, 0x5e, 0x92, + 0x9f, 0x97, 0x99, 0xb8, 0xa1, 0xc9, 0xd1, 0xcc, 0x8c, 0xc3, 0x57, 0x4b, + 0x7b, 0xaf, 0xa8, 0x8b, 0x7f, 0x38, 0xbd, 0x3b, 0x47, 0x99, 0xd1, 0xa4, + 0x9b, 0x60, 0x42, 0x66, 0x79, 0x48, 0x60, 0x3a, 0x93, 0x5a, 0x6c, 0x50, + 0x6a, 0xc5, 0x67, 0x8d, 0x9c, 0xce, 0x95, 0x2e, 0x9e, 0xcc, 0x9e, 0x3d, + 0xa5, 0x70, 0xe0, 0x54, 0x47, 0x80, 0x6f, 0x51, 0x91, 0x86, 0xbc, 0x68, + 0x34, 0xbe, 0x42, 0x87, 0x7d, 0x83, 0xaa, 0x79, 0xc0, 0x67, 0xe0, 0x6a, + 0xb5, 0x91, 0xba, 0x39, 0x67, 0x71, 0x5c, 0xbc, 0xce, 0x92, 0xda, 0x78, + 0xc8, 0x94, 0xe4, 0x9f, 0xa0, 0x78, 0xcb, 0xba, 0x57, 0x93, 0x4b, 0x80, + 0xd1, 0xae, 0x8b, 0x4f, 0xa0, 0x64, 0x7a, 0x70, 0x38, 0xba, 0xa0, 0x52, + 0x83, 0x90, 0x91, 0xa7, 0x78, 0xc5, 0xd0, 0x54, 0xbc, 0x75, 0xc5, 0x98, + 0xce, 0xd2, 0x49, 0xb9, 0x6f, 0xc3, 0x51, 0x40, 0x92, 0x73, 0x59, 0xdb, + 0x70, 0x97, 0x63, 0x43, 0xc5, 0x74, 0x7c, 0xa3, 0xca, 0xae, 0x9e, 0x52, + 0xb9, 0x43, 0xab, 0xa3, 0x4a, 0xbb, 0x3f, 0x7d, 0xd6, 0xaa, 0xd6, 0xac, + 0x85, 0x67, 0x5c, 0x8c, 0x7c, 0x3f, 0x59, 0xc5, 0x75, 0x91, 0x5f, 0xb5, + 0xa4, 0xbf, 0x9d, 0x68, 0x88, 0xb6, 0x63, 0x9c, 0x3a, 0xbe, 0x8a, 0xa7, + 0xbb, 0x38, 0x39, 0x90, 0x73, 0x4f, 0xbc, 0xa3, 0xc1, 0x45, 0x8f, 0x5d, + 0x89, 0x48, 0x76, 0xa8, 0x5f, 0x3e, 0x29, 0x74, 0xa3, 0x57, 0x5e, 0x4e, + 0xd5, 0x68, 0xd9, 0x87, 0x4b, 0x73, 0x7f, 0xc6, 0x9a, 0x81, 0x4c, 0x88, + 0x88, 0xb1, 0x6a, 0x76, 0xa9, 0xab, 0xca, 0xcf, 0x9a, 0x42, 0x7f, 0xad, + 0x3d, 0x46, 0xb8, 0x78, 0xb9, 0x85, 0x53, 0xc3, 0xbb, 0xce, 0x58, 0xc3, + 0x4b, 0x81, 0xd2, 0xc7, 0x97, 0xe3, 0xba, 0x65, 0x50, 0x6e, 0xbb, 0xbe, + 0x8e, 0x40, 0x66, 0x92, 0x83, 0xd0, 0x5f, 0x80, 0xc4, 0xb4, 0x37, 0x60, + 0xaf, 0xbe, 0x44, 0xab, 0xb5, 0xa9, 0x46, 0x42, 0x4a, 0x97, 0xa6, 0xa8, + 0xac, 0x60, 0x5d, 0x5c, 0xc4, 0xd4, 0x81, 0xdc, 0x23, 0x70, 0x7c, 0xb2, + 0xb9, 0x55, 0x92, 0x34, 0xd3, 0x9a, 0x81, 0x5f, 0x7a, 0x7c, 0xac, 0xc4, + 0xa1, 0xac, 0x7a, 0x79, 0x83, 0xac, 0x6d, 0xc1, 0x42, 0x4e, 0x77, 0xb5, + 0xbe, 0x7f, 0xd8, 0x2d, 0x63, 0x4a, 0x89, 0x5f, 0x53, 0x93, 0x62, 0xab, + 0x87, 0x7b, 0x9f, 0x51, 0xba, 0x84, 0x88, 0x7e, 0x72, 0x46, 0x75, 0x84, + 0xd2, 0x9e, 0x9a, 0xb0, 0x6a, 0x37, 0xcc, 0x41, 0x86, 0x33, 0x9d, 0xab, + 0x9f, 0xb0, 0x52, 0x5b, 0x50, 0x50, 0xc0, 0xb4, 0xb6, 0x48, 0x4a, 0x9c, + 0x60, 0xc0, 0x4c, 0x39, 0xae, 0x52, 0x9c, 0x5c, 0xbf, 0x66, 0xd2, 0x3d, + 0xcb, 0x67, 0x62, 0x3b, 0x38, 0x78, 0xb2, 0x70, 0x80, 0x6c, 0x4e, 0xa0, + 0x85, 0x83, 0xcf, 0x99, 0xa0, 0x5e, 0xa3, 0x4d, 0xd3, 0x59, 0x4f, 0x83, + 0x9d, 0xa7, 0x7b, 0x96, 0xc1, 0x53, 0x3e, 0x56, 0xa4, 0xb8, 0x33, 0x99, + 0xac, 0x7a, 0x56, 0xc2, 0xaf, 0x3b, 0xbf, 0x56, 0x4d, 0xba, 0x40, 0x94, + 0x95, 0x40, 0x7c, 0x7d, 0x76, 0x4a, 0x37, 0xba, 0x55, 0x3a, 0xa0, 0x92, + 
0x86, 0xda, 0xce, 0x82, 0xd2, 0x46, 0x9c, 0x6b, 0x9f, 0x6d, 0x37, 0x67, + 0x65, 0x92, 0x96, 0x92, 0x64, 0x89, 0x96, 0x7f, 0x4c, 0x6c, 0x96, 0xa3, + 0x61, 0xc2, 0x76, 0x63, 0x76, 0x9d, 0x58, 0x5a, 0x2a, 0xd4, 0xc4, 0xb5, + 0x49, 0xb1, 0xc8, 0xa9, 0xb7, 0xb5, 0x32, 0x76, 0xc7, 0xc3, 0x7e, 0x9e, + 0xd6, 0x8a, 0xa3, 0xa7, 0x6d, 0x4c, 0x81, 0x5a, 0x6c, 0x40, 0xab, 0x3a, + 0xa8, 0x6c, 0x65, 0xae, 0x7b, 0x6f, 0x83, 0x8e, 0xb7, 0x2d, 0x8c, 0xa9, + 0x7e, 0x3c, 0x48, 0x43, 0xab, 0x7d, 0x64, 0x70, 0xb8, 0xb7, 0x48, 0xae, + 0x62, 0x5f, 0x40, 0x89, 0x5c, 0x55, 0xc3, 0x48, 0x71, 0xa1, 0x84, 0x9e, + 0x97, 0x53, 0xd3, 0x53, 0x91, 0x8b, 0xb4, 0x8d, 0x74, 0x92, 0x47, 0x4b, + 0xa4, 0xab, 0xcc, 0xaa, 0xb6, 0x7f, 0xd1, 0x92, 0xb7, 0x50, 0xcc, 0x8c, + 0xba, 0x7c, 0x38, 0x3d, 0xb7, 0x98, 0x5f, 0x85, 0x68, 0xd1, 0xc5, 0xc4, + 0x50, 0xcc, 0x9d, 0xb8, 0x95, 0xab, 0x49, 0x6c, 0xd2, 0x38, 0x37, 0x74, + 0xb9, 0x4a, 0xb5, 0x5c, 0x3d, 0x60, 0xb0, 0xc0, 0x57, 0xc7, 0x35, 0xcf, + 0xc3, 0xa2, 0xc2, 0x40, 0x58, 0xb4, 0xa7, 0x71, 0x70, 0x79, 0x62, 0x8b, + 0x62, 0xc6, 0xc5, 0x6b, 0xd4, 0x76, 0xc8, 0xc4, 0x7c, 0x77, 0x79, 0x4a, + 0x7e, 0xc1, 0x2c, 0x67, 0xc1, 0xc1, 0x74, 0x72, 0x34, 0xaf, 0x76, 0xb2, + 0xbd, 0x8e, 0x7f, 0x75, 0x79, 0xa1, 0x3a, 0x9c, 0x60, 0x67, 0xba, 0x63, + 0x7e, 0x60, 0xb8, 0x92, 0x8d, 0x50, 0x9d, 0x8a, 0x78, 0x61, 0x34, 0x51, + 0x94, 0x6f, 0x80, 0xba, 0xc0, 0xa9, 0x76, 0xba, 0x7b, 0x3e, 0x46, 0x64, + 0x3c, 0xb6, 0xd9, 0xc2, 0x61, 0x6f, 0xdb, 0x66, 0x3e, 0x5b, 0xd3, 0x77, + 0x86, 0x40, 0x34, 0x87, 0x72, 0x75, 0x7e, 0x8c, 0x4a, 0x88, 0x7f, 0x51, + 0x3f, 0x7f, 0x91, 0xb4, 0xa2, 0x6d, 0x8a, 0x7e, 0x33, 0xad, 0x6c, 0xbb, + 0xaa, 0x83, 0x95, 0xcc, 0x7c, 0xb0, 0xb8, 0xcb, 0xc4, 0x8e, 0x3b, 0xa6, + 0x7c, 0x6a, 0x8e, 0x86, 0x3e, 0x44, 0xd6, 0xba, 0x75, 0x46, 0x5a, 0xab, + 0xaf, 0xaa, 0x89, 0x46, 0x83, 0x85, 0xc9, 0xbe, 0xde, 0x4b, 0xa4, 0x4d, + 0x3b, 0x84, 0xbb, 0xb8, 0x67, 0x51, 0x8f, 0xc3, 0x3b, 0x32, 0xc2, 0xbf, + 0x37, 0x3c, 0xb2, 0x35, 0x9c, 0x62, 0x8a, 0x71, 0x4f, 0x71, 0x9d, 0x6d, + 0xc4, 0x78, 0xb9, 0xc6, 0x47, 0x38, 0xa1, 0x80, 0x37, 0xbe, 0x4b, 0x46, + 0xcc, 0x6d, 0x4f, 0xac, 0xc0, 0xc6, 0x8e, 0x46, 0xac, 0xd4, 0x84, 0x81, + 0xd1, 0x7d, 0x38, 0x73, 0x64, 0x8a, 0xb5, 0x9f, 0x62, 0xc2, 0x74, 0x39, + 0xab, 0x3f, 0x52, 0x7f, 0xd2, 0x47, 0xd3, 0x72, 0x60, 0x32, 0x70, 0x9d, + 0x8a, 0x5b, 0xd7, 0xca, 0xbf, 0x8c, 0x38, 0x47, 0x70, 0x73, 0x60, 0x49, + 0x63, 0xba, 0x62, 0x69, 0x34, 0x60, 0x6c, 0x93, 0x98, 0xbc, 0x36, 0x5b, + 0xbb, 0xd1, 0xab, 0x41, 0x4f, 0x65, 0x6a, 0xc8, 0x38, 0xd2, 0x93, 0x4a, + 0x36, 0xa2, 0x83, 0x9a, 0x77, 0x66, 0x74, 0xae, 0x78, 0x38, 0xbb, 0x75, + 0x5f, 0x96, 0xd6, 0xa8, 0x8e, 0xb5, 0x71, 0x5b, 0x4c, 0x92, 0x32, 0x62, + 0x68, 0x4a, 0x9b, 0x3d, 0x44, 0x8b, 0xcb, 0x44, 0x6c, 0x33, 0x4a, 0xa5, + 0x4b, 0x81, 0xa0, 0xc5, 0x77, 0xb3, 0x60, 0x84, 0xa9, 0xd2, 0xdb, 0x63, + 0x82, 0x56, 0xe3, 0xb7, 0x88, 0x5f, 0xde, 0xcf, 0x66, 0xd8, 0x79, 0xaf, + 0x7c, 0xa3, 0xcb, 0x9e, 0xa9, 0x56, 0x78, 0x5a, 0x92, 0x4f, 0xbc, 0x8a, + 0x31, 0xc7, 0x83, 0x86, 0xb5, 0xae, 0x95, 0xbd, 0xbd, 0x4a, 0x89, 0x47, + 0x55, 0x7a, 0x35, 0x67, 0xc7, 0xb9, 0x85, 0x42, 0x4d, 0x41, 0x7d, 0x82, + 0xc1, 0x50, 0x73, 0x65, 0x5c, 0x88, 0xbb, 0x86, 0x56, 0x8b, 0x5b, 0x78, + 0xa4, 0xc9, 0xaa, 0x39, 0x50, 0xb3, 0x9b, 0x51, 0x99, 0x23, 0x2d, 0x36, + 0x66, 0x9c, 0xb4, 0x3f, 0x85, 0x53, 0x99, 0x73, 0x56, 0x96, 0xc4, 0xab, + 0xad, 0x84, 0x3b, 0x8f, 0x61, 0x75, 0x4f, 0x37, 0x43, 0x41, 0x38, 0x3f, + 0xcf, 0x9a, 0xd5, 0x85, 0x5f, 0x87, 0x42, 0xb0, 0x41, 0xd7, 0xb5, 0x44, + 0x6b, 0x6b, 0x3a, 0x64, 0x5f, 0x24, 0x99, 0xb1, 0xbe, 0x78, 0x80, 0xd0, + 
0xb8, 0xb1, 0xb6, 0xd5, 0xce, 0x3e, 0x95, 0xa2, 0xb9, 0xb5, 0xcb, 0x4c, + 0x90, 0xb9, 0x32, 0xc5, 0xb6, 0xa1, 0x46, 0x96, 0xae, 0x41, 0x79, 0xa3, + 0x7c, 0x63, 0x80, 0x8f, 0x36, 0x78, 0x2d, 0x8d, 0x49, 0xcd, 0x72, 0x6f, + 0x76, 0xd2, 0x9f, 0xd1, 0xb1, 0x5d, 0x6d, 0xc2, 0x76, 0x6a, 0x95, 0x7f, + 0xa4, 0xbe, 0xa9, 0xc7, 0xcb, 0x55, 0xa7, 0x6f, 0x7d, 0x74, 0x81, 0x60, + 0x71, 0x62, 0xa5, 0x79, 0x3b, 0xa1, 0x3d, 0x99, 0x81, 0x60, 0x49, 0x3a, + 0xa8, 0x96, 0x80, 0xa7, 0x6c, 0x3e, 0x96, 0x78, 0x9e, 0x37, 0xa2, 0x79, + 0x5d, 0x5c, 0x51, 0x8f, 0x3f, 0x2f, 0x3f, 0x93, 0x5b, 0x8a, 0x47, 0x54, + 0xd5, 0x41, 0x91, 0xc9, 0x47, 0x48, 0xb1, 0xbe, 0x71, 0x6d, 0x79, 0x53, + 0x95, 0x92, 0x9c, 0xb6, 0x34, 0x82, 0xc9, 0xa1, 0x7f, 0x77, 0x72, 0x87, + 0x99, 0x85, 0x47, 0x90, 0xab, 0x56, 0xbc, 0x3f, 0x63, 0x67, 0xd3, 0x61, + 0x44, 0x8a, 0x86, 0xa9, 0x4b, 0x9f, 0x7a, 0x76, 0x68, 0x86, 0xcb, 0x5f, + 0xac, 0x59, 0x5b, 0x3d, 0x47, 0x6b, 0x9b, 0x39, 0xa4, 0x4d, 0x74, 0xc3, + 0x60, 0x35, 0x85, 0x6f, 0x52, 0x4f, 0x38, 0x35, 0xc4, 0xbc, 0x4f, 0x82, + 0xb7, 0xd4, 0x45, 0xa2, 0x3c, 0xac, 0x32, 0xb1, 0x63, 0xa8, 0x8c, 0xc9, + 0x9c, 0x5e, 0x53, 0x79, 0x6e, 0xb0, 0x75, 0xb5, 0x9e, 0x35, 0x4a, 0xa3, + 0x58, 0xba, 0xcc, 0x81, 0x97, 0xb7, 0xc9, 0x4d, 0x74, 0xcd, 0xc1, 0xb0, + 0xbc, 0x33, 0x52, 0x6f, 0x4a, 0xbc, 0x43, 0x98, 0x65, 0xa8, 0xa2, 0xa0, + 0xc4, 0x84, 0x8c, 0xaa, 0xcb, 0x35, 0x7b, 0x7e, 0x47, 0x91, 0xc1, 0x6d, + 0x62, 0x50, 0xd7, 0x3a, 0xa0, 0x58, 0x8d, 0x89, 0x92, 0xa8, 0x67, 0x75, + 0xb1, 0x4f, 0x9f, 0xac, 0x7d, 0xa8, 0x94, 0x8d, 0x9a, 0xcd, 0xab, 0xca, + 0xa7, 0x81, 0xaa, 0xa6, 0x39, 0x40, 0x5f, 0x67, 0x2f, 0xbc, 0x70, 0x98, + 0x61, 0xd6, 0xb6, 0xa3, 0x42, 0x5a, 0xbe, 0x8e, 0x7e, 0xb6, 0x7e, 0xa6, + 0xc8, 0xcb, 0x81, 0xa3, 0x3b, 0x47, 0xb5, 0x74, 0xac, 0xab, 0x83, 0xae, + 0x72, 0x7c, 0xa3, 0x66, 0xb9, 0x4c, 0x88, 0x63, 0xa2, 0x5e, 0x7e, 0x3d, + 0xa2, 0x67, 0x8f, 0x2f, 0x6f, 0xce, 0xd4, 0xa6, 0xa9, 0xac, 0x4e, 0x40, + 0x7b, 0x44, 0x50, 0x7d, 0xb6, 0x40, 0x31, 0x38, 0x77, 0xaf, 0x4a, 0x81, + 0xc8, 0x58, 0xa6, 0xc8, 0x73, 0xbe, 0x9f, 0x69, 0x32, 0x8d, 0xc1, 0x81, + 0xa1, 0xbe, 0x68, 0xc7, 0x5b, 0x30, 0x3e, 0xaa, 0x38, 0xc1, 0x50, 0x5a, + 0x9f, 0x43, 0x73, 0xd1, 0x3a, 0x56, 0x99, 0x5a, 0x86, 0x4b, 0xa3, 0xc3, + 0x7b, 0xc6, 0xbc, 0x55, 0x50, 0xbd, 0x42, 0xab, 0x99, 0xb1, 0x56, 0x7b, + 0xa1, 0x3c, 0xb2, 0x9c, 0x56, 0x46, 0x78, 0x3b, 0xce, 0x5a, 0xa2, 0x85, + 0x4f, 0x65, 0x80, 0x6f, 0x64, 0xbf, 0xaf, 0x8a, 0x6c, 0x31, 0xa0, 0x65, + 0x56, 0x8a, 0x35, 0x9a, 0x40, 0x52, 0x62, 0x87, 0xb3, 0x5b, 0x2d, 0x44, + 0x6d, 0x86, 0x51, 0x5a, 0xae, 0x4b, 0x88, 0x70, 0x7d, 0xa8, 0x57, 0x9d, + 0xac, 0x53, 0x9e, 0xb8, 0x69, 0x97, 0x68, 0xac, 0x64, 0x46, 0x66, 0x9f, + 0x6e, 0x56, 0xc7, 0x73, 0x37, 0x35, 0x4a, 0xad, 0x44, 0xb2, 0x58, 0xce, + 0x5b, 0x9c, 0x58, 0xb3, 0x6e, 0x52, 0xa7, 0x83, 0x53, 0x3c, 0x72, 0xb7, + 0xcc, 0xa9, 0xc5, 0x32, 0x85, 0x4a, 0x54, 0x72, 0x85, 0xbd, 0x61, 0x57, + 0xb5, 0xd2, 0xc5, 0x63, 0x7e, 0x89, 0xce, 0xb8, 0x97, 0xa0, 0x3d, 0x4a, + 0x6f, 0xb8, 0x9e, 0xa3, 0x94, 0x81, 0x78, 0x9e, 0xd2, 0x81, 0xbe, 0x8c, + 0x92, 0x6f, 0xbb, 0xb8, 0xc3, 0x55, 0x4c, 0x49, 0x83, 0x8d, 0x9d, 0x9b, + 0x86, 0xb2, 0x36, 0x9a, 0xa3, 0x4e, 0x8e, 0x70, 0x65, 0x8f, 0x8e, 0xb2, + 0xd3, 0x32, 0x81, 0x38, 0x35, 0x87, 0xac, 0x39, 0x45, 0x8a, 0x5f, 0xcc, + 0xba, 0x4d, 0x58, 0x57, 0x79, 0x85, 0x5b, 0xa2, 0x9b, 0xb8, 0xae, 0x61, + 0xac, 0x54, 0x8b, 0xc2, 0x80, 0x5e, 0x8e, 0x40, 0x78, 0x7c, 0x59, 0xcc, + 0x70, 0x70, 0x62, 0xb0, 0x47, 0x81, 0x37, 0x6d, 0x82, 0x41, 0xaf, 0xc0, + 0x9e, 0x5e, 0x82, 0x4c, 0x8d, 0xc7, 0x58, 0x5f, 0xc8, 0x47, 0x49, 0x8a, + 
0xa6, 0x8b, 0x66, 0x96, 0xb0, 0xce, 0x5e, 0x95, 0x47, 0xb8, 0x72, 0x3f, + 0x38, 0x9b, 0x91, 0xc7, 0x39, 0x46, 0x81, 0x56, 0x94, 0x3d, 0x3e, 0x3e, + 0x6d, 0xce, 0xc2, 0x9b, 0xc7, 0x90, 0xad, 0x7e, 0x9d, 0x43, 0x43, 0x92, + 0x90, 0x89, 0x76, 0x8f, 0x91, 0x84, 0xad, 0x52, 0x64, 0xa8, 0xb2, 0x3e, + 0x3e, 0xcc, 0x46, 0xc1, 0x82, 0x87, 0x9d, 0x72, 0x7e, 0x8f, 0x73, 0xc1, + 0x43, 0x45, 0x86, 0x67, 0xac, 0x4c, 0x64, 0xcc, 0x37, 0x5a, 0x8d, 0x76, + 0x96, 0x67, 0x70, 0xa9, 0xb5, 0xb7, 0x2f, 0x5c, 0x87, 0x83, 0xab, 0x3a, + 0x9e, 0xd2, 0x9f, 0x55, 0x78, 0x31, 0x5f, 0xbb, 0xa2, 0xcd, 0xc9, 0x32, + 0xa9, 0xb6, 0x69, 0xa2, 0x42, 0xbb, 0xd1, 0x56, 0x94, 0x99, 0x34, 0xa2, + 0xae, 0x41, 0xa3, 0x3b, 0xcd, 0x6a, 0x64, 0xa8, 0xab, 0xb4, 0x38, 0x64, + 0xce, 0x83, 0x76, 0x78, 0xa5, 0x8e, 0x58, 0xb2, 0x7a, 0xc9, 0xab, 0x8f, + 0x5d, 0x92, 0x76, 0xa2, 0xb7, 0x38, 0xaf, 0xda, 0x7f, 0x73, 0x92, 0x61, + 0x8d, 0x4d, 0x79, 0x88, 0x3b, 0x79, 0x72, 0xb5, 0x4d, 0x4e, 0x7b, 0x6b, + 0xbd, 0x50, 0xbf, 0x90, 0x44, 0xce, 0x71, 0x45, 0x3a, 0x5f, 0xce, 0x3e, + 0xae, 0xae, 0xb0, 0xc1, 0xcf, 0x52, 0x8c, 0x3f, 0x88, 0x65, 0xc2, 0xb3, + 0x81, 0xa5, 0xa5, 0xbc, 0x9c, 0x7e, 0x9d, 0x92, 0x4f, 0x41, 0x72, 0x59, + 0xcf, 0x50, 0x32, 0xd1, 0x9e, 0x88, 0x7f, 0x93, 0x9e, 0x8b, 0x9a, 0x7a, + 0x7d, 0x8b, 0x40, 0x33, 0x81, 0xd1, 0x99, 0x85, 0x6e, 0x42, 0xdf, 0x96, + 0x3f, 0x87, 0x42, 0x8a, 0x66, 0x7b, 0xcc, 0xae, 0x60, 0x63, 0xb8, 0xa1, + 0x6e, 0xc8, 0x45, 0x95, 0x99, 0xb8, 0x37, 0x70, 0xae, 0x72, 0x41, 0x4d, + 0x6e, 0xc0, 0x4a, 0x99, 0x61, 0x91, 0x70, 0x9a, 0x88, 0xda, 0xb6, 0xce, + 0x4e, 0xb2, 0xc3, 0x99, 0x65, 0xcd, 0x97, 0xb5, 0x80, 0x98, 0x5c, 0x7b, + 0xbe, 0x9d, 0xc8, 0xa9, 0xbf, 0xb7, 0xbf, 0x5a, 0x44, 0x35, 0x69, 0x3f, + 0xb3, 0x6b, 0xb3, 0x3e, 0x82, 0x56, 0x8b, 0x47, 0x43, 0x52, 0x8b, 0x86, + 0x88, 0xb0, 0x57, 0xad, 0x6c, 0xa7, 0x41, 0x77, 0xb1, 0x43, 0x49, 0xb5, + 0xbc, 0x3b, 0x81, 0x5b, 0xb6, 0x55, 0x4e, 0x90, 0xc2, 0xc6, 0x57, 0xbc, + 0x37, 0x7e, 0x9b, 0xc5, 0x41, 0xb8, 0x77, 0x3a, 0xa5, 0x5e, 0x72, 0x3a, + 0x5b, 0xbc, 0x69, 0x3b, 0xb0, 0xc9, 0x4e, 0xb5, 0x98, 0xd2, 0xbe, 0x85, + 0x96, 0x9d, 0x82, 0x6e, 0xd3, 0xd1, 0x5a, 0x42, 0x61, 0x73, 0xb9, 0xcc, + 0x61, 0xbb, 0x3a, 0x6b, 0x3f, 0x59, 0x93, 0x7c, 0x69, 0x7a, 0x47, 0x88, + 0x4b, 0x9b, 0xd1, 0xb2, 0xb8, 0x7d, 0x4b, 0x94, 0x54, 0xaa, 0x8b, 0xb2, + 0x78, 0x97, 0xcd, 0x99, 0x46, 0x53, 0x8f, 0x69, 0x46, 0x41, 0xad, 0xc1, + 0x69, 0x6b, 0x6f, 0xa5, 0x5c, 0x58, 0x86, 0x73, 0x99, 0x44, 0xc5, 0x93, + 0x98, 0x7f, 0xb7, 0x74, 0x5f, 0x4c, 0x48, 0xa5, 0x47, 0x56, 0x45, 0x58, + 0x78, 0x65, 0x94, 0xbc, 0x9f, 0xba, 0xa8, 0xb9, 0x9a, 0x2f, 0x3f, 0xbb, + 0x6e, 0x7e, 0x34, 0xb9, 0xcb, 0x8c, 0x45, 0x4a, 0x39, 0x4e, 0x9c, 0xcb, + 0x6b, 0xa3, 0xa4, 0x46, 0x95, 0xc8, 0xc0, 0xc5, 0x6f, 0xbb, 0x9f, 0x3e, + 0x52, 0x84, 0xad, 0x3c, 0xbb, 0x7e, 0x72, 0x62, 0x46, 0x5c, 0xbc, 0x6b, + 0x4a, 0x4b, 0xab, 0x7b, 0xa0, 0x6f, 0x73, 0x65, 0x30, 0x5e, 0x86, 0xbe, + 0x3f, 0x32, 0xae, 0x89, 0x87, 0x53, 0xac, 0x78, 0xb4, 0x38, 0x69, 0x73, + 0x8c, 0x4d, 0x74, 0x36, 0xa3, 0xd1, 0x50, 0x4b, 0xa1, 0xa6, 0xc4, 0xd3, + 0xcd, 0xaa, 0x86, 0x8f, 0x80, 0x66, 0x91, 0xbe, 0xb6, 0x7e, 0x92, 0x50, + 0xc6, 0x88, 0x86, 0xc7, 0x36, 0xb8, 0xb3, 0xd5, 0x4a, 0x8c, 0xb3, 0x8d, + 0xaa, 0x42, 0x72, 0xad, 0xa1, 0x6d, 0x8e, 0x7c, 0x77, 0x4c, 0x63, 0x43, + 0x84, 0x32, 0xc9, 0x89, 0x85, 0x6e, 0x72, 0x30, 0x7b, 0x2e, 0x66, 0xb5, + 0x74, 0xa3, 0x37, 0x38, 0x5b, 0x57, 0x59, 0x65, 0x6c, 0x70, 0x33, 0x6b, + 0x7c, 0x45, 0x8a, 0xad, 0x53, 0xb1, 0x91, 0x39, 0x59, 0xbf, 0x72, 0xcc, + 0x6e, 0x47, 0x66, 0xa3, 0x33, 0x8d, 0xc2, 0xb3, 0xcb, 0xa3, 0x83, 0xc5, + 
0xad, 0x33, 0x41, 0x86, 0x89, 0xcd, 0x94, 0xcf, 0x9b, 0x33, 0x95, 0x61, + 0x45, 0xb3, 0x81, 0xb3, 0xb6, 0x5d, 0x60, 0x6d, 0xb4, 0x5d, 0xce, 0x82, + 0x82, 0x7d, 0xaf, 0xb8, 0xbb, 0xc9, 0xa3, 0x9e, 0x89, 0x90, 0xc7, 0xa2, + 0x68, 0x52, 0xba, 0x63, 0xc7, 0x56, 0xd4, 0x91, 0xa0, 0x3f, 0x6e, 0xa0, + 0x5b, 0x48, 0x45, 0x4f, 0x41, 0x43, 0x75, 0x8e, 0x47, 0x3f, 0x70, 0x8e, + 0x3b, 0x93, 0x99, 0xb1, 0x3f, 0xbe, 0xb5, 0x44, 0x4f, 0x98, 0xcc, 0x90, + 0xd1, 0x8d, 0x32, 0x59, 0x92, 0xa2, 0x45, 0xb2, 0x58, 0x46, 0xa4, 0x4c, + 0xbf, 0x40, 0xc4, 0x74, 0x90, 0x5c, 0xae, 0x93, 0x57, 0xbe, 0xc8, 0xb9, + 0x69, 0x8f, 0x4e, 0x3b, 0xbc, 0xba, 0xa0, 0x41, 0x89, 0x89, 0x53, 0xa2, + 0x96, 0x6d, 0x81, 0x64, 0x90, 0x7f, 0x84, 0x7b, 0x78, 0x94, 0xa5, 0x7b, + 0x77, 0x98, 0x66, 0x74, 0x63, 0xc5, 0x3f, 0x7f, 0x97, 0xaa, 0xcd, 0x5d, + 0x30, 0xbe, 0x89, 0xa2, 0x94, 0x65, 0x6a, 0x41, 0xa4, 0x77, 0x71, 0x99, + 0x4f, 0xce, 0x4f, 0x7b, 0x81, 0x47, 0xa1, 0xb2, 0x3a, 0xb2, 0x8a, 0x47, + 0x7a, 0xb9, 0x3d, 0xcd, 0x49, 0xaa, 0xcd, 0xc8, 0x8f, 0xab, 0x77, 0x31, + 0xc1, 0x4c, 0xba, 0x4b, 0x89, 0xc0, 0x75, 0xcc, 0xae, 0x75, 0x31, 0x42, + 0xc2, 0x99, 0x94, 0x46, 0x37, 0xcd, 0x5d, 0x4d, 0xc9, 0xa8, 0xc5, 0xc2, + 0xb4, 0x4b, 0x33, 0x67, 0xbc, 0x63, 0xb9, 0xc0, 0x46, 0x31, 0x6f, 0x5e, + 0x69, 0xb6, 0xb2, 0x57, 0x9c, 0x4d, 0x8b, 0x81, 0x77, 0x6e, 0x40, 0x72, + 0xb3, 0xac, 0x4b, 0x80, 0x84, 0x47, 0x35, 0x83, 0x7b, 0x81, 0x7e, 0xa7, + 0x86, 0x99, 0x83, 0xc6, 0x2d, 0xad, 0x66, 0x64, 0x54, 0x47, 0xa7, 0xd1, + 0x3b, 0x89, 0x53, 0x5d, 0x8b, 0x74, 0x81, 0x83, 0x35, 0x66, 0x87, 0x7e, + 0x82, 0xa7, 0x5a, 0xc0, 0xc6, 0x62, 0xb3, 0xb8, 0xa1, 0xc7, 0x80, 0xd2, + 0x46, 0x91, 0x5e, 0x8c, 0x9b, 0xb8, 0x36, 0xbe, 0x38, 0x89, 0xbd, 0xce, + 0x93, 0x50, 0x5b, 0x93, 0xa6, 0x86, 0x7b, 0x99, 0x72, 0x49, 0x86, 0x7f, + 0x74, 0x69, 0xba, 0x86, 0xa4, 0x38, 0x67, 0x94, 0x62, 0x4d, 0x48, 0x78, + 0x38, 0x90, 0xa9, 0x94, 0x41, 0xb9, 0x40, 0x31, 0x63, 0x8f, 0x34, 0x8f, + 0x5e, 0x36, 0x4c, 0x83, 0xab, 0xba, 0x76, 0xba, 0xb7, 0x67, 0x6c, 0x39, + 0xbc, 0xc5, 0x5e, 0x9d, 0x6c, 0x8b, 0x80, 0x78, 0x48, 0xca, 0xc7, 0xc8, + 0xa9, 0x42, 0x5e, 0x5d, 0x9c, 0x83, 0x6a, 0xc4, 0xcc, 0x9c, 0x75, 0x37, + 0x84, 0x98, 0xcd, 0x82, 0xb5, 0x58, 0x93, 0x74, 0x64, 0xc5, 0x8b, 0x84, + 0x9a, 0x40, 0x42, 0x4c, 0x69, 0xa4, 0xba, 0xa6, 0x61, 0xba, 0x4b, 0x32, + 0x91, 0xd5, 0x71, 0xa3, 0x76, 0x6a, 0x37, 0x4e, 0x9d, 0x40, 0x71, 0xbb, + 0x80, 0xd0, 0x89, 0x47, 0x35, 0x4d, 0x48, 0xd4, 0xb5, 0x95, 0xac, 0x87, + 0x6a, 0x60, 0xac, 0x40, 0xa4, 0x82, 0x96, 0x5f, 0x98, 0x6a, 0x72, 0xd2, + 0x31, 0x47, 0xc0, 0x97, 0xc7, 0x55, 0x4b, 0x68, 0xac, 0xc5, 0x67, 0x80, + 0x66, 0xab, 0xc9, 0x46, 0xcb, 0x55, 0xc2, 0x41, 0xbe, 0xc9, 0xa6, 0x74, + 0x98, 0x60, 0x45, 0xa5, 0x6a, 0x75, 0xa4, 0x97, 0x3f, 0xca, 0x96, 0x35, + 0x56, 0x50, 0x88, 0x99, 0xd2, 0xb1, 0x89, 0xb3, 0x6b, 0x90, 0x66, 0x6f, + 0x5b, 0x43, 0x53, 0x9c, 0xcb, 0xd1, 0x66, 0x82, 0x73, 0xc9, 0x98, 0x4f, + 0x9e, 0x34, 0x77, 0x6a, 0xd5, 0x9f, 0x3a, 0xc3, 0x7b, 0x56, 0x43, 0xa4, + 0xbf, 0x29, 0x95, 0xa2, 0x9b, 0x3b, 0xc1, 0x80, 0x2c, 0x90, 0xa9, 0x4f, + 0xa7, 0x48, 0x5c, 0x7d, 0x35, 0xb3, 0x9a, 0xb3, 0x63, 0xca, 0xcb, 0xcd, + 0x36, 0xb1, 0xc2, 0x71, 0x8e, 0xa3, 0x46, 0x42, 0x63, 0x84, 0x6b, 0xa9, + 0x91, 0xd4, 0xb6, 0x58, 0xcb, 0xa9, 0x67, 0xc8, 0xae, 0xc4, 0x4e, 0x93, + 0x92, 0x33, 0xc0, 0xbc, 0x8c, 0xad, 0xa0, 0xba, 0xc5, 0x92, 0x3b, 0x47, + 0x6d, 0x6b, 0x67, 0xd1, 0x86, 0x79, 0x71, 0x3d, 0x5f, 0xa3, 0x52, 0xb8, + 0x83, 0xba, 0xbf, 0xb9, 0x9e, 0x63, 0x9b, 0xd3, 0x58, 0x2c, 0x6f, 0x7a, + 0xcd, 0xc6, 0xda, 0xa9, 0x82, 0xba, 0x42, 0x5b, 0xd0, 0x8d, 0x47, 0x9b, + 
0xba, 0x70, 0x90, 0x75, 0x97, 0x61, 0xad, 0x65, 0xc3, 0x69, 0x9f, 0x97, + 0x45, 0x41, 0x2e, 0xd2, 0xb4, 0x4e, 0x37, 0x64, 0x5e, 0x86, 0x75, 0xc6, + 0x8d, 0xb9, 0x49, 0x76, 0x98, 0x75, 0x35, 0x5b, 0xbb, 0x45, 0x92, 0x8a, + 0xa9, 0x67, 0x7a, 0x5e, 0x98, 0x37, 0x4a, 0xc6, 0x5b, 0xb5, 0x89, 0x8f, + 0x9b, 0x4b, 0x45, 0x67, 0xa2, 0x72, 0x41, 0x8c, 0x39, 0x6d, 0x57, 0x62, + 0xa4, 0x5f, 0x85, 0xb9, 0xcc, 0x85, 0xa2, 0xa2, 0xbb, 0x30, 0xb3, 0x63, + 0x4a, 0x3c, 0x73, 0x72, 0x80, 0x8c, 0x65, 0x68, 0xc5, 0x4c, 0xb1, 0x3f, + 0xb2, 0x83, 0x92, 0xc8, 0xb6, 0x39, 0x4a, 0x9c, 0xc1, 0x69, 0x5c, 0xb5, + 0x69, 0xc1, 0x73, 0x31, 0x35, 0x51, 0x5b, 0xd2, 0x8b, 0xab, 0x5d, 0x37, + 0x48, 0x6d, 0x6f, 0x50, 0x94, 0x76, 0xcb, 0x51, 0x95, 0x4e, 0xba, 0xa2, + 0x3b, 0x5f, 0x83, 0x90, 0x79, 0xc6, 0x71, 0x87, 0xcf, 0x7a, 0x47, 0x5c, + 0x66, 0x63, 0x6a, 0xcf, 0x49, 0x4b, 0x3d, 0x32, 0x4f, 0x65, 0x63, 0xb0, + 0xa6, 0x6a, 0x9b, 0xcf, 0xa4, 0xbb, 0x49, 0x7c, 0x40, 0x4d, 0x7f, 0x4e, + 0x51, 0x4c, 0x85, 0xd0, 0xc4, 0x4f, 0xce, 0xb8, 0x9c, 0xb1, 0x84, 0xc7, + 0x5a, 0x5f, 0x76, 0x36, 0x33, 0x81, 0x61, 0xc6, 0xb6, 0x70, 0x44, 0x54, + 0x58, 0xce, 0x88, 0x80, 0x64, 0x34, 0x58, 0x94, 0xaf, 0x67, 0xc7, 0x85, + 0x96, 0xa5, 0x84, 0x5a, 0x4c, 0x9c, 0x37, 0x42, 0x5e, 0x64, 0x59, 0x65, + 0x33, 0x67, 0x48, 0x4a, 0xa2, 0x93, 0x74, 0xa7, 0x52, 0x33, 0x92, 0xb0, + 0x85, 0xae, 0x72, 0xba, 0x3a, 0x40, 0xc8, 0x3c, 0x41, 0x54, 0x5a, 0x3a, + 0x5c, 0x85, 0x56, 0xb2, 0xbc, 0x60, 0x6a, 0xb1, 0x54, 0xba, 0xb6, 0x8f, + 0x72, 0x95, 0xc5, 0xa7, 0x56, 0xab, 0x59, 0x74, 0x57, 0x6b, 0x6a, 0x8f, + 0x43, 0x77, 0x91, 0x97, 0x77, 0xbe, 0xad, 0x67, 0x5e, 0xcc, 0x6c, 0x4d, + 0x31, 0x77, 0x89, 0xc4, 0x8a, 0x6d, 0x9d, 0x64, 0x7d, 0xc9, 0x82, 0xac, + 0x80, 0x92, 0x95, 0x8a, 0x78, 0xae, 0x7b, 0x86, 0x58, 0x69, 0x8a, 0xb7, + 0xca, 0xb3, 0x3a, 0x4c, 0x75, 0x55, 0x43, 0x46, 0x82, 0x65, 0x52, 0xb1, + 0x91, 0x8d, 0xca, 0xa4, 0x4d, 0xb5, 0x47, 0xc6, 0x7b, 0x46, 0x7b, 0x50, + 0x54, 0x89, 0x67, 0x92, 0x5f, 0x72, 0x55, 0x60, 0x3d, 0x66, 0xcd, 0x51, + 0x98, 0x7f, 0x9e, 0x63, 0xa6, 0x4d, 0x9c, 0x9d, 0x7a, 0x90, 0xcd, 0x51, + 0x3c, 0x71, 0xb3, 0x73, 0x9b, 0x54, 0x9b, 0xae, 0xb1, 0x87, 0x5b, 0x55, + 0x73, 0x78, 0x5f, 0xa0, 0x68, 0x5f, 0x67, 0x40, 0x57, 0x63, 0x47, 0xad, + 0x3d, 0xb4, 0xa7, 0xa6, 0x80, 0x91, 0x31, 0x76, 0xd2, 0xcb, 0xcc, 0x99, + 0x81, 0x7e, 0x4d, 0x3e, 0xc1, 0xcf, 0x38, 0x71, 0x88, 0x83, 0xbb, 0x38, + 0x56, 0x5a, 0x99, 0x78, 0x88, 0xc8, 0x37, 0xbf, 0x3b, 0x4a, 0x88, 0xb4, + 0xcd, 0x58, 0x90, 0x66, 0x52, 0x4d, 0x56, 0x41, 0xc3, 0x7b, 0x40, 0x3b, + 0x88, 0x3a, 0x66, 0xc6, 0xca, 0x85, 0x8f, 0xc9, 0x44, 0x44, 0xa0, 0x4f, + 0x65, 0x5e, 0x3b, 0x58, 0x53, 0x9c, 0x8d, 0x8f, 0xb5, 0x41, 0xaf, 0xc1, + 0x32, 0xc8, 0x60, 0x4e, 0x3e, 0xad, 0x39, 0x4c, 0xb2, 0x54, 0x3a, 0x42, + 0xb5, 0x6e, 0x59, 0x4a, 0x62, 0x45, 0xc6, 0x35, 0x62, 0x7d, 0x88, 0x78, + 0x51, 0x92, 0x5d, 0xcc, 0x5f, 0x2f, 0x83, 0x58, 0x49, 0x5d, 0x7b, 0x47, + 0xbd, 0x7b, 0xa0, 0xbf, 0xb9, 0x6d, 0x8b, 0x6b, 0xbc, 0x99, 0x88, 0x32, + 0xcd, 0x40, 0x83, 0xb3, 0x47, 0x4e, 0x84, 0xb4, 0x31, 0x55, 0x63, 0xd1, + 0xc2, 0x56, 0x50, 0x8c, 0xa3, 0x5a, 0x85, 0xd1, 0x39, 0x92, 0xcc, 0x48, + 0x6b, 0xb8, 0x7b, 0x50, 0x82, 0x3b, 0xb6, 0xa1, 0xbc, 0xa9, 0xbd, 0xcf, + 0x92, 0x65, 0xad, 0x89, 0xbd, 0xc0, 0x73, 0xa0, 0x59, 0xb1, 0x81, 0x86, + 0xa3, 0x50, 0xc5, 0x63, 0x62, 0x62, 0x57, 0xc9, 0x4a, 0xaf, 0xcf, 0x61, + 0x48, 0x42, 0x34, 0xae, 0xc4, 0x76, 0xb0, 0xa8, 0x4d, 0x40, 0x64, 0x99, + 0x72, 0x86, 0x54, 0x78, 0xb0, 0x9f, 0x2d, 0x46, 0x73, 0xa1, 0x75, 0x56, + 0xc2, 0x9e, 0x53, 0x30, 0x42, 0xc1, 0x4b, 0x92, 0x63, 0x51, 0x3e, 0x30, + 
0x6e, 0xaa, 0x8f, 0x53, 0xc3, 0x43, 0x97, 0x62, 0x87, 0x4f, 0x91, 0x5a, + 0x47, 0xa1, 0xd3, 0x41, 0x69, 0x5b, 0x4c, 0x90, 0x9f, 0x63, 0x57, 0x4e, + 0x3d, 0xb7, 0x8a, 0x49, 0x93, 0x42, 0xb8, 0x40, 0x57, 0x83, 0x8b, 0xa9, + 0x6c, 0x85, 0x84, 0x5d, 0x6e, 0x6b, 0xa0, 0x45, 0xa4, 0x5f, 0x7f, 0xa6, + 0x6f, 0xa8, 0xd2, 0x4e, 0x99, 0xc3, 0x49, 0x83, 0x45, 0x6d, 0x83, 0xa4, + 0xb2, 0xb3, 0x7c, 0xc7, 0x88, 0x67, 0x72, 0x96, 0xac, 0x62, 0xa3, 0xc0, + 0x68, 0x52, 0x31, 0xb7, 0xc9, 0x97, 0xa7, 0x79, 0x3a, 0x8e, 0x5c, 0x5d, + 0x58, 0xc9, 0x38, 0x7b, 0x46, 0x55, 0x8b, 0x76, 0xcc, 0x58, 0xb2, 0xa0, + 0xa4, 0xb2, 0x98, 0x9d, 0x4d, 0x9d, 0xb2, 0x93, 0xa6, 0x34, 0x5f, 0xc0, + 0x5c, 0x31, 0x9c, 0x91, 0x79, 0x6b, 0x6f, 0xc9, 0x53, 0x44, 0xa0, 0x6e, + 0xaa, 0x99, 0x8a, 0x50, 0xba, 0xbc, 0x6f, 0x62, 0x92, 0x4b, 0xcd, 0xb9, + 0x54, 0xb8, 0x7a, 0xcb, 0xc3, 0x9c, 0x85, 0x7c, 0x36, 0x5d, 0x61, 0x60, + 0x95, 0x82, 0xd0, 0xb8, 0xc7, 0x52, 0x9c, 0x91, 0xc2, 0x42, 0x65, 0x40, + 0x64, 0x82, 0x88, 0xbf, 0x38, 0x35, 0x93, 0xbc, 0xb1, 0xb0, 0xcc, 0xbf, + 0x58, 0xa4, 0xcd, 0xc0, 0xa0, 0x68, 0x8a, 0x80, 0xba, 0x5d, 0x3f, 0xae, + 0x38, 0x62, 0x31, 0x9d, 0xb0, 0x56, 0xbd, 0xb2, 0x2f, 0x4d, 0xb7, 0x8a, + 0x9b, 0xb4, 0x38, 0xc9, 0xc2, 0x36, 0x68, 0xc1, 0x63, 0x95, 0x42, 0x51, + 0x69, 0x83, 0xab, 0x99, 0x9e, 0xc2, 0x97, 0xc1, 0x6b, 0xc6, 0x4c, 0x78, + 0x65, 0x59, 0x4d, 0x49, 0x3d, 0xce, 0x9c, 0xa9, 0x7d, 0x8c, 0x44, 0x63, + 0x3f, 0x76, 0x9c, 0xc7, 0x54, 0x47, 0xaa, 0x6c, 0x95, 0x69, 0x93, 0x64, + 0xc2, 0x40, 0x51, 0x50, 0x9c, 0x74, 0x57, 0xb2, 0x79, 0xbe, 0x35, 0xb6, + 0x68, 0x9b, 0x72, 0xa3, 0xb5, 0xd7, 0x63, 0x9e, 0x76, 0x37, 0x89, 0x70, + 0x6f, 0xa7, 0x96, 0x3b, 0x94, 0xc6, 0xc1, 0x76, 0x7c, 0x6f, 0x66, 0x43, + 0x75, 0x72, 0x43, 0x8a, 0xc8, 0xd5, 0x96, 0xa8, 0x4b, 0xc2, 0x73, 0x6a, + 0xa3, 0x7a, 0xe5, 0x7e, 0x57, 0x6e, 0x9d, 0x7a, 0xad, 0xb1, 0xc1, 0x3c, + 0xad, 0xb5, 0xa9, 0x97, 0x34, 0x2a, 0x30, 0x6b, 0x36, 0x63, 0x68, 0xa6, + 0x79, 0x93, 0x55, 0xb8, 0x80, 0x81, 0xd4, 0x7b, 0x7d, 0x99, 0x9e, 0x67, + 0x6f, 0x4e, 0x5e, 0x9e, 0x34, 0xa0, 0x9c, 0x94, 0x5f, 0xb3, 0xcd, 0xad, + 0x9a, 0x5b, 0x43, 0x7a, 0x73, 0x33, 0x82, 0xc7, 0x56, 0x9e, 0x3f, 0x54, + 0x50, 0xa3, 0xa1, 0x29, 0x75, 0x44, 0x9f, 0xb9, 0x9a, 0x74, 0x7b, 0xbb, + 0x3c, 0xd7, 0x84, 0x53, 0x45, 0x5d, 0x5d, 0x36, 0xa5, 0x54, 0x67, 0xba, + 0x8b, 0x99, 0xce, 0x97, 0xac, 0xa0, 0xc0, 0xa7, 0x3f, 0x90, 0x60, 0x7e, + 0xa2, 0x5a, 0x86, 0xdb, 0x25, 0x9d, 0xca, 0xbe, 0x84, 0x50, 0xab, 0x7e, + 0x25, 0x44, 0x59, 0x44, 0x9a, 0x51, 0x5b, 0x6d, 0x4e, 0xc7, 0x33, 0xa2, + 0x42, 0x97, 0x5c, 0xe1, 0x37, 0x5c, 0xb0, 0x87, 0x56, 0xd4, 0x9d, 0x94, + 0x68, 0x9b, 0xce, 0x78, 0x89, 0xaa, 0x94, 0x5b, 0x45, 0x6a, 0x6b, 0x4f, + 0x51, 0x90, 0x3c, 0x55, 0x76, 0x48, 0xbc, 0x5b, 0x60, 0xbb, 0x84, 0x97, + 0x79, 0x4e, 0x6c, 0xcc, 0x9b, 0x73, 0x6f, 0xa3, 0x9f, 0x79, 0xcc, 0x74, + 0x42, 0x6e, 0x50, 0xca, 0xc8, 0xc8, 0x61, 0xbb, 0x63, 0xd0, 0x9e, 0x4f, + 0xa1, 0xa7, 0xaf, 0x39, 0x5d, 0x5c, 0xac, 0x9b, 0xa3, 0x99, 0x6d, 0xa2, + 0x83, 0x52, 0x8c, 0x7f, 0x7c, 0xc1, 0xa4, 0xb2, 0xd0, 0xd2, 0xb5, 0xb4, + 0xd5, 0x9e, 0xcb, 0x6d, 0xa0, 0xe3, 0x95, 0x51, 0xce, 0x90, 0xc7, 0x90, + 0xad, 0x88, 0x52, 0x34, 0x49, 0xa7, 0x41, 0xb3, 0x9f, 0x8f, 0x63, 0x5f, + 0x9c, 0xbc, 0x39, 0x5b, 0x8b, 0xb4, 0xad, 0x91, 0x6f, 0xca, 0x49, 0xa4, + 0x91, 0x42, 0x9a, 0x44, 0x60, 0x66, 0x8d, 0xa5, 0x33, 0xc6, 0xba, 0xb8, + 0x97, 0xd9, 0xbe, 0x72, 0x56, 0xca, 0xa3, 0x3a, 0x76, 0xae, 0xa7, 0x90, + 0x55, 0x9d, 0x49, 0xc5, 0x46, 0x6d, 0xbe, 0x3d, 0x5a, 0xab, 0x85, 0xb9, + 0xa7, 0x78, 0x3e, 0x83, 0xb1, 0x84, 0x39, 0x8c, 0x8e, 0xad, 0x84, 0x50, + 
0xa5, 0xaa, 0xd2, 0x65, 0x39, 0xbe, 0x9a, 0x5f, 0x6f, 0x53, 0xa4, 0xa7, + 0xd1, 0x73, 0x5e, 0x6a, 0x81, 0x43, 0x8c, 0xa2, 0x3b, 0x83, 0x64, 0xa9, + 0xb7, 0xa2, 0x94, 0x3b, 0x3c, 0x88, 0x4d, 0x5f, 0xc0, 0x96, 0x2a, 0x87, + 0x38, 0xdd, 0x47, 0xc3, 0x81, 0x2d, 0x7e, 0x60, 0xc3, 0x5e, 0xa9, 0xba, + 0xa5, 0xb6, 0x60, 0x56, 0x9d, 0x58, 0x39, 0x4f, 0x8f, 0x43, 0x73, 0x6c, + 0xb7, 0x96, 0xc1, 0x81, 0x34, 0x4f, 0x2b, 0x7b, 0xaf, 0xcb, 0x83, 0x93, + 0x42, 0xb5, 0x57, 0x9b, 0x49, 0x7f, 0xd2, 0x52, 0xb1, 0x8b, 0x9b, 0xcd, + 0x75, 0xb3, 0xbf, 0x77, 0xd2, 0xca, 0x6a, 0x69, 0x55, 0x4b, 0xbd, 0x78, + 0xc5, 0x9d, 0x9a, 0x47, 0x54, 0x5b, 0x4b, 0x84, 0xa1, 0x95, 0x4f, 0x53, + 0x77, 0x6e, 0xcc, 0x4e, 0xac, 0x36, 0x6a, 0xba, 0x46, 0x5e, 0x59, 0x50, + 0xa6, 0xa7, 0xa4, 0xb6, 0x31, 0xbf, 0x88, 0x68, 0x7e, 0x6c, 0xa7, 0xa5, + 0xd0, 0x9f, 0x79, 0x8f, 0xb1, 0x2e, 0x38, 0x7f, 0xc5, 0xab, 0x9b, 0x89, + 0x9b, 0x34, 0xd0, 0xc8, 0xb1, 0xaa, 0x79, 0x50, 0xb8, 0xba, 0xc9, 0x62, + 0x9d, 0xb2, 0x83, 0xc2, 0x5a, 0xaa, 0x71, 0x3f, 0x6d, 0x39, 0x7e, 0xbf, + 0x5d, 0x39, 0x42, 0x52, 0xbc, 0x82, 0xa8, 0x98, 0xb0, 0x5f, 0xa8, 0x59, + 0x4a, 0x8a, 0x8a, 0x2b, 0x7f, 0xc3, 0xb6, 0x9e, 0x4b, 0x27, 0xaa, 0x64, + 0x8a, 0xa9, 0x5e, 0x98, 0xac, 0x61, 0x8b, 0xb5, 0x95, 0x87, 0xd6, 0x44, + 0x9d, 0x67, 0xc3, 0xd4, 0x9a, 0x2d, 0x4b, 0x88, 0xa8, 0x41, 0xc9, 0x61, + 0xdd, 0x41, 0x3b, 0xc2, 0x8b, 0x5c, 0xb5, 0xab, 0x8f, 0x3f, 0xa7, 0x50, + 0x59, 0x2f, 0x9f, 0x65, 0x8d, 0x75, 0xa8, 0x6c, 0x93, 0xa4, 0x96, 0x96, + 0x4d, 0x89, 0x47, 0x73, 0xab, 0x89, 0x76, 0x9c, 0x7e, 0xb2, 0x69, 0xd0, + 0x33, 0xb4, 0xc8, 0x60, 0xc7, 0x86, 0xb0, 0xc2, 0x8f, 0x40, 0x89, 0xa3, + 0xb7, 0x9a, 0xa2, 0x38, 0x80, 0x9a, 0x82, 0x4d, 0x75, 0xeb, 0x7c, 0xa0, + 0x91, 0x51, 0x41, 0xbd, 0x63, 0xc4, 0x47, 0x67, 0xc7, 0xa3, 0x49, 0xda, + 0xb0, 0xa4, 0xca, 0x5c, 0xbe, 0x37, 0x68, 0xbb, 0x94, 0xb8, 0x8f, 0x91, + 0xc5, 0xc4, 0x51, 0x8b, 0x51, 0x42, 0x85, 0x6f, 0x56, 0x57, 0x8e, 0x63, + 0xa4, 0xc6, 0x83, 0x75, 0x44, 0x6d, 0x49, 0x8c, 0x73, 0x1d, 0x6f, 0x7d, + 0x9f, 0x64, 0x98, 0xaf, 0x8e, 0x55, 0x2d, 0xbf, 0xa3, 0xa7, 0x38, 0x27, + 0x92, 0xae, 0x31, 0x53, 0x81, 0x40, 0x5c, 0xa6, 0x33, 0x8a, 0xd7, 0x4c, + 0x24, 0x53, 0x8c, 0x6d, 0x4d, 0xbe, 0x7c, 0x44, 0x3d, 0x26, 0x98, 0xba, + 0xa6, 0x59, 0xa3, 0x80, 0xd4, 0x42, 0x4d, 0x37, 0x9a, 0xb7, 0x98, 0x9a, + 0x78, 0xbe, 0x40, 0xac, 0x94, 0x57, 0x9f, 0x7e, 0x2d, 0xcc, 0x85, 0x74, + 0xb2, 0xd7, 0x8d, 0x3c, 0xd5, 0x78, 0xa3, 0x5c, 0x49, 0x4c, 0xa4, 0x79, + 0x54, 0x37, 0x56, 0x51, 0xa7, 0x41, 0xc5, 0x6e, 0x38, 0x70, 0x2c, 0xc5, + 0x3c, 0xb9, 0x75, 0x8f, 0xd4, 0x3f, 0xc7, 0xc6, 0x8e, 0x51, 0x3a, 0x6e, + 0xc6, 0xb0, 0xd8, 0x6f, 0x84, 0xba, 0x44, 0x2c, 0x4a, 0xb8, 0x1b, 0x2e, + 0x4c, 0x43, 0x51, 0x89, 0x76, 0xa3, 0x70, 0x5e, 0x5f, 0x85, 0xa5, 0x97, + 0x5f, 0xb2, 0x96, 0x79, 0x60, 0xe6, 0xda, 0x75, 0x74, 0x7f, 0x94, 0x7a, + 0x8e, 0x8e, 0x70, 0xd5, 0x9c, 0x9f, 0x59, 0x52, 0xba, 0x90, 0x9d, 0x7f, + 0x4a, 0x8b, 0x66, 0x84, 0x95, 0x70, 0x8d, 0x63, 0x37, 0x82, 0x50, 0xa8, + 0x98, 0x9a, 0x9c, 0x79, 0x82, 0x9d, 0x8e, 0xb3, 0x5e, 0x95, 0xa0, 0x4b, + 0xb1, 0x92, 0x3e, 0xc6, 0xa3, 0xd5, 0x58, 0x3b, 0xaa, 0xbd, 0x4d, 0x5e, + 0x77, 0x3a, 0x60, 0x80, 0x3b, 0x7d, 0x30, 0x8f, 0x7b, 0xa4, 0x6e, 0x84, + 0x9b, 0x82, 0x4a, 0xd3, 0xb4, 0x8c, 0xdc, 0x82, 0x6f, 0xe4, 0x71, 0xa1, + 0xae, 0x85, 0x91, 0x42, 0xa4, 0x54, 0x8d, 0x29, 0x5a, 0xb5, 0x5d, 0x89, + 0x9e, 0x80, 0xda, 0xa9, 0x6c, 0x39, 0x5b, 0xda, 0x7c, 0x73, 0xd4, 0xaf, + 0x47, 0x64, 0x9e, 0x96, 0xac, 0xad, 0x7c, 0x4d, 0x41, 0x41, 0x42, 0x95, + 0xab, 0xc0, 0x7e, 0x24, 0x76, 0x46, 0x64, 0xa0, 0xb8, 0xa1, 0xc5, 0x33, + 
0x3f, 0x75, 0x61, 0xd1, 0x46, 0x7b, 0x9c, 0xae, 0x96, 0x8c, 0x49, 0x65, + 0xb3, 0x70, 0x48, 0x1e, 0x61, 0x8b, 0xa8, 0x6d, 0xba, 0x73, 0x45, 0xcd, + 0x66, 0xc2, 0x31, 0x5d, 0xaf, 0x7c, 0x7f, 0x61, 0x51, 0x64, 0x6a, 0xe2, + 0xa4, 0xa1, 0x5a, 0xce, 0xa4, 0xc5, 0xaf, 0xa5, 0x90, 0x5c, 0x33, 0x62, + 0x61, 0x6a, 0x3e, 0x42, 0x7c, 0x82, 0x88, 0x9a, 0x3b, 0x5e, 0xca, 0x88, + 0x34, 0x80, 0x92, 0x66, 0x4e, 0x6d, 0xc7, 0x51, 0xbd, 0x4d, 0x96, 0x92, + 0x31, 0x79, 0x4b, 0x4c, 0x6e, 0xc7, 0xa6, 0x39, 0x4f, 0x9a, 0x6f, 0x6a, + 0xaf, 0xa5, 0xc7, 0x40, 0xce, 0x89, 0xb6, 0xc7, 0xa6, 0x9c, 0x6b, 0x3b, + 0x38, 0x9b, 0xa8, 0x54, 0xd4, 0x6b, 0xba, 0xa7, 0x7e, 0x3f, 0x66, 0x5e, + 0xbb, 0x84, 0x6b, 0xdc, 0xcc, 0x8f, 0x2f, 0xaa, 0x85, 0xaf, 0xbb, 0x83, + 0x47, 0x5f, 0x50, 0xb8, 0xd1, 0x5f, 0x8d, 0x49, 0x69, 0x52, 0x37, 0x89, + 0xa2, 0x86, 0x46, 0xbb, 0x45, 0x67, 0x70, 0xbd, 0xbc, 0x42, 0x59, 0x89, + 0xa6, 0x44, 0xad, 0x88, 0x80, 0x48, 0xa8, 0x23, 0x30, 0xad, 0x74, 0x7a, + 0x3d, 0xad, 0x52, 0xac, 0x96, 0xc8, 0xa8, 0x4b, 0x8a, 0xbb, 0xcb, 0x88, + 0x8a, 0xc3, 0x30, 0x65, 0x70, 0x83, 0xa5, 0x35, 0xb0, 0xca, 0x75, 0x43, + 0x5e, 0x5d, 0xca, 0xb7, 0x53, 0x4c, 0x24, 0x66, 0x58, 0xab, 0x63, 0x3d, + 0x8b, 0x6c, 0xbc, 0xc8, 0xb4, 0x37, 0x30, 0x37, 0x94, 0xac, 0x37, 0x90, + 0x92, 0x58, 0x76, 0x40, 0xcf, 0x69, 0x42, 0xd6, 0x8f, 0x70, 0xd4, 0x7e, + 0x82, 0xa7, 0x6c, 0xb6, 0xa4, 0x48, 0x88, 0x65, 0xb4, 0x64, 0xbd, 0xc5, + 0x5d, 0xa9, 0x3f, 0xbc, 0x70, 0xd3, 0xae, 0x41, 0xa5, 0xac, 0x55, 0x99, + 0x88, 0x67, 0xbe, 0x8c, 0x51, 0x65, 0xb6, 0xa2, 0x89, 0x54, 0xc7, 0x57, + 0x49, 0xa3, 0xcd, 0x72, 0x7e, 0xcb, 0x47, 0x44, 0xbf, 0x5e, 0xd2, 0x95, + 0x41, 0x99, 0x4e, 0x5d, 0x40, 0xc5, 0x97, 0x42, 0x4a, 0xa1, 0xbf, 0xc4, + 0x9f, 0xbf, 0x8a, 0xae, 0x9b, 0xad, 0x79, 0x5e, 0x9f, 0xa1, 0x95, 0x6a, + 0x4d, 0x98, 0x96, 0x4e, 0xa3, 0x85, 0x61, 0x5f, 0x86, 0x87, 0xa1, 0xc7, + 0x98, 0x49, 0x46, 0xb9, 0x58, 0x3d, 0x6e, 0x73, 0xa0, 0x49, 0x57, 0xae, + 0xa5, 0x58, 0x65, 0x37, 0x2f, 0x87, 0x77, 0x81, 0x7a, 0xb2, 0x6f, 0x87, + 0x56, 0x97, 0x77, 0x76, 0x51, 0xcd, 0x75, 0xb4, 0x65, 0xb3, 0x71, 0xce, + 0x71, 0xa2, 0x96, 0x2c, 0xc5, 0x51, 0x78, 0x8a, 0xbb, 0x67, 0x49, 0xba, + 0x5b, 0xd8, 0x73, 0x83, 0x48, 0x60, 0x49, 0x63, 0x91, 0x73, 0x5d, 0xb7, + 0x68, 0xc2, 0x9d, 0xc9, 0xc7, 0x98, 0xd4, 0x66, 0x84, 0x4e, 0xc6, 0x80, + 0xa9, 0x8d, 0x5e, 0x8b, 0x7a, 0x90, 0xc8, 0x9c, 0xc2, 0x5e, 0x7f, 0x45, + 0x6d, 0x4a, 0xa2, 0x35, 0x86, 0xcc, 0x4d, 0x8d, 0x45, 0xa4, 0xc3, 0x98, + 0xbc, 0xd7, 0xd4, 0xc4, 0xdd, 0xbf, 0x3c, 0x81, 0x91, 0xd8, 0x37, 0x54, + 0x98, 0xcf, 0x86, 0xb4, 0x62, 0x51, 0xd3, 0x4e, 0xad, 0xd5, 0x51, 0x34, + 0xc2, 0x7e, 0x7d, 0x55, 0x42, 0xd8, 0x95, 0x8f, 0xc8, 0xbe, 0x9d, 0xa4, + 0x77, 0x90, 0x76, 0x84, 0x34, 0x4c, 0xaf, 0xc0, 0xbd, 0x64, 0x56, 0xa0, + 0x59, 0xa6, 0x61, 0xd2, 0x7a, 0x87, 0xa6, 0x65, 0x49, 0x35, 0xbf, 0xc7, + 0x32, 0x46, 0xbd, 0xb1, 0xbe, 0x3f, 0xc6, 0x89, 0x82, 0xb0, 0x36, 0x44, + 0x20, 0x3c, 0x68, 0xbe, 0x8c, 0x65, 0xaa, 0x4d, 0x65, 0x72, 0x50, 0x5b, + 0x57, 0x57, 0xa6, 0x3a, 0xd2, 0xb6, 0xc4, 0x50, 0xad, 0x6a, 0xa6, 0xa4, + 0xc8, 0x80, 0xa3, 0x75, 0x63, 0xc4, 0x78, 0xb0, 0x45, 0x44, 0x5d, 0x7b, + 0xc8, 0xb9, 0x73, 0x44, 0x83, 0x8a, 0x5b, 0x57, 0x7a, 0xc4, 0x6a, 0xcb, + 0xb0, 0x69, 0x7f, 0x39, 0xa0, 0xa2, 0x6d, 0x53, 0xcd, 0x38, 0x52, 0x84, + 0xcb, 0x33, 0x5e, 0x76, 0x56, 0xc2, 0x52, 0x8f, 0x59, 0x73, 0x8b, 0xeb, + 0x9a, 0xd1, 0x34, 0xb2, 0x7f, 0xd9, 0x70, 0x98, 0x83, 0x49, 0x70, 0x9c, + 0x82, 0xb4, 0x76, 0xcd, 0x56, 0x33, 0x6a, 0x71, 0x42, 0x95, 0x37, 0x40, + 0x58, 0xad, 0x9e, 0x86, 0x82, 0xb9, 0x3e, 0xd8, 0xc9, 0xbc, 0x4d, 0x8a, + 
0xc5, 0xd5, 0x87, 0xb3, 0xaf, 0xaa, 0x99, 0xc5, 0x69, 0x98, 0xa5, 0x84, + 0x68, 0xd4, 0x80, 0x4b, 0x89, 0xbc, 0x63, 0x4f, 0x92, 0x4c, 0x81, 0x5d, + 0x63, 0x8b, 0xca, 0x63, 0x56, 0x72, 0x5f, 0x7b, 0x7f, 0x33, 0x7c, 0x31, + 0x36, 0x63, 0xc6, 0x66, 0x39, 0x9f, 0x5c, 0x78, 0xa1, 0xdd, 0xa0, 0x5a, + 0x8e, 0x4b, 0x6f, 0x49, 0xbb, 0x4b, 0xc2, 0x95, 0x97, 0x51, 0xa8, 0x97, + 0xb3, 0x60, 0x33, 0xa9, 0xdd, 0x5c, 0x48, 0xc1, 0xbc, 0xbd, 0xcf, 0x58, + 0x62, 0xaf, 0x5e, 0x7d, 0x86, 0x42, 0x7b, 0xbb, 0x7c, 0xb1, 0x75, 0x96, + 0x8c, 0xd2, 0x7a, 0xc2, 0x57, 0x3b, 0x92, 0x63, 0xbf, 0x7d, 0x3f, 0xb6, + 0x6d, 0x75, 0x38, 0xaa, 0xc6, 0x56, 0xb3, 0x46, 0xcd, 0x66, 0xa8, 0xc6, + 0x89, 0x5f, 0x87, 0xbe, 0x41, 0x4f, 0xb9, 0x98, 0xb9, 0x49, 0x94, 0x99, + 0x88, 0x9e, 0xc4, 0x2d, 0xb9, 0xc3, 0xcf, 0xa0, 0x7a, 0xc0, 0x63, 0x73, + 0x81, 0x91, 0x74, 0x77, 0x60, 0xb3, 0x3d, 0x88, 0x77, 0x30, 0x9c, 0x75, + 0x31, 0x91, 0x4a, 0x9a, 0x9c, 0x89, 0x5f, 0x69, 0x81, 0x63, 0x7c, 0x62, + 0x6d, 0x7a, 0x48, 0x9b, 0xb9, 0x44, 0x46, 0xc6, 0x63, 0x9a, 0xb1, 0x74, + 0xaf, 0x8b, 0x89, 0xcd, 0xa6, 0xc4, 0x7d, 0x5b, 0xc1, 0x47, 0x5a, 0x37, + 0xb9, 0xb4, 0x64, 0x3f, 0x4b, 0xb6, 0xbf, 0x95, 0x70, 0xa8, 0x71, 0x8b, + 0x4c, 0x42, 0x4a, 0x84, 0x89, 0xa4, 0x4c, 0x67, 0xad, 0xc4, 0xd2, 0x6a, + 0x99, 0x70, 0x99, 0x9a, 0x96, 0x45, 0xa4, 0xb7, 0xa3, 0xc0, 0x36, 0x89, + 0x75, 0xcf, 0xb0, 0xa5, 0x2d, 0x36, 0xae, 0xa3, 0x53, 0x9a, 0x36, 0x5b, + 0x6f, 0x8c, 0x8c, 0x96, 0x76, 0x85, 0xd1, 0xd5, 0xb0, 0x38, 0x99, 0x72, + 0x7e, 0xab, 0x68, 0xd4, 0xaa, 0x9d, 0xc2, 0x49, 0xa8, 0x91, 0x70, 0xaa, + 0x90, 0xa3, 0xbe, 0x74, 0x6a, 0xa6, 0xbc, 0x98, 0xb0, 0x3b, 0x52, 0xc3, + 0x8c, 0x9f, 0xba, 0x3e, 0xa7, 0x53, 0xa2, 0x69, 0x5c, 0x8e, 0x64, 0xad, + 0xcf, 0xaf, 0x62, 0xd1, 0x5b, 0x4e, 0xa8, 0x9b, 0x84, 0xa8, 0x6a, 0xd0, + 0x34, 0xca, 0xa8, 0x51, 0x9a, 0x7b, 0x95, 0x59, 0x56, 0x41, 0x96, 0x33, + 0x46, 0x96, 0x71, 0x8b, 0x5e, 0xc6, 0xb8, 0x52, 0x68, 0x7d, 0x26, 0x86, + 0x37, 0x3b, 0xab, 0xc1, 0x6f, 0x42, 0x99, 0x92, 0xa6, 0xb6, 0x72, 0x96, + 0x77, 0x9d, 0x4d, 0x5c, 0x9c, 0x64, 0xce, 0x94, 0x67, 0x36, 0xb0, 0x5d, + 0x36, 0x5b, 0x3c, 0xa3, 0xcb, 0x6c, 0x47, 0x39, 0x4a, 0x92, 0xbb, 0xc9, + 0x72, 0x65, 0xc9, 0xcb, 0x98, 0x5a, 0x5c, 0x59, 0x61, 0xb1, 0x5c, 0xd9, + 0x90, 0xdc, 0x4c, 0xc4, 0x35, 0xc5, 0x87, 0x47, 0x3a, 0xc6, 0x3e, 0x49, + 0x66, 0xd2, 0x88, 0xa9, 0xb6, 0x92, 0x7b, 0xb6, 0x5f, 0x55, 0x83, 0x35, + 0xbf, 0x6e, 0x4b, 0x61, 0x5e, 0x74, 0xc8, 0xd5, 0xb7, 0x66, 0x77, 0xc4, + 0x80, 0x6a, 0x6f, 0x32, 0x44, 0x9a, 0xa6, 0xa2, 0x99, 0x97, 0xaf, 0x7a, + 0x57, 0x4a, 0x38, 0x65, 0x9d, 0x73, 0xa4, 0x47, 0x56, 0x2e, 0x4c, 0x7a, + 0x82, 0x92, 0xab, 0xa2, 0xc0, 0xc6, 0xc4, 0xc1, 0x6d, 0x47, 0x63, 0x32, + 0x53, 0x9c, 0x67, 0xc6, 0xca, 0x95, 0x67, 0x9f, 0x73, 0xcb, 0xcd, 0x75, + 0xce, 0x85, 0xc8, 0x3a, 0x9f, 0x6e, 0x5c, 0xca, 0x9a, 0x2c, 0x51, 0x5b, + 0x5a, 0x8b, 0xb0, 0x4d, 0x59, 0xcd, 0x93, 0xaf, 0x89, 0xae, 0x72, 0x4e, + 0x36, 0xb1, 0x84, 0xb2, 0x8a, 0x83, 0x7b, 0x58, 0xa9, 0x8f, 0x93, 0x77, + 0x84, 0x51, 0x73, 0xb4, 0x67, 0xbd, 0xbe, 0x5c, 0x54, 0x6e, 0x9b, 0xb8, + 0x92, 0xac, 0x67, 0x3d, 0xd8, 0x51, 0x30, 0x39, 0x4f, 0x74, 0x45, 0x61, + 0xbe, 0x9f, 0xa2, 0xc0, 0xcb, 0x5a, 0x48, 0x9f, 0xbb, 0xbd, 0x65, 0x51, + 0x3f, 0x8e, 0x94, 0x5e, 0x58, 0x84, 0x68, 0x4e, 0x2f, 0x52, 0x8c, 0x50, + 0xaf, 0x80, 0xa9, 0xd1, 0x96, 0x4e, 0x4e, 0xb5, 0x6c, 0x77, 0x95, 0x40, + 0xd5, 0x73, 0xa1, 0x3e, 0xcd, 0x49, 0x5f, 0xc9, 0x9d, 0x9e, 0xb0, 0x60, + 0xc2, 0xd2, 0x9e, 0xb6, 0x4f, 0xbd, 0x8e, 0x93, 0x91, 0x74, 0xba, 0x3f, + 0x7d, 0x7f, 0x5c, 0xb6, 0xce, 0x74, 0x69, 0x91, 0xcd, 0x42, 0xb7, 0x70, + 
0x81, 0x45, 0xc8, 0x7d, 0x51, 0x51, 0x74, 0x65, 0xbf, 0x50, 0x57, 0x63, + 0x86, 0x49, 0x83, 0xa6, 0x50, 0xb3, 0x66, 0x5e, 0x3b, 0x2f, 0xaf, 0x83, + 0xb8, 0x30, 0x9e, 0x57, 0x42, 0xa8, 0x95, 0x4a, 0x4b, 0x6c, 0x2e, 0x6e, + 0x47, 0xb0, 0x9a, 0x85, 0xc3, 0x77, 0xaf, 0x71, 0x63, 0x59, 0x7b, 0x36, + 0x45, 0x91, 0xb5, 0xca, 0x87, 0x47, 0x99, 0x75, 0x91, 0x48, 0x73, 0x4f, + 0x84, 0x4a, 0xa3, 0x3c, 0x5b, 0x31, 0x3c, 0xc6, 0x7b, 0x55, 0x48, 0x42, + 0xcb, 0x65, 0x5e, 0xb8, 0xa1, 0x30, 0xa9, 0xbc, 0x73, 0x5e, 0x4b, 0x95, + 0xa3, 0x80, 0x7e, 0x6a, 0xa9, 0xc8, 0x95, 0x38, 0x91, 0x54, 0x9a, 0xb2, + 0xb5, 0x4b, 0x95, 0x8d, 0x93, 0x87, 0x61, 0x93, 0x56, 0x1d, 0x49, 0x80, + 0x6b, 0xb2, 0xbb, 0x61, 0x82, 0x61, 0xa1, 0xb4, 0x3b, 0x5a, 0x5e, 0x78, + 0x47, 0x64, 0xb4, 0xcd, 0x6a, 0x79, 0xc2, 0xba, 0x7d, 0x7c, 0xa2, 0xdd, + 0xbc, 0xc2, 0x86, 0x3c, 0x45, 0x69, 0xc2, 0x54, 0x9d, 0x49, 0xa9, 0x6c, + 0x84, 0xa3, 0x48, 0x86, 0x31, 0x82, 0x76, 0x49, 0x2e, 0x99, 0x9b, 0xac, + 0xb4, 0x3c, 0xe1, 0xad, 0xba, 0x3f, 0xc1, 0x8b, 0x82, 0xbd, 0x76, 0x64, + 0xbf, 0x5f, 0xa2, 0x95, 0x5b, 0x78, 0x6d, 0x4e, 0xd9, 0x59, 0x49, 0xa0, + 0x44, 0x62, 0x89, 0x39, 0x9c, 0x37, 0xa7, 0x99, 0x79, 0x4a, 0x35, 0x4a, + 0x73, 0x6f, 0xbb, 0x3c, 0xbd, 0xcf, 0x3f, 0x32, 0xcc, 0x84, 0x39, 0x64, + 0x40, 0xc6, 0x8f, 0xb5, 0xc0, 0xab, 0x97, 0x6c, 0x67, 0x67, 0x3d, 0x8f, + 0x74, 0x32, 0x92, 0xd0, 0xcb, 0x69, 0xa1, 0xc0, 0x9d, 0x70, 0x70, 0x70, + 0x3c, 0x81, 0xb5, 0xbf, 0x4c, 0x90, 0xd9, 0x90, 0xc7, 0xac, 0xa0, 0x89, + 0x91, 0x99, 0x92, 0xc5, 0xbb, 0x77, 0xa8, 0x69, 0xb9, 0x76, 0x8e, 0x45, + 0xa7, 0xdb, 0xa3, 0xa9, 0xb0, 0x72, 0xa1, 0x60, 0x8a, 0x9e, 0x86, 0x69, + 0xaa, 0x4a, 0x6e, 0x81, 0x3f, 0xac, 0xc0, 0x86, 0xbf, 0xb9, 0xb9, 0x98, + 0x3a, 0x32, 0x8d, 0xab, 0x62, 0x3e, 0x26, 0x54, 0x3a, 0xbc, 0xb6, 0x73, + 0x5a, 0x58, 0x7f, 0x3f, 0x9d, 0xa7, 0xcf, 0x60, 0x5b, 0xbd, 0xac, 0x58, + 0xae, 0x2f, 0x66, 0x86, 0x99, 0xaf, 0x59, 0x2e, 0x34, 0xc5, 0x7f, 0xd1, + 0x89, 0x55, 0xda, 0x78, 0xd9, 0x64, 0x50, 0x4d, 0xdb, 0xcd, 0xca, 0xa3, + 0x5a, 0x78, 0x6a, 0x95, 0x33, 0x87, 0x82, 0xd2, 0x60, 0x64, 0x7f, 0x72, + 0xb8, 0x46, 0x53, 0x3a, 0x69, 0x77, 0xc3, 0xb3, 0x8a, 0x58, 0x47, 0x55, + 0x48, 0xaa, 0xa2, 0x65, 0xaa, 0xb3, 0x64, 0xd0, 0xaf, 0xdc, 0xc5, 0xbf, + 0x77, 0xa0, 0x7d, 0x5e, 0x77, 0x8e, 0x6a, 0x38, 0x53, 0xd2, 0x84, 0xa9, + 0x85, 0xa1, 0xbe, 0x39, 0x6c, 0xc4, 0x64, 0x4d, 0x3f, 0xae, 0x49, 0xc4, + 0x3a, 0x97, 0x6b, 0xb0, 0x55, 0x3d, 0xbf, 0x62, 0xd2, 0xd6, 0x46, 0x7e, + 0x36, 0x6a, 0x97, 0x93, 0x85, 0x48, 0xc6, 0x63, 0xca, 0x35, 0x2d, 0x93, + 0xaa, 0x57, 0x7c, 0x93, 0xaa, 0x85, 0x59, 0x5b, 0x55, 0x88, 0x80, 0xb4, + 0x4b, 0x6c, 0x61, 0xa6, 0xa0, 0x8e, 0x8c, 0x7c, 0x95, 0xd7, 0x6c, 0x84, + 0xcc, 0x35, 0x38, 0xbe, 0xbd, 0x47, 0x4b, 0x86, 0x65, 0x63, 0x29, 0xd5, + 0x9a, 0x48, 0xb7, 0x58, 0x84, 0xb1, 0x4b, 0x89, 0x4a, 0x71, 0x89, 0xb1, + 0x93, 0xae, 0x8c, 0x6e, 0x85, 0x92, 0x99, 0xa4, 0xaa, 0x55, 0x51, 0x61, + 0x80, 0x46, 0x90, 0x72, 0x52, 0x40, 0x64, 0xcc, 0x87, 0x3a, 0x84, 0xc9, + 0x8f, 0x80, 0xbd, 0xa4, 0xba, 0x9b, 0x63, 0xc9, 0x9a, 0x4f, 0x5b, 0xee, + 0x6f, 0x60, 0x77, 0x79, 0x73, 0x59, 0x64, 0x78, 0x9d, 0x50, 0xd5, 0x8f, + 0x3e, 0x93, 0xb7, 0x59, 0xb5, 0x7f, 0x66, 0xbd, 0x4f, 0x48, 0x66, 0xab, + 0x4a, 0xbb, 0x47, 0xb9, 0xb8, 0x68, 0xb1, 0xd3, 0x9a, 0x6a, 0x98, 0x4f, + 0xba, 0x34, 0xca, 0x85, 0x41, 0x89, 0xb0, 0x94, 0x88, 0x3a, 0x88, 0x92, + 0x69, 0x3b, 0x57, 0xb2, 0xb7, 0x7b, 0x7e, 0x3c, 0xc9, 0x8c, 0x9b, 0xba, + 0xca, 0xa9, 0x30, 0xa6, 0x5f, 0xba, 0xc1, 0x46, 0x68, 0x8b, 0x46, 0x7a, + 0xbf, 0x2d, 0xb0, 0xbe, 0x95, 0xa2, 0x6c, 0x82, 0xca, 0x71, 0x8d, 0x9f, + 
0xd4, 0x4d, 0xa0, 0x67, 0x63, 0x74, 0x91, 0xc3, 0x59, 0x98, 0x43, 0x38, + 0xab, 0x5f, 0x7f, 0xac, 0x53, 0x47, 0x64, 0x85, 0x87, 0x91, 0x4c, 0x64, + 0x3c, 0x49, 0x5c, 0x8b, 0x32, 0x6d, 0xd2, 0x82, 0xae, 0x90, 0x7e, 0xbf, + 0xd8, 0x52, 0x75, 0xa8, 0x44, 0x43, 0xbc, 0x83, 0x91, 0x40, 0xa3, 0x7b, + 0x7b, 0xb8, 0x7f, 0xa0, 0xc7, 0xa7, 0x38, 0x5d, 0x81, 0x79, 0x44, 0xcb, + 0x85, 0xb7, 0x9a, 0x4e, 0x52, 0xc8, 0x4d, 0x43, 0x4a, 0x44, 0x3f, 0x5f, + 0xb1, 0xa6, 0x32, 0x6a, 0xd6, 0x4a, 0x7d, 0x6b, 0x82, 0xd2, 0xad, 0x88, + 0x96, 0x96, 0x81, 0x64, 0x4b, 0xdc, 0x4b, 0x6c, 0x7e, 0xbf, 0x7f, 0x8c, + 0x41, 0x5e, 0xc3, 0x46, 0x91, 0x77, 0x72, 0xbc, 0x9d, 0x43, 0x8c, 0x85, + 0xc7, 0xaf, 0xc7, 0x82, 0xcf, 0x59, 0xab, 0x53, 0x7e, 0x58, 0x8d, 0x3d, + 0x91, 0xd5, 0xac, 0xd7, 0x46, 0x9e, 0xc3, 0x95, 0x8e, 0x87, 0x6a, 0xd0, + 0x5e, 0xc0, 0x48, 0x63, 0x5c, 0x94, 0xa9, 0x6e, 0xab, 0xc8, 0x5d, 0xd4, + 0xab, 0xb3, 0xa0, 0x44, 0x5e, 0x8a, 0x9f, 0x67, 0xc6, 0xd5, 0x31, 0x5e, + 0x84, 0xac, 0x89, 0xa9, 0xa6, 0x6e, 0x46, 0x9a, 0xcf, 0x46, 0x98, 0x63, + 0x9f, 0x75, 0x42, 0xd7, 0x48, 0x73, 0x6f, 0x58, 0x9a, 0x51, 0x42, 0x5d, + 0x62, 0x85, 0x96, 0x4a, 0xbe, 0x58, 0x6d, 0x54, 0x7e, 0x97, 0x75, 0x54, + 0x5c, 0xbb, 0x84, 0xd3, 0x38, 0x61, 0x7e, 0x77, 0x40, 0x5c, 0xb3, 0x9a, + 0x6a, 0x73, 0x80, 0x4c, 0x57, 0xc9, 0x97, 0xc1, 0x92, 0x69, 0x2d, 0x99, + 0x91, 0xa4, 0x83, 0x23, 0x89, 0xbe, 0x9a, 0x89, 0xae, 0xba, 0xad, 0x4d, + 0x49, 0x8b, 0x59, 0x92, 0x67, 0x41, 0xa2, 0x3c, 0xab, 0xb4, 0x74, 0xc5, + 0xd1, 0x78, 0x6d, 0xa4, 0xa3, 0xc0, 0xb4, 0xc3, 0xa6, 0x6c, 0x61, 0x9e, + 0x5d, 0x96, 0x41, 0xac, 0xac, 0x6d, 0xa8, 0x81, 0x54, 0x8c, 0x63, 0x44, + 0xa1, 0x9d, 0xdf, 0x4a, 0xc9, 0x8d, 0x55, 0x46, 0xd3, 0x5a, 0x3a, 0x9e, + 0x59, 0xca, 0x44, 0x47, 0x8f, 0x5a, 0x43, 0x8b, 0xa8, 0xbb, 0x99, 0x8d, + 0x34, 0x6a, 0xb2, 0x8d, 0x54, 0xd3, 0x6f, 0xb7, 0x90, 0xa3, 0xa9, 0x80, + 0x83, 0x67, 0xa7, 0x6c, 0x98, 0x55, 0x7e, 0xc3, 0x4e, 0xc3, 0x82, 0xba, + 0x30, 0x61, 0xcf, 0x5e, 0x4f, 0x96, 0x8f, 0x4a, 0xce, 0x5e, 0x49, 0x4b, + 0x9b, 0x6e, 0x62, 0xe5, 0x37, 0x52, 0x55, 0xc5, 0x9d, 0xcd, 0x7f, 0x88, + 0xad, 0x44, 0xaf, 0xc5, 0x39, 0xa7, 0xdc, 0xda, 0x5e, 0x83, 0xd9, 0x8e, + 0x54, 0x36, 0x8f, 0xb5, 0xc4, 0xa7, 0x35, 0x7b, 0x48, 0xb5, 0xc7, 0x71, + 0x99, 0x67, 0xa1, 0x81, 0xc5, 0x9f, 0x8a, 0xb0, 0xac, 0xa4, 0x71, 0x79, + 0x35, 0xa1, 0x9d, 0xd7, 0x72, 0xa1, 0xd5, 0x7a, 0x80, 0x68, 0xa7, 0xc3, + 0x8a, 0x32, 0x63, 0x6e, 0x53, 0xc6, 0x89, 0xc3, 0x47, 0xb4, 0xd9, 0x6d, + 0x6a, 0x64, 0xb9, 0x95, 0x6a, 0x91, 0x46, 0x9e, 0x54, 0xb9, 0xdb, 0x6a, + 0x7c, 0x61, 0x7c, 0xaf, 0xab, 0x86, 0x23, 0x2e, 0x5e, 0x81, 0x70, 0x35, + 0x51, 0x78, 0xa1, 0xbc, 0xad, 0x37, 0x9f, 0x57, 0xb5, 0x51, 0xb3, 0x9f, + 0xc0, 0x8d, 0xcf, 0x3f, 0x3f, 0x74, 0xd8, 0x36, 0xbb, 0x8e, 0xa3, 0x73, + 0x4a, 0xad, 0x50, 0x53, 0x5c, 0x9d, 0x8c, 0x8b, 0xb9, 0x31, 0xc4, 0x81, + 0x8a, 0xc0, 0xb3, 0xae, 0xa6, 0xb1, 0xc0, 0xc6, 0xc9, 0x65, 0x70, 0xbb, + 0x4e, 0x7b, 0x7e, 0x87, 0x5c, 0x80, 0xad, 0xb3, 0xa9, 0xa3, 0xd4, 0x3d, + 0xae, 0x3c, 0x33, 0x62, 0xb0, 0xb7, 0x63, 0x61, 0xb4, 0x7d, 0x3d, 0x2b, + 0xd7, 0xb8, 0x68, 0x9b, 0x97, 0x41, 0x47, 0x69, 0x59, 0xcd, 0xbc, 0x91, + 0x95, 0xbd, 0x50, 0x72, 0x4e, 0xd3, 0xb9, 0x74, 0x73, 0xbe, 0xc7, 0xcc, + 0x70, 0x7d, 0xc9, 0x80, 0xb9, 0x30, 0xae, 0x40, 0x3d, 0x93, 0xaa, 0xcc, + 0xb3, 0x47, 0x5c, 0x40, 0xc2, 0x7c, 0xab, 0x96, 0x3e, 0x4f, 0x7d, 0x34, + 0x30, 0x2e, 0x5d, 0x71, 0xb7, 0x80, 0x8e, 0xb7, 0xc3, 0xa3, 0x85, 0x52, + 0x38, 0xa4, 0x68, 0xb6, 0x52, 0x40, 0x92, 0x81, 0x61, 0x4f, 0x70, 0x74, + 0xbf, 0x7f, 0x78, 0x4e, 0x9c, 0x39, 0x95, 0x92, 0xa1, 0x82, 0xcc, 0x7b, + 
0x4e, 0x38, 0x54, 0x5f, 0xa8, 0x63, 0x67, 0xa6, 0x59, 0x72, 0xa1, 0x3c, + 0x99, 0xbd, 0x53, 0x50, 0x5c, 0x2c, 0xb8, 0x6d, 0x50, 0x3b, 0x46, 0x68, + 0xaa, 0x79, 0x52, 0x63, 0x35, 0xa5, 0x53, 0x6a, 0x75, 0x6e, 0x56, 0xab, + 0x43, 0x5d, 0x3b, 0xb7, 0xbc, 0x95, 0x9a, 0xa3, 0x62, 0x69, 0xbd, 0x8a, + 0x61, 0x94, 0xb1, 0xb5, 0x9a, 0x4e, 0x9b, 0x96, 0x92, 0x7d, 0x46, 0x65, + 0x4f, 0xb0, 0x45, 0x33, 0x5c, 0x9d, 0x60, 0x68, 0xc5, 0xbf, 0x46, 0x3a, + 0xb6, 0x3d, 0xc6, 0x7d, 0xb9, 0x73, 0xa7, 0xc9, 0x37, 0xa9, 0xc8, 0x75, + 0x6a, 0xba, 0x51, 0x40, 0xaa, 0xa9, 0x98, 0x7e, 0xa0, 0x85, 0xbb, 0x71, + 0x56, 0xb0, 0x6c, 0x37, 0xc2, 0xc0, 0x71, 0x5e, 0x38, 0xb9, 0x76, 0x6e, + 0xc1, 0x4b, 0x90, 0x92, 0x7c, 0xce, 0x9a, 0x99, 0xa5, 0x4e, 0x9a, 0x59, + 0x9a, 0xcd, 0x9d, 0xc3, 0x57, 0x52, 0x65, 0x56, 0x34, 0x53, 0xb4, 0x89, + 0x96, 0xb2, 0xc6, 0xda, 0xa8, 0xbe, 0xc3, 0x31, 0xbe, 0x61, 0xc1, 0x82, + 0x8a, 0x7a, 0x43, 0x84, 0x4e, 0x5f, 0x76, 0xbd, 0x6c, 0x59, 0x93, 0x57, + 0xb2, 0xac, 0x3b, 0xa5, 0x60, 0x51, 0x2f, 0x62, 0x3e, 0xd1, 0x8a, 0x4e, + 0x5d, 0x79, 0xa3, 0x9c, 0x4b, 0xd1, 0x79, 0x40, 0x54, 0x99, 0x4f, 0xac, + 0xd1, 0x75, 0x76, 0xcc, 0x78, 0xc7, 0xbe, 0x88, 0xb3, 0x54, 0x3c, 0x4c, + 0x44, 0x74, 0x48, 0x2e, 0xad, 0x5d, 0x87, 0x89, 0xb9, 0xb4, 0xd1, 0xd9, + 0x64, 0xb7, 0x3a, 0xb1, 0xb4, 0x88, 0xb6, 0x5f, 0xa3, 0x69, 0x5c, 0x7a, + 0xca, 0x5e, 0x4d, 0x87, 0x60, 0x48, 0x8d, 0xaf, 0x44, 0x94, 0x3d, 0x73, + 0x50, 0xa1, 0x72, 0x36, 0x9c, 0x90, 0x76, 0x51, 0x98, 0x72, 0x7b, 0x89, + 0xa5, 0xbc, 0x4f, 0x5a, 0x61, 0x3d, 0xa3, 0xae, 0xce, 0xa7, 0x54, 0xac, + 0xc3, 0xa1, 0x4a, 0x2b, 0x41, 0xcb, 0x3f, 0x9c, 0xb8, 0x9f, 0x7f, 0x9b, + 0x99, 0x39, 0x33, 0x62, 0xae, 0x39, 0x5a, 0x70, 0x36, 0xb0, 0x53, 0x76, + 0xa1, 0xa1, 0x94, 0xa8, 0x71, 0x6f, 0x8d, 0xc2, 0xba, 0x6d, 0xac, 0xba, + 0x51, 0x53, 0x7f, 0xcf, 0x7c, 0xcb, 0x5f, 0xb2, 0x2a, 0x4c, 0xbe, 0xab, + 0x69, 0xaf, 0x9f, 0xd2, 0xad, 0x3f, 0x74, 0x42, 0xa0, 0x41, 0x5e, 0x86, + 0x42, 0x9a, 0x6f, 0xb6, 0x6a, 0x6c, 0x31, 0xa7, 0x34, 0x77, 0xc9, 0x95, + 0x6d, 0x3c, 0xd6, 0x35, 0x79, 0x7e, 0xa5, 0x56, 0xa0, 0x9e, 0x83, 0x53, + 0x63, 0x5b, 0x8a, 0x8d, 0x90, 0xaa, 0x37, 0x4f, 0x77, 0x8b, 0x52, 0xae, + 0x80, 0xaf, 0x63, 0x2c, 0x62, 0x5f, 0x74, 0xb1, 0xb2, 0xae, 0xcd, 0xd8, + 0xa6, 0xce, 0x71, 0x46, 0x55, 0x7b, 0x46, 0x82, 0x90, 0x9e, 0x9d, 0xb1, + 0x68, 0xc3, 0x6d, 0xac, 0xc0, 0x33, 0x48, 0xbf, 0x59, 0x5d, 0x3d, 0x91, + 0xa4, 0x65, 0x39, 0x55, 0x52, 0xd1, 0x7c, 0x72, 0x85, 0x71, 0x8c, 0x5f, + 0x37, 0x50, 0xa2, 0xa1, 0x66, 0x2d, 0x68, 0x9e, 0x4f, 0xc3, 0xce, 0x54, + 0x75, 0x4f, 0xb7, 0xd1, 0xbc, 0x9f, 0xc8, 0x38, 0x92, 0x81, 0x83, 0xc0, + 0x7b, 0xa3, 0x81, 0x7d, 0x98, 0x54, 0x95, 0x7b, 0x48, 0xbc, 0x82, 0xc8, + 0x9b, 0x70, 0x7f, 0xd6, 0xb5, 0x4c, 0xa3, 0x84, 0xa5, 0xaa, 0x91, 0x66, + 0x99, 0xb6, 0x61, 0x7d, 0xa2, 0x3e, 0x2e, 0xc9, 0xc6, 0xc0, 0x72, 0xa8, + 0x62, 0x8f, 0x38, 0x83, 0x42, 0x8c, 0x42, 0x93, 0x57, 0xad, 0x62, 0xbc, + 0x85, 0x64, 0x84, 0x3e, 0x68, 0x59, 0xc2, 0x7f, 0x7d, 0xa0, 0xac, 0xbd, + 0xa4, 0x68, 0xbc, 0x77, 0xae, 0xcd, 0x56, 0xc3, 0x49, 0x41, 0xc0, 0x5c, + 0xc3, 0x6e, 0x70, 0x57, 0xcc, 0x6a, 0x3a, 0xb6, 0x90, 0x76, 0x5b, 0xd6, + 0x41, 0x93, 0x9d, 0xc7, 0x72, 0xbd, 0x89, 0xb1, 0xcf, 0x2d, 0x5d, 0xca, + 0xcc, 0x94, 0xc6, 0x76, 0xca, 0xab, 0xa9, 0xcc, 0x45, 0x92, 0x3d, 0x49, + 0x49, 0x8b, 0x38, 0xa6, 0x45, 0x52, 0xbd, 0x83, 0xac, 0x88, 0xcf, 0x70, + 0x94, 0x77, 0x62, 0xb2, 0x63, 0xb6, 0xaf, 0x3e, 0x64, 0x58, 0xa3, 0xb5, + 0x9f, 0x78, 0x55, 0x7e, 0x2a, 0xaa, 0x37, 0x5d, 0x30, 0x36, 0x63, 0x8b, + 0x3c, 0x86, 0xaf, 0x68, 0x9f, 0xc6, 0x54, 0xab, 0xb3, 0x8a, 0x93, 0x3b, + 
0x92, 0x6d, 0x67, 0x42, 0x7a, 0x84, 0x5e, 0x5b, 0x4e, 0xc7, 0xcf, 0xb5, + 0x67, 0x5a, 0x77, 0xba, 0x35, 0x67, 0x60, 0x43, 0x34, 0x54, 0xbb, 0x54, + 0x94, 0x54, 0x53, 0x87, 0x5c, 0xb3, 0x77, 0xc4, 0xae, 0xc2, 0xa3, 0xd1, + 0x66, 0x43, 0x85, 0xbc, 0x73, 0x3c, 0x69, 0x65, 0xbc, 0xaa, 0xa7, 0x50, + 0x4c, 0x5b, 0x79, 0xac, 0xc5, 0xce, 0x41, 0xcf, 0x48, 0x84, 0x61, 0x49, + 0x7d, 0x35, 0x94, 0x85, 0x42, 0xc2, 0x95, 0x7f, 0x42, 0x8a, 0x81, 0x50, + 0x91, 0xc3, 0x2e, 0x4a, 0x7d, 0x8f, 0x6d, 0x61, 0x3a, 0x47, 0x6e, 0x42, + 0xac, 0x31, 0x4d, 0xc8, 0xa6, 0x8b, 0x62, 0x91, 0xc8, 0x53, 0x34, 0x80, + 0xc2, 0x90, 0x79, 0x9c, 0x2f, 0xc4, 0x94, 0xc8, 0x6f, 0x43, 0x31, 0x7e, + 0x4a, 0x74, 0x39, 0xa9, 0xa7, 0x46, 0x86, 0x9c, 0xc4, 0xc9, 0x3e, 0x4d, + 0x95, 0x51, 0x95, 0x63, 0xc3, 0x62, 0x96, 0xcb, 0xaf, 0x33, 0x57, 0xa8, + 0x8a, 0x63, 0xbc, 0xce, 0x93, 0x30, 0x62, 0x76, 0x6f, 0x72, 0xa0, 0x77, + 0x6f, 0x54, 0xd5, 0x88, 0x4f, 0x84, 0x5f, 0x5b, 0x64, 0x71, 0x76, 0xc3, + 0xcb, 0x6b, 0x33, 0x62, 0x40, 0x5b, 0xa5, 0x69, 0x90, 0x81, 0x77, 0x67, + 0xc9, 0x9a, 0x2a, 0x49, 0x72, 0x51, 0xa6, 0x8f, 0x62, 0xab, 0x8b, 0x4e, + 0xa4, 0x51, 0x5e, 0x4a, 0x6e, 0x5b, 0x9e, 0xb8, 0x66, 0x97, 0xb0, 0x5d, + 0x9e, 0x34, 0xaf, 0xa8, 0x38, 0xb9, 0x4a, 0x48, 0x93, 0xa2, 0x91, 0xd5, + 0x8e, 0x81, 0x3e, 0xba, 0xaf, 0x75, 0xc6, 0x60, 0x4f, 0xc2, 0x45, 0xa7, + 0xb4, 0x71, 0x50, 0x9f, 0x7c, 0x83, 0x73, 0x9d, 0x56, 0xd2, 0xba, 0xb5, + 0xc7, 0xa9, 0x77, 0x7b, 0x76, 0x64, 0xb6, 0x55, 0xd5, 0xa8, 0x8b, 0x65, + 0xc3, 0x5e, 0xad, 0x61, 0xa8, 0x3c, 0xa7, 0x8a, 0x3d, 0x6c, 0xa9, 0x58, + 0x6c, 0xb2, 0xb1, 0x5d, 0x7d, 0x59, 0x6e, 0x6a, 0xad, 0xb8, 0xc7, 0x4f, + 0xc3, 0xd5, 0xc2, 0x91, 0xc6, 0xb9, 0x90, 0x50, 0x9d, 0x70, 0x71, 0x66, + 0x82, 0x3e, 0x43, 0x40, 0x5d, 0x95, 0xa2, 0x5b, 0xc4, 0x93, 0x7e, 0x4f, + 0x7a, 0x50, 0x6e, 0x7a, 0xcb, 0x4f, 0x4e, 0xc1, 0xd1, 0x44, 0xd6, 0x5c, + 0x9a, 0x91, 0xc3, 0x80, 0xad, 0x3f, 0x73, 0x4e, 0x5c, 0xa9, 0x3e, 0x97, + 0x73, 0x53, 0xb6, 0x99, 0xa1, 0x70, 0xc7, 0x35, 0x75, 0x80, 0xd4, 0x67, + 0x3d, 0x91, 0x4d, 0xb5, 0x90, 0x89, 0xc2, 0x3a, 0x97, 0x87, 0x68, 0xcb, + 0xb4, 0x96, 0x75, 0x63, 0xa1, 0x9b, 0x71, 0xd6, 0x75, 0x27, 0x66, 0x80, + 0x73, 0x94, 0xa4, 0xce, 0x5f, 0x6d, 0x51, 0x58, 0x47, 0xd8, 0xa3, 0x99, + 0x84, 0xd0, 0x3c, 0xb7, 0x3c, 0x4e, 0x34, 0x7d, 0x63, 0x44, 0xd6, 0x86, + 0x78, 0x3a, 0xb6, 0x7c, 0x5a, 0x8e, 0xad, 0x30, 0xbb, 0xd8, 0x59, 0x7e, + 0xd1, 0x82, 0xa0, 0x76, 0xbd, 0xb0, 0xdc, 0xc6, 0xcd, 0x7f, 0x4c, 0xa8, + 0x5a, 0x63, 0x67, 0x5b, 0xaf, 0xa7, 0x40, 0x5c, 0x3d, 0x7a, 0x7d, 0x76, + 0xad, 0x9e, 0x82, 0x40, 0xc8, 0xdc, 0x95, 0x88, 0x5a, 0x97, 0x7e, 0x74, + 0x71, 0x42, 0x33, 0x3e, 0xbe, 0xc7, 0xc5, 0xc5, 0x59, 0x51, 0x71, 0xb1, + 0xc2, 0xbd, 0x66, 0xcd, 0x94, 0x7a, 0x97, 0xaf, 0x9e, 0x8e, 0xa0, 0xa7, + 0x77, 0x61, 0x7b, 0x45, 0x3d, 0x5e, 0x42, 0x8a, 0x60, 0x7c, 0x81, 0x29, + 0x88, 0x70, 0x97, 0x61, 0x6d, 0x9b, 0x82, 0x92, 0x5b, 0xc4, 0xb7, 0x6a, + 0x49, 0x5d, 0x80, 0x79, 0x85, 0xa7, 0x78, 0x47, 0x99, 0x82, 0x3e, 0xa6, + 0x6c, 0x53, 0xc8, 0x5a, 0x98, 0x91, 0xc3, 0x46, 0xcf, 0x36, 0x9f, 0x7c, + 0x5e, 0xcf, 0xa3, 0x98, 0x81, 0xd8, 0x42, 0x64, 0x43, 0x76, 0x80, 0x98, + 0xcd, 0x68, 0xbe, 0x88, 0x73, 0x3f, 0x75, 0x83, 0x9b, 0x65, 0x4c, 0x6b, + 0x2f, 0x65, 0x6d, 0x3f, 0x93, 0x3e, 0x6b, 0xca, 0x75, 0xbb, 0x4e, 0x53, + 0x35, 0x99, 0xca, 0x77, 0x53, 0xae, 0xb5, 0x53, 0x61, 0x67, 0x3c, 0x11, + 0xa6, 0x36, 0x68, 0x8e, 0x37, 0xc0, 0x73, 0xb2, 0x81, 0xa0, 0xd5, 0xce, + 0xb4, 0xaf, 0xbd, 0x68, 0x47, 0x9e, 0x40, 0x8d, 0x45, 0xb2, 0x5a, 0x98, + 0xb1, 0x95, 0x93, 0xd0, 0x76, 0x61, 0xb5, 0x94, 0x67, 0x2b, 0x8e, 0x7b, + 
0xcc, 0x35, 0x73, 0x7e, 0xd4, 0xac, 0x2e, 0x2d, 0x7f, 0x6d, 0x6f, 0x8b, + 0x24, 0xc9, 0x7c, 0x42, 0xd9, 0x59, 0x91, 0x39, 0x9e, 0x41, 0x6e, 0x4b, + 0x3b, 0x51, 0x8a, 0xbd, 0x51, 0x4a, 0xcc, 0x59, 0x6e, 0xd6, 0xa9, 0xc8, + 0xb7, 0x4d, 0x39, 0x7f, 0x7b, 0x53, 0xc1, 0xa3, 0x84, 0x97, 0xc7, 0x69, + 0x73, 0xa8, 0x43, 0x63, 0x5e, 0x56, 0xb8, 0xb5, 0x8e, 0x49, 0xaf, 0xcc, + 0x34, 0xc9, 0xc0, 0xd1, 0x6b, 0xa6, 0x43, 0xcd, 0xa5, 0x39, 0xc5, 0xaf, + 0x76, 0x37, 0xce, 0x65, 0xca, 0xc1, 0xce, 0x72, 0xc5, 0xb4, 0xbf, 0x32, + 0x4c, 0x67, 0x44, 0x37, 0x62, 0x62, 0x95, 0x5d, 0x45, 0x76, 0xa1, 0xd5, + 0x55, 0x65, 0x55, 0xa4, 0x9e, 0xba, 0xa5, 0x63, 0x6f, 0xbf, 0x9c, 0xc8, + 0x42, 0x72, 0x61, 0x49, 0xce, 0x62, 0x48, 0xaf, 0xba, 0xa3, 0xd0, 0x40, + 0xb6, 0xa1, 0xc9, 0x71, 0x81, 0x36, 0x38, 0x75, 0xa7, 0x6d, 0x50, 0x8a, + 0xa9, 0xcc, 0x5a, 0xae, 0xc1, 0x66, 0x6f, 0xe9, 0xdb, 0x58, 0x95, 0x73, + 0x86, 0xd4, 0xa6, 0xd1, 0xb6, 0x81, 0x4d, 0x7c, 0x9a, 0x48, 0x4c, 0x9c, + 0xa8, 0x9f, 0x7c, 0x7c, 0x52, 0x41, 0xbe, 0x48, 0x4b, 0xbc, 0x38, 0xad, + 0x9e, 0x96, 0x51, 0x5b, 0x77, 0x57, 0xa8, 0x8f, 0x82, 0xaa, 0xc4, 0xd3, + 0xcf, 0xaf, 0x52, 0x37, 0x41, 0x49, 0xa5, 0x50, 0x97, 0x41, 0xd2, 0x90, + 0x66, 0x77, 0xac, 0x92, 0x94, 0x77, 0xb9, 0x57, 0xc3, 0x95, 0xb3, 0xdd, + 0x97, 0x8c, 0xa7, 0x2d, 0x94, 0x5a, 0x34, 0x63, 0x5c, 0x3e, 0x44, 0x56, + 0xb9, 0xbd, 0x76, 0xd1, 0xb6, 0xcb, 0xb7, 0x7b, 0x56, 0xaf, 0x99, 0x70, + 0x5d, 0x74, 0xaa, 0xba, 0xa7, 0x68, 0x56, 0x95, 0x74, 0x73, 0x4a, 0xa1, + 0x7a, 0xbe, 0x44, 0x77, 0x31, 0x78, 0x72, 0x3e, 0xae, 0x6f, 0xd3, 0x8a, + 0xe5, 0xcf, 0x60, 0x85, 0x6c, 0x65, 0xb4, 0xb2, 0x65, 0x34, 0xb7, 0xca, + 0x6e, 0x96, 0x42, 0x84, 0xca, 0x8a, 0x9f, 0x9e, 0xa2, 0x70, 0x85, 0xbf, + 0x3a, 0x60, 0xb0, 0x4f, 0x69, 0x54, 0x4d, 0xbb, 0x89, 0x4f, 0x4d, 0x99, + 0xb1, 0x48, 0xa1, 0x45, 0x85, 0x7d, 0xb9, 0xba, 0xb1, 0x4e, 0x57, 0x82, + 0x41, 0xc5, 0xc8, 0x8e, 0x59, 0x43, 0xc6, 0xb6, 0xcd, 0x6a, 0x52, 0x39, + 0x72, 0x50, 0xb4, 0x43, 0x48, 0x9f, 0x36, 0x5b, 0x6f, 0xa2, 0x69, 0x9c, + 0xb8, 0x8f, 0x8d, 0x49, 0xd2, 0x45, 0xc6, 0x3b, 0x6c, 0xda, 0x32, 0xb4, + 0x3f, 0x59, 0x46, 0xb5, 0xb2, 0x4e, 0x4e, 0x38, 0xcf, 0x52, 0x7a, 0x95, + 0xa4, 0xcb, 0xc2, 0xa0, 0x9d, 0xca, 0xaf, 0x99, 0x6b, 0x8a, 0x55, 0x83, + 0xc7, 0xa8, 0x9d, 0x2c, 0xcf, 0xd2, 0x76, 0xa1, 0x6d, 0xbe, 0xb0, 0x56, + 0xa2, 0x7d, 0x9a, 0xa2, 0x5b, 0x49, 0x8b, 0xa5, 0x8f, 0xd0, 0x39, 0x62, + 0x5c, 0x98, 0x33, 0x52, 0xbe, 0xbc, 0x7c, 0xbd, 0x69, 0xac, 0x1e, 0x50, + 0x3a, 0xa9, 0x94, 0xc8, 0x87, 0xbd, 0x84, 0x71, 0xbb, 0xbe, 0x67, 0x2d, + 0x85, 0x61, 0x73, 0xcd, 0x6e, 0x5c, 0xab, 0x72, 0x67, 0x34, 0xba, 0xb9, + 0x9f, 0x38, 0x35, 0xc3, 0xc7, 0x88, 0xc3, 0x39, 0x52, 0x51, 0xa1, 0x8a, + 0xa3, 0xd5, 0xcf, 0x55, 0x3f, 0xaf, 0xc6, 0x4a, 0xc5, 0x34, 0x53, 0x34, + 0x7b, 0xc2, 0x74, 0x46, 0xc5, 0x46, 0x80, 0xad, 0x39, 0x68, 0x42, 0x4a, + 0x47, 0x9a, 0xa2, 0xab, 0x70, 0x7d, 0x7d, 0x3e, 0xcc, 0x4f, 0x8d, 0x43, + 0xd2, 0xad, 0xce, 0x72, 0xa2, 0x72, 0x68, 0x7f, 0xbb, 0x7c, 0xc2, 0xa2, + 0x82, 0x3b, 0x71, 0xb2, 0x3b, 0xc9, 0x85, 0x60, 0xa1, 0xaf, 0x76, 0xac, + 0x7e, 0xc0, 0x79, 0xb6, 0x3b, 0x95, 0x82, 0xbe, 0xb1, 0xc5, 0x7e, 0xba, + 0xbb, 0xa3, 0x8d, 0x4b, 0xa0, 0xb0, 0x79, 0xab, 0x35, 0x7d, 0xba, 0x50, + 0x53, 0x43, 0xbe, 0x46, 0x79, 0xaf, 0x7a, 0x32, 0x47, 0xbe, 0x7c, 0x35, + 0x72, 0x3f, 0xbf, 0xa2, 0x4d, 0xbb, 0x8e, 0xb6, 0x80, 0x34, 0xa0, 0x3f, + 0x50, 0xaa, 0x3a, 0xb0, 0x39, 0x9a, 0xd4, 0xca, 0x5b, 0xce, 0x67, 0xac, + 0x5c, 0x60, 0xa4, 0x8a, 0x4e, 0x47, 0xba, 0xc9, 0xbc, 0x53, 0x49, 0x2e, + 0xba, 0xc2, 0x71, 0xe0, 0x98, 0x5d, 0xa2, 0xbe, 0x51, 0x3b, 0x7b, 0x9c, + 
0xd7, 0xa0, 0x90, 0x96, 0xcd, 0x47, 0xa0, 0x46, 0x4e, 0xae, 0x4a, 0x57, + 0xbe, 0xce, 0x64, 0xa7, 0xb6, 0xa2, 0x95, 0x2e, 0xcb, 0x43, 0x78, 0x51, + 0x80, 0xb0, 0x5b, 0x49, 0x87, 0xa9, 0x96, 0xc9, 0xd3, 0xb0, 0x95, 0xb1, + 0x3a, 0x7a, 0xc8, 0x98, 0x42, 0x89, 0x7f, 0x9c, 0x3f, 0x64, 0x5b, 0x8e, + 0xa0, 0x2e, 0xae, 0x62, 0x8d, 0x3c, 0xb9, 0xb8, 0x8a, 0x8c, 0xa8, 0x37, + 0x53, 0xca, 0x32, 0x3b, 0x7f, 0xcb, 0xaa, 0x88, 0x6b, 0x9d, 0x88, 0xba, + 0x90, 0x89, 0xb1, 0x87, 0xc6, 0x53, 0x82, 0x3e, 0x6c, 0xc9, 0xd0, 0x7f, + 0x31, 0x7e, 0x58, 0x3e, 0x38, 0x46, 0x8f, 0x7f, 0x7e, 0x67, 0x4b, 0xc4, + 0x7e, 0x53, 0xa5, 0xbe, 0xb9, 0x4c, 0xac, 0x55, 0x7c, 0x72, 0x6f, 0x72, + 0xa6, 0x41, 0xb4, 0x34, 0xb0, 0x63, 0xb7, 0xc4, 0x7e, 0xc1, 0x35, 0x64, + 0xc3, 0xaf, 0x6c, 0x47, 0xc8, 0xbf, 0x8c, 0x7b, 0xba, 0xb4, 0x88, 0x57, + 0x7d, 0x94, 0x3a, 0xc1, 0xa8, 0xc7, 0x9d, 0x37, 0xc0, 0x3b, 0x97, 0xcd, + 0x3d, 0x5f, 0x8e, 0x5b, 0xbd, 0x6b, 0x3e, 0x3c, 0xd2, 0xad, 0x4e, 0x4d, + 0x44, 0x53, 0x5d, 0x6a, 0xce, 0xb4, 0xa9, 0x64, 0x81, 0xa0, 0x91, 0x6b, + 0x84, 0x30, 0x94, 0xb2, 0x9c, 0x41, 0x7e, 0x3c, 0xbb, 0xc5, 0x34, 0x4e, + 0x88, 0x6e, 0x47, 0x6d, 0xa3, 0xa0, 0x9e, 0x61, 0xa5, 0x42, 0x81, 0xc5, + 0xa8, 0x9c, 0x95, 0xaf, 0xa6, 0xbc, 0xc8, 0x83, 0xca, 0xba, 0xa2, 0x4c, + 0xa1, 0xaa, 0x70, 0x93, 0xcf, 0x84, 0xaa, 0x7e, 0x62, 0xc8, 0xc6, 0xbb, + 0xcc, 0x99, 0x8b, 0x75, 0x5a, 0x49, 0x56, 0x3a, 0x96, 0x2f, 0x81, 0x8e, + 0xc7, 0xa5, 0x36, 0xbd, 0x4b, 0x3d, 0xa9, 0x6f, 0x80, 0xba, 0x5b, 0xab, + 0x85, 0x60, 0x84, 0xa7, 0xb7, 0xcf, 0xa5, 0x7a, 0xa1, 0xad, 0x91, 0xa6, + 0xac, 0x52, 0x64, 0x31, 0x68, 0x5f, 0x8e, 0xb3, 0x87, 0xa5, 0x9c, 0xc3, + 0x8b, 0x86, 0x59, 0xab, 0xa2, 0x79, 0xad, 0x68, 0xab, 0x59, 0x4c, 0x39, + 0x85, 0x57, 0xc4, 0x7c, 0x9c, 0x41, 0x51, 0x5c, 0xb1, 0x38, 0x44, 0x8c, + 0x52, 0xc5, 0x7a, 0x9b, 0x6f, 0x88, 0x64, 0xaa, 0x55, 0xb5, 0xce, 0xb8, + 0xa6, 0x2c, 0xc5, 0x6b, 0xcb, 0x67, 0xad, 0xbd, 0x69, 0xb5, 0x92, 0xc0, + 0x82, 0x9b, 0xa5, 0x60, 0x5e, 0x40, 0x5a, 0x7f, 0x58, 0xcc, 0xc6, 0x80, + 0x52, 0xc9, 0x2d, 0x86, 0x3b, 0x96, 0x6d, 0xc4, 0x49, 0x96, 0x8e, 0x91, + 0x73, 0xc2, 0x54, 0xcd, 0x67, 0xbe, 0x32, 0x6b, 0x9c, 0x93, 0x93, 0xc7, + 0x3c, 0xbb, 0x6a, 0x38, 0x50, 0xc0, 0x95, 0xb0, 0x5e, 0x33, 0x4f, 0x4b, + 0xaa, 0x64, 0x3f, 0x84, 0xb0, 0x53, 0xa6, 0x6e, 0x8d, 0xa2, 0x97, 0x32, + 0x73, 0x65, 0xaf, 0xb4, 0x99, 0x90, 0xa0, 0x8f, 0x4a, 0xac, 0x84, 0x72, + 0xa2, 0xad, 0x7a, 0xb7, 0x56, 0x92, 0x42, 0x7f, 0xab, 0x64, 0x49, 0x50, + 0x84, 0x92, 0x2c, 0x97, 0x86, 0xc0, 0x92, 0x60, 0x58, 0x32, 0x88, 0x5b, + 0xc1, 0xd5, 0x3a, 0x77, 0xab, 0x8a, 0xb4, 0xba, 0x54, 0xcb, 0xb5, 0x3b, + 0x6c, 0x5a, 0xb0, 0xd6, 0xcb, 0x8a, 0x39, 0x7e, 0x3c, 0x54, 0x88, 0xc4, + 0x9c, 0xa9, 0x3b, 0x8f, 0x76, 0x56, 0x4b, 0xc6, 0x3d, 0x60, 0x93, 0x96, + 0x3e, 0x33, 0x9a, 0x8d, 0x46, 0x3c, 0x3f, 0x3d, 0x88, 0x85, 0xbb, 0x3e, + 0x5b, 0xc8, 0x9d, 0x88, 0x9c, 0x63, 0x93, 0x65, 0xbb, 0x44, 0xa5, 0x6c, + 0x84, 0x70, 0xc2, 0x9d, 0xc2, 0x43, 0xd3, 0x8a, 0x7d, 0x72, 0xc9, 0x9e, + 0xca, 0x9d, 0xce, 0xbc, 0x5f, 0x6b, 0x52, 0x5c, 0x51, 0x7f, 0x88, 0xd0, + 0x58, 0x9f, 0x60, 0xac, 0x7a, 0xa7, 0x68, 0x81, 0x72, 0xc0, 0x69, 0x64, + 0xbc, 0x57, 0xa2, 0x5c, 0x97, 0xc2, 0x51, 0x76, 0xd0, 0x47, 0x60, 0x48, + 0x99, 0x39, 0xc3, 0xcb, 0xa4, 0xaa, 0xc7, 0x93, 0xd2, 0xbe, 0xb4, 0x96, + 0x7c, 0xa3, 0x9a, 0xbc, 0x4a, 0x69, 0xbe, 0xae, 0x3c, 0xb9, 0x3c, 0x7a, + 0x64, 0x9a, 0x54, 0x51, 0xab, 0xc9, 0x64, 0x69, 0x6a, 0x4c, 0xaa, 0x75, + 0x89, 0x36, 0x53, 0xb5, 0x60, 0xad, 0xb4, 0x4f, 0x78, 0x95, 0xa6, 0x30, + 0xc2, 0xa2, 0x41, 0x9b, 0xbe, 0x64, 0x8a, 0x4d, 0x3c, 0x55, 0x82, 0xcb, + 
0xbc, 0x97, 0x7e, 0x50, 0xb8, 0x8d, 0xae, 0x96, 0xb5, 0x9a, 0x54, 0x35, + 0x98, 0x5f, 0xa5, 0x57, 0x3e, 0xcc, 0xc9, 0xb4, 0xae, 0x93, 0xad, 0x56, + 0x5c, 0x3b, 0x5e, 0xa9, 0xd3, 0x8a, 0xb2, 0x53, 0x39, 0x68, 0x93, 0x91, + 0xb5, 0x5f, 0x6b, 0x61, 0x3f, 0x44, 0xc2, 0x3e, 0xa5, 0x9e, 0xc6, 0x34, + 0xd2, 0xc7, 0x97, 0x66, 0x5d, 0x35, 0x50, 0x7d, 0xa0, 0xa3, 0x7d, 0x71, + 0x53, 0xc0, 0x7c, 0x5b, 0x40, 0xc4, 0x7a, 0x91, 0xc8, 0x4c, 0xaf, 0xd6, + 0x4d, 0x97, 0xb4, 0x41, 0x40, 0xc4, 0xbe, 0x92, 0xaa, 0x3c, 0xa1, 0xd8, + 0x72, 0x33, 0xcf, 0x4e, 0x98, 0xb8, 0x41, 0xae, 0x51, 0xc8, 0xa1, 0x37, + 0x36, 0x99, 0x4c, 0x7d, 0x56, 0x3b, 0x5a, 0xd0, 0x8a, 0x7a, 0x4f, 0x84, + 0x52, 0x4f, 0x58, 0x3e, 0x9b, 0x5b, 0x4b, 0xaf, 0x4c, 0x58, 0x68, 0xb5, + 0xb4, 0xd1, 0x6f, 0x8a, 0x53, 0xb1, 0x5c, 0x9b, 0xc4, 0x87, 0x34, 0x88, + 0x44, 0x7f, 0x39, 0xaa, 0x9d, 0xcd, 0xb1, 0x99, 0x55, 0xbd, 0xbe, 0x45, + 0xd1, 0x76, 0x53, 0x32, 0xbb, 0x8c, 0x6e, 0x92, 0x78, 0x8a, 0x4c, 0x7f, + 0xac, 0x89, 0xc0, 0x6e, 0x89, 0x6d, 0x61, 0x8f, 0xa3, 0x6f, 0x52, 0x36, + 0xb9, 0x71, 0xb6, 0x48, 0xd3, 0x83, 0xaf, 0x7b, 0x64, 0x7f, 0xbc, 0xcd, + 0x4d, 0x55, 0x54, 0x8a, 0xcf, 0xbd, 0xa0, 0xaf, 0xa0, 0x4e, 0xa5, 0x37, + 0x38, 0xb4, 0x91, 0x5c, 0xb7, 0x71, 0xbc, 0x88, 0x98, 0x3b, 0x3a, 0x80, + 0x9b, 0x53, 0x52, 0x91, 0xc6, 0xb1, 0x62, 0x68, 0x51, 0xc7, 0xb7, 0xc1, + 0x81, 0x3f, 0xcf, 0xba, 0x86, 0x7b, 0x7a, 0x9a, 0x59, 0x92, 0x3c, 0x57, + 0x49, 0x45, 0x7d, 0x58, 0xa1, 0xce, 0x4a, 0x75, 0x62, 0x47, 0x37, 0xaa, + 0xab, 0x59, 0x8c, 0x88, 0x96, 0x32, 0x5f, 0x9c, 0xc4, 0x8e, 0xd1, 0x4e, + 0x49, 0x63, 0x31, 0xcd, 0xbb, 0x42, 0x7b, 0x94, 0x5f, 0x48, 0x3d, 0x97, + 0x8e, 0x97, 0xc4, 0x7a, 0x8e, 0x64, 0x35, 0xa6, 0x72, 0xb8, 0x57, 0x5d, + 0x51, 0xb3, 0xb4, 0x93, 0x6e, 0x52, 0xc6, 0xb7, 0x90, 0x43, 0xa5, 0x38, + 0x50, 0x66, 0xb3, 0x48, 0xc9, 0x4d, 0x9d, 0x83, 0xcf, 0x4d, 0xd0, 0x3e, + 0xa0, 0x6f, 0x5f, 0xa3, 0xc4, 0x97, 0xd0, 0x6f, 0x31, 0xb3, 0x35, 0xc9, + 0xbc, 0x8a, 0xac, 0x91, 0x4b, 0x43, 0x41, 0x9e, 0xc2, 0x59, 0xa7, 0x7a, + 0x5c, 0x35, 0xa9, 0x52, 0x9a, 0x62, 0x8a, 0x9d, 0xac, 0xb8, 0x5b, 0x38, + 0x34, 0xd1, 0x98, 0x7f, 0xaa, 0x71, 0xa3, 0xce, 0xa5, 0x8e, 0x4a, 0x9d, + 0x33, 0xac, 0xb4, 0x3f, 0x31, 0x91, 0xab, 0x92, 0x7f, 0x6e, 0x9f, 0x55, + 0xd1, 0x3e, 0x9c, 0x99, 0x88, 0x82, 0x3e, 0xd1, 0x69, 0xa9, 0x82, 0xcc, + 0xc0, 0x7b, 0x75, 0x4f, 0x5c, 0xd2, 0x70, 0xc9, 0xc4, 0xa4, 0xc6, 0x60, + 0x68, 0x55, 0x72, 0x52, 0xc3, 0x3e, 0x69, 0xa7, 0x7f, 0x9a, 0x75, 0xa7, + 0x64, 0x62, 0xa3, 0x7e, 0xa7, 0x60, 0x74, 0x4f, 0x93, 0x51, 0xd4, 0x53, + 0x75, 0x5e, 0x63, 0x8b, 0xc2, 0x54, 0x48, 0xa2, 0x4f, 0x4a, 0x87, 0x4e, + 0x60, 0x55, 0xc2, 0xa2, 0xb3, 0x58, 0xcd, 0xc6, 0xa3, 0x80, 0x57, 0x3f, + 0xbf, 0x8d, 0x81, 0x9a, 0xc1, 0x80, 0x33, 0x75, 0x7f, 0x70, 0xae, 0xa3, + 0x67, 0x65, 0x42, 0xcb, 0x8b, 0x39, 0x60, 0xd1, 0x6b, 0x62, 0x44, 0x45, + 0x74, 0x70, 0x98, 0xab, 0x6b, 0xcb, 0x74, 0xa5, 0x9d, 0x34, 0xa7, 0xa8, + 0x96, 0x4b, 0x70, 0x41, 0x46, 0xc9, 0x4c, 0x6c, 0xb8, 0x7c, 0x4b, 0xb3, + 0xa7, 0x81, 0xba, 0x49, 0x90, 0x51, 0xaa, 0x93, 0x9d, 0x97, 0xb5, 0x82, + 0x67, 0xb0, 0x8f, 0xc0, 0x79, 0x78, 0x93, 0x4c, 0x69, 0x35, 0x68, 0x81, + 0x47, 0x8f, 0x6c, 0x4e, 0xab, 0xb3, 0x70, 0x94, 0xae, 0xc8, 0x63, 0xb2, + 0xb5, 0x67, 0xc6, 0x7e, 0xbf, 0x6f, 0xc7, 0xb3, 0x63, 0x45, 0x6e, 0x34, + 0xd1, 0x84, 0x62, 0xb9, 0x67, 0x60, 0x52, 0x68, 0x9d, 0x8c, 0x97, 0x9c, + 0x9d, 0xca, 0x53, 0x87, 0xb3, 0x84, 0x69, 0xbe, 0x65, 0x57, 0x48, 0x8f, + 0x85, 0xc5, 0x7e, 0x4e, 0x32, 0x56, 0x6e, 0x8e, 0x70, 0x3b, 0x8d, 0x6d, + 0x8e, 0x87, 0x6d, 0x43, 0x36, 0x77, 0x86, 0xa7, 0x85, 0x44, 0x4f, 0x97, + 
0xc1, 0x39, 0x7e, 0x4d, 0x59, 0xa3, 0x8a, 0x79, 0x7c, 0xcb, 0xb5, 0x93, + 0xb8, 0x3f, 0xb8, 0x6e, 0x5c, 0x39, 0xb0, 0x80, 0xca, 0x40, 0xcb, 0x8a, + 0xb6, 0x95, 0x62, 0x92, 0xb1, 0x60, 0x6a, 0x80, 0xba, 0x35, 0x69, 0x6e, + 0x6d, 0xc2, 0x6f, 0x56, 0x40, 0x45, 0x94, 0xd1, 0xd1, 0x6f, 0x91, 0x46, + 0x3d, 0xc6, 0xa8, 0x39, 0x94, 0xad, 0xad, 0xb2, 0x9e, 0xa1, 0x7f, 0x5e, + 0x37, 0x91, 0xab, 0x4a, 0x73, 0x9c, 0x9d, 0xa6, 0xa5, 0x68, 0xb1, 0xae, + 0xb9, 0x47, 0x52, 0xc7, 0x84, 0x7d, 0x81, 0x76, 0x9e, 0x96, 0x59, 0xb0, + 0x60, 0x9c, 0x6f, 0x46, 0x90, 0xb7, 0xc8, 0xbc, 0x52, 0x5a, 0x4d, 0x45, + 0x39, 0x90, 0x32, 0x33, 0x3a, 0x9c, 0xbd, 0xc5, 0xb7, 0x87, 0xb2, 0xbb, + 0x4c, 0xb2, 0xab, 0x5b, 0x79, 0xb4, 0xcf, 0x9c, 0xb3, 0x38, 0x9c, 0xb1, + 0x81, 0x97, 0x7c, 0x93, 0x3d, 0x49, 0x4c, 0x95, 0x9f, 0x52, 0x6b, 0x85, + 0xa7, 0xaa, 0xad, 0x88, 0x4f, 0x30, 0x97, 0x50, 0x8a, 0xbe, 0x94, 0x8f, + 0x8e, 0x63, 0x74, 0x65, 0x64, 0x39, 0x45, 0x62, 0x42, 0x37, 0xbb, 0x92, + 0x76, 0xb5, 0x54, 0x76, 0xb2, 0x89, 0x74, 0x75, 0x3e, 0xa3, 0xb6, 0xb0, + 0xd0, 0x80, 0xae, 0xbe, 0xb8, 0x87, 0x8a, 0xc4, 0x41, 0x6f, 0x51, 0x56, + 0x88, 0x44, 0x39, 0x33, 0x32, 0x79, 0x96, 0x41, 0x44, 0xbf, 0x48, 0x40, + 0x7a, 0x3c, 0x9e, 0x8a, 0x47, 0xc3, 0xce, 0xab, 0x8b, 0x44, 0xc5, 0xa8, + 0xa3, 0x7e, 0x4a, 0x3a, 0xc3, 0x72, 0x7e, 0xae, 0x67, 0x6c, 0x9b, 0xc6, + 0xb1, 0x38, 0xa2, 0x79, 0xb4, 0x55, 0x3e, 0x54, 0x9b, 0xbf, 0x47, 0xc3, + 0x62, 0x99, 0x6d, 0x4f, 0x3b, 0x4e, 0xbd, 0xcc, 0x39, 0x48, 0x32, 0xbc, + 0x49, 0x4c, 0x66, 0x9e, 0x4a, 0x7d, 0xbd, 0x83, 0x93, 0x35, 0xb9, 0xae, + 0x91, 0xd2, 0x7a, 0xcd, 0x37, 0xc6, 0x57, 0xbe, 0x5a, 0x9d, 0x72, 0x9b, + 0x80, 0x8e, 0xa8, 0x38, 0x70, 0x48, 0x9b, 0x7e, 0xa4, 0xca, 0x7d, 0x7b, + 0x4c, 0x6a, 0x88, 0x7a, 0x43, 0x6f, 0x5c, 0x69, 0xa6, 0x8c, 0x39, 0xaa, + 0xd3, 0x6c, 0x58, 0x93, 0x77, 0x5e, 0xb4, 0x47, 0xa0, 0xa3, 0x3a, 0x6f, + 0xcb, 0x81, 0x44, 0x31, 0x86, 0xbb, 0x83, 0x4a, 0x7f, 0x87, 0xb0, 0xc5, + 0xb5, 0x7d, 0x4a, 0x6b, 0xad, 0x7d, 0xae, 0x86, 0xb3, 0x84, 0x65, 0xcf, + 0x40, 0x3f, 0x48, 0x3e, 0x8b, 0xb1, 0x9c, 0xa3, 0x77, 0xae, 0xa0, 0x77, + 0xab, 0x39, 0x81, 0xc7, 0x93, 0x69, 0x7e, 0x73, 0x49, 0xc9, 0x9f, 0x8f, + 0x9e, 0x7d, 0x4f, 0x93, 0xac, 0x73, 0x30, 0x5a, 0xb9, 0x3b, 0xb0, 0xa1, + 0xad, 0x78, 0x91, 0xb0, 0xa5, 0x55, 0x60, 0x56, 0x99, 0x85, 0x7a, 0x83, + 0x5f, 0xbc, 0x41, 0xd1, 0xa4, 0xc3, 0x8f, 0x7f, 0x39, 0xcd, 0x90, 0x65, + 0x9a, 0x8e, 0x3c, 0x76, 0x93, 0x37, 0x98, 0xc4, 0x56, 0xb8, 0x2e, 0x87, + 0x84, 0x43, 0x57, 0x9c, 0x74, 0x46, 0x62, 0x5b, 0x69, 0x39, 0xa6, 0x7d, + 0x3e, 0x6c, 0xb1, 0xb2, 0xae, 0x8c, 0xb9, 0x60, 0x87, 0xc6, 0x57, 0x31, + 0x3a, 0x7b, 0x73, 0x53, 0x49, 0xad, 0xc3, 0x72, 0x85, 0x9c, 0xc0, 0xe1, + 0x3e, 0x68, 0x6b, 0x41, 0xbc, 0x90, 0x98, 0x43, 0x73, 0x64, 0xc9, 0x98, + 0x8a, 0x78, 0xae, 0x6b, 0x2a, 0x24, 0x47, 0x7c, 0x84, 0x73, 0x3f, 0x32, + 0x40, 0x37, 0xa7, 0xa5, 0x6b, 0x93, 0x4d, 0x45, 0xa7, 0xb1, 0x9c, 0xce, + 0x7c, 0xa3, 0x66, 0xa0, 0x49, 0xc1, 0x49, 0x36, 0xbd, 0xe1, 0xb0, 0x5b, + 0xae, 0xdc, 0xa0, 0x4c, 0x3c, 0x43, 0x77, 0x64, 0x58, 0xcf, 0xa4, 0x79, + 0x65, 0xcc, 0xc7, 0xb4, 0xcb, 0x34, 0xaf, 0x67, 0x70, 0xa1, 0x36, 0x69, + 0xa3, 0xb0, 0xbb, 0xb5, 0x8c, 0x3f, 0x50, 0x5f, 0x53, 0x45, 0x8b, 0xbe, + 0x89, 0x98, 0xb8, 0xbf, 0x71, 0x71, 0xd9, 0x8a, 0x69, 0x67, 0x2f, 0xb2, + 0x34, 0x5b, 0x3e, 0x8c, 0xc0, 0x66, 0x8d, 0x68, 0xcd, 0x37, 0x71, 0xb6, + 0x47, 0xba, 0x55, 0x6b, 0x4a, 0xa1, 0x95, 0xbd, 0x3e, 0xaa, 0xbe, 0x46, + 0xcc, 0x99, 0x80, 0x3a, 0x91, 0x6d, 0x36, 0x58, 0xc1, 0xca, 0x3f, 0xb4, + 0x67, 0x91, 0xcf, 0xb1, 0x3e, 0xc2, 0x98, 0x42, 0xc0, 0x75, 0x31, 0xca, + 
0x6f, 0x5d, 0x82, 0x3d, 0x77, 0x33, 0xb7, 0x75, 0x4d, 0xc8, 0x98, 0xd1, + 0xb7, 0x7f, 0x9b, 0x67, 0xb3, 0xa2, 0xb4, 0xce, 0xa2, 0x9f, 0x62, 0x8f, + 0x6d, 0xa7, 0xc2, 0x53, 0xca, 0x59, 0x2f, 0x5f, 0xcd, 0x6a, 0x99, 0xaa, + 0x5c, 0x8c, 0xb2, 0xc1, 0x7f, 0xbe, 0x74, 0x53, 0xb2, 0x6a, 0xae, 0xbd, + 0x6c, 0x69, 0x3d, 0xa4, 0x32, 0xa3, 0xae, 0x85, 0xa7, 0xd1, 0x5d, 0x80, + 0x86, 0xa8, 0x83, 0xac, 0x60, 0x5b, 0xd1, 0xa0, 0xc0, 0x6c, 0xb7, 0xaa, + 0x70, 0x2c, 0x58, 0xc4, 0x3f, 0xa6, 0x57, 0x95, 0xcb, 0x55, 0x42, 0x45, + 0x9f, 0x5a, 0xb4, 0x75, 0x53, 0x6f, 0x69, 0xcd, 0x59, 0x80, 0x64, 0x49, + 0x40, 0x84, 0xc0, 0xbd, 0x5d, 0x3c, 0x4d, 0xbf, 0x5a, 0xc3, 0x6b, 0x9a, + 0x38, 0x9c, 0x6d, 0x95, 0x8b, 0x5c, 0x8c, 0xcb, 0xb6, 0xbd, 0xd5, 0x82, + 0xb0, 0x4a, 0x57, 0x77, 0x5d, 0xa2, 0x50, 0xbc, 0x45, 0x58, 0x84, 0xb7, + 0x37, 0x37, 0x9e, 0x2c, 0x7a, 0x3b, 0x45, 0x8e, 0x98, 0xcc, 0x63, 0x94, + 0xa9, 0x96, 0xbf, 0x9c, 0x44, 0x35, 0x4f, 0x7e, 0x9b, 0x5c, 0xa9, 0x75, + 0x7c, 0x9b, 0x90, 0xb2, 0x49, 0x58, 0x9c, 0x3a, 0xbe, 0xcf, 0xaf, 0x3a, + 0x96, 0xc9, 0xc9, 0x83, 0xa9, 0x83, 0x84, 0x57, 0x5a, 0x70, 0xbd, 0xd7, + 0x8a, 0xbe, 0x70, 0x84, 0xa6, 0x68, 0x95, 0x65, 0xc7, 0x90, 0x54, 0x62, + 0xac, 0x32, 0x60, 0x5b, 0x51, 0x6a, 0x63, 0xb7, 0x62, 0x63, 0xbb, 0xaa, + 0xa5, 0x36, 0x74, 0x94, 0x5f, 0x37, 0xad, 0xb3, 0x8d, 0x6d, 0x49, 0x58, + 0x42, 0x5e, 0x68, 0x84, 0x42, 0xbc, 0xbf, 0x9c, 0x37, 0x5d, 0x92, 0xbe, + 0x60, 0xc4, 0xa7, 0xa8, 0xca, 0x8c, 0x53, 0x97, 0x70, 0xc0, 0x5e, 0x36, + 0xad, 0x42, 0x7f, 0x91, 0x43, 0x76, 0x65, 0xb6, 0xc9, 0x3a, 0x65, 0x53, + 0x46, 0xc7, 0x89, 0x52, 0xbb, 0xd2, 0x5c, 0x9b, 0x34, 0x62, 0x79, 0xc1, + 0x7d, 0x60, 0x48, 0x55, 0xc5, 0x76, 0xc1, 0xc4, 0x51, 0x7d, 0x50, 0x50, + 0xc7, 0x9c, 0x70, 0x70, 0x2b, 0xb3, 0x7b, 0xcd, 0xb6, 0x42, 0x7d, 0xb3, + 0x8e, 0xda, 0x9a, 0x77, 0xc5, 0xa0, 0xc8, 0x72, 0x8d, 0xc2, 0x54, 0xb9, + 0x34, 0xb1, 0x67, 0x9a, 0xa9, 0x52, 0x75, 0x80, 0x6f, 0x81, 0xb3, 0x6a, + 0x4c, 0xce, 0xd8, 0x6c, 0x6d, 0xa7, 0xaf, 0x51, 0x55, 0x3e, 0xde, 0xda, + 0x81, 0x97, 0xb7, 0x7e, 0x9c, 0x6a, 0xae, 0x98, 0xbb, 0x7b, 0x72, 0x5f, + 0x47, 0x71, 0x96, 0xa3, 0xcd, 0xca, 0x2a, 0x21, 0xa4, 0x5f, 0x82, 0x69, + 0x5d, 0x62, 0x43, 0x3c, 0x74, 0x5c, 0xc3, 0xc6, 0xc6, 0xc3, 0x48, 0x5b, + 0x4a, 0x8c, 0x67, 0x5b, 0x4d, 0x8e, 0x68, 0x40, 0x40, 0x7d, 0x79, 0x72, + 0x6b, 0xac, 0x48, 0xc1, 0xd2, 0x62, 0xc2, 0x41, 0x74, 0x70, 0x55, 0xc7, + 0x91, 0x5b, 0x4d, 0xcd, 0x89, 0xaf, 0x8a, 0x9c, 0xaa, 0x34, 0x9a, 0x85, + 0xb4, 0x87, 0x54, 0xbf, 0xa1, 0x57, 0x5d, 0xa6, 0xbc, 0x41, 0x49, 0xbd, + 0xa1, 0xc7, 0xae, 0x3c, 0x92, 0xbc, 0x88, 0x94, 0xd3, 0x42, 0x3d, 0x78, + 0x32, 0x3e, 0x3c, 0xbf, 0x86, 0x3b, 0x6d, 0x7b, 0x9e, 0x7c, 0x71, 0xd1, + 0x5e, 0xcb, 0xad, 0x4a, 0xac, 0xa2, 0x80, 0xc9, 0x5a, 0xd0, 0x50, 0x8f, + 0x90, 0x45, 0x7e, 0x60, 0x59, 0x80, 0xab, 0x83, 0xd4, 0x40, 0xcc, 0x8e, + 0xc3, 0xb4, 0xba, 0x36, 0x7f, 0xc3, 0x82, 0xb4, 0x7d, 0xaf, 0x39, 0x4f, + 0x99, 0xce, 0x59, 0x52, 0xc1, 0x78, 0xce, 0x87, 0x72, 0x6b, 0x82, 0x7c, + 0xa0, 0x98, 0x58, 0xc9, 0xa6, 0x80, 0xca, 0x53, 0xd5, 0xcd, 0xcc, 0x87, + 0x4f, 0x85, 0x3d, 0xaa, 0x93, 0x74, 0x70, 0x7e, 0x44, 0x94, 0x5c, 0x34, + 0xbc, 0x98, 0xb1, 0xc9, 0xce, 0x45, 0x4d, 0xd0, 0x38, 0x7d, 0xb7, 0x6e, + 0xc8, 0xda, 0x48, 0xac, 0x76, 0x65, 0x43, 0x7b, 0xa0, 0xc5, 0x60, 0x3f, + 0x76, 0x4d, 0x5c, 0x69, 0x7a, 0x8f, 0x68, 0xaf, 0x9f, 0x4a, 0xd5, 0x7e, + 0xc7, 0x97, 0x9d, 0xc0, 0xb2, 0x88, 0x6f, 0xc4, 0x3d, 0x94, 0xc6, 0x48, + 0xa9, 0xc7, 0xc6, 0x4b, 0x5c, 0xbb, 0xb6, 0x92, 0x89, 0xaa, 0xb4, 0xb0, + 0x7d, 0xad, 0x8e, 0x5e, 0x39, 0x7b, 0xc3, 0x58, 0xba, 0x47, 0x52, 0x99, + 
0x7a, 0x99, 0xbe, 0x6c, 0xc3, 0xb7, 0x5f, 0x43, 0x47, 0xaa, 0x2c, 0x99, + 0x69, 0x95, 0xd3, 0x51, 0x63, 0xb7, 0xcc, 0xd7, 0x99, 0x6f, 0x99, 0x83, + 0x39, 0x4b, 0xc0, 0x62, 0x6e, 0x72, 0xb4, 0x53, 0x51, 0xab, 0x6c, 0x47, + 0xd6, 0x95, 0xb8, 0x69, 0x70, 0x4d, 0x69, 0xa2, 0xb5, 0x8d, 0x56, 0xc5, + 0x93, 0x88, 0x7b, 0x69, 0x8e, 0x94, 0xcf, 0x5c, 0x8f, 0x90, 0xc5, 0x3a, + 0x76, 0x40, 0xa8, 0x97, 0x66, 0xac, 0x95, 0x5b, 0x92, 0xa2, 0xb5, 0x6f, + 0xb3, 0x3e, 0xd6, 0x5f, 0x69, 0x91, 0x9c, 0x9d, 0x8b, 0xb9, 0xc5, 0x51, + 0x70, 0x97, 0x54, 0xad, 0x4b, 0xd6, 0x74, 0xd1, 0x9b, 0x82, 0xa5, 0x9d, + 0x59, 0x87, 0xaf, 0x91, 0x5f, 0xd4, 0x6e, 0x7d, 0x67, 0xcf, 0xbc, 0x45, + 0xc6, 0x5f, 0x77, 0xc1, 0x89, 0x45, 0xaa, 0x74, 0x4d, 0x53, 0x97, 0x47, + 0xb6, 0x85, 0xa5, 0xd0, 0x9e, 0x81, 0xbb, 0xbb, 0xb8, 0x98, 0x78, 0x55, + 0x57, 0x79, 0x48, 0x7f, 0x6c, 0xa3, 0xa0, 0x5c, 0x65, 0xd0, 0x5f, 0xd4, + 0x44, 0x77, 0x38, 0x45, 0x48, 0x52, 0xab, 0x39, 0x81, 0xd0, 0xc2, 0xb6, + 0x4d, 0x51, 0x94, 0x75, 0xc2, 0x62, 0xd3, 0xbc, 0x7f, 0xb4, 0x4d, 0x77, + 0xaf, 0x79, 0x84, 0x7a, 0x75, 0x8f, 0xb0, 0x3f, 0x60, 0xbb, 0x7c, 0xcd, + 0xb9, 0xa5, 0xb0, 0x5d, 0xb2, 0x48, 0x69, 0x8d, 0x6c, 0xb9, 0x33, 0x98, + 0xc1, 0xc4, 0x73, 0x74, 0x96, 0x34, 0x53, 0x9b, 0xc7, 0x39, 0xad, 0x3c, + 0x6d, 0x6f, 0x6c, 0x33, 0x8e, 0x72, 0xa1, 0x7e, 0xaa, 0x82, 0x83, 0xd4, + 0x9c, 0x7f, 0xd0, 0x48, 0x7a, 0x46, 0x70, 0xba, 0x9b, 0x96, 0xbb, 0xd0, + 0x38, 0xa7, 0xc2, 0xa2, 0x86, 0x68, 0xa3, 0xcf, 0xcb, 0x80, 0x39, 0x3e, + 0x85, 0xbe, 0x7f, 0x94, 0xa9, 0xd9, 0x71, 0x51, 0x4f, 0x70, 0x8f, 0xae, + 0x46, 0x53, 0x84, 0xd2, 0xa6, 0x51, 0xb7, 0x54, 0x26, 0x6d, 0x67, 0xb7, + 0x3c, 0x9d, 0xc1, 0x70, 0x51, 0x6d, 0x46, 0x59, 0x78, 0xc2, 0x59, 0x47, + 0x45, 0x59, 0x7d, 0x70, 0x94, 0x85, 0xbd, 0x9e, 0xb6, 0x9c, 0xb8, 0x7b, + 0x9f, 0x54, 0x7f, 0xab, 0x61, 0x53, 0x75, 0x74, 0x47, 0x6a, 0x71, 0x37, + 0x35, 0xa6, 0x75, 0xbe, 0x45, 0xcd, 0x4c, 0x97, 0x74, 0x65, 0x6f, 0xce, + 0xd8, 0x4b, 0x59, 0x49, 0x59, 0xb9, 0x4a, 0xc8, 0x66, 0xcc, 0x7c, 0x81, + 0x99, 0x3a, 0xab, 0x6d, 0x51, 0xb1, 0xc9, 0xae, 0x74, 0xc7, 0xbb, 0x5a, + 0x97, 0xb7, 0x8f, 0x4f, 0xa2, 0xd1, 0x9b, 0x7e, 0xcd, 0x48, 0x7e, 0x9b, + 0x65, 0x37, 0xa6, 0x47, 0x82, 0x63, 0x79, 0xb5, 0xc0, 0xa3, 0xaa, 0xba, + 0x5e, 0x9f, 0x73, 0x8c, 0x71, 0x7e, 0x36, 0x7c, 0x6f, 0x36, 0xa4, 0xad, + 0xc2, 0x79, 0x83, 0x9e, 0x7b, 0xbf, 0xd7, 0x5c, 0x4d, 0x4b, 0xa5, 0x8a, + 0xca, 0x3f, 0x5c, 0x94, 0xcf, 0xc2, 0x80, 0x5c, 0xdb, 0x43, 0x6a, 0x80, + 0xa8, 0x62, 0x4e, 0x99, 0x6c, 0xd8, 0xbc, 0xd5, 0x4e, 0x92, 0x60, 0xd4, + 0x8e, 0x92, 0xb1, 0x6c, 0x66, 0x8c, 0x8d, 0x81, 0x6f, 0x4a, 0x6e, 0x80, + 0x53, 0xc1, 0xcb, 0xaa, 0x9e, 0xd0, 0x99, 0x8a, 0xbe, 0xc6, 0xb2, 0x76, + 0x69, 0x46, 0xa8, 0x92, 0xae, 0xb2, 0x77, 0x9e, 0x8e, 0xd8, 0x3c, 0x53, + 0x62, 0x4f, 0x8d, 0xad, 0x47, 0x95, 0x7a, 0xb6, 0x85, 0xbf, 0x82, 0xab, + 0xc9, 0x56, 0x43, 0x3c, 0x49, 0xd1, 0x6c, 0xcc, 0x3d, 0xbd, 0x91, 0x6e, + 0x6a, 0x86, 0xd8, 0xa7, 0x3e, 0x6a, 0x7a, 0x63, 0x3e, 0x59, 0xa9, 0xa8, + 0x3b, 0x94, 0xdd, 0x65, 0xc2, 0x79, 0x5f, 0x72, 0x46, 0x2e, 0xb8, 0xcf, + 0x7d, 0x4c, 0x5e, 0x92, 0xa0, 0x51, 0x47, 0x37, 0x32, 0x8b, 0x9d, 0x71, + 0x4e, 0x6c, 0x97, 0x56, 0x90, 0xb8, 0x58, 0xa8, 0xc6, 0x3c, 0x8a, 0x61, + 0xdc, 0xbc, 0x78, 0x63, 0xdf, 0xbf, 0x94, 0x2f, 0xbb, 0x52, 0x48, 0xab, + 0x96, 0x77, 0x75, 0x55, 0x4a, 0x4d, 0x74, 0x66, 0x52, 0x6c, 0xa3, 0x68, + 0x61, 0x80, 0x3b, 0x8a, 0xbe, 0x86, 0x88, 0xc4, 0xc2, 0x95, 0xb3, 0x59, + 0x6a, 0xbe, 0x4c, 0x99, 0x82, 0x6b, 0x35, 0xac, 0x45, 0x65, 0x9c, 0x43, + 0xb0, 0x52, 0x4e, 0x2a, 0x69, 0x5b, 0x94, 0xac, 0x96, 0x89, 0x8a, 0x9b, + 
0x75, 0x94, 0x8d, 0xac, 0x4a, 0x87, 0x30, 0x3a, 0xa2, 0x33, 0x46, 0xba, + 0x6a, 0xb3, 0x44, 0xd2, 0x53, 0x34, 0x88, 0xc5, 0x57, 0xa3, 0x53, 0xb9, + 0x8c, 0x87, 0xae, 0x38, 0xc3, 0x55, 0x5a, 0xa0, 0x9c, 0x8e, 0x75, 0xa5, + 0x5e, 0x6c, 0xa1, 0x90, 0xb1, 0x8c, 0xa9, 0x58, 0x94, 0x5c, 0x6e, 0xa5, + 0x6a, 0xc6, 0x9f, 0x5d, 0xab, 0xc3, 0x69, 0x65, 0x93, 0x8e, 0xb3, 0x3f, + 0x2c, 0x8c, 0xcf, 0xd8, 0xa7, 0xb7, 0xa1, 0x66, 0x5d, 0xaf, 0x49, 0x28, + 0x76, 0xbb, 0x3a, 0x37, 0x40, 0xd4, 0xd6, 0xb7, 0x57, 0xb7, 0x87, 0x4c, + 0xc0, 0x67, 0x5d, 0x7d, 0xab, 0xbf, 0x66, 0x58, 0x9f, 0x7f, 0x65, 0xa5, + 0x56, 0x59, 0x40, 0x36, 0xb8, 0xaa, 0xc3, 0x89, 0x61, 0x35, 0xa6, 0xd3, + 0x57, 0xc6, 0xbb, 0xae, 0x77, 0x46, 0x2d, 0xc7, 0xe4, 0xb2, 0xd6, 0x9f, + 0x4f, 0xb3, 0xc2, 0x4f, 0x41, 0x95, 0x49, 0xc9, 0x41, 0xb4, 0xb6, 0xcf, + 0x75, 0xdc, 0xd2, 0x83, 0x69, 0x4e, 0x61, 0x55, 0x48, 0x6a, 0xb3, 0xa3, + 0x7f, 0x91, 0x53, 0xcb, 0x9c, 0x2e, 0x9d, 0xc0, 0x7f, 0x85, 0xba, 0xc5, + 0xc2, 0xb9, 0xb6, 0x51, 0x68, 0x59, 0xba, 0x80, 0xa1, 0xc1, 0x40, 0x6b, + 0x93, 0x55, 0x7e, 0x8a, 0xaa, 0x88, 0xb7, 0x50, 0x35, 0x65, 0x5d, 0xa3, + 0xcc, 0x59, 0xb6, 0x9a, 0x73, 0x91, 0xba, 0x96, 0xc8, 0x78, 0x3d, 0x61, + 0x7c, 0x9c, 0xc8, 0x42, 0x7e, 0x8f, 0x34, 0x43, 0x41, 0x68, 0x6d, 0xa1, + 0x5f, 0x34, 0x97, 0x4b, 0x2f, 0x81, 0x7b, 0x58, 0x73, 0xc8, 0x9b, 0x43, + 0x5f, 0xa5, 0x6a, 0x66, 0xa9, 0xd2, 0x40, 0x8c, 0x4e, 0x69, 0x99, 0x9f, + 0xa6, 0x54, 0x70, 0x64, 0xc8, 0x65, 0xac, 0x39, 0x5a, 0x6a, 0xce, 0x36, + 0xb0, 0x62, 0x3a, 0x8e, 0x34, 0x5c, 0x6b, 0xab, 0x3b, 0x60, 0x34, 0xca, + 0xc9, 0x65, 0xcf, 0x7a, 0x89, 0x72, 0xa7, 0xae, 0x97, 0x9d, 0x9d, 0x3c, + 0x64, 0xc8, 0x55, 0xca, 0xb4, 0x78, 0xcb, 0x53, 0xb8, 0x50, 0x52, 0x4d, + 0x96, 0x82, 0xb3, 0x81, 0xa1, 0x61, 0x60, 0xa6, 0xa2, 0x97, 0xb0, 0x63, + 0x39, 0xa4, 0x9b, 0x91, 0xc9, 0x42, 0x5c, 0x53, 0xbe, 0xb7, 0xbc, 0xb8, + 0x36, 0x74, 0xc7, 0xbd, 0xce, 0xb2, 0xcb, 0x77, 0xa0, 0x39, 0x6f, 0xaf, + 0xad, 0x42, 0x78, 0x3b, 0x61, 0x82, 0x8c, 0xca, 0xbb, 0xae, 0x61, 0x9b, + 0x57, 0x57, 0x84, 0x3c, 0xa6, 0xa5, 0x3b, 0x9d, 0xa3, 0x99, 0x78, 0x37, + 0x61, 0x68, 0xb4, 0x43, 0xc6, 0xa7, 0xcf, 0x7c, 0xa8, 0x9d, 0x60, 0x3a, + 0x9a, 0x87, 0x68, 0xd2, 0x96, 0xac, 0x59, 0x34, 0xd3, 0x41, 0x2d, 0x73, + 0x67, 0x6a, 0x7c, 0x5a, 0xbb, 0xd3, 0x8d, 0x9a, 0xb2, 0xb8, 0x52, 0x33, + 0x7b, 0x52, 0x68, 0x7a, 0x78, 0x93, 0xca, 0x6b, 0x52, 0xa6, 0x7c, 0xb3, + 0x59, 0x59, 0xae, 0xbf, 0x98, 0x61, 0x4c, 0x8f, 0x39, 0x79, 0x4e, 0x7b, + 0x98, 0xc9, 0x5f, 0xb4, 0xc8, 0x82, 0x33, 0x7f, 0x4f, 0x6e, 0xa8, 0xa8, + 0xb4, 0x7a, 0xd4, 0xcd, 0x68, 0x59, 0xbf, 0x4c, 0x5e, 0x81, 0x4b, 0xb6, + 0xa3, 0x90, 0x48, 0x74, 0x87, 0xc6, 0x47, 0x33, 0x84, 0xb6, 0x4c, 0x61, + 0x71, 0x9f, 0xa9, 0x85, 0x93, 0xbb, 0x65, 0x9a, 0x6f, 0xad, 0xa0, 0xba, + 0x45, 0x96, 0xbc, 0x37, 0x4c, 0x74, 0x7f, 0x30, 0x51, 0x4a, 0xb1, 0x3b, + 0xba, 0xc0, 0x9d, 0x48, 0x63, 0x43, 0x7e, 0xb4, 0x94, 0x9e, 0x9e, 0x86, + 0x72, 0x74, 0xb6, 0x81, 0x32, 0xc4, 0x90, 0x96, 0xc0, 0xc5, 0x5d, 0x4e, + 0xce, 0x89, 0x7d, 0x59, 0xba, 0xbe, 0xad, 0x36, 0x7b, 0x8c, 0xbc, 0xb5, + 0x59, 0x47, 0x6d, 0x6e, 0xac, 0x82, 0x87, 0x50, 0x46, 0xc3, 0x64, 0x3c, + 0x3e, 0x64, 0x4f, 0x97, 0x9c, 0x47, 0xa5, 0xaa, 0x4f, 0xad, 0xb8, 0xad, + 0x6e, 0x36, 0x8c, 0x64, 0x74, 0x47, 0x8b, 0x6b, 0xb4, 0x96, 0x3f, 0x5a, + 0x91, 0x6f, 0x94, 0xaa, 0x50, 0x66, 0x6f, 0x5b, 0xc1, 0xd5, 0xc2, 0xcd, + 0x91, 0x47, 0x57, 0x45, 0x84, 0x7c, 0x3a, 0x6c, 0xcc, 0x3c, 0x58, 0xb1, + 0xa2, 0x3b, 0xb3, 0x7a, 0xcf, 0x95, 0x38, 0x6b, 0x37, 0xaf, 0x61, 0x69, + 0xb5, 0x9a, 0x81, 0xc1, 0xba, 0x60, 0x50, 0x3f, 0xb7, 0x8f, 0x8f, 0xb1, + 
0x9f, 0xbd, 0x80, 0xc4, 0x5e, 0x3a, 0x76, 0xd3, 0x54, 0xb6, 0xa6, 0xa4, + 0x82, 0x5f, 0x7b, 0x6d, 0x62, 0x69, 0x45, 0x97, 0x94, 0xa1, 0xbd, 0xc9, + 0xc0, 0xb6, 0x67, 0xc7, 0xca, 0xb2, 0xd2, 0xbc, 0x76, 0x4e, 0xb2, 0x68, + 0x38, 0x48, 0xb8, 0x67, 0x7e, 0x5e, 0x67, 0xc6, 0xca, 0xb8, 0x9b, 0x53, + 0x99, 0xaf, 0x75, 0x54, 0x75, 0xb1, 0x9c, 0x74, 0xcc, 0x6a, 0xa0, 0x38, + 0xab, 0x39, 0x55, 0x40, 0xb0, 0x7d, 0x84, 0x65, 0x45, 0xaf, 0xab, 0xc5, + 0x6b, 0x8a, 0x31, 0xbf, 0x5d, 0x59, 0x80, 0xa1, 0x4d, 0x7c, 0xb5, 0x7c, + 0x5b, 0x70, 0x44, 0xa1, 0x33, 0x4d, 0x69, 0xc9, 0x4d, 0x77, 0x44, 0xc2, + 0x8e, 0x82, 0x9f, 0x81, 0x72, 0x9e, 0x93, 0x8b, 0x5f, 0x44, 0x80, 0x91, + 0x58, 0xbe, 0xbc, 0xb6, 0x2f, 0x3e, 0x98, 0xb9, 0x42, 0xad, 0x85, 0x83, + 0xd0, 0x3f, 0xb3, 0x45, 0xbe, 0x3b, 0x89, 0x4b, 0x5b, 0xc1, 0x46, 0x36, + 0x60, 0x40, 0xaa, 0x86, 0xb2, 0x61, 0xb8, 0x88, 0x30, 0xbc, 0x52, 0x8c, + 0x95, 0xb9, 0x52, 0xb8, 0xb6, 0xca, 0xa3, 0x9b, 0x94, 0x70, 0x92, 0x9f, + 0x34, 0xd5, 0x5f, 0x94, 0x7d, 0xa5, 0xd1, 0x6b, 0x65, 0x68, 0x66, 0x99, + 0x9d, 0x7c, 0x65, 0x61, 0x3a, 0x78, 0x46, 0xb1, 0x6a, 0x63, 0x45, 0x6d, + 0x49, 0x2b, 0x35, 0x66, 0x3c, 0x35, 0x6e, 0x62, 0xa3, 0xc5, 0x96, 0x8d, + 0xbb, 0x9e, 0xa6, 0x8c, 0xb7, 0x4d, 0x67, 0x55, 0xa9, 0x37, 0x7c, 0x5d, + 0xaf, 0xa3, 0xc1, 0xd0, 0x44, 0xce, 0x68, 0xa8, 0x57, 0x45, 0x6c, 0x90, + 0x63, 0x82, 0x6d, 0x81, 0xaa, 0xc6, 0xad, 0x4e, 0x40, 0x78, 0x7f, 0x78, + 0x88, 0x6b, 0xa5, 0x62, 0x73, 0xd7, 0xba, 0x98, 0x92, 0x53, 0x35, 0xce, + 0x6d, 0x3c, 0x94, 0xc1, 0x62, 0x72, 0x81, 0xa1, 0xc3, 0x3b, 0x65, 0x85, + 0x98, 0x89, 0xa3, 0x5b, 0xbf, 0x75, 0xa5, 0xb9, 0x33, 0x7e, 0xbe, 0x6b, + 0x8b, 0xba, 0xab, 0x87, 0x97, 0x4e, 0x5b, 0x91, 0xc7, 0xaf, 0xaf, 0x56, + 0x92, 0x3a, 0x50, 0x9d, 0x73, 0x60, 0x59, 0xaa, 0x9e, 0x4f, 0x70, 0x96, + 0x41, 0x3d, 0x6d, 0x7c, 0x69, 0x49, 0xa1, 0xc5, 0xc8, 0x83, 0x42, 0x5e, + 0x63, 0x51, 0x4b, 0x9f, 0x96, 0xad, 0x3d, 0x34, 0xba, 0x46, 0xb1, 0x82, + 0x7d, 0x2f, 0xa6, 0x84, 0x47, 0x90, 0x37, 0x55, 0x7d, 0x81, 0xa9, 0x39, + 0x98, 0x41, 0x8a, 0x8b, 0xd5, 0x7e, 0x93, 0x62, 0x6a, 0x82, 0xb9, 0x8b, + 0x65, 0x6d, 0x99, 0x9c, 0x50, 0x50, 0x93, 0x59, 0xbd, 0xa0, 0x35, 0x89, + 0xa4, 0x8c, 0xa6, 0x54, 0x93, 0x7f, 0x4d, 0x65, 0x56, 0x3e, 0x66, 0x30, + 0x5f, 0xb4, 0xce, 0x5d, 0x5a, 0xcf, 0xb2, 0xa6, 0xb5, 0x7b, 0xc1, 0xcc, + 0xb8, 0x68, 0x52, 0x77, 0x4b, 0x7b, 0x78, 0xbc, 0xa1, 0x33, 0x9c, 0xb9, + 0x68, 0x48, 0x3a, 0xa4, 0x9c, 0x98, 0xd8, 0x65, 0xb3, 0xb0, 0x93, 0x43, + 0x3b, 0x3f, 0x57, 0x39, 0x60, 0xd2, 0x36, 0x57, 0x93, 0x46, 0xcf, 0xcf, + 0x3f, 0x5c, 0x8a, 0x6b, 0x62, 0x8a, 0x43, 0xaf, 0x6a, 0x6b, 0x81, 0xaf, + 0xae, 0x63, 0x3b, 0x9f, 0x87, 0xc9, 0x52, 0x6e, 0x7c, 0x6a, 0xd3, 0xc9, + 0x6b, 0x76, 0xc9, 0xc1, 0x98, 0xc2, 0x61, 0x4b, 0x58, 0x40, 0x99, 0xc2, + 0x68, 0xcc, 0xbf, 0x74, 0x44, 0xbf, 0x4f, 0x32, 0x70, 0x97, 0x8f, 0xc7, + 0xc6, 0x53, 0x9f, 0x8a, 0xac, 0x88, 0x6b, 0xb0, 0x4c, 0xa0, 0x68, 0x5b, + 0x44, 0xb4, 0xc9, 0x73, 0x7b, 0x5c, 0x80, 0x38, 0xb7, 0x63, 0x85, 0x73, + 0x54, 0x8f, 0x61, 0x4c, 0x77, 0x7a, 0xcb, 0x4e, 0x73, 0xa6, 0x55, 0xbe, + 0xc3, 0x3c, 0x49, 0xd2, 0x54, 0xae, 0x3e, 0xba, 0x6c, 0xb7, 0x4b, 0x31, + 0x91, 0x79, 0x47, 0x98, 0x75, 0x98, 0x53, 0x5b, 0x66, 0x4e, 0xc6, 0x96, + 0x70, 0x86, 0xa8, 0xcd, 0x52, 0x83, 0x96, 0x44, 0x56, 0xb5, 0x37, 0x73, + 0x6b, 0x65, 0x71, 0x84, 0x96, 0x52, 0xc7, 0xbf, 0xa3, 0xb0, 0xae, 0x89, + 0x87, 0x42, 0xb9, 0xbd, 0x79, 0x42, 0x77, 0x64, 0xb0, 0x40, 0x42, 0xd2, + 0x7e, 0x93, 0xaa, 0x7b, 0xa6, 0x6e, 0x9e, 0x57, 0xa3, 0x93, 0x37, 0x83, + 0x83, 0xa4, 0x92, 0x62, 0x9b, 0x81, 0xcf, 0x3c, 0xbd, 0x7c, 0x7d, 0xc6, + 
0xcb, 0x57, 0x92, 0x74, 0xc5, 0xa3, 0x30, 0x3a, 0x6e, 0x31, 0x7e, 0x70, + 0x68, 0x5f, 0xa0, 0x8b, 0x34, 0x3e, 0x43, 0x74, 0x97, 0x70, 0xc6, 0xcb, + 0xad, 0x7a, 0xbd, 0x48, 0x6d, 0xc7, 0xbf, 0xac, 0x57, 0x5a, 0x45, 0x84, + 0x88, 0x57, 0x60, 0x8e, 0x3b, 0x56, 0xaa, 0x50, 0x40, 0x39, 0x77, 0x53, + 0xb8, 0x8f, 0xa8, 0x7d, 0x91, 0xa2, 0x63, 0x89, 0x52, 0x86, 0x98, 0xac, + 0x45, 0xcc, 0xc3, 0xcc, 0x56, 0x3f, 0xb5, 0x33, 0xca, 0xa7, 0xb8, 0xbc, + 0x60, 0xd0, 0xd1, 0x70, 0xa9, 0x63, 0x45, 0x79, 0x49, 0x9b, 0x4c, 0x6b, + 0xa7, 0xae, 0x49, 0x4f, 0x9b, 0x47, 0x8a, 0x32, 0x47, 0x55, 0x38, 0x5f, + 0xab, 0x97, 0x42, 0xa7, 0x5f, 0x3d, 0x3d, 0x36, 0x5f, 0x74, 0xbe, 0x9e, + 0xd0, 0x79, 0x8c, 0xad, 0x98, 0x67, 0x3e, 0xc3, 0x47, 0x6c, 0x3d, 0xc6, + 0xa5, 0xbb, 0x89, 0x5f, 0x52, 0x66, 0x3c, 0xa6, 0x77, 0x51, 0xc1, 0x89, + 0xac, 0x5c, 0x96, 0x71, 0x68, 0x40, 0x80, 0x9b, 0xa2, 0xa4, 0x32, 0x74, + 0xb9, 0x96, 0x92, 0x73, 0xa6, 0x73, 0xa5, 0x55, 0xa4, 0x7e, 0x54, 0xc5, + 0x59, 0xa2, 0x91, 0x4d, 0x53, 0x91, 0x3b, 0x5f, 0x57, 0x87, 0xa1, 0x85, + 0x8c, 0x60, 0x81, 0xb1, 0xc7, 0xc0, 0x99, 0x5f, 0x94, 0x3e, 0xc1, 0xce, + 0x9a, 0x5c, 0x41, 0xa1, 0x6c, 0xc3, 0xcd, 0xa0, 0x7f, 0xbc, 0x9c, 0xc9, + 0xce, 0x8c, 0x47, 0x6a, 0x81, 0x95, 0x3f, 0x51, 0x53, 0x4e, 0x5c, 0x87, + 0x96, 0xad, 0xcf, 0x89, 0x62, 0x6e, 0xa9, 0x54, 0xc9, 0xc3, 0x33, 0x9f, + 0x82, 0x57, 0xca, 0xa3, 0x98, 0x47, 0xc1, 0xca, 0x3e, 0x8b, 0x65, 0xbd, + 0x6e, 0x98, 0x9d, 0xc9, 0x61, 0x98, 0x9c, 0x8f, 0xb3, 0x3a, 0x75, 0xce, + 0xa0, 0xc1, 0x5e, 0x94, 0x79, 0x4f, 0xac, 0x32, 0xc5, 0xbd, 0x4d, 0x3d, + 0x3c, 0xb1, 0xb7, 0x9b, 0xb8, 0x7d, 0x37, 0x66, 0xad, 0xa9, 0x4c, 0x8d, + 0x55, 0x7e, 0x5e, 0xa3, 0x48, 0x7f, 0x38, 0x4e, 0x3f, 0x5a, 0x30, 0x6d, + 0x73, 0x66, 0x80, 0x9e, 0x7a, 0xad, 0x4d, 0xa4, 0xa8, 0xcd, 0x9a, 0x6a, + 0x34, 0x9f, 0x4e, 0x6f, 0x37, 0x67, 0x61, 0xd0, 0x91, 0x36, 0x84, 0x73, + 0x5c, 0x5d, 0x8d, 0x8e, 0x4d, 0x36, 0x5f, 0x32, 0x59, 0xc2, 0xa7, 0x78, + 0x76, 0x6a, 0xcc, 0x66, 0xcc, 0xab, 0x96, 0xbf, 0x6f, 0x9b, 0x8e, 0x63, + 0x33, 0xc1, 0x36, 0x46, 0x47, 0x9a, 0x69, 0x50, 0xa7, 0x4a, 0x64, 0x98, + 0x78, 0x3d, 0x86, 0xb8, 0x4b, 0x5f, 0x71, 0x90, 0x8b, 0x50, 0x34, 0x4c, + 0x99, 0x62, 0x9c, 0xcf, 0x7f, 0x75, 0x7b, 0x5d, 0xb6, 0x5f, 0xa1, 0x8a, + 0x2e, 0xb3, 0x51, 0x67, 0x5c, 0x92, 0x4f, 0xb9, 0x6a, 0x7b, 0xbf, 0x3f, + 0x3f, 0xaf, 0x4e, 0x83, 0xca, 0xa4, 0x71, 0x5e, 0x4c, 0x90, 0xc8, 0x6a, + 0x6a, 0xcf, 0x55, 0x34, 0x8e, 0xab, 0x4f, 0x9d, 0x30, 0x43, 0x4c, 0x79, + 0x51, 0x90, 0x88, 0x4f, 0x8f, 0xc0, 0x9c, 0xc9, 0x9d, 0x84, 0xb3, 0x3a, + 0xcd, 0x53, 0x48, 0xc6, 0x62, 0xd0, 0x5e, 0x85, 0x41, 0x55, 0xc3, 0x5e, + 0xca, 0x3d, 0x42, 0x39, 0xb8, 0x9b, 0xb4, 0x55, 0x8c, 0x73, 0x47, 0x81, + 0xa5, 0x96, 0xa1, 0x3c, 0x74, 0xbd, 0x40, 0x8c, 0x52, 0xbf, 0xaf, 0x5a, + 0xc8, 0xd4, 0xb1, 0x3c, 0xab, 0x76, 0x82, 0x57, 0xd1, 0xa7, 0x8f, 0x99, + 0xb5, 0x49, 0xc3, 0xa5, 0x8a, 0x59, 0x98, 0x84, 0x82, 0x62, 0xd2, 0xbf, + 0xcf, 0x6d, 0x5c, 0xa3, 0x91, 0xc8, 0x99, 0x44, 0x7e, 0x74, 0x79, 0x87, + 0x9a, 0x39, 0x32, 0x8a, 0xc7, 0xb7, 0x5e, 0xa4, 0xd2, 0x68, 0xc4, 0x42, + 0xa3, 0xd0, 0x69, 0x75, 0x96, 0x9a, 0x8d, 0xac, 0x91, 0x5f, 0x34, 0x79, + 0x55, 0x69, 0x38, 0xbd, 0x5e, 0xb7, 0x8a, 0x6a, 0x30, 0x61, 0x63, 0x4d, + 0xc1, 0x3e, 0x93, 0x90, 0xd3, 0xae, 0x93, 0x7f, 0xbe, 0x61, 0x89, 0x62, + 0x64, 0xc3, 0xc8, 0x6b, 0xca, 0x8e, 0x74, 0x41, 0xc3, 0xaf, 0xad, 0x48, + 0x58, 0x62, 0x8a, 0x3a, 0x47, 0x7f, 0x76, 0xa9, 0x5f, 0xbb, 0xb1, 0x56, + 0xac, 0x64, 0x54, 0xaf, 0x7d, 0xb6, 0x5a, 0x95, 0x6d, 0xc5, 0xc8, 0x8d, + 0x7b, 0xb2, 0x82, 0xd2, 0x59, 0x8d, 0x46, 0x5b, 0xa5, 0x34, 0x6b, 0x91, + 
0x9c, 0x58, 0xa3, 0xc2, 0xb8, 0x4b, 0x68, 0x44, 0x9f, 0xc0, 0x38, 0xc4, + 0x53, 0xab, 0x35, 0x98, 0xcb, 0xc6, 0x52, 0x93, 0xc0, 0x90, 0x5c, 0x51, + 0xc2, 0x8f, 0x8c, 0x4c, 0x8a, 0xd3, 0x63, 0x9f, 0x43, 0x3d, 0x56, 0x6f, + 0xcd, 0xce, 0xbd, 0x45, 0xa3, 0xc0, 0xca, 0x7e, 0x88, 0x8c, 0x68, 0x9d, + 0x81, 0xb3, 0x9d, 0xc3, 0x9f, 0xbe, 0x73, 0x68, 0x6d, 0xb1, 0x55, 0xcc, + 0x4a, 0x55, 0x84, 0x71, 0x7f, 0x93, 0xc9, 0x74, 0x4b, 0x54, 0xab, 0xd4, + 0x86, 0xb8, 0xc2, 0x39, 0x45, 0xc2, 0xa8, 0x2f, 0x45, 0x6f, 0x3b, 0x2f, + 0xc2, 0xba, 0x84, 0x8e, 0x65, 0xd9, 0x99, 0x6d, 0x7b, 0x92, 0xa6, 0xaa, + 0x57, 0x5c, 0x46, 0x9b, 0x61, 0x79, 0xbe, 0x77, 0x60, 0x91, 0xaf, 0xc7, + 0xac, 0x38, 0xb6, 0x4e, 0x38, 0x8f, 0x8d, 0x78, 0x90, 0x4a, 0xbb, 0x73, + 0x79, 0xcc, 0xa6, 0xbd, 0xaf, 0xae, 0x3a, 0x72, 0x85, 0x5a, 0x61, 0x80, + 0x68, 0xb8, 0xa5, 0x78, 0x66, 0x6a, 0x61, 0x3f, 0xc3, 0xa7, 0x5c, 0xd1, + 0x7a, 0x9e, 0x45, 0x84, 0x7e, 0x6d, 0xb0, 0x65, 0xcd, 0xa5, 0xc1, 0x46, + 0xb0, 0x73, 0xb3, 0xd1, 0x79, 0x51, 0xb3, 0xc4, 0x4d, 0x30, 0x5d, 0x7a, + 0xa7, 0x7e, 0x4a, 0x7e, 0x43, 0x70, 0xa8, 0x46, 0x85, 0xc8, 0xc0, 0x8f, + 0x58, 0x9c, 0x56, 0xaf, 0x92, 0x76, 0xa2, 0xb1, 0xf1, 0x58, 0x8c, 0x55, + 0x56, 0x62, 0xb1, 0xa3, 0x4f, 0xc5, 0xab, 0xae, 0xab, 0x7d, 0x29, 0x49, + 0x73, 0x64, 0x6d, 0x74, 0x80, 0x51, 0x62, 0x81, 0x70, 0x36, 0x8d, 0x5c, + 0xbb, 0xae, 0x6d, 0x53, 0x39, 0x80, 0xae, 0x4e, 0x9d, 0xc4, 0x55, 0xcd, + 0x47, 0xdb, 0xc9, 0xde, 0xaf, 0x44, 0x45, 0xb4, 0xd0, 0xbf, 0x93, 0x92, + 0x69, 0xac, 0x56, 0x6f, 0x3d, 0xcb, 0xcb, 0x4e, 0x42, 0xb3, 0xd0, 0x4a, + 0x69, 0xb3, 0x6f, 0x41, 0xa7, 0x52, 0xb8, 0x92, 0x4a, 0x82, 0x46, 0xd1, + 0xdb, 0xa9, 0xd8, 0x92, 0x7b, 0xa1, 0x47, 0xc2, 0xab, 0x6c, 0x56, 0xd6, + 0x39, 0x33, 0xb2, 0x88, 0x52, 0xb2, 0x59, 0x8c, 0x49, 0x50, 0xb4, 0xa4, + 0x6d, 0xb4, 0xc4, 0x3c, 0x74, 0x91, 0x9d, 0x3c, 0xb6, 0x47, 0x65, 0x35, + 0x92, 0x81, 0x6d, 0xc4, 0xd3, 0x6e, 0x83, 0x35, 0x79, 0x9b, 0x9c, 0x53, + 0x40, 0x4c, 0x99, 0xb6, 0xd1, 0x46, 0x79, 0x8b, 0x43, 0x6d, 0x44, 0xac, + 0x6e, 0x66, 0x42, 0x5d, 0x3a, 0xb7, 0x7d, 0x3e, 0xcd, 0x87, 0x60, 0x96, + 0xa7, 0xd2, 0x51, 0x7c, 0xba, 0xb7, 0x57, 0x6f, 0x80, 0xae, 0x83, 0x6f, + 0xbe, 0x48, 0x42, 0x9b, 0xa9, 0x61, 0x51, 0x46, 0x45, 0x99, 0xac, 0x40, + 0xbb, 0x49, 0x80, 0x62, 0x51, 0xae, 0xbc, 0xc3, 0x9d, 0xc7, 0x2d, 0x8c, + 0x32, 0x5f, 0x85, 0x8a, 0x35, 0x75, 0x31, 0x8e, 0x74, 0xc4, 0xbc, 0xc3, + 0xe5, 0x6f, 0x9d, 0x5b, 0x69, 0xac, 0x92, 0xa3, 0x74, 0xc1, 0xc3, 0xb0, + 0x9c, 0x94, 0x54, 0xa6, 0x5a, 0x43, 0x9c, 0xcb, 0xcb, 0x45, 0xb8, 0xaf, + 0x60, 0x8e, 0xc2, 0x6f, 0x6b, 0xdb, 0x73, 0xaf, 0xc5, 0x6a, 0xaa, 0x3c, + 0x33, 0x94, 0x8a, 0x8d, 0x86, 0xad, 0xcc, 0x82, 0x6d, 0xbd, 0x83, 0xab, + 0x61, 0x73, 0x6f, 0x55, 0x47, 0x96, 0xba, 0x48, 0x39, 0x9a, 0x4f, 0xad, + 0x7a, 0x90, 0x49, 0x6d, 0xa7, 0xa8, 0x8d, 0x83, 0x9c, 0xc9, 0x7a, 0xb5, + 0x9a, 0x4f, 0x40, 0xb7, 0xb7, 0xb1, 0xa5, 0x84, 0x8e, 0x74, 0x87, 0x3a, + 0x4f, 0x82, 0x66, 0xc1, 0x64, 0x35, 0xa6, 0x5e, 0xac, 0x70, 0x9b, 0xab, + 0xa0, 0xbe, 0x3b, 0x5d, 0x90, 0x46, 0xbd, 0xa7, 0xd4, 0xa1, 0x4e, 0x54, + 0x72, 0x9a, 0xb0, 0x51, 0xaa, 0xa8, 0x45, 0x6a, 0x55, 0xa0, 0x85, 0xab, + 0x7d, 0x63, 0xa9, 0x47, 0x2f, 0x90, 0x63, 0x8c, 0xa7, 0x43, 0xcd, 0xa2, + 0x55, 0x3a, 0x37, 0xbc, 0x5f, 0x89, 0x70, 0x53, 0x97, 0xaa, 0x84, 0x80, + 0xbf, 0x82, 0x3c, 0x8b, 0x56, 0x73, 0xca, 0xd2, 0xc9, 0xb7, 0x7b, 0x81, + 0xcd, 0x80, 0x56, 0x4f, 0x66, 0x75, 0x4a, 0x70, 0xc2, 0xc0, 0x71, 0x77, + 0x70, 0x4e, 0x8b, 0x3f, 0x3a, 0x55, 0x98, 0xc6, 0x29, 0xa3, 0x76, 0x86, + 0xa4, 0xa0, 0x4c, 0x93, 0x6c, 0xa4, 0xcc, 0x9d, 0x73, 0x79, 0x3f, 0x97, + 
0x73, 0xc5, 0xae, 0x6e, 0x91, 0x9a, 0x48, 0xa6, 0x83, 0xc5, 0x79, 0x39, + 0x96, 0x6b, 0xd1, 0x77, 0xd1, 0x8c, 0xa9, 0xc0, 0x85, 0x4d, 0x8e, 0x57, + 0xd5, 0xca, 0x6a, 0x3b, 0x2c, 0x2d, 0x77, 0x8d, 0x65, 0x2f, 0x4c, 0x8d, + 0x33, 0x42, 0x5f, 0x5a, 0x77, 0x78, 0xc7, 0x81, 0x67, 0x8c, 0x8d, 0xa7, + 0x62, 0xab, 0x56, 0x62, 0x82, 0x9b, 0xc3, 0x4b, 0x88, 0xc4, 0x87, 0x8b, + 0x74, 0xbe, 0xcf, 0xa1, 0x24, 0x2e, 0x79, 0xcb, 0xc2, 0x5d, 0xb3, 0x72, + 0xd5, 0xbf, 0x6c, 0x65, 0x39, 0x86, 0xc8, 0xb3, 0x56, 0x5d, 0x2a, 0xb5, + 0x54, 0x61, 0x37, 0x70, 0xd6, 0xbe, 0x6d, 0x55, 0x33, 0x45, 0x61, 0xb0, + 0x8c, 0x53, 0x39, 0x5e, 0x7e, 0xca, 0xb9, 0x91, 0x94, 0x40, 0x99, 0xa1, + 0x64, 0x45, 0x88, 0x8d, 0xa6, 0x68, 0x64, 0x67, 0x64, 0x46, 0xa2, 0x68, + 0x2d, 0x69, 0xac, 0xd5, 0x7c, 0x7c, 0x3d, 0x78, 0x9e, 0x93, 0xc5, 0x75, + 0x8c, 0xa7, 0xd3, 0xbc, 0x32, 0x49, 0xd4, 0x3a, 0x78, 0xa0, 0x72, 0x66, + 0x3f, 0x5b, 0x4d, 0xc2, 0x32, 0x6c, 0xa3, 0x47, 0xb7, 0x72, 0x42, 0xa9, + 0x3a, 0x60, 0xa3, 0x5c, 0xaf, 0x9b, 0x41, 0xce, 0xba, 0xc3, 0xb2, 0x78, + 0x89, 0xb5, 0x48, 0x39, 0x7b, 0x9a, 0x68, 0x43, 0xaa, 0x9b, 0x6a, 0x6c, + 0x7b, 0xbe, 0x78, 0x61, 0x3b, 0x65, 0x98, 0x66, 0x8e, 0x3b, 0x65, 0x2a, + 0xc2, 0x8d, 0xb1, 0x56, 0x63, 0x4a, 0x4e, 0x8d, 0xce, 0xc0, 0x44, 0x44, + 0x40, 0x83, 0x99, 0xc8, 0x9c, 0x46, 0x74, 0x7d, 0x65, 0xca, 0x40, 0xda, + 0x88, 0x87, 0x75, 0xc9, 0xa0, 0xb1, 0x61, 0x39, 0x4a, 0x84, 0x53, 0xd0, + 0x84, 0x9a, 0x74, 0x3a, 0x3f, 0x62, 0x8c, 0x5b, 0x86, 0x60, 0x64, 0xa2, + 0x47, 0x40, 0xd7, 0x9e, 0x3d, 0x7b, 0xaa, 0x9a, 0xc1, 0xb1, 0xaf, 0x81, + 0x4d, 0x93, 0x52, 0x3e, 0xd2, 0xbe, 0x92, 0xb0, 0xb8, 0x42, 0x70, 0x70, + 0xdf, 0x5d, 0x83, 0x44, 0xaa, 0xa1, 0x77, 0x8f, 0x32, 0x32, 0x83, 0x55, + 0x30, 0xcb, 0x8b, 0x60, 0x79, 0x6c, 0xbd, 0x42, 0x9c, 0x94, 0xc4, 0xcf, + 0x8e, 0x85, 0x30, 0xae, 0xa5, 0x5e, 0x66, 0x9d, 0x80, 0xd2, 0x67, 0x3f, + 0x9d, 0x89, 0xd3, 0x55, 0xab, 0xba, 0x72, 0x32, 0x8d, 0x44, 0xbd, 0xc7, + 0xb5, 0x4e, 0x6e, 0x97, 0x88, 0x90, 0xc9, 0x8c, 0xad, 0x97, 0x57, 0x3b, + 0x2f, 0x99, 0x3b, 0x6a, 0x9e, 0x88, 0xcd, 0x3b, 0x9d, 0x89, 0x66, 0x5e, + 0xc7, 0x6d, 0xcb, 0x61, 0xdf, 0x53, 0x55, 0x71, 0xce, 0x3b, 0x35, 0x42, + 0x9c, 0x81, 0x9c, 0x87, 0xcd, 0x9c, 0x9e, 0xb9, 0x9d, 0xd4, 0x66, 0x58, + 0x90, 0xad, 0x85, 0x43, 0x44, 0xbb, 0xb5, 0x8a, 0x95, 0xa9, 0x62, 0x5d, + 0xad, 0x77, 0x45, 0xbc, 0x9c, 0xcf, 0x2e, 0xc4, 0xbd, 0x39, 0x34, 0x6e, + 0xb5, 0x91, 0x8e, 0x43, 0x90, 0x5e, 0x67, 0x71, 0x74, 0x70, 0xba, 0x65, + 0x3e, 0x56, 0xcd, 0x49, 0xc1, 0x85, 0x8e, 0x86, 0x37, 0x71, 0x68, 0xba, + 0xc1, 0x2a, 0x62, 0xc4, 0x95, 0x47, 0x27, 0x47, 0x82, 0xa9, 0x5f, 0xb8, + 0x6b, 0x51, 0x35, 0xd5, 0xaf, 0x9e, 0xba, 0x3d, 0x44, 0x7c, 0x65, 0xb4, + 0x9c, 0xb8, 0x9f, 0x7a, 0x6b, 0x9e, 0x96, 0x58, 0xae, 0x3b, 0xc1, 0x84, + 0x8b, 0x36, 0xb7, 0x60, 0x9e, 0x80, 0x5a, 0xb8, 0x51, 0x9c, 0x87, 0x44, + 0x6f, 0x99, 0xa5, 0x71, 0x7f, 0x3d, 0x3d, 0x5d, 0x3b, 0x8d, 0x9d, 0xc4, + 0xc8, 0x9e, 0x5f, 0x55, 0x92, 0xa5, 0xb1, 0x41, 0x89, 0xd0, 0x92, 0xc5, + 0x58, 0x32, 0x88, 0x6f, 0x53, 0x71, 0x75, 0xbd, 0xbe, 0x2d, 0x61, 0x61, + 0xae, 0x97, 0xc0, 0xbd, 0x60, 0x4b, 0x94, 0xc9, 0x52, 0x88, 0x9c, 0x8b, + 0x67, 0x56, 0x3f, 0x85, 0x7f, 0x3e, 0x89, 0x99, 0xd6, 0x6e, 0xb9, 0xaf, + 0x93, 0x5d, 0x40, 0x63, 0xc5, 0xd0, 0xc1, 0xb0, 0x5a, 0xbf, 0x89, 0x37, + 0xb7, 0xce, 0x7a, 0x7d, 0x85, 0x89, 0x6b, 0x74, 0x9c, 0xa9, 0x9f, 0x44, + 0x8b, 0x3c, 0x70, 0x56, 0x8f, 0x90, 0x43, 0xa2, 0x4d, 0x7e, 0xca, 0x6b, + 0x4b, 0x7c, 0x41, 0x86, 0xb4, 0x3e, 0x36, 0x3c, 0xbd, 0x8a, 0x3d, 0x4b, + 0xa7, 0xb7, 0x79, 0x82, 0x69, 0x69, 0x50, 0x50, 0xa7, 0x3e, 0x86, 0xb9, + 
0x60, 0x49, 0x42, 0xb0, 0x78, 0xd0, 0x6d, 0x95, 0xcd, 0xc7, 0xa4, 0xb9, + 0x8e, 0x94, 0xb7, 0xad, 0x43, 0xb1, 0x9a, 0xcc, 0xbf, 0xbf, 0xc2, 0x59, + 0x45, 0x84, 0x82, 0x67, 0x42, 0x52, 0xa3, 0x6c, 0x84, 0xca, 0x87, 0x6c, + 0x5f, 0x88, 0x3c, 0x8e, 0xaf, 0x8a, 0x6e, 0x84, 0x5d, 0xc7, 0x89, 0x63, + 0x34, 0xbd, 0xa3, 0x73, 0x73, 0xa4, 0x3c, 0xc8, 0xa5, 0x3a, 0x44, 0x5d, + 0x55, 0x75, 0xbc, 0x94, 0xbd, 0x4b, 0x47, 0xcc, 0x7f, 0x54, 0xa4, 0x5e, + 0x55, 0xb4, 0x7d, 0x4d, 0x71, 0x94, 0x60, 0xc5, 0x55, 0x87, 0x73, 0x71, + 0x8c, 0x3c, 0x5f, 0x58, 0xd0, 0x63, 0x5e, 0xa0, 0x40, 0x4e, 0x5a, 0xcf, + 0x4a, 0x3d, 0x8d, 0x5d, 0xa1, 0x77, 0x61, 0x3d, 0x8f, 0x7d, 0xbe, 0xd2, + 0x5d, 0x75, 0x2b, 0xb5, 0xb5, 0x4f, 0x52, 0x47, 0xd8, 0xda, 0xa4, 0xa8, + 0xc5, 0x77, 0xc5, 0xaf, 0xbf, 0xc2, 0xbe, 0x96, 0xae, 0x4a, 0x9c, 0x7c, + 0x34, 0x47, 0xaa, 0x68, 0x7c, 0x3b, 0x50, 0xb8, 0xc6, 0xa4, 0xb2, 0x94, + 0x7e, 0x61, 0x2c, 0x66, 0xc3, 0x5d, 0x81, 0x95, 0x98, 0x8b, 0x5f, 0xa6, + 0x47, 0x43, 0x3f, 0x97, 0xcc, 0x86, 0x5b, 0x79, 0x5b, 0x4d, 0x90, 0xbd, + 0xad, 0x59, 0x3e, 0xa4, 0x3c, 0x8f, 0xb8, 0xa9, 0x8f, 0xc6, 0x90, 0x9a, + 0x56, 0x8b, 0xad, 0xcb, 0x8f, 0x88, 0x5f, 0x5f, 0x94, 0xb6, 0xb8, 0xd0, + 0xd2, 0x42, 0x36, 0x2e, 0xa2, 0x3b, 0x9a, 0x8a, 0x89, 0x3a, 0x60, 0xa6, + 0x74, 0x68, 0x9d, 0xc8, 0x39, 0x86, 0x43, 0x9c, 0x4e, 0xc9, 0x7c, 0xbb, + 0x70, 0x82, 0x42, 0x8d, 0x83, 0x93, 0xc0, 0x89, 0xcb, 0xc5, 0x46, 0x9f, + 0xca, 0xc1, 0xd4, 0xa7, 0xad, 0x7a, 0xc9, 0x54, 0x63, 0x89, 0x90, 0xa8, + 0x98, 0x77, 0xc5, 0x34, 0x63, 0xc4, 0x8d, 0x69, 0x68, 0x43, 0x96, 0xc9, + 0x66, 0x62, 0x69, 0x67, 0x45, 0x6b, 0xcc, 0xb8, 0x2e, 0x3a, 0x8e, 0x87, + 0x66, 0x59, 0x66, 0x9a, 0x6a, 0x87, 0x73, 0x8c, 0xb0, 0x6e, 0x72, 0x61, + 0x96, 0xa7, 0x41, 0x59, 0x8b, 0x5b, 0x99, 0x8c, 0x79, 0x5c, 0xc1, 0xab, + 0xa5, 0x8b, 0x91, 0xb9, 0xa1, 0x84, 0x35, 0x64, 0x7a, 0x2c, 0x39, 0x68, + 0x6d, 0x7f, 0xba, 0x90, 0x81, 0x83, 0x88, 0x90, 0x6a, 0x91, 0xd1, 0x84, + 0x46, 0xce, 0x81, 0x82, 0x53, 0xb1, 0xb8, 0x3b, 0x64, 0x5c, 0x8e, 0x9b, + 0x68, 0x5e, 0x82, 0x4e, 0x61, 0xb5, 0x76, 0x59, 0xa5, 0xbe, 0x85, 0xb7, + 0x7f, 0x46, 0x5a, 0xaa, 0xac, 0x89, 0xd2, 0xbc, 0x3c, 0x2e, 0x8d, 0x52, + 0x90, 0x3c, 0x28, 0x79, 0x82, 0xb7, 0x7b, 0xa3, 0xa5, 0x9e, 0x36, 0x5d, + 0x47, 0xc4, 0x59, 0x7f, 0xc4, 0xcd, 0xa1, 0x41, 0x9a, 0xd1, 0x3a, 0xd1, + 0x3b, 0x88, 0x38, 0x60, 0x80, 0x72, 0xcb, 0x6e, 0x5c, 0x5e, 0x74, 0xc7, + 0x35, 0x7b, 0x77, 0x2e, 0x69, 0xa8, 0xc9, 0x8f, 0x9b, 0x80, 0xa5, 0x75, + 0x91, 0xa8, 0x6c, 0x49, 0x93, 0x6d, 0x8b, 0x82, 0x4a, 0x3e, 0xb0, 0x6d, + 0xb1, 0xcb, 0x7c, 0xb1, 0x31, 0x86, 0x82, 0xb6, 0xbf, 0x35, 0x86, 0x3b, + 0xb9, 0x9e, 0x46, 0x95, 0x53, 0x3c, 0x9d, 0xc1, 0xbc, 0xb1, 0x6e, 0xa8, + 0xd4, 0x77, 0x38, 0xa2, 0x95, 0x6e, 0x81, 0x89, 0x5c, 0x33, 0x86, 0x6b, + 0xd2, 0x6d, 0xbb, 0x58, 0x6d, 0x91, 0xc0, 0x6c, 0x54, 0x76, 0xc6, 0x44, + 0x56, 0x65, 0x58, 0xbe, 0xc4, 0x3d, 0xb5, 0x4b, 0x55, 0xc3, 0x3d, 0xaf, + 0x4d, 0x3c, 0x79, 0x87, 0xc1, 0x3f, 0x29, 0xd2, 0x7c, 0x63, 0x33, 0x98, + 0x7e, 0xb8, 0x59, 0xcb, 0xb0, 0x88, 0x63, 0xd5, 0xa3, 0xa1, 0x5c, 0x42, + 0xcd, 0x3c, 0x6a, 0x82, 0xac, 0x31, 0x46, 0x83, 0x37, 0xce, 0xa2, 0xb9, + 0xaa, 0x78, 0x8d, 0xd0, 0x8c, 0x59, 0x51, 0xcb, 0x35, 0xa0, 0x92, 0x6a, + 0x5b, 0x79, 0x56, 0x4b, 0x62, 0xcc, 0xa5, 0xae, 0x4d, 0x4c, 0xca, 0x9f, + 0x7c, 0x90, 0x9b, 0x50, 0x85, 0x72, 0x70, 0x4c, 0x62, 0x41, 0xaf, 0xc0, + 0x9b, 0xbf, 0x76, 0xb2, 0x89, 0x46, 0xa6, 0x6e, 0xaf, 0x97, 0x6e, 0xae, + 0xcb, 0xc1, 0x5d, 0x64, 0xc2, 0x5d, 0xb0, 0x6c, 0x57, 0x4f, 0x3e, 0xae, + 0x8a, 0x68, 0x8f, 0x78, 0x31, 0x97, 0x9a, 0xcb, 0x55, 0x44, 0xb6, 0x9f, + 
0x3c, 0x58, 0xb8, 0x81, 0xa6, 0x81, 0x94, 0xa5, 0xb8, 0xcd, 0x37, 0x80, + 0x42, 0x59, 0xa2, 0x4b, 0x82, 0xba, 0x38, 0x4d, 0xbd, 0x72, 0x7c, 0xb3, + 0x71, 0x9d, 0x9a, 0x78, 0xa7, 0xc0, 0x8b, 0x95, 0x4a, 0x94, 0xa7, 0x8d, + 0x88, 0x56, 0xc9, 0xa1, 0xa7, 0x79, 0x37, 0x5e, 0x6d, 0x47, 0x65, 0x5c, + 0x7f, 0x41, 0x3e, 0xa7, 0x77, 0xc7, 0x58, 0xa0, 0x5c, 0x71, 0xc8, 0x5a, + 0x30, 0x7f, 0xab, 0x80, 0xce, 0xb2, 0x3c, 0x35, 0xa0, 0x6a, 0x5e, 0x39, + 0xcc, 0x4b, 0x3e, 0xae, 0x8b, 0xce, 0x8f, 0xc1, 0xbd, 0x4e, 0x6d, 0x6a, + 0x8b, 0x92, 0x57, 0xb7, 0x33, 0x58, 0x76, 0xcc, 0x82, 0x88, 0x74, 0x49, + 0x44, 0x4d, 0x9d, 0x66, 0x7b, 0x76, 0xd2, 0x78, 0x48, 0x8d, 0xb6, 0xaf, + 0x5d, 0xca, 0x4f, 0xd3, 0x89, 0x4f, 0x45, 0x90, 0x32, 0x34, 0x62, 0x88, + 0x3b, 0xb6, 0x83, 0xa8, 0xa5, 0xb6, 0x33, 0x7e, 0x90, 0xbe, 0xa3, 0xa1, + 0x8a, 0x43, 0x88, 0xb0, 0x33, 0x76, 0x3f, 0x35, 0xa5, 0x36, 0xa2, 0x4c, + 0x42, 0x86, 0x49, 0x75, 0xc7, 0xc9, 0x58, 0x69, 0x7e, 0xbd, 0x37, 0x87, + 0xa3, 0x75, 0x8d, 0x4d, 0x46, 0xbb, 0x8d, 0xab, 0x89, 0x7d, 0xcd, 0xc4, + 0x3b, 0x9b, 0x55, 0x96, 0x67, 0x35, 0xa2, 0x4d, 0x6d, 0x3a, 0xae, 0x47, + 0xbd, 0x8b, 0xba, 0x3b, 0x3e, 0xa9, 0x66, 0x83, 0x7a, 0xc7, 0xa9, 0x9e, + 0x98, 0x53, 0x82, 0xa6, 0x8d, 0x63, 0x41, 0x6b, 0x70, 0x43, 0x47, 0x49, + 0x7c, 0x9a, 0x69, 0x2d, 0x5a, 0xca, 0xb4, 0x5b, 0x3c, 0x7b, 0x7a, 0x3b, + 0x37, 0x8e, 0xa4, 0x9d, 0x99, 0xba, 0x9b, 0xc0, 0x74, 0xad, 0x68, 0xbc, + 0x7d, 0x97, 0x78, 0xc0, 0x6e, 0xae, 0x46, 0xa3, 0x40, 0x89, 0x58, 0xb1, + 0x32, 0x92, 0xa0, 0x90, 0x83, 0x86, 0xa1, 0xc2, 0x53, 0x7b, 0xac, 0x9b, + 0x35, 0x5a, 0x63, 0x45, 0x7f, 0x3f, 0x4b, 0x41, 0xbf, 0xa3, 0x44, 0x56, + 0xc7, 0x94, 0xa4, 0xb9, 0xb0, 0x62, 0xa8, 0x3e, 0x4f, 0x4d, 0xaf, 0x89, + 0x45, 0x35, 0x99, 0x38, 0x87, 0xcd, 0x57, 0xcc, 0x55, 0xc4, 0x43, 0x67, + 0xb5, 0xb9, 0xbf, 0x73, 0x7e, 0x49, 0xc9, 0x9c, 0x68, 0x89, 0x59, 0x79, + 0x6a, 0xd5, 0xaa, 0x64, 0xc6, 0xd0, 0x34, 0x65, 0xa4, 0xc5, 0x98, 0xb4, + 0xaa, 0x85, 0x63, 0x35, 0x93, 0xaa, 0x74, 0x5b, 0xb6, 0x48, 0x5f, 0xa5, + 0x74, 0x5b, 0x94, 0xc6, 0x70, 0x43, 0x51, 0xad, 0x32, 0x94, 0xcf, 0x76, + 0x48, 0x4b, 0x40, 0xc3, 0x38, 0x4d, 0x97, 0x4b, 0x4c, 0x32, 0x31, 0xc2, + 0x7a, 0xc5, 0xb8, 0x84, 0x5a, 0xbd, 0xa4, 0x4c, 0xd1, 0x68, 0xcb, 0x9a, + 0x84, 0xb6, 0xbe, 0xa0, 0x32, 0x63, 0xd0, 0x3e, 0x45, 0x56, 0xbd, 0xce, + 0x44, 0x95, 0x80, 0xb3, 0x56, 0x52, 0x88, 0xd4, 0x9c, 0x74, 0x47, 0x83, + 0x45, 0xb9, 0x6f, 0xc7, 0x64, 0x9c, 0xa2, 0xc5, 0x31, 0xac, 0x3f, 0x52, + 0xbc, 0x90, 0x78, 0xc8, 0x60, 0xbf, 0x56, 0x9b, 0x8c, 0xbb, 0x90, 0xa7, + 0x36, 0x43, 0x8e, 0x3a, 0x82, 0xb1, 0x78, 0x93, 0x69, 0xc3, 0x7e, 0x78, + 0x4a, 0x8b, 0xad, 0x5e, 0x90, 0x94, 0x98, 0xc8, 0x85, 0x57, 0x72, 0xb3, + 0x50, 0xa4, 0xbb, 0x53, 0x5d, 0x95, 0x5e, 0x63, 0x44, 0xca, 0x83, 0x75, + 0xce, 0x43, 0xbe, 0x43, 0x49, 0x9d, 0x32, 0x39, 0x6d, 0x49, 0x54, 0x5d, + 0x80, 0x5a, 0x32, 0x94, 0x7f, 0x58, 0x6d, 0x57, 0x7a, 0x9a, 0x57, 0x62, + 0xb1, 0xa7, 0x54, 0xaf, 0xb5, 0xa3, 0x44, 0x5f, 0x61, 0x7b, 0xcf, 0x34, + 0xbe, 0x3c, 0x56, 0x5b, 0x44, 0x6f, 0x6d, 0x98, 0x7f, 0x9c, 0x71, 0x8b, + 0xc7, 0x3f, 0x66, 0x98, 0x7b, 0x86, 0x9c, 0x5f, 0xa2, 0xb3, 0x7a, 0x4d, + 0x54, 0xaa, 0x81, 0xcc, 0xa7, 0xae, 0x7c, 0x2d, 0xcb, 0x40, 0x65, 0x61, + 0xc2, 0x3b, 0x7b, 0x7e, 0xbd, 0x4a, 0x61, 0xc5, 0xbb, 0x7a, 0xbc, 0x8b, + 0x49, 0x7b, 0x5e, 0x62, 0xcf, 0x53, 0xa8, 0x78, 0xc5, 0x8c, 0xad, 0x50, + 0xac, 0xcf, 0x81, 0x53, 0xc7, 0x2e, 0xb0, 0x5e, 0x89, 0x4a, 0x75, 0x7f, + 0xb6, 0xbd, 0x91, 0x61, 0x98, 0x48, 0xd0, 0x5c, 0x4e, 0x5e, 0x60, 0xcb, + 0xbf, 0x71, 0xc8, 0x78, 0x9f, 0x72, 0x3f, 0xb3, 0xd2, 0x39, 0xcc, 0xcc, + 
0xa0, 0x3f, 0x7e, 0xc2, 0x39, 0x36, 0x35, 0x4f, 0xad, 0x92, 0x40, 0x50, + 0x7f, 0x85, 0x9b, 0x8e, 0x8d, 0xcb, 0x46, 0x74, 0x38, 0xaa, 0x7e, 0xa9, + 0x88, 0xbc, 0x7f, 0x99, 0x72, 0xd4, 0x48, 0x65, 0x82, 0x86, 0x59, 0x3b, + 0x42, 0x87, 0x9b, 0xae, 0x53, 0x89, 0xae, 0x9c, 0xbb, 0x38, 0x98, 0x71, + 0x5e, 0xb6, 0x98, 0xba, 0x5a, 0x35, 0x9d, 0xbb, 0xa9, 0xc7, 0x4e, 0x50, + 0x8c, 0x70, 0x74, 0xb8, 0x9b, 0x73, 0xcd, 0x36, 0x3b, 0x3d, 0x6e, 0xc4, + 0x92, 0x5d, 0x4c, 0x3d, 0x55, 0x7e, 0xac, 0xc8, 0xc6, 0xc0, 0x91, 0x34, + 0x9c, 0x89, 0x49, 0x7c, 0x89, 0xb5, 0x3b, 0x33, 0xcd, 0x5b, 0xca, 0x72, + 0xb1, 0x3d, 0x3e, 0x86, 0xa2, 0xb3, 0xc9, 0x8d, 0x90, 0x57, 0x7c, 0x6e, + 0xa7, 0x4a, 0xbb, 0x93, 0x9a, 0x72, 0x55, 0x5f, 0x83, 0xbf, 0x82, 0x4e, + 0x4c, 0xc9, 0xc8, 0x8b, 0xb5, 0x30, 0xb6, 0x2d, 0x6d, 0x40, 0x6b, 0xa1, + 0x3c, 0x3f, 0x7e, 0x7f, 0xad, 0xc8, 0xa9, 0x8f, 0x40, 0xbd, 0xc0, 0x3c, + 0x44, 0xaa, 0x9e, 0xae, 0x3f, 0xbe, 0xc2, 0x3f, 0x73, 0xb9, 0xab, 0x54, + 0xb0, 0x52, 0x56, 0x37, 0xc9, 0xbb, 0x43, 0x3a, 0x8a, 0xc8, 0x73, 0x97, + 0x71, 0x47, 0x4d, 0x3c, 0x51, 0x5e, 0x9a, 0x56, 0x6e, 0x4b, 0x8f, 0x3b, + 0x4b, 0x72, 0x90, 0x7a, 0x88, 0x6c, 0xa3, 0x37, 0x93, 0x65, 0xc3, 0x4b, + 0xa0, 0xa1, 0xc9, 0x4b, 0xae, 0x5c, 0x92, 0x95, 0x7c, 0xb4, 0xb1, 0x7d, + 0xa3, 0x37, 0x38, 0x7d, 0x6b, 0x5f, 0x83, 0x5a, 0x37, 0x82, 0x4c, 0x72, + 0x3b, 0xb9, 0xb5, 0x9d, 0x42, 0x57, 0x6d, 0xa5, 0xbe, 0x91, 0xcc, 0x46, + 0x89, 0x53, 0x81, 0xd1, 0x85, 0x8f, 0x74, 0x67, 0xc3, 0xc6, 0x61, 0x77, + 0xd1, 0x5f, 0x34, 0xaa, 0x44, 0xbc, 0xcc, 0x86, 0x97, 0x39, 0xb5, 0x43, + 0x7d, 0x81, 0xc9, 0x4b, 0x86, 0x8f, 0x42, 0x7f, 0x6d, 0x6c, 0x52, 0x83, + 0xaf, 0x91, 0xb8, 0xc0, 0xc2, 0x4f, 0xb1, 0x3e, 0x4b, 0x70, 0x95, 0x76, + 0x9d, 0x86, 0x5b, 0x76, 0x9d, 0x36, 0x79, 0x42, 0x53, 0x44, 0xbc, 0x76, + 0xba, 0x3a, 0xaf, 0x44, 0xcb, 0x58, 0xc8, 0xa4, 0x8d, 0x4d, 0x44, 0xd0, + 0x5a, 0x88, 0x3a, 0x53, 0x48, 0x89, 0xa3, 0x48, 0xc0, 0x74, 0xcc, 0xa7, + 0xa1, 0x46, 0x86, 0x47, 0x91, 0xb4, 0x73, 0x52, 0xcc, 0x5d, 0x56, 0x6d, + 0xb4, 0x69, 0x78, 0xd1, 0x37, 0xaf, 0x48, 0x47, 0x9d, 0x75, 0x64, 0x66, + 0x78, 0x3c, 0x9e, 0x55, 0x76, 0xa8, 0x62, 0x63, 0xa8, 0x56, 0xbd, 0x80, + 0xa2, 0x76, 0x8e, 0x7c, 0x9e, 0x69, 0xc6, 0x7b, 0x86, 0x7c, 0x41, 0xc2, + 0x51, 0x36, 0x77, 0xc2, 0x76, 0xae, 0xa2, 0x43, 0x36, 0xcc, 0x6a, 0x39, + 0x3d, 0xbc, 0x66, 0xa0, 0xcf, 0x54, 0x3e, 0x99, 0xc8, 0x73, 0xc9, 0xbd, + 0x8a, 0x76, 0x6e, 0x84, 0x7d, 0x65, 0x53, 0x8b, 0xad, 0x59, 0xc3, 0xbf, + 0x3e, 0x36, 0x95, 0x39, 0xa3, 0xb8, 0x99, 0x86, 0xae, 0x66, 0x40, 0x92, + 0x5e, 0x8e, 0xbf, 0xb8, 0x59, 0xae, 0x72, 0x41, 0x65, 0xa0, 0xba, 0x73, + 0x81, 0x4b, 0x7d, 0xa4, 0x78, 0xab, 0xc0, 0x5b, 0xc7, 0xc6, 0xbd, 0x98, + 0x37, 0xb3, 0x3b, 0x3e, 0x9b, 0xd0, 0x97, 0x59, 0x8e, 0x89, 0x9b, 0xa1, + 0x9c, 0x3c, 0x8a, 0x3d, 0x85, 0x46, 0xa9, 0x47, 0x36, 0xc9, 0x34, 0x6f, + 0x74, 0x74, 0x94, 0x33, 0x57, 0x55, 0xb0, 0xc2, 0x39, 0x82, 0x3e, 0x6c, + 0x60, 0xcb, 0x6e, 0x5b, 0xca, 0x98, 0x30, 0x35, 0x9f, 0xa8, 0xb0, 0xc1, + 0x49, 0x55, 0x74, 0xb5, 0x4b, 0x5a, 0x59, 0x65, 0xca, 0xbd, 0x67, 0x4a, + 0x8b, 0x36, 0x6e, 0x9e, 0xa8, 0x48, 0x6a, 0x32, 0x38, 0x9b, 0x7e, 0x8b, + 0x3e, 0xa3, 0xaa, 0xa7, 0x76, 0xa5, 0xb1, 0x4c, 0xad, 0x3f, 0xbb, 0xb1, + 0x40, 0x7c, 0x7d, 0x96, 0xa1, 0xb4, 0x58, 0x57, 0x40, 0x68, 0xb1, 0xb0, + 0x4d, 0x3d, 0xc2, 0xce, 0x65, 0x99, 0x53, 0x57, 0x76, 0x72, 0x5e, 0x5f, + 0x9f, 0x37, 0x3b, 0x8d, 0xa2, 0x98, 0x98, 0x78, 0x97, 0xd1, 0xc7, 0xb0, + 0xc5, 0xa1, 0x5a, 0x9f, 0x44, 0x52, 0x68, 0x56, 0xa2, 0x49, 0x67, 0x8f, + 0xa9, 0x36, 0x43, 0x81, 0xa0, 0xc6, 0xa7, 0x86, 0x6d, 0x9c, 0x35, 0x76, + 
0xd0, 0xa5, 0x95, 0x60, 0xab, 0xbc, 0x58, 0x7a, 0xcf, 0x54, 0xcb, 0xa9, + 0x52, 0x65, 0xa5, 0xa4, 0xd2, 0x5e, 0xaa, 0x7b, 0xb2, 0xbd, 0x74, 0xbb, + 0x5c, 0x7e, 0x4d, 0x70, 0x92, 0xbd, 0x7e, 0x62, 0x49, 0xb1, 0xbf, 0x8a, + 0x77, 0xad, 0x93, 0x61, 0x83, 0xaa, 0x96, 0x54, 0x87, 0x4e, 0x39, 0xca, + 0x5c, 0x77, 0x70, 0x5c, 0xb8, 0x9b, 0x3b, 0x3c, 0xd1, 0x9f, 0x47, 0xbe, + 0x7f, 0x71, 0x99, 0x38, 0x4e, 0x46, 0x64, 0x41, 0x5a, 0xa9, 0x7a, 0x6f, + 0x59, 0xcc, 0x86, 0x4f, 0x91, 0x70, 0xb0, 0xc8, 0x40, 0x98, 0x8e, 0xbb, + 0xb7, 0x76, 0x89, 0x49, 0x51, 0x65, 0xa7, 0x4e, 0xbc, 0x39, 0x3e, 0xaf, + 0x5a, 0xc8, 0x79, 0x99, 0x4b, 0x68, 0x32, 0x38, 0x4f, 0xa0, 0xaa, 0xba, + 0xc3, 0x4b, 0xc1, 0xc0, 0x60, 0x4d, 0x4d, 0x35, 0xaf, 0x4a, 0x96, 0x43, + 0x9c, 0x36, 0x56, 0xae, 0xd3, 0xa7, 0xa1, 0xa1, 0x97, 0xa8, 0xa8, 0xc1, + 0x3f, 0x8a, 0x51, 0x8c, 0x64, 0x4a, 0x42, 0x92, 0x3d, 0x47, 0xd3, 0x91, + 0x4c, 0xb6, 0x64, 0xa2, 0x4b, 0x69, 0x6e, 0x4d, 0x50, 0x60, 0x86, 0x46, + 0xbf, 0xa5, 0x5b, 0x38, 0xb9, 0x3f, 0xcb, 0x3e, 0x74, 0x5e, 0xb5, 0x78, + 0xd0, 0xa9, 0x52, 0x3d, 0xcc, 0x94, 0xcc, 0x78, 0x41, 0x33, 0x3e, 0xbd, + 0x9b, 0x9b, 0x6a, 0xc2, 0xa2, 0x82, 0x65, 0x78, 0xcb, 0x6e, 0xa1, 0x8f, + 0x7c, 0xa2, 0xc2, 0x58, 0x64, 0xb9, 0xa9, 0xb7, 0x88, 0x34, 0x5a, 0x3e, + 0x59, 0xd4, 0x36, 0x5b, 0x8a, 0xa5, 0xbf, 0x95, 0x53, 0x62, 0x94, 0xc5, + 0x40, 0x83, 0xca, 0x67, 0x9a, 0x36, 0x3b, 0x31, 0x4f, 0x51, 0x4f, 0x70, + 0xa5, 0x4d, 0x41, 0x74, 0x81, 0x4d, 0x66, 0xa6, 0x5d, 0x92, 0x58, 0x39, + 0xad, 0x93, 0x89, 0xbe, 0x81, 0xaa, 0xa7, 0x4e, 0x71, 0x4f, 0xa0, 0x59, + 0x9e, 0x3c, 0x4c, 0xb4, 0x62, 0x90, 0x34, 0x98, 0x62, 0x41, 0x93, 0xa3, + 0x42, 0xd1, 0xa9, 0x90, 0x89, 0x66, 0x66, 0x98, 0x9a, 0x84, 0x3e, 0x58, + 0x89, 0xa1, 0x7b, 0xc0, 0x35, 0xb5, 0x4f, 0x73, 0x4b, 0xc7, 0x75, 0x35, + 0xbe, 0x50, 0x97, 0xd3, 0x73, 0xa5, 0x90, 0x8e, 0x80, 0x90, 0xd2, 0xd0, + 0x50, 0xc0, 0x62, 0xbb, 0xc7, 0x96, 0x5f, 0xb6, 0x81, 0x65, 0x99, 0x3d, + 0x8f, 0xab, 0xb8, 0xa9, 0xd2, 0x59, 0x58, 0x86, 0x86, 0x6a, 0xc6, 0xbc, + 0x43, 0x7d, 0x5f, 0x65, 0xc3, 0x77, 0x64, 0x3a, 0xbc, 0x8d, 0x4f, 0x63, + 0xa7, 0x4d, 0x45, 0x66, 0x5b, 0xa9, 0x32, 0x37, 0x36, 0x53, 0xd4, 0xae, + 0x4b, 0xb2, 0x3f, 0x45, 0xc9, 0x3a, 0xc5, 0x37, 0x70, 0x62, 0x72, 0x54, + 0x80, 0x4a, 0x77, 0xcb, 0x79, 0x4d, 0x8c, 0x31, 0x6f, 0x56, 0x4d, 0x6e, + 0x93, 0x47, 0xa6, 0x52, 0x7b, 0x77, 0x3b, 0xa3, 0x46, 0x8e, 0xcd, 0xd3, + 0x4b, 0x82, 0x4b, 0xd1, 0x44, 0x3f, 0xad, 0x4f, 0x5e, 0x67, 0x4c, 0x3c, + 0x56, 0xd3, 0xcd, 0x88, 0xab, 0x43, 0xad, 0x40, 0x76, 0x3f, 0xa3, 0x78, + 0x8f, 0x3e, 0xca, 0x78, 0x53, 0x82, 0x67, 0xd3, 0x65, 0x89, 0x47, 0x74, + 0xc7, 0x53, 0x9e, 0x86, 0x41, 0x7e, 0x4e, 0xcb, 0x94, 0xb3, 0x81, 0x51, + 0x61, 0xbe, 0xb7, 0xa9, 0x47, 0xa4, 0xa1, 0x4c, 0xaa, 0x57, 0x76, 0x6c, + 0x84, 0x8a, 0xc9, 0x4b, 0x3c, 0x7f, 0x8a, 0x30, 0x7b, 0x8d, 0x61, 0x51, + 0x4f, 0x5c, 0x9a, 0x4a, 0xc3, 0x96, 0x66, 0x53, 0x3f, 0x41, 0x3d, 0xa8, + 0x75, 0x5a, 0x4f, 0x62, 0x62, 0x35, 0x5c, 0xaa, 0x93, 0x89, 0x59, 0x4f, + 0x46, 0x78, 0x74, 0xc3, 0x8a, 0x47, 0x84, 0x35, 0x96, 0x96, 0x76, 0x64, + 0x9a, 0x53, 0x6e, 0xa1, 0x46, 0x3d, 0x5e, 0xb5, 0x6d, 0x9e, 0xa5, 0x89, + 0xcc, 0x4c, 0xbf, 0x87, 0xb0, 0xc0, 0x5c, 0xb9, 0x44, 0x6f, 0x95, 0x53, + 0x6d, 0x4f, 0x53, 0x7e, 0x5a, 0xaf, 0xa9, 0x5d, 0x66, 0x8f, 0xbf, 0x5b, + 0xb4, 0x83, 0xc1, 0x8b, 0xa0, 0xc2, 0xa2, 0x64, 0x92, 0x5a, 0xca, 0x74, + 0x5f, 0x84, 0x80, 0x39, 0x3a, 0xa8, 0xbb, 0xa7, 0xd2, 0xb5, 0xa5, 0x74, + 0x7b, 0xa8, 0x7f, 0xbb, 0x8d, 0x58, 0x3c, 0x75, 0xa9, 0xc1, 0xb3, 0x94, + 0xc1, 0xc4, 0x3b, 0x39, 0x98, 0x94, 0x6c, 0x95, 0xb1, 0x63, 0x76, 0x83, + 
0x6d, 0x5d, 0x4c, 0x9e, 0x8b, 0x43, 0x49, 0x97, 0x4c, 0x80, 0x8c, 0x6d, + 0x78, 0xaa, 0x47, 0x62, 0x51, 0x56, 0x34, 0xb8, 0x69, 0x4e, 0x86, 0xaa, + 0x5c, 0x59, 0x6c, 0xa9, 0x79, 0x85, 0xbd, 0xb9, 0x89, 0x77, 0x91, 0xb0, + 0x52, 0x99, 0x92, 0x49, 0x5c, 0x3f, 0x40, 0x8b, 0xb9, 0x3a, 0xa3, 0x83, + 0x41, 0xd1, 0x9e, 0xc0, 0xa7, 0x59, 0xa0, 0x4b, 0x68, 0x93, 0xc3, 0x3a, + 0x8f, 0x60, 0x4b, 0x5e, 0xc5, 0x3f, 0x49, 0x4d, 0x4d, 0x4c, 0x9c, 0x4b, + 0x51, 0xa5, 0x62, 0x70, 0x89, 0x4c, 0x99, 0xa7, 0x7d, 0xa1, 0xbe, 0x7a, + 0x32, 0xa0, 0x9d, 0x33, 0x3d, 0x4c, 0x5d, 0x58, 0x86, 0x83, 0xd0, 0x53, + 0xce, 0x96, 0x37, 0x91, 0x7d, 0xcd, 0xae, 0x36, 0x60, 0x5a, 0x52, 0x49, + 0x5a, 0x88, 0x52, 0x7a, 0x45, 0x3a, 0x68, 0xb8, 0xc9, 0xc9, 0xcb, 0x8e, + 0xc0, 0x8b, 0xa1, 0x6d, 0xc3, 0xb5, 0x54, 0x32, 0x87, 0xd0, 0x52, 0x66, + 0x4e, 0x63, 0xbe, 0x63, 0xa9, 0x7a, 0x4a, 0x7e, 0x5c, 0x77, 0x59, 0x4e, + 0x67, 0x9f, 0x32, 0x8b, 0x32, 0x36, 0xb4, 0x9a, 0xc0, 0x39, 0x4a, 0xb1, + 0x85, 0xc3, 0xba, 0x75, 0xab, 0x7a, 0xb5, 0x41, 0x5a, 0x8e, 0xd5, 0x3f, + 0xc3, 0x33, 0x73, 0xcb, 0x6a, 0xc8, 0x45, 0x5e, 0x6c, 0xb0, 0xbc, 0x86, + 0xd1, 0xd4, 0x85, 0xca, 0x3e, 0xb5, 0xcc, 0x64, 0x85, 0x5f, 0x95, 0xbb, + 0x42, 0xae, 0xcb, 0x59, 0x9a, 0xae, 0xd0, 0x4e, 0x7d, 0x4e, 0xb9, 0x9e, + 0x7e, 0x8a, 0x70, 0xb5, 0x3b, 0xbe, 0x64, 0x94, 0x42, 0x92, 0x65, 0x75, + 0x83, 0x5b, 0x67, 0xc0, 0xca, 0x7d, 0xd1, 0x84, 0x80, 0x3b, 0x40, 0xb5, + 0x97, 0x89, 0x40, 0xbb, 0x5f, 0x3c, 0x77, 0xc6, 0x58, 0x91, 0xc5, 0x91, + 0x46, 0x4e, 0x35, 0xc6, 0xc5, 0x66, 0x6d, 0x83, 0x45, 0x7b, 0x8d, 0xd1, + 0x33, 0x4e, 0xd5, 0x7e, 0xbb, 0x81, 0x6f, 0x75, 0xc0, 0xa5, 0xcf, 0xc6, + 0x40, 0x9c, 0xcc, 0xbe, 0x46, 0xc3, 0x78, 0xc2, 0xad, 0xd0, 0xac, 0x5a, + 0xc3, 0x50, 0xa7, 0x47, 0xac, 0xac, 0xaa, 0x7b, 0xb5, 0x54, 0x94, 0x7d, + 0x63, 0xa5, 0x65, 0x9c, 0x8c, 0x6c, 0xa8, 0x37, 0x5e, 0x53, 0xd2, 0x3d, + 0xc8, 0x87, 0x48, 0xb6, 0x50, 0x48, 0xae, 0x66, 0x5a, 0x5f, 0x96, 0x71, + 0x2c, 0xb6, 0x53, 0x62, 0xc4, 0xa9, 0x88, 0xaf, 0x4a, 0x3d, 0x54, 0x73, + 0x74, 0xaf, 0xa4, 0xaf, 0x9e, 0x68, 0xce, 0x79, 0x7c, 0x4c, 0xc9, 0x64, + 0x75, 0xb9, 0xb9, 0x72, 0x4a, 0xb7, 0xae, 0x55, 0xac, 0xb2, 0x6f, 0x9f, + 0x5e, 0xaf, 0x65, 0xc2, 0xb8, 0x74, 0x6a, 0x44, 0xad, 0xd1, 0xa2, 0xcc, + 0xda, 0xc7, 0x75, 0x6b, 0x47, 0x69, 0x4a, 0x34, 0xc4, 0x44, 0x5d, 0x94, + 0xab, 0x9e, 0x8a, 0x87, 0xde, 0xde, 0xbf, 0xcd, 0x4b, 0xa2, 0xb3, 0xa5, + 0xb0, 0x57, 0x36, 0x6d, 0x6a, 0xd2, 0x50, 0xb0, 0xc5, 0xbb, 0x79, 0x89, + 0x58, 0xcf, 0x3e, 0xcd, 0xbc, 0xb5, 0x74, 0x48, 0x9b, 0xad, 0x6f, 0x4c, + 0xd7, 0x8d, 0x7e, 0x71, 0x84, 0xba, 0x81, 0xad, 0xd0, 0x69, 0x34, 0x6f, + 0xcd, 0x61, 0xc7, 0x4a, 0x50, 0xb4, 0x8c, 0x9d, 0x96, 0xb2, 0x5c, 0xd0, + 0x98, 0xb8, 0x8b, 0x9c, 0x6c, 0x85, 0xc0, 0x4c, 0x7d, 0xad, 0x6a, 0x6c, + 0xb5, 0xca, 0xb2, 0x65, 0x83, 0x4b, 0x8c, 0x9c, 0xc4, 0x42, 0x90, 0x76, + 0x6c, 0x7e, 0x73, 0x2f, 0x42, 0xaf, 0x4a, 0x9a, 0x6b, 0x39, 0x9e, 0xaf, + 0x79, 0xba, 0x84, 0x40, 0xc8, 0xad, 0xb6, 0x69, 0x8c, 0x97, 0x56, 0xbd, + 0x50, 0xb5, 0xc9, 0xa9, 0x4c, 0xc0, 0x7f, 0x94, 0x89, 0xab, 0xd3, 0x82, + 0x63, 0x6a, 0xb6, 0xb3, 0x2e, 0x34, 0xbf, 0x30, 0x50, 0x40, 0x62, 0x43, + 0x61, 0xc0, 0xc8, 0x71, 0x70, 0x39, 0xa4, 0xca, 0x75, 0x46, 0xbe, 0x74, + 0x39, 0x3c, 0x7f, 0xaa, 0xb5, 0xcb, 0x75, 0x3e, 0x5d, 0x59, 0x88, 0xad, + 0x36, 0x9f, 0x6c, 0x6a, 0xaa, 0x65, 0x91, 0x71, 0x82, 0x8b, 0xaa, 0xc1, + 0xbf, 0xc7, 0x80, 0xba, 0xb3, 0xc5, 0xbd, 0x35, 0x7b, 0x9f, 0xba, 0x31, + 0x9b, 0x56, 0x60, 0x95, 0x43, 0x3a, 0x2b, 0x3b, 0x5a, 0x31, 0xae, 0x85, + 0x7a, 0xc9, 0xac, 0x85, 0x5c, 0x9e, 0x53, 0x48, 0x74, 0x90, 0x4b, 0x67, + 
0x8e, 0x6d, 0xbf, 0xc7, 0x58, 0x67, 0x69, 0xcc, 0x95, 0x6e, 0xb5, 0x7d, + 0x97, 0xdb, 0xc8, 0x90, 0x92, 0x3a, 0x67, 0xaf, 0x50, 0xb2, 0x80, 0x40, + 0x83, 0x5b, 0x88, 0x92, 0xcb, 0x7e, 0x80, 0x63, 0x6f, 0xb0, 0xc4, 0x3a, + 0x94, 0xac, 0x8e, 0x76, 0xbb, 0xb7, 0xb4, 0xa7, 0x41, 0xb3, 0xac, 0x50, + 0x64, 0xac, 0x4a, 0xc1, 0x95, 0x5c, 0x51, 0x3f, 0xab, 0x44, 0x7a, 0xc0, + 0x6f, 0x3e, 0x33, 0x6a, 0xb0, 0xaf, 0x85, 0x44, 0x4b, 0xc4, 0x28, 0x59, + 0x30, 0x73, 0x62, 0xcf, 0x49, 0xb8, 0xaa, 0x9b, 0x81, 0xa3, 0xda, 0x43, + 0x6c, 0x68, 0x82, 0x67, 0x74, 0xd6, 0x88, 0x64, 0x47, 0xcd, 0xbd, 0x48, + 0x9f, 0xa2, 0x62, 0x76, 0x9c, 0x35, 0x86, 0x45, 0xa8, 0x88, 0xa3, 0x49, + 0x97, 0x5d, 0xa4, 0x81, 0x4d, 0xb4, 0x9d, 0xcb, 0x51, 0x8d, 0x76, 0x2c, + 0x55, 0x75, 0xbd, 0xab, 0xbe, 0xa5, 0x7f, 0xa0, 0x99, 0x44, 0xa0, 0xa4, + 0x45, 0xb1, 0x75, 0x67, 0x3e, 0x75, 0x5a, 0xa3, 0x4e, 0xb8, 0x74, 0x3c, + 0xc1, 0xb2, 0x60, 0xcd, 0xd2, 0x4a, 0x6d, 0xa7, 0xc8, 0xbc, 0x96, 0xc8, + 0x66, 0xb0, 0xcd, 0xc7, 0x67, 0x6f, 0x83, 0xaf, 0xbe, 0x37, 0x7f, 0xb8, + 0x37, 0x4a, 0x89, 0x84, 0x51, 0xa4, 0x54, 0x2d, 0x7e, 0xcc, 0x7a, 0x4e, + 0x3a, 0xb5, 0xad, 0x59, 0xb7, 0x2f, 0x44, 0x7f, 0x7f, 0xa8, 0x6b, 0x69, + 0x3e, 0x74, 0x62, 0x4a, 0xbc, 0x73, 0x5c, 0x6f, 0x64, 0x6f, 0x5f, 0xa3, + 0x8a, 0x3c, 0x8a, 0x6b, 0x55, 0x64, 0xd8, 0x6a, 0x95, 0x85, 0x5e, 0x71, + 0x63, 0x3d, 0x9f, 0x81, 0x90, 0x48, 0x3e, 0x77, 0x68, 0xb1, 0x8f, 0x95, + 0x7c, 0xb2, 0x97, 0x66, 0xa9, 0xb0, 0xa8, 0x76, 0x61, 0x4a, 0xc1, 0xd3, + 0xa4, 0x7e, 0x5d, 0xaa, 0x53, 0x93, 0x75, 0xd3, 0x3f, 0xa1, 0x38, 0xa1, + 0x6c, 0xc6, 0xc3, 0xb4, 0x76, 0x70, 0x6e, 0xa7, 0xc1, 0x9e, 0xc9, 0xc7, + 0xa8, 0x73, 0xd3, 0xb1, 0x68, 0x93, 0xd1, 0x99, 0xa9, 0xc4, 0x90, 0xa4, + 0x45, 0xc6, 0x9f, 0xaa, 0x95, 0x4f, 0xd7, 0xc8, 0x53, 0x72, 0xa4, 0xc0, + 0x4f, 0x80, 0xa3, 0x90, 0x98, 0xa5, 0x6e, 0x5d, 0x54, 0x96, 0xc1, 0x65, + 0x50, 0x67, 0x59, 0x81, 0x4a, 0x90, 0x41, 0xc2, 0xb1, 0x41, 0xca, 0xc3, + 0xa5, 0xbd, 0x46, 0x75, 0x5b, 0x99, 0xb9, 0xbf, 0xa2, 0x4f, 0x6a, 0xbf, + 0xdb, 0x7c, 0x51, 0x95, 0xb9, 0x7d, 0x39, 0x8b, 0x3d, 0x59, 0x50, 0x78, + 0xe2, 0xad, 0x40, 0x74, 0xab, 0x9f, 0x95, 0x85, 0x72, 0x4a, 0x8a, 0x36, + 0x48, 0xe0, 0xd9, 0x67, 0xac, 0xb4, 0xc0, 0xb7, 0xd5, 0x39, 0x69, 0xe0, + 0x34, 0x39, 0x7d, 0x3a, 0x34, 0xcc, 0x70, 0xa3, 0x7a, 0xaa, 0xbe, 0xce, + 0x65, 0x46, 0xad, 0x33, 0xba, 0xaf, 0x4e, 0x76, 0xb0, 0xc3, 0xbe, 0x62, + 0x68, 0x44, 0x47, 0xc7, 0x5c, 0xa1, 0xa3, 0x8c, 0x50, 0x5d, 0x4e, 0x84, + 0xe1, 0x71, 0xd5, 0xc8, 0x6b, 0x45, 0x8a, 0xc3, 0xbf, 0x8c, 0x35, 0xc9, + 0xce, 0x98, 0x4b, 0x32, 0x97, 0x83, 0x7b, 0x38, 0x87, 0x61, 0x7e, 0x55, + 0x5e, 0xb5, 0xd6, 0x74, 0x93, 0x36, 0xc0, 0x76, 0x87, 0x31, 0x3a, 0x3d, + 0xac, 0xcc, 0xc8, 0xc9, 0x62, 0xdf, 0xb6, 0x5b, 0xcf, 0xd7, 0xb6, 0x34, + 0xca, 0x42, 0x7e, 0x8a, 0x38, 0x43, 0x6c, 0xa4, 0x9e, 0xa0, 0x8d, 0xb5, + 0x8d, 0x41, 0xc1, 0xc9, 0x44, 0x8f, 0x69, 0xaa, 0xb1, 0x7d, 0x81, 0x8a, + 0xa6, 0xa8, 0xc8, 0x67, 0x45, 0x62, 0x63, 0x4e, 0xcc, 0xcd, 0xb4, 0x91, + 0x58, 0x99, 0x9d, 0xc2, 0xaf, 0x7b, 0x65, 0xa7, 0x3d, 0x99, 0xb1, 0x64, + 0x6b, 0xd3, 0x94, 0x52, 0x7b, 0x59, 0x3d, 0x8a, 0x6c, 0x92, 0x60, 0x50, + 0xa5, 0x7d, 0x27, 0x45, 0x6a, 0xdd, 0x87, 0x48, 0xb6, 0x6a, 0xc3, 0xac, + 0xc8, 0x3b, 0x73, 0x7c, 0x33, 0x30, 0x7e, 0xb6, 0x73, 0xac, 0x3e, 0x75, + 0x4f, 0x66, 0x80, 0x72, 0x8c, 0x62, 0x3c, 0x72, 0x5e, 0x5f, 0xb8, 0x52, + 0x8f, 0x6e, 0xd5, 0x58, 0x8f, 0x65, 0xbb, 0x69, 0xd5, 0xb4, 0x64, 0x97, + 0xcf, 0xc5, 0x82, 0xd6, 0x8a, 0x6d, 0x62, 0xab, 0x56, 0x75, 0x3f, 0xba, + 0x34, 0x81, 0x90, 0x4a, 0xa5, 0x65, 0xc6, 0xb3, 0x65, 0x80, 0x7b, 0x44, + 
0x30, 0x85, 0x88, 0xa2, 0x66, 0x7e, 0xbc, 0xb6, 0x6f, 0x72, 0x58, 0xa2, + 0xc1, 0x92, 0x64, 0x4f, 0x93, 0xa4, 0x55, 0x65, 0x34, 0x9c, 0x76, 0xb9, + 0xa3, 0xcd, 0x36, 0x42, 0x7c, 0xac, 0x62, 0xd6, 0x28, 0x3a, 0x2e, 0xc5, + 0x93, 0x80, 0x76, 0xb9, 0x44, 0xb9, 0x6b, 0xd7, 0x60, 0x77, 0x38, 0x4d, + 0xa6, 0x40, 0x99, 0xad, 0x30, 0xa4, 0x55, 0x7d, 0x37, 0xb3, 0x8f, 0xc9, + 0xa0, 0x7e, 0x65, 0xbd, 0x7e, 0x2a, 0x58, 0x63, 0x71, 0x7d, 0x6a, 0x90, + 0x43, 0xa4, 0x78, 0xbc, 0x83, 0x79, 0x72, 0x34, 0xb8, 0x43, 0xd4, 0xc8, + 0x37, 0x38, 0x43, 0xac, 0x46, 0xc1, 0xa4, 0xcd, 0x78, 0x31, 0x32, 0xc6, + 0x63, 0x46, 0xa0, 0x67, 0x51, 0x44, 0xa0, 0x37, 0x8a, 0x8f, 0x82, 0x69, + 0xb1, 0xc1, 0x8f, 0x46, 0xcc, 0x4e, 0xc3, 0xd6, 0x5a, 0x4c, 0xd4, 0xd0, + 0xae, 0xa9, 0xc3, 0xcc, 0x70, 0xae, 0x5e, 0x9c, 0x5d, 0x61, 0x77, 0xb0, + 0x69, 0x90, 0x47, 0x98, 0x8a, 0x64, 0x7a, 0xcc, 0xc0, 0x29, 0x66, 0xcc, + 0x83, 0x9b, 0xb6, 0x59, 0x7a, 0xa0, 0x9d, 0xc6, 0xb1, 0x60, 0x6f, 0xa0, + 0xad, 0x4d, 0x36, 0x3f, 0xaa, 0x34, 0x47, 0xb3, 0x56, 0xc2, 0xde, 0x2e, + 0xaf, 0xb3, 0x9a, 0xca, 0x9f, 0x90, 0x75, 0x3a, 0xbd, 0x47, 0xac, 0x7f, + 0x21, 0x4b, 0xa2, 0xad, 0x96, 0xa8, 0xc8, 0xa4, 0xb0, 0x87, 0x83, 0xbf, + 0xc0, 0xb8, 0x39, 0x72, 0x44, 0x7b, 0xa6, 0x89, 0x9c, 0x84, 0xc4, 0xd0, + 0x8f, 0x36, 0xce, 0x88, 0xb8, 0x7c, 0xe0, 0x91, 0x4d, 0x86, 0xad, 0xa5, + 0x93, 0x3d, 0x31, 0x3b, 0x8d, 0x91, 0x7b, 0xbc, 0x63, 0xae, 0xa1, 0x50, + 0x69, 0xa0, 0xd1, 0x81, 0xb3, 0xac, 0x8e, 0x7c, 0xb0, 0x5b, 0x4f, 0x59, + 0x7a, 0x83, 0x58, 0x73, 0x3b, 0x7d, 0x8e, 0x50, 0xd2, 0x55, 0xbd, 0xd0, + 0x76, 0x40, 0x4a, 0x81, 0x37, 0x53, 0xb5, 0x44, 0x85, 0x64, 0x7e, 0x5b, + 0x7e, 0x90, 0xb7, 0x5e, 0xab, 0x7b, 0x96, 0x89, 0x8a, 0x71, 0x7b, 0xbe, + 0xbb, 0x75, 0x37, 0x95, 0x47, 0x9e, 0x7d, 0x5c, 0x80, 0x84, 0x6e, 0xba, + 0x4f, 0x7c, 0x95, 0xb3, 0x72, 0x7d, 0x90, 0x5f, 0x7d, 0x7d, 0xc4, 0x9a, + 0x9c, 0x73, 0x34, 0x51, 0xd2, 0x96, 0x78, 0x6f, 0x6c, 0x65, 0x96, 0xbe, + 0x6e, 0xb8, 0x8e, 0x33, 0xd2, 0x7b, 0x75, 0xae, 0xa7, 0xbd, 0x3a, 0x44, + 0xa7, 0x45, 0x68, 0x72, 0x74, 0x9c, 0x8f, 0x79, 0xb7, 0xab, 0x51, 0xc8, + 0x3a, 0xc9, 0x95, 0x5b, 0x9d, 0xbf, 0x6d, 0x97, 0x47, 0x7f, 0xae, 0x8b, + 0xa0, 0x8e, 0xad, 0x75, 0xae, 0xa0, 0xd3, 0x2c, 0xce, 0x9a, 0x35, 0x62, + 0x6c, 0x78, 0x5b, 0xb0, 0x94, 0x6c, 0x58, 0x8d, 0xc6, 0x3c, 0x77, 0x8a, + 0x99, 0x30, 0x3d, 0x69, 0x59, 0x39, 0x41, 0xa7, 0x43, 0xa5, 0x43, 0x4b, + 0x76, 0x2a, 0x82, 0xbb, 0x99, 0x38, 0x84, 0xc2, 0x64, 0x3f, 0x4a, 0x46, + 0x47, 0xa7, 0x51, 0xc3, 0x39, 0x43, 0x94, 0xc6, 0x92, 0x49, 0x84, 0x59, + 0x9f, 0x5d, 0xbb, 0x68, 0x9f, 0xc3, 0xd3, 0x93, 0x8d, 0x46, 0x4e, 0x41, + 0x5c, 0x56, 0xba, 0x3f, 0x3a, 0x9d, 0x59, 0xa7, 0xa5, 0xd7, 0xb8, 0x98, + 0x94, 0x5b, 0x53, 0xc7, 0xc0, 0x4a, 0x84, 0xd1, 0xb8, 0x3e, 0xc4, 0x72, + 0x45, 0x31, 0xa5, 0x2f, 0xd3, 0xad, 0x8b, 0xc7, 0xc2, 0x94, 0xbc, 0x84, + 0xbc, 0x9c, 0x68, 0x53, 0x52, 0x50, 0xab, 0xc3, 0x75, 0x2a, 0x57, 0x7c, + 0x56, 0x93, 0x9c, 0x89, 0x66, 0xce, 0x40, 0xa0, 0xb8, 0x60, 0x62, 0xcc, + 0x3b, 0x76, 0x8e, 0xad, 0xcd, 0x9f, 0x51, 0x9c, 0x42, 0x69, 0xb4, 0x43, + 0xa3, 0x44, 0x3d, 0xa4, 0x8b, 0x94, 0x48, 0xb3, 0x55, 0x30, 0x45, 0x9a, + 0xa3, 0x3a, 0xa1, 0x79, 0xc7, 0x66, 0x7c, 0x96, 0xa7, 0xc7, 0xaa, 0xd6, + 0xaf, 0xbd, 0x6b, 0x5e, 0xc9, 0x7d, 0xc1, 0x7c, 0x71, 0x8e, 0x45, 0xc2, + 0x37, 0x88, 0x35, 0x57, 0x86, 0xbd, 0xb6, 0x3e, 0x8c, 0xa6, 0xa6, 0x4f, + 0xb1, 0x50, 0x9b, 0x89, 0xb4, 0x9b, 0x71, 0xac, 0x8d, 0x6d, 0xac, 0x86, + 0x38, 0xc0, 0x7e, 0xcc, 0x34, 0xb4, 0x9b, 0x86, 0x40, 0x8d, 0x98, 0xb9, + 0xa1, 0x41, 0x91, 0x2b, 0x38, 0x45, 0x59, 0x7f, 0xbb, 0xb9, 0x59, 0x89, + 
0x95, 0xbf, 0x79, 0xb1, 0x40, 0x3f, 0x72, 0x7c, 0xb3, 0x3b, 0x48, 0xd1, + 0x92, 0xc6, 0x5c, 0xd5, 0xcb, 0x5d, 0x7d, 0x96, 0x68, 0x92, 0x83, 0x75, + 0x65, 0x41, 0xb9, 0x82, 0x8b, 0x78, 0x79, 0x68, 0xbc, 0x91, 0x7c, 0xae, + 0x46, 0x58, 0xa9, 0x42, 0x48, 0x74, 0xc0, 0x31, 0x88, 0xd2, 0x6b, 0x7b, + 0x47, 0x3e, 0xb0, 0x66, 0x8a, 0x3c, 0x68, 0xad, 0xa0, 0x90, 0x69, 0x48, + 0x42, 0xb9, 0x6b, 0x85, 0xac, 0x71, 0xae, 0xbc, 0x84, 0x6f, 0xab, 0xa6, + 0xba, 0x6f, 0x6e, 0xb1, 0xb1, 0x41, 0x85, 0x8b, 0xbb, 0x4d, 0x79, 0x4b, + 0xd4, 0xab, 0x82, 0x2e, 0xa1, 0x6e, 0x4c, 0x8a, 0x92, 0xb6, 0xa3, 0xc2, + 0xb1, 0x8b, 0x66, 0x91, 0x47, 0xd2, 0x8d, 0x2f, 0x30, 0x58, 0x28, 0x45, + 0x6b, 0x4b, 0x4d, 0xad, 0x45, 0x50, 0x6f, 0x92, 0x5e, 0xa2, 0xc0, 0x6c, + 0x67, 0xd4, 0x52, 0xc0, 0xc6, 0x84, 0x70, 0x3c, 0xb7, 0x58, 0xb0, 0x3a, + 0x64, 0xcd, 0xd0, 0x82, 0xa1, 0xda, 0xa0, 0xc7, 0xd1, 0x95, 0x9a, 0x82, + 0x76, 0x59, 0xc1, 0x9e, 0xc9, 0x53, 0xb7, 0xc3, 0x40, 0xb1, 0x89, 0xc9, + 0x68, 0x9f, 0x79, 0x58, 0x5e, 0xc2, 0x3f, 0x79, 0xab, 0x43, 0x4a, 0x43, + 0x8a, 0x7d, 0xcc, 0x72, 0x9a, 0x8f, 0x32, 0xc2, 0x5a, 0x8f, 0x37, 0x58, + 0xac, 0x42, 0xce, 0x58, 0x3a, 0x55, 0xc2, 0x7c, 0x7b, 0xb7, 0x32, 0x8a, + 0xd0, 0xba, 0x4a, 0xc2, 0x46, 0x7a, 0xc0, 0x50, 0x93, 0x78, 0x69, 0x46, + 0x73, 0xd2, 0xcb, 0xac, 0xae, 0x81, 0xbe, 0xac, 0x81, 0x99, 0x9d, 0x81, + 0x7c, 0x72, 0x97, 0x78, 0x57, 0x6c, 0xca, 0x50, 0xba, 0x37, 0x4e, 0x74, + 0xc0, 0x3d, 0x74, 0x93, 0x6c, 0xa6, 0x3c, 0x7a, 0x85, 0x84, 0xb3, 0x9b, + 0xa9, 0x98, 0x54, 0xae, 0xcd, 0x46, 0x82, 0x34, 0x48, 0x46, 0x6f, 0x90, + 0x8a, 0x64, 0x41, 0x96, 0x43, 0x3c, 0x96, 0xd3, 0x90, 0x91, 0x39, 0x27, + 0x96, 0x64, 0x4c, 0xc5, 0x5c, 0x47, 0x47, 0xd4, 0x60, 0x65, 0x9b, 0x65, + 0x38, 0xc9, 0x78, 0x49, 0x57, 0x56, 0x4f, 0x61, 0x42, 0x6c, 0x93, 0x5d, + 0x57, 0x2f, 0x76, 0xb6, 0x5c, 0x5b, 0x81, 0x31, 0xc9, 0x51, 0xc7, 0x87, + 0x7b, 0x5d, 0xc9, 0x3e, 0xad, 0x8f, 0x4a, 0x96, 0xc1, 0xce, 0x3b, 0x2e, + 0x62, 0x6b, 0xb2, 0x5a, 0x3b, 0xc0, 0x8f, 0xd9, 0x8f, 0x61, 0x82, 0xa3, + 0xb6, 0x95, 0x9e, 0xb7, 0xae, 0xb0, 0x98, 0x49, 0x67, 0xa1, 0x81, 0xa3, + 0x5c, 0x79, 0xd1, 0xce, 0x51, 0x51, 0x5c, 0xb6, 0x40, 0x8b, 0x9a, 0x5c, + 0xcb, 0x5f, 0x3d, 0x52, 0x52, 0x79, 0x73, 0x88, 0x46, 0xa2, 0x4e, 0xd0, + 0x36, 0x59, 0x42, 0x93, 0xa8, 0xaa, 0x89, 0xcc, 0x4d, 0x84, 0x94, 0x7a, + 0x87, 0x88, 0x6d, 0xbf, 0xa2, 0xae, 0xb7, 0x3d, 0x89, 0x42, 0x77, 0x6c, + 0xbe, 0x52, 0xa1, 0x3a, 0xa3, 0x75, 0x34, 0xb7, 0x9c, 0x8d, 0xb8, 0x44, + 0xc2, 0x81, 0xa6, 0x7e, 0x50, 0x54, 0x6e, 0x85, 0x4b, 0x9c, 0xd2, 0xaf, + 0x3c, 0xc2, 0x73, 0x8e, 0x81, 0xaa, 0xb7, 0xcf, 0xa9, 0x70, 0x5d, 0x3d, + 0x62, 0x35, 0x8d, 0x53, 0x3b, 0xc9, 0x80, 0x3a, 0x59, 0x90, 0x80, 0xb2, + 0x7c, 0xa7, 0x8a, 0x93, 0x84, 0x92, 0x50, 0x8f, 0x9d, 0xc0, 0x85, 0x50, + 0xa3, 0x5e, 0x5e, 0xb6, 0x67, 0xbf, 0xbb, 0x4f, 0xcd, 0x6a, 0x88, 0x2c, + 0xc8, 0x91, 0x70, 0x76, 0x7e, 0xa5, 0x4d, 0xbb, 0x3e, 0x2a, 0x49, 0xb6, + 0xc4, 0x53, 0x9f, 0x4b, 0x5c, 0x45, 0x6d, 0x55, 0x7c, 0x8d, 0xb6, 0x69, + 0x51, 0x60, 0x9d, 0x99, 0x6d, 0x31, 0x71, 0xa0, 0x98, 0x55, 0xb1, 0x34, + 0xa3, 0x69, 0xb5, 0x8c, 0x74, 0xc1, 0x64, 0xcc, 0x75, 0xc0, 0xb6, 0x99, + 0x4e, 0x77, 0x81, 0x86, 0x60, 0x38, 0x4d, 0x36, 0x70, 0x57, 0xa8, 0x39, + 0x8d, 0x38, 0x97, 0xa9, 0x3c, 0xc4, 0x3d, 0xd2, 0x4e, 0xc1, 0x63, 0x9b, + 0x80, 0x4d, 0xb6, 0x66, 0xc4, 0xc1, 0x64, 0x91, 0xcf, 0xad, 0x4a, 0x52, + 0x3a, 0x64, 0x96, 0x8a, 0xd0, 0xce, 0xbb, 0xb6, 0x50, 0x41, 0x62, 0x81, + 0xb6, 0xa0, 0x51, 0xa2, 0x99, 0x37, 0x3e, 0xc0, 0x83, 0x42, 0xc7, 0x68, + 0xa3, 0xc3, 0xc2, 0x7b, 0x71, 0x8f, 0x87, 0x34, 0xd5, 0x41, 0xa3, 0x40, + 
0x6d, 0xce, 0x80, 0x7d, 0x35, 0x35, 0xcb, 0xcb, 0x7b, 0x48, 0xc7, 0x8b, + 0x58, 0xb5, 0x73, 0x6d, 0x94, 0x53, 0x53, 0x78, 0x75, 0xcb, 0xa3, 0x55, + 0x78, 0x41, 0x60, 0x91, 0x56, 0x87, 0x3b, 0x57, 0x95, 0x9c, 0x60, 0xa8, + 0x6c, 0xa2, 0x61, 0x6c, 0x4c, 0xbd, 0x59, 0x4e, 0xae, 0x8a, 0x3b, 0xaa, + 0xc2, 0x3b, 0xa7, 0x6c, 0xa6, 0x97, 0x5d, 0xab, 0x5a, 0x97, 0x88, 0x49, + 0x4e, 0x39, 0x74, 0x42, 0x69, 0x6e, 0x54, 0x33, 0x48, 0x91, 0xab, 0x6f, + 0x9a, 0x92, 0xb5, 0xa5, 0x59, 0x2b, 0x2d, 0xc8, 0x6e, 0x45, 0x8a, 0xa8, + 0xac, 0x39, 0xb4, 0x2f, 0x8a, 0xaa, 0x67, 0x82, 0xb1, 0x2b, 0x38, 0x7f, + 0x3a, 0x34, 0x76, 0xc2, 0x69, 0x98, 0x6b, 0xac, 0x4b, 0xbf, 0xa9, 0x3a, + 0x98, 0x48, 0xa2, 0xc2, 0x8f, 0x95, 0x9c, 0x92, 0x4a, 0xbb, 0x7d, 0x37, + 0x76, 0x59, 0x34, 0xd0, 0xaa, 0x65, 0x79, 0x88, 0xac, 0x93, 0x35, 0x7d, + 0x7b, 0x7e, 0x66, 0xaf, 0xb3, 0x83, 0x91, 0x69, 0x58, 0xc9, 0xb7, 0x53, + 0x81, 0xbd, 0xbe, 0xa3, 0x7b, 0x58, 0x44, 0xa3, 0x6e, 0x7d, 0x92, 0x32, + 0x5c, 0xbf, 0xd5, 0x31, 0x5b, 0x91, 0x3d, 0x97, 0x6a, 0x53, 0x7e, 0xb9, + 0x54, 0x39, 0x44, 0x3d, 0x36, 0x5a, 0x66, 0x93, 0xad, 0x2f, 0xc2, 0x43, + 0xa8, 0x73, 0x3f, 0xc6, 0x57, 0x3d, 0x4c, 0x62, 0xa3, 0x95, 0x68, 0x37, + 0xbf, 0x85, 0x3f, 0x47, 0x6b, 0xc0, 0x61, 0x4d, 0x34, 0x69, 0xbe, 0xd2, + 0xbf, 0x64, 0x58, 0x8a, 0xaf, 0xc0, 0x67, 0x48, 0x51, 0x7e, 0x31, 0x6a, + 0xb2, 0x75, 0x6f, 0x42, 0x74, 0xd6, 0x58, 0xa3, 0x97, 0x44, 0x8a, 0x87, + 0x55, 0x66, 0x39, 0x76, 0x3a, 0xa5, 0x9b, 0x4b, 0x66, 0xce, 0x48, 0x91, + 0x9b, 0xcc, 0x94, 0xc5, 0xa5, 0x89, 0x30, 0xaf, 0x84, 0xb6, 0x5c, 0x73, + 0x5e, 0x3c, 0xc7, 0x6c, 0x7f, 0x48, 0x99, 0x69, 0x6e, 0x3f, 0x8e, 0x39, + 0x8c, 0x34, 0x75, 0x63, 0xae, 0x92, 0xcd, 0x68, 0x95, 0xa6, 0x5b, 0x75, + 0xcf, 0x5b, 0xa7, 0xc8, 0x5c, 0x76, 0x4a, 0x92, 0x2e, 0x72, 0xb8, 0xc0, + 0x82, 0xbd, 0x81, 0x65, 0x61, 0xcc, 0xab, 0x3a, 0x72, 0xab, 0x82, 0x74, + 0x7a, 0x78, 0x87, 0x95, 0x66, 0x9e, 0x5c, 0x98, 0x9a, 0xd5, 0xb7, 0xa1, + 0xce, 0xab, 0xb9, 0x52, 0xb2, 0x57, 0x6a, 0xaf, 0x9e, 0x5b, 0x9d, 0x7f, + 0xca, 0xbc, 0x3f, 0xc7, 0xc4, 0xb2, 0xd2, 0x63, 0x85, 0x99, 0xa2, 0x4b, + 0x94, 0xc0, 0xaf, 0x70, 0x88, 0x56, 0x67, 0xb2, 0x92, 0x6a, 0x70, 0x8a, + 0xa2, 0x31, 0x93, 0x4f, 0x94, 0x48, 0x73, 0x7c, 0x94, 0x76, 0x5c, 0x5c, + 0xcc, 0xaf, 0x6c, 0x35, 0x44, 0x31, 0x72, 0x4d, 0x86, 0xaf, 0x6e, 0x58, + 0x57, 0x6f, 0x78, 0xad, 0xa5, 0x95, 0x5a, 0xc8, 0x5f, 0xae, 0x6e, 0x45, + 0xcb, 0x7f, 0x77, 0x3f, 0x5a, 0x4c, 0x5e, 0xd0, 0xc1, 0xb7, 0x69, 0x5f, + 0x68, 0x52, 0xc5, 0xb1, 0x2d, 0x6b, 0x9b, 0x8b, 0x3d, 0x6a, 0xa3, 0xa1, + 0xc2, 0x71, 0x9b, 0x8c, 0x3c, 0x31, 0xc6, 0x90, 0x69, 0x55, 0xa1, 0xc3, + 0xba, 0x57, 0xc1, 0x37, 0xbb, 0x89, 0x66, 0x7e, 0x5e, 0xb4, 0xa6, 0x4c, + 0x93, 0x42, 0x48, 0x40, 0x68, 0x99, 0xbe, 0xbb, 0xa8, 0xb7, 0xad, 0xb5, + 0xa6, 0x3d, 0x5c, 0xbd, 0x8a, 0x80, 0x4d, 0x54, 0xa3, 0x88, 0xd2, 0xb5, + 0x94, 0x9f, 0xa9, 0x60, 0x6f, 0x36, 0xbc, 0xce, 0xba, 0x66, 0x6f, 0x93, + 0x90, 0xb8, 0x79, 0xbc, 0xa9, 0xc2, 0xac, 0x55, 0xc5, 0x6e, 0x55, 0xcc, + 0xb9, 0x90, 0xbe, 0x5d, 0x9f, 0x8b, 0x3d, 0x56, 0xae, 0x8a, 0x82, 0x89, + 0xc6, 0xa6, 0x61, 0x71, 0x31, 0x33, 0x8e, 0x42, 0x46, 0x7a, 0x9c, 0x70, + 0x4f, 0x85, 0x58, 0x77, 0x6d, 0x82, 0xd1, 0x3f, 0xa3, 0x70, 0x67, 0x6b, + 0xd2, 0x75, 0x33, 0x69, 0x8c, 0xb8, 0x6e, 0x5e, 0x58, 0x91, 0x33, 0xc9, + 0x9b, 0x60, 0xb1, 0x79, 0x5c, 0xbb, 0xa3, 0xaf, 0x9b, 0x50, 0x40, 0x35, + 0xa4, 0x9a, 0xba, 0xc5, 0x6a, 0x8e, 0xb8, 0x99, 0x72, 0x61, 0x5e, 0x4f, + 0x65, 0xa6, 0x4f, 0x3e, 0x4c, 0x95, 0x4f, 0xb3, 0xa6, 0xbe, 0x85, 0x50, + 0x8a, 0x2d, 0x72, 0x59, 0xa0, 0xb8, 0x47, 0x8a, 0x99, 0x5a, 0x43, 0x58, + 
0xb1, 0x87, 0xc0, 0x9c, 0x32, 0x8a, 0x67, 0xc0, 0x66, 0x3c, 0x84, 0xd2, + 0x2c, 0x7b, 0x52, 0x50, 0x48, 0x40, 0x31, 0x3b, 0x39, 0xa7, 0xb5, 0x79, + 0x4d, 0xa0, 0xc8, 0xa1, 0xa9, 0x89, 0x92, 0x5e, 0x2f, 0x7c, 0x32, 0x36, + 0x83, 0x85, 0x5b, 0x69, 0xcc, 0x70, 0xc0, 0x92, 0x49, 0xb3, 0x4b, 0x73, + 0xc6, 0x7d, 0x9f, 0xa4, 0xa3, 0x5e, 0x5a, 0x32, 0x95, 0x7d, 0x66, 0x63, + 0xc0, 0x9a, 0xc5, 0xb2, 0x54, 0x4e, 0xa6, 0xa4, 0x7d, 0x32, 0xb7, 0xb4, + 0x6c, 0xab, 0x4b, 0x74, 0x49, 0x88, 0x58, 0xc5, 0xab, 0x4b, 0x66, 0x93, + 0x8c, 0xae, 0x5e, 0x69, 0x75, 0xbc, 0xa3, 0xc1, 0x63, 0x50, 0xc7, 0x3d, + 0xb7, 0x98, 0x3b, 0x42, 0x70, 0x8f, 0xb2, 0x8b, 0xa2, 0x9e, 0x67, 0x55, + 0xa4, 0x8e, 0x6b, 0x8e, 0x53, 0x57, 0xc6, 0xb0, 0xad, 0xb1, 0x64, 0xc0, + 0x3b, 0x72, 0x53, 0xc7, 0xc3, 0x9c, 0x9e, 0x61, 0x5a, 0x41, 0xba, 0x8a, + 0x93, 0x32, 0xa3, 0x6d, 0xa8, 0xc9, 0x58, 0x4e, 0xa2, 0xd4, 0x5a, 0x81, + 0xa0, 0x86, 0xb8, 0xba, 0xb9, 0x5c, 0x60, 0x4d, 0xcb, 0x8f, 0x3e, 0x57, + 0x6f, 0x63, 0x31, 0x79, 0xbe, 0xd2, 0x56, 0x38, 0x77, 0x95, 0x31, 0xc7, + 0x40, 0x63, 0xc1, 0x73, 0x44, 0xc3, 0x63, 0x9c, 0x2d, 0x84, 0xae, 0xa8, + 0x3b, 0xb1, 0x49, 0x58, 0xca, 0xbe, 0xb5, 0x48, 0x93, 0xb2, 0x77, 0xbc, + 0x6f, 0x39, 0x8d, 0x59, 0x2c, 0x8d, 0x5a, 0x34, 0x8a, 0x55, 0x53, 0xa3, + 0xad, 0xa7, 0x61, 0x5e, 0x84, 0x6f, 0xd3, 0x89, 0x3b, 0x3e, 0xaf, 0xc7, + 0x4e, 0x74, 0x8b, 0xa8, 0x86, 0xc4, 0xca, 0x91, 0x82, 0x49, 0x79, 0x70, + 0x98, 0x96, 0xb3, 0x85, 0x5b, 0x79, 0x56, 0xa3, 0xce, 0xa3, 0x5b, 0xa7, + 0xc9, 0xba, 0x42, 0x4c, 0x4c, 0x74, 0x8b, 0x80, 0x9d, 0x5a, 0xa7, 0xdc, + 0xc4, 0xb5, 0x54, 0x8d, 0x94, 0x56, 0x61, 0x4d, 0x7c, 0xbf, 0x59, 0x7f, + 0x7b, 0xa6, 0x7d, 0x49, 0xc3, 0x89, 0x35, 0xd5, 0xa0, 0xa7, 0x62, 0x91, + 0x90, 0x87, 0x9f, 0x3b, 0x9d, 0xb0, 0x9c, 0x85, 0x8b, 0x56, 0xbb, 0x4c, + 0x76, 0xb8, 0x9b, 0x9a, 0xb6, 0x46, 0x45, 0x77, 0x66, 0x76, 0x75, 0x49, + 0xb3, 0x67, 0x87, 0xc5, 0x43, 0x95, 0xb1, 0xb2, 0x45, 0xce, 0x4d, 0x6c, + 0xc4, 0x49, 0x33, 0x63, 0x6e, 0x7a, 0x92, 0x65, 0x8b, 0xa8, 0x7f, 0x4f, + 0x63, 0xbf, 0xc1, 0x40, 0xa6, 0xc1, 0x8d, 0x34, 0x3b, 0x98, 0x69, 0x40, + 0xa4, 0x54, 0x7b, 0x64, 0x8c, 0x36, 0x84, 0xb4, 0xb2, 0xaa, 0x66, 0x95, + 0x84, 0x9f, 0x93, 0xa4, 0x4a, 0x82, 0x34, 0x62, 0xcb, 0xbf, 0x87, 0xbf, + 0xbe, 0x8e, 0x93, 0x81, 0x79, 0x59, 0x45, 0x30, 0xc1, 0x36, 0x8b, 0x35, + 0x97, 0xa0, 0xa6, 0x36, 0x9f, 0x77, 0x4c, 0x79, 0x8b, 0xcd, 0xd5, 0x4d, + 0x62, 0x63, 0x63, 0x3c, 0x3d, 0x36, 0xb1, 0x73, 0x82, 0x50, 0x40, 0x3d, + 0x4c, 0x6e, 0xa0, 0x74, 0xcd, 0x95, 0x81, 0x38, 0x58, 0x40, 0x6b, 0xd7, + 0x70, 0xa6, 0xad, 0xca, 0xa4, 0x83, 0x36, 0xbc, 0x9b, 0x5f, 0x5b, 0x6a, + 0x8b, 0xa1, 0xcc, 0xaa, 0xae, 0x7f, 0x77, 0x38, 0xd7, 0x70, 0x56, 0xd1, + 0x77, 0x84, 0x6b, 0xd2, 0xa2, 0x6d, 0x30, 0x8c, 0x3f, 0xa8, 0x61, 0x9d, + 0x8e, 0x50, 0x5a, 0x98, 0x9a, 0x75, 0x41, 0x71, 0x96, 0xab, 0xc5, 0x43, + 0x69, 0xb2, 0xb1, 0x62, 0x86, 0x9e, 0x42, 0x83, 0xb4, 0xd4, 0x7e, 0xac, + 0x69, 0x72, 0x5a, 0xa7, 0x50, 0x60, 0xa9, 0x7a, 0x92, 0xb3, 0x7e, 0xba, + 0xa1, 0x6c, 0x84, 0x71, 0x68, 0xce, 0x64, 0xa0, 0xd1, 0x67, 0x68, 0x4d, + 0xcf, 0x97, 0xd2, 0x37, 0x8a, 0xb9, 0xb3, 0xb0, 0xb3, 0x92, 0x7b, 0xac, + 0x82, 0x48, 0xd0, 0x59, 0xa4, 0xbd, 0x44, 0x66, 0x7c, 0x6c, 0xac, 0xb1, + 0xa2, 0x81, 0x30, 0x6e, 0x51, 0x79, 0x68, 0x87, 0x53, 0x5f, 0xce, 0xc8, + 0xac, 0x7c, 0xd7, 0x59, 0xba, 0xbd, 0x93, 0xc4, 0x9c, 0x5b, 0x9c, 0x8d, + 0x59, 0x9a, 0x49, 0xae, 0xc9, 0xd2, 0x53, 0x6f, 0xb7, 0x76, 0x2f, 0x69, + 0x2f, 0xcb, 0x58, 0xbc, 0x6c, 0x48, 0x53, 0x9d, 0xce, 0x90, 0x4e, 0x4e, + 0xab, 0xa5, 0x77, 0x4a, 0xd2, 0x45, 0xb0, 0xb0, 0x38, 0x6c, 0xbf, 0x5f, + 
0x96, 0x6a, 0xb3, 0x8b, 0xb3, 0xc1, 0xd1, 0xb0, 0x7b, 0xce, 0x43, 0x85, + 0xab, 0x58, 0xce, 0xb7, 0xb6, 0x93, 0x66, 0x71, 0xca, 0x62, 0x4c, 0x9b, + 0x9f, 0x58, 0xbb, 0x5d, 0xb9, 0x34, 0x96, 0xbd, 0x81, 0x72, 0x83, 0x7e, + 0x78, 0x67, 0xb5, 0x8b, 0xc6, 0xc4, 0x45, 0xa4, 0xae, 0x3e, 0x9b, 0x37, + 0x94, 0x5f, 0x90, 0x31, 0x8a, 0xb4, 0x47, 0x46, 0x94, 0x50, 0x39, 0x86, + 0x38, 0x5d, 0x30, 0x9d, 0x76, 0xbc, 0x60, 0x40, 0xc0, 0x2e, 0x34, 0x4f, + 0x32, 0x2c, 0x42, 0x93, 0xb0, 0x76, 0x4a, 0x5c, 0xbe, 0xd4, 0x3c, 0x55, + 0x61, 0x4e, 0xd1, 0x74, 0x59, 0x7a, 0x89, 0x75, 0x6b, 0x78, 0x9b, 0x58, + 0x78, 0x86, 0x30, 0xbf, 0x3b, 0x8f, 0x77, 0x79, 0xae, 0xc1, 0xce, 0x55, + 0x5c, 0x96, 0x54, 0xac, 0x2e, 0xa8, 0x5e, 0x74, 0xb0, 0xa4, 0x86, 0x70, + 0x9d, 0xbb, 0xa3, 0xc2, 0x46, 0xb8, 0x53, 0x3a, 0x9b, 0x99, 0x3d, 0x31, + 0x97, 0xba, 0xbc, 0x8e, 0x47, 0x45, 0x40, 0x69, 0xc3, 0x8f, 0x46, 0x79, + 0x37, 0x5b, 0xc9, 0x92, 0xa5, 0x66, 0xba, 0xc7, 0xaa, 0x70, 0x4d, 0x6b, + 0x8d, 0x76, 0x98, 0x52, 0xb2, 0x71, 0x75, 0xb5, 0x56, 0x92, 0x7b, 0xb1, + 0xa9, 0x7c, 0xb4, 0xb3, 0xd1, 0xc4, 0x6e, 0x79, 0xd2, 0x88, 0x81, 0xca, + 0x75, 0xad, 0x66, 0x8b, 0x3f, 0x75, 0x58, 0xac, 0x4f, 0x61, 0x37, 0x4a, + 0x73, 0x91, 0xbc, 0x5a, 0xc3, 0x33, 0xb6, 0x8f, 0x8a, 0x96, 0x5e, 0xb6, + 0x91, 0xd0, 0x52, 0x93, 0x98, 0xc8, 0x93, 0x8a, 0xbd, 0x4e, 0x8b, 0x4a, + 0x95, 0x50, 0x6d, 0x71, 0x6a, 0xcf, 0x59, 0xb3, 0x4b, 0x9c, 0x9f, 0x84, + 0xb1, 0x62, 0x8c, 0xb2, 0xa5, 0x9e, 0x8e, 0xb8, 0x66, 0x67, 0x3d, 0x9c, + 0x3d, 0x34, 0x73, 0x3e, 0xa9, 0x8f, 0x31, 0x4c, 0x37, 0x7c, 0x87, 0x48, + 0x6e, 0x3b, 0x9b, 0x95, 0xbb, 0x96, 0x7e, 0x96, 0x53, 0xc8, 0x63, 0x6b, + 0x39, 0x88, 0x9f, 0xa5, 0x5c, 0x52, 0xae, 0x47, 0xa0, 0xa9, 0xa2, 0xa6, + 0xc1, 0x98, 0x56, 0x95, 0xb7, 0x52, 0x70, 0x9e, 0xc1, 0x89, 0x92, 0x6f, + 0x70, 0x6e, 0xaf, 0x35, 0x63, 0xa6, 0x52, 0x39, 0x48, 0x6e, 0x42, 0x60, + 0xa9, 0x3e, 0x46, 0x4d, 0x68, 0x33, 0x57, 0xcf, 0x87, 0xc2, 0x4e, 0xaf, + 0xba, 0x34, 0x58, 0xb5, 0xc4, 0xce, 0x36, 0x53, 0x9a, 0x49, 0x85, 0x58, + 0x37, 0x6b, 0xdf, 0xbd, 0x7e, 0x85, 0xc0, 0x69, 0xbe, 0x57, 0x9f, 0x99, + 0x4f, 0x79, 0x73, 0x75, 0x65, 0x3b, 0x72, 0xd2, 0x61, 0x7e, 0x6a, 0xb5, + 0x44, 0xbd, 0x73, 0x98, 0x83, 0x3b, 0xc3, 0xca, 0x5d, 0xba, 0x35, 0xb4, + 0x81, 0x95, 0x8d, 0x4f, 0x4a, 0xb3, 0x45, 0x71, 0xbb, 0x91, 0xa8, 0x79, + 0x65, 0x66, 0xad, 0x42, 0xa6, 0x7c, 0x98, 0xc2, 0x47, 0x60, 0x62, 0x54, + 0x39, 0x8b, 0xc2, 0x35, 0xd2, 0x75, 0x91, 0x88, 0x37, 0x4d, 0x7d, 0x5f, + 0x69, 0x99, 0x3e, 0x4c, 0x5c, 0xa6, 0x38, 0x9a, 0xad, 0x63, 0xb5, 0xcf, + 0x76, 0x83, 0xb9, 0x4e, 0x4c, 0xab, 0x56, 0xc3, 0x79, 0x41, 0x8c, 0x45, + 0x92, 0x4d, 0x9a, 0xad, 0xd3, 0x6a, 0x74, 0x56, 0xc9, 0xb5, 0xb7, 0x69, + 0x74, 0xcd, 0x54, 0x5c, 0x53, 0x67, 0x66, 0x78, 0xaa, 0xa5, 0x8a, 0x94, + 0xb9, 0xc1, 0x3f, 0x7b, 0x3c, 0x7b, 0x53, 0x6b, 0x76, 0x77, 0xcc, 0x77, + 0x78, 0x7e, 0xa8, 0x5b, 0x6c, 0xa3, 0x52, 0x98, 0xa0, 0xa3, 0x51, 0xaa, + 0x75, 0xbc, 0x3d, 0x71, 0xd2, 0x75, 0x52, 0xa7, 0xa2, 0x85, 0x6b, 0x80, + 0xa6, 0xca, 0x39, 0x9d, 0xb9, 0xaf, 0x9c, 0x87, 0x91, 0xa1, 0x69, 0xab, + 0x47, 0x76, 0x6b, 0x56, 0xc7, 0xcd, 0x79, 0x64, 0x4b, 0x32, 0xb1, 0x82, + 0x3f, 0x49, 0xa2, 0x50, 0x60, 0x5e, 0x96, 0xa6, 0x72, 0xc8, 0x3a, 0x35, + 0xb1, 0x4f, 0x9e, 0x81, 0xaf, 0x89, 0x5e, 0xc2, 0x65, 0x32, 0x72, 0xa3, + 0x4f, 0x6a, 0x38, 0x7d, 0xc0, 0xc9, 0x88, 0x7f, 0x58, 0xaa, 0x87, 0x8e, + 0x57, 0x39, 0x70, 0xae, 0x7e, 0x71, 0x3b, 0x44, 0xbf, 0x44, 0x88, 0x62, + 0x3c, 0x8d, 0xb1, 0x6c, 0x97, 0x5a, 0x83, 0x39, 0x38, 0xc8, 0x90, 0xb3, + 0x59, 0x6f, 0x73, 0xcd, 0xa1, 0x4d, 0x49, 0x45, 0xc0, 0xd1, 0x94, 0x5b, + 
0x7c, 0xbb, 0x4b, 0x66, 0x9f, 0x75, 0x3a, 0x91, 0x9a, 0x92, 0x70, 0x6f, + 0xa9, 0x57, 0x6f, 0xc6, 0x80, 0xc4, 0xaa, 0x80, 0xca, 0xad, 0x99, 0x7d, + 0xa2, 0xa1, 0x47, 0x42, 0x85, 0xc3, 0xcd, 0x6c, 0x59, 0x82, 0x69, 0x63, + 0xc9, 0x51, 0x66, 0xd3, 0x4b, 0x82, 0x4d, 0x43, 0x78, 0xc4, 0x62, 0x67, + 0x33, 0x96, 0x7c, 0x8e, 0x73, 0xba, 0x50, 0x43, 0x8d, 0x4a, 0xbf, 0x76, + 0x4e, 0x3c, 0x50, 0x3b, 0x4d, 0xaf, 0x4a, 0x76, 0x5c, 0x91, 0x37, 0x8d, + 0x8f, 0x9f, 0xc1, 0x56, 0x6e, 0x7f, 0x8c, 0x3d, 0xc5, 0xa1, 0x93, 0x9e, + 0xbc, 0x90, 0xb3, 0x95, 0xc4, 0x7a, 0x62, 0xa2, 0x50, 0x5e, 0x5a, 0x64, + 0xc9, 0x53, 0x7f, 0x5b, 0xd5, 0x6f, 0x61, 0x56, 0x7d, 0x91, 0xa3, 0x84, + 0x2e, 0x37, 0x7f, 0x8c, 0x5e, 0x72, 0x45, 0x70, 0x74, 0x4e, 0x5a, 0xc2, + 0x67, 0x5b, 0xc5, 0xa9, 0x66, 0x3d, 0x45, 0x5c, 0x60, 0xcc, 0x8f, 0x5b, + 0xc6, 0xa0, 0xcb, 0x61, 0x97, 0xb2, 0x83, 0x55, 0x3e, 0x9b, 0x4a, 0x75, + 0xce, 0x5b, 0xcd, 0x55, 0x69, 0xb3, 0x68, 0x65, 0xc4, 0x68, 0xb7, 0x4d, + 0xcf, 0x64, 0xc2, 0x5f, 0x50, 0xbe, 0x4b, 0x72, 0x71, 0xaf, 0x79, 0xce, + 0x89, 0x35, 0x39, 0x71, 0x8b, 0xce, 0x63, 0xc7, 0x44, 0x99, 0x3c, 0x5b, + 0xb1, 0xad, 0x3d, 0x4a, 0x94, 0x8f, 0xc0, 0x48, 0x35, 0x4c, 0x3f, 0x2e, + 0xa5, 0x69, 0x7d, 0x67, 0x51, 0x9c, 0x8c, 0x57, 0xcf, 0xa6, 0xa7, 0x56, + 0x2e, 0x9b, 0xa8, 0x8c, 0xb4, 0x60, 0xc9, 0xab, 0x5b, 0xa0, 0xc5, 0x8e, + 0x74, 0x8d, 0x6a, 0x62, 0xa5, 0xaf, 0x48, 0xa7, 0xcc, 0xbe, 0xc3, 0x94, + 0xac, 0x7b, 0x33, 0x46, 0xd1, 0x9f, 0x97, 0x36, 0x90, 0x76, 0x7b, 0x73, + 0xc5, 0x97, 0x98, 0x88, 0x63, 0xbc, 0x4c, 0x43, 0x70, 0x7e, 0x65, 0x53, + 0x69, 0xb5, 0x49, 0x63, 0xc0, 0x37, 0x75, 0x89, 0x89, 0x95, 0x43, 0x8b, + 0x71, 0xb6, 0x6a, 0x4e, 0x83, 0x44, 0x7d, 0x92, 0x31, 0x84, 0x68, 0xb2, + 0x54, 0x51, 0x6c, 0x5c, 0x3e, 0x48, 0x4f, 0x6f, 0x3f, 0xc8, 0x69, 0x6f, + 0x35, 0x35, 0x9c, 0xba, 0xb5, 0xa8, 0xbe, 0xb5, 0xa6, 0x86, 0x84, 0x6a, + 0x99, 0xcd, 0x92, 0xc4, 0x60, 0x70, 0x9b, 0x4f, 0xce, 0x98, 0x80, 0x78, + 0x9f, 0x80, 0x59, 0xbf, 0x76, 0x9d, 0x4a, 0x3e, 0x36, 0x4a, 0x9d, 0xa1, + 0xc1, 0x60, 0xb4, 0xcd, 0xc9, 0x54, 0x9e, 0x37, 0x7d, 0x3c, 0x34, 0xcd, + 0x98, 0x9c, 0x40, 0x99, 0x70, 0x44, 0x8c, 0x83, 0x99, 0x4c, 0x34, 0x3e, + 0xa3, 0x3a, 0x5c, 0x52, 0x60, 0x2d, 0x93, 0x48, 0x6a, 0x6b, 0x6f, 0xae, + 0x72, 0x52, 0xc6, 0x42, 0x83, 0xcd, 0x3e, 0xb7, 0x39, 0x5a, 0x34, 0x99, + 0x90, 0xaa, 0x3d, 0xc8, 0x64, 0x94, 0x63, 0xa3, 0xc4, 0x63, 0x49, 0xc2, + 0x7a, 0xbb, 0x88, 0x4d, 0x45, 0x79, 0x9b, 0xb4, 0xb4, 0x55, 0x4c, 0x38, + 0x6d, 0x5e, 0xbe, 0x7d, 0x77, 0x7a, 0x67, 0xb7, 0x8f, 0x42, 0x8e, 0x3f, + 0x36, 0x88, 0xb2, 0x66, 0xca, 0x4c, 0x9a, 0x7a, 0x96, 0x32, 0x3e, 0x61, + 0x79, 0x48, 0xa0, 0x41, 0x6b, 0x4d, 0xd1, 0x9b, 0x5f, 0x66, 0x30, 0xac, + 0x5c, 0x5e, 0x39, 0x8f, 0x35, 0xa6, 0xc3, 0x84, 0xbd, 0x76, 0x4f, 0x77, + 0x3e, 0x3b, 0x32, 0x79, 0x40, 0xa9, 0xb7, 0x6d, 0x6f, 0x52, 0x91, 0x34, + 0x56, 0x72, 0x6b, 0xd2, 0xb1, 0x31, 0xc4, 0x89, 0xa5, 0x36, 0xa9, 0xcb, + 0x53, 0x5f, 0x55, 0xd0, 0x61, 0x99, 0x40, 0x82, 0xb4, 0x83, 0x67, 0xb2, + 0x82, 0xb1, 0xc3, 0x9c, 0xa6, 0x6f, 0x3a, 0x47, 0xcc, 0xc6, 0x92, 0x9f, + 0xae, 0x8b, 0x61, 0x80, 0xb2, 0x3d, 0x40, 0xc3, 0xb6, 0x80, 0xc5, 0x56, + 0xca, 0x7e, 0xc5, 0x5f, 0xa3, 0xb3, 0x35, 0x6d, 0xc0, 0x53, 0x9b, 0xaf, + 0x41, 0xa4, 0x8d, 0xc4, 0xb3, 0x6e, 0x51, 0x57, 0x6c, 0xc4, 0xbe, 0x9a, + 0x94, 0x5e, 0x34, 0x52, 0x55, 0x48, 0x59, 0x57, 0x56, 0xcc, 0x60, 0xa6, + 0x5c, 0xb1, 0x93, 0x7c, 0x7d, 0xcc, 0x51, 0x6b, 0x99, 0x56, 0x7d, 0x83, + 0xab, 0x67, 0x97, 0xc9, 0xd3, 0x8f, 0x6b, 0x66, 0xc1, 0x46, 0xcf, 0x67, + 0x30, 0x73, 0xda, 0x66, 0xd3, 0x76, 0xc3, 0xb2, 0x83, 0x46, 0x71, 0x99, + 
0x5b, 0xc6, 0x93, 0xcf, 0xad, 0xd6, 0x56, 0xa7, 0x4b, 0x6f, 0xc2, 0x84, + 0x3d, 0xa1, 0x91, 0x5e, 0x60, 0xc4, 0x93, 0xa6, 0x50, 0xc1, 0x49, 0xdb, + 0xcd, 0x81, 0x97, 0x80, 0x5d, 0x7d, 0x3c, 0x93, 0x8e, 0xae, 0x99, 0x8c, + 0xab, 0x5b, 0x68, 0x68, 0xa3, 0x78, 0x6a, 0x9f, 0x52, 0xe0, 0x50, 0x2f, + 0xc1, 0x34, 0x49, 0x58, 0xd1, 0xd0, 0xc8, 0xa8, 0x3c, 0x9b, 0x2b, 0xaf, + 0x9e, 0x6b, 0x44, 0x8a, 0x45, 0xbc, 0xaf, 0xb5, 0x60, 0xa1, 0x6d, 0xce, + 0x4e, 0x71, 0xcf, 0x3f, 0x73, 0xb9, 0x4a, 0x52, 0x64, 0x65, 0x3e, 0xc9, + 0x3f, 0x82, 0x70, 0xc9, 0x97, 0x59, 0x52, 0x3c, 0xbc, 0xaf, 0x36, 0xd2, + 0x4e, 0xc0, 0xce, 0x3b, 0x94, 0x8b, 0x31, 0x3a, 0x6f, 0xae, 0x97, 0x8a, + 0x5d, 0xbe, 0x86, 0x8c, 0xce, 0x9b, 0xce, 0x5f, 0x91, 0x73, 0xcb, 0xdb, + 0x55, 0xc2, 0x7e, 0x32, 0x2f, 0x19, 0x7d, 0x7a, 0xc7, 0x6d, 0x8e, 0x8c, + 0x9a, 0x94, 0x50, 0xe0, 0x85, 0x79, 0x79, 0xa6, 0xcb, 0xce, 0x5a, 0x4c, + 0xb1, 0x87, 0xbf, 0x4d, 0x6a, 0xa4, 0xbe, 0x59, 0x96, 0x5c, 0x62, 0xbb, + 0x7b, 0x71, 0x2f, 0x77, 0xd0, 0x4d, 0x6c, 0x57, 0x8b, 0x3c, 0xd9, 0x8d, + 0x99, 0x9d, 0x55, 0x3f, 0x9b, 0x88, 0x79, 0x53, 0x7d, 0x2f, 0xbe, 0x73, + 0x48, 0x58, 0xb3, 0x79, 0x7c, 0x9e, 0x54, 0x65, 0xaa, 0x5c, 0xe4, 0x56, + 0x84, 0x3f, 0xc3, 0xbe, 0x42, 0x5e, 0x66, 0x99, 0x6b, 0x8f, 0x88, 0xa7, + 0x88, 0x5b, 0xb5, 0xb5, 0x71, 0x9a, 0x53, 0x37, 0x40, 0x98, 0x97, 0x74, + 0x5a, 0xb0, 0x62, 0xbc, 0xa3, 0xb3, 0x8a, 0xb8, 0x46, 0x8a, 0xa8, 0x97, + 0xab, 0x68, 0xb4, 0x36, 0x47, 0x52, 0x9b, 0xb1, 0x8d, 0x67, 0x9e, 0x4d, + 0xa8, 0xb5, 0xa5, 0x3c, 0x3c, 0x95, 0xd6, 0x47, 0x8c, 0x9a, 0x91, 0x54, + 0x2c, 0x86, 0x81, 0xd3, 0xae, 0x98, 0xa2, 0xc2, 0x9c, 0x98, 0x3b, 0x8a, + 0x55, 0xc1, 0x58, 0xc1, 0xc0, 0x43, 0x81, 0x8d, 0x6b, 0x50, 0x93, 0x8b, + 0xa2, 0x9c, 0x92, 0x3b, 0xc4, 0x5b, 0x9e, 0xa0, 0x61, 0xa3, 0x63, 0xa3, + 0xbd, 0xd4, 0x49, 0x76, 0x96, 0x4d, 0x46, 0x48, 0xca, 0x71, 0x27, 0x92, + 0x7b, 0x45, 0xb6, 0x82, 0x3c, 0x36, 0xcf, 0xc2, 0x56, 0x42, 0xaa, 0xb5, + 0x49, 0x5c, 0x3e, 0x83, 0x56, 0x47, 0x6b, 0x54, 0x7f, 0x2c, 0xa1, 0x79, + 0xc3, 0xb7, 0xa6, 0x5d, 0xb8, 0x97, 0x50, 0xc9, 0x4d, 0x9c, 0x8b, 0x5e, + 0x9c, 0xcd, 0x41, 0x6b, 0x84, 0x85, 0x60, 0x4a, 0xb9, 0x67, 0x6f, 0x9d, + 0xc5, 0x48, 0xb7, 0x4d, 0x93, 0xcf, 0x74, 0x78, 0xcb, 0x6d, 0x5b, 0x55, + 0xb4, 0xbc, 0xcd, 0x30, 0x8a, 0xc0, 0x77, 0xbb, 0x3d, 0x59, 0x4b, 0x90, + 0x88, 0xd3, 0xca, 0x95, 0x50, 0x86, 0xa8, 0x98, 0x74, 0x96, 0xb8, 0x2e, + 0x89, 0x5f, 0x96, 0xc5, 0x99, 0x5e, 0x9f, 0xb6, 0x6c, 0x85, 0xa8, 0x74, + 0xd7, 0x76, 0x49, 0x72, 0x76, 0x5d, 0x8b, 0x3a, 0x6a, 0xc5, 0x65, 0x86, + 0x81, 0xb4, 0xbe, 0xc2, 0x5b, 0x9b, 0x67, 0xbd, 0x30, 0xa2, 0x74, 0xab, + 0x8d, 0xc7, 0x5e, 0x5f, 0x33, 0x5d, 0x4e, 0xcf, 0xb0, 0xa0, 0xcc, 0xb6, + 0xb4, 0xbd, 0xbb, 0x4b, 0x88, 0xcb, 0x68, 0x5e, 0xc3, 0xba, 0x1d, 0x8a, + 0xce, 0x91, 0x54, 0x7f, 0x80, 0xbd, 0x7a, 0xc8, 0x97, 0xd2, 0x7d, 0xb8, + 0xcd, 0xb3, 0x3d, 0xcd, 0x78, 0xa8, 0x8f, 0xb4, 0x66, 0xb0, 0xdc, 0x4c, + 0xa8, 0x64, 0x3a, 0x62, 0x90, 0x47, 0xbe, 0xc9, 0x5f, 0x52, 0xc0, 0xbc, + 0xd7, 0x55, 0x54, 0x55, 0x2a, 0x3f, 0xb0, 0x56, 0x8e, 0xa0, 0x7a, 0xe0, + 0x99, 0x65, 0x64, 0x91, 0x79, 0x4f, 0xbe, 0xba, 0x7a, 0xbd, 0xa4, 0x65, + 0xb1, 0xaa, 0x61, 0xd1, 0x6e, 0x77, 0x78, 0x57, 0xa5, 0xa6, 0x40, 0xae, + 0x52, 0xa8, 0xad, 0x74, 0x82, 0xcf, 0x55, 0x9e, 0x4f, 0x56, 0x8f, 0xb7, + 0x84, 0x8f, 0x33, 0x73, 0x5c, 0xcc, 0x9a, 0x53, 0x6d, 0x68, 0x55, 0xc9, + 0xcd, 0xa4, 0x2c, 0x9c, 0xd6, 0x73, 0x32, 0xc5, 0xd1, 0xce, 0x76, 0xb3, + 0x82, 0x63, 0x97, 0xa2, 0x36, 0xcb, 0x9d, 0x57, 0xa7, 0x98, 0xa2, 0x3d, + 0xb1, 0x8b, 0x78, 0x4d, 0x5d, 0x75, 0x8a, 0xcd, 0x50, 0x56, 0x35, 0xa5, + 
0xe0, 0xb7, 0xc9, 0x9f, 0x74, 0x71, 0x7f, 0xbc, 0xbb, 0x15, 0x26, 0x85, + 0x88, 0x95, 0x63, 0x51, 0xc7, 0xbf, 0x68, 0x4f, 0x58, 0x85, 0x58, 0xbd, + 0x65, 0x86, 0xc0, 0x66, 0x8a, 0x5a, 0x28, 0x78, 0xaf, 0xe9, 0x7c, 0x89, + 0x7d, 0xac, 0x82, 0x60, 0xaa, 0x73, 0xd1, 0x47, 0xac, 0xa7, 0x90, 0x91, + 0xc5, 0x46, 0x73, 0xcc, 0x3a, 0x8c, 0x84, 0x71, 0x75, 0x92, 0x8f, 0x44, + 0xb2, 0xb2, 0x4c, 0xad, 0x83, 0x8a, 0xc6, 0xa3, 0x5a, 0x9c, 0x68, 0x74, + 0x49, 0x4a, 0x7e, 0xbb, 0x86, 0x2e, 0xcc, 0x51, 0xaa, 0xa8, 0x8d, 0xca, + 0xa8, 0x70, 0x97, 0xa7, 0xce, 0x69, 0x9b, 0xb8, 0x5e, 0x73, 0x97, 0x89, + 0x87, 0x41, 0x97, 0x4d, 0x2f, 0x69, 0x54, 0xc8, 0x48, 0xce, 0x3e, 0x3f, + 0x80, 0xb2, 0x4c, 0x36, 0xd4, 0x64, 0xb8, 0x42, 0x7e, 0x6f, 0x50, 0xb1, + 0x4d, 0xa5, 0x9f, 0x84, 0x59, 0x36, 0x46, 0x57, 0x3c, 0x52, 0xa6, 0x32, + 0x9f, 0xa5, 0x66, 0x3f, 0xbc, 0x4e, 0xc2, 0x70, 0x74, 0x59, 0x8e, 0x5a, + 0x8c, 0x86, 0xa3, 0x9b, 0xbf, 0xc6, 0x8d, 0x8d, 0x50, 0xc5, 0x5a, 0x45, + 0xa6, 0xe0, 0x91, 0x42, 0xa4, 0xbd, 0x72, 0xb1, 0x4c, 0x9f, 0xa1, 0xcf, + 0x4f, 0x69, 0xd0, 0x7b, 0xa0, 0xd3, 0xb5, 0x47, 0x4a, 0x56, 0x43, 0xd9, + 0x8f, 0x6c, 0x96, 0x89, 0xb7, 0x86, 0x48, 0x91, 0x4c, 0x93, 0x8b, 0xb8, + 0x63, 0x32, 0xb6, 0x4f, 0x7c, 0xad, 0x77, 0x8d, 0xca, 0x36, 0x5e, 0xdf, + 0xa4, 0xa5, 0x8d, 0x8e, 0xa3, 0x4a, 0x9b, 0x33, 0x3b, 0x71, 0x9a, 0x8b, + 0x69, 0xc6, 0x42, 0x9a, 0xab, 0x30, 0x34, 0x35, 0xc5, 0x87, 0xc0, 0xac, + 0x45, 0xbd, 0x95, 0xd0, 0x6a, 0x34, 0xc3, 0xb7, 0x7d, 0x48, 0x5d, 0xc7, + 0x3b, 0x6b, 0x7b, 0xb2, 0xa8, 0x49, 0x35, 0x47, 0xae, 0xb4, 0xba, 0xa3, + 0x39, 0x58, 0xca, 0x52, 0x3d, 0xbd, 0x7b, 0xc6, 0xcc, 0x50, 0x69, 0xcc, + 0x4c, 0x7f, 0x9a, 0x7c, 0x7a, 0x4a, 0xcc, 0x5b, 0x60, 0x8b, 0xd0, 0xbf, + 0x85, 0xa9, 0x4e, 0x38, 0xb9, 0x36, 0x52, 0x37, 0x53, 0x7c, 0x41, 0xbe, + 0x6f, 0x5d, 0xb0, 0xae, 0xa8, 0x58, 0x6d, 0x7a, 0xa1, 0x54, 0xd0, 0x9a, + 0x83, 0x67, 0x91, 0xa7, 0x9c, 0xc0, 0x37, 0x88, 0x8e, 0xa9, 0x94, 0x64, + 0x9e, 0xaa, 0x8e, 0xa0, 0x55, 0x31, 0x76, 0xc7, 0x6c, 0xa9, 0x38, 0x9f, + 0x55, 0x67, 0xb6, 0x6a, 0x87, 0x52, 0x8a, 0x47, 0x7f, 0x40, 0x8c, 0xc3, + 0x76, 0x49, 0x4a, 0xc4, 0x34, 0x7a, 0xc0, 0x68, 0x99, 0x95, 0x40, 0x96, + 0x6c, 0x6e, 0xb7, 0x77, 0xa0, 0x91, 0x49, 0x9e, 0x42, 0x93, 0x5b, 0x36, + 0x90, 0xb6, 0x3b, 0x36, 0x9c, 0x7f, 0x9e, 0xb2, 0xa4, 0xaa, 0xb0, 0x32, + 0x88, 0x91, 0x64, 0x7e, 0x41, 0x8c, 0x97, 0x33, 0x5e, 0xca, 0x52, 0xb6, + 0x5f, 0x73, 0x4c, 0x45, 0xa0, 0x92, 0x5a, 0x9e, 0x51, 0xa7, 0xa2, 0x52, + 0x3f, 0x80, 0x41, 0x73, 0xb1, 0xc7, 0x6a, 0x79, 0x65, 0x61, 0x6d, 0x8a, + 0x6a, 0x9b, 0x50, 0x40, 0x99, 0x60, 0x9a, 0x4d, 0xbb, 0xb6, 0x60, 0x7d, + 0xa1, 0x57, 0xa5, 0x4a, 0xb5, 0xaf, 0xa9, 0x4b, 0xc4, 0x38, 0x65, 0xc0, + 0xba, 0x30, 0x66, 0x85, 0x58, 0x36, 0xad, 0x98, 0x73, 0xc5, 0xa7, 0x33, + 0xc1, 0x91, 0x9f, 0xc5, 0x81, 0x3b, 0x96, 0x37, 0xb6, 0xad, 0x9d, 0x68, + 0xb8, 0x3c, 0x76, 0x7c, 0x83, 0x34, 0x83, 0x35, 0xa8, 0x69, 0x31, 0x93, + 0x38, 0x48, 0x6f, 0x34, 0x66, 0x5c, 0x90, 0xc1, 0x31, 0x7b, 0x36, 0x82, + 0x76, 0x36, 0x9c, 0xbf, 0xc1, 0x93, 0x81, 0x8e, 0xaf, 0xc4, 0xc9, 0x67, + 0x6b, 0x90, 0xba, 0x7d, 0x91, 0x4f, 0x82, 0x6f, 0xbf, 0xb9, 0x56, 0xaf, + 0x55, 0x4a, 0xb1, 0x78, 0x75, 0x5d, 0x33, 0x49, 0x9a, 0x44, 0xb6, 0x9a, + 0x67, 0x35, 0xab, 0x8c, 0xc3, 0xc5, 0x94, 0xa8, 0x48, 0x50, 0x63, 0x50, + 0x93, 0x50, 0xb1, 0xb1, 0x54, 0x47, 0x5d, 0x9e, 0xad, 0x52, 0x38, 0xb7, + 0x45, 0x86, 0xbd, 0xb5, 0xa9, 0xb3, 0x68, 0x36, 0x94, 0x61, 0xaf, 0xd0, + 0xaa, 0xa8, 0x42, 0x4e, 0xa7, 0x31, 0xb2, 0xb5, 0x9f, 0xb3, 0x84, 0xa6, + 0xc6, 0x9e, 0x45, 0xa2, 0xc9, 0xb2, 0x7d, 0x36, 0x5e, 0x7b, 0x67, 0xcb, + 
0x79, 0xa9, 0x75, 0x81, 0x5f, 0xd2, 0xa3, 0xa1, 0x6e, 0xb9, 0x35, 0x6d, + 0x3c, 0xc6, 0x95, 0x6e, 0x47, 0x6c, 0xb6, 0xb4, 0xa0, 0x85, 0x47, 0xd3, + 0x8f, 0xc8, 0x9d, 0x46, 0xd2, 0x43, 0x44, 0x7c, 0x7c, 0xd0, 0x76, 0x74, + 0x63, 0x90, 0x48, 0x78, 0x6f, 0x66, 0x42, 0x4c, 0x5f, 0x41, 0xc6, 0x35, + 0x3a, 0x85, 0xb2, 0xa0, 0x35, 0xc6, 0x97, 0x55, 0xb5, 0x7e, 0xb8, 0xbe, + 0xb8, 0xbc, 0x7c, 0x5f, 0x84, 0xb0, 0x5f, 0x75, 0x74, 0xa2, 0x7f, 0xba, + 0x42, 0x90, 0x3e, 0x4f, 0x70, 0x6a, 0xa0, 0xcf, 0x91, 0x4f, 0xb6, 0xb7, + 0xbc, 0x2f, 0x58, 0xad, 0x6b, 0x79, 0x8d, 0x8d, 0x9e, 0x9d, 0xc5, 0xc3, + 0x52, 0xbc, 0x87, 0x72, 0x42, 0x32, 0xb5, 0x3f, 0x5d, 0x34, 0xac, 0x53, + 0x48, 0x71, 0xae, 0x30, 0x63, 0x39, 0x72, 0xb9, 0x56, 0x3c, 0x9c, 0x3f, + 0x4b, 0x3e, 0xca, 0xa1, 0xad, 0x96, 0x71, 0x4e, 0xa0, 0x7e, 0x6a, 0x83, + 0xb5, 0x8d, 0x78, 0x56, 0x85, 0xb7, 0xb8, 0xcb, 0xa8, 0x41, 0x9a, 0x65, + 0x48, 0xa5, 0x56, 0xae, 0x67, 0x8f, 0xc1, 0x54, 0xb0, 0x70, 0x33, 0x70, + 0x9a, 0x30, 0x61, 0xd1, 0x9a, 0xb2, 0xba, 0x55, 0x97, 0x2f, 0x88, 0x57, + 0x69, 0xcc, 0x8d, 0x6e, 0x76, 0x73, 0x55, 0x96, 0x79, 0xbb, 0x4a, 0x44, + 0x64, 0x9c, 0x61, 0x5e, 0x85, 0x50, 0x89, 0xaa, 0xa2, 0x45, 0xae, 0x3d, + 0xaa, 0x6d, 0x48, 0xab, 0xcf, 0x79, 0xcd, 0xc9, 0x7b, 0x7b, 0x9c, 0x77, + 0x72, 0xb1, 0xd3, 0x76, 0xbd, 0x8c, 0xc1, 0x37, 0x3a, 0xce, 0x55, 0xcb, + 0x87, 0x7a, 0x4d, 0x5d, 0x4a, 0x44, 0xa8, 0x6a, 0x8d, 0x7e, 0x5b, 0xbd, + 0x60, 0x7c, 0xc3, 0x4b, 0xa7, 0x3f, 0x94, 0x46, 0x90, 0xa9, 0x99, 0x64, + 0xcb, 0x72, 0x4d, 0xb8, 0xcd, 0xb9, 0xa3, 0x3c, 0x3b, 0xb1, 0x51, 0xa3, + 0x9f, 0xa3, 0xca, 0xb8, 0x57, 0xa5, 0xad, 0x59, 0x3e, 0x52, 0x84, 0x34, + 0xba, 0x76, 0xc6, 0x3e, 0x95, 0x7a, 0x3b, 0xbf, 0xd0, 0xc5, 0x4e, 0xa3, + 0xb2, 0x6d, 0xb1, 0x4e, 0x86, 0x89, 0xb9, 0x3b, 0x6a, 0xbc, 0x68, 0x65, + 0x4c, 0x3c, 0x63, 0x87, 0x45, 0x8a, 0xa0, 0xc9, 0xa8, 0x83, 0x92, 0x8e, + 0x36, 0x84, 0x4b, 0xbe, 0x34, 0xa8, 0x9c, 0xcb, 0x94, 0x81, 0xa6, 0xae, + 0x96, 0x67, 0xc5, 0x39, 0x4e, 0x9f, 0x9c, 0xc7, 0x46, 0x42, 0x5f, 0xbc, + 0x63, 0xbb, 0xd3, 0xb7, 0x71, 0x4f, 0x55, 0x42, 0xb5, 0x8c, 0x85, 0x50, + 0x41, 0x5c, 0x52, 0x99, 0x4f, 0x52, 0x76, 0x64, 0x57, 0x7e, 0x30, 0x2f, + 0x94, 0x36, 0xa4, 0xce, 0x8b, 0x6f, 0x5f, 0x8d, 0x57, 0x6c, 0x99, 0xc0, + 0x5d, 0x31, 0xbe, 0x66, 0xc5, 0xa1, 0x4d, 0x42, 0xbb, 0xaf, 0x41, 0x42, + 0x63, 0xcb, 0x9f, 0x38, 0xcf, 0x98, 0xbe, 0x77, 0xa5, 0x7f, 0x7d, 0xbb, + 0x63, 0xbe, 0x31, 0x68, 0xb3, 0x4b, 0x3d, 0xbd, 0xcd, 0x70, 0xbc, 0x70, + 0xc3, 0x9b, 0xbc, 0x4b, 0x48, 0x67, 0x30, 0x6e, 0x35, 0x82, 0x33, 0xbe, + 0x33, 0x62, 0xae, 0x4f, 0x5e, 0xd2, 0x34, 0x73, 0x68, 0x63, 0x4e, 0xb8, + 0x5e, 0xa4, 0x57, 0x58, 0xc7, 0x81, 0xd1, 0xb1, 0x5f, 0x9f, 0x4a, 0x41, + 0xa8, 0x8d, 0x8f, 0xb7, 0x77, 0x4e, 0x5e, 0xab, 0x44, 0xa2, 0x72, 0xba, + 0x6e, 0xb7, 0x8e, 0x83, 0x2b, 0x94, 0x87, 0xb6, 0xab, 0xc9, 0xb9, 0xc7, + 0x64, 0x99, 0xb2, 0x9e, 0x9d, 0xc4, 0xd4, 0x79, 0x94, 0xa4, 0x9e, 0x61, + 0xa0, 0x83, 0x4c, 0x4f, 0x5e, 0x8f, 0x41, 0x6d, 0x42, 0x99, 0x42, 0x85, + 0x78, 0x96, 0x2e, 0x74, 0xa9, 0x36, 0x40, 0x4e, 0x45, 0x6d, 0xb3, 0xbd, + 0x68, 0xcb, 0x69, 0xae, 0x97, 0xc4, 0x72, 0xc8, 0x81, 0xca, 0x3e, 0x8c, + 0xaa, 0xcb, 0x82, 0xa5, 0x8e, 0x53, 0x6e, 0xa0, 0xa8, 0x7e, 0xa2, 0x89, + 0x8b, 0x69, 0xba, 0x30, 0x81, 0x6b, 0x78, 0x55, 0x7f, 0xb4, 0x4d, 0x5d, + 0x88, 0x32, 0xbe, 0x70, 0x45, 0xb0, 0x60, 0xb8, 0xd0, 0xb3, 0x33, 0x9f, + 0xd1, 0xb7, 0x69, 0x87, 0x9c, 0x6e, 0x52, 0x97, 0xaf, 0x5c, 0x58, 0x2e, + 0x3e, 0x4c, 0x8e, 0xcd, 0x54, 0x64, 0xc4, 0x74, 0xd5, 0x90, 0x71, 0x7a, + 0x9d, 0x82, 0x31, 0x60, 0xc4, 0xcc, 0x4a, 0x6d, 0x86, 0x6a, 0x33, 0x55, + 
0x9e, 0x87, 0x85, 0xa2, 0x4f, 0x34, 0x4e, 0x60, 0x37, 0xd5, 0xbf, 0x92, + 0x86, 0x4f, 0x5a, 0x4b, 0xc3, 0x7a, 0x59, 0xc9, 0xa0, 0x60, 0xa5, 0xc3, + 0x6f, 0xc0, 0x73, 0xbc, 0x76, 0x9b, 0x5a, 0x6b, 0x46, 0x69, 0xb8, 0x3a, + 0x3b, 0xa9, 0x68, 0x45, 0xd1, 0x73, 0x7c, 0xa4, 0xb3, 0x4a, 0xad, 0xc2, + 0x69, 0x72, 0x6e, 0xc9, 0x8d, 0x9d, 0x30, 0x6e, 0x5e, 0x67, 0xb6, 0xa1, + 0x43, 0x86, 0x78, 0x70, 0x8c, 0xc7, 0xa3, 0x90, 0x72, 0x85, 0xd8, 0x4d, + 0x8c, 0x7e, 0xb2, 0x62, 0x37, 0x54, 0x35, 0x4d, 0x63, 0xad, 0x7a, 0x7b, + 0x8a, 0x37, 0x73, 0xac, 0xd0, 0x53, 0x35, 0x9f, 0x4a, 0xc3, 0xc8, 0x82, + 0x51, 0x45, 0x57, 0x41, 0x4a, 0x44, 0x61, 0xab, 0x95, 0xb9, 0x37, 0x92, + 0x5b, 0xb4, 0x97, 0xbc, 0x7b, 0x9b, 0xaa, 0x8a, 0x8d, 0x4c, 0x9c, 0xb3, + 0x83, 0x37, 0x9e, 0x88, 0xaa, 0x82, 0x45, 0x55, 0x90, 0x73, 0xc8, 0x62, + 0xb9, 0xa5, 0x61, 0x46, 0x84, 0xaa, 0x39, 0xbb, 0x97, 0x35, 0xd2, 0xb2, + 0x8a, 0x64, 0x44, 0xd2, 0x46, 0x57, 0x52, 0x74, 0x41, 0xb1, 0xcb, 0xc5, + 0x33, 0xc7, 0x6b, 0xd1, 0x8e, 0x96, 0xb1, 0x76, 0x74, 0x5c, 0xb9, 0x45, + 0x3e, 0xcc, 0x30, 0x5b, 0x68, 0x4f, 0xbc, 0x61, 0xba, 0x4c, 0x4c, 0x83, + 0x8b, 0x76, 0xc9, 0xb6, 0x6d, 0x7f, 0xc5, 0xe7, 0x42, 0x7b, 0x56, 0x9f, + 0x97, 0x64, 0x77, 0x86, 0x97, 0x7e, 0x7f, 0x48, 0x9f, 0x42, 0xa4, 0xbf, + 0x52, 0x79, 0x92, 0x48, 0x35, 0x55, 0x7c, 0x34, 0xcb, 0x67, 0x52, 0xa0, + 0x76, 0x44, 0x5e, 0x93, 0xaa, 0x80, 0x5f, 0x6b, 0x37, 0xba, 0x8e, 0xb1, + 0xbe, 0x88, 0xb6, 0x6c, 0x59, 0x7b, 0x33, 0x90, 0xa1, 0xb7, 0xa6, 0xb4, + 0x52, 0xc2, 0x5f, 0x5a, 0x7f, 0x88, 0x48, 0xc7, 0x6b, 0xaa, 0x3d, 0x44, + 0x77, 0x76, 0x31, 0x51, 0x98, 0x63, 0x92, 0x64, 0x4d, 0x68, 0xbc, 0x82, + 0x80, 0xb2, 0x6c, 0x73, 0x6a, 0x89, 0x3c, 0x71, 0xa0, 0xb4, 0x80, 0x53, + 0x34, 0x76, 0x54, 0x3b, 0xc1, 0x6d, 0xbe, 0xc8, 0x3e, 0x6d, 0x3b, 0x54, + 0x68, 0x55, 0xaf, 0xac, 0x46, 0x86, 0x5e, 0x78, 0x38, 0x3a, 0x6f, 0xad, + 0x89, 0xcb, 0x36, 0xa2, 0x48, 0x97, 0x81, 0x6c, 0x5b, 0x6b, 0x4a, 0xad, + 0x8a, 0x76, 0x42, 0x4b, 0x46, 0x77, 0xa8, 0xa4, 0xbb, 0xb5, 0x6e, 0xaa, + 0xa4, 0xbf, 0x5f, 0x9a, 0x41, 0x60, 0x39, 0xa0, 0xc1, 0x41, 0x60, 0xd0, + 0x3e, 0xab, 0x6a, 0x9c, 0xc9, 0x8c, 0x4c, 0xac, 0xbf, 0xb9, 0x63, 0xa9, + 0xc2, 0x60, 0x6f, 0x82, 0x87, 0x88, 0x88, 0x61, 0x96, 0xbc, 0xd3, 0x45, + 0x64, 0xb5, 0xbc, 0x8f, 0x43, 0x79, 0x63, 0xaa, 0x56, 0x73, 0x85, 0xaf, + 0x6d, 0xb7, 0x82, 0x4a, 0xa9, 0x66, 0xc7, 0xb2, 0xc8, 0xd2, 0x8c, 0x51, + 0xc4, 0x56, 0x81, 0x7e, 0x38, 0x74, 0x55, 0x9f, 0xcb, 0x4a, 0x52, 0x7f, + 0x79, 0x4c, 0xc6, 0x4d, 0xa0, 0x3a, 0x79, 0x36, 0xb1, 0x8b, 0x28, 0xc6, + 0xa4, 0x72, 0x6a, 0xae, 0x9b, 0x89, 0x81, 0xd2, 0x9a, 0x66, 0xb0, 0x71, + 0x96, 0xcd, 0x79, 0xac, 0x8f, 0x48, 0xc4, 0x72, 0x6f, 0x36, 0xcb, 0xe0, + 0xb0, 0x66, 0x6f, 0x1f, 0x4e, 0x8c, 0x7c, 0xa2, 0xa4, 0x5d, 0xbc, 0x4a, + 0x57, 0x4a, 0x3f, 0xa5, 0x42, 0x6b, 0x56, 0x97, 0x37, 0x87, 0x49, 0x80, + 0xce, 0x86, 0x4e, 0x58, 0x64, 0xaa, 0x86, 0x4e, 0x3d, 0x43, 0xce, 0x91, + 0xc7, 0xac, 0x52, 0x8c, 0x38, 0xa1, 0x84, 0x57, 0x2e, 0x79, 0x64, 0x9d, + 0x8e, 0xd2, 0x45, 0xa7, 0x4d, 0xae, 0x4a, 0xaf, 0xb6, 0xc6, 0x42, 0xae, + 0x4e, 0x51, 0x8c, 0xa4, 0x46, 0x84, 0x84, 0x95, 0x33, 0xd0, 0x49, 0x6d, + 0x4b, 0x94, 0xd1, 0xcb, 0xc4, 0xc8, 0x55, 0xc0, 0x7d, 0xaa, 0xb2, 0x41, + 0x60, 0x5b, 0x69, 0xa5, 0x53, 0x5a, 0x92, 0x32, 0xa7, 0x57, 0x6d, 0x6c, + 0xab, 0x4a, 0x6a, 0xa9, 0x73, 0xa5, 0x4a, 0x37, 0x52, 0x5c, 0x40, 0x6b, + 0x4c, 0x73, 0xd2, 0xd9, 0xc6, 0xcd, 0xc4, 0x50, 0x4b, 0x43, 0xb8, 0xac, + 0xd2, 0xc1, 0x4e, 0x46, 0xa4, 0xac, 0x94, 0x89, 0xa5, 0x44, 0x79, 0x94, + 0x55, 0x8e, 0x9f, 0x7d, 0xbc, 0xcb, 0x68, 0x5d, 0xab, 0x8f, 0x69, 0x49, + 
0x4d, 0x60, 0xce, 0x43, 0xa4, 0x3a, 0x34, 0x60, 0x65, 0x5a, 0xbc, 0x38, + 0xa7, 0x3f, 0x62, 0x60, 0x81, 0x79, 0x43, 0x56, 0xd6, 0xd1, 0xc5, 0xa6, + 0xd3, 0x5a, 0x88, 0x81, 0x8d, 0x57, 0x42, 0xaa, 0xac, 0xbc, 0x71, 0x6a, + 0xc9, 0xb3, 0x79, 0xb4, 0x33, 0xc1, 0xcb, 0x4e, 0x73, 0x6e, 0x94, 0x3c, + 0x8a, 0x6c, 0x79, 0x75, 0x8c, 0x3c, 0x30, 0xc1, 0x68, 0xca, 0x3c, 0x76, + 0xc3, 0xb5, 0x65, 0xc5, 0x3f, 0xb3, 0xc2, 0x50, 0x4b, 0xbe, 0x9f, 0x53, + 0x46, 0x5f, 0x77, 0x97, 0x3c, 0xd3, 0x66, 0x7a, 0x5f, 0xc2, 0xc5, 0x8c, + 0x34, 0xb7, 0xb4, 0xc2, 0x37, 0xb0, 0x51, 0x34, 0x89, 0x9b, 0x51, 0xb0, + 0x95, 0x4b, 0xbe, 0x83, 0x3f, 0xa0, 0x93, 0xb2, 0x9b, 0x3e, 0x53, 0x8c, + 0xcc, 0xaa, 0xb8, 0x92, 0xa7, 0xa0, 0x7a, 0x38, 0x8a, 0x6c, 0x76, 0xab, + 0xbc, 0x76, 0x69, 0x9a, 0xc2, 0x5b, 0x92, 0x3e, 0xc3, 0x9c, 0x81, 0x5a, + 0x44, 0xc3, 0x43, 0xab, 0x33, 0x5a, 0x60, 0xa6, 0x81, 0x3b, 0xcf, 0xc7, + 0x3b, 0xaf, 0x79, 0xd2, 0x42, 0x6f, 0x99, 0x42, 0xca, 0x97, 0xc2, 0xcd, + 0x72, 0x6d, 0x41, 0xcf, 0x3c, 0x60, 0x8f, 0xce, 0x48, 0xc9, 0x9d, 0x66, + 0x82, 0x90, 0x83, 0xc3, 0x9a, 0x8f, 0xb9, 0x77, 0x69, 0x95, 0x36, 0xb8, + 0x9a, 0xb1, 0x64, 0xa7, 0x8b, 0x6d, 0xc9, 0x70, 0x79, 0xc6, 0x7a, 0xc2, + 0x60, 0xa5, 0x65, 0x4a, 0xa9, 0x5b, 0x65, 0x43, 0x9c, 0x6c, 0x8c, 0x4a, + 0xcf, 0x4a, 0x82, 0x62, 0x8f, 0xb0, 0xd0, 0xb2, 0x77, 0xb5, 0x9e, 0x67, + 0x52, 0x91, 0x77, 0x89, 0x65, 0xb2, 0x86, 0x5b, 0x75, 0xc0, 0x75, 0x53, + 0x3e, 0x63, 0xc9, 0xb9, 0x6a, 0x47, 0x7d, 0x46, 0xcd, 0xb9, 0x9e, 0x72, + 0x30, 0x3b, 0xb0, 0xb3, 0xc4, 0x63, 0x95, 0x62, 0xa8, 0x5f, 0x6d, 0x45, + 0x90, 0x34, 0xb0, 0xcc, 0x95, 0xaa, 0x45, 0xbc, 0x96, 0x6a, 0x99, 0x6b, + 0x9d, 0x59, 0x56, 0x76, 0x80, 0x77, 0x48, 0x33, 0x7e, 0xc4, 0x6d, 0x85, + 0x7e, 0x6a, 0x5f, 0x6b, 0x5d, 0x52, 0x96, 0x88, 0xa5, 0x8d, 0x62, 0x51, + 0x66, 0x30, 0x66, 0xb8, 0xc0, 0x44, 0x7f, 0x52, 0x73, 0x32, 0x62, 0x94, + 0x5e, 0x7c, 0x7f, 0xd0, 0x37, 0x70, 0xc4, 0x4e, 0x54, 0x87, 0xca, 0xab, + 0xcd, 0x9a, 0x82, 0x6e, 0x46, 0x5b, 0x53, 0x3b, 0x66, 0x33, 0x6e, 0x95, + 0x90, 0x4a, 0x3e, 0x58, 0xb9, 0xc2, 0x34, 0xc5, 0x87, 0x91, 0xbe, 0x74, + 0x87, 0x63, 0x9b, 0x48, 0x67, 0xb8, 0xae, 0xc4, 0xd0, 0x82, 0x8a, 0x6d, + 0x55, 0xbe, 0x94, 0xd1, 0xa2, 0x64, 0x9f, 0x8b, 0x44, 0x61, 0xc2, 0x8d, + 0x92, 0x6e, 0x52, 0x37, 0x5c, 0xad, 0xa1, 0xa3, 0x43, 0x65, 0x4d, 0xb3, + 0xa5, 0xc5, 0x64, 0x90, 0x59, 0x7f, 0x81, 0xb0, 0xb9, 0xce, 0x31, 0x7e, + 0xce, 0x63, 0x61, 0xac, 0x41, 0xb1, 0xa7, 0x55, 0xa2, 0x87, 0x56, 0xb2, + 0x3f, 0x43, 0x62, 0x99, 0x32, 0x33, 0xb6, 0xb8, 0x7c, 0x5e, 0x33, 0x3f, + 0x33, 0x97, 0x9f, 0xc9, 0x39, 0xc1, 0x68, 0xb5, 0x55, 0x4a, 0x50, 0xb9, + 0xb4, 0xc8, 0x65, 0xb5, 0x54, 0xb3, 0x73, 0xb8, 0xad, 0xc1, 0x4f, 0x9d, + 0x7f, 0x78, 0x46, 0x84, 0x64, 0x69, 0x4f, 0x73, 0xa6, 0xca, 0xb2, 0xbb, + 0x40, 0x78, 0xcb, 0xcd, 0x41, 0x67, 0xca, 0xae, 0xca, 0x42, 0x95, 0x35, + 0x9d, 0x7a, 0x52, 0x5a, 0x5c, 0x3e, 0xc2, 0xcf, 0xc7, 0x35, 0x65, 0x66, + 0x89, 0x35, 0x69, 0x93, 0x56, 0x53, 0x44, 0x9c, 0x5d, 0xc2, 0xcc, 0xab, + 0x94, 0x43, 0x4d, 0xb9, 0x7e, 0xae, 0x3d, 0x43, 0x90, 0x7b, 0x6f, 0x84, + 0x57, 0xc9, 0xc8, 0xbd, 0x6f, 0xae, 0x9a, 0x4c, 0x8d, 0x76, 0x70, 0x97, + 0xa2, 0xca, 0x42, 0xb2, 0xce, 0xc9, 0x4d, 0x4a, 0x62, 0x46, 0x6c, 0x8d, + 0x8c, 0x65, 0x91, 0x8c, 0x42, 0x42, 0x92, 0xcf, 0xa4, 0x82, 0x4f, 0x91, + 0x31, 0x68, 0x6d, 0x66, 0xca, 0x88, 0x88, 0xa8, 0x3f, 0xa4, 0xb7, 0x71, + 0xa8, 0x56, 0x65, 0x80, 0x88, 0x51, 0x48, 0xb6, 0x94, 0x9f, 0xbe, 0x53, + 0x50, 0x50, 0xa1, 0x61, 0x30, 0xc2, 0x50, 0xb0, 0x55, 0xb4, 0x5a, 0xb2, + 0xb3, 0x42, 0xc9, 0x3b, 0xbd, 0x7e, 0x47, 0x50, 0x8c, 0x36, 0x7d, 0x5b, + 
0x60, 0x54, 0x74, 0x5c, 0x42, 0x71, 0x76, 0x60, 0x3c, 0x99, 0x43, 0x92, + 0x4c, 0x90, 0x8b, 0xa3, 0x40, 0x7f, 0x56, 0x41, 0xc5, 0x80, 0xbf, 0x49, + 0xac, 0x73, 0xb8, 0xc9, 0x9a, 0x67, 0x95, 0x8c, 0xbe, 0x85, 0xba, 0x64, + 0x60, 0x59, 0x31, 0x32, 0x33, 0xb8, 0x75, 0x52, 0xcb, 0x5a, 0x72, 0x9f, + 0xa2, 0x69, 0x50, 0x31, 0x85, 0xaf, 0xc3, 0x48, 0xc7, 0x83, 0xa7, 0x5a, + 0x92, 0xa8, 0x67, 0xa9, 0x6f, 0xb4, 0x3d, 0x59, 0xca, 0xae, 0xc7, 0xab, + 0x2e, 0x79, 0xa8, 0xc2, 0xd2, 0xa1, 0xcb, 0x72, 0x64, 0x84, 0x87, 0x4c, + 0x40, 0x5a, 0x76, 0x4d, 0x47, 0x87, 0xc7, 0x87, 0x5e, 0x59, 0x6b, 0xbe, + 0x64, 0x9d, 0xa2, 0x3b, 0xb9, 0x56, 0x89, 0xac, 0x7e, 0x5f, 0xd2, 0x87, + 0x5c, 0xb5, 0xa6, 0x96, 0x94, 0xce, 0xb8, 0x38, 0x6d, 0xa3, 0x6d, 0x6f, + 0xc5, 0x31, 0x6b, 0xb2, 0x4f, 0x9b, 0x86, 0x82, 0xa2, 0x74, 0x35, 0xb1, + 0x77, 0xb9, 0xae, 0x96, 0x3a, 0xc8, 0xa8, 0x80, 0x38, 0x37, 0x75, 0x59, + 0xac, 0x6b, 0x38, 0xb1, 0xc2, 0x83, 0x7f, 0x90, 0x6e, 0x7a, 0xc2, 0xc6, + 0x50, 0xad, 0xae, 0x45, 0xbe, 0x62, 0x6f, 0xbe, 0x72, 0x49, 0x4a, 0xa8, + 0x8d, 0x76, 0xb6, 0xbc, 0x3c, 0xb1, 0x81, 0x47, 0xb2, 0x60, 0xc9, 0xab, + 0x42, 0x97, 0x7d, 0x6f, 0xa6, 0x37, 0x57, 0x56, 0x8b, 0x46, 0xb4, 0x64, + 0x79, 0x61, 0x82, 0xd2, 0x6d, 0xa4, 0x89, 0x6c, 0x88, 0x90, 0xa1, 0x5c, + 0xb5, 0xad, 0x54, 0xb7, 0x70, 0x51, 0xb0, 0xb5, 0xaa, 0xd2, 0x6b, 0xb8, + 0xad, 0xae, 0x5a, 0xa6, 0xcf, 0x5b, 0xc3, 0x5b, 0xa0, 0x34, 0xa4, 0x38, + 0x81, 0x45, 0x4f, 0xb6, 0x45, 0xd2, 0xcf, 0x87, 0x75, 0x69, 0xc0, 0x41, + 0xc6, 0xb4, 0x53, 0x7e, 0x40, 0x8d, 0x8b, 0x80, 0xb3, 0x7d, 0x4a, 0x46, + 0x69, 0x84, 0x54, 0x40, 0x72, 0x88, 0xa4, 0x96, 0xb7, 0x3e, 0x59, 0x91, + 0x36, 0x50, 0xc9, 0x90, 0xac, 0x84, 0x4e, 0x41, 0x68, 0x9f, 0x61, 0x7a, + 0x32, 0xcb, 0xc2, 0xbc, 0x84, 0x7e, 0x59, 0xa0, 0x96, 0x43, 0xa2, 0xb5, + 0xc8, 0x90, 0x8a, 0xb4, 0x79, 0x9d, 0xa1, 0xb7, 0x65, 0x8d, 0x65, 0x70, + 0x95, 0x3a, 0xa0, 0x6d, 0xaa, 0x74, 0x4a, 0xcf, 0x5a, 0x87, 0xe3, 0xb7, + 0x93, 0x8d, 0xca, 0xc5, 0x78, 0x55, 0xce, 0x9e, 0xb7, 0x8c, 0x8c, 0x6d, + 0xbc, 0x50, 0x5b, 0xb3, 0x89, 0x4f, 0xad, 0xb4, 0xaf, 0xc4, 0x89, 0x3d, + 0xae, 0xc7, 0x97, 0x4d, 0x54, 0xb6, 0x8c, 0x52, 0x62, 0x5e, 0x7a, 0xbc, + 0x84, 0x84, 0xa3, 0x7f, 0x46, 0x37, 0x77, 0x2d, 0x91, 0x8d, 0x52, 0xb3, + 0xaf, 0xa0, 0xbb, 0xb1, 0x36, 0x97, 0x84, 0xbd, 0xd4, 0x82, 0xbe, 0x84, + 0x70, 0x54, 0x6f, 0xa5, 0x41, 0xbc, 0xc4, 0x96, 0x4c, 0x96, 0x93, 0x97, + 0xa4, 0x42, 0x60, 0x39, 0x70, 0xba, 0xac, 0x4b, 0x82, 0x4e, 0x66, 0x6d, + 0xad, 0x92, 0x2d, 0x98, 0xaf, 0x52, 0xba, 0x5e, 0x97, 0x9a, 0xcd, 0x84, + 0xaa, 0x6d, 0x6b, 0xcd, 0xd9, 0x86, 0x66, 0xa5, 0xdd, 0x48, 0x43, 0x67, + 0x7d, 0x54, 0x98, 0xb4, 0xbb, 0x8e, 0x49, 0x77, 0xd0, 0x6c, 0xbb, 0xbe, + 0xb1, 0x82, 0x54, 0x35, 0x39, 0x95, 0x32, 0x3a, 0xb8, 0x4a, 0x8c, 0x8d, + 0x5b, 0x58, 0x5f, 0x82, 0x57, 0x36, 0x92, 0x91, 0x3c, 0x4e, 0x59, 0x28, + 0x5b, 0x74, 0xe1, 0x24, 0xbb, 0xa8, 0xa2, 0xbc, 0x6e, 0x50, 0xa7, 0xd3, + 0xa6, 0x7e, 0x35, 0xac, 0x2c, 0x65, 0xd0, 0x54, 0x52, 0x46, 0xc4, 0x5e, + 0xd5, 0xc6, 0xb1, 0x70, 0x68, 0x92, 0xb5, 0x77, 0x42, 0x87, 0x3e, 0x86, + 0x32, 0x4e, 0x46, 0x96, 0x53, 0x7d, 0x6a, 0xc6, 0x6c, 0x9c, 0x5e, 0xc3, + 0xaf, 0xbf, 0xc6, 0x60, 0xb2, 0x7a, 0x7b, 0x78, 0x93, 0xa4, 0x5a, 0x83, + 0x8e, 0xd2, 0x91, 0xd4, 0x91, 0xbd, 0x30, 0x45, 0x62, 0x38, 0x5c, 0x88, + 0x61, 0xbd, 0x5f, 0xb5, 0x9a, 0x9a, 0x9b, 0x83, 0x55, 0x81, 0x2f, 0x6b, + 0xc1, 0x95, 0x99, 0x8e, 0x66, 0x35, 0x9b, 0xa8, 0xa2, 0x2d, 0x2f, 0x73, + 0x63, 0xc4, 0x83, 0x50, 0xaa, 0x87, 0x60, 0x60, 0xa4, 0xa8, 0x3f, 0x60, + 0xb4, 0x4c, 0x62, 0x76, 0xb2, 0xd2, 0x8c, 0x7a, 0xb9, 0x8c, 0xbe, 0x32, + 
0x9d, 0x81, 0x81, 0x6b, 0xbf, 0x6a, 0x40, 0x54, 0xcf, 0x9e, 0xcd, 0xae, + 0xb3, 0x71, 0x9d, 0x97, 0xbf, 0x48, 0x3f, 0x4e, 0x3e, 0xbc, 0xbd, 0x5f, + 0xd2, 0xd1, 0x40, 0xc7, 0x7c, 0x75, 0x6a, 0x3b, 0x3e, 0x98, 0xb0, 0x75, + 0x30, 0x9e, 0x49, 0xaa, 0x71, 0xa2, 0x33, 0x5e, 0x41, 0x5e, 0x34, 0xa2, + 0xc1, 0x8d, 0x46, 0x9e, 0xa2, 0x4c, 0xa7, 0x74, 0x55, 0x4b, 0x9d, 0x8e, + 0x6a, 0x6a, 0xbe, 0x3a, 0xd4, 0x92, 0x70, 0x76, 0x93, 0xbb, 0x69, 0x8c, + 0x42, 0x6d, 0x56, 0xa3, 0xca, 0xc7, 0x96, 0x7c, 0x8a, 0x5e, 0xbb, 0x6f, + 0x23, 0x4a, 0xb0, 0x99, 0xc5, 0x84, 0x89, 0xac, 0x54, 0xbe, 0xc3, 0x9d, + 0x68, 0x9f, 0xc0, 0xe3, 0x4f, 0x61, 0x92, 0xd5, 0x89, 0xaf, 0xb2, 0x72, + 0x46, 0x28, 0xc9, 0x96, 0x40, 0x60, 0x72, 0xd1, 0x7f, 0xbf, 0x58, 0xba, + 0xa1, 0x79, 0xac, 0x71, 0xd3, 0xc9, 0xaa, 0xcd, 0x80, 0x4e, 0x9c, 0x64, + 0x44, 0xa6, 0x2f, 0x68, 0x78, 0xc6, 0x87, 0x8a, 0x44, 0xbf, 0xc1, 0x7f, + 0xa2, 0x50, 0x7c, 0x8f, 0x9d, 0xa1, 0xc6, 0x7a, 0x7c, 0xb9, 0x64, 0x83, + 0x6d, 0x97, 0x4b, 0x65, 0x50, 0x46, 0x91, 0x7f, 0x82, 0xa7, 0x65, 0x35, + 0xd8, 0x5d, 0xa0, 0x55, 0x7e, 0xb1, 0x8f, 0x8c, 0x65, 0xd4, 0xd1, 0x77, + 0x44, 0xae, 0xd7, 0x69, 0x94, 0x2f, 0x4b, 0x3d, 0x3c, 0x5c, 0xbd, 0xc0, + 0x9b, 0x49, 0x8a, 0x60, 0x58, 0x71, 0x87, 0x85, 0x6e, 0x83, 0xc5, 0x8c, + 0x40, 0x6a, 0x52, 0x3e, 0x76, 0x3c, 0x8d, 0x3e, 0xd2, 0x49, 0x7c, 0x82, + 0x46, 0x4b, 0xb6, 0x50, 0x27, 0xd2, 0x46, 0x60, 0x27, 0x4f, 0x3e, 0x60, + 0x39, 0xbd, 0xa3, 0xc1, 0xad, 0x78, 0xb2, 0x31, 0x4e, 0x71, 0x65, 0x53, + 0x3e, 0x90, 0x63, 0x98, 0x79, 0xd5, 0x7e, 0xbd, 0x30, 0x67, 0x91, 0xb5, + 0x93, 0x66, 0xb6, 0xad, 0xd2, 0x8b, 0x90, 0xc0, 0xa2, 0x9c, 0xb2, 0xd1, + 0x28, 0x73, 0xa3, 0x75, 0x86, 0xbd, 0x5a, 0xcc, 0x4e, 0x4d, 0xa0, 0xae, + 0x8c, 0xc9, 0x36, 0x49, 0x4a, 0x75, 0x68, 0x98, 0x5b, 0xcf, 0x6c, 0x9b, + 0x3e, 0xbe, 0x9c, 0xaf, 0x62, 0xb1, 0xad, 0x7f, 0xdd, 0xc1, 0x38, 0xb9, + 0x5b, 0x4b, 0x3e, 0x70, 0x68, 0x56, 0xc9, 0x35, 0xa1, 0xbd, 0x4c, 0x81, + 0x6a, 0xb7, 0x4f, 0x42, 0xb8, 0xa2, 0x49, 0x66, 0x93, 0x65, 0x6c, 0xbb, + 0xb2, 0xb8, 0xc4, 0xce, 0x98, 0xac, 0x93, 0xb2, 0xa4, 0xac, 0x70, 0xd2, + 0x74, 0x78, 0xb8, 0x95, 0xd7, 0xa9, 0x51, 0xbe, 0xaa, 0x9c, 0x6c, 0x71, + 0xc1, 0xbd, 0x9c, 0xdd, 0x3d, 0xc2, 0x68, 0xbf, 0x74, 0xcc, 0x71, 0x4b, + 0x83, 0x4b, 0x52, 0x8b, 0x9b, 0xa6, 0x50, 0x75, 0x3c, 0x61, 0x6e, 0x60, + 0x4b, 0xb5, 0x3a, 0xa3, 0x97, 0xd4, 0xa3, 0xc8, 0x93, 0x32, 0xa2, 0x75, + 0xc8, 0xc8, 0xe1, 0x52, 0x74, 0x3b, 0xcb, 0x50, 0x63, 0xa1, 0x6d, 0xb9, + 0x31, 0x67, 0xb3, 0xa6, 0x55, 0xb7, 0xc8, 0xa1, 0xc4, 0x95, 0xab, 0x35, + 0x8e, 0x58, 0x74, 0xa3, 0xc8, 0x8e, 0xb4, 0xbc, 0x72, 0xb2, 0xa5, 0x97, + 0xcd, 0x7e, 0x43, 0x8e, 0xb2, 0x48, 0xa9, 0x9e, 0xba, 0xa1, 0x67, 0xac, + 0xd2, 0xd7, 0x8e, 0x60, 0x94, 0x4d, 0x80, 0xad, 0x3f, 0x3a, 0x9f, 0x62, + 0x5b, 0x9d, 0x40, 0x70, 0xb6, 0x9a, 0x94, 0xb0, 0x66, 0xb3, 0x59, 0xbd, + 0xc6, 0xbf, 0x7f, 0x95, 0x83, 0x8d, 0x9d, 0xad, 0x36, 0x38, 0xb1, 0x4e, + 0xe5, 0x95, 0xcf, 0x93, 0xc4, 0x64, 0xbd, 0xa9, 0x5c, 0x8f, 0xc4, 0x88, + 0xcd, 0x73, 0xd8, 0x5a, 0x3f, 0x9a, 0x52, 0x67, 0x97, 0x7e, 0x60, 0x8f, + 0x7b, 0x4d, 0x70, 0xa8, 0x5e, 0x71, 0x85, 0xb6, 0xc4, 0x97, 0x49, 0x6d, + 0x95, 0x85, 0x70, 0x76, 0x5a, 0x46, 0x82, 0xae, 0x76, 0x88, 0x95, 0x72, + 0x74, 0x7b, 0x3c, 0xc8, 0x63, 0x8d, 0x7f, 0x74, 0xcf, 0x86, 0x5f, 0x6f, + 0x6e, 0x6c, 0x8a, 0x59, 0x4d, 0x65, 0x7b, 0xa4, 0xb3, 0x54, 0xaa, 0xae, + 0xa5, 0x89, 0x49, 0x88, 0x74, 0x57, 0xb2, 0x74, 0x42, 0x9c, 0xc2, 0x78, + 0xc6, 0xa4, 0x31, 0x6b, 0xc3, 0x38, 0x9a, 0x82, 0x65, 0xb4, 0x67, 0x69, + 0xb0, 0x82, 0xa1, 0xc9, 0xa2, 0x4b, 0x42, 0x85, 0xb7, 0xcb, 0xa5, 0x38, + 
0xac, 0x57, 0x91, 0x3b, 0x69, 0xb5, 0x7b, 0x85, 0x95, 0x38, 0xa0, 0x9d, + 0x61, 0x45, 0x8b, 0xcf, 0x9c, 0xac, 0x82, 0x60, 0xaa, 0xb3, 0x63, 0xad, + 0x80, 0xb6, 0xa4, 0xaa, 0x4a, 0x6f, 0xac, 0xb2, 0xc9, 0x5f, 0x5a, 0x3c, + 0xbe, 0x9c, 0xc7, 0x6a, 0x78, 0xb8, 0xab, 0x8c, 0x7f, 0x49, 0x53, 0x7d, + 0x5d, 0x62, 0x69, 0x46, 0x9b, 0x96, 0xa6, 0x44, 0x3c, 0xb8, 0x74, 0x2d, + 0xbc, 0xb2, 0xa3, 0xa7, 0x8c, 0xc3, 0x68, 0x60, 0x9d, 0x31, 0x40, 0xca, + 0xb4, 0x46, 0xb6, 0xae, 0xd2, 0x9c, 0x78, 0x6c, 0x32, 0x84, 0xa3, 0x99, + 0x6e, 0xc6, 0x93, 0x82, 0x5f, 0x3a, 0x97, 0x3a, 0x84, 0x4d, 0x83, 0xaa, + 0x4c, 0x54, 0xb6, 0x4e, 0xa6, 0x51, 0x95, 0xaf, 0x61, 0x53, 0x85, 0x71, + 0x44, 0x34, 0x44, 0xac, 0xcf, 0xa7, 0x9b, 0x41, 0x45, 0x9d, 0xa5, 0x65, + 0x42, 0x5e, 0x60, 0x95, 0x62, 0x5a, 0x67, 0x8e, 0x5c, 0x8d, 0x57, 0xb7, + 0x70, 0x53, 0x85, 0x9b, 0xbb, 0x47, 0xa1, 0xaa, 0xa6, 0xaa, 0xab, 0xb5, + 0x66, 0x96, 0xb5, 0x43, 0xa7, 0x4f, 0x56, 0x7d, 0x8a, 0x86, 0x4c, 0xc0, + 0x56, 0x5b, 0x87, 0xb3, 0x68, 0xab, 0x78, 0xc7, 0x8d, 0x68, 0x52, 0xa8, + 0xcc, 0x8a, 0x79, 0x42, 0xad, 0x60, 0x9b, 0xc8, 0x84, 0x5f, 0x74, 0x8b, + 0xbd, 0x87, 0x7d, 0x86, 0x69, 0xb8, 0x8b, 0x34, 0x70, 0x75, 0xce, 0x92, + 0x3a, 0xaa, 0x64, 0x67, 0x56, 0x38, 0x64, 0x53, 0x7f, 0x4f, 0x4b, 0x7f, + 0xbf, 0x31, 0xc5, 0x8b, 0x4e, 0x70, 0x87, 0xb9, 0x87, 0x3e, 0x74, 0xc7, + 0xc0, 0x5a, 0x4c, 0x5b, 0xb6, 0xa5, 0x91, 0x8d, 0x33, 0xc5, 0x51, 0xb9, + 0x80, 0x88, 0x4c, 0x97, 0xbe, 0x3d, 0xc4, 0xb2, 0x8e, 0x51, 0xac, 0xc2, + 0xbb, 0x39, 0xb4, 0x48, 0x9d, 0xc5, 0x52, 0x8f, 0x89, 0xcd, 0xbc, 0xbb, + 0x57, 0x5e, 0xa6, 0xb9, 0x51, 0x6a, 0xae, 0xd1, 0xb5, 0x4c, 0x7b, 0x3b, + 0x78, 0x8c, 0x88, 0xbe, 0x41, 0x44, 0x5d, 0xd2, 0x83, 0xcc, 0x3d, 0x3c, + 0xd5, 0x73, 0x76, 0x74, 0x3b, 0xac, 0x5f, 0x88, 0x49, 0x9b, 0xbc, 0xc7, + 0x44, 0x50, 0x2d, 0x48, 0x53, 0xb8, 0x54, 0x47, 0x8f, 0x5a, 0xb0, 0x3c, + 0x5f, 0x41, 0x9d, 0xad, 0x42, 0x39, 0x70, 0xa6, 0x71, 0xa7, 0x40, 0xc5, + 0xb2, 0x59, 0x60, 0xbc, 0x81, 0x5c, 0x35, 0x49, 0x96, 0xd1, 0xbd, 0x7d, + 0xad, 0x85, 0x3f, 0x37, 0xc5, 0x91, 0xb8, 0xd1, 0xbb, 0x3b, 0x4a, 0x95, + 0x42, 0x58, 0xa2, 0x91, 0x41, 0x71, 0x36, 0xab, 0xa6, 0x50, 0x3f, 0x9f, + 0x5d, 0x8a, 0x3b, 0xb2, 0xa8, 0x66, 0xaa, 0x4d, 0xbf, 0xaf, 0x69, 0xa6, + 0x8b, 0xc7, 0x31, 0x41, 0x3e, 0x5d, 0x6a, 0xaa, 0xb7, 0x37, 0x5a, 0xc9, + 0x61, 0x37, 0xc5, 0x6d, 0xb4, 0x70, 0xc0, 0xa7, 0x95, 0x33, 0xd0, 0x4a, + 0x36, 0xb3, 0xac, 0x3e, 0x81, 0x9c, 0x4f, 0xcd, 0x58, 0x62, 0x31, 0x89, + 0x7f, 0x65, 0xb3, 0xac, 0x7a, 0x40, 0x57, 0x61, 0x6b, 0xa0, 0x96, 0xb0, + 0x84, 0xa4, 0xc7, 0x5a, 0x54, 0x7b, 0x6c, 0x37, 0x6f, 0x48, 0xa8, 0x44, + 0x9c, 0x62, 0xa0, 0xc3, 0x52, 0x93, 0x94, 0x5a, 0x92, 0x8a, 0x49, 0xa5, + 0x6a, 0x62, 0x95, 0xd2, 0x5b, 0x71, 0xac, 0x57, 0x88, 0x34, 0x82, 0xa3, + 0x4b, 0xb8, 0xb0, 0x44, 0xb7, 0x8e, 0x7a, 0x7b, 0x57, 0xb3, 0x45, 0xbc, + 0xc5, 0x77, 0x55, 0xd1, 0x32, 0x93, 0xa3, 0xa3, 0x36, 0xaf, 0x53, 0x50, + 0x60, 0xbb, 0x80, 0x39, 0xb4, 0xb2, 0xc0, 0xaf, 0x5e, 0xb9, 0x37, 0x59, + 0xd0, 0x9e, 0xbd, 0x41, 0x31, 0x7d, 0x8e, 0x52, 0x49, 0x46, 0xbd, 0x52, + 0x79, 0xb8, 0x80, 0x54, 0x71, 0xd1, 0x7b, 0xc7, 0x75, 0x45, 0x60, 0xbf, + 0x42, 0xcb, 0x52, 0xbf, 0xbd, 0xb7, 0x7a, 0xc4, 0x74, 0x33, 0xb3, 0x61, + 0x74, 0x77, 0x59, 0x73, 0xc5, 0xa1, 0x55, 0x6b, 0x4f, 0x9a, 0xba, 0x5f, + 0xa1, 0xa7, 0x89, 0x8a, 0x9e, 0x40, 0x7a, 0x75, 0x63, 0xab, 0xb1, 0x3b, + 0x3d, 0xa4, 0x58, 0xb3, 0x3d, 0xc6, 0x61, 0x67, 0x94, 0x3a, 0xb6, 0x67, + 0x85, 0x67, 0x78, 0x90, 0x91, 0x45, 0x76, 0x96, 0x47, 0x4e, 0xa6, 0x5d, + 0xc2, 0xbf, 0xcf, 0x71, 0x6d, 0x68, 0x98, 0x47, 0x8d, 0x8d, 0x9a, 0xc6, + 
0x69, 0x88, 0x3a, 0xc8, 0x77, 0xcf, 0xa1, 0x5a, 0x3a, 0xc5, 0x56, 0x9c, + 0xcb, 0x96, 0x4e, 0x3a, 0x42, 0xcf, 0x58, 0x97, 0x7a, 0x74, 0x89, 0x53, + 0x46, 0x46, 0x82, 0x33, 0x36, 0xce, 0x6d, 0x91, 0x4b, 0x42, 0x9e, 0x8c, + 0x6a, 0xa4, 0x63, 0x87, 0x76, 0x8f, 0x37, 0x8e, 0x37, 0x3d, 0xd0, 0x3c, + 0x94, 0x72, 0xa5, 0x42, 0x90, 0x96, 0xae, 0x6c, 0x3b, 0x8a, 0x2f, 0x86, + 0xad, 0x54, 0xa5, 0x38, 0xb4, 0xbe, 0xc7, 0x65, 0xa9, 0x3f, 0x5e, 0x90, + 0x3f, 0xba, 0x50, 0xbc, 0x8a, 0xb0, 0xd0, 0x86, 0x85, 0x54, 0xbc, 0x33, + 0xa4, 0xc6, 0x52, 0x62, 0xd0, 0x55, 0x34, 0x6f, 0x95, 0x84, 0x5b, 0x87, + 0xbf, 0x97, 0xb5, 0x59, 0x8c, 0x62, 0x56, 0x4c, 0xcc, 0x99, 0x67, 0x4c, + 0xa2, 0x85, 0xd0, 0x7a, 0xa2, 0x96, 0x70, 0x97, 0xce, 0x57, 0x8b, 0xac, + 0xa6, 0x61, 0x57, 0xa0, 0x4f, 0x7a, 0x59, 0xa4, 0x79, 0x7a, 0x48, 0x56, + 0x84, 0x35, 0x6c, 0xaf, 0xba, 0x85, 0x74, 0x81, 0x3a, 0x7d, 0x3b, 0x57, + 0xa7, 0x8c, 0xb7, 0x7c, 0x65, 0xb9, 0x51, 0xc3, 0x4b, 0xbd, 0xd3, 0x64, + 0x3f, 0x60, 0x41, 0x5a, 0x7a, 0x7d, 0x8c, 0x66, 0x82, 0x64, 0x96, 0xa8, + 0x94, 0x4a, 0xa1, 0x3a, 0x8c, 0x66, 0x62, 0xd3, 0x91, 0xb0, 0xb8, 0xbd, + 0x7f, 0x49, 0xce, 0xb4, 0xb7, 0x33, 0xb2, 0x3d, 0x3a, 0x6f, 0x6c, 0x31, + 0x84, 0x94, 0x6b, 0x4f, 0x43, 0xbc, 0xcd, 0xcf, 0x61, 0x42, 0xab, 0x53, + 0x7a, 0xc5, 0xe3, 0xc0, 0x49, 0x90, 0xbf, 0x93, 0x82, 0xa7, 0x99, 0x93, + 0x8b, 0x7c, 0xd5, 0xbf, 0x93, 0x92, 0x5b, 0x95, 0x7b, 0x65, 0x92, 0x69, + 0xd4, 0xa7, 0xa8, 0xb3, 0x9f, 0x80, 0xa5, 0xcc, 0x93, 0x42, 0xd3, 0x47, + 0x29, 0x5c, 0x56, 0x6e, 0xa7, 0x4f, 0x6d, 0x90, 0x63, 0x8f, 0x3a, 0x59, + 0x77, 0x83, 0x70, 0x77, 0x50, 0xb0, 0xcf, 0x63, 0x69, 0x98, 0x9d, 0x4a, + 0xad, 0x8e, 0xa6, 0x3d, 0x35, 0xc9, 0x7a, 0x5a, 0x5e, 0x85, 0x76, 0x8d, + 0x7d, 0x58, 0x7d, 0x68, 0x69, 0xd3, 0x5e, 0x76, 0x3e, 0x60, 0x89, 0x4e, + 0x71, 0x2d, 0xb3, 0x4e, 0x52, 0x88, 0xb6, 0xb8, 0x92, 0x30, 0x5f, 0x77, + 0xb1, 0x95, 0xbc, 0x40, 0xa5, 0x8d, 0x36, 0x44, 0x59, 0x42, 0x7c, 0x82, + 0x9d, 0x8d, 0x55, 0x87, 0xc3, 0xc6, 0x67, 0x5e, 0x5d, 0xb0, 0x9d, 0x93, + 0x71, 0x48, 0x4e, 0x2b, 0xa5, 0x4d, 0x34, 0x98, 0x3e, 0x8f, 0xaf, 0xa4, + 0xb2, 0xd, 0x48, 0x21, 0x4c, 0x9c, 0xc3, 0xc7, 0x9b, 0x89, 0xc4, 0x95, + 0xbe, 0xbe, 0x48, 0xdd, 0x37, 0xba, 0xbb, 0x5c, 0x77, 0x63, 0xb4, 0x6c, + 0xaf, 0xb2, 0x28, 0x65, 0x73, 0xa0, 0x9b, 0x96, 0x98, 0x53, 0x6e, 0x3c, + 0x8d, 0xc8, 0xa8, 0xbf, 0xbc, 0xc3, 0xd3, 0xcd, 0x52, 0xc4, 0x63, 0x7f, + 0x77, 0xc1, 0x66, 0x9d, 0x4c, 0xc9, 0x52, 0x35, 0xd1, 0xc2, 0x72, 0xb9, + 0x4d, 0xaf, 0x9d, 0x49, 0xbc, 0x41, 0x60, 0x37, 0x43, 0x7a, 0x77, 0x9b, + 0x96, 0x5a, 0x3e, 0x8b, 0xa5, 0x8d, 0x61, 0x4b, 0x94, 0x8b, 0x39, 0x36, + 0x88, 0x6e, 0x6a, 0x6a, 0x42, 0xa2, 0x75, 0x3b, 0x86, 0x9c, 0x4b, 0x44, + 0x59, 0x8e, 0xd1, 0xc8, 0xcf, 0x41, 0x61, 0xa6, 0x8b, 0xaa, 0x42, 0x8a, + 0x77, 0x6d, 0x7b, 0x86, 0x54, 0x91, 0x58, 0x48, 0x9a, 0xa5, 0x9a, 0x8c, + 0x65, 0xba, 0xbd, 0x3d, 0x3a, 0xc2, 0x9c, 0x86, 0x50, 0x78, 0x6d, 0xd7, + 0x60, 0xc9, 0x8f, 0x42, 0xb3, 0x65, 0xac, 0x6e, 0xb4, 0xcb, 0xb4, 0x61, + 0xaa, 0x9b, 0xaf, 0xa9, 0xca, 0x81, 0x84, 0x67, 0x39, 0x72, 0x6a, 0x94, + 0xc2, 0xab, 0x61, 0x3f, 0xcb, 0x75, 0x5b, 0x47, 0x47, 0xbd, 0x44, 0xa7, + 0xad, 0x93, 0x7a, 0x82, 0x6d, 0x69, 0xdb, 0xb9, 0x70, 0xd0, 0x92, 0xb7, + 0xa7, 0x78, 0xa7, 0x8a, 0x47, 0x4d, 0x32, 0x7b, 0x89, 0x7c, 0xa6, 0xd3, + 0xb4, 0x60, 0x9b, 0x5f, 0xc0, 0x8c, 0x45, 0xad, 0x91, 0xa1, 0xc1, 0x8c, + 0xbf, 0x65, 0xd2, 0x59, 0x20, 0x6a, 0x69, 0xb3, 0x79, 0x40, 0xc3, 0xad, + 0x93, 0x70, 0x46, 0xb7, 0x3a, 0x2d, 0x55, 0x66, 0x47, 0x61, 0x9e, 0xc3, + 0xbe, 0x48, 0xa1, 0xb3, 0x77, 0x80, 0xb2, 0xa0, 0xc3, 0xa4, 0x8d, 0xb0, + 
0x3a, 0x70, 0xa0, 0x92, 0x5b, 0xaf, 0x44, 0xca, 0xaa, 0xb4, 0x4b, 0x55, + 0xcd, 0xdf, 0x56, 0x78, 0xa7, 0xcd, 0x86, 0xa4, 0xa3, 0x9e, 0xb8, 0x78, + 0x9c, 0x44, 0xb9, 0x75, 0x7e, 0xd9, 0x92, 0x5f, 0xaa, 0xa5, 0x9c, 0xa6, + 0xc7, 0x58, 0x86, 0x4b, 0xac, 0x8e, 0xb1, 0xbf, 0x34, 0xa2, 0x76, 0xc4, + 0xa4, 0x2e, 0x4e, 0x73, 0x30, 0x5f, 0xd8, 0x75, 0x8c, 0x3b, 0x5a, 0x76, + 0x5f, 0xd8, 0x52, 0x69, 0x33, 0xd0, 0x94, 0x5c, 0x83, 0x6c, 0x35, 0x5a, + 0x75, 0x42, 0xca, 0xd6, 0x4c, 0x57, 0xab, 0x42, 0xc0, 0xb0, 0x8b, 0xa8, + 0x38, 0xa7, 0x4a, 0x83, 0xdb, 0x3f, 0x83, 0xcf, 0xd1, 0xa7, 0x2d, 0x3a, + 0x92, 0x6a, 0x4f, 0x55, 0xcb, 0x5f, 0x56, 0x44, 0xb1, 0xd0, 0x85, 0xa7, + 0x4c, 0x57, 0x5d, 0x79, 0x7d, 0xbc, 0x8f, 0x74, 0xb4, 0x6a, 0xb2, 0x76, + 0xa4, 0xcc, 0x9a, 0xba, 0x6e, 0x52, 0x6f, 0x8a, 0xa6, 0x45, 0x6a, 0xbf, + 0x38, 0x70, 0x75, 0xd8, 0x57, 0x58, 0x4b, 0x4f, 0x39, 0x9d, 0x8e, 0x48, + 0xc2, 0x82, 0x59, 0x58, 0xcd, 0x4c, 0x7a, 0xb3, 0x51, 0x2c, 0x74, 0x42, + 0x7b, 0xa8, 0x46, 0x72, 0x36, 0xb6, 0xac, 0x51, 0x76, 0xa4, 0x46, 0x30, + 0x72, 0x86, 0x75, 0xcb, 0xad, 0xc2, 0x64, 0xce, 0xae, 0xb2, 0x29, 0x5e, + 0x76, 0xbc, 0x85, 0xa9, 0x74, 0x6e, 0xae, 0x3c, 0xa8, 0x86, 0xaf, 0x46, + 0x51, 0x36, 0x87, 0xb8, 0xa4, 0x78, 0x5a, 0xcc, 0x9c, 0x6c, 0x39, 0x32, + 0xcf, 0x4f, 0x7f, 0xbe, 0x63, 0x8a, 0x81, 0x90, 0xb3, 0x5c, 0x90, 0x67, + 0xd5, 0xbb, 0x95, 0x6f, 0x9b, 0x74, 0x6f, 0xa4, 0x65, 0x64, 0x39, 0xb7, + 0xa0, 0xa4, 0x41, 0x66, 0x78, 0x62, 0xa7, 0x64, 0x9e, 0x83, 0x8b, 0xa5, + 0xa1, 0x9a, 0xa1, 0x7e, 0x90, 0x70, 0x65, 0xc9, 0x39, 0x32, 0x87, 0xc6, + 0x6e, 0x69, 0xa2, 0x64, 0x8b, 0x20, 0xc5, 0x50, 0x7e, 0xa0, 0x89, 0x8b, + 0x51, 0xd0, 0x74, 0x68, 0xbe, 0x48, 0xb8, 0x9a, 0x9d, 0x73, 0x36, 0x42, + 0x6c, 0xcb, 0x94, 0x43, 0x6f, 0x31, 0x79, 0x33, 0x81, 0x22, 0x99, 0x8e, + 0x8d, 0x50, 0x8c, 0x77, 0x7b, 0x51, 0x83, 0x42, 0xd7, 0x7d, 0x4f, 0x2d, + 0x93, 0x45, 0x8e, 0xa5, 0x7d, 0x35, 0x55, 0x3c, 0x39, 0xb5, 0x86, 0x35, + 0xc8, 0xca, 0x8b, 0xa8, 0x4b, 0x48, 0x3b, 0x7d, 0xbd, 0x71, 0x4a, 0x49, + 0x33, 0x61, 0xc6, 0xb0, 0x84, 0x6a, 0x35, 0x4b, 0x96, 0x4a, 0xa0, 0x70, + 0x5f, 0xa3, 0xc1, 0xbd, 0xbe, 0x93, 0xb4, 0x4f, 0x42, 0x7a, 0x56, 0x63, + 0x9a, 0x5f, 0xcc, 0xc2, 0x8f, 0x82, 0x38, 0x43, 0x85, 0x66, 0x9c, 0xb0, + 0x34, 0x73, 0x6e, 0x92, 0xc7, 0x2a, 0x65, 0x7a, 0x84, 0x96, 0x7e, 0xb5, + 0x37, 0x40, 0x40, 0xa5, 0x95, 0xaa, 0xbe, 0x75, 0xc2, 0xc9, 0x3e, 0x46, + 0xd6, 0x90, 0x3d, 0x64, 0xc1, 0x5a, 0xc8, 0x57, 0x3b, 0x3a, 0x4f, 0x69, + 0xda, 0x8e, 0x6c, 0x87, 0xb7, 0x65, 0x8d, 0xac, 0x34, 0x93, 0x60, 0xb9, + 0xce, 0x89, 0x7d, 0xc5, 0x33, 0xc1, 0xac, 0x8a, 0x71, 0x90, 0x56, 0xbd, + 0xd0, 0x9c, 0x69, 0x6a, 0xbf, 0x62, 0x55, 0x6a, 0x9a, 0x4c, 0xca, 0xc8, + 0xa2, 0x41, 0xb6, 0xba, 0x42, 0xb4, 0x78, 0x34, 0x86, 0xa4, 0xc8, 0xce, + 0x8d, 0xab, 0x44, 0x83, 0x61, 0x4a, 0x8c, 0xbc, 0xcf, 0x87, 0xab, 0x6d, + 0x54, 0xba, 0xab, 0xa0, 0x39, 0x74, 0xae, 0x37, 0x7d, 0x73, 0xb7, 0xc1, + 0xa4, 0xb5, 0x96, 0xc4, 0xd0, 0x52, 0x34, 0x7b, 0x68, 0x7f, 0x41, 0x63, + 0x43, 0xa8, 0xbf, 0x66, 0xd2, 0x9c, 0x5b, 0x51, 0x8a, 0x36, 0xd1, 0x6d, + 0x4d, 0xbd, 0xbb, 0x3f, 0x8c, 0x5d, 0xb8, 0xc0, 0x9d, 0xc8, 0x99, 0x83, + 0x3e, 0x70, 0x44, 0x71, 0xa5, 0x82, 0xb8, 0xb6, 0xc0, 0xa8, 0x3c, 0x50, + 0x83, 0x42, 0x91, 0x47, 0x96, 0x62, 0x53, 0x9d, 0x6a, 0xba, 0x6c, 0x63, + 0xb2, 0xbc, 0x45, 0x3f, 0x61, 0xa2, 0x61, 0x68, 0x7c, 0x97, 0xbe, 0x7c, + 0x8e, 0x53, 0x39, 0x62, 0xa8, 0x74, 0xb9, 0xba, 0xa9, 0xc5, 0x6d, 0x5a, + 0x86, 0x71, 0x86, 0x82, 0x8b, 0x95, 0xb9, 0x77, 0x90, 0x6a, 0xa0, 0x3d, + 0x97, 0x96, 0xb1, 0x5d, 0x44, 0x36, 0x9d, 0xc5, 0x48, 0x31, 0x6c, 0x79, + 
0xce, 0x76, 0xc6, 0xc8, 0x36, 0xa8, 0x9d, 0x5b, 0xc5, 0xa1, 0x99, 0x3f, + 0x48, 0x88, 0xa2, 0x51, 0x55, 0x7f, 0x54, 0x40, 0xbe, 0x56, 0x6c, 0x59, + 0x62, 0x44, 0x35, 0x33, 0xab, 0x58, 0x38, 0xa2, 0x69, 0x56, 0x8e, 0x5f, + 0x69, 0x55, 0x75, 0xce, 0x44, 0xbd, 0x9c, 0x9c, 0x89, 0x61, 0x68, 0xbd, + 0x86, 0xa4, 0x92, 0x6c, 0xd3, 0x54, 0x79, 0x9a, 0x9f, 0x56, 0xbc, 0x6d, + 0x3c, 0x4d, 0xa3, 0x83, 0x7f, 0xb3, 0x4d, 0xcb, 0x72, 0x5a, 0x72, 0xb3, + 0x87, 0x35, 0xbd, 0x91, 0x76, 0xcb, 0xa9, 0xa2, 0xc9, 0x77, 0xc1, 0xa9, + 0x85, 0x75, 0x7a, 0x9e, 0x38, 0xb5, 0xbf, 0xc2, 0x7a, 0x46, 0x56, 0x90, + 0x67, 0x7e, 0x6a, 0x8b, 0xc1, 0xad, 0x9b, 0xc2, 0xba, 0x6a, 0x31, 0x70, + 0x99, 0xcc, 0x73, 0xc8, 0xc1, 0xa9, 0x8b, 0x3b, 0xc6, 0x30, 0x70, 0x64, + 0xc6, 0x9e, 0xb9, 0x6a, 0x8a, 0x60, 0x67, 0xa9, 0xb9, 0x34, 0xc4, 0x6c, + 0x34, 0x2f, 0xc4, 0x99, 0x99, 0x69, 0x48, 0x9d, 0x66, 0x55, 0x6f, 0x51, + 0x46, 0x52, 0xce, 0xd3, 0x7b, 0x4b, 0x7c, 0x37, 0xb6, 0xac, 0x2a, 0x3b, + 0x95, 0x6f, 0x98, 0x38, 0x93, 0x42, 0xcb, 0x78, 0xcd, 0x56, 0x78, 0x4f, + 0xc8, 0x85, 0x94, 0x4b, 0x5e, 0xd6, 0xc4, 0x35, 0x6f, 0x7c, 0x51, 0x58, + 0x78, 0xcc, 0xc1, 0x7e, 0x62, 0x80, 0xcc, 0x74, 0x61, 0x8d, 0x70, 0x3d, + 0x6d, 0x3c, 0xa2, 0x6f, 0x52, 0x51, 0x90, 0x77, 0x84, 0xb2, 0x81, 0x71, + 0xa3, 0xc0, 0xb2, 0x46, 0xa9, 0xd2, 0x9d, 0x3b, 0x5a, 0xbf, 0xc0, 0xb3, + 0x7a, 0xa9, 0x55, 0x90, 0xc8, 0x62, 0x42, 0x97, 0xa2, 0xa3, 0x43, 0xb6, + 0xb3, 0x6a, 0xc8, 0x4a, 0x96, 0x79, 0xad, 0x64, 0x78, 0x53, 0x7e, 0x9a, + 0x37, 0x59, 0xbb, 0x4a, 0xb6, 0xb5, 0xb9, 0xcb, 0x80, 0x68, 0xcb, 0xc6, + 0x63, 0xb1, 0xd1, 0x38, 0xbe, 0x35, 0x95, 0xb2, 0xa5, 0x2d, 0x54, 0x47, + 0x36, 0x63, 0x53, 0x7d, 0xbb, 0xac, 0x86, 0xa2, 0xa7, 0xba, 0x40, 0x6c, + 0x68, 0x7d, 0x78, 0x57, 0x47, 0xaf, 0x68, 0x61, 0xcc, 0x40, 0x3e, 0x9d, + 0x92, 0xad, 0x47, 0x73, 0xbe, 0x95, 0x31, 0x45, 0x9f, 0x6b, 0xb4, 0x8f, + 0x50, 0x96, 0xac, 0x38, 0x7f, 0x4b, 0xaf, 0x9f, 0x88, 0x76, 0x7b, 0xca, + 0x77, 0x7f, 0x49, 0x7e, 0xb7, 0x6f, 0x2f, 0x83, 0x7a, 0x84, 0x98, 0xbd, + 0xb8, 0x8f, 0x42, 0x9c, 0x6e, 0x39, 0x31, 0x45, 0xb3, 0xa0, 0x78, 0x76, + 0x52, 0xba, 0x4c, 0x5b, 0x8c, 0xb7, 0x45, 0x80, 0x78, 0xc6, 0x35, 0x83, + 0x90, 0x94, 0x8b, 0xa1, 0x59, 0x8c, 0x8a, 0x50, 0x5f, 0xc9, 0xbb, 0x50, + 0x5b, 0x52, 0x55, 0x2f, 0x56, 0xb5, 0x97, 0xbd, 0xa9, 0x56, 0xb6, 0xcd, + 0x57, 0x5a, 0xbc, 0xaa, 0x47, 0xda, 0xc7, 0x59, 0x4c, 0x5c, 0x8e, 0x6e, + 0x43, 0x66, 0x34, 0x73, 0x60, 0x8c, 0x2e, 0x80, 0x3e, 0x3e, 0x4f, 0x92, + 0x38, 0xc2, 0x31, 0x3a, 0xd4, 0x89, 0x30, 0xcf, 0xa5, 0x8a, 0x42, 0x9f, + 0x89, 0x8a, 0x47, 0x8a, 0xa8, 0x74, 0x6f, 0x2c, 0x55, 0x4c, 0x89, 0x7b, + 0x7e, 0xb2, 0x45, 0xc3, 0xad, 0x59, 0x3f, 0xb7, 0x92, 0x76, 0x83, 0x76, + 0xc5, 0x81, 0x68, 0x3b, 0xc3, 0xb8, 0xcf, 0x9a, 0x66, 0xba, 0x71, 0x3f, + 0x4d, 0x76, 0xc8, 0x9e, 0xa3, 0x3c, 0x5a, 0x77, 0x84, 0xa6, 0x86, 0x3d, + 0x9b, 0xac, 0x42, 0x69, 0x6d, 0x6b, 0x6a, 0xc7, 0x50, 0xb3, 0xaf, 0xb0, + 0x33, 0x96, 0x62, 0xbb, 0x9c, 0x93, 0xca, 0x5a, 0x44, 0xc2, 0x9a, 0x3a, + 0x3d, 0x51, 0x7b, 0x53, 0x97, 0x73, 0xc7, 0x36, 0x5d, 0x4e, 0x37, 0xab, + 0xa8, 0x68, 0x99, 0xad, 0x4e, 0x51, 0x51, 0x4c, 0xcb, 0x59, 0x6f, 0x60, + 0x68, 0x31, 0x51, 0x92, 0xb9, 0x54, 0x26, 0x5b, 0x5a, 0x9e, 0x3c, 0x67, + 0xaf, 0x3b, 0x8f, 0x49, 0x4d, 0x49, 0x57, 0x6a, 0x50, 0x88, 0x5a, 0xac, + 0x43, 0xca, 0x8f, 0xae, 0x3f, 0xa5, 0x8c, 0xa5, 0x4b, 0xb7, 0xa7, 0x79, + 0x55, 0x8a, 0x97, 0x92, 0x39, 0x90, 0xa5, 0xc1, 0x66, 0x50, 0xa2, 0x34, + 0x76, 0x58, 0xa6, 0xcd, 0xcc, 0x98, 0x9e, 0xa2, 0x53, 0x7e, 0xd2, 0x8a, + 0xcb, 0x63, 0x93, 0x39, 0x7d, 0x46, 0xb8, 0x57, 0x84, 0x78, 0x7c, 0x83, + 
0xab, 0x43, 0xa6, 0xa4, 0x77, 0xb4, 0x96, 0xb4, 0xcc, 0x80, 0xb4, 0x42, + 0xa3, 0xc3, 0xb3, 0x3a, 0x91, 0xca, 0x42, 0x78, 0x9e, 0x51, 0xad, 0x86, + 0x73, 0xc5, 0x81, 0xc7, 0x83, 0x8f, 0x9c, 0xcf, 0x5b, 0xbd, 0x4e, 0x36, + 0x72, 0xcb, 0x5d, 0xbe, 0x88, 0xa2, 0x3f, 0x54, 0x5e, 0x5b, 0xcf, 0xa3, + 0x9a, 0x61, 0xdb, 0x78, 0x39, 0x8a, 0xc2, 0xb6, 0xb5, 0x41, 0xb2, 0x37, + 0xca, 0xa7, 0x7e, 0x44, 0x66, 0x55, 0xa6, 0xb9, 0x8e, 0x74, 0xc6, 0x8d, + 0xc4, 0x73, 0x98, 0x9b, 0x58, 0xbe, 0x7a, 0x47, 0xc8, 0x4e, 0x44, 0x3e, + 0xcc, 0x83, 0xc1, 0x54, 0x5b, 0x5d, 0x51, 0x2c, 0xaf, 0x97, 0xa3, 0x5b, + 0x86, 0x9a, 0x9b, 0x90, 0x67, 0x5c, 0x72, 0x81, 0x49, 0x4d, 0x3f, 0x7c, + 0x2a, 0x83, 0xa2, 0xa3, 0x7e, 0xa4, 0x63, 0x8c, 0x7a, 0x77, 0x6f, 0x69, + 0xb0, 0x91, 0xa1, 0x9b, 0x63, 0x57, 0x4e, 0xce, 0xa8, 0xbd, 0x72, 0x42, + 0xa2, 0xb2, 0x3a, 0x87, 0x36, 0xab, 0x71, 0x31, 0x2e, 0x92, 0xb5, 0x48, + 0x3e, 0x88, 0xd3, 0x60, 0x64, 0xb9, 0x6a, 0x58, 0x85, 0x81, 0x9f, 0x87, + 0x7d, 0x6a, 0x96, 0x3d, 0xda, 0x64, 0xc2, 0xc2, 0xad, 0x49, 0x2c, 0x5a, + 0xa8, 0xb9, 0xc7, 0xbd, 0xc0, 0xcc, 0x7d, 0x78, 0x53, 0xd4, 0x72, 0x62, + 0x4b, 0xb5, 0x28, 0x63, 0x4d, 0x6f, 0x7a, 0x35, 0x9c, 0x3b, 0x58, 0x8d, + 0x29, 0x30, 0x75, 0x5e, 0x93, 0xd9, 0x52, 0xc8, 0x44, 0x8f, 0xca, 0x5b, + 0x83, 0x42, 0x7c, 0x34, 0x38, 0x50, 0xc3, 0x8e, 0x69, 0x93, 0x6b, 0x3a, + 0x3f, 0x8b, 0xc8, 0x5a, 0x8a, 0xa3, 0x73, 0x9f, 0x4f, 0xa3, 0xbb, 0xc9, + 0x58, 0x85, 0x44, 0x61, 0xbb, 0x97, 0x2a, 0x57, 0x7b, 0x44, 0x99, 0xb5, + 0xcb, 0x2c, 0xb6, 0xc9, 0x70, 0x5e, 0x95, 0xd2, 0x5c, 0x96, 0xa8, 0x88, + 0xc8, 0x68, 0xb0, 0x9f, 0xad, 0x94, 0x5f, 0x72, 0x96, 0x70, 0x98, 0x9b, + 0xb2, 0x93, 0xca, 0x72, 0xad, 0xbe, 0x6b, 0x90, 0x49, 0x44, 0x93, 0xc1, + 0x65, 0x6f, 0x72, 0x30, 0x41, 0xc6, 0x5f, 0x5b, 0xb9, 0xa6, 0xa1, 0xa2, + 0x37, 0x90, 0x56, 0x9b, 0x77, 0xad, 0xa9, 0xb7, 0x54, 0xae, 0x84, 0x6f, + 0x47, 0x74, 0x53, 0x59, 0xb5, 0x97, 0x8b, 0x5c, 0x6e, 0x79, 0x4d, 0x8e, + 0x5a, 0x37, 0xb2, 0x3e, 0xbd, 0xce, 0xa5, 0x24, 0x76, 0x29, 0xa0, 0x53, + 0x8e, 0xaa, 0x77, 0x86, 0x77, 0x8c, 0x7f, 0x8a, 0xb6, 0x7b, 0xc0, 0xa6, + 0xb4, 0xb8, 0x8d, 0x56, 0x39, 0x7f, 0x7a, 0xa9, 0xc5, 0x5e, 0xb9, 0xa2, + 0xc8, 0xbd, 0x40, 0x5f, 0x6c, 0x64, 0x57, 0x84, 0xbe, 0x8e, 0x46, 0xb0, + 0x83, 0xc5, 0x7b, 0x7c, 0x85, 0x9c, 0xb6, 0x37, 0x7c, 0x7d, 0x9d, 0x48, + 0xd1, 0x95, 0x38, 0x2c, 0x8b, 0x74, 0x3f, 0x61, 0xa5, 0x78, 0x73, 0x5a, + 0xcd, 0x6c, 0x2a, 0x34, 0x65, 0xc8, 0xba, 0x6a, 0xcc, 0x81, 0x5f, 0xb7, + 0x36, 0x2b, 0x95, 0x52, 0x80, 0x7f, 0x46, 0x7c, 0xd4, 0x8b, 0xa3, 0xd4, + 0xb2, 0x88, 0xa2, 0x4e, 0x4e, 0x53, 0x84, 0x5c, 0x85, 0x4b, 0x4d, 0xa1, + 0x89, 0x65, 0x4b, 0x8a, 0x61, 0xb1, 0x90, 0x97, 0x4b, 0x40, 0xcc, 0x9b, + 0xb6, 0x79, 0x99, 0xb0, 0x34, 0xbd, 0x69, 0x43, 0x9b, 0x4f, 0x8e, 0x7d, + 0xb4, 0xa0, 0x53, 0xd8, 0xc5, 0xad, 0xd3, 0x44, 0x6b, 0x51, 0xc5, 0x5e, + 0x63, 0x8e, 0xb5, 0x37, 0x53, 0x78, 0xc6, 0xc9, 0x96, 0x7d, 0x8e, 0x67, + 0x99, 0x8f, 0x86, 0xc1, 0x9f, 0x56, 0x6f, 0x6a, 0x6b, 0xbc, 0xb8, 0x2f, + 0x80, 0x35, 0x9e, 0x3d, 0x42, 0x61, 0x8e, 0x6e, 0x67, 0xb1, 0x45, 0xd2, + 0x60, 0x65, 0x4d, 0xe0, 0x9d, 0x4b, 0x31, 0xaf, 0x59, 0x3f, 0xb7, 0x51, + 0x45, 0x7d, 0x8e, 0x65, 0x47, 0x7c, 0x75, 0x42, 0x44, 0xbf, 0x68, 0xa0, + 0xc8, 0xa4, 0x5c, 0x67, 0x5a, 0x4b, 0xb6, 0xb4, 0x3d, 0x8e, 0x7e, 0xab, + 0x95, 0xc7, 0xb4, 0x48, 0x9a, 0xb6, 0xbd, 0x76, 0xb8, 0x4a, 0x95, 0x7e, + 0xda, 0x92, 0x8e, 0x5c, 0xbb, 0x8b, 0x79, 0x48, 0x55, 0xd9, 0x59, 0xd6, + 0x42, 0xa1, 0x75, 0x50, 0x8d, 0x6d, 0x72, 0x50, 0x5e, 0x77, 0x78, 0x98, + 0x85, 0x9b, 0x91, 0xbc, 0x8a, 0xa6, 0x84, 0x58, 0xa4, 0x6f, 0x87, 0xd5, + 
0xc3, 0x67, 0xb9, 0x38, 0xba, 0xae, 0xb2, 0x4f, 0x93, 0xad, 0x5d, 0x95, + 0x3e, 0xb3, 0x83, 0x3a, 0xbc, 0xc8, 0x46, 0x74, 0x37, 0xa3, 0x87, 0x68, + 0x33, 0x8a, 0x49, 0xb7, 0x2d, 0x78, 0xb8, 0x43, 0xa8, 0xc3, 0x97, 0x79, + 0xc8, 0x71, 0xa4, 0x9e, 0x7d, 0x7c, 0x77, 0x7f, 0x7d, 0x80, 0xa0, 0xd4, + 0xc4, 0x8c, 0x69, 0xa5, 0x3f, 0x9c, 0x3f, 0x27, 0xbf, 0xc5, 0x9f, 0xbf, + 0x89, 0x3e, 0x79, 0x7d, 0x98, 0x41, 0xb8, 0x5c, 0x70, 0x35, 0x46, 0x1f, + 0x81, 0xcd, 0xab, 0x88, 0xa2, 0x45, 0x64, 0x78, 0xac, 0x5f, 0x5a, 0x79, + 0x5b, 0x5e, 0xc7, 0x2a, 0x6f, 0xb3, 0xcf, 0xc6, 0xa9, 0x93, 0xd8, 0x73, + 0x42, 0xb7, 0xcc, 0x74, 0xa7, 0x3a, 0x73, 0x3e, 0x5f, 0x59, 0xa3, 0x42, + 0x55, 0x6e, 0x99, 0x64, 0x86, 0x52, 0x8a, 0x69, 0x8b, 0xe5, 0xc0, 0xa9, + 0x94, 0x55, 0x3e, 0xd5, 0x8a, 0xba, 0xb7, 0xb3, 0xa3, 0x55, 0x38, 0xa0, + 0x91, 0x98, 0x5d, 0xae, 0x9b, 0xc1, 0x45, 0x8e, 0x8c, 0xd0, 0xa4, 0xa7, + 0x9d, 0xcd, 0x91, 0x8c, 0x92, 0xb7, 0xbb, 0x8e, 0xd5, 0xc2, 0xab, 0x8c, + 0xbd, 0x5c, 0xab, 0xb0, 0x41, 0x8e, 0x5a, 0x90, 0xb1, 0x98, 0x85, 0x70, + 0x9e, 0xa3, 0xad, 0x42, 0x7c, 0x71, 0xa5, 0xc5, 0xc9, 0x62, 0x74, 0x3a, + 0x80, 0x91, 0x5c, 0x74, 0xb7, 0x33, 0xad, 0x8b, 0x63, 0x5c, 0x65, 0x98, + 0xcc, 0x39, 0xac, 0x9f, 0x80, 0x49, 0x96, 0xc2, 0xc1, 0x71, 0xbf, 0x8e, + 0x4a, 0xb4, 0x56, 0x85, 0xbb, 0x83, 0xbc, 0x83, 0xa4, 0xb0, 0x6f, 0x64, + 0x7b, 0x64, 0xa7, 0xd1, 0x8d, 0xcd, 0x9c, 0x51, 0xad, 0x39, 0x6b, 0xd5, + 0x97, 0x60, 0xa3, 0x71, 0xa4, 0x61, 0xa5, 0xd1, 0x50, 0x31, 0xca, 0x75, + 0x59, 0x87, 0x64, 0xc9, 0x68, 0x55, 0x1b, 0x7b, 0x7a, 0x74, 0xaa, 0xad, + 0xb3, 0x47, 0x51, 0xd3, 0x65, 0x5f, 0x79, 0x4d, 0xa0, 0x3b, 0xa0, 0x1e, + 0x32, 0x24, 0xc8, 0xa4, 0xc8, 0x82, 0x3a, 0x5c, 0x9a, 0xb8, 0x55, 0x8d, + 0x99, 0x99, 0x5f, 0x65, 0x55, 0x7d, 0x3d, 0x91, 0x89, 0x98, 0xd1, 0xb1, + 0x8f, 0x57, 0x9e, 0xcf, 0x7f, 0xbb, 0x38, 0x8c, 0x4d, 0x3d, 0x7a, 0x79, + 0x8f, 0x4e, 0xc3, 0x9a, 0x8c, 0x9a, 0x45, 0xb0, 0xc8, 0x3a, 0x93, 0x96, + 0x78, 0x3a, 0x71, 0x6f, 0x86, 0x34, 0x3e, 0x31, 0xaf, 0xaa, 0x2f, 0x7e, + 0x9a, 0x53, 0x9b, 0x72, 0x80, 0x88, 0x69, 0x6d, 0x4c, 0x61, 0x80, 0x64, + 0x3b, 0x79, 0x85, 0xb4, 0xb3, 0x6e, 0xb7, 0x4f, 0x4d, 0x70, 0x78, 0x5f, + 0xc8, 0xc3, 0x38, 0x87, 0x85, 0x50, 0x36, 0xaa, 0xcc, 0xcb, 0xc4, 0x31, + 0x80, 0x96, 0x49, 0x68, 0x75, 0x4a, 0x48, 0xc6, 0x3f, 0x8b, 0x59, 0x35, + 0x58, 0x90, 0xa5, 0x95, 0x8e, 0x58, 0x36, 0x54, 0x50, 0x82, 0x5f, 0x9f, + 0xa5, 0x53, 0xa2, 0xb3, 0x57, 0xb5, 0x7a, 0x88, 0xc0, 0x42, 0xa1, 0xaa, + 0x51, 0xaf, 0x5b, 0x85, 0xbd, 0x89, 0x33, 0xbf, 0x69, 0x7c, 0xbe, 0xb5, + 0xb1, 0x91, 0xd2, 0x64, 0x39, 0xd2, 0xa1, 0x4f, 0x93, 0x4b, 0x7c, 0x3b, + 0x9c, 0x91, 0x93, 0x42, 0xb8, 0x9b, 0x9c, 0x8b, 0x67, 0xa7, 0x63, 0x7e, + 0xa7, 0xa5, 0x7a, 0x54, 0xa2, 0x95, 0x78, 0x9b, 0x71, 0x81, 0xa1, 0x5b, + 0x42, 0x3a, 0xb9, 0x3c, 0xb8, 0xbe, 0x5c, 0x4d, 0x74, 0x30, 0x60, 0x31, + 0xb4, 0xc3, 0x6c, 0xce, 0x63, 0x84, 0x63, 0x79, 0x44, 0x8f, 0x63, 0x3b, + 0x72, 0x3a, 0x77, 0xab, 0x8e, 0x55, 0x7e, 0x55, 0x77, 0x62, 0x50, 0x85, + 0xa2, 0xa2, 0x68, 0x39, 0x48, 0x6b, 0xb0, 0x9b, 0x4b, 0x94, 0x69, 0x73, + 0x58, 0x8e, 0x52, 0x43, 0xa0, 0x36, 0x60, 0xc8, 0xa3, 0x70, 0x75, 0x65, + 0x8f, 0xb7, 0xad, 0x4e, 0x91, 0x6e, 0x89, 0xc8, 0x5b, 0x9f, 0x76, 0x94, + 0x94, 0x54, 0x94, 0x7d, 0x37, 0x83, 0xc7, 0x90, 0xac, 0x81, 0x3e, 0x6c, + 0x9b, 0xc9, 0xcf, 0x90, 0x42, 0xb5, 0xcb, 0x7d, 0xbc, 0x50, 0xb8, 0x5f, + 0x70, 0x5e, 0x7c, 0x7e, 0x36, 0x33, 0x39, 0x72, 0x5c, 0xd0, 0x9f, 0xb9, + 0x4b, 0xa4, 0x30, 0x8e, 0x66, 0xd3, 0xcc, 0xc8, 0xb3, 0x6c, 0x96, 0x56, + 0xba, 0x59, 0x3a, 0x33, 0x3e, 0x63, 0x4d, 0x8e, 0x92, 0x71, 0xb9, 0x7c, + 
0x3c, 0x49, 0x7b, 0xa3, 0x46, 0x7d, 0x87, 0xa8, 0x6c, 0x62, 0x67, 0xae, + 0x3c, 0x70, 0x55, 0x96, 0x70, 0x6d, 0x6d, 0x37, 0x59, 0xd0, 0xd5, 0xdd, + 0xb3, 0x48, 0xb4, 0x68, 0x33, 0x7f, 0x80, 0x3b, 0xc6, 0x71, 0x56, 0xa8, + 0xd1, 0x51, 0x8d, 0xc0, 0x72, 0x48, 0xbc, 0xaf, 0x89, 0xba, 0xa9, 0x57, + 0x77, 0x78, 0x69, 0xae, 0x90, 0x87, 0x99, 0x49, 0xd0, 0x4e, 0x77, 0x35, + 0xc6, 0xa3, 0xb6, 0xbb, 0xc3, 0xb7, 0x38, 0xc6, 0xb5, 0x7c, 0x99, 0xc5, + 0x8b, 0xd5, 0xc7, 0xcd, 0x93, 0x5b, 0x87, 0x64, 0x7a, 0x48, 0xaf, 0x3c, + 0xa0, 0x83, 0x6c, 0x35, 0xa1, 0xb5, 0xc3, 0xcd, 0x91, 0x9d, 0x7e, 0x81, + 0xa1, 0x91, 0x7f, 0xc1, 0x92, 0x40, 0xb2, 0x3e, 0x39, 0x47, 0x50, 0xa8, + 0xcb, 0x78, 0x89, 0xc8, 0x6e, 0x55, 0x75, 0x4d, 0x86, 0xd2, 0xa5, 0xb5, + 0x8e, 0x38, 0xd6, 0x78, 0xd3, 0xcb, 0x99, 0x81, 0x9d, 0x90, 0x50, 0x5f, + 0x67, 0xaf, 0xce, 0xc5, 0x62, 0x3c, 0x38, 0x9a, 0x44, 0x4f, 0x3f, 0xcb, + 0xa2, 0xd3, 0x99, 0xd0, 0x47, 0x3a, 0x7f, 0x88, 0x63, 0x4a, 0xa4, 0x8c, + 0x8d, 0x84, 0x36, 0x84, 0x68, 0xb1, 0x7f, 0x4d, 0x3a, 0x53, 0x9e, 0xae, + 0x91, 0x7d, 0x95, 0xd1, 0xca, 0x8e, 0x55, 0x87, 0x7b, 0x40, 0xc6, 0xb3, + 0x46, 0xc8, 0x56, 0x9f, 0x8b, 0xa5, 0x7c, 0x7f, 0x5f, 0x4a, 0xb0, 0xc1, + 0x9f, 0xb4, 0xa1, 0x32, 0x5c, 0xc9, 0xc5, 0x43, 0x57, 0x73, 0x4f, 0x80, + 0xc0, 0xc7, 0xd1, 0x80, 0x81, 0x7f, 0x66, 0xb4, 0x4d, 0x61, 0x56, 0xbd, + 0x96, 0x3c, 0xca, 0x6e, 0x7e, 0x4d, 0xb3, 0x83, 0xca, 0x9e, 0xcc, 0x5c, + 0x43, 0x35, 0x74, 0x96, 0xbd, 0x59, 0x7b, 0xa1, 0x6f, 0x4e, 0xc7, 0x39, + 0x81, 0x85, 0x3a, 0x45, 0x88, 0xbe, 0x6a, 0x9e, 0x73, 0xc7, 0x49, 0x91, + 0x68, 0x8c, 0x89, 0xb5, 0x52, 0x3c, 0x9c, 0xc4, 0x8b, 0x52, 0xc4, 0x98, + 0x65, 0x45, 0x32, 0xb0, 0x3c, 0x35, 0xa1, 0x49, 0x54, 0x39, 0xb5, 0x7e, + 0x6f, 0x97, 0x60, 0xaf, 0x8b, 0xcc, 0x61, 0xc4, 0xb3, 0x45, 0xa2, 0x50, + 0x3d, 0x42, 0x7c, 0x57, 0x4a, 0x8a, 0x86, 0x7e, 0xa9, 0x55, 0x67, 0x54, + 0x78, 0x3e, 0x72, 0xba, 0x8e, 0x82, 0x4b, 0x64, 0xb4, 0x97, 0xa9, 0xcd, + 0x39, 0x99, 0x7f, 0x5a, 0x4d, 0xc4, 0x39, 0x3a, 0x81, 0x80, 0x94, 0xcf, + 0xaa, 0xbf, 0x37, 0xa4, 0x5f, 0x35, 0x5a, 0xa2, 0x40, 0x96, 0x7c, 0x41, + 0xd1, 0x3e, 0x42, 0x35, 0x7f, 0xbe, 0x3c, 0x5c, 0x99, 0x75, 0x95, 0xd1, + 0x53, 0xa6, 0xa6, 0xa2, 0x5e, 0x43, 0xa9, 0x43, 0x30, 0x39, 0x98, 0x9b, + 0x95, 0x81, 0xc8, 0x8f, 0x7c, 0x70, 0x75, 0xb8, 0x60, 0x4f, 0x93, 0x6c, + 0x3b, 0x6b, 0x64, 0xcc, 0xa8, 0x3a, 0x6c, 0x61, 0x47, 0x42, 0x69, 0x4b, + 0x79, 0x8d, 0x74, 0x9c, 0x4f, 0x76, 0x3f, 0xc1, 0x3c, 0x42, 0x35, 0x45, + 0xa8, 0x85, 0x4e, 0x93, 0x68, 0x88, 0x71, 0xaa, 0x36, 0x60, 0x54, 0x9d, + 0xba, 0xc6, 0x6f, 0x5e, 0x50, 0xa2, 0x8b, 0xb1, 0x30, 0x50, 0x85, 0x62, + 0x8c, 0x8b, 0xad, 0x55, 0xab, 0x46, 0x90, 0x84, 0x8f, 0xa6, 0x50, 0xa3, + 0x6e, 0x49, 0x45, 0xb1, 0x81, 0xb1, 0x8a, 0x42, 0x6c, 0x38, 0xd4, 0x52, + 0x30, 0xc4, 0x60, 0x89, 0x57, 0x3b, 0xa4, 0x40, 0x30, 0x53, 0x8e, 0x9e, + 0xb7, 0x95, 0x50, 0x9b, 0xd9, 0xd3, 0x3a, 0xb1, 0xae, 0x67, 0x50, 0x57, + 0xc4, 0x6e, 0xdf, 0x36, 0xa0, 0x50, 0xb8, 0x6f, 0x90, 0x6d, 0xa0, 0x95, + 0x94, 0x82, 0x6f, 0x71, 0x76, 0x6b, 0xa7, 0xbc, 0x6a, 0x61, 0x54, 0x6d, + 0x3c, 0xb4, 0x7a, 0x59, 0x7a, 0x5b, 0xd1, 0x82, 0x65, 0x5e, 0x70, 0x8c, + 0xc5, 0x9a, 0x4d, 0x44, 0xb9, 0x4b, 0xbe, 0x58, 0x50, 0x8a, 0x52, 0xa3, + 0x5c, 0xd6, 0xa6, 0x9b, 0xb9, 0x74, 0x51, 0x3c, 0x81, 0xd3, 0x49, 0xaf, + 0x59, 0xa7, 0xbd, 0x87, 0x34, 0x9f, 0x4f, 0x72, 0x61, 0x6a, 0xc8, 0xc8, + 0x9a, 0x47, 0x72, 0x9a, 0x9c, 0x75, 0x79, 0xc3, 0x35, 0xac, 0x8c, 0x6f, + 0x4c, 0xbc, 0x42, 0x6b, 0x3f, 0x91, 0xbc, 0x57, 0x71, 0x72, 0x56, 0x3e, + 0x9c, 0xd2, 0xb7, 0xa3, 0x77, 0x7d, 0x3c, 0x3c, 0x8d, 0x49, 0x7f, 0x85, + 
0x70, 0x64, 0x4b, 0x3a, 0x52, 0x9d, 0x98, 0x74, 0x33, 0x42, 0x94, 0x86, + 0x68, 0x40, 0xd2, 0x96, 0x93, 0xaa, 0x70, 0x3a, 0x60, 0x58, 0x6e, 0x9c, + 0xb8, 0xe1, 0xdd, 0x7a, 0x2c, 0xa2, 0xac, 0xd9, 0x50, 0xbc, 0x8c, 0xac, + 0x76, 0x6e, 0x6c, 0x37, 0x99, 0xad, 0xb7, 0xc7, 0xae, 0x98, 0xc7, 0x37, + 0x97, 0xa7, 0x2e, 0x8c, 0xc9, 0xab, 0x74, 0x4c, 0xba, 0xc5, 0xac, 0x9a, + 0x71, 0x9c, 0x58, 0x65, 0xa7, 0x9b, 0x9f, 0xa9, 0x87, 0x49, 0xba, 0x5e, + 0xca, 0x65, 0xd1, 0xbd, 0x8e, 0xc4, 0x83, 0x62, 0x9b, 0xa8, 0xa8, 0xc4, + 0x49, 0xbe, 0xa3, 0x4f, 0x7a, 0xc0, 0x3b, 0x3e, 0xb9, 0xb2, 0x68, 0x83, + 0x86, 0xc8, 0xca, 0x81, 0xa5, 0x46, 0x53, 0x55, 0xca, 0xa7, 0x42, 0xb6, + 0xb0, 0x76, 0x6b, 0x44, 0x78, 0x43, 0x6f, 0x49, 0x89, 0xd4, 0x77, 0x95, + 0x3a, 0xa2, 0xb2, 0x97, 0x8a, 0xb7, 0xe0, 0x5a, 0x4d, 0x52, 0x63, 0xd7, + 0x5e, 0xc5, 0xba, 0x72, 0x50, 0x5a, 0xbc, 0xa6, 0x73, 0xaa, 0xb9, 0xc0, + 0x8b, 0x4c, 0x88, 0x62, 0x5b, 0x7b, 0xc0, 0xa7, 0x93, 0x8b, 0x64, 0x86, + 0xa9, 0x65, 0x99, 0xae, 0x50, 0x69, 0x74, 0x5c, 0xa1, 0x37, 0xb9, 0x8c, + 0xc5, 0x4f, 0x69, 0xaa, 0x66, 0x63, 0x3c, 0x66, 0x53, 0x41, 0x62, 0x37, + 0x4b, 0xa0, 0x46, 0x8a, 0x7f, 0xca, 0xa6, 0xab, 0xb4, 0x89, 0x7e, 0x35, + 0x2b, 0x1b, 0x2c, 0x4c, 0x64, 0x47, 0x68, 0x8d, 0xb0, 0x8e, 0xbb, 0x91, + 0x9f, 0x40, 0xc7, 0xb7, 0xa5, 0x65, 0xae, 0xc9, 0xc3, 0x77, 0xba, 0xa6, + 0x9e, 0x89, 0x83, 0xcb, 0xa1, 0x30, 0x66, 0xbe, 0xb8, 0xc5, 0xc1, 0x72, + 0x98, 0x3f, 0x7f, 0x51, 0xaf, 0xbc, 0xb8, 0x5c, 0xab, 0x63, 0x37, 0x86, + 0x42, 0x8b, 0x50, 0x8a, 0xcb, 0x99, 0x51, 0x97, 0x52, 0x63, 0x46, 0x7f, + 0xdd, 0x77, 0x5a, 0x6f, 0x88, 0x9f, 0x7e, 0x9d, 0x68, 0x4b, 0xae, 0x4c, + 0x48, 0x38, 0xaa, 0x80, 0x60, 0x5c, 0x8a, 0x75, 0x92, 0xbc, 0x77, 0x62, + 0x7c, 0x51, 0x46, 0x9a, 0xbd, 0xb1, 0xd0, 0x6b, 0x70, 0xbe, 0x4b, 0x8d, + 0x79, 0x9f, 0x9e, 0x82, 0xc2, 0x63, 0x45, 0x77, 0x82, 0x96, 0xa3, 0x7b, + 0x75, 0xc8, 0x8e, 0x78, 0x6a, 0x56, 0x8d, 0x46, 0xb1, 0x7b, 0x60, 0x2f, + 0x3f, 0xbc, 0x41, 0xca, 0xc2, 0x7e, 0x4a, 0xc4, 0xaa, 0x87, 0xa8, 0xbd, + 0x3a, 0xa6, 0x43, 0xbb, 0x2c, 0xa5, 0x82, 0xa4, 0x6d, 0xb1, 0x45, 0xbd, + 0xe1, 0x5b, 0xa4, 0x6e, 0xd5, 0x9e, 0xaf, 0xa4, 0x55, 0x86, 0x6c, 0x37, + 0x7c, 0xa4, 0x5d, 0x7f, 0x80, 0x41, 0x8c, 0x6a, 0x44, 0x91, 0xb3, 0xd0, + 0xb5, 0x2a, 0xb3, 0xb9, 0x8b, 0x7b, 0xbc, 0xb1, 0x69, 0x84, 0x7c, 0x97, + 0x86, 0x63, 0xcc, 0x9d, 0x72, 0xca, 0x72, 0x63, 0x5e, 0x50, 0x8f, 0x42, + 0xb8, 0xb2, 0x97, 0x9a, 0x37, 0xad, 0x56, 0x7c, 0xd7, 0xa0, 0xca, 0x82, + 0xa8, 0xcc, 0xcb, 0x96, 0x65, 0x73, 0xc0, 0x4a, 0x9f, 0xaf, 0x60, 0x76, + 0x55, 0xa1, 0x9d, 0x8e, 0xca, 0x94, 0xd0, 0x7b, 0x81, 0x93, 0x4e, 0x7a, + 0x7c, 0x9b, 0x63, 0x90, 0x87, 0xc6, 0x6d, 0xac, 0x63, 0x71, 0x87, 0x53, + 0xe2, 0x89, 0x98, 0x77, 0x98, 0x62, 0xce, 0x80, 0x55, 0x71, 0x5a, 0x41, + 0xb8, 0x8c, 0x7f, 0x9e, 0xd2, 0x85, 0xa6, 0xa0, 0xc3, 0x68, 0x8a, 0x90, + 0xe2, 0xa2, 0x91, 0xb7, 0x8b, 0x77, 0x2e, 0x7d, 0xab, 0x7c, 0x8d, 0x8f, + 0x8c, 0x3b, 0x8e, 0xa7, 0x8d, 0xaa, 0x51, 0x5f, 0xa9, 0x7c, 0x72, 0x47, + 0x48, 0xc3, 0x4b, 0x5e, 0x3f, 0xa8, 0x71, 0xc3, 0x64, 0x61, 0x56, 0xa8, + 0x5e, 0xbc, 0x9d, 0xad, 0x81, 0x51, 0x79, 0xba, 0xc2, 0x9a, 0xc4, 0xb2, + 0x68, 0x4f, 0xc6, 0xa1, 0x46, 0x8e, 0xb9, 0x7e, 0x41, 0x6e, 0x9d, 0xbc, + 0xb2, 0xc4, 0xaf, 0xbe, 0x6e, 0x9e, 0x5b, 0xb5, 0x58, 0x73, 0x30, 0x51, + 0x3c, 0x2f, 0xb5, 0xbc, 0x68, 0xcc, 0xbd, 0x7f, 0x8e, 0xa0, 0x32, 0x8b, + 0x98, 0x41, 0x95, 0x83, 0x70, 0xb3, 0x4a, 0xc2, 0xc4, 0xcd, 0x44, 0xa0, + 0xc1, 0xa9, 0xc9, 0x60, 0x6c, 0x9d, 0xc5, 0x91, 0x76, 0x3d, 0x62, 0x82, + 0x5e, 0x66, 0x6d, 0x4b, 0xd0, 0x74, 0x39, 0xbb, 0x7a, 0xdc, 0xa8, 0x76, + 
0x68, 0xab, 0x4a, 0xbf, 0xbd, 0x8a, 0x47, 0xc3, 0x8f, 0xbf, 0x8f, 0x90, + 0x4e, 0x7e, 0x86, 0x77, 0xbb, 0x97, 0x8b, 0xa1, 0xb6, 0x7b, 0xa3, 0x67, + 0x8e, 0xa1, 0x4f, 0x50, 0x5c, 0xbe, 0x79, 0xbd, 0x4d, 0xb9, 0x9b, 0x75, + 0x82, 0x45, 0xcf, 0x37, 0x40, 0x4a, 0x6d, 0x58, 0x63, 0xa4, 0x74, 0xa3, + 0x74, 0x6d, 0xca, 0x72, 0xb5, 0x81, 0x77, 0xaa, 0xb1, 0x74, 0x3c, 0x81, + 0x99, 0x6d, 0xb4, 0xac, 0xdc, 0xc4, 0xcc, 0x8a, 0xbb, 0x9a, 0x8b, 0xa4, + 0xaf, 0x7a, 0x9c, 0x83, 0xaa, 0x44, 0x57, 0xca, 0xbd, 0x69, 0xa4, 0xb3, + 0xa8, 0x9e, 0xb4, 0x79, 0x85, 0x7a, 0x3b, 0xbc, 0xab, 0x67, 0x89, 0x5f, + 0x91, 0x5e, 0x83, 0x3c, 0xaf, 0xb4, 0x77, 0xac, 0x75, 0xbf, 0x7a, 0xad, + 0x35, 0xa8, 0x35, 0xd8, 0x60, 0x81, 0x91, 0x4b, 0xc7, 0x77, 0xb1, 0xbc, + 0x44, 0xa5, 0x38, 0x47, 0xa4, 0x27, 0x23, 0xb8, 0xbf, 0x71, 0x5a, 0xa3, + 0x67, 0x73, 0x6d, 0x76, 0x44, 0x76, 0x49, 0x75, 0x89, 0x75, 0xba, 0x9e, + 0x9d, 0x87, 0x65, 0x6f, 0xa0, 0x81, 0x55, 0x47, 0x63, 0x74, 0x66, 0xa0, + 0x6a, 0x34, 0x50, 0x35, 0x90, 0x53, 0x75, 0x84, 0x8d, 0x9d, 0x94, 0x55, + 0x53, 0x62, 0x99, 0x43, 0xb5, 0x33, 0x91, 0x6f, 0x9a, 0x78, 0x55, 0x54, + 0xb7, 0xd3, 0x61, 0xc1, 0xac, 0xad, 0xb0, 0x39, 0x4b, 0xac, 0xa3, 0x6b, + 0xcb, 0x8a, 0x8e, 0x74, 0xbd, 0xa3, 0x5c, 0xaf, 0xbf, 0xaa, 0x70, 0x5d, + 0x85, 0x7c, 0x85, 0x8b, 0x45, 0x4b, 0xa4, 0x92, 0x51, 0x6d, 0xaf, 0x3d, + 0x6d, 0xa8, 0xa8, 0x68, 0xa6, 0x74, 0x9e, 0x40, 0xcf, 0x70, 0x37, 0xbf, + 0x6e, 0x36, 0x59, 0x69, 0xab, 0x80, 0x43, 0x47, 0x8d, 0x4e, 0xb3, 0x74, + 0x3a, 0x66, 0xb5, 0x8d, 0x9c, 0x91, 0xa3, 0x35, 0x9c, 0x3b, 0x4f, 0x4f, + 0xa8, 0x92, 0xb4, 0x3f, 0xb0, 0x83, 0xca, 0x58, 0x5e, 0x41, 0x58, 0x7c, + 0x42, 0xa7, 0x9e, 0x92, 0x9a, 0x68, 0xa1, 0xb0, 0xce, 0x93, 0x91, 0xbd, + 0x5f, 0xd4, 0x8b, 0x44, 0x4c, 0x31, 0x8e, 0xce, 0x69, 0x59, 0xc3, 0x98, + 0xc8, 0xa3, 0x98, 0x8b, 0x35, 0x66, 0x3a, 0x9c, 0x4d, 0xc5, 0xb5, 0xe6, + 0xb1, 0x5f, 0x80, 0x2d, 0xba, 0x5d, 0x3e, 0x61, 0x9e, 0x7f, 0xae, 0x6a, + 0x4d, 0xbe, 0x56, 0x41, 0x8a, 0xd0, 0xa7, 0x32, 0xcf, 0xaa, 0x4d, 0x3e, + 0xad, 0xb4, 0x70, 0xba, 0xb0, 0x52, 0x35, 0x86, 0xc9, 0xb0, 0x37, 0x7c, + 0xb2, 0x38, 0x5c, 0xa0, 0x69, 0x87, 0x5a, 0x62, 0x62, 0x54, 0x6e, 0x8e, + 0x8f, 0xb7, 0x62, 0xb4, 0x71, 0x43, 0x7b, 0xad, 0x5f, 0x75, 0xac, 0xb2, + 0x63, 0x34, 0xb4, 0x42, 0xb4, 0x56, 0x68, 0x4d, 0x8e, 0x54, 0x3c, 0x90, + 0xcc, 0xbe, 0xa8, 0x8d, 0x5b, 0x9d, 0xc3, 0x53, 0x69, 0x58, 0x54, 0x9a, + 0x33, 0x57, 0xa1, 0x75, 0x8d, 0xcc, 0xaf, 0x97, 0x6d, 0xb4, 0x48, 0xa8, + 0x9f, 0x66, 0x89, 0x91, 0x8e, 0x61, 0x7e, 0x9d, 0xd3, 0x90, 0xd0, 0xbe, + 0x8c, 0x8a, 0x8a, 0x9b, 0xc1, 0x38, 0xb0, 0x3d, 0x72, 0x63, 0x94, 0x66, + 0x9c, 0x44, 0x5e, 0xcb, 0x5d, 0x85, 0xa4, 0x54, 0x70, 0x87, 0xcc, 0x64, + 0x70, 0xb1, 0xb6, 0x95, 0x36, 0xcd, 0xd6, 0xa1, 0x5a, 0xa7, 0x51, 0x4f, + 0xb2, 0xc3, 0x8d, 0x7d, 0x3c, 0xa3, 0xbf, 0xa8, 0x6c, 0x74, 0xac, 0xa3, + 0x52, 0x77, 0x9e, 0xa3, 0xc6, 0x95, 0xbd, 0x41, 0xc6, 0x99, 0x88, 0xc0, + 0x3c, 0x4b, 0xa3, 0x32, 0xbd, 0x57, 0x7a, 0x51, 0xbb, 0xaa, 0x3b, 0x57, + 0x54, 0xa0, 0x39, 0x89, 0x7d, 0x2a, 0x5c, 0xa5, 0x67, 0xa7, 0x3c, 0xaf, + 0xd2, 0x4d, 0xd3, 0x4c, 0xaf, 0xcf, 0xcc, 0x9f, 0x67, 0x65, 0x50, 0x87, + 0x9c, 0x56, 0x58, 0x4a, 0x49, 0xab, 0x84, 0x5f, 0xa4, 0x55, 0x91, 0x8b, + 0xa0, 0x7b, 0x42, 0x48, 0x4c, 0xc2, 0x71, 0x4b, 0x34, 0x55, 0x39, 0x62, + 0x9b, 0xae, 0x3b, 0x9f, 0xbf, 0x68, 0x79, 0x84, 0x43, 0xa6, 0xa1, 0xb4, + 0x6f, 0x8a, 0xc9, 0x3c, 0xab, 0x88, 0x67, 0x80, 0x30, 0x75, 0x99, 0x3b, + 0xce, 0x68, 0x54, 0xad, 0x3a, 0x97, 0xc4, 0x5b, 0x5e, 0xcc, 0xcf, 0x9a, + 0x6f, 0x97, 0x4c, 0x44, 0x90, 0x60, 0x7b, 0x5f, 0x54, 0x67, 0x5e, 0x9d, + 
0x47, 0x4c, 0xaf, 0x9e, 0x6c, 0x6e, 0x56, 0xa9, 0xd0, 0x8f, 0xca, 0x85, + 0x97, 0x74, 0x84, 0x87, 0xa1, 0x4b, 0x83, 0x3a, 0x88, 0xa7, 0x9c, 0x8b, + 0xb0, 0x53, 0x9e, 0x7e, 0x34, 0xc9, 0xaf, 0x95, 0x9b, 0x9d, 0x89, 0x63, + 0x7a, 0xc7, 0xcf, 0x72, 0x9d, 0xbb, 0xc9, 0xc7, 0x95, 0x3b, 0xc3, 0xb6, + 0x8e, 0x5b, 0x92, 0x43, 0xb4, 0x50, 0x55, 0x43, 0xa8, 0x79, 0xb9, 0x36, + 0xab, 0xa7, 0x5c, 0xce, 0x5f, 0x47, 0x98, 0x3c, 0x6a, 0x3a, 0x9a, 0x8f, + 0xa2, 0x82, 0x78, 0x69, 0x86, 0x9e, 0xaf, 0xa7, 0x68, 0x7f, 0x73, 0x32, + 0x8e, 0x53, 0x8d, 0x6f, 0x50, 0x4b, 0x55, 0xa1, 0x4a, 0xb4, 0x57, 0xb8, + 0xcd, 0xd2, 0x4c, 0xcf, 0x4d, 0xba, 0x59, 0x3b, 0xc8, 0x4e, 0x45, 0xd2, + 0x8b, 0xcf, 0x41, 0x9d, 0xb6, 0xb2, 0xd1, 0x5c, 0xc4, 0x96, 0xa6, 0x89, + 0x76, 0x5a, 0x34, 0xb7, 0x42, 0xa8, 0x87, 0x8a, 0x98, 0xc9, 0xce, 0xaf, + 0x8a, 0x7e, 0x7b, 0xbd, 0x9d, 0xc4, 0x88, 0x49, 0x99, 0x57, 0x9d, 0x9c, + 0xb0, 0x7a, 0xa0, 0x63, 0x32, 0x8a, 0x41, 0x4b, 0xa3, 0x4d, 0x81, 0x30, + 0x67, 0x53, 0x75, 0x4c, 0x91, 0x62, 0xb5, 0x77, 0xb3, 0x93, 0x5b, 0xb2, + 0x4c, 0x7f, 0xa3, 0x30, 0xc7, 0x93, 0x6d, 0x43, 0x80, 0x5f, 0x95, 0x6a, + 0x9e, 0xc2, 0x92, 0xde, 0xc4, 0x8a, 0x70, 0x4b, 0xc3, 0x3f, 0x78, 0x94, + 0x3e, 0x79, 0x88, 0x50, 0x55, 0xcb, 0x77, 0x67, 0xae, 0x49, 0x9a, 0x48, + 0xa1, 0x55, 0x6c, 0x5b, 0x5b, 0xd3, 0x58, 0xc3, 0x58, 0xac, 0xa6, 0x51, + 0x7b, 0x3c, 0x80, 0x6d, 0xcc, 0x62, 0x57, 0xc4, 0xbd, 0xb6, 0x51, 0xb2, + 0xbb, 0x5d, 0xa2, 0xa6, 0x5c, 0x93, 0x44, 0x5c, 0xd1, 0x54, 0x49, 0x8d, + 0x8b, 0x4c, 0x8a, 0x80, 0x86, 0xce, 0xcf, 0x5a, 0x6d, 0x3c, 0x8e, 0x4e, + 0x69, 0x8a, 0x57, 0x8f, 0x60, 0x8b, 0xa9, 0x7a, 0x9e, 0x99, 0x34, 0xcd, + 0xc7, 0x7b, 0xcd, 0x8e, 0x65, 0x75, 0x5c, 0xab, 0xd0, 0xaa, 0x8a, 0xc5, + 0x8b, 0x7f, 0x9c, 0x5c, 0xc8, 0xa4, 0x4d, 0x8e, 0x67, 0x3a, 0xc6, 0xd4, + 0x6d, 0x3e, 0xa7, 0x74, 0xbc, 0xbc, 0x3c, 0x5f, 0x82, 0x81, 0x71, 0xbc, + 0xa4, 0x4b, 0x8d, 0x61, 0x46, 0x3a, 0x52, 0x3f, 0x9b, 0xa9, 0x90, 0xaf, + 0xac, 0x4c, 0x40, 0x7a, 0x9d, 0x8a, 0x89, 0xc0, 0x94, 0x70, 0x50, 0xc7, + 0xbd, 0xb9, 0xb4, 0xb5, 0x58, 0x8b, 0x8c, 0xd1, 0x86, 0x68, 0xb9, 0x7e, + 0xb8, 0xa9, 0xa2, 0x3e, 0x89, 0xbb, 0xb6, 0x41, 0x48, 0x3a, 0x85, 0xc2, + 0xd1, 0x5e, 0x5a, 0xa7, 0xa5, 0x94, 0xa0, 0xb9, 0x35, 0x98, 0x64, 0xd6, + 0xa1, 0x7d, 0x99, 0x9a, 0x79, 0x39, 0x79, 0x57, 0xc9, 0xd2, 0x75, 0xaa, + 0x58, 0x50, 0xb2, 0xd1, 0x75, 0x77, 0x49, 0xd0, 0x34, 0x76, 0x9a, 0x5d, + 0xc7, 0x6a, 0xb6, 0x68, 0x62, 0x4a, 0x6e, 0x7e, 0x75, 0x56, 0xa6, 0x51, + 0xa5, 0xb5, 0x51, 0xa6, 0x32, 0xc5, 0xbd, 0x92, 0x58, 0x56, 0xaa, 0xbd, + 0x44, 0x92, 0x78, 0x33, 0xc0, 0xae, 0x5e, 0xc3, 0x32, 0x40, 0x9a, 0x60, + 0x39, 0x52, 0x6f, 0x65, 0xa1, 0xca, 0x5d, 0xcf, 0x4c, 0x43, 0xb6, 0x6a, + 0x82, 0xc9, 0xd4, 0xa9, 0xab, 0xbe, 0x99, 0x7e, 0x87, 0x4c, 0xa4, 0xbd, + 0x5c, 0x8f, 0x96, 0x3c, 0xc1, 0x8e, 0x64, 0xa4, 0x6f, 0x9c, 0x2e, 0x5f, + 0x36, 0x78, 0xb1, 0x39, 0x37, 0x3d, 0x6d, 0x6a, 0xa8, 0x77, 0x90, 0x4d, + 0xa9, 0x9d, 0x50, 0x2e, 0xd3, 0xd2, 0x86, 0xbb, 0x8e, 0x8f, 0xd1, 0xbb, + 0x6d, 0xc1, 0xb9, 0x90, 0xa7, 0x38, 0xb7, 0x83, 0xbc, 0xcd, 0x58, 0xcb, + 0xb5, 0xcb, 0x42, 0xb7, 0x7b, 0x5a, 0x83, 0xb5, 0xa3, 0xe1, 0x96, 0xce, + 0xa0, 0x81, 0x89, 0x6b, 0x54, 0x76, 0x4d, 0xcc, 0x52, 0x38, 0x33, 0x8a, + 0x46, 0x3c, 0x20, 0xbc, 0x58, 0x88, 0x98, 0xa7, 0xc9, 0x68, 0x47, 0x8b, + 0x8e, 0xac, 0x6d, 0xb7, 0x8b, 0x65, 0x5f, 0x9b, 0x80, 0x5a, 0x50, 0xbe, + 0x48, 0x88, 0x77, 0x80, 0x74, 0x5d, 0x82, 0x8b, 0x86, 0x54, 0x5c, 0xb8, + 0x44, 0x5b, 0x3d, 0x52, 0xca, 0xb7, 0x76, 0xa4, 0xcd, 0xb0, 0xce, 0xa7, + 0xa4, 0x6e, 0x37, 0x7c, 0x95, 0x73, 0x7a, 0x82, 0x93, 0x83, 0x64, 0xa2, + 
0xa5, 0x3c, 0xb8, 0x3f, 0x66, 0xd2, 0x4d, 0x6d, 0x96, 0x77, 0x5a, 0x3e, + 0x30, 0x9b, 0x93, 0x38, 0xd4, 0x8c, 0x7d, 0x47, 0x58, 0xaf, 0x48, 0x52, + 0x72, 0x6f, 0x85, 0x9e, 0xac, 0x46, 0x3e, 0x6b, 0x5d, 0x5e, 0x92, 0xb6, + 0x97, 0x89, 0x70, 0x93, 0xc0, 0xb9, 0xbd, 0x65, 0xa9, 0x97, 0x30, 0x60, + 0xcd, 0xba, 0xb1, 0x8f, 0xaa, 0x7a, 0x8d, 0x41, 0xa4, 0xde, 0x52, 0xa7, + 0xca, 0x46, 0x33, 0x6c, 0x6c, 0xc9, 0x9d, 0xb7, 0xb6, 0xd8, 0x37, 0x40, + 0x6c, 0x66, 0x4b, 0xcd, 0x9f, 0xb4, 0x5e, 0x5c, 0x9c, 0x44, 0xc1, 0x58, + 0x9e, 0xbe, 0x88, 0xc7, 0x51, 0xa0, 0x3c, 0x38, 0x79, 0x83, 0x5b, 0x79, + 0xb8, 0x6b, 0x3a, 0xbd, 0xc7, 0x81, 0xa1, 0x82, 0x4b, 0x92, 0x57, 0xb4, + 0x63, 0xb2, 0x67, 0x8d, 0x6a, 0x6f, 0xb6, 0x4b, 0x72, 0x86, 0x49, 0xa4, + 0x98, 0x8d, 0xb9, 0xc2, 0xbf, 0x3f, 0x64, 0x3a, 0x6b, 0x5b, 0xa3, 0x84, + 0x57, 0x7c, 0xab, 0x3f, 0x53, 0xbd, 0x34, 0x30, 0xd2, 0x49, 0xc1, 0x3c, + 0x92, 0xb6, 0x6e, 0xd3, 0x31, 0x5d, 0x3f, 0x75, 0x7a, 0x73, 0x7f, 0x70, + 0xb8, 0x7f, 0xbf, 0xc9, 0x4a, 0x8f, 0x9e, 0x6b, 0xb2, 0xd0, 0x3c, 0x39, + 0x5e, 0xc8, 0x97, 0xc9, 0x72, 0xa1, 0xca, 0x99, 0x76, 0x8b, 0xbd, 0x59, + 0xc8, 0xbd, 0xa4, 0xd2, 0x71, 0x52, 0x9d, 0x45, 0x6c, 0x93, 0x3b, 0xab, + 0x80, 0xcd, 0x96, 0x3a, 0x2d, 0x8d, 0xb6, 0x3e, 0xa7, 0x54, 0xb2, 0x9a, + 0x94, 0xb4, 0xa4, 0x7b, 0x4e, 0x4a, 0x9d, 0xcd, 0xae, 0x66, 0x86, 0xc7, + 0xca, 0x98, 0x4d, 0x71, 0x5c, 0xbb, 0xc8, 0x44, 0x40, 0x41, 0x9b, 0x3f, + 0xba, 0x78, 0x36, 0xc6, 0xb6, 0xcb, 0x4b, 0x45, 0x4d, 0x9f, 0xbf, 0x73, + 0x72, 0x3f, 0x44, 0xd1, 0x81, 0x66, 0x5f, 0x61, 0x8a, 0x87, 0x4a, 0xb7, + 0xb6, 0xc7, 0xb8, 0xc3, 0xd5, 0xb2, 0x47, 0xa2, 0x37, 0x64, 0xcc, 0x85, + 0xbc, 0x51, 0x53, 0x76, 0xab, 0xd4, 0xb4, 0x95, 0x5c, 0xaf, 0x53, 0x79, + 0x82, 0x76, 0xc9, 0x5e, 0x67, 0x3e, 0x7e, 0xb9, 0xc7, 0x8f, 0xcc, 0x78, + 0x83, 0x41, 0x9f, 0x3a, 0xaf, 0x37, 0x7a, 0x6d, 0x57, 0xa3, 0x4f, 0x4b, + 0xaf, 0x84, 0x69, 0xb2, 0x5d, 0xbd, 0x76, 0x83, 0x55, 0x9a, 0x4c, 0x90, + 0xa1, 0xae, 0x8e, 0x55, 0x40, 0x8b, 0x53, 0xc3, 0xcf, 0x56, 0x3a, 0x91, + 0x92, 0xc7, 0x3f, 0xad, 0xb0, 0x34, 0x7e, 0x44, 0xc0, 0xc4, 0x89, 0x7b, + 0xb9, 0x60, 0x6c, 0x39, 0x3a, 0x65, 0x86, 0x61, 0xca, 0xc3, 0xb8, 0xbf, + 0x7a, 0x95, 0xae, 0x91, 0x6a, 0x4a, 0x47, 0x6b, 0xa6, 0x66, 0x79, 0x94, + 0xac, 0x8d, 0x2e, 0xc7, 0x55, 0x48, 0xa1, 0x4c, 0x40, 0xb8, 0x72, 0x69, + 0xd0, 0x7a, 0x47, 0x97, 0xb2, 0xc8, 0xba, 0x68, 0x57, 0x90, 0x40, 0xc0, + 0x9d, 0x46, 0xa1, 0xb0, 0x80, 0x65, 0x8c, 0x62, 0xc1, 0x7c, 0xb9, 0x4b, + 0xa8, 0xb1, 0x56, 0x67, 0x7b, 0x59, 0xac, 0x8e, 0x39, 0xa4, 0xd9, 0x66, + 0x66, 0xc3, 0xbe, 0xca, 0x4f, 0x40, 0xb8, 0x4f, 0x50, 0xb7, 0x99, 0x6b, + 0xaa, 0xaf, 0x70, 0xbc, 0xd1, 0x5e, 0x6f, 0x8a, 0x80, 0x3e, 0xa9, 0x4a, + 0x54, 0x55, 0x33, 0x7d, 0xb9, 0xb0, 0x4f, 0x3f, 0x3e, 0x4c, 0x4e, 0xc4, + 0x94, 0xa2, 0x89, 0xa3, 0xb0, 0x93, 0xa6, 0x48, 0x43, 0x30, 0xa8, 0xb5, + 0x7e, 0x7a, 0x68, 0xa9, 0x86, 0xc3, 0x54, 0xc3, 0xa6, 0x88, 0x6b, 0x83, + 0x39, 0x40, 0x43, 0x50, 0xb4, 0x4b, 0xb4, 0x7e, 0x75, 0x95, 0x71, 0xa4, + 0xaa, 0x4c, 0x47, 0xc6, 0x74, 0x5f, 0x8f, 0x34, 0xae, 0x71, 0x9c, 0xb4, + 0xbf, 0x3e, 0xad, 0x63, 0x86, 0x52, 0x3c, 0xbf, 0x47, 0xc6, 0xae, 0x70, + 0x71, 0x2c, 0x54, 0xb5, 0x56, 0xd5, 0x9d, 0x9c, 0x94, 0x46, 0x4b, 0xbe, + 0x3b, 0x94, 0x55, 0xb9, 0x6a, 0x9c, 0x67, 0x36, 0x90, 0x65, 0x9c, 0xcd, + 0x91, 0x5e, 0x3c, 0x56, 0x99, 0x75, 0xb7, 0x67, 0x6b, 0xa7, 0xbc, 0x3d, + 0xbd, 0xbc, 0xb4, 0x59, 0x66, 0xc4, 0xc5, 0xd0, 0x59, 0x8b, 0xae, 0xc6, + 0x89, 0x84, 0xb6, 0xd2, 0xac, 0xaa, 0xb6, 0x8d, 0xa5, 0x66, 0x8f, 0x60, + 0x3f, 0xdc, 0x6a, 0xbc, 0x66, 0xb7, 0x2b, 0x2f, 0xb8, 0x79, 0x35, 0x60, + 
0x65, 0xc8, 0x4d, 0x49, 0x38, 0x42, 0xa8, 0xdd, 0x71, 0x88, 0x74, 0x98, + 0x80, 0x4e, 0xbc, 0x66, 0x3f, 0xc4, 0x46, 0x75, 0x7d, 0x32, 0x59, 0xb3, + 0xc0, 0x68, 0xb0, 0x4e, 0x8f, 0xa5, 0xd4, 0xae, 0xc7, 0x4b, 0x5a, 0xb5, + 0x8d, 0x40, 0x9b, 0x68, 0xaa, 0x55, 0x7d, 0x69, 0xc3, 0xcd, 0x4b, 0xb6, + 0xa1, 0x7e, 0x4c, 0x86, 0x8a, 0x45, 0x91, 0xc4, 0xd4, 0xaa, 0x4c, 0x43, + 0xba, 0xa5, 0xc7, 0xbd, 0x49, 0x43, 0x9a, 0x61, 0x4a, 0x76, 0xaa, 0x45, + 0xcb, 0xba, 0x7d, 0xaf, 0xb2, 0x83, 0x4b, 0x37, 0x58, 0xae, 0x6f, 0xc6, + 0xb5, 0xd6, 0xc8, 0xb4, 0xb5, 0xc1, 0x5b, 0x79, 0x3e, 0x41, 0x3c, 0xbd, + 0x3d, 0x8b, 0x4d, 0xac, 0x54, 0x73, 0xb8, 0x42, 0x7f, 0x73, 0x52, 0x5b, + 0x37, 0x3b, 0x53, 0x91, 0x63, 0x67, 0xce, 0x88, 0x71, 0xa0, 0x3f, 0x7d, + 0x7f, 0x67, 0xbd, 0x30, 0x91, 0xce, 0x97, 0x7e, 0xbc, 0x41, 0x7d, 0xc6, + 0x80, 0xcf, 0xd1, 0xac, 0x3f, 0x56, 0x61, 0xab, 0xac, 0x83, 0x5d, 0x5f, + 0x3e, 0xbd, 0xc7, 0x8b, 0x9d, 0x53, 0xcc, 0xba, 0x92, 0x89, 0xa3, 0x63, + 0x8f, 0xc5, 0xac, 0x50, 0xa8, 0x3c, 0x68, 0x8a, 0x30, 0x53, 0x9e, 0x5f, + 0xc2, 0x8d, 0xb5, 0x4e, 0x47, 0x73, 0xce, 0x63, 0x70, 0x6d, 0x53, 0xbc, + 0x33, 0xab, 0x51, 0x9d, 0x8a, 0xba, 0x3d, 0x4c, 0x60, 0x74, 0xc6, 0x5b, + 0x6d, 0x56, 0x90, 0x84, 0xa9, 0x64, 0xa9, 0x79, 0x4d, 0xd0, 0x6f, 0x45, + 0x6c, 0x6a, 0x79, 0x9e, 0xac, 0x3f, 0x5b, 0xbe, 0x44, 0x8c, 0x6f, 0x4a, + 0x8f, 0x6c, 0x7c, 0x94, 0xaa, 0x4b, 0x48, 0x54, 0x81, 0x65, 0x35, 0xae, + 0x6e, 0x82, 0x72, 0x4b, 0x8e, 0x37, 0x5b, 0x9c, 0x44, 0x4b, 0x73, 0x46, + 0x31, 0x5c, 0x99, 0x6d, 0x41, 0x49, 0xc7, 0x88, 0x6c, 0x90, 0xb3, 0x9a, + 0xb8, 0xd2, 0xc5, 0xa0, 0xa9, 0xb6, 0x4a, 0x5b, 0x4b, 0x47, 0xa4, 0x48, + 0xb1, 0xaa, 0x40, 0xd4, 0x71, 0xb5, 0xb3, 0x54, 0x40, 0x48, 0x4d, 0xbf, + 0x87, 0xde, 0x6f, 0x7c, 0x36, 0x5c, 0x4b, 0x4b, 0x64, 0x99, 0x5d, 0x6e, + 0x6f, 0xc8, 0x67, 0xb5, 0xcf, 0x3d, 0x9e, 0x40, 0xd0, 0x96, 0xaf, 0x97, + 0x9a, 0x94, 0x88, 0x79, 0x5f, 0xc9, 0x4e, 0x43, 0x57, 0xc0, 0x74, 0x5a, + 0x89, 0x76, 0x2e, 0xb4, 0x7d, 0x5c, 0xa0, 0xb4, 0xaa, 0x82, 0x5d, 0xbe, + 0xa1, 0x5f, 0x82, 0xcb, 0x45, 0x8d, 0x85, 0xb2, 0x4b, 0x61, 0x90, 0x9c, + 0x8b, 0xb7, 0xce, 0x72, 0xb8, 0x8a, 0x8e, 0xa8, 0xca, 0x84, 0xb6, 0x79, + 0x71, 0xad, 0x9c, 0x89, 0x9d, 0xb7, 0x5c, 0x87, 0x3a, 0x94, 0xbf, 0x58, + 0x71, 0x8e, 0x7b, 0x71, 0xa2, 0x98, 0x43, 0x85, 0xb0, 0xa8, 0x94, 0x96, + 0xb7, 0xb9, 0x31, 0x30, 0x46, 0x6b, 0x5f, 0x36, 0x91, 0xdc, 0x78, 0x3e, + 0xa5, 0x79, 0x99, 0x3a, 0x41, 0x6e, 0x40, 0xbb, 0xbb, 0x89, 0x71, 0x64, + 0x7d, 0x4c, 0x38, 0xc2, 0x91, 0x9e, 0x66, 0x88, 0xb2, 0xce, 0x58, 0xa3, + 0x7f, 0x42, 0xba, 0x75, 0x7e, 0x70, 0x55, 0x57, 0x41, 0x43, 0x72, 0x90, + 0xd7, 0x97, 0x6c, 0x7d, 0x7d, 0x86, 0x55, 0xa5, 0xd0, 0x4d, 0x9d, 0x75, + 0x33, 0x31, 0xcd, 0x44, 0xbf, 0x5d, 0x4b, 0xa5, 0x43, 0x41, 0x60, 0xb0, + 0x5e, 0x86, 0xc3, 0xa2, 0x92, 0x6c, 0xad, 0x58, 0xad, 0x6b, 0xba, 0x95, + 0xac, 0x9e, 0x4d, 0xba, 0xb6, 0x9b, 0x43, 0x5e, 0x9f, 0xb4, 0x76, 0x95, + 0xd0, 0xa6, 0x55, 0x58, 0x3f, 0x9a, 0x6f, 0xd1, 0xaa, 0x8f, 0xb1, 0x7b, + 0x30, 0x4c, 0x98, 0x54, 0xb9, 0x3f, 0xaa, 0xcd, 0x51, 0x67, 0x89, 0x92, + 0x4a, 0xd1, 0x73, 0x55, 0xcc, 0x72, 0x5c, 0xcb, 0x71, 0xab, 0xbe, 0x64, + 0xaa, 0xc7, 0x89, 0x30, 0x6e, 0x6d, 0x43, 0xa9, 0x66, 0x9e, 0x5c, 0x6b, + 0xc6, 0x64, 0x6c, 0x6b, 0x52, 0xb8, 0x56, 0x6c, 0x4a, 0x97, 0x5b, 0x6b, + 0x76, 0x44, 0xa6, 0xbd, 0xb8, 0xcd, 0x7e, 0xbb, 0xd0, 0x68, 0x49, 0x78, + 0xa5, 0xb8, 0x54, 0xc8, 0x9c, 0xb0, 0xbb, 0x38, 0xd0, 0x3b, 0x80, 0x43, + 0x65, 0xb8, 0x48, 0x3b, 0x60, 0x46, 0x45, 0x95, 0x46, 0x5e, 0xae, 0x62, + 0x50, 0x9a, 0x9d, 0x62, 0x59, 0x34, 0x57, 0xba, 0x3e, 0x72, 0x83, 0xbd, + 
0x92, 0x41, 0xaf, 0x80, 0x54, 0x96, 0xaf, 0x67, 0x53, 0x2f, 0x6b, 0x56, + 0x92, 0x92, 0xb0, 0xa3, 0xc5, 0x9c, 0x78, 0xb6, 0x7d, 0x3b, 0xc1, 0x8b, + 0x9c, 0xaa, 0xa5, 0x63, 0x77, 0xb5, 0x3a, 0xbb, 0x62, 0x6f, 0xb3, 0x41, + 0x5d, 0x54, 0xad, 0x8f, 0xac, 0x71, 0x59, 0x40, 0x69, 0x5e, 0x44, 0xbc, + 0xad, 0xbe, 0x4a, 0xcf, 0x75, 0x87, 0x84, 0xbd, 0x92, 0x72, 0x39, 0x58, + 0x40, 0xb7, 0x3a, 0xb2, 0xaa, 0x64, 0xb3, 0xac, 0xb5, 0x45, 0xad, 0x5c, + 0x8c, 0x53, 0xbc, 0xa1, 0xbb, 0x7b, 0x89, 0xbd, 0x9e, 0x6b, 0x84, 0xb7, + 0x6f, 0x2d, 0x4a, 0x97, 0x82, 0x7f, 0xce, 0x59, 0xb1, 0xbe, 0x9f, 0x41, + 0x7c, 0x98, 0x37, 0xc3, 0xcf, 0xbd, 0x3a, 0xbe, 0xc0, 0xac, 0x81, 0x3b, + 0xbf, 0x4b, 0x5e, 0x6f, 0x36, 0x87, 0xac, 0xc4, 0xa5, 0xa0, 0x7c, 0xba, + 0x37, 0x87, 0x5c, 0x64, 0x86, 0x91, 0xaf, 0x5a, 0x97, 0x5a, 0x98, 0xac, + 0x9e, 0x57, 0x2c, 0x42, 0x36, 0x8a, 0x39, 0x9d, 0xd5, 0x53, 0x42, 0x56, + 0x5d, 0xb5, 0x62, 0x57, 0x79, 0xc4, 0x78, 0xcc, 0x80, 0xa8, 0xab, 0x63, + 0x93, 0x41, 0x7b, 0x39, 0x5a, 0xa0, 0xd0, 0xa6, 0x63, 0xc3, 0x90, 0x87, + 0xab, 0xa6, 0xb1, 0xc1, 0x91, 0xa8, 0x53, 0x57, 0x5c, 0x94, 0x81, 0x60, + 0x6a, 0x7c, 0x2e, 0x48, 0xaf, 0x70, 0x45, 0x48, 0xd9, 0x70, 0x87, 0x6b, + 0x3c, 0x76, 0xaa, 0xd0, 0x83, 0x6e, 0x60, 0x7f, 0x3c, 0x3e, 0x93, 0x58, + 0x62, 0x76, 0xc1, 0x6a, 0x6e, 0x91, 0x7e, 0x5f, 0xb2, 0x69, 0xa5, 0xb6, + 0x3f, 0x55, 0x38, 0xa6, 0x8a, 0x5f, 0x97, 0x3b, 0xb5, 0xd0, 0x67, 0x40, + 0x69, 0x37, 0x4c, 0x37, 0x5c, 0x5c, 0xc8, 0xa6, 0xc2, 0x8c, 0xcc, 0x4a, + 0xb0, 0x74, 0x6f, 0x5b, 0xa5, 0x60, 0x4b, 0x87, 0x39, 0xa6, 0x4f, 0xab, + 0x50, 0x66, 0x58, 0xb4, 0x5d, 0x67, 0xb3, 0x82, 0x66, 0x43, 0x88, 0xa4, + 0xaa, 0x33, 0xb6, 0xa7, 0xb9, 0xd2, 0xa3, 0xd0, 0x67, 0xc6, 0x6e, 0xca, + 0xc2, 0xa0, 0x78, 0x99, 0x4a, 0xbb, 0xa4, 0x65, 0xcf, 0xb6, 0x46, 0x61, + 0x3e, 0x47, 0x6d, 0xa4, 0xb2, 0x53, 0x50, 0xba, 0xd2, 0x62, 0x43, 0x84, + 0x7e, 0x57, 0xad, 0xc7, 0x85, 0x3b, 0x6e, 0x41, 0x82, 0xb3, 0x96, 0xce, + 0x60, 0x7c, 0x80, 0x4b, 0x50, 0x3b, 0x39, 0xc8, 0x4a, 0x54, 0x62, 0x9d, + 0xc2, 0x6b, 0x2e, 0x47, 0x81, 0xcf, 0x4a, 0x59, 0xa9, 0xab, 0x52, 0xac, + 0xba, 0x62, 0x3c, 0x92, 0x7e, 0xd1, 0x7b, 0x70, 0x54, 0x87, 0x64, 0x3b, + 0x71, 0x2e, 0x44, 0x56, 0x51, 0x8c, 0x41, 0xcc, 0x98, 0xb9, 0xd6, 0xae, + 0x7e, 0x85, 0xb9, 0x5c, 0x75, 0x9e, 0x46, 0xc4, 0xaa, 0x72, 0x57, 0x55, + 0xc6, 0x38, 0xa4, 0x54, 0x7a, 0x56, 0x6c, 0x64, 0xa5, 0x42, 0x8a, 0x61, + 0xc3, 0x4e, 0x67, 0x9d, 0x45, 0xaa, 0x76, 0x37, 0x3a, 0x91, 0xae, 0x9c, + 0x8f, 0x81, 0x5d, 0xb1, 0x93, 0x27, 0x45, 0xc2, 0x91, 0x55, 0xad, 0xb8, + 0x77, 0x71, 0x7b, 0xb9, 0x67, 0x73, 0x5e, 0x98, 0x51, 0x80, 0x6a, 0x86, + 0xce, 0xce, 0x96, 0x74, 0x3a, 0x5a, 0x32, 0x4f, 0xaa, 0x69, 0x45, 0x44, + 0xc8, 0x58, 0x9a, 0x4e, 0x63, 0x6e, 0xa8, 0x77, 0xa3, 0x9c, 0x49, 0xb8, + 0xb5, 0xbe, 0xa6, 0x6d, 0x6a, 0xb8, 0xaa, 0xab, 0x8a, 0xd4, 0x60, 0xa0, + 0x50, 0x64, 0x30, 0x4f, 0x3a, 0xb6, 0x96, 0x71, 0xc6, 0x53, 0x80, 0xb5, + 0xc6, 0xa1, 0x4e, 0x67, 0xab, 0x44, 0x7b, 0xa9, 0x4d, 0xcc, 0x98, 0x4f, + 0x59, 0x7e, 0xbf, 0x47, 0x64, 0xc5, 0x41, 0xa6, 0x39, 0xb6, 0xcc, 0x5f, + 0x9b, 0xcb, 0x3f, 0x3f, 0x75, 0xd2, 0xb2, 0x7b, 0x87, 0xa1, 0x9b, 0xa6, + 0x34, 0x64, 0x80, 0x2e, 0x89, 0xb8, 0xbf, 0xd3, 0x9e, 0x3d, 0x91, 0x6b, + 0x90, 0x9f, 0x79, 0x63, 0x3e, 0x47, 0x7a, 0x88, 0x74, 0x2c, 0x41, 0x7a, + 0x47, 0x32, 0xab, 0xbd, 0x93, 0x88, 0x86, 0x3b, 0x8a, 0x72, 0xc9, 0x79, + 0x57, 0xbf, 0x61, 0x47, 0x86, 0x7c, 0x33, 0x4b, 0x6b, 0x46, 0x61, 0xac, + 0x57, 0x37, 0x60, 0x43, 0x8a, 0xae, 0xb9, 0x66, 0x3e, 0x98, 0x8f, 0xbf, + 0x70, 0x4c, 0xc7, 0x50, 0x6e, 0xc4, 0xdf, 0x97, 0x77, 0xd3, 0x3e, 0x5d, + 
0x63, 0x94, 0x8b, 0x5c, 0x7e, 0xa0, 0x56, 0x41, 0xb0, 0x78, 0x4a, 0xb7, + 0x5b, 0x87, 0x66, 0x49, 0xa9, 0xa8, 0x87, 0x71, 0x59, 0xba, 0x93, 0xa0, + 0x90, 0x8f, 0x28, 0xba, 0xa8, 0x58, 0x62, 0x9c, 0x82, 0xd2, 0x4e, 0xbf, + 0x7b, 0xa4, 0xcd, 0x4f, 0xbc, 0x75, 0x3e, 0xcd, 0x7a, 0x62, 0x64, 0x5f, + 0xce, 0x93, 0x7b, 0x6c, 0x33, 0x90, 0x61, 0xb6, 0x34, 0xbe, 0x3d, 0x31, + 0x4a, 0xd6, 0xb4, 0xb0, 0x33, 0x8b, 0x92, 0xa9, 0xba, 0x5d, 0xa8, 0x66, + 0x65, 0xbb, 0x83, 0x7d, 0xab, 0x96, 0x89, 0x53, 0xa7, 0x9e, 0x5f, 0xa8, + 0x8b, 0x75, 0xb9, 0x87, 0xb4, 0x6a, 0xb9, 0x88, 0x6a, 0x9f, 0x92, 0x61, + 0x6c, 0x54, 0x89, 0x8f, 0x41, 0x77, 0x49, 0xa1, 0x8f, 0xae, 0x84, 0x62, + 0xc4, 0x6b, 0x96, 0xaf, 0x3b, 0xaf, 0x47, 0x9d, 0x5d, 0x98, 0x5a, 0xb5, + 0x63, 0xc3, 0xb5, 0x6c, 0x84, 0x95, 0xd0, 0x50, 0x4e, 0x51, 0x3d, 0xa5, + 0x8c, 0x70, 0x5e, 0x7e, 0x64, 0xd2, 0x8d, 0xb2, 0x85, 0x76, 0x79, 0xc2, + 0x88, 0x65, 0x70, 0x95, 0x5a, 0x8f, 0x8d, 0x8c, 0xaf, 0xa7, 0x8a, 0xb7, + 0xc2, 0x42, 0x62, 0x88, 0x34, 0x9d, 0xbd, 0x94, 0x78, 0x6b, 0x8f, 0x57, + 0x83, 0x93, 0xb1, 0x4d, 0xc4, 0x94, 0x31, 0x42, 0x6f, 0xbc, 0xc0, 0xc9, + 0x91, 0x57, 0x5f, 0x44, 0xc6, 0xbc, 0x6e, 0xc0, 0x78, 0x66, 0xd1, 0xc7, + 0x55, 0xca, 0xd8, 0xd0, 0x45, 0xc9, 0x76, 0x41, 0xc9, 0x9e, 0xbb, 0xaf, + 0x77, 0x6e, 0x42, 0xa0, 0x53, 0x87, 0x7b, 0xc0, 0x57, 0xab, 0x99, 0x3f, + 0x4b, 0x89, 0xcd, 0x54, 0x9d, 0xab, 0x7f, 0xb3, 0x67, 0x93, 0x39, 0x6e, + 0xbf, 0x55, 0x54, 0x7e, 0x8c, 0xd6, 0x32, 0x3a, 0x6e, 0x80, 0xa8, 0x74, + 0x74, 0xaa, 0x98, 0x82, 0x93, 0x49, 0xad, 0xa8, 0x63, 0xa6, 0x81, 0x50, + 0xcc, 0x85, 0xa5, 0x5f, 0xd1, 0x3c, 0x9b, 0x82, 0x5c, 0x94, 0xa3, 0x4e, + 0x85, 0xc7, 0x5a, 0xcc, 0x67, 0xba, 0x3a, 0x70, 0x4b, 0xc6, 0x4d, 0x7f, + 0x91, 0xca, 0x2f, 0x96, 0xc7, 0x4a, 0x49, 0x83, 0x9a, 0x50, 0x7d, 0x36, + 0x48, 0xca, 0x63, 0xa3, 0xba, 0x9f, 0x43, 0x73, 0xaf, 0x69, 0xa5, 0x37, + 0xb6, 0x96, 0xd1, 0x8b, 0x46, 0x2f, 0x2c, 0xbe, 0xb9, 0x9a, 0xb8, 0x49, + 0x49, 0x6c, 0x54, 0x62, 0xa9, 0x45, 0x53, 0xc4, 0x56, 0xb0, 0x92, 0x52, + 0x62, 0xab, 0xcc, 0x8b, 0xab, 0xc7, 0x77, 0xa9, 0x82, 0x46, 0x4b, 0x45, + 0x3a, 0xa0, 0xc5, 0x44, 0x64, 0x79, 0x64, 0xb5, 0xb0, 0x56, 0x97, 0xc8, + 0x3e, 0x6f, 0x7f, 0x77, 0x6d, 0x72, 0x31, 0xb0, 0x77, 0x8c, 0xae, 0x47, + 0x74, 0x30, 0xb8, 0x8e, 0x9f, 0x4d, 0xa6, 0x40, 0x59, 0x7f, 0x6a, 0xc6, + 0x8f, 0x38, 0x83, 0xb9, 0x50, 0x38, 0xb2, 0x6a, 0x43, 0x5f, 0xc4, 0x83, + 0xb2, 0x6d, 0xb5, 0x67, 0x36, 0x88, 0x60, 0xb9, 0x79, 0x8a, 0x4e, 0xc3, + 0xae, 0xba, 0xc6, 0x6c, 0x53, 0x93, 0xaf, 0x6c, 0x61, 0xb3, 0x2d, 0x63, + 0xcc, 0x34, 0x7e, 0x78, 0x2d, 0xae, 0x7c, 0x3e, 0x7e, 0x94, 0xb3, 0x3c, + 0x31, 0x63, 0x2b, 0x94, 0x9f, 0x58, 0x82, 0xbb, 0x91, 0x2c, 0x3e, 0x55, + 0x7e, 0xbe, 0x6c, 0x7c, 0xa9, 0xa7, 0xcf, 0xc8, 0x42, 0x93, 0x78, 0x83, + 0x61, 0xb3, 0xbb, 0x78, 0x4c, 0x70, 0x22, 0xbc, 0xb5, 0x3e, 0xac, 0x89, + 0xaa, 0xb3, 0x35, 0x91, 0x62, 0xbd, 0x6a, 0xb6, 0x9a, 0x77, 0x53, 0x4b, + 0xa8, 0x5e, 0x9f, 0x92, 0xaa, 0xb2, 0x46, 0x98, 0xc6, 0x85, 0x94, 0x91, + 0x86, 0x91, 0x7e, 0x78, 0x82, 0x7e, 0x70, 0x5a, 0x9d, 0xcc, 0x32, 0x2e, + 0x4e, 0x35, 0xbb, 0x97, 0xb8, 0x9e, 0x93, 0x43, 0x7d, 0x79, 0x95, 0x3d, + 0x62, 0xb9, 0xa6, 0x9f, 0xba, 0xac, 0xbf, 0xce, 0x56, 0xd0, 0xcd, 0x93, + 0xc5, 0xdb, 0x3d, 0xd1, 0x43, 0x60, 0x8f, 0x5d, 0x69, 0x45, 0x3f, 0x70, + 0x3b, 0x46, 0x8a, 0xbd, 0x8e, 0xcb, 0xb0, 0x87, 0xaa, 0x58, 0x5f, 0x9e, + 0x3b, 0x2e, 0xaa, 0x44, 0x84, 0x74, 0x9a, 0x9e, 0xcc, 0x69, 0x8d, 0xa5, + 0x93, 0x82, 0xd7, 0x65, 0xa5, 0x45, 0x9c, 0x73, 0x91, 0x85, 0xd7, 0x97, + 0xaf, 0xc2, 0x3a, 0x71, 0x3b, 0x52, 0xa2, 0x78, 0xaf, 0x4e, 0xc4, 0x75, + 
0x77, 0x91, 0x53, 0x69, 0x46, 0x70, 0x94, 0xbf, 0x55, 0x45, 0x3e, 0x49, + 0xaf, 0x9d, 0x79, 0x4d, 0x85, 0x47, 0xb9, 0x47, 0x33, 0xad, 0xd2, 0xb1, + 0xd5, 0x60, 0x72, 0xc9, 0xa5, 0xb7, 0x40, 0xc3, 0x2b, 0xb8, 0xac, 0xc9, + 0x57, 0x8c, 0xad, 0x92, 0x7c, 0xc8, 0x59, 0x90, 0x57, 0x2e, 0x37, 0x5d, + 0xce, 0x3f, 0x3f, 0xc5, 0x60, 0xa6, 0x88, 0x42, 0x53, 0x8c, 0xca, 0x2e, + 0x58, 0xbf, 0xad, 0x70, 0x5e, 0x38, 0x27, 0xbd, 0x66, 0x94, 0x61, 0x9d, + 0x84, 0x60, 0x6e, 0x3c, 0xc4, 0xb1, 0x4c, 0x7a, 0x89, 0xd3, 0xb6, 0x66, + 0x5c, 0x34, 0x39, 0x4a, 0x95, 0x7c, 0xd1, 0x54, 0x68, 0x5d, 0x2c, 0x2c, + 0x59, 0x74, 0xb1, 0x96, 0x5a, 0xb2, 0x5b, 0xcb, 0x57, 0x60, 0xb9, 0x41, + 0x3a, 0xbc, 0x47, 0x43, 0xc1, 0xc5, 0x72, 0xb4, 0x67, 0x94, 0x44, 0x93, + 0x7a, 0x70, 0x45, 0xc3, 0x8a, 0x4d, 0x73, 0xae, 0xb5, 0x67, 0x5e, 0xa2, + 0x32, 0x43, 0x73, 0xcb, 0x80, 0x3f, 0x65, 0x9c, 0x89, 0x84, 0xb5, 0x87, + 0x37, 0x39, 0x8d, 0x96, 0xaa, 0x8d, 0x34, 0xa0, 0x3b, 0x59, 0x77, 0x56, + 0x67, 0x4e, 0x76, 0xa7, 0xa7, 0x5c, 0x7d, 0x4e, 0xb5, 0x56, 0x5b, 0x5f, + 0x73, 0x8e, 0x3f, 0x43, 0x7c, 0x9b, 0x66, 0x52, 0x73, 0xd4, 0xab, 0x89, + 0x34, 0xab, 0x7a, 0x9a, 0xc3, 0x8d, 0x76, 0x83, 0xa5, 0xc6, 0x3d, 0x7a, + 0x78, 0xbd, 0x95, 0x53, 0x78, 0xbc, 0x49, 0x39, 0xb8, 0x93, 0x32, 0x63, + 0x8c, 0xbc, 0x4a, 0x76, 0x99, 0xc1, 0x4b, 0xa4, 0xb5, 0x8a, 0x41, 0x61, + 0xb9, 0xcd, 0x39, 0x60, 0xd7, 0x3b, 0x85, 0x34, 0x38, 0xa7, 0xb9, 0xa7, + 0x9f, 0x87, 0x63, 0x47, 0x73, 0x89, 0x63, 0x94, 0x4b, 0xb9, 0x57, 0x3a, + 0x7d, 0x38, 0x70, 0x3e, 0x69, 0x97, 0x7f, 0x97, 0x7b, 0x3a, 0xcf, 0x60, + 0x90, 0xce, 0xae, 0xb2, 0x62, 0xc9, 0x6a, 0x81, 0xa6, 0xa1, 0x48, 0x85, + 0x67, 0x91, 0x49, 0x53, 0x40, 0x34, 0xa4, 0x38, 0x76, 0xab, 0x60, 0xb9, + 0xb6, 0x8b, 0x9e, 0xb5, 0x33, 0x4c, 0x65, 0x7c, 0xb1, 0x65, 0xa3, 0x73, + 0x38, 0xa9, 0xa9, 0xb0, 0x90, 0x61, 0x70, 0xb8, 0x82, 0x73, 0xa5, 0xa0, + 0x63, 0x9f, 0xc7, 0x5d, 0x64, 0x72, 0x9a, 0x66, 0xb8, 0xc2, 0xa2, 0x9e, + 0xc3, 0x4e, 0x52, 0x6b, 0x67, 0x38, 0x75, 0xac, 0x86, 0x37, 0xc2, 0xb0, + 0xc5, 0x4b, 0xd2, 0x43, 0x4b, 0x79, 0x95, 0x3e, 0x94, 0xc0, 0xb4, 0x90, + 0x64, 0x7e, 0xc3, 0x5d, 0x6e, 0x70, 0x81, 0x9c, 0xc8, 0x3f, 0x51, 0x99, + 0x59, 0x74, 0x78, 0x5a, 0x4b, 0xa3, 0x98, 0x75, 0x54, 0x59, 0x7d, 0x72, + 0xa9, 0x46, 0x55, 0xa0, 0xc2, 0x8a, 0xba, 0xb9, 0xbb, 0x66, 0x35, 0x3d, + 0x43, 0xb6, 0xc9, 0x41, 0x99, 0xa2, 0x96, 0xca, 0x3a, 0x70, 0x78, 0x70, + 0x8c, 0xaa, 0xa8, 0x8d, 0x81, 0x5b, 0x9a, 0x6a, 0xbb, 0x71, 0x7b, 0x48, + 0x69, 0x8a, 0xb7, 0x7f, 0xc8, 0x59, 0x58, 0x58, 0xb0, 0xcc, 0x67, 0x7b, + 0xaf, 0xa9, 0xa9, 0xc1, 0x7d, 0xcb, 0x90, 0x82, 0x51, 0x49, 0x78, 0x3f, + 0x64, 0x3b, 0x4d, 0x8f, 0x6a, 0xd5, 0x8c, 0x53, 0x6f, 0x70, 0xa1, 0xb4, + 0x6b, 0x67, 0xa9, 0xc4, 0x62, 0xaa, 0x3d, 0xc7, 0x91, 0xa7, 0x9f, 0x8a, + 0x47, 0xc3, 0x9d, 0x68, 0x56, 0xc1, 0xc5, 0xb4, 0x6d, 0xc6, 0x4c, 0x54, + 0x85, 0xb5, 0x90, 0x7c, 0x49, 0xa3, 0x3d, 0xd1, 0x7c, 0xac, 0xa4, 0xad, + 0x61, 0x58, 0x9c, 0x96, 0x66, 0x63, 0xb7, 0x49, 0x48, 0xbb, 0x61, 0xa1, + 0x43, 0x93, 0x98, 0x9f, 0x9a, 0x46, 0x4b, 0xb4, 0x3d, 0x95, 0xa4, 0xb8, + 0xc0, 0x3e, 0xac, 0xc4, 0x5f, 0xc9, 0x7f, 0x82, 0x83, 0x6e, 0x37, 0x66, + 0x9a, 0xbd, 0xa5, 0x9c, 0x5b, 0x5a, 0x5b, 0x6f, 0xce, 0xa7, 0x52, 0x73, + 0x87, 0x99, 0xb3, 0x83, 0x3b, 0xab, 0xc9, 0xb1, 0x99, 0x7a, 0x81, 0x4f, + 0x3f, 0xd2, 0x8f, 0x61, 0xd1, 0x8e, 0x7d, 0x91, 0xaf, 0x4c, 0x38, 0x67, + 0x3d, 0xc8, 0x85, 0xbd, 0xd4, 0xb4, 0xb7, 0xb4, 0x87, 0xa4, 0xa3, 0x70, + 0xa5, 0x34, 0x57, 0x4f, 0xbc, 0xae, 0xc3, 0x6c, 0x99, 0x7d, 0x85, 0xad, + 0x6c, 0x51, 0xc6, 0x76, 0x8d, 0x54, 0x31, 0x56, 0x8d, 0x63, 0xa6, 0x7a, + 
0xc7, 0x52, 0xc5, 0xc0, 0x66, 0xc9, 0x88, 0x81, 0x88, 0x3b, 0x88, 0x44, + 0xb2, 0xb4, 0x39, 0x47, 0x4a, 0x8c, 0xd2, 0x7e, 0xbb, 0xa7, 0xc4, 0x61, + 0x63, 0x96, 0xd6, 0x3d, 0x6f, 0xc3, 0x69, 0x88, 0xb4, 0x96, 0xb3, 0x4c, + 0x8f, 0xce, 0xd0, 0x58, 0x6b, 0x3f, 0x74, 0x64, 0xca, 0xb2, 0x46, 0x7b, + 0x7e, 0xae, 0xcc, 0x3b, 0x70, 0x4d, 0x44, 0x53, 0x33, 0x65, 0x8c, 0x3a, + 0x6c, 0x86, 0xc1, 0xa4, 0xa7, 0x52, 0xb1, 0xc5, 0x9d, 0x5f, 0x92, 0xd3, + 0xa2, 0x52, 0xc4, 0x59, 0x49, 0x6f, 0x8d, 0x71, 0xad, 0xb4, 0x84, 0x99, + 0x82, 0x93, 0x5f, 0x86, 0x7a, 0x55, 0x59, 0x6e, 0x41, 0xb5, 0xd1, 0xa3, + 0xaf, 0x53, 0x6f, 0x8e, 0xa0, 0x67, 0xac, 0xa2, 0xbd, 0xb9, 0x48, 0x47, + 0xa0, 0x73, 0x6b, 0xa1, 0x71, 0x5b, 0xc1, 0x76, 0x64, 0x4b, 0xbe, 0x91, + 0xaa, 0xb2, 0x7e, 0x32, 0x62, 0xa0, 0xb6, 0xa5, 0x8f, 0xa3, 0x36, 0x4c, + 0xc3, 0x34, 0x89, 0x5d, 0x8f, 0xb5, 0x7b, 0x96, 0x2e, 0xce, 0xb4, 0x51, + 0x9d, 0x35, 0x65, 0x36, 0x37, 0xb2, 0x61, 0x8b, 0x89, 0x84, 0x3a, 0x56, + 0x86, 0xa6, 0x67, 0x3c, 0x4b, 0x4e, 0xc7, 0x39, 0x66, 0x70, 0x5f, 0x94, + 0x57, 0x39, 0x55, 0x71, 0x6b, 0xa9, 0xbe, 0x3f, 0x7b, 0x9f, 0x41, 0x31, + 0xc3, 0xcd, 0xa1, 0x60, 0xa6, 0x8c, 0x8e, 0xca, 0xaf, 0xcd, 0xba, 0xa1, + 0x3b, 0x74, 0x4b, 0xc4, 0x7d, 0xaf, 0xa9, 0x46, 0xb3, 0xbe, 0x4b, 0xa3, + 0x4d, 0xc2, 0x70, 0x89, 0x47, 0xac, 0x9f, 0x90, 0xa3, 0x84, 0xcd, 0x32, + 0xc7, 0x7f, 0x4f, 0xcb, 0x65, 0x43, 0xbd, 0x35, 0x46, 0x88, 0xcb, 0xc5, + 0x60, 0xb8, 0x9d, 0xc5, 0x6c, 0x93, 0x51, 0x3b, 0x60, 0x96, 0xc5, 0xca, + 0x8c, 0xd9, 0x55, 0x8b, 0x4d, 0x5c, 0xd7, 0x70, 0x83, 0x91, 0x4d, 0x61, + 0x4e, 0x5c, 0x5b, 0x61, 0xc1, 0xc5, 0x8b, 0xd6, 0x3d, 0x37, 0xa8, 0xac, + 0x38, 0xac, 0x9e, 0x99, 0x77, 0x85, 0x91, 0x54, 0x41, 0x59, 0x54, 0xc4, + 0x59, 0x3f, 0x6d, 0xb1, 0x94, 0xa5, 0xb5, 0x63, 0x9e, 0xbb, 0xc2, 0x50, + 0x3d, 0x88, 0x60, 0x6a, 0x84, 0xb4, 0x52, 0xa0, 0x53, 0x64, 0xc9, 0x4c, + 0x62, 0xc5, 0x33, 0xbc, 0x2f, 0x82, 0x82, 0x5c, 0xd0, 0x46, 0xb6, 0xcf, + 0xbb, 0x38, 0x89, 0x7e, 0xb1, 0x51, 0xb6, 0x83, 0x9e, 0xc0, 0x69, 0x88, + 0x68, 0x94, 0xb3, 0xd0, 0x91, 0x98, 0x5c, 0x94, 0x83, 0x74, 0x38, 0xb5, + 0x7a, 0xd0, 0x40, 0x88, 0x36, 0x3c, 0x63, 0x9f, 0x6f, 0xd4, 0x67, 0x8c, + 0xb8, 0x3e, 0x4c, 0x62, 0x9a, 0xb7, 0x48, 0xc0, 0x59, 0x97, 0x54, 0x9b, + 0x3f, 0xcb, 0x80, 0x6d, 0xb4, 0x51, 0x71, 0xaa, 0xa2, 0x39, 0xa2, 0x6f, + 0x69, 0xa3, 0x6c, 0x4d, 0x3e, 0x5c, 0x85, 0x60, 0x7a, 0x5a, 0x64, 0x85, + 0xb8, 0x65, 0x85, 0x67, 0x8c, 0x55, 0xa6, 0xb1, 0xb5, 0x33, 0x76, 0x54, + 0x5a, 0x93, 0xbb, 0xae, 0xc3, 0x46, 0x77, 0x37, 0x6e, 0x34, 0x7c, 0x8a, + 0x6d, 0x87, 0x4e, 0x5f, 0x87, 0x91, 0xc4, 0xc7, 0xc9, 0x58, 0xab, 0x63, + 0x47, 0x90, 0xb8, 0x40, 0x3e, 0xce, 0xba, 0xd2, 0xb4, 0x38, 0x82, 0x5f, + 0x7f, 0xc6, 0xb2, 0xb8, 0x66, 0x39, 0xa0, 0x90, 0xc9, 0x42, 0x76, 0x9f, + 0x69, 0x51, 0xcc, 0xbc, 0x85, 0x57, 0x6b, 0xaa, 0x7b, 0xa6, 0x38, 0x42, + 0xc8, 0x48, 0x60, 0x53, 0x73, 0x82, 0xb4, 0x73, 0x56, 0x8a, 0x38, 0x5f, + 0x80, 0x4e, 0xa4, 0x76, 0xcf, 0x7c, 0x97, 0x91, 0xa5, 0x35, 0xb0, 0xc7, + 0x83, 0xbd, 0x45, 0x63, 0x71, 0x51, 0x91, 0x33, 0xbd, 0x40, 0xb6, 0x7a, + 0xb2, 0x8e, 0x65, 0x9f, 0xa3, 0x4c, 0x5e, 0x89, 0xbd, 0x6e, 0x6e, 0xb4, + 0x90, 0x46, 0x51, 0xc3, 0x92, 0xce, 0x99, 0xbd, 0x2e, 0x54, 0xbd, 0x99, + 0x41, 0xac, 0x47, 0xa3, 0x76, 0x4c, 0x57, 0x86, 0x57, 0xa3, 0xc9, 0x5d, + 0xa9, 0xbf, 0xb5, 0xcf, 0xbc, 0xc9, 0x76, 0x5a, 0x41, 0xa1, 0x70, 0xa2, + 0x47, 0x65, 0x49, 0x9f, 0x4a, 0x3e, 0xa6, 0x8c, 0xbb, 0x46, 0x9f, 0x79, + 0xb4, 0xab, 0xad, 0xbf, 0x3d, 0x43, 0x82, 0x4c, 0x56, 0x63, 0x7c, 0x65, + 0xb4, 0xa8, 0x7d, 0x4a, 0xc0, 0x94, 0x94, 0xd1, 0x2a, 0xd0, 0x9e, 0x6d, + 
0x6c, 0x5e, 0x86, 0x44, 0xa9, 0x38, 0x77, 0x77, 0xc3, 0xc7, 0xa3, 0xb6, + 0x43, 0x6b, 0x89, 0x89, 0xb7, 0xd3, 0x83, 0x34, 0x73, 0xb7, 0x61, 0x6b, + 0x57, 0x9a, 0x8e, 0x98, 0x99, 0x95, 0x84, 0x93, 0x95, 0x40, 0xd2, 0xc7, + 0xa3, 0xc8, 0x52, 0xa8, 0xd1, 0x72, 0x91, 0x91, 0x71, 0xb6, 0x4b, 0xbf, + 0x3b, 0x9f, 0xa8, 0x5e, 0x79, 0x59, 0x57, 0xd1, 0xcb, 0xcc, 0x3f, 0xac, + 0x41, 0x75, 0x54, 0x6d, 0x9c, 0xc1, 0x4f, 0x67, 0x72, 0xb2, 0x7e, 0x4a, + 0x7a, 0x51, 0x8b, 0x87, 0x7d, 0xa3, 0x51, 0x44, 0xc1, 0x3d, 0x2e, 0x5f, + 0xae, 0x62, 0x2e, 0x41, 0x4d, 0xad, 0xb9, 0x3e, 0xc8, 0x9d, 0x74, 0x47, + 0x52, 0x79, 0x55, 0x90, 0xd5, 0x4e, 0xcb, 0xd2, 0x98, 0xd5, 0x6a, 0xa4, + 0x62, 0xa9, 0x63, 0x9c, 0xb4, 0x65, 0x73, 0x87, 0x62, 0xba, 0x7d, 0x3a, + 0x79, 0x72, 0x3d, 0xca, 0xb3, 0x2c, 0x80, 0x4b, 0x51, 0x86, 0x6b, 0x44, + 0x7c, 0xb7, 0x80, 0x48, 0x9f, 0x4e, 0x92, 0x7b, 0xa9, 0x62, 0x6c, 0xa7, + 0xc2, 0xca, 0x4b, 0x87, 0xb5, 0x46, 0x84, 0x5b, 0x7b, 0x3b, 0x9b, 0xb0, + 0x5f, 0x92, 0xaf, 0x7f, 0x7e, 0x6f, 0xa9, 0x66, 0xc8, 0x34, 0x3b, 0x66, + 0xb3, 0xc4, 0x61, 0x48, 0x6d, 0x90, 0x60, 0x54, 0xac, 0x91, 0xbc, 0xbd, + 0xc2, 0x7d, 0xcd, 0xac, 0xd1, 0xcf, 0x45, 0x8b, 0x6e, 0xa4, 0x70, 0x92, + 0xa0, 0xa1, 0xc3, 0x77, 0xa0, 0xab, 0xcb, 0x88, 0x70, 0x50, 0xc7, 0x71, + 0x43, 0x33, 0xc0, 0x8d, 0xd3, 0xae, 0x6a, 0x72, 0x67, 0x65, 0x92, 0x38, + 0xba, 0x7a, 0x42, 0x65, 0x66, 0x7f, 0x45, 0x74, 0x78, 0x35, 0x9f, 0xd2, + 0xd7, 0xb9, 0x44, 0xa1, 0x4a, 0x8a, 0x5d, 0x86, 0x5c, 0xa2, 0xb8, 0x6f, + 0x80, 0x78, 0x86, 0x59, 0x71, 0x37, 0xae, 0xb7, 0xb1, 0x82, 0xb3, 0x7d, + 0x3f, 0xa6, 0x5e, 0x67, 0xc3, 0x58, 0x75, 0x9d, 0x5c, 0x56, 0xa0, 0x5f, + 0x63, 0xcd, 0xa2, 0x7e, 0xb3, 0xad, 0x77, 0x4b, 0x67, 0xce, 0xd4, 0x6c, + 0x92, 0x48, 0x76, 0xc3, 0x4e, 0x64, 0x3c, 0x62, 0xd0, 0xbe, 0x93, 0x96, + 0xac, 0x75, 0x3b, 0x72, 0x59, 0x63, 0x64, 0xb0, 0x52, 0x91, 0xc9, 0x8b, + 0x32, 0x81, 0x3e, 0xc8, 0xbe, 0xb7, 0x9e, 0x8e, 0xbc, 0x4c, 0xd2, 0xc1, + 0x66, 0x55, 0xa2, 0xbe, 0x38, 0x76, 0x9d, 0xc5, 0xb5, 0x74, 0xc1, 0x56, + 0x7a, 0x6c, 0xa7, 0x3e, 0x9a, 0x9a, 0xd1, 0x60, 0x59, 0x42, 0x96, 0x8f, + 0xc3, 0x73, 0x8f, 0x41, 0x95, 0x6b, 0x78, 0x94, 0x97, 0x97, 0x56, 0x3b, + 0xcd, 0x43, 0xaa, 0x75, 0xd2, 0x5d, 0xb5, 0x98, 0x9c, 0x7e, 0x80, 0x70, + 0x55, 0x64, 0x4f, 0x5c, 0x87, 0x7a, 0xb5, 0xaf, 0x3b, 0x6f, 0x63, 0xc3, + 0x59, 0x92, 0x75, 0x71, 0xa4, 0xca, 0x65, 0x57, 0x73, 0x44, 0x93, 0x85, + 0x69, 0xa8, 0x9a, 0x56, 0x9b, 0x8f, 0xbb, 0x33, 0xa8, 0x7d, 0x37, 0x5e, + 0xa7, 0x94, 0x65, 0x5a, 0x80, 0x91, 0x79, 0x6d, 0x64, 0x8d, 0x8a, 0xa4, + 0x56, 0xa9, 0xb2, 0x42, 0x70, 0xb0, 0x54, 0xaa, 0x73, 0xa0, 0x77, 0x83, + 0xc6, 0x66, 0x5d, 0xbe, 0x39, 0xa7, 0xd0, 0xae, 0xbc, 0x3f, 0x47, 0xbc, + 0x5f, 0xcb, 0x8a, 0x75, 0xc0, 0x4b, 0x53, 0xa3, 0x6d, 0xb8, 0x4a, 0x4e, + 0x92, 0xa3, 0xd1, 0x4c, 0x55, 0xca, 0x72, 0x9c, 0xc2, 0x75, 0xbf, 0xb1, + 0x75, 0xb4, 0x64, 0x4a, 0x81, 0x46, 0xb5, 0x74, 0x98, 0x40, 0xc7, 0x85, + 0xa7, 0x34, 0xc7, 0x56, 0xe3, 0x38, 0x82, 0x51, 0x52, 0x89, 0xad, 0x3e, + 0x87, 0x44, 0x85, 0x68, 0xb0, 0x8f, 0xc0, 0xdc, 0x52, 0xd3, 0xbf, 0x5d, + 0x61, 0xb7, 0x92, 0x4e, 0xbf, 0x55, 0x37, 0x2c, 0x64, 0x4e, 0x68, 0x3d, + 0x86, 0x96, 0x36, 0x46, 0x41, 0x86, 0xb0, 0x8c, 0xaa, 0xa6, 0xa2, 0x4f, + 0xbe, 0x3e, 0xa5, 0x8b, 0xc4, 0x64, 0x7a, 0x2b, 0x8e, 0x38, 0x57, 0x69, + 0xa0, 0x73, 0xa9, 0x2e, 0x74, 0x41, 0xa1, 0x86, 0x6c, 0x63, 0x39, 0xa7, + 0x31, 0x86, 0xb8, 0x3a, 0x84, 0xbc, 0x85, 0x9f, 0xd2, 0xba, 0xcb, 0x68, + 0x86, 0x34, 0x5f, 0xd3, 0xb7, 0x58, 0x83, 0xa1, 0xb5, 0xcd, 0x37, 0x81, + 0xb0, 0xab, 0x61, 0xc0, 0xb3, 0x81, 0x8e, 0x69, 0x3d, 0xc0, 0x32, 0xca, + 
0x44, 0xaf, 0xa3, 0x65, 0xad, 0xc0, 0xc2, 0x3f, 0xb8, 0xaf, 0x34, 0x3b, + 0xb7, 0x3f, 0x9f, 0x74, 0x62, 0xb0, 0x62, 0x56, 0x30, 0xad, 0x55, 0x6c, + 0x52, 0x58, 0x58, 0xa0, 0x42, 0x38, 0x9e, 0xa4, 0x4d, 0xc4, 0x84, 0x2d, + 0xbe, 0xcc, 0xc2, 0x87, 0x7c, 0x89, 0x6a, 0xa8, 0x82, 0x36, 0x44, 0x6b, + 0x9a, 0x58, 0x49, 0x4a, 0x94, 0x71, 0xa5, 0x6f, 0x9e, 0x3c, 0x80, 0x52, + 0x93, 0x84, 0xc9, 0xb5, 0x44, 0xd2, 0xbf, 0x80, 0x7a, 0x9b, 0x5d, 0x83, + 0x99, 0x2a, 0xd2, 0x5a, 0x36, 0x33, 0xaa, 0xaa, 0x57, 0x74, 0x3e, 0xc1, + 0x85, 0xaa, 0xa1, 0xe9, 0x97, 0x82, 0x39, 0x52, 0x9e, 0x94, 0x95, 0x72, + 0x7d, 0xb9, 0xa7, 0x55, 0xbc, 0x89, 0xbe, 0xc6, 0x53, 0xae, 0x4e, 0x4d, + 0x92, 0xb2, 0x41, 0x96, 0x55, 0x6c, 0x67, 0xc4, 0x41, 0x5e, 0x42, 0x35, + 0xab, 0x28, 0x92, 0x58, 0xcb, 0x3e, 0xab, 0x83, 0xa1, 0x91, 0x81, 0x68, + 0x49, 0x81, 0xc2, 0x6b, 0x81, 0x3f, 0xc2, 0x84, 0xbd, 0xaa, 0x71, 0x70, + 0xa3, 0x5c, 0x8b, 0xad, 0xae, 0xc3, 0x37, 0x68, 0x9a, 0xca, 0xad, 0x66, + 0x40, 0x76, 0x31, 0x8f, 0xb1, 0xa4, 0x8f, 0x3f, 0x41, 0x7a, 0x9c, 0x4c, + 0x8c, 0xb2, 0x8d, 0xc3, 0xcf, 0x4b, 0xb1, 0xc3, 0xc6, 0x7d, 0x68, 0xb0, + 0x65, 0x79, 0x62, 0xc6, 0xb8, 0x82, 0xce, 0x9e, 0x7b, 0xb5, 0xc2, 0x92, + 0x99, 0x73, 0x83, 0x5a, 0xa9, 0x62, 0x3c, 0xb4, 0x76, 0x70, 0x46, 0x5e, + 0xc5, 0x9b, 0x5c, 0x78, 0x56, 0x62, 0x5a, 0x47, 0xc2, 0x41, 0xbe, 0x56, + 0xbe, 0x5e, 0x38, 0xc4, 0xbd, 0x53, 0x99, 0xa7, 0x7c, 0x6d, 0x7f, 0x8d, + 0xd1, 0x64, 0x7d, 0x51, 0x57, 0x3d, 0x47, 0x2f, 0xac, 0x59, 0xa7, 0x73, + 0xa7, 0x4f, 0xca, 0x95, 0x6a, 0x47, 0x37, 0xc9, 0xca, 0xbc, 0xc1, 0xa9, + 0x83, 0x5b, 0xa1, 0x62, 0x72, 0xcb, 0xb1, 0xb1, 0xc1, 0xb9, 0x61, 0x7c, + 0xc2, 0xd0, 0x73, 0xca, 0x69, 0xba, 0x63, 0x50, 0xbe, 0x90, 0xb3, 0xaf, + 0xd3, 0x94, 0x58, 0xa8, 0x42, 0x3a, 0x48, 0x59, 0x97, 0xcb, 0xcf, 0x8a, + 0xac, 0x5b, 0x4d, 0x7e, 0x3d, 0x60, 0xb2, 0xa8, 0x41, 0x2f, 0xb1, 0x4e, + 0xb6, 0xa1, 0xbc, 0xae, 0x33, 0xbd, 0x74, 0xca, 0x8b, 0xc9, 0x7d, 0xc2, + 0x54, 0x89, 0x79, 0xbc, 0x76, 0x86, 0x9a, 0x8f, 0xc4, 0x32, 0x37, 0x46, + 0xa0, 0x36, 0x2d, 0x7e, 0x5e, 0xc6, 0x90, 0x58, 0x9b, 0xcc, 0x38, 0xb1, + 0x6e, 0x6a, 0x8c, 0x81, 0x85, 0x9e, 0x6a, 0x80, 0x65, 0x56, 0x8c, 0xd3, + 0xbc, 0x90, 0x39, 0x2d, 0xab, 0x8b, 0xa0, 0x50, 0x97, 0xa3, 0xc9, 0x2e, + 0xc6, 0xc5, 0x4e, 0x97, 0x8d, 0x9b, 0x72, 0x4e, 0xb3, 0x4e, 0x84, 0xac, + 0x83, 0x82, 0x49, 0x3d, 0x8c, 0xb6, 0x51, 0xc5, 0xaa, 0x89, 0xa5, 0x93, + 0xab, 0x91, 0x8d, 0xaa, 0x90, 0xbc, 0x66, 0x85, 0x49, 0xb0, 0x44, 0xcd, + 0x98, 0xb1, 0xd1, 0xbb, 0x3d, 0x81, 0x52, 0x65, 0x71, 0x92, 0x70, 0xbc, + 0x51, 0x64, 0xaf, 0x40, 0x8f, 0xdc, 0xc0, 0xc4, 0xad, 0xa6, 0x38, 0x74, + 0xb5, 0x68, 0x4c, 0xc4, 0x91, 0xb7, 0x76, 0x53, 0xa2, 0x9c, 0x34, 0x99, + 0x82, 0x68, 0x6c, 0x2e, 0x7b, 0x30, 0x7e, 0xa1, 0x8f, 0x7a, 0xd3, 0x77, + 0x83, 0xd4, 0x40, 0x8c, 0xc6, 0x4a, 0xad, 0x9d, 0x37, 0xa4, 0xa9, 0x80, + 0x2c, 0x8e, 0x68, 0xaa, 0xac, 0x79, 0xa9, 0x2d, 0xd4, 0x8c, 0xd7, 0x51, + 0x53, 0x76, 0x87, 0xb2, 0xad, 0xb1, 0x3a, 0xd2, 0x47, 0xa2, 0x5a, 0x94, + 0xa2, 0x82, 0x6e, 0x41, 0xc5, 0x44, 0x3c, 0x9a, 0x98, 0xb9, 0xd4, 0x66, + 0xcc, 0xa9, 0xad, 0x75, 0x86, 0x70, 0x9e, 0x60, 0x70, 0xb3, 0x76, 0x5a, + 0xa4, 0x4c, 0x6d, 0x7d, 0x70, 0x3c, 0x95, 0x5b, 0x57, 0xa5, 0x63, 0x77, + 0x92, 0x74, 0x51, 0xbd, 0x4a, 0x91, 0xaa, 0x8e, 0x69, 0x9f, 0x70, 0xac, + 0x73, 0x38, 0x41, 0x4a, 0x88, 0xa4, 0xab, 0x90, 0x4e, 0xbb, 0x98, 0x3d, + 0x5c, 0x59, 0x45, 0xa8, 0x68, 0x9c, 0x98, 0x73, 0xca, 0x67, 0x9e, 0x4e, + 0xd2, 0x8a, 0x92, 0x9d, 0x85, 0x9e, 0xa0, 0x8d, 0x2b, 0x72, 0x33, 0x4f, + 0x6e, 0xb7, 0x83, 0x88, 0x53, 0xa7, 0xd8, 0xa3, 0xc7, 0x64, 0x7b, 0xc0, + 
0xaa, 0x6d, 0xab, 0x5a, 0xd4, 0x80, 0x62, 0x7d, 0xb1, 0xbf, 0x77, 0x5f, + 0x83, 0x99, 0x6d, 0xa7, 0xa4, 0xc3, 0x8c, 0x7a, 0xa6, 0x6d, 0x5e, 0xad, + 0x3e, 0x5c, 0x7d, 0x86, 0x46, 0x8a, 0x54, 0x4a, 0x88, 0x8d, 0xae, 0x3e, + 0xd0, 0x53, 0xc0, 0xcb, 0x41, 0xcc, 0xb8, 0x7d, 0xc4, 0x9d, 0x72, 0xc7, + 0xa1, 0x7c, 0x57, 0x54, 0x5c, 0x61, 0xae, 0x71, 0x6b, 0x6a, 0xc3, 0xb5, + 0x56, 0x7b, 0x82, 0x4f, 0xb0, 0x3a, 0xc0, 0xb4, 0xbf, 0x61, 0x5b, 0x98, + 0xb0, 0x8d, 0x3b, 0x9f, 0xa9, 0x31, 0x72, 0xbe, 0xbb, 0x3a, 0x6e, 0x80, + 0x57, 0x49, 0x5a, 0x9d, 0x89, 0xae, 0x9e, 0x52, 0x84, 0x8b, 0x47, 0x2f, + 0x7b, 0xbe, 0x88, 0x8d, 0x5a, 0xd5, 0xb7, 0xa1, 0xc9, 0x95, 0x8f, 0xa5, + 0xc9, 0x7f, 0x75, 0xc8, 0x66, 0x9c, 0xba, 0x72, 0xcc, 0xae, 0x6d, 0x61, + 0x8a, 0x80, 0x88, 0x67, 0x95, 0xb3, 0xac, 0x6c, 0xa8, 0x7a, 0x31, 0x62, + 0xd3, 0x79, 0xaf, 0x3e, 0x71, 0xbd, 0xb6, 0x2a, 0x6c, 0x7e, 0x8f, 0x63, + 0xb8, 0x24, 0x97, 0x3a, 0x7b, 0xbd, 0x4a, 0x63, 0xcd, 0x55, 0x86, 0x7a, + 0xd3, 0xbc, 0xaf, 0xd3, 0xbe, 0x6e, 0x7c, 0x4e, 0x49, 0x48, 0xde, 0x37, + 0x43, 0xb0, 0x6d, 0x89, 0x43, 0x7f, 0xb1, 0xb7, 0xa5, 0x33, 0x6f, 0x82, + 0x8d, 0xa3, 0x8b, 0x4e, 0x65, 0x60, 0xc9, 0x69, 0x98, 0xb8, 0x55, 0x84, + 0x7d, 0x85, 0x68, 0xa9, 0x83, 0xce, 0x50, 0x39, 0x91, 0x6a, 0x7a, 0xb4, + 0x50, 0x5e, 0xa1, 0x50, 0xb9, 0xbd, 0x3c, 0xbd, 0x3c, 0x75, 0xaf, 0x63, + 0x3b, 0x61, 0x81, 0xc6, 0x92, 0xb5, 0xc6, 0x62, 0x49, 0xca, 0x48, 0xd1, + 0xd1, 0x99, 0x6f, 0xa6, 0x7c, 0x44, 0x92, 0x71, 0x2e, 0x65, 0x56, 0x90, + 0x99, 0x68, 0x4c, 0x5e, 0x63, 0xae, 0x7b, 0x3f, 0x5d, 0x7c, 0x45, 0xa3, + 0x57, 0x70, 0xbf, 0xa9, 0x94, 0xa7, 0xb9, 0x9a, 0x5a, 0x8d, 0xca, 0x75, + 0x9c, 0x64, 0x70, 0xb2, 0xb7, 0x81, 0x8d, 0xb5, 0x49, 0x50, 0x49, 0x82, + 0x6d, 0x91, 0x31, 0x6a, 0xa2, 0x92, 0xcc, 0x56, 0x5a, 0x5e, 0x91, 0x4c, + 0x94, 0x3d, 0x96, 0x8b, 0x73, 0xd1, 0x74, 0x6f, 0xb2, 0x6c, 0x80, 0xb7, + 0x7f, 0x96, 0xa0, 0x82, 0xb1, 0x59, 0xc5, 0x78, 0xb0, 0x8b, 0x49, 0x50, + 0xa7, 0x46, 0x43, 0xd0, 0x95, 0xcb, 0xa9, 0x8d, 0xc1, 0xa9, 0xb0, 0x6d, + 0x88, 0x44, 0xb8, 0x5f, 0xb3, 0x5d, 0x8a, 0xb2, 0x2d, 0x97, 0xd3, 0xa1, + 0x87, 0x52, 0x59, 0x4d, 0xa3, 0x87, 0x94, 0x55, 0x6f, 0xc0, 0x53, 0x5f, + 0xcc, 0x3d, 0xa4, 0x5e, 0xb7, 0x3f, 0xa1, 0x44, 0x8f, 0x32, 0x6c, 0xaf, + 0x46, 0x4b, 0x4d, 0x7c, 0xe7, 0x4f, 0x8a, 0x79, 0x59, 0xb7, 0xc9, 0xa1, + 0x92, 0xae, 0x99, 0x68, 0x51, 0x3f, 0x63, 0x7e, 0xcf, 0xc2, 0x4a, 0xa2, + 0xaf, 0x69, 0x53, 0x63, 0x96, 0xdc, 0xcb, 0x75, 0xb6, 0x35, 0xae, 0x98, + 0x45, 0x53, 0xb5, 0x72, 0xb2, 0xc1, 0x8d, 0x8e, 0x33, 0x9c, 0xa7, 0xce, + 0x95, 0xc8, 0x91, 0xbb, 0x4a, 0x77, 0xa3, 0x45, 0x4a, 0x57, 0x55, 0x8d, + 0xcb, 0xce, 0x48, 0x5a, 0x34, 0x37, 0x99, 0xb3, 0xaf, 0xc6, 0xcd, 0x51, + 0x63, 0x57, 0xa3, 0x86, 0xad, 0x6a, 0xd3, 0xc6, 0x8f, 0xce, 0xa1, 0xdc, + 0x59, 0xac, 0x9b, 0x7d, 0xa1, 0x8f, 0x7d, 0xc2, 0x31, 0xa4, 0xc5, 0x87, + 0xc1, 0xa3, 0x6d, 0x4d, 0x56, 0x69, 0x84, 0x56, 0x37, 0xb6, 0x4c, 0x48, + 0x55, 0x96, 0x97, 0x40, 0xaa, 0x76, 0x89, 0xd1, 0xc9, 0x6f, 0x37, 0x26, + 0x92, 0x6b, 0x8c, 0x72, 0x74, 0x6e, 0x8f, 0x91, 0xa8, 0xa4, 0xa8, 0x32, + 0x96, 0x74, 0x5f, 0xd3, 0x34, 0x25, 0xb5, 0x79, 0xb2, 0x96, 0x96, 0xd5, + 0x97, 0x3b, 0x90, 0xb7, 0xc0, 0x7c, 0xbd, 0x35, 0xa6, 0xca, 0x4b, 0x95, + 0x58, 0x4c, 0xa2, 0x8f, 0xa2, 0x6f, 0xb1, 0x43, 0xbb, 0x3c, 0xb7, 0x50, + 0xbc, 0x3d, 0x38, 0x61, 0x74, 0x8f, 0x4f, 0x79, 0x9c, 0x49, 0xaa, 0xb9, + 0x60, 0xba, 0xa9, 0x50, 0x36, 0x74, 0xbc, 0x8c, 0xb2, 0xad, 0x77, 0x44, + 0xd7, 0xd6, 0x48, 0x81, 0xb2, 0x81, 0x2a, 0x90, 0x67, 0x8e, 0xb6, 0x9e, + 0x86, 0x74, 0xc3, 0x5c, 0xdf, 0x69, 0xd5, 0x53, 0xa4, 0x7a, 0x8d, 0xc2, + 
0x4d, 0x7f, 0xd0, 0xc1, 0x4f, 0x7d, 0x40, 0x43, 0xce, 0xdc, 0x94, 0x8e, + 0x3d, 0x93, 0x63, 0x56, 0x47, 0x35, 0x77, 0xb0, 0x6f, 0x8f, 0x43, 0x8a, + 0xc4, 0x88, 0x5e, 0x87, 0x8f, 0xa6, 0xbf, 0xd2, 0xa0, 0xab, 0x31, 0x44, + 0x33, 0x88, 0x2a, 0xa8, 0x6f, 0x3d, 0xbd, 0x5c, 0x85, 0x75, 0x36, 0xaf, + 0x7a, 0x44, 0xc4, 0xcf, 0x8f, 0x9b, 0x8d, 0x79, 0xa1, 0x75, 0x4c, 0x60, + 0x82, 0x3c, 0x56, 0x8f, 0x9a, 0x54, 0x62, 0x4e, 0x73, 0x6b, 0x91, 0x57, + 0xa8, 0xb5, 0x9c, 0xcd, 0x3b, 0x36, 0x74, 0xc7, 0x6c, 0x2a, 0x3c, 0x84, + 0x69, 0xcf, 0xa1, 0x7d, 0x9a, 0x7e, 0x56, 0x86, 0x96, 0x43, 0xa8, 0xa3, + 0xc1, 0xc0, 0xce, 0x58, 0x6f, 0xe2, 0x6f, 0xcc, 0x85, 0x5b, 0xc0, 0x71, + 0x59, 0xa8, 0x62, 0x71, 0x8f, 0xb3, 0x72, 0x5b, 0xc5, 0x81, 0xd9, 0x34, + 0xa7, 0x3e, 0x6d, 0x76, 0xc1, 0x3d, 0x97, 0xd6, 0xa5, 0xca, 0x88, 0xc8, + 0x44, 0xcb, 0xa2, 0x8c, 0x52, 0x7a, 0x97, 0x75, 0x2f, 0x7c, 0x59, 0x51, + 0x82, 0xc0, 0xa1, 0x7c, 0x92, 0xaf, 0x67, 0xc3, 0x72, 0xd2, 0x4f, 0xd4, + 0x31, 0x37, 0xab, 0x79, 0x70, 0x77, 0x88, 0x3e, 0xd8, 0x9c, 0x2e, 0x7d, + 0xae, 0xd8, 0xa7, 0x3a, 0xa0, 0xaa, 0x8d, 0x94, 0x44, 0x61, 0x49, 0x85, + 0xde, 0xb2, 0x90, 0xbe, 0x6b, 0xa5, 0xce, 0x6c, 0x3f, 0x99, 0x48, 0xb1, + 0x51, 0xa6, 0x86, 0x52, 0x71, 0xc5, 0xa7, 0x47, 0xc2, 0xa8, 0xcb, 0x6b, + 0xd3, 0xd4, 0x9f, 0x6a, 0x33, 0x35, 0xb3, 0xca, 0xbe, 0xc9, 0x85, 0x50, + 0xba, 0x8d, 0x98, 0xb7, 0xaf, 0x82, 0x78, 0xb5, 0x4b, 0x6c, 0x9c, 0xac, + 0xb2, 0xb2, 0x62, 0xd1, 0x83, 0xcb, 0x7a, 0x4f, 0x5a, 0x89, 0xac, 0x3d, + 0x59, 0x96, 0x34, 0xc2, 0x61, 0x32, 0x71, 0xc6, 0x6e, 0x3f, 0x73, 0x3d, + 0x64, 0xbe, 0x6d, 0x87, 0x50, 0x8e, 0xaf, 0x9a, 0x6b, 0x85, 0xb7, 0x60, + 0x9e, 0x7c, 0x30, 0xba, 0xc1, 0xd4, 0xbf, 0x86, 0xad, 0x9f, 0xb5, 0x45, + 0xb4, 0x4f, 0x6f, 0xa8, 0x62, 0x40, 0x5d, 0xa6, 0xa3, 0x47, 0x67, 0xc1, + 0x66, 0x8c, 0x9c, 0x98, 0x7f, 0x97, 0x5b, 0x67, 0x60, 0x4c, 0x92, 0x38, + 0x74, 0x5f, 0x69, 0x6d, 0x62, 0x38, 0x46, 0xb6, 0xa2, 0x5b, 0x95, 0xcf, + 0x81, 0x46, 0xca, 0x7e, 0xbd, 0x7e, 0x82, 0x6e, 0x52, 0xc7, 0x6b, 0x61, + 0xac, 0xbf, 0xa6, 0x81, 0x69, 0xb0, 0xbd, 0x47, 0x46, 0x3c, 0x7f, 0x75, + 0xa9, 0xd0, 0xbd, 0x44, 0x2f, 0x5f, 0x75, 0x95, 0x8b, 0x72, 0x56, 0xa8, + 0x63, 0x68, 0x9a, 0x59, 0x7a, 0xc2, 0xc9, 0x6f, 0x98, 0xa3, 0x3d, 0xbc, + 0x47, 0xbb, 0xc3, 0x90, 0xd5, 0xd4, 0x54, 0xd1, 0xc6, 0x4f, 0xb2, 0x97, + 0x8c, 0xc0, 0x44, 0xdf, 0xb4, 0x4b, 0x4a, 0xa5, 0x79, 0x2f, 0x3b, 0xc6, + 0x36, 0x7a, 0x35, 0xb2, 0x61, 0x83, 0x56, 0xc6, 0x3f, 0xbb, 0x6f, 0xc8, + 0x6d, 0xcc, 0x3f, 0x7f, 0xb2, 0x81, 0xb1, 0x6f, 0xce, 0x8e, 0xa2, 0xd4, + 0xd1, 0x46, 0x66, 0xa7, 0xc0, 0x77, 0x3c, 0xb7, 0xc8, 0x60, 0x96, 0x74, + 0x77, 0xb4, 0xaa, 0x32, 0x84, 0x60, 0x59, 0x91, 0x7c, 0x34, 0x76, 0x85, + 0x65, 0x56, 0xb5, 0x58, 0x9c, 0x5e, 0x76, 0xc3, 0x6a, 0x82, 0x89, 0x9d, + 0x3d, 0xa0, 0x73, 0x82, 0x2e, 0xba, 0xb7, 0xbf, 0xd0, 0xc2, 0x4e, 0x64, + 0xb9, 0xc3, 0x57, 0xb8, 0x7c, 0x93, 0x97, 0xc7, 0xc4, 0x69, 0x7f, 0x66, + 0xbc, 0xc8, 0xb1, 0x43, 0x37, 0x39, 0x5a, 0x73, 0xc1, 0xa0, 0x52, 0xaf, + 0x7e, 0xbd, 0xcb, 0xc3, 0x56, 0x6d, 0x90, 0xc9, 0x83, 0x9c, 0x6f, 0xb5, + 0x9a, 0x66, 0x8f, 0x79, 0x52, 0x30, 0xd3, 0x54, 0x7c, 0xbb, 0x7a, 0x7d, + 0xb4, 0x43, 0x7b, 0xa2, 0x6e, 0x74, 0x40, 0x8e, 0x94, 0x37, 0xcf, 0x64, + 0xa2, 0xaa, 0x89, 0xba, 0x41, 0xa1, 0x4a, 0xca, 0x38, 0x7d, 0x42, 0xb6, + 0x7f, 0x2a, 0x83, 0x4f, 0x49, 0x2d, 0x3f, 0x68, 0x85, 0x64, 0x76, 0x80, + 0xa8, 0xbe, 0xac, 0x39, 0x54, 0x58, 0xa0, 0x76, 0xd7, 0x5f, 0x76, 0x43, + 0x64, 0x7d, 0x7a, 0xb2, 0x51, 0xa6, 0x86, 0xb0, 0xb9, 0xd3, 0x6c, 0xba, + 0xb2, 0xa9, 0x75, 0xbc, 0x62, 0xb7, 0x2f, 0x30, 0x9d, 0x5e, 0x96, 0x3d, + 
0x74, 0x74, 0x52, 0xa5, 0x48, 0xad, 0x2e, 0xb3, 0xb3, 0xa5, 0x60, 0xca, + 0x47, 0x2e, 0x4b, 0xb8, 0x66, 0xa4, 0x77, 0x43, 0xa6, 0xa3, 0x64, 0x48, + 0xa3, 0x7c, 0x6c, 0xab, 0x45, 0x48, 0x99, 0x5c, 0xac, 0x45, 0xad, 0x79, + 0x9b, 0x8e, 0xaf, 0x2e, 0xb7, 0x9e, 0x8c, 0x43, 0x95, 0xba, 0x9c, 0x97, + 0xba, 0x4a, 0x33, 0x6a, 0x51, 0x30, 0x7b, 0x3d, 0x9b, 0xd6, 0x58, 0x9b, + 0x97, 0x6f, 0xab, 0x81, 0x37, 0x4a, 0x34, 0xd3, 0x8a, 0x56, 0x82, 0xc3, + 0x48, 0x7f, 0x7c, 0x57, 0x6b, 0x71, 0x91, 0x66, 0x7d, 0xdc, 0x44, 0xca, + 0x62, 0xb8, 0x4d, 0xa5, 0x9c, 0xa1, 0x89, 0x74, 0x4b, 0x76, 0xb9, 0x9f, + 0x5c, 0x34, 0xa8, 0xab, 0x4b, 0x52, 0x6c, 0xa2, 0xb0, 0x7f, 0x37, 0xc3, + 0x5a, 0x7e, 0xc9, 0xc1, 0xaa, 0x6d, 0x56, 0xcf, 0xa3, 0xaf, 0x37, 0x6f, + 0x96, 0xba, 0x7a, 0x9b, 0x23, 0x8b, 0x76, 0xbe, 0x5b, 0x90, 0xa3, 0xac, + 0x64, 0x8e, 0x48, 0xb1, 0x8d, 0xbd, 0x34, 0x3f, 0xaf, 0x4e, 0x96, 0x8c, + 0x4e, 0x70, 0x4a, 0x36, 0x43, 0x38, 0x76, 0x5e, 0xaf, 0x93, 0xa6, 0x2d, + 0x99, 0x34, 0x6b, 0xc1, 0xc8, 0x9c, 0xb0, 0x94, 0xc2, 0x3a, 0x85, 0xca, + 0x76, 0xb9, 0x8f, 0x33, 0x99, 0xd4, 0xbe, 0xa9, 0x9a, 0xa3, 0x4b, 0xb5, + 0x60, 0x2f, 0x7c, 0x71, 0x61, 0x3f, 0xb3, 0x48, 0xda, 0x5c, 0x57, 0xcd, + 0xae, 0xa0, 0x5d, 0xb8, 0x93, 0xa5, 0x5b, 0x58, 0xef, 0xa7, 0xa4, 0x49, + 0x96, 0xca, 0xca, 0x72, 0x84, 0x56, 0x40, 0x46, 0x5b, 0xc1, 0x35, 0xb7, + 0xd5, 0x3f, 0x3b, 0x5e, 0x85, 0x2b, 0xb1, 0x96, 0xc7, 0x61, 0x79, 0x9d, + 0x2e, 0xb2, 0xcb, 0xba, 0x41, 0xd1, 0xdd, 0x75, 0x64, 0x6d, 0xae, 0x44, + 0xbe, 0x57, 0xab, 0x60, 0xb8, 0x8f, 0x9d, 0xa4, 0x96, 0x6a, 0xb2, 0x8e, + 0xcd, 0x51, 0xc5, 0x91, 0x80, 0xc4, 0x79, 0x34, 0x41, 0x7b, 0x4c, 0x84, + 0x92, 0xb0, 0x94, 0xd4, 0xb5, 0x6d, 0x9f, 0x6a, 0x6e, 0x3d, 0x53, 0x81, + 0x5c, 0xa8, 0x7f, 0x3f, 0x82, 0x45, 0xd4, 0x34, 0xc1, 0xa4, 0xd5, 0xb9, + 0xb7, 0x6d, 0xc5, 0x74, 0x73, 0x36, 0x95, 0xcc, 0x34, 0x9f, 0x44, 0xa4, + 0x97, 0xd5, 0x75, 0x82, 0xbe, 0x71, 0x54, 0x93, 0x38, 0x8f, 0xa2, 0x80, + 0x4d, 0xad, 0xa0, 0x94, 0x2f, 0x39, 0xd8, 0x3a, 0xcc, 0xae, 0x80, 0x91, + 0x51, 0xcb, 0x92, 0x94, 0xa5, 0x61, 0x7d, 0x33, 0xc3, 0x36, 0xd4, 0x48, + 0xb8, 0x6f, 0x8b, 0x72, 0x6d, 0xcc, 0x7d, 0x56, 0x3e, 0x50, 0xbb, 0x69, + 0x98, 0x51, 0xcb, 0x55, 0x89, 0x6f, 0xa3, 0x4a, 0x83, 0x65, 0x66, 0x47, + 0xc3, 0x45, 0x33, 0x92, 0x4f, 0xbb, 0xb8, 0x49, 0x89, 0xb6, 0xc8, 0x4b, + 0xd2, 0x82, 0xaa, 0x85, 0x69, 0x8c, 0x4e, 0x7a, 0x65, 0x5e, 0x9c, 0xb0, + 0xb2, 0x7d, 0x40, 0x5b, 0xae, 0x64, 0x47, 0x3d, 0x9a, 0xd5, 0x74, 0x65, + 0xb7, 0x9b, 0xc7, 0xda, 0x7c, 0xc6, 0x8e, 0xb2, 0x80, 0x89, 0xad, 0x86, + 0xd1, 0xe9, 0xa4, 0xc1, 0x41, 0x8d, 0x9f, 0x88, 0x84, 0x64, 0x35, 0xa7, + 0x6e, 0xb2, 0xc7, 0x43, 0xa4, 0xc9, 0xc5, 0x52, 0x4a, 0xcb, 0x29, 0x54, + 0x7e, 0xbd, 0x75, 0x2e, 0xcf, 0xbc, 0xb6, 0x5d, 0x7e, 0xc1, 0x5e, 0x44, + 0xc7, 0xac, 0xb4, 0xa2, 0x64, 0x76, 0x52, 0xd1, 0x3e, 0x8f, 0x3c, 0xa4, + 0xc3, 0x83, 0x9a, 0xc6, 0x3b, 0x4f, 0x41, 0x73, 0x8c, 0x38, 0xa7, 0x6d, + 0xb9, 0xbb, 0x4e, 0x93, 0x86, 0x46, 0x6e, 0xb8, 0xac, 0x6e, 0x7b, 0x6f, + 0xad, 0x4c, 0x99, 0xb3, 0xcc, 0x3f, 0x61, 0x74, 0xcb, 0xd0, 0xbb, 0x36, + 0x51, 0x7c, 0x3e, 0x70, 0x48, 0xa4, 0x39, 0xc3, 0x86, 0xc2, 0x7a, 0xa5, + 0xae, 0x9b, 0xb2, 0x48, 0x75, 0x46, 0x41, 0x77, 0x5c, 0xc1, 0xab, 0xa4, + 0x90, 0xa2, 0x48, 0x64, 0xcf, 0x9f, 0x5a, 0xad, 0x4e, 0x38, 0x7c, 0x71, + 0x6c, 0x59, 0x86, 0xc5, 0x4e, 0x36, 0x45, 0x80, 0x4f, 0xa0, 0x8c, 0x44, + 0xb7, 0xa5, 0x86, 0x48, 0x42, 0x88, 0xd0, 0xb8, 0x4c, 0x38, 0x92, 0x36, + 0x80, 0x3d, 0xc9, 0x59, 0xd3, 0xb8, 0x47, 0xb8, 0x86, 0x86, 0x5a, 0xbf, + 0x3d, 0x86, 0xa3, 0x8b, 0x4a, 0x53, 0x8e, 0x98, 0x59, 0xc0, 0x7a, 0x45, + 
0x76, 0x83, 0x9f, 0x6d, 0xaf, 0x64, 0x32, 0xa4, 0xcd, 0x4b, 0x34, 0x39, + 0x31, 0x3b, 0xd1, 0x80, 0x69, 0x36, 0xc9, 0x62, 0x5e, 0x6a, 0x9a, 0x77, + 0x88, 0xa2, 0x73, 0xcf, 0x3a, 0x45, 0x7b, 0x96, 0x73, 0xb2, 0xa8, 0x43, + 0x67, 0xc3, 0xcc, 0x4b, 0x89, 0x5d, 0x71, 0x93, 0x74, 0xb2, 0x96, 0xb6, + 0x82, 0x5b, 0x52, 0x7b, 0x47, 0x8e, 0x2e, 0x7b, 0x98, 0x95, 0xba, 0x96, + 0xc4, 0x70, 0x99, 0x48, 0xc0, 0xbb, 0x32, 0xb8, 0x89, 0x6e, 0x59, 0x82, + 0xaf, 0x6b, 0x48, 0xc2, 0x8b, 0x36, 0x65, 0x9e, 0x3e, 0x73, 0x51, 0x8b, + 0x6d, 0x66, 0xa2, 0xb3, 0x75, 0xcd, 0x38, 0x3a, 0xa5, 0x6a, 0x75, 0x61, + 0x47, 0x34, 0xd2, 0x5c, 0x92, 0x44, 0x6f, 0x35, 0x35, 0xce, 0x88, 0xaf, + 0x4c, 0x34, 0x9e, 0x95, 0x8d, 0xa1, 0xa2, 0xd2, 0x63, 0x49, 0x9b, 0xd2, + 0x9b, 0x3d, 0x8f, 0x71, 0x3d, 0x90, 0xc4, 0x9a, 0x79, 0x4b, 0x9e, 0xc8, + 0x6b, 0xb2, 0xb1, 0x49, 0x7b, 0x87, 0x76, 0x3c, 0x9e, 0x37, 0x83, 0x96, + 0xb9, 0x8f, 0xba, 0x3e, 0x53, 0x79, 0xb6, 0x65, 0xb3, 0x6c, 0xd3, 0x93, + 0x6b, 0x9e, 0x40, 0x8a, 0x9b, 0xc2, 0x5f, 0x86, 0x82, 0x71, 0xac, 0x79, + 0x9d, 0x75, 0x77, 0x64, 0x7a, 0x35, 0xa3, 0x3e, 0x33, 0x94, 0x5c, 0xa8, + 0x5e, 0x2e, 0xa3, 0x5e, 0x34, 0x8a, 0x7b, 0x92, 0xad, 0x3d, 0xa1, 0xcb, + 0x47, 0x69, 0xc6, 0xcd, 0x60, 0x4e, 0x95, 0x43, 0x41, 0x2e, 0x60, 0x83, + 0xc5, 0x3b, 0x9c, 0x5a, 0xcc, 0xb7, 0x6a, 0x3d, 0x9f, 0x3a, 0x9e, 0xa0, + 0xb8, 0x47, 0x84, 0xbe, 0x8f, 0xba, 0x50, 0x9e, 0xc9, 0x8f, 0x7e, 0x98, + 0xcc, 0x39, 0x6f, 0x47, 0xa4, 0x82, 0x81, 0x35, 0x68, 0x98, 0x60, 0x79, + 0x88, 0x64, 0x87, 0x7d, 0x47, 0xad, 0xbf, 0x5e, 0x45, 0x46, 0x57, 0xbb, + 0x7c, 0x33, 0xa2, 0xc7, 0x94, 0x4b, 0x9d, 0xbb, 0x9d, 0x8c, 0x8a, 0xbd, + 0xb1, 0xb2, 0xc4, 0x56, 0x55, 0x99, 0x98, 0x7c, 0xc1, 0x53, 0x39, 0xc7, + 0x94, 0xcd, 0xc4, 0x70, 0x6f, 0x44, 0x78, 0x6a, 0x92, 0xa6, 0xcd, 0x96, + 0x6f, 0x48, 0x3d, 0x48, 0x66, 0xc8, 0x93, 0xcc, 0x47, 0x4e, 0xaf, 0x68, + 0xc1, 0xcd, 0x85, 0x74, 0xcb, 0xb6, 0x80, 0x3e, 0xab, 0xb2, 0x7d, 0xcd, + 0x48, 0x6d, 0x7f, 0xc4, 0xa0, 0xa4, 0xc7, 0x43, 0xae, 0x9a, 0x68, 0x90, + 0x35, 0x36, 0x56, 0xc0, 0x31, 0x86, 0x76, 0xac, 0x6f, 0x48, 0x53, 0xba, + 0x99, 0x6e, 0xcd, 0xaf, 0x6f, 0xcf, 0xb7, 0x7e, 0x48, 0x5b, 0x3f, 0x53, + 0x95, 0x71, 0xcd, 0xb8, 0xa7, 0x6a, 0x30, 0x90, 0x4d, 0x33, 0x47, 0x59, + 0xc5, 0x32, 0x86, 0xd0, 0x52, 0xac, 0x73, 0xa3, 0x3b, 0xc8, 0x4b, 0x9a, + 0xae, 0xa5, 0x63, 0x9b, 0x34, 0xba, 0x97, 0xb1, 0x7e, 0x42, 0x43, 0xbb, + 0x3b, 0x41, 0xc7, 0x57, 0xb9, 0x94, 0x69, 0x32, 0xac, 0x97, 0xcf, 0x3b, + 0x87, 0x8b, 0x83, 0x48, 0xaf, 0x52, 0xac, 0x71, 0x91, 0x69, 0x34, 0x3a, + 0x85, 0x98, 0x83, 0x4f, 0x63, 0x4b, 0xc5, 0xb2, 0xb8, 0x5e, 0x5a, 0xbe, + 0x96, 0xa9, 0xbb, 0x5c, 0x91, 0x8d, 0x51, 0x82, 0xb9, 0xab, 0x83, 0x40, + 0x35, 0x8f, 0x89, 0x6c, 0x39, 0x41, 0x39, 0x35, 0xb0, 0x54, 0x81, 0x70, + 0x60, 0xa1, 0x80, 0x64, 0x5d, 0x66, 0xb8, 0x39, 0x9d, 0xb0, 0x4e, 0xba, + 0xce, 0x4d, 0xae, 0x62, 0x98, 0xc2, 0xc9, 0x5b, 0x2e, 0x5a, 0xac, 0xbd, + 0x51, 0xb3, 0x56, 0x43, 0x9d, 0x37, 0xab, 0x64, 0x47, 0x4b, 0x9c, 0xc1, + 0xa2, 0x83, 0x66, 0x89, 0x51, 0x54, 0x69, 0xae, 0xb8, 0x6f, 0xc8, 0x80, + 0x45, 0xaf, 0x45, 0x44, 0xb6, 0xb6, 0xa8, 0xc7, 0x8c, 0x4a, 0xd1, 0xa2, + 0x33, 0xa2, 0xc3, 0xcc, 0x52, 0xcf, 0xc8, 0x4b, 0xab, 0x62, 0x4b, 0x41, + 0xb2, 0xc7, 0xb7, 0x7a, 0x76, 0x5a, 0x54, 0x34, 0x51, 0x60, 0x65, 0x7e, + 0x72, 0x30, 0x7b, 0xa8, 0x89, 0x86, 0x55, 0x4c, 0x97, 0x7e, 0x5b, 0x81, + 0x9b, 0x99, 0x44, 0x99, 0xd1, 0x8d, 0x8c, 0x64, 0x59, 0x40, 0xc6, 0x67, + 0x8e, 0x39, 0x3f, 0x39, 0xb4, 0x6b, 0x68, 0xbc, 0x62, 0x56, 0x74, 0x7b, + 0x97, 0x47, 0x81, 0x4a, 0x3a, 0x65, 0x7a, 0x4e, 0x70, 0x7f, 0x4a, 0x8a, + 
0x98, 0x31, 0x51, 0x46, 0x62, 0x96, 0x70, 0x6f, 0x86, 0x44, 0x70, 0x8e, + 0x92, 0x3f, 0x84, 0xb9, 0x97, 0xce, 0xc5, 0xa3, 0x5f, 0x53, 0x8f, 0x5d, + 0xb9, 0x7c, 0xc6, 0x58, 0x58, 0x5c, 0xc7, 0x61, 0x9b, 0xbc, 0xa4, 0x3e, + 0x82, 0x43, 0xa3, 0xaf, 0x37, 0x6a, 0xab, 0x38, 0xb4, 0x63, 0x5c, 0x95, + 0xa2, 0x8d, 0x7b, 0x68, 0x7b, 0xc3, 0x7e, 0xc9, 0x8e, 0x44, 0x40, 0x64, + 0x65, 0x63, 0x7c, 0x53, 0x99, 0x52, 0x52, 0x7c, 0x84, 0xb5, 0xaa, 0xa1, + 0x97, 0x5a, 0xb5, 0xb7, 0x6e, 0x63, 0x4f, 0xbe, 0x67, 0x92, 0xd0, 0x9d, + 0x37, 0x3c, 0xb0, 0x7e, 0xae, 0xb3, 0xb2, 0xbc, 0x40, 0x81, 0x3b, 0x5c, + 0xac, 0x97, 0x80, 0xc7, 0x48, 0x8b, 0x52, 0xb5, 0x59, 0x40, 0xbb, 0x4d, + 0xa8, 0x46, 0x4f, 0x37, 0x32, 0xc7, 0x7f, 0x79, 0x94, 0x5a, 0xd0, 0x93, + 0x50, 0x85, 0xba, 0x83, 0x66, 0x3e, 0xa7, 0xab, 0xbf, 0x42, 0x82, 0x83, + 0xbb, 0xce, 0x3a, 0x67, 0x55, 0x38, 0x84, 0x4b, 0x6d, 0xcf, 0x98, 0x88, + 0xc6, 0x8d, 0x63, 0xc3, 0x55, 0xb6, 0x32, 0x8b, 0x3d, 0x99, 0x35, 0x82, + 0x5b, 0x6f, 0x78, 0x3f, 0x4c, 0xb4, 0x96, 0xa5, 0x4b, 0xb7, 0xb6, 0x6a, + 0xac, 0x9e, 0x62, 0x41, 0x47, 0x34, 0x63, 0xcf, 0xa9, 0x39, 0x99, 0xb8, + 0xab, 0x5b, 0x3e, 0x66, 0xc5, 0xc7, 0x35, 0x8f, 0x95, 0x7f, 0x8f, 0x88, + 0x59, 0xa2, 0xb2, 0x45, 0x96, 0xc1, 0x84, 0x47, 0xaf, 0x79, 0xc6, 0x4a, + 0x6f, 0xc9, 0x65, 0xc2, 0x91, 0x8b, 0x88, 0x99, 0xba, 0x71, 0x65, 0x77, + 0x2e, 0xa7, 0x9e, 0x64, 0x7d, 0x44, 0x64, 0x4a, 0xbf, 0x93, 0x60, 0xc6, + 0x89, 0xbb, 0x6c, 0x4d, 0x3f, 0xb6, 0xa9, 0x62, 0x57, 0x7c, 0x58, 0x34, + 0xad, 0x58, 0x32, 0x94, 0x63, 0xb6, 0x5f, 0x95, 0x54, 0xb7, 0x78, 0xa1, + 0x8e, 0x92, 0x7b, 0x88, 0x4c, 0x60, 0x59, 0x46, 0x5f, 0xaa, 0x64, 0x76, + 0x88, 0x4e, 0x6a, 0x77, 0xb1, 0xba, 0x8e, 0xb3, 0xa7, 0xa0, 0xb4, 0x56, + 0xd0, 0x8a, 0x34, 0x79, 0x61, 0xc3, 0x51, 0xa0, 0x56, 0x6f, 0x96, 0x86, + 0x77, 0x58, 0x5b, 0x45, 0xbf, 0x4a, 0x90, 0x91, 0x83, 0xaa, 0x65, 0xc9, + 0x3d, 0x86, 0x8e, 0xb0, 0x7e, 0x51, 0xc3, 0x65, 0x5f, 0x42, 0x7d, 0xce, + 0x7e, 0x69, 0xc1, 0x6e, 0x6c, 0x88, 0x58, 0x37, 0xab, 0x87, 0x6f, 0x6d, + 0x56, 0x68, 0xc4, 0x88, 0xc7, 0x32, 0x43, 0xd1, 0x97, 0xb2, 0x4d, 0xbf, + 0xb3, 0xd2, 0x6a, 0x45, 0xcb, 0x8f, 0x98, 0xc0, 0x47, 0x93, 0xb4, 0x37, + 0xc5, 0xb8, 0x77, 0x34, 0xc5, 0xca, 0x5c, 0xca, 0xaa, 0x61, 0xb3, 0x4e, + 0xc5, 0xb2, 0x54, 0x61, 0x71, 0x76, 0xa0, 0x9c, 0x3f, 0x9a, 0x67, 0xbd, + 0xb8, 0xc9, 0x79, 0x52, 0xc9, 0x37, 0xc0, 0xb8, 0x77, 0x47, 0x39, 0x3a, + 0x9a, 0x38, 0x39, 0xca, 0x8a, 0xa0, 0x9a, 0xb0, 0x90, 0xa6, 0xb7, 0x74, + 0x8c, 0x53, 0x3a, 0xc9, 0x4d, 0x3c, 0x7b, 0xbe, 0x91, 0x8f, 0x8a, 0x4f, + 0x3b, 0xa0, 0x48, 0x56, 0x32, 0x64, 0x73, 0xb1, 0x80, 0x55, 0x7f, 0x4f, + 0xa2, 0x84, 0x8a, 0xb4, 0xb5, 0x84, 0x40, 0x57, 0xa3, 0xb9, 0x96, 0xa6, + 0x7f, 0x9d, 0xba, 0x59, 0x6b, 0x61, 0xc8, 0x64, 0xb3, 0xbe, 0xa6, 0xaf, + 0x74, 0x36, 0x7d, 0x8d, 0x9d, 0x9d, 0x7e, 0x8e, 0xa4, 0x53, 0xb1, 0x92, + 0xbe, 0xb8, 0xd2, 0x9e, 0x5a, 0xc8, 0x87, 0xc4, 0xa6, 0xba, 0x47, 0x50, + 0x68, 0x66, 0x4a, 0x54, 0xa2, 0x73, 0xc4, 0xc1, 0xad, 0xbc, 0x6a, 0x7b, + 0x89, 0x36, 0x8d, 0x36, 0x44, 0x47, 0x77, 0xc2, 0xc9, 0x56, 0xb1, 0xa0, + 0xb8, 0x9a, 0x5f, 0x88, 0xb9, 0xc3, 0x46, 0x71, 0x57, 0x84, 0xb7, 0x89, + 0xb0, 0xc5, 0xb9, 0x46, 0xbf, 0x49, 0x7d, 0x8d, 0x46, 0xb6, 0x52, 0xae, + 0xb9, 0x74, 0x34, 0xac, 0x88, 0x38, 0x54, 0xac, 0x88, 0x5e, 0xa1, 0x39, + 0x3d, 0x94, 0xc0, 0xb4, 0xab, 0xb4, 0x48, 0x74, 0x65, 0x2f, 0xb0, 0x78, + 0x7a, 0x9c, 0xc6, 0xb8, 0x9c, 0x68, 0x62, 0x8e, 0x41, 0x34, 0xb8, 0x78, + 0x86, 0x77, 0x67, 0x59, 0x83, 0x32, 0x81, 0xc6, 0x5a, 0x60, 0x4b, 0xbf, + 0x8f, 0xcd, 0x4a, 0x96, 0x43, 0xc5, 0x56, 0x3c, 0xa5, 0x92, 0x5b, 0x62, + 
0x45, 0x52, 0x65, 0x84, 0x3d, 0x9a, 0xba, 0x86, 0x51, 0x42, 0x52, 0x3c, + 0x8b, 0x7d, 0xc4, 0x6e, 0x9b, 0xc0, 0xd2, 0x40, 0x94, 0x63, 0x6a, 0x73, + 0xd1, 0x6a, 0x6d, 0xa4, 0x40, 0x54, 0x41, 0x3e, 0x43, 0x53, 0x46, 0x55, + 0x64, 0x84, 0x44, 0x42, 0x7c, 0xbf, 0xc2, 0x43, 0xc0, 0x9e, 0xc3, 0x46, + 0x4a, 0x9e, 0x9e, 0x72, 0x82, 0xd4, 0x70, 0x5a, 0x40, 0x4f, 0x67, 0x89, + 0xac, 0xad, 0x33, 0x93, 0xb0, 0xaa, 0x66, 0x9b, 0x96, 0xa3, 0x61, 0x3b, + 0x40, 0x9b, 0xaa, 0x3d, 0xa5, 0x92, 0x4c, 0x6a, 0x67, 0x7b, 0x93, 0xc5, + 0xba, 0xc0, 0x86, 0x8e, 0x89, 0xa3, 0xc7, 0x60, 0x4a, 0x9c, 0x7f, 0x51, + 0x3f, 0x89, 0x7f, 0x9c, 0xb6, 0x6c, 0xad, 0x50, 0x41, 0xc5, 0x8d, 0xc9, + 0xcc, 0x75, 0x3b, 0x52, 0x55, 0x52, 0xba, 0x72, 0x64, 0x45, 0xaa, 0x99, + 0xb9, 0xb6, 0x3d, 0x9d, 0x87, 0x95, 0xce, 0x7d, 0xa4, 0x83, 0x76, 0x91, + 0x8c, 0x34, 0x80, 0xba, 0x7d, 0xa6, 0xc1, 0x3c, 0x4e, 0x3c, 0x3a, 0x66, + 0x70, 0x40, 0x76, 0x5d, 0x77, 0x2f, 0x47, 0x4d, 0x6b, 0x9d, 0x9b, 0x55, + 0xbf, 0xb4, 0xc6, 0x8f, 0xc3, 0xa3, 0xbd, 0xcb, 0xae, 0x6f, 0xb1, 0xc0, + 0xc6, 0xa7, 0x5d, 0x7d, 0x89, 0xa5, 0x45, 0xb3, 0x7a, 0x68, 0xb5, 0xa8, + 0x85, 0xc6, 0xb6, 0xcd, 0x42, 0x5a, 0x5c, 0x62, 0x9d, 0x3a, 0x8f, 0xc5, + 0x82, 0x66, 0x4e, 0xa5, 0x9f, 0x4a, 0xaf, 0x56, 0xd2, 0x83, 0xb6, 0x8b, + 0x6e, 0x81, 0xd0, 0xc1, 0x51, 0x9b, 0x41, 0x53, 0x4c, 0xa6, 0x59, 0x6c, + 0x85, 0xd0, 0x45, 0x72, 0x84, 0x74, 0x9c, 0x89, 0x7d, 0x73, 0x6c, 0x3d, + 0x7d, 0xbf, 0x93, 0x92, 0xa9, 0xaa, 0x5e, 0x9b, 0x30, 0x8e, 0x98, 0x42, + 0x6f, 0x92, 0x37, 0xb5, 0xad, 0xa2, 0x6e, 0x88, 0x3d, 0xbb, 0x80, 0x9d, + 0x9a, 0x3e, 0x42, 0xcb, 0x79, 0x38, 0x4c, 0xaf, 0xc2, 0x63, 0x75, 0xc4, + 0x9a, 0x68, 0x48, 0xa2, 0x38, 0x91, 0x4b, 0x5d, 0x9b, 0x66, 0x9a, 0xb8, + 0x3e, 0x38, 0xba, 0xc7, 0xb9, 0x59, 0x6e, 0x56, 0x68, 0xd3, 0x6d, 0x3f, + 0x76, 0x67, 0x85, 0x7c, 0x5f, 0x37, 0x74, 0x95, 0xa4, 0xb1, 0x5f, 0x62, + 0x4c, 0xa7, 0xbf, 0x9d, 0x72, 0x47, 0x3d, 0x56, 0x99, 0x46, 0x78, 0x4d, + 0x37, 0x94, 0x4a, 0xb5, 0x9f, 0xb0, 0xa0, 0x76, 0x2f, 0x5a, 0x77, 0xa0, + 0xb6, 0x2f, 0xa6, 0x99, 0x3a, 0x78, 0x87, 0x7b, 0x82, 0x93, 0xca, 0xcb, + 0x2d, 0x4b, 0x45, 0xb1, 0x7e, 0xbe, 0xa7, 0x87, 0x79, 0xb3, 0x73, 0xbf, + 0xd5, 0x49, 0x55, 0x7c, 0x49, 0x6c, 0xb9, 0x65, 0x52, 0x9f, 0xb0, 0x73, + 0xd0, 0x78, 0x6d, 0x9d, 0x71, 0xdb, 0x83, 0x69, 0xa9, 0x7c, 0xb3, 0xa1, + 0xb7, 0x3b, 0x7b, 0x69, 0xa5, 0x90, 0x35, 0x68, 0x88, 0x6c, 0xbf, 0xb6, + 0xd1, 0x9e, 0xa2, 0xaf, 0x47, 0x6b, 0xa4, 0x76, 0xb1, 0xbb, 0x40, 0xc1, + 0x62, 0xbd, 0x48, 0x71, 0x4b, 0xbd, 0x8d, 0xa7, 0x97, 0x87, 0x73, 0xad, + 0x97, 0x81, 0xc8, 0xbd, 0xaf, 0x21, 0xce, 0x62, 0x90, 0x2d, 0xb3, 0x72, + 0xcb, 0xb1, 0xa9, 0xcd, 0x55, 0xc0, 0x41, 0xb9, 0xb0, 0x50, 0x3f, 0x8e, + 0xc2, 0x59, 0x76, 0xb7, 0x79, 0xb0, 0x47, 0x6d, 0x85, 0xa5, 0xa2, 0x6a, + 0xb5, 0x7e, 0x8c, 0x30, 0x54, 0xcb, 0x58, 0x8f, 0xba, 0x4e, 0xd8, 0xb9, + 0xcc, 0x3c, 0x8b, 0xb7, 0x8e, 0x69, 0xbb, 0x52, 0x67, 0x73, 0x68, 0xaf, + 0x7a, 0xc3, 0x4e, 0x30, 0x7c, 0xa8, 0xbc, 0x4e, 0xab, 0x7f, 0x2a, 0x94, + 0x56, 0x4d, 0x82, 0xb5, 0x8a, 0x60, 0xa3, 0x5d, 0x83, 0x4a, 0x55, 0x72, + 0xca, 0x58, 0x79, 0xbd, 0x55, 0x42, 0x89, 0x89, 0x6d, 0x54, 0xa4, 0x3c, + 0x5a, 0xd0, 0x85, 0xa3, 0xc9, 0xc3, 0x65, 0xd2, 0x9b, 0x3d, 0x92, 0x7e, + 0xb7, 0x2c, 0xa2, 0xb2, 0x4c, 0x38, 0xcd, 0x9b, 0xb6, 0x7f, 0x8c, 0x50, + 0x32, 0xa7, 0x76, 0xba, 0x99, 0x4b, 0xca, 0x44, 0x78, 0xa8, 0x5d, 0xd0, + 0x8f, 0x8b, 0x49, 0xa9, 0x32, 0x95, 0x63, 0x4c, 0xc0, 0xaf, 0x8e, 0x63, + 0x62, 0xcb, 0x5b, 0x93, 0xca, 0x25, 0xcd, 0xb1, 0x41, 0x3e, 0x9e, 0xcd, + 0x58, 0x90, 0x55, 0x68, 0x57, 0x53, 0x78, 0xc5, 0x9f, 0x68, 0x7a, 0x62, + 
0xb6, 0xd3, 0xd3, 0xa4, 0x6d, 0x4c, 0x4a, 0x3d, 0xcf, 0x63, 0x7b, 0xc8, + 0x77, 0x2f, 0xb2, 0x46, 0x89, 0x85, 0x4a, 0x95, 0x2f, 0x8e, 0xad, 0x82, + 0x7b, 0xa9, 0x83, 0x98, 0x5f, 0x82, 0x83, 0x87, 0x8c, 0x97, 0x94, 0xa0, + 0x5d, 0x73, 0xbc, 0xd2, 0xaf, 0x38, 0xcd, 0x9d, 0x67, 0x8e, 0x7b, 0x2d, + 0x44, 0x24, 0x46, 0xc3, 0x59, 0x84, 0xc1, 0xb2, 0x51, 0x97, 0xb7, 0x62, + 0x8b, 0x46, 0x40, 0x56, 0xa6, 0xbe, 0x93, 0x8f, 0x92, 0x37, 0xba, 0x86, + 0x8a, 0x46, 0x52, 0x7a, 0x7f, 0xd6, 0x7e, 0x44, 0xc8, 0xbf, 0x81, 0x87, + 0x95, 0xa0, 0x55, 0xd1, 0x3c, 0xc8, 0xa1, 0x87, 0xbd, 0x3e, 0x6a, 0x93, + 0x9d, 0x6e, 0x7a, 0x4b, 0x85, 0xa9, 0x60, 0x9f, 0xa5, 0xb7, 0x7f, 0x78, + 0x8b, 0x55, 0x92, 0xb9, 0x3b, 0x5f, 0x9d, 0xd8, 0x84, 0x95, 0xd0, 0xb0, + 0x9f, 0x6f, 0x36, 0xb6, 0xbd, 0x6e, 0x83, 0x6e, 0x54, 0xc6, 0x6e, 0x41, + 0x7c, 0x9f, 0xc0, 0x5e, 0xd9, 0xa7, 0xbb, 0x6e, 0xa2, 0xc4, 0x3b, 0x4f, + 0x58, 0x52, 0x98, 0x87, 0x6c, 0x78, 0xab, 0x42, 0x69, 0x35, 0x90, 0x9f, + 0x5e, 0x63, 0xc7, 0x9f, 0x9c, 0x99, 0x4f, 0x4a, 0xcf, 0x66, 0xc1, 0x9f, + 0x3b, 0x2f, 0x42, 0xb3, 0x49, 0x8c, 0x63, 0xa7, 0x4c, 0xa9, 0x7a, 0x86, + 0x3a, 0x99, 0x6f, 0xc5, 0xba, 0x60, 0xc4, 0x93, 0xa4, 0x8a, 0x6b, 0x3a, + 0x7e, 0x52, 0xa0, 0x4f, 0x67, 0x83, 0x66, 0xa7, 0xc3, 0x4b, 0x76, 0x89, + 0x9b, 0x89, 0x72, 0x61, 0xa1, 0x3b, 0xbe, 0xd0, 0x93, 0x31, 0x43, 0x91, + 0xc1, 0x96, 0xa4, 0x7b, 0x4b, 0xba, 0xb1, 0xc5, 0xbf, 0x53, 0xc0, 0xb9, + 0x9a, 0x40, 0x34, 0x5f, 0xbf, 0x6c, 0xc7, 0x9e, 0xcd, 0x83, 0x6a, 0xb7, + 0x99, 0xd4, 0xa5, 0x74, 0xc4, 0x3f, 0x7a, 0x54, 0x67, 0x86, 0xbf, 0x81, + 0x82, 0xc9, 0x9b, 0x47, 0x9e, 0x90, 0x4b, 0x5c, 0x46, 0x5a, 0xd7, 0xbb, + 0x67, 0x44, 0xc5, 0x52, 0x39, 0xd6, 0x64, 0x43, 0xc7, 0x9e, 0x47, 0x9c, + 0x8c, 0x60, 0xb7, 0x5c, 0x94, 0x85, 0xc8, 0x6b, 0x7b, 0x60, 0x94, 0x67, + 0x8e, 0xbb, 0xb9, 0x35, 0x9d, 0xcc, 0x51, 0x9e, 0xc0, 0x6e, 0x4e, 0xbb, + 0xc7, 0x94, 0xbd, 0xa5, 0x7d, 0xa1, 0x92, 0x87, 0x79, 0xb6, 0x93, 0x6b, + 0xd4, 0x6f, 0xc8, 0x8d, 0xe0, 0xcd, 0x31, 0x40, 0xa3, 0xc2, 0x96, 0x2c, + 0x30, 0xad, 0x34, 0x31, 0xa3, 0x6a, 0xb0, 0x6a, 0x8f, 0x44, 0x97, 0x99, + 0x84, 0xa8, 0x78, 0x85, 0x83, 0x8a, 0x44, 0x6c, 0x9d, 0xad, 0xbe, 0x58, + 0x5b, 0x4c, 0x3b, 0xbd, 0x46, 0xa4, 0x95, 0x9a, 0x65, 0x79, 0xc0, 0xb4, + 0x6a, 0x53, 0xc6, 0x6c, 0x7a, 0xd1, 0xb2, 0x75, 0x4d, 0xad, 0xc5, 0x45, + 0x5c, 0x45, 0xc4, 0xb0, 0xc7, 0xaa, 0x65, 0x5e, 0x42, 0x6e, 0x49, 0x9f, + 0x27, 0x5d, 0x61, 0x39, 0xce, 0xa9, 0x8f, 0xba, 0x6b, 0x85, 0xcd, 0x3c, + 0x8d, 0x50, 0xb8, 0xc4, 0xc0, 0x87, 0x8b, 0x87, 0x63, 0x79, 0x75, 0x46, + 0x69, 0xcd, 0x9b, 0x90, 0x7c, 0x77, 0xbe, 0x5f, 0xdc, 0x61, 0x45, 0x3c, + 0x96, 0xb2, 0x8b, 0x42, 0x8c, 0x78, 0x98, 0x95, 0xb4, 0xd1, 0x66, 0xce, + 0xc0, 0xc3, 0xc4, 0xc0, 0x87, 0xbc, 0x5d, 0xa6, 0x57, 0xc8, 0x4b, 0xca, + 0x8d, 0x32, 0x5d, 0xcc, 0xa8, 0x3f, 0x85, 0x46, 0x57, 0x96, 0xc7, 0xbd, + 0x3e, 0x6e, 0xd2, 0xa6, 0x3e, 0x4c, 0x69, 0xb3, 0xc1, 0xc8, 0xb4, 0xa7, + 0xa0, 0x47, 0x7f, 0x7c, 0x80, 0xb9, 0xd1, 0x46, 0x41, 0x44, 0x76, 0x9e, + 0x5c, 0x6f, 0x4d, 0xaa, 0xb0, 0x81, 0xa3, 0x9b, 0x92, 0xa0, 0xd0, 0xc5, + 0xae, 0x82, 0xaf, 0x9e, 0x94, 0x6f, 0xa3, 0xb9, 0x80, 0x36, 0xae, 0x7a, + 0xcd, 0xc5, 0xbc, 0x75, 0xab, 0x56, 0x4b, 0x87, 0x4c, 0x6f, 0x53, 0x40, + 0x4d, 0x52, 0x8f, 0x67, 0x62, 0x72, 0xb2, 0x5d, 0x82, 0xbd, 0x5a, 0x5b, + 0x7b, 0xbd, 0x7c, 0xa4, 0x6e, 0xc2, 0x48, 0x44, 0xc7, 0xb1, 0x59, 0x9b, + 0x7f, 0x74, 0xc6, 0x36, 0x75, 0x7f, 0x67, 0x49, 0x7d, 0xdb, 0x78, 0x36, + 0x77, 0x5f, 0xb9, 0x48, 0xc5, 0xa8, 0xb2, 0x85, 0x4a, 0xa0, 0xaa, 0x63, + 0xa3, 0xc2, 0x47, 0x90, 0x73, 0x6e, 0x8c, 0x90, 0x8c, 0x94, 0xc2, 0x6d, + 
0xbe, 0x3f, 0xba, 0x45, 0x60, 0x97, 0x92, 0xb9, 0x38, 0x85, 0x35, 0x4b, + 0x61, 0x45, 0xc5, 0xcc, 0x74, 0x82, 0x97, 0x79, 0xa7, 0xca, 0xcb, 0x7f, + 0x39, 0x9f, 0xc2, 0x9c, 0x44, 0x9c, 0x82, 0xae, 0x63, 0x5d, 0x49, 0x99, + 0x87, 0x56, 0xb9, 0x93, 0x3f, 0xa7, 0x95, 0xad, 0x59, 0xda, 0x64, 0x53, + 0x3a, 0xa2, 0x88, 0x54, 0x7d, 0x95, 0x83, 0x62, 0x77, 0xcc, 0x73, 0x9a, + 0x72, 0x8e, 0x76, 0x3c, 0x3e, 0x8a, 0xaf, 0x8d, 0x3c, 0xd4, 0x63, 0xca, + 0x43, 0x35, 0x84, 0xb0, 0x9e, 0x30, 0xa2, 0xab, 0x95, 0x55, 0x52, 0x51, + 0xd4, 0x56, 0x80, 0xa8, 0x27, 0x6a, 0xcb, 0x39, 0x82, 0x73, 0x38, 0x77, + 0x9b, 0x57, 0x4e, 0xd3, 0x96, 0x9d, 0xaf, 0xd5, 0x73, 0xa8, 0x6b, 0x8e, + 0x52, 0x6a, 0x50, 0x6e, 0x72, 0x9b, 0xde, 0x8b, 0x4a, 0x60, 0x5e, 0x62, + 0xb9, 0xce, 0x47, 0xb5, 0x46, 0x51, 0xb0, 0x4f, 0xb0, 0xc8, 0xb0, 0xa6, + 0x60, 0xa1, 0x6f, 0xc0, 0x45, 0x2f, 0xc9, 0xc7, 0x6f, 0xd7, 0xac, 0x5f, + 0xca, 0x5b, 0x38, 0xbf, 0xce, 0x36, 0xcd, 0xb6, 0x58, 0x55, 0x48, 0x40, + 0x6a, 0x59, 0x4a, 0x33, 0xa3, 0x58, 0xaf, 0x5e, 0x9a, 0xa6, 0x89, 0xa3, + 0x7d, 0xc9, 0x68, 0x73, 0x70, 0xa4, 0xb5, 0xa8, 0x97, 0x6b, 0xa6, 0x91, + 0x4e, 0xa7, 0x92, 0xb6, 0xc8, 0x98, 0x65, 0x57, 0x4d, 0xc6, 0x8f, 0x7b, + 0x92, 0x41, 0x8e, 0x8e, 0x5c, 0x8b, 0x8d, 0xc3, 0x3a, 0x89, 0x7d, 0x83, + 0xc1, 0x4c, 0x33, 0x7a, 0x9a, 0xbf, 0xcc, 0x65, 0x3e, 0x50, 0x91, 0xa0, + 0x75, 0xbc, 0x60, 0x5c, 0xbe, 0xba, 0x44, 0x88, 0x4a, 0xa5, 0x9c, 0x4f, + 0x52, 0xd0, 0x7c, 0xab, 0x3b, 0x8d, 0x38, 0xc7, 0x7a, 0x72, 0x42, 0x49, + 0xab, 0x97, 0xb3, 0x79, 0x32, 0x31, 0x66, 0x99, 0x75, 0xb1, 0x93, 0xad, + 0x9c, 0xac, 0x73, 0xc9, 0x70, 0x78, 0xb1, 0x5b, 0x76, 0x3a, 0x46, 0xb9, + 0x53, 0x35, 0xaa, 0xbb, 0xba, 0x2d, 0xc5, 0x5e, 0xae, 0x78, 0x77, 0x33, + 0x75, 0x60, 0x40, 0x34, 0xae, 0x30, 0x27, 0x24, 0x50, 0xc2, 0x4b, 0x4e, + 0x95, 0x7b, 0x49, 0xc8, 0xa8, 0x62, 0x81, 0x81, 0x56, 0x74, 0xb7, 0x4b, + 0xcc, 0x5f, 0xba, 0x31, 0xa4, 0x7f, 0x74, 0x86, 0xc8, 0x91, 0x76, 0x55, + 0xaa, 0x9a, 0x97, 0x9c, 0x60, 0x46, 0x4f, 0xb5, 0x87, 0x92, 0xb3, 0x78, + 0x65, 0x4b, 0x46, 0xca, 0xab, 0xb2, 0x59, 0x61, 0x89, 0x98, 0x56, 0xc1, + 0x72, 0x8b, 0xd1, 0x31, 0x46, 0x8d, 0x8e, 0x70, 0xc3, 0x76, 0x7d, 0x8e, + 0x4e, 0xc1, 0x64, 0x4d, 0x94, 0x69, 0xa8, 0x56, 0x64, 0x8e, 0xc6, 0x69, + 0xc3, 0x66, 0x68, 0xb5, 0x43, 0xcd, 0x6e, 0x5e, 0x63, 0xb2, 0x3e, 0x84, + 0x96, 0x45, 0xc6, 0xa4, 0x79, 0x4e, 0xa1, 0xbe, 0xa4, 0xa5, 0x58, 0xa2, + 0x60, 0x8e, 0x31, 0x3e, 0xa6, 0x9c, 0x59, 0x64, 0x79, 0x82, 0xd1, 0xae, + 0x6d, 0xac, 0x3d, 0x46, 0x5b, 0x46, 0x81, 0x52, 0x77, 0x95, 0x87, 0x62, + 0x80, 0x8c, 0x98, 0x47, 0x98, 0xba, 0x8d, 0x69, 0x3b, 0xb0, 0xcc, 0x6b, + 0x94, 0xae, 0x35, 0x48, 0x3d, 0x5b, 0x72, 0x51, 0xa7, 0x36, 0x8a, 0xca, + 0x48, 0xc1, 0xb8, 0xa7, 0xa6, 0xb3, 0x58, 0xb1, 0xc7, 0x3b, 0x60, 0xac, + 0xae, 0x41, 0x5f, 0xd8, 0xa5, 0x3a, 0x4e, 0x94, 0xa2, 0xa3, 0xd4, 0x8f, + 0x64, 0x88, 0x9e, 0xc3, 0x43, 0x8b, 0xcf, 0x77, 0x98, 0x5c, 0xa0, 0xae, + 0xc9, 0x61, 0x97, 0x80, 0xd2, 0x4c, 0x6f, 0xa2, 0xb9, 0x90, 0x82, 0xd0, + 0x59, 0x76, 0xd2, 0x83, 0x74, 0x2f, 0x51, 0x9d, 0xcd, 0x4e, 0x73, 0x82, + 0xba, 0x4f, 0xac, 0xb7, 0xc7, 0x7b, 0x9d, 0xda, 0x60, 0x6f, 0x74, 0x84, + 0xb7, 0xc4, 0x58, 0x67, 0xbc, 0xbd, 0x67, 0xbd, 0x93, 0x6b, 0x3e, 0x5a, + 0x9c, 0x7d, 0xd2, 0xa9, 0x2a, 0x4e, 0x58, 0x9e, 0x79, 0x38, 0xb7, 0x55, + 0x97, 0x3e, 0xad, 0x5f, 0x48, 0xa4, 0x7d, 0xbd, 0x39, 0xd1, 0x6c, 0xa4, + 0x56, 0x92, 0x87, 0x36, 0x95, 0x84, 0x63, 0x3b, 0x60, 0x3f, 0x8e, 0xbe, + 0xca, 0x26, 0x74, 0xd3, 0xc7, 0x95, 0x8e, 0x84, 0x46, 0x8e, 0x8c, 0x7a, + 0x79, 0xb4, 0x64, 0x59, 0x28, 0x5e, 0x98, 0xc1, 0x97, 0x46, 0xb8, 0x57, + 
0x8e, 0x60, 0x59, 0xc4, 0x79, 0x74, 0x64, 0x3f, 0x54, 0xd0, 0x6b, 0x99, + 0x92, 0xb4, 0x9c, 0xb1, 0x36, 0x62, 0x65, 0x8d, 0xa8, 0x45, 0x51, 0x4e, + 0x60, 0x6d, 0x64, 0x6c, 0xc0, 0x44, 0x8e, 0xb9, 0x8a, 0x97, 0x74, 0x91, + 0x8a, 0x4d, 0xa1, 0xbb, 0x50, 0xaa, 0x54, 0x8c, 0x7b, 0x39, 0x99, 0xcc, + 0x5d, 0x4d, 0x77, 0xc4, 0xb0, 0xc5, 0xa2, 0x63, 0xdf, 0x95, 0xb6, 0x88, + 0x7b, 0x54, 0x38, 0xb7, 0x48, 0xcd, 0xbc, 0x7e, 0x94, 0xa5, 0x89, 0xb0, + 0x79, 0x55, 0x88, 0x51, 0x46, 0xc9, 0x3f, 0x3c, 0x76, 0x80, 0x83, 0x90, + 0x64, 0x8b, 0xae, 0x90, 0x56, 0x52, 0x4a, 0x66, 0x58, 0x40, 0xab, 0x69, + 0x50, 0x44, 0xaa, 0x92, 0x86, 0xc4, 0x50, 0xa1, 0x59, 0x79, 0x52, 0x63, + 0xa7, 0xa4, 0xa1, 0x67, 0x32, 0xc0, 0x72, 0xc9, 0x9a, 0xc9, 0x43, 0x52, + 0xab, 0x37, 0x9b, 0x78, 0x78, 0xc8, 0xb7, 0xc4, 0x69, 0x45, 0xaa, 0xb6, + 0x68, 0x80, 0xc8, 0xb5, 0x9c, 0x85, 0x6b, 0xd8, 0x9e, 0x65, 0x55, 0x91, + 0xa9, 0x60, 0x6a, 0x3e, 0xad, 0xc2, 0x4c, 0x6c, 0x7b, 0x6c, 0x92, 0x84, + 0x5f, 0x49, 0x4f, 0x60, 0x5c, 0xd4, 0x87, 0x3e, 0x73, 0x36, 0x31, 0x4f, + 0xc8, 0xa2, 0xd3, 0x8b, 0xb4, 0xbe, 0xbd, 0x54, 0xc9, 0xa1, 0x60, 0x50, + 0x5f, 0x91, 0xbc, 0xa3, 0x8e, 0x79, 0x93, 0x79, 0x3f, 0x8c, 0x67, 0x86, + 0x53, 0x6e, 0x67, 0x6d, 0x70, 0x54, 0x64, 0x5f, 0x9a, 0x64, 0x31, 0xa7, + 0xae, 0xb1, 0xac, 0x43, 0xaf, 0x58, 0x45, 0x7e, 0x61, 0xb2, 0xbc, 0xc7, + 0xbd, 0x86, 0xbc, 0xcb, 0xb2, 0x90, 0xba, 0x83, 0x81, 0x5a, 0x52, 0x49, + 0x3b, 0xc0, 0x39, 0x39, 0x55, 0x8c, 0x7f, 0xb1, 0xd5, 0x81, 0x5f, 0x56, + 0x71, 0x34, 0x46, 0x36, 0x3b, 0x45, 0x7e, 0xaf, 0x9b, 0x4f, 0x60, 0x75, + 0x7a, 0x3b, 0x81, 0xb1, 0x63, 0x44, 0x7c, 0x3d, 0x5c, 0x9d, 0xcd, 0xae, + 0x99, 0x6d, 0x6a, 0x6e, 0x9f, 0xaf, 0x66, 0x5c, 0x81, 0x3e, 0xc3, 0x49, + 0x87, 0x73, 0x3c, 0xca, 0x41, 0xa9, 0x82, 0xb1, 0x74, 0x7c, 0x59, 0x66, + 0x98, 0x80, 0x73, 0xb8, 0x6c, 0xb9, 0x61, 0x53, 0x7d, 0x46, 0x6d, 0x5d, + 0xb4, 0xba, 0x4b, 0xc9, 0xd5, 0x71, 0xd6, 0xc6, 0x92, 0x7b, 0xc6, 0x97, + 0x5b, 0x8c, 0x38, 0x6d, 0x79, 0x42, 0x5d, 0xa6, 0x26, 0x4b, 0x35, 0x9f, + 0x3e, 0x85, 0x5c, 0xb7, 0xa6, 0x88, 0x63, 0x38, 0x4a, 0x70, 0x5e, 0xb9, + 0xcb, 0x9a, 0xbb, 0x7e, 0x88, 0xcd, 0xdc, 0xa9, 0x82, 0xd6, 0xb3, 0x55, + 0xa9, 0x4d, 0x49, 0xcf, 0xd8, 0xae, 0xd4, 0x2f, 0x6b, 0x40, 0x56, 0x95, + 0x87, 0xc0, 0xcc, 0x35, 0x7c, 0x88, 0xcf, 0x81, 0x5f, 0xbd, 0x7f, 0x59, + 0xac, 0xb6, 0xa7, 0xb9, 0x7b, 0x35, 0xbc, 0xb9, 0x87, 0x91, 0x90, 0x58, + 0x7d, 0xa0, 0x83, 0xb0, 0xd0, 0x9b, 0xd2, 0x78, 0x96, 0xc4, 0xa7, 0xba, + 0x8d, 0x44, 0x6c, 0x4b, 0xd6, 0x64, 0x5c, 0x5b, 0xb2, 0xca, 0x87, 0x44, + 0x3d, 0xae, 0xac, 0xc8, 0x99, 0xb7, 0xa8, 0x96, 0x39, 0xd1, 0xaf, 0x86, + 0xb8, 0x6a, 0x54, 0x6e, 0x76, 0x72, 0xaf, 0x5e, 0x41, 0xbc, 0x60, 0x3b, + 0x6f, 0xbc, 0x4a, 0xb2, 0xd1, 0xc6, 0x64, 0xa4, 0x8a, 0xd3, 0x41, 0xb1, + 0xa1, 0x67, 0x96, 0xa3, 0x83, 0x61, 0xab, 0xaf, 0x99, 0xaf, 0x99, 0x92, + 0xce, 0x99, 0xa9, 0x4c, 0x5a, 0x7f, 0xa8, 0x5d, 0xa8, 0xc3, 0xa6, 0x3a, + 0xcf, 0x75, 0x44, 0x70, 0x54, 0x92, 0x8a, 0xc9, 0x5e, 0x4e, 0x69, 0x46, + 0x4b, 0x69, 0x3f, 0x43, 0x94, 0x87, 0x88, 0xa2, 0x9e, 0x7b, 0x3f, 0x9d, + 0xb7, 0xa5, 0x4d, 0x9a, 0x53, 0x41, 0x90, 0x5d, 0x97, 0x8d, 0x46, 0x97, + 0x4b, 0x3d, 0xa6, 0xb4, 0xdb, 0x68, 0x8e, 0x9a, 0x8c, 0xc0, 0x77, 0x2b, + 0x7e, 0x9c, 0x63, 0x97, 0x70, 0x8d, 0x56, 0xa1, 0x38, 0xa8, 0x6f, 0x64, + 0x5a, 0x59, 0x7b, 0x56, 0xb6, 0xb7, 0x98, 0x73, 0x8e, 0xc1, 0xba, 0x53, + 0xb8, 0x5c, 0x87, 0x9e, 0xb1, 0x94, 0x2f, 0x9b, 0xa1, 0x3d, 0x9c, 0x65, + 0xd0, 0xc3, 0x71, 0x6c, 0xa4, 0xd9, 0xc5, 0xac, 0x90, 0x61, 0x8c, 0xa1, + 0x6c, 0xa6, 0x92, 0x45, 0x3f, 0x64, 0xc2, 0x9a, 0x93, 0x70, 0xaf, 0x3e, + 
0x4f, 0xc4, 0x4a, 0x74, 0x57, 0x43, 0x76, 0xc5, 0x27, 0x63, 0x47, 0x63, + 0x73, 0x65, 0xcd, 0xc7, 0xd9, 0x41, 0x99, 0x7b, 0xa9, 0x5e, 0x4c, 0xaa, + 0x69, 0xbc, 0xd6, 0x73, 0xbe, 0x5e, 0x7d, 0x7d, 0x65, 0x8a, 0x6e, 0x49, + 0xab, 0x80, 0x64, 0xb5, 0x98, 0xa2, 0x64, 0x79, 0x4e, 0x60, 0x80, 0x31, + 0xc9, 0xe1, 0xc4, 0xa4, 0xb9, 0xcf, 0xa2, 0x44, 0x7b, 0x57, 0x8c, 0x46, + 0x52, 0x3f, 0x4c, 0xa1, 0x58, 0xa8, 0x9e, 0x34, 0x4e, 0xc9, 0xd0, 0x42, + 0x63, 0xad, 0xb8, 0x42, 0x42, 0x7f, 0xb0, 0xbe, 0x69, 0x9c, 0xb9, 0xb4, + 0xb2, 0xa6, 0x76, 0x70, 0x76, 0x8e, 0xa9, 0xc1, 0x38, 0x45, 0x64, 0xb6, + 0x5b, 0x2e, 0x91, 0xb3, 0x45, 0x8e, 0x56, 0x56, 0x51, 0x3d, 0xa4, 0x62, + 0x41, 0x54, 0x34, 0x3d, 0x67, 0xbc, 0xa0, 0x5e, 0xc8, 0x9b, 0x58, 0x38, + 0x5b, 0x73, 0x5c, 0x62, 0x59, 0x78, 0x50, 0x8d, 0xa2, 0xd4, 0x70, 0x74, + 0x8a, 0xbb, 0x4a, 0x63, 0x4d, 0xcc, 0xbe, 0xa4, 0xbc, 0x7b, 0xad, 0xc2, + 0xb7, 0x8c, 0xba, 0xb7, 0x88, 0x45, 0x3f, 0x68, 0x61, 0xc5, 0xb7, 0xbe, + 0xac, 0xb6, 0x97, 0x3b, 0xad, 0x49, 0x7d, 0x9b, 0xa0, 0x74, 0x9a, 0xdd, + 0x79, 0xc3, 0xc1, 0x99, 0x3a, 0x8c, 0x7c, 0x4f, 0x63, 0x8c, 0x5c, 0xa1, + 0x7c, 0x58, 0xb6, 0x7e, 0x70, 0xaa, 0x79, 0x91, 0xbd, 0x9b, 0xac, 0xab, + 0x8e, 0x55, 0xc0, 0x79, 0xc5, 0xa4, 0xa6, 0x78, 0x6e, 0x93, 0x54, 0xc2, + 0x92, 0x4c, 0x8a, 0x9b, 0xb4, 0xd3, 0xbc, 0x56, 0x4f, 0x6b, 0xc7, 0xc1, + 0xa9, 0x78, 0x44, 0xc6, 0x42, 0x47, 0x83, 0x2f, 0x81, 0x98, 0x41, 0x85, + 0x78, 0x8d, 0xc6, 0x43, 0x51, 0xb4, 0xa8, 0x8b, 0x53, 0x3d, 0x9d, 0x9c, + 0x9f, 0x7c, 0x8f, 0x95, 0xb3, 0x41, 0x3d, 0x9d, 0x83, 0x89, 0x87, 0xc2, + 0x47, 0x8b, 0x29, 0xc8, 0xb8, 0x61, 0x8b, 0xbc, 0x87, 0x6a, 0xc6, 0xad, + 0x95, 0x61, 0xb2, 0xc7, 0x95, 0x80, 0xd0, 0x5d, 0xb8, 0xc1, 0xb8, 0xbd, + 0xc9, 0x9f, 0x46, 0x3f, 0x3e, 0xa3, 0x6e, 0xb9, 0x3d, 0x60, 0x3d, 0xc5, + 0x79, 0x73, 0x9e, 0xb7, 0xb7, 0x5c, 0x5e, 0x8c, 0xbf, 0x69, 0x3f, 0xc8, + 0xb3, 0xb1, 0xa4, 0x46, 0x5f, 0xa3, 0xc6, 0x80, 0x65, 0x70, 0x85, 0x48, + 0xa6, 0x9a, 0x5f, 0x98, 0x64, 0xb2, 0x79, 0x43, 0xa6, 0x9c, 0x50, 0x47, + 0x8c, 0x6e, 0xc7, 0x6d, 0x7b, 0x3f, 0x40, 0xd6, 0x45, 0x93, 0x9c, 0x78, + 0x72, 0x88, 0x9d, 0x87, 0x8a, 0xc4, 0x5e, 0x5f, 0xcb, 0x36, 0x9f, 0x73, + 0x67, 0xb6, 0x95, 0xb2, 0xcb, 0xba, 0x67, 0x3a, 0xc5, 0xb7, 0x39, 0x37, + 0xd8, 0x9c, 0x66, 0xa0, 0x5a, 0x93, 0xb6, 0x55, 0x36, 0x34, 0x6c, 0x9c, + 0x7c, 0x79, 0xc8, 0x89, 0xa3, 0x5a, 0x97, 0x6a, 0x7e, 0x54, 0x5a, 0x7d, + 0xb5, 0x56, 0xc2, 0xa4, 0x60, 0xa4, 0xc8, 0x68, 0x7f, 0x52, 0xc1, 0x78, + 0x86, 0x2d, 0x4a, 0x3f, 0xca, 0x96, 0x92, 0xad, 0xab, 0x5c, 0x57, 0x9a, + 0xa0, 0xca, 0x80, 0xa5, 0xa0, 0x99, 0xab, 0x41, 0x55, 0x72, 0x34, 0x6d, + 0xae, 0x97, 0x97, 0x54, 0xc1, 0x47, 0x6c, 0x95, 0x32, 0x75, 0x8b, 0xd1, + 0xbe, 0x9f, 0x50, 0xa2, 0x98, 0x84, 0x64, 0xc9, 0xa5, 0x6d, 0x81, 0x66, + 0xa5, 0x68, 0x79, 0xd2, 0x75, 0x69, 0xb4, 0x84, 0x7e, 0x3a, 0xab, 0x7e, + 0x89, 0x7e, 0x3a, 0x4b, 0x97, 0xc0, 0x99, 0xbb, 0x90, 0x80, 0x7a, 0x75, + 0x45, 0xb7, 0x4f, 0xb2, 0x57, 0xc8, 0xc8, 0xc1, 0x34, 0x9a, 0xab, 0xac, + 0x95, 0x38, 0x6d, 0xcd, 0x84, 0x77, 0x3a, 0xb5, 0x35, 0x4d, 0xce, 0x94, + 0x97, 0xa9, 0xaa, 0x51, 0x8a, 0x69, 0x3c, 0x4d, 0x5c, 0x3e, 0x76, 0x59, + 0x44, 0xcb, 0x71, 0xb6, 0x3a, 0x70, 0x5a, 0x94, 0xb6, 0x71, 0x8b, 0x67, + 0x60, 0xcf, 0x9f, 0xbe, 0x9b, 0x5e, 0xd0, 0x8e, 0x89, 0x83, 0x7e, 0xa8, + 0x6f, 0xaa, 0x2e, 0x87, 0x38, 0x4a, 0xa1, 0xd8, 0xbe, 0x59, 0x5e, 0x47, + 0x78, 0x46, 0x86, 0x51, 0x79, 0xd4, 0x6c, 0xaf, 0xa0, 0x77, 0x91, 0x32, + 0xb0, 0x65, 0x70, 0xc3, 0x6e, 0x53, 0xbf, 0x40, 0x6b, 0x99, 0x8c, 0x7c, + 0x91, 0x34, 0xb5, 0x71, 0x98, 0x3c, 0xcc, 0x4a, 0x74, 0x45, 0x49, 0x4b, + 
0x5c, 0xa7, 0x78, 0xb5, 0x6c, 0x30, 0xce, 0x76, 0xcc, 0x54, 0x57, 0x68, + 0x97, 0x85, 0xbb, 0x94, 0x67, 0xc3, 0x8d, 0xb1, 0xb8, 0x9c, 0x57, 0x50, + 0xa4, 0x79, 0x6f, 0x88, 0xc1, 0x32, 0x5d, 0x66, 0x98, 0x83, 0xcb, 0x7e, + 0x9b, 0xcd, 0xa9, 0x87, 0x80, 0x66, 0x9c, 0x5b, 0xbc, 0x4f, 0xa9, 0x65, + 0xcd, 0x76, 0x88, 0x36, 0x4c, 0xbf, 0x5c, 0x9a, 0xcd, 0x5e, 0xa5, 0xc5, + 0xac, 0x4a, 0xc4, 0x3e, 0x35, 0x3e, 0xa4, 0xc9, 0xc7, 0xb6, 0x64, 0xae, + 0x3d, 0xc2, 0xca, 0x74, 0x4a, 0xc7, 0x8d, 0xa0, 0x57, 0x37, 0x99, 0x46, + 0xcb, 0xa9, 0xc6, 0x56, 0x4b, 0xb5, 0x96, 0x92, 0x51, 0x36, 0x75, 0x35, + 0xa2, 0x68, 0x49, 0x84, 0x3f, 0x56, 0x5b, 0x81, 0x3f, 0x98, 0x9c, 0x3e, + 0x76, 0xa3, 0x6d, 0x60, 0x36, 0x93, 0xbd, 0x3d, 0xa6, 0xaf, 0x2e, 0xc9, + 0x4a, 0x71, 0x97, 0x7b, 0xa9, 0x73, 0x99, 0x39, 0x8f, 0x47, 0x79, 0x7a, + 0x83, 0x55, 0x78, 0xc1, 0xba, 0x89, 0x7c, 0x9f, 0x93, 0x49, 0x89, 0x3d, + 0x3f, 0x8f, 0x32, 0x44, 0x4e, 0x89, 0x8d, 0x55, 0x79, 0x82, 0x78, 0x8b, + 0x43, 0x58, 0xbe, 0x4d, 0x5f, 0x69, 0x4c, 0xa2, 0x75, 0x78, 0x7b, 0x6a, + 0x40, 0x61, 0xa5, 0xcd, 0x61, 0xd7, 0x65, 0x7b, 0x87, 0x7d, 0xa0, 0x6e, + 0xba, 0xa4, 0x60, 0x3b, 0x41, 0x34, 0xd2, 0x53, 0x82, 0x9d, 0xa3, 0x7a, + 0xa8, 0x76, 0x59, 0x96, 0x77, 0x86, 0x53, 0xa3, 0x66, 0xc9, 0x8a, 0x48, + 0x81, 0x94, 0xb9, 0x8b, 0x66, 0x50, 0x80, 0xb2, 0x95, 0x38, 0xc8, 0x68, + 0xd1, 0x74, 0xc4, 0xad, 0x77, 0x58, 0xb7, 0xc9, 0xbe, 0x7c, 0x3d, 0x6c, + 0x91, 0x95, 0xce, 0x81, 0xaf, 0x9e, 0x65, 0x7b, 0x3c, 0x94, 0x9c, 0xc9, + 0x66, 0xa8, 0x6f, 0x4b, 0xd1, 0xbf, 0x4c, 0x78, 0x8d, 0xb1, 0x43, 0x6c, + 0xb0, 0x40, 0x3d, 0xbc, 0xa3, 0xb8, 0x9a, 0xcb, 0x99, 0x96, 0x37, 0x3c, + 0xa5, 0x85, 0xa5, 0x9b, 0x36, 0xba, 0x39, 0xaa, 0xc3, 0x6c, 0x79, 0x8c, + 0xb9, 0xcf, 0x5c, 0x79, 0x55, 0x50, 0x7b, 0x63, 0xcb, 0x56, 0x5a, 0x59, + 0x66, 0x7f, 0x9b, 0x4d, 0xcb, 0x7e, 0x7b, 0xb4, 0xaf, 0x44, 0xb9, 0x59, + 0x7e, 0x69, 0xad, 0xc3, 0x5e, 0x4d, 0x86, 0x44, 0xa5, 0xa2, 0x7b, 0x63, + 0x4f, 0x52, 0x7b, 0x5b, 0x6b, 0x98, 0x73, 0x57, 0x4c, 0x89, 0xac, 0xb0, + 0x3c, 0x56, 0x51, 0x76, 0xc9, 0x99, 0x62, 0xc7, 0x83, 0x88, 0xae, 0x3b, + 0x5a, 0xc4, 0x77, 0x96, 0x83, 0xb8, 0x63, 0xc7, 0x3a, 0xc5, 0x67, 0xcc, + 0x64, 0x4d, 0x86, 0x51, 0x3b, 0x69, 0xa5, 0xcf, 0x73, 0x6a, 0xd2, 0xd3, + 0x93, 0xcb, 0x8e, 0xb8, 0x9d, 0x67, 0x83, 0x6a, 0x8c, 0x43, 0xa6, 0xb0, + 0xbf, 0x3f, 0x4d, 0x59, 0xd3, 0x8f, 0xc1, 0x7c, 0x86, 0xcc, 0x72, 0x79, + 0x6f, 0x74, 0x78, 0x74, 0x88, 0x77, 0x42, 0xc8, 0x6b, 0xa8, 0x3c, 0x4d, + 0x6e, 0x79, 0x8a, 0x96, 0x4f, 0x8f, 0x3f, 0x82, 0x6c, 0x2d, 0xbe, 0x79, + 0x44, 0x4c, 0x85, 0x6a, 0x75, 0x9e, 0x7c, 0xb5, 0x60, 0xb6, 0x3f, 0x93, + 0x92, 0xc5, 0xa0, 0x8a, 0x59, 0xbe, 0x81, 0xa8, 0x3c, 0xa1, 0x69, 0xb5, + 0xa7, 0x71, 0x95, 0xc2, 0x63, 0x3e, 0x39, 0x31, 0xb9, 0x5a, 0xa1, 0x76, + 0xca, 0xd4, 0x5d, 0x46, 0xaa, 0xd0, 0x75, 0x3b, 0x66, 0x50, 0x3d, 0xb4, + 0xb9, 0x35, 0xbf, 0x8a, 0x6a, 0xac, 0xa0, 0x7c, 0x3d, 0x3f, 0x34, 0x73, + 0x82, 0xac, 0x35, 0x4d, 0xcf, 0xb3, 0xc0, 0x52, 0x94, 0xce, 0x88, 0x7e, + 0xc0, 0x4c, 0xc7, 0x55, 0xb2, 0x36, 0x8c, 0x42, 0x9d, 0x5f, 0x48, 0xba, + 0x52, 0x88, 0x3f, 0x81, 0x81, 0xb7, 0x83, 0x76, 0x94, 0x47, 0x4a, 0x5e, + 0x81, 0x6b, 0x8e, 0x74, 0x83, 0xce, 0xb9, 0xc0, 0xcf, 0x98, 0x9d, 0x30, + 0x89, 0xb9, 0x91, 0x54, 0x89, 0xa4, 0xbc, 0x83, 0x8b, 0x51, 0xad, 0x87, + 0x89, 0x36, 0x44, 0x5e, 0xc3, 0xa1, 0x8b, 0x9f, 0xa6, 0xb2, 0xd1, 0x54, + 0x95, 0x8d, 0x96, 0x45, 0x98, 0xb5, 0xce, 0xbc, 0xbc, 0x91, 0x84, 0xb7, + 0xbc, 0x82, 0xcb, 0xc1, 0xba, 0x38, 0xc2, 0x40, 0xc6, 0x9c, 0x48, 0xb3, + 0xa6, 0xb2, 0x67, 0xa8, 0xbf, 0x63, 0xcb, 0xbf, 0x46, 0xcb, 0x8e, 0xc5, + 
0x32, 0x92, 0xb1, 0x8a, 0x31, 0xc5, 0x8f, 0x75, 0x4f, 0x58, 0x8b, 0x6f, + 0x35, 0x37, 0xcb, 0xbf, 0x51, 0x6d, 0x7d, 0x8e, 0x49, 0xba, 0x5b, 0xa8, + 0xa3, 0xce, 0xd7, 0x3a, 0x6c, 0x36, 0x91, 0x8a, 0x5c, 0x37, 0xc9, 0x78, + 0x45, 0xc2, 0x84, 0xa2, 0xbf, 0x59, 0x9a, 0x37, 0xa3, 0x5d, 0xc4, 0x4f, + 0x9b, 0x5b, 0x7c, 0xbc, 0x81, 0xb2, 0x87, 0xcc, 0x7f, 0xb7, 0x89, 0xc4, + 0x57, 0x81, 0x97, 0x41, 0x91, 0x65, 0xcd, 0x7e, 0x9b, 0x66, 0x32, 0x89, + 0x4b, 0x70, 0x41, 0xad, 0x8f, 0xb9, 0x55, 0x96, 0x74, 0x2e, 0x80, 0x53, + 0x87, 0xbe, 0x82, 0xc7, 0xad, 0x54, 0x8f, 0x63, 0xaa, 0x58, 0xc5, 0x8c, + 0x6b, 0x49, 0xab, 0x31, 0x47, 0x37, 0x61, 0x78, 0xc1, 0x81, 0xac, 0x89, + 0x96, 0x3f, 0xa9, 0x9c, 0xad, 0x99, 0x8f, 0x48, 0x44, 0x62, 0x49, 0x7e, + 0x7e, 0xa0, 0x57, 0xcd, 0x67, 0x36, 0xad, 0x87, 0xa0, 0xa9, 0x48, 0xbb, + 0x42, 0xa6, 0xb8, 0xa7, 0x9d, 0x8d, 0x99, 0xc4, 0xac, 0x93, 0x93, 0x90, + 0x31, 0x44, 0x8c, 0xb1, 0xaf, 0x77, 0x53, 0x5d, 0x5c, 0xbb, 0x86, 0xbf, + 0xb5, 0x36, 0x32, 0xba, 0x94, 0x58, 0x50, 0xb7, 0xc8, 0x3d, 0x49, 0x4a, + 0x8b, 0x64, 0x73, 0x8b, 0xcd, 0x3c, 0x57, 0x66, 0x83, 0xc1, 0x50, 0x48, + 0x90, 0xc4, 0x5f, 0x30, 0xbd, 0xae, 0x8b, 0xb2, 0x3b, 0x5c, 0x92, 0xaa, + 0x81, 0xaa, 0x6d, 0x62, 0x41, 0x5f, 0xb4, 0x96, 0x3f, 0xc7, 0xa6, 0x75, + 0x49, 0x62, 0x66, 0x8a, 0xca, 0x83, 0x39, 0x55, 0xcb, 0xa6, 0x7f, 0x5b, + 0x87, 0xb2, 0x7a, 0x64, 0x80, 0x95, 0xbf, 0xc4, 0x62, 0xd0, 0xb2, 0x83, + 0x88, 0xbc, 0x80, 0xb5, 0x41, 0x34, 0xa8, 0x54, 0xbc, 0x55, 0x50, 0x54, + 0x77, 0x39, 0x94, 0x8b, 0x9c, 0x73, 0xb8, 0x44, 0x91, 0x66, 0x6d, 0xac, + 0x6f, 0x8c, 0x57, 0x7e, 0x60, 0x4a, 0x4a, 0xab, 0xb6, 0x97, 0xb5, 0xb8, + 0x78, 0x7d, 0x64, 0x81, 0x9f, 0xa3, 0xc8, 0xbc, 0x88, 0x57, 0x66, 0x8f, + 0x37, 0xa2, 0xc7, 0x4f, 0xd3, 0x72, 0x94, 0xa5, 0xaa, 0xad, 0xb7, 0xbb, + 0x3e, 0xbf, 0x70, 0x59, 0x70, 0x3b, 0x80, 0xb4, 0x3f, 0x78, 0xcd, 0x86, + 0x8d, 0x40, 0x79, 0xa6, 0x85, 0xc6, 0x45, 0x5c, 0x3b, 0x55, 0x59, 0x3d, + 0x78, 0xa6, 0x9e, 0x9d, 0x9c, 0x59, 0x5a, 0x6f, 0xb5, 0x90, 0x9b, 0x56, + 0x84, 0x46, 0x94, 0xb1, 0x69, 0x60, 0x8d, 0x74, 0x77, 0x89, 0x3e, 0x4e, + 0x79, 0x4d, 0x5c, 0xaf, 0x6a, 0xaf, 0xb7, 0xaa, 0x3c, 0x92, 0xc0, 0x73, + 0x55, 0x7f, 0x7e, 0x9c, 0x4e, 0x39, 0xd0, 0x6d, 0xab, 0x6b, 0xbf, 0x54, + 0x88, 0xcc, 0x7a, 0x5a, 0xa7, 0x6c, 0x59, 0x87, 0x35, 0x9d, 0xc4, 0x56, + 0x69, 0xb5, 0x3b, 0xc6, 0xb1, 0xad, 0x96, 0x32, 0xbf, 0x56, 0x65, 0xae, + 0x74, 0xc9, 0x59, 0x51, 0x93, 0x7f, 0xba, 0xc1, 0xc7, 0x8d, 0x7a, 0x58, + 0x79, 0x81, 0xbe, 0x56, 0xac, 0x90, 0x51, 0x42, 0x96, 0x7b, 0x6c, 0x72, + 0x9c, 0x7e, 0xc5, 0x5e, 0x93, 0x37, 0x47, 0xd0, 0x97, 0x5e, 0x45, 0x62, + 0x7e, 0xc0, 0x72, 0xa8, 0xcb, 0x3a, 0x4a, 0x82, 0x89, 0x77, 0xb2, 0xba, + 0xce, 0x74, 0xa4, 0x6d, 0x4b, 0xb3, 0x70, 0x88, 0x86, 0x99, 0xc3, 0x45, + 0x32, 0xbe, 0xaf, 0x78, 0x67, 0xa4, 0x62, 0x96, 0xac, 0x7a, 0x6b, 0xad, + 0x33, 0x5b, 0x97, 0x75, 0x87, 0x95, 0x8b, 0x8c, 0x37, 0x58, 0xaa, 0x50, + 0x90, 0x81, 0x73, 0xb2, 0xbd, 0xb0, 0x3e, 0x54, 0x62, 0x37, 0x6d, 0xc1, + 0x84, 0x58, 0x8c, 0x59, 0xc6, 0x72, 0x8f, 0xa5, 0xca, 0xab, 0xb6, 0xb2, + 0x43, 0x3a, 0xa5, 0x36, 0x63, 0x93, 0x3e, 0x96, 0x9c, 0x5c, 0x77, 0x6b, + 0x68, 0x7f, 0x64, 0x8a, 0x5a, 0xa1, 0xc5, 0xb1, 0xb6, 0xc3, 0x64, 0x9a, + 0x54, 0x4b, 0x6b, 0xb0, 0xbb, 0x7c, 0xa4, 0xa5, 0x44, 0xae, 0x99, 0xa3, + 0x4b, 0x65, 0xa6, 0x60, 0x52, 0x8c, 0x93, 0x4e, 0x39, 0x7b, 0x45, 0xb9, + 0x7c, 0xcc, 0x82, 0xb2, 0x73, 0x62, 0x86, 0x2e, 0x4b, 0xcf, 0x45, 0x53, + 0x8e, 0x9c, 0xa3, 0x82, 0x7e, 0x99, 0xb0, 0x32, 0x70, 0xa1, 0x84, 0x5e, + 0x51, 0x3e, 0xc2, 0x34, 0x6b, 0x44, 0x79, 0x55, 0x48, 0xb4, 0x6a, 0xa8, + 
0x79, 0x53, 0x55, 0x5c, 0x2e, 0x77, 0x52, 0x66, 0x4d, 0x83, 0x98, 0x75, + 0x3d, 0x2e, 0x64, 0x47, 0x3d, 0x7e, 0x59, 0x43, 0x79, 0x38, 0xa3, 0x9c, + 0x3b, 0xb8, 0xb1, 0x7d, 0xa1, 0x96, 0x86, 0x88, 0x93, 0x6d, 0x41, 0x58, + 0x9e, 0x89, 0x6e, 0xc3, 0x4a, 0xc4, 0x71, 0x64, 0xa5, 0x6d, 0x92, 0x36, + 0x9a, 0xcf, 0x3b, 0x33, 0x7a, 0x94, 0x6c, 0xb2, 0x96, 0x33, 0x7e, 0x34, + 0x57, 0xad, 0x77, 0x77, 0x43, 0x3d, 0x45, 0x6d, 0x93, 0x9c, 0x67, 0x4b, + 0x77, 0x53, 0xb6, 0x69, 0xbf, 0x8e, 0xbe, 0x9f, 0xab, 0xbd, 0xb1, 0x9e, + 0x70, 0x75, 0x57, 0x48, 0x4a, 0xd0, 0x7e, 0xa2, 0x6a, 0x40, 0x3b, 0x84, + 0x43, 0x3c, 0xc8, 0x61, 0x89, 0xa5, 0xb0, 0xb0, 0x86, 0x6a, 0x79, 0xb1, + 0x89, 0x87, 0xb3, 0xce, 0x84, 0x97, 0x6a, 0xb5, 0xb7, 0x47, 0x70, 0xc8, + 0x30, 0xaa, 0x56, 0x86, 0xb5, 0x90, 0x47, 0x79, 0xd0, 0xb8, 0x69, 0xb3, + 0xa7, 0x6e, 0x92, 0x46, 0x9c, 0x35, 0x52, 0x5f, 0xcc, 0x37, 0x58, 0x46, + 0x6a, 0x6d, 0xc1, 0x80, 0x52, 0xbe, 0x76, 0x89, 0xc9, 0x5b, 0x3f, 0x6a, + 0x76, 0x2d, 0x5e, 0x5d, 0xc2, 0x39, 0x8e, 0xd0, 0x98, 0xae, 0x9b, 0x44, + 0xc6, 0xce, 0xca, 0x81, 0x90, 0xc4, 0x7d, 0xa6, 0x6a, 0x48, 0x9e, 0x67, + 0x71, 0x49, 0xac, 0x55, 0x3c, 0x70, 0x8e, 0xa3, 0x53, 0x57, 0x56, 0x67, + 0x70, 0x57, 0x71, 0xa6, 0xd0, 0xbd, 0x4c, 0x9e, 0xcb, 0x3d, 0x5e, 0x93, + 0xce, 0xc2, 0x81, 0x3d, 0x36, 0x87, 0xca, 0x31, 0x69, 0x31, 0xc3, 0xa7, + 0xb9, 0x83, 0x74, 0xb5, 0xb2, 0xb1, 0x3e, 0xa6, 0x8e, 0x58, 0x30, 0x61, + 0xad, 0x67, 0x92, 0x4a, 0x8b, 0xd0, 0x9c, 0xad, 0x63, 0x2f, 0x40, 0x46, + 0x70, 0xc0, 0xce, 0x2f, 0xb4, 0x9d, 0x52, 0x6a, 0x6c, 0xb1, 0x64, 0x81, + 0x9b, 0x8d, 0x43, 0xb4, 0x6c, 0xcd, 0x96, 0x73, 0x94, 0xb6, 0x72, 0x37, + 0x72, 0x59, 0xbb, 0xa9, 0x70, 0xbc, 0x87, 0xab, 0x5b, 0xb4, 0x39, 0xa2, + 0x9a, 0x8c, 0x33, 0xbb, 0x2e, 0xab, 0x49, 0x65, 0x34, 0x56, 0x73, 0xc6, + 0xb7, 0xaf, 0x7f, 0x33, 0x67, 0x3f, 0x4b, 0x4a, 0xb7, 0x66, 0x80, 0xc2, + 0x96, 0x59, 0x5b, 0xa4, 0x7f, 0x7b, 0x79, 0x6b, 0xc7, 0xad, 0x7b, 0x53, + 0xad, 0x91, 0x3d, 0xc0, 0x8b, 0xb6, 0xb9, 0x84, 0x9e, 0xa5, 0x8b, 0xc4, + 0x2e, 0xaa, 0xc1, 0x9c, 0x9c, 0x45, 0x76, 0xb7, 0x87, 0x4f, 0xb3, 0x99, + 0x9b, 0x4e, 0x43, 0x9f, 0xb8, 0x48, 0x67, 0x6a, 0xa9, 0x55, 0x39, 0x5c, + 0xa5, 0x68, 0x52, 0x7c, 0xcb, 0xb7, 0x79, 0xd3, 0xa2, 0x81, 0x53, 0xac, + 0x7e, 0x92, 0x86, 0xd4, 0x8a, 0x9f, 0x53, 0x61, 0x44, 0x84, 0xb3, 0x57, + 0x47, 0xc3, 0x7f, 0x85, 0xba, 0x37, 0x53, 0xb9, 0xb6, 0x52, 0x33, 0x65, + 0x2a, 0x93, 0x81, 0x72, 0x56, 0xc8, 0xc0, 0x32, 0x3a, 0x3c, 0x66, 0x42, + 0xb3, 0xcf, 0x79, 0x58, 0xb5, 0x71, 0x95, 0xd4, 0x88, 0x85, 0x65, 0xbb, + 0x40, 0x62, 0x51, 0xc6, 0x4e, 0xd2, 0x7d, 0xc7, 0xaa, 0x88, 0x91, 0x8a, + 0x8c, 0x53, 0x90, 0xb3, 0x4e, 0x77, 0x5e, 0xc0, 0x6e, 0xce, 0xa7, 0x87, + 0x9b, 0x3c, 0x4b, 0xa6, 0x4a, 0x51, 0x6b, 0x55, 0x31, 0x3f, 0x60, 0x91, + 0xa4, 0x4d, 0x7c, 0xcc, 0xbc, 0x3b, 0xb7, 0x6d, 0xaa, 0x5a, 0x6a, 0x3f, + 0x37, 0x8c, 0x66, 0xc2, 0x60, 0xb4, 0x9e, 0xad, 0x67, 0xaf, 0x90, 0xc1, + 0x41, 0xc7, 0x3a, 0xd6, 0xa9, 0x93, 0x80, 0x81, 0x76, 0xa0, 0xa4, 0xc0, + 0x80, 0xa0, 0x87, 0xa1, 0x49, 0x7b, 0x74, 0x41, 0xa9, 0x9d, 0x64, 0x4e, + 0xbf, 0x6d, 0x9a, 0x38, 0xb1, 0xca, 0x87, 0x78, 0x78, 0xaa, 0xa0, 0x85, + 0xa1, 0xce, 0x6b, 0x3c, 0xd2, 0x6b, 0xbb, 0x42, 0xbd, 0xa6, 0x68, 0xb4, + 0xaa, 0x83, 0xb2, 0x47, 0x77, 0xad, 0x29, 0xce, 0xa3, 0xa2, 0x5d, 0xbb, + 0x39, 0x8d, 0x3c, 0x52, 0x82, 0xb3, 0x61, 0x5d, 0xd4, 0x72, 0x97, 0x6c, + 0xa1, 0x8e, 0x6a, 0x4d, 0x9e, 0xbe, 0x81, 0xb5, 0xc9, 0x60, 0xbb, 0xbc, + 0xac, 0xbc, 0x69, 0x48, 0x7e, 0x82, 0xc7, 0x5d, 0x8e, 0xc5, 0xa1, 0xb6, + 0x4a, 0xce, 0x69, 0xb7, 0xab, 0xab, 0x65, 0x89, 0x8e, 0x51, 0xb0, 0x3a, + 
0xcd, 0x53, 0x65, 0xca, 0x70, 0xd4, 0xab, 0xc6, 0xcb, 0x89, 0x74, 0x33, + 0x8b, 0xac, 0xbf, 0x91, 0x5b, 0x30, 0x64, 0x7d, 0xbd, 0xbf, 0x4e, 0x6c, + 0xba, 0x9b, 0xc2, 0x96, 0xc3, 0x50, 0x36, 0x9b, 0xb6, 0xca, 0xdd, 0x8c, + 0x36, 0x3f, 0x7c, 0x92, 0xbd, 0xcd, 0x56, 0x81, 0x7f, 0x6a, 0xba, 0xb0, + 0x49, 0x38, 0xd2, 0xca, 0x60, 0x58, 0x82, 0x68, 0x85, 0x99, 0xaa, 0xc9, + 0x60, 0xc6, 0x9f, 0x44, 0x99, 0xa4, 0x37, 0xc6, 0x9e, 0xbc, 0xc3, 0x5f, + 0xc0, 0x3e, 0xb0, 0x3d, 0x45, 0x58, 0x81, 0x4c, 0x9b, 0xbc, 0x7a, 0xa0, + 0xce, 0x62, 0xb6, 0x7e, 0xcc, 0x64, 0xb4, 0xa4, 0xb8, 0x5e, 0x3f, 0xaf, + 0x7c, 0x98, 0xc5, 0x53, 0x7e, 0x84, 0x45, 0xc2, 0x95, 0x8e, 0xc6, 0x55, + 0x86, 0xc4, 0x3e, 0x3a, 0x52, 0x8a, 0x60, 0xb7, 0xb9, 0x3e, 0x63, 0x7a, + 0x4e, 0x9b, 0x60, 0x72, 0x4f, 0x59, 0xc6, 0xb0, 0x83, 0xb1, 0x6b, 0xae, + 0x63, 0x69, 0x9d, 0x46, 0x4f, 0x6b, 0x71, 0x3e, 0xb3, 0x3b, 0xa1, 0x72, + 0x5b, 0xcb, 0xd9, 0xaf, 0x6f, 0x60, 0x69, 0x31, 0x91, 0xae, 0x57, 0x46, + 0x56, 0x44, 0x7f, 0x97, 0x43, 0x47, 0x5d, 0x74, 0xce, 0x54, 0xc0, 0x54, + 0x4d, 0x4a, 0x3c, 0xc8, 0x47, 0x5d, 0x55, 0x71, 0x4a, 0xc8, 0xcf, 0x3e, + 0x53, 0x9b, 0x34, 0x5c, 0xa3, 0x2e, 0x83, 0x47, 0x96, 0x9f, 0x50, 0x5e, + 0xc4, 0x50, 0x7d, 0x8a, 0x40, 0xa8, 0x81, 0x34, 0x42, 0xab, 0xae, 0xa9, + 0x5e, 0x51, 0xcf, 0x74, 0x51, 0xb1, 0x83, 0x45, 0x2d, 0xb7, 0xb0, 0x34, + 0x6c, 0xbe, 0xa2, 0xbb, 0xd2, 0x5e, 0xcd, 0x88, 0xbb, 0x53, 0x99, 0xcb, + 0xca, 0xb2, 0xc3, 0x89, 0xb7, 0xc2, 0x40, 0xca, 0xca, 0x69, 0xa1, 0x95, + 0xac, 0xa2, 0xc0, 0xd0, 0x84, 0xa2, 0x42, 0x4f, 0xa1, 0x48, 0x5c, 0x72, + 0x4c, 0xc8, 0x7d, 0x50, 0x67, 0xaa, 0x87, 0xd5, 0x3f, 0x48, 0x57, 0xc8, + 0x4d, 0x7c, 0x5f, 0xb0, 0x32, 0x9a, 0xd5, 0x41, 0xab, 0x94, 0x3b, 0xb5, + 0x7e, 0x84, 0x45, 0x99, 0xb8, 0x44, 0xa9, 0x66, 0x54, 0x50, 0x9b, 0xab, + 0x7c, 0xb0, 0x71, 0xb6, 0x57, 0x50, 0x4e, 0x8e, 0x44, 0x37, 0x87, 0xb5, + 0xa9, 0xb2, 0x7e, 0x7d, 0x78, 0x92, 0x61, 0x76, 0x87, 0x34, 0x5d, 0xd2, + 0x63, 0x96, 0x26, 0xb9, 0xbd, 0x8f, 0x30, 0x7a, 0x7a, 0xba, 0x9b, 0x6b, + 0x5a, 0xc2, 0x57, 0x8d, 0x6a, 0x6b, 0xa3, 0x68, 0x4d, 0x4e, 0x4a, 0x68, + 0x8d, 0x31, 0xac, 0x3f, 0x7c, 0x46, 0x89, 0x2a, 0xb6, 0xa2, 0xc7, 0x9a, + 0x75, 0x3d, 0xc6, 0x70, 0xd4, 0x6a, 0x85, 0xac, 0x36, 0xac, 0x68, 0x68, + 0x65, 0x80, 0x3b, 0x6c, 0xd0, 0x88, 0x91, 0xb6, 0x82, 0x32, 0x8e, 0x5b, + 0x96, 0x4c, 0x59, 0xa5, 0x80, 0x3e, 0x9e, 0xc9, 0x45, 0x5c, 0x96, 0x5d, + 0x58, 0x63, 0x66, 0x34, 0xbb, 0x8f, 0x85, 0x55, 0x2b, 0x87, 0x71, 0x83, + 0x67, 0x5b, 0x65, 0x9a, 0xa3, 0xa4, 0xa4, 0x36, 0x40, 0x68, 0x9f, 0x3f, + 0xa3, 0xb1, 0x78, 0x54, 0x5a, 0x53, 0x9a, 0xb5, 0x43, 0xbd, 0x47, 0x52, + 0x80, 0xbf, 0x42, 0xb4, 0x57, 0x85, 0xb8, 0xc7, 0x72, 0x9c, 0x32, 0x8e, + 0x67, 0x83, 0x40, 0x3c, 0x5c, 0xb2, 0xbf, 0xa6, 0xb1, 0x52, 0x89, 0x7d, + 0xbd, 0xc8, 0x51, 0x83, 0x52, 0x8e, 0xd4, 0x7b, 0x7f, 0xaa, 0x8a, 0xbe, + 0x9d, 0x89, 0xa9, 0x82, 0x34, 0xb7, 0x6b, 0x9b, 0xad, 0x34, 0x28, 0x76, + 0x37, 0x91, 0xcc, 0xaf, 0xb5, 0x9e, 0x79, 0x97, 0xc1, 0xaa, 0x97, 0x46, + 0x96, 0x8e, 0x8a, 0x8c, 0x36, 0xcd, 0xb7, 0xd6, 0xb6, 0x57, 0x4d, 0x75, + 0xa2, 0xa8, 0xcf, 0xae, 0x82, 0xb5, 0xcc, 0xa6, 0xb4, 0x3a, 0x3f, 0x66, + 0x85, 0x3c, 0x79, 0x83, 0xce, 0xb1, 0x6b, 0x6b, 0x87, 0xa4, 0xad, 0x9b, + 0xa6, 0x84, 0x8b, 0xbb, 0x74, 0x53, 0x66, 0x6b, 0x83, 0xbd, 0xd3, 0x49, + 0xbe, 0x92, 0x53, 0x66, 0x3e, 0x8f, 0x3e, 0x7b, 0xbd, 0x62, 0x8d, 0x63, + 0x38, 0x99, 0x5b, 0x86, 0xb9, 0xdd, 0x9c, 0x9f, 0x97, 0x91, 0x4c, 0x9e, + 0xbf, 0x71, 0x61, 0x58, 0x55, 0x88, 0xa8, 0x30, 0xb8, 0xc0, 0x5e, 0x60, + 0x6e, 0x53, 0xb8, 0x78, 0x6c, 0x58, 0x56, 0x7e, 0x3e, 0xd5, 0xa8, 0xc9, + 
0x7b, 0xbd, 0x7d, 0xab, 0xcf, 0x4a, 0xae, 0x95, 0xd5, 0xac, 0x3e, 0xc6, + 0xa3, 0x59, 0x5e, 0x4d, 0xc9, 0x66, 0xb7, 0x92, 0x4c, 0x67, 0xae, 0x40, + 0x44, 0xb4, 0x81, 0xd1, 0xac, 0xb0, 0x49, 0x47, 0x8f, 0xbd, 0xbe, 0xb3, + 0x83, 0xad, 0x34, 0x3b, 0x8d, 0xc1, 0x40, 0x5d, 0xad, 0xce, 0x6b, 0x7c, + 0x63, 0x85, 0x3e, 0x3e, 0x65, 0x3f, 0x38, 0x79, 0x91, 0xb7, 0x77, 0x52, + 0x74, 0xa5, 0xa8, 0xb0, 0x5d, 0x95, 0x8d, 0x46, 0x6e, 0x22, 0x8f, 0x3a, + 0x9a, 0x9a, 0x5c, 0x69, 0xcd, 0x65, 0x99, 0x79, 0x5f, 0x41, 0xa2, 0x22, + 0x4a, 0x93, 0x7b, 0xa6, 0xa4, 0x61, 0x79, 0x79, 0xd7, 0x53, 0xcb, 0x95, + 0x63, 0x99, 0x74, 0xcd, 0x33, 0xc3, 0x38, 0x3e, 0xcb, 0x9e, 0xac, 0x7c, + 0x9c, 0x4b, 0xaf, 0xa7, 0x46, 0x39, 0x78, 0x47, 0x80, 0x43, 0x97, 0xd7, + 0xa0, 0x93, 0xb9, 0xc0, 0xb3, 0xd2, 0x7c, 0xca, 0x79, 0xd1, 0x59, 0xb4, + 0x3d, 0xa7, 0xa4, 0x49, 0xa1, 0xc9, 0x7c, 0xb4, 0x69, 0x64, 0xc9, 0x7e, + 0x3e, 0x7c, 0xaa, 0x99, 0x96, 0xad, 0x73, 0x3d, 0x3b, 0x66, 0x50, 0xae, + 0x4b, 0x5e, 0x5b, 0x42, 0x58, 0x84, 0xd2, 0x4d, 0x76, 0xb7, 0xc7, 0xaf, + 0x85, 0x87, 0x5c, 0x49, 0xc3, 0x40, 0x9e, 0x4e, 0x95, 0xc6, 0x85, 0x57, + 0x9e, 0x81, 0x7e, 0x9a, 0x4f, 0xb4, 0x3a, 0xb6, 0x60, 0x6e, 0xc7, 0x4a, + 0xca, 0x59, 0x81, 0xb7, 0x39, 0x85, 0x71, 0x38, 0x6f, 0x62, 0xb8, 0x72, + 0x33, 0x6f, 0x6f, 0x3d, 0xdb, 0xc1, 0x4b, 0xc3, 0x75, 0xca, 0x84, 0x2b, + 0x8b, 0x61, 0x87, 0x88, 0xae, 0xd3, 0x46, 0x33, 0x6d, 0x72, 0x40, 0x7d, + 0x84, 0x89, 0xbf, 0x39, 0x82, 0x5b, 0x98, 0xaf, 0x4d, 0x4b, 0x4e, 0xc5, + 0x66, 0x84, 0x58, 0x3c, 0x70, 0x46, 0x59, 0x89, 0xc7, 0x82, 0xc5, 0xdb, + 0x26, 0x50, 0x6c, 0x7f, 0x27, 0x96, 0xc4, 0x47, 0xac, 0x8e, 0x5c, 0xad, + 0x60, 0x49, 0x79, 0x63, 0x5f, 0xc2, 0x65, 0xb9, 0x35, 0x8d, 0xb5, 0x8b, + 0xc4, 0xa0, 0x84, 0x93, 0xaa, 0x85, 0x7c, 0x8e, 0x4b, 0x40, 0x5f, 0x8d, + 0x9e, 0x3e, 0xba, 0xba, 0x39, 0xab, 0x9c, 0x68, 0xa2, 0x94, 0x5d, 0x2f, + 0xb1, 0x49, 0x47, 0xd0, 0xab, 0x34, 0xd1, 0x8a, 0x8a, 0x36, 0x4d, 0x54, + 0x45, 0x99, 0x7c, 0x90, 0x94, 0x7c, 0xab, 0xc2, 0xba, 0x5f, 0x33, 0x48, + 0xc6, 0x4f, 0x88, 0x91, 0xcb, 0x4e, 0xc7, 0x73, 0xe3, 0x85, 0xa7, 0x71, + 0x68, 0xb7, 0x8c, 0x42, 0x75, 0xca, 0x4c, 0xc3, 0x86, 0xad, 0x47, 0xd9, + 0x92, 0x4f, 0x8a, 0x51, 0x77, 0x9f, 0xbd, 0xc9, 0x87, 0x6a, 0x2e, 0x3f, + 0xa7, 0x4c, 0x45, 0x9d, 0x4e, 0xa5, 0xc1, 0x53, 0xc2, 0xcd, 0x92, 0x5a, + 0xca, 0x59, 0x67, 0x63, 0xc8, 0x84, 0x86, 0xb6, 0x54, 0x9a, 0x7f, 0x67, + 0x6a, 0xb4, 0xac, 0x93, 0x4a, 0x62, 0x46, 0x35, 0x98, 0xbf, 0x8d, 0x74, + 0xad, 0x7e, 0x90, 0x33, 0xc4, 0x6f, 0x3f, 0x62, 0x4a, 0x86, 0xb1, 0x6d, + 0x98, 0x64, 0xa8, 0x38, 0x9a, 0x98, 0x47, 0xac, 0x41, 0xa5, 0x5e, 0xa6, + 0xa3, 0xbd, 0x3e, 0x35, 0x31, 0x54, 0x52, 0xa3, 0x7b, 0x71, 0xcf, 0x39, + 0xc5, 0x7c, 0xc3, 0xc2, 0x52, 0x6e, 0x4b, 0xd1, 0x62, 0x9c, 0xc3, 0xa0, + 0x85, 0x97, 0x9f, 0x8e, 0x3d, 0x22, 0xb1, 0x99, 0x55, 0x9f, 0x51, 0xbe, + 0x95, 0x8e, 0xdc, 0xc7, 0xb3, 0x7c, 0xd1, 0x6c, 0x30, 0xc1, 0x7a, 0xc7, + 0x86, 0xa5, 0x47, 0x77, 0x74, 0xa5, 0xa8, 0x80, 0xe2, 0x95, 0xa6, 0x56, + 0xb6, 0x98, 0xad, 0x49, 0xdd, 0xb0, 0x2b, 0x36, 0x95, 0x6a, 0x6b, 0xc4, + 0x2e, 0x6e, 0xb7, 0x67, 0x7d, 0xcf, 0x5d, 0x66, 0xc9, 0x72, 0x39, 0xad, + 0x9d, 0x29, 0xac, 0x9e, 0x68, 0x8d, 0x96, 0x47, 0xa0, 0x8b, 0xc6, 0xab, + 0x68, 0xaa, 0xc3, 0x24, 0x57, 0x69, 0x80, 0xc2, 0x41, 0xba, 0xd5, 0xc6, + 0x51, 0x65, 0x62, 0x64, 0x90, 0xbc, 0xb0, 0xb7, 0x6e, 0x46, 0x92, 0x2f, + 0x74, 0x85, 0x9b, 0xbf, 0x6e, 0x65, 0x6d, 0xc2, 0xa0, 0xc2, 0xab, 0x49, + 0xb1, 0x8b, 0x91, 0xc1, 0x92, 0x45, 0x9e, 0x88, 0x66, 0xc4, 0x6e, 0xd9, + 0xc6, 0x40, 0x2b, 0x2b, 0xcc, 0xa7, 0x57, 0x88, 0xc9, 0xde, 0xc5, 0x75, + 
0xc5, 0x7d, 0x95, 0xc6, 0x84, 0xad, 0x92, 0x68, 0x8b, 0x52, 0x9d, 0x76, + 0xc3, 0xbf, 0x68, 0x88, 0x7a, 0xd1, 0x67, 0x44, 0xa4, 0x9c, 0x48, 0x9e, + 0x31, 0xc3, 0xad, 0x81, 0x91, 0x6d, 0x54, 0x58, 0xbe, 0x5e, 0xb2, 0xc5, + 0x6d, 0xb7, 0x91, 0x8b, 0x4f, 0x53, 0x9d, 0x85, 0x6e, 0x49, 0xa9, 0xb3, + 0x52, 0x39, 0xb2, 0x7c, 0x8b, 0x4b, 0x49, 0xb0, 0x6e, 0x50, 0xd3, 0xac, + 0xa2, 0xb7, 0xae, 0x6e, 0x4a, 0x69, 0xa2, 0xc3, 0xbf, 0x38, 0x80, 0xa6, + 0xc4, 0x80, 0x33, 0x71, 0xa6, 0x79, 0xc6, 0x34, 0x64, 0x9a, 0x4e, 0x9a, + 0x7f, 0x77, 0x3b, 0xad, 0xba, 0xcd, 0xc4, 0x27, 0x71, 0x95, 0x7f, 0x86, + 0xb6, 0xb6, 0xb1, 0x20, 0xd4, 0x87, 0x9b, 0xad, 0xa9, 0x73, 0xbb, 0x95, + 0x51, 0x65, 0x69, 0xb3, 0xd2, 0x67, 0x33, 0x3c, 0x80, 0xd7, 0x9a, 0x3f, + 0xd2, 0x72, 0x4c, 0x48, 0x53, 0x40, 0x5a, 0xab, 0x43, 0xdc, 0x5a, 0xac, + 0x37, 0x8f, 0x45, 0x65, 0xc8, 0xc8, 0x9e, 0x80, 0x5d, 0x69, 0x4a, 0x87, + 0xaf, 0xec, 0x61, 0xb7, 0x30, 0xbf, 0xab, 0x7b, 0x37, 0xa6, 0x81, 0x95, + 0xd3, 0xc4, 0x3c, 0x46, 0x9b, 0x4e, 0x87, 0x5c, 0xc3, 0x7b, 0x35, 0xbc, + 0x59, 0xa8, 0xb4, 0x72, 0xa2, 0xa6, 0xa4, 0x40, 0xa6, 0x5c, 0x2d, 0xb1, + 0x8d, 0x61, 0xa0, 0x8f, 0x5a, 0x3e, 0xda, 0xad, 0xc2, 0x8a, 0xdd, 0x80, + 0x9d, 0x6f, 0x56, 0x9a, 0x80, 0x98, 0x3e, 0xb1, 0x9f, 0x4b, 0x54, 0x2c, + 0x9a, 0x85, 0x68, 0xd0, 0x9b, 0x42, 0x68, 0xde, 0x84, 0x5f, 0x92, 0xc2, + 0xc0, 0xbf, 0xd0, 0xd3, 0x3e, 0x83, 0xaf, 0xab, 0x5a, 0x93, 0x40, 0x28, + 0xab, 0x88, 0xc0, 0x38, 0x79, 0x9f, 0x5f, 0x6f, 0xc7, 0xa4, 0xcc, 0x37, + 0x51, 0xbe, 0xaa, 0xb7, 0xcf, 0x52, 0x7f, 0x4d, 0x44, 0x29, 0xa8, 0x91, + 0xb7, 0x7f, 0x82, 0xd3, 0xb3, 0x84, 0x7b, 0x59, 0xa2, 0xc1, 0x95, 0xb5, + 0x6c, 0x68, 0x62, 0x59, 0xc4, 0xba, 0x42, 0x5b, 0x31, 0x93, 0x37, 0x59, + 0x4c, 0xc5, 0x78, 0x32, 0xa9, 0xb6, 0xd5, 0x4d, 0x41, 0x51, 0x38, 0x8d, + 0xb1, 0x61, 0x96, 0x76, 0xc4, 0xb2, 0x47, 0x25, 0xcf, 0x7f, 0x9c, 0x7e, + 0xa4, 0xb2, 0xc8, 0x91, 0x90, 0xc6, 0xd6, 0xbf, 0x9c, 0x25, 0xb6, 0x70, + 0x9d, 0x6b, 0xca, 0x6d, 0xb4, 0x8e, 0x7a, 0xbc, 0xca, 0xad, 0x41, 0x51, + 0x6a, 0x88, 0x65, 0x67, 0x5e, 0xa3, 0xcc, 0xa5, 0x73, 0xa0, 0x31, 0x29, + 0x57, 0x77, 0x8b, 0x4d, 0xbc, 0x83, 0x4d, 0x7b, 0x32, 0x70, 0xa2, 0x5c, + 0x43, 0x36, 0x48, 0x2b, 0x6d, 0x83, 0x77, 0x72, 0xb5, 0x41, 0x7b, 0x88, + 0xa4, 0x80, 0x98, 0x5b, 0xae, 0x3f, 0xbe, 0x97, 0x9c, 0xb4, 0xde, 0x6d, + 0x8e, 0x7a, 0x3f, 0xd2, 0xc3, 0xa5, 0x36, 0xbe, 0x3d, 0x76, 0x6a, 0xd3, + 0xba, 0x4d, 0x9f, 0x75, 0x76, 0xc7, 0xc6, 0x4f, 0x76, 0x4f, 0xa2, 0x4b, + 0x7f, 0xba, 0x35, 0x87, 0x51, 0xc2, 0x47, 0x32, 0xba, 0xd6, 0x98, 0x54, + 0x92, 0xe9, 0x87, 0x9f, 0x6a, 0x46, 0x6e, 0x4c, 0xba, 0x4a, 0x7e, 0xbf, + 0x29, 0xc1, 0x8a, 0x71, 0x4a, 0x6a, 0xc4, 0x60, 0x3a, 0x7a, 0x96, 0x88, + 0x83, 0x8f, 0xb0, 0x9f, 0x3f, 0x5d, 0x6e, 0x85, 0x34, 0x3c, 0x57, 0x73, + 0xbf, 0x7d, 0xb3, 0x59, 0x9b, 0x5b, 0x96, 0xbe, 0xbc, 0x84, 0xa5, 0x70, + 0xb1, 0x93, 0x73, 0x6f, 0x4b, 0x9d, 0x54, 0xb4, 0xa5, 0x43, 0x7b, 0x7a, + 0xa7, 0x52, 0x70, 0x7f, 0x8c, 0xd7, 0xd6, 0xc7, 0xbe, 0xa9, 0xa6, 0x80, + 0x81, 0x6f, 0x2c, 0x48, 0xcd, 0x61, 0xcb, 0xa1, 0x89, 0x5b, 0x49, 0xb6, + 0x42, 0xa0, 0x65, 0x57, 0xa4, 0xd3, 0x8e, 0x9d, 0xa9, 0xb5, 0x5f, 0x9a, + 0x4e, 0xa4, 0x6f, 0x6c, 0xa0, 0x6a, 0x54, 0xd6, 0xb3, 0x63, 0x75, 0xaa, + 0xc8, 0xc8, 0xd0, 0x90, 0x9b, 0xaa, 0xb9, 0xa6, 0x76, 0xc9, 0xb5, 0x59, + 0x84, 0x9e, 0xc3, 0x9d, 0x93, 0x9b, 0xb4, 0x93, 0x4f, 0x65, 0x99, 0x49, + 0x32, 0x44, 0x96, 0x38, 0x43, 0x8a, 0x55, 0xc0, 0xd2, 0xb1, 0xc9, 0x82, + 0x85, 0x9c, 0xa0, 0x61, 0x8d, 0xb4, 0x7b, 0xa2, 0x95, 0xae, 0x8f, 0x66, + 0x44, 0xd2, 0x7b, 0xa9, 0x6f, 0xa8, 0xb3, 0x7e, 0x76, 0x4c, 0x4f, 0x55, + 
0xa9, 0x3a, 0x38, 0x60, 0xcc, 0x2c, 0x62, 0x39, 0xb9, 0xc0, 0x62, 0x4e, + 0xb2, 0xd0, 0x77, 0xc2, 0x83, 0x94, 0x72, 0xc2, 0x63, 0x79, 0xc4, 0xb4, + 0x83, 0x9e, 0xd8, 0x8e, 0x4b, 0xa1, 0x44, 0x60, 0x32, 0xca, 0xcc, 0x63, + 0xc1, 0xc4, 0xd2, 0x68, 0x67, 0x95, 0xb7, 0x78, 0x92, 0xce, 0xb9, 0x31, + 0xc1, 0xbc, 0x3e, 0x9d, 0x44, 0x94, 0x55, 0x6c, 0x67, 0x6c, 0x8f, 0x3e, + 0x4a, 0x2d, 0x35, 0x6f, 0x5c, 0x80, 0xa0, 0x9a, 0x57, 0xa3, 0x29, 0x7f, + 0x75, 0x4b, 0xb1, 0x20, 0x3b, 0x8d, 0x5f, 0x69, 0x77, 0x46, 0x53, 0xbf, + 0x51, 0x9a, 0xd0, 0x55, 0x62, 0x30, 0x76, 0x94, 0x9b, 0x6f, 0x92, 0xa9, + 0x96, 0xc5, 0x40, 0xbe, 0x2d, 0xbd, 0xcd, 0x58, 0xbb, 0x91, 0xb3, 0xd7, + 0x4f, 0xa8, 0xae, 0x7e, 0x97, 0x27, 0xca, 0x9e, 0xa7, 0x62, 0x6d, 0xc9, + 0xbd, 0x52, 0x8f, 0xb0, 0x90, 0x44, 0x45, 0x87, 0xb0, 0x4e, 0x71, 0x44, + 0xe2, 0xbd, 0xa0, 0xac, 0xbc, 0xb2, 0x59, 0x70, 0xaa, 0x35, 0x46, 0x82, + 0xb2, 0xc3, 0x68, 0x5c, 0xd0, 0x3a, 0x6d, 0x85, 0xd8, 0xcc, 0x5f, 0x50, + 0x54, 0x9a, 0xcf, 0x52, 0x41, 0x73, 0xd2, 0xc7, 0xc6, 0xe1, 0xb9, 0xd7, + 0x68, 0x80, 0xc1, 0x5b, 0xb4, 0x94, 0x49, 0xbd, 0x8c, 0xb4, 0x7a, 0x74, + 0x70, 0x5c, 0xc4, 0x49, 0x7e, 0x67, 0x7c, 0x61, 0x8a, 0x97, 0x56, 0xa1, + 0x3c, 0x67, 0xbd, 0xb9, 0x89, 0x55, 0x36, 0xcf, 0x42, 0x90, 0xbf, 0xbc, + 0xaa, 0x86, 0xaf, 0x5b, 0x97, 0x97, 0x59, 0x45, 0x8a, 0x88, 0x4b, 0x32, + 0xb0, 0x32, 0x48, 0x7f, 0x30, 0xa9, 0x95, 0x8f, 0x56, 0xbe, 0xc8, 0x9a, + 0x51, 0xc4, 0xc5, 0x98, 0xc9, 0xcb, 0xbb, 0x3a, 0x36, 0x7f, 0xbe, 0xd2, + 0x35, 0x5a, 0xab, 0x89, 0x8a, 0xb0, 0x80, 0x89, 0xb7, 0x8d, 0x79, 0xae, + 0x3a, 0xa0, 0xc7, 0x66, 0xcf, 0x7d, 0x42, 0xa3, 0x38, 0x79, 0x7e, 0x44, + 0x9e, 0xa6, 0x7e, 0x98, 0x89, 0xbc, 0xb6, 0x8c, 0xb7, 0xad, 0x92, 0x43, + 0xd3, 0x95, 0xc1, 0x5b, 0x51, 0x9c, 0x36, 0x45, 0xb7, 0xd0, 0x85, 0x88, + 0x9b, 0x5f, 0x55, 0xc5, 0x6a, 0x64, 0x42, 0x76, 0xbf, 0x30, 0xc0, 0x83, + 0x6c, 0xac, 0xb7, 0x7a, 0x7b, 0x84, 0x4d, 0xcf, 0x51, 0xd0, 0x3e, 0x9c, + 0xa4, 0x3a, 0x69, 0x22, 0xb1, 0x5b, 0xc3, 0x42, 0x9c, 0xd6, 0xac, 0xc2, + 0xa6, 0xd2, 0xb1, 0xaa, 0xd3, 0xad, 0x66, 0x86, 0xcc, 0x75, 0x7c, 0xd3, + 0x73, 0x77, 0x6a, 0x91, 0xac, 0x73, 0x77, 0xd6, 0xbc, 0xdc, 0x62, 0x44, + 0xa2, 0xdb, 0x41, 0xda, 0x60, 0xa8, 0xb5, 0x3f, 0x8c, 0x9f, 0xc4, 0x52, + 0x9a, 0xb1, 0x3f, 0xba, 0x4f, 0x7d, 0xc3, 0x56, 0x88, 0xa0, 0x87, 0x8c, + 0x9c, 0xce, 0x65, 0x7b, 0x7e, 0xb1, 0x55, 0xa7, 0xa7, 0x4b, 0x30, 0x6f, + 0x4d, 0x8c, 0xbb, 0x9a, 0xcb, 0x96, 0x69, 0xd6, 0x95, 0x78, 0xaf, 0x8e, + 0xc8, 0x41, 0x73, 0x8a, 0xa7, 0x35, 0xc9, 0xc8, 0x9d, 0xa3, 0x7d, 0xaa, + 0xd5, 0x87, 0xa3, 0xb1, 0x5e, 0x8f, 0xb5, 0x70, 0x5e, 0x52, 0x98, 0x4f, + 0xb1, 0x3c, 0xd5, 0x94, 0xa1, 0xa3, 0xb3, 0x48, 0x8b, 0x58, 0xbc, 0x39, + 0x85, 0xa9, 0x4d, 0x91, 0x80, 0xbf, 0x38, 0xa5, 0xd2, 0x81, 0xb3, 0x4e, + 0x67, 0x7c, 0xd0, 0xaf, 0xba, 0xba, 0x64, 0x90, 0x59, 0x61, 0xc3, 0xc0, + 0x79, 0x63, 0xd8, 0x7a, 0xa3, 0xa6, 0x80, 0x55, 0x77, 0x73, 0xd5, 0x46, + 0x71, 0xc1, 0x8d, 0x38, 0xa8, 0x3c, 0x98, 0xbe, 0xbf, 0xd3, 0xb3, 0x8e, + 0x66, 0xaf, 0x46, 0x6f, 0x8d, 0x9f, 0x69, 0x6a, 0x44, 0xb5, 0x62, 0xbd, + 0x56, 0x5d, 0xa8, 0x49, 0x52, 0x74, 0xd6, 0x6f, 0xdf, 0xa4, 0xb1, 0xdf, + 0x66, 0xb2, 0x76, 0xb0, 0x82, 0xa3, 0x7f, 0x7b, 0x71, 0x6d, 0x97, 0x60, + 0x30, 0xbe, 0x67, 0xc3, 0x4c, 0x7d, 0x69, 0x91, 0x40, 0x1b, 0x5e, 0xb1, + 0x60, 0x5a, 0xc9, 0xa7, 0x5e, 0x6e, 0x85, 0x40, 0x56, 0xe3, 0xd7, 0x9b, + 0x6d, 0xcc, 0xbc, 0x4d, 0x7a, 0x92, 0xba, 0x99, 0x63, 0x48, 0xa7, 0x62, + 0x82, 0x60, 0x97, 0x44, 0x7b, 0x95, 0x7d, 0xab, 0xcb, 0x63, 0x78, 0x4b, + 0xaf, 0xb8, 0x39, 0xa9, 0x93, 0x56, 0xe2, 0xc6, 0xad, 0xaf, 0x4d, 0x2f, + 
0xae, 0xcd, 0xad, 0x30, 0x55, 0xa2, 0xb3, 0x8b, 0x2f, 0x42, 0x71, 0x50, + 0x4e, 0x88, 0xa2, 0xa2, 0x5d, 0xb8, 0x36, 0x83, 0x44, 0x60, 0x8e, 0xbc, + 0x99, 0x4a, 0x7e, 0x6c, 0x64, 0xc7, 0x52, 0x56, 0xd2, 0x62, 0x7a, 0xa4, + 0x7a, 0x6a, 0x94, 0x45, 0x59, 0xa9, 0xb3, 0x95, 0x6b, 0xaa, 0x9d, 0x89, + 0x5e, 0x3f, 0x8f, 0xc6, 0x59, 0x49, 0xc5, 0x63, 0x3c, 0xa8, 0x80, 0x67, + 0x7e, 0x72, 0xb1, 0x5e, 0xab, 0xc1, 0x64, 0x7d, 0xc1, 0x6b, 0x6e, 0x3a, + 0xa3, 0xa7, 0x3b, 0xd0, 0x7f, 0x51, 0xc5, 0x8b, 0x8f, 0xa0, 0xc1, 0x3d, + 0x81, 0x43, 0x46, 0x6c, 0x3c, 0x67, 0x86, 0x7d, 0x84, 0xc4, 0x4f, 0x89, + 0x6a, 0xc1, 0xbc, 0xd2, 0x7b, 0xd1, 0x5f, 0x6b, 0x75, 0x49, 0x57, 0x81, + 0x8f, 0x94, 0x97, 0xb8, 0xbe, 0xc5, 0x6b, 0xa1, 0x3c, 0xda, 0xd6, 0xa6, + 0xa9, 0xb9, 0xbf, 0x6e, 0x8b, 0x8e, 0x38, 0x81, 0x5c, 0x83, 0xa3, 0xbb, + 0x53, 0x5c, 0x35, 0x9e, 0x82, 0xa9, 0xd4, 0x6e, 0x8a, 0x83, 0xab, 0x5d, + 0x71, 0xb7, 0xb3, 0x63, 0xc5, 0x95, 0x87, 0xbc, 0x8a, 0x7b, 0x74, 0xdd, + 0x8c, 0x55, 0x5e, 0x7a, 0xb8, 0x93, 0x93, 0x38, 0xb4, 0xb8, 0xbc, 0x61, + 0xc6, 0x64, 0xac, 0x4e, 0x76, 0x7b, 0xb4, 0x9c, 0x67, 0xa2, 0x8c, 0x93, + 0x5d, 0x6e, 0xca, 0x88, 0xb4, 0x39, 0x91, 0x6d, 0x6c, 0x65, 0x9b, 0x56, + 0x9f, 0x68, 0x3d, 0x55, 0x34, 0x2e, 0xc9, 0x4e, 0x36, 0x8b, 0x5d, 0x6d, + 0x49, 0x3b, 0xcf, 0x75, 0xd1, 0x5e, 0x31, 0x53, 0x8b, 0x6c, 0x59, 0x81, + 0x8c, 0x6e, 0x52, 0xb1, 0xb2, 0xae, 0x7d, 0x65, 0xb8, 0x98, 0xcf, 0x62, + 0x84, 0x83, 0xc0, 0x82, 0xc3, 0x83, 0x5e, 0x7e, 0x3f, 0xaa, 0xb6, 0x74, + 0x3c, 0x95, 0xae, 0xa0, 0x5e, 0x59, 0x69, 0xa4, 0x4e, 0x57, 0x55, 0xa2, + 0xaa, 0x9b, 0x5d, 0xb0, 0x5e, 0xbd, 0x5c, 0x9c, 0x6b, 0x79, 0xbd, 0xb3, + 0xc8, 0x46, 0x75, 0x9e, 0x66, 0x87, 0xad, 0x57, 0x9d, 0x32, 0x6a, 0x39, + 0x8e, 0x60, 0x97, 0x69, 0xb0, 0x96, 0x7d, 0x96, 0x93, 0x8a, 0x7a, 0xa9, + 0x7e, 0x86, 0x3e, 0xb9, 0x8d, 0x70, 0x55, 0xa5, 0x49, 0x45, 0xc5, 0xd7, + 0x4c, 0x9c, 0xa7, 0x40, 0xd6, 0x62, 0x37, 0xc5, 0x89, 0x96, 0xad, 0xc4, + 0xac, 0x84, 0x9e, 0x91, 0x73, 0x86, 0x88, 0x65, 0xbf, 0xb0, 0xb0, 0x73, + 0x55, 0xa2, 0x73, 0x76, 0x73, 0xcb, 0xe1, 0x87, 0xdd, 0x86, 0xd6, 0x86, + 0xa2, 0x82, 0x5c, 0x67, 0x66, 0x8e, 0x73, 0x75, 0x90, 0x9e, 0xaa, 0x59, + 0x43, 0xa2, 0x6c, 0x55, 0xb7, 0x50, 0x40, 0x53, 0x5d, 0x29, 0x31, 0x53, + 0x34, 0x83, 0x88, 0x88, 0x9e, 0x5a, 0x6f, 0xb5, 0x82, 0x73, 0x40, 0x95, + 0xa8, 0x47, 0xdd, 0x67, 0x6b, 0xbe, 0xbe, 0xa6, 0xc4, 0x42, 0xcb, 0x77, + 0x9c, 0x70, 0x46, 0x7b, 0xbc, 0x50, 0x6e, 0x4b, 0x3b, 0xaa, 0x74, 0xc9, + 0xc8, 0x76, 0xc4, 0x83, 0x4b, 0xb5, 0xb1, 0x7f, 0x5e, 0x66, 0xda, 0x68, + 0x34, 0xb2, 0x84, 0xb8, 0xa0, 0x8a, 0x58, 0xbf, 0x70, 0x4a, 0x6b, 0x82, + 0x7b, 0xae, 0x97, 0x5d, 0x3f, 0xa6, 0xa4, 0xd4, 0x3c, 0xbd, 0xd9, 0x38, + 0xa7, 0x46, 0xbf, 0x5e, 0x5f, 0xb5, 0x93, 0xb1, 0x1c, 0x76, 0x6f, 0x76, + 0x83, 0x46, 0x64, 0x55, 0xd3, 0xb4, 0x9d, 0xc0, 0x71, 0x3d, 0xa2, 0x46, + 0x6d, 0x40, 0x6f, 0x62, 0xc3, 0x31, 0x82, 0x6d, 0x9e, 0xd1, 0x90, 0x54, + 0x86, 0x97, 0xb4, 0x4d, 0x6d, 0x7c, 0xa1, 0x44, 0x41, 0x68, 0xbe, 0x94, + 0xc2, 0x89, 0xbe, 0xae, 0x32, 0xa9, 0x2a, 0x6f, 0xb5, 0x37, 0x88, 0x73, + 0xa6, 0x7f, 0x36, 0x3a, 0x37, 0xc1, 0x69, 0x4b, 0x4a, 0x4f, 0x9e, 0x44, + 0x93, 0x6c, 0x57, 0x66, 0x6b, 0x8e, 0x75, 0x35, 0x52, 0x71, 0x79, 0xb0, + 0x5a, 0x9b, 0x74, 0x2f, 0x89, 0x99, 0x36, 0x5b, 0x7b, 0x77, 0x61, 0xa0, + 0xcb, 0x42, 0xa2, 0xb6, 0x3f, 0xb7, 0x27, 0x7f, 0x23, 0x4d, 0x8c, 0xaf, + 0x7e, 0x3e, 0xc7, 0xd0, 0x99, 0xbb, 0x83, 0xd1, 0xab, 0x7b, 0x46, 0xae, + 0xcb, 0x84, 0x3d, 0x47, 0xa0, 0x85, 0x6c, 0xc9, 0x5f, 0x4f, 0x44, 0x58, + 0xbf, 0xd3, 0xbd, 0xd6, 0x65, 0x7e, 0x73, 0x55, 0xa5, 0x44, 0xd3, 0xc8, + 
0x7b, 0x9f, 0x63, 0x6a, 0xc1, 0xca, 0x7a, 0xa4, 0xa3, 0xbe, 0x86, 0x79, + 0x7c, 0x92, 0xcf, 0x70, 0xa5, 0x79, 0x3e, 0x63, 0x36, 0x3f, 0x4c, 0x2e, + 0x3a, 0x71, 0x98, 0xc1, 0x9f, 0x69, 0x42, 0x4f, 0x57, 0x31, 0x5d, 0x6e, + 0xab, 0x57, 0xb3, 0xca, 0xb3, 0x54, 0x4e, 0x38, 0xc4, 0x73, 0x52, 0x6c, + 0x3c, 0x85, 0xc4, 0xc2, 0x3e, 0x8d, 0x86, 0xbd, 0x68, 0x71, 0xbe, 0xaf, + 0x8c, 0xcd, 0x83, 0x5c, 0x9b, 0xb2, 0x48, 0x3e, 0xa1, 0xc7, 0x62, 0x38, + 0x79, 0xa5, 0x74, 0x4b, 0xa3, 0x6f, 0x75, 0xd7, 0x8b, 0x79, 0x8c, 0x91, + 0x6b, 0x55, 0x41, 0x71, 0xca, 0x85, 0xcf, 0xab, 0xc8, 0xad, 0x82, 0x33, + 0x7d, 0x4b, 0x76, 0x4b, 0x76, 0x71, 0x59, 0x80, 0xc9, 0x84, 0xb8, 0x3d, + 0x90, 0xcf, 0x85, 0xa6, 0x7d, 0xbc, 0x3d, 0xa8, 0x56, 0x3e, 0xca, 0x33, + 0xc3, 0x63, 0x61, 0x8b, 0x32, 0x74, 0x59, 0x7c, 0x7c, 0xb0, 0x5c, 0x4b, + 0xc1, 0xc8, 0xb1, 0xb3, 0x32, 0xa5, 0x59, 0xa5, 0x97, 0x44, 0x66, 0xbe, + 0xa6, 0xc9, 0xab, 0xc2, 0x77, 0x27, 0x9c, 0x54, 0x7c, 0x74, 0x4e, 0x4e, + 0xb9, 0x85, 0x57, 0x27, 0x6e, 0x79, 0x2b, 0xb5, 0xa2, 0x48, 0x51, 0x77, + 0xc1, 0x5e, 0x91, 0x4c, 0xc1, 0xe7, 0x61, 0xd0, 0x66, 0xa7, 0x88, 0x43, + 0xa7, 0x6a, 0xc0, 0x92, 0x43, 0xa4, 0xaa, 0xa1, 0x83, 0x88, 0x4c, 0x80, + 0x74, 0xc7, 0xc4, 0xca, 0x64, 0xa0, 0x5a, 0x5e, 0xc9, 0xb8, 0x4d, 0x56, + 0xae, 0xb8, 0x47, 0x7d, 0xa1, 0x5a, 0x35, 0x91, 0x50, 0x9c, 0xd9, 0x48, + 0x69, 0xd9, 0xcc, 0xc6, 0xaf, 0x7d, 0x88, 0xbb, 0x70, 0x52, 0x51, 0x83, + 0xb0, 0xb6, 0x89, 0xc2, 0xa0, 0x97, 0x92, 0x80, 0x54, 0x7d, 0x3e, 0x73, + 0x40, 0x70, 0xa9, 0xab, 0xbc, 0x57, 0x66, 0xb4, 0xb9, 0x98, 0xb3, 0x37, + 0x3a, 0xcf, 0x71, 0x47, 0xca, 0x38, 0x41, 0xd6, 0xc9, 0x73, 0x68, 0x93, + 0xc8, 0x22, 0x9b, 0xc8, 0xa1, 0x99, 0x44, 0x58, 0x7c, 0xd1, 0x70, 0x8f, + 0x4b, 0x7d, 0x93, 0x9c, 0xc5, 0xc2, 0x41, 0x7f, 0xb7, 0x3e, 0xc1, 0xd0, + 0xab, 0x6a, 0x52, 0x46, 0xb8, 0x89, 0x69, 0x47, 0x31, 0xdb, 0x45, 0xae, + 0x73, 0x67, 0x9a, 0x88, 0x34, 0xa8, 0x60, 0x70, 0x57, 0x3a, 0xa4, 0x8f, + 0x50, 0x54, 0x5c, 0x60, 0xa6, 0x9f, 0x50, 0xc5, 0x52, 0x47, 0x5b, 0x41, + 0xc9, 0xa2, 0xd3, 0xbe, 0x9f, 0x5b, 0xc1, 0xcb, 0x57, 0xb8, 0x2c, 0x2c, + 0x70, 0x4a, 0x75, 0xb8, 0x50, 0x3d, 0x77, 0x4d, 0x1e, 0x8f, 0x52, 0x70, + 0x47, 0x8c, 0xc7, 0x41, 0x61, 0xa7, 0x90, 0x6c, 0x6e, 0xc3, 0x90, 0x79, + 0xba, 0x61, 0xc0, 0x32, 0xa0, 0xa3, 0x65, 0x86, 0x60, 0x68, 0xc0, 0x56, + 0xb9, 0x7d, 0x8b, 0x8f, 0x65, 0xc3, 0x73, 0x69, 0x52, 0x95, 0x9a, 0xad, + 0x5c, 0xa6, 0xd0, 0x5f, 0x78, 0x39, 0xa4, 0x81, 0x65, 0xba, 0x4a, 0x71, + 0x81, 0x38, 0xa9, 0xa3, 0xb9, 0xc6, 0x59, 0x6a, 0x38, 0x6c, 0x96, 0xcc, + 0xc4, 0x8b, 0x95, 0xcc, 0xbb, 0xaf, 0x47, 0x34, 0xb6, 0x77, 0x9a, 0xad, + 0xbb, 0x69, 0x6b, 0x9f, 0xda, 0xc6, 0x85, 0x6e, 0x3a, 0x55, 0x3e, 0x57, + 0x9c, 0x39, 0x54, 0x74, 0x44, 0xa0, 0x35, 0xbc, 0x73, 0x5a, 0xc7, 0xb2, + 0x93, 0x3a, 0x40, 0x43, 0x72, 0x47, 0xab, 0x82, 0xa9, 0xb8, 0x63, 0xbc, + 0x4d, 0x75, 0x93, 0x9b, 0x93, 0x8f, 0xa3, 0x3f, 0x6d, 0xa3, 0x90, 0x9f, + 0xd4, 0x6a, 0x64, 0x73, 0x35, 0x95, 0x61, 0xd1, 0x94, 0x74, 0xa0, 0xa2, + 0x98, 0x62, 0xbf, 0x4f, 0xa4, 0x3d, 0x53, 0x6d, 0xb2, 0x52, 0xd7, 0xb7, + 0x80, 0x85, 0xad, 0x82, 0x8d, 0x4c, 0x96, 0x79, 0x90, 0xc6, 0x6d, 0xb2, + 0x80, 0xdd, 0x79, 0x9b, 0x78, 0x2d, 0x33, 0x4a, 0x62, 0x3b, 0x69, 0x73, + 0x67, 0x80, 0x5f, 0x97, 0x86, 0x48, 0x51, 0xb8, 0x77, 0x95, 0xa1, 0xcc, + 0xb2, 0xca, 0xa2, 0xdc, 0x49, 0x59, 0x81, 0xad, 0x9b, 0xd3, 0xa8, 0x2d, + 0xc8, 0xcd, 0x6e, 0x3d, 0x96, 0x47, 0x76, 0xcc, 0x84, 0x6f, 0xc7, 0xbe, + 0x90, 0x64, 0xb6, 0x37, 0x97, 0x73, 0x66, 0x6a, 0xac, 0xa6, 0x63, 0x56, + 0x5a, 0x59, 0x8e, 0x5d, 0xcb, 0x56, 0x2d, 0x81, 0xb8, 0x80, 0x2e, 0x9a, + 
0x46, 0x96, 0x87, 0x5d, 0xaf, 0x94, 0xd3, 0x4f, 0xf3, 0xd9, 0x70, 0x60, + 0xac, 0xbf, 0xc0, 0xb7, 0x62, 0x40, 0x92, 0x7f, 0xc2, 0x44, 0xbc, 0xab, + 0x93, 0x5f, 0x5e, 0xd9, 0x6c, 0x55, 0xcd, 0x95, 0x68, 0x58, 0x41, 0x4d, + 0xa7, 0x59, 0xa1, 0x66, 0xc2, 0xe4, 0xac, 0x6a, 0x54, 0x9b, 0xbc, 0xbc, + 0x58, 0x3f, 0x3c, 0x48, 0x99, 0x75, 0xb8, 0x60, 0x9b, 0x86, 0xc8, 0xa3, + 0xcc, 0x6d, 0xb4, 0x58, 0x69, 0x85, 0x61, 0x8b, 0xb5, 0xc7, 0x46, 0x71, + 0x76, 0xb7, 0x81, 0xa6, 0xbf, 0x67, 0xb2, 0xd3, 0x7a, 0x2f, 0xc5, 0x8e, + 0xbd, 0xae, 0x9b, 0xcd, 0x68, 0x90, 0x4f, 0xad, 0xdf, 0x60, 0x5c, 0x98, + 0x7d, 0xb6, 0x95, 0xbe, 0x65, 0xc9, 0x95, 0x67, 0x41, 0xbb, 0x89, 0x33, + 0x44, 0x68, 0xcb, 0x8c, 0x4c, 0x48, 0x39, 0x71, 0x43, 0x51, 0x72, 0xaf, + 0xae, 0x8b, 0x59, 0xd9, 0x6e, 0xc4, 0x4c, 0xb5, 0x56, 0x48, 0x65, 0x52, + 0xdd, 0x59, 0x74, 0x79, 0x4e, 0xcf, 0xcd, 0x7d, 0x37, 0x78, 0x83, 0xa0, + 0xab, 0xce, 0x3a, 0x9f, 0x79, 0x91, 0xd2, 0xbd, 0x59, 0xda, 0xd1, 0xbc, + 0x4e, 0x64, 0x8d, 0x8a, 0x7f, 0x6d, 0x8c, 0x95, 0x34, 0xab, 0xcd, 0xb0, + 0xaf, 0x70, 0x60, 0x77, 0x60, 0x54, 0x5b, 0x88, 0x71, 0xba, 0x31, 0x83, + 0xd0, 0xd3, 0xa3, 0x3d, 0x87, 0xaf, 0x82, 0x53, 0xbb, 0x48, 0x73, 0x6d, + 0x46, 0xc8, 0xbc, 0xd2, 0xc9, 0x4c, 0xaa, 0x4b, 0xaf, 0x7a, 0xc5, 0x3e, + 0x49, 0x6f, 0x85, 0xab, 0x75, 0x58, 0x75, 0x95, 0x5f, 0x72, 0x8f, 0x4b, + 0x57, 0xb0, 0x88, 0xa8, 0x54, 0x87, 0x84, 0x43, 0xb6, 0xc2, 0x6b, 0x4a, + 0x65, 0xa8, 0xa7, 0x35, 0x6f, 0x56, 0x4d, 0x5f, 0x8d, 0x4c, 0x93, 0x8d, + 0x86, 0x6b, 0x9d, 0xb3, 0x49, 0x92, 0xc0, 0x8c, 0x4a, 0x68, 0x58, 0xc9, + 0xab, 0xab, 0x54, 0x67, 0xc5, 0x39, 0x80, 0x50, 0x6a, 0x77, 0xba, 0x4f, + 0x81, 0xbc, 0x63, 0xc3, 0x31, 0x57, 0x77, 0xa0, 0x9b, 0x6a, 0x59, 0x95, + 0xa7, 0xcb, 0x8f, 0x96, 0x6e, 0xa0, 0x3f, 0x67, 0xb9, 0xa1, 0x4e, 0x7f, + 0x91, 0x74, 0x44, 0x4e, 0xd3, 0x82, 0x62, 0xae, 0xa5, 0x96, 0x4f, 0x4c, + 0x6d, 0x73, 0xc2, 0x51, 0x69, 0xcc, 0x89, 0xb3, 0xb2, 0x71, 0x4b, 0x71, + 0xd3, 0x81, 0x47, 0x69, 0xc1, 0x9c, 0x8f, 0x53, 0x50, 0x9a, 0x80, 0x3c, + 0x6d, 0x97, 0x4e, 0x57, 0xa1, 0x77, 0x69, 0xa1, 0x9c, 0x8f, 0x93, 0x8a, + 0xaa, 0x6b, 0x72, 0x6c, 0xa0, 0x70, 0x43, 0x5b, 0x84, 0xb6, 0x31, 0xca, + 0x62, 0xd1, 0x86, 0xc9, 0x78, 0xb8, 0xc7, 0x32, 0x47, 0x7b, 0x42, 0x94, + 0x86, 0x6b, 0xc4, 0xa0, 0x32, 0x3d, 0xab, 0x57, 0x76, 0x52, 0x4e, 0x61, + 0x30, 0xbe, 0x6c, 0x95, 0xb5, 0xae, 0xa4, 0x4f, 0x87, 0xd8, 0x3a, 0x6b, + 0x91, 0xa2, 0x7b, 0x94, 0x7b, 0x72, 0x9a, 0x56, 0x3b, 0x30, 0x71, 0xbe, + 0x39, 0xac, 0x50, 0x48, 0x9e, 0x4a, 0xc2, 0x78, 0x5c, 0xca, 0x37, 0x3c, + 0x98, 0xac, 0x43, 0xa7, 0xa2, 0xa0, 0x3b, 0x51, 0xb2, 0x8f, 0xcf, 0x50, + 0x55, 0x51, 0x7a, 0x8a, 0xcc, 0xb7, 0x92, 0x50, 0x7d, 0xad, 0x81, 0xa9, + 0x54, 0x9b, 0x65, 0xa3, 0x58, 0x76, 0x9e, 0xa4, 0xce, 0x7d, 0x93, 0xca, + 0x3b, 0xab, 0x81, 0x35, 0x64, 0x38, 0x82, 0x36, 0xca, 0x83, 0x2f, 0xbb, + 0x43, 0x44, 0xbb, 0x5d, 0x4a, 0x46, 0xb6, 0x5d, 0x33, 0xab, 0x63, 0x5f, + 0x9a, 0x93, 0x4a, 0xab, 0xbe, 0x3a, 0xbd, 0x83, 0x5e, 0xd0, 0x75, 0x7f, + 0x7a, 0x76, 0x3f, 0x84, 0x4a, 0xd3, 0x37, 0x45, 0x7b, 0x69, 0xbe, 0x79, + 0x7b, 0x6f, 0x62, 0xcb, 0xa4, 0x9e, 0xb1, 0x66, 0x8c, 0x5d, 0x82, 0x73, + 0xd1, 0x72, 0x49, 0xb1, 0xc5, 0x5a, 0x5a, 0x9c, 0xa2, 0xbf, 0x7e, 0x6f, + 0x77, 0x45, 0x63, 0xd9, 0xa1, 0xaa, 0x87, 0x91, 0x6f, 0xa4, 0xaa, 0xa3, + 0xb2, 0xad, 0xc3, 0x9c, 0xb2, 0x62, 0x44, 0xcc, 0xc4, 0xb8, 0x9e, 0x43, + 0xc6, 0xa0, 0x3c, 0x9c, 0x6e, 0x7d, 0x9d, 0x8c, 0xc8, 0xaa, 0x5c, 0x52, + 0xc4, 0xc8, 0xb6, 0x6c, 0xab, 0x7f, 0x2e, 0xce, 0xc8, 0x49, 0x8e, 0xa9, + 0x88, 0x9e, 0x96, 0x31, 0x3a, 0x95, 0x91, 0x3b, 0x5b, 0x31, 0xda, 0x39, + 
0x59, 0x6c, 0x63, 0xa3, 0xc4, 0x95, 0xce, 0xb7, 0x77, 0x91, 0xad, 0x96, + 0x62, 0x34, 0x30, 0xb2, 0x7c, 0xb2, 0x96, 0x3f, 0xb1, 0xd2, 0x7a, 0xba, + 0x4e, 0x92, 0xb4, 0x82, 0x88, 0x35, 0x29, 0x88, 0x56, 0xb7, 0xc3, 0x7b, + 0x78, 0xaf, 0x7d, 0x9e, 0x9a, 0x85, 0x72, 0x56, 0x57, 0xa6, 0x57, 0x6d, + 0x4e, 0xb6, 0x8d, 0x7a, 0x6f, 0x75, 0x87, 0x61, 0xb5, 0x8d, 0x49, 0x65, + 0x65, 0x87, 0xab, 0x7e, 0x7b, 0x48, 0x3b, 0x65, 0x54, 0x8d, 0x5d, 0xb8, + 0x6b, 0x58, 0x42, 0x6e, 0x6d, 0xcb, 0x72, 0x8d, 0x72, 0xaa, 0x7b, 0xb0, + 0x73, 0xa1, 0x5b, 0xa4, 0xcc, 0x81, 0x6f, 0x73, 0xb4, 0xb5, 0x56, 0x58, + 0x67, 0x4e, 0x60, 0x53, 0x9b, 0xd0, 0x4f, 0xc8, 0xbf, 0xcb, 0x40, 0x3c, + 0x62, 0x9f, 0x9c, 0x5f, 0xd4, 0x71, 0x88, 0xa5, 0x4e, 0x51, 0x90, 0x9a, + 0x58, 0xae, 0xd8, 0x55, 0xac, 0xad, 0xc4, 0x49, 0xa2, 0x9a, 0x9d, 0x67, + 0x75, 0xab, 0x73, 0x4f, 0xb6, 0x30, 0xa8, 0x5b, 0x33, 0x51, 0xb7, 0x2e, + 0x97, 0x73, 0x95, 0xc6, 0x60, 0x56, 0xb3, 0x6f, 0x8e, 0xb0, 0x7b, 0x41, + 0xc7, 0x2f, 0xb9, 0x8a, 0x43, 0x77, 0x45, 0x38, 0x81, 0x91, 0xaa, 0x87, + 0xcf, 0xbf, 0x7f, 0x34, 0x36, 0x40, 0xac, 0x7b, 0xb3, 0x87, 0xb6, 0xb9, + 0xbd, 0x87, 0x3a, 0x50, 0x38, 0x83, 0x31, 0xce, 0xb3, 0x83, 0xc4, 0x2e, + 0xc1, 0xaa, 0x59, 0x66, 0xb2, 0xb6, 0x54, 0x80, 0x51, 0x45, 0x95, 0x7c, + 0x6a, 0x8e, 0x77, 0xb3, 0x93, 0xc3, 0xa5, 0xb9, 0x83, 0x89, 0x73, 0x3e, + 0x7f, 0xb9, 0x62, 0x4d, 0x68, 0x3d, 0x65, 0x5e, 0x7a, 0x46, 0x75, 0x9e, + 0x8c, 0x3a, 0xc2, 0xa9, 0x7b, 0x3e, 0xc6, 0x97, 0x98, 0xc5, 0x6a, 0x5f, + 0x88, 0x58, 0x98, 0x6f, 0x47, 0x91, 0x9f, 0x4d, 0xb8, 0x78, 0x94, 0xc4, + 0x83, 0xb1, 0x7d, 0x8a, 0x72, 0x4e, 0x3b, 0x6b, 0x32, 0x93, 0x8c, 0x36, + 0x6f, 0x9f, 0x78, 0x48, 0x7b, 0xc9, 0x3e, 0xaf, 0x8e, 0x48, 0x7c, 0x85, + 0x75, 0x4a, 0x7b, 0x62, 0xb7, 0xc0, 0x94, 0x8f, 0x41, 0x5f, 0xb0, 0x37, + 0xa2, 0x46, 0x9e, 0x59, 0x44, 0x44, 0x75, 0x59, 0x34, 0xa6, 0x83, 0x7c, + 0x67, 0x64, 0x34, 0x3f, 0xcb, 0x80, 0x8a, 0x86, 0x61, 0x5d, 0x40, 0x6a, + 0x7e, 0xab, 0xc9, 0xba, 0x93, 0xa6, 0x5f, 0xcd, 0x4b, 0x7c, 0x51, 0x84, + 0x8a, 0x48, 0xb4, 0x36, 0x37, 0x8e, 0x53, 0x93, 0xa0, 0x49, 0x84, 0x40, + 0x84, 0x82, 0x59, 0x64, 0x6f, 0x47, 0x88, 0x4d, 0xaa, 0xb5, 0xb0, 0xcd, + 0x42, 0x7c, 0xc9, 0x6e, 0xa2, 0x5c, 0xc6, 0x79, 0x8a, 0xb7, 0x91, 0xcf, + 0xb5, 0x80, 0x63, 0x36, 0x7c, 0x32, 0xb6, 0x32, 0x72, 0x65, 0x80, 0xa6, + 0x45, 0xce, 0x4e, 0xaa, 0xb4, 0x70, 0x92, 0x49, 0x5c, 0x68, 0x87, 0x50, + 0x3f, 0x82, 0x7d, 0xb3, 0xac, 0x9c, 0xc0, 0xcc, 0x4c, 0x7b, 0x94, 0xc7, + 0x5c, 0x9b, 0xba, 0x6b, 0x56, 0xce, 0xc9, 0x38, 0x64, 0x76, 0x4c, 0xbe, + 0x88, 0x61, 0x78, 0x92, 0xc7, 0x5e, 0x88, 0x3c, 0x8b, 0xb9, 0x3d, 0xd2, + 0x47, 0xb8, 0x39, 0xca, 0x7a, 0x75, 0x80, 0x4b, 0x7d, 0x94, 0xbb, 0x4a, + 0x3b, 0x28, 0x57, 0xc2, 0x27, 0xa3, 0x74, 0x48, 0x8b, 0x9e, 0x8b, 0x8f, + 0x96, 0xa7, 0x64, 0x73, 0xc1, 0x50, 0x3a, 0x3b, 0xb6, 0x9d, 0x4d, 0x7d, + 0x39, 0x88, 0x88, 0x54, 0xa0, 0x88, 0xc7, 0x3f, 0x6d, 0xb9, 0xac, 0x7d, + 0x74, 0xd8, 0xca, 0x39, 0xa9, 0xd4, 0xb5, 0x8b, 0xa7, 0x8d, 0x61, 0x59, + 0x82, 0x78, 0xc6, 0x85, 0xcc, 0x74, 0xb1, 0xab, 0xd0, 0x8c, 0xd2, 0xc7, + 0x75, 0xb9, 0x3e, 0x89, 0x83, 0x5a, 0xa9, 0x39, 0x94, 0xa0, 0xbb, 0x4e, + 0x71, 0xa6, 0x30, 0x46, 0x5b, 0x7b, 0xa4, 0x5a, 0xb2, 0x44, 0x57, 0x64, + 0x84, 0x5f, 0xd4, 0x71, 0x92, 0x5f, 0x60, 0x3d, 0x4c, 0x8d, 0x31, 0x9a, + 0x76, 0x8f, 0x68, 0xbf, 0xab, 0x77, 0xa4, 0x90, 0x8a, 0x80, 0x50, 0xb8, + 0x95, 0x41, 0xb8, 0x60, 0xc7, 0x8c, 0x9b, 0x55, 0xba, 0xc1, 0xb0, 0x32, + 0xd2, 0x53, 0x77, 0x41, 0xbb, 0xb9, 0x58, 0x36, 0x8b, 0x40, 0xa5, 0x4e, + 0xa1, 0x3f, 0x85, 0xda, 0x56, 0x3a, 0x6f, 0x71, 0x8b, 0x94, 0x88, 0xb3, + 
0x94, 0xaf, 0xc4, 0xa4, 0xc4, 0x89, 0x59, 0x4a, 0x90, 0x30, 0x1e, 0x78, + 0xa4, 0x73, 0x62, 0xb7, 0xc8, 0x5c, 0xca, 0x5a, 0x47, 0x2d, 0xf, 0x95, + 0x6b, 0x93, 0x57, 0x5b, 0xc8, 0xaa, 0x7f, 0x57, 0x3b, 0x5b, 0x82, 0x8b, + 0x44, 0x9b, 0xc9, 0xa9, 0x8d, 0x6e, 0x96, 0xbf, 0x2d, 0x76, 0x57, 0x9e, + 0x3a, 0x7c, 0xce, 0x56, 0x6b, 0x96, 0x9e, 0xbf, 0xb6, 0x6b, 0xbc, 0x4d, + 0x29, 0x64, 0x49, 0xc8, 0x78, 0x73, 0xa8, 0x77, 0x62, 0x69, 0xcd, 0x59, + 0xa8, 0xb4, 0x89, 0x9c, 0xb6, 0x72, 0x9f, 0x18, 0x5a, 0x30, 0xbc, 0x9d, + 0xa8, 0x42, 0xa5, 0x84, 0x42, 0x50, 0xba, 0xd4, 0x3c, 0x4e, 0xae, 0xbd, + 0x79, 0xd2, 0x2e, 0xae, 0x7a, 0xba, 0x36, 0x2a, 0xad, 0xbb, 0x96, 0x41, + 0xa1, 0xcd, 0x5a, 0x68, 0xc7, 0x92, 0xb2, 0x9d, 0x71, 0xbf, 0x3d, 0x91, + 0x63, 0x7d, 0x66, 0x4b, 0x63, 0x67, 0x4b, 0xa5, 0x62, 0xc4, 0x8e, 0x4f, + 0xcc, 0x6a, 0x46, 0x76, 0x70, 0x98, 0x84, 0x3f, 0x62, 0xb4, 0x3e, 0xc9, + 0x4a, 0xb5, 0x75, 0xa2, 0x51, 0x37, 0xe0, 0x75, 0x88, 0x38, 0xa7, 0x95, + 0x40, 0x59, 0xcb, 0x73, 0xc6, 0x87, 0xdb, 0xd3, 0x89, 0x5d, 0x5c, 0x8b, + 0x7d, 0x96, 0x51, 0x6d, 0x3f, 0x88, 0x3b, 0x96, 0xab, 0x65, 0x53, 0xd4, + 0x41, 0xce, 0xc3, 0x24, 0x9c, 0x73, 0xca, 0x6a, 0x4a, 0x60, 0x72, 0xb8, + 0x7e, 0x5c, 0x71, 0x97, 0xc6, 0x4b, 0x52, 0x3f, 0x3a, 0x61, 0x40, 0xba, + 0x41, 0x58, 0x61, 0xb0, 0x87, 0xc4, 0x3b, 0x5f, 0x7f, 0x88, 0x41, 0x74, + 0xb4, 0xa2, 0x85, 0xd0, 0x65, 0x97, 0x46, 0xa0, 0x69, 0x51, 0x2f, 0x4b, + 0x9a, 0xc0, 0x72, 0x3a, 0xa8, 0x95, 0x67, 0x56, 0x97, 0x58, 0x30, 0x82, + 0x6b, 0x40, 0x79, 0xbd, 0xc3, 0xc5, 0x3a, 0x51, 0xb6, 0xe5, 0x92, 0x4a, + 0xad, 0xae, 0x77, 0x29, 0xbb, 0x73, 0x81, 0x3d, 0xb8, 0xa1, 0x9e, 0xa9, + 0x4e, 0x85, 0x61, 0x96, 0xb7, 0x2b, 0x89, 0x78, 0xac, 0x8e, 0xc5, 0xaf, + 0x90, 0x61, 0x75, 0x69, 0xc2, 0x73, 0x5f, 0xe0, 0xc4, 0x9a, 0x4e, 0x74, + 0xa7, 0xbb, 0x5e, 0x8d, 0xc1, 0x53, 0xa2, 0xaf, 0xac, 0x74, 0x6e, 0x52, + 0x9f, 0xd3, 0x64, 0xb9, 0x75, 0x34, 0x93, 0xa5, 0x6f, 0xa8, 0x8d, 0x36, + 0xa5, 0xa6, 0x5c, 0x6d, 0x95, 0xc2, 0x6c, 0x53, 0x7f, 0x52, 0x36, 0x58, + 0x65, 0x43, 0xaa, 0x5f, 0xa9, 0x51, 0x33, 0xbf, 0x72, 0xa1, 0xa4, 0x36, + 0x40, 0xc9, 0xb7, 0x2e, 0x5d, 0xcf, 0xc1, 0xa2, 0x58, 0x78, 0x7f, 0xbd, + 0xc1, 0x74, 0xd3, 0x3b, 0x84, 0x7b, 0x95, 0x52, 0x44, 0xab, 0xd5, 0xd2, + 0x47, 0x61, 0x84, 0x85, 0xa1, 0xb7, 0x63, 0xc1, 0xbf, 0x97, 0xdc, 0x6a, + 0xb4, 0x3f, 0xae, 0xa5, 0x4e, 0x77, 0x79, 0xc2, 0x5d, 0xde, 0x6f, 0xa9, + 0x90, 0x93, 0xdc, 0xa4, 0x93, 0xc7, 0xe2, 0x96, 0xa9, 0xb4, 0x88, 0xd7, + 0xa0, 0x6d, 0x5f, 0x9b, 0x96, 0x9c, 0xb4, 0x62, 0x73, 0xa9, 0xcc, 0x78, + 0x9a, 0x4e, 0xb4, 0x42, 0x46, 0xd2, 0x6a, 0x71, 0xcf, 0x9c, 0x73, 0x6f, + 0x7f, 0x4c, 0x49, 0x4f, 0x9f, 0xd8, 0x25, 0x55, 0x66, 0x33, 0x78, 0x8b, + 0x41, 0x30, 0x7f, 0xaf, 0x5d, 0x8b, 0x94, 0xdf, 0xa7, 0x42, 0xc3, 0x62, + 0x8b, 0x8f, 0x84, 0x3e, 0x40, 0x5c, 0x46, 0x8d, 0xbb, 0x7d, 0xd0, 0xb7, + 0x6c, 0xb0, 0x62, 0xa7, 0xa7, 0xac, 0x85, 0x77, 0x62, 0x96, 0xae, 0x17, + 0x88, 0x85, 0x84, 0x5e, 0x3a, 0xc3, 0x43, 0x4a, 0xe3, 0x3f, 0xbb, 0x5a, + 0xa0, 0xb5, 0x85, 0x72, 0x26, 0x2a, 0xa3, 0xb4, 0x4e, 0x75, 0x62, 0x5f, + 0xcb, 0x3c, 0x42, 0x76, 0x6c, 0xd3, 0x90, 0x6e, 0xa1, 0xb5, 0x33, 0x45, + 0x5d, 0x48, 0xa5, 0x54, 0x9a, 0x78, 0xa7, 0x38, 0xb7, 0xbc, 0x2e, 0x5b, + 0xa3, 0x2d, 0x74, 0x35, 0x6a, 0xc2, 0xa7, 0xc6, 0x38, 0x97, 0x61, 0x8e, + 0x86, 0x67, 0xa7, 0x70, 0xb8, 0x76, 0xcd, 0xa1, 0x51, 0xc4, 0x99, 0xb7, + 0x7a, 0x94, 0x51, 0x41, 0x79, 0x9c, 0x9c, 0x7e, 0x81, 0x60, 0x6d, 0xbb, + 0xa1, 0x89, 0x53, 0xb0, 0x82, 0xb3, 0x35, 0x35, 0xbe, 0x89, 0x4a, 0x8b, + 0xa2, 0xc5, 0x95, 0x5c, 0x4f, 0x8e, 0xc7, 0x43, 0xbd, 0x2d, 0xa3, 0x79, + 
0xa2, 0x60, 0x40, 0x45, 0xad, 0xab, 0x48, 0x79, 0x6d, 0x48, 0xa2, 0x5d, + 0x74, 0x83, 0xb9, 0x4c, 0xad, 0x43, 0x2f, 0x9a, 0x7c, 0x74, 0x85, 0x26, + 0x39, 0xc4, 0xdc, 0xb0, 0xa3, 0x33, 0xc3, 0x60, 0x78, 0x42, 0xb8, 0x8c, + 0x70, 0x5d, 0x70, 0xbc, 0xc8, 0x4a, 0xa0, 0x91, 0xcf, 0x36, 0x22, 0xa7, + 0x39, 0x74, 0x45, 0xaa, 0xc3, 0xf7, 0xcf, 0x2e, 0x44, 0x71, 0xd2, 0xd1, + 0xc2, 0x83, 0x56, 0x52, 0x6b, 0xd3, 0xbd, 0xaf, 0xc9, 0x63, 0x44, 0x95, + 0x56, 0x61, 0xd9, 0x8b, 0x39, 0x66, 0xab, 0x78, 0x8f, 0x7c, 0x9b, 0xe1, + 0x4a, 0x7f, 0x38, 0x34, 0xc0, 0x51, 0x50, 0x83, 0x63, 0xa2, 0x71, 0xd8, + 0xbe, 0x85, 0x5c, 0x31, 0x5f, 0xb8, 0xa8, 0x3f, 0x3a, 0x81, 0x94, 0x7d, + 0xa5, 0xba, 0x55, 0x51, 0x84, 0xb4, 0xa1, 0x29, 0xd0, 0x70, 0xad, 0x56, + 0x76, 0xc3, 0x9d, 0x6f, 0x8a, 0xa7, 0xb2, 0xc0, 0x59, 0x3c, 0xc0, 0x89, + 0x98, 0x7c, 0x4b, 0xd2, 0x89, 0xcf, 0x86, 0x82, 0x73, 0x75, 0xb0, 0x8f, + 0x7e, 0x89, 0x86, 0x6f, 0x72, 0x87, 0x75, 0x53, 0x30, 0xce, 0x84, 0x6e, + 0x8a, 0x59, 0x5b, 0x40, 0x4a, 0x68, 0x42, 0xad, 0x8a, 0x3b, 0x81, 0xa1, + 0x7f, 0x54, 0x71, 0x46, 0x7d, 0xb2, 0x81, 0xa3, 0x38, 0x6a, 0x3b, 0xce, + 0x6b, 0x95, 0x30, 0xc0, 0x91, 0x82, 0xb8, 0x76, 0x99, 0xba, 0xbb, 0x3b, + 0x91, 0x85, 0x7b, 0x9b, 0x74, 0x5d, 0x34, 0x9e, 0x75, 0xb6, 0x3b, 0x3a, + 0x51, 0x4f, 0x4a, 0xae, 0xbd, 0x57, 0x79, 0x59, 0xa1, 0xc1, 0x61, 0x86, + 0x74, 0x88, 0x97, 0x3f, 0xb5, 0x7e, 0x68, 0x70, 0xce, 0xa9, 0x49, 0xbd, + 0xcb, 0x71, 0x66, 0xac, 0x75, 0xbc, 0xb4, 0xc0, 0xcc, 0x5d, 0xaf, 0x77, + 0x40, 0x4c, 0x60, 0x43, 0x4a, 0x5e, 0x89, 0xa1, 0x70, 0x93, 0x43, 0xaf, + 0x7a, 0xb9, 0x5b, 0x62, 0x3e, 0x48, 0xb5, 0xcc, 0xa4, 0x86, 0x89, 0x37, + 0x9b, 0x47, 0x7b, 0x31, 0xa3, 0x3a, 0x50, 0xc5, 0xc2, 0x56, 0x74, 0x5b, + 0x69, 0xd6, 0x4a, 0x7e, 0xa5, 0x91, 0xa1, 0x40, 0xaf, 0x8d, 0xc6, 0x4d, + 0x40, 0x65, 0xd2, 0x65, 0xc9, 0x95, 0xce, 0xcd, 0xcf, 0xbf, 0xd5, 0xa9, + 0x9d, 0x9f, 0xb2, 0xbd, 0x55, 0x3a, 0x70, 0xaf, 0xbd, 0x39, 0x38, 0x90, + 0x90, 0xc0, 0x8c, 0xa0, 0x70, 0xa6, 0xa1, 0x8c, 0x49, 0x3e, 0x33, 0x97, + 0xc9, 0xa0, 0x7d, 0x73, 0x33, 0x9d, 0x8d, 0xc4, 0x4b, 0xb3, 0xd2, 0x62, + 0x66, 0xba, 0x9e, 0x51, 0xc2, 0xb0, 0x47, 0xb6, 0x45, 0xba, 0x8d, 0x48, + 0x5c, 0x99, 0x3d, 0xcf, 0x39, 0xba, 0x60, 0x47, 0xaf, 0xc0, 0x90, 0xca, + 0x7c, 0x85, 0xd4, 0x7f, 0xcb, 0x52, 0x9f, 0x71, 0xb2, 0x6d, 0x7d, 0xaa, + 0x57, 0x77, 0xa5, 0x38, 0xb7, 0x64, 0xba, 0x93, 0x8b, 0xca, 0x56, 0xc0, + 0x5d, 0xd1, 0xaf, 0xd2, 0xc9, 0x5b, 0x66, 0x9e, 0x43, 0x62, 0x81, 0x87, + 0x68, 0xcf, 0x56, 0xd0, 0xc5, 0x5b, 0x78, 0x92, 0xb1, 0xb0, 0x51, 0xac, + 0x7a, 0x70, 0x9a, 0xb0, 0x8d, 0x53, 0xc0, 0x48, 0xac, 0x7a, 0x9a, 0xaa, + 0x79, 0x52, 0xbf, 0x73, 0x7f, 0x3e, 0xa3, 0x47, 0x6c, 0xc0, 0xcf, 0x67, + 0x3f, 0x90, 0xa6, 0xa5, 0x7e, 0x7f, 0x70, 0x36, 0x70, 0xbd, 0x35, 0x3f, + 0x49, 0xa6, 0x5f, 0xb0, 0xc3, 0x3d, 0xcf, 0x9b, 0xc5, 0xc6, 0xd5, 0x61, + 0x69, 0xb6, 0xd3, 0xae, 0x74, 0xb0, 0x32, 0x92, 0xbb, 0x8e, 0x47, 0x49, + 0x6a, 0x7f, 0x95, 0x91, 0xb2, 0xd1, 0x74, 0x82, 0xc5, 0x64, 0x47, 0xba, + 0x44, 0xd3, 0x7f, 0x70, 0xba, 0xcf, 0x52, 0xcc, 0x8a, 0x3c, 0x5e, 0x5a, + 0xca, 0x87, 0xaf, 0x3b, 0x65, 0xc1, 0x7d, 0x6c, 0x97, 0xc5, 0x40, 0x68, + 0x43, 0x33, 0xc9, 0x3d, 0x93, 0x84, 0x96, 0x60, 0xb4, 0x8b, 0x33, 0x7a, + 0xbf, 0xd3, 0xc6, 0x69, 0x82, 0x50, 0x9d, 0x6c, 0x60, 0x71, 0xa4, 0x50, + 0x56, 0x69, 0x40, 0xaa, 0xc9, 0xa7, 0xb0, 0x6c, 0x65, 0x8d, 0x90, 0x59, + 0xb4, 0xcf, 0xa5, 0xb5, 0x5a, 0x35, 0x3f, 0x62, 0xb5, 0x5a, 0xc3, 0x46, + 0xaf, 0x5c, 0x64, 0x35, 0x7e, 0x88, 0x49, 0x9e, 0xc2, 0x5d, 0x72, 0x4f, + 0x9f, 0x90, 0x52, 0x46, 0xb3, 0xb5, 0x91, 0xd6, 0x79, 0x42, 0x48, 0x45, + 
0x71, 0xba, 0x75, 0xc6, 0x70, 0x8b, 0xc4, 0x47, 0x6f, 0xa8, 0x93, 0xca, + 0x92, 0x4a, 0xb1, 0xad, 0xc4, 0x7b, 0x34, 0x7a, 0x7c, 0xb1, 0x8b, 0x6d, + 0x6e, 0xb8, 0x39, 0x6d, 0x5e, 0x80, 0x89, 0x7c, 0x9f, 0x50, 0xca, 0x7d, + 0x90, 0x41, 0xb6, 0xd2, 0x39, 0xb1, 0x4b, 0x52, 0x55, 0x59, 0x51, 0x99, + 0x83, 0xc1, 0x80, 0xbe, 0x4b, 0x85, 0x57, 0xba, 0x52, 0xd1, 0x4c, 0x6c, + 0xb4, 0xcb, 0x66, 0x71, 0x85, 0x60, 0xcb, 0xd0, 0xa5, 0x42, 0xc1, 0xcc, + 0x9f, 0xa1, 0x37, 0xd6, 0x57, 0x47, 0x52, 0xb4, 0x81, 0xc6, 0xaa, 0xcd, + 0x88, 0x46, 0x33, 0x75, 0x98, 0xa8, 0xc6, 0x4e, 0x43, 0xc4, 0x90, 0x55, + 0x9d, 0x6a, 0x75, 0x65, 0x82, 0x51, 0x58, 0x50, 0x6e, 0xb7, 0xb6, 0x80, + 0xb1, 0xd3, 0x5e, 0xa8, 0x5f, 0x82, 0xd5, 0x38, 0x5b, 0xa9, 0xa5, 0x32, + 0x58, 0x53, 0xde, 0x48, 0xb5, 0x4d, 0x8c, 0x7a, 0x5c, 0x95, 0x7c, 0xcc, + 0x54, 0xa5, 0x61, 0x9f, 0x47, 0xa7, 0xc7, 0xce, 0x59, 0xb6, 0x8b, 0x9b, + 0xa7, 0x75, 0x8c, 0xc3, 0x53, 0xc9, 0x78, 0xa6, 0x50, 0x5b, 0xa2, 0x98, + 0x6c, 0xa7, 0x9a, 0xb4, 0x6f, 0x58, 0x30, 0x40, 0xc4, 0x6f, 0xc0, 0xc6, + 0x7e, 0xcb, 0xac, 0x5b, 0x7e, 0xb0, 0x5a, 0x8a, 0xb2, 0x9e, 0xa0, 0x62, + 0x7e, 0x37, 0x84, 0x6e, 0x7c, 0x7c, 0x3a, 0x99, 0x5b, 0x80, 0xa2, 0x79, + 0x4d, 0x90, 0xb0, 0x3d, 0xaa, 0xa2, 0x65, 0x5f, 0x6e, 0x78, 0x65, 0x79, + 0x56, 0x91, 0xa7, 0xc8, 0x8e, 0x5e, 0x88, 0x45, 0x6d, 0x69, 0x3a, 0x52, + 0xd0, 0x5f, 0xc2, 0xa0, 0xbc, 0x70, 0x5d, 0x6e, 0x9d, 0x67, 0x8a, 0xa5, + 0xbe, 0x67, 0x5d, 0x7d, 0x67, 0x8e, 0xae, 0xb1, 0xc7, 0x4c, 0x45, 0x72, + 0x4e, 0xa1, 0xbd, 0xc3, 0x73, 0x74, 0x90, 0xc7, 0xa3, 0xa1, 0x7b, 0x41, + 0x87, 0x66, 0xb8, 0xa3, 0x53, 0xa7, 0x98, 0x60, 0x98, 0xb3, 0xd3, 0x90, + 0xa5, 0xcb, 0x97, 0xa7, 0x92, 0x9f, 0xcd, 0x68, 0xd1, 0x5c, 0x3f, 0x74, + 0x53, 0x86, 0x7d, 0x3a, 0x7b, 0x69, 0x9e, 0xd0, 0xd0, 0xd3, 0xb3, 0x5a, + 0xa2, 0x55, 0x97, 0x92, 0xb7, 0xae, 0x76, 0x3c, 0xd7, 0xd6, 0x3d, 0x6c, + 0xa5, 0x5b, 0x43, 0xae, 0x90, 0x7c, 0x3b, 0x41, 0x75, 0x8e, 0xc5, 0xcb, + 0xc8, 0x81, 0xb2, 0xbf, 0x7c, 0xa1, 0x4e, 0x58, 0xd4, 0x7b, 0x9b, 0xca, + 0x96, 0xbb, 0x52, 0x32, 0xc5, 0x85, 0x7c, 0x74, 0xcf, 0xb2, 0x7e, 0x77, + 0x58, 0x9a, 0xbd, 0x51, 0x7c, 0xcd, 0xba, 0x43, 0xa9, 0x47, 0xbc, 0xa1, + 0x51, 0x71, 0xd3, 0xba, 0x8a, 0xba, 0x3f, 0x9f, 0x90, 0x62, 0x8c, 0x99, + 0x79, 0xd0, 0x5c, 0x88, 0x3e, 0xaa, 0x57, 0x5e, 0xb4, 0x77, 0xae, 0x4b, + 0x5d, 0xc8, 0xc9, 0x6d, 0x4c, 0x93, 0x55, 0x87, 0xc5, 0xc3, 0x41, 0xbe, + 0xcf, 0xad, 0x79, 0x38, 0x9f, 0x72, 0x58, 0x8d, 0x9a, 0x61, 0x42, 0x78, + 0x72, 0xc5, 0x42, 0x9d, 0x3c, 0xcd, 0x87, 0x3a, 0x3f, 0x37, 0x44, 0xbc, + 0x50, 0x57, 0x95, 0x3e, 0x42, 0x3a, 0x4d, 0xa1, 0xad, 0x71, 0x5a, 0x38, + 0xc1, 0xae, 0x85, 0xb1, 0xb0, 0x4d, 0xbc, 0x9a, 0x7f, 0xd0, 0x71, 0x31, + 0xac, 0xb5, 0xb9, 0x4c, 0xd8, 0x61, 0xab, 0x3e, 0x92, 0x64, 0xc5, 0xbf, + 0x42, 0xd3, 0x51, 0x82, 0x9c, 0x62, 0x38, 0x60, 0x8b, 0x81, 0x56, 0x83, + 0xb9, 0xac, 0x67, 0x3f, 0xc5, 0x50, 0x48, 0x65, 0xd3, 0xac, 0x64, 0x5a, + 0xa2, 0xc0, 0xc2, 0x65, 0x6b, 0x9a, 0x4a, 0xd6, 0xb0, 0x92, 0x62, 0xb1, + 0xb7, 0xc7, 0x89, 0x44, 0x79, 0xb2, 0xbf, 0xb2, 0x3c, 0x90, 0x8c, 0xc3, + 0x53, 0x88, 0x71, 0x87, 0x66, 0x33, 0x9a, 0xbf, 0xcd, 0x9a, 0x68, 0xba, + 0xb0, 0x79, 0x3a, 0x45, 0x8c, 0x3e, 0x6d, 0x94, 0x89, 0x62, 0xb0, 0x41, + 0x85, 0xac, 0xa3, 0xad, 0xce, 0x31, 0xd3, 0x73, 0x48, 0x40, 0x8c, 0xbc, + 0x33, 0xb9, 0xa1, 0x92, 0x57, 0x3e, 0x93, 0x46, 0xad, 0x86, 0xc6, 0xa3, + 0xca, 0xba, 0x6e, 0x37, 0x4b, 0xa4, 0x80, 0x99, 0x33, 0x5b, 0x88, 0x51, + 0x31, 0xab, 0x9e, 0xd2, 0x7c, 0xb1, 0x81, 0xb2, 0x51, 0x68, 0xd4, 0x7a, + 0x99, 0xb6, 0x54, 0x86, 0x82, 0xc8, 0x75, 0x3b, 0x44, 0xa2, 0x5c, 0xcb, + 
0x7d, 0x88, 0xc2, 0x7d, 0x5c, 0xb6, 0x5f, 0xb7, 0x77, 0xd3, 0xd0, 0x69, + 0xad, 0x57, 0x9d, 0x55, 0x7c, 0x5b, 0x7e, 0xc9, 0x89, 0x80, 0x33, 0x9c, + 0x41, 0x8f, 0x33, 0x35, 0xb4, 0x6b, 0x3d, 0x6b, 0x4c, 0x52, 0x44, 0x7d, + 0xd1, 0xa7, 0x34, 0x76, 0x98, 0x67, 0x2f, 0x66, 0xc8, 0x5b, 0x33, 0x42, + 0xaa, 0x9a, 0x4e, 0x3b, 0x51, 0x55, 0x67, 0xcb, 0x80, 0xc9, 0x3b, 0x83, + 0xca, 0x96, 0x9a, 0x84, 0xc9, 0xa0, 0x87, 0xb2, 0x45, 0xb5, 0x72, 0x63, + 0x94, 0x6a, 0xc9, 0x68, 0xc0, 0x4e, 0x3f, 0x9e, 0xb9, 0xca, 0x84, 0x3a, + 0x84, 0x90, 0x70, 0x99, 0xca, 0xa9, 0xab, 0x6e, 0x43, 0x44, 0xcd, 0x9b, + 0x5c, 0x35, 0x7d, 0xcf, 0x5d, 0xb7, 0xac, 0xc9, 0x24, 0xbb, 0xc9, 0x86, + 0x2d, 0x75, 0x96, 0x5f, 0xd3, 0xce, 0xb0, 0x45, 0x4f, 0xa2, 0x5c, 0x40, + 0x66, 0x88, 0xc0, 0x92, 0x43, 0x80, 0x68, 0x61, 0x79, 0x57, 0xb9, 0x5c, + 0x5e, 0x50, 0xa5, 0xac, 0x4f, 0x66, 0xa0, 0x48, 0x5c, 0x65, 0x91, 0xa8, + 0x6b, 0x6e, 0x82, 0x65, 0x3f, 0x5e, 0x8a, 0xb1, 0x9e, 0xc6, 0x82, 0xd2, + 0x60, 0x93, 0x57, 0x71, 0xbc, 0xb1, 0x56, 0x5b, 0x82, 0x69, 0x88, 0x90, + 0x68, 0x2e, 0x88, 0x86, 0x31, 0xc1, 0x8e, 0x8d, 0x34, 0x9a, 0xac, 0xd3, + 0x5c, 0xb7, 0x77, 0x36, 0x65, 0xb3, 0xa9, 0xcf, 0xa5, 0x4d, 0x31, 0x6f, + 0xc8, 0x74, 0x42, 0x7e, 0x4a, 0x84, 0x86, 0x84, 0x44, 0x82, 0x88, 0x86, + 0x47, 0x6a, 0x8d, 0x37, 0x51, 0xaa, 0x93, 0x67, 0xc1, 0x93, 0xcf, 0xc7, + 0x5b, 0x96, 0x88, 0x5f, 0xb4, 0x32, 0x5b, 0xb8, 0xac, 0x4e, 0xc5, 0xcf, + 0x45, 0xb5, 0x6d, 0x88, 0x8b, 0xbc, 0x3e, 0x5f, 0xce, 0xad, 0x81, 0x3a, + 0x86, 0x2f, 0xba, 0x81, 0x7a, 0x6e, 0x68, 0xd1, 0x82, 0x40, 0x67, 0xa7, + 0x52, 0x8b, 0xa3, 0x98, 0x61, 0xa2, 0x6e, 0x9b, 0xa1, 0x47, 0x56, 0xa0, + 0x6e, 0xbd, 0x79, 0x66, 0x93, 0xb4, 0x5f, 0x6a, 0xba, 0x8d, 0xcd, 0xc3, + 0xc4, 0x89, 0x68, 0x47, 0x5c, 0x3f, 0x84, 0x39, 0x6e, 0x40, 0x96, 0xb8, + 0x73, 0x31, 0xc9, 0xbc, 0xc7, 0x51, 0x3a, 0x62, 0x57, 0x2f, 0x66, 0x64, + 0x93, 0xb4, 0xbe, 0x9b, 0xb9, 0x54, 0x68, 0xa3, 0x72, 0x70, 0x9b, 0x82, + 0x46, 0xc8, 0xc6, 0x3b, 0x5f, 0x56, 0xb1, 0x50, 0x5a, 0xc7, 0xb2, 0x7e, + 0x5b, 0x65, 0xbb, 0xaf, 0xc9, 0x78, 0x94, 0x92, 0xaf, 0x34, 0x49, 0xa7, + 0x3c, 0x3e, 0x4d, 0x56, 0x83, 0xd1, 0x4a, 0x30, 0x53, 0xbb, 0x50, 0x9c, + 0x69, 0xbf, 0x55, 0xd4, 0x4b, 0x81, 0x44, 0xc8, 0xbf, 0x9c, 0x63, 0x61, + 0x33, 0x97, 0x88, 0xa9, 0x80, 0x3f, 0x4a, 0x94, 0xc8, 0x48, 0x41, 0xbe, + 0x35, 0x7d, 0xa7, 0x2f, 0x86, 0x67, 0xb6, 0xd0, 0x68, 0x85, 0x46, 0xb2, + 0x5e, 0xce, 0x49, 0x8a, 0x7a, 0xc4, 0x4c, 0x77, 0x8c, 0x78, 0x86, 0x46, + 0x50, 0xc4, 0x5f, 0x8c, 0x57, 0xd1, 0x44, 0x5e, 0x90, 0x66, 0x59, 0xca, + 0x5f, 0xc1, 0xb5, 0xba, 0xd1, 0x6a, 0xb4, 0x3d, 0x54, 0xb4, 0x55, 0x34, + 0x53, 0x3c, 0xcf, 0xbd, 0x71, 0x70, 0xa2, 0x8f, 0x56, 0x47, 0xa8, 0xb4, + 0xcf, 0x67, 0xbf, 0x76, 0x41, 0x6d, 0xc2, 0xa0, 0x3a, 0xb9, 0xbd, 0x4f, + 0x47, 0x5b, 0x4e, 0x66, 0x78, 0x5d, 0x53, 0x8e, 0xa3, 0xcd, 0xcf, 0xb2, + 0x6d, 0xa0, 0x5e, 0x93, 0x77, 0x9a, 0x41, 0x82, 0x53, 0x3a, 0x45, 0x41, + 0x82, 0x6a, 0x7f, 0x5b, 0x6c, 0xb8, 0xb0, 0xd5, 0x6c, 0x4d, 0x4f, 0x3d, + 0xc4, 0x99, 0x44, 0x8e, 0xc5, 0x5f, 0x52, 0x9a, 0x5c, 0x3b, 0x34, 0x92, + 0x48, 0x37, 0xd4, 0x58, 0x7f, 0x75, 0xc2, 0x3f, 0x54, 0x9f, 0xcb, 0x62, + 0xb1, 0x95, 0x87, 0x8e, 0xba, 0x9b, 0x79, 0xce, 0x86, 0xa7, 0x4c, 0x8c, + 0x8b, 0xca, 0x3e, 0x95, 0xd3, 0x5e, 0x80, 0xc3, 0xd2, 0x4d, 0x82, 0x6c, + 0x34, 0x35, 0xb4, 0xb2, 0x96, 0x74, 0xdc, 0x7e, 0x39, 0x6c, 0xa0, 0x83, + 0xbc, 0xc4, 0x7f, 0x75, 0xb5, 0x76, 0x62, 0xba, 0x45, 0xb5, 0x87, 0x64, + 0x95, 0x83, 0x63, 0xbe, 0x40, 0xa7, 0xbf, 0x4c, 0x42, 0x58, 0xcc, 0x48, + 0x6d, 0x59, 0xbf, 0xbe, 0x6c, 0x88, 0xc2, 0xbf, 0x50, 0x60, 0x8c, 0x94, + 
0x66, 0xa0, 0x63, 0xa6, 0xb9, 0xae, 0x77, 0x3f, 0x85, 0xc3, 0x3b, 0x30, + 0x37, 0x68, 0x83, 0x5d, 0xa4, 0x64, 0xa4, 0xb1, 0xb3, 0x63, 0x47, 0xb7, + 0x76, 0xa1, 0x7d, 0x78, 0x5f, 0x8f, 0xc3, 0x50, 0x7b, 0x32, 0x80, 0x8d, + 0x31, 0x97, 0xbb, 0x60, 0xca, 0xb0, 0x87, 0x6d, 0x3a, 0x76, 0xae, 0xa0, + 0xb3, 0xc2, 0x9e, 0x4d, 0xb3, 0x81, 0xbf, 0x98, 0xbb, 0x61, 0xa6, 0x85, + 0x41, 0x5d, 0xab, 0x44, 0x7b, 0xc5, 0xbf, 0xc1, 0xce, 0xb3, 0x5d, 0xa7, + 0x66, 0x5e, 0x65, 0xce, 0xd2, 0x7d, 0xa7, 0x48, 0x85, 0x8b, 0x7f, 0x85, + 0x7f, 0xd2, 0x41, 0xbc, 0x54, 0x5e, 0x8e, 0x98, 0x81, 0x4e, 0x96, 0x4b, + 0x40, 0x64, 0x9d, 0x74, 0xb3, 0x8c, 0x6e, 0x32, 0xb6, 0xd3, 0x44, 0xac, + 0xce, 0x68, 0x56, 0x6b, 0x82, 0x96, 0xb2, 0xd6, 0xa8, 0x41, 0xb4, 0x4b, + 0x84, 0x47, 0x45, 0x96, 0x48, 0xad, 0x3c, 0xc8, 0xc6, 0xd1, 0x63, 0xb7, + 0xb3, 0xca, 0x65, 0x7a, 0x5c, 0xa9, 0x68, 0x46, 0xa4, 0xa2, 0xab, 0x9c, + 0x42, 0x43, 0x6c, 0x5d, 0x81, 0x85, 0xba, 0xc6, 0x5b, 0x63, 0xb4, 0x5b, + 0xbe, 0x6e, 0x62, 0x6f, 0xa9, 0x67, 0xc4, 0x62, 0x5a, 0x8e, 0x51, 0x86, + 0x6d, 0x79, 0x9e, 0xc7, 0xc6, 0x3e, 0x6d, 0xaa, 0x9f, 0x55, 0xc8, 0xd3, + 0x4a, 0xcd, 0xc3, 0x3c, 0xa6, 0x32, 0xa4, 0x34, 0x2b, 0x4f, 0x67, 0x4f, + 0xad, 0xbb, 0x4a, 0xab, 0x4f, 0x35, 0x83, 0x5d, 0x4e, 0x58, 0x7b, 0xaa, + 0xb6, 0x8b, 0x71, 0x75, 0xa5, 0x6c, 0xc2, 0xa7, 0x8a, 0x5c, 0x6d, 0xc0, + 0x50, 0xa8, 0x40, 0x77, 0xa9, 0x75, 0x80, 0x3f, 0xd1, 0xb5, 0x74, 0xc2, + 0x46, 0x42, 0x5c, 0x43, 0xa8, 0x72, 0xd1, 0x85, 0xbf, 0xa8, 0x7f, 0x48, + 0xcb, 0x40, 0x87, 0x65, 0x59, 0xc5, 0x8c, 0x29, 0xb7, 0x50, 0x79, 0xaf, + 0x5b, 0x8b, 0x5d, 0x74, 0x5f, 0x63, 0x45, 0x9f, 0x65, 0x5c, 0x70, 0x43, + 0x9e, 0x9b, 0x9c, 0xb7, 0xc4, 0xc7, 0xbe, 0x35, 0x98, 0x53, 0x4a, 0x5f, + 0x9f, 0xab, 0xb6, 0x75, 0x9b, 0x98, 0x3c, 0xb2, 0xae, 0x91, 0xbd, 0xba, + 0x47, 0xae, 0x81, 0x87, 0x7c, 0x88, 0x43, 0x3d, 0x3c, 0xd5, 0x84, 0x32, + 0xbe, 0x96, 0x38, 0xcf, 0x4d, 0x81, 0xa5, 0xa7, 0x3b, 0x62, 0x3e, 0x3c, + 0x6b, 0x37, 0x8c, 0x4e, 0x49, 0x5d, 0x79, 0x37, 0xc6, 0x7b, 0x88, 0x9d, + 0xb5, 0x7f, 0x6b, 0x55, 0x87, 0xd4, 0xae, 0x33, 0x67, 0x82, 0x65, 0x60, + 0x34, 0xa1, 0xce, 0x76, 0x4c, 0x72, 0x73, 0x92, 0xc3, 0xbb, 0x9a, 0x34, + 0xb0, 0xb3, 0x58, 0x5c, 0xcb, 0x47, 0x90, 0x53, 0x79, 0x91, 0xac, 0x9a, + 0x5e, 0xaa, 0x6e, 0xc0, 0x41, 0x35, 0x44, 0x68, 0x4f, 0x8e, 0x47, 0xa4, + 0x5f, 0x95, 0xcd, 0xa7, 0x7e, 0x70, 0x5b, 0x48, 0x6d, 0xa2, 0x7b, 0x31, + 0x6c, 0x6e, 0xcb, 0x8e, 0xcd, 0x6f, 0x98, 0x5a, 0xa1, 0x60, 0x6d, 0x82, + 0x50, 0x9b, 0xb1, 0xd6, 0xca, 0x4d, 0xa6, 0x5a, 0xad, 0xc8, 0x53, 0xb3, + 0x4a, 0xaa, 0x61, 0xc0, 0xa5, 0x48, 0x6d, 0x6e, 0xc7, 0x5f, 0x7b, 0x57, + 0x7f, 0xa5, 0xc2, 0x69, 0x52, 0xcb, 0xbb, 0x62, 0x7f, 0x73, 0x45, 0x34, + 0x51, 0x8b, 0x59, 0x50, 0x99, 0xc8, 0xc4, 0x95, 0x8d, 0x3a, 0x9e, 0x7c, + 0x91, 0x68, 0x4f, 0x43, 0x86, 0x8c, 0x61, 0xcd, 0x67, 0x6e, 0x6f, 0x65, + 0x39, 0x9c, 0x66, 0x48, 0x52, 0x65, 0xc9, 0x93, 0xaf, 0x89, 0x44, 0x54, + 0x77, 0x67, 0x49, 0x33, 0x65, 0x6a, 0xa4, 0x48, 0x4d, 0x75, 0x8b, 0x71, + 0x8d, 0x6b, 0x9a, 0x7d, 0xbd, 0xa9, 0xc7, 0x59, 0x93, 0x55, 0xb3, 0x79, + 0x87, 0xb6, 0x69, 0xb8, 0xb8, 0xc4, 0x83, 0x45, 0x8d, 0x37, 0x60, 0xc2, + 0xca, 0x66, 0xa6, 0x85, 0x43, 0x9c, 0x72, 0xbd, 0x60, 0xa1, 0xbd, 0x9f, + 0x4a, 0x80, 0xad, 0x93, 0x6c, 0x56, 0x86, 0x9b, 0xb3, 0x59, 0x24, 0xcc, + 0x3c, 0x90, 0x96, 0x69, 0xb4, 0xac, 0xb7, 0x5e, 0xc7, 0x43, 0x71, 0x75, + 0xca, 0xbd, 0x57, 0x5c, 0xc0, 0x6d, 0xb9, 0x5c, 0xd1, 0x66, 0xc9, 0xb6, + 0xa2, 0x5a, 0x55, 0x50, 0xaa, 0x7a, 0xa4, 0xa8, 0xce, 0x41, 0x8e, 0x88, + 0xd2, 0xa6, 0x93, 0xb4, 0xc2, 0x90, 0xc0, 0xc5, 0x9e, 0xbc, 0x5c, 0x8e, + 
0xa3, 0x92, 0x3d, 0xd0, 0x52, 0xbc, 0xb3, 0xa8, 0xb8, 0xd6, 0x95, 0x53, + 0x84, 0x45, 0x5f, 0xa4, 0x9a, 0x94, 0x4c, 0x66, 0xae, 0x7e, 0x84, 0xa7, + 0xa7, 0x69, 0x8f, 0x8c, 0x70, 0x62, 0x44, 0x5b, 0xc9, 0xa3, 0x75, 0xad, + 0xcc, 0xc6, 0x66, 0x67, 0xb3, 0xb5, 0x41, 0x82, 0xa8, 0x84, 0xa5, 0x50, + 0xaa, 0x54, 0x4c, 0x4f, 0xd1, 0x4b, 0x4e, 0x53, 0xd3, 0x7d, 0x34, 0x9d, + 0xa7, 0x87, 0x5e, 0xa6, 0x69, 0x61, 0x4d, 0x82, 0xa9, 0x56, 0x5b, 0xa0, + 0x6b, 0x5b, 0xc5, 0xb0, 0x86, 0xa8, 0x60, 0x97, 0x3c, 0x37, 0x62, 0x63, + 0x9c, 0x3d, 0x91, 0xb2, 0x49, 0x4f, 0x41, 0xb3, 0x87, 0x70, 0x67, 0x4a, + 0x93, 0x63, 0x5d, 0x62, 0x34, 0x93, 0xa8, 0xa6, 0x7e, 0xc2, 0xca, 0x6e, + 0x52, 0xa5, 0xbf, 0x6b, 0x74, 0x85, 0x71, 0xbc, 0x7c, 0x30, 0x48, 0x9e, + 0xb4, 0x3b, 0x99, 0x97, 0x8f, 0x7f, 0xaa, 0xc1, 0x81, 0xbc, 0x6c, 0xcb, + 0x96, 0x6a, 0x5e, 0x3f, 0x61, 0x32, 0xa8, 0xbb, 0x8a, 0x80, 0x60, 0xc0, + 0x84, 0xcf, 0x81, 0x52, 0xb4, 0x53, 0x39, 0xbe, 0xd4, 0xb5, 0x6b, 0x48, + 0x6c, 0xa3, 0x98, 0xb2, 0x8c, 0x4e, 0x55, 0x68, 0xa2, 0x65, 0xab, 0x6a, + 0x96, 0x3d, 0x77, 0xc0, 0xb2, 0xbd, 0x66, 0x91, 0x59, 0x77, 0x7a, 0xc1, + 0x8a, 0x3e, 0xa2, 0xbe, 0xd4, 0x35, 0x80, 0x5b, 0xa4, 0x61, 0x90, 0x91, + 0x4b, 0x5b, 0xb1, 0x8b, 0x7a, 0x4a, 0x7c, 0x9f, 0x70, 0x95, 0x4f, 0xbb, + 0x52, 0x4c, 0x3d, 0x6b, 0x4e, 0xcb, 0x7f, 0x6f, 0x96, 0x72, 0x8b, 0x5a, + 0xb4, 0xc4, 0xad, 0x95, 0x2f, 0xb9, 0x3b, 0x37, 0x67, 0x86, 0x33, 0xd0, + 0x7a, 0xc9, 0x50, 0x7a, 0x31, 0xb4, 0x4d, 0x87, 0x2d, 0x69, 0xbe, 0x65, + 0x35, 0xa6, 0x71, 0x26, 0xbe, 0x9b, 0x7e, 0xa8, 0x4c, 0x6e, 0x48, 0x75, + 0x36, 0x63, 0x56, 0x9a, 0x4e, 0x98, 0x3b, 0x6b, 0xca, 0x5e, 0x54, 0x59, + 0x7f, 0x4c, 0x6e, 0xcf, 0x31, 0x58, 0x37, 0xa8, 0xc0, 0x77, 0xae, 0x51, + 0x70, 0xd1, 0x4e, 0x7e, 0x38, 0x7e, 0x36, 0x8f, 0xce, 0xc8, 0x51, 0xa4, + 0x84, 0x9a, 0xcc, 0x9e, 0x9a, 0xc3, 0x36, 0x52, 0x7c, 0x5a, 0x45, 0x90, + 0x99, 0xa3, 0x7d, 0x3c, 0x3b, 0xc6, 0xc8, 0x68, 0x91, 0x63, 0x90, 0xb1, + 0xd2, 0x75, 0xa9, 0xaf, 0xb7, 0x99, 0x3c, 0x63, 0x9d, 0xc8, 0x43, 0x57, + 0x4e, 0xd0, 0xb6, 0x76, 0x65, 0xca, 0x9f, 0x9c, 0xa3, 0xc3, 0x9b, 0xb3, + 0xa1, 0x72, 0xae, 0x91, 0x97, 0x41, 0x88, 0x8f, 0x97, 0x4e, 0xa5, 0x6c, + 0xce, 0xd8, 0x72, 0x65, 0xc8, 0xb0, 0x4b, 0x5c, 0xc3, 0x66, 0xce, 0xb3, + 0xb7, 0x59, 0xb0, 0xad, 0x56, 0xa0, 0x2f, 0xc2, 0xcb, 0x78, 0x80, 0x48, + 0x4b, 0x67, 0x3b, 0x4a, 0x84, 0x40, 0xab, 0x70, 0xb7, 0x8e, 0xb6, 0x4c, + 0x80, 0x40, 0xa9, 0x3e, 0x44, 0x4e, 0x9d, 0xb1, 0x33, 0xc6, 0x8b, 0x5c, + 0x81, 0xc7, 0x31, 0x3b, 0x8e, 0x75, 0x9f, 0x37, 0xa2, 0x5c, 0xb7, 0x78, + 0xa5, 0x4f, 0x2c, 0x68, 0x77, 0x3a, 0x40, 0xbc, 0xbe, 0x72, 0x44, 0x67, + 0x96, 0x67, 0xa4, 0x7e, 0xb7, 0x8a, 0x82, 0xbf, 0x6d, 0x48, 0x55, 0x66, + 0xa1, 0xb6, 0xbc, 0x7e, 0x7b, 0xab, 0x37, 0x9e, 0xcd, 0x4c, 0x90, 0xbe, + 0xa8, 0x9f, 0xa7, 0x5b, 0x68, 0x9f, 0x33, 0x74, 0xba, 0xb6, 0x86, 0x4a, + 0xcf, 0xba, 0xc9, 0x6f, 0xa6, 0x86, 0x68, 0x9d, 0x3a, 0xb2, 0x3c, 0xce, + 0x3b, 0x98, 0x31, 0x80, 0x88, 0x97, 0x8c, 0xa5, 0x45, 0xbf, 0xb1, 0x6f, + 0x8f, 0x82, 0x4e, 0xc4, 0x75, 0x57, 0x52, 0xcd, 0x6f, 0xd1, 0xa8, 0x8c, + 0x57, 0x86, 0x26, 0x75, 0xca, 0x79, 0x4f, 0x47, 0x6b, 0xa5, 0x6a, 0x66, + 0x9c, 0xcc, 0x3e, 0xbe, 0x49, 0x80, 0x3a, 0x77, 0xad, 0xbc, 0x34, 0x57, + 0x53, 0x32, 0x7c, 0xb3, 0x76, 0xcb, 0x7e, 0xc6, 0x8a, 0xc2, 0x9b, 0xc8, + 0xab, 0xa4, 0x31, 0x9a, 0x39, 0x59, 0xc9, 0x77, 0x32, 0x8c, 0x97, 0x8c, + 0x25, 0x33, 0x9c, 0xa4, 0xaa, 0x8a, 0xc6, 0x8b, 0x81, 0x8b, 0x56, 0x89, + 0x9b, 0xbe, 0x84, 0x5f, 0x34, 0x4c, 0x9b, 0x3b, 0x88, 0x89, 0x6d, 0x37, + 0x96, 0x59, 0x57, 0xc9, 0x60, 0xa7, 0x39, 0x93, 0x49, 0xaf, 0x9f, 0x5c, + 
0x6b, 0xa5, 0x99, 0x91, 0x55, 0x3d, 0x87, 0x57, 0x4c, 0xd1, 0x36, 0x7c, + 0x84, 0x58, 0xb6, 0xa4, 0x57, 0xd0, 0xa2, 0xcd, 0x87, 0x9a, 0x35, 0x38, + 0x81, 0x98, 0x61, 0x6f, 0x92, 0xd3, 0x85, 0xb2, 0x5a, 0x5b, 0x57, 0xbe, + 0x6a, 0x99, 0xb6, 0x70, 0xb9, 0x6b, 0x75, 0x5d, 0x84, 0x62, 0x99, 0x91, + 0x90, 0x41, 0x75, 0x56, 0x88, 0x4d, 0xbd, 0xc5, 0x36, 0xcc, 0xb7, 0x35, + 0x45, 0xb1, 0xdc, 0xbe, 0x8d, 0x95, 0xcc, 0x32, 0x4b, 0x94, 0x92, 0xc0, + 0x2c, 0x6e, 0xbb, 0x6b, 0x69, 0x4d, 0xa2, 0xca, 0xc7, 0xc1, 0x30, 0x66, + 0x57, 0x2c, 0x97, 0x6a, 0x34, 0xc1, 0x9d, 0xc6, 0x95, 0x87, 0x8d, 0x63, + 0x5f, 0x4d, 0x3d, 0xc0, 0x34, 0x81, 0xc5, 0x3c, 0x52, 0x98, 0x4a, 0x9f, + 0x82, 0x4b, 0x83, 0xb9, 0x8c, 0xb0, 0x57, 0x90, 0xb9, 0xb1, 0x8d, 0x65, + 0x4d, 0x60, 0x60, 0x35, 0x4e, 0x82, 0x8e, 0x8e, 0x63, 0x75, 0x83, 0xbe, + 0xae, 0x37, 0x7f, 0x4c, 0xa5, 0x4c, 0x75, 0x52, 0x81, 0x33, 0xa6, 0x3d, + 0x3a, 0x37, 0x57, 0x80, 0x50, 0x56, 0x76, 0xc1, 0x7d, 0xa9, 0x7a, 0x88, + 0x4c, 0x82, 0x77, 0x83, 0x68, 0xcb, 0xa3, 0x8c, 0xa4, 0x5b, 0x6b, 0x5e, + 0x81, 0x6c, 0x5d, 0x6d, 0x46, 0x98, 0x3a, 0x55, 0xd2, 0x6f, 0x90, 0xac, + 0x5a, 0x97, 0x88, 0x6e, 0x60, 0x6c, 0x5f, 0x64, 0x67, 0x98, 0x5a, 0x51, + 0x82, 0xb5, 0x7a, 0xa3, 0xbd, 0xc1, 0x49, 0xd1, 0x9f, 0x9f, 0xb4, 0x46, + 0x7b, 0x55, 0x8b, 0x52, 0xb1, 0x54, 0x8f, 0x34, 0x53, 0x3e, 0xb3, 0x4d, + 0xbb, 0xc2, 0xa5, 0xb9, 0x8f, 0x31, 0x3e, 0x54, 0x99, 0x5d, 0x2f, 0x46, + 0x69, 0x61, 0x8c, 0x63, 0xce, 0xaa, 0x69, 0x34, 0x82, 0xa7, 0x7c, 0x77, + 0xcb, 0x76, 0xc5, 0x58, 0x77, 0x47, 0xa2, 0xca, 0x89, 0x4f, 0x6c, 0x58, + 0x8d, 0x84, 0x4c, 0x57, 0x98, 0xa3, 0x61, 0x98, 0x34, 0x75, 0x57, 0x57, + 0x76, 0xbc, 0x7b, 0xa7, 0x96, 0x9d, 0xa4, 0xce, 0x6e, 0x42, 0xb0, 0x64, + 0xc7, 0x65, 0x99, 0xac, 0x46, 0x47, 0x78, 0x78, 0x97, 0x78, 0xc1, 0x6a, + 0x35, 0x7d, 0xbb, 0x98, 0x42, 0x6d, 0x4c, 0x8e, 0x6e, 0x90, 0x38, 0x77, + 0x85, 0x76, 0xb4, 0xc1, 0x9f, 0x66, 0x57, 0x3e, 0x65, 0x8c, 0x8a, 0x89, + 0x6c, 0x86, 0x43, 0x49, 0xb1, 0x2f, 0x95, 0x89, 0x5b, 0x91, 0x90, 0x69, + 0x9e, 0xa2, 0xae, 0xa7, 0x2c, 0x80, 0x87, 0x59, 0x82, 0x66, 0x55, 0x96, + 0x5c, 0x59, 0xa7, 0x93, 0x72, 0x50, 0x7e, 0x4a, 0xc0, 0x44, 0x88, 0x50, + 0x6c, 0xbd, 0xb8, 0xa8, 0xc1, 0x65, 0x7f, 0x56, 0x6e, 0x87, 0xd0, 0x3b, + 0x39, 0x62, 0x7d, 0xc3, 0x67, 0x81, 0x59, 0x9e, 0x7d, 0x42, 0xa8, 0xc4, + 0xa1, 0xa6, 0x3b, 0x94, 0x3a, 0xb2, 0x8a, 0xb7, 0x9b, 0x73, 0xc8, 0x54, + 0x49, 0x96, 0x46, 0xa1, 0x75, 0x7d, 0x93, 0x7e, 0x71, 0xb9, 0xab, 0x7a, + 0xc5, 0xbe, 0xca, 0xb7, 0x49, 0x4e, 0x25, 0x5e, 0x4c, 0x7b, 0x45, 0x8a, + 0xac, 0x58, 0xc7, 0xd6, 0x52, 0x48, 0x74, 0x6e, 0xb1, 0xb9, 0x88, 0x80, + 0x99, 0x53, 0x83, 0x8f, 0x46, 0x9b, 0x55, 0x7e, 0x79, 0x48, 0x86, 0x9d, + 0xbf, 0x92, 0xa3, 0x7d, 0xbb, 0x3b, 0xca, 0x52, 0x50, 0x74, 0x5a, 0x45, + 0x67, 0x8f, 0x7d, 0x84, 0x42, 0xb6, 0xb7, 0x9c, 0xcf, 0xc4, 0x41, 0x2d, + 0x3c, 0x72, 0x45, 0xc1, 0x40, 0xa1, 0x64, 0x8b, 0xaf, 0x36, 0xa5, 0x3b, + 0x3e, 0x47, 0xd4, 0xb0, 0xbb, 0x4f, 0x45, 0x55, 0x8b, 0xb8, 0x5d, 0x8b, + 0x50, 0x6d, 0x43, 0x34, 0x4a, 0xcb, 0x86, 0x5d, 0x56, 0x55, 0x80, 0x7d, + 0x9f, 0xa3, 0xa4, 0x3b, 0x70, 0x80, 0x68, 0x3a, 0x68, 0x56, 0x57, 0xab, + 0xa0, 0xa8, 0xba, 0x93, 0xc8, 0x3f, 0xc7, 0xa8, 0x94, 0x5f, 0x41, 0x41, + 0x71, 0x67, 0x6e, 0xac, 0x7a, 0x79, 0x5a, 0x7e, 0x82, 0x9a, 0x9a, 0xa1, + 0xc6, 0x3a, 0xcc, 0x7e, 0x5b, 0x6a, 0x63, 0xaa, 0x75, 0x88, 0x61, 0x86, + 0xba, 0x36, 0x68, 0x84, 0xd4, 0x35, 0xa4, 0x68, 0xa7, 0x51, 0x8a, 0x4d, + 0xd7, 0xa2, 0x48, 0xa6, 0x2e, 0xbe, 0x86, 0x43, 0x6d, 0x59, 0xb5, 0x35, + 0x73, 0x6f, 0x34, 0x49, 0x7b, 0x60, 0xc6, 0x9a, 0x63, 0x95, 0x66, 0x70, + 
0x91, 0x32, 0x2d, 0x55, 0xd9, 0xc6, 0xd4, 0x52, 0x4d, 0xab, 0x64, 0xc6, + 0x37, 0x4b, 0xaf, 0x41, 0x56, 0xaa, 0x65, 0x57, 0x36, 0x61, 0x9c, 0xa8, + 0xb1, 0x56, 0xb7, 0x3d, 0x9e, 0x3d, 0x59, 0xc9, 0x96, 0xdb, 0x53, 0xb4, + 0x5c, 0xaf, 0xa6, 0x58, 0x54, 0x7e, 0x64, 0x81, 0xbc, 0xcd, 0xaf, 0x5b, + 0xc9, 0x8b, 0xaf, 0x52, 0xb8, 0x3c, 0x68, 0xb9, 0x9b, 0x4f, 0xba, 0xb2, + 0x3a, 0x90, 0xbe, 0x6e, 0x5b, 0x33, 0x55, 0x62, 0x33, 0xa2, 0x49, 0x7b, + 0x42, 0xd5, 0xa4, 0x8b, 0x7a, 0x7d, 0xa5, 0x91, 0xb2, 0x31, 0x2f, 0xd7, + 0x70, 0xbb, 0x93, 0xa9, 0xa1, 0x85, 0x57, 0xb9, 0x6e, 0xcd, 0x5a, 0xcd, + 0x86, 0x97, 0x64, 0xc7, 0xa7, 0xbe, 0x7c, 0x8a, 0xc2, 0x99, 0x4e, 0x64, + 0x75, 0xd7, 0x8d, 0x75, 0x52, 0x79, 0x99, 0xa1, 0x9e, 0xa4, 0x65, 0xb1, + 0x9c, 0xa7, 0x6d, 0xb2, 0x64, 0x4b, 0xbd, 0x41, 0xd4, 0x54, 0x77, 0x4e, + 0x74, 0x8a, 0x95, 0x73, 0xc1, 0x72, 0xb4, 0xcc, 0x58, 0x47, 0x51, 0x74, + 0x45, 0xbc, 0x88, 0xc1, 0xcd, 0x6b, 0x42, 0x49, 0x5e, 0x76, 0xb2, 0x4f, + 0xb3, 0x71, 0x8f, 0x9a, 0xb4, 0x61, 0x61, 0x6c, 0xc4, 0xa2, 0x2a, 0x58, + 0x2e, 0xca, 0x7d, 0x7e, 0xcd, 0xb8, 0x5d, 0x51, 0xac, 0x54, 0x6b, 0x9c, + 0xac, 0x68, 0x7d, 0x7c, 0xb5, 0x35, 0x64, 0x7c, 0x5b, 0xba, 0x9f, 0x5f, + 0x9c, 0x52, 0x3f, 0x99, 0xa2, 0xaf, 0x90, 0xb1, 0xcc, 0x32, 0x61, 0x8d, + 0xb7, 0x84, 0x8a, 0x3e, 0xcb, 0xd8, 0x61, 0x83, 0xc8, 0x61, 0x3f, 0xbd, + 0x92, 0x95, 0xc6, 0xc8, 0x4f, 0xb1, 0x63, 0x60, 0xa9, 0x5e, 0x48, 0xcb, + 0x98, 0xc9, 0xcc, 0x66, 0xa9, 0xaf, 0x5c, 0x89, 0x91, 0xc3, 0x4d, 0x7e, + 0x62, 0x82, 0x73, 0x47, 0xd4, 0x6b, 0x7b, 0x30, 0x9c, 0xb1, 0x79, 0x34, + 0x42, 0x60, 0x36, 0xce, 0xc3, 0x35, 0x7a, 0x55, 0xaf, 0x58, 0x81, 0x31, + 0xb8, 0x52, 0x94, 0x3c, 0x85, 0x92, 0x84, 0x37, 0x5c, 0x5e, 0xa4, 0xa6, + 0x3c, 0x5d, 0xa7, 0x7f, 0x48, 0x38, 0xb5, 0xab, 0x65, 0xa0, 0xc1, 0xd2, + 0x4b, 0x67, 0xc7, 0x2d, 0xd7, 0x73, 0xa8, 0x94, 0x86, 0x8b, 0xa9, 0x51, + 0x7d, 0x7c, 0x45, 0x63, 0x5f, 0x76, 0x9d, 0x7b, 0xd0, 0x8f, 0x5a, 0x3d, + 0x6f, 0x74, 0x97, 0x4a, 0xc6, 0x57, 0xcf, 0xbb, 0x8f, 0x61, 0x3a, 0xb8, + 0x73, 0xb1, 0x58, 0x6d, 0xa2, 0xa9, 0x4a, 0x3a, 0x6a, 0xaa, 0x84, 0xac, + 0x50, 0x93, 0xc5, 0xc5, 0x84, 0xc6, 0x3b, 0xc5, 0x41, 0x5c, 0x57, 0x72, + 0x57, 0x53, 0x60, 0x66, 0xbe, 0x59, 0x4b, 0xad, 0xbe, 0xa2, 0x88, 0x3b, + 0x3f, 0x3b, 0x93, 0xab, 0xc5, 0x70, 0x83, 0x8a, 0x89, 0xbd, 0x34, 0xb7, + 0x62, 0xa2, 0x75, 0x98, 0x9e, 0x98, 0x66, 0x92, 0xb7, 0x5b, 0x35, 0x89, + 0xc0, 0x35, 0xbe, 0x85, 0x83, 0x52, 0xb1, 0x3b, 0x52, 0x57, 0x2a, 0x36, + 0xc2, 0x76, 0x81, 0x8d, 0x84, 0x6b, 0xc3, 0x6b, 0x60, 0x94, 0x50, 0x7b, + 0x3b, 0x49, 0x57, 0x70, 0xa9, 0xa7, 0x54, 0x56, 0x44, 0x7b, 0x8c, 0x41, + 0xc3, 0x69, 0x8a, 0x8d, 0xc1, 0xde, 0x54, 0x92, 0x56, 0x2e, 0xb4, 0x98, + 0xc3, 0xcf, 0x68, 0x63, 0x9b, 0x43, 0x3f, 0x87, 0x98, 0xd4, 0xae, 0x72, + 0x4e, 0x47, 0x85, 0xc0, 0x7b, 0xb4, 0x99, 0xd3, 0xca, 0x47, 0x5b, 0x94, + 0xc4, 0xca, 0x5a, 0x9a, 0x46, 0x54, 0x70, 0x7c, 0x40, 0x3d, 0x3a, 0x56, + 0x85, 0x91, 0xd5, 0x69, 0xd3, 0x37, 0x60, 0xa1, 0x35, 0x50, 0x96, 0xce, + 0xad, 0xa1, 0xcb, 0x85, 0x59, 0xb9, 0xc8, 0x85, 0x99, 0xb3, 0x3f, 0x63, + 0x9f, 0x36, 0x57, 0xd4, 0xc7, 0xc1, 0x50, 0x7f, 0xce, 0x9b, 0xb7, 0x95, + 0x70, 0xb1, 0xa2, 0x34, 0x3b, 0x3d, 0x6f, 0x4b, 0x90, 0xca, 0x60, 0x9f, + 0x83, 0x6d, 0x72, 0xc2, 0x6e, 0x4b, 0x52, 0x26, 0x45, 0xb5, 0xb3, 0x6e, + 0x89, 0x8a, 0xc2, 0x6c, 0x5b, 0x6e, 0xb9, 0x80, 0x52, 0x99, 0x33, 0x5c, + 0x52, 0x79, 0x39, 0x9b, 0x81, 0x35, 0xd4, 0x43, 0x8b, 0x5f, 0x44, 0xcf, + 0x68, 0xa6, 0xaa, 0x79, 0x43, 0x4c, 0xc7, 0x9f, 0x8c, 0xb0, 0x78, 0x4f, + 0x7a, 0x55, 0x96, 0xce, 0x8d, 0xc6, 0x9b, 0x4a, 0xd6, 0x8d, 0x94, 0x4c, + 
0xba, 0xc5, 0xb0, 0x96, 0x51, 0x4c, 0x99, 0xc7, 0x8e, 0x4a, 0x6f, 0x3c, + 0x83, 0x4d, 0xd3, 0x7b, 0x3e, 0x44, 0xd2, 0xc0, 0x7d, 0xa7, 0x53, 0x55, + 0x86, 0x68, 0x7d, 0x33, 0x99, 0xbb, 0x4a, 0x8a, 0xae, 0x3d, 0x44, 0x59, + 0x9c, 0x55, 0x48, 0x8c, 0x91, 0x73, 0xa8, 0x40, 0x92, 0x80, 0xc9, 0xbe, + 0xa7, 0xbd, 0x42, 0xc4, 0x3b, 0x42, 0xbe, 0xb4, 0x54, 0x5f, 0x7d, 0xc2, + 0x86, 0xa9, 0x7e, 0xcc, 0x62, 0x99, 0x7f, 0x9a, 0x88, 0x42, 0xca, 0xb9, + 0x67, 0x97, 0x5c, 0x6d, 0xca, 0xa4, 0x8f, 0x8e, 0x86, 0xd5, 0x62, 0x61, + 0x96, 0x58, 0x67, 0x8b, 0xb1, 0x31, 0xc1, 0x60, 0xc4, 0x7a, 0x3b, 0xcf, + 0xca, 0x32, 0x64, 0xb4, 0xbc, 0xca, 0xc8, 0xca, 0x48, 0x95, 0x90, 0xc6, + 0x5a, 0xc8, 0xa6, 0x5c, 0x5e, 0xad, 0xa6, 0xb8, 0x75, 0x77, 0x3c, 0x45, + 0x6a, 0xa7, 0x6a, 0x83, 0x6c, 0x9a, 0x4d, 0x93, 0x61, 0xb1, 0x66, 0x61, + 0x5b, 0x8d, 0x90, 0xc2, 0x7a, 0x61, 0x3f, 0x3c, 0xb1, 0xad, 0x96, 0x88, + 0x50, 0x92, 0x39, 0xb5, 0x3a, 0x7e, 0x84, 0x98, 0x74, 0xc7, 0xa1, 0xd6, + 0xc6, 0x37, 0xcc, 0xae, 0xa6, 0xae, 0x37, 0x82, 0x3b, 0xb9, 0xbb, 0xb0, + 0xcc, 0xd0, 0x66, 0xb2, 0x9b, 0x82, 0x72, 0x80, 0x8a, 0x6b, 0xb9, 0x9f, + 0x3e, 0x9c, 0x4a, 0x5d, 0xa6, 0xde, 0x6f, 0x9a, 0x6b, 0x36, 0x57, 0x67, + 0xac, 0x4e, 0x6a, 0x5e, 0x57, 0xbc, 0xc6, 0xd3, 0xa6, 0x86, 0x77, 0x39, + 0x56, 0xdb, 0x84, 0x35, 0xb1, 0xb6, 0x91, 0x2c, 0xb7, 0x9b, 0xaa, 0xab, + 0x3e, 0xb2, 0x97, 0x50, 0x99, 0x82, 0xa1, 0x3a, 0xc0, 0xbb, 0x6c, 0xb5, + 0x3c, 0xbe, 0x4a, 0x3d, 0x43, 0x33, 0x47, 0x5a, 0x82, 0xaf, 0x90, 0x97, + 0x88, 0x71, 0xd0, 0xcc, 0x81, 0x9d, 0x87, 0x5e, 0x5b, 0x4d, 0x85, 0xc8, + 0xb6, 0xcc, 0x46, 0x93, 0x56, 0xd6, 0x80, 0x5c, 0x6f, 0x90, 0xb6, 0xaf, + 0x49, 0x64, 0xa2, 0xa0, 0x84, 0xba, 0x9c, 0x3f, 0x32, 0xaa, 0x81, 0x59, + 0x80, 0x5b, 0xac, 0x4b, 0xa0, 0xd5, 0x4f, 0xb9, 0x40, 0x60, 0xa0, 0x92, + 0x56, 0x89, 0x71, 0x92, 0x49, 0xb1, 0xa2, 0xbe, 0xd0, 0x93, 0x6f, 0x3c, + 0xc6, 0x3a, 0x84, 0xc9, 0x93, 0xa4, 0xbc, 0x74, 0x40, 0xa6, 0x64, 0x91, + 0x7f, 0x44, 0x94, 0x5a, 0x4e, 0x39, 0x58, 0x8f, 0x3c, 0x4c, 0x39, 0xab, + 0x74, 0x9a, 0xb2, 0xac, 0x44, 0x5a, 0x9a, 0x49, 0x5b, 0x81, 0x4c, 0x3c, + 0x4a, 0x78, 0x78, 0x4b, 0xac, 0x7c, 0x81, 0x44, 0xac, 0x49, 0x7f, 0x81, + 0x3f, 0x3d, 0x68, 0xd0, 0x3f, 0x37, 0x2f, 0x89, 0x62, 0xbe, 0xc4, 0x81, + 0x44, 0xac, 0x36, 0x3a, 0xa8, 0x57, 0xb2, 0x87, 0xb8, 0x64, 0x7e, 0x68, + 0x69, 0x4b, 0xb3, 0x8f, 0xc5, 0x59, 0x66, 0x5e, 0x9f, 0x7b, 0x88, 0xc2, + 0xac, 0xa2, 0x83, 0x78, 0x3a, 0x88, 0x7d, 0x74, 0x46, 0xc7, 0x3d, 0x3c, + 0x5e, 0x46, 0x4f, 0x3d, 0x49, 0xb3, 0xaa, 0x3f, 0xc3, 0x54, 0x8d, 0x7b, + 0x8e, 0x91, 0x64, 0xd2, 0x64, 0xc1, 0x57, 0x4e, 0xb7, 0x9a, 0x61, 0x61, + 0x3f, 0x97, 0x98, 0x47, 0xc5, 0xb2, 0x99, 0xbe, 0x3e, 0x8e, 0x33, 0xba, + 0x4a, 0x4d, 0xd4, 0xad, 0x87, 0xb8, 0xbd, 0x3b, 0x9b, 0x49, 0x89, 0x81, + 0x5f, 0x63, 0x36, 0x4f, 0x6c, 0xc8, 0xaa, 0xa4, 0x4b, 0x79, 0xba, 0xb4, + 0x91, 0x8f, 0xc3, 0x54, 0x9f, 0x6a, 0x56, 0xab, 0xc1, 0x90, 0x48, 0xa6, + 0x59, 0x77, 0xad, 0x39, 0xc6, 0xbe, 0x54, 0xc4, 0xbc, 0xbe, 0xd4, 0xd1, + 0x72, 0x71, 0x48, 0x38, 0x72, 0x3c, 0xca, 0x2c, 0xc2, 0x6a, 0xc9, 0x61, + 0x58, 0x83, 0x42, 0x8f, 0x9a, 0x50, 0x62, 0xc9, 0xb5, 0xc1, 0xd5, 0x94, + 0xa1, 0x48, 0x78, 0x80, 0xc9, 0x97, 0x98, 0xbe, 0x69, 0xc7, 0x7c, 0x46, + 0x71, 0xbb, 0xb2, 0xd0, 0xa6, 0x9b, 0x9a, 0x58, 0xbe, 0xc9, 0x57, 0xbe, + 0x3d, 0x8b, 0x7c, 0x79, 0x9c, 0xa2, 0xce, 0x57, 0x65, 0x5d, 0x50, 0x4a, + 0x48, 0x97, 0xd1, 0xca, 0x3b, 0xc7, 0x75, 0xbc, 0x84, 0x73, 0x84, 0xbc, + 0x8c, 0x4a, 0xa4, 0x61, 0xb5, 0xb8, 0x4b, 0x4f, 0xc5, 0x57, 0xb5, 0x4b, + 0x37, 0x94, 0xc6, 0x63, 0xb4, 0x3f, 0xb9, 0x3e, 0x3e, 0x70, 0x66, 0x5d, + 
0x72, 0x82, 0x45, 0x80, 0xb3, 0x44, 0x87, 0x73, 0xcc, 0xb0, 0x85, 0x80, + 0x75, 0x7e, 0xcb, 0xc9, 0xb9, 0x45, 0x55, 0xa2, 0x6e, 0x89, 0xd7, 0xa5, + 0xca, 0xb5, 0xb0, 0x61, 0xc5, 0xc6, 0x67, 0x67, 0x63, 0x4d, 0x49, 0x42, + 0xce, 0x36, 0xcf, 0x75, 0x68, 0x9f, 0x7d, 0xb0, 0x66, 0xa1, 0x72, 0x2d, + 0x9d, 0x4a, 0xce, 0xc9, 0x8f, 0x8c, 0xb2, 0x78, 0xc9, 0x65, 0xca, 0x8f, + 0xca, 0x71, 0x4d, 0x65, 0x94, 0x96, 0x6c, 0x5c, 0xbf, 0xad, 0x7f, 0x7d, + 0x50, 0xbb, 0x6a, 0x7a, 0x36, 0xb5, 0x44, 0xc8, 0x9c, 0x4e, 0xc7, 0x46, + 0x77, 0xbe, 0x90, 0xaa, 0x7d, 0x99, 0x50, 0x5f, 0xa7, 0xa5, 0x8b, 0xa4, + 0xac, 0xb2, 0x2f, 0x7c, 0xb9, 0xb3, 0x5f, 0x70, 0xca, 0x9a, 0xd0, 0x5d, + 0x80, 0x31, 0xcc, 0xb1, 0x8f, 0x53, 0xb5, 0x75, 0xbb, 0xbd, 0x86, 0x9d, + 0x5e, 0x9b, 0x42, 0x48, 0x7f, 0x37, 0x5f, 0x92, 0x95, 0xbc, 0xc1, 0x7f, + 0x43, 0xb5, 0xbf, 0xa9, 0xaf, 0x6d, 0xb9, 0x6d, 0xa8, 0xb4, 0x31, 0x59, + 0x31, 0xa5, 0x97, 0xb2, 0x55, 0xaf, 0x3b, 0x8d, 0x39, 0x4f, 0xbb, 0xa5, + 0x42, 0xb5, 0x60, 0xc8, 0xbf, 0xd1, 0xa8, 0xb1, 0xb3, 0xa8, 0x79, 0x43, + 0x99, 0x38, 0x7a, 0x59, 0x8a, 0xaf, 0x9d, 0x88, 0x8c, 0x92, 0xb5, 0xc2, + 0x40, 0xbb, 0x72, 0xd4, 0xc3, 0x7f, 0x41, 0xaa, 0x56, 0x78, 0x48, 0xb9, + 0x6e, 0xcf, 0x3a, 0xae, 0x7e, 0xbe, 0x32, 0x52, 0x53, 0xd1, 0x3a, 0xa6, + 0x3d, 0xcf, 0x59, 0x46, 0x9d, 0x67, 0x4f, 0x4e, 0x3c, 0xc9, 0xae, 0x92, + 0x96, 0xb6, 0x89, 0x6c, 0x4d, 0xd6, 0x46, 0x3a, 0x77, 0x60, 0x3f, 0xa7, + 0x81, 0xc9, 0x66, 0x9c, 0x47, 0xbe, 0x3e, 0xc3, 0x86, 0x4a, 0x86, 0x84, + 0x74, 0x2d, 0xae, 0x6f, 0x63, 0xc0, 0x40, 0x55, 0x9d, 0x69, 0x33, 0x8b, + 0xcc, 0x53, 0xb5, 0x8d, 0x67, 0x8b, 0x2f, 0x4a, 0x66, 0x97, 0x37, 0x94, + 0x57, 0xcc, 0x81, 0xa3, 0x52, 0x44, 0x4f, 0xdb, 0x3d, 0xa4, 0x94, 0x78, + 0x43, 0xd8, 0xc7, 0x85, 0x89, 0x5a, 0xc9, 0xb5, 0x80, 0x71, 0xba, 0xab, + 0xcd, 0xa6, 0x52, 0x7a, 0xc9, 0xab, 0xa4, 0x86, 0x3a, 0xaf, 0x69, 0x4c, + 0xc2, 0x46, 0x6c, 0x88, 0xc4, 0x44, 0xa8, 0x65, 0x64, 0x7c, 0xba, 0x5e, + 0x97, 0xca, 0x7e, 0xad, 0x9b, 0xc6, 0x4d, 0x8d, 0x54, 0x68, 0xc2, 0xa0, + 0x3e, 0x57, 0x5d, 0xa3, 0xa5, 0xbd, 0x3d, 0xb9, 0x3c, 0x9a, 0x46, 0xa6, + 0xac, 0xb1, 0x98, 0x9c, 0xb5, 0x8c, 0x6c, 0xab, 0xbd, 0xa1, 0xb4, 0xbc, + 0xbc, 0x25, 0x95, 0x3b, 0x89, 0x75, 0xd1, 0x8c, 0x29, 0x4f, 0xad, 0xa7, + 0x51, 0x38, 0x3d, 0xa3, 0xc1, 0xaa, 0x84, 0x4d, 0x6d, 0x5c, 0x40, 0x91, + 0xcb, 0x58, 0x29, 0xcc, 0x47, 0x60, 0x59, 0xc9, 0xc3, 0x3c, 0xa4, 0x77, + 0x63, 0x53, 0xbc, 0xb3, 0x42, 0x5c, 0x5c, 0x44, 0x4b, 0xd0, 0x73, 0xaa, + 0x6e, 0x6f, 0x39, 0xcb, 0x8e, 0x9f, 0xa1, 0x4b, 0xa3, 0xb1, 0xbb, 0xb6, + 0x5e, 0x8f, 0x84, 0x66, 0x62, 0x4f, 0x45, 0x3e, 0x61, 0x9a, 0x77, 0x41, + 0x83, 0x82, 0x5f, 0x4d, 0xa0, 0x7b, 0x35, 0x8e, 0x49, 0xda, 0x87, 0xb9, + 0x78, 0xcc, 0x35, 0x99, 0xad, 0xbf, 0x99, 0x46, 0xb3, 0xce, 0x45, 0x7c, + 0x9f, 0x59, 0xc0, 0x98, 0x30, 0x7a, 0x65, 0x83, 0x5a, 0xa7, 0x78, 0xa4, + 0xaf, 0x25, 0x7e, 0xa7, 0x58, 0x4e, 0x94, 0xab, 0x46, 0x65, 0x9c, 0x5f, + 0xc4, 0x9e, 0x42, 0xad, 0xa5, 0x49, 0x4e, 0x84, 0xc6, 0xca, 0x44, 0xa0, + 0x9a, 0x37, 0x86, 0xac, 0x2f, 0xb1, 0x44, 0x41, 0xa1, 0x35, 0x6f, 0x78, + 0x8b, 0x89, 0x4a, 0x6b, 0x81, 0x43, 0xb8, 0xd3, 0x9e, 0xc3, 0x90, 0x8a, + 0xb1, 0x87, 0x7b, 0xa1, 0x85, 0x35, 0x4b, 0xbc, 0x88, 0x3a, 0x66, 0x59, + 0x51, 0x94, 0xbe, 0x95, 0x9e, 0xcb, 0x4e, 0x98, 0x9f, 0x36, 0x38, 0x61, + 0x8a, 0xa2, 0x84, 0x3d, 0x8a, 0xbd, 0x8e, 0x81, 0x7d, 0xc7, 0x37, 0x3b, + 0x74, 0x94, 0xcc, 0x34, 0x9e, 0x7a, 0x80, 0x60, 0x8e, 0x58, 0x52, 0x7b, + 0x4c, 0x46, 0x3b, 0x8c, 0x30, 0x72, 0x88, 0x35, 0x8a, 0x4c, 0xa1, 0x7b, + 0xb1, 0x94, 0xa6, 0xc6, 0x70, 0x70, 0xac, 0x6e, 0x74, 0x67, 0x7c, 0xb6, + 
0x87, 0x8a, 0xb5, 0xab, 0x7c, 0xac, 0x94, 0x60, 0x7b, 0xc7, 0x35, 0x2e, + 0x3e, 0xc2, 0xc5, 0x71, 0x43, 0xcf, 0x93, 0xc4, 0x91, 0x43, 0x74, 0x46, + 0xcf, 0x76, 0x5f, 0x62, 0x53, 0x33, 0x3d, 0x98, 0x48, 0xad, 0xb4, 0x8c, + 0xba, 0x87, 0x24, 0x88, 0xaf, 0x32, 0xae, 0x4d, 0xb0, 0x6e, 0x97, 0xc8, + 0x45, 0x6d, 0x82, 0x87, 0x6b, 0x5f, 0x47, 0x94, 0x99, 0xa0, 0x4f, 0xbc, + 0xa8, 0xcf, 0x90, 0x6e, 0xac, 0x6b, 0x36, 0x4e, 0xb0, 0x4c, 0x42, 0x96, + 0xaf, 0x9e, 0xbe, 0x70, 0x56, 0xd1, 0x71, 0x48, 0x5b, 0x92, 0x9e, 0x41, + 0xc7, 0x34, 0x61, 0x61, 0x81, 0x75, 0x43, 0x47, 0x68, 0x29, 0x70, 0x4f, + 0x6b, 0x81, 0xc6, 0x82, 0x46, 0x97, 0x62, 0xb8, 0xa2, 0xcf, 0x62, 0xa5, + 0xba, 0x3a, 0xd2, 0x82, 0x4b, 0xd1, 0x5d, 0x8e, 0x5f, 0xb9, 0xc7, 0x33, + 0x62, 0x74, 0xd2, 0x52, 0xa2, 0x37, 0xcb, 0xc2, 0xa5, 0x83, 0x55, 0x6f, + 0xad, 0x7b, 0x45, 0x64, 0x55, 0x36, 0xa6, 0x5d, 0x84, 0xc5, 0x8d, 0x73, + 0xb2, 0x31, 0xa3, 0x61, 0x93, 0x28, 0xc0, 0x73, 0x8e, 0xa4, 0xa5, 0x53, + 0xa4, 0xc3, 0xb4, 0x62, 0x97, 0x99, 0xce, 0x89, 0x7d, 0x74, 0xbc, 0x6a, + 0x46, 0xc0, 0x61, 0xc3, 0x71, 0x9f, 0x71, 0x5c, 0xae, 0x7d, 0xce, 0x95, + 0x89, 0x64, 0xd0, 0x37, 0x5e, 0xc7, 0xa0, 0xae, 0x81, 0xad, 0x7b, 0x45, + 0xa8, 0x30, 0x40, 0x57, 0x38, 0xb5, 0xb7, 0xbc, 0xcf, 0xc3, 0x5a, 0x8e, + 0xd2, 0x59, 0x57, 0xb6, 0x9c, 0x5a, 0x97, 0xa1, 0x9d, 0xa2, 0xca, 0x9c, + 0x93, 0x70, 0x88, 0xcb, 0x48, 0x58, 0x51, 0x88, 0x38, 0x86, 0xb8, 0x63, + 0xad, 0x81, 0xc0, 0xa1, 0x3e, 0x2f, 0x79, 0x7d, 0x7b, 0x83, 0x49, 0x85, + 0x7a, 0x36, 0x5b, 0x8d, 0x6d, 0xbd, 0x8a, 0x3a, 0xd2, 0x99, 0x91, 0x79, + 0xa3, 0x7d, 0xce, 0x48, 0xd4, 0x73, 0xb0, 0x35, 0x43, 0x74, 0x3d, 0xce, + 0xc2, 0x9c, 0x30, 0xaa, 0x8c, 0xba, 0xcb, 0x39, 0xaf, 0x41, 0x55, 0xbc, + 0x4b, 0x7e, 0x8b, 0x90, 0x8c, 0xb9, 0x4f, 0x62, 0x37, 0x7c, 0xb1, 0x55, + 0x33, 0xb8, 0x62, 0x55, 0x61, 0x7b, 0x4d, 0x5e, 0x30, 0x63, 0x4e, 0x49, + 0x84, 0x91, 0xd2, 0x5a, 0x3b, 0xc7, 0x6b, 0x25, 0xb5, 0xa1, 0x54, 0xbc, + 0xa9, 0x6b, 0x61, 0x9f, 0xbd, 0x8d, 0xc1, 0xca, 0x39, 0x3e, 0xc0, 0x46, + 0x3c, 0x2b, 0x94, 0x35, 0xb0, 0x2a, 0x79, 0x59, 0x41, 0xc6, 0xce, 0xa9, + 0x9a, 0x6b, 0x39, 0x76, 0x48, 0xa3, 0x51, 0xcd, 0x9c, 0x4a, 0x78, 0xbc, + 0xbd, 0xaf, 0xa1, 0x71, 0xcf, 0xc7, 0x2f, 0x78, 0x82, 0xc9, 0x93, 0x75, + 0xc9, 0xa7, 0x42, 0x5e, 0xb0, 0x60, 0x6d, 0x62, 0x61, 0xaf, 0xa9, 0x2e, + 0x80, 0xd6, 0x47, 0x4a, 0xb0, 0x86, 0xa4, 0x64, 0x8b, 0x73, 0x8e, 0x65, + 0x4e, 0x9b, 0xce, 0x4e, 0xc7, 0x74, 0x4c, 0x88, 0x7d, 0x77, 0x71, 0x83, + 0x98, 0xb1, 0x4c, 0x62, 0x3f, 0x76, 0x8b, 0x4d, 0x8e, 0x53, 0x57, 0x3d, + 0x53, 0xca, 0x75, 0x98, 0x4a, 0x6e, 0x89, 0xa4, 0x62, 0xac, 0x3f, 0xa7, + 0x61, 0x41, 0xd4, 0xc8, 0x77, 0x32, 0x76, 0x51, 0x78, 0x83, 0x7f, 0x46, + 0xa9, 0xa7, 0x8a, 0x37, 0xbf, 0xa7, 0x50, 0x4e, 0x55, 0x7b, 0xc3, 0x46, + 0x8c, 0xc3, 0x8a, 0xbf, 0xbd, 0x90, 0x69, 0x95, 0x32, 0x73, 0x6d, 0x9e, + 0x9b, 0x54, 0xc5, 0x37, 0x7f, 0xad, 0x2f, 0x3f, 0x49, 0xbf, 0xb1, 0x5b, + 0x7a, 0xa4, 0x4a, 0x49, 0x5d, 0xa3, 0x9b, 0xaa, 0x56, 0x5c, 0x51, 0x81, + 0x99, 0xa7, 0x3c, 0xbf, 0x9e, 0x73, 0xba, 0xb8, 0x3a, 0xc9, 0xa3, 0xa9, + 0xc0, 0x7c, 0x72, 0xb7, 0xa3, 0x66, 0xa3, 0xa4, 0xdf, 0x48, 0x47, 0x5b, + 0x97, 0x6b, 0x6b, 0xd0, 0xad, 0x96, 0x4e, 0x78, 0x70, 0x8a, 0x6f, 0x4d, + 0x59, 0x82, 0x5b, 0x31, 0x68, 0x4a, 0x89, 0x3e, 0xcc, 0xbb, 0x8b, 0x87, + 0x78, 0xa3, 0x66, 0x70, 0xd1, 0x5b, 0x99, 0x6c, 0xcc, 0x6d, 0x5b, 0x92, + 0x5d, 0x40, 0xca, 0x55, 0x48, 0x82, 0x5f, 0x87, 0xa1, 0xd6, 0xc1, 0x45, + 0x7d, 0x5d, 0x85, 0xaf, 0x7b, 0x3f, 0x66, 0x7f, 0x76, 0xaf, 0x43, 0xa7, + 0x99, 0x49, 0x49, 0x4c, 0x62, 0xa3, 0xb5, 0x9c, 0x92, 0x51, 0x38, 0x3e, + 
0x3b, 0xb2, 0x67, 0x8a, 0xcb, 0x44, 0x5c, 0xb7, 0xa4, 0x50, 0xca, 0x82, + 0x6a, 0x95, 0x67, 0x9a, 0x7a, 0xc5, 0x90, 0x4e, 0x8d, 0x69, 0xad, 0xc7, + 0x3b, 0x78, 0xb2, 0x48, 0x40, 0xca, 0x73, 0x74, 0x6b, 0xc0, 0xb9, 0x85, + 0x85, 0xa1, 0x67, 0x4d, 0x77, 0x34, 0x49, 0xac, 0xa5, 0xb0, 0xc8, 0x93, + 0xc4, 0xd9, 0x50, 0x46, 0xbb, 0x2e, 0xd0, 0xb0, 0xc5, 0xbb, 0xa7, 0x59, + 0x64, 0x4f, 0x8e, 0x86, 0x63, 0x63, 0xa6, 0x96, 0x63, 0x70, 0xc1, 0x72, + 0x86, 0xc1, 0x3a, 0xab, 0xae, 0xc8, 0x95, 0x48, 0xae, 0x87, 0x6d, 0x54, + 0x52, 0x6b, 0x49, 0xae, 0x15, 0x69, 0xaf, 0x32, 0xbd, 0x59, 0xbf, 0xbf, + 0x36, 0x97, 0xbb, 0x51, 0x6b, 0xce, 0x4a, 0x38, 0x37, 0x51, 0xa6, 0x7f, + 0x94, 0x5e, 0x61, 0x84, 0xd8, 0x61, 0xcd, 0x6a, 0x65, 0xd4, 0x75, 0x82, + 0x81, 0x97, 0x30, 0xad, 0x83, 0x57, 0xa6, 0x5d, 0xb3, 0x44, 0xa5, 0x75, + 0xa2, 0x4c, 0x6f, 0x6c, 0x41, 0x7c, 0x56, 0xba, 0x5d, 0x8c, 0x4e, 0x51, + 0xb3, 0x8a, 0x94, 0xcf, 0xb8, 0x5f, 0xd4, 0x99, 0x5f, 0xad, 0xc3, 0xb4, + 0xc1, 0xd5, 0x3b, 0x9a, 0xd1, 0x64, 0x52, 0x44, 0x8f, 0x66, 0xa1, 0x79, + 0x2a, 0x5a, 0x49, 0x8e, 0x3d, 0xca, 0x77, 0x30, 0xa8, 0x45, 0x97, 0x72, + 0x87, 0x53, 0xcb, 0x4c, 0x7e, 0x9d, 0x45, 0x7a, 0xbe, 0x7d, 0x7e, 0xb5, + 0xa0, 0xaf, 0x46, 0x9b, 0x4f, 0x65, 0x3b, 0x34, 0x79, 0x3a, 0x68, 0x62, + 0x96, 0x96, 0x42, 0x31, 0xbb, 0x9e, 0x47, 0x69, 0xcb, 0x47, 0x96, 0xa3, + 0x97, 0x78, 0x7c, 0xcd, 0x86, 0x83, 0xb5, 0x91, 0x4a, 0x78, 0x5c, 0xa1, + 0x37, 0x3b, 0xa8, 0x5b, 0xb3, 0x47, 0x5e, 0xc3, 0x6b, 0x76, 0x41, 0x54, + 0xba, 0x6b, 0x5a, 0xbb, 0xbe, 0x38, 0x69, 0x9f, 0xbf, 0x4f, 0x3e, 0x33, + 0x3f, 0x68, 0x6b, 0x76, 0x81, 0xac, 0xcc, 0x57, 0x4b, 0x3d, 0xd2, 0x38, + 0x34, 0x58, 0x4c, 0xb5, 0x64, 0x4d, 0xd0, 0x53, 0x8d, 0x64, 0x84, 0x74, + 0xb2, 0x44, 0x9c, 0xaf, 0xa7, 0x49, 0x90, 0x68, 0x3d, 0x6f, 0xc3, 0xbd, + 0x75, 0xab, 0xa2, 0x56, 0x86, 0xa5, 0xce, 0xc8, 0x67, 0xa0, 0xca, 0xa1, + 0xa5, 0x94, 0x4e, 0xc2, 0xbc, 0x51, 0x36, 0x8e, 0x94, 0xc7, 0x5d, 0x8d, + 0x41, 0xb2, 0x95, 0x58, 0x6b, 0x43, 0x49, 0x6a, 0xd7, 0x85, 0x9e, 0x71, + 0x5a, 0x56, 0x87, 0x63, 0xae, 0x40, 0xb1, 0xb2, 0x5d, 0x5a, 0xcf, 0x85, + 0x6c, 0x38, 0xac, 0xc2, 0x93, 0x70, 0x32, 0x59, 0x5c, 0x69, 0x83, 0x6d, + 0x40, 0x32, 0x4f, 0x6e, 0x6d, 0x41, 0x93, 0x6d, 0xab, 0x72, 0x7f, 0x31, + 0x67, 0x3f, 0xc0, 0x6e, 0xa4, 0xcb, 0x7e, 0x66, 0xd0, 0x78, 0x4f, 0x67, + 0x8f, 0xb6, 0xab, 0x43, 0x53, 0x8e, 0x5c, 0x97, 0x52, 0xb8, 0x3f, 0xaf, + 0x6c, 0x51, 0xca, 0x41, 0x63, 0xb7, 0x50, 0x30, 0x45, 0x79, 0x3b, 0x7c, + 0x2c, 0x9d, 0x84, 0xa0, 0xa8, 0x86, 0x97, 0xbe, 0xbe, 0x6c, 0xa9, 0xa5, + 0xd6, 0xcb, 0x55, 0x4b, 0xcb, 0x6c, 0x75, 0x45, 0x99, 0x98, 0xaf, 0xa6, + 0x67, 0x9d, 0x5a, 0xaa, 0x85, 0x75, 0x73, 0x7f, 0x97, 0x93, 0x64, 0x4c, + 0x9e, 0x55, 0x4a, 0x43, 0x36, 0x79, 0xd4, 0x3b, 0x36, 0xc2, 0xa4, 0x43, + 0x83, 0x36, 0x8f, 0xa3, 0x4b, 0xd1, 0xa5, 0xd2, 0xb9, 0xc3, 0xb4, 0x90, + 0xc3, 0x9c, 0x3c, 0x9f, 0xba, 0x3f, 0x5d, 0x5b, 0x77, 0xa0, 0x5b, 0x76, + 0x52, 0x7f, 0x74, 0xbc, 0xd2, 0x99, 0xba, 0x3d, 0x9d, 0x7a, 0x76, 0xc9, + 0xa7, 0x71, 0x34, 0xab, 0x44, 0xc1, 0xaa, 0x77, 0xb6, 0x9c, 0xa6, 0x98, + 0xb5, 0x2b, 0xc3, 0xad, 0x5f, 0xbf, 0x39, 0x94, 0x74, 0x61, 0x51, 0x35, + 0xaa, 0x87, 0x68, 0x6b, 0x75, 0xa1, 0xa5, 0xb5, 0x98, 0x8f, 0xc0, 0x70, + 0x7a, 0x50, 0xc0, 0x5d, 0x66, 0x72, 0x52, 0x96, 0xb2, 0x8d, 0xb7, 0x44, + 0xb2, 0xbf, 0xc5, 0x3c, 0x7b, 0x4a, 0xcf, 0xbe, 0x8b, 0xc3, 0x91, 0x50, + 0x43, 0xbd, 0x45, 0x77, 0xb3, 0x8e, 0x3f, 0xc1, 0xbb, 0x3d, 0x8c, 0x4a, + 0x2a, 0x92, 0xad, 0x72, 0x3d, 0xc1, 0x9d, 0x5a, 0x61, 0x98, 0x8b, 0xa8, + 0x4f, 0x7a, 0x83, 0x8e, 0x9f, 0x3e, 0xd2, 0x90, 0x51, 0xa1, 0xc4, 0x8f, + 
0xb7, 0x9f, 0x44, 0xb4, 0xa9, 0x7b, 0x72, 0xd1, 0x9b, 0x5f, 0x44, 0x8b, + 0x37, 0x80, 0x36, 0x49, 0x3f, 0x5a, 0x61, 0x98, 0x98, 0x99, 0x42, 0x36, + 0x3f, 0x60, 0x4b, 0x67, 0xd0, 0xa8, 0x56, 0x99, 0xb2, 0x52, 0x87, 0xa5, + 0x3f, 0xc2, 0x6e, 0x45, 0x77, 0x86, 0x5f, 0x4d, 0x36, 0x7a, 0x58, 0xd2, + 0x34, 0xc7, 0x66, 0x7d, 0x56, 0xad, 0x74, 0xbc, 0xac, 0x3f, 0xbd, 0xc9, + 0x3b, 0xa9, 0x83, 0xb4, 0x52, 0x36, 0x76, 0x67, 0xc0, 0x6f, 0x48, 0x5a, + 0x4b, 0xc2, 0xc4, 0x65, 0xaf, 0x7a, 0x45, 0x6d, 0x67, 0xcd, 0x78, 0xc2, + 0x6a, 0x76, 0x98, 0x64, 0x80, 0xd1, 0x3f, 0x36, 0x86, 0x45, 0x8e, 0xb9, + 0x47, 0x4f, 0xc8, 0x55, 0xb8, 0x6c, 0xa3, 0xaa, 0xa7, 0x55, 0x6e, 0x88, + 0xc6, 0xc9, 0x5e, 0xd5, 0x7b, 0x8d, 0xb8, 0xc3, 0xc5, 0x6c, 0x39, 0x88, + 0x4c, 0x3e, 0x3a, 0x72, 0x85, 0xc2, 0x94, 0xba, 0x78, 0xac, 0xcc, 0xa1, + 0x96, 0x4f, 0xa7, 0x59, 0xc4, 0x53, 0x51, 0xbd, 0x64, 0x94, 0x9d, 0x35, + 0x98, 0xbc, 0xaa, 0xbe, 0x87, 0x2a, 0x59, 0xd2, 0x70, 0x9e, 0x6f, 0x4e, + 0x62, 0x53, 0x56, 0xb9, 0x6e, 0x57, 0xab, 0x9f, 0xa2, 0x43, 0x4e, 0x43, + 0x82, 0x78, 0xa4, 0x5b, 0x34, 0x45, 0xa8, 0x79, 0xd5, 0xb0, 0x5c, 0x43, + 0x98, 0x37, 0x4b, 0x99, 0x59, 0x9b, 0x98, 0xb9, 0x70, 0x87, 0x92, 0x90, + 0x8f, 0x38, 0x60, 0xaa, 0x58, 0x2f, 0x9b, 0x3a, 0xc7, 0xb4, 0x82, 0x6f, + 0x69, 0x3c, 0x83, 0x4e, 0xb3, 0x33, 0xb9, 0x5a, 0x6d, 0xaa, 0xb9, 0xab, + 0x5b, 0x90, 0x6f, 0x54, 0x4a, 0xa1, 0xbd, 0x7e, 0xd2, 0xbb, 0x95, 0xa1, + 0x40, 0x4f, 0x6d, 0x61, 0xad, 0x48, 0xa3, 0x7a, 0x8a, 0x93, 0x47, 0xb0, + 0xa2, 0x90, 0xad, 0xb3, 0x52, 0x72, 0x77, 0xb4, 0x8d, 0xc7, 0x50, 0xc2, + 0x3f, 0x77, 0x68, 0x8d, 0x6a, 0x3b, 0xc2, 0xcf, 0xb8, 0x43, 0x48, 0xd4, + 0x39, 0xb6, 0x73, 0x6f, 0x86, 0xae, 0xb4, 0x44, 0x63, 0x96, 0xa6, 0x58, + 0x36, 0xa0, 0x43, 0x7a, 0x4d, 0x9a, 0x2e, 0x5e, 0x5e, 0x6f, 0x2e, 0xb6, + 0x36, 0x4a, 0x79, 0x74, 0xad, 0x6a, 0x5d, 0x3c, 0x42, 0xc3, 0x69, 0xc6, + 0x8b, 0x7d, 0x58, 0x3d, 0x7c, 0x5e, 0x4a, 0xd4, 0xac, 0x92, 0x82, 0x6a, + 0x40, 0xa6, 0x7e, 0xb7, 0x75, 0xbc, 0x8e, 0x97, 0x42, 0x67, 0xcb, 0xad, + 0xa7, 0xd3, 0x3c, 0x6c, 0xa4, 0xaa, 0x75, 0x99, 0x6b, 0xcc, 0x33, 0x75, + 0xb7, 0x47, 0x9e, 0x7a, 0x49, 0x61, 0x51, 0xac, 0x81, 0x3b, 0x70, 0x3f, + 0x3d, 0x8b, 0xc2, 0xb0, 0x61, 0xbe, 0xc7, 0x5a, 0x49, 0x67, 0x8b, 0x4a, + 0x87, 0x81, 0x69, 0xbe, 0x73, 0xc2, 0xb0, 0xaf, 0xcd, 0x54, 0x5a, 0x87, + 0x6f, 0x38, 0x93, 0x69, 0x62, 0x52, 0xa5, 0x5a, 0x59, 0x98, 0xb6, 0x41, + 0xbc, 0x69, 0xcf, 0x35, 0xc5, 0x84, 0x65, 0xc9, 0xc7, 0xa0, 0x89, 0xd0, + 0x3c, 0xa3, 0xc1, 0xcf, 0xa1, 0xb6, 0xb0, 0xc0, 0x8b, 0x94, 0x5b, 0xcf, + 0x90, 0x8a, 0x55, 0x51, 0xc3, 0xa0, 0x96, 0x98, 0x61, 0xc5, 0x75, 0x92, + 0xd1, 0x5c, 0xc0, 0xcd, 0x72, 0x89, 0xd1, 0xa3, 0x56, 0xab, 0xca, 0xcf, + 0x43, 0x6f, 0x46, 0x44, 0xae, 0x40, 0xcd, 0x64, 0x6b, 0xb8, 0xaa, 0x4e, + 0x4f, 0x39, 0x3b, 0x5d, 0xc6, 0x3d, 0x65, 0x68, 0x40, 0x3c, 0xc3, 0x8a, + 0xb6, 0x84, 0x41, 0x34, 0x7d, 0x51, 0x39, 0x9c, 0xc1, 0x86, 0x45, 0x7a, + 0x50, 0x5a, 0x94, 0x36, 0x9c, 0xc5, 0x94, 0x47, 0xb2, 0xb0, 0x53, 0xc3, + 0x95, 0x5f, 0x3f, 0x71, 0x92, 0x43, 0x66, 0x34, 0x70, 0x74, 0xbc, 0x52, + 0x50, 0xc7, 0x39, 0x96, 0x68, 0x59, 0x98, 0x54, 0x86, 0xa3, 0xa1, 0x85, + 0xa4, 0x91, 0xa3, 0x42, 0x33, 0x78, 0x63, 0x52, 0xc6, 0xae, 0xad, 0x3d, + 0x86, 0x38, 0x30, 0xc1, 0x51, 0x61, 0x4d, 0x7f, 0xca, 0x88, 0x51, 0xca, + 0xcc, 0xb9, 0x77, 0x45, 0x5d, 0x78, 0x56, 0x48, 0x3c, 0x8c, 0x57, 0x50, + 0x4d, 0x67, 0x8e, 0x34, 0x39, 0x2b, 0x31, 0x99, 0xa2, 0x6d, 0xae, 0x75, + 0x9b, 0x8b, 0x9f, 0xa7, 0xc2, 0x55, 0x3e, 0xd2, 0xcc, 0xb4, 0x89, 0x6c, + 0x43, 0xbf, 0x62, 0x9f, 0x66, 0x4c, 0xaf, 0xb9, 0xc0, 0x5b, 0x93, 0x46, + 
0xbe, 0x5e, 0xa8, 0x82, 0xb2, 0x56, 0x86, 0x8d, 0x3e, 0x7f, 0xb0, 0xa8, + 0x4d, 0x9d, 0x77, 0x4d, 0x5a, 0x50, 0xcd, 0x6d, 0x3c, 0xa2, 0xd6, 0x77, + 0x68, 0xbd, 0xcb, 0x32, 0x47, 0x3f, 0x8c, 0x4e, 0xa7, 0x81, 0x99, 0x55, + 0xb0, 0x77, 0x39, 0xc4, 0x92, 0xcb, 0xb6, 0x33, 0x82, 0x40, 0x91, 0xbb, + 0xa2, 0xb3, 0x64, 0x7d, 0x42, 0x9c, 0xbd, 0x83, 0x65, 0xaf, 0x60, 0xab, + 0xc3, 0xaf, 0xcd, 0x56, 0x81, 0xba, 0x49, 0xb6, 0xb5, 0xbd, 0x8a, 0xc4, + 0x69, 0xaf, 0x8e, 0x65, 0x5e, 0x6e, 0x5e, 0x6a, 0x3b, 0x47, 0xd0, 0x83, + 0x68, 0xcb, 0x97, 0xb1, 0x72, 0x5a, 0x50, 0xaf, 0x60, 0x63, 0x58, 0xb2, + 0x88, 0x9e, 0x34, 0xa8, 0x5d, 0x71, 0xaa, 0x71, 0xa7, 0x3e, 0xad, 0x9d, + 0xc9, 0x41, 0x78, 0x67, 0x82, 0x8a, 0x5e, 0xae, 0x70, 0x5b, 0xcb, 0x35, + 0xa1, 0x7e, 0x71, 0xc4, 0x96, 0x84, 0x47, 0x3d, 0xaa, 0x60, 0x3e, 0xc9, + 0xc5, 0xa1, 0xa0, 0x7b, 0xb0, 0x9d, 0xac, 0x6d, 0x9a, 0xd2, 0x65, 0x6e, + 0x3f, 0xbf, 0x6b, 0x3a, 0x42, 0x44, 0x8e, 0xc9, 0x7b, 0xaf, 0x3d, 0xca, + 0x5c, 0x49, 0x78, 0x8e, 0x41, 0x98, 0x38, 0x3f, 0x3e, 0xc1, 0x59, 0x57, + 0xb1, 0x45, 0x88, 0x5a, 0x35, 0x83, 0xd3, 0x34, 0x8a, 0xc4, 0xab, 0x6f, + 0x67, 0xc6, 0xad, 0x86, 0x5b, 0x9a, 0xad, 0x3e, 0x8d, 0xa4, 0x75, 0xba, + 0x52, 0xac, 0xa1, 0x88, 0x7a, 0xca, 0xca, 0x95, 0x4d, 0xd1, 0x2e, 0xbc, + 0x36, 0xaa, 0x59, 0x78, 0x94, 0x4a, 0x79, 0xb7, 0xd0, 0x30, 0x75, 0x91, + 0x36, 0x54, 0x47, 0x5e, 0x7d, 0xce, 0x6a, 0xc2, 0x4e, 0x8e, 0x99, 0xb2, + 0x6a, 0x87, 0xbf, 0x74, 0x79, 0x58, 0xab, 0x6b, 0xbf, 0x87, 0xad, 0x41, + 0x39, 0x9c, 0x44, 0x87, 0xaf, 0x60, 0x5a, 0x58, 0xce, 0x88, 0x75, 0xc4, + 0xb1, 0x57, 0x6e, 0xc8, 0x30, 0x4d, 0xcd, 0x86, 0xb2, 0x36, 0x7e, 0xad, + 0x31, 0x3d, 0xa4, 0x62, 0x7f, 0x38, 0x79, 0x43, 0x30, 0xca, 0x9a, 0xac, + 0x76, 0xb2, 0xad, 0xd5, 0xb6, 0xaa, 0x7e, 0x65, 0xb3, 0x83, 0x45, 0x4c, + 0xaf, 0x2e, 0xba, 0x6f, 0xcb, 0x59, 0x79, 0x2d, 0x42, 0x3d, 0x4e, 0x4a, + 0x89, 0x3a, 0x4c, 0x8a, 0xa5, 0xc4, 0x71, 0xae, 0x78, 0x81, 0x57, 0x5f, + 0x88, 0xb4, 0xae, 0x6d, 0x70, 0x66, 0x46, 0xb1, 0x88, 0xd9, 0x72, 0xbc, + 0x4c, 0xa6, 0x89, 0xc5, 0x45, 0x50, 0x6c, 0x30, 0xb5, 0x48, 0x88, 0xcf, + 0x96, 0x35, 0xbb, 0x7f, 0x52, 0x76, 0x71, 0x65, 0x48, 0xad, 0x5f, 0x33, + 0x94, 0x32, 0x7d, 0xc2, 0xd4, 0x54, 0x6e, 0x72, 0xc5, 0x8e, 0x42, 0xcf, + 0xc7, 0x58, 0x6c, 0xa0, 0x39, 0x9a, 0x8a, 0xac, 0xaa, 0x35, 0x81, 0x81, + 0xb2, 0x93, 0x58, 0x5b, 0xc2, 0x56, 0x31, 0xa6, 0x57, 0xc5, 0x39, 0xdc, + 0xa6, 0xac, 0x83, 0x58, 0x7a, 0x39, 0x3d, 0xb7, 0xab, 0xaa, 0x42, 0x3e, + 0xcd, 0x4a, 0xa0, 0x6f, 0x9f, 0x70, 0x78, 0xb0, 0xba, 0xa7, 0x38, 0x33, + 0x4e, 0x82, 0x43, 0x7b, 0xbd, 0x9d, 0x3c, 0x63, 0x3b, 0xaa, 0x57, 0x57, + 0x75, 0xbb, 0x7d, 0x43, 0x72, 0x53, 0x31, 0x73, 0x76, 0x71, 0x77, 0xad, + 0xa7, 0x72, 0x83, 0xc4, 0x3f, 0x61, 0x8e, 0x75, 0x6c, 0x57, 0x92, 0x79, + 0xba, 0xbe, 0x32, 0x3c, 0x66, 0x76, 0xd2, 0x75, 0x42, 0x6c, 0xa8, 0x94, + 0x32, 0xa6, 0x3d, 0x6a, 0x8b, 0x77, 0x71, 0x6a, 0x7f, 0x74, 0x4e, 0x46, + 0x41, 0x5f, 0xcc, 0xcd, 0xb6, 0x57, 0x77, 0xad, 0x8f, 0x44, 0xbf, 0x40, + 0x6e, 0x5d, 0x4f, 0xc4, 0x62, 0x69, 0x4d, 0x71, 0xc0, 0xa3, 0xa7, 0x61, + 0x5d, 0xa1, 0x3e, 0x91, 0xa5, 0x76, 0xd3, 0xa2, 0xb5, 0x44, 0x59, 0x40, + 0x80, 0xb7, 0x48, 0x56, 0x77, 0xb3, 0x47, 0x39, 0xae, 0xb9, 0x9b, 0xbb, + 0xb4, 0x32, 0x8c, 0xaa, 0x5d, 0xb5, 0x70, 0x66, 0x55, 0x45, 0x53, 0x63, + 0x57, 0x84, 0x78, 0x72, 0x98, 0xc5, 0x5b, 0xab, 0x80, 0x7f, 0xca, 0xc2, + 0x71, 0x38, 0x79, 0x5e, 0x3d, 0xa3, 0xab, 0x82, 0xb4, 0x5d, 0x95, 0x9f, + 0xa0, 0x46, 0x4d, 0xcd, 0x46, 0xc6, 0x87, 0x99, 0x90, 0x73, 0x75, 0x8c, + 0x77, 0x36, 0x9c, 0x49, 0xb1, 0x8b, 0x2f, 0x64, 0x33, 0xc8, 0xaa, 0x3a, + 
0xb7, 0x84, 0x5e, 0xc5, 0xa8, 0x78, 0x7c, 0x84, 0xc9, 0xb8, 0xa3, 0x3c, + 0x50, 0x56, 0x66, 0xa3, 0xb9, 0x4c, 0x5c, 0x80, 0x93, 0x8d, 0x42, 0x6d, + 0xd3, 0xcc, 0x3a, 0x6a, 0x60, 0x30, 0x3f, 0x97, 0x44, 0x95, 0x7b, 0x4d, + 0xa2, 0x71, 0x8a, 0xbe, 0x89, 0x47, 0x3f, 0x79, 0x95, 0x42, 0x3d, 0x6b, + 0x8a, 0x72, 0x9c, 0x3b, 0xce, 0x9e, 0x6f, 0xaf, 0xce, 0x41, 0xa0, 0xbf, + 0x84, 0x87, 0xca, 0x30, 0x74, 0xc8, 0x53, 0x9f, 0x4e, 0x81, 0x45, 0xa2, + 0x97, 0x8f, 0x62, 0xa0, 0x96, 0xbc, 0x53, 0x55, 0xad, 0xc3, 0x96, 0xa6, + 0x90, 0x55, 0x5f, 0x79, 0x9c, 0x54, 0xce, 0x5e, 0x6c, 0xc8, 0x71, 0x4a, + 0x54, 0xb7, 0x71, 0x36, 0xa8, 0x31, 0x82, 0x99, 0xa1, 0x95, 0x9a, 0x66, + 0x3a, 0x48, 0x45, 0xb9, 0xa2, 0x64, 0xc9, 0x5a, 0xb1, 0x31, 0x90, 0xcc, + 0x5c, 0xc2, 0x87, 0x50, 0x71, 0x3b, 0x78, 0x74, 0xce, 0x44, 0x7b, 0xb1, + 0x4d, 0xc9, 0xc4, 0x63, 0x51, 0x95, 0xcc, 0x89, 0xbd, 0xb0, 0x98, 0x7e, + 0x40, 0x86, 0x3e, 0x44, 0x38, 0x45, 0x42, 0xaa, 0xa4, 0x69, 0x90, 0xb0, + 0x7a, 0x9b, 0xc6, 0xbb, 0xa4, 0xcf, 0x67, 0x88, 0xcd, 0x36, 0x36, 0x5c, + 0x51, 0x65, 0x5d, 0xd2, 0x86, 0x89, 0x6c, 0x65, 0x7e, 0x41, 0xaa, 0x78, + 0x94, 0x8c, 0x47, 0x35, 0x44, 0xa3, 0xc6, 0x5b, 0x5b, 0x86, 0x51, 0xab, + 0x83, 0xd0, 0x60, 0x8b, 0x89, 0x74, 0xa5, 0xbf, 0x37, 0x72, 0x7f, 0x79, + 0x89, 0x4b, 0xb8, 0x58, 0x42, 0x53, 0x7f, 0xb8, 0x69, 0x5c, 0x92, 0x3e, + 0x87, 0x74, 0xd1, 0xa0, 0x5c, 0x7b, 0x89, 0x57, 0x4d, 0x76, 0xa6, 0x5f, + 0x85, 0x95, 0x65, 0xcc, 0x57, 0x88, 0x85, 0x32, 0x59, 0x7d, 0x8d, 0x99, + 0xd1, 0x60, 0x58, 0x48, 0xad, 0xb3, 0x98, 0xc3, 0xb1, 0xb9, 0x5e, 0x74, + 0x91, 0x61, 0x7c, 0xa3, 0xcd, 0xcb, 0x54, 0xa5, 0x65, 0x8a, 0x43, 0xbc, + 0x43, 0x77, 0xa3, 0x45, 0xb8, 0x56, 0x4f, 0x67, 0x95, 0x6f, 0xb8, 0xd2, + 0xac, 0x39, 0x37, 0x3b, 0xc6, 0xca, 0x99, 0x6f, 0x76, 0xbf, 0x5a, 0xc9, + 0x3f, 0xd0, 0x95, 0x8a, 0x36, 0xa3, 0x7b, 0x96, 0x56, 0x73, 0x56, 0xbe, + 0xc0, 0x71, 0xaf, 0x35, 0x62, 0x47, 0x90, 0xb3, 0x3e, 0x42, 0x4b, 0x32, + 0x97, 0x75, 0x31, 0x40, 0xae, 0xad, 0xd1, 0x96, 0xbb, 0xa1, 0x54, 0x69, + 0xc3, 0xae, 0xbf, 0x74, 0xb7, 0xcb, 0x82, 0x9c, 0x6d, 0xba, 0xbd, 0x70, + 0xc0, 0xc7, 0x4e, 0x3b, 0x3c, 0x77, 0x98, 0x3e, 0xb1, 0xa0, 0x9f, 0x38, + 0x6a, 0xcd, 0x79, 0xa1, 0x6c, 0xbe, 0x4b, 0x40, 0x8f, 0x9a, 0x82, 0xa3, + 0xab, 0x68, 0x5a, 0xcf, 0xa5, 0x6c, 0x9e, 0x7b, 0x94, 0x66, 0xb5, 0x6a, + 0x3c, 0x74, 0xa5, 0xd0, 0x6e, 0xba, 0x33, 0xcb, 0xc8, 0xa6, 0x62, 0xbe, + 0xaf, 0x4d, 0x50, 0x36, 0x50, 0x9f, 0xaa, 0x71, 0x86, 0x47, 0x38, 0x95, + 0xb8, 0x7f, 0xa4, 0x95, 0x64, 0x5d, 0x68, 0xcf, 0x54, 0xa8, 0x66, 0xab, + 0xa8, 0x55, 0xc1, 0xb4, 0x7d, 0x7c, 0xc9, 0x5f, 0xbf, 0x8e, 0x65, 0x3e, + 0xa8, 0xaa, 0xcc, 0xcc, 0x95, 0x69, 0x7e, 0x5b, 0xa4, 0xbe, 0xcc, 0xa2, + 0x68, 0x8e, 0x39, 0xba, 0xa5, 0xa5, 0x3a, 0xbc, 0x8a, 0x89, 0xa7, 0x43, + 0xca, 0xd2, 0x92, 0xa2, 0x97, 0x87, 0xd5, 0x8c, 0xa7, 0x9d, 0x9b, 0x51, + 0x56, 0x75, 0x4a, 0x95, 0x53, 0x8d, 0x4a, 0x65, 0xca, 0x78, 0x4e, 0x51, + 0x37, 0x59, 0x68, 0x86, 0x48, 0x4b, 0x71, 0xcf, 0xa9, 0x9a, 0x9f, 0xcb, + 0x7e, 0xb8, 0x43, 0xb8, 0x39, 0x6d, 0xbd, 0x46, 0x8c, 0x3f, 0x37, 0xb4, + 0x6a, 0x55, 0x90, 0x37, 0x41, 0x52, 0x74, 0x33, 0x8e, 0x82, 0x89, 0x85, + 0x9d, 0xbd, 0xb6, 0x81, 0x81, 0x4b, 0x72, 0x66, 0x70, 0x41, 0x84, 0x5e, + 0xce, 0x70, 0xa9, 0xd1, 0x35, 0x8d, 0x74, 0x75, 0x62, 0x40, 0x94, 0xce, + 0x57, 0x73, 0x3b, 0x36, 0x32, 0xc0, 0x40, 0x5d, 0x64, 0x47, 0x33, 0xc2, + 0x8a, 0x6c, 0x59, 0x37, 0x7e, 0xa9, 0xb4, 0x9b, 0x8f, 0x41, 0xb2, 0x39, + 0xbe, 0x4c, 0x86, 0x56, 0x77, 0x42, 0xb8, 0x8d, 0xc9, 0x4c, 0xa7, 0x9a, + 0xcf, 0x53, 0x32, 0x65, 0xa1, 0xc2, 0x72, 0xad, 0x7b, 0x5c, 0x32, 0x42, + 
0x88, 0x43, 0x9a, 0x32, 0xbf, 0x8d, 0xb6, 0x40, 0xc4, 0x5c, 0x68, 0x62, + 0x66, 0xab, 0x7b, 0x8f, 0xbc, 0x5a, 0x5d, 0xc9, 0x9c, 0xbd, 0x9c, 0xcd, + 0x31, 0xab, 0xa3, 0x3a, 0x76, 0xb6, 0x65, 0xc7, 0x94, 0xd3, 0x49, 0xc4, + 0x6a, 0x4e, 0xca, 0x3f, 0x45, 0x73, 0x37, 0x8a, 0x60, 0x3c, 0x4a, 0x58, + 0x95, 0x4d, 0x53, 0xcf, 0xb1, 0xc9, 0x6c, 0x58, 0xca, 0x51, 0x35, 0x94, + 0x36, 0xae, 0x77, 0xa5, 0xd1, 0xb4, 0xb4, 0xc9, 0x4d, 0x3a, 0xb8, 0x72, + 0x41, 0x8e, 0x91, 0xb3, 0x63, 0xad, 0x63, 0x99, 0x86, 0x87, 0x91, 0x40, + 0x34, 0x90, 0x76, 0xc1, 0x91, 0xa4, 0x6e, 0xa5, 0xc9, 0xb9, 0x63, 0x60, + 0x6a, 0xbe, 0x70, 0xa4, 0x8f, 0x3b, 0x36, 0x48, 0x5c, 0x7d, 0x3e, 0xbf, + 0x92, 0x44, 0x9b, 0x88, 0x71, 0x9a, 0x6e, 0x6a, 0x4c, 0x7b, 0x82, 0x77, + 0xd2, 0xcf, 0xc0, 0xb1, 0xbe, 0xa8, 0x8e, 0x7e, 0x49, 0x76, 0xa2, 0x5d, + 0x7d, 0xb8, 0x68, 0x58, 0xbe, 0x69, 0x53, 0x33, 0x71, 0xa7, 0x7f, 0x59, + 0xc1, 0x6e, 0xbe, 0x59, 0x92, 0xd0, 0xc5, 0x5c, 0xb7, 0xb4, 0xb0, 0x90, + 0xa1, 0xad, 0xa4, 0x9d, 0xca, 0xcc, 0x33, 0x96, 0x3e, 0x6b, 0x4e, 0xc0, + 0x81, 0xaa, 0x9a, 0x8f, 0x3a, 0xbc, 0x7f, 0xb0, 0x91, 0x58, 0xcd, 0x77, + 0xa1, 0x9f, 0xc6, 0xba, 0x3d, 0x88, 0x66, 0xbf, 0x86, 0xb6, 0xa8, 0xb7, + 0xb2, 0xb2, 0xa7, 0xc7, 0x7a, 0x32, 0x67, 0x5e, 0x70, 0xa6, 0x48, 0xce, + 0x95, 0x8b, 0x70, 0x3f, 0xb9, 0x72, 0x52, 0xb2, 0xa0, 0x61, 0x6b, 0x40, + 0x72, 0x8a, 0x6a, 0xbf, 0x94, 0x97, 0xbf, 0xc9, 0x8f, 0x78, 0x51, 0x6c, + 0x77, 0x66, 0x62, 0x9c, 0x52, 0xa8, 0xc5, 0xb1, 0x60, 0xad, 0xc7, 0x5f, + 0x96, 0x86, 0xbd, 0xbb, 0x75, 0x3f, 0xa9, 0x2f, 0xa2, 0x4a, 0xc7, 0xb3, + 0xd0, 0xa7, 0x7d, 0xb7, 0x65, 0xb8, 0xcc, 0xb7, 0xc7, 0x6c, 0x93, 0x62, + 0xcf, 0x59, 0xbb, 0xbb, 0x4b, 0x74, 0x47, 0x68, 0xae, 0x67, 0x97, 0x9b, + 0x96, 0x49, 0x56, 0xb2, 0x86, 0xbf, 0xc7, 0x6d, 0xc9, 0x6d, 0x7c, 0xa5, + 0x6e, 0xbc, 0xc6, 0x7c, 0x7a, 0x3f, 0xa2, 0x2c, 0xba, 0xb1, 0xaf, 0x77, + 0x87, 0xb5, 0x49, 0x39, 0xac, 0x53, 0x8a, 0x73, 0x63, 0xcc, 0x89, 0x3a, + 0x51, 0x3c, 0xa9, 0xcd, 0x3a, 0x42, 0xa7, 0x32, 0x63, 0xcd, 0x42, 0xbc, + 0x44, 0x95, 0x81, 0x69, 0x5f, 0xa4, 0x86, 0x86, 0x8f, 0x7a, 0x80, 0x40, + 0xba, 0x5b, 0xae, 0x5e, 0xc5, 0xa7, 0xa4, 0xb2, 0x9d, 0x5e, 0xb9, 0x9b, + 0xc8, 0x53, 0x97, 0x60, 0x6a, 0xa1, 0xbb, 0xbf, 0x3d, 0x77, 0x9d, 0xb5, + 0x79, 0x40, 0x37, 0x62, 0x35, 0x6e, 0xa4, 0xc5, 0xa0, 0x76, 0xca, 0x70, + 0x9f, 0x67, 0x3c, 0x7e, 0xad, 0x34, 0x9a, 0x9e, 0xad, 0x29, 0x4f, 0xbe, + 0x7c, 0x98, 0x42, 0x55, 0x59, 0xb5, 0x50, 0xce, 0x71, 0xcc, 0x6e, 0x6b, + 0x8d, 0x7d, 0xb5, 0x44, 0x6a, 0x93, 0x5e, 0x4f, 0x3b, 0xcc, 0x30, 0x6b, + 0xbb, 0xc5, 0xae, 0xc1, 0xa4, 0xb8, 0x60, 0x49, 0xa0, 0x6b, 0x4a, 0x89, + 0xaa, 0x61, 0x8c, 0x90, 0xa8, 0xac, 0x5f, 0x92, 0x4a, 0x65, 0x69, 0x4b, + 0x4b, 0x45, 0x8b, 0x47, 0xcd, 0xb2, 0x67, 0xb2, 0x36, 0x82, 0x62, 0xb7, + 0x69, 0x91, 0xcb, 0x81, 0xc5, 0x8f, 0x9f, 0xab, 0x3a, 0xa4, 0x65, 0x6e, + 0xa0, 0x61, 0x7e, 0xc0, 0xaa, 0x8a, 0x45, 0xbf, 0x3d, 0xb4, 0x69, 0x48, + 0x36, 0x4f, 0xa5, 0x90, 0xcf, 0xb9, 0x72, 0x99, 0x3b, 0xba, 0x30, 0xb3, + 0xca, 0x8d, 0x54, 0xa8, 0x5f, 0xce, 0x68, 0xcd, 0x8f, 0x5d, 0xaf, 0xc1, + 0x71, 0xc1, 0x6e, 0x66, 0x7f, 0x9f, 0x70, 0x65, 0x38, 0xc7, 0xba, 0x99, + 0x83, 0x4a, 0x66, 0xc4, 0x48, 0xbe, 0xc9, 0x35, 0x7d, 0x2a, 0x57, 0x7b, + 0x52, 0xb0, 0x2f, 0x71, 0x4e, 0xa1, 0x68, 0x67, 0xcc, 0xb7, 0x52, 0x6b, + 0x42, 0x79, 0x5b, 0x5b, 0x43, 0xa2, 0xc6, 0x77, 0x8d, 0x6f, 0x59, 0x31, + 0xd0, 0x3e, 0x51, 0x6c, 0xab, 0x3b, 0xbf, 0xae, 0xce, 0x52, 0x4e, 0x94, + 0x79, 0xb8, 0x54, 0xc9, 0xa7, 0x5c, 0x38, 0xa1, 0x8e, 0x60, 0x9a, 0x58, + 0x93, 0x76, 0x69, 0xb1, 0x68, 0x46, 0x71, 0x44, 0x7a, 0xcc, 0x73, 0xa9, + 
0xbb, 0x8e, 0x33, 0x8b, 0xa6, 0x95, 0x8d, 0x33, 0x4e, 0x72, 0xab, 0x66, + 0xaf, 0x35, 0x8b, 0x5e, 0x8a, 0xac, 0x7b, 0x9d, 0x6a, 0x96, 0x89, 0x90, + 0x96, 0xd2, 0x59, 0x39, 0x41, 0x62, 0xa1, 0x47, 0x91, 0x64, 0x7d, 0xab, + 0xac, 0x76, 0x52, 0x9e, 0x61, 0xb1, 0x60, 0x51, 0x90, 0x8a, 0xab, 0x3d, + 0x95, 0x62, 0xc5, 0xce, 0x87, 0x74, 0xb5, 0xa8, 0xa1, 0x77, 0x38, 0xb1, + 0x41, 0x38, 0xb0, 0x31, 0xa1, 0x7b, 0x85, 0xa1, 0x9a, 0xc6, 0x48, 0x89, + 0x86, 0x60, 0x9a, 0x37, 0x58, 0xbf, 0x60, 0x66, 0x5b, 0xb0, 0x36, 0xcc, + 0xd1, 0x67, 0x49, 0x63, 0xc4, 0x91, 0x4c, 0x82, 0x7e, 0x4a, 0x40, 0xd2, + 0x7e, 0x80, 0x96, 0x49, 0x3d, 0xab, 0x66, 0xa3, 0x33, 0x8a, 0x75, 0x79, + 0x9b, 0x81, 0x6e, 0x34, 0xab, 0xbf, 0x59, 0xc0, 0xa1, 0x95, 0x9c, 0x42, + 0x94, 0xc6, 0x7d, 0xc9, 0xc0, 0x54, 0x4d, 0x8e, 0xa5, 0x2f, 0x3a, 0xc8, + 0xa2, 0xa9, 0x6a, 0x9e, 0x49, 0x31, 0xb4, 0x6a, 0x69, 0x6a, 0x47, 0xc2, + 0x35, 0x88, 0x3f, 0x93, 0x3b, 0x3a, 0x5a, 0x66, 0x9c, 0xc0, 0x3a, 0x53, + 0xc7, 0x9b, 0x4f, 0x75, 0x51, 0x9a, 0xaa, 0x5e, 0xae, 0x61, 0x88, 0xbf, + 0xa9, 0x70, 0xa7, 0x6b, 0x76, 0x9c, 0x3a, 0xa8, 0x3e, 0x3a, 0x8d, 0x66, + 0xa0, 0xa8, 0x8b, 0xab, 0x77, 0xb7, 0x8b, 0x91, 0x82, 0xa1, 0x53, 0x9a, + 0x7f, 0x78, 0x6f, 0x85, 0x90, 0x83, 0xc1, 0xa8, 0x96, 0x4d, 0xb2, 0x40, + 0x87, 0x3b, 0x57, 0xd2, 0xbd, 0xd2, 0x9d, 0xc3, 0x76, 0x40, 0xae, 0x7a, + 0x52, 0x91, 0xc4, 0x3f, 0x9b, 0x2e, 0x35, 0x99, 0xcb, 0x60, 0x8a, 0x8b, + 0x54, 0x65, 0x6a, 0x39, 0x61, 0x52, 0x87, 0x73, 0xb1, 0xae, 0x5c, 0x77, + 0x8e, 0xbf, 0x5c, 0x67, 0xbc, 0x5f, 0x60, 0x5b, 0x33, 0x50, 0x9a, 0xa5, + 0x7e, 0x5d, 0x44, 0xc1, 0x2a, 0x9a, 0x4e, 0x3d, 0x46, 0x37, 0xc1, 0x9b, + 0xc1, 0xc3, 0xd4, 0x95, 0x44, 0x68, 0x9a, 0xc3, 0x6b, 0x5a, 0xb1, 0x68, + 0x63, 0x9d, 0x72, 0x7f, 0xd2, 0xb2, 0xba, 0xc3, 0x6f, 0xc9, 0x7c, 0xcd, + 0x87, 0x9a, 0xae, 0x7c, 0xc6, 0xca, 0x93, 0x40, 0xbb, 0xba, 0x72, 0xc7, + 0x6f, 0x6b, 0x97, 0xd1, 0x81, 0x6b, 0x89, 0xa0, 0x6f, 0x67, 0x9a, 0xc8, + 0xb6, 0x4e, 0x70, 0x4c, 0xb6, 0x6f, 0xae, 0xc6, 0x68, 0x6f, 0x56, 0x73, + 0xbf, 0xc0, 0x8e, 0x9c, 0x73, 0x3a, 0x43, 0x68, 0x39, 0x56, 0x64, 0x54, + 0x8c, 0x3c, 0x63, 0x2e, 0x7f, 0xca, 0x2e, 0x6b, 0xbb, 0x93, 0xbe, 0x7f, + 0x6b, 0x96, 0x89, 0x71, 0xb6, 0x4a, 0x70, 0x56, 0x86, 0x84, 0x53, 0x4a, + 0x44, 0x41, 0x69, 0x36, 0x69, 0xa4, 0xba, 0x63, 0x41, 0x7e, 0xd2, 0x38, + 0x49, 0xb6, 0xc3, 0x4f, 0x99, 0xd1, 0xb0, 0xce, 0xca, 0xc0, 0x91, 0x61, + 0xa6, 0xc3, 0x56, 0x36, 0x91, 0xb5, 0x31, 0xbf, 0x9a, 0xb6, 0xbe, 0xbe, + 0xb4, 0xaf, 0x9f, 0x98, 0xb9, 0x52, 0x38, 0x7c, 0x43, 0xd8, 0x65, 0x4f, + 0x4d, 0x82, 0xb1, 0xc7, 0xb0, 0x9b, 0x6e, 0x4e, 0xa9, 0x9c, 0xde, 0x8a, + 0xbf, 0x9d, 0x8b, 0x5d, 0x7d, 0xaf, 0x5e, 0x64, 0xcc, 0x59, 0x68, 0x99, + 0xb4, 0x66, 0xbc, 0xac, 0x3a, 0xa9, 0x92, 0xc2, 0xc5, 0x93, 0x8b, 0xbc, + 0x83, 0xd0, 0xa9, 0x91, 0x97, 0xb8, 0x60, 0xb6, 0x8a, 0x60, 0x89, 0x86, + 0x41, 0x3e, 0xd4, 0x34, 0x6f, 0xc6, 0x4d, 0x8e, 0x44, 0x6f, 0xc2, 0x67, + 0x3d, 0xc2, 0x8e, 0x76, 0x9c, 0xcc, 0x4e, 0xa5, 0xd8, 0x8e, 0x97, 0x3c, + 0x7b, 0x43, 0x5f, 0x56, 0x3b, 0xa1, 0x64, 0x82, 0x33, 0x5f, 0x9a, 0xb5, + 0x31, 0xcd, 0xd4, 0xbe, 0x3d, 0x92, 0x55, 0x80, 0xbf, 0x38, 0x59, 0x6a, + 0x99, 0x3d, 0x65, 0xac, 0x58, 0x38, 0x97, 0xbf, 0xb4, 0xb9, 0x97, 0xcd, + 0x44, 0xa4, 0x46, 0x5b, 0xbe, 0xc9, 0x54, 0x8c, 0x41, 0x5a, 0xb2, 0x92, + 0x8c, 0xa2, 0xc4, 0x57, 0x89, 0x7c, 0x9b, 0x5d, 0xa8, 0x7f, 0xc8, 0x71, + 0xcd, 0x76, 0x90, 0xb4, 0x9a, 0x6f, 0xd3, 0x94, 0xc0, 0xab, 0x68, 0x4b, + 0xaa, 0xc3, 0x78, 0xa7, 0x33, 0x4b, 0x77, 0xb5, 0x76, 0x92, 0x5f, 0x80, + 0x72, 0xb7, 0x51, 0x35, 0xab, 0x33, 0x5c, 0x52, 0x8b, 0x50, 0x43, 0x86, + 
0x6b, 0xc7, 0x60, 0x70, 0xb3, 0xb7, 0x3b, 0xa9, 0x70, 0x7b, 0xba, 0x9d, + 0x45, 0x9c, 0xa0, 0xca, 0xce, 0x58, 0x7a, 0x76, 0x62, 0xb2, 0x96, 0x6b, + 0xc5, 0xb5, 0x91, 0x55, 0x97, 0x3b, 0x96, 0xb0, 0x7b, 0xad, 0x5d, 0xcc, + 0x34, 0x6b, 0x89, 0x83, 0x7a, 0x51, 0xac, 0x99, 0x4b, 0xc3, 0xb6, 0x4f, + 0x5b, 0x95, 0x6d, 0x47, 0xb0, 0xc5, 0x3a, 0x7a, 0x76, 0x7e, 0x42, 0x7d, + 0xa0, 0x3d, 0x5c, 0x4f, 0x5a, 0x8a, 0xa1, 0x36, 0xbb, 0x82, 0x72, 0x51, + 0xa7, 0xc3, 0x45, 0x55, 0xa2, 0x6f, 0x8e, 0x79, 0xb0, 0x45, 0x7f, 0x99, + 0xcf, 0xc1, 0x8a, 0xc8, 0xbb, 0x39, 0xa9, 0x38, 0x3a, 0x33, 0x55, 0xa9, + 0xce, 0xbf, 0x5d, 0x6a, 0x31, 0x66, 0xb9, 0xc4, 0x57, 0xb9, 0x88, 0xae, + 0xb3, 0xc7, 0x56, 0x4d, 0xd0, 0x7a, 0x4f, 0xb6, 0x82, 0x6d, 0x71, 0x5e, + 0x48, 0x9e, 0x8f, 0x69, 0x93, 0xbb, 0xae, 0xaa, 0x52, 0x4e, 0x58, 0x8c, + 0xb3, 0xb2, 0x39, 0x3e, 0xd4, 0x37, 0x87, 0x6a, 0xc5, 0x57, 0x70, 0xcc, + 0xab, 0xa2, 0xb3, 0x5d, 0xb0, 0x62, 0x5d, 0x55, 0xc6, 0xc5, 0x56, 0x78, + 0x9d, 0xa8, 0x56, 0x98, 0xb5, 0x98, 0x80, 0x96, 0xb9, 0x8b, 0x69, 0x5d, + 0x4b, 0x76, 0xb6, 0xb4, 0x95, 0x44, 0x70, 0x6c, 0x8b, 0xae, 0x40, 0xc3, + 0x4a, 0x81, 0x9c, 0x3b, 0x39, 0xab, 0x59, 0xcd, 0xcc, 0x8c, 0x6b, 0xb2, + 0x34, 0x35, 0x62, 0xca, 0xbe, 0x61, 0x54, 0x84, 0x56, 0xcb, 0x61, 0xc6, + 0x3c, 0x9f, 0x8a, 0xc6, 0xbf, 0x2d, 0x4c, 0x67, 0x39, 0x98, 0xa7, 0xaf, + 0x74, 0x8f, 0xd6, 0x6b, 0x44, 0x35, 0x8d, 0x7e, 0x3c, 0x44, 0xbe, 0x9a, + 0x70, 0x46, 0x8f, 0x62, 0x66, 0x74, 0x78, 0x92, 0xc3, 0x93, 0x93, 0xb6, + 0x50, 0xce, 0xd1, 0x55, 0xc9, 0x3d, 0xb7, 0x6a, 0x3d, 0xd3, 0x81, 0x8a, + 0x75, 0x9b, 0x62, 0x91, 0xb2, 0x40, 0x3e, 0x5e, 0x36, 0xc2, 0x78, 0x6d, + 0x2d, 0x2f, 0x4a, 0xcf, 0x6f, 0x3d, 0x6b, 0xa5, 0x6e, 0x94, 0xcf, 0xc6, + 0x85, 0x99, 0x9f, 0xd1, 0x49, 0x81, 0x7f, 0x82, 0x51, 0x9c, 0xc2, 0x56, + 0x43, 0x62, 0x3b, 0x41, 0x36, 0x7f, 0x8e, 0xaf, 0x4a, 0x7c, 0xa3, 0x4b, + 0x76, 0x31, 0x73, 0x7c, 0x59, 0xdc, 0x94, 0xac, 0x8a, 0xa2, 0x44, 0xb8, + 0xb0, 0xb6, 0x48, 0x88, 0x3b, 0x98, 0x79, 0xc2, 0x5b, 0x81, 0xa9, 0xa6, + 0x58, 0x83, 0xb0, 0x89, 0x40, 0x40, 0x40, 0x4f, 0x7b, 0x51, 0x8e, 0xc0, + 0xa4, 0x9a, 0x53, 0x53, 0x5b, 0xb2, 0x47, 0x53, 0x99, 0xc5, 0xb1, 0xb4, + 0xbc, 0x67, 0x2c, 0x57, 0x63, 0x5c, 0x34, 0xc1, 0x8b, 0x80, 0xa2, 0x71, + 0x75, 0x53, 0x37, 0x7e, 0x66, 0x8e, 0xcb, 0x46, 0xc3, 0x80, 0x87, 0xc5, + 0x5c, 0x52, 0xd4, 0xc0, 0x3e, 0xa9, 0x48, 0xc3, 0x6a, 0x36, 0x6f, 0x7b, + 0x99, 0x4e, 0x70, 0x34, 0x2e, 0x4f, 0x50, 0xb0, 0x3b, 0xa3, 0xc4, 0x98, + 0xac, 0x63, 0xcc, 0x3b, 0x7c, 0xc2, 0x69, 0x8d, 0x33, 0x9e, 0xc5, 0x87, + 0x8a, 0xc9, 0x3b, 0x64, 0x95, 0x74, 0xc6, 0x47, 0x35, 0xc4, 0x31, 0xbe, + 0x7f, 0x6e, 0x87, 0x3a, 0xc8, 0xa3, 0xac, 0x55, 0xc4, 0xd2, 0xa3, 0x9a, + 0xa3, 0xbe, 0x72, 0xc5, 0xb3, 0x42, 0xb0, 0x67, 0xa0, 0xc3, 0x4f, 0xc8, + 0x80, 0x89, 0xb3, 0xb0, 0x50, 0x9c, 0x6c, 0xad, 0xc8, 0x4a, 0x67, 0x7b, + 0x7f, 0x50, 0x9b, 0x98, 0x3c, 0x47, 0x3d, 0x49, 0xc8, 0x6f, 0x5f, 0x5a, + 0x9d, 0x91, 0x8f, 0xd5, 0x6d, 0x95, 0xbf, 0x67, 0xad, 0x91, 0x83, 0x80, + 0x53, 0xc4, 0x42, 0xd3, 0x9a, 0xcd, 0xda, 0x3e, 0x85, 0x51, 0xcf, 0xd2, + 0x85, 0x48, 0xc2, 0x35, 0x8f, 0x8f, 0x60, 0x3b, 0x94, 0x72, 0x8e, 0xb5, + 0x48, 0x6e, 0x48, 0x61, 0x8e, 0x9e, 0x83, 0xb3, 0xb0, 0x96, 0xc8, 0x7f, + 0x83, 0xc9, 0x5f, 0xa5, 0xba, 0xcc, 0x43, 0xd1, 0xd3, 0xa6, 0x4d, 0x87, + 0x5c, 0x47, 0xa5, 0x32, 0xa8, 0x79, 0x86, 0x60, 0x4c, 0x9f, 0xc5, 0x6a, + 0x79, 0xac, 0x52, 0xcd, 0x64, 0x85, 0x36, 0xb1, 0xc6, 0x41, 0xba, 0xb1, + 0x8b, 0xc2, 0x5e, 0x6e, 0x6d, 0x7d, 0x65, 0xc3, 0xb0, 0xb8, 0x89, 0x49, + 0xc3, 0x60, 0x6e, 0x82, 0x32, 0x81, 0xce, 0x4d, 0x48, 0x38, 0x54, 0x60, + 
0x6c, 0xb5, 0xa6, 0x9f, 0x31, 0x33, 0x72, 0x60, 0x8e, 0x6c, 0x7c, 0x71, + 0xa1, 0x57, 0xbb, 0x8f, 0x99, 0x41, 0x95, 0x87, 0xb5, 0x73, 0x3e, 0x7f, + 0xb5, 0xa8, 0x9a, 0xcb, 0x6b, 0x88, 0xc8, 0xa1, 0xd7, 0x53, 0xa2, 0x71, + 0x56, 0xa4, 0xcf, 0x49, 0xc4, 0x85, 0xa8, 0xca, 0x4e, 0x55, 0x5f, 0x76, + 0xa6, 0x95, 0x27, 0xa1, 0x80, 0x91, 0xbf, 0x4b, 0xa0, 0x8b, 0x9d, 0xb6, + 0xf2, 0xab, 0xad, 0xab, 0x68, 0x8e, 0xa4, 0x68, 0x89, 0x54, 0xaa, 0x7a, + 0x69, 0x3b, 0xaa, 0x7e, 0x61, 0xa1, 0xc9, 0xc9, 0xdc, 0x8e, 0x88, 0x5c, + 0x7d, 0xbb, 0x84, 0x57, 0x43, 0x3c, 0x8f, 0x5f, 0x73, 0x9a, 0x9d, 0x89, + 0x89, 0x36, 0xb0, 0x53, 0x49, 0x65, 0xb9, 0x53, 0x54, 0x9d, 0x80, 0xaf, + 0x8d, 0xc8, 0xaf, 0x3f, 0x90, 0xa4, 0x6c, 0x55, 0x6b, 0x6e, 0xb9, 0x3e, + 0xa0, 0xac, 0x72, 0xa1, 0x46, 0x90, 0x98, 0x75, 0x6a, 0x85, 0x91, 0x69, + 0x52, 0xc5, 0x5e, 0x85, 0xb1, 0x72, 0x40, 0x32, 0x6e, 0x62, 0x45, 0xb7, + 0x4f, 0x6b, 0x93, 0x87, 0x97, 0x4b, 0xa7, 0x6c, 0xb1, 0x4b, 0xab, 0x65, + 0x2d, 0x7d, 0x86, 0xaf, 0x76, 0xa0, 0x80, 0x60, 0x58, 0xa4, 0x7a, 0x7e, + 0xa7, 0x74, 0x75, 0x38, 0x95, 0xb7, 0x95, 0x82, 0xd9, 0x5e, 0x4f, 0xbe, + 0xcc, 0x95, 0x5d, 0xbc, 0x9c, 0x95, 0x42, 0x4b, 0xce, 0x6d, 0xdd, 0x3a, + 0x4c, 0x2b, 0xcb, 0xd5, 0xc9, 0x8a, 0x47, 0x5c, 0x65, 0x54, 0x6a, 0xbc, + 0x5b, 0xae, 0x93, 0xcd, 0x99, 0x38, 0xa1, 0x79, 0x55, 0xab, 0xc1, 0x8b, + 0x38, 0x6a, 0x38, 0x3c, 0x59, 0xa2, 0xc6, 0xba, 0x43, 0xc9, 0xbc, 0x64, + 0xc5, 0x45, 0x6f, 0x93, 0x7e, 0x39, 0x8b, 0x9a, 0x32, 0x6a, 0xac, 0xa7, + 0x66, 0xa9, 0x6a, 0x49, 0x29, 0x87, 0x45, 0x83, 0x5c, 0x3a, 0x91, 0x39, + 0xaf, 0xd9, 0x52, 0x79, 0xcc, 0xa7, 0x53, 0x67, 0xa6, 0xd2, 0x7d, 0x70, + 0x43, 0xbe, 0x97, 0xb4, 0x81, 0x56, 0x9b, 0xa5, 0xc4, 0x72, 0x4d, 0xc2, + 0x97, 0xba, 0xa8, 0x73, 0xc6, 0x40, 0x5f, 0x70, 0xb0, 0xbc, 0xbf, 0x6c, + 0xb8, 0x6a, 0xcd, 0xa1, 0xbb, 0xb3, 0x8b, 0x4d, 0x61, 0x5f, 0xad, 0x75, + 0xd2, 0x98, 0x66, 0x76, 0x52, 0xc6, 0xb0, 0x3b, 0x7d, 0x6b, 0x66, 0xc8, + 0x53, 0xa6, 0x3e, 0xc2, 0x45, 0xc4, 0x41, 0xb8, 0x8a, 0xaa, 0x62, 0x89, + 0xac, 0xa2, 0x71, 0x7f, 0xb7, 0x3d, 0x43, 0xad, 0x3f, 0xae, 0x84, 0x6c, + 0x53, 0xa4, 0x77, 0x85, 0x45, 0x3a, 0xa3, 0x71, 0x29, 0x7e, 0x4a, 0x43, + 0xa0, 0x52, 0xa1, 0x5c, 0x9f, 0x4b, 0x88, 0xbf, 0x4e, 0x44, 0xb6, 0x97, + 0x4e, 0xa1, 0x8d, 0xaf, 0x5c, 0xd2, 0x51, 0x73, 0x90, 0x49, 0x5d, 0xa7, + 0x5f, 0x42, 0x82, 0x60, 0xaa, 0x7f, 0x33, 0x91, 0x9b, 0x7b, 0xc0, 0x49, + 0xa9, 0x44, 0x79, 0x9e, 0x53, 0xc0, 0xd1, 0x85, 0x7b, 0xbf, 0x88, 0x55, + 0x5f, 0xc8, 0x41, 0xb3, 0x91, 0x7c, 0x46, 0xbe, 0x39, 0xd2, 0xab, 0x80, + 0x35, 0xd5, 0x80, 0x80, 0xb4, 0x71, 0x69, 0x74, 0x58, 0x76, 0xc5, 0x4b, + 0xa7, 0x3a, 0x6b, 0x5d, 0xbf, 0xcf, 0x79, 0x40, 0x6d, 0x97, 0xab, 0xa6, + 0x42, 0x7c, 0x4d, 0xc3, 0x96, 0xbe, 0xc1, 0x90, 0xb6, 0x6d, 0x36, 0x68, + 0x89, 0xc2, 0x97, 0xd7, 0x9c, 0x7c, 0x33, 0x3c, 0x71, 0xb9, 0x31, 0xad, + 0x98, 0x88, 0xbc, 0xb6, 0x4e, 0x3f, 0x87, 0xaa, 0x65, 0x39, 0xb1, 0x96, + 0x9c, 0x9e, 0x63, 0x95, 0x6d, 0x87, 0xb7, 0x6d, 0x4b, 0x4a, 0x93, 0x57, + 0x5a, 0x53, 0x4b, 0x73, 0x91, 0x56, 0xa0, 0x80, 0x61, 0x92, 0x40, 0x6f, + 0x8a, 0x59, 0x62, 0xbd, 0x9e, 0x92, 0x64, 0xd8, 0x7b, 0x84, 0x75, 0x57, + 0x96, 0x50, 0xca, 0xb4, 0x4c, 0xaa, 0x8b, 0x6b, 0x35, 0x77, 0xb1, 0x66, + 0xa8, 0x4a, 0x4b, 0xa4, 0x8e, 0x5c, 0x31, 0xc7, 0xae, 0x36, 0xb5, 0x77, + 0x65, 0x7d, 0x3a, 0xc2, 0x40, 0x4c, 0x58, 0x7d, 0x4f, 0x70, 0x4d, 0x82, + 0x89, 0xa3, 0x54, 0xa1, 0x37, 0x52, 0x79, 0x93, 0xc0, 0x4b, 0x8f, 0x5c, + 0x64, 0xbe, 0xa8, 0x38, 0x57, 0x2b, 0x62, 0x9b, 0x44, 0x9b, 0x9e, 0xc3, + 0xb8, 0x9d, 0x7a, 0x6e, 0x8b, 0x39, 0xc7, 0x8a, 0x57, 0xcb, 0xcc, 0x6a, + 
0xd0, 0x49, 0xc8, 0x45, 0xa6, 0xcb, 0xae, 0x5c, 0x9d, 0xb1, 0x74, 0xcd, + 0xd2, 0x68, 0x49, 0x8f, 0xc5, 0x9a, 0xa5, 0x57, 0x4a, 0xa7, 0x65, 0x5a, + 0xac, 0x89, 0x9f, 0x36, 0x4c, 0xc7, 0x58, 0xd7, 0x6a, 0xf1, 0xc5, 0x54, + 0xc5, 0xa2, 0xad, 0x46, 0x3f, 0xa4, 0x60, 0x6c, 0x49, 0xb7, 0xd2, 0x7d, + 0x4b, 0x4b, 0xad, 0x32, 0x67, 0x4d, 0x8d, 0xa2, 0x43, 0xac, 0x52, 0xbd, + 0x93, 0x84, 0x69, 0x49, 0x8b, 0x91, 0x68, 0xba, 0x9b, 0x98, 0xa6, 0xb2, + 0xc3, 0x71, 0x72, 0xbf, 0xc9, 0x71, 0xaa, 0xc2, 0x97, 0xa3, 0xb9, 0x74, + 0x97, 0x5d, 0x4e, 0x3d, 0xb4, 0x59, 0x74, 0x96, 0xac, 0x50, 0x75, 0xd2, + 0x30, 0xa1, 0x8f, 0x9b, 0xbd, 0x70, 0x7d, 0x75, 0xbb, 0x63, 0xa3, 0x80, + 0x56, 0xac, 0x5a, 0x64, 0xac, 0x64, 0x51, 0xdc, 0xd2, 0x37, 0xad, 0x93, + 0x6a, 0x91, 0x99, 0xcb, 0xe5, 0x77, 0x96, 0x42, 0x50, 0x44, 0x3a, 0x47, + 0x6d, 0x8f, 0xb5, 0x64, 0xe2, 0x2c, 0xac, 0x36, 0xdb, 0x3c, 0xc1, 0x86, + 0xb9, 0xb9, 0x29, 0x78, 0x56, 0x5c, 0x55, 0x43, 0xa1, 0x7a, 0x43, 0x71, + 0x5c, 0x85, 0xce, 0x6d, 0x81, 0x47, 0x63, 0xc7, 0xaf, 0x4c, 0xa9, 0x82, + 0x5d, 0xb4, 0x44, 0x52, 0xc1, 0x40, 0x5e, 0x32, 0x43, 0x99, 0xa7, 0xb3, + 0x65, 0xa2, 0x4f, 0x79, 0xb0, 0x6f, 0x7c, 0x43, 0xb2, 0x7a, 0x6a, 0xc2, + 0xc3, 0x85, 0x5a, 0x3d, 0xc8, 0x68, 0x35, 0x4e, 0xb8, 0x58, 0x68, 0xa2, + 0x68, 0x5e, 0x80, 0x98, 0x7a, 0xb8, 0x53, 0x7d, 0x6b, 0x7d, 0x9a, 0x8a, + 0x8c, 0xd3, 0xc5, 0xca, 0x6e, 0xbf, 0x44, 0x56, 0x3d, 0x8d, 0x7c, 0x61, + 0xa4, 0x76, 0x46, 0xb6, 0x95, 0xba, 0x5c, 0x63, 0x9d, 0x40, 0x3a, 0x68, + 0xcf, 0x70, 0x59, 0x34, 0xc2, 0xa7, 0x86, 0x4a, 0x5a, 0x52, 0x4c, 0x8b, + 0x33, 0x3d, 0x3c, 0x9e, 0xc6, 0xac, 0xc6, 0xa3, 0x7f, 0x59, 0x43, 0x38, + 0xaa, 0x54, 0xb7, 0xb6, 0x54, 0xba, 0x7e, 0xa2, 0x7f, 0x81, 0x8d, 0xa5, + 0x4c, 0x89, 0x98, 0x68, 0x36, 0xab, 0xa8, 0xa9, 0x64, 0x8e, 0x65, 0x7d, + 0x2f, 0xa0, 0x77, 0xd8, 0x63, 0x58, 0x69, 0x38, 0xa0, 0x7b, 0x47, 0x6e, + 0x61, 0x8a, 0xa4, 0x66, 0x69, 0x75, 0x81, 0x7c, 0x3a, 0x50, 0x77, 0x50, + 0x7a, 0x98, 0x93, 0x5d, 0xc4, 0x6c, 0x9b, 0xb8, 0x54, 0x39, 0xc5, 0x68, + 0xa6, 0x8a, 0x55, 0xaf, 0xa9, 0x9b, 0x46, 0x4a, 0x91, 0x39, 0x96, 0xb5, + 0x79, 0x55, 0xb1, 0x74, 0x99, 0x46, 0x3d, 0xcd, 0x66, 0x6a, 0x31, 0xb2, + 0x43, 0x48, 0xbf, 0x78, 0x87, 0x82, 0x7a, 0x90, 0xbc, 0x86, 0x51, 0xa6, + 0xaa, 0x60, 0x5d, 0xbe, 0xa1, 0xd2, 0xbf, 0x51, 0x67, 0xce, 0x70, 0x81, + 0x5c, 0xb7, 0x93, 0xd2, 0x72, 0xb1, 0x9d, 0x3b, 0x95, 0x9a, 0xa5, 0xaf, + 0x7b, 0x5d, 0x66, 0xac, 0x46, 0xc1, 0x44, 0x81, 0xc9, 0x46, 0x60, 0xd0, + 0x5f, 0x87, 0x55, 0x8d, 0x7d, 0x75, 0x70, 0x50, 0x38, 0xaa, 0x7b, 0x56, + 0x83, 0x4f, 0x97, 0x3b, 0xb1, 0x44, 0x7d, 0x44, 0x43, 0x7e, 0x89, 0x42, + 0xb8, 0xaf, 0x8c, 0x41, 0xaa, 0xa2, 0x90, 0xcc, 0x5d, 0xb4, 0xa5, 0x97, + 0x89, 0x69, 0x9b, 0x38, 0x8c, 0x6c, 0x78, 0xa5, 0x55, 0x46, 0x53, 0x4a, + 0xa5, 0xab, 0xa1, 0x38, 0xca, 0x3d, 0x33, 0xb0, 0xa6, 0x90, 0xa6, 0xc1, + 0x44, 0x71, 0x55, 0x91, 0x95, 0x73, 0xaa, 0x4e, 0x81, 0x55, 0xa5, 0xc4, + 0x68, 0x3a, 0xc4, 0x75, 0x83, 0x61, 0xaa, 0x94, 0x38, 0x4f, 0x4b, 0x74, + 0x56, 0x3a, 0xbf, 0x5f, 0x60, 0x3c, 0x77, 0xb1, 0x56, 0x35, 0x8a, 0x5f, + 0x6e, 0x65, 0xcd, 0xd2, 0x5c, 0x2a, 0x57, 0x56, 0x37, 0x85, 0x4e, 0xa0, + 0x75, 0x43, 0x8d, 0x68, 0x4a, 0x3b, 0x5e, 0x83, 0x98, 0x8a, 0x4e, 0xc7, + 0xbc, 0x8f, 0xd2, 0x50, 0x81, 0x3d, 0xcd, 0xa2, 0x65, 0x6d, 0x91, 0x8d, + 0x65, 0x43, 0xb1, 0xc7, 0x62, 0x9a, 0x3d, 0xbe, 0x7e, 0x3e, 0xb0, 0xaa, + 0xa5, 0xca, 0x5b, 0x6b, 0x49, 0x44, 0xa2, 0xb4, 0x96, 0x8d, 0x8d, 0xd6, + 0x30, 0x9d, 0x38, 0x3f, 0x33, 0x4d, 0x44, 0x55, 0xc1, 0x4c, 0x96, 0x8a, + 0x8f, 0x54, 0x33, 0x80, 0xb7, 0x38, 0x89, 0xd2, 0xbd, 0x78, 0x70, 0x43, + 
0xb2, 0xa7, 0x8e, 0x4f, 0x4e, 0x6b, 0x61, 0x62, 0x97, 0xa0, 0xb8, 0xa4, + 0xc1, 0xa3, 0x95, 0x7d, 0x6a, 0x65, 0x97, 0x68, 0xc1, 0xad, 0x90, 0x9d, + 0x34, 0x6a, 0x64, 0x8a, 0x3f, 0x56, 0x45, 0x59, 0xab, 0x3e, 0x5c, 0xb5, + 0x7b, 0x82, 0x3e, 0x3e, 0x6d, 0xb5, 0x77, 0x37, 0x98, 0x3a, 0x46, 0x9c, + 0xa0, 0xb6, 0x63, 0x31, 0x96, 0x98, 0xc5, 0xb9, 0xb1, 0xc2, 0x9b, 0x5d, + 0xaf, 0xb2, 0x40, 0xac, 0xb5, 0x9a, 0x9c, 0xc7, 0xb5, 0xd2, 0x9a, 0x98, + 0xa4, 0x98, 0xb9, 0x34, 0x47, 0x67, 0x76, 0xae, 0x4e, 0x8e, 0x6d, 0x58, + 0xcd, 0xc6, 0x52, 0x94, 0x68, 0x2f, 0xca, 0x35, 0x48, 0x71, 0x49, 0xb9, + 0x7c, 0xb7, 0x48, 0xaa, 0xb6, 0x58, 0x93, 0x82, 0x72, 0x94, 0x55, 0x87, + 0x58, 0x63, 0x83, 0xc1, 0x9e, 0xc5, 0xbf, 0xa4, 0xa2, 0x3c, 0x90, 0x7c, + 0x53, 0xb7, 0xc7, 0xb7, 0x8b, 0x70, 0x43, 0x73, 0x4a, 0x7f, 0xca, 0x57, + 0x81, 0x45, 0x8c, 0x54, 0x82, 0x6b, 0x80, 0x8f, 0x6b, 0xc5, 0xc1, 0x7c, + 0x9e, 0xcd, 0xd1, 0x37, 0x33, 0xce, 0x63, 0xc2, 0x35, 0x7c, 0x48, 0x51, + 0xc4, 0x82, 0x81, 0x3e, 0x5c, 0xcd, 0x8a, 0x57, 0x37, 0x80, 0x3a, 0xcb, + 0x44, 0x75, 0x9b, 0x33, 0xae, 0x45, 0x69, 0xc3, 0xb6, 0x4b, 0x41, 0x4b, + 0x79, 0x61, 0x52, 0x67, 0xd0, 0x7c, 0x94, 0xc7, 0xa3, 0x70, 0xc4, 0x70, + 0x7a, 0x99, 0x60, 0xbe, 0xae, 0xac, 0x54, 0x58, 0xbe, 0xd3, 0xca, 0x62, + 0x73, 0x65, 0x3a, 0x36, 0xb7, 0x8d, 0xaf, 0x60, 0x37, 0x96, 0x5c, 0xa4, + 0x63, 0xa2, 0xc7, 0x5d, 0x9a, 0x7b, 0x7d, 0x35, 0x67, 0x49, 0x5e, 0x3f, + 0x57, 0x7a, 0x81, 0x38, 0x53, 0x3d, 0xca, 0x49, 0x58, 0xa5, 0x91, 0x99, + 0x60, 0x7e, 0x30, 0x87, 0xb2, 0x7b, 0x57, 0x9a, 0x9d, 0x7b, 0x45, 0x34, + 0x97, 0xc3, 0x92, 0xb5, 0xc5, 0xb2, 0x54, 0x33, 0x85, 0xc0, 0x60, 0xc4, + 0xac, 0x6a, 0xc4, 0x60, 0x9e, 0x91, 0xce, 0x4f, 0xca, 0xc9, 0x99, 0x4b, + 0x74, 0x48, 0x8a, 0x52, 0xa2, 0xb0, 0xb5, 0x84, 0x68, 0x5d, 0xcd, 0x95, + 0x82, 0x39, 0x4a, 0xc9, 0xcf, 0x83, 0x42, 0x6f, 0xa6, 0xb0, 0xc8, 0xb8, + 0x7c, 0x32, 0x6a, 0x33, 0x79, 0x9d, 0x4d, 0x8c, 0x92, 0x49, 0xcd, 0x6f, + 0xb3, 0xb2, 0xa5, 0x52, 0xc1, 0x5a, 0x7b, 0x5e, 0x7d, 0x91, 0x8a, 0x8a, + 0x6e, 0x98, 0x8e, 0xaa, 0xbf, 0x71, 0x5a, 0x98, 0xb1, 0x3a, 0x48, 0xae, + 0x87, 0x93, 0x87, 0xc6, 0x40, 0x2f, 0xa0, 0x4d, 0x51, 0x73, 0xcd, 0x5f, + 0x9e, 0x7c, 0xd3, 0x53, 0x58, 0xc5, 0x58, 0x55, 0x76, 0x8d, 0x61, 0xba, + 0x3c, 0xb4, 0x65, 0xb0, 0x8a, 0x3a, 0x53, 0xd7, 0x30, 0x44, 0xd5, 0x7b, + 0xc4, 0x30, 0x58, 0x56, 0x6e, 0x5a, 0x56, 0x84, 0x4c, 0x69, 0x67, 0x5f, + 0x89, 0x82, 0x97, 0x70, 0x60, 0xca, 0x7a, 0x4b, 0x92, 0x85, 0xcd, 0xba, + 0x43, 0xbb, 0x8e, 0xa2, 0x83, 0x96, 0xae, 0xa0, 0x3d, 0x71, 0x52, 0xb5, + 0x51, 0x91, 0x68, 0x80, 0xb7, 0x4f, 0x8f, 0x7a, 0xa7, 0xab, 0x7b, 0x69, + 0xd1, 0xab, 0x69, 0x46, 0x91, 0xcb, 0x94, 0x9b, 0xaa, 0x4d, 0xa5, 0x74, + 0x4a, 0x5f, 0x81, 0xc4, 0xc2, 0x34, 0x35, 0x6c, 0x36, 0x82, 0xa8, 0xc5, + 0xd0, 0x6a, 0x65, 0x4e, 0x73, 0x8d, 0xb2, 0x3b, 0xd0, 0x3b, 0xbf, 0x59, + 0xb8, 0xd0, 0x87, 0xad, 0x41, 0xc3, 0x9d, 0x74, 0x6d, 0x42, 0xb1, 0xa2, + 0xa4, 0x60, 0x35, 0x44, 0x7a, 0x7b, 0x3f, 0x33, 0x44, 0x49, 0x90, 0x5f, + 0x82, 0x66, 0x89, 0xbb, 0x6e, 0x3c, 0x9f, 0x56, 0xcf, 0xbe, 0x51, 0x95, + 0xa3, 0x48, 0x59, 0x5d, 0xb3, 0x54, 0x32, 0x4d, 0x3c, 0xb1, 0x52, 0x4b, + 0xaf, 0xcf, 0x4d, 0xbe, 0x9f, 0x60, 0x77, 0x86, 0x7c, 0xc4, 0x84, 0xbf, + 0x80, 0x67, 0x76, 0x50, 0x80, 0x63, 0x99, 0xb1, 0xa2, 0x43, 0x50, 0x65, + 0x7b, 0xbd, 0x6c, 0xce, 0x9f, 0xc3, 0x3a, 0x6c, 0x97, 0xd3, 0x34, 0x97, + 0xcf, 0xcb, 0x47, 0x38, 0x3d, 0x68, 0x78, 0x2b, 0x99, 0x3c, 0x74, 0x79, + 0xcf, 0x6b, 0x90, 0x57, 0xb9, 0x57, 0x7d, 0xa3, 0x4b, 0xb1, 0x49, 0xaa, + 0x67, 0x5e, 0xc8, 0xa1, 0x7e, 0x8e, 0x8a, 0xc8, 0xd4, 0xc3, 0x84, 0xcb, + 
0x56, 0xa9, 0x95, 0x9f, 0x80, 0x8b, 0x83, 0x82, 0xb3, 0x6b, 0x39, 0xa6, + 0x8e, 0x6a, 0xac, 0x4f, 0xc0, 0x42, 0xcb, 0x9e, 0xd2, 0x7d, 0xb1, 0xc5, + 0x49, 0xab, 0x7b, 0x59, 0x8f, 0x7b, 0x54, 0x45, 0xb7, 0x96, 0x76, 0x4b, + 0xd2, 0x4e, 0x6e, 0xa1, 0x74, 0x3b, 0x9e, 0x6d, 0xcf, 0x87, 0x8b, 0xa3, + 0xb6, 0x68, 0x82, 0x59, 0x8a, 0xd1, 0x9f, 0xbb, 0x5c, 0x3a, 0xc4, 0x76, + 0xcb, 0x85, 0x46, 0x70, 0x98, 0xc3, 0xca, 0xb0, 0x63, 0x90, 0xa2, 0x60, + 0x8d, 0xba, 0x8e, 0x37, 0x7f, 0x83, 0xc2, 0xcd, 0x70, 0x6e, 0x8c, 0x4e, + 0x52, 0x81, 0x73, 0x40, 0x56, 0x93, 0xcb, 0x67, 0x65, 0xb6, 0xbe, 0xc3, + 0x89, 0x80, 0xa6, 0x32, 0xbe, 0x88, 0x5e, 0xba, 0xcb, 0x72, 0x6f, 0x9a, + 0xab, 0x9c, 0x9b, 0x69, 0x6d, 0x6d, 0x90, 0xb6, 0xb2, 0xbf, 0xc8, 0x61, + 0x5f, 0x4b, 0xb9, 0x89, 0x3c, 0x37, 0xa1, 0x36, 0x62, 0x68, 0x82, 0x7f, + 0x3a, 0x78, 0x82, 0x81, 0x56, 0xb0, 0xd3, 0x4e, 0xb4, 0xca, 0xb3, 0x66, + 0x7c, 0x3b, 0x42, 0xc6, 0x90, 0x3c, 0x82, 0x8b, 0x9c, 0x8a, 0x4e, 0x44, + 0x36, 0x49, 0xc2, 0xd1, 0x79, 0x66, 0x69, 0x74, 0x7c, 0x56, 0x60, 0x40, + 0x85, 0xa2, 0xb6, 0x56, 0x9a, 0x67, 0xb3, 0xad, 0x73, 0x6c, 0xa8, 0x8c, + 0xce, 0x3b, 0x73, 0x89, 0xbf, 0xa0, 0x8e, 0xb7, 0x6b, 0x7c, 0x41, 0x4e, + 0x48, 0xc0, 0xd2, 0x44, 0x7c, 0x46, 0xc3, 0x87, 0xcb, 0x71, 0x67, 0x53, + 0x96, 0x91, 0x3c, 0xca, 0x41, 0xc5, 0x93, 0x88, 0x84, 0xc9, 0x37, 0x6d, + 0x97, 0x65, 0x7d, 0xb1, 0x53, 0x3f, 0x4d, 0x9b, 0xbc, 0xa8, 0x3d, 0x89, + 0x62, 0xb2, 0xc2, 0xb7, 0xa8, 0xc0, 0xc9, 0x48, 0x9c, 0xb3, 0x53, 0xa5, + 0x41, 0xae, 0x56, 0x9d, 0x5c, 0x32, 0x96, 0x46, 0x40, 0x63, 0x78, 0x71, + 0x6f, 0xac, 0x3f, 0xc5, 0x56, 0xc6, 0x8a, 0x35, 0x5e, 0x96, 0x71, 0x38, + 0x79, 0xb7, 0x78, 0x3b, 0x76, 0x5e, 0x95, 0x37, 0x61, 0xb4, 0xb8, 0xcb, + 0x71, 0xc2, 0xbf, 0x76, 0x85, 0xb4, 0x42, 0x98, 0x87, 0x66, 0x2e, 0x6d, + 0xae, 0xba, 0x31, 0xa3, 0x83, 0x5f, 0xa2, 0x38, 0x3e, 0x3e, 0x9a, 0x91, + 0xab, 0xc6, 0xa6, 0xbc, 0xc4, 0x73, 0x47, 0xa6, 0xba, 0x70, 0x7c, 0xb4, + 0xa5, 0x6c, 0x6f, 0x96, 0x5a, 0x3d, 0x51, 0x6d, 0x9f, 0xa9, 0xa9, 0x8c, + 0xd1, 0x74, 0x72, 0xb7, 0x67, 0x69, 0x31, 0x57, 0xc7, 0x53, 0xc7, 0xa7, + 0x71, 0x3e, 0x56, 0x80, 0x46, 0x3d, 0xb5, 0x64, 0x5d, 0x95, 0x89, 0x8d, + 0x79, 0x48, 0x56, 0x74, 0x51, 0x87, 0x6d, 0x5b, 0x4c, 0x6e, 0x78, 0xcb, + 0x93, 0x38, 0x80, 0x7b, 0x50, 0xbe, 0xcd, 0x9e, 0xce, 0xaa, 0xa7, 0x66, + 0x75, 0x72, 0xb2, 0x7f, 0x33, 0x95, 0xdc, 0xa3, 0x6f, 0x78, 0x60, 0x46, + 0xd1, 0x80, 0x55, 0x59, 0x83, 0xc0, 0xa6, 0x98, 0x5a, 0x68, 0x7d, 0xc9, + 0x71, 0x56, 0x61, 0xc2, 0x74, 0x3e, 0x65, 0xba, 0xaf, 0xd2, 0x6a, 0xc6, + 0x58, 0xd8, 0xb2, 0x59, 0xa9, 0xcc, 0xae, 0xad, 0x87, 0xbb, 0x88, 0x8d, + 0xb4, 0x92, 0x84, 0x7c, 0x33, 0xc3, 0xb9, 0xa9, 0xcf, 0x86, 0xb6, 0xaa, + 0x91, 0x35, 0x98, 0xc4, 0x7a, 0x66, 0x51, 0xc7, 0xbd, 0x3c, 0xce, 0xbc, + 0x80, 0x80, 0x4f, 0x5b, 0x38, 0x51, 0x7f, 0x3e, 0x83, 0x41, 0xba, 0x6d, + 0xb2, 0x74, 0x7b, 0x6c, 0x68, 0x9e, 0xb8, 0x41, 0x78, 0x52, 0xb9, 0x7d, + 0xc0, 0xbf, 0x38, 0xc4, 0x41, 0x43, 0x5d, 0x59, 0xa0, 0xaa, 0x6e, 0x47, + 0x9b, 0x6c, 0x45, 0x9a, 0x41, 0xa1, 0x4f, 0xbe, 0x91, 0x82, 0x53, 0x72, + 0x9e, 0x76, 0x78, 0x83, 0xbe, 0x34, 0x3f, 0xb1, 0x36, 0xa9, 0x95, 0xd0, + 0xcf, 0x58, 0xce, 0x6e, 0x7a, 0x97, 0xc6, 0x8f, 0x45, 0x83, 0x5a, 0x60, + 0xab, 0x46, 0x48, 0x32, 0xac, 0x66, 0x97, 0xc3, 0x70, 0x51, 0x60, 0xa5, + 0x5a, 0x52, 0x4d, 0x93, 0xc2, 0xc9, 0x43, 0x7f, 0x32, 0x73, 0xa4, 0x56, + 0xb9, 0x69, 0x41, 0x46, 0x84, 0x7e, 0x72, 0x7e, 0xb7, 0x7d, 0x9d, 0xbd, + 0x71, 0x99, 0x88, 0x47, 0x61, 0x58, 0xa8, 0x48, 0xa8, 0x6e, 0xd1, 0x55, + 0x95, 0x6f, 0xd5, 0xbb, 0xc1, 0x9b, 0x4d, 0x38, 0x6e, 0x9c, 0x65, 0x82, + 
0x66, 0xab, 0x8f, 0x88, 0x8a, 0xb0, 0x88, 0xb0, 0x99, 0x80, 0xa9, 0xc9, + 0x97, 0x94, 0xa8, 0x9b, 0xcf, 0x91, 0x40, 0xd1, 0x93, 0x66, 0x3c, 0x49, + 0x64, 0x87, 0x51, 0xca, 0x34, 0xa3, 0xb0, 0xb3, 0x37, 0x80, 0xc9, 0xcc, + 0xc6, 0xd0, 0xbc, 0x8f, 0x79, 0x4c, 0x5a, 0x7b, 0x4a, 0xad, 0x6a, 0x86, + 0x47, 0x60, 0x42, 0xc1, 0xc9, 0x8b, 0x44, 0x77, 0xb8, 0x5e, 0x86, 0x64, + 0x73, 0x61, 0x85, 0x6a, 0x6e, 0x5b, 0xd2, 0x76, 0x9f, 0x64, 0x76, 0x6f, + 0x7c, 0x78, 0x35, 0xc7, 0x4c, 0x93, 0x6b, 0x31, 0x64, 0x62, 0x25, 0x29, + 0xaf, 0xb4, 0x55, 0x60, 0xc3, 0x40, 0x2d, 0xc8, 0xd3, 0x5f, 0x57, 0x86, + 0xa0, 0xd3, 0x4c, 0x35, 0xa2, 0x97, 0x86, 0x9c, 0xaf, 0x94, 0x3d, 0x57, + 0x37, 0xbb, 0x41, 0x33, 0x3e, 0xb4, 0x49, 0x4b, 0x51, 0xad, 0xc1, 0x61, + 0x7d, 0x64, 0x4c, 0x8a, 0x33, 0xbf, 0x62, 0x1e, 0x79, 0x60, 0x3b, 0x5b, + 0x41, 0xcc, 0x48, 0xba, 0x99, 0x87, 0x93, 0x92, 0x2b, 0x89, 0x51, 0x61, + 0x32, 0xce, 0xc7, 0x76, 0x76, 0x2c, 0xa1, 0x5c, 0xc2, 0xbb, 0x51, 0x2a, + 0x3c, 0xda, 0x44, 0x7a, 0x9c, 0xbc, 0x40, 0x5c, 0xb1, 0x2d, 0x8f, 0xb5, + 0x34, 0x5a, 0x6a, 0xb8, 0x7b, 0x9d, 0xaf, 0xa1, 0x69, 0x34, 0x8d, 0x15, + 0xab, 0xbb, 0xa1, 0x75, 0x56, 0x8d, 0x65, 0xab, 0x5a, 0x4a, 0x80, 0x82, + 0xc6, 0x90, 0x85, 0x75, 0x57, 0x5d, 0xca, 0xbc, 0x40, 0x60, 0x3f, 0x52, + 0x3c, 0xb3, 0xb7, 0x62, 0x84, 0x7d, 0x7c, 0x8c, 0x9e, 0x4e, 0x4b, 0xce, + 0x70, 0xb3, 0x34, 0x55, 0xbd, 0x90, 0x3c, 0x65, 0xa9, 0x87, 0xa7, 0x77, + 0x65, 0x3e, 0x4a, 0x71, 0x4f, 0xcf, 0x96, 0x7e, 0x93, 0x3d, 0x45, 0x95, + 0xce, 0x86, 0x70, 0x86, 0x59, 0x8c, 0xd3, 0x3d, 0x9c, 0x55, 0x4a, 0x36, + 0xa0, 0xc5, 0x62, 0x44, 0xd0, 0x6b, 0x59, 0xb3, 0xb7, 0x58, 0x8b, 0xa9, + 0x7e, 0x3f, 0x44, 0x3c, 0xba, 0xc8, 0x52, 0xd0, 0x3f, 0x97, 0x53, 0xb3, + 0xb6, 0xab, 0xc8, 0x74, 0xb5, 0xae, 0x6c, 0xc5, 0xbe, 0x7d, 0xa1, 0xbf, + 0x73, 0x60, 0x45, 0x83, 0xb2, 0x45, 0x3c, 0x6c, 0x7c, 0xbf, 0x8e, 0x3b, + 0xc8, 0x9d, 0x70, 0x36, 0x50, 0x8f, 0x45, 0x84, 0xbb, 0x3e, 0x63, 0x8d, + 0xc5, 0x86, 0x5e, 0x84, 0x7f, 0x8c, 0x94, 0xa3, 0x63, 0x9b, 0xaf, 0xae, + 0x8a, 0xab, 0x89, 0x80, 0x48, 0x7f, 0xb7, 0x5c, 0xcd, 0x79, 0xb0, 0x92, + 0x5a, 0x79, 0xab, 0x93, 0xbb, 0x9b, 0xc0, 0x63, 0xc8, 0x93, 0x84, 0x43, + 0x6c, 0xb4, 0x9e, 0x87, 0x76, 0x77, 0xd6, 0xaa, 0xb7, 0x71, 0x48, 0x86, + 0x8e, 0x3b, 0x3f, 0x6f, 0x81, 0x53, 0x7c, 0x9d, 0x96, 0xc8, 0x8e, 0x6f, + 0x39, 0x87, 0x35, 0x76, 0x63, 0x69, 0x7f, 0x43, 0x59, 0xb2, 0xa5, 0xa0, + 0x30, 0xbe, 0x53, 0x77, 0x84, 0x5c, 0x9d, 0xcb, 0x91, 0xb1, 0xbc, 0xdc, + 0x76, 0xb4, 0x4c, 0x4d, 0x7e, 0xa4, 0x74, 0xa6, 0x4b, 0xce, 0x8e, 0xad, + 0xae, 0x3b, 0xc3, 0x85, 0x35, 0x51, 0x77, 0x6a, 0x40, 0x48, 0xb6, 0xb3, + 0x92, 0x3d, 0xb0, 0x76, 0xa5, 0xbb, 0x7b, 0x2e, 0x7d, 0x53, 0x7f, 0x43, + 0xbc, 0x62, 0x48, 0xa3, 0x8d, 0x94, 0xc4, 0x6d, 0xa4, 0x5d, 0xa4, 0x6f, + 0x6b, 0xc0, 0x83, 0x4c, 0xa3, 0x9d, 0xa6, 0xbb, 0x7c, 0xcf, 0x89, 0xa2, + 0xbf, 0xd3, 0xb9, 0xa9, 0x46, 0x5f, 0x57, 0x37, 0xa8, 0x32, 0x78, 0xab, + 0x3f, 0x77, 0x76, 0x6b, 0xb6, 0x56, 0xb3, 0xb9, 0x9f, 0x78, 0x76, 0xc1, + 0xc5, 0x96, 0xcf, 0x57, 0xc6, 0xa5, 0xd2, 0xc6, 0xb2, 0x4f, 0xc2, 0x5c, + 0x6c, 0x81, 0x60, 0x60, 0x89, 0x3e, 0x8c, 0xc6, 0xc0, 0x4c, 0x87, 0x7a, + 0x83, 0x3e, 0x6e, 0x8e, 0x70, 0x9d, 0x6d, 0x31, 0x3c, 0x7e, 0x61, 0x41, + 0x62, 0xb5, 0x64, 0x8c, 0x77, 0xbc, 0x97, 0xb5, 0xa8, 0xbd, 0x68, 0xa2, + 0x3c, 0x5d, 0xb0, 0xa1, 0x94, 0x41, 0x8e, 0x79, 0x57, 0x84, 0xa7, 0x9f, + 0xc9, 0x32, 0xc9, 0x96, 0x56, 0xb9, 0x30, 0xc0, 0xca, 0x5e, 0xb9, 0x97, + 0xce, 0xaa, 0x8c, 0x66, 0xbe, 0x3e, 0xc9, 0x53, 0x34, 0x40, 0xd6, 0x59, + 0xaa, 0x93, 0x8c, 0xda, 0x52, 0x63, 0x32, 0xbe, 0x3d, 0x37, 0x72, 0x3c, + 
0xc6, 0x6b, 0x37, 0x35, 0xc0, 0xc2, 0x91, 0x29, 0xa2, 0xc1, 0xc3, 0x9b, + 0x4b, 0x9f, 0x57, 0x77, 0xd6, 0x3f, 0x8e, 0x99, 0x76, 0x5c, 0x48, 0x39, + 0x2f, 0x97, 0x75, 0x67, 0x8b, 0x4b, 0x49, 0xb0, 0xa4, 0x75, 0xca, 0x46, + 0x38, 0x80, 0xc6, 0x3d, 0xb3, 0x2f, 0x35, 0xba, 0x94, 0xca, 0xd7, 0x87, + 0x82, 0xa7, 0xbe, 0x60, 0x4a, 0x53, 0x60, 0xb1, 0xbf, 0x81, 0x52, 0x81, + 0xb0, 0xd0, 0x63, 0x8a, 0x91, 0x50, 0xc2, 0xc7, 0x85, 0x31, 0xbd, 0x38, + 0x53, 0x54, 0x7e, 0x5f, 0xb1, 0x9c, 0x8d, 0x50, 0xad, 0x5d, 0xcd, 0xb6, + 0xac, 0x55, 0x2b, 0x92, 0x21, 0x63, 0xaf, 0x55, 0xa6, 0xaf, 0xcc, 0x5c, + 0x2e, 0x7f, 0x6b, 0x2b, 0x60, 0x76, 0x71, 0xce, 0xb5, 0xbb, 0x47, 0x65, + 0x77, 0x82, 0x3d, 0x71, 0x4a, 0xa4, 0x41, 0x3c, 0x3d, 0x60, 0x92, 0xd3, + 0xca, 0x9b, 0x46, 0x9c, 0x4f, 0xca, 0x6d, 0x7a, 0xc6, 0x79, 0x53, 0x87, + 0x31, 0x50, 0xb2, 0xb3, 0x50, 0x51, 0x8e, 0xae, 0xc1, 0x67, 0xaa, 0x3c, + 0x98, 0x81, 0x6f, 0xb6, 0x51, 0x6a, 0x84, 0xa7, 0x80, 0x3c, 0xa6, 0xa8, + 0x82, 0x7a, 0xa3, 0xa0, 0xbf, 0x5e, 0x8f, 0x6b, 0x61, 0x88, 0xb1, 0xce, + 0x97, 0x32, 0x47, 0x96, 0xa5, 0x96, 0x90, 0x98, 0x96, 0xb8, 0x85, 0xd8, + 0x70, 0xcc, 0xab, 0xca, 0x29, 0x5c, 0xcd, 0x80, 0x51, 0xa5, 0x48, 0x25, + 0xa4, 0x3f, 0x76, 0x2f, 0x4a, 0x70, 0x33, 0x89, 0xae, 0x55, 0x57, 0xbe, + 0x50, 0xa5, 0x9f, 0x3a, 0xbb, 0x4f, 0x29, 0x36, 0xc6, 0x9c, 0x7d, 0x99, + 0x5d, 0x9e, 0xca, 0x52, 0x52, 0x9d, 0x51, 0xcf, 0x39, 0x99, 0xc4, 0xca, + 0x92, 0xbe, 0x69, 0x38, 0xa9, 0xbe, 0x59, 0x62, 0x28, 0x42, 0xc0, 0x71, + 0xd7, 0xbd, 0xce, 0xae, 0x62, 0x77, 0x54, 0x90, 0xba, 0x87, 0x83, 0xa4, + 0x75, 0x9f, 0xc3, 0x3d, 0x45, 0x5e, 0xb5, 0xbb, 0xcf, 0x7a, 0x33, 0x66, + 0x65, 0x67, 0x49, 0xae, 0x4c, 0x5d, 0x66, 0x99, 0x85, 0xae, 0x97, 0x4d, + 0xb0, 0x58, 0x4b, 0x9b, 0x74, 0x7d, 0x55, 0xab, 0x77, 0x6e, 0xcf, 0x60, + 0x84, 0xd4, 0x74, 0xa7, 0x48, 0x8e, 0x3f, 0xa0, 0x5a, 0x71, 0x7a, 0x46, + 0x59, 0xb2, 0xc1, 0x86, 0x8b, 0x94, 0x60, 0xa7, 0xa9, 0x8b, 0xca, 0xcc, + 0x65, 0x7e, 0xc6, 0x6f, 0xa3, 0xc5, 0x65, 0xa8, 0x97, 0x9a, 0xb9, 0x84, + 0x66, 0x46, 0x81, 0xbf, 0x48, 0x51, 0x8b, 0x3f, 0x5e, 0x32, 0x8a, 0xb8, + 0x35, 0xc7, 0x92, 0x71, 0x79, 0xa6, 0x6d, 0xa7, 0x8c, 0x9a, 0x65, 0x4d, + 0x71, 0xb5, 0x2f, 0x44, 0x89, 0x90, 0x82, 0x94, 0x86, 0x8c, 0xbc, 0x3a, + 0x8a, 0xa1, 0xb5, 0xcc, 0x80, 0xa9, 0x5e, 0x40, 0x51, 0x3d, 0x5b, 0x5a, + 0x2e, 0xd2, 0x5f, 0x45, 0x7c, 0xc0, 0xc4, 0x9b, 0xc6, 0x9c, 0x9c, 0xc4, + 0x6b, 0xa9, 0x6e, 0xbb, 0x3e, 0x58, 0x39, 0xbe, 0xad, 0x50, 0x52, 0x6f, + 0x2d, 0x72, 0xae, 0x57, 0xb7, 0x6b, 0x71, 0x76, 0x5f, 0x4f, 0x7c, 0x38, + 0x81, 0x35, 0xc1, 0x87, 0x87, 0x7e, 0x91, 0x48, 0x6e, 0x8f, 0xc9, 0x75, + 0x6b, 0xd2, 0x76, 0xce, 0x97, 0xbd, 0x7e, 0xa9, 0x6c, 0xb5, 0x7c, 0xab, + 0xc1, 0x9d, 0xc3, 0x5e, 0x62, 0x76, 0x58, 0xa5, 0xc6, 0x59, 0x8a, 0xbf, + 0x43, 0x8a, 0x82, 0x4d, 0x81, 0x9e, 0xc1, 0xa3, 0xb6, 0x65, 0x3e, 0x44, + 0x98, 0x46, 0xad, 0xbe, 0xab, 0x6c, 0xb6, 0xaa, 0x32, 0x9c, 0x7d, 0x8e, + 0xb1, 0x6d, 0x50, 0x9d, 0x6e, 0xcc, 0x5d, 0xc5, 0x6c, 0xbf, 0x86, 0x9d, + 0xbb, 0xb6, 0xce, 0x9d, 0xb8, 0x6a, 0x87, 0x79, 0x98, 0x91, 0xc8, 0x7b, + 0x68, 0x74, 0x83, 0xb1, 0x9c, 0xa1, 0x9b, 0x71, 0x7f, 0xc0, 0xa4, 0x56, + 0x4b, 0x9d, 0x83, 0x44, 0xd2, 0x6e, 0xa1, 0x79, 0x6d, 0x3c, 0x6e, 0x5b, + 0x61, 0xba, 0xa7, 0x6b, 0xae, 0xac, 0x81, 0x93, 0x78, 0x9a, 0x48, 0x6c, + 0xb9, 0x3c, 0x86, 0x41, 0xb5, 0xbe, 0xc5, 0xce, 0x8c, 0x69, 0x65, 0x4e, + 0x3a, 0x38, 0xbc, 0x77, 0xa5, 0x55, 0x85, 0x41, 0x92, 0x56, 0x52, 0x59, + 0xa3, 0x61, 0x6f, 0x62, 0xa0, 0x44, 0xa5, 0xb2, 0xa1, 0x6d, 0x36, 0x79, + 0x62, 0x87, 0x45, 0x3a, 0x57, 0x34, 0x8b, 0xb9, 0xbb, 0x6e, 0x35, 0x91, + 
0xa3, 0x45, 0x44, 0x91, 0xcf, 0x48, 0x4b, 0x84, 0x8e, 0x85, 0x6a, 0xbd, + 0xc9, 0x86, 0x3e, 0x36, 0x38, 0x57, 0xc2, 0xb6, 0x65, 0x37, 0xb4, 0x5f, + 0xb0, 0x2f, 0x38, 0x35, 0x2e, 0xcf, 0x7b, 0x7b, 0x40, 0xc5, 0x6f, 0x6d, + 0x7d, 0x88, 0x44, 0xbe, 0xa6, 0xae, 0xa9, 0x3e, 0x44, 0xbe, 0x72, 0x64, + 0x5b, 0xa1, 0x76, 0x48, 0x78, 0x75, 0x4e, 0xac, 0x38, 0x87, 0xc8, 0xcd, + 0xb8, 0x6f, 0xa9, 0x98, 0x94, 0x5e, 0xd1, 0xaa, 0x77, 0x92, 0xac, 0xce, + 0xbe, 0x5d, 0x5d, 0x8a, 0x5d, 0x81, 0x65, 0x65, 0x8e, 0x6c, 0x6b, 0x37, + 0xd1, 0x36, 0x36, 0x6f, 0x79, 0x61, 0xb5, 0x72, 0x5d, 0x37, 0x7f, 0x53, + 0xb8, 0x3f, 0x84, 0xa8, 0x92, 0xac, 0xc6, 0xa5, 0xb1, 0xcd, 0xc0, 0x37, + 0x9a, 0x50, 0xb6, 0x6b, 0x92, 0x73, 0x5d, 0xc2, 0x63, 0x5d, 0x93, 0x5d, + 0xb7, 0x6a, 0xaa, 0x78, 0x9c, 0xad, 0x73, 0x97, 0xb8, 0xcb, 0xcf, 0x93, + 0xc0, 0x94, 0x6e, 0xcd, 0x5a, 0x86, 0x3c, 0xb2, 0x9a, 0x92, 0x5a, 0x51, + 0x8b, 0x73, 0x4d, 0xb2, 0x72, 0x4b, 0xaa, 0xb8, 0x90, 0x94, 0x7b, 0x68, + 0x2f, 0x9a, 0x80, 0x42, 0x84, 0x4a, 0x46, 0x55, 0xbb, 0x7a, 0x3c, 0xb5, + 0xbc, 0xc0, 0x32, 0x33, 0x9e, 0x7b, 0x5d, 0x93, 0x42, 0x60, 0xb9, 0x74, + 0x8d, 0xc1, 0x91, 0x9a, 0xb5, 0x87, 0x98, 0x8d, 0xa3, 0x7c, 0xc9, 0x44, + 0x89, 0xc2, 0x57, 0x61, 0xca, 0x31, 0x42, 0xc3, 0xb6, 0xb5, 0x9d, 0xa6, + 0x70, 0x58, 0x44, 0x6d, 0x4c, 0x91, 0x40, 0xa2, 0x8b, 0x7d, 0xc3, 0xa8, + 0x96, 0xcc, 0x98, 0x89, 0xaa, 0xa6, 0x77, 0x88, 0xc0, 0xa0, 0x94, 0x8b, + 0x7c, 0x81, 0x91, 0xcb, 0x94, 0x73, 0xbe, 0x9f, 0x57, 0x6f, 0x62, 0x3c, + 0x32, 0xce, 0x2f, 0xa3, 0x51, 0x97, 0xb0, 0x79, 0x3e, 0x33, 0x89, 0x9c, + 0xae, 0x97, 0x71, 0xc5, 0x54, 0x55, 0xac, 0x8c, 0xc3, 0x63, 0xb3, 0xcb, + 0xb7, 0x97, 0x89, 0xce, 0xba, 0x66, 0x41, 0x8c, 0x7e, 0x51, 0xa0, 0x6d, + 0x32, 0x84, 0x46, 0xa8, 0x9f, 0xb0, 0xc8, 0x43, 0x37, 0x3a, 0xd5, 0x42, + 0xaf, 0x87, 0xd2, 0x4c, 0xa6, 0x72, 0x5c, 0xcc, 0x7e, 0x62, 0xb3, 0xc3, + 0x52, 0x45, 0xbf, 0x89, 0xcd, 0x90, 0x37, 0x6b, 0xb4, 0x57, 0xa2, 0x67, + 0x84, 0x5c, 0xac, 0xa3, 0xa8, 0x47, 0x3a, 0xb3, 0xc8, 0x61, 0x82, 0x35, + 0x91, 0x3e, 0x51, 0xad, 0xb5, 0x69, 0xc4, 0x35, 0x9e, 0x48, 0x31, 0x69, + 0x80, 0x2d, 0x8d, 0x4d, 0xa1, 0x75, 0x34, 0x56, 0x9a, 0x29, 0x4a, 0x5e, + 0x72, 0x92, 0xcc, 0x78, 0x44, 0x7a, 0xa9, 0x4e, 0x44, 0x7a, 0x83, 0x35, + 0xab, 0x79, 0x4d, 0x80, 0x38, 0x41, 0x39, 0xb1, 0x9d, 0xc2, 0x30, 0x2d, + 0x94, 0x60, 0x6c, 0x65, 0xc9, 0x59, 0x4b, 0x8b, 0xac, 0xcd, 0xb8, 0x46, + 0xb5, 0x85, 0x8b, 0x7c, 0x3e, 0x99, 0xcb, 0x7c, 0x33, 0x63, 0xc0, 0xb2, + 0x87, 0x5d, 0x77, 0x6d, 0xcc, 0x77, 0x50, 0x92, 0x99, 0x45, 0x7a, 0x49, + 0x52, 0x41, 0x44, 0x70, 0xba, 0xc8, 0xc2, 0xbf, 0x40, 0x34, 0x81, 0x71, + 0x91, 0x45, 0xc4, 0xa7, 0x32, 0xb1, 0x4f, 0x71, 0x37, 0x45, 0x52, 0xb4, + 0x88, 0x99, 0x6f, 0x6e, 0x7c, 0x80, 0x97, 0x7b, 0x37, 0x81, 0xcf, 0x90, + 0xc0, 0xb4, 0x62, 0xa1, 0x62, 0x4f, 0x52, 0x63, 0x3d, 0xa2, 0xd3, 0x8c, + 0x7f, 0x6b, 0x72, 0x69, 0xd0, 0xa2, 0x40, 0x4a, 0xa4, 0x59, 0x7d, 0xa6, + 0x8c, 0xad, 0x76, 0x99, 0xa4, 0x8c, 0x87, 0xc5, 0x92, 0x8b, 0x43, 0x5b, + 0xc9, 0x5e, 0x8e, 0x9a, 0x4e, 0xcd, 0xce, 0xa1, 0x97, 0x42, 0xae, 0xa5, + 0x53, 0x4e, 0x37, 0xa5, 0x9b, 0x76, 0x7a, 0xc7, 0x88, 0xb8, 0x30, 0x77, + 0xb6, 0x9b, 0x43, 0xac, 0x45, 0xd1, 0x36, 0xd0, 0xb4, 0x86, 0x3f, 0x7d, + 0xcc, 0x55, 0xa8, 0x6f, 0x51, 0x88, 0x92, 0x89, 0x91, 0x34, 0x97, 0xaa, + 0x34, 0x9c, 0x56, 0x85, 0x60, 0x88, 0x3f, 0x6d, 0x73, 0x8f, 0x65, 0xa5, + 0x85, 0x5f, 0x5c, 0xcd, 0x66, 0x80, 0x34, 0xb0, 0x82, 0x6b, 0xc5, 0x7e, + 0x29, 0xac, 0x37, 0xcd, 0x3e, 0xc3, 0xcb, 0xaa, 0xb3, 0x62, 0xca, 0x75, + 0x6d, 0xc6, 0x55, 0x36, 0xcc, 0x89, 0x37, 0xab, 0x73, 0xb3, 0x5e, 0x6f, + 
0x53, 0xa0, 0x99, 0x72, 0x30, 0x7b, 0x71, 0x74, 0xc4, 0x5b, 0x77, 0x56, + 0xb1, 0xa0, 0x46, 0x8e, 0xba, 0xa3, 0xcf, 0x66, 0x89, 0x3b, 0xb1, 0x7f, + 0x51, 0x45, 0x2e, 0xcb, 0x7a, 0x4b, 0x32, 0x55, 0x29, 0xa8, 0x76, 0x48, + 0x88, 0x65, 0x5c, 0xc7, 0x46, 0x37, 0x68, 0x53, 0x72, 0x2b, 0x87, 0xd0, + 0x86, 0x49, 0x88, 0x8e, 0x99, 0xbc, 0x4f, 0x7b, 0x99, 0xc9, 0xca, 0xb4, + 0xa3, 0xf4, 0x4f, 0x63, 0x90, 0x44, 0xc5, 0x7d, 0x47, 0xc4, 0x49, 0x81, + 0xa3, 0x33, 0x78, 0x89, 0xc9, 0x58, 0x9a, 0x61, 0x2a, 0x98, 0xbf, 0x99, + 0x79, 0xc4, 0xbb, 0x58, 0xd4, 0xad, 0x95, 0x9c, 0x8b, 0x97, 0x31, 0x48, + 0x6e, 0xc0, 0x51, 0x85, 0x4a, 0xcc, 0x4d, 0xd5, 0x80, 0x7e, 0xbd, 0x78, + 0xc0, 0x1c, 0x44, 0xd6, 0x64, 0x2f, 0x95, 0x4f, 0x56, 0x83, 0x32, 0x91, + 0x97, 0x82, 0x35, 0x7b, 0x98, 0xaa, 0xd0, 0x4b, 0x8b, 0xb0, 0x57, 0x72, + 0x51, 0x59, 0xbd, 0x8f, 0x86, 0xa5, 0xc6, 0xa8, 0x3c, 0xbe, 0x4b, 0xba, + 0xcb, 0x40, 0x7d, 0x50, 0xaa, 0x71, 0x28, 0xb4, 0xac, 0xcb, 0x47, 0x4e, + 0x36, 0xc1, 0x90, 0x42, 0x42, 0x6f, 0x65, 0x88, 0x43, 0xc7, 0xb1, 0x57, + 0xa5, 0x78, 0x70, 0x7b, 0x50, 0xd4, 0x90, 0xc7, 0x62, 0xa9, 0x32, 0x51, + 0xbc, 0xbe, 0x37, 0x3a, 0x40, 0x94, 0x43, 0x90, 0x94, 0x9c, 0x5b, 0x4a, + 0x61, 0x79, 0x96, 0xac, 0xd6, 0x82, 0x83, 0x3b, 0x46, 0x31, 0xcb, 0xa4, + 0x35, 0xaf, 0x58, 0xd2, 0xc6, 0xb1, 0xb9, 0x70, 0xca, 0x6a, 0x45, 0x77, + 0x9d, 0x77, 0x42, 0x23, 0x91, 0x82, 0x74, 0xca, 0xe9, 0x9e, 0xc1, 0xc4, + 0xa1, 0x8b, 0x37, 0x44, 0xa2, 0xa3, 0xa2, 0x98, 0x26, 0x69, 0x3c, 0xda, + 0xba, 0x52, 0x51, 0x4e, 0x70, 0x9c, 0x69, 0xb5, 0x6a, 0x8b, 0xbe, 0x55, + 0x77, 0x56, 0xbf, 0x65, 0x65, 0x4c, 0x7f, 0x7c, 0xbd, 0xb2, 0x23, 0x3e, + 0x83, 0x3d, 0xb6, 0xbd, 0x5d, 0x69, 0x3d, 0x3d, 0xb5, 0x8b, 0x45, 0xc3, + 0xbb, 0xbf, 0xbd, 0x45, 0xc4, 0x8a, 0x8f, 0x7f, 0xce, 0x50, 0xc4, 0x80, + 0x41, 0x99, 0x93, 0x9a, 0x3c, 0x66, 0x53, 0x63, 0xa2, 0x72, 0x8b, 0xcd, + 0x3f, 0x85, 0xbd, 0xa2, 0x67, 0x74, 0x37, 0x77, 0xb4, 0xcf, 0x6c, 0x9d, + 0x3b, 0x5c, 0x32, 0x78, 0x58, 0xe2, 0xb2, 0xae, 0x9b, 0x5d, 0x6f, 0x7e, + 0x76, 0x6f, 0x3e, 0xbc, 0x61, 0x7c, 0x81, 0xc1, 0x65, 0x32, 0xa4, 0xc0, + 0x73, 0x74, 0x9f, 0xb1, 0x48, 0x46, 0xba, 0x74, 0xbc, 0xd8, 0xa9, 0x37, + 0x6a, 0x39, 0x7c, 0xa1, 0x6e, 0x91, 0x4e, 0x91, 0x88, 0x5a, 0xbd, 0x99, + 0xca, 0x9f, 0x4b, 0x88, 0x45, 0x5e, 0xc1, 0x59, 0x7d, 0xb7, 0xc0, 0xd6, + 0xb5, 0xa5, 0xb5, 0x6d, 0x37, 0xb3, 0xae, 0x90, 0x7d, 0x6d, 0x39, 0x9d, + 0x31, 0xa5, 0x69, 0x9f, 0x46, 0x76, 0xa6, 0xb9, 0x38, 0xd7, 0x73, 0xce, + 0x2d, 0x2a, 0x33, 0x5b, 0xaf, 0x62, 0x50, 0xd4, 0x94, 0x78, 0x86, 0x9f, + 0x62, 0x58, 0x8d, 0xb0, 0x6e, 0xba, 0x6f, 0x75, 0x87, 0x36, 0x41, 0x6d, + 0xc1, 0x8a, 0xa0, 0xca, 0xb5, 0xa9, 0xcb, 0x79, 0x8a, 0xb5, 0xb7, 0xaf, + 0xb8, 0x7a, 0xce, 0x98, 0x38, 0x3f, 0x8f, 0x5e, 0x50, 0xb9, 0x53, 0x9e, + 0x75, 0x69, 0x6a, 0x3c, 0x4f, 0xc2, 0x6d, 0xa8, 0x42, 0xc1, 0x37, 0x39, + 0x5f, 0x69, 0x50, 0x70, 0x46, 0x4e, 0x8c, 0x33, 0x98, 0x38, 0x7f, 0xb9, + 0x39, 0x9e, 0x7d, 0x45, 0x8a, 0xa9, 0x9c, 0x54, 0xac, 0x3b, 0x6e, 0x9a, + 0x57, 0x65, 0x58, 0x53, 0xd3, 0x82, 0x57, 0x59, 0xd8, 0xde, 0x79, 0x8e, + 0xd4, 0x2d, 0xbd, 0x4c, 0xa5, 0x98, 0xbb, 0xcd, 0x75, 0x65, 0x42, 0x30, + 0x40, 0xa4, 0x51, 0x3a, 0x78, 0x2d, 0xd2, 0x84, 0x7a, 0x58, 0xb6, 0xcc, + 0x3b, 0x5a, 0xa4, 0xb2, 0x35, 0xa1, 0x4e, 0x98, 0x4d, 0x62, 0x87, 0x62, + 0xc3, 0x40, 0xc5, 0x91, 0xb7, 0xa6, 0xda, 0x8e, 0x54, 0x65, 0x98, 0x34, + 0x2e, 0x73, 0x40, 0x6e, 0x2b, 0x2a, 0x50, 0x5c, 0x5d, 0x62, 0x5f, 0xd3, + 0xb0, 0x34, 0xaf, 0x7a, 0x8e, 0x59, 0x5a, 0x68, 0xa5, 0xa0, 0x40, 0xb9, + 0xd7, 0xba, 0xbe, 0x92, 0x4d, 0x7e, 0xa4, 0xa5, 0x6e, 0xaf, 0x96, 0x6c, + 
0xc5, 0xc7, 0x94, 0xcc, 0x81, 0x38, 0x32, 0x56, 0x35, 0x6f, 0x87, 0xa9, + 0xb9, 0x2e, 0x9e, 0x6c, 0x27, 0x8c, 0x53, 0xc8, 0x55, 0x52, 0x46, 0x68, + 0x3c, 0xa5, 0x3b, 0x71, 0xba, 0xab, 0x5b, 0x9d, 0x53, 0x67, 0x3b, 0x38, + 0x78, 0x7d, 0xce, 0x74, 0x6b, 0x98, 0xa1, 0xb1, 0xb9, 0x81, 0x92, 0x60, + 0x5d, 0x7d, 0xa4, 0xac, 0xc7, 0x22, 0x8c, 0xd6, 0x8f, 0xb2, 0x63, 0xc9, + 0xa2, 0xaa, 0xa7, 0x9f, 0xab, 0x98, 0xa7, 0x47, 0xb7, 0x49, 0x9f, 0xc9, + 0x95, 0x3f, 0x48, 0x71, 0x7b, 0x71, 0x8a, 0xc2, 0x66, 0x9a, 0x9a, 0x88, + 0xa1, 0xa7, 0xb2, 0x91, 0xc4, 0xa8, 0xaa, 0xa7, 0xc2, 0xa3, 0xb1, 0xce, + 0xa3, 0x30, 0x68, 0xb1, 0x86, 0xa7, 0xbd, 0xa9, 0x9e, 0x6f, 0xc7, 0x68, + 0xa3, 0x9c, 0xaf, 0x64, 0x4e, 0x48, 0x3e, 0x95, 0x37, 0x4f, 0x4a, 0x3a, + 0x59, 0x65, 0x63, 0x9a, 0xba, 0xd3, 0x40, 0x5a, 0xc7, 0x9f, 0x58, 0x8b, + 0x87, 0x70, 0x6e, 0x3b, 0x7f, 0xb5, 0xa0, 0x8d, 0xaf, 0x59, 0xa9, 0x42, + 0xcf, 0xb1, 0x68, 0x42, 0xb2, 0xb9, 0xb4, 0x98, 0xa4, 0x34, 0xb4, 0x93, + 0x61, 0x9f, 0xc7, 0x7f, 0x5b, 0x76, 0x38, 0xcd, 0x52, 0x35, 0xc1, 0x67, + 0xc6, 0xc1, 0xc4, 0xad, 0x65, 0xa0, 0x70, 0x30, 0x3c, 0x4e, 0x6e, 0xa0, + 0x90, 0x5f, 0x88, 0xb5, 0x5a, 0x67, 0xa5, 0x54, 0x4c, 0x68, 0xca, 0xaa, + 0x50, 0x4d, 0xa8, 0xa3, 0x93, 0x65, 0x71, 0x9b, 0x60, 0x4e, 0xc3, 0x89, + 0x4a, 0xa4, 0x5e, 0xad, 0xc7, 0xb6, 0x61, 0x94, 0xa3, 0x79, 0x52, 0xb6, + 0xbd, 0x6b, 0xcb, 0x66, 0x7e, 0xd0, 0x4f, 0x38, 0x81, 0x4d, 0x6c, 0x58, + 0x8e, 0xaf, 0xa0, 0x5d, 0x8c, 0x95, 0x2a, 0x83, 0x86, 0xb8, 0x42, 0xd5, + 0x7a, 0x41, 0xb1, 0x79, 0x87, 0x49, 0x89, 0xa6, 0x8a, 0x65, 0xae, 0x83, + 0x4f, 0x41, 0xb6, 0x84, 0x33, 0xab, 0x87, 0x94, 0x71, 0xb4, 0x4f, 0x8c, + 0x87, 0x4a, 0x41, 0x6e, 0x4c, 0xca, 0x99, 0x5e, 0x44, 0xa2, 0xaa, 0xca, + 0x84, 0xb4, 0x9f, 0x64, 0xb7, 0xb3, 0x81, 0xca, 0x9d, 0xa0, 0x46, 0x6f, + 0xa6, 0x73, 0x68, 0x7b, 0x3d, 0xb5, 0x8e, 0x33, 0x88, 0x3f, 0x79, 0xa7, + 0x7f, 0x81, 0xcb, 0x34, 0xc3, 0xd3, 0x7c, 0x3c, 0x71, 0xc4, 0x94, 0xb2, + 0x61, 0x58, 0xa2, 0x36, 0x5d, 0xcd, 0xc5, 0x98, 0x9b, 0xd0, 0x95, 0xa9, + 0x85, 0x4b, 0x35, 0xac, 0x2d, 0x64, 0xb1, 0x88, 0x44, 0x90, 0x70, 0x5d, + 0x91, 0x7f, 0x87, 0x54, 0x42, 0x79, 0xc5, 0x65, 0x86, 0x39, 0x72, 0x5b, + 0xc3, 0x42, 0x7e, 0x6e, 0x92, 0x36, 0xb6, 0x7a, 0xb5, 0x9f, 0x6c, 0x38, + 0x7b, 0x9a, 0x7e, 0x66, 0xcf, 0xaf, 0x50, 0xac, 0x99, 0xc0, 0x87, 0x59, + 0xb1, 0xbd, 0x6f, 0x6e, 0xc8, 0x41, 0xb7, 0xac, 0x3a, 0x9a, 0x4d, 0xc7, + 0x2e, 0x71, 0x59, 0x62, 0xc7, 0x67, 0x4f, 0x37, 0xd1, 0xc2, 0x9f, 0xc0, + 0x7d, 0xb5, 0x95, 0xc7, 0x38, 0xaf, 0x9a, 0xb1, 0x5f, 0x47, 0xc1, 0x3b, + 0x5f, 0xab, 0x67, 0x3e, 0x82, 0xd0, 0xab, 0x56, 0x66, 0x70, 0xca, 0x7a, + 0x56, 0xab, 0x6a, 0xb2, 0x52, 0x86, 0xba, 0xb0, 0x4d, 0xa0, 0x43, 0x59, + 0x62, 0xb6, 0x75, 0x58, 0x8f, 0x7b, 0x4e, 0x6a, 0x88, 0x96, 0x76, 0x68, + 0x79, 0x5a, 0x97, 0x76, 0xcf, 0x7a, 0xc3, 0x4b, 0x90, 0x88, 0x53, 0x8a, + 0x3c, 0xc8, 0x84, 0x78, 0x5c, 0x7c, 0x75, 0x44, 0x98, 0x5f, 0x44, 0xba, + 0x88, 0x9d, 0xa2, 0xbc, 0x43, 0xb0, 0x65, 0x52, 0x5b, 0x61, 0x30, 0x35, + 0x69, 0xc2, 0x49, 0xb6, 0x79, 0xd0, 0xa8, 0x31, 0x7e, 0x70, 0x43, 0x6f, + 0x91, 0x8e, 0x9c, 0x98, 0x31, 0x36, 0xa3, 0x87, 0x99, 0x3f, 0x35, 0x5b, + 0x89, 0x62, 0xbf, 0x30, 0xa2, 0xa1, 0xc6, 0xcb, 0xcf, 0x5a, 0x4a, 0x81, + 0x3d, 0xb7, 0x8a, 0x57, 0x5a, 0xa2, 0x73, 0x47, 0x8c, 0xa7, 0x59, 0x3f, + 0x96, 0x92, 0xc5, 0xba, 0x3b, 0x37, 0xb1, 0x6c, 0x52, 0x83, 0x41, 0xae, + 0x69, 0x9b, 0x42, 0x99, 0x73, 0x7e, 0x89, 0x34, 0x75, 0xa5, 0xc8, 0x4e, + 0x89, 0x48, 0x60, 0xc1, 0xb3, 0x66, 0x36, 0xc9, 0x54, 0x5a, 0xb0, 0xc3, + 0x37, 0x81, 0x41, 0xa7, 0xce, 0x6f, 0x42, 0xc4, 0x75, 0x89, 0xcf, 0xc4, + 
0x94, 0x5b, 0x4e, 0x40, 0x44, 0x8f, 0x77, 0xb9, 0xad, 0x50, 0x7b, 0x61, + 0xa9, 0xb8, 0x60, 0x77, 0x6d, 0xc2, 0x3e, 0x38, 0x4d, 0xba, 0x8c, 0x9a, + 0x3b, 0xd2, 0xc6, 0xd7, 0xd8, 0x9b, 0x82, 0x54, 0x45, 0xae, 0x8c, 0x91, + 0x4e, 0x4e, 0xd3, 0xd3, 0xc5, 0x7c, 0x64, 0xb7, 0x75, 0x64, 0x51, 0x43, + 0x5d, 0xbc, 0xb7, 0xac, 0x41, 0x67, 0x71, 0x72, 0xba, 0x4f, 0x90, 0x9a, + 0xa0, 0xa4, 0xcf, 0xc2, 0x57, 0x8c, 0x4e, 0xa4, 0x4c, 0x88, 0x36, 0x86, + 0xdd, 0xb1, 0x42, 0x43, 0x9f, 0x70, 0xc8, 0xd5, 0x88, 0xbc, 0xbe, 0x6c, + 0x91, 0xc9, 0x54, 0xa2, 0xa1, 0x3b, 0x48, 0xc7, 0x46, 0xa2, 0xb8, 0x63, + 0x3c, 0x6c, 0x5b, 0xad, 0x68, 0xd2, 0xcf, 0x5d, 0x33, 0x99, 0x44, 0xc6, + 0x59, 0xd0, 0x80, 0x73, 0x59, 0x8e, 0xd6, 0x76, 0x34, 0x85, 0xd8, 0xa1, + 0x64, 0xb9, 0x78, 0x9d, 0x4a, 0x8c, 0xcf, 0x57, 0xad, 0xa6, 0x63, 0x3d, + 0xa3, 0x63, 0x6b, 0x68, 0xb8, 0xa9, 0x62, 0xc7, 0xb5, 0x35, 0x4e, 0x93, + 0x7e, 0x92, 0x6b, 0xcb, 0xcb, 0x63, 0xa9, 0x8d, 0xb5, 0xb2, 0x48, 0x7a, + 0x59, 0xc6, 0x9c, 0x7f, 0x7c, 0x6e, 0x79, 0xca, 0x8e, 0x3c, 0x8e, 0x96, + 0x40, 0xa0, 0x62, 0x88, 0x84, 0x54, 0x42, 0x48, 0x4e, 0x74, 0x50, 0x56, + 0x4d, 0xb5, 0x60, 0x93, 0x8e, 0x48, 0x45, 0x4e, 0xd9, 0xcd, 0xbd, 0x93, + 0x96, 0x9a, 0x86, 0x52, 0xa2, 0xa1, 0x59, 0x55, 0x3e, 0x9f, 0x75, 0x9f, + 0x46, 0x5b, 0xce, 0xb1, 0x45, 0x4e, 0x79, 0x94, 0x9a, 0x52, 0xd3, 0x8d, + 0x55, 0xdf, 0x57, 0x98, 0x54, 0x51, 0x87, 0xa7, 0xc7, 0xab, 0x8a, 0x68, + 0x2a, 0x63, 0x36, 0xcf, 0x66, 0x5e, 0x7f, 0xb4, 0xc2, 0xa1, 0x35, 0x93, + 0xa8, 0xb5, 0x89, 0x57, 0x82, 0x4e, 0xc0, 0x9f, 0xa9, 0x6a, 0x8f, 0xcf, + 0xb7, 0xb2, 0x60, 0x7d, 0xb3, 0xb3, 0x95, 0xbc, 0xb7, 0x4d, 0x54, 0xc6, + 0x63, 0x8b, 0x27, 0x69, 0x89, 0x7d, 0x49, 0xa4, 0x7d, 0x45, 0x5e, 0xbe, + 0xb2, 0xc1, 0xcc, 0x74, 0x59, 0x86, 0xb3, 0xa3, 0x5b, 0x93, 0x70, 0x38, + 0x8b, 0x56, 0x3d, 0xab, 0x94, 0x66, 0x59, 0x45, 0x99, 0x50, 0xc6, 0x47, + 0x8a, 0x60, 0xad, 0x48, 0xb9, 0x50, 0xbe, 0xad, 0x7a, 0xcd, 0x4e, 0x73, + 0xbe, 0xa4, 0x8e, 0x82, 0x57, 0x56, 0xe5, 0xa8, 0x74, 0x9c, 0xb4, 0xcd, + 0xc1, 0xd0, 0xad, 0x49, 0xbb, 0xa2, 0x35, 0x4a, 0xa1, 0x54, 0x7d, 0x37, + 0x5c, 0x43, 0x81, 0xc9, 0x42, 0xc2, 0xb6, 0x6f, 0x9b, 0x3c, 0xde, 0xa5, + 0xd4, 0x5d, 0xb6, 0x75, 0x7b, 0x78, 0x41, 0x4b, 0xba, 0xc0, 0x54, 0xc6, + 0xd3, 0x4e, 0xd1, 0xa2, 0x6e, 0x6f, 0x3c, 0x5b, 0x51, 0xbd, 0x4c, 0xc6, + 0x2f, 0x3c, 0x9f, 0x6e, 0x87, 0xca, 0x92, 0x48, 0xb7, 0x76, 0x4a, 0xb5, + 0xbc, 0xb4, 0x4e, 0x98, 0xd8, 0x53, 0x9b, 0x79, 0x5c, 0xb9, 0xb0, 0x36, + 0xa1, 0x47, 0x91, 0xb0, 0xba, 0x90, 0x64, 0x43, 0xaf, 0x74, 0xb4, 0xcb, + 0x67, 0x60, 0xa6, 0xb2, 0xcd, 0x96, 0x73, 0x40, 0x9a, 0x23, 0x36, 0x6c, + 0xc3, 0x74, 0x5e, 0x60, 0x83, 0x3a, 0x70, 0x50, 0x6c, 0xb8, 0x93, 0x54, + 0x45, 0x45, 0x34, 0xa4, 0x3d, 0xcd, 0x7d, 0x81, 0xb4, 0xc6, 0xe1, 0xa1, + 0x49, 0x49, 0x93, 0x9c, 0x79, 0x9d, 0xb4, 0x81, 0x4f, 0xb3, 0x63, 0x60, + 0x3a, 0x5b, 0x91, 0x5a, 0x53, 0xbe, 0xab, 0x54, 0x48, 0x82, 0x4f, 0x5e, + 0xc6, 0x8e, 0x35, 0x88, 0xbd, 0x83, 0x39, 0xc5, 0x7f, 0xb8, 0x3d, 0xc6, + 0x71, 0x71, 0xa2, 0x5f, 0xd0, 0xbe, 0xae, 0x3c, 0xbb, 0x63, 0x8c, 0x54, + 0x87, 0x98, 0x5c, 0x48, 0xb7, 0x58, 0xa5, 0x87, 0xb7, 0x65, 0x3c, 0x98, + 0x33, 0x3f, 0xc9, 0x6f, 0x9d, 0x58, 0x9c, 0xc4, 0xa0, 0xb1, 0x64, 0x7b, + 0x60, 0xba, 0xa2, 0x92, 0xa7, 0x9d, 0x82, 0xd6, 0x33, 0xb9, 0x31, 0x74, + 0x7d, 0x7e, 0x34, 0xa5, 0x8c, 0xbe, 0xa6, 0x8c, 0x8a, 0xc6, 0x3f, 0x66, + 0x38, 0x34, 0x3f, 0x69, 0x94, 0x96, 0x4a, 0x70, 0x3b, 0x5f, 0xaa, 0x30, + 0xd5, 0xbb, 0x59, 0x32, 0x9e, 0x64, 0xa0, 0x60, 0x57, 0x49, 0x83, 0xba, + 0x52, 0x7c, 0x5c, 0x38, 0x68, 0x34, 0x3a, 0x56, 0x3f, 0x77, 0x6a, 0x6b, + 
0x9a, 0xbb, 0x66, 0xb8, 0x34, 0xa8, 0xc4, 0xad, 0x91, 0x2f, 0x8d, 0xa8, + 0x7f, 0x58, 0xcb, 0x3e, 0x49, 0x58, 0x91, 0xb7, 0xbb, 0x96, 0x41, 0xc8, + 0x38, 0x94, 0x5a, 0x6b, 0x80, 0xa0, 0x3d, 0x31, 0xd6, 0x79, 0x99, 0x84, + 0x5c, 0x6d, 0xcb, 0x75, 0x5a, 0xb2, 0x97, 0x34, 0xc3, 0xac, 0x59, 0xaa, + 0x74, 0x38, 0xaf, 0x67, 0x3d, 0x63, 0xba, 0x74, 0xd2, 0xb0, 0x35, 0xbc, + 0x7c, 0x6f, 0x2f, 0xd3, 0x28, 0x50, 0xc1, 0xb7, 0x68, 0xcc, 0x58, 0x95, + 0x91, 0x35, 0xbb, 0x42, 0x6e, 0x9b, 0xbc, 0x99, 0x59, 0x9b, 0x4e, 0xb0, + 0xa5, 0x77, 0x89, 0x89, 0xc2, 0x53, 0x48, 0x5e, 0x45, 0x54, 0x41, 0x9b, + 0xa1, 0xa8, 0x7e, 0x62, 0x80, 0xbe, 0xd7, 0x6a, 0x83, 0x9f, 0x78, 0x3a, + 0x3c, 0x41, 0xab, 0x4e, 0x9f, 0xc0, 0x35, 0x68, 0x36, 0x7b, 0x4a, 0xb8, + 0xbf, 0xa7, 0x9d, 0x55, 0x48, 0xd3, 0x7f, 0x5c, 0x60, 0xcc, 0x48, 0x6e, + 0x3f, 0x86, 0x45, 0xa6, 0x50, 0xd4, 0x49, 0xba, 0x3d, 0x34, 0xa5, 0x3e, + 0x5a, 0x47, 0x9c, 0x69, 0x63, 0x52, 0x56, 0x57, 0x54, 0xa1, 0xd4, 0x6f, + 0xd7, 0x76, 0x66, 0x4a, 0x84, 0xbf, 0x49, 0x65, 0x45, 0x53, 0x72, 0xa1, + 0x58, 0x41, 0xa8, 0xa8, 0xbb, 0x85, 0x96, 0x7f, 0x4e, 0xb0, 0x34, 0x9e, + 0x55, 0x57, 0x9b, 0xce, 0x33, 0x49, 0x94, 0x7c, 0x7c, 0xc9, 0xa3, 0x88, + 0x5c, 0x3e, 0x60, 0x71, 0x96, 0x95, 0xc3, 0x89, 0x5b, 0x56, 0x82, 0x83, + 0x75, 0x64, 0xa4, 0xc3, 0x51, 0xb0, 0x32, 0x52, 0xbb, 0xbd, 0x4c, 0x6e, + 0x63, 0xae, 0x53, 0xbf, 0x9b, 0x92, 0xaf, 0x34, 0x9f, 0xb5, 0xd5, 0x86, + 0xb1, 0x99, 0xcd, 0x3a, 0x99, 0xc0, 0xd2, 0x51, 0xd0, 0x50, 0x81, 0x9d, + 0xa6, 0x84, 0x57, 0x41, 0x60, 0xc2, 0x65, 0x5d, 0x83, 0xab, 0x34, 0xb5, + 0xb1, 0xc6, 0x9d, 0x61, 0x63, 0x9a, 0xab, 0x38, 0xc7, 0xcb, 0xac, 0xae, + 0xbe, 0x8d, 0x9d, 0x48, 0x93, 0x54, 0x86, 0xca, 0xb2, 0x7b, 0x7f, 0x3c, + 0x91, 0x9d, 0xb7, 0x9b, 0x58, 0x82, 0x6e, 0x9c, 0x61, 0xd2, 0x6d, 0x85, + 0xd1, 0x98, 0x97, 0xd4, 0xc1, 0x43, 0x6b, 0x7b, 0x54, 0xa2, 0x35, 0xc0, + 0x9d, 0xc6, 0x6d, 0x3d, 0x7f, 0x7d, 0x91, 0xaa, 0x95, 0x9d, 0x42, 0x4f, + 0x66, 0x48, 0xac, 0x51, 0x92, 0x64, 0x42, 0x3d, 0xce, 0x82, 0x72, 0x61, + 0x68, 0x40, 0x5b, 0x86, 0x7a, 0xd8, 0x6b, 0x59, 0x58, 0x98, 0x90, 0x6d, + 0x4b, 0x5f, 0xb9, 0x8e, 0xa6, 0x67, 0xa8, 0x6d, 0x5b, 0x8a, 0xbc, 0x5c, + 0x88, 0x8f, 0xa2, 0x86, 0xc1, 0x82, 0x8c, 0x54, 0x41, 0xaf, 0xa7, 0x40, + 0xc8, 0xb2, 0x6d, 0xb3, 0x6d, 0x33, 0x50, 0x89, 0x33, 0x42, 0x32, 0xbc, + 0x62, 0xa1, 0x56, 0x3f, 0x90, 0xab, 0x89, 0x71, 0xc0, 0xa7, 0x54, 0x98, + 0x5b, 0x55, 0x9c, 0xc6, 0x56, 0x68, 0x6b, 0x4b, 0xc2, 0x67, 0x86, 0x5d, + 0x5b, 0x52, 0xc8, 0xa8, 0x94, 0x6b, 0x8f, 0xa9, 0x61, 0x68, 0x63, 0xb1, + 0x71, 0x38, 0x55, 0x3e, 0x59, 0x45, 0x8e, 0xa0, 0xb4, 0x6a, 0x57, 0x81, + 0x55, 0x67, 0x5f, 0x8d, 0x64, 0xc7, 0x7c, 0x48, 0x9f, 0x67, 0x7a, 0x5c, + 0xb2, 0xa0, 0x9f, 0x8c, 0xd0, 0x92, 0x59, 0x6a, 0x65, 0x2e, 0x44, 0x8f, + 0x4f, 0xb8, 0x92, 0x40, 0xd3, 0x64, 0x89, 0x66, 0xc8, 0xaa, 0x59, 0x82, + 0x9c, 0x99, 0x4b, 0x60, 0x59, 0xc0, 0x83, 0x60, 0x95, 0x9e, 0x52, 0x4e, + 0x4e, 0x29, 0x84, 0xc3, 0x67, 0x44, 0x37, 0x8f, 0x82, 0x89, 0x3e, 0x54, + 0x52, 0xb6, 0x89, 0xbd, 0x5d, 0x94, 0x51, 0x6a, 0x8d, 0xc0, 0x94, 0x8e, + 0xbb, 0xbd, 0x6e, 0x3f, 0x68, 0x99, 0x76, 0xb8, 0x4d, 0x88, 0x5c, 0xcd, + 0x54, 0x92, 0x4e, 0xbf, 0xbd, 0xdb, 0x89, 0x89, 0x7b, 0x61, 0x88, 0x72, + 0x78, 0x55, 0x3e, 0x55, 0x3a, 0xcd, 0x54, 0x32, 0x63, 0x87, 0xd4, 0xb1, + 0x80, 0x60, 0x4c, 0xa7, 0x83, 0x6c, 0xbc, 0x38, 0x87, 0x97, 0x5f, 0x4f, + 0xd8, 0x81, 0x33, 0xbd, 0x69, 0xaf, 0x3c, 0x9e, 0x50, 0x5a, 0xb3, 0x54, + 0xb9, 0x2f, 0xa2, 0x4a, 0x59, 0x41, 0x59, 0x53, 0xbb, 0x91, 0x9f, 0x6e, + 0x58, 0xc1, 0xb7, 0xc0, 0x74, 0xd0, 0x43, 0xb7, 0x80, 0x59, 0x54, 0x83, + 
0x8f, 0x8c, 0x4a, 0x39, 0xa7, 0x69, 0xaa, 0xac, 0xb9, 0x8e, 0x8f, 0x40, + 0x63, 0xd1, 0x55, 0x7e, 0xb0, 0x91, 0x96, 0x96, 0x86, 0x43, 0x88, 0x75, + 0xd0, 0x64, 0x33, 0x8e, 0x86, 0x97, 0x60, 0x45, 0x37, 0x3d, 0x79, 0xc5, + 0x4b, 0xcb, 0x96, 0x60, 0x68, 0x78, 0xcc, 0x98, 0x82, 0x9f, 0x53, 0x76, + 0x66, 0xbc, 0xb2, 0x36, 0x34, 0xc2, 0x5f, 0x94, 0xbf, 0x44, 0xb2, 0x6d, + 0x39, 0x5f, 0xad, 0xaa, 0x40, 0xce, 0x74, 0x7b, 0xb6, 0xa2, 0x75, 0x5a, + 0x50, 0x70, 0x54, 0xa4, 0x3e, 0xb2, 0x6b, 0xb3, 0x4b, 0x97, 0x40, 0x36, + 0x47, 0x50, 0x3b, 0xd3, 0xd1, 0x56, 0xcc, 0x33, 0x7d, 0x3f, 0x96, 0xb0, + 0x7e, 0x51, 0x75, 0x6a, 0x32, 0xc0, 0x62, 0x35, 0x7c, 0x4a, 0x52, 0x43, + 0x4a, 0x60, 0xb8, 0x56, 0x96, 0x55, 0xd0, 0xd1, 0xc5, 0x65, 0xbe, 0x36, + 0x68, 0xa7, 0xa6, 0xc5, 0xce, 0xc8, 0x4e, 0x99, 0x2b, 0x77, 0xb1, 0xc1, + 0x74, 0x5e, 0x51, 0x32, 0x59, 0x85, 0x9c, 0xa3, 0x92, 0x7e, 0x85, 0x33, + 0x39, 0x92, 0xad, 0x62, 0x7d, 0x4d, 0x58, 0x44, 0x32, 0x3c, 0x5d, 0x4e, + 0x48, 0xa0, 0x6b, 0x5d, 0x93, 0x8f, 0x90, 0x84, 0x65, 0x3b, 0x86, 0x8a, + 0x54, 0x7e, 0x67, 0xc4, 0xb2, 0xc5, 0x8e, 0x8c, 0x56, 0x75, 0xc4, 0xc1, + 0x8d, 0x5d, 0x53, 0xc0, 0x38, 0x49, 0x8d, 0x4a, 0x83, 0x39, 0x41, 0xb4, + 0xa9, 0x32, 0xbd, 0x45, 0x4c, 0x90, 0xa9, 0x77, 0xb8, 0x31, 0x80, 0xa2, + 0x54, 0x97, 0x4d, 0xb7, 0x3e, 0x56, 0xa0, 0x83, 0x81, 0xb6, 0x79, 0xd0, + 0x34, 0xc6, 0x59, 0xc1, 0x39, 0x4b, 0x55, 0x6c, 0x34, 0x66, 0xb3, 0xbf, + 0xd3, 0x72, 0x56, 0xa9, 0x70, 0xcc, 0xcc, 0xa4, 0x66, 0x89, 0x7d, 0x7c, + 0xcb, 0x94, 0x9d, 0x58, 0x35, 0x38, 0x63, 0x45, 0xb1, 0x64, 0x83, 0xbc, + 0x5f, 0x64, 0xae, 0x32, 0x77, 0x7a, 0xd8, 0x4c, 0xc8, 0xac, 0xc5, 0xb4, + 0xc7, 0x84, 0x58, 0x49, 0x96, 0x81, 0x49, 0x37, 0x62, 0xab, 0x49, 0x7f, + 0x69, 0x70, 0x7a, 0x39, 0xc4, 0xb5, 0xd4, 0x8d, 0x6b, 0x74, 0xb5, 0xb3, + 0x9b, 0x4e, 0x77, 0xc5, 0x66, 0x4b, 0x43, 0xaa, 0xa2, 0x32, 0x66, 0xb4, + 0x3c, 0x5b, 0x56, 0xb4, 0xb6, 0xc8, 0x57, 0xb3, 0xb1, 0x4a, 0x55, 0x90, + 0x79, 0x3a, 0xb7, 0x56, 0xc4, 0x8b, 0x44, 0x66, 0x3f, 0x36, 0xc5, 0x79, + 0x3b, 0x31, 0x59, 0x64, 0x67, 0x99, 0xa0, 0xcc, 0x29, 0xb2, 0x5c, 0x9a, + 0x61, 0xba, 0x80, 0x5c, 0x66, 0x91, 0x30, 0xa8, 0x58, 0x56, 0x71, 0x95, + 0xab, 0x68, 0xb8, 0x91, 0x8d, 0xd8, 0x87, 0x42, 0x40, 0x99, 0xbd, 0xa6, + 0x40, 0xa0, 0x8a, 0x32, 0xa8, 0x39, 0x30, 0x52, 0xc9, 0x37, 0x79, 0x7c, + 0x4c, 0xae, 0x45, 0x5c, 0xc9, 0xc6, 0xd2, 0x92, 0x3e, 0x61, 0x63, 0x5a, + 0x6e, 0xaf, 0xa9, 0x96, 0x6d, 0x97, 0x56, 0x5f, 0x77, 0xc4, 0x47, 0xc1, + 0xaa, 0x91, 0xd0, 0xc1, 0x3f, 0x88, 0xa4, 0x36, 0x4b, 0x46, 0x4d, 0xaf, + 0x95, 0xaf, 0x84, 0x4e, 0x63, 0x51, 0x87, 0xbc, 0x86, 0xca, 0x66, 0x94, + 0xb0, 0x38, 0x46, 0x5a, 0x8f, 0xa2, 0x6f, 0x67, 0xa6, 0x35, 0xc6, 0x36, + 0x56, 0xb0, 0x45, 0xd0, 0x30, 0x9a, 0xba, 0xb8, 0x53, 0xcf, 0x50, 0xc2, + 0x47, 0x65, 0xc5, 0x98, 0x62, 0xc7, 0x82, 0xae, 0xd1, 0x48, 0x6f, 0x94, + 0x7d, 0xa1, 0xc9, 0x93, 0x3b, 0x7b, 0xcd, 0x36, 0x47, 0x7b, 0x5c, 0xba, + 0x35, 0xb4, 0xc4, 0x59, 0xcf, 0x96, 0x38, 0x77, 0x95, 0x89, 0xb9, 0xa2, + 0x45, 0x67, 0xa0, 0x7a, 0xb5, 0x6e, 0xa3, 0xbc, 0xa6, 0x51, 0x91, 0xa3, + 0x99, 0xb4, 0x81, 0xc5, 0x60, 0x4c, 0x72, 0x79, 0x99, 0x4c, 0x72, 0x57, + 0x96, 0x8a, 0xaa, 0x39, 0xc3, 0x51, 0x6e, 0x91, 0x6c, 0x97, 0x9f, 0xb8, + 0xb8, 0x62, 0x5b, 0x64, 0xc6, 0xd3, 0x39, 0x8d, 0x68, 0x6d, 0xb1, 0xad, + 0xc5, 0x3a, 0x45, 0x5c, 0xab, 0xc0, 0xab, 0xa8, 0xa4, 0xa9, 0x74, 0x7a, + 0x54, 0xd0, 0x4f, 0x32, 0x38, 0xad, 0xb9, 0x9d, 0xa6, 0xd9, 0x4d, 0xa5, + 0x7e, 0xba, 0x79, 0x55, 0x3a, 0x6d, 0x8a, 0xa6, 0x4e, 0xc6, 0xd1, 0x83, + 0x8b, 0x59, 0x31, 0x8d, 0x97, 0xbb, 0x35, 0x6e, 0x7c, 0x79, 0xb2, 0x6a, + 
0xa6, 0x68, 0xb2, 0x88, 0x69, 0x95, 0x33, 0x40, 0x40, 0x3f, 0x4c, 0xb1, + 0xd3, 0x5c, 0x69, 0x96, 0x75, 0x6f, 0x39, 0xa3, 0x73, 0xcb, 0x42, 0x62, + 0x6c, 0x44, 0x33, 0xcb, 0x8e, 0x6c, 0xae, 0x98, 0x73, 0xbd, 0x3b, 0x92, + 0xa6, 0x96, 0xa6, 0xb8, 0x64, 0x3f, 0xcf, 0x33, 0x6d, 0xad, 0x5c, 0x83, + 0xa9, 0x4c, 0x40, 0x92, 0x8a, 0xac, 0xce, 0x53, 0x84, 0x8c, 0x9b, 0x9c, + 0x64, 0x6b, 0xb9, 0x67, 0x5b, 0xbb, 0x5a, 0xc2, 0x36, 0xbf, 0xbd, 0x32, + 0xba, 0xd0, 0xc8, 0x9d, 0x68, 0x3c, 0x36, 0x98, 0xbe, 0x87, 0xc0, 0xb2, + 0x39, 0x5b, 0xa9, 0x53, 0x6b, 0xc8, 0x9d, 0x37, 0x6e, 0xa8, 0x62, 0x54, + 0xc0, 0x3a, 0x48, 0x4a, 0x58, 0x3c, 0xcb, 0xa5, 0xcb, 0xcb, 0x9e, 0xc2, + 0x72, 0x60, 0x80, 0x8a, 0x84, 0x45, 0xb7, 0x64, 0x6b, 0x72, 0x8f, 0xba, + 0x8c, 0x37, 0x7b, 0x45, 0xb7, 0x57, 0x95, 0xa7, 0x84, 0x4d, 0x86, 0x4f, + 0x5c, 0x9c, 0x75, 0xa4, 0xd2, 0x7f, 0xb4, 0x52, 0x7e, 0x77, 0xb3, 0xa4, + 0x51, 0x9b, 0x5d, 0x36, 0xb5, 0xcd, 0xc0, 0x38, 0x8f, 0x4c, 0x9a, 0xc7, + 0xc6, 0x9a, 0x5a, 0xa1, 0xcb, 0x83, 0xb1, 0x51, 0x5a, 0x34, 0x46, 0x91, + 0xd2, 0xa7, 0x39, 0x7f, 0x74, 0x75, 0xca, 0x46, 0x8c, 0x65, 0x7b, 0x64, + 0x9d, 0x86, 0x83, 0x4f, 0xbd, 0xc5, 0xd4, 0xb4, 0x5e, 0x63, 0x8b, 0x7d, + 0x7e, 0xb3, 0x46, 0xb8, 0xd1, 0x48, 0x88, 0x60, 0x76, 0xa8, 0xa9, 0xa4, + 0xbf, 0xb1, 0x8c, 0xcd, 0x80, 0x91, 0xbd, 0xb6, 0x96, 0x5a, 0x58, 0x3b, + 0xcb, 0xa8, 0x35, 0xa9, 0x5d, 0xaf, 0x53, 0x84, 0x39, 0x4e, 0xd2, 0x99, + 0x8a, 0x50, 0x85, 0x51, 0x7a, 0x57, 0xce, 0xcf, 0xa4, 0xc6, 0x86, 0xb2, + 0x9a, 0xa9, 0x88, 0x63, 0x95, 0x6a, 0x92, 0xb4, 0x9b, 0x5f, 0x53, 0xaa, + 0x8e, 0x60, 0x66, 0x51, 0x56, 0x59, 0xc9, 0x7b, 0x6d, 0x8f, 0xc6, 0x5c, + 0x83, 0x6e, 0x61, 0x87, 0x7e, 0x3e, 0xbc, 0xb9, 0x63, 0x51, 0xb0, 0x90, + 0x38, 0x77, 0xbb, 0x50, 0xcf, 0xd1, 0xa6, 0x64, 0x54, 0xc2, 0xb6, 0xb7, + 0xa6, 0xa4, 0x6a, 0x3b, 0xc2, 0xb6, 0x7e, 0x3b, 0xbe, 0x55, 0x97, 0xa0, + 0x8b, 0x95, 0xae, 0xce, 0xb6, 0xaa, 0x3c, 0x8b, 0xd0, 0x43, 0x42, 0xc7, + 0x3e, 0x37, 0xc2, 0x5b, 0xc6, 0x56, 0x8a, 0x53, 0x40, 0x9a, 0x97, 0x7c, + 0xcb, 0xab, 0x4b, 0x5f, 0x71, 0xce, 0x67, 0x36, 0x48, 0x71, 0x9d, 0x42, + 0x74, 0x90, 0x5f, 0x43, 0x2d, 0x5a, 0x6b, 0x5a, 0x7f, 0x4b, 0xbf, 0xb2, + 0x3b, 0x8e, 0x41, 0x75, 0x2e, 0xb9, 0x51, 0xc8, 0xb7, 0x40, 0x75, 0x93, + 0x95, 0xbd, 0x50, 0x8b, 0x50, 0x46, 0x5b, 0x72, 0x5c, 0x69, 0x8b, 0xac, + 0xa7, 0xc9, 0xc9, 0x44, 0xc9, 0x83, 0x79, 0x7b, 0xcf, 0x49, 0x2d, 0x8b, + 0xb0, 0x91, 0xab, 0x35, 0x55, 0xa1, 0xa8, 0x8a, 0xd0, 0x72, 0x82, 0xbf, + 0xc3, 0x56, 0x65, 0x90, 0x9d, 0x4c, 0x52, 0x69, 0xa0, 0x7f, 0x7e, 0xb0, + 0x43, 0x57, 0x5e, 0x73, 0xa9, 0x53, 0x8c, 0xbf, 0x41, 0x4b, 0x91, 0x2f, + 0x7d, 0x53, 0xbf, 0x3a, 0xba, 0x69, 0xcd, 0x90, 0xb6, 0xb3, 0xa2, 0x82, + 0x6d, 0x40, 0x60, 0x7b, 0x6d, 0x50, 0x98, 0x3b, 0x65, 0x40, 0x40, 0x5b, + 0x8a, 0x79, 0xb0, 0xbf, 0x8c, 0x54, 0x6e, 0xa4, 0x38, 0xc8, 0x55, 0x4c, + 0xbf, 0xb6, 0x7b, 0xc5, 0xb2, 0xa2, 0x85, 0x7a, 0x56, 0x8d, 0xcb, 0x86, + 0xa4, 0xbb, 0x51, 0x8e, 0x92, 0x33, 0x3d, 0x2b, 0xb5, 0xb0, 0x57, 0x96, + 0xa7, 0x47, 0x50, 0xb8, 0x6c, 0x5a, 0x8c, 0x5d, 0xd2, 0x4a, 0x3f, 0x65, + 0x72, 0x91, 0xce, 0xae, 0x5b, 0xbb, 0x79, 0x5c, 0x55, 0xbf, 0x50, 0x6c, + 0x71, 0x83, 0x4b, 0x8c, 0x50, 0xa8, 0xc2, 0xc7, 0x62, 0x9a, 0xc0, 0x55, + 0x8a, 0xce, 0xaf, 0x67, 0x66, 0xcb, 0xac, 0x54, 0x9f, 0x5d, 0x69, 0x3d, + 0x50, 0x52, 0x3d, 0x36, 0x38, 0x88, 0x7f, 0x48, 0x76, 0x6b, 0x98, 0x76, + 0xa7, 0x8a, 0xaa, 0x9a, 0x98, 0xca, 0x41, 0xc6, 0xbb, 0x87, 0x73, 0x7e, + 0x60, 0x8f, 0x56, 0x97, 0xc2, 0xbb, 0x3b, 0x3f, 0x41, 0xac, 0xb4, 0x62, + 0x58, 0x62, 0x68, 0x6c, 0x35, 0x3d, 0xce, 0x38, 0x43, 0x44, 0x3c, 0x5e, + 
0x6c, 0x61, 0x8c, 0x3f, 0x35, 0x4b, 0x47, 0x90, 0xc1, 0x4b, 0x38, 0xb5, + 0x36, 0x79, 0x8e, 0x3d, 0xbe, 0x4a, 0x89, 0x88, 0x9e, 0x47, 0x55, 0xcc, + 0xaa, 0x5b, 0xbf, 0x40, 0x80, 0xb4, 0x44, 0xc5, 0x9c, 0x37, 0x8c, 0x94, + 0xb4, 0x9d, 0x7e, 0x64, 0x4b, 0xa3, 0x5d, 0x50, 0xaa, 0xb1, 0xb1, 0x7a, + 0x80, 0xb0, 0xa1, 0x4d, 0x9d, 0x88, 0x68, 0x2f, 0x53, 0x9b, 0x3e, 0x93, + 0xa5, 0xa1, 0xb4, 0xb5, 0x66, 0x49, 0xbc, 0xcc, 0x37, 0x9c, 0x4e, 0x40, + 0x82, 0xa5, 0xa7, 0x81, 0x84, 0x35, 0xce, 0x92, 0xc2, 0x80, 0x49, 0x34, + 0x48, 0x86, 0x3f, 0xbd, 0x5c, 0x87, 0x3c, 0x85, 0x8d, 0xca, 0x52, 0x49, + 0x77, 0x35, 0xb8, 0xa6, 0x80, 0x9e, 0x80, 0x6e, 0x78, 0x98, 0xa9, 0x64, + 0x92, 0x4d, 0x8a, 0x50, 0xa7, 0x6a, 0xac, 0xcb, 0x78, 0x63, 0x7e, 0x8e, + 0x89, 0x59, 0x2e, 0x4a, 0x69, 0x58, 0xc5, 0x34, 0xbf, 0x76, 0x5a, 0x35, + 0x59, 0xa9, 0xb8, 0x4c, 0x50, 0x59, 0x83, 0x52, 0x70, 0xb5, 0x56, 0x55, + 0x6c, 0x66, 0xc4, 0x38, 0x4a, 0x6d, 0x87, 0xbc, 0xa0, 0x91, 0xa2, 0xa5, + 0x3f, 0x49, 0x6d, 0xac, 0xa8, 0x4b, 0x5c, 0x5f, 0x84, 0x8c, 0xba, 0x6d, + 0xa6, 0x61, 0x9d, 0x81, 0xc8, 0x3d, 0x4b, 0x5e, 0xb4, 0x3d, 0x91, 0x68, + 0x4c, 0x4d, 0x2e, 0x54, 0xb3, 0x78, 0x7e, 0x3b, 0x4e, 0xb4, 0x81, 0x72, + 0x6a, 0x85, 0x4a, 0x37, 0xb1, 0x5a, 0x51, 0x74, 0x3e, 0x35, 0x4e, 0x42, + 0x9f, 0x34, 0xb0, 0xbd, 0x4a, 0x3f, 0x8b, 0x4e, 0x58, 0x7b, 0xce, 0x46, + 0x9c, 0xd4, 0x98, 0x53, 0xaf, 0xca, 0x89, 0x87, 0xad, 0xc3, 0x57, 0x6a, + 0xb3, 0x97, 0x7e, 0x36, 0xb0, 0x9d, 0x9c, 0x5b, 0xd0, 0x4d, 0x81, 0x67, + 0x44, 0x90, 0x67, 0xa0, 0x3d, 0xc0, 0x6a, 0xc8, 0x80, 0x51, 0xbf, 0xa9, + 0xb9, 0x7d, 0xb6, 0xbd, 0x59, 0x82, 0x79, 0x7b, 0x49, 0x66, 0xc0, 0x38, + 0x6c, 0x95, 0x94, 0x8e, 0x49, 0x42, 0x95, 0xc8, 0x78, 0x3d, 0x60, 0x41, + 0x4a, 0xb9, 0x67, 0x34, 0x45, 0xa2, 0xaa, 0xc1, 0x34, 0x45, 0x6c, 0xaf, + 0x95, 0x9f, 0x77, 0xce, 0xaf, 0x9c, 0xcb, 0x44, 0xaa, 0x35, 0x44, 0x7c, + 0x68, 0x99, 0xa7, 0x46, 0x71, 0x76, 0xd2, 0x70, 0x5c, 0x4c, 0x66, 0x9d, + 0x60, 0xbc, 0xb3, 0xa8, 0xc1, 0x60, 0x94, 0xc9, 0x69, 0xc7, 0x37, 0xd3, + 0xb9, 0x54, 0xc2, 0xa3, 0x98, 0x65, 0x4c, 0xaf, 0x90, 0xc9, 0x82, 0xc2, + 0x9e, 0x91, 0xa2, 0x8f, 0xce, 0x79, 0x6d, 0x3b, 0xab, 0x4a, 0xa2, 0x6e, + 0x99, 0x8e, 0xbd, 0x95, 0xa8, 0x83, 0x61, 0x9c, 0x48, 0x40, 0x53, 0x80, + 0xa9, 0x5d, 0x87, 0x5b, 0xb4, 0xb6, 0xb4, 0x9d, 0x4d, 0xbf, 0x43, 0x71, + 0x8b, 0x7f, 0x88, 0xb7, 0x6f, 0x3f, 0x94, 0x7d, 0xaa, 0x80, 0xd2, 0x76, + 0x5c, 0xbd, 0x50, 0x73, 0xc6, 0x82, 0x6c, 0xb3, 0xbd, 0x53, 0x89, 0x94, + 0xbe, 0x63, 0x83, 0x90, 0x70, 0xb1, 0x7d, 0x69, 0x72, 0xb7, 0x6b, 0xb1, + 0x92, 0xc6, 0x64, 0xd7, 0xbb, 0x8e, 0xac, 0x6e, 0xb2, 0xa6, 0x3b, 0x8c, + 0x57, 0x83, 0xa9, 0xa8, 0x9f, 0x84, 0xb8, 0x40, 0x48, 0x71, 0x5c, 0x40, + 0x95, 0xbb, 0xc6, 0x6d, 0xb4, 0x54, 0x69, 0xae, 0x91, 0xa5, 0x9e, 0xd4, + 0xae, 0x4b, 0xa9, 0x3b, 0x53, 0xa3, 0x6a, 0x85, 0xaa, 0xa4, 0x7c, 0x36, + 0x91, 0x7b, 0x7d, 0xb7, 0xc9, 0x81, 0x82, 0xb8, 0xaa, 0xb6, 0x97, 0xc7, + 0xa0, 0x99, 0x78, 0x62, 0x58, 0xc9, 0xa9, 0xba, 0x3a, 0x3a, 0x5c, 0xa9, + 0x97, 0x78, 0xd3, 0xc3, 0x5c, 0x44, 0x33, 0xab, 0x42, 0xcb, 0x3c, 0x94, + 0xc5, 0x5e, 0xb1, 0x7e, 0x60, 0x76, 0x40, 0x47, 0x70, 0x71, 0x58, 0x40, + 0x48, 0x76, 0x67, 0x47, 0x7d, 0xb3, 0x4f, 0x8a, 0xbf, 0x92, 0x78, 0x3e, + 0x49, 0x7a, 0xae, 0x3a, 0x3e, 0x48, 0xbd, 0xbd, 0x47, 0x66, 0x91, 0x85, + 0xa0, 0xd4, 0x6b, 0x91, 0x75, 0x8d, 0x8b, 0x3e, 0x70, 0x4a, 0x5e, 0xa2, + 0x4a, 0x57, 0x53, 0x86, 0x84, 0x45, 0x79, 0xc4, 0x8d, 0xaf, 0xcf, 0x2e, + 0xae, 0xb2, 0x83, 0x9a, 0x42, 0x44, 0x4c, 0x99, 0x41, 0xb0, 0x63, 0x65, + 0xb1, 0xcb, 0x89, 0x5e, 0x65, 0x3c, 0xc2, 0x44, 0x72, 0x4b, 0x49, 0xa6, + 
0x95, 0x75, 0x9a, 0xc8, 0xa8, 0x87, 0xa9, 0x7b, 0x4b, 0x9b, 0x68, 0x6f, + 0xaf, 0x61, 0x5c, 0x9c, 0x8e, 0x74, 0x9f, 0x77, 0x3d, 0x62, 0xc5, 0x82, + 0x8d, 0xd0, 0x92, 0x37, 0x63, 0x59, 0x62, 0xc1, 0x53, 0xb3, 0x8d, 0x3c, + 0x38, 0x79, 0x87, 0x6b, 0x5d, 0xd0, 0x48, 0x78, 0x95, 0x51, 0xba, 0x9f, + 0x9a, 0xab, 0xa6, 0x94, 0x66, 0xa2, 0xbc, 0x84, 0x53, 0xb4, 0x4c, 0x2e, + 0xc6, 0x4a, 0xb4, 0x3b, 0xad, 0x9c, 0x9d, 0xac, 0x31, 0x31, 0x60, 0x6e, + 0x8d, 0x3a, 0x9e, 0x7b, 0x6f, 0xbb, 0x3a, 0xc6, 0x77, 0x82, 0x7c, 0x62, + 0xce, 0x90, 0x8d, 0x60, 0x6b, 0x9f, 0x79, 0xad, 0xd2, 0xba, 0x5e, 0x62, + 0x80, 0x43, 0x9d, 0x84, 0xcc, 0x3c, 0x94, 0x56, 0x8e, 0xa3, 0x86, 0xc6, + 0x88, 0x85, 0x77, 0x83, 0xa9, 0x8e, 0x6f, 0x50, 0xa6, 0x4c, 0x9e, 0x43, + 0x73, 0x3a, 0x7f, 0x75, 0xc9, 0xaf, 0x3b, 0x37, 0x3e, 0x90, 0x7d, 0xbc, + 0xc2, 0xc8, 0x31, 0x8a, 0x7f, 0x32, 0x8b, 0x3c, 0x73, 0x3a, 0x53, 0x3a, + 0xa7, 0x83, 0x8e, 0x3d, 0x50, 0xcf, 0xb2, 0x92, 0xab, 0xa8, 0xae, 0x34, + 0x6e, 0x31, 0x5f, 0xc4, 0xc8, 0x53, 0x4a, 0x92, 0x6c, 0xc5, 0x3f, 0x87, + 0x92, 0xaf, 0x45, 0xa8, 0x69, 0xca, 0xbb, 0x83, 0x41, 0xd3, 0x98, 0x49, + 0xb9, 0x94, 0x98, 0x72, 0x44, 0xbe, 0xd3, 0x5d, 0x97, 0x57, 0xac, 0x4c, + 0x69, 0x76, 0xd1, 0x7c, 0x93, 0xb7, 0xb4, 0x63, 0x9b, 0x39, 0x48, 0x98, + 0x49, 0x42, 0xad, 0x89, 0x9d, 0xaa, 0x9f, 0x74, 0x88, 0x95, 0x42, 0x32, + 0x78, 0xb0, 0x4f, 0x41, 0x39, 0x95, 0x60, 0xb7, 0xc9, 0xd1, 0x2e, 0x47, + 0x60, 0x69, 0x54, 0x9e, 0x6c, 0xc8, 0x8e, 0x95, 0x4c, 0x51, 0x64, 0x72, + 0x4c, 0x9a, 0x68, 0x5e, 0xb0, 0x61, 0x43, 0xcb, 0x85, 0xb7, 0x74, 0x76, + 0x5a, 0x9c, 0xd1, 0xba, 0x49, 0xa6, 0x9b, 0x51, 0xb8, 0x7a, 0x6c, 0x67, + 0x68, 0x58, 0x4b, 0xa7, 0x90, 0xc1, 0x8d, 0x30, 0x5e, 0xa9, 0xb3, 0x98, + 0xa8, 0x3b, 0x5d, 0x9c, 0x4f, 0x43, 0x85, 0xba, 0xd2, 0x62, 0xa9, 0x5b, + 0xae, 0x38, 0xa2, 0xb6, 0x53, 0x52, 0x99, 0x46, 0xab, 0x7b, 0xc2, 0x69, + 0x7e, 0x49, 0xd2, 0x7f, 0xbd, 0xa1, 0x93, 0x54, 0x82, 0x59, 0xc0, 0x7f, + 0x6c, 0x64, 0x57, 0xd3, 0xc4, 0x6a, 0x36, 0xb4, 0x42, 0x5c, 0x40, 0x46, + 0x7f, 0x30, 0xc9, 0xbd, 0x8d, 0x81, 0x5b, 0x74, 0xb7, 0xd5, 0xc1, 0x4a, + 0x4d, 0xc7, 0x78, 0x62, 0xb4, 0xa6, 0x46, 0x48, 0x6d, 0x72, 0x3a, 0x68, + 0xcd, 0x3a, 0x51, 0x77, 0x3d, 0x51, 0xcf, 0x4a, 0xc2, 0x6e, 0x70, 0x46, + 0x8e, 0x6b, 0x95, 0x3a, 0x39, 0xcf, 0x6c, 0xa0, 0x5e, 0x4f, 0xb1, 0x99, + 0x51, 0xbb, 0xd1, 0x99, 0xa9, 0xcf, 0x49, 0xb7, 0x7b, 0x83, 0x40, 0x45, + 0x49, 0x41, 0x8d, 0xca, 0xac, 0xb8, 0x42, 0xa6, 0x58, 0xb5, 0x85, 0xc1, + 0x65, 0x9e, 0x36, 0x80, 0x92, 0xb7, 0x92, 0x5e, 0x93, 0x8b, 0x56, 0xaa, + 0x72, 0xa3, 0x76, 0xaa, 0x62, 0xac, 0x4e, 0x8f, 0xb2, 0xc6, 0x3d, 0x60, + 0x53, 0x76, 0x39, 0x96, 0x5f, 0x57, 0x4a, 0x52, 0x3f, 0xa4, 0x97, 0x96, + 0x7d, 0x8b, 0xce, 0xa9, 0x94, 0x3c, 0xad, 0x45, 0x76, 0x3a, 0xc2, 0x72, + 0x7d, 0x5c, 0xb7, 0xa7, 0xaf, 0xb8, 0x5b, 0xa3, 0x57, 0xca, 0x73, 0x66, + 0xc6, 0x4c, 0xab, 0x38, 0xb8, 0x4d, 0x53, 0x66, 0x91, 0x57, 0xcb, 0x51, + 0x92, 0x69, 0x53, 0x6e, 0x4e, 0x62, 0x6f, 0xc4, 0x8b, 0x76, 0xa6, 0xb9, + 0xb5, 0x3f, 0xcd, 0x43, 0x58, 0x79, 0xcf, 0x60, 0x46, 0xc3, 0xce, 0xa4, + 0x3c, 0x6b, 0xba, 0x57, 0x69, 0xaa, 0x54, 0x75, 0x78, 0xb1, 0xb9, 0x5c, + 0x91, 0x46, 0xcb, 0x3c, 0x6e, 0x4c, 0x8f, 0x90, 0xc2, 0x42, 0x35, 0xb5, + 0x8e, 0x3c, 0xa8, 0xa9, 0x6b, 0xcf, 0x8a, 0x9e, 0xa4, 0xd3, 0x98, 0x4f, + 0xac, 0xc5, 0xb6, 0x65, 0xb3, 0x9f, 0x62, 0x83, 0x8c, 0x6c, 0xd4, 0xa5, + 0xbd, 0x63, 0xbb, 0x34, 0x83, 0x6f, 0x58, 0x6c, 0xc3, 0x42, 0x85, 0xa1, + 0x9f, 0x57, 0x7d, 0x47, 0x3e, 0x64, 0x7e, 0xaa, 0x8e, 0x2f, 0x50, 0xc6, + 0xca, 0x41, 0xa1, 0xd0, 0x95, 0x61, 0x49, 0x75, 0x34, 0x9f, 0x5a, 0x42, + 
0x46, 0xa3, 0xc2, 0x9b, 0x46, 0xa7, 0x52, 0x4a, 0x62, 0xb0, 0x78, 0x84, + 0xc0, 0x71, 0xa3, 0xa8, 0x72, 0x44, 0xab, 0xc1, 0x89, 0x3c, 0xcc, 0xa1, + 0xa0, 0x37, 0xaf, 0x74, 0xaf, 0x8b, 0x91, 0x51, 0x39, 0xa4, 0x8f, 0x50, + 0x7d, 0x57, 0xcc, 0xcd, 0x3e, 0x68, 0x7c, 0x5b, 0x79, 0x2d, 0x67, 0xc1, + 0x95, 0x59, 0x69, 0x7f, 0x7b, 0x8f, 0x80, 0xb2, 0x6a, 0xc6, 0x5f, 0x98, + 0x57, 0x6b, 0xa7, 0x82, 0x8d, 0x5b, 0x77, 0xcd, 0xbc, 0xbd, 0x6a, 0x9a, + 0x8c, 0x38, 0x47, 0x55, 0xd0, 0xbb, 0xca, 0x41, 0x63, 0xc0, 0x64, 0x54, + 0xce, 0xa6, 0xb5, 0xc0, 0xb2, 0x52, 0x43, 0x2f, 0x75, 0x69, 0x9f, 0xac, + 0x95, 0xba, 0x88, 0xc1, 0x77, 0xc2, 0xa5, 0xb1, 0x3b, 0x36, 0x49, 0x78, + 0xbc, 0x6c, 0xa3, 0xbe, 0xce, 0x2f, 0x91, 0x34, 0x33, 0xcc, 0xcc, 0x3f, + 0x51, 0xa1, 0x5e, 0x86, 0x7b, 0x38, 0x54, 0x85, 0x9d, 0xa9, 0xab, 0x48, + 0x6b, 0x67, 0x52, 0x5d, 0xc6, 0x85, 0x43, 0x90, 0x62, 0x71, 0xc4, 0x75, + 0x9b, 0x92, 0xc2, 0x9f, 0xa1, 0xc1, 0x4a, 0x97, 0x67, 0xb0, 0x87, 0x52, + 0x31, 0xd3, 0xc6, 0x96, 0xd1, 0x38, 0x5a, 0x3f, 0xb2, 0xb2, 0x47, 0xad, + 0x7f, 0x89, 0xc2, 0x64, 0x82, 0x6b, 0xa9, 0x97, 0x55, 0x42, 0x64, 0x3a, + 0x8e, 0x75, 0xb7, 0xb3, 0x5b, 0x8a, 0x7c, 0x48, 0x8c, 0x34, 0x97, 0xc5, + 0x8e, 0xcd, 0x84, 0x54, 0x93, 0x91, 0x8b, 0x66, 0xc4, 0xd1, 0x35, 0xa6, + 0x95, 0xba, 0xc8, 0xa8, 0x75, 0xc6, 0xae, 0xa3, 0xcc, 0x36, 0x45, 0xa5, + 0x79, 0x33, 0x8a, 0xcc, 0x69, 0xc5, 0x48, 0x88, 0xc3, 0x81, 0x40, 0x8c, + 0x99, 0x99, 0x85, 0x79, 0x5e, 0x60, 0x73, 0x58, 0x3f, 0x53, 0xcc, 0x46, + 0x34, 0x37, 0x4d, 0x6b, 0xbc, 0xce, 0x36, 0xbc, 0xa4, 0x80, 0x5d, 0x5a, + 0xa3, 0xc5, 0xa6, 0x3d, 0xa2, 0x8e, 0x4f, 0xbf, 0x85, 0xbb, 0x38, 0x9f, + 0xbd, 0x60, 0x87, 0x56, 0x92, 0x31, 0x59, 0x7b, 0xa4, 0x3c, 0xae, 0xbd, + 0x6b, 0x7d, 0x7c, 0x68, 0xbe, 0x35, 0x52, 0x62, 0x70, 0x8a, 0x41, 0x43, + 0x80, 0x62, 0x3e, 0x9b, 0xc4, 0x40, 0xa1, 0xae, 0x9b, 0x41, 0x6e, 0xa6, + 0x6b, 0x79, 0x5b, 0x8c, 0x6b, 0xb6, 0x6a, 0xb6, 0xcf, 0x33, 0xa8, 0x52, + 0x5c, 0xb7, 0x9d, 0xc1, 0x57, 0xd3, 0x3f, 0x7b, 0x97, 0x56, 0x8b, 0xd1, + 0xb2, 0xc6, 0x72, 0x9c, 0x63, 0xa4, 0xbd, 0xa0, 0x8f, 0x55, 0x87, 0x8a, + 0x42, 0x48, 0x45, 0x3e, 0xc3, 0xce, 0xa7, 0x99, 0x7c, 0xd1, 0xcb, 0x49, + 0x97, 0x70, 0x95, 0x2f, 0x8f, 0xae, 0xc5, 0x77, 0x9a, 0x81, 0x48, 0x79, + 0x5c, 0x35, 0x79, 0x86, 0x64, 0xad, 0xc2, 0x2e, 0x8e, 0xa8, 0x74, 0x71, + 0xa2, 0x66, 0xb9, 0xc3, 0x4b, 0xa3, 0x50, 0x4c, 0xb3, 0x4e, 0xa3, 0xc8, + 0x7a, 0xa2, 0xbc, 0x50, 0x31, 0xaf, 0x6f, 0x34, 0x7e, 0x7a, 0x76, 0x8b, + 0x55, 0x34, 0x64, 0x39, 0x53, 0x81, 0x8e, 0xc4, 0x7b, 0x5e, 0x73, 0x89, + 0x47, 0x31, 0x77, 0x4c, 0x4c, 0x71, 0x67, 0x37, 0x92, 0x98, 0x7f, 0x6c, + 0x34, 0x5e, 0x3c, 0xca, 0xc1, 0xc5, 0xa3, 0xc6, 0xbd, 0x7a, 0x9c, 0xa8, + 0x49, 0x43, 0x41, 0xa4, 0xbd, 0x3b, 0x54, 0x71, 0x66, 0xb3, 0xcc, 0xa6, + 0x4f, 0x64, 0xba, 0x59, 0x33, 0x50, 0xc5, 0xc1, 0x7e, 0xb3, 0xa1, 0x40, + 0xd1, 0x7d, 0x4b, 0x35, 0xb7, 0xbf, 0x3c, 0x96, 0x91, 0x33, 0x55, 0xb4, + 0x36, 0x40, 0x9b, 0x6b, 0xb0, 0x38, 0xc2, 0x7e, 0x89, 0x76, 0xa3, 0x8c, + 0xb4, 0x80, 0xa9, 0x5b, 0x6e, 0xbb, 0xa9, 0x4a, 0xb2, 0xb9, 0x7d, 0x73, + 0x81, 0x7d, 0x36, 0x49, 0xa0, 0x42, 0xa2, 0x69, 0x51, 0x7e, 0x87, 0xcd, + 0xc5, 0x8d, 0x8c, 0xc2, 0x91, 0xaa, 0xcc, 0xa7, 0x68, 0x6e, 0x93, 0x9b, + 0x5d, 0x57, 0x63, 0x3c, 0x38, 0x38, 0x64, 0xc7, 0x69, 0xc2, 0x73, 0xad, + 0x67, 0xd1, 0x9e, 0x65, 0xc9, 0xa8, 0xba, 0x7c, 0xb8, 0x3c, 0x7e, 0x45, + 0x44, 0x43, 0xb8, 0x42, 0x82, 0xbf, 0x8d, 0x2d, 0x92, 0xa4, 0x7a, 0x93, + 0xc3, 0x38, 0x50, 0x5f, 0x87, 0xc2, 0x9b, 0xa9, 0xc2, 0x75, 0xc8, 0xbd, + 0x35, 0x99, 0x69, 0x93, 0x6f, 0x48, 0x9d, 0x3e, 0x4d, 0x3e, 0x8a, 0x96, + 
0xce, 0x97, 0x8f, 0x9a, 0x87, 0x98, 0x8f, 0x95, 0xb4, 0x89, 0x9e, 0x38, + 0xa5, 0x95, 0x84, 0xa8, 0x41, 0x5c, 0x6f, 0xcc, 0xb1, 0xbb, 0x80, 0x98, + 0xc4, 0x75, 0xc2, 0x3d, 0x89, 0x50, 0x58, 0x7d, 0x8d, 0x45, 0x93, 0xc2, + 0xc9, 0xc0, 0x36, 0x4b, 0x75, 0xbb, 0x53, 0x5b, 0xc0, 0x53, 0xba, 0x55, + 0xb8, 0x7b, 0x50, 0x69, 0x8b, 0x98, 0x97, 0x30, 0xa8, 0x8a, 0x39, 0x40, + 0xd1, 0x74, 0xb4, 0x53, 0xb1, 0x42, 0xa2, 0x67, 0xcb, 0x5e, 0x80, 0x62, + 0x47, 0x6a, 0xa6, 0x7a, 0xa4, 0xa3, 0x3a, 0x95, 0x56, 0x4a, 0x4d, 0x77, + 0x8c, 0xc5, 0x80, 0x42, 0x4e, 0x4d, 0x82, 0xad, 0x4e, 0x50, 0xaf, 0xc0, + 0x4e, 0xaa, 0xa5, 0xce, 0xa8, 0x49, 0xd3, 0xb5, 0x33, 0x54, 0x84, 0xa6, + 0x5e, 0x35, 0xd2, 0x59, 0x53, 0x51, 0xbc, 0x62, 0xb2, 0x46, 0xa8, 0x80, + 0x39, 0x5b, 0x97, 0x92, 0xcb, 0x56, 0x52, 0x4c, 0x31, 0x97, 0x35, 0xb2, + 0x46, 0x87, 0xc4, 0x6d, 0x36, 0x52, 0x37, 0xc6, 0x35, 0xbe, 0x3f, 0xd1, + 0x83, 0x84, 0x39, 0x91, 0x4b, 0x93, 0x32, 0x5f, 0xa8, 0xa0, 0x8c, 0xa8, + 0x2e, 0x9a, 0x48, 0xa5, 0xa4, 0x71, 0x3e, 0x71, 0xa4, 0x4c, 0x9e, 0x3e, + 0xb4, 0x4e, 0xab, 0x8e, 0x88, 0x44, 0x75, 0x53, 0x55, 0x44, 0x50, 0xbe, + 0x33, 0xa7, 0x9e, 0x53, 0x58, 0x92, 0x2b, 0xaf, 0x36, 0xcb, 0x60, 0xa5, + 0xaf, 0x7d, 0xac, 0xca, 0xbb, 0xab, 0x73, 0x65, 0x88, 0x40, 0xb1, 0xa8, + 0x36, 0xc8, 0x5f, 0x90, 0xb7, 0xb9, 0xbc, 0x7d, 0x73, 0x4a, 0xad, 0x88, + 0xbf, 0x69, 0x35, 0x7f, 0x42, 0x69, 0xb4, 0x76, 0x50, 0x3d, 0x8a, 0x6e, + 0xbe, 0x8f, 0xb1, 0xad, 0x32, 0x30, 0x81, 0x6d, 0x44, 0xc1, 0xc6, 0x90, + 0xac, 0x76, 0x90, 0x33, 0x9e, 0x9c, 0x8b, 0x4c, 0xba, 0x74, 0x7c, 0xc9, + 0x5b, 0x9b, 0x7e, 0x40, 0x45, 0x5f, 0x7e, 0x3d, 0x78, 0xb4, 0x87, 0x84, + 0xb1, 0x60, 0x83, 0xc3, 0x66, 0xc6, 0x9b, 0xcc, 0x88, 0x60, 0x5d, 0x3a, + 0xb9, 0xb4, 0xd4, 0x45, 0xc1, 0x34, 0xd1, 0x54, 0xb8, 0x3f, 0xa5, 0xb0, + 0x37, 0xd2, 0x61, 0x65, 0x61, 0xd4, 0x55, 0x45, 0x6a, 0x91, 0xb4, 0x71, + 0xa7, 0xbc, 0x8c, 0x6f, 0xbf, 0x41, 0xb7, 0xcd, 0xb5, 0xb3, 0xa8, 0x71, + 0x36, 0xc3, 0x5e, 0x87, 0xb0, 0x32, 0xcb, 0xba, 0xbb, 0x42, 0x3a, 0xd2, + 0x73, 0xaa, 0x40, 0x8e, 0x6d, 0x9b, 0x6a, 0x8f, 0x73, 0x8f, 0x54, 0x53, + 0xaf, 0x51, 0xa3, 0x3a, 0x4c, 0xd3, 0xc9, 0x8e, 0x45, 0x6b, 0xc0, 0x7b, + 0x9d, 0xb6, 0xad, 0x5c, 0x39, 0x65, 0x34, 0xbf, 0xb1, 0xaa, 0x48, 0x40, + 0x48, 0x65, 0x68, 0x8b, 0xcd, 0x38, 0x67, 0x35, 0xc5, 0x70, 0xaa, 0x73, + 0x85, 0x87, 0x7b, 0x69, 0xa9, 0xb1, 0x96, 0xb6, 0x83, 0x62, 0x91, 0x4c, + 0x8b, 0x5e, 0xa1, 0x7d, 0x39, 0x71, 0x9f, 0x67, 0x96, 0x5d, 0x74, 0x43, + 0x68, 0x6c, 0xb1, 0x6c, 0x93, 0xc2, 0x53, 0x5d, 0xc4, 0x76, 0x6e, 0xa1, + 0xcb, 0x30, 0xa2, 0x34, 0x99, 0xaa, 0x32, 0xbe, 0x5b, 0xb5, 0x51, 0x5b, + 0x35, 0xbc, 0x93, 0x89, 0x30, 0x45, 0xcf, 0x3a, 0x7f, 0x4f, 0x9a, 0x3d, + 0x97, 0xb5, 0xb6, 0x4e, 0xcc, 0x7e, 0x5a, 0xb3, 0x80, 0x46, 0x7f, 0x6e, + 0xb9, 0x88, 0x9e, 0x7b, 0x64, 0xa2, 0x6d, 0x8c, 0x9d, 0xa4, 0x3b, 0x68, + 0x80, 0x4d, 0x46, 0x75, 0xbd, 0xc7, 0x84, 0x76, 0x9b, 0x75, 0x7d, 0xaf, + 0x7e, 0xce, 0x88, 0x7e, 0x9f, 0xb5, 0xa1, 0xc0, 0xbb, 0x78, 0x70, 0x82, + 0x41, 0xa6, 0x8f, 0x7a, 0x84, 0xc1, 0xcc, 0xb3, 0x6f, 0x5d, 0x8c, 0x9f, + 0x98, 0x40, 0xc7, 0x42, 0x81, 0x5f, 0x81, 0x9d, 0xa2, 0x50, 0x72, 0x98, + 0x66, 0xcf, 0x61, 0xa8, 0xcd, 0xc6, 0x98, 0xbd, 0x4f, 0xb3, 0x7c, 0x3c, + 0x90, 0x65, 0x57, 0xbb, 0x5c, 0xb9, 0xbb, 0x9c, 0x69, 0x3f, 0xc0, 0xd0, + 0x46, 0x4f, 0x50, 0xcd, 0x50, 0xcb, 0x9e, 0xb8, 0xbb, 0xa0, 0x80, 0x71, + 0xa8, 0x8d, 0x4d, 0x6d, 0xbf, 0x37, 0x8d, 0x52, 0xc0, 0x7d, 0xb8, 0x50, + 0x5a, 0xbe, 0x81, 0x75, 0x35, 0x6c, 0x50, 0xb3, 0x4e, 0x8e, 0x35, 0xd0, + 0x9b, 0x5a, 0x60, 0x76, 0x4c, 0x75, 0x35, 0x98, 0x5e, 0xa9, 0x45, 0x4d, + 
0x96, 0x9e, 0x80, 0xb3, 0x90, 0x64, 0x3e, 0x81, 0x6d, 0xd0, 0xc4, 0xcb, + 0x36, 0x7c, 0x8b, 0x57, 0x37, 0x56, 0x46, 0x97, 0x9f, 0x80, 0x7c, 0x8e, + 0x6d, 0x69, 0xc0, 0xae, 0x3f, 0xd3, 0x5f, 0x6e, 0xba, 0x43, 0xc8, 0x83, + 0x99, 0x78, 0xc0, 0xb6, 0x96, 0x7b, 0x6a, 0x64, 0x6e, 0x66, 0x4e, 0x6e, + 0xa2, 0x4c, 0x98, 0x8d, 0x64, 0xa6, 0x54, 0xca, 0x61, 0x81, 0x99, 0x71, + 0x84, 0x5c, 0x3d, 0x57, 0x47, 0xa0, 0x7f, 0x49, 0x5b, 0x53, 0x7b, 0x90, + 0x46, 0x9e, 0x33, 0xb2, 0x37, 0xa2, 0x4a, 0xb1, 0xa6, 0x58, 0x64, 0x65, + 0x33, 0x77, 0x72, 0xb1, 0xa0, 0xb3, 0xbf, 0x4f, 0x63, 0xa5, 0x8a, 0x8f, + 0xc3, 0x97, 0x38, 0xb4, 0x8c, 0x73, 0xc6, 0x5a, 0xa2, 0x94, 0x42, 0x73, + 0xcb, 0xc2, 0x60, 0x7b, 0xd1, 0xcf, 0x9a, 0x5a, 0x5c, 0xb1, 0x5b, 0x60, + 0x7c, 0xb3, 0x7e, 0xa8, 0x72, 0x68, 0xcd, 0x9a, 0x82, 0xd0, 0x37, 0x7f, + 0xc4, 0x3a, 0xb3, 0xbf, 0x55, 0x74, 0x66, 0x6b, 0x5c, 0x98, 0x48, 0xb4, + 0xaf, 0x5d, 0x53, 0x47, 0x68, 0x44, 0x5d, 0x7c, 0x50, 0x41, 0xc1, 0xa3, + 0x99, 0x3d, 0x43, 0xcd, 0x7b, 0x3d, 0x92, 0x5c, 0x36, 0xae, 0x9c, 0x35, + 0xb2, 0x50, 0x41, 0x5c, 0x42, 0x3d, 0xd0, 0xb0, 0x7a, 0x63, 0x44, 0x67, + 0x7b, 0x65, 0x4c, 0x5f, 0xa3, 0x9d, 0x6e, 0x39, 0x58, 0xa9, 0x71, 0x4b, + 0x4e, 0x96, 0x3d, 0x69, 0xb8, 0x4c, 0xa8, 0xb5, 0xa3, 0x58, 0xa1, 0x76, + 0x54, 0x32, 0xaa, 0x6c, 0xbe, 0x4d, 0xc1, 0xc1, 0x89, 0x66, 0x82, 0x9b, + 0xa9, 0x5d, 0x58, 0x6b, 0xc0, 0x34, 0xc3, 0x48, 0x8e, 0x3d, 0x8a, 0x39, + 0x8e, 0xc5, 0xae, 0x36, 0xb0, 0xa5, 0xbe, 0x68, 0x99, 0xc6, 0xae, 0x69, + 0x73, 0x54, 0xaa, 0xc2, 0xae, 0xbb, 0xa2, 0x3b, 0x72, 0x56, 0x8c, 0x2f, + 0x33, 0x99, 0x93, 0x93, 0x9c, 0x6c, 0x45, 0x7d, 0x98, 0x65, 0x5c, 0x90, + 0x89, 0x61, 0x9f, 0x4b, 0x85, 0x36, 0x40, 0x30, 0xa6, 0x94, 0xab, 0x7e, + 0x76, 0xc8, 0xa1, 0x81, 0x78, 0x4d, 0x5c, 0xcc, 0xbf, 0x6a, 0x67, 0x45, + 0xb4, 0x56, 0x69, 0xae, 0xb1, 0xcd, 0x91, 0x9b, 0x6f, 0x2e, 0x64, 0xa4, + 0x86, 0x3e, 0xb6, 0x4e, 0xcb, 0x94, 0x3d, 0xc8, 0x46, 0xb9, 0x5d, 0x9b, + 0x8d, 0x90, 0xa0, 0xd3, 0x80, 0xaf, 0xb2, 0x5e, 0x69, 0x6f, 0xd2, 0x75, + 0xb0, 0x67, 0x78, 0x88, 0xc1, 0x8b, 0x7c, 0xc3, 0x6d, 0x5d, 0x8a, 0x82, + 0xc8, 0xa4, 0xc2, 0x42, 0x4c, 0x52, 0xc4, 0x45, 0x38, 0xc8, 0x49, 0x50, + 0xc6, 0x8c, 0x64, 0x9b, 0x84, 0x82, 0xb3, 0xa7, 0x8d, 0xa3, 0x7b, 0x59, + 0x59, 0x4c, 0xcb, 0xc1, 0xad, 0x56, 0x9f, 0xc0, 0x45, 0x6d, 0x5b, 0xc0, + 0x4a, 0xa8, 0x49, 0x60, 0x73, 0x65, 0x83, 0x71, 0x5a, 0x63, 0x4d, 0x7d, + 0x30, 0xb8, 0x65, 0x51, 0xbc, 0x80, 0xcb, 0x99, 0x5d, 0xcf, 0xc9, 0x83, + 0x51, 0x40, 0x4a, 0x7e, 0x98, 0x45, 0x81, 0xa1, 0x47, 0x62, 0x40, 0x57, + 0xa9, 0x9e, 0x4f, 0x86, 0xc6, 0xa6, 0x3d, 0x7f, 0xb8, 0x82, 0xba, 0xc1, + 0xd1, 0x30, 0xb0, 0xcc, 0x46, 0xa9, 0xac, 0xab, 0xb5, 0x54, 0xb3, 0x93, + 0x38, 0x92, 0xa1, 0xa3, 0x7c, 0xcb, 0x7b, 0x4c, 0xa0, 0x8e, 0xad, 0x5f, + 0x7d, 0x9b, 0xc2, 0xb0, 0x73, 0x6a, 0xaa, 0xc9, 0xa0, 0x97, 0x54, 0x3a, + 0x34, 0xcd, 0x92, 0x5c, 0x5b, 0xce, 0x8d, 0xcd, 0xae, 0x66, 0xbd, 0x44, + 0xc4, 0xa5, 0x5c, 0xc1, 0x45, 0x95, 0x8a, 0x5c, 0x4e, 0x3d, 0x38, 0x2f, + 0xbd, 0xb0, 0xc9, 0x9c, 0xa2, 0x4f, 0x9f, 0x64, 0x81, 0x5d, 0xd3, 0xc5, + 0x71, 0x6c, 0xa5, 0xc5, 0xa5, 0x53, 0x5e, 0x66, 0x56, 0x3e, 0xbe, 0x51, + 0x9a, 0x41, 0xbb, 0x89, 0x90, 0x4e, 0xd0, 0xaf, 0x50, 0x99, 0x91, 0x78, + 0xb1, 0x64, 0xac, 0x5b, 0x34, 0x71, 0x4b, 0xd6, 0x7e, 0x70, 0x7d, 0x3e, + 0xcf, 0x8a, 0xb7, 0xb5, 0x6d, 0xc7, 0x9d, 0x82, 0xae, 0x5a, 0x8e, 0xca, + 0x45, 0x35, 0x6d, 0x93, 0x98, 0x43, 0x9e, 0xc0, 0xa1, 0xc0, 0x5a, 0x9d, + 0x4d, 0x9d, 0x39, 0x58, 0x3f, 0x83, 0x64, 0x7c, 0x35, 0x64, 0xa1, 0x48, + 0x58, 0x4e, 0x57, 0xa7, 0x53, 0x6f, 0x81, 0x7c, 0x86, 0x51, 0x7c, 0x83, + 
0x55, 0xcf, 0x7a, 0xc8, 0x9c, 0xaf, 0xb6, 0x46, 0x65, 0x5c, 0xc4, 0xa9, + 0xc8, 0x96, 0xb6, 0x8e, 0x33, 0x4a, 0x9f, 0xc5, 0x5e, 0xa5, 0xb3, 0x84, + 0x6b, 0x50, 0xc1, 0xce, 0x78, 0x47, 0x6b, 0x59, 0x7c, 0x33, 0x91, 0x31, + 0x77, 0xc1, 0x91, 0x82, 0x58, 0xd0, 0x99, 0xc1, 0xd0, 0x5e, 0x99, 0x88, + 0xab, 0x93, 0x51, 0xb6, 0x98, 0x7b, 0x4f, 0x65, 0x48, 0x4b, 0xab, 0x3b, + 0x7a, 0x3c, 0xa4, 0xc1, 0x3c, 0x8b, 0xa7, 0x90, 0x77, 0xc7, 0x30, 0xc7, + 0xa5, 0xb3, 0xae, 0xd2, 0x81, 0x3d, 0x64, 0xa4, 0x82, 0x3e, 0x8c, 0x59, + 0x5d, 0x61, 0x75, 0x65, 0x42, 0x75, 0x69, 0xa6, 0x7c, 0xc3, 0xb5, 0x58, + 0x93, 0xd8, 0x76, 0xbb, 0x4e, 0xb5, 0x5a, 0x85, 0x71, 0x2f, 0x84, 0x82, + 0x77, 0x60, 0x59, 0x8c, 0x6e, 0x74, 0xae, 0x2d, 0x96, 0xbb, 0x91, 0x73, + 0x40, 0x64, 0x7d, 0x43, 0x89, 0xa9, 0x42, 0x65, 0xc4, 0xc2, 0xc3, 0xaf, + 0x4b, 0x62, 0xa5, 0x84, 0x54, 0xca, 0x35, 0x5b, 0x64, 0x86, 0x32, 0xc2, + 0x60, 0x3b, 0x6c, 0xbf, 0x3a, 0x32, 0xcc, 0x9f, 0xce, 0x41, 0x9f, 0x3f, + 0x4d, 0xbf, 0xbf, 0x33, 0x92, 0xb8, 0x86, 0x76, 0x38, 0xb8, 0x52, 0x66, + 0xcb, 0x3d, 0x3c, 0xb8, 0x84, 0x36, 0x7d, 0x7e, 0xb3, 0x99, 0x97, 0x47, + 0xa0, 0x5c, 0xc8, 0x57, 0x6f, 0x94, 0x45, 0x37, 0xc6, 0x8b, 0x73, 0xce, + 0x4a, 0xc9, 0x60, 0xb7, 0xc1, 0x6e, 0x48, 0x33, 0xb8, 0x54, 0x99, 0xca, + 0x5d, 0x6d, 0x65, 0x58, 0x3d, 0x83, 0x8e, 0xc6, 0x40, 0x93, 0xd0, 0xa5, + 0x8b, 0x67, 0x63, 0x5e, 0xcb, 0x81, 0x9d, 0x57, 0x46, 0xd5, 0x6d, 0x37, + 0xa6, 0xaa, 0x5b, 0x5a, 0xc6, 0x71, 0x4d, 0x86, 0x9d, 0xb9, 0x6c, 0xb0, + 0xb1, 0xca, 0x47, 0x78, 0x69, 0xab, 0x60, 0xbf, 0x4e, 0xa1, 0xa0, 0xb6, + 0xa4, 0x9a, 0x40, 0x8f, 0x78, 0x69, 0xb2, 0x9d, 0xc2, 0x60, 0xac, 0x37, + 0x4a, 0x61, 0xc0, 0x9b, 0x3a, 0xcf, 0x77, 0x8e, 0xbd, 0x49, 0xbc, 0x71, + 0xb0, 0xaa, 0x9a, 0x65, 0x65, 0xbe, 0x9a, 0x8e, 0xc2, 0xb0, 0xb4, 0x30, + 0x2c, 0x8d, 0x35, 0x83, 0xbd, 0xc6, 0xce, 0x3a, 0x34, 0xa8, 0xb0, 0xa3, + 0x7a, 0x5a, 0x5e, 0x3a, 0x54, 0xc8, 0x9d, 0xc2, 0x4f, 0x87, 0x56, 0x4d, + 0x41, 0xaf, 0xa6, 0x70, 0x99, 0x8a, 0xc5, 0x57, 0x88, 0xa2, 0x74, 0xb1, + 0x80, 0xa1, 0x95, 0xb1, 0xc7, 0x8b, 0x30, 0x52, 0xa3, 0x96, 0xb8, 0x9f, + 0x65, 0xae, 0x8b, 0x5d, 0x6b, 0x68, 0x54, 0x3d, 0x92, 0x47, 0x51, 0x72, + 0x38, 0x96, 0xb4, 0x73, 0x3e, 0x51, 0xb5, 0xbd, 0x37, 0x8a, 0x84, 0x9c, + 0x3f, 0x39, 0x56, 0xb3, 0x42, 0x8a, 0x46, 0x59, 0xc2, 0xc1, 0x71, 0xa5, + 0xac, 0x81, 0xcc, 0x77, 0xa5, 0x55, 0xd5, 0xa1, 0xc0, 0xb2, 0x59, 0x5e, + 0x47, 0x92, 0xc0, 0xd4, 0xc3, 0x3d, 0x87, 0x6e, 0x75, 0x52, 0x9d, 0x7c, + 0x9c, 0xc2, 0xc1, 0x70, 0xb6, 0x89, 0x55, 0x6a, 0xc2, 0xab, 0x6c, 0x35, + 0x3b, 0x8a, 0x3c, 0x45, 0x81, 0xa0, 0x30, 0xae, 0xad, 0x84, 0x6a, 0x67, + 0x87, 0xb5, 0xc3, 0x42, 0xb0, 0xa4, 0xaa, 0xcb, 0x5f, 0x5a, 0x4a, 0x48, + 0x72, 0xaa, 0xba, 0xcd, 0x9c, 0x8e, 0xb4, 0x63, 0x30, 0x99, 0x65, 0x6b, + 0xa8, 0x5b, 0x94, 0x3f, 0x9a, 0x5a, 0x90, 0x35, 0x9f, 0x94, 0x2c, 0x55, + 0x3c, 0xcd, 0x8b, 0x2d, 0x8d, 0xa5, 0x5f, 0x3f, 0x4a, 0x6e, 0x63, 0x45, + 0xc1, 0x3f, 0x32, 0xa0, 0x36, 0xbb, 0x41, 0x87, 0x64, 0x9b, 0x40, 0x50, + 0xb3, 0xc9, 0x6a, 0x6c, 0x82, 0x7c, 0xbc, 0x6a, 0x86, 0x9f, 0xa4, 0xca, + 0x6e, 0x85, 0x38, 0x52, 0x7b, 0x60, 0x45, 0xc0, 0x4e, 0x6d, 0x9f, 0xbd, + 0x4e, 0x79, 0x33, 0x67, 0x70, 0xce, 0x4c, 0xae, 0x7c, 0x9e, 0x96, 0xb0, + 0x79, 0x87, 0xb6, 0x6c, 0x62, 0x73, 0x5f, 0xc7, 0x31, 0xc5, 0xa3, 0xbb, + 0xc8, 0xb1, 0x6e, 0x8f, 0x4a, 0xc5, 0x63, 0xa1, 0xba, 0x38, 0xb5, 0xc8, + 0x9e, 0xa8, 0xbf, 0xba, 0xac, 0x67, 0x90, 0x57, 0x76, 0x86, 0x58, 0xd3, + 0xbe, 0x66, 0x7c, 0xbb, 0x93, 0x93, 0x3b, 0x57, 0x86, 0xb5, 0x72, 0x73, + 0x57, 0x9d, 0xc7, 0x47, 0x9e, 0x34, 0xa5, 0xce, 0x8f, 0x46, 0xac, 0xbe, + 
0x34, 0x7f, 0xd1, 0xae, 0xa8, 0x7c, 0x67, 0xa9, 0xc0, 0xa2, 0xa7, 0x6f, + 0x36, 0xa0, 0x82, 0x7d, 0x55, 0x87, 0x8e, 0xc8, 0x4d, 0x73, 0x3e, 0xcd, + 0xb9, 0x84, 0xbf, 0x4d, 0x67, 0x49, 0x5a, 0x6b, 0x59, 0x79, 0x36, 0x4e, + 0x96, 0x89, 0x9f, 0x64, 0x5f, 0xc3, 0xc4, 0x9c, 0x85, 0x40, 0xc5, 0x8b, + 0x7b, 0xa9, 0xa9, 0x6a, 0x8e, 0xb3, 0xc6, 0x88, 0x8b, 0x45, 0xc5, 0xb2, + 0x46, 0x6f, 0x47, 0xb0, 0xce, 0xa6, 0x44, 0xba, 0x49, 0x71, 0xc3, 0x80, + 0xc3, 0x42, 0x6a, 0x74, 0x43, 0x8c, 0x59, 0x54, 0x7e, 0x2a, 0x97, 0x64, + 0xc4, 0x99, 0xcc, 0x3d, 0xa7, 0x54, 0x65, 0x8c, 0x77, 0x42, 0x82, 0x6a, + 0xbf, 0x60, 0x33, 0x4a, 0xa4, 0x4a, 0x96, 0x4c, 0xb8, 0x90, 0x6a, 0xab, + 0x4d, 0xa7, 0xa3, 0x31, 0x53, 0xa4, 0x3e, 0xc4, 0xb7, 0x3f, 0xb8, 0x49, + 0x54, 0x8b, 0x98, 0x6b, 0x8f, 0xa2, 0x3f, 0xbc, 0xb3, 0x3d, 0x5c, 0x60, + 0x97, 0x37, 0x98, 0x97, 0x8c, 0xab, 0x5f, 0xb9, 0x66, 0x53, 0x5b, 0x67, + 0x7a, 0xc4, 0x4a, 0x67, 0x3d, 0xab, 0xc0, 0x5a, 0xbe, 0xa0, 0xbd, 0xa6, + 0x7c, 0x42, 0x93, 0xc2, 0x47, 0xad, 0x74, 0xbd, 0xc4, 0xbc, 0x6d, 0x67, + 0x5b, 0x70, 0x4f, 0xd6, 0x4f, 0x56, 0x82, 0x74, 0x55, 0xdb, 0x6b, 0x86, + 0x41, 0x31, 0xc1, 0x41, 0x9a, 0xcb, 0x81, 0x70, 0x61, 0x58, 0x48, 0x70, + 0xad, 0xca, 0xa6, 0xca, 0xcd, 0x68, 0xd3, 0x85, 0x92, 0x8b, 0xd0, 0x2d, + 0x80, 0x76, 0x39, 0x41, 0x7f, 0x49, 0x2e, 0xb0, 0x93, 0x7d, 0x7a, 0xa5, + 0x43, 0xc0, 0x5e, 0xc9, 0x47, 0x4e, 0x7e, 0x49, 0x53, 0x4a, 0xb4, 0x2d, + 0x53, 0x85, 0xbe, 0xd1, 0x8d, 0x6f, 0x71, 0x47, 0x4d, 0x99, 0x49, 0x6b, + 0x67, 0xc4, 0x3e, 0x9c, 0x97, 0xb3, 0xb6, 0x9e, 0xa6, 0x69, 0x66, 0x3b, + 0xc7, 0xc2, 0x5b, 0x8d, 0x8b, 0x34, 0xbd, 0x4c, 0x9a, 0x9d, 0x6d, 0x3e, + 0xaf, 0xc5, 0x48, 0x81, 0x73, 0xce, 0xda, 0x67, 0xc6, 0x3e, 0xa4, 0xbd, + 0xcd, 0xc4, 0x67, 0x54, 0xba, 0x62, 0x39, 0xb7, 0x54, 0xc5, 0xd8, 0xb7, + 0x67, 0x3f, 0x57, 0xd3, 0x3a, 0x5e, 0x94, 0xb7, 0xad, 0x87, 0x2e, 0x62, + 0x71, 0xd4, 0xd3, 0x41, 0x50, 0x87, 0xc2, 0x6c, 0x3d, 0xb5, 0x31, 0x72, + 0xd0, 0x2a, 0x23, 0x2c, 0x7a, 0xd9, 0x94, 0xce, 0x74, 0x5c, 0xa4, 0x6b, + 0xaa, 0xce, 0x67, 0x4b, 0xae, 0x5b, 0x60, 0x48, 0xd7, 0xb8, 0xbb, 0xa2, + 0xce, 0x86, 0xac, 0xaf, 0x39, 0xc4, 0x86, 0x3d, 0x9b, 0x5a, 0xe4, 0x4f, + 0x54, 0x96, 0x4b, 0xcd, 0x4d, 0x82, 0x44, 0x9c, 0xb5, 0xa6, 0x6a, 0x87, + 0xc5, 0xce, 0x47, 0x6c, 0x46, 0x68, 0x66, 0x9a, 0xc9, 0x45, 0x9e, 0x97, + 0x71, 0x58, 0xbd, 0xcc, 0x94, 0x95, 0x7d, 0x78, 0x5c, 0x8b, 0xb4, 0xaf, + 0x4d, 0xaf, 0x6b, 0xae, 0xd6, 0xc3, 0x52, 0x92, 0xd9, 0xc0, 0x8a, 0xbe, + 0xb1, 0x70, 0x25, 0xbf, 0x34, 0x69, 0x44, 0x89, 0x7b, 0x60, 0x40, 0xae, + 0xa7, 0x92, 0xb2, 0x69, 0xaf, 0x9f, 0xd0, 0xd6, 0xb8, 0x40, 0x6b, 0x94, + 0x97, 0x53, 0x90, 0x8a, 0x95, 0xa0, 0x70, 0x4a, 0x5c, 0x94, 0xaa, 0x54, + 0x81, 0xc8, 0xa7, 0xbb, 0x4f, 0x44, 0x41, 0x5e, 0x87, 0xb8, 0xa4, 0x71, + 0x4f, 0x9b, 0xb2, 0x8b, 0x63, 0x5b, 0x4d, 0xbf, 0xa6, 0x97, 0xdc, 0x91, + 0x8a, 0xa9, 0xb6, 0xd9, 0xd6, 0x4f, 0x58, 0x44, 0x75, 0x85, 0x66, 0xb0, + 0xc6, 0x61, 0x9f, 0x44, 0x6a, 0xce, 0x8c, 0x69, 0xad, 0xd4, 0x39, 0xa0, + 0x54, 0x50, 0x9d, 0x9c, 0x50, 0xbb, 0xc8, 0x78, 0x5b, 0x9c, 0x55, 0xcc, + 0x4e, 0x40, 0x60, 0x33, 0x54, 0xca, 0x7b, 0x72, 0x71, 0x52, 0xc8, 0x5c, + 0xb7, 0xaf, 0x62, 0x2e, 0x53, 0x57, 0x42, 0x2a, 0x6b, 0x55, 0x3e, 0xd3, + 0x43, 0x8e, 0xa7, 0x5a, 0xa3, 0x6b, 0xc0, 0xb2, 0xbd, 0x7d, 0x90, 0x61, + 0xce, 0x6d, 0x6c, 0x7f, 0xcb, 0x71, 0xa0, 0x88, 0x47, 0xab, 0x6b, 0x39, + 0x43, 0xdd, 0x94, 0x71, 0xdf, 0xb1, 0xcf, 0xc8, 0x53, 0x43, 0xac, 0x8a, + 0x7c, 0x7c, 0xac, 0x4a, 0xbf, 0x50, 0x37, 0x6e, 0xa2, 0x82, 0x91, 0x99, + 0x51, 0x74, 0x40, 0x6a, 0x6a, 0x58, 0x45, 0x93, 0xd6, 0xd9, 0xa8, 0x89, + 
0xb7, 0xc9, 0x69, 0xd3, 0x95, 0xbe, 0xa9, 0x9b, 0x7f, 0xa2, 0xae, 0xc0, + 0x97, 0x49, 0xbc, 0xac, 0x5e, 0xa1, 0x6e, 0x63, 0x38, 0x89, 0xb8, 0x98, + 0x67, 0x81, 0xb5, 0x4e, 0x46, 0x48, 0xcd, 0x94, 0xc1, 0xcc, 0x9b, 0x2d, + 0x59, 0x53, 0x4a, 0xd3, 0x5e, 0x62, 0xbe, 0xb0, 0x4e, 0x8d, 0x94, 0xcb, + 0x47, 0xbe, 0x36, 0x3d, 0x8a, 0x59, 0x50, 0x99, 0x54, 0x67, 0x6e, 0x89, + 0xcb, 0x7e, 0xaf, 0xb2, 0x45, 0x9c, 0x69, 0x4f, 0x60, 0xbc, 0x69, 0xb0, + 0x5b, 0x6d, 0x62, 0x86, 0xbb, 0x91, 0x96, 0xc1, 0x9d, 0x56, 0xa7, 0x8a, + 0x6c, 0xad, 0x9c, 0xb4, 0x99, 0xcf, 0x94, 0x94, 0xae, 0xd0, 0x98, 0xd0, + 0x71, 0xcd, 0xc7, 0xbf, 0x84, 0x93, 0x50, 0xd6, 0xa5, 0xa6, 0x6c, 0x4e, + 0x39, 0x53, 0xcc, 0xb5, 0x77, 0xb4, 0x95, 0x8e, 0xce, 0x73, 0x7c, 0x9b, + 0xc7, 0x66, 0x7e, 0x32, 0x69, 0x49, 0x79, 0xd5, 0xc2, 0xac, 0x68, 0x83, + 0xd1, 0xac, 0x45, 0xc0, 0xba, 0x82, 0xab, 0x4a, 0x3c, 0xd6, 0x8b, 0x96, + 0x95, 0xd5, 0x7b, 0x7b, 0x3d, 0xc0, 0x50, 0x5d, 0xaa, 0x8a, 0xd0, 0x66, + 0x2c, 0xc6, 0xb4, 0x7a, 0x8c, 0x91, 0xbd, 0x52, 0xbe, 0x4f, 0x9e, 0x75, + 0x56, 0x78, 0x6d, 0x72, 0x55, 0xa0, 0xb5, 0x4b, 0xa3, 0x3b, 0x7c, 0xb5, + 0x73, 0x6e, 0x92, 0x64, 0xc2, 0x96, 0xae, 0xa5, 0x40, 0x55, 0x6b, 0x4a, + 0x57, 0xcd, 0xca, 0xb2, 0x77, 0x2e, 0x79, 0x63, 0x7b, 0x6c, 0x8e, 0x4b, + 0x63, 0xc7, 0xb7, 0x6f, 0x5f, 0x8f, 0xb8, 0x6c, 0x77, 0x53, 0xc9, 0x89, + 0x8a, 0x3c, 0x98, 0x5f, 0xd2, 0x63, 0x88, 0xd2, 0x83, 0x48, 0x61, 0x94, + 0xba, 0x5e, 0xa7, 0x4d, 0xa8, 0xb5, 0xcd, 0xc8, 0x9c, 0xcd, 0x58, 0x6e, + 0x7a, 0xd5, 0x71, 0x4c, 0xbe, 0xcd, 0xa9, 0x5e, 0xce, 0xb0, 0x5e, 0xa1, + 0xa4, 0x96, 0x87, 0x43, 0x81, 0x46, 0x41, 0xcd, 0xb4, 0xa6, 0xb0, 0x97, + 0x49, 0xc5, 0x96, 0x92, 0x77, 0x8f, 0x92, 0x44, 0x84, 0x87, 0x5b, 0x9a, + 0x37, 0x44, 0x8b, 0xa0, 0xb1, 0x5c, 0xa3, 0x5a, 0x7e, 0x5e, 0x45, 0xd5, + 0x56, 0xcd, 0x69, 0x69, 0xc9, 0x90, 0x5f, 0x54, 0x62, 0x9a, 0x34, 0x75, + 0x4d, 0x2e, 0x98, 0xc9, 0x99, 0xa8, 0x44, 0x91, 0x55, 0x43, 0xb8, 0x61, + 0x61, 0xc0, 0xb6, 0xb8, 0x65, 0x45, 0xa1, 0x4c, 0xb0, 0x46, 0x73, 0x73, + 0xa0, 0x72, 0x37, 0x49, 0x41, 0xa4, 0x6d, 0xb3, 0x5f, 0x9f, 0x69, 0x40, + 0xa6, 0x53, 0xa6, 0xa5, 0x7b, 0x31, 0x47, 0x52, 0x5a, 0x6d, 0xcf, 0x5c, + 0x8f, 0x8c, 0xd0, 0x71, 0x54, 0x3e, 0xaa, 0x5a, 0x7e, 0x63, 0xb9, 0x4d, + 0x32, 0x8d, 0x7b, 0xbe, 0xcf, 0xa7, 0x9c, 0xaa, 0x85, 0x4e, 0xae, 0xda, + 0xac, 0xa2, 0x68, 0xb2, 0xb2, 0x6f, 0xb3, 0x9d, 0xca, 0x52, 0x99, 0x43, + 0xa7, 0x5d, 0x85, 0xc3, 0xb8, 0x41, 0x58, 0x61, 0xba, 0x25, 0xc4, 0x5b, + 0x65, 0x5d, 0x8e, 0x6c, 0x5a, 0x3d, 0xbc, 0xc5, 0xc8, 0x9a, 0x8a, 0xc4, + 0x83, 0xba, 0x85, 0x84, 0x5b, 0x61, 0xd6, 0xa9, 0xcc, 0xb5, 0x84, 0x40, + 0x8d, 0x53, 0x61, 0xb2, 0x59, 0xb1, 0x3a, 0xac, 0x64, 0xd2, 0x28, 0x79, + 0x95, 0x78, 0xc6, 0x65, 0xb4, 0x99, 0x3b, 0x81, 0xc7, 0x5d, 0x9e, 0xa3, + 0xc3, 0xa3, 0x43, 0x8e, 0x5e, 0xa3, 0x5c, 0xbc, 0xc1, 0xb8, 0x6a, 0x6a, + 0x5f, 0x87, 0x8d, 0x8b, 0xcf, 0x57, 0x97, 0xc1, 0x94, 0x9f, 0x87, 0x7a, + 0x7e, 0x6e, 0x5e, 0xa7, 0xd0, 0x7a, 0x90, 0x6f, 0xc9, 0xa9, 0x40, 0x8b, + 0xc1, 0xe5, 0xa6, 0x60, 0xa6, 0xa5, 0xba, 0xc6, 0x86, 0x9e, 0x41, 0x94, + 0x79, 0x5c, 0x71, 0x39, 0x93, 0x95, 0xd1, 0x61, 0xb2, 0xc0, 0x9d, 0xa5, + 0x4b, 0x90, 0x30, 0x93, 0xb7, 0x81, 0xa7, 0x95, 0x81, 0x9a, 0xa9, 0x39, + 0x65, 0xcf, 0xa1, 0x5f, 0x31, 0x6f, 0x69, 0x3b, 0xa9, 0x82, 0xa2, 0x3e, + 0xc7, 0xa2, 0x8a, 0x79, 0x8a, 0xc1, 0x7f, 0x72, 0x9a, 0x75, 0x55, 0xbf, + 0x76, 0x78, 0xcc, 0xba, 0x49, 0xce, 0x96, 0xd9, 0x66, 0xaa, 0x9f, 0x7b, + 0x6c, 0x75, 0x71, 0xb0, 0x33, 0x2a, 0x97, 0x88, 0x2b, 0xaa, 0x7a, 0x89, + 0x8e, 0xc3, 0xa2, 0x9e, 0xc6, 0xc3, 0x90, 0xb0, 0xa0, 0x5d, 0x4c, 0xc0, + 
0x51, 0x54, 0x4b, 0x97, 0xbc, 0x60, 0x57, 0x3a, 0x92, 0x3e, 0xa1, 0x94, + 0xaa, 0xc8, 0xcb, 0x63, 0x53, 0x5a, 0x37, 0x92, 0xb9, 0x2e, 0x9f, 0xd6, + 0x64, 0x4a, 0x50, 0xb6, 0x51, 0x9c, 0x85, 0xab, 0x31, 0xd0, 0x42, 0x90, + 0x50, 0x88, 0xb0, 0x39, 0xac, 0x8e, 0xaa, 0x59, 0x76, 0x95, 0x84, 0x76, + 0x3e, 0x48, 0x4e, 0x54, 0x8f, 0x73, 0x6e, 0xc9, 0x4e, 0x77, 0xc8, 0x72, + 0xd0, 0x35, 0x5f, 0x62, 0x51, 0x8e, 0x78, 0xd1, 0x67, 0x63, 0x79, 0x29, + 0xb0, 0x67, 0x6e, 0xd6, 0x8a, 0x9c, 0x61, 0x6a, 0x87, 0xc3, 0x8d, 0x4e, + 0xa3, 0x2e, 0x67, 0xad, 0x99, 0x89, 0x67, 0x54, 0x36, 0xa9, 0x90, 0x86, + 0x55, 0xac, 0x3d, 0x5d, 0xd6, 0x3c, 0x88, 0x3c, 0x73, 0x3a, 0xc9, 0x6f, + 0xb1, 0x99, 0xbd, 0x40, 0x9f, 0x5f, 0x3d, 0xc7, 0x5d, 0x97, 0xc8, 0xbd, + 0xa0, 0xcc, 0x5c, 0xbe, 0xb6, 0xd1, 0xbc, 0x55, 0x4e, 0x40, 0x9e, 0xa4, + 0xcd, 0x53, 0x7e, 0x25, 0x81, 0x58, 0x4c, 0x83, 0x8c, 0x81, 0x98, 0xd5, + 0x76, 0x5f, 0xac, 0x7b, 0x77, 0xd1, 0xc4, 0x86, 0xaa, 0x91, 0x35, 0x90, + 0x58, 0x3c, 0xc6, 0xca, 0x3d, 0x7d, 0xbd, 0x37, 0x7e, 0x3e, 0xcb, 0x6e, + 0x42, 0xac, 0xc7, 0x8c, 0x88, 0xba, 0x49, 0x73, 0x69, 0xb3, 0xa3, 0xd1, + 0x59, 0x83, 0xa8, 0x4f, 0x3c, 0xd0, 0x7c, 0x69, 0xaf, 0xb8, 0x78, 0x29, + 0xc0, 0x3e, 0xb1, 0x6e, 0xde, 0x6a, 0x8e, 0x8a, 0x44, 0x82, 0x46, 0xbf, + 0xaf, 0xd4, 0xcf, 0xad, 0x5e, 0x4c, 0x33, 0x9d, 0x7b, 0xd3, 0xbd, 0x61, + 0x9a, 0x4a, 0xbe, 0x57, 0x57, 0xa7, 0x97, 0xa9, 0x52, 0x5a, 0x89, 0x33, + 0x4e, 0x47, 0x43, 0xcb, 0x81, 0xba, 0x4a, 0x68, 0x51, 0x9b, 0x87, 0xb4, + 0x94, 0xbb, 0x78, 0x95, 0xa5, 0x5f, 0x5d, 0x4d, 0xd2, 0x92, 0x7a, 0x84, + 0x5b, 0x9b, 0x80, 0xca, 0xc8, 0xa9, 0x9f, 0xbd, 0x60, 0xa7, 0xa3, 0x90, + 0x72, 0x53, 0xb3, 0x33, 0xca, 0x90, 0xd9, 0x57, 0x73, 0x92, 0xb8, 0x87, + 0x94, 0xa1, 0xa7, 0xaa, 0xc1, 0x74, 0x46, 0x9a, 0x57, 0xcd, 0x91, 0x8b, + 0x83, 0xc4, 0x88, 0xa3, 0xbf, 0x52, 0xaa, 0xb8, 0x6a, 0x82, 0x9d, 0xbf, + 0xd0, 0x87, 0x56, 0x84, 0xb7, 0x99, 0x6f, 0x64, 0xa1, 0x6b, 0x3d, 0x72, + 0x47, 0xc7, 0x49, 0x8f, 0x82, 0xbf, 0x4c, 0x71, 0xa1, 0x40, 0x9c, 0x49, + 0x6b, 0xd3, 0x69, 0x82, 0x74, 0x4a, 0xb6, 0x51, 0x3e, 0xa2, 0xb4, 0x2c, + 0x52, 0x64, 0xc9, 0x90, 0xb6, 0x75, 0x42, 0xaf, 0xa3, 0x61, 0x67, 0xd0, + 0x4f, 0xc8, 0xb1, 0x40, 0xbc, 0x94, 0x95, 0x9b, 0x6a, 0x70, 0xa9, 0x2a, + 0xa9, 0xc8, 0xd5, 0x7f, 0xca, 0x87, 0xd9, 0xb8, 0x80, 0x88, 0xa7, 0x6a, + 0x43, 0x4f, 0x4d, 0x7f, 0x8c, 0x9e, 0xab, 0x84, 0xd5, 0x61, 0x35, 0x4c, + 0x78, 0x49, 0x99, 0x4a, 0x6c, 0x8e, 0x8c, 0x6b, 0x7f, 0x92, 0x6b, 0x98, + 0x61, 0x34, 0x84, 0x95, 0xba, 0x40, 0x6e, 0x46, 0xc5, 0x85, 0x47, 0x66, + 0x2f, 0xce, 0xbd, 0x31, 0x2e, 0x6a, 0xad, 0xb8, 0x7c, 0x65, 0x9b, 0xd8, + 0x38, 0x8c, 0x4b, 0x71, 0x61, 0x5e, 0x39, 0x66, 0x8e, 0x42, 0x48, 0x75, + 0x8a, 0xbc, 0x7c, 0x64, 0xb8, 0xa3, 0x9f, 0x5a, 0x3a, 0x56, 0xba, 0x62, + 0x8c, 0x57, 0x49, 0x46, 0x82, 0x4c, 0xd6, 0xc0, 0xa1, 0x7a, 0x46, 0xa4, + 0x4b, 0x48, 0x40, 0xbc, 0x37, 0xa0, 0xb3, 0x36, 0x78, 0x6c, 0xcb, 0x65, + 0x5a, 0xa8, 0x6a, 0xac, 0x94, 0xcb, 0x86, 0x4a, 0x97, 0x42, 0x8f, 0xa8, + 0x74, 0x8c, 0x5b, 0xb2, 0x4e, 0x95, 0x61, 0x78, 0xa8, 0x59, 0x72, 0x7a, + 0xb6, 0x8d, 0x8b, 0x4c, 0xd4, 0xb3, 0x41, 0x56, 0x78, 0x8b, 0x77, 0x5b, + 0xa7, 0x6f, 0x31, 0xa9, 0x48, 0x93, 0x33, 0x88, 0x7a, 0xd0, 0x76, 0x47, + 0x42, 0x90, 0x83, 0xdb, 0x9c, 0x4d, 0x6b, 0x5d, 0x60, 0x8e, 0x59, 0x75, + 0x78, 0x8d, 0x77, 0x68, 0x78, 0x68, 0x60, 0x63, 0x8a, 0x82, 0x9c, 0xc7, + 0x8c, 0x71, 0x97, 0x97, 0x92, 0xaa, 0xd0, 0xac, 0x57, 0x36, 0x33, 0x8d, + 0x83, 0x41, 0x77, 0xcc, 0xa4, 0x75, 0x82, 0xaf, 0xa3, 0x3a, 0x6a, 0xcd, + 0x45, 0x64, 0x82, 0xc6, 0x75, 0x4d, 0xbd, 0x91, 0x66, 0xcc, 0x67, 0xc5, + 
0x84, 0x63, 0x53, 0x7b, 0x40, 0xaf, 0x92, 0x7d, 0x71, 0x8b, 0x45, 0x91, + 0x6f, 0xba, 0x36, 0x5d, 0x52, 0x62, 0x6c, 0x34, 0xcb, 0x69, 0x3e, 0x73, + 0x3c, 0x2e, 0x68, 0x9b, 0x36, 0x7b, 0xcd, 0xd2, 0x6b, 0xcf, 0xd7, 0x9e, + 0xa7, 0xcc, 0x3b, 0xb1, 0xb7, 0x35, 0x2f, 0x77, 0xbc, 0x97, 0x86, 0x79, + 0xb5, 0x46, 0xd1, 0x58, 0xbc, 0x6e, 0x85, 0x7d, 0x78, 0x59, 0x4b, 0x87, + 0xca, 0x59, 0x56, 0x68, 0x35, 0xd8, 0x66, 0xa5, 0xd2, 0x81, 0x7b, 0xad, + 0x7d, 0x7d, 0x66, 0x9c, 0xbf, 0x64, 0xc8, 0x9a, 0xae, 0xcc, 0x60, 0x38, + 0xb9, 0x92, 0xd9, 0xca, 0xc8, 0x9e, 0x85, 0x7a, 0x89, 0xce, 0xa9, 0x70, + 0x7c, 0x8f, 0x64, 0x86, 0x80, 0x75, 0x71, 0x32, 0x81, 0x3d, 0x8c, 0x54, + 0xca, 0xbe, 0xae, 0x73, 0x6b, 0x68, 0xc3, 0xa4, 0x57, 0x92, 0x6f, 0x84, + 0x54, 0xc2, 0x64, 0x8c, 0x3e, 0xbc, 0x6c, 0x4c, 0x86, 0x8c, 0x6b, 0x7c, + 0x82, 0xa6, 0x52, 0x4e, 0x3e, 0xc5, 0x3c, 0x41, 0x6f, 0x2b, 0xa8, 0x42, + 0xb5, 0x31, 0xac, 0x5b, 0xc9, 0xac, 0x43, 0x4d, 0xcf, 0x4c, 0x64, 0x61, + 0xac, 0xa3, 0x7f, 0x69, 0x8f, 0xce, 0x79, 0x76, 0xa0, 0x4f, 0xbf, 0x46, + 0x74, 0x7e, 0xb2, 0x5c, 0x93, 0x53, 0x3a, 0x3d, 0x9c, 0x9e, 0x5a, 0xc8, + 0x48, 0xc3, 0x90, 0x6b, 0x61, 0x72, 0x4f, 0x7d, 0x86, 0xb8, 0x9f, 0xd3, + 0x60, 0xbc, 0x63, 0x33, 0x4d, 0x90, 0x67, 0x9e, 0xa7, 0xae, 0x60, 0x84, + 0x7b, 0x31, 0x99, 0xa2, 0xac, 0x65, 0xc8, 0xbe, 0xc1, 0x4e, 0xc9, 0x60, + 0x69, 0x50, 0x6e, 0x44, 0x67, 0x4e, 0x7a, 0x8d, 0x5a, 0x95, 0xac, 0x7a, + 0xc8, 0x88, 0x65, 0x9c, 0xb6, 0xa4, 0x34, 0xa6, 0xcb, 0xb5, 0xbd, 0x9a, + 0x7b, 0x42, 0xc5, 0x5c, 0x3e, 0x44, 0xbb, 0x8f, 0x9b, 0xc9, 0xc8, 0x62, + 0x3f, 0x81, 0xd3, 0xbf, 0xad, 0x72, 0xb6, 0x78, 0x40, 0xd1, 0x35, 0x73, + 0xb3, 0xb3, 0xaf, 0xb2, 0x91, 0x79, 0xc5, 0xa7, 0x7b, 0xc3, 0xc7, 0x6d, + 0x5b, 0x80, 0x8a, 0xd2, 0x70, 0x90, 0xb6, 0x37, 0xb6, 0x41, 0xd0, 0x6a, + 0xb1, 0x48, 0x79, 0xa5, 0xc6, 0xce, 0x68, 0xb3, 0x94, 0xab, 0x9b, 0xbf, + 0x3d, 0xc6, 0x85, 0x7e, 0x6c, 0xcd, 0xad, 0x8c, 0x3d, 0xcc, 0xad, 0xb2, + 0x62, 0x94, 0x50, 0x9c, 0x67, 0x48, 0x4f, 0x81, 0xc3, 0x45, 0xaa, 0x3b, + 0x3b, 0xa8, 0x98, 0x5c, 0xab, 0xae, 0xa8, 0x82, 0x51, 0x5a, 0x7e, 0xb2, + 0x85, 0x75, 0xcd, 0xc1, 0x4c, 0x8f, 0x56, 0x90, 0x4e, 0xa9, 0x81, 0xa4, + 0xb1, 0xb5, 0xb8, 0x8b, 0x40, 0x57, 0x78, 0x8a, 0xa3, 0xc5, 0x54, 0x73, + 0x79, 0x6d, 0x68, 0xbf, 0x51, 0x50, 0x62, 0xcc, 0xcc, 0x93, 0xb9, 0x55, + 0x6f, 0x9b, 0xc5, 0x3a, 0x55, 0x5f, 0x69, 0xa9, 0x3f, 0x76, 0x46, 0xa0, + 0x6d, 0x5d, 0x5a, 0x9c, 0x83, 0x5a, 0xb9, 0xd2, 0xcc, 0xcd, 0x64, 0xc6, + 0x77, 0x80, 0x31, 0x55, 0x86, 0x5f, 0x5b, 0xc0, 0x58, 0xb2, 0x98, 0x57, + 0xba, 0xc4, 0x67, 0x58, 0xb1, 0xc0, 0x38, 0x6d, 0x47, 0xac, 0x54, 0x44, + 0xa9, 0xcd, 0x66, 0x54, 0x3b, 0xc4, 0x46, 0xce, 0x32, 0x8a, 0xc8, 0x78, + 0x46, 0x41, 0x93, 0xc9, 0x3e, 0xd1, 0xb1, 0x49, 0x59, 0x37, 0x8c, 0x6d, + 0x95, 0xa9, 0xce, 0x5b, 0x88, 0x9d, 0x4a, 0xc2, 0xca, 0x96, 0xc6, 0x6b, + 0x36, 0xa3, 0x69, 0xb6, 0xc0, 0xcc, 0xc5, 0xb7, 0x8b, 0x80, 0x78, 0xa0, + 0x4f, 0x48, 0x7f, 0x92, 0x3b, 0xd0, 0xad, 0xa2, 0x73, 0x5a, 0xa1, 0x53, + 0xb8, 0xa5, 0x81, 0xb1, 0xd3, 0x67, 0x5a, 0x45, 0x89, 0x77, 0x93, 0x55, + 0xc9, 0xc7, 0x3f, 0x2f, 0x8d, 0x6b, 0x40, 0x55, 0x43, 0x99, 0xa4, 0xad, + 0x3e, 0x56, 0x94, 0x97, 0x88, 0xcf, 0xb6, 0x55, 0x8b, 0x48, 0x94, 0x3e, + 0xcf, 0xcc, 0xaa, 0x7b, 0x36, 0x3a, 0xaf, 0x5c, 0x3b, 0x77, 0xc7, 0xc0, + 0x8d, 0x6a, 0x5e, 0x6e, 0x62, 0x83, 0x57, 0x6c, 0x96, 0x7c, 0xb7, 0x6e, + 0xc4, 0x3c, 0xa9, 0x71, 0x71, 0xb3, 0xcc, 0x61, 0x3d, 0xc4, 0xa4, 0x64, + 0x41, 0x4e, 0x93, 0x96, 0x50, 0x75, 0x64, 0x9d, 0x66, 0x59, 0x6f, 0xd1, + 0x84, 0xac, 0x61, 0x7a, 0x3f, 0xc1, 0xc3, 0x74, 0x6f, 0x7c, 0xbe, 0x42, + 
0x53, 0xcf, 0x8b, 0x9a, 0x4c, 0x7d, 0xb5, 0x8f, 0x57, 0x5b, 0x9d, 0xd4, + 0x43, 0x63, 0xca, 0xb5, 0x81, 0xb3, 0x44, 0x8f, 0x6f, 0x6f, 0xbb, 0x5b, + 0xb7, 0x4b, 0xbd, 0x4d, 0x83, 0xc6, 0x47, 0x66, 0x74, 0xd0, 0xc2, 0x67, + 0x87, 0x4b, 0x80, 0x92, 0x68, 0x67, 0x98, 0xa0, 0x97, 0x8a, 0x7a, 0x8d, + 0x6b, 0x6a, 0x33, 0x7f, 0x80, 0x93, 0xa7, 0xc1, 0x57, 0x73, 0x81, 0x6e, + 0x94, 0xa0, 0xb3, 0x4b, 0x7a, 0x77, 0xb7, 0x44, 0x76, 0xd1, 0x81, 0x91, + 0x7a, 0x81, 0xc9, 0x9a, 0x2e, 0xb1, 0x72, 0x4b, 0xcc, 0x72, 0xa5, 0x66, + 0xcb, 0x54, 0xc0, 0xc8, 0x37, 0x8a, 0x74, 0x35, 0xa0, 0x95, 0x5e, 0x90, + 0xb7, 0x49, 0x9d, 0xc0, 0xc8, 0x3b, 0x70, 0xd1, 0xac, 0x76, 0xcd, 0x72, + 0x79, 0x9d, 0x5c, 0x42, 0x58, 0x68, 0x73, 0xaa, 0x42, 0x37, 0x30, 0x71, + 0x6d, 0x4d, 0x35, 0x38, 0xb0, 0x4c, 0xab, 0x83, 0x86, 0xd2, 0x43, 0xac, + 0xa2, 0xb5, 0x35, 0x8d, 0x45, 0x4a, 0x73, 0x6d, 0x59, 0x9d, 0x99, 0xc2, + 0x7d, 0x7b, 0x3b, 0x64, 0xc6, 0x32, 0xaa, 0x86, 0x65, 0x4d, 0xb6, 0x8a, + 0x89, 0x9f, 0xcb, 0x3a, 0xd0, 0x7b, 0xbf, 0x52, 0xb5, 0x84, 0x53, 0x42, + 0x73, 0x8b, 0x61, 0x76, 0x4d, 0xa6, 0x99, 0x71, 0x86, 0x80, 0x3e, 0xd1, + 0x78, 0xc8, 0xc5, 0x6d, 0x48, 0x36, 0x3a, 0x84, 0x76, 0x74, 0x90, 0x3d, + 0x62, 0x6f, 0x70, 0x57, 0x41, 0x62, 0x7a, 0x69, 0x90, 0xa6, 0x78, 0x7e, + 0xa6, 0x31, 0x96, 0x72, 0xcd, 0x9b, 0xa6, 0xbf, 0x87, 0x75, 0x3f, 0x64, + 0x4d, 0x7d, 0x50, 0x38, 0x3a, 0x76, 0x40, 0x7c, 0x76, 0xcc, 0x50, 0x81, + 0xb3, 0x5d, 0x80, 0xac, 0x9c, 0x3a, 0xb3, 0x3a, 0xa1, 0xb8, 0xae, 0x41, + 0x75, 0x9e, 0x46, 0x4d, 0xb3, 0xa6, 0x99, 0x63, 0xca, 0x9b, 0x62, 0x7d, + 0x70, 0x77, 0x3b, 0x3d, 0xc5, 0x47, 0xc2, 0x3e, 0x52, 0x5e, 0x82, 0xd7, + 0xbd, 0x85, 0x5d, 0x3e, 0x48, 0xab, 0x60, 0x8a, 0x92, 0x74, 0x92, 0x56, + 0xc8, 0xce, 0x58, 0x9d, 0x81, 0x35, 0xb3, 0x7a, 0x84, 0x45, 0x53, 0x69, + 0xad, 0x99, 0x93, 0xa8, 0xb9, 0x90, 0x4c, 0xad, 0x7f, 0xce, 0x78, 0x8c, + 0x9c, 0x8e, 0x70, 0x9b, 0x2c, 0x68, 0x8e, 0x9d, 0x66, 0x32, 0x5c, 0x7f, + 0xc9, 0xba, 0x45, 0x42, 0x38, 0x86, 0xd7, 0xb6, 0x3a, 0xa1, 0xc8, 0xad, + 0xc3, 0xa1, 0xa5, 0x89, 0xbe, 0xb8, 0x32, 0xc6, 0xba, 0x58, 0x65, 0x43, + 0x88, 0xbe, 0xb0, 0x40, 0x34, 0xae, 0x26, 0xab, 0xb3, 0x49, 0x4f, 0xbd, + 0x95, 0xb5, 0x40, 0x55, 0xc1, 0x82, 0xc0, 0x3e, 0xa4, 0xad, 0xa7, 0xa6, + 0x5c, 0x9f, 0x5f, 0x3c, 0xad, 0x62, 0x91, 0x68, 0xa6, 0x61, 0x66, 0xc7, + 0x97, 0xc2, 0x92, 0xb6, 0x62, 0x96, 0x72, 0x7e, 0x9c, 0x72, 0x30, 0x83, + 0x37, 0xc2, 0xa0, 0xb4, 0x6d, 0x7c, 0x46, 0x38, 0xd1, 0x53, 0x74, 0x47, + 0x5f, 0xaa, 0x58, 0x65, 0x5a, 0x44, 0xe0, 0x98, 0x3f, 0xb9, 0x33, 0x44, + 0x4d, 0xa1, 0x43, 0x86, 0x94, 0x5a, 0xb0, 0x8d, 0x7d, 0x94, 0xdd, 0xcd, + 0xbc, 0x89, 0xd4, 0xb4, 0xb4, 0xac, 0xb3, 0xb8, 0xc1, 0x83, 0xba, 0x89, + 0x34, 0x8e, 0x4c, 0xa3, 0xca, 0x4b, 0x9a, 0xb5, 0x40, 0x4f, 0x82, 0x55, + 0x50, 0x75, 0x59, 0xac, 0x60, 0x87, 0x4a, 0xb7, 0xc7, 0x4c, 0xc2, 0x43, + 0xcf, 0x90, 0x42, 0x50, 0x93, 0x55, 0x8b, 0x58, 0xc0, 0xa1, 0xb5, 0x99, + 0x9f, 0xc5, 0x3d, 0xd3, 0xbb, 0x31, 0x7f, 0xab, 0xb9, 0x45, 0xa6, 0x99, + 0x6a, 0x73, 0x89, 0xaa, 0xba, 0x84, 0x65, 0x55, 0x79, 0x46, 0xca, 0x38, + 0xc5, 0x7a, 0xd1, 0x9f, 0x56, 0xd9, 0xba, 0x49, 0x92, 0x39, 0x62, 0xc2, + 0x38, 0x85, 0x46, 0x56, 0x4e, 0x68, 0xc6, 0x89, 0x4a, 0x86, 0xa8, 0x9a, + 0x74, 0x45, 0x61, 0xc2, 0x50, 0x43, 0xaf, 0x65, 0x71, 0x84, 0xc5, 0x64, + 0x65, 0x49, 0xc2, 0x6e, 0xa1, 0xab, 0x90, 0x80, 0x57, 0xaa, 0x5c, 0x9f, + 0x94, 0x5b, 0xaf, 0xa2, 0xaf, 0x87, 0x9b, 0x6c, 0x68, 0x8f, 0xc0, 0x3e, + 0x89, 0x7c, 0x90, 0x79, 0xba, 0xb5, 0x4b, 0xa9, 0x48, 0x6e, 0x46, 0x98, + 0x80, 0xcb, 0x93, 0xb5, 0x35, 0xd9, 0xc6, 0xbc, 0x72, 0xc9, 0x93, 0x4e, + 
0xbb, 0x49, 0x85, 0xbe, 0xbe, 0x2f, 0x5c, 0x36, 0x8b, 0x5e, 0xa1, 0x43, + 0xbd, 0xd2, 0x83, 0x37, 0x68, 0x49, 0xc4, 0xa8, 0x48, 0x71, 0x50, 0xb0, + 0x8a, 0xc2, 0xc3, 0x85, 0x73, 0xab, 0xa1, 0x4a, 0xa9, 0x46, 0xb0, 0x37, + 0x82, 0x58, 0xa7, 0x9c, 0x75, 0x4b, 0x76, 0xa0, 0x8a, 0x86, 0x91, 0x58, + 0x53, 0x53, 0x3d, 0x42, 0x53, 0x4c, 0xc3, 0xd3, 0xa9, 0x69, 0xc1, 0xa5, + 0x8c, 0x84, 0xa8, 0x8c, 0xb8, 0x5f, 0x49, 0xc1, 0x42, 0x9c, 0x67, 0x49, + 0x5b, 0xa5, 0x37, 0x87, 0x84, 0x8d, 0x95, 0xb9, 0x76, 0x95, 0xad, 0x6b, + 0x69, 0x7a, 0x8c, 0xae, 0x49, 0xab, 0xcc, 0x78, 0x6f, 0x70, 0x58, 0x64, + 0x39, 0x54, 0x73, 0xc2, 0x5f, 0x6a, 0xc9, 0x94, 0xa8, 0x7c, 0x53, 0xa4, + 0x4b, 0xb2, 0x4b, 0x84, 0xcc, 0xb2, 0x3d, 0x75, 0x8e, 0x36, 0x91, 0x5d, + 0x50, 0x9a, 0x8b, 0x31, 0x66, 0x8c, 0x4c, 0xda, 0x7b, 0xcf, 0x84, 0xd7, + 0x8f, 0x86, 0x5b, 0xb6, 0x4f, 0x8f, 0x7c, 0xc9, 0xa8, 0x53, 0x5c, 0x50, + 0x6d, 0x89, 0xb2, 0xaf, 0x5d, 0xa2, 0x62, 0x4e, 0xb8, 0x70, 0xb3, 0xc9, + 0xd0, 0x89, 0x67, 0xcd, 0x6c, 0x77, 0xa9, 0x9b, 0xc3, 0x89, 0x32, 0x57, + 0xa8, 0xcf, 0x38, 0x7b, 0x68, 0x2e, 0xcb, 0x68, 0xb0, 0x93, 0xd5, 0xaa, + 0x33, 0x74, 0x3c, 0x62, 0xaa, 0xc1, 0x44, 0x75, 0xba, 0x71, 0x8e, 0x93, + 0xcb, 0xc6, 0x24, 0x5d, 0xa7, 0x7d, 0xae, 0x45, 0xd3, 0x8e, 0xd8, 0xa9, + 0x4d, 0x46, 0x2e, 0xb1, 0x49, 0x91, 0x87, 0x9f, 0x5b, 0xd1, 0x50, 0x7d, + 0x55, 0x51, 0xa6, 0x55, 0x97, 0x65, 0x93, 0x44, 0x53, 0xb4, 0x87, 0xba, + 0x58, 0x80, 0x4f, 0xc7, 0xa5, 0x3b, 0x45, 0xaa, 0xac, 0xb4, 0xcc, 0x38, + 0xc0, 0x2a, 0x47, 0xbf, 0x50, 0x9d, 0x6d, 0x5d, 0xaf, 0x9c, 0xae, 0xd3, + 0xb7, 0x90, 0x55, 0x65, 0x82, 0xb4, 0xa2, 0x33, 0x34, 0x7f, 0xb7, 0x84, + 0xc8, 0x48, 0xd1, 0xd7, 0x86, 0x5d, 0x3d, 0x72, 0x74, 0x94, 0x57, 0xbe, + 0xcd, 0x7f, 0x94, 0x70, 0x32, 0xba, 0x47, 0x47, 0x71, 0xd4, 0x72, 0xc8, + 0xd1, 0xae, 0x7b, 0x87, 0x4a, 0x74, 0x3b, 0x60, 0xa3, 0x94, 0x68, 0x60, + 0x34, 0xd1, 0x8a, 0xb6, 0x7b, 0xa1, 0xb0, 0x43, 0x62, 0x3b, 0xad, 0x38, + 0xaa, 0xac, 0x53, 0x91, 0x6b, 0xd3, 0xba, 0x3b, 0xb7, 0xc0, 0x8b, 0x6a, + 0x43, 0x6d, 0x5a, 0xa9, 0x90, 0x6f, 0x56, 0xa5, 0x80, 0x4e, 0x8d, 0xd1, + 0xa3, 0x4a, 0x4b, 0x52, 0x45, 0xb0, 0x8c, 0xb9, 0x7d, 0xa9, 0xcf, 0x55, + 0x53, 0xc3, 0x8a, 0x38, 0x37, 0x1f, 0x49, 0x96, 0x89, 0x51, 0xbe, 0x6b, + 0x63, 0x45, 0x31, 0x64, 0x2b, 0x9a, 0x5e, 0x67, 0x4f, 0xc6, 0x39, 0x28, + 0xa5, 0x5e, 0x8e, 0xbf, 0x79, 0x6e, 0xc3, 0x93, 0x32, 0x5a, 0x82, 0x8d, + 0x97, 0x7a, 0x63, 0x94, 0xb5, 0xc1, 0x81, 0xac, 0x89, 0x99, 0x67, 0xad, + 0x5b, 0x74, 0x97, 0xa9, 0xbc, 0x4f, 0x98, 0xaa, 0xa4, 0x5b, 0x75, 0x82, + 0x7e, 0x96, 0x72, 0xa8, 0x69, 0x75, 0x7b, 0x59, 0x5a, 0x52, 0xcd, 0x4f, + 0xc6, 0x8d, 0x4c, 0x98, 0x8c, 0xc5, 0x43, 0xae, 0x44, 0x79, 0xc1, 0x78, + 0xb9, 0xaf, 0x4e, 0x80, 0x72, 0xba, 0xcf, 0x60, 0x4b, 0xb4, 0x7e, 0xb7, + 0x6f, 0xb9, 0x7d, 0x47, 0xcb, 0xab, 0x71, 0x54, 0xad, 0xb2, 0x93, 0xb5, + 0x84, 0x8a, 0x5e, 0x55, 0x8c, 0x6d, 0xae, 0x7a, 0x7e, 0x3b, 0x72, 0x83, + 0xc7, 0xa3, 0x4f, 0xb5, 0x92, 0x7a, 0xba, 0x4e, 0x2d, 0xc8, 0x78, 0xa1, + 0xa3, 0x47, 0x83, 0x54, 0x86, 0xca, 0x59, 0x50, 0xa7, 0x39, 0x6e, 0xb0, + 0xa7, 0xcc, 0x55, 0x86, 0xd1, 0x93, 0xa2, 0xc1, 0xcd, 0x52, 0x7c, 0xa9, + 0xd4, 0x63, 0xa3, 0xae, 0x68, 0x99, 0x90, 0xcd, 0xaf, 0x4c, 0x4b, 0x7a, + 0x62, 0xb3, 0xa8, 0x62, 0x8d, 0xd1, 0x88, 0xc3, 0x97, 0x9e, 0x3b, 0x47, + 0x68, 0xc5, 0xc0, 0x85, 0x59, 0xc6, 0x6a, 0x73, 0x63, 0x93, 0x5b, 0xd2, + 0x75, 0xcf, 0x34, 0xb0, 0x76, 0xaa, 0xd5, 0xbf, 0x3f, 0x92, 0x76, 0x6a, + 0xd0, 0xbc, 0x5b, 0x73, 0xc5, 0x89, 0x81, 0x6b, 0xd4, 0x57, 0x3d, 0x52, + 0x63, 0xd5, 0xad, 0x4c, 0x75, 0x50, 0x3a, 0x97, 0x7b, 0xaf, 0x80, 0xba, + 
0xc5, 0x44, 0x86, 0x55, 0x84, 0x76, 0xac, 0x64, 0xbe, 0xc8, 0x85, 0x77, + 0x7d, 0xcc, 0x89, 0xb7, 0xd1, 0xad, 0xab, 0x47, 0xb3, 0x9b, 0x5d, 0x72, + 0x7b, 0xaf, 0xca, 0x6e, 0x63, 0xb9, 0x3f, 0xd5, 0x9c, 0x79, 0x8e, 0xa2, + 0xb1, 0x4f, 0xb4, 0xd8, 0x4c, 0xc7, 0x5e, 0x98, 0x3f, 0x68, 0x82, 0x47, + 0xcf, 0x2d, 0x98, 0x7e, 0x95, 0xb9, 0x67, 0x6c, 0xbe, 0xad, 0x3d, 0xc8, + 0x37, 0xc1, 0xd1, 0x98, 0x8c, 0x4d, 0x66, 0x66, 0xb2, 0x97, 0xbe, 0x8b, + 0x64, 0xc5, 0x6c, 0x2f, 0x8f, 0xc1, 0x6c, 0x97, 0x71, 0x50, 0xbe, 0x5a, + 0xd3, 0x6f, 0x8c, 0xaf, 0x89, 0x56, 0x9f, 0xad, 0x6c, 0x89, 0xaa, 0x9a, + 0x98, 0xd3, 0x79, 0xbb, 0x3a, 0xd2, 0x31, 0x91, 0x99, 0xbb, 0xdc, 0xb9, + 0x8c, 0x46, 0x70, 0x61, 0x65, 0x2f, 0xa5, 0x56, 0xc3, 0xc4, 0xbe, 0x67, + 0x81, 0xa9, 0x95, 0xba, 0xab, 0x87, 0x8f, 0x63, 0xa6, 0x83, 0x50, 0x61, + 0x7e, 0xd1, 0x77, 0x44, 0xae, 0x7f, 0x69, 0x30, 0x3e, 0x57, 0xcd, 0x70, + 0x8e, 0xd8, 0xc7, 0x73, 0xbc, 0xb8, 0xa3, 0x57, 0x73, 0xa2, 0x9e, 0xb6, + 0x85, 0x91, 0x71, 0xc8, 0x83, 0xb8, 0x42, 0x94, 0xba, 0x4c, 0x59, 0x89, + 0x5c, 0x6c, 0x55, 0x2e, 0x80, 0x63, 0x3e, 0x48, 0x34, 0x50, 0x82, 0x9d, + 0x8b, 0xd1, 0x84, 0x6e, 0x73, 0x84, 0x90, 0x67, 0xcc, 0x58, 0x5b, 0x69, + 0xb5, 0x3d, 0xc5, 0xb2, 0x99, 0xb1, 0x4b, 0xcc, 0x53, 0x8b, 0xcd, 0xc3, + 0x89, 0x43, 0xb5, 0x3d, 0xa9, 0x43, 0x37, 0xb9, 0x4d, 0x79, 0xa9, 0x8e, + 0xc7, 0xac, 0xa6, 0x41, 0x59, 0x47, 0xae, 0xa0, 0x8d, 0x75, 0x88, 0x4b, + 0xc3, 0x5f, 0x52, 0x56, 0xb0, 0x7e, 0x51, 0xa6, 0x87, 0x8b, 0xd8, 0xa6, + 0x24, 0x85, 0xb7, 0x52, 0x52, 0xbc, 0xc8, 0x8e, 0x33, 0x45, 0x94, 0xb1, + 0x2b, 0x28, 0xc4, 0x73, 0x19, 0xa8, 0x89, 0x64, 0xab, 0xc0, 0x95, 0x47, + 0x8b, 0xc4, 0xc1, 0x5e, 0x77, 0xae, 0x44, 0x5b, 0x56, 0x3b, 0x84, 0x48, + 0x8d, 0x94, 0x81, 0x8c, 0xa7, 0x69, 0x4e, 0x90, 0x92, 0x3e, 0xb3, 0xb4, + 0x3e, 0x6b, 0xb2, 0x6a, 0x6d, 0x5b, 0x6d, 0xc1, 0x69, 0x49, 0xb5, 0x4e, + 0x83, 0x2e, 0xa4, 0x68, 0xd8, 0x84, 0x36, 0x39, 0x99, 0x48, 0xa7, 0x6b, + 0xa4, 0xba, 0x8c, 0x73, 0x98, 0x64, 0xd0, 0x9c, 0x76, 0xb5, 0x56, 0x5e, + 0x4a, 0x3b, 0x9e, 0x81, 0xca, 0x9a, 0x52, 0xb7, 0xc0, 0x6c, 0x6b, 0x55, + 0x38, 0x63, 0x4b, 0x9b, 0xa4, 0x8b, 0xc4, 0xb9, 0xad, 0x9f, 0xbd, 0x92, + 0xbb, 0x51, 0x97, 0xcf, 0x46, 0x96, 0x69, 0x40, 0x60, 0x4a, 0x74, 0x6b, + 0xc3, 0x65, 0x7e, 0x48, 0xa3, 0x66, 0x92, 0x49, 0x45, 0x86, 0xa7, 0x7e, + 0x89, 0x73, 0xac, 0x7f, 0x80, 0xcc, 0xc2, 0x63, 0x36, 0xc0, 0x67, 0x8e, + 0xbd, 0x87, 0xbd, 0x9f, 0xc4, 0xd0, 0x36, 0xa1, 0x6a, 0x47, 0xa2, 0x7c, + 0x76, 0x8a, 0xb4, 0x4f, 0x36, 0x98, 0x51, 0x90, 0x98, 0x57, 0x92, 0x3e, + 0x4e, 0x58, 0x99, 0xb9, 0x83, 0xb6, 0xd6, 0x33, 0x81, 0x80, 0x6b, 0xb0, + 0x84, 0xd6, 0x9c, 0x42, 0x40, 0x2f, 0x45, 0x75, 0x80, 0x9f, 0xd2, 0xa6, + 0xc5, 0xab, 0xd1, 0xc8, 0xb3, 0x40, 0xc2, 0x5d, 0xcf, 0x78, 0xcd, 0xcb, + 0x62, 0x51, 0x83, 0x85, 0x82, 0x94, 0x7c, 0xd4, 0xba, 0xc8, 0x65, 0x65, + 0x98, 0x7d, 0x53, 0x45, 0x37, 0x7c, 0x8d, 0x65, 0x75, 0x9f, 0x8a, 0x49, + 0xc7, 0x6a, 0xa6, 0x7f, 0x4f, 0x8f, 0xc1, 0x73, 0x9b, 0xa2, 0x61, 0xc0, + 0x6f, 0x3d, 0x29, 0x34, 0x95, 0x7f, 0x5f, 0xbc, 0xb9, 0x41, 0x65, 0xad, + 0x75, 0xa5, 0x39, 0x3f, 0x8d, 0xbc, 0xba, 0x3c, 0x79, 0x3a, 0x36, 0x3f, + 0xa3, 0x6a, 0x46, 0x88, 0xa9, 0x71, 0xd1, 0xa2, 0xa8, 0x78, 0x6f, 0x80, + 0x52, 0x6c, 0x77, 0x4e, 0x96, 0x50, 0x3c, 0x66, 0x58, 0x3d, 0x57, 0x62, + 0xb2, 0x5a, 0xa4, 0x76, 0x8d, 0xba, 0x54, 0x6d, 0xb5, 0xb4, 0xbc, 0xd7, + 0xb3, 0xdb, 0xa7, 0x62, 0xaf, 0xd5, 0xd3, 0x71, 0x69, 0x7b, 0x4d, 0xd4, + 0xa6, 0xc7, 0x53, 0xbe, 0x82, 0x67, 0xc9, 0x7f, 0xc2, 0x4d, 0xb9, 0x50, + 0x58, 0x9a, 0x60, 0x57, 0x3a, 0x7d, 0xa0, 0xc9, 0x6c, 0xd9, 0xcf, 0x9f, + 
0xb6, 0x63, 0x7c, 0xc7, 0x87, 0x9f, 0x6e, 0x93, 0x9a, 0x83, 0x4c, 0x9e, + 0x36, 0xa5, 0x86, 0x3b, 0x9b, 0x85, 0xcc, 0xaf, 0x5c, 0x7c, 0xc3, 0x85, + 0xd2, 0x73, 0xcc, 0x74, 0x58, 0x49, 0x86, 0xb2, 0x98, 0xa1, 0xad, 0xa9, + 0xa2, 0x8b, 0x50, 0xb5, 0xc7, 0xc8, 0xa6, 0xb6, 0x4d, 0x64, 0x32, 0x2d, + 0xb0, 0x4a, 0x80, 0x4e, 0xb9, 0x44, 0x3d, 0x6c, 0x94, 0x8f, 0x68, 0x99, + 0x90, 0xa1, 0x8c, 0x8d, 0x70, 0xbf, 0xd1, 0x7e, 0x87, 0xb8, 0xc2, 0x99, + 0xd2, 0x90, 0x99, 0x88, 0x6f, 0xd3, 0xa0, 0x87, 0x44, 0xa3, 0xbd, 0x3e, + 0xa9, 0x65, 0x8f, 0xd0, 0x86, 0x41, 0x41, 0x8a, 0x3d, 0xc6, 0x9f, 0x6e, + 0x95, 0xb6, 0x95, 0x73, 0x58, 0x41, 0x59, 0xb1, 0x9e, 0xa0, 0xa8, 0xa5, + 0xc8, 0x9b, 0x5d, 0x3d, 0x7c, 0x9c, 0x58, 0xbb, 0xc5, 0xa5, 0x6c, 0x51, + 0x79, 0x52, 0xd1, 0x57, 0x22, 0x3f, 0xab, 0x76, 0x85, 0x8b, 0x6c, 0x66, + 0xb8, 0xc4, 0xcf, 0x4b, 0x4c, 0x4d, 0xd5, 0x66, 0x6b, 0x36, 0xa9, 0x5c, + 0x37, 0x5c, 0x99, 0x71, 0x73, 0x6f, 0x5a, 0xcd, 0xc9, 0x73, 0x49, 0x55, + 0x4b, 0x6f, 0x36, 0x59, 0xb1, 0xa4, 0x7c, 0xbd, 0xc2, 0xbf, 0xb1, 0x61, + 0x7d, 0xc8, 0x53, 0x70, 0x5b, 0x6b, 0x8a, 0xaa, 0x91, 0x9e, 0x39, 0x80, + 0xb4, 0x41, 0xb7, 0x8c, 0x82, 0xc7, 0x4e, 0x41, 0x8f, 0x3e, 0x4d, 0xcd, + 0x68, 0x5c, 0x35, 0xaa, 0x91, 0xc4, 0x64, 0x59, 0x83, 0x5c, 0x62, 0x74, + 0x86, 0x67, 0x43, 0x4c, 0x80, 0x74, 0x44, 0x7c, 0x8e, 0xa2, 0xce, 0xa0, + 0x9a, 0x8d, 0x9c, 0x59, 0xa7, 0x8c, 0x8c, 0x5d, 0x7a, 0x6e, 0x84, 0x49, + 0x6e, 0x97, 0xca, 0x57, 0x3c, 0x50, 0x68, 0xa0, 0x83, 0x98, 0x90, 0x50, + 0xb4, 0x41, 0xac, 0x86, 0x57, 0x94, 0x91, 0x71, 0x6b, 0xbd, 0x79, 0xc8, + 0x87, 0x9b, 0xba, 0xc7, 0xd4, 0xbb, 0xc6, 0xb4, 0x96, 0x5a, 0x9c, 0x38, + 0xb4, 0x88, 0x7f, 0x8c, 0xd1, 0x43, 0x33, 0x7b, 0x3e, 0xb7, 0x3d, 0xc9, + 0x60, 0xae, 0xd8, 0x9b, 0xb8, 0xbc, 0x79, 0x2e, 0xdf, 0x8f, 0x4e, 0xaf, + 0xcd, 0x38, 0x33, 0x81, 0x7c, 0xda, 0x41, 0x85, 0xac, 0x95, 0x61, 0x88, + 0xa0, 0x53, 0x70, 0xac, 0x4d, 0x5f, 0xbf, 0x46, 0x5c, 0x9a, 0x50, 0x4c, + 0x87, 0x62, 0x9d, 0x67, 0x65, 0x87, 0x99, 0x90, 0x47, 0xcc, 0xc3, 0xc3, + 0xd7, 0x7f, 0x2d, 0x66, 0xd6, 0x5c, 0xa1, 0xaf, 0x84, 0x64, 0x8b, 0x34, + 0xae, 0x6e, 0xd7, 0x3b, 0xb7, 0x8d, 0x6c, 0x85, 0x98, 0x4b, 0xbb, 0x6e, + 0x74, 0xa1, 0x8a, 0xa4, 0xa1, 0x38, 0xbe, 0x7b, 0xc1, 0x68, 0x6f, 0x56, + 0xdb, 0x8d, 0x60, 0xcd, 0x7a, 0x96, 0x43, 0xb2, 0xa8, 0x6c, 0xa1, 0xb0, + 0x9c, 0x44, 0xa9, 0xa2, 0xd2, 0x80, 0x8e, 0x7d, 0x8a, 0xc9, 0x59, 0x70, + 0x6a, 0x36, 0x37, 0x63, 0x54, 0x59, 0xb6, 0x44, 0xbd, 0x43, 0x9d, 0x41, + 0x96, 0x90, 0x36, 0x3e, 0xc6, 0x89, 0x6d, 0x5b, 0x2b, 0x74, 0x2f, 0xcf, + 0x85, 0x89, 0xa3, 0xbb, 0x36, 0x59, 0x4e, 0x70, 0x81, 0xa5, 0x8a, 0xb1, + 0x8c, 0x9c, 0x96, 0xb4, 0xac, 0x51, 0x92, 0xd3, 0x37, 0x4d, 0xab, 0x74, + 0xc3, 0xc6, 0xab, 0x6c, 0x8e, 0x59, 0x30, 0x9c, 0x83, 0x6f, 0xa2, 0xc0, + 0xb9, 0x51, 0x40, 0x66, 0xbe, 0x62, 0x84, 0x8d, 0xba, 0xbb, 0xda, 0xac, + 0xa7, 0xc7, 0x75, 0x37, 0x72, 0x43, 0xbf, 0x75, 0x6f, 0x7e, 0xac, 0x5a, + 0x41, 0x8d, 0x6e, 0x6f, 0x39, 0x8e, 0x9a, 0xc5, 0xd5, 0xdc, 0xce, 0x7f, + 0xaa, 0x7d, 0x40, 0xa8, 0x3c, 0x62, 0xb1, 0x3e, 0x61, 0x82, 0x96, 0x3e, + 0xa3, 0x30, 0x62, 0x4c, 0x43, 0x51, 0xc2, 0x56, 0xcf, 0x78, 0x4a, 0x82, + 0x7a, 0x52, 0xc0, 0x30, 0x96, 0x90, 0xd0, 0x78, 0x4e, 0x64, 0xc1, 0x75, + 0x8c, 0x89, 0x52, 0x9c, 0x44, 0x43, 0x52, 0x51, 0xc6, 0xcf, 0x31, 0x3c, + 0xa6, 0xd4, 0xcc, 0xa3, 0xb5, 0xc2, 0x97, 0x6d, 0x5b, 0x82, 0x8d, 0xc9, + 0x65, 0x54, 0x77, 0x3a, 0x33, 0xb5, 0x72, 0x59, 0x61, 0xbd, 0x65, 0x8b, + 0x71, 0x51, 0x6d, 0xc3, 0x82, 0x93, 0x7c, 0x82, 0xc0, 0x2f, 0x9f, 0x94, + 0xb0, 0x4d, 0xd1, 0xc0, 0xc6, 0x68, 0x93, 0xcd, 0xa7, 0x83, 0xae, 0xb5, + 
0x30, 0x2d, 0x8a, 0xa1, 0x4a, 0x46, 0xd5, 0x9e, 0x45, 0x33, 0x4a, 0x95, + 0x8a, 0x84, 0x37, 0x8e, 0x9e, 0x8d, 0x94, 0x5b, 0x3e, 0xc5, 0x38, 0x5a, + 0x92, 0xc2, 0x6c, 0xa1, 0x33, 0x73, 0x78, 0x41, 0x4f, 0xca, 0x6f, 0xb3, + 0x64, 0x5d, 0x35, 0x7c, 0x42, 0x59, 0xc5, 0x3a, 0xb5, 0x5c, 0xc7, 0x62, + 0xa4, 0xb4, 0x71, 0x45, 0x93, 0x33, 0x47, 0xbe, 0xb9, 0x41, 0xbb, 0x38, + 0x5c, 0xbe, 0x54, 0x5d, 0x6f, 0x5b, 0x72, 0xd4, 0x6c, 0x3a, 0x90, 0xb8, + 0x7b, 0x4f, 0x66, 0xad, 0xb4, 0x90, 0x42, 0xc7, 0x51, 0x3c, 0xd1, 0xcd, + 0xa6, 0xbd, 0x84, 0x80, 0x4d, 0xa6, 0x2d, 0xd0, 0x73, 0x47, 0x5a, 0x83, + 0xc4, 0x6e, 0x78, 0x92, 0x68, 0xd5, 0xcb, 0x66, 0x3f, 0x82, 0x53, 0x7c, + 0x28, 0x88, 0xa5, 0x88, 0x82, 0xd6, 0xa7, 0x2a, 0xb8, 0x73, 0x3f, 0x3f, + 0x48, 0xb4, 0xa5, 0x3b, 0xca, 0xbb, 0x3c, 0x93, 0xa4, 0x6d, 0xcd, 0x8f, + 0x71, 0x3e, 0x76, 0x4d, 0x87, 0x9c, 0xb0, 0x7d, 0x68, 0xc4, 0x4b, 0xc4, + 0x6f, 0xa4, 0x7a, 0x39, 0xcf, 0x35, 0xc0, 0x51, 0x69, 0x97, 0x8c, 0x65, + 0x4a, 0x7d, 0x96, 0xc5, 0xc0, 0xa8, 0x78, 0x64, 0x6b, 0x40, 0x38, 0x35, + 0xa6, 0xc5, 0xc7, 0x6e, 0x83, 0x49, 0x96, 0xa4, 0x54, 0x69, 0x99, 0x89, + 0xa0, 0x5c, 0x92, 0x78, 0x7d, 0x5b, 0xbe, 0xd3, 0x5c, 0x75, 0x4d, 0x65, + 0x4f, 0xaf, 0xab, 0x64, 0xa1, 0x91, 0xbd, 0x63, 0xaf, 0x50, 0x80, 0x8b, + 0x93, 0xc9, 0x49, 0x38, 0xb1, 0x9a, 0x36, 0x60, 0x85, 0xa0, 0xb5, 0xa5, + 0x7a, 0x66, 0x64, 0x60, 0x95, 0x44, 0xbd, 0xce, 0x97, 0xa0, 0x8b, 0xca, + 0x6f, 0x97, 0xab, 0xc2, 0x4a, 0xbd, 0xa5, 0x46, 0xb3, 0x9d, 0x44, 0x79, + 0x91, 0x71, 0x73, 0x6e, 0x3d, 0x43, 0x6f, 0x9d, 0x96, 0xaf, 0xbf, 0x8b, + 0x5f, 0x76, 0x9e, 0xae, 0xc8, 0xc5, 0x6f, 0x4f, 0x73, 0xc9, 0x86, 0x3d, + 0xc2, 0x86, 0xc5, 0xaa, 0x33, 0x8b, 0x40, 0xb4, 0x72, 0x7c, 0x36, 0xae, + 0x3e, 0x5b, 0xd3, 0x35, 0xa7, 0x85, 0x49, 0x8f, 0x7f, 0xa7, 0x8b, 0x46, + 0xa0, 0x92, 0xca, 0xb4, 0x84, 0xab, 0x62, 0x52, 0x8f, 0x74, 0xbc, 0x3b, + 0xac, 0xcf, 0xac, 0xab, 0xcd, 0x7b, 0x75, 0x38, 0x81, 0x41, 0x3e, 0x83, + 0x51, 0x4b, 0x64, 0xc1, 0x6f, 0x6c, 0xb3, 0x70, 0xc2, 0xa7, 0x39, 0x3a, + 0x67, 0x4d, 0xcd, 0xcd, 0x3f, 0x7e, 0x77, 0x47, 0xbb, 0x38, 0x79, 0x76, + 0xdb, 0xd2, 0x63, 0x90, 0xd2, 0x81, 0xa2, 0xc7, 0x5b, 0x4d, 0x40, 0x4d, + 0x4c, 0xdd, 0xb0, 0xd1, 0x3c, 0x47, 0x9e, 0x37, 0xd0, 0x89, 0x6f, 0x8e, + 0xb7, 0x3b, 0xb6, 0x36, 0x75, 0x63, 0x7f, 0x64, 0x4a, 0x56, 0x56, 0x3f, + 0x86, 0x76, 0x75, 0x3f, 0x79, 0xdb, 0x49, 0xa5, 0xc5, 0xa3, 0xab, 0x90, + 0xd2, 0x96, 0x6e, 0xa0, 0x94, 0xa5, 0xc6, 0xaf, 0x4b, 0xbf, 0xa5, 0x72, + 0x50, 0x35, 0x60, 0xaa, 0x9e, 0x97, 0xbf, 0x4f, 0x40, 0xc1, 0xb3, 0x8e, + 0xc3, 0xb8, 0x4e, 0xac, 0x42, 0xc1, 0x39, 0x9f, 0x84, 0x3f, 0x6a, 0xbc, + 0x8b, 0x66, 0x44, 0x6c, 0x97, 0x82, 0x3a, 0x8b, 0x71, 0x67, 0x4c, 0x45, + 0x37, 0xb3, 0x98, 0xa1, 0x50, 0x85, 0x76, 0x7c, 0x61, 0x5a, 0xcc, 0xdb, + 0x4c, 0x3c, 0x74, 0x8d, 0xa5, 0xc1, 0x62, 0x83, 0x75, 0xa7, 0x63, 0x44, + 0x6d, 0xa5, 0x53, 0x7c, 0xd7, 0x57, 0x45, 0x93, 0x7d, 0xc5, 0xd9, 0x40, + 0x8e, 0x5c, 0xaf, 0xbf, 0x3f, 0xaf, 0x30, 0x5e, 0x4e, 0xca, 0xa5, 0xba, + 0x68, 0x5d, 0x84, 0x39, 0xcc, 0x97, 0x97, 0xb3, 0xbf, 0x46, 0xc4, 0xaf, + 0xc5, 0xa6, 0xa7, 0xcd, 0x6b, 0x80, 0x82, 0x4e, 0x97, 0x80, 0x3d, 0x53, + 0xe7, 0x56, 0x3b, 0x67, 0x3d, 0x65, 0x8d, 0x92, 0x61, 0xb3, 0xa4, 0xa4, + 0x87, 0x85, 0xd0, 0x7e, 0xb6, 0xc0, 0x32, 0x31, 0xbf, 0xa5, 0x9e, 0xcd, + 0xc0, 0x83, 0x4e, 0x7b, 0x69, 0xc6, 0x3c, 0x90, 0xac, 0xd2, 0x90, 0x65, + 0x9c, 0x7d, 0x69, 0x62, 0x57, 0xc8, 0x81, 0xdf, 0x7a, 0x97, 0xbe, 0x7c, + 0xa1, 0x72, 0xd3, 0x5b, 0xa2, 0x5a, 0xb2, 0x99, 0xbf, 0x6f, 0x8d, 0xc5, + 0x46, 0x89, 0xad, 0x44, 0x8b, 0x43, 0x4f, 0xc6, 0x61, 0xb6, 0x25, 0x35, + 
0x50, 0x63, 0x83, 0xb3, 0x4c, 0x61, 0xab, 0xa2, 0x6b, 0x61, 0xc4, 0x39, + 0x4c, 0x42, 0x37, 0x90, 0x50, 0xb9, 0xb3, 0xb4, 0x47, 0x39, 0x38, 0x67, + 0xb1, 0x4f, 0x54, 0x6c, 0x51, 0x5e, 0x3e, 0x8e, 0xc7, 0xc6, 0x42, 0xb5, + 0x87, 0xc0, 0x6d, 0xc8, 0xd0, 0x9a, 0x6e, 0xc4, 0x7f, 0xd1, 0x98, 0x42, + 0x7e, 0x87, 0x3d, 0x43, 0x86, 0x44, 0x8b, 0xd4, 0x6c, 0xc1, 0xb9, 0x90, + 0x74, 0xb5, 0x6e, 0x56, 0x5b, 0xba, 0xc4, 0xc0, 0xaa, 0xab, 0x5b, 0x65, + 0x7c, 0x60, 0x89, 0xae, 0xcf, 0xc6, 0x42, 0x8c, 0xca, 0xc2, 0x91, 0x31, + 0x35, 0xab, 0x77, 0x64, 0x9f, 0x5a, 0x75, 0x60, 0x5f, 0x95, 0xa1, 0x86, + 0xa5, 0x3e, 0x5c, 0xd5, 0x6b, 0x85, 0x4f, 0xb3, 0x4b, 0x70, 0x4e, 0x98, + 0xc1, 0xc4, 0x78, 0x9d, 0x44, 0x90, 0x77, 0x84, 0xa4, 0x2b, 0xb7, 0x59, + 0x5e, 0x56, 0xbc, 0x44, 0xd1, 0x46, 0xd4, 0xe7, 0x5f, 0x44, 0xcb, 0x4b, + 0xd4, 0x7a, 0x8f, 0x8c, 0x95, 0x25, 0x66, 0xa9, 0xa8, 0x80, 0x4e, 0xc5, + 0x93, 0x74, 0x63, 0xcc, 0x82, 0x8e, 0xcc, 0xb1, 0x5a, 0xb5, 0x4c, 0x84, + 0x26, 0x38, 0xd4, 0x4e, 0x7b, 0xae, 0xb0, 0x2c, 0x30, 0x7b, 0xc6, 0xae, + 0xb8, 0x3e, 0x8f, 0xbc, 0x92, 0xb2, 0xc7, 0x4b, 0x37, 0xa0, 0x6f, 0x25, + 0x5d, 0xd9, 0xc3, 0x41, 0x35, 0xcd, 0x9c, 0x85, 0x42, 0x3d, 0xae, 0xa3, + 0xc7, 0xb3, 0x7f, 0x65, 0x5c, 0xbc, 0x59, 0x99, 0x63, 0x38, 0x32, 0x62, + 0x80, 0x3b, 0x7e, 0x5c, 0x67, 0x5f, 0xd3, 0x81, 0xaf, 0x57, 0x8b, 0xc7, + 0x6d, 0x6f, 0x57, 0x80, 0x9c, 0x5e, 0xc1, 0x76, 0x32, 0xaf, 0x88, 0x77, + 0xe6, 0x91, 0x25, 0x79, 0xc4, 0xa1, 0x7e, 0xa6, 0x2c, 0x7f, 0x87, 0x90, + 0x4b, 0x4b, 0x93, 0x76, 0xc9, 0x83, 0x46, 0xb2, 0x55, 0x9f, 0xa3, 0x71, + 0xca, 0x7e, 0x61, 0x8f, 0x8a, 0x7b, 0xab, 0x61, 0x54, 0x69, 0xca, 0xba, + 0x5e, 0x32, 0x67, 0x5f, 0x74, 0x32, 0x37, 0x83, 0x41, 0xb1, 0x2f, 0xbb, + 0x58, 0x3d, 0xbf, 0x74, 0xad, 0xcd, 0xa8, 0xbc, 0xc8, 0xa2, 0x47, 0x89, + 0x6a, 0x3f, 0xc1, 0x45, 0x92, 0xb8, 0x51, 0x40, 0xc3, 0xae, 0x72, 0x39, + 0x65, 0x34, 0xc9, 0x5a, 0xa4, 0xa4, 0x79, 0x85, 0xaf, 0x90, 0x88, 0x40, + 0x41, 0x41, 0xcc, 0x78, 0x68, 0x4c, 0xc7, 0x38, 0xc5, 0x52, 0x3b, 0x4f, + 0x57, 0x69, 0x8a, 0xcd, 0x38, 0x47, 0x40, 0xaa, 0x7e, 0x63, 0x66, 0xa3, + 0x84, 0xb6, 0x56, 0x72, 0x42, 0x91, 0x7d, 0x90, 0x87, 0x4c, 0x82, 0x31, + 0x5f, 0xcc, 0x38, 0x61, 0x88, 0xa1, 0x53, 0xbd, 0x8d, 0x75, 0xc5, 0xa6, + 0xac, 0x7b, 0x67, 0x8f, 0x39, 0x74, 0x34, 0xa8, 0x5a, 0x54, 0x6d, 0xc0, + 0xaa, 0x51, 0x89, 0x98, 0xe5, 0x8e, 0x43, 0x56, 0x45, 0x9b, 0x59, 0xa2, + 0xca, 0xa9, 0x66, 0x73, 0x37, 0x59, 0xbc, 0x86, 0x4b, 0xa9, 0x48, 0x5b, + 0xb9, 0x9d, 0x51, 0xae, 0xa6, 0x3b, 0xa6, 0xbe, 0x66, 0x61, 0x41, 0xbc, + 0x70, 0x9c, 0xd3, 0xc6, 0x41, 0xaa, 0x87, 0x71, 0x5b, 0x91, 0x44, 0x61, + 0xb4, 0xc8, 0x3b, 0xaf, 0x79, 0x3e, 0x3a, 0x6d, 0x9b, 0xd8, 0x5e, 0x4c, + 0x54, 0x92, 0x76, 0x54, 0xb1, 0x6a, 0xd0, 0x97, 0x38, 0x63, 0x6a, 0x78, + 0x6d, 0x6c, 0x7b, 0x82, 0x61, 0xcf, 0x82, 0x96, 0xbb, 0x6d, 0x54, 0x52, + 0x87, 0x7e, 0xc3, 0x4f, 0x5f, 0xd6, 0xd7, 0x49, 0xa3, 0xa1, 0xa0, 0x3a, + 0xbb, 0xb9, 0xbb, 0x92, 0xbb, 0x83, 0x67, 0x98, 0x9b, 0xdd, 0x6f, 0x83, + 0x41, 0x91, 0xa4, 0x65, 0x62, 0x6e, 0xce, 0x6a, 0x8e, 0x34, 0xb9, 0x3f, + 0x55, 0xb2, 0x54, 0x7b, 0x52, 0xce, 0xd9, 0x97, 0xa7, 0x93, 0x3a, 0x70, + 0x94, 0x86, 0x7d, 0xa7, 0x5d, 0x2f, 0x33, 0x8b, 0xb5, 0x71, 0x3a, 0xbb, + 0xa4, 0x36, 0xc8, 0xab, 0x3f, 0x99, 0x3f, 0x42, 0xaa, 0xc7, 0xa2, 0x86, + 0x75, 0x83, 0x44, 0xab, 0x5b, 0x82, 0x4b, 0x6c, 0x98, 0xca, 0xb3, 0x44, + 0xb3, 0x56, 0xb6, 0x78, 0x31, 0xbb, 0xca, 0x88, 0x38, 0xcc, 0x4f, 0x9a, + 0xc5, 0x79, 0x8e, 0xc0, 0x90, 0x9e, 0x83, 0x7f, 0xdd, 0x57, 0xaa, 0x4c, + 0x97, 0x9d, 0x54, 0x96, 0x47, 0xf0, 0x5c, 0xb5, 0xb2, 0xa8, 0x9b, 0x8b, + 
0x9b, 0x3e, 0x78, 0xae, 0x5a, 0xab, 0x54, 0x2f, 0xb7, 0x40, 0xd1, 0x6d, + 0xba, 0x34, 0x2c, 0x4b, 0x8b, 0x6c, 0x9d, 0xbb, 0x5c, 0x4d, 0x6c, 0xb0, + 0x91, 0x41, 0x69, 0x34, 0x64, 0x4b, 0x36, 0x62, 0x4a, 0x7a, 0x63, 0x9f, + 0x3d, 0x43, 0xa2, 0x63, 0xc1, 0x58, 0x7a, 0x38, 0x55, 0xa1, 0x35, 0x3a, + 0x9c, 0x99, 0xba, 0xc3, 0x5f, 0x72, 0x39, 0x5f, 0xa5, 0xbc, 0x64, 0x47, + 0x93, 0x76, 0x70, 0x9d, 0x84, 0x68, 0x72, 0x40, 0x90, 0x48, 0x8c, 0x5e, + 0x97, 0x78, 0x6e, 0x9d, 0x5d, 0x76, 0xc9, 0x7d, 0x6a, 0x74, 0xd9, 0xab, + 0x68, 0x65, 0x77, 0x9a, 0xaa, 0x78, 0x9d, 0x95, 0x52, 0x61, 0xcb, 0xad, + 0x67, 0x59, 0xb9, 0x29, 0x38, 0x47, 0xc2, 0xc9, 0xa0, 0x6d, 0x34, 0x47, + 0x6d, 0x68, 0x38, 0x8a, 0x69, 0x87, 0x80, 0xa6, 0x6b, 0x33, 0x91, 0xe1, + 0x6c, 0x50, 0xb8, 0x5b, 0xb2, 0xa9, 0xca, 0x99, 0x50, 0xd0, 0xb8, 0x3e, + 0xb1, 0x68, 0x9b, 0xa9, 0x78, 0x5b, 0x59, 0x8e, 0xd0, 0x66, 0x71, 0x76, + 0x75, 0x3f, 0x89, 0xd4, 0x73, 0x6b, 0x9f, 0x91, 0x8a, 0x38, 0xa4, 0x64, + 0xb0, 0x8a, 0xb8, 0x5c, 0xaf, 0xb7, 0xa4, 0x86, 0x44, 0x9a, 0xd9, 0x42, + 0x93, 0x37, 0xa7, 0x2b, 0x5e, 0x5b, 0xb6, 0x7e, 0xaf, 0xc5, 0xbd, 0x70, + 0x7a, 0xce, 0xcd, 0x37, 0xda, 0xd8, 0xd0, 0x69, 0x49, 0x8c, 0x85, 0x72, + 0x5c, 0x39, 0xbe, 0x46, 0x8d, 0x72, 0x81, 0xc1, 0x6c, 0x42, 0x45, 0x49, + 0x5d, 0x82, 0x6d, 0xbb, 0x79, 0xbe, 0xcc, 0xd5, 0x8a, 0x71, 0x52, 0x63, + 0xa1, 0x6d, 0xb7, 0xab, 0x38, 0xda, 0x3f, 0x5b, 0xa1, 0x70, 0x56, 0x37, + 0x77, 0xae, 0xc6, 0x37, 0x64, 0xa3, 0xc5, 0x5c, 0xce, 0xad, 0xa8, 0x9b, + 0x51, 0xca, 0x2f, 0xa6, 0x67, 0x34, 0xbe, 0x8d, 0xba, 0x8f, 0x71, 0xa3, + 0x45, 0xa4, 0x40, 0x72, 0xc4, 0xb1, 0xae, 0x9d, 0xa6, 0x3c, 0x77, 0x67, + 0xc3, 0xcd, 0xc9, 0x37, 0xb2, 0xd1, 0x39, 0x95, 0x3e, 0xc4, 0x56, 0xd8, + 0xc5, 0x4b, 0x69, 0x3e, 0xd0, 0xa3, 0x4a, 0x40, 0x94, 0x5c, 0x7e, 0xad, + 0x61, 0xbb, 0x54, 0x77, 0x8d, 0xa3, 0x5c, 0xcb, 0xa4, 0x63, 0xa5, 0xc6, + 0x8f, 0xb1, 0xc1, 0x3b, 0x41, 0x5d, 0x53, 0xbe, 0xc1, 0xbb, 0xbe, 0xc4, + 0x96, 0x71, 0xc4, 0x84, 0xb1, 0x6e, 0xa0, 0x91, 0xa6, 0x84, 0xad, 0x6c, + 0xa5, 0xa5, 0x5b, 0x7f, 0x45, 0x89, 0xbe, 0xb8, 0x7a, 0xb2, 0xc5, 0x6d, + 0x41, 0xc4, 0xa5, 0xb4, 0x70, 0x7b, 0x71, 0x46, 0x5c, 0x98, 0x54, 0x53, + 0xa8, 0x7b, 0xa8, 0x56, 0x46, 0x8e, 0x41, 0xae, 0xc5, 0x46, 0xab, 0x33, + 0x39, 0x39, 0x45, 0xb2, 0x47, 0x2c, 0x84, 0x3d, 0x80, 0x47, 0x97, 0x6c, + 0x4a, 0x8a, 0x98, 0x3e, 0x74, 0x74, 0x92, 0xd0, 0x89, 0x46, 0x81, 0x98, + 0x92, 0x70, 0xbb, 0x43, 0x85, 0x38, 0x87, 0xce, 0x72, 0x5a, 0x37, 0x4f, + 0x8f, 0x89, 0xd5, 0xa1, 0xc7, 0x7f, 0x5a, 0x8e, 0x9f, 0xd7, 0xb6, 0xc4, + 0x47, 0x79, 0x8d, 0x4f, 0x45, 0xcc, 0x4f, 0x72, 0xb1, 0x67, 0xa5, 0x7e, + 0x63, 0xb0, 0x57, 0x63, 0x2e, 0xa2, 0x37, 0xa9, 0x97, 0x8f, 0xbf, 0xbe, + 0xd1, 0x89, 0xbc, 0x6e, 0x55, 0xbc, 0xd5, 0x42, 0x48, 0x6c, 0xaf, 0x50, + 0xcd, 0x8a, 0x82, 0x3d, 0xa2, 0x8f, 0x3b, 0xaf, 0xac, 0xb7, 0x7d, 0x63, + 0x6c, 0x9c, 0xbd, 0xc4, 0x48, 0xcd, 0xcc, 0xbb, 0x63, 0xc7, 0x96, 0xa8, + 0x44, 0x59, 0x8a, 0x57, 0xa0, 0x5f, 0xb5, 0xb5, 0xc9, 0x42, 0x9d, 0x6b, + 0xb0, 0xb2, 0x7c, 0x87, 0xd2, 0xc9, 0x68, 0x84, 0x77, 0xbb, 0x49, 0xc9, + 0x79, 0x54, 0xbf, 0x8f, 0x91, 0x9e, 0x38, 0x5c, 0x52, 0xbe, 0xb3, 0x33, + 0x76, 0x5c, 0x5a, 0x96, 0xca, 0x5c, 0x93, 0x4d, 0x3e, 0x5f, 0x9d, 0xc7, + 0x8e, 0x71, 0xb7, 0xb2, 0x87, 0x3f, 0x63, 0x35, 0x7b, 0x41, 0x5f, 0x99, + 0x61, 0x4c, 0x34, 0x6d, 0xc9, 0x38, 0xb7, 0x9d, 0x8a, 0x68, 0x79, 0x87, + 0xcd, 0xaf, 0xcd, 0x35, 0xb4, 0xb5, 0xcb, 0x3a, 0xab, 0x9b, 0x66, 0x5d, + 0xc0, 0xc9, 0x9b, 0xc3, 0xd3, 0xb8, 0x3f, 0x7a, 0xa2, 0xd6, 0x66, 0x41, + 0x9d, 0x58, 0xaa, 0xd5, 0x7b, 0x91, 0xcf, 0x56, 0x9c, 0x3f, 0xc7, 0x39, + 
0x7a, 0xab, 0x90, 0x8a, 0x43, 0xb5, 0x43, 0xbe, 0x80, 0x5e, 0x9a, 0xa1, + 0xcf, 0x6c, 0x49, 0xc8, 0xa6, 0x68, 0x42, 0x52, 0x79, 0xc7, 0x7b, 0x46, + 0xa1, 0x7b, 0x65, 0x9a, 0xba, 0xa5, 0x7e, 0x6b, 0x62, 0x32, 0x68, 0x34, + 0x4d, 0x48, 0x43, 0x33, 0x44, 0xbc, 0xac, 0x43, 0x87, 0x8b, 0x66, 0x60, + 0x61, 0x41, 0x58, 0xaa, 0xc4, 0xa2, 0x96, 0x31, 0x9a, 0x50, 0xb5, 0xc5, + 0x21, 0xcf, 0xb2, 0x45, 0x4b, 0x81, 0x7f, 0xc1, 0x4b, 0xcb, 0x60, 0x63, + 0x6f, 0xa6, 0x36, 0xc9, 0x9c, 0x66, 0xd8, 0x41, 0xd4, 0x8c, 0xcd, 0x86, + 0xaa, 0xd4, 0x81, 0x4a, 0x78, 0x54, 0x52, 0x43, 0xc6, 0x65, 0xbf, 0x72, + 0x5d, 0xd4, 0x7b, 0x52, 0x94, 0x6c, 0xc1, 0xb7, 0x35, 0xc7, 0x82, 0x94, + 0x90, 0x4e, 0xd2, 0x53, 0xb4, 0x5e, 0xab, 0x45, 0x65, 0x84, 0x93, 0x51, + 0xca, 0x91, 0xbf, 0xb8, 0xcb, 0xa7, 0xb7, 0x56, 0x49, 0xa3, 0x57, 0x5c, + 0xaf, 0x88, 0xaa, 0x4b, 0x5b, 0x3e, 0x3c, 0x6d, 0x57, 0x52, 0xaf, 0xb2, + 0x64, 0xb2, 0x7f, 0x46, 0xbb, 0xbd, 0x8c, 0x6c, 0xcf, 0xa2, 0xc1, 0xa0, + 0x96, 0x7d, 0x94, 0x91, 0x58, 0x4e, 0x48, 0x3b, 0x4e, 0xb5, 0x31, 0x97, + 0x38, 0x44, 0x67, 0x9f, 0x34, 0x9c, 0x39, 0x9e, 0x8d, 0x97, 0x6d, 0x5d, + 0x4f, 0x92, 0xbf, 0x66, 0x38, 0x80, 0x9e, 0x7e, 0x5c, 0x84, 0x72, 0xce, + 0x5d, 0xc6, 0x3d, 0x97, 0x57, 0x35, 0xc0, 0xb6, 0xa9, 0x56, 0x85, 0xb7, + 0x3b, 0x9c, 0x5e, 0x88, 0x63, 0x9d, 0x9e, 0x8e, 0x5b, 0x7f, 0x42, 0x3a, + 0x60, 0xab, 0x52, 0x84, 0xb2, 0x50, 0x9e, 0x49, 0xd8, 0xca, 0x3e, 0x2f, + 0xb5, 0xa1, 0x88, 0x60, 0x76, 0x32, 0xc8, 0x4f, 0x65, 0x3b, 0xa1, 0xaa, + 0x96, 0x95, 0xcd, 0xbe, 0xc5, 0xc4, 0xb7, 0x7b, 0x37, 0x47, 0xa1, 0xce, + 0x85, 0xcd, 0x79, 0x6b, 0x46, 0xaf, 0x3f, 0x73, 0x72, 0x70, 0x74, 0x6a, + 0xcd, 0x35, 0x8a, 0x6e, 0x75, 0xba, 0xbd, 0xbc, 0x64, 0x71, 0x3e, 0x31, + 0x54, 0x8e, 0x49, 0x61, 0x37, 0x76, 0xbe, 0x8d, 0x5c, 0x64, 0x44, 0x3a, + 0x8e, 0x55, 0x79, 0x85, 0xa6, 0xd6, 0xb2, 0x96, 0x7d, 0x89, 0xa2, 0xa3, + 0xa8, 0xb1, 0x79, 0x5b, 0x61, 0x37, 0x60, 0x5a, 0x4c, 0xab, 0xb7, 0xc3, + 0xc5, 0x63, 0x54, 0x46, 0x9d, 0xbb, 0x2f, 0xc6, 0x72, 0xb4, 0xca, 0x57, + 0x76, 0x64, 0xb3, 0x76, 0x30, 0xc2, 0x76, 0x3b, 0xcf, 0xb4, 0x74, 0x9a, + 0xcf, 0x37, 0x55, 0x53, 0x39, 0xbd, 0x5b, 0x70, 0xd5, 0x48, 0x57, 0x52, + 0x4b, 0x56, 0x78, 0x39, 0x7b, 0x8d, 0x62, 0x9a, 0x49, 0x8b, 0xc3, 0x81, + 0x79, 0xc4, 0x72, 0xd0, 0x4d, 0x34, 0xcd, 0x81, 0xce, 0xb4, 0x3f, 0x8c, + 0x5b, 0x59, 0xa4, 0x7e, 0x53, 0x78, 0x2a, 0x2b, 0xc2, 0x6f, 0x95, 0xc7, + 0x3a, 0x3b, 0x72, 0x42, 0x99, 0x44, 0x48, 0x79, 0x7c, 0x7e, 0x68, 0x47, + 0x46, 0x4c, 0x3d, 0xb9, 0x71, 0x88, 0xbe, 0x4d, 0xb8, 0x8b, 0x5e, 0x80, + 0x9d, 0x72, 0xaf, 0x3c, 0x74, 0xcf, 0xaf, 0x7f, 0x3d, 0x5c, 0x41, 0x9f, + 0x5e, 0xa9, 0x98, 0xa9, 0x99, 0x6d, 0x43, 0x53, 0xd0, 0x52, 0x6e, 0x9a, + 0xc0, 0x75, 0x54, 0xb5, 0x3c, 0xc9, 0x47, 0xb9, 0x83, 0x3a, 0x2b, 0x47, + 0x99, 0xb7, 0x78, 0x8e, 0x9c, 0xcc, 0xa4, 0xce, 0xc1, 0x61, 0x66, 0x3f, + 0xb4, 0x9c, 0xc1, 0xb1, 0x59, 0x7b, 0x7b, 0xbe, 0x9c, 0xa4, 0x72, 0xa9, + 0xbe, 0xb3, 0x34, 0xbf, 0x6a, 0x41, 0xc8, 0x5d, 0x9f, 0xa2, 0x8a, 0xa2, + 0x6b, 0x6b, 0x38, 0x33, 0x85, 0x65, 0x33, 0x53, 0xd2, 0xaf, 0x9b, 0x68, + 0x46, 0x45, 0x5a, 0xc2, 0x47, 0x43, 0x76, 0x78, 0xbe, 0x92, 0xa8, 0xb6, + 0x67, 0x64, 0xae, 0x42, 0xcc, 0x7b, 0xae, 0xb0, 0x65, 0x55, 0xab, 0x94, + 0xb0, 0x42, 0x54, 0x6c, 0x51, 0x83, 0xb2, 0x36, 0x91, 0x61, 0x32, 0x99, + 0xbc, 0x31, 0xc1, 0x8a, 0x57, 0x61, 0xb8, 0xa0, 0x6c, 0xbc, 0xbf, 0x3a, + 0x86, 0x40, 0xa7, 0xb3, 0x5a, 0x56, 0x50, 0x56, 0x90, 0x54, 0xb9, 0x53, + 0x7d, 0x89, 0x6b, 0x91, 0x55, 0xa4, 0x66, 0x78, 0xae, 0x2f, 0x32, 0x6d, + 0x5a, 0xc5, 0xb5, 0xd5, 0x30, 0x61, 0xab, 0x93, 0x99, 0x4e, 0x58, 0xd5, + 
0x98, 0xbe, 0xce, 0x85, 0x4c, 0xcd, 0xbb, 0xbd, 0x39, 0x83, 0xb9, 0xcf, + 0x69, 0x57, 0x6d, 0x6c, 0x94, 0x3f, 0x9f, 0xc7, 0x9e, 0xc3, 0xca, 0x54, + 0xbd, 0x92, 0xbd, 0x72, 0x39, 0x60, 0x67, 0x6a, 0xcd, 0x4b, 0x2b, 0xa5, + 0xc8, 0x57, 0x4a, 0xac, 0x61, 0x39, 0x89, 0x62, 0xcc, 0x7d, 0xb8, 0x66, + 0xbd, 0xd8, 0xb8, 0x4f, 0xca, 0xba, 0x51, 0x68, 0xc0, 0x9d, 0xb3, 0x6d, + 0x6e, 0x44, 0x87, 0x39, 0x88, 0x6d, 0x49, 0x76, 0xcd, 0x70, 0x3e, 0x68, + 0x75, 0xaf, 0xb4, 0xac, 0xa3, 0x4e, 0x74, 0x53, 0x8f, 0x7c, 0x3c, 0x9e, + 0x4a, 0x35, 0xa4, 0x72, 0xd0, 0xa6, 0xc5, 0x52, 0x80, 0xad, 0x7b, 0x8c, + 0x50, 0xb6, 0xaa, 0x64, 0xd1, 0xce, 0xba, 0x23, 0xb0, 0x8c, 0xcc, 0x96, + 0x77, 0xc6, 0xa8, 0xab, 0xaf, 0xc2, 0xca, 0x72, 0x9a, 0xb6, 0xd3, 0xb3, + 0xc9, 0x97, 0xa9, 0x8f, 0xb1, 0xae, 0x8b, 0x63, 0xad, 0x75, 0x4d, 0x69, + 0xa9, 0x8c, 0x80, 0x51, 0x4b, 0x94, 0x89, 0xa9, 0x4d, 0x9f, 0x55, 0x66, + 0xb6, 0xa4, 0x33, 0xaf, 0xa2, 0xc7, 0x88, 0x5d, 0x74, 0xa3, 0x82, 0xa4, + 0x59, 0x9c, 0xcd, 0x68, 0xbc, 0xce, 0x8e, 0x58, 0xa6, 0xb7, 0x45, 0x50, + 0x7b, 0x9c, 0x70, 0x7e, 0xac, 0x74, 0x55, 0x45, 0x9f, 0x43, 0xa8, 0x9d, + 0x32, 0xb4, 0x81, 0x9a, 0x75, 0x79, 0x56, 0x67, 0x7f, 0xd8, 0xc0, 0x3f, + 0xca, 0x8a, 0x72, 0x52, 0xa5, 0x70, 0x5b, 0x54, 0x5a, 0x63, 0x3e, 0xaa, + 0x86, 0xd7, 0x7f, 0x4b, 0xd6, 0x57, 0xac, 0x62, 0xad, 0x4d, 0x96, 0x97, + 0x46, 0x89, 0xa5, 0x57, 0x33, 0x4a, 0xc9, 0x58, 0x6e, 0xc2, 0xb8, 0x9b, + 0x70, 0x83, 0x94, 0x3a, 0xcb, 0x7e, 0x8e, 0x8d, 0x8b, 0x47, 0x84, 0x3e, + 0x7f, 0x85, 0x36, 0xcd, 0x45, 0xa4, 0xb8, 0x48, 0x46, 0x39, 0xad, 0xb9, + 0x6a, 0x30, 0x48, 0x42, 0xad, 0x80, 0x51, 0xb7, 0xab, 0x5b, 0xa3, 0x71, + 0x48, 0x3d, 0x6b, 0xc4, 0x4f, 0xa5, 0x39, 0x92, 0xb3, 0xd1, 0x6d, 0x7d, + 0xae, 0x63, 0x78, 0x84, 0x7b, 0xaf, 0x9e, 0x25, 0x57, 0x98, 0xc7, 0x3b, + 0x3f, 0xb4, 0x6e, 0x62, 0x3a, 0xbc, 0xb3, 0x62, 0x36, 0x35, 0x66, 0x46, + 0x58, 0x37, 0xad, 0xbb, 0x37, 0xb9, 0x65, 0xc3, 0x96, 0x90, 0x52, 0x43, + 0xa4, 0xd5, 0x94, 0x88, 0x59, 0xd0, 0x8f, 0xc3, 0xbd, 0x74, 0x40, 0x54, + 0x94, 0x97, 0xd1, 0x5e, 0x69, 0x48, 0xa6, 0x82, 0xab, 0xc4, 0x96, 0xb3, + 0x5b, 0x5f, 0x6d, 0x2c, 0x81, 0xa9, 0xcf, 0x8e, 0x88, 0x66, 0x41, 0x57, + 0x3b, 0x7c, 0xb0, 0x3f, 0x36, 0x75, 0x96, 0x79, 0xbc, 0x7c, 0x5e, 0x50, + 0x6b, 0xa6, 0x84, 0xbd, 0x59, 0xce, 0xbe, 0x7d, 0x78, 0x68, 0xc7, 0x5c, + 0xb7, 0x4a, 0xa0, 0xb1, 0x74, 0xc5, 0xcf, 0xe0, 0x89, 0x60, 0x59, 0xd1, + 0x8a, 0x63, 0xad, 0x71, 0x6c, 0x66, 0xad, 0x46, 0x55, 0xa6, 0x51, 0xb2, + 0xc5, 0x85, 0x97, 0x40, 0xbb, 0xcc, 0x57, 0x33, 0xc2, 0x65, 0xd4, 0xa3, + 0x46, 0x3a, 0xb1, 0x4b, 0x6b, 0x52, 0x5e, 0xba, 0x4f, 0x6f, 0xb1, 0x8f, + 0xd6, 0xc5, 0xb7, 0x3d, 0x5c, 0xab, 0x30, 0x74, 0x92, 0xad, 0x52, 0x8c, + 0x7f, 0x9b, 0xa9, 0x33, 0x5c, 0x32, 0x42, 0x4d, 0xc3, 0x78, 0xb1, 0x8e, + 0x77, 0xa7, 0xb2, 0x52, 0x88, 0xaf, 0x50, 0xd2, 0x53, 0x8b, 0x57, 0x1c, + 0x68, 0x79, 0x65, 0x4a, 0x87, 0x41, 0xa7, 0xb4, 0x66, 0x92, 0x77, 0x6b, + 0xc2, 0xc9, 0x4f, 0xa2, 0x95, 0xa8, 0x3b, 0x69, 0x87, 0x44, 0xcd, 0xcf, + 0x95, 0xc3, 0xbf, 0x56, 0xad, 0x40, 0x4b, 0xc1, 0x7b, 0x95, 0xad, 0xab, + 0x59, 0x37, 0x5e, 0xa2, 0x91, 0x71, 0x6a, 0x36, 0x5a, 0xad, 0x49, 0x50, + 0xcf, 0xc7, 0x75, 0x5a, 0x3d, 0x9c, 0xbb, 0x8e, 0x87, 0xc5, 0xa2, 0x99, + 0x37, 0xae, 0x67, 0xd1, 0xbe, 0x60, 0x40, 0x5e, 0xa2, 0xaa, 0x73, 0x5d, + 0x80, 0xb9, 0xcc, 0xc0, 0xae, 0x66, 0x42, 0x7f, 0x6e, 0x66, 0xaf, 0x9e, + 0x7b, 0x8b, 0xbf, 0x3f, 0xca, 0xc2, 0x59, 0xbf, 0x95, 0x57, 0x62, 0x94, + 0x6b, 0x52, 0x37, 0x90, 0x5a, 0x7b, 0xbe, 0x70, 0x90, 0x44, 0x5d, 0x60, + 0xb3, 0x88, 0xc9, 0x8d, 0x37, 0x7b, 0xd0, 0x5e, 0x9b, 0x3d, 0x3b, 0xc4, + 
0x4a, 0x50, 0x6d, 0x9b, 0x54, 0xd1, 0x8c, 0x7a, 0xcc, 0xc5, 0x44, 0xbe, + 0xbc, 0x41, 0x3e, 0xb0, 0x94, 0x93, 0x36, 0xb0, 0x4d, 0xa8, 0x48, 0xc3, + 0x72, 0x73, 0x8d, 0x8b, 0x4c, 0x67, 0x4d, 0x53, 0x3c, 0xa7, 0x9b, 0x81, + 0x73, 0x4f, 0x68, 0x5e, 0x96, 0x7e, 0x6e, 0x37, 0x4d, 0x59, 0x9a, 0xc5, + 0xb5, 0xb2, 0x75, 0x77, 0xbb, 0xb6, 0x74, 0x5a, 0x40, 0xb3, 0x3d, 0x68, + 0xc0, 0xb9, 0xc5, 0x76, 0xcd, 0x56, 0x6a, 0x30, 0x33, 0x56, 0x68, 0x35, + 0x4d, 0xbd, 0x34, 0x8c, 0x7f, 0xb5, 0xad, 0x61, 0x5c, 0xba, 0x74, 0x9b, + 0xab, 0x65, 0x9a, 0x7a, 0x9c, 0xa1, 0x79, 0x99, 0xaa, 0xc3, 0xc6, 0x3e, + 0xc6, 0x95, 0x96, 0xa2, 0xb9, 0xc3, 0x9f, 0x91, 0x46, 0x9f, 0x90, 0x75, + 0x4c, 0xa2, 0x3e, 0x54, 0x4f, 0xab, 0x6a, 0x3c, 0x79, 0x77, 0x3c, 0x8c, + 0x3c, 0x6e, 0x60, 0x49, 0xca, 0xa8, 0x73, 0x6b, 0x34, 0x94, 0x4d, 0x76, + 0x87, 0x86, 0x95, 0x4f, 0x41, 0x4a, 0xab, 0x61, 0x4f, 0xc5, 0x82, 0x4d, + 0x31, 0x9a, 0xa2, 0x6a, 0x3e, 0xd2, 0x79, 0x8b, 0x65, 0x64, 0x33, 0xb0, + 0xcf, 0x8a, 0x6a, 0x3c, 0x3f, 0x9c, 0xd0, 0xc1, 0x6f, 0xae, 0xd1, 0x59, + 0x6e, 0x97, 0xd0, 0xb0, 0x93, 0x7d, 0xb0, 0x7a, 0xd2, 0xb3, 0x44, 0x46, + 0x4c, 0x4c, 0xaf, 0x6e, 0x99, 0xa0, 0x56, 0xb9, 0xc8, 0x97, 0x32, 0x72, + 0x9d, 0x46, 0x8b, 0x68, 0x88, 0x9a, 0x64, 0x4e, 0x77, 0xba, 0x56, 0x9f, + 0x58, 0xb1, 0x33, 0xc7, 0x71, 0x4f, 0x32, 0x77, 0x70, 0x95, 0xcb, 0xa5, + 0x3a, 0x60, 0xc9, 0xa9, 0x31, 0x8a, 0x4c, 0x86, 0xc2, 0x40, 0x35, 0xba, + 0x7a, 0x9d, 0xa5, 0x6a, 0x7b, 0x94, 0x98, 0x5a, 0xba, 0xa3, 0x92, 0x4f, + 0x83, 0xa5, 0x86, 0x96, 0x74, 0x36, 0x97, 0x58, 0x91, 0x4c, 0x76, 0xa0, + 0x7a, 0x77, 0x6f, 0xce, 0x42, 0x91, 0x92, 0x93, 0xb2, 0x69, 0x5c, 0x41, + 0xaf, 0xc6, 0x50, 0x99, 0xaa, 0x69, 0x5e, 0x7c, 0x59, 0xb9, 0x6b, 0x97, + 0x3e, 0x77, 0x3f, 0x67, 0x73, 0x32, 0x3f, 0x84, 0x34, 0x74, 0xa5, 0x4c, + 0x62, 0x6f, 0x77, 0x3e, 0x74, 0x55, 0x49, 0xa4, 0x56, 0x32, 0x6a, 0x4f, + 0x31, 0x5e, 0x72, 0xb7, 0x3f, 0xae, 0x9c, 0x6d, 0xb9, 0xbd, 0x42, 0xa5, + 0x7e, 0xd3, 0x49, 0xbf, 0x84, 0x6f, 0x78, 0x97, 0x42, 0xae, 0x3e, 0xaf, + 0x71, 0x4c, 0x50, 0xd1, 0x76, 0x35, 0x33, 0xcf, 0xce, 0xa6, 0x49, 0xbf, + 0x39, 0x40, 0x72, 0x30, 0xc8, 0x72, 0x8c, 0x66, 0x37, 0x4e, 0x57, 0xba, + 0x35, 0x6b, 0x85, 0x30, 0x51, 0x90, 0xd2, 0x7a, 0x82, 0xb4, 0x4b, 0xd3, + 0x43, 0x7b, 0xce, 0x7c, 0x78, 0xcc, 0xa0, 0xa9, 0xa7, 0x81, 0x4e, 0x66, + 0xcf, 0xbe, 0x42, 0x56, 0x6f, 0xc4, 0x53, 0xb6, 0x77, 0xc7, 0x8c, 0x50, + 0x81, 0xa3, 0x70, 0x9d, 0x68, 0x64, 0xa8, 0xbe, 0x91, 0x45, 0x48, 0xab, + 0xb3, 0x81, 0xab, 0xa4, 0x9e, 0x45, 0x64, 0x30, 0xba, 0x42, 0x80, 0xad, + 0x36, 0x84, 0xaf, 0x37, 0x9e, 0x6a, 0x56, 0xcf, 0x4c, 0x5d, 0xbf, 0x99, + 0x6f, 0x43, 0x60, 0xc2, 0xba, 0x7f, 0x6c, 0x88, 0xa0, 0x6c, 0x8c, 0x47, + 0x42, 0xc7, 0xba, 0xba, 0xd3, 0xc0, 0x3b, 0x7e, 0x33, 0x34, 0x3b, 0x9a, + 0xcb, 0xc4, 0x41, 0x71, 0x6a, 0xc4, 0x47, 0x92, 0xc4, 0xc1, 0xb8, 0x86, + 0x4c, 0xc5, 0x79, 0x50, 0xab, 0xc6, 0xce, 0xb6, 0xa3, 0xab, 0x3a, 0x69, + 0x96, 0x7a, 0xb2, 0xa5, 0xb7, 0xc1, 0x67, 0x5a, 0xcb, 0xbb, 0x7a, 0x34, + 0x96, 0xd0, 0x57, 0x61, 0xa6, 0x71, 0xca, 0xa4, 0xcb, 0xc3, 0xba, 0x56, + 0xb6, 0x87, 0x85, 0x3c, 0xc1, 0x55, 0xaa, 0x65, 0x64, 0x8d, 0xa7, 0xb8, + 0x82, 0xb8, 0x9d, 0xd0, 0x51, 0x89, 0xb8, 0x88, 0x78, 0x63, 0x52, 0x74, + 0x66, 0x33, 0x74, 0x8e, 0x9d, 0x6e, 0xd0, 0xac, 0x8d, 0x66, 0x4e, 0x84, + 0x5d, 0x4f, 0x5f, 0xb5, 0x8e, 0x54, 0x79, 0x91, 0x60, 0x4c, 0x4a, 0x3d, + 0x7b, 0x7a, 0x95, 0xad, 0xab, 0x8d, 0xb0, 0xb3, 0xa6, 0x44, 0xb4, 0x88, + 0x3c, 0xa0, 0x7b, 0x5d, 0x5b, 0x89, 0x4e, 0x36, 0x3f, 0x8c, 0x7f, 0xa0, + 0x69, 0x91, 0x86, 0xd1, 0xba, 0x7d, 0xbc, 0x45, 0x62, 0x68, 0xa1, 0x80, + 
0x8e, 0x8f, 0x85, 0x7f, 0x62, 0x97, 0x48, 0x3f, 0x3f, 0xc1, 0x98, 0xc4, + 0x7d, 0xb4, 0x4d, 0x34, 0xad, 0x78, 0x73, 0xa0, 0xb4, 0x5b, 0xb8, 0x48, + 0x97, 0x7d, 0xc7, 0xa2, 0xaf, 0x76, 0xb5, 0x80, 0x6f, 0xcc, 0x97, 0x48, + 0x4c, 0x7a, 0x65, 0x4e, 0xc4, 0xbd, 0x3b, 0xc0, 0x9a, 0x89, 0x8d, 0x70, + 0x3c, 0x67, 0xb6, 0x82, 0x50, 0xd1, 0x73, 0x39, 0x71, 0x47, 0xd2, 0x31, + 0x4e, 0xa5, 0x4a, 0x34, 0xa9, 0x67, 0xba, 0x8c, 0x7b, 0x70, 0xcc, 0x45, + 0x7d, 0x58, 0x6c, 0x7f, 0x7b, 0xc1, 0x3a, 0xad, 0x8b, 0x68, 0xaf, 0x90, + 0x53, 0xa0, 0x92, 0xc0, 0x86, 0x69, 0xc0, 0x44, 0x6c, 0x44, 0x4c, 0xbc, + 0xb7, 0x6c, 0xa0, 0x46, 0x59, 0x99, 0x59, 0xad, 0x39, 0x7d, 0x64, 0x9c, + 0x8a, 0xab, 0xa1, 0x80, 0x7d, 0x7c, 0x58, 0xca, 0x3d, 0x88, 0xba, 0xb9, + 0x34, 0x6c, 0x72, 0xd1, 0x90, 0x75, 0x90, 0xa8, 0x34, 0xc9, 0xa9, 0x9c, + 0xcd, 0x48, 0x38, 0xa2, 0xa3, 0x49, 0x8f, 0xba, 0x3c, 0xae, 0x2f, 0xc0, + 0x43, 0x5b, 0x81, 0x33, 0xac, 0xb4, 0x5a, 0x5a, 0xa7, 0x3d, 0x87, 0x54, + 0xbe, 0xc2, 0xb4, 0x3d, 0x83, 0x3f, 0x81, 0x7e, 0xaa, 0xc2, 0x7f, 0xb8, + 0x69, 0x31, 0x72, 0x7d, 0x41, 0x85, 0x6f, 0x54, 0x99, 0x43, 0x9a, 0xa6, + 0xc8, 0x96, 0x8d, 0x5c, 0x8d, 0x2f, 0x5e, 0x70, 0x32, 0xb2, 0x65, 0x78, + 0x9c, 0x86, 0xca, 0xd1, 0xcd, 0x69, 0x56, 0x90, 0x72, 0xd2, 0x3a, 0x5d, + 0xc7, 0x63, 0xa5, 0xb8, 0xaa, 0x47, 0x47, 0x8d, 0x9e, 0xc1, 0x60, 0xaf, + 0x55, 0xae, 0x43, 0x5d, 0xc8, 0x5c, 0xc9, 0xb7, 0x7c, 0xaf, 0x36, 0x44, + 0x3f, 0x9d, 0x56, 0x3b, 0x95, 0x8a, 0xa2, 0x3d, 0x5a, 0xbb, 0x41, 0xb9, + 0xa2, 0xd0, 0xa0, 0x89, 0x65, 0xb8, 0xbb, 0x4a, 0x3f, 0xa8, 0x63, 0xb1, + 0x91, 0x49, 0x97, 0x37, 0x65, 0x57, 0x5c, 0xa8, 0xb9, 0xc2, 0xc9, 0x53, + 0x3c, 0x3d, 0x80, 0xab, 0x94, 0x79, 0x6f, 0x68, 0x83, 0xc1, 0x71, 0x6a, + 0xca, 0x87, 0x9d, 0xbe, 0x4c, 0x5c, 0xbb, 0x49, 0x7d, 0x5d, 0x45, 0x44, + 0x9b, 0x31, 0x54, 0x6f, 0x5d, 0xc3, 0x68, 0x7d, 0x5c, 0x9d, 0x83, 0x5f, + 0xc6, 0xa9, 0x40, 0xae, 0x8b, 0x6d, 0x77, 0x81, 0xb9, 0x96, 0x6e, 0x7a, + 0x34, 0x5e, 0x9a, 0x7d, 0x33, 0xa3, 0x7a, 0x6d, 0xa2, 0x94, 0x4b, 0x6b, + 0x65, 0x59, 0x58, 0x62, 0x44, 0xbf, 0x88, 0xb1, 0xb3, 0x95, 0x96, 0xb6, + 0x5b, 0x57, 0x56, 0xa3, 0x64, 0x91, 0xd3, 0x7c, 0xb7, 0x67, 0x88, 0xbc, + 0x91, 0x75, 0x42, 0x6d, 0x35, 0x71, 0xa1, 0x59, 0x45, 0x7d, 0xba, 0x46, + 0xc5, 0x48, 0x96, 0xc9, 0x3b, 0xaa, 0x6c, 0xcd, 0x4a, 0x3a, 0x4b, 0x93, + 0x6a, 0xbf, 0x6a, 0xbc, 0xc5, 0xc1, 0xb6, 0x8f, 0x98, 0x5d, 0x3a, 0xa5, + 0x3d, 0x7b, 0xc3, 0x4b, 0x39, 0xc9, 0x72, 0xaa, 0x4e, 0x52, 0x6b, 0x3f, + 0x7b, 0x4d, 0xbc, 0x9b, 0xca, 0x6a, 0xb7, 0xa7, 0xc2, 0x4a, 0x4f, 0xb3, + 0x50, 0xaa, 0xb4, 0x36, 0x88, 0x94, 0x5b, 0x83, 0x58, 0x89, 0xa5, 0x6f, + 0xa4, 0xa2, 0x2f, 0x93, 0xba, 0x98, 0xca, 0x4b, 0x5c, 0x4d, 0xa1, 0x85, + 0x86, 0xb0, 0x3f, 0x5c, 0xc1, 0xa1, 0x9e, 0xb2, 0x98, 0x9d, 0x52, 0x37, + 0x81, 0xab, 0x61, 0x81, 0x75, 0x7c, 0x7d, 0x66, 0x35, 0x5c, 0x49, 0x43, + 0x44, 0x5a, 0xc9, 0x75, 0x56, 0x53, 0xad, 0xb1, 0x95, 0x8c, 0x94, 0xb0, + 0x3f, 0x4c, 0x2f, 0x5c, 0xce, 0xb7, 0x90, 0x40, 0x70, 0xa8, 0x45, 0x8e, + 0xb6, 0x98, 0x61, 0x3c, 0x81, 0x5f, 0xa8, 0x98, 0xad, 0x72, 0x50, 0x8b, + 0x7c, 0xbe, 0x78, 0x7b, 0x8d, 0xc8, 0x3b, 0xa1, 0x95, 0x9e, 0xd2, 0x3d, + 0xbd, 0xba, 0x52, 0xc5, 0x76, 0xbd, 0x79, 0x79, 0x7a, 0x6f, 0xc1, 0x62, + 0x50, 0x72, 0xbd, 0x8d, 0x56, 0x5d, 0x71, 0x69, 0xa4, 0x6c, 0x58, 0x31, + 0x8c, 0x56, 0x7a, 0x8a, 0x35, 0x80, 0xc3, 0xcf, 0xb6, 0x8b, 0x36, 0x72, + 0x7f, 0x90, 0x41, 0x60, 0x8f, 0x66, 0x3c, 0x72, 0xa5, 0x52, 0xa3, 0x33, + 0x93, 0x75, 0x7f, 0x71, 0xa9, 0x96, 0x60, 0x83, 0x38, 0xa9, 0x74, 0x62, + 0xb0, 0x35, 0x48, 0x3a, 0x68, 0x8a, 0xc0, 0x8d, 0x4a, 0xc6, 0xa2, 0xd0, + 
0xb0, 0xbe, 0x9b, 0x89, 0x5a, 0x49, 0x9d, 0x4d, 0x9b, 0xad, 0x3e, 0x61, + 0x60, 0x76, 0x48, 0x75, 0xa8, 0x45, 0xb3, 0x85, 0x9f, 0xb3, 0xa0, 0x61, + 0x9f, 0x80, 0x77, 0x3e, 0xcd, 0x5d, 0x91, 0x97, 0x87, 0x45, 0x8a, 0x7b, + 0x33, 0x35, 0x97, 0x9a, 0x37, 0x40, 0xa7, 0x5a, 0xaa, 0x3d, 0xa4, 0xa6, + 0x4b, 0x56, 0x99, 0x9d, 0x5f, 0x6b, 0x76, 0x3e, 0x42, 0x79, 0x73, 0x98, + 0x45, 0x35, 0x9d, 0x44, 0xa3, 0x4b, 0x9b, 0x50, 0x3d, 0x77, 0x81, 0x5e, + 0xaf, 0x9f, 0x49, 0x66, 0xaa, 0x4f, 0xa2, 0x6f, 0xb1, 0x7e, 0xcb, 0x41, + 0x8c, 0x97, 0x64, 0x63, 0x99, 0x4d, 0xcc, 0xd2, 0x3a, 0x81, 0xbe, 0x43, + 0xa8, 0x79, 0x45, 0x8d, 0x8f, 0xd1, 0xa6, 0xb6, 0xbb, 0x8e, 0x49, 0x73, + 0x9b, 0x42, 0x8c, 0x49, 0x74, 0xa3, 0x6b, 0x42, 0xd6, 0x9e, 0x66, 0x55, + 0x64, 0x75, 0x99, 0x31, 0xb4, 0x8e, 0xa7, 0x9d, 0x6a, 0x89, 0x86, 0x6f, + 0x9e, 0x88, 0x64, 0x79, 0x5f, 0xb6, 0x3c, 0x33, 0xb1, 0xa9, 0xc7, 0xa7, + 0x9c, 0x96, 0x63, 0x6d, 0xc3, 0x85, 0xa6, 0x87, 0xa9, 0xc2, 0x9f, 0x37, + 0x46, 0x39, 0x60, 0x51, 0xa6, 0xa4, 0xd0, 0x4d, 0x7d, 0x4b, 0x3a, 0xb7, + 0x63, 0x9f, 0x9e, 0x58, 0x95, 0x6f, 0x87, 0x6d, 0x47, 0x77, 0x3a, 0x63, + 0x31, 0xc8, 0x98, 0x52, 0x59, 0x7d, 0xd1, 0x3f, 0x40, 0x78, 0x8a, 0x92, + 0x62, 0x8d, 0x7f, 0xb7, 0xa4, 0xbe, 0x76, 0xb8, 0x76, 0xc1, 0xc8, 0x7a, + 0x4f, 0x47, 0x8a, 0x83, 0x9f, 0xc9, 0xa0, 0x32, 0x73, 0x3f, 0xad, 0x38, + 0x51, 0x45, 0x8a, 0xbc, 0x7d, 0x3d, 0x74, 0xc2, 0x91, 0x4b, 0x6d, 0xb8, + 0xa7, 0x6f, 0xab, 0x9c, 0x36, 0x9e, 0x65, 0xbb, 0x49, 0x4e, 0xc5, 0x43, + 0xaa, 0x8a, 0xa3, 0x84, 0x6a, 0x7d, 0xa7, 0xa3, 0x8d, 0x4d, 0x61, 0x3d, + 0x42, 0xc3, 0x6b, 0x38, 0x60, 0x8a, 0xa0, 0xaa, 0x57, 0x9c, 0x47, 0x65, + 0xb5, 0x8f, 0x5d, 0x75, 0x4b, 0x94, 0x6e, 0x9e, 0x4a, 0x59, 0xce, 0x85, + 0x6b, 0x3f, 0x82, 0x99, 0x93, 0xbd, 0xae, 0x40, 0xc9, 0x54, 0xc8, 0x4d, + 0x9c, 0x58, 0x47, 0x30, 0x2f, 0xd4, 0x8a, 0xab, 0x75, 0x51, 0x80, 0x6f, + 0x9e, 0x64, 0x4b, 0xa5, 0xbd, 0x41, 0xc5, 0xb6, 0x94, 0x50, 0x4a, 0x9d, + 0xb6, 0x98, 0xb6, 0xd1, 0x6e, 0x4f, 0xb4, 0xb4, 0xd1, 0x5b, 0x62, 0x99, + 0x75, 0x85, 0x75, 0x7b, 0x5b, 0x4e, 0x5f, 0x91, 0xa8, 0x50, 0xb4, 0x75, + 0x63, 0x3f, 0x6a, 0x4b, 0x5c, 0x3f, 0xc8, 0x7f, 0x45, 0x2a, 0x71, 0x33, + 0x4a, 0x51, 0x9c, 0xa3, 0x60, 0x7a, 0x46, 0x57, 0x42, 0xc4, 0x92, 0xac, + 0xb8, 0x7f, 0x6d, 0x69, 0x4f, 0x71, 0x4c, 0x9b, 0xbf, 0x97, 0xcc, 0xa4, + 0x8d, 0x43, 0x43, 0xac, 0xb2, 0xa3, 0xcc, 0x39, 0xc0, 0x31, 0xac, 0x42, + 0x97, 0x2a, 0x94, 0x36, 0x91, 0xc3, 0x4b, 0xab, 0x55, 0xdc, 0x80, 0x90, + 0xcb, 0x8e, 0x3f, 0x3b, 0x2f, 0xb3, 0xbd, 0xc2, 0xc1, 0x5c, 0x9c, 0xa6, + 0xcb, 0x5b, 0x56, 0xac, 0x3e, 0xbd, 0xbe, 0xad, 0xb3, 0x6e, 0xcc, 0x68, + 0x2f, 0xac, 0x8f, 0x7d, 0x89, 0x8c, 0xde, 0xad, 0x73, 0xcf, 0x50, 0xcc, + 0x46, 0x7b, 0x7e, 0xbc, 0x8b, 0x9f, 0x45, 0x9a, 0xbc, 0xc7, 0x50, 0x92, + 0x6a, 0x9a, 0xa3, 0x69, 0x77, 0x2f, 0xc0, 0xd3, 0xae, 0x45, 0xaf, 0x4f, + 0x92, 0x94, 0x76, 0xd1, 0x52, 0xc6, 0xae, 0xb8, 0xd0, 0x95, 0xb7, 0x8b, + 0xb3, 0x9d, 0x8d, 0x71, 0x65, 0x6a, 0xd9, 0x92, 0x55, 0x74, 0x8e, 0x9a, + 0x54, 0x53, 0x46, 0x5c, 0x9b, 0x12, 0x23, 0x5f, 0xa2, 0xe3, 0x46, 0x95, + 0xa4, 0x96, 0xa2, 0xbe, 0x7e, 0x8a, 0xc0, 0xb6, 0x84, 0x77, 0xbb, 0x96, + 0x75, 0x9f, 0xc7, 0x4a, 0x44, 0x8f, 0x53, 0x5e, 0x57, 0x59, 0x42, 0x99, + 0xc3, 0x81, 0x9b, 0x50, 0xd1, 0xb4, 0x57, 0xa4, 0x7f, 0x47, 0xd2, 0xc8, + 0x3d, 0xa7, 0x95, 0x71, 0x44, 0x52, 0x57, 0xaa, 0xcd, 0x53, 0x36, 0xa0, + 0x79, 0x73, 0xa9, 0x46, 0xc9, 0x9b, 0x49, 0xb4, 0x58, 0x62, 0x65, 0x59, + 0x63, 0x53, 0x54, 0xb8, 0xae, 0x43, 0x4a, 0x4a, 0x6e, 0xa7, 0x57, 0x8b, + 0xc2, 0xb7, 0xc9, 0x52, 0x77, 0x5a, 0x5b, 0xa1, 0xb8, 0x6b, 0x57, 0x5d, + 
0xd2, 0x77, 0xb6, 0xab, 0xb4, 0xbb, 0x45, 0x94, 0xbb, 0x6b, 0xae, 0x38, + 0x4f, 0xa9, 0xb1, 0xb4, 0xad, 0xa1, 0xcf, 0x4f, 0x67, 0x64, 0x8e, 0x6d, + 0x8f, 0x6c, 0xa5, 0xba, 0xa6, 0x39, 0x32, 0x42, 0x6f, 0x9b, 0xb3, 0x93, + 0x7f, 0xc1, 0x99, 0x2f, 0xda, 0x8e, 0xca, 0xcb, 0x9b, 0x58, 0x91, 0xcd, + 0xb6, 0x7f, 0x43, 0xac, 0x89, 0x95, 0x49, 0x8e, 0xaa, 0xb8, 0x44, 0x7e, + 0x5b, 0xae, 0xc3, 0x3e, 0x84, 0x8b, 0xcd, 0x40, 0xb2, 0x6e, 0x69, 0xba, + 0x77, 0x40, 0x62, 0x7a, 0xaa, 0x84, 0x30, 0x48, 0x4c, 0xc1, 0x77, 0x45, + 0x9d, 0x9f, 0xc8, 0xeb, 0x50, 0x75, 0xde, 0x8d, 0xb7, 0x92, 0xab, 0x86, + 0xa0, 0x8e, 0x6a, 0x4a, 0x64, 0xa3, 0x32, 0x75, 0x49, 0x5a, 0x9d, 0x7f, + 0xa4, 0x96, 0x71, 0xbf, 0x59, 0x91, 0x54, 0xc0, 0xbf, 0x2b, 0x34, 0x39, + 0x6e, 0xa0, 0x79, 0x55, 0x50, 0x64, 0x62, 0x8b, 0x4a, 0x5e, 0x60, 0x62, + 0xc5, 0x4a, 0xa3, 0xad, 0x8d, 0xa8, 0x35, 0x62, 0x91, 0x8a, 0x8d, 0xc7, + 0x5a, 0x39, 0xc8, 0x7a, 0x63, 0x6f, 0xb6, 0x71, 0xcd, 0x8c, 0x63, 0x8d, + 0x84, 0x6f, 0x92, 0x6b, 0x93, 0x74, 0xc7, 0xba, 0xbf, 0xc0, 0x65, 0x43, + 0x6a, 0x59, 0x82, 0x9d, 0x49, 0x86, 0x87, 0x45, 0x9c, 0x67, 0xc9, 0x65, + 0x89, 0xc7, 0x9e, 0x66, 0x49, 0x92, 0x69, 0x50, 0xa8, 0xad, 0x2f, 0xa9, + 0x6f, 0xa4, 0xae, 0x8a, 0x3c, 0x41, 0x50, 0x8a, 0x3c, 0xb7, 0x92, 0x68, + 0xb9, 0x4d, 0xd0, 0x5d, 0xb1, 0x87, 0x5b, 0x56, 0x48, 0xb9, 0xac, 0xc9, + 0xa6, 0x8e, 0x8a, 0x52, 0x7f, 0x5f, 0x68, 0xad, 0x7c, 0x93, 0x83, 0xc1, + 0x79, 0xa1, 0xa6, 0x35, 0xba, 0x6f, 0xa0, 0x5e, 0x8e, 0x6b, 0x64, 0xc6, + 0x39, 0xa0, 0xac, 0xb4, 0xc9, 0xad, 0x7b, 0xa5, 0xc7, 0x9e, 0xa4, 0x5e, + 0xb6, 0x94, 0x9d, 0xb0, 0x70, 0x87, 0xb3, 0xe1, 0x5e, 0x42, 0x44, 0x8e, + 0x86, 0x78, 0x51, 0x60, 0xd3, 0xc2, 0xcb, 0xc7, 0x52, 0x89, 0x55, 0xd0, + 0xb9, 0xbf, 0x7d, 0x3a, 0x58, 0x47, 0xb5, 0x3a, 0xd8, 0x71, 0x44, 0x72, + 0x51, 0x51, 0xca, 0x4f, 0xab, 0xd1, 0xa7, 0xa8, 0xb5, 0x92, 0xd4, 0x4a, + 0x5f, 0x6a, 0xb3, 0xc8, 0x4d, 0x42, 0xc7, 0x69, 0xae, 0xae, 0x4a, 0x48, + 0x37, 0x5d, 0x63, 0xc7, 0x43, 0xc2, 0x35, 0x3a, 0xc6, 0xb2, 0x3e, 0x3f, + 0xcc, 0x37, 0x66, 0xc2, 0x64, 0xac, 0x69, 0x87, 0xd0, 0x55, 0xc2, 0x82, + 0xb1, 0x8e, 0xcb, 0x8a, 0xc0, 0x8b, 0xa0, 0xc0, 0xa7, 0x2c, 0x85, 0xc9, + 0x6d, 0x93, 0xc1, 0x7c, 0x87, 0xcc, 0x75, 0x88, 0x46, 0x63, 0x6b, 0x55, + 0x52, 0x24, 0x6f, 0x9c, 0xc8, 0x54, 0x2e, 0x54, 0x59, 0x57, 0x7d, 0x48, + 0xb4, 0xa2, 0xb3, 0xa2, 0x6a, 0x49, 0xa2, 0x56, 0x6d, 0x94, 0x41, 0x93, + 0xa7, 0x4c, 0x92, 0x91, 0x9b, 0xb4, 0x7b, 0x98, 0x87, 0xaf, 0xc7, 0xb0, + 0x80, 0xbd, 0x8c, 0xa6, 0x8b, 0x93, 0xc1, 0xc5, 0xa2, 0x5d, 0xab, 0xa6, + 0x5e, 0x94, 0x41, 0xc9, 0xc1, 0xbd, 0x8b, 0xc9, 0xc6, 0xa4, 0x7c, 0x3a, + 0x47, 0xa7, 0xa3, 0x40, 0x7d, 0x53, 0xcc, 0x71, 0x8f, 0x69, 0x3d, 0xad, + 0xcc, 0x6a, 0x5d, 0x47, 0x6d, 0x43, 0x6b, 0x78, 0xb2, 0x54, 0xd6, 0xc8, + 0xce, 0x4a, 0xb4, 0x41, 0x91, 0x74, 0x77, 0x4e, 0x4e, 0x76, 0x37, 0x56, + 0xab, 0x5d, 0x31, 0x6a, 0x51, 0xdf, 0x49, 0x37, 0x91, 0x3b, 0xc8, 0x5f, + 0xab, 0x6a, 0xb6, 0xa5, 0xca, 0x25, 0x2e, 0x5c, 0x66, 0x78, 0x3e, 0x7a, + 0x47, 0x68, 0x9c, 0x8a, 0x98, 0x6f, 0x67, 0x74, 0x83, 0xa7, 0xa3, 0xaf, + 0x4a, 0xa8, 0x55, 0x7f, 0xc4, 0x94, 0x7b, 0xc5, 0x70, 0x77, 0xb4, 0xa9, + 0x5e, 0x56, 0x74, 0x88, 0x52, 0x3e, 0x6b, 0x6e, 0x43, 0x97, 0xcb, 0xa1, + 0xaf, 0x39, 0xb8, 0x9d, 0x55, 0xb0, 0xd0, 0x90, 0x71, 0x43, 0x98, 0x62, + 0x35, 0x9c, 0xa1, 0xd4, 0xa6, 0x6f, 0xb9, 0xb4, 0x59, 0xa7, 0xbe, 0xc9, + 0xd7, 0x96, 0x62, 0x42, 0xa9, 0x3b, 0x5d, 0x65, 0x52, 0x47, 0xc8, 0xaf, + 0x58, 0x8a, 0x5c, 0x63, 0x6d, 0x45, 0x5e, 0x83, 0x8d, 0x61, 0x32, 0x69, + 0x7c, 0xbd, 0x34, 0x2f, 0x41, 0x77, 0xcc, 0x9c, 0x5d, 0x43, 0x6c, 0x6e, + 
0xa1, 0x60, 0x60, 0x97, 0xb0, 0x7c, 0x31, 0x8e, 0x98, 0x6a, 0xa4, 0x44, + 0x81, 0xac, 0x62, 0x70, 0x55, 0x59, 0xa6, 0x6c, 0x9e, 0xb5, 0xd2, 0xb8, + 0xbb, 0x82, 0x3b, 0x65, 0xc7, 0x5f, 0x91, 0x75, 0x57, 0x64, 0x64, 0x7c, + 0x6e, 0xbe, 0xbb, 0xa5, 0xba, 0xb7, 0x5c, 0x3a, 0x51, 0x6d, 0x4d, 0xd0, + 0xac, 0x73, 0x48, 0xac, 0x99, 0x73, 0xc2, 0x44, 0x72, 0x7e, 0xae, 0x50, + 0xa4, 0xa6, 0xad, 0x4c, 0x65, 0xa6, 0x8c, 0xb2, 0x75, 0x62, 0x46, 0xa1, + 0xc4, 0xb8, 0x59, 0x83, 0xbd, 0xa1, 0xb6, 0x90, 0x86, 0x54, 0x58, 0xa4, + 0x7e, 0xa6, 0x78, 0x53, 0x4f, 0xe1, 0xc7, 0x5a, 0xb6, 0x77, 0xca, 0x59, + 0x90, 0x76, 0x96, 0x72, 0xa0, 0x5f, 0xb7, 0x98, 0x7b, 0x35, 0x57, 0xbe, + 0xce, 0x65, 0x6a, 0xb5, 0x70, 0xcf, 0xcc, 0xa4, 0x5f, 0x5d, 0x4d, 0xa1, + 0x7f, 0xa9, 0xb9, 0x41, 0x8f, 0xc1, 0x73, 0x6b, 0x98, 0xa2, 0x9d, 0x82, + 0x52, 0x2d, 0x87, 0x5e, 0x67, 0xc9, 0x93, 0xc9, 0x73, 0x8c, 0xd2, 0x65, + 0x49, 0x92, 0x3c, 0x84, 0xac, 0xd6, 0xa5, 0x82, 0x83, 0x81, 0x99, 0xb7, + 0xab, 0x98, 0xab, 0x9e, 0x87, 0x9e, 0x7b, 0x80, 0x5b, 0x69, 0x62, 0xc5, + 0x90, 0xaa, 0x69, 0x4e, 0xc4, 0x65, 0x72, 0x5e, 0x4f, 0x6c, 0x62, 0xc3, + 0x38, 0x6c, 0x6b, 0x4f, 0xbc, 0x75, 0xa8, 0xc6, 0x5d, 0x93, 0x91, 0xb3, + 0xa7, 0x83, 0xa8, 0xdb, 0x94, 0xb1, 0x82, 0x43, 0x6b, 0x41, 0xc4, 0x62, + 0x69, 0x9b, 0xb7, 0xbc, 0xca, 0x8a, 0x54, 0x94, 0x4c, 0x7c, 0xb9, 0x40, + 0x80, 0xa5, 0x7d, 0xae, 0x92, 0x55, 0xc8, 0x50, 0x75, 0xc7, 0x5b, 0xa9, + 0xbb, 0x8c, 0xc3, 0x4f, 0x48, 0x50, 0x52, 0x9d, 0x84, 0x31, 0x74, 0x73, + 0xc6, 0x9b, 0x9b, 0xc7, 0xb9, 0xa8, 0x2a, 0x85, 0x7e, 0x7b, 0x30, 0x67, + 0x64, 0x7a, 0x78, 0x98, 0xc6, 0x73, 0xa1, 0x65, 0x84, 0x90, 0x8d, 0x64, + 0x45, 0x5c, 0xb8, 0x89, 0x50, 0x4c, 0xa1, 0x6c, 0x58, 0xba, 0xbc, 0xaa, + 0x88, 0xa8, 0x96, 0x6c, 0xa3, 0x8a, 0xb4, 0xd1, 0xca, 0x42, 0xce, 0xab, + 0x4e, 0x82, 0xa6, 0x37, 0xbd, 0x79, 0x91, 0xb8, 0x91, 0x44, 0x8e, 0x3a, + 0x75, 0x33, 0xad, 0x91, 0x51, 0x90, 0x8b, 0x86, 0xba, 0x7e, 0x57, 0x52, + 0x5a, 0x40, 0xa0, 0x7a, 0x78, 0x9b, 0xad, 0x42, 0x9c, 0x71, 0xa0, 0xb5, + 0x6d, 0x94, 0xba, 0x7a, 0xa9, 0x4b, 0x41, 0x37, 0xb1, 0x4f, 0x7a, 0x66, + 0xb8, 0xb3, 0xb8, 0xa1, 0x97, 0x6f, 0xaa, 0x3e, 0x6a, 0x56, 0xc9, 0x61, + 0x3f, 0x65, 0xc4, 0x98, 0xa8, 0xd3, 0x9c, 0xa4, 0x9f, 0xa9, 0xd4, 0x4f, + 0x81, 0xcf, 0x4f, 0x74, 0x34, 0x4a, 0x5f, 0x88, 0xb7, 0xb3, 0x42, 0xcb, + 0xb0, 0x90, 0xbf, 0x45, 0xb7, 0x75, 0x70, 0x95, 0x8a, 0xc9, 0xcb, 0x7d, + 0x6f, 0xc0, 0x35, 0x8d, 0x6c, 0x38, 0xb9, 0x62, 0xc5, 0xaf, 0x62, 0x7f, + 0x86, 0x9c, 0x9e, 0xcb, 0xc2, 0xc8, 0x4c, 0x93, 0x68, 0xd0, 0x46, 0x6d, + 0xd2, 0xd2, 0x87, 0x9a, 0x82, 0x67, 0x3e, 0x6b, 0x7a, 0x56, 0x79, 0xca, + 0x66, 0x77, 0xa5, 0x8f, 0x7b, 0xb2, 0x9f, 0xd4, 0x9e, 0x6c, 0x48, 0xb5, + 0x4f, 0x60, 0x35, 0x8b, 0x77, 0x75, 0x44, 0x6b, 0x90, 0xbb, 0x64, 0x4a, + 0x68, 0x99, 0x39, 0x69, 0x76, 0x3b, 0x7b, 0xaa, 0xc2, 0x66, 0x40, 0xb9, + 0x95, 0x3b, 0x61, 0x70, 0xd5, 0x73, 0xd3, 0x98, 0xae, 0xa1, 0xbf, 0x49, + 0xbe, 0x2e, 0x50, 0x5e, 0xb2, 0x45, 0xc0, 0xb6, 0x81, 0x6a, 0x79, 0xa4, + 0x60, 0xb3, 0x85, 0x6e, 0x54, 0x5a, 0x54, 0x5b, 0x45, 0x57, 0x46, 0x95, + 0xa5, 0xc2, 0x9d, 0x9d, 0x98, 0x49, 0x84, 0x6f, 0x73, 0x72, 0xbc, 0xc7, + 0xd5, 0x5f, 0x8d, 0xbf, 0x5a, 0xbe, 0x3d, 0x49, 0x4e, 0x47, 0x4d, 0xb6, + 0xa5, 0xb8, 0x9d, 0x4d, 0x9d, 0x7a, 0xeb, 0xa8, 0x4c, 0x94, 0xad, 0xc4, + 0x92, 0x35, 0xd5, 0xad, 0x59, 0xba, 0xc4, 0xba, 0xc9, 0x97, 0x38, 0x3f, + 0x7f, 0x39, 0x74, 0x82, 0x72, 0xca, 0xc1, 0x55, 0x52, 0xc4, 0x33, 0x78, + 0x64, 0x2d, 0x25, 0xa8, 0x7d, 0xc8, 0xc2, 0xbc, 0xa8, 0x85, 0xd6, 0x68, + 0x71, 0x42, 0x84, 0x7d, 0xbc, 0x7a, 0x6c, 0x78, 0xd2, 0x72, 0x92, 0x3c, + 
0x85, 0xb5, 0xba, 0x54, 0x46, 0xc9, 0x35, 0x70, 0xab, 0x89, 0x99, 0x82, + 0x42, 0x76, 0x42, 0xc1, 0xd7, 0xbb, 0x70, 0xb9, 0x4c, 0xd6, 0xcb, 0x9c, + 0x2c, 0x4f, 0x3e, 0x64, 0x86, 0x47, 0x92, 0xc0, 0x35, 0xbf, 0x4b, 0x62, + 0xad, 0xc1, 0x36, 0x7d, 0xbf, 0xcb, 0xa1, 0xbd, 0x70, 0x43, 0x40, 0x9b, + 0x7d, 0x41, 0x36, 0x6e, 0x53, 0x1f, 0x8b, 0x6d, 0xc9, 0x5e, 0x40, 0x6c, + 0x5d, 0x3f, 0xd3, 0xa3, 0x74, 0xc9, 0x3e, 0x3d, 0x55, 0x5b, 0x76, 0x2e, + 0x64, 0x93, 0x47, 0xbb, 0xd3, 0x64, 0x57, 0x6c, 0xd0, 0x8f, 0xbf, 0xc0, + 0xb3, 0x3e, 0x92, 0x60, 0xb4, 0x8f, 0x48, 0x85, 0x4e, 0x47, 0xcc, 0x99, + 0xab, 0x45, 0x84, 0x83, 0x84, 0xa4, 0xb8, 0xb4, 0x39, 0xd2, 0x4c, 0xac, + 0xa9, 0x83, 0x99, 0x53, 0x61, 0xbd, 0x72, 0xa4, 0xaf, 0x86, 0xd2, 0xd4, + 0x7d, 0x29, 0x63, 0x9b, 0x5b, 0x42, 0x40, 0x39, 0xb2, 0x96, 0xbf, 0x6e, + 0x72, 0xb5, 0x7a, 0x32, 0x89, 0x88, 0xc2, 0x75, 0x8a, 0x73, 0x59, 0x31, + 0x5c, 0x53, 0x3d, 0x9d, 0x3b, 0xa6, 0x39, 0x38, 0x81, 0x86, 0xaa, 0xaf, + 0x4e, 0x47, 0x8e, 0x7e, 0x36, 0x5e, 0x96, 0xa9, 0xbb, 0xb7, 0x7c, 0x98, + 0xd2, 0xd9, 0xac, 0x7a, 0x53, 0xb8, 0x9e, 0x65, 0x6e, 0x45, 0x37, 0xa7, + 0x49, 0xc5, 0xa3, 0xd0, 0x74, 0x7e, 0x59, 0x3f, 0xbd, 0x94, 0x39, 0x37, + 0x40, 0x52, 0xa3, 0xa8, 0x38, 0x56, 0xb3, 0x96, 0xaf, 0x90, 0x36, 0x56, + 0x41, 0xad, 0x89, 0x42, 0xa4, 0xa2, 0x4d, 0xcb, 0x6f, 0x7f, 0x7d, 0xc6, + 0xc7, 0x9a, 0x8e, 0x5f, 0xaa, 0x41, 0xb8, 0x34, 0x3a, 0x69, 0xc8, 0x87, + 0x8c, 0x98, 0x37, 0x42, 0x76, 0x77, 0x76, 0x72, 0x3b, 0x3a, 0xd5, 0x8f, + 0xcd, 0x6b, 0x74, 0xc5, 0x45, 0x5b, 0xc1, 0x8a, 0x7e, 0x6a, 0x5a, 0x41, + 0x5e, 0x73, 0xcf, 0xc5, 0x62, 0x76, 0xb7, 0xa8, 0x91, 0x82, 0x38, 0x9a, + 0x65, 0x39, 0x83, 0x4d, 0xa3, 0xa0, 0xaf, 0x53, 0x81, 0x60, 0x60, 0x33, + 0x9d, 0x6c, 0xc8, 0x90, 0x54, 0xa8, 0xd2, 0xc7, 0x46, 0xad, 0x5c, 0x51, + 0x41, 0x41, 0x42, 0xba, 0xc2, 0x85, 0xba, 0x8b, 0x4b, 0x3f, 0x77, 0xca, + 0xa7, 0x70, 0x8f, 0x7f, 0xbc, 0x7f, 0xc8, 0x7a, 0xa8, 0x8f, 0xad, 0x68, + 0x6f, 0xb0, 0xc4, 0xc5, 0xc2, 0xa2, 0x5b, 0xcd, 0x6c, 0x52, 0x53, 0xca, + 0x5e, 0x9d, 0x9a, 0x4d, 0x7d, 0x3e, 0x79, 0x76, 0xd3, 0xa9, 0xc6, 0x97, + 0x8b, 0x65, 0xd4, 0xaf, 0x88, 0x98, 0xc3, 0xc9, 0x90, 0xb1, 0x68, 0x99, + 0x48, 0x32, 0xb5, 0x32, 0x4e, 0x99, 0x8f, 0xc3, 0x9b, 0x84, 0x4a, 0x5f, + 0x35, 0x44, 0x38, 0x99, 0x4b, 0x63, 0xa2, 0x35, 0xae, 0xc8, 0x72, 0x4a, + 0xb9, 0xb7, 0x93, 0xbf, 0x3e, 0xa2, 0x67, 0x4b, 0x87, 0x98, 0x33, 0xc3, + 0xc0, 0x6a, 0xc0, 0x60, 0x8e, 0x7c, 0x6b, 0x6c, 0x3a, 0xb3, 0x96, 0xb0, + 0x8e, 0x45, 0x9f, 0x9f, 0x35, 0x57, 0x4b, 0xc0, 0x3a, 0xa4, 0x92, 0x8f, + 0x64, 0xb0, 0x4d, 0x48, 0x7c, 0x6e, 0x69, 0xbc, 0x5d, 0x8d, 0x89, 0xcf, + 0x84, 0xc3, 0x90, 0xad, 0x53, 0xca, 0x4f, 0x95, 0x39, 0xd5, 0x64, 0xa7, + 0x70, 0x79, 0xd7, 0x31, 0xaf, 0x4f, 0x78, 0xbe, 0x53, 0x44, 0x97, 0x94, + 0x82, 0xbf, 0xb7, 0x5e, 0x7d, 0x83, 0x71, 0xc0, 0x64, 0x56, 0x9f, 0xd1, + 0xb4, 0x44, 0x33, 0x87, 0xa3, 0x5e, 0x8f, 0x3f, 0xbd, 0x51, 0x85, 0x61, + 0x6b, 0xa1, 0x42, 0x60, 0x39, 0xc8, 0x67, 0x71, 0xae, 0x92, 0xa0, 0x53, + 0x6b, 0x54, 0x6d, 0xd2, 0xd1, 0xd6, 0xa1, 0x43, 0xb1, 0xac, 0x6a, 0x97, + 0x84, 0xb4, 0xbd, 0x42, 0x84, 0x9b, 0xb1, 0x9f, 0x97, 0xaf, 0xc8, 0x39, + 0xac, 0x9b, 0xc6, 0x9b, 0xb7, 0x5a, 0x35, 0x47, 0x4a, 0x39, 0x66, 0xbf, + 0xb3, 0x5c, 0x45, 0xac, 0x5b, 0xbe, 0x51, 0x64, 0x52, 0x55, 0xcc, 0x47, + 0x82, 0x49, 0xbc, 0x48, 0x3b, 0xa9, 0xd2, 0xb1, 0xbb, 0xc5, 0x94, 0x58, + 0x53, 0x60, 0xa7, 0xb9, 0xbf, 0x38, 0x8f, 0xcd, 0x49, 0x39, 0xc0, 0x3c, + 0x74, 0xb0, 0x92, 0x58, 0xb0, 0x88, 0xa3, 0x40, 0x83, 0x70, 0x63, 0xa4, + 0xbc, 0x57, 0x8d, 0x41, 0x67, 0x41, 0x8b, 0x44, 0x51, 0xc1, 0x82, 0x9b, + 
0x54, 0xab, 0xd7, 0x7f, 0x53, 0x58, 0x4d, 0x52, 0x43, 0x53, 0x5f, 0xb2, + 0xac, 0x3d, 0xc8, 0x90, 0xb3, 0x76, 0x9b, 0x46, 0xd2, 0xb5, 0x8d, 0x56, + 0x48, 0x4d, 0x57, 0x85, 0x86, 0xb9, 0x69, 0xa5, 0xd2, 0xb2, 0xb4, 0xa0, + 0x34, 0xc3, 0x6f, 0xd1, 0x5e, 0x8c, 0xb1, 0x9a, 0x82, 0x37, 0xa1, 0x7a, + 0x52, 0x40, 0x88, 0x4a, 0x4a, 0x9d, 0x5a, 0xb3, 0x68, 0x99, 0x87, 0x71, + 0xb8, 0xcc, 0xa2, 0x7a, 0x55, 0x76, 0xd3, 0xc2, 0x46, 0x53, 0x3d, 0x40, + 0x7f, 0x86, 0x9f, 0x3e, 0xae, 0xbf, 0x3e, 0xae, 0x44, 0x5f, 0x5f, 0x60, + 0xb0, 0x38, 0x71, 0xbe, 0xa4, 0xa4, 0x8e, 0x83, 0xd5, 0xbc, 0x85, 0x88, + 0x62, 0xad, 0x44, 0x79, 0x56, 0x7d, 0x85, 0x53, 0xc8, 0x56, 0x89, 0x5a, + 0x8e, 0x63, 0x39, 0x5c, 0x60, 0x5f, 0x59, 0xa7, 0x40, 0x9e, 0x5a, 0x3c, + 0x6c, 0x9e, 0xb1, 0x83, 0x7f, 0x7f, 0x7e, 0xbd, 0x91, 0x70, 0x6a, 0x72, + 0x31, 0x93, 0x56, 0xc0, 0x8c, 0x9e, 0xa3, 0x56, 0x6f, 0xa4, 0x4d, 0x3b, + 0xae, 0x43, 0x8d, 0xcc, 0x35, 0x40, 0x82, 0xd2, 0xad, 0x74, 0xbb, 0x6d, + 0xa2, 0xcd, 0x65, 0xd9, 0x93, 0x6b, 0xc4, 0x67, 0x7d, 0xc5, 0xba, 0x7c, + 0x5b, 0x76, 0x96, 0xaf, 0x8f, 0x6b, 0xb6, 0x59, 0x54, 0xd4, 0x4f, 0x53, + 0x6e, 0xa5, 0xaa, 0x45, 0x52, 0x42, 0x5b, 0x64, 0x8f, 0x72, 0xc5, 0x67, + 0xc8, 0x3f, 0xbc, 0x96, 0x78, 0x7c, 0x95, 0xbf, 0x37, 0x7d, 0x4c, 0x5a, + 0xc1, 0x7f, 0xa5, 0xc0, 0x66, 0x6a, 0xb5, 0x98, 0x5b, 0x72, 0xd3, 0xcd, + 0x9d, 0xd2, 0xc0, 0xcd, 0xb5, 0x9a, 0xbf, 0x44, 0xb8, 0xd2, 0xc6, 0x4b, + 0x3d, 0x4b, 0xcb, 0x73, 0x79, 0xa3, 0x9e, 0x89, 0xb2, 0x9b, 0xcd, 0x8b, + 0xca, 0xba, 0x88, 0xa6, 0xcf, 0x4d, 0x8e, 0x35, 0x38, 0x55, 0xa6, 0x3d, + 0x5f, 0x51, 0x44, 0x98, 0x9c, 0x83, 0x88, 0x6b, 0x54, 0xb1, 0x61, 0x4e, + 0x6f, 0xc9, 0x85, 0x3f, 0x6f, 0x45, 0xa5, 0x6b, 0x78, 0x54, 0xab, 0x92, + 0x8e, 0xb6, 0x86, 0x32, 0x91, 0xb5, 0x67, 0xa8, 0x9b, 0x53, 0x5f, 0x53, + 0xa5, 0x78, 0x67, 0x3d, 0x65, 0x9b, 0xaf, 0x7e, 0x4a, 0x6f, 0xc6, 0x3e, + 0xb4, 0xc2, 0xc7, 0x69, 0x51, 0x60, 0xd2, 0x42, 0x83, 0x42, 0x84, 0x8a, + 0x64, 0xa0, 0x40, 0x61, 0x7e, 0x72, 0x66, 0x46, 0x54, 0x7c, 0x85, 0x34, + 0xa2, 0x36, 0xad, 0xcb, 0xbb, 0x9d, 0x8c, 0x4f, 0xcf, 0xd4, 0xa2, 0xc4, + 0xc9, 0xd0, 0x56, 0xa9, 0xb4, 0xa7, 0x70, 0x96, 0xac, 0x51, 0x3d, 0xcd, + 0xc5, 0xd0, 0x5a, 0x89, 0xaa, 0xa9, 0x82, 0xcd, 0x6a, 0xa2, 0x64, 0x74, + 0xa6, 0x3d, 0xab, 0x9b, 0x69, 0x64, 0x66, 0x33, 0xbc, 0xd2, 0x44, 0x49, + 0x39, 0x30, 0x8d, 0x9b, 0x53, 0x58, 0x69, 0x4d, 0x89, 0x91, 0x42, 0xd2, + 0x73, 0x7e, 0x36, 0x58, 0x3c, 0x79, 0xb6, 0x64, 0x6f, 0xce, 0x44, 0xb0, + 0x9a, 0xcb, 0x56, 0xc6, 0xa4, 0xb9, 0xa6, 0xa1, 0xbb, 0x6e, 0xae, 0x3d, + 0x79, 0x51, 0xa0, 0x79, 0xa3, 0xd8, 0xcc, 0xb9, 0x75, 0x86, 0xbd, 0x4a, + 0x71, 0x2e, 0xb2, 0xa9, 0xc4, 0x9e, 0x43, 0xa8, 0xb1, 0x63, 0x8c, 0xc4, + 0x4e, 0xcf, 0xc3, 0xa0, 0x78, 0x4a, 0x31, 0x5a, 0x56, 0x48, 0xb1, 0x70, + 0x39, 0x58, 0x50, 0x6e, 0xae, 0xb3, 0xa3, 0x72, 0x54, 0xc6, 0x92, 0xc6, + 0x6d, 0xca, 0xb1, 0xcd, 0x81, 0x90, 0xc6, 0x47, 0x84, 0xa5, 0x69, 0x2e, + 0x6e, 0xa4, 0x95, 0x96, 0x36, 0xba, 0x96, 0x5b, 0xc1, 0x71, 0x51, 0x41, + 0xc6, 0x61, 0xb1, 0x98, 0x61, 0xb7, 0xd1, 0x45, 0x96, 0xba, 0x35, 0x51, + 0x5e, 0xc4, 0xca, 0x81, 0x42, 0x4b, 0xc6, 0xa6, 0x54, 0x7a, 0x64, 0x8f, + 0xa6, 0xc4, 0x71, 0x89, 0xa2, 0x9d, 0x7f, 0x3e, 0xaf, 0x45, 0x47, 0xc3, + 0x58, 0xd4, 0x87, 0x93, 0xc2, 0x63, 0x64, 0x45, 0x7f, 0x44, 0x50, 0x8a, + 0x36, 0x72, 0xb9, 0x81, 0x96, 0x8d, 0x43, 0xb8, 0x53, 0x4d, 0x45, 0x86, + 0xc2, 0x60, 0x99, 0x38, 0x7a, 0x6c, 0x36, 0xcf, 0x82, 0xbd, 0x98, 0x64, + 0x64, 0xb8, 0x7b, 0xac, 0xac, 0xbd, 0x43, 0xb3, 0x8f, 0xa0, 0x8b, 0x70, + 0x72, 0x7c, 0xaf, 0xd9, 0xd0, 0x6b, 0x8e, 0x94, 0xc4, 0x6a, 0xc6, 0x57, + 
0x84, 0xc6, 0x92, 0xc3, 0xd3, 0x8e, 0xdb, 0xb5, 0x7c, 0x75, 0xa6, 0x77, + 0xa4, 0x62, 0x9e, 0x3e, 0xb9, 0x4d, 0x6d, 0x98, 0x9f, 0x96, 0xa2, 0x29, + 0xa2, 0x9f, 0x37, 0x98, 0xa6, 0xa2, 0x44, 0x62, 0x9c, 0xac, 0x7d, 0x77, + 0x57, 0xb2, 0xb6, 0x67, 0x33, 0x75, 0x7d, 0x4b, 0xc7, 0xa7, 0xb6, 0x4b, + 0x97, 0xb0, 0x44, 0xa3, 0x94, 0xbf, 0xcb, 0x63, 0xc7, 0x6e, 0x7f, 0x86, + 0x53, 0x81, 0x87, 0xd2, 0x35, 0x9d, 0x6b, 0x6b, 0x58, 0xb4, 0xad, 0x3e, + 0x9f, 0xb7, 0x51, 0x2f, 0xbb, 0x8f, 0xbe, 0x75, 0x40, 0x56, 0x43, 0x34, + 0x5f, 0x41, 0x4d, 0x73, 0x3e, 0x72, 0x5b, 0x69, 0xac, 0xbb, 0x98, 0x87, + 0xa6, 0x99, 0x79, 0xb5, 0x8d, 0xc9, 0xa9, 0x4e, 0x9d, 0xb1, 0x70, 0xa3, + 0x83, 0xa7, 0x6a, 0xcf, 0x3b, 0x7e, 0x88, 0xa1, 0x90, 0xb2, 0x67, 0xb9, + 0x8b, 0xab, 0x5c, 0xcf, 0xa4, 0x49, 0xaf, 0x5e, 0xb4, 0xa7, 0x96, 0x72, + 0xc1, 0xcb, 0x55, 0x8d, 0x30, 0xcb, 0x96, 0xc4, 0x56, 0x46, 0x56, 0x66, + 0x41, 0xb9, 0xc5, 0x3b, 0x85, 0x9a, 0x97, 0x76, 0x64, 0x6e, 0x7f, 0x7c, + 0xbc, 0x8c, 0x41, 0xc1, 0xa9, 0x3f, 0x6c, 0x35, 0x53, 0x4c, 0xb4, 0xc9, + 0x88, 0x53, 0xa2, 0x40, 0x7c, 0xc2, 0xa6, 0x88, 0x98, 0xb6, 0x6f, 0xa2, + 0xb9, 0x83, 0xa5, 0xbf, 0x7e, 0x75, 0x54, 0x67, 0x5a, 0x3a, 0xd2, 0xb1, + 0x8a, 0xbd, 0x35, 0x72, 0x5a, 0x97, 0xa9, 0x69, 0x7e, 0x32, 0x84, 0xa3, + 0x5f, 0xaa, 0x3f, 0x4b, 0xcc, 0x7c, 0x38, 0x82, 0x90, 0xc2, 0xc8, 0xa5, + 0x3c, 0x57, 0x57, 0x92, 0x85, 0x9c, 0x5d, 0x46, 0x21, 0xab, 0x95, 0x6a, + 0x76, 0x8e, 0x84, 0x88, 0x86, 0xc4, 0xce, 0xbd, 0xa6, 0x41, 0xd1, 0x8e, + 0xc6, 0x39, 0x71, 0x6c, 0x86, 0xca, 0x87, 0x68, 0x94, 0x79, 0x84, 0xbc, + 0x96, 0x5c, 0x55, 0xcc, 0x6d, 0x7e, 0x78, 0x31, 0xc9, 0xc0, 0x8d, 0x8b, + 0x61, 0x5c, 0x78, 0x3f, 0x9c, 0x7a, 0x39, 0x60, 0xc9, 0x5f, 0x59, 0xa9, + 0x31, 0xa5, 0x81, 0xa9, 0x61, 0x5b, 0x9c, 0x46, 0xc9, 0xaa, 0xdc, 0x47, + 0xc3, 0xa3, 0x67, 0x49, 0x8b, 0x97, 0x98, 0x90, 0xb0, 0xc9, 0x30, 0xb1, + 0x81, 0xb1, 0x42, 0x4a, 0xa9, 0xbc, 0x2b, 0x58, 0xb5, 0x3a, 0x77, 0xa7, + 0x43, 0x41, 0x66, 0xad, 0xc0, 0x4d, 0x5e, 0x44, 0xc0, 0x6a, 0x77, 0x8b, + 0xc0, 0x54, 0x42, 0x38, 0x73, 0x37, 0xa3, 0x5e, 0x5f, 0xcb, 0x60, 0x9a, + 0x74, 0xb9, 0xa9, 0x75, 0x78, 0xa9, 0x8b, 0xc5, 0xce, 0xbb, 0x78, 0x38, + 0x9d, 0x3c, 0x3a, 0x6d, 0x7a, 0x66, 0x5f, 0x57, 0xa4, 0xbe, 0x6d, 0x58, + 0xb3, 0xc2, 0x85, 0x75, 0x91, 0x7d, 0x30, 0x40, 0x89, 0x36, 0x5a, 0x47, + 0x9a, 0xa7, 0xc7, 0xc5, 0xc8, 0xb2, 0x7a, 0x97, 0xaa, 0x51, 0x3c, 0x54, + 0x62, 0x92, 0x34, 0x86, 0xa5, 0x39, 0x57, 0xca, 0x38, 0x5d, 0x57, 0x3d, + 0xc9, 0x61, 0x42, 0x96, 0x46, 0x74, 0x5b, 0xc4, 0xa0, 0xcd, 0x43, 0x78, + 0x61, 0x52, 0xb0, 0xcc, 0x92, 0x46, 0x48, 0xa6, 0x37, 0x2d, 0xb1, 0xc2, + 0xb9, 0x65, 0xb7, 0x78, 0x6d, 0x7d, 0x9a, 0xcf, 0xbd, 0x73, 0x54, 0x84, + 0xba, 0xaa, 0x2d, 0x3e, 0x67, 0x75, 0xa0, 0xc5, 0xc8, 0x6d, 0x3e, 0x2c, + 0x85, 0xb3, 0x5b, 0xbd, 0x43, 0x82, 0x7e, 0xd3, 0xc6, 0x64, 0xc0, 0xc3, + 0x4b, 0x50, 0x74, 0x7f, 0x59, 0x47, 0x84, 0x9d, 0x5d, 0x85, 0xce, 0x97, + 0x63, 0xb2, 0x49, 0x5e, 0xa0, 0x3d, 0x98, 0xb9, 0x9e, 0x3e, 0x77, 0x45, + 0xba, 0x5e, 0x83, 0xd3, 0x6e, 0x89, 0x47, 0x2e, 0xa3, 0xbf, 0x45, 0xb0, + 0xb5, 0x36, 0x76, 0xbd, 0x76, 0x81, 0x38, 0x8c, 0x56, 0xc7, 0x6c, 0x39, + 0x50, 0xb0, 0x77, 0xa1, 0x58, 0x9b, 0xd1, 0x9d, 0x99, 0x79, 0x9f, 0x9e, + 0xab, 0x84, 0xc7, 0x4e, 0x45, 0x6d, 0xce, 0xb1, 0xc5, 0xd6, 0x46, 0xac, + 0x5e, 0x9e, 0x39, 0x3e, 0xa9, 0xc8, 0x86, 0xc8, 0x95, 0xb6, 0x48, 0xc0, + 0x65, 0xa5, 0xbd, 0xaf, 0xc3, 0x70, 0xb3, 0xa4, 0x98, 0x56, 0xa9, 0x68, + 0x8a, 0x83, 0x52, 0x5d, 0x34, 0x9b, 0xb8, 0xc4, 0x3e, 0x55, 0x60, 0x3c, + 0xb8, 0x4e, 0x5f, 0xc7, 0xa3, 0x97, 0x77, 0x6d, 0x9f, 0x7d, 0x70, 0xa5, + 
0x58, 0x54, 0x78, 0xd1, 0x89, 0x97, 0xaa, 0x4c, 0xc3, 0x6d, 0x4b, 0x70, + 0x41, 0x83, 0x95, 0x6a, 0x8e, 0xa2, 0xb8, 0x3f, 0xae, 0x9a, 0xc0, 0x4e, + 0xa8, 0xb4, 0x95, 0x31, 0xab, 0x43, 0xab, 0xb9, 0xcc, 0x5a, 0x54, 0x6a, + 0x4d, 0xab, 0x48, 0xc2, 0xc4, 0x7e, 0xd4, 0xc5, 0x80, 0xcb, 0x36, 0x81, + 0x89, 0x90, 0x94, 0x46, 0x42, 0x90, 0x59, 0xa2, 0x91, 0x32, 0xca, 0x3e, + 0x5a, 0xc6, 0xb2, 0x75, 0x91, 0x6a, 0x80, 0x38, 0x45, 0x81, 0x90, 0x92, + 0x98, 0x77, 0x91, 0x73, 0x7b, 0x61, 0xb6, 0xc3, 0x75, 0xa4, 0x58, 0x80, + 0x3b, 0xcf, 0xba, 0x8f, 0x38, 0x4f, 0x89, 0x44, 0xb8, 0x92, 0xc3, 0x4e, + 0x3f, 0x58, 0x46, 0x71, 0x56, 0x4d, 0xcf, 0x85, 0x4b, 0x6a, 0xaa, 0x4e, + 0x54, 0xca, 0x6f, 0x81, 0x86, 0x51, 0x6f, 0x3d, 0xd2, 0x65, 0x94, 0x5b, + 0x69, 0xcc, 0x51, 0xdb, 0x99, 0x54, 0x57, 0x97, 0xa2, 0x3a, 0xb9, 0x94, + 0xa8, 0xbe, 0x8b, 0xa1, 0xc7, 0x9a, 0xbf, 0x34, 0xcd, 0x88, 0x46, 0x88, + 0x4f, 0x4d, 0x7c, 0xc0, 0xce, 0x88, 0x73, 0x99, 0x5a, 0x7e, 0x9a, 0x43, + 0x86, 0x89, 0x36, 0xd1, 0xa2, 0x74, 0x40, 0x33, 0x6f, 0xc4, 0xc5, 0xbf, + 0xbf, 0xbc, 0x5f, 0x63, 0x46, 0xa9, 0x4d, 0x77, 0x62, 0x5f, 0xc9, 0x7f, + 0x72, 0xcc, 0x9b, 0x6b, 0x62, 0x8f, 0x57, 0x71, 0x5a, 0x8c, 0x95, 0xb9, + 0x6d, 0x96, 0x86, 0x8f, 0x84, 0x7a, 0xbe, 0xdd, 0xc8, 0xab, 0xaf, 0x8a, + 0x60, 0xa0, 0x3a, 0x91, 0x75, 0x9c, 0x3b, 0x84, 0x60, 0x51, 0x35, 0x3b, + 0x79, 0x3e, 0xc2, 0x9d, 0x92, 0x91, 0x64, 0x64, 0x81, 0xda, 0x8c, 0x91, + 0x52, 0x47, 0x4e, 0x47, 0x7a, 0x39, 0x97, 0x97, 0xbe, 0x34, 0x74, 0xb3, + 0x77, 0x88, 0x4d, 0x5b, 0x3b, 0x89, 0xda, 0x4b, 0x58, 0xd0, 0x79, 0x44, + 0xa1, 0x56, 0xbc, 0x4c, 0x82, 0x73, 0x3a, 0x85, 0x40, 0x2a, 0x89, 0x7d, + 0x4e, 0xa1, 0x9b, 0x33, 0xbf, 0x84, 0x43, 0xcb, 0x44, 0x3b, 0xb9, 0x7a, + 0x38, 0x4f, 0x98, 0xd0, 0x43, 0xab, 0x95, 0x62, 0x52, 0x68, 0xa0, 0x8e, + 0x80, 0x8c, 0xb5, 0x62, 0x74, 0x81, 0x5d, 0xa0, 0x9d, 0x60, 0x55, 0xb1, + 0x79, 0x80, 0x72, 0x8b, 0x65, 0x56, 0xae, 0x88, 0x6b, 0xc3, 0x3c, 0x88, + 0xa5, 0x82, 0x93, 0xa8, 0x6c, 0x7d, 0x81, 0x3e, 0x95, 0x3c, 0x80, 0xb9, + 0x62, 0x88, 0xc6, 0xc3, 0x38, 0x7e, 0xc8, 0x57, 0x85, 0xb7, 0xac, 0xaf, + 0x99, 0x84, 0xc9, 0xb3, 0x70, 0x3a, 0xcc, 0x66, 0x4a, 0xcf, 0x87, 0x58, + 0x9b, 0x63, 0x94, 0x6f, 0xbe, 0x74, 0x9a, 0x51, 0xb9, 0xa1, 0x91, 0x67, + 0x39, 0x94, 0x50, 0xba, 0xc9, 0xb2, 0x3c, 0xcc, 0xb8, 0xb1, 0xa6, 0xba, + 0x91, 0x54, 0x6c, 0x90, 0x84, 0x62, 0x95, 0xbf, 0x9a, 0x86, 0x5d, 0x84, + 0x73, 0x51, 0x56, 0xa4, 0xb0, 0x97, 0x6c, 0x51, 0x68, 0x31, 0xc7, 0x6a, + 0xc5, 0x90, 0x72, 0x9d, 0x89, 0xb8, 0x55, 0x4f, 0xcf, 0x8b, 0xcd, 0xc6, + 0x96, 0x4f, 0x80, 0x3e, 0xa4, 0x96, 0x48, 0xc7, 0x93, 0x5b, 0xae, 0x73, + 0x34, 0x55, 0xc3, 0xac, 0x62, 0x5f, 0xce, 0x5b, 0x8a, 0x72, 0xc9, 0x70, + 0x6d, 0x8c, 0xcb, 0xa4, 0xa9, 0x2d, 0xb8, 0x88, 0xbc, 0x9a, 0x4b, 0xae, + 0x84, 0x43, 0x59, 0x43, 0x5c, 0x5b, 0xb1, 0x33, 0xd0, 0x75, 0xbb, 0xc6, + 0x82, 0x6f, 0xa8, 0xb3, 0xbd, 0x74, 0x7b, 0x54, 0x93, 0x6b, 0xbf, 0x73, + 0xb8, 0x54, 0xad, 0x68, 0xa7, 0xd1, 0x7e, 0xad, 0xbe, 0x3d, 0x43, 0xc6, + 0xc8, 0xa1, 0x76, 0xb9, 0x5c, 0xcf, 0x93, 0x6c, 0x5d, 0xb7, 0xa7, 0x78, + 0xad, 0xd1, 0xcb, 0xa1, 0x3e, 0xb5, 0x77, 0xc3, 0x5e, 0x39, 0xbf, 0xb1, + 0xc6, 0x62, 0xb9, 0xc8, 0x61, 0x5c, 0x85, 0x74, 0xa5, 0x5a, 0x3e, 0x50, + 0x3f, 0x84, 0x97, 0xbd, 0xb4, 0x74, 0x78, 0x9a, 0xa0, 0x72, 0xc4, 0x60, + 0x59, 0x96, 0x83, 0xa7, 0x41, 0x7d, 0xc9, 0x44, 0x62, 0xce, 0x4e, 0xac, + 0x4b, 0x79, 0x4f, 0xc5, 0x61, 0xa5, 0xb7, 0xb2, 0x66, 0x3e, 0xb5, 0x92, + 0x53, 0x6e, 0x6a, 0xa6, 0x33, 0xa3, 0x9a, 0x9c, 0x35, 0xc4, 0x7c, 0x67, + 0xd0, 0x7d, 0xc7, 0x90, 0x5c, 0xac, 0x43, 0x78, 0xce, 0xa7, 0x88, 0x39, + 
0xb5, 0x4b, 0xb4, 0x8d, 0xa4, 0x33, 0x60, 0xd2, 0x49, 0x9d, 0x9b, 0x6b, + 0x46, 0x63, 0xb3, 0xcd, 0xd7, 0xa1, 0xbe, 0xa7, 0x79, 0x56, 0x3f, 0x46, + 0xc4, 0x56, 0x79, 0x5d, 0xcd, 0x68, 0x38, 0x4b, 0x77, 0x70, 0xa5, 0xbd, + 0xc2, 0x48, 0xa8, 0x60, 0x8b, 0x37, 0xb0, 0x31, 0x75, 0xac, 0x7b, 0xd2, + 0xcb, 0x97, 0xb9, 0x52, 0xb6, 0x7d, 0xd6, 0x8c, 0x45, 0xbc, 0x5c, 0x9d, + 0xb5, 0xc5, 0xc2, 0x84, 0x6a, 0x57, 0xb7, 0x43, 0x28, 0x31, 0x4e, 0x88, + 0x49, 0xc9, 0xbb, 0xc0, 0x3f, 0x71, 0x3e, 0x63, 0x9f, 0x92, 0x57, 0x8d, + 0x7b, 0x94, 0xca, 0x44, 0x9d, 0x3c, 0x34, 0xcc, 0x64, 0xd3, 0x96, 0x5c, + 0x4e, 0xd5, 0x49, 0x92, 0xad, 0x6b, 0x3f, 0xa1, 0x30, 0xb9, 0xcc, 0xb5, + 0x59, 0x95, 0x5f, 0xc6, 0x7e, 0x71, 0x74, 0xbd, 0x67, 0x32, 0x59, 0xb4, + 0x9c, 0x4d, 0x94, 0x4d, 0xb7, 0x9a, 0xd2, 0x6a, 0x60, 0xb0, 0x40, 0x48, + 0x5a, 0x70, 0x3f, 0x86, 0x84, 0x9f, 0x53, 0xba, 0x99, 0x4f, 0x2f, 0x77, + 0x38, 0xd6, 0x92, 0x4b, 0xa7, 0x40, 0x3e, 0xaa, 0x61, 0xbb, 0x84, 0x32, + 0x8a, 0xa5, 0x4f, 0x67, 0x8f, 0xa8, 0x6c, 0xb0, 0x87, 0x5b, 0x85, 0x8d, + 0x9a, 0x9f, 0x5f, 0x6a, 0x8e, 0x8f, 0xb3, 0xb2, 0xc7, 0x5c, 0x7b, 0xcd, + 0x79, 0x91, 0x38, 0x9c, 0x66, 0x77, 0x6c, 0x71, 0xd7, 0x87, 0xb3, 0x76, + 0x4a, 0xc6, 0x98, 0x8c, 0xaa, 0x73, 0xaa, 0xac, 0x57, 0x9a, 0x99, 0x7c, + 0x50, 0x40, 0x60, 0x37, 0x45, 0x41, 0x4e, 0xc0, 0xc1, 0xc7, 0x9f, 0xa7, + 0xb7, 0xaf, 0x5e, 0x5c, 0x3b, 0xcd, 0x8d, 0xa5, 0x61, 0xa5, 0x45, 0x87, + 0x9e, 0x55, 0x9e, 0x4e, 0xa3, 0x92, 0xad, 0xac, 0x67, 0x98, 0xc7, 0x8d, + 0xb4, 0x69, 0xd1, 0x55, 0x98, 0xab, 0xa0, 0x5c, 0x69, 0x77, 0x7d, 0x74, + 0xd0, 0xd3, 0x5d, 0xc0, 0xd2, 0xc0, 0x8d, 0xd3, 0xa1, 0x34, 0xa4, 0x73, + 0x74, 0xc5, 0x9a, 0xb2, 0x95, 0xb9, 0x68, 0x72, 0x3b, 0x79, 0x99, 0x73, + 0x60, 0xb1, 0x56, 0x8c, 0x5d, 0x73, 0xa2, 0x9c, 0x28, 0xcd, 0x54, 0x41, + 0xb0, 0x84, 0xbb, 0x30, 0xb4, 0x49, 0xa9, 0x68, 0x2f, 0xa6, 0xb9, 0xa1, + 0x6a, 0xa3, 0xd9, 0x40, 0x44, 0x3f, 0x59, 0x82, 0x7b, 0x66, 0x36, 0x9b, + 0x4c, 0x62, 0xd3, 0x63, 0x95, 0x7e, 0x56, 0xb6, 0xb5, 0xa6, 0xb7, 0x80, + 0xc7, 0x76, 0x50, 0x63, 0xaa, 0x39, 0x80, 0x4a, 0x7d, 0xcb, 0xd6, 0xa6, + 0x43, 0xce, 0xb2, 0x4d, 0xa0, 0x59, 0x8b, 0x37, 0x37, 0x4c, 0x7d, 0x3e, + 0x75, 0x72, 0xab, 0x4f, 0xca, 0x57, 0xb6, 0x6d, 0xb6, 0x3e, 0x52, 0xda, + 0x72, 0xd0, 0x96, 0x59, 0x64, 0x68, 0x8d, 0xcb, 0x5b, 0xbc, 0x55, 0x86, + 0x5c, 0x66, 0x9d, 0xad, 0x58, 0xc4, 0x46, 0xc7, 0x4b, 0x65, 0xa8, 0xb4, + 0x61, 0x4a, 0xb6, 0x93, 0x4e, 0x6e, 0x89, 0x7c, 0x85, 0x68, 0xcf, 0x42, + 0x56, 0x86, 0x4e, 0xc7, 0x3b, 0x99, 0x3b, 0xbc, 0x7a, 0xba, 0xc6, 0x74, + 0xc2, 0x88, 0xd4, 0x3e, 0xc4, 0x36, 0x88, 0x81, 0x81, 0x9a, 0x50, 0x69, + 0xc0, 0x83, 0x66, 0x58, 0xa5, 0x77, 0x33, 0x95, 0x6a, 0x64, 0x77, 0xa3, + 0xb4, 0xbb, 0x35, 0x92, 0xa1, 0x76, 0x48, 0xb7, 0x74, 0x54, 0x72, 0x7d, + 0x76, 0x50, 0xab, 0x97, 0x53, 0xac, 0xcf, 0xb3, 0x32, 0x64, 0xc2, 0x84, + 0xaf, 0x84, 0x61, 0x52, 0x4f, 0xc0, 0x29, 0x57, 0x8e, 0x3f, 0xab, 0x59, + 0x98, 0x46, 0xb2, 0x51, 0x8e, 0x59, 0x85, 0xb6, 0xb0, 0xd2, 0xc5, 0xa4, + 0x9c, 0x46, 0xc5, 0xa2, 0x9f, 0xc8, 0xd5, 0x63, 0x55, 0xb8, 0x9a, 0x85, + 0x4d, 0xc9, 0xc4, 0x82, 0x52, 0x52, 0xbd, 0xc6, 0xbb, 0x63, 0xab, 0x59, + 0xa3, 0x44, 0x6a, 0x3e, 0x88, 0x98, 0xca, 0xa0, 0x63, 0x5c, 0x4e, 0x72, + 0xae, 0x5a, 0x3e, 0x8e, 0x45, 0xd6, 0x64, 0x9f, 0xa1, 0x5f, 0x5e, 0x65, + 0x9f, 0x7c, 0x44, 0xa5, 0x4d, 0xa9, 0x56, 0x66, 0x4d, 0x37, 0x52, 0xd4, + 0x31, 0x3e, 0xa9, 0x85, 0xb3, 0xa6, 0x84, 0x36, 0xb2, 0xd6, 0x48, 0x43, + 0x5f, 0xd1, 0x38, 0x99, 0x2d, 0x3c, 0x62, 0x37, 0xb3, 0x39, 0xb9, 0xa1, + 0xbb, 0x7b, 0x79, 0xa3, 0x3d, 0xbc, 0xa2, 0xb4, 0x73, 0x73, 0xb0, 0x9d, + 
0xcd, 0x62, 0x55, 0xb1, 0xcc, 0xc1, 0x61, 0xb7, 0x8b, 0x46, 0x7d, 0x62, + 0xc1, 0xc9, 0xd1, 0x42, 0x69, 0xc1, 0x69, 0xb2, 0x74, 0x9d, 0x9f, 0x67, + 0x88, 0x99, 0xb2, 0x65, 0x76, 0x92, 0x42, 0xce, 0x69, 0x7f, 0xb5, 0x65, + 0xb4, 0x85, 0x96, 0x56, 0xa6, 0x69, 0x5b, 0xb9, 0x52, 0xa3, 0xcb, 0x5b, + 0x67, 0xc2, 0x4c, 0x98, 0x69, 0x4b, 0x41, 0x3b, 0x5f, 0x5e, 0x66, 0x73, + 0x85, 0xa3, 0xa5, 0x93, 0x72, 0xa9, 0x96, 0x4a, 0xa2, 0x3c, 0x86, 0x79, + 0x74, 0x9a, 0x8f, 0x49, 0x63, 0x9c, 0x49, 0xa2, 0xca, 0x82, 0x37, 0x59, + 0x52, 0xc1, 0x87, 0x53, 0xd2, 0x92, 0xc6, 0x8f, 0xa1, 0x6c, 0x2f, 0x63, + 0x33, 0x63, 0xa9, 0x6f, 0x77, 0xd3, 0x46, 0x41, 0x5d, 0xa1, 0x78, 0xaf, + 0xa5, 0x86, 0x6d, 0xb3, 0x2f, 0x58, 0x3a, 0xa2, 0x55, 0x5f, 0x4c, 0x85, + 0x5d, 0x79, 0xb3, 0xac, 0xb7, 0xc9, 0x36, 0x56, 0xce, 0xa0, 0x49, 0x58, + 0x90, 0xb0, 0x96, 0xae, 0xbc, 0x9f, 0xbd, 0xcd, 0x8a, 0x5a, 0x41, 0x96, + 0x88, 0xcb, 0x64, 0x2d, 0xa9, 0xa7, 0x3c, 0x46, 0x4b, 0x40, 0xa1, 0x89, + 0x6f, 0xad, 0x8a, 0xb5, 0x58, 0xc3, 0xd2, 0x79, 0x7d, 0x92, 0x81, 0xd8, + 0xbb, 0x50, 0xba, 0x47, 0x37, 0xb5, 0x7c, 0x65, 0x59, 0xa1, 0x5f, 0xb7, + 0x9c, 0x2a, 0x4c, 0xb9, 0x75, 0xc4, 0x4c, 0x85, 0xaf, 0xb7, 0xaa, 0x8e, + 0xa9, 0x5e, 0x9b, 0xb6, 0xcf, 0x40, 0x3d, 0xa4, 0x3a, 0x91, 0xd1, 0x89, + 0xac, 0xb5, 0x79, 0x73, 0x53, 0xab, 0x5e, 0xc5, 0x74, 0x7c, 0x6d, 0x94, + 0x59, 0xa2, 0x59, 0x6d, 0x4e, 0xa5, 0xa9, 0xc4, 0xa2, 0x4e, 0xc4, 0xa6, + 0x3b, 0xa3, 0x59, 0xc1, 0xb9, 0x5a, 0x6d, 0xc4, 0xc2, 0x42, 0x66, 0x5c, + 0xb3, 0x44, 0x5d, 0x2f, 0xbe, 0x9a, 0x77, 0xcc, 0xce, 0x42, 0x8b, 0x3d, + 0xa2, 0x92, 0x71, 0x67, 0x36, 0xc3, 0x3f, 0x79, 0x53, 0x48, 0xa6, 0xb7, + 0xd7, 0x52, 0xa7, 0x4a, 0x64, 0x7f, 0xa0, 0x45, 0x3a, 0xc8, 0x95, 0x50, + 0x6e, 0xd4, 0x4d, 0x4f, 0x81, 0x66, 0xcc, 0x47, 0x76, 0xa1, 0xdb, 0xd9, + 0x52, 0x74, 0x91, 0xa8, 0x48, 0x66, 0x95, 0xa6, 0x4a, 0x98, 0x68, 0xa5, + 0xbc, 0x8f, 0xb1, 0x91, 0xc3, 0x47, 0x39, 0x8b, 0x52, 0x9f, 0x5b, 0xc3, + 0x80, 0x89, 0x85, 0x5c, 0x8a, 0x81, 0xd9, 0x5d, 0xba, 0x5c, 0xa8, 0xc8, + 0x97, 0xb7, 0x96, 0x44, 0xac, 0x46, 0xa6, 0xcf, 0x65, 0xc1, 0xcc, 0x4d, + 0x42, 0x34, 0x42, 0x33, 0xb7, 0x67, 0xb2, 0x5c, 0xcb, 0x3f, 0x44, 0x32, + 0x55, 0xa0, 0xca, 0x96, 0x57, 0x46, 0x60, 0x8e, 0x49, 0xce, 0x48, 0x6b, + 0xa6, 0x40, 0x90, 0xb7, 0x9d, 0x74, 0x73, 0xc8, 0x71, 0xbb, 0xb1, 0xaf, + 0xc8, 0x4b, 0x6e, 0x42, 0x48, 0x88, 0x69, 0x8c, 0xcb, 0x58, 0x5e, 0x3e, + 0x42, 0x77, 0xc2, 0x82, 0x9a, 0x7b, 0x39, 0xb3, 0xc6, 0xa8, 0x37, 0x47, + 0xbe, 0x47, 0x62, 0x7f, 0x37, 0xd3, 0x74, 0x9a, 0x83, 0xd3, 0xa6, 0x5c, + 0x68, 0xca, 0xc3, 0x65, 0xa1, 0x57, 0xa0, 0x3e, 0x91, 0x4c, 0x7d, 0xa9, + 0x50, 0xab, 0xcf, 0xb7, 0x8d, 0x7b, 0x5c, 0x62, 0x4a, 0x42, 0x5d, 0x5a, + 0x84, 0x9d, 0x85, 0x9f, 0x48, 0xc6, 0x5b, 0x3d, 0x4b, 0x2f, 0xc2, 0xd4, + 0x96, 0x52, 0x67, 0xc4, 0xce, 0x96, 0x96, 0x51, 0x43, 0x69, 0x90, 0x32, + 0xc1, 0x8c, 0xc3, 0x37, 0xc6, 0xc7, 0x7f, 0x6d, 0x5d, 0xc4, 0x4f, 0xa0, + 0x51, 0x34, 0x70, 0xca, 0x75, 0xbb, 0x8f, 0x53, 0x89, 0x6c, 0x90, 0x7f, + 0x88, 0xa4, 0x4f, 0x3d, 0x62, 0xa0, 0x43, 0xb4, 0x5a, 0x55, 0xc4, 0x4c, + 0x9e, 0x47, 0xac, 0x8a, 0x62, 0x26, 0x57, 0xa5, 0xb8, 0x47, 0xb8, 0x28, + 0x52, 0x3d, 0x2e, 0x4a, 0x46, 0xca, 0xb4, 0xd8, 0xdd, 0xa6, 0x96, 0x85, + 0x5e, 0xd3, 0x3b, 0x7f, 0x65, 0x3b, 0xad, 0xbe, 0x9b, 0xac, 0x74, 0x84, + 0xd2, 0x9c, 0x59, 0x9f, 0xc8, 0x49, 0x94, 0x7d, 0x60, 0x98, 0x89, 0x9b, + 0xb2, 0xb3, 0x4a, 0xc2, 0x58, 0xc8, 0xc7, 0x4d, 0xa0, 0xb8, 0x3f, 0xd1, + 0x71, 0x74, 0x48, 0xb1, 0xd3, 0x53, 0x49, 0xab, 0xba, 0x63, 0x4d, 0x62, + 0xb6, 0x9b, 0x7b, 0x8f, 0xd3, 0x7d, 0x8d, 0xcc, 0xcd, 0x45, 0x9a, 0x82, + 
0x5a, 0xbf, 0x55, 0x46, 0xcb, 0x79, 0x3d, 0x4c, 0xcf, 0x6f, 0x5c, 0x52, + 0x5b, 0x85, 0x6d, 0x3a, 0x59, 0x69, 0x8c, 0x58, 0xb8, 0xb9, 0xb9, 0x68, + 0x33, 0x55, 0xd1, 0x38, 0x9a, 0x6f, 0x73, 0x3f, 0xb2, 0x79, 0x5c, 0x7d, + 0x89, 0x99, 0x84, 0x89, 0x71, 0xd0, 0x44, 0xaf, 0x44, 0xb9, 0xca, 0xc4, + 0x96, 0x4a, 0x7c, 0x42, 0x38, 0x61, 0x94, 0xa8, 0xa1, 0x38, 0x3f, 0x92, + 0x8d, 0xaf, 0x8c, 0x7a, 0xcc, 0xab, 0x3f, 0x72, 0x73, 0xcc, 0x4f, 0x60, + 0x8f, 0x74, 0x42, 0xc9, 0x55, 0x9b, 0x3f, 0x5e, 0x49, 0xa8, 0x33, 0x4b, + 0x45, 0xb5, 0x8f, 0x93, 0xc1, 0x40, 0x50, 0x9d, 0x86, 0x3b, 0x90, 0xad, + 0xa8, 0x64, 0xcc, 0x99, 0x7e, 0x39, 0x67, 0x54, 0xc7, 0x6a, 0xaa, 0x72, + 0x8c, 0x74, 0x7b, 0x8c, 0x49, 0x48, 0x37, 0x6b, 0x67, 0x83, 0xbc, 0x46, + 0x3c, 0x9a, 0xcf, 0x80, 0xb4, 0xc5, 0xca, 0x82, 0xcc, 0x93, 0x78, 0x53, + 0xa7, 0x64, 0xa7, 0xa4, 0x9f, 0x91, 0xcc, 0x6e, 0xd3, 0x78, 0x97, 0x4c, + 0x90, 0xb2, 0xa5, 0x74, 0x56, 0x97, 0x55, 0x77, 0x4f, 0xc5, 0x90, 0xb0, + 0xb0, 0xab, 0xdd, 0xb8, 0x5b, 0x9c, 0x6c, 0xaf, 0x5b, 0x82, 0x88, 0x51, + 0xaa, 0x38, 0x7f, 0x6a, 0xad, 0x3e, 0x4b, 0x8f, 0x93, 0x37, 0xa2, 0x67, + 0x4a, 0xb7, 0x78, 0x35, 0xa9, 0xc5, 0x5b, 0x42, 0x31, 0x66, 0x4d, 0x55, + 0xc0, 0x5d, 0x85, 0x7b, 0x9c, 0x85, 0x95, 0x5b, 0xb3, 0xd8, 0x62, 0xa3, + 0xb8, 0x97, 0x75, 0x55, 0xce, 0x3f, 0x59, 0x94, 0x50, 0x80, 0x65, 0x58, + 0x95, 0x38, 0xb0, 0x71, 0x45, 0x5f, 0x7b, 0x69, 0x93, 0xb7, 0x59, 0x4c, + 0x60, 0x79, 0x67, 0xb5, 0x56, 0x41, 0xa4, 0x42, 0x77, 0x85, 0x9c, 0x3b, + 0x98, 0xc0, 0x72, 0x7b, 0xc2, 0x98, 0x52, 0xd7, 0x9c, 0x6f, 0x37, 0x7b, + 0xa2, 0x78, 0xa9, 0xa5, 0x83, 0xc7, 0xd2, 0xb2, 0x63, 0x97, 0xb8, 0x7a, + 0x63, 0x90, 0x88, 0xa7, 0xa6, 0x78, 0x37, 0x91, 0xb5, 0x31, 0xa5, 0x49, + 0x49, 0xbf, 0xca, 0x72, 0x98, 0x6e, 0x42, 0x6c, 0x96, 0x51, 0x33, 0xc1, + 0x98, 0x5e, 0x7f, 0x3b, 0x8d, 0xac, 0xc9, 0x89, 0x84, 0x4a, 0x77, 0xa0, + 0x7d, 0x69, 0xd1, 0x8d, 0x5c, 0x4d, 0xa9, 0xb2, 0x30, 0x3c, 0x4a, 0xb5, + 0xca, 0x6d, 0xcb, 0xba, 0x57, 0x73, 0x89, 0x61, 0x66, 0x3c, 0xa7, 0x5d, + 0xcc, 0xc5, 0x96, 0xbb, 0x6a, 0x9c, 0x9b, 0x70, 0xc1, 0x99, 0x4d, 0x78, + 0x76, 0x3f, 0xa1, 0x43, 0x47, 0x97, 0x61, 0x39, 0x5c, 0x91, 0xc8, 0x49, + 0xb5, 0x6b, 0xb3, 0xa2, 0xaa, 0xd1, 0x72, 0x47, 0xc5, 0x71, 0x52, 0x63, + 0xa1, 0x6b, 0xc9, 0xce, 0xa4, 0x73, 0xaa, 0x68, 0x75, 0xcf, 0x56, 0x7f, + 0x73, 0x76, 0xce, 0x3d, 0x73, 0xb4, 0x34, 0x87, 0xb1, 0x50, 0x70, 0x5d, + 0xcd, 0x37, 0x47, 0x4e, 0x66, 0xb7, 0x5b, 0x55, 0x2f, 0x3f, 0x35, 0x4e, + 0xc1, 0x43, 0xd3, 0x35, 0xc3, 0x65, 0xba, 0xa0, 0x87, 0xac, 0xaf, 0xa6, + 0x3d, 0x6d, 0x3b, 0x4b, 0xa6, 0xd2, 0x56, 0x9c, 0xa8, 0xa5, 0xbf, 0xa0, + 0xcc, 0xca, 0x67, 0x67, 0x89, 0x8c, 0xbe, 0x99, 0xa0, 0x6e, 0x37, 0x37, + 0x7a, 0xcd, 0xcc, 0x8e, 0xc0, 0xa8, 0x3c, 0x3e, 0xd6, 0x56, 0x63, 0x9c, + 0xbb, 0xb8, 0xaf, 0x99, 0xb8, 0xcf, 0xdd, 0x74, 0x74, 0x78, 0xa9, 0xcb, + 0xc9, 0x73, 0x91, 0x61, 0x81, 0x3c, 0x80, 0xa3, 0xae, 0xdb, 0x8b, 0xc2, + 0x76, 0xba, 0x92, 0xb9, 0xa4, 0x97, 0x3a, 0xbb, 0x70, 0x7a, 0xcd, 0xb3, + 0x49, 0x3c, 0xa7, 0x84, 0x64, 0xcc, 0xcf, 0x72, 0x74, 0x3a, 0x69, 0x34, + 0x3a, 0xa0, 0xc2, 0x7e, 0x58, 0x31, 0x3f, 0xaf, 0xd0, 0x76, 0x6a, 0xc4, + 0x6c, 0x67, 0xbe, 0x82, 0x5b, 0xa1, 0xbf, 0x3f, 0xc3, 0x8a, 0x44, 0x7a, + 0x38, 0x6b, 0xc6, 0x98, 0xc6, 0xa7, 0x5c, 0xa3, 0xa5, 0xbc, 0x4e, 0x3f, + 0x60, 0x77, 0xcd, 0x3b, 0xbc, 0x91, 0x42, 0xa5, 0x84, 0x82, 0xc2, 0xc4, + 0xc5, 0xc3, 0x5e, 0xbe, 0x79, 0xe0, 0x41, 0x45, 0x94, 0xa2, 0xad, 0x94, + 0x4e, 0xc1, 0x54, 0xc5, 0x48, 0xb8, 0xb9, 0xaf, 0x95, 0xc1, 0x63, 0xc5, + 0x35, 0x8f, 0x71, 0x3e, 0x86, 0x92, 0x75, 0x91, 0x52, 0xcf, 0xb9, 0xce, + 
0x87, 0x7b, 0x44, 0x70, 0xb7, 0x46, 0xd5, 0x9f, 0x3d, 0x36, 0x93, 0x6d, + 0x39, 0x9d, 0xd1, 0x69, 0xc4, 0x51, 0x3d, 0x96, 0x49, 0x30, 0xcc, 0x8c, + 0xd2, 0x3c, 0x48, 0x9a, 0x47, 0x7c, 0xb5, 0x79, 0x68, 0x78, 0x5f, 0xc3, + 0x4a, 0x36, 0xb2, 0xc7, 0x96, 0x8e, 0x3a, 0x80, 0x36, 0x66, 0xaf, 0xc4, + 0x8b, 0xd6, 0xb2, 0x38, 0x3f, 0x96, 0x6c, 0x6b, 0x51, 0xb4, 0x3a, 0x65, + 0x71, 0x4c, 0x46, 0x5f, 0x7b, 0x99, 0xd3, 0x54, 0xc2, 0x84, 0xc0, 0x77, + 0x92, 0x3b, 0xaa, 0x9f, 0xca, 0x46, 0x6c, 0x64, 0x9e, 0x64, 0xab, 0xac, + 0x97, 0x6f, 0xa5, 0x50, 0x7e, 0x80, 0x74, 0xc9, 0x54, 0x65, 0x58, 0xca, + 0xc7, 0x6a, 0x6a, 0x9c, 0x68, 0x4d, 0x8f, 0x70, 0x66, 0x48, 0xb4, 0xaa, + 0x49, 0x4c, 0x83, 0x4a, 0x36, 0x48, 0xa2, 0x88, 0x4a, 0x88, 0x97, 0xc8, + 0xa8, 0x69, 0x63, 0xbc, 0x58, 0x5d, 0x69, 0x62, 0xa4, 0xa4, 0x4d, 0xb1, + 0xa8, 0x7f, 0xb3, 0x8c, 0xd8, 0x94, 0x73, 0x64, 0x4c, 0x9f, 0xbc, 0x64, + 0x71, 0xdb, 0xd8, 0x58, 0xb7, 0xb8, 0x75, 0x8d, 0xbc, 0x7d, 0x80, 0x98, + 0x8b, 0x4a, 0x5a, 0x5e, 0xcf, 0x67, 0xbc, 0x4f, 0xcc, 0x98, 0xa8, 0x72, + 0x61, 0x53, 0x76, 0x3c, 0x77, 0x59, 0x44, 0x57, 0x59, 0xb4, 0x44, 0xb4, + 0x52, 0xa2, 0x43, 0x95, 0xc3, 0x96, 0x98, 0x5b, 0x64, 0x32, 0xbe, 0x5a, + 0xd6, 0x75, 0x2c, 0xad, 0x55, 0x37, 0x9f, 0x6a, 0xb1, 0x99, 0x6c, 0x4f, + 0xc0, 0x9c, 0x4e, 0x5d, 0x6e, 0xbe, 0x80, 0x69, 0xc5, 0x7e, 0x79, 0xcd, + 0x59, 0x9f, 0x38, 0xae, 0x61, 0xcc, 0x88, 0x3e, 0x66, 0xbc, 0x56, 0x92, + 0x3b, 0xb5, 0xad, 0x5c, 0x73, 0x39, 0x55, 0xc2, 0xb9, 0x2f, 0x83, 0xa8, + 0x61, 0x5a, 0x38, 0x6a, 0x8a, 0xb6, 0xc0, 0xa0, 0x7b, 0x8b, 0x56, 0x57, + 0x55, 0xd3, 0x79, 0x7e, 0x63, 0x92, 0xcb, 0xd6, 0x53, 0x8d, 0x6e, 0xbe, + 0x4b, 0xc6, 0x43, 0xd6, 0xd8, 0x8e, 0x68, 0x72, 0x5d, 0x67, 0x34, 0x4c, + 0x46, 0xb5, 0x86, 0xac, 0x74, 0x4e, 0x76, 0x49, 0x46, 0xc9, 0xae, 0x5f, + 0xd2, 0x7a, 0xc0, 0x89, 0x98, 0x4a, 0x62, 0xb2, 0xa6, 0x92, 0x96, 0x6d, + 0x88, 0x55, 0x4a, 0x79, 0xb9, 0x7c, 0x49, 0x38, 0x55, 0xc1, 0x4d, 0xd6, + 0x88, 0x72, 0xab, 0x74, 0xc9, 0x96, 0x4f, 0x60, 0xc2, 0x98, 0xc1, 0xd4, + 0x30, 0xa2, 0xa5, 0xd3, 0x3d, 0x72, 0xaa, 0x6b, 0x49, 0xaf, 0x31, 0x93, + 0x24, 0x54, 0x63, 0x78, 0xa6, 0xbd, 0x99, 0xe1, 0x8e, 0x5c, 0x78, 0xd8, + 0xa9, 0x41, 0x4f, 0xb0, 0xc1, 0x5b, 0x60, 0x7d, 0x3b, 0x91, 0x62, 0xb4, + 0x7d, 0xc9, 0x89, 0x95, 0x6c, 0x6c, 0x83, 0x8e, 0x51, 0x30, 0x46, 0x33, + 0x4f, 0x3e, 0x26, 0xbb, 0xb3, 0xd2, 0xcb, 0xce, 0x9b, 0x6b, 0x80, 0x59, + 0x87, 0x73, 0x74, 0x8f, 0x7b, 0x75, 0x3e, 0x42, 0x8f, 0x7c, 0x62, 0x88, + 0x7e, 0x84, 0x46, 0x71, 0x50, 0x78, 0x34, 0x70, 0xac, 0x79, 0xb8, 0x75, + 0x69, 0x9f, 0x48, 0x79, 0x66, 0x9e, 0x9c, 0x43, 0x4d, 0xc7, 0x3a, 0x7c, + 0x94, 0x59, 0xc4, 0x57, 0xa5, 0xa3, 0x51, 0xba, 0x82, 0x77, 0x51, 0x7e, + 0x74, 0xe7, 0xc0, 0x8a, 0x91, 0x80, 0xbb, 0x60, 0xa8, 0x83, 0x5f, 0xa0, + 0x5a, 0x31, 0x54, 0x6f, 0x78, 0xb5, 0xc3, 0x67, 0xaa, 0x99, 0xc0, 0x97, + 0x83, 0x4b, 0x6a, 0x4e, 0x58, 0x99, 0x6b, 0x8c, 0x6d, 0xb7, 0x38, 0xbd, + 0xa9, 0x9e, 0xbe, 0xbc, 0x50, 0xb9, 0x44, 0x8a, 0x94, 0x8e, 0x6e, 0xce, + 0xc8, 0x7d, 0x94, 0x97, 0xad, 0x53, 0x3e, 0x9c, 0xc7, 0x39, 0xd7, 0xaf, + 0x8e, 0x74, 0x90, 0x6d, 0x94, 0x47, 0xd7, 0x88, 0x5a, 0xb5, 0x50, 0xe2, + 0xbf, 0x3b, 0xaa, 0x98, 0xb4, 0x91, 0xc6, 0x21, 0xb7, 0xd0, 0x77, 0x58, + 0x5c, 0xb8, 0x3d, 0x67, 0x7e, 0x6d, 0x81, 0xc2, 0x67, 0x9f, 0xd2, 0xd3, + 0xa7, 0xc4, 0x75, 0x34, 0x9a, 0x70, 0x87, 0x98, 0xc7, 0xaa, 0xd4, 0xcc, + 0xc6, 0x94, 0xb2, 0x57, 0x91, 0x3f, 0x78, 0xbc, 0x71, 0x82, 0x38, 0x9e, + 0x7a, 0x43, 0x7b, 0xc6, 0x70, 0x7c, 0x5a, 0xa8, 0x71, 0x8a, 0xb8, 0x98, + 0x93, 0x63, 0xca, 0xcb, 0xb8, 0x9c, 0x74, 0x56, 0x47, 0xce, 0xca, 0x39, + 
0xcd, 0x76, 0x36, 0x5d, 0x58, 0xa5, 0xb1, 0x9a, 0xac, 0x5e, 0xa2, 0xba, + 0x31, 0x62, 0x65, 0xa4, 0x87, 0x72, 0xc6, 0x65, 0xc1, 0xcb, 0x8d, 0x7e, + 0x78, 0x81, 0x60, 0xaa, 0x5d, 0x7d, 0xb9, 0xb4, 0x45, 0x80, 0xa5, 0xda, + 0xac, 0xb3, 0x89, 0x43, 0x92, 0x84, 0x36, 0xb3, 0x43, 0x95, 0x91, 0xb1, + 0xc0, 0x30, 0xb5, 0x33, 0x86, 0xce, 0x43, 0x52, 0xa1, 0xc7, 0x54, 0xb1, + 0x6b, 0x89, 0xcf, 0x74, 0xb8, 0x92, 0x92, 0x87, 0x41, 0x46, 0xa0, 0x6f, + 0x48, 0x63, 0x5a, 0x36, 0x8d, 0x38, 0x91, 0x66, 0x47, 0xc9, 0xc2, 0x37, + 0xd1, 0xa5, 0x95, 0x85, 0xd3, 0x4c, 0x74, 0x36, 0xd1, 0x90, 0x69, 0x74, + 0x62, 0xc1, 0x6d, 0x67, 0x5f, 0x6b, 0x59, 0x58, 0x92, 0x67, 0x5e, 0x73, + 0x86, 0x62, 0x58, 0x69, 0x30, 0xa6, 0x7e, 0x5e, 0x90, 0xc4, 0x98, 0x49, + 0x34, 0xac, 0x70, 0xba, 0x9d, 0x9b, 0x51, 0x90, 0xc4, 0x34, 0x6d, 0x9d, + 0xa9, 0x6d, 0xa7, 0x7a, 0x8d, 0x76, 0xc4, 0xc9, 0xae, 0x54, 0xc4, 0x92, + 0x50, 0xba, 0x7d, 0xab, 0xd4, 0x8d, 0x64, 0x61, 0x3c, 0x35, 0x61, 0xa1, + 0xc9, 0xa8, 0x86, 0xa4, 0x5c, 0x70, 0x73, 0x9f, 0x30, 0x4c, 0x47, 0x30, + 0xb7, 0x84, 0x8a, 0x94, 0x4b, 0xc5, 0xad, 0x34, 0x9d, 0x83, 0xd0, 0xcd, + 0x72, 0x79, 0x37, 0xbc, 0xb2, 0xa6, 0x42, 0xb5, 0x9d, 0xba, 0xe0, 0x55, + 0xc1, 0xdf, 0x61, 0x51, 0x34, 0x55, 0x7b, 0xab, 0xb8, 0xbc, 0xb7, 0xbe, + 0xcf, 0xc6, 0xa9, 0xc4, 0xb6, 0x9f, 0x55, 0x79, 0x91, 0xd2, 0xda, 0x8c, + 0xc3, 0x3c, 0xc7, 0x43, 0x84, 0xb7, 0x47, 0x6e, 0x57, 0xd5, 0xe3, 0xcb, + 0xd5, 0x49, 0x85, 0xce, 0x83, 0x70, 0x33, 0xd2, 0x46, 0x56, 0xda, 0x7a, + 0x8c, 0x51, 0xac, 0xda, 0x27, 0x34, 0x7b, 0x7f, 0x88, 0x72, 0xc6, 0x7e, + 0x3b, 0x94, 0xa2, 0x3f, 0x76, 0x7f, 0x73, 0xcc, 0xbc, 0x58, 0x98, 0xb7, + 0x81, 0x34, 0x67, 0xbc, 0x3b, 0xb0, 0x29, 0xb3, 0x48, 0x82, 0x7f, 0xc1, + 0x80, 0x93, 0x99, 0xbf, 0xae, 0x4f, 0xca, 0xd9, 0x76, 0x5b, 0xab, 0xb8, + 0x7b, 0x86, 0x45, 0x3d, 0x78, 0x30, 0xa8, 0x41, 0x99, 0x65, 0xc0, 0xbe, + 0x96, 0x64, 0xc9, 0x4c, 0x31, 0x96, 0x64, 0x6f, 0xd3, 0xc6, 0x36, 0x76, + 0x5c, 0x4c, 0x30, 0xae, 0x4e, 0x3d, 0x98, 0x40, 0x8a, 0x88, 0x7e, 0xa3, + 0x7a, 0x91, 0xc1, 0x97, 0x61, 0x59, 0xa3, 0x4b, 0x4e, 0x99, 0x73, 0x2f, + 0xa3, 0x5d, 0xb8, 0xa9, 0x3e, 0x77, 0x54, 0x6a, 0x91, 0x5e, 0x60, 0x83, + 0x72, 0xb4, 0xcc, 0x9c, 0xa9, 0x97, 0x53, 0x75, 0xb0, 0xc6, 0x7a, 0x64, + 0xa9, 0x38, 0x3c, 0xb7, 0x52, 0xa4, 0x6e, 0x3b, 0xbd, 0x41, 0xaa, 0x6d, + 0x46, 0xa4, 0x8f, 0xb7, 0xce, 0x65, 0x73, 0x7e, 0x39, 0xce, 0xd8, 0x65, + 0x92, 0x77, 0x48, 0xc2, 0xb0, 0xd8, 0x50, 0x94, 0x71, 0x94, 0x74, 0xb0, + 0x7a, 0x8a, 0xb2, 0x54, 0x7f, 0xe5, 0x9d, 0xe3, 0xc1, 0x54, 0xb3, 0x52, + 0x9c, 0x2e, 0x9d, 0x8a, 0x83, 0xca, 0x65, 0x83, 0x57, 0xcb, 0x37, 0x63, + 0x65, 0x5f, 0xbe, 0x56, 0xbb, 0x8b, 0x74, 0x86, 0x7c, 0x6d, 0x83, 0x43, + 0xa2, 0xc5, 0x46, 0x4c, 0xa8, 0x54, 0x50, 0xb9, 0xbe, 0x93, 0x40, 0x59, + 0x5f, 0x34, 0x9d, 0xbf, 0x65, 0xd7, 0xc0, 0xbe, 0x46, 0xd2, 0x82, 0xad, + 0xb2, 0x40, 0xc3, 0x72, 0x5d, 0xbe, 0xa3, 0x89, 0xb4, 0x6c, 0x83, 0x38, + 0xbd, 0xcc, 0xc1, 0x8d, 0x52, 0x58, 0x2e, 0x6f, 0x7d, 0x33, 0x3d, 0x62, + 0x50, 0xb5, 0xc6, 0xb1, 0x6b, 0x41, 0x5e, 0xbe, 0x80, 0x74, 0x46, 0xc0, + 0x8e, 0x98, 0x79, 0x38, 0xc7, 0xcf, 0x75, 0xa5, 0x72, 0x49, 0xbc, 0xb9, + 0xbe, 0x88, 0x90, 0x3f, 0x3e, 0x3f, 0x9a, 0x3f, 0x6d, 0xd1, 0xc6, 0x51, + 0xb8, 0x9a, 0x74, 0xc2, 0x89, 0xd1, 0xa2, 0x9a, 0x57, 0xcb, 0x96, 0x6d, + 0x8a, 0xc2, 0xc0, 0x88, 0x45, 0x41, 0x30, 0xb1, 0x95, 0x53, 0xd2, 0x9c, + 0xc9, 0xc0, 0x82, 0xcc, 0x82, 0x82, 0x67, 0x3a, 0xbe, 0xc8, 0x76, 0x70, + 0x60, 0x67, 0xc0, 0xa7, 0x93, 0x6f, 0x63, 0x6e, 0x81, 0x9d, 0x9c, 0xba, + 0x5f, 0x7e, 0x81, 0x85, 0x89, 0x9a, 0x53, 0x9d, 0xb3, 0x69, 0x56, 0xab, + 
0x73, 0x8d, 0xd8, 0xd3, 0xaf, 0x90, 0xb8, 0x53, 0xb8, 0x62, 0x69, 0x37, + 0xae, 0x93, 0x6f, 0x5b, 0xc9, 0x4e, 0xaa, 0x4f, 0x8d, 0x4e, 0xd7, 0x90, + 0x6f, 0x3f, 0x56, 0x63, 0xb0, 0x46, 0x9a, 0x37, 0xb2, 0x65, 0x73, 0x7b, + 0xc8, 0x61, 0x5a, 0xaf, 0xab, 0xb6, 0xb9, 0x73, 0x81, 0x7c, 0xa7, 0xb1, + 0xae, 0xc6, 0x95, 0xac, 0xc6, 0x36, 0x6f, 0x72, 0x80, 0x97, 0xc1, 0xa2, + 0x5c, 0xaf, 0xc4, 0x6c, 0xa0, 0xd2, 0x61, 0x94, 0xcc, 0x7b, 0x46, 0x7a, + 0x89, 0x5c, 0x32, 0x70, 0xc8, 0xb8, 0xa0, 0x70, 0x77, 0x3e, 0xa6, 0xbd, + 0x50, 0x8e, 0x70, 0xc6, 0x94, 0xab, 0x79, 0xd0, 0xbb, 0x91, 0xb8, 0xb2, + 0xc2, 0xcf, 0xc9, 0x32, 0x35, 0x54, 0x4e, 0x4d, 0x30, 0x70, 0xc7, 0x37, + 0xb4, 0x49, 0x67, 0x4c, 0x84, 0x9c, 0x5d, 0x78, 0x5a, 0x7e, 0xad, 0xac, + 0x48, 0xa5, 0xc7, 0xbc, 0x39, 0xca, 0x3c, 0x8c, 0x38, 0x3d, 0x7f, 0xc4, + 0x7d, 0xca, 0x3d, 0xd6, 0x45, 0xa8, 0x96, 0x4c, 0x7a, 0x39, 0x69, 0x58, + 0xab, 0x9a, 0x96, 0x54, 0x49, 0xb7, 0x33, 0x8b, 0x5a, 0x50, 0x7e, 0x99, + 0x85, 0x3c, 0x9b, 0xc0, 0xae, 0x5c, 0xae, 0x62, 0x88, 0x6b, 0x3c, 0xb9, + 0x51, 0x5f, 0xa0, 0x33, 0x9a, 0xaf, 0xc3, 0x84, 0x87, 0x48, 0xa8, 0x49, + 0x8b, 0xc4, 0x6e, 0x67, 0xbb, 0x5c, 0x38, 0x57, 0x56, 0x64, 0x4a, 0xa3, + 0x8e, 0x73, 0xcd, 0x7b, 0x81, 0xbb, 0xd1, 0xca, 0x3b, 0x39, 0x6e, 0xd2, + 0x2e, 0xba, 0xd0, 0x38, 0x6d, 0x8c, 0xac, 0xd6, 0x3e, 0xd0, 0xcb, 0x88, + 0x74, 0x67, 0x5d, 0xc7, 0x4f, 0x78, 0x34, 0xa9, 0x8f, 0x6a, 0xa0, 0x58, + 0x68, 0xae, 0x3e, 0xaa, 0xaa, 0x40, 0x4f, 0x55, 0xa0, 0x40, 0xb0, 0x9b, + 0x61, 0x7e, 0x64, 0xc9, 0xb0, 0xc6, 0x80, 0xc1, 0x69, 0xb2, 0xaf, 0x5f, + 0x76, 0x36, 0x7b, 0xb7, 0xab, 0x66, 0x86, 0x84, 0x60, 0x69, 0xcc, 0xca, + 0xbe, 0xc2, 0x5d, 0x5f, 0x9a, 0x95, 0x51, 0x68, 0x92, 0x3c, 0xa0, 0xb8, + 0x68, 0xd3, 0x87, 0xa2, 0x6f, 0x72, 0xc6, 0x8c, 0x65, 0x88, 0x98, 0x5e, + 0x55, 0x6e, 0x69, 0x59, 0x42, 0xc7, 0xcd, 0x5b, 0x4d, 0xa0, 0x65, 0x76, + 0x83, 0x8c, 0xbe, 0x68, 0x42, 0x6a, 0x44, 0x39, 0x52, 0x54, 0xa2, 0x34, + 0xbb, 0x5d, 0xa5, 0x6d, 0x72, 0x39, 0xb7, 0x47, 0xba, 0xaa, 0x36, 0x43, + 0xc3, 0x93, 0x5a, 0x6e, 0xb5, 0x86, 0x9a, 0x7b, 0xc3, 0x92, 0x3d, 0xca, + 0x36, 0x69, 0x7c, 0x9f, 0xcc, 0x5f, 0xac, 0x7a, 0x32, 0x7b, 0xcd, 0x44, + 0x9d, 0x6d, 0x61, 0xbe, 0x40, 0x7c, 0xc9, 0x50, 0x99, 0x41, 0x96, 0x52, + 0xa5, 0x56, 0x59, 0x59, 0xbb, 0xc3, 0xc0, 0xcc, 0x4d, 0x3c, 0xbb, 0x99, + 0xba, 0xbe, 0x91, 0x69, 0x7e, 0xae, 0x35, 0xb6, 0x9d, 0x82, 0xca, 0x39, + 0xba, 0x7d, 0x94, 0x4d, 0x7e, 0x81, 0x43, 0x6d, 0xa0, 0xcf, 0xa4, 0xad, + 0xbf, 0xcd, 0x53, 0x8e, 0xd9, 0x76, 0xca, 0xbe, 0xd5, 0xad, 0x4e, 0xa1, + 0x3e, 0x71, 0x3a, 0x58, 0x3d, 0x4b, 0x57, 0xa5, 0xda, 0x41, 0x4b, 0xa0, + 0x96, 0x77, 0x89, 0x38, 0x83, 0x72, 0x37, 0x55, 0xb9, 0x99, 0x7f, 0x5f, + 0x66, 0xbe, 0x63, 0xbd, 0xc9, 0x57, 0x36, 0x9d, 0x81, 0xa0, 0xc0, 0x4c, + 0x6e, 0x42, 0xb9, 0xaa, 0x77, 0x7e, 0x53, 0x33, 0x97, 0x52, 0x3e, 0xb4, + 0xa4, 0x33, 0x61, 0x9a, 0x97, 0xa9, 0x4a, 0x76, 0x88, 0x73, 0x37, 0x58, + 0x8f, 0x80, 0x91, 0x66, 0xcc, 0x56, 0x53, 0x34, 0x85, 0xa7, 0xbc, 0xa2, + 0xb9, 0xc2, 0x88, 0x3e, 0xca, 0x97, 0xc4, 0xc0, 0x3a, 0xa9, 0x73, 0xa8, + 0x6e, 0x55, 0xcf, 0x6c, 0x3e, 0x89, 0x8b, 0x92, 0x97, 0x53, 0x56, 0x99, + 0xce, 0xa5, 0xcf, 0xb4, 0x82, 0xa3, 0x71, 0x39, 0x53, 0x84, 0xb0, 0x33, + 0xaf, 0x58, 0x93, 0xba, 0x9b, 0x34, 0x81, 0xb9, 0x7f, 0xab, 0x50, 0x77, + 0x70, 0x54, 0x62, 0x4b, 0x7d, 0x95, 0x94, 0x6f, 0x8c, 0x5b, 0x8f, 0x37, + 0x6f, 0x8a, 0xa3, 0x5a, 0x78, 0x81, 0x7b, 0x55, 0xaf, 0x78, 0x57, 0xa0, + 0x3c, 0xa9, 0x3a, 0xb0, 0xbc, 0xb0, 0x89, 0x6e, 0x9d, 0x91, 0x90, 0xa0, + 0xaf, 0x74, 0x57, 0x8f, 0x5a, 0xc5, 0x95, 0x98, 0x62, 0x80, 0xc3, 0xba, + 
0x3c, 0x7e, 0x81, 0x56, 0x7a, 0x89, 0x4d, 0x75, 0x44, 0xc9, 0xb2, 0x51, + 0x80, 0x9c, 0x37, 0x31, 0xa9, 0x32, 0x83, 0x81, 0x63, 0x4c, 0x47, 0x7a, + 0x97, 0x5a, 0x57, 0x8d, 0x7e, 0x5c, 0xca, 0x3f, 0x96, 0x9b, 0x63, 0x5c, + 0xaa, 0x3c, 0xc3, 0x3b, 0x3d, 0x9c, 0x7f, 0x75, 0x7b, 0x92, 0xaa, 0x68, + 0xa5, 0xc7, 0x5e, 0xb9, 0x4c, 0x43, 0x6b, 0xd7, 0x3a, 0x7f, 0x7a, 0x7a, + 0xa5, 0xcd, 0x57, 0x71, 0xa0, 0xa8, 0xa2, 0x64, 0x54, 0x5d, 0x88, 0xa3, + 0xb1, 0x91, 0x33, 0x7d, 0xc6, 0xcc, 0x82, 0x74, 0x8f, 0x60, 0x5a, 0x67, + 0x38, 0x53, 0x3f, 0xb2, 0x9c, 0xae, 0x62, 0xb3, 0xae, 0xbc, 0x6d, 0x7d, + 0x93, 0x55, 0x49, 0x44, 0x92, 0x7d, 0x66, 0xd8, 0x70, 0x6e, 0xd3, 0xb6, + 0x48, 0x5b, 0x75, 0x92, 0x9c, 0x95, 0xc4, 0x3a, 0x78, 0x54, 0x7d, 0x62, + 0xb6, 0x45, 0x54, 0x34, 0xa3, 0xa4, 0xc5, 0xac, 0xb9, 0x75, 0x49, 0x4f, + 0x3d, 0xd7, 0x9d, 0xa7, 0x5a, 0xbf, 0x83, 0x5a, 0xb8, 0x40, 0x4f, 0xaa, + 0x94, 0xbc, 0x3e, 0x3a, 0x73, 0x89, 0x68, 0x8d, 0xb4, 0x38, 0x3c, 0xb7, + 0x59, 0xd1, 0xd5, 0xa1, 0x6b, 0x4c, 0x75, 0x5e, 0x8e, 0x71, 0x7b, 0x57, + 0xa0, 0x68, 0x62, 0x6d, 0x5d, 0x5b, 0x73, 0x77, 0x86, 0xb7, 0x61, 0x51, + 0xb8, 0xa6, 0xc9, 0xe8, 0x85, 0x3d, 0xb8, 0x65, 0x73, 0xa3, 0x89, 0xcf, + 0xdc, 0x45, 0x91, 0x6a, 0xc7, 0xba, 0x77, 0xa1, 0xb1, 0x3b, 0x56, 0x62, + 0x7a, 0xa1, 0x51, 0x41, 0x3c, 0x9c, 0x79, 0x63, 0x3d, 0x44, 0x8d, 0x5b, + 0xc9, 0xae, 0x9d, 0x63, 0xb5, 0x77, 0xb6, 0x40, 0x94, 0x6d, 0x9b, 0xbd, + 0x99, 0x9a, 0xcf, 0x55, 0x64, 0x6d, 0xd5, 0x3e, 0x98, 0x57, 0xc6, 0xb8, + 0x92, 0x66, 0x39, 0x99, 0x6b, 0xc1, 0x6d, 0x9f, 0x62, 0xda, 0x66, 0x54, + 0x92, 0x86, 0x31, 0x68, 0x5f, 0x7b, 0x9f, 0x86, 0xa1, 0xbe, 0xbd, 0xb9, + 0xb9, 0x55, 0x50, 0x3f, 0x56, 0x68, 0xcd, 0x72, 0x60, 0x7d, 0xcc, 0x5c, + 0xb7, 0x7a, 0x61, 0xc3, 0x5f, 0x54, 0x8c, 0x73, 0xcb, 0x88, 0x81, 0x9b, + 0x39, 0x83, 0x8a, 0x9f, 0x56, 0x69, 0x68, 0x6e, 0x3a, 0xcd, 0xb2, 0x68, + 0x9a, 0x4e, 0x6a, 0x6f, 0xbc, 0x84, 0x6f, 0x3d, 0x64, 0x77, 0x44, 0x47, + 0x6f, 0x5f, 0x83, 0x3f, 0x61, 0xb2, 0x3a, 0xcc, 0xca, 0xd5, 0x47, 0xb7, + 0x92, 0xc4, 0x64, 0x57, 0x62, 0x79, 0x9e, 0x39, 0x9f, 0x88, 0xca, 0x45, + 0x67, 0x5c, 0x9d, 0x78, 0xbe, 0x6d, 0x69, 0x6e, 0x53, 0xa7, 0x41, 0xab, + 0xaa, 0x3a, 0x98, 0xb0, 0xc3, 0xa8, 0xa5, 0xa5, 0xcb, 0xc9, 0x76, 0x66, + 0xc7, 0x6c, 0xbf, 0xbb, 0xae, 0x9c, 0xc0, 0xc7, 0xd1, 0xcf, 0x9a, 0x2f, + 0x71, 0x9c, 0x6f, 0x71, 0x4c, 0x87, 0x6b, 0x29, 0xc0, 0x66, 0xab, 0x9a, + 0x58, 0xbb, 0x9a, 0x8d, 0x7e, 0x3b, 0xb7, 0x5b, 0xb0, 0xb6, 0x3e, 0x37, + 0xa9, 0x93, 0x81, 0x62, 0x87, 0x61, 0x6b, 0x9e, 0xb7, 0x38, 0xae, 0xb5, + 0x76, 0x38, 0x49, 0x75, 0x72, 0x4c, 0xca, 0x86, 0x97, 0x96, 0x93, 0x36, + 0x6f, 0x79, 0x55, 0x66, 0xaa, 0x7e, 0x7a, 0xc6, 0xc1, 0x8e, 0xc4, 0xcc, + 0x72, 0x6e, 0x58, 0x62, 0xc7, 0x85, 0x79, 0x65, 0x5e, 0x7d, 0xce, 0x34, + 0x33, 0xa6, 0xd6, 0xbc, 0x8b, 0x59, 0x8f, 0x9b, 0x5f, 0xc6, 0x9e, 0xa4, + 0x8d, 0x70, 0x98, 0x56, 0x68, 0xa1, 0x9c, 0x4b, 0xb3, 0xb2, 0x9f, 0xca, + 0x90, 0xbd, 0x74, 0x55, 0x43, 0xab, 0x8a, 0xb2, 0xb6, 0xa4, 0x58, 0x3e, + 0xa9, 0xad, 0xd6, 0xa1, 0x88, 0x6b, 0x94, 0x7a, 0x98, 0xcc, 0xba, 0x59, + 0x80, 0x9f, 0xa2, 0x9f, 0x78, 0xa4, 0x82, 0x87, 0xcc, 0xaa, 0x2e, 0x8a, + 0x90, 0x9c, 0x39, 0xa8, 0xbd, 0xcd, 0xa6, 0x47, 0x3d, 0x74, 0x90, 0xa5, + 0x42, 0x3d, 0x56, 0xab, 0xb3, 0xbf, 0xa5, 0xb6, 0x5a, 0x4e, 0x7f, 0x65, + 0xad, 0xcb, 0x93, 0x5f, 0x90, 0x75, 0x7a, 0x61, 0x5d, 0x5b, 0xb1, 0xaa, + 0xcb, 0x53, 0x79, 0x7f, 0xd7, 0x7b, 0x81, 0x56, 0x3c, 0x3c, 0x3a, 0xc6, + 0x4c, 0x97, 0x6c, 0x84, 0x50, 0x30, 0xa9, 0xc5, 0x5c, 0x93, 0xc5, 0x9c, + 0xd1, 0x4c, 0x6b, 0xad, 0xb5, 0x6f, 0xa0, 0x46, 0x62, 0x82, 0x58, 0x95, + 
0x63, 0xae, 0x41, 0x78, 0xaf, 0xb6, 0x9e, 0x8a, 0xb2, 0xae, 0xaf, 0x3e, + 0xcb, 0x97, 0x87, 0x95, 0xce, 0x48, 0x5f, 0xc8, 0x9a, 0x37, 0xa2, 0x38, + 0x8a, 0x89, 0x48, 0x72, 0x63, 0x77, 0x56, 0x33, 0xb9, 0xd3, 0xaa, 0x57, + 0xa2, 0xcd, 0xaa, 0xb6, 0x35, 0x87, 0xb9, 0xa5, 0x9c, 0x5e, 0xd1, 0x4e, + 0x4c, 0x45, 0xd2, 0xbc, 0x41, 0xd9, 0xd6, 0x68, 0x8d, 0xc7, 0x5f, 0x7b, + 0x84, 0x73, 0x81, 0x77, 0xaf, 0x78, 0xa8, 0xd0, 0x93, 0x39, 0x81, 0xbd, + 0xa5, 0x37, 0x6f, 0xc3, 0x4f, 0xb5, 0x71, 0xc1, 0xc4, 0xd7, 0xdc, 0xbc, + 0x8f, 0x95, 0x83, 0x5f, 0xc8, 0x65, 0xb9, 0x6f, 0x98, 0xbe, 0x5f, 0x8f, + 0x49, 0x4d, 0xc6, 0x9e, 0x7e, 0xbd, 0x98, 0x37, 0xb2, 0x5b, 0x48, 0x47, + 0x41, 0xc3, 0x50, 0xb9, 0x70, 0x9f, 0xbc, 0x70, 0x84, 0x63, 0xc6, 0x82, + 0x9c, 0xcd, 0x82, 0xb1, 0x98, 0x41, 0x5d, 0xb6, 0xc8, 0x95, 0x55, 0xac, + 0x85, 0x68, 0xa2, 0x62, 0x40, 0x6b, 0x81, 0x5e, 0xba, 0x95, 0x5f, 0x98, + 0xd3, 0x60, 0x31, 0x9d, 0xce, 0xe1, 0x87, 0xae, 0x72, 0xda, 0x37, 0xce, + 0xc0, 0x6f, 0x44, 0x4b, 0x8f, 0x9c, 0xdd, 0x41, 0xcf, 0x79, 0x66, 0x65, + 0x53, 0x61, 0x8f, 0x81, 0xaf, 0x5b, 0xae, 0x3e, 0x3e, 0xcc, 0xd1, 0x67, + 0x49, 0xca, 0xa3, 0x6f, 0x32, 0x59, 0x88, 0xb9, 0x94, 0x7a, 0x80, 0x54, + 0x5a, 0x80, 0x73, 0x5b, 0xa8, 0x9e, 0x4a, 0xce, 0x7b, 0xab, 0xca, 0x98, + 0x6e, 0x63, 0x38, 0xaa, 0xb4, 0xc2, 0xcf, 0xc5, 0x9d, 0xa6, 0x6c, 0x76, + 0x4a, 0xc5, 0x5c, 0x34, 0x86, 0x61, 0x5d, 0xdd, 0x52, 0x5c, 0x8d, 0x70, + 0x98, 0x6c, 0x70, 0xc2, 0xa0, 0x7a, 0x69, 0xb3, 0x3b, 0x5a, 0x87, 0xaa, + 0x7e, 0x75, 0x5e, 0x5a, 0xaf, 0xbe, 0xcc, 0xa6, 0x31, 0x97, 0x79, 0x6d, + 0xbb, 0x78, 0xad, 0x9d, 0x86, 0x59, 0x60, 0x59, 0x50, 0x6b, 0xaf, 0x48, + 0x87, 0x29, 0x85, 0x61, 0xc8, 0xa8, 0x53, 0x61, 0x6e, 0x6f, 0xc7, 0xd3, + 0xc2, 0x5a, 0xa9, 0x93, 0x62, 0xb2, 0x5f, 0x71, 0x8e, 0x60, 0xbb, 0x78, + 0x7a, 0x3f, 0x4a, 0xa9, 0xac, 0x52, 0x4b, 0x9b, 0x4c, 0x7e, 0x4d, 0x88, + 0x50, 0x8e, 0x96, 0xb7, 0x85, 0x3c, 0x7a, 0x37, 0xae, 0x5f, 0xc3, 0x38, + 0x34, 0xa7, 0x5c, 0x38, 0xa1, 0x57, 0x54, 0x78, 0x8a, 0x4e, 0x64, 0xce, + 0xa5, 0x45, 0x82, 0xab, 0x75, 0xb3, 0x78, 0x3f, 0x7b, 0x3e, 0xac, 0xba, + 0x55, 0x3d, 0xb1, 0x45, 0x86, 0xaa, 0x7f, 0x79, 0xac, 0xa4, 0x9b, 0x7c, + 0x3a, 0x38, 0xcb, 0x3f, 0x65, 0xb7, 0x50, 0xaf, 0x82, 0x7b, 0x42, 0xa7, + 0xc6, 0x97, 0x78, 0xa4, 0x31, 0x6b, 0x5e, 0xcd, 0xa4, 0x5a, 0x33, 0xc8, + 0xa0, 0x3e, 0x75, 0x68, 0x41, 0x7e, 0xb5, 0xac, 0x5d, 0x7e, 0x8f, 0xac, + 0xc5, 0x44, 0xd3, 0x58, 0x9c, 0xcb, 0x6a, 0xc0, 0x74, 0x72, 0xd0, 0x73, + 0xa6, 0x48, 0x54, 0x68, 0xbe, 0xae, 0xb7, 0x3e, 0x90, 0x96, 0x79, 0x45, + 0x4a, 0x67, 0x5a, 0x65, 0x3f, 0x9b, 0x9d, 0xab, 0x7c, 0x34, 0x38, 0xac, + 0x48, 0x52, 0x4c, 0x8f, 0xbb, 0x84, 0xc5, 0xa4, 0x52, 0xd3, 0xa0, 0x9b, + 0x73, 0x87, 0x37, 0xbb, 0x48, 0x7e, 0xa9, 0x93, 0xb4, 0x5d, 0x37, 0x88, + 0x30, 0x3d, 0xa2, 0x96, 0x2e, 0xbd, 0x82, 0xa5, 0xbe, 0x97, 0xb6, 0x75, + 0x48, 0xc3, 0x46, 0x3b, 0xbf, 0x89, 0xa4, 0xc5, 0x31, 0xa3, 0xc6, 0x94, + 0x63, 0x87, 0x6a, 0x94, 0xad, 0xcc, 0xca, 0x68, 0x6a, 0x78, 0x3c, 0xa8, + 0x8a, 0x48, 0x8d, 0xb9, 0xca, 0x88, 0x70, 0x8c, 0xca, 0x43, 0x68, 0x76, + 0xd2, 0x6d, 0x8e, 0xa6, 0x9a, 0xb7, 0x3b, 0x68, 0x5d, 0x3c, 0x5a, 0x85, + 0x40, 0x67, 0xae, 0xd5, 0x51, 0x88, 0x56, 0x87, 0x56, 0x9e, 0x8d, 0x43, + 0xd4, 0x64, 0x58, 0x78, 0x9a, 0x3a, 0x9c, 0x52, 0xa7, 0x63, 0x8f, 0x58, + 0x55, 0x6a, 0x98, 0xa4, 0xcc, 0x8e, 0x83, 0xab, 0x67, 0x46, 0x5f, 0x8e, + 0x91, 0xb7, 0xb7, 0x68, 0xbf, 0xb1, 0xbd, 0x80, 0x82, 0xa2, 0x91, 0x48, + 0x84, 0x46, 0xb7, 0xa2, 0x47, 0x62, 0xc4, 0x83, 0xc6, 0x3f, 0x8a, 0xd2, + 0x86, 0x9f, 0x88, 0x92, 0x46, 0x6b, 0x8f, 0x4a, 0x37, 0xa0, 0x9f, 0x33, + 
0x8a, 0x88, 0xa0, 0xb5, 0x50, 0x57, 0x9b, 0x60, 0x6d, 0x6f, 0x58, 0xbb, + 0x7f, 0x8e, 0xa8, 0x93, 0x92, 0xa8, 0x90, 0x85, 0xba, 0x5d, 0xb9, 0x86, + 0x98, 0x62, 0xc4, 0x6a, 0x4e, 0xa3, 0xcb, 0x39, 0x72, 0x45, 0x99, 0xbf, + 0xa1, 0x9a, 0xb9, 0x96, 0xaf, 0xc5, 0x87, 0x78, 0x83, 0x3d, 0x50, 0xb8, + 0xc4, 0x3a, 0x68, 0x5d, 0xbe, 0x3d, 0x72, 0x38, 0x2a, 0x82, 0x61, 0x6d, + 0x95, 0x4f, 0x50, 0x32, 0x6c, 0x62, 0x9f, 0x4a, 0x9c, 0x7b, 0x6d, 0x55, + 0x7f, 0xcf, 0xb6, 0x81, 0x34, 0x41, 0xc6, 0x30, 0x84, 0x93, 0x6d, 0x8b, + 0x39, 0xc5, 0xa3, 0xc8, 0xcd, 0x31, 0x94, 0xd0, 0x40, 0x49, 0x7c, 0x4b, + 0x3a, 0x46, 0x49, 0x6e, 0x40, 0xac, 0x9d, 0x94, 0x60, 0xb7, 0x39, 0xb6, + 0x3e, 0x51, 0x40, 0x44, 0x67, 0x58, 0x85, 0x47, 0x68, 0xdb, 0x8d, 0x49, + 0x2d, 0xc8, 0x56, 0x8d, 0x36, 0x96, 0xc2, 0xb4, 0x8b, 0x51, 0x45, 0x89, + 0x81, 0xc4, 0xc8, 0x5b, 0x62, 0x68, 0xc8, 0x8d, 0xbf, 0xa8, 0x9f, 0x4d, + 0x8e, 0xc1, 0x7f, 0x4a, 0xa1, 0xa2, 0x6d, 0x40, 0x46, 0xd9, 0xb8, 0x5c, + 0x97, 0xd0, 0x98, 0x2f, 0x38, 0xbb, 0x85, 0xa6, 0x8e, 0x71, 0x47, 0x90, + 0xae, 0x8b, 0xa8, 0x4d, 0x4e, 0x86, 0x9d, 0xc0, 0x6c, 0x88, 0x9c, 0x3d, + 0x6d, 0x75, 0x91, 0x88, 0xad, 0x9a, 0x6d, 0x7b, 0x3f, 0xba, 0x9e, 0xcb, + 0x5d, 0x91, 0xc1, 0xd2, 0xb8, 0x92, 0x54, 0xbf, 0x37, 0xa5, 0x37, 0x7e, + 0x30, 0x5b, 0x51, 0xa1, 0xbf, 0x97, 0xd1, 0x44, 0x5d, 0x74, 0xbd, 0x64, + 0xaf, 0x53, 0x90, 0x8f, 0x3f, 0x47, 0x91, 0x8d, 0x97, 0x83, 0x7a, 0x87, + 0x77, 0x57, 0xc3, 0xa0, 0x73, 0x78, 0x6b, 0x6b, 0x35, 0xcf, 0xab, 0x72, + 0xcb, 0xd9, 0x84, 0x6b, 0xb2, 0x49, 0x9d, 0x75, 0xba, 0xc6, 0xa2, 0x3e, + 0x2e, 0x30, 0xa0, 0x6d, 0x98, 0x60, 0x88, 0x59, 0x8e, 0x8c, 0x42, 0xcb, + 0xa5, 0x5a, 0x4e, 0x4f, 0xb5, 0x97, 0x95, 0x62, 0x79, 0x76, 0x3c, 0x40, + 0xa3, 0xaa, 0xd1, 0x59, 0xb6, 0x4e, 0x48, 0x81, 0xcd, 0x28, 0x7f, 0x65, + 0xc2, 0x84, 0x93, 0x4c, 0xa3, 0xbc, 0x8f, 0x79, 0xb5, 0x51, 0x57, 0xac, + 0xb8, 0xb2, 0xb8, 0x4a, 0xcf, 0xc7, 0xcc, 0x62, 0x52, 0x4d, 0x92, 0x65, + 0xb6, 0x93, 0x62, 0x71, 0x59, 0xbe, 0x46, 0xa8, 0x4d, 0x4d, 0xb4, 0x64, + 0x36, 0x6e, 0x95, 0xc5, 0x5d, 0x66, 0x6b, 0xb6, 0x39, 0xc2, 0xbc, 0x86, + 0x38, 0x50, 0x4c, 0x9e, 0x9b, 0x43, 0xa5, 0x8c, 0x88, 0x63, 0x36, 0xb6, + 0xc6, 0xa0, 0x6b, 0x67, 0x29, 0x5c, 0x4a, 0x79, 0x6e, 0xcc, 0x56, 0x9d, + 0x6a, 0x4b, 0xa3, 0x4b, 0x96, 0x42, 0x6e, 0xba, 0xc6, 0x37, 0xbd, 0xaf, + 0x65, 0xac, 0xce, 0x87, 0x9b, 0xcc, 0x37, 0x79, 0x63, 0xd1, 0x36, 0x9f, + 0x65, 0x82, 0xb7, 0x57, 0x9c, 0xd5, 0x8c, 0x88, 0x71, 0x37, 0xb9, 0x46, + 0x5a, 0x55, 0x7c, 0x50, 0xc5, 0xaa, 0x94, 0x99, 0x3e, 0x8a, 0x6a, 0x61, + 0x76, 0x97, 0x2d, 0x68, 0x53, 0x5b, 0x87, 0x84, 0x54, 0x47, 0x6a, 0x3f, + 0x51, 0x92, 0x7d, 0x86, 0x51, 0x63, 0x5d, 0x85, 0x78, 0x3c, 0x56, 0x99, + 0xa2, 0xb7, 0x6e, 0x34, 0x41, 0xcd, 0x89, 0x4b, 0xca, 0x81, 0x99, 0xa3, + 0x9a, 0x46, 0x7c, 0x99, 0x8a, 0xae, 0xc1, 0xc8, 0xa2, 0x92, 0x61, 0x8c, + 0xa9, 0x77, 0x4d, 0x93, 0xb6, 0xb2, 0x69, 0x74, 0x5c, 0x7a, 0x68, 0x39, + 0x84, 0x64, 0x9a, 0x95, 0x52, 0x8e, 0x56, 0x3d, 0xe7, 0x58, 0x85, 0x50, + 0xbe, 0x44, 0x8e, 0xc4, 0xab, 0xc1, 0xa6, 0x5e, 0xc6, 0x63, 0x5a, 0x4a, + 0xaf, 0x57, 0xac, 0x69, 0x63, 0x7d, 0x67, 0x9e, 0x88, 0x8b, 0x3a, 0xb7, + 0xd3, 0x3b, 0xb8, 0x5a, 0x32, 0x8e, 0x90, 0x89, 0x67, 0x6e, 0x82, 0x3c, + 0xc4, 0xd1, 0x49, 0xb6, 0x36, 0xa7, 0x55, 0xa2, 0x42, 0x77, 0xbd, 0x76, + 0xab, 0x35, 0x9d, 0xa6, 0xb8, 0xcb, 0xa4, 0x78, 0xb5, 0x68, 0xa7, 0x5b, + 0x58, 0x39, 0x67, 0x50, 0xb5, 0xa6, 0xb8, 0xb0, 0x94, 0x88, 0x41, 0x87, + 0x94, 0xb1, 0x45, 0x73, 0x46, 0x3c, 0x3f, 0x9a, 0x96, 0x9b, 0xb8, 0x49, + 0x92, 0x51, 0x81, 0x97, 0x84, 0xaa, 0x7b, 0xa0, 0xb9, 0x36, 0x9e, 0x7f, + 
0xaf, 0xc1, 0x7e, 0x6e, 0x77, 0x69, 0x56, 0x3d, 0x58, 0x38, 0xba, 0x4d, + 0x6d, 0x40, 0x49, 0x7b, 0x41, 0x32, 0xad, 0xa1, 0x98, 0x56, 0xb9, 0x81, + 0xac, 0x81, 0x41, 0x34, 0x34, 0x63, 0x3d, 0x48, 0xb0, 0x42, 0xa6, 0x74, + 0x8f, 0x40, 0x76, 0x3d, 0x8a, 0x9f, 0xc7, 0xcd, 0x55, 0xbd, 0xa2, 0x5d, + 0x67, 0xba, 0x66, 0xb1, 0xc4, 0x7b, 0x67, 0xb2, 0x3f, 0x74, 0x9f, 0x86, + 0x74, 0xb0, 0x3a, 0xd0, 0x53, 0x86, 0x4c, 0x91, 0x9f, 0x43, 0x7d, 0x37, + 0xab, 0x80, 0xba, 0xcc, 0x66, 0xa7, 0x70, 0x3f, 0x73, 0xcb, 0x39, 0xb6, + 0xa6, 0xc1, 0x39, 0xbe, 0x35, 0xb8, 0x52, 0x65, 0x7d, 0x32, 0x55, 0xc6, + 0xc5, 0xcd, 0x7d, 0xcd, 0x3b, 0x76, 0x6d, 0xb8, 0xbc, 0x3e, 0xc0, 0x51, + 0x4a, 0x92, 0xc1, 0xb2, 0x9d, 0x42, 0x83, 0xc2, 0x7a, 0xc8, 0x73, 0x65, + 0xc7, 0xc3, 0xc1, 0x4d, 0x46, 0x4b, 0xb4, 0x36, 0x79, 0xcd, 0xba, 0xa4, + 0x71, 0x6b, 0x68, 0x6e, 0x73, 0xbf, 0xa6, 0x95, 0x42, 0x93, 0x77, 0xc9, + 0x4a, 0x92, 0xb4, 0xca, 0xa3, 0x86, 0x44, 0x77, 0x75, 0xc8, 0xd0, 0x90, + 0x55, 0x69, 0xd0, 0xa2, 0xa5, 0x44, 0x75, 0xc5, 0xc2, 0x48, 0x70, 0x6e, + 0x7c, 0x93, 0x4d, 0xa8, 0x85, 0x60, 0xc5, 0xc0, 0x42, 0xcc, 0xb8, 0x69, + 0x82, 0x9e, 0x42, 0x62, 0x61, 0x55, 0xa2, 0x91, 0x36, 0x8c, 0x34, 0xc3, + 0x78, 0x3e, 0x85, 0x57, 0x63, 0x93, 0x38, 0x6c, 0x4f, 0x3e, 0x91, 0x6e, + 0x33, 0x45, 0x7f, 0x36, 0xcb, 0x61, 0x9b, 0xd0, 0xd2, 0x6c, 0xbb, 0xa1, + 0xa0, 0x51, 0x4d, 0xc2, 0x92, 0x7d, 0xbf, 0x3d, 0x90, 0x56, 0xc4, 0xc5, + 0x88, 0xce, 0xb8, 0x7b, 0x78, 0x3e, 0xc8, 0xbe, 0xb6, 0x77, 0x33, 0x8e, + 0x51, 0xaa, 0x60, 0x63, 0x95, 0xcb, 0xb5, 0x30, 0x66, 0xd6, 0xac, 0xd3, + 0x79, 0x7c, 0x62, 0x43, 0xc8, 0x6f, 0xb7, 0xa6, 0xb7, 0x53, 0x9b, 0x4c, + 0xd3, 0x62, 0xb9, 0xc1, 0x36, 0xce, 0x5a, 0xa2, 0x8f, 0x32, 0x4a, 0x92, + 0x3a, 0x94, 0x94, 0x53, 0x6c, 0x35, 0x7a, 0x48, 0x77, 0x5a, 0xa4, 0x75, + 0x82, 0x45, 0xd3, 0x99, 0x6a, 0x39, 0x3b, 0xb8, 0x60, 0xb5, 0xaf, 0x74, + 0x76, 0x66, 0xa1, 0xaf, 0x83, 0x64, 0x5d, 0xb5, 0x7b, 0xbe, 0x3a, 0x84, + 0x9e, 0xa9, 0xbf, 0xae, 0xb6, 0x46, 0x67, 0x4a, 0xae, 0x9f, 0xae, 0xd0, + 0x4b, 0x58, 0x3e, 0x54, 0x8b, 0x54, 0xcc, 0x5c, 0x6e, 0xa3, 0x41, 0xcf, + 0x46, 0xd1, 0x6e, 0x57, 0xbb, 0x47, 0xa4, 0x37, 0x33, 0x7d, 0x3c, 0xb5, + 0xd2, 0x50, 0xb0, 0x94, 0xc2, 0x52, 0x75, 0x80, 0xce, 0x6b, 0xc5, 0xbf, + 0x7a, 0xb8, 0x63, 0xb8, 0x3a, 0x46, 0x63, 0x7a, 0x73, 0x55, 0x74, 0xb2, + 0x64, 0x55, 0x72, 0x8d, 0x83, 0x64, 0xd1, 0x80, 0x6c, 0x87, 0xc0, 0xc1, + 0x3e, 0xa3, 0xca, 0x42, 0x45, 0x67, 0x98, 0xa3, 0xd2, 0x58, 0x3e, 0x80, + 0x69, 0x39, 0x62, 0x3e, 0x9b, 0x36, 0xa9, 0x9f, 0xb8, 0x70, 0x7a, 0x74, + 0xa1, 0x54, 0x6b, 0x89, 0xbf, 0x53, 0x32, 0xc0, 0x5f, 0x5e, 0x92, 0x86, + 0x8b, 0xb5, 0xc3, 0x49, 0xa5, 0xb2, 0x4b, 0x37, 0xa6, 0xc2, 0x91, 0xcc, + 0x4a, 0x9b, 0xbc, 0xb4, 0xaa, 0x58, 0xa7, 0x6e, 0x8e, 0x88, 0x8f, 0xa7, + 0x84, 0x4f, 0x91, 0x4d, 0x6e, 0xd0, 0xa5, 0xd2, 0xce, 0x37, 0xaa, 0xb2, + 0xd3, 0xb7, 0x46, 0x90, 0x68, 0x88, 0x7a, 0x4a, 0xcb, 0x56, 0x81, 0x4b, + 0xa3, 0x8b, 0x9e, 0x86, 0xad, 0x58, 0xc4, 0x67, 0x53, 0xb8, 0xa2, 0x82, + 0xb7, 0x85, 0x64, 0x55, 0xcd, 0x8d, 0xbe, 0x87, 0x81, 0xcb, 0x6d, 0xbf, + 0x66, 0x51, 0x7f, 0x9c, 0x59, 0x4f, 0xbd, 0x4f, 0x45, 0x3a, 0xbe, 0x55, + 0x9a, 0x69, 0x85, 0x7b, 0x57, 0xc0, 0xbc, 0x4d, 0x72, 0xca, 0x4b, 0x40, + 0xa0, 0x6f, 0xab, 0x82, 0x6d, 0x76, 0x4f, 0xac, 0x3a, 0x46, 0xc5, 0x9f, + 0xb8, 0x7e, 0xbe, 0x57, 0x38, 0x64, 0x78, 0x5a, 0x3a, 0x40, 0xbe, 0xb8, + 0xaa, 0xbc, 0x70, 0xb2, 0x3a, 0xc7, 0x98, 0x60, 0xb4, 0xb3, 0x80, 0xcd, + 0x42, 0x9c, 0x34, 0xb3, 0x52, 0xc4, 0x7e, 0x54, 0x64, 0x6d, 0x52, 0x69, + 0x75, 0x87, 0x3e, 0x39, 0x7c, 0x8f, 0x59, 0xaf, 0x66, 0x40, 0xbf, 0x60, + 
0xbb, 0x98, 0x78, 0x86, 0xb6, 0x77, 0x60, 0x4f, 0x41, 0x34, 0x9a, 0xd3, + 0x36, 0x4e, 0x93, 0x5a, 0xb0, 0x8e, 0x56, 0x56, 0x39, 0x39, 0xb0, 0xcf, + 0x68, 0x51, 0xab, 0x7a, 0x50, 0x31, 0x55, 0x4d, 0xc7, 0x36, 0x82, 0x5a, + 0x3d, 0x52, 0x5c, 0x4e, 0x3d, 0x55, 0x43, 0x6a, 0x48, 0x8e, 0xd0, 0x59, + 0x47, 0x8e, 0x75, 0x64, 0x61, 0x6d, 0x39, 0xac, 0x86, 0x52, 0xc7, 0x98, + 0x3e, 0xb1, 0x49, 0x7f, 0x94, 0x5f, 0x60, 0xc7, 0x9e, 0x98, 0x5d, 0xa8, + 0x9d, 0x6a, 0x72, 0xa1, 0xa2, 0x4e, 0xad, 0x33, 0x75, 0x76, 0x39, 0xce, + 0xb1, 0xcc, 0xa1, 0xc5, 0x57, 0x69, 0x34, 0x38, 0xa2, 0x59, 0x7e, 0xa4, + 0x32, 0xd3, 0x91, 0x9b, 0x60, 0x69, 0xaa, 0x48, 0xb2, 0xbb, 0x83, 0x71, + 0xa4, 0x49, 0x45, 0x40, 0x66, 0x86, 0xcc, 0x48, 0xbe, 0x32, 0x83, 0xcd, + 0xb1, 0xb0, 0xbd, 0xb4, 0xd2, 0x5f, 0x50, 0xcc, 0xa5, 0x42, 0x7a, 0x97, + 0x49, 0xae, 0x62, 0x9f, 0x88, 0x3b, 0xbd, 0x7e, 0x45, 0x9d, 0x8d, 0xbf, + 0xc4, 0xd0, 0x68, 0xa4, 0x64, 0x56, 0x85, 0x9f, 0x4f, 0x80, 0x51, 0xd0, + 0xbb, 0x3c, 0x63, 0x36, 0xc3, 0x58, 0xc3, 0x6a, 0x41, 0xc6, 0x79, 0x51, + 0x38, 0x64, 0x7d, 0x93, 0xc5, 0x4d, 0x57, 0x9d, 0x7b, 0x8a, 0xc2, 0x32, + 0xc8, 0x4c, 0x67, 0xb4, 0x3b, 0xaf, 0x42, 0x4b, 0xb6, 0xba, 0x83, 0xc7, + 0x52, 0xbe, 0x3e, 0xc6, 0x76, 0x44, 0x4d, 0x41, 0x6b, 0xaa, 0x99, 0x4d, + 0x9e, 0x45, 0xb0, 0x7f, 0x8f, 0x5d, 0xbf, 0xbc, 0x83, 0x5a, 0x4d, 0xaa, + 0xbf, 0x7f, 0xa0, 0x75, 0xbc, 0x5f, 0x98, 0x4d, 0x64, 0x39, 0x4f, 0xb3, + 0x96, 0xc3, 0x71, 0x71, 0x4c, 0x5f, 0x9a, 0xd0, 0xc7, 0x67, 0xa2, 0xcd, + 0x96, 0xb8, 0x5f, 0x5b, 0x76, 0x49, 0x64, 0x4c, 0x6a, 0x5b, 0x9a, 0x37, + 0xaf, 0x68, 0xc6, 0x45, 0x52, 0x93, 0xaa, 0x7a, 0xc3, 0x8a, 0x5d, 0x9b, + 0x90, 0xc8, 0x73, 0x6a, 0x39, 0xbc, 0x5b, 0xad, 0x89, 0x30, 0x95, 0xbc, + 0x3e, 0x92, 0x77, 0xb8, 0x34, 0x60, 0x9a, 0x55, 0x9e, 0x58, 0x88, 0x90, + 0x66, 0xaf, 0x98, 0x75, 0x39, 0x6a, 0x4a, 0x72, 0xc9, 0x5e, 0x86, 0x7f, + 0xab, 0xae, 0x6b, 0xa5, 0xae, 0xce, 0x9c, 0x67, 0xbc, 0xbc, 0x41, 0x51, + 0x6c, 0x39, 0x9a, 0x9b, 0x8b, 0xa9, 0x61, 0x3c, 0x47, 0xac, 0x7e, 0x85, + 0x80, 0xb7, 0xc9, 0xd2, 0x35, 0x5d, 0xaa, 0xac, 0xaf, 0x40, 0xb3, 0xc6, + 0xb9, 0x52, 0xbf, 0x7f, 0x75, 0xc8, 0xaf, 0x87, 0x66, 0x92, 0x41, 0x36, + 0xb7, 0x42, 0x34, 0x6b, 0xd4, 0xb3, 0xd4, 0x94, 0x30, 0xd1, 0xc8, 0x82, + 0x6c, 0x7c, 0x69, 0xb8, 0xca, 0x97, 0x83, 0xcf, 0x65, 0x41, 0x45, 0xc8, + 0x9e, 0x70, 0x4c, 0x97, 0xbb, 0x57, 0x57, 0x34, 0x78, 0x95, 0x4c, 0x79, + 0x8a, 0xb4, 0xc2, 0x6b, 0x6a, 0xcb, 0xd1, 0x5e, 0xb1, 0x7e, 0x8e, 0x97, + 0x3b, 0x3f, 0xa3, 0xbd, 0x49, 0x3f, 0x56, 0x96, 0x3c, 0xa7, 0xce, 0x79, + 0x41, 0x42, 0xc4, 0x48, 0x50, 0x32, 0x76, 0x71, 0xb6, 0xb9, 0x97, 0x89, + 0xc1, 0x61, 0xc5, 0xb7, 0x36, 0x47, 0x5f, 0x68, 0x83, 0x73, 0x6e, 0x91, + 0xb1, 0xc7, 0x7c, 0xb9, 0x65, 0x8a, 0x76, 0xd0, 0x95, 0x8d, 0x5b, 0xcb, + 0xb4, 0x72, 0x65, 0xb1, 0x38, 0x7c, 0xb6, 0xc6, 0xa8, 0xa4, 0x57, 0xaf, + 0x9c, 0x5e, 0xae, 0xbc, 0xc7, 0xaa, 0x50, 0xd2, 0x99, 0x7f, 0xa7, 0x8a, + 0xce, 0x56, 0xba, 0x53, 0x99, 0x71, 0xc8, 0x5e, 0xa2, 0x58, 0x7b, 0x64, + 0x7a, 0xc2, 0xa6, 0x88, 0xc9, 0x8d, 0x96, 0x58, 0xc3, 0xcf, 0x90, 0xb7, + 0xb2, 0x72, 0x47, 0x33, 0x71, 0x6b, 0xd0, 0xc9, 0x8c, 0xb6, 0x92, 0xca, + 0xc7, 0x33, 0xa8, 0x6d, 0x6d, 0xa4, 0x53, 0x73, 0x52, 0x4b, 0x69, 0x52, + 0x9e, 0x4e, 0x85, 0xb2, 0x8c, 0x9f, 0x95, 0xc9, 0x41, 0x94, 0x5d, 0xc3, + 0x8b, 0x87, 0x52, 0xcc, 0x68, 0xca, 0x88, 0x3e, 0x3b, 0x71, 0x5d, 0xa3, + 0x71, 0x48, 0x2f, 0xa3, 0xd0, 0x87, 0x53, 0x9a, 0xaf, 0x82, 0x4b, 0x8e, + 0xc5, 0x89, 0xc1, 0x51, 0x35, 0x6b, 0x79, 0x4b, 0x48, 0x44, 0x96, 0x71, + 0x54, 0x58, 0x92, 0x97, 0x35, 0x35, 0x6a, 0x8b, 0x95, 0x33, 0x53, 0x41, + 
0x65, 0x9d, 0x7a, 0xce, 0xd2, 0x6c, 0xaf, 0xcf, 0x93, 0xb1, 0x68, 0xb5, + 0x33, 0x46, 0x88, 0x98, 0xce, 0x57, 0x9c, 0xc6, 0x4e, 0x8d, 0xbd, 0xc9, + 0x7f, 0x90, 0x42, 0x44, 0x98, 0xbb, 0xb0, 0x65, 0xb5, 0xa7, 0x96, 0xb3, + 0xb5, 0x76, 0x2d, 0x39, 0x70, 0xa0, 0xb8, 0xbf, 0x3d, 0x6d, 0x9e, 0x67, + 0x35, 0xc1, 0x74, 0xa2, 0x91, 0x48, 0x46, 0x4b, 0x6a, 0x9d, 0x46, 0x49, + 0x70, 0x4f, 0x2e, 0xa8, 0x56, 0xa6, 0x68, 0x3f, 0x51, 0xb0, 0x36, 0x7d, + 0x38, 0xa9, 0x7e, 0x4a, 0xb3, 0x77, 0x3e, 0x5d, 0x68, 0xa7, 0x60, 0x74, + 0x3d, 0xc9, 0xa3, 0x76, 0x52, 0x8e, 0x5f, 0x95, 0xc3, 0x81, 0x4d, 0x69, + 0xbe, 0x53, 0x5d, 0x4f, 0x3d, 0x93, 0x3a, 0x59, 0x4c, 0xce, 0xa6, 0x68, + 0x8c, 0x6c, 0xbd, 0x99, 0x39, 0x47, 0x52, 0xb4, 0x4a, 0x61, 0xc3, 0x9f, + 0x3d, 0x4f, 0x3a, 0xbf, 0x37, 0xb8, 0x94, 0xa0, 0xc6, 0xb6, 0xd1, 0xb9, + 0x84, 0x3a, 0x93, 0xae, 0xb4, 0x8f, 0x84, 0x66, 0xc2, 0xc3, 0xa3, 0x95, + 0x59, 0x5e, 0xb2, 0x48, 0x4a, 0x55, 0xaf, 0xad, 0x3b, 0xbd, 0xa4, 0x3b, + 0x4a, 0x3a, 0x7a, 0x3e, 0x78, 0x40, 0x86, 0x68, 0xaf, 0xbe, 0x53, 0x35, + 0x71, 0x4f, 0xc4, 0xd7, 0x88, 0x74, 0xc1, 0xb2, 0x4e, 0x9c, 0x66, 0x91, + 0xa8, 0xb0, 0xa0, 0x64, 0x7d, 0x95, 0xcd, 0x95, 0x7f, 0xbd, 0x5a, 0x87, + 0x37, 0xb0, 0x4f, 0x3f, 0x64, 0x91, 0xad, 0x54, 0x92, 0xc0, 0x3a, 0x4b, + 0x4f, 0x74, 0x6f, 0x8f, 0x70, 0x48, 0x6c, 0xa3, 0xac, 0xbb, 0xca, 0x79, + 0x95, 0x5c, 0x40, 0x81, 0x6f, 0x52, 0x5f, 0x93, 0x63, 0x4d, 0x6d, 0x37, + 0x83, 0xb1, 0x8a, 0x37, 0xc1, 0xbd, 0x71, 0x6f, 0xb4, 0xb5, 0x7f, 0x90, + 0x64, 0x5a, 0x4f, 0x99, 0x93, 0xb4, 0x8d, 0xb6, 0xb2, 0x56, 0x32, 0x7a, + 0x3d, 0x5e, 0x42, 0x60, 0x49, 0x5f, 0xbe, 0x61, 0xa1, 0x78, 0x8a, 0xaf, + 0x66, 0xa1, 0x5b, 0x40, 0x9a, 0x93, 0x41, 0x5b, 0xc3, 0x5b, 0x74, 0xb5, + 0x6c, 0x3e, 0x98, 0xa7, 0x56, 0x84, 0x9b, 0xcd, 0x65, 0x9b, 0x62, 0x5c, + 0x6b, 0x5a, 0x30, 0x33, 0xcd, 0x7e, 0xa4, 0x88, 0x80, 0x75, 0x53, 0x33, + 0x6c, 0x62, 0xad, 0x52, 0x4f, 0x52, 0x43, 0x53, 0x45, 0xb8, 0x31, 0xc4, + 0x2c, 0x6f, 0x3e, 0xb4, 0xbc, 0x7e, 0xc6, 0x42, 0x39, 0x67, 0x53, 0x7b, + 0xc8, 0xb0, 0x2e, 0x98, 0x42, 0x5f, 0x8c, 0x88, 0x91, 0x2e, 0xad, 0xaf, + 0xc2, 0x75, 0x88, 0xcf, 0x72, 0x82, 0x6f, 0x8c, 0x4e, 0xc4, 0x4b, 0xb1, + 0x5e, 0x88, 0x39, 0x37, 0x99, 0x51, 0x40, 0x95, 0xc8, 0xa4, 0xca, 0xd2, + 0x5d, 0x8f, 0x5d, 0xa4, 0x9a, 0x7c, 0x86, 0xcd, 0x76, 0xc9, 0x7b, 0x8e, + 0x85, 0x8f, 0xc0, 0xa5, 0xad, 0xad, 0x7e, 0xd0, 0x31, 0xa8, 0x3e, 0x31, + 0xb7, 0xac, 0x3d, 0xa4, 0xc1, 0x84, 0xca, 0xc1, 0x42, 0x58, 0x3b, 0x6e, + 0xb1, 0xa8, 0x44, 0xad, 0x47, 0x8c, 0x41, 0x40, 0x8a, 0xb8, 0x8d, 0x32, + 0x55, 0x90, 0xb6, 0x92, 0x5e, 0xc9, 0x67, 0x64, 0x5d, 0xa1, 0x49, 0xa6, + 0xa0, 0xa9, 0x31, 0x4f, 0xa0, 0x7a, 0x79, 0x35, 0x4b, 0x9c, 0x6e, 0x3f, + 0x7e, 0x87, 0x6a, 0x68, 0xc8, 0xb1, 0xc5, 0x86, 0x9b, 0x3d, 0xad, 0x98, + 0x79, 0x71, 0x9d, 0x8b, 0x37, 0x99, 0x4b, 0x9e, 0x97, 0x56, 0x67, 0xca, + 0x5a, 0x72, 0x35, 0x96, 0x6b, 0x4f, 0x54, 0xa0, 0xa8, 0x56, 0x44, 0xc7, + 0x62, 0xb0, 0xa6, 0xb7, 0xcf, 0xb8, 0xa9, 0x48, 0x68, 0x4f, 0x34, 0x2c, + 0x86, 0x8d, 0xa6, 0x5d, 0xc7, 0x42, 0xaf, 0x9a, 0x69, 0x37, 0x7e, 0x4e, + 0x4c, 0x7b, 0x94, 0xaf, 0xbb, 0xd6, 0x99, 0x9d, 0x67, 0x3d, 0x67, 0xd6, + 0xa9, 0xbe, 0xc3, 0x8a, 0x87, 0x88, 0x75, 0xc7, 0x60, 0x47, 0xca, 0x9e, + 0xc5, 0x92, 0xcb, 0xc2, 0xb4, 0x81, 0xd8, 0x37, 0x62, 0x49, 0x79, 0xb4, + 0x66, 0x8f, 0xc6, 0x9f, 0xa0, 0x69, 0x8b, 0x4c, 0xba, 0x55, 0x69, 0x67, + 0x95, 0x8a, 0x8c, 0x44, 0xb4, 0xb8, 0xcc, 0x76, 0x89, 0x96, 0x9d, 0xa4, + 0xc6, 0xb0, 0xb0, 0xb5, 0x58, 0x53, 0x50, 0x39, 0x38, 0x96, 0xa1, 0xa4, + 0x70, 0xb1, 0x3f, 0x4c, 0xaa, 0xbd, 0x69, 0x4e, 0xbe, 0xab, 0xad, 0x48, + 
0x4f, 0xb0, 0x9d, 0x48, 0x6d, 0xc4, 0xb8, 0x71, 0xc1, 0x56, 0x58, 0x45, + 0x8e, 0xae, 0x68, 0x65, 0x5d, 0x55, 0x56, 0xbd, 0xb0, 0x92, 0x82, 0x77, + 0xb5, 0xcb, 0xa5, 0x62, 0x84, 0x35, 0x95, 0x9f, 0x60, 0x4a, 0x4a, 0xa3, + 0x3e, 0x53, 0x5c, 0x70, 0xa9, 0x98, 0x53, 0x66, 0xa9, 0x62, 0x6e, 0x93, + 0x8b, 0x86, 0x5d, 0x7f, 0x88, 0x96, 0x93, 0x5f, 0x59, 0xbb, 0xbf, 0xcf, + 0x65, 0x6b, 0x53, 0xa2, 0xb9, 0xc5, 0x56, 0x41, 0xa0, 0xa7, 0x9c, 0x54, + 0xba, 0xba, 0xc2, 0xb6, 0x53, 0x6e, 0x51, 0x82, 0x39, 0x3b, 0xac, 0xad, + 0x8e, 0x45, 0x3d, 0x72, 0x46, 0x70, 0x61, 0x68, 0x9e, 0x5f, 0x5c, 0x3b, + 0x6d, 0x3d, 0x4f, 0x59, 0x57, 0x64, 0xb3, 0x39, 0x91, 0x8e, 0x7f, 0x71, + 0x81, 0x7f, 0x47, 0xbf, 0xc9, 0x33, 0x4c, 0xa4, 0x55, 0x43, 0xbf, 0x85, + 0x4c, 0x73, 0xd3, 0x8d, 0x8f, 0x3f, 0xdc, 0x65, 0x53, 0x98, 0x73, 0xad, + 0xcb, 0x89, 0xb3, 0xac, 0xb3, 0x87, 0xaa, 0xa0, 0xc5, 0xca, 0xd1, 0x3c, + 0x6a, 0x45, 0x7f, 0x3a, 0x42, 0x96, 0xa0, 0xbe, 0xb7, 0x9f, 0xac, 0x8f, + 0x7c, 0xc9, 0x8b, 0x81, 0x44, 0x53, 0x5c, 0x68, 0x3c, 0x49, 0x77, 0x63, + 0xd2, 0x71, 0xa3, 0x61, 0xa1, 0xbe, 0xca, 0xaa, 0xd5, 0x57, 0x3c, 0xab, + 0x4f, 0x77, 0x67, 0x4c, 0xba, 0x98, 0x6b, 0x4e, 0x9c, 0x6b, 0xb3, 0x6b, + 0x4e, 0xc2, 0x6d, 0x35, 0xc5, 0xc8, 0xc1, 0x9d, 0x45, 0x62, 0x3d, 0x8d, + 0xa0, 0x43, 0x83, 0xaf, 0xa3, 0xa7, 0x80, 0x5a, 0x56, 0xaa, 0x44, 0x7d, + 0x8f, 0x56, 0xb8, 0x77, 0x36, 0x4c, 0xba, 0x88, 0x7a, 0xb8, 0x80, 0xc3, + 0x6a, 0x43, 0x7d, 0x93, 0x49, 0x7f, 0x74, 0x6a, 0x51, 0x70, 0xce, 0x4d, + 0x4a, 0x64, 0x32, 0x69, 0x83, 0x49, 0xaa, 0xa6, 0xb6, 0x33, 0x6f, 0x68, + 0xc8, 0xd3, 0x4b, 0x55, 0x63, 0x75, 0x56, 0xc1, 0xb8, 0xc8, 0x4e, 0x9f, + 0x57, 0x4f, 0xb1, 0x7b, 0x78, 0xca, 0x60, 0x4f, 0x98, 0x5e, 0x58, 0x57, + 0xb8, 0x56, 0xbd, 0x40, 0x83, 0xd0, 0x84, 0x57, 0x85, 0x99, 0x59, 0x53, + 0x3d, 0x60, 0x80, 0x5a, 0xcb, 0x6e, 0x2e, 0x44, 0x67, 0x42, 0x32, 0x6b, + 0x4a, 0xcc, 0x97, 0x96, 0x97, 0x98, 0xa8, 0xb0, 0xcb, 0x79, 0x43, 0x82, + 0x88, 0x80, 0x82, 0xc3, 0xb5, 0x9e, 0x42, 0x9b, 0xb5, 0x66, 0x6c, 0x39, + 0x48, 0x5b, 0x94, 0xbc, 0x4f, 0x59, 0x3d, 0x80, 0x45, 0xbf, 0x35, 0x31, + 0x83, 0xa7, 0x7d, 0x4e, 0x63, 0x61, 0x87, 0x81, 0x97, 0x57, 0x55, 0x31, + 0x9f, 0xce, 0x4b, 0x52, 0x45, 0x36, 0x74, 0xa3, 0xb7, 0x4c, 0x34, 0x8d, + 0x3a, 0x83, 0x49, 0xa5, 0x95, 0x5c, 0xb6, 0x76, 0xa9, 0x81, 0x3b, 0xd9, + 0x76, 0x74, 0x93, 0x90, 0x9e, 0x6f, 0x8d, 0xca, 0x9c, 0xca, 0x88, 0xd1, + 0x8d, 0x3b, 0xb2, 0xb3, 0x60, 0x80, 0x7a, 0x40, 0x48, 0xa5, 0xca, 0x56, + 0xc6, 0xb8, 0xc5, 0x76, 0x7a, 0xae, 0x36, 0x99, 0x98, 0x4a, 0xd2, 0x49, + 0x48, 0x80, 0x64, 0xb8, 0x35, 0x7b, 0x35, 0xbd, 0x4a, 0x78, 0xca, 0x39, + 0x5e, 0x74, 0x41, 0x60, 0x98, 0xbd, 0x56, 0x56, 0x71, 0xab, 0xa3, 0x4b, + 0x81, 0x88, 0x71, 0x60, 0x2f, 0x65, 0xbc, 0xab, 0x5c, 0x69, 0x45, 0xaf, + 0x53, 0xcb, 0xb0, 0x4c, 0xa3, 0x9f, 0x92, 0x79, 0x68, 0x5f, 0xcc, 0x87, + 0x3b, 0x3f, 0xaa, 0x30, 0x6e, 0x8e, 0x47, 0x85, 0xd1, 0xbd, 0x6f, 0xc7, + 0x9f, 0xba, 0x86, 0x74, 0x64, 0x55, 0x8a, 0xa5, 0xad, 0xb0, 0x8e, 0x46, + 0x35, 0x8f, 0x33, 0x9e, 0x54, 0x8f, 0xbe, 0x6c, 0x6b, 0x8a, 0x61, 0x83, + 0x9a, 0xc9, 0x6a, 0xa0, 0xb8, 0x65, 0x92, 0xa7, 0xbe, 0x7f, 0xc3, 0x60, + 0xb8, 0xaa, 0x6f, 0x72, 0x56, 0x50, 0xba, 0x89, 0xc5, 0x5a, 0xd2, 0x7c, + 0x4a, 0x67, 0xb2, 0x41, 0x3e, 0x4b, 0xa8, 0x41, 0xc7, 0x34, 0x51, 0xbb, + 0x91, 0x7f, 0xbb, 0x79, 0x6e, 0x63, 0x41, 0xa4, 0x57, 0x76, 0x8f, 0x66, + 0xb8, 0xc0, 0xcc, 0x94, 0x99, 0x9b, 0xa6, 0x4d, 0x4a, 0x6a, 0x40, 0x34, + 0x91, 0x62, 0x5b, 0x47, 0x4f, 0x3b, 0x5b, 0xc4, 0x86, 0x6a, 0xb8, 0xa3, + 0x88, 0x40, 0x52, 0xc0, 0x52, 0xc6, 0x2e, 0x7f, 0x8d, 0x3f, 0x2f, 0xb8, + 
0xa4, 0x3b, 0x9f, 0x40, 0x94, 0x50, 0x52, 0x36, 0xb1, 0x4c, 0xa3, 0xce, + 0xc0, 0xd2, 0x6a, 0x66, 0x72, 0xce, 0xbd, 0x3d, 0x63, 0xa5, 0x7d, 0x5d, + 0x97, 0x68, 0x84, 0x8b, 0x63, 0x49, 0x9c, 0xcb, 0xad, 0xaa, 0xc2, 0x8e, + 0xce, 0x41, 0x7c, 0xc4, 0x87, 0xc9, 0x53, 0x68, 0xb7, 0xbc, 0x92, 0x84, + 0x40, 0x27, 0x87, 0x3e, 0xb0, 0xcd, 0x61, 0xaf, 0x74, 0x73, 0x51, 0x5c, + 0x84, 0xb6, 0x89, 0xb3, 0x8e, 0x8e, 0x9e, 0xa0, 0xc3, 0x3c, 0x9d, 0x68, + 0x6e, 0xb2, 0x62, 0x74, 0x95, 0x45, 0x6e, 0xc5, 0x8f, 0x74, 0xab, 0x5b, + 0xa9, 0xa2, 0xae, 0x8d, 0x5d, 0x7c, 0xa3, 0x93, 0x72, 0x41, 0xa5, 0x7b, + 0x7f, 0x61, 0x30, 0xb9, 0x64, 0x79, 0x7c, 0x79, 0x3d, 0xb8, 0x44, 0x34, + 0x94, 0x42, 0xca, 0x7f, 0x61, 0x71, 0x9a, 0x32, 0x70, 0xd8, 0x54, 0xc1, + 0x88, 0x36, 0x6f, 0x52, 0x5a, 0x89, 0xb9, 0xd1, 0xa0, 0x61, 0x6a, 0xbb, + 0x5a, 0x48, 0x5d, 0x71, 0xb3, 0x88, 0x6c, 0xc6, 0xae, 0x5a, 0x87, 0xc7, + 0xc4, 0x28, 0x96, 0x86, 0x81, 0x9a, 0xcb, 0x64, 0x62, 0x80, 0x70, 0x55, + 0x56, 0x35, 0xb8, 0xb9, 0xcf, 0xde, 0x69, 0x86, 0x50, 0xca, 0x57, 0x5c, + 0xba, 0xba, 0x59, 0x6d, 0x70, 0x26, 0x44, 0x91, 0x6f, 0x6f, 0x3b, 0x61, + 0x7c, 0x36, 0x38, 0x8c, 0xcc, 0x62, 0x83, 0x9c, 0x95, 0xba, 0x52, 0x5c, + 0x97, 0x54, 0xb2, 0x42, 0x53, 0x7e, 0x76, 0xa6, 0xc3, 0x7c, 0x3d, 0x31, + 0x35, 0x7b, 0xb9, 0x92, 0x54, 0xcf, 0x4b, 0xbc, 0x6c, 0xc1, 0xcc, 0x93, + 0x76, 0x6f, 0xbc, 0x60, 0x5e, 0x7f, 0x43, 0x2e, 0xa9, 0x70, 0x48, 0xba, + 0x3e, 0x9f, 0x4d, 0x8f, 0xcf, 0xd3, 0xa0, 0xad, 0x35, 0x43, 0x5b, 0xc6, + 0x6b, 0x7d, 0x6d, 0x37, 0x73, 0xc3, 0x6a, 0xa5, 0xc6, 0x5e, 0x56, 0xae, + 0x9b, 0xd6, 0xc7, 0x81, 0xaa, 0x62, 0xb0, 0x96, 0xc9, 0x43, 0x5e, 0x95, + 0xba, 0x9f, 0x4a, 0xb2, 0x90, 0x93, 0x52, 0x70, 0x94, 0xad, 0x83, 0xba, + 0x90, 0x84, 0x74, 0x78, 0x66, 0x77, 0x63, 0xb1, 0x8f, 0x5f, 0x94, 0xc6, + 0x70, 0x91, 0xad, 0x91, 0xcf, 0xbe, 0xc3, 0x49, 0x59, 0xcd, 0xbc, 0x9e, + 0x72, 0xd0, 0xb6, 0x75, 0xc6, 0x75, 0xc8, 0xb3, 0x7f, 0x53, 0x77, 0x54, + 0x4c, 0xa1, 0xbd, 0x7c, 0x4c, 0xb4, 0xab, 0x92, 0xb4, 0xb8, 0x4d, 0x5c, + 0x36, 0x89, 0x47, 0xa4, 0x98, 0x97, 0xa0, 0x85, 0x32, 0x45, 0x86, 0xba, + 0x3b, 0x3d, 0xbd, 0xb1, 0xa1, 0xc2, 0xa0, 0x9c, 0x7f, 0xd8, 0x68, 0x4c, + 0xab, 0xa4, 0x8a, 0x70, 0xad, 0x51, 0x77, 0x66, 0x35, 0xa5, 0xbc, 0xa7, + 0x7f, 0x5a, 0x4e, 0x40, 0xc8, 0x65, 0xca, 0x44, 0xbe, 0x59, 0x5b, 0x5d, + 0x98, 0x3e, 0xc2, 0xbe, 0x9b, 0x81, 0xc6, 0xd2, 0x68, 0xb2, 0x9d, 0x7a, + 0x4f, 0x98, 0xac, 0x74, 0x3b, 0x98, 0xcb, 0xd3, 0xd0, 0x6a, 0x4c, 0x88, + 0x8b, 0xc4, 0x57, 0xcd, 0x61, 0x83, 0x83, 0x3c, 0x87, 0x8d, 0x50, 0x33, + 0x43, 0xc3, 0xae, 0xb3, 0x54, 0x90, 0xc7, 0x6d, 0x75, 0x53, 0xb5, 0xc5, + 0xb6, 0xb0, 0x9e, 0x47, 0x76, 0x45, 0xc8, 0xa4, 0x55, 0xd6, 0x74, 0x42, + 0x3a, 0x5f, 0x3d, 0x59, 0x65, 0xab, 0x5f, 0x93, 0x59, 0x70, 0x5d, 0xa1, + 0x40, 0x4f, 0xaf, 0xb8, 0x82, 0x39, 0x9b, 0x6e, 0x88, 0x96, 0xb0, 0x4f, + 0xd2, 0x9c, 0x51, 0xa1, 0x4d, 0x96, 0x96, 0x4d, 0x56, 0x4e, 0x8e, 0x46, + 0x92, 0x8d, 0x47, 0x51, 0xb9, 0x92, 0x85, 0x7b, 0x81, 0xbc, 0xa6, 0x6d, + 0x7c, 0xa5, 0xb5, 0x7a, 0xa7, 0x8f, 0x48, 0xb1, 0x48, 0x64, 0x69, 0x56, + 0x9a, 0x56, 0x5e, 0x85, 0x8a, 0x65, 0x6d, 0x4f, 0xcb, 0x50, 0x83, 0xab, + 0xb9, 0x6c, 0xb7, 0x7c, 0xce, 0xba, 0xd3, 0x64, 0x48, 0xa3, 0x70, 0xae, + 0x52, 0xac, 0x97, 0x2f, 0xaa, 0x58, 0x95, 0x8a, 0xc9, 0x7e, 0xc8, 0x55, + 0xce, 0x71, 0x56, 0x80, 0x86, 0x3e, 0x5d, 0x7f, 0x4d, 0xb9, 0xd5, 0xce, + 0x9b, 0xb2, 0x73, 0xcc, 0x9f, 0x92, 0x6e, 0x3a, 0x99, 0xaf, 0x86, 0x6d, + 0xc7, 0x24, 0x95, 0xda, 0xb1, 0xb8, 0xcd, 0x93, 0x56, 0xd0, 0x8f, 0x66, + 0x6d, 0xcc, 0x3a, 0xaf, 0x59, 0x43, 0x49, 0x36, 0x4c, 0x91, 0xc7, 0xba, + 
0x85, 0x90, 0x9f, 0x70, 0x4f, 0x3b, 0xc8, 0xc0, 0x67, 0x5b, 0xc3, 0xb2, + 0x4a, 0xa3, 0xba, 0xcb, 0x53, 0x54, 0x32, 0xbb, 0x97, 0xad, 0x5c, 0x76, + 0xbd, 0x9f, 0x9e, 0xb3, 0x99, 0x8c, 0x3b, 0x40, 0xb9, 0x94, 0x3e, 0x99, + 0x9d, 0xd4, 0xa8, 0x49, 0xb8, 0xae, 0x50, 0x8b, 0xc2, 0x91, 0xa1, 0x6f, + 0xc2, 0x97, 0xa4, 0x62, 0xcf, 0x82, 0xcc, 0x79, 0x96, 0x4f, 0x89, 0xcd, + 0x7d, 0x83, 0xcf, 0x73, 0x3f, 0x88, 0xb8, 0x73, 0x62, 0x47, 0xa0, 0x48, + 0x78, 0x2e, 0x32, 0x5b, 0x57, 0xb1, 0x62, 0x69, 0xb5, 0x72, 0x52, 0x9a, + 0x50, 0x64, 0xc2, 0x41, 0x79, 0x38, 0x5a, 0x3e, 0x3c, 0x37, 0x9c, 0xca, + 0x49, 0xcc, 0x37, 0x42, 0x41, 0x91, 0xc9, 0xd7, 0xd2, 0xa5, 0x54, 0xae, + 0x6a, 0x4b, 0x58, 0xbb, 0xc5, 0x8a, 0xd1, 0x4f, 0xd1, 0x99, 0x4a, 0xcf, + 0x60, 0x4e, 0x93, 0xd9, 0x34, 0x96, 0xb6, 0xa5, 0x64, 0xb3, 0x3d, 0xc0, + 0x3f, 0x67, 0x68, 0x59, 0x61, 0x40, 0x9f, 0x85, 0x52, 0x64, 0x41, 0x76, + 0xcd, 0xbc, 0x80, 0x66, 0x8e, 0x3d, 0x92, 0xc0, 0xd3, 0xa6, 0x8c, 0x5a, + 0x88, 0x49, 0x8a, 0x43, 0xb1, 0x67, 0xb9, 0xcf, 0x46, 0x64, 0xae, 0x9c, + 0x30, 0xa1, 0x67, 0xac, 0x7f, 0x7b, 0xd1, 0x8a, 0x78, 0x72, 0xa9, 0x95, + 0x8a, 0x24, 0xc7, 0xb8, 0x3d, 0x43, 0xa9, 0x69, 0x69, 0x32, 0xd6, 0xbd, + 0x63, 0xc2, 0xa1, 0xba, 0x95, 0x7a, 0x3d, 0x9d, 0x5a, 0x72, 0x42, 0x4c, + 0x3e, 0xce, 0x39, 0xa6, 0x7f, 0xb6, 0x8d, 0xc6, 0x6c, 0x6b, 0x8a, 0xba, + 0xab, 0x4e, 0xa0, 0xa8, 0xc6, 0xc4, 0xde, 0x6d, 0x48, 0x5a, 0x92, 0xab, + 0x48, 0x7b, 0xbf, 0x62, 0x3d, 0xae, 0x8d, 0x79, 0x9f, 0x5a, 0x5d, 0x83, + 0x36, 0x45, 0x79, 0x91, 0xae, 0x9a, 0x78, 0x42, 0xbe, 0x76, 0x59, 0xc7, + 0xc8, 0x8e, 0xb1, 0x36, 0x59, 0x44, 0x80, 0x61, 0x8e, 0x3f, 0xb1, 0xd1, + 0x62, 0x29, 0xbb, 0x83, 0x5c, 0x42, 0x5c, 0x45, 0xcf, 0x81, 0x3e, 0x34, + 0x42, 0x72, 0x8b, 0xc5, 0x42, 0x7f, 0xd1, 0x96, 0x8a, 0x5f, 0x74, 0xbb, + 0x87, 0xa6, 0x70, 0x3e, 0xc8, 0xa4, 0x83, 0xc9, 0x37, 0x3f, 0x3a, 0x59, + 0x76, 0xa2, 0x87, 0x63, 0xbe, 0x95, 0x9d, 0x47, 0x80, 0x62, 0x8f, 0x9e, + 0x4c, 0x81, 0xcc, 0x49, 0xc9, 0x64, 0x5e, 0xa2, 0xb3, 0x7a, 0x54, 0x58, + 0x41, 0x5d, 0xac, 0xa0, 0x98, 0x98, 0x97, 0x6c, 0x6f, 0x8d, 0x61, 0x8b, + 0x63, 0x53, 0x6d, 0x52, 0x49, 0x55, 0x49, 0x98, 0x38, 0x64, 0x9d, 0x97, + 0x4c, 0x8b, 0x60, 0x91, 0xbe, 0x6f, 0x2d, 0x6f, 0x70, 0x8f, 0x89, 0x73, + 0xb4, 0xb8, 0xaf, 0x44, 0xb5, 0x93, 0x81, 0x39, 0x8b, 0xc6, 0xcc, 0xcf, + 0x51, 0x99, 0x3e, 0x86, 0xcf, 0x6a, 0xd7, 0xa6, 0x51, 0x83, 0x48, 0x58, + 0x41, 0xcd, 0xaa, 0x56, 0x5d, 0x8e, 0x9b, 0x90, 0x56, 0xc0, 0x73, 0x7d, + 0xb4, 0xb0, 0x8b, 0xdd, 0x83, 0xb5, 0x5b, 0xa3, 0x9e, 0xb2, 0xb9, 0x65, + 0x8c, 0xa0, 0x4c, 0x3e, 0x83, 0xbe, 0x80, 0xb3, 0x4f, 0x7b, 0x67, 0x42, + 0x9c, 0x47, 0x41, 0xba, 0x82, 0x6d, 0x2d, 0x88, 0x9d, 0x7c, 0x8e, 0x66, + 0x4b, 0x74, 0x71, 0xad, 0xa0, 0xb0, 0xc1, 0xcd, 0x89, 0x98, 0x8b, 0x3d, + 0xd4, 0x87, 0x99, 0xbd, 0x99, 0x85, 0x36, 0xc1, 0x77, 0x8e, 0x6b, 0x4e, + 0xd2, 0x4a, 0xa2, 0xad, 0xa7, 0x55, 0x4b, 0xb9, 0x2f, 0x30, 0x8c, 0xa0, + 0xa3, 0xc8, 0xb0, 0x5a, 0x59, 0xbc, 0x5d, 0xb5, 0xb6, 0x44, 0xa1, 0xb5, + 0xb2, 0x99, 0x9c, 0xc0, 0xaa, 0x45, 0x5f, 0x4d, 0x32, 0xab, 0x45, 0xbd, + 0x3f, 0x81, 0x87, 0x88, 0xc9, 0x84, 0x88, 0x42, 0x61, 0x74, 0x58, 0xba, + 0xae, 0xac, 0x2d, 0xb4, 0xaf, 0x98, 0x6e, 0x91, 0x32, 0xbc, 0x94, 0x58, + 0x4e, 0x91, 0xcc, 0x51, 0xbc, 0xa4, 0x7d, 0x73, 0xce, 0x81, 0x2d, 0x47, + 0x76, 0x43, 0xc3, 0x64, 0x42, 0x52, 0x60, 0xb5, 0x6e, 0x94, 0x83, 0xbf, + 0x80, 0xcd, 0xa8, 0x38, 0x44, 0x6a, 0xa5, 0xd1, 0xae, 0xcd, 0xbf, 0x81, + 0x4b, 0x5f, 0xc1, 0x9e, 0x7e, 0x90, 0x47, 0x4d, 0xb4, 0x40, 0x9d, 0x97, + 0x7d, 0x58, 0x74, 0x71, 0x9a, 0xbc, 0x38, 0x72, 0x46, 0xad, 0x58, 0x65, + 
0x94, 0x65, 0xc3, 0x58, 0x81, 0x86, 0xcc, 0x45, 0x8a, 0x3f, 0x41, 0x76, + 0x69, 0xc7, 0x34, 0x94, 0x43, 0xaa, 0xc9, 0xcc, 0x58, 0x89, 0x9a, 0xbe, + 0xc3, 0x9d, 0x52, 0xd1, 0x8d, 0x44, 0xa2, 0x6f, 0x95, 0xac, 0x95, 0xb6, + 0xc4, 0x55, 0xc3, 0x5b, 0x91, 0x3b, 0x3d, 0x85, 0xa5, 0x93, 0xc8, 0x36, + 0x86, 0x7f, 0x8a, 0x36, 0x4a, 0x7a, 0x89, 0xb0, 0x5d, 0x6d, 0x8b, 0x3b, + 0x5e, 0x7b, 0x59, 0xcd, 0xca, 0xbc, 0x48, 0x71, 0xd9, 0xb0, 0xd3, 0xac, + 0x77, 0x67, 0x5d, 0x2f, 0xc2, 0x38, 0x61, 0xae, 0x65, 0x9a, 0x4d, 0x65, + 0xaa, 0x7a, 0xcd, 0x48, 0x2f, 0xaf, 0x9c, 0xcb, 0x49, 0x90, 0xd2, 0x2f, + 0xcd, 0x68, 0x3a, 0x48, 0xaf, 0x7a, 0xd4, 0x53, 0xc6, 0x32, 0x86, 0x56, + 0xb7, 0x32, 0x5d, 0x66, 0x7a, 0x5a, 0x4a, 0x88, 0x9f, 0xb3, 0x5c, 0x8e, + 0xcf, 0x3c, 0xd1, 0xa8, 0x9b, 0xc4, 0xc6, 0x9b, 0x3a, 0x4c, 0xbc, 0x9a, + 0x8f, 0x32, 0x55, 0xc3, 0x99, 0x95, 0x5a, 0x42, 0xbc, 0xb3, 0xcc, 0xd1, + 0x77, 0x8a, 0x98, 0x69, 0xb0, 0x90, 0x5a, 0xa2, 0x81, 0x41, 0xa7, 0xb7, + 0x85, 0xa3, 0x3c, 0x72, 0x73, 0x4c, 0x55, 0x6d, 0x68, 0xd1, 0xc8, 0xc6, + 0x67, 0x8b, 0x53, 0x59, 0x66, 0x9e, 0x4e, 0x3e, 0xb4, 0xb0, 0x90, 0x60, + 0x6a, 0x68, 0x3f, 0x83, 0x32, 0x56, 0x8d, 0x46, 0xa6, 0x58, 0x81, 0x96, + 0xae, 0xb1, 0x38, 0x44, 0x99, 0x42, 0x50, 0x5f, 0xb1, 0xd0, 0xa2, 0xb9, + 0x80, 0xc1, 0x81, 0x97, 0x4d, 0x73, 0x51, 0xb7, 0xd7, 0xc0, 0xc1, 0x71, + 0xb8, 0x77, 0x72, 0xb4, 0x55, 0xb5, 0x39, 0x2f, 0x6e, 0x4f, 0x4b, 0xd1, + 0x97, 0xae, 0x51, 0x76, 0x3c, 0x6f, 0x65, 0x8e, 0x6f, 0x9e, 0x39, 0x32, + 0x96, 0x60, 0x5e, 0x38, 0x72, 0x91, 0x88, 0xcd, 0xd8, 0x77, 0xc1, 0x39, + 0x84, 0x9c, 0x6b, 0x49, 0xac, 0xc5, 0xc1, 0xd3, 0xa7, 0x76, 0x6d, 0xc9, + 0xb5, 0x64, 0xb7, 0x97, 0xae, 0x57, 0x4e, 0xa0, 0x69, 0xc6, 0x96, 0x74, + 0xb3, 0xc5, 0xc0, 0x3e, 0xcc, 0xaf, 0x90, 0xa1, 0xc9, 0x96, 0x7c, 0x60, + 0xca, 0x5e, 0x5a, 0xa7, 0x8d, 0x46, 0xbf, 0xa3, 0xa0, 0x4f, 0x8c, 0x92, + 0x76, 0x78, 0xa3, 0xa1, 0x98, 0x38, 0x37, 0x9b, 0x87, 0x98, 0x94, 0x65, + 0xc5, 0x88, 0x33, 0xb1, 0x4a, 0x50, 0x97, 0x61, 0xbb, 0x75, 0x38, 0x4e, + 0x3a, 0xb9, 0x89, 0x62, 0x59, 0x7b, 0x9b, 0x65, 0xab, 0x43, 0xd4, 0x3f, + 0x84, 0x9c, 0x67, 0x92, 0x86, 0xab, 0xc3, 0x3e, 0x57, 0x60, 0xb8, 0xab, + 0x69, 0x61, 0xab, 0x4c, 0x8e, 0xac, 0x44, 0x59, 0xb2, 0xd8, 0x8f, 0xa4, + 0x5e, 0x5c, 0x53, 0xbc, 0x50, 0x60, 0xb8, 0xb4, 0xca, 0x60, 0x73, 0xbb, + 0xcb, 0x38, 0x6e, 0x71, 0x2e, 0x84, 0xac, 0x8d, 0x86, 0xc9, 0x55, 0x49, + 0xc0, 0xce, 0xa4, 0xc6, 0xc6, 0x5d, 0x3d, 0x39, 0xd0, 0x80, 0x4d, 0x3f, + 0x72, 0x33, 0x8e, 0x85, 0x91, 0x8f, 0x7e, 0xa6, 0x36, 0xda, 0xa0, 0x97, + 0x46, 0xbc, 0x89, 0x85, 0x5e, 0xa0, 0x31, 0x5d, 0x34, 0x81, 0x37, 0x6a, + 0x7a, 0xab, 0x6a, 0x32, 0x7f, 0x97, 0xbd, 0x7e, 0xbc, 0xb8, 0x56, 0xa7, + 0xaf, 0xbc, 0x46, 0x7a, 0xa1, 0x4b, 0x99, 0x51, 0x37, 0x60, 0xd7, 0xb0, + 0xca, 0xbe, 0x9e, 0x1f, 0xbb, 0x66, 0x30, 0x4c, 0xcd, 0x4b, 0x54, 0xd4, + 0xcf, 0x70, 0xcc, 0x73, 0xcd, 0xaf, 0x44, 0x30, 0xc2, 0xca, 0xb4, 0x76, + 0x6b, 0x40, 0xb2, 0xc6, 0x97, 0x69, 0xca, 0x7a, 0x8f, 0x6a, 0xb7, 0x59, + 0x66, 0xbf, 0x40, 0xaf, 0x61, 0x2f, 0x93, 0xbe, 0x43, 0x45, 0x74, 0x55, + 0x58, 0xb8, 0x95, 0x83, 0xa9, 0x96, 0x88, 0x37, 0x88, 0xa0, 0x52, 0x5f, + 0x30, 0xae, 0xbf, 0x8d, 0x34, 0xb5, 0x8e, 0xbf, 0xbd, 0xac, 0xba, 0x88, + 0x90, 0x82, 0x43, 0x44, 0xb5, 0x82, 0x8f, 0xc6, 0x64, 0x6f, 0x99, 0xcb, + 0xb5, 0x97, 0xbf, 0xc2, 0xb3, 0xc4, 0xa9, 0xd3, 0xcc, 0x96, 0x79, 0xc8, + 0x81, 0x56, 0x61, 0x82, 0x6d, 0x8a, 0x7e, 0x38, 0x8f, 0x87, 0x8a, 0x45, + 0x50, 0x31, 0x59, 0x49, 0x5a, 0xbc, 0x3e, 0xae, 0x99, 0x50, 0x3a, 0x8c, + 0x57, 0xaf, 0x59, 0x34, 0x5e, 0x6c, 0xd1, 0x9d, 0x54, 0x43, 0x4a, 0x59, + 
0xc3, 0x76, 0xb6, 0x80, 0xd5, 0x8f, 0xcb, 0x69, 0xb5, 0x43, 0x7b, 0xbc, + 0xb4, 0xcc, 0x5a, 0x39, 0x60, 0xba, 0xbd, 0xb3, 0x35, 0xb3, 0xa3, 0xc4, + 0x36, 0xc8, 0x8a, 0x83, 0x60, 0x82, 0xd6, 0x7a, 0x3e, 0xb0, 0x32, 0xc8, + 0x37, 0x5b, 0x6f, 0x5e, 0x6c, 0xc1, 0xd4, 0xb2, 0x59, 0x40, 0x86, 0x9d, + 0x78, 0x49, 0x54, 0x5a, 0xc4, 0x7a, 0x6d, 0x60, 0x45, 0xa4, 0x56, 0x72, + 0x60, 0xc0, 0x48, 0x8a, 0xca, 0x84, 0xd3, 0x76, 0xb8, 0x44, 0x59, 0xbf, + 0x54, 0xc4, 0xa4, 0x37, 0x63, 0xcb, 0x7f, 0x63, 0xa5, 0x49, 0x72, 0xd1, + 0x94, 0x62, 0x8c, 0x49, 0xcd, 0x43, 0x53, 0x72, 0x6d, 0xb2, 0xa9, 0x7f, + 0xb3, 0xad, 0x87, 0x6f, 0x59, 0xc0, 0x71, 0xcd, 0x64, 0x3f, 0x4f, 0xa6, + 0x92, 0xba, 0x86, 0x78, 0x44, 0xa2, 0x71, 0xab, 0x72, 0x9a, 0xb3, 0x32, + 0xaa, 0x9e, 0x8e, 0x75, 0xae, 0x41, 0xba, 0xbd, 0x56, 0xb9, 0x9e, 0x88, + 0x7b, 0x9a, 0xa9, 0xc9, 0x89, 0x87, 0x71, 0x59, 0x44, 0xc8, 0x42, 0x89, + 0x9d, 0x6d, 0xc2, 0x38, 0x42, 0xb9, 0xa1, 0x8d, 0xbf, 0xcc, 0x64, 0xc4, + 0x3d, 0x42, 0x99, 0x99, 0xc0, 0x3e, 0x56, 0x71, 0xaf, 0x75, 0xca, 0x90, + 0x41, 0x3c, 0x47, 0xb8, 0x75, 0x6d, 0x5b, 0x6a, 0x42, 0x31, 0xa9, 0xae, + 0x6c, 0x8a, 0x67, 0x8b, 0x91, 0xc8, 0x36, 0x81, 0xc4, 0x56, 0xa8, 0x3c, + 0xa5, 0x3f, 0x50, 0x7a, 0x38, 0x3f, 0x41, 0xb9, 0x2e, 0x79, 0x8b, 0xbb, + 0x75, 0x56, 0x6f, 0x5a, 0x43, 0x3f, 0x8a, 0xa4, 0x6a, 0x34, 0x56, 0x33, + 0xba, 0x42, 0xa5, 0xae, 0x78, 0x86, 0x4d, 0x71, 0x69, 0x70, 0xc7, 0x6d, + 0xd4, 0x97, 0x64, 0x33, 0x99, 0x6e, 0x8b, 0x61, 0x7a, 0x75, 0x7f, 0x9e, + 0x84, 0x91, 0x97, 0x67, 0x8a, 0xbe, 0x47, 0xab, 0x60, 0xaf, 0xb7, 0xbb, + 0x82, 0x49, 0x50, 0x46, 0xba, 0x89, 0x7e, 0xcd, 0x9d, 0x3b, 0x32, 0x3c, + 0x5c, 0x81, 0xd0, 0x39, 0x40, 0xce, 0x4a, 0x5a, 0x61, 0x6b, 0xb0, 0x47, + 0x8e, 0x33, 0xab, 0x8e, 0xa5, 0x6b, 0x70, 0x47, 0xc4, 0xb9, 0x4b, 0x59, + 0x56, 0x80, 0xc2, 0x35, 0xae, 0x86, 0x95, 0x8a, 0x9f, 0xb1, 0x3d, 0x4b, + 0x88, 0x99, 0xd1, 0xba, 0x9e, 0xcf, 0x70, 0x64, 0x31, 0xb1, 0xb3, 0xbe, + 0xb8, 0x9b, 0x54, 0xc5, 0x94, 0x34, 0x78, 0x51, 0xaf, 0xb8, 0x38, 0x9c, + 0xd3, 0x55, 0x93, 0x74, 0x49, 0x34, 0x31, 0x82, 0x38, 0xa2, 0x39, 0x92, + 0x8c, 0xb1, 0xb4, 0xc0, 0xc4, 0x9c, 0x4c, 0x44, 0xc0, 0x3c, 0x9e, 0x34, + 0x4e, 0xa6, 0x75, 0x9b, 0x5d, 0x79, 0x61, 0xa3, 0xb7, 0xb5, 0x93, 0xa4, + 0xbd, 0x31, 0x87, 0xc2, 0xc3, 0x93, 0x88, 0xb6, 0xb5, 0x45, 0xb4, 0x65, + 0xa5, 0x3e, 0x45, 0xbd, 0x73, 0x68, 0xc9, 0x97, 0xb6, 0x5a, 0x77, 0xd1, + 0x75, 0x7e, 0x3a, 0xb3, 0x79, 0xb3, 0x4c, 0x40, 0x70, 0x74, 0x84, 0x48, + 0x61, 0x4c, 0xab, 0xb4, 0x9a, 0xcb, 0xb4, 0xd8, 0x6e, 0xb4, 0x8a, 0x96, + 0xc6, 0xb4, 0x73, 0xba, 0x45, 0xa0, 0xc3, 0x82, 0x7d, 0x88, 0x53, 0x6e, + 0x74, 0x9b, 0x93, 0x3c, 0x5a, 0x87, 0xbe, 0x5b, 0x3e, 0x3f, 0x8e, 0x5f, + 0xab, 0xaa, 0x9a, 0x80, 0x5b, 0xc8, 0x73, 0xa2, 0x7b, 0xae, 0x89, 0xc9, + 0x78, 0xca, 0x82, 0x82, 0x87, 0xb0, 0x46, 0xa0, 0xb9, 0xa7, 0xb6, 0x56, + 0xa0, 0xce, 0x52, 0x4d, 0x55, 0x42, 0x9b, 0x6d, 0xa4, 0x4a, 0x71, 0x9f, + 0xb1, 0x56, 0x70, 0x7d, 0x40, 0x90, 0xaf, 0x29, 0x41, 0xc1, 0x9d, 0x49, + 0xca, 0x38, 0x72, 0xd2, 0x6e, 0x43, 0x9c, 0x55, 0x84, 0x9c, 0xbb, 0x71, + 0x4b, 0x3c, 0x9d, 0x3e, 0x56, 0xc8, 0x70, 0x5f, 0xbf, 0x93, 0xce, 0xd8, + 0x89, 0x9b, 0x60, 0x9d, 0xa8, 0x6e, 0x35, 0x5a, 0xbb, 0x5f, 0xaa, 0xd2, + 0xaa, 0x5d, 0x35, 0x3c, 0x6f, 0xcb, 0x8a, 0x47, 0x99, 0x4c, 0x41, 0xb4, + 0x39, 0xab, 0x94, 0x96, 0x61, 0x43, 0x49, 0x6b, 0xb2, 0x7a, 0x6c, 0x3b, + 0x6b, 0x53, 0x5c, 0xdc, 0x6c, 0xb7, 0x9b, 0x53, 0xc3, 0x93, 0xcb, 0x93, + 0xa3, 0x64, 0x36, 0x3f, 0x7c, 0x65, 0xcd, 0x81, 0xc9, 0x97, 0x5a, 0xb3, + 0x34, 0x31, 0x55, 0x32, 0x34, 0x75, 0xd2, 0xae, 0xb1, 0x28, 0x9d, 0x97, + 
0x96, 0x62, 0x40, 0x6e, 0xa8, 0x96, 0xa2, 0x99, 0xbc, 0x6c, 0x42, 0x91, + 0x72, 0x43, 0xad, 0xc2, 0xb7, 0xcd, 0xa6, 0x50, 0x92, 0xa2, 0x5d, 0x60, + 0x4a, 0x91, 0x84, 0xb3, 0x3d, 0xb9, 0x62, 0x3b, 0xa1, 0x50, 0xbd, 0xba, + 0x5e, 0x8d, 0xd7, 0xce, 0x8d, 0x6e, 0x4d, 0x70, 0x33, 0x31, 0xc5, 0xb5, + 0x5c, 0x86, 0x6e, 0x96, 0x51, 0xc9, 0xd0, 0xcc, 0x8f, 0xae, 0x8d, 0xa4, + 0x56, 0x3a, 0xb6, 0x99, 0x5a, 0x87, 0x5c, 0xa4, 0x6e, 0xab, 0x72, 0xad, + 0x67, 0xa7, 0xcd, 0x79, 0x6f, 0x3a, 0x53, 0xd2, 0x37, 0xce, 0x9a, 0xb7, + 0x84, 0x6a, 0x6b, 0x8e, 0xb0, 0x65, 0x61, 0x8d, 0x73, 0x9b, 0xa1, 0x4f, + 0xd2, 0x9a, 0x9a, 0x8b, 0xd3, 0x9d, 0x9a, 0x40, 0xb8, 0x2f, 0x5d, 0x8c, + 0xa4, 0xcd, 0x48, 0x38, 0xbb, 0xc8, 0xbe, 0x93, 0x88, 0x9d, 0x3b, 0x74, + 0x5b, 0xc1, 0x35, 0x41, 0xc0, 0x80, 0xae, 0x87, 0x78, 0x52, 0x4a, 0x9e, + 0x67, 0x33, 0xbe, 0x4f, 0xb5, 0x83, 0xca, 0xd6, 0xa6, 0xa4, 0x70, 0xab, + 0x49, 0xaa, 0x6d, 0x59, 0x3f, 0x59, 0x51, 0x7b, 0x61, 0x3f, 0x38, 0x43, + 0xd1, 0x5a, 0x64, 0xc2, 0x8a, 0xd4, 0xc4, 0x48, 0x40, 0xd7, 0xb9, 0x81, + 0x75, 0x5c, 0xae, 0x4d, 0x8c, 0x93, 0x97, 0x91, 0x8e, 0x6d, 0xa8, 0x21, + 0x57, 0x34, 0xaf, 0x6d, 0x71, 0x88, 0x69, 0x99, 0xbd, 0x5a, 0xb0, 0xc7, + 0xb8, 0xc8, 0x37, 0x5c, 0x50, 0x98, 0x3f, 0x51, 0x39, 0x7b, 0x55, 0x69, + 0x70, 0x44, 0x5d, 0x30, 0xc1, 0x81, 0x2b, 0x4d, 0x62, 0x60, 0x81, 0x8d, + 0x58, 0x72, 0x99, 0x45, 0x3c, 0x32, 0x57, 0x5e, 0xa6, 0x2d, 0xce, 0x3e, + 0x7d, 0x63, 0xc4, 0xb8, 0xca, 0x86, 0xaf, 0x86, 0x32, 0x60, 0x62, 0xcd, + 0x48, 0x3b, 0x40, 0x34, 0xa1, 0x44, 0xb4, 0xa4, 0x33, 0xb6, 0xbd, 0x28, + 0xc0, 0xb1, 0xb4, 0x13, 0xa6, 0xb1, 0x37, 0x69, 0x46, 0x7d, 0xac, 0xb8, + 0x5a, 0xba, 0xa2, 0xa2, 0x60, 0x53, 0xb2, 0x47, 0x8b, 0x68, 0x8f, 0x38, + 0x45, 0x52, 0x4c, 0x6c, 0x8a, 0xad, 0x5b, 0x66, 0x50, 0xb6, 0xc9, 0xaa, + 0x73, 0x36, 0x82, 0xb5, 0x97, 0x4b, 0x7b, 0x73, 0xcf, 0xca, 0x5a, 0x6b, + 0xa8, 0xaa, 0x53, 0xba, 0x40, 0x5d, 0x69, 0x88, 0x99, 0xa4, 0xcf, 0x7b, + 0xc4, 0x63, 0xc6, 0x42, 0x3f, 0x6c, 0x96, 0x53, 0xc7, 0xc0, 0x67, 0x93, + 0x37, 0x8e, 0x9e, 0x95, 0x4d, 0x70, 0x3b, 0x83, 0xaa, 0x87, 0xb6, 0x42, + 0x47, 0x40, 0x38, 0xa2, 0x9c, 0x4f, 0x62, 0xc5, 0xb3, 0xa0, 0xc3, 0x8e, + 0x9f, 0xbc, 0x37, 0x5e, 0xad, 0x30, 0xb2, 0x6f, 0x69, 0xc6, 0xbc, 0x58, + 0x77, 0x7a, 0x75, 0x97, 0x36, 0x74, 0xc2, 0x5e, 0xa4, 0xaf, 0x72, 0x4f, + 0xb5, 0x34, 0x77, 0x69, 0xc5, 0xba, 0xcd, 0x3e, 0xad, 0x3e, 0x1b, 0xaf, + 0x6a, 0x23, 0xb7, 0x53, 0x79, 0x30, 0xc5, 0xc7, 0xb4, 0x5c, 0xc5, 0x77, + 0x83, 0x84, 0x4a, 0xaf, 0x55, 0xbd, 0x81, 0x76, 0xa8, 0x83, 0xc9, 0x8d, + 0x63, 0x66, 0x61, 0xa4, 0x51, 0x86, 0x48, 0xa5, 0x3d, 0x58, 0xa5, 0xa0, + 0x36, 0x40, 0x5f, 0xb2, 0x84, 0x4e, 0x74, 0x51, 0x5e, 0x4b, 0x4a, 0xb0, + 0x9e, 0x5e, 0x3b, 0x6c, 0x6d, 0x59, 0x96, 0x66, 0xb2, 0x89, 0x8b, 0x84, + 0x88, 0x9d, 0x5a, 0x72, 0xd5, 0xa2, 0x82, 0x7b, 0x8c, 0xa5, 0xb9, 0x63, + 0x5a, 0x61, 0xac, 0x9e, 0xa3, 0x4f, 0x69, 0xa3, 0x5a, 0xa8, 0x81, 0xb7, + 0x9a, 0x6d, 0x4f, 0xb6, 0xcc, 0x51, 0x75, 0x8d, 0xd0, 0x34, 0x50, 0x64, + 0x5c, 0xa3, 0xa8, 0x97, 0x6a, 0xad, 0xc4, 0x89, 0x5d, 0xc4, 0xad, 0x4c, + 0xa0, 0x2d, 0xcf, 0xae, 0x6e, 0x97, 0x8f, 0x65, 0x4f, 0x54, 0x90, 0xc3, + 0x38, 0x5e, 0xae, 0x62, 0x6b, 0x5c, 0xc6, 0xb7, 0xa0, 0x41, 0x82, 0x73, + 0x87, 0x8e, 0x5a, 0x96, 0xdc, 0x54, 0x56, 0x90, 0x4a, 0x89, 0xbe, 0x44, + 0x84, 0xc4, 0xbe, 0x8c, 0x38, 0x6f, 0x63, 0x4b, 0x4f, 0x57, 0x73, 0x48, + 0xbc, 0x45, 0xbe, 0x45, 0x3e, 0x46, 0xe5, 0x52, 0x72, 0x9b, 0xba, 0x42, + 0x34, 0x4c, 0xab, 0x6b, 0x55, 0xd8, 0xbe, 0xbc, 0x53, 0xa9, 0x3d, 0x5c, + 0x80, 0x6d, 0x4e, 0x58, 0x78, 0x6d, 0xd8, 0x9b, 0x75, 0xe0, 0xcd, 0x3a, + 
0x2e, 0x89, 0x8b, 0x71, 0xce, 0x3d, 0xbb, 0xc7, 0x78, 0xc5, 0x7c, 0xbd, + 0x57, 0x96, 0xb3, 0x32, 0xb5, 0xd4, 0x5f, 0x49, 0x71, 0x3f, 0xd0, 0x53, + 0xb2, 0xae, 0x92, 0x50, 0xcc, 0x3f, 0x3f, 0x82, 0x4c, 0x62, 0x66, 0xb3, + 0x43, 0x73, 0xb8, 0x8c, 0x5c, 0x66, 0x6d, 0x88, 0xd0, 0xb9, 0xbb, 0x5c, + 0xc5, 0xab, 0x5c, 0x7b, 0xa3, 0xb4, 0xdd, 0x4d, 0xce, 0xb5, 0x3e, 0xda, + 0x80, 0x86, 0xd4, 0x56, 0x94, 0x78, 0x55, 0x51, 0x5b, 0xb8, 0x39, 0x98, + 0xba, 0x4e, 0x79, 0x8b, 0xa8, 0xe0, 0xa3, 0xac, 0xba, 0x47, 0x9b, 0xd2, + 0x8a, 0x7e, 0x28, 0xaf, 0xc2, 0x5a, 0x6d, 0xb9, 0xb3, 0xaa, 0x5b, 0x71, + 0xc7, 0x96, 0x54, 0x52, 0xba, 0xd0, 0x8e, 0x2a, 0x50, 0xd1, 0x38, 0xd9, + 0x60, 0xb9, 0x9b, 0x4f, 0xc0, 0x5d, 0x32, 0x54, 0xc6, 0x5a, 0xa9, 0x78, + 0x42, 0x94, 0xb7, 0xd6, 0xc7, 0x36, 0x78, 0x95, 0xbb, 0x87, 0x4b, 0x93, + 0xbd, 0xa7, 0xaa, 0x28, 0x6d, 0x43, 0x5e, 0x4c, 0x57, 0x4c, 0x9f, 0x97, + 0x78, 0x68, 0xbc, 0x42, 0x4d, 0xc9, 0xb9, 0x42, 0x39, 0x41, 0xa5, 0x29, + 0x92, 0xb7, 0x32, 0xc5, 0x85, 0x88, 0x92, 0x62, 0xa4, 0x66, 0x75, 0x38, + 0x50, 0x4a, 0x59, 0x6a, 0xa5, 0xa6, 0x91, 0xd1, 0x94, 0x8b, 0xad, 0xb6, + 0xda, 0x7a, 0x3f, 0x82, 0xc6, 0x8d, 0xad, 0x7b, 0x80, 0xa6, 0x7f, 0x47, + 0x24, 0x68, 0xbb, 0x36, 0x9b, 0x41, 0x61, 0x89, 0xb4, 0x5f, 0x50, 0x5e, + 0x9f, 0xb4, 0x6c, 0x2b, 0xd3, 0x79, 0xb4, 0x22, 0xad, 0xb6, 0x79, 0x9a, + 0xab, 0x61, 0x84, 0x7f, 0x66, 0x28, 0xa6, 0x92, 0x86, 0xd0, 0xce, 0x5c, + 0x35, 0x67, 0x65, 0x9a, 0xbb, 0x96, 0x68, 0x86, 0xc4, 0x8f, 0x49, 0xd1, + 0xb7, 0x38, 0x9c, 0x7e, 0x79, 0x33, 0x4d, 0x41, 0x67, 0x72, 0xd9, 0xcd, + 0x9a, 0x6c, 0x6e, 0xba, 0xc0, 0x87, 0x54, 0x96, 0x32, 0x98, 0x5e, 0x5c, + 0x3c, 0x4a, 0xc6, 0xb6, 0x33, 0x3f, 0xc7, 0x33, 0xa1, 0x48, 0xd1, 0x4e, + 0x75, 0x5b, 0xad, 0x9e, 0xad, 0x5a, 0xb9, 0x3f, 0xb6, 0x66, 0x62, 0x4a, + 0xa7, 0x5c, 0x89, 0x50, 0xb8, 0x9d, 0x82, 0x6c, 0x53, 0x5c, 0xa3, 0xc0, + 0x42, 0x92, 0x8d, 0xa9, 0x3d, 0x34, 0x9e, 0x8d, 0xbe, 0x83, 0xa1, 0x96, + 0x5c, 0x5d, 0x4b, 0x5e, 0x96, 0x98, 0x39, 0x95, 0xd5, 0xde, 0x7e, 0x73, + 0x4c, 0x9a, 0x86, 0x23, 0x67, 0x8c, 0x61, 0x2e, 0x9e, 0xc6, 0x50, 0x75, + 0x7d, 0xb6, 0x39, 0xab, 0x57, 0x43, 0xbb, 0x3f, 0x66, 0x2c, 0x8a, 0x7b, + 0xae, 0x81, 0x65, 0x58, 0x71, 0xa5, 0x75, 0x5d, 0x6a, 0x4d, 0x67, 0xd2, + 0x75, 0xab, 0x71, 0xd5, 0x97, 0xc1, 0xbc, 0xae, 0x56, 0x4a, 0x32, 0xd2, + 0xc1, 0xa2, 0xb6, 0x75, 0x9a, 0x6a, 0x39, 0xb1, 0x88, 0x44, 0x54, 0xbc, + 0x4b, 0x9a, 0x78, 0x8d, 0x8c, 0x50, 0x29, 0x79, 0xa4, 0xcb, 0x5e, 0x5a, + 0x7f, 0xc4, 0x94, 0xa0, 0x7d, 0xb7, 0x8d, 0xa4, 0x9e, 0xa9, 0x84, 0x7b, + 0x81, 0xb4, 0x7f, 0xd5, 0x32, 0x66, 0x85, 0xca, 0xb1, 0xb1, 0xd5, 0x88, + 0x39, 0x3a, 0xcd, 0x58, 0xa3, 0x4e, 0x5b, 0xbb, 0x73, 0x93, 0xd2, 0xbd, + 0x59, 0x5b, 0x49, 0xbe, 0x9d, 0x4b, 0xa0, 0xb7, 0x8d, 0xaf, 0xcd, 0xa7, + 0x58, 0x42, 0x5a, 0x68, 0x3f, 0x5c, 0x59, 0xd2, 0x76, 0x35, 0x7b, 0xab, + 0x9b, 0xae, 0xa2, 0xab, 0xb4, 0x7a, 0x63, 0x7b, 0x2d, 0xb5, 0x99, 0x3e, + 0x81, 0x54, 0x4f, 0x2a, 0x19, 0x6c, 0xb0, 0xd6, 0x47, 0x9d, 0x3b, 0x31, + 0xe1, 0xbd, 0x8a, 0x9b, 0xa6, 0x37, 0x54, 0x9e, 0xaf, 0x39, 0x3d, 0x6a, + 0x27, 0x7e, 0xcd, 0x65, 0x4f, 0x7d, 0x70, 0x5e, 0xad, 0xc6, 0x6f, 0xd7, + 0x85, 0x8d, 0x7c, 0x70, 0x93, 0x6f, 0xe1, 0x8c, 0x92, 0x53, 0x86, 0xa5, + 0x5d, 0x3b, 0x2d, 0x7c, 0x9c, 0xc5, 0x49, 0xb1, 0x57, 0x63, 0xc6, 0xab, + 0x78, 0x6b, 0x67, 0x50, 0x9c, 0x50, 0x54, 0x7c, 0x66, 0xab, 0x2f, 0x3f, + 0x98, 0x85, 0x41, 0xbb, 0x49, 0xbc, 0x81, 0x6a, 0x78, 0x99, 0x77, 0xc9, + 0x43, 0xc6, 0x2a, 0x56, 0xca, 0x6b, 0x98, 0xc2, 0x9c, 0x76, 0x67, 0x64, + 0x61, 0xa7, 0x60, 0xa2, 0x84, 0x54, 0x74, 0x81, 0x49, 0x59, 0xc3, 0x36, + 
0x68, 0x86, 0x76, 0xa0, 0x62, 0xc6, 0x60, 0xd8, 0x88, 0x90, 0x38, 0xb0, + 0x50, 0x69, 0xd4, 0xbe, 0xab, 0xcb, 0x8e, 0x49, 0x57, 0xcc, 0x9b, 0x38, + 0xb7, 0xaa, 0xc7, 0xe4, 0x7e, 0xbb, 0x8e, 0xac, 0x9b, 0x80, 0xa5, 0x57, + 0x33, 0x2f, 0x8f, 0x6b, 0xae, 0xa7, 0x87, 0xb1, 0x8b, 0x92, 0xa9, 0x8f, + 0xa7, 0x4c, 0x9b, 0xc0, 0x77, 0x57, 0xb2, 0x6a, 0x7b, 0x60, 0x72, 0x7f, + 0x5b, 0x8d, 0xbb, 0x7b, 0x36, 0x47, 0x95, 0x51, 0x7c, 0x3a, 0x97, 0xbd, + 0x6f, 0x37, 0xcc, 0xdc, 0x37, 0x93, 0x40, 0x60, 0x9a, 0x51, 0xa4, 0x8d, + 0x6c, 0xdf, 0xd4, 0xa9, 0xab, 0x3c, 0x3a, 0xec, 0x83, 0x8a, 0x87, 0x69, + 0x54, 0xa4, 0x63, 0xad, 0x52, 0x33, 0x9f, 0x5a, 0x5f, 0x5c, 0xae, 0x8b, + 0xb7, 0x57, 0x5c, 0x37, 0x8e, 0xc2, 0xa1, 0xb8, 0x55, 0x87, 0xbe, 0xaa, + 0x40, 0x67, 0x51, 0x3b, 0x5b, 0x44, 0xa6, 0x69, 0x87, 0x5b, 0xa0, 0x47, + 0x44, 0x28, 0x59, 0xb9, 0xa8, 0x95, 0x4a, 0x98, 0xca, 0xaa, 0x75, 0x62, + 0x8b, 0x9a, 0x94, 0xc7, 0x6e, 0xb1, 0x81, 0x55, 0x73, 0x45, 0x93, 0x3b, + 0x8c, 0xcf, 0xc7, 0xc6, 0x7a, 0xa6, 0x4e, 0x63, 0x3e, 0x63, 0x47, 0x51, + 0x49, 0xa8, 0x3b, 0x75, 0x6a, 0xd3, 0x79, 0x6f, 0xc8, 0xd5, 0xc7, 0x51, + 0xac, 0x64, 0x8c, 0xbb, 0xc1, 0xb7, 0xb4, 0x54, 0x57, 0xa6, 0x81, 0x62, + 0x90, 0x88, 0x6e, 0x3d, 0x7c, 0x66, 0x4e, 0xbd, 0xc9, 0xb7, 0x4a, 0x5c, + 0x38, 0x4a, 0xa0, 0x59, 0x7c, 0x14, 0x58, 0x50, 0x8f, 0xca, 0x6e, 0x45, + 0xb8, 0xb1, 0xa6, 0x70, 0xd5, 0xcb, 0xb1, 0x81, 0xaa, 0x8b, 0xa9, 0x59, + 0x50, 0x7c, 0x6b, 0x3b, 0xc3, 0xc8, 0xc1, 0x51, 0x8e, 0x8d, 0x53, 0x7c, + 0x92, 0x3f, 0x9a, 0x99, 0xaf, 0xb4, 0x93, 0x7e, 0x82, 0xad, 0x74, 0x76, + 0xa1, 0x5c, 0x57, 0x39, 0xa2, 0xa6, 0x76, 0xcc, 0xbd, 0x7c, 0x4f, 0xbd, + 0xad, 0xac, 0x70, 0x9c, 0x56, 0xa4, 0x6e, 0xaa, 0x7c, 0xb4, 0x40, 0x86, + 0xcb, 0x7f, 0x30, 0x63, 0xb1, 0x31, 0x3c, 0x5f, 0x73, 0x91, 0x6c, 0x51, + 0xc5, 0x62, 0x80, 0xcd, 0x49, 0xc8, 0x70, 0xba, 0xad, 0xd8, 0x61, 0x9d, + 0xa8, 0x60, 0x6b, 0x56, 0x85, 0xce, 0x9e, 0x3f, 0x76, 0x71, 0x70, 0x38, + 0xa2, 0xac, 0x99, 0xb2, 0xb8, 0xc2, 0x40, 0x99, 0xa2, 0x3b, 0x8a, 0x54, + 0xcb, 0x8c, 0x61, 0x42, 0x9e, 0x2d, 0x79, 0x84, 0x67, 0x55, 0x52, 0xa5, + 0xd9, 0xb5, 0x82, 0x48, 0x9a, 0x8d, 0xa4, 0x50, 0xc3, 0x96, 0x3c, 0xb8, + 0x89, 0x4e, 0x92, 0xc7, 0x4c, 0xd0, 0x73, 0x29, 0x80, 0x99, 0x2d, 0xac, + 0x5f, 0xc2, 0x9d, 0xa6, 0x81, 0xe0, 0xc5, 0x78, 0x90, 0xb0, 0x64, 0x45, + 0x69, 0xa6, 0xbb, 0xca, 0x45, 0x30, 0x4b, 0x90, 0x4c, 0xab, 0x7f, 0x72, + 0x68, 0xc5, 0x3b, 0x5f, 0x88, 0x40, 0x84, 0x85, 0xca, 0x5f, 0x77, 0x6a, + 0xab, 0xa3, 0x84, 0x73, 0x59, 0x52, 0xbb, 0x9e, 0xc3, 0x3c, 0x97, 0x2d, + 0x94, 0x80, 0xb7, 0x9e, 0xba, 0x91, 0xd1, 0xb3, 0x7e, 0x16, 0x8c, 0x47, + 0xb5, 0x59, 0x9b, 0xa9, 0xc0, 0x9b, 0x80, 0x78, 0x51, 0x7c, 0x9f, 0x43, + 0xb6, 0x47, 0xb8, 0x6e, 0x37, 0x5d, 0x9b, 0x51, 0xb3, 0x7b, 0x92, 0xb9, + 0x9b, 0x7f, 0x93, 0x75, 0x5c, 0x73, 0xbc, 0x64, 0xad, 0x41, 0x59, 0x75, + 0x58, 0x63, 0x3c, 0xc1, 0xce, 0x59, 0x72, 0xb4, 0xcc, 0x46, 0x7a, 0x4b, + 0xd2, 0x9a, 0xc3, 0xa8, 0x41, 0x43, 0xab, 0x78, 0x9f, 0x71, 0x70, 0xa8, + 0x42, 0x59, 0x4d, 0x84, 0xb3, 0x82, 0xa4, 0x8b, 0xd2, 0x61, 0x9a, 0x49, + 0x5d, 0xad, 0x6e, 0x3d, 0x6f, 0x8f, 0x5c, 0x95, 0x3c, 0x92, 0x5d, 0x5a, + 0xda, 0x46, 0x66, 0xb6, 0x5e, 0x7e, 0x7f, 0x76, 0x3c, 0x5e, 0xc2, 0x72, + 0x8d, 0x97, 0x73, 0xa6, 0x2d, 0x6d, 0xcb, 0x4f, 0x9c, 0x3d, 0x63, 0x57, + 0x71, 0x2c, 0xc9, 0xc4, 0x40, 0xa3, 0x35, 0x69, 0x32, 0x3f, 0xa9, 0x75, + 0x5c, 0x92, 0xae, 0xc6, 0xf5, 0xc3, 0x7b, 0x9b, 0x8e, 0x93, 0xaa, 0x73, + 0x81, 0xa7, 0x6f, 0xc6, 0xa5, 0x7b, 0x6f, 0x68, 0xd0, 0x60, 0x4e, 0x76, + 0x79, 0x6e, 0xca, 0xcc, 0x8d, 0x5a, 0x89, 0x5c, 0x97, 0xbf, 0x93, 0x3d, + 
0xaf, 0x71, 0xbb, 0x8f, 0x9e, 0xbc, 0x39, 0xc1, 0x71, 0xd0, 0x46, 0xa1, + 0x80, 0x82, 0x9c, 0x52, 0x97, 0x33, 0x56, 0x96, 0x86, 0xb8, 0xb3, 0x7e, + 0x43, 0xa0, 0xca, 0xc0, 0x61, 0x79, 0x94, 0xd6, 0xb3, 0x84, 0x5d, 0xce, + 0xa3, 0x43, 0x76, 0x58, 0x97, 0xc1, 0x64, 0xc2, 0x9c, 0xdb, 0xab, 0xa3, + 0x6b, 0xd3, 0x7e, 0x36, 0x73, 0xb9, 0x5f, 0x48, 0x2f, 0x3f, 0x6d, 0x62, + 0x8e, 0x8b, 0x34, 0x5f, 0x42, 0x88, 0x3d, 0x3e, 0xc1, 0x59, 0x71, 0x78, + 0x87, 0xae, 0x78, 0x40, 0x7b, 0x4e, 0xba, 0xac, 0x9e, 0x67, 0x81, 0xd7, + 0xaf, 0xb1, 0xc6, 0x33, 0x6b, 0x3e, 0x70, 0xad, 0xcc, 0x43, 0xa0, 0xba, + 0x85, 0x2c, 0xb9, 0xe3, 0x60, 0xb1, 0x99, 0x63, 0x45, 0x8a, 0xc1, 0x4a, + 0x8f, 0x57, 0x81, 0x43, 0xa9, 0x3b, 0x2c, 0x85, 0xbb, 0x35, 0x3a, 0x6b, + 0x65, 0xc6, 0x91, 0x2f, 0x23, 0x9d, 0x51, 0x3e, 0x94, 0x5a, 0x84, 0x4e, + 0x38, 0x65, 0x43, 0xc6, 0x74, 0x48, 0x96, 0x4d, 0x56, 0xa0, 0x82, 0xc8, + 0x36, 0x2c, 0xc2, 0xb0, 0x65, 0xa5, 0x9d, 0x87, 0x6f, 0x51, 0x76, 0x57, + 0x57, 0x7b, 0x45, 0x5c, 0x35, 0x79, 0x7f, 0xa0, 0xbb, 0xbf, 0x39, 0xa1, + 0x8d, 0x88, 0xcd, 0xd2, 0x53, 0xa5, 0x36, 0xc2, 0xbf, 0x74, 0x4e, 0x84, + 0x75, 0xb0, 0x2c, 0x38, 0xa4, 0x9d, 0x58, 0x44, 0x87, 0xaf, 0x4d, 0x46, + 0xcf, 0xa0, 0x8c, 0x6c, 0x71, 0x8c, 0x88, 0x8e, 0x5f, 0x58, 0x58, 0x60, + 0x7c, 0xcf, 0x34, 0x7f, 0x9a, 0xbe, 0x4c, 0x3a, 0x9f, 0xa2, 0xa8, 0x7d, + 0xaf, 0x80, 0xcf, 0x51, 0x9f, 0x7a, 0x60, 0x82, 0xb0, 0xb8, 0x90, 0x80, + 0xb0, 0x86, 0x85, 0x35, 0xd7, 0x4e, 0x74, 0xb4, 0x67, 0xb8, 0x5f, 0x77, + 0x9c, 0xab, 0x7d, 0xab, 0x79, 0xa6, 0x56, 0x68, 0xd5, 0x63, 0x52, 0x8e, + 0xc2, 0x5e, 0x7d, 0xd4, 0x75, 0x6f, 0xbd, 0x96, 0x53, 0xb1, 0x58, 0x9c, + 0x7d, 0xbd, 0x42, 0x39, 0xb5, 0x7d, 0x5b, 0xd0, 0x5d, 0xc5, 0x5f, 0xca, + 0xc8, 0x8d, 0x4f, 0xa6, 0xa3, 0x3c, 0x81, 0x81, 0x94, 0x50, 0xb1, 0xd8, + 0x32, 0x76, 0xa2, 0x3b, 0x74, 0xc6, 0x73, 0xcc, 0x83, 0xab, 0x4a, 0x3d, + 0xc3, 0xad, 0xae, 0x61, 0xb6, 0x52, 0xbd, 0xd1, 0x6e, 0xb9, 0xd0, 0xb6, + 0x85, 0xd0, 0xbd, 0xb1, 0x38, 0x5e, 0x6e, 0x60, 0x6f, 0x7c, 0x35, 0x84, + 0x8e, 0x40, 0xb5, 0x50, 0xa9, 0x7c, 0x43, 0x55, 0x97, 0xa1, 0x7c, 0x30, + 0xd1, 0x77, 0x37, 0x87, 0x81, 0x67, 0xcf, 0xc0, 0xc7, 0x5c, 0x80, 0x51, + 0xa1, 0xcd, 0x9a, 0x75, 0x58, 0x3d, 0x3e, 0x6c, 0xc7, 0x77, 0xb1, 0x5f, + 0x3f, 0x6a, 0x6b, 0xc4, 0xbb, 0xc8, 0x91, 0xc2, 0xae, 0x2f, 0x55, 0xa3, + 0x53, 0x89, 0x3f, 0x83, 0x92, 0xb5, 0x91, 0xc4, 0xb3, 0x83, 0x44, 0x9d, + 0x62, 0x6f, 0x91, 0x3c, 0x66, 0x7d, 0x89, 0x4d, 0x97, 0x6f, 0xac, 0x3b, + 0x63, 0xa9, 0x30, 0xad, 0xcc, 0x9e, 0xaa, 0x47, 0xd9, 0xc8, 0x62, 0x70, + 0x93, 0x81, 0xcc, 0x9d, 0x7f, 0xa6, 0x97, 0xce, 0x9d, 0x4d, 0x71, 0x57, + 0x1b, 0x78, 0xdc, 0x47, 0x59, 0x72, 0x7e, 0x5e, 0x9e, 0x82, 0x61, 0x3c, + 0x43, 0x51, 0x28, 0x85, 0x68, 0x67, 0xd5, 0x42, 0x4c, 0xac, 0xb1, 0x7f, + 0x5c, 0xc2, 0xa0, 0x55, 0x7d, 0x87, 0x43, 0x7f, 0x2f, 0xcd, 0x8e, 0xae, + 0x92, 0x8e, 0xbc, 0x66, 0x53, 0x98, 0x8e, 0x4b, 0xb5, 0x6d, 0xb6, 0x55, + 0x40, 0xbf, 0x93, 0xb1, 0x5e, 0xb1, 0x75, 0xa2, 0x89, 0x93, 0x63, 0x7f, + 0xe0, 0x73, 0x7b, 0xcb, 0x5a, 0x2d, 0x86, 0x95, 0x86, 0x94, 0x40, 0xb6, + 0x6e, 0x5a, 0xb3, 0x9f, 0xb3, 0x7b, 0xc4, 0xa7, 0x3d, 0x4a, 0x75, 0x90, + 0xe0, 0x49, 0x51, 0x8f, 0xb9, 0xaf, 0x4f, 0x6d, 0xc3, 0x9a, 0xb7, 0xba, + 0x5c, 0xca, 0xc8, 0xb6, 0xb8, 0x9f, 0x41, 0x83, 0xc0, 0xc4, 0x58, 0x3a, + 0x3d, 0x39, 0x3c, 0x65, 0x55, 0x6b, 0x52, 0xbd, 0xb7, 0xc1, 0xae, 0x53, + 0x83, 0xc2, 0xb8, 0x53, 0xb7, 0x66, 0xa5, 0x3f, 0x98, 0x30, 0x8e, 0x84, + 0x52, 0xaa, 0x52, 0x80, 0x75, 0x7a, 0x75, 0x8d, 0xc6, 0x72, 0x88, 0x9f, + 0x54, 0xc2, 0x39, 0x54, 0x92, 0x76, 0x91, 0x8e, 0x3b, 0xb5, 0x3b, 0x4f, + 
0x80, 0x7e, 0x3a, 0xb1, 0x9f, 0x79, 0xae, 0x90, 0xa5, 0x64, 0x32, 0x6c, + 0x8c, 0xa0, 0x48, 0x45, 0x71, 0xca, 0x8a, 0x89, 0x5f, 0x34, 0x5a, 0xb9, + 0x2d, 0x63, 0xca, 0xb5, 0x99, 0x43, 0x79, 0x75, 0x84, 0xc4, 0x6e, 0x60, + 0x52, 0x79, 0xb1, 0x66, 0xae, 0x70, 0x83, 0x7f, 0xb5, 0xb0, 0x78, 0x52, + 0x3a, 0x54, 0x66, 0x71, 0x7b, 0x5e, 0x4d, 0x7c, 0x7b, 0xbd, 0x98, 0xb2, + 0xd2, 0x58, 0xa1, 0xb9, 0x4a, 0x9e, 0x75, 0xa3, 0xd5, 0x69, 0xb4, 0x87, + 0x9d, 0x7c, 0xda, 0x8e, 0xb4, 0x6f, 0x2f, 0x9a, 0x95, 0xcf, 0x5c, 0x5c, + 0x2a, 0x48, 0x4b, 0x8a, 0x45, 0x61, 0xc0, 0xcb, 0x92, 0x38, 0x6a, 0x87, + 0x7b, 0xc8, 0x9f, 0x89, 0x3f, 0x83, 0xc0, 0x9d, 0x82, 0x65, 0xd0, 0xc2, + 0xca, 0x70, 0x85, 0xc9, 0x90, 0xc9, 0x9f, 0x48, 0x4a, 0x2d, 0x99, 0x53, + 0xc3, 0x9d, 0x6c, 0xbb, 0x7b, 0x88, 0xb1, 0xac, 0xaa, 0x42, 0x83, 0x99, + 0xd2, 0xb5, 0x81, 0xc7, 0xcf, 0x61, 0xa4, 0x74, 0xd2, 0xcf, 0xa3, 0x81, + 0x90, 0xba, 0xc2, 0xba, 0xa2, 0xd5, 0x61, 0x9d, 0x80, 0xad, 0xb5, 0xb1, + 0xd9, 0x41, 0x81, 0x3e, 0xd8, 0x74, 0xa0, 0x2b, 0x72, 0xaf, 0xa5, 0x70, + 0x56, 0x87, 0xad, 0x87, 0xa4, 0xc1, 0x34, 0x2d, 0x4a, 0xa8, 0x92, 0x5e, + 0x7d, 0x69, 0x66, 0x7e, 0xd0, 0x50, 0x9c, 0x4e, 0x75, 0x91, 0xa5, 0x64, + 0xb0, 0x3b, 0xb4, 0xae, 0x59, 0xc3, 0xad, 0x63, 0x5c, 0x36, 0xc3, 0x85, + 0x3f, 0x82, 0x51, 0x4d, 0x54, 0xd2, 0xa1, 0x49, 0x42, 0x40, 0x75, 0xac, + 0x6e, 0x4d, 0x66, 0xb7, 0x9a, 0x8c, 0x94, 0xaf, 0x8c, 0x6b, 0x4c, 0xae, + 0x3b, 0x47, 0xcd, 0xba, 0xaa, 0x8e, 0xc9, 0xcf, 0x56, 0x35, 0xbe, 0x7f, + 0x5b, 0x59, 0x42, 0x7d, 0x7e, 0x88, 0x94, 0x39, 0x36, 0x8e, 0xc4, 0xbc, + 0x3c, 0x77, 0x50, 0x81, 0x68, 0xd3, 0x47, 0x7c, 0x48, 0x35, 0xb0, 0xa5, + 0x90, 0x52, 0xbe, 0xae, 0xc0, 0x3b, 0xce, 0x9d, 0xb1, 0xbe, 0x6f, 0xa9, + 0xc1, 0x8f, 0x79, 0x51, 0xb3, 0xa0, 0x33, 0x90, 0x8e, 0xc5, 0x41, 0xb2, + 0x66, 0x54, 0x9b, 0xc5, 0xb6, 0x70, 0x4a, 0x6c, 0xb5, 0x8a, 0x95, 0x76, + 0x72, 0x74, 0x76, 0x85, 0xc7, 0x94, 0x85, 0xb3, 0xb1, 0xc2, 0xa8, 0x81, + 0x45, 0xb6, 0x84, 0x62, 0x6b, 0xc9, 0x35, 0x62, 0x5d, 0x82, 0x76, 0x94, + 0xca, 0x49, 0xb4, 0xc4, 0x94, 0xcd, 0x63, 0x8f, 0x4e, 0x65, 0x32, 0x44, + 0x3a, 0x60, 0x4f, 0xa0, 0x83, 0x62, 0x3c, 0xce, 0xa0, 0x91, 0x9c, 0x60, + 0xab, 0x72, 0x78, 0x65, 0x46, 0x8e, 0x95, 0xca, 0x4b, 0xbe, 0x51, 0x6d, + 0x48, 0x54, 0x38, 0x59, 0x5d, 0x8f, 0xc6, 0x64, 0x3f, 0xce, 0xbb, 0xba, + 0xb0, 0xa8, 0x96, 0x33, 0x97, 0x73, 0x8a, 0xae, 0x3a, 0x96, 0xa3, 0x3b, + 0xaf, 0x62, 0x65, 0x64, 0x69, 0x36, 0x85, 0x58, 0x74, 0x76, 0x94, 0x31, + 0x82, 0xd4, 0xa5, 0x7a, 0x7d, 0x87, 0x56, 0x79, 0xa0, 0xb1, 0x6c, 0x3d, + 0x91, 0x66, 0x47, 0xcd, 0x31, 0x7f, 0x2f, 0xa4, 0xad, 0x7a, 0x43, 0xa8, + 0x38, 0x47, 0x4e, 0xa9, 0x84, 0x84, 0x87, 0x99, 0xc2, 0x97, 0xac, 0x64, + 0x4a, 0x71, 0x79, 0xcc, 0x54, 0x50, 0x40, 0xcf, 0x7b, 0xc2, 0x3f, 0xc7, + 0x8c, 0xc0, 0x3f, 0x2f, 0xad, 0x46, 0x9e, 0xb7, 0xc4, 0x9f, 0x99, 0x97, + 0xc1, 0x74, 0x5a, 0xb8, 0xd2, 0xbe, 0x73, 0x91, 0x9b, 0x6d, 0x5e, 0x3d, + 0x79, 0x9d, 0x65, 0x67, 0x47, 0x51, 0xa5, 0x60, 0x7a, 0x80, 0x67, 0x31, + 0xbc, 0xa5, 0xb4, 0xb7, 0xc1, 0x6d, 0x3d, 0xcf, 0x80, 0x9a, 0x75, 0xac, + 0x6e, 0x6e, 0x73, 0xc1, 0xd0, 0x71, 0x62, 0x68, 0x43, 0xc4, 0x43, 0xcf, + 0x83, 0x50, 0x8a, 0x54, 0x69, 0x8a, 0x7b, 0xc9, 0xb2, 0x87, 0x92, 0x62, + 0x8e, 0x35, 0x8b, 0x4a, 0xca, 0xaa, 0xa2, 0x3e, 0xaf, 0x32, 0xc9, 0x6d, + 0x40, 0x56, 0x39, 0xb5, 0x87, 0x42, 0x64, 0x91, 0x80, 0x88, 0x51, 0x9b, + 0x9c, 0x4e, 0xa4, 0x80, 0x74, 0xa7, 0x38, 0x62, 0xa8, 0xba, 0xa5, 0x67, + 0x34, 0x6e, 0xaa, 0xa3, 0x79, 0x97, 0x3b, 0x9c, 0x48, 0xbe, 0x90, 0xb8, + 0xbc, 0xab, 0x33, 0x2c, 0x6e, 0x81, 0x34, 0xae, 0x74, 0x89, 0x44, 0x7d, + 
0xb1, 0x48, 0x50, 0x36, 0xa5, 0x3c, 0x77, 0x7c, 0x41, 0x3d, 0x4b, 0x63, + 0x7c, 0x4b, 0x38, 0x49, 0x93, 0x82, 0xa5, 0x80, 0x7d, 0x7b, 0x6d, 0x4c, + 0x7d, 0xa7, 0xbf, 0x8e, 0x37, 0x7e, 0x7a, 0x4c, 0xac, 0x9f, 0x5a, 0xb4, + 0x5c, 0x89, 0x5d, 0x4d, 0xc3, 0xce, 0xc5, 0x7d, 0xc9, 0xb7, 0x52, 0xb6, + 0xa1, 0x74, 0x41, 0x92, 0x7b, 0x7f, 0x46, 0x47, 0xa1, 0x9d, 0x74, 0x9a, + 0x82, 0x4a, 0x7f, 0x4c, 0x7e, 0xcd, 0xa5, 0x7d, 0x9a, 0x5c, 0xa7, 0xcb, + 0xbb, 0x36, 0x58, 0x4c, 0x47, 0x37, 0x54, 0x64, 0x3d, 0x9e, 0xc0, 0x73, + 0x5a, 0x91, 0xca, 0x47, 0x7c, 0xc6, 0xba, 0xbc, 0x72, 0x33, 0x7a, 0xa4, + 0x30, 0xbf, 0xc0, 0x3c, 0x6d, 0x96, 0x42, 0xc3, 0xab, 0x3d, 0x99, 0x50, + 0x71, 0xce, 0x33, 0x9a, 0xa7, 0x70, 0x3c, 0xb2, 0xa0, 0xa0, 0xd0, 0x8f, + 0x88, 0x94, 0x3b, 0xaa, 0xc4, 0x80, 0xc5, 0xbe, 0x30, 0x93, 0x9b, 0x89, + 0x65, 0x7d, 0x41, 0x58, 0x3a, 0x88, 0x76, 0x91, 0x8e, 0x67, 0x3b, 0x4a, + 0x33, 0xa8, 0x2f, 0x80, 0xa5, 0xb2, 0xcb, 0x68, 0xaf, 0x5f, 0x7e, 0x4b, + 0xcf, 0x3f, 0xcf, 0x82, 0x5e, 0x61, 0x90, 0x32, 0x8c, 0x5f, 0x85, 0x7e, + 0xb2, 0xcd, 0x7c, 0xa2, 0x39, 0x74, 0x77, 0xa4, 0x34, 0xa7, 0x6e, 0x45, + 0xc0, 0xbe, 0x90, 0xc0, 0x52, 0x3d, 0x54, 0x93, 0x63, 0x5a, 0x67, 0x6a, + 0x5b, 0xbd, 0xaa, 0x8f, 0x9f, 0xab, 0xb7, 0xb1, 0x54, 0x30, 0xbe, 0x32, + 0x64, 0xb0, 0x30, 0x5e, 0x92, 0x9a, 0x7a, 0x7d, 0xc0, 0x69, 0x4e, 0x96, + 0x45, 0x5f, 0xbd, 0x8e, 0xca, 0xc1, 0x7c, 0x4c, 0xba, 0x63, 0x57, 0x72, + 0xb9, 0x5b, 0xb5, 0x47, 0xa4, 0xa3, 0xa4, 0xbb, 0x77, 0x32, 0x6f, 0xa1, + 0xbf, 0xcd, 0xc9, 0xcb, 0x4d, 0x82, 0xbe, 0xd1, 0x93, 0xca, 0xa5, 0xc1, + 0x42, 0x32, 0x64, 0xa7, 0xa4, 0x60, 0x46, 0xc2, 0x62, 0xc0, 0x82, 0x4c, + 0x48, 0x6d, 0x95, 0x39, 0x49, 0x5a, 0x65, 0xac, 0x65, 0x63, 0xcf, 0x35, + 0x86, 0xa9, 0x6c, 0xb6, 0x92, 0x6e, 0x49, 0xa1, 0x68, 0x87, 0xa0, 0x33, + 0xa3, 0x90, 0x3f, 0x38, 0x6d, 0x9d, 0x60, 0x8f, 0x54, 0xa6, 0x4e, 0x36, + 0x79, 0x3e, 0x79, 0x5b, 0xcf, 0xb3, 0x9c, 0xb1, 0x4e, 0x67, 0xaa, 0x89, + 0xb1, 0x46, 0x37, 0x6b, 0x93, 0x7a, 0x9b, 0x95, 0x7f, 0x33, 0x90, 0x92, + 0xc9, 0xc4, 0x5d, 0x4c, 0x83, 0xab, 0x77, 0x5a, 0x56, 0xc1, 0x9d, 0x90, + 0x9b, 0xc1, 0x42, 0x42, 0x6e, 0x8e, 0x8f, 0xb0, 0xa8, 0x9f, 0x49, 0x82, + 0x67, 0x96, 0x87, 0x80, 0xad, 0x5e, 0xc9, 0x8e, 0x49, 0x43, 0xa7, 0xa3, + 0x5a, 0x45, 0x83, 0xb1, 0x52, 0x90, 0x9b, 0x68, 0x41, 0xce, 0xa3, 0x45, + 0x63, 0x73, 0x51, 0x89, 0x71, 0xc5, 0x84, 0xc9, 0x64, 0x8b, 0x99, 0x3d, + 0x94, 0x94, 0x64, 0xcd, 0x6e, 0xbb, 0xad, 0xc8, 0x4f, 0x39, 0x9e, 0x6f, + 0xa4, 0x3a, 0xbf, 0x45, 0xa6, 0x44, 0x99, 0x5c, 0x3b, 0x76, 0x49, 0x39, + 0x73, 0x45, 0x68, 0x82, 0x71, 0x3c, 0xa5, 0xc1, 0x48, 0x74, 0xc4, 0xcb, + 0x52, 0x6c, 0xce, 0x5c, 0xa9, 0xa5, 0xa9, 0x87, 0x94, 0xbd, 0xab, 0x6f, + 0x8c, 0xae, 0x50, 0x9f, 0x7e, 0xca, 0x9c, 0xab, 0x3d, 0x5e, 0xcb, 0xc9, + 0x42, 0x64, 0x34, 0x39, 0xb1, 0x6b, 0x8c, 0x6c, 0x4c, 0x85, 0x86, 0xbb, + 0x6a, 0x82, 0x61, 0x9b, 0x9f, 0x37, 0x4c, 0x49, 0x73, 0x6a, 0xc5, 0x63, + 0x75, 0xa6, 0xbf, 0x82, 0x59, 0x72, 0x8f, 0x5c, 0x33, 0x81, 0xb9, 0x87, + 0xa1, 0x9e, 0x68, 0x36, 0x79, 0x6f, 0x43, 0xa5, 0xb2, 0x77, 0x40, 0x49, + 0x3b, 0x82, 0x30, 0x60, 0xae, 0xcf, 0x41, 0x6b, 0xab, 0x9a, 0xd0, 0x6a, + 0x97, 0xb8, 0xc2, 0xcf, 0xd2, 0x59, 0xb3, 0xb8, 0x67, 0xbf, 0x5f, 0xaa, + 0x6f, 0x39, 0xcb, 0x56, 0xa7, 0x94, 0xba, 0x58, 0x78, 0x6e, 0xaa, 0x72, + 0x73, 0xc4, 0x6c, 0x7e, 0x80, 0x9c, 0x70, 0x5c, 0x7f, 0x6a, 0xc4, 0xb9, + 0xa1, 0x54, 0x8b, 0x98, 0x55, 0x49, 0x40, 0x5d, 0x4f, 0x77, 0x93, 0xd2, + 0x4a, 0xa7, 0xba, 0xc2, 0x3f, 0x64, 0xac, 0x45, 0x54, 0x33, 0x68, 0x96, + 0xa6, 0x4e, 0x9a, 0xc2, 0x32, 0x58, 0x5d, 0x84, 0xbc, 0x56, 0xb1, 0xab, + 
0x8f, 0x3c, 0x98, 0xd3, 0x6c, 0x59, 0x39, 0xc3, 0x71, 0x7d, 0x72, 0x69, + 0xc6, 0x5b, 0xbc, 0x42, 0x7b, 0xa4, 0x3f, 0x8e, 0x34, 0xbb, 0xb6, 0x9a, + 0x93, 0x7a, 0xcc, 0x59, 0x94, 0x50, 0xcc, 0x75, 0x3c, 0x60, 0x84, 0x43, + 0x87, 0xa3, 0x55, 0x3e, 0xbe, 0x39, 0x59, 0x64, 0xc6, 0x7d, 0x30, 0x77, + 0xc8, 0x8b, 0x8a, 0x6a, 0x8d, 0x5b, 0x8f, 0x3f, 0x3e, 0xbd, 0xb0, 0x4b, + 0xb6, 0xc6, 0x8c, 0x46, 0xb3, 0x67, 0x77, 0x7d, 0x76, 0x54, 0x86, 0x93, + 0x74, 0xb1, 0x7e, 0xd0, 0x3e, 0x35, 0x68, 0x64, 0x8f, 0x39, 0x77, 0x53, + 0x65, 0xce, 0xcf, 0x34, 0x59, 0x74, 0x74, 0x65, 0x71, 0x7e, 0xc4, 0x7e, + 0x60, 0xbb, 0xbf, 0x78, 0x85, 0x3a, 0x78, 0x8b, 0xcc, 0xb3, 0xa6, 0x6e, + 0x6c, 0x3a, 0x98, 0x84, 0x4c, 0xae, 0xcc, 0x77, 0x61, 0x6f, 0x47, 0xab, + 0x50, 0x7a, 0x77, 0x80, 0x6f, 0xc6, 0xba, 0x6d, 0x9a, 0x81, 0xa1, 0x93, + 0x54, 0x74, 0xcc, 0x72, 0x7f, 0x66, 0xa4, 0x92, 0x3a, 0xcf, 0x86, 0x70, + 0x32, 0x67, 0x95, 0x72, 0xa6, 0x4c, 0x3d, 0xb7, 0x81, 0xcf, 0x8f, 0x65, + 0x93, 0xcd, 0xbe, 0xb9, 0xc1, 0x3b, 0xc8, 0x35, 0x5b, 0x85, 0xab, 0x3b, + 0xaf, 0x5a, 0xaa, 0xb2, 0x41, 0x99, 0xcc, 0x47, 0x47, 0x8e, 0xc5, 0x7e, + 0x40, 0x65, 0xaa, 0x45, 0xab, 0xb9, 0x69, 0x58, 0x3b, 0x77, 0x83, 0x60, + 0x6e, 0xaa, 0xa8, 0x72, 0xa8, 0x61, 0x88, 0x89, 0x34, 0x39, 0x6c, 0x7a, + 0x52, 0xa8, 0xcb, 0xad, 0x77, 0xb8, 0x6b, 0x98, 0x3f, 0x34, 0x32, 0x2f, + 0x56, 0x8e, 0x74, 0xa8, 0x30, 0x45, 0x8e, 0x91, 0xc6, 0x54, 0x9f, 0x73, + 0xc9, 0xad, 0x7b, 0xc5, 0xc8, 0x47, 0xd3, 0x98, 0x72, 0x59, 0xc0, 0x70, + 0x64, 0x32, 0xc7, 0xa8, 0x86, 0x42, 0x8d, 0xb1, 0x6a, 0x37, 0xb7, 0x6b, + 0x4e, 0xcb, 0xb7, 0x9e, 0xb4, 0x64, 0x98, 0xae, 0x75, 0x56, 0x44, 0x46, + 0x5c, 0x6d, 0x5e, 0x5b, 0x73, 0x37, 0x4a, 0xc0, 0x7d, 0x38, 0xbb, 0x96, + 0xbf, 0xc9, 0x85, 0x8a, 0x73, 0x7e, 0xc2, 0x51, 0xc1, 0x71, 0x6a, 0x6d, + 0x8d, 0xce, 0xa1, 0x6c, 0x6e, 0x9e, 0x84, 0xa2, 0x70, 0x85, 0x88, 0x74, + 0x66, 0x5f, 0x3d, 0x8c, 0xbc, 0xbb, 0xd0, 0x92, 0x9c, 0x87, 0x88, 0x81, + 0xc3, 0x3f, 0x67, 0x3f, 0x50, 0xb9, 0x5e, 0x9f, 0x81, 0xae, 0x9d, 0x9e, + 0x5b, 0x4d, 0x50, 0x41, 0x42, 0x8c, 0x38, 0x46, 0xb4, 0x57, 0x95, 0x88, + 0x55, 0x59, 0x32, 0xb6, 0x57, 0x49, 0xbc, 0x7e, 0x38, 0x6a, 0x6a, 0xb9, + 0xba, 0x54, 0xa0, 0xb3, 0xb2, 0x5b, 0xc9, 0xa0, 0x56, 0x69, 0xb2, 0xc5, + 0x9c, 0x68, 0x75, 0x4b, 0xa9, 0x72, 0xc6, 0x5a, 0x42, 0x95, 0x3b, 0x83, + 0xbc, 0x99, 0x46, 0x47, 0x4d, 0x99, 0x62, 0x5f, 0x68, 0x46, 0x53, 0x91, + 0x3f, 0x78, 0xb5, 0x8f, 0x8b, 0x3e, 0x72, 0xd0, 0xc6, 0x8e, 0x76, 0xaa, + 0xb5, 0x70, 0x7d, 0xc7, 0x7e, 0x49, 0x7a, 0x6b, 0x92, 0xc5, 0x7b, 0xc8, + 0x8a, 0x6f, 0x35, 0x50, 0xc2, 0x4a, 0x43, 0xd0, 0xc1, 0x83, 0x3b, 0x4b, + 0x9c, 0xa0, 0x69, 0x53, 0x64, 0x92, 0x95, 0x66, 0x97, 0x8b, 0xbd, 0xcf, + 0x4c, 0x66, 0xb7, 0x39, 0x81, 0xbd, 0x90, 0xad, 0x31, 0x5f, 0x52, 0x4d, + 0x30, 0x7f, 0x9e, 0xaa, 0x84, 0xb9, 0xb1, 0x47, 0x69, 0xba, 0xce, 0xb4, + 0x92, 0xc0, 0x75, 0x59, 0x70, 0x6b, 0xb9, 0x89, 0x84, 0x7b, 0xad, 0xc6, + 0x8a, 0xb9, 0x43, 0x3a, 0x9b, 0x91, 0x99, 0x80, 0x4d, 0x4f, 0x3e, 0x49, + 0x58, 0xaf, 0x84, 0x72, 0xc6, 0x52, 0xbe, 0x61, 0xcf, 0x55, 0x86, 0x9c, + 0x77, 0xbd, 0x68, 0x40, 0x98, 0x5d, 0xba, 0xbe, 0xb9, 0xa5, 0xcd, 0x47, + 0x76, 0x40, 0xb3, 0xbb, 0x9d, 0x3d, 0xc4, 0xcb, 0x50, 0xb0, 0x7b, 0xbc, + 0xa5, 0xa6, 0xb6, 0x7f, 0x84, 0xba, 0xae, 0x6a, 0xc0, 0xa4, 0x4f, 0x65, + 0x88, 0x62, 0xaa, 0x33, 0x9e, 0xb8, 0x96, 0x35, 0xbf, 0x62, 0x75, 0xaf, + 0x44, 0x7b, 0xb7, 0xc9, 0xb9, 0xcb, 0xbe, 0xbd, 0x62, 0x56, 0x64, 0xc9, + 0x59, 0x9b, 0xbc, 0x97, 0xcf, 0x5f, 0x9f, 0x4a, 0x9d, 0x79, 0x47, 0x60, + 0x85, 0x62, 0xcb, 0x30, 0x78, 0x8b, 0x82, 0x79, 0xb5, 0xc0, 0x7b, 0x42, + 
0xb8, 0xac, 0xa1, 0x6f, 0xc8, 0x7f, 0x65, 0x87, 0xc8, 0x89, 0x7f, 0x54, + 0x88, 0xa0, 0x84, 0x33, 0xc0, 0x43, 0xa5, 0xba, 0x80, 0x9e, 0x91, 0xcd, + 0x88, 0xbe, 0x88, 0x32, 0x41, 0x75, 0x81, 0xc4, 0x8d, 0x71, 0x7b, 0x9d, + 0x7c, 0x75, 0xba, 0x3a, 0x65, 0xa7, 0x92, 0x49, 0xb3, 0x37, 0x42, 0x82, + 0x3c, 0x87, 0xc4, 0xb1, 0x65, 0xcf, 0xd3, 0x60, 0x5a, 0x68, 0xc9, 0x4c, + 0x53, 0x72, 0x81, 0x4c, 0x8f, 0xb0, 0x3c, 0x7e, 0x8d, 0x70, 0xbb, 0x81, + 0xc5, 0x93, 0xb6, 0x5c, 0xc9, 0xb8, 0x4e, 0xc4, 0x9b, 0xa6, 0x47, 0xbc, + 0x4f, 0xc2, 0xc1, 0xc2, 0x9b, 0x94, 0x7d, 0xc9, 0xa3, 0x59, 0x5e, 0x3f, + 0xa5, 0x4a, 0xac, 0xaf, 0x59, 0x73, 0xaf, 0xd5, 0x34, 0xac, 0xb0, 0x9a, + 0xd2, 0x74, 0xcc, 0x6b, 0x85, 0x56, 0x89, 0x68, 0x4d, 0x62, 0x5e, 0x7b, + 0x56, 0x57, 0x8f, 0x3b, 0x72, 0x99, 0x75, 0x7c, 0x97, 0x86, 0x62, 0x75, + 0x5e, 0x93, 0x4f, 0xbe, 0xbe, 0x31, 0xcb, 0x60, 0xad, 0x97, 0xcf, 0x5e, + 0xb5, 0x6e, 0x2e, 0xd3, 0x89, 0xbe, 0x77, 0x7c, 0x9f, 0x52, 0x9f, 0x9b, + 0x5e, 0xca, 0xb3, 0x9b, 0xb6, 0x4e, 0x78, 0x31, 0x85, 0x65, 0x3a, 0xa0, + 0x66, 0x3e, 0x4b, 0x5f, 0xaa, 0x8d, 0x48, 0xc1, 0x74, 0xc3, 0x8d, 0x8b, + 0x66, 0x47, 0x63, 0x78, 0x38, 0x50, 0x6f, 0x30, 0xc6, 0x5c, 0x52, 0xa0, + 0x4e, 0xcf, 0xc8, 0x86, 0xa5, 0x95, 0xbd, 0x34, 0x95, 0xc2, 0x70, 0xae, + 0x67, 0xd5, 0x47, 0x6b, 0x5d, 0xca, 0x8c, 0x60, 0xce, 0x9c, 0x34, 0x34, + 0x66, 0x4c, 0x82, 0x39, 0x63, 0x90, 0x78, 0x62, 0x86, 0x5f, 0x57, 0x67, + 0x4f, 0x62, 0x44, 0x82, 0x2c, 0xcc, 0x40, 0x7b, 0x99, 0xb9, 0x55, 0xb9, + 0xc4, 0x43, 0xc2, 0x5b, 0x54, 0x9e, 0x4c, 0xbe, 0xb7, 0x43, 0x6c, 0xcc, + 0x40, 0x72, 0xe6, 0x9b, 0x48, 0x9d, 0xb3, 0x3a, 0x88, 0x92, 0x70, 0x5b, + 0x5a, 0x48, 0xc1, 0x48, 0x55, 0x55, 0xb1, 0x50, 0x61, 0xa3, 0x69, 0x6e, + 0x69, 0xbb, 0x61, 0xd4, 0x67, 0x51, 0x47, 0x3a, 0x35, 0xc6, 0xbb, 0x72, + 0x6f, 0xc8, 0x84, 0x5d, 0x91, 0xbc, 0x46, 0x56, 0x4a, 0xa5, 0x70, 0x3c, + 0x7d, 0x61, 0x2c, 0x82, 0x63, 0x94, 0xac, 0x41, 0x3c, 0x64, 0x4e, 0x3c, + 0x65, 0x64, 0xc9, 0x85, 0x6f, 0x89, 0x57, 0xd7, 0x49, 0x40, 0x7c, 0x7a, + 0xa4, 0x7b, 0x54, 0xc6, 0x82, 0x36, 0xab, 0xb2, 0x3d, 0x60, 0x5a, 0xc5, + 0x4e, 0x8a, 0x57, 0x54, 0xb7, 0x4d, 0x55, 0x91, 0x65, 0xa3, 0x72, 0x62, + 0xc6, 0x8a, 0x44, 0x7e, 0x94, 0x9f, 0x55, 0xc2, 0x48, 0x89, 0xa4, 0x95, + 0xb0, 0x5c, 0xa4, 0x4e, 0xae, 0x67, 0x93, 0x9e, 0x57, 0x95, 0xb8, 0x70, + 0xc9, 0xaa, 0xa7, 0x8e, 0xd1, 0x41, 0x85, 0xb9, 0x95, 0xb7, 0x31, 0xaf, + 0x50, 0x95, 0x9a, 0xcf, 0x62, 0x88, 0xbb, 0x9f, 0x51, 0xab, 0xc0, 0xa8, + 0x8a, 0x4d, 0xb8, 0xbb, 0x97, 0x5f, 0x67, 0x38, 0xc7, 0xc9, 0xc4, 0x76, + 0x33, 0x2d, 0x40, 0xae, 0x97, 0xd3, 0x9f, 0xc5, 0x8e, 0x7c, 0x4f, 0x5a, + 0x66, 0x63, 0x64, 0x95, 0x85, 0x5c, 0x7d, 0xbc, 0x6b, 0x9e, 0x63, 0x59, + 0x63, 0x36, 0x79, 0x6e, 0x37, 0x73, 0xa9, 0x50, 0xb3, 0x3c, 0x5c, 0x78, + 0x73, 0xaf, 0x48, 0xc5, 0x5f, 0xc9, 0x53, 0x97, 0x30, 0xae, 0x5e, 0x40, + 0x3f, 0x90, 0x54, 0x5f, 0xac, 0x8f, 0x59, 0x38, 0x7d, 0x36, 0xa6, 0x2c, + 0xc5, 0x64, 0x50, 0x75, 0xbf, 0x48, 0x30, 0xcf, 0xa2, 0x93, 0xa3, 0x6d, + 0x9f, 0x96, 0xaa, 0x6f, 0xcd, 0x86, 0x92, 0x8d, 0xb7, 0xbb, 0x56, 0xa2, + 0x50, 0x50, 0xaf, 0xd6, 0x86, 0x45, 0x90, 0xd4, 0xb1, 0x6a, 0x6f, 0x84, + 0x9f, 0x87, 0x8e, 0xbf, 0xb2, 0xc9, 0xa8, 0x7b, 0xd4, 0x41, 0x56, 0x93, + 0x5e, 0x84, 0x94, 0x68, 0xbc, 0x8c, 0x93, 0xbd, 0xa5, 0xa8, 0xb7, 0x67, + 0xb5, 0x48, 0x4a, 0xb3, 0xb8, 0x38, 0xc5, 0xa4, 0xcf, 0x38, 0xb3, 0x42, + 0xa6, 0xbd, 0x5d, 0x3d, 0xa2, 0xa5, 0x56, 0x73, 0x60, 0x58, 0xa9, 0x7e, + 0xbc, 0x4c, 0xb3, 0x72, 0x35, 0x5e, 0x96, 0x5a, 0x4d, 0xa8, 0xc1, 0xbe, + 0xc3, 0x35, 0xb0, 0xc4, 0x52, 0x7c, 0xb1, 0x6f, 0x65, 0x93, 0x44, 0xc4, + 
0xc7, 0x3c, 0x89, 0xd2, 0x8a, 0x81, 0x4c, 0xc1, 0xd4, 0x4a, 0xcd, 0x97, + 0x5f, 0x3f, 0x91, 0x5f, 0x39, 0xcd, 0x70, 0x5c, 0x57, 0x8f, 0xa0, 0x9f, + 0x9d, 0x53, 0x3a, 0x50, 0x37, 0x6f, 0x8f, 0x97, 0xb9, 0x5c, 0x49, 0xa7, + 0xcf, 0x35, 0x97, 0xaf, 0x3b, 0x70, 0x95, 0x8e, 0x4f, 0x39, 0x45, 0x6c, + 0x5b, 0x71, 0xc9, 0x59, 0xce, 0x46, 0xaa, 0xc2, 0xa6, 0xaf, 0xa1, 0xc7, + 0x40, 0xb3, 0xce, 0x4b, 0x53, 0x6f, 0xac, 0x3d, 0xc4, 0xbb, 0x4a, 0x9e, + 0x3d, 0xc9, 0x7c, 0xc4, 0xa7, 0x6c, 0x3d, 0x87, 0x3e, 0x92, 0xb4, 0xcb, + 0x95, 0x42, 0x90, 0x75, 0x52, 0xde, 0xb8, 0x4f, 0x99, 0x85, 0xa8, 0x87, + 0x89, 0x63, 0x79, 0xd3, 0x7f, 0x95, 0xc6, 0xaa, 0xe1, 0xdb, 0x7a, 0x65, + 0x34, 0x50, 0x42, 0xc0, 0x95, 0x66, 0x91, 0x4a, 0x50, 0x7d, 0x9a, 0x52, + 0xaa, 0x86, 0xa3, 0x2d, 0x2a, 0x5d, 0x5a, 0xa8, 0x4d, 0x8d, 0x4e, 0x76, + 0x7c, 0x73, 0xd5, 0xbf, 0x3c, 0xb4, 0x4d, 0x8c, 0xb9, 0x69, 0xb1, 0x46, + 0x60, 0x7e, 0x68, 0x7b, 0x59, 0xcb, 0x60, 0x7a, 0x3a, 0xaf, 0xb9, 0x7f, + 0xc9, 0x85, 0x3d, 0x79, 0x97, 0xd0, 0x64, 0x93, 0x56, 0xc8, 0x8d, 0xbf, + 0x5d, 0x77, 0x93, 0x53, 0x48, 0xd1, 0x8d, 0xc1, 0x40, 0x30, 0x33, 0xa4, + 0x58, 0xd1, 0x56, 0x44, 0x8b, 0xd4, 0x68, 0xaa, 0x9e, 0x95, 0xa1, 0x8f, + 0x80, 0xcc, 0xcb, 0x74, 0xbc, 0xb0, 0x74, 0xa8, 0x87, 0x3d, 0x9b, 0x68, + 0x45, 0x66, 0x89, 0x9e, 0x57, 0x94, 0x2e, 0x40, 0x3f, 0x7b, 0x43, 0x3f, + 0x78, 0x8c, 0xbc, 0x74, 0x7a, 0x99, 0x54, 0x49, 0x3a, 0xcd, 0x36, 0xc5, + 0x68, 0x8e, 0x43, 0x61, 0xae, 0x44, 0xce, 0xad, 0xb7, 0x42, 0xc9, 0x68, + 0xad, 0x3b, 0x7d, 0x5a, 0x91, 0xbd, 0xb5, 0xcc, 0x63, 0x64, 0xa1, 0xb8, + 0x8a, 0x66, 0xa8, 0x9d, 0x6e, 0x99, 0xbd, 0x6e, 0x3c, 0x5a, 0x61, 0x74, + 0x41, 0xd0, 0xd6, 0x55, 0xdc, 0x7e, 0x27, 0x93, 0x6e, 0xc4, 0x6f, 0x65, + 0x7c, 0x41, 0x47, 0x3e, 0xcc, 0xd4, 0x3e, 0x38, 0x85, 0x38, 0xa6, 0xe1, + 0x49, 0x7d, 0xd2, 0x46, 0x74, 0x5d, 0x85, 0x36, 0x90, 0x33, 0x5f, 0xa0, + 0x58, 0x35, 0xb7, 0x9b, 0x7b, 0xbc, 0xa8, 0xb0, 0xa5, 0x54, 0x62, 0x4f, + 0xa1, 0x82, 0x99, 0x52, 0xbd, 0xa5, 0x6e, 0x5c, 0xba, 0x5b, 0x78, 0x3f, + 0xa6, 0x6d, 0x82, 0x9b, 0x45, 0x4d, 0x84, 0xaf, 0xa5, 0x6f, 0xda, 0x46, + 0xbe, 0xb6, 0x7e, 0x9c, 0xb5, 0x3b, 0x64, 0x9e, 0xcd, 0xa2, 0x81, 0x5f, + 0x56, 0x40, 0x7c, 0x3c, 0xb7, 0xd8, 0x55, 0x47, 0x6a, 0xb0, 0x3c, 0x4e, + 0x4c, 0x77, 0xac, 0x57, 0xc5, 0xda, 0x3a, 0xbb, 0x4a, 0x38, 0x4d, 0x4b, + 0x3b, 0x77, 0x87, 0x99, 0x9a, 0xd2, 0xad, 0x3e, 0x9b, 0xb1, 0x71, 0x52, + 0x87, 0x3b, 0x6f, 0x41, 0xd2, 0x69, 0xc5, 0xb0, 0x43, 0xaa, 0x95, 0x9c, + 0x3d, 0xaa, 0xca, 0x59, 0xb9, 0x79, 0x86, 0xc8, 0x3d, 0x8a, 0x77, 0x7a, + 0x85, 0x99, 0x97, 0x4b, 0x72, 0x8e, 0x73, 0x29, 0x54, 0xc4, 0xb4, 0xa5, + 0x55, 0xa6, 0x40, 0xa3, 0x2e, 0x91, 0x85, 0x82, 0x9f, 0x42, 0x90, 0x3e, + 0x91, 0x4a, 0x36, 0xb7, 0x9a, 0x7e, 0xa4, 0x71, 0x38, 0xc6, 0x50, 0x28, + 0xae, 0x59, 0xca, 0x6a, 0xd9, 0xd4, 0x62, 0x89, 0xb7, 0x58, 0xbb, 0x78, + 0xbd, 0x4d, 0xae, 0x9a, 0x36, 0x97, 0x70, 0x58, 0x71, 0xae, 0x4a, 0x7e, + 0x58, 0x8d, 0x7b, 0x26, 0x44, 0xbf, 0xa7, 0xa3, 0xa3, 0x3c, 0x98, 0xaf, + 0xa9, 0x33, 0xa3, 0x5d, 0xb6, 0xae, 0x64, 0xbd, 0xcd, 0x97, 0x9e, 0x99, + 0x98, 0x72, 0x36, 0x5b, 0xcd, 0xa6, 0x25, 0x4b, 0x7a, 0xd1, 0x3e, 0x84, + 0xc7, 0xbf, 0x6d, 0x8d, 0x45, 0x87, 0xdd, 0x93, 0x42, 0x78, 0x70, 0x4c, + 0x42, 0x33, 0xc3, 0x9d, 0x6c, 0xaa, 0x5a, 0x91, 0xc5, 0x60, 0xac, 0xa9, + 0x4e, 0x6d, 0x61, 0x79, 0x99, 0xc9, 0x52, 0x9d, 0x5c, 0x36, 0x5a, 0x61, + 0x62, 0x39, 0xb2, 0x9b, 0xb7, 0xb8, 0x44, 0x8d, 0xd1, 0x5e, 0x87, 0x4f, + 0x45, 0x3f, 0xa2, 0x56, 0x34, 0x3c, 0x63, 0x9b, 0xbe, 0x38, 0xc7, 0x48, + 0x98, 0xb2, 0xb7, 0xb2, 0x6e, 0xbf, 0xba, 0x65, 0x8c, 0xb9, 0x3d, 0x47, + 
0x50, 0xa7, 0xbb, 0x86, 0x4c, 0x4f, 0xc9, 0xc0, 0xae, 0x67, 0xa4, 0x4a, + 0x81, 0xcd, 0x85, 0xcc, 0x42, 0xbf, 0xa5, 0xd3, 0x45, 0x37, 0x82, 0x57, + 0x36, 0xb7, 0xb5, 0x83, 0x66, 0xc7, 0xa6, 0x8e, 0x84, 0x81, 0x7b, 0xaf, + 0x45, 0x80, 0xb8, 0x97, 0xbd, 0xc9, 0xd0, 0x55, 0x5f, 0x3e, 0xd5, 0x54, + 0x6b, 0xa0, 0x64, 0x33, 0xcb, 0x70, 0x9a, 0xb8, 0xb7, 0x5c, 0x68, 0xac, + 0xa7, 0xa1, 0x32, 0x3e, 0xb3, 0x33, 0xb5, 0x83, 0x40, 0x43, 0xbd, 0x46, + 0xb2, 0xa8, 0x8f, 0xa6, 0xd7, 0x96, 0xa4, 0x56, 0x4e, 0xbc, 0x3e, 0x8c, + 0x60, 0x9c, 0xb7, 0xbd, 0xaa, 0x57, 0x67, 0x31, 0x8c, 0x5a, 0x6e, 0x8c, + 0xc1, 0x4a, 0x35, 0xba, 0xa6, 0xce, 0xbb, 0x99, 0xa5, 0x54, 0x8b, 0xa4, + 0x97, 0xc3, 0x95, 0x52, 0xa8, 0x6a, 0x44, 0xa9, 0x96, 0xd0, 0x5e, 0x8a, + 0x77, 0x98, 0xc7, 0xaa, 0xc6, 0x97, 0x80, 0x8f, 0xc5, 0x3b, 0x8a, 0xc4, + 0x85, 0x7c, 0x48, 0x4f, 0x67, 0x64, 0xbc, 0x9e, 0x59, 0xe2, 0xae, 0xcb, + 0xc3, 0xc7, 0x73, 0x93, 0x45, 0x55, 0x8f, 0x84, 0x6c, 0xad, 0x30, 0x3e, + 0xa6, 0x41, 0x78, 0xc0, 0x7b, 0xd2, 0xcf, 0x8b, 0x6b, 0xaa, 0x5c, 0x4b, + 0x72, 0xd2, 0x7a, 0x98, 0x80, 0x77, 0xb3, 0xc6, 0x2f, 0x89, 0x6f, 0x41, + 0x69, 0x87, 0x9b, 0x7f, 0xab, 0x3f, 0x30, 0x8c, 0xb9, 0xbc, 0xd2, 0x99, + 0x32, 0x8f, 0x8b, 0x72, 0x95, 0x64, 0x98, 0xd8, 0xd3, 0x93, 0xca, 0x77, + 0x91, 0x72, 0x98, 0x84, 0xbf, 0x6f, 0xbe, 0x62, 0xba, 0x5d, 0xb1, 0x7a, + 0x7d, 0xab, 0x4a, 0x4f, 0xa1, 0xbe, 0xaf, 0x86, 0xd6, 0xd6, 0xcd, 0x6e, + 0x5f, 0x60, 0x43, 0x98, 0x9b, 0x53, 0x33, 0x71, 0x41, 0xaf, 0x55, 0x81, + 0xb2, 0x38, 0x95, 0x65, 0x57, 0x72, 0x42, 0xc8, 0x74, 0x91, 0xcf, 0x3b, + 0xb6, 0x4a, 0x45, 0xce, 0x90, 0x3a, 0x7b, 0x4b, 0x7c, 0x96, 0x5c, 0xa0, + 0x62, 0x98, 0xd5, 0x61, 0x58, 0xb8, 0x40, 0xb5, 0x47, 0x48, 0x5d, 0x6f, + 0xb4, 0xb3, 0xa8, 0x84, 0x5c, 0x60, 0x9e, 0x62, 0x62, 0x61, 0x77, 0x62, + 0x62, 0xa8, 0x5d, 0xc1, 0x40, 0x4c, 0x90, 0x2f, 0xa8, 0x45, 0xa3, 0x58, + 0x4e, 0xa0, 0x9e, 0x97, 0x88, 0xd9, 0xae, 0x4b, 0x34, 0x57, 0x45, 0x98, + 0x4e, 0x34, 0xb5, 0x73, 0x92, 0x5e, 0xb5, 0xcd, 0x60, 0x7e, 0x7b, 0x70, + 0x90, 0xb7, 0x9a, 0xca, 0x66, 0x2d, 0x38, 0x7f, 0xa2, 0x6e, 0x84, 0xa6, + 0x62, 0xde, 0x7a, 0x7b, 0x28, 0x89, 0x82, 0x8a, 0x77, 0x35, 0x35, 0xa8, + 0xe7, 0xab, 0x8d, 0x91, 0x44, 0xe4, 0x3c, 0xb4, 0xb2, 0xcb, 0xb0, 0x92, + 0x42, 0x86, 0xb4, 0x63, 0x62, 0xb9, 0x5b, 0x3f, 0x68, 0x3d, 0xd0, 0xd8, + 0x38, 0xbd, 0xb6, 0xc8, 0x89, 0x82, 0x7f, 0xa6, 0x84, 0xda, 0x4b, 0x4d, + 0x3a, 0x62, 0x9e, 0x22, 0x5a, 0x82, 0x98, 0xa2, 0x6c, 0xa1, 0xa7, 0x8a, + 0xb6, 0xbf, 0xcd, 0x70, 0x45, 0xc1, 0x2c, 0xb6, 0x85, 0xb9, 0x73, 0xac, + 0xb1, 0x95, 0xc3, 0xc1, 0x40, 0xaa, 0x42, 0x68, 0x64, 0x30, 0x9b, 0xc9, + 0x4d, 0x92, 0xc9, 0x72, 0xa4, 0x7c, 0x37, 0x48, 0xc4, 0x62, 0xaa, 0x53, + 0xa0, 0x3c, 0xa3, 0x58, 0xb7, 0x36, 0x4d, 0x42, 0xc6, 0x5e, 0xc4, 0xc2, + 0x2b, 0xaf, 0x7b, 0xd1, 0xca, 0xb3, 0xcc, 0x54, 0x4e, 0x3e, 0x8f, 0x6a, + 0x57, 0x48, 0xa6, 0x7e, 0xaf, 0x8f, 0x36, 0x66, 0x57, 0x6b, 0xda, 0x5a, + 0x88, 0x45, 0x59, 0x3d, 0x7f, 0xce, 0xa2, 0x5d, 0x48, 0xa1, 0x84, 0x84, + 0x46, 0x57, 0x6f, 0x54, 0x57, 0x99, 0x5a, 0x3e, 0x5f, 0xc8, 0x5d, 0x9b, + 0x69, 0xc9, 0xb9, 0x93, 0x6e, 0x50, 0x32, 0xc4, 0x97, 0x5b, 0x89, 0xa2, + 0xb8, 0x39, 0xb9, 0x34, 0x8e, 0xa3, 0x65, 0x4c, 0x3d, 0x68, 0x96, 0x51, + 0x49, 0xa1, 0x50, 0x84, 0xa8, 0xd0, 0x35, 0x6e, 0x3a, 0x54, 0xbe, 0x90, + 0xb8, 0x9d, 0xde, 0x49, 0x91, 0xa2, 0x52, 0x83, 0x90, 0x32, 0xcd, 0xa5, + 0xba, 0x54, 0x9c, 0xa2, 0xb3, 0x52, 0x49, 0x63, 0x83, 0x6e, 0x5e, 0xab, + 0x75, 0x77, 0x4e, 0x49, 0x81, 0xa4, 0xc1, 0x71, 0x3f, 0xc0, 0x83, 0x7a, + 0x59, 0x40, 0x69, 0x9a, 0xba, 0x9b, 0x3a, 0xb0, 0xc0, 0x6f, 0xaa, 0x83, + 
0xca, 0x93, 0xa6, 0x91, 0x3c, 0x61, 0x6d, 0x3d, 0xce, 0x51, 0x76, 0x3b, + 0x99, 0x87, 0x34, 0x7d, 0x34, 0xa5, 0x67, 0xb9, 0x9d, 0x40, 0x8d, 0x4a, + 0x51, 0x70, 0x9c, 0xcc, 0x6d, 0x2f, 0xd4, 0x70, 0x7b, 0x88, 0x98, 0x64, + 0x67, 0xb1, 0x36, 0x80, 0x4b, 0xba, 0x94, 0x7d, 0xa0, 0x54, 0x7b, 0x4b, + 0xb1, 0x2f, 0x68, 0x3d, 0x50, 0x8f, 0x4c, 0x65, 0xc9, 0x87, 0x3e, 0x7a, + 0x42, 0x56, 0x9c, 0x68, 0x96, 0xb2, 0x4d, 0x3d, 0x3e, 0x9c, 0x66, 0x31, + 0x66, 0x52, 0x54, 0x77, 0x81, 0xad, 0x4e, 0xa6, 0x69, 0x4b, 0x33, 0x6d, + 0x83, 0x6c, 0x3e, 0xaf, 0x69, 0xb8, 0xc9, 0xdc, 0x5d, 0x96, 0x75, 0x4d, + 0xd3, 0xc2, 0x71, 0x3c, 0x32, 0x7b, 0x32, 0xc9, 0x4a, 0x49, 0xaa, 0x98, + 0xc5, 0x3e, 0x86, 0xb0, 0xa4, 0x42, 0xcb, 0xcd, 0x68, 0x3f, 0x63, 0x4e, + 0x4f, 0x3a, 0x5b, 0x49, 0x41, 0x49, 0x97, 0x4d, 0x67, 0x5c, 0x6b, 0xa2, + 0xb2, 0x35, 0x6f, 0xc7, 0xad, 0xba, 0x4f, 0xa5, 0x40, 0x64, 0x7f, 0x3f, + 0x55, 0xc5, 0x6a, 0x9d, 0xb8, 0x40, 0xab, 0x73, 0x81, 0x8d, 0x37, 0xc1, + 0xad, 0xcc, 0x92, 0x97, 0x47, 0xcb, 0x3b, 0xaa, 0x82, 0xb1, 0x98, 0x95, + 0x60, 0xa8, 0x79, 0x8e, 0x73, 0x3d, 0x92, 0xbd, 0xc5, 0x97, 0xc0, 0xc4, + 0xb7, 0x9b, 0x70, 0x5e, 0xc6, 0xae, 0x64, 0xb2, 0x81, 0x48, 0x9d, 0x4d, + 0x74, 0x88, 0x49, 0x82, 0x72, 0x62, 0xc6, 0x43, 0xca, 0xa6, 0x50, 0x48, + 0xba, 0x38, 0xbd, 0x83, 0xd1, 0x57, 0x91, 0x62, 0x40, 0x75, 0x8a, 0xa5, + 0x33, 0x99, 0xb9, 0xc8, 0x3e, 0x34, 0x90, 0x99, 0x76, 0x8b, 0x83, 0x55, + 0x35, 0x96, 0x7e, 0xc6, 0x61, 0x92, 0xba, 0x71, 0x74, 0x55, 0xcd, 0xa6, + 0xb6, 0xb4, 0x6b, 0xa0, 0x45, 0x7e, 0x3e, 0x96, 0xbd, 0x69, 0x95, 0xd5, + 0xbb, 0x4c, 0xca, 0x72, 0x5a, 0xa0, 0xd4, 0x74, 0x3a, 0x58, 0x5f, 0xad, + 0x67, 0xa4, 0x83, 0xab, 0x56, 0x99, 0x7b, 0x81, 0x84, 0xaa, 0xaa, 0xbf, + 0x37, 0xb4, 0x65, 0xab, 0xa2, 0xa0, 0xb8, 0x41, 0xbb, 0x8b, 0xa5, 0x88, + 0xce, 0x37, 0x4a, 0x67, 0x40, 0x46, 0x5d, 0x9a, 0xc3, 0xb6, 0x60, 0x76, + 0xbd, 0xc5, 0x68, 0x87, 0x6a, 0x9f, 0x40, 0xce, 0xab, 0x94, 0xbc, 0xa8, + 0x49, 0xcb, 0x31, 0x99, 0x70, 0x54, 0x36, 0x68, 0xca, 0x32, 0x95, 0xcf, + 0x3c, 0x4f, 0x4b, 0x3e, 0xab, 0x8a, 0x77, 0x53, 0x59, 0x32, 0x93, 0xc6, + 0x3b, 0xba, 0x96, 0x68, 0x47, 0x57, 0xba, 0x47, 0x71, 0xc9, 0x41, 0xd1, + 0xa1, 0x8a, 0x55, 0xc5, 0x6f, 0xba, 0x7e, 0xc9, 0x4f, 0xcd, 0x77, 0xbd, + 0x63, 0x91, 0x90, 0x34, 0xc9, 0x4d, 0x5d, 0x90, 0x68, 0xaf, 0xa3, 0x6f, + 0x94, 0x44, 0xa5, 0xa7, 0x8d, 0x43, 0xba, 0x6d, 0x6b, 0xd0, 0xa4, 0x5a, + 0x95, 0x50, 0xb0, 0x70, 0x9f, 0x9c, 0xae, 0x39, 0xb1, 0x6e, 0x5b, 0x7e, + 0x87, 0x96, 0xce, 0x37, 0x32, 0x83, 0xba, 0x58, 0xb9, 0x5f, 0xa3, 0x76, + 0x89, 0x5a, 0x93, 0x92, 0x98, 0x6d, 0x55, 0x7d, 0x94, 0xc6, 0xb6, 0x46, + 0x97, 0x67, 0xac, 0x92, 0x57, 0xd1, 0xad, 0x5e, 0x6f, 0x52, 0x5e, 0x68, + 0xc4, 0x9d, 0xd4, 0x44, 0xc3, 0xa5, 0xc7, 0xc3, 0x93, 0x9a, 0x9b, 0x3d, + 0xa6, 0x31, 0x98, 0xb0, 0x87, 0x56, 0xa4, 0x9b, 0xae, 0x2d, 0x26, 0x51, + 0x35, 0x3f, 0x2e, 0x5b, 0xc7, 0xba, 0x76, 0x73, 0x7d, 0x62, 0x42, 0x68, + 0x8a, 0xb4, 0xcc, 0x93, 0x44, 0x6f, 0xa3, 0xc6, 0x94, 0xb5, 0xb3, 0xaf, + 0x46, 0x85, 0xb0, 0x3b, 0xbf, 0x77, 0x48, 0x83, 0x8c, 0x7c, 0xc1, 0xb4, + 0x65, 0x94, 0xac, 0x2f, 0x33, 0xad, 0x99, 0x7b, 0x44, 0x99, 0xad, 0x89, + 0xc6, 0x67, 0x5a, 0x87, 0x6b, 0x88, 0x4e, 0x62, 0xa8, 0xbd, 0xb9, 0xa7, + 0xa4, 0xa4, 0x86, 0x7b, 0x9f, 0xba, 0x81, 0x5a, 0xa0, 0xb2, 0x80, 0x6f, + 0x80, 0x54, 0x7f, 0x83, 0xaf, 0x73, 0x48, 0x5c, 0xcb, 0x47, 0x4a, 0xbe, + 0x98, 0x76, 0x9e, 0x77, 0x91, 0x72, 0x5b, 0x91, 0x4e, 0x80, 0x55, 0x66, + 0x31, 0x37, 0x80, 0xa6, 0x60, 0xba, 0x8b, 0x4d, 0x86, 0x49, 0x7e, 0x8a, + 0x72, 0xb4, 0x8f, 0x77, 0x5b, 0x6c, 0x49, 0x65, 0xcc, 0x43, 0x65, 0x4c, + 
0xbb, 0x78, 0x90, 0xb5, 0xa8, 0x48, 0xa3, 0x98, 0x46, 0x8b, 0x78, 0x70, + 0xd0, 0x3c, 0x6e, 0xd5, 0xa8, 0xc8, 0x40, 0x61, 0x4f, 0x72, 0x75, 0xae, + 0x9f, 0x42, 0x7a, 0x75, 0x3c, 0x93, 0x8a, 0xb1, 0xc3, 0x7c, 0xcd, 0xa2, + 0x5e, 0xa0, 0x3e, 0x58, 0xc0, 0xa6, 0xa0, 0x77, 0x31, 0xb5, 0x8b, 0xc7, + 0xb7, 0xa0, 0x62, 0x6a, 0x46, 0xbe, 0x47, 0xbc, 0x5f, 0x31, 0x4d, 0xbb, + 0x58, 0xc9, 0x38, 0xb7, 0x9d, 0xa6, 0x43, 0xcf, 0xca, 0xbe, 0xc1, 0x3f, + 0x5a, 0x83, 0x76, 0x35, 0x4c, 0xae, 0x3a, 0x65, 0xce, 0xaa, 0x73, 0x6b, + 0xbe, 0x3f, 0x4f, 0x63, 0x74, 0x9d, 0x8a, 0x2e, 0xa5, 0xce, 0x5f, 0x2f, + 0x48, 0x31, 0x6c, 0x70, 0x38, 0xa1, 0x99, 0x55, 0x75, 0xae, 0x96, 0x42, + 0x8d, 0x37, 0x48, 0x31, 0x59, 0xcd, 0x33, 0x8d, 0x89, 0x97, 0xc8, 0xa3, + 0xab, 0x57, 0x79, 0x6c, 0x55, 0x69, 0x91, 0x8d, 0xb5, 0x9e, 0x44, 0xa3, + 0x37, 0x97, 0x91, 0x8f, 0x75, 0xd0, 0x3d, 0x5f, 0xcf, 0x81, 0xc3, 0x63, + 0x5f, 0x4f, 0x3c, 0x78, 0x57, 0xb3, 0x3e, 0xdb, 0x7b, 0x69, 0xaf, 0xb6, + 0xc2, 0x4e, 0x4e, 0x6c, 0x6c, 0x68, 0x4f, 0x8c, 0x76, 0x35, 0x4c, 0xd1, + 0x97, 0xa2, 0x4a, 0xca, 0x7d, 0xb6, 0x7a, 0xb0, 0xc5, 0x46, 0xa2, 0x9e, + 0x69, 0x5d, 0x61, 0xa5, 0x7f, 0x5e, 0x48, 0xd3, 0x89, 0x9c, 0x87, 0x5d, + 0xa7, 0x60, 0x6f, 0x78, 0xd1, 0x42, 0xa5, 0x75, 0x57, 0xad, 0x3f, 0x6b, + 0x58, 0xc8, 0x68, 0x41, 0x38, 0x3e, 0xa1, 0x80, 0x80, 0xaa, 0xd1, 0xae, + 0xa0, 0xa4, 0xd0, 0xc1, 0x53, 0xbd, 0x4a, 0x60, 0x4c, 0xc4, 0x69, 0x78, + 0x56, 0x62, 0x42, 0xc9, 0x48, 0x62, 0xbe, 0x43, 0xc2, 0x8b, 0xa9, 0x4a, + 0x51, 0x97, 0xcd, 0x6e, 0xa4, 0x73, 0x8a, 0x74, 0x50, 0x8f, 0xcf, 0x57, + 0x72, 0xb0, 0xb8, 0xc1, 0xd1, 0x62, 0x6f, 0x8e, 0x78, 0x6f, 0xbd, 0x6f, + 0xce, 0x69, 0x9f, 0xac, 0x97, 0x9e, 0x3c, 0xbf, 0x4b, 0x69, 0xca, 0x53, + 0x6f, 0xb5, 0x4e, 0x3c, 0x54, 0x6b, 0x63, 0x90, 0x64, 0x37, 0x5e, 0x7a, + 0xb2, 0x43, 0x58, 0xb6, 0x3a, 0x79, 0x8a, 0x4c, 0xba, 0x3b, 0x83, 0xbd, + 0x73, 0xb9, 0x7f, 0x30, 0x88, 0xc6, 0x83, 0xa4, 0x73, 0xc5, 0x4b, 0x58, + 0xb6, 0x98, 0x9f, 0x5b, 0x38, 0xc9, 0x7c, 0x60, 0x90, 0xc8, 0x78, 0xd3, + 0x43, 0x8a, 0xa3, 0xbb, 0x44, 0x52, 0xba, 0x99, 0x92, 0x39, 0x75, 0xb1, + 0x6c, 0xd5, 0x81, 0x96, 0xae, 0x3c, 0x41, 0x7e, 0x5c, 0x76, 0x95, 0xac, + 0x37, 0x57, 0xc2, 0xcd, 0x44, 0x75, 0x55, 0x3a, 0xb9, 0x88, 0x7a, 0x3c, + 0x76, 0x80, 0xba, 0x6a, 0xc2, 0x69, 0xba, 0x53, 0x71, 0x6c, 0xb4, 0xc2, + 0x72, 0x9c, 0x4e, 0xc6, 0x5e, 0x4d, 0xb7, 0x63, 0x36, 0x54, 0x7c, 0x73, + 0xc1, 0xce, 0xae, 0x52, 0x52, 0x42, 0x37, 0x77, 0xa6, 0x3c, 0x62, 0x6d, + 0x70, 0x3f, 0x9d, 0x3e, 0x46, 0x76, 0x4d, 0xa3, 0x50, 0xa0, 0xc0, 0x54, + 0x9f, 0x4a, 0x57, 0x6f, 0x6e, 0x5a, 0x62, 0xbe, 0x9b, 0x8f, 0xad, 0xbc, + 0xb8, 0xbf, 0x43, 0x63, 0xd1, 0x78, 0x5a, 0x44, 0x91, 0x89, 0x3e, 0x7d, + 0x72, 0x33, 0x67, 0x4e, 0xcb, 0xc3, 0x80, 0xb6, 0x54, 0x7e, 0xcf, 0x3c, + 0x6a, 0x32, 0x49, 0xc1, 0xc7, 0xbf, 0x77, 0x4e, 0x4b, 0xa6, 0xb6, 0x70, + 0x41, 0x49, 0x47, 0x91, 0x8f, 0xa9, 0x40, 0x5b, 0xc1, 0x4e, 0x38, 0xbc, + 0x3d, 0x43, 0x47, 0x4c, 0x60, 0xc8, 0x57, 0xac, 0xc1, 0x63, 0x50, 0xcf, + 0x91, 0x78, 0x93, 0xaf, 0x8f, 0x58, 0xa7, 0x81, 0x46, 0xb8, 0x72, 0xd3, + 0x97, 0x37, 0xb3, 0xb6, 0xaa, 0xc9, 0x88, 0x92, 0x40, 0x9b, 0x49, 0xc2, + 0xa5, 0x98, 0xa3, 0x8d, 0xc7, 0x3b, 0x76, 0x34, 0x52, 0xab, 0x33, 0x37, + 0x83, 0x7a, 0x40, 0xbb, 0x3d, 0x42, 0xae, 0xa2, 0x81, 0x7d, 0x7b, 0xc7, + 0x4e, 0xcb, 0x59, 0x71, 0xd2, 0x3a, 0xd6, 0xc1, 0x72, 0x9d, 0x3a, 0x8d, + 0xb0, 0xb0, 0x9b, 0x7e, 0x37, 0x72, 0x99, 0x3d, 0x45, 0x81, 0x71, 0xa3, + 0xa9, 0xcf, 0x98, 0xb2, 0xbe, 0x99, 0xcb, 0x39, 0xb0, 0x3e, 0xcd, 0x79, + 0xaf, 0x3a, 0xa5, 0x35, 0x3c, 0xa2, 0x72, 0xb3, 0x31, 0x7c, 0x47, 0x8b, + 
0x4d, 0xb1, 0xc7, 0x55, 0xca, 0x97, 0x8a, 0xa1, 0x8b, 0xa0, 0x4e, 0xab, + 0x9a, 0x59, 0x8c, 0x8b, 0x6f, 0x80, 0x53, 0x3e, 0xc1, 0x4a, 0xad, 0xb5, + 0xcb, 0x7b, 0xc1, 0x7a, 0x44, 0x43, 0x9e, 0xc9, 0xbf, 0x83, 0x3c, 0x35, + 0x93, 0xc8, 0x80, 0x87, 0xa9, 0x49, 0xbf, 0x6c, 0x3f, 0xa6, 0x97, 0x9e, + 0x87, 0x33, 0x39, 0xaa, 0x92, 0xd3, 0x57, 0xc0, 0x49, 0x89, 0x44, 0xb5, + 0x94, 0x97, 0x3e, 0x5e, 0x7e, 0x6c, 0xa3, 0x74, 0x8b, 0xc5, 0xc7, 0x89, + 0x85, 0x5f, 0x42, 0x60, 0x9a, 0xc5, 0xb2, 0x7f, 0xae, 0x42, 0xa7, 0x69, + 0x42, 0x7a, 0x7b, 0x5a, 0xa7, 0x94, 0x57, 0x6b, 0x4a, 0x58, 0xa6, 0x84, + 0xb4, 0x92, 0x93, 0x80, 0x94, 0x7b, 0x81, 0xc7, 0xd2, 0x96, 0x5d, 0xa6, + 0x3a, 0x5b, 0x5a, 0x47, 0x62, 0x57, 0x91, 0x3b, 0x41, 0x91, 0xb9, 0x6a, + 0x34, 0x43, 0x9c, 0x77, 0x62, 0xaa, 0x86, 0x9a, 0x7a, 0x56, 0x97, 0x89, + 0xc1, 0x7d, 0x5d, 0xba, 0x8e, 0xb5, 0x49, 0x84, 0x7b, 0x88, 0xbd, 0x67, + 0x7d, 0x8d, 0x51, 0xd2, 0x8f, 0x84, 0xa5, 0x97, 0x34, 0xce, 0xcb, 0x35, + 0xd2, 0x75, 0x8b, 0x39, 0xcb, 0x8e, 0x4d, 0xba, 0xcc, 0x6c, 0x78, 0x70, + 0xd9, 0x62, 0xde, 0x6f, 0x52, 0x6e, 0x90, 0xad, 0xa8, 0xc5, 0x7f, 0x45, + 0x81, 0x8f, 0xbc, 0x69, 0x6c, 0xd1, 0x99, 0xdc, 0xc8, 0x5e, 0x37, 0x96, + 0x69, 0xa6, 0x51, 0xb1, 0x49, 0x4d, 0x6c, 0x4e, 0xa0, 0xcc, 0xd7, 0xc4, + 0x85, 0xab, 0xc3, 0x8e, 0xa0, 0x53, 0x9f, 0xc0, 0x78, 0x89, 0xa7, 0xaf, + 0xba, 0x61, 0xc1, 0xc1, 0x56, 0x75, 0x71, 0x50, 0x53, 0xd2, 0x7e, 0x67, + 0xb6, 0x8e, 0x4d, 0xc0, 0x35, 0xad, 0xb7, 0x43, 0x8c, 0x5f, 0x4f, 0x3f, + 0xc3, 0xce, 0x73, 0x85, 0xb6, 0x6e, 0xb6, 0x6d, 0x5f, 0x74, 0x73, 0x88, + 0xd4, 0x78, 0x78, 0xbc, 0xce, 0x49, 0xb2, 0x57, 0x8c, 0x68, 0x8a, 0xd4, + 0xd3, 0x5d, 0xc6, 0xcb, 0x77, 0xbf, 0x53, 0x66, 0x46, 0xcd, 0x66, 0x78, + 0x9c, 0x93, 0x81, 0x9b, 0x8b, 0x96, 0xb7, 0xc9, 0xa0, 0x67, 0x6e, 0x6c, + 0xcb, 0x92, 0x9f, 0xbd, 0x7a, 0x45, 0x8d, 0xc8, 0xb3, 0xc8, 0xbf, 0x48, + 0x90, 0xa1, 0xac, 0xd1, 0x86, 0x7e, 0x97, 0x34, 0xcc, 0xba, 0x77, 0xaf, + 0x89, 0x90, 0x94, 0x3b, 0x8d, 0x69, 0x44, 0xcd, 0x56, 0x61, 0xae, 0xc1, + 0x9a, 0x53, 0x5c, 0x53, 0x87, 0x6d, 0x5b, 0x7d, 0xcf, 0xb8, 0x6b, 0x5b, + 0xbf, 0x7d, 0xad, 0x9e, 0x69, 0x60, 0xb6, 0xbe, 0x92, 0x3c, 0x83, 0x5c, + 0xbc, 0xb8, 0x47, 0x44, 0x8b, 0x98, 0xd3, 0x9e, 0x53, 0x41, 0xb6, 0xd3, + 0xb9, 0x42, 0x5f, 0xb9, 0xa1, 0xaa, 0xae, 0x80, 0x7e, 0x7d, 0x73, 0x75, + 0x64, 0x3f, 0x6d, 0x77, 0x39, 0x6c, 0xa5, 0xbf, 0xaa, 0x77, 0x96, 0x9b, + 0x3b, 0x7f, 0x6d, 0xcf, 0x3a, 0xcc, 0x72, 0xae, 0xa5, 0x38, 0x95, 0x4c, + 0x4f, 0x67, 0x55, 0x72, 0x43, 0x72, 0xaa, 0x42, 0x6a, 0xac, 0xc6, 0x47, + 0x5d, 0x3e, 0x3d, 0x7e, 0x3c, 0xcd, 0xa5, 0xa8, 0x7d, 0x51, 0xb8, 0x77, + 0x6c, 0x97, 0xb4, 0x5d, 0xbd, 0x54, 0xb3, 0xbc, 0x48, 0x95, 0xad, 0xb1, + 0x93, 0x36, 0x53, 0x5d, 0x9e, 0x9d, 0xd2, 0x66, 0x4c, 0x96, 0xbc, 0x7c, + 0x75, 0xa6, 0xd5, 0x85, 0x82, 0xdb, 0xd5, 0xd7, 0xb8, 0xb2, 0x3b, 0x7c, + 0xaf, 0x96, 0x39, 0xad, 0x9b, 0x66, 0x84, 0xb4, 0x2b, 0x88, 0xab, 0x57, + 0x81, 0x74, 0xaa, 0x52, 0x78, 0x8e, 0x60, 0x4a, 0x83, 0x46, 0xca, 0x4e, + 0xa0, 0xa5, 0xaf, 0xb3, 0x87, 0xb3, 0x57, 0x43, 0xb8, 0xc6, 0x77, 0x45, + 0xc6, 0x68, 0x84, 0x91, 0xa2, 0x59, 0x7d, 0x97, 0x6b, 0xc9, 0x52, 0x96, + 0x8e, 0x2e, 0x9a, 0xb6, 0x53, 0x5e, 0xb4, 0x4f, 0x87, 0x63, 0x9b, 0x76, + 0xbe, 0x55, 0xb4, 0x58, 0xb6, 0xd1, 0x89, 0x63, 0x5f, 0x90, 0xa5, 0x3e, + 0x3d, 0x9e, 0x73, 0x4c, 0x5c, 0xc9, 0x67, 0x79, 0xb2, 0x5c, 0x64, 0x5a, + 0x90, 0xd6, 0xb6, 0xa9, 0x56, 0x8e, 0xa1, 0x8a, 0x3c, 0xcd, 0xa3, 0xc7, + 0xb0, 0x6a, 0xc2, 0x8a, 0x7f, 0xb5, 0x85, 0x69, 0x58, 0x9b, 0x3a, 0x81, + 0xcb, 0xaf, 0x76, 0x82, 0x9c, 0x71, 0xc1, 0x6a, 0x57, 0xc1, 0x59, 0x8b, + 
0x92, 0x4f, 0xa0, 0x8b, 0x49, 0x44, 0x5e, 0x64, 0x40, 0x7e, 0x9e, 0x8c, + 0x38, 0x50, 0x41, 0x39, 0x8a, 0xb1, 0x9a, 0x38, 0x9a, 0x3b, 0x36, 0x50, + 0xd8, 0xd7, 0xa9, 0xb0, 0x4d, 0x41, 0xb6, 0x9b, 0xaf, 0x31, 0xc9, 0x9a, + 0x87, 0x66, 0x9d, 0x35, 0xcb, 0x74, 0xb9, 0x74, 0x5b, 0xbe, 0xa3, 0x85, + 0x86, 0xd2, 0x5d, 0x7f, 0x92, 0x81, 0x4a, 0x58, 0x64, 0x89, 0x86, 0x62, + 0x68, 0x71, 0x51, 0xb0, 0x42, 0x36, 0x5e, 0xad, 0x75, 0xbd, 0x55, 0xbf, + 0x4e, 0x6b, 0xd1, 0x6c, 0x45, 0x80, 0x80, 0x58, 0x6f, 0x95, 0x5b, 0xb6, + 0x58, 0x65, 0x8d, 0x64, 0x48, 0x74, 0xad, 0x35, 0xc4, 0x82, 0x38, 0x8b, + 0x9d, 0xa3, 0x92, 0x6e, 0xc4, 0x78, 0x50, 0x3c, 0xa2, 0x52, 0x5f, 0xc9, + 0x39, 0xb4, 0xca, 0x44, 0x6b, 0x99, 0xb0, 0x8b, 0x8d, 0xc4, 0xc8, 0xc9, + 0x49, 0xb9, 0x85, 0x50, 0xb4, 0x41, 0x4a, 0x6a, 0x79, 0xa3, 0x4d, 0xa9, + 0x75, 0x43, 0x4f, 0x56, 0xa6, 0x53, 0x77, 0xb1, 0x69, 0x39, 0x55, 0xcc, + 0x9f, 0x39, 0x4f, 0x3f, 0x50, 0x4c, 0x55, 0xc0, 0x94, 0x48, 0x96, 0x60, + 0x32, 0x6a, 0x63, 0x88, 0xca, 0x9f, 0xb1, 0x56, 0xa8, 0x9c, 0xc0, 0x67, + 0x7f, 0x58, 0x55, 0xc1, 0xd2, 0xab, 0xd1, 0x4d, 0xc3, 0x6d, 0x74, 0x3a, + 0xd8, 0x93, 0x45, 0xa8, 0xb1, 0x5c, 0x9c, 0xd1, 0x91, 0x57, 0x34, 0xbe, + 0x94, 0x52, 0x8c, 0x30, 0x5d, 0x4d, 0x3e, 0x3e, 0x9c, 0x41, 0x99, 0x53, + 0x97, 0x70, 0x48, 0xd1, 0x65, 0x76, 0x98, 0x30, 0xa1, 0x3d, 0x7e, 0x33, + 0x5b, 0xbe, 0xb5, 0x2e, 0x31, 0x67, 0x8b, 0xc5, 0xcd, 0xd2, 0x78, 0x5a, + 0xa7, 0x90, 0x32, 0x49, 0x9d, 0xb6, 0xc0, 0x9f, 0xb6, 0xc4, 0x4e, 0xb3, + 0x30, 0x71, 0x81, 0x86, 0x57, 0xb6, 0x39, 0x3d, 0x6c, 0x86, 0x62, 0xaa, + 0x90, 0x48, 0x7c, 0xa9, 0xa0, 0x39, 0x8c, 0x5a, 0xbc, 0xda, 0x87, 0x8a, + 0x91, 0x43, 0x52, 0x9b, 0xbf, 0xb3, 0x84, 0xa7, 0x9c, 0x78, 0x54, 0xa2, + 0x50, 0x31, 0x36, 0xa8, 0x99, 0x33, 0xb4, 0x8e, 0x83, 0x2a, 0x57, 0x55, + 0x89, 0x3a, 0x67, 0xcd, 0x4e, 0x3a, 0x43, 0x35, 0x87, 0xc5, 0xb5, 0x54, + 0x7f, 0x6c, 0xb2, 0xb6, 0xc0, 0x6e, 0x77, 0xba, 0x33, 0x93, 0x42, 0x40, + 0xac, 0x80, 0x48, 0x3a, 0x4a, 0x88, 0x74, 0x95, 0x79, 0xa6, 0xc0, 0x90, + 0xd2, 0xc1, 0x5c, 0x41, 0xdd, 0x50, 0xb6, 0x41, 0x9f, 0x9d, 0xc4, 0xbe, + 0x90, 0x7d, 0xb8, 0x4a, 0x78, 0xc2, 0xa5, 0xca, 0x9e, 0x48, 0x3a, 0x55, + 0x64, 0xa0, 0x97, 0x9d, 0x6c, 0x89, 0x8b, 0xc7, 0xb9, 0xa6, 0xb6, 0xaf, + 0x9c, 0x67, 0x80, 0xc0, 0x32, 0xd0, 0x83, 0x59, 0xc3, 0x46, 0xa0, 0x82, + 0x7f, 0x5b, 0x7e, 0x71, 0xb7, 0x7e, 0x61, 0x5b, 0xa0, 0x8f, 0x79, 0x5e, + 0x83, 0x68, 0xb0, 0xb8, 0x6d, 0x51, 0xc6, 0x67, 0xe1, 0xd9, 0xaf, 0x6c, + 0x8c, 0x6d, 0x4a, 0x45, 0x75, 0xb5, 0xab, 0x64, 0x66, 0xac, 0x36, 0x98, + 0xbc, 0xbe, 0x5f, 0x3e, 0x63, 0x85, 0xc5, 0xc5, 0x49, 0x65, 0x4f, 0xaf, + 0x66, 0x8d, 0x9b, 0x77, 0x5d, 0x88, 0x80, 0x9b, 0xd0, 0xd1, 0x37, 0xb0, + 0x7d, 0xd0, 0x7b, 0x69, 0x7d, 0xbf, 0xa0, 0xb5, 0xd9, 0xa8, 0x4f, 0x7a, + 0x7a, 0xd8, 0x92, 0x46, 0xd7, 0x70, 0x9b, 0xc7, 0x30, 0x67, 0x9d, 0x73, + 0x78, 0x83, 0x63, 0x9a, 0xa0, 0x6c, 0x63, 0x3e, 0x92, 0xa4, 0xcb, 0x3a, + 0x64, 0x7a, 0x7d, 0x3a, 0x3c, 0xa9, 0x7d, 0x7c, 0xbb, 0xcc, 0x62, 0xce, + 0x93, 0x54, 0x64, 0x38, 0xcb, 0x5c, 0xd4, 0xc4, 0x2c, 0x8f, 0x8b, 0xc3, + 0x9e, 0xbe, 0x73, 0x3f, 0x4e, 0xab, 0x4b, 0xa1, 0x85, 0xaf, 0xc9, 0x8a, + 0xbd, 0x5e, 0x85, 0x8e, 0x62, 0x98, 0x8f, 0xa8, 0x5f, 0x30, 0x84, 0xb7, + 0xa3, 0xcd, 0x3a, 0xbb, 0x39, 0xaf, 0xcf, 0x85, 0x70, 0xcf, 0x76, 0xc1, + 0xb2, 0x44, 0xba, 0xd0, 0xbc, 0x4f, 0x5f, 0x3e, 0x8e, 0x80, 0x46, 0xae, + 0xaf, 0x77, 0x8e, 0x71, 0x4b, 0x92, 0x72, 0x7f, 0x3b, 0x60, 0x7b, 0x9a, + 0x55, 0x73, 0x6f, 0x6f, 0x89, 0x80, 0x73, 0x89, 0x46, 0xcc, 0x8b, 0xc6, + 0xb3, 0xc0, 0xaa, 0xb7, 0xa3, 0xa2, 0x4b, 0x88, 0x69, 0xc5, 0xa7, 0x4e, + 
0xa1, 0xc4, 0x57, 0x85, 0x55, 0xb5, 0xb9, 0x95, 0xa6, 0xa4, 0x6e, 0x39, + 0x52, 0x59, 0xbe, 0xb7, 0x71, 0x7b, 0x7f, 0x4c, 0xb2, 0xb6, 0x5b, 0xb2, + 0x38, 0xad, 0xce, 0x5d, 0xb7, 0x78, 0x77, 0xad, 0xbb, 0x8b, 0x91, 0xaa, + 0x7b, 0xd4, 0x7c, 0xc4, 0x73, 0x72, 0x8f, 0x43, 0x68, 0x54, 0x7a, 0x3c, + 0xce, 0x6d, 0x5d, 0x57, 0x91, 0x8c, 0x8e, 0x96, 0x73, 0x61, 0x6a, 0x9e, + 0x9e, 0x7f, 0x99, 0x60, 0x98, 0x65, 0x36, 0x8d, 0x3a, 0xb2, 0xcc, 0xc6, + 0xb3, 0xa9, 0x5d, 0x72, 0x86, 0xc2, 0x92, 0xce, 0x55, 0x35, 0x99, 0x3a, + 0x41, 0x75, 0x8a, 0x90, 0x54, 0x5b, 0x50, 0x6a, 0xb1, 0xa6, 0x7c, 0xce, + 0x4d, 0xca, 0xd5, 0x37, 0x99, 0xbc, 0x70, 0xae, 0xc7, 0x5b, 0x58, 0xb7, + 0x95, 0x89, 0x43, 0x4c, 0x7d, 0x75, 0x96, 0x3d, 0x35, 0x9d, 0x55, 0x3a, + 0x96, 0xbb, 0x5c, 0xb5, 0x4e, 0xda, 0x5f, 0x60, 0x65, 0x81, 0xce, 0x6a, + 0x4c, 0x80, 0xd4, 0x3d, 0x2e, 0xac, 0xaf, 0xaf, 0x5b, 0x50, 0xce, 0x6e, + 0x4f, 0x7a, 0x92, 0x33, 0x9d, 0x3e, 0x7c, 0x7c, 0xae, 0xd5, 0xc5, 0x37, + 0x90, 0x80, 0x3b, 0x57, 0x68, 0xd8, 0x97, 0x54, 0x8e, 0x5d, 0x99, 0xbc, + 0x46, 0x9d, 0x94, 0xad, 0x84, 0x3b, 0xc0, 0x80, 0x50, 0xb8, 0x4b, 0xbc, + 0x74, 0xbe, 0xa7, 0x83, 0xa1, 0x8f, 0x53, 0x7e, 0x2f, 0x87, 0x92, 0x4a, + 0xa7, 0xb3, 0xe1, 0xbe, 0x8b, 0x77, 0x72, 0x98, 0x7a, 0x8e, 0xaf, 0xbc, + 0xcd, 0xb5, 0xc7, 0xbd, 0x97, 0x71, 0xc2, 0x48, 0x4e, 0x91, 0x26, 0xa7, + 0xc0, 0x55, 0x78, 0x4d, 0xda, 0x83, 0x7d, 0xd7, 0xb1, 0x84, 0xaf, 0xd7, + 0xbe, 0x96, 0x9b, 0x8a, 0x35, 0x65, 0x9e, 0x5e, 0xcb, 0x62, 0x54, 0x3a, + 0x82, 0x70, 0x72, 0xbd, 0xd8, 0xde, 0x7f, 0x8a, 0xaf, 0xda, 0x7a, 0x7e, + 0x49, 0x58, 0x69, 0xa6, 0xa2, 0x50, 0x82, 0x87, 0xa9, 0x90, 0xbf, 0x5c, + 0x9b, 0x9f, 0x38, 0x82, 0xcc, 0x97, 0xa2, 0x95, 0x55, 0xc2, 0x6b, 0xbb, + 0x7f, 0x57, 0x71, 0x4d, 0x63, 0x32, 0x72, 0x77, 0x7c, 0x56, 0xcc, 0xb9, + 0x83, 0xcf, 0x3d, 0x97, 0x6d, 0x8d, 0x3a, 0x95, 0xc0, 0x73, 0xa7, 0xc6, + 0x48, 0x96, 0xd1, 0x88, 0x9e, 0x8b, 0xc4, 0x56, 0xa4, 0x94, 0x82, 0xb2, + 0xa6, 0xab, 0xac, 0x69, 0xaa, 0xca, 0x65, 0xca, 0x8e, 0xb8, 0x3e, 0xcf, + 0x87, 0x87, 0x67, 0xc6, 0x33, 0xa8, 0xa3, 0x7c, 0xb9, 0x5b, 0xd1, 0x4b, + 0x85, 0x58, 0xa9, 0x7b, 0x65, 0x30, 0xa3, 0x87, 0x7c, 0xcd, 0xbb, 0xbc, + 0x3d, 0x55, 0xba, 0x4e, 0xa6, 0x86, 0x94, 0xd1, 0x7e, 0x4e, 0x63, 0xbc, + 0x93, 0x8d, 0x5b, 0x47, 0xb1, 0xcc, 0x8f, 0xad, 0x33, 0xb2, 0xbc, 0x72, + 0x34, 0xa5, 0x78, 0x94, 0x73, 0x7a, 0x83, 0x53, 0x52, 0xb2, 0x96, 0x36, + 0x75, 0xcd, 0x75, 0x8b, 0x4f, 0xa0, 0xc3, 0xbb, 0xcd, 0x8f, 0x66, 0x72, + 0x75, 0x88, 0xcd, 0x97, 0xb5, 0x69, 0xa9, 0x6a, 0x33, 0x84, 0xc0, 0xa9, + 0xb2, 0x93, 0xaf, 0xab, 0xbb, 0x60, 0xb7, 0xc9, 0x53, 0x7f, 0xb7, 0x9c, + 0x3a, 0xb6, 0x56, 0xb3, 0x42, 0xbc, 0x57, 0x98, 0x30, 0x7e, 0x8a, 0xbf, + 0x80, 0xac, 0x3e, 0x5b, 0x68, 0x64, 0xcb, 0x62, 0x66, 0x8e, 0x28, 0x69, + 0x99, 0xc2, 0x6f, 0x4d, 0x48, 0x7e, 0x83, 0x9a, 0x3b, 0xb4, 0xd8, 0x75, + 0x77, 0x7d, 0x4f, 0x8d, 0xa4, 0x95, 0x4e, 0x62, 0x98, 0x9a, 0x88, 0xaf, + 0x87, 0xd0, 0xd5, 0x83, 0x85, 0x82, 0x76, 0x9d, 0x8c, 0xa9, 0x8c, 0xa4, + 0x3e, 0x63, 0xd6, 0x48, 0x4f, 0x8c, 0xcf, 0x7f, 0xcd, 0x68, 0x94, 0x40, + 0x5f, 0x8f, 0x82, 0xbe, 0xa1, 0x83, 0xc5, 0x49, 0xc3, 0x3d, 0x6e, 0x79, + 0x64, 0xbd, 0x98, 0x9b, 0xb1, 0x92, 0x73, 0xcf, 0x78, 0x34, 0xd9, 0x4a, + 0xb1, 0x55, 0x39, 0x5f, 0x51, 0xcd, 0xc2, 0x33, 0x34, 0xac, 0xa0, 0x54, + 0x49, 0x98, 0x55, 0xb5, 0x6e, 0x8c, 0xce, 0x84, 0x43, 0x7c, 0x32, 0x65, + 0x86, 0x9b, 0x5b, 0x3d, 0x9b, 0xb4, 0x54, 0x57, 0x92, 0x43, 0x77, 0x43, + 0x9b, 0xac, 0x6f, 0xc0, 0x38, 0x54, 0x8a, 0x68, 0x99, 0x57, 0xb0, 0x35, + 0xa1, 0xba, 0x87, 0x6e, 0xc4, 0x68, 0x68, 0x68, 0x90, 0x8f, 0xb8, 0x6c, + 
0x8b, 0x96, 0x95, 0x7f, 0xa0, 0x8e, 0xe0, 0xba, 0x5d, 0x61, 0xd5, 0x61, + 0xbc, 0x67, 0x8f, 0x74, 0x94, 0x98, 0xa4, 0x97, 0xcf, 0xa5, 0xac, 0xa7, + 0x91, 0xa1, 0xb0, 0x76, 0xdb, 0x9a, 0x3f, 0x94, 0x71, 0x42, 0x7a, 0xa3, + 0xc2, 0x71, 0x94, 0x2d, 0xc1, 0x81, 0x3a, 0x7a, 0x60, 0x4b, 0x51, 0x40, + 0xb9, 0x81, 0xc9, 0xc8, 0x42, 0x5c, 0xad, 0x2e, 0x73, 0x33, 0x85, 0xc7, + 0x77, 0x69, 0x9e, 0x79, 0xcb, 0xa7, 0x39, 0x3b, 0xcc, 0x3d, 0x57, 0x6d, + 0x45, 0xaa, 0x66, 0xc4, 0xac, 0x7c, 0x4f, 0x62, 0xcb, 0xdd, 0xa6, 0xc4, + 0x37, 0x55, 0x93, 0x47, 0x4c, 0x5f, 0xa0, 0xb8, 0x5f, 0x7c, 0x87, 0x8a, + 0xc4, 0x80, 0x6b, 0xb4, 0x42, 0x7d, 0x3d, 0x74, 0x6f, 0xb7, 0xaf, 0x39, + 0x63, 0xb6, 0x8b, 0x44, 0x50, 0xc2, 0x3a, 0x8d, 0x64, 0x65, 0x6d, 0x50, + 0x50, 0xab, 0xab, 0x38, 0xd8, 0xbb, 0x7b, 0xa5, 0xa9, 0xb4, 0xb5, 0xb4, + 0xc9, 0xbb, 0x83, 0x74, 0x52, 0x5c, 0x90, 0x79, 0x6a, 0x6e, 0x48, 0x32, + 0x60, 0xc0, 0x92, 0x82, 0xcb, 0x93, 0x75, 0x9f, 0x72, 0x42, 0x9b, 0xb9, + 0x81, 0xc7, 0x89, 0xb7, 0x78, 0xb1, 0x5c, 0x5d, 0x73, 0xbf, 0x98, 0xa9, + 0x63, 0xbf, 0xa0, 0x99, 0xa5, 0x4b, 0xc0, 0xc4, 0x2a, 0x6d, 0x34, 0x4d, + 0x73, 0xa1, 0x6d, 0x63, 0x5f, 0x3d, 0xa1, 0x5d, 0x95, 0x99, 0x6b, 0x8f, + 0x9b, 0xc1, 0x5f, 0x76, 0xb1, 0x5d, 0x6e, 0x31, 0x78, 0x66, 0x83, 0x42, + 0xba, 0x8f, 0xb3, 0xa3, 0x74, 0xad, 0x7a, 0xd6, 0x70, 0xb7, 0x40, 0x6d, + 0xbd, 0x96, 0xb7, 0x47, 0x42, 0x73, 0x71, 0x98, 0x4e, 0x31, 0x38, 0x4e, + 0x75, 0x9a, 0x88, 0x93, 0xa8, 0x9f, 0x97, 0x40, 0x5a, 0xb5, 0x68, 0x6f, + 0x47, 0x69, 0xb8, 0x6a, 0x2f, 0x3a, 0xce, 0xdb, 0x8b, 0x45, 0x42, 0x5c, + 0x2a, 0x8f, 0xde, 0x64, 0x8b, 0x95, 0xa7, 0x58, 0x39, 0xcd, 0xe6, 0x6d, + 0xa9, 0xa2, 0x4b, 0x84, 0xaa, 0x74, 0x7c, 0x45, 0xac, 0xe6, 0xb0, 0x3f, + 0x45, 0x68, 0xcb, 0x98, 0x72, 0xad, 0x42, 0xb3, 0x83, 0x50, 0x4b, 0x7c, + 0x76, 0x96, 0xd0, 0x7b, 0xb8, 0x79, 0x4a, 0xb5, 0xbb, 0x38, 0x91, 0xd1, + 0x49, 0x82, 0x34, 0xb3, 0xbf, 0xaf, 0x61, 0x9c, 0x7a, 0x71, 0x48, 0xd0, + 0xcf, 0x7e, 0xaf, 0x72, 0xc1, 0x73, 0xc7, 0x93, 0x67, 0x3d, 0x64, 0xa7, + 0x9d, 0x72, 0x9e, 0x88, 0xbb, 0x8c, 0x7c, 0x6c, 0x84, 0xa7, 0x60, 0xa1, + 0xbc, 0x89, 0x32, 0xab, 0xa5, 0x43, 0xb7, 0x80, 0x3c, 0xb8, 0x49, 0xde, + 0x44, 0x4e, 0x4e, 0x75, 0xa3, 0xcb, 0x81, 0x6c, 0x88, 0x4e, 0x7a, 0x76, + 0x9e, 0xd6, 0xbe, 0x3c, 0x76, 0x41, 0x5a, 0x62, 0x5a, 0x80, 0x4b, 0x89, + 0xa3, 0x31, 0x50, 0xbd, 0xc1, 0xc0, 0x5a, 0x8e, 0x35, 0xa1, 0x4a, 0x82, + 0xbb, 0x67, 0x33, 0x36, 0xc0, 0x58, 0x7d, 0xb3, 0x9c, 0xaf, 0xa9, 0xc5, + 0x34, 0xc6, 0x3c, 0x84, 0x3b, 0xa9, 0x93, 0xaa, 0x85, 0xc8, 0x83, 0x61, + 0x78, 0x69, 0xd8, 0x86, 0x86, 0xcc, 0x6a, 0x50, 0x9a, 0x47, 0xcb, 0x5e, + 0x30, 0xb3, 0xa8, 0x80, 0x4d, 0xd8, 0xca, 0x49, 0xb4, 0x4c, 0xb0, 0xb5, + 0xc0, 0xa4, 0x55, 0x28, 0x44, 0x7d, 0xc1, 0xb0, 0x5e, 0xbc, 0xcd, 0x63, + 0xc0, 0x4c, 0xae, 0x35, 0x6e, 0x57, 0x5e, 0xa1, 0xb0, 0xa4, 0xca, 0x40, + 0x5a, 0x4e, 0x9c, 0x2d, 0xbb, 0x3a, 0x4c, 0x3a, 0x60, 0xb4, 0x96, 0x8f, + 0xa6, 0x5f, 0x84, 0xbd, 0x57, 0x6b, 0x8c, 0x67, 0x83, 0x8d, 0xb3, 0xb5, + 0xa4, 0x68, 0xc7, 0x95, 0x68, 0xca, 0x9e, 0x8a, 0x7d, 0x54, 0xb2, 0xb7, + 0x97, 0xc3, 0x56, 0xcc, 0xb4, 0xa7, 0x3d, 0x88, 0xbf, 0xb8, 0xd5, 0x6b, + 0xa8, 0xbc, 0xbb, 0x60, 0xb9, 0x50, 0x4f, 0x94, 0xa6, 0x72, 0xcc, 0xad, + 0x68, 0xa8, 0x98, 0x8d, 0xce, 0x45, 0xbc, 0xc5, 0x6f, 0x3f, 0xb4, 0x31, + 0x7d, 0x47, 0x9b, 0xbd, 0x3b, 0x6d, 0xbf, 0xcd, 0x82, 0x59, 0x6a, 0xb8, + 0x93, 0x7c, 0x54, 0x51, 0x55, 0xd5, 0x60, 0x3f, 0x37, 0xc4, 0x72, 0x97, + 0x8d, 0x92, 0x79, 0xb1, 0xaa, 0x7a, 0x53, 0x4d, 0xcb, 0xcf, 0x9f, 0x32, + 0xbc, 0xc2, 0x7c, 0x82, 0x95, 0x92, 0xb0, 0xce, 0x58, 0x87, 0xb9, 0x99, + 
0xb2, 0xa8, 0x55, 0x40, 0x2d, 0xb8, 0x4a, 0xc0, 0x7e, 0x8e, 0x5b, 0xc7, + 0x38, 0x5e, 0xc4, 0x4e, 0x65, 0xb4, 0x45, 0x63, 0x35, 0x4a, 0x72, 0xc8, + 0x53, 0x39, 0x55, 0x9f, 0xa1, 0x48, 0x8a, 0xb1, 0x9c, 0x90, 0x8d, 0xb5, + 0x2f, 0x87, 0xcd, 0x49, 0x2e, 0x57, 0x41, 0x5f, 0xb0, 0x9d, 0x40, 0xc2, + 0x9c, 0xc5, 0x76, 0xa8, 0x3a, 0x6b, 0x5f, 0xa2, 0x6c, 0x4b, 0xc3, 0x8d, + 0x45, 0xa1, 0x53, 0x82, 0xce, 0x7b, 0x65, 0xb9, 0xa4, 0xd9, 0xbd, 0x8d, + 0x7b, 0xa4, 0xac, 0x55, 0xbe, 0x93, 0x2a, 0xab, 0xcb, 0xa8, 0xac, 0x76, + 0x3a, 0x76, 0x51, 0x70, 0xa2, 0x34, 0x7d, 0xce, 0x8d, 0x78, 0x47, 0x58, + 0x46, 0x94, 0x73, 0x32, 0xc6, 0x39, 0xc5, 0x43, 0xbc, 0x9a, 0xaf, 0xb5, + 0xc6, 0x3d, 0x46, 0x61, 0x8e, 0xaf, 0x6d, 0xd5, 0x8c, 0xcd, 0x85, 0x3c, + 0xb0, 0xba, 0xc2, 0xad, 0x59, 0x96, 0x98, 0xc4, 0x89, 0x86, 0x83, 0x5a, + 0xad, 0x80, 0x41, 0x92, 0x33, 0x57, 0xd4, 0x39, 0x95, 0x4a, 0xaa, 0x4c, + 0x92, 0xb0, 0xb4, 0xc8, 0x99, 0x36, 0xca, 0x6e, 0xc0, 0x45, 0xbf, 0x5f, + 0x90, 0x4f, 0xb8, 0x59, 0x31, 0x65, 0xaa, 0xa8, 0x33, 0x91, 0x4a, 0x49, + 0x88, 0x32, 0x50, 0xbc, 0x4a, 0x70, 0x8e, 0x5f, 0xaf, 0x4e, 0xa4, 0x2d, + 0xb4, 0x2d, 0x4b, 0xbe, 0x67, 0xaa, 0xc4, 0x53, 0x51, 0x3a, 0x6d, 0x6a, + 0x59, 0x2e, 0x22, 0x8a, 0xc8, 0x7a, 0x95, 0xb3, 0xad, 0x43, 0x87, 0x5a, + 0x93, 0x59, 0x60, 0x8a, 0x65, 0xae, 0x55, 0x8f, 0xb8, 0x85, 0xc1, 0x95, + 0x57, 0xc8, 0x85, 0xc1, 0x56, 0x2a, 0x3b, 0x56, 0xca, 0x26, 0x26, 0x38, + 0x42, 0xac, 0x5d, 0xbb, 0x2c, 0x9d, 0x56, 0x49, 0xc2, 0xb1, 0x59, 0x59, + 0x5c, 0x48, 0x6a, 0xaa, 0x42, 0x79, 0x33, 0x36, 0x9d, 0x88, 0x37, 0x52, + 0x57, 0x55, 0x96, 0xa7, 0x54, 0x6d, 0x76, 0x6d, 0x91, 0x59, 0xc6, 0x40, + 0x9c, 0x43, 0xb0, 0xa8, 0x65, 0xa3, 0x80, 0x9d, 0x51, 0x46, 0xd0, 0x9c, + 0x9a, 0x90, 0xc2, 0x63, 0xa4, 0x84, 0x87, 0xcd, 0xbc, 0x34, 0x48, 0x80, + 0x67, 0x9e, 0xc6, 0x5e, 0x85, 0x52, 0x90, 0xbd, 0x3a, 0xad, 0x5e, 0x82, + 0x96, 0x7a, 0x47, 0xac, 0x83, 0x44, 0xde, 0x56, 0x3f, 0x64, 0x54, 0x6b, + 0x31, 0x48, 0x65, 0xa0, 0x5b, 0xd5, 0x72, 0x8b, 0x7d, 0xcf, 0x70, 0x98, + 0x4a, 0x44, 0xac, 0x6c, 0xc9, 0x69, 0x8b, 0xa1, 0x5b, 0xca, 0xcb, 0xce, + 0x98, 0xab, 0xaf, 0x4d, 0x33, 0x6b, 0x45, 0x54, 0x46, 0xcc, 0x7d, 0x6b, + 0xb1, 0x9a, 0x88, 0xbc, 0x2f, 0x28, 0x3c, 0x84, 0x2d, 0xbc, 0x35, 0xa1, + 0x30, 0x71, 0x8a, 0x86, 0x8e, 0xb1, 0x76, 0x8c, 0x7e, 0x94, 0x35, 0xce, + 0xb1, 0x77, 0x81, 0x45, 0xaa, 0x48, 0x47, 0xcf, 0x56, 0x34, 0x58, 0x55, + 0x69, 0xb3, 0x36, 0x5a, 0xbf, 0x6c, 0x3a, 0x87, 0x82, 0x7b, 0x59, 0x8d, + 0x60, 0x8f, 0xa2, 0xb3, 0x86, 0x2b, 0x32, 0x88, 0x97, 0x4b, 0x71, 0x44, + 0xb2, 0x80, 0xc0, 0x5f, 0x7c, 0x65, 0x83, 0x60, 0x50, 0xa7, 0x56, 0x32, + 0xa0, 0x5e, 0x9a, 0x57, 0x6c, 0xc0, 0xb3, 0x33, 0x5d, 0x3c, 0xcb, 0x3c, + 0x7e, 0xa8, 0xd1, 0xba, 0xb1, 0xc9, 0x32, 0x61, 0xa9, 0x41, 0xaa, 0xb2, + 0x5c, 0x91, 0x5c, 0x92, 0x76, 0xc4, 0x70, 0x97, 0x65, 0x3c, 0xa0, 0x86, + 0x80, 0xc9, 0x50, 0x6e, 0x8b, 0x8d, 0xa9, 0x2b, 0x83, 0x39, 0xb3, 0x31, + 0x73, 0x77, 0x90, 0x64, 0x85, 0xba, 0x72, 0xa3, 0xbc, 0x5b, 0xa5, 0xaa, + 0xca, 0xc8, 0x94, 0xa1, 0x40, 0x47, 0xdf, 0x53, 0x74, 0xae, 0x78, 0x50, + 0x32, 0xae, 0x6a, 0x47, 0x87, 0x5f, 0xc2, 0x9d, 0xba, 0x9b, 0xb6, 0x8b, + 0x98, 0x67, 0xb4, 0x80, 0x84, 0xb9, 0x3a, 0x4d, 0x3f, 0x4c, 0x46, 0xa3, + 0x5a, 0x7c, 0x6d, 0x9d, 0xb0, 0xb0, 0x9f, 0x7a, 0xcd, 0x35, 0xc6, 0x6b, + 0x58, 0x8d, 0x3f, 0xba, 0x96, 0xa5, 0xa8, 0xc1, 0x7a, 0xc4, 0x51, 0x67, + 0xab, 0xa2, 0xb2, 0x33, 0x63, 0x6f, 0x89, 0x5d, 0x98, 0x79, 0x7e, 0xa8, + 0x48, 0x3e, 0xc3, 0xc1, 0x5b, 0x5f, 0x68, 0x88, 0xc7, 0x5f, 0xa8, 0x96, + 0x68, 0x47, 0x41, 0x64, 0x3e, 0xc0, 0xc7, 0x67, 0x4a, 0xd3, 0xa8, 0xb0, + 
0x8f, 0xcb, 0x93, 0x6b, 0x34, 0x4c, 0x61, 0x86, 0x97, 0xac, 0x69, 0xb9, + 0x7c, 0x3e, 0x39, 0x3a, 0x4d, 0xbb, 0x99, 0xc3, 0xc1, 0x9c, 0xae, 0x49, + 0xb6, 0xb7, 0xbe, 0xa3, 0x9f, 0x84, 0xb2, 0x50, 0x4c, 0x6b, 0xc7, 0x73, + 0x45, 0x99, 0x73, 0x84, 0xc7, 0xd2, 0xb0, 0x74, 0x60, 0x59, 0xbf, 0x6a, + 0x8a, 0x95, 0xad, 0x57, 0xc6, 0xd0, 0x67, 0x53, 0x54, 0x78, 0x61, 0x45, + 0x5e, 0x81, 0xd1, 0x52, 0x37, 0x9e, 0x3b, 0xa1, 0x77, 0xc5, 0x79, 0x39, + 0x67, 0x64, 0x59, 0xc1, 0xcb, 0xad, 0x48, 0x9a, 0x9f, 0x67, 0x41, 0x85, + 0xb3, 0xd2, 0x94, 0x45, 0x55, 0x93, 0xb4, 0x56, 0x98, 0xba, 0x70, 0x3f, + 0xb4, 0x5e, 0xaa, 0x41, 0x44, 0x83, 0xca, 0xa1, 0x52, 0xc1, 0x45, 0xc4, + 0x95, 0x4d, 0xce, 0x51, 0xae, 0xa3, 0xc1, 0x61, 0x45, 0x5c, 0xb6, 0x6b, + 0xc0, 0x6a, 0x47, 0x49, 0x72, 0x93, 0x65, 0x70, 0x9c, 0xa2, 0x5e, 0xc1, + 0x71, 0x88, 0x8d, 0x31, 0x3d, 0x7e, 0x8d, 0x36, 0x63, 0x5d, 0x83, 0x7e, + 0x6e, 0x36, 0xa5, 0xc1, 0x60, 0xae, 0x72, 0x89, 0x3e, 0x9f, 0x93, 0x4e, + 0x44, 0xd1, 0x61, 0x3c, 0xbf, 0x60, 0x60, 0x4b, 0x94, 0x4f, 0x93, 0x34, + 0x42, 0x39, 0xc5, 0xa6, 0x47, 0x57, 0xca, 0x7f, 0x56, 0x49, 0x88, 0x56, + 0xa8, 0xd2, 0x6b, 0x5c, 0x62, 0x75, 0x4b, 0x6a, 0x4e, 0x79, 0x4c, 0xb9, + 0x82, 0x85, 0xcc, 0x38, 0xcf, 0x8d, 0xd3, 0xb1, 0x8d, 0x67, 0x4d, 0x43, + 0xb1, 0x41, 0x6d, 0xcf, 0x39, 0x36, 0x46, 0x5f, 0x8c, 0xae, 0x93, 0xd0, + 0xa0, 0x7b, 0x93, 0x8b, 0x71, 0xae, 0x88, 0x7c, 0x65, 0x55, 0x2f, 0xcd, + 0x4d, 0x9b, 0x66, 0x76, 0x5e, 0x6e, 0xa1, 0x60, 0x91, 0xa2, 0xd3, 0xa0, + 0x57, 0x76, 0xac, 0xcf, 0x95, 0xa8, 0xa9, 0x4d, 0xab, 0x3a, 0x5a, 0x3f, + 0xa9, 0x34, 0xc3, 0x79, 0x4c, 0x39, 0xa0, 0x9b, 0x47, 0x34, 0x74, 0x94, + 0x54, 0x56, 0x82, 0xa1, 0x3c, 0xc0, 0x57, 0xad, 0x98, 0xbe, 0x6b, 0xa5, + 0x63, 0x34, 0x38, 0x7c, 0xa1, 0x4e, 0x4c, 0x48, 0x95, 0x99, 0x9f, 0x92, + 0xcb, 0x6f, 0x5a, 0x78, 0xa6, 0x93, 0xbb, 0x82, 0x79, 0x74, 0x8f, 0x9d, + 0x4f, 0xc9, 0x42, 0x93, 0x4c, 0x7b, 0xaa, 0x76, 0x46, 0x64, 0x38, 0x8e, + 0xc6, 0xb0, 0x87, 0xcc, 0x68, 0x32, 0x65, 0xc5, 0x92, 0x41, 0x85, 0xb9, + 0x77, 0x5f, 0x8d, 0xbf, 0x67, 0x7d, 0x48, 0x49, 0x51, 0x3e, 0xb7, 0x89, + 0x6b, 0x63, 0x67, 0x99, 0xaa, 0x91, 0x4d, 0xbb, 0x62, 0x66, 0x76, 0xd3, + 0x8e, 0xc3, 0x44, 0x90, 0x4b, 0x83, 0xba, 0x4b, 0x89, 0x4d, 0x53, 0x8c, + 0xc8, 0x90, 0x80, 0xa7, 0x33, 0x96, 0x51, 0x7b, 0x6c, 0x5a, 0xac, 0x78, + 0xaf, 0x67, 0xd3, 0xbf, 0xa9, 0xca, 0xcc, 0x35, 0xbc, 0x57, 0x5f, 0xb9, + 0x40, 0x88, 0x35, 0x51, 0xb4, 0x48, 0xc2, 0xb0, 0x49, 0xbd, 0x7f, 0x86, + 0xa8, 0x69, 0xb8, 0xbd, 0x59, 0x58, 0x8e, 0x95, 0x6c, 0x48, 0x35, 0xb2, + 0x80, 0x80, 0x46, 0x6a, 0x61, 0x74, 0xb7, 0x39, 0x47, 0x37, 0x3b, 0x45, + 0x3a, 0xae, 0x86, 0x5f, 0x86, 0x8a, 0xaf, 0x55, 0xcb, 0x64, 0xab, 0xa3, + 0x93, 0x3c, 0x63, 0xa8, 0xb9, 0x8f, 0x6f, 0x7d, 0x6e, 0x93, 0x99, 0xc5, + 0x96, 0x61, 0xa3, 0xb0, 0x70, 0x9d, 0xa1, 0x64, 0x73, 0x4e, 0x9d, 0xc5, + 0x9b, 0x75, 0xa0, 0x60, 0xad, 0x2f, 0x67, 0x64, 0xb5, 0x35, 0xb4, 0x89, + 0x65, 0x49, 0x8d, 0x84, 0x94, 0x75, 0xce, 0x7e, 0x81, 0x31, 0xcd, 0x90, + 0x58, 0x68, 0xc6, 0xc9, 0xa0, 0x6c, 0x32, 0xb8, 0x75, 0xcc, 0xbe, 0xa8, + 0x4f, 0x31, 0x5a, 0x42, 0x65, 0x96, 0x52, 0x94, 0x99, 0x7a, 0xa2, 0x55, + 0x34, 0x6f, 0xca, 0xcd, 0x58, 0xb4, 0xb7, 0x7b, 0x7e, 0xb8, 0xa2, 0xca, + 0x95, 0x54, 0xaf, 0x96, 0x37, 0x7e, 0xb5, 0x7d, 0x33, 0x42, 0x44, 0x78, + 0xb3, 0x58, 0x66, 0x3b, 0x68, 0x64, 0x55, 0x7f, 0xb6, 0x3c, 0x6e, 0xba, + 0xbe, 0x5c, 0x3c, 0x4a, 0xc6, 0xc8, 0xcb, 0x2b, 0xbb, 0x7b, 0x6c, 0x6c, + 0x93, 0x52, 0x39, 0xa3, 0xbe, 0xa6, 0xbc, 0x48, 0xaf, 0x3c, 0x79, 0xc3, + 0x9b, 0x75, 0x39, 0x71, 0x8a, 0xa9, 0x56, 0x6d, 0x73, 0x3e, 0x38, 0x56, + 
0x9a, 0x37, 0x73, 0xb5, 0x7e, 0xb5, 0x31, 0x9b, 0xa7, 0x8c, 0x7b, 0x36, + 0x6d, 0x6d, 0xa3, 0x44, 0x72, 0x45, 0x50, 0x58, 0x76, 0xc6, 0x41, 0x8d, + 0x54, 0x9c, 0x47, 0x93, 0x8e, 0xab, 0xad, 0xcd, 0x9c, 0xb3, 0xc3, 0xb4, + 0x39, 0xb6, 0x74, 0x4e, 0xc6, 0x72, 0x6e, 0x86, 0x5d, 0xcf, 0xa8, 0x96, + 0xa3, 0x3d, 0x6d, 0x8b, 0x5f, 0x5c, 0x7e, 0x3b, 0x5e, 0x42, 0x7b, 0x67, + 0x51, 0xae, 0x3a, 0x3d, 0xbe, 0xbe, 0x70, 0x53, 0xb2, 0xae, 0x4d, 0x62, + 0xa9, 0xab, 0x8b, 0xb2, 0xc0, 0xc7, 0x42, 0xac, 0x7f, 0x98, 0x3d, 0x5a, + 0x62, 0x78, 0x60, 0x51, 0x4c, 0x35, 0x67, 0x4b, 0x68, 0x65, 0x6a, 0x9f, + 0x5d, 0x96, 0x61, 0xaa, 0x3a, 0x8c, 0xb2, 0x6f, 0x34, 0x8f, 0x33, 0xa6, + 0x44, 0x79, 0x83, 0xce, 0xa4, 0x77, 0xb1, 0x47, 0xaa, 0x38, 0x4c, 0x40, + 0xd0, 0x3a, 0x54, 0xce, 0x2d, 0xc3, 0x8d, 0x42, 0xd2, 0xc3, 0x4d, 0x6d, + 0x92, 0x37, 0x4d, 0x95, 0x71, 0xce, 0xb2, 0x73, 0x91, 0x84, 0x57, 0xc8, + 0x5d, 0xbf, 0x4d, 0xad, 0x9d, 0xc3, 0x71, 0x34, 0x70, 0x7a, 0x39, 0x6d, + 0xa0, 0x6d, 0x8d, 0xd1, 0xac, 0x75, 0x80, 0xd2, 0x95, 0xce, 0x94, 0x93, + 0xcd, 0x4f, 0xca, 0x9f, 0x46, 0xb3, 0xc2, 0x94, 0x51, 0x90, 0xab, 0x6c, + 0x73, 0x66, 0x60, 0x4e, 0x4c, 0xa6, 0x8f, 0xa4, 0x5b, 0xae, 0x8e, 0x9b, + 0xa8, 0xcb, 0xc3, 0x75, 0x53, 0x54, 0xc9, 0x90, 0x4c, 0x77, 0x87, 0x73, + 0xd0, 0x95, 0x8d, 0x61, 0x3b, 0x2a, 0x68, 0x8e, 0x5d, 0xb7, 0x53, 0xa4, + 0x36, 0x2a, 0x73, 0x8f, 0x7a, 0x89, 0x62, 0x57, 0xc9, 0x82, 0x8c, 0x37, + 0xcb, 0x62, 0x8c, 0xa4, 0x54, 0x6a, 0xaf, 0x7b, 0xa2, 0xbd, 0x72, 0x89, + 0xd9, 0x53, 0x9c, 0x67, 0xbb, 0x36, 0x6d, 0x54, 0xba, 0x3d, 0x36, 0x90, + 0x6a, 0x8a, 0x82, 0x3a, 0x94, 0xcc, 0xb2, 0xa3, 0xb4, 0x48, 0xc0, 0xc5, + 0x74, 0xcb, 0xc3, 0x9d, 0x65, 0x68, 0xb2, 0xb2, 0xa9, 0x9b, 0xb6, 0x99, + 0xaa, 0x8a, 0x35, 0x56, 0x50, 0xd2, 0x9c, 0xa6, 0x72, 0x5e, 0x8d, 0x9b, + 0x49, 0x83, 0x7d, 0x8e, 0x91, 0xba, 0x5e, 0x6f, 0x4c, 0x58, 0xc3, 0xad, + 0x72, 0xb4, 0x8c, 0x78, 0x74, 0x3c, 0x82, 0xc5, 0x9d, 0x42, 0x97, 0x9d, + 0x6d, 0x81, 0x47, 0xd4, 0x7d, 0x48, 0xaf, 0xb9, 0x99, 0xcb, 0x7c, 0x8f, + 0x63, 0xc4, 0x94, 0x9f, 0xc5, 0x72, 0xbf, 0x4e, 0x92, 0x64, 0x41, 0xb2, + 0x92, 0x85, 0x69, 0xa2, 0x43, 0x79, 0xb2, 0x93, 0x79, 0x61, 0x55, 0x88, + 0x58, 0x48, 0x66, 0xc6, 0xaf, 0x65, 0x3d, 0xd0, 0xb6, 0xd6, 0xa8, 0xa9, + 0x6d, 0x84, 0x87, 0xc8, 0xb5, 0x95, 0x90, 0x86, 0x99, 0x5d, 0xc8, 0xba, + 0x49, 0xc5, 0xca, 0x3f, 0x99, 0xc5, 0x95, 0x74, 0x91, 0xa3, 0x55, 0x82, + 0x3b, 0xa5, 0x56, 0xcb, 0x82, 0x8b, 0x60, 0xc8, 0xc2, 0x99, 0x3d, 0x6f, + 0xb5, 0x7c, 0x55, 0x8e, 0x9e, 0xae, 0x60, 0x73, 0xa9, 0x60, 0x8a, 0xaf, + 0xa7, 0x5b, 0x44, 0x96, 0x50, 0x44, 0x81, 0x49, 0x9d, 0x46, 0x8c, 0x33, + 0xc4, 0xc4, 0xd0, 0x65, 0xb3, 0x53, 0x9e, 0x7e, 0x83, 0x57, 0x80, 0x91, + 0x52, 0xca, 0xbf, 0x35, 0x69, 0x77, 0x5f, 0xac, 0x5a, 0xd0, 0x94, 0xa9, + 0x65, 0x78, 0xd0, 0x91, 0x9f, 0xb5, 0xac, 0x32, 0x8e, 0x2c, 0x83, 0x44, + 0x6f, 0xac, 0xa0, 0xcd, 0xae, 0x7b, 0x89, 0xcb, 0x66, 0x5f, 0x4a, 0x37, + 0x65, 0xb4, 0x77, 0x48, 0xa6, 0x57, 0xcb, 0xad, 0x81, 0xa5, 0x63, 0x58, + 0x3e, 0xbb, 0xa9, 0x5a, 0x3b, 0xbe, 0x4f, 0x39, 0x66, 0xaf, 0xc0, 0x85, + 0xc4, 0xf5, 0xdd, 0xd2, 0x8c, 0xc7, 0x75, 0x59, 0x46, 0xaf, 0x44, 0xa5, + 0xb5, 0x6c, 0x83, 0x63, 0x8d, 0x69, 0x3f, 0x50, 0xad, 0x74, 0x9b, 0x5c, + 0x61, 0x39, 0xd0, 0x42, 0x50, 0xb4, 0x5c, 0x6a, 0x93, 0x33, 0xcf, 0x97, + 0x48, 0xc5, 0xb1, 0x93, 0x7b, 0x6e, 0x93, 0x81, 0xcf, 0x7b, 0x85, 0xd0, + 0xb6, 0x78, 0x56, 0x88, 0x5b, 0x7e, 0xa8, 0x61, 0xac, 0xc9, 0x96, 0x70, + 0xa0, 0x5b, 0xb2, 0x2a, 0xbf, 0x77, 0x7c, 0x3b, 0x78, 0x59, 0xc0, 0x40, + 0x87, 0x3a, 0x94, 0x8c, 0x6e, 0xc1, 0xa7, 0x76, 0x48, 0xb9, 0xac, 0x7a, + 
0xae, 0x9c, 0x3a, 0x77, 0x54, 0x72, 0x93, 0x86, 0x87, 0xa3, 0x79, 0x30, + 0x79, 0xe5, 0x5d, 0x54, 0xb6, 0x5f, 0x56, 0x49, 0xa5, 0x84, 0x99, 0x68, + 0x81, 0x63, 0x6f, 0x95, 0x6f, 0xb4, 0x7c, 0x5a, 0xb6, 0xc3, 0x9f, 0x5e, + 0x47, 0xb5, 0x71, 0x33, 0x89, 0xc1, 0x85, 0x9a, 0xd1, 0xb4, 0x5c, 0xa6, + 0x4a, 0x9c, 0x84, 0xcf, 0xbb, 0x46, 0x98, 0x3f, 0xd1, 0x7d, 0xb3, 0x90, + 0xa2, 0xb8, 0x4e, 0xbc, 0x8a, 0x82, 0xbd, 0x87, 0xbc, 0x53, 0x34, 0xc5, + 0xcf, 0xbe, 0x30, 0x61, 0x7f, 0x4b, 0xc4, 0x36, 0xce, 0x3d, 0x8e, 0x91, + 0xbe, 0xc6, 0xbf, 0xa5, 0x54, 0xb8, 0x5a, 0x67, 0x98, 0x96, 0x59, 0x99, + 0x39, 0x82, 0x57, 0x3f, 0xc0, 0xba, 0xb6, 0x6a, 0x73, 0x7f, 0x5b, 0x78, + 0xc3, 0xce, 0x34, 0x4f, 0x34, 0xb5, 0xd5, 0xc2, 0x56, 0x87, 0x7b, 0x3c, + 0xb1, 0x49, 0xbb, 0xaa, 0xd0, 0xb5, 0x53, 0x60, 0x4e, 0x43, 0xb2, 0x3b, + 0xd8, 0xd3, 0x5e, 0x2a, 0xa7, 0x61, 0x56, 0x65, 0xa1, 0xc0, 0xce, 0xd6, + 0x4d, 0xd1, 0xdc, 0x63, 0xa6, 0xab, 0x91, 0x31, 0x51, 0xa6, 0xac, 0xaa, + 0xd2, 0x8e, 0x52, 0xae, 0x9c, 0x9f, 0x7a, 0x50, 0x64, 0x9b, 0xae, 0x77, + 0xb6, 0xa2, 0x3b, 0x75, 0x7e, 0x9b, 0xa5, 0x6c, 0x83, 0xc4, 0xa6, 0xa5, + 0xd3, 0xc1, 0x39, 0x75, 0x82, 0x44, 0x94, 0x79, 0xb9, 0x51, 0x6e, 0x97, + 0x45, 0x5f, 0x5c, 0xcf, 0x7a, 0x7f, 0x97, 0x33, 0x7b, 0x75, 0x3e, 0x7a, + 0xa3, 0x47, 0xb0, 0xe2, 0x63, 0x6d, 0x54, 0xcb, 0xd3, 0xa1, 0x55, 0x20, + 0x57, 0x72, 0x34, 0x41, 0x75, 0xbd, 0x65, 0xbc, 0xb6, 0xa8, 0x80, 0x38, + 0xb5, 0x85, 0x8d, 0x9c, 0xb9, 0x43, 0x9b, 0x3d, 0x32, 0xd3, 0x40, 0xa1, + 0x9f, 0x34, 0x47, 0xc1, 0x39, 0xc4, 0x45, 0xc1, 0x86, 0x67, 0x7c, 0xcb, + 0x46, 0x4c, 0x77, 0xbc, 0xd6, 0x39, 0x69, 0x52, 0x59, 0xbb, 0xbb, 0x2d, + 0x5b, 0xd0, 0x97, 0x4f, 0x77, 0x72, 0x6e, 0x2f, 0xb2, 0x6e, 0x99, 0xbe, + 0x89, 0x62, 0x46, 0x33, 0x3c, 0x47, 0x8a, 0x4c, 0xbb, 0x40, 0x6c, 0x3a, + 0xb8, 0xe0, 0x49, 0xa4, 0x80, 0x72, 0x91, 0xa9, 0x85, 0xb9, 0xb0, 0x8e, + 0x92, 0x74, 0x7d, 0xa9, 0xc5, 0xad, 0xa8, 0x54, 0x92, 0xc7, 0xb6, 0x58, + 0x3f, 0xc6, 0x44, 0x33, 0x5d, 0x5c, 0x73, 0x8c, 0x96, 0x60, 0xd6, 0xad, + 0x77, 0xb2, 0x60, 0x26, 0x35, 0xa0, 0x9a, 0x39, 0x46, 0x37, 0x53, 0x41, + 0x7d, 0x84, 0x89, 0x61, 0x7e, 0x30, 0xc6, 0x8b, 0x66, 0xdb, 0xc8, 0xb8, + 0xb1, 0x5d, 0xdc, 0x8a, 0x19, 0x6f, 0xad, 0x96, 0x6d, 0x40, 0xd0, 0xb6, + 0x4e, 0x8b, 0x33, 0x6c, 0xd8, 0x9d, 0xa5, 0xbc, 0x65, 0x63, 0xd1, 0xd7, + 0xcb, 0xa8, 0x45, 0x71, 0x61, 0x78, 0x6c, 0x2e, 0xc4, 0x69, 0x87, 0x54, + 0x2b, 0xae, 0x46, 0xcb, 0x7e, 0x6f, 0xb4, 0x3e, 0x4e, 0x5c, 0x34, 0xa1, + 0x40, 0xc5, 0xc7, 0x34, 0xb0, 0x75, 0x75, 0x33, 0x30, 0x40, 0x62, 0x7f, + 0xb4, 0x92, 0xd0, 0xcb, 0x7a, 0xcc, 0xd7, 0x7f, 0x60, 0x3a, 0xd1, 0xa1, + 0xd1, 0x82, 0x7d, 0x8a, 0xbe, 0xac, 0x8f, 0x38, 0x3e, 0x5e, 0x67, 0x50, + 0xcd, 0x8b, 0x70, 0x7b, 0x63, 0x67, 0x64, 0x6c, 0x60, 0x91, 0xa4, 0x80, + 0x6a, 0xcf, 0x73, 0xa0, 0x2b, 0x37, 0x71, 0x56, 0xc2, 0x44, 0x3b, 0x61, + 0xb5, 0x33, 0x94, 0x99, 0x66, 0xb4, 0x72, 0x45, 0x6c, 0x2c, 0xd6, 0x6c, + 0x91, 0x71, 0x9f, 0x1f, 0x49, 0xc9, 0xc8, 0xa3, 0x7c, 0xb9, 0x61, 0x58, + 0xc1, 0xa0, 0xa3, 0xb9, 0xbf, 0xc2, 0x9e, 0xc9, 0xbe, 0x4d, 0x5b, 0x40, + 0xa3, 0x66, 0xc2, 0x86, 0x72, 0x75, 0x99, 0x98, 0x4a, 0xa1, 0x3b, 0xa7, + 0x8b, 0x41, 0x56, 0x80, 0xbd, 0x42, 0x9e, 0xa3, 0x88, 0xa8, 0x50, 0xa6, + 0x54, 0x6b, 0xa1, 0xa3, 0x40, 0xa4, 0xa5, 0x59, 0x60, 0x48, 0x4d, 0x91, + 0xb1, 0x73, 0xa6, 0x94, 0x2e, 0x92, 0xa5, 0xaf, 0x5b, 0xd8, 0x71, 0x95, + 0xb4, 0x54, 0x6d, 0x40, 0x63, 0xb1, 0x3a, 0x3d, 0x4b, 0xa5, 0x5c, 0x7c, + 0x45, 0x55, 0x4a, 0xb4, 0xac, 0xa5, 0x9f, 0xb7, 0x8b, 0xa9, 0x82, 0x5b, + 0x60, 0x98, 0xaf, 0x76, 0x4f, 0x83, 0xdb, 0x9c, 0xa4, 0xac, 0x3e, 0x7a, + 
0x3e, 0x87, 0x3d, 0xb4, 0x56, 0xc1, 0x66, 0xa1, 0xbc, 0x7f, 0x3a, 0x7f, + 0x44, 0x59, 0x6b, 0xa1, 0xcb, 0xc4, 0x7a, 0x39, 0xb2, 0x76, 0x6e, 0x3c, + 0x73, 0xd2, 0xcf, 0xa2, 0x60, 0xce, 0x45, 0x5a, 0x50, 0x89, 0x95, 0xbd, + 0xc0, 0xb6, 0xbc, 0xc3, 0x80, 0x69, 0xc1, 0x2f, 0x68, 0x34, 0x7a, 0x5d, + 0xa7, 0x9e, 0x3d, 0x85, 0x3d, 0x65, 0x58, 0x9e, 0xb3, 0xae, 0xc4, 0x9d, + 0x82, 0x61, 0x7a, 0x77, 0xc5, 0x84, 0x62, 0x6f, 0x63, 0xaa, 0x4f, 0xa0, + 0x8f, 0x82, 0x51, 0x89, 0xcb, 0xb1, 0x80, 0x40, 0x70, 0x42, 0x80, 0x40, + 0x7f, 0x5f, 0x9f, 0xc4, 0xab, 0x77, 0xce, 0xa6, 0xd3, 0x50, 0x79, 0xbc, + 0x64, 0x94, 0x69, 0x8a, 0x5e, 0x61, 0xb9, 0x9b, 0x63, 0x7a, 0xb8, 0x3e, + 0x91, 0x31, 0x70, 0xad, 0xba, 0x77, 0xb0, 0x43, 0x4e, 0x69, 0x90, 0xaa, + 0x34, 0x44, 0xc9, 0xbf, 0xc6, 0x56, 0xbd, 0xa2, 0xb1, 0x6c, 0x76, 0x69, + 0xbb, 0x90, 0x7a, 0xbb, 0x71, 0xbf, 0xbe, 0xd9, 0x8f, 0x9f, 0x39, 0x5b, + 0x62, 0x8e, 0x40, 0x6d, 0xdf, 0x70, 0x7f, 0xa8, 0xc0, 0x3f, 0x61, 0x30, + 0x5b, 0xba, 0xb2, 0x71, 0x43, 0xaa, 0x70, 0xa6, 0x9d, 0xc0, 0xc4, 0x5a, + 0x5d, 0x66, 0x48, 0x43, 0xca, 0xac, 0x41, 0x6a, 0xa0, 0xc4, 0x48, 0xa3, + 0x38, 0x35, 0x5c, 0x4f, 0x2d, 0xcc, 0x5c, 0x50, 0x83, 0x82, 0xb5, 0x51, + 0xae, 0x7c, 0xaf, 0x9d, 0x7f, 0x45, 0x7a, 0x4e, 0xad, 0x5e, 0xc7, 0x8a, + 0xa3, 0x43, 0x90, 0xcf, 0x2b, 0xc3, 0x7c, 0xc4, 0x6b, 0x7e, 0x9e, 0xaf, + 0xb9, 0xc0, 0xa0, 0x6a, 0x4b, 0xa4, 0x56, 0xb4, 0x80, 0xb0, 0x6f, 0x51, + 0x84, 0x99, 0xbc, 0x5f, 0x9d, 0x83, 0x76, 0x3a, 0x52, 0x63, 0x31, 0x53, + 0x38, 0x42, 0x4d, 0xb4, 0x95, 0xa6, 0x34, 0xcc, 0x68, 0x61, 0x89, 0x89, + 0x86, 0x78, 0x95, 0x85, 0x76, 0x86, 0x36, 0x4a, 0xa4, 0x58, 0x45, 0x38, + 0x96, 0x57, 0xab, 0x72, 0x86, 0x8c, 0x7f, 0x95, 0xc0, 0x52, 0x73, 0x67, + 0x54, 0x6f, 0x91, 0x83, 0xa7, 0x40, 0x4b, 0x40, 0x8b, 0x7a, 0x91, 0xc9, + 0x4c, 0x98, 0x86, 0x77, 0xb8, 0x89, 0x64, 0xca, 0x7c, 0x76, 0xdd, 0xa0, + 0x59, 0xb0, 0x45, 0x52, 0xad, 0xa5, 0x7d, 0x7d, 0xd4, 0x88, 0x7f, 0x93, + 0x8b, 0xd4, 0x90, 0x3d, 0xca, 0xb1, 0x64, 0x9f, 0x8c, 0x3a, 0xb6, 0xb3, + 0x5a, 0x58, 0xbe, 0xbc, 0x98, 0x9c, 0x7e, 0x49, 0x2d, 0x88, 0x6c, 0xbf, + 0xc8, 0x9e, 0x2f, 0x6c, 0x7b, 0xa3, 0x66, 0xc1, 0x79, 0x89, 0x89, 0x9e, + 0x3f, 0x9c, 0xc4, 0x3f, 0x3e, 0x68, 0xda, 0x90, 0xa2, 0x4c, 0x32, 0xd0, + 0xe6, 0xe8, 0xaa, 0xd2, 0xab, 0xb0, 0x41, 0x8e, 0x63, 0x5a, 0xcb, 0x51, + 0xcf, 0xaa, 0x4c, 0xce, 0xe0, 0x89, 0xcd, 0x68, 0x9d, 0x6f, 0x7a, 0xc3, + 0x73, 0x97, 0x36, 0xbf, 0x68, 0x4a, 0x96, 0x5b, 0xbd, 0x56, 0x57, 0xb7, + 0x6d, 0x85, 0x99, 0x6f, 0x52, 0x4e, 0xb4, 0x4d, 0x32, 0x78, 0x5b, 0x76, + 0xa6, 0x50, 0x72, 0xe4, 0x97, 0xa7, 0xd7, 0x5a, 0x79, 0x2b, 0x3c, 0x93, + 0x80, 0x65, 0x3c, 0xb9, 0x35, 0x62, 0x66, 0x92, 0xc8, 0x4c, 0xbb, 0x66, + 0x77, 0x85, 0x96, 0x68, 0x8f, 0xd8, 0xe8, 0xb9, 0xcf, 0x3c, 0x32, 0x4c, + 0x6e, 0x5f, 0x81, 0x4a, 0xb5, 0xb0, 0x6a, 0xcd, 0xcf, 0x53, 0x8e, 0xc6, + 0xa7, 0x52, 0x2e, 0x31, 0x69, 0x5a, 0xd2, 0x5b, 0x3e, 0x97, 0x3b, 0xb5, + 0x91, 0x67, 0xd6, 0x46, 0xc0, 0x48, 0xba, 0x43, 0x9b, 0x3a, 0x72, 0xbe, + 0x6a, 0xe9, 0xc2, 0x73, 0x58, 0x6b, 0x70, 0xb7, 0xc8, 0xaf, 0x74, 0x6e, + 0x75, 0x5f, 0x79, 0x63, 0xbe, 0x8a, 0x3b, 0x71, 0x3a, 0x53, 0x5c, 0xc8, + 0x7c, 0xde, 0xa5, 0xb3, 0xbb, 0x4e, 0xc2, 0x3d, 0x61, 0x9e, 0x94, 0xaf, + 0x57, 0x8c, 0x63, 0x93, 0xd3, 0xa5, 0x58, 0x52, 0xb4, 0x43, 0x85, 0x66, + 0x7a, 0x36, 0xa0, 0x97, 0x65, 0x7f, 0x69, 0x42, 0x9b, 0x69, 0x5f, 0x41, + 0xaf, 0x32, 0x5f, 0xba, 0xb5, 0xaf, 0xa8, 0x4a, 0x99, 0x9d, 0x8f, 0x85, + 0x55, 0x5a, 0x31, 0xaa, 0x3c, 0xb4, 0xb7, 0xc2, 0xb8, 0x48, 0xda, 0x99, + 0x40, 0x7d, 0x6d, 0x47, 0xdb, 0x90, 0x8f, 0x44, 0x84, 0x90, 0x7c, 0x9c, + 
0xb6, 0xb6, 0x51, 0xb0, 0xa6, 0x49, 0x64, 0xd1, 0x42, 0xd0, 0x97, 0x89, + 0x66, 0xc9, 0x63, 0x58, 0xc8, 0xcd, 0x74, 0x49, 0xc7, 0xc4, 0xac, 0x63, + 0xa1, 0x65, 0x40, 0x9a, 0x71, 0xa1, 0x9f, 0xb1, 0x59, 0xa8, 0x46, 0xa5, + 0x64, 0x44, 0xca, 0x7d, 0xb2, 0xbb, 0x81, 0x8c, 0xb4, 0x42, 0x41, 0xba, + 0xb9, 0xd0, 0x41, 0x4e, 0x39, 0x4c, 0x38, 0x61, 0x52, 0x90, 0xc6, 0x7a, + 0xce, 0xd2, 0x72, 0x77, 0x3e, 0x99, 0x78, 0xb4, 0x8b, 0x4e, 0x39, 0x70, + 0xb0, 0x45, 0xa6, 0x91, 0x61, 0xbd, 0x49, 0x35, 0x6c, 0x70, 0x3a, 0x3f, + 0xc4, 0x3a, 0x61, 0x33, 0x69, 0x85, 0xb2, 0x56, 0x40, 0x3d, 0x4f, 0x3e, + 0x56, 0x95, 0xc2, 0x3d, 0x87, 0x2c, 0x89, 0x42, 0xae, 0xac, 0x88, 0x5a, + 0x98, 0x44, 0x40, 0x5a, 0x48, 0x6f, 0x39, 0x2e, 0x93, 0x4b, 0x2b, 0x79, + 0x6d, 0x5d, 0x82, 0xb7, 0x64, 0x95, 0xbc, 0xbb, 0xad, 0xc4, 0x49, 0xbe, + 0x8e, 0x5a, 0xc1, 0x2b, 0x33, 0x33, 0x51, 0x61, 0x8f, 0x88, 0xbf, 0xba, + 0x50, 0x4c, 0x32, 0xb5, 0xb1, 0x53, 0x27, 0x34, 0x5c, 0x74, 0xe7, 0x47, + 0x5f, 0x78, 0x80, 0x64, 0x9c, 0x9f, 0x65, 0xb7, 0x58, 0x57, 0x61, 0x3f, + 0x66, 0x8d, 0x90, 0xc3, 0xa0, 0x35, 0x87, 0x96, 0x6b, 0x89, 0x82, 0xa7, + 0x90, 0xde, 0x9f, 0x32, 0xc0, 0x8e, 0x8b, 0x28, 0x59, 0x92, 0xbc, 0xa7, + 0x3b, 0x8e, 0x77, 0x76, 0xc2, 0x97, 0xc9, 0xb4, 0x34, 0x4e, 0x51, 0xc8, + 0x5e, 0xa1, 0x3c, 0x8f, 0x34, 0xab, 0x65, 0xc8, 0x7b, 0x96, 0xc1, 0x70, + 0xbf, 0x78, 0x55, 0x5c, 0xb1, 0x66, 0x68, 0xbe, 0x95, 0xa9, 0x34, 0xbc, + 0xc0, 0x4c, 0x6e, 0xd0, 0x7d, 0x80, 0xaf, 0x39, 0xb6, 0x30, 0x5a, 0x2e, + 0x5d, 0xb0, 0x43, 0x38, 0xaa, 0x5e, 0x5e, 0x5b, 0x72, 0xaf, 0x42, 0x4e, + 0x80, 0x71, 0x9a, 0xcb, 0x68, 0xab, 0x6f, 0x4b, 0x54, 0x80, 0x9f, 0x7c, + 0x9c, 0x3e, 0x3c, 0x4c, 0x35, 0x3d, 0xc2, 0x90, 0x88, 0x64, 0x91, 0xa7, + 0xce, 0x3e, 0x9c, 0x69, 0xbb, 0x6b, 0x9d, 0xae, 0x71, 0xaa, 0x97, 0xc1, + 0x44, 0xa6, 0x99, 0x60, 0x75, 0x69, 0x9f, 0x3d, 0x7b, 0x58, 0xb3, 0xaa, + 0x5a, 0xc1, 0x4e, 0xb3, 0xbd, 0x47, 0x7e, 0x73, 0xc6, 0x42, 0x8f, 0xc7, + 0xaf, 0x44, 0x96, 0xa1, 0x75, 0x97, 0x96, 0x80, 0x42, 0x8d, 0xb9, 0x8f, + 0x41, 0x63, 0xc7, 0x74, 0xb6, 0x74, 0x93, 0x79, 0xc3, 0xc2, 0xae, 0x60, + 0x77, 0x43, 0x4d, 0x3c, 0x49, 0x5d, 0xa9, 0xae, 0xc5, 0xb6, 0xa7, 0xd3, + 0x62, 0x5b, 0x97, 0xc2, 0xd7, 0xb3, 0xba, 0xb3, 0x57, 0x75, 0xcc, 0x37, + 0x33, 0x6a, 0xc6, 0xb3, 0x80, 0x7a, 0x81, 0x66, 0xa7, 0xb3, 0xcb, 0x43, + 0xbb, 0xb8, 0x3e, 0x32, 0x9a, 0x7a, 0x3b, 0x8c, 0xa3, 0x87, 0x4f, 0x71, + 0x89, 0x5b, 0x98, 0xd4, 0x91, 0x9c, 0xc4, 0x55, 0x4d, 0x65, 0x9b, 0x84, + 0x50, 0x92, 0xa5, 0x89, 0xac, 0x6c, 0x36, 0xb7, 0x99, 0xa9, 0xb1, 0x7f, + 0x58, 0x55, 0x64, 0x99, 0x84, 0xa9, 0x7b, 0x8f, 0xc1, 0xd0, 0x98, 0x41, + 0x77, 0xd7, 0x77, 0x99, 0x6f, 0x73, 0x43, 0x47, 0x2f, 0x30, 0x5f, 0xa4, + 0x92, 0x91, 0xde, 0x8b, 0x75, 0x5c, 0xa0, 0x61, 0x88, 0xcc, 0x4f, 0xc5, + 0xa3, 0x96, 0x3d, 0xc9, 0x88, 0xc8, 0x70, 0x5d, 0x89, 0x88, 0x82, 0x3c, + 0x5a, 0xb1, 0x54, 0xcb, 0x9c, 0x8a, 0xc3, 0x5f, 0x91, 0xca, 0x8d, 0xc0, + 0x59, 0xc4, 0x63, 0xc5, 0xa0, 0x9f, 0x50, 0x72, 0x66, 0xa9, 0x8e, 0xc9, + 0xc4, 0xd6, 0x87, 0xc3, 0x40, 0xcc, 0x58, 0x50, 0xa8, 0x6c, 0xcb, 0x97, + 0xb7, 0x3f, 0x36, 0x49, 0x7e, 0xcc, 0x7e, 0xc5, 0x52, 0x9e, 0xd3, 0x7a, + 0xb8, 0x3d, 0xd7, 0xa7, 0xa5, 0x58, 0x9d, 0xa0, 0x8f, 0xbc, 0x8c, 0x5c, + 0xd0, 0x74, 0x84, 0x60, 0xb9, 0x93, 0xc6, 0x9a, 0xb4, 0x26, 0x56, 0xaa, + 0x3d, 0x88, 0xb4, 0xab, 0x43, 0x32, 0x67, 0x65, 0x8f, 0x53, 0xcc, 0xae, + 0xca, 0x9d, 0x33, 0x96, 0x41, 0xa5, 0x9e, 0xa9, 0xb7, 0x87, 0x8e, 0x6a, + 0x3d, 0x43, 0xa7, 0x51, 0xb4, 0xc3, 0x63, 0x68, 0x9a, 0x8e, 0x90, 0x61, + 0x8e, 0xcf, 0x88, 0x58, 0xc6, 0xa4, 0xbb, 0xa6, 0xa0, 0x53, 0x67, 0xa9, + 
0x9c, 0x9c, 0x59, 0x4f, 0x7b, 0x6a, 0x62, 0xce, 0xbb, 0x7d, 0x83, 0xa5, + 0x36, 0x5e, 0x5f, 0x48, 0xa0, 0x54, 0xa0, 0x45, 0x9d, 0x5f, 0xcd, 0x38, + 0x92, 0xd8, 0x6e, 0xa9, 0x6f, 0x54, 0x4e, 0xa0, 0x3f, 0x40, 0xa3, 0xae, + 0x36, 0xb3, 0xd9, 0x51, 0x95, 0x6e, 0x7d, 0x59, 0xa6, 0xaf, 0x79, 0x36, + 0x5d, 0xa7, 0x80, 0xd3, 0x55, 0x85, 0x5e, 0x7d, 0xc4, 0xae, 0x70, 0x98, + 0x3f, 0x64, 0x27, 0x7a, 0x49, 0x4e, 0xad, 0x82, 0x56, 0x92, 0x7f, 0xc4, + 0xc4, 0x3d, 0x8c, 0x8a, 0x37, 0x39, 0x4e, 0x67, 0x85, 0xca, 0xb5, 0x54, + 0x8a, 0x9c, 0x52, 0x87, 0x8b, 0x82, 0x76, 0xb6, 0xb4, 0x3a, 0x46, 0x6c, + 0xa8, 0xa2, 0x4f, 0x77, 0x95, 0x7b, 0x36, 0x6a, 0x3b, 0x82, 0x67, 0x9d, + 0x5a, 0xa4, 0x9f, 0x7c, 0x59, 0x2e, 0x76, 0xcd, 0x37, 0x93, 0x97, 0x68, + 0xb6, 0xc1, 0xb3, 0x9b, 0x54, 0xc6, 0x70, 0xd5, 0xaf, 0xb0, 0x75, 0x7e, + 0x59, 0x47, 0x65, 0xb7, 0xbe, 0x93, 0x54, 0x89, 0x22, 0x8f, 0x86, 0x51, + 0x60, 0x3d, 0xc0, 0x6c, 0x40, 0xc7, 0xbb, 0xc2, 0xd8, 0x36, 0x6e, 0x34, + 0x45, 0xa3, 0x4f, 0xbf, 0xb4, 0xa4, 0x36, 0x78, 0x85, 0x85, 0x77, 0x62, + 0xa0, 0x83, 0x73, 0x40, 0x39, 0x3a, 0x89, 0x74, 0x67, 0x79, 0x6e, 0x39, + 0x7e, 0xcd, 0x99, 0xa7, 0x43, 0xbf, 0xa0, 0x71, 0x7e, 0x6d, 0x63, 0xb2, + 0x95, 0x2f, 0x41, 0x90, 0xa3, 0x7d, 0x45, 0x63, 0xd1, 0xcf, 0x8e, 0x93, + 0xa1, 0x81, 0x59, 0x59, 0x85, 0x2e, 0x7d, 0x97, 0x5e, 0xa1, 0x54, 0x31, + 0xb8, 0xae, 0xdb, 0xa3, 0xc6, 0xc2, 0x91, 0x85, 0xa1, 0x9e, 0x71, 0x5d, + 0xcc, 0xd3, 0xb0, 0x73, 0xc0, 0x60, 0x4d, 0x47, 0x32, 0x35, 0xa7, 0x49, + 0xbf, 0x3b, 0xa9, 0xa0, 0xa4, 0x55, 0x8b, 0xb6, 0x59, 0x58, 0x5f, 0xb0, + 0xa8, 0xac, 0x7a, 0x7f, 0x60, 0x68, 0x81, 0x72, 0xb5, 0x75, 0xb1, 0x96, + 0x47, 0x4f, 0xb7, 0x59, 0xbf, 0xa8, 0x8e, 0xa7, 0x84, 0x8d, 0xb0, 0xac, + 0x81, 0x3e, 0x50, 0x6c, 0x4c, 0xa4, 0xcd, 0xc5, 0x87, 0x3b, 0x48, 0xa3, + 0x45, 0xb1, 0x38, 0x35, 0x8e, 0x61, 0x5a, 0xa2, 0xbd, 0xbf, 0x3b, 0x98, + 0x82, 0xd0, 0x57, 0x97, 0x90, 0xa3, 0x95, 0x78, 0xa0, 0x45, 0x4b, 0x5e, + 0x52, 0x84, 0x6d, 0xb8, 0xa5, 0x3d, 0x8a, 0x95, 0x8a, 0x44, 0x36, 0x8d, + 0x71, 0x61, 0xd2, 0x44, 0x53, 0x43, 0xc9, 0x89, 0xa5, 0x36, 0x9e, 0x5a, + 0x73, 0x57, 0x3a, 0x7b, 0xa5, 0xc8, 0x7f, 0x74, 0x4b, 0x80, 0x7e, 0xbf, + 0x8e, 0x55, 0x35, 0x43, 0xc9, 0xb6, 0xae, 0x5b, 0xa0, 0xd0, 0x95, 0xb9, + 0x7e, 0x32, 0x66, 0x71, 0xa4, 0xa0, 0x61, 0x6f, 0x62, 0x52, 0xb0, 0x5f, + 0x77, 0xf, 0x95, 0xa8, 0x3a, 0x98, 0x97, 0x83, 0x9d, 0x47, 0xb6, 0x38, + 0x7a, 0x30, 0x44, 0x65, 0x6a, 0x72, 0x7e, 0x49, 0xb7, 0x40, 0x64, 0x81, + 0x53, 0xa4, 0x9f, 0x52, 0x42, 0x7a, 0x9d, 0xa0, 0x9b, 0xdf, 0x98, 0xa8, + 0x6f, 0x45, 0x77, 0x73, 0xa3, 0xc1, 0x9d, 0x45, 0x9e, 0x59, 0x9e, 0xa6, + 0x53, 0xd1, 0xbe, 0x34, 0xc4, 0xbd, 0x55, 0xaa, 0xd0, 0x53, 0x70, 0x9b, + 0x73, 0x7b, 0x81, 0x87, 0x33, 0x4b, 0x5d, 0xdb, 0x8e, 0xa3, 0xba, 0x53, + 0x9a, 0x8c, 0xbb, 0xd8, 0x83, 0x6f, 0xc0, 0xc7, 0x39, 0x35, 0x69, 0xce, + 0xa9, 0x8a, 0xd0, 0x7b, 0x63, 0x70, 0x4b, 0x4b, 0xc4, 0xc7, 0x54, 0x44, + 0x3b, 0x91, 0x8d, 0xa3, 0x4b, 0x91, 0xbf, 0x33, 0xce, 0x5b, 0xa5, 0x3a, + 0x36, 0x44, 0x3f, 0xb8, 0x8f, 0x9f, 0x63, 0x5b, 0x7c, 0x43, 0x7c, 0xb0, + 0x6e, 0x76, 0xbd, 0x43, 0x60, 0xcd, 0x94, 0xba, 0x74, 0xb8, 0x3b, 0xa6, + 0x4a, 0x74, 0x58, 0xa3, 0x73, 0x98, 0x53, 0x6a, 0x6f, 0x84, 0x64, 0x44, + 0x43, 0x4d, 0xd0, 0x78, 0xbd, 0x3d, 0xbd, 0xb1, 0xa7, 0x5a, 0x4d, 0x58, + 0x60, 0x67, 0x41, 0xa1, 0xb7, 0x3e, 0xd2, 0xb3, 0x37, 0x81, 0x8f, 0x32, + 0x86, 0xca, 0x48, 0x51, 0x59, 0x5e, 0xcf, 0xc8, 0x6f, 0xbb, 0xad, 0xbe, + 0x57, 0x65, 0x6d, 0xba, 0xad, 0x4b, 0x97, 0x5d, 0xb6, 0x54, 0x47, 0x99, + 0xd6, 0x94, 0x70, 0xac, 0xa7, 0x8f, 0x5d, 0x9a, 0x96, 0x6d, 0x43, 0xa4, + 
0xc8, 0x37, 0xa2, 0xcf, 0x8f, 0x6b, 0x3b, 0xcf, 0xce, 0x3f, 0x5c, 0x62, + 0xcc, 0x8e, 0x89, 0xa5, 0x71, 0x97, 0x9f, 0x90, 0x33, 0x2e, 0x53, 0x94, + 0x9b, 0x7c, 0x7a, 0xa6, 0xaa, 0x30, 0x4d, 0xb8, 0xce, 0x42, 0xb4, 0x3b, + 0x2f, 0x35, 0x95, 0x62, 0x76, 0xb6, 0x62, 0x38, 0x35, 0x54, 0xa8, 0xb2, + 0x57, 0xbb, 0x6a, 0x30, 0x5f, 0x63, 0x6e, 0xcf, 0x87, 0xbe, 0xbe, 0x97, + 0xa0, 0xc4, 0x8a, 0x58, 0x37, 0x78, 0x62, 0x6c, 0x98, 0x7e, 0x54, 0x34, + 0x74, 0x7f, 0x4e, 0x9c, 0x7c, 0x79, 0x52, 0xa4, 0x45, 0x3c, 0x6a, 0xa3, + 0xd0, 0x5c, 0xc6, 0x39, 0xc1, 0x41, 0x48, 0x99, 0x58, 0x48, 0x31, 0xcc, + 0xb9, 0x6f, 0x7c, 0xd0, 0x7f, 0x50, 0x71, 0xa9, 0xbc, 0x8a, 0x6d, 0x61, + 0x4e, 0x93, 0xc5, 0x40, 0x5a, 0x5a, 0xaf, 0x92, 0xcc, 0x7a, 0x5c, 0xb7, + 0xa4, 0xc3, 0x43, 0x51, 0x41, 0xb0, 0x53, 0x50, 0x5e, 0xca, 0x8f, 0xb4, + 0xb6, 0xa3, 0x34, 0xc1, 0x4d, 0x3f, 0xb2, 0x3d, 0x98, 0x41, 0x5c, 0x8d, + 0x6e, 0x86, 0x5f, 0xb1, 0xaa, 0xc0, 0xab, 0xc4, 0x82, 0xbe, 0x53, 0xd2, + 0x97, 0xa1, 0xd7, 0xcc, 0x61, 0x50, 0x7a, 0xc9, 0xce, 0x9f, 0x78, 0x57, + 0x39, 0xcc, 0x3b, 0x38, 0xad, 0x7b, 0xd1, 0x8f, 0x93, 0x95, 0x35, 0xa4, + 0x89, 0xa7, 0xcf, 0x5c, 0xcb, 0x40, 0x89, 0x88, 0xc0, 0x90, 0x5d, 0x5e, + 0xc4, 0x63, 0x4b, 0xbb, 0x49, 0xa5, 0x52, 0x37, 0xb6, 0x35, 0x6f, 0x4a, + 0xd5, 0x8f, 0x94, 0xc8, 0x64, 0xc0, 0x5b, 0xad, 0x6a, 0xb9, 0x7d, 0x7e, + 0x48, 0x81, 0x6f, 0xba, 0x9d, 0x8f, 0xab, 0x86, 0xb1, 0x57, 0x47, 0x49, + 0x57, 0x97, 0xbb, 0xda, 0xc4, 0xc9, 0xb5, 0x34, 0x97, 0x32, 0x97, 0x44, + 0x5c, 0xa7, 0xb1, 0x4c, 0xa7, 0x74, 0xd7, 0x97, 0xc1, 0xc3, 0x7e, 0x6d, + 0xb5, 0x65, 0x9a, 0x8d, 0x47, 0x47, 0xa4, 0x5c, 0x67, 0xc9, 0x93, 0x93, + 0x95, 0x2b, 0x47, 0x86, 0xc7, 0x9f, 0xa0, 0x88, 0x48, 0xd0, 0x78, 0xa5, + 0x74, 0x3b, 0xb5, 0xc2, 0x78, 0x7e, 0x97, 0x5f, 0xba, 0x79, 0x97, 0xb9, + 0xd7, 0x8d, 0xc0, 0x39, 0x5b, 0x63, 0xbc, 0x96, 0xc9, 0xb7, 0xbf, 0x83, + 0xa8, 0x50, 0xb0, 0x50, 0x41, 0xc8, 0x4e, 0x8f, 0xcf, 0x91, 0x7b, 0x52, + 0x40, 0x73, 0x60, 0xb3, 0x85, 0x3a, 0xa6, 0x55, 0xa4, 0x82, 0x5b, 0x52, + 0x98, 0xa4, 0x55, 0x86, 0x94, 0x79, 0x94, 0x5e, 0x86, 0xac, 0x72, 0xbc, + 0xb9, 0x6d, 0x5f, 0x93, 0xbb, 0x80, 0x5b, 0xcd, 0x4b, 0x38, 0x95, 0x5a, + 0x92, 0x4a, 0x84, 0x86, 0xb3, 0xce, 0x77, 0xc0, 0x39, 0x86, 0x61, 0x9a, + 0x6e, 0xb1, 0xc6, 0x68, 0x3a, 0xa4, 0x87, 0x37, 0x75, 0x52, 0xa4, 0xb9, + 0x96, 0x65, 0x8c, 0xb2, 0x5d, 0x40, 0xd1, 0x2d, 0xb7, 0x69, 0x4a, 0x42, + 0x6c, 0x74, 0x68, 0x7e, 0x8c, 0xbc, 0xbc, 0x66, 0xb8, 0x35, 0xc2, 0x8d, + 0x70, 0x3c, 0x69, 0x62, 0x99, 0x72, 0x80, 0xc9, 0xa4, 0x5f, 0xa3, 0x48, + 0x85, 0xa8, 0x92, 0x8c, 0xae, 0x65, 0xba, 0x5a, 0x4c, 0x68, 0x9b, 0x79, + 0x77, 0x89, 0xbf, 0x5d, 0x59, 0x96, 0x54, 0xc2, 0x5b, 0x4c, 0xc9, 0xb1, + 0x34, 0x49, 0xa5, 0xb2, 0x68, 0x6b, 0xae, 0xcf, 0x2f, 0x7f, 0x53, 0x3c, + 0x92, 0x61, 0x78, 0xb8, 0xa0, 0x4c, 0xc4, 0x9f, 0xa0, 0x5b, 0x80, 0xc1, + 0x5f, 0x68, 0xc1, 0x87, 0xce, 0xbf, 0x7b, 0x9f, 0x66, 0xac, 0x5a, 0xaa, + 0x3c, 0xa6, 0x7f, 0x93, 0xa0, 0x48, 0x8f, 0x8f, 0x49, 0x9c, 0x8e, 0x97, + 0x43, 0xb6, 0x42, 0x8a, 0x39, 0xa1, 0xc0, 0x72, 0x5c, 0x61, 0xcb, 0x6e, + 0x75, 0xd5, 0x58, 0xa6, 0x6b, 0x9d, 0x8b, 0xd2, 0x76, 0xd1, 0x73, 0x4f, + 0x9c, 0x3e, 0xb7, 0xcd, 0x5c, 0x5a, 0xc0, 0x30, 0x7b, 0x6c, 0xb4, 0x8d, + 0x8a, 0xaa, 0x8a, 0x3b, 0x9e, 0x8b, 0x36, 0xa4, 0x4e, 0xaa, 0x6f, 0x3f, + 0x41, 0xc7, 0xba, 0xd0, 0x40, 0xb8, 0x50, 0x51, 0xa0, 0xb4, 0x78, 0x2c, + 0x88, 0xba, 0x2f, 0x75, 0xac, 0x8a, 0xbf, 0xb5, 0x84, 0x5b, 0xa1, 0x6b, + 0x7b, 0x36, 0x79, 0x63, 0x4a, 0x67, 0xac, 0x4c, 0x54, 0xa6, 0x6a, 0x46, + 0x3a, 0xcb, 0x97, 0x5d, 0xc2, 0x75, 0x6b, 0xa6, 0x93, 0xc5, 0x61, 0x3f, + 
0x62, 0x7f, 0x84, 0xc5, 0xcf, 0xb4, 0x90, 0xd7, 0x84, 0xa8, 0x52, 0xc2, + 0x56, 0xb2, 0xbf, 0x8f, 0xbf, 0x92, 0x64, 0x3c, 0x72, 0xa2, 0x5f, 0x36, + 0x8a, 0xd0, 0x42, 0xe3, 0x32, 0x4b, 0x75, 0x8b, 0xb1, 0x89, 0xb3, 0x52, + 0x6c, 0x9a, 0x66, 0xb8, 0x89, 0x95, 0x73, 0x5c, 0x52, 0x52, 0xca, 0xae, + 0xc3, 0x76, 0x65, 0xa3, 0xb5, 0xda, 0x50, 0x6f, 0x4d, 0xd0, 0x97, 0x63, + 0x4d, 0xbc, 0x94, 0x8e, 0x70, 0x40, 0x9a, 0xa5, 0x2e, 0x7b, 0x44, 0x9a, + 0x42, 0x90, 0xb6, 0x87, 0x44, 0x58, 0xd1, 0xd3, 0x60, 0x97, 0xa4, 0x9f, + 0x9c, 0x75, 0x5f, 0xac, 0x50, 0x55, 0x9f, 0x23, 0x35, 0xcf, 0x4e, 0x2f, + 0x97, 0x59, 0xcc, 0x6b, 0x76, 0xa1, 0xd2, 0x8f, 0x75, 0x6c, 0x33, 0x61, + 0x85, 0xac, 0x85, 0x47, 0xc1, 0x25, 0xc4, 0x3a, 0x92, 0xcb, 0xb2, 0x9c, + 0x6c, 0x89, 0x52, 0x87, 0x35, 0x7e, 0x4a, 0x38, 0x74, 0x66, 0x4a, 0x40, + 0x4a, 0xb9, 0xcc, 0x9b, 0x6d, 0xb4, 0x94, 0x47, 0x83, 0xb0, 0x8d, 0x5e, + 0x9a, 0x40, 0xc6, 0xd1, 0x8f, 0xa5, 0xa1, 0x88, 0x45, 0xd3, 0xc3, 0x71, + 0x7d, 0xdc, 0x44, 0xba, 0x26, 0x68, 0xa7, 0xb6, 0xaa, 0xc4, 0x6f, 0xbc, + 0x67, 0x68, 0xa5, 0x69, 0xb3, 0x48, 0x4d, 0x4e, 0xab, 0x70, 0xb6, 0xb6, + 0x89, 0x3c, 0x3a, 0x71, 0x41, 0xc2, 0x93, 0x56, 0x93, 0x8d, 0x43, 0xa2, + 0xa4, 0x3b, 0x8e, 0x4e, 0xd5, 0x85, 0xc1, 0x46, 0x3b, 0x88, 0xbd, 0xc5, + 0x85, 0x5f, 0xc2, 0x2f, 0xde, 0x79, 0xcf, 0xb7, 0x73, 0x95, 0x57, 0x9b, + 0x3e, 0x7c, 0x5d, 0xcd, 0xcd, 0x81, 0x2f, 0xcd, 0x81, 0xb9, 0x30, 0x4d, + 0x85, 0x8c, 0x74, 0x8e, 0x3d, 0x63, 0xb7, 0x36, 0x7d, 0x99, 0x43, 0xc8, + 0x91, 0x50, 0xb4, 0xa4, 0x6f, 0x6f, 0x6c, 0x44, 0x4a, 0x89, 0x47, 0x90, + 0x92, 0x79, 0x84, 0xa0, 0x74, 0xa0, 0x3a, 0x9b, 0x52, 0xc9, 0x7c, 0x80, + 0xd5, 0x98, 0xab, 0xa7, 0x5c, 0x48, 0xb4, 0x70, 0x44, 0x47, 0x8a, 0xad, + 0x50, 0xad, 0x51, 0x73, 0x75, 0x7a, 0x37, 0x78, 0xd0, 0x39, 0x72, 0x9c, + 0x86, 0x5a, 0x85, 0x79, 0x8a, 0xbc, 0x41, 0xcc, 0x39, 0x27, 0x8b, 0xaa, + 0x7d, 0x47, 0x59, 0xb5, 0xc0, 0xae, 0x3a, 0x60, 0x41, 0x49, 0xd2, 0x4f, + 0x6c, 0xaf, 0x95, 0xcb, 0x55, 0x84, 0x97, 0x86, 0xc5, 0x78, 0x9c, 0x98, + 0x95, 0xa4, 0x6b, 0x5d, 0x7a, 0x87, 0xbb, 0x7f, 0x6c, 0x8f, 0x79, 0xca, + 0x75, 0x62, 0xc1, 0xba, 0x8d, 0x6a, 0x7f, 0x78, 0x4a, 0x85, 0x71, 0xc0, + 0x6b, 0x60, 0x6c, 0x56, 0xdf, 0xc1, 0x71, 0x65, 0xb5, 0x53, 0xb2, 0x4b, + 0x36, 0x90, 0x4a, 0x56, 0xcc, 0x83, 0x52, 0xd0, 0xaa, 0x39, 0xca, 0xd3, + 0xa3, 0x8c, 0x7b, 0xb4, 0xc3, 0x8b, 0x8b, 0x62, 0x5c, 0xba, 0xc4, 0x6e, + 0xbe, 0x9d, 0x4a, 0x44, 0x8f, 0x89, 0x97, 0x99, 0x6c, 0xbf, 0x7d, 0x31, + 0xb0, 0x5f, 0x87, 0x7e, 0x75, 0xa6, 0x6d, 0x3d, 0xb2, 0x93, 0x5d, 0xa0, + 0x60, 0xc4, 0x6d, 0x74, 0x42, 0x62, 0x46, 0xb8, 0xaa, 0x8d, 0x42, 0xbc, + 0x5f, 0x2b, 0xaa, 0xa2, 0xdd, 0xc1, 0x68, 0xcd, 0xd9, 0xa2, 0x85, 0x93, + 0x5c, 0x72, 0xa5, 0x86, 0x31, 0x85, 0x94, 0x4c, 0x81, 0x90, 0xbf, 0x74, + 0x75, 0xa8, 0x3f, 0x63, 0x5f, 0x78, 0xae, 0x39, 0xcd, 0x8b, 0x7c, 0x70, + 0x56, 0x3a, 0xb8, 0x70, 0xad, 0x3f, 0xcb, 0x9c, 0xaf, 0xa2, 0x75, 0xb3, + 0xc0, 0x8e, 0x98, 0x7c, 0x57, 0x7e, 0xaa, 0x6a, 0xb6, 0x9f, 0x62, 0x67, + 0x5b, 0x51, 0x5f, 0x8d, 0x33, 0xb8, 0xa5, 0xd5, 0xae, 0xa0, 0x8b, 0x97, + 0x51, 0x1f, 0x4a, 0x43, 0x3f, 0x8e, 0x27, 0x52, 0x4a, 0x37, 0x59, 0x43, + 0x23, 0x52, 0x75, 0x2e, 0x39, 0x7e, 0x97, 0x2d, 0x6a, 0x6d, 0xa5, 0xc7, + 0xbe, 0x73, 0xad, 0x92, 0x5a, 0xa3, 0x8b, 0xab, 0x3a, 0x6a, 0xa3, 0x56, + 0x6c, 0x47, 0x6c, 0x47, 0x9d, 0xa3, 0x80, 0x48, 0xc2, 0x65, 0x8e, 0xb0, + 0x8e, 0x37, 0x9b, 0x99, 0x77, 0x58, 0xb3, 0x3f, 0xa3, 0x77, 0x64, 0xa2, + 0xb1, 0x5d, 0x9a, 0x63, 0xca, 0xa0, 0xcc, 0x52, 0x88, 0xa2, 0xae, 0xa5, + 0x4f, 0x42, 0xc4, 0x55, 0x73, 0x53, 0xd1, 0x56, 0x64, 0x77, 0x63, 0xc3, + 
0x4b, 0xde, 0x64, 0x61, 0x61, 0x75, 0x6f, 0x61, 0x2f, 0x3f, 0x5e, 0x59, + 0xca, 0x8d, 0x69, 0x5c, 0xc3, 0xb4, 0xc0, 0x6f, 0x5e, 0x60, 0x8e, 0xe0, + 0x6f, 0xbf, 0x2c, 0x46, 0x97, 0xbf, 0xc0, 0x94, 0xa1, 0x5a, 0x8c, 0xbb, + 0x8f, 0xce, 0xb3, 0x8a, 0xa7, 0xca, 0x7b, 0x57, 0xa9, 0x41, 0xa1, 0x77, + 0x9d, 0x83, 0xba, 0x97, 0x34, 0x3b, 0x7a, 0x5f, 0x4b, 0x79, 0xca, 0x52, + 0xb8, 0xb0, 0x6c, 0xa6, 0x3c, 0xbc, 0x3d, 0xa5, 0xc3, 0xb9, 0xa1, 0x2c, + 0x5f, 0x9d, 0xa8, 0x5f, 0x89, 0xaf, 0x56, 0x2d, 0x53, 0x51, 0x4b, 0x7a, + 0xd9, 0xa0, 0xdd, 0x5f, 0x8b, 0x70, 0x39, 0x55, 0xbf, 0x77, 0x2c, 0x5f, + 0xad, 0x83, 0x36, 0xa2, 0x3b, 0xb3, 0xdf, 0xb7, 0x68, 0xb0, 0x40, 0x53, + 0x7a, 0x82, 0x43, 0xbf, 0x79, 0x38, 0x4c, 0x93, 0x72, 0x97, 0x32, 0xa1, + 0xca, 0x93, 0xa0, 0xb2, 0x49, 0x42, 0x47, 0xa4, 0x2c, 0x54, 0x9a, 0x87, + 0x7e, 0xc8, 0x99, 0x2f, 0x46, 0x69, 0x85, 0xc9, 0x66, 0x3d, 0xb6, 0xd3, + 0x53, 0x82, 0x4b, 0xa4, 0x60, 0xa7, 0x6f, 0x78, 0x3d, 0x6e, 0x9c, 0xae, + 0x69, 0xb2, 0xc8, 0x7c, 0x76, 0x44, 0x7f, 0x98, 0xab, 0x4a, 0x3a, 0xa2, + 0x92, 0xc2, 0x8f, 0x7f, 0x61, 0xa7, 0xd0, 0x9a, 0xba, 0x84, 0x84, 0xbd, + 0xc7, 0x8d, 0x6f, 0x4a, 0x57, 0x38, 0x9d, 0x2e, 0x66, 0x93, 0x52, 0x82, + 0xa1, 0x57, 0x7e, 0x6a, 0xae, 0x54, 0x52, 0x70, 0x7c, 0x9c, 0xa4, 0x61, + 0xb8, 0xb8, 0x84, 0x75, 0x65, 0xb5, 0x4f, 0xcd, 0x6b, 0x5a, 0xc4, 0x31, + 0x9e, 0x60, 0x81, 0x3e, 0xc6, 0x37, 0xb8, 0x48, 0x30, 0x3f, 0x93, 0x85, + 0x9d, 0x4d, 0xcb, 0x48, 0x62, 0x3d, 0x4e, 0xb5, 0xcb, 0x65, 0x33, 0x9a, + 0x5b, 0xbd, 0x32, 0xca, 0x5f, 0xad, 0x40, 0x3d, 0x93, 0x44, 0xa5, 0x5e, + 0xa7, 0x8d, 0x59, 0xc6, 0x47, 0x73, 0x4b, 0x5b, 0x41, 0x45, 0xc1, 0xc2, + 0x84, 0x4f, 0x79, 0xce, 0x9f, 0xc8, 0xa4, 0x73, 0x42, 0x38, 0x3b, 0xd2, + 0x4f, 0x9d, 0x39, 0x79, 0x76, 0x82, 0x74, 0x94, 0xc3, 0xc2, 0x48, 0xa5, + 0x4d, 0xb6, 0xa8, 0x2c, 0x54, 0xa4, 0x49, 0xaf, 0x3d, 0x6f, 0x89, 0x5f, + 0xb8, 0x67, 0x52, 0x63, 0x38, 0x80, 0x37, 0x89, 0x6e, 0x66, 0x96, 0xc6, + 0x7d, 0x67, 0x42, 0x2b, 0x72, 0x94, 0xb1, 0x3b, 0xd5, 0x3d, 0x8f, 0x4d, + 0x88, 0x45, 0xba, 0x82, 0x7d, 0x42, 0xcd, 0x9a, 0x73, 0x38, 0xd2, 0x87, + 0x41, 0x6c, 0x39, 0x78, 0x5e, 0x4d, 0xb6, 0x68, 0x9e, 0xc4, 0x57, 0xb9, + 0x56, 0x9f, 0x77, 0x4a, 0x72, 0x4b, 0x3e, 0x48, 0x8f, 0x62, 0xb2, 0x9a, + 0xc8, 0xae, 0x9e, 0x32, 0x42, 0x69, 0x5d, 0x87, 0xa4, 0x32, 0x57, 0xb9, + 0x8d, 0xb9, 0x62, 0x8c, 0x67, 0x48, 0x8b, 0x65, 0xc3, 0xab, 0x2e, 0x5a, + 0x59, 0x3d, 0xc4, 0x37, 0x69, 0x38, 0x51, 0x70, 0x3e, 0x43, 0x94, 0x98, + 0x81, 0x94, 0xc1, 0xcc, 0x53, 0x51, 0x53, 0x67, 0xbf, 0x58, 0xb7, 0xd8, + 0x66, 0x60, 0x8e, 0x5b, 0x75, 0xc3, 0x97, 0xbd, 0xae, 0x84, 0x81, 0x89, + 0x78, 0xbc, 0x5a, 0xb5, 0x4f, 0x4f, 0xc2, 0x6e, 0x30, 0xa3, 0xd3, 0x36, + 0x63, 0xb6, 0x80, 0xc7, 0x8c, 0x79, 0x4b, 0xc7, 0x6a, 0xae, 0xa8, 0xc5, + 0x68, 0xb3, 0xc0, 0xa2, 0xb5, 0x3e, 0xa3, 0xad, 0x7f, 0x7d, 0x63, 0xd5, + 0xa5, 0xcd, 0x9c, 0x70, 0x84, 0x84, 0x8a, 0xb3, 0x5d, 0xc3, 0xb0, 0x75, + 0x75, 0x89, 0xa0, 0x84, 0x5a, 0x86, 0xa0, 0x95, 0x34, 0x42, 0x5c, 0x4a, + 0xad, 0x3e, 0x99, 0x58, 0x53, 0x82, 0xbd, 0x93, 0x44, 0x79, 0xce, 0xaa, + 0xac, 0x6c, 0xc4, 0x8c, 0x7f, 0x6f, 0x35, 0xb3, 0x31, 0xd1, 0xc8, 0x77, + 0xc3, 0xcd, 0x5f, 0xac, 0x47, 0xb7, 0x3e, 0xc1, 0xa6, 0x4e, 0xb1, 0x62, + 0x6d, 0x6a, 0x85, 0x5c, 0x55, 0x2e, 0x79, 0x37, 0xb1, 0xbd, 0x43, 0x7c, + 0x5b, 0x75, 0xd5, 0x61, 0x62, 0x54, 0x8b, 0x68, 0x5c, 0xca, 0x99, 0x3d, + 0xd6, 0x9d, 0xa7, 0x78, 0xb9, 0x31, 0x5c, 0xce, 0xbf, 0x6d, 0x55, 0xc0, + 0xc8, 0x56, 0x76, 0x67, 0xad, 0x4f, 0x4e, 0xb0, 0xb6, 0x7b, 0x9e, 0xb2, + 0x41, 0x95, 0x9e, 0xce, 0x59, 0x78, 0x83, 0xc6, 0x90, 0x56, 0xca, 0xb4, + 
0x39, 0x79, 0x79, 0x70, 0xcd, 0x49, 0x82, 0xcf, 0x9c, 0xc4, 0x4a, 0x46, + 0x6b, 0xbd, 0x8f, 0xa2, 0xc2, 0xc4, 0xb9, 0xc9, 0x81, 0x69, 0x49, 0xb5, + 0x58, 0xa9, 0x7c, 0x94, 0x4a, 0xca, 0xad, 0xc0, 0x60, 0xaa, 0x70, 0x86, + 0xce, 0x87, 0xa2, 0x36, 0x34, 0x7d, 0xca, 0xc6, 0x6b, 0x6d, 0xb5, 0x43, + 0x37, 0x3e, 0x65, 0x8d, 0x4d, 0x9f, 0xa3, 0x8b, 0x9a, 0x81, 0x62, 0x86, + 0x9a, 0x31, 0xad, 0x9a, 0x7c, 0xa3, 0x66, 0xda, 0x7e, 0x50, 0x3e, 0x6d, + 0x94, 0xc9, 0xc4, 0xc3, 0x69, 0x5c, 0x79, 0x46, 0x57, 0x4a, 0x39, 0x6b, + 0x7d, 0xd7, 0x9b, 0xc0, 0x93, 0xc6, 0xb5, 0x5b, 0x93, 0xba, 0x99, 0x61, + 0xa3, 0x91, 0x79, 0x6b, 0x88, 0x68, 0xaf, 0xb0, 0xd0, 0xa1, 0xa0, 0x3a, + 0xa2, 0x8c, 0x44, 0x5c, 0x3a, 0xd6, 0xba, 0xca, 0x63, 0x7d, 0x51, 0x84, + 0xab, 0x71, 0x43, 0x5c, 0x6a, 0xc4, 0x3b, 0x4a, 0xa7, 0xce, 0xba, 0x66, + 0x92, 0x67, 0xbc, 0x9a, 0x51, 0x64, 0xcc, 0xa7, 0x48, 0x37, 0x86, 0x66, + 0x9b, 0x4f, 0x51, 0xc0, 0x59, 0x6a, 0x7d, 0xcb, 0x50, 0x68, 0xac, 0x67, + 0x83, 0x8a, 0xa9, 0x68, 0x4a, 0x45, 0x3a, 0x5a, 0x79, 0xc5, 0xb9, 0x7c, + 0x8e, 0x6b, 0x88, 0x55, 0x4d, 0x8d, 0x64, 0x39, 0xc4, 0x6f, 0xbb, 0xab, + 0xb5, 0x2e, 0xb8, 0x51, 0x7a, 0x50, 0x65, 0xc7, 0xc3, 0x95, 0x67, 0x6e, + 0x7f, 0x6e, 0x9b, 0x6c, 0xbc, 0x96, 0x6c, 0xae, 0x8b, 0x81, 0xbd, 0x71, + 0x66, 0x73, 0x65, 0x67, 0x43, 0x33, 0x52, 0xd7, 0x7e, 0x35, 0x76, 0xa2, + 0x9c, 0xca, 0xb6, 0xca, 0xc5, 0x61, 0xb2, 0xcc, 0x8e, 0x33, 0x5a, 0x8c, + 0x47, 0x47, 0x8e, 0x56, 0xb6, 0xc9, 0x8c, 0xb3, 0x82, 0x47, 0x88, 0x5a, + 0x34, 0x53, 0x6d, 0x56, 0xab, 0x61, 0x6e, 0x4f, 0xa7, 0x63, 0x6b, 0x66, + 0x64, 0x57, 0x42, 0x99, 0xa8, 0xb4, 0xa4, 0x64, 0xb3, 0xd0, 0xbc, 0xb0, + 0xa9, 0x3e, 0x57, 0xbf, 0xc0, 0x5a, 0x4c, 0xd7, 0x48, 0xb9, 0xaa, 0x4c, + 0xb2, 0xb0, 0x92, 0x36, 0xc6, 0x59, 0x3e, 0x7a, 0x3e, 0xc1, 0x52, 0x95, + 0x80, 0x67, 0xa2, 0xc4, 0x6c, 0xa8, 0xb7, 0x46, 0x9f, 0xac, 0x4c, 0x6f, + 0xba, 0xa9, 0x36, 0x9a, 0x8e, 0x3a, 0xbe, 0xa6, 0x90, 0xbf, 0x77, 0x4a, + 0xa9, 0x94, 0x4a, 0x70, 0x9f, 0x96, 0x8a, 0x57, 0x57, 0xaa, 0xd2, 0x8f, + 0xd1, 0x91, 0x94, 0x55, 0xc2, 0x4f, 0xd4, 0xc8, 0xcd, 0x87, 0xcd, 0x6c, + 0xc2, 0x94, 0x86, 0x9e, 0x81, 0x8a, 0x7d, 0x7b, 0x3a, 0x97, 0x43, 0xba, + 0x6c, 0x37, 0x4f, 0x80, 0x52, 0xcc, 0x5c, 0x50, 0x56, 0x5e, 0x9e, 0x4d, + 0x4a, 0x8a, 0x95, 0x73, 0x38, 0xcb, 0x3f, 0x79, 0xb0, 0x6d, 0x50, 0x55, + 0x32, 0x7d, 0x9c, 0x38, 0x84, 0xbb, 0x2c, 0x91, 0xc3, 0xb3, 0x4f, 0xcb, + 0x4d, 0xbb, 0x66, 0xd6, 0xb3, 0x39, 0xc6, 0xb9, 0xa9, 0xc8, 0xb5, 0x6c, + 0x46, 0xdd, 0x52, 0x55, 0x40, 0x3c, 0x2d, 0x56, 0xd0, 0x71, 0x56, 0x6b, + 0x86, 0x37, 0x7f, 0x8f, 0x8e, 0xa4, 0x8d, 0x3e, 0xba, 0x49, 0xb2, 0x75, + 0x62, 0x77, 0xa7, 0x7c, 0xb8, 0x6a, 0xc2, 0x87, 0x63, 0x85, 0xbe, 0x37, + 0x7c, 0x5c, 0x41, 0x6d, 0x61, 0x8b, 0x44, 0x47, 0x7a, 0xc6, 0x5c, 0x30, + 0x86, 0x9c, 0x4f, 0x7f, 0x69, 0xb4, 0x54, 0x49, 0x48, 0x93, 0x89, 0xb7, + 0xbb, 0xae, 0x96, 0x54, 0xd0, 0x5e, 0x34, 0x80, 0x70, 0x37, 0x88, 0x9e, + 0x56, 0x2a, 0x81, 0xc1, 0x90, 0x2e, 0xc1, 0x2b, 0x88, 0x7d, 0x66, 0xba, + 0x64, 0x64, 0x3a, 0x50, 0x35, 0x82, 0xe3, 0xf1, 0xa1, 0xae, 0x8c, 0x6d, + 0x6b, 0x84, 0x8b, 0xc4, 0x64, 0x4f, 0xb4, 0x3f, 0x5e, 0x3b, 0x4e, 0x36, + 0x5f, 0x5f, 0x4b, 0x8b, 0x65, 0x5e, 0x7a, 0x87, 0x68, 0x4b, 0x34, 0xb8, + 0x4c, 0x84, 0xc2, 0x3a, 0x99, 0x88, 0x39, 0x60, 0x9f, 0x64, 0xc3, 0x4a, + 0xac, 0x33, 0x42, 0x64, 0x54, 0x63, 0xbd, 0x84, 0xa1, 0xb4, 0x96, 0xbb, + 0xce, 0x96, 0xb3, 0x89, 0x85, 0x5e, 0x68, 0x4f, 0x63, 0x72, 0x8a, 0x3c, + 0x82, 0x70, 0x6d, 0x3c, 0x52, 0x72, 0x92, 0x89, 0xcb, 0x94, 0x8b, 0xa4, + 0x9e, 0x7c, 0x35, 0x55, 0x4f, 0xc8, 0x67, 0x9c, 0x38, 0x8f, 0x95, 0x78, + 
0xc9, 0xca, 0x65, 0x34, 0x8a, 0x3a, 0x8b, 0x88, 0xd5, 0xbd, 0xa2, 0xbe, + 0xa7, 0xb6, 0xa3, 0x90, 0x8b, 0x3c, 0x83, 0xac, 0x9d, 0xbd, 0xa8, 0x5d, + 0x4c, 0xa9, 0xa3, 0x7e, 0xb8, 0x61, 0xc2, 0x4d, 0x82, 0x3a, 0xcf, 0xc0, + 0x4e, 0x65, 0x30, 0x62, 0x81, 0xb7, 0x80, 0x97, 0x55, 0xc8, 0xa1, 0x41, + 0xcd, 0xd6, 0x5d, 0xaf, 0x93, 0x63, 0x5e, 0x67, 0x58, 0x99, 0x80, 0x3a, + 0xa8, 0x86, 0x39, 0x7f, 0x62, 0x6f, 0xa3, 0x48, 0xa5, 0x4e, 0xb7, 0x9f, + 0x48, 0xce, 0x4e, 0x70, 0x54, 0x78, 0xa8, 0x4f, 0xb1, 0xc8, 0x67, 0x58, + 0x42, 0x8c, 0x8c, 0x6a, 0x7b, 0x84, 0x80, 0x57, 0x49, 0x2f, 0x48, 0xa6, + 0x84, 0xb3, 0x3d, 0xa5, 0x46, 0x55, 0x9f, 0x44, 0x3e, 0x39, 0x87, 0x7d, + 0x72, 0x3d, 0x56, 0xbd, 0x56, 0x38, 0x5d, 0x3d, 0xc6, 0x3c, 0x63, 0xd9, + 0x4b, 0xb8, 0x50, 0x71, 0x9d, 0x57, 0x69, 0x6c, 0x3f, 0x7e, 0xbd, 0x69, + 0x72, 0xcb, 0x48, 0x41, 0x6a, 0xd1, 0x86, 0x4c, 0xa9, 0x7b, 0x42, 0x92, + 0xbb, 0x3b, 0xd2, 0xb9, 0x9f, 0x60, 0xae, 0xa7, 0x43, 0x78, 0x5e, 0x75, + 0xba, 0x47, 0x80, 0x9d, 0x5b, 0x50, 0x6a, 0x9f, 0x34, 0xcc, 0x39, 0x5d, + 0xab, 0xae, 0x50, 0x8c, 0x5d, 0x44, 0x2e, 0xce, 0xd7, 0x47, 0x28, 0x6f, + 0x52, 0x37, 0x4c, 0x6b, 0x38, 0x70, 0x58, 0xaa, 0xb9, 0x6e, 0xc0, 0x98, + 0x9a, 0x57, 0xb9, 0x47, 0x33, 0x94, 0xcc, 0xda, 0x82, 0x81, 0xb1, 0xbd, + 0xcd, 0x5a, 0xc3, 0xa7, 0xbb, 0x5b, 0x35, 0x75, 0xd4, 0x8e, 0x65, 0x76, + 0xd8, 0x96, 0x6b, 0x5f, 0xe3, 0x4d, 0x65, 0x6b, 0x50, 0xba, 0xc2, 0x66, + 0x6f, 0xcc, 0xb0, 0x36, 0x56, 0x8f, 0x8b, 0x8d, 0xd7, 0x3a, 0x6e, 0xb8, + 0x73, 0x87, 0xbe, 0x5b, 0x40, 0xb8, 0x88, 0x94, 0xa0, 0xad, 0x68, 0x5a, + 0xc7, 0x6b, 0xc2, 0xa7, 0x8f, 0xa2, 0x56, 0x62, 0xc2, 0x6a, 0xa8, 0xb2, + 0xcb, 0xaa, 0x8f, 0xc7, 0x9a, 0x95, 0x36, 0x57, 0x4d, 0x61, 0xa5, 0xa7, + 0x5c, 0x47, 0x64, 0x63, 0xc0, 0x72, 0xc8, 0x80, 0xd0, 0xb4, 0x92, 0x4c, + 0x5c, 0x84, 0x93, 0xb9, 0x52, 0x92, 0xab, 0xab, 0x6a, 0xd1, 0x9a, 0x2d, + 0x76, 0x85, 0x66, 0x64, 0xae, 0x60, 0xad, 0x87, 0x81, 0x78, 0x6f, 0xca, + 0xcd, 0x8d, 0x89, 0x3e, 0x97, 0x51, 0xc2, 0x7d, 0x72, 0x54, 0xce, 0x77, + 0x94, 0x43, 0x8b, 0x4f, 0xb6, 0xc4, 0x6a, 0xd8, 0xc7, 0xbb, 0xa9, 0x5a, + 0x61, 0x8d, 0xa7, 0x71, 0x42, 0x52, 0x73, 0x5b, 0x9a, 0xd1, 0xb9, 0xa4, + 0xb2, 0xb9, 0x8a, 0x83, 0x8f, 0xb1, 0xc8, 0x39, 0xaa, 0x7d, 0x59, 0x3b, + 0xaa, 0x43, 0x87, 0x8d, 0x97, 0x4c, 0x94, 0x3a, 0xc3, 0x9e, 0xbf, 0x88, + 0x35, 0x36, 0x5d, 0x79, 0x59, 0x56, 0x87, 0x77, 0x90, 0xc6, 0x64, 0xa0, + 0xa7, 0x8a, 0x96, 0x84, 0xb1, 0x9d, 0x4d, 0x6f, 0x54, 0x57, 0x9e, 0x5f, + 0x4b, 0x89, 0xa2, 0x9e, 0x87, 0x46, 0xa8, 0x73, 0x5d, 0x60, 0xde, 0x57, + 0x5e, 0x7c, 0x9e, 0x8c, 0x9b, 0x88, 0xa8, 0x95, 0x38, 0x47, 0xa4, 0x40, + 0x4f, 0xb9, 0x68, 0x71, 0x95, 0x74, 0xc1, 0x98, 0x4c, 0x61, 0xa3, 0x8a, + 0xae, 0x87, 0xa0, 0x56, 0x86, 0x6b, 0x8f, 0x6a, 0xbb, 0xad, 0xc7, 0xa3, + 0x7c, 0x5e, 0xc7, 0x7a, 0x65, 0xc4, 0x5f, 0x78, 0x7e, 0x8b, 0x81, 0xc2, + 0x7c, 0x83, 0x66, 0x75, 0x49, 0xcc, 0x8b, 0xc4, 0x83, 0x67, 0xc2, 0xc0, + 0x96, 0xc4, 0xcc, 0x93, 0x5d, 0xa3, 0x76, 0x55, 0xc6, 0x6b, 0x79, 0x97, + 0xcc, 0xc6, 0x33, 0xc5, 0x77, 0x4d, 0x8b, 0x76, 0x8c, 0xd0, 0xa3, 0x71, + 0x76, 0xd1, 0xbd, 0x42, 0xbe, 0x74, 0x8f, 0xa1, 0xc9, 0x9f, 0x4e, 0x84, + 0x47, 0x9d, 0x72, 0xdf, 0x57, 0x56, 0x61, 0xbb, 0x55, 0xd3, 0x90, 0x9d, + 0x43, 0x6b, 0x3e, 0xbf, 0x7e, 0x4b, 0x48, 0xc9, 0x69, 0x7c, 0x45, 0xc8, + 0xa9, 0x63, 0xb4, 0x7f, 0xd9, 0x50, 0xb7, 0xa2, 0xbd, 0x32, 0x44, 0xc9, + 0x52, 0x5c, 0x4b, 0x79, 0xcb, 0xa9, 0x70, 0x35, 0x84, 0xc1, 0x51, 0x94, + 0x55, 0x69, 0x93, 0x65, 0xd4, 0x9f, 0x93, 0x67, 0x59, 0x63, 0x5a, 0xb8, + 0xcb, 0xcc, 0x99, 0x52, 0x4b, 0x4c, 0x61, 0x95, 0xbf, 0x73, 0xab, 0x58, + 
0x9c, 0x3c, 0x4c, 0xb1, 0x44, 0xc1, 0xb1, 0x6b, 0x48, 0x76, 0x46, 0x81, + 0x90, 0x7c, 0xc2, 0xb2, 0x5f, 0xce, 0xc4, 0xd9, 0x75, 0xc0, 0xb7, 0x54, + 0x5d, 0x40, 0x67, 0xad, 0x4c, 0x40, 0x75, 0x8b, 0x40, 0x9d, 0xad, 0xbe, + 0x67, 0x63, 0x80, 0xa4, 0x6a, 0x6e, 0xb3, 0x56, 0x35, 0x82, 0xb6, 0x4e, + 0x5b, 0x41, 0x3f, 0xa9, 0xd4, 0x49, 0x5a, 0x95, 0x5f, 0x3b, 0x89, 0x37, + 0x2a, 0xb3, 0xa1, 0x67, 0x98, 0x8a, 0xb2, 0xd4, 0xbc, 0x8c, 0x67, 0x9d, + 0xa5, 0xd8, 0x5c, 0x44, 0x42, 0x38, 0x4d, 0x6b, 0x6b, 0x9b, 0xb9, 0xc8, + 0xd8, 0x80, 0xae, 0x9b, 0x7b, 0x86, 0x79, 0x97, 0xa1, 0x50, 0x88, 0xb5, + 0x59, 0x35, 0x9a, 0x7a, 0xa7, 0x6a, 0xda, 0xd5, 0x8f, 0xba, 0xae, 0x37, + 0x6d, 0x5d, 0xa0, 0xa2, 0x4c, 0x62, 0x71, 0x9f, 0x37, 0xc3, 0x7a, 0x36, + 0x92, 0xd3, 0x8b, 0xaa, 0x6e, 0x62, 0x6b, 0x6d, 0xb0, 0x2e, 0x6b, 0x9c, + 0xad, 0x46, 0x38, 0x51, 0x55, 0xb7, 0x44, 0xd3, 0xaf, 0xbb, 0x5f, 0x5e, + 0x62, 0xce, 0xc9, 0xb6, 0xc7, 0x47, 0xda, 0xc6, 0x83, 0x3e, 0xb2, 0x2f, + 0x86, 0xc7, 0xa4, 0x20, 0x41, 0xa3, 0x76, 0xa4, 0x99, 0x9a, 0x97, 0xc0, + 0xab, 0xbf, 0x68, 0x98, 0x33, 0x62, 0x6a, 0x83, 0xaa, 0x92, 0x6e, 0xd0, + 0xbf, 0xae, 0x61, 0x59, 0xc0, 0x73, 0xa9, 0xa6, 0xd0, 0x42, 0x79, 0x5c, + 0xbc, 0xa2, 0x36, 0x6c, 0x6c, 0x39, 0x6c, 0x62, 0xc6, 0xc1, 0x67, 0x50, + 0xcf, 0x5c, 0x67, 0xc4, 0xb4, 0x3e, 0x53, 0x3f, 0xd0, 0x38, 0xa9, 0x5c, + 0x9b, 0xc7, 0xcb, 0x57, 0x68, 0x6d, 0x30, 0xc1, 0xc9, 0x6f, 0x6f, 0xb6, + 0x86, 0xd5, 0xa9, 0x98, 0x59, 0x93, 0xa4, 0x61, 0x50, 0x6c, 0x87, 0x6d, + 0x3f, 0x88, 0x4c, 0x51, 0xa6, 0x5a, 0x97, 0x50, 0xb1, 0xb3, 0x47, 0xc1, + 0x42, 0xc8, 0xbb, 0x52, 0xc5, 0x24, 0x80, 0x70, 0x83, 0xc9, 0xa2, 0xbc, + 0x9f, 0x9e, 0x42, 0x74, 0x94, 0x43, 0x69, 0x82, 0x71, 0xa7, 0xbd, 0xb7, + 0x70, 0x76, 0xc6, 0x95, 0xa9, 0x8e, 0xc5, 0x5c, 0x73, 0xc2, 0x4b, 0xce, + 0xc2, 0x94, 0x46, 0x3d, 0x9f, 0xaf, 0x64, 0xb0, 0x49, 0x7e, 0x95, 0x79, + 0x62, 0x40, 0xb5, 0x42, 0x46, 0x87, 0xcf, 0x98, 0x57, 0xa6, 0x71, 0x94, + 0x5a, 0x9c, 0xb9, 0xce, 0x88, 0x5e, 0x99, 0x80, 0x83, 0xba, 0x54, 0x61, + 0x72, 0x53, 0x9b, 0xb1, 0x39, 0x3f, 0x69, 0x96, 0x9c, 0x59, 0x5c, 0x8f, + 0x55, 0x7d, 0xaa, 0x94, 0x98, 0x56, 0x45, 0x6d, 0x74, 0xc1, 0x98, 0xa7, + 0x99, 0xb6, 0xc8, 0xd4, 0x5d, 0xbb, 0x5c, 0x45, 0x9a, 0x3c, 0xc2, 0xb0, + 0x6c, 0xa8, 0x6e, 0x68, 0x33, 0xae, 0x9a, 0xa5, 0x55, 0x52, 0x8c, 0x9b, + 0x4a, 0x5c, 0xc6, 0xa0, 0x97, 0x8e, 0x9d, 0x60, 0x9e, 0x38, 0x73, 0xad, + 0x90, 0x74, 0xa2, 0x7f, 0xca, 0x9d, 0x66, 0xd0, 0x9c, 0xa3, 0x79, 0x99, + 0xb1, 0x9b, 0xa2, 0x8b, 0x45, 0x59, 0xc8, 0xb7, 0xd7, 0x87, 0x93, 0x7f, + 0x6e, 0x4d, 0xa8, 0x40, 0x6f, 0x42, 0xc4, 0x9a, 0x8c, 0x31, 0x96, 0x57, + 0xc4, 0x3c, 0x79, 0x9b, 0xb3, 0xd1, 0x41, 0x65, 0x38, 0xbf, 0xb9, 0x5c, + 0xab, 0x85, 0xd5, 0xac, 0xaf, 0x6f, 0x69, 0xc7, 0xab, 0x5e, 0x5e, 0x49, + 0xc1, 0xc7, 0x6f, 0x6c, 0xa3, 0xbd, 0x7f, 0x43, 0xc6, 0xc6, 0x65, 0xc8, + 0x62, 0x65, 0xab, 0xb1, 0x36, 0xb3, 0xd1, 0x39, 0x7a, 0x5f, 0x6c, 0x75, + 0x44, 0x93, 0xae, 0xc6, 0xab, 0x64, 0x87, 0x91, 0x63, 0x61, 0xbe, 0x45, + 0x3e, 0xc7, 0x86, 0x87, 0x4d, 0x49, 0xac, 0xbf, 0xb4, 0x79, 0xb2, 0xa1, + 0xbc, 0x94, 0xb2, 0x90, 0xc0, 0xbb, 0x55, 0x4d, 0x38, 0x66, 0x59, 0x96, + 0x81, 0x8d, 0x60, 0x5e, 0xcf, 0x78, 0x7c, 0xc5, 0xe2, 0xc2, 0xa4, 0xa4, + 0x40, 0xad, 0xa6, 0x84, 0xb9, 0xa3, 0x50, 0x4a, 0x45, 0xae, 0x44, 0x4b, + 0xa8, 0x53, 0xc3, 0xc5, 0xc3, 0xc2, 0x81, 0xd0, 0x4f, 0x89, 0x97, 0x3c, + 0x90, 0x50, 0xc3, 0x5f, 0x78, 0xbe, 0x5e, 0x80, 0x9f, 0x97, 0x95, 0x94, + 0x7b, 0x45, 0x4c, 0x8c, 0x64, 0xca, 0x8f, 0x97, 0x71, 0x74, 0xa2, 0x6c, + 0x78, 0x64, 0x96, 0x48, 0x5e, 0x61, 0xaa, 0xb0, 0x71, 0x5b, 0xa4, 0x38, + 
0x6e, 0xb7, 0x8a, 0x41, 0x67, 0x4f, 0xaa, 0xa8, 0x7d, 0x77, 0x92, 0xa6, + 0xab, 0x3a, 0x6a, 0x75, 0xcb, 0x5e, 0x4e, 0xa4, 0x46, 0x42, 0x41, 0xb5, + 0x5f, 0x80, 0xb4, 0x44, 0x48, 0x54, 0xc1, 0x44, 0x6e, 0xc7, 0x3e, 0xb0, + 0x89, 0xc6, 0x82, 0x4f, 0xc1, 0x68, 0x3f, 0x43, 0xcb, 0x73, 0x53, 0x3b, + 0x4f, 0x6f, 0xc8, 0x59, 0x2c, 0x20, 0xb7, 0x91, 0x8b, 0xb3, 0xc0, 0x97, + 0xca, 0xcd, 0x6e, 0xb5, 0x33, 0xb9, 0x4f, 0xc4, 0xbf, 0xa1, 0xa3, 0x51, + 0xb6, 0x3e, 0xc2, 0xcb, 0xbf, 0xb6, 0xbd, 0x4f, 0x6c, 0x3f, 0xb3, 0x53, + 0x9a, 0x3c, 0x55, 0x8f, 0xa2, 0x58, 0xdc, 0xac, 0x75, 0x47, 0xdc, 0x64, + 0xbf, 0x98, 0x38, 0x89, 0x6d, 0x3d, 0x61, 0xa9, 0x36, 0x77, 0xbd, 0x6a, + 0x7c, 0xbe, 0x56, 0xba, 0x6b, 0x59, 0x38, 0xd3, 0x6a, 0x47, 0x5b, 0xab, + 0x77, 0x4e, 0x2c, 0x95, 0x96, 0x58, 0x6d, 0x4d, 0x95, 0xb4, 0x35, 0x76, + 0x98, 0x80, 0xaf, 0x5a, 0x81, 0x99, 0x49, 0x3f, 0x83, 0xd3, 0x97, 0x72, + 0xbe, 0xb7, 0xa2, 0x97, 0xb8, 0x86, 0x70, 0x7e, 0xa7, 0x8d, 0xb1, 0x67, + 0xad, 0x82, 0x7f, 0x8e, 0x7c, 0x9b, 0x67, 0x87, 0x3a, 0x6f, 0x5f, 0x6d, + 0x9d, 0x4c, 0x8e, 0xd2, 0x90, 0xb6, 0x86, 0x81, 0xb3, 0xdc, 0x8e, 0xbf, + 0xc4, 0xa0, 0x3e, 0xbc, 0x34, 0x91, 0x47, 0x83, 0xc1, 0x60, 0x38, 0x94, + 0x72, 0x74, 0x75, 0x5a, 0xa5, 0x9a, 0xbb, 0x3f, 0x85, 0xa9, 0x52, 0xd5, + 0x80, 0x84, 0x8f, 0x49, 0x7b, 0x49, 0x91, 0x74, 0xb3, 0x69, 0x84, 0xbe, + 0xcb, 0x9b, 0xba, 0x9b, 0x8e, 0x7b, 0x6b, 0x5d, 0x68, 0x39, 0x49, 0xa9, + 0x51, 0x3b, 0xd0, 0xab, 0xc5, 0xa9, 0x71, 0xc9, 0x43, 0xaa, 0x76, 0x85, + 0x78, 0x9b, 0x39, 0x29, 0x95, 0x64, 0x86, 0x2f, 0x99, 0x35, 0x87, 0x52, + 0x4e, 0x8f, 0x62, 0xc4, 0xb1, 0xc2, 0x48, 0x79, 0x84, 0x5a, 0x59, 0xcb, + 0xa8, 0x4a, 0x62, 0x61, 0x77, 0x4e, 0x98, 0x77, 0x86, 0x63, 0xb0, 0x52, + 0xc4, 0xb7, 0x95, 0x44, 0x43, 0x84, 0x7e, 0x88, 0x99, 0x9d, 0x70, 0x52, + 0xa3, 0x56, 0x4a, 0x47, 0x88, 0x7f, 0x7a, 0x40, 0xd4, 0x5f, 0x8c, 0x88, + 0x5f, 0xcd, 0xa5, 0x5e, 0x4d, 0x48, 0xb8, 0xc7, 0x78, 0x44, 0xd2, 0x6f, + 0x75, 0x53, 0x7a, 0x9e, 0x78, 0xc4, 0x7f, 0x87, 0x87, 0x4c, 0x7e, 0xd6, + 0xb0, 0x83, 0x6a, 0x65, 0xb6, 0x31, 0xb1, 0xb0, 0x6c, 0x51, 0x90, 0x7d, + 0xbb, 0x72, 0x48, 0x91, 0x5b, 0x74, 0x23, 0xb8, 0x9e, 0x1c, 0xc5, 0x64, + 0xd4, 0x48, 0x49, 0xb8, 0x37, 0x95, 0xb9, 0x4a, 0x6d, 0xbc, 0xc3, 0xbe, + 0x63, 0xa9, 0x77, 0x74, 0xb0, 0x44, 0xc6, 0x61, 0x81, 0x8b, 0x7f, 0x30, + 0x4f, 0x4d, 0x3f, 0x4c, 0xa7, 0xa3, 0x61, 0xb6, 0x95, 0x53, 0x7e, 0xbd, + 0xa0, 0xc4, 0x5e, 0x47, 0x61, 0xae, 0x5d, 0x37, 0xcc, 0x71, 0x7a, 0xd2, + 0x6b, 0x5a, 0x92, 0x59, 0x73, 0xa1, 0x5e, 0x6a, 0xb4, 0xc6, 0xaf, 0xdb, + 0xd4, 0x84, 0x60, 0x97, 0xa4, 0x8d, 0x51, 0xd2, 0xc8, 0xa6, 0x67, 0xba, + 0x52, 0x9e, 0xc4, 0x42, 0x63, 0xcf, 0x69, 0x3a, 0x7e, 0x4b, 0xb3, 0x53, + 0x97, 0xb9, 0x95, 0xa3, 0x8d, 0x98, 0x48, 0x46, 0xb4, 0x90, 0xbf, 0x94, + 0xb1, 0x91, 0x61, 0x7d, 0x51, 0xc3, 0xcf, 0x45, 0xa4, 0x61, 0xae, 0xa3, + 0xa7, 0xa4, 0xca, 0xa2, 0x47, 0x97, 0x7f, 0xcf, 0x4c, 0x50, 0x9d, 0xbf, + 0x36, 0x77, 0x49, 0x7d, 0xaf, 0x94, 0x8f, 0x6d, 0xd3, 0x5d, 0x40, 0x9f, + 0xbf, 0x77, 0x3f, 0x8b, 0xb9, 0x92, 0xb1, 0xab, 0x9c, 0xd0, 0x6b, 0x9a, + 0xb4, 0x49, 0x7d, 0xa1, 0xb9, 0x30, 0x72, 0xdb, 0x82, 0x5c, 0x66, 0x30, + 0x86, 0x99, 0x70, 0x4f, 0x87, 0xb4, 0xd0, 0x55, 0x72, 0x61, 0x51, 0x47, + 0xc4, 0x38, 0x4e, 0x61, 0x98, 0x3c, 0x29, 0xa9, 0xb0, 0xaf, 0x89, 0xae, + 0xa6, 0xb8, 0xbb, 0x4f, 0xd4, 0xb9, 0x53, 0x21, 0x4a, 0x71, 0xb8, 0x90, + 0xac, 0xa7, 0xa6, 0x67, 0x6a, 0x7d, 0x38, 0x73, 0x3e, 0x90, 0xb9, 0x6c, + 0x67, 0xb8, 0xa6, 0xc5, 0xc1, 0x4c, 0x49, 0x6f, 0xad, 0x8c, 0x8c, 0x96, + 0x84, 0x3b, 0xd7, 0x4c, 0x70, 0x76, 0x4e, 0x9b, 0xac, 0x96, 0xc2, 0x72, + 
0x5d, 0x92, 0x92, 0x9f, 0x97, 0x9b, 0xb4, 0xb2, 0xbb, 0x98, 0xb9, 0x73, + 0xa7, 0x41, 0x8d, 0x92, 0xbc, 0xb6, 0xb1, 0xaf, 0x76, 0x39, 0xc5, 0xbf, + 0x93, 0x7b, 0x67, 0xc5, 0x69, 0xa8, 0x4d, 0x73, 0x6b, 0xb7, 0xb5, 0xc2, + 0x83, 0xaf, 0xa3, 0x9d, 0xc6, 0x60, 0x3e, 0xd3, 0x79, 0x3a, 0x2b, 0x36, + 0xba, 0xc2, 0xbd, 0x88, 0xbe, 0x27, 0xbf, 0x3a, 0x86, 0x88, 0x62, 0xc0, + 0x3b, 0xa1, 0xcd, 0x50, 0x7f, 0xc8, 0xa7, 0x80, 0x33, 0xc2, 0x7b, 0xc2, + 0x45, 0xa9, 0x5f, 0x4b, 0x47, 0x94, 0x85, 0x3f, 0x65, 0xab, 0x9a, 0x8d, + 0x4d, 0x8e, 0xbc, 0xab, 0x53, 0x68, 0x85, 0xae, 0x73, 0x5f, 0x8f, 0x5e, + 0x54, 0x8b, 0x8b, 0xb7, 0xad, 0xa9, 0xbf, 0x52, 0xa0, 0xaf, 0x4a, 0x6e, + 0xb6, 0x85, 0x8c, 0x69, 0x5f, 0xd3, 0x3a, 0xbd, 0x79, 0xc7, 0x5c, 0xa8, + 0xbd, 0x87, 0x57, 0x7b, 0xb1, 0x6c, 0xc3, 0x4b, 0x42, 0xa5, 0x4b, 0x7b, + 0xa3, 0x4f, 0x67, 0x87, 0xb5, 0xbb, 0x82, 0x6c, 0xd5, 0x97, 0x5d, 0xc5, + 0x5e, 0x37, 0x44, 0x3e, 0xb3, 0x78, 0xa5, 0xa2, 0x76, 0xd1, 0xb5, 0x7b, + 0x4f, 0xc7, 0xc8, 0x43, 0x6c, 0x87, 0x40, 0x51, 0x33, 0xb3, 0xae, 0x74, + 0x45, 0x97, 0x68, 0x59, 0xc5, 0x39, 0x66, 0x4d, 0xb1, 0xb2, 0x4f, 0x6e, + 0x6b, 0x58, 0x54, 0x5b, 0x49, 0xa4, 0xb3, 0x81, 0x3e, 0x97, 0x47, 0xa3, + 0xb0, 0x9d, 0x71, 0x7c, 0x4b, 0x62, 0xb3, 0x3a, 0xa1, 0xa5, 0xc0, 0x68, + 0xc3, 0x8c, 0xc6, 0x58, 0x9d, 0x53, 0x58, 0x3b, 0xa1, 0x3c, 0xbd, 0xc2, + 0x56, 0x77, 0x37, 0x85, 0xb6, 0xdc, 0x3d, 0x59, 0x80, 0x7c, 0x90, 0x2c, + 0x79, 0x71, 0xa6, 0x8f, 0x8c, 0x53, 0xba, 0x70, 0x99, 0xc6, 0x52, 0x64, + 0xc8, 0x8c, 0x76, 0x7a, 0x67, 0x79, 0x8f, 0xc0, 0x3d, 0x4b, 0xad, 0x54, + 0xc2, 0xce, 0x66, 0x88, 0xd0, 0xb2, 0x91, 0xa0, 0x43, 0x7c, 0x74, 0xa7, + 0x59, 0x38, 0xb0, 0xc1, 0x9e, 0x2d, 0x3c, 0x8c, 0x56, 0x71, 0xcb, 0x83, + 0x42, 0xcd, 0x3f, 0x8f, 0x92, 0x88, 0x42, 0xc7, 0x4c, 0xd0, 0x33, 0x2f, + 0x7d, 0x77, 0xc5, 0x46, 0xc3, 0xc3, 0x6c, 0xa5, 0x7d, 0x6c, 0xc3, 0x92, + 0xc5, 0x4d, 0x5e, 0x47, 0x6b, 0x9e, 0x4c, 0x4c, 0x94, 0xc8, 0x4d, 0x5c, + 0xc9, 0xc9, 0x50, 0xc0, 0x64, 0xb4, 0x64, 0xbe, 0xd3, 0x66, 0x8f, 0x84, + 0x90, 0x7d, 0xcf, 0x9b, 0x8f, 0x3d, 0x97, 0x88, 0x50, 0x42, 0xa9, 0x5d, + 0x79, 0x5b, 0x31, 0x98, 0x6d, 0x39, 0xbe, 0x82, 0x60, 0xc1, 0x7f, 0xc3, + 0x83, 0x8c, 0x91, 0x95, 0x4d, 0xa7, 0x4f, 0x67, 0x37, 0x57, 0x41, 0xcc, + 0xc0, 0xbc, 0x5e, 0xa7, 0xd3, 0xbd, 0x8c, 0x3f, 0x68, 0x33, 0x6e, 0x6f, + 0x73, 0x72, 0xd5, 0x99, 0xd8, 0xad, 0x5c, 0xaf, 0xbf, 0xb0, 0x87, 0x2c, + 0x6c, 0x76, 0x90, 0x8e, 0xb7, 0xa1, 0x2f, 0xbd, 0x76, 0x7b, 0xc9, 0x3e, + 0xb7, 0xa9, 0xab, 0xbc, 0x67, 0x82, 0x7a, 0x73, 0x91, 0x45, 0x50, 0x27, + 0xa8, 0x12, 0x82, 0x38, 0x65, 0x5a, 0x3c, 0x3b, 0xd7, 0x61, 0x55, 0xaf, + 0x92, 0xb7, 0x38, 0x4f, 0xc3, 0x5a, 0xb3, 0xae, 0x60, 0x7e, 0x5d, 0x6e, + 0x30, 0x38, 0x75, 0x32, 0xc4, 0x48, 0x47, 0x61, 0xc1, 0x7a, 0x34, 0x93, + 0xd0, 0x85, 0xcf, 0xac, 0x84, 0x96, 0x87, 0x9b, 0x54, 0x3f, 0x90, 0x87, + 0xb2, 0x9b, 0xbc, 0x34, 0x36, 0x95, 0xb9, 0x5b, 0x8b, 0x92, 0x56, 0x94, + 0xab, 0x7e, 0x7d, 0x99, 0x38, 0xd0, 0x71, 0x71, 0x83, 0xae, 0xe1, 0xa2, + 0x44, 0xdd, 0xa6, 0xc6, 0x84, 0x92, 0x5b, 0xca, 0x8e, 0x8f, 0x68, 0x49, + 0x55, 0x6b, 0x80, 0x7d, 0x43, 0x81, 0x42, 0x9c, 0x97, 0xc3, 0x63, 0x8d, + 0xb5, 0x5a, 0xaf, 0x67, 0xc1, 0x90, 0xc4, 0xbb, 0x5b, 0x5c, 0xd1, 0xd3, + 0x5a, 0x38, 0x5f, 0xb7, 0x5f, 0x89, 0x8a, 0xbd, 0x44, 0x60, 0x4c, 0xad, + 0x52, 0x3a, 0xb4, 0x49, 0x8e, 0x6f, 0x91, 0x5e, 0x34, 0x26, 0x3f, 0xcf, + 0xc2, 0x85, 0x92, 0x86, 0xc4, 0xc7, 0x48, 0x31, 0x4c, 0x5b, 0x2a, 0x37, + 0x76, 0xbf, 0x5d, 0xaa, 0x3a, 0x41, 0x46, 0xcd, 0x87, 0x8b, 0x82, 0x60, + 0x9a, 0x5f, 0xad, 0xd7, 0xa6, 0x50, 0x97, 0xd1, 0x88, 0xae, 0xa9, 0x3e, + 
0xb8, 0xd4, 0xd7, 0x65, 0xba, 0x90, 0xba, 0x85, 0xc0, 0x82, 0xb0, 0x71, + 0x44, 0x6f, 0xb7, 0x8e, 0x3b, 0x63, 0x2a, 0x25, 0x6f, 0x7d, 0x2d, 0xac, + 0x72, 0xa0, 0x58, 0x77, 0xa0, 0xa0, 0x7d, 0x8e, 0x52, 0x77, 0xbb, 0x75, + 0x5e, 0xc8, 0x6c, 0x7f, 0xc6, 0x62, 0xbb, 0xad, 0x7f, 0x35, 0x6a, 0x7a, + 0x42, 0x51, 0x4f, 0x6b, 0x68, 0xbd, 0x7e, 0x70, 0x99, 0x58, 0xa8, 0x87, + 0xc4, 0xaa, 0x85, 0x9c, 0x81, 0x63, 0xa6, 0x5f, 0x4c, 0x52, 0x56, 0x8a, + 0x65, 0xb6, 0x86, 0xb0, 0x47, 0x8d, 0x6b, 0x6c, 0x7a, 0x66, 0x75, 0x7f, + 0x68, 0x6b, 0x2e, 0x4f, 0x4d, 0x6e, 0x8c, 0x63, 0x3d, 0x3d, 0x85, 0xbb, + 0xb0, 0x5b, 0xce, 0x87, 0x84, 0x91, 0x64, 0xab, 0x93, 0x68, 0x78, 0xb4, + 0x9f, 0xa0, 0x3f, 0x7a, 0x8a, 0xa4, 0xbf, 0x49, 0x33, 0xbb, 0x70, 0xa4, + 0x8c, 0x46, 0xaf, 0xa7, 0x4d, 0x6f, 0x46, 0x4b, 0xbe, 0x91, 0x76, 0x2d, + 0x5e, 0x44, 0xb2, 0xb8, 0x78, 0xb1, 0x94, 0x4a, 0x4f, 0xc2, 0x92, 0x6a, + 0x80, 0x7d, 0xc9, 0x82, 0xa5, 0x7b, 0xa2, 0x76, 0x9f, 0x54, 0xbf, 0xa2, + 0xb1, 0x76, 0x7a, 0x6f, 0x98, 0xbf, 0x3d, 0x7c, 0x62, 0x9a, 0x6e, 0x54, + 0x31, 0x40, 0xbe, 0xbe, 0xa3, 0x54, 0xc5, 0xc9, 0x48, 0x69, 0x40, 0x91, + 0x4b, 0xb7, 0x4f, 0xbd, 0x49, 0xcf, 0xcf, 0x70, 0xb7, 0x67, 0x39, 0x47, + 0x75, 0x6c, 0x57, 0x47, 0xd3, 0xb3, 0x4a, 0x72, 0x6f, 0xae, 0x35, 0x81, + 0x94, 0x96, 0x4e, 0x49, 0xb8, 0xcd, 0x62, 0xac, 0xcd, 0xce, 0x8a, 0x41, + 0x78, 0xb1, 0x88, 0x5a, 0xad, 0x38, 0xa0, 0x7a, 0x69, 0x3f, 0x69, 0xc6, + 0xb6, 0x7b, 0x54, 0xbf, 0x43, 0xa3, 0x77, 0xdf, 0xc7, 0xc1, 0xaa, 0x4e, + 0xbe, 0x73, 0x78, 0x8a, 0xd3, 0xb5, 0x49, 0x31, 0x96, 0xd7, 0x9f, 0x8d, + 0x80, 0xc1, 0x4a, 0x84, 0xb1, 0xc6, 0x39, 0x64, 0xb7, 0x8d, 0xc3, 0x6d, + 0x95, 0x58, 0x80, 0x4a, 0xce, 0x5b, 0xb9, 0x72, 0x3e, 0x79, 0x51, 0xb8, + 0x3c, 0x8b, 0x63, 0x96, 0x32, 0x65, 0xb4, 0xb4, 0x78, 0x56, 0x98, 0x30, + 0x98, 0x89, 0xa1, 0x4d, 0x80, 0xd0, 0x61, 0xb3, 0x45, 0x9c, 0x3c, 0x5e, + 0xca, 0xa5, 0xa0, 0x73, 0xb4, 0xd6, 0xa3, 0x5a, 0x8a, 0x94, 0x9f, 0xc5, + 0x31, 0x5e, 0x69, 0xc5, 0x2f, 0x60, 0xdd, 0x69, 0x3e, 0x5a, 0x6e, 0xa4, + 0x9f, 0xb8, 0xad, 0x40, 0xa6, 0xad, 0x5e, 0x66, 0x5c, 0x8b, 0xb4, 0x5e, + 0x9f, 0x90, 0x40, 0xd0, 0x52, 0x83, 0xb6, 0x97, 0x86, 0xb3, 0x85, 0x3b, + 0xaf, 0x4e, 0x4a, 0x65, 0x78, 0xbf, 0x72, 0xca, 0x3f, 0x52, 0x4f, 0x3a, + 0x8e, 0xd3, 0xc3, 0x36, 0xd2, 0x90, 0xd4, 0xb0, 0x30, 0x51, 0x83, 0x44, + 0x38, 0x72, 0xa7, 0x58, 0x6b, 0x7b, 0xc2, 0xb4, 0x92, 0xbb, 0x54, 0xaf, + 0x2a, 0xc4, 0x85, 0x71, 0x98, 0xae, 0x9d, 0x48, 0x45, 0x8e, 0x5a, 0x54, + 0x3c, 0x50, 0x4a, 0xb2, 0x7e, 0xb5, 0xd6, 0xb7, 0x98, 0x77, 0x66, 0x8d, + 0x71, 0x26, 0xc1, 0xba, 0x6b, 0xc6, 0xc7, 0x4e, 0x6a, 0x8a, 0x7a, 0xa1, + 0x65, 0x77, 0x64, 0x2d, 0xd6, 0x5d, 0x5e, 0x6c, 0xc2, 0x82, 0x3b, 0x2f, + 0x90, 0x5a, 0xa0, 0x6a, 0xc1, 0x7a, 0x42, 0xc5, 0x4a, 0xa7, 0x46, 0x3e, + 0xd9, 0x65, 0xac, 0x83, 0xb8, 0x71, 0x7c, 0xa4, 0x3a, 0xc6, 0xc1, 0x31, + 0x67, 0x52, 0xbb, 0x89, 0xc4, 0x9e, 0x7c, 0x5d, 0x92, 0xe3, 0x58, 0xc6, + 0x77, 0xbf, 0xb7, 0x95, 0x70, 0x8a, 0x40, 0x3b, 0xb1, 0x56, 0x62, 0x8e, + 0xc7, 0xd1, 0x93, 0xc7, 0xa8, 0xa2, 0xa6, 0xa0, 0x94, 0x9d, 0xab, 0xa4, + 0x76, 0x3b, 0xda, 0x4f, 0x4b, 0x52, 0xa2, 0x7f, 0xc5, 0x97, 0x97, 0x69, + 0xd4, 0xca, 0x88, 0xc1, 0x89, 0x5e, 0xb3, 0x9d, 0xb8, 0x79, 0x8c, 0x34, + 0x4c, 0xa7, 0x5f, 0xa3, 0x5a, 0x3b, 0xb2, 0x44, 0x4c, 0x61, 0x7a, 0x52, + 0x35, 0x70, 0xbc, 0xcf, 0xa5, 0x65, 0x72, 0x7e, 0x5b, 0xaf, 0xc7, 0x66, + 0x64, 0x55, 0x3d, 0x67, 0x63, 0x4d, 0xb6, 0xb3, 0x73, 0xc0, 0x30, 0xac, + 0x3c, 0xac, 0xab, 0xb0, 0xc5, 0x44, 0xc0, 0x85, 0x31, 0xb4, 0x9a, 0xbf, + 0x5a, 0xc7, 0x45, 0x5a, 0x67, 0x5a, 0x51, 0x8a, 0x69, 0x99, 0x4c, 0x5f, + 
0xc8, 0xbc, 0x74, 0xb8, 0x32, 0x8f, 0xd8, 0xd7, 0xbc, 0x60, 0x73, 0x67, + 0x47, 0xab, 0xa7, 0xbd, 0xa2, 0x50, 0xb0, 0x75, 0x67, 0xa2, 0xca, 0x71, + 0xd4, 0x4a, 0x40, 0x57, 0x7d, 0xd3, 0x66, 0xb7, 0x9c, 0x6c, 0x33, 0x4b, + 0xc1, 0xda, 0xa5, 0xb8, 0x66, 0x9e, 0xc1, 0xbd, 0x7f, 0xcb, 0x32, 0x64, + 0xab, 0xa6, 0x87, 0xaa, 0x79, 0x67, 0x38, 0x9c, 0xba, 0x8d, 0x9a, 0x2c, + 0x60, 0x48, 0x8b, 0x92, 0xc7, 0x85, 0x5a, 0xaa, 0xbf, 0x7b, 0x63, 0x60, + 0x59, 0xa1, 0x70, 0x75, 0xbe, 0xa2, 0x2c, 0x57, 0xc3, 0xbb, 0x77, 0x5f, + 0x61, 0xd2, 0x88, 0x4b, 0xa5, 0x3b, 0xbf, 0x86, 0x66, 0x5c, 0x75, 0x81, + 0x37, 0x8a, 0x32, 0xb1, 0x30, 0xc4, 0x8f, 0x9d, 0xb3, 0x35, 0x92, 0xad, + 0xb8, 0x8d, 0xc9, 0x9f, 0x68, 0x8f, 0xbc, 0x83, 0x54, 0xbb, 0x64, 0x73, + 0xd2, 0x4a, 0x56, 0x98, 0xcd, 0xc3, 0x31, 0x61, 0x47, 0x61, 0x3d, 0x3d, + 0x3c, 0xd0, 0x6b, 0x8c, 0xc2, 0x80, 0x63, 0x3d, 0xa3, 0x7b, 0x4e, 0x72, + 0x37, 0x8f, 0x5b, 0x4d, 0xa6, 0x39, 0x4c, 0xcd, 0x5a, 0x85, 0xc0, 0x57, + 0x6f, 0xc3, 0xb3, 0x76, 0x7e, 0x43, 0x69, 0x66, 0x83, 0xb7, 0x69, 0xc2, + 0xbe, 0xca, 0x92, 0x9f, 0x67, 0x42, 0xca, 0xc5, 0x79, 0x9a, 0xb8, 0xb5, + 0xc5, 0x76, 0x6f, 0x8e, 0x81, 0x58, 0x83, 0xb9, 0x46, 0x50, 0x9a, 0x36, + 0x83, 0xac, 0x43, 0xaf, 0x37, 0x97, 0x6a, 0xcb, 0x7f, 0x89, 0x8e, 0x85, + 0x72, 0x78, 0x59, 0x68, 0x8e, 0xc5, 0x3a, 0x7c, 0xba, 0xc9, 0x82, 0x87, + 0x7b, 0xd4, 0x84, 0x5d, 0xa6, 0x42, 0x6b, 0xb7, 0xa3, 0x94, 0x65, 0x95, + 0x4b, 0x7d, 0x36, 0x74, 0xae, 0x5d, 0xb9, 0x83, 0xd5, 0xa1, 0x9c, 0xd2, + 0xd3, 0x49, 0xa9, 0xbc, 0xb7, 0x8d, 0xc1, 0x6b, 0x78, 0x46, 0x94, 0xb6, + 0xae, 0x3b, 0x5f, 0x3f, 0x3a, 0x47, 0x50, 0x67, 0x97, 0x7a, 0x8f, 0x37, + 0x66, 0x9f, 0x5d, 0xaf, 0x4c, 0x84, 0xa9, 0x95, 0x96, 0x44, 0xad, 0x55, + 0xc3, 0x78, 0xce, 0x32, 0xc6, 0x3f, 0xc3, 0x55, 0x48, 0x70, 0xd3, 0x42, + 0x69, 0xa7, 0x86, 0xcb, 0x88, 0x8e, 0x4b, 0xb2, 0xc5, 0xbb, 0xa4, 0xb4, + 0x73, 0xbb, 0x52, 0x37, 0x92, 0x7f, 0xa9, 0x73, 0x50, 0xb6, 0x59, 0x6b, + 0x80, 0x66, 0x56, 0xb1, 0xc3, 0x97, 0x4d, 0x64, 0x63, 0x6e, 0x3a, 0x71, + 0x54, 0x3a, 0x66, 0xd2, 0x74, 0x41, 0x4d, 0xa6, 0x63, 0x52, 0x7c, 0x3a, + 0x54, 0x35, 0x63, 0x71, 0x82, 0x6b, 0x41, 0x59, 0x4f, 0x9d, 0x95, 0x38, + 0x90, 0xc2, 0xc7, 0x87, 0xa3, 0x5c, 0x41, 0x8a, 0xc8, 0x4c, 0xb0, 0x5e, + 0x8b, 0x69, 0x9e, 0x39, 0x53, 0x52, 0x41, 0x8a, 0x6e, 0x51, 0x34, 0x41, + 0x84, 0x5a, 0x6c, 0x72, 0x36, 0x38, 0xbf, 0xc5, 0xa9, 0xb7, 0x93, 0x30, + 0xd0, 0x4c, 0x73, 0x85, 0x94, 0xab, 0x97, 0x3f, 0x6a, 0xcf, 0x9d, 0x69, + 0x4a, 0x60, 0x82, 0xc0, 0x4a, 0x3a, 0x3e, 0xc1, 0xb5, 0x32, 0x5a, 0x3f, + 0x4a, 0x70, 0xb7, 0x51, 0xb3, 0x4a, 0x73, 0x31, 0x89, 0xb4, 0x53, 0x9a, + 0xbf, 0x82, 0x82, 0xc7, 0x87, 0x49, 0x5f, 0x92, 0x81, 0xc6, 0x30, 0xc3, + 0x34, 0x51, 0x8d, 0x36, 0x30, 0x7a, 0x78, 0xa7, 0x9a, 0xa7, 0x5a, 0xa9, + 0x68, 0x73, 0x39, 0x71, 0x90, 0x8a, 0x88, 0x75, 0x4b, 0xd5, 0x45, 0x9b, + 0x47, 0xc0, 0x35, 0x76, 0xac, 0xcf, 0xb5, 0xc1, 0x90, 0x78, 0x99, 0xa2, + 0x88, 0xc7, 0x7e, 0xcb, 0x6a, 0xa0, 0x4c, 0x54, 0x52, 0x85, 0x6a, 0x46, + 0x77, 0xc1, 0x58, 0x4e, 0x80, 0xa4, 0xab, 0x48, 0xa0, 0xc4, 0x61, 0xb1, + 0x52, 0x38, 0x99, 0x97, 0x68, 0x6d, 0x50, 0x74, 0x47, 0xb5, 0xb8, 0x3f, + 0x8f, 0x7f, 0x3c, 0x8a, 0xa3, 0x5f, 0x8b, 0xbf, 0x8b, 0x6b, 0x7d, 0x96, + 0x60, 0x70, 0xd0, 0xd0, 0x6e, 0x9a, 0xb5, 0xa6, 0x53, 0x85, 0x75, 0x72, + 0x60, 0xba, 0xb8, 0xb2, 0x47, 0xbe, 0xc2, 0xa7, 0xaa, 0x3f, 0x76, 0x7d, + 0xcb, 0x68, 0x64, 0x73, 0x7a, 0xc3, 0x68, 0x8f, 0x71, 0xa0, 0x87, 0xa2, + 0x80, 0x65, 0x6c, 0xca, 0x9e, 0x56, 0x82, 0x89, 0x9f, 0x3b, 0x5d, 0x7d, + 0xcf, 0x80, 0xc0, 0x68, 0x70, 0x72, 0x73, 0xc0, 0x81, 0xa3, 0x78, 0x5d, + 
0xba, 0x51, 0x4b, 0x88, 0xa0, 0xbe, 0x38, 0xc8, 0x44, 0x71, 0x45, 0xd7, + 0x76, 0x49, 0x75, 0x7e, 0x7b, 0x5d, 0xb0, 0x36, 0xca, 0x9b, 0x97, 0x42, + 0x99, 0x70, 0xb1, 0x69, 0x47, 0x6e, 0xa5, 0xa0, 0x96, 0x5a, 0x6e, 0x8b, + 0x6a, 0x3f, 0xa7, 0x41, 0x4b, 0x88, 0x70, 0x4c, 0x8c, 0x82, 0xaa, 0x60, + 0xab, 0x3c, 0x5e, 0x59, 0x68, 0xc1, 0xa0, 0x95, 0x80, 0x3d, 0x6e, 0x9a, + 0x64, 0x73, 0xbd, 0xa8, 0x71, 0x8c, 0x53, 0xd2, 0x82, 0x55, 0x70, 0xac, + 0x97, 0x60, 0x57, 0x94, 0xbd, 0x6c, 0x44, 0x8f, 0x70, 0xb9, 0x76, 0x33, + 0x7f, 0x95, 0x8f, 0x5a, 0x58, 0x77, 0x87, 0x4d, 0x67, 0x9a, 0x60, 0x51, + 0x65, 0x56, 0x39, 0x75, 0x7c, 0x58, 0xaa, 0x67, 0xc0, 0xb4, 0x53, 0x8a, + 0x5a, 0x39, 0x6a, 0xaf, 0x5c, 0x3c, 0xcb, 0x35, 0x3a, 0x5f, 0xb0, 0x83, + 0x5a, 0x83, 0x3d, 0x46, 0xb5, 0x34, 0x47, 0x75, 0x4f, 0x48, 0x7f, 0x99, + 0x8d, 0x7a, 0x98, 0xbf, 0x6a, 0xab, 0xc0, 0x60, 0xce, 0x4d, 0xb8, 0x3d, + 0x99, 0x44, 0x42, 0x3c, 0x5d, 0x3f, 0xd5, 0x83, 0x4e, 0x41, 0x61, 0x6b, + 0x8b, 0xa8, 0x62, 0x6f, 0xd6, 0xb2, 0x61, 0x51, 0x59, 0x43, 0x90, 0x8b, + 0x3f, 0xab, 0xcb, 0x8b, 0xa9, 0x69, 0x86, 0x79, 0x8a, 0x36, 0xd1, 0xac, + 0x4b, 0x42, 0xd1, 0x8a, 0x35, 0x67, 0x3b, 0x3c, 0xa7, 0x53, 0xa6, 0x86, + 0xd1, 0x74, 0x99, 0x9b, 0x93, 0x8b, 0x35, 0x87, 0x5c, 0x6b, 0xb2, 0xcd, + 0x5d, 0xbe, 0x6d, 0x58, 0xb0, 0xd6, 0xb6, 0xd3, 0x86, 0xac, 0x32, 0xb8, + 0x39, 0x3c, 0x73, 0x6a, 0x5a, 0x42, 0xc0, 0x74, 0xce, 0x2d, 0x87, 0xad, + 0xcb, 0x64, 0xa9, 0x5a, 0xce, 0x71, 0xab, 0xa7, 0x58, 0x59, 0xb4, 0x31, + 0x64, 0x51, 0x72, 0xc6, 0x49, 0x63, 0x3c, 0xb9, 0xbd, 0xa8, 0x9d, 0xa6, + 0x36, 0x9d, 0x61, 0x94, 0xc0, 0x69, 0xac, 0x5d, 0x54, 0xb7, 0x63, 0x76, + 0x46, 0xac, 0xaa, 0x89, 0xad, 0x9e, 0x4e, 0xcd, 0xc3, 0x96, 0x6a, 0x65, + 0x4b, 0x8c, 0xd3, 0x51, 0xb0, 0xa8, 0x41, 0x98, 0x55, 0x7f, 0x8c, 0xc7, + 0x47, 0x42, 0x77, 0x66, 0xb7, 0xbd, 0x47, 0x80, 0x38, 0x9d, 0x64, 0x8d, + 0xab, 0xae, 0x48, 0x8e, 0x78, 0x98, 0x8f, 0x78, 0x89, 0x7d, 0x76, 0xcb, + 0xb7, 0xaa, 0xa4, 0x45, 0xd5, 0x5a, 0xb4, 0xa7, 0x57, 0xd3, 0x55, 0xb9, + 0xd1, 0x4b, 0x72, 0x94, 0x69, 0x9a, 0x8c, 0x33, 0x99, 0xb5, 0xa5, 0xa8, + 0xa5, 0x53, 0xcc, 0xa9, 0x86, 0xbc, 0x3d, 0x4d, 0x47, 0x8f, 0xa9, 0x3f, + 0x93, 0x6e, 0x63, 0x90, 0xb3, 0x72, 0x3b, 0xb7, 0x84, 0xa6, 0x3b, 0x44, + 0x6c, 0x6b, 0xb9, 0x8d, 0xcb, 0x3b, 0x6a, 0x6c, 0xa1, 0xc5, 0xa1, 0x39, + 0x9c, 0xc0, 0x95, 0x52, 0x90, 0xcf, 0x94, 0x3b, 0xa6, 0xc9, 0xbd, 0x95, + 0x91, 0xa8, 0xc7, 0x48, 0x4d, 0x70, 0x7c, 0x96, 0x77, 0x61, 0x5a, 0x7c, + 0xc8, 0x5d, 0x43, 0x4b, 0x78, 0x35, 0xb5, 0x5a, 0x6c, 0x84, 0x46, 0xce, + 0x72, 0x8b, 0x90, 0x42, 0x48, 0xb6, 0xa8, 0xa5, 0x50, 0xb7, 0x51, 0xb9, + 0xc0, 0x9a, 0x86, 0x35, 0x98, 0x2c, 0xcf, 0x3f, 0x6a, 0x76, 0xae, 0x6b, + 0x71, 0x80, 0x96, 0x99, 0xb1, 0x53, 0x62, 0x38, 0x8d, 0xb1, 0xbc, 0xc9, + 0xc3, 0xb2, 0xd2, 0x50, 0xb6, 0x8b, 0x49, 0x63, 0xa4, 0xb5, 0x31, 0x74, + 0x4c, 0x54, 0x48, 0x77, 0xbc, 0x70, 0xb0, 0x74, 0x39, 0xc6, 0x71, 0x62, + 0x7b, 0xa9, 0x94, 0x48, 0xad, 0xc9, 0xc1, 0xcb, 0x9a, 0x5c, 0x50, 0xc3, + 0x5b, 0x64, 0x5a, 0x39, 0x7a, 0x59, 0xc3, 0x3f, 0x4b, 0x70, 0x3c, 0x38, + 0xb4, 0xac, 0x7a, 0x65, 0x8a, 0xbb, 0xc3, 0x9c, 0xc5, 0xb1, 0x42, 0x68, + 0x7b, 0xd2, 0xb5, 0x60, 0x7b, 0x97, 0xa0, 0x9f, 0x3e, 0xad, 0x57, 0x64, + 0xb1, 0x44, 0x6e, 0x40, 0x5e, 0xb7, 0xc3, 0x6f, 0x45, 0x69, 0x71, 0xb8, + 0xb4, 0xd2, 0xa4, 0x68, 0x88, 0x8a, 0x5e, 0xb4, 0x60, 0xbc, 0x90, 0xab, + 0xaa, 0x35, 0x91, 0x78, 0x5a, 0x71, 0x7a, 0x4f, 0x51, 0x4d, 0x87, 0xc8, + 0x96, 0x54, 0xa0, 0x55, 0x75, 0x8c, 0x69, 0x58, 0x71, 0x4b, 0xbd, 0xcf, + 0xab, 0x74, 0x86, 0x3a, 0x51, 0xc5, 0x55, 0x5b, 0x97, 0x48, 0x80, 0x5d, + 
0x34, 0xba, 0xac, 0xc9, 0x60, 0x71, 0xab, 0x68, 0xa8, 0x40, 0xbc, 0xa8, + 0xc3, 0x6d, 0x89, 0xaa, 0xc7, 0xb9, 0x6f, 0x5d, 0x99, 0xcd, 0xd5, 0xae, + 0xaa, 0x7b, 0x5d, 0xa8, 0x74, 0x5b, 0xbc, 0x82, 0x92, 0xbf, 0xa6, 0xa8, + 0x99, 0x32, 0x57, 0xae, 0x98, 0x7f, 0x56, 0x2f, 0x9c, 0x45, 0x43, 0xa7, + 0x65, 0x53, 0xb6, 0xa9, 0x67, 0xa9, 0x5e, 0x75, 0x73, 0x78, 0x40, 0x3c, + 0x3d, 0x8c, 0x4c, 0x5f, 0xc3, 0x70, 0x88, 0xa9, 0xb3, 0x9a, 0xab, 0xc3, + 0xc3, 0xd0, 0x72, 0xbb, 0xcc, 0x81, 0x4d, 0x45, 0x54, 0xcd, 0x3f, 0x48, + 0xc5, 0xc6, 0xc2, 0x33, 0x52, 0xb7, 0x46, 0x7b, 0xc3, 0x6d, 0x3c, 0x65, + 0x95, 0xd1, 0x62, 0x99, 0xad, 0xa5, 0x6e, 0xb2, 0x76, 0x83, 0x6d, 0xca, + 0x46, 0x52, 0x55, 0x59, 0x68, 0x3f, 0x59, 0x72, 0x6a, 0x81, 0x7c, 0x92, + 0x8f, 0xa6, 0x6c, 0xb0, 0x67, 0xc7, 0x48, 0xb7, 0x42, 0xb0, 0x8a, 0x83, + 0xd0, 0x78, 0x57, 0x9b, 0x84, 0xbb, 0xa3, 0x7a, 0x64, 0xa7, 0xc2, 0x8a, + 0x43, 0x38, 0xa5, 0xd4, 0xc5, 0x91, 0x46, 0x9f, 0x6b, 0x58, 0x9a, 0x33, + 0xc2, 0xa3, 0x86, 0x4a, 0xac, 0xbd, 0xcb, 0x8d, 0x53, 0x3f, 0x35, 0xce, + 0x69, 0x31, 0x55, 0x4d, 0x8e, 0xcd, 0x58, 0x3f, 0x88, 0xd2, 0xc6, 0x7e, + 0x61, 0x88, 0xab, 0x5b, 0x3f, 0xcf, 0xd4, 0x7c, 0x79, 0x54, 0x53, 0xc5, + 0xba, 0x89, 0x83, 0x89, 0xa7, 0xa1, 0x52, 0x61, 0x71, 0xa2, 0x8e, 0x64, + 0xa9, 0xb2, 0x72, 0x86, 0xc1, 0xa6, 0x55, 0xa2, 0x55, 0x72, 0x7c, 0x89, + 0x44, 0xb8, 0x64, 0x3e, 0x93, 0x8d, 0xbf, 0x3d, 0x58, 0x4d, 0x43, 0x4d, + 0x3a, 0xb2, 0xa3, 0x45, 0x9e, 0x38, 0x67, 0x79, 0xa0, 0x3f, 0x37, 0x81, + 0xa6, 0xb6, 0x72, 0xbb, 0xb7, 0x79, 0x57, 0xb5, 0x92, 0xad, 0x90, 0x57, + 0xbb, 0x4c, 0xbf, 0xc8, 0xce, 0x61, 0xbf, 0xb1, 0xce, 0xad, 0xac, 0x4c, + 0x5f, 0x6e, 0xb5, 0x9f, 0x99, 0x4b, 0xaf, 0x53, 0x55, 0x3a, 0x5e, 0xb1, + 0x9b, 0x66, 0xd1, 0x9f, 0x6d, 0x84, 0x5d, 0xcc, 0x77, 0x5e, 0x5f, 0x46, + 0x5f, 0x71, 0x95, 0x46, 0x7d, 0x3a, 0x70, 0x60, 0x35, 0x92, 0xa4, 0x64, + 0xae, 0xad, 0x71, 0x4b, 0x41, 0xb5, 0xb4, 0xc5, 0x5e, 0xbb, 0x8d, 0x41, + 0xc0, 0xb4, 0x5d, 0x41, 0x8a, 0x3f, 0x34, 0x78, 0x5b, 0x35, 0x9a, 0x8f, + 0xb7, 0xc9, 0xc3, 0x6f, 0x4d, 0xab, 0x8f, 0x78, 0xd0, 0x76, 0xb3, 0x97, + 0xc8, 0x80, 0xd1, 0xd4, 0xa7, 0x31, 0xb5, 0xc6, 0x65, 0xb0, 0xc7, 0x9e, + 0x67, 0x49, 0xcd, 0xc8, 0x5d, 0x38, 0xaf, 0x3c, 0x84, 0x58, 0x95, 0x7f, + 0x5a, 0xd2, 0xca, 0x5b, 0xca, 0x9f, 0xac, 0x69, 0xc9, 0x77, 0x55, 0x49, + 0x96, 0x72, 0x61, 0x9c, 0xb0, 0x37, 0x55, 0xc3, 0x9f, 0x50, 0x3b, 0xc1, + 0x4f, 0x3d, 0x84, 0x51, 0x4a, 0x9e, 0x79, 0x92, 0xcf, 0x39, 0x8c, 0xca, + 0x4c, 0x42, 0x5b, 0x78, 0x38, 0x50, 0x44, 0xc2, 0x98, 0xad, 0x41, 0xa8, + 0x66, 0x5b, 0x67, 0x3e, 0x6e, 0x31, 0x96, 0xb4, 0xc4, 0xc4, 0x5c, 0x32, + 0x32, 0xcc, 0xa1, 0x75, 0x8c, 0x9d, 0x7a, 0xd0, 0x94, 0x32, 0x5c, 0x4b, + 0x4e, 0x41, 0xcd, 0xc0, 0x45, 0xa7, 0xc5, 0x33, 0x5b, 0x39, 0x5a, 0x79, + 0xd2, 0x9c, 0x89, 0xb0, 0x92, 0x3b, 0xa5, 0x74, 0x68, 0x98, 0x50, 0x8e, + 0x41, 0x64, 0x39, 0x90, 0x42, 0x33, 0xb9, 0x3b, 0xa6, 0x98, 0x35, 0x99, + 0x56, 0x84, 0x6d, 0x53, 0x69, 0x82, 0x80, 0x73, 0x41, 0x7e, 0x5f, 0x56, + 0x43, 0x84, 0x57, 0x80, 0x51, 0xc5, 0x5a, 0xa3, 0x7d, 0x7e, 0xb0, 0x9f, + 0x38, 0x9d, 0xc7, 0x48, 0x7a, 0x4e, 0x32, 0x79, 0x85, 0xcf, 0xa0, 0xa3, + 0xc5, 0xa6, 0x6b, 0x88, 0x92, 0x94, 0xc2, 0xc0, 0x48, 0xd0, 0x4c, 0xba, + 0x61, 0x63, 0xae, 0x5c, 0xb2, 0x5f, 0x3c, 0x48, 0x91, 0x5b, 0xd1, 0xb2, + 0x8b, 0x76, 0x4d, 0x50, 0x49, 0x8b, 0x3e, 0x98, 0x7d, 0xca, 0x4a, 0xc2, + 0x98, 0x3b, 0x61, 0x61, 0x3c, 0xc3, 0xbb, 0xba, 0x72, 0xaf, 0xac, 0x2f, + 0x90, 0x72, 0x9e, 0x49, 0x87, 0x93, 0x86, 0xab, 0x52, 0xc5, 0x2e, 0x3c, + 0x90, 0x27, 0x4f, 0x84, 0x48, 0xad, 0x6d, 0x94, 0xa3, 0x88, 0x63, 0xc5, + 
0xc2, 0x26, 0xd2, 0x8d, 0x43, 0xb4, 0x7d, 0xce, 0x81, 0xcf, 0x98, 0x3e, + 0x4e, 0x98, 0x8c, 0xc6, 0x48, 0x81, 0x42, 0x8b, 0xd0, 0x37, 0x31, 0xb7, + 0x72, 0x2a, 0x2b, 0x74, 0x68, 0xab, 0x4d, 0x47, 0x71, 0xa9, 0xa0, 0xb6, + 0xd6, 0x54, 0x53, 0xac, 0x9b, 0x5d, 0x83, 0xa2, 0x83, 0x51, 0x92, 0x5d, + 0xcf, 0xc8, 0x9e, 0x7f, 0x42, 0x73, 0x6f, 0x8e, 0x3d, 0x8a, 0x43, 0x87, + 0x44, 0xa3, 0x59, 0xda, 0x73, 0x7a, 0x3c, 0xa5, 0x9c, 0xd3, 0x8f, 0x3b, + 0x8b, 0xa9, 0x69, 0x6a, 0x6e, 0x5c, 0xc1, 0xb6, 0xad, 0xcd, 0xc6, 0x79, + 0x92, 0x4a, 0x46, 0x37, 0x5f, 0xd6, 0x50, 0xa5, 0xb9, 0xa8, 0x97, 0x5a, + 0x76, 0x91, 0xa9, 0x4d, 0x5c, 0x78, 0xae, 0xb3, 0x6e, 0x9a, 0x48, 0x58, + 0x83, 0xce, 0x4c, 0x4f, 0x64, 0xca, 0xa0, 0xc9, 0x61, 0x6e, 0xa3, 0x45, + 0x77, 0x7d, 0x55, 0xd5, 0x71, 0x44, 0x56, 0xce, 0x98, 0xb4, 0x46, 0xa8, + 0xcc, 0x75, 0x77, 0x86, 0xc0, 0x5e, 0xc5, 0xdb, 0x45, 0xaf, 0x8b, 0x49, + 0x4f, 0xce, 0x46, 0x47, 0x4a, 0xcd, 0x45, 0x4f, 0x8c, 0x83, 0xca, 0xbc, + 0xa3, 0xa4, 0xa7, 0x70, 0xaf, 0x3b, 0x53, 0x6d, 0x5e, 0xae, 0x3e, 0x3c, + 0x58, 0x90, 0x99, 0x6d, 0x4a, 0xcf, 0x6e, 0x90, 0x67, 0xb7, 0x44, 0x80, + 0x74, 0x7c, 0x73, 0xc9, 0x89, 0x3f, 0x7c, 0xaa, 0x9c, 0x43, 0x7e, 0x45, + 0xcf, 0x9c, 0x67, 0x6c, 0x7c, 0x87, 0xae, 0x2d, 0xaf, 0x5a, 0xa6, 0xcc, + 0x55, 0x39, 0xda, 0x69, 0xda, 0xb7, 0x8a, 0xbb, 0x5a, 0x39, 0xc4, 0xb9, + 0xb0, 0x75, 0x5b, 0x9d, 0x4a, 0xb5, 0x9f, 0xd7, 0x84, 0x2c, 0x55, 0x7a, + 0x55, 0x49, 0x80, 0x68, 0x45, 0x89, 0xd7, 0x7e, 0xc9, 0x54, 0x77, 0x91, + 0xa1, 0x9b, 0x6f, 0xb3, 0xae, 0xaf, 0x96, 0x44, 0xd8, 0x8c, 0xbe, 0x79, + 0xad, 0x41, 0xbe, 0xc8, 0x98, 0xa4, 0xa3, 0x5e, 0x32, 0x88, 0x8c, 0x72, + 0x90, 0x71, 0x9e, 0x2e, 0x97, 0x68, 0x33, 0x40, 0xcb, 0x57, 0x7d, 0x91, + 0x91, 0x92, 0x37, 0x46, 0xa7, 0x51, 0x8e, 0xdb, 0xb9, 0x88, 0x6b, 0xb2, + 0x44, 0x8a, 0x8b, 0x96, 0x38, 0x60, 0xa6, 0xbf, 0x83, 0x89, 0x55, 0x61, + 0x81, 0x4f, 0x79, 0x54, 0x91, 0xd8, 0x97, 0x69, 0x6d, 0x39, 0xa4, 0xce, + 0xc4, 0x8f, 0x5b, 0x4e, 0x72, 0x6c, 0xb3, 0xcb, 0xc1, 0xd2, 0xbd, 0xd5, + 0x3d, 0x64, 0x3f, 0x77, 0xcc, 0x41, 0x35, 0x8c, 0xc7, 0x32, 0xad, 0x3d, + 0x6a, 0x9f, 0x9e, 0x51, 0x82, 0xac, 0x67, 0xd2, 0x7b, 0x44, 0xdc, 0xb7, + 0x72, 0xd1, 0x7f, 0xaa, 0x76, 0x9f, 0x72, 0xa8, 0xbe, 0xbb, 0x3f, 0xae, + 0x78, 0x7a, 0x9f, 0xaa, 0x33, 0xc2, 0xab, 0xbf, 0x3a, 0xc7, 0x71, 0x7b, + 0x52, 0x76, 0xa9, 0x77, 0x49, 0x55, 0xca, 0xb5, 0xa8, 0x72, 0xc8, 0xa4, + 0x8e, 0x6d, 0x7f, 0xa8, 0xc1, 0xa1, 0x41, 0x8c, 0xbd, 0x9d, 0x7f, 0x56, + 0x51, 0xd0, 0xc8, 0x71, 0xb4, 0xd5, 0x30, 0x67, 0x96, 0xa7, 0x96, 0xd3, + 0x3e, 0xc9, 0x4d, 0xd5, 0x92, 0x66, 0x85, 0x97, 0x51, 0x84, 0xda, 0x91, + 0x41, 0x76, 0x4a, 0x55, 0xc9, 0xa4, 0x8d, 0xcf, 0x85, 0xa2, 0x4f, 0x9d, + 0x57, 0x48, 0x5a, 0x8d, 0x77, 0x9b, 0x52, 0xa0, 0xc6, 0xb7, 0x8a, 0x7b, + 0xae, 0x76, 0xaf, 0x55, 0x44, 0x4d, 0x7f, 0x9f, 0x32, 0x74, 0x7b, 0xa1, + 0x79, 0xab, 0x86, 0x66, 0xb4, 0xa6, 0x94, 0xb6, 0x75, 0xb4, 0x7e, 0x9b, + 0x5c, 0xcb, 0xb1, 0xae, 0x60, 0xce, 0x3f, 0xb4, 0xcc, 0x86, 0xcf, 0x79, + 0x86, 0xd3, 0xb0, 0x4c, 0x2e, 0x9d, 0xbe, 0x86, 0x87, 0x33, 0x8a, 0x8f, + 0x84, 0x95, 0x81, 0x52, 0x4d, 0xaf, 0x48, 0x8a, 0xa0, 0x66, 0x52, 0xa6, + 0xbf, 0x52, 0x7d, 0x91, 0xc9, 0x5f, 0xa0, 0x4c, 0xad, 0x54, 0x56, 0x39, + 0x59, 0xe7, 0x9d, 0x61, 0x8c, 0x6a, 0xe2, 0x8f, 0xa4, 0x58, 0x7c, 0xac, + 0x34, 0xcc, 0x96, 0x85, 0x69, 0x6c, 0x87, 0x5a, 0x85, 0x73, 0x9a, 0xbe, + 0x7d, 0x38, 0x30, 0x58, 0x65, 0x63, 0x8d, 0x45, 0x92, 0xb1, 0x61, 0xc4, + 0xb8, 0x9b, 0x90, 0x5d, 0x71, 0x93, 0x87, 0x91, 0x85, 0x2f, 0x68, 0x51, + 0xa2, 0x3a, 0xc2, 0x9b, 0xce, 0x98, 0x3d, 0x7b, 0x87, 0x50, 0x62, 0xbb, + 
0x96, 0x3b, 0xb9, 0x4c, 0x55, 0x71, 0x9e, 0xae, 0x67, 0xc1, 0x48, 0x78, + 0x65, 0x45, 0xac, 0x78, 0x9e, 0x63, 0x76, 0xd0, 0xc3, 0x7b, 0x61, 0x40, + 0x5f, 0xb3, 0xb2, 0x39, 0xcc, 0x4b, 0x4c, 0xa0, 0x67, 0x55, 0x4d, 0x59, + 0x40, 0x3b, 0x62, 0x5d, 0x5a, 0x83, 0x53, 0x49, 0x45, 0xc4, 0x9c, 0xac, + 0xc6, 0x87, 0xb3, 0x65, 0xa2, 0xcf, 0x9a, 0x84, 0xb6, 0x48, 0x9b, 0x54, + 0xb2, 0xba, 0xd8, 0xa9, 0x60, 0x6d, 0xa0, 0x94, 0x9c, 0x8b, 0x70, 0x71, + 0xa7, 0x59, 0x82, 0x6e, 0xd2, 0x65, 0x9c, 0x6c, 0x70, 0x4a, 0x37, 0x9d, + 0x5e, 0xcb, 0x45, 0xcf, 0x92, 0x45, 0xa4, 0x57, 0xbd, 0x68, 0xa6, 0xa1, + 0x4f, 0x4d, 0x99, 0xc1, 0x92, 0x81, 0x7d, 0x7f, 0x87, 0x9e, 0x73, 0x5c, + 0x72, 0x2a, 0xa4, 0x3a, 0x79, 0x31, 0x26, 0xb3, 0xca, 0xbb, 0xdc, 0xd0, + 0xd8, 0x76, 0xa4, 0xad, 0x73, 0xa1, 0xd3, 0x4e, 0x69, 0x75, 0x55, 0x8c, + 0x89, 0x91, 0x57, 0xd2, 0xd3, 0x47, 0x6a, 0x9e, 0x48, 0x79, 0xba, 0x9c, + 0x7b, 0xac, 0x56, 0x62, 0x56, 0xcb, 0xa3, 0x29, 0x4d, 0x82, 0xd7, 0xb5, + 0x68, 0xb1, 0xa6, 0xc7, 0x38, 0xbe, 0x78, 0x55, 0xa3, 0x3e, 0x9f, 0x38, + 0x5b, 0x93, 0x82, 0x80, 0xb3, 0xbf, 0x4b, 0xbd, 0x45, 0xbc, 0xa6, 0x9d, + 0x79, 0x8f, 0x8c, 0xb8, 0xa8, 0x37, 0x65, 0x47, 0x56, 0x35, 0x41, 0x98, + 0xc7, 0x9e, 0x5e, 0x9a, 0x44, 0xce, 0xbd, 0x3d, 0x7f, 0x97, 0x48, 0xb9, + 0x4c, 0x4f, 0xb1, 0xa4, 0xca, 0x93, 0x35, 0x97, 0xb7, 0xa1, 0x41, 0x51, + 0x9e, 0x4c, 0x3e, 0x31, 0x96, 0x7c, 0x80, 0xd3, 0x30, 0x37, 0x55, 0x45, + 0x3c, 0xc0, 0x7a, 0x66, 0xa6, 0xb6, 0x8e, 0x37, 0x85, 0x4b, 0x54, 0x41, + 0x77, 0x45, 0x4e, 0x94, 0x4a, 0x9d, 0xc3, 0x3b, 0xa1, 0x84, 0x93, 0xba, + 0x6c, 0xc2, 0x5a, 0xcd, 0x54, 0x57, 0xb2, 0x81, 0xbf, 0x79, 0x5c, 0x7e, + 0xc1, 0x47, 0x3d, 0x79, 0xb1, 0xc6, 0x78, 0x44, 0xae, 0xc4, 0x71, 0xaf, + 0x60, 0x73, 0x90, 0x4e, 0x9d, 0x7e, 0x7d, 0xb7, 0x6c, 0x69, 0x93, 0x7a, + 0x41, 0x72, 0x60, 0x7c, 0xa6, 0xa1, 0x7d, 0xa5, 0x64, 0x81, 0x6d, 0xae, + 0xa8, 0x78, 0x49, 0xc7, 0xbd, 0x6d, 0xba, 0xa6, 0xc1, 0x6a, 0xbf, 0xd3, + 0x6a, 0x78, 0x8c, 0x69, 0x62, 0x38, 0x8f, 0x4c, 0x99, 0x80, 0x6d, 0x51, + 0x4e, 0x78, 0xa1, 0xa1, 0xb6, 0x50, 0xcc, 0xba, 0x82, 0x85, 0x66, 0x5f, + 0xd5, 0x64, 0x6f, 0x53, 0x3d, 0x5b, 0x50, 0x6a, 0x88, 0x59, 0xd0, 0x87, + 0x94, 0x32, 0xb5, 0xa3, 0x98, 0x91, 0x60, 0xba, 0x71, 0x58, 0xa4, 0x77, + 0x48, 0x50, 0x9b, 0x8b, 0xc5, 0x57, 0x75, 0x7e, 0x3e, 0x90, 0x68, 0xa5, + 0xc1, 0x97, 0x32, 0x83, 0xc5, 0xc7, 0xa1, 0x48, 0xb4, 0x3d, 0x3d, 0x96, + 0xc7, 0xa7, 0x3c, 0x62, 0xc2, 0x82, 0x73, 0xb7, 0x64, 0xbc, 0x40, 0xb8, + 0xd2, 0x88, 0x47, 0x84, 0xc1, 0x41, 0x52, 0xa1, 0xa3, 0x3e, 0xd1, 0x67, + 0xc7, 0x97, 0xbd, 0xb1, 0x8e, 0xac, 0xad, 0xa3, 0x48, 0xa8, 0x36, 0x5a, + 0xb3, 0x67, 0x5e, 0x61, 0x7b, 0x4b, 0xc1, 0xc4, 0x38, 0x91, 0xbb, 0xad, + 0x40, 0x87, 0xb0, 0x71, 0x3b, 0x8a, 0x87, 0x72, 0xcb, 0x54, 0x33, 0x85, + 0xb7, 0xc9, 0x95, 0x52, 0x57, 0xa1, 0x5d, 0x8a, 0x7e, 0x3f, 0x93, 0x9a, + 0x3c, 0xcd, 0x9e, 0x68, 0xc7, 0xcf, 0xc6, 0xa9, 0x7b, 0x49, 0x84, 0xbe, + 0x71, 0xc4, 0x52, 0x8d, 0x34, 0xa8, 0xba, 0x5c, 0xb1, 0xcf, 0xc4, 0xab, + 0x9a, 0x9a, 0x9f, 0xcc, 0xa8, 0x4d, 0xce, 0xce, 0xa3, 0x6e, 0x6f, 0x6a, + 0x76, 0xbb, 0xc6, 0x49, 0xcb, 0x44, 0x3c, 0x8d, 0x7e, 0xd2, 0xa4, 0x65, + 0xb2, 0xb9, 0xb8, 0xac, 0x5d, 0x93, 0x45, 0xa5, 0x2c, 0x6a, 0x77, 0xa5, + 0xb8, 0x58, 0x90, 0x85, 0xd5, 0x77, 0x9e, 0xc1, 0xbb, 0x75, 0x4f, 0x48, + 0x86, 0x37, 0x66, 0xac, 0x5a, 0xbd, 0x99, 0x8c, 0xad, 0x48, 0x5f, 0xa3, + 0x87, 0x7b, 0x6c, 0x8c, 0x80, 0x3d, 0xb1, 0x47, 0xb9, 0x92, 0x36, 0xc4, + 0x4f, 0x37, 0x34, 0xc2, 0x49, 0x77, 0xbc, 0x7e, 0xc9, 0xc1, 0xc8, 0xbb, + 0xcb, 0x60, 0x3d, 0xb3, 0x9a, 0x94, 0xad, 0x6f, 0x69, 0x80, 0x9f, 0x48, + 
0xae, 0xae, 0x82, 0x37, 0x4e, 0x67, 0xac, 0x69, 0x93, 0x52, 0xc6, 0x6b, + 0x69, 0x56, 0x93, 0x8c, 0x41, 0x3a, 0x6b, 0x73, 0xa3, 0xaf, 0x4b, 0x9f, + 0x37, 0x5e, 0x42, 0x90, 0xb8, 0x91, 0x89, 0x31, 0x9e, 0xbe, 0x30, 0x7d, + 0x97, 0x6f, 0x8a, 0x41, 0x79, 0x33, 0xac, 0xc9, 0x97, 0xa2, 0x7d, 0x87, + 0x70, 0xa9, 0x48, 0xa6, 0x44, 0x60, 0x34, 0x92, 0x4a, 0xba, 0x41, 0x8d, + 0x49, 0x76, 0xc8, 0x55, 0x52, 0x3b, 0x7c, 0xc2, 0x60, 0xa7, 0x93, 0x93, + 0x72, 0x8a, 0x50, 0x64, 0xc7, 0xa5, 0x67, 0x7f, 0x92, 0xc1, 0x2e, 0x95, + 0x54, 0x85, 0x72, 0x8d, 0xb7, 0x9a, 0x6a, 0xbb, 0x62, 0x7d, 0x89, 0x93, + 0x6b, 0x70, 0x98, 0xce, 0x4c, 0x87, 0x5d, 0x5e, 0x3a, 0x7d, 0x4a, 0x5c, + 0xbf, 0x91, 0xc4, 0x7c, 0x6c, 0xcc, 0xaf, 0x37, 0x72, 0x4e, 0x77, 0xa6, + 0x5e, 0xca, 0x5c, 0x5d, 0x79, 0x5e, 0x41, 0x6a, 0x61, 0x50, 0xaf, 0x57, + 0x5f, 0x67, 0x84, 0xc5, 0xaa, 0xc7, 0x8e, 0x2c, 0x54, 0x80, 0x58, 0xa5, + 0x9a, 0x7f, 0x61, 0x41, 0x65, 0x81, 0x99, 0x9d, 0xab, 0x6b, 0x66, 0x45, + 0xa2, 0x73, 0x8b, 0x5a, 0x89, 0xb5, 0x75, 0xa0, 0xc6, 0xa6, 0x99, 0x64, + 0x74, 0xb4, 0x4c, 0xa5, 0x3c, 0x47, 0xa2, 0x83, 0x82, 0x36, 0x40, 0x8d, + 0xc8, 0xb6, 0x32, 0xbd, 0x9b, 0x37, 0xd0, 0x99, 0x4c, 0xa8, 0x37, 0x63, + 0x55, 0xc2, 0xc9, 0x9d, 0xb3, 0x36, 0x54, 0xa7, 0xab, 0x48, 0xd6, 0x9c, + 0x51, 0x3a, 0xa2, 0x5f, 0x8b, 0xb3, 0x7a, 0xac, 0xa7, 0x3a, 0xa1, 0x82, + 0x37, 0x47, 0xc5, 0x6a, 0x8a, 0x3c, 0xa0, 0xbc, 0xbb, 0x81, 0x79, 0x56, + 0xa4, 0xca, 0x60, 0x95, 0x61, 0x92, 0x6d, 0x48, 0xa4, 0xba, 0x80, 0x81, + 0xa5, 0x2e, 0x9d, 0x49, 0x84, 0xa2, 0xba, 0x6e, 0x3d, 0x46, 0x94, 0x7c, + 0xbb, 0x3c, 0x48, 0x3e, 0x82, 0xad, 0x75, 0xb6, 0x7c, 0x8e, 0x3b, 0x60, + 0x6f, 0x52, 0x58, 0x54, 0x9b, 0x82, 0x35, 0xc1, 0x46, 0x62, 0xbe, 0x3f, + 0xc8, 0x73, 0x87, 0x2d, 0x48, 0x5d, 0x72, 0xa4, 0x7b, 0x3e, 0xad, 0xa8, + 0x3a, 0x78, 0x48, 0x3a, 0x74, 0x53, 0x6d, 0x83, 0x45, 0x9b, 0x7a, 0xd3, + 0x2f, 0x8e, 0x8e, 0xa8, 0x69, 0x81, 0xbb, 0x64, 0x8f, 0x7a, 0x4f, 0x36, + 0xb9, 0x56, 0x68, 0xb3, 0x7f, 0xb2, 0x97, 0xc7, 0xc5, 0x9e, 0x41, 0x97, + 0x70, 0x88, 0x95, 0x66, 0x31, 0x3f, 0x95, 0x6e, 0x43, 0xcf, 0x62, 0x5c, + 0xa1, 0x4e, 0xd0, 0x77, 0x63, 0x53, 0x50, 0xa5, 0x5e, 0x2e, 0x3a, 0xad, + 0x8a, 0x86, 0x51, 0xb4, 0x79, 0x5f, 0x92, 0x30, 0x44, 0xb1, 0xd1, 0x56, + 0x56, 0xc4, 0xce, 0x30, 0xbc, 0x9a, 0x35, 0x35, 0x30, 0x92, 0xce, 0x4e, + 0xb3, 0x4a, 0xb6, 0xa9, 0x51, 0x91, 0x77, 0x57, 0x96, 0x64, 0x92, 0xd4, + 0x7a, 0x30, 0x5d, 0x55, 0x7b, 0x3f, 0xa3, 0x92, 0x77, 0xa0, 0x76, 0x96, + 0x7a, 0x61, 0x76, 0x8f, 0xd4, 0xbc, 0x28, 0x59, 0x6c, 0x80, 0x81, 0x58, + 0xbb, 0x63, 0xc0, 0x5d, 0xa1, 0x50, 0xca, 0xa7, 0xc5, 0x5c, 0xc0, 0x71, + 0x39, 0x74, 0x57, 0x52, 0x62, 0xc1, 0x68, 0x7d, 0x4b, 0x5e, 0x78, 0x94, + 0x7c, 0xac, 0x72, 0x98, 0x3e, 0x82, 0x57, 0xa3, 0x9c, 0x58, 0x9a, 0x4b, + 0x5a, 0x3b, 0xab, 0x67, 0xab, 0x73, 0xac, 0x3d, 0x5e, 0x97, 0x6b, 0x94, + 0xcc, 0x52, 0x43, 0x79, 0x3a, 0x78, 0xaa, 0xa9, 0xc9, 0x8c, 0x32, 0x36, + 0x7e, 0xcf, 0xc0, 0x5b, 0xd3, 0xaa, 0x8c, 0xc8, 0x61, 0xc5, 0x38, 0xbc, + 0x51, 0x91, 0xaa, 0x4f, 0x86, 0xaf, 0x65, 0xa5, 0x56, 0x8f, 0x33, 0x6e, + 0x8e, 0x84, 0xa4, 0x74, 0x81, 0x66, 0x3d, 0x35, 0x3c, 0xc1, 0x2f, 0x33, + 0xa3, 0x6c, 0x9e, 0x6d, 0x5b, 0xcd, 0x66, 0x51, 0x8d, 0x9e, 0x79, 0x6e, + 0xcb, 0xa3, 0x89, 0xa9, 0x72, 0xa1, 0x42, 0x63, 0x5a, 0x92, 0x42, 0x5b, + 0xcc, 0xad, 0xb7, 0x7b, 0x7c, 0x96, 0x59, 0x5e, 0x90, 0x3d, 0xbf, 0x92, + 0xbe, 0x70, 0xb4, 0xd8, 0xc8, 0xca, 0xa5, 0x3b, 0xd0, 0x50, 0xd2, 0x9b, + 0x68, 0x4f, 0x64, 0x61, 0xb0, 0xc2, 0x4c, 0xbb, 0xb0, 0x4c, 0xcd, 0x3b, + 0xbf, 0x3c, 0x3f, 0x51, 0x44, 0x71, 0xa0, 0x63, 0xcf, 0x64, 0x63, 0xa1, + 
0xd1, 0x5a, 0xa1, 0x3a, 0x54, 0x90, 0x8a, 0x5c, 0xcc, 0xb3, 0xc1, 0x64, + 0x98, 0xba, 0xb4, 0x58, 0x7f, 0x67, 0x3b, 0x89, 0x82, 0x56, 0x79, 0x9e, + 0xd0, 0x3d, 0x9e, 0x9a, 0xce, 0x40, 0xca, 0xb9, 0x43, 0x7c, 0x8a, 0x96, + 0x99, 0x72, 0x72, 0xd3, 0x4d, 0xa6, 0xa3, 0x5c, 0x6d, 0xc3, 0x3c, 0x56, + 0xcd, 0x5b, 0x5c, 0xc8, 0x5c, 0x6d, 0x4e, 0x99, 0x9f, 0x4f, 0x91, 0x86, + 0x63, 0x6b, 0x65, 0xac, 0xbb, 0x98, 0x7c, 0xcc, 0x97, 0x54, 0x91, 0x67, + 0x52, 0xca, 0x9f, 0x36, 0xb5, 0x75, 0xcf, 0xb9, 0x64, 0xc9, 0xa5, 0x6f, + 0xbf, 0x9b, 0x49, 0x50, 0x9a, 0xcc, 0x6e, 0xa7, 0x80, 0x8c, 0x49, 0xb3, + 0xa0, 0x49, 0xbf, 0x39, 0x3e, 0x33, 0xad, 0x6b, 0x9f, 0x43, 0x43, 0x6b, + 0xc0, 0x5b, 0x33, 0x35, 0x86, 0x7e, 0x62, 0x7e, 0x8d, 0xcb, 0x4f, 0x50, + 0x4b, 0x32, 0xad, 0xae, 0x37, 0x7b, 0x5b, 0x44, 0xd2, 0x41, 0x9d, 0x3e, + 0x92, 0x7c, 0x7c, 0x83, 0xab, 0x89, 0x6d, 0x91, 0x30, 0xb6, 0xbb, 0x5c, + 0x89, 0x39, 0x8b, 0x80, 0xad, 0xcb, 0x53, 0x52, 0x9a, 0x51, 0x3e, 0x7c, + 0xd4, 0xcf, 0xba, 0x5f, 0xbe, 0xbc, 0x58, 0xa4, 0x69, 0xb0, 0xa7, 0xcf, + 0x9c, 0x73, 0xab, 0xaa, 0x83, 0xb6, 0x36, 0x7a, 0xc9, 0xc3, 0x3b, 0xbc, + 0x73, 0x68, 0xcb, 0x49, 0x3c, 0x4f, 0xa7, 0x4f, 0x54, 0x58, 0x3e, 0xbc, + 0x58, 0xc9, 0x38, 0x62, 0x67, 0x55, 0x4e, 0x56, 0x3b, 0x31, 0x62, 0xab, + 0x7d, 0xb0, 0x6d, 0x6c, 0xcd, 0x61, 0x86, 0x91, 0xbc, 0x9d, 0x46, 0x32, + 0x84, 0xb7, 0xb8, 0xc4, 0x3b, 0x59, 0x74, 0x85, 0x7f, 0x53, 0xa7, 0x97, + 0xc1, 0x94, 0x6a, 0xb9, 0x4d, 0x34, 0x4d, 0xcf, 0xa9, 0x7a, 0xaf, 0xb2, + 0x46, 0xd1, 0x74, 0x74, 0x8f, 0xb4, 0x47, 0x9e, 0x9e, 0x7a, 0x69, 0x9b, + 0xcf, 0xd4, 0x58, 0x85, 0x3b, 0x85, 0x8a, 0x84, 0xc2, 0x9b, 0x4a, 0x87, + 0xae, 0x9a, 0x5b, 0x3a, 0xb5, 0x98, 0xaa, 0x32, 0x61, 0x73, 0x38, 0xbf, + 0x5e, 0xa1, 0xca, 0xa8, 0xba, 0x91, 0x5f, 0x67, 0x58, 0x40, 0xb1, 0xc6, + 0x92, 0x31, 0xb8, 0xba, 0x73, 0x32, 0x47, 0x7c, 0xb4, 0x46, 0x6a, 0x73, + 0x89, 0x5d, 0x39, 0x57, 0x47, 0x4e, 0x76, 0x34, 0x65, 0xbd, 0x67, 0x9a, + 0xa2, 0x9b, 0x83, 0x35, 0x98, 0x81, 0x42, 0xbb, 0x84, 0x64, 0x76, 0xd0, + 0x2e, 0x30, 0x8c, 0x37, 0x6e, 0x2d, 0x55, 0x5d, 0x3e, 0xa0, 0x7f, 0x42, + 0x42, 0xa4, 0xaa, 0x97, 0xbe, 0x4c, 0xa6, 0xac, 0x4a, 0xca, 0x55, 0x61, + 0x76, 0xb4, 0xb9, 0x66, 0xa7, 0xaa, 0x9b, 0xb1, 0x8f, 0xc9, 0x5c, 0x5f, + 0xc9, 0x41, 0xbd, 0x61, 0x39, 0x78, 0xcc, 0xc0, 0x4a, 0xaa, 0x55, 0xc4, + 0x6a, 0xad, 0x4d, 0x67, 0x38, 0x7e, 0xb2, 0x3a, 0x38, 0x55, 0xb6, 0xc4, + 0x98, 0x73, 0x35, 0x4c, 0x84, 0xc2, 0x80, 0x36, 0x51, 0x98, 0x5f, 0xaa, + 0x50, 0xcc, 0xbd, 0x84, 0x57, 0x3b, 0xb8, 0x83, 0x7e, 0x31, 0xb5, 0x36, + 0x92, 0x58, 0x39, 0x52, 0x67, 0x3f, 0x53, 0x66, 0xae, 0x3a, 0xb7, 0xcd, + 0xc1, 0x4c, 0xa4, 0x91, 0xb5, 0x39, 0x3f, 0x56, 0x9b, 0xcb, 0x40, 0xaf, + 0xce, 0x31, 0xcc, 0x9a, 0x9f, 0x79, 0x59, 0xd5, 0xbe, 0x5e, 0xb2, 0x7b, + 0xce, 0x53, 0x74, 0xcf, 0xc3, 0x48, 0x7a, 0x35, 0x70, 0x47, 0x42, 0x7e, + 0x53, 0x71, 0xbe, 0xc2, 0x87, 0x50, 0x61, 0xa8, 0x6f, 0x88, 0x34, 0x45, + 0xa4, 0x5d, 0xc9, 0xb7, 0x6d, 0x8d, 0x62, 0x8f, 0x30, 0x7e, 0x6c, 0x71, + 0x37, 0xa8, 0x60, 0xcb, 0xc8, 0x3d, 0x3c, 0x7b, 0x8e, 0xcf, 0x63, 0x4e, + 0x8d, 0x48, 0xb7, 0x7c, 0x80, 0xab, 0xc6, 0x62, 0x84, 0x97, 0x3e, 0xac, + 0x55, 0x7f, 0x4a, 0x67, 0xb8, 0xbf, 0x75, 0x78, 0xc9, 0xa7, 0xc3, 0x82, + 0xcd, 0x92, 0x56, 0x6b, 0xc3, 0x31, 0xc8, 0x3f, 0x6b, 0x49, 0x93, 0xb1, + 0x3a, 0x87, 0x51, 0x59, 0xbe, 0x88, 0xc0, 0x5c, 0x85, 0xb7, 0xab, 0xcb, + 0xca, 0x8e, 0x7f, 0x7c, 0xc2, 0x75, 0xca, 0xd2, 0xab, 0x7f, 0x70, 0x71, + 0x33, 0xaf, 0x7b, 0x7d, 0xa4, 0x31, 0x95, 0xa9, 0x82, 0x79, 0xb4, 0xab, + 0x76, 0x93, 0x8a, 0x73, 0xc6, 0x8c, 0xa5, 0x83, 0xc3, 0xc4, 0xa7, 0x8e, + 
0x59, 0x73, 0xcd, 0xc5, 0x7e, 0xcf, 0x46, 0x4c, 0x37, 0x99, 0x63, 0xcd, + 0xcb, 0x4e, 0xb7, 0x54, 0x5c, 0x53, 0x61, 0x44, 0x7b, 0xa9, 0x86, 0xc7, + 0x91, 0x5d, 0xc0, 0xba, 0x75, 0x8b, 0x38, 0x6b, 0x54, 0x5a, 0x5b, 0xb4, + 0xa5, 0xd4, 0x51, 0xd1, 0x81, 0x85, 0xaa, 0xd5, 0x33, 0x54, 0x88, 0x85, + 0xba, 0x54, 0xca, 0x3d, 0xac, 0x8a, 0x64, 0x9e, 0xcb, 0x68, 0xe1, 0x5c, + 0x7f, 0xc5, 0xb3, 0x81, 0x87, 0x8b, 0x64, 0x67, 0x7c, 0xa2, 0x47, 0xaf, + 0x82, 0x3a, 0x5e, 0xc7, 0x3a, 0x3f, 0xb0, 0x55, 0x72, 0x75, 0x81, 0x58, + 0xb0, 0x78, 0x8b, 0x57, 0xca, 0x4b, 0xc5, 0xb0, 0xda, 0x3f, 0x44, 0xa0, + 0x85, 0x7a, 0xa3, 0x73, 0x50, 0x64, 0x6f, 0xbc, 0xd3, 0x99, 0x9b, 0x87, + 0x68, 0x50, 0xb0, 0xaa, 0x9f, 0x4b, 0xc5, 0xd9, 0x2a, 0x39, 0x88, 0x99, + 0x9d, 0xb0, 0x51, 0x7c, 0x49, 0x4c, 0x41, 0x52, 0x8d, 0x3b, 0x40, 0x30, + 0x49, 0x54, 0xce, 0x8e, 0xac, 0x7d, 0xd4, 0xb4, 0x6a, 0x8e, 0x38, 0xd9, + 0xc5, 0x4f, 0x86, 0x7c, 0x36, 0x6f, 0xba, 0x64, 0x26, 0x74, 0x9a, 0x6b, + 0x82, 0xb0, 0xb1, 0x5f, 0x5b, 0x7a, 0x81, 0x7e, 0xcd, 0xcc, 0xa3, 0x76, + 0x68, 0x47, 0x60, 0x8e, 0xb0, 0xbd, 0x39, 0x5b, 0xc6, 0x8b, 0x88, 0xb8, + 0x7d, 0x1f, 0x80, 0xc7, 0x88, 0x95, 0x78, 0xa2, 0xac, 0x40, 0x7e, 0x48, + 0xa6, 0x5c, 0x42, 0x2e, 0x57, 0xaf, 0x45, 0x91, 0x9a, 0xb6, 0xb8, 0x83, + 0xcf, 0xbd, 0xbf, 0xdd, 0x82, 0xa4, 0x89, 0x41, 0xa3, 0xb3, 0xbf, 0xa5, + 0xd4, 0x72, 0x34, 0x80, 0x60, 0x7a, 0xb4, 0x6a, 0x7b, 0xbf, 0x74, 0xc7, + 0xd6, 0xce, 0xcb, 0xb3, 0xab, 0x80, 0x62, 0x93, 0x78, 0xda, 0x42, 0x8e, + 0xb8, 0x44, 0x35, 0x39, 0x8d, 0xc1, 0xc7, 0xb9, 0xdb, 0x69, 0x69, 0xa3, + 0x30, 0x34, 0xb3, 0xc0, 0x6a, 0x8f, 0x66, 0xd5, 0x9e, 0x9f, 0x3f, 0xae, + 0x4d, 0xd8, 0xa7, 0xb6, 0xb7, 0x88, 0x6f, 0x6a, 0x6a, 0x89, 0xc6, 0xd7, + 0xb4, 0xb6, 0x92, 0xb2, 0xcd, 0xa0, 0xb7, 0xb4, 0xa1, 0xa7, 0x53, 0xc0, + 0x5f, 0x30, 0xb1, 0x73, 0xcb, 0x92, 0x3b, 0x52, 0x6e, 0x6d, 0x5a, 0x95, + 0xd2, 0x4c, 0xb7, 0x73, 0x71, 0x91, 0x30, 0xd8, 0x49, 0x46, 0xab, 0x77, + 0x2c, 0x8b, 0xa7, 0xe0, 0x8a, 0xb0, 0x7a, 0x61, 0x4d, 0xc0, 0x46, 0xad, + 0x7e, 0xa6, 0x77, 0x78, 0xc5, 0x60, 0x94, 0x71, 0x5d, 0x5f, 0x94, 0x52, + 0x51, 0x98, 0x97, 0x80, 0x6d, 0xbf, 0xc4, 0xc8, 0xbe, 0x85, 0x94, 0x8c, + 0x63, 0xca, 0x48, 0x36, 0x83, 0x8a, 0xbd, 0x41, 0x94, 0x4e, 0x42, 0xa0, + 0x8c, 0x76, 0x4e, 0x53, 0x69, 0x55, 0xb6, 0x93, 0x67, 0x65, 0x75, 0xb4, + 0xc3, 0x9d, 0x56, 0x73, 0x6b, 0x53, 0x6b, 0x3a, 0xc6, 0x9a, 0x49, 0x60, + 0xab, 0xb4, 0x33, 0x55, 0x53, 0x60, 0x7a, 0xc6, 0xcd, 0x72, 0x6d, 0xa0, + 0xc9, 0x51, 0x3b, 0xd3, 0xca, 0xd8, 0x23, 0xaf, 0x72, 0x51, 0xbc, 0xc0, + 0x95, 0xbe, 0xbe, 0x9b, 0x35, 0x57, 0xb0, 0xbb, 0x4c, 0x56, 0x4f, 0x5a, + 0x58, 0x9f, 0x80, 0x57, 0x3c, 0xdb, 0xd2, 0xa1, 0xa4, 0x30, 0x62, 0x87, + 0x38, 0xc2, 0xcc, 0xa8, 0x4b, 0x3f, 0x61, 0x39, 0x45, 0xa8, 0x5b, 0x97, + 0x43, 0x81, 0xc5, 0x83, 0x36, 0x69, 0x61, 0xae, 0x6b, 0x6f, 0x66, 0x61, + 0xd7, 0x92, 0x44, 0x84, 0xc3, 0x93, 0x8b, 0x63, 0x9c, 0xa0, 0x81, 0x55, + 0x4a, 0x24, 0x8e, 0xaa, 0x57, 0x8a, 0x8f, 0x65, 0x51, 0x5f, 0xcf, 0xc0, + 0x33, 0x8f, 0x8f, 0x43, 0xe3, 0xe8, 0x5b, 0x4a, 0x6b, 0x3d, 0x98, 0x56, + 0x48, 0x4c, 0xd8, 0x58, 0x90, 0xbb, 0x76, 0x73, 0x67, 0x74, 0xac, 0x79, + 0xb9, 0x37, 0xcc, 0x4e, 0xa7, 0x8d, 0x72, 0x3b, 0x34, 0x31, 0x59, 0x72, + 0x53, 0x47, 0x6b, 0x4e, 0x7e, 0x54, 0x71, 0x7b, 0x37, 0xd3, 0xb0, 0x39, + 0x41, 0x71, 0xd3, 0x96, 0x3b, 0x8e, 0xb1, 0xa3, 0xa1, 0x64, 0x58, 0x4c, + 0xd3, 0xcf, 0x7e, 0x6d, 0x35, 0xa1, 0x3a, 0x95, 0x45, 0x72, 0x66, 0xb0, + 0x7c, 0x87, 0x73, 0x75, 0x46, 0x8b, 0x97, 0xae, 0xa6, 0x39, 0xd0, 0xa1, + 0x88, 0x3b, 0x8d, 0xb7, 0x92, 0x77, 0xa0, 0x73, 0xa6, 0xc6, 0xce, 0x83, + 
0xcd, 0x6b, 0x85, 0x8c, 0x57, 0x84, 0x8c, 0x60, 0x99, 0xb5, 0xb2, 0xaf, + 0x2a, 0xc2, 0x5b, 0x86, 0x5c, 0x8e, 0xb1, 0xb5, 0x74, 0xd4, 0x34, 0x48, + 0x4d, 0x60, 0x3b, 0xcd, 0x30, 0x37, 0x33, 0x53, 0x3a, 0x99, 0x79, 0x53, + 0x4f, 0x9c, 0xdc, 0xa7, 0xf1, 0xfb, 0x5e, 0xc2, 0xba, 0xc8, 0x68, 0x77, + 0x37, 0x4d, 0x59, 0xa4, 0x97, 0xbb, 0x67, 0x4d, 0x74, 0xaa, 0x60, 0x8e, + 0x6f, 0x2c, 0x6d, 0xc3, 0x38, 0x73, 0x85, 0xbf, 0xa0, 0xc2, 0x92, 0xab, + 0xe5, 0x9c, 0xaf, 0x42, 0xcb, 0x47, 0x5a, 0x76, 0x8c, 0xaa, 0xdd, 0x9c, + 0x64, 0xad, 0x86, 0x80, 0xb9, 0x56, 0x7b, 0x73, 0x83, 0x57, 0xa0, 0x79, + 0x52, 0xaf, 0x79, 0x7c, 0xae, 0x42, 0x8a, 0x9e, 0x4b, 0xd0, 0xc3, 0x60, + 0xb9, 0x62, 0x9d, 0x91, 0xb8, 0xa3, 0x7d, 0x89, 0x7e, 0xa2, 0xa7, 0x81, + 0xb4, 0xcd, 0x6c, 0x99, 0xaf, 0x7c, 0x4e, 0xcf, 0x37, 0xc0, 0x82, 0xbf, + 0x5b, 0xd1, 0xda, 0x64, 0x3c, 0x5e, 0x3f, 0xab, 0xa4, 0xa7, 0x88, 0x67, + 0xbc, 0x74, 0x79, 0x90, 0xd0, 0x8c, 0x78, 0xb1, 0x88, 0x27, 0xd6, 0x51, + 0x5a, 0x37, 0xae, 0xd7, 0xd8, 0xdf, 0x48, 0xa6, 0x49, 0x7c, 0x9b, 0x63, + 0xcf, 0xcb, 0xa6, 0x46, 0x2c, 0x53, 0xcb, 0xd4, 0x7b, 0x5e, 0xc6, 0xc6, + 0x89, 0x41, 0xc9, 0x73, 0x3f, 0xb2, 0xb3, 0x54, 0xbc, 0xcf, 0x87, 0x44, + 0x9a, 0x32, 0xc9, 0x85, 0xaf, 0x7d, 0x7c, 0x3e, 0x43, 0x6f, 0x52, 0xd7, + 0xa1, 0x37, 0x32, 0x86, 0x77, 0x48, 0x4e, 0x2f, 0xd4, 0x6f, 0xc7, 0x63, + 0xbf, 0x5b, 0xc8, 0x7d, 0xc6, 0x8f, 0xcd, 0xd0, 0x6f, 0x3d, 0x84, 0x86, + 0x5b, 0x97, 0xd9, 0xd0, 0x92, 0xb0, 0xd6, 0x5c, 0xa1, 0xcb, 0x63, 0xa1, + 0xcb, 0x54, 0x5c, 0x81, 0xb4, 0x2b, 0x79, 0x84, 0x3a, 0xb0, 0x9b, 0x5a, + 0xdf, 0xbb, 0xce, 0xad, 0xbe, 0x67, 0x69, 0x9f, 0x9c, 0x5e, 0x70, 0xa7, + 0xd1, 0x57, 0x7e, 0x96, 0xd0, 0x82, 0x9c, 0xb0, 0x44, 0xad, 0xb5, 0x7b, + 0xb0, 0x9a, 0x6e, 0x49, 0xa0, 0xbc, 0x59, 0xaa, 0x83, 0x55, 0xa0, 0x6b, + 0xc4, 0x4f, 0x5b, 0xcf, 0x82, 0x4d, 0x40, 0xcb, 0x3d, 0xb7, 0xb9, 0x7a, + 0xc3, 0xaa, 0xb0, 0x31, 0x45, 0xa6, 0x95, 0x41, 0x71, 0x62, 0x4b, 0xbb, + 0xb7, 0xce, 0x59, 0x4f, 0x89, 0xc3, 0xc0, 0xbd, 0x8c, 0x7b, 0xa7, 0xc2, + 0xb9, 0xc5, 0xb7, 0x6e, 0x33, 0x44, 0xcd, 0x70, 0xd2, 0x91, 0x88, 0xb3, + 0x60, 0x5d, 0xab, 0xce, 0xc8, 0x9f, 0x42, 0xd4, 0x4d, 0x52, 0x32, 0xcb, + 0x46, 0xc7, 0x95, 0x91, 0x36, 0x9c, 0xaa, 0x6b, 0x40, 0x49, 0x41, 0x78, + 0x52, 0xad, 0x43, 0xb7, 0x5f, 0x6b, 0xc2, 0x70, 0x44, 0x51, 0x84, 0x8f, + 0x64, 0x3b, 0x95, 0xac, 0x33, 0xcd, 0x90, 0x3c, 0xae, 0xae, 0x80, 0x8d, + 0x5c, 0x58, 0x6f, 0x52, 0x9b, 0x38, 0xb9, 0xb0, 0xbe, 0xad, 0x54, 0x4f, + 0x63, 0x3b, 0x4b, 0x4b, 0x75, 0xd2, 0x7f, 0x94, 0x9d, 0x5b, 0x54, 0x73, + 0x8c, 0x5e, 0x4a, 0x33, 0x7a, 0x50, 0x44, 0x96, 0x3c, 0x92, 0xb3, 0xbc, + 0x85, 0x43, 0x31, 0x72, 0x57, 0xc2, 0x8b, 0x81, 0xa5, 0xcf, 0x93, 0x91, + 0xb1, 0x61, 0x39, 0xbe, 0x65, 0x39, 0xc5, 0x87, 0xa3, 0x36, 0x5a, 0xc3, + 0xbe, 0x76, 0x38, 0x80, 0x5d, 0x48, 0x43, 0x34, 0x54, 0xa4, 0x8f, 0xcb, + 0x6d, 0xaa, 0x8a, 0x6f, 0x6a, 0xb2, 0x6e, 0xca, 0x4a, 0x3a, 0xd3, 0x54, + 0x73, 0x5d, 0x36, 0x53, 0x80, 0xc5, 0xab, 0x36, 0x8d, 0x6c, 0x90, 0x3b, + 0xbb, 0xad, 0x97, 0x8d, 0xb7, 0xc4, 0xcb, 0x36, 0x82, 0xa7, 0x4c, 0x49, + 0xa6, 0xbd, 0x80, 0x47, 0x6b, 0x91, 0x73, 0x5a, 0x40, 0x7c, 0xd1, 0x54, + 0x3c, 0xce, 0x6e, 0x68, 0x5c, 0xa4, 0x45, 0x5e, 0x47, 0x93, 0x49, 0x4b, + 0x3b, 0x6b, 0x88, 0xa7, 0x83, 0xab, 0x71, 0xc6, 0xc7, 0x6e, 0x74, 0x9a, + 0x71, 0xb3, 0x66, 0xaf, 0x6f, 0x93, 0x5f, 0xd5, 0x82, 0xa6, 0x83, 0x7e, + 0x58, 0xcb, 0x49, 0xbd, 0x84, 0x7f, 0x32, 0x9e, 0xc9, 0x59, 0x85, 0x4f, + 0x3e, 0x55, 0x52, 0x66, 0xc8, 0x31, 0xc9, 0x9d, 0x6e, 0x7e, 0x51, 0x84, + 0x58, 0x5a, 0xc7, 0x76, 0x5c, 0x8c, 0xce, 0x5f, 0xbe, 0x3f, 0x88, 0xd0, + 
0x7f, 0x4e, 0x58, 0x39, 0x38, 0x4b, 0xa0, 0x44, 0x32, 0x7b, 0x73, 0x3f, + 0x39, 0x92, 0x83, 0x8b, 0x4e, 0x7d, 0x7a, 0x65, 0x57, 0x86, 0xb9, 0x42, + 0xc4, 0x6e, 0x7e, 0x33, 0xbc, 0x37, 0x8b, 0x33, 0x35, 0xd9, 0xad, 0x64, + 0x3e, 0x4c, 0xb5, 0x5d, 0xad, 0x5d, 0x73, 0xb7, 0x73, 0x5f, 0x67, 0x9b, + 0x9b, 0xab, 0x8e, 0xa7, 0xc4, 0x36, 0x9a, 0x8c, 0xc1, 0x83, 0x5e, 0x75, + 0x34, 0xb5, 0x8d, 0xbb, 0xd7, 0x5b, 0x48, 0x93, 0x8c, 0x4d, 0xac, 0xbe, + 0xce, 0x86, 0x88, 0xa0, 0x49, 0x8b, 0x56, 0x91, 0xcb, 0x7e, 0x93, 0x3d, + 0x3e, 0xaa, 0x6d, 0x41, 0xd2, 0xb2, 0x78, 0xb1, 0x82, 0xc9, 0x7c, 0xb7, + 0x3b, 0xb4, 0x9f, 0x4f, 0x84, 0x71, 0x5c, 0x33, 0x7c, 0xa6, 0x6b, 0x36, + 0x62, 0x97, 0x7e, 0x57, 0x46, 0x87, 0x86, 0xb3, 0xac, 0xb4, 0x7c, 0x3d, + 0x96, 0x52, 0xb4, 0x5b, 0x76, 0xbf, 0xa2, 0x62, 0xbe, 0xb7, 0xa2, 0x84, + 0x9a, 0xb8, 0x79, 0xb0, 0x85, 0xca, 0xa9, 0x70, 0x77, 0x7a, 0x8a, 0x72, + 0xad, 0xd0, 0xa0, 0x76, 0x67, 0x3d, 0x76, 0x8a, 0x7e, 0x44, 0x37, 0xbc, + 0xb1, 0x87, 0xcd, 0x42, 0x58, 0x3e, 0x35, 0x60, 0x99, 0xbc, 0x47, 0xc5, + 0x56, 0x7d, 0x6a, 0x88, 0xae, 0x2e, 0x70, 0x70, 0x33, 0x76, 0x81, 0xc4, + 0x9a, 0x68, 0x6e, 0xc5, 0x94, 0x45, 0x91, 0x91, 0xad, 0x9a, 0x68, 0x97, + 0x9f, 0x64, 0x8b, 0x9a, 0xa0, 0x6d, 0x85, 0x43, 0x69, 0x81, 0x40, 0x6a, + 0x74, 0x89, 0xcb, 0x66, 0x3d, 0x3b, 0x94, 0x88, 0x3c, 0x51, 0x41, 0xab, + 0xc2, 0x8b, 0xa1, 0x57, 0xc9, 0x70, 0x39, 0xbe, 0x57, 0xc8, 0x3d, 0x69, + 0xbc, 0xa7, 0x85, 0x67, 0x50, 0x4d, 0x95, 0x8c, 0xbb, 0xa7, 0x8d, 0x3d, + 0x7c, 0x5c, 0x58, 0x44, 0x50, 0x89, 0x96, 0x97, 0x74, 0xa8, 0x51, 0xb9, + 0x46, 0x6a, 0x69, 0xd0, 0x9f, 0x85, 0x81, 0x96, 0x74, 0xcb, 0xca, 0x9b, + 0xca, 0xc2, 0x81, 0xbd, 0x98, 0x95, 0x60, 0x37, 0x8b, 0xac, 0x6d, 0x58, + 0xae, 0x9c, 0xba, 0xaa, 0x84, 0x50, 0x7f, 0x83, 0x67, 0x99, 0xd5, 0xb6, + 0x48, 0xbb, 0xc1, 0xd5, 0x7b, 0xa5, 0x97, 0x75, 0x56, 0x8a, 0x4c, 0x92, + 0x9c, 0x3a, 0x34, 0x82, 0x3b, 0x6d, 0x6f, 0x35, 0xd0, 0x6b, 0xab, 0xa1, + 0x6e, 0x7f, 0x66, 0x6c, 0xa9, 0x92, 0x7b, 0x3e, 0x2f, 0x76, 0x58, 0x38, + 0x6a, 0x98, 0xc2, 0xce, 0x66, 0xb4, 0xc4, 0x96, 0xca, 0xb6, 0x83, 0xbf, + 0x72, 0xbe, 0xb3, 0x91, 0xb4, 0x5f, 0x7e, 0x70, 0x32, 0x92, 0xd0, 0x5d, + 0x69, 0xb3, 0x33, 0xc4, 0xa8, 0x45, 0x82, 0x46, 0x85, 0x8a, 0x54, 0x4c, + 0x88, 0xc9, 0x49, 0x3f, 0x57, 0x9f, 0x5b, 0x8b, 0x7a, 0xa1, 0x38, 0xbe, + 0xc0, 0x8a, 0xa6, 0x67, 0x6c, 0x91, 0xb2, 0x5c, 0x91, 0x51, 0x84, 0x8e, + 0x3c, 0x90, 0x4d, 0xc8, 0x9b, 0x97, 0xc4, 0x90, 0x38, 0xa5, 0xb7, 0x5f, + 0xb4, 0x55, 0x67, 0x8e, 0x37, 0x68, 0x76, 0x71, 0x8e, 0x95, 0x98, 0xcc, + 0x39, 0x3e, 0xab, 0x79, 0x71, 0x52, 0xa0, 0xbe, 0x36, 0xd1, 0x4c, 0x4a, + 0x59, 0x65, 0x78, 0x99, 0x39, 0x7f, 0x40, 0x9a, 0x5b, 0x5c, 0xc8, 0xb5, + 0xb1, 0xcd, 0xbf, 0x3a, 0x36, 0x7c, 0x5d, 0x9e, 0xa5, 0x4b, 0xc0, 0xc0, + 0x96, 0x31, 0xb4, 0xcf, 0xb5, 0xac, 0x4d, 0x58, 0x47, 0x40, 0x60, 0x92, + 0x4c, 0x9e, 0x59, 0x78, 0xb4, 0x39, 0xbb, 0x69, 0x86, 0xb9, 0x3c, 0x31, + 0x43, 0xd2, 0x46, 0x33, 0x5b, 0xa5, 0xa4, 0xad, 0x3b, 0x5d, 0x58, 0x49, + 0x85, 0xc7, 0xc8, 0x6d, 0x3e, 0xd1, 0xba, 0x9d, 0xbb, 0xc3, 0xc8, 0x4b, + 0x43, 0x95, 0x81, 0x3b, 0x82, 0x5c, 0x79, 0x74, 0xc4, 0xae, 0x52, 0x5b, + 0xb2, 0xce, 0x9d, 0xae, 0xc8, 0x7c, 0x77, 0x31, 0xc0, 0xa8, 0x4a, 0xc3, + 0x8a, 0xac, 0x4c, 0x7b, 0x80, 0x4d, 0x4e, 0x58, 0x3c, 0xc9, 0x33, 0x67, + 0x67, 0x59, 0x3b, 0x9a, 0x56, 0x6e, 0x93, 0xb6, 0x88, 0xcb, 0x9e, 0x75, + 0xb1, 0x9e, 0xa1, 0x6c, 0xc4, 0x67, 0xaa, 0xb2, 0x6e, 0x78, 0x7a, 0x4d, + 0x48, 0x65, 0x56, 0xcc, 0x32, 0xa7, 0xc1, 0x8f, 0xb1, 0x7a, 0xad, 0xb2, + 0xc7, 0x3c, 0x7a, 0x89, 0x40, 0x79, 0x4b, 0x8d, 0xbf, 0xc2, 0xba, 0x54, + 
0xd1, 0xa4, 0x33, 0xc8, 0xaf, 0xb3, 0x56, 0xa5, 0x83, 0x7c, 0x99, 0x90, + 0x6e, 0x63, 0xb9, 0x30, 0xb5, 0xa8, 0x64, 0x37, 0x6b, 0x39, 0x9b, 0x9a, + 0x33, 0xad, 0x39, 0x71, 0xc6, 0x3e, 0x7f, 0xc7, 0xbb, 0x8a, 0x73, 0xb8, + 0xb5, 0x7b, 0x78, 0xaf, 0x4a, 0x72, 0xb8, 0x55, 0xbb, 0x3d, 0x44, 0x54, + 0x7f, 0x3c, 0xba, 0x9f, 0x81, 0x93, 0x86, 0xc0, 0x26, 0x96, 0x91, 0xbb, + 0x43, 0x5b, 0xb8, 0x2d, 0x98, 0x3c, 0xc1, 0xb2, 0x8f, 0x64, 0x77, 0x9f, + 0xb2, 0xb5, 0xa0, 0x70, 0xc1, 0xc0, 0x37, 0x5a, 0x41, 0x57, 0xda, 0x56, + 0xc6, 0xbd, 0x3b, 0x78, 0x9f, 0xc4, 0x39, 0x46, 0xba, 0xd0, 0x6d, 0x5c, + 0x70, 0x93, 0x4d, 0xa6, 0x3d, 0x99, 0x38, 0x65, 0xd1, 0xb6, 0xd8, 0xc4, + 0xbe, 0x32, 0xc1, 0x5e, 0xa6, 0x4c, 0x91, 0x60, 0x87, 0x8e, 0xc8, 0x42, + 0x8a, 0xa2, 0x56, 0x93, 0x92, 0x83, 0xc7, 0xc9, 0x8a, 0x6f, 0x76, 0x46, + 0x8c, 0xa6, 0x57, 0xb8, 0x2e, 0xcb, 0x3c, 0x8e, 0x72, 0x67, 0xd9, 0xca, + 0x5f, 0x5b, 0xba, 0x44, 0x68, 0x82, 0x7e, 0x83, 0x6b, 0xcc, 0xb2, 0xa0, + 0x56, 0x67, 0xb9, 0x53, 0x86, 0x36, 0xbc, 0xcf, 0x43, 0x46, 0x8c, 0x78, + 0x87, 0xa7, 0xd1, 0x2f, 0xd2, 0x4f, 0x66, 0xb4, 0x96, 0x95, 0x4a, 0x9d, + 0x8c, 0x7e, 0x65, 0xa3, 0x84, 0x78, 0xb9, 0xb3, 0x80, 0x6d, 0x4c, 0xab, + 0x75, 0xa7, 0x6e, 0x6e, 0x6b, 0x62, 0x7c, 0xa5, 0x5f, 0xa8, 0x6c, 0x4d, + 0x52, 0x78, 0x5c, 0x48, 0x98, 0x45, 0x48, 0x65, 0x91, 0x45, 0x84, 0x85, + 0x73, 0x5c, 0xa7, 0xac, 0x5e, 0xd0, 0x93, 0x70, 0x59, 0xbf, 0x7d, 0x74, + 0xcd, 0x3d, 0x9d, 0x3b, 0x7e, 0xc9, 0x70, 0xb9, 0x6d, 0x5d, 0xd4, 0x96, + 0x80, 0x47, 0xb0, 0xbc, 0x60, 0xd1, 0xa8, 0xca, 0xa9, 0x86, 0x53, 0xbf, + 0x8f, 0xc7, 0x36, 0xd9, 0x46, 0x47, 0xb6, 0x8c, 0x47, 0x61, 0x62, 0x42, + 0x81, 0xc0, 0x79, 0x75, 0xa7, 0x57, 0x9b, 0x70, 0xcf, 0x5b, 0x6a, 0xb8, + 0xba, 0x44, 0x50, 0xc4, 0xa1, 0xa1, 0xc0, 0x70, 0xd3, 0x5a, 0x47, 0x5c, + 0x78, 0x50, 0x5c, 0x45, 0xc1, 0x6d, 0x57, 0x6a, 0x75, 0x98, 0x8a, 0xa3, + 0x7f, 0x9b, 0xa7, 0xbc, 0x92, 0x36, 0x4d, 0x51, 0x70, 0x76, 0x82, 0xa6, + 0x66, 0xa9, 0x59, 0x48, 0x9d, 0x58, 0xbb, 0x6e, 0x4c, 0xb4, 0x36, 0x75, + 0xa5, 0x8b, 0x50, 0x7c, 0xb8, 0x3c, 0x36, 0x37, 0x83, 0x51, 0x4d, 0x7a, + 0x74, 0x4a, 0x8a, 0x32, 0xaa, 0x91, 0x69, 0xa3, 0x9f, 0x77, 0x59, 0xcc, + 0xa7, 0x97, 0x79, 0x65, 0x53, 0xca, 0x85, 0xa2, 0xa5, 0x98, 0xaf, 0x7b, + 0x30, 0x37, 0xd5, 0x44, 0x8d, 0xc1, 0x40, 0x56, 0x61, 0xd0, 0x74, 0xcb, + 0x57, 0x53, 0xa9, 0x51, 0x88, 0x8a, 0x82, 0x85, 0xd5, 0x94, 0x77, 0xda, + 0x96, 0x3d, 0xa1, 0x41, 0x92, 0x70, 0x8d, 0xc7, 0x33, 0xb2, 0xb8, 0x48, + 0xa4, 0xc8, 0x79, 0x5c, 0x57, 0x91, 0xc9, 0xcb, 0xd3, 0x5b, 0xc4, 0xd4, + 0x8d, 0x89, 0x6c, 0x67, 0x7a, 0x4e, 0x87, 0xcf, 0x70, 0x3f, 0xa4, 0x36, + 0xba, 0x4c, 0xc0, 0x5e, 0xb6, 0xae, 0x50, 0xa8, 0xb0, 0x5f, 0x3e, 0x33, + 0xa9, 0x92, 0xaf, 0x74, 0x4c, 0x77, 0x6d, 0x73, 0x79, 0x8d, 0x57, 0x4f, + 0x7e, 0xbb, 0x44, 0x9a, 0xcd, 0xaf, 0x3a, 0x88, 0xcc, 0x8e, 0xb6, 0xa0, + 0x71, 0x5a, 0x96, 0xa8, 0xa3, 0x33, 0xbb, 0xb3, 0x4a, 0x7a, 0x3b, 0x6f, + 0xb8, 0x33, 0x4f, 0x70, 0x5f, 0x8c, 0x43, 0x4d, 0x5f, 0x7b, 0x91, 0x7e, + 0x33, 0x4b, 0x61, 0x94, 0x86, 0xa9, 0x7c, 0x41, 0xb9, 0x72, 0xcd, 0x55, + 0x75, 0x5e, 0x8c, 0x91, 0x65, 0x4a, 0x4a, 0x99, 0x8d, 0x67, 0x46, 0x64, + 0xc1, 0x5e, 0x86, 0x68, 0x82, 0xa9, 0x9b, 0x84, 0x3c, 0xbc, 0x7f, 0x86, + 0x47, 0x5c, 0x58, 0x4c, 0x41, 0x8d, 0x7a, 0xb2, 0x9c, 0x45, 0xaa, 0x3b, + 0xbc, 0x86, 0x82, 0x50, 0x7b, 0x5c, 0x7e, 0x59, 0x92, 0xa8, 0xac, 0xb2, + 0x49, 0x3b, 0x75, 0x53, 0x49, 0xbf, 0x81, 0x9a, 0x3b, 0x78, 0x59, 0x5b, + 0x95, 0xb4, 0x86, 0x4f, 0x69, 0x6c, 0xc6, 0x33, 0xd2, 0x94, 0x83, 0xa9, + 0x44, 0x96, 0x48, 0x73, 0x8d, 0xb9, 0x5a, 0xb1, 0x70, 0xa1, 0x55, 0x37, + 
0x45, 0x5f, 0x39, 0x98, 0xc2, 0x6e, 0x6e, 0xc4, 0x77, 0xb6, 0x6b, 0x74, + 0x9e, 0x7b, 0xbc, 0x6d, 0xad, 0x8e, 0xbb, 0xce, 0x80, 0xaa, 0x9f, 0x9e, + 0xbb, 0xc5, 0x6d, 0x8f, 0xbb, 0xb8, 0xa3, 0x66, 0x6f, 0xb4, 0x3c, 0x38, + 0x6b, 0x96, 0x4c, 0xd4, 0x62, 0xc2, 0x63, 0xb4, 0x5e, 0x8d, 0xa2, 0xa1, + 0x66, 0x34, 0xb0, 0xc9, 0x9c, 0x86, 0x4e, 0x88, 0x98, 0x58, 0x82, 0x38, + 0x78, 0x3e, 0x8f, 0xc0, 0x41, 0xaf, 0xbb, 0xb7, 0x3a, 0xba, 0xc5, 0x38, + 0x5e, 0x4f, 0x9f, 0x97, 0x59, 0x82, 0x6c, 0xc6, 0x53, 0x92, 0xb3, 0x53, + 0x98, 0x8f, 0x6d, 0x8f, 0x3a, 0xa9, 0x4d, 0x75, 0xc1, 0x57, 0xd2, 0xc7, + 0x7c, 0x40, 0xcb, 0x38, 0x5f, 0xcb, 0x6e, 0x60, 0x6c, 0x4b, 0x71, 0x5e, + 0x6a, 0x7a, 0xd5, 0x7d, 0xc2, 0x79, 0x3a, 0x76, 0x32, 0x35, 0xc7, 0xbc, + 0x28, 0x8b, 0x8b, 0x9b, 0x6a, 0x53, 0x47, 0xbc, 0x87, 0x8d, 0xc8, 0xa1, + 0x5f, 0x45, 0xb9, 0x61, 0x82, 0x38, 0x87, 0x77, 0x6e, 0xcb, 0xa4, 0xce, + 0xcd, 0x51, 0x75, 0x7a, 0x3d, 0xd0, 0x99, 0x66, 0xa6, 0x99, 0x94, 0xb2, + 0x73, 0x92, 0x83, 0x39, 0xb7, 0x8e, 0xc9, 0x9b, 0x39, 0xca, 0x33, 0x95, + 0xbe, 0x37, 0x87, 0xaf, 0x4f, 0x54, 0xad, 0xb0, 0x3e, 0x73, 0x57, 0x5b, + 0xd8, 0x5d, 0x45, 0xb7, 0xc9, 0x40, 0x9c, 0x48, 0x7d, 0x39, 0x4e, 0xae, + 0x6e, 0x60, 0x7e, 0x68, 0x4c, 0x5c, 0xb9, 0x9f, 0xad, 0x95, 0x9d, 0xcc, + 0xab, 0x65, 0x82, 0x9b, 0xd0, 0x4b, 0x4a, 0x61, 0xb0, 0x9f, 0xb4, 0xb4, + 0x68, 0x5e, 0xb4, 0xb4, 0x9f, 0xbc, 0x8f, 0xbf, 0x41, 0xc1, 0x9f, 0x7c, + 0x74, 0x4d, 0x7a, 0x33, 0xc3, 0x82, 0xc4, 0x91, 0x2e, 0xc5, 0xb9, 0xb1, + 0x56, 0x44, 0xaf, 0x9a, 0x6b, 0x93, 0x7a, 0xca, 0xb0, 0xad, 0xda, 0x33, + 0xa8, 0x4a, 0xc9, 0xa3, 0xd0, 0xbd, 0x57, 0xb2, 0x3a, 0x70, 0xa1, 0x6b, + 0x6f, 0xa0, 0x4e, 0x6e, 0x42, 0xa1, 0xd6, 0xc5, 0x4b, 0x8e, 0x9a, 0x7e, + 0xcb, 0x56, 0x24, 0x62, 0xb1, 0x9c, 0x63, 0xa5, 0x4f, 0x70, 0x49, 0x5a, + 0x2e, 0x7d, 0x61, 0xc2, 0x80, 0x5c, 0x58, 0x2f, 0xbc, 0x93, 0x83, 0xcf, + 0x5c, 0xd4, 0xdc, 0x66, 0x4e, 0x95, 0xa1, 0x43, 0x38, 0x94, 0x7a, 0x94, + 0x7e, 0xa6, 0x3f, 0x5d, 0x69, 0x58, 0xa8, 0x84, 0x9f, 0x9f, 0xac, 0xa2, + 0xc1, 0x4f, 0x88, 0x40, 0xc4, 0xd0, 0x87, 0xc3, 0x47, 0xae, 0xc2, 0xa1, + 0xa2, 0x90, 0x7f, 0x5b, 0xa1, 0x34, 0xa1, 0x4f, 0x9b, 0xa9, 0x2b, 0x27, + 0x47, 0xc3, 0x7e, 0x9a, 0x4a, 0xb2, 0x95, 0x9b, 0xc1, 0x7c, 0x5d, 0x75, + 0x63, 0xc5, 0x3f, 0x74, 0xbb, 0x7b, 0x96, 0xa2, 0x80, 0x75, 0x60, 0xd2, + 0x7c, 0x8a, 0x50, 0x39, 0x77, 0xa4, 0xb6, 0x46, 0x9c, 0x68, 0x33, 0xce, + 0x59, 0x5f, 0xc1, 0x34, 0x7b, 0x39, 0xce, 0x34, 0x65, 0x80, 0x6c, 0x80, + 0xbd, 0x5b, 0x3c, 0x5d, 0xa9, 0xa4, 0xd8, 0x98, 0x56, 0xca, 0xa0, 0x3c, + 0xb8, 0x74, 0x8e, 0x75, 0x82, 0x96, 0xbb, 0x5b, 0x51, 0xab, 0xab, 0x42, + 0x5d, 0x73, 0x7f, 0x8a, 0x46, 0xb6, 0x7d, 0x7b, 0xb7, 0x76, 0xb7, 0x73, + 0x5a, 0x8a, 0x5f, 0xd3, 0x49, 0xa6, 0x47, 0x43, 0x6b, 0x93, 0xb7, 0x54, + 0xbb, 0xab, 0x8c, 0x72, 0x62, 0x89, 0xac, 0xce, 0x93, 0xb0, 0xad, 0xbf, + 0xcb, 0x72, 0x5c, 0x8e, 0x67, 0xc0, 0x49, 0x63, 0x47, 0x85, 0xb9, 0x34, + 0x65, 0xd3, 0x55, 0x74, 0xc5, 0x61, 0x94, 0x7a, 0x54, 0x42, 0x62, 0x93, + 0xd3, 0xb5, 0xb3, 0x8b, 0xbc, 0x2f, 0x6a, 0x67, 0x4b, 0x39, 0x4f, 0xad, + 0x3c, 0x6d, 0xd8, 0xb0, 0xc1, 0x37, 0x2a, 0x8d, 0x98, 0x90, 0x63, 0x78, + 0x65, 0x83, 0x49, 0x5f, 0xb2, 0x44, 0xae, 0x9e, 0xab, 0x52, 0x40, 0x52, + 0x70, 0xaa, 0x71, 0xd3, 0xa8, 0xab, 0x6a, 0x74, 0xbe, 0x73, 0x78, 0xa6, + 0x6f, 0x5b, 0xc8, 0xda, 0x35, 0x39, 0x5f, 0x96, 0xbb, 0xb7, 0x9d, 0x58, + 0x9b, 0xb6, 0x47, 0x32, 0x5c, 0x5a, 0xb8, 0x41, 0x42, 0x5e, 0x96, 0x4c, + 0x76, 0x89, 0x68, 0x9b, 0x60, 0x60, 0x7e, 0x8c, 0x3e, 0xba, 0xab, 0x60, + 0x43, 0xc7, 0x8b, 0xc4, 0x86, 0xa8, 0x7b, 0x45, 0x80, 0xb6, 0x88, 0x7e, + 
0x67, 0x81, 0xba, 0x9a, 0x6f, 0x93, 0x4c, 0x36, 0x79, 0x85, 0x66, 0xa3, + 0x65, 0xb2, 0xa3, 0x86, 0x32, 0xaa, 0x36, 0x58, 0x7d, 0xca, 0x82, 0x65, + 0x98, 0x94, 0x76, 0x85, 0x84, 0xb4, 0x6b, 0x9c, 0xd5, 0x44, 0x4b, 0xbb, + 0x50, 0xbf, 0x56, 0x79, 0xd6, 0x70, 0x74, 0xad, 0x4c, 0xa5, 0x3c, 0xd1, + 0xb3, 0x6e, 0x59, 0x31, 0x7b, 0x40, 0x9c, 0x55, 0xa9, 0x59, 0x89, 0x44, + 0x76, 0x43, 0x8f, 0xa1, 0xc2, 0x9b, 0xaa, 0x4b, 0x28, 0xa2, 0xe2, 0x70, + 0x99, 0x7f, 0x50, 0x53, 0x98, 0x9a, 0xd5, 0x9a, 0xb5, 0x9c, 0xca, 0x2b, + 0x89, 0x92, 0x78, 0xaf, 0x54, 0x57, 0x8e, 0x39, 0x5c, 0xb9, 0x34, 0x9c, + 0x69, 0xb2, 0xc0, 0x64, 0x45, 0x63, 0x6d, 0x4f, 0x5e, 0x8c, 0x44, 0xac, + 0x72, 0x59, 0x94, 0x3a, 0xc8, 0x35, 0x76, 0x7b, 0xaa, 0x75, 0x5b, 0x91, + 0xb2, 0xac, 0x5d, 0xa6, 0x57, 0xd8, 0xc4, 0xb2, 0x7f, 0x8d, 0xcf, 0x90, + 0x89, 0x38, 0x55, 0xb1, 0x3a, 0x58, 0x95, 0x4e, 0xcd, 0xb9, 0x7e, 0x92, + 0x37, 0xd1, 0x92, 0xad, 0x9f, 0x3d, 0xc1, 0x5e, 0x2c, 0x4b, 0xd1, 0x45, + 0x54, 0xe3, 0x66, 0xbd, 0xbe, 0xae, 0x5a, 0x86, 0x3e, 0x7b, 0x3a, 0x2b, + 0xc8, 0x9f, 0x97, 0x37, 0xa0, 0x4e, 0x84, 0x8b, 0x77, 0x5a, 0x94, 0x85, + 0x66, 0x4b, 0x2d, 0x51, 0x72, 0x7d, 0x2e, 0x9c, 0xa8, 0x87, 0x4c, 0xac, + 0x3a, 0x55, 0x4b, 0xbf, 0x5f, 0x6f, 0xcf, 0xb8, 0x59, 0x6c, 0x73, 0x9e, + 0x48, 0x6e, 0x43, 0x4e, 0x42, 0xcf, 0x3f, 0x45, 0xb4, 0x9e, 0x8b, 0x86, + 0x3e, 0xab, 0x81, 0x9a, 0xab, 0xa3, 0xc8, 0x4f, 0x60, 0x3e, 0x7b, 0xc3, + 0xa5, 0x4c, 0x34, 0x74, 0xb2, 0x75, 0x8e, 0x88, 0xb3, 0x9d, 0x73, 0xac, + 0x4c, 0xa3, 0x6a, 0x6e, 0xbd, 0x6e, 0xac, 0x8f, 0xac, 0x76, 0xce, 0xc7, + 0x78, 0x90, 0xa8, 0x54, 0xd7, 0x71, 0x36, 0xc5, 0x37, 0x6a, 0x83, 0x57, + 0xca, 0x78, 0xca, 0x39, 0x9b, 0xb7, 0xad, 0x59, 0xd1, 0x35, 0x93, 0xb6, + 0x9d, 0x33, 0x73, 0x84, 0x36, 0x97, 0xac, 0xa4, 0x64, 0xd5, 0x49, 0x9b, + 0x4f, 0xbc, 0x8a, 0x31, 0x67, 0x91, 0x72, 0x72, 0x80, 0x42, 0x8a, 0xd2, + 0x40, 0x3f, 0x99, 0xc4, 0x51, 0x98, 0xbe, 0xa8, 0xb0, 0xa8, 0x75, 0xcd, + 0x85, 0x60, 0xae, 0x70, 0x54, 0xbe, 0x44, 0xad, 0xac, 0xb7, 0xac, 0xcc, + 0xa1, 0x61, 0x43, 0x40, 0x68, 0xaa, 0x6b, 0x59, 0xd2, 0xad, 0x90, 0x2a, + 0x85, 0x53, 0x6c, 0xa2, 0x6e, 0xa6, 0x81, 0x81, 0x59, 0x70, 0x8f, 0x87, + 0x80, 0xa5, 0x97, 0x66, 0xa1, 0x7a, 0x8b, 0x53, 0x5f, 0xcc, 0xc2, 0x84, + 0x46, 0x21, 0x5a, 0x94, 0x62, 0xad, 0x49, 0x73, 0x90, 0xac, 0x2d, 0x9d, + 0xd1, 0x70, 0x42, 0xb5, 0xcc, 0xa9, 0xac, 0x4a, 0x96, 0x76, 0xd5, 0xa9, + 0xc4, 0x82, 0xae, 0x90, 0x59, 0xcc, 0x63, 0x9d, 0xcb, 0x98, 0x61, 0x75, + 0xa8, 0x5b, 0xc5, 0x36, 0xbc, 0x8d, 0x83, 0x2f, 0xb7, 0x7c, 0x4d, 0xb3, + 0x7b, 0x68, 0x69, 0x3d, 0xb8, 0x47, 0x6a, 0x7d, 0x4b, 0xe3, 0x60, 0xa1, + 0x42, 0xd8, 0x7e, 0x99, 0x8d, 0x34, 0x3b, 0x51, 0x99, 0x39, 0x76, 0x9d, + 0xa2, 0x8b, 0x47, 0xab, 0x3d, 0x40, 0x4d, 0x65, 0x54, 0x41, 0x8d, 0x40, + 0x91, 0x63, 0x83, 0xd8, 0x5d, 0x7d, 0xac, 0x68, 0xb2, 0x65, 0xc6, 0xad, + 0x61, 0xce, 0xa9, 0x3b, 0x5d, 0x5d, 0xab, 0xcf, 0x4b, 0xd3, 0x8a, 0xaa, + 0x69, 0xa4, 0x86, 0x43, 0x88, 0x6d, 0xc8, 0x86, 0x57, 0x55, 0x92, 0x63, + 0x92, 0xbb, 0x75, 0xa2, 0x69, 0xce, 0x7d, 0x70, 0xcd, 0xda, 0x69, 0xa1, + 0x2c, 0x9c, 0x3b, 0x36, 0x39, 0xb2, 0x74, 0x4a, 0x28, 0x75, 0x5f, 0xbb, + 0xb9, 0x73, 0x4f, 0x37, 0x4d, 0xc0, 0x55, 0x4c, 0xe5, 0xbb, 0xa6, 0x9e, + 0xc9, 0x87, 0x53, 0x86, 0x6b, 0xc2, 0x74, 0xc1, 0x6c, 0xd3, 0x89, 0x54, + 0x83, 0x49, 0x6e, 0x74, 0xcc, 0x79, 0x58, 0xb0, 0xbc, 0x6f, 0x63, 0x60, + 0x89, 0x49, 0x82, 0xbf, 0x99, 0x48, 0x70, 0x60, 0x38, 0x77, 0x50, 0x64, + 0xb0, 0x86, 0x43, 0x48, 0xae, 0xc9, 0x38, 0x47, 0xbb, 0xc8, 0x79, 0x32, + 0x42, 0x61, 0xcd, 0xce, 0xa2, 0xaf, 0xc8, 0x8f, 0x69, 0xb1, 0xa1, 0x76, + 
0xb4, 0x44, 0x39, 0x8f, 0x6e, 0x75, 0x54, 0x4d, 0xa6, 0x6f, 0x75, 0xa0, + 0x55, 0x44, 0x50, 0x99, 0x3a, 0x9f, 0xa2, 0xc5, 0xb4, 0xb4, 0xc5, 0x5c, + 0x97, 0x78, 0x72, 0xbd, 0xd1, 0xba, 0x67, 0x71, 0x9c, 0x53, 0x93, 0x4c, + 0xb0, 0xc8, 0x53, 0x41, 0xa5, 0x5b, 0xa4, 0xb8, 0x6d, 0x88, 0x73, 0xda, + 0x7c, 0x39, 0xab, 0x47, 0x69, 0xa0, 0xda, 0x70, 0xcd, 0x3e, 0x7c, 0xd4, + 0x4e, 0xaa, 0x72, 0x89, 0x54, 0xcd, 0x88, 0xa4, 0xa8, 0x53, 0xb0, 0x43, + 0x7e, 0xb8, 0x74, 0xb3, 0x95, 0x60, 0x63, 0xc1, 0xc1, 0x50, 0x65, 0xc1, + 0x7b, 0xc3, 0x80, 0x51, 0x42, 0xae, 0x8a, 0xae, 0x7a, 0x80, 0xde, 0xb6, + 0xa8, 0xc4, 0xd9, 0x79, 0x45, 0x6d, 0x9d, 0x9d, 0x57, 0x55, 0xbf, 0x6d, + 0x8d, 0x3d, 0xb4, 0xd4, 0x4a, 0x8a, 0x9d, 0x32, 0x9e, 0x8d, 0x95, 0xa1, + 0x34, 0xa3, 0x84, 0xc6, 0x2a, 0xa7, 0xaa, 0x51, 0xa8, 0x80, 0x72, 0x33, + 0x49, 0xb7, 0x7f, 0x56, 0xaf, 0xad, 0x3a, 0x7f, 0x8e, 0x9c, 0xc1, 0xb1, + 0x89, 0xa6, 0x38, 0x99, 0xc1, 0xbd, 0xc7, 0x93, 0xa7, 0x53, 0xb9, 0xbf, + 0xa5, 0x63, 0x48, 0x4c, 0x5e, 0xa3, 0x48, 0x80, 0x6d, 0xa7, 0x7f, 0x35, + 0x6d, 0x3c, 0x78, 0x88, 0x5c, 0xb0, 0x59, 0x4f, 0x8a, 0x9c, 0x72, 0xb2, + 0x6f, 0x85, 0x60, 0x98, 0x36, 0xbd, 0x9c, 0xcf, 0xd1, 0x5d, 0x39, 0xc7, + 0x56, 0x69, 0xcb, 0x79, 0x53, 0xc1, 0x5a, 0xac, 0x7f, 0x5d, 0xd4, 0x8d, + 0x86, 0x7a, 0xb1, 0xb1, 0x2c, 0x66, 0x39, 0x9e, 0x9f, 0x65, 0xb1, 0x59, + 0xc4, 0x5b, 0xb3, 0x71, 0xac, 0x61, 0x8c, 0x35, 0x56, 0xd3, 0xe5, 0x3a, + 0x80, 0x9f, 0x90, 0x7c, 0xdb, 0x4b, 0xa0, 0x34, 0x71, 0xcd, 0x6a, 0x36, + 0x44, 0x53, 0xac, 0xac, 0xa8, 0xb4, 0xa8, 0xbe, 0x3d, 0x87, 0x86, 0x3c, + 0x8f, 0x65, 0x94, 0xb2, 0x5d, 0x51, 0x92, 0x7a, 0xd4, 0x87, 0xcd, 0x6c, + 0xbd, 0x3c, 0x92, 0x65, 0x55, 0xd0, 0x79, 0xba, 0xb1, 0xbb, 0xba, 0x66, + 0xc3, 0xa8, 0xcd, 0x9c, 0xad, 0x93, 0x7c, 0x75, 0x85, 0x33, 0x4d, 0x9f, + 0x31, 0x37, 0xab, 0x31, 0xb2, 0x8d, 0x46, 0x7c, 0x76, 0x50, 0xc3, 0x62, + 0xa3, 0xa8, 0xa4, 0xcd, 0x98, 0x8d, 0x56, 0xcf, 0x5f, 0xa9, 0x6b, 0x59, + 0x73, 0x89, 0x51, 0xc3, 0xaa, 0x51, 0x68, 0x57, 0x56, 0xc5, 0xc3, 0x64, + 0xc8, 0x36, 0x32, 0x9a, 0xc6, 0x4f, 0xb2, 0x88, 0x9a, 0x8e, 0xb5, 0xa5, + 0x3c, 0x64, 0x77, 0xcf, 0x5b, 0x3d, 0x65, 0x6d, 0x67, 0x38, 0x50, 0xc4, + 0xd5, 0x3b, 0x59, 0x54, 0xae, 0x9d, 0xc2, 0x48, 0x60, 0x7b, 0x83, 0x3e, + 0xd6, 0xa4, 0x51, 0x9f, 0x53, 0xbc, 0x49, 0xc3, 0xb4, 0xce, 0xb9, 0xa5, + 0x64, 0xb8, 0xd1, 0x7a, 0xa5, 0xb2, 0x43, 0xc3, 0xc8, 0x82, 0x59, 0x58, + 0x4d, 0x58, 0xa3, 0x8a, 0x62, 0xba, 0x7c, 0xb4, 0x77, 0xca, 0x73, 0x56, + 0xac, 0x7a, 0x7b, 0xa8, 0x42, 0x74, 0x9e, 0x4b, 0x75, 0x47, 0xbb, 0x7b, + 0xd6, 0xb9, 0x92, 0xb5, 0x7f, 0x8f, 0x96, 0x5a, 0xd8, 0xe0, 0x80, 0x56, + 0x3d, 0x48, 0x3c, 0x67, 0x5f, 0x95, 0x3c, 0x58, 0x6e, 0x69, 0x47, 0x59, + 0x8d, 0x7f, 0x9b, 0xcb, 0xb2, 0x5f, 0x82, 0x45, 0x9d, 0x68, 0x7b, 0x2b, + 0xa9, 0x4c, 0xa0, 0x69, 0x7f, 0xbf, 0xb8, 0x8b, 0xc2, 0xb8, 0xd0, 0x40, + 0x59, 0x8e, 0x31, 0x54, 0x85, 0xb4, 0x59, 0x38, 0xcb, 0x5d, 0xaf, 0x60, + 0x97, 0x8d, 0x8c, 0x6f, 0x2f, 0x4b, 0x72, 0x73, 0x80, 0xb9, 0x6b, 0x9c, + 0xa6, 0x58, 0xa2, 0x9b, 0x40, 0x4a, 0xa6, 0xc2, 0x46, 0xa1, 0x44, 0xb4, + 0x55, 0xb8, 0x59, 0x5b, 0x79, 0x91, 0x9b, 0xd2, 0x8f, 0x2e, 0x87, 0x71, + 0x46, 0x4e, 0x92, 0x49, 0x67, 0x80, 0x4e, 0x4c, 0x3a, 0xc7, 0x7a, 0x91, + 0xb2, 0x59, 0x70, 0xc8, 0xc7, 0x84, 0x9b, 0x81, 0x83, 0x89, 0x79, 0xb2, + 0x3f, 0xc4, 0xac, 0xa2, 0x4a, 0xcd, 0x4f, 0xb4, 0xb2, 0xd2, 0xd7, 0xc3, + 0x79, 0xd1, 0x8e, 0x5b, 0x5c, 0xab, 0x4d, 0x95, 0xc8, 0x99, 0xbe, 0x7f, + 0x97, 0xaf, 0x7d, 0x67, 0xb9, 0xa3, 0x52, 0xa2, 0x42, 0x61, 0x5c, 0x63, + 0x31, 0x43, 0x23, 0x4c, 0x74, 0x51, 0xa8, 0x9c, 0xc4, 0x61, 0xa8, 0xc7, + 
0x34, 0x61, 0xc4, 0xb4, 0xb2, 0x3f, 0x73, 0x75, 0x39, 0x66, 0xa5, 0x5c, + 0x4f, 0xd1, 0xba, 0x6a, 0x63, 0xa2, 0xc0, 0x96, 0xcd, 0x96, 0xa3, 0x60, + 0xc5, 0x6a, 0x90, 0x7d, 0xb5, 0xb4, 0x61, 0xba, 0x86, 0x38, 0x4b, 0xb3, + 0xae, 0x95, 0xcc, 0x6a, 0x54, 0x8e, 0x3d, 0xad, 0x80, 0x58, 0x37, 0x66, + 0x7a, 0xb1, 0x54, 0xd0, 0x73, 0xa1, 0xc6, 0x38, 0x67, 0x56, 0xd1, 0xd0, + 0x3c, 0xb2, 0x2e, 0xbb, 0x9c, 0x80, 0x63, 0x79, 0xce, 0x5f, 0x49, 0x4b, + 0x67, 0xd0, 0xa4, 0x85, 0x87, 0x94, 0xd9, 0x9c, 0x81, 0xbb, 0xce, 0x39, + 0x79, 0x64, 0x60, 0x46, 0x38, 0x39, 0x4c, 0x7e, 0x73, 0x51, 0x71, 0x4a, + 0x91, 0x93, 0x6a, 0x71, 0x4e, 0x6b, 0xbd, 0xbb, 0x7f, 0xc3, 0x99, 0xc5, + 0x84, 0x8e, 0xa7, 0x9c, 0xae, 0x46, 0x86, 0x50, 0xc2, 0xc9, 0x7d, 0x58, + 0xaf, 0xa7, 0x96, 0x97, 0x93, 0xbd, 0x73, 0x78, 0x8a, 0x5d, 0x87, 0x4f, + 0x5e, 0x35, 0xda, 0x3e, 0x90, 0x69, 0x6b, 0xb9, 0x81, 0x4a, 0x7c, 0xa0, + 0xa2, 0x7a, 0x65, 0x9a, 0x81, 0x80, 0x32, 0x7f, 0x9f, 0x2c, 0x84, 0xc1, + 0x36, 0x4c, 0x94, 0xbb, 0x9b, 0xac, 0x6e, 0x74, 0x97, 0xa8, 0x53, 0xca, + 0x98, 0x96, 0x9c, 0x34, 0x8a, 0x9e, 0x77, 0x83, 0x9c, 0xc4, 0xae, 0x57, + 0x87, 0x35, 0x5b, 0xdc, 0xb1, 0x83, 0x95, 0x36, 0xb3, 0x83, 0x96, 0x71, + 0x40, 0xce, 0x97, 0x6c, 0x3e, 0x55, 0x89, 0xcb, 0x45, 0xa9, 0x39, 0x33, + 0x3b, 0x50, 0x3d, 0xbe, 0x6d, 0x75, 0xdb, 0x9a, 0x3a, 0xc4, 0x47, 0x7e, + 0xcd, 0xa7, 0x39, 0x4c, 0xbe, 0xd2, 0x6a, 0xc8, 0x7c, 0x99, 0xa6, 0xcc, + 0xb8, 0x9a, 0x61, 0x60, 0xca, 0x4c, 0x6f, 0x52, 0xd0, 0x9e, 0xbf, 0x4c, + 0x44, 0x9e, 0x80, 0x77, 0x52, 0x5c, 0x8d, 0x7d, 0xbf, 0x62, 0x38, 0xc5, + 0xbf, 0x56, 0x66, 0x57, 0x8d, 0x3c, 0xa3, 0x33, 0x5f, 0xc1, 0x57, 0xd0, + 0xa6, 0x46, 0x6d, 0x9d, 0x86, 0xba, 0x93, 0x98, 0xd0, 0x77, 0x91, 0x3f, + 0x51, 0x46, 0x90, 0xb1, 0x37, 0x64, 0x73, 0xa9, 0x3f, 0xc6, 0xad, 0x3b, + 0x37, 0xa8, 0x8c, 0xa0, 0x9a, 0xc0, 0xb7, 0x8d, 0xb0, 0x5a, 0xb5, 0x4a, + 0x5f, 0xae, 0x63, 0x63, 0x58, 0xaa, 0x3a, 0x61, 0x72, 0x50, 0x8e, 0xcb, + 0x32, 0x90, 0x62, 0x59, 0x75, 0x94, 0xa2, 0xb7, 0xbd, 0xac, 0xbe, 0xb4, + 0x42, 0x7f, 0xcf, 0xa6, 0x4c, 0xc9, 0x6a, 0x78, 0x49, 0x7c, 0x4c, 0xae, + 0x7c, 0xbd, 0x97, 0x79, 0x6c, 0xc9, 0x83, 0x74, 0x38, 0x46, 0x75, 0xba, + 0x4a, 0x3b, 0xc6, 0x40, 0x68, 0x9d, 0x62, 0x45, 0x50, 0xb9, 0x3b, 0x86, + 0x85, 0x82, 0xb1, 0xcb, 0xaf, 0xc7, 0x3d, 0x9f, 0xa6, 0x69, 0x5a, 0x88, + 0x4a, 0x46, 0x46, 0xc5, 0x4e, 0x35, 0x87, 0xb1, 0x45, 0xc4, 0x60, 0x38, + 0x88, 0x4f, 0x9d, 0xbe, 0x4b, 0xbb, 0xb5, 0xc0, 0x79, 0x9e, 0xc1, 0x4b, + 0x5e, 0x54, 0x94, 0xa2, 0xc1, 0x5a, 0xbd, 0xb6, 0x6e, 0x3c, 0xb2, 0xb0, + 0xb0, 0xb4, 0x3b, 0xa7, 0x9c, 0xbd, 0x49, 0xb8, 0xb1, 0x62, 0xbe, 0x70, + 0x66, 0xc7, 0xcf, 0x8a, 0x43, 0x93, 0xb8, 0xd2, 0x68, 0x66, 0x75, 0xa7, + 0x79, 0x5b, 0xbf, 0x9a, 0x86, 0x89, 0x9e, 0x56, 0x66, 0x39, 0x82, 0x54, + 0x4a, 0x3f, 0xb4, 0x7d, 0x72, 0x44, 0x69, 0x3f, 0x2c, 0x35, 0xca, 0x90, + 0x56, 0x77, 0x68, 0x53, 0xc1, 0x94, 0xb0, 0x3a, 0xc0, 0xb8, 0xa6, 0x6d, + 0xc2, 0xc9, 0x94, 0xbe, 0x68, 0x47, 0xc2, 0x8c, 0x62, 0xbd, 0x38, 0x71, + 0x64, 0x9b, 0x78, 0xb3, 0x69, 0xa9, 0x88, 0x4b, 0x62, 0x63, 0x62, 0x8e, + 0x4b, 0x68, 0xa7, 0xc5, 0xa9, 0xd0, 0xaf, 0xc9, 0x7e, 0x5d, 0xcd, 0xad, + 0x7b, 0x5e, 0x5b, 0x4e, 0x9b, 0xb3, 0x88, 0xa2, 0xc8, 0x63, 0x64, 0xa3, + 0x58, 0x5b, 0xb4, 0xa0, 0x92, 0xb7, 0x5f, 0x53, 0xcf, 0x57, 0x96, 0x49, + 0x92, 0xc4, 0xc2, 0xd0, 0x60, 0x69, 0x63, 0x46, 0x41, 0x31, 0xa2, 0xbd, + 0x65, 0x93, 0x38, 0x63, 0x5a, 0x63, 0xbf, 0x72, 0x61, 0x88, 0x3e, 0x7f, + 0xc3, 0x3e, 0x4e, 0x6d, 0xb9, 0x54, 0xc7, 0x32, 0x4d, 0x8a, 0xb0, 0xce, + 0x36, 0xbe, 0x43, 0x8e, 0x51, 0x8a, 0x7c, 0xab, 0xc3, 0xb1, 0x85, 0xc5, + 
0x88, 0x54, 0xab, 0x76, 0xbc, 0x48, 0x35, 0x58, 0x9e, 0x81, 0x55, 0x5e, + 0x45, 0x8d, 0xce, 0x44, 0x82, 0xab, 0x5a, 0xd3, 0x68, 0x55, 0x59, 0x53, + 0x71, 0x90, 0x63, 0x37, 0x65, 0x5b, 0xa9, 0x99, 0x6b, 0x91, 0xc7, 0x39, + 0x4b, 0xce, 0x2b, 0x89, 0x46, 0x61, 0x64, 0x7f, 0xa0, 0x8f, 0xc2, 0x40, + 0xaa, 0x36, 0xaf, 0x76, 0x84, 0xb4, 0x62, 0x95, 0x58, 0x7f, 0x9c, 0x53, + 0x70, 0x8f, 0x7f, 0xc2, 0xb0, 0x9b, 0x96, 0x7d, 0xc9, 0x90, 0x5d, 0x52, + 0xac, 0xc8, 0x6e, 0x8b, 0x49, 0x48, 0x94, 0xd1, 0xd2, 0xc5, 0xa2, 0x69, + 0x89, 0x31, 0x9a, 0x39, 0x99, 0x4c, 0x55, 0xb7, 0x62, 0xb6, 0x61, 0x55, + 0xa6, 0xc1, 0xab, 0x72, 0x4c, 0x36, 0x8b, 0xc3, 0x87, 0x5f, 0x37, 0x49, + 0x61, 0x67, 0x61, 0x5f, 0x52, 0x98, 0x65, 0x62, 0x71, 0xbe, 0x52, 0x61, + 0x54, 0x87, 0xa0, 0x91, 0xca, 0xa6, 0x76, 0x6f, 0x58, 0x55, 0x4d, 0x43, + 0x3d, 0x55, 0xa0, 0x9d, 0x5f, 0x98, 0x55, 0xae, 0x48, 0xa0, 0xa7, 0x73, + 0xab, 0xd3, 0xc9, 0xa1, 0x3c, 0x38, 0xcb, 0x79, 0x37, 0x4d, 0xc2, 0x57, + 0x63, 0xc3, 0xc6, 0x79, 0xca, 0x4a, 0xac, 0xd4, 0xaf, 0x54, 0x78, 0x7c, + 0xc0, 0xbd, 0xad, 0x44, 0x7e, 0x59, 0x81, 0x83, 0x94, 0x44, 0x99, 0x3a, + 0x9e, 0x77, 0x9f, 0x55, 0xd4, 0x4e, 0x51, 0xc6, 0x7e, 0x9f, 0x66, 0xa3, + 0x86, 0x75, 0x4d, 0xb9, 0x67, 0x6f, 0x34, 0x84, 0xc9, 0xa5, 0x5c, 0x45, + 0xb9, 0x53, 0x3e, 0xb9, 0xb4, 0x7c, 0x4c, 0x91, 0x9c, 0xc2, 0x58, 0xd0, + 0x4a, 0x73, 0x52, 0x4d, 0x9a, 0x9a, 0x93, 0x96, 0x62, 0x87, 0x34, 0xcc, + 0x38, 0x62, 0x30, 0x8e, 0x38, 0xa1, 0x70, 0x4b, 0x67, 0x44, 0x7d, 0x7a, + 0xa4, 0x5d, 0x4e, 0x96, 0x71, 0xac, 0x5b, 0x85, 0x72, 0x7d, 0x60, 0x5e, + 0xca, 0xcc, 0xae, 0xab, 0xaf, 0x63, 0xd4, 0x31, 0xb6, 0x75, 0x38, 0x64, + 0x47, 0x67, 0x8a, 0x52, 0x8a, 0x8a, 0x83, 0x60, 0x32, 0x76, 0xcd, 0x40, + 0x9d, 0x3d, 0x88, 0xa5, 0x96, 0x88, 0xc1, 0x8e, 0x71, 0x3f, 0x6d, 0xc9, + 0x52, 0xb1, 0xd1, 0x3b, 0x3c, 0xa2, 0x6d, 0x8b, 0x63, 0x37, 0x90, 0x3f, + 0xcd, 0x2f, 0x66, 0xa1, 0xa0, 0x43, 0x49, 0x43, 0x48, 0x7d, 0x6f, 0x61, + 0x67, 0x9c, 0xcc, 0xab, 0x41, 0xb1, 0x7b, 0xa1, 0x5a, 0x38, 0xa1, 0x42, + 0xc2, 0x3b, 0x7e, 0xc3, 0xca, 0x5c, 0x67, 0xa6, 0x69, 0x3f, 0x35, 0x39, + 0xc0, 0x6d, 0xb3, 0x73, 0x69, 0xd1, 0xbc, 0x63, 0x5b, 0x6a, 0x94, 0x36, + 0xb4, 0x6a, 0x9d, 0x97, 0xbb, 0x8c, 0x8e, 0x90, 0x35, 0xad, 0x4b, 0x54, + 0xce, 0xc7, 0x71, 0x8a, 0x95, 0x9a, 0x34, 0x71, 0xd0, 0x80, 0x46, 0x86, + 0x40, 0xb2, 0xbe, 0xb7, 0x84, 0x7c, 0xd3, 0x8c, 0xb7, 0xbe, 0x7a, 0x6d, + 0xc8, 0x4b, 0x97, 0xb4, 0x8c, 0x3d, 0xba, 0x7c, 0xc2, 0x57, 0x51, 0x6d, + 0x82, 0x44, 0x46, 0xcd, 0xcd, 0x92, 0x83, 0xc6, 0x4d, 0x4d, 0xb8, 0xb5, + 0xc5, 0x9b, 0x48, 0x92, 0x3f, 0x4a, 0x3b, 0x97, 0x55, 0xcd, 0xa1, 0x7c, + 0x47, 0x4c, 0x70, 0x8f, 0x60, 0xab, 0x64, 0x45, 0x40, 0xb2, 0xbd, 0xab, + 0x5f, 0x31, 0x8a, 0x5e, 0x6d, 0x8f, 0xb8, 0x6f, 0x3b, 0x49, 0x64, 0xa7, + 0x9b, 0xc6, 0x9d, 0xcf, 0x9e, 0x81, 0x7b, 0xbd, 0x46, 0xb5, 0xc3, 0x3d, + 0x97, 0x91, 0xa5, 0xd1, 0xbe, 0xa7, 0x5d, 0x73, 0x7b, 0x58, 0x50, 0xa7, + 0x4c, 0x68, 0x85, 0x43, 0x87, 0x5a, 0x6b, 0x94, 0xb8, 0x42, 0x9d, 0xce, + 0x9f, 0x36, 0x56, 0x81, 0xb2, 0xd1, 0x40, 0x46, 0xa7, 0x48, 0x87, 0x56, + 0x79, 0x5e, 0xd1, 0x43, 0x8e, 0xbe, 0x81, 0x34, 0x48, 0x6d, 0x97, 0xa0, + 0xb3, 0xad, 0x56, 0x70, 0x5e, 0x88, 0x5d, 0xb2, 0xb3, 0x43, 0xac, 0x3e, + 0x85, 0x7e, 0x8d, 0x67, 0x5b, 0x46, 0x90, 0xb0, 0x37, 0xa9, 0x4a, 0x5c, + 0x93, 0x37, 0xc1, 0x8f, 0x3f, 0xcc, 0x66, 0xab, 0x5b, 0x80, 0x86, 0x61, + 0x74, 0x56, 0xa9, 0x3a, 0x49, 0x5c, 0x9b, 0x35, 0x80, 0xc0, 0x5a, 0x42, + 0xaa, 0xa6, 0x5d, 0xcb, 0x54, 0x7c, 0xa9, 0xb2, 0xb0, 0x7d, 0x46, 0xce, + 0x9b, 0xb4, 0xa9, 0x55, 0x62, 0xc3, 0x3b, 0xb3, 0x58, 0x42, 0x8c, 0x63, + 
0x3e, 0x68, 0x49, 0x73, 0x78, 0x5f, 0x62, 0xcc, 0x65, 0x54, 0x9d, 0x63, + 0xc1, 0x41, 0xaa, 0x90, 0x34, 0xaa, 0xa4, 0xb3, 0x6d, 0x90, 0x48, 0x39, + 0x96, 0xc3, 0x41, 0x79, 0x50, 0x3f, 0x57, 0xaa, 0xc3, 0xce, 0x46, 0x3f, + 0x3d, 0x8b, 0x4f, 0x49, 0xb3, 0x92, 0x7c, 0x62, 0x53, 0x83, 0x65, 0x57, + 0xb8, 0xd3, 0x7b, 0xa3, 0x46, 0x75, 0x5a, 0x9e, 0x68, 0xb0, 0x8b, 0x8f, + 0xa4, 0x45, 0x82, 0xb7, 0x82, 0x84, 0x30, 0x77, 0x90, 0x86, 0x75, 0x99, + 0x4a, 0x56, 0x8e, 0xd2, 0xa8, 0x61, 0xb7, 0xa4, 0x5a, 0x8a, 0x78, 0xd1, + 0x9f, 0x56, 0x55, 0xc4, 0x35, 0x87, 0xba, 0x63, 0x56, 0xb4, 0xa9, 0x84, + 0x5e, 0x4f, 0xc3, 0x7d, 0x86, 0x33, 0x8f, 0x93, 0x3d, 0x9b, 0x78, 0x3e, + 0x63, 0xb1, 0x69, 0x69, 0x9e, 0x3b, 0x51, 0x6c, 0x46, 0xcd, 0x65, 0x38, + 0x9e, 0xa4, 0xab, 0xa4, 0x48, 0x93, 0x74, 0x53, 0x54, 0xcf, 0x98, 0x30, + 0x97, 0xb0, 0xc6, 0xa5, 0x81, 0x83, 0x9c, 0x39, 0x5a, 0xb0, 0x85, 0x44, + 0x4b, 0x44, 0xa9, 0x4d, 0xca, 0x35, 0x96, 0x4c, 0x9c, 0x8d, 0x4f, 0x9a, + 0x5b, 0x91, 0x36, 0xab, 0xa5, 0x2e, 0xc4, 0x32, 0xaf, 0xac, 0x43, 0x43, + 0x65, 0x77, 0x93, 0x6b, 0xbe, 0x7e, 0x4b, 0xa6, 0x4f, 0x57, 0x5d, 0x92, + 0x35, 0x4f, 0x6d, 0xaa, 0x91, 0x5b, 0x7f, 0x9e, 0x88, 0xb3, 0x74, 0x39, + 0x72, 0x8c, 0x35, 0xb2, 0x55, 0x35, 0xc5, 0x8c, 0x45, 0xc0, 0xb0, 0x82, + 0x8f, 0x5a, 0x4c, 0xc0, 0x93, 0xa2, 0x7a, 0x50, 0x9e, 0x41, 0x5c, 0x58, + 0xbe, 0x67, 0x64, 0x33, 0x6d, 0x6b, 0xb9, 0xbd, 0x5e, 0xab, 0x8b, 0x84, + 0x5d, 0x92, 0xb4, 0x39, 0x6d, 0xa8, 0x68, 0x51, 0x70, 0x54, 0x80, 0xc2, + 0x4b, 0x9f, 0x41, 0x78, 0x69, 0x95, 0x9a, 0x7b, 0xc6, 0x86, 0x63, 0x68, + 0x7f, 0x8a, 0xc6, 0x36, 0x9a, 0x67, 0xad, 0x32, 0x7d, 0xc4, 0x91, 0x98, + 0xac, 0x34, 0xc4, 0x5d, 0xbe, 0x7d, 0xc1, 0xca, 0x7c, 0x34, 0x49, 0x54, + 0x4f, 0x77, 0x34, 0x7c, 0x99, 0x44, 0x5b, 0x9c, 0xa9, 0xb7, 0x75, 0xc3, + 0xbe, 0xc8, 0x54, 0x73, 0xa8, 0xb3, 0xc3, 0x88, 0x71, 0xbb, 0x3d, 0x96, + 0xaa, 0xba, 0x96, 0xa4, 0x5f, 0x5d, 0x3a, 0x9b, 0x5f, 0xb4, 0x55, 0x90, + 0xcf, 0x7a, 0x5f, 0xb1, 0xc1, 0x3a, 0xc1, 0xa1, 0x49, 0x8d, 0x3e, 0x7a, + 0x88, 0x83, 0xb8, 0xbd, 0x61, 0x30, 0x54, 0x80, 0xd0, 0x90, 0xcc, 0xcd, + 0x81, 0xab, 0x63, 0xbb, 0x72, 0xb6, 0x4c, 0x6c, 0xc1, 0xa5, 0x87, 0xb1, + 0x6b, 0xbe, 0x4b, 0xb5, 0x5b, 0x55, 0x63, 0xce, 0x93, 0x53, 0xa7, 0x57, + 0xbb, 0x92, 0x68, 0x73, 0xb7, 0xaa, 0x57, 0x3f, 0x8f, 0xc2, 0x43, 0x97, + 0x3a, 0x9c, 0x44, 0xb1, 0xbf, 0xa3, 0x5d, 0x3e, 0xcc, 0x72, 0xbf, 0x34, + 0x8e, 0xb4, 0x58, 0xb2, 0x89, 0x90, 0x62, 0x84, 0x3c, 0x4c, 0x97, 0x51, + 0xc1, 0x46, 0xb2, 0xc9, 0x68, 0xce, 0xc7, 0x4f, 0x4e, 0x55, 0x49, 0x32, + 0x66, 0xa9, 0xc0, 0x5d, 0x30, 0x3e, 0x68, 0x4a, 0x52, 0x81, 0x8c, 0x35, + 0xad, 0xa3, 0x5f, 0xb3, 0x31, 0xcf, 0x99, 0x63, 0x5e, 0xc2, 0xab, 0x85, + 0x3c, 0x71, 0x42, 0xb3, 0xc9, 0x57, 0x65, 0x32, 0x48, 0x61, 0x82, 0xbc, + 0x50, 0x3b, 0x55, 0x93, 0xa7, 0x62, 0x76, 0x3b, 0x50, 0xa5, 0x4b, 0x54, + 0x62, 0x5e, 0xad, 0x51, 0x89, 0x9c, 0x58, 0xc7, 0xb2, 0xb1, 0xba, 0x90, + 0x45, 0x8b, 0xb7, 0xc2, 0x64, 0x7d, 0xd3, 0x9f, 0x5f, 0xd1, 0x38, 0x62, + 0xae, 0x42, 0x99, 0x4d, 0x5b, 0xc2, 0xca, 0x73, 0xa9, 0x6a, 0x92, 0xba, + 0x4f, 0x6d, 0x3e, 0x66, 0xa3, 0x92, 0x92, 0x3a, 0xc0, 0xa9, 0xa8, 0x73, + 0x95, 0xb7, 0xaf, 0x68, 0x91, 0x34, 0x91, 0x39, 0x59, 0x81, 0xd0, 0xc2, + 0x6c, 0x54, 0xaf, 0x9e, 0x95, 0x6c, 0x3d, 0xc8, 0x5a, 0x68, 0x81, 0x6b, + 0x7f, 0x59, 0x84, 0x41, 0x9a, 0x6c, 0x3a, 0x63, 0x4c, 0x73, 0x99, 0x81, + 0x9c, 0xa6, 0x60, 0x69, 0xb7, 0xb5, 0xd5, 0x7b, 0x4a, 0x8b, 0x5f, 0xb5, + 0xb7, 0x52, 0x5d, 0x72, 0xc0, 0x42, 0x9a, 0xd1, 0xc6, 0x5f, 0x78, 0x61, + 0xb3, 0x3d, 0x45, 0x48, 0xb5, 0x88, 0x58, 0x9f, 0xa2, 0xc0, 0x86, 0x7b, + 
0x4f, 0x8d, 0x36, 0xa3, 0xc9, 0x8a, 0x68, 0x9a, 0x4d, 0x81, 0xb9, 0x99, + 0x6a, 0xc3, 0x37, 0x9f, 0x70, 0x68, 0x6b, 0x74, 0x51, 0x67, 0x6f, 0x9c, + 0x55, 0xa8, 0xbc, 0x89, 0x7a, 0x8d, 0x43, 0x85, 0x8a, 0x73, 0xd6, 0x9f, + 0x3b, 0x6b, 0x7d, 0x66, 0xc7, 0x3f, 0xa0, 0x89, 0x9d, 0x71, 0x94, 0xb0, + 0x6f, 0xbe, 0x68, 0x80, 0x9f, 0x5d, 0x48, 0xb4, 0xbe, 0xbf, 0x36, 0x85, + 0xa7, 0xc3, 0x90, 0x72, 0x64, 0xa4, 0x31, 0x6e, 0x3a, 0x89, 0x5c, 0x49, + 0xa9, 0x45, 0x69, 0xc6, 0x99, 0x7e, 0x95, 0x6e, 0x47, 0xa6, 0x58, 0xc7, + 0xc5, 0x4a, 0xce, 0x86, 0x91, 0x86, 0xbc, 0x5c, 0x72, 0xc2, 0x6d, 0x6f, + 0xcd, 0xa8, 0x59, 0xb3, 0x94, 0x9b, 0x78, 0x64, 0x42, 0x66, 0x97, 0xc5, + 0xc9, 0x84, 0xbe, 0xb9, 0x70, 0x7b, 0x6a, 0xb5, 0xc9, 0xcf, 0xc9, 0xb6, + 0x39, 0xbf, 0xc0, 0xcc, 0x92, 0xc2, 0xc2, 0x39, 0x6b, 0x50, 0x67, 0x4b, + 0x91, 0x51, 0xae, 0x9b, 0x95, 0x84, 0x67, 0x6c, 0x7e, 0x43, 0xa5, 0x63, + 0x80, 0x76, 0x8b, 0x43, 0x44, 0xa9, 0x60, 0x88, 0xd3, 0xd0, 0xb9, 0x66, + 0x9b, 0x91, 0x9f, 0x95, 0x82, 0x8f, 0xa8, 0x8f, 0x64, 0xb2, 0xa7, 0x7c, + 0x94, 0x93, 0x6a, 0x70, 0x63, 0x4f, 0x54, 0x39, 0xb5, 0x3c, 0x50, 0x53, + 0x35, 0x5f, 0xc6, 0x8f, 0x7a, 0x2f, 0x39, 0x7b, 0x91, 0xc6, 0xc2, 0x41, + 0xd2, 0x42, 0x52, 0x7b, 0xd7, 0x52, 0x6d, 0x4f, 0x7a, 0xc7, 0xd0, 0x6a, + 0x94, 0x65, 0x43, 0x92, 0x80, 0xb2, 0x57, 0x40, 0xb5, 0xc0, 0x81, 0x37, + 0xc1, 0x71, 0x5a, 0x40, 0xb4, 0x56, 0x9d, 0x5d, 0x81, 0x4b, 0xac, 0xcd, + 0xab, 0xa9, 0x67, 0xc3, 0x93, 0xa0, 0x5b, 0xc8, 0xad, 0x7b, 0xc1, 0x65, + 0x58, 0xa2, 0x68, 0x63, 0xd1, 0x4d, 0xb8, 0xb4, 0x54, 0x84, 0x5f, 0x97, + 0xb4, 0x6e, 0x37, 0x94, 0x89, 0xc4, 0xa1, 0x34, 0x58, 0x2b, 0x97, 0x66, + 0x74, 0x9b, 0x78, 0x86, 0x38, 0x56, 0xa0, 0x9e, 0x34, 0xba, 0x54, 0x9e, + 0xd2, 0xc1, 0xc0, 0x8c, 0x41, 0xa0, 0x58, 0x35, 0x8a, 0xbd, 0x91, 0x78, + 0x5b, 0x38, 0xb5, 0x6d, 0x7f, 0x94, 0xba, 0x65, 0x93, 0x5a, 0xbe, 0x7f, + 0xb4, 0x78, 0x55, 0x9c, 0x5f, 0x94, 0x92, 0x8f, 0xa7, 0x9d, 0x7c, 0x83, + 0x5f, 0x3a, 0x6f, 0x86, 0xa2, 0x82, 0xa0, 0x5f, 0x89, 0xb3, 0x6e, 0xbe, + 0x54, 0xb0, 0x5a, 0x78, 0x8e, 0xa5, 0x97, 0xc0, 0xd0, 0x9c, 0x6b, 0x90, + 0xb6, 0x60, 0xcd, 0x91, 0x80, 0xa7, 0x54, 0x37, 0xcc, 0x43, 0x5d, 0xa2, + 0xc5, 0x6a, 0x52, 0xcc, 0xb3, 0x7a, 0xb9, 0xba, 0x9b, 0x40, 0x81, 0x46, + 0x48, 0xb2, 0xcd, 0x6b, 0xa9, 0xc8, 0xaa, 0xc0, 0x87, 0x4f, 0xae, 0xb6, + 0x3d, 0x7f, 0x43, 0xbc, 0xb8, 0xab, 0x77, 0x48, 0x51, 0x67, 0xc5, 0x45, + 0x79, 0x9d, 0xd1, 0x54, 0x41, 0x8f, 0xdd, 0xa2, 0x93, 0x94, 0x66, 0xa5, + 0xa4, 0x92, 0xac, 0xd1, 0xdc, 0xb1, 0xb0, 0x58, 0x8d, 0x97, 0x9c, 0x44, + 0x55, 0xba, 0x47, 0xd7, 0x7e, 0x99, 0xc5, 0x73, 0x7e, 0x6e, 0x6a, 0x8b, + 0x54, 0xce, 0x7e, 0x63, 0x8a, 0x80, 0x55, 0x50, 0x7a, 0xcb, 0x69, 0xb2, + 0x9f, 0x67, 0x39, 0x9f, 0x73, 0x74, 0x5c, 0xc8, 0x99, 0xb2, 0x7a, 0x31, + 0x9f, 0xb4, 0x7a, 0x69, 0xb0, 0x38, 0x4e, 0xaf, 0x51, 0x3b, 0xd4, 0x46, + 0x92, 0x7e, 0xcf, 0x59, 0xb7, 0x56, 0x95, 0x63, 0xba, 0x78, 0x86, 0x81, + 0xb1, 0xaf, 0xa1, 0xbc, 0x9a, 0xb7, 0x4c, 0x2f, 0x34, 0xcf, 0xca, 0x92, + 0xc0, 0x72, 0x43, 0x2b, 0xbc, 0x60, 0x3b, 0x88, 0xaf, 0x60, 0xb2, 0x6f, + 0xb5, 0x91, 0x85, 0x62, 0xcb, 0x59, 0xae, 0xc3, 0x53, 0x3a, 0xc4, 0x4d, + 0x91, 0xa4, 0x7a, 0x31, 0x4a, 0x6b, 0x93, 0x58, 0x3a, 0x6f, 0x62, 0x3f, + 0xab, 0x4f, 0x84, 0xb2, 0x5c, 0x96, 0x7d, 0xce, 0x30, 0xc5, 0x54, 0xb9, + 0xa6, 0xb5, 0x3a, 0x97, 0x61, 0xa7, 0x7d, 0x41, 0x89, 0xc0, 0x4a, 0x9c, + 0xdb, 0xb8, 0x61, 0x35, 0x38, 0x87, 0x68, 0xaf, 0x91, 0x79, 0x5e, 0x37, + 0x60, 0x5e, 0x45, 0xa9, 0x47, 0x59, 0xb4, 0xa5, 0x8c, 0x9c, 0x76, 0xaa, + 0x6d, 0x84, 0x35, 0x43, 0xd4, 0x91, 0x57, 0x5d, 0x6f, 0xc1, 0x46, 0xb6, + 
0x9f, 0xbb, 0x79, 0x87, 0x8f, 0x98, 0x65, 0x9b, 0x39, 0x91, 0x95, 0x48, + 0x5b, 0x6e, 0x91, 0x86, 0x89, 0x54, 0xbc, 0x93, 0x7f, 0x4d, 0x46, 0x68, + 0x81, 0x87, 0xc9, 0x53, 0x64, 0x9a, 0x47, 0x77, 0xb1, 0x3c, 0x7d, 0x33, + 0x7b, 0x8a, 0x47, 0x69, 0xcb, 0x6c, 0xae, 0x7e, 0xac, 0x3e, 0x4b, 0xa5, + 0x66, 0x3e, 0xcb, 0x40, 0xb5, 0x3c, 0xc0, 0x55, 0x6c, 0x70, 0x46, 0xa1, + 0x7e, 0xb5, 0xd3, 0x9d, 0xcd, 0x8c, 0x3a, 0xbd, 0x4e, 0x43, 0xa7, 0xcb, + 0x8c, 0x63, 0xd6, 0x6b, 0x76, 0x86, 0x6c, 0x5b, 0x47, 0x40, 0x8e, 0xcd, + 0xa3, 0x4b, 0xb0, 0x8c, 0x98, 0x31, 0xb2, 0xac, 0x81, 0x81, 0x6a, 0x40, + 0x4a, 0xb0, 0x75, 0x8f, 0x8e, 0x7c, 0x92, 0xcb, 0x63, 0x35, 0x58, 0xc3, + 0x84, 0x2f, 0x69, 0x6f, 0xcf, 0x5a, 0x36, 0x42, 0x84, 0x5c, 0x75, 0x41, + 0x2f, 0x56, 0x95, 0xa0, 0xae, 0xb2, 0xbf, 0x8d, 0x7f, 0xbf, 0xb7, 0xa7, + 0xbc, 0x3d, 0xd4, 0x28, 0xa3, 0xab, 0xa9, 0x8b, 0x53, 0xa5, 0x4d, 0x63, + 0x85, 0x97, 0xb3, 0x34, 0xbc, 0xbc, 0xa5, 0xae, 0x8b, 0x33, 0x97, 0xd0, + 0x89, 0xa2, 0x50, 0x93, 0x3a, 0x55, 0x50, 0x64, 0x8a, 0x48, 0x57, 0x48, + 0x44, 0x54, 0x98, 0x70, 0xbd, 0x45, 0xa5, 0xbf, 0x7b, 0x47, 0x92, 0x7a, + 0xb5, 0x61, 0x32, 0xc5, 0x9c, 0x8b, 0x94, 0xc4, 0xc8, 0x82, 0xcd, 0x9a, + 0x5a, 0xbd, 0x68, 0x2f, 0x94, 0x81, 0xa4, 0x39, 0x78, 0x98, 0x91, 0x99, + 0x39, 0xb4, 0x5c, 0x85, 0x33, 0x5a, 0x53, 0x43, 0x45, 0x5d, 0x53, 0x75, + 0x80, 0x7b, 0x70, 0x9d, 0x6f, 0x4f, 0x6b, 0x67, 0x3b, 0xce, 0x69, 0x3c, + 0xbb, 0x5e, 0xd4, 0xbd, 0x82, 0xbf, 0x3f, 0xa5, 0x51, 0xc5, 0x55, 0xb1, + 0x3e, 0xd6, 0x60, 0x79, 0xc8, 0xa1, 0x52, 0xbd, 0xbd, 0xa0, 0x9e, 0x9e, + 0xc9, 0x51, 0x43, 0x90, 0x49, 0x98, 0xa1, 0x58, 0x90, 0xb4, 0x9c, 0x96, + 0x94, 0x7f, 0x47, 0x67, 0x56, 0x38, 0x32, 0x57, 0xd0, 0x4d, 0x3d, 0x56, + 0xb4, 0xa8, 0x69, 0x82, 0x67, 0x92, 0x68, 0x98, 0xa9, 0x92, 0xbb, 0x9e, + 0x54, 0x5d, 0xc8, 0xb4, 0xba, 0x32, 0xbc, 0x3a, 0xc7, 0xc2, 0x5d, 0x6b, + 0x75, 0xa0, 0xb0, 0x95, 0x6d, 0x8a, 0xcf, 0x88, 0x79, 0x5a, 0xba, 0x42, + 0x74, 0x33, 0xa9, 0xa6, 0x4d, 0xc1, 0x4a, 0x60, 0x57, 0xaa, 0xbf, 0x3a, + 0xd3, 0x2f, 0x59, 0x3b, 0xa6, 0xc9, 0x84, 0xdc, 0x68, 0x8e, 0xb2, 0xa8, + 0x49, 0x8f, 0x53, 0x67, 0x70, 0x8e, 0x84, 0x77, 0x9c, 0xc4, 0x99, 0x9a, + 0xb7, 0x31, 0x8b, 0x45, 0xb5, 0xbf, 0x83, 0x36, 0xc6, 0x5a, 0x70, 0x64, + 0xbd, 0x3c, 0xd1, 0x9c, 0xc6, 0x3a, 0xb3, 0x7d, 0x93, 0x9a, 0x90, 0x60, + 0x4d, 0x97, 0xa2, 0xba, 0xa3, 0x62, 0x3a, 0xab, 0x56, 0x9b, 0x58, 0x81, + 0x50, 0xae, 0x42, 0x6c, 0x5d, 0x5b, 0x62, 0x47, 0xb5, 0x74, 0x75, 0x89, + 0x9e, 0x67, 0x6a, 0xb6, 0x36, 0x9e, 0xa7, 0x44, 0x62, 0x9d, 0x68, 0x76, + 0x32, 0xd7, 0x44, 0x99, 0x77, 0xa1, 0x7b, 0x9a, 0xb9, 0x92, 0xad, 0x53, + 0x49, 0xd3, 0x43, 0x50, 0x56, 0x7e, 0x90, 0xbc, 0x7a, 0xaa, 0x8f, 0x60, + 0x7e, 0x92, 0x30, 0xb0, 0xa3, 0x92, 0xd6, 0x3b, 0x50, 0xb6, 0xbb, 0x48, + 0xc5, 0xc6, 0x62, 0x8e, 0xca, 0xc2, 0x68, 0xb8, 0xb1, 0x46, 0xa9, 0xda, + 0xa5, 0x79, 0xc2, 0x57, 0x40, 0x7a, 0x5b, 0xc5, 0xd9, 0x93, 0xb9, 0x89, + 0x55, 0x9b, 0xb4, 0x54, 0x49, 0x73, 0x78, 0xbf, 0xad, 0x6e, 0x3a, 0x89, + 0x61, 0xb1, 0x44, 0x7d, 0x79, 0xc7, 0x47, 0x31, 0x49, 0xa3, 0x79, 0x9a, + 0xc2, 0x62, 0xa5, 0xa5, 0x31, 0xa7, 0xc0, 0x56, 0x36, 0x89, 0xcb, 0xcb, + 0x74, 0x46, 0x61, 0xa8, 0x46, 0x87, 0x4a, 0xd1, 0x3e, 0x4d, 0x7b, 0xb5, + 0x84, 0x9d, 0x98, 0x7d, 0x38, 0xb9, 0x48, 0x3e, 0xb3, 0x9f, 0x3a, 0x67, + 0x84, 0x48, 0x55, 0x8f, 0xa6, 0xc1, 0x75, 0xbe, 0xba, 0xd5, 0x74, 0x53, + 0xb2, 0x71, 0xa2, 0x91, 0x39, 0xb5, 0x35, 0x92, 0x7d, 0x87, 0x5f, 0x7a, + 0x5e, 0x77, 0x73, 0x7b, 0x83, 0x6d, 0x97, 0xc1, 0x53, 0x40, 0xb9, 0x9d, + 0x3f, 0xc3, 0x6f, 0xb3, 0x4c, 0xbe, 0x48, 0xce, 0xae, 0x44, 0x59, 0x5b, + 
0x61, 0x3b, 0x7a, 0xa4, 0x39, 0x36, 0xa0, 0x3c, 0x3b, 0x72, 0x34, 0xaa, + 0xc8, 0xbd, 0xb3, 0xc7, 0x99, 0xd2, 0xc2, 0x78, 0x78, 0x92, 0x73, 0x97, + 0xb3, 0x9e, 0x8b, 0x78, 0xb7, 0xd6, 0x2c, 0x73, 0x34, 0x37, 0x69, 0x44, + 0x9d, 0x91, 0xcc, 0x68, 0xa7, 0xb6, 0xb7, 0xaf, 0x78, 0x84, 0x9e, 0x50, + 0xaf, 0xa7, 0xcc, 0x66, 0xa5, 0x8c, 0x80, 0x83, 0x4f, 0x6c, 0x81, 0x7a, + 0x65, 0x3c, 0x54, 0xcf, 0x69, 0xc8, 0x89, 0x5c, 0x7c, 0x5f, 0x9d, 0xb4, + 0xa9, 0x99, 0x4e, 0x60, 0x69, 0x57, 0x3e, 0x9e, 0xdf, 0x78, 0x8f, 0x3e, + 0xd0, 0x6a, 0x8e, 0x69, 0x3e, 0xb1, 0x51, 0x48, 0xbf, 0x41, 0xd1, 0x76, + 0x4b, 0x81, 0x61, 0x42, 0x72, 0x33, 0xa1, 0xdf, 0xc1, 0x79, 0xcb, 0xcd, + 0x73, 0x40, 0xa4, 0xd2, 0x42, 0x63, 0x88, 0x5a, 0x89, 0x8e, 0x8a, 0x79, + 0x8f, 0x38, 0xd2, 0x53, 0x30, 0x58, 0xc5, 0xd5, 0xaa, 0x90, 0xdf, 0x36, + 0x9b, 0x6a, 0xb3, 0x79, 0xc9, 0x7e, 0x9c, 0x92, 0xb0, 0x3e, 0x5a, 0x54, + 0x9d, 0x20, 0x37, 0x67, 0xcb, 0x87, 0x88, 0x9d, 0x92, 0x3e, 0x8d, 0x45, + 0x93, 0x55, 0x38, 0x7a, 0x31, 0x52, 0x38, 0xd0, 0x93, 0x45, 0x87, 0x5f, + 0x39, 0x9c, 0x8f, 0x89, 0x9e, 0x2f, 0x9b, 0x86, 0x4c, 0xa5, 0x9d, 0x88, + 0x3a, 0xaf, 0x8b, 0x81, 0x76, 0xc7, 0xce, 0x8d, 0xc4, 0x3d, 0x74, 0xc6, + 0x32, 0x80, 0x45, 0x93, 0x89, 0x66, 0xa0, 0x7f, 0xc9, 0x40, 0x36, 0x62, + 0xd8, 0x76, 0xbb, 0xb0, 0x73, 0x67, 0xd3, 0x57, 0xc4, 0x3e, 0xd7, 0x3e, + 0x8e, 0x67, 0xc8, 0x4f, 0x3f, 0x37, 0xa1, 0x76, 0x69, 0x78, 0x6b, 0x50, + 0x48, 0x9e, 0xb9, 0x9f, 0xa9, 0xa3, 0xad, 0x5a, 0x4e, 0xbf, 0xb9, 0x5a, + 0x53, 0x5e, 0x96, 0xa9, 0x7d, 0x4e, 0xc3, 0x9f, 0x37, 0x68, 0x8b, 0x7d, + 0x4b, 0x74, 0x94, 0xb7, 0x84, 0xd3, 0x96, 0xaf, 0x95, 0xb9, 0x4a, 0xc3, + 0xa1, 0x50, 0xd4, 0xac, 0xb8, 0x68, 0x9f, 0x52, 0x58, 0x73, 0x70, 0x50, + 0x3a, 0x9e, 0x90, 0xc1, 0x72, 0x7a, 0x66, 0x58, 0xaf, 0xc1, 0x7e, 0x49, + 0xc3, 0x8b, 0x63, 0x92, 0x63, 0x60, 0x95, 0x62, 0x4f, 0x91, 0xe4, 0x4e, + 0x69, 0x7b, 0x6b, 0x98, 0xb0, 0x73, 0xc0, 0x6d, 0x7f, 0xb2, 0x75, 0x94, + 0x3f, 0x5d, 0xad, 0xb7, 0xb7, 0x41, 0x71, 0x43, 0x5f, 0x42, 0x46, 0x8b, + 0x6e, 0x9f, 0x98, 0xc2, 0x3b, 0x83, 0x8d, 0x46, 0x81, 0x89, 0x47, 0x80, + 0xbd, 0xb1, 0x35, 0xb9, 0x83, 0x72, 0xd0, 0x51, 0xbc, 0x78, 0x79, 0x43, + 0xc7, 0xbe, 0x62, 0x39, 0x9f, 0x63, 0x86, 0xbc, 0x9f, 0xb4, 0x5b, 0x3b, + 0x7e, 0x3a, 0x9b, 0x53, 0xd9, 0xb2, 0xbd, 0x3c, 0x62, 0xa3, 0xb1, 0x6f, + 0x91, 0x88, 0xac, 0x9e, 0x74, 0x66, 0xb2, 0x38, 0x48, 0x3e, 0x5c, 0x52, + 0xc3, 0x99, 0x7a, 0x4b, 0x8b, 0x3c, 0x59, 0x8f, 0x8a, 0x53, 0xc1, 0xd1, + 0x68, 0x55, 0xa8, 0x6a, 0x6a, 0x9e, 0x8c, 0x51, 0x8d, 0x78, 0x39, 0xcb, + 0x7e, 0x46, 0x70, 0xb2, 0xaa, 0x40, 0x6a, 0x7c, 0xa6, 0xb7, 0xcb, 0x9e, + 0xb1, 0xaf, 0x52, 0x86, 0x54, 0xb4, 0x9d, 0xc0, 0x8b, 0xaa, 0x3c, 0x66, + 0x73, 0xad, 0x86, 0xbe, 0xa9, 0x4e, 0x61, 0x54, 0x87, 0x4d, 0xd0, 0x7b, + 0xbc, 0xc5, 0xa9, 0xb6, 0x8a, 0x77, 0x79, 0xa0, 0x9c, 0x45, 0x87, 0xba, + 0xa4, 0x6d, 0xa3, 0x3b, 0xcf, 0xa3, 0x84, 0x99, 0x53, 0x4f, 0x9b, 0xb3, + 0x90, 0xa8, 0x5d, 0xaf, 0x3c, 0x7c, 0xb3, 0xca, 0xc0, 0xb9, 0x60, 0xb5, + 0xa9, 0xad, 0x72, 0x63, 0xcc, 0x80, 0x47, 0xc3, 0x8d, 0x54, 0x73, 0x91, + 0x8e, 0xc0, 0x77, 0xbd, 0x77, 0xbc, 0x58, 0x97, 0x68, 0x3d, 0x98, 0xb7, + 0x2f, 0xb9, 0xd0, 0x65, 0x3d, 0xb1, 0x79, 0xa3, 0x34, 0x71, 0x99, 0xa8, + 0x41, 0x5b, 0x4d, 0x8b, 0xa2, 0x53, 0x4b, 0x4d, 0x8f, 0xa3, 0xdb, 0x85, + 0x48, 0x83, 0x8c, 0x59, 0x43, 0x43, 0x9f, 0xba, 0x62, 0xcb, 0x3b, 0x3d, + 0x63, 0x50, 0x75, 0x4b, 0x41, 0x70, 0x36, 0x7e, 0x62, 0x94, 0x6d, 0x75, + 0xaf, 0x7e, 0x52, 0x6b, 0xa5, 0x9b, 0x41, 0x59, 0xc2, 0x7f, 0x9c, 0x94, + 0x83, 0x83, 0xcd, 0xa0, 0xa5, 0x42, 0x69, 0x58, 0x34, 0x48, 0x8a, 0x67, + 
0x97, 0x39, 0x65, 0x5d, 0x63, 0x95, 0x38, 0x85, 0x68, 0x63, 0x91, 0xb6, + 0x57, 0xaa, 0x7d, 0x44, 0x7c, 0x57, 0x4c, 0xc3, 0x4e, 0x78, 0x58, 0x53, + 0x8c, 0x54, 0xbc, 0x70, 0x39, 0xb5, 0x81, 0x73, 0x57, 0x85, 0x47, 0x79, + 0xd4, 0x79, 0x64, 0xb9, 0x3b, 0xc6, 0x79, 0x45, 0x8d, 0x39, 0x92, 0x98, + 0x38, 0x9b, 0x43, 0xca, 0x42, 0x2f, 0x5c, 0xbd, 0x6a, 0x96, 0x4f, 0x46, + 0x9f, 0xb9, 0x30, 0xa3, 0x61, 0x4e, 0xbd, 0x9a, 0x6b, 0xb0, 0x7c, 0xbf, + 0x40, 0x46, 0x3a, 0x99, 0x75, 0xa1, 0xc5, 0x85, 0xb0, 0x87, 0x5e, 0x94, + 0xbd, 0x98, 0x2c, 0x76, 0x54, 0x86, 0x76, 0xbb, 0x70, 0xc9, 0x4d, 0x85, + 0x5d, 0x78, 0x44, 0x71, 0xb3, 0x34, 0x4b, 0x5c, 0x34, 0x79, 0x9f, 0xb0, + 0x48, 0x3e, 0x65, 0x91, 0x7c, 0x40, 0x73, 0xc4, 0x51, 0x46, 0x97, 0x67, + 0xae, 0x78, 0xa7, 0xb4, 0xc7, 0x9c, 0x3f, 0x2b, 0x7f, 0x79, 0x9a, 0x42, + 0xcb, 0x42, 0x5f, 0x50, 0x46, 0x3c, 0x8c, 0x9a, 0x7a, 0x6c, 0x50, 0x36, + 0x80, 0x89, 0x3a, 0x86, 0x77, 0xcc, 0xa8, 0xc7, 0x60, 0x97, 0xb4, 0xac, + 0xba, 0x7f, 0x9c, 0xca, 0x46, 0x72, 0x7e, 0x6a, 0xb9, 0x9c, 0x36, 0xbc, + 0x3b, 0x6a, 0x88, 0xbf, 0xb8, 0x20, 0xb5, 0xd4, 0xbd, 0x2f, 0xae, 0x84, + 0x38, 0xb8, 0xc6, 0x48, 0x95, 0x43, 0xc4, 0xa3, 0xba, 0xdd, 0x47, 0xbb, + 0x98, 0x9c, 0x51, 0x89, 0x96, 0x6c, 0xc9, 0x3f, 0x59, 0x3f, 0x8a, 0x77, + 0x8d, 0xc3, 0xcc, 0x99, 0x29, 0x55, 0x8b, 0x90, 0x6b, 0xa8, 0x8a, 0x6e, + 0xa1, 0x6e, 0xa7, 0xa2, 0xc0, 0x78, 0x9a, 0x85, 0xdd, 0xb7, 0xae, 0x64, + 0x59, 0x44, 0xd3, 0x8d, 0x89, 0x3b, 0x90, 0x53, 0x6b, 0x56, 0x81, 0xb0, + 0x46, 0xb7, 0xbf, 0x80, 0x48, 0x44, 0x49, 0xba, 0x80, 0x40, 0x65, 0x4e, + 0x65, 0x6c, 0xb4, 0xa6, 0xb0, 0x3d, 0x85, 0xa3, 0x49, 0x43, 0x75, 0xa4, + 0x3b, 0x9e, 0x45, 0x3f, 0x2f, 0x97, 0xba, 0x9d, 0xbd, 0x3e, 0x7b, 0xaa, + 0x90, 0x71, 0xc4, 0xbb, 0x51, 0x79, 0x85, 0x4b, 0x69, 0x4c, 0xaf, 0x78, + 0x5d, 0xc2, 0x96, 0x31, 0xb1, 0x63, 0x94, 0x7b, 0x57, 0x6a, 0x95, 0x80, + 0xad, 0x70, 0xc3, 0x8a, 0x4e, 0xb0, 0xc9, 0x58, 0x99, 0xb4, 0xcd, 0xba, + 0xa6, 0x5f, 0xa1, 0x7a, 0x57, 0x87, 0xa9, 0x2b, 0x5f, 0x80, 0x9d, 0x81, + 0x70, 0x43, 0xbb, 0x53, 0x7d, 0x63, 0xab, 0xae, 0x77, 0x48, 0xd8, 0x6c, + 0xa7, 0x47, 0xcb, 0xd9, 0x4c, 0x9f, 0x5b, 0x57, 0xd2, 0xcf, 0x89, 0x3c, + 0x2f, 0x70, 0x6b, 0x7f, 0x6b, 0x5a, 0x79, 0x52, 0x8e, 0x52, 0x53, 0xa0, + 0x97, 0xbd, 0x4a, 0xca, 0x86, 0x53, 0x94, 0xcc, 0x61, 0x65, 0x92, 0xb3, + 0x5c, 0x63, 0x7b, 0x73, 0xb9, 0x3e, 0x61, 0xa5, 0x1d, 0x36, 0x87, 0xc1, + 0x80, 0x46, 0x53, 0x8f, 0x85, 0x6b, 0x56, 0x8b, 0xa2, 0xc7, 0x98, 0xb4, + 0x5b, 0x87, 0x75, 0x79, 0x61, 0xdb, 0xba, 0x53, 0xd8, 0xbd, 0xc1, 0xb6, + 0xba, 0x72, 0xac, 0x54, 0xa2, 0x5a, 0xa9, 0xae, 0x60, 0x99, 0x70, 0x9a, + 0x82, 0xd1, 0xb0, 0x98, 0xa7, 0x7b, 0xc5, 0x31, 0xd3, 0x8c, 0x78, 0x32, + 0x7e, 0xaf, 0x8a, 0x6b, 0x5e, 0x89, 0x6e, 0x40, 0x3c, 0xd5, 0x9e, 0x6d, + 0x9b, 0x97, 0x29, 0x6c, 0x72, 0x84, 0xa8, 0xb5, 0x73, 0x71, 0x3c, 0xc3, + 0xce, 0x91, 0xaa, 0x4e, 0x68, 0x75, 0x86, 0x51, 0x42, 0x96, 0x73, 0xbe, + 0xd2, 0x3f, 0x82, 0x73, 0x70, 0xa7, 0xb0, 0xa2, 0xbd, 0x7f, 0x97, 0x5f, + 0x44, 0x7b, 0xc3, 0x8b, 0xca, 0x52, 0x95, 0x6b, 0x85, 0xcb, 0xa4, 0xba, + 0x36, 0x45, 0x49, 0x56, 0x52, 0x68, 0x66, 0xc4, 0x53, 0xd0, 0xb2, 0x69, + 0x7d, 0xb4, 0x6e, 0xd0, 0x41, 0x36, 0xa4, 0x56, 0x7b, 0x54, 0x53, 0xc6, + 0x7b, 0x5a, 0x96, 0x86, 0x94, 0x73, 0xd5, 0x3a, 0xae, 0x6c, 0x3b, 0x6c, + 0xac, 0xd6, 0xce, 0x73, 0x8b, 0xce, 0x49, 0x77, 0xa1, 0x6d, 0xb4, 0x5e, + 0x9e, 0x76, 0x37, 0xcb, 0xb2, 0x51, 0x74, 0x4c, 0xc3, 0xdd, 0x44, 0x33, + 0xa0, 0xb9, 0x70, 0x9f, 0xb9, 0xb8, 0x8f, 0x53, 0x35, 0x4c, 0xc5, 0xaa, + 0x5b, 0x95, 0x6c, 0x5a, 0x61, 0xb6, 0x4f, 0xd2, 0x58, 0x9f, 0x82, 0x90, + 
0xa5, 0x97, 0x61, 0x40, 0xcc, 0x93, 0xd1, 0x97, 0x3f, 0xd1, 0x71, 0x91, + 0x53, 0x54, 0x51, 0x52, 0xa8, 0xcd, 0x95, 0x41, 0x3b, 0x9a, 0xb6, 0x58, + 0xac, 0xb4, 0x6e, 0xc6, 0x3d, 0x3b, 0x57, 0x69, 0x62, 0xa9, 0x30, 0x60, + 0x70, 0x5d, 0x8f, 0xb6, 0xce, 0x8c, 0x55, 0xb3, 0xca, 0xc3, 0x44, 0x85, + 0x3a, 0x44, 0xbb, 0xaf, 0x3d, 0xc5, 0x3f, 0x72, 0x2f, 0x5f, 0xa0, 0x9c, + 0xba, 0xc9, 0x7e, 0x9f, 0xcf, 0x67, 0xc2, 0x3e, 0x6b, 0x9c, 0xcd, 0x54, + 0x2f, 0x4f, 0xb8, 0x99, 0x5d, 0x6f, 0xc0, 0xb1, 0xb1, 0x57, 0x80, 0xcd, + 0xd2, 0x76, 0x56, 0x3c, 0x83, 0x42, 0xcb, 0x90, 0x54, 0x2e, 0x3f, 0x8c, + 0xc1, 0xc4, 0xd1, 0xbf, 0x76, 0xab, 0xcd, 0x3b, 0xab, 0xc7, 0x55, 0x70, + 0xb5, 0x59, 0xc2, 0x40, 0x48, 0x97, 0x92, 0x47, 0x38, 0x8a, 0x6c, 0x90, + 0x5a, 0x5f, 0x6a, 0x52, 0xb1, 0x7e, 0x5e, 0xb3, 0x73, 0x80, 0x68, 0x72, + 0x63, 0x7b, 0x3a, 0x4e, 0x9a, 0x5e, 0x7c, 0xb6, 0x98, 0x50, 0xa9, 0x4c, + 0x42, 0x35, 0xb3, 0x95, 0xb5, 0xd3, 0xcc, 0xc6, 0x2f, 0xa4, 0x58, 0x9e, + 0x9c, 0x3e, 0xaf, 0x83, 0x7a, 0x86, 0x61, 0x46, 0xcc, 0x59, 0xcc, 0x8f, + 0xa2, 0x7c, 0x74, 0x7a, 0xc0, 0x35, 0x41, 0xb2, 0x9c, 0x52, 0x3c, 0x41, + 0x7d, 0x63, 0x99, 0x8a, 0x95, 0x4e, 0x65, 0x33, 0x63, 0xce, 0x85, 0x44, + 0x5b, 0x4e, 0x32, 0x61, 0xb9, 0xbb, 0x47, 0xad, 0x90, 0x9c, 0x28, 0xb5, + 0x56, 0x80, 0xb1, 0xc6, 0x86, 0xc1, 0xad, 0xca, 0x9c, 0x89, 0x62, 0x54, + 0xd4, 0x58, 0xc7, 0xcf, 0x4e, 0xbd, 0x9a, 0x6b, 0x3f, 0x59, 0x3e, 0xb4, + 0x5f, 0x7a, 0x52, 0x3d, 0x62, 0xc2, 0xa6, 0x49, 0x57, 0xbd, 0x7f, 0x5b, + 0xb5, 0x73, 0x84, 0xca, 0x35, 0x78, 0x49, 0x61, 0x49, 0x45, 0xc1, 0xb2, + 0xa9, 0x29, 0x3a, 0xa4, 0x8c, 0x2c, 0x42, 0x8c, 0x76, 0x4a, 0xd2, 0xb0, + 0x39, 0xa4, 0x8c, 0x53, 0x94, 0xac, 0xbe, 0x62, 0x89, 0x6e, 0x42, 0xcb, + 0xb4, 0xc6, 0xa8, 0xae, 0x45, 0x91, 0x72, 0xc0, 0xcd, 0x70, 0x68, 0xcb, + 0xb5, 0x46, 0x9a, 0xcb, 0x79, 0x54, 0xaf, 0x7d, 0x42, 0x56, 0x7d, 0x3c, + 0x52, 0x94, 0x4b, 0x57, 0x43, 0x73, 0x8b, 0xa3, 0x76, 0xb6, 0xb4, 0xa0, + 0x57, 0x40, 0x48, 0x7f, 0x6f, 0x82, 0xa1, 0x52, 0xcd, 0xae, 0x78, 0x6c, + 0x70, 0xb7, 0x34, 0xad, 0xb4, 0x3f, 0x73, 0x65, 0xd6, 0x3f, 0x57, 0x54, + 0x40, 0x3d, 0x45, 0x94, 0x8e, 0x9d, 0x95, 0x74, 0xc5, 0x5a, 0x54, 0x8c, + 0xc0, 0x92, 0xb9, 0x46, 0x9a, 0x9f, 0x38, 0xb6, 0x64, 0x3c, 0x60, 0x44, + 0x94, 0x4f, 0x83, 0xbc, 0x80, 0x90, 0x9f, 0xb6, 0xc0, 0xb2, 0x89, 0x90, + 0x3c, 0x58, 0xb8, 0xd5, 0x97, 0x61, 0x54, 0x57, 0x60, 0x78, 0x73, 0x78, + 0x69, 0x89, 0xbc, 0xc7, 0x9b, 0xc7, 0x50, 0x69, 0xa1, 0x95, 0xa1, 0xb3, + 0x3b, 0x58, 0x56, 0x3d, 0x58, 0xbe, 0x4f, 0x76, 0x40, 0xbd, 0x3e, 0x38, + 0x4d, 0x65, 0xa8, 0x64, 0x4e, 0x47, 0x7f, 0x7f, 0x37, 0x73, 0x3e, 0xd0, + 0xc7, 0x64, 0x53, 0xb6, 0x77, 0x6a, 0x98, 0x3d, 0x95, 0x86, 0x45, 0xcf, + 0xb0, 0x5a, 0x62, 0xa3, 0xab, 0xa4, 0xbe, 0xa6, 0x70, 0x8c, 0x9f, 0xce, + 0xd6, 0x80, 0x3a, 0x82, 0x83, 0x54, 0x42, 0x4d, 0xb9, 0x5a, 0x6b, 0xb7, + 0xa3, 0x62, 0x4e, 0x32, 0x5e, 0x7f, 0xac, 0x95, 0xb7, 0x9a, 0x9a, 0xb2, + 0x39, 0x59, 0x96, 0x3a, 0x54, 0x9f, 0x4f, 0x84, 0xb2, 0xa3, 0xbb, 0x3f, + 0xc5, 0x43, 0xcb, 0x3c, 0x64, 0x6a, 0x50, 0xd8, 0x77, 0x70, 0x56, 0xc8, + 0x6d, 0xab, 0x62, 0xc5, 0xc1, 0xc8, 0x91, 0xae, 0x88, 0x50, 0xc8, 0xb1, + 0xc5, 0xca, 0x43, 0xcd, 0x31, 0x57, 0xbc, 0xc4, 0x66, 0x61, 0x71, 0xb4, + 0x3e, 0x92, 0x31, 0xa2, 0xd0, 0x40, 0x6e, 0x9a, 0x3a, 0x98, 0x38, 0x8b, + 0x93, 0xc0, 0x5c, 0x4b, 0x4d, 0x99, 0x36, 0x3c, 0x82, 0x79, 0xb3, 0x6e, + 0xb3, 0x80, 0x4a, 0x42, 0x8c, 0x68, 0xc4, 0xb4, 0xb5, 0xc5, 0xd0, 0x53, + 0x4a, 0xc9, 0x6d, 0x3e, 0x81, 0xab, 0x68, 0x85, 0x44, 0x9b, 0xa3, 0x81, + 0x3a, 0xd2, 0x47, 0xb9, 0x6f, 0xc7, 0x96, 0xb1, 0xd6, 0x90, 0x7f, 0xc0, + 
0xa4, 0x4a, 0x54, 0x8b, 0x62, 0x86, 0xae, 0xc0, 0xd1, 0xa4, 0xbc, 0xa7, + 0x3e, 0xc3, 0xcc, 0x50, 0xcf, 0x8c, 0x43, 0x3c, 0x45, 0x63, 0xa9, 0x37, + 0xbe, 0xaa, 0x3c, 0x73, 0x64, 0xcf, 0x6d, 0x37, 0x63, 0x9c, 0xc4, 0x9a, + 0xa3, 0x74, 0xcb, 0x51, 0xa9, 0x75, 0xa7, 0xbf, 0x47, 0x51, 0xbb, 0x73, + 0xc6, 0x82, 0x90, 0x8d, 0x9c, 0xbc, 0x78, 0x80, 0xaf, 0xc9, 0xcb, 0xad, + 0xad, 0x40, 0xce, 0xb1, 0x89, 0x6d, 0xd7, 0x3e, 0xc1, 0x60, 0x91, 0xc1, + 0x3d, 0x43, 0xce, 0xcb, 0xc9, 0x41, 0x4f, 0x82, 0x59, 0xb8, 0x74, 0x97, + 0x3e, 0xa7, 0x3f, 0xa6, 0xd7, 0xcb, 0x49, 0x80, 0x82, 0x79, 0x52, 0x33, + 0x5e, 0xd3, 0xc1, 0xa6, 0x79, 0x64, 0x76, 0xca, 0xc5, 0x3d, 0xbb, 0xc9, + 0xa1, 0xb6, 0xb6, 0xb6, 0xba, 0x91, 0x30, 0xb1, 0x6b, 0x77, 0x5d, 0x56, + 0x3d, 0xb8, 0x80, 0x96, 0x9f, 0x41, 0x64, 0x92, 0x33, 0x97, 0xa1, 0x7f, + 0xa2, 0x92, 0x93, 0x80, 0x75, 0xae, 0xa6, 0xbf, 0xaf, 0xa0, 0x85, 0xc2, + 0x70, 0x6a, 0x63, 0x46, 0x46, 0x4b, 0x36, 0xcd, 0xbf, 0x47, 0x83, 0x7b, + 0x49, 0x4c, 0x34, 0x63, 0x3d, 0x85, 0x6a, 0x6e, 0x48, 0x47, 0x67, 0x42, + 0x5f, 0x8e, 0xcd, 0x82, 0x65, 0xa4, 0xbc, 0x9f, 0x5d, 0x90, 0xd2, 0x8b, + 0x89, 0x5e, 0x3e, 0x60, 0x99, 0x5f, 0x82, 0x75, 0x59, 0x46, 0x7e, 0x45, + 0x97, 0x81, 0xbb, 0x52, 0x45, 0x75, 0xc5, 0x92, 0xa9, 0x66, 0x66, 0x47, + 0xd8, 0xbb, 0x79, 0xbf, 0x8e, 0x6e, 0x64, 0x6f, 0x5b, 0x4e, 0xa1, 0x3a, + 0xb2, 0x8d, 0x4e, 0x98, 0xaf, 0x33, 0x56, 0x4e, 0x33, 0x3e, 0x59, 0x35, + 0x45, 0xb1, 0x5b, 0x87, 0x3d, 0x6d, 0x89, 0xc7, 0x41, 0xc7, 0x8d, 0x50, + 0x6e, 0xa8, 0x5a, 0x4f, 0x7d, 0x6f, 0xd3, 0x99, 0x34, 0x73, 0x5a, 0x83, + 0x7d, 0x5f, 0x51, 0x4a, 0x46, 0x97, 0x62, 0xd4, 0xa9, 0x6e, 0x58, 0x5b, + 0x6f, 0xa8, 0x40, 0x36, 0xcb, 0x7e, 0x9b, 0xa3, 0x35, 0xb4, 0x7c, 0xa2, + 0x88, 0x50, 0x5f, 0x58, 0x9b, 0xb5, 0xb5, 0x8c, 0xc8, 0x4f, 0x7a, 0xa8, + 0x9a, 0x9c, 0xa2, 0x59, 0x9e, 0xc8, 0x52, 0x40, 0x56, 0x72, 0xae, 0x8b, + 0xcf, 0x70, 0x4d, 0x5e, 0x4a, 0x53, 0xb1, 0xa0, 0xac, 0x88, 0x5d, 0xa4, + 0xd0, 0x4e, 0x72, 0x4c, 0xc7, 0xc4, 0x87, 0xb0, 0x9b, 0xc6, 0x68, 0xaf, + 0xb2, 0xb6, 0x4e, 0xcd, 0xb4, 0x5c, 0xc9, 0xac, 0x50, 0xc4, 0x8d, 0x75, + 0xa3, 0x2e, 0x4e, 0x5c, 0x5b, 0x57, 0x61, 0xb5, 0x79, 0x98, 0x46, 0x87, + 0x7b, 0x43, 0x6d, 0x56, 0xb2, 0xa6, 0x87, 0xaf, 0x56, 0xc7, 0x54, 0x53, + 0x95, 0x7c, 0x63, 0x37, 0x74, 0x3a, 0x6c, 0x99, 0x7a, 0x92, 0x80, 0xb1, + 0x8c, 0xc7, 0x2e, 0x9e, 0x8f, 0xdd, 0xc0, 0x4d, 0xcb, 0xa1, 0x4d, 0x59, + 0x47, 0x5d, 0x67, 0xc8, 0x87, 0xae, 0x76, 0x84, 0xa4, 0x53, 0xae, 0x34, + 0x89, 0x3a, 0xa2, 0xcb, 0x31, 0x95, 0xcd, 0x65, 0x3a, 0xa4, 0x9c, 0xac, + 0xd8, 0xb8, 0x3b, 0xc1, 0x6d, 0x57, 0x64, 0x60, 0xad, 0xb6, 0x84, 0x6c, + 0x81, 0xb9, 0xc5, 0x92, 0x57, 0x63, 0x51, 0xcd, 0x7d, 0x91, 0xc0, 0xd3, + 0x5c, 0xc8, 0xa5, 0x95, 0x8a, 0xc3, 0x8e, 0xc2, 0xbb, 0x42, 0xac, 0x4a, + 0xca, 0x91, 0x8a, 0x5a, 0x7a, 0xd0, 0x5b, 0x49, 0x96, 0x4f, 0xd2, 0x5f, + 0xb4, 0xc5, 0x88, 0x4a, 0xca, 0x35, 0x7c, 0x3b, 0xab, 0xc7, 0x7e, 0xc2, + 0x77, 0x95, 0x3a, 0x6b, 0x97, 0x87, 0x54, 0xb0, 0xa5, 0xd0, 0xb4, 0xb1, + 0xa1, 0xac, 0x69, 0x60, 0x9a, 0xb1, 0x74, 0x39, 0x50, 0xab, 0xa3, 0x8b, + 0x72, 0xa7, 0x4e, 0x76, 0x8c, 0x7c, 0x3f, 0x99, 0x68, 0x49, 0x3c, 0x41, + 0x3a, 0x48, 0xa9, 0x77, 0x50, 0x5d, 0x55, 0x4e, 0xbe, 0x98, 0xc3, 0x7d, + 0x8f, 0x74, 0xa9, 0x59, 0xba, 0x8f, 0xb9, 0xd7, 0x7c, 0x93, 0x4b, 0x62, + 0x60, 0x30, 0xbb, 0x50, 0xa5, 0x3c, 0x8b, 0xc3, 0xb1, 0xaf, 0x54, 0xb9, + 0x91, 0xba, 0x30, 0x99, 0x4b, 0xc8, 0xc6, 0x7f, 0x73, 0x89, 0xaa, 0x88, + 0xb6, 0x33, 0x3e, 0xaf, 0x71, 0x55, 0x78, 0x3b, 0x30, 0xb7, 0x98, 0x69, + 0x8a, 0x3a, 0xc6, 0xc1, 0x3e, 0x68, 0x39, 0x9a, 0x37, 0x73, 0xb2, 0xad, + 
0xa0, 0x54, 0x5f, 0xbf, 0xba, 0x3f, 0x59, 0x5c, 0x9e, 0x91, 0x53, 0x7d, + 0x76, 0x6d, 0xb1, 0xc5, 0x9e, 0xa5, 0x99, 0xb7, 0xd1, 0xcf, 0x77, 0x5d, + 0x50, 0xc5, 0x8d, 0x6f, 0xc1, 0x83, 0x71, 0x97, 0x99, 0x30, 0x63, 0x51, + 0x40, 0x50, 0x73, 0x6e, 0x2f, 0x73, 0x9a, 0xc9, 0x68, 0xc0, 0x8b, 0x8f, + 0x44, 0x36, 0xc9, 0xd5, 0x78, 0x95, 0xc1, 0xa5, 0xc1, 0x63, 0x31, 0xa5, + 0xa8, 0x96, 0xcb, 0xb7, 0x97, 0x63, 0x87, 0x5d, 0xc4, 0x6b, 0x6c, 0x8c, + 0x65, 0x50, 0x37, 0x6e, 0x65, 0xa6, 0x55, 0x85, 0x3c, 0x39, 0x37, 0x44, + 0xc1, 0xba, 0xa4, 0x3e, 0x90, 0x9e, 0x3b, 0x7d, 0x87, 0x34, 0x47, 0x6a, + 0x62, 0x49, 0x72, 0x5b, 0x9b, 0x55, 0x56, 0x39, 0x7d, 0x61, 0x5a, 0xce, + 0x37, 0x62, 0x83, 0x4f, 0x59, 0x3f, 0xbc, 0xb9, 0xb5, 0xbc, 0x78, 0x3a, + 0x5e, 0x7c, 0xcb, 0xa1, 0xc9, 0x98, 0x3d, 0x50, 0x71, 0x52, 0x79, 0x50, + 0x3a, 0x69, 0x34, 0x5d, 0xa4, 0xa4, 0x3c, 0xa0, 0x73, 0x8b, 0x8a, 0xbe, + 0x4e, 0x6a, 0xc3, 0x48, 0xb4, 0x80, 0xba, 0x88, 0xac, 0xc3, 0xcd, 0x9c, + 0x38, 0x2a, 0x86, 0xc9, 0xcb, 0xcd, 0x6d, 0xc7, 0x8b, 0x4a, 0x7e, 0x59, + 0xc9, 0x92, 0x70, 0xba, 0x52, 0xc6, 0x64, 0x53, 0xbe, 0x50, 0x8b, 0x80, + 0xa3, 0x86, 0x60, 0xab, 0x78, 0x65, 0xb0, 0xb8, 0x99, 0x9c, 0x81, 0xa6, + 0x6e, 0x94, 0x57, 0xce, 0xc9, 0xbf, 0xbf, 0x37, 0x5d, 0x62, 0x52, 0x3d, + 0x3f, 0x8d, 0x3e, 0xab, 0x85, 0x2f, 0xc9, 0x51, 0x3a, 0x50, 0x8a, 0xbe, + 0xab, 0xdc, 0x6b, 0x55, 0x61, 0x85, 0xb4, 0x98, 0x8e, 0xc2, 0x77, 0x85, + 0x75, 0x4c, 0x3a, 0xcb, 0x4f, 0x8b, 0x5d, 0x92, 0xa0, 0x4d, 0xd0, 0xd7, + 0x73, 0xcc, 0x4c, 0xab, 0x57, 0x54, 0xd1, 0x91, 0x84, 0xa1, 0x3c, 0xa4, + 0x99, 0xb4, 0x7d, 0x54, 0xb9, 0x58, 0x71, 0x4c, 0x4c, 0x4c, 0x53, 0xab, + 0xb9, 0x65, 0x69, 0xc6, 0xc5, 0x53, 0xb4, 0x83, 0x7e, 0x50, 0x7a, 0xbb, + 0x48, 0xd4, 0xcd, 0xb4, 0x90, 0x97, 0x68, 0x36, 0x48, 0x4b, 0x55, 0x6e, + 0x6f, 0x48, 0x4d, 0xc0, 0xb2, 0xcd, 0xbb, 0x77, 0x96, 0x3f, 0x99, 0x7d, + 0x81, 0x97, 0x40, 0xb2, 0x46, 0x63, 0x98, 0x52, 0xcf, 0xbb, 0x8f, 0xd0, + 0x7a, 0x85, 0xa0, 0x47, 0x98, 0x95, 0x8d, 0x89, 0x45, 0x63, 0x38, 0x3d, + 0x58, 0x9c, 0x4a, 0x90, 0x3e, 0x97, 0xca, 0xb8, 0x4a, 0x48, 0x5c, 0x6f, + 0x3f, 0x48, 0x34, 0x9b, 0xcd, 0xa5, 0x99, 0x62, 0xb8, 0x67, 0x4e, 0x8e, + 0xb1, 0x7c, 0xa6, 0x83, 0x4c, 0xc0, 0xae, 0x79, 0xb3, 0x94, 0xb9, 0x9b, + 0x8d, 0xc9, 0xc0, 0x7b, 0xb8, 0xc7, 0x51, 0x91, 0xc8, 0x80, 0xa9, 0x3b, + 0x2d, 0x98, 0xa3, 0x81, 0x6c, 0xaf, 0x4b, 0x3a, 0xb4, 0x5b, 0x36, 0xd2, + 0xc5, 0xbf, 0xc2, 0xaa, 0x65, 0x63, 0xce, 0xa7, 0x63, 0x3f, 0x5b, 0x74, + 0x89, 0x6f, 0x94, 0x6c, 0x79, 0x5e, 0x87, 0x88, 0x32, 0xa4, 0x76, 0xb8, + 0x56, 0x9f, 0x69, 0x4a, 0x3b, 0xa4, 0xc4, 0x8a, 0xa2, 0xb2, 0x87, 0x90, + 0x86, 0x9a, 0xb7, 0x58, 0x9a, 0x73, 0xa0, 0x7c, 0x9f, 0x9e, 0x9e, 0x5c, + 0xae, 0x3e, 0x74, 0xc7, 0x9e, 0x4f, 0x7a, 0x49, 0x7c, 0x8b, 0x44, 0x6b, + 0xad, 0x92, 0xcc, 0x9d, 0x7b, 0x7b, 0x67, 0x57, 0xc3, 0x82, 0x6f, 0x61, + 0x7b, 0xb1, 0xb2, 0x4d, 0x32, 0xb9, 0x9b, 0x8a, 0x4d, 0x81, 0x54, 0x8d, + 0x91, 0x9d, 0x7e, 0x92, 0x99, 0xb3, 0xcd, 0x7b, 0xd3, 0xa2, 0xd0, 0x37, + 0x36, 0x6c, 0x75, 0x34, 0x9c, 0xbe, 0xc8, 0xce, 0xa6, 0x7c, 0x7f, 0xd1, + 0x37, 0x3d, 0xb6, 0x39, 0x6b, 0xb7, 0x35, 0x77, 0x65, 0xd1, 0xa7, 0x5f, + 0x96, 0x31, 0x93, 0x5f, 0x93, 0x5f, 0x86, 0x4b, 0xc2, 0x7d, 0x3c, 0x5f, + 0x44, 0x61, 0x63, 0x59, 0x84, 0x34, 0x4b, 0x53, 0x56, 0x87, 0x81, 0x71, + 0x36, 0xa4, 0x81, 0x41, 0xaf, 0x5e, 0x6a, 0xa5, 0x44, 0x9a, 0x2d, 0xb9, + 0x3f, 0x5c, 0x83, 0x6f, 0x7e, 0x3c, 0xd5, 0xa2, 0xa4, 0xd1, 0x8d, 0x8a, + 0xbd, 0x91, 0x6d, 0x68, 0x8c, 0x50, 0x49, 0x2d, 0x3b, 0xa8, 0xac, 0x33, + 0x4d, 0x36, 0x94, 0x51, 0xa3, 0xaa, 0x86, 0x8e, 0x8b, 0xc4, 0x76, 0x4e, + 
0x57, 0x5e, 0x91, 0xa8, 0xd1, 0xac, 0x63, 0x61, 0x98, 0x65, 0x57, 0x87, + 0xb0, 0x38, 0xa2, 0x85, 0x5a, 0x43, 0x70, 0x46, 0x64, 0xbd, 0xa2, 0x39, + 0x97, 0xae, 0xc3, 0xcf, 0xd1, 0x86, 0xbc, 0xac, 0xab, 0x71, 0x30, 0x98, + 0x38, 0x59, 0x70, 0x6c, 0x6a, 0x41, 0xd1, 0xa0, 0x40, 0xcb, 0x31, 0x5a, + 0x89, 0x3b, 0x69, 0xa6, 0xcc, 0x5e, 0xa8, 0x93, 0x3b, 0x60, 0x80, 0xa3, + 0x98, 0x87, 0x42, 0xcb, 0x65, 0xbe, 0x82, 0xa2, 0x9f, 0xab, 0x44, 0x51, + 0xcd, 0x5f, 0x6a, 0x62, 0x50, 0x4b, 0x54, 0xd4, 0x6e, 0x8e, 0x44, 0x36, + 0xa0, 0xb1, 0xa7, 0x9d, 0xd5, 0x46, 0x4b, 0x3d, 0x64, 0x3e, 0x58, 0x53, + 0x88, 0x77, 0x43, 0xbe, 0x41, 0xb5, 0xa7, 0x59, 0x33, 0xb8, 0xb7, 0x4d, + 0x4a, 0x41, 0x9d, 0x7d, 0xb9, 0x58, 0x93, 0x83, 0x57, 0x60, 0x49, 0x3a, + 0x48, 0x89, 0x6e, 0x4d, 0x63, 0x8f, 0x46, 0x9d, 0xad, 0x6c, 0x41, 0xc0, + 0x6d, 0x40, 0xcc, 0x82, 0xcd, 0x46, 0x7f, 0x41, 0x9b, 0x62, 0x8d, 0x9d, + 0xd4, 0x42, 0x5f, 0x9d, 0x91, 0xaf, 0xcf, 0x91, 0x72, 0x83, 0x95, 0xab, + 0x7b, 0x7d, 0x68, 0x87, 0x89, 0x66, 0x59, 0x3d, 0xab, 0xb5, 0x62, 0x37, + 0x3a, 0x52, 0xa1, 0x53, 0x86, 0x35, 0x37, 0x35, 0x40, 0xa1, 0x7d, 0xaf, + 0xbf, 0x94, 0x6b, 0xbe, 0xad, 0x54, 0xc6, 0xa7, 0x78, 0x64, 0x51, 0x60, + 0xcb, 0x2e, 0x55, 0x9f, 0x81, 0x48, 0x6b, 0xa4, 0x44, 0xa1, 0xd3, 0x4d, + 0x94, 0x70, 0x38, 0xb2, 0x9d, 0x7b, 0xb2, 0x5b, 0xb3, 0x55, 0x4c, 0x75, + 0x9c, 0xc5, 0x7f, 0x96, 0xbd, 0xc0, 0xc6, 0xc7, 0x98, 0x9a, 0x83, 0xa4, + 0xae, 0x78, 0x49, 0x92, 0xc6, 0x50, 0x8f, 0x93, 0x5c, 0x86, 0x77, 0xa9, + 0x52, 0xb0, 0xcc, 0x68, 0x76, 0x47, 0xb6, 0x69, 0xa1, 0x6f, 0x8a, 0xcd, + 0x75, 0xa3, 0x87, 0x98, 0x8f, 0x3f, 0x5c, 0xc6, 0xad, 0x92, 0xd3, 0xa3, + 0xbe, 0x7f, 0xc9, 0x87, 0x5f, 0x47, 0x6b, 0x6a, 0x32, 0x2c, 0x5b, 0xc3, + 0x66, 0xb7, 0x58, 0x9b, 0x75, 0x8f, 0xa0, 0xc4, 0x5a, 0x9e, 0x4b, 0xbc, + 0x3f, 0x8e, 0x55, 0x77, 0x83, 0xbf, 0x31, 0x61, 0x53, 0x5a, 0x34, 0x66, + 0xb4, 0x33, 0xa6, 0xbc, 0xcb, 0xaa, 0x41, 0x77, 0x4e, 0x97, 0xaa, 0x8a, + 0x97, 0xbb, 0x74, 0xb7, 0xa7, 0x4f, 0x72, 0x82, 0x32, 0xc1, 0x7d, 0xbd, + 0xa8, 0x8d, 0x4d, 0xa5, 0x89, 0x85, 0xca, 0x90, 0x47, 0xa1, 0x61, 0xb4, + 0x97, 0x98, 0xa9, 0x8c, 0x90, 0x8f, 0x5a, 0x90, 0x4f, 0xa6, 0xbd, 0x54, + 0x93, 0x46, 0x77, 0xb3, 0xb9, 0xbd, 0x54, 0x54, 0x52, 0x3b, 0xd6, 0x37, + 0xa3, 0x39, 0x9a, 0x63, 0x96, 0xaa, 0xaa, 0x68, 0x7c, 0x5c, 0x61, 0x52, + 0x8f, 0x9f, 0x7c, 0x99, 0x30, 0x4e, 0xa8, 0x9b, 0x42, 0xa2, 0x32, 0x43, + 0x5a, 0xd3, 0x57, 0x6b, 0x33, 0x37, 0xc5, 0x30, 0x4b, 0x97, 0xbe, 0xac, + 0x6c, 0xb3, 0x75, 0x6d, 0x7d, 0x4d, 0x9a, 0x36, 0x49, 0xb5, 0xb0, 0x51, + 0x9b, 0xb9, 0x51, 0xcf, 0xa9, 0xc8, 0x2f, 0x63, 0x49, 0x75, 0x55, 0x80, + 0xd0, 0xaf, 0x5d, 0x7e, 0x97, 0x49, 0x68, 0xa2, 0x56, 0x55, 0x90, 0x43, + 0x60, 0xb3, 0xb5, 0x9a, 0xa3, 0xc8, 0x9a, 0xb0, 0x9a, 0x7d, 0xb7, 0x45, + 0x4c, 0x33, 0xbb, 0xc7, 0x77, 0xc5, 0x7b, 0xcc, 0x96, 0x70, 0xc6, 0x55, + 0x6b, 0xbf, 0xa8, 0x78, 0x7d, 0x81, 0x6a, 0x73, 0xa0, 0x41, 0x85, 0x3c, + 0x71, 0x90, 0x96, 0x46, 0x8c, 0x63, 0x57, 0x8a, 0xcb, 0xc9, 0xa3, 0x82, + 0x9e, 0x6a, 0x7f, 0xba, 0x5b, 0x8b, 0x7f, 0x99, 0x6e, 0x66, 0x71, 0x8f, + 0xb9, 0x91, 0x4e, 0x49, 0x83, 0x48, 0x86, 0x52, 0x72, 0x95, 0xaf, 0xbb, + 0xd0, 0xc7, 0x46, 0x48, 0x96, 0xb5, 0x3d, 0x99, 0x8f, 0xc5, 0xb5, 0xcb, + 0x7a, 0xcc, 0xbc, 0x72, 0x5c, 0x7e, 0x95, 0x4b, 0x6a, 0x87, 0xb2, 0x5c, + 0xc5, 0xb0, 0xce, 0xc9, 0xa3, 0x84, 0xb8, 0x2a, 0x99, 0x98, 0xb1, 0x8c, + 0x5c, 0xb0, 0x6d, 0x5d, 0x7e, 0x9f, 0xd6, 0x63, 0x60, 0xac, 0x52, 0x4d, + 0x86, 0x5a, 0x47, 0x69, 0xb0, 0xd0, 0x53, 0x6b, 0xbe, 0x70, 0x48, 0xa0, + 0x5b, 0x4c, 0xab, 0xa8, 0x9f, 0x9d, 0xae, 0x58, 0x35, 0x9c, 0xc5, 0x9d, + 
0x9e, 0x57, 0xa4, 0x45, 0xbf, 0x31, 0x96, 0x50, 0x98, 0x74, 0x87, 0xbe, + 0x94, 0x37, 0x51, 0x45, 0xc4, 0xc3, 0x97, 0xa3, 0x4b, 0xce, 0x7f, 0xce, + 0x5b, 0x56, 0x4e, 0x71, 0x93, 0xaa, 0x90, 0xc1, 0xd0, 0xa5, 0x88, 0x34, + 0x7f, 0xb5, 0x71, 0xa5, 0x8c, 0xc9, 0xb8, 0x37, 0xc2, 0x92, 0xa5, 0x84, + 0x88, 0xdf, 0xb0, 0x7a, 0x78, 0x65, 0x8c, 0x4f, 0xd2, 0xb4, 0x5a, 0x63, + 0xc0, 0xab, 0xb2, 0xc5, 0xc6, 0xb7, 0x31, 0xc6, 0x6d, 0x99, 0x5f, 0x30, + 0xc7, 0xb3, 0x9d, 0xcd, 0xd2, 0xab, 0xd1, 0xca, 0x89, 0x9a, 0x67, 0x35, + 0x57, 0x38, 0x85, 0x8b, 0x79, 0xbe, 0x65, 0xa1, 0x9c, 0x83, 0x91, 0x38, + 0x4d, 0x66, 0x6d, 0x85, 0x83, 0xa6, 0x8f, 0x58, 0xaf, 0xa8, 0xcd, 0x8e, + 0xc3, 0xb5, 0x60, 0xa3, 0x74, 0xa4, 0x99, 0xcc, 0x96, 0x82, 0x5f, 0x41, + 0x86, 0x92, 0x9c, 0x5f, 0x77, 0x3e, 0xbc, 0xa3, 0x66, 0x80, 0x8f, 0x92, + 0x74, 0x46, 0x84, 0x3b, 0x9b, 0x5e, 0xd3, 0x51, 0xd7, 0x34, 0xa9, 0xa8, + 0xc6, 0x48, 0x5d, 0x29, 0xd4, 0x7c, 0xbc, 0xb3, 0x8e, 0xac, 0x92, 0xc9, + 0x7f, 0xbd, 0x60, 0x6d, 0x45, 0xbe, 0x8e, 0xac, 0xc8, 0x36, 0x8a, 0x56, + 0x4e, 0x96, 0x1f, 0xa2, 0x41, 0xcb, 0x69, 0x72, 0xc6, 0xab, 0x5e, 0xc2, + 0xc6, 0xbd, 0x48, 0xc8, 0x6d, 0xbe, 0xc2, 0xaf, 0x79, 0xa3, 0x4f, 0x4e, + 0x78, 0x67, 0x80, 0x87, 0x50, 0x6b, 0x83, 0xca, 0x67, 0xb3, 0xaf, 0x78, + 0xc8, 0x32, 0x52, 0x7d, 0x77, 0x75, 0x6d, 0x49, 0xc7, 0x34, 0x54, 0x8a, + 0xc4, 0x74, 0x72, 0x7f, 0x30, 0xca, 0xb6, 0x4a, 0xda, 0xc8, 0x54, 0xa3, + 0xab, 0x70, 0x5a, 0xb8, 0x4a, 0x36, 0xce, 0x37, 0xc4, 0x7c, 0xb7, 0x9d, + 0x29, 0x3a, 0x3d, 0xaa, 0x55, 0x73, 0x24, 0xda, 0x88, 0xae, 0x5a, 0xd0, + 0x7f, 0x42, 0x59, 0xc0, 0x4b, 0xb4, 0xcb, 0xd7, 0x4c, 0x54, 0xb5, 0x37, + 0x93, 0xc1, 0x4c, 0x5a, 0xab, 0x65, 0xb4, 0x61, 0x8e, 0x37, 0x7b, 0x6d, + 0x4f, 0xcc, 0x7e, 0xb1, 0x39, 0x9e, 0x67, 0x60, 0x3e, 0x4b, 0xb2, 0xdb, + 0x77, 0xbe, 0x3a, 0x38, 0x92, 0x62, 0x99, 0x81, 0x69, 0x78, 0xc4, 0xbb, + 0x95, 0xdc, 0x81, 0x58, 0x53, 0xd1, 0xd5, 0x5a, 0x8a, 0x6c, 0x3b, 0xb9, + 0x59, 0x56, 0xa2, 0x82, 0xd6, 0xc9, 0x31, 0xb8, 0x8d, 0x5e, 0x79, 0xae, + 0xa7, 0x7f, 0x33, 0x53, 0x7e, 0x6c, 0x5e, 0x98, 0xc9, 0x58, 0x5b, 0x4d, + 0xba, 0x66, 0xa1, 0xc3, 0x5e, 0x7b, 0xd1, 0x74, 0x94, 0xb0, 0x5e, 0x82, + 0x8b, 0xb0, 0x44, 0x63, 0xb8, 0xaa, 0x53, 0x3d, 0x51, 0x5c, 0xc9, 0xa2, + 0x55, 0x87, 0x79, 0x3f, 0x39, 0x43, 0x72, 0x60, 0xc4, 0xc9, 0x47, 0xaf, + 0x55, 0x89, 0x5f, 0x9a, 0x69, 0x8e, 0xad, 0xc8, 0xb6, 0x68, 0x64, 0x6e, + 0xd5, 0x95, 0x60, 0x89, 0xb1, 0x8b, 0x87, 0x7f, 0x6b, 0xbf, 0x92, 0x9d, + 0xd0, 0x64, 0x7e, 0x8b, 0x90, 0xce, 0xb2, 0x76, 0x79, 0x3b, 0x7a, 0x71, + 0x47, 0x79, 0x5e, 0x4f, 0x5e, 0x9e, 0xc6, 0xa3, 0x6d, 0x6d, 0xcb, 0x61, + 0x46, 0x67, 0x58, 0x35, 0x4d, 0xb6, 0x41, 0xc3, 0xd1, 0x7e, 0xa5, 0x8d, + 0xbc, 0x92, 0x8c, 0x6a, 0x64, 0x35, 0x9e, 0x9b, 0xc3, 0x8e, 0xd8, 0x42, + 0x39, 0x8b, 0x62, 0x54, 0x4d, 0x9c, 0xb6, 0x94, 0xbe, 0x9d, 0x7d, 0xc7, + 0x40, 0x6e, 0x42, 0xb2, 0x64, 0xb2, 0xbc, 0x34, 0x5b, 0xc8, 0x99, 0x4a, + 0x85, 0xad, 0x3a, 0x75, 0x7b, 0xac, 0x36, 0x76, 0x9e, 0xaa, 0x80, 0x58, + 0x58, 0xbd, 0xb9, 0x32, 0x80, 0x55, 0x4b, 0xa2, 0x93, 0x7e, 0x5c, 0x3e, + 0x94, 0xbd, 0x7f, 0x9a, 0x2d, 0xc2, 0xa0, 0x35, 0x58, 0x69, 0x5a, 0x7a, + 0xd0, 0x8d, 0xc3, 0x9a, 0x8d, 0xd7, 0xac, 0x54, 0x4d, 0xa7, 0xda, 0x82, + 0xd0, 0x7c, 0x75, 0xcb, 0x63, 0x71, 0x8f, 0xa0, 0x51, 0xa3, 0x72, 0xad, + 0x77, 0x82, 0xb7, 0x8c, 0x40, 0x71, 0xb3, 0x5d, 0x79, 0x61, 0xa8, 0x42, + 0x80, 0xa3, 0x75, 0x9b, 0x47, 0x63, 0x99, 0x51, 0x8e, 0xab, 0x8d, 0x87, + 0xbe, 0x89, 0x76, 0x52, 0xcb, 0x44, 0x7e, 0xbd, 0x77, 0x69, 0x87, 0x56, + 0xbf, 0xa0, 0x66, 0xcd, 0x99, 0xb5, 0x6d, 0xc4, 0x36, 0xce, 0x86, 0x55, + 
0xb3, 0x4f, 0x75, 0x8c, 0x73, 0x85, 0x88, 0x4e, 0xbe, 0x5b, 0x64, 0x81, + 0x78, 0x5f, 0x5c, 0xa4, 0x5e, 0xc1, 0x47, 0x8e, 0xa7, 0xc1, 0xb6, 0x84, + 0x39, 0x32, 0xa0, 0x68, 0x38, 0xbb, 0x86, 0x72, 0x2f, 0xa8, 0x52, 0x8f, + 0x40, 0x98, 0x63, 0xa3, 0x67, 0x7b, 0x67, 0x7d, 0xaf, 0xac, 0x69, 0xce, + 0x4e, 0x88, 0xa1, 0x6b, 0xa2, 0x65, 0x51, 0x9b, 0xcb, 0x51, 0x51, 0xbd, + 0xc7, 0x3d, 0xae, 0x56, 0xaf, 0x37, 0xbe, 0x3d, 0x9e, 0xce, 0x32, 0x85, + 0x81, 0xd4, 0x7d, 0x69, 0x67, 0xcd, 0x4c, 0x58, 0x45, 0x7d, 0x7b, 0xc9, + 0xb4, 0x98, 0x5a, 0x98, 0x38, 0xa7, 0x90, 0xb0, 0x64, 0x82, 0xc7, 0xc0, + 0x4a, 0x7d, 0xba, 0x75, 0x46, 0xcf, 0x90, 0x48, 0x2f, 0xa1, 0x74, 0x95, + 0x7e, 0xab, 0xd8, 0xce, 0xbc, 0xc3, 0x79, 0xd6, 0x33, 0x4a, 0x77, 0x7a, + 0x5f, 0x6c, 0xa6, 0x76, 0x7c, 0x7d, 0xca, 0xbc, 0x90, 0x9f, 0x81, 0xd6, + 0x7a, 0x3c, 0xb6, 0x44, 0x6c, 0x54, 0x5d, 0x2f, 0x45, 0xc9, 0x49, 0xd8, + 0x92, 0xb6, 0x7c, 0xbf, 0x56, 0xb7, 0x88, 0x57, 0x93, 0x7b, 0x5b, 0x7c, + 0xd0, 0xd3, 0xd4, 0x82, 0x40, 0x2a, 0xb8, 0xb9, 0x86, 0x37, 0x74, 0x83, + 0x59, 0xc6, 0x9c, 0x84, 0x92, 0x88, 0xc8, 0x74, 0x35, 0x83, 0xa0, 0x44, + 0x69, 0xa7, 0x71, 0x56, 0x3b, 0xcb, 0x36, 0x85, 0xbf, 0xb1, 0xa2, 0x9a, + 0x37, 0x60, 0xd3, 0x46, 0x3b, 0x81, 0x7f, 0xb6, 0x39, 0x5a, 0xbd, 0x4b, + 0xc5, 0xa7, 0x3b, 0xd3, 0x49, 0x97, 0x9f, 0xcb, 0xcc, 0xa6, 0x6a, 0x38, + 0xa0, 0x97, 0x56, 0x68, 0x88, 0x31, 0x7d, 0xab, 0xb9, 0x5b, 0xc7, 0x80, + 0x91, 0x4b, 0xc6, 0x88, 0x7f, 0x3d, 0xab, 0x57, 0x5a, 0x96, 0xb4, 0x7d, + 0xa3, 0x7d, 0x90, 0x9f, 0x4f, 0x93, 0x89, 0xc5, 0xa3, 0xae, 0x5d, 0x3c, + 0x98, 0x75, 0x9e, 0x65, 0xc1, 0x9e, 0x9a, 0x3b, 0x34, 0x36, 0x66, 0x6b, + 0x74, 0x48, 0x99, 0xbc, 0x78, 0x77, 0x95, 0x74, 0x5d, 0x55, 0x2f, 0xab, + 0x4d, 0x72, 0x51, 0x71, 0xb2, 0x8b, 0x5c, 0xba, 0x68, 0x5e, 0x88, 0x40, + 0x5b, 0xde, 0xb3, 0x75, 0x9e, 0xc8, 0xd0, 0x7b, 0x69, 0x87, 0xc1, 0x7a, + 0xb6, 0xce, 0x36, 0x51, 0x6b, 0x54, 0xd3, 0x60, 0x67, 0xc3, 0x8d, 0xc1, + 0xcc, 0xc3, 0xa4, 0x38, 0xce, 0x88, 0x49, 0x6a, 0x92, 0xd9, 0xa8, 0x65, + 0x70, 0x7d, 0x44, 0xc1, 0x44, 0xbf, 0x77, 0x95, 0xa6, 0x89, 0x81, 0xc2, + 0xa4, 0x69, 0x86, 0xa3, 0x34, 0x97, 0x8d, 0x82, 0x3f, 0x89, 0x3e, 0x66, + 0x40, 0x52, 0x89, 0xca, 0xcc, 0xd7, 0x64, 0x6c, 0x82, 0x92, 0x75, 0x69, + 0x8c, 0x58, 0x7e, 0x6f, 0xc7, 0x75, 0x3c, 0x50, 0x89, 0x80, 0xba, 0x83, + 0x86, 0x4e, 0xb9, 0x74, 0xb1, 0x84, 0xb4, 0x4a, 0x98, 0xcd, 0x7b, 0xb0, + 0x5f, 0xa0, 0xce, 0x53, 0xa1, 0x5c, 0xa1, 0x7b, 0xcd, 0x8c, 0xaa, 0x9f, + 0x64, 0x59, 0x66, 0xb3, 0xc7, 0x5b, 0x64, 0x4f, 0xbe, 0x3d, 0x96, 0x4e, + 0x9c, 0x4d, 0x60, 0x8d, 0x7d, 0x4b, 0xa1, 0x39, 0xb5, 0xa1, 0x48, 0x6a, + 0x77, 0xa4, 0xbe, 0xda, 0x83, 0xd1, 0x57, 0x57, 0xab, 0x70, 0xbf, 0xcb, + 0x79, 0x74, 0x94, 0xc7, 0x6d, 0x54, 0x88, 0x65, 0xa3, 0x83, 0xe3, 0x58, + 0x6f, 0xce, 0x8a, 0x6b, 0x50, 0x3c, 0x6a, 0x9d, 0x64, 0x9c, 0xa5, 0x35, + 0x7a, 0x5f, 0x61, 0x82, 0x6b, 0xd6, 0x6c, 0x7c, 0x48, 0xcb, 0x39, 0x6a, + 0x8c, 0x70, 0x7f, 0x53, 0x6f, 0xba, 0x43, 0x98, 0xc4, 0x9f, 0x7b, 0x64, + 0xb0, 0xb1, 0x94, 0xd0, 0x62, 0x50, 0x4c, 0x93, 0x60, 0x6a, 0x45, 0x70, + 0xbb, 0x9c, 0x47, 0x57, 0x93, 0xc4, 0x7d, 0x7f, 0x61, 0xc9, 0x3f, 0xbf, + 0x95, 0xa9, 0xb6, 0x8b, 0xa4, 0x79, 0x4c, 0x89, 0x97, 0xc9, 0x7f, 0x6b, + 0xbf, 0xd0, 0x6f, 0x75, 0xa6, 0xce, 0x6d, 0x45, 0x55, 0xae, 0xbd, 0x9c, + 0xb8, 0x96, 0x42, 0x6b, 0x65, 0x34, 0x64, 0x63, 0x50, 0x72, 0x76, 0x8c, + 0x41, 0xca, 0xa5, 0x57, 0x6f, 0x81, 0xa2, 0x83, 0xc5, 0x91, 0xc6, 0xac, + 0x9f, 0x47, 0xa9, 0x9d, 0x43, 0xb6, 0xb8, 0x5c, 0x4d, 0xda, 0x9f, 0x61, + 0x4c, 0xa3, 0xa1, 0x5e, 0x72, 0x73, 0x31, 0x3a, 0x7c, 0x4e, 0x36, 0xbe, + 
0x81, 0x98, 0x37, 0x8b, 0x6f, 0xa6, 0x96, 0x79, 0x68, 0x4c, 0x90, 0x9c, + 0x5e, 0x64, 0x3d, 0x99, 0x7c, 0xcc, 0xc8, 0x8a, 0x4a, 0x6a, 0x49, 0x34, + 0x9b, 0xc8, 0xad, 0xaf, 0x91, 0xd7, 0x69, 0x5e, 0xd1, 0x6a, 0xcd, 0x94, + 0xae, 0xc2, 0x6d, 0x92, 0x35, 0x2f, 0x45, 0x4f, 0x7a, 0xbb, 0x45, 0x4d, + 0xcc, 0x81, 0xcb, 0xc3, 0xa2, 0x98, 0xae, 0xa3, 0x64, 0x6d, 0x9f, 0xbf, + 0x5f, 0xcf, 0xdf, 0x98, 0x43, 0x4d, 0xb2, 0x4b, 0x51, 0x5b, 0x3d, 0x33, + 0xad, 0x43, 0x87, 0x8d, 0x63, 0xbe, 0xd4, 0xc0, 0xb5, 0x84, 0x48, 0x60, + 0xda, 0x43, 0x87, 0x8d, 0xa2, 0x5b, 0x7f, 0x35, 0x3a, 0xa5, 0x37, 0x3f, + 0x62, 0x35, 0xac, 0x92, 0x77, 0x3b, 0xa7, 0x7e, 0x50, 0xc1, 0x8b, 0x60, + 0x3b, 0xa3, 0x6b, 0x87, 0x52, 0xbc, 0x70, 0x3e, 0x6c, 0x89, 0x8b, 0x88, + 0x9d, 0xd1, 0x99, 0x79, 0xab, 0xcd, 0x4a, 0xaf, 0x42, 0x5d, 0x3c, 0x82, + 0x9e, 0x42, 0xc2, 0x53, 0x89, 0x6d, 0xb5, 0x83, 0xa4, 0xbe, 0x94, 0xa1, + 0x81, 0x65, 0x9a, 0xbc, 0xca, 0x4d, 0x63, 0x90, 0x4b, 0x81, 0xc8, 0xac, + 0x36, 0xc1, 0x37, 0x4f, 0x59, 0xc8, 0xb7, 0x93, 0x5c, 0xcc, 0xcb, 0x85, + 0x97, 0xbf, 0x45, 0xc1, 0x4a, 0xa8, 0xa9, 0x84, 0xac, 0x77, 0x8a, 0xc3, + 0xa3, 0x93, 0xaf, 0x4f, 0x3d, 0x87, 0xcc, 0xb4, 0x99, 0x4d, 0x79, 0xd0, + 0x6e, 0xc7, 0xc9, 0x31, 0x76, 0x9b, 0xb5, 0xc6, 0x7e, 0x9a, 0x49, 0xc0, + 0x77, 0x39, 0x66, 0xcb, 0x96, 0x42, 0xaa, 0xb0, 0xb0, 0x83, 0xb4, 0x53, + 0x63, 0x47, 0x5d, 0x9c, 0x42, 0xa1, 0x59, 0x7d, 0x4b, 0x65, 0x46, 0x4a, + 0x48, 0x9d, 0xab, 0x43, 0x71, 0x40, 0x93, 0x30, 0x7e, 0x4b, 0xc8, 0x8c, + 0x3d, 0x96, 0xd4, 0xab, 0xc8, 0xc1, 0xb0, 0x4f, 0xad, 0x63, 0x92, 0x67, + 0xa2, 0xbb, 0x84, 0x79, 0x56, 0xce, 0x46, 0x47, 0x9f, 0x4d, 0x51, 0x92, + 0x36, 0x9e, 0xb2, 0xb3, 0xb2, 0xc2, 0x72, 0x98, 0xcd, 0xb3, 0x89, 0x79, + 0x4f, 0xc2, 0x97, 0x94, 0x54, 0x80, 0x6c, 0xaa, 0x3f, 0x70, 0xa7, 0xa2, + 0x2f, 0xcc, 0x8f, 0x9f, 0x7c, 0x7e, 0xbd, 0x5c, 0x56, 0x4b, 0xbd, 0x5f, + 0xd4, 0x78, 0xac, 0x85, 0x8f, 0xc0, 0x8f, 0x7e, 0x3c, 0x57, 0x8b, 0x56, + 0xb2, 0xc7, 0xb1, 0xb6, 0x65, 0xd4, 0x6b, 0xa6, 0xa6, 0xb8, 0x72, 0x4e, + 0x77, 0x92, 0xc3, 0xc6, 0x8f, 0xa6, 0xbc, 0x40, 0x4a, 0x42, 0x4c, 0x99, + 0x5d, 0x3d, 0xa7, 0x54, 0xc8, 0xd0, 0xb5, 0x6b, 0xbe, 0x96, 0x72, 0x5d, + 0x4e, 0x74, 0xb8, 0xb5, 0x30, 0xc0, 0x67, 0x3d, 0xd3, 0x45, 0x95, 0x79, + 0xb5, 0x95, 0x5d, 0x9c, 0x7a, 0xce, 0xa3, 0x5c, 0xda, 0x38, 0xc3, 0x40, + 0xa7, 0xaa, 0x56, 0x9e, 0xd2, 0xd3, 0x28, 0xa4, 0x3d, 0x7a, 0x6e, 0x8d, + 0x50, 0x62, 0x82, 0x46, 0x67, 0x3d, 0x3b, 0x34, 0x8b, 0xb8, 0x83, 0x51, + 0x5f, 0xa0, 0x51, 0x3e, 0x48, 0x7e, 0xb6, 0x99, 0x68, 0x59, 0x78, 0x5c, + 0x5f, 0xcc, 0xd2, 0xd6, 0x34, 0x8f, 0xd1, 0x9a, 0x9c, 0x9c, 0x50, 0x64, + 0x56, 0x9c, 0xae, 0x8f, 0x5e, 0x86, 0x5e, 0x98, 0x38, 0xd8, 0x52, 0xae, + 0xcc, 0x4f, 0xb4, 0xa6, 0x9f, 0x52, 0xbb, 0xbc, 0x8e, 0x38, 0x71, 0x39, + 0x70, 0xbe, 0x3e, 0xc5, 0xa4, 0x59, 0x8b, 0xc4, 0x73, 0x7b, 0x51, 0xb2, + 0x81, 0x5d, 0x82, 0x4d, 0x36, 0xa1, 0x99, 0xb7, 0x62, 0xc9, 0xb8, 0x31, + 0xa8, 0xab, 0x7c, 0x62, 0x85, 0x7b, 0x96, 0x8e, 0x82, 0x96, 0xbd, 0xa3, + 0x54, 0x60, 0xd8, 0x9a, 0x7e, 0x4c, 0xa1, 0x79, 0xb7, 0x8a, 0xdc, 0x5a, + 0x4c, 0x37, 0xa7, 0x74, 0x58, 0xb7, 0x92, 0xbc, 0x45, 0x74, 0xc8, 0xb1, + 0xce, 0x41, 0x97, 0x52, 0x4a, 0xbc, 0x3c, 0x9f, 0x93, 0x8e, 0xcd, 0x6a, + 0x3d, 0xd2, 0x97, 0xa9, 0x93, 0x78, 0xa7, 0x68, 0xc1, 0xaf, 0x5d, 0x8b, + 0x58, 0x8e, 0x7c, 0x49, 0x90, 0x73, 0xd4, 0x45, 0xc9, 0xbf, 0x79, 0x49, + 0x6a, 0x40, 0xc0, 0xc5, 0x7c, 0x86, 0xa8, 0x35, 0x64, 0x3c, 0xb0, 0x79, + 0x43, 0x60, 0xa2, 0x7a, 0x59, 0xbc, 0x34, 0x59, 0x53, 0x7f, 0x4b, 0xaf, + 0xb9, 0x6e, 0xb2, 0xcf, 0x86, 0x6d, 0x6c, 0x80, 0xb5, 0x45, 0x30, 0x46, + 
0xba, 0xbb, 0x85, 0xd3, 0x3d, 0x33, 0xa2, 0x89, 0x7f, 0x2d, 0x3b, 0x83, + 0x6b, 0x51, 0x9c, 0x59, 0xa7, 0xb6, 0x64, 0x35, 0x79, 0x93, 0x2e, 0x36, + 0xc0, 0xb0, 0xc2, 0xa1, 0x9f, 0xb8, 0x8f, 0x45, 0x33, 0x42, 0x79, 0x97, + 0x77, 0xc2, 0x59, 0x7f, 0x3b, 0x3d, 0x4d, 0x9e, 0x4c, 0x9b, 0x6a, 0x7e, + 0x63, 0xaa, 0x9f, 0x9f, 0xc8, 0x4f, 0xbc, 0x38, 0xbc, 0x9f, 0x85, 0x32, + 0x75, 0xd5, 0x6c, 0x5a, 0x39, 0xa1, 0x9c, 0x61, 0x2a, 0x69, 0xca, 0x32, + 0x71, 0x79, 0x93, 0x4d, 0x81, 0xab, 0x8b, 0x7a, 0xae, 0xa5, 0x55, 0xca, + 0x58, 0x49, 0xa9, 0x26, 0x5c, 0xa7, 0x5f, 0x91, 0xac, 0x43, 0x42, 0x7f, + 0x4b, 0xc2, 0x2e, 0x93, 0x40, 0xb9, 0x59, 0x6a, 0xa5, 0x30, 0xb6, 0xdc, + 0x84, 0x3f, 0x80, 0xa9, 0x91, 0x3a, 0x97, 0x8f, 0x6d, 0x43, 0x4c, 0x34, + 0x66, 0x96, 0xa5, 0x52, 0x78, 0x41, 0x86, 0xb5, 0xae, 0x8f, 0xb3, 0xcb, + 0x5d, 0x63, 0x3c, 0x57, 0xbb, 0xa8, 0x69, 0xc1, 0x32, 0xca, 0x52, 0xc2, + 0xbb, 0xb9, 0x92, 0x76, 0xc5, 0x2e, 0x6e, 0x54, 0xad, 0x4a, 0x48, 0xb4, + 0x57, 0x33, 0x49, 0x80, 0x43, 0x38, 0x4d, 0xa0, 0xa7, 0x87, 0x8f, 0x38, + 0xb6, 0x6c, 0x62, 0x8f, 0xa2, 0xab, 0xb9, 0x42, 0xcb, 0x98, 0xcb, 0x66, + 0x68, 0x68, 0x55, 0x52, 0x5e, 0xcb, 0xbf, 0x8b, 0xac, 0x9e, 0xd4, 0xc1, + 0x8d, 0x6b, 0x5e, 0x5e, 0x98, 0x96, 0x6a, 0x2b, 0x59, 0x56, 0xbb, 0xd8, + 0xb8, 0x59, 0x8d, 0x73, 0x8f, 0x90, 0xc0, 0xd1, 0xc5, 0x57, 0x6b, 0x85, + 0x43, 0x9d, 0x90, 0x76, 0x47, 0x5b, 0xa0, 0x40, 0xc5, 0x3c, 0xe5, 0x81, + 0x9e, 0xc5, 0x66, 0xad, 0x89, 0x85, 0xbf, 0x58, 0xad, 0x7f, 0x33, 0x66, + 0xa5, 0x6b, 0x67, 0xd2, 0xd8, 0x4b, 0x6b, 0x5e, 0x93, 0x8f, 0x52, 0x91, + 0x33, 0x6e, 0x64, 0x61, 0x76, 0xb8, 0x5b, 0x97, 0x54, 0xa7, 0x7e, 0x77, + 0xb8, 0xb8, 0x64, 0xc2, 0x6f, 0xb7, 0x6e, 0xc7, 0xbf, 0x95, 0xb4, 0x64, + 0x3a, 0xbd, 0x91, 0x2f, 0xd0, 0x8e, 0x92, 0x35, 0xa3, 0x6a, 0x65, 0x95, + 0x3a, 0x66, 0x8f, 0x78, 0x7b, 0x59, 0x3b, 0x8d, 0x8b, 0xc0, 0x65, 0x85, + 0x77, 0xc0, 0xa2, 0x9e, 0xa1, 0x96, 0x4e, 0x31, 0x8b, 0xae, 0x38, 0xda, + 0x5b, 0x58, 0xca, 0xc7, 0x9c, 0xab, 0x5d, 0x99, 0x6f, 0x76, 0x2f, 0x86, + 0xde, 0x50, 0x60, 0x29, 0x6b, 0x41, 0x8b, 0x37, 0xae, 0x9a, 0x47, 0x40, + 0x32, 0x3e, 0xc3, 0x5d, 0x90, 0xc6, 0x9a, 0xae, 0xa3, 0xa1, 0x38, 0x77, + 0x94, 0x35, 0xb1, 0x8f, 0x60, 0x82, 0x71, 0x7a, 0x6d, 0x44, 0x78, 0x8c, + 0x62, 0x98, 0x58, 0xb1, 0x80, 0x2a, 0x42, 0x40, 0xaa, 0x53, 0xc0, 0x61, + 0xc9, 0x88, 0x76, 0x9f, 0x97, 0x4b, 0x63, 0x8e, 0x42, 0xd6, 0x6c, 0x92, + 0x9d, 0x5f, 0x77, 0x26, 0x49, 0x95, 0x6b, 0x72, 0xa9, 0xb9, 0x8b, 0x82, + 0x6f, 0x7c, 0xa2, 0xc1, 0xad, 0x9f, 0x87, 0xa7, 0x73, 0x66, 0x98, 0x72, + 0xca, 0x89, 0xb4, 0x45, 0x3b, 0xc0, 0xc3, 0x54, 0x90, 0x48, 0x92, 0x71, + 0xb1, 0xbd, 0xbe, 0x92, 0x4f, 0x84, 0x96, 0x41, 0x3d, 0x5c, 0x44, 0x35, + 0xcc, 0x58, 0x55, 0xc8, 0x9f, 0x8d, 0xbc, 0x81, 0x42, 0x3b, 0x96, 0x4f, + 0x9b, 0xbc, 0xc0, 0xd2, 0x58, 0x84, 0x6e, 0x4e, 0x4a, 0x9c, 0x6c, 0xb7, + 0x68, 0x5e, 0x67, 0x85, 0xba, 0x42, 0x55, 0x7c, 0x9c, 0x82, 0x3e, 0x3e, + 0xc4, 0xb7, 0x53, 0xcf, 0x3f, 0x6c, 0x2b, 0xb6, 0xcf, 0x75, 0x38, 0xc2, + 0x86, 0x58, 0xae, 0x27, 0xb9, 0xc5, 0xac, 0xbd, 0xcd, 0x71, 0x36, 0x87, + 0x73, 0x4d, 0x8b, 0xbf, 0x48, 0xb6, 0x3b, 0x82, 0x4f, 0x50, 0x6f, 0xab, + 0x48, 0x81, 0x81, 0xb4, 0xae, 0x59, 0xaa, 0x50, 0x8f, 0xa7, 0x8f, 0xac, + 0xc3, 0x65, 0x6b, 0x70, 0x9d, 0xab, 0xbb, 0xb8, 0x32, 0x9a, 0x32, 0x98, + 0xc4, 0x5f, 0xcd, 0x88, 0x99, 0x80, 0x9c, 0xc0, 0x5d, 0x3f, 0x77, 0x87, + 0x92, 0x35, 0x73, 0xad, 0x7a, 0x94, 0x86, 0xca, 0x5f, 0xcb, 0x9c, 0x2d, + 0x65, 0x5f, 0x7f, 0x51, 0x57, 0x2c, 0x3d, 0x66, 0x58, 0x82, 0x24, 0x4c, + 0x44, 0xc3, 0x7e, 0x37, 0x79, 0x8f, 0x4b, 0x36, 0x84, 0x81, 0x7b, 0xd2, + 
0x3e, 0xbe, 0x2e, 0x9b, 0xc7, 0xa0, 0x50, 0x7b, 0x9d, 0x8c, 0xd0, 0x87, + 0x75, 0xac, 0x82, 0xbe, 0x92, 0x51, 0x6f, 0x50, 0x43, 0xd2, 0xc9, 0xd3, + 0x90, 0x52, 0x77, 0x90, 0x2e, 0xb3, 0xa9, 0xa4, 0xcf, 0x9f, 0x38, 0x54, + 0xb3, 0x9d, 0x2c, 0x52, 0xd1, 0x86, 0x34, 0xbd, 0x91, 0x72, 0x2c, 0xb0, + 0x8d, 0xc2, 0xc4, 0x41, 0x94, 0x86, 0xb2, 0xac, 0x4c, 0xcf, 0xa7, 0x83, + 0xda, 0x77, 0x65, 0x51, 0xbc, 0x9f, 0xa0, 0xa1, 0x8d, 0x56, 0x6e, 0xa0, + 0xbe, 0x60, 0xa6, 0xc7, 0x5a, 0x8f, 0x30, 0x80, 0x46, 0x3c, 0x8b, 0x94, + 0x91, 0xa0, 0x63, 0x43, 0x76, 0x4b, 0x7b, 0x48, 0x5f, 0x4d, 0xcd, 0xd7, + 0xc1, 0x89, 0x36, 0x65, 0x78, 0xa9, 0x79, 0x87, 0x75, 0x89, 0x9f, 0xac, + 0x89, 0xcb, 0xd2, 0x82, 0xc2, 0x5f, 0x44, 0x5f, 0x74, 0x43, 0x51, 0xaf, + 0x3f, 0x59, 0xce, 0xaf, 0x70, 0xbd, 0x4d, 0x78, 0x80, 0xcc, 0x4c, 0x9b, + 0xd0, 0x2f, 0x3d, 0xa7, 0xae, 0xc0, 0x40, 0x89, 0x39, 0x72, 0x29, 0xb2, + 0x6c, 0x83, 0xcc, 0x4b, 0x9c, 0x8e, 0x46, 0x32, 0xce, 0x6e, 0x89, 0x7b, + 0x92, 0x4b, 0x36, 0x48, 0x3e, 0x59, 0x51, 0x9e, 0x61, 0xd5, 0x63, 0xa4, + 0xb2, 0x4c, 0x77, 0xb0, 0x5b, 0x4c, 0x5f, 0xaf, 0x92, 0x6d, 0x44, 0x5c, + 0x6f, 0x83, 0x9d, 0x7f, 0xb1, 0x69, 0x36, 0x6f, 0x70, 0xbe, 0xcb, 0x9f, + 0x7b, 0xbe, 0x54, 0xaf, 0x65, 0x9b, 0xbf, 0x6c, 0x41, 0x3a, 0xba, 0x52, + 0x64, 0x4b, 0x74, 0xaf, 0xc8, 0x35, 0x9d, 0x6c, 0x37, 0x5c, 0x52, 0x51, + 0xb0, 0x50, 0xad, 0x88, 0xc3, 0xc0, 0xbc, 0x6d, 0x4d, 0x8b, 0x54, 0x61, + 0x7e, 0x77, 0x3d, 0x71, 0xce, 0x6f, 0x5b, 0x4b, 0x47, 0x4e, 0x9f, 0xb0, + 0xb6, 0x3d, 0xbf, 0xcd, 0x4a, 0x3b, 0x71, 0x4a, 0xa4, 0x53, 0x64, 0x68, + 0x3a, 0xd4, 0x5a, 0x3b, 0x52, 0xd0, 0xd1, 0x9e, 0x87, 0x6e, 0x6f, 0xce, + 0xae, 0xa5, 0x39, 0x35, 0x9a, 0x34, 0x5b, 0x83, 0x9b, 0xc8, 0xa5, 0xb0, + 0x86, 0x50, 0x70, 0x61, 0x8e, 0x6b, 0xd0, 0x55, 0x39, 0xa2, 0xb0, 0x6b, + 0xb6, 0xbd, 0x49, 0x2c, 0x8b, 0x44, 0x8e, 0x88, 0x53, 0x6d, 0x91, 0x55, + 0xcc, 0x90, 0x78, 0x93, 0x8c, 0x9e, 0xbd, 0xc9, 0x47, 0x92, 0x77, 0x79, + 0x35, 0x96, 0x93, 0x75, 0x34, 0x9d, 0xb5, 0x46, 0xc0, 0xce, 0x9c, 0x4a, + 0x8c, 0x4b, 0xad, 0xde, 0x81, 0xa5, 0x8e, 0xc6, 0xac, 0x64, 0xab, 0x30, + 0x41, 0x75, 0x7d, 0x57, 0x69, 0x88, 0xc4, 0x53, 0xcd, 0x72, 0xa6, 0x52, + 0x31, 0xbe, 0x9e, 0x67, 0x39, 0xad, 0x90, 0x99, 0x4d, 0xb7, 0x46, 0x6f, + 0xb8, 0x72, 0x63, 0x3b, 0x59, 0xc2, 0xba, 0xaa, 0x98, 0x38, 0xbb, 0x41, + 0x3e, 0x48, 0x6a, 0x88, 0xa9, 0xad, 0x86, 0x8c, 0x4d, 0x3c, 0xb6, 0x4f, + 0x8e, 0x5e, 0xbf, 0xa1, 0x73, 0xbc, 0x8e, 0x3e, 0x68, 0x4e, 0x99, 0x44, + 0xb6, 0x65, 0x5b, 0xc8, 0xb8, 0x68, 0x49, 0x53, 0x8d, 0xd0, 0xa2, 0x8e, + 0x97, 0xac, 0x9f, 0x77, 0xb3, 0x71, 0xa3, 0x8c, 0xb5, 0x68, 0xb3, 0x86, + 0x6d, 0x53, 0x99, 0xb4, 0x49, 0x55, 0xb1, 0xac, 0x5f, 0x9e, 0xcb, 0xac, + 0x51, 0x8a, 0x3b, 0xa3, 0x42, 0x5c, 0xae, 0xdc, 0xb6, 0x76, 0x9a, 0xb3, + 0x99, 0xcc, 0x43, 0x4b, 0xa2, 0x60, 0xcc, 0x8a, 0xcb, 0xb1, 0x96, 0x72, + 0xa5, 0xa5, 0x2c, 0x5f, 0x4c, 0x96, 0x56, 0x91, 0xb1, 0xa5, 0xc9, 0x4f, + 0x8f, 0xbf, 0xcb, 0x33, 0x4c, 0x43, 0x73, 0xbe, 0xb3, 0x35, 0x85, 0x62, + 0x59, 0x79, 0x42, 0xaa, 0x36, 0x67, 0xcd, 0x71, 0x67, 0x47, 0x6c, 0x97, + 0x42, 0x6d, 0x94, 0x37, 0x42, 0x35, 0x58, 0x4c, 0x96, 0xbc, 0xa4, 0x55, + 0x98, 0x6e, 0xb7, 0xd4, 0x98, 0x8b, 0x9e, 0xbe, 0x62, 0x93, 0x5b, 0xb7, + 0x96, 0x89, 0xa9, 0x39, 0x9c, 0x7d, 0xa4, 0x79, 0x4e, 0xc2, 0xa5, 0xc0, + 0xd5, 0x98, 0x7a, 0x73, 0x79, 0x96, 0x5b, 0x72, 0x58, 0x83, 0x38, 0x4b, + 0x99, 0x49, 0x78, 0x4a, 0x82, 0x55, 0x45, 0x54, 0x9e, 0xb1, 0x99, 0xcf, + 0xc2, 0xd0, 0x63, 0x69, 0xc4, 0x64, 0xc3, 0x30, 0x82, 0xcb, 0x44, 0xc5, + 0x8a, 0x88, 0x36, 0x85, 0xa3, 0x50, 0x5e, 0x96, 0x99, 0x9b, 0x55, 0x89, + 
0xc0, 0x3e, 0xb4, 0xb3, 0x58, 0x8d, 0x65, 0x4d, 0x43, 0x3d, 0xbe, 0x8f, + 0x77, 0x5b, 0x9a, 0x81, 0xc9, 0x54, 0x86, 0xb6, 0x88, 0xaf, 0x7d, 0x3f, + 0x9b, 0x6c, 0x59, 0x69, 0xca, 0xaa, 0x91, 0x3d, 0xb8, 0x54, 0xd6, 0x84, + 0x54, 0x9e, 0x5c, 0xb1, 0x38, 0x92, 0x9c, 0xc6, 0x87, 0xaa, 0x5f, 0xc8, + 0x91, 0xac, 0x8a, 0x53, 0xb4, 0x35, 0x5e, 0x71, 0x3f, 0xb6, 0x8d, 0x8d, + 0x46, 0x31, 0x9c, 0x63, 0x5c, 0x6f, 0x66, 0xbd, 0xae, 0x57, 0x63, 0xb2, + 0xcd, 0xae, 0x95, 0x86, 0xb3, 0x70, 0x77, 0x6d, 0xcb, 0x30, 0x4c, 0xb6, + 0xac, 0x8c, 0xa6, 0x65, 0xa6, 0x7d, 0x53, 0x88, 0x50, 0x4e, 0x86, 0xc4, + 0x91, 0x96, 0x65, 0xb9, 0x31, 0x60, 0x65, 0x54, 0x7d, 0xb0, 0x6a, 0x91, + 0x39, 0x5b, 0xc0, 0x8f, 0x4d, 0xbe, 0xcc, 0xb2, 0x98, 0xbe, 0x67, 0x4e, + 0x6c, 0x45, 0xbc, 0x67, 0xb5, 0x7b, 0xb2, 0xb7, 0x90, 0xc9, 0xc3, 0x54, + 0x52, 0x62, 0x99, 0x74, 0x42, 0xc3, 0xb8, 0x40, 0x51, 0x8b, 0x5d, 0x4a, + 0xc2, 0x97, 0x6f, 0x61, 0x9e, 0xbe, 0x6d, 0x7e, 0x5d, 0x86, 0x97, 0xcf, + 0x3a, 0x7f, 0x58, 0x78, 0x39, 0xaf, 0x58, 0xae, 0x3d, 0xad, 0xb7, 0xad, + 0x42, 0x8d, 0x42, 0xb0, 0x4e, 0xca, 0xb6, 0xbe, 0x6e, 0x91, 0x9d, 0xba, + 0xa0, 0xb0, 0x8b, 0x33, 0x28, 0xa5, 0x97, 0x40, 0x8b, 0x6c, 0x8b, 0x85, + 0x4f, 0xac, 0x9e, 0x41, 0x5c, 0xd9, 0xc0, 0xc4, 0x73, 0x3c, 0x8a, 0x5a, + 0x32, 0x5b, 0x75, 0x8c, 0x8b, 0x91, 0x8b, 0x84, 0x7d, 0x64, 0xa2, 0x74, + 0xa4, 0x52, 0xb6, 0x9f, 0x3b, 0x85, 0xc7, 0x8b, 0x7b, 0x36, 0xca, 0x68, + 0xbb, 0x5f, 0xc1, 0xc6, 0x4e, 0x57, 0x81, 0xa4, 0x43, 0xc1, 0x8b, 0x36, + 0x4b, 0x4f, 0xbc, 0x69, 0x96, 0x67, 0x7c, 0x43, 0xba, 0x5e, 0x4c, 0x84, + 0x77, 0x95, 0xa6, 0x5f, 0xd1, 0xce, 0x8e, 0x6f, 0x49, 0xb8, 0x46, 0x8a, + 0x9c, 0x37, 0x9d, 0x91, 0x7b, 0xb3, 0x81, 0x66, 0xa4, 0x6f, 0xc5, 0x6f, + 0x71, 0xcd, 0xc9, 0x61, 0x5a, 0xb2, 0x60, 0x7f, 0xb4, 0x50, 0x6c, 0x94, + 0xc2, 0x9c, 0x4f, 0x97, 0x6c, 0x4b, 0x84, 0x5d, 0xb7, 0xc5, 0xd3, 0xd1, + 0x94, 0x8b, 0x94, 0xc4, 0x3a, 0x64, 0x3f, 0x50, 0x79, 0xb3, 0x6e, 0x58, + 0x6a, 0xba, 0xcb, 0x9f, 0xaa, 0x80, 0xcd, 0x93, 0x3d, 0x89, 0xaf, 0x9e, + 0x5c, 0xb2, 0x39, 0x95, 0x99, 0x78, 0x69, 0x46, 0x4d, 0xcd, 0xa8, 0x44, + 0xc2, 0xb2, 0xaa, 0x8a, 0x96, 0x54, 0x84, 0xcb, 0xaa, 0xbe, 0xa7, 0xc6, + 0xa6, 0x63, 0x9c, 0x95, 0x99, 0x5a, 0x63, 0x8a, 0x89, 0xd1, 0x54, 0x8f, + 0xc6, 0xaf, 0x5e, 0x6e, 0xac, 0x51, 0x8f, 0x33, 0x79, 0x91, 0xb6, 0x32, + 0xc7, 0x6a, 0xc0, 0x6b, 0x9e, 0x64, 0xb1, 0x89, 0xbf, 0xcb, 0x8c, 0x8b, + 0x2d, 0xbd, 0xc2, 0x4e, 0xa2, 0x56, 0x8b, 0x8a, 0x3c, 0x49, 0xac, 0xaa, + 0xcb, 0xc5, 0x65, 0x36, 0x6e, 0xb1, 0x91, 0x4d, 0xcf, 0xa4, 0x78, 0x49, + 0x5a, 0xa1, 0x48, 0x3e, 0x71, 0x7c, 0x86, 0x9c, 0x42, 0x3b, 0x44, 0x45, + 0x8c, 0x75, 0xcb, 0x81, 0x8e, 0x56, 0x87, 0xc7, 0xa2, 0x82, 0xb0, 0xa1, + 0x63, 0x87, 0x72, 0x5a, 0x78, 0xad, 0xb5, 0xa9, 0xb1, 0xc3, 0x58, 0x61, + 0xa3, 0xb1, 0xbe, 0x4e, 0xa5, 0xd3, 0xae, 0x4f, 0x4d, 0x97, 0x84, 0x72, + 0xbc, 0xa1, 0x4b, 0x66, 0x73, 0xba, 0xa0, 0x50, 0x4b, 0xa7, 0x42, 0x39, + 0x8d, 0x5a, 0xac, 0x91, 0x5c, 0x85, 0x37, 0x64, 0xb2, 0x37, 0xac, 0x77, + 0x9b, 0x48, 0xd2, 0x73, 0x3c, 0x88, 0x95, 0x6f, 0x5f, 0x90, 0x84, 0xa4, + 0xb9, 0xc9, 0x9b, 0xaf, 0xa9, 0xb6, 0xc1, 0x48, 0xc0, 0x87, 0xc7, 0x37, + 0x82, 0x55, 0xa9, 0x42, 0xce, 0xb9, 0xa2, 0xb6, 0x50, 0x95, 0x37, 0x9c, + 0xcd, 0x6e, 0xbe, 0x92, 0xbc, 0x9f, 0x55, 0x9a, 0x66, 0x93, 0x7c, 0x60, + 0x66, 0xae, 0x38, 0x8a, 0xba, 0x55, 0xa3, 0x77, 0xbe, 0x4d, 0x58, 0x29, + 0x9d, 0xb9, 0x38, 0xd5, 0xca, 0xc0, 0x80, 0x8d, 0x5f, 0x4c, 0x84, 0x97, + 0x39, 0xcb, 0xd4, 0xa6, 0x77, 0x84, 0x60, 0x3d, 0xce, 0xd8, 0xab, 0x57, + 0x49, 0x69, 0x47, 0xa0, 0x56, 0x3a, 0x43, 0xd2, 0x34, 0x83, 0x83, 0x4c, + 
0x71, 0x98, 0xd9, 0x5c, 0x46, 0x9e, 0x49, 0xb0, 0xc4, 0xae, 0x39, 0x4c, + 0x4c, 0x60, 0x3e, 0x99, 0x51, 0x99, 0xbd, 0x75, 0xb0, 0x4c, 0xb5, 0x49, + 0x78, 0x82, 0x7a, 0x9d, 0x36, 0x87, 0xc2, 0x4b, 0x91, 0x5d, 0x4e, 0x71, + 0x44, 0xc1, 0x58, 0xaf, 0x52, 0x59, 0x9c, 0x68, 0x7d, 0x99, 0xc2, 0x45, + 0x4c, 0xc0, 0x8b, 0x5d, 0x37, 0x99, 0xc1, 0xd0, 0x7a, 0xcd, 0xa6, 0x32, + 0x42, 0xb3, 0x51, 0x6e, 0xaf, 0x38, 0x4b, 0x9e, 0xbd, 0x88, 0x87, 0x95, + 0xda, 0xa1, 0xa9, 0x83, 0xce, 0x71, 0x57, 0x83, 0x85, 0x3a, 0xd3, 0x4f, + 0xa3, 0x66, 0x5f, 0x51, 0xc1, 0x67, 0x54, 0x4f, 0xdc, 0xc2, 0x6b, 0xa8, + 0xca, 0x6c, 0xc2, 0xae, 0x43, 0x92, 0xce, 0xba, 0x35, 0x77, 0xb5, 0x51, + 0x92, 0xb6, 0x92, 0x66, 0x5c, 0x94, 0x8f, 0x8b, 0x5d, 0x95, 0x70, 0xb6, + 0x4f, 0xd4, 0xbd, 0x37, 0xc5, 0xc5, 0x46, 0x8a, 0x4a, 0xb5, 0x59, 0x4b, + 0xb2, 0xc9, 0x7a, 0x93, 0x7b, 0x87, 0x79, 0x36, 0x34, 0x4e, 0xd2, 0x54, + 0x37, 0xc4, 0x53, 0x62, 0x9c, 0xc6, 0xba, 0x97, 0x83, 0x9d, 0xa9, 0x88, + 0x48, 0x91, 0x63, 0x29, 0x5d, 0x60, 0xa8, 0x8a, 0x85, 0x4a, 0x3a, 0x76, + 0x53, 0xaa, 0xb7, 0x85, 0xba, 0xd8, 0x77, 0x99, 0x42, 0xc8, 0x5f, 0x40, + 0x4b, 0xa0, 0x55, 0xcc, 0xc1, 0x8e, 0x64, 0x57, 0x79, 0x6c, 0x39, 0xcb, + 0xca, 0xb2, 0x96, 0x47, 0x9e, 0x79, 0x9f, 0xb9, 0x83, 0x8f, 0x95, 0x40, + 0x87, 0x56, 0x84, 0x47, 0x62, 0x9f, 0xb5, 0x8b, 0x98, 0x60, 0x70, 0x41, + 0xc0, 0x99, 0x5f, 0x31, 0x4f, 0xa9, 0x6a, 0x7e, 0x77, 0xd0, 0x4a, 0x95, + 0x77, 0xc5, 0x59, 0xcb, 0xdc, 0xd1, 0x76, 0xce, 0x70, 0x5b, 0x80, 0xae, + 0x36, 0x46, 0xb9, 0x41, 0x65, 0x44, 0x52, 0xca, 0xc3, 0x42, 0xbf, 0x8c, + 0x66, 0x84, 0x60, 0x87, 0x68, 0x3f, 0xb6, 0x5e, 0x55, 0xb3, 0x49, 0x96, + 0x50, 0xd6, 0x9d, 0x9e, 0x75, 0xb2, 0x35, 0x55, 0x4e, 0xa3, 0x63, 0x45, + 0x9f, 0x3a, 0x66, 0x85, 0xcd, 0x4b, 0x70, 0x8b, 0xa9, 0xbe, 0x82, 0x83, + 0xa6, 0xb4, 0x38, 0x6c, 0x50, 0xbd, 0x6d, 0x77, 0xc8, 0xad, 0x48, 0x6f, + 0x4d, 0x8e, 0xa4, 0x9d, 0xb8, 0x6d, 0xd8, 0xc9, 0xb4, 0xb3, 0x84, 0x91, + 0x8a, 0x6e, 0x5b, 0x55, 0x71, 0x45, 0x8a, 0xc8, 0x7a, 0x8f, 0x91, 0x3a, + 0x57, 0xb0, 0xa2, 0x67, 0xa6, 0x97, 0x51, 0x76, 0xbb, 0x9b, 0x59, 0x5e, + 0xa7, 0xa2, 0x6f, 0x75, 0xd4, 0x7d, 0x80, 0xb0, 0x88, 0x97, 0x7b, 0x3a, + 0xb2, 0xb9, 0x5a, 0x45, 0xa2, 0xa2, 0xc8, 0xc8, 0x42, 0x5b, 0xd6, 0x57, + 0x65, 0x63, 0xcc, 0x92, 0x90, 0x65, 0x46, 0x53, 0x89, 0x4c, 0x8c, 0x7e, + 0xa5, 0x78, 0xc4, 0xb1, 0x5d, 0x29, 0xb0, 0x6a, 0xd1, 0x66, 0xb8, 0x5a, + 0x8b, 0xb1, 0x5b, 0x3f, 0x6a, 0x8f, 0x57, 0x7e, 0x9d, 0xa7, 0x87, 0x67, + 0x94, 0x7f, 0xd4, 0x81, 0x89, 0x81, 0x88, 0x34, 0x30, 0x73, 0x77, 0x95, + 0xb9, 0xa6, 0xaa, 0x49, 0xc8, 0xaa, 0x58, 0xa8, 0x43, 0xae, 0x9f, 0xb8, + 0x2f, 0xb7, 0x78, 0x97, 0x81, 0x32, 0x2f, 0xc0, 0xbd, 0xd6, 0x4a, 0xcc, + 0x47, 0x7a, 0xd6, 0x9d, 0xc2, 0x51, 0x87, 0x5d, 0x43, 0x6c, 0xb6, 0xc9, + 0xcf, 0xb3, 0xc2, 0x93, 0xcd, 0x43, 0xa1, 0x7a, 0x39, 0xa2, 0x71, 0xd6, + 0x47, 0x7f, 0xb0, 0x9b, 0x9c, 0xbd, 0x43, 0x64, 0x50, 0x32, 0xa9, 0xc7, + 0xae, 0x67, 0x87, 0x83, 0x62, 0x4a, 0x89, 0x42, 0xa1, 0x57, 0x6a, 0x64, + 0x7d, 0x9f, 0xd2, 0x82, 0xb4, 0xb5, 0xcd, 0x9b, 0xb6, 0x3a, 0xb1, 0xc7, + 0x75, 0x71, 0x31, 0x43, 0x77, 0x57, 0x99, 0xd2, 0xc0, 0xac, 0x49, 0x77, + 0x86, 0xbf, 0x74, 0x77, 0x80, 0x4f, 0x80, 0x7d, 0x6f, 0xcd, 0xbb, 0x40, + 0x39, 0xa4, 0x5e, 0x3e, 0xa7, 0x54, 0x81, 0x40, 0xb6, 0xb3, 0xd5, 0x6b, + 0x8f, 0x4e, 0xa2, 0xb1, 0x5a, 0x50, 0x74, 0x3b, 0x64, 0x8b, 0x33, 0x3c, + 0xb3, 0x5d, 0x4f, 0x47, 0x3e, 0x91, 0xae, 0x90, 0x52, 0x40, 0x58, 0x80, + 0xc1, 0xb8, 0x3c, 0xbb, 0x7d, 0xa2, 0x4f, 0xb8, 0xa6, 0x59, 0x5d, 0x83, + 0x75, 0x69, 0xb5, 0x68, 0x4b, 0xad, 0x38, 0x4e, 0x52, 0xe1, 0x8a, 0x7f, + 
0xab, 0xa4, 0xab, 0x96, 0x2c, 0x36, 0x4c, 0x95, 0x73, 0x68, 0x54, 0x96, + 0xb8, 0xcd, 0x75, 0x48, 0x9b, 0x4c, 0x74, 0xc4, 0xd9, 0xa9, 0x4a, 0xc9, + 0x9d, 0xd9, 0x56, 0x8b, 0xab, 0x93, 0xa6, 0x7f, 0x84, 0x70, 0x41, 0x94, + 0xc9, 0x4b, 0x9c, 0x70, 0x62, 0xc3, 0xa8, 0x4e, 0x77, 0xc9, 0x3a, 0x55, + 0x95, 0x5e, 0xa5, 0x71, 0xaa, 0xb9, 0xb8, 0x97, 0x39, 0x67, 0xbb, 0x4a, + 0x7f, 0x98, 0xd1, 0x99, 0x5c, 0x6b, 0x63, 0xd1, 0xd0, 0xa6, 0x72, 0x5f, + 0xa6, 0x8f, 0x9a, 0xb4, 0x5a, 0x33, 0xac, 0x63, 0xb3, 0x53, 0xc0, 0xc2, + 0xb5, 0x67, 0xc4, 0x9e, 0x41, 0xa5, 0x96, 0x91, 0x6f, 0xb7, 0x45, 0x50, + 0x7a, 0x8b, 0x57, 0xa7, 0x73, 0xb0, 0x4a, 0x7a, 0x79, 0xca, 0xc8, 0x51, + 0x7e, 0x3e, 0x80, 0x8d, 0x4f, 0x86, 0x4e, 0xac, 0xc0, 0xc9, 0xa8, 0x3b, + 0x7f, 0x67, 0x86, 0x8b, 0x89, 0x34, 0xc6, 0x55, 0x62, 0x8f, 0x4f, 0x92, + 0x51, 0x41, 0x62, 0x79, 0x89, 0x51, 0xb4, 0xce, 0xa4, 0x44, 0xa2, 0x97, + 0x37, 0x97, 0xb9, 0xc0, 0xb3, 0x9e, 0x39, 0x32, 0xa3, 0x59, 0xab, 0xb2, + 0x5a, 0xb0, 0x72, 0xd0, 0x80, 0xbf, 0xd7, 0xca, 0xc5, 0x45, 0x6d, 0x64, + 0xb3, 0xb0, 0xbf, 0x56, 0xb3, 0x56, 0x9e, 0xac, 0xb1, 0xc7, 0x36, 0x51, + 0xba, 0x27, 0x93, 0xb7, 0x6f, 0x46, 0x28, 0x82, 0xb0, 0x21, 0x6d, 0x49, + 0xb8, 0x98, 0x37, 0x76, 0x93, 0x60, 0x8b, 0x79, 0xaa, 0x77, 0xc6, 0x6a, + 0xcf, 0xa9, 0x70, 0x41, 0x54, 0xa5, 0x65, 0x98, 0x81, 0x32, 0x65, 0x8c, + 0x36, 0xcd, 0xde, 0x32, 0x58, 0x66, 0x96, 0x55, 0x3c, 0xb1, 0x55, 0x88, + 0x9d, 0x78, 0xc7, 0xb7, 0x63, 0x97, 0x5e, 0x38, 0x65, 0xc0, 0xa9, 0xa3, + 0xaa, 0x2c, 0x64, 0x5f, 0x4c, 0x7e, 0xa1, 0x72, 0x61, 0xb0, 0xab, 0x57, + 0x4a, 0x99, 0xab, 0xaa, 0x8a, 0xca, 0x71, 0x2f, 0x4f, 0x9a, 0x61, 0xaa, + 0xbf, 0x5c, 0x48, 0x4f, 0x40, 0x33, 0xc5, 0xb3, 0x7e, 0x59, 0x6a, 0x7e, + 0x66, 0x37, 0x4e, 0x96, 0xb9, 0x80, 0x39, 0x77, 0x4f, 0x3f, 0x52, 0xa4, + 0x47, 0xb7, 0xb1, 0x64, 0xaa, 0x8f, 0xca, 0x98, 0xd3, 0xa3, 0x4f, 0xca, + 0x9a, 0x9e, 0xc7, 0x95, 0xb1, 0x58, 0x6a, 0x9c, 0x9a, 0x97, 0x8b, 0xcc, + 0xa2, 0xb2, 0x63, 0xa2, 0x87, 0xa2, 0xae, 0x54, 0x30, 0x6c, 0xab, 0x66, + 0x4a, 0x87, 0x7b, 0x31, 0x9d, 0x64, 0xbe, 0xcf, 0x84, 0x90, 0x84, 0x55, + 0x75, 0x9d, 0x82, 0xa6, 0xae, 0x48, 0x54, 0x86, 0x83, 0xba, 0x3b, 0x3b, + 0x4a, 0x87, 0xa7, 0x6e, 0x4a, 0x70, 0x57, 0x61, 0x4f, 0xc9, 0x8d, 0xbe, + 0xcd, 0x41, 0xa5, 0xa8, 0x4b, 0x33, 0x8f, 0xc4, 0x50, 0x71, 0x8a, 0xb2, + 0xad, 0xb0, 0x66, 0x98, 0x3c, 0x61, 0x5f, 0x70, 0xab, 0x8e, 0xad, 0xc7, + 0xaa, 0x2b, 0xae, 0x98, 0x8b, 0x4c, 0x76, 0xb1, 0x93, 0xb8, 0x8e, 0x82, + 0x32, 0x9b, 0x53, 0x62, 0xd1, 0x86, 0x2c, 0x72, 0xc8, 0x64, 0x70, 0x62, + 0xcd, 0xae, 0x4d, 0x9c, 0xaf, 0x8f, 0x8a, 0x76, 0x31, 0xc8, 0xb5, 0xb9, + 0x41, 0x6f, 0xb6, 0x49, 0x6e, 0xcd, 0x44, 0x69, 0xbf, 0x71, 0x5d, 0xc0, + 0x71, 0x98, 0x7a, 0x7e, 0xd0, 0x95, 0x87, 0xa8, 0x96, 0xb6, 0x58, 0x59, + 0xa3, 0x46, 0xcf, 0x3f, 0x3b, 0x6c, 0x6e, 0x8f, 0xbb, 0x62, 0x6b, 0x81, + 0x63, 0x5b, 0x77, 0xc5, 0xcd, 0xc9, 0x68, 0xbd, 0xd2, 0x8d, 0x3f, 0x45, + 0xca, 0x5b, 0x98, 0x9c, 0xb9, 0x94, 0xa4, 0x92, 0x41, 0xb0, 0xb4, 0x9f, + 0x33, 0x5e, 0xa6, 0x7a, 0xb0, 0x65, 0x3b, 0x83, 0x9d, 0x40, 0x69, 0x3e, + 0x8d, 0xc5, 0xa1, 0xa6, 0xc8, 0x50, 0x5a, 0x49, 0x41, 0x6a, 0x99, 0x86, + 0x97, 0x31, 0x8a, 0x83, 0xd6, 0xae, 0xbc, 0x8d, 0x68, 0xb1, 0xb6, 0xba, + 0xce, 0x5f, 0xc8, 0xcd, 0xc3, 0x57, 0xb3, 0x5d, 0x65, 0x79, 0x42, 0x46, + 0x65, 0xb3, 0xcd, 0x6b, 0x72, 0xaf, 0x81, 0x46, 0x7f, 0x3e, 0x51, 0x4a, + 0xbe, 0x31, 0x40, 0x3c, 0x3b, 0x8d, 0x64, 0x88, 0xbb, 0x44, 0x9e, 0xc2, + 0x79, 0x8f, 0x51, 0x5e, 0xb2, 0x91, 0x9b, 0x9f, 0xb1, 0x8f, 0x68, 0xaf, + 0x51, 0x96, 0xae, 0x93, 0xd1, 0x3f, 0x2a, 0x89, 0x61, 0x89, 0x45, 0xb8, + 
0x43, 0x71, 0x50, 0x94, 0xce, 0x7a, 0x2c, 0xb1, 0x43, 0x6d, 0x33, 0x5a, + 0x76, 0x5f, 0x65, 0x7e, 0x3a, 0x36, 0x21, 0xba, 0x6c, 0xab, 0xbf, 0x46, + 0x94, 0xd2, 0x83, 0x89, 0xb8, 0x7a, 0x4a, 0x42, 0x7d, 0x6f, 0x3f, 0xaf, + 0x43, 0xc6, 0xb2, 0x91, 0xcd, 0xba, 0x68, 0x54, 0xa2, 0x46, 0xc2, 0xbb, + 0xc7, 0x78, 0x53, 0x3b, 0xad, 0x4b, 0x50, 0x67, 0xb3, 0xcc, 0xc4, 0x99, + 0xaf, 0x6a, 0xa1, 0xb0, 0x6a, 0x61, 0x97, 0xb5, 0x86, 0x91, 0x66, 0x3a, + 0x95, 0xac, 0x5f, 0xb5, 0x48, 0x55, 0x35, 0x8b, 0x62, 0x8a, 0x33, 0x5f, + 0xcd, 0x31, 0xc3, 0x41, 0x67, 0x7a, 0xaa, 0xc1, 0x93, 0x90, 0xa0, 0xad, + 0x38, 0x4f, 0xa3, 0x4e, 0x42, 0x69, 0xaf, 0x44, 0x7a, 0xd0, 0x8a, 0x82, + 0x76, 0xb1, 0x4c, 0x8a, 0x89, 0xb4, 0x96, 0x53, 0xb1, 0x9a, 0xb7, 0x46, + 0x93, 0xa4, 0x8a, 0xad, 0x48, 0x3f, 0xa6, 0x70, 0xc4, 0xb9, 0x4c, 0x30, + 0xbc, 0x9e, 0x83, 0x3f, 0xc0, 0xa8, 0xa1, 0x5f, 0x9b, 0x96, 0xbc, 0x88, + 0x58, 0x3f, 0xd7, 0x55, 0x6c, 0xce, 0x52, 0x3b, 0x59, 0x77, 0x7a, 0xac, + 0x82, 0x41, 0x68, 0xc6, 0x5e, 0x40, 0x31, 0xa3, 0x86, 0xa4, 0xcb, 0x3a, + 0xc4, 0x86, 0x7f, 0xc3, 0x61, 0x92, 0xcf, 0x93, 0x93, 0xb0, 0x49, 0x9d, + 0x78, 0x4b, 0x91, 0x36, 0xd0, 0xc1, 0x35, 0x8c, 0x91, 0xb6, 0xc4, 0x86, + 0x80, 0x74, 0x57, 0x52, 0xa4, 0x25, 0x59, 0xca, 0xa6, 0x69, 0x5c, 0x53, + 0xa8, 0x57, 0x36, 0x96, 0x9d, 0x54, 0xb5, 0xcc, 0x66, 0x75, 0x64, 0x7d, + 0x7f, 0x5b, 0xad, 0xb6, 0xad, 0x51, 0xa8, 0xc4, 0x70, 0xc9, 0x38, 0xd1, + 0x70, 0xc8, 0xb9, 0x87, 0x7a, 0xc7, 0x62, 0x59, 0x68, 0x5b, 0xa7, 0x53, + 0x62, 0xb0, 0x82, 0xa4, 0xae, 0x65, 0xbc, 0x45, 0xcb, 0x37, 0x28, 0x5d, + 0xc5, 0x75, 0xa4, 0x42, 0xb9, 0x39, 0xa8, 0x7a, 0xcf, 0x79, 0x56, 0x27, + 0x3f, 0xba, 0xb2, 0x3a, 0x74, 0x81, 0xb1, 0x73, 0xae, 0xbd, 0xb8, 0xac, + 0x5c, 0xce, 0xa7, 0x75, 0x74, 0x41, 0xb0, 0x93, 0x4b, 0xc0, 0x7f, 0xcd, + 0xb8, 0x3b, 0xda, 0x7a, 0x32, 0xb6, 0x80, 0xa3, 0xa8, 0xa6, 0x5d, 0xb1, + 0x88, 0x6e, 0x97, 0x79, 0xac, 0x40, 0x3a, 0xa6, 0x9a, 0x70, 0x78, 0x51, + 0xc1, 0x65, 0xae, 0x91, 0x50, 0x79, 0x34, 0x77, 0x87, 0x68, 0x8e, 0xa7, + 0xa0, 0x9a, 0x24, 0xc2, 0x7d, 0x69, 0x77, 0xd5, 0x90, 0x65, 0xc6, 0x63, + 0x4a, 0x96, 0xaf, 0x46, 0xd5, 0xc2, 0x2b, 0xbd, 0x43, 0x44, 0x62, 0x99, + 0xaf, 0xa9, 0x58, 0x3f, 0x84, 0xc6, 0xc3, 0xb0, 0x79, 0x89, 0xba, 0x3f, + 0xad, 0x3d, 0x31, 0x33, 0xa1, 0x65, 0x65, 0x54, 0x95, 0x37, 0x53, 0x78, + 0xaf, 0x29, 0x42, 0xd0, 0x45, 0xbb, 0xb8, 0x89, 0x46, 0x9c, 0x2d, 0x6b, + 0xb8, 0xb6, 0x5d, 0x75, 0x92, 0x40, 0x76, 0x2c, 0x49, 0x97, 0x50, 0x96, + 0x78, 0xce, 0x53, 0xa8, 0x5b, 0xb9, 0xa1, 0x51, 0x77, 0x63, 0x9e, 0x98, + 0x6f, 0x81, 0xb5, 0x95, 0x99, 0x26, 0x78, 0xb7, 0x73, 0x50, 0xab, 0x38, + 0xb6, 0x5b, 0x42, 0x35, 0x72, 0x3e, 0x8b, 0xad, 0x58, 0x3f, 0x7f, 0x8f, + 0xa4, 0x65, 0x40, 0x73, 0x80, 0x4f, 0x39, 0xbe, 0x8b, 0x2f, 0x85, 0x51, + 0x34, 0x4e, 0xad, 0xaa, 0x5e, 0x88, 0xca, 0xaf, 0xad, 0x75, 0x39, 0x36, + 0x9b, 0xc9, 0xbe, 0x5d, 0xce, 0x7d, 0x59, 0xa0, 0x98, 0x70, 0xa0, 0x7a, + 0x5d, 0x4d, 0x9a, 0xcb, 0xb9, 0x4d, 0x2a, 0x2e, 0x8a, 0x89, 0x80, 0x51, + 0x6a, 0x6d, 0x82, 0x35, 0xbd, 0xa4, 0x92, 0x8b, 0x9f, 0x53, 0x34, 0x57, + 0xe4, 0xdb, 0xd0, 0x9c, 0xa5, 0x62, 0x8e, 0x75, 0x91, 0x67, 0x9b, 0x6a, + 0x4c, 0xbe, 0x3e, 0xb7, 0x46, 0x3d, 0x98, 0x3b, 0x4d, 0xb4, 0x6d, 0x6e, + 0x66, 0x79, 0xb4, 0x63, 0x7e, 0xb1, 0x6e, 0x99, 0x33, 0x72, 0x52, 0x79, + 0xa8, 0x6f, 0x66, 0xc7, 0x35, 0x3e, 0x6f, 0xc9, 0x81, 0x35, 0x75, 0xae, + 0x33, 0x44, 0x48, 0xbf, 0x69, 0x2c, 0x9d, 0xcd, 0xc4, 0x5d, 0xb1, 0x8c, + 0xb6, 0x51, 0x8b, 0x6a, 0x68, 0x48, 0xc2, 0x6b, 0x2d, 0xc0, 0x6a, 0xba, + 0x34, 0x96, 0xb9, 0x47, 0x63, 0xa3, 0x8a, 0x40, 0x58, 0xc5, 0x96, 0x3b, + 
0x80, 0x9f, 0xa6, 0x5f, 0x81, 0xcb, 0xb3, 0x4e, 0xaa, 0xc1, 0x88, 0x83, + 0xb9, 0xc6, 0x91, 0x6a, 0xb8, 0x6d, 0xb1, 0x3f, 0x3f, 0x4d, 0x65, 0x6f, + 0x6e, 0x7d, 0x49, 0x82, 0xc7, 0x7f, 0xab, 0xaa, 0x61, 0x8d, 0x5a, 0x94, + 0x29, 0x81, 0x9a, 0xaf, 0x7d, 0x6c, 0x6f, 0x87, 0x60, 0xa6, 0xb6, 0xb6, + 0xc7, 0x3f, 0x49, 0xc3, 0x33, 0xbf, 0xc8, 0x79, 0x55, 0xd3, 0xcd, 0x3c, + 0x5f, 0xad, 0xaf, 0x27, 0x5a, 0xb8, 0x81, 0x60, 0x66, 0x5e, 0xbe, 0x6c, + 0x66, 0x6c, 0xcb, 0xc2, 0xca, 0x35, 0x54, 0x77, 0xb5, 0x64, 0x36, 0x5c, + 0x7c, 0xb6, 0x46, 0x44, 0x57, 0xa4, 0x83, 0x89, 0x95, 0x69, 0x7e, 0x37, + 0xbb, 0x85, 0x48, 0x9e, 0xcf, 0x3f, 0x45, 0xbf, 0xa1, 0x9c, 0x7d, 0x2d, + 0xc4, 0xa4, 0x2c, 0x86, 0x5f, 0xaf, 0xaa, 0x66, 0x87, 0x33, 0x56, 0x69, + 0x45, 0x35, 0xb0, 0x72, 0x49, 0xaf, 0x96, 0x48, 0xb7, 0x95, 0x55, 0x52, + 0x9c, 0x2a, 0x31, 0x9b, 0x44, 0x8a, 0x5a, 0x5c, 0x9b, 0xd9, 0x39, 0x5b, + 0x44, 0x23, 0x96, 0xa8, 0x40, 0xbe, 0x8f, 0x92, 0x88, 0x55, 0x73, 0x98, + 0xb4, 0x97, 0xbe, 0x47, 0x9a, 0xd2, 0x4e, 0x42, 0x79, 0xc5, 0x93, 0x4d, + 0x60, 0x7b, 0x6f, 0x84, 0xd1, 0x9f, 0xaf, 0xb5, 0xb7, 0x8d, 0x41, 0xac, + 0x4f, 0x52, 0x35, 0x28, 0x5f, 0x8b, 0x6e, 0x97, 0x4f, 0xa0, 0x74, 0x29, + 0x5d, 0xc1, 0xaa, 0x8f, 0xb5, 0x44, 0x7b, 0xaf, 0x43, 0x6a, 0xa4, 0x55, + 0x63, 0xd0, 0x88, 0xb0, 0x3d, 0x80, 0x3f, 0x96, 0xaa, 0x88, 0x53, 0x7c, + 0xd7, 0x49, 0x85, 0x2b, 0x94, 0x90, 0x8f, 0x36, 0x6e, 0xd5, 0xad, 0x3c, + 0x7c, 0x6b, 0x93, 0x87, 0xdb, 0x51, 0xc5, 0x85, 0x7a, 0xc0, 0x91, 0xa2, + 0x87, 0xbf, 0xd0, 0xad, 0x92, 0xa2, 0xc6, 0x97, 0xa7, 0xa3, 0x58, 0x76, + 0x27, 0xc4, 0xc5, 0xb9, 0xa5, 0x86, 0xc3, 0x6a, 0x5f, 0x88, 0xaa, 0x4e, + 0x6d, 0x6a, 0x32, 0x6b, 0x6d, 0x4a, 0x8f, 0x78, 0x33, 0x55, 0xc6, 0x2d, + 0x7f, 0x70, 0x66, 0xb5, 0x33, 0x85, 0x4a, 0x7a, 0xbc, 0x7a, 0xa8, 0x6a, + 0x96, 0xc7, 0x76, 0x79, 0x37, 0x5d, 0x4c, 0x42, 0xc8, 0x74, 0xa5, 0x2a, + 0x63, 0x5b, 0x2c, 0xc0, 0x57, 0xc9, 0x3e, 0xc7, 0x61, 0x34, 0x99, 0x39, + 0xae, 0xa0, 0x1e, 0x49, 0x43, 0xb2, 0x89, 0x8e, 0x33, 0x47, 0x6f, 0x61, + 0xd4, 0xac, 0x57, 0xb6, 0x9e, 0x34, 0xc5, 0x6b, 0x38, 0x91, 0x5e, 0xbd, + 0x84, 0x61, 0x47, 0xb9, 0x63, 0x50, 0x55, 0xa2, 0xad, 0xa9, 0xa2, 0x9c, + 0x5e, 0x53, 0xcc, 0x1f, 0x48, 0x93, 0x6e, 0xb5, 0x67, 0xd1, 0x7c, 0xbc, + 0xcd, 0x45, 0x4d, 0x7a, 0xb1, 0xa6, 0xc0, 0x88, 0xc8, 0x8b, 0x7d, 0x6d, + 0x71, 0xcc, 0x47, 0xb6, 0x81, 0xb1, 0x59, 0x74, 0x7b, 0xa9, 0x5b, 0xc5, + 0x96, 0x75, 0x2d, 0x7e, 0x4c, 0x23, 0x72, 0x79, 0x93, 0x6a, 0x44, 0x95, + 0x88, 0xd1, 0x36, 0xa8, 0x35, 0x49, 0x5f, 0x35, 0x74, 0x8b, 0xad, 0xbd, + 0x3d, 0xb3, 0x78, 0x5c, 0x6a, 0x59, 0xdd, 0xd1, 0x53, 0xb0, 0x32, 0x37, + 0x3a, 0xc8, 0x70, 0x8e, 0xbe, 0x41, 0x32, 0xb3, 0x7b, 0x8f, 0x37, 0x71, + 0xca, 0x82, 0xcd, 0x8e, 0x65, 0x50, 0x9c, 0xcc, 0x7b, 0x69, 0x8f, 0x68, + 0x67, 0xcb, 0x8e, 0x6e, 0x7e, 0x44, 0x5a, 0x3d, 0x6d, 0x3a, 0xc1, 0xa5, + 0x33, 0xad, 0x85, 0x46, 0xbf, 0x5b, 0x9f, 0x3f, 0x50, 0x3e, 0x59, 0x74, + 0xcc, 0xc1, 0x6b, 0x5f, 0xbe, 0x95, 0x9c, 0xa8, 0x7d, 0xa2, 0x46, 0xae, + 0xc3, 0xc3, 0x81, 0x68, 0xc2, 0x72, 0xa4, 0xa0, 0x82, 0x8b, 0xaa, 0x89, + 0xb8, 0xd9, 0x19, 0x9f, 0xda, 0xc3, 0x52, 0xb5, 0xca, 0x9b, 0xbb, 0x41, + 0x5a, 0x4d, 0x46, 0xb3, 0x8e, 0xb6, 0x2f, 0x3c, 0x43, 0xb8, 0xbe, 0x56, + 0xb8, 0x63, 0xcb, 0x66, 0x87, 0xb9, 0xba, 0x34, 0x97, 0x82, 0x85, 0x44, + 0x27, 0x32, 0x48, 0x77, 0x44, 0x66, 0x99, 0xde, 0xa4, 0x9a, 0x58, 0x3b, + 0x80, 0x87, 0x7f, 0x92, 0xaa, 0x95, 0xd3, 0xd1, 0xaf, 0x67, 0xb2, 0x54, + 0x79, 0x5e, 0xb1, 0x5f, 0x4d, 0xba, 0x52, 0xa2, 0x3a, 0x49, 0x82, 0xa2, + 0xbe, 0x62, 0x80, 0x4b, 0xbd, 0x86, 0xd1, 0xb3, 0xd1, 0x8c, 0xd8, 0x95, + 
0x60, 0xc6, 0xa5, 0xbc, 0xc7, 0x5b, 0x50, 0xc2, 0xa5, 0xbd, 0xa1, 0xbc, + 0x72, 0x8b, 0x87, 0x40, 0x70, 0xcc, 0xc0, 0x6c, 0x48, 0x8f, 0x9b, 0x3c, + 0x64, 0xa2, 0x49, 0x7b, 0x65, 0xc9, 0x76, 0x56, 0x70, 0xba, 0x61, 0x6d, + 0x41, 0x44, 0x6b, 0x52, 0xaf, 0x5d, 0x90, 0xab, 0x70, 0x4f, 0x7d, 0x48, + 0xc4, 0xc6, 0xa6, 0x7d, 0xae, 0xba, 0xa8, 0xaa, 0x93, 0xb4, 0x4e, 0x87, + 0x98, 0xa0, 0x9a, 0x6a, 0x5b, 0xcc, 0x55, 0x7e, 0x88, 0x7c, 0x42, 0x6e, + 0x83, 0x40, 0x7a, 0xba, 0x8b, 0x6e, 0x8f, 0xbb, 0x99, 0xaf, 0xc6, 0x54, + 0xb6, 0x7b, 0x64, 0xbc, 0xcf, 0x48, 0xa8, 0x87, 0xa3, 0xb4, 0x52, 0x3a, + 0x52, 0xd6, 0x9b, 0xc3, 0x4b, 0xb0, 0xbd, 0x28, 0x42, 0x8a, 0x9d, 0x34, + 0x97, 0x7e, 0x76, 0x7e, 0x32, 0x4d, 0x79, 0xce, 0x6d, 0xc0, 0xc2, 0x79, + 0xbf, 0x40, 0xd0, 0xab, 0x39, 0x9e, 0xc1, 0x78, 0x72, 0x30, 0x8c, 0x6b, + 0x93, 0x50, 0x38, 0xa5, 0x30, 0x40, 0x47, 0xd4, 0xc7, 0x9e, 0x84, 0xb2, + 0xd3, 0x7a, 0x55, 0xb7, 0x9a, 0x9a, 0xcd, 0x3b, 0x61, 0x66, 0x64, 0x4f, + 0x90, 0x51, 0xc9, 0x56, 0x6a, 0x82, 0xbd, 0xc4, 0x33, 0xa1, 0xbb, 0xbb, + 0x94, 0x41, 0xcc, 0x8d, 0xc2, 0xcf, 0x90, 0x56, 0xc7, 0xb0, 0x83, 0xbd, + 0x7c, 0x58, 0x6b, 0xca, 0x53, 0xa6, 0xc8, 0xb4, 0x72, 0x43, 0xd7, 0x64, + 0x57, 0x8b, 0x52, 0xa9, 0x57, 0xb2, 0x3d, 0x8f, 0x58, 0x6a, 0x49, 0xa5, + 0x6e, 0xba, 0x38, 0x93, 0x5f, 0x7f, 0x78, 0xa0, 0xc8, 0x74, 0xaa, 0x3b, + 0x3a, 0x65, 0xd1, 0x37, 0x98, 0x6b, 0x45, 0xa9, 0xaa, 0x67, 0x5a, 0xc6, + 0x8a, 0x6a, 0x47, 0x8f, 0x5b, 0xa8, 0x4d, 0x57, 0x62, 0xd0, 0x61, 0x66, + 0x9c, 0x69, 0x60, 0xc9, 0x94, 0x8c, 0x8b, 0x6c, 0x7c, 0x57, 0x87, 0x95, + 0xad, 0xab, 0x47, 0xba, 0x8e, 0x72, 0xb2, 0x5f, 0x73, 0xd1, 0x41, 0x4f, + 0x88, 0xcd, 0x52, 0x71, 0x96, 0xe7, 0xc5, 0xc3, 0x87, 0x3e, 0x9d, 0x6f, + 0x3d, 0xd1, 0x8c, 0x71, 0xc7, 0x7f, 0x36, 0x78, 0x36, 0x9e, 0x52, 0x81, + 0x93, 0x78, 0xbc, 0x7a, 0x6c, 0x52, 0xbe, 0x9f, 0xa0, 0x30, 0x8d, 0xde, + 0x47, 0x47, 0x82, 0x8d, 0xb8, 0x65, 0x94, 0x9a, 0x76, 0xc5, 0x3d, 0x60, + 0xb1, 0x53, 0xbe, 0x4d, 0xb8, 0x96, 0xa3, 0xa1, 0xb4, 0x52, 0xca, 0xb5, + 0x39, 0x66, 0x52, 0xcb, 0x7f, 0x44, 0x76, 0xa8, 0xa3, 0x6b, 0x41, 0x74, + 0x5e, 0x70, 0x95, 0x3c, 0xaf, 0xaa, 0x89, 0x8e, 0xbf, 0x67, 0xa9, 0x5b, + 0x55, 0x6b, 0x9c, 0xcc, 0x40, 0xbc, 0x71, 0xb1, 0x68, 0x86, 0x8f, 0x7a, + 0xa2, 0x98, 0x7f, 0x38, 0x76, 0x94, 0x78, 0x64, 0x9e, 0xb6, 0xbc, 0xcb, + 0xaf, 0xaa, 0x76, 0x36, 0xbe, 0x87, 0x5a, 0x45, 0xaf, 0xce, 0x54, 0x46, + 0x6c, 0x84, 0x5a, 0x79, 0xa2, 0xa3, 0x6e, 0x8c, 0xc1, 0x56, 0xce, 0x8c, + 0x98, 0x76, 0xb1, 0x8f, 0xb1, 0x7a, 0xd2, 0x78, 0x47, 0xbe, 0x32, 0x89, + 0x84, 0x7e, 0x30, 0xa1, 0xb4, 0xa3, 0xc0, 0x5e, 0x3a, 0x6c, 0xc3, 0x77, + 0x8a, 0x6d, 0xc2, 0x8d, 0xb3, 0xac, 0x25, 0x4e, 0x7a, 0x4b, 0x98, 0x29, + 0x7b, 0x79, 0xa5, 0x46, 0x39, 0xb0, 0x38, 0x8b, 0x8c, 0x9c, 0xa1, 0xc6, + 0xa0, 0x39, 0xaf, 0x66, 0xbb, 0xab, 0x86, 0x71, 0x92, 0x7f, 0x6a, 0x99, + 0x8a, 0xaf, 0x4e, 0xcb, 0x94, 0x62, 0xc5, 0xc0, 0xa8, 0xc6, 0xc2, 0x3e, + 0xc2, 0x56, 0x9b, 0x4f, 0x8c, 0xa5, 0xb9, 0x83, 0xa3, 0xb2, 0x8d, 0x5f, + 0x74, 0x66, 0x6a, 0xc1, 0xcc, 0xa8, 0x23, 0xaf, 0xd1, 0x90, 0x39, 0x2b, + 0x74, 0xc8, 0x5c, 0xaf, 0x79, 0xdd, 0x9f, 0xbd, 0xc4, 0x4b, 0xab, 0xb4, + 0x77, 0x58, 0x78, 0x4d, 0xa0, 0x83, 0x83, 0x21, 0xa5, 0x74, 0x95, 0x40, + 0x82, 0xc6, 0xc7, 0x93, 0x78, 0x92, 0x94, 0xb9, 0x99, 0x5e, 0x81, 0xa4, + 0x83, 0x58, 0xda, 0xbb, 0x4f, 0x3a, 0x7b, 0x6a, 0x3a, 0x43, 0x8c, 0x61, + 0xc9, 0xc8, 0x40, 0x7c, 0xa7, 0xd0, 0x47, 0xa5, 0x94, 0xd0, 0x49, 0x73, + 0x91, 0x8e, 0x79, 0xcf, 0x79, 0x74, 0x93, 0x36, 0x90, 0x55, 0x91, 0x85, + 0x6a, 0xc8, 0x93, 0x76, 0x47, 0x90, 0xc2, 0xb0, 0xbd, 0x62, 0x67, 0x5b, + 
0x3c, 0x5b, 0x69, 0x4d, 0x7d, 0xa0, 0x5a, 0xbb, 0x97, 0x4a, 0x85, 0x7d, + 0xac, 0x6d, 0x6c, 0xab, 0xc5, 0x67, 0xa6, 0x9e, 0x9b, 0x89, 0x60, 0x7f, + 0x33, 0x2d, 0x89, 0x9b, 0x93, 0x35, 0x46, 0x5c, 0x45, 0x64, 0x58, 0x69, + 0x94, 0x78, 0x61, 0xb2, 0x51, 0x45, 0xae, 0x9a, 0x5c, 0x7a, 0x9b, 0x9f, + 0x82, 0xbe, 0xc3, 0x5d, 0x4c, 0x78, 0x47, 0x55, 0x9e, 0x84, 0x99, 0x42, + 0x7b, 0xba, 0x96, 0x2c, 0x92, 0x44, 0xbd, 0x34, 0xae, 0x3e, 0x96, 0xd6, + 0x52, 0x31, 0xaf, 0x5c, 0x9d, 0xc4, 0xb9, 0x92, 0x8d, 0x5a, 0xcf, 0x7b, + 0x85, 0xa7, 0x4f, 0x35, 0xa6, 0xa1, 0xc8, 0x47, 0x57, 0x34, 0x86, 0x76, + 0x33, 0xce, 0xb4, 0xc0, 0x81, 0xcc, 0x8e, 0xa5, 0x34, 0x83, 0x5f, 0x56, + 0x63, 0xad, 0xbd, 0x5e, 0xd9, 0x98, 0x6b, 0x83, 0xba, 0xc3, 0xbd, 0x83, + 0xc2, 0x53, 0x6c, 0x74, 0xc6, 0x78, 0x77, 0xbf, 0x7b, 0x7e, 0x65, 0x89, + 0x98, 0xbb, 0x5e, 0x8a, 0x66, 0xcc, 0xaa, 0x90, 0x4e, 0x94, 0x33, 0xba, + 0x69, 0xe3, 0xca, 0x93, 0x39, 0x50, 0x31, 0x8f, 0x34, 0x97, 0x75, 0x2f, + 0xc7, 0xc4, 0x7e, 0xc5, 0x78, 0x3f, 0x48, 0xac, 0x61, 0xb3, 0x8a, 0x58, + 0x94, 0x59, 0x6f, 0x9f, 0x48, 0xaa, 0xc2, 0xaf, 0xca, 0x39, 0x43, 0x7e, + 0xbc, 0xb8, 0x65, 0x63, 0xc6, 0x3b, 0x89, 0xac, 0x44, 0xb9, 0x8a, 0x46, + 0x47, 0x56, 0x3a, 0x6c, 0x52, 0x39, 0x89, 0x61, 0xbf, 0x65, 0x63, 0x8f, + 0xa4, 0xd6, 0x34, 0x47, 0x77, 0x75, 0xa0, 0xaf, 0xa0, 0x86, 0x7e, 0xc8, + 0xa6, 0x77, 0x73, 0x32, 0xa1, 0x5b, 0x8c, 0xaf, 0xd1, 0x4c, 0x52, 0x54, + 0xbb, 0xbe, 0x53, 0x6a, 0x6e, 0xb2, 0x63, 0xb7, 0x5e, 0x82, 0xca, 0x3d, + 0x3d, 0xcd, 0x6b, 0xb4, 0x6c, 0x7a, 0xa4, 0x8e, 0x91, 0x7f, 0xb7, 0xae, + 0xab, 0x7c, 0x4e, 0x34, 0x80, 0x64, 0xa0, 0x48, 0xa1, 0xa0, 0xc8, 0x8e, + 0xcb, 0xbc, 0x6c, 0x54, 0xaf, 0x5f, 0xbf, 0x36, 0x87, 0x88, 0xcc, 0x76, + 0x99, 0x87, 0x43, 0xd7, 0x79, 0x8f, 0xcc, 0xa9, 0x6c, 0x8c, 0x7c, 0x33, + 0x78, 0x37, 0x73, 0x27, 0x4b, 0xa5, 0x7c, 0xad, 0xc2, 0x33, 0x41, 0x85, + 0xac, 0xb7, 0xc4, 0x62, 0x95, 0x61, 0xd5, 0xb7, 0x52, 0x69, 0xcf, 0x94, + 0x6c, 0xe0, 0xaf, 0x69, 0x78, 0x49, 0x7a, 0x89, 0x3d, 0x3b, 0xb0, 0x2f, + 0x5b, 0xa8, 0x87, 0x52, 0xb6, 0x38, 0x4e, 0x4f, 0x36, 0x5e, 0xa9, 0x85, + 0x5a, 0xcd, 0x58, 0xae, 0xd0, 0xad, 0xae, 0x52, 0x6a, 0xcf, 0x4d, 0xa9, + 0xd6, 0xa7, 0x2c, 0x38, 0x7d, 0xce, 0x51, 0xc8, 0xbc, 0x33, 0x98, 0xc1, + 0x34, 0x47, 0x8a, 0x4c, 0x37, 0x3e, 0x91, 0x40, 0x48, 0x99, 0xc3, 0xb3, + 0xbd, 0x44, 0x6b, 0x6a, 0xb0, 0x6a, 0xd5, 0xc6, 0xc3, 0xc4, 0xa5, 0x4a, + 0x76, 0x42, 0xa9, 0x8c, 0x44, 0x40, 0x40, 0x52, 0xcb, 0x56, 0x86, 0x9f, + 0xad, 0xce, 0xbf, 0xae, 0xc7, 0xc6, 0x54, 0x54, 0x8b, 0x6a, 0x38, 0x61, + 0xc7, 0x74, 0xa4, 0xaf, 0x53, 0x58, 0x7a, 0xb0, 0xad, 0x52, 0xbb, 0x36, + 0x51, 0x55, 0x99, 0x59, 0x90, 0x9c, 0x7e, 0x95, 0xb7, 0x8e, 0xb5, 0x48, + 0x75, 0x79, 0x37, 0x9a, 0x61, 0xa3, 0x68, 0x4d, 0x6e, 0x9b, 0x7f, 0x85, + 0x44, 0xb9, 0xc7, 0xaa, 0xcb, 0x67, 0xb3, 0x3d, 0x90, 0xc7, 0x5b, 0xa7, + 0x9c, 0x7c, 0x6f, 0x80, 0x87, 0xae, 0xba, 0x44, 0xc1, 0x4e, 0x96, 0x76, + 0x8d, 0x89, 0x54, 0xa0, 0x56, 0xa2, 0xc9, 0x78, 0x55, 0x5f, 0x4f, 0x4b, + 0xc5, 0x6f, 0xae, 0x5f, 0x91, 0x6d, 0xcc, 0xa3, 0x99, 0xb3, 0x52, 0xc4, + 0x91, 0xa7, 0xc1, 0x83, 0x52, 0xcf, 0x8c, 0x42, 0x9e, 0x55, 0x4d, 0x9b, + 0x2e, 0x6a, 0x55, 0x82, 0xa3, 0x86, 0x7e, 0x56, 0xa9, 0x42, 0x88, 0x4a, + 0x44, 0x4d, 0x9a, 0xaf, 0x48, 0x9c, 0x45, 0x37, 0x78, 0xb4, 0x59, 0xab, + 0x70, 0x40, 0xbe, 0x42, 0xcc, 0xc5, 0x3d, 0xcf, 0x55, 0xbb, 0xb4, 0xa4, + 0xc8, 0x85, 0xa5, 0x4c, 0xa7, 0xd4, 0xcb, 0xc0, 0x84, 0x5b, 0x8d, 0x6a, + 0x2d, 0x59, 0xae, 0x57, 0x48, 0x47, 0x66, 0xab, 0xd8, 0xa8, 0x5f, 0x46, + 0x38, 0x40, 0xa1, 0x4f, 0xc7, 0x3f, 0x85, 0xc6, 0x99, 0x88, 0x94, 0xd2, + 
0x42, 0xb2, 0x34, 0x9e, 0x67, 0x46, 0xca, 0xac, 0x57, 0x42, 0x5c, 0xab, + 0x7d, 0xb9, 0xa0, 0x72, 0x87, 0xa9, 0xba, 0x3f, 0x80, 0x3d, 0x83, 0x90, + 0xa2, 0x6b, 0xa8, 0x41, 0x41, 0x2f, 0x37, 0x56, 0xd0, 0x91, 0xa8, 0x4f, + 0x54, 0x70, 0x6e, 0xa1, 0x3f, 0x43, 0xa3, 0x3d, 0x8f, 0x55, 0x9c, 0xd4, + 0xac, 0x66, 0xbd, 0x51, 0x9a, 0xae, 0x36, 0x69, 0x50, 0x76, 0x81, 0xa0, + 0x79, 0x61, 0x95, 0xac, 0x81, 0x35, 0x6d, 0xbd, 0x2f, 0xc9, 0x83, 0x85, + 0xb2, 0x3b, 0x3c, 0x8a, 0x49, 0xbb, 0x8a, 0x63, 0x66, 0x54, 0x47, 0x99, + 0x89, 0xa3, 0x39, 0xbb, 0xc1, 0x2b, 0xd3, 0x61, 0xb6, 0xbe, 0xa6, 0x91, + 0x42, 0x6e, 0x36, 0xca, 0x7a, 0x82, 0x5b, 0x7a, 0x85, 0x50, 0x3f, 0x62, + 0x50, 0xbd, 0x9b, 0xaf, 0x77, 0x3f, 0x65, 0xc1, 0x4c, 0x6c, 0xb3, 0x73, + 0x5e, 0xbf, 0xb0, 0xcd, 0x63, 0x9a, 0x83, 0x76, 0x90, 0xd1, 0x33, 0x37, + 0x2e, 0x4c, 0xbe, 0xc5, 0x5b, 0x9e, 0x4b, 0xbc, 0x98, 0x3c, 0x5e, 0x30, + 0x44, 0xb8, 0x34, 0x7f, 0xc6, 0x53, 0x87, 0x7c, 0xc5, 0xbc, 0x40, 0x98, + 0x5d, 0x4e, 0x64, 0x40, 0x6f, 0x5d, 0xab, 0xc0, 0x6a, 0x32, 0x8f, 0xc4, + 0x4c, 0x7b, 0xb3, 0x3f, 0x6b, 0x74, 0x9a, 0xb5, 0x86, 0x7a, 0x42, 0xa4, + 0x3f, 0x96, 0xca, 0x41, 0x6b, 0x58, 0x7b, 0x2e, 0x86, 0x70, 0x53, 0xcc, + 0xc8, 0xbf, 0xad, 0x46, 0x5f, 0x97, 0x6a, 0xcd, 0x38, 0xc7, 0x44, 0x3b, + 0xa8, 0x6b, 0x7c, 0x70, 0x81, 0x9a, 0x2f, 0x7d, 0xa8, 0x9d, 0x34, 0x45, + 0x5d, 0xbd, 0xa6, 0xc6, 0x8e, 0x37, 0xb7, 0x3d, 0xb0, 0x57, 0xa9, 0xa4, + 0x92, 0x4c, 0x45, 0x41, 0x85, 0x5b, 0x2e, 0xa3, 0xa1, 0x55, 0x66, 0x3e, + 0xb7, 0xb5, 0x8c, 0x66, 0x41, 0x99, 0xc8, 0x6b, 0x56, 0x80, 0x7a, 0x58, + 0xcf, 0xc6, 0x62, 0xcc, 0xca, 0xb6, 0x8a, 0xbf, 0xc8, 0xb0, 0xce, 0x82, + 0xbe, 0x68, 0x63, 0x84, 0x61, 0x9b, 0x30, 0xb4, 0x47, 0xba, 0x42, 0x71, + 0x58, 0x61, 0x6f, 0x80, 0x37, 0x85, 0xcf, 0x76, 0x7d, 0x4b, 0xc7, 0x81, + 0x56, 0xb1, 0x51, 0x35, 0xba, 0x60, 0xb8, 0x9b, 0x58, 0xce, 0x50, 0x8b, + 0x59, 0xa3, 0xb7, 0x45, 0x56, 0xcc, 0x98, 0x7e, 0xc6, 0x9b, 0xb2, 0x55, + 0x61, 0xc0, 0xc3, 0x68, 0xbf, 0xbe, 0x76, 0xc1, 0x98, 0x59, 0xa4, 0xcf, + 0xbe, 0x68, 0x6a, 0x50, 0x74, 0x56, 0x87, 0x47, 0x6f, 0x64, 0xa9, 0x7b, + 0x9f, 0x81, 0x63, 0x5c, 0x38, 0x73, 0x48, 0x9b, 0xb7, 0x62, 0xa7, 0xae, + 0xaf, 0xcf, 0xac, 0x33, 0x7d, 0x62, 0x4f, 0x3e, 0x49, 0x59, 0xa3, 0x5f, + 0x39, 0xc4, 0xa2, 0x4c, 0x76, 0x70, 0x76, 0x64, 0xa3, 0xbf, 0x48, 0x5b, + 0x63, 0x6f, 0x70, 0x40, 0xa8, 0x99, 0x6e, 0xac, 0x30, 0xca, 0x7d, 0x92, + 0x9e, 0x8a, 0x71, 0x3b, 0xa3, 0x5f, 0xc9, 0xa4, 0x60, 0x2e, 0x50, 0x64, + 0xba, 0xc7, 0x4e, 0x97, 0xa5, 0x35, 0x31, 0x51, 0x7e, 0x88, 0xcb, 0x82, + 0x91, 0xa1, 0x90, 0x7a, 0x56, 0x94, 0x4f, 0xca, 0x58, 0xcd, 0xb5, 0x89, + 0x8a, 0x3a, 0xbb, 0x62, 0x44, 0x9c, 0x6c, 0xc8, 0x39, 0x9d, 0x6a, 0xac, + 0x8e, 0xa3, 0x2e, 0x53, 0x37, 0x70, 0x5c, 0xb0, 0xcf, 0x9c, 0x83, 0x5e, + 0x50, 0x9d, 0xc2, 0xbf, 0x65, 0x63, 0xc3, 0xbf, 0x9a, 0x91, 0x92, 0x69, + 0x39, 0xd7, 0xcf, 0xc3, 0x5b, 0x91, 0x3a, 0x75, 0x9b, 0x80, 0x43, 0x4c, + 0xc0, 0xa2, 0x36, 0x3b, 0x89, 0x4f, 0x92, 0x67, 0x96, 0x50, 0x74, 0xa1, + 0x9a, 0xce, 0x39, 0x87, 0x72, 0x6d, 0x5c, 0xb0, 0xa6, 0x91, 0x9b, 0x97, + 0xc2, 0xba, 0x5a, 0x61, 0x77, 0xba, 0xaa, 0x41, 0x44, 0x61, 0xa5, 0xa8, + 0xc6, 0x37, 0xb6, 0xaf, 0x4f, 0xa6, 0x61, 0xb5, 0x8e, 0x81, 0x3e, 0x54, + 0x3b, 0x2f, 0x84, 0x54, 0x69, 0x60, 0x4c, 0x83, 0x63, 0xb4, 0xb8, 0x97, + 0x48, 0x60, 0xcc, 0x49, 0x3e, 0xab, 0x97, 0x65, 0x76, 0xb2, 0xbb, 0x48, + 0x62, 0x51, 0x75, 0xaa, 0xbd, 0x31, 0x5a, 0x4e, 0x82, 0x73, 0x7c, 0x42, + 0x37, 0x96, 0xb5, 0x72, 0x5a, 0xbc, 0x4b, 0x62, 0x35, 0x7d, 0x54, 0x61, + 0x5b, 0x98, 0x55, 0x8c, 0xab, 0x67, 0x85, 0xcb, 0x6f, 0x62, 0x87, 0x3f, + 
0x71, 0x7f, 0x62, 0x61, 0x88, 0x35, 0x54, 0x92, 0x94, 0xbf, 0x55, 0x47, + 0xa4, 0xbc, 0x9c, 0x53, 0x3a, 0x45, 0x6d, 0xb9, 0x77, 0x71, 0xbb, 0x6a, + 0x86, 0x71, 0x55, 0xd1, 0xc4, 0x79, 0xac, 0xb2, 0xc0, 0x8a, 0x59, 0x3f, + 0xbc, 0x3e, 0x50, 0xcb, 0x6e, 0x53, 0xa7, 0x4e, 0x86, 0x54, 0x48, 0x98, + 0x84, 0xa9, 0xbb, 0x86, 0x76, 0x81, 0x3a, 0xb0, 0x47, 0x3e, 0xd4, 0x6f, + 0x58, 0x3a, 0x58, 0x67, 0x76, 0xc3, 0x3a, 0x3f, 0x8f, 0x3b, 0xcb, 0x4a, + 0x52, 0xc4, 0x5d, 0x35, 0x84, 0x4a, 0x65, 0x93, 0x63, 0x57, 0x99, 0x95, + 0xbd, 0xbe, 0x87, 0x47, 0x72, 0x58, 0x3c, 0xbc, 0x53, 0x8f, 0xc7, 0xbf, + 0x64, 0x66, 0xa5, 0x4f, 0xa0, 0x60, 0x68, 0xc7, 0xcc, 0x33, 0x3d, 0xd8, + 0x8d, 0x5a, 0x97, 0xab, 0x69, 0x8f, 0x79, 0x67, 0xc9, 0x8f, 0xb7, 0x3f, + 0x71, 0xa6, 0x59, 0x98, 0x54, 0x43, 0x60, 0xbd, 0x7a, 0xa4, 0xb2, 0xaf, + 0x38, 0xc4, 0x59, 0x98, 0xab, 0x5e, 0x56, 0x58, 0x50, 0xc1, 0x6d, 0xce, + 0x6c, 0x41, 0x8f, 0x8b, 0xba, 0x49, 0x2e, 0xc2, 0xc2, 0xc7, 0xac, 0x95, + 0x9f, 0x86, 0x97, 0x4d, 0x77, 0x58, 0xc2, 0x4e, 0xcd, 0xc4, 0xa8, 0x6b, + 0x91, 0xa4, 0x5a, 0x6f, 0x3d, 0x91, 0x5c, 0x66, 0xa0, 0x7f, 0x9f, 0x6c, + 0x5a, 0xcb, 0x80, 0x8c, 0x99, 0xce, 0x5c, 0x80, 0x6e, 0x51, 0x46, 0xa5, + 0xb1, 0xb3, 0xa9, 0x8e, 0x6a, 0x65, 0x9a, 0x84, 0x73, 0x55, 0xaf, 0xa5, + 0x5a, 0x36, 0x6a, 0x6d, 0xc3, 0xd3, 0x5b, 0x84, 0x59, 0x6d, 0xbc, 0x3c, + 0x37, 0xd0, 0x54, 0x60, 0xb4, 0xc9, 0x6d, 0x5c, 0x87, 0x75, 0x74, 0x9a, + 0xac, 0x3f, 0xd2, 0x78, 0x49, 0x77, 0x37, 0x69, 0x43, 0x4f, 0x5a, 0x42, + 0x3e, 0x91, 0x7d, 0x66, 0x67, 0x78, 0x7e, 0xb0, 0xa4, 0x2d, 0x60, 0x73, + 0x84, 0xc8, 0x35, 0xb6, 0x8d, 0x61, 0x78, 0xb7, 0xca, 0x64, 0xcf, 0x4c, + 0xa0, 0x8d, 0x45, 0x6e, 0x58, 0xbe, 0x8c, 0x63, 0x9a, 0x3a, 0x49, 0xbb, + 0x40, 0x35, 0x8f, 0xa9, 0x7d, 0x57, 0x77, 0xa1, 0x3d, 0xc8, 0xcc, 0x64, + 0x3a, 0x46, 0xb8, 0xc7, 0x89, 0x5c, 0xac, 0xa1, 0x4d, 0x53, 0x4f, 0x4c, + 0x7a, 0xb4, 0x5f, 0x53, 0x5a, 0xaa, 0xbb, 0x99, 0x96, 0x87, 0xbc, 0xac, + 0xb3, 0xb6, 0x3f, 0x8d, 0xab, 0x41, 0xa4, 0xb2, 0x86, 0xcb, 0xb5, 0x56, + 0x32, 0xb7, 0xc6, 0x77, 0xc0, 0x8f, 0x3d, 0x5d, 0xc5, 0x6d, 0x59, 0x91, + 0x9d, 0x70, 0xd1, 0x89, 0x5f, 0xd2, 0x59, 0xad, 0x44, 0xc1, 0x3c, 0x85, + 0xb3, 0xd0, 0x5e, 0xa2, 0xb1, 0x6e, 0x6f, 0x44, 0xa7, 0x64, 0xaf, 0x78, + 0xcc, 0x9f, 0xa1, 0x89, 0x86, 0x3e, 0xaf, 0x98, 0x34, 0x7e, 0x98, 0xc3, + 0x8c, 0x90, 0x83, 0x50, 0x56, 0x4e, 0x55, 0x64, 0x6e, 0xbe, 0xa1, 0xb8, + 0x8c, 0x3e, 0x3e, 0x61, 0x90, 0x42, 0x82, 0x6a, 0x3a, 0xb0, 0x64, 0xd4, + 0x64, 0x42, 0x96, 0x65, 0xae, 0x81, 0xca, 0xcc, 0x66, 0x6f, 0x65, 0xac, + 0xb4, 0x92, 0x47, 0x5a, 0x8f, 0xb7, 0xc3, 0x40, 0xc9, 0x4a, 0xb7, 0x4a, + 0x9c, 0xa9, 0xc8, 0xcc, 0x6a, 0x99, 0x75, 0x8e, 0x91, 0x8c, 0x30, 0xa1, + 0x4c, 0x3d, 0x47, 0x6c, 0x41, 0x8d, 0x43, 0x3f, 0xbc, 0x56, 0xa9, 0xc8, + 0x66, 0x2d, 0x54, 0xa0, 0x71, 0xac, 0x3f, 0xc2, 0x49, 0xca, 0x8d, 0x33, + 0x74, 0xcf, 0xbd, 0xa9, 0x8a, 0xbf, 0x9a, 0x38, 0x5c, 0x8c, 0xb7, 0x95, + 0x66, 0xbc, 0x36, 0x70, 0xb6, 0xb4, 0x3f, 0x43, 0x9b, 0xcc, 0x91, 0x85, + 0x7d, 0xb2, 0x6b, 0x69, 0xb0, 0x68, 0xd0, 0x92, 0x34, 0xc3, 0x51, 0x7f, + 0xa0, 0x78, 0x63, 0xc7, 0x3e, 0xb7, 0x64, 0x79, 0xb2, 0x92, 0xd3, 0x9b, + 0x6d, 0x46, 0x77, 0x9f, 0x41, 0x5d, 0x85, 0x8c, 0x87, 0x33, 0x5b, 0x53, + 0x66, 0x5d, 0x89, 0x5c, 0xae, 0xa4, 0xbe, 0xd2, 0x57, 0x49, 0xc0, 0x89, + 0x45, 0x87, 0x61, 0xb8, 0x4d, 0x3e, 0xc4, 0x8d, 0xb2, 0xc6, 0x90, 0x83, + 0x52, 0xca, 0x76, 0xc7, 0xac, 0x83, 0x9d, 0xa4, 0xaa, 0xac, 0x3e, 0x3e, + 0x81, 0xce, 0xa8, 0x48, 0xcf, 0x75, 0x64, 0x80, 0x9b, 0x8c, 0xc4, 0xc6, + 0x42, 0xce, 0x9c, 0xda, 0x29, 0xbe, 0x66, 0x79, 0x83, 0xa7, 0x74, 0x97, + 
0xbe, 0xc4, 0x39, 0x49, 0x9b, 0x90, 0x44, 0xad, 0x67, 0x8d, 0x43, 0xc4, + 0x41, 0xcb, 0xc1, 0x93, 0x65, 0xa6, 0xb8, 0x48, 0xa1, 0x32, 0xb7, 0x99, + 0x58, 0x3c, 0x67, 0x33, 0xb2, 0xd0, 0x94, 0x3a, 0x3f, 0x47, 0x7f, 0xcd, + 0x6a, 0x4a, 0xae, 0xb8, 0x73, 0xc0, 0x7c, 0x6a, 0x50, 0xb9, 0xc8, 0x52, + 0x5f, 0x44, 0x8d, 0x4e, 0xb0, 0x43, 0x8e, 0x7a, 0x4d, 0xab, 0x94, 0xa5, + 0xa4, 0x98, 0xad, 0xd4, 0xb1, 0x90, 0xa3, 0xcc, 0x74, 0xba, 0x9c, 0xcb, + 0xa8, 0x36, 0xac, 0x73, 0xad, 0x32, 0xb2, 0x5b, 0x7c, 0xc6, 0x70, 0x53, + 0xaf, 0x73, 0x5d, 0x8e, 0x78, 0xc6, 0xbe, 0x5d, 0xb8, 0x53, 0x3f, 0xc3, + 0x9f, 0x55, 0x9b, 0x6d, 0xc4, 0xc6, 0x88, 0x70, 0x79, 0x55, 0x5f, 0x5a, + 0x3e, 0x8b, 0xa0, 0x7a, 0x80, 0xca, 0xae, 0x69, 0x6e, 0x30, 0xb5, 0xb0, + 0x42, 0x5c, 0xa1, 0x81, 0x3a, 0xd0, 0x73, 0x2e, 0x52, 0xb6, 0x63, 0x9d, + 0x5a, 0xc9, 0x74, 0x96, 0x5b, 0xb5, 0x3a, 0xa6, 0x82, 0xb8, 0xb3, 0x3c, + 0xa9, 0x3f, 0xb7, 0x8d, 0x41, 0xa9, 0x93, 0x61, 0xb8, 0x35, 0x88, 0xa2, + 0x97, 0x6d, 0x45, 0x53, 0x69, 0x4e, 0xa0, 0x60, 0x85, 0x6b, 0xa0, 0x9f, + 0x6e, 0xa9, 0xa7, 0x7e, 0x3a, 0x6e, 0x74, 0x87, 0x88, 0x3b, 0x70, 0x5b, + 0x4d, 0x4b, 0x6d, 0x38, 0x77, 0xa5, 0x64, 0xc5, 0x37, 0x45, 0x5b, 0x54, + 0xa9, 0xb9, 0x46, 0xb9, 0x71, 0x91, 0x92, 0x73, 0xca, 0xb7, 0xb9, 0xa5, + 0x48, 0x9f, 0x57, 0xb8, 0x64, 0x47, 0x92, 0xbf, 0x8c, 0x77, 0x90, 0x3a, + 0x91, 0xbc, 0xd0, 0x78, 0xc9, 0x35, 0x83, 0x9c, 0x87, 0x3a, 0xba, 0x69, + 0xd0, 0x48, 0x74, 0x5b, 0x4e, 0x86, 0x99, 0x78, 0x6e, 0x60, 0xa8, 0x34, + 0x4c, 0x8c, 0x58, 0x3d, 0x63, 0x6b, 0xae, 0x8b, 0xa7, 0xb7, 0x7d, 0xc6, + 0x69, 0x49, 0xcc, 0xa0, 0x39, 0x8b, 0x52, 0x8a, 0x7c, 0xc1, 0xa0, 0xd0, + 0xcc, 0xa4, 0x6e, 0x8e, 0x6b, 0x39, 0x66, 0x93, 0xa3, 0xb1, 0x49, 0x32, + 0x84, 0x86, 0x42, 0x8e, 0x43, 0x75, 0xc3, 0xa4, 0xbb, 0x9e, 0x7c, 0x3c, + 0xbb, 0x5f, 0x8e, 0x75, 0xa8, 0x5e, 0x5c, 0xca, 0x9a, 0xbb, 0xa7, 0x54, + 0x72, 0x78, 0x94, 0x42, 0x51, 0x67, 0x90, 0x3f, 0xba, 0x4a, 0x9f, 0x63, + 0xb9, 0xb6, 0x88, 0x84, 0xa7, 0xaa, 0xd0, 0xcd, 0x49, 0x69, 0xce, 0x81, + 0x6a, 0xb5, 0xcf, 0x99, 0xa6, 0xa6, 0xc5, 0x53, 0x7d, 0x78, 0x95, 0xa9, + 0xce, 0x5d, 0xa7, 0xcc, 0x57, 0x4c, 0xc8, 0x5d, 0x46, 0xc3, 0x7f, 0x67, + 0x8b, 0x39, 0x8e, 0x8c, 0x50, 0x55, 0x4f, 0x60, 0x3b, 0xb9, 0x8c, 0x9a, + 0xca, 0x38, 0x4c, 0x88, 0xc2, 0xd1, 0x78, 0xbf, 0x3c, 0x8f, 0x31, 0x3f, + 0x78, 0xaa, 0xa4, 0x64, 0xc5, 0x89, 0x83, 0x6b, 0xcb, 0x54, 0x83, 0xc0, + 0x66, 0xa9, 0x58, 0xd2, 0x72, 0x3d, 0xbd, 0xb1, 0x8f, 0x46, 0x4f, 0xb0, + 0x71, 0x5a, 0xb7, 0xc4, 0x45, 0xb3, 0x45, 0xb5, 0x73, 0x6e, 0x96, 0x9f, + 0x3b, 0xa9, 0x9a, 0x4b, 0xa7, 0x4d, 0x44, 0xb0, 0xb9, 0x5e, 0xab, 0xb1, + 0xbf, 0x38, 0x85, 0x66, 0x8d, 0x9b, 0x54, 0xbb, 0xb3, 0xcc, 0x9c, 0x46, + 0x3e, 0x58, 0x90, 0xac, 0x6c, 0x54, 0x83, 0x7f, 0x4c, 0x98, 0xab, 0x4e, + 0xc7, 0xba, 0xd7, 0x42, 0x64, 0xc5, 0x94, 0x86, 0x68, 0x90, 0x9e, 0x3c, + 0xbc, 0xcb, 0x42, 0x63, 0x31, 0x56, 0xae, 0x5b, 0xc7, 0x2f, 0x76, 0x86, + 0x96, 0x37, 0x36, 0x3e, 0xa0, 0x50, 0x81, 0xb4, 0x72, 0x57, 0xb0, 0x7c, + 0x3e, 0xb2, 0x3e, 0x9f, 0x97, 0xa4, 0x4c, 0xbc, 0x52, 0x3a, 0xb7, 0x3b, + 0xc1, 0x37, 0x4a, 0xb6, 0x3a, 0x57, 0x5e, 0x7f, 0xbf, 0x5c, 0x55, 0xd0, + 0xc2, 0x56, 0x72, 0x51, 0x7c, 0x67, 0x57, 0x59, 0x7d, 0xc5, 0x34, 0x7e, + 0x79, 0x74, 0x97, 0x36, 0xce, 0x38, 0x31, 0x44, 0x9c, 0x85, 0xc7, 0x34, + 0x99, 0x51, 0x3c, 0xb6, 0x73, 0x44, 0x46, 0x83, 0xb2, 0xa8, 0x7f, 0x4b, + 0x89, 0xa6, 0x3c, 0x3b, 0x44, 0x46, 0x82, 0x4e, 0xd3, 0x73, 0x93, 0xbd, + 0xbf, 0x51, 0x79, 0x9b, 0xae, 0x5a, 0x39, 0x39, 0x60, 0x6a, 0xbd, 0x38, + 0x9a, 0xbb, 0xc3, 0xa8, 0x5c, 0x54, 0xb3, 0xc8, 0x3b, 0x74, 0x78, 0xbd, + 
0x73, 0x7a, 0x90, 0x6b, 0x56, 0x7c, 0xbe, 0x85, 0x4b, 0x7b, 0x6c, 0x4a, + 0xc4, 0x9f, 0xcb, 0xac, 0x9f, 0xc2, 0x84, 0x7b, 0xa2, 0xca, 0xd3, 0x58, + 0xd4, 0xa3, 0xb6, 0x91, 0x98, 0x7b, 0x7e, 0xa9, 0xa1, 0x51, 0x6a, 0x9e, + 0xc1, 0x46, 0xcf, 0x96, 0x6e, 0xb5, 0xc2, 0x6d, 0x44, 0xc9, 0xd1, 0xd1, + 0x6e, 0x76, 0xce, 0xab, 0x91, 0xc3, 0x6a, 0xad, 0x32, 0x3f, 0xae, 0xa2, + 0x31, 0x54, 0x8d, 0x9d, 0x94, 0xce, 0xbb, 0x48, 0x98, 0x47, 0xcc, 0x45, + 0x3f, 0x55, 0x68, 0x52, 0x31, 0x42, 0x3a, 0xbe, 0x52, 0x42, 0xab, 0xd0, + 0xbe, 0xbd, 0xb6, 0x6f, 0x82, 0x33, 0x49, 0x59, 0xcb, 0xbc, 0xcd, 0x48, + 0x59, 0xc6, 0xba, 0x5a, 0xad, 0x34, 0x37, 0x69, 0xd2, 0xc9, 0x5f, 0x4e, + 0xaa, 0x9e, 0x50, 0x90, 0xa3, 0x3c, 0xb7, 0xab, 0x7f, 0xb0, 0x78, 0x5d, + 0xa8, 0x56, 0xcf, 0x33, 0x77, 0x35, 0xb9, 0x2f, 0x5c, 0x48, 0xd4, 0xc2, + 0xc0, 0x82, 0x62, 0x6e, 0x9b, 0xb4, 0xc7, 0x87, 0xbc, 0x52, 0xb3, 0x4f, + 0xb0, 0xbf, 0x9e, 0x69, 0x90, 0xba, 0x41, 0x75, 0xbb, 0xc3, 0xa4, 0x68, + 0xc2, 0x7d, 0xb5, 0xa8, 0x75, 0x45, 0xa4, 0x80, 0x77, 0x36, 0x65, 0x8a, + 0x3d, 0x64, 0xa5, 0x90, 0xd1, 0x7e, 0xcb, 0xa3, 0x77, 0x60, 0xb6, 0x67, + 0xc4, 0x85, 0x91, 0xc4, 0x3b, 0x6e, 0x42, 0x7d, 0x36, 0xba, 0x98, 0x53, + 0x9e, 0xbe, 0xcf, 0xbc, 0x4b, 0x9a, 0x88, 0x49, 0xbd, 0x63, 0x87, 0x71, + 0x97, 0x7a, 0xa4, 0x58, 0x99, 0x98, 0x66, 0xb3, 0x3a, 0xbc, 0x56, 0xbc, + 0x3b, 0xbc, 0x43, 0x99, 0xa1, 0x5f, 0xa2, 0x7d, 0x5b, 0x4c, 0x56, 0x58, + 0x68, 0x95, 0xcf, 0x48, 0x8a, 0x50, 0x57, 0x76, 0x46, 0x9c, 0xcb, 0x96, + 0x4f, 0xb7, 0x61, 0x76, 0x61, 0x44, 0x3e, 0x8b, 0x91, 0xb2, 0xa9, 0x86, + 0x8a, 0x6a, 0x89, 0x35, 0xcf, 0x35, 0x41, 0x57, 0x63, 0x9f, 0xc2, 0x9c, + 0x43, 0x39, 0x3d, 0x7f, 0x86, 0xab, 0xcc, 0x4c, 0x7d, 0x92, 0x40, 0x5e, + 0x6d, 0x8f, 0xa6, 0xb4, 0x4d, 0x65, 0xad, 0x72, 0x76, 0x59, 0x78, 0x9d, + 0x44, 0xcc, 0x36, 0x59, 0x9a, 0xc3, 0x43, 0x3c, 0x38, 0xd2, 0xc4, 0x91, + 0xd2, 0xc9, 0x59, 0x43, 0xa2, 0x7b, 0x7b, 0x59, 0x48, 0xae, 0x8b, 0xb7, + 0x50, 0xa7, 0x6e, 0x9a, 0x78, 0x8c, 0x3c, 0x60, 0xa7, 0xab, 0xcb, 0x43, + 0xa4, 0x95, 0x3e, 0x4e, 0xb1, 0x72, 0x3d, 0x76, 0x82, 0x8f, 0x67, 0x48, + 0x63, 0x6a, 0x6c, 0x4b, 0x95, 0xaa, 0x7c, 0x43, 0x91, 0xbb, 0x40, 0x5a, + 0x49, 0x5b, 0x89, 0xc4, 0x40, 0x7d, 0x98, 0x97, 0xa3, 0xb6, 0x92, 0x33, + 0xb6, 0x44, 0xc2, 0x4e, 0x8f, 0xbe, 0x68, 0x85, 0x8d, 0xc8, 0x4e, 0x82, + 0x75, 0x5d, 0xa6, 0xcb, 0x48, 0xb3, 0x8c, 0x45, 0x51, 0x64, 0x38, 0x48, + 0x22, 0x54, 0xb2, 0x8f, 0xc0, 0x85, 0x51, 0xa9, 0x98, 0x60, 0x3e, 0x51, + 0x60, 0x5f, 0x8e, 0xb0, 0x5d, 0xaf, 0xbd, 0x95, 0xbc, 0x72, 0x78, 0x99, + 0x52, 0x80, 0xae, 0x68, 0x5c, 0x56, 0x70, 0xaa, 0x7a, 0xbe, 0xc1, 0x71, + 0x96, 0x3a, 0xc7, 0xbb, 0x8c, 0x54, 0x3b, 0x56, 0x80, 0xc9, 0x75, 0xb7, + 0x9c, 0x70, 0xa1, 0x97, 0x50, 0xd0, 0x86, 0x45, 0x38, 0x7d, 0xb7, 0x3b, + 0x6d, 0x92, 0x8b, 0x48, 0x83, 0x89, 0x95, 0xa6, 0x55, 0x7b, 0xb0, 0x62, + 0x61, 0x7b, 0x8d, 0xa6, 0x9f, 0x9b, 0xc5, 0x48, 0x74, 0x55, 0x97, 0xbb, + 0x4e, 0x7d, 0xae, 0x9b, 0x3a, 0x94, 0x7c, 0xbf, 0x2f, 0xc3, 0x66, 0x3d, + 0xa3, 0x46, 0x82, 0x84, 0xc9, 0xb2, 0x5d, 0x7d, 0xc0, 0xa6, 0xc3, 0xa9, + 0xb1, 0x92, 0x95, 0xb1, 0xa5, 0xa6, 0x3f, 0xc8, 0xd0, 0xc2, 0x7c, 0x38, + 0x75, 0xbd, 0xbc, 0x37, 0x8f, 0x94, 0x8e, 0x92, 0xca, 0x95, 0xae, 0x81, + 0xa4, 0x3b, 0xb1, 0xc7, 0x73, 0x79, 0x6c, 0x3e, 0x9e, 0x7a, 0x71, 0x8b, + 0xc2, 0xca, 0xc7, 0x63, 0x88, 0xa6, 0x3b, 0xcf, 0x8a, 0x66, 0x9e, 0x48, + 0x5f, 0x6c, 0x3d, 0xb9, 0x90, 0x6d, 0x63, 0xbf, 0x83, 0x9d, 0xcf, 0x79, + 0x60, 0x52, 0x2f, 0xab, 0x9f, 0x86, 0xac, 0xb2, 0xbb, 0xd1, 0xa1, 0xc6, + 0x4c, 0x7b, 0x36, 0x8c, 0xc7, 0x41, 0x7d, 0x28, 0x6f, 0xbc, 0x38, 0x7f, + 
0xb6, 0x3c, 0x2a, 0x8f, 0x51, 0x51, 0x4c, 0x65, 0xa2, 0xc2, 0x44, 0x8b, + 0xac, 0xa6, 0xbd, 0x90, 0xc4, 0x4f, 0xa8, 0x94, 0x6d, 0x4a, 0xb0, 0xad, + 0x7b, 0xd7, 0x88, 0xb0, 0x6b, 0x53, 0x8e, 0x90, 0x40, 0x57, 0xb4, 0x62, + 0x4e, 0x4f, 0xcb, 0x92, 0xbb, 0x8c, 0x56, 0xc2, 0xaf, 0x98, 0x57, 0xa7, + 0x7e, 0x47, 0x5b, 0xb7, 0xad, 0x94, 0x40, 0x43, 0x74, 0x95, 0x72, 0x6d, + 0x87, 0x45, 0xb1, 0xb5, 0x50, 0x7f, 0xbe, 0x62, 0xaa, 0x67, 0xbb, 0x9c, + 0x97, 0xb8, 0xb4, 0x62, 0x85, 0x68, 0x5d, 0x5e, 0x80, 0x87, 0x92, 0xc9, + 0x6a, 0xcc, 0x69, 0x32, 0x61, 0x3a, 0x42, 0x6d, 0x3c, 0x2d, 0x9e, 0x2e, + 0x77, 0x34, 0x39, 0x8f, 0x5c, 0xbe, 0x92, 0xa2, 0x7b, 0xa2, 0x7d, 0x3e, + 0xa8, 0x85, 0x66, 0xbf, 0x65, 0x59, 0xbe, 0x6e, 0xab, 0xb7, 0x64, 0x98, + 0x57, 0xcd, 0x7e, 0xce, 0xb2, 0x4a, 0x85, 0xb5, 0x4d, 0x72, 0x78, 0x38, + 0xbb, 0x9f, 0xc1, 0x3e, 0xc6, 0x4a, 0x5b, 0x47, 0x6c, 0x76, 0xb2, 0x5e, + 0x85, 0x8a, 0xc6, 0xcc, 0xcb, 0x9c, 0x6b, 0xa6, 0xa3, 0xa6, 0xc9, 0x6a, + 0x8b, 0xcb, 0x3e, 0x3f, 0x4a, 0x82, 0x84, 0x48, 0x4b, 0xc2, 0xbf, 0x96, + 0x74, 0x34, 0x52, 0xc2, 0x9c, 0xaf, 0x58, 0x38, 0x40, 0x5c, 0xa0, 0xa4, + 0x45, 0x8f, 0x61, 0x45, 0xca, 0x75, 0xba, 0xaa, 0x9a, 0xbe, 0x93, 0x64, + 0xcd, 0x65, 0xc1, 0x6d, 0x75, 0x57, 0x80, 0xa3, 0x89, 0x64, 0x55, 0x87, + 0xbb, 0x80, 0x8f, 0xd2, 0x9d, 0xc6, 0x5c, 0x48, 0x8c, 0x70, 0x81, 0x77, + 0x32, 0x74, 0x5d, 0x94, 0x8d, 0x47, 0xbf, 0x73, 0xd6, 0xb6, 0x73, 0x56, + 0x77, 0xd9, 0x8d, 0x40, 0x4a, 0x76, 0x3d, 0xc9, 0xbf, 0x97, 0xc5, 0x36, + 0x4f, 0x6c, 0x6c, 0xa3, 0xcb, 0x68, 0x93, 0xc5, 0x80, 0x7b, 0x58, 0x97, + 0xba, 0x98, 0x6a, 0x68, 0xc6, 0x7f, 0x5a, 0x7f, 0x43, 0x7c, 0xba, 0x59, + 0xdb, 0x70, 0x89, 0xc5, 0x7e, 0x7d, 0x9b, 0xbe, 0x42, 0xb5, 0x4e, 0x63, + 0x61, 0x65, 0x4c, 0xa9, 0xc3, 0x51, 0xc6, 0x44, 0x50, 0xa2, 0x4a, 0x30, + 0x3a, 0x46, 0x9a, 0xc2, 0xa5, 0x56, 0x47, 0xa9, 0x7b, 0xae, 0xba, 0x42, + 0x55, 0xc7, 0x63, 0x7b, 0x6d, 0xb3, 0xbc, 0xb6, 0xab, 0xa2, 0x8e, 0x33, + 0x80, 0x8d, 0x72, 0x7d, 0xc7, 0x68, 0x86, 0xb8, 0xd7, 0x7c, 0x4c, 0xa5, + 0x81, 0xbf, 0x5a, 0x6d, 0x69, 0xcb, 0x96, 0xab, 0xc8, 0x8c, 0x4c, 0x90, + 0x72, 0xc2, 0x30, 0xb8, 0xb2, 0x9f, 0x73, 0xd1, 0x60, 0x5a, 0x7e, 0x48, + 0xbc, 0xba, 0x9c, 0x58, 0xa4, 0x81, 0x7c, 0x54, 0x4c, 0x39, 0xa2, 0x78, + 0x8b, 0x83, 0x44, 0xa3, 0x8d, 0x45, 0x72, 0x68, 0x6c, 0x8a, 0x9b, 0xbe, + 0x8e, 0xb6, 0x92, 0x60, 0x59, 0xc7, 0x80, 0x62, 0x3d, 0x75, 0x8e, 0x8c, + 0x47, 0x8e, 0x8f, 0xb8, 0xd3, 0x48, 0x44, 0x93, 0x91, 0x7d, 0x7c, 0x55, + 0x2b, 0xad, 0x72, 0x43, 0x57, 0x64, 0x58, 0x3e, 0x7d, 0x64, 0x87, 0x53, + 0x76, 0x8e, 0xcf, 0x44, 0x9a, 0xca, 0x9a, 0xb1, 0x6c, 0xb0, 0x48, 0x6c, + 0xab, 0x60, 0x65, 0xc5, 0xcf, 0xbd, 0xc0, 0x66, 0xa6, 0x82, 0x8f, 0x63, + 0x9a, 0x7a, 0x7e, 0x99, 0xc3, 0x53, 0xc3, 0xab, 0x6a, 0x4c, 0x2d, 0x52, + 0x98, 0x35, 0x54, 0xa3, 0xc5, 0x47, 0xad, 0xd3, 0x39, 0xd1, 0x4d, 0xb0, + 0x68, 0x33, 0x76, 0xaa, 0x3f, 0x36, 0x68, 0xb6, 0x51, 0x47, 0x61, 0xb3, + 0xa3, 0x3c, 0x56, 0x7b, 0xb2, 0x5e, 0x44, 0xbb, 0x4a, 0x71, 0x71, 0xb1, + 0x8e, 0x5a, 0xc3, 0xc5, 0x89, 0x96, 0x83, 0xbf, 0xaf, 0xc2, 0xb1, 0x6d, + 0x83, 0x4f, 0x61, 0x54, 0xc9, 0x31, 0x42, 0xc7, 0xad, 0x76, 0x44, 0x9b, + 0x9b, 0x6f, 0xac, 0x8d, 0x83, 0x90, 0xb0, 0xc8, 0x94, 0x8c, 0x98, 0xc2, + 0xa9, 0x73, 0xb4, 0x8e, 0xce, 0xd4, 0xce, 0x44, 0x8c, 0x62, 0x9d, 0x76, + 0x8a, 0x99, 0x52, 0xaa, 0x63, 0x9d, 0xab, 0xbd, 0xc2, 0xcc, 0x6a, 0xa4, + 0x49, 0x8a, 0x49, 0x49, 0x68, 0x94, 0x3d, 0x69, 0xa5, 0x88, 0x50, 0xad, + 0x43, 0x47, 0xc6, 0x3c, 0x96, 0x8e, 0xac, 0x46, 0x93, 0x3d, 0x4c, 0x88, + 0x8a, 0xd6, 0x99, 0x4b, 0xd3, 0xa3, 0xbd, 0xd5, 0xa7, 0xb0, 0xda, 0x6e, + 
0x9f, 0x6a, 0x95, 0x57, 0xa8, 0x66, 0x7c, 0x83, 0xd0, 0xcb, 0xb9, 0xc2, + 0xc4, 0x71, 0xc1, 0x7a, 0xc0, 0xaa, 0xb7, 0x7c, 0x64, 0x9e, 0x98, 0x9e, + 0x7e, 0x61, 0x6c, 0xa3, 0x47, 0x89, 0x5e, 0x64, 0x6b, 0x84, 0xdf, 0x45, + 0x96, 0x5e, 0x44, 0xe1, 0x92, 0x7c, 0x84, 0xce, 0x3f, 0xc8, 0xbf, 0x96, + 0x59, 0xc3, 0xbb, 0x5e, 0x39, 0x50, 0xb7, 0x38, 0x6e, 0x50, 0x7e, 0x91, + 0xaf, 0x70, 0x45, 0x88, 0x6b, 0x7c, 0x77, 0x4c, 0x6d, 0x70, 0x8a, 0x5e, + 0x76, 0x88, 0x3d, 0x8a, 0x9d, 0xc6, 0x9f, 0x8c, 0x33, 0x9e, 0x9c, 0x68, + 0x6a, 0xc2, 0xb9, 0xcb, 0x4f, 0x41, 0xba, 0x64, 0x8c, 0xbc, 0x58, 0xd0, + 0x4a, 0x6b, 0x76, 0xaf, 0xa5, 0x5d, 0x5d, 0xb1, 0x9d, 0x71, 0xcd, 0x75, + 0x57, 0xbc, 0x67, 0x5d, 0x87, 0x48, 0xab, 0x51, 0xbd, 0x50, 0xa1, 0x65, + 0xa8, 0xbb, 0xb8, 0xa5, 0x51, 0x8c, 0x80, 0x8c, 0x6c, 0x59, 0x59, 0x9d, + 0x48, 0xa8, 0xc4, 0x4e, 0x67, 0xa5, 0x8e, 0x46, 0x5f, 0xb0, 0x3f, 0x39, + 0x64, 0x8e, 0x7e, 0x8d, 0x48, 0xc2, 0x94, 0x29, 0x88, 0xdc, 0x34, 0x27, + 0xa1, 0x51, 0x5c, 0x37, 0x6b, 0xb8, 0x81, 0x7c, 0x87, 0x92, 0x55, 0x7d, + 0x5b, 0x9d, 0x68, 0x51, 0x8f, 0x74, 0x59, 0x78, 0xce, 0xb4, 0x45, 0x87, + 0x7b, 0x9c, 0x45, 0x7b, 0xd0, 0x5d, 0x94, 0x7c, 0x64, 0x87, 0x9d, 0x4b, + 0xb7, 0x6c, 0x42, 0x45, 0xb2, 0xc7, 0x48, 0xa6, 0xd4, 0x3e, 0x61, 0x4e, + 0x36, 0x8f, 0xb9, 0xac, 0x60, 0xa0, 0x93, 0x39, 0x88, 0xcd, 0x9e, 0xab, + 0xdc, 0x7d, 0x7e, 0x96, 0x38, 0x9e, 0x5b, 0x3a, 0x2f, 0x82, 0x5d, 0x79, + 0x80, 0xc0, 0x83, 0xc8, 0x7d, 0xa3, 0x4a, 0xab, 0xd8, 0xd7, 0xc7, 0x67, + 0x2f, 0xbf, 0x6d, 0x4d, 0xc1, 0x82, 0x54, 0xce, 0xbd, 0x62, 0x59, 0x99, + 0x84, 0x55, 0x62, 0x82, 0x8b, 0x56, 0xc2, 0xcb, 0x69, 0x98, 0x9f, 0x7e, + 0x53, 0x7d, 0x3a, 0x9e, 0x6f, 0x7d, 0xb3, 0x4f, 0x89, 0x47, 0xce, 0x93, + 0x34, 0x4e, 0x6b, 0x95, 0x71, 0x60, 0xb8, 0xbd, 0xb7, 0x96, 0x7a, 0x79, + 0x78, 0x3e, 0x4c, 0xaa, 0x67, 0xcb, 0x50, 0xa7, 0x64, 0x93, 0xbb, 0x9b, + 0x3c, 0xca, 0xa7, 0x49, 0xc9, 0x69, 0x7a, 0xce, 0x5c, 0x99, 0xcb, 0x82, + 0x91, 0x60, 0x49, 0x3c, 0x80, 0xb8, 0x8c, 0x69, 0x86, 0x8c, 0xba, 0x94, + 0x46, 0x60, 0xb7, 0x7f, 0x7e, 0x59, 0xc4, 0x43, 0xc1, 0x61, 0x4f, 0x7f, + 0x54, 0x71, 0x96, 0xcd, 0x79, 0x53, 0x4e, 0x64, 0x40, 0x48, 0x81, 0x99, + 0x5e, 0xd5, 0xb3, 0xd7, 0x91, 0x3c, 0x3e, 0xa4, 0xa5, 0xd1, 0x61, 0x3a, + 0xb3, 0xad, 0xa2, 0x3d, 0xa8, 0xa0, 0x53, 0x6e, 0xd9, 0xb1, 0x7f, 0x6c, + 0xb5, 0x87, 0x6b, 0x34, 0x76, 0x83, 0x75, 0x8f, 0xc9, 0xa9, 0xac, 0x44, + 0x9d, 0x6e, 0x9f, 0x48, 0x39, 0x53, 0x6c, 0x48, 0xc8, 0x5c, 0xce, 0x80, + 0xc8, 0x50, 0x31, 0x99, 0xe4, 0xca, 0x9d, 0x6b, 0xae, 0x48, 0x9d, 0xcd, + 0x80, 0x43, 0x96, 0x3e, 0x3c, 0x55, 0x7e, 0x96, 0x55, 0xc2, 0x5c, 0x34, + 0xd2, 0xb4, 0x6b, 0x89, 0x99, 0xbf, 0x60, 0xb0, 0x90, 0x72, 0x9f, 0x9c, + 0xa3, 0xbb, 0xbb, 0x93, 0x43, 0xb7, 0x46, 0x4e, 0xc4, 0x63, 0x9c, 0x81, + 0x72, 0x89, 0x84, 0x86, 0x62, 0x8f, 0xbe, 0xa8, 0xa2, 0x51, 0x70, 0xb8, + 0x9c, 0x2c, 0xb8, 0x50, 0x82, 0x49, 0x64, 0xb1, 0x52, 0x54, 0x6f, 0xb0, + 0x85, 0xab, 0xc5, 0x93, 0x3c, 0x7e, 0xbb, 0xa8, 0x95, 0xc7, 0xa3, 0x91, + 0x4b, 0xbc, 0x7a, 0xd4, 0x4d, 0x44, 0x6e, 0xd0, 0xbc, 0x21, 0x44, 0xdf, + 0x71, 0xb0, 0xa1, 0x8e, 0x4c, 0x68, 0x5b, 0x77, 0xc2, 0xc0, 0x70, 0xd7, + 0x32, 0xa7, 0xa6, 0x4f, 0xdb, 0x9f, 0xda, 0x94, 0xc0, 0xd0, 0x62, 0x83, + 0x51, 0x60, 0xcb, 0x56, 0xbd, 0x89, 0xac, 0x90, 0x3a, 0x80, 0x8a, 0xb9, + 0x65, 0xb3, 0x50, 0x97, 0x39, 0x52, 0x7a, 0xbe, 0x8d, 0xc2, 0x9b, 0x40, + 0xaf, 0xa0, 0x67, 0x4f, 0x43, 0xca, 0x8a, 0x6d, 0xc4, 0xad, 0x4c, 0x98, + 0x6c, 0x48, 0x93, 0x87, 0x8f, 0x4b, 0xad, 0x37, 0x80, 0xa1, 0x3d, 0x71, + 0x8c, 0x5e, 0x4f, 0xb4, 0x52, 0x83, 0xbb, 0xc5, 0xda, 0x78, 0x7d, 0x65, + 
0xda, 0x7f, 0x65, 0xb3, 0x52, 0x79, 0x90, 0xe0, 0xa3, 0x93, 0x48, 0x67, + 0x5e, 0x95, 0xb2, 0xb0, 0xce, 0x4e, 0x49, 0x72, 0x87, 0x79, 0x78, 0xa6, + 0x3c, 0x89, 0x56, 0xcb, 0xae, 0xa7, 0x76, 0x3f, 0x87, 0xd0, 0xc9, 0x50, + 0x46, 0x91, 0x52, 0x7f, 0x64, 0x60, 0xa3, 0x2f, 0x87, 0x89, 0xba, 0x5b, + 0x5d, 0x8a, 0x9e, 0x99, 0x47, 0x67, 0xa2, 0xb0, 0x78, 0x35, 0x9e, 0xc0, + 0xb7, 0x94, 0x70, 0x86, 0x8c, 0xab, 0xae, 0x67, 0xbf, 0x36, 0x52, 0x4c, + 0x76, 0x4d, 0xa1, 0x36, 0x88, 0x99, 0x7f, 0x71, 0x57, 0x7c, 0x8b, 0xb3, + 0x4f, 0x8f, 0xac, 0x6b, 0x77, 0x56, 0x4a, 0x54, 0x3c, 0x47, 0xcb, 0xca, + 0x40, 0x7a, 0xcd, 0x76, 0x92, 0x7c, 0x61, 0x93, 0xc0, 0x5c, 0xc4, 0x5a, + 0x32, 0x38, 0x56, 0xd6, 0x3e, 0x4e, 0x71, 0x62, 0xca, 0x55, 0x41, 0x87, + 0x6b, 0x8b, 0xba, 0x2c, 0x49, 0x5b, 0x71, 0x64, 0x40, 0x9c, 0xbc, 0x7e, + 0x34, 0xbd, 0x71, 0x4a, 0x7c, 0x82, 0x66, 0x87, 0x5a, 0x5d, 0x8e, 0x7d, + 0x63, 0xc2, 0x63, 0xce, 0x62, 0x47, 0xc0, 0x87, 0x38, 0x5f, 0x7b, 0xbb, + 0xbc, 0x67, 0x8f, 0x36, 0x5b, 0xc7, 0x3f, 0x4e, 0xad, 0xae, 0xd1, 0x2e, + 0x56, 0x40, 0x91, 0xba, 0x2a, 0xa3, 0x9e, 0xa3, 0x7d, 0x88, 0x92, 0x89, + 0xc5, 0xba, 0x4a, 0x3f, 0x9e, 0x5d, 0xcd, 0x6b, 0xcd, 0xa2, 0xc4, 0xa5, + 0x9e, 0xac, 0x67, 0x6f, 0x38, 0xa3, 0x83, 0x3a, 0x54, 0x82, 0xd9, 0x6f, + 0xb6, 0x53, 0x78, 0xac, 0x8f, 0x3c, 0x79, 0xc9, 0xd1, 0xc1, 0xb9, 0x2e, + 0x63, 0xca, 0xd1, 0x62, 0xc4, 0x2c, 0xa8, 0xb3, 0xc8, 0xba, 0xa7, 0xab, + 0x56, 0xcc, 0x60, 0x53, 0xc3, 0x40, 0x30, 0x8d, 0xb9, 0xb3, 0xc0, 0x7e, + 0x3b, 0xad, 0x6a, 0x4e, 0x87, 0x6d, 0x85, 0xd1, 0xbe, 0xc7, 0x93, 0xe1, + 0xa4, 0xd9, 0x6e, 0x49, 0x6b, 0x7e, 0x7e, 0x56, 0xb6, 0xc3, 0xbf, 0xb9, + 0x8b, 0xcd, 0xa5, 0x93, 0x28, 0x80, 0x2e, 0x8d, 0x49, 0x80, 0xba, 0x77, + 0x7c, 0xa6, 0x9b, 0x44, 0x7a, 0x43, 0x8e, 0x23, 0xd0, 0xb8, 0x8f, 0x8e, + 0xc0, 0xa7, 0xba, 0xcc, 0xc9, 0xab, 0xa3, 0xb4, 0x9c, 0x6f, 0xc3, 0x84, + 0x4a, 0x43, 0x3c, 0xb9, 0x9e, 0x59, 0x9c, 0xc7, 0x9f, 0x85, 0x52, 0xa4, + 0x71, 0x80, 0xd4, 0x72, 0xab, 0x83, 0x75, 0x3a, 0xce, 0xca, 0xab, 0x96, + 0x38, 0x94, 0x81, 0x46, 0xa9, 0x69, 0x5b, 0x9b, 0x7e, 0x57, 0x68, 0xa6, + 0x62, 0x77, 0x41, 0x76, 0xc9, 0x51, 0x6f, 0xa7, 0x92, 0x76, 0x5c, 0x93, + 0x88, 0x51, 0x45, 0x5c, 0x6f, 0x54, 0x48, 0x3f, 0x37, 0x79, 0x67, 0x46, + 0x9c, 0x64, 0x8e, 0x8d, 0xaf, 0x90, 0x7d, 0xa9, 0x64, 0x54, 0x70, 0xc0, + 0x47, 0x83, 0xd9, 0xc0, 0xb3, 0x6d, 0xce, 0xbc, 0x33, 0x50, 0xb9, 0x87, + 0xba, 0x69, 0x62, 0x5e, 0xa6, 0x66, 0x3c, 0x90, 0xba, 0x66, 0x7b, 0x72, + 0x88, 0xd1, 0x85, 0xbf, 0xcf, 0xb5, 0x8e, 0x5e, 0xb2, 0x5e, 0x64, 0x5b, + 0x8f, 0xce, 0x45, 0x69, 0xab, 0x5d, 0xa9, 0x53, 0xd2, 0x90, 0x8f, 0x72, + 0x9d, 0xa7, 0x1c, 0x3f, 0x4b, 0x3d, 0xb2, 0x68, 0x49, 0x8a, 0x4b, 0xc9, + 0x2f, 0x6f, 0xcf, 0x5c, 0x92, 0xb2, 0xb6, 0x1d, 0x8f, 0x3f, 0x49, 0x9b, + 0x94, 0x2d, 0x77, 0x4f, 0x8f, 0x96, 0x53, 0x51, 0x86, 0x8a, 0x3a, 0xba, + 0x44, 0x73, 0xa7, 0x63, 0x91, 0xd9, 0xc4, 0xcb, 0xc9, 0x4f, 0x43, 0xcb, + 0x86, 0xc3, 0x97, 0x3f, 0x73, 0x90, 0x89, 0x9c, 0x49, 0x51, 0x8e, 0x76, + 0x70, 0x72, 0x52, 0xb3, 0x5e, 0x86, 0x96, 0xb0, 0x31, 0x67, 0xb7, 0x6f, + 0x8a, 0x9c, 0x90, 0x81, 0x6d, 0xa4, 0x7a, 0xb0, 0x6d, 0xbc, 0x5f, 0x3b, + 0xc3, 0xd0, 0x9c, 0xc4, 0x32, 0x4f, 0x54, 0x62, 0x88, 0xa7, 0x47, 0xad, + 0xaa, 0x54, 0x45, 0xdc, 0x6a, 0x5c, 0x2e, 0x90, 0x8b, 0x99, 0xbf, 0x45, + 0x3d, 0x76, 0xc0, 0x41, 0xd1, 0x47, 0x58, 0x74, 0x86, 0x51, 0x3d, 0xbd, + 0xb7, 0x4c, 0xa0, 0x8c, 0x8a, 0x4c, 0x7e, 0x6c, 0x83, 0x9b, 0xbd, 0x38, + 0x91, 0xa9, 0x74, 0x41, 0x7d, 0xc4, 0xcb, 0xa0, 0xbe, 0x47, 0x41, 0x97, + 0x71, 0x82, 0x79, 0x99, 0x34, 0x5f, 0x9d, 0x28, 0x7a, 0x7f, 0xbd, 0x8a, + 
0x41, 0x67, 0x6a, 0x37, 0x6c, 0x5b, 0x37, 0x8d, 0x36, 0x41, 0xc4, 0x95, + 0xad, 0xa4, 0x9a, 0xb8, 0xa9, 0x49, 0x40, 0x40, 0xa0, 0xaf, 0x39, 0x88, + 0x88, 0x8d, 0xca, 0x82, 0xd3, 0x43, 0x5c, 0x84, 0xa7, 0xa7, 0x74, 0x74, + 0xb4, 0x8c, 0xc9, 0xb9, 0xb9, 0xa8, 0x57, 0x68, 0x91, 0x81, 0x37, 0xa4, + 0xab, 0xa3, 0x4d, 0x5a, 0x46, 0x2b, 0x3e, 0x4e, 0xbd, 0x68, 0x5e, 0x67, + 0x4e, 0x80, 0xb2, 0xc4, 0x87, 0x49, 0xd2, 0xad, 0x5f, 0xb5, 0x25, 0xaf, + 0x81, 0x9c, 0xbc, 0x98, 0x8f, 0x3b, 0xb9, 0xd2, 0x71, 0xc4, 0x7c, 0x53, + 0xa1, 0x63, 0x7b, 0x6e, 0x83, 0x3b, 0x4b, 0x95, 0x7a, 0x3e, 0x88, 0x6b, + 0x9d, 0xe2, 0x87, 0x66, 0xba, 0x74, 0xe4, 0x96, 0xc7, 0x97, 0x6b, 0x56, + 0xc5, 0xc6, 0x3e, 0x99, 0x39, 0x9c, 0x52, 0x6a, 0xc5, 0x7f, 0x81, 0x91, + 0x5a, 0x82, 0x30, 0x3c, 0xb6, 0x9b, 0xc8, 0x3c, 0x68, 0x3e, 0x7b, 0xb3, + 0x33, 0x70, 0xaf, 0x9c, 0xaa, 0xc6, 0xab, 0x44, 0x41, 0x75, 0xc9, 0x39, + 0x7b, 0xb8, 0xe8, 0xde, 0x54, 0x43, 0xe6, 0x5e, 0x47, 0x5a, 0xb6, 0x53, + 0xc5, 0x6f, 0x9a, 0x50, 0xd8, 0x8e, 0xa5, 0x8f, 0x50, 0x44, 0x86, 0x66, + 0x8e, 0x47, 0x9c, 0xa8, 0x4b, 0x6b, 0x53, 0x36, 0x36, 0x6f, 0xb5, 0xbf, + 0xcc, 0x60, 0x52, 0x3f, 0x40, 0xa9, 0x6a, 0x4e, 0x72, 0x6e, 0x92, 0x65, + 0xa1, 0x80, 0xc5, 0x62, 0x58, 0xa2, 0xba, 0x56, 0x3e, 0x4c, 0x5f, 0xba, + 0x65, 0xc6, 0xb3, 0xcf, 0x8b, 0x75, 0x99, 0x55, 0xc7, 0x40, 0xd1, 0x4b, + 0x62, 0x86, 0x3f, 0x5b, 0x52, 0x94, 0xab, 0x55, 0x9d, 0x59, 0xa5, 0x7a, + 0x80, 0x72, 0x54, 0x57, 0xaf, 0x57, 0xb9, 0x47, 0x7e, 0x3a, 0x80, 0x75, + 0xa1, 0xc1, 0xa0, 0x77, 0xa9, 0xbc, 0xab, 0x3d, 0xb1, 0x87, 0xc9, 0x6e, + 0x93, 0xaa, 0x57, 0x3d, 0x71, 0xb4, 0x76, 0xa7, 0xb5, 0xa9, 0xaf, 0x61, + 0xb2, 0x83, 0x55, 0x88, 0x6b, 0x30, 0x92, 0x68, 0xc6, 0xd4, 0x8b, 0x74, + 0x7c, 0x7d, 0xcb, 0xa4, 0x30, 0x57, 0x37, 0x3b, 0x94, 0x2b, 0x5f, 0x32, + 0xbc, 0xbb, 0xbd, 0x57, 0xb3, 0x69, 0xd6, 0xa8, 0x9a, 0x4e, 0x91, 0xcc, + 0x78, 0x45, 0x81, 0x88, 0x43, 0x4a, 0x70, 0xc9, 0x6d, 0x9f, 0x47, 0x73, + 0x79, 0x81, 0xa0, 0x95, 0x6c, 0x38, 0x95, 0x48, 0x50, 0x83, 0xcf, 0x94, + 0x59, 0x74, 0x80, 0x8a, 0xc2, 0xc1, 0x50, 0xce, 0x6d, 0x3f, 0x71, 0xac, + 0x6f, 0xb7, 0x4d, 0xc9, 0x80, 0x82, 0xaa, 0xab, 0xd1, 0xb0, 0xa2, 0x45, + 0x69, 0x68, 0x88, 0x42, 0xa6, 0x7d, 0xa5, 0xb3, 0x37, 0x53, 0x3c, 0x6b, + 0x7d, 0xa6, 0x39, 0x76, 0xcb, 0xb7, 0x5a, 0x71, 0x88, 0xad, 0xc6, 0x5c, + 0x59, 0xbd, 0x76, 0x49, 0x36, 0xcb, 0xaa, 0x8c, 0x39, 0x59, 0xd0, 0xcd, + 0x4f, 0xa5, 0x4b, 0x78, 0x44, 0x97, 0xa2, 0xb9, 0x68, 0xa6, 0x7e, 0x6c, + 0x95, 0x5d, 0xbb, 0x92, 0x81, 0x76, 0xa6, 0x65, 0xc2, 0x59, 0xc6, 0xa5, + 0x84, 0x8e, 0x3e, 0xa5, 0xd0, 0x94, 0x99, 0xab, 0xca, 0xb8, 0x41, 0x41, + 0x9d, 0x47, 0x4f, 0xa3, 0xbf, 0x67, 0x86, 0x61, 0xa9, 0x72, 0x83, 0x6a, + 0xd2, 0x91, 0x4f, 0xc9, 0x36, 0x9f, 0x8c, 0xbc, 0x98, 0xa6, 0x9d, 0x37, + 0x5c, 0x8f, 0x6d, 0x8a, 0x3a, 0x46, 0xcf, 0x3a, 0x99, 0x43, 0xa2, 0xa2, + 0x6e, 0x7f, 0x37, 0xb3, 0x42, 0x97, 0x5e, 0xac, 0x60, 0x62, 0xaa, 0xc8, + 0xab, 0xa2, 0x77, 0x9a, 0xc8, 0xac, 0x90, 0x5c, 0x91, 0xa8, 0x6f, 0x6f, + 0x72, 0x5d, 0xb7, 0x9c, 0xab, 0x72, 0x7e, 0x9b, 0x58, 0x81, 0x91, 0xa7, + 0x45, 0xca, 0xc5, 0x73, 0x3c, 0x34, 0x6b, 0xcf, 0x6e, 0x86, 0x5f, 0xc4, + 0x7d, 0x3f, 0x3b, 0x6e, 0xcc, 0x67, 0xaa, 0x4b, 0x59, 0xba, 0xa6, 0x87, + 0x45, 0x67, 0xa8, 0x7b, 0x4d, 0x69, 0x3c, 0x7b, 0x46, 0xb1, 0x50, 0xb4, + 0x5f, 0xc6, 0x84, 0xcc, 0x93, 0x34, 0xa7, 0x94, 0x7a, 0x77, 0x8b, 0xa3, + 0x73, 0x79, 0x48, 0xa8, 0x44, 0xb3, 0x60, 0x7c, 0xa5, 0x85, 0xcc, 0x94, + 0xd0, 0x96, 0x96, 0x8c, 0x87, 0x39, 0xa5, 0xbb, 0xa8, 0x89, 0x79, 0x8f, + 0x62, 0xd2, 0x65, 0x36, 0x8c, 0x91, 0x90, 0xc4, 0xb8, 0x49, 0x60, 0xa9, + 
0xd0, 0xbf, 0xca, 0x5f, 0x59, 0x48, 0x42, 0xad, 0xab, 0x86, 0x39, 0x99, + 0xae, 0xca, 0x4e, 0x4c, 0xc5, 0x6e, 0x51, 0x7f, 0xcc, 0xc7, 0x78, 0x45, + 0xc1, 0xbf, 0x6e, 0x78, 0x90, 0xc2, 0xbb, 0x32, 0xbe, 0x6d, 0x7b, 0x80, + 0x5d, 0x6a, 0x65, 0xad, 0x5e, 0xa0, 0x31, 0xb9, 0x4d, 0x96, 0x59, 0x5a, + 0x58, 0x69, 0x59, 0x41, 0x41, 0x92, 0x92, 0xaa, 0x43, 0xa7, 0x87, 0xcf, + 0xcb, 0x5e, 0xaa, 0x7f, 0x6f, 0x41, 0x4f, 0xd0, 0x33, 0x3d, 0x8c, 0x52, + 0x49, 0x6d, 0x92, 0xcb, 0xc7, 0x6d, 0x37, 0xd1, 0x81, 0xa1, 0xc1, 0x44, + 0x63, 0x63, 0x54, 0x69, 0x8f, 0xbb, 0xb0, 0x3f, 0x3d, 0x59, 0xaa, 0x9f, + 0xb1, 0xb5, 0x59, 0xba, 0xaf, 0x43, 0xd9, 0xd8, 0x90, 0x97, 0x95, 0x60, + 0x89, 0x83, 0x9b, 0x43, 0x8a, 0x54, 0x47, 0x6f, 0x53, 0x89, 0x3c, 0x60, + 0xa2, 0x45, 0x9e, 0xc8, 0x30, 0x58, 0x78, 0xbe, 0x86, 0x60, 0x89, 0x8a, + 0xb3, 0x36, 0x48, 0x40, 0xac, 0x52, 0xd1, 0x5a, 0x3f, 0x36, 0x67, 0x64, + 0x99, 0xa7, 0x63, 0xc2, 0x3e, 0xa3, 0x90, 0x3a, 0x64, 0xbe, 0xc1, 0x51, + 0x89, 0xb3, 0x52, 0x88, 0x71, 0xba, 0x8f, 0xb9, 0x7b, 0xa2, 0x42, 0x86, + 0x60, 0x75, 0xba, 0xd0, 0x97, 0xd2, 0xad, 0x3d, 0x84, 0x89, 0xc5, 0xd0, + 0x98, 0x5e, 0xc0, 0x56, 0x5a, 0x85, 0x43, 0xac, 0x7b, 0xa4, 0xbf, 0xc3, + 0x9b, 0x40, 0xb7, 0x77, 0xa4, 0xa9, 0xc6, 0x6b, 0x45, 0xa2, 0xa5, 0xaa, + 0x32, 0xb1, 0x82, 0x83, 0x5c, 0x50, 0x8d, 0x70, 0xb4, 0xb9, 0xb2, 0x43, + 0xc6, 0x7e, 0x58, 0x9d, 0xc4, 0x46, 0x3c, 0x75, 0x8a, 0x9a, 0x7d, 0xcd, + 0xc0, 0x89, 0x62, 0x3f, 0x5c, 0x5b, 0x71, 0x94, 0x72, 0x6d, 0x38, 0xa5, + 0xcb, 0x8f, 0x83, 0xb7, 0x48, 0x6b, 0x36, 0x72, 0x9c, 0xa6, 0x59, 0x47, + 0x70, 0x33, 0x7a, 0x65, 0x78, 0x4b, 0xc5, 0xb4, 0x87, 0x31, 0xcd, 0x45, + 0xac, 0xb9, 0x32, 0x3c, 0x71, 0x47, 0x67, 0x72, 0xa6, 0x6c, 0xbc, 0xb9, + 0x47, 0x84, 0xb6, 0x97, 0x97, 0xc4, 0x6b, 0xb1, 0x5d, 0x63, 0x7e, 0x7c, + 0x95, 0x2f, 0x85, 0xa2, 0x58, 0x9d, 0x96, 0x48, 0x75, 0x78, 0x57, 0x38, + 0x8a, 0x8b, 0x79, 0x7e, 0x58, 0xaa, 0x37, 0x3f, 0xa3, 0x4c, 0xb7, 0xc7, + 0x4a, 0x90, 0x4d, 0xac, 0x3a, 0xb5, 0x51, 0xb4, 0xc6, 0xa6, 0x3f, 0x85, + 0x49, 0x7a, 0xa0, 0x92, 0x3a, 0x37, 0xbc, 0x46, 0x32, 0x84, 0x98, 0x3c, + 0x46, 0xcd, 0x4f, 0xc4, 0x72, 0xaa, 0x37, 0xab, 0x5e, 0x40, 0x5b, 0xc5, + 0xab, 0x77, 0x53, 0x5a, 0x62, 0x3c, 0x6c, 0x43, 0x3d, 0x7b, 0xad, 0x4c, + 0x84, 0x7f, 0x6a, 0x8e, 0x5b, 0xa9, 0x44, 0xab, 0xc0, 0x79, 0x52, 0x53, + 0x69, 0x99, 0xcc, 0x33, 0x60, 0xca, 0x51, 0x50, 0x36, 0xc4, 0x3a, 0x32, + 0xad, 0xc6, 0x5a, 0xa7, 0xa2, 0xb9, 0x62, 0x34, 0x38, 0x7d, 0x98, 0xc4, + 0xce, 0x58, 0xc8, 0xb7, 0x6b, 0x71, 0x55, 0x63, 0xc7, 0x8b, 0xb8, 0x65, + 0xb6, 0xad, 0x81, 0x62, 0xc7, 0x69, 0x3a, 0x80, 0x41, 0x52, 0xcd, 0xc0, + 0xc1, 0x3e, 0xb9, 0x40, 0x70, 0xbb, 0xa9, 0xb0, 0x56, 0x40, 0xcb, 0xa5, + 0x31, 0x54, 0x67, 0x53, 0x52, 0x6d, 0xaa, 0xc7, 0x36, 0x3e, 0x80, 0x65, + 0x8b, 0x4c, 0x99, 0x7b, 0x52, 0x91, 0x50, 0x77, 0x46, 0x96, 0x5b, 0x5f, + 0xc8, 0x7e, 0x5b, 0x81, 0x4a, 0xcd, 0x61, 0x4b, 0x81, 0xb4, 0x8f, 0x51, + 0xc3, 0x75, 0x56, 0x7a, 0x42, 0xa6, 0x84, 0x3f, 0x78, 0xd2, 0xbe, 0x58, + 0x7b, 0x38, 0x97, 0x47, 0xac, 0x71, 0xd0, 0xa3, 0x83, 0xb1, 0x66, 0x6a, + 0x8b, 0x49, 0xbb, 0x48, 0x7e, 0x69, 0xb1, 0xc7, 0xa8, 0x96, 0x5b, 0x57, + 0x6a, 0x5d, 0x72, 0x56, 0xcc, 0x3e, 0x9c, 0x3f, 0x42, 0x7f, 0x7c, 0x39, + 0x39, 0xc2, 0x45, 0xbb, 0x65, 0x8a, 0x59, 0x46, 0x80, 0xb1, 0xd2, 0xb0, + 0x4b, 0xa8, 0x4b, 0xd2, 0xc1, 0x97, 0x53, 0x51, 0x59, 0x73, 0xae, 0xa4, + 0x9f, 0x80, 0x9d, 0x5b, 0xae, 0x70, 0x98, 0x49, 0x55, 0x53, 0xd0, 0xc1, + 0x63, 0x9b, 0x8a, 0x3e, 0x36, 0x4e, 0x8d, 0x95, 0xcb, 0x81, 0x57, 0x80, + 0x41, 0x79, 0xa3, 0x63, 0x41, 0x2e, 0xb1, 0x3b, 0x35, 0x6a, 0x41, 0xcd, + 
0x48, 0x8b, 0x88, 0x7c, 0x88, 0xb2, 0x39, 0xdb, 0x8c, 0x90, 0x7b, 0xbe, + 0x53, 0xbd, 0x3d, 0x54, 0x2c, 0x90, 0xd5, 0x3f, 0x93, 0x9d, 0x5e, 0xb4, + 0x31, 0x5c, 0x88, 0x9b, 0x9f, 0x4c, 0x5f, 0xbb, 0x9e, 0xa5, 0x40, 0x6f, + 0x8d, 0x69, 0x3f, 0xa7, 0xb5, 0x83, 0x30, 0x96, 0x3d, 0x5d, 0x65, 0x91, + 0x75, 0x7c, 0xc5, 0x36, 0x2e, 0xc8, 0x83, 0x86, 0x72, 0x4c, 0x64, 0x67, + 0xc5, 0xa0, 0x46, 0x6e, 0x94, 0xb6, 0xc0, 0xc6, 0x9e, 0x54, 0x67, 0xbe, + 0x81, 0x48, 0x62, 0xc0, 0x45, 0x8c, 0x40, 0x47, 0x6a, 0xbb, 0x59, 0xc9, + 0x71, 0xb8, 0x64, 0x99, 0x6c, 0xcb, 0xde, 0x47, 0x9e, 0x7d, 0xb0, 0x8f, + 0xcc, 0xa1, 0x7a, 0x50, 0xcb, 0x7e, 0x82, 0x6d, 0x69, 0x96, 0x9a, 0xaa, + 0x70, 0xbb, 0x4e, 0xc9, 0x59, 0x2e, 0xd2, 0x3d, 0x83, 0x66, 0x81, 0x58, + 0x70, 0x63, 0xd7, 0x4d, 0xa1, 0x5f, 0x6c, 0x3b, 0x6c, 0x90, 0x62, 0xb7, + 0x47, 0xa8, 0xa5, 0x4f, 0x5e, 0xd1, 0x4e, 0x89, 0x82, 0x41, 0x8a, 0x9a, + 0x82, 0x40, 0x59, 0x7b, 0x3c, 0x78, 0x7d, 0x5e, 0xb7, 0x56, 0x3e, 0x6d, + 0xdf, 0x5b, 0x62, 0xa4, 0x59, 0xb2, 0x69, 0x6b, 0xb1, 0x3d, 0xab, 0x91, + 0xc2, 0xad, 0x99, 0x8a, 0xb9, 0x6a, 0x68, 0xb0, 0x58, 0xb3, 0xa1, 0xa3, + 0xcc, 0x5b, 0x44, 0x63, 0x62, 0x83, 0x45, 0xc9, 0xa3, 0x92, 0xa5, 0x9c, + 0x88, 0xa1, 0x82, 0x80, 0x44, 0x40, 0x63, 0xce, 0x7b, 0x46, 0x8b, 0x5d, + 0x4a, 0x92, 0x55, 0x2f, 0x42, 0x8d, 0x9e, 0xce, 0x79, 0x69, 0xad, 0x57, + 0x4c, 0x6a, 0x7b, 0xc4, 0x8a, 0x68, 0xc6, 0x34, 0x51, 0xa6, 0x69, 0x6a, + 0x75, 0xb4, 0xbf, 0x64, 0x9f, 0x7f, 0xac, 0x50, 0xac, 0x88, 0xa7, 0x60, + 0x9b, 0x4a, 0xcf, 0x4b, 0x89, 0x94, 0x55, 0x8e, 0xb5, 0x86, 0x47, 0x5c, + 0x9f, 0x56, 0xc6, 0x7a, 0x5e, 0x47, 0x94, 0x96, 0xd0, 0x81, 0x8d, 0x41, + 0x78, 0xc6, 0x87, 0xcd, 0x3e, 0x7b, 0x47, 0x2f, 0x57, 0xa6, 0xc7, 0x9c, + 0x4b, 0x49, 0xb0, 0x46, 0x8d, 0xab, 0xa6, 0x7e, 0x6f, 0xad, 0x34, 0x54, + 0x57, 0xa4, 0xa1, 0x3f, 0xa8, 0xa9, 0xb6, 0x63, 0x3c, 0x3a, 0x58, 0xa4, + 0x9d, 0xd0, 0x40, 0x64, 0x6e, 0xd0, 0x3f, 0x6f, 0x44, 0x53, 0x98, 0xe4, + 0x54, 0xbb, 0x5b, 0x75, 0x7c, 0x3b, 0xc3, 0xe3, 0x6f, 0xd4, 0xbe, 0x59, + 0x67, 0x48, 0x86, 0xc0, 0x86, 0x75, 0x5e, 0xc3, 0x9d, 0x3b, 0x3b, 0x5a, + 0xa5, 0x73, 0x54, 0x77, 0x96, 0x45, 0x70, 0x9d, 0x96, 0x81, 0xc1, 0x32, + 0x52, 0x83, 0xbd, 0x76, 0x6a, 0x77, 0xa8, 0xb7, 0x44, 0x49, 0xb1, 0xa7, + 0xd0, 0x64, 0x40, 0xab, 0x72, 0x9a, 0xc8, 0x8e, 0xad, 0xa3, 0xc9, 0xaf, + 0x51, 0xcd, 0x7e, 0x36, 0x98, 0xd1, 0x51, 0x78, 0x50, 0x67, 0xc0, 0x4a, + 0xb4, 0xa6, 0x5b, 0x75, 0x72, 0x3f, 0x76, 0xb7, 0x47, 0x4b, 0x4c, 0x4a, + 0x40, 0x3e, 0xaf, 0x6e, 0x9b, 0xa3, 0x7b, 0xa9, 0x4b, 0xda, 0xa2, 0xd4, + 0x5b, 0x6c, 0x5c, 0xd8, 0xbf, 0x9f, 0xa4, 0x3a, 0x9c, 0x39, 0x6c, 0x71, + 0x59, 0xbc, 0x52, 0x7a, 0x7a, 0x40, 0x72, 0x99, 0x96, 0x25, 0x7a, 0x60, + 0xb2, 0x40, 0xa8, 0x47, 0xa8, 0x7e, 0x5d, 0x41, 0x74, 0xd2, 0xda, 0x86, + 0xd0, 0x43, 0xce, 0x75, 0xaa, 0x7d, 0x81, 0xa6, 0x52, 0xc8, 0x34, 0x3b, + 0x56, 0x64, 0x92, 0xa3, 0xc5, 0xc5, 0x94, 0x7c, 0x87, 0xb6, 0xc4, 0x82, + 0x5b, 0xba, 0xc0, 0x49, 0xad, 0x4d, 0xb3, 0x44, 0x5e, 0x8c, 0x40, 0x83, + 0x44, 0xc7, 0x96, 0x33, 0x96, 0x98, 0xa4, 0xbe, 0xbe, 0xe5, 0xbe, 0x87, + 0x71, 0x85, 0x44, 0xcc, 0x2f, 0xb8, 0x5f, 0xcb, 0xac, 0x7a, 0xbe, 0x48, + 0x88, 0x57, 0x7c, 0x6f, 0x81, 0x94, 0x2c, 0xbd, 0xc5, 0x33, 0xaa, 0x79, + 0x8b, 0x31, 0xa5, 0x6e, 0x4e, 0xb3, 0xda, 0x92, 0x52, 0x52, 0x87, 0x66, + 0x82, 0xaa, 0xbd, 0x50, 0xb5, 0xb1, 0xcd, 0xb6, 0x9a, 0x75, 0x75, 0xbb, + 0x34, 0xa0, 0xaf, 0x9a, 0xa6, 0x78, 0x4f, 0x64, 0x4b, 0xab, 0xb2, 0x99, + 0x41, 0xca, 0x36, 0xac, 0xc0, 0xd8, 0xb8, 0x8b, 0x88, 0xaa, 0xa2, 0x5d, + 0x79, 0xc2, 0x5c, 0xae, 0x7b, 0x6f, 0x80, 0xc3, 0xac, 0x64, 0x49, 0x95, + 
0xc0, 0x39, 0xc3, 0x5b, 0xac, 0x86, 0xb8, 0xba, 0xb9, 0x8c, 0xd5, 0xbd, + 0xb5, 0x5f, 0xd8, 0x90, 0x71, 0x8a, 0xad, 0x84, 0x45, 0x3e, 0x35, 0x38, + 0xa7, 0x53, 0x4b, 0x7c, 0x33, 0x97, 0x99, 0x4c, 0x2d, 0x7c, 0xc8, 0x3f, + 0xa3, 0x50, 0x2e, 0x62, 0xad, 0xb0, 0xbd, 0x77, 0x9f, 0x9c, 0x3a, 0x58, + 0xb2, 0xd9, 0xc3, 0xa9, 0xba, 0x67, 0x33, 0x77, 0x47, 0x91, 0xb1, 0x7d, + 0x67, 0x77, 0xae, 0xd8, 0x3b, 0x53, 0x7e, 0x90, 0x76, 0x90, 0x44, 0x8b, + 0x34, 0x5a, 0x94, 0xc2, 0x4f, 0x6c, 0xd5, 0x8a, 0xb9, 0x52, 0x91, 0x78, + 0x85, 0xc4, 0x76, 0x79, 0x4f, 0x68, 0x88, 0x86, 0x72, 0x7e, 0x5f, 0xaa, + 0x57, 0x67, 0xa3, 0x8d, 0x93, 0x59, 0x2d, 0x88, 0xa3, 0x9a, 0x7d, 0x6a, + 0xbe, 0xb4, 0x37, 0x94, 0x92, 0x66, 0xba, 0x5d, 0x60, 0x8e, 0x42, 0x3f, + 0x36, 0x57, 0x65, 0xab, 0x97, 0xda, 0x62, 0x97, 0xd1, 0x3e, 0xcf, 0x51, + 0x43, 0xbf, 0x84, 0x6b, 0x3a, 0x9e, 0xcd, 0x9e, 0x88, 0x65, 0x4f, 0xa8, + 0x77, 0xc9, 0x41, 0x46, 0xb6, 0x7c, 0xbe, 0xd2, 0x79, 0x39, 0xb3, 0xcc, + 0x84, 0x6b, 0x32, 0x80, 0x37, 0xb6, 0xc2, 0xb1, 0x6f, 0xa7, 0xb2, 0xc2, + 0x5d, 0x67, 0xc5, 0x58, 0xc6, 0x59, 0xc4, 0x7a, 0xb7, 0x81, 0x92, 0x4f, + 0x8f, 0x29, 0xa6, 0xd0, 0x7b, 0x43, 0xd2, 0x50, 0xc4, 0xd0, 0x8c, 0x44, + 0x66, 0xd0, 0xa9, 0x63, 0x67, 0x47, 0x8d, 0x38, 0x64, 0x66, 0xba, 0x45, + 0x6c, 0x98, 0xc8, 0x84, 0x79, 0xaf, 0x84, 0x6e, 0x47, 0x68, 0x98, 0xc0, + 0x81, 0x88, 0x71, 0xa6, 0x9b, 0x70, 0x36, 0x62, 0x78, 0x5e, 0x66, 0x91, + 0xbc, 0xc6, 0xcf, 0xac, 0x2d, 0x9b, 0xb1, 0xbf, 0xb6, 0x3e, 0x9e, 0x35, + 0x73, 0x81, 0xb6, 0xbe, 0xb7, 0x9c, 0x48, 0xc2, 0x40, 0xbc, 0x5a, 0xbf, + 0x8f, 0xbb, 0x4a, 0x6e, 0x34, 0x6a, 0x9a, 0x58, 0x67, 0xbe, 0xa3, 0xce, + 0x40, 0x9c, 0x37, 0xb6, 0xb9, 0x55, 0xbc, 0x7a, 0x67, 0xcd, 0x50, 0xb8, + 0xb5, 0xc9, 0xb3, 0x7b, 0x94, 0xa6, 0x68, 0xc1, 0x8e, 0x4b, 0x38, 0xa0, + 0xc1, 0x83, 0xab, 0xd2, 0x5c, 0x2e, 0x9e, 0x7d, 0x77, 0x77, 0x56, 0x6a, + 0x37, 0x3c, 0xba, 0xc7, 0x70, 0x54, 0x86, 0x64, 0xc0, 0xa7, 0x71, 0x7a, + 0xbc, 0x36, 0x34, 0x81, 0x61, 0xa6, 0x67, 0xbe, 0x83, 0x75, 0x37, 0x92, + 0x72, 0xa9, 0x46, 0xa1, 0xc5, 0x76, 0xc1, 0x45, 0xd5, 0x77, 0x77, 0x82, + 0x5f, 0xa9, 0x6b, 0x2f, 0xac, 0x4a, 0xc9, 0xc9, 0xc1, 0x7e, 0x99, 0x53, + 0x4d, 0x3e, 0x97, 0x78, 0xa2, 0xcb, 0x3d, 0x5c, 0xad, 0x86, 0xbe, 0x7c, + 0x77, 0x79, 0x9c, 0xb9, 0x98, 0x68, 0x6c, 0xb2, 0x81, 0x62, 0x53, 0xbe, + 0x79, 0x7c, 0x6e, 0x6e, 0x95, 0x41, 0x36, 0x64, 0x74, 0x87, 0x51, 0xad, + 0x4e, 0xc1, 0x7d, 0xa7, 0x73, 0x35, 0xae, 0x8f, 0x4c, 0xca, 0xd3, 0xb7, + 0xc6, 0xcb, 0x8f, 0x3d, 0x79, 0x4c, 0x67, 0x7b, 0xc7, 0x48, 0x89, 0x58, + 0xc6, 0x37, 0x89, 0x38, 0x69, 0x9f, 0xbf, 0xa2, 0x3f, 0x82, 0x8c, 0x4c, + 0xd1, 0x9b, 0x5e, 0xae, 0x46, 0x9b, 0x84, 0x77, 0x79, 0x6e, 0x5a, 0x79, + 0xb4, 0x6a, 0xa5, 0x93, 0x63, 0x98, 0x75, 0x88, 0x3a, 0xd0, 0xb7, 0xba, + 0x4a, 0x8e, 0xa1, 0xa2, 0x94, 0xd4, 0x71, 0x61, 0x91, 0xd1, 0xb6, 0xbb, + 0x86, 0xce, 0x6c, 0x7e, 0xbe, 0x5d, 0x9f, 0xae, 0x8c, 0xc0, 0xd0, 0x46, + 0xaa, 0x65, 0xa1, 0x92, 0x3e, 0x5b, 0xc7, 0x33, 0x8b, 0x32, 0x79, 0x99, + 0x9f, 0xd2, 0xc8, 0x90, 0xd6, 0x83, 0x7f, 0x65, 0xac, 0x7c, 0x49, 0x8b, + 0x9a, 0x60, 0x8b, 0xad, 0x81, 0x31, 0x5b, 0xa7, 0x96, 0x67, 0x67, 0xa4, + 0xad, 0xa1, 0x56, 0x65, 0x44, 0x8d, 0x4e, 0x64, 0x9e, 0x72, 0x6b, 0x6e, + 0xc7, 0xbe, 0x64, 0x9b, 0x41, 0x6c, 0xb5, 0xb1, 0x49, 0x32, 0x40, 0x58, + 0x84, 0x5e, 0x4e, 0xbe, 0xbd, 0x8c, 0xa7, 0x52, 0xbc, 0xde, 0xad, 0xa8, + 0x76, 0x8d, 0x68, 0xc5, 0x6c, 0x85, 0x84, 0x55, 0x6c, 0x44, 0x82, 0xb5, + 0xa3, 0x46, 0xbb, 0x87, 0x43, 0x56, 0x42, 0x7e, 0xd0, 0x4f, 0x70, 0xbb, + 0xaa, 0x46, 0xd5, 0x5e, 0x9f, 0x95, 0x5a, 0x3b, 0x7a, 0xbd, 0x4e, 0x6f, + 
0x5c, 0xa4, 0x38, 0x49, 0x73, 0x62, 0xcc, 0x8f, 0x98, 0xb9, 0xc1, 0x61, + 0x97, 0xb9, 0x76, 0x84, 0x64, 0x4b, 0x74, 0xd2, 0xbb, 0xa7, 0x9f, 0x56, + 0xc1, 0xae, 0x67, 0x55, 0x6a, 0xbb, 0x5d, 0x53, 0x9f, 0xb6, 0x7f, 0x84, + 0x33, 0x37, 0x72, 0xbf, 0x3d, 0xd1, 0xcd, 0xa3, 0x67, 0x5f, 0x8f, 0x5d, + 0x51, 0xc9, 0x91, 0xd7, 0xb5, 0x3a, 0xd9, 0x7e, 0x32, 0x75, 0xb7, 0x48, + 0x55, 0xb3, 0x66, 0x82, 0xb1, 0x95, 0xa2, 0x8a, 0x9b, 0x9c, 0xad, 0xa3, + 0x36, 0x3d, 0x5e, 0x9b, 0x78, 0xb8, 0x6c, 0x7f, 0xca, 0x51, 0x6e, 0x9a, + 0xb1, 0x7a, 0x81, 0x39, 0x9a, 0x3d, 0xd1, 0x9d, 0x68, 0xd0, 0x6d, 0x56, + 0x50, 0x76, 0x7a, 0xc6, 0xb3, 0xaf, 0x5c, 0xb9, 0x5a, 0x99, 0xb4, 0x80, + 0xb1, 0x6d, 0x85, 0x40, 0x7a, 0xb7, 0x5f, 0x39, 0x36, 0x5f, 0xa1, 0xc2, + 0x5e, 0xc1, 0x93, 0x33, 0x97, 0x3b, 0xd6, 0x5d, 0xa5, 0x75, 0xd5, 0x9a, + 0x8b, 0x49, 0x9c, 0xb3, 0x54, 0x56, 0x51, 0x90, 0xbf, 0xc6, 0x84, 0x40, + 0x77, 0x3a, 0xbb, 0x73, 0xc2, 0x9f, 0xc5, 0x2f, 0x95, 0x84, 0xb1, 0x68, + 0x6f, 0xc6, 0x8a, 0x48, 0x69, 0xba, 0x3b, 0x82, 0x41, 0x7a, 0x99, 0x54, + 0x64, 0x58, 0xb7, 0x9b, 0x5b, 0x86, 0x52, 0x4e, 0xb2, 0x63, 0x4b, 0x4f, + 0xa1, 0xba, 0x44, 0x5e, 0x72, 0xa6, 0x5f, 0x91, 0x45, 0x5b, 0x6e, 0x61, + 0xcb, 0x9a, 0x60, 0x76, 0x3f, 0x92, 0x2a, 0x9f, 0x2e, 0x42, 0x40, 0xb6, + 0x96, 0x3f, 0xca, 0x2d, 0x3e, 0x8b, 0x58, 0x62, 0x39, 0x3d, 0x7a, 0x33, + 0x6d, 0x32, 0x71, 0x56, 0xb9, 0xa5, 0x4c, 0x3e, 0xdb, 0x42, 0x89, 0xb6, + 0x87, 0x58, 0xb5, 0x71, 0x5a, 0xbc, 0x4f, 0xa5, 0x71, 0x55, 0xb6, 0xb7, + 0x67, 0x61, 0x3f, 0xbb, 0x8f, 0xb7, 0x51, 0x7d, 0x2e, 0x9e, 0xba, 0x86, + 0xc7, 0x3c, 0x37, 0x3e, 0x59, 0xcc, 0xd6, 0xc5, 0x5d, 0x6e, 0x84, 0xcb, + 0xab, 0xb7, 0x99, 0xa4, 0x98, 0x9a, 0x5d, 0xba, 0xd2, 0x5f, 0x58, 0x46, + 0x21, 0x62, 0x35, 0x78, 0xb2, 0x70, 0x4f, 0x89, 0x8d, 0xac, 0x6a, 0xcb, + 0x4e, 0x72, 0x34, 0x9e, 0x6b, 0x60, 0xbf, 0x6d, 0x68, 0xa9, 0xd0, 0x4f, + 0x7a, 0x37, 0xc1, 0xa0, 0x48, 0x8d, 0x8b, 0x5d, 0x96, 0x5b, 0xce, 0xb5, + 0x98, 0xb6, 0xc7, 0x62, 0x58, 0xce, 0x3b, 0x6b, 0x96, 0x89, 0x43, 0xbc, + 0x59, 0x59, 0xaa, 0x5b, 0x65, 0xac, 0x55, 0xa9, 0x68, 0x93, 0x31, 0xaf, + 0x3b, 0xbd, 0x5f, 0x9d, 0x80, 0x7f, 0x93, 0xcb, 0x4f, 0x5b, 0x65, 0x95, + 0x83, 0x99, 0x91, 0xac, 0x9a, 0xa7, 0x58, 0xdb, 0xb8, 0x6b, 0x89, 0xad, + 0xbb, 0x6b, 0x91, 0xcc, 0x80, 0x4a, 0x96, 0x3d, 0x43, 0x93, 0x3b, 0x73, + 0x56, 0x47, 0x7c, 0x8a, 0x97, 0xa7, 0x50, 0x3e, 0xbc, 0x47, 0xbe, 0x91, + 0x9f, 0xa7, 0x8a, 0x45, 0x54, 0x87, 0x50, 0x71, 0xcd, 0x34, 0x76, 0x50, + 0x57, 0x79, 0x5c, 0x7e, 0x89, 0xc7, 0x90, 0x66, 0xc3, 0xd9, 0x69, 0x9b, + 0x77, 0x44, 0x9b, 0x42, 0x63, 0x53, 0x9c, 0x6e, 0x64, 0x4e, 0x5d, 0x7a, + 0x86, 0x58, 0x8c, 0xd4, 0xa4, 0xa8, 0x71, 0xcc, 0x56, 0x7b, 0x62, 0x2b, + 0x91, 0xd1, 0xcf, 0x8a, 0xbf, 0x97, 0xa5, 0xa0, 0xa0, 0xb4, 0x56, 0x79, + 0xa7, 0x69, 0x43, 0xaf, 0xd4, 0x43, 0x5b, 0x46, 0x39, 0xc5, 0x90, 0x7a, + 0x91, 0x89, 0x4e, 0xcc, 0xcb, 0x99, 0x7b, 0x89, 0x61, 0xaf, 0x4d, 0x71, + 0xb8, 0x5c, 0xb7, 0x75, 0x49, 0xa8, 0x8e, 0x3c, 0x7a, 0x8e, 0x42, 0x36, + 0x60, 0xc0, 0xad, 0x3b, 0xbf, 0x56, 0x40, 0x7f, 0xd3, 0xb2, 0x5d, 0x76, + 0x34, 0x6d, 0x8f, 0xd2, 0x53, 0x76, 0x7a, 0x7a, 0x6a, 0x4c, 0x34, 0x70, + 0x8f, 0x7e, 0x91, 0x3a, 0x43, 0xa3, 0x6c, 0x9c, 0xd4, 0x92, 0xab, 0x43, + 0x91, 0x7f, 0x78, 0x82, 0x63, 0x70, 0xba, 0xd1, 0xaa, 0x86, 0x8e, 0x8a, + 0xbf, 0x3e, 0x54, 0xd6, 0x78, 0xc4, 0x7d, 0xd7, 0xc1, 0x95, 0xd6, 0x5a, + 0x88, 0x33, 0xa0, 0x40, 0x77, 0x4c, 0x4d, 0x76, 0x4c, 0x97, 0x8d, 0x42, + 0x40, 0x3e, 0x78, 0x9b, 0x5b, 0x23, 0xa4, 0x36, 0x3f, 0xb5, 0xa6, 0x2e, + 0xc1, 0x5d, 0x47, 0x8b, 0x91, 0x5a, 0x9b, 0xb7, 0x90, 0x63, 0xac, 0x95, + 
0xd1, 0x75, 0x6b, 0x35, 0x6a, 0x67, 0xe3, 0x56, 0xc5, 0xbf, 0x6d, 0x4f, + 0x33, 0xa9, 0x7b, 0x88, 0x49, 0xa5, 0x95, 0xab, 0x89, 0xc5, 0x76, 0x51, + 0x44, 0x7c, 0x9a, 0xcd, 0x40, 0x75, 0xc9, 0xde, 0x58, 0x29, 0x76, 0xa6, + 0x67, 0x41, 0x68, 0x87, 0x50, 0xa2, 0x61, 0x8d, 0x6c, 0xa1, 0x38, 0x58, + 0x95, 0xd5, 0x54, 0xad, 0x87, 0x87, 0x31, 0x3f, 0x62, 0xc8, 0x70, 0x70, + 0xdb, 0xa3, 0xcc, 0x91, 0xbd, 0x3f, 0x46, 0xb0, 0x9b, 0x5f, 0xb8, 0xad, + 0x41, 0x8c, 0x6d, 0xaa, 0xb3, 0xa1, 0x42, 0xce, 0x4e, 0x51, 0x3b, 0x5a, + 0x50, 0x71, 0xc0, 0x99, 0x86, 0x3c, 0x8f, 0xc4, 0x45, 0x4a, 0xa1, 0xbd, + 0x4e, 0x9a, 0x64, 0x92, 0x8c, 0x5a, 0xcc, 0x72, 0x5e, 0x3b, 0xb2, 0x3d, + 0xd8, 0x5b, 0x6a, 0x5b, 0x6b, 0x41, 0xa4, 0xc3, 0x49, 0x53, 0x65, 0x6c, + 0x4c, 0x9f, 0x81, 0x3f, 0x42, 0x68, 0x75, 0xc1, 0x41, 0x2b, 0x73, 0x98, + 0x52, 0xcb, 0x9c, 0x3c, 0x48, 0x57, 0x67, 0xd2, 0x51, 0xcc, 0xa2, 0xca, + 0x71, 0xbc, 0x3a, 0x78, 0xbf, 0xb4, 0x33, 0xd7, 0x6d, 0xca, 0x9a, 0x7a, + 0x7f, 0xa2, 0xa2, 0x85, 0x67, 0x63, 0xaa, 0x4a, 0x8e, 0x70, 0x49, 0x74, + 0x5f, 0xa8, 0x71, 0x38, 0xb8, 0x46, 0x3a, 0x8a, 0x63, 0x65, 0x94, 0x4c, + 0xd3, 0x69, 0x82, 0x41, 0x99, 0x29, 0xd4, 0x54, 0xad, 0x22, 0xb7, 0x52, + 0x3f, 0x8e, 0x46, 0x87, 0x86, 0x68, 0xcc, 0x9d, 0x51, 0x59, 0xaa, 0x3c, + 0x3b, 0xb6, 0x58, 0x58, 0x70, 0x71, 0xa2, 0x78, 0xe0, 0x4d, 0xaa, 0x60, + 0xa0, 0xb7, 0x9b, 0x48, 0x7e, 0x74, 0xb6, 0x7e, 0x27, 0x4e, 0x97, 0x91, + 0x5c, 0xd2, 0x35, 0x92, 0xbe, 0x5f, 0x52, 0x8f, 0xce, 0x54, 0x53, 0xb9, + 0x74, 0x87, 0xb6, 0x8c, 0xad, 0xae, 0x99, 0x59, 0xc9, 0x5c, 0xc9, 0x4f, + 0xa0, 0x5c, 0xa2, 0xbc, 0x67, 0x9b, 0xa4, 0x8a, 0xc0, 0x51, 0x5a, 0xd2, + 0xab, 0xa4, 0x6b, 0x33, 0x95, 0x65, 0x7c, 0x41, 0xcc, 0xb7, 0x45, 0xc3, + 0xc1, 0xaf, 0xc1, 0xbf, 0x5c, 0x80, 0x9b, 0x7d, 0x7e, 0xb9, 0x40, 0x72, + 0x72, 0xb9, 0x7f, 0xc5, 0x99, 0x89, 0x92, 0xa1, 0xb6, 0x5e, 0x45, 0x77, + 0xa8, 0xb1, 0xc1, 0x3f, 0xad, 0x49, 0xcb, 0x5e, 0x86, 0x45, 0x90, 0x67, + 0x5a, 0xa4, 0x78, 0xa4, 0x70, 0x5f, 0x5a, 0x5d, 0xc8, 0xb4, 0xb3, 0xa5, + 0xab, 0x58, 0x55, 0xa7, 0x45, 0x8a, 0x9a, 0xb1, 0xb2, 0x9d, 0x92, 0xbd, + 0xd3, 0x72, 0x47, 0x64, 0xb5, 0xc9, 0x2c, 0x7b, 0x6a, 0xc5, 0x4d, 0x9a, + 0xba, 0x65, 0x83, 0x6a, 0x8e, 0x49, 0x39, 0x96, 0xbc, 0x5e, 0xa1, 0xb1, + 0xbb, 0xab, 0x7a, 0x3a, 0xdb, 0x78, 0x48, 0x80, 0x95, 0x96, 0x71, 0x98, + 0x3a, 0x33, 0x3f, 0xa2, 0x5b, 0x55, 0x77, 0x5c, 0xa1, 0x4a, 0x9c, 0xc1, + 0x36, 0xb6, 0xbd, 0x95, 0x4b, 0xb1, 0xc4, 0x76, 0xb1, 0x35, 0xb3, 0xb4, + 0xb8, 0xe0, 0x73, 0x71, 0x33, 0x71, 0x7f, 0x7a, 0x92, 0x6f, 0x5d, 0xc6, + 0xba, 0xd8, 0x40, 0xb1, 0x3a, 0x8d, 0xaf, 0xc1, 0x78, 0x3d, 0x59, 0xdb, + 0xa7, 0x73, 0xc7, 0x82, 0x77, 0xc5, 0xbd, 0x88, 0xcd, 0xce, 0x30, 0x37, + 0x88, 0x45, 0x27, 0x5f, 0xbf, 0xc9, 0x47, 0x3e, 0xa2, 0xaa, 0x54, 0x3c, + 0xc3, 0xc5, 0x51, 0x56, 0x7c, 0x47, 0xb1, 0x7b, 0xc8, 0x7e, 0xcd, 0xd3, + 0x3d, 0x53, 0x73, 0x36, 0xd5, 0xac, 0x84, 0x6e, 0x67, 0xae, 0x46, 0x49, + 0xd3, 0x4f, 0x5f, 0xc1, 0xaa, 0x56, 0x5b, 0x61, 0x70, 0x45, 0x6e, 0x52, + 0xbf, 0x9e, 0xb1, 0x30, 0xa0, 0xc8, 0x71, 0xbd, 0x3b, 0xd9, 0xca, 0x57, + 0x7e, 0xa4, 0x90, 0xd6, 0xc3, 0xbc, 0x35, 0x94, 0x5c, 0x8b, 0xcf, 0xbd, + 0x5c, 0xb3, 0x50, 0xa3, 0x61, 0xa4, 0x44, 0x52, 0x91, 0x42, 0x54, 0xb4, + 0x5f, 0x9d, 0x78, 0xc8, 0x56, 0x4d, 0x4f, 0x87, 0x7f, 0x55, 0x70, 0x86, + 0xc6, 0xbb, 0x33, 0x67, 0xa7, 0x5e, 0xc4, 0x95, 0x57, 0x67, 0x7b, 0xcf, + 0x7a, 0x75, 0xc9, 0xb7, 0x69, 0x68, 0x7e, 0xb2, 0xa4, 0x64, 0xa5, 0xca, + 0x43, 0x9a, 0x99, 0x4e, 0x7f, 0x6e, 0x4a, 0x3d, 0x7d, 0x85, 0xa9, 0x8b, + 0x80, 0xc3, 0x59, 0x99, 0x3a, 0x49, 0x4d, 0x93, 0xc8, 0x75, 0xc6, 0x4d, + 
0x55, 0xcd, 0x9b, 0xaa, 0x52, 0xa9, 0x8b, 0x45, 0x87, 0xc7, 0x52, 0xa8, + 0x62, 0xa2, 0x9a, 0x60, 0xb1, 0x94, 0x87, 0x91, 0xd1, 0x71, 0x90, 0x84, + 0xcc, 0xb7, 0x41, 0xb3, 0x6e, 0x51, 0xa0, 0x83, 0x56, 0xa5, 0x82, 0xb1, + 0x77, 0xaf, 0x98, 0xad, 0xa5, 0x8c, 0xa8, 0x97, 0x32, 0xc2, 0x7c, 0xaa, + 0x4b, 0x95, 0x9a, 0x7b, 0x5e, 0xa3, 0xb1, 0x62, 0xa6, 0xaa, 0xc1, 0x8b, + 0x5d, 0x6b, 0x42, 0x4d, 0xe1, 0x60, 0x29, 0x4f, 0xbc, 0x8a, 0x5c, 0x53, + 0x7a, 0x8d, 0xb5, 0xb1, 0x60, 0xb9, 0x99, 0xc8, 0xa9, 0xe1, 0x4c, 0x37, + 0xc8, 0x83, 0x3b, 0x7f, 0xb0, 0xd0, 0xb0, 0x36, 0xb9, 0xc2, 0x6c, 0xca, + 0x68, 0xbe, 0x8a, 0x7d, 0x81, 0x4f, 0x40, 0xa7, 0x3e, 0xbe, 0xbd, 0xa5, + 0x91, 0x4c, 0x51, 0x97, 0x9a, 0x60, 0xa8, 0x94, 0xc8, 0x84, 0x87, 0x3b, + 0x44, 0x8e, 0x5a, 0x95, 0x35, 0x4c, 0x4e, 0x50, 0xc2, 0xbe, 0xe5, 0x6b, + 0x8e, 0xab, 0x4f, 0xb4, 0xb7, 0xc7, 0x75, 0x3c, 0x60, 0xc7, 0x4b, 0x76, + 0xc7, 0xcd, 0x88, 0xbd, 0x38, 0x94, 0x89, 0x8c, 0xc3, 0x5e, 0x59, 0x36, + 0x45, 0xa8, 0xba, 0x92, 0x62, 0x61, 0x8f, 0x65, 0x5b, 0x4e, 0xa2, 0xbd, + 0xae, 0x98, 0x80, 0xb0, 0x3e, 0x84, 0x29, 0x90, 0xd6, 0x4a, 0x44, 0x76, + 0x91, 0x50, 0xcd, 0x67, 0xa4, 0x9b, 0x73, 0x5f, 0x55, 0x81, 0xaa, 0x90, + 0x58, 0xc7, 0x90, 0xc6, 0x85, 0xb5, 0xae, 0x88, 0x93, 0x6d, 0xab, 0x39, + 0xb0, 0xae, 0x38, 0xc2, 0xc6, 0x4c, 0x4f, 0x4a, 0x2c, 0xa5, 0x6a, 0xaf, + 0xb8, 0x71, 0x7d, 0x38, 0x4c, 0x8e, 0x97, 0x64, 0xc3, 0xa9, 0x53, 0xd9, + 0xd2, 0x98, 0x7b, 0xc0, 0xca, 0x6a, 0xd1, 0xdb, 0x67, 0x7d, 0x65, 0x88, + 0xd7, 0x99, 0xba, 0xae, 0x48, 0x3c, 0x59, 0xbe, 0xbc, 0x37, 0xa1, 0x9d, + 0x81, 0xa2, 0xbc, 0xb6, 0xe6, 0x97, 0x95, 0x8d, 0x45, 0x9c, 0xb6, 0xb9, + 0xd2, 0x3a, 0xa4, 0xc6, 0x48, 0x44, 0x39, 0xb3, 0x62, 0xa0, 0x47, 0x89, + 0x66, 0x44, 0x97, 0x68, 0x57, 0xaf, 0x51, 0x40, 0x4c, 0x6d, 0x80, 0x79, + 0x84, 0xb3, 0xb8, 0xca, 0x34, 0x85, 0x87, 0x85, 0xbc, 0x66, 0x72, 0xb1, + 0x72, 0x3e, 0xa2, 0x64, 0x6d, 0x5c, 0x64, 0x4e, 0x8d, 0xa8, 0x3f, 0xca, + 0x63, 0xa1, 0x56, 0xb2, 0x40, 0xac, 0x3c, 0x53, 0xb3, 0xc1, 0xb8, 0x32, + 0xcb, 0x80, 0x70, 0x50, 0xd2, 0xa0, 0x8f, 0x6e, 0x7e, 0xb7, 0xbf, 0xcb, + 0x85, 0x39, 0xa7, 0x63, 0x86, 0x65, 0x84, 0x9c, 0x7c, 0x62, 0x51, 0x8f, + 0x6a, 0x3e, 0x53, 0xcd, 0xa6, 0x7c, 0xcb, 0xa9, 0x8d, 0x44, 0x68, 0x79, + 0x83, 0x7d, 0x6c, 0x77, 0x33, 0x3e, 0xb3, 0x56, 0x41, 0x9e, 0x94, 0xd3, + 0x39, 0x69, 0x93, 0x9f, 0xc9, 0x75, 0xca, 0x7a, 0x36, 0x80, 0x75, 0x7d, + 0x40, 0xbf, 0x44, 0xcb, 0x6a, 0x28, 0x89, 0x25, 0x82, 0x56, 0x63, 0x68, + 0xcf, 0x80, 0xb1, 0x9a, 0x9c, 0x7a, 0x4c, 0x67, 0x49, 0xc1, 0x22, 0x60, + 0x93, 0xa3, 0x61, 0x6a, 0x88, 0xb0, 0xa1, 0x51, 0x1e, 0xa2, 0x85, 0xc1, + 0xb8, 0x60, 0x72, 0xd7, 0x7c, 0x40, 0x60, 0xbc, 0x69, 0x72, 0x4b, 0x6a, + 0x48, 0x68, 0xb7, 0x8a, 0xbd, 0x5d, 0x56, 0x79, 0x9f, 0x3d, 0x72, 0xc9, + 0x63, 0x6f, 0xb0, 0xb9, 0xe2, 0x34, 0x8e, 0xd0, 0x74, 0x7c, 0x67, 0x4c, + 0x7b, 0x3a, 0xa6, 0x68, 0x82, 0x4e, 0x9f, 0x5b, 0x3c, 0x97, 0x95, 0x6b, + 0xcf, 0xb7, 0xb3, 0x68, 0xce, 0xd6, 0xcd, 0x88, 0x3e, 0x64, 0x54, 0x98, + 0xca, 0xc3, 0x39, 0x8e, 0xcf, 0xaa, 0x79, 0x94, 0x98, 0xba, 0xbd, 0x43, + 0x90, 0x8e, 0x54, 0x62, 0x4f, 0x7b, 0xa3, 0x51, 0x5d, 0x66, 0xb7, 0xb7, + 0xaf, 0x87, 0xb1, 0x53, 0x34, 0x78, 0xcf, 0xb1, 0x64, 0x99, 0x67, 0xcb, + 0xdb, 0x9b, 0xbe, 0xd2, 0x8c, 0x91, 0x55, 0x6e, 0x95, 0xa5, 0x64, 0x5e, + 0x60, 0x9b, 0xb6, 0x64, 0x83, 0x85, 0x77, 0x70, 0x39, 0x77, 0x2b, 0xc5, + 0xbc, 0x6b, 0x93, 0x6a, 0x4f, 0x99, 0xbe, 0xcb, 0x7a, 0x75, 0x60, 0x60, + 0xc3, 0x38, 0x95, 0x8b, 0xb8, 0x7a, 0x7c, 0xbf, 0x6d, 0x6e, 0x57, 0xc7, + 0x9e, 0x5f, 0xae, 0x9f, 0x60, 0x90, 0xcb, 0x78, 0xce, 0x89, 0xe3, 0x39, + 
0xd2, 0xb2, 0xcb, 0x4f, 0x61, 0x42, 0xb5, 0xbb, 0xba, 0xbc, 0xa9, 0x7b, + 0x78, 0xc8, 0xd1, 0xa0, 0x91, 0xb2, 0xa2, 0xa9, 0xbd, 0xb6, 0xa2, 0x46, + 0xba, 0x4f, 0x72, 0xb4, 0xb9, 0xc9, 0xaf, 0x3f, 0x82, 0x2d, 0x68, 0xa7, + 0x2e, 0x90, 0xb5, 0x7a, 0x37, 0x6f, 0x34, 0xa7, 0x40, 0x67, 0x3a, 0xb6, + 0xa0, 0x90, 0x5c, 0xcf, 0xb1, 0x39, 0x7c, 0x88, 0x9f, 0x30, 0x88, 0x5e, + 0xb3, 0x96, 0x91, 0x92, 0xad, 0xa2, 0x41, 0x45, 0x8a, 0xc8, 0xc6, 0x74, + 0x62, 0x61, 0x79, 0xca, 0x4a, 0x8f, 0xd8, 0x59, 0xcc, 0x65, 0xac, 0xa7, + 0x93, 0x85, 0xa7, 0x6c, 0xaf, 0x9b, 0x78, 0x47, 0xa0, 0x40, 0x3f, 0x8f, + 0x72, 0xdc, 0x71, 0x4e, 0xa5, 0x68, 0x63, 0x6f, 0x6b, 0x78, 0x46, 0xc1, + 0x9a, 0x96, 0xd6, 0x92, 0x3b, 0x80, 0x8f, 0xd0, 0x69, 0xce, 0x5b, 0xab, + 0xcb, 0x99, 0xa0, 0x41, 0xd5, 0xd8, 0x35, 0xbc, 0xf4, 0xd2, 0xaf, 0xb1, + 0x43, 0x92, 0x47, 0x77, 0xbf, 0x39, 0xa0, 0x79, 0x4b, 0x48, 0x64, 0xa7, + 0x9e, 0x49, 0xa8, 0x9a, 0xbc, 0x55, 0xd1, 0x79, 0xd4, 0xa7, 0x74, 0x30, + 0x8a, 0xdc, 0x80, 0x75, 0x91, 0x5f, 0x55, 0x4d, 0x55, 0x30, 0x64, 0x90, + 0x7a, 0xc9, 0x97, 0xa0, 0x7f, 0x54, 0x3d, 0xb8, 0x99, 0x4f, 0xcb, 0x47, + 0x69, 0xa6, 0x65, 0x5b, 0x4f, 0xac, 0x57, 0x3a, 0x91, 0x69, 0x80, 0x4b, + 0xa5, 0xb1, 0x44, 0x7a, 0xbd, 0xc2, 0xc6, 0x38, 0x8c, 0x63, 0x54, 0x4b, + 0x7b, 0xb0, 0x8e, 0x3e, 0x64, 0xbb, 0xcd, 0x43, 0x73, 0x7d, 0x6b, 0x5c, + 0x38, 0x3b, 0xbd, 0x3e, 0xa5, 0xb1, 0x66, 0xa0, 0x77, 0xab, 0x84, 0x7a, + 0x81, 0x31, 0x97, 0x45, 0xb8, 0xc9, 0x34, 0x67, 0xc3, 0xcd, 0xc0, 0xca, + 0x3a, 0x71, 0x4d, 0xd4, 0x45, 0xb5, 0x8c, 0x3b, 0x6a, 0xe2, 0x9d, 0x3e, + 0x96, 0xc3, 0xbe, 0xb9, 0xbe, 0x98, 0xcd, 0xb7, 0xb8, 0x6d, 0x49, 0xce, + 0xb2, 0xa5, 0xd0, 0x69, 0xba, 0xbf, 0xc1, 0x3b, 0x57, 0x8b, 0xcc, 0xc4, + 0xa0, 0x69, 0xa3, 0x45, 0x35, 0xaa, 0x51, 0x45, 0xa0, 0x71, 0xa3, 0x5c, + 0x41, 0x39, 0x6e, 0xbe, 0xc0, 0x96, 0x9c, 0x9a, 0xa8, 0x8d, 0x70, 0x92, + 0x53, 0x4d, 0x8f, 0x52, 0x61, 0x4e, 0xac, 0xcc, 0x30, 0x90, 0x6c, 0x4a, + 0x40, 0x82, 0xc7, 0xc1, 0x47, 0xb9, 0x96, 0x5a, 0xd6, 0xa3, 0xad, 0xaf, + 0x81, 0x3d, 0x4c, 0x50, 0x78, 0x6c, 0x80, 0x87, 0x81, 0xc7, 0xc1, 0x3d, + 0x3e, 0x8d, 0x4e, 0x76, 0x7f, 0xa1, 0x6e, 0xae, 0x9e, 0x4a, 0x38, 0x6d, + 0x67, 0x7d, 0x79, 0x30, 0x80, 0x57, 0x6d, 0xc5, 0x65, 0x87, 0xa5, 0x79, + 0x90, 0x8d, 0x7d, 0x84, 0x48, 0x9a, 0x4b, 0xb5, 0xa0, 0x8b, 0x78, 0x7a, + 0xd2, 0x3f, 0x60, 0x97, 0x4e, 0xe0, 0x68, 0x3c, 0x88, 0xd0, 0xb6, 0xc2, + 0x9a, 0x75, 0xd0, 0xca, 0x36, 0x85, 0x80, 0x88, 0x97, 0x45, 0x4a, 0x94, + 0x77, 0x57, 0x53, 0xa9, 0x79, 0x6b, 0xc1, 0x4c, 0x5a, 0x3d, 0x5f, 0x5a, + 0xba, 0xbd, 0x58, 0xd4, 0xb4, 0x46, 0xcd, 0xbf, 0x7b, 0x6b, 0x45, 0x3c, + 0xae, 0x5b, 0xc9, 0x81, 0x87, 0xac, 0x62, 0x97, 0x8c, 0x59, 0x83, 0x6e, + 0x69, 0x68, 0x7e, 0xc9, 0x88, 0x80, 0xc8, 0x6f, 0xb8, 0x77, 0x5e, 0x78, + 0x99, 0xa3, 0x6d, 0xc1, 0x90, 0x3a, 0x4a, 0x5a, 0x3c, 0x9b, 0x25, 0x74, + 0x68, 0x3d, 0x66, 0x79, 0x8f, 0xca, 0x70, 0xd7, 0x8a, 0x43, 0x43, 0xc1, + 0xca, 0x88, 0xcb, 0x6f, 0x98, 0xc6, 0xc5, 0x54, 0x7b, 0x87, 0x46, 0x8c, + 0x86, 0x78, 0x95, 0xba, 0x71, 0x45, 0x4f, 0x8a, 0x6c, 0x6b, 0x58, 0x98, + 0x8f, 0x3e, 0x8b, 0x9b, 0x68, 0x39, 0x6d, 0xb4, 0x98, 0x38, 0x3e, 0x2c, + 0x38, 0xa1, 0x38, 0x65, 0xb6, 0xc0, 0x47, 0xa5, 0x62, 0x6f, 0xd0, 0x62, + 0xc7, 0xd3, 0x8b, 0x90, 0x8c, 0x2f, 0x51, 0x40, 0x3b, 0xa8, 0x6a, 0x7b, + 0x8e, 0x4e, 0x7d, 0x69, 0x95, 0x4e, 0x60, 0x62, 0xab, 0x8f, 0x8c, 0x7e, + 0xa3, 0x36, 0x83, 0xc5, 0x52, 0xd5, 0x65, 0xba, 0x9b, 0xa0, 0xd4, 0x82, + 0x8b, 0x45, 0x6f, 0xa7, 0xa4, 0x9f, 0x42, 0x94, 0x48, 0xb3, 0x97, 0xc1, + 0x9d, 0xa8, 0x4e, 0x33, 0xa5, 0x82, 0xb0, 0x5f, 0x8d, 0xa3, 0x3a, 0x5e, + 
0x82, 0x3e, 0xaa, 0x47, 0xcd, 0x80, 0xbc, 0x99, 0x66, 0xbd, 0x70, 0xba, + 0xd0, 0x8a, 0x84, 0xc3, 0x36, 0x9f, 0x6a, 0xb4, 0x9d, 0x73, 0xb3, 0x73, + 0x67, 0x26, 0x33, 0x8b, 0xb4, 0xb4, 0xb2, 0x4d, 0x96, 0x77, 0x38, 0x6a, + 0x9d, 0x4a, 0x5e, 0x86, 0x49, 0xaa, 0x42, 0x45, 0x58, 0x57, 0xcb, 0x4e, + 0xc2, 0x6a, 0x68, 0xaf, 0x3a, 0x6d, 0xde, 0x74, 0x9f, 0x75, 0x73, 0x62, + 0xbc, 0x65, 0x5d, 0x9a, 0x8f, 0x38, 0x98, 0x3a, 0x6d, 0x6a, 0x34, 0xd8, + 0xc0, 0xa4, 0x64, 0x7d, 0x33, 0xb9, 0x9b, 0x7d, 0x8e, 0x7c, 0x8c, 0xcf, + 0xaa, 0x3f, 0xba, 0x31, 0xc2, 0xc1, 0x90, 0xbe, 0xab, 0xad, 0xa4, 0x4b, + 0x99, 0x88, 0x79, 0xb3, 0x7e, 0xbe, 0xac, 0x39, 0x54, 0x6d, 0x67, 0x97, + 0xc4, 0x7d, 0x92, 0x5e, 0x63, 0x82, 0x9c, 0x44, 0x3a, 0x60, 0x72, 0xb7, + 0x80, 0xb7, 0x64, 0xa1, 0x86, 0x90, 0xb7, 0x4d, 0xb6, 0x50, 0xb9, 0x43, + 0x39, 0xb6, 0xd4, 0x92, 0x73, 0x3c, 0x5c, 0x4c, 0x58, 0x73, 0x6f, 0x5b, + 0x57, 0xd6, 0xc8, 0x5c, 0x42, 0x8a, 0xce, 0x84, 0xb4, 0x85, 0x7b, 0xc9, + 0xb7, 0x6c, 0xae, 0x3e, 0x3a, 0x62, 0xa5, 0x74, 0x74, 0x98, 0x70, 0x7c, + 0x95, 0x71, 0x49, 0x52, 0x31, 0xb7, 0x6f, 0x49, 0xcb, 0x52, 0xb2, 0x4b, + 0x75, 0x37, 0x93, 0x72, 0x6e, 0xb5, 0xd3, 0x9f, 0x6f, 0x85, 0x64, 0x87, + 0xc8, 0xac, 0xc0, 0x7a, 0xc7, 0x87, 0x38, 0xb8, 0x70, 0x36, 0x38, 0x64, + 0x99, 0x4f, 0xab, 0x9e, 0xa6, 0x9a, 0x8f, 0x79, 0x95, 0x6c, 0x30, 0x4e, + 0x56, 0x6d, 0x73, 0x6e, 0x94, 0x5a, 0xb8, 0xbd, 0x60, 0xbb, 0x47, 0xc9, + 0xc3, 0x9d, 0xa8, 0x64, 0x85, 0x6e, 0xb7, 0x8d, 0x5e, 0xa6, 0x74, 0x9c, + 0xc8, 0x4d, 0x39, 0x45, 0x84, 0x26, 0x59, 0x64, 0x92, 0x91, 0xbe, 0xab, + 0xad, 0x7c, 0xd3, 0x87, 0x43, 0x87, 0xa9, 0xbb, 0xc5, 0xc9, 0x53, 0x7a, + 0xae, 0x5b, 0x67, 0x8e, 0x49, 0x4c, 0x92, 0x43, 0x8b, 0x9c, 0xa3, 0x47, + 0x81, 0x59, 0x4a, 0x8a, 0x66, 0xb5, 0x72, 0x81, 0x3a, 0xc7, 0x9c, 0xb2, + 0xc8, 0x7e, 0x98, 0x7e, 0x61, 0xb3, 0x50, 0x40, 0x40, 0x64, 0x51, 0x64, + 0x8b, 0xc4, 0x8e, 0x90, 0xc9, 0x55, 0x64, 0xa9, 0x94, 0x3e, 0x5a, 0x63, + 0x72, 0x3a, 0x61, 0x85, 0xb5, 0x4b, 0xb5, 0x84, 0xc4, 0x38, 0x43, 0x37, + 0xb0, 0x3a, 0xd5, 0x4d, 0xc3, 0x68, 0xcf, 0x91, 0x7a, 0x97, 0x90, 0x2a, + 0xc6, 0xc3, 0xc0, 0x76, 0x64, 0x55, 0xd7, 0x4c, 0xb1, 0xb0, 0xd7, 0x41, + 0xb6, 0x36, 0x41, 0xae, 0x68, 0x70, 0x7f, 0x69, 0x53, 0xc8, 0xc2, 0xbe, + 0xac, 0x6e, 0x54, 0xc2, 0x54, 0x71, 0x71, 0x4b, 0x8b, 0xac, 0x7d, 0xc3, + 0xa1, 0x36, 0xc3, 0x5f, 0xa8, 0x9b, 0x56, 0x98, 0xc1, 0xa7, 0xc5, 0xa0, + 0x8c, 0x86, 0x7b, 0x6f, 0x51, 0x6b, 0xa9, 0xb2, 0x75, 0xc0, 0xa7, 0xa4, + 0x97, 0x32, 0x8b, 0x85, 0xa4, 0xd0, 0x40, 0xcc, 0xbf, 0x8d, 0xb9, 0x3d, + 0x62, 0x58, 0x95, 0x65, 0x64, 0x71, 0x50, 0x43, 0x6c, 0x95, 0x65, 0xc7, + 0x52, 0x32, 0xb2, 0x4d, 0x6c, 0x52, 0xca, 0x3b, 0xb6, 0x41, 0x37, 0xc4, + 0x7c, 0x8b, 0x64, 0x92, 0xae, 0x7d, 0xb0, 0xd7, 0x8f, 0x2b, 0xac, 0x53, + 0xbd, 0x64, 0xc0, 0xa9, 0x89, 0x78, 0x3f, 0x59, 0xa4, 0xd4, 0x8b, 0x59, + 0xbc, 0x4d, 0x9d, 0x7a, 0x3d, 0x4a, 0xa5, 0x35, 0x54, 0xa4, 0x9c, 0xa7, + 0x6b, 0x5f, 0x9c, 0x6b, 0x4b, 0x56, 0x37, 0x7b, 0xa2, 0x4f, 0xaf, 0x87, + 0x5e, 0x8d, 0x97, 0xc1, 0x95, 0xaf, 0xab, 0x2c, 0xb2, 0xc1, 0x72, 0x9b, + 0x99, 0xac, 0xad, 0x98, 0x8e, 0x85, 0xaf, 0x68, 0x3f, 0x7b, 0x71, 0xd9, + 0xce, 0xcf, 0x5b, 0xb7, 0x62, 0xb9, 0xc7, 0xa8, 0xa6, 0x62, 0x4d, 0x45, + 0x73, 0x4a, 0x79, 0x46, 0xa4, 0x4e, 0x6f, 0xc9, 0xbf, 0x7d, 0x9c, 0x2e, + 0x85, 0x77, 0x5e, 0x77, 0xb5, 0x88, 0x37, 0xc6, 0x4c, 0x86, 0x6c, 0xa5, + 0xa3, 0x7f, 0x99, 0x9e, 0xbf, 0x57, 0x76, 0x98, 0x39, 0xba, 0x7e, 0xa1, + 0xc0, 0x39, 0x7f, 0xa9, 0x59, 0x56, 0x63, 0x63, 0xcd, 0xc0, 0x8a, 0x8f, + 0x8b, 0x97, 0xc9, 0x39, 0x3b, 0xc8, 0xa6, 0x33, 0x7e, 0x9a, 0x5e, 0x8a, + 
0x3c, 0x6f, 0x52, 0x6e, 0x60, 0x57, 0x34, 0xb9, 0x4a, 0xa9, 0xbc, 0xac, + 0xa1, 0x84, 0xa6, 0x63, 0x7d, 0x45, 0x8f, 0x9d, 0xa8, 0xa2, 0xc9, 0xd2, + 0x6c, 0x4a, 0x7d, 0x77, 0x38, 0x84, 0x50, 0xb4, 0xa6, 0x81, 0x96, 0x77, + 0xcd, 0x7c, 0x95, 0xa7, 0x77, 0x66, 0x9f, 0x7a, 0x9d, 0x59, 0x8b, 0x93, + 0x5a, 0x41, 0x96, 0xa6, 0x6f, 0xd2, 0xad, 0x35, 0x2e, 0x46, 0xcb, 0x5c, + 0x70, 0x8d, 0xcb, 0x53, 0xa7, 0xb2, 0xc1, 0xbb, 0xae, 0x28, 0xb0, 0x46, + 0x38, 0x9c, 0x6e, 0x9b, 0x8e, 0x3d, 0x6a, 0x4b, 0xc8, 0x9d, 0x82, 0xc2, + 0x5a, 0x74, 0x86, 0x49, 0x48, 0x6d, 0x6c, 0xe3, 0xcb, 0xd5, 0x50, 0xe4, + 0xa0, 0x54, 0x43, 0x4e, 0x61, 0x57, 0xc1, 0x74, 0x42, 0x71, 0xb8, 0x2f, + 0xbc, 0x77, 0x9b, 0xb8, 0x65, 0xa7, 0xa1, 0x8a, 0x73, 0xaa, 0xb2, 0xe0, + 0x8e, 0x38, 0x6e, 0x9d, 0xc7, 0xc3, 0x58, 0xc3, 0xd7, 0x9e, 0x3b, 0x8c, + 0xb4, 0x7d, 0x2f, 0x6b, 0x8c, 0xaa, 0xca, 0xbb, 0x90, 0x40, 0xa6, 0xa4, + 0x5e, 0xbd, 0xaf, 0xc9, 0xc1, 0x63, 0xb6, 0x73, 0xa6, 0x7e, 0xa9, 0x91, + 0x3e, 0x39, 0xcb, 0xa3, 0x8f, 0x71, 0xc9, 0xc2, 0xc3, 0xcf, 0x75, 0xe0, + 0x9f, 0x3d, 0xa5, 0x52, 0x8b, 0xd5, 0xa9, 0x4d, 0xc3, 0xd2, 0x35, 0xc4, + 0x75, 0xa1, 0x84, 0x5d, 0x72, 0xcc, 0x8a, 0x39, 0x5b, 0x6d, 0x74, 0x76, + 0x85, 0x50, 0x6d, 0x3c, 0x7c, 0x50, 0x71, 0x47, 0xa9, 0x42, 0x75, 0x90, + 0x61, 0xa6, 0x5c, 0x5d, 0x79, 0x74, 0x5c, 0x5a, 0xb4, 0x56, 0x8f, 0x3a, + 0x8b, 0x64, 0xda, 0xcc, 0x3d, 0x95, 0xbc, 0x7c, 0xdb, 0x8c, 0xa0, 0x90, + 0x8f, 0x3c, 0x41, 0xbb, 0x7c, 0x5b, 0x87, 0x6c, 0x9f, 0xc4, 0xca, 0xd7, + 0xda, 0x63, 0x50, 0xc8, 0x4e, 0x9d, 0x67, 0x96, 0xb4, 0xd6, 0xbd, 0xca, + 0xaa, 0x87, 0x49, 0x56, 0x32, 0x6c, 0x83, 0x60, 0x6c, 0x40, 0x81, 0x80, + 0x31, 0x76, 0x3b, 0xa8, 0x35, 0x37, 0x48, 0x65, 0xc2, 0x41, 0xcc, 0xa1, + 0x66, 0x6c, 0x6a, 0x28, 0xbe, 0x2f, 0x3c, 0x74, 0x72, 0xa2, 0x46, 0x67, + 0xb8, 0xa5, 0x5d, 0xc7, 0x7b, 0x7f, 0x64, 0x3b, 0x76, 0xc2, 0x6f, 0x39, + 0x74, 0xa9, 0x57, 0x44, 0x6e, 0x84, 0xdd, 0x7f, 0x6d, 0x9a, 0x53, 0x84, + 0x4f, 0xdc, 0x48, 0xcc, 0x36, 0x80, 0x67, 0x6d, 0x8f, 0xa5, 0x87, 0xb3, + 0xcc, 0x32, 0x47, 0xb8, 0xad, 0x5b, 0x80, 0xc3, 0xc0, 0x6a, 0x6e, 0xbe, + 0x65, 0x50, 0x69, 0xc6, 0x75, 0x86, 0xa0, 0xa7, 0xa6, 0x45, 0x8d, 0x94, + 0xac, 0xaa, 0x41, 0xc6, 0x37, 0x48, 0x49, 0xa3, 0xcd, 0x5b, 0x86, 0x7c, + 0x8f, 0x4f, 0x5a, 0x39, 0x6a, 0x50, 0x5b, 0xa4, 0x35, 0xb5, 0xc9, 0xa9, + 0x41, 0x49, 0xab, 0x8a, 0x9a, 0x49, 0x80, 0xcf, 0xb7, 0xc6, 0xb9, 0x4a, + 0xa2, 0x8a, 0x5a, 0x41, 0xa8, 0x89, 0x9e, 0x8a, 0x68, 0x90, 0x62, 0x66, + 0x78, 0x68, 0x98, 0x66, 0x6f, 0x94, 0xcc, 0xdf, 0x8d, 0x7a, 0x40, 0x3c, + 0x5a, 0x3f, 0xa7, 0xac, 0xc0, 0xcb, 0x40, 0xb4, 0x8d, 0x32, 0xab, 0x7a, + 0x53, 0xcf, 0xd5, 0x8c, 0x3c, 0xa3, 0x64, 0x9c, 0xc9, 0xd5, 0x72, 0x3e, + 0xc9, 0x8b, 0x70, 0xa0, 0x57, 0xa5, 0x8c, 0xb6, 0xa3, 0xbc, 0x8c, 0x44, + 0x5a, 0x83, 0x35, 0x39, 0xaf, 0xc7, 0x51, 0xc9, 0x94, 0x72, 0xa5, 0x49, + 0x71, 0xd1, 0x50, 0xba, 0xac, 0x32, 0x79, 0x96, 0x67, 0x55, 0x7a, 0x59, + 0x49, 0xcc, 0xa6, 0x38, 0x83, 0x6f, 0xad, 0xac, 0xb2, 0x70, 0x8d, 0x90, + 0xb1, 0x7f, 0xdc, 0xcc, 0xcd, 0x7a, 0x5f, 0x79, 0x99, 0x88, 0x65, 0xce, + 0x6e, 0x7e, 0xb6, 0xa7, 0xa7, 0xc6, 0xc4, 0xc3, 0x62, 0x2f, 0xbb, 0xc2, + 0xcb, 0x8b, 0xa8, 0xbf, 0x75, 0x3f, 0x3f, 0x51, 0xc0, 0xc6, 0x75, 0xbf, + 0xb1, 0xad, 0x43, 0x3e, 0x95, 0x9e, 0x7e, 0x9c, 0xec, 0xa9, 0x77, 0x68, + 0xbd, 0xaf, 0xce, 0xb2, 0xba, 0x68, 0x91, 0xb4, 0x97, 0x94, 0x6e, 0x3d, + 0xcf, 0x3f, 0x59, 0x78, 0x66, 0x47, 0x4f, 0xa2, 0x3c, 0xbd, 0xc3, 0xb4, + 0xbc, 0x45, 0xc2, 0xd4, 0xcd, 0x3d, 0x98, 0x86, 0xa2, 0x43, 0x52, 0xb3, + 0x96, 0x7f, 0x9d, 0x62, 0x64, 0x99, 0x42, 0xae, 0xcc, 0x9c, 0x91, 0x48, + 
0x99, 0xcc, 0xb0, 0x5e, 0x84, 0xac, 0x67, 0x3d, 0x50, 0x65, 0xca, 0xc1, + 0xb9, 0x8c, 0x9b, 0x5c, 0xbb, 0x8b, 0x6f, 0xc8, 0x6d, 0xd7, 0xa4, 0x2a, + 0xab, 0xa1, 0x8c, 0x47, 0x90, 0xa1, 0x33, 0x60, 0xa4, 0x7f, 0x7e, 0xb7, + 0x9e, 0xbb, 0x73, 0x3e, 0x34, 0x3c, 0x30, 0x49, 0x8c, 0x85, 0xd5, 0x5f, + 0x9a, 0x38, 0x88, 0xd8, 0x80, 0x38, 0xba, 0x81, 0xa5, 0x31, 0x4e, 0x90, + 0x55, 0x91, 0xa0, 0x4a, 0x6a, 0xa8, 0x94, 0x48, 0xa7, 0xc1, 0x86, 0x76, + 0x92, 0xac, 0x40, 0xc0, 0x53, 0x4e, 0x89, 0x77, 0x4f, 0x8d, 0xde, 0xa8, + 0x71, 0xca, 0x87, 0xa3, 0x53, 0x74, 0x3e, 0xad, 0x99, 0xa4, 0x34, 0xba, + 0x58, 0x31, 0x5a, 0xc0, 0xd0, 0x77, 0xa8, 0x96, 0x79, 0x7c, 0x65, 0xa6, + 0x5f, 0x29, 0xbb, 0x74, 0xb5, 0x3d, 0x57, 0xca, 0x55, 0x77, 0x6c, 0x57, + 0x48, 0xac, 0x34, 0x98, 0x4f, 0xb3, 0x84, 0x47, 0x59, 0xcb, 0xdd, 0x92, + 0xb4, 0x68, 0x64, 0x78, 0xb6, 0x46, 0x3d, 0x83, 0x81, 0x9d, 0x43, 0x63, + 0xdf, 0x38, 0xe0, 0x7f, 0x7d, 0x7d, 0x45, 0xd4, 0xa2, 0xa7, 0x60, 0x8c, + 0xbd, 0xdc, 0xd4, 0x3a, 0x79, 0x47, 0xb7, 0x5a, 0xbf, 0x56, 0x72, 0xb5, + 0xce, 0x60, 0x9d, 0x78, 0xae, 0xa4, 0x9f, 0xe1, 0x61, 0x7a, 0x7f, 0x8c, + 0x57, 0xb3, 0x96, 0x5b, 0x36, 0x6c, 0x5c, 0x9d, 0xb2, 0x71, 0xc7, 0x2f, + 0x32, 0xa4, 0x93, 0xa3, 0xc3, 0x7c, 0x90, 0x73, 0x39, 0x82, 0x5d, 0xab, + 0x46, 0xc6, 0xc1, 0x8f, 0xda, 0x56, 0xa4, 0xa9, 0x41, 0x37, 0x37, 0x47, + 0x57, 0x5c, 0x6c, 0xa1, 0x8c, 0x52, 0x90, 0x94, 0x9d, 0xa0, 0x8b, 0x77, + 0x49, 0x80, 0xac, 0x90, 0xc4, 0x74, 0x38, 0xb5, 0xc8, 0x70, 0x7e, 0xa5, + 0x8b, 0x4b, 0x89, 0x96, 0x6a, 0x4e, 0x68, 0x56, 0x9e, 0x4f, 0xcd, 0x57, + 0x41, 0x93, 0x98, 0x4b, 0x86, 0xbd, 0xb0, 0xb8, 0x73, 0xc6, 0xbc, 0xad, + 0x71, 0x9a, 0xcb, 0x6b, 0x34, 0xb8, 0x71, 0x54, 0x93, 0xc1, 0x8f, 0xc4, + 0xb5, 0xbd, 0x9d, 0xce, 0x51, 0x95, 0x70, 0x4e, 0x4b, 0xc9, 0x83, 0xb7, + 0x3f, 0xa6, 0x46, 0xa1, 0x78, 0x3b, 0xaf, 0x78, 0x80, 0x8f, 0x39, 0xb0, + 0xc6, 0xc4, 0x33, 0xbf, 0x63, 0xc9, 0x53, 0xbb, 0xc8, 0x3a, 0x23, 0xa9, + 0xba, 0x92, 0xd2, 0xce, 0x42, 0x82, 0xd6, 0x91, 0x93, 0x96, 0x2e, 0x82, + 0x7f, 0x90, 0x77, 0x31, 0x4b, 0xa9, 0x65, 0xc1, 0x5c, 0x73, 0xca, 0x9a, + 0xb0, 0xc4, 0xa2, 0xbe, 0x83, 0x59, 0x5c, 0x9e, 0xb9, 0x59, 0xb4, 0x48, + 0xc5, 0xc4, 0xa1, 0xa2, 0x59, 0x3e, 0x66, 0xb9, 0x68, 0x33, 0x60, 0x3d, + 0x64, 0xb8, 0x6e, 0x80, 0x41, 0xd9, 0xbf, 0x8e, 0x33, 0x85, 0x64, 0x32, + 0xa1, 0xb2, 0x79, 0x92, 0x9a, 0x3b, 0x6e, 0x8b, 0xa1, 0x60, 0xb1, 0x36, + 0x82, 0x9c, 0x5e, 0x6b, 0x89, 0xa3, 0x9f, 0xb5, 0xb6, 0x71, 0x9c, 0x95, + 0x95, 0x69, 0xc0, 0x56, 0x3e, 0x32, 0xb2, 0xd1, 0x42, 0xa8, 0xa2, 0xc9, + 0x8d, 0x7d, 0xb2, 0x4e, 0x9f, 0x6e, 0x61, 0x40, 0x9d, 0x4d, 0x59, 0x81, + 0x95, 0x82, 0x50, 0x8e, 0xbe, 0xa0, 0x4d, 0x6a, 0xad, 0xb1, 0x3d, 0x54, + 0x89, 0x3c, 0x5a, 0x66, 0x82, 0xad, 0x42, 0xbb, 0x6f, 0x47, 0x68, 0xac, + 0x63, 0xae, 0x97, 0x60, 0x88, 0xb0, 0x3a, 0x60, 0x67, 0x2f, 0xcc, 0x73, + 0x94, 0xad, 0xa3, 0x3b, 0xa0, 0xa8, 0x36, 0x89, 0x6c, 0x95, 0x8e, 0x55, + 0x96, 0xc3, 0x6d, 0x7d, 0x5d, 0x5d, 0x76, 0x70, 0xa5, 0x70, 0x54, 0x80, + 0xc0, 0xb5, 0x90, 0xc1, 0x62, 0x8e, 0x6d, 0x50, 0x80, 0x3c, 0x97, 0x4b, + 0x9a, 0x58, 0x80, 0x56, 0xa9, 0x64, 0xce, 0xa8, 0x64, 0x60, 0xba, 0x8d, + 0x74, 0xd0, 0x4d, 0x90, 0xd1, 0x88, 0xac, 0xba, 0x43, 0xa0, 0x53, 0x8a, + 0x4d, 0x99, 0xc4, 0xc0, 0x34, 0x8e, 0x82, 0x4c, 0x52, 0x2a, 0xb2, 0x5d, + 0xd4, 0x74, 0x7c, 0x70, 0x37, 0x94, 0x64, 0xab, 0xb8, 0x8b, 0x3b, 0x34, + 0xaf, 0xc0, 0xa4, 0x85, 0xd1, 0xd2, 0xa5, 0x9c, 0xdb, 0x4f, 0x74, 0x5c, + 0x95, 0xa1, 0xd2, 0xb0, 0x38, 0x55, 0x54, 0x70, 0x4c, 0xcf, 0x49, 0xc4, + 0x6b, 0xa0, 0x3d, 0xa2, 0x7c, 0x3a, 0x72, 0x7a, 0xd6, 0x74, 0xb8, 0x7f, + 
0xae, 0xa5, 0xc1, 0x3c, 0xc0, 0x91, 0x51, 0x67, 0x66, 0x5d, 0x7d, 0x3e, + 0xc2, 0x9f, 0x7f, 0x8c, 0xcb, 0x6a, 0x9a, 0x42, 0xac, 0x68, 0x7f, 0x53, + 0x49, 0xb5, 0x59, 0xbb, 0x4c, 0x63, 0xb6, 0x59, 0x7a, 0xb2, 0x8b, 0x81, + 0x35, 0xa7, 0x84, 0x85, 0x28, 0x7f, 0x70, 0x76, 0x59, 0x97, 0x5e, 0xc6, + 0x80, 0x6d, 0x8d, 0x6c, 0x3a, 0x5e, 0x83, 0x8a, 0x7e, 0xc1, 0x5f, 0x96, + 0x4c, 0xca, 0x72, 0x5b, 0xc9, 0xc2, 0x4b, 0x7d, 0x62, 0xbe, 0x8e, 0x70, + 0x4a, 0x8b, 0xa5, 0x3a, 0x8d, 0x5c, 0x42, 0x86, 0xa2, 0x43, 0xa9, 0xba, + 0x57, 0x39, 0xad, 0x7e, 0xb4, 0xce, 0xc5, 0xad, 0x60, 0xbd, 0x6b, 0x76, + 0x73, 0xc4, 0x79, 0xb8, 0x4c, 0x5c, 0x4d, 0xa4, 0x2d, 0xaa, 0x6c, 0xc1, + 0x52, 0x5a, 0x4b, 0x98, 0x46, 0x98, 0xa7, 0xce, 0xc8, 0xa8, 0xbd, 0x63, + 0xa9, 0x5d, 0x2c, 0xc8, 0xd3, 0xc4, 0x7d, 0x84, 0xc7, 0x77, 0xcb, 0x43, + 0x5e, 0x40, 0x65, 0xb8, 0x4c, 0x70, 0x41, 0x89, 0x42, 0x9e, 0x8c, 0x40, + 0x53, 0x9e, 0xc2, 0xd1, 0x39, 0x3c, 0x34, 0xab, 0x72, 0x85, 0xc7, 0xc1, + 0xbf, 0xbc, 0xb3, 0xcc, 0x39, 0x94, 0x77, 0x79, 0x38, 0xa0, 0x53, 0xbe, + 0x5b, 0x5a, 0xd2, 0xc8, 0xbf, 0xb6, 0xb5, 0x60, 0x6d, 0xd4, 0x7d, 0x95, + 0x9d, 0xab, 0x28, 0x97, 0x74, 0x4b, 0xc0, 0x4b, 0x44, 0xc7, 0x7d, 0x4d, + 0x6b, 0x8d, 0x8c, 0x44, 0x9f, 0xb5, 0x69, 0x54, 0x37, 0x36, 0x6d, 0xd3, + 0x73, 0x9c, 0x54, 0x92, 0xaa, 0x93, 0xa9, 0x93, 0x60, 0x99, 0x4c, 0x60, + 0x8d, 0x56, 0x5f, 0x80, 0xb8, 0xbb, 0xa0, 0x93, 0x6b, 0x76, 0x71, 0xc7, + 0x8d, 0x48, 0x80, 0x3e, 0x97, 0xc5, 0xca, 0x6e, 0x4a, 0x61, 0x32, 0x7d, + 0x50, 0xcd, 0xb6, 0x30, 0x4b, 0xa9, 0x3d, 0x98, 0x4a, 0x2f, 0xc7, 0x70, + 0xcb, 0x7d, 0xd5, 0x68, 0x68, 0x7a, 0xc6, 0x38, 0xd8, 0xc6, 0x98, 0x81, + 0x89, 0x8a, 0x90, 0x89, 0x63, 0x44, 0xac, 0xb3, 0x63, 0x47, 0x5e, 0x97, + 0x64, 0x2d, 0x9a, 0xb4, 0xa2, 0x4f, 0xad, 0xb3, 0x82, 0xaa, 0x41, 0xc1, + 0x9a, 0xbc, 0x4d, 0x47, 0x95, 0x48, 0x56, 0xb7, 0x52, 0xa0, 0x71, 0x98, + 0x93, 0x44, 0x58, 0x66, 0x5f, 0xb9, 0xd4, 0xcc, 0x8f, 0x57, 0x46, 0x75, + 0xc9, 0x67, 0x3c, 0x98, 0x5d, 0xbe, 0x3b, 0x6b, 0xa2, 0x70, 0xd5, 0x97, + 0x69, 0x92, 0x5f, 0x4a, 0xb7, 0x68, 0xb6, 0x66, 0x73, 0xd3, 0x6a, 0x52, + 0x83, 0x4b, 0xa3, 0x4f, 0xc0, 0x7c, 0x70, 0xbc, 0x85, 0xb2, 0xa7, 0xb5, + 0x78, 0x75, 0x35, 0x80, 0x44, 0x6e, 0xae, 0x38, 0xc4, 0x59, 0xc4, 0x79, + 0x82, 0x36, 0xcb, 0x5a, 0x9f, 0x6e, 0x99, 0x50, 0x59, 0x52, 0x33, 0x48, + 0x7a, 0xa9, 0x88, 0xbb, 0xaa, 0xbc, 0x7c, 0x7e, 0x9b, 0x19, 0x45, 0x52, + 0xbd, 0x62, 0x65, 0x74, 0xa4, 0x45, 0x8c, 0x60, 0x5a, 0x79, 0x8f, 0x30, + 0x30, 0x40, 0xd9, 0x48, 0x65, 0x4e, 0xa9, 0x59, 0x6c, 0x6d, 0x39, 0x46, + 0x61, 0x6f, 0x6f, 0xb9, 0x53, 0xc1, 0xcc, 0x5a, 0x4f, 0x52, 0x50, 0x74, + 0x93, 0x2f, 0x32, 0xb4, 0x31, 0xc7, 0x92, 0xce, 0x69, 0x4e, 0xb1, 0x84, + 0x91, 0xb0, 0xc0, 0x69, 0x7e, 0x4d, 0x60, 0x3f, 0x89, 0x9d, 0x4b, 0x40, + 0x3f, 0xbd, 0x8f, 0x99, 0xb8, 0xd6, 0x83, 0x3d, 0x71, 0xa3, 0xc5, 0x3b, + 0x4a, 0xc0, 0x7d, 0x7c, 0x40, 0x35, 0xb5, 0x57, 0x50, 0x73, 0x5b, 0x5f, + 0x73, 0xc9, 0x96, 0xcf, 0x66, 0xbd, 0x37, 0x8f, 0x36, 0x6e, 0xd0, 0x5a, + 0x80, 0x40, 0x92, 0x7f, 0x67, 0x32, 0x97, 0x8b, 0x38, 0x46, 0x85, 0xce, + 0xba, 0x75, 0x5f, 0xbd, 0x5a, 0x5c, 0xc4, 0x6e, 0x41, 0x9f, 0x46, 0xc6, + 0xcb, 0x8c, 0x5c, 0x98, 0x99, 0x4e, 0x31, 0x5b, 0x9e, 0x37, 0xb7, 0x66, + 0xad, 0x9c, 0x85, 0x8d, 0x32, 0x54, 0xa7, 0xb0, 0x3d, 0x81, 0x69, 0x56, + 0xd1, 0x57, 0x67, 0x64, 0x4e, 0xaf, 0xcd, 0xa8, 0x99, 0xd9, 0x83, 0x7c, + 0xae, 0x4c, 0x61, 0x6f, 0x5e, 0x50, 0xa5, 0x5a, 0x95, 0x76, 0x75, 0x37, + 0x77, 0xaf, 0x30, 0x77, 0x54, 0xc7, 0xc6, 0xad, 0xac, 0x91, 0x7d, 0xa6, + 0x4c, 0xba, 0x49, 0x35, 0x7c, 0x60, 0xbb, 0x46, 0x61, 0x8e, 0xb1, 0x89, + 
0x7f, 0x7f, 0x67, 0x9d, 0x71, 0xc0, 0xba, 0x9c, 0x50, 0x9a, 0x7c, 0xc3, + 0xcd, 0x92, 0x3b, 0x44, 0x99, 0xb1, 0xab, 0x75, 0xce, 0xab, 0x31, 0x69, + 0x8e, 0x52, 0x69, 0xb6, 0xa7, 0x75, 0xab, 0x3e, 0x49, 0xac, 0x78, 0x6d, + 0xcd, 0x4d, 0xb5, 0xac, 0x70, 0xc0, 0x55, 0x79, 0xc9, 0x7a, 0xbc, 0xc1, + 0x70, 0x42, 0x5f, 0xd7, 0x55, 0x5c, 0xa0, 0xad, 0x50, 0x7a, 0x68, 0x84, + 0x8b, 0x39, 0x68, 0x89, 0x99, 0x7c, 0x89, 0x75, 0x52, 0xa6, 0xd0, 0xa4, + 0x93, 0xcc, 0x57, 0x92, 0x79, 0xa7, 0x43, 0x52, 0xb3, 0xb3, 0x3f, 0xcf, + 0x3a, 0xac, 0x35, 0x44, 0x93, 0xab, 0xaa, 0xbc, 0x63, 0x4b, 0xc4, 0x88, + 0xaf, 0x5a, 0x65, 0x5c, 0x84, 0x66, 0x3a, 0x99, 0x69, 0xc4, 0x42, 0xb2, + 0x8b, 0x9c, 0xd6, 0x90, 0xbe, 0xa9, 0x54, 0x7b, 0xb5, 0x92, 0x7e, 0x67, + 0x34, 0x86, 0x70, 0x53, 0xc8, 0xa1, 0x38, 0x3f, 0x52, 0x47, 0x54, 0x59, + 0xcc, 0xc2, 0x39, 0x6d, 0x70, 0xc3, 0x96, 0xbc, 0x66, 0x51, 0x42, 0x6c, + 0x6f, 0x8f, 0x4a, 0x96, 0x7a, 0x42, 0xb0, 0xca, 0xb2, 0xbd, 0xc7, 0xc6, + 0x74, 0x8b, 0x5e, 0xaa, 0x50, 0x70, 0xa3, 0xa3, 0x55, 0xa2, 0x9a, 0x86, + 0x42, 0x86, 0x78, 0xaf, 0x4a, 0x69, 0xa0, 0xb2, 0x79, 0x71, 0x66, 0x7f, + 0x7d, 0x7f, 0x33, 0x77, 0xb7, 0xad, 0x48, 0x3d, 0xd2, 0x5e, 0x5d, 0x8e, + 0x65, 0x2f, 0x5b, 0x82, 0xb9, 0x5d, 0x68, 0x8e, 0xa9, 0x77, 0xac, 0x8b, + 0xc6, 0x9f, 0x36, 0xb4, 0xae, 0x8c, 0xb8, 0xb3, 0x9f, 0x90, 0x55, 0x67, + 0x93, 0x6a, 0xa3, 0xbd, 0xbe, 0x98, 0x39, 0x49, 0x49, 0x6b, 0x63, 0xba, + 0x41, 0x85, 0x38, 0x6f, 0x78, 0x3e, 0x74, 0xb4, 0x3b, 0xae, 0xa6, 0x64, + 0x4a, 0x2f, 0x55, 0x63, 0x46, 0xd7, 0xa1, 0x5b, 0xb2, 0xa2, 0x8a, 0x47, + 0x8d, 0x94, 0x57, 0xc9, 0x84, 0xd5, 0x5e, 0x96, 0x48, 0x43, 0x53, 0xa4, + 0x4f, 0xaf, 0x60, 0x95, 0xb7, 0xd2, 0x93, 0x5f, 0x9f, 0xc6, 0x72, 0x5f, + 0xa5, 0x90, 0x95, 0x50, 0x48, 0x64, 0xb9, 0x35, 0x35, 0x76, 0x4f, 0x6e, + 0x54, 0xae, 0x3b, 0x30, 0xb2, 0x67, 0xca, 0x63, 0x6d, 0x40, 0x76, 0xbb, + 0x93, 0x9f, 0x5f, 0x84, 0x90, 0x8b, 0xac, 0x31, 0x5a, 0x6b, 0x47, 0x61, + 0x65, 0x68, 0x6a, 0x79, 0x9e, 0x71, 0x54, 0x50, 0x66, 0x3c, 0x4c, 0x65, + 0x84, 0x53, 0x2b, 0x42, 0x9f, 0x5f, 0x78, 0x71, 0x4d, 0x9f, 0x40, 0x78, + 0xbb, 0x8a, 0x96, 0x68, 0x73, 0x93, 0xa5, 0xcd, 0xc1, 0x76, 0x87, 0x43, + 0x5b, 0x42, 0xb4, 0x57, 0x78, 0x8d, 0xb9, 0x5b, 0x8b, 0x6a, 0x52, 0x3f, + 0xb1, 0xae, 0x4f, 0xd3, 0x63, 0x71, 0x46, 0x99, 0xa6, 0xc4, 0xc9, 0x5f, + 0x6b, 0x4d, 0x82, 0x9e, 0x68, 0xc3, 0x6d, 0x96, 0x66, 0xc3, 0x5f, 0xa9, + 0xcf, 0x83, 0xbd, 0xba, 0x9b, 0x7c, 0x61, 0x7e, 0x3c, 0x4b, 0x66, 0x3b, + 0x84, 0x96, 0xc1, 0x5c, 0x91, 0xd2, 0x7c, 0x56, 0xc1, 0x92, 0xd2, 0xb2, + 0x2e, 0xad, 0x7c, 0x3d, 0x9e, 0x61, 0xa7, 0xab, 0x66, 0x4d, 0x6c, 0x88, + 0xcc, 0x72, 0xaa, 0x56, 0x8a, 0xa7, 0x61, 0x84, 0x60, 0xd0, 0x43, 0xaf, + 0x40, 0xbc, 0xd2, 0xc9, 0x88, 0x3d, 0x72, 0x5c, 0xa8, 0xca, 0x9b, 0xc0, + 0x68, 0xbf, 0xb7, 0x94, 0x41, 0x7a, 0x78, 0xb8, 0xce, 0x5d, 0x58, 0x57, + 0x66, 0x93, 0xaa, 0x54, 0x63, 0x82, 0xdb, 0x7e, 0xa6, 0x94, 0x93, 0xbb, + 0x29, 0x2e, 0x33, 0x39, 0xa6, 0x97, 0xb8, 0x53, 0xc6, 0xbc, 0xa8, 0x4a, + 0xc1, 0x37, 0xc2, 0x36, 0x4f, 0x53, 0x80, 0xbe, 0x8d, 0x50, 0x7a, 0x7d, + 0xab, 0xb1, 0x6b, 0x90, 0xc1, 0xa2, 0x6a, 0x5a, 0x3c, 0x3e, 0x66, 0x8d, + 0x5e, 0x76, 0xb9, 0xce, 0xa7, 0xc2, 0x69, 0xa2, 0x37, 0xb8, 0xa1, 0x7a, + 0x99, 0x31, 0x98, 0x8b, 0x39, 0x57, 0x8b, 0xce, 0x8e, 0x40, 0x51, 0x71, + 0xb9, 0x95, 0x6d, 0x34, 0x6c, 0xc2, 0xc8, 0x3e, 0x5c, 0x9b, 0x46, 0x78, + 0xc3, 0x71, 0x89, 0x7e, 0xb8, 0xa0, 0x40, 0x35, 0xb3, 0xcb, 0x55, 0x90, + 0x4a, 0x45, 0x3c, 0x34, 0x4d, 0x64, 0xa9, 0x6d, 0x9d, 0xa0, 0x9d, 0x41, + 0x9c, 0x46, 0x4a, 0xce, 0x94, 0xb7, 0x8f, 0x5e, 0x52, 0xb5, 0x80, 0xc5, + 
0x98, 0x37, 0x3f, 0x5f, 0x48, 0xce, 0xc0, 0x91, 0xca, 0xba, 0x68, 0xa6, + 0x92, 0xa7, 0x30, 0x80, 0x98, 0x8a, 0x6f, 0xc7, 0xc6, 0x5c, 0x4a, 0x7c, + 0xb4, 0x99, 0xaa, 0x92, 0xb0, 0x61, 0x41, 0xc7, 0x8a, 0xb8, 0xab, 0x69, + 0x79, 0x5a, 0xa6, 0x9a, 0x67, 0x88, 0x49, 0xa0, 0x87, 0x84, 0x78, 0x76, + 0x73, 0x9f, 0xd8, 0xac, 0x9a, 0x4b, 0x8e, 0x6a, 0xcd, 0x42, 0x57, 0x97, + 0x73, 0x38, 0xc5, 0x4b, 0xcd, 0xa1, 0x98, 0x5c, 0x9a, 0x61, 0x8e, 0x66, + 0x4b, 0x34, 0x41, 0x66, 0x98, 0x82, 0x80, 0x5f, 0xa5, 0xba, 0x97, 0x56, + 0x78, 0x83, 0x36, 0x75, 0x31, 0xb9, 0xb8, 0xd1, 0xca, 0x79, 0x9d, 0x73, + 0x53, 0x64, 0x62, 0xaf, 0xa2, 0x54, 0xd3, 0x8e, 0x4d, 0xca, 0x8a, 0x57, + 0x82, 0x6e, 0x3f, 0xc2, 0x3f, 0xb1, 0x80, 0x56, 0xaa, 0x7a, 0x4b, 0x9e, + 0x65, 0x4c, 0xb8, 0xcb, 0x37, 0x41, 0x74, 0x89, 0x77, 0xbe, 0xc9, 0x9c, + 0xa9, 0xb8, 0x9b, 0x76, 0x60, 0xc3, 0x8f, 0x44, 0xca, 0xa8, 0x97, 0x5d, + 0x3d, 0xd4, 0x7f, 0x5a, 0xc8, 0x6b, 0x69, 0x51, 0x94, 0x92, 0x5c, 0x97, + 0x4c, 0xa1, 0x6b, 0x74, 0x46, 0x44, 0x4f, 0x7f, 0xa8, 0x95, 0x4c, 0x51, + 0xcc, 0x62, 0xbb, 0xaa, 0x39, 0x94, 0x89, 0x40, 0xa6, 0x7d, 0x5b, 0x60, + 0xd1, 0x4c, 0xa4, 0xb5, 0x7e, 0x96, 0x81, 0x4f, 0x3e, 0xb5, 0xb2, 0x7d, + 0x85, 0x93, 0x9c, 0x7a, 0x95, 0xc2, 0x67, 0x7f, 0x7b, 0x2c, 0x93, 0x84, + 0x68, 0x9a, 0xc2, 0xb7, 0x45, 0x76, 0x57, 0x54, 0xbe, 0x70, 0x6b, 0x6b, + 0x9f, 0xd1, 0xa6, 0x71, 0x84, 0x37, 0x2e, 0x64, 0x64, 0x78, 0x2e, 0x91, + 0xcd, 0xbc, 0x79, 0xd5, 0x3d, 0x84, 0xd0, 0x41, 0x35, 0xa4, 0x6f, 0x60, + 0x76, 0xc0, 0x86, 0xb0, 0x94, 0xc5, 0x8d, 0x8c, 0xb3, 0x84, 0x59, 0x87, + 0x84, 0xb1, 0x5e, 0xc3, 0xaf, 0xd4, 0xcb, 0x98, 0x5c, 0x7d, 0xa7, 0x46, + 0xa1, 0x5c, 0x79, 0x53, 0x69, 0x38, 0x5e, 0x62, 0x8d, 0x55, 0x8a, 0x72, + 0x9b, 0x6c, 0x43, 0x5d, 0x71, 0x6a, 0x7c, 0x63, 0x9b, 0xb7, 0x40, 0x9d, + 0x93, 0x88, 0xc5, 0x35, 0xc1, 0x82, 0xbe, 0xcd, 0x59, 0x73, 0x51, 0x5f, + 0xb8, 0x92, 0x3e, 0xae, 0x35, 0x72, 0x8b, 0x36, 0xd5, 0x80, 0x31, 0x95, + 0x7c, 0x66, 0x55, 0x48, 0x73, 0x8d, 0xcd, 0x99, 0x71, 0xbe, 0xa3, 0xae, + 0x8d, 0xae, 0xa7, 0x33, 0xaf, 0xbf, 0x57, 0x6c, 0xc6, 0x63, 0xb3, 0x51, + 0xae, 0x3d, 0xbf, 0x7f, 0xa7, 0xa9, 0x39, 0xa5, 0x7a, 0x4e, 0x41, 0xb1, + 0x58, 0x44, 0xca, 0x39, 0x89, 0xb4, 0x45, 0x6a, 0xb9, 0x37, 0x44, 0xa2, + 0x89, 0xa0, 0x40, 0x4b, 0x94, 0x67, 0x55, 0x9b, 0xc9, 0xbc, 0x6d, 0x55, + 0x58, 0x41, 0x9f, 0xd0, 0xd8, 0x3b, 0xc0, 0x8f, 0x80, 0x2a, 0x55, 0x9c, + 0x3f, 0x83, 0xcc, 0x92, 0x42, 0x95, 0x4b, 0x7c, 0x33, 0xbd, 0x8a, 0x3e, + 0xd3, 0x48, 0xb4, 0x68, 0x46, 0xad, 0x97, 0xab, 0x6d, 0x97, 0xc2, 0x57, + 0xc1, 0x30, 0xac, 0x56, 0xb9, 0xc5, 0x9a, 0x33, 0xa4, 0x76, 0x89, 0xc9, + 0x9a, 0x38, 0x61, 0x8e, 0x9b, 0xca, 0x39, 0x4c, 0x89, 0x96, 0x51, 0x5d, + 0xb2, 0x44, 0x73, 0x36, 0xd1, 0xb8, 0xad, 0xcd, 0x9d, 0x60, 0x3b, 0xcb, + 0x82, 0xc0, 0x68, 0x6d, 0xd4, 0x8c, 0xcc, 0x4a, 0x3e, 0x40, 0x99, 0x8e, + 0x85, 0xb6, 0x58, 0x89, 0x69, 0xac, 0x3c, 0xc8, 0x64, 0x35, 0xc5, 0x67, + 0x5e, 0x8a, 0x63, 0x39, 0x97, 0xc5, 0xa4, 0xa2, 0xa6, 0xab, 0xa7, 0x4e, + 0x3d, 0x5d, 0x81, 0xbf, 0x97, 0xc3, 0xc9, 0xd1, 0xb8, 0x60, 0x36, 0xa6, + 0xb2, 0x88, 0xb9, 0xc6, 0xce, 0xa6, 0xc3, 0x51, 0x41, 0xa7, 0x86, 0x58, + 0xc5, 0xa2, 0x63, 0x45, 0x99, 0x5c, 0x8a, 0x9a, 0x52, 0x92, 0xa7, 0xba, + 0x9c, 0xa5, 0x4c, 0xa2, 0x75, 0xb6, 0x43, 0x6f, 0x42, 0x87, 0xbc, 0xa4, + 0x50, 0x48, 0x7b, 0x7f, 0x67, 0xc2, 0x31, 0x9e, 0x9a, 0x7d, 0x61, 0xc1, + 0x83, 0x36, 0xc1, 0xcc, 0x88, 0x9f, 0xcb, 0x95, 0x46, 0x7a, 0xba, 0x79, + 0xcc, 0xcd, 0x58, 0x43, 0xaf, 0x6a, 0x83, 0x3f, 0xd6, 0x90, 0x56, 0xc4, + 0x9e, 0x9d, 0x67, 0xcc, 0x9c, 0xc4, 0x4f, 0xa7, 0x73, 0xb2, 0x84, 0x51, + 
0x65, 0xba, 0xd2, 0x67, 0x5b, 0x2f, 0x78, 0x3b, 0x68, 0x2d, 0x58, 0x94, + 0x48, 0xcb, 0xab, 0x57, 0x9b, 0x60, 0x8a, 0x69, 0xd1, 0x65, 0x9c, 0xb0, + 0x9b, 0x48, 0xb3, 0x4c, 0x48, 0xc4, 0x56, 0x56, 0x63, 0xa1, 0x8f, 0xad, + 0x84, 0x83, 0xb5, 0xbf, 0x7d, 0x36, 0x45, 0x71, 0x80, 0xb4, 0x61, 0x76, + 0x4c, 0x5b, 0x4d, 0x91, 0x92, 0x52, 0x62, 0x5f, 0x78, 0xba, 0x6e, 0xb0, + 0x3d, 0x77, 0xcb, 0x49, 0x9c, 0xb4, 0xc8, 0xbd, 0xcd, 0xb4, 0x46, 0x49, + 0x33, 0x55, 0x99, 0x6c, 0x4c, 0x7c, 0x4f, 0x2d, 0xa4, 0xc8, 0xc3, 0x4f, + 0x3c, 0xa4, 0x90, 0x41, 0x2e, 0x45, 0x90, 0x94, 0x68, 0xb5, 0xc4, 0xc0, + 0x60, 0xc5, 0x44, 0x7e, 0x70, 0x63, 0xc0, 0xd0, 0x55, 0x46, 0x86, 0xb9, + 0xc7, 0x60, 0x87, 0x3d, 0xaf, 0x76, 0xab, 0x79, 0xc0, 0xab, 0xd0, 0x54, + 0x69, 0xc3, 0x53, 0xae, 0x41, 0x9a, 0x7d, 0xc6, 0x91, 0x45, 0xbb, 0x92, + 0x5a, 0x66, 0xaf, 0x5c, 0x57, 0x6f, 0x62, 0x41, 0x53, 0xce, 0x3c, 0xd1, + 0x67, 0x5d, 0x80, 0x38, 0x53, 0x45, 0xa5, 0xb0, 0x88, 0x3a, 0x60, 0x6a, + 0xc5, 0x6b, 0x74, 0xb7, 0x95, 0xab, 0x64, 0x94, 0x57, 0xad, 0x6b, 0x6f, + 0x39, 0xa0, 0x3f, 0x4a, 0x74, 0xc4, 0x57, 0x71, 0x7a, 0x94, 0x4b, 0xb7, + 0x91, 0xba, 0x91, 0x44, 0x7c, 0x91, 0x59, 0x39, 0x4c, 0x5b, 0x3d, 0x3e, + 0x8d, 0xa5, 0x45, 0x3d, 0x83, 0x41, 0x70, 0x66, 0x7a, 0x4a, 0xc5, 0x43, + 0x9f, 0x7d, 0x75, 0x8c, 0xa1, 0x98, 0x55, 0x4f, 0x3b, 0x93, 0x63, 0xb2, + 0x65, 0x71, 0x46, 0x5a, 0x4d, 0x37, 0x5f, 0xc4, 0x6e, 0x32, 0xcb, 0x6b, + 0x9c, 0x5b, 0xb5, 0x58, 0xad, 0xa4, 0x81, 0x69, 0x66, 0x5e, 0x86, 0x7d, + 0x65, 0x6c, 0xbb, 0xce, 0xbb, 0x39, 0xca, 0x89, 0x84, 0x7d, 0x63, 0xa8, + 0xba, 0x4a, 0x63, 0x9a, 0x57, 0x59, 0x5c, 0xae, 0x7f, 0x5e, 0x6a, 0xaf, + 0xbc, 0xae, 0xcc, 0xb4, 0x59, 0x90, 0x56, 0xca, 0xc7, 0x3b, 0x68, 0x97, + 0xb8, 0x3e, 0x65, 0xa5, 0x94, 0xc0, 0x69, 0x97, 0xc8, 0x73, 0xb6, 0x6b, + 0xc2, 0xa1, 0x69, 0xcc, 0xaa, 0xd3, 0xca, 0x9b, 0xb8, 0xa7, 0x69, 0xd2, + 0x43, 0x7d, 0x6f, 0x3d, 0x44, 0x40, 0x78, 0x41, 0x40, 0x8a, 0x5d, 0xc2, + 0xbd, 0x85, 0x4a, 0x38, 0x50, 0x9f, 0x4b, 0x3a, 0x59, 0xad, 0xa9, 0x8e, + 0xcd, 0x73, 0x85, 0x58, 0xad, 0xad, 0x62, 0x6e, 0x3d, 0x78, 0x91, 0x74, + 0x4a, 0x63, 0xae, 0xba, 0xc0, 0xb1, 0x9e, 0x41, 0x32, 0x4f, 0x79, 0x4b, + 0xae, 0x97, 0xb1, 0x6a, 0xa5, 0x67, 0x86, 0xa7, 0xc0, 0xc3, 0x61, 0xcf, + 0x8d, 0x7f, 0xa8, 0x89, 0xa8, 0x97, 0xa6, 0xbe, 0x89, 0x5d, 0xc7, 0x5f, + 0x45, 0x7d, 0x5c, 0x50, 0xa1, 0xd4, 0x8b, 0x6e, 0x3b, 0xaa, 0x80, 0x87, + 0x84, 0x70, 0x8f, 0x35, 0xce, 0xc2, 0x8c, 0xcb, 0xbb, 0xc0, 0x5e, 0xb9, + 0x44, 0xa1, 0xc7, 0x49, 0xa4, 0x7a, 0xbd, 0xb8, 0x78, 0x77, 0xb4, 0xd4, + 0x32, 0xb6, 0x5b, 0x5c, 0xb5, 0x81, 0x48, 0xab, 0x60, 0xa0, 0x88, 0x5a, + 0x7b, 0xa2, 0x3f, 0xb3, 0x80, 0x91, 0x47, 0xbe, 0x3c, 0x5c, 0x51, 0x31, + 0x4f, 0xce, 0x6d, 0xb6, 0xa9, 0xa1, 0x64, 0xb3, 0x70, 0x74, 0x31, 0xb5, + 0x58, 0x7f, 0x92, 0x79, 0xbd, 0xc4, 0x6c, 0x76, 0xcb, 0x8b, 0xbe, 0xd1, + 0x77, 0x7e, 0x94, 0x9e, 0xc4, 0x7d, 0x85, 0xcf, 0xa2, 0x46, 0x5d, 0xbc, + 0x8a, 0x90, 0x96, 0x37, 0x6b, 0xd0, 0x71, 0xb2, 0x6f, 0xc6, 0x91, 0x63, + 0x7e, 0x99, 0x7e, 0x4f, 0xbb, 0xc1, 0x7e, 0xa2, 0x62, 0x65, 0x65, 0x6b, + 0xd0, 0x76, 0x7c, 0x38, 0xb9, 0x78, 0x84, 0x43, 0x4f, 0xa4, 0xbc, 0x80, + 0x63, 0x89, 0xbb, 0x87, 0x52, 0xb5, 0x4b, 0x97, 0x86, 0x57, 0x79, 0xd1, + 0xc7, 0x3d, 0xb1, 0xad, 0x58, 0x85, 0xa1, 0xb0, 0xaa, 0x8f, 0x88, 0xb3, + 0xb2, 0xbb, 0x4e, 0x4f, 0xa0, 0x7c, 0xa3, 0x30, 0x99, 0x86, 0xc0, 0x3c, + 0x5e, 0x85, 0x75, 0x46, 0x43, 0xcb, 0x68, 0x64, 0x4a, 0x8b, 0x4d, 0x60, + 0x42, 0x3d, 0xb4, 0x9c, 0x5e, 0x7e, 0x3f, 0x6f, 0x45, 0x4b, 0x62, 0x91, + 0xb5, 0x7e, 0x4e, 0x8d, 0xc4, 0x65, 0x7b, 0xb2, 0x84, 0x8c, 0xa2, 0x70, + 
0x5c, 0x68, 0x5c, 0x36, 0x99, 0x9b, 0x85, 0x91, 0x58, 0x6c, 0xc8, 0xc6, + 0x6f, 0x51, 0xa4, 0x87, 0xc2, 0x8a, 0x88, 0xac, 0x77, 0xaa, 0x8a, 0x3d, + 0xcd, 0xbe, 0x83, 0xa2, 0xad, 0x5f, 0x43, 0x8e, 0x90, 0x4c, 0x85, 0xa8, + 0xb4, 0xa7, 0x3a, 0x5f, 0x97, 0x5a, 0x92, 0xd1, 0xc4, 0xb7, 0x52, 0xa1, + 0xc2, 0xa6, 0xbb, 0xad, 0x5c, 0x61, 0x5f, 0x82, 0xb4, 0x3b, 0xc5, 0x3c, + 0x63, 0x99, 0x7d, 0x4c, 0xad, 0x96, 0xb4, 0x4b, 0x57, 0x70, 0x59, 0xc2, + 0xb1, 0xa4, 0xac, 0x34, 0x52, 0x43, 0x51, 0x75, 0x5a, 0x6a, 0xb5, 0x5b, + 0x55, 0x4d, 0x90, 0x46, 0x92, 0xbc, 0x31, 0x81, 0x3e, 0xb3, 0x39, 0x4e, + 0xb9, 0x3e, 0x37, 0xa5, 0x3f, 0xbe, 0x3e, 0x42, 0xb5, 0xa6, 0xab, 0x4a, + 0xce, 0xb0, 0x4a, 0x98, 0x78, 0x6c, 0x72, 0x96, 0x50, 0x55, 0x76, 0xae, + 0x81, 0x82, 0x32, 0x4b, 0xa4, 0xc3, 0x4a, 0x76, 0x9a, 0x5f, 0xc0, 0xc4, + 0x2e, 0x7d, 0xa8, 0x7f, 0xba, 0x9a, 0x6e, 0xad, 0x80, 0xa2, 0xd7, 0x39, + 0x42, 0x35, 0xa6, 0x36, 0x6e, 0x88, 0x9c, 0x4f, 0x96, 0x9e, 0xaa, 0x3c, + 0x4c, 0x4f, 0x30, 0x60, 0xb4, 0x3a, 0xce, 0x82, 0x79, 0xb3, 0xa5, 0x8a, + 0xce, 0x86, 0x90, 0xa6, 0x80, 0x84, 0x70, 0xa7, 0xc9, 0x69, 0x5b, 0x97, + 0x55, 0x6a, 0xcc, 0xc9, 0x97, 0xcb, 0xb4, 0x78, 0xbf, 0x52, 0x71, 0x52, + 0xad, 0x54, 0x5f, 0x86, 0x35, 0xad, 0x36, 0x66, 0x3e, 0xb8, 0xbb, 0x55, + 0x8f, 0x4a, 0x84, 0x71, 0x49, 0x98, 0x60, 0x9c, 0x5e, 0x7f, 0xba, 0x8d, + 0xc5, 0x75, 0x64, 0x9b, 0x84, 0x69, 0x48, 0x6d, 0x53, 0x3a, 0x4d, 0xb3, + 0xcc, 0x50, 0xc0, 0x49, 0xd7, 0x7b, 0x8d, 0x67, 0x42, 0xc9, 0xc5, 0xc7, + 0x78, 0x9b, 0x9c, 0xa4, 0x7f, 0xcd, 0x9f, 0x6b, 0x56, 0xc2, 0x4b, 0x9c, + 0x9b, 0x3d, 0x7d, 0x79, 0x4b, 0x3c, 0x43, 0x6f, 0xac, 0x51, 0x5e, 0x3e, + 0x82, 0x68, 0x3c, 0x5a, 0x73, 0x72, 0x76, 0x43, 0x42, 0xbf, 0x6c, 0xb5, + 0x7b, 0xc3, 0x4e, 0xa0, 0x2f, 0xa7, 0x89, 0x83, 0x71, 0xd1, 0x48, 0x4d, + 0x4f, 0xbc, 0x98, 0x59, 0xd3, 0xb5, 0x46, 0x3c, 0xad, 0xcd, 0x59, 0x80, + 0xbf, 0x37, 0x86, 0x89, 0x78, 0x83, 0x73, 0x66, 0x41, 0xa1, 0x99, 0x52, + 0x77, 0x3b, 0xb6, 0x4a, 0xa8, 0xc8, 0x36, 0x51, 0xa9, 0xc6, 0x77, 0x78, + 0x7a, 0xb2, 0xb4, 0x92, 0x62, 0xb9, 0x89, 0x76, 0x32, 0xc3, 0xb1, 0x3a, + 0xc6, 0xd5, 0x4d, 0xc3, 0xc2, 0x89, 0x9a, 0x9b, 0x4e, 0x6b, 0xd1, 0x55, + 0x36, 0x85, 0x5f, 0x8e, 0x86, 0x33, 0x57, 0x56, 0x58, 0x57, 0x46, 0x7a, + 0x7e, 0x7d, 0xd2, 0x3d, 0x98, 0xab, 0xb6, 0x7d, 0x60, 0x4f, 0x59, 0xbc, + 0x88, 0xbf, 0x33, 0x92, 0x9c, 0x64, 0x91, 0x52, 0xc1, 0xac, 0x5a, 0x52, + 0x33, 0xca, 0x93, 0x65, 0x40, 0x67, 0x99, 0x44, 0x51, 0x53, 0x46, 0xba, + 0xc2, 0x60, 0x68, 0x4b, 0xc9, 0x95, 0x8f, 0xc8, 0xaa, 0xc7, 0xcb, 0x94, + 0x6c, 0x59, 0x71, 0x7e, 0x8d, 0x94, 0x8a, 0x93, 0x91, 0x79, 0xb1, 0x4d, + 0xbd, 0xc3, 0xc3, 0x7a, 0xd9, 0x85, 0x64, 0x9f, 0x4d, 0x79, 0x8f, 0x80, + 0x37, 0xb4, 0x92, 0x60, 0xbd, 0x8c, 0x66, 0xa3, 0x2d, 0x61, 0x42, 0x9b, + 0xa8, 0x9c, 0x97, 0xcc, 0x3c, 0x40, 0x4f, 0xaa, 0x78, 0x93, 0x38, 0x67, + 0x4d, 0x86, 0x39, 0x94, 0x6d, 0x99, 0x41, 0xa5, 0xc0, 0x90, 0xb1, 0x6b, + 0x4a, 0x9d, 0x54, 0xa9, 0x5d, 0x80, 0x75, 0x9c, 0x9e, 0xbc, 0x4e, 0xbf, + 0x8e, 0xc6, 0x89, 0x43, 0x33, 0x9d, 0x7a, 0x9e, 0x78, 0x34, 0x64, 0x90, + 0x7f, 0xc8, 0x84, 0xa3, 0x6f, 0x90, 0xb1, 0x5c, 0x8f, 0x54, 0x51, 0x38, + 0xa6, 0x5c, 0x85, 0xbd, 0x88, 0xb3, 0x95, 0xa4, 0x9d, 0x4d, 0x36, 0x78, + 0xcb, 0x3d, 0x63, 0xbd, 0x68, 0xc6, 0x36, 0x8f, 0xc1, 0x37, 0xcf, 0x30, + 0xca, 0x40, 0x86, 0xa4, 0xb2, 0x9a, 0x88, 0x3e, 0xce, 0x6d, 0x43, 0xd4, + 0x3e, 0x60, 0xca, 0x3c, 0x8b, 0x5c, 0x71, 0x68, 0xa4, 0xca, 0x77, 0x70, + 0x76, 0x65, 0x53, 0x55, 0xa3, 0x76, 0xd1, 0xc0, 0x8d, 0x48, 0x75, 0xae, + 0x94, 0x81, 0x44, 0x8f, 0x93, 0x59, 0x71, 0x64, 0x8b, 0xc9, 0x60, 0x5b, + 
0x3b, 0x9f, 0x8e, 0x71, 0x7d, 0x83, 0xba, 0x91, 0x42, 0xb8, 0x71, 0xb2, + 0x79, 0x3c, 0x38, 0x60, 0xc8, 0xbc, 0x53, 0x44, 0x37, 0x65, 0xca, 0xae, + 0x90, 0x88, 0x4f, 0x8d, 0x5e, 0x4f, 0x59, 0xd0, 0xca, 0x71, 0x4f, 0x3d, + 0x67, 0xc8, 0x85, 0x99, 0x62, 0x50, 0xb5, 0x38, 0x7d, 0xc6, 0x9d, 0x4c, + 0x70, 0xd0, 0xd6, 0xb6, 0x96, 0x68, 0xb3, 0x6a, 0xad, 0x2f, 0x98, 0x3e, + 0x97, 0x54, 0x60, 0xa1, 0x80, 0xd7, 0x3d, 0xc5, 0xd9, 0x4e, 0x9d, 0xdd, + 0x9e, 0xbc, 0x84, 0x6b, 0x91, 0x9a, 0x93, 0x42, 0xa9, 0xa6, 0x8e, 0xc3, + 0xae, 0xc9, 0xcc, 0x57, 0x48, 0xb5, 0x8e, 0x48, 0x8e, 0x9f, 0x50, 0xb0, + 0x9a, 0x80, 0x64, 0xa1, 0x4f, 0x6d, 0xca, 0x3e, 0x4c, 0xcd, 0x54, 0x34, + 0x99, 0x88, 0xb6, 0x9c, 0xb8, 0xa6, 0x41, 0x2e, 0x69, 0x4d, 0x93, 0x92, + 0xc7, 0x77, 0xb7, 0xca, 0xa9, 0x7b, 0x3e, 0x44, 0x3c, 0x94, 0x9e, 0x45, + 0xa3, 0xca, 0xab, 0xa9, 0xc6, 0x4e, 0xd3, 0x82, 0x72, 0xa4, 0x5b, 0xc7, + 0x6d, 0x50, 0xc8, 0xaa, 0xae, 0x83, 0x4a, 0xbb, 0xc7, 0x41, 0xa8, 0xa4, + 0xb5, 0xb7, 0x32, 0x52, 0x7f, 0x85, 0xa9, 0x97, 0x5b, 0xc8, 0x6b, 0x86, + 0x75, 0x7e, 0x6e, 0x47, 0x5f, 0x6c, 0x8a, 0x70, 0x68, 0x84, 0xce, 0xb1, + 0x30, 0xc0, 0x8a, 0x68, 0x45, 0xa7, 0xb0, 0x88, 0x97, 0xb2, 0x3d, 0xbc, + 0x66, 0x7f, 0x3d, 0x4f, 0x70, 0x83, 0xbf, 0xad, 0x79, 0x64, 0xd5, 0xb3, + 0x78, 0x3d, 0x4f, 0x89, 0x39, 0x68, 0xb1, 0x92, 0x72, 0x84, 0xcb, 0x6e, + 0x3a, 0x83, 0xcd, 0xa9, 0xcb, 0x44, 0x71, 0x4c, 0xd3, 0x81, 0xb9, 0x64, + 0xab, 0x92, 0x64, 0xbf, 0x63, 0x57, 0xa2, 0x43, 0x52, 0x7f, 0x8a, 0x52, + 0xc4, 0x71, 0x4d, 0x96, 0x46, 0x7b, 0x70, 0x47, 0x83, 0x41, 0x5c, 0xc3, + 0x79, 0x4a, 0xc5, 0x64, 0xb4, 0xc0, 0x9a, 0xa4, 0x5e, 0x4b, 0x97, 0xaf, + 0x7f, 0x38, 0xa5, 0x81, 0x58, 0x39, 0x75, 0x6d, 0x86, 0x9f, 0xa0, 0x8d, + 0x5e, 0xb8, 0x4f, 0x7f, 0xdc, 0x45, 0x82, 0x3f, 0x62, 0x50, 0x40, 0xa5, + 0xb6, 0x7d, 0x3c, 0x75, 0xa0, 0x5c, 0x4c, 0x97, 0x49, 0xa2, 0x96, 0x9f, + 0x9c, 0x75, 0x3b, 0x8d, 0x93, 0x67, 0x50, 0x50, 0xb2, 0xa7, 0x99, 0x3f, + 0x66, 0xc4, 0x75, 0x83, 0xa0, 0x85, 0x62, 0xa3, 0x9a, 0x6a, 0x6e, 0x64, + 0x65, 0xa0, 0x53, 0xa8, 0x6b, 0x66, 0xd1, 0x65, 0xa3, 0x94, 0x91, 0x54, + 0xc9, 0xbd, 0x79, 0x70, 0x32, 0x56, 0x34, 0xab, 0x37, 0x80, 0x47, 0x9e, + 0x95, 0xcc, 0x7e, 0x45, 0x8a, 0x3b, 0xa3, 0x6b, 0x89, 0x50, 0xb5, 0x88, + 0x38, 0x50, 0x87, 0x38, 0x77, 0x84, 0x97, 0x76, 0x78, 0x57, 0xb9, 0x8c, + 0xa6, 0x9b, 0xa6, 0xa1, 0x70, 0x7e, 0x52, 0xa9, 0x73, 0x8d, 0x82, 0x6c, + 0x58, 0x77, 0x52, 0x7a, 0xcf, 0x54, 0x39, 0x77, 0xd1, 0xb1, 0xbb, 0xc6, + 0x37, 0xc6, 0x79, 0xb4, 0x93, 0xb4, 0x57, 0x61, 0x3d, 0x52, 0xb8, 0x95, + 0xb9, 0xbc, 0xd9, 0x9d, 0x2f, 0xad, 0x48, 0x82, 0x65, 0xaa, 0xc5, 0x63, + 0xdc, 0x4f, 0x4d, 0xd2, 0x94, 0xae, 0x57, 0x7a, 0x87, 0x84, 0x60, 0x3d, + 0xa1, 0x49, 0xc8, 0x68, 0x5f, 0x48, 0x69, 0x2e, 0x3e, 0xaf, 0x6a, 0xc6, + 0x5e, 0xa6, 0xca, 0xb7, 0x76, 0xb6, 0x97, 0xa5, 0x4e, 0xa4, 0xbb, 0x4b, + 0xbf, 0x9b, 0x92, 0xa1, 0xa1, 0x6b, 0x36, 0x6c, 0x78, 0x8a, 0x93, 0x5b, + 0x35, 0x62, 0x39, 0x5c, 0x71, 0x5d, 0x8a, 0x63, 0x8e, 0x64, 0x59, 0xd6, + 0x97, 0x4d, 0x78, 0xa2, 0x7e, 0x41, 0x86, 0x57, 0x53, 0xc1, 0x99, 0x3c, + 0x54, 0x90, 0x4b, 0x54, 0x89, 0x70, 0x7e, 0x91, 0x51, 0x67, 0x62, 0x33, + 0x48, 0xb6, 0xac, 0x9d, 0xd3, 0x94, 0x42, 0x94, 0x3d, 0xb0, 0xd0, 0xb5, + 0xc3, 0x3d, 0x3c, 0x40, 0xc6, 0xd7, 0x82, 0x96, 0x4a, 0xdb, 0xc0, 0x42, + 0x94, 0x54, 0xa6, 0x98, 0x86, 0x62, 0x64, 0xcc, 0x7d, 0x73, 0xac, 0xcc, + 0x82, 0x93, 0x58, 0xce, 0x9e, 0xa7, 0xb0, 0xac, 0x55, 0x5a, 0x39, 0x42, + 0x4a, 0xbf, 0xb1, 0x67, 0xae, 0x72, 0x36, 0x38, 0xb9, 0x73, 0xad, 0x9a, + 0xd4, 0x4e, 0x8f, 0x9f, 0xb7, 0x70, 0xa1, 0xa3, 0x91, 0x86, 0xc0, 0xd6, + 
0xb9, 0x2c, 0xbb, 0x85, 0x54, 0xa1, 0x6c, 0xc6, 0x40, 0xc1, 0x3f, 0x53, + 0x53, 0x45, 0x9e, 0x81, 0x3a, 0x46, 0x8c, 0xbb, 0x9d, 0x50, 0x3b, 0xb2, + 0x92, 0x79, 0x44, 0x78, 0xa0, 0x88, 0x83, 0x37, 0xa4, 0x72, 0x9e, 0x63, + 0x44, 0x92, 0x3c, 0xd5, 0x6a, 0xa9, 0xb6, 0xce, 0x3d, 0x4b, 0x71, 0x53, + 0xa6, 0xca, 0x7c, 0x63, 0x88, 0x5e, 0xbf, 0xa3, 0x4e, 0x70, 0xd7, 0x6f, + 0x84, 0x58, 0x91, 0x35, 0x63, 0x89, 0xd8, 0x8e, 0x48, 0xaa, 0xd1, 0x7c, + 0xc8, 0xc2, 0x6b, 0x34, 0x76, 0xc2, 0x59, 0xc8, 0x44, 0x4e, 0xb8, 0x54, + 0x9f, 0xd0, 0x40, 0x72, 0x61, 0x67, 0x42, 0x92, 0x78, 0x90, 0x98, 0x4f, + 0xb6, 0x97, 0xbe, 0x5f, 0xb3, 0xac, 0x4a, 0xa2, 0xa9, 0xaa, 0x38, 0x45, + 0xd2, 0x6d, 0x65, 0x76, 0xcf, 0x72, 0x42, 0x66, 0x74, 0xa4, 0x33, 0x75, + 0xc2, 0x7e, 0x57, 0x4f, 0xa6, 0x39, 0xc8, 0x34, 0x49, 0x8a, 0xd9, 0xcd, + 0x54, 0x97, 0x5a, 0x3d, 0xa6, 0x8e, 0xc0, 0x7b, 0x54, 0x7d, 0x6b, 0x48, + 0x74, 0xd4, 0x40, 0x73, 0x83, 0x5a, 0x55, 0x91, 0x4e, 0x7a, 0x5d, 0xbf, + 0xab, 0x3a, 0x76, 0x4f, 0xb8, 0x94, 0xbc, 0x56, 0xa7, 0x62, 0xba, 0x70, + 0x5f, 0x9d, 0x7d, 0x63, 0x54, 0x4b, 0xb7, 0x91, 0xac, 0xaf, 0x87, 0x54, + 0x6e, 0x86, 0xd1, 0x37, 0x62, 0xa6, 0x84, 0xcb, 0x8c, 0x7a, 0x72, 0xa5, + 0x78, 0x64, 0x81, 0x5e, 0x31, 0x88, 0x75, 0x35, 0x6a, 0x71, 0x93, 0x74, + 0x6a, 0x3f, 0x6c, 0xd0, 0x52, 0xab, 0xbf, 0x9a, 0x7a, 0x37, 0x91, 0x77, + 0x87, 0x5e, 0x82, 0x51, 0x3f, 0x3e, 0x43, 0x8f, 0x90, 0x6a, 0x7b, 0x53, + 0x59, 0x82, 0x99, 0x40, 0x8a, 0x3f, 0x7d, 0xb5, 0xb9, 0xbe, 0xd0, 0x56, + 0x3a, 0xa5, 0x97, 0x6a, 0x55, 0x61, 0x9d, 0x8b, 0x99, 0x2d, 0x33, 0xb9, + 0xd1, 0xbd, 0x3b, 0x7a, 0x40, 0x61, 0xb7, 0x79, 0x9b, 0x89, 0xcc, 0xbe, + 0x54, 0xbd, 0x32, 0x7e, 0x4a, 0x38, 0x45, 0x6d, 0x35, 0x6d, 0xbb, 0x65, + 0x40, 0x8e, 0x72, 0xb3, 0xac, 0x56, 0xa9, 0x3c, 0x3e, 0x34, 0x59, 0x6e, + 0xb5, 0x80, 0x72, 0x97, 0x88, 0x54, 0xc2, 0xd3, 0x3c, 0x50, 0xa8, 0x3a, + 0xb5, 0x4f, 0x48, 0x7e, 0xc4, 0xa8, 0x88, 0xc4, 0x89, 0xa2, 0x9a, 0x62, + 0x7b, 0x98, 0xcc, 0x92, 0x8d, 0xb2, 0x72, 0xb8, 0x9e, 0x95, 0xae, 0xa2, + 0xa0, 0x3b, 0xb4, 0x8c, 0xa9, 0x49, 0xb6, 0x4c, 0x6f, 0x7c, 0xc6, 0x50, + 0x92, 0xa8, 0xa3, 0x7f, 0xb9, 0xa1, 0x5c, 0x57, 0x31, 0x47, 0x34, 0xa9, + 0x8b, 0x88, 0x3a, 0xc2, 0x4c, 0x46, 0xb7, 0x33, 0xc1, 0x40, 0x57, 0x47, + 0xaf, 0xab, 0x8b, 0x3c, 0x74, 0xb2, 0xc6, 0x3e, 0x6c, 0x67, 0x89, 0xd3, + 0xa1, 0x77, 0xbe, 0x9d, 0x84, 0x57, 0x7b, 0x7d, 0xbf, 0x4a, 0xb5, 0x56, + 0xb5, 0x43, 0xb0, 0x98, 0x41, 0x32, 0x49, 0x8f, 0x96, 0x4c, 0x5a, 0xca, + 0x55, 0x31, 0x34, 0x6e, 0xa6, 0x95, 0x57, 0x4d, 0x89, 0xc2, 0xa3, 0x47, + 0xd0, 0x5d, 0x3c, 0x3f, 0x6e, 0x85, 0xcb, 0xbb, 0xd1, 0x7b, 0xab, 0x44, + 0x91, 0x37, 0xab, 0xc9, 0x67, 0xad, 0x6b, 0x9b, 0x9c, 0x88, 0x5f, 0x77, + 0x83, 0xa4, 0x37, 0x6b, 0x6f, 0xa0, 0x54, 0x5c, 0x66, 0xc5, 0xbd, 0x53, + 0x9c, 0xc8, 0x8f, 0x38, 0xbf, 0x81, 0x3f, 0x6f, 0x5c, 0x97, 0xc9, 0xa7, + 0x66, 0xac, 0x7d, 0x83, 0x85, 0x6c, 0x4b, 0x53, 0x5e, 0x7e, 0xd3, 0xb8, + 0x91, 0xc8, 0xd1, 0x75, 0xa5, 0x61, 0xa1, 0x9a, 0x37, 0x60, 0xcc, 0xbe, + 0x2f, 0x3a, 0x9f, 0x9f, 0x38, 0x53, 0x8b, 0x70, 0x34, 0xab, 0x48, 0x3e, + 0xcf, 0x96, 0xc4, 0x60, 0x9c, 0x54, 0x8c, 0xaa, 0xbb, 0x79, 0x4a, 0x33, + 0x55, 0x9b, 0x48, 0x41, 0x6b, 0xc2, 0x44, 0x3b, 0x4e, 0x7d, 0x31, 0x5b, + 0x3b, 0x6b, 0xc4, 0x55, 0x9a, 0x55, 0x63, 0x92, 0x4d, 0x93, 0x31, 0x47, + 0x7c, 0xb5, 0x52, 0x4a, 0x8a, 0x5a, 0x55, 0x3c, 0x40, 0xc2, 0xb0, 0x79, + 0x70, 0xa5, 0x3a, 0x9c, 0x95, 0xb7, 0x9c, 0xb0, 0xd0, 0x84, 0xad, 0x40, + 0x52, 0xb8, 0x66, 0x90, 0x87, 0xa0, 0xc1, 0x7e, 0x5e, 0x3c, 0x72, 0x46, + 0xb4, 0x5c, 0x72, 0x3a, 0xc8, 0x8b, 0x3e, 0x80, 0xba, 0x8e, 0x68, 0x69, + 
0x53, 0xae, 0xa2, 0x54, 0x3e, 0xb2, 0xa9, 0x9b, 0x78, 0x33, 0xb2, 0x7d, + 0xb4, 0x7c, 0x8d, 0x8f, 0x47, 0x79, 0x81, 0xbe, 0xaf, 0xca, 0x41, 0x5a, + 0x72, 0x96, 0x63, 0x85, 0x63, 0x36, 0x91, 0x3b, 0xbd, 0x48, 0x69, 0x7f, + 0xbb, 0x5e, 0x6d, 0x93, 0x69, 0xaa, 0x97, 0x42, 0x9a, 0xb9, 0x8a, 0x8b, + 0x63, 0x40, 0x96, 0x4c, 0x5a, 0x86, 0x3a, 0x95, 0x3f, 0x48, 0xb6, 0xbf, + 0xc7, 0x55, 0x5b, 0xbe, 0x5d, 0x5e, 0x7c, 0x51, 0x54, 0x3e, 0x63, 0xce, + 0x50, 0xc3, 0x8d, 0x52, 0x51, 0x4e, 0x95, 0xc0, 0x57, 0x8c, 0x6e, 0xb0, + 0x55, 0x47, 0x8c, 0x4c, 0x3f, 0xb1, 0x6f, 0x47, 0xa4, 0x89, 0xba, 0xab, + 0x46, 0xa4, 0x80, 0x8d, 0x7b, 0x39, 0x50, 0x88, 0x9c, 0x4e, 0x59, 0x9b, + 0x37, 0x3a, 0xbe, 0x97, 0x77, 0x90, 0x5f, 0x71, 0x94, 0x79, 0x87, 0x41, + 0xbd, 0x8d, 0x74, 0x40, 0x47, 0x95, 0xbd, 0x8b, 0x45, 0xbf, 0xa9, 0x64, + 0xb3, 0x41, 0xcd, 0x7d, 0x6b, 0x58, 0x72, 0xc6, 0xb9, 0x4a, 0xad, 0x42, + 0xac, 0x7b, 0x6c, 0xb4, 0x35, 0x3c, 0x3a, 0x8c, 0x6b, 0xae, 0x7b, 0x87, + 0xab, 0x77, 0x64, 0x32, 0x67, 0x70, 0xc0, 0x73, 0x9a, 0xba, 0x8a, 0xb1, + 0x4f, 0x2f, 0x36, 0xbd, 0xb7, 0xcd, 0x5f, 0x52, 0xa3, 0xbc, 0xb3, 0xcf, + 0xa7, 0x32, 0xb7, 0x36, 0xb2, 0x2b, 0x76, 0xd5, 0x5c, 0xd1, 0x55, 0x2f, + 0xc7, 0x8f, 0x38, 0x7f, 0x48, 0xc3, 0xce, 0x39, 0x63, 0x98, 0x4c, 0x9c, + 0x79, 0x43, 0x6e, 0x37, 0x74, 0xbf, 0x3b, 0xb3, 0xae, 0xa8, 0x91, 0x88, + 0x43, 0xc9, 0x8d, 0x75, 0x3e, 0xc9, 0x3c, 0x48, 0x81, 0xcf, 0x67, 0x8a, + 0x57, 0xa1, 0xb3, 0xb6, 0x66, 0xae, 0xb9, 0x6d, 0xca, 0x82, 0x53, 0x8e, + 0xb3, 0x6f, 0xb4, 0x6d, 0x82, 0x40, 0xb0, 0x86, 0x3b, 0xa1, 0x80, 0x49, + 0xa3, 0x7b, 0x77, 0x55, 0xb4, 0x9c, 0xa5, 0x54, 0xb6, 0x41, 0xc9, 0x5f, + 0x64, 0x7c, 0x3d, 0xa3, 0x61, 0x37, 0xc1, 0x58, 0x81, 0xc4, 0xce, 0x71, + 0x67, 0x4d, 0xaa, 0x6a, 0xb3, 0x88, 0x62, 0x9a, 0x7f, 0x9a, 0x57, 0x9b, + 0xbd, 0x92, 0xd3, 0x8a, 0x6d, 0xba, 0xb7, 0x4c, 0x80, 0x63, 0xc8, 0x76, + 0x4e, 0x74, 0x4e, 0x84, 0x4a, 0x8e, 0x75, 0x8b, 0xd0, 0x80, 0x34, 0x76, + 0x5b, 0x7e, 0x8b, 0x6e, 0x65, 0x34, 0x92, 0x57, 0x57, 0x5d, 0x6e, 0x44, + 0x95, 0xb4, 0x63, 0xb3, 0x44, 0xc4, 0x44, 0x8b, 0xcb, 0x64, 0x87, 0xb4, + 0xc8, 0xd5, 0xb1, 0xcd, 0xbd, 0x7a, 0x8b, 0x58, 0x9c, 0xa4, 0x6c, 0x2e, + 0xb1, 0x6b, 0x7f, 0xb6, 0x8d, 0x6a, 0x4d, 0x73, 0x8b, 0x87, 0x6b, 0xb8, + 0x88, 0x65, 0xc0, 0x44, 0xcf, 0x8a, 0x99, 0xa9, 0xd1, 0x4b, 0x59, 0xaa, + 0x6e, 0x7f, 0x36, 0x7f, 0xa8, 0x8e, 0x5e, 0xce, 0xc9, 0x9f, 0x8b, 0x5c, + 0x3f, 0xc7, 0x4c, 0x9e, 0xd8, 0xba, 0x8a, 0x80, 0x6c, 0xb2, 0x73, 0xc7, + 0x32, 0x89, 0x58, 0xbd, 0x90, 0x81, 0xa9, 0xcf, 0x66, 0x77, 0x58, 0xcc, + 0xc7, 0x91, 0x98, 0x50, 0x9e, 0x9b, 0x50, 0x6a, 0x8a, 0x58, 0x6e, 0xb7, + 0xce, 0xbe, 0x62, 0x6c, 0x4e, 0x3c, 0x8a, 0x95, 0x45, 0xb5, 0x67, 0x7c, + 0xd1, 0xa8, 0x85, 0x63, 0x47, 0xa4, 0x38, 0x4a, 0x83, 0xba, 0x5e, 0xbf, + 0x3e, 0x6e, 0x56, 0xa2, 0x34, 0x93, 0x67, 0x9a, 0x57, 0x5c, 0x34, 0x36, + 0xc3, 0x8c, 0x9d, 0x59, 0x6a, 0xd1, 0xa5, 0x4c, 0x9c, 0x90, 0xbf, 0xad, + 0x30, 0xb6, 0xab, 0x86, 0xb6, 0x84, 0x96, 0x66, 0x93, 0x99, 0x37, 0xcd, + 0xcf, 0x83, 0x36, 0x47, 0xc9, 0x5a, 0x6b, 0xa9, 0x49, 0xd4, 0xd3, 0x77, + 0x74, 0xb1, 0xc5, 0x66, 0x93, 0xb9, 0xbf, 0xc4, 0x24, 0x49, 0x93, 0x53, + 0xad, 0x60, 0xba, 0x97, 0xb5, 0x93, 0xa0, 0x70, 0xb2, 0xd5, 0xb9, 0x73, + 0x43, 0xa6, 0x74, 0x63, 0x79, 0x8f, 0xcb, 0x3e, 0xd0, 0xc4, 0x62, 0xc2, + 0xae, 0x3e, 0x73, 0xb1, 0xbb, 0x9d, 0xcc, 0x53, 0x69, 0x50, 0x9e, 0x7d, + 0x7a, 0x63, 0x67, 0x92, 0x4a, 0x63, 0xc5, 0x7d, 0xcc, 0xc7, 0x74, 0x3c, + 0x62, 0xa3, 0xb3, 0x8c, 0xb5, 0xb6, 0x51, 0x38, 0xbc, 0x8b, 0x44, 0x61, + 0xc8, 0xd4, 0xaf, 0x5e, 0x56, 0x8e, 0x85, 0x5b, 0x53, 0xd4, 0xba, 0x65, + 
0x67, 0xb5, 0x8c, 0xb2, 0x7c, 0xba, 0x4e, 0xc5, 0x73, 0x2e, 0xad, 0x7f, + 0x9d, 0x8f, 0x4b, 0x59, 0xad, 0x74, 0xa1, 0xcd, 0xd0, 0x45, 0xb2, 0xbb, + 0xb8, 0xa3, 0x41, 0xbe, 0xa4, 0x75, 0x5b, 0x30, 0xa7, 0xb8, 0x5f, 0xb1, + 0xb3, 0xb2, 0x46, 0xa5, 0xc2, 0x9e, 0x9d, 0xc4, 0x9f, 0x70, 0xc2, 0x54, + 0xc7, 0xac, 0x36, 0x8a, 0x9c, 0x8b, 0x64, 0x98, 0x50, 0x59, 0x4b, 0xac, + 0x98, 0x51, 0xc6, 0x69, 0xa9, 0x93, 0x6e, 0x5b, 0x31, 0x76, 0xa3, 0xc5, + 0x55, 0x5f, 0x44, 0x56, 0x36, 0xc3, 0xcd, 0x78, 0xc9, 0x58, 0x59, 0x44, + 0x4a, 0x93, 0x4a, 0x99, 0x34, 0x9a, 0xac, 0x99, 0x9e, 0x9a, 0x77, 0xad, + 0x93, 0x97, 0x51, 0x4e, 0x30, 0xac, 0x79, 0x97, 0xd4, 0x42, 0x99, 0x80, + 0x4f, 0x83, 0xc5, 0x3d, 0x31, 0xbe, 0x5f, 0x6b, 0x43, 0xbd, 0xc8, 0xca, + 0x32, 0x37, 0x94, 0x74, 0x6f, 0xa6, 0x34, 0x3d, 0x3a, 0x4d, 0x3d, 0x96, + 0xc9, 0xbd, 0x67, 0x77, 0x73, 0xb2, 0x6e, 0x98, 0x7c, 0x5d, 0xad, 0xb2, + 0x8c, 0x86, 0x40, 0x84, 0x45, 0x87, 0xbe, 0xa5, 0x47, 0xb0, 0x90, 0x92, + 0xa6, 0xb2, 0xc9, 0xa2, 0xc8, 0x54, 0x42, 0x9f, 0x72, 0xa8, 0xc9, 0xb5, + 0x6e, 0x9b, 0x9f, 0x6c, 0x73, 0x9c, 0xb7, 0x82, 0xd3, 0xc5, 0x93, 0x64, + 0x5a, 0xc5, 0xa4, 0xc5, 0x4d, 0x98, 0xd7, 0xc7, 0x7e, 0xaa, 0xbe, 0x3c, + 0x52, 0xd0, 0xbd, 0x80, 0xcc, 0x57, 0x5b, 0x55, 0x57, 0x77, 0x82, 0x67, + 0xc1, 0x61, 0xa1, 0x9a, 0xc0, 0x9c, 0x47, 0x81, 0x2f, 0xa2, 0xc5, 0xc7, + 0x7b, 0xa7, 0xb7, 0x6a, 0x89, 0x49, 0x82, 0xa8, 0x87, 0x78, 0x77, 0xb3, + 0xb6, 0x69, 0xb6, 0x6b, 0x59, 0x38, 0x44, 0x52, 0x9c, 0x6b, 0x5a, 0xab, + 0x8f, 0x7c, 0x8e, 0xc6, 0x43, 0x7e, 0xab, 0x34, 0x9a, 0x2d, 0xa6, 0xa8, + 0x60, 0xaf, 0x84, 0xca, 0x8f, 0x37, 0x7f, 0xcb, 0x50, 0xbb, 0xa9, 0xc5, + 0x67, 0x5d, 0x8e, 0x79, 0x81, 0x37, 0x9c, 0xd2, 0x8b, 0xa8, 0x65, 0xb4, + 0x93, 0x38, 0x95, 0x42, 0xb7, 0x99, 0x91, 0x8d, 0x33, 0x83, 0xdb, 0x76, + 0x3d, 0xab, 0x4b, 0x6e, 0xbe, 0x55, 0xb1, 0x75, 0xb3, 0x9f, 0xaf, 0x8b, + 0x89, 0xad, 0x65, 0x3e, 0x3c, 0x5c, 0xd9, 0xbb, 0x42, 0x53, 0x89, 0x80, + 0xbd, 0x6f, 0x8b, 0x51, 0x43, 0x76, 0xb2, 0x86, 0x90, 0x81, 0x73, 0xa8, + 0x6f, 0xc0, 0xd2, 0x8b, 0x2d, 0x30, 0x90, 0x49, 0x81, 0xcd, 0x6b, 0x5f, + 0x3b, 0xb5, 0x38, 0x3e, 0x80, 0xc4, 0xc7, 0xb6, 0xb6, 0x4f, 0x97, 0x69, + 0x7d, 0x83, 0x93, 0x39, 0xcc, 0x62, 0x2f, 0x96, 0xbb, 0xae, 0x4e, 0x54, + 0xe3, 0x4c, 0x7e, 0x3a, 0xbd, 0xc7, 0x6e, 0x3b, 0xa4, 0x6f, 0x90, 0xc5, + 0x7c, 0xbb, 0x61, 0x2f, 0xa2, 0x62, 0x57, 0x33, 0x76, 0x47, 0x8d, 0x94, + 0xc5, 0xd3, 0x67, 0xc4, 0x76, 0xa2, 0x72, 0x48, 0x5d, 0x68, 0x50, 0x7d, + 0xba, 0x59, 0x90, 0x70, 0x5a, 0x73, 0x83, 0x42, 0x4f, 0x7a, 0x55, 0xca, + 0x93, 0x9c, 0x66, 0x5c, 0x7e, 0x89, 0x4b, 0x95, 0x57, 0x90, 0x85, 0x92, + 0xc7, 0xcf, 0x71, 0x68, 0x64, 0xbe, 0x6c, 0xa9, 0x70, 0x71, 0x93, 0xd2, + 0x98, 0xc0, 0x45, 0x70, 0x7a, 0xbe, 0x7d, 0x6b, 0x8b, 0x5e, 0x67, 0x35, + 0x62, 0xd4, 0xc4, 0xb8, 0x79, 0xaf, 0x4a, 0xb9, 0x40, 0xb2, 0x74, 0xb1, + 0x7f, 0x58, 0x5d, 0x62, 0x83, 0x4c, 0x33, 0x78, 0x8d, 0xb7, 0xb0, 0xb7, + 0x3d, 0xc8, 0xb5, 0x78, 0x6d, 0x5c, 0xc8, 0x84, 0xb6, 0xbb, 0x3f, 0x72, + 0xaf, 0x85, 0x3e, 0x56, 0x94, 0x58, 0x35, 0x25, 0x81, 0x82, 0x8c, 0xa9, + 0x53, 0x77, 0x41, 0x48, 0x44, 0x3e, 0x9b, 0x4f, 0x90, 0x3e, 0x8c, 0x32, + 0x5e, 0x3b, 0x7a, 0xbb, 0xc7, 0xca, 0xc6, 0xcb, 0xa6, 0x5a, 0x82, 0x69, + 0x83, 0xa8, 0x65, 0x50, 0xa0, 0xc4, 0x8e, 0x5b, 0x9e, 0x55, 0x59, 0x8f, + 0x9b, 0xaa, 0x63, 0x5f, 0x87, 0x63, 0x2e, 0x8d, 0x5f, 0x3c, 0xac, 0x67, + 0x9f, 0x85, 0x41, 0x5a, 0xd4, 0xca, 0x74, 0x50, 0x4a, 0x91, 0xae, 0x81, + 0x8f, 0x88, 0x62, 0x84, 0x53, 0xa7, 0xad, 0x5a, 0x42, 0x5a, 0x6f, 0x31, + 0x8d, 0x66, 0x70, 0x64, 0x72, 0xa0, 0x97, 0xa5, 0x92, 0x50, 0x6e, 0x81, + 
0x46, 0x76, 0x64, 0x5d, 0x4f, 0x39, 0x88, 0xc9, 0xcc, 0xa2, 0x2e, 0x2f, + 0xca, 0xc3, 0x36, 0x5e, 0xbc, 0x57, 0xb3, 0xbb, 0x9c, 0x49, 0x53, 0x65, + 0xa3, 0xa0, 0x98, 0xc5, 0xa1, 0x45, 0x6d, 0x76, 0x67, 0x64, 0x9a, 0xab, + 0x63, 0xa2, 0xba, 0x5b, 0xce, 0x5c, 0x5b, 0x3f, 0xae, 0x9e, 0x96, 0x8d, + 0x3d, 0xd6, 0x98, 0x99, 0xa2, 0xb7, 0xba, 0x8b, 0xa8, 0xca, 0x5e, 0x78, + 0x4b, 0x99, 0x94, 0xaa, 0xb6, 0xc5, 0x4f, 0x47, 0x6e, 0x7b, 0xaa, 0xc9, + 0xa4, 0xb8, 0xc5, 0x95, 0x77, 0x4e, 0x4a, 0x55, 0x87, 0x4e, 0x9d, 0x6a, + 0x77, 0x46, 0x49, 0x92, 0xba, 0xbc, 0x3e, 0x63, 0xcc, 0x95, 0xad, 0x3d, + 0xb4, 0xd8, 0xb4, 0xc9, 0xd4, 0x93, 0x87, 0xaa, 0x74, 0x4d, 0x98, 0x3f, + 0x5a, 0xd5, 0x29, 0x42, 0x43, 0x50, 0xb4, 0x66, 0x9c, 0x38, 0x82, 0x5b, + 0x4d, 0x41, 0xca, 0xb4, 0x5b, 0x9b, 0x58, 0x6d, 0xd2, 0xc2, 0x6b, 0x72, + 0xa1, 0x53, 0xca, 0x50, 0x7b, 0xaa, 0x40, 0x41, 0xcf, 0x50, 0x7e, 0x5a, + 0x3e, 0x59, 0x93, 0x93, 0xac, 0x54, 0x5a, 0x93, 0x42, 0x6f, 0x73, 0xa4, + 0x94, 0xac, 0xbc, 0x71, 0x5f, 0xb2, 0x34, 0x55, 0xb4, 0xbc, 0xa6, 0xc1, + 0xd2, 0x4a, 0x72, 0x61, 0x99, 0x7b, 0xc7, 0x4b, 0x43, 0x98, 0x69, 0x32, + 0x3e, 0x7f, 0x86, 0x42, 0x7d, 0x36, 0xc0, 0x76, 0x74, 0x45, 0xb8, 0xb3, + 0x54, 0x78, 0x85, 0x94, 0x37, 0x39, 0xa6, 0x5f, 0x47, 0xca, 0x36, 0xbb, + 0xd5, 0x71, 0x58, 0x81, 0x98, 0x4f, 0xab, 0x9b, 0x35, 0xb9, 0xa3, 0x58, + 0xb3, 0x96, 0x2e, 0x8b, 0xc5, 0x65, 0x74, 0xb9, 0x47, 0x40, 0x5a, 0x82, + 0xad, 0x9b, 0xcc, 0x76, 0x61, 0xce, 0x50, 0xc3, 0x90, 0xad, 0xb9, 0xb1, + 0x6d, 0x34, 0x69, 0xae, 0xbd, 0xb4, 0xd1, 0x4b, 0xad, 0xc5, 0xd2, 0x5a, + 0x8d, 0x40, 0x71, 0x39, 0x38, 0x8c, 0x6c, 0xa5, 0xbe, 0xc9, 0xaa, 0x62, + 0x36, 0x82, 0x97, 0x5f, 0x40, 0x5f, 0xc0, 0x9c, 0xbc, 0x44, 0xbe, 0x3e, + 0x79, 0x80, 0xac, 0x96, 0x39, 0x63, 0x89, 0x74, 0xbf, 0x4b, 0xa6, 0x31, + 0x7c, 0x6c, 0x3d, 0xa9, 0x2e, 0x51, 0x33, 0x5e, 0x69, 0xc1, 0x44, 0x64, + 0x7e, 0xc8, 0x7e, 0x9c, 0x6b, 0x91, 0xd0, 0x43, 0x56, 0x52, 0xc8, 0x66, + 0x64, 0xb0, 0x9e, 0x34, 0xac, 0x78, 0x3b, 0x9a, 0x76, 0xd0, 0x9b, 0x6c, + 0x48, 0x9f, 0x75, 0x43, 0x4b, 0x4e, 0xb9, 0xb6, 0x71, 0x43, 0xa8, 0x4d, + 0x43, 0xba, 0xc2, 0xca, 0x63, 0xa7, 0x58, 0xa8, 0x8b, 0x3a, 0x32, 0xd0, + 0x83, 0x90, 0x76, 0xac, 0xa4, 0xd4, 0xbc, 0xaf, 0x58, 0xb0, 0xa8, 0xba, + 0x67, 0x96, 0x5e, 0xb2, 0x54, 0xc2, 0xc4, 0xa9, 0x87, 0x4d, 0xa2, 0xb7, + 0x49, 0xc8, 0x8b, 0x5f, 0x3d, 0xa4, 0x92, 0xd4, 0xb4, 0x4e, 0x68, 0xa4, + 0x8a, 0x6c, 0x83, 0x8d, 0xb1, 0x31, 0x80, 0x66, 0xad, 0xb4, 0x53, 0x6b, + 0xe1, 0x8c, 0x8e, 0x67, 0x8b, 0xa0, 0xce, 0x69, 0x89, 0x67, 0x39, 0x32, + 0x91, 0xd0, 0x7b, 0x38, 0xbb, 0xbe, 0x4e, 0x4a, 0xaa, 0xae, 0x3d, 0x38, + 0xb4, 0x89, 0x59, 0x76, 0xb6, 0x62, 0x49, 0xc2, 0x54, 0x75, 0xba, 0xc3, + 0xd4, 0xbf, 0xc4, 0xb8, 0x87, 0x8d, 0xb6, 0x6a, 0xaa, 0x97, 0x97, 0xca, + 0xaa, 0x97, 0x5d, 0x7a, 0x56, 0xae, 0x58, 0x42, 0x78, 0x54, 0x49, 0x5b, + 0x97, 0x79, 0xc7, 0x63, 0xbe, 0x88, 0x42, 0x65, 0xa1, 0x76, 0x3b, 0xa0, + 0x74, 0x49, 0x6c, 0x80, 0x36, 0x74, 0x68, 0x71, 0xb4, 0x9e, 0x8c, 0x71, + 0xbd, 0x5c, 0x3a, 0x74, 0x4e, 0x75, 0x82, 0x9a, 0x73, 0xbb, 0x57, 0x54, + 0x78, 0xb4, 0x7b, 0xa8, 0x80, 0x8e, 0xb9, 0x9a, 0xd0, 0x85, 0x5c, 0x8d, + 0xb8, 0x53, 0x92, 0x53, 0x49, 0xb7, 0x9e, 0x9b, 0x4e, 0x37, 0xb8, 0xb8, + 0x8f, 0xd9, 0x56, 0x8d, 0x72, 0xaf, 0x3f, 0x5d, 0x7c, 0x53, 0x86, 0x48, + 0x34, 0xbd, 0x5f, 0x8a, 0xcb, 0x7b, 0x41, 0x54, 0x74, 0x8d, 0xb6, 0x5f, + 0xaf, 0x63, 0xb8, 0x6c, 0xaa, 0x9e, 0x53, 0xce, 0xb0, 0x95, 0xa7, 0x3a, + 0xb7, 0xda, 0xb5, 0x34, 0x9c, 0x81, 0xc8, 0x47, 0xb5, 0x93, 0x6f, 0x7d, + 0xb6, 0x58, 0xd5, 0x67, 0x98, 0x43, 0xa0, 0xa2, 0x54, 0xb3, 0x35, 0x83, + 
0x9e, 0xc9, 0x54, 0xb4, 0x84, 0x71, 0xd3, 0x5c, 0xb9, 0xd1, 0xa5, 0x87, + 0x91, 0x96, 0x7c, 0x46, 0x98, 0xbe, 0x38, 0xb4, 0x97, 0x84, 0x8c, 0xa8, + 0x3b, 0xb5, 0xd2, 0x4a, 0x8d, 0xd3, 0x3d, 0x8b, 0xa5, 0xbe, 0x60, 0xa1, + 0x3f, 0xa3, 0x64, 0x9f, 0xbe, 0x70, 0x74, 0x5a, 0x95, 0x53, 0x5c, 0x8c, + 0x9f, 0x59, 0xc9, 0x30, 0x9f, 0x9f, 0xe1, 0x77, 0xcd, 0x5f, 0x81, 0x99, + 0x35, 0x86, 0x34, 0x5f, 0x8a, 0x9b, 0x8a, 0x6d, 0x8d, 0x59, 0xb2, 0xcf, + 0x79, 0x4e, 0xcf, 0x83, 0xba, 0xcb, 0xc8, 0xa9, 0x7a, 0x81, 0xd5, 0x3d, + 0xaf, 0xc7, 0xda, 0x91, 0x44, 0xb6, 0x52, 0x9e, 0x76, 0xcb, 0x64, 0x99, + 0x8a, 0x65, 0xc9, 0x7c, 0xa4, 0x4f, 0x55, 0x4b, 0x90, 0xd3, 0x65, 0x90, + 0xc2, 0xbd, 0xc4, 0x80, 0x7f, 0x6b, 0x51, 0xc2, 0x3a, 0xba, 0x5f, 0x47, + 0x62, 0x65, 0x5c, 0x94, 0x8b, 0xa3, 0x60, 0x72, 0xb1, 0x57, 0xae, 0xa9, + 0xd0, 0x63, 0x53, 0x6c, 0xcd, 0x9f, 0x44, 0x88, 0x52, 0x85, 0xb3, 0x3d, + 0xa3, 0xca, 0x51, 0x8b, 0x3b, 0x50, 0x9b, 0xc4, 0x3f, 0x55, 0x87, 0x5a, + 0x49, 0x8d, 0x57, 0x8c, 0xbc, 0x7a, 0x34, 0x70, 0x62, 0xab, 0xd7, 0x87, + 0x78, 0x4e, 0x8a, 0x65, 0x9d, 0xb5, 0xbb, 0x6b, 0x48, 0xae, 0x8d, 0x42, + 0x5e, 0xda, 0xcf, 0x4d, 0xd2, 0x56, 0x84, 0xc5, 0x75, 0x74, 0xb6, 0x52, + 0x96, 0xc5, 0x8e, 0x72, 0x5d, 0xc2, 0xb8, 0x76, 0x68, 0x71, 0xe0, 0x46, + 0x71, 0x94, 0xa6, 0x43, 0x66, 0x7f, 0x5f, 0x41, 0x46, 0x6d, 0x6f, 0x54, + 0xb2, 0x44, 0x7c, 0x41, 0xcf, 0x5d, 0x4f, 0x71, 0xb4, 0x35, 0x76, 0xc1, + 0x46, 0x7d, 0xaa, 0x66, 0x63, 0xb3, 0x76, 0x49, 0x8d, 0xbe, 0xc8, 0x45, + 0xb5, 0x9d, 0x3f, 0xb4, 0x5a, 0xae, 0x37, 0xa7, 0x8b, 0x7e, 0xc1, 0x6d, + 0xac, 0xaa, 0xcd, 0x52, 0x93, 0xc3, 0x7e, 0xa8, 0x45, 0x62, 0x9d, 0x51, + 0x3b, 0x46, 0xc9, 0x52, 0xb4, 0x6c, 0x54, 0x49, 0x90, 0x59, 0xbe, 0x35, + 0xc7, 0x60, 0x38, 0xb3, 0x5d, 0x30, 0x4b, 0x5a, 0x40, 0xae, 0x3e, 0x7c, + 0xc8, 0x89, 0x9f, 0xcc, 0x49, 0xbd, 0x56, 0xbb, 0x4e, 0x8b, 0x58, 0xd8, + 0x8f, 0xca, 0xa2, 0xc2, 0x5a, 0xce, 0x3e, 0x4a, 0x80, 0xcb, 0x3f, 0x31, + 0x48, 0x3f, 0x9f, 0x4d, 0x80, 0xb7, 0x57, 0x84, 0x61, 0x72, 0xa2, 0xa1, + 0x5b, 0x6a, 0xb8, 0xb4, 0x79, 0x75, 0x97, 0x60, 0x45, 0xa6, 0x60, 0xbe, + 0xad, 0xb7, 0x4e, 0x72, 0x5d, 0xae, 0x44, 0x6d, 0x5c, 0x96, 0x71, 0xd3, + 0x41, 0x59, 0x6b, 0xb8, 0x49, 0x3d, 0x83, 0x91, 0x5b, 0x55, 0x86, 0xb8, + 0x5a, 0x77, 0x32, 0x47, 0xb6, 0xc7, 0xa8, 0xb5, 0xc7, 0xd3, 0x65, 0xab, + 0x6a, 0xc3, 0x72, 0x8e, 0x8d, 0x53, 0xab, 0x43, 0x52, 0x70, 0x9f, 0x9f, + 0x50, 0x70, 0xc2, 0x4d, 0x96, 0xaa, 0x5a, 0xe0, 0x79, 0xb8, 0xd5, 0x9c, + 0xab, 0x46, 0xb0, 0x43, 0xb7, 0x8f, 0x7d, 0x40, 0x5f, 0x7a, 0xce, 0x2b, + 0xb2, 0x71, 0xcf, 0x98, 0xb9, 0x65, 0x94, 0x43, 0xbb, 0xb9, 0xd8, 0x7d, + 0x7f, 0xb8, 0x52, 0x4e, 0xb6, 0x6e, 0x3b, 0xaf, 0xa5, 0x5a, 0x3d, 0xbf, + 0xa5, 0x54, 0x4f, 0xa9, 0xa4, 0x67, 0x82, 0xbc, 0x49, 0xbe, 0xb7, 0x9d, + 0xa2, 0x6f, 0x37, 0xa7, 0x82, 0xdf, 0x56, 0xb6, 0x6d, 0x3b, 0x79, 0x79, + 0x76, 0xc4, 0xc6, 0x71, 0x85, 0x7f, 0xcf, 0xa0, 0x99, 0x94, 0x8a, 0xb3, + 0x90, 0x4d, 0xc8, 0x5a, 0xc1, 0x74, 0xbf, 0x9e, 0xae, 0xb4, 0x51, 0x58, + 0x43, 0x9a, 0xb2, 0xac, 0x92, 0x99, 0xc5, 0xba, 0x74, 0x49, 0x4a, 0xb3, + 0x37, 0x28, 0x87, 0x43, 0xb2, 0x92, 0x54, 0x60, 0x93, 0x35, 0x83, 0x50, + 0xd5, 0x4a, 0x2f, 0x3e, 0x6f, 0x8a, 0xc5, 0xb3, 0xa6, 0xc8, 0x50, 0x6f, + 0x40, 0x92, 0xcd, 0x8b, 0x85, 0x40, 0xca, 0x7a, 0x5f, 0x47, 0x74, 0xc0, + 0x8a, 0xcc, 0x89, 0x7a, 0x3c, 0xc1, 0x8f, 0xba, 0x54, 0xaa, 0x9f, 0xb8, + 0x52, 0x72, 0x58, 0x85, 0x50, 0x3e, 0x4e, 0x92, 0x56, 0xa5, 0x45, 0xa9, + 0xaa, 0xac, 0x8d, 0x91, 0x81, 0x2f, 0x99, 0x3f, 0x98, 0xbc, 0xbf, 0xd3, + 0x3c, 0x5b, 0x52, 0x7d, 0xd4, 0x80, 0x88, 0x68, 0x82, 0x9c, 0x4d, 0x33, + 
0x62, 0x53, 0xad, 0x3a, 0x7d, 0x4b, 0x67, 0x8d, 0x92, 0x6a, 0x5f, 0x8f, + 0xcd, 0x33, 0xc1, 0x73, 0x72, 0x38, 0xbb, 0xa9, 0xb4, 0xd6, 0xc4, 0xa2, + 0xa7, 0x87, 0x7f, 0x45, 0xb0, 0x92, 0x39, 0xac, 0x87, 0x97, 0xa1, 0x3a, + 0xb2, 0x5a, 0x32, 0xb3, 0x3f, 0x38, 0x80, 0x40, 0xa9, 0x58, 0xc3, 0x70, + 0x8b, 0xd2, 0xd5, 0xc8, 0x61, 0x77, 0xbf, 0x7d, 0x3d, 0x85, 0x6c, 0x3b, + 0x35, 0x65, 0x44, 0xa8, 0xb4, 0xa5, 0x45, 0xc4, 0x4e, 0xcd, 0x71, 0xd4, + 0xa2, 0x8c, 0xa4, 0x3e, 0x8e, 0xae, 0x98, 0xac, 0x5d, 0xb8, 0x2d, 0x8c, + 0x76, 0x4f, 0x3e, 0xb4, 0xaf, 0x6b, 0x8e, 0x9f, 0x46, 0x2e, 0x62, 0x67, + 0xb4, 0x3e, 0x87, 0xa2, 0x65, 0x78, 0x77, 0x86, 0xc5, 0x35, 0x98, 0xdb, + 0x4f, 0xc2, 0x7a, 0x8c, 0xc2, 0x8f, 0xa3, 0xa9, 0x81, 0x53, 0x3e, 0x87, + 0x78, 0x88, 0x88, 0x83, 0xa6, 0xb4, 0x47, 0x5f, 0x8c, 0x63, 0xa9, 0x96, + 0x63, 0x41, 0x5d, 0x7d, 0xbb, 0x72, 0x6e, 0x82, 0x71, 0xad, 0x3f, 0xb9, + 0xd6, 0x98, 0x97, 0xa9, 0xcb, 0x74, 0x9e, 0x63, 0x47, 0x63, 0xdd, 0x69, + 0x8f, 0x70, 0xbc, 0x98, 0x98, 0xb1, 0xc0, 0x86, 0xb9, 0xa2, 0xdb, 0x75, + 0x4e, 0x8d, 0xbe, 0xed, 0x95, 0x4b, 0x78, 0xa6, 0xac, 0x54, 0xb5, 0x4e, + 0xb3, 0x92, 0x33, 0x67, 0x62, 0x51, 0x6d, 0x44, 0x96, 0xc8, 0xb7, 0x89, + 0x47, 0x7b, 0x2b, 0x40, 0xc5, 0xd0, 0x96, 0x95, 0x92, 0x7f, 0x56, 0x4c, + 0x44, 0x9f, 0x75, 0x53, 0x6f, 0x3f, 0x92, 0x86, 0x89, 0xcd, 0xbf, 0x43, + 0x8a, 0x7c, 0x8b, 0x36, 0xb9, 0x42, 0xd4, 0x71, 0xc9, 0x39, 0x55, 0xd3, + 0xb4, 0x69, 0x46, 0x5e, 0x71, 0x4e, 0xa0, 0xc8, 0xb9, 0x97, 0x5e, 0xc8, + 0x56, 0xd3, 0xca, 0x40, 0x73, 0xad, 0x86, 0x5a, 0x8c, 0x6a, 0xae, 0xda, + 0xb6, 0xca, 0xa3, 0x99, 0x60, 0xac, 0x4d, 0x48, 0xc0, 0xcc, 0x87, 0xa0, + 0xc3, 0x62, 0xa1, 0x70, 0x91, 0x82, 0xaf, 0x72, 0x9d, 0x8b, 0x9a, 0x93, + 0x2d, 0x95, 0x8d, 0x9c, 0xa1, 0x5d, 0x47, 0x97, 0xdb, 0x8c, 0x73, 0x69, + 0x43, 0x5c, 0x3f, 0x3a, 0x8d, 0x64, 0xa2, 0xb8, 0xd2, 0x3d, 0x90, 0x58, + 0x34, 0xcb, 0x63, 0x82, 0x5b, 0x59, 0xa9, 0xa3, 0x73, 0xc5, 0xd6, 0x68, + 0x85, 0xcf, 0x50, 0x7d, 0xb8, 0x88, 0x3d, 0xa2, 0x4d, 0x7c, 0xa3, 0xbe, + 0xb3, 0x40, 0x38, 0x73, 0x50, 0x43, 0x98, 0x90, 0x45, 0x6f, 0x8f, 0xc7, + 0x4d, 0x96, 0x7a, 0x46, 0x8d, 0x75, 0x60, 0x6d, 0x7b, 0x54, 0xc4, 0x7f, + 0xe2, 0xa0, 0x57, 0xc9, 0xba, 0x5f, 0xab, 0x6b, 0x8b, 0x9a, 0x97, 0xb3, + 0x84, 0xa5, 0xd4, 0x3f, 0x5c, 0xa4, 0xc1, 0x81, 0xe0, 0x83, 0x5a, 0x29, + 0x92, 0xc1, 0xb4, 0x7a, 0x35, 0x63, 0x9f, 0x72, 0x91, 0xa6, 0x68, 0xc2, + 0xd1, 0x4d, 0x81, 0xb6, 0x7d, 0x9e, 0x88, 0xc2, 0x79, 0xb4, 0x92, 0xad, + 0x88, 0x54, 0x45, 0xc0, 0xb0, 0x38, 0x82, 0xba, 0xa9, 0x70, 0x8f, 0xb9, + 0xbf, 0xcb, 0x9a, 0xce, 0xb5, 0xbe, 0xb6, 0x37, 0x5b, 0x3d, 0x9f, 0x85, + 0xb2, 0xa6, 0x78, 0xd8, 0x59, 0x69, 0x63, 0x94, 0x62, 0x90, 0xa3, 0x49, + 0x90, 0xb1, 0x78, 0x44, 0xc3, 0x95, 0x56, 0xcc, 0x57, 0x41, 0x67, 0xb6, + 0xbf, 0x91, 0x6e, 0x49, 0xc4, 0xae, 0x75, 0x96, 0x9f, 0x8f, 0xa9, 0xa7, + 0x98, 0xc0, 0x9b, 0x31, 0x8c, 0x74, 0x5d, 0x7e, 0x88, 0x39, 0xb9, 0x5f, + 0x95, 0xc5, 0x85, 0xba, 0x3c, 0x94, 0x39, 0x87, 0x69, 0x99, 0xbf, 0x40, + 0x9a, 0xa5, 0x8e, 0xb4, 0x7f, 0xbb, 0x2e, 0x9c, 0x9d, 0x3e, 0xc6, 0x76, + 0x85, 0x32, 0x8c, 0x6d, 0x66, 0xc1, 0xc4, 0x5d, 0x6a, 0x8f, 0x53, 0x70, + 0x36, 0x9c, 0xbb, 0x3f, 0x6d, 0x67, 0xcb, 0xaa, 0x8a, 0xd7, 0x71, 0xce, + 0x35, 0xb1, 0x4e, 0xd5, 0x5a, 0x7c, 0x68, 0x57, 0xc8, 0xd5, 0xb5, 0x56, + 0x69, 0x30, 0x42, 0xbd, 0xb1, 0x62, 0x5d, 0xb4, 0xc0, 0x56, 0x5c, 0x57, + 0x3b, 0x98, 0xca, 0xb1, 0xae, 0xcf, 0xb1, 0xc1, 0xad, 0x7c, 0xa4, 0x83, + 0x91, 0x9c, 0x68, 0x5a, 0x60, 0x5e, 0x9f, 0x66, 0x8c, 0x96, 0x46, 0x7a, + 0xc3, 0x52, 0xc5, 0x30, 0xd6, 0x53, 0x81, 0x81, 0x5d, 0x91, 0x67, 0x44, + 
0x36, 0x4d, 0xb4, 0xb7, 0x8d, 0x6a, 0xb0, 0x9a, 0xd1, 0x50, 0x4d, 0xb3, + 0x67, 0xc4, 0x78, 0x9a, 0xc1, 0x5a, 0x3d, 0x5b, 0xd8, 0x97, 0x8c, 0x65, + 0xd4, 0xa2, 0x93, 0xb4, 0x4a, 0xc8, 0xaa, 0x7b, 0x4a, 0x4f, 0x82, 0xcc, + 0x97, 0x4d, 0x52, 0x72, 0x7a, 0x68, 0xb9, 0x32, 0x6a, 0xb3, 0x9c, 0xb1, + 0x35, 0x4d, 0x9c, 0x86, 0x6f, 0xae, 0x94, 0xca, 0xbd, 0x73, 0x44, 0x66, + 0x3c, 0xc1, 0x75, 0x3f, 0xe2, 0xc2, 0x8b, 0xb1, 0x91, 0xa0, 0x53, 0x6d, + 0x47, 0xb9, 0x2f, 0xc8, 0x76, 0xad, 0x75, 0x31, 0x3b, 0xb8, 0x72, 0xb8, + 0x8b, 0x2c, 0x2f, 0xb8, 0xba, 0x77, 0xb6, 0x4c, 0x60, 0x9e, 0x94, 0x67, + 0x81, 0x50, 0x4a, 0xa7, 0x32, 0x6f, 0xd3, 0x59, 0x51, 0x4a, 0xce, 0x68, + 0x73, 0xd6, 0x77, 0x9a, 0x8b, 0x84, 0xcc, 0x89, 0x73, 0xb0, 0x6d, 0x38, + 0x3a, 0x36, 0xa9, 0x58, 0x79, 0x74, 0x98, 0x70, 0x56, 0x50, 0x7d, 0x81, + 0xa7, 0x31, 0xa1, 0xbe, 0xb1, 0x8b, 0x86, 0xbf, 0x4e, 0xb1, 0x53, 0x61, + 0xc8, 0x51, 0x6f, 0x6b, 0x7d, 0xb6, 0xb8, 0x58, 0x40, 0xd8, 0x68, 0x47, + 0x57, 0x9b, 0x66, 0xc3, 0xa4, 0x5d, 0x53, 0xb2, 0xb8, 0x6a, 0xb3, 0x43, + 0x5d, 0x87, 0xa7, 0x3c, 0x69, 0x74, 0x9c, 0x79, 0x8a, 0x74, 0x63, 0x75, + 0xbb, 0x58, 0xca, 0x50, 0x53, 0xa4, 0x5b, 0x9b, 0xa9, 0x4b, 0x91, 0x3f, + 0x9b, 0x9f, 0x45, 0xc5, 0x90, 0x8d, 0xa7, 0x6a, 0x41, 0x4d, 0xa6, 0xa0, + 0x7e, 0xc1, 0x8c, 0x75, 0x3b, 0x75, 0xcb, 0x53, 0xb9, 0x82, 0xc7, 0x4e, + 0x5b, 0xb6, 0x52, 0x89, 0x77, 0x7f, 0x74, 0x4b, 0x5c, 0x7b, 0xa8, 0x62, + 0x3f, 0x3c, 0xdd, 0xb7, 0x68, 0x7d, 0x50, 0x61, 0x43, 0x3d, 0x71, 0x93, + 0xbb, 0xa9, 0x72, 0x7a, 0x8d, 0x33, 0x37, 0xc4, 0x5d, 0xbf, 0xc1, 0xa1, + 0x98, 0xd7, 0x3f, 0x8a, 0xca, 0xa4, 0xa7, 0xaa, 0x62, 0x73, 0x60, 0x93, + 0x55, 0x3c, 0xab, 0x43, 0xa7, 0x6d, 0x71, 0x9a, 0xb5, 0x78, 0x70, 0x59, + 0x44, 0x36, 0x45, 0xbf, 0xbb, 0x78, 0x88, 0x5a, 0xbf, 0xca, 0x59, 0x71, + 0x33, 0xcf, 0x8c, 0xb9, 0x93, 0x88, 0x6c, 0x4c, 0x39, 0x54, 0x3f, 0xcf, + 0x89, 0x5c, 0xc4, 0x45, 0x80, 0xac, 0x9e, 0x82, 0xad, 0xbd, 0x72, 0x34, + 0x2d, 0xbc, 0xda, 0xad, 0x5c, 0xa6, 0x9e, 0xa1, 0x7c, 0x36, 0x43, 0x76, + 0x51, 0xd2, 0x39, 0x9e, 0xab, 0xd2, 0x64, 0x65, 0x8a, 0x52, 0xb6, 0x54, + 0xc2, 0x3f, 0x71, 0x89, 0x41, 0xb8, 0xce, 0x8f, 0x80, 0xad, 0x70, 0x5a, + 0xab, 0x74, 0x75, 0x48, 0x3a, 0x3a, 0x76, 0xa7, 0x98, 0x43, 0x8e, 0x94, + 0xbe, 0x72, 0x8e, 0x36, 0x53, 0x80, 0x3c, 0x69, 0x3c, 0x96, 0x5c, 0x5d, + 0x85, 0x59, 0xc9, 0x66, 0x32, 0x7e, 0x9d, 0x85, 0x3c, 0xb6, 0xb8, 0x6c, + 0xcc, 0xb8, 0xd0, 0x4c, 0x48, 0xc6, 0x46, 0x44, 0xb2, 0x80, 0x79, 0x6e, + 0x8f, 0xc3, 0xa5, 0xae, 0xb5, 0x51, 0xb9, 0xac, 0x7f, 0x40, 0x4d, 0x4d, + 0x38, 0x5c, 0xa5, 0x36, 0xcb, 0xa2, 0xc6, 0xaf, 0x44, 0x92, 0x9f, 0x63, + 0x58, 0x3a, 0xb9, 0x3b, 0x4b, 0xb0, 0x5c, 0x7a, 0xc2, 0x99, 0x75, 0x77, + 0x5e, 0x8a, 0xa1, 0xc5, 0x38, 0x62, 0xbf, 0xcc, 0x48, 0x2c, 0x9c, 0x34, + 0x80, 0x9e, 0x6b, 0x3f, 0xa4, 0x9c, 0x6d, 0xc9, 0xaf, 0x7a, 0x9e, 0xd1, + 0xb2, 0xc9, 0x8c, 0x47, 0x9e, 0x46, 0x55, 0x4b, 0x59, 0x60, 0x65, 0xbf, + 0xa9, 0x7c, 0xd5, 0xad, 0x44, 0xab, 0x4b, 0x79, 0xc0, 0x3e, 0x98, 0x8a, + 0x81, 0xc3, 0x7b, 0x7a, 0x90, 0x6d, 0x91, 0xd0, 0x55, 0x7b, 0x81, 0xcd, + 0xa6, 0x5c, 0x88, 0x96, 0x74, 0xc8, 0x9e, 0x93, 0xbc, 0x51, 0x3f, 0x7a, + 0x99, 0x5c, 0x90, 0xcf, 0x52, 0x6e, 0x5b, 0xbf, 0xba, 0x53, 0xd1, 0xb9, + 0x39, 0x38, 0x69, 0x6e, 0x7a, 0x3d, 0xb4, 0x95, 0x4c, 0x39, 0x6c, 0xab, + 0x40, 0xc8, 0x65, 0x91, 0xc4, 0xc8, 0x59, 0x36, 0x70, 0x8a, 0x90, 0xb5, + 0x8f, 0xa3, 0x53, 0x85, 0x4d, 0x7b, 0x55, 0xa2, 0x35, 0x72, 0xb2, 0x32, + 0x64, 0x65, 0xd1, 0x4a, 0xc5, 0xb4, 0x58, 0x5b, 0x75, 0x68, 0x4b, 0x85, + 0x3f, 0x62, 0x73, 0xc2, 0xc9, 0xd6, 0xc1, 0x58, 0xbc, 0x9f, 0x55, 0xcb, + 
0xb9, 0x4e, 0x8b, 0x40, 0x94, 0x66, 0xc6, 0xc9, 0xb5, 0xb6, 0x90, 0xb6, + 0x61, 0x5e, 0x8d, 0x35, 0x84, 0xcc, 0xd6, 0x68, 0x73, 0xaa, 0x9f, 0xcb, + 0x96, 0x98, 0xcf, 0x95, 0x85, 0x7b, 0x5f, 0x7e, 0x58, 0x7c, 0x4e, 0xa8, + 0x7d, 0x3a, 0x90, 0xc8, 0x94, 0x78, 0xa2, 0xb1, 0x38, 0x30, 0x5f, 0x57, + 0xb8, 0x32, 0x9a, 0x42, 0x36, 0x99, 0x8f, 0x35, 0x3b, 0x44, 0xcd, 0x5f, + 0x66, 0x89, 0x7f, 0x50, 0x63, 0x9e, 0xbb, 0x91, 0x8d, 0xa4, 0xa9, 0x3e, + 0x58, 0x74, 0xb4, 0x60, 0xad, 0x75, 0x72, 0x65, 0xc9, 0x9b, 0x40, 0xca, + 0x5e, 0x38, 0x7e, 0xaf, 0x7e, 0x4d, 0xa6, 0xb8, 0x26, 0x55, 0x61, 0x96, + 0x5b, 0xa8, 0xe1, 0x88, 0xd4, 0xa6, 0x80, 0x6e, 0xcb, 0xce, 0x3c, 0x5b, + 0x7f, 0x65, 0x61, 0xb0, 0x70, 0x90, 0x72, 0x3d, 0xbf, 0xa5, 0x5d, 0x53, + 0xb8, 0xd0, 0x91, 0x9c, 0x93, 0x47, 0x5d, 0x56, 0x75, 0x90, 0x4c, 0xac, + 0xb3, 0x75, 0x82, 0xa8, 0x58, 0x36, 0x71, 0x4e, 0xc1, 0x5c, 0x58, 0x62, + 0xa5, 0x7b, 0x84, 0xbd, 0xd6, 0xbf, 0x52, 0x5d, 0x3a, 0x41, 0x3a, 0x84, + 0x9a, 0x35, 0x8d, 0x3f, 0xae, 0x8a, 0x71, 0x8e, 0x33, 0x80, 0x81, 0x94, + 0xcb, 0xb5, 0x9b, 0x5d, 0xb2, 0xa6, 0xbd, 0x7f, 0x88, 0x92, 0xbb, 0x67, + 0xaa, 0x70, 0x5c, 0xd1, 0x82, 0x4d, 0x78, 0x49, 0x41, 0x8a, 0x8d, 0x61, + 0x73, 0xc0, 0x56, 0x5f, 0xc9, 0xb6, 0x3c, 0x9f, 0x85, 0x51, 0xa5, 0xb8, + 0x2a, 0x9a, 0x41, 0x32, 0x9b, 0x68, 0x34, 0x7f, 0xcf, 0xc5, 0x60, 0x70, + 0x3b, 0x8f, 0x7a, 0xc6, 0x83, 0xd6, 0x60, 0x4d, 0x59, 0x78, 0x9f, 0x54, + 0x7d, 0x94, 0x79, 0x5b, 0x30, 0x4f, 0xcc, 0xce, 0x81, 0x43, 0xc8, 0x74, + 0x37, 0xb9, 0xc6, 0x5c, 0xb1, 0x78, 0x92, 0xa2, 0x7d, 0x58, 0x43, 0x67, + 0x7d, 0x78, 0x62, 0x32, 0xaa, 0x75, 0x7b, 0xaa, 0xba, 0x50, 0xaf, 0x2f, + 0x98, 0x3d, 0xba, 0x63, 0xc2, 0x79, 0xa0, 0x58, 0x34, 0xba, 0x7a, 0xad, + 0x84, 0x8e, 0x7e, 0x30, 0xc5, 0x85, 0x3f, 0x5b, 0x3e, 0x60, 0xbe, 0x95, + 0x3b, 0x89, 0x63, 0x7b, 0x67, 0x3e, 0xc2, 0x3a, 0x8e, 0x79, 0x74, 0xa5, + 0x6c, 0xa1, 0x92, 0x83, 0xbb, 0x6e, 0x65, 0xa5, 0xa0, 0x76, 0xaf, 0x5d, + 0x73, 0x6a, 0x3d, 0x5d, 0x81, 0x94, 0xbe, 0xc2, 0xa9, 0x35, 0x6a, 0x5c, + 0x3e, 0x73, 0x3b, 0x76, 0x7f, 0x4d, 0xa5, 0xb8, 0xcf, 0x84, 0x47, 0xb1, + 0x8d, 0x72, 0x4a, 0x45, 0x65, 0xb9, 0xbb, 0x4e, 0xc9, 0xb1, 0xc5, 0x5e, + 0xaa, 0xb5, 0x81, 0x88, 0xb4, 0xc2, 0xb5, 0x6d, 0x50, 0xbc, 0xae, 0xa6, + 0x44, 0x5c, 0xb5, 0x5b, 0xc0, 0xb9, 0x9e, 0xb6, 0xd8, 0xc0, 0x55, 0x42, + 0x7c, 0x9d, 0x5a, 0x7c, 0x4a, 0x82, 0x69, 0xcb, 0xbe, 0x80, 0x50, 0x59, + 0x2d, 0xb6, 0x9e, 0x6e, 0xb2, 0xb6, 0x7b, 0x82, 0xb1, 0x6b, 0x41, 0x4d, + 0x48, 0x5f, 0x83, 0x80, 0xc5, 0xa4, 0xd3, 0xa2, 0x77, 0x78, 0x8c, 0x7e, + 0x5f, 0x5f, 0x6f, 0x58, 0x6c, 0x7b, 0xbc, 0xbb, 0x8c, 0x30, 0xc0, 0x49, + 0xc9, 0x84, 0x98, 0x8a, 0xc3, 0x63, 0xb5, 0xd3, 0xa9, 0x40, 0xb9, 0x85, + 0x46, 0x8a, 0xb8, 0xcb, 0xa0, 0x80, 0xc7, 0x4d, 0x5c, 0x52, 0x73, 0xaf, + 0x68, 0x66, 0xd9, 0xa1, 0x9e, 0x44, 0xb9, 0xa1, 0xb5, 0x92, 0x46, 0x39, + 0x7b, 0x6b, 0x9d, 0xcb, 0xb9, 0xb2, 0xba, 0x3c, 0x6c, 0x2e, 0x75, 0x8f, + 0x62, 0x2b, 0xa9, 0xb1, 0x2e, 0x9c, 0x7b, 0xba, 0x45, 0x67, 0x6e, 0x6c, + 0x89, 0x51, 0x81, 0xb2, 0xbd, 0x94, 0x8f, 0x46, 0x62, 0xbc, 0xd4, 0x2d, + 0x59, 0x4e, 0xc8, 0x98, 0x3d, 0x4d, 0xc8, 0x4c, 0x8a, 0x96, 0x72, 0x7b, + 0xd2, 0xc9, 0x81, 0xc4, 0x88, 0xd9, 0xc3, 0x77, 0x41, 0x2a, 0x36, 0x3c, + 0xc4, 0xca, 0x96, 0x54, 0xcb, 0x7a, 0x5c, 0xbb, 0xc8, 0x9e, 0x5f, 0x6b, + 0xa0, 0x86, 0x78, 0xb2, 0xd7, 0xbf, 0xa0, 0x90, 0x32, 0x5f, 0x53, 0x56, + 0x4f, 0x53, 0xaa, 0x96, 0x6d, 0x75, 0xba, 0x3d, 0xa8, 0xa3, 0x85, 0xa8, + 0x43, 0xc5, 0x68, 0x5e, 0x5e, 0xd2, 0x50, 0x8f, 0x2a, 0x6f, 0x6c, 0x77, + 0x62, 0x80, 0x8a, 0xa6, 0x5d, 0xd9, 0x9d, 0x78, 0xb0, 0xd5, 0x58, 0x7d, + 
0xd9, 0xb6, 0x38, 0x69, 0x9a, 0x64, 0x54, 0x86, 0x94, 0x56, 0x5c, 0xb1, + 0x98, 0xce, 0x78, 0x76, 0x67, 0x4e, 0xb9, 0x9c, 0xb3, 0x52, 0x8b, 0xc1, + 0x4e, 0x8c, 0xbb, 0x95, 0x40, 0xe1, 0x61, 0x65, 0x30, 0x6e, 0x92, 0x71, + 0x8f, 0x6a, 0x80, 0xc2, 0xb5, 0xc9, 0x38, 0x9b, 0x4a, 0x76, 0x63, 0x47, + 0x40, 0x77, 0x45, 0x9f, 0x39, 0xcf, 0x44, 0x99, 0x8e, 0x6b, 0x37, 0x3f, + 0xc3, 0x58, 0xb7, 0x7b, 0x85, 0x65, 0x52, 0x8a, 0xc9, 0x5a, 0xad, 0xd1, + 0xcb, 0xd7, 0x6f, 0x7e, 0xbd, 0x97, 0xb5, 0x75, 0x5c, 0x84, 0xd3, 0x9a, + 0x94, 0x5a, 0x6e, 0xbe, 0xcb, 0x76, 0x89, 0x4a, 0x48, 0x57, 0xc5, 0xb7, + 0xa5, 0xa1, 0x9f, 0x70, 0x81, 0xab, 0x69, 0x82, 0x99, 0x38, 0x74, 0xa9, + 0x9e, 0x62, 0x97, 0x7a, 0xc1, 0x80, 0xa7, 0x61, 0x3d, 0xa9, 0x83, 0x5e, + 0x5f, 0x2f, 0x7c, 0xaf, 0x81, 0x77, 0x9a, 0x99, 0xa9, 0x3b, 0x51, 0xb1, + 0x57, 0x77, 0x99, 0x61, 0x9a, 0x9d, 0xba, 0xc0, 0x89, 0x9c, 0xd0, 0x91, + 0x77, 0x94, 0x77, 0x82, 0x9c, 0x69, 0x57, 0xc7, 0x7a, 0xa1, 0xa8, 0x2a, + 0x9c, 0x87, 0xa4, 0x3d, 0xb9, 0x51, 0x3b, 0xa7, 0x9e, 0x67, 0x5f, 0x98, + 0xb8, 0x8c, 0xbd, 0x3a, 0x72, 0x53, 0x41, 0x37, 0x96, 0x86, 0x9d, 0xb3, + 0x33, 0x3b, 0x87, 0x98, 0x92, 0xa6, 0x69, 0x78, 0x47, 0xb2, 0xc7, 0x97, + 0x50, 0x60, 0x7b, 0x31, 0xb1, 0x77, 0xba, 0x2d, 0xd6, 0x64, 0xdb, 0x8d, + 0x7f, 0x77, 0x7c, 0x5d, 0x49, 0x50, 0x4a, 0xa2, 0xa3, 0xa6, 0xba, 0x70, + 0x7e, 0x4e, 0x84, 0x8b, 0x3e, 0x41, 0x34, 0x52, 0xb7, 0xb3, 0xcd, 0x74, + 0x88, 0x45, 0xd2, 0x69, 0xa6, 0xc8, 0x7a, 0x86, 0xc0, 0x8b, 0x97, 0x88, + 0x61, 0x6a, 0xad, 0x30, 0xa3, 0xb5, 0x6a, 0x36, 0x9d, 0x3a, 0x67, 0x5e, + 0x36, 0x8d, 0x8e, 0x5b, 0x9d, 0x7a, 0xb0, 0x3e, 0x4f, 0x82, 0xc5, 0x34, + 0xc8, 0x3e, 0x52, 0x5b, 0x52, 0x4d, 0xad, 0xa4, 0x93, 0x86, 0x64, 0xbe, + 0x65, 0xa4, 0x9d, 0x71, 0x59, 0xb2, 0x44, 0xab, 0xce, 0xaf, 0xc1, 0x99, + 0xac, 0x4c, 0x54, 0x2f, 0x57, 0x23, 0x3c, 0x4d, 0x75, 0x51, 0xbf, 0x5e, + 0xae, 0x39, 0xaf, 0x47, 0x9c, 0x5d, 0x70, 0xb9, 0xc5, 0xc9, 0xa8, 0x76, + 0x67, 0x72, 0xc8, 0x54, 0x72, 0x4a, 0x35, 0xbe, 0xce, 0x5c, 0xb1, 0x58, + 0x72, 0x5b, 0x8b, 0x77, 0x69, 0xa7, 0x7a, 0x5a, 0x97, 0x4b, 0x46, 0x2a, + 0xa6, 0x2b, 0x65, 0x32, 0x84, 0x7e, 0x61, 0x62, 0xa4, 0xc4, 0x44, 0xb2, + 0xac, 0x9b, 0xac, 0x63, 0xcc, 0x4e, 0x85, 0xb2, 0x97, 0xa3, 0x62, 0x31, + 0x9f, 0x7d, 0xb2, 0x7b, 0xbb, 0xa1, 0x5d, 0x4e, 0xc4, 0x60, 0xb9, 0xb3, + 0x93, 0xcf, 0x6e, 0x60, 0x4f, 0x8b, 0xd1, 0x98, 0xb5, 0x43, 0x90, 0x8f, + 0x8b, 0x6c, 0x36, 0x7e, 0x8c, 0x68, 0x5d, 0xa6, 0x3a, 0x41, 0x7b, 0x5f, + 0xba, 0x6b, 0x6e, 0x6b, 0x5a, 0x4e, 0x99, 0x55, 0x63, 0x69, 0xce, 0xab, + 0x4e, 0x75, 0x53, 0x44, 0xc2, 0x38, 0xb6, 0x41, 0x96, 0x6b, 0x83, 0x93, + 0xce, 0x3c, 0x5d, 0xba, 0xa6, 0x50, 0x4c, 0x83, 0x3e, 0xc4, 0xb1, 0x73, + 0x72, 0x48, 0x7e, 0xa5, 0x67, 0x3a, 0xb9, 0xa3, 0x3c, 0x9a, 0xab, 0x34, + 0xa6, 0x71, 0x3a, 0xb2, 0x6a, 0x61, 0xb1, 0x9a, 0x62, 0x45, 0x4b, 0x40, + 0x85, 0xb5, 0xb6, 0x70, 0xbb, 0x75, 0x97, 0xa1, 0x9e, 0x82, 0x39, 0xa6, + 0x76, 0x88, 0x95, 0x4f, 0x75, 0x47, 0xc8, 0x93, 0xaf, 0x6b, 0x8d, 0x70, + 0xb8, 0xc6, 0x43, 0x3d, 0x37, 0x93, 0xb4, 0xbf, 0x8a, 0x82, 0x37, 0x7d, + 0x45, 0xc3, 0xb3, 0x99, 0x7a, 0x9b, 0x58, 0x56, 0xb4, 0x40, 0xca, 0xc3, + 0x9f, 0xc5, 0x7a, 0xc7, 0xb5, 0x48, 0x8d, 0x50, 0x6b, 0x3f, 0x99, 0x59, + 0xab, 0x83, 0x71, 0x86, 0xb0, 0xc4, 0x9d, 0x57, 0x42, 0x2f, 0x73, 0x89, + 0x86, 0x76, 0x41, 0x35, 0x73, 0x61, 0x2c, 0x98, 0x74, 0x36, 0x97, 0x5b, + 0xb4, 0xc9, 0x98, 0x87, 0xac, 0xbf, 0x6c, 0x95, 0xae, 0x4b, 0x36, 0xcd, + 0x97, 0xa7, 0xb0, 0x59, 0x6f, 0xb4, 0x4a, 0x40, 0x3e, 0x7b, 0x4f, 0x61, + 0x75, 0x93, 0xa7, 0xbb, 0x65, 0x46, 0x66, 0x45, 0xc8, 0xae, 0xc3, 0xca, + 
0x6c, 0x67, 0x84, 0x9a, 0xb7, 0x8b, 0x77, 0x7c, 0x2d, 0x80, 0x9a, 0x6a, + 0x6e, 0x98, 0x97, 0x33, 0xb2, 0x9a, 0x35, 0x48, 0x53, 0xc9, 0xa2, 0x38, + 0xc8, 0x89, 0xb9, 0x56, 0x4f, 0xc3, 0xb2, 0x5b, 0x83, 0x74, 0x42, 0xc8, + 0xae, 0x83, 0x61, 0x63, 0xb9, 0x6b, 0x9a, 0xcc, 0x86, 0x8f, 0xa2, 0x3d, + 0x38, 0xc0, 0xb0, 0x3a, 0x7d, 0x43, 0xac, 0x8f, 0x97, 0x4a, 0x3d, 0x8b, + 0x3b, 0xa1, 0x3b, 0xb8, 0x99, 0x69, 0x90, 0xad, 0xae, 0x85, 0xb7, 0xbb, + 0x85, 0x3f, 0xcb, 0x47, 0x4a, 0x42, 0x65, 0x99, 0x5e, 0x70, 0xb6, 0x46, + 0x7f, 0x7e, 0x81, 0x42, 0xa7, 0xcb, 0x40, 0x4a, 0x89, 0xb1, 0xb7, 0xc5, + 0x60, 0x58, 0x66, 0x57, 0x4e, 0x65, 0x3f, 0x5c, 0x6f, 0x82, 0x65, 0x51, + 0x85, 0x50, 0x6c, 0xc0, 0x5e, 0xaa, 0xb7, 0x54, 0x51, 0x4e, 0x95, 0x84, + 0x90, 0x45, 0x31, 0x6e, 0x53, 0x2f, 0x9b, 0x68, 0x9f, 0x75, 0xc6, 0x36, + 0x57, 0x46, 0x9c, 0x6f, 0x3e, 0x85, 0x4a, 0x89, 0xc5, 0x7e, 0x8d, 0x52, + 0x97, 0xa0, 0x64, 0x40, 0x3f, 0x72, 0x37, 0x9d, 0xc1, 0xb7, 0x43, 0xa0, + 0x52, 0x5f, 0xc0, 0xa1, 0x7f, 0xb5, 0x70, 0x2e, 0xbc, 0x54, 0x71, 0xcd, + 0x31, 0xcd, 0x31, 0x36, 0x92, 0x9e, 0x4b, 0x5e, 0xa7, 0x9a, 0x77, 0x61, + 0x38, 0x86, 0x96, 0x73, 0x9e, 0x41, 0x9d, 0x78, 0x76, 0xd6, 0xc3, 0x40, + 0x6c, 0xb5, 0x78, 0xc6, 0x62, 0x92, 0x54, 0x58, 0x77, 0x8a, 0xb7, 0xc5, + 0xa0, 0x65, 0x9d, 0x9a, 0x35, 0xc0, 0xb5, 0x45, 0x92, 0x92, 0x5f, 0x74, + 0x81, 0x68, 0x37, 0xb9, 0x55, 0x44, 0x94, 0x2f, 0x6f, 0x91, 0x30, 0x89, + 0x95, 0x7a, 0x7d, 0x98, 0x81, 0x5c, 0xcf, 0x49, 0x30, 0x71, 0x3d, 0x71, + 0xa6, 0x76, 0x64, 0x34, 0x8e, 0xc3, 0xad, 0x7a, 0x47, 0xca, 0x60, 0xb1, + 0xaa, 0x9a, 0x79, 0x78, 0x6c, 0x94, 0x92, 0x8c, 0x92, 0x87, 0x3b, 0x83, + 0x4b, 0xbe, 0x7e, 0x81, 0x65, 0x33, 0xc2, 0x54, 0x80, 0xae, 0x78, 0x60, + 0x3d, 0x2c, 0x77, 0x91, 0xa8, 0xc4, 0x61, 0x5b, 0x8b, 0x95, 0x34, 0x67, + 0x38, 0x53, 0x2d, 0xa6, 0xaf, 0xb4, 0xb2, 0xba, 0xcc, 0x48, 0xbc, 0x76, + 0x82, 0xbf, 0xc5, 0xb6, 0x94, 0x9b, 0xc9, 0xa9, 0xc9, 0xc8, 0x34, 0x50, + 0xab, 0x53, 0x32, 0xbe, 0xb6, 0x30, 0x97, 0x8d, 0x46, 0x82, 0x79, 0x4d, + 0x8e, 0xbe, 0x76, 0x84, 0xa1, 0xc5, 0xc0, 0xc0, 0xa0, 0x51, 0xc3, 0x7c, + 0x8b, 0x4b, 0x8c, 0x52, 0x42, 0xb0, 0x4a, 0xb8, 0xae, 0xc3, 0x87, 0x5c, + 0xb5, 0x4a, 0x34, 0x3c, 0x9c, 0x6f, 0x3c, 0x37, 0xae, 0x5a, 0xc5, 0x4c, + 0x37, 0x58, 0xa7, 0xaa, 0xba, 0x88, 0xa4, 0x63, 0xab, 0x8d, 0x8c, 0x7c, + 0xcb, 0xc2, 0x8a, 0xc7, 0x5c, 0x8c, 0xb2, 0x92, 0x90, 0x96, 0x48, 0x71, + 0xc9, 0x55, 0xa3, 0xa1, 0x6c, 0xbc, 0x2f, 0x3b, 0x85, 0x98, 0x90, 0x57, + 0xb2, 0xc9, 0x63, 0x7c, 0x40, 0x86, 0x3f, 0xcc, 0x9f, 0xb2, 0x71, 0x98, + 0xae, 0xcc, 0x77, 0xc9, 0x8e, 0x3c, 0xac, 0x83, 0x86, 0x73, 0x81, 0xa5, + 0x5e, 0x83, 0x45, 0x41, 0xbc, 0x94, 0x45, 0x33, 0x6d, 0x8b, 0xb8, 0xa3, + 0xb4, 0xb1, 0x2f, 0xc2, 0xb8, 0x54, 0x67, 0x59, 0xbb, 0x61, 0x40, 0x4d, + 0xab, 0xba, 0x52, 0x48, 0x3f, 0xc2, 0xb4, 0xc1, 0x49, 0xaa, 0xab, 0xad, + 0x58, 0xa3, 0x5a, 0x50, 0xa5, 0x79, 0x5b, 0xa0, 0xc6, 0x97, 0x49, 0x54, + 0x95, 0x9f, 0x56, 0x34, 0x83, 0xa0, 0xa8, 0x9f, 0x87, 0xcb, 0x3b, 0x82, + 0x53, 0x84, 0x75, 0x2f, 0xa3, 0x58, 0x67, 0xc2, 0xd0, 0x4b, 0x6c, 0x39, + 0x7a, 0xc5, 0x36, 0xc0, 0xad, 0xa7, 0x7d, 0xbd, 0x35, 0x2c, 0xb1, 0x8c, + 0x43, 0x77, 0x44, 0x4c, 0x4a, 0xb6, 0xca, 0xbf, 0x94, 0x83, 0x75, 0x67, + 0x2d, 0x7b, 0x96, 0x52, 0xc3, 0x80, 0x73, 0x64, 0x67, 0x30, 0x37, 0xb4, + 0x92, 0x9a, 0xaa, 0x47, 0x31, 0x79, 0xaa, 0xc3, 0x88, 0x72, 0x8d, 0x92, + 0x59, 0xb4, 0x75, 0x47, 0x4d, 0x49, 0xb1, 0x55, 0x6d, 0x80, 0xd7, 0xc0, + 0x62, 0x85, 0x7a, 0xad, 0xce, 0xcb, 0x9d, 0x39, 0x58, 0x7a, 0xa7, 0x92, + 0x98, 0x3a, 0xb6, 0x4a, 0x3c, 0xa6, 0x2e, 0x97, 0x89, 0x69, 0xa8, 0x42, + 
0x76, 0x6e, 0xa3, 0xc8, 0x59, 0x82, 0x4e, 0x80, 0xd2, 0xa7, 0xda, 0x93, + 0x69, 0xcd, 0xe4, 0xad, 0xa6, 0x40, 0xdc, 0x69, 0xc7, 0xc9, 0xd0, 0x85, + 0xce, 0x48, 0xc5, 0x3d, 0x9d, 0xbc, 0x5c, 0xb0, 0x9d, 0x4a, 0x4b, 0xaa, + 0x85, 0x55, 0x29, 0x5c, 0x4a, 0x36, 0x67, 0xcd, 0x44, 0x83, 0x43, 0xa9, + 0x5b, 0x31, 0x42, 0xd1, 0x8b, 0xcf, 0xc6, 0xdd, 0x67, 0x83, 0x85, 0x6a, + 0xaf, 0x5c, 0x4f, 0xa1, 0x4a, 0x87, 0x5d, 0x51, 0x4b, 0xd0, 0xa0, 0x6b, + 0x32, 0x4f, 0x49, 0x7b, 0x99, 0x4b, 0x4f, 0x4c, 0x62, 0x4a, 0x6d, 0xc8, + 0x9e, 0x4a, 0xd5, 0x9b, 0x68, 0x98, 0xc5, 0xa9, 0x83, 0x38, 0xca, 0xbd, + 0xb8, 0x28, 0x4c, 0x66, 0x8a, 0x27, 0xc9, 0xcf, 0xcb, 0x99, 0xc4, 0x6e, + 0x8c, 0xab, 0x7b, 0x86, 0x76, 0x92, 0x50, 0x81, 0x41, 0x7d, 0x77, 0x57, + 0x5b, 0xb8, 0x89, 0x58, 0x4e, 0x84, 0x5f, 0x2a, 0xb3, 0xb4, 0x9d, 0x71, + 0xd1, 0x44, 0x64, 0x6a, 0x7f, 0x62, 0x44, 0xa6, 0x9e, 0x80, 0xda, 0xd5, + 0x2d, 0x8e, 0x90, 0x2f, 0x85, 0x33, 0xd2, 0x65, 0x4d, 0x4e, 0x6a, 0x62, + 0xb4, 0x5b, 0x5d, 0x8f, 0x7f, 0x3b, 0x86, 0x74, 0x99, 0x42, 0x28, 0x94, + 0x50, 0x87, 0x9c, 0x83, 0xba, 0x5e, 0xd5, 0xcd, 0x8c, 0x51, 0x88, 0x8b, + 0x97, 0xb7, 0xa1, 0x6f, 0x5b, 0xc3, 0x39, 0x34, 0x48, 0x86, 0x93, 0x51, + 0x35, 0x7d, 0x81, 0x8f, 0x9f, 0x60, 0xa1, 0x77, 0x51, 0x7d, 0xbb, 0xa1, + 0xab, 0xc9, 0x6d, 0xb8, 0x46, 0x3e, 0x6c, 0x35, 0x91, 0x9b, 0xc6, 0x92, + 0x65, 0x56, 0xb9, 0xad, 0x97, 0xc5, 0xab, 0x63, 0x6d, 0xd2, 0x4a, 0x44, + 0x90, 0x67, 0x9b, 0x85, 0x76, 0xbf, 0x9e, 0xb2, 0x4c, 0xa7, 0x69, 0x92, + 0x83, 0xb6, 0x82, 0xd7, 0xb8, 0xd6, 0xd4, 0xe4, 0x87, 0x4c, 0x74, 0xaa, + 0x60, 0x60, 0x4e, 0x8d, 0x42, 0x57, 0x5d, 0x9c, 0xa6, 0xa3, 0x96, 0x7d, + 0x3e, 0x52, 0x68, 0x4e, 0xc5, 0x64, 0x7c, 0xaf, 0x70, 0x8f, 0x52, 0x7e, + 0x35, 0xb2, 0x4a, 0xb2, 0x43, 0x9f, 0xb2, 0x9f, 0xae, 0x96, 0x59, 0x60, + 0x72, 0x65, 0x90, 0x52, 0x7c, 0xd1, 0xec, 0x7f, 0xcc, 0xbb, 0x79, 0xc2, + 0xb9, 0xcb, 0x36, 0x53, 0x9e, 0x58, 0x62, 0xac, 0xb1, 0x3d, 0xdb, 0xc2, + 0x3b, 0x5e, 0xd8, 0xa9, 0x54, 0x9f, 0x9e, 0xbb, 0xe0, 0x8a, 0x90, 0x67, + 0xc2, 0x58, 0x39, 0x2b, 0xd2, 0x74, 0x98, 0xb7, 0x88, 0x9c, 0xc0, 0xaf, + 0x6b, 0x62, 0x4c, 0xb9, 0x6c, 0x86, 0x4b, 0x47, 0xb3, 0x90, 0x8f, 0x8b, + 0xa6, 0xbb, 0xb5, 0x32, 0x7a, 0x3a, 0x6b, 0x7c, 0x36, 0xb8, 0xc3, 0xdc, + 0xa8, 0xc2, 0x8c, 0x41, 0x4c, 0xb0, 0xaf, 0xbd, 0x78, 0xc3, 0x70, 0x93, + 0x9b, 0x4d, 0x7e, 0x5b, 0x4c, 0xca, 0xb0, 0x4b, 0x88, 0xad, 0x8d, 0x3a, + 0x69, 0xc4, 0x82, 0x5f, 0x51, 0x42, 0x39, 0x8e, 0x86, 0x66, 0x60, 0x5a, + 0x89, 0x3f, 0xcb, 0x43, 0x77, 0x60, 0x80, 0x8f, 0xba, 0x6c, 0xd2, 0x82, + 0x3b, 0x63, 0xc3, 0x38, 0x34, 0x66, 0xb3, 0xbd, 0xc7, 0xc6, 0xbf, 0x5e, + 0x94, 0xb4, 0xa7, 0xa8, 0xa8, 0x51, 0x7c, 0x23, 0x94, 0x9d, 0x86, 0xa8, + 0x80, 0x51, 0xc2, 0xc9, 0x3b, 0xb8, 0x69, 0x6a, 0xb2, 0x8a, 0xc3, 0x66, + 0x4a, 0x75, 0xb8, 0x3a, 0xcb, 0x82, 0xaf, 0x9b, 0x97, 0x51, 0x74, 0xc8, + 0x69, 0x56, 0xcb, 0x68, 0x91, 0x78, 0x69, 0xa7, 0x3a, 0x6e, 0x5e, 0x99, + 0x67, 0x8d, 0x62, 0xb6, 0xe0, 0x96, 0x2c, 0x9e, 0x45, 0xbe, 0xa9, 0x46, + 0x91, 0x9a, 0xc2, 0x6e, 0xc2, 0xc2, 0xb8, 0x55, 0xa4, 0x47, 0xa5, 0x1c, + 0xa4, 0xd9, 0xcb, 0xab, 0x69, 0xc5, 0xd1, 0xc9, 0x3b, 0x59, 0x79, 0x66, + 0xbd, 0x56, 0x86, 0xac, 0x40, 0xa8, 0x3e, 0x84, 0x3e, 0xc8, 0xa6, 0x70, + 0xd5, 0x50, 0xd9, 0x93, 0x72, 0xb7, 0x98, 0x42, 0x80, 0x85, 0x63, 0x84, + 0x86, 0xa2, 0x7c, 0x85, 0x63, 0xb8, 0x50, 0x3d, 0xa6, 0xd2, 0xd2, 0x9d, + 0x68, 0xab, 0x4e, 0x51, 0xb5, 0x9b, 0x8a, 0x3a, 0x51, 0x69, 0x19, 0x81, + 0xd8, 0x65, 0x67, 0x73, 0xae, 0x34, 0x65, 0x33, 0x6a, 0x58, 0x87, 0x39, + 0xa0, 0x32, 0x30, 0x5d, 0x38, 0x4d, 0xa6, 0x89, 0x55, 0xa6, 0xcf, 0xa8, + 
0x50, 0xc3, 0x3e, 0xac, 0x38, 0xd6, 0x35, 0x47, 0xb6, 0xdb, 0x6c, 0x58, + 0x7e, 0xa0, 0x8f, 0x3a, 0xa3, 0x9d, 0x57, 0x99, 0x4d, 0x66, 0x72, 0x86, + 0x3a, 0x62, 0x42, 0x79, 0xc7, 0x63, 0x8e, 0xa2, 0x95, 0x6a, 0xb8, 0x3b, + 0x56, 0x4c, 0x8a, 0xc1, 0x4d, 0xbc, 0x41, 0xad, 0x5d, 0x85, 0xb6, 0x9a, + 0xd0, 0x47, 0xb4, 0x4e, 0x4b, 0x86, 0x33, 0x38, 0xb3, 0x33, 0xaf, 0x99, + 0x91, 0x61, 0xdb, 0x73, 0xb3, 0x98, 0xb4, 0x7c, 0x9b, 0x62, 0x76, 0xa1, + 0x6d, 0x48, 0x7b, 0x70, 0x69, 0x6f, 0xab, 0x59, 0xb4, 0x99, 0xca, 0xad, + 0x52, 0xa3, 0x96, 0xc5, 0x4d, 0xa4, 0xc3, 0x93, 0x28, 0xab, 0x82, 0x8c, + 0x8f, 0x4a, 0xb2, 0xaf, 0xb6, 0x6c, 0x3c, 0xaf, 0x33, 0x34, 0x79, 0x3d, + 0xc6, 0x65, 0x63, 0xb1, 0x56, 0x7e, 0x8f, 0x43, 0x73, 0x92, 0xba, 0x3d, + 0xb0, 0x92, 0xc9, 0xc2, 0xe3, 0x87, 0x64, 0x83, 0x4d, 0xcb, 0x46, 0xcd, + 0x58, 0x57, 0xd4, 0x66, 0x7d, 0xc7, 0x55, 0x8c, 0x72, 0xa3, 0xcc, 0x62, + 0x53, 0x8c, 0xb9, 0x9e, 0xc6, 0x76, 0xb0, 0xa1, 0xe3, 0x7e, 0x95, 0x6d, + 0x3d, 0x46, 0x86, 0x7b, 0xbf, 0x99, 0x5f, 0xc6, 0x2d, 0x96, 0xbd, 0x29, + 0x71, 0xa2, 0x7e, 0x8b, 0x4e, 0xa5, 0x6f, 0x87, 0x5e, 0xb9, 0x7b, 0x5a, + 0x6d, 0xbe, 0xb2, 0x84, 0x6d, 0x6f, 0x86, 0x4c, 0xa7, 0xc1, 0x77, 0x26, + 0xb4, 0x88, 0x51, 0x92, 0x58, 0x3e, 0x6f, 0xc6, 0x3a, 0x77, 0x82, 0x32, + 0x7c, 0xba, 0x3e, 0x2e, 0x4a, 0x47, 0x54, 0xa7, 0xc9, 0xb2, 0x57, 0x5e, + 0x99, 0x69, 0x4c, 0x3d, 0xca, 0x73, 0x7b, 0x71, 0x78, 0x77, 0x8b, 0xaa, + 0xc4, 0x44, 0x59, 0x41, 0x3a, 0x90, 0x51, 0xc0, 0xc0, 0x9b, 0x4e, 0xbd, + 0xcd, 0x6e, 0xae, 0x67, 0x86, 0x9c, 0x8f, 0xaf, 0xb3, 0x62, 0x62, 0x89, + 0x89, 0xc9, 0xb0, 0x42, 0x5a, 0x34, 0xc4, 0xc0, 0x6b, 0xb2, 0x59, 0x57, + 0x63, 0xbd, 0x81, 0x99, 0x79, 0x60, 0xa6, 0x5e, 0x9f, 0xd6, 0x8d, 0xbe, + 0xab, 0x72, 0x4e, 0x7b, 0xb7, 0x99, 0x5f, 0xca, 0x8b, 0x8e, 0x8c, 0x7e, + 0x38, 0x4d, 0xbc, 0x6a, 0x79, 0xa6, 0x4b, 0x4d, 0x94, 0x92, 0xcc, 0xa8, + 0x4e, 0xc9, 0x5a, 0xa3, 0x81, 0xb9, 0x74, 0x59, 0xbb, 0x3d, 0x8d, 0x8d, + 0x50, 0xca, 0x85, 0x74, 0xbf, 0xb0, 0x6b, 0x56, 0x34, 0x40, 0x87, 0x9e, + 0xa6, 0xc8, 0x35, 0x6b, 0x4f, 0x87, 0xa2, 0x54, 0xa8, 0x2e, 0x7a, 0x79, + 0x43, 0x77, 0x58, 0x87, 0x3d, 0x5d, 0x9c, 0x5a, 0xbb, 0x6c, 0xad, 0x84, + 0x6a, 0xb0, 0xb9, 0x85, 0x62, 0x5c, 0xb1, 0x6a, 0x61, 0xb5, 0x77, 0xad, + 0xbd, 0xb4, 0x49, 0x4f, 0x62, 0x41, 0x99, 0xb5, 0x7c, 0x8d, 0x88, 0x44, + 0x8b, 0x73, 0x98, 0x82, 0x7f, 0xb1, 0x86, 0x57, 0xa2, 0x41, 0xaa, 0x84, + 0xa8, 0x37, 0x77, 0xb4, 0xd0, 0xc3, 0x87, 0xad, 0x3e, 0x39, 0x95, 0x48, + 0xae, 0x36, 0x4e, 0x7e, 0xbe, 0x7b, 0x57, 0xc4, 0xbd, 0x9f, 0xc9, 0x43, + 0x39, 0xba, 0x3d, 0x68, 0x38, 0xc8, 0x3f, 0xa7, 0x83, 0xc5, 0x58, 0x37, + 0xd0, 0xa5, 0x8c, 0x35, 0x98, 0xa1, 0x73, 0x3b, 0x37, 0x66, 0x94, 0x76, + 0x3a, 0xb4, 0x7d, 0x8e, 0x41, 0x3f, 0xb9, 0x39, 0x3d, 0x80, 0xa7, 0x99, + 0x7e, 0xc7, 0x39, 0xa6, 0x77, 0x42, 0x82, 0x5f, 0xbc, 0x42, 0x7a, 0x49, + 0xb3, 0x45, 0x32, 0x66, 0x71, 0xc0, 0x31, 0xc7, 0x62, 0x8a, 0x4b, 0x51, + 0x3d, 0x8e, 0xbf, 0xa5, 0xd1, 0xad, 0x52, 0xa7, 0x50, 0x81, 0x35, 0xb6, + 0x8d, 0x98, 0x53, 0xcf, 0x83, 0xaa, 0x69, 0x4f, 0x42, 0xba, 0x55, 0xaf, + 0xaa, 0x8f, 0xc4, 0x65, 0xa0, 0xab, 0x9f, 0x33, 0xb8, 0x88, 0x65, 0xac, + 0x4b, 0x6c, 0xbb, 0x97, 0xa3, 0x8f, 0x98, 0x5c, 0x7c, 0x45, 0x3d, 0xba, + 0x76, 0xc0, 0xd5, 0x82, 0x5a, 0xc9, 0xb5, 0x4c, 0x3c, 0xbc, 0x87, 0xae, + 0xab, 0x78, 0x97, 0x3d, 0xd0, 0x8e, 0xac, 0x46, 0x72, 0x47, 0x2a, 0x75, + 0x6c, 0x3a, 0xa9, 0xb4, 0x82, 0x96, 0x84, 0x61, 0x71, 0xa2, 0xa9, 0x3d, + 0x61, 0x67, 0xd2, 0x54, 0x4c, 0x84, 0x67, 0x91, 0x3e, 0x2f, 0xcb, 0x5c, + 0xa8, 0x79, 0x94, 0x3c, 0x60, 0xb3, 0x75, 0x32, 0xb9, 0xc8, 0x3b, 0x7e, + 
0xb1, 0xbe, 0x97, 0x6f, 0x98, 0x4b, 0x91, 0xbe, 0x71, 0x64, 0x60, 0x3d, + 0x42, 0x53, 0x97, 0xa8, 0x3b, 0x99, 0x89, 0x4b, 0x6e, 0x80, 0x43, 0x96, + 0xa1, 0x55, 0x36, 0x7d, 0x82, 0x6c, 0x4f, 0x6d, 0x84, 0x5c, 0xaf, 0x8c, + 0x30, 0xb6, 0x79, 0x51, 0x93, 0xc2, 0x3b, 0xa9, 0xbc, 0xd1, 0xa3, 0x98, + 0x6b, 0xb8, 0x5b, 0x5e, 0x71, 0x50, 0x91, 0x56, 0x4d, 0x94, 0xa0, 0x92, + 0x82, 0xb8, 0x9d, 0x7d, 0x87, 0x67, 0xae, 0x6e, 0xc6, 0x57, 0xba, 0x4e, + 0x51, 0x5b, 0x56, 0x84, 0xc6, 0xcb, 0xc4, 0x50, 0xc5, 0x77, 0xb1, 0x79, + 0x84, 0x60, 0xc3, 0x84, 0x55, 0xa3, 0xbd, 0x72, 0xb0, 0x35, 0x7e, 0x67, + 0x8f, 0xba, 0x51, 0x50, 0xce, 0x4d, 0x81, 0xb8, 0x9e, 0x33, 0x8d, 0x47, + 0x78, 0x9d, 0x60, 0x83, 0x5d, 0x76, 0x4b, 0xc6, 0x33, 0xca, 0x6e, 0x3a, + 0x3e, 0x45, 0x95, 0xa0, 0x8e, 0x43, 0x41, 0xd1, 0x4a, 0x4a, 0xac, 0xaa, + 0x8b, 0x5e, 0x5d, 0x64, 0x54, 0x5b, 0xbc, 0xb9, 0xa8, 0xca, 0x4a, 0x9e, + 0x86, 0x53, 0x35, 0x7d, 0xcb, 0xa6, 0xae, 0xc8, 0x7b, 0x9a, 0x47, 0x76, + 0x3d, 0x71, 0x5e, 0xb8, 0x7b, 0xa3, 0x6c, 0x6e, 0xb7, 0x77, 0x69, 0x42, + 0x4c, 0x98, 0x5d, 0x64, 0x5d, 0x86, 0x40, 0xa3, 0x8b, 0x4f, 0x99, 0xae, + 0xc5, 0x4b, 0x6b, 0x94, 0x78, 0xa6, 0x4a, 0x4f, 0x6a, 0x49, 0x66, 0x77, + 0xb1, 0x4a, 0xb1, 0xbd, 0x3c, 0x30, 0x43, 0x7d, 0xce, 0x96, 0xcd, 0x75, + 0x65, 0xd1, 0xa2, 0x9d, 0x51, 0x95, 0x93, 0xad, 0x6d, 0x9c, 0xc0, 0x89, + 0x7c, 0x96, 0x48, 0x9f, 0x69, 0xb3, 0xa7, 0xb4, 0x85, 0x93, 0x44, 0x51, + 0xcd, 0xc1, 0x5c, 0x9c, 0x91, 0x94, 0x38, 0x76, 0xa5, 0x74, 0x8d, 0xb9, + 0xcb, 0xc5, 0x3a, 0x8d, 0xb4, 0x5f, 0x6b, 0x9c, 0xc7, 0x48, 0xa7, 0xbe, + 0xbd, 0x37, 0x90, 0x77, 0x61, 0x84, 0x66, 0x94, 0x89, 0x95, 0x56, 0x79, + 0x4f, 0x81, 0x52, 0x65, 0x7d, 0xc7, 0x61, 0x30, 0x46, 0xae, 0x2e, 0xa6, + 0x51, 0x60, 0xc5, 0xbf, 0xb9, 0x8f, 0x6a, 0xbe, 0xc0, 0x43, 0x48, 0xba, + 0xcc, 0x5b, 0x4d, 0x53, 0x73, 0x90, 0x33, 0xd0, 0x35, 0x43, 0xb3, 0xa5, + 0xc9, 0xac, 0x36, 0xb1, 0x4b, 0x7c, 0xd5, 0x9e, 0xd1, 0xd3, 0x66, 0xd3, + 0x4e, 0x41, 0x47, 0x77, 0x7f, 0xa5, 0xa2, 0x68, 0xb4, 0xb6, 0xbb, 0x5b, + 0x6c, 0xb6, 0xd6, 0x49, 0x74, 0x78, 0xce, 0x99, 0x58, 0xb1, 0x64, 0x6c, + 0xa6, 0x50, 0x7f, 0x96, 0x75, 0x81, 0xc9, 0x36, 0x87, 0xb4, 0x7b, 0xd1, + 0x91, 0x5d, 0xc1, 0xa8, 0x52, 0x67, 0x85, 0xbf, 0x49, 0x4a, 0xa4, 0x46, + 0x45, 0xc4, 0x85, 0x3a, 0xc1, 0xad, 0xc1, 0x4e, 0x79, 0x76, 0x6f, 0x5f, + 0x96, 0x7e, 0x99, 0xc7, 0xa2, 0xd0, 0x82, 0x3e, 0x89, 0x91, 0x3b, 0xae, + 0xa2, 0x62, 0x72, 0x36, 0x37, 0x3f, 0x4a, 0xba, 0x9a, 0x68, 0xb5, 0x61, + 0xa1, 0xa4, 0xd1, 0x8f, 0xaa, 0x3d, 0xa5, 0xbe, 0x5c, 0x5e, 0x53, 0x69, + 0x72, 0x7d, 0x90, 0xae, 0x99, 0x5a, 0xb9, 0x7e, 0x6b, 0x88, 0x9b, 0x70, + 0xb9, 0x86, 0xb7, 0x9f, 0x2d, 0x83, 0xb3, 0x71, 0x9b, 0x88, 0x75, 0x41, + 0x99, 0x7d, 0xd2, 0x2e, 0xa2, 0x39, 0x6a, 0x97, 0x94, 0x76, 0x98, 0x77, + 0x64, 0xc2, 0xa6, 0x65, 0x78, 0x3c, 0xb7, 0xc4, 0x8b, 0x8a, 0x4d, 0xce, + 0x56, 0x91, 0x3f, 0x8c, 0xc9, 0xb8, 0x89, 0x34, 0x57, 0xb2, 0x5b, 0xc7, + 0x55, 0xc7, 0xde, 0x5c, 0x4b, 0xa8, 0x72, 0x86, 0x8b, 0xb0, 0x59, 0x8d, + 0xd4, 0xbc, 0x22, 0x90, 0xa4, 0x4c, 0xbc, 0x9a, 0x68, 0xc8, 0x5a, 0x84, + 0x60, 0x39, 0xc2, 0x86, 0x80, 0xb6, 0x94, 0xc1, 0x9e, 0x81, 0xaa, 0x7b, + 0x59, 0xa3, 0xcc, 0x89, 0x7e, 0xad, 0x6d, 0x54, 0x6f, 0xc8, 0x6f, 0x92, + 0x90, 0x78, 0x79, 0x7c, 0xb3, 0xc5, 0xcf, 0x95, 0x4f, 0x79, 0x8e, 0xca, + 0xa4, 0xa9, 0x45, 0xbc, 0x6e, 0x9f, 0x51, 0x53, 0xd2, 0x75, 0x57, 0xa1, + 0x93, 0x2b, 0x75, 0x9b, 0x7e, 0xbf, 0x94, 0xe4, 0x7f, 0xb1, 0xb1, 0x85, + 0xcf, 0x9b, 0xa5, 0xcb, 0x3f, 0xb4, 0x7f, 0x5c, 0x42, 0x3f, 0x34, 0x42, + 0xb0, 0x9d, 0xc6, 0x47, 0x3c, 0x7a, 0x83, 0x9e, 0x32, 0x33, 0xc6, 0x6f, + 
0xbd, 0x68, 0x33, 0x7a, 0x81, 0x9b, 0x52, 0xa7, 0xc8, 0x69, 0x33, 0xbb, + 0xb1, 0x96, 0x88, 0x81, 0xc3, 0x1d, 0x78, 0x83, 0xb4, 0x5c, 0x99, 0x6c, + 0x33, 0x4e, 0xc9, 0xb0, 0x72, 0xd1, 0xbf, 0x4c, 0x65, 0xb3, 0x55, 0x81, + 0xa4, 0xc4, 0xa0, 0xaf, 0xd2, 0x74, 0x24, 0x69, 0xdb, 0x5c, 0x7f, 0xab, + 0x32, 0x63, 0xb0, 0xbc, 0xc6, 0xb4, 0x66, 0xac, 0x55, 0x8a, 0x73, 0x4c, + 0xa2, 0x9b, 0xc4, 0x86, 0xcf, 0xc5, 0xda, 0xc3, 0x45, 0xb2, 0x7d, 0xc0, + 0x9b, 0x6f, 0x6c, 0x38, 0xd5, 0x87, 0xa4, 0x4d, 0x47, 0xcb, 0x3f, 0x74, + 0xc9, 0x4d, 0x44, 0x58, 0x43, 0xbb, 0xd6, 0x7e, 0x8a, 0x58, 0x7b, 0xa6, + 0xa8, 0xa8, 0x9d, 0x5a, 0x8b, 0xdc, 0x7a, 0x39, 0x99, 0x9e, 0xbe, 0x4e, + 0xcf, 0x3d, 0x8c, 0x7c, 0x6a, 0x5f, 0xc0, 0xad, 0xa2, 0xd9, 0x98, 0x88, + 0xd1, 0xd1, 0x5e, 0x2d, 0xc0, 0x8b, 0xa2, 0x8c, 0x31, 0x89, 0x6f, 0x7a, + 0x6f, 0x4d, 0x9b, 0x66, 0x45, 0x60, 0x6f, 0x3f, 0x81, 0xa4, 0xa4, 0xc7, + 0xae, 0x3e, 0xc0, 0x3c, 0x8b, 0x4f, 0x88, 0x6b, 0x87, 0x4b, 0x8f, 0x96, + 0x9c, 0x37, 0x54, 0xad, 0xb0, 0x80, 0xcc, 0x29, 0x90, 0xc5, 0x9e, 0x94, + 0x6c, 0x91, 0x3d, 0x46, 0x78, 0xc9, 0x39, 0xc5, 0x84, 0xa1, 0x60, 0x3d, + 0xbe, 0x62, 0xa3, 0xc8, 0xce, 0xc3, 0xc1, 0x72, 0x41, 0x2c, 0x6e, 0x3c, + 0x66, 0xa3, 0x80, 0xb1, 0x9d, 0x9f, 0x67, 0x50, 0xab, 0xbd, 0x3d, 0x73, + 0xa5, 0x6a, 0x45, 0x7a, 0x5b, 0x60, 0x96, 0x70, 0x97, 0x8e, 0xc6, 0x62, + 0x42, 0x9f, 0x9e, 0x70, 0x2b, 0x7d, 0x7a, 0x69, 0xb9, 0x9c, 0x89, 0x3e, + 0x44, 0xb5, 0x69, 0x53, 0x7b, 0xc5, 0x71, 0xae, 0x36, 0x82, 0xaa, 0x7f, + 0x9e, 0x57, 0x3e, 0x60, 0x55, 0xb3, 0xa3, 0xaf, 0xd0, 0x60, 0xa2, 0x98, + 0x6c, 0xd9, 0x79, 0xaf, 0x89, 0xb4, 0xc5, 0xb6, 0x56, 0x6e, 0x77, 0xc0, + 0xa8, 0x4a, 0x87, 0xd4, 0x4d, 0x3f, 0x8c, 0xb1, 0x7b, 0xb5, 0x98, 0x7d, + 0x9b, 0x98, 0x72, 0x82, 0x41, 0x88, 0x7f, 0x7c, 0xb8, 0xad, 0x41, 0x82, + 0xab, 0xaa, 0x40, 0x89, 0x9a, 0x31, 0x46, 0x79, 0x63, 0x50, 0x86, 0xa3, + 0x4c, 0xc0, 0x46, 0x9d, 0x31, 0x8a, 0x9e, 0xc9, 0xa8, 0x92, 0x31, 0x64, + 0x70, 0x9d, 0x5b, 0xb1, 0xc1, 0x5d, 0x95, 0x3c, 0x55, 0x9b, 0x4e, 0x87, + 0x82, 0x56, 0x64, 0x64, 0x6e, 0xbb, 0x6a, 0xd5, 0x8c, 0xbd, 0x85, 0x67, + 0x47, 0xc6, 0x70, 0x95, 0x6a, 0x63, 0xa2, 0x6e, 0x6a, 0x84, 0x7b, 0x46, + 0x9a, 0xd9, 0x5c, 0x68, 0x9d, 0xcb, 0x94, 0xc4, 0x3a, 0xc7, 0xad, 0x64, + 0x7f, 0xb0, 0xd2, 0xa7, 0x46, 0xcd, 0x6a, 0x48, 0xc2, 0xb4, 0xb3, 0xcb, + 0x9f, 0x1c, 0x82, 0x5b, 0xc1, 0xc6, 0x9c, 0x6e, 0x58, 0xb9, 0x7e, 0x7d, + 0x2f, 0xd3, 0xb2, 0x92, 0x96, 0x85, 0xd1, 0xa4, 0x70, 0x59, 0x91, 0x5c, + 0x59, 0xac, 0xa1, 0xb2, 0xb0, 0xbd, 0xcf, 0xb7, 0x8d, 0x77, 0xb9, 0x2b, + 0xbb, 0x9b, 0xe0, 0x94, 0x47, 0xbe, 0xbe, 0x83, 0x5a, 0x82, 0x4a, 0x3e, + 0xb3, 0x29, 0x57, 0x69, 0x4e, 0x24, 0xaf, 0xb6, 0xa0, 0xcf, 0x71, 0x78, + 0xb9, 0x6f, 0x43, 0x86, 0xac, 0xac, 0x8f, 0x37, 0x93, 0x79, 0x53, 0x39, + 0x8f, 0x40, 0x3f, 0xa8, 0xc3, 0x80, 0x34, 0xaa, 0xcf, 0xb1, 0x40, 0x95, + 0xb7, 0xc7, 0xa3, 0x7d, 0xdd, 0x6c, 0x66, 0xcb, 0x4e, 0x85, 0x92, 0xaf, + 0x4d, 0x3e, 0x52, 0x54, 0x6a, 0xad, 0x49, 0xad, 0x36, 0x68, 0x9e, 0x8e, + 0x6e, 0x88, 0x3f, 0x69, 0x7a, 0xb5, 0x7f, 0x54, 0x8b, 0x5b, 0x3e, 0xae, + 0x85, 0xa1, 0xd3, 0x34, 0x38, 0x48, 0x29, 0x84, 0x71, 0x67, 0x52, 0x65, + 0x7b, 0x9c, 0x6c, 0x4a, 0x69, 0x9a, 0xb6, 0x93, 0x90, 0x92, 0xb0, 0xb5, + 0x56, 0x94, 0xa1, 0xae, 0xb5, 0xd0, 0x46, 0xb2, 0xbb, 0x52, 0x37, 0x7e, + 0x4f, 0x44, 0x8a, 0x4e, 0xa7, 0x90, 0xd8, 0x5f, 0x3b, 0x4a, 0x6a, 0xbf, + 0x55, 0x8a, 0x8f, 0x70, 0xbb, 0x83, 0x3e, 0xb2, 0x33, 0xad, 0x54, 0x4d, + 0xba, 0x35, 0xa7, 0xd9, 0xb1, 0x75, 0x7e, 0x98, 0x40, 0x8a, 0xc0, 0xaf, + 0x33, 0x72, 0x7f, 0x60, 0x2d, 0x34, 0xb3, 0x42, 0xad, 0xba, 0x8c, 0xbd, + 
0x83, 0x30, 0x5d, 0x34, 0xbb, 0xca, 0x7b, 0xa6, 0x7a, 0x34, 0x90, 0x72, + 0x71, 0x90, 0xc8, 0x55, 0x9d, 0xc0, 0xba, 0x96, 0x61, 0x1e, 0x81, 0x41, + 0x5d, 0x47, 0x30, 0x9c, 0x70, 0x45, 0x3c, 0xa2, 0x8f, 0x8d, 0x42, 0x49, + 0xbc, 0x5b, 0xc7, 0x67, 0x52, 0x8f, 0x53, 0x4e, 0x62, 0x3b, 0x64, 0x8f, + 0xb5, 0x62, 0x65, 0x7b, 0x98, 0x81, 0x66, 0x4e, 0x52, 0x6f, 0xbb, 0xa1, + 0xc1, 0x89, 0x48, 0xaa, 0xb6, 0x80, 0x42, 0x44, 0x93, 0x47, 0xaf, 0x55, + 0xbd, 0x27, 0x65, 0x84, 0x55, 0x7c, 0xb5, 0xd3, 0x74, 0xbe, 0x6d, 0x96, + 0xb6, 0x4d, 0x62, 0x71, 0xc8, 0x4a, 0x8b, 0x8f, 0x70, 0x83, 0x93, 0x51, + 0xc1, 0xc3, 0x44, 0x4b, 0xad, 0xbd, 0xad, 0x3a, 0xbc, 0xc6, 0x4b, 0x64, + 0x8f, 0x66, 0xbd, 0x38, 0x47, 0xb0, 0x4a, 0xaf, 0xc7, 0x54, 0xcc, 0xba, + 0xa8, 0x78, 0x70, 0x4f, 0x82, 0x3e, 0x9d, 0x4b, 0x80, 0x6f, 0x9e, 0x5d, + 0x8f, 0xc7, 0x48, 0x5b, 0x4b, 0x65, 0x30, 0x83, 0xd3, 0xd4, 0xbf, 0x8e, + 0x9e, 0x96, 0x31, 0xcb, 0x5e, 0x3e, 0x66, 0xc9, 0xbf, 0x88, 0x32, 0xd3, + 0xc2, 0xb4, 0x67, 0x8a, 0xcd, 0xd3, 0x4d, 0x83, 0x94, 0xb8, 0x9b, 0x41, + 0x43, 0xac, 0x7c, 0xc6, 0xad, 0x9c, 0xcd, 0x82, 0x51, 0xc4, 0x3a, 0x48, + 0xbf, 0x7f, 0x31, 0xa4, 0x49, 0xc0, 0x39, 0x3b, 0x4c, 0xd5, 0x51, 0x9b, + 0xa3, 0x4c, 0x46, 0x9b, 0x73, 0x57, 0xcd, 0xad, 0xa7, 0x8d, 0x64, 0x34, + 0x97, 0x92, 0x49, 0xab, 0x71, 0x4c, 0x3c, 0x7e, 0xbe, 0x6e, 0xa5, 0x61, + 0x5f, 0xa7, 0x6d, 0x9f, 0x54, 0xb0, 0xbc, 0x9a, 0xc3, 0x74, 0xa1, 0x5b, + 0x82, 0x54, 0x50, 0x63, 0x61, 0x6c, 0x31, 0x9e, 0x6b, 0x62, 0x9b, 0xab, + 0x9c, 0x99, 0x6f, 0x66, 0x7c, 0x46, 0x79, 0x42, 0xd4, 0xa8, 0xb0, 0x6a, + 0xbe, 0xa2, 0xbe, 0x73, 0xbb, 0xb3, 0xc0, 0xba, 0x3f, 0x7a, 0x4a, 0x53, + 0x39, 0x79, 0x50, 0x78, 0x6a, 0x2f, 0x4b, 0x81, 0xa9, 0x86, 0xb4, 0x4a, + 0x73, 0x8c, 0xac, 0x7b, 0xb3, 0x3d, 0x5d, 0x7c, 0xda, 0x98, 0x3e, 0x56, + 0x5a, 0x62, 0xb5, 0x4e, 0x35, 0x3f, 0xc6, 0x44, 0x99, 0x64, 0x66, 0x4b, + 0x76, 0x9b, 0x52, 0xb3, 0x7f, 0xaa, 0xac, 0xb1, 0xd3, 0x63, 0x3d, 0x55, + 0x43, 0x81, 0x60, 0xb6, 0x35, 0x97, 0xb7, 0x7f, 0x3d, 0xbc, 0x99, 0x7c, + 0x3f, 0x4e, 0x4d, 0x4d, 0x4d, 0x56, 0x67, 0xad, 0x93, 0x80, 0xa0, 0x3a, + 0x54, 0x95, 0x93, 0x59, 0x87, 0x9a, 0x54, 0x7c, 0xd7, 0x98, 0x5a, 0x80, + 0xa8, 0x67, 0xaa, 0xa7, 0x7a, 0x79, 0x76, 0x9a, 0xb6, 0x95, 0xba, 0x45, + 0xd0, 0x98, 0x6c, 0x3a, 0x39, 0x59, 0x7a, 0x99, 0x5b, 0x4a, 0x8e, 0x74, + 0xb7, 0x77, 0x52, 0x78, 0xc0, 0x4a, 0x3d, 0x4c, 0x96, 0x54, 0x67, 0x42, + 0xac, 0x9c, 0x36, 0xbe, 0xd0, 0x5b, 0xba, 0xa9, 0xa9, 0x58, 0xb4, 0xd6, + 0xa8, 0x88, 0x63, 0x72, 0xac, 0x9e, 0xd1, 0x7c, 0x53, 0x7b, 0x77, 0xab, + 0x8c, 0xaf, 0x75, 0x9b, 0x5b, 0x75, 0x5e, 0x6e, 0x58, 0xc3, 0x6e, 0x96, + 0x38, 0x39, 0xbe, 0x8a, 0x3c, 0x87, 0x52, 0xc0, 0x82, 0x81, 0xa3, 0x73, + 0x70, 0x86, 0x89, 0x34, 0x94, 0x97, 0xab, 0x36, 0x48, 0xa4, 0xae, 0x8b, + 0x52, 0x5b, 0x72, 0x43, 0x50, 0x7e, 0x67, 0x3b, 0x49, 0xaf, 0x41, 0x5f, + 0x5a, 0x9e, 0xbd, 0x9f, 0x69, 0x4d, 0x47, 0x35, 0x87, 0x81, 0xcc, 0x7e, + 0xb8, 0x3b, 0xd2, 0x6a, 0x3f, 0xd4, 0xc2, 0x7b, 0x75, 0x72, 0xc4, 0x75, + 0xb8, 0x85, 0x79, 0x98, 0x51, 0x90, 0x56, 0x72, 0x51, 0x4f, 0xc8, 0x53, + 0xb7, 0x70, 0x44, 0xc5, 0x64, 0xb2, 0x7a, 0x73, 0x34, 0xb9, 0x60, 0x90, + 0xcd, 0x47, 0x9e, 0x58, 0x62, 0x4a, 0x46, 0xb4, 0xca, 0x4e, 0xbe, 0x80, + 0x64, 0x6d, 0x9c, 0x99, 0xb1, 0x8a, 0x34, 0x39, 0xc8, 0x79, 0x83, 0x4d, + 0xae, 0x4b, 0x32, 0x2f, 0xca, 0xcf, 0xac, 0xc7, 0xd5, 0x6b, 0x45, 0x93, + 0xbe, 0xa1, 0x93, 0x4a, 0x43, 0xb7, 0x7f, 0xb8, 0x35, 0x7f, 0xad, 0xa4, + 0x50, 0x4e, 0x6b, 0x7f, 0xb7, 0xbc, 0xa7, 0x5a, 0x4e, 0xa2, 0x73, 0xa9, + 0x9f, 0x56, 0x4e, 0xce, 0x9e, 0x8b, 0x75, 0xd7, 0x40, 0x89, 0x74, 0x34, + 
0x6a, 0x38, 0x53, 0x33, 0xbb, 0xc5, 0xb6, 0x7b, 0x53, 0xa4, 0xc5, 0x83, + 0x5e, 0xb9, 0xc8, 0x36, 0x69, 0x93, 0x8a, 0x8d, 0xcb, 0xc0, 0x7b, 0xaf, + 0x3d, 0x5f, 0x65, 0xb1, 0x4a, 0x59, 0xca, 0x4e, 0xd2, 0x67, 0x9a, 0xb6, + 0x41, 0x6d, 0x85, 0xd9, 0x8f, 0x43, 0xa9, 0x9e, 0x3c, 0x3e, 0x8f, 0xa2, + 0x72, 0xd2, 0xd5, 0x97, 0x7f, 0x6d, 0x6f, 0x7c, 0xb7, 0xae, 0x43, 0x5f, + 0xa0, 0x57, 0x41, 0x7d, 0x9e, 0x3e, 0x6a, 0xaf, 0x4d, 0x70, 0x9a, 0x3a, + 0xb3, 0xd0, 0x4a, 0x7f, 0xc0, 0x7d, 0x51, 0x87, 0x5e, 0x5c, 0x7b, 0xc8, + 0xbe, 0x52, 0x37, 0xb5, 0x6a, 0x73, 0x53, 0x8f, 0x79, 0x86, 0x39, 0x58, + 0xa9, 0x88, 0x8d, 0x75, 0xd2, 0x8c, 0x5c, 0x60, 0x52, 0xbe, 0xcb, 0xcd, + 0xbd, 0xc3, 0x5b, 0x5c, 0xcb, 0x3c, 0x76, 0x73, 0x85, 0xb7, 0x58, 0xa5, + 0x3d, 0xa5, 0x42, 0x67, 0xc0, 0x3b, 0xd0, 0x9f, 0xaa, 0x39, 0x8b, 0x97, + 0x69, 0xa2, 0xa5, 0xb9, 0x33, 0x45, 0xd4, 0xc6, 0x5d, 0xcd, 0x75, 0x54, + 0x8f, 0x85, 0x5f, 0x9f, 0xc2, 0xce, 0xce, 0x74, 0x5e, 0xa7, 0xaa, 0x43, + 0x8a, 0xae, 0x6b, 0x6c, 0x8b, 0x6c, 0xbd, 0x5e, 0x8c, 0x7f, 0xa9, 0xc3, + 0xca, 0xc9, 0x7f, 0x5a, 0x4b, 0x3e, 0x5c, 0xa9, 0xc0, 0x32, 0x36, 0x5e, + 0x32, 0xc8, 0x6b, 0x4c, 0x5b, 0x56, 0x38, 0xb6, 0xa2, 0x51, 0x48, 0xaf, + 0x46, 0x67, 0x68, 0x5b, 0xa2, 0xc0, 0xd9, 0x93, 0xb1, 0xaa, 0x45, 0xb7, + 0x70, 0x36, 0x77, 0x52, 0xb4, 0x37, 0xc2, 0x92, 0xa7, 0x76, 0x35, 0x44, + 0x61, 0x7f, 0x74, 0xb6, 0x7b, 0x69, 0x4f, 0x79, 0x7d, 0x6e, 0xd3, 0x54, + 0xc2, 0x98, 0x4e, 0x4b, 0x3c, 0xbc, 0xc1, 0xa2, 0x58, 0x7d, 0x9c, 0xa1, + 0xce, 0x9b, 0x45, 0x37, 0xb9, 0xb7, 0x74, 0x3c, 0x80, 0x45, 0x4f, 0x8b, + 0x85, 0x32, 0x4e, 0x44, 0x81, 0xac, 0x4b, 0xc0, 0xb0, 0x5a, 0x4d, 0xb0, + 0x9b, 0xd8, 0x93, 0x80, 0xbb, 0x92, 0x5a, 0x83, 0x9e, 0xaf, 0x5d, 0x52, + 0xa8, 0x82, 0x5d, 0xbe, 0xac, 0x69, 0x95, 0x51, 0x73, 0x5e, 0xc7, 0x59, + 0x60, 0xbb, 0x7b, 0xb6, 0x3d, 0x6e, 0xa1, 0xc6, 0x9f, 0x8f, 0xcb, 0xc2, + 0x81, 0x82, 0xb0, 0x5f, 0xc6, 0x7c, 0x34, 0x8f, 0xa9, 0xb9, 0x81, 0x67, + 0x75, 0xc6, 0x9e, 0x6d, 0x58, 0x52, 0xd0, 0xa5, 0xbc, 0x3e, 0xb8, 0x66, + 0x8b, 0x42, 0x6b, 0x94, 0xc1, 0x66, 0x49, 0xd6, 0xc2, 0x61, 0xa2, 0x63, + 0x5f, 0xaf, 0x89, 0x83, 0x78, 0x8b, 0x70, 0xad, 0x65, 0x6a, 0x3c, 0x40, + 0xca, 0x93, 0x9d, 0xb7, 0x9f, 0x44, 0x49, 0x40, 0x61, 0xda, 0xc4, 0x7d, + 0x76, 0x99, 0xb4, 0x70, 0x73, 0xc9, 0x7a, 0x78, 0x38, 0x7f, 0xac, 0x5a, + 0x60, 0x41, 0x5f, 0xb3, 0xbb, 0xc6, 0x8a, 0x6b, 0xc6, 0xb0, 0xda, 0x60, + 0xd9, 0xa0, 0x6e, 0xb7, 0xd3, 0xbd, 0x60, 0xab, 0x4b, 0x57, 0x4e, 0xd1, + 0x39, 0xa7, 0x5c, 0x6c, 0x8b, 0xb0, 0xb0, 0x77, 0x46, 0xa8, 0xd4, 0x8a, + 0x9c, 0x66, 0xd5, 0xba, 0x61, 0x5a, 0x89, 0x4e, 0xd0, 0xae, 0x2f, 0x60, + 0x7b, 0xc1, 0x35, 0xb1, 0x9d, 0xc2, 0x78, 0x33, 0x4e, 0x80, 0x45, 0x81, + 0x68, 0x3c, 0xaf, 0x69, 0x86, 0x66, 0x97, 0x6e, 0x88, 0xcc, 0xcb, 0x4c, + 0x78, 0x90, 0xb0, 0xa5, 0xd4, 0x56, 0xd5, 0xac, 0x95, 0xae, 0xbe, 0x4c, + 0xca, 0x89, 0xad, 0x88, 0xb2, 0xa0, 0xab, 0x58, 0x2a, 0xa0, 0xb4, 0x5a, + 0x98, 0xa5, 0xa2, 0x95, 0x81, 0xb8, 0x73, 0x78, 0xce, 0xa4, 0xad, 0xa2, + 0xd2, 0x8a, 0x54, 0x8a, 0x3c, 0x97, 0x5e, 0xcb, 0xcd, 0x62, 0x4b, 0xae, + 0x5e, 0x56, 0xd2, 0x50, 0xb5, 0x43, 0xb3, 0x8e, 0x6f, 0xd5, 0xac, 0x4e, + 0x80, 0xcb, 0x9d, 0x6c, 0x67, 0x78, 0xbc, 0xa5, 0xb0, 0x7a, 0x37, 0x50, + 0xb0, 0x4b, 0xc5, 0x7f, 0x60, 0x72, 0x45, 0x63, 0xb9, 0xa6, 0x6f, 0x9a, + 0x82, 0xb7, 0x36, 0x45, 0x80, 0x3e, 0x3a, 0x50, 0x94, 0x94, 0x51, 0x68, + 0x95, 0x4a, 0x58, 0x3e, 0x67, 0x7e, 0x69, 0xaf, 0x58, 0xbc, 0x2e, 0x75, + 0x51, 0x84, 0x4f, 0xc0, 0x9a, 0x76, 0x62, 0x7f, 0x8c, 0xcf, 0xd8, 0x6a, + 0x6f, 0x74, 0x8e, 0x4e, 0x35, 0xad, 0xc3, 0x8e, 0x77, 0x5d, 0x38, 0x4f, + 
0x53, 0x63, 0x93, 0x29, 0x6c, 0x4e, 0xaa, 0x64, 0x92, 0xd3, 0x41, 0xa7, + 0x97, 0x69, 0x68, 0xb3, 0x6b, 0x71, 0xc7, 0xcf, 0xc6, 0x89, 0x95, 0xa7, + 0x5d, 0x5a, 0x4b, 0xa7, 0xcb, 0x4e, 0xa7, 0x71, 0x42, 0xb2, 0xa9, 0x8e, + 0x5b, 0x60, 0xcc, 0x6a, 0xbc, 0x32, 0x92, 0x68, 0x9e, 0xa9, 0x3c, 0x53, + 0x8a, 0xd3, 0xa1, 0x7f, 0x31, 0x66, 0xbc, 0x72, 0xa2, 0xaa, 0xb0, 0xc3, + 0x55, 0xcf, 0xa7, 0x8c, 0x47, 0xbd, 0xd7, 0x58, 0xd2, 0xc7, 0x92, 0xbd, + 0xac, 0x83, 0x8e, 0x7a, 0xb5, 0x4b, 0xca, 0xd9, 0x2f, 0x46, 0x9c, 0xb6, + 0xd4, 0x52, 0xb3, 0xc9, 0xc1, 0x59, 0x3c, 0x56, 0x92, 0x88, 0x66, 0xa8, + 0x68, 0x62, 0x4e, 0x8c, 0x3c, 0x64, 0xbc, 0xd1, 0xbe, 0x76, 0xcb, 0x86, + 0xae, 0x3a, 0xc2, 0x35, 0x51, 0x9f, 0xc4, 0x8f, 0x51, 0xa5, 0xb8, 0x49, + 0x9e, 0x55, 0x36, 0x99, 0x44, 0x71, 0xc5, 0x9c, 0x71, 0x93, 0x9a, 0x86, + 0xa1, 0xd1, 0x50, 0xc4, 0x98, 0xad, 0x94, 0x61, 0x89, 0xc1, 0x5d, 0x3b, + 0x54, 0x81, 0x70, 0xcf, 0x84, 0x96, 0x7c, 0x9e, 0xb1, 0x84, 0x7f, 0x91, + 0x73, 0x82, 0x64, 0x4c, 0x97, 0x6e, 0x77, 0x93, 0xab, 0x87, 0xa2, 0x60, + 0x45, 0x64, 0x76, 0xb7, 0x98, 0xa3, 0x80, 0x46, 0x8a, 0x50, 0x71, 0x4e, + 0xcd, 0x9f, 0xcd, 0xd1, 0x2b, 0x6e, 0x48, 0x92, 0xa1, 0x8e, 0x5d, 0x4b, + 0xbe, 0x32, 0xc8, 0x5d, 0xce, 0x59, 0xc1, 0xb9, 0x9d, 0xb2, 0xc8, 0x3f, + 0x82, 0x50, 0xb8, 0xc8, 0x92, 0x7e, 0x69, 0x67, 0x4d, 0x82, 0xbb, 0xcf, + 0x8a, 0xb3, 0x9f, 0xca, 0x53, 0x64, 0xb2, 0xb4, 0x56, 0x77, 0x2f, 0x66, + 0xad, 0x67, 0x4a, 0xa7, 0xb6, 0x77, 0x6e, 0x30, 0x78, 0x38, 0x6e, 0xd2, + 0x9a, 0xcc, 0x44, 0xb1, 0x8e, 0x72, 0x9d, 0xbe, 0xbf, 0xaa, 0xc1, 0x3f, + 0x75, 0x9d, 0x32, 0x52, 0x57, 0x47, 0x6a, 0x90, 0x32, 0x56, 0x82, 0xce, + 0x32, 0x3a, 0xbd, 0x47, 0x2f, 0x40, 0x68, 0x9c, 0x60, 0x7d, 0x95, 0x74, + 0x61, 0xa1, 0x94, 0x9f, 0xcb, 0x78, 0xc8, 0xa6, 0x8b, 0x81, 0x8e, 0xc2, + 0x43, 0x9e, 0x64, 0x4b, 0xd6, 0xc8, 0x96, 0x58, 0x44, 0xe5, 0xc1, 0xb2, + 0x68, 0xa9, 0x4a, 0x6a, 0x9e, 0x73, 0x93, 0x5d, 0xa6, 0x7d, 0x68, 0x65, + 0x9e, 0xd2, 0x68, 0x4b, 0xb8, 0x80, 0x5c, 0x86, 0xc5, 0xb1, 0xab, 0x8c, + 0x8c, 0x9d, 0x5b, 0x94, 0x68, 0xc9, 0xa7, 0x56, 0x5f, 0xa0, 0xba, 0x8f, + 0x54, 0xa0, 0x9b, 0x97, 0xb4, 0x9e, 0x74, 0xbd, 0xac, 0x91, 0x4a, 0xbb, + 0x59, 0xad, 0x93, 0xdc, 0x99, 0xc8, 0x81, 0x98, 0xd0, 0x5f, 0xa9, 0xca, + 0xc8, 0xaf, 0xc1, 0x60, 0xc5, 0x72, 0x51, 0xd5, 0x81, 0x78, 0x64, 0x9e, + 0xc3, 0xcc, 0xa8, 0xc0, 0x35, 0x8d, 0x7e, 0x36, 0x57, 0x84, 0x84, 0x72, + 0x24, 0x4c, 0x34, 0xb5, 0x78, 0xc5, 0xa2, 0x3d, 0xbc, 0x8d, 0xaf, 0xc8, + 0x6b, 0xc2, 0xa7, 0x67, 0x51, 0x91, 0x4f, 0xcb, 0x4d, 0x39, 0x99, 0x89, + 0x5b, 0xca, 0x61, 0xbf, 0xa0, 0x9e, 0xbe, 0x7c, 0x6b, 0x89, 0xa7, 0xac, + 0xb9, 0x9f, 0xd6, 0x65, 0x8c, 0xda, 0x9b, 0x9b, 0xb9, 0x99, 0xc2, 0x86, + 0x38, 0x6c, 0x8f, 0x67, 0x4b, 0xc7, 0x5d, 0x90, 0x41, 0xde, 0xdb, 0xb7, + 0x62, 0x32, 0xb6, 0xbd, 0x7b, 0x4e, 0xd7, 0x44, 0x33, 0x98, 0x94, 0x83, + 0x90, 0x34, 0xb5, 0x5f, 0xa3, 0xbd, 0x8a, 0xb6, 0xc2, 0x47, 0x4a, 0x38, + 0x9b, 0x69, 0x70, 0x9b, 0x55, 0x57, 0xb2, 0x7f, 0xc3, 0x87, 0x97, 0x73, + 0xc5, 0x6b, 0x63, 0x98, 0x56, 0xa5, 0xcb, 0x54, 0xc1, 0x66, 0x74, 0x51, + 0x9f, 0xc4, 0xc5, 0x76, 0x46, 0x59, 0x33, 0x9e, 0xc5, 0x67, 0x4b, 0x49, + 0xd8, 0xdb, 0x92, 0x5d, 0x8b, 0xdb, 0x8c, 0x76, 0x9e, 0x61, 0x40, 0xa9, + 0x31, 0x74, 0xcb, 0xc1, 0x40, 0x46, 0x89, 0xb1, 0x41, 0x52, 0x63, 0x6c, + 0x90, 0xe2, 0x92, 0xa8, 0xd2, 0x76, 0xa8, 0xab, 0x5d, 0xce, 0x59, 0x7c, + 0x94, 0xa0, 0x66, 0x84, 0xd4, 0x77, 0xb5, 0x8e, 0x6b, 0x2b, 0x84, 0xa4, + 0x79, 0x6b, 0x66, 0x3d, 0x70, 0xba, 0xc2, 0x42, 0x7a, 0x7a, 0xdb, 0x88, + 0xc8, 0x71, 0xe5, 0x38, 0x70, 0xba, 0x72, 0x73, 0xdc, 0x81, 0x90, 0x99, + 
0x6d, 0x88, 0x4a, 0x4d, 0x81, 0x56, 0x5d, 0x9e, 0x65, 0xad, 0xa1, 0x57, + 0xce, 0xc6, 0xa7, 0x4d, 0x70, 0x4e, 0xc4, 0x77, 0xab, 0x43, 0x90, 0xd4, + 0x98, 0xd5, 0x8f, 0x3c, 0xc1, 0x9a, 0x8d, 0x2e, 0x79, 0x2e, 0x3b, 0xa9, + 0xbc, 0x40, 0x62, 0x31, 0x62, 0xc5, 0x39, 0x7e, 0xcd, 0xa7, 0x4d, 0xd8, + 0xa8, 0xbf, 0xa5, 0x6b, 0xca, 0xa8, 0x3f, 0xa8, 0xca, 0xa9, 0xc8, 0xa4, + 0x78, 0x91, 0xa9, 0x7b, 0xbb, 0x84, 0x5f, 0xcc, 0x4a, 0x94, 0x87, 0xc5, + 0xaf, 0x46, 0xf2, 0x79, 0xab, 0xa6, 0xd6, 0x63, 0x81, 0x92, 0x3f, 0xae, + 0x38, 0xd3, 0xb3, 0xa5, 0x6d, 0x73, 0xe0, 0x72, 0x31, 0x70, 0x4f, 0x90, + 0xb3, 0xa6, 0x89, 0x67, 0xae, 0x39, 0x8a, 0xa1, 0x36, 0xb9, 0x98, 0x7f, + 0xcd, 0xcb, 0x54, 0x62, 0x52, 0x79, 0xbd, 0x7a, 0xa5, 0x93, 0x36, 0xbb, + 0x75, 0xb8, 0x97, 0xcc, 0x46, 0x90, 0x68, 0xb0, 0xc0, 0x40, 0x3a, 0xdc, + 0x8b, 0x73, 0x93, 0x68, 0x6a, 0x6b, 0x7c, 0x2f, 0x5d, 0x67, 0xc7, 0xbe, + 0x8a, 0xc9, 0x94, 0x54, 0x58, 0x96, 0xc3, 0x8f, 0x6f, 0x99, 0x7c, 0x7c, + 0x6b, 0x5d, 0x47, 0x56, 0xaf, 0x87, 0xcb, 0x82, 0xc1, 0x61, 0x8b, 0x63, + 0xc3, 0xaf, 0xc1, 0x7f, 0x7e, 0x83, 0x97, 0xc8, 0x92, 0xb4, 0x65, 0x73, + 0x6a, 0x46, 0x9c, 0x99, 0x38, 0x44, 0xa5, 0x3f, 0x37, 0x59, 0x68, 0x37, + 0x78, 0x8c, 0x41, 0xc6, 0x9b, 0xb7, 0x62, 0xa0, 0x99, 0x9a, 0x71, 0x99, + 0x76, 0x42, 0x54, 0x86, 0x65, 0x45, 0x5d, 0x67, 0xaf, 0xe0, 0x94, 0xd4, + 0xa2, 0xba, 0x4e, 0x6c, 0x4f, 0x4d, 0xb7, 0x5d, 0xaa, 0x4f, 0xbf, 0x53, + 0x2c, 0xa6, 0xcd, 0x88, 0xc4, 0xb3, 0x54, 0xd5, 0x3a, 0x80, 0x8b, 0x42, + 0x4c, 0x80, 0x6e, 0xa8, 0x32, 0x40, 0x5a, 0x5a, 0x3d, 0xd1, 0x66, 0x31, + 0x9e, 0x63, 0x54, 0x3b, 0x94, 0xb9, 0xbc, 0xc8, 0x36, 0x70, 0xac, 0x50, + 0x71, 0x7d, 0x85, 0xd2, 0x3a, 0x73, 0x3d, 0xb0, 0x9d, 0x56, 0xaf, 0x5b, + 0xc8, 0x37, 0x89, 0xc2, 0x95, 0x5d, 0x8f, 0x9a, 0x39, 0x5e, 0x94, 0x83, + 0x4c, 0x6a, 0x69, 0x87, 0x83, 0x73, 0xca, 0xba, 0xa2, 0x64, 0xb3, 0x2f, + 0x85, 0x59, 0x33, 0xcb, 0x40, 0x78, 0xbd, 0x4c, 0xad, 0x95, 0x44, 0x98, + 0x70, 0xc2, 0x69, 0xa8, 0xa6, 0x54, 0xc2, 0xb3, 0x55, 0x74, 0x8a, 0x71, + 0x92, 0x32, 0xa3, 0xd9, 0x52, 0xc9, 0x9f, 0xb7, 0x3a, 0x90, 0xae, 0x2b, + 0xbc, 0xcf, 0x2d, 0xa4, 0x39, 0x7d, 0x85, 0x7b, 0x37, 0x70, 0x70, 0x5a, + 0xa6, 0x3d, 0xbd, 0x74, 0xab, 0xa0, 0x6f, 0xc5, 0xd7, 0x8d, 0x71, 0x6e, + 0x97, 0xcc, 0x60, 0x7d, 0xd0, 0x81, 0x65, 0x35, 0x5c, 0x53, 0x5d, 0xa9, + 0xaa, 0x8e, 0xa8, 0x67, 0xb1, 0xbb, 0xd8, 0xab, 0x51, 0x2a, 0xbe, 0xb3, + 0xbf, 0x3e, 0xa2, 0x67, 0x77, 0xbf, 0xc3, 0x69, 0x81, 0xa7, 0x6b, 0xa0, + 0x3f, 0x47, 0x50, 0x67, 0x50, 0x60, 0x65, 0xc2, 0xaf, 0x85, 0x9d, 0xa4, + 0x45, 0x95, 0x71, 0x7f, 0x63, 0xa3, 0x72, 0x64, 0x70, 0x62, 0x6c, 0x48, + 0x4c, 0xc1, 0x9c, 0x68, 0xc9, 0x69, 0x5c, 0xa9, 0x63, 0x91, 0xaa, 0x8c, + 0x6b, 0xc2, 0x3f, 0x35, 0x5d, 0x7d, 0x8c, 0x6e, 0x6e, 0x9d, 0x47, 0x81, + 0x55, 0x61, 0x5d, 0xdb, 0x46, 0x43, 0xa3, 0x62, 0xb0, 0xb8, 0x5e, 0x78, + 0xce, 0xcb, 0x97, 0xb4, 0x55, 0x48, 0x75, 0x44, 0x54, 0x89, 0x3e, 0x3a, + 0x7c, 0x81, 0x6b, 0x73, 0xba, 0xb2, 0x96, 0xc9, 0x47, 0xb1, 0xd5, 0xd1, + 0x8f, 0x87, 0xaa, 0xcc, 0x7d, 0x8e, 0x83, 0x8e, 0x30, 0x7a, 0x80, 0xc1, + 0x93, 0xd2, 0xbd, 0x5c, 0x50, 0x50, 0xc0, 0x2e, 0xc3, 0x9e, 0x7b, 0x4c, + 0x4c, 0x8c, 0x7f, 0xc8, 0xe0, 0x93, 0xb7, 0x95, 0xc7, 0x73, 0x3c, 0xa5, + 0x4f, 0x9a, 0x57, 0x8c, 0x70, 0x43, 0x84, 0x32, 0x83, 0x58, 0xb8, 0x9c, + 0xb7, 0x86, 0x65, 0x79, 0xa7, 0x88, 0x99, 0x67, 0x5a, 0xd5, 0x30, 0x4d, + 0x76, 0x46, 0x69, 0x8f, 0x7a, 0x66, 0x67, 0x83, 0x8b, 0x9e, 0xdb, 0xd0, + 0x9f, 0x47, 0xa5, 0x2c, 0x47, 0xe0, 0x85, 0x2b, 0xb8, 0x7c, 0x68, 0xb1, + 0xab, 0x45, 0x37, 0xd3, 0xa8, 0x5a, 0xcc, 0x4e, 0xce, 0x95, 0xa4, 0xb5, + 
0xc6, 0xc8, 0x7c, 0x63, 0xb4, 0x4e, 0x74, 0x7c, 0x60, 0xda, 0xb9, 0xc8, + 0x6a, 0xaf, 0x67, 0xb5, 0x75, 0x7b, 0x30, 0x44, 0x46, 0x5c, 0x91, 0xba, + 0xbb, 0x91, 0xb6, 0x5f, 0xa6, 0x47, 0x75, 0x8c, 0xa8, 0xc6, 0xd0, 0x66, + 0x9b, 0x99, 0x3d, 0x3c, 0xc0, 0x66, 0xae, 0x59, 0x3f, 0x5c, 0xbd, 0x59, + 0x3b, 0xc2, 0x3b, 0x43, 0x5f, 0x94, 0x44, 0x4e, 0x94, 0x4e, 0x9b, 0xcc, + 0x78, 0x47, 0xa6, 0x74, 0x66, 0x9e, 0x9a, 0x7e, 0x86, 0x87, 0x77, 0x47, + 0x70, 0xc7, 0x8f, 0x61, 0x5e, 0x9a, 0x53, 0x72, 0x51, 0x6f, 0xa9, 0xa7, + 0xc5, 0xac, 0x9b, 0x65, 0x8a, 0xc3, 0x5a, 0x30, 0x6e, 0x95, 0xce, 0x35, + 0x73, 0x89, 0x42, 0xb1, 0x97, 0x9e, 0xb4, 0x6f, 0xd3, 0x3b, 0xb8, 0xa3, + 0x43, 0x33, 0xaf, 0xc2, 0xc3, 0xd8, 0xb0, 0x89, 0x8f, 0x4a, 0x7f, 0x52, + 0x84, 0x60, 0x76, 0x9a, 0x66, 0xbf, 0xc8, 0x51, 0x62, 0xb6, 0xce, 0x43, + 0x52, 0xbb, 0x73, 0x7e, 0x65, 0xa4, 0xb6, 0x84, 0x6e, 0x9f, 0x80, 0xbc, + 0x7f, 0x78, 0xa3, 0xbc, 0x3b, 0x51, 0xbd, 0x67, 0xba, 0x77, 0x5a, 0x41, + 0xb6, 0x76, 0x64, 0x62, 0x7f, 0x87, 0x4c, 0xab, 0xd0, 0x63, 0x81, 0x45, + 0x35, 0xcc, 0x68, 0x9f, 0x49, 0xa5, 0x62, 0x4a, 0xb5, 0xb4, 0xaa, 0x4a, + 0x69, 0x98, 0x5a, 0x5f, 0x6b, 0x6c, 0x4d, 0x5e, 0xcd, 0x5f, 0xb3, 0xa4, + 0xc2, 0xd2, 0x67, 0x50, 0x97, 0x62, 0xb2, 0x54, 0x9f, 0x82, 0xa8, 0xc5, + 0xa8, 0xb7, 0x8d, 0x3c, 0xbb, 0xc0, 0xba, 0x4f, 0x6e, 0x38, 0xae, 0x76, + 0x42, 0xc9, 0xad, 0x8a, 0x76, 0xba, 0x7c, 0x8b, 0x52, 0xc9, 0x74, 0x74, + 0x35, 0x9e, 0x93, 0x83, 0xbf, 0x82, 0x4d, 0x7a, 0x92, 0x8f, 0x8d, 0x6c, + 0x7e, 0xcc, 0xc1, 0x93, 0x8b, 0x6d, 0x6b, 0x7f, 0x67, 0x5b, 0x97, 0x80, + 0x72, 0xb8, 0xa7, 0xb9, 0x8b, 0xc8, 0x64, 0x52, 0x95, 0xc7, 0x37, 0x87, + 0x64, 0x40, 0x61, 0x9c, 0x3b, 0x8d, 0x6f, 0x49, 0x8b, 0xa0, 0x73, 0x4b, + 0xa4, 0x3d, 0x4a, 0x7f, 0x7c, 0x9b, 0xb1, 0x5e, 0xca, 0x3d, 0x42, 0x37, + 0x78, 0xa8, 0xb6, 0x39, 0x9f, 0xa5, 0x61, 0x70, 0x2e, 0xbe, 0x84, 0x95, + 0x99, 0x3e, 0x62, 0xc0, 0x82, 0x85, 0x82, 0xbe, 0x40, 0xad, 0x8b, 0xa1, + 0x92, 0x59, 0x83, 0xa9, 0x73, 0x58, 0xcf, 0x9c, 0x76, 0xd1, 0x63, 0x68, + 0x55, 0x76, 0xb9, 0x33, 0x67, 0xbd, 0x59, 0x6e, 0xad, 0xb1, 0x9e, 0x53, + 0x7b, 0x66, 0x3b, 0x4e, 0x90, 0xc0, 0x3d, 0x48, 0xb6, 0x5c, 0x2a, 0x46, + 0x6d, 0xbb, 0xa8, 0xd3, 0x6c, 0x8b, 0x6c, 0x97, 0x67, 0xc9, 0x98, 0x75, + 0x34, 0x49, 0x75, 0x49, 0x77, 0x4e, 0x8f, 0x57, 0x77, 0x81, 0xcc, 0x78, + 0xcf, 0x7f, 0x67, 0xd5, 0x5f, 0x60, 0x5c, 0x98, 0x7b, 0xb7, 0xc3, 0x91, + 0xb0, 0xc6, 0xa3, 0xd3, 0x9a, 0x48, 0x4a, 0x46, 0x39, 0xa3, 0x77, 0x50, + 0x2f, 0x63, 0x46, 0x4a, 0xcf, 0x50, 0xc1, 0x87, 0x49, 0x52, 0xc0, 0x88, + 0xbe, 0x3c, 0xc9, 0x96, 0x65, 0x83, 0x36, 0x56, 0x64, 0x4c, 0x84, 0x67, + 0x7a, 0xb3, 0x61, 0x27, 0xc3, 0xba, 0x6b, 0x36, 0x59, 0x90, 0xb8, 0xd4, + 0xa6, 0x3c, 0x37, 0xbf, 0x44, 0x7a, 0x3a, 0x9a, 0x3f, 0x85, 0xb3, 0x8b, + 0x48, 0x76, 0x44, 0x7a, 0x8c, 0x8b, 0x78, 0xc1, 0x6c, 0x9b, 0x32, 0x80, + 0xcd, 0x8d, 0x89, 0x8f, 0x67, 0x67, 0xcd, 0x74, 0x5b, 0xd1, 0x76, 0x64, + 0x62, 0x64, 0xbc, 0x96, 0xc8, 0x39, 0x88, 0x75, 0xc2, 0x8c, 0x73, 0x2d, + 0x77, 0xd6, 0x76, 0x57, 0x44, 0x44, 0xbc, 0x2d, 0x62, 0x98, 0xa6, 0xa6, + 0x58, 0x9a, 0x85, 0xc1, 0x86, 0x9e, 0x46, 0x9b, 0x89, 0x67, 0x40, 0x9f, + 0x32, 0x82, 0x61, 0x4e, 0x73, 0x65, 0x9e, 0xb3, 0x6c, 0x79, 0x64, 0xc5, + 0x62, 0x98, 0x87, 0x46, 0x85, 0x76, 0x55, 0x54, 0xb8, 0x59, 0x84, 0x94, + 0x62, 0x3f, 0x82, 0xa4, 0xa8, 0xb2, 0xc4, 0x5b, 0xa4, 0x3e, 0x72, 0x44, + 0x80, 0x82, 0x6b, 0x5b, 0xa4, 0x5e, 0x51, 0x37, 0x74, 0x99, 0xb0, 0x9f, + 0x8d, 0x2f, 0x88, 0x4d, 0x98, 0xc3, 0x39, 0x3d, 0x9a, 0x82, 0xb5, 0xb0, + 0xad, 0xc0, 0x54, 0x4e, 0x72, 0x8e, 0xbe, 0x57, 0x44, 0xb6, 0x4b, 0xae, + 
0xbf, 0x45, 0xa6, 0x4b, 0x6a, 0x8c, 0x8f, 0x39, 0x52, 0xac, 0x5a, 0x50, + 0xb7, 0x6a, 0x33, 0xac, 0x79, 0xd1, 0x93, 0x50, 0x5c, 0xb5, 0xa4, 0x4d, + 0x88, 0xb3, 0x58, 0xb4, 0x7f, 0x75, 0x96, 0x9d, 0x92, 0x9c, 0x3f, 0x34, + 0x3b, 0xb3, 0xc5, 0x58, 0x99, 0x3f, 0x30, 0x4d, 0xc4, 0x78, 0xac, 0xa2, + 0x8e, 0x45, 0x7d, 0x60, 0x52, 0x8c, 0xb0, 0x44, 0x2d, 0x72, 0xbf, 0x53, + 0xd6, 0x42, 0x65, 0x43, 0x97, 0xca, 0xa7, 0xbb, 0x54, 0x74, 0xcd, 0x51, + 0x87, 0xc9, 0x36, 0x66, 0x7e, 0xad, 0x5f, 0xbb, 0xb1, 0x80, 0x77, 0x47, + 0x80, 0x41, 0xc4, 0x86, 0x2b, 0xc1, 0x93, 0x84, 0xbe, 0xd1, 0x81, 0x60, + 0x9b, 0x90, 0x99, 0x89, 0x4f, 0x7a, 0x6d, 0x46, 0x2e, 0x6d, 0x82, 0x3b, + 0x4b, 0x95, 0x9b, 0x8d, 0x7d, 0xd5, 0x90, 0x75, 0x70, 0x6e, 0x6e, 0x5e, + 0xca, 0x55, 0xcb, 0x98, 0x9b, 0xd2, 0xa5, 0xc3, 0x39, 0xab, 0xd3, 0x78, + 0x92, 0x3f, 0x8e, 0xa3, 0xbd, 0x81, 0x6a, 0x51, 0xd7, 0x7d, 0xa6, 0x84, + 0xc0, 0x75, 0x60, 0xcc, 0x8f, 0xcb, 0xa9, 0xb7, 0x5e, 0x58, 0x58, 0xc1, + 0x6a, 0x34, 0xab, 0x77, 0xcd, 0xad, 0xab, 0x5b, 0x5b, 0x5b, 0xb1, 0x80, + 0x69, 0xbd, 0xb4, 0x5f, 0x82, 0x89, 0xa8, 0xb0, 0x98, 0x7a, 0x4f, 0xc6, + 0xdb, 0x5a, 0xe4, 0x6c, 0x39, 0xb4, 0x5c, 0x1f, 0x8d, 0x45, 0x5c, 0x62, + 0x77, 0x7e, 0xcb, 0xb1, 0x72, 0xbf, 0x6c, 0x6c, 0x35, 0x77, 0xc0, 0x2c, + 0x74, 0x22, 0x9e, 0xa9, 0x92, 0x8a, 0x6a, 0x91, 0xa8, 0x81, 0xd7, 0x69, + 0x88, 0xc0, 0x9a, 0x47, 0xa4, 0x43, 0x9c, 0x67, 0x80, 0x80, 0x84, 0x4d, + 0xd3, 0xba, 0xb2, 0x78, 0x65, 0xc6, 0x7e, 0x74, 0x6c, 0xb5, 0x82, 0x38, + 0x64, 0x68, 0x77, 0x84, 0x5e, 0x92, 0x83, 0x3f, 0xb6, 0x9e, 0x48, 0x3d, + 0x4a, 0x8e, 0x39, 0x67, 0xbb, 0xbd, 0x84, 0xa1, 0xc8, 0x66, 0xbc, 0x94, + 0x6d, 0x5a, 0x5f, 0xa9, 0x6f, 0xc8, 0x61, 0xa2, 0x85, 0xc8, 0x82, 0x51, + 0xba, 0x3e, 0x2d, 0x68, 0x3a, 0xda, 0xac, 0x41, 0x8c, 0x56, 0x6a, 0x62, + 0x40, 0x83, 0x8a, 0xa7, 0xcd, 0xcf, 0xc9, 0x80, 0xa0, 0x74, 0xb8, 0x3a, + 0x46, 0x56, 0x49, 0x68, 0xa5, 0x49, 0xa7, 0xac, 0xc1, 0xd4, 0xc4, 0x42, + 0x9b, 0x39, 0x7d, 0x6b, 0xa4, 0xa7, 0x39, 0x52, 0x48, 0x44, 0x9f, 0xa7, + 0x3e, 0x3f, 0x3a, 0x9d, 0x4b, 0x50, 0xaa, 0xd1, 0x73, 0x35, 0x74, 0xd6, + 0x79, 0x4f, 0xb1, 0xa5, 0x54, 0x85, 0xc1, 0xb4, 0x63, 0x91, 0xbf, 0x2a, + 0x35, 0xa8, 0x8e, 0x93, 0x89, 0x73, 0xd6, 0x59, 0x90, 0x3c, 0x41, 0x9a, + 0xd1, 0x9e, 0x54, 0xce, 0xad, 0xa4, 0x84, 0x40, 0x55, 0xb8, 0x40, 0x49, + 0x91, 0x79, 0x62, 0x76, 0xcc, 0xa3, 0xa1, 0x50, 0x55, 0x37, 0x83, 0x8a, + 0x5e, 0x49, 0x95, 0xc7, 0xce, 0xc2, 0x76, 0xa1, 0x3e, 0x4c, 0x5d, 0x45, + 0x52, 0x7e, 0x74, 0xb2, 0xb4, 0xc6, 0xda, 0xa7, 0x88, 0x30, 0x46, 0x50, + 0x56, 0x68, 0x49, 0xc1, 0x83, 0x6f, 0x4d, 0x54, 0x85, 0x6c, 0x58, 0x99, + 0xbf, 0x78, 0x3d, 0xda, 0x9a, 0x85, 0x57, 0xd8, 0x67, 0x3c, 0xa7, 0xb7, + 0x9f, 0x4d, 0x71, 0xc3, 0xbd, 0x82, 0x9a, 0x35, 0xc9, 0xcd, 0x5f, 0x6c, + 0x7e, 0xc6, 0x9c, 0xb4, 0xcb, 0x35, 0xb4, 0x3f, 0x37, 0x43, 0x95, 0x6a, + 0x66, 0xa7, 0x8e, 0x57, 0xaa, 0xb2, 0xcf, 0xc2, 0xb4, 0x8f, 0xcf, 0x73, + 0xa4, 0x9d, 0x5c, 0x90, 0x9d, 0x86, 0xc6, 0x4c, 0x38, 0x48, 0xcc, 0xa0, + 0x56, 0xa6, 0xc9, 0x65, 0x82, 0xa0, 0xb2, 0x88, 0x6f, 0x5c, 0x5a, 0x74, + 0x2e, 0xb3, 0xd6, 0xd6, 0x4c, 0xae, 0x71, 0x8b, 0x88, 0x47, 0xbd, 0xb3, + 0x84, 0xd4, 0x35, 0x8c, 0x89, 0x45, 0xca, 0x4f, 0xc6, 0x91, 0x6d, 0x9b, + 0xc4, 0xbd, 0x57, 0x7f, 0x54, 0x8e, 0x71, 0x97, 0x9e, 0x6b, 0x7b, 0xbe, + 0xc5, 0xc9, 0x4b, 0x60, 0xce, 0x9e, 0xc6, 0xa3, 0x85, 0xc5, 0x96, 0x29, + 0x6f, 0x43, 0x47, 0x89, 0x94, 0x89, 0xbd, 0x87, 0x45, 0x92, 0x59, 0x1d, + 0x7d, 0xb5, 0x7a, 0x34, 0xb3, 0x69, 0x3a, 0x8e, 0x60, 0xa1, 0x92, 0x49, + 0xc0, 0xb8, 0xb1, 0xb0, 0x60, 0x46, 0xcf, 0x75, 0x49, 0xb9, 0xd5, 0x55, + 
0x51, 0xd3, 0x54, 0x54, 0x49, 0x52, 0x7d, 0x79, 0xa9, 0xd9, 0x9a, 0x6c, + 0x77, 0xa0, 0xcb, 0x4f, 0x6b, 0x40, 0xad, 0x5f, 0x9a, 0x84, 0x7e, 0x8c, + 0x33, 0x5c, 0x45, 0xcf, 0xb0, 0x48, 0x2b, 0xbb, 0x60, 0x77, 0x76, 0x81, + 0x62, 0xcc, 0xab, 0x7d, 0xc9, 0x40, 0xac, 0x9f, 0xd9, 0x88, 0x3a, 0x33, + 0x91, 0xb3, 0xcd, 0x40, 0x95, 0x69, 0xa6, 0xc1, 0x89, 0x70, 0x49, 0x51, + 0x65, 0x40, 0x80, 0xc8, 0x50, 0xbb, 0xa6, 0x8e, 0x6e, 0xc2, 0xad, 0xbc, + 0x3d, 0x91, 0x9b, 0x56, 0x64, 0xa5, 0x75, 0xaf, 0xd0, 0x74, 0xa9, 0x6a, + 0x73, 0x96, 0x45, 0xbd, 0xc4, 0xc4, 0x42, 0x9d, 0x5b, 0x4d, 0x5a, 0x5f, + 0xa1, 0x6f, 0x3f, 0x9d, 0x78, 0x9c, 0xcf, 0xaf, 0xc5, 0x35, 0xb6, 0x70, + 0xa5, 0x5e, 0x5c, 0xc1, 0xca, 0xc5, 0xcc, 0x93, 0x91, 0x8e, 0x74, 0x40, + 0x67, 0xba, 0xb8, 0x55, 0xc6, 0xbb, 0x9e, 0xb0, 0x91, 0x5c, 0x3f, 0x3a, + 0xd5, 0xa8, 0xd3, 0x50, 0x9e, 0x68, 0xcd, 0xac, 0x60, 0x66, 0x65, 0x28, + 0x93, 0x5e, 0xb3, 0x7a, 0x47, 0x64, 0x87, 0xa3, 0x2b, 0xbc, 0xcc, 0xb0, + 0x56, 0xa7, 0x8b, 0x4e, 0x59, 0x72, 0x34, 0x29, 0x8b, 0x3e, 0xc5, 0x61, + 0x37, 0xb1, 0x57, 0x5b, 0x52, 0x89, 0xce, 0x98, 0x7b, 0x75, 0x88, 0x5d, + 0x8f, 0x63, 0x40, 0x53, 0x61, 0xbe, 0x74, 0x72, 0x45, 0xc5, 0x9e, 0xc2, + 0x7a, 0x76, 0xcd, 0x8b, 0x9c, 0xc1, 0xc5, 0xc1, 0x59, 0x79, 0xbb, 0x7e, + 0x6a, 0x62, 0xcd, 0xb9, 0x4f, 0x53, 0xb0, 0x49, 0x51, 0x67, 0x4e, 0x97, + 0x60, 0x72, 0x7f, 0x72, 0xd2, 0x4a, 0xa6, 0x38, 0x9f, 0xb0, 0x95, 0xae, + 0x6f, 0x8a, 0xb3, 0xa6, 0x8a, 0x4f, 0x92, 0xc1, 0x32, 0x91, 0x7b, 0x8f, + 0xcb, 0x32, 0x6e, 0xae, 0x8a, 0xc8, 0xc9, 0xb8, 0x97, 0x3f, 0x61, 0xcd, + 0x9b, 0x60, 0x66, 0x7f, 0x7a, 0x73, 0x34, 0xab, 0x92, 0x51, 0x55, 0x92, + 0x97, 0x6f, 0xc7, 0x3b, 0x65, 0x75, 0x95, 0xbc, 0x5a, 0x9c, 0x63, 0x82, + 0xab, 0x49, 0x8c, 0xb2, 0x46, 0xcb, 0x86, 0xbd, 0xbc, 0x91, 0x7e, 0x5a, + 0xab, 0x59, 0x8a, 0x8c, 0xd2, 0x92, 0x8c, 0x8a, 0x4b, 0x35, 0x98, 0x9f, + 0x59, 0xcf, 0x69, 0xaa, 0x80, 0xa6, 0x4b, 0x6b, 0x4a, 0xc0, 0x6e, 0x58, + 0xb5, 0xd2, 0x91, 0xa6, 0xaf, 0xd4, 0x48, 0xa0, 0x6b, 0x5a, 0xd0, 0xc8, + 0xa4, 0x71, 0x39, 0x58, 0xac, 0xb7, 0x30, 0x8b, 0x6c, 0xd7, 0x95, 0x6a, + 0x45, 0xa1, 0x88, 0x8a, 0x6e, 0x9a, 0x45, 0xa5, 0x55, 0x85, 0x6d, 0x64, + 0xd4, 0x97, 0xa3, 0x32, 0x3d, 0x5b, 0xc0, 0x54, 0x9e, 0x7f, 0x9c, 0x40, + 0x82, 0x58, 0x7b, 0xcf, 0x44, 0xa3, 0x8a, 0x6f, 0x6c, 0xce, 0xb0, 0x9d, + 0x61, 0xbb, 0x55, 0xc1, 0x86, 0x69, 0xb7, 0x4c, 0x5b, 0x47, 0x52, 0x9f, + 0xbd, 0x5c, 0x63, 0x6c, 0x3f, 0x29, 0xab, 0x34, 0x91, 0x6e, 0xa6, 0x60, + 0x5f, 0x9e, 0x98, 0xc2, 0xc7, 0x69, 0xc1, 0x89, 0x8d, 0x33, 0x30, 0x36, + 0xc5, 0xbb, 0x35, 0xa8, 0x4e, 0x6a, 0x61, 0xb8, 0xae, 0x3d, 0xae, 0x67, + 0xb2, 0xd5, 0x38, 0x25, 0xca, 0x52, 0xb5, 0x5f, 0xad, 0x55, 0xd4, 0x33, + 0x5a, 0x9e, 0x82, 0x81, 0xab, 0x4b, 0x62, 0xa2, 0x5a, 0x49, 0xc4, 0x67, + 0x80, 0xc5, 0x3c, 0x74, 0x84, 0xa0, 0x58, 0x66, 0x59, 0xd0, 0x2c, 0x8e, + 0x86, 0x35, 0xc6, 0xbd, 0x7a, 0x94, 0x47, 0xb1, 0x5b, 0x4b, 0x3d, 0xb3, + 0x51, 0x33, 0x7f, 0x66, 0x98, 0x79, 0xb5, 0xd5, 0x80, 0xcb, 0xc6, 0xb6, + 0xc6, 0x37, 0x7b, 0xb0, 0xb8, 0x9b, 0xb5, 0xb2, 0x67, 0x96, 0xdd, 0x69, + 0x45, 0x4c, 0x63, 0x41, 0x3c, 0x6a, 0xcf, 0x3f, 0x74, 0xb9, 0x87, 0x9b, + 0x72, 0x47, 0x2d, 0x9f, 0x49, 0x86, 0x4d, 0xa9, 0x3c, 0x87, 0xbf, 0xa4, + 0x74, 0x39, 0x78, 0x35, 0x80, 0x62, 0xcc, 0x78, 0x5d, 0xcf, 0x84, 0x6e, + 0x3c, 0x73, 0x8a, 0x69, 0x9b, 0xb2, 0xb8, 0x6c, 0xa8, 0xa6, 0xcc, 0xc7, + 0x75, 0xa4, 0xca, 0xc5, 0xaa, 0x4c, 0x59, 0xc8, 0x7a, 0x8b, 0xcf, 0x94, + 0x69, 0xdb, 0xb9, 0xbb, 0x37, 0x33, 0xca, 0x3e, 0x81, 0x61, 0xb1, 0x5c, + 0x43, 0x4a, 0xa7, 0xc9, 0xbd, 0xc9, 0x67, 0x4f, 0xa8, 0x77, 0xad, 0xb7, + 
0xa0, 0x84, 0x3e, 0xd0, 0x9e, 0xba, 0x5d, 0x2a, 0xaf, 0xad, 0x3a, 0x6d, + 0x45, 0x97, 0x35, 0xb5, 0x77, 0x7d, 0x4a, 0xb8, 0x4d, 0x84, 0x7f, 0xa0, + 0x53, 0x27, 0x87, 0x4b, 0xe2, 0xc6, 0xe1, 0x32, 0xaa, 0x47, 0x53, 0x7b, + 0xc7, 0x4e, 0xb9, 0xc9, 0x6d, 0x65, 0xa1, 0xbc, 0xa6, 0xa0, 0xb1, 0x57, + 0x84, 0x61, 0x6e, 0x92, 0x8f, 0x6b, 0x75, 0xae, 0x43, 0x45, 0xb6, 0x55, + 0x5d, 0x7d, 0x64, 0x6e, 0xad, 0x7e, 0x42, 0xd6, 0xc4, 0x7f, 0xc2, 0x78, + 0x7f, 0xb9, 0xa4, 0xb9, 0x95, 0x67, 0xc7, 0x35, 0x3b, 0x6a, 0xc3, 0x84, + 0x61, 0xac, 0xc1, 0xb8, 0xc7, 0x96, 0x38, 0xa0, 0xbb, 0x58, 0xc5, 0xce, + 0x54, 0x6a, 0x5c, 0xba, 0x6e, 0x55, 0x85, 0x68, 0x75, 0x4b, 0x7e, 0x70, + 0xd1, 0xd0, 0x5e, 0x88, 0xa5, 0x49, 0x84, 0x6f, 0x61, 0xca, 0xac, 0xcd, + 0xce, 0x89, 0x93, 0x30, 0x60, 0x98, 0x40, 0x71, 0x84, 0x6c, 0x3d, 0x9e, + 0x99, 0x6e, 0xaa, 0x92, 0x72, 0xdb, 0x50, 0x4b, 0x76, 0xc2, 0x7d, 0xd4, + 0x54, 0x56, 0x9a, 0x3f, 0x96, 0xd3, 0x83, 0x58, 0x5b, 0xc0, 0x81, 0x78, + 0xa5, 0xc6, 0x72, 0xc9, 0x9a, 0x4b, 0x7f, 0x5a, 0x6e, 0xcb, 0xd1, 0x5f, + 0x98, 0x6e, 0x98, 0x35, 0x7a, 0x92, 0x91, 0x52, 0x44, 0x5f, 0x4b, 0x71, + 0xaa, 0x7a, 0xab, 0xe2, 0x9f, 0xc4, 0x33, 0xa7, 0x8c, 0x75, 0x38, 0x94, + 0x45, 0x47, 0x69, 0x64, 0x80, 0x85, 0x51, 0xa0, 0x5c, 0xb8, 0xad, 0x6f, + 0xb5, 0x94, 0xa4, 0x9c, 0x94, 0xde, 0xcc, 0x36, 0xb3, 0x59, 0x62, 0x58, + 0xa3, 0xa4, 0x52, 0xac, 0xa7, 0x5b, 0x4c, 0x61, 0xcd, 0x4e, 0x55, 0x44, + 0xb2, 0x5d, 0x3f, 0x4e, 0x38, 0xc6, 0xae, 0xa9, 0x78, 0xc0, 0xc4, 0x9e, + 0x56, 0x4b, 0x54, 0xa3, 0x77, 0xbc, 0xa0, 0xb9, 0xb9, 0xa0, 0x4c, 0x45, + 0xc4, 0xad, 0x5c, 0x6a, 0x86, 0x4e, 0x74, 0xa3, 0xc1, 0x49, 0x38, 0xbf, + 0x96, 0x71, 0x92, 0x9c, 0x7a, 0xad, 0x89, 0xcb, 0x9e, 0xb2, 0x40, 0x63, + 0x9c, 0xb2, 0x50, 0x84, 0x78, 0xa0, 0x41, 0x3b, 0x78, 0x6d, 0x84, 0x3f, + 0x41, 0x71, 0x59, 0xd0, 0x6a, 0xc3, 0xb2, 0x88, 0xcc, 0xbc, 0x7e, 0x77, + 0x9b, 0x91, 0x68, 0xd2, 0xc4, 0x39, 0x8a, 0xb3, 0xa4, 0xb4, 0x99, 0x6d, + 0x97, 0x5b, 0x89, 0x6b, 0x7d, 0x3a, 0x4c, 0xab, 0x8e, 0x47, 0x64, 0xc0, + 0x99, 0xaa, 0x3c, 0x6e, 0x83, 0x56, 0x68, 0xcb, 0xbb, 0x4a, 0xcb, 0x4f, + 0x66, 0x3c, 0x85, 0xc8, 0x33, 0x57, 0x32, 0xd5, 0x3f, 0x69, 0x83, 0x36, + 0xcb, 0x41, 0x89, 0xcd, 0x89, 0xb5, 0x38, 0x9f, 0xbe, 0x5f, 0x9d, 0x56, + 0x5e, 0x88, 0xb6, 0x66, 0xa8, 0xa6, 0xb5, 0x92, 0x3b, 0xc0, 0x62, 0x4c, + 0xcb, 0x52, 0x78, 0x86, 0xa6, 0x8d, 0x3e, 0x95, 0x55, 0xa9, 0x9d, 0xc3, + 0x85, 0x74, 0xb8, 0x96, 0x43, 0x86, 0x3e, 0xbc, 0xa4, 0x64, 0x3a, 0x7c, + 0x7b, 0x45, 0x49, 0x41, 0xa9, 0xca, 0x34, 0xa2, 0x8e, 0x43, 0x3e, 0xa2, + 0x9a, 0x37, 0xcf, 0xb8, 0xcb, 0xc2, 0xd1, 0xca, 0xd7, 0xaf, 0xbb, 0x9c, + 0xb0, 0x42, 0x88, 0x6d, 0x9d, 0x63, 0x73, 0x7a, 0x9c, 0x94, 0x6f, 0x75, + 0x58, 0x68, 0xae, 0x96, 0xbe, 0x45, 0x32, 0x36, 0xbc, 0x76, 0x39, 0x56, + 0x4b, 0x4a, 0xc9, 0xaf, 0xb2, 0x78, 0xa7, 0x98, 0xc8, 0x59, 0x9a, 0xa0, + 0x70, 0x54, 0x8f, 0x63, 0xa3, 0x58, 0xd9, 0xbf, 0x3b, 0xbe, 0x97, 0x42, + 0x85, 0xd5, 0x5d, 0x60, 0x7e, 0x4a, 0x4f, 0x99, 0x92, 0x56, 0x50, 0xc9, + 0x67, 0x5b, 0x5b, 0x7a, 0x31, 0x6f, 0xc6, 0xab, 0x31, 0x55, 0xcb, 0x3f, + 0x7b, 0x69, 0xbc, 0x49, 0x53, 0x7f, 0x97, 0xc2, 0xc9, 0x57, 0xb9, 0xa1, + 0x32, 0x69, 0x6c, 0x6f, 0x60, 0xc1, 0x9c, 0x67, 0xd8, 0xce, 0x39, 0x91, + 0x35, 0x66, 0xb2, 0x52, 0x92, 0x62, 0x90, 0xaf, 0x57, 0xa3, 0x8e, 0x6d, + 0x65, 0xa8, 0xc2, 0x42, 0x87, 0x69, 0x91, 0xab, 0x40, 0x79, 0xd8, 0x57, + 0x72, 0x73, 0xc9, 0x4a, 0xc8, 0xab, 0x98, 0xab, 0x5f, 0x66, 0x71, 0x30, + 0xad, 0x43, 0x90, 0x6d, 0x7e, 0x4d, 0xcf, 0x57, 0x90, 0xaf, 0xbf, 0x43, + 0xbd, 0x8a, 0x55, 0x58, 0x6f, 0xb1, 0x3c, 0x4d, 0x95, 0xaa, 0x4c, 0x4d, + 
0x84, 0x7e, 0xb4, 0xd3, 0x6f, 0x9f, 0xd9, 0x60, 0xc9, 0x8d, 0x52, 0x8e, + 0x4e, 0xbd, 0xc1, 0x52, 0x4e, 0x47, 0x31, 0x34, 0xd0, 0x56, 0x79, 0x6e, + 0x9f, 0xae, 0x3d, 0x70, 0x95, 0x8c, 0x92, 0xa7, 0x58, 0x58, 0x89, 0xbf, + 0x77, 0x81, 0x95, 0x8e, 0xc7, 0x48, 0x79, 0x61, 0x81, 0xa8, 0x74, 0xb6, + 0x52, 0xc9, 0x64, 0xb4, 0x5a, 0xba, 0x7d, 0x3d, 0x73, 0x48, 0x45, 0xb8, + 0x62, 0xb1, 0x77, 0xa8, 0xe1, 0x63, 0xc7, 0x96, 0xba, 0xb1, 0x84, 0xd5, + 0xd5, 0x95, 0x3c, 0x72, 0x3c, 0x34, 0x66, 0x65, 0x3d, 0x96, 0x7a, 0xc7, + 0x54, 0x5c, 0x96, 0xb9, 0x50, 0x6c, 0x96, 0xdc, 0x78, 0x85, 0xc9, 0x8b, + 0x49, 0x4b, 0x47, 0xcd, 0x47, 0x6b, 0xb2, 0xcb, 0x47, 0xb8, 0xcc, 0x76, + 0x46, 0x74, 0x83, 0x45, 0xba, 0x8f, 0x79, 0x8d, 0x76, 0xa0, 0x3e, 0xc3, + 0xc2, 0x57, 0x5b, 0x62, 0xce, 0x3e, 0x60, 0x87, 0x8e, 0x9f, 0x51, 0x3e, + 0x4e, 0x9a, 0x97, 0x36, 0x59, 0x9f, 0x50, 0x95, 0x40, 0x61, 0x51, 0x55, + 0x71, 0x51, 0xc2, 0x37, 0xbf, 0xc7, 0xae, 0x47, 0x77, 0xca, 0xb8, 0xc7, + 0x87, 0x45, 0xc4, 0x4c, 0x38, 0xb3, 0x3a, 0xa9, 0xc1, 0x80, 0x74, 0x4b, + 0xa9, 0x60, 0xd1, 0x93, 0xb2, 0x8d, 0x3f, 0x44, 0x82, 0x8d, 0x74, 0xcf, + 0x96, 0x98, 0xac, 0x94, 0x50, 0x3c, 0x93, 0x9a, 0x2a, 0x4e, 0xc4, 0x63, + 0x95, 0xc6, 0x51, 0xc5, 0xd0, 0x58, 0xaf, 0x4c, 0x66, 0xda, 0x5d, 0x9b, + 0x82, 0xbd, 0x9e, 0x4b, 0x64, 0x4e, 0x73, 0x3e, 0x3d, 0x5c, 0xbc, 0xc0, + 0x9d, 0x82, 0x92, 0x3c, 0xb7, 0x4a, 0x82, 0x93, 0xde, 0x61, 0x92, 0x65, + 0x6d, 0x9b, 0xb5, 0xc3, 0x44, 0x80, 0x51, 0x70, 0xc7, 0xd3, 0x69, 0x4e, + 0xc2, 0xa2, 0x8e, 0xa4, 0x8e, 0x5f, 0x7d, 0x5d, 0x42, 0x5d, 0x71, 0xc9, + 0x28, 0x46, 0x6f, 0x88, 0xb0, 0x43, 0xc5, 0x72, 0x33, 0xb1, 0x4d, 0xc1, + 0xa1, 0x80, 0x99, 0xb9, 0xb0, 0x68, 0x81, 0x9a, 0x77, 0x71, 0x76, 0xb9, + 0x92, 0xc0, 0xc7, 0xc0, 0x8a, 0xa8, 0xb2, 0xbe, 0x79, 0x70, 0xcb, 0x5b, + 0xb5, 0xbe, 0x42, 0xa7, 0x80, 0x87, 0x38, 0x53, 0x6b, 0x58, 0x8f, 0x4c, + 0x3d, 0xb3, 0xd1, 0xa3, 0x55, 0x9c, 0x91, 0x2f, 0xc4, 0x51, 0x6d, 0x48, + 0x4f, 0x91, 0xb6, 0xb2, 0x56, 0x4a, 0x34, 0x86, 0x55, 0xc4, 0x36, 0xc8, + 0xa7, 0x92, 0x42, 0x75, 0xb4, 0x76, 0x83, 0xcd, 0x5a, 0x44, 0x7d, 0xa6, + 0x4c, 0x46, 0x9d, 0x97, 0xc6, 0x72, 0x34, 0x28, 0xdc, 0xa7, 0x37, 0x1e, + 0x6b, 0x4b, 0x5a, 0xc2, 0x9f, 0x62, 0xa2, 0xaf, 0x55, 0x3c, 0xd5, 0xb3, + 0x46, 0xae, 0x72, 0x4b, 0xb0, 0x86, 0x87, 0x92, 0x84, 0x66, 0x4a, 0x75, + 0x6d, 0xac, 0xa1, 0x86, 0x8a, 0xb2, 0x47, 0xcf, 0xad, 0x9d, 0x99, 0xc7, + 0x74, 0x7d, 0x4b, 0xa8, 0x63, 0xb8, 0xd6, 0xbd, 0x26, 0xbc, 0x43, 0x4b, + 0x9f, 0x56, 0x7e, 0x80, 0xb6, 0xae, 0xd5, 0x7b, 0xbe, 0xa4, 0xb1, 0x98, + 0x6e, 0x59, 0x78, 0xa8, 0xa6, 0x57, 0x3a, 0x61, 0x9f, 0xc0, 0xc6, 0x54, + 0x80, 0x58, 0xa6, 0x3f, 0x61, 0x7b, 0x43, 0xb7, 0xb5, 0x6a, 0x35, 0x3c, + 0x75, 0xa9, 0xbc, 0x33, 0x53, 0xcc, 0xad, 0x6b, 0xc7, 0x94, 0x3a, 0x9a, + 0x73, 0xbf, 0x61, 0xa4, 0x6d, 0xba, 0x7d, 0x5f, 0x53, 0x6b, 0xc6, 0xd7, + 0x95, 0x5a, 0x8d, 0x5f, 0xad, 0xda, 0x85, 0xad, 0x6f, 0xb9, 0x99, 0x2f, + 0x3f, 0x68, 0x8f, 0x69, 0x4a, 0x47, 0x88, 0x80, 0xa0, 0xbf, 0xa4, 0x9d, + 0x38, 0x4e, 0xb8, 0x80, 0x3e, 0x5f, 0xa7, 0x50, 0x6d, 0x69, 0x97, 0xac, + 0x81, 0x4d, 0xa8, 0x63, 0x8f, 0xa1, 0x65, 0xa8, 0x46, 0x91, 0x92, 0xc6, + 0x8a, 0x9b, 0x66, 0x62, 0x7e, 0x85, 0xb2, 0xac, 0x62, 0x8c, 0x81, 0x7e, + 0x37, 0x78, 0xcf, 0x44, 0x39, 0xc8, 0x2d, 0xcf, 0x76, 0x96, 0x75, 0xda, + 0x3a, 0x4c, 0xac, 0x54, 0x9c, 0xba, 0xd5, 0x29, 0xb6, 0x59, 0x77, 0x7f, + 0xbf, 0x8c, 0x6e, 0x80, 0x3c, 0x32, 0xc6, 0xc0, 0xb1, 0xad, 0xb2, 0x6c, + 0xc9, 0xce, 0xa9, 0x4e, 0x4d, 0xd8, 0xb7, 0x38, 0x5d, 0x9d, 0x4c, 0x6f, + 0xc4, 0x43, 0xb5, 0x9b, 0x6a, 0x4a, 0x65, 0x6e, 0xad, 0x69, 0xbb, 0x89, + 
0x50, 0xc1, 0x4e, 0xa5, 0x51, 0x88, 0xa4, 0xab, 0x74, 0x52, 0xc0, 0x29, + 0xbe, 0x32, 0x49, 0x40, 0x93, 0x4e, 0x6f, 0x96, 0xe0, 0xc0, 0xd9, 0x97, + 0x7d, 0x86, 0x52, 0x5b, 0x80, 0x35, 0x79, 0x33, 0x35, 0x88, 0xdf, 0x63, + 0x3d, 0x2c, 0xd4, 0xbf, 0x7e, 0x1b, 0xb4, 0x6d, 0xa3, 0xb5, 0xc7, 0x6f, + 0xad, 0x87, 0xd9, 0x7c, 0x67, 0x67, 0x31, 0x88, 0x71, 0xc9, 0xb4, 0x7a, + 0x78, 0x3c, 0x8b, 0x8b, 0x49, 0xcc, 0xa7, 0x48, 0x5b, 0x44, 0x3e, 0x7f, + 0x96, 0xd3, 0xdb, 0x4b, 0x34, 0x96, 0x53, 0x4f, 0x8e, 0x7a, 0xc8, 0xa9, + 0xaa, 0x77, 0x58, 0x5c, 0x5e, 0x7e, 0x9b, 0x65, 0xa9, 0xcd, 0x9d, 0x3b, + 0x4b, 0x8e, 0xba, 0xc0, 0x67, 0x55, 0x40, 0xcb, 0x76, 0x6d, 0xbe, 0x80, + 0x57, 0x69, 0x3d, 0x89, 0x8f, 0x75, 0x6a, 0xb5, 0xcc, 0xd0, 0xcd, 0x34, + 0xd1, 0xc4, 0xa3, 0x7e, 0xa6, 0x40, 0x7b, 0x48, 0x50, 0xbd, 0x90, 0x5c, + 0x5a, 0x61, 0x88, 0x6e, 0x20, 0x94, 0x35, 0x6a, 0xc2, 0x4d, 0x42, 0xb9, + 0x7f, 0xaf, 0x43, 0x86, 0x4b, 0x5f, 0x73, 0xc5, 0x98, 0x68, 0x90, 0x99, + 0xa0, 0xd5, 0x92, 0xb1, 0xd4, 0x83, 0x80, 0xb4, 0x2f, 0x9d, 0x8e, 0x60, + 0x6d, 0x87, 0x94, 0x69, 0x3d, 0x82, 0x97, 0x64, 0x55, 0x8c, 0x82, 0x5f, + 0x73, 0xa2, 0xb7, 0xa2, 0x4e, 0x56, 0xd4, 0x4f, 0x8c, 0x51, 0xcd, 0xd0, + 0x79, 0x37, 0x4a, 0x5c, 0xc8, 0x9f, 0x2b, 0x5c, 0x7d, 0x6a, 0x96, 0xbe, + 0xc0, 0x68, 0x46, 0x69, 0xb9, 0x68, 0xc9, 0x38, 0x35, 0xaf, 0x92, 0x91, + 0xa2, 0x9d, 0x8a, 0xad, 0xb5, 0x72, 0xde, 0xbe, 0x56, 0x45, 0xb2, 0x70, + 0xe0, 0x43, 0xaf, 0x2b, 0xcc, 0xa9, 0x60, 0x9b, 0xd6, 0x79, 0x6b, 0x82, + 0x7d, 0x8f, 0xaa, 0xaf, 0xb4, 0x64, 0x9f, 0xa6, 0x53, 0x8d, 0x7a, 0xaf, + 0x6d, 0xb2, 0x9e, 0x6f, 0x3c, 0xa1, 0xaf, 0x60, 0x83, 0xbc, 0x6e, 0xf4, + 0x3b, 0x2e, 0xbd, 0xdd, 0x28, 0x71, 0xbf, 0x42, 0x68, 0x59, 0xda, 0x67, + 0x57, 0xaa, 0x5b, 0xf7, 0x29, 0x87, 0x9e, 0xb4, 0x5e, 0x4b, 0x56, 0xb6, + 0xba, 0xc8, 0x95, 0xa5, 0x68, 0x5e, 0x5d, 0x92, 0x35, 0x40, 0x7f, 0x6e, + 0x52, 0x85, 0x62, 0x7c, 0x59, 0x86, 0x47, 0x94, 0x8c, 0xa8, 0xe4, 0xc4, + 0x94, 0x83, 0x81, 0x92, 0x94, 0x40, 0x9c, 0x25, 0x75, 0x45, 0x87, 0xb1, + 0x5b, 0x63, 0x80, 0x96, 0x4f, 0x8f, 0x97, 0xc8, 0x44, 0xbe, 0x8d, 0x3b, + 0x89, 0x48, 0x39, 0x8a, 0x38, 0x84, 0x78, 0x9a, 0x1a, 0xae, 0x81, 0x48, + 0xd0, 0x4d, 0xa0, 0x60, 0xa5, 0xa2, 0x9d, 0x5f, 0x92, 0x90, 0xa4, 0x4e, + 0x90, 0xb6, 0x88, 0xa2, 0x69, 0x42, 0xcc, 0xce, 0x95, 0x80, 0x6b, 0x83, + 0x6b, 0x8a, 0xb3, 0x75, 0xc4, 0xdd, 0x7c, 0x96, 0x8c, 0x5f, 0x71, 0x9b, + 0x95, 0x86, 0x39, 0xbc, 0x62, 0xc7, 0x68, 0x64, 0xab, 0x4f, 0x37, 0x7a, + 0x42, 0x74, 0x97, 0xb0, 0xbd, 0x8a, 0x61, 0x41, 0xb8, 0xb0, 0x7b, 0x97, + 0x3b, 0xba, 0xb7, 0xcf, 0xb3, 0xa2, 0x54, 0x94, 0x41, 0x54, 0x70, 0x8e, + 0x38, 0x78, 0x7b, 0xc2, 0x47, 0xac, 0x9c, 0xd6, 0x85, 0x64, 0x46, 0x8e, + 0xab, 0x63, 0x46, 0x52, 0x76, 0x60, 0xb4, 0x67, 0x80, 0x63, 0x61, 0x74, + 0xa5, 0xb5, 0x83, 0x6e, 0x9e, 0x99, 0x49, 0x9f, 0x57, 0xbd, 0x62, 0x9d, + 0x33, 0x37, 0x6d, 0xae, 0xaa, 0x8b, 0x70, 0x9c, 0x97, 0xc7, 0xa4, 0x2d, + 0x7e, 0xae, 0x66, 0x8a, 0x5d, 0x30, 0x5d, 0xa9, 0x90, 0x76, 0x68, 0x3a, + 0xc2, 0xd1, 0xc2, 0xbb, 0x9e, 0xc0, 0xf3, 0x8b, 0x49, 0x96, 0x32, 0xb0, + 0x67, 0x4f, 0x91, 0xa6, 0xcd, 0x9d, 0x97, 0x5b, 0x66, 0x2b, 0xb2, 0xcc, + 0xab, 0xbb, 0xba, 0x72, 0xa4, 0x91, 0x4d, 0xd0, 0xa7, 0xc9, 0x42, 0x57, + 0xa2, 0x69, 0xbc, 0x64, 0xa7, 0x75, 0xa4, 0x5b, 0x52, 0xc8, 0x3c, 0x60, + 0x3e, 0xd4, 0x44, 0x75, 0x3c, 0xc1, 0x68, 0x88, 0x7a, 0x68, 0xb2, 0x5a, + 0xb8, 0x4a, 0x4d, 0x36, 0x95, 0x88, 0x5b, 0xac, 0xb8, 0x51, 0xaf, 0x69, + 0x63, 0x69, 0x84, 0xa6, 0xab, 0xbf, 0x6f, 0x86, 0x3a, 0xb3, 0x3f, 0x54, + 0x4f, 0xad, 0x9e, 0x44, 0x6f, 0x73, 0x3a, 0x5f, 0xa2, 0x67, 0x3e, 0x96, + 
0x64, 0x51, 0xc5, 0xd3, 0x6b, 0x7f, 0x8f, 0xd2, 0x6a, 0xaa, 0xdb, 0xa9, + 0x86, 0xa7, 0x90, 0x88, 0xa4, 0x67, 0xa6, 0xab, 0xca, 0x82, 0xc9, 0x75, + 0x85, 0xd1, 0x7e, 0xcb, 0x50, 0x8c, 0x6c, 0xb5, 0x53, 0x81, 0xa4, 0x63, + 0xa5, 0xb0, 0xb3, 0xd6, 0x4c, 0x9f, 0x60, 0xb2, 0x9c, 0x65, 0xdd, 0x8a, + 0x3e, 0x96, 0x39, 0xbb, 0x78, 0x52, 0x7f, 0xbb, 0x45, 0x85, 0xc3, 0x46, + 0xad, 0x9f, 0x41, 0xc5, 0x77, 0xab, 0x33, 0x9c, 0xb2, 0x91, 0x31, 0x48, + 0x7a, 0x2f, 0xbd, 0xb3, 0x61, 0x37, 0x9e, 0x9b, 0xa7, 0x91, 0x61, 0x54, + 0x45, 0x6d, 0xb5, 0x9e, 0xb4, 0xa9, 0x7c, 0x7c, 0x47, 0x66, 0xc9, 0x6f, + 0xc2, 0x78, 0x83, 0x8b, 0x83, 0x90, 0x40, 0xae, 0x54, 0x6e, 0x74, 0x6e, + 0x4d, 0x58, 0x9d, 0x7c, 0xca, 0xa8, 0x8c, 0x64, 0xa6, 0x85, 0x4d, 0x57, + 0x45, 0x9a, 0x8d, 0x57, 0xd2, 0x66, 0xcc, 0xa6, 0x5f, 0xd2, 0x4f, 0x6f, + 0x8d, 0xc7, 0x90, 0x9a, 0x84, 0xb8, 0x37, 0x76, 0xa5, 0xce, 0x49, 0x74, + 0xb9, 0xb2, 0xab, 0x57, 0x81, 0x30, 0x98, 0xa0, 0x8b, 0x38, 0x50, 0x7b, + 0xa5, 0xb2, 0xba, 0x6b, 0x6e, 0x59, 0x5c, 0xc1, 0x88, 0xd7, 0x4f, 0xcf, + 0x7a, 0x88, 0x64, 0x45, 0xb3, 0x79, 0x42, 0x81, 0xb7, 0x51, 0x7e, 0x83, + 0x85, 0xac, 0x3c, 0x34, 0xb6, 0x3d, 0xb8, 0x89, 0x9b, 0x34, 0x3f, 0xcd, + 0x87, 0xac, 0xb3, 0x85, 0x97, 0x9e, 0xd2, 0x5e, 0xba, 0x51, 0x75, 0x6c, + 0x90, 0x53, 0x61, 0x6a, 0x93, 0xb5, 0xae, 0xab, 0x93, 0x35, 0x8c, 0xd4, + 0x45, 0x84, 0xa4, 0x69, 0x9b, 0xba, 0x90, 0x7d, 0x51, 0x62, 0x49, 0xab, + 0xa7, 0x5a, 0xe0, 0xd1, 0xc5, 0x52, 0x92, 0x93, 0x79, 0x3c, 0xb1, 0x76, + 0xa3, 0xb4, 0x68, 0x57, 0x6e, 0x52, 0xdf, 0xac, 0x93, 0x83, 0xcc, 0xa0, + 0xa2, 0xa7, 0xd1, 0x28, 0x79, 0x6a, 0xac, 0x63, 0x4a, 0x66, 0x31, 0xc8, + 0x55, 0xa6, 0xa3, 0x67, 0xc8, 0x8e, 0x9d, 0x7b, 0xa7, 0xde, 0x60, 0xb5, + 0x7d, 0x94, 0x9d, 0x96, 0xc9, 0x72, 0x84, 0xc4, 0xd0, 0x4b, 0x5f, 0x7a, + 0xab, 0x72, 0x85, 0x43, 0xc8, 0x4b, 0x4c, 0x93, 0x45, 0x8b, 0x38, 0x70, + 0x6f, 0x94, 0x79, 0x9d, 0xb4, 0xc5, 0x5a, 0xd0, 0x65, 0x8c, 0x81, 0xa6, + 0x5c, 0x6b, 0x8e, 0x42, 0xa4, 0xd0, 0x50, 0x61, 0x7f, 0xd6, 0x48, 0xb1, + 0x85, 0xac, 0xc2, 0xbc, 0x71, 0xc8, 0x98, 0xca, 0x83, 0xa6, 0x73, 0x4d, + 0xd5, 0xc7, 0x50, 0xd2, 0x71, 0xdf, 0xa7, 0x7d, 0xb6, 0x62, 0xc8, 0x6a, + 0x9d, 0x66, 0x95, 0x34, 0x8f, 0xb7, 0x95, 0x4e, 0x60, 0x79, 0x98, 0xb7, + 0xc8, 0x50, 0x71, 0xac, 0x52, 0x8e, 0x39, 0x55, 0xbf, 0xa2, 0x6d, 0x96, + 0xa0, 0xbd, 0x8a, 0x66, 0xbf, 0x69, 0xda, 0x53, 0x7b, 0x65, 0x84, 0x2d, + 0xba, 0xb7, 0x54, 0x51, 0xcd, 0x51, 0x76, 0xc5, 0x65, 0xbe, 0x7b, 0x77, + 0xa4, 0x71, 0x4e, 0x4e, 0x61, 0x3b, 0x9e, 0xab, 0xae, 0xc7, 0x3e, 0x66, + 0x98, 0x69, 0x55, 0x68, 0x4e, 0x7a, 0x48, 0x6e, 0x9e, 0xb7, 0x70, 0x6b, + 0x88, 0x6c, 0x4d, 0xc3, 0x4b, 0xc6, 0x3b, 0x87, 0xba, 0xb7, 0xa7, 0x80, + 0x7b, 0x54, 0xba, 0x76, 0x32, 0x95, 0x69, 0x54, 0x45, 0xd0, 0x91, 0xca, + 0x5b, 0xc2, 0xc8, 0x7f, 0x72, 0x80, 0x71, 0xb7, 0xd0, 0x94, 0x6a, 0x92, + 0xcf, 0xd3, 0x44, 0xd3, 0x66, 0x83, 0x79, 0x4b, 0xae, 0x8e, 0x8b, 0x91, + 0x89, 0x3a, 0xaa, 0x7e, 0x97, 0x61, 0x90, 0x57, 0x4e, 0x5d, 0xd9, 0xc0, + 0x2e, 0x79, 0x64, 0x73, 0x77, 0xb2, 0x85, 0xa6, 0x4b, 0x6c, 0x97, 0x87, + 0x3b, 0xcb, 0x7c, 0xbd, 0x3a, 0xcd, 0x36, 0x99, 0x81, 0xce, 0x6b, 0xa9, + 0x30, 0x8f, 0x86, 0xb3, 0x93, 0xae, 0x74, 0x53, 0x44, 0x35, 0x93, 0x96, + 0x6a, 0x95, 0x3d, 0x4e, 0x39, 0x62, 0xbb, 0xcf, 0xa2, 0xab, 0xcf, 0x9a, + 0x7d, 0x45, 0xc1, 0x63, 0x39, 0x8d, 0xc2, 0x7d, 0x45, 0x91, 0x77, 0x91, + 0x7e, 0x89, 0x78, 0x51, 0x2c, 0x39, 0xb9, 0x32, 0xd1, 0x91, 0x58, 0xaf, + 0x62, 0x35, 0x30, 0x51, 0x83, 0x49, 0x83, 0x9c, 0xa0, 0x62, 0xc2, 0x5c, + 0x44, 0x6c, 0x6f, 0x60, 0x7a, 0x38, 0xc5, 0x4e, 0x31, 0x5d, 0xc7, 0x6c, + 
0xad, 0x8f, 0xc9, 0x61, 0x46, 0xd4, 0x7a, 0x6d, 0x7e, 0xa1, 0x78, 0xa6, + 0x3d, 0x9f, 0x8f, 0xbe, 0xb5, 0x7d, 0x8a, 0x86, 0x40, 0x3b, 0x9f, 0xc1, + 0x66, 0x45, 0x57, 0xd7, 0x7e, 0xb3, 0x41, 0x4f, 0x89, 0x4f, 0x3d, 0x84, + 0x6f, 0x33, 0x5a, 0x5e, 0x69, 0x70, 0xa0, 0x40, 0xae, 0x5c, 0x3e, 0xce, + 0x49, 0x60, 0xb8, 0xb1, 0xad, 0x5d, 0xc8, 0x7e, 0x40, 0x31, 0x94, 0xa3, + 0x5f, 0x6c, 0x31, 0xbb, 0xbd, 0xc4, 0x38, 0xa9, 0x6b, 0x72, 0x76, 0xad, + 0x9e, 0x83, 0x4a, 0x88, 0x90, 0x39, 0x60, 0xb6, 0xb2, 0x3d, 0x3f, 0x68, + 0x58, 0x7c, 0x85, 0xa6, 0x76, 0x99, 0x2d, 0x4c, 0x77, 0x7b, 0xc4, 0xc5, + 0x8a, 0x92, 0xa8, 0x50, 0x32, 0x79, 0xa0, 0x99, 0x68, 0x99, 0x62, 0xbe, + 0xd7, 0x8b, 0xc9, 0x81, 0x6c, 0x88, 0xa2, 0x64, 0x97, 0xa5, 0x53, 0xe0, + 0x7e, 0x88, 0x63, 0x92, 0x61, 0x6a, 0x3f, 0x6b, 0x49, 0x44, 0x57, 0x5d, + 0x48, 0xbe, 0x75, 0xa6, 0xb2, 0x99, 0xc1, 0xd2, 0xc2, 0x61, 0x47, 0x58, + 0x7d, 0x87, 0x8b, 0xad, 0x6e, 0xb7, 0x55, 0xa7, 0xc7, 0x86, 0x85, 0x7e, + 0xc6, 0xa3, 0x3e, 0xa4, 0xbd, 0x97, 0x4c, 0x90, 0x56, 0x88, 0x42, 0x38, + 0x67, 0x60, 0xc6, 0x33, 0xc2, 0x5f, 0x35, 0x8f, 0x6a, 0xa1, 0x85, 0xa1, + 0x60, 0x79, 0xa7, 0x4a, 0x84, 0x60, 0x87, 0xac, 0xc9, 0xa4, 0x6f, 0x3e, + 0xcf, 0x56, 0x7f, 0x6e, 0xcf, 0x63, 0x45, 0x32, 0xce, 0x90, 0x8f, 0x9f, + 0x6a, 0x4c, 0xbc, 0x54, 0x6e, 0x44, 0x54, 0xb0, 0xb8, 0xca, 0x68, 0x9c, + 0xbf, 0x62, 0x61, 0x72, 0x8d, 0xb1, 0xc9, 0x4b, 0xb7, 0x66, 0x62, 0xcd, + 0x3c, 0x3c, 0xb9, 0x95, 0x50, 0x87, 0xaf, 0x68, 0x96, 0x73, 0xbf, 0x55, + 0x86, 0x9e, 0x64, 0x6c, 0x6b, 0xa0, 0xaa, 0xd3, 0x80, 0xb8, 0xbf, 0x2d, + 0x5b, 0x9b, 0x55, 0xc5, 0x51, 0x6c, 0x4d, 0xc3, 0xa9, 0xcb, 0x7b, 0xab, + 0x42, 0x58, 0xd5, 0x52, 0x71, 0x2c, 0x44, 0x52, 0x6e, 0xc1, 0xb2, 0xc4, + 0x7d, 0x6e, 0x99, 0xa9, 0x70, 0x46, 0xcf, 0x59, 0x63, 0xaf, 0x85, 0x7e, + 0xae, 0x68, 0x7a, 0x3b, 0xd2, 0x70, 0xc5, 0xab, 0x45, 0xaf, 0xa1, 0xbe, + 0x83, 0x9e, 0x75, 0x46, 0xbd, 0x69, 0xac, 0x86, 0xa1, 0x6d, 0x4d, 0x67, + 0x77, 0x68, 0xb8, 0x97, 0x9a, 0x72, 0x38, 0x8d, 0x53, 0x73, 0x6a, 0xd4, + 0x2e, 0xac, 0x8e, 0x6c, 0xcc, 0xad, 0xb4, 0x94, 0x50, 0x70, 0xc2, 0x52, + 0x4c, 0xb3, 0x72, 0x36, 0x5d, 0x83, 0x39, 0x46, 0x37, 0x6a, 0x94, 0x3a, + 0x8d, 0x3d, 0x76, 0x2d, 0xad, 0xa8, 0x6e, 0x96, 0x3c, 0x5f, 0x9a, 0x50, + 0x6d, 0x4f, 0x79, 0xc3, 0x87, 0x36, 0x66, 0x3a, 0x67, 0xc1, 0xb7, 0x5b, + 0x47, 0x81, 0xd2, 0x45, 0x7c, 0xc8, 0xd3, 0x53, 0x6f, 0x5e, 0x89, 0xab, + 0x41, 0x59, 0x37, 0x84, 0x72, 0x3a, 0xaa, 0x95, 0xc5, 0x95, 0x75, 0x9e, + 0xa2, 0xc5, 0x44, 0xbe, 0x2f, 0x40, 0x3f, 0xaa, 0xad, 0x9c, 0x70, 0x8e, + 0x3e, 0xc3, 0xb7, 0x6d, 0x7e, 0x54, 0xa5, 0x5c, 0xa1, 0x60, 0x46, 0x81, + 0x9f, 0xb2, 0x76, 0x3a, 0xcd, 0xb5, 0x3a, 0x55, 0x51, 0xbd, 0x78, 0x78, + 0x73, 0x41, 0x80, 0x9c, 0x4e, 0x65, 0x36, 0x39, 0x2e, 0xbd, 0x80, 0x2f, + 0x95, 0x5b, 0x40, 0x4e, 0x93, 0x91, 0xc2, 0x50, 0xb1, 0xc0, 0xa0, 0x42, + 0x87, 0x71, 0x74, 0xbf, 0xbe, 0x3f, 0x60, 0x84, 0x50, 0xac, 0x67, 0x4d, + 0x97, 0xac, 0x34, 0x6b, 0xc1, 0x7c, 0x4c, 0xab, 0x36, 0xbb, 0xa6, 0x4b, + 0xa4, 0x3c, 0x7c, 0xbe, 0x56, 0x8d, 0xb7, 0xc1, 0x89, 0x5f, 0x58, 0x88, + 0xa7, 0xc7, 0x68, 0x43, 0xa6, 0xcc, 0x59, 0x5d, 0x8f, 0xa4, 0xa8, 0xd0, + 0x4f, 0xcc, 0x7b, 0x4f, 0x60, 0x35, 0x77, 0x74, 0x2f, 0x8b, 0x9c, 0xa8, + 0x34, 0x52, 0x56, 0xbb, 0x80, 0x96, 0xa0, 0x8d, 0xcc, 0x7e, 0x5d, 0x45, + 0x6b, 0xa3, 0x3d, 0xc1, 0x31, 0x43, 0x5a, 0xa4, 0xae, 0x50, 0x3b, 0x71, + 0xc9, 0x8b, 0x58, 0x9d, 0x64, 0x60, 0xaa, 0x96, 0x6e, 0x78, 0xc7, 0xb7, + 0x96, 0x53, 0x54, 0xbf, 0x41, 0x9a, 0x88, 0xc5, 0x57, 0x94, 0xc2, 0xcd, + 0xae, 0x49, 0x7a, 0x35, 0xc5, 0x5a, 0x94, 0xb4, 0x40, 0xbe, 0xcb, 0xc5, + 
0x34, 0x8e, 0xd6, 0x44, 0x30, 0xa5, 0x68, 0x50, 0x85, 0xc5, 0x7f, 0x8e, + 0x45, 0xc4, 0xb7, 0x7f, 0x38, 0xb9, 0xa8, 0xa2, 0xcb, 0xa5, 0x50, 0x6a, + 0x81, 0xac, 0x78, 0xbe, 0xa7, 0xb3, 0x50, 0x66, 0x3e, 0x74, 0x56, 0x6a, + 0x7b, 0xab, 0xc2, 0x44, 0x3d, 0x2c, 0x73, 0xac, 0x34, 0x2e, 0x86, 0xa8, + 0x46, 0x45, 0x43, 0xc5, 0x3e, 0x6a, 0x7b, 0x53, 0xac, 0xa1, 0x8c, 0x35, + 0x91, 0x81, 0xaf, 0x5b, 0x4a, 0x83, 0xc8, 0xb3, 0xb5, 0xa8, 0xb8, 0x71, + 0xb8, 0x68, 0x60, 0xb9, 0x33, 0xa5, 0x91, 0x45, 0xb0, 0x3e, 0x85, 0xc5, + 0x66, 0x93, 0xc5, 0x69, 0x96, 0x89, 0xc7, 0x82, 0x38, 0x77, 0xae, 0xb7, + 0x74, 0x97, 0x51, 0x59, 0x8a, 0x3b, 0x3d, 0xba, 0xaf, 0x95, 0xc3, 0xb5, + 0x9a, 0x4c, 0xbd, 0x39, 0x66, 0xa1, 0x5c, 0x58, 0x33, 0x93, 0x8d, 0x33, + 0x5b, 0x83, 0x94, 0x87, 0x9f, 0x6f, 0xb4, 0x95, 0x34, 0x40, 0x36, 0xaa, + 0x6c, 0x49, 0xb7, 0x2b, 0xb1, 0xbc, 0x39, 0xbe, 0xc0, 0x75, 0x93, 0x87, + 0xb1, 0x9b, 0x68, 0xbb, 0xa7, 0xba, 0x5a, 0x55, 0xbc, 0xb4, 0xb7, 0x44, + 0x69, 0x46, 0x51, 0x75, 0xca, 0xb0, 0xb2, 0x6c, 0x8e, 0xaf, 0xba, 0x46, + 0xae, 0xae, 0x8e, 0xa2, 0x5d, 0x7f, 0xbf, 0x70, 0xa8, 0x5a, 0xcd, 0x49, + 0xa3, 0x9a, 0xb3, 0x50, 0x81, 0x85, 0xc6, 0x50, 0x74, 0x96, 0xa6, 0x49, + 0x79, 0x89, 0x7b, 0x62, 0xca, 0x48, 0x6d, 0x57, 0x2e, 0x95, 0x9f, 0x59, + 0xae, 0x6a, 0xce, 0x54, 0x80, 0x89, 0x9a, 0xa3, 0xa1, 0x54, 0xa1, 0x4e, + 0x40, 0xcb, 0xb4, 0x3e, 0x45, 0xc5, 0xcb, 0x32, 0x41, 0x90, 0x2f, 0x42, + 0x4c, 0x60, 0x37, 0x47, 0x33, 0x96, 0xb0, 0x3d, 0x66, 0x38, 0x79, 0x65, + 0x77, 0x4b, 0xa4, 0x60, 0x54, 0xac, 0xa2, 0x42, 0x73, 0x35, 0x60, 0x5e, + 0x6d, 0x68, 0xc5, 0x97, 0xc5, 0x2e, 0x72, 0x2e, 0x58, 0x43, 0xaf, 0xb3, + 0xaa, 0x35, 0xce, 0xc7, 0x62, 0x37, 0xa8, 0xc2, 0x95, 0x5c, 0xc0, 0x72, + 0x65, 0xb0, 0xba, 0x65, 0x59, 0xc7, 0x72, 0x6c, 0x5e, 0x42, 0x34, 0x86, + 0xb6, 0x58, 0xa6, 0x30, 0xbf, 0xcc, 0xc2, 0x3b, 0x5c, 0x82, 0xa7, 0x89, + 0x5e, 0x85, 0x7f, 0x82, 0x41, 0xcb, 0x3b, 0x97, 0x67, 0x8c, 0x5a, 0x74, + 0x5d, 0xc9, 0xc5, 0x59, 0x99, 0x71, 0x85, 0xb8, 0xb0, 0xb1, 0xc2, 0xc6, + 0x37, 0x70, 0x8c, 0x9c, 0xc9, 0x2d, 0x8a, 0x6c, 0x36, 0x8f, 0x48, 0x54, + 0x67, 0xbe, 0xb3, 0xb8, 0x4b, 0x76, 0xc6, 0x6c, 0x9e, 0xda, 0xa6, 0x98, + 0x86, 0xaa, 0x9f, 0xb0, 0xcd, 0x8c, 0xbe, 0x72, 0x93, 0xc2, 0x46, 0x8c, + 0x90, 0x71, 0x4f, 0x4c, 0x82, 0x9e, 0x42, 0x31, 0x75, 0x77, 0x97, 0xc8, + 0x48, 0xa4, 0xc1, 0x73, 0x37, 0x42, 0x41, 0xd7, 0x88, 0x5b, 0x36, 0x79, + 0x96, 0x7a, 0x9a, 0x72, 0xb4, 0x4e, 0xae, 0x2f, 0xa9, 0x89, 0x95, 0xc2, + 0x94, 0x81, 0x4a, 0xbf, 0x94, 0xcc, 0x26, 0x98, 0x5a, 0xa5, 0x57, 0x41, + 0x85, 0xbd, 0x43, 0x3f, 0x77, 0x7b, 0x54, 0x9c, 0x37, 0x4c, 0xbe, 0x3c, + 0xc9, 0xb8, 0xbc, 0xb0, 0x49, 0x53, 0x97, 0xa2, 0x2a, 0x67, 0x4e, 0x4b, + 0x43, 0x46, 0x6b, 0x7e, 0xc7, 0x6f, 0x9e, 0xb1, 0x6b, 0xa5, 0x54, 0x8f, + 0xb2, 0x66, 0x62, 0x30, 0x6d, 0x67, 0x51, 0xc3, 0x6a, 0x6d, 0x5b, 0x71, + 0x87, 0x63, 0xb2, 0x6c, 0xb8, 0xc7, 0xc9, 0x2d, 0xb9, 0x2c, 0x98, 0xc7, + 0xb9, 0x5e, 0x8c, 0xb6, 0x38, 0x9c, 0x4c, 0xc1, 0x76, 0x9e, 0x45, 0xcc, + 0x72, 0x86, 0x5e, 0x4f, 0xd6, 0x67, 0x90, 0x66, 0xa9, 0xbb, 0x5e, 0x9e, + 0xa0, 0x85, 0x80, 0xc1, 0x82, 0xaf, 0x7c, 0xa5, 0xb0, 0xba, 0xb2, 0x4c, + 0x71, 0xe6, 0xa4, 0xd3, 0xbb, 0xa5, 0xba, 0xae, 0xb3, 0x45, 0x4c, 0x46, + 0xc9, 0xc7, 0x74, 0x65, 0x95, 0x47, 0x92, 0x60, 0x61, 0x76, 0xc1, 0x7e, + 0x58, 0x69, 0x57, 0x5f, 0x3a, 0x36, 0x97, 0x50, 0x9e, 0xaa, 0x73, 0xc3, + 0x99, 0x5b, 0xa9, 0xb3, 0x5f, 0x37, 0x5a, 0x87, 0x36, 0xaa, 0x49, 0x74, + 0x82, 0xa6, 0x92, 0x4f, 0x40, 0x50, 0x70, 0xb2, 0x7f, 0x4b, 0x7d, 0xb2, + 0x35, 0x73, 0xc9, 0x62, 0x42, 0x51, 0x56, 0x90, 0x8d, 0xa8, 0xba, 0xc4, + 
0xc9, 0x42, 0xa8, 0x4d, 0x76, 0x3a, 0xd2, 0x99, 0x79, 0xcc, 0x33, 0xa6, + 0x6b, 0xbe, 0x2f, 0x8f, 0x86, 0x68, 0xaa, 0x58, 0x80, 0x47, 0xa6, 0xa5, + 0x76, 0xb0, 0xd0, 0x41, 0x95, 0xab, 0x33, 0x74, 0x58, 0x3d, 0xc1, 0xbe, + 0x7c, 0xbd, 0x3b, 0xb5, 0x5d, 0x7a, 0xc1, 0x34, 0x79, 0x5a, 0x49, 0x7e, + 0xd8, 0x8d, 0x9b, 0x43, 0x57, 0x45, 0xb4, 0xb5, 0x50, 0x50, 0xaa, 0x5a, + 0x90, 0xd7, 0xc6, 0x31, 0x75, 0x68, 0xa1, 0xc2, 0x72, 0x40, 0x4d, 0xca, + 0x7f, 0x90, 0x5e, 0x82, 0x5e, 0x77, 0x7f, 0xba, 0x70, 0x83, 0x44, 0xbb, + 0x5d, 0xbe, 0x97, 0xc4, 0xa1, 0xa5, 0xd0, 0xd5, 0x4d, 0x79, 0xa1, 0xd2, + 0x98, 0x7d, 0x63, 0x33, 0x8e, 0x75, 0x99, 0x9c, 0x46, 0xb6, 0x50, 0x5a, + 0xcc, 0x5c, 0x7c, 0xa5, 0x45, 0x83, 0x43, 0x8d, 0x58, 0x52, 0x81, 0xaf, + 0x99, 0xb7, 0x9b, 0x92, 0x40, 0x52, 0x4a, 0x62, 0xae, 0x37, 0x1d, 0x97, + 0x93, 0x3b, 0x58, 0x90, 0xc7, 0x78, 0x95, 0x41, 0xa1, 0x6b, 0x63, 0x99, + 0x92, 0x86, 0x5b, 0x35, 0x72, 0x7f, 0x3d, 0xca, 0x5b, 0x48, 0x65, 0x3c, + 0x45, 0x7d, 0xa8, 0x99, 0xa8, 0x61, 0xb5, 0xa0, 0x9f, 0xa1, 0xaa, 0xa3, + 0x9b, 0x75, 0x7e, 0x8f, 0x59, 0x97, 0x99, 0x71, 0xbf, 0xaa, 0x9c, 0x64, + 0x81, 0xa3, 0x94, 0xcb, 0x5d, 0x36, 0xa6, 0xbb, 0x7e, 0x4b, 0x3b, 0x6d, + 0x66, 0x9e, 0x8b, 0xc8, 0xc5, 0x74, 0x40, 0x54, 0x64, 0xb1, 0x74, 0x58, + 0xb7, 0x30, 0x49, 0x3a, 0x97, 0x31, 0x84, 0x4d, 0x7f, 0x38, 0xc5, 0xc5, + 0x65, 0x6b, 0xaf, 0x6e, 0xa8, 0xba, 0xba, 0xc9, 0x97, 0x3d, 0x7a, 0x6c, + 0x7a, 0x6c, 0x6c, 0xb4, 0x91, 0x3c, 0x32, 0x78, 0x33, 0xc0, 0x7e, 0x58, + 0xa5, 0xc8, 0x7b, 0x80, 0x6a, 0x9f, 0xbe, 0x3c, 0xb1, 0xb1, 0x94, 0xd3, + 0x58, 0xbb, 0x40, 0x8b, 0x53, 0x5c, 0x5f, 0x72, 0xd0, 0xa4, 0xc3, 0x37, + 0xbc, 0x9d, 0xad, 0x8d, 0x3e, 0xba, 0x43, 0x68, 0xd7, 0xc6, 0xcf, 0x6a, + 0xcb, 0xcb, 0x72, 0x86, 0xc6, 0x8f, 0xb9, 0x86, 0x71, 0x31, 0x32, 0x59, + 0x30, 0xcf, 0xc3, 0xc4, 0xbc, 0x91, 0x64, 0x64, 0x80, 0x59, 0xa2, 0x2c, + 0xa4, 0xb5, 0xb0, 0xc9, 0x79, 0xc5, 0x6c, 0x56, 0x94, 0x3a, 0x43, 0x80, + 0x7e, 0xa0, 0xb3, 0x72, 0x54, 0xc1, 0x4a, 0x5f, 0xbf, 0x6f, 0xda, 0xbf, + 0x5d, 0xa0, 0x7e, 0xaf, 0x53, 0xb0, 0x45, 0x7a, 0x36, 0xc6, 0x46, 0xb9, + 0x9f, 0x83, 0x75, 0x46, 0x6a, 0xa1, 0x9c, 0xc6, 0x4a, 0xba, 0x69, 0xad, + 0x59, 0x59, 0x90, 0xaa, 0xd7, 0x64, 0x3b, 0x5b, 0x41, 0x4f, 0x2f, 0x6f, + 0xb2, 0xc2, 0x72, 0x7b, 0x87, 0x6e, 0x8a, 0xad, 0xad, 0xa7, 0x67, 0x8f, + 0x79, 0xb6, 0x33, 0x5c, 0x3e, 0x86, 0x8c, 0x2c, 0x97, 0x57, 0x5a, 0x5f, + 0x69, 0x3e, 0x77, 0x41, 0x90, 0xab, 0x77, 0xc6, 0x33, 0xe1, 0x5c, 0x65, + 0x3f, 0xcd, 0xb0, 0x44, 0xb6, 0xa4, 0x58, 0x33, 0x54, 0x4e, 0x96, 0x40, + 0x6d, 0xc0, 0x8f, 0x5b, 0x67, 0x7e, 0x98, 0x5a, 0x35, 0x99, 0x5d, 0x3f, + 0x7a, 0xb5, 0xd0, 0xd5, 0x4d, 0x70, 0xac, 0xcc, 0x38, 0x78, 0xaf, 0xcf, + 0x83, 0x91, 0xc7, 0xad, 0x95, 0x4a, 0x83, 0xbd, 0x59, 0x88, 0x7a, 0xb9, + 0x61, 0xa7, 0x72, 0x89, 0x2f, 0xcc, 0x9c, 0xb5, 0x60, 0x98, 0x3d, 0xa6, + 0x46, 0x77, 0x40, 0x72, 0x39, 0xc2, 0x4c, 0x8e, 0x52, 0xc8, 0x8d, 0xb0, + 0xbd, 0xae, 0xbb, 0x3e, 0x7b, 0x3b, 0x40, 0xbc, 0x59, 0xb2, 0xa0, 0x98, + 0x3f, 0x42, 0xc8, 0x87, 0xcc, 0x58, 0xb4, 0xad, 0xcc, 0x99, 0x50, 0x62, + 0xb5, 0x5e, 0x89, 0x83, 0x45, 0x4e, 0x97, 0x67, 0x87, 0xac, 0xd0, 0xd2, + 0x70, 0x4c, 0x7b, 0x2a, 0xc6, 0x51, 0x6d, 0x73, 0xa7, 0xab, 0x80, 0x41, + 0x5b, 0x5a, 0xb4, 0x74, 0x60, 0x85, 0xb0, 0xb3, 0x7f, 0xbd, 0xb3, 0xd7, + 0x5f, 0xc4, 0x4e, 0x61, 0x90, 0x40, 0x55, 0x63, 0x74, 0x82, 0x77, 0x84, + 0x23, 0x81, 0x74, 0xd4, 0x3a, 0x7f, 0x86, 0x9d, 0xbb, 0x54, 0x7d, 0x77, + 0x85, 0x52, 0xc8, 0x8b, 0xc0, 0xc1, 0x3a, 0x89, 0x6b, 0xb4, 0xd8, 0x65, + 0xa6, 0x4c, 0x47, 0x44, 0x99, 0x6f, 0x5a, 0x8f, 0x86, 0x7b, 0x37, 0x70, + 
0x77, 0x54, 0x38, 0x74, 0x68, 0x76, 0x50, 0xae, 0xa7, 0xaf, 0x6a, 0x5b, + 0x4e, 0xc8, 0x74, 0xc8, 0x71, 0x47, 0x62, 0x82, 0xcd, 0x6b, 0xd1, 0x40, + 0xb3, 0xd1, 0xb9, 0xb0, 0x81, 0x69, 0x9d, 0xab, 0x5b, 0x37, 0xab, 0x30, + 0x31, 0xaf, 0xc0, 0xb3, 0x52, 0x4d, 0xcf, 0x50, 0xc9, 0xaa, 0x57, 0xa2, + 0xad, 0x49, 0x9a, 0xa8, 0x3d, 0x60, 0x3e, 0x6d, 0x8a, 0xcc, 0x71, 0x34, + 0xa2, 0x31, 0xa4, 0x6f, 0xa0, 0x82, 0x34, 0x6f, 0xaf, 0xa9, 0xb3, 0x93, + 0x90, 0xcf, 0xda, 0x79, 0x99, 0xbf, 0x4f, 0xc3, 0xc9, 0x56, 0x5d, 0x6f, + 0xb8, 0xd4, 0x51, 0xa2, 0xc6, 0x43, 0x87, 0xb0, 0xb9, 0x7a, 0x56, 0x86, + 0x7e, 0x80, 0xbb, 0x58, 0x71, 0x50, 0xc7, 0xd4, 0x7c, 0x42, 0x83, 0x98, + 0x96, 0xa1, 0xa6, 0x8f, 0xc2, 0xbc, 0x4c, 0x99, 0x43, 0x4a, 0xa5, 0x42, + 0x95, 0x61, 0x6e, 0x3c, 0x3f, 0x46, 0x39, 0x3c, 0xd3, 0x79, 0x41, 0x8b, + 0xc7, 0x90, 0xae, 0x6f, 0xaf, 0x4c, 0x86, 0xba, 0x3b, 0x64, 0xaa, 0x61, + 0x8f, 0x67, 0x74, 0x5f, 0x5d, 0x93, 0xb8, 0x4b, 0xb2, 0xbe, 0x62, 0x55, + 0xcf, 0xb1, 0x3a, 0x7e, 0xcd, 0xa8, 0xae, 0x3a, 0x9f, 0xad, 0x56, 0xca, + 0xcb, 0xa0, 0x52, 0x6c, 0xb3, 0xae, 0x50, 0xbd, 0x97, 0x45, 0xaf, 0x48, + 0xb2, 0x5c, 0xd0, 0xc2, 0xce, 0x27, 0x5e, 0xa1, 0x3e, 0xd7, 0x77, 0x7d, + 0x81, 0xb5, 0x4a, 0x8c, 0x97, 0x9d, 0x4a, 0x86, 0x3f, 0x4e, 0x30, 0x8f, + 0xc5, 0x85, 0x3c, 0x8e, 0x4d, 0xac, 0x47, 0x6e, 0x80, 0x83, 0xc7, 0x3a, + 0x78, 0x56, 0x50, 0xbf, 0xce, 0x2f, 0xa9, 0xcd, 0x60, 0x90, 0x88, 0x3f, + 0x7b, 0x9c, 0x59, 0x42, 0xb9, 0xd1, 0x63, 0x82, 0x84, 0xcc, 0x60, 0xa7, + 0xab, 0xbf, 0x8f, 0xba, 0x9d, 0x34, 0x92, 0x93, 0x6e, 0x7a, 0x57, 0x56, + 0x96, 0x69, 0x40, 0x68, 0x56, 0x74, 0x46, 0x99, 0x43, 0x89, 0x36, 0x4b, + 0x3d, 0x8e, 0x74, 0xb1, 0x30, 0x94, 0xcb, 0x73, 0x4e, 0x86, 0x6a, 0xb0, + 0x7d, 0x42, 0xb1, 0x6a, 0x65, 0x3f, 0x8f, 0x3d, 0x95, 0x70, 0x44, 0x9c, + 0x85, 0x52, 0xc6, 0x4e, 0xb9, 0x93, 0x9e, 0x78, 0x49, 0x66, 0x68, 0xbd, + 0x87, 0x7f, 0x78, 0xa9, 0xc9, 0xa8, 0x96, 0xa5, 0x36, 0x83, 0x64, 0x41, + 0x51, 0x6f, 0xb8, 0xba, 0x92, 0x64, 0x58, 0x52, 0x46, 0x6d, 0x6e, 0x9b, + 0x48, 0x5f, 0x62, 0x3d, 0x4b, 0xa7, 0xce, 0x79, 0x33, 0xa4, 0xcb, 0x8d, + 0x5f, 0xa8, 0xa3, 0x4d, 0x6f, 0x9f, 0xc6, 0x5b, 0x2d, 0x91, 0x7f, 0xb1, + 0x85, 0xc0, 0x8e, 0xbf, 0xd4, 0x66, 0x6f, 0x9d, 0x39, 0xab, 0x39, 0x49, + 0x72, 0x75, 0xab, 0x8a, 0x3a, 0xd1, 0x4a, 0x77, 0x5c, 0x87, 0x94, 0x3b, + 0x2a, 0x68, 0x2d, 0xa0, 0xb1, 0x6e, 0x98, 0x33, 0xc9, 0xbc, 0xa8, 0x84, + 0x48, 0xc5, 0x4b, 0x6e, 0x75, 0x68, 0x64, 0x67, 0x3c, 0x66, 0xb9, 0x63, + 0x9f, 0xc1, 0x67, 0x3d, 0xb9, 0x9b, 0x3f, 0xb5, 0xb7, 0x9c, 0x49, 0xb0, + 0x4d, 0xa5, 0xac, 0x7b, 0x45, 0x66, 0x93, 0x51, 0xa3, 0xb8, 0x85, 0x89, + 0xab, 0xc9, 0x7a, 0xc9, 0x7d, 0x99, 0x40, 0x56, 0xca, 0x3e, 0x74, 0x9b, + 0x72, 0x39, 0xc4, 0x41, 0xd4, 0x4e, 0x89, 0x98, 0x55, 0xc3, 0x62, 0x4b, + 0x46, 0xa1, 0x78, 0x36, 0x3a, 0x94, 0x75, 0x9a, 0x4b, 0x66, 0xd0, 0xaf, + 0x97, 0xd3, 0x89, 0xcc, 0x5d, 0x9b, 0x90, 0xa6, 0x6d, 0x47, 0x94, 0xb2, + 0x6d, 0x47, 0x55, 0x41, 0xb6, 0x4d, 0x71, 0x31, 0xa3, 0x7e, 0xbd, 0x65, + 0x4e, 0x83, 0x85, 0xb5, 0x93, 0xb6, 0xc6, 0x93, 0x74, 0xa3, 0xad, 0x6b, + 0x79, 0xce, 0xb4, 0x6b, 0xc1, 0x41, 0xbc, 0x70, 0x37, 0xc0, 0x54, 0x38, + 0xbc, 0x5c, 0xbf, 0x94, 0x40, 0x5e, 0xb3, 0x6b, 0x64, 0x44, 0x58, 0x58, + 0x3a, 0x72, 0xc8, 0x54, 0xc0, 0x7c, 0x92, 0x62, 0x45, 0x85, 0xb4, 0x58, + 0x75, 0xa4, 0xa7, 0x56, 0x6b, 0xa7, 0x89, 0x62, 0x63, 0x49, 0x75, 0xa8, + 0xac, 0xc3, 0xaf, 0x52, 0x5e, 0x49, 0x3d, 0xa0, 0xc4, 0x40, 0x38, 0x33, + 0x94, 0x64, 0xb6, 0x32, 0x52, 0x63, 0x4e, 0xac, 0x7d, 0x93, 0x6a, 0xb1, + 0x85, 0x63, 0x72, 0x5a, 0x7a, 0x54, 0x8f, 0x63, 0xbe, 0x7e, 0xc9, 0x3a, + 
0x46, 0x3d, 0x4d, 0xca, 0x5d, 0x76, 0xa7, 0x91, 0x3e, 0x4c, 0x3f, 0xa1, + 0xb7, 0x84, 0x9b, 0x3c, 0x3d, 0x69, 0xb4, 0x57, 0xb1, 0xa7, 0x69, 0xb4, + 0x7d, 0x61, 0x8d, 0xaf, 0x96, 0x6f, 0x8a, 0x33, 0x5b, 0x59, 0x7c, 0x7c, + 0xce, 0x4d, 0x47, 0x95, 0x4b, 0xa1, 0x47, 0xa3, 0xa8, 0x6a, 0xcc, 0xbd, + 0x5b, 0x79, 0x78, 0x91, 0x5d, 0x69, 0xc9, 0xb7, 0x7a, 0x85, 0x99, 0x51, + 0x7c, 0x6e, 0xaf, 0xc0, 0x61, 0x48, 0x6f, 0x6c, 0xcd, 0xc3, 0x79, 0xac, + 0x6c, 0x7e, 0xb1, 0x46, 0xb1, 0x66, 0x91, 0x43, 0x59, 0xc7, 0x94, 0x80, + 0x87, 0x8b, 0xa8, 0xcb, 0xbb, 0x49, 0x57, 0x41, 0xb6, 0x4d, 0x37, 0xa8, + 0x56, 0x54, 0x5c, 0x5e, 0xcd, 0xb3, 0xc0, 0xc5, 0x75, 0xa8, 0x6f, 0x69, + 0xc4, 0x7b, 0xc1, 0xa7, 0x64, 0x94, 0xac, 0xb1, 0x3e, 0x86, 0xb4, 0x6b, + 0x79, 0x3e, 0x97, 0xa9, 0x34, 0x69, 0xaa, 0x55, 0x40, 0x4f, 0x89, 0x97, + 0x47, 0x3a, 0x91, 0x94, 0x65, 0x35, 0x6f, 0x62, 0x59, 0xb9, 0x7f, 0x5f, + 0xb7, 0x95, 0x6d, 0x43, 0x9e, 0x34, 0xb8, 0x65, 0x82, 0x3d, 0x9f, 0xa4, + 0x96, 0x65, 0xbe, 0x90, 0x5c, 0xa2, 0x45, 0x90, 0x45, 0x50, 0xd3, 0x5d, + 0xa4, 0x99, 0x8a, 0x9d, 0x38, 0x57, 0x74, 0x5a, 0xad, 0x93, 0x85, 0x99, + 0xca, 0x79, 0x88, 0x55, 0x3e, 0x4e, 0x45, 0xbf, 0xc3, 0x49, 0xab, 0x6a, + 0xb8, 0x2e, 0x8d, 0x52, 0x38, 0xca, 0xca, 0xcd, 0x37, 0xbe, 0x93, 0x48, + 0xb7, 0x80, 0x37, 0x6d, 0x37, 0x5d, 0x6c, 0xa3, 0x91, 0x40, 0x54, 0x33, + 0x96, 0x4b, 0x8f, 0x88, 0x57, 0xce, 0x32, 0xc4, 0x70, 0x3b, 0xa6, 0x52, + 0xc1, 0xbb, 0x54, 0x38, 0x51, 0xb7, 0x84, 0x85, 0x4d, 0x47, 0x73, 0x74, + 0xa3, 0x36, 0x41, 0x65, 0xbe, 0x7a, 0x76, 0x92, 0x9d, 0x89, 0x81, 0x55, + 0xb0, 0x53, 0x96, 0xa7, 0xbf, 0x86, 0x85, 0x6f, 0x90, 0xa5, 0x43, 0x86, + 0xca, 0xa1, 0x67, 0x88, 0xa9, 0x93, 0x71, 0xaa, 0x3a, 0x64, 0x9c, 0xa9, + 0xa5, 0xab, 0x90, 0x43, 0x6e, 0x9b, 0xad, 0x3f, 0x49, 0x33, 0x49, 0xd0, + 0x4e, 0x90, 0xaa, 0x31, 0x47, 0x3d, 0x79, 0x30, 0x7c, 0x4a, 0xc1, 0x8e, + 0x48, 0xa2, 0xda, 0x6f, 0x70, 0x44, 0xb4, 0xc7, 0x65, 0x8b, 0xaa, 0x87, + 0xa6, 0x8e, 0x9e, 0x33, 0xa5, 0x50, 0x89, 0x48, 0x66, 0x7c, 0x54, 0x7f, + 0x3d, 0x6c, 0xcb, 0x28, 0x3b, 0x86, 0x63, 0x9b, 0x3d, 0x44, 0xb0, 0x3a, + 0x38, 0x65, 0xd7, 0xaf, 0xac, 0x77, 0x8f, 0x88, 0x57, 0x8e, 0x99, 0xc6, + 0xb9, 0x89, 0xcb, 0xbd, 0x47, 0x49, 0x3f, 0xc4, 0xc3, 0xa6, 0xbb, 0x48, + 0xd4, 0x9b, 0xab, 0xa9, 0xc0, 0x7b, 0x89, 0xbb, 0x51, 0x69, 0x2a, 0xca, + 0x41, 0x4a, 0x34, 0x1e, 0x3f, 0xd0, 0x7a, 0x55, 0x43, 0x1c, 0x39, 0x79, + 0x64, 0xa5, 0xa2, 0x4d, 0x56, 0x77, 0x63, 0x95, 0xd3, 0xb2, 0x83, 0x79, + 0xa0, 0xda, 0xce, 0x90, 0x52, 0x7f, 0x9d, 0xd0, 0xa1, 0x34, 0xca, 0x58, + 0x83, 0x48, 0xb1, 0xcd, 0x84, 0x43, 0x6f, 0x62, 0x3e, 0xb6, 0x4d, 0xb5, + 0x5f, 0x32, 0x6b, 0x72, 0xaf, 0x94, 0x91, 0xd4, 0x84, 0xb5, 0xd9, 0xb7, + 0x51, 0x31, 0x70, 0xad, 0x7d, 0xb6, 0x66, 0x67, 0x94, 0xa7, 0x16, 0x6b, + 0xa8, 0x98, 0xcb, 0xa1, 0x4b, 0x92, 0x44, 0xd0, 0xb0, 0x3b, 0xb2, 0x54, + 0x99, 0x26, 0x35, 0x8b, 0x74, 0xa5, 0x91, 0x4c, 0xb4, 0x56, 0x98, 0x7d, + 0x6b, 0x8a, 0x6a, 0x5e, 0x6c, 0x92, 0x4b, 0x80, 0xbb, 0xce, 0x9b, 0x33, + 0xac, 0xbf, 0x46, 0x49, 0xa5, 0xc7, 0xb2, 0x61, 0x57, 0x86, 0x97, 0x8c, + 0x39, 0x47, 0x83, 0x41, 0x68, 0x84, 0x7f, 0x78, 0x54, 0x4e, 0xbc, 0x7d, + 0x84, 0x61, 0x50, 0xb9, 0x55, 0x68, 0x8e, 0xc4, 0x68, 0x2b, 0x4b, 0xb8, + 0xcd, 0x42, 0xbe, 0x32, 0xa4, 0x88, 0x33, 0x76, 0xa2, 0xaa, 0x9d, 0xc7, + 0x82, 0x3a, 0xc4, 0x7b, 0xd1, 0x36, 0x88, 0x76, 0x70, 0xc8, 0x56, 0x51, + 0x90, 0x94, 0xad, 0x8c, 0x43, 0x40, 0xa3, 0x89, 0x49, 0xbf, 0xba, 0x3c, + 0xaf, 0x3e, 0x78, 0x83, 0x55, 0x5b, 0x55, 0x53, 0xb0, 0x91, 0xb4, 0xb0, + 0x5b, 0x7e, 0x83, 0xbe, 0x6a, 0x85, 0xbb, 0xc4, 0xd4, 0x5d, 0x54, 0x84, + 
0x4c, 0xc8, 0x77, 0x6e, 0xa9, 0x45, 0x48, 0x80, 0xa9, 0x7e, 0xa6, 0xb2, + 0x45, 0x6b, 0xd0, 0x30, 0x68, 0x82, 0x6e, 0xac, 0x62, 0xb4, 0x8d, 0x74, + 0xd0, 0x52, 0x99, 0x73, 0x54, 0x81, 0x7d, 0x61, 0x3a, 0xc3, 0x71, 0xc2, + 0x9e, 0x6c, 0xae, 0xd0, 0xb4, 0xba, 0x30, 0x9e, 0xc9, 0xbb, 0x83, 0x35, + 0x53, 0x5c, 0x45, 0xd0, 0xc0, 0xc1, 0x3d, 0x63, 0xa4, 0x40, 0x98, 0xd0, + 0x60, 0xcc, 0xa5, 0x30, 0x92, 0xc9, 0x69, 0x8e, 0x7f, 0x58, 0x9f, 0x49, + 0x8b, 0x39, 0x58, 0x3c, 0xce, 0x64, 0x6d, 0x59, 0x6b, 0xae, 0x66, 0x35, + 0xcb, 0x8a, 0x45, 0x3d, 0xa5, 0xcc, 0xa3, 0x7f, 0x5d, 0x71, 0xba, 0x34, + 0xcb, 0x4e, 0x98, 0x4a, 0x68, 0x6d, 0x46, 0x67, 0x80, 0xc8, 0x6e, 0xb0, + 0xaf, 0xc7, 0x66, 0x8f, 0xc5, 0x57, 0x5b, 0xb8, 0x41, 0xbb, 0x81, 0x61, + 0xb1, 0x8c, 0x60, 0x80, 0xb3, 0x55, 0xb6, 0x7a, 0x9c, 0x9b, 0x52, 0x82, + 0x74, 0x3d, 0x3f, 0x6b, 0xd8, 0xac, 0xa6, 0x5a, 0x6d, 0x78, 0xbd, 0x5f, + 0xba, 0x82, 0xb4, 0x4a, 0xa9, 0x38, 0x50, 0x98, 0x6a, 0xb1, 0xac, 0x91, + 0xc1, 0xbd, 0xad, 0x93, 0x4b, 0x65, 0xd2, 0xd6, 0xbd, 0x4c, 0xc0, 0x7c, + 0xa6, 0x4a, 0x24, 0xb3, 0x86, 0x67, 0x62, 0x96, 0x7a, 0xc8, 0x54, 0x77, + 0x8f, 0xc0, 0x40, 0x92, 0x9e, 0x3c, 0xa5, 0x7d, 0xd2, 0x46, 0x8f, 0x95, + 0xd7, 0x4b, 0x85, 0xa0, 0x4f, 0x73, 0xcb, 0x98, 0x71, 0xc3, 0x40, 0x9b, + 0xcd, 0x35, 0xa0, 0x82, 0xcc, 0xa2, 0xb3, 0x87, 0xa5, 0x6e, 0xcb, 0x34, + 0x9b, 0x3b, 0x91, 0x73, 0x8b, 0x4a, 0x5c, 0x7f, 0x9e, 0x80, 0x4a, 0xb9, + 0xa2, 0x52, 0x44, 0xa6, 0x8a, 0x76, 0xaf, 0x4e, 0x99, 0xae, 0xbb, 0xce, + 0x99, 0x7b, 0xc2, 0x3e, 0xae, 0xa0, 0x32, 0x4b, 0xcc, 0x4a, 0x61, 0x6a, + 0x35, 0x4b, 0x46, 0x75, 0x4c, 0x95, 0xdc, 0x88, 0xc1, 0x32, 0x7a, 0x93, + 0x60, 0x92, 0x35, 0x88, 0x50, 0x72, 0x7d, 0x8a, 0x5c, 0x53, 0x8f, 0x52, + 0x74, 0xbc, 0x9f, 0x8b, 0xab, 0xad, 0x2d, 0xa5, 0x9e, 0x3b, 0x77, 0x63, + 0x42, 0x6f, 0x9f, 0xaa, 0xcf, 0x83, 0xc1, 0x8a, 0xb3, 0xcb, 0x98, 0x6d, + 0x4a, 0xcb, 0xa4, 0x4b, 0x4f, 0x83, 0x44, 0x67, 0xc7, 0x62, 0x54, 0x48, + 0xda, 0xb9, 0x82, 0x87, 0xc0, 0xc2, 0x9d, 0xc5, 0xc2, 0x9a, 0x94, 0x9c, + 0x67, 0xc2, 0x67, 0x4c, 0x3c, 0x9b, 0x9f, 0x66, 0x52, 0x98, 0x62, 0x51, + 0x5e, 0x36, 0x36, 0xb1, 0x5c, 0x6d, 0xb9, 0xb0, 0x6a, 0x7f, 0x53, 0xa9, + 0x8c, 0x8e, 0x87, 0xe4, 0xa4, 0x83, 0x5c, 0x70, 0x2c, 0xd2, 0x50, 0x76, + 0x2f, 0x5c, 0xbc, 0x8f, 0x89, 0x9d, 0xb5, 0x61, 0xd6, 0xc5, 0x94, 0x55, + 0xd1, 0x7e, 0x7f, 0xa2, 0x85, 0x58, 0xa0, 0xaa, 0x3a, 0x9d, 0x7a, 0x84, + 0xb5, 0x86, 0xd5, 0x6f, 0x9e, 0x3e, 0x33, 0x82, 0x6f, 0x63, 0x46, 0xb5, + 0x73, 0x4b, 0x90, 0xa8, 0x87, 0xa5, 0x5b, 0xc8, 0xc2, 0xae, 0xc2, 0x4f, + 0x9f, 0x44, 0x6e, 0x87, 0x50, 0x5f, 0x48, 0xab, 0x8e, 0x44, 0xab, 0xd2, + 0xcd, 0xb3, 0x40, 0xb5, 0x85, 0xb9, 0xc0, 0x55, 0x7d, 0x3d, 0xa0, 0xbd, + 0x47, 0x84, 0x40, 0x70, 0x73, 0xc3, 0x38, 0xc3, 0x7c, 0xc4, 0xb2, 0x65, + 0x9a, 0x7d, 0x92, 0x9b, 0x5c, 0x6b, 0x9a, 0xc3, 0x65, 0xcb, 0x83, 0xa9, + 0x44, 0xbd, 0xd2, 0x7d, 0x3b, 0x6c, 0x48, 0x45, 0x83, 0x30, 0x36, 0x55, + 0x77, 0x6e, 0x65, 0x70, 0x93, 0x56, 0x5d, 0xb2, 0xb4, 0xcd, 0xaa, 0x6c, + 0xcc, 0x86, 0xa7, 0x4f, 0x5e, 0x99, 0x3d, 0xa0, 0x43, 0x98, 0x5c, 0x8b, + 0xb6, 0x5a, 0x64, 0x49, 0x94, 0xb4, 0xba, 0xcb, 0x34, 0x50, 0xca, 0x7c, + 0x51, 0x94, 0xb8, 0x9b, 0x75, 0x4e, 0x71, 0x68, 0x31, 0xa7, 0x58, 0x58, + 0xb0, 0x6a, 0x36, 0x3e, 0x50, 0x97, 0x8d, 0xc9, 0x81, 0x50, 0x3b, 0xb1, + 0xa4, 0xc1, 0xb1, 0x4f, 0x94, 0x6d, 0xbb, 0x42, 0x6c, 0x75, 0x6d, 0x8a, + 0x79, 0x32, 0xc2, 0xa5, 0x95, 0x6e, 0x77, 0xb9, 0xcc, 0x39, 0xb7, 0xa0, + 0xb8, 0x6b, 0x4d, 0x8b, 0x6a, 0xb9, 0xc3, 0x88, 0x6f, 0x6f, 0xc7, 0x70, + 0x4d, 0x66, 0x8e, 0xa5, 0x4d, 0x59, 0x40, 0xbe, 0xb5, 0x67, 0x6b, 0xb2, + 
0xab, 0x5b, 0x51, 0x33, 0xb1, 0x6b, 0xbe, 0x62, 0x83, 0x96, 0x39, 0x43, + 0x3b, 0x77, 0x58, 0x8b, 0x5e, 0x47, 0xa8, 0x99, 0xa4, 0x9e, 0x89, 0xbc, + 0x80, 0x3a, 0x4f, 0x67, 0x8a, 0xa4, 0x6a, 0xb2, 0xa4, 0x49, 0xc6, 0x95, + 0xa9, 0xae, 0xbb, 0x5c, 0x55, 0xb4, 0x3b, 0xb6, 0xaa, 0x7f, 0x7b, 0xb6, + 0x86, 0xb4, 0x7c, 0xc5, 0x46, 0x38, 0x85, 0x45, 0x36, 0x6e, 0x64, 0x3a, + 0x3b, 0x34, 0x7a, 0x7f, 0x97, 0x8c, 0x96, 0x5f, 0x76, 0x48, 0x54, 0x55, + 0xba, 0xa2, 0xb8, 0x7d, 0x3b, 0x4a, 0x69, 0x5f, 0x6c, 0x3d, 0x76, 0x50, + 0x3a, 0xb1, 0x6d, 0x44, 0x5a, 0xa9, 0xc4, 0x6b, 0x9f, 0x95, 0x88, 0x74, + 0xb0, 0x6a, 0x36, 0x5e, 0x8a, 0x6e, 0x7a, 0x74, 0x9c, 0x6b, 0xc3, 0xae, + 0xad, 0xba, 0xa1, 0x84, 0x3c, 0x75, 0xa8, 0x59, 0xb0, 0xcd, 0x52, 0x3a, + 0x56, 0xa2, 0x6c, 0x52, 0x83, 0x91, 0xad, 0x98, 0x39, 0x57, 0x67, 0xa9, + 0x31, 0xd1, 0xb8, 0x32, 0x8a, 0x63, 0x9e, 0xa0, 0x91, 0xc6, 0xb3, 0xb1, + 0x6d, 0x92, 0x79, 0x6c, 0x41, 0xb8, 0x85, 0x57, 0x7b, 0x7b, 0xa1, 0x50, + 0x76, 0x9c, 0x46, 0xb5, 0xa6, 0xa6, 0x83, 0xce, 0x62, 0x4d, 0xa2, 0x93, + 0xb0, 0x71, 0x6e, 0xbd, 0xb6, 0x41, 0x5d, 0x52, 0xa9, 0x85, 0x93, 0x39, + 0xb4, 0x7e, 0xcb, 0x6c, 0xcc, 0xc4, 0x71, 0xb9, 0x65, 0x68, 0x72, 0x59, + 0xae, 0xb9, 0x43, 0x60, 0x74, 0xb8, 0xce, 0xb4, 0x85, 0x9a, 0x70, 0x84, + 0x92, 0xc0, 0x85, 0x6c, 0x6d, 0x88, 0xbe, 0x90, 0xb4, 0x33, 0x34, 0xa8, + 0x75, 0x8f, 0x79, 0x8c, 0x72, 0x9a, 0x3b, 0x73, 0x3b, 0xa9, 0x33, 0xbf, + 0xbd, 0x66, 0xb4, 0x82, 0xb1, 0xcc, 0x7b, 0x9f, 0x69, 0xc4, 0xd2, 0xce, + 0xb8, 0x90, 0x30, 0x59, 0x3c, 0xc3, 0xa8, 0xc0, 0x48, 0x43, 0x56, 0x9f, + 0x53, 0x5b, 0x74, 0x5e, 0x5b, 0x52, 0x57, 0x68, 0xab, 0xb9, 0x5d, 0x56, + 0x62, 0xbe, 0x5f, 0x7b, 0x80, 0x86, 0xc0, 0x8c, 0xc1, 0x57, 0x72, 0x72, + 0xcb, 0x30, 0xb4, 0xba, 0x45, 0xae, 0x31, 0x89, 0xb3, 0x91, 0x3c, 0xb2, + 0x6a, 0x3e, 0xc2, 0x7b, 0x44, 0xa3, 0x64, 0x73, 0x9e, 0x4e, 0xb3, 0x3c, + 0x5f, 0xb7, 0x81, 0xb4, 0x57, 0x42, 0x82, 0x54, 0xad, 0xc9, 0xb4, 0x66, + 0x2f, 0x4d, 0x90, 0x5a, 0x70, 0xb6, 0x6c, 0x6b, 0x72, 0x3c, 0x6f, 0x3b, + 0x49, 0x88, 0xce, 0xad, 0x45, 0xa8, 0x7a, 0x46, 0x30, 0x47, 0xb7, 0x8f, + 0x56, 0xa2, 0x85, 0x7b, 0xae, 0x84, 0x3f, 0x56, 0x34, 0x5f, 0x41, 0x58, + 0x84, 0x41, 0x56, 0x54, 0xaa, 0x88, 0x72, 0xab, 0xa4, 0x81, 0xa3, 0xbe, + 0x33, 0xaf, 0x7c, 0x74, 0x66, 0x96, 0xcc, 0x92, 0x7e, 0x39, 0x44, 0x78, + 0x9b, 0xa3, 0x85, 0xac, 0xc5, 0xc1, 0x55, 0x75, 0xce, 0x42, 0x54, 0x5c, + 0x34, 0x75, 0x72, 0x96, 0xad, 0x72, 0x47, 0x46, 0xc0, 0x65, 0xc2, 0xca, + 0x65, 0xc1, 0x57, 0x5b, 0x81, 0x5f, 0x43, 0xa9, 0x6c, 0x34, 0x8b, 0x37, + 0x61, 0x89, 0xa7, 0xa8, 0xaf, 0xab, 0x3c, 0xb5, 0x6c, 0x33, 0xa7, 0x92, + 0xb0, 0xcb, 0x97, 0xa6, 0x64, 0xc8, 0x43, 0x86, 0x44, 0x47, 0x4e, 0x92, + 0x94, 0xb5, 0x3d, 0xa4, 0x9f, 0x60, 0xbb, 0xa7, 0xbd, 0x57, 0x32, 0x43, + 0x35, 0x30, 0x53, 0xaa, 0xc9, 0x58, 0xa4, 0x47, 0xa4, 0x3e, 0x84, 0x90, + 0xd3, 0x85, 0xb6, 0x6c, 0x88, 0x5c, 0x3c, 0x9f, 0x64, 0x8b, 0x7e, 0xbc, + 0x4e, 0x96, 0x42, 0x92, 0x57, 0x59, 0x56, 0xc3, 0xb2, 0x6b, 0xbe, 0x99, + 0x8b, 0x2d, 0xaf, 0x5b, 0xbb, 0x53, 0x63, 0x92, 0x51, 0x67, 0xba, 0x62, + 0x95, 0x41, 0x79, 0x8b, 0x6c, 0x41, 0x9f, 0x3e, 0xcd, 0x45, 0xa1, 0xbc, + 0xb4, 0x4a, 0xb4, 0x3f, 0x52, 0x49, 0x4d, 0x34, 0x46, 0x75, 0x8a, 0xc7, + 0xc6, 0x96, 0x5c, 0xad, 0x62, 0xb8, 0xa1, 0x6f, 0xb7, 0xd2, 0x3e, 0x8e, + 0x8b, 0xaf, 0x32, 0xd1, 0x65, 0x5d, 0x42, 0xc3, 0x57, 0x97, 0x75, 0x3c, + 0xc6, 0x43, 0x4f, 0x50, 0x96, 0x60, 0xad, 0x60, 0xb8, 0xc2, 0xc3, 0xae, + 0x4f, 0xa6, 0x9f, 0x5f, 0xaa, 0x64, 0xba, 0xa7, 0xaf, 0xc5, 0x7c, 0xab, + 0x6d, 0x68, 0x5f, 0x9a, 0x82, 0xa2, 0x52, 0xc8, 0x55, 0x54, 0xce, 0x57, + 
0xc8, 0x9d, 0x42, 0xbd, 0x63, 0xc0, 0x3e, 0x74, 0xcb, 0xb2, 0x71, 0x3c, + 0x34, 0x7e, 0x50, 0x7d, 0x77, 0x37, 0x38, 0x8b, 0xb5, 0xab, 0x5d, 0x8d, + 0x5b, 0x8a, 0x57, 0x60, 0x3b, 0x3b, 0x8b, 0x57, 0xc0, 0x5f, 0x56, 0xa3, + 0xa0, 0xa2, 0xb9, 0xb4, 0xa5, 0xbf, 0x57, 0x3d, 0x41, 0xc7, 0x54, 0x9d, + 0x5a, 0x57, 0x77, 0xbf, 0x74, 0x50, 0xbf, 0xc0, 0x6e, 0xa8, 0xa3, 0xca, + 0x91, 0x74, 0x7b, 0x5f, 0x38, 0x35, 0x43, 0x92, 0x61, 0x97, 0xcd, 0xc3, + 0x47, 0x87, 0xa4, 0x99, 0xaf, 0xa4, 0x48, 0x85, 0x9b, 0x82, 0x40, 0x96, + 0xc2, 0xbe, 0xa5, 0xca, 0x64, 0x38, 0x66, 0x78, 0x8e, 0xbb, 0x78, 0x2f, + 0xcc, 0xa7, 0x41, 0x77, 0x7a, 0x9b, 0x9a, 0xa5, 0x6d, 0x76, 0x80, 0xa1, + 0x89, 0x80, 0x92, 0xab, 0x72, 0x50, 0x9b, 0x74, 0x77, 0x85, 0x55, 0x65, + 0x44, 0x56, 0xcf, 0x5a, 0x4d, 0xa9, 0xc1, 0xcc, 0x4b, 0x7c, 0xa3, 0x70, + 0x9e, 0x5b, 0x53, 0xd9, 0x89, 0xb6, 0x44, 0x9d, 0x2e, 0x95, 0x59, 0x95, + 0x86, 0xc9, 0xaf, 0x52, 0xc3, 0x34, 0x56, 0x91, 0xa9, 0x95, 0xcc, 0xad, + 0xd9, 0x7e, 0x68, 0xb3, 0x47, 0xb2, 0x8b, 0x86, 0xc9, 0x44, 0x6c, 0x44, + 0x52, 0x78, 0xad, 0x61, 0x87, 0x62, 0xbf, 0xc3, 0x73, 0x29, 0xb6, 0xa4, + 0xbe, 0xc5, 0x5e, 0xa5, 0xab, 0xa7, 0xcd, 0x4f, 0xa0, 0x5a, 0x96, 0xd2, + 0x5b, 0xac, 0x98, 0xbb, 0x88, 0x97, 0x83, 0xb4, 0x81, 0x47, 0x97, 0x67, + 0x83, 0xd4, 0xab, 0x47, 0x78, 0x71, 0x2e, 0x34, 0x52, 0x8a, 0xbe, 0x6f, + 0x34, 0x45, 0xa6, 0xaa, 0x9e, 0xa3, 0xbe, 0xa3, 0xb0, 0x5f, 0x43, 0x6a, + 0x83, 0x86, 0xa0, 0xc5, 0x4e, 0x80, 0xb1, 0x9c, 0xcd, 0x69, 0x6f, 0x56, + 0xa3, 0xad, 0x42, 0x7f, 0x31, 0xc2, 0xb9, 0x7a, 0x83, 0xc0, 0x72, 0x5b, + 0x83, 0xe8, 0x52, 0xa8, 0xc9, 0xf1, 0xbd, 0x4d, 0x98, 0xb1, 0x94, 0xa2, + 0xc1, 0xd9, 0x92, 0x75, 0xa5, 0xd0, 0x84, 0x4a, 0x5e, 0xcc, 0x34, 0xc1, + 0x8a, 0xc4, 0xa8, 0x9b, 0x52, 0xbf, 0x54, 0x69, 0x7e, 0x2b, 0x3d, 0x7e, + 0x5a, 0x52, 0xc4, 0x7d, 0xd4, 0xc1, 0xbb, 0xad, 0xb6, 0xbd, 0x8b, 0x49, + 0x93, 0x4f, 0xbd, 0x37, 0xb7, 0xaf, 0xb6, 0x34, 0xbb, 0xb5, 0xd2, 0xb4, + 0x52, 0xbb, 0x55, 0xd1, 0xa3, 0xc1, 0xaa, 0x5a, 0x36, 0x75, 0xc5, 0x6e, + 0xb2, 0x97, 0x72, 0x3f, 0xca, 0x45, 0xa3, 0xc8, 0xc3, 0x81, 0xb2, 0x3f, + 0x2b, 0x52, 0xa2, 0xb8, 0xa8, 0x55, 0x51, 0x31, 0xa0, 0x54, 0x9e, 0x70, + 0x48, 0x4a, 0x79, 0x4a, 0x6e, 0xce, 0x55, 0xc0, 0x56, 0x9d, 0x61, 0x52, + 0x8b, 0x9c, 0x84, 0x43, 0x38, 0x3f, 0xa6, 0x9d, 0x32, 0x31, 0xca, 0x99, + 0x2d, 0x3b, 0x95, 0x3a, 0x5e, 0x91, 0xb5, 0xaa, 0x4f, 0x6f, 0x61, 0x65, + 0xda, 0xb3, 0xad, 0x33, 0x7a, 0x6e, 0x6e, 0xb5, 0xb9, 0x31, 0x94, 0x58, + 0xc2, 0x8f, 0x71, 0x3a, 0xbb, 0xcc, 0x63, 0x92, 0xc7, 0x9c, 0x61, 0xa9, + 0x63, 0x5d, 0x78, 0x9b, 0xd0, 0xd4, 0xd7, 0x69, 0x92, 0x69, 0x9f, 0x98, + 0x98, 0xc0, 0x56, 0x79, 0x4d, 0xbc, 0x40, 0x74, 0xa2, 0x7f, 0xbf, 0x55, + 0xa8, 0x78, 0x7a, 0x56, 0x83, 0x33, 0xb4, 0xdb, 0x53, 0x67, 0xa3, 0x73, + 0xb7, 0x62, 0x3b, 0x37, 0x14, 0x9f, 0x69, 0x66, 0xe2, 0xb8, 0x92, 0x93, + 0xaa, 0xa7, 0xa3, 0x8a, 0xa3, 0xc8, 0x76, 0xab, 0x63, 0x69, 0x44, 0x35, + 0xbc, 0x57, 0x51, 0x87, 0x77, 0x8b, 0x78, 0x6a, 0xbe, 0x86, 0xb7, 0xd2, + 0xe0, 0x8d, 0x49, 0x7e, 0x45, 0x96, 0x38, 0x54, 0x78, 0x9c, 0x87, 0x62, + 0x57, 0x81, 0x81, 0x3c, 0x99, 0x98, 0x9b, 0xb3, 0x85, 0x64, 0xc4, 0xa7, + 0xca, 0x68, 0x56, 0x54, 0x77, 0x70, 0x78, 0x31, 0x9b, 0x39, 0x5e, 0x9a, + 0xe3, 0xd4, 0x6b, 0xaf, 0x94, 0x3f, 0xb3, 0x83, 0xbd, 0x9a, 0x63, 0xb1, + 0x5b, 0x84, 0xbb, 0xb6, 0xc3, 0x42, 0xbb, 0x2d, 0x5a, 0x96, 0x94, 0x71, + 0xb4, 0x79, 0x8f, 0x73, 0xaa, 0x7b, 0x9d, 0x89, 0xd7, 0x78, 0x89, 0xcb, + 0x4d, 0xce, 0xbc, 0x7b, 0xdd, 0x53, 0x55, 0xb5, 0xbd, 0x8d, 0x79, 0x6f, + 0xce, 0x79, 0x7f, 0x95, 0x31, 0x63, 0x6b, 0x7d, 0xa8, 0x88, 0xbf, 0xc5, + 
0x3b, 0x7d, 0xc3, 0xa2, 0xb6, 0x90, 0x42, 0xaa, 0x3e, 0xb4, 0x65, 0xb7, + 0x92, 0x65, 0x87, 0xb8, 0x94, 0xbd, 0xd3, 0x5f, 0x3d, 0xb5, 0xd1, 0x7c, + 0x7f, 0x7b, 0xd9, 0x9d, 0x3e, 0x67, 0x34, 0x5b, 0xb7, 0xc7, 0xc6, 0x58, + 0x83, 0x39, 0x47, 0x28, 0x77, 0x79, 0x3e, 0x66, 0x5e, 0x81, 0xc3, 0xd0, + 0xba, 0x5d, 0x66, 0xa5, 0x85, 0x47, 0x6b, 0x87, 0xb3, 0xa1, 0x42, 0xab, + 0x49, 0x5d, 0x56, 0xad, 0x61, 0x64, 0xc2, 0xa1, 0xc6, 0x5a, 0xad, 0xc6, + 0x8b, 0xac, 0xb2, 0xce, 0x47, 0xd7, 0x2d, 0x3e, 0x66, 0x52, 0x98, 0x59, + 0xd6, 0x8e, 0x6e, 0x87, 0xa8, 0xcd, 0xad, 0x67, 0xf4, 0xb4, 0x59, 0x4d, + 0xcf, 0x7a, 0x79, 0x4d, 0x8f, 0x62, 0x6a, 0x76, 0xa9, 0xef, 0xea, 0xcc, + 0xb0, 0xb1, 0x3d, 0xa7, 0x7d, 0xdb, 0x6c, 0xd2, 0x53, 0xa7, 0xc1, 0x96, + 0xf7, 0x94, 0xa5, 0x89, 0x50, 0x3c, 0x8a, 0x89, 0x7e, 0xd4, 0xc6, 0x48, + 0xac, 0xd0, 0x82, 0xb5, 0x48, 0x8c, 0x6c, 0x62, 0x6f, 0xc1, 0x5e, 0x6d, + 0x40, 0xcf, 0x93, 0xd8, 0x72, 0x5b, 0x3f, 0x53, 0xc3, 0x8b, 0x92, 0x5e, + 0x64, 0xea, 0x99, 0x90, 0xac, 0x84, 0x55, 0x49, 0xa8, 0x91, 0x4b, 0xb3, + 0x59, 0x36, 0x83, 0x39, 0x85, 0x64, 0x5e, 0x74, 0x9d, 0xc3, 0x5e, 0xc2, + 0xde, 0xa3, 0xa5, 0x82, 0x60, 0x7c, 0x66, 0x3d, 0xdf, 0x76, 0x7f, 0x6b, + 0x80, 0xc1, 0x3e, 0xba, 0x60, 0x6d, 0x58, 0x71, 0x8c, 0xa8, 0x63, 0xc7, + 0xb6, 0x6c, 0xcd, 0xbc, 0x36, 0xd5, 0xb8, 0x3a, 0xb6, 0x76, 0x34, 0xb8, + 0x43, 0x98, 0x8e, 0xc3, 0xcb, 0x38, 0x59, 0x8f, 0x6f, 0xab, 0xc2, 0x4e, + 0x8b, 0xe1, 0xa4, 0x5e, 0xae, 0x98, 0x89, 0x9c, 0xd6, 0x9c, 0xb0, 0x56, + 0xae, 0xc2, 0xc7, 0x46, 0x71, 0x46, 0xbc, 0x61, 0x60, 0xbd, 0xb7, 0x43, + 0x59, 0xc1, 0x37, 0x69, 0x96, 0xbc, 0x84, 0x55, 0x5d, 0x3c, 0x65, 0x71, + 0xc0, 0x46, 0x95, 0x8b, 0xa7, 0x59, 0x7e, 0x39, 0xbe, 0x9e, 0x4f, 0xa8, + 0xe4, 0x46, 0x8b, 0xa7, 0x56, 0xb9, 0x40, 0xd5, 0x90, 0x74, 0xa5, 0x36, + 0x55, 0xae, 0xc8, 0x66, 0xb5, 0xcd, 0xae, 0xa0, 0x7a, 0x31, 0xbc, 0x41, + 0xae, 0xc1, 0x6a, 0xb6, 0x92, 0xe2, 0x7a, 0x43, 0x5e, 0x96, 0xd8, 0x85, + 0x98, 0xdd, 0x5e, 0x92, 0x83, 0x3e, 0x8c, 0x74, 0x2e, 0x43, 0x6b, 0x61, + 0x2b, 0x9d, 0x45, 0x42, 0xcc, 0xd4, 0xcf, 0x69, 0xb9, 0xcd, 0x84, 0x64, + 0x45, 0x94, 0xb2, 0xe3, 0x6f, 0x98, 0xc6, 0x8e, 0x57, 0xc7, 0x7a, 0xd1, + 0x2d, 0x3b, 0x60, 0xc9, 0x90, 0xcb, 0x3b, 0xa3, 0xbe, 0x6f, 0xa0, 0x59, + 0xa1, 0x96, 0xb9, 0x64, 0x3a, 0x81, 0x41, 0x89, 0x60, 0xa9, 0x80, 0x66, + 0xb9, 0xb4, 0xd3, 0x6e, 0x91, 0x54, 0x3e, 0xa4, 0xc6, 0xa3, 0x52, 0xaa, + 0xa7, 0x57, 0x5a, 0x52, 0x39, 0x2b, 0x9c, 0x3f, 0xc1, 0x4b, 0xa9, 0x91, + 0x5f, 0xb5, 0x2c, 0x71, 0x50, 0xc9, 0x6b, 0x65, 0x89, 0x6a, 0x53, 0x68, + 0x71, 0x4e, 0x63, 0xbe, 0xab, 0x37, 0x39, 0x52, 0x78, 0xca, 0x3c, 0x8c, + 0x6c, 0x79, 0x64, 0x5f, 0xab, 0x58, 0x96, 0xc3, 0x53, 0x78, 0x80, 0x85, + 0xae, 0x59, 0x85, 0x83, 0x49, 0xbf, 0x93, 0xa0, 0x74, 0xc6, 0xad, 0x98, + 0x5c, 0x30, 0x3d, 0x4c, 0x67, 0x74, 0x94, 0x6b, 0x51, 0x94, 0xcc, 0xc5, + 0xbe, 0xcf, 0x5b, 0x63, 0xc5, 0x4f, 0xc5, 0x39, 0xb1, 0xc7, 0x4e, 0xb8, + 0x82, 0xdd, 0x61, 0x9a, 0xbc, 0xcc, 0x50, 0xd9, 0x59, 0x98, 0x5b, 0x40, + 0x2c, 0x9b, 0x72, 0x3e, 0xc4, 0x3b, 0x84, 0x77, 0x92, 0x97, 0x98, 0x8e, + 0x35, 0xbd, 0x78, 0x4f, 0x5a, 0x73, 0xc7, 0xb0, 0xa7, 0xd3, 0xcd, 0xca, + 0x70, 0x6b, 0x97, 0x84, 0xbc, 0x67, 0x49, 0x5d, 0x2a, 0x98, 0x81, 0xcc, + 0x82, 0x93, 0x9f, 0x78, 0x55, 0xa1, 0x8f, 0xc7, 0x51, 0x68, 0x6a, 0x3d, + 0x69, 0x6c, 0xcb, 0x56, 0x72, 0x76, 0x53, 0x3b, 0x99, 0xbf, 0xc2, 0x9e, + 0x9a, 0x7a, 0x39, 0x75, 0xa8, 0x78, 0xcd, 0x5a, 0xbd, 0x7d, 0x30, 0x6b, + 0x84, 0x7a, 0x5e, 0x96, 0x68, 0x7c, 0x97, 0x97, 0xce, 0x73, 0x37, 0xb6, + 0x57, 0xa0, 0x7f, 0x95, 0x55, 0x94, 0x55, 0x42, 0x94, 0x6f, 0x9d, 0xc2, + 
0x3b, 0x43, 0x69, 0xbd, 0xc5, 0xd1, 0x7f, 0xbe, 0x54, 0xd1, 0x55, 0x97, + 0x6c, 0x3b, 0x38, 0xb6, 0x67, 0x96, 0xa5, 0xb5, 0x53, 0xaa, 0x59, 0x35, + 0x8c, 0xa0, 0x39, 0x44, 0xa7, 0x5d, 0x7a, 0xc1, 0xa3, 0x86, 0x7b, 0x89, + 0x3a, 0x90, 0x6b, 0xb0, 0x83, 0xa0, 0xa6, 0x8a, 0xba, 0x4b, 0x8f, 0x8a, + 0x97, 0xaa, 0x5c, 0xba, 0xd0, 0xa4, 0x76, 0x67, 0x29, 0xbb, 0x4c, 0x52, + 0x62, 0x43, 0xc8, 0x97, 0x54, 0x6b, 0x76, 0x80, 0x55, 0xcb, 0x7c, 0x47, + 0x49, 0x68, 0xaa, 0x48, 0xb2, 0x44, 0xc8, 0xbb, 0xb3, 0xae, 0x4b, 0x5c, + 0x9e, 0xb6, 0xb7, 0x85, 0x67, 0xde, 0x54, 0x43, 0x68, 0x58, 0x36, 0x4b, + 0x36, 0x67, 0xab, 0x3c, 0x64, 0x42, 0x6c, 0x7f, 0x80, 0x5d, 0x5a, 0x94, + 0xb3, 0x65, 0x1f, 0x53, 0x81, 0x54, 0x82, 0x4d, 0x2c, 0xd3, 0xbb, 0x4d, + 0xbf, 0x92, 0x3a, 0xd0, 0x93, 0xc4, 0xcd, 0x8b, 0x39, 0xcf, 0x79, 0x68, + 0x65, 0x73, 0x30, 0xad, 0x98, 0xae, 0x4d, 0x5f, 0x86, 0xd6, 0x6c, 0x57, + 0xac, 0x48, 0xba, 0x4b, 0x88, 0xb3, 0xcd, 0xac, 0x39, 0x7e, 0x89, 0x30, + 0x96, 0x78, 0xb2, 0x75, 0xb3, 0x4d, 0x74, 0x76, 0x6b, 0x51, 0xae, 0x76, + 0x9a, 0x56, 0xb2, 0x67, 0x40, 0xc9, 0xa7, 0x5d, 0x61, 0x95, 0xc4, 0x97, + 0xcf, 0xe7, 0xc2, 0x4a, 0xe1, 0x89, 0xc7, 0x49, 0x71, 0x6f, 0x60, 0x67, + 0x4a, 0x66, 0x45, 0xc8, 0xb8, 0x78, 0x52, 0x81, 0x2d, 0xb1, 0x7b, 0x68, + 0x47, 0x48, 0x60, 0x9f, 0x7a, 0x83, 0xa1, 0xb2, 0xab, 0xbc, 0xcb, 0x72, + 0x4c, 0xd3, 0x9d, 0x6e, 0x9d, 0xb1, 0x87, 0xb5, 0x91, 0x6f, 0x65, 0x6f, + 0x55, 0x8a, 0x30, 0x5b, 0xc4, 0x4b, 0x98, 0x6c, 0xa8, 0x7b, 0x59, 0xa9, + 0x35, 0xc5, 0x7c, 0xa6, 0xac, 0x76, 0x60, 0x70, 0x75, 0x4f, 0x73, 0x45, + 0xa3, 0x62, 0x82, 0x4f, 0xd1, 0x8a, 0xa0, 0x5b, 0x97, 0xb8, 0x5b, 0x69, + 0x39, 0xa3, 0xa9, 0xa7, 0x45, 0x38, 0x50, 0x76, 0xc5, 0x7c, 0xb5, 0x39, + 0x2b, 0x52, 0xb2, 0xb0, 0x4e, 0x99, 0x7d, 0x72, 0x62, 0x4f, 0x4d, 0x7b, + 0x54, 0xcf, 0x4a, 0x43, 0x66, 0xcf, 0x9a, 0x32, 0x71, 0x75, 0x67, 0xb2, + 0x6f, 0xbe, 0x73, 0xc7, 0x92, 0x58, 0x6f, 0xc5, 0x54, 0xc6, 0xa0, 0x49, + 0xb6, 0x94, 0x5c, 0x2d, 0x6c, 0xc6, 0xc8, 0xd0, 0x9e, 0x44, 0x40, 0x61, + 0x6b, 0xa3, 0x5b, 0x7e, 0x7d, 0x2e, 0xb1, 0x82, 0xa3, 0x89, 0x8a, 0xb8, + 0xc4, 0xb4, 0x36, 0x47, 0x95, 0x6c, 0x69, 0xaf, 0x3a, 0x44, 0xbf, 0xe0, + 0x8d, 0xa8, 0xc3, 0x36, 0x7a, 0x90, 0x25, 0xa2, 0x40, 0xc1, 0xd4, 0x7e, + 0x9e, 0x66, 0x46, 0xa3, 0x60, 0x4e, 0x9a, 0x9e, 0x55, 0xcb, 0x5a, 0x9c, + 0x8c, 0x69, 0x4e, 0x86, 0x4e, 0xb8, 0x56, 0xc4, 0x84, 0x25, 0x55, 0x63, + 0xa6, 0x79, 0xbd, 0x8b, 0x80, 0x83, 0x8a, 0x95, 0x44, 0xc9, 0x94, 0x65, + 0x99, 0xae, 0xac, 0x45, 0x58, 0x39, 0x96, 0x4b, 0x30, 0xab, 0x59, 0x60, + 0x63, 0x96, 0xd1, 0x9d, 0x9c, 0x71, 0x40, 0x3d, 0x88, 0x5f, 0xb5, 0x33, + 0x38, 0x68, 0x69, 0x8b, 0x7a, 0x3d, 0x75, 0x63, 0xd3, 0x86, 0x31, 0xcb, + 0xb3, 0x73, 0x93, 0x95, 0xc3, 0x6e, 0x87, 0x56, 0x77, 0xa1, 0xe0, 0x4f, + 0x68, 0xc8, 0x7a, 0x89, 0x60, 0x36, 0x6b, 0xc3, 0xc0, 0x48, 0xcb, 0xaa, + 0x4f, 0x8a, 0x94, 0x30, 0xa8, 0xa7, 0xae, 0x34, 0xd4, 0x79, 0x62, 0xd1, + 0x88, 0x5b, 0x50, 0xd2, 0x99, 0x87, 0x63, 0x42, 0x60, 0x93, 0xc4, 0xa4, + 0xb0, 0xd4, 0x80, 0x6d, 0x5f, 0xc8, 0x3c, 0x8d, 0xd8, 0x60, 0x81, 0x49, + 0x3d, 0x61, 0x48, 0x86, 0xd4, 0x94, 0x36, 0x57, 0x7c, 0xaf, 0xdc, 0x62, + 0x2b, 0x78, 0x70, 0x9a, 0x65, 0x48, 0xa6, 0x41, 0xad, 0xb0, 0xc1, 0x47, + 0x96, 0x89, 0x9c, 0x78, 0x75, 0x99, 0xbc, 0xaa, 0x71, 0x96, 0xcb, 0x35, + 0x72, 0x85, 0x55, 0x3b, 0x70, 0xaa, 0x74, 0x64, 0xb1, 0xc2, 0x9c, 0x86, + 0x3e, 0xcc, 0x36, 0xa8, 0x74, 0x8d, 0xa8, 0x6c, 0x81, 0x94, 0x58, 0xd0, + 0x8e, 0x6d, 0x74, 0x65, 0xc6, 0x60, 0x46, 0xc4, 0xdf, 0x73, 0x81, 0x60, + 0x39, 0xc1, 0x5c, 0x86, 0x3e, 0x3a, 0x6e, 0x8a, 0x78, 0x92, 0x78, 0x63, + 
0x7f, 0x4f, 0xab, 0x93, 0x9c, 0xb4, 0x48, 0xba, 0xad, 0x62, 0x3f, 0x9b, + 0x74, 0x32, 0x86, 0x7c, 0x64, 0x5b, 0x86, 0x35, 0x53, 0x4a, 0x35, 0x43, + 0xb7, 0xa5, 0x62, 0x5f, 0xbf, 0x3c, 0x3e, 0xb1, 0xae, 0x4c, 0x47, 0xaa, + 0xb0, 0x6f, 0x56, 0x65, 0x81, 0x71, 0xcd, 0x4c, 0xba, 0x4b, 0x53, 0x85, + 0x96, 0xd3, 0xc3, 0xc4, 0x4c, 0x6d, 0x61, 0x58, 0xae, 0x98, 0x63, 0xd6, + 0x41, 0x8a, 0x53, 0x41, 0x4e, 0x93, 0x81, 0xb2, 0xd2, 0xc1, 0x49, 0x9d, + 0x3f, 0xc7, 0xa4, 0x58, 0x8b, 0x7d, 0x7d, 0x43, 0x39, 0x91, 0x4e, 0x4f, + 0x56, 0x6c, 0xa8, 0xcd, 0x45, 0xc6, 0xc6, 0x64, 0xc8, 0xbd, 0xa7, 0xcb, + 0xae, 0x32, 0xba, 0x56, 0x9d, 0x97, 0xca, 0xcc, 0xc7, 0x5c, 0xcf, 0xc2, + 0xa4, 0x48, 0xba, 0x4a, 0x77, 0xc2, 0x55, 0xa7, 0x6f, 0x6a, 0xc0, 0xc2, + 0x58, 0x57, 0xa6, 0x50, 0x9a, 0xaa, 0x8b, 0x7e, 0x97, 0xd3, 0x3e, 0xc6, + 0x38, 0xcb, 0x42, 0xc6, 0x90, 0x9c, 0x89, 0xa8, 0x4c, 0xd5, 0xce, 0x36, + 0x60, 0x6c, 0x6b, 0x3e, 0x68, 0x6a, 0x2c, 0x53, 0x61, 0x9d, 0x66, 0x45, + 0x59, 0x86, 0x63, 0x59, 0x76, 0xd0, 0x54, 0x5e, 0x94, 0x82, 0x7f, 0xac, + 0xb6, 0x7c, 0x9a, 0x4d, 0x66, 0x56, 0xc9, 0xb1, 0x69, 0x58, 0x54, 0xb8, + 0x46, 0xce, 0x3e, 0x36, 0x90, 0x51, 0xba, 0x3c, 0x5b, 0xc0, 0xc2, 0x95, + 0x85, 0x6a, 0x9d, 0x38, 0xa0, 0x98, 0x46, 0x91, 0x61, 0x53, 0x50, 0x8d, + 0x46, 0x6b, 0xc2, 0x6a, 0x54, 0x99, 0x3e, 0xa6, 0xb4, 0xba, 0x3f, 0xc8, + 0x81, 0x84, 0x70, 0x3b, 0x3a, 0x8f, 0x56, 0x53, 0x88, 0xd2, 0x31, 0x6f, + 0x72, 0xc8, 0x91, 0xa9, 0xa8, 0xb3, 0xd1, 0x33, 0x8c, 0x85, 0xcc, 0x91, + 0x49, 0x86, 0x42, 0x41, 0x6c, 0xba, 0x90, 0x3e, 0x73, 0x6e, 0x55, 0x4d, + 0x7c, 0x58, 0x48, 0xac, 0xa7, 0xcf, 0xa0, 0x8d, 0x58, 0x9d, 0x9b, 0x32, + 0xbb, 0x7c, 0xad, 0xd0, 0x6a, 0xc1, 0x54, 0xbc, 0xc3, 0x50, 0x6b, 0x84, + 0x30, 0x32, 0x46, 0x6d, 0x5f, 0x98, 0x7e, 0x8c, 0x5d, 0x39, 0xb1, 0x9c, + 0xb2, 0xaa, 0x36, 0x4f, 0xb6, 0x69, 0x47, 0xc2, 0x30, 0x42, 0x64, 0x86, + 0x9d, 0xa3, 0xd0, 0x7f, 0x50, 0x71, 0x6f, 0x73, 0xb8, 0x5b, 0x9f, 0x66, + 0x45, 0xa1, 0x4a, 0xa8, 0x64, 0xbb, 0x80, 0xc3, 0x60, 0xb8, 0x7a, 0x49, + 0x78, 0xcd, 0x39, 0x49, 0x8b, 0x56, 0x8a, 0xa4, 0x5b, 0x79, 0x71, 0x48, + 0x44, 0x45, 0x71, 0x39, 0x8d, 0x85, 0xd4, 0xa9, 0x43, 0xc7, 0x2d, 0x7a, + 0x97, 0xab, 0x7e, 0xa4, 0xb9, 0xbc, 0xa8, 0x57, 0x34, 0x37, 0xb3, 0x7d, + 0x3d, 0xc3, 0x47, 0x4a, 0x81, 0x7b, 0x40, 0xbe, 0x45, 0xc6, 0xcf, 0xa6, + 0x9b, 0xd4, 0xa1, 0x3f, 0xd1, 0x75, 0xc1, 0x4f, 0x4c, 0x8b, 0x79, 0xaf, + 0x5a, 0x99, 0xce, 0x7c, 0x62, 0xad, 0x6f, 0xd0, 0xc7, 0x80, 0x86, 0x8d, + 0x47, 0x9e, 0x7c, 0xc3, 0xa0, 0xaa, 0x54, 0x48, 0x3d, 0x8c, 0xa0, 0x64, + 0x39, 0x97, 0x8c, 0x5a, 0x40, 0xbe, 0x43, 0x93, 0x9a, 0xae, 0x72, 0xa2, + 0x74, 0x82, 0x95, 0x63, 0x6e, 0x5b, 0xa3, 0x49, 0x4f, 0x85, 0xa9, 0x91, + 0x50, 0xc2, 0x77, 0x50, 0x2e, 0x9e, 0x4b, 0xd4, 0x8d, 0x57, 0x89, 0xbb, + 0x6f, 0xad, 0x85, 0x58, 0xbd, 0x70, 0x91, 0x62, 0x99, 0x49, 0x5d, 0x83, + 0xa5, 0x34, 0x33, 0x5e, 0x95, 0x6e, 0x99, 0xc8, 0x92, 0x8d, 0xb0, 0x9c, + 0xab, 0x9e, 0x69, 0x68, 0x79, 0x71, 0x3a, 0x70, 0x98, 0x54, 0x32, 0x38, + 0xc8, 0x5e, 0xc6, 0x67, 0x4d, 0x6a, 0x5e, 0x9a, 0xa0, 0x9d, 0x5b, 0x82, + 0x8b, 0xc0, 0xb1, 0xa4, 0xc1, 0x90, 0xd0, 0x8d, 0x73, 0xc6, 0xc1, 0xa3, + 0x4b, 0xb0, 0x4b, 0x8d, 0x40, 0x40, 0xac, 0x9a, 0x7c, 0x82, 0x6c, 0x83, + 0xca, 0x61, 0xac, 0xcb, 0x60, 0xc1, 0xcc, 0x95, 0x6c, 0x3a, 0xae, 0x9e, + 0x7c, 0x8d, 0xa4, 0x8a, 0xa5, 0x80, 0x6a, 0x54, 0x61, 0x3f, 0x59, 0x69, + 0x4e, 0xa4, 0x70, 0xb4, 0x88, 0x42, 0x99, 0x5a, 0x5c, 0x48, 0x3c, 0x70, + 0xbe, 0xa6, 0x74, 0x77, 0x9e, 0x64, 0x79, 0x7c, 0x32, 0x55, 0x36, 0xa1, + 0x36, 0xaf, 0x72, 0xbc, 0x9b, 0xbb, 0x7b, 0x54, 0x33, 0xae, 0x4a, 0x48, + 
0x83, 0x96, 0xcb, 0xc0, 0x51, 0x76, 0xa5, 0x6a, 0x95, 0xbe, 0x7a, 0x43, + 0x48, 0xb1, 0xbd, 0x43, 0xb7, 0xa3, 0x5b, 0xab, 0xb4, 0x79, 0xb9, 0x7a, + 0x4f, 0xb5, 0x71, 0x9d, 0x61, 0xd6, 0x50, 0xa4, 0x87, 0x33, 0xbb, 0x49, + 0x50, 0x52, 0x87, 0x47, 0x32, 0xa0, 0x6d, 0x38, 0x97, 0x39, 0xa9, 0x52, + 0x5a, 0x9c, 0x9a, 0xc8, 0xcc, 0xcf, 0xb2, 0x49, 0x96, 0x97, 0x64, 0x52, + 0x62, 0xbc, 0x7c, 0x33, 0x7d, 0x78, 0xc3, 0x76, 0xc9, 0x8d, 0xaa, 0xce, + 0x95, 0xc9, 0xc7, 0xc8, 0xc2, 0xb3, 0x31, 0xaa, 0x3a, 0x38, 0x4f, 0x6b, + 0x4a, 0x4d, 0x74, 0x70, 0x70, 0x94, 0xcb, 0xb1, 0x3a, 0xb0, 0x35, 0x62, + 0xa3, 0x72, 0x82, 0x3b, 0x43, 0xbe, 0xc7, 0x7c, 0x6c, 0x48, 0xbf, 0x56, + 0xbe, 0x4d, 0xc0, 0xa5, 0xc8, 0x5e, 0xb4, 0x42, 0x32, 0xc0, 0x65, 0xc1, + 0xb8, 0x5d, 0x87, 0x69, 0xab, 0xcb, 0x7b, 0x61, 0x6e, 0x94, 0xbf, 0x9a, + 0x37, 0x98, 0xcd, 0x51, 0x82, 0xbe, 0xb2, 0x91, 0x44, 0x69, 0xc0, 0x61, + 0x48, 0xcf, 0xcc, 0x70, 0x5a, 0xaf, 0x4e, 0xb6, 0x47, 0x37, 0x45, 0x7f, + 0x70, 0xa5, 0x68, 0x7e, 0xc9, 0x46, 0x8d, 0xa2, 0xa9, 0x9f, 0x92, 0x7e, + 0xd2, 0xba, 0x8d, 0x86, 0x3a, 0xbd, 0x9b, 0x4d, 0xb1, 0x96, 0x5a, 0x3d, + 0x5f, 0xd2, 0xd8, 0x33, 0x8b, 0x5f, 0x41, 0x80, 0x8a, 0x82, 0x38, 0x6b, + 0x30, 0x7b, 0xb0, 0xc5, 0x44, 0xba, 0x76, 0x9d, 0x6c, 0xb6, 0x34, 0x4d, + 0x78, 0xcb, 0xcf, 0xc0, 0xb9, 0x70, 0xaf, 0x7d, 0xb5, 0x63, 0xc8, 0x86, + 0x44, 0x75, 0x9c, 0x66, 0x84, 0x7f, 0x52, 0x3d, 0xb3, 0x6c, 0x76, 0xb0, + 0x3f, 0x80, 0xaa, 0x39, 0x70, 0x37, 0xbc, 0xa4, 0x52, 0xcf, 0xc2, 0x92, + 0xca, 0x63, 0xa0, 0x4a, 0x41, 0x7a, 0xc5, 0x82, 0x62, 0xbe, 0x68, 0x6b, + 0x8d, 0x63, 0xa5, 0x70, 0x9a, 0x99, 0xa4, 0x7b, 0x84, 0x82, 0x30, 0xb5, + 0x5c, 0x9b, 0x6c, 0xa1, 0xc3, 0xcd, 0x79, 0x92, 0x82, 0x60, 0x83, 0x3a, + 0x61, 0xac, 0x88, 0x6f, 0x5e, 0x70, 0x41, 0x45, 0x84, 0x52, 0x86, 0x70, + 0xd1, 0xa2, 0x58, 0xaf, 0x79, 0x56, 0x91, 0x70, 0x6b, 0xb8, 0x42, 0xc0, + 0xcf, 0xb7, 0xbd, 0xc3, 0x76, 0x42, 0x9c, 0x9c, 0x87, 0x7c, 0xaa, 0xbe, + 0x48, 0xc9, 0xd5, 0x5a, 0xcd, 0x9e, 0x7a, 0xb8, 0xba, 0xc5, 0x73, 0x6a, + 0x7c, 0xaf, 0x55, 0xb8, 0xbd, 0x91, 0x7b, 0x95, 0x3a, 0xb8, 0xb7, 0x8e, + 0x9b, 0xc2, 0x5b, 0x6c, 0x88, 0x91, 0x9e, 0x40, 0xcb, 0xc9, 0x8d, 0x7b, + 0x5c, 0x5b, 0xbc, 0xb9, 0xb5, 0xbd, 0x3b, 0x7c, 0x9b, 0x33, 0x29, 0x44, + 0x87, 0xd2, 0x30, 0x2e, 0x38, 0x7f, 0xb5, 0xac, 0x7c, 0xb9, 0xac, 0x4c, + 0xd0, 0x4c, 0x6b, 0xc6, 0xc3, 0x71, 0x56, 0x6e, 0x67, 0xa0, 0x67, 0x99, + 0x8b, 0x53, 0x53, 0x55, 0xca, 0x5c, 0x38, 0xbd, 0x60, 0xd2, 0x53, 0xd2, + 0x87, 0x5d, 0x52, 0x8c, 0x63, 0x97, 0x44, 0x69, 0x32, 0xaa, 0x4b, 0xd4, + 0x73, 0xcf, 0xc3, 0x6c, 0x5e, 0xb9, 0xca, 0x38, 0x67, 0xc3, 0x5e, 0x98, + 0xba, 0xc0, 0x6d, 0x82, 0x66, 0x86, 0x74, 0xa9, 0x49, 0xc0, 0x91, 0x7d, + 0x7a, 0xb2, 0x96, 0x5c, 0x6b, 0x46, 0xd0, 0x75, 0x86, 0xb7, 0xa7, 0x7d, + 0xb3, 0x93, 0x7c, 0x90, 0xad, 0xae, 0x51, 0xd3, 0x32, 0xb0, 0x6b, 0x8f, + 0x78, 0x2f, 0x62, 0xc1, 0x57, 0x6e, 0x9e, 0xc3, 0x74, 0x71, 0x8f, 0xb0, + 0x5b, 0x6e, 0x5c, 0x3e, 0x64, 0xa7, 0x66, 0x4d, 0x9d, 0x3c, 0xb2, 0xac, + 0x9f, 0x51, 0x90, 0xa0, 0x33, 0x8c, 0x8b, 0x5d, 0x5e, 0x9f, 0xba, 0x52, + 0x86, 0x65, 0x74, 0x37, 0x76, 0x99, 0xc1, 0x55, 0x43, 0xc4, 0x7c, 0xc1, + 0x4a, 0x4c, 0x9c, 0x5a, 0x92, 0x4b, 0x42, 0x46, 0x75, 0x7b, 0x7f, 0x59, + 0x98, 0x45, 0x45, 0xc9, 0x93, 0x73, 0x80, 0xd5, 0x3a, 0x62, 0xd0, 0xc1, + 0xb9, 0x56, 0x70, 0x76, 0x70, 0x62, 0x36, 0x5f, 0x6e, 0x8d, 0x63, 0x5f, + 0xb9, 0xc1, 0x7f, 0x3c, 0x6c, 0x7b, 0x8a, 0x70, 0x67, 0x97, 0xce, 0xa8, + 0xd1, 0x49, 0xa0, 0x7a, 0x3e, 0x62, 0x7a, 0x78, 0xc8, 0x50, 0xbf, 0x4c, + 0x7f, 0xc0, 0x6b, 0x42, 0x7d, 0xb0, 0xba, 0xba, 0xd2, 0x9e, 0x4f, 0x6c, + 
0x64, 0x6b, 0x71, 0x7c, 0x75, 0x41, 0xb1, 0x5c, 0x89, 0x5f, 0xd3, 0x52, + 0x92, 0xcf, 0x97, 0x65, 0xbf, 0xb0, 0x9a, 0x4c, 0x75, 0x46, 0x78, 0x51, + 0x51, 0x5a, 0x46, 0x71, 0x6e, 0x57, 0xcf, 0xb8, 0x30, 0x77, 0x63, 0x41, + 0x67, 0x8f, 0x81, 0x70, 0x92, 0x6f, 0x9b, 0xa3, 0x66, 0x43, 0x58, 0x4f, + 0x54, 0x4b, 0x4a, 0xa7, 0x45, 0x4b, 0x34, 0xca, 0xb6, 0x37, 0x75, 0x89, + 0x46, 0xc1, 0xd3, 0x97, 0xbb, 0x34, 0x34, 0x6b, 0x6d, 0xae, 0x8a, 0xb8, + 0xb8, 0xa8, 0x77, 0x43, 0x71, 0x46, 0xab, 0xa1, 0xab, 0x35, 0xc4, 0x49, + 0xcb, 0x47, 0x52, 0xd0, 0x49, 0x81, 0xac, 0xc1, 0x59, 0x4f, 0x39, 0x3f, + 0x86, 0xa0, 0x99, 0xa9, 0x8c, 0xb1, 0x5d, 0x82, 0x64, 0xc4, 0x66, 0xa3, + 0x7a, 0x5a, 0xc7, 0x47, 0x3f, 0x63, 0x4c, 0x40, 0x7c, 0xa4, 0x78, 0x3f, + 0x81, 0x8d, 0xb8, 0x84, 0x40, 0x50, 0xa3, 0x57, 0x80, 0x36, 0x4d, 0x8a, + 0x5b, 0x92, 0x6d, 0x5b, 0x75, 0x3b, 0xd0, 0x40, 0x92, 0x5e, 0x57, 0xc2, + 0x7b, 0x41, 0x69, 0x2e, 0x6c, 0x9f, 0xa0, 0xa2, 0x93, 0x7a, 0xcc, 0x57, + 0xa6, 0x9f, 0xa9, 0x8d, 0x4c, 0xc3, 0x53, 0x4a, 0x6e, 0xa0, 0xad, 0xb7, + 0x88, 0xc6, 0xab, 0xb1, 0x97, 0x40, 0x7d, 0x6a, 0xce, 0xbf, 0x79, 0xc2, + 0x85, 0x54, 0x49, 0x56, 0x56, 0xc1, 0x91, 0x93, 0x36, 0x9b, 0xa1, 0x6b, + 0x42, 0x37, 0xc9, 0x56, 0xbb, 0x86, 0x56, 0x81, 0x4b, 0x96, 0x7f, 0xa7, + 0xa7, 0x71, 0xa2, 0x3c, 0x98, 0x8e, 0x9e, 0x67, 0x62, 0x42, 0xb4, 0xc9, + 0x50, 0x90, 0xce, 0x65, 0x8a, 0x4c, 0x9a, 0x4f, 0x8e, 0x3b, 0x40, 0x69, + 0x4a, 0x9f, 0x9a, 0x4f, 0x3f, 0x4a, 0x54, 0x54, 0x47, 0x82, 0x40, 0xc7, + 0xa1, 0xd0, 0x40, 0x44, 0x7c, 0xc7, 0x8c, 0xb7, 0x40, 0xbf, 0xc0, 0x3d, + 0xbf, 0x4f, 0x85, 0xa1, 0x8b, 0xa3, 0x4c, 0xba, 0x54, 0xc8, 0x8c, 0x6f, + 0x87, 0x7d, 0x74, 0x90, 0xc0, 0xb2, 0x55, 0x5d, 0x86, 0x47, 0x76, 0x3e, + 0x5e, 0x8b, 0x6a, 0x50, 0x3d, 0x56, 0xcf, 0x64, 0xae, 0x59, 0x4b, 0x64, + 0x8b, 0x41, 0xa9, 0x79, 0xb9, 0x8c, 0x8b, 0x9a, 0x8b, 0xc0, 0x6f, 0xd1, + 0x52, 0x97, 0x9b, 0x8f, 0x61, 0xae, 0x6e, 0x8d, 0x46, 0x9e, 0x7b, 0x6a, + 0xcf, 0xd2, 0x6f, 0x82, 0xa8, 0xa8, 0xb6, 0x38, 0xd0, 0xce, 0x71, 0x61, + 0x72, 0xb3, 0x44, 0x42, 0x7e, 0x83, 0xc0, 0x4a, 0x5e, 0x9b, 0x74, 0x8f, + 0xc4, 0x7e, 0xcb, 0xbf, 0xd3, 0x82, 0x8d, 0xc8, 0x8b, 0x79, 0xb0, 0x40, + 0x54, 0xae, 0x83, 0xa1, 0xb3, 0x9a, 0xc9, 0xc8, 0x66, 0x9d, 0x8a, 0xa8, + 0x91, 0xd9, 0x70, 0xab, 0x51, 0x32, 0x85, 0x5d, 0xaa, 0x93, 0x6b, 0x56, + 0xbb, 0xd4, 0x87, 0xaa, 0x46, 0x5a, 0x95, 0x2e, 0x83, 0x6e, 0x58, 0xb4, + 0xc3, 0x45, 0x90, 0x59, 0x6e, 0x46, 0x54, 0x5e, 0xc4, 0xa3, 0x49, 0x44, + 0x8f, 0x51, 0xd0, 0x75, 0x9e, 0xa9, 0xb0, 0xbd, 0xb5, 0x5c, 0xce, 0x95, + 0x64, 0xa3, 0x8b, 0xbb, 0x3d, 0x50, 0x8e, 0xad, 0x46, 0x56, 0x45, 0xc6, + 0xa1, 0x74, 0xc7, 0xc2, 0xca, 0xb6, 0x9b, 0xbb, 0x5e, 0x54, 0xc3, 0x95, + 0x64, 0x63, 0x5f, 0x82, 0xa5, 0xd3, 0xba, 0x4a, 0xb7, 0x3d, 0x9c, 0xd3, + 0xb5, 0x68, 0x53, 0x7f, 0xa5, 0xc4, 0x59, 0x65, 0xc7, 0xc5, 0x6a, 0x44, + 0x90, 0x44, 0x80, 0x9a, 0x35, 0x49, 0x43, 0xc8, 0x47, 0x46, 0x92, 0x44, + 0xa5, 0x7d, 0x4b, 0xaa, 0x3b, 0x89, 0x5e, 0x33, 0x73, 0x80, 0xc1, 0xd6, + 0xd3, 0xbb, 0x51, 0x8d, 0xb2, 0xc1, 0x66, 0xa8, 0x7e, 0xd6, 0xd9, 0x4a, + 0xbf, 0xc0, 0xb7, 0x89, 0x40, 0xbc, 0x44, 0xbb, 0x97, 0x69, 0x6f, 0x9e, + 0x6b, 0x84, 0x5b, 0x57, 0xd4, 0x40, 0xc7, 0xc7, 0x69, 0x67, 0x89, 0xc2, + 0xc1, 0xba, 0xb8, 0x42, 0x6a, 0x80, 0xa1, 0xab, 0x9b, 0x91, 0xb5, 0x92, + 0xa8, 0x71, 0xa1, 0x5a, 0x67, 0x42, 0x92, 0xb8, 0x3c, 0x49, 0xd8, 0x33, + 0xa5, 0xc3, 0x41, 0x52, 0x52, 0xca, 0x9f, 0xce, 0xbf, 0xd3, 0xd0, 0x2e, + 0x5d, 0x9f, 0x9c, 0x65, 0x6b, 0x34, 0xa3, 0xd0, 0x40, 0xd6, 0x45, 0xa2, + 0x8b, 0x3c, 0xc0, 0x5a, 0x32, 0x90, 0x44, 0x44, 0xa1, 0x59, 0x5c, 0x9d, + 
0x41, 0x9b, 0x4b, 0xd7, 0x7d, 0xb0, 0x79, 0x62, 0xb3, 0x6f, 0x44, 0x72, + 0x4a, 0x6d, 0xc7, 0x90, 0x44, 0x9e, 0x53, 0x3d, 0x37, 0x7e, 0x2b, 0x45, + 0x34, 0xa7, 0x60, 0xc7, 0x9e, 0xb8, 0xca, 0x9a, 0x77, 0xb6, 0x5d, 0x68, + 0x72, 0x32, 0xc1, 0x64, 0xd8, 0xac, 0x68, 0xb9, 0x40, 0x3d, 0x63, 0x55, + 0xbd, 0x55, 0x9a, 0x60, 0x71, 0x8a, 0x97, 0x3b, 0xcf, 0xab, 0x7d, 0xaa, + 0x5e, 0x39, 0xb5, 0x5f, 0x67, 0x9d, 0x51, 0x70, 0x4b, 0x36, 0x90, 0x98, + 0x99, 0xae, 0x3c, 0x41, 0x6a, 0x6f, 0xbc, 0x6b, 0x44, 0x87, 0xa5, 0x61, + 0x68, 0x3a, 0x7d, 0x68, 0x37, 0x45, 0x9d, 0x8f, 0x67, 0x60, 0x76, 0xb3, + 0x47, 0x47, 0x8b, 0x37, 0x78, 0xb7, 0xc8, 0x62, 0x78, 0x88, 0xd0, 0x7b, + 0x61, 0x5a, 0x44, 0x95, 0x4c, 0x92, 0x45, 0x49, 0x99, 0x41, 0xbf, 0xa9, + 0x36, 0xc8, 0x5e, 0x69, 0x53, 0xd0, 0x45, 0x32, 0x58, 0x6c, 0x9f, 0xc3, + 0xd2, 0x5c, 0x43, 0xd0, 0x79, 0x6a, 0x71, 0xb0, 0x59, 0x8a, 0x3b, 0xcf, + 0x93, 0xbf, 0xd7, 0x9c, 0x47, 0x3a, 0x4b, 0xcf, 0x70, 0xcb, 0xc7, 0x73, + 0x44, 0xad, 0xc1, 0x93, 0x44, 0x5a, 0xc4, 0x90, 0x49, 0x54, 0x53, 0x43, + 0x61, 0x43, 0x90, 0x56, 0x51, 0x67, 0x2e, 0xd4, 0x50, 0x94, 0xc6, 0x90, + 0x82, 0x9f, 0x3a, 0x5a, 0x82, 0xaf, 0x44, 0x87, 0x81, 0xd1, 0x58, 0x70, + 0x8f, 0x62, 0x46, 0x50, 0x92, 0x4a, 0x67, 0x40, 0x6a, 0xdb, 0xb0, 0x69, + 0xae, 0x4f, 0xb8, 0xbe, 0xc0, 0x7c, 0x76, 0xa6, 0x80, 0xb5, 0xb1, 0x79, + 0x5d, 0xca, 0x33, 0x8b, 0x7d, 0x99, 0x70, 0xb2, 0x49, 0x95, 0x39, 0x92, + 0xa2, 0xb2, 0xc8, 0x45, 0xb0, 0xa4, 0x60, 0x5c, 0x93, 0x4b, 0xba, 0x53, + 0xc3, 0x37, 0x92, 0xbb, 0x7a, 0xa3, 0x82, 0xb1, 0x84, 0x61, 0x3d, 0x40, + 0x68, 0xd4, 0x30, 0x72, 0xa0, 0x82, 0x8c, 0xca, 0xc6, 0x9b, 0x66, 0x63, + 0x52, 0xc0, 0x8d, 0xd5, 0xd0, 0xb1, 0x72, 0x57, 0xa0, 0xcb, 0xa9, 0x86, + 0x92, 0x7f, 0xaa, 0x78, 0x85, 0x48, 0x4b, 0x47, 0x4e, 0x52, 0xb7, 0x75, + 0x80, 0x8c, 0x57, 0x62, 0x7e, 0x51, 0x78, 0x38, 0x64, 0xa6, 0xcc, 0xb0, + 0x2e, 0x54, 0x43, 0xbf, 0xb3, 0x8b, 0x28, 0x84, 0x46, 0x30, 0xc4, 0x6d, + 0x61, 0xb4, 0x9f, 0x72, 0xa4, 0x8c, 0x5c, 0x85, 0x90, 0x9b, 0x7c, 0xae, + 0xbf, 0xc5, 0x74, 0x81, 0xc8, 0x80, 0xc9, 0x88, 0xc3, 0xbc, 0xcc, 0x7f, + 0x73, 0x7e, 0xbb, 0x46, 0x39, 0x70, 0xb7, 0x8f, 0x44, 0xc1, 0x4b, 0x69, + 0x65, 0x3a, 0x8e, 0x31, 0xb4, 0x52, 0x4d, 0xc6, 0x51, 0x59, 0x90, 0x60, + 0x50, 0x5a, 0xce, 0xbf, 0x4f, 0x33, 0x63, 0x42, 0x6a, 0xc5, 0xce, 0x82, + 0x32, 0xcf, 0xbc, 0x75, 0xc5, 0x7e, 0xb3, 0xb7, 0xb2, 0x8c, 0x69, 0xa4, + 0x2f, 0xb6, 0x7c, 0x8f, 0x90, 0xab, 0xc6, 0x6b, 0x90, 0x9c, 0x3a, 0x88, + 0xaa, 0x73, 0x79, 0x78, 0x46, 0x6f, 0x59, 0x63, 0xbd, 0x6a, 0xcf, 0xb6, + 0xae, 0x8e, 0x86, 0x8f, 0x49, 0x9a, 0xca, 0xd6, 0x8c, 0x3e, 0x50, 0x88, + 0xa5, 0x91, 0xa3, 0xbb, 0x68, 0x23, 0x88, 0x77, 0x38, 0x4d, 0x31, 0x7f, + 0x85, 0xb9, 0xbe, 0x45, 0xd1, 0x96, 0xd7, 0xbb, 0x75, 0x2f, 0x9a, 0x70, + 0x94, 0x45, 0x58, 0x40, 0x69, 0xbe, 0x52, 0x75, 0x93, 0x55, 0x34, 0x4e, + 0xab, 0xb9, 0x5d, 0x63, 0x81, 0xd3, 0x45, 0x5c, 0xb3, 0x71, 0x41, 0x9b, + 0x2e, 0x4f, 0xb4, 0xcd, 0x94, 0x84, 0xb5, 0x41, 0x9a, 0x89, 0x69, 0x8b, + 0x5e, 0x9a, 0x3c, 0x46, 0x5a, 0xaf, 0x92, 0x96, 0x72, 0xcf, 0x5a, 0x81, + 0x91, 0x62, 0x4e, 0xd1, 0xa7, 0x73, 0x50, 0xd4, 0xc0, 0x8a, 0x70, 0x56, + 0xa0, 0x3c, 0xb8, 0x64, 0x34, 0x9d, 0xb7, 0xbb, 0x88, 0xaf, 0xb5, 0x2b, + 0x43, 0x81, 0x63, 0x40, 0x80, 0xab, 0x9b, 0x9b, 0x6c, 0x73, 0x2d, 0xbf, + 0x6a, 0x93, 0x91, 0xbf, 0xa3, 0xa5, 0xc3, 0x70, 0x95, 0x58, 0x50, 0x2e, + 0x68, 0x6d, 0x6a, 0x5c, 0x38, 0x49, 0x61, 0x7c, 0x7a, 0x5f, 0x5f, 0x80, + 0xc6, 0xc9, 0x5d, 0xc7, 0x56, 0x92, 0x8d, 0xb4, 0xb6, 0xb6, 0x39, 0x51, + 0xbb, 0xa8, 0x60, 0xc8, 0xbb, 0x90, 0x48, 0xc7, 0xac, 0x9f, 0x79, 0x7e, + 
0x75, 0x79, 0x5e, 0xbc, 0x3e, 0xa4, 0x84, 0x6e, 0x4c, 0x6c, 0x6c, 0x9a, + 0xa6, 0x67, 0xce, 0x6d, 0x8a, 0x39, 0x6e, 0x4f, 0x87, 0x98, 0xad, 0xa3, + 0xbb, 0x73, 0x47, 0xa6, 0x5d, 0x3f, 0x90, 0x3d, 0x83, 0x67, 0x9c, 0x9f, + 0xaf, 0xbf, 0x5f, 0x88, 0x73, 0x79, 0x92, 0xb7, 0xa0, 0x50, 0x39, 0x76, + 0x4a, 0x78, 0x93, 0xd8, 0xaa, 0xa8, 0x36, 0x58, 0x50, 0x79, 0x55, 0x70, + 0x67, 0xbf, 0xb3, 0x4d, 0x71, 0x67, 0x6c, 0x69, 0xb7, 0xbb, 0x60, 0xb6, + 0x8d, 0xa3, 0xd3, 0x8d, 0x4d, 0xd9, 0x53, 0x7b, 0x67, 0x38, 0x30, 0x8a, + 0xb9, 0x9e, 0x93, 0xa9, 0x7e, 0x4d, 0x3c, 0x6a, 0xd6, 0x8a, 0xd8, 0xd1, + 0x96, 0x7e, 0x3e, 0x61, 0xc0, 0xc4, 0x8c, 0xbc, 0x59, 0x57, 0x57, 0x62, + 0x50, 0xa5, 0x6b, 0x66, 0xa8, 0x86, 0x3c, 0xd4, 0xb9, 0x82, 0xb5, 0xc3, + 0x8e, 0x59, 0xa6, 0xae, 0xb0, 0x5a, 0xdf, 0xa8, 0x67, 0xce, 0xbe, 0xc7, + 0x7a, 0xb0, 0xb3, 0x51, 0x9d, 0x59, 0xbf, 0xca, 0xb6, 0x3e, 0xa7, 0x34, + 0xa7, 0x59, 0x8d, 0xb3, 0x83, 0x78, 0xc5, 0xbc, 0x78, 0xca, 0xb4, 0x59, + 0x60, 0x65, 0xa4, 0x5c, 0x76, 0x64, 0x56, 0xdb, 0x97, 0x74, 0x30, 0x70, + 0x95, 0xbf, 0x6f, 0x3c, 0xc2, 0x3c, 0xb9, 0x8b, 0x7b, 0x8b, 0x38, 0xac, + 0x37, 0xc0, 0xcb, 0xad, 0x83, 0x80, 0x7e, 0xab, 0x8b, 0xd0, 0x98, 0x7b, + 0x3b, 0x89, 0x38, 0x4f, 0x3a, 0x96, 0x44, 0x9c, 0x87, 0xa6, 0xc7, 0xba, + 0x66, 0x60, 0xd2, 0x50, 0xc0, 0x9f, 0x5f, 0xb2, 0x42, 0xb8, 0x93, 0x6f, + 0x79, 0x57, 0x96, 0x6f, 0x57, 0x9c, 0xcd, 0x38, 0x84, 0xe4, 0x9f, 0x74, + 0xc9, 0x6a, 0xa3, 0x74, 0xd7, 0xe2, 0x5a, 0x71, 0xab, 0xc5, 0x61, 0x47, + 0xca, 0x5b, 0x8e, 0xa6, 0xd4, 0x2f, 0x54, 0xcf, 0xb2, 0x6f, 0x43, 0x77, + 0x66, 0x65, 0xd2, 0x70, 0x40, 0xa4, 0x61, 0x77, 0x4c, 0x9a, 0xa5, 0xc0, + 0x46, 0xac, 0x3e, 0x92, 0x35, 0x3f, 0x40, 0x38, 0xa5, 0x83, 0xbc, 0x2c, + 0xa1, 0x9a, 0xdb, 0xa3, 0x84, 0x4a, 0xc4, 0xcc, 0x9d, 0x5d, 0x3c, 0x5b, + 0x91, 0x4d, 0xaa, 0x40, 0x7a, 0xa3, 0x67, 0x47, 0x51, 0xd5, 0xad, 0x56, + 0x9b, 0x88, 0x62, 0x57, 0x9e, 0x4e, 0x7b, 0x6f, 0x81, 0x40, 0xc1, 0xa1, + 0x83, 0x72, 0xd0, 0x60, 0x5b, 0x79, 0x6c, 0xc5, 0xbf, 0x39, 0xb2, 0x9e, + 0xc4, 0x5f, 0x5f, 0xb6, 0xa7, 0x9d, 0x48, 0x8c, 0x60, 0xa2, 0x98, 0x61, + 0x39, 0xa4, 0xc5, 0xa5, 0x4b, 0x93, 0xd9, 0xb2, 0x95, 0x37, 0x3c, 0xa2, + 0x6a, 0x5f, 0x3f, 0x75, 0x7a, 0x81, 0x8f, 0x6f, 0x98, 0x3e, 0x3d, 0xbb, + 0xa4, 0x3a, 0x73, 0x7f, 0x43, 0x5a, 0x76, 0x8c, 0x5f, 0x72, 0x9d, 0x96, + 0xa6, 0x4a, 0x86, 0x90, 0x33, 0xd0, 0x80, 0x4b, 0xcc, 0x87, 0xbd, 0xbd, + 0x43, 0x7d, 0xad, 0x72, 0x39, 0x43, 0x55, 0x66, 0x5b, 0x83, 0x70, 0xc9, + 0x44, 0x81, 0x63, 0x5f, 0xbc, 0xc9, 0x9e, 0x99, 0x7f, 0x3d, 0x39, 0xcf, + 0x45, 0x48, 0xea, 0x80, 0x4a, 0x6d, 0xb2, 0x9e, 0x94, 0x31, 0xb1, 0xd4, + 0x50, 0xae, 0x46, 0x69, 0xc2, 0xcf, 0x87, 0x6a, 0x68, 0xc0, 0x54, 0x99, + 0xab, 0xae, 0x6d, 0xcb, 0x48, 0x9b, 0x72, 0xc7, 0xbb, 0x8d, 0x6f, 0xda, + 0xc0, 0xab, 0xbc, 0xba, 0x9b, 0x3b, 0xa0, 0x4d, 0xaa, 0x69, 0x3d, 0x53, + 0x55, 0x6d, 0x92, 0xc9, 0x3c, 0x34, 0x7f, 0x7a, 0xca, 0xc2, 0xad, 0xa4, + 0x3b, 0x39, 0x8d, 0x7d, 0x4c, 0xc6, 0x5a, 0xbf, 0x68, 0xbe, 0x33, 0x49, + 0xb4, 0x4d, 0x82, 0x65, 0x9d, 0x90, 0x42, 0xbf, 0x65, 0x61, 0x7f, 0x2f, + 0xb7, 0x6e, 0x64, 0x9f, 0x67, 0x67, 0x7b, 0xae, 0x7e, 0xc0, 0x5a, 0x8e, + 0x51, 0x5b, 0xa7, 0xa0, 0x79, 0x3f, 0x98, 0xd4, 0x69, 0x53, 0x69, 0xca, + 0xb5, 0x8d, 0x42, 0xb5, 0x4a, 0x7e, 0x9f, 0x54, 0xc3, 0xc5, 0x5e, 0xae, + 0xd4, 0x7e, 0x78, 0x87, 0x62, 0x4a, 0xb6, 0x64, 0x8e, 0x5a, 0xbf, 0x63, + 0x71, 0x6f, 0xd3, 0x56, 0x7e, 0x4b, 0xc2, 0xac, 0x3e, 0x9b, 0x7c, 0xca, + 0x9d, 0xc4, 0x44, 0xc0, 0xcc, 0x5b, 0xd0, 0x5b, 0x69, 0x61, 0xbc, 0xa1, + 0x41, 0x56, 0x79, 0x89, 0xb4, 0x4a, 0x40, 0x43, 0x59, 0x66, 0xab, 0xcf, + 
0x89, 0x92, 0x8b, 0xa7, 0xb5, 0xb8, 0x9a, 0x4b, 0x85, 0x67, 0x52, 0xca, + 0xbd, 0x6c, 0x65, 0x89, 0x37, 0xb6, 0x45, 0x59, 0x85, 0xce, 0x5c, 0x43, + 0x55, 0x41, 0xd9, 0xe6, 0x2c, 0xb0, 0xca, 0x97, 0x6f, 0x51, 0x71, 0x80, + 0x81, 0x36, 0xae, 0x63, 0x66, 0x6e, 0x7e, 0x7f, 0x95, 0x7d, 0x4d, 0x9a, + 0x52, 0x96, 0xaa, 0xc2, 0x4e, 0x59, 0x76, 0x7c, 0xd7, 0xa2, 0x51, 0x5a, + 0xb0, 0x79, 0x91, 0x7f, 0x60, 0x5e, 0x73, 0x9f, 0x58, 0x9e, 0x78, 0x54, + 0x84, 0x8b, 0x67, 0x7d, 0x6a, 0xad, 0xa4, 0x47, 0x43, 0xd9, 0xb9, 0xa3, + 0xcf, 0x45, 0x3d, 0x75, 0x8c, 0x8c, 0x60, 0xc2, 0x85, 0x79, 0xae, 0xa8, + 0x97, 0xb6, 0x65, 0xb0, 0x6a, 0x96, 0xd4, 0x68, 0x5e, 0xa1, 0xbd, 0x49, + 0xc8, 0x95, 0xd9, 0x55, 0xb4, 0xc3, 0xae, 0x8f, 0x56, 0x83, 0x7f, 0x4a, + 0x51, 0x46, 0xb2, 0xd1, 0x55, 0x86, 0x5f, 0x3b, 0x53, 0xc8, 0xca, 0x73, + 0x6a, 0x61, 0x59, 0x63, 0x97, 0x7f, 0x54, 0x6e, 0xd5, 0xc3, 0x93, 0x96, + 0xb4, 0xb8, 0xb7, 0x8b, 0x9d, 0x89, 0x9e, 0x50, 0x87, 0xc0, 0xc7, 0xa1, + 0x78, 0x66, 0xae, 0x63, 0xa5, 0x59, 0x74, 0x2f, 0x54, 0xb9, 0x9b, 0x6d, + 0x37, 0x8f, 0xc9, 0xbc, 0x7f, 0xc3, 0xbb, 0x6d, 0x47, 0x43, 0xb4, 0xb5, + 0x7d, 0xd7, 0xbd, 0xd6, 0x7b, 0x81, 0x90, 0xbe, 0x3c, 0x56, 0x62, 0x5f, + 0xc6, 0x50, 0x75, 0x76, 0x9e, 0xaf, 0xa9, 0xa0, 0x4e, 0x50, 0x7c, 0x76, + 0x46, 0x8d, 0x51, 0xcc, 0x39, 0xbc, 0xd3, 0x84, 0x8e, 0x78, 0xcf, 0x3b, + 0xd5, 0xc6, 0xaa, 0x53, 0x9c, 0x65, 0x81, 0x47, 0x80, 0x86, 0x46, 0x85, + 0x3a, 0x46, 0x9a, 0xa2, 0x6a, 0x9e, 0x47, 0x68, 0xd1, 0xa8, 0xb3, 0xa5, + 0xa5, 0x4b, 0x6e, 0xaa, 0xcf, 0x87, 0x96, 0x6f, 0xd9, 0x5b, 0xa4, 0x46, + 0x3a, 0x89, 0x37, 0xaa, 0xda, 0x46, 0x79, 0xa3, 0xcf, 0x7e, 0x67, 0xc7, + 0x5f, 0x4b, 0x86, 0x9a, 0xbd, 0xd8, 0x6c, 0x42, 0xb8, 0x56, 0x7b, 0x87, + 0x2a, 0x33, 0x94, 0x60, 0x9f, 0x71, 0xad, 0xd2, 0x68, 0x8d, 0x66, 0xd4, + 0x79, 0xbe, 0x5f, 0x63, 0x93, 0x8f, 0xac, 0x67, 0x95, 0x6e, 0xaa, 0xd0, + 0x7e, 0x7b, 0x47, 0xba, 0x6a, 0xc8, 0x44, 0x5c, 0x57, 0x5f, 0x95, 0x3b, + 0xbf, 0x98, 0xb0, 0xb6, 0x82, 0xc8, 0x6a, 0x65, 0x95, 0x31, 0x7a, 0x8b, + 0xce, 0xaf, 0xad, 0x54, 0x60, 0x72, 0x72, 0x49, 0x7a, 0x3a, 0x6d, 0x79, + 0x77, 0xcc, 0x9d, 0xa4, 0x5d, 0x68, 0x3b, 0x92, 0xcb, 0x92, 0x82, 0x8d, + 0x75, 0x3f, 0xca, 0xaf, 0x88, 0xa1, 0xcf, 0x52, 0x5e, 0xaf, 0x94, 0xc2, + 0x8e, 0x8d, 0x4f, 0x33, 0x96, 0x42, 0xb8, 0x6f, 0x97, 0xd0, 0x90, 0xc6, + 0xbf, 0x5a, 0x56, 0xa5, 0xc2, 0xbc, 0xc2, 0xcf, 0x7c, 0xb8, 0xa4, 0xca, + 0x57, 0xcd, 0xb1, 0x40, 0x81, 0xb8, 0xc6, 0xce, 0xbe, 0xc7, 0xc7, 0x6b, + 0xa1, 0x67, 0x61, 0x5b, 0x40, 0x4f, 0x34, 0xac, 0xc2, 0x40, 0xa0, 0xac, + 0x84, 0xb2, 0xce, 0x50, 0xa0, 0x9f, 0x32, 0x4d, 0x89, 0x51, 0x33, 0x71, + 0x7a, 0x7b, 0xb2, 0x8b, 0x3f, 0x42, 0xa6, 0x7c, 0x9a, 0xd1, 0x41, 0x8c, + 0xab, 0x48, 0x46, 0x73, 0x3e, 0x9c, 0x6c, 0x95, 0x58, 0x9e, 0x75, 0xa5, + 0x70, 0x42, 0x74, 0x69, 0xd1, 0xb4, 0x41, 0xb9, 0xa4, 0x3b, 0xab, 0x39, + 0x7e, 0x58, 0x8e, 0x38, 0x8d, 0x7d, 0x2a, 0x3d, 0x9e, 0xd0, 0x51, 0xa4, + 0x8d, 0xc6, 0x77, 0xd9, 0x69, 0x2b, 0x86, 0x45, 0xc7, 0x91, 0x53, 0x34, + 0x8c, 0x57, 0x9c, 0xbe, 0x5e, 0xa3, 0x53, 0xa8, 0x7d, 0x3d, 0x5e, 0x59, + 0x8f, 0x6b, 0xa7, 0x9d, 0xbc, 0x85, 0x8e, 0x5d, 0xc5, 0x36, 0x6f, 0x6e, + 0xa4, 0x70, 0xc3, 0x40, 0x8f, 0x8a, 0x99, 0x3c, 0x9a, 0x4e, 0x41, 0xa5, + 0x55, 0xa3, 0x3f, 0x74, 0x4d, 0xb9, 0x72, 0xac, 0x86, 0x66, 0x7a, 0xb6, + 0x7e, 0xa3, 0x81, 0x46, 0x70, 0xc6, 0x4d, 0x41, 0x63, 0x84, 0x85, 0xb5, + 0x76, 0xbc, 0x85, 0x89, 0x9f, 0x9a, 0x4c, 0x95, 0xb1, 0x51, 0x89, 0xbf, + 0x4a, 0x67, 0xb9, 0x34, 0x76, 0x50, 0x4d, 0x82, 0x6f, 0x52, 0x34, 0x4d, + 0x4d, 0x9d, 0xc9, 0xb2, 0x8c, 0xb3, 0x6e, 0x66, 0xaa, 0x7a, 0x7d, 0x58, + 
0x65, 0x59, 0x56, 0x6c, 0xac, 0x62, 0xb5, 0x6f, 0x66, 0x94, 0xb3, 0x41, + 0x39, 0xb0, 0x34, 0x72, 0x93, 0x9d, 0xa8, 0xa3, 0x87, 0x7b, 0xcc, 0xb8, + 0xbf, 0x48, 0xaa, 0x3d, 0xcd, 0x98, 0xb5, 0x51, 0x5d, 0x59, 0x7a, 0x88, + 0x4d, 0x6b, 0x71, 0x48, 0x98, 0x6e, 0xad, 0xd0, 0x8a, 0x55, 0x69, 0x3d, + 0xbe, 0x3f, 0x8a, 0x51, 0x74, 0x69, 0xa5, 0xa6, 0x4e, 0x82, 0x5d, 0x6d, + 0x7f, 0xb6, 0x97, 0xb8, 0xbe, 0x7e, 0x52, 0x5d, 0x9b, 0xc4, 0x39, 0x6a, + 0xc2, 0x71, 0xa5, 0xa8, 0x4b, 0x96, 0x81, 0x31, 0xc6, 0x86, 0x47, 0x51, + 0xc3, 0x8f, 0x7b, 0x9a, 0x6e, 0x7a, 0x9c, 0x6f, 0x80, 0x54, 0x60, 0xa8, + 0xb4, 0xa5, 0x46, 0x88, 0x85, 0xaf, 0x6f, 0xb7, 0x53, 0x8d, 0x6c, 0xb8, + 0xc8, 0xc8, 0xb3, 0x91, 0xd4, 0x7c, 0xa5, 0x74, 0x38, 0xa3, 0x77, 0xb7, + 0x5c, 0x77, 0x4f, 0x7d, 0xaf, 0x75, 0x62, 0x3d, 0x3b, 0x5d, 0x71, 0x6d, + 0x4c, 0x6f, 0x89, 0x7d, 0xc8, 0xbc, 0xd1, 0x68, 0x9e, 0xce, 0x5c, 0x88, + 0x44, 0x6e, 0xa1, 0x4a, 0xa5, 0xa2, 0xc9, 0x43, 0xa7, 0x89, 0x62, 0x6a, + 0xb2, 0xc8, 0x4c, 0x9b, 0x7e, 0xce, 0x81, 0x91, 0x68, 0x6f, 0x3b, 0x7a, + 0xb3, 0xc7, 0xa4, 0x6f, 0x42, 0x92, 0x94, 0x6f, 0x34, 0x46, 0x8b, 0x6c, + 0x8f, 0x62, 0x71, 0xc4, 0xaf, 0x3c, 0x62, 0x98, 0x78, 0xcc, 0x8c, 0x6e, + 0xca, 0x58, 0xc9, 0xbe, 0x33, 0xa8, 0x52, 0xbb, 0x78, 0xcb, 0x6a, 0x9c, + 0x4a, 0x6b, 0x8a, 0xb8, 0x58, 0xd1, 0x77, 0xae, 0x99, 0x9b, 0xb6, 0x63, + 0x87, 0x6a, 0x77, 0x80, 0x42, 0x76, 0x3d, 0x6b, 0x65, 0x4c, 0x7c, 0x57, + 0xb4, 0x37, 0xad, 0xc1, 0x45, 0x77, 0x7d, 0x5a, 0x7f, 0xa4, 0x3b, 0x86, + 0xab, 0x58, 0xaf, 0x91, 0x5c, 0x69, 0x8f, 0x70, 0x8a, 0xc5, 0xae, 0x7d, + 0xba, 0x7d, 0x9e, 0x2e, 0x92, 0x31, 0x5d, 0x7d, 0xb1, 0xab, 0x44, 0x5c, + 0x98, 0x80, 0xd1, 0x3d, 0x8a, 0x52, 0x67, 0xa2, 0x45, 0x96, 0xa9, 0xbc, + 0x55, 0xac, 0x32, 0x8c, 0xd3, 0x77, 0x9b, 0x77, 0x91, 0xc8, 0x6f, 0x60, + 0xc4, 0x90, 0x7d, 0x3d, 0xb5, 0xb0, 0x6d, 0xb6, 0x98, 0x72, 0x4c, 0x55, + 0xb9, 0xb6, 0xc4, 0x79, 0x67, 0x8e, 0x34, 0x7d, 0xcb, 0x3a, 0x2f, 0x6d, + 0xd0, 0x51, 0x35, 0x52, 0x39, 0xa9, 0xbd, 0x7e, 0x87, 0x80, 0x73, 0x8c, + 0xc7, 0xba, 0x64, 0x5c, 0x7b, 0xb9, 0x91, 0x41, 0xcd, 0x96, 0xce, 0x9e, + 0x7e, 0xae, 0x96, 0x77, 0xa3, 0x44, 0x80, 0x6a, 0x65, 0xcf, 0xb5, 0xcd, + 0x4a, 0x7d, 0xbc, 0x58, 0x77, 0xb7, 0x6c, 0x52, 0x3a, 0x89, 0x54, 0xaf, + 0x86, 0xa3, 0x9e, 0x83, 0x39, 0x74, 0x31, 0x35, 0x4f, 0x93, 0xce, 0x52, + 0x9d, 0x46, 0x61, 0x86, 0xce, 0x54, 0x95, 0x62, 0x3e, 0x76, 0x8c, 0x45, + 0x73, 0x43, 0x5e, 0xb1, 0x9d, 0xa5, 0x56, 0x63, 0x84, 0x5f, 0xb9, 0xc5, + 0xbb, 0x5e, 0x35, 0x4a, 0xc9, 0x66, 0x54, 0xae, 0x9d, 0xc3, 0x5a, 0x9d, + 0x90, 0x30, 0x97, 0xbd, 0x65, 0x51, 0xcd, 0x3f, 0xb3, 0x33, 0x94, 0xa7, + 0xc4, 0xc1, 0xba, 0x9e, 0x70, 0xc1, 0x72, 0x8f, 0x60, 0xbf, 0xd2, 0x7d, + 0x72, 0x71, 0xcf, 0xb9, 0x8b, 0x84, 0x54, 0x5d, 0x36, 0x38, 0xc7, 0x67, + 0x51, 0x65, 0x3b, 0x3a, 0x33, 0x9f, 0x79, 0x46, 0x7b, 0x81, 0x88, 0x72, + 0xad, 0x7f, 0x87, 0x7e, 0xaf, 0x4e, 0xd4, 0x4c, 0xaf, 0xba, 0x72, 0xb0, + 0x87, 0x43, 0x6f, 0xcf, 0xba, 0x75, 0x88, 0xc6, 0xc2, 0x4f, 0x76, 0x4c, + 0x5f, 0x7e, 0xa3, 0x54, 0x62, 0xa4, 0x41, 0xbd, 0x4c, 0x4e, 0xb3, 0xb4, + 0x78, 0x6d, 0xcf, 0x67, 0xc5, 0x8b, 0x80, 0xc4, 0xb4, 0xd3, 0x30, 0x94, + 0xa6, 0x4d, 0x6f, 0x7e, 0xc7, 0x48, 0x88, 0x87, 0xb7, 0x64, 0x3e, 0x86, + 0xac, 0x54, 0x67, 0x8d, 0x45, 0x5c, 0xb3, 0xd0, 0x85, 0x4f, 0x9e, 0x53, + 0x98, 0xb7, 0xb3, 0xbe, 0xb8, 0x83, 0x99, 0xcd, 0xb6, 0xae, 0xc2, 0x9b, + 0x9c, 0x37, 0xc7, 0x52, 0xb4, 0x93, 0xa1, 0xad, 0x5e, 0x52, 0x5d, 0x41, + 0x91, 0x8f, 0x3a, 0x61, 0xa6, 0xb0, 0xb6, 0x4f, 0x85, 0x54, 0x67, 0xa9, + 0xc3, 0x88, 0x67, 0x69, 0xa4, 0x9c, 0x8f, 0x7e, 0x77, 0x8b, 0x71, 0x6a, + 
0xa0, 0xcd, 0xd3, 0x7d, 0x43, 0xde, 0xb2, 0xe0, 0x94, 0x71, 0xaf, 0x9f, + 0x5b, 0x9a, 0x34, 0xcc, 0xa8, 0x28, 0xc6, 0x3a, 0x62, 0xbe, 0x6a, 0x29, + 0x6b, 0x74, 0xca, 0xd5, 0x5b, 0x8f, 0x60, 0x48, 0x8b, 0xe5, 0x54, 0x64, + 0xc6, 0x33, 0x9f, 0x41, 0xc7, 0x96, 0x33, 0xca, 0x30, 0xba, 0x78, 0x86, + 0x6a, 0xd1, 0x5a, 0xa1, 0xaa, 0x51, 0x54, 0xcd, 0xa5, 0x56, 0xbe, 0xa4, + 0x2c, 0x78, 0x32, 0x3d, 0x60, 0xca, 0x5d, 0xb1, 0xaa, 0x7c, 0x2d, 0x2c, + 0x58, 0xa6, 0xb1, 0x7e, 0x45, 0x4c, 0x6f, 0x95, 0x69, 0xc9, 0x98, 0xc2, + 0xa5, 0xc7, 0x44, 0x9d, 0x65, 0x72, 0x41, 0x50, 0x4e, 0x9d, 0x8b, 0x6d, + 0x4d, 0xa2, 0x5c, 0x77, 0x99, 0xf5, 0x88, 0x6a, 0x9c, 0x2d, 0x63, 0xc3, + 0x83, 0x63, 0x39, 0xc2, 0x31, 0x83, 0x31, 0xa5, 0x39, 0x85, 0x22, 0xa8, + 0x80, 0x9e, 0xc6, 0xc0, 0x87, 0x73, 0xbd, 0x45, 0x41, 0x4c, 0xa0, 0x4f, + 0x80, 0x4b, 0x62, 0xd2, 0xb9, 0x44, 0x66, 0x80, 0x93, 0xc6, 0x70, 0x80, + 0x38, 0x7a, 0x84, 0x9c, 0x75, 0xa3, 0x67, 0x88, 0xb9, 0x8d, 0xad, 0x5e, + 0x78, 0xc6, 0x89, 0xad, 0x89, 0x7e, 0x42, 0x5e, 0x7e, 0x5b, 0x3e, 0xaa, + 0xa9, 0x37, 0x8d, 0x9e, 0xa7, 0xb9, 0x56, 0x7c, 0x64, 0xae, 0x55, 0x6d, + 0xae, 0xc8, 0x91, 0x94, 0x84, 0x96, 0xd2, 0x6f, 0x69, 0xcd, 0xaa, 0xa8, + 0x52, 0x99, 0xc0, 0xd0, 0x68, 0x3c, 0xa8, 0x4e, 0x96, 0xaf, 0x2e, 0xba, + 0xad, 0xab, 0x61, 0x80, 0xb4, 0xc8, 0x8e, 0x9d, 0x99, 0x99, 0x9e, 0xba, + 0x74, 0x9e, 0xa3, 0x58, 0xa8, 0xce, 0x91, 0xbe, 0x67, 0xbc, 0xa1, 0x35, + 0x7a, 0xa9, 0x7a, 0xa3, 0xb5, 0xb5, 0x58, 0x85, 0x8e, 0xa7, 0xa0, 0x7a, + 0x75, 0xcc, 0x82, 0xb6, 0x68, 0x5c, 0xa5, 0xb5, 0x2b, 0xb4, 0x9f, 0x37, + 0xbd, 0xc5, 0x4f, 0x39, 0x90, 0x78, 0x8a, 0x71, 0x6d, 0x88, 0x86, 0x4e, + 0xb2, 0x49, 0x62, 0xa7, 0x7d, 0x3d, 0x9f, 0x95, 0xb7, 0x5d, 0x45, 0xc4, + 0xbf, 0x85, 0x68, 0x96, 0xc6, 0x7a, 0x92, 0x76, 0x68, 0x70, 0x4a, 0x3f, + 0x4e, 0x83, 0x58, 0x80, 0x36, 0x59, 0xb8, 0x6d, 0x6e, 0xc2, 0x6d, 0x65, + 0x5c, 0x89, 0x9c, 0xc9, 0x7d, 0x68, 0xd2, 0x46, 0x5e, 0x8e, 0x73, 0x66, + 0x9f, 0x51, 0x40, 0xc3, 0x94, 0xc3, 0xc1, 0x40, 0x69, 0x27, 0x5d, 0xbb, + 0x78, 0xb0, 0x69, 0x8b, 0x67, 0x44, 0x85, 0x5b, 0x71, 0x94, 0xaa, 0x57, + 0x57, 0xb2, 0x7c, 0x96, 0xbf, 0xa8, 0x7f, 0x70, 0xbf, 0x69, 0xa6, 0xb7, + 0x2e, 0x8c, 0x52, 0x75, 0xa2, 0x67, 0x64, 0xbe, 0x46, 0x58, 0x80, 0xb9, + 0x4d, 0x82, 0xc8, 0x49, 0x56, 0xbb, 0xbb, 0xbe, 0x58, 0x64, 0x3b, 0x74, + 0x2c, 0x37, 0x9b, 0xab, 0x77, 0xd3, 0x91, 0x9f, 0xc9, 0x45, 0x61, 0xa1, + 0xad, 0x6c, 0x92, 0x4d, 0x72, 0xcf, 0x74, 0x4d, 0x98, 0x76, 0xac, 0xae, + 0x5a, 0x8a, 0x3d, 0x4d, 0x9e, 0x57, 0x90, 0x55, 0xb7, 0xbc, 0x78, 0x3a, + 0x9e, 0x46, 0xc6, 0x52, 0x8f, 0xb5, 0x90, 0x93, 0xc2, 0x51, 0xa6, 0xc8, + 0x7f, 0x80, 0x5c, 0x69, 0x99, 0x9c, 0x85, 0x7f, 0xc6, 0x7a, 0x67, 0x97, + 0x68, 0x6c, 0xba, 0x58, 0x54, 0x8a, 0xd2, 0xd1, 0x5c, 0x69, 0xc9, 0x5c, + 0xd4, 0x48, 0x41, 0xc8, 0x7c, 0x64, 0x32, 0x4c, 0x75, 0xac, 0x78, 0xbd, + 0xc9, 0x83, 0xc3, 0xa5, 0xa4, 0x38, 0x82, 0x5f, 0x74, 0xcf, 0x37, 0x87, + 0x42, 0xa8, 0x8f, 0xbc, 0x5d, 0x44, 0xb2, 0x54, 0x6b, 0x7d, 0x4b, 0x65, + 0x71, 0x9c, 0x80, 0x90, 0x59, 0x70, 0xa5, 0x64, 0xcc, 0x6e, 0xba, 0x68, + 0x79, 0x7b, 0xbc, 0x75, 0xbc, 0xc5, 0x77, 0x51, 0x6a, 0x8c, 0x79, 0xb5, + 0x95, 0x44, 0xaf, 0x94, 0x96, 0x2d, 0x76, 0x6d, 0x90, 0x83, 0xb1, 0x49, + 0x56, 0x36, 0x4a, 0x8f, 0x6f, 0x3c, 0x6a, 0xb1, 0x3a, 0x7d, 0xcc, 0x6a, + 0xaa, 0xbd, 0x44, 0x7e, 0xa0, 0xb7, 0x6d, 0xbe, 0x9c, 0x34, 0xa6, 0x9f, + 0xa1, 0xd3, 0x9c, 0x9a, 0xac, 0x37, 0x63, 0xc9, 0xab, 0x34, 0x8d, 0x99, + 0x50, 0xc5, 0x55, 0x38, 0xac, 0xa6, 0xd4, 0xcb, 0x7a, 0xba, 0x6b, 0xd7, + 0xac, 0x41, 0x4c, 0x5b, 0x57, 0x4b, 0xc8, 0x52, 0x74, 0xce, 0x37, 0x5c, + 
0x80, 0xbb, 0x5e, 0xb4, 0x97, 0xc2, 0x4a, 0x63, 0x57, 0x65, 0x97, 0x57, + 0x8f, 0xd6, 0x53, 0x46, 0x6a, 0x77, 0xa9, 0x5d, 0xc3, 0xa3, 0x5e, 0xda, + 0x9e, 0x9b, 0xcb, 0x4d, 0x6c, 0xcd, 0x6f, 0x3e, 0x7f, 0x48, 0x76, 0x87, + 0x8b, 0x64, 0x93, 0xb7, 0xbd, 0x66, 0x93, 0xb1, 0x3e, 0x5f, 0x67, 0x35, + 0x48, 0x87, 0xc4, 0x71, 0x3f, 0xc9, 0x5f, 0x42, 0x37, 0x89, 0x7a, 0x98, + 0xb0, 0x68, 0x66, 0x84, 0x8d, 0xbe, 0x7e, 0x56, 0x3b, 0x3d, 0x4e, 0x9a, + 0x66, 0x53, 0x38, 0xa5, 0x8d, 0xa2, 0x32, 0x71, 0x7e, 0x2f, 0xa3, 0xcb, + 0x8c, 0x47, 0x8c, 0x4a, 0xaf, 0xbf, 0x5a, 0x78, 0xa2, 0x3b, 0xb2, 0x47, + 0x8b, 0x89, 0x92, 0x84, 0x98, 0x3c, 0xb2, 0xa2, 0x67, 0x99, 0x9e, 0x61, + 0x38, 0x89, 0x4d, 0x53, 0x7c, 0xc4, 0xdb, 0x97, 0x2a, 0x83, 0x4d, 0x32, + 0x9b, 0xc5, 0x7b, 0x95, 0x9d, 0x23, 0x63, 0x51, 0x94, 0x6a, 0x5e, 0xc3, + 0xbc, 0xd6, 0xa0, 0xdc, 0xcd, 0x99, 0x75, 0x54, 0x75, 0xa1, 0xc2, 0xa1, + 0xc3, 0x65, 0x8c, 0x70, 0x7b, 0x3b, 0x2a, 0x4d, 0x52, 0xab, 0xbf, 0x82, + 0x90, 0xa9, 0x5a, 0x54, 0x9d, 0x74, 0x75, 0xdc, 0xa0, 0xd0, 0xa4, 0x80, + 0x27, 0x83, 0x51, 0x52, 0x33, 0x4a, 0xb3, 0xd4, 0x90, 0x54, 0x5e, 0xc6, + 0xd1, 0x88, 0x44, 0xcd, 0x5f, 0x7e, 0xc1, 0x30, 0x6c, 0x95, 0xb9, 0x53, + 0x5d, 0x9f, 0xb0, 0x85, 0x95, 0x6d, 0x90, 0x9c, 0x76, 0x92, 0x6f, 0x9f, + 0x42, 0x51, 0xb6, 0x9a, 0x6c, 0x81, 0x39, 0x4c, 0x91, 0x46, 0x51, 0x6e, + 0x3b, 0x8f, 0x8a, 0xbe, 0x76, 0x8a, 0x8a, 0xce, 0x64, 0x86, 0xb9, 0x8e, + 0xc9, 0x65, 0xce, 0x51, 0x4e, 0xca, 0x5a, 0x70, 0x62, 0x54, 0x3f, 0x3a, + 0x31, 0x80, 0x5c, 0x68, 0x69, 0xc5, 0x9c, 0x4d, 0xce, 0xaf, 0x59, 0x97, + 0x5e, 0x65, 0x78, 0x97, 0x79, 0x49, 0xa1, 0x80, 0x97, 0xa2, 0x98, 0x34, + 0x7b, 0xc1, 0x43, 0x5c, 0x46, 0x70, 0xcf, 0x96, 0x96, 0x46, 0x71, 0x85, + 0x42, 0x9d, 0x61, 0x4f, 0x92, 0x37, 0x46, 0xbc, 0x8a, 0x9a, 0x42, 0x99, + 0x8a, 0x4f, 0x7c, 0x41, 0xc1, 0xbe, 0x41, 0xbf, 0xb4, 0x87, 0x60, 0x4a, + 0x52, 0xc0, 0xa7, 0x3d, 0xbc, 0x44, 0x45, 0xb4, 0xd0, 0x54, 0xb8, 0x41, + 0x70, 0x71, 0xa7, 0x5d, 0x6a, 0xad, 0x33, 0x5c, 0x4a, 0x4d, 0x4c, 0xa3, + 0xb5, 0x78, 0x2a, 0xc5, 0xa7, 0x98, 0xc3, 0x76, 0xa3, 0x99, 0xc6, 0x77, + 0xb9, 0x3a, 0x36, 0xc2, 0x9f, 0xb0, 0xb2, 0x7a, 0xb8, 0x66, 0x78, 0x8c, + 0x8c, 0x62, 0x96, 0xc4, 0xc2, 0x46, 0x50, 0x5c, 0xac, 0x4f, 0xb9, 0x45, + 0x6c, 0x55, 0x4e, 0x9e, 0x41, 0x7e, 0x63, 0x40, 0x99, 0x49, 0x76, 0x65, + 0xb0, 0x76, 0xa4, 0x4b, 0x42, 0x40, 0xad, 0x8b, 0x64, 0x4f, 0x39, 0xcf, + 0x34, 0xaf, 0xab, 0x80, 0xbc, 0xa7, 0xaa, 0x40, 0x7c, 0x47, 0xce, 0xb7, + 0xbf, 0x85, 0xbb, 0x48, 0x96, 0x8f, 0x63, 0x50, 0xa7, 0x57, 0x65, 0x7e, + 0x4a, 0xb6, 0x75, 0x55, 0x78, 0x60, 0x39, 0x55, 0x40, 0x59, 0xae, 0x6a, + 0x99, 0x9c, 0xa9, 0x79, 0xa1, 0x6f, 0x42, 0x4a, 0x85, 0xb8, 0x66, 0x9d, + 0x4c, 0x3a, 0x6f, 0x40, 0xa9, 0xa9, 0x67, 0x62, 0x7f, 0x47, 0x6e, 0xc7, + 0x6a, 0xa5, 0xad, 0x78, 0x60, 0x55, 0x35, 0x88, 0x45, 0x77, 0x42, 0xb3, + 0xc5, 0xbc, 0x75, 0x6e, 0x96, 0x4e, 0x33, 0x9b, 0x71, 0x8c, 0x8f, 0xc4, + 0x93, 0x7d, 0x8d, 0xc8, 0x4a, 0xa3, 0xb6, 0x35, 0xb9, 0xa1, 0x65, 0xbf, + 0x77, 0xc0, 0x50, 0x91, 0x41, 0x38, 0xae, 0x58, 0x3f, 0xac, 0xce, 0xcb, + 0xc4, 0x40, 0xc1, 0x75, 0xa8, 0x7b, 0x34, 0x3c, 0x7c, 0x92, 0x68, 0x45, + 0x5a, 0x57, 0xbc, 0xa2, 0x78, 0x3f, 0xa7, 0x37, 0xc2, 0x7c, 0x88, 0x46, + 0x53, 0x6d, 0x77, 0xcb, 0x81, 0x39, 0xc6, 0xba, 0x73, 0x89, 0xc6, 0x56, + 0xb0, 0x56, 0x99, 0x71, 0x5d, 0x51, 0xb3, 0xb0, 0xae, 0x94, 0x68, 0xb2, + 0x3b, 0xba, 0xb8, 0xba, 0x5e, 0xc9, 0xa8, 0x46, 0xbf, 0x84, 0xbd, 0xbd, + 0xb9, 0x8c, 0x50, 0x45, 0xa9, 0x7f, 0x56, 0xb8, 0x6e, 0x57, 0x4a, 0x3c, + 0xc6, 0x6d, 0x96, 0x77, 0x6b, 0x6f, 0x3b, 0x38, 0xce, 0x7b, 0x42, 0x67, + 
0x66, 0x66, 0x33, 0x5a, 0xc1, 0xa4, 0xa1, 0x92, 0x72, 0xaf, 0xcb, 0xbf, + 0x83, 0x43, 0x8b, 0x3b, 0x88, 0x62, 0xc3, 0x4c, 0x4f, 0x77, 0xc7, 0x98, + 0x8d, 0x3d, 0xc8, 0xad, 0xac, 0xcc, 0xd1, 0x73, 0x56, 0xa1, 0x56, 0xb2, + 0x62, 0x35, 0x77, 0xb8, 0x98, 0x9d, 0xa2, 0x55, 0x81, 0xb4, 0xc6, 0xbe, + 0x5e, 0x37, 0x5e, 0x5c, 0x53, 0x35, 0x99, 0x55, 0xbd, 0x8f, 0x70, 0x9d, + 0x66, 0x48, 0x89, 0x89, 0x69, 0x9d, 0x87, 0x73, 0x90, 0xa7, 0x95, 0xc2, + 0x91, 0xbd, 0x9e, 0xac, 0xbd, 0x9e, 0x80, 0xaa, 0x8d, 0x78, 0x58, 0x4a, + 0xb1, 0x52, 0x40, 0x78, 0x35, 0x5c, 0xa1, 0x40, 0x78, 0xb2, 0x9a, 0xa6, + 0x45, 0xc6, 0x7b, 0x92, 0x40, 0xc1, 0x32, 0x42, 0x52, 0x51, 0x73, 0xb8, + 0x79, 0x76, 0x6f, 0x7f, 0xa1, 0xc8, 0x70, 0x3c, 0x84, 0x94, 0x9c, 0x30, + 0x43, 0x8e, 0x66, 0x5d, 0x4f, 0xa1, 0x7c, 0xc3, 0xaf, 0xca, 0x4a, 0x5f, + 0x52, 0xd1, 0x90, 0x58, 0x33, 0x90, 0x4b, 0x78, 0xca, 0x5c, 0xb8, 0xa9, + 0x45, 0xb3, 0x87, 0x7d, 0x34, 0x8d, 0xd0, 0x49, 0x5d, 0xa8, 0xcd, 0xbc, + 0x65, 0x7b, 0x66, 0xa3, 0xa8, 0x39, 0xad, 0x6a, 0x52, 0x96, 0x77, 0xac, + 0xbe, 0x4b, 0x2f, 0x6a, 0x3f, 0x3c, 0xc0, 0x6e, 0x48, 0x36, 0xb1, 0xcc, + 0xa0, 0x70, 0x35, 0x37, 0x9f, 0xa8, 0x34, 0xb4, 0x64, 0x3c, 0x6d, 0x9f, + 0xa8, 0x96, 0x94, 0xce, 0xc6, 0x47, 0x37, 0x53, 0xc5, 0x54, 0x6e, 0x7a, + 0x7c, 0xc1, 0xb5, 0x62, 0x98, 0xb4, 0xa7, 0x47, 0xba, 0x5b, 0x40, 0x53, + 0xae, 0x9f, 0x35, 0xd1, 0x34, 0xc3, 0xc7, 0x5a, 0x8f, 0x38, 0x46, 0xa8, + 0xce, 0x66, 0xbf, 0x41, 0x5e, 0x63, 0x54, 0x3e, 0x80, 0x41, 0x79, 0x8c, + 0xc1, 0x49, 0x63, 0xbf, 0x66, 0x70, 0x6a, 0x7e, 0xc9, 0xa0, 0x82, 0x42, + 0x50, 0x86, 0x72, 0xce, 0x33, 0xa4, 0xa5, 0x40, 0x79, 0xbf, 0x53, 0x38, + 0x68, 0x91, 0x59, 0x9b, 0xc7, 0xb5, 0xbc, 0x9b, 0x86, 0x5e, 0xb7, 0x4d, + 0xa6, 0x44, 0x91, 0x71, 0x8f, 0x49, 0x39, 0x8f, 0xa0, 0x6b, 0x6a, 0xb8, + 0xa8, 0xd0, 0xb6, 0x83, 0x67, 0x47, 0xac, 0x42, 0x43, 0x92, 0xce, 0x3d, + 0xa5, 0xbc, 0x3f, 0x7b, 0x7e, 0x48, 0xbf, 0x9a, 0x5d, 0x46, 0x4d, 0x35, + 0xba, 0x3e, 0x8d, 0x2f, 0x67, 0xb4, 0x9a, 0x96, 0x63, 0x7e, 0x7a, 0xd1, + 0xb7, 0x60, 0x76, 0xb3, 0x36, 0xc6, 0x81, 0x44, 0x56, 0x7f, 0x4f, 0xa8, + 0xbb, 0xa9, 0xad, 0x85, 0xbb, 0xc0, 0x96, 0x66, 0x8e, 0xa2, 0xb1, 0x35, + 0x34, 0x6b, 0x5a, 0x43, 0x73, 0x49, 0x70, 0x9f, 0x85, 0x3e, 0x6b, 0x78, + 0x62, 0x4c, 0x49, 0x39, 0xb6, 0x89, 0x44, 0x7b, 0xbe, 0x87, 0x34, 0x49, + 0x96, 0xa3, 0x7f, 0xc8, 0x46, 0xb8, 0x65, 0xb8, 0x49, 0x47, 0xc3, 0xc3, + 0x3c, 0xa0, 0x5f, 0x81, 0x90, 0x9f, 0x90, 0xd7, 0x8a, 0x54, 0x3a, 0x9f, + 0x51, 0xa4, 0x62, 0xb6, 0x61, 0xb0, 0xcc, 0x79, 0xbc, 0x45, 0x2e, 0x7c, + 0x59, 0x9e, 0x48, 0xc3, 0xd3, 0x8d, 0xaf, 0x84, 0xac, 0x37, 0x44, 0x91, + 0xa5, 0xa1, 0xa8, 0xad, 0x93, 0x37, 0xb2, 0x7b, 0xc6, 0x8c, 0x5a, 0x4a, + 0x4c, 0xa0, 0x9d, 0x48, 0x6c, 0xac, 0x54, 0x9d, 0x64, 0x57, 0x6f, 0x7e, + 0x91, 0xa7, 0xc0, 0x5c, 0x32, 0x7c, 0x72, 0x5d, 0x9d, 0x4b, 0xbc, 0xb8, + 0xa6, 0x6c, 0x50, 0x6d, 0x69, 0xb6, 0x78, 0x69, 0x46, 0x4d, 0x3d, 0xae, + 0x75, 0x44, 0x33, 0x82, 0xc9, 0x87, 0x40, 0xb2, 0x55, 0xb1, 0x95, 0x58, + 0x36, 0xb2, 0x36, 0x7c, 0x3d, 0xb6, 0x5c, 0x52, 0xa0, 0xc9, 0x56, 0x2c, + 0x62, 0xaa, 0x4a, 0xb5, 0xc9, 0x69, 0x4e, 0x45, 0x51, 0x62, 0xb6, 0x70, + 0x73, 0x6a, 0x3e, 0xbd, 0x56, 0x78, 0xc1, 0xc2, 0xa0, 0xc0, 0xd7, 0x6a, + 0x4f, 0x99, 0xc9, 0x36, 0x6d, 0x67, 0x47, 0x7a, 0x35, 0xd1, 0x5e, 0x8e, + 0x91, 0x85, 0x73, 0x78, 0x64, 0x41, 0x8a, 0x92, 0x7c, 0x31, 0x39, 0xc1, + 0x56, 0x3f, 0x8a, 0xa2, 0xc9, 0xbf, 0x43, 0x6d, 0x92, 0x98, 0x5a, 0x8c, + 0xaf, 0x2c, 0xb6, 0x95, 0x3f, 0x6f, 0x98, 0x84, 0x73, 0x7c, 0x77, 0x86, + 0x87, 0xb2, 0x3f, 0x9b, 0x66, 0x73, 0x5e, 0x8b, 0x8e, 0x90, 0x99, 0xa5, + 
0x5a, 0x8b, 0xc9, 0x81, 0xb3, 0xa2, 0xaa, 0xa6, 0x44, 0x8e, 0x43, 0x7f, + 0x56, 0x3d, 0x88, 0x60, 0xcd, 0x44, 0x7b, 0x8c, 0xa2, 0x7d, 0x62, 0x4d, + 0x3b, 0xb4, 0xbf, 0x9c, 0x61, 0x94, 0xc0, 0x89, 0x49, 0x2f, 0xa1, 0x40, + 0x5c, 0x55, 0x3d, 0x6e, 0x66, 0x90, 0x71, 0x60, 0xc5, 0x4e, 0x56, 0x3d, + 0x8b, 0x57, 0x6a, 0x86, 0x50, 0x41, 0x9f, 0x58, 0x7a, 0x52, 0xba, 0x8d, + 0xdb, 0x63, 0xb8, 0xab, 0x4e, 0x74, 0x51, 0xcd, 0x49, 0x4d, 0xc6, 0xa9, + 0x99, 0xc2, 0xa2, 0x41, 0x4f, 0x3b, 0xb6, 0x7b, 0xac, 0xac, 0xdc, 0x4c, + 0x7d, 0xc6, 0x82, 0x69, 0x9f, 0xa0, 0x89, 0x4b, 0xbe, 0x43, 0x97, 0x8f, + 0x8f, 0x82, 0x9e, 0x59, 0x66, 0x3b, 0x8e, 0x96, 0x3e, 0x75, 0xce, 0x5e, + 0x44, 0xa2, 0xaa, 0x5f, 0xca, 0x4b, 0xbd, 0xad, 0xae, 0xb0, 0x71, 0x38, + 0xcf, 0xcf, 0x45, 0x32, 0x4c, 0x6d, 0x56, 0xb7, 0xad, 0xca, 0x80, 0x57, + 0x49, 0x73, 0x63, 0xb4, 0xb7, 0x67, 0x30, 0x88, 0x68, 0x66, 0x79, 0x95, + 0x5e, 0x39, 0x54, 0x9d, 0x72, 0x7f, 0x46, 0x81, 0xba, 0x7a, 0xac, 0x7a, + 0x90, 0x6a, 0x52, 0x62, 0x24, 0x56, 0xca, 0x55, 0xa5, 0x77, 0x80, 0x31, + 0x3f, 0x38, 0x6a, 0x7c, 0x93, 0x72, 0x3d, 0xc0, 0xc1, 0xb2, 0x5f, 0x5a, + 0xa1, 0x60, 0x4d, 0xa7, 0x8f, 0x6f, 0x45, 0xc5, 0x9f, 0x7b, 0x30, 0xbb, + 0xc5, 0x83, 0x56, 0xbf, 0x61, 0x58, 0xa2, 0x91, 0x5a, 0x6c, 0xcd, 0xa2, + 0x39, 0x77, 0xb6, 0x6d, 0x4e, 0xd2, 0x71, 0xb5, 0x7d, 0x8b, 0xc4, 0x80, + 0x9b, 0x40, 0xc0, 0x6b, 0xb5, 0xaa, 0xb1, 0x9c, 0x9d, 0xca, 0x7d, 0x79, + 0xcc, 0x95, 0x8b, 0x8f, 0x80, 0x95, 0x5b, 0x6a, 0x8a, 0x8f, 0xa3, 0xbc, + 0xa7, 0x6f, 0xb8, 0x81, 0x6e, 0x7e, 0x5f, 0x62, 0x94, 0x3e, 0x74, 0x52, + 0x9f, 0xc9, 0xcd, 0xce, 0xb7, 0x54, 0xbc, 0xbd, 0x62, 0x38, 0x93, 0xcc, + 0xa4, 0xbc, 0x8b, 0x63, 0x9b, 0xbf, 0x74, 0x3a, 0x47, 0x3e, 0x9c, 0x91, + 0xbd, 0x69, 0xbd, 0x7d, 0xad, 0x30, 0xbd, 0xcb, 0x8c, 0xb9, 0x41, 0xa5, + 0x2b, 0x4b, 0x8f, 0x8e, 0xbc, 0x78, 0x90, 0x5c, 0xc6, 0x9c, 0x86, 0x3c, + 0xce, 0x45, 0x54, 0x4d, 0xa3, 0x9e, 0x6b, 0x99, 0x61, 0x64, 0xa1, 0x5e, + 0x51, 0x9a, 0xbd, 0x6c, 0x3f, 0xc6, 0x82, 0x93, 0x4c, 0x5c, 0x3c, 0x42, + 0x40, 0x3e, 0x5e, 0x92, 0x41, 0x94, 0xd1, 0x40, 0x51, 0x82, 0x70, 0x55, + 0x55, 0xd3, 0x5e, 0x89, 0x68, 0x48, 0xad, 0xaa, 0xc8, 0xcb, 0x54, 0xd3, + 0x49, 0x7d, 0x57, 0xbd, 0x50, 0x99, 0x46, 0x35, 0x7e, 0x3f, 0xc7, 0x83, + 0x70, 0x9e, 0x5a, 0x3e, 0xc1, 0x8b, 0x92, 0xa7, 0xc6, 0x65, 0x8d, 0x3f, + 0x89, 0x3b, 0x5a, 0xa6, 0xbf, 0x9b, 0xad, 0x90, 0xb1, 0x3b, 0x79, 0xaa, + 0x6f, 0xc2, 0x76, 0x8d, 0x55, 0x6c, 0xb1, 0x51, 0x32, 0x84, 0xce, 0x96, + 0x2f, 0xb8, 0xd2, 0x6e, 0x3c, 0xa8, 0x66, 0xb5, 0x84, 0x78, 0xa0, 0xbc, + 0xb1, 0x6c, 0x36, 0x7b, 0xa4, 0xa9, 0xa9, 0xca, 0x49, 0xd8, 0x73, 0x9f, + 0x2f, 0xcb, 0x9d, 0x34, 0x40, 0x44, 0x44, 0x3b, 0x8c, 0xd8, 0xcb, 0xca, + 0x3f, 0xcb, 0xd2, 0x51, 0xbb, 0x7b, 0xca, 0x5c, 0x7f, 0x79, 0x38, 0x36, + 0x90, 0xb0, 0x46, 0xa1, 0x90, 0xb5, 0x7f, 0x91, 0xa6, 0x9e, 0x90, 0x81, + 0x3d, 0x71, 0xc9, 0xbe, 0xac, 0x41, 0xb3, 0xc8, 0xac, 0x57, 0xd5, 0x8f, + 0x33, 0x89, 0x61, 0xca, 0xce, 0x74, 0xc9, 0x58, 0xb0, 0x96, 0x58, 0x61, + 0x5d, 0xb7, 0x8c, 0xb5, 0xa7, 0x3f, 0x61, 0x57, 0x49, 0x7b, 0x85, 0x4e, + 0x94, 0x77, 0x7b, 0x8f, 0x93, 0xd4, 0x91, 0x5d, 0x2e, 0xac, 0x6b, 0x32, + 0x50, 0x93, 0x6d, 0x5b, 0xc8, 0xce, 0x60, 0x8e, 0x9f, 0x7b, 0xab, 0x83, + 0xae, 0xb7, 0x47, 0x95, 0x9e, 0x9b, 0x3d, 0xc2, 0xd4, 0x92, 0xcb, 0x48, + 0x77, 0xa3, 0x94, 0x57, 0xce, 0x8f, 0xb7, 0xb4, 0x77, 0xa0, 0x66, 0x30, + 0x77, 0x4e, 0x98, 0x51, 0xaf, 0x99, 0x8b, 0xaf, 0x6f, 0x8b, 0xbd, 0xb5, + 0x79, 0x82, 0x8f, 0x7e, 0xc2, 0xa9, 0xbf, 0x3e, 0xa9, 0xc2, 0x63, 0x86, + 0xac, 0x54, 0xd3, 0xae, 0xac, 0x5d, 0x62, 0x55, 0x5e, 0xa6, 0x42, 0x9f, + 
0xc1, 0x89, 0x8b, 0x98, 0x92, 0xcd, 0x54, 0x82, 0xab, 0x42, 0xc6, 0x81, + 0x70, 0xa6, 0xb1, 0x82, 0x6a, 0x7c, 0x48, 0x71, 0x37, 0xae, 0x4a, 0x9c, + 0x42, 0x4c, 0xab, 0x79, 0x60, 0xae, 0x94, 0x9b, 0xb0, 0xa6, 0xbb, 0x3b, + 0x6c, 0x76, 0xb6, 0x95, 0xcc, 0xd1, 0x38, 0xcf, 0xc0, 0x2d, 0xa7, 0x5a, + 0xa5, 0x68, 0x77, 0x41, 0xcd, 0x90, 0x8b, 0x40, 0xab, 0x3a, 0x54, 0x5f, + 0xa0, 0x59, 0xd1, 0x41, 0x6b, 0x81, 0xa5, 0x41, 0x73, 0x38, 0x94, 0x86, + 0x7a, 0x7e, 0xaf, 0x5b, 0x49, 0x6c, 0x45, 0x98, 0xcb, 0x62, 0x32, 0x35, + 0x86, 0x52, 0x74, 0xa6, 0xd0, 0x52, 0xba, 0x31, 0x73, 0x6d, 0xa1, 0x58, + 0xa5, 0x3e, 0x97, 0xbf, 0xbf, 0xa1, 0x53, 0x71, 0x7a, 0x54, 0x9d, 0x6c, + 0x41, 0x59, 0xb8, 0xcc, 0xc3, 0xbe, 0x5d, 0x35, 0x71, 0x3d, 0xae, 0xbb, + 0x4d, 0x4f, 0x76, 0x57, 0x5c, 0xa4, 0x6c, 0xc1, 0x51, 0xc6, 0xd1, 0x60, + 0x59, 0xbb, 0x7e, 0xbe, 0x88, 0x37, 0x53, 0x70, 0x61, 0x8d, 0xae, 0x85, + 0x43, 0x3a, 0x65, 0x69, 0xd2, 0x58, 0xac, 0x57, 0x42, 0x89, 0x54, 0x6c, + 0xc0, 0x6f, 0x79, 0x82, 0x3b, 0xa2, 0xa4, 0xc8, 0x70, 0x86, 0xc2, 0x45, + 0xbe, 0x3b, 0x87, 0xc1, 0x8d, 0xa2, 0x55, 0x40, 0xc4, 0x39, 0x78, 0x3e, + 0x81, 0x40, 0x5c, 0x48, 0x7a, 0x47, 0xb6, 0xb5, 0x9c, 0xb8, 0x56, 0x9d, + 0x44, 0xc8, 0x69, 0xa8, 0x5d, 0xab, 0x79, 0x87, 0x58, 0xa6, 0xbf, 0x71, + 0xa9, 0x86, 0x3b, 0x8f, 0x32, 0x6a, 0x37, 0x8f, 0x4e, 0x77, 0x6b, 0x9a, + 0x5a, 0x5f, 0x59, 0xb4, 0x67, 0x4f, 0x7e, 0x78, 0x9d, 0x5f, 0x44, 0x31, + 0x73, 0x5e, 0xaf, 0xb9, 0xac, 0x63, 0xad, 0x89, 0x48, 0x5a, 0x81, 0x35, + 0x73, 0x6f, 0x4b, 0x3a, 0xb6, 0x6b, 0x71, 0x5c, 0xd0, 0xa5, 0x6b, 0xb2, + 0x9f, 0x9e, 0x99, 0xc2, 0xc1, 0x58, 0x67, 0xb3, 0xbb, 0xb8, 0x5e, 0xbc, + 0x64, 0x59, 0x6a, 0x96, 0x68, 0x88, 0xa3, 0xb0, 0x76, 0xc2, 0x6a, 0x80, + 0x4f, 0xd1, 0x38, 0x70, 0x76, 0x91, 0x3c, 0x4b, 0x5f, 0x49, 0x94, 0xb8, + 0x88, 0x5a, 0x98, 0x97, 0x6c, 0xa3, 0x46, 0x4a, 0xab, 0x9c, 0x75, 0x70, + 0x97, 0xa9, 0x3d, 0x3b, 0x36, 0x38, 0x32, 0xa3, 0x47, 0x6a, 0x8f, 0x8f, + 0x49, 0xa2, 0xab, 0x3e, 0xa6, 0xa7, 0x4c, 0x59, 0x4a, 0x48, 0x47, 0x4f, + 0x87, 0x3c, 0xbb, 0xbb, 0xbb, 0x4b, 0xc1, 0x40, 0x73, 0x87, 0xa9, 0xca, + 0x6c, 0x3d, 0xc0, 0x57, 0xbf, 0xc9, 0xc7, 0x52, 0x85, 0x39, 0x97, 0x3d, + 0x48, 0x50, 0x4d, 0x6e, 0x6e, 0x8e, 0x94, 0x43, 0xb6, 0x88, 0x31, 0x62, + 0xbd, 0x8a, 0x3f, 0x76, 0xc3, 0xb4, 0xd1, 0x86, 0x34, 0xbd, 0x8d, 0xb8, + 0x5d, 0x5f, 0x96, 0x50, 0x3a, 0x6d, 0x4a, 0x4e, 0xae, 0x71, 0xbe, 0x82, + 0x49, 0xc5, 0x38, 0x4e, 0xb4, 0x4d, 0x57, 0x87, 0xcc, 0x8c, 0xac, 0xb1, + 0xab, 0x35, 0xd3, 0x8f, 0x8c, 0xb5, 0xc5, 0x72, 0x40, 0x90, 0x5b, 0xcf, + 0x43, 0x49, 0x8f, 0x6c, 0xb7, 0x5e, 0xbc, 0xc9, 0x6f, 0xbe, 0xbc, 0x49, + 0x74, 0x6c, 0x56, 0x82, 0x57, 0x7c, 0x84, 0x4d, 0x9c, 0x73, 0xb8, 0x86, + 0x91, 0xae, 0x70, 0xaa, 0x44, 0x37, 0x6f, 0x64, 0x88, 0x6a, 0x6f, 0x5f, + 0x7e, 0x47, 0x7f, 0x9c, 0x58, 0xb1, 0x8c, 0x58, 0xaf, 0x88, 0xc6, 0x91, + 0x87, 0xc7, 0xac, 0x78, 0xcf, 0x85, 0x6a, 0xb2, 0xca, 0xbb, 0x44, 0x76, + 0x66, 0xbd, 0xc7, 0x6e, 0x34, 0x31, 0xa1, 0xb6, 0x73, 0x9f, 0x7d, 0x65, + 0x92, 0x79, 0x4f, 0x7d, 0x6e, 0x98, 0xc6, 0x53, 0x9b, 0x89, 0xa2, 0xb8, + 0xbf, 0xbb, 0x9c, 0x4a, 0x4a, 0x31, 0xb6, 0x59, 0x88, 0x58, 0x3e, 0x8a, + 0x59, 0x60, 0xcd, 0x70, 0x31, 0xab, 0x74, 0x32, 0xb5, 0x90, 0x39, 0x89, + 0xa0, 0x85, 0x78, 0xd1, 0xc2, 0xbb, 0xca, 0x5a, 0xc4, 0x49, 0x9c, 0x60, + 0xa3, 0x5c, 0x45, 0x7a, 0x87, 0x3a, 0xcf, 0x4a, 0x36, 0x43, 0xa4, 0xa0, + 0x4c, 0x59, 0x40, 0x52, 0xc5, 0xae, 0x73, 0xb0, 0xae, 0x58, 0x98, 0xaa, + 0xa0, 0x95, 0x99, 0xbb, 0x7d, 0xba, 0x6c, 0x88, 0x32, 0x53, 0x61, 0x31, + 0x83, 0x5d, 0x86, 0xca, 0x93, 0x65, 0xa7, 0x3e, 0x75, 0x87, 0x65, 0x67, + 
0x5a, 0x42, 0xce, 0xa7, 0xb5, 0x4d, 0x47, 0xa5, 0xc4, 0x3c, 0x96, 0xc3, + 0x63, 0xca, 0xce, 0x50, 0x4f, 0xbb, 0xa6, 0x82, 0x7c, 0x89, 0x4f, 0xb1, + 0xd4, 0x87, 0xc5, 0x6b, 0x4b, 0x8c, 0x5f, 0xcb, 0x37, 0xc3, 0x8b, 0x7c, + 0x63, 0xa3, 0x7c, 0x8e, 0x97, 0xbc, 0x61, 0xa1, 0x5e, 0x42, 0x6e, 0x96, + 0x2e, 0x33, 0xa7, 0xcd, 0xb2, 0x55, 0xa6, 0x52, 0x8c, 0xc7, 0x9f, 0x7f, + 0xb8, 0x71, 0xa6, 0xa4, 0x4b, 0x81, 0xba, 0x93, 0xb8, 0xad, 0x7c, 0x48, + 0xc2, 0x69, 0x97, 0x62, 0x6c, 0x85, 0x59, 0xbc, 0x4e, 0x98, 0x99, 0x6a, + 0x99, 0x7a, 0x6b, 0x88, 0x48, 0x91, 0x69, 0xbf, 0xc1, 0xac, 0xbc, 0x7b, + 0xa9, 0xa4, 0x8a, 0x75, 0x87, 0x76, 0x56, 0xb5, 0x35, 0xb6, 0xb4, 0x3e, + 0x54, 0x3d, 0x97, 0xb8, 0x5b, 0x93, 0xcc, 0x76, 0x80, 0x32, 0x42, 0xb8, + 0x54, 0x50, 0x87, 0xaa, 0xc4, 0x3b, 0xae, 0x89, 0x4d, 0x5f, 0x9c, 0x43, + 0x70, 0xa1, 0x76, 0x6b, 0x77, 0x95, 0xc6, 0xb3, 0xb2, 0x98, 0x6b, 0x40, + 0x92, 0x90, 0xaf, 0x72, 0x7f, 0x88, 0xc5, 0x85, 0xc9, 0xc7, 0x35, 0x8a, + 0x96, 0xb1, 0xc2, 0x81, 0x50, 0xa1, 0x92, 0x4d, 0xc7, 0x4e, 0x47, 0xb4, + 0x56, 0xa8, 0x60, 0x5c, 0x56, 0x64, 0x6d, 0xbd, 0xbc, 0xbd, 0xbd, 0xb6, + 0x4a, 0x39, 0x4f, 0xca, 0x4f, 0x7b, 0x62, 0xa8, 0x48, 0xac, 0x52, 0xc7, + 0x34, 0xad, 0x55, 0x5e, 0x31, 0xbc, 0xb5, 0x58, 0x31, 0x7e, 0xb4, 0x90, + 0xc6, 0xd1, 0xcf, 0x58, 0xcf, 0x8d, 0x4f, 0x37, 0x98, 0x68, 0x67, 0x8f, + 0x67, 0x5e, 0x6a, 0x9c, 0xb2, 0x8e, 0xcd, 0xcf, 0x37, 0x72, 0xa1, 0xaa, + 0xb3, 0x57, 0x70, 0x5e, 0x9c, 0x8b, 0xb7, 0x33, 0x69, 0x70, 0x8a, 0x45, + 0x4a, 0x5f, 0x3e, 0x4d, 0x3a, 0xb0, 0x62, 0xd3, 0x57, 0x36, 0x60, 0x8a, + 0xa3, 0x78, 0xba, 0x56, 0x9f, 0xd3, 0x6e, 0x8c, 0xa7, 0x4c, 0x74, 0x93, + 0xc9, 0x8f, 0x62, 0xc8, 0x4b, 0x6c, 0x64, 0x53, 0x63, 0x58, 0x53, 0xca, + 0x3c, 0xa5, 0xb0, 0x6e, 0x8b, 0xbf, 0x90, 0x89, 0x73, 0xa6, 0x5d, 0xbc, + 0x74, 0x66, 0x77, 0x4b, 0x50, 0x29, 0x5a, 0x79, 0x9b, 0xd5, 0x9b, 0xb2, + 0x5e, 0x55, 0x61, 0x93, 0x57, 0x47, 0x72, 0x91, 0x8d, 0xa3, 0x7b, 0xc4, + 0xdc, 0x96, 0x83, 0x56, 0x70, 0xbe, 0x6a, 0xca, 0x6f, 0x77, 0x48, 0x3f, + 0x95, 0x72, 0xa5, 0x8c, 0xd5, 0x7f, 0x50, 0x7f, 0x5f, 0x94, 0x59, 0xb5, + 0x5a, 0x8b, 0x4e, 0x9b, 0x41, 0x40, 0x9a, 0xd3, 0xb4, 0xad, 0x83, 0x9c, + 0x8d, 0x65, 0x9b, 0xba, 0xd0, 0xad, 0x5d, 0x44, 0x96, 0x93, 0x35, 0x68, + 0x39, 0x83, 0xc6, 0xb3, 0xa4, 0xab, 0x7c, 0x40, 0x42, 0x46, 0x43, 0x88, + 0x53, 0xd0, 0x4d, 0xb7, 0x49, 0x93, 0x51, 0x3b, 0x71, 0x4b, 0x76, 0x4f, + 0x9e, 0x4e, 0x80, 0xab, 0x60, 0x47, 0xa5, 0xd0, 0x7a, 0xa0, 0x93, 0x81, + 0xaa, 0x8e, 0x92, 0x61, 0x72, 0x34, 0xc4, 0xa2, 0x75, 0xab, 0x5e, 0xb8, + 0x60, 0x7d, 0x76, 0x77, 0x9e, 0xa1, 0x87, 0xb1, 0x5d, 0x49, 0x92, 0x63, + 0x45, 0x43, 0x53, 0x44, 0xad, 0xa8, 0xd0, 0x3b, 0xa3, 0xc1, 0x86, 0xc6, + 0x8e, 0x4a, 0x50, 0x7f, 0x37, 0x66, 0xb2, 0xbc, 0x54, 0xb4, 0x85, 0x44, + 0x3f, 0x87, 0x45, 0x52, 0xa3, 0x62, 0x6e, 0x7a, 0xcc, 0x3f, 0xb8, 0x7e, + 0x46, 0xcb, 0xa2, 0x43, 0xb3, 0x6b, 0xb7, 0x7a, 0x30, 0xb2, 0xc0, 0xca, + 0x69, 0x95, 0x36, 0x32, 0x44, 0xc1, 0x32, 0x60, 0x49, 0xb0, 0xb2, 0x87, + 0x99, 0xbf, 0x83, 0xc1, 0x8d, 0xa6, 0x8e, 0xad, 0xa2, 0x88, 0x3a, 0xa4, + 0x93, 0xb1, 0xce, 0x68, 0x97, 0xca, 0x59, 0x77, 0xa7, 0xac, 0xb1, 0x56, + 0xce, 0x43, 0x7e, 0xb5, 0xd4, 0xac, 0xb8, 0x54, 0x43, 0xae, 0x74, 0xb5, + 0x53, 0x6a, 0x45, 0xc6, 0xcd, 0x37, 0x62, 0x3b, 0xa5, 0xc0, 0x99, 0xaa, + 0x61, 0xca, 0xaa, 0x66, 0x98, 0x7c, 0x85, 0x65, 0xc6, 0xbf, 0xc4, 0xb1, + 0xac, 0x50, 0x55, 0x55, 0x7d, 0x5c, 0x2d, 0xae, 0x4c, 0x88, 0x50, 0x74, + 0x8b, 0x54, 0x76, 0xb6, 0x7a, 0x87, 0xd1, 0xc3, 0x66, 0x5a, 0xd0, 0x94, + 0x63, 0xbf, 0xa2, 0x77, 0x9b, 0xa1, 0x3a, 0xaa, 0x8e, 0xca, 0x3d, 0x3c, + 
0xca, 0x5d, 0x65, 0x4e, 0x5a, 0x74, 0xbe, 0xaa, 0x8d, 0x69, 0x63, 0xb3, + 0xa6, 0x6d, 0xaf, 0xcb, 0x42, 0x96, 0x41, 0xc2, 0xb5, 0xbd, 0x84, 0x5f, + 0x48, 0x48, 0x53, 0xa1, 0x6c, 0x60, 0xbc, 0xb2, 0xae, 0x7c, 0x6f, 0xb8, + 0x59, 0x95, 0x9d, 0x87, 0x57, 0xcd, 0x9e, 0x5d, 0x96, 0xa7, 0x58, 0xae, + 0xa7, 0x66, 0xaa, 0xd3, 0xb8, 0xbd, 0x8c, 0x76, 0xa4, 0x58, 0x3d, 0x84, + 0x34, 0x41, 0x68, 0xa1, 0x99, 0xca, 0xcf, 0x91, 0x6c, 0x7b, 0xb6, 0x57, + 0xd1, 0x81, 0xab, 0xbb, 0x39, 0x97, 0x67, 0x7c, 0x45, 0x93, 0x69, 0x37, + 0xae, 0xc8, 0x6b, 0x39, 0xbc, 0x6e, 0xb3, 0x5b, 0x9b, 0x47, 0xce, 0x6b, + 0x5c, 0x43, 0xbc, 0x82, 0xd3, 0xb6, 0x30, 0x90, 0xba, 0x98, 0xaf, 0x61, + 0xc1, 0xd0, 0x7f, 0x2c, 0xa4, 0x73, 0x95, 0x98, 0x8c, 0x55, 0xc7, 0xd2, + 0x6d, 0x48, 0xaa, 0xca, 0x8d, 0x78, 0xbc, 0x6f, 0x43, 0xce, 0x54, 0xa0, + 0x5f, 0x2b, 0x4d, 0x81, 0x49, 0x96, 0xae, 0xb2, 0x89, 0xbb, 0xb2, 0x9a, + 0xcc, 0xa9, 0x74, 0x5c, 0x7c, 0xc7, 0x56, 0x5c, 0xaa, 0xb2, 0xa0, 0x6f, + 0xa8, 0x8c, 0xa2, 0x88, 0xbd, 0xd7, 0x92, 0x8d, 0x83, 0x83, 0x35, 0xc7, + 0x4b, 0xa4, 0xd1, 0xd0, 0x76, 0x65, 0x52, 0x75, 0x7c, 0x54, 0x6a, 0x91, + 0x39, 0xd2, 0x9d, 0xc6, 0x92, 0x92, 0x79, 0x86, 0x95, 0x8a, 0x66, 0xb5, + 0x6e, 0x74, 0x9d, 0x7b, 0x9b, 0x55, 0x6c, 0xbb, 0xa4, 0xb0, 0x2c, 0x79, + 0x49, 0xb2, 0xd8, 0xd9, 0x79, 0xd8, 0x9a, 0x71, 0x4e, 0x67, 0x66, 0x50, + 0x9f, 0x6e, 0x6f, 0x48, 0x67, 0x92, 0xab, 0x3d, 0xd1, 0xca, 0x8d, 0x42, + 0x36, 0xb8, 0xdc, 0xb9, 0x36, 0x39, 0x56, 0x83, 0x63, 0x79, 0x70, 0x7e, + 0x9c, 0x51, 0x4b, 0x78, 0x4c, 0x41, 0x30, 0x3e, 0x79, 0xd1, 0x77, 0xa5, + 0x4b, 0xb4, 0xb4, 0x77, 0x75, 0x69, 0xa5, 0x37, 0x57, 0x49, 0xb1, 0xa6, + 0xab, 0x4d, 0x6b, 0x9d, 0x4b, 0xb7, 0xc1, 0x48, 0x51, 0x5e, 0x89, 0xa0, + 0xa5, 0x38, 0x88, 0x36, 0x74, 0x71, 0x4f, 0x84, 0x59, 0xa2, 0x6b, 0xbd, + 0x3f, 0xc0, 0x68, 0xb9, 0x55, 0x9a, 0x2e, 0x2d, 0xcf, 0xc8, 0x4e, 0x65, + 0x54, 0x7c, 0x60, 0x84, 0xc7, 0x95, 0x52, 0x7a, 0xd0, 0x39, 0x8d, 0x3e, + 0xb1, 0xc4, 0x42, 0x3d, 0x8b, 0x7e, 0x73, 0xb2, 0x76, 0xb2, 0x7c, 0x7f, + 0x61, 0x6e, 0x58, 0x88, 0x8c, 0x7f, 0x78, 0x42, 0x4b, 0xde, 0x38, 0x3c, + 0xb8, 0x88, 0xa6, 0x94, 0x3c, 0x64, 0x29, 0xbe, 0x5b, 0xb3, 0x89, 0xb5, + 0x84, 0xbc, 0xe0, 0xc3, 0x8c, 0x5d, 0x4f, 0x44, 0xc2, 0x58, 0x3d, 0xa8, + 0xc4, 0x77, 0x46, 0x58, 0x78, 0x80, 0x48, 0x98, 0x5f, 0x4e, 0xa6, 0xc5, + 0x92, 0xac, 0x98, 0xaf, 0x68, 0x88, 0x5b, 0xb3, 0x98, 0x9a, 0xd3, 0xa5, + 0x88, 0x72, 0x5b, 0x66, 0xd1, 0x89, 0x80, 0x69, 0x43, 0x52, 0x85, 0x8b, + 0xd4, 0xa9, 0x46, 0x57, 0x58, 0x70, 0x6a, 0x9c, 0xc8, 0xbd, 0xab, 0xb1, + 0x45, 0x33, 0x52, 0xa6, 0x5d, 0xad, 0x54, 0x9f, 0x8a, 0xc6, 0x93, 0xc0, + 0xad, 0x44, 0xa6, 0xc8, 0x98, 0xb1, 0x9e, 0xd9, 0xa8, 0x5b, 0x6e, 0x79, + 0x87, 0x52, 0x6b, 0x51, 0xa4, 0xb3, 0x64, 0x61, 0x6a, 0x7b, 0x4e, 0xd3, + 0xbf, 0x83, 0x93, 0x94, 0x9e, 0xcf, 0xb0, 0xc3, 0x92, 0x65, 0x77, 0xaa, + 0xa2, 0xb8, 0x76, 0x58, 0x2f, 0xa8, 0xb8, 0x96, 0xd8, 0x6a, 0xa8, 0x5a, + 0xa1, 0x88, 0x8b, 0xa5, 0xc0, 0xb0, 0xca, 0x58, 0x45, 0x41, 0x87, 0x9c, + 0x3d, 0xb1, 0x64, 0xd9, 0x40, 0xa1, 0x78, 0x64, 0x98, 0xaf, 0xb5, 0x3d, + 0x9b, 0x69, 0xa4, 0x2c, 0x34, 0xba, 0x32, 0xc1, 0xca, 0x42, 0x80, 0x9e, + 0x66, 0x46, 0x51, 0x81, 0x36, 0x45, 0x88, 0x8b, 0xc0, 0xaf, 0x8a, 0xd2, + 0x4b, 0x53, 0x60, 0xbb, 0xa7, 0xac, 0xcc, 0x37, 0x90, 0x95, 0x91, 0x7f, + 0x34, 0x53, 0x5a, 0x63, 0x6c, 0x68, 0xa5, 0x56, 0x33, 0x41, 0x65, 0x81, + 0x72, 0xa6, 0x92, 0x59, 0x82, 0x83, 0x45, 0xb1, 0xb7, 0xa7, 0x45, 0x8e, + 0x77, 0x77, 0x89, 0x99, 0x4b, 0xd4, 0x99, 0xb0, 0x61, 0x49, 0xbc, 0x4d, + 0xbe, 0x32, 0xb9, 0xd4, 0x3e, 0xce, 0x32, 0x4e, 0x4d, 0x94, 0x38, 0x41, + 
0x5c, 0x64, 0x9e, 0x72, 0x80, 0x59, 0x6d, 0x32, 0x66, 0xa7, 0xbb, 0xbf, + 0x97, 0x73, 0x6b, 0x65, 0x9c, 0x6b, 0xc8, 0x67, 0x49, 0xb0, 0x2c, 0xb8, + 0x53, 0xc8, 0xa8, 0x58, 0x37, 0xc2, 0x56, 0x94, 0x34, 0x56, 0xb7, 0x42, + 0x55, 0x74, 0x89, 0x3c, 0x71, 0x97, 0x43, 0xae, 0x62, 0x6a, 0xc5, 0xd9, + 0x82, 0xc5, 0xb2, 0xd2, 0x58, 0x94, 0x7a, 0x7b, 0x52, 0xcc, 0x7e, 0x45, + 0x2d, 0xd2, 0x84, 0x44, 0x3b, 0x75, 0x97, 0x46, 0x6a, 0x69, 0x7f, 0xa1, + 0x48, 0x4e, 0x8f, 0x35, 0x59, 0x97, 0x75, 0x5b, 0x2e, 0x64, 0x94, 0x81, + 0xcf, 0x88, 0x7e, 0x2e, 0x79, 0xab, 0x87, 0x57, 0xb4, 0xa4, 0xa9, 0x93, + 0xce, 0x8c, 0xc3, 0x69, 0x96, 0x3f, 0xad, 0x8e, 0xce, 0xb1, 0x70, 0x50, + 0xc2, 0xd4, 0xc9, 0x6e, 0x5f, 0x6d, 0xa4, 0x9f, 0x8d, 0x47, 0x4e, 0x51, + 0x9f, 0xb5, 0xb0, 0x88, 0x4e, 0xbb, 0x40, 0x8c, 0xd2, 0x8d, 0xa5, 0x6e, + 0x91, 0x3b, 0x9e, 0x96, 0xc0, 0x31, 0x5a, 0x5e, 0x9f, 0xbf, 0x6d, 0xcd, + 0x68, 0xa7, 0x73, 0xc8, 0x8e, 0x33, 0x56, 0x48, 0xbb, 0x96, 0x3c, 0x90, + 0x4b, 0x8b, 0x37, 0x3a, 0xa1, 0x6d, 0x38, 0xb1, 0x5c, 0xd4, 0xa6, 0x45, + 0x50, 0x6b, 0x42, 0xa8, 0x8c, 0x3c, 0x54, 0x63, 0x32, 0x65, 0xc1, 0xc5, + 0xd0, 0x34, 0x87, 0x33, 0x7a, 0x43, 0xa8, 0x7f, 0x37, 0x60, 0x98, 0x94, + 0x75, 0x4b, 0xa9, 0xc0, 0xa9, 0xa8, 0x33, 0xca, 0x8a, 0xba, 0xb2, 0x9e, + 0x45, 0x4c, 0x38, 0x8d, 0x8e, 0xce, 0x65, 0xa3, 0x34, 0x7d, 0xa8, 0xa2, + 0x9c, 0x93, 0xd7, 0x9d, 0x8f, 0x7d, 0x35, 0xcf, 0x4a, 0x9e, 0x77, 0xbe, + 0x93, 0x7b, 0x5c, 0x62, 0x96, 0x4f, 0x60, 0x7a, 0x8e, 0x91, 0x5e, 0x70, + 0x80, 0x74, 0x48, 0x7e, 0xce, 0x57, 0x8d, 0x8e, 0xc1, 0x92, 0x5d, 0xda, + 0x37, 0x35, 0x4e, 0x97, 0x54, 0xa8, 0xb1, 0x35, 0x2d, 0xb8, 0x77, 0x7c, + 0xb1, 0x43, 0xa7, 0x38, 0xd3, 0x3d, 0x4e, 0x73, 0x43, 0x49, 0x62, 0x49, + 0xc9, 0x9d, 0x51, 0x40, 0x74, 0xc5, 0xba, 0x8b, 0xb6, 0x68, 0x94, 0xb1, + 0x75, 0xa3, 0x47, 0x73, 0x9e, 0x66, 0x46, 0xa3, 0x3b, 0x36, 0x85, 0x79, + 0x54, 0x53, 0x54, 0x45, 0xb7, 0x5c, 0x4c, 0xc1, 0xc3, 0x92, 0xb9, 0x43, + 0xbd, 0x58, 0xb4, 0x74, 0x79, 0x77, 0x34, 0x42, 0x60, 0x99, 0x37, 0x66, + 0x63, 0x73, 0x7a, 0xb8, 0xb7, 0x82, 0xc0, 0x3a, 0x58, 0x99, 0x85, 0xb9, + 0x6e, 0x35, 0xab, 0xc3, 0xb3, 0x91, 0x37, 0x5f, 0xcf, 0x9b, 0x41, 0xd8, + 0xb6, 0x96, 0xa9, 0xb4, 0xcb, 0x62, 0x44, 0xb0, 0xa6, 0xcb, 0x8d, 0x92, + 0x67, 0xc5, 0x61, 0x93, 0x4a, 0x55, 0x8b, 0x89, 0x52, 0x8b, 0x5f, 0x5c, + 0x45, 0x35, 0xcc, 0x88, 0x75, 0xbc, 0x36, 0x64, 0x96, 0x51, 0x40, 0x45, + 0x46, 0x67, 0x6f, 0x58, 0x7e, 0x7a, 0xbc, 0x5b, 0xa4, 0xa8, 0xa7, 0x3e, + 0xa6, 0x5e, 0xa5, 0xaa, 0x63, 0x54, 0xad, 0xb9, 0x5a, 0x6b, 0xb6, 0x49, + 0x72, 0xb3, 0x89, 0x7d, 0x36, 0xb8, 0x90, 0x3b, 0x47, 0xcb, 0xcd, 0x9d, + 0x85, 0xa6, 0x39, 0xcf, 0x8f, 0xd0, 0x67, 0xbf, 0xcc, 0x44, 0x45, 0xa5, + 0x8a, 0xc0, 0x84, 0x9b, 0x2f, 0x75, 0x4a, 0x50, 0x38, 0x2d, 0x72, 0xc3, + 0x77, 0x99, 0x3d, 0xb3, 0xa3, 0xce, 0xc6, 0x85, 0x66, 0x61, 0x77, 0x92, + 0xa8, 0x9e, 0xcc, 0x7c, 0x90, 0xa3, 0x44, 0x48, 0x84, 0xce, 0x9c, 0x4c, + 0xd2, 0x36, 0xb7, 0x8b, 0x85, 0xb9, 0x50, 0x48, 0xd0, 0x79, 0x9f, 0x48, + 0x3f, 0x36, 0xbb, 0x35, 0x84, 0x63, 0x94, 0x59, 0xa8, 0x6d, 0x42, 0x7c, + 0x51, 0x42, 0xbd, 0x3b, 0x59, 0xac, 0xb5, 0x97, 0x39, 0x64, 0x59, 0xb1, + 0xbb, 0x3c, 0xd2, 0x58, 0xa0, 0xce, 0xb8, 0x3d, 0x4d, 0x97, 0x5e, 0x57, + 0x57, 0xa1, 0x78, 0x80, 0xba, 0xb8, 0x75, 0xad, 0x31, 0x37, 0x98, 0xac, + 0x99, 0x43, 0x51, 0x4d, 0x9a, 0x77, 0x34, 0x46, 0x87, 0x60, 0x80, 0x43, + 0x93, 0x5b, 0x85, 0x40, 0xcf, 0x4e, 0xb9, 0xc0, 0x92, 0x9f, 0xb0, 0x51, + 0xcd, 0xaa, 0x40, 0x4d, 0x32, 0x66, 0x3d, 0x68, 0x5f, 0x49, 0x5b, 0x93, + 0x61, 0xab, 0x7b, 0x5d, 0x35, 0xb3, 0x8f, 0x43, 0x91, 0x94, 0x73, 0x51, + 
0xc6, 0x99, 0x78, 0x2e, 0x66, 0x93, 0x7a, 0x35, 0x9b, 0x65, 0x47, 0x6d, + 0x51, 0x48, 0x5e, 0x52, 0xcf, 0xca, 0x71, 0xc7, 0x9c, 0x63, 0x61, 0x88, + 0x5a, 0xbb, 0x7d, 0x6d, 0x7a, 0xc8, 0xcd, 0x49, 0x8b, 0xae, 0x41, 0x6b, + 0x3c, 0x93, 0x73, 0xb1, 0x88, 0x3c, 0x81, 0x69, 0xb9, 0x3d, 0x87, 0xa3, + 0x83, 0x65, 0xb3, 0x55, 0xce, 0x63, 0x3c, 0x4b, 0x8c, 0x81, 0x4b, 0x69, + 0x91, 0x6c, 0x31, 0x55, 0xcd, 0xd2, 0xbc, 0xad, 0xa4, 0xa0, 0x6a, 0x38, + 0x63, 0x6d, 0xc8, 0x56, 0x92, 0xb6, 0x88, 0x3d, 0x8f, 0x95, 0x79, 0x8e, + 0xc3, 0x87, 0xc8, 0x59, 0x8e, 0x57, 0x43, 0x67, 0x9f, 0x89, 0x96, 0xab, + 0x80, 0x44, 0x9c, 0x81, 0x81, 0x3d, 0xa8, 0x87, 0x4c, 0x42, 0x5e, 0x2e, + 0x9c, 0x9c, 0xc1, 0xa0, 0xc1, 0x78, 0xa3, 0x59, 0x3b, 0x74, 0xa9, 0x84, + 0xc3, 0xc6, 0xb8, 0xa5, 0x42, 0x55, 0xcf, 0x80, 0x42, 0x37, 0xb6, 0x43, + 0x99, 0x46, 0xa4, 0x7a, 0x98, 0x31, 0xd3, 0x87, 0x5e, 0x64, 0x4b, 0x5b, + 0x74, 0x85, 0x47, 0x8b, 0xc0, 0xd3, 0x86, 0x33, 0x5d, 0xd2, 0xbd, 0x55, + 0x72, 0x68, 0xc3, 0xb6, 0x49, 0x7e, 0x87, 0x71, 0x4c, 0x55, 0x4e, 0xac, + 0x36, 0x89, 0x8b, 0x94, 0x41, 0x2f, 0x76, 0xa3, 0xc3, 0x9b, 0x74, 0x90, + 0xbb, 0x7c, 0x68, 0xcb, 0x89, 0xc3, 0x84, 0x87, 0x4c, 0x8a, 0x7a, 0xcb, + 0xb2, 0x92, 0xc8, 0x7d, 0xb1, 0xc0, 0x84, 0x8e, 0x43, 0x9e, 0xb8, 0x6e, + 0x70, 0x55, 0x42, 0x49, 0xc8, 0x59, 0xb8, 0x43, 0x79, 0x54, 0x5a, 0x7c, + 0xcf, 0xb1, 0xa5, 0xd1, 0xcb, 0x75, 0x9d, 0x85, 0xb0, 0xbb, 0x63, 0x2e, + 0xb5, 0xc4, 0x89, 0xa6, 0xd1, 0x71, 0x8e, 0xa8, 0x72, 0xa5, 0xbf, 0x68, + 0xae, 0x86, 0xad, 0x62, 0x83, 0x65, 0xa1, 0x5c, 0x9e, 0x99, 0x60, 0xa9, + 0x63, 0x99, 0x80, 0xb8, 0x5e, 0xaf, 0xaf, 0xa0, 0xcf, 0x78, 0x61, 0x83, + 0xb4, 0x48, 0x34, 0xc4, 0xca, 0x6e, 0x54, 0x7f, 0xc9, 0xa1, 0x5c, 0xdb, + 0xba, 0x54, 0x6e, 0x8f, 0xb1, 0xa5, 0xd3, 0xa4, 0xb7, 0x9d, 0x68, 0x72, + 0x61, 0x59, 0x53, 0xae, 0x34, 0x75, 0xc8, 0xb8, 0xb7, 0x2a, 0x79, 0x37, + 0x5a, 0x70, 0xab, 0xd6, 0x4f, 0x97, 0xc4, 0xa4, 0x8e, 0x82, 0xc8, 0xb1, + 0xcf, 0xba, 0x91, 0x71, 0x74, 0xba, 0x60, 0x72, 0xc4, 0x71, 0xb3, 0x3d, + 0x97, 0x99, 0x71, 0x5e, 0x44, 0x8a, 0x38, 0x55, 0x89, 0xa5, 0x97, 0x44, + 0x68, 0xd2, 0xd5, 0x5f, 0x3d, 0x5a, 0x3a, 0x4f, 0xc0, 0x62, 0x82, 0xc9, + 0xbf, 0x30, 0x6b, 0x2f, 0xa5, 0xce, 0x49, 0x74, 0x54, 0x65, 0x61, 0x31, + 0xbf, 0xb6, 0x5f, 0x75, 0xc1, 0xc5, 0x5d, 0xc9, 0x9c, 0x50, 0xa6, 0x5c, + 0x70, 0x5c, 0x9b, 0x75, 0x9e, 0xb0, 0x39, 0xc4, 0xc7, 0xb0, 0x37, 0x43, + 0x92, 0x3b, 0x4b, 0xd2, 0x73, 0x53, 0xaa, 0x5f, 0x99, 0xd3, 0xc7, 0x37, + 0x83, 0xa4, 0x41, 0x41, 0x86, 0x42, 0x94, 0x48, 0xae, 0x5d, 0xcb, 0x3e, + 0xc5, 0xa3, 0x81, 0xc6, 0x5e, 0xc3, 0x99, 0xae, 0x89, 0x9e, 0x54, 0x4a, + 0x35, 0x7d, 0xb2, 0x95, 0xa5, 0x8a, 0x3f, 0xcc, 0xc7, 0x59, 0x8d, 0x87, + 0x8e, 0xb3, 0xbf, 0x35, 0xb3, 0x60, 0xd4, 0xb3, 0x97, 0x63, 0x4c, 0xd0, + 0x38, 0xb9, 0x5f, 0x5c, 0x36, 0x3a, 0x7a, 0x7e, 0x8d, 0x58, 0xbc, 0xb8, + 0x5f, 0x41, 0x82, 0x3d, 0xbe, 0x47, 0x7d, 0xa3, 0x86, 0xab, 0xc6, 0x86, + 0x5b, 0x51, 0x4a, 0x31, 0xc3, 0x90, 0x6f, 0x5c, 0xae, 0x29, 0x3b, 0xae, + 0x94, 0x35, 0x49, 0x5f, 0x46, 0x4f, 0x6c, 0x6d, 0x25, 0xc9, 0x46, 0xd4, + 0xbe, 0x51, 0xb7, 0xb7, 0x64, 0x8f, 0xbf, 0x5e, 0x5c, 0x7f, 0xad, 0x58, + 0xc9, 0x4c, 0x64, 0x8a, 0x51, 0x7d, 0x33, 0x74, 0xc4, 0x41, 0x33, 0x41, + 0x6b, 0x7b, 0x73, 0x5e, 0x79, 0x7d, 0xc2, 0x4f, 0xc0, 0xbe, 0x3e, 0x4c, + 0xaa, 0x68, 0xc8, 0x49, 0xc8, 0x4b, 0xa4, 0x64, 0x49, 0x7d, 0x48, 0x53, + 0x67, 0x92, 0x8e, 0x8c, 0x3b, 0x8b, 0x71, 0x33, 0x8f, 0x5d, 0xc1, 0xac, + 0x69, 0x62, 0xac, 0x6d, 0x47, 0x70, 0x77, 0x53, 0xd3, 0x5c, 0xa5, 0x81, + 0x52, 0xc2, 0x92, 0xb0, 0xd1, 0x31, 0x63, 0x94, 0x7c, 0xac, 0x8b, 0x3d, + 
0x4c, 0x35, 0xa3, 0xab, 0x9c, 0x95, 0xad, 0x7a, 0x90, 0x40, 0x65, 0x7a, + 0x5b, 0xc5, 0xc2, 0x3d, 0x60, 0xa0, 0x60, 0x89, 0xa1, 0x66, 0x8c, 0x7b, + 0x96, 0x28, 0x6a, 0xa7, 0xd5, 0xb8, 0xc7, 0x81, 0xa0, 0xb1, 0xbd, 0x5f, + 0xbb, 0x65, 0xbf, 0xc7, 0x4d, 0xb7, 0x4e, 0x98, 0x78, 0x9f, 0x57, 0xa0, + 0x6b, 0x34, 0xc5, 0x54, 0xa0, 0x67, 0x75, 0x59, 0xbd, 0x69, 0xc1, 0x42, + 0x61, 0xaa, 0x5b, 0xc5, 0x86, 0x73, 0x52, 0xa7, 0xab, 0xa1, 0xa0, 0x7a, + 0xc3, 0x5e, 0x60, 0x8f, 0x76, 0xa8, 0xa3, 0x48, 0xaf, 0x87, 0x46, 0x65, + 0x64, 0x46, 0x75, 0x5d, 0xab, 0x3c, 0x9e, 0x77, 0x68, 0x4e, 0x4f, 0x32, + 0x40, 0x6d, 0x4a, 0x34, 0x32, 0xab, 0x59, 0x83, 0xaa, 0x67, 0xbf, 0x47, + 0xd1, 0xc0, 0xbc, 0xd4, 0x6c, 0x68, 0x49, 0x42, 0x5d, 0x93, 0x9e, 0x54, + 0x61, 0x81, 0x36, 0x57, 0x4d, 0xb3, 0xbf, 0x6b, 0x9e, 0x89, 0x82, 0xb1, + 0xac, 0x42, 0x55, 0x4f, 0x73, 0x96, 0xb7, 0xd3, 0xb4, 0x6c, 0x9a, 0x85, + 0xc7, 0x5c, 0x39, 0x95, 0x55, 0x66, 0x78, 0xb3, 0xc7, 0x8b, 0x88, 0xb1, + 0x65, 0x76, 0x36, 0x9c, 0x6e, 0x36, 0x76, 0x74, 0xcb, 0x89, 0x4f, 0x5a, + 0x44, 0xa5, 0xaf, 0x97, 0x83, 0x7b, 0x74, 0x82, 0xae, 0x5f, 0x88, 0x6d, + 0x99, 0x60, 0x9f, 0xae, 0x4d, 0xd4, 0x8b, 0xca, 0xa8, 0xa4, 0x99, 0xa6, + 0x5e, 0xa1, 0x83, 0x51, 0x58, 0x9a, 0x49, 0x9e, 0x6c, 0x3b, 0xca, 0x65, + 0x92, 0x88, 0x61, 0x78, 0x51, 0x6e, 0x64, 0x5d, 0x3e, 0xac, 0x81, 0x87, + 0x7e, 0xa5, 0x32, 0x37, 0xac, 0x45, 0xa7, 0x51, 0x7e, 0xaa, 0x85, 0xb6, + 0x85, 0x8a, 0x90, 0xc1, 0x85, 0xcb, 0x62, 0x9e, 0x86, 0x68, 0x5f, 0x73, + 0x98, 0x63, 0xae, 0xc4, 0x4a, 0xb3, 0x53, 0x71, 0x6b, 0x7f, 0x7f, 0x71, + 0xa2, 0xc9, 0xbe, 0xba, 0xc4, 0x6c, 0x6e, 0x5e, 0xb3, 0x62, 0xba, 0x52, + 0xad, 0x76, 0xc3, 0x95, 0x40, 0x6c, 0x41, 0xc8, 0xb0, 0xb4, 0x79, 0x80, + 0x31, 0x98, 0xb9, 0x95, 0xb0, 0xb2, 0x4d, 0x46, 0x47, 0xad, 0xad, 0x72, + 0x74, 0x9d, 0xb3, 0x8a, 0x62, 0x8c, 0x77, 0x82, 0x9e, 0xc7, 0x63, 0xaa, + 0xca, 0x49, 0x73, 0xac, 0x49, 0x76, 0x93, 0x53, 0xc3, 0x3e, 0xb2, 0x6b, + 0x2d, 0x7e, 0xab, 0x63, 0x92, 0x6b, 0x50, 0x63, 0xb3, 0x6b, 0x68, 0x49, + 0x4f, 0x8b, 0x4d, 0x5d, 0x4c, 0x83, 0xb3, 0x7a, 0x5e, 0xa8, 0xb3, 0x72, + 0xae, 0x5f, 0x7d, 0x7f, 0x88, 0x37, 0x48, 0x59, 0x92, 0x56, 0xb3, 0xcd, + 0x3d, 0xa2, 0x73, 0x78, 0x95, 0x43, 0x60, 0xc4, 0x3c, 0x4c, 0x77, 0x50, + 0x3a, 0x95, 0x93, 0xb7, 0x56, 0x90, 0x61, 0x4b, 0xb2, 0xcb, 0x75, 0x7a, + 0xbd, 0x7f, 0x6b, 0x88, 0x60, 0x71, 0x9e, 0xd5, 0x6b, 0x7d, 0xb9, 0x72, + 0x62, 0xbd, 0x8f, 0xd2, 0x52, 0x6c, 0x43, 0x42, 0x68, 0xa0, 0x5b, 0x4a, + 0x50, 0xb2, 0x9c, 0xd4, 0x7a, 0x96, 0x63, 0xb9, 0x8b, 0x97, 0x94, 0x6b, + 0xa3, 0xa3, 0x57, 0xbe, 0x46, 0x82, 0x73, 0x6f, 0x93, 0x3e, 0x86, 0x47, + 0x74, 0x53, 0x6d, 0x36, 0xb0, 0x9d, 0x62, 0x71, 0xca, 0xbf, 0x84, 0x99, + 0x87, 0x46, 0x6f, 0xbf, 0xb6, 0x52, 0x39, 0x38, 0x88, 0x3d, 0x84, 0xb9, + 0x4a, 0xa6, 0x36, 0x55, 0xa5, 0x88, 0x50, 0xac, 0x47, 0x5f, 0x98, 0x38, + 0x28, 0x83, 0x95, 0xb8, 0xa1, 0x56, 0x78, 0xc2, 0x32, 0x70, 0xc5, 0xcf, + 0x55, 0x64, 0x57, 0x69, 0x44, 0xe0, 0x72, 0x35, 0xaf, 0x6e, 0xbc, 0x3a, + 0x59, 0xbd, 0xbc, 0x87, 0x5b, 0x6f, 0x8d, 0xb4, 0x9b, 0x66, 0x93, 0x49, + 0xa2, 0x64, 0x80, 0x37, 0x75, 0x3e, 0x43, 0x45, 0xaf, 0x49, 0xd1, 0x85, + 0x80, 0x9f, 0xaa, 0xa4, 0x62, 0x6e, 0xad, 0x73, 0x97, 0x8c, 0x1f, 0x2c, + 0xa7, 0x8c, 0x84, 0x39, 0x52, 0x38, 0x57, 0x61, 0x31, 0x69, 0xaa, 0xc2, + 0x5b, 0x84, 0x3a, 0x8e, 0x8d, 0xac, 0x6e, 0xcd, 0x68, 0x7c, 0x51, 0x68, + 0x81, 0x6a, 0x4e, 0x60, 0x49, 0xa5, 0x83, 0x60, 0xb5, 0x3c, 0xd1, 0x45, + 0x87, 0x58, 0x37, 0x83, 0x42, 0x85, 0x9f, 0x51, 0x43, 0x8f, 0x9d, 0x8f, + 0xcb, 0x81, 0x29, 0x6b, 0x8e, 0x42, 0x51, 0x7b, 0xa8, 0xac, 0xd6, 0xb6, + 
0xa8, 0x46, 0xbb, 0x63, 0x90, 0x4c, 0x82, 0x97, 0x72, 0x2c, 0x82, 0x3b, + 0xe1, 0x6a, 0xb9, 0xd0, 0xb2, 0xa6, 0x3f, 0xcc, 0x80, 0x49, 0xc5, 0x46, + 0x82, 0x7b, 0xa5, 0x44, 0xa0, 0xb4, 0x80, 0x56, 0xa1, 0xc9, 0x3c, 0x98, + 0x48, 0xc1, 0xb6, 0x43, 0x50, 0x5a, 0x43, 0x7a, 0x7c, 0xc7, 0xd4, 0xa9, + 0x5d, 0x34, 0x70, 0xb3, 0x95, 0xbf, 0xa3, 0x6e, 0x61, 0x7f, 0x38, 0xb8, + 0x4c, 0x9c, 0x83, 0x7c, 0x79, 0x51, 0x7a, 0xab, 0xd8, 0xa6, 0x93, 0x56, + 0xb2, 0x6b, 0xb0, 0x79, 0x96, 0x41, 0x8b, 0xaa, 0xa6, 0xbf, 0xbb, 0x9b, + 0x71, 0x84, 0xc9, 0xc4, 0x5c, 0x27, 0x58, 0x67, 0x3b, 0xb0, 0x96, 0xcf, + 0x49, 0x77, 0x4e, 0x62, 0x65, 0x9e, 0x60, 0xa6, 0xbc, 0xbc, 0xd2, 0x94, + 0xc2, 0x5a, 0x9a, 0x54, 0xd3, 0xd3, 0xa6, 0x35, 0xa8, 0xb5, 0x5a, 0xc4, + 0xda, 0x9e, 0x9d, 0xd2, 0xce, 0xbd, 0x5a, 0x3d, 0xc9, 0xa9, 0x52, 0xaa, + 0x56, 0x68, 0x81, 0x8f, 0x39, 0x47, 0xac, 0x78, 0x62, 0x45, 0xc2, 0x5e, + 0x76, 0x4f, 0x65, 0x5f, 0x65, 0x4e, 0xcc, 0x50, 0x6f, 0xac, 0xb1, 0x9b, + 0xb6, 0xa1, 0xbd, 0x4c, 0x41, 0x85, 0x9d, 0xb0, 0x43, 0x63, 0x5d, 0x6d, + 0x5f, 0xd1, 0xa4, 0x2f, 0x61, 0xcb, 0x43, 0x91, 0xc0, 0x93, 0x3b, 0x62, + 0x61, 0xa4, 0x91, 0x3a, 0x53, 0xc9, 0xbe, 0x4a, 0xd9, 0xb2, 0x50, 0x75, + 0x77, 0xa6, 0x6e, 0x60, 0x5c, 0x5f, 0x91, 0x7b, 0xa8, 0x68, 0x6a, 0xc7, + 0xd0, 0xac, 0x36, 0xc0, 0x7e, 0x81, 0x67, 0x87, 0xb4, 0x61, 0x9a, 0x6d, + 0x4b, 0x8c, 0x91, 0x35, 0xa4, 0xc4, 0x8f, 0xab, 0x7b, 0x60, 0x9e, 0x40, + 0x4a, 0x48, 0x97, 0x98, 0xbb, 0x77, 0x56, 0x8c, 0x57, 0x84, 0xa4, 0xc4, + 0x99, 0x38, 0x50, 0x34, 0x4c, 0x47, 0x78, 0x77, 0xcb, 0xb8, 0xa6, 0xcb, + 0xc9, 0x71, 0xbe, 0x88, 0x88, 0xa4, 0x80, 0x58, 0x4b, 0x9b, 0xc8, 0x54, + 0x47, 0x76, 0x5c, 0x65, 0xb4, 0x6d, 0xbb, 0xbc, 0x48, 0x62, 0xad, 0x83, + 0x53, 0x8b, 0x82, 0x8f, 0x3f, 0x9e, 0x81, 0xa0, 0x6e, 0x5a, 0xcd, 0x7e, + 0xa7, 0x4f, 0x59, 0x6c, 0x53, 0x57, 0x89, 0x84, 0x67, 0x93, 0x65, 0xcb, + 0x87, 0x83, 0xa4, 0xbe, 0xc6, 0xac, 0xab, 0xc4, 0xd0, 0x9d, 0x46, 0x49, + 0xc0, 0x83, 0x53, 0x4a, 0x41, 0xac, 0x6d, 0xb6, 0xd4, 0xa2, 0xd7, 0x39, + 0xd1, 0x9f, 0xe1, 0x7a, 0xe8, 0x4f, 0x5b, 0x74, 0xde, 0x8d, 0xd3, 0xbb, + 0x6e, 0x9f, 0x86, 0x69, 0xc6, 0xcd, 0x2f, 0x64, 0xb9, 0x61, 0xab, 0x82, + 0x64, 0x4b, 0xd8, 0x77, 0xb2, 0x56, 0x4a, 0x6e, 0x6d, 0xa3, 0x79, 0xd6, + 0x17, 0x7d, 0x5f, 0x68, 0x6c, 0x55, 0xac, 0x49, 0x94, 0x5d, 0xc4, 0xb1, + 0xa9, 0x76, 0xa1, 0xad, 0x1d, 0x73, 0x51, 0x6b, 0x69, 0xd5, 0x6e, 0xcd, + 0xa1, 0x61, 0x64, 0xc9, 0x5c, 0x6c, 0x3e, 0x5c, 0x39, 0xa1, 0x21, 0x36, + 0x6c, 0x89, 0x9c, 0x2f, 0xac, 0xc1, 0x76, 0x59, 0x5d, 0x50, 0x7f, 0x41, + 0x8b, 0x95, 0x68, 0xb2, 0x83, 0x56, 0x8e, 0xb6, 0xa7, 0xce, 0x85, 0x42, + 0xa9, 0x4a, 0x73, 0x56, 0x6a, 0xbd, 0xaa, 0x90, 0x6e, 0xaf, 0x55, 0x56, + 0x82, 0x7f, 0xb7, 0x63, 0x99, 0x21, 0x66, 0xa4, 0x40, 0xba, 0x8a, 0xb5, + 0x47, 0x64, 0xad, 0xae, 0x43, 0x80, 0xd4, 0x58, 0x81, 0x60, 0xcc, 0xab, + 0xad, 0x4b, 0xd7, 0xca, 0xc9, 0x55, 0x36, 0x57, 0xbe, 0x72, 0x93, 0x56, + 0xd4, 0x82, 0x89, 0x3b, 0x7b, 0xa8, 0x69, 0x6b, 0x93, 0xb3, 0x80, 0xdb, + 0x71, 0x5f, 0xd1, 0x67, 0x48, 0x60, 0xa2, 0x8a, 0xbc, 0x8f, 0x68, 0x81, + 0x6f, 0x74, 0xc2, 0x81, 0xd3, 0xc3, 0x51, 0x53, 0x47, 0x73, 0xb2, 0x9b, + 0x4f, 0x98, 0x7d, 0x97, 0xd2, 0x3b, 0x6c, 0x49, 0x3c, 0xc2, 0x6e, 0x5e, + 0x94, 0xb9, 0xc9, 0x3d, 0xcc, 0x38, 0x44, 0x6b, 0x96, 0xaf, 0x5f, 0x4b, + 0x5f, 0xa6, 0x3b, 0x8c, 0xbd, 0xad, 0x7e, 0xbd, 0x5b, 0xbc, 0x6c, 0x41, + 0xdb, 0x9a, 0x42, 0xb0, 0x9b, 0x8b, 0x45, 0xa5, 0x62, 0x84, 0x66, 0xa5, + 0x9a, 0xdf, 0x4f, 0x6e, 0x30, 0xa4, 0xc5, 0xab, 0x38, 0x95, 0xcf, 0x7d, + 0x60, 0x3e, 0xa2, 0xa9, 0x47, 0x79, 0x78, 0x43, 0xb8, 0xbd, 0xa2, 0x9c, + 
0xe3, 0xd5, 0x80, 0x4f, 0x47, 0x36, 0xce, 0x41, 0xc2, 0x63, 0x53, 0xb5, + 0x94, 0x73, 0x80, 0x40, 0x89, 0x6e, 0x5b, 0x40, 0x5b, 0x8a, 0x86, 0x4e, + 0x70, 0x34, 0x49, 0x69, 0xb3, 0x90, 0x4a, 0xba, 0x8f, 0xa9, 0x49, 0xb9, + 0x7f, 0x37, 0x86, 0x7a, 0x85, 0x91, 0x6c, 0x8a, 0x96, 0x57, 0x73, 0x5a, + 0xb1, 0xd0, 0xc7, 0x47, 0xd6, 0xb6, 0x68, 0x68, 0xa5, 0xc5, 0x4a, 0xc2, + 0x97, 0x68, 0x97, 0x71, 0x6d, 0xa5, 0xaa, 0xb3, 0xc0, 0x31, 0xce, 0x85, + 0x80, 0x3a, 0x64, 0x5e, 0x61, 0x48, 0xae, 0xc4, 0xdc, 0xd8, 0xce, 0x47, + 0x98, 0xc5, 0x6b, 0xc1, 0x4f, 0x68, 0x5b, 0x96, 0xc4, 0xd1, 0xcf, 0xa2, + 0x53, 0x3c, 0xbb, 0x64, 0x91, 0x82, 0x87, 0xbf, 0x5c, 0xa2, 0xaf, 0x43, + 0x78, 0x58, 0x9d, 0xbb, 0xa6, 0x8a, 0x93, 0x9e, 0x51, 0x64, 0x70, 0xae, + 0x41, 0x83, 0xb6, 0x6d, 0x61, 0xd7, 0x8b, 0xbb, 0xba, 0x7f, 0xc0, 0x49, + 0x5b, 0xc0, 0x7b, 0x38, 0x84, 0xa3, 0x8a, 0x63, 0xce, 0x48, 0xa7, 0x54, + 0x44, 0x4c, 0x4a, 0x4c, 0x3c, 0x81, 0x8a, 0x61, 0xc5, 0xc5, 0x7a, 0x88, + 0xc0, 0xc0, 0xca, 0x59, 0x78, 0xe0, 0x94, 0xaa, 0x60, 0x86, 0x58, 0x77, + 0xb8, 0xad, 0x4c, 0x8b, 0x70, 0x2a, 0x9c, 0x63, 0x67, 0xb9, 0x8b, 0x7b, + 0x7d, 0x88, 0xa6, 0x9e, 0xba, 0xb6, 0x8d, 0xa2, 0x58, 0x8a, 0x7b, 0x78, + 0xbd, 0x4f, 0x53, 0x32, 0x57, 0x52, 0x90, 0x60, 0x65, 0x4d, 0xc8, 0x63, + 0xbe, 0x42, 0x74, 0x80, 0xbb, 0xd1, 0x90, 0x59, 0x82, 0x64, 0xa3, 0xd2, + 0xbf, 0x4c, 0xa3, 0xcd, 0x88, 0x8e, 0x7e, 0x93, 0xc2, 0x45, 0x5d, 0x6d, + 0xd2, 0x9f, 0x35, 0xce, 0x3e, 0x8e, 0x40, 0x60, 0xba, 0x68, 0x8b, 0xa9, + 0xb7, 0xc2, 0xa4, 0x65, 0x30, 0xbd, 0xcb, 0x98, 0x88, 0x65, 0x76, 0x63, + 0x71, 0x7c, 0x82, 0x4d, 0x96, 0x5d, 0xa2, 0x6e, 0xb7, 0xa2, 0xac, 0x72, + 0xd7, 0x40, 0x87, 0xc0, 0x6e, 0x69, 0x9b, 0x6f, 0x35, 0x44, 0x39, 0x40, + 0xa3, 0x3c, 0xa7, 0x46, 0x3c, 0x39, 0x69, 0x96, 0x77, 0x64, 0x80, 0x6c, + 0x40, 0xd9, 0x92, 0xa9, 0x73, 0x3f, 0x60, 0x87, 0xcf, 0x64, 0x39, 0xbb, + 0xb0, 0x83, 0xd7, 0x49, 0x58, 0x6d, 0x4e, 0xb1, 0xa3, 0x52, 0xb7, 0xbf, + 0x9a, 0xc2, 0x96, 0x7f, 0xc1, 0x3a, 0xa1, 0xa3, 0x9c, 0x6a, 0xa5, 0xc5, + 0xb8, 0x93, 0x6f, 0x39, 0xa7, 0x7f, 0x37, 0x38, 0xaa, 0xca, 0x8d, 0x7f, + 0x4d, 0x2c, 0x7a, 0x94, 0xb2, 0x6f, 0xb3, 0x38, 0x6b, 0x5d, 0x86, 0x8f, + 0xc5, 0x6f, 0xcf, 0x94, 0x3c, 0xba, 0xd0, 0x3c, 0x65, 0x81, 0xc8, 0xb9, + 0x84, 0xa4, 0xd3, 0x71, 0x6f, 0xb9, 0x41, 0x9c, 0x2d, 0x4b, 0x59, 0x6f, + 0x49, 0x46, 0xbd, 0x95, 0x2d, 0xd3, 0x40, 0x35, 0x6e, 0xa8, 0xb8, 0x9e, + 0x95, 0x6e, 0xc6, 0xaf, 0xb8, 0xcb, 0x86, 0x2c, 0x6b, 0x2a, 0xcb, 0x74, + 0xa6, 0xcf, 0x65, 0x55, 0x4b, 0x7a, 0x72, 0xd2, 0xd1, 0x61, 0x57, 0xb0, + 0x37, 0x87, 0x86, 0x77, 0x75, 0x45, 0x93, 0xcc, 0xce, 0x61, 0xc2, 0x70, + 0xb1, 0x40, 0x97, 0x52, 0xb6, 0xd3, 0x45, 0xb5, 0x67, 0x8e, 0x46, 0x92, + 0x5d, 0x99, 0x4a, 0x46, 0xce, 0xb4, 0xc5, 0x3a, 0x40, 0x58, 0x9a, 0xd2, + 0x90, 0xbd, 0xa2, 0x7e, 0x41, 0x5b, 0x86, 0x7d, 0x95, 0x64, 0x59, 0x53, + 0xa5, 0x8d, 0x91, 0xc2, 0x84, 0x75, 0xc6, 0x75, 0x80, 0xd4, 0xc1, 0x54, + 0x44, 0xab, 0xd4, 0xb1, 0x5f, 0xa8, 0x75, 0x5b, 0x4f, 0x57, 0x6c, 0x6a, + 0x7c, 0x7d, 0x6e, 0x70, 0x5a, 0x91, 0x3c, 0x34, 0x4f, 0x64, 0x38, 0xc8, + 0xc4, 0xb1, 0x87, 0xc2, 0x63, 0xd1, 0xc4, 0x7e, 0xc3, 0x52, 0xb9, 0xb3, + 0x37, 0xc6, 0x51, 0xd2, 0xa8, 0x82, 0x49, 0xa8, 0x79, 0x94, 0x53, 0x88, + 0x75, 0x8a, 0x8f, 0xcd, 0xba, 0x4c, 0x51, 0xb4, 0xad, 0x82, 0x39, 0x8b, + 0x6c, 0xa1, 0x3a, 0x38, 0xb2, 0xa4, 0xa6, 0x9d, 0xa9, 0x66, 0xa7, 0x6a, + 0xc0, 0x86, 0x2e, 0x53, 0x31, 0xa5, 0x33, 0x41, 0x82, 0x99, 0x6e, 0x4d, + 0x96, 0xb9, 0xb7, 0x4e, 0x9c, 0x4d, 0x6e, 0x96, 0x5b, 0x5b, 0x95, 0xc6, + 0xa1, 0xaf, 0xa5, 0xc3, 0x3a, 0x7a, 0x3c, 0x33, 0x52, 0xba, 0x89, 0x34, + 
0xae, 0x91, 0xa9, 0x4e, 0x60, 0x5e, 0x7a, 0xd4, 0xa3, 0xc1, 0xc3, 0x3c, + 0x37, 0x96, 0xab, 0x46, 0x64, 0x46, 0x3b, 0xb0, 0x4a, 0xd1, 0x57, 0x6f, + 0x62, 0xc1, 0x39, 0x75, 0xd8, 0xa4, 0x70, 0x61, 0x8b, 0x3a, 0x58, 0xb1, + 0x45, 0xb2, 0x5a, 0x39, 0x47, 0x41, 0x76, 0x54, 0x43, 0x78, 0xd2, 0xb4, + 0x9f, 0x54, 0x9e, 0xb1, 0x96, 0x4c, 0x50, 0x7d, 0xb4, 0x38, 0xc8, 0x70, + 0xa5, 0x6a, 0xb3, 0xa6, 0x9f, 0x91, 0xa3, 0x61, 0x5c, 0x8c, 0x93, 0x73, + 0x88, 0x4c, 0x3a, 0x8a, 0x35, 0xd0, 0xc2, 0x5f, 0x62, 0x4e, 0x46, 0x3c, + 0x7e, 0x6d, 0x84, 0x3c, 0x2f, 0x9c, 0x7d, 0x70, 0x7a, 0x65, 0x87, 0x52, + 0xbb, 0xaf, 0xcd, 0x4f, 0x96, 0x8d, 0xca, 0x3a, 0x4f, 0xa0, 0x2c, 0xbd, + 0x3e, 0x42, 0x41, 0x8b, 0x3d, 0xc1, 0xaf, 0x5c, 0xb0, 0xb1, 0xb8, 0x7b, + 0x6a, 0x48, 0xae, 0x6c, 0xc4, 0x81, 0x83, 0xc2, 0x94, 0xb1, 0x99, 0x8d, + 0x34, 0xb8, 0x63, 0xa8, 0x73, 0x34, 0xa2, 0x3a, 0x7f, 0x9b, 0x78, 0x5f, + 0xb1, 0x60, 0xa6, 0xb7, 0xa7, 0xcb, 0x9f, 0xc1, 0x70, 0xb4, 0x66, 0x60, + 0x9d, 0x6f, 0x40, 0xd5, 0x5a, 0x9f, 0x5e, 0x6d, 0x4d, 0xa2, 0xbf, 0x67, + 0x81, 0x47, 0x4d, 0xb6, 0x81, 0xe1, 0x62, 0xbd, 0x6a, 0x77, 0x71, 0xd9, + 0xc4, 0x90, 0x92, 0xc6, 0x96, 0xb7, 0xc9, 0x2d, 0xd1, 0x4a, 0x31, 0x41, + 0x33, 0x94, 0xc9, 0xc7, 0xae, 0x60, 0x81, 0x74, 0xab, 0x25, 0x91, 0x94, + 0xb7, 0x83, 0x9a, 0x89, 0x9c, 0x7e, 0x6b, 0xaf, 0x74, 0xcd, 0x87, 0x9c, + 0x31, 0x3a, 0xac, 0x95, 0x9f, 0xb0, 0xae, 0xc8, 0x66, 0x27, 0x6c, 0x8a, + 0x60, 0x6c, 0xa3, 0xc8, 0xa7, 0x4e, 0xc7, 0x76, 0x9e, 0x36, 0x51, 0x5d, + 0x93, 0x7b, 0x43, 0xb7, 0x85, 0x63, 0xc3, 0xc7, 0x55, 0x9d, 0x7b, 0x41, + 0x75, 0xbf, 0x96, 0x71, 0x41, 0x7c, 0x4d, 0xc5, 0x31, 0xc6, 0x90, 0x4a, + 0x95, 0x62, 0x3d, 0x4f, 0x49, 0x7e, 0xa6, 0x98, 0x59, 0x34, 0xc1, 0x89, + 0x8b, 0x60, 0xb3, 0x47, 0x8b, 0x2a, 0xc4, 0x6c, 0x84, 0xb2, 0xd8, 0xc9, + 0x96, 0x51, 0xd7, 0x52, 0x96, 0xbd, 0xcc, 0x66, 0xa4, 0x9c, 0x40, 0x76, + 0x6f, 0x76, 0xc3, 0x54, 0x67, 0x5e, 0x3b, 0x5d, 0xb7, 0xae, 0xa9, 0xaa, + 0x37, 0x6a, 0xb9, 0x95, 0x7e, 0x96, 0xbe, 0xb9, 0xcf, 0x78, 0x7f, 0xba, + 0xa9, 0xa2, 0x58, 0x7d, 0xa8, 0x38, 0x4f, 0x5f, 0x61, 0xb6, 0x73, 0xa4, + 0x6a, 0xd9, 0x5e, 0x6f, 0xc5, 0xbc, 0xc9, 0xc9, 0x4b, 0x39, 0x92, 0xb7, + 0x45, 0xc9, 0x45, 0x94, 0xa5, 0xbe, 0x81, 0xa6, 0xc4, 0x41, 0x7f, 0x83, + 0x83, 0x97, 0x48, 0xc6, 0x93, 0x9c, 0xce, 0x97, 0x92, 0xcd, 0x60, 0x52, + 0x32, 0x9c, 0x82, 0x4e, 0x6e, 0x39, 0x67, 0xc2, 0x9a, 0x93, 0x76, 0x9d, + 0x50, 0xc7, 0x93, 0x5d, 0x96, 0x48, 0xa0, 0x47, 0x53, 0xa1, 0xb0, 0x67, + 0x36, 0xb0, 0x89, 0x38, 0x95, 0x78, 0x3b, 0x94, 0xb3, 0xae, 0x71, 0x99, + 0x4d, 0x53, 0xce, 0xae, 0x2c, 0x6c, 0xbc, 0xa2, 0xaf, 0xd7, 0xc0, 0x9d, + 0x49, 0x4c, 0x33, 0x75, 0x6b, 0x40, 0xb5, 0x6b, 0xb7, 0x7f, 0x9c, 0xb7, + 0x86, 0x76, 0x3b, 0x6a, 0x5b, 0x8f, 0x76, 0x8b, 0x4d, 0x69, 0x46, 0x88, + 0x33, 0x37, 0x5b, 0xab, 0x9e, 0x7a, 0x34, 0x29, 0x48, 0x5e, 0x84, 0x85, + 0x3b, 0x79, 0x9f, 0x65, 0x88, 0xbe, 0xce, 0x42, 0x59, 0xaa, 0x49, 0x8f, + 0xac, 0xcf, 0xb2, 0x50, 0x74, 0x5b, 0x81, 0x6f, 0xc1, 0x2f, 0xcc, 0x71, + 0xbc, 0xb9, 0xae, 0x60, 0xc4, 0x8e, 0x42, 0xae, 0x67, 0xb5, 0x57, 0xa5, + 0x62, 0x64, 0x8b, 0x63, 0xbc, 0x57, 0x57, 0x25, 0xcc, 0xc3, 0x90, 0x9b, + 0xb1, 0x72, 0xad, 0x70, 0xc8, 0x54, 0x9b, 0xa5, 0x62, 0xa2, 0x50, 0x4d, + 0x6f, 0xac, 0x54, 0x9f, 0x68, 0x72, 0x61, 0x8f, 0x8c, 0x79, 0x5d, 0x9c, + 0x8e, 0x42, 0xd1, 0xd7, 0x6c, 0xc8, 0xaa, 0x89, 0x38, 0xc5, 0x36, 0x32, + 0xc3, 0x7a, 0x7a, 0x5e, 0x7e, 0x65, 0x8a, 0x4e, 0x5c, 0xcd, 0x84, 0xab, + 0x8a, 0xbb, 0xb3, 0xc1, 0x68, 0x9c, 0xce, 0x44, 0x56, 0xae, 0x9a, 0x6c, + 0x77, 0x4f, 0x46, 0xa3, 0x9e, 0xd2, 0xad, 0x49, 0x9f, 0x3d, 0x42, 0x49, + 
0x83, 0x72, 0x9a, 0xab, 0x58, 0xaf, 0x39, 0x87, 0x4d, 0xd9, 0xd8, 0x31, + 0x50, 0xd6, 0x5f, 0x84, 0x3c, 0x40, 0x99, 0xb0, 0xc3, 0xaf, 0x40, 0x65, + 0x5e, 0xba, 0xcc, 0x44, 0xd6, 0x4e, 0x99, 0xb1, 0x77, 0x56, 0x76, 0x71, + 0xc3, 0xa7, 0xce, 0x5c, 0xbe, 0xaf, 0x78, 0x52, 0x85, 0xbd, 0x3b, 0xa7, + 0xd9, 0xbc, 0x5b, 0x45, 0x8e, 0x32, 0xa0, 0x31, 0x9f, 0x46, 0x8f, 0xb7, + 0x6e, 0x60, 0x90, 0xae, 0xae, 0x6a, 0x85, 0x76, 0x5f, 0x5b, 0x72, 0x5c, + 0x4c, 0xc2, 0x64, 0xae, 0xd9, 0xbd, 0xc0, 0x46, 0x82, 0x4a, 0xd9, 0x6c, + 0x4c, 0x75, 0x4e, 0x2b, 0x93, 0xb9, 0x7c, 0x8f, 0x93, 0x50, 0x3b, 0x9a, + 0x74, 0xe8, 0xc4, 0xc9, 0x9d, 0x80, 0x84, 0x75, 0x91, 0x94, 0x84, 0x73, + 0xd0, 0x67, 0x55, 0xc5, 0xbf, 0x40, 0x7e, 0xc7, 0x49, 0xa3, 0xca, 0xac, + 0xbf, 0x52, 0x6e, 0x7a, 0xcb, 0xaf, 0xc0, 0x6d, 0x51, 0x89, 0x59, 0x47, + 0xb4, 0x8d, 0x8e, 0xb3, 0x55, 0x5c, 0xc8, 0x8d, 0x83, 0x35, 0x7e, 0x5e, + 0xcf, 0x7d, 0x59, 0x77, 0xa9, 0x51, 0x8d, 0xc4, 0x7b, 0x97, 0x42, 0x85, + 0x8d, 0xb9, 0x78, 0xaa, 0x71, 0xb6, 0xd0, 0xa2, 0xa9, 0xad, 0x81, 0x62, + 0x99, 0x79, 0x5e, 0xb0, 0xa4, 0x82, 0x47, 0x58, 0x95, 0xca, 0x7e, 0x32, + 0x57, 0x8e, 0xab, 0x9e, 0xab, 0x4e, 0x99, 0xc2, 0x94, 0x34, 0xc1, 0x2e, + 0x9a, 0x36, 0x9a, 0xa3, 0x9e, 0x8e, 0x8c, 0xb8, 0x3e, 0x59, 0x51, 0x5e, + 0x8b, 0x9d, 0xb8, 0x91, 0xbc, 0x28, 0x5a, 0x8e, 0x50, 0x34, 0xa8, 0x8e, + 0xa3, 0x3c, 0x46, 0x47, 0xb4, 0x37, 0xa4, 0x6a, 0xd2, 0xb9, 0x30, 0x7b, + 0x4e, 0xdf, 0x5a, 0x61, 0x42, 0x30, 0x4a, 0xc2, 0x28, 0x45, 0x7f, 0x31, + 0x40, 0x82, 0x6b, 0x85, 0x82, 0x85, 0x84, 0xbe, 0x72, 0x31, 0x80, 0x81, + 0x44, 0x5e, 0x4c, 0xad, 0xa8, 0x97, 0x94, 0x78, 0xbc, 0xa3, 0xb5, 0x69, + 0x6a, 0x3f, 0xae, 0xa7, 0x36, 0xaf, 0xba, 0x5f, 0x69, 0x9b, 0x74, 0xb9, + 0xa1, 0xca, 0x49, 0xc2, 0x64, 0x62, 0x29, 0x9f, 0x83, 0x73, 0x6e, 0xb3, + 0x56, 0xb2, 0x4f, 0xcb, 0x5a, 0x39, 0xb1, 0x5d, 0xc5, 0x81, 0x8d, 0x39, + 0x81, 0x88, 0x96, 0x94, 0x5a, 0x37, 0xbe, 0x4a, 0xc2, 0x79, 0x4b, 0x7a, + 0x3d, 0x88, 0xc1, 0x2b, 0x9a, 0xb8, 0x46, 0x7d, 0x67, 0x40, 0x2b, 0x2e, + 0xcc, 0x8f, 0x36, 0xc8, 0xaf, 0x64, 0x8e, 0x57, 0x69, 0x7c, 0xb7, 0x69, + 0xce, 0xa5, 0x63, 0x83, 0x9c, 0xbe, 0x4b, 0x90, 0xbd, 0xca, 0x80, 0x8d, + 0xd0, 0x50, 0x60, 0x4f, 0xb4, 0x2d, 0x6a, 0x8c, 0x72, 0x2e, 0x98, 0x7e, + 0xbd, 0x3e, 0x8a, 0x2a, 0xc3, 0x69, 0x79, 0xa1, 0x92, 0x32, 0xcd, 0x63, + 0xac, 0x9b, 0x35, 0x3f, 0x7e, 0xc5, 0xa9, 0xca, 0xaa, 0xb2, 0x75, 0x6b, + 0x72, 0xc9, 0x80, 0x64, 0x41, 0xa7, 0x45, 0xbc, 0x4c, 0x3c, 0xa8, 0x66, + 0x4c, 0xaa, 0xa0, 0x75, 0x8e, 0x8e, 0xc4, 0x7a, 0x56, 0xa4, 0x76, 0x34, + 0xaa, 0xa0, 0xae, 0xab, 0xb3, 0x47, 0x60, 0xba, 0xa5, 0x73, 0x6c, 0x3d, + 0x9d, 0xbd, 0x8e, 0x44, 0x94, 0x8d, 0x94, 0x87, 0x70, 0x9d, 0x64, 0x75, + 0xdb, 0x81, 0x59, 0xc9, 0xaf, 0xb8, 0x33, 0x93, 0x7d, 0x7f, 0x7e, 0x9b, + 0x54, 0x8d, 0x5b, 0xe2, 0xb5, 0xd4, 0x96, 0xa9, 0xa6, 0xc8, 0x9e, 0xb5, + 0x3c, 0xb2, 0x31, 0xc4, 0x67, 0x9b, 0xbb, 0x77, 0x3c, 0xb1, 0xd3, 0xa9, + 0x3b, 0x47, 0xa6, 0x6f, 0x69, 0xc6, 0x90, 0x9f, 0x5f, 0xad, 0x47, 0x8c, + 0xab, 0xa4, 0xaf, 0xa5, 0x6a, 0x98, 0x98, 0xba, 0xb0, 0x4e, 0x7d, 0xaa, + 0x4a, 0x93, 0x5d, 0x43, 0xb4, 0x39, 0x64, 0x87, 0x9f, 0x84, 0x72, 0x45, + 0xd0, 0x33, 0x97, 0x3b, 0xc4, 0xbc, 0x73, 0x88, 0x8f, 0x68, 0xa0, 0x98, + 0xc0, 0x97, 0xbc, 0x5b, 0x4f, 0x8d, 0x61, 0x61, 0xd9, 0x3f, 0x68, 0xbb, + 0x4a, 0x38, 0x57, 0x36, 0xa0, 0x57, 0xc9, 0x52, 0x38, 0xc7, 0x42, 0x8c, + 0xb5, 0x9d, 0xbc, 0x8e, 0xa7, 0xc4, 0x89, 0x3e, 0xbb, 0x38, 0x29, 0x96, + 0xb7, 0x46, 0x9e, 0x80, 0xab, 0xcb, 0xc5, 0x5d, 0x8e, 0x81, 0x3a, 0x84, + 0x53, 0x8a, 0xac, 0xa0, 0xb3, 0x89, 0xc2, 0x9a, 0x42, 0x59, 0x36, 0x81, + 
0xbc, 0xbd, 0x32, 0xb3, 0x9b, 0xb1, 0x7f, 0xc0, 0x6a, 0x88, 0x46, 0x7a, + 0xa7, 0x38, 0x9d, 0x7b, 0xb8, 0x51, 0x72, 0x4d, 0xac, 0xca, 0xb8, 0xac, + 0xa8, 0x85, 0xc5, 0x91, 0x68, 0xb1, 0x3f, 0x82, 0xa9, 0xb9, 0x51, 0xc9, + 0xa4, 0x7a, 0xc1, 0xaa, 0x78, 0x3d, 0x72, 0x58, 0x96, 0x7e, 0xb4, 0x67, + 0x9c, 0x55, 0xa1, 0xc7, 0x66, 0x8d, 0xbd, 0xa7, 0xa5, 0x7b, 0x98, 0x9c, + 0xa9, 0x59, 0x99, 0x56, 0xe0, 0xa2, 0x7c, 0x91, 0x41, 0x99, 0x45, 0x5c, + 0x36, 0xbe, 0x71, 0x88, 0x51, 0x67, 0x80, 0xcd, 0xc8, 0xa5, 0x9a, 0x77, + 0xb4, 0x71, 0xca, 0x42, 0xc7, 0xa1, 0x82, 0xa5, 0x7e, 0x6e, 0x6b, 0xbc, + 0xb8, 0xca, 0xd5, 0xc7, 0x42, 0x75, 0xae, 0xba, 0x5c, 0x7f, 0x8c, 0x53, + 0x6b, 0x5a, 0x59, 0x67, 0xa7, 0xcf, 0x62, 0x4d, 0x2d, 0xc3, 0x49, 0x3e, + 0x3e, 0xce, 0x3a, 0x84, 0xcf, 0x41, 0xc2, 0xc2, 0x9f, 0x9e, 0x3f, 0x4e, + 0x47, 0xb1, 0x6c, 0x85, 0x3e, 0x56, 0x86, 0xaf, 0xa2, 0x62, 0x3f, 0x5e, + 0xc3, 0xaf, 0x9d, 0xb3, 0x69, 0xca, 0x59, 0x49, 0x9a, 0x97, 0xad, 0xc2, + 0x69, 0x82, 0x4c, 0xb8, 0xcb, 0x84, 0x62, 0x38, 0xd6, 0x47, 0xb2, 0x6b, + 0x86, 0xba, 0x70, 0xd2, 0x27, 0x56, 0x2f, 0xbe, 0xc6, 0x9c, 0xa2, 0x6a, + 0xb5, 0x60, 0x69, 0xc8, 0xa8, 0xb9, 0x91, 0x4e, 0x30, 0x6c, 0x5f, 0x7a, + 0xb4, 0xb7, 0x90, 0x56, 0x37, 0xc8, 0x43, 0x9b, 0x7f, 0x48, 0x66, 0x70, + 0x78, 0x8a, 0x84, 0xc5, 0x60, 0x3e, 0x59, 0xce, 0x2b, 0x72, 0x86, 0xc7, + 0x91, 0x8c, 0x44, 0x5a, 0xc5, 0x68, 0x84, 0x99, 0x99, 0x7a, 0x7c, 0x3a, + 0x38, 0x8b, 0x36, 0xc0, 0xae, 0x71, 0x73, 0x7a, 0xae, 0xd5, 0x6c, 0xab, + 0xa3, 0x37, 0x7d, 0x35, 0x8d, 0x9b, 0x44, 0xac, 0x5d, 0x48, 0xcb, 0x57, + 0x89, 0x6a, 0xab, 0xc6, 0xc8, 0xcc, 0xba, 0x96, 0x39, 0x68, 0xc2, 0xac, + 0x8a, 0xb3, 0xb3, 0x2f, 0x7d, 0xa8, 0x86, 0x73, 0x77, 0x43, 0x67, 0xb0, + 0x3e, 0x51, 0x7f, 0xad, 0xcf, 0xc9, 0x3c, 0x55, 0xc1, 0x71, 0xab, 0xc0, + 0x4a, 0x86, 0xae, 0xbc, 0xbf, 0x36, 0x42, 0xc6, 0x3e, 0xb6, 0x83, 0x82, + 0x81, 0x62, 0x76, 0xbd, 0x5a, 0x5f, 0xba, 0x5a, 0x95, 0x44, 0x8c, 0x96, + 0x5d, 0x44, 0xc7, 0x7d, 0xb3, 0x3a, 0x51, 0x4e, 0x72, 0x34, 0x6c, 0xa1, + 0xbb, 0x64, 0x8e, 0xc7, 0xc3, 0x9f, 0x62, 0x71, 0x56, 0xc0, 0xd5, 0xbb, + 0x95, 0x3f, 0x96, 0x31, 0xc1, 0x4b, 0x67, 0x63, 0x45, 0x44, 0xcf, 0x8b, + 0x97, 0xc9, 0x89, 0x84, 0x5b, 0x4c, 0x81, 0xcf, 0x32, 0x67, 0x86, 0xcc, + 0xab, 0x67, 0xc2, 0xc7, 0x3e, 0x6b, 0x52, 0x3e, 0xb8, 0x4b, 0x4d, 0xb0, + 0xc2, 0xca, 0xb9, 0x73, 0xc8, 0x41, 0xc8, 0x7a, 0x68, 0xab, 0x3c, 0x7e, + 0x9d, 0xbb, 0xca, 0xc1, 0x3f, 0x55, 0x66, 0x4e, 0x34, 0x66, 0xcd, 0xce, + 0xbd, 0xba, 0xad, 0x9d, 0xa0, 0x43, 0xc2, 0xa8, 0xb8, 0x76, 0xc8, 0x88, + 0x95, 0x43, 0x95, 0x6d, 0x78, 0xb8, 0x68, 0x78, 0xaa, 0x9e, 0x7b, 0xb2, + 0xae, 0x98, 0x89, 0x95, 0x40, 0x4b, 0xe6, 0x47, 0x5c, 0x6f, 0xb9, 0x58, + 0x47, 0x5a, 0xd0, 0x6b, 0xa9, 0x8a, 0x91, 0x49, 0x43, 0x6c, 0x8c, 0xa4, + 0x9f, 0x56, 0x7c, 0x35, 0x44, 0xad, 0x72, 0xa5, 0x59, 0x93, 0x7a, 0x9f, + 0x86, 0x5d, 0x42, 0xba, 0x60, 0xb2, 0xab, 0x5f, 0x54, 0xab, 0x4f, 0xbd, + 0xb5, 0x84, 0x9e, 0x51, 0x3e, 0xc4, 0x76, 0x39, 0xa0, 0x78, 0x61, 0x96, + 0x90, 0x7a, 0x3c, 0xbf, 0xcf, 0xb7, 0x81, 0x38, 0xaf, 0xaf, 0x8c, 0x98, + 0xba, 0xa6, 0xb5, 0xbe, 0xaf, 0x92, 0x8f, 0x9d, 0x39, 0x35, 0xa3, 0xc0, + 0x5e, 0xbb, 0x54, 0xa1, 0xcc, 0x76, 0x68, 0x66, 0x4c, 0x75, 0x61, 0x7f, + 0x9d, 0x85, 0xc4, 0xb2, 0xb8, 0xa1, 0xc2, 0x60, 0x98, 0x4e, 0x74, 0x38, + 0x6a, 0x3c, 0x6f, 0xca, 0x7a, 0x5d, 0xc2, 0x52, 0x88, 0xce, 0xad, 0x59, + 0x45, 0x89, 0x62, 0xa3, 0x5d, 0x85, 0x7a, 0xa7, 0x6b, 0x91, 0x60, 0xb4, + 0xcf, 0x9d, 0x47, 0x7e, 0xb4, 0x8a, 0x84, 0xcf, 0x92, 0x47, 0xb1, 0x54, + 0x57, 0x38, 0x9c, 0x96, 0xc9, 0x3b, 0x39, 0xb1, 0xd8, 0x8a, 0x31, 0x3e, + 
0xb3, 0x3d, 0x5d, 0x78, 0xba, 0x54, 0x4a, 0x7b, 0x62, 0x3d, 0x9a, 0x9e, + 0xd6, 0xcd, 0xb2, 0xcc, 0xb6, 0x6f, 0x58, 0x90, 0x33, 0xbd, 0x83, 0xc4, + 0x6b, 0x53, 0xa8, 0xa4, 0x84, 0x6e, 0xb0, 0x4a, 0xc5, 0x3a, 0xb5, 0xc9, + 0x3c, 0x4f, 0xa0, 0xbf, 0xcb, 0x4a, 0x47, 0xb2, 0xc4, 0x53, 0x96, 0x6b, + 0x52, 0xd4, 0x92, 0xbe, 0x42, 0x6c, 0x71, 0x34, 0x75, 0x9a, 0x51, 0xc7, + 0x59, 0xa2, 0x7a, 0x4f, 0x79, 0xa9, 0x68, 0xb2, 0x7e, 0xb6, 0x5c, 0x2f, + 0xc2, 0x27, 0x6c, 0xc3, 0xbe, 0xcb, 0x4b, 0xc5, 0xbd, 0xc9, 0x6b, 0xa5, + 0x76, 0x8a, 0xbe, 0x88, 0xd5, 0x2e, 0xa3, 0x40, 0x28, 0x73, 0x93, 0xad, + 0x6c, 0x9d, 0x5d, 0xba, 0x98, 0x53, 0xca, 0x39, 0x7f, 0x72, 0x89, 0xb9, + 0x5d, 0x85, 0x6d, 0x83, 0x23, 0xae, 0x6e, 0x94, 0x2d, 0x6b, 0x5d, 0xd2, + 0x5a, 0x94, 0x7b, 0xba, 0xce, 0x6e, 0x6e, 0xc3, 0xc1, 0x67, 0x6c, 0x74, + 0x9d, 0x9e, 0x7e, 0x6c, 0xbd, 0x92, 0x87, 0x52, 0x60, 0x44, 0x4a, 0x3b, + 0x5c, 0xd1, 0x88, 0xc0, 0x93, 0xa3, 0x84, 0x32, 0x71, 0xb0, 0x8d, 0x76, + 0x4d, 0x49, 0xab, 0x60, 0xa3, 0xb6, 0xc4, 0x7d, 0xb2, 0x90, 0x8a, 0xd1, + 0x7e, 0x45, 0x98, 0x55, 0xd1, 0x99, 0xa1, 0xb5, 0xbd, 0x51, 0x70, 0x48, + 0xbc, 0xde, 0x7f, 0xd4, 0x5f, 0x78, 0x9f, 0x7e, 0xc4, 0x91, 0x88, 0xb6, + 0xb1, 0xaf, 0xc4, 0x70, 0xc7, 0x97, 0x90, 0x8a, 0xb6, 0x80, 0xb5, 0x77, + 0x9e, 0x69, 0x81, 0xc0, 0xca, 0x4b, 0x54, 0x95, 0x4e, 0x65, 0x33, 0xbb, + 0xd5, 0x7b, 0x74, 0xd4, 0x75, 0x37, 0xbd, 0x99, 0x41, 0x6a, 0xd1, 0x42, + 0xb2, 0x48, 0x4b, 0xca, 0xad, 0x35, 0xc3, 0x57, 0xbc, 0x55, 0xbf, 0xb3, + 0x3e, 0x51, 0x4c, 0x4a, 0x5f, 0xa4, 0xaf, 0x74, 0xbd, 0xb5, 0xc1, 0x71, + 0x66, 0x90, 0x9b, 0x6f, 0xae, 0xaf, 0xbb, 0xa1, 0x84, 0x4e, 0x29, 0x6b, + 0x67, 0x57, 0x3e, 0x49, 0x6a, 0xbd, 0x43, 0x91, 0x74, 0x6c, 0xb1, 0x43, + 0x54, 0x7c, 0x41, 0x64, 0x55, 0x63, 0x75, 0x7a, 0x50, 0xbe, 0xb9, 0x31, + 0xd9, 0xd7, 0x9c, 0x63, 0x57, 0x6b, 0x77, 0xba, 0x3d, 0xd5, 0xa0, 0xcb, + 0x75, 0xcf, 0x7f, 0xa8, 0xa0, 0x70, 0x37, 0xb6, 0x9a, 0x6e, 0x78, 0x73, + 0x68, 0x68, 0xb1, 0x62, 0xab, 0x6f, 0x61, 0x94, 0x73, 0x7d, 0x40, 0x55, + 0x82, 0x3a, 0x77, 0xc0, 0x6b, 0x33, 0x68, 0x7b, 0x8e, 0x52, 0x73, 0x3e, + 0x48, 0xa4, 0x49, 0x93, 0x8b, 0x73, 0x38, 0xb9, 0x5b, 0xb1, 0x3a, 0x87, + 0xc9, 0xa5, 0xa0, 0x85, 0xc7, 0x42, 0xb0, 0x69, 0xac, 0x92, 0xd8, 0x60, + 0x3f, 0x6e, 0x87, 0x9c, 0x38, 0x36, 0x46, 0x90, 0xa5, 0x4f, 0xcf, 0x6d, + 0xa4, 0xb2, 0x67, 0x83, 0xbd, 0xc4, 0x93, 0x5e, 0x6b, 0x72, 0x95, 0x72, + 0xb6, 0xb3, 0x53, 0x43, 0x48, 0xb6, 0x7d, 0x4b, 0xab, 0x9c, 0x85, 0xb7, + 0x3e, 0x93, 0xd1, 0x84, 0x8f, 0xc7, 0x4e, 0xa0, 0xaa, 0xd1, 0x87, 0xd9, + 0x87, 0x44, 0xa6, 0x74, 0x9b, 0x78, 0x68, 0x6d, 0x3c, 0xb3, 0xc9, 0xac, + 0xc5, 0xbd, 0xde, 0xb1, 0x7e, 0xb4, 0x47, 0x9f, 0xbc, 0xd2, 0x95, 0xd0, + 0x30, 0x80, 0x86, 0x90, 0x90, 0x61, 0xd4, 0x49, 0x84, 0x45, 0x8e, 0xb2, + 0x4b, 0x55, 0x6a, 0x69, 0x61, 0x9c, 0xb7, 0x8f, 0xd0, 0xd6, 0xb0, 0x95, + 0x6c, 0x7d, 0x7b, 0x55, 0xb1, 0x81, 0x6d, 0x68, 0x9a, 0x46, 0xaf, 0x9c, + 0x9c, 0x3e, 0xba, 0x6f, 0x64, 0xca, 0x4d, 0x87, 0xc3, 0xca, 0xd0, 0x55, + 0x49, 0x4e, 0xa9, 0x34, 0xa6, 0xc5, 0x88, 0x59, 0xd5, 0xa3, 0xba, 0xaa, + 0x44, 0x5c, 0x52, 0x52, 0x65, 0x9a, 0xbe, 0x57, 0x34, 0x84, 0x96, 0x59, + 0xb4, 0x40, 0x8b, 0x74, 0xb2, 0xc1, 0xa3, 0x85, 0x7d, 0x88, 0x37, 0x8c, + 0xd2, 0x86, 0xaf, 0xc8, 0x45, 0xd0, 0x95, 0x90, 0xb1, 0xa0, 0x84, 0xae, + 0xd0, 0x68, 0x90, 0x5d, 0x6c, 0x71, 0x80, 0xa0, 0x35, 0xab, 0x5a, 0x6e, + 0x69, 0x77, 0x34, 0x75, 0x5b, 0x69, 0xc7, 0x88, 0x3c, 0x5a, 0x94, 0xd0, + 0x49, 0x6b, 0xa8, 0xb0, 0xc8, 0x54, 0x51, 0x68, 0xc2, 0x3e, 0x5e, 0x7d, + 0x9b, 0x3e, 0x3f, 0x41, 0x4f, 0xa0, 0x7a, 0x3a, 0xcc, 0xb2, 0x8c, 0x9f, + 
0x37, 0x44, 0x53, 0xd7, 0x6f, 0x80, 0x8f, 0x82, 0x7e, 0x47, 0x87, 0xcc, + 0x9d, 0x95, 0x46, 0x8d, 0x4e, 0xab, 0xb8, 0x64, 0x8f, 0x5d, 0xd2, 0x8f, + 0xa5, 0xa9, 0x96, 0xa4, 0xca, 0x7f, 0xc8, 0xa1, 0xa4, 0x80, 0x54, 0xd1, + 0xae, 0xb6, 0x5e, 0xaa, 0x46, 0x66, 0x5e, 0xaf, 0x3b, 0xa8, 0xae, 0x94, + 0x6b, 0xb8, 0xa5, 0x5a, 0xb9, 0x99, 0x95, 0x91, 0x37, 0xa8, 0xad, 0xbe, + 0x54, 0x3b, 0xbf, 0x81, 0xa0, 0xc5, 0x94, 0x7f, 0x77, 0x96, 0x9a, 0x69, + 0xb5, 0x55, 0x84, 0x80, 0xb6, 0xbd, 0x38, 0xc7, 0x3f, 0x44, 0xd2, 0x8f, + 0x59, 0x9e, 0x4f, 0x5a, 0xc7, 0x76, 0xd2, 0x74, 0x80, 0x9b, 0x87, 0x8e, + 0xa2, 0xa4, 0xd0, 0x75, 0x76, 0x36, 0x4b, 0x69, 0x72, 0x5e, 0x96, 0x6e, + 0x46, 0x4e, 0xca, 0x69, 0x7b, 0x65, 0x4d, 0x7c, 0xad, 0xae, 0xc1, 0x7c, + 0x4b, 0xb8, 0x6d, 0xd5, 0xc9, 0x3d, 0xb1, 0xaf, 0x8c, 0x70, 0x8a, 0x36, + 0x97, 0x50, 0x45, 0xbb, 0x7d, 0x86, 0x9b, 0x91, 0x96, 0x95, 0x54, 0xaf, + 0xc3, 0x97, 0xad, 0xa1, 0xb0, 0x3d, 0x46, 0xce, 0x70, 0x66, 0x81, 0x3e, + 0xca, 0xb4, 0x5b, 0xae, 0x39, 0x9d, 0xbd, 0x7a, 0x7f, 0x6d, 0x39, 0x97, + 0xc9, 0x9d, 0xc3, 0x59, 0xc1, 0xab, 0x73, 0xbb, 0xcb, 0xa2, 0xb0, 0xc0, + 0x48, 0x48, 0xa3, 0x40, 0xd9, 0x88, 0xaa, 0xb0, 0xcc, 0x5c, 0x62, 0x50, + 0x69, 0x7e, 0xc1, 0x82, 0xb8, 0xb7, 0x80, 0xd0, 0x6d, 0x4b, 0xa8, 0x62, + 0x52, 0x4e, 0x5e, 0xa6, 0x39, 0xcd, 0x62, 0x63, 0x60, 0x4c, 0xd0, 0x45, + 0xa8, 0xaa, 0x95, 0x4c, 0xb0, 0xd3, 0x56, 0xc4, 0x6b, 0x95, 0xbc, 0x91, + 0x85, 0x6e, 0x49, 0xbb, 0x38, 0x4d, 0xc2, 0xa6, 0xb4, 0x6f, 0xa1, 0x47, + 0x91, 0xb9, 0xc2, 0xaa, 0xca, 0xa3, 0x8a, 0x82, 0x7f, 0x47, 0x42, 0x88, + 0x55, 0xbf, 0x84, 0xce, 0xd5, 0xc2, 0x6f, 0xbb, 0x83, 0x78, 0x6f, 0x51, + 0xb0, 0xd8, 0xd4, 0x89, 0x68, 0xb2, 0xd1, 0xc0, 0x7b, 0x93, 0x9c, 0x36, + 0x44, 0x89, 0x57, 0x3c, 0x3e, 0x42, 0x2d, 0x8e, 0xcb, 0x4b, 0x71, 0x60, + 0x4d, 0xcd, 0xbd, 0xd2, 0x56, 0xb7, 0xc6, 0xcf, 0x5b, 0x3d, 0xa2, 0x82, + 0x99, 0x3c, 0x5b, 0x59, 0x37, 0xd2, 0x9d, 0xaf, 0x4a, 0x97, 0xc6, 0x51, + 0x74, 0x56, 0x3b, 0xb5, 0x6d, 0x5b, 0xc9, 0xbe, 0xa0, 0x72, 0x82, 0x91, + 0xb1, 0xa6, 0x94, 0xb8, 0x4f, 0x53, 0xde, 0x46, 0xa4, 0xa5, 0x40, 0xbf, + 0x82, 0xa1, 0x4f, 0x87, 0x97, 0x46, 0x60, 0x86, 0x63, 0xa1, 0x8f, 0xcd, + 0xca, 0x36, 0x55, 0x8c, 0x70, 0x8b, 0x54, 0xd4, 0x35, 0xc8, 0x9a, 0x3d, + 0x4c, 0xd1, 0x60, 0x83, 0xce, 0x6c, 0x67, 0xd1, 0x8a, 0x9c, 0x67, 0xba, + 0xcf, 0x67, 0x7b, 0x57, 0x76, 0x47, 0x46, 0xa0, 0x34, 0xb9, 0x5d, 0x44, + 0x5a, 0x69, 0x61, 0x98, 0x37, 0x39, 0x90, 0x7d, 0xa6, 0xb7, 0xb2, 0x53, + 0x76, 0x7c, 0x56, 0xb5, 0xae, 0x50, 0xc6, 0x89, 0x4f, 0x61, 0xab, 0xb0, + 0x86, 0xa5, 0x9c, 0xc3, 0x92, 0x36, 0x48, 0x46, 0xa0, 0x6d, 0x57, 0x86, + 0xd0, 0xa3, 0x5b, 0x80, 0xce, 0xcb, 0xa5, 0x95, 0xcd, 0x3e, 0x72, 0x64, + 0x9b, 0x99, 0x83, 0x33, 0x6c, 0xb7, 0x6d, 0xaa, 0x84, 0x35, 0x47, 0x96, + 0x6e, 0x97, 0xb3, 0x95, 0x63, 0x80, 0x8a, 0x6b, 0x7a, 0x6c, 0x4c, 0x8e, + 0x71, 0x49, 0x35, 0x33, 0x9d, 0xaf, 0xcb, 0x5f, 0xa1, 0x63, 0x77, 0x66, + 0x67, 0x7a, 0xc7, 0x79, 0x70, 0xa8, 0x50, 0x76, 0xa3, 0xc2, 0x4c, 0xc0, + 0x6f, 0x91, 0xcd, 0x3d, 0x41, 0xae, 0x82, 0x75, 0xb9, 0x95, 0x36, 0x81, + 0xd3, 0x53, 0x8c, 0x87, 0xb1, 0x56, 0x46, 0x73, 0x91, 0x65, 0x58, 0xb4, + 0x4b, 0xc1, 0x2e, 0x65, 0x7a, 0x42, 0xc3, 0x62, 0x66, 0xd3, 0xcd, 0x36, + 0xa2, 0x43, 0x35, 0xa7, 0x7f, 0xd0, 0x9e, 0x67, 0x7a, 0x7f, 0x3c, 0xc8, + 0xa4, 0x53, 0x48, 0x9c, 0xce, 0xcb, 0x82, 0x3e, 0x99, 0x8d, 0xac, 0x43, + 0xae, 0x73, 0x5f, 0xba, 0x5e, 0x78, 0x36, 0x8d, 0x87, 0x8c, 0x34, 0x41, + 0x9b, 0x3e, 0x6c, 0x81, 0xab, 0xc6, 0x5a, 0xc8, 0xc9, 0x62, 0x6b, 0xb0, + 0x8f, 0xcb, 0x42, 0x4d, 0x71, 0x5f, 0xab, 0xc8, 0x98, 0x54, 0xcd, 0x36, + 
0xb9, 0xb7, 0xa8, 0x94, 0xc8, 0x8c, 0xd6, 0x91, 0x9b, 0x4e, 0x84, 0x4d, + 0x40, 0xcf, 0xa9, 0x57, 0x6b, 0x88, 0x53, 0x4d, 0x99, 0x9a, 0xb3, 0x44, + 0x58, 0xb8, 0xa4, 0x9f, 0x81, 0x6e, 0x62, 0x7b, 0x9e, 0xa3, 0xc8, 0x67, + 0xab, 0x37, 0xa1, 0xab, 0xbe, 0x91, 0x56, 0x89, 0xcf, 0x78, 0xbd, 0x75, + 0xa0, 0x96, 0x9b, 0x89, 0x9b, 0x4f, 0x8d, 0x44, 0x79, 0x6e, 0xa1, 0x4f, + 0x4d, 0x78, 0xbe, 0x5d, 0x9a, 0xd1, 0x44, 0x83, 0x51, 0x47, 0x72, 0xb0, + 0x4a, 0xa5, 0x3c, 0x58, 0x92, 0xab, 0xb0, 0x46, 0xb8, 0x64, 0xc2, 0x67, + 0x63, 0x4c, 0x63, 0xb4, 0x76, 0x89, 0x8a, 0x49, 0x43, 0x2f, 0xc2, 0x5e, + 0xba, 0x3a, 0x65, 0x31, 0x63, 0xcd, 0xb5, 0xae, 0x34, 0x69, 0x95, 0x77, + 0xba, 0x67, 0x61, 0x76, 0x70, 0x7d, 0xcb, 0xd0, 0x33, 0x74, 0x50, 0x92, + 0x7d, 0x6c, 0x39, 0x55, 0x9d, 0x7c, 0x73, 0x47, 0x69, 0xb9, 0xc3, 0xce, + 0xa0, 0x4e, 0xca, 0xc0, 0xab, 0x3a, 0xb5, 0xc2, 0xbf, 0xd0, 0x4a, 0x3e, + 0x57, 0xbf, 0x8e, 0xc9, 0x47, 0x71, 0x59, 0x99, 0x3c, 0xa3, 0xcd, 0xb9, + 0x9d, 0x51, 0x5b, 0xae, 0x7d, 0xa2, 0xaa, 0x3a, 0xab, 0x84, 0x90, 0xd5, + 0x68, 0xc3, 0x81, 0x70, 0xbb, 0x4e, 0xc9, 0x81, 0x9f, 0x45, 0xab, 0x78, + 0xb8, 0x86, 0x7d, 0x5f, 0x63, 0x4c, 0xc5, 0xa2, 0xa8, 0xb6, 0x37, 0x37, + 0x4a, 0xb0, 0x52, 0x52, 0xaf, 0xcd, 0x75, 0xb6, 0x3a, 0x3c, 0x3c, 0x8a, + 0x90, 0x50, 0x90, 0xdc, 0xb9, 0x80, 0x92, 0x81, 0xc9, 0x72, 0x5c, 0xa0, + 0x33, 0xb8, 0x33, 0x94, 0x54, 0x82, 0x7e, 0x8a, 0x3c, 0x70, 0x3e, 0x32, + 0xbe, 0xba, 0x66, 0x34, 0x44, 0x83, 0xb8, 0x5e, 0x3d, 0xbc, 0x51, 0xc6, + 0xbb, 0x78, 0x65, 0xc0, 0xa1, 0x42, 0x56, 0x79, 0xbb, 0x51, 0xce, 0xc8, + 0x64, 0x77, 0x83, 0xb4, 0x83, 0x4b, 0x7c, 0xa8, 0xc8, 0x3a, 0x35, 0x73, + 0x61, 0xa6, 0x54, 0x67, 0xcc, 0xaa, 0x6c, 0x3c, 0x74, 0x5f, 0x43, 0x6e, + 0x75, 0x6a, 0x8f, 0xb6, 0x33, 0xc2, 0xc1, 0x6e, 0x85, 0x66, 0x56, 0xa9, + 0x92, 0xb4, 0x8f, 0x42, 0x66, 0x3e, 0x3d, 0x63, 0x6e, 0x95, 0x4a, 0x83, + 0xac, 0xcd, 0x47, 0x7d, 0xa0, 0x5f, 0x56, 0x91, 0x65, 0xd1, 0x5d, 0xc2, + 0xa4, 0xcd, 0x87, 0xc0, 0x3f, 0x9f, 0x5f, 0x8d, 0x3d, 0xa1, 0x5b, 0xc9, + 0x9c, 0x98, 0x4b, 0xde, 0x50, 0xa1, 0x89, 0xbb, 0x31, 0x89, 0x8b, 0x9d, + 0xd1, 0x9c, 0x50, 0x9f, 0xaa, 0x83, 0x8e, 0x83, 0xb6, 0x8c, 0x8f, 0x46, + 0x37, 0x38, 0x42, 0x4a, 0x57, 0x4b, 0x46, 0xa9, 0xba, 0xb1, 0x79, 0xcc, + 0x78, 0xd0, 0xd2, 0xcb, 0x72, 0x7a, 0x47, 0x3f, 0x60, 0xbc, 0x87, 0xb9, + 0x5b, 0x89, 0x8a, 0xad, 0xad, 0x57, 0x8d, 0x4d, 0x33, 0x84, 0x30, 0xa3, + 0x6b, 0x64, 0x4c, 0xa7, 0x64, 0x30, 0x91, 0x76, 0x55, 0x35, 0x75, 0x89, + 0xd3, 0x87, 0x3c, 0xc6, 0xc7, 0xd0, 0x52, 0x98, 0xa0, 0x84, 0xb4, 0xb9, + 0x9e, 0x79, 0x88, 0xce, 0xd0, 0x6e, 0xc5, 0x38, 0xa5, 0x43, 0x62, 0x6a, + 0x9c, 0xa4, 0x94, 0x44, 0xb7, 0xaa, 0x6d, 0xbd, 0x5d, 0x90, 0x51, 0x98, + 0xb3, 0x6e, 0x65, 0x3a, 0x3f, 0x9e, 0x71, 0xbf, 0x63, 0xad, 0x9f, 0xaa, + 0xcb, 0x5a, 0xba, 0xb3, 0x78, 0x40, 0x57, 0xc7, 0xc0, 0x4a, 0xaa, 0x71, + 0x4d, 0x63, 0x4d, 0x74, 0xb0, 0xaf, 0x63, 0xbe, 0x53, 0x44, 0x78, 0x33, + 0x52, 0x93, 0x9e, 0x51, 0x88, 0x4d, 0x66, 0x81, 0xa0, 0x66, 0x60, 0x94, + 0xc9, 0x4e, 0xac, 0xb9, 0x68, 0xb6, 0xae, 0x3b, 0xa1, 0x3b, 0x87, 0x6c, + 0x5f, 0x85, 0x3e, 0x99, 0xc3, 0x92, 0x88, 0x57, 0xcc, 0xc8, 0x64, 0x54, + 0xd2, 0x5d, 0x97, 0xa0, 0x4a, 0x9a, 0xb9, 0xd2, 0xa9, 0x50, 0x72, 0x86, + 0xa1, 0x4e, 0x88, 0x5a, 0x3d, 0x8c, 0x98, 0xd2, 0x30, 0xb8, 0x4d, 0x54, + 0xce, 0x73, 0x9c, 0x40, 0x6b, 0xab, 0xa8, 0x8a, 0x81, 0x6a, 0x61, 0xaa, + 0xb3, 0xbf, 0x77, 0x74, 0x42, 0x6a, 0x62, 0x98, 0xbb, 0x3d, 0xbb, 0x82, + 0x62, 0x4f, 0x52, 0x53, 0x6e, 0xa1, 0xa2, 0x7f, 0x81, 0xb3, 0x8c, 0xbb, + 0xbb, 0x6d, 0x75, 0x45, 0xbb, 0xca, 0xb0, 0x71, 0x56, 0xac, 0x44, 0x64, + 
0x3c, 0xab, 0x7c, 0x36, 0x9c, 0x3e, 0x80, 0x7f, 0x7e, 0x7d, 0x7c, 0x88, + 0x79, 0x96, 0x79, 0x64, 0xc5, 0x89, 0x9c, 0x32, 0x61, 0x7f, 0x3b, 0xb4, + 0x3d, 0x34, 0x66, 0x44, 0x37, 0x3e, 0xbc, 0xcf, 0x7e, 0xb8, 0x58, 0x81, + 0x51, 0xb6, 0x85, 0x5c, 0xc7, 0x33, 0xbf, 0x4e, 0x5e, 0x57, 0xca, 0xa6, + 0x9c, 0x56, 0x62, 0x99, 0x6a, 0xb5, 0xc5, 0xb1, 0x52, 0x4d, 0x45, 0x58, + 0x80, 0x81, 0x9e, 0x52, 0x3f, 0xc4, 0x92, 0x54, 0x38, 0x9e, 0x6f, 0x4f, + 0x86, 0x82, 0x74, 0xc7, 0xc6, 0xcd, 0x61, 0x4b, 0x44, 0xc4, 0xb8, 0xb4, + 0x6e, 0x5b, 0xb0, 0x9d, 0x72, 0x50, 0x33, 0xa5, 0x76, 0x3d, 0x45, 0x9f, + 0xd0, 0xce, 0xa3, 0x96, 0xca, 0xbc, 0x6b, 0x76, 0x9f, 0xa7, 0x63, 0x79, + 0x87, 0x70, 0x3a, 0xa2, 0x9d, 0x6c, 0x9f, 0x63, 0x9c, 0x58, 0x4b, 0x85, + 0x38, 0x67, 0x9b, 0x6b, 0x67, 0x72, 0x5c, 0x75, 0x43, 0x99, 0x76, 0x6e, + 0x52, 0x57, 0x9d, 0x6e, 0x76, 0x9c, 0x7f, 0xb1, 0xc6, 0x87, 0x4c, 0xb1, + 0x81, 0x77, 0xb2, 0x95, 0xa7, 0x8b, 0xa0, 0x4b, 0xa9, 0x6d, 0x57, 0x99, + 0x37, 0x50, 0x84, 0x88, 0x5a, 0x4f, 0x7d, 0x40, 0x3d, 0x79, 0x2d, 0x50, + 0x4e, 0xa3, 0x7c, 0x9b, 0xa6, 0xae, 0xb9, 0x57, 0xae, 0x8c, 0xa8, 0x67, + 0x66, 0x8f, 0x3e, 0x72, 0xc7, 0xac, 0x40, 0x7f, 0xb3, 0x5a, 0xd5, 0x35, + 0x47, 0x3d, 0x7b, 0xb4, 0x78, 0x75, 0x36, 0xa6, 0xd0, 0xd5, 0xbb, 0x47, + 0xc0, 0x64, 0x95, 0xb4, 0x61, 0xd8, 0x8a, 0x45, 0x47, 0x91, 0xc5, 0x64, + 0xa5, 0x8a, 0x2d, 0x56, 0x58, 0x70, 0x2f, 0x9c, 0xbd, 0xc9, 0x66, 0x3c, + 0x79, 0x94, 0x6b, 0x72, 0x6d, 0x51, 0xae, 0x6e, 0x63, 0x69, 0x59, 0x9f, + 0x9e, 0xce, 0x7d, 0xb5, 0xb9, 0xa5, 0x43, 0xa7, 0xc4, 0xb9, 0x6a, 0xc6, + 0x58, 0x4b, 0xc7, 0x52, 0xca, 0x84, 0xd0, 0x53, 0xc8, 0xa1, 0xb7, 0xb8, + 0x57, 0xd7, 0x58, 0xae, 0xab, 0x8f, 0x70, 0xc4, 0x70, 0xae, 0x7b, 0x58, + 0x38, 0xd0, 0xba, 0x41, 0x9f, 0x93, 0x63, 0x35, 0x89, 0x56, 0xa2, 0x9a, + 0x4f, 0x4c, 0x63, 0x94, 0x82, 0x82, 0xae, 0x7b, 0x7b, 0x7c, 0x3d, 0xa3, + 0x7a, 0xe0, 0x90, 0xab, 0xa2, 0xcd, 0x86, 0x37, 0x44, 0xca, 0xca, 0x54, + 0x48, 0x47, 0x9e, 0x7e, 0x8b, 0x5c, 0xb9, 0x3a, 0x44, 0xca, 0x7f, 0x7b, + 0x32, 0x8a, 0x9b, 0xca, 0xd1, 0xcd, 0xd5, 0x94, 0x65, 0xa3, 0x72, 0x2f, + 0x25, 0x73, 0x6c, 0x75, 0x7d, 0x8b, 0x80, 0x45, 0x8a, 0x5c, 0xba, 0x95, + 0xc2, 0xb8, 0x7f, 0x76, 0x6d, 0x9b, 0xb3, 0x52, 0xcb, 0xd1, 0x51, 0xb3, + 0xbc, 0xab, 0x75, 0x9e, 0x87, 0xc8, 0x7a, 0xd1, 0xce, 0x56, 0x76, 0x93, + 0x4a, 0x9c, 0x99, 0x69, 0xbc, 0x35, 0x66, 0xd0, 0xb2, 0xd8, 0x30, 0x34, + 0x33, 0x2e, 0x5d, 0x49, 0x7a, 0x67, 0xb8, 0x80, 0xb9, 0x5a, 0x6b, 0xa3, + 0x4c, 0x63, 0x8c, 0x80, 0xa0, 0x46, 0xa1, 0x4a, 0xca, 0x4d, 0x98, 0x51, + 0x2f, 0xbf, 0xab, 0x56, 0x8d, 0x74, 0xd8, 0x78, 0x41, 0x4c, 0xa1, 0x6d, + 0x3c, 0x76, 0x5b, 0x54, 0xbd, 0x6e, 0x3e, 0x5a, 0x37, 0x7d, 0x45, 0x61, + 0xbf, 0x95, 0xb3, 0xc7, 0x33, 0x98, 0xdf, 0xa2, 0x66, 0xc7, 0x83, 0xc4, + 0xc0, 0x54, 0x53, 0x3b, 0x40, 0x9f, 0x72, 0x6f, 0xb3, 0x80, 0xbc, 0xa4, + 0x48, 0x9d, 0xb9, 0x3e, 0x63, 0x96, 0x4c, 0x7e, 0xbf, 0x5f, 0x83, 0x5c, + 0x6e, 0x7e, 0x5f, 0x74, 0x51, 0x9d, 0x5c, 0x59, 0xa2, 0xa6, 0xa2, 0x8e, + 0xa9, 0x67, 0xcd, 0xc0, 0x61, 0xc3, 0x9c, 0xc1, 0x84, 0x65, 0x6d, 0xbf, + 0xc1, 0x8f, 0x78, 0x5c, 0x8b, 0x89, 0x90, 0x34, 0x90, 0xbe, 0x46, 0x84, + 0x9c, 0x3e, 0x54, 0x69, 0x89, 0x98, 0x82, 0x33, 0x8e, 0x4f, 0x7f, 0xc3, + 0x44, 0x5d, 0x59, 0x41, 0x7d, 0x69, 0x92, 0xbd, 0x4e, 0x8c, 0xc0, 0xad, + 0x43, 0xcd, 0x69, 0x86, 0x67, 0x88, 0x84, 0x69, 0x6f, 0x34, 0x61, 0x9a, + 0xc7, 0x38, 0xae, 0xa4, 0xa2, 0x63, 0x48, 0xc6, 0x8b, 0x46, 0x93, 0x94, + 0xcb, 0x4e, 0x83, 0x53, 0x6c, 0xb6, 0x9d, 0x41, 0x33, 0x6a, 0x57, 0x2c, + 0x80, 0x65, 0xa9, 0x9d, 0x39, 0x44, 0x52, 0x67, 0xc5, 0x96, 0x8c, 0x85, + 
0x32, 0xc7, 0x4f, 0x85, 0x3e, 0x8d, 0x80, 0x9a, 0x92, 0x62, 0x45, 0x3f, + 0xab, 0xd7, 0x48, 0x59, 0xc2, 0xc5, 0x31, 0xc0, 0x83, 0x43, 0x3c, 0xb4, + 0x39, 0x5c, 0xc0, 0x4f, 0xd7, 0x81, 0xcd, 0x74, 0xbe, 0x7e, 0x95, 0xc2, + 0x8f, 0x85, 0x41, 0x37, 0x97, 0x4f, 0x45, 0x39, 0x8d, 0x48, 0x6b, 0x88, + 0xa5, 0x8d, 0xc7, 0xa2, 0x90, 0x71, 0x7b, 0x3c, 0x32, 0x68, 0x96, 0xc8, + 0x8f, 0xad, 0xaa, 0xb1, 0x32, 0x69, 0x9f, 0xa5, 0xcb, 0xbc, 0x3f, 0x39, + 0x87, 0x79, 0xb9, 0x2f, 0x50, 0xa1, 0xa6, 0x3a, 0xcf, 0x67, 0xa9, 0x4c, + 0x54, 0x30, 0xc8, 0xcb, 0x5c, 0xa1, 0x38, 0x9a, 0x97, 0x98, 0x50, 0x58, + 0x95, 0x54, 0x6c, 0x77, 0x73, 0x9e, 0x84, 0x36, 0xc0, 0x62, 0x91, 0x9a, + 0x9d, 0xb9, 0x60, 0x9a, 0xaf, 0xb2, 0x87, 0x39, 0xb1, 0x65, 0xd5, 0x3f, + 0x43, 0xa0, 0x86, 0xb9, 0x98, 0xa4, 0xd3, 0xb2, 0xb1, 0x62, 0x82, 0x6c, + 0x6b, 0x80, 0x8a, 0x6e, 0x61, 0x42, 0x9c, 0xc1, 0x77, 0x3b, 0x6a, 0x55, + 0xc1, 0xcd, 0x43, 0x8c, 0xc4, 0x8f, 0x9f, 0x35, 0x59, 0x74, 0x60, 0xa8, + 0x80, 0xd5, 0xb8, 0x5b, 0x5b, 0x5e, 0x86, 0xb4, 0x2a, 0x35, 0x62, 0x44, + 0x2e, 0x65, 0xaf, 0x96, 0x45, 0x6d, 0xc2, 0xb2, 0xa5, 0xd0, 0x6a, 0x42, + 0xca, 0x87, 0xcc, 0x41, 0x44, 0xcf, 0x84, 0xa0, 0xc2, 0x80, 0x47, 0x91, + 0x88, 0x75, 0x39, 0x26, 0x59, 0x4d, 0x4f, 0x67, 0x70, 0x52, 0x5f, 0x58, + 0xd8, 0xbd, 0x2b, 0xc2, 0x4c, 0x7b, 0xd4, 0xc9, 0xb0, 0x5d, 0x4f, 0xc9, + 0xd1, 0xc5, 0xc7, 0xa5, 0x47, 0x32, 0x50, 0x3f, 0x82, 0x6f, 0x89, 0x3f, + 0x4c, 0xcc, 0x98, 0x57, 0xcb, 0x86, 0x63, 0xb4, 0xa7, 0xb1, 0xc2, 0x58, + 0x47, 0x59, 0x42, 0x47, 0xb0, 0x33, 0x73, 0x75, 0x47, 0x9c, 0xc5, 0x7a, + 0x5b, 0xc5, 0x57, 0x86, 0xae, 0x9c, 0x75, 0x6c, 0xd9, 0xcd, 0xbd, 0x82, + 0x6d, 0x2d, 0xa6, 0xbe, 0x8c, 0xac, 0x42, 0x86, 0x4c, 0x3b, 0x4f, 0xb9, + 0x9e, 0xbe, 0x45, 0x70, 0x57, 0xc0, 0xb9, 0x3e, 0x9b, 0x93, 0xb9, 0x40, + 0x70, 0xbe, 0xbf, 0xab, 0x34, 0x98, 0x70, 0xc6, 0x37, 0x97, 0x81, 0x45, + 0x39, 0x51, 0x83, 0x95, 0x89, 0x68, 0x8d, 0x5c, 0x3e, 0xb9, 0x6f, 0xca, + 0x28, 0x9b, 0xde, 0xb9, 0x97, 0x8f, 0x80, 0x5c, 0xc8, 0x7e, 0x7b, 0xab, + 0x8b, 0x60, 0x63, 0x6f, 0xac, 0x3b, 0x61, 0x64, 0x8d, 0x4b, 0x77, 0xc1, + 0x6f, 0xd3, 0x88, 0x48, 0x76, 0x31, 0x49, 0x64, 0xac, 0x50, 0x63, 0x82, + 0xd5, 0x96, 0x8d, 0xa2, 0xa1, 0x76, 0x58, 0xcf, 0x8e, 0xac, 0x93, 0xbd, + 0x67, 0x88, 0xb7, 0xac, 0x75, 0x81, 0xc4, 0xae, 0x52, 0x61, 0xc4, 0xd7, + 0xbf, 0xc4, 0xd3, 0x96, 0xb6, 0x9a, 0xc7, 0xc0, 0x35, 0x37, 0xd4, 0x32, + 0x6e, 0x97, 0x93, 0x76, 0x42, 0x5b, 0x64, 0x51, 0xd0, 0x70, 0x5c, 0x3d, + 0x37, 0xaf, 0x34, 0x8b, 0xa6, 0xaa, 0xb8, 0x5f, 0xbf, 0x4f, 0xa5, 0x4b, + 0x7a, 0x58, 0x5c, 0x39, 0x55, 0xbb, 0x5e, 0xb9, 0x91, 0xc6, 0x7b, 0xa2, + 0xc6, 0xa1, 0x62, 0xce, 0x87, 0x5c, 0x58, 0x37, 0x57, 0x91, 0x57, 0x9b, + 0x5e, 0x42, 0x87, 0x6a, 0x7b, 0x6d, 0x37, 0xc7, 0xa3, 0x70, 0xc6, 0xcb, + 0xac, 0x93, 0xd3, 0x5e, 0x5f, 0x95, 0x59, 0xc3, 0x92, 0x5e, 0x68, 0xa8, + 0xa1, 0x33, 0xc7, 0x62, 0x6b, 0x38, 0x9f, 0x48, 0x46, 0xce, 0x3b, 0x8b, + 0x55, 0x65, 0xc4, 0x5e, 0xbd, 0x33, 0x42, 0x98, 0xa2, 0x88, 0xa7, 0x6e, + 0x63, 0xc4, 0x6b, 0xc3, 0x3e, 0x89, 0x63, 0xcd, 0xcb, 0xa5, 0x9f, 0xc3, + 0xb3, 0x7d, 0xb9, 0x53, 0x57, 0xa9, 0x8f, 0xcd, 0x89, 0xb7, 0xab, 0x97, + 0xae, 0xb7, 0x35, 0x4f, 0x90, 0x34, 0x59, 0x96, 0x77, 0x30, 0xc8, 0x37, + 0x71, 0x9c, 0x9a, 0x55, 0x51, 0xa2, 0x82, 0xa0, 0x8f, 0x7a, 0x5f, 0xaa, + 0xbe, 0x89, 0x2a, 0x87, 0xb7, 0x6d, 0x7d, 0x4b, 0x87, 0x8d, 0x75, 0x8f, + 0x36, 0xcd, 0x6b, 0x93, 0xcb, 0xd5, 0x85, 0x51, 0x6e, 0x43, 0x89, 0xd2, + 0xcf, 0x52, 0x41, 0x7c, 0x3f, 0x83, 0x3b, 0x5f, 0xc6, 0x57, 0x4f, 0x6c, + 0x40, 0x4a, 0x97, 0x84, 0x73, 0x6f, 0x7f, 0x83, 0x3c, 0x75, 0x6d, 0x97, + 
0x2f, 0x75, 0x89, 0x6b, 0x2f, 0xa0, 0x55, 0xb5, 0x39, 0x87, 0xa6, 0x3c, + 0x9b, 0xa9, 0x7d, 0x28, 0x3f, 0x5a, 0x35, 0x63, 0xab, 0xbf, 0x4e, 0x77, + 0x9a, 0x3e, 0x8e, 0x8c, 0x69, 0x79, 0xa6, 0x7a, 0x71, 0xbb, 0xb3, 0x4b, + 0x52, 0x6b, 0xb1, 0xcd, 0x9a, 0xc8, 0x7e, 0x68, 0x81, 0x3a, 0xc2, 0x98, + 0x72, 0x38, 0x56, 0x40, 0xca, 0x89, 0x7a, 0x2e, 0x44, 0xa1, 0xd1, 0xae, + 0x2b, 0x5a, 0x47, 0xc7, 0x4d, 0x8b, 0x80, 0xa0, 0x99, 0xce, 0x32, 0x62, + 0xad, 0xa7, 0x87, 0xd3, 0x9c, 0x84, 0x90, 0xda, 0x76, 0x9f, 0x9b, 0x5c, + 0x6c, 0xb0, 0x79, 0x31, 0x4f, 0xa3, 0x36, 0xa2, 0x73, 0x32, 0x9b, 0xb5, + 0x3f, 0x9f, 0xc0, 0x91, 0x57, 0xcd, 0x78, 0x4e, 0x7c, 0x41, 0x8e, 0xa0, + 0x34, 0x8d, 0xb6, 0x59, 0x3c, 0xad, 0xa9, 0x5b, 0x72, 0xcd, 0xa1, 0xd0, + 0xd1, 0x6c, 0x89, 0x9b, 0xa8, 0x6a, 0x56, 0x9c, 0xb0, 0xae, 0x31, 0x45, + 0x40, 0x9d, 0x5c, 0x71, 0xcf, 0xb3, 0xa9, 0xc5, 0x78, 0x1b, 0xa7, 0x74, + 0x6f, 0x96, 0xbd, 0x5f, 0x4c, 0x3b, 0xc9, 0x4a, 0x8b, 0x91, 0x8b, 0x64, + 0x3d, 0x87, 0x81, 0x32, 0xc6, 0x61, 0x62, 0x48, 0x8c, 0xac, 0x37, 0x9d, + 0xad, 0x7b, 0x62, 0xb5, 0x73, 0x58, 0xaa, 0x6a, 0xdc, 0x6f, 0xc1, 0xa4, + 0x9b, 0x4c, 0x64, 0x34, 0x44, 0xad, 0xad, 0xb9, 0x9b, 0xae, 0x8b, 0x5f, + 0xa8, 0x37, 0x3f, 0xa8, 0xb0, 0x55, 0xc4, 0xa0, 0xd7, 0x98, 0xbf, 0x93, + 0xb2, 0x7e, 0x72, 0x6f, 0x3e, 0x49, 0x4c, 0x46, 0x62, 0x9d, 0x50, 0x72, + 0x74, 0x8a, 0x62, 0xc8, 0xad, 0x88, 0x90, 0xcf, 0x68, 0x66, 0x4f, 0x92, + 0xb2, 0x29, 0x81, 0x60, 0x85, 0xce, 0x55, 0xbc, 0x64, 0x4b, 0x4b, 0xb9, + 0x39, 0x2b, 0xaa, 0x4e, 0x91, 0x94, 0x9c, 0xbf, 0x7e, 0x57, 0x95, 0x6f, + 0x38, 0x64, 0x7a, 0xd3, 0x34, 0x91, 0xac, 0x62, 0x40, 0x82, 0x6e, 0xc0, + 0xbd, 0xcb, 0xb9, 0xad, 0xd9, 0xa9, 0x7c, 0x58, 0xca, 0x3f, 0xc5, 0xc9, + 0x55, 0x82, 0x72, 0x43, 0x74, 0x74, 0xae, 0x53, 0x79, 0x84, 0x48, 0x83, + 0x90, 0x39, 0x8b, 0xd4, 0xce, 0x68, 0x73, 0x7d, 0xae, 0x96, 0x98, 0x54, + 0x8e, 0x7e, 0x73, 0x46, 0x59, 0x8f, 0xa3, 0x36, 0x5a, 0x57, 0xd0, 0x3b, + 0xc9, 0xba, 0x74, 0x7f, 0x9b, 0x39, 0xb5, 0x58, 0x83, 0x6b, 0xd9, 0x86, + 0x3d, 0xd8, 0x5f, 0xcc, 0xc7, 0xd0, 0xca, 0x90, 0x90, 0x23, 0x77, 0x9d, + 0x3b, 0x6a, 0xa4, 0xbf, 0xb7, 0x78, 0xb7, 0x72, 0x59, 0x71, 0x6d, 0x8c, + 0x61, 0x2e, 0xa0, 0xb9, 0x80, 0x6c, 0x75, 0x8d, 0x38, 0x47, 0xd1, 0x2e, + 0x75, 0x7d, 0x51, 0x4c, 0x99, 0x80, 0x52, 0x8f, 0xab, 0x80, 0x3f, 0x9e, + 0x5f, 0x93, 0xbb, 0x90, 0xaa, 0xc2, 0x5a, 0x41, 0xa4, 0x46, 0x7d, 0x59, + 0x3b, 0x79, 0xc4, 0xb7, 0x6a, 0xa5, 0xc2, 0xa6, 0x81, 0xbe, 0xb1, 0xb8, + 0x72, 0x73, 0x66, 0x39, 0x5b, 0x3a, 0x5c, 0x9d, 0xb7, 0xca, 0x86, 0xad, + 0x5c, 0xd5, 0x4b, 0x5a, 0xcc, 0x7b, 0x2c, 0x5f, 0xc0, 0x67, 0x96, 0x84, + 0x84, 0xc2, 0x88, 0x53, 0x50, 0xcf, 0x3e, 0x53, 0x7b, 0xdb, 0x92, 0xd5, + 0x9e, 0x8c, 0x66, 0xca, 0x8e, 0x69, 0xbd, 0x29, 0x9c, 0xc6, 0x9d, 0xdd, + 0x3e, 0xcc, 0x54, 0xc8, 0xa4, 0xa8, 0x4f, 0x74, 0x93, 0x88, 0x28, 0x59, + 0xae, 0x29, 0x84, 0x5b, 0x74, 0x9a, 0x72, 0xa0, 0xbb, 0x41, 0x2a, 0x7b, + 0x73, 0x75, 0x85, 0x84, 0xbc, 0x8a, 0x8c, 0x45, 0xe3, 0x63, 0x8d, 0x49, + 0x56, 0x8d, 0xa0, 0x30, 0xb5, 0x48, 0x84, 0x4c, 0x6f, 0xbf, 0xbc, 0x90, + 0x39, 0xd4, 0xce, 0x97, 0xae, 0xba, 0xad, 0x68, 0x5c, 0x93, 0x5b, 0x8e, + 0xbb, 0x6e, 0xb5, 0xc9, 0x81, 0x7a, 0x2b, 0x47, 0xa8, 0x50, 0x9a, 0x81, + 0xd5, 0x4e, 0x3a, 0x35, 0x56, 0xc1, 0xb7, 0x61, 0x41, 0xce, 0x67, 0x41, + 0x75, 0x88, 0xb4, 0xc1, 0xb3, 0x87, 0x5c, 0x6f, 0x4d, 0x60, 0xba, 0x48, + 0x7b, 0x33, 0x4a, 0xb3, 0x7f, 0x7b, 0x97, 0x56, 0xbc, 0xb4, 0x6d, 0x36, + 0xa4, 0xa3, 0xdf, 0xd2, 0xb9, 0xa8, 0x5f, 0xc6, 0xa3, 0xc1, 0x64, 0xb4, + 0x16, 0xb1, 0xa1, 0xc7, 0x98, 0x80, 0xaf, 0x81, 0x7f, 0x5f, 0x3d, 0x27, + 
0x34, 0x8a, 0x8c, 0x6a, 0xd6, 0xb3, 0x83, 0xb0, 0x77, 0x81, 0x95, 0xc8, + 0x67, 0x9d, 0xde, 0x79, 0x5b, 0x26, 0x40, 0x88, 0xa0, 0x7d, 0x80, 0x81, + 0x5e, 0x95, 0x3b, 0xd6, 0x50, 0x65, 0x3a, 0x59, 0x87, 0x96, 0xc1, 0xc1, + 0x43, 0x45, 0xa1, 0x8d, 0x5e, 0x47, 0x9f, 0x60, 0x6f, 0x98, 0x6e, 0x69, + 0x55, 0x3f, 0xad, 0x6f, 0x92, 0xd3, 0x4a, 0x99, 0x4f, 0xb4, 0x7b, 0x50, + 0x68, 0x8a, 0x3f, 0x30, 0x61, 0x6c, 0x84, 0x4c, 0x58, 0xd2, 0x91, 0x9d, + 0x8b, 0xdb, 0x97, 0x86, 0x35, 0xbd, 0x40, 0xd3, 0xa7, 0x98, 0x49, 0x6e, + 0xb9, 0xca, 0x63, 0xc7, 0x69, 0x42, 0xc3, 0x97, 0xc3, 0xc8, 0x8b, 0x80, + 0xc4, 0xa9, 0xa7, 0x74, 0xc6, 0xb7, 0x4d, 0x84, 0x44, 0x2e, 0x67, 0x3a, + 0xa3, 0xab, 0x42, 0xae, 0x66, 0x98, 0x43, 0x82, 0x3e, 0x9a, 0x29, 0x8f, + 0x47, 0x5b, 0x6f, 0x82, 0x6b, 0x43, 0x48, 0x9b, 0x67, 0x31, 0x94, 0x94, + 0x9c, 0x6c, 0x4f, 0x92, 0x6c, 0x5f, 0x36, 0xbb, 0x42, 0x59, 0xbd, 0xb0, + 0xcb, 0x62, 0x38, 0x3a, 0x74, 0x4d, 0xa5, 0x7f, 0x5b, 0x68, 0x4e, 0xad, + 0xa5, 0x4d, 0x7b, 0xd9, 0x8b, 0x81, 0x4a, 0x42, 0x4a, 0xc8, 0xd7, 0xd2, + 0x3f, 0xa5, 0x47, 0x5b, 0x8b, 0xd5, 0x69, 0xb2, 0x9b, 0xa6, 0x5f, 0xac, + 0xae, 0x8b, 0x9d, 0xb9, 0x2d, 0x40, 0x7b, 0x75, 0xa2, 0xc9, 0xa5, 0xb9, + 0x2f, 0x84, 0x64, 0x3c, 0x8d, 0x89, 0x3e, 0x61, 0xaa, 0x79, 0xa4, 0xb6, + 0x2a, 0x44, 0xae, 0x8a, 0x2b, 0x33, 0x48, 0x5c, 0xd2, 0x6d, 0xa5, 0x80, + 0xb6, 0x95, 0x93, 0x42, 0xd0, 0xcf, 0x79, 0xb6, 0x6e, 0xb0, 0x8a, 0xa8, + 0x95, 0x61, 0xa9, 0x51, 0x6c, 0x79, 0x98, 0x96, 0xc8, 0x42, 0x51, 0x2e, + 0x7c, 0x5c, 0xc9, 0x7b, 0x9e, 0x95, 0x6a, 0xb9, 0x6c, 0xa5, 0x72, 0xa9, + 0x56, 0x3e, 0x5e, 0xa1, 0x9c, 0x64, 0x37, 0x3d, 0x56, 0x86, 0x79, 0xbd, + 0xbc, 0xda, 0x6d, 0xc8, 0x94, 0xba, 0xa8, 0x66, 0x7c, 0xb6, 0x86, 0xc3, + 0xc0, 0x74, 0x58, 0x7e, 0x41, 0x8d, 0x96, 0x4d, 0x87, 0x79, 0x49, 0x8d, + 0x40, 0x7e, 0xad, 0x79, 0xcf, 0xb8, 0x91, 0xb8, 0x56, 0xc5, 0x85, 0xb7, + 0xbe, 0x70, 0x87, 0x99, 0x5b, 0x90, 0x9e, 0x61, 0xbe, 0xc8, 0x55, 0xa7, + 0x8e, 0x53, 0xbc, 0x9e, 0x56, 0x9f, 0x2e, 0x3b, 0xb8, 0xca, 0xa6, 0xc7, + 0x46, 0x75, 0x7a, 0xbd, 0x78, 0x96, 0xd9, 0x49, 0x8c, 0xa4, 0x72, 0xab, + 0x97, 0xa2, 0x8e, 0xa7, 0x69, 0x92, 0x92, 0xa4, 0x4c, 0xb4, 0x7b, 0x87, + 0xa5, 0xb6, 0x9b, 0x7c, 0x32, 0xce, 0x9e, 0xbc, 0xc4, 0xba, 0x69, 0x7e, + 0x74, 0x64, 0x50, 0x9b, 0x48, 0xbd, 0x43, 0x50, 0x4b, 0x49, 0x8b, 0x70, + 0x6e, 0x5a, 0x70, 0xa8, 0xb7, 0x67, 0x71, 0x9a, 0x2a, 0x9c, 0xc6, 0x62, + 0x44, 0xb6, 0x83, 0x79, 0x73, 0xc0, 0x53, 0x8a, 0x7d, 0x70, 0x73, 0x34, + 0xa6, 0x75, 0x8a, 0x33, 0x86, 0xb6, 0x39, 0x86, 0x89, 0xa6, 0x62, 0x52, + 0xb0, 0xd0, 0x9c, 0x9d, 0x61, 0x9e, 0x4d, 0x43, 0x7c, 0x76, 0x88, 0xb6, + 0xa4, 0x6d, 0xbe, 0x4f, 0x7e, 0x5c, 0x88, 0xba, 0xd1, 0x45, 0xb5, 0xbc, + 0x87, 0x9a, 0x80, 0xd4, 0x7b, 0x7d, 0x28, 0x3f, 0x9d, 0xb6, 0x96, 0x42, + 0x46, 0x4d, 0x49, 0xc6, 0x2c, 0x2e, 0x94, 0xc8, 0x66, 0x39, 0x3a, 0x4f, + 0x70, 0xd1, 0x3a, 0x91, 0x56, 0x95, 0xc3, 0x8a, 0x4b, 0x98, 0x5c, 0x4e, + 0x95, 0x9b, 0x61, 0x52, 0xcb, 0x65, 0x78, 0x74, 0x7b, 0x88, 0x41, 0x5b, + 0x6a, 0x75, 0x7d, 0x8c, 0x3a, 0x5c, 0x79, 0x67, 0xc5, 0xc9, 0xae, 0x77, + 0x54, 0xb0, 0x6a, 0x57, 0x32, 0x36, 0xc0, 0x51, 0x5b, 0x9e, 0x9a, 0x88, + 0x8a, 0x99, 0xc4, 0x4e, 0xb5, 0xa9, 0xcb, 0xb0, 0x4a, 0x61, 0x93, 0xbe, + 0x76, 0x86, 0xc2, 0x8a, 0x38, 0x56, 0xa8, 0x58, 0x67, 0xcb, 0xa1, 0x9a, + 0xd1, 0xab, 0xc9, 0x48, 0x7a, 0x71, 0x8e, 0x86, 0xc6, 0x2b, 0xc4, 0x79, + 0x6e, 0x30, 0x83, 0xbd, 0xa0, 0x68, 0x35, 0xb9, 0xc2, 0xa3, 0x5a, 0x88, + 0x60, 0x44, 0xa3, 0x8c, 0x6c, 0x78, 0xc9, 0xd3, 0x49, 0xa7, 0x89, 0x34, + 0x91, 0xb9, 0x8c, 0x50, 0xa7, 0x34, 0x3c, 0xc2, 0x8d, 0xcb, 0xcf, 0x9a, + 
0x54, 0x50, 0x8c, 0x8e, 0x80, 0x61, 0x94, 0xa6, 0xb3, 0xc0, 0x72, 0x5f, + 0xba, 0x39, 0x32, 0x53, 0xa6, 0xad, 0x66, 0x78, 0x8d, 0xae, 0x8a, 0x53, + 0xd7, 0x4b, 0x53, 0x9d, 0xaa, 0xc6, 0xba, 0x80, 0xc6, 0x5d, 0x86, 0xb1, + 0xa6, 0x84, 0x40, 0x76, 0xd7, 0x4d, 0x3f, 0xbb, 0x7c, 0x69, 0xdf, 0x9e, + 0x40, 0xa0, 0x88, 0x9a, 0xae, 0xbc, 0xaa, 0x78, 0x5b, 0x8e, 0x70, 0x56, + 0x60, 0x44, 0x5e, 0x4f, 0x50, 0xb6, 0xb7, 0xaa, 0xcf, 0x4f, 0x81, 0x68, + 0x89, 0x91, 0x44, 0x7c, 0xc4, 0x8a, 0x60, 0x2c, 0xca, 0x61, 0x49, 0x8f, + 0x3b, 0xa9, 0x59, 0xb0, 0x61, 0x9e, 0x8d, 0xae, 0xbb, 0x89, 0x37, 0x45, + 0x95, 0x8e, 0xc4, 0xba, 0xbe, 0xa1, 0xca, 0x69, 0xaf, 0xad, 0x47, 0xb0, + 0xce, 0x69, 0x98, 0x9e, 0x9b, 0xd2, 0xcd, 0x67, 0xa8, 0x39, 0x74, 0xae, + 0x6b, 0xcb, 0x56, 0x85, 0xac, 0x5d, 0x52, 0xb4, 0x33, 0xb1, 0x83, 0x74, + 0xb3, 0x79, 0x95, 0x3f, 0xbb, 0x82, 0xa6, 0x51, 0x48, 0xb1, 0xc2, 0xd1, + 0x5d, 0x87, 0x5d, 0x70, 0x9d, 0x6c, 0xbf, 0x94, 0xa4, 0x3c, 0x5f, 0xa1, + 0x51, 0x7b, 0x76, 0x63, 0x37, 0xc8, 0x79, 0x79, 0x5a, 0x83, 0xa2, 0x9c, + 0xbb, 0x93, 0x52, 0x2b, 0x75, 0x5a, 0xb9, 0x63, 0xc1, 0x6e, 0xce, 0x63, + 0x89, 0x6e, 0xbc, 0x28, 0xaa, 0x79, 0x87, 0xca, 0x3d, 0xc0, 0xa9, 0x53, + 0xbf, 0x4b, 0x5a, 0x8c, 0x31, 0x86, 0xc9, 0xdb, 0x64, 0x91, 0xb5, 0xd0, + 0x5c, 0xcb, 0x4a, 0xad, 0x4a, 0xc8, 0x7b, 0xa5, 0xb8, 0xcc, 0x72, 0x89, + 0x49, 0x8a, 0x82, 0x35, 0x76, 0x8e, 0x90, 0x83, 0x3f, 0xa6, 0x83, 0x5c, + 0x7b, 0x63, 0xab, 0x74, 0x94, 0x85, 0x3f, 0x38, 0x4c, 0x54, 0x41, 0xa4, + 0x67, 0x89, 0x2e, 0x68, 0x31, 0xab, 0xc5, 0xba, 0xaa, 0x5b, 0x51, 0x7b, + 0xa7, 0x51, 0x4f, 0x56, 0x44, 0xc1, 0x6d, 0x3b, 0x41, 0x75, 0xc4, 0x50, + 0x4a, 0x9c, 0xc9, 0xc0, 0x3c, 0x6c, 0x9b, 0xb1, 0x6e, 0x89, 0x44, 0xaf, + 0x8d, 0xc2, 0xc7, 0xae, 0x83, 0x68, 0x6c, 0xc5, 0x8c, 0x86, 0xa7, 0x4e, + 0x54, 0x9f, 0xa1, 0xd0, 0xc2, 0xa5, 0x75, 0x85, 0xa4, 0x6b, 0x4a, 0x38, + 0xbe, 0xb0, 0x35, 0xab, 0xc9, 0x8b, 0x3d, 0x6f, 0x7c, 0x31, 0x94, 0xa2, + 0xa6, 0xb2, 0xaf, 0x62, 0x60, 0x52, 0xa5, 0xc8, 0x3d, 0xac, 0x93, 0x6c, + 0x35, 0xac, 0x6a, 0x62, 0x83, 0x6b, 0xa7, 0x9d, 0x3d, 0x4c, 0x6e, 0xbc, + 0x92, 0xd6, 0xd2, 0x68, 0xa6, 0xb8, 0x5d, 0x5f, 0x59, 0x9a, 0x26, 0x65, + 0x5d, 0x43, 0x65, 0x80, 0xbf, 0x45, 0xd2, 0x54, 0xd5, 0xba, 0x89, 0x84, + 0x94, 0x5a, 0xa3, 0x87, 0xd2, 0x6d, 0x46, 0x9e, 0x49, 0x83, 0x87, 0xbc, + 0xa6, 0x66, 0x95, 0x3e, 0x3f, 0x72, 0x99, 0x63, 0x4f, 0x78, 0x6e, 0xd2, + 0x96, 0x64, 0x47, 0xa9, 0x79, 0x67, 0x42, 0x3a, 0xa5, 0x99, 0x61, 0xb5, + 0x5a, 0x7f, 0xa7, 0xcd, 0x83, 0x52, 0xaf, 0x37, 0xd0, 0x6d, 0x96, 0x8b, + 0x6d, 0x51, 0x5a, 0xd7, 0xa0, 0x4e, 0xd9, 0x5f, 0x3b, 0x56, 0x56, 0x75, + 0x3c, 0x5e, 0x50, 0xbc, 0xd6, 0xaf, 0x7d, 0x8e, 0x78, 0xbf, 0x38, 0xa3, + 0x61, 0xa8, 0x42, 0x72, 0x49, 0x6b, 0x68, 0xbb, 0x5c, 0xa4, 0x83, 0x9c, + 0xba, 0x9e, 0xc4, 0x7f, 0x7a, 0x7c, 0x7c, 0x5e, 0x73, 0x5e, 0x50, 0x81, + 0x79, 0x64, 0xac, 0xa7, 0xba, 0x6c, 0x39, 0xc4, 0x4e, 0xb9, 0x71, 0x9b, + 0xbc, 0x85, 0xb7, 0x4c, 0x63, 0xc8, 0x5b, 0xcd, 0x67, 0x52, 0x65, 0x9c, + 0x32, 0xb6, 0x3d, 0x6e, 0x57, 0x72, 0xc8, 0x85, 0x8f, 0x5e, 0xaa, 0x4e, + 0x65, 0x36, 0x5e, 0x8a, 0xaf, 0x3b, 0xb4, 0x9f, 0xd5, 0x62, 0x72, 0x85, + 0x69, 0x8b, 0xba, 0x89, 0xae, 0xa0, 0xad, 0x66, 0xa9, 0xa8, 0x8c, 0x4f, + 0x5b, 0xb5, 0xdd, 0xd8, 0x8d, 0x4d, 0x8b, 0x98, 0x3e, 0x5f, 0xa6, 0x57, + 0x61, 0x3f, 0x9c, 0xbf, 0x58, 0x96, 0x40, 0x6f, 0xcf, 0x91, 0x85, 0x3d, + 0x7f, 0xc7, 0x39, 0xbc, 0x90, 0xca, 0xa1, 0x8a, 0x86, 0x71, 0x92, 0x79, + 0xb4, 0xb2, 0x91, 0x3f, 0xa1, 0x38, 0x75, 0xaf, 0x72, 0xab, 0x44, 0x8b, + 0xd0, 0x8e, 0xba, 0xbc, 0x95, 0x7e, 0x63, 0x5b, 0xce, 0x86, 0x87, 0x8a, + 
0x82, 0x45, 0xcb, 0x6a, 0x9c, 0x47, 0x3c, 0x51, 0x5f, 0xdb, 0xd1, 0x92, + 0xc4, 0x97, 0x6e, 0x95, 0x65, 0xc3, 0x91, 0x88, 0x57, 0x7c, 0xab, 0xb3, + 0x5a, 0x95, 0xc6, 0x4a, 0x40, 0x42, 0x46, 0xa8, 0x8f, 0xca, 0x77, 0xbf, + 0x90, 0x41, 0x8a, 0x9e, 0xa9, 0xb0, 0x50, 0xcc, 0xc7, 0xa4, 0x82, 0x62, + 0xae, 0x6b, 0xda, 0x7a, 0x3e, 0x66, 0x9e, 0x9f, 0xcf, 0x63, 0x7c, 0x44, + 0x3f, 0x45, 0x8c, 0x70, 0x76, 0x4b, 0x99, 0xd6, 0x3c, 0xb3, 0x44, 0x78, + 0xb1, 0x67, 0x8e, 0x5b, 0x79, 0xa0, 0x68, 0xc8, 0x66, 0x22, 0x4d, 0x2c, + 0x67, 0xaa, 0xc3, 0x3a, 0xbc, 0xcd, 0x94, 0x9f, 0x3f, 0x7e, 0x38, 0x53, + 0x61, 0xc7, 0xa0, 0x73, 0x4d, 0x52, 0x51, 0x5f, 0xba, 0x53, 0xa8, 0x88, + 0xbb, 0x96, 0x4c, 0x40, 0x95, 0x97, 0xbe, 0xa6, 0xbf, 0xc1, 0xc3, 0xb0, + 0xd3, 0xbd, 0x4e, 0xd3, 0xca, 0xb8, 0x90, 0x95, 0xc1, 0x2a, 0xb5, 0x76, + 0xa5, 0x72, 0xab, 0x83, 0xa1, 0x80, 0x8e, 0xac, 0x31, 0x6c, 0x5a, 0xd0, + 0xc1, 0xc4, 0x75, 0xc5, 0x43, 0x44, 0x6a, 0xbc, 0x9a, 0xb4, 0xc7, 0xcd, + 0x62, 0x39, 0x35, 0xa7, 0x64, 0x85, 0x9b, 0x38, 0xb0, 0x5d, 0x79, 0x45, + 0x7b, 0xcc, 0x43, 0xb3, 0x46, 0xbf, 0x63, 0x78, 0x5d, 0x6e, 0xa3, 0x7d, + 0x46, 0x41, 0x67, 0x89, 0xa0, 0x48, 0xc6, 0x95, 0x6e, 0x8e, 0xba, 0x34, + 0xb4, 0x6b, 0xc1, 0x72, 0x38, 0xcb, 0x82, 0x72, 0xb6, 0xaa, 0xb7, 0x3e, + 0xab, 0xa4, 0x9f, 0x5d, 0x96, 0xab, 0x84, 0xa2, 0x42, 0x3c, 0x71, 0x71, + 0x88, 0xb8, 0x44, 0xce, 0x41, 0xc2, 0xaf, 0x54, 0x91, 0x3b, 0x33, 0xb4, + 0x76, 0xb9, 0x78, 0x71, 0x9c, 0x74, 0x37, 0x7a, 0xaa, 0x80, 0x49, 0xa1, + 0xa3, 0xc2, 0xa0, 0xb4, 0x4d, 0xa9, 0x78, 0xa5, 0x5e, 0xb0, 0x81, 0x96, + 0xba, 0xb6, 0x3a, 0x5b, 0x3a, 0x79, 0x88, 0xaa, 0x8a, 0xa1, 0x9a, 0x95, + 0xa2, 0x50, 0x8d, 0xc6, 0x4b, 0x90, 0x7f, 0x96, 0x8e, 0x3f, 0x7a, 0x42, + 0x6b, 0xb3, 0x7e, 0xbb, 0x5d, 0x45, 0x79, 0xdc, 0xca, 0x92, 0x4f, 0xb2, + 0xd3, 0xd3, 0xab, 0x60, 0x7d, 0x51, 0xb2, 0xc7, 0xab, 0xac, 0x56, 0xaf, + 0x9f, 0x9f, 0x75, 0xbd, 0x41, 0x5f, 0xa9, 0x56, 0x61, 0x57, 0x4d, 0x9e, + 0xae, 0x57, 0x82, 0xad, 0x77, 0xb2, 0x61, 0x4f, 0x39, 0xd3, 0x84, 0xc7, + 0x74, 0x51, 0x49, 0x7d, 0xb6, 0x93, 0x48, 0x3d, 0xb0, 0x57, 0x4a, 0xbd, + 0x43, 0x6d, 0xb2, 0xa1, 0x8b, 0x7e, 0x5d, 0x67, 0xa2, 0x6e, 0x4a, 0x81, + 0x4a, 0x92, 0x31, 0x83, 0x8b, 0xab, 0xa2, 0x6f, 0x5e, 0xcb, 0xcf, 0x79, + 0x8b, 0x73, 0x78, 0xcd, 0x37, 0x52, 0x9e, 0xa9, 0x2a, 0x49, 0x42, 0x45, + 0x56, 0x2c, 0x8b, 0x29, 0x88, 0x57, 0xbe, 0x85, 0x44, 0xe1, 0x95, 0x86, + 0x76, 0x6a, 0x68, 0x54, 0x55, 0x61, 0x59, 0x79, 0x5b, 0x54, 0x42, 0xb7, + 0x7b, 0x8a, 0x88, 0x49, 0xb7, 0x3c, 0xa1, 0x7d, 0x31, 0x6e, 0x81, 0xba, + 0x4a, 0xc7, 0xbb, 0x97, 0x45, 0x41, 0x58, 0x7d, 0x5b, 0x32, 0x45, 0xbc, + 0x62, 0x69, 0x3e, 0x37, 0xc8, 0x43, 0x84, 0x87, 0x83, 0x2f, 0xc1, 0xb3, + 0x74, 0x3e, 0x6e, 0x80, 0x9f, 0x3e, 0xc1, 0xd4, 0xb6, 0x9b, 0x49, 0xaf, + 0x3e, 0x81, 0x65, 0x46, 0x61, 0x91, 0x5f, 0x86, 0x54, 0x9e, 0x63, 0x94, + 0xd5, 0x78, 0x8b, 0x7e, 0x42, 0x3f, 0x9f, 0xa0, 0xb1, 0x4f, 0x82, 0xb4, + 0x5d, 0x84, 0x26, 0x4a, 0x77, 0x64, 0x8a, 0x9f, 0xab, 0x9f, 0xaf, 0x5a, + 0x84, 0xb4, 0x6f, 0x6e, 0xbb, 0xc3, 0x97, 0xc2, 0x58, 0x3b, 0x81, 0x79, + 0xa6, 0xb3, 0x50, 0x81, 0xa7, 0x9f, 0x4a, 0x62, 0x3e, 0x3e, 0x9d, 0x5d, + 0xcb, 0x71, 0xcd, 0xd9, 0x7e, 0x7b, 0xad, 0xc8, 0x41, 0xca, 0xb6, 0x81, + 0x84, 0xb7, 0xb4, 0x8b, 0xac, 0xbd, 0xaf, 0x46, 0x98, 0xb5, 0x3f, 0x65, + 0x7d, 0x7f, 0x55, 0xd5, 0x3b, 0x94, 0x32, 0x96, 0x80, 0xba, 0xa5, 0x5f, + 0xcc, 0x2a, 0x6a, 0x9f, 0xcf, 0x70, 0xb0, 0x65, 0x75, 0xd1, 0xc5, 0xa0, + 0xa2, 0xb8, 0xa6, 0xd5, 0x6d, 0x5f, 0x76, 0xa1, 0x7c, 0x57, 0xa1, 0xc8, + 0x92, 0x51, 0xcc, 0x90, 0x98, 0x8b, 0xc4, 0xd7, 0xb4, 0xa2, 0x62, 0xd0, + 
0x5a, 0xcc, 0x27, 0x58, 0x72, 0x40, 0x8c, 0xc3, 0x5a, 0x9d, 0x75, 0xae, + 0xab, 0x41, 0x8f, 0x52, 0xb5, 0xcc, 0xa2, 0xa2, 0x87, 0x63, 0xa9, 0xc6, + 0xab, 0x7d, 0x3d, 0xa0, 0x34, 0x66, 0x3b, 0xad, 0x9a, 0x63, 0xc1, 0x77, + 0xb7, 0x6a, 0xc0, 0x49, 0x3e, 0x24, 0x54, 0xd9, 0x95, 0x9d, 0x6b, 0x4e, + 0x86, 0x63, 0xd1, 0x33, 0xbf, 0x8d, 0x58, 0xa9, 0x45, 0x45, 0x98, 0x6a, + 0xc2, 0x8b, 0xc3, 0x77, 0xbf, 0x60, 0xc6, 0x27, 0x9f, 0xbc, 0x93, 0xbd, + 0xd1, 0x9d, 0x49, 0x30, 0xae, 0x4b, 0xd5, 0x67, 0x91, 0x85, 0x59, 0x9a, + 0xb9, 0xcc, 0xe4, 0x58, 0x9f, 0xa8, 0x6e, 0x73, 0x4c, 0xb4, 0x7b, 0xd4, + 0x2a, 0x3f, 0xc6, 0x66, 0x4d, 0x76, 0xa6, 0xc0, 0x75, 0x95, 0x8f, 0x4e, + 0x97, 0xb6, 0xb1, 0x42, 0x3d, 0xc7, 0xb5, 0xb1, 0x6d, 0xaf, 0x65, 0x39, + 0xa5, 0x41, 0x5b, 0xa2, 0x82, 0xc0, 0x4b, 0x5f, 0xa7, 0x58, 0xc2, 0xb5, + 0xc9, 0x5d, 0x97, 0x9e, 0xa8, 0x3f, 0xb9, 0xbf, 0xae, 0x70, 0x9e, 0x40, + 0x38, 0x6e, 0x7f, 0x98, 0x2c, 0xac, 0xa2, 0xa5, 0xa6, 0x2f, 0xa3, 0xd0, + 0xa8, 0x99, 0x79, 0xab, 0xcf, 0xb6, 0xb4, 0xb9, 0x94, 0x59, 0x52, 0xca, + 0xa8, 0x99, 0xa2, 0x5f, 0x47, 0x6d, 0xe1, 0x6f, 0x8e, 0x50, 0xb3, 0x42, + 0xae, 0x56, 0x4e, 0xae, 0xc3, 0xb5, 0xda, 0x6a, 0x79, 0x3e, 0x5d, 0xbd, + 0x94, 0xb5, 0xcb, 0x84, 0xbd, 0x92, 0x6a, 0x67, 0x7e, 0x79, 0x95, 0x71, + 0xcc, 0x48, 0xaf, 0x9a, 0x63, 0xd3, 0xa6, 0x31, 0x7c, 0x8a, 0x91, 0x46, + 0xc9, 0xd8, 0xac, 0x5e, 0xc1, 0x96, 0x8b, 0x48, 0x3d, 0x6d, 0x69, 0x74, + 0x70, 0x84, 0xcf, 0x9d, 0x48, 0xa1, 0xb4, 0x62, 0xbc, 0x49, 0x68, 0x69, + 0x5a, 0x6b, 0xac, 0xd4, 0x61, 0x60, 0xcf, 0x4c, 0x3b, 0xb9, 0xa1, 0x8d, + 0xac, 0x9e, 0xac, 0xae, 0x2d, 0x5c, 0x32, 0xb7, 0x68, 0x7f, 0x6d, 0x83, + 0x63, 0x77, 0x8b, 0x8a, 0xce, 0x7b, 0xd5, 0x4c, 0xc9, 0x5e, 0x54, 0x8f, + 0x6b, 0xa1, 0x60, 0x90, 0x35, 0xbd, 0xbd, 0x6c, 0x5a, 0xa5, 0x49, 0xbc, + 0x56, 0xba, 0xac, 0x9b, 0xcf, 0xcc, 0xb5, 0x36, 0x3d, 0x3d, 0xa9, 0xb5, + 0x8b, 0xbe, 0xb5, 0xbb, 0x69, 0x64, 0x3f, 0x77, 0xba, 0x9c, 0x47, 0x7f, + 0x94, 0xbb, 0xca, 0x6d, 0xc5, 0xa1, 0xb1, 0xa7, 0x45, 0x5a, 0xcd, 0x85, + 0x37, 0x88, 0x82, 0xb8, 0x7d, 0x59, 0xc0, 0x8b, 0x3e, 0x41, 0xc4, 0x6b, + 0xb8, 0x5a, 0x81, 0x7f, 0xba, 0x8d, 0x94, 0x95, 0x3b, 0xd2, 0x7a, 0xbe, + 0x9c, 0x9a, 0x5f, 0xa5, 0x76, 0xd3, 0xb5, 0x9b, 0x47, 0x60, 0x79, 0x47, + 0xce, 0x95, 0x2d, 0x57, 0x77, 0xcd, 0xae, 0xb9, 0xc1, 0x74, 0x81, 0x49, + 0x74, 0xcb, 0x96, 0xc0, 0x5a, 0xc0, 0x6d, 0x85, 0xc1, 0x4b, 0xba, 0x85, + 0xc5, 0xbd, 0x90, 0x44, 0xba, 0x9c, 0x67, 0x3d, 0xd4, 0xc3, 0xca, 0x6f, + 0x4f, 0xb9, 0x7d, 0x6e, 0xc2, 0xc1, 0x81, 0x8e, 0x63, 0x4f, 0xcc, 0x8b, + 0xc2, 0x79, 0x8f, 0x64, 0xbc, 0x56, 0x55, 0x6b, 0x4d, 0x70, 0xc8, 0x66, + 0x9e, 0x67, 0x4c, 0xb2, 0x88, 0x7d, 0xb8, 0x58, 0xb6, 0x95, 0x99, 0x7c, + 0xb1, 0x38, 0xb8, 0xaa, 0x51, 0x78, 0x6f, 0x77, 0x8e, 0xab, 0x35, 0x5b, + 0x41, 0x45, 0xc7, 0x46, 0x98, 0x41, 0xc5, 0x62, 0xa3, 0xaf, 0x64, 0x78, + 0x3e, 0x3e, 0x7e, 0x7c, 0x8c, 0x68, 0xb6, 0x6b, 0x8e, 0x9c, 0x59, 0x9c, + 0x54, 0xd1, 0xb0, 0x89, 0x7a, 0xa8, 0xa0, 0x5b, 0x89, 0xc3, 0x46, 0x73, + 0x62, 0x79, 0x82, 0xaf, 0xab, 0xc8, 0x98, 0xae, 0x88, 0x38, 0x63, 0xb8, + 0x5c, 0x4e, 0x8e, 0x42, 0x89, 0x72, 0x4c, 0x3c, 0x42, 0xa0, 0x67, 0x33, + 0x40, 0x51, 0xa4, 0x1f, 0x9f, 0xa4, 0xa0, 0x55, 0x84, 0x6a, 0x9d, 0x70, + 0xa4, 0x91, 0x7d, 0x4f, 0x32, 0xcd, 0x30, 0xa8, 0x4c, 0x2d, 0xb0, 0x89, + 0xc7, 0x46, 0x76, 0xa6, 0x44, 0x74, 0x5c, 0x75, 0xcf, 0x9b, 0x4a, 0x73, + 0x94, 0xca, 0xb2, 0xa5, 0x76, 0xc6, 0x34, 0xc9, 0x4d, 0xa4, 0xba, 0x8d, + 0x54, 0x65, 0x35, 0x8a, 0xbd, 0xb8, 0xb9, 0xb9, 0x44, 0x44, 0x49, 0x49, + 0x95, 0xad, 0x5e, 0x4b, 0xa3, 0xb4, 0x45, 0x83, 0xb9, 0xa3, 0x8c, 0x3d, + 
0xc6, 0xcb, 0x72, 0x56, 0x39, 0x5c, 0xa3, 0xc9, 0xa2, 0x52, 0x84, 0x86, + 0xbe, 0x53, 0x97, 0x69, 0x83, 0x86, 0x78, 0x47, 0x42, 0x34, 0xce, 0xb7, + 0x93, 0x75, 0x47, 0x82, 0x8b, 0x37, 0x82, 0x34, 0xa3, 0xbe, 0x5b, 0x5c, + 0xd0, 0x6e, 0xaf, 0x4c, 0x56, 0xc3, 0x4d, 0x79, 0x84, 0x72, 0x51, 0x80, + 0x36, 0x50, 0x9a, 0x7b, 0x9d, 0xa6, 0x73, 0x42, 0x4a, 0x38, 0xc5, 0x52, + 0x94, 0x91, 0x3d, 0x47, 0xbf, 0xb3, 0x48, 0x99, 0x45, 0xb1, 0x68, 0x96, + 0xa1, 0x3c, 0xaa, 0xb7, 0xc9, 0xae, 0x4b, 0x95, 0x4e, 0xb5, 0x7b, 0x96, + 0xbe, 0xc0, 0xcb, 0x56, 0x96, 0x2f, 0x98, 0xad, 0xb0, 0x43, 0x53, 0x43, + 0x7b, 0x89, 0xd8, 0x4f, 0xb1, 0x94, 0x3b, 0x5e, 0x3b, 0xb1, 0xa9, 0xb6, + 0x78, 0x33, 0xba, 0x5a, 0x74, 0x97, 0x8a, 0xcb, 0x79, 0x83, 0x70, 0x61, + 0x9f, 0xc7, 0x38, 0x8b, 0xaf, 0x79, 0x58, 0x71, 0xc4, 0x50, 0x5f, 0x3d, + 0xcb, 0xb1, 0x66, 0x4e, 0x49, 0x8f, 0x54, 0x72, 0x75, 0x50, 0x3d, 0x9c, + 0x4e, 0x55, 0xbd, 0x6c, 0x45, 0x2f, 0xb6, 0x6c, 0x86, 0xcc, 0x35, 0x40, + 0x9e, 0x34, 0xc1, 0xc9, 0x6f, 0x71, 0x47, 0x70, 0x8d, 0x7b, 0x4b, 0x5f, + 0xa6, 0x75, 0x43, 0x5f, 0xb5, 0x5c, 0xc6, 0x7e, 0xb4, 0xcf, 0xca, 0xc2, + 0x79, 0x7d, 0xc6, 0x39, 0x6a, 0xb3, 0x63, 0x6d, 0xa5, 0x6b, 0x35, 0x5b, + 0x34, 0xb8, 0x83, 0x69, 0x64, 0x7d, 0x8f, 0xc4, 0x8c, 0x75, 0x37, 0x5c, + 0x34, 0x5e, 0x8f, 0x45, 0x3f, 0x50, 0x76, 0x47, 0x5f, 0x61, 0x4f, 0x93, + 0x8b, 0x90, 0xa6, 0x8e, 0x57, 0xb3, 0x96, 0x42, 0x84, 0xd1, 0x43, 0xa1, + 0x91, 0xba, 0x78, 0xbc, 0x7b, 0x78, 0xb5, 0x8f, 0x4b, 0x86, 0xbc, 0x6a, + 0xb5, 0x4d, 0x3b, 0x47, 0xd1, 0xc4, 0x77, 0x3a, 0xcb, 0x97, 0xcf, 0x49, + 0x4d, 0xab, 0xac, 0xa4, 0x33, 0x40, 0x9e, 0x67, 0xae, 0x3d, 0x38, 0x6d, + 0x93, 0x9a, 0xb5, 0x54, 0xae, 0x9b, 0x96, 0xd0, 0x57, 0x8c, 0x79, 0x99, + 0xa1, 0x4b, 0x90, 0xc6, 0x7f, 0xbf, 0x47, 0x5d, 0x55, 0xa3, 0xbc, 0x3f, + 0x69, 0x89, 0x3d, 0x37, 0xb1, 0x53, 0x7e, 0x6b, 0xc5, 0xcb, 0xa1, 0x36, + 0x3d, 0xd4, 0x3f, 0xd5, 0x6d, 0xcc, 0xaa, 0x60, 0x72, 0x95, 0x5b, 0xbe, + 0x36, 0x74, 0xb8, 0x4e, 0x8d, 0x88, 0x4c, 0x86, 0x40, 0xca, 0xb1, 0x3a, + 0x30, 0x76, 0x67, 0x34, 0xd4, 0xa1, 0xb2, 0x83, 0x9a, 0x9a, 0x54, 0xbf, + 0x81, 0x98, 0x6e, 0x6c, 0x98, 0xa4, 0x48, 0x41, 0xa5, 0xc5, 0xa6, 0x87, + 0x6d, 0xc1, 0x76, 0x38, 0x51, 0x9b, 0x6a, 0x99, 0x33, 0x48, 0x5c, 0x4c, + 0x46, 0x88, 0xc0, 0xb6, 0x5f, 0x96, 0x57, 0x66, 0xa5, 0xcd, 0x5c, 0x35, + 0x54, 0x3f, 0x6c, 0x6b, 0x36, 0x50, 0x4b, 0x52, 0xa8, 0x43, 0xc8, 0x5a, + 0xaa, 0x71, 0xb7, 0xba, 0xc7, 0x74, 0x74, 0x8a, 0xb6, 0x5a, 0xa0, 0xa2, + 0x6d, 0x9e, 0x9e, 0x97, 0x5e, 0xb6, 0xb8, 0xd3, 0xbc, 0x3d, 0x31, 0x49, + 0x97, 0xb5, 0x31, 0x9c, 0x31, 0x92, 0x58, 0xae, 0xad, 0xbf, 0x4e, 0x2e, + 0xb3, 0xc0, 0x6a, 0x9e, 0x7f, 0x68, 0x74, 0x58, 0x76, 0x48, 0x59, 0x9b, + 0xb8, 0x96, 0x6d, 0x7d, 0xc1, 0x6c, 0x54, 0xd3, 0x58, 0x5f, 0x52, 0xb6, + 0xac, 0xb9, 0x8c, 0xbc, 0x7d, 0x80, 0x81, 0xc2, 0xc1, 0x6c, 0xae, 0xa7, + 0x4e, 0x67, 0x8f, 0xb2, 0xa6, 0x47, 0x44, 0x33, 0x4b, 0x68, 0xc1, 0x9f, + 0x7e, 0x88, 0x71, 0x40, 0x71, 0x65, 0x52, 0x46, 0xcb, 0x9a, 0xa7, 0xce, + 0x74, 0x62, 0x8f, 0x99, 0x85, 0xb2, 0xc4, 0x9a, 0x63, 0x3c, 0x51, 0x79, + 0x92, 0x85, 0xbf, 0x96, 0x5e, 0x9c, 0xd3, 0x3e, 0xa1, 0x50, 0x4d, 0xbf, + 0x6f, 0xcd, 0x6f, 0xc0, 0xbc, 0x9f, 0xbf, 0x33, 0x65, 0x41, 0x77, 0x91, + 0x69, 0xb8, 0x76, 0x59, 0xa0, 0xb5, 0x4e, 0xb8, 0x31, 0x7f, 0x87, 0x38, + 0x72, 0x43, 0x95, 0xc0, 0x6a, 0xaa, 0xa1, 0x53, 0xd4, 0x69, 0xc3, 0x45, + 0xaf, 0x75, 0x90, 0x88, 0xc6, 0x44, 0x4a, 0x34, 0x8a, 0x8a, 0x58, 0x80, + 0x5b, 0x41, 0x9d, 0xb1, 0xb4, 0xab, 0xab, 0x70, 0xb7, 0xa7, 0x71, 0x4b, + 0x90, 0x6a, 0x40, 0xbc, 0xbe, 0x73, 0x4d, 0xd3, 0x64, 0xab, 0x4d, 0x7f, + 
0x7c, 0xd9, 0x50, 0xd6, 0xbe, 0x3e, 0xcb, 0x8b, 0xc6, 0xcb, 0xa2, 0x82, + 0x82, 0x9d, 0x5d, 0xc1, 0x35, 0x3d, 0x8a, 0x7f, 0xbf, 0x68, 0x80, 0xce, + 0x69, 0x27, 0x66, 0x92, 0x9f, 0xac, 0x70, 0x9a, 0x3f, 0x8d, 0x89, 0x65, + 0xa4, 0x9a, 0x6b, 0x8b, 0x96, 0x66, 0x83, 0x32, 0xa9, 0x6e, 0x7a, 0x57, + 0x49, 0xab, 0x9d, 0xa1, 0x85, 0x90, 0xbb, 0x5c, 0x80, 0xad, 0x56, 0x99, + 0x2e, 0x67, 0x71, 0x54, 0x66, 0x3d, 0xae, 0x31, 0x55, 0x8d, 0x69, 0x7c, + 0x2f, 0xa5, 0xc9, 0x9c, 0x75, 0x85, 0x36, 0x7d, 0x47, 0xb0, 0x92, 0x4e, + 0x51, 0xae, 0xbc, 0x31, 0x4f, 0xa4, 0x7e, 0xc9, 0xad, 0xaa, 0x38, 0x82, + 0x75, 0xa3, 0x8b, 0xcf, 0xa7, 0xbe, 0xae, 0x35, 0x7b, 0x7d, 0x94, 0x4f, + 0x85, 0x58, 0x89, 0xa6, 0x7a, 0x66, 0x52, 0xc7, 0x2f, 0xa3, 0x95, 0xb3, + 0x68, 0x4c, 0x71, 0x53, 0x50, 0x32, 0xb6, 0x67, 0xb3, 0x3f, 0x3a, 0xc0, + 0xbd, 0x47, 0x8d, 0x48, 0x8a, 0x65, 0x3b, 0x37, 0x8c, 0xa5, 0x43, 0x36, + 0x91, 0x53, 0xb4, 0x9b, 0x2e, 0xcb, 0x50, 0x6e, 0xa4, 0x6c, 0x4f, 0x55, + 0xbb, 0xa5, 0xbf, 0x62, 0xb5, 0x68, 0x70, 0x8e, 0x6b, 0x44, 0x4c, 0x8e, + 0x7f, 0xc5, 0x7d, 0x72, 0x34, 0xd2, 0xb4, 0x62, 0x82, 0x78, 0xa2, 0xd7, + 0x5c, 0xa0, 0xbf, 0x9d, 0xcf, 0x73, 0xc9, 0x6a, 0x9e, 0xac, 0x71, 0x2c, + 0x3d, 0x86, 0x8b, 0x72, 0x4f, 0xcd, 0x2c, 0xa4, 0xa3, 0x46, 0x74, 0xa3, + 0xbe, 0xcb, 0x3c, 0xa6, 0xc1, 0x58, 0x3d, 0x7d, 0x56, 0xba, 0xa3, 0x74, + 0xbf, 0x35, 0x93, 0x85, 0xbb, 0x37, 0x61, 0xcb, 0xba, 0x54, 0xc3, 0x40, + 0x73, 0xae, 0xc9, 0x46, 0xd8, 0xb0, 0x59, 0x85, 0xbc, 0x6a, 0xae, 0x7e, + 0x86, 0x3c, 0x8a, 0x88, 0x7b, 0xa1, 0xca, 0x6e, 0xa6, 0x7c, 0x42, 0x82, + 0x41, 0x8d, 0x6d, 0x84, 0x85, 0xb5, 0x37, 0x5e, 0x57, 0xce, 0xc6, 0x48, + 0x90, 0x8f, 0x9c, 0x78, 0x77, 0x3f, 0xbb, 0x7f, 0xc6, 0x6d, 0x8d, 0xb2, + 0x5b, 0x93, 0x7d, 0x43, 0xd2, 0xa0, 0xb9, 0x58, 0xad, 0x58, 0x2c, 0x43, + 0x3d, 0x5b, 0x66, 0xba, 0x91, 0xc5, 0x82, 0xa3, 0xb3, 0xc8, 0x72, 0x9b, + 0x83, 0x8b, 0x40, 0x7d, 0x67, 0x6e, 0x3e, 0x9e, 0x37, 0x36, 0x8e, 0x4a, + 0x83, 0x4e, 0x3d, 0x71, 0xb5, 0x86, 0x6c, 0xa8, 0x6a, 0xaf, 0x9d, 0x4d, + 0x5b, 0x9b, 0x6f, 0xa6, 0x32, 0xc3, 0x60, 0x65, 0x93, 0x9f, 0x36, 0x8c, + 0xb8, 0x66, 0x2b, 0x8a, 0x72, 0x58, 0x2f, 0x6f, 0x62, 0x5f, 0xc3, 0x73, + 0xb1, 0x4d, 0x71, 0x72, 0xcc, 0x43, 0xc5, 0x78, 0x5c, 0xb0, 0x84, 0x6b, + 0x88, 0xc9, 0x52, 0xa5, 0x93, 0x85, 0xd7, 0x71, 0x86, 0x88, 0x33, 0x96, + 0xa4, 0xa7, 0xc5, 0x89, 0x8e, 0x65, 0x7e, 0x60, 0x5a, 0xbb, 0xaa, 0xad, + 0x57, 0x87, 0x4a, 0x9e, 0xaa, 0x99, 0x87, 0x44, 0x8f, 0x7b, 0x8b, 0x5f, + 0x9c, 0x39, 0x39, 0xd1, 0x8e, 0xa4, 0x73, 0x77, 0x37, 0xc1, 0xbf, 0x83, + 0x7c, 0xa0, 0xaf, 0xbd, 0x50, 0x79, 0x9c, 0x70, 0x73, 0x43, 0xae, 0x9b, + 0xb6, 0x5b, 0x4a, 0x9a, 0xab, 0x60, 0x38, 0x73, 0xa0, 0x9b, 0xa0, 0x78, + 0x90, 0xad, 0x3d, 0x93, 0x58, 0x6f, 0x95, 0x89, 0x7c, 0x90, 0x5a, 0x3a, + 0x49, 0xc1, 0x73, 0x78, 0x4e, 0x4b, 0x51, 0x34, 0x53, 0xb9, 0x75, 0xc9, + 0x83, 0x47, 0x49, 0x6a, 0x67, 0xc5, 0x66, 0xa4, 0xb3, 0xc8, 0x3e, 0xa4, + 0x68, 0x57, 0xca, 0x49, 0x9d, 0x43, 0xc1, 0xc6, 0x5d, 0x55, 0x4e, 0x8a, + 0x8d, 0x44, 0xab, 0x9e, 0xb8, 0x66, 0x76, 0x70, 0x2f, 0x47, 0x91, 0x38, + 0x9e, 0x5f, 0x60, 0xc6, 0xc7, 0x51, 0x69, 0x72, 0x83, 0x2d, 0x46, 0x67, + 0xb1, 0x32, 0x53, 0x35, 0xb1, 0xaa, 0x9d, 0xb6, 0xa8, 0x4f, 0x91, 0x91, + 0x8c, 0x9b, 0x79, 0x5c, 0x36, 0x45, 0x6a, 0xa9, 0xa2, 0x3c, 0x8f, 0xc4, + 0x8a, 0x49, 0x76, 0x5b, 0x4c, 0x3f, 0x77, 0xc1, 0x99, 0x93, 0x86, 0x36, + 0x89, 0xb2, 0xc5, 0xb5, 0xbf, 0x58, 0x92, 0x8c, 0xb0, 0xc1, 0x80, 0x39, + 0x70, 0xaf, 0x53, 0x54, 0x73, 0x4d, 0xbf, 0xba, 0xa0, 0x3e, 0x62, 0xb9, + 0xc7, 0x91, 0x9d, 0x55, 0x7e, 0x63, 0x5f, 0x5b, 0x9e, 0x40, 0xd3, 0xc8, + 
0x35, 0xbc, 0x8d, 0xa6, 0x47, 0x3e, 0x9f, 0x77, 0x46, 0x89, 0xa0, 0xa2, + 0xcc, 0x9d, 0xa3, 0x52, 0xbe, 0x49, 0xa3, 0x4a, 0x47, 0x44, 0xbf, 0xbf, + 0x54, 0x51, 0x6e, 0x36, 0x57, 0x5f, 0xce, 0x37, 0x7b, 0x57, 0x9a, 0x64, + 0x35, 0x4f, 0x81, 0x6a, 0x79, 0x4e, 0x52, 0x65, 0x89, 0x7b, 0x75, 0x93, + 0x53, 0x91, 0x5a, 0xc9, 0x78, 0x44, 0xc0, 0xbe, 0x48, 0x77, 0x64, 0x60, + 0x77, 0x9c, 0xbd, 0x4c, 0x66, 0x84, 0xc2, 0x3c, 0x4a, 0x77, 0x99, 0x73, + 0x48, 0xce, 0x70, 0x58, 0x56, 0x66, 0x57, 0x85, 0xc1, 0x81, 0x6d, 0x8a, + 0x37, 0x53, 0xac, 0x60, 0x57, 0x5e, 0x8f, 0xc7, 0x4d, 0xce, 0x42, 0x4d, + 0x74, 0xc7, 0xd4, 0x5f, 0x58, 0x9a, 0x31, 0x8a, 0x38, 0xa3, 0x9a, 0x91, + 0x77, 0xd2, 0x3e, 0xb5, 0xc3, 0x90, 0x54, 0xd0, 0x8b, 0x80, 0x42, 0x4b, + 0x98, 0xbd, 0x8b, 0x61, 0x7a, 0x71, 0x9c, 0x39, 0xa1, 0xae, 0xc8, 0x3d, + 0xbf, 0x8e, 0xdc, 0x92, 0x88, 0x5d, 0x7b, 0x8e, 0x47, 0x61, 0x7b, 0x56, + 0x38, 0x5b, 0xa3, 0xbf, 0x35, 0x47, 0x39, 0xc5, 0x4d, 0xb0, 0x5a, 0x6e, + 0x39, 0xa6, 0xc2, 0xcf, 0x60, 0xb2, 0x66, 0xc4, 0x9b, 0x30, 0x37, 0xb9, + 0xb6, 0xa3, 0xc5, 0x6d, 0x7d, 0x43, 0x97, 0x82, 0x99, 0xc8, 0xca, 0xd9, + 0x37, 0x70, 0x7d, 0x40, 0x5d, 0xbe, 0x91, 0xae, 0x65, 0x3e, 0xa0, 0x9b, + 0xc2, 0xac, 0x3e, 0x79, 0x86, 0x96, 0x90, 0x3b, 0x71, 0xc9, 0x6d, 0xa9, + 0x36, 0x8d, 0xd6, 0xaa, 0x60, 0x79, 0x8c, 0xa1, 0x66, 0xa3, 0x6e, 0x7a, + 0x90, 0x34, 0x43, 0xce, 0x52, 0xc4, 0x92, 0x51, 0x92, 0xb7, 0xda, 0x8a, + 0x8a, 0xb0, 0x72, 0xc9, 0x42, 0x81, 0xb1, 0x89, 0x9f, 0x96, 0x33, 0x63, + 0x38, 0x60, 0x93, 0x9f, 0x53, 0x76, 0x5f, 0x9c, 0x4c, 0x46, 0x60, 0x5a, + 0x3d, 0x33, 0x70, 0x56, 0x9e, 0x5e, 0xa0, 0x53, 0x8d, 0x81, 0xbd, 0xca, + 0x33, 0x6d, 0x86, 0xac, 0xac, 0xa6, 0x56, 0x8e, 0x60, 0x34, 0x9e, 0xc3, + 0x68, 0xc9, 0xa3, 0x2e, 0x7a, 0x8e, 0x4e, 0x46, 0x8d, 0xb5, 0x58, 0xcd, + 0xa4, 0x8f, 0x87, 0x3f, 0xd2, 0xbf, 0x86, 0x6e, 0x9b, 0x4c, 0x6d, 0x65, + 0x70, 0x66, 0xaa, 0x88, 0xb1, 0xb1, 0x4d, 0x6d, 0x4a, 0xaa, 0x4b, 0x88, + 0x54, 0x40, 0x53, 0x3e, 0x90, 0xb0, 0x6e, 0x47, 0x58, 0xbf, 0x95, 0xd7, + 0x8e, 0x9e, 0x94, 0x4c, 0x55, 0x3e, 0x5d, 0xa7, 0x7f, 0xc3, 0xa7, 0x7b, + 0x98, 0x63, 0xc1, 0x86, 0x3a, 0x4e, 0x28, 0xd0, 0x3b, 0x68, 0x50, 0xb3, + 0xc6, 0xc8, 0x7f, 0xa5, 0x68, 0xc7, 0x9f, 0x86, 0x31, 0x4f, 0x89, 0xb2, + 0xc8, 0xc4, 0xc0, 0xb5, 0x86, 0x2d, 0x39, 0x5d, 0xb3, 0x58, 0x34, 0x6b, + 0xb0, 0xc8, 0x4d, 0xb7, 0x4e, 0x2c, 0x59, 0xcf, 0xb4, 0x89, 0x66, 0x51, + 0x97, 0xa9, 0xb7, 0x52, 0x47, 0x48, 0x72, 0xb7, 0x7f, 0x75, 0xb8, 0x9b, + 0x6f, 0xc8, 0xcd, 0x44, 0x8a, 0x43, 0x6e, 0x70, 0x8e, 0x4f, 0x69, 0x59, + 0xd2, 0xca, 0x5c, 0x78, 0x8b, 0xce, 0x7e, 0x76, 0xab, 0xcd, 0x81, 0x3b, + 0x65, 0x30, 0x3d, 0x51, 0x6d, 0x9e, 0x91, 0x7c, 0xb9, 0xbf, 0x35, 0x81, + 0x8c, 0xb7, 0x44, 0x4d, 0x34, 0x5d, 0x46, 0x5d, 0xd0, 0xcf, 0xc2, 0x5e, + 0x9b, 0x3a, 0x81, 0x31, 0x8e, 0x51, 0x6e, 0x4e, 0x8c, 0xbc, 0x82, 0x60, + 0x8a, 0x3a, 0x58, 0x75, 0x4f, 0x87, 0x6b, 0xaa, 0xb7, 0xb0, 0xcd, 0xb2, + 0x42, 0x5d, 0xa8, 0xca, 0x51, 0x4a, 0xaa, 0x64, 0xcb, 0x64, 0x69, 0x79, + 0xa0, 0x5f, 0xab, 0xaa, 0x37, 0x3b, 0x57, 0x50, 0xa4, 0x38, 0xa2, 0x5e, + 0x54, 0xb3, 0x73, 0x38, 0x4e, 0x48, 0xdc, 0x91, 0x99, 0xcd, 0xc6, 0xa2, + 0xbd, 0x94, 0x78, 0x45, 0x89, 0x68, 0x51, 0x29, 0x94, 0xa8, 0x50, 0xa1, + 0x48, 0x5d, 0x61, 0x9b, 0x74, 0x42, 0x35, 0x77, 0xa9, 0x90, 0xaf, 0x85, + 0xc1, 0x77, 0xb4, 0xb6, 0x8b, 0x40, 0x71, 0x73, 0xa2, 0x5e, 0x84, 0x88, + 0x54, 0x94, 0x5f, 0x72, 0x33, 0x42, 0x94, 0x37, 0xb1, 0xab, 0x8c, 0xc0, + 0x35, 0x88, 0x97, 0x2c, 0xd1, 0x58, 0x84, 0x3a, 0x58, 0xa2, 0xb8, 0xa1, + 0x61, 0xd6, 0xcf, 0x64, 0xb4, 0xc3, 0xa6, 0x2a, 0x4a, 0xd1, 0x41, 0x37, + 
0x8d, 0x47, 0xcb, 0x37, 0x5c, 0x98, 0xbd, 0x81, 0x45, 0xa2, 0x95, 0x8b, + 0xb4, 0x87, 0x43, 0x36, 0x4d, 0xce, 0xa0, 0x5a, 0x50, 0x80, 0x43, 0x3a, + 0xbe, 0x5a, 0x77, 0x64, 0x7a, 0x8b, 0x49, 0x85, 0x95, 0x96, 0xc4, 0x2e, + 0x6c, 0x79, 0x60, 0xd8, 0x41, 0xb7, 0xb5, 0x6d, 0x51, 0x71, 0xc4, 0x72, + 0x75, 0x39, 0x95, 0x63, 0x8a, 0xc8, 0x3e, 0x8c, 0x9c, 0xae, 0xaa, 0x39, + 0x7f, 0x8d, 0x9c, 0x91, 0xa1, 0x78, 0x6d, 0xcc, 0x57, 0x65, 0x35, 0x86, + 0xa3, 0x50, 0xc4, 0x9b, 0x86, 0x61, 0xba, 0x44, 0x96, 0x67, 0xca, 0x51, + 0x80, 0x7f, 0xa6, 0xb9, 0xd8, 0x85, 0xd6, 0x50, 0xb0, 0x53, 0x5d, 0x40, + 0xce, 0x6a, 0x5f, 0x4d, 0xd1, 0xc2, 0x39, 0xb7, 0x48, 0x7f, 0xcc, 0x7b, + 0x5f, 0x7d, 0xba, 0xca, 0x5e, 0xae, 0x6c, 0x80, 0xa6, 0x5a, 0x88, 0x40, + 0xc9, 0x66, 0x5e, 0x6a, 0x70, 0xc2, 0xcf, 0x69, 0xcd, 0xa6, 0x7e, 0x8f, + 0x55, 0xc1, 0xc0, 0x9f, 0x7b, 0x51, 0x60, 0x59, 0x6a, 0x8f, 0xb6, 0x72, + 0x6f, 0x6f, 0x80, 0xac, 0xb3, 0x9d, 0x4d, 0x87, 0x80, 0xcf, 0x5f, 0x51, + 0x9b, 0x6e, 0x84, 0x47, 0x49, 0xaa, 0xa6, 0x95, 0x9d, 0x76, 0xb8, 0xce, + 0x33, 0x31, 0x84, 0x66, 0x82, 0xce, 0x8e, 0x43, 0x71, 0xb5, 0x64, 0xdf, + 0xb4, 0xbe, 0x33, 0x9f, 0x7a, 0xcd, 0x53, 0xb2, 0x3d, 0xcb, 0x59, 0x93, + 0xa7, 0x9f, 0x49, 0xc8, 0x61, 0xcd, 0x6c, 0xdc, 0xb7, 0x36, 0x8b, 0xa8, + 0x39, 0x60, 0xa9, 0x7e, 0xad, 0x66, 0xa4, 0x86, 0x92, 0xa4, 0x80, 0xb7, + 0x67, 0x5d, 0x8c, 0x5f, 0x95, 0x71, 0x50, 0x6c, 0x97, 0xab, 0x7d, 0x5b, + 0x76, 0x5b, 0xcb, 0xb6, 0x75, 0x5b, 0xae, 0x76, 0x31, 0x8e, 0xb8, 0x81, + 0x39, 0x8b, 0xa9, 0x51, 0x4c, 0xb6, 0x9d, 0x39, 0x69, 0xc1, 0x65, 0x6e, + 0x8f, 0x97, 0x36, 0x83, 0xa5, 0x3b, 0x4b, 0x4b, 0xa9, 0x61, 0xc4, 0x99, + 0x36, 0xb7, 0x42, 0x78, 0x36, 0x86, 0x59, 0x65, 0x31, 0xc9, 0x93, 0x60, + 0x65, 0x47, 0x3c, 0xaf, 0x61, 0x8b, 0x6e, 0x95, 0x42, 0x50, 0xa4, 0x8a, + 0xca, 0x34, 0x91, 0xa9, 0x6c, 0x32, 0x58, 0x33, 0x9e, 0x34, 0x5b, 0xad, + 0x9c, 0x3e, 0xbd, 0x3f, 0xc3, 0x7d, 0x6f, 0xb0, 0xb4, 0xa4, 0x69, 0x44, + 0x96, 0xa3, 0x89, 0x49, 0xb3, 0x6b, 0x98, 0x33, 0x46, 0x9e, 0x46, 0xc9, + 0xae, 0x3e, 0xaa, 0x82, 0xab, 0xa5, 0x3c, 0xad, 0x95, 0xba, 0x33, 0x4c, + 0x86, 0x40, 0xaf, 0x3f, 0x4d, 0x5c, 0xc5, 0xda, 0xaa, 0x35, 0xa8, 0xbf, + 0x84, 0xac, 0x76, 0x5b, 0x42, 0x80, 0x96, 0x41, 0xd0, 0x78, 0x97, 0x6a, + 0xbb, 0x4a, 0x88, 0x4c, 0xcb, 0x37, 0xc4, 0x7c, 0x6c, 0x46, 0x8e, 0x55, + 0xbd, 0x57, 0xa6, 0x37, 0x6b, 0x3e, 0x6d, 0x32, 0x6f, 0x42, 0x59, 0x6e, + 0xc9, 0x8a, 0xaf, 0xb6, 0x8d, 0xa6, 0x84, 0x92, 0x44, 0x44, 0x83, 0x78, + 0xb0, 0x66, 0xc4, 0x65, 0x9f, 0xb1, 0x3e, 0xdd, 0x64, 0x52, 0x56, 0x8d, + 0x57, 0x5e, 0x62, 0x7f, 0x72, 0x6e, 0x7d, 0xcf, 0x5b, 0x87, 0x7c, 0x64, + 0x68, 0x94, 0xc0, 0x66, 0x7f, 0x67, 0xd3, 0x7d, 0xaf, 0x4d, 0x58, 0x52, + 0x36, 0x75, 0x5d, 0xa1, 0x76, 0x93, 0xc1, 0xc4, 0x80, 0x67, 0xc2, 0x61, + 0xbd, 0xc3, 0xb2, 0x7d, 0x4a, 0x81, 0xa7, 0xb8, 0xcd, 0x90, 0xc5, 0x7a, + 0xc3, 0xcd, 0x79, 0xc5, 0x4f, 0x85, 0xa3, 0x91, 0xb3, 0x4f, 0xca, 0x2f, + 0x9f, 0xaa, 0xa0, 0xca, 0x43, 0x36, 0x6e, 0xcb, 0x43, 0x6a, 0x85, 0xce, + 0x7e, 0x87, 0x7e, 0x7a, 0x34, 0xa7, 0x72, 0x7d, 0x45, 0x4f, 0x7a, 0x96, + 0x95, 0x3e, 0xba, 0xc5, 0x63, 0x4f, 0x49, 0xbb, 0x3c, 0xd3, 0x44, 0xd4, + 0xcf, 0xc7, 0x69, 0xaa, 0x64, 0xb2, 0x61, 0x82, 0x3b, 0xc4, 0x8f, 0xa9, + 0x86, 0x94, 0xd3, 0x89, 0x9e, 0x5f, 0xd2, 0x8b, 0x82, 0x6d, 0x3e, 0xa6, + 0x4b, 0xbd, 0x42, 0x48, 0x40, 0x68, 0x47, 0x97, 0x41, 0x9e, 0x59, 0x79, + 0x52, 0x86, 0x86, 0xaa, 0x4a, 0x94, 0xb9, 0xaf, 0x7a, 0x3e, 0x95, 0x87, + 0xa7, 0x53, 0x6e, 0xab, 0x86, 0x55, 0xae, 0x84, 0x62, 0x72, 0xc4, 0x32, + 0x42, 0x3e, 0xad, 0x68, 0x84, 0x49, 0x91, 0xd3, 0x72, 0x8e, 0x46, 0xaf, + 
0x7e, 0xa3, 0x91, 0x9b, 0xac, 0x8b, 0x79, 0xac, 0x6d, 0x63, 0x3e, 0xb5, + 0x79, 0xbf, 0x6a, 0x54, 0x3b, 0xb4, 0xd2, 0x9f, 0x57, 0x7a, 0x9b, 0x9b, + 0x83, 0x54, 0x8b, 0xd6, 0xad, 0x77, 0x7a, 0xe4, 0x29, 0x51, 0x99, 0x82, + 0x88, 0x62, 0x5a, 0x9b, 0x56, 0x86, 0x73, 0xa3, 0x77, 0x5a, 0x73, 0x76, + 0x42, 0x4b, 0xb6, 0xc1, 0xb9, 0x76, 0x38, 0xc7, 0x45, 0xc7, 0x87, 0xaf, + 0x68, 0x36, 0x92, 0x84, 0xb5, 0x87, 0xd7, 0x75, 0x56, 0xbc, 0x66, 0x8c, + 0x9a, 0x5c, 0xab, 0x3d, 0x85, 0xb0, 0xc6, 0x7a, 0xe0, 0x8d, 0x98, 0xc8, + 0x40, 0xb0, 0x39, 0x68, 0xc7, 0x3e, 0x69, 0x53, 0x4d, 0x46, 0x66, 0x3d, + 0x62, 0x6e, 0xd0, 0x56, 0xc5, 0xa3, 0xd4, 0xd4, 0x92, 0xd0, 0x4a, 0x90, + 0xc2, 0x5c, 0x79, 0x5e, 0xad, 0x83, 0x53, 0x8f, 0xd8, 0x5e, 0x7a, 0x4c, + 0xa2, 0x41, 0x7f, 0x34, 0xbe, 0x81, 0x5c, 0x95, 0xa0, 0x71, 0x3d, 0x89, + 0x60, 0xc7, 0x5f, 0xa8, 0x87, 0x87, 0x8f, 0x53, 0xc1, 0x8f, 0x81, 0x5d, + 0x94, 0xd5, 0x42, 0x8c, 0xcd, 0x65, 0xc5, 0xab, 0xbf, 0x9f, 0xa0, 0x7c, + 0x45, 0xd1, 0x6b, 0x4f, 0xc4, 0xc3, 0x62, 0x65, 0xb8, 0xe8, 0xd1, 0x41, + 0xaa, 0x66, 0x28, 0xb8, 0x8a, 0x9f, 0x48, 0xad, 0x55, 0x4e, 0x68, 0x87, + 0x52, 0x6b, 0xa1, 0x65, 0x50, 0x60, 0x80, 0xb5, 0xc1, 0x4b, 0x4a, 0x68, + 0x87, 0x2f, 0xb5, 0x84, 0x31, 0x7a, 0x48, 0xa2, 0xd5, 0x84, 0x8f, 0xc2, + 0x8b, 0x88, 0xae, 0xae, 0xca, 0xa9, 0x43, 0x4c, 0x8c, 0xa6, 0xa9, 0x36, + 0xae, 0xba, 0xc2, 0xb8, 0x34, 0x33, 0x47, 0xaf, 0xa1, 0x3c, 0xdb, 0x3c, + 0xbb, 0x9d, 0x7e, 0xd7, 0xce, 0xab, 0x6e, 0xb0, 0x43, 0xc2, 0x85, 0xae, + 0x71, 0x9a, 0x97, 0xd1, 0xb0, 0x96, 0x45, 0xbb, 0x85, 0x9d, 0x4c, 0x36, + 0x53, 0x82, 0xcd, 0x92, 0x40, 0x86, 0xbe, 0xb0, 0x4f, 0x5b, 0x8b, 0xae, + 0x5b, 0x6b, 0xde, 0x73, 0x47, 0x5f, 0x85, 0x82, 0x40, 0xa9, 0x3d, 0x8a, + 0xd0, 0xb1, 0x7e, 0x97, 0x5f, 0xd0, 0xc4, 0x64, 0x92, 0x76, 0xa5, 0x6c, + 0x8d, 0x70, 0x44, 0xc1, 0x89, 0x3e, 0x90, 0x7b, 0x42, 0x4d, 0x7d, 0x76, + 0xd3, 0x45, 0x9e, 0xca, 0x46, 0x7c, 0xdc, 0x7c, 0x87, 0x52, 0x87, 0x34, + 0x39, 0x78, 0x9f, 0xd8, 0x7d, 0x44, 0xa1, 0xcc, 0x3f, 0x4c, 0x5b, 0x51, + 0xa9, 0xab, 0x54, 0x70, 0xb3, 0xad, 0x6b, 0x69, 0x37, 0x57, 0x45, 0xce, + 0xb5, 0x3c, 0x3f, 0x66, 0x83, 0x89, 0xca, 0xc0, 0x5a, 0x7a, 0xd4, 0xb3, + 0x6a, 0x41, 0xc4, 0x82, 0x79, 0x8f, 0x5c, 0xc5, 0x5f, 0x4f, 0x75, 0x36, + 0xce, 0x87, 0x80, 0x4e, 0xc1, 0x6a, 0x5d, 0x45, 0x66, 0x9e, 0xc7, 0x67, + 0x52, 0xe0, 0x95, 0xac, 0x46, 0x74, 0x8f, 0xa1, 0x65, 0xa9, 0x6d, 0x7b, + 0x60, 0x31, 0xcb, 0x4f, 0x5e, 0x3e, 0x9c, 0x4f, 0x80, 0x29, 0x97, 0xac, + 0x5f, 0xa5, 0x32, 0x7b, 0xbd, 0x42, 0x6f, 0x68, 0x51, 0x63, 0xa3, 0x75, + 0x55, 0xae, 0xe0, 0x7c, 0xb2, 0x6f, 0x4d, 0x3a, 0x66, 0xd3, 0x72, 0xba, + 0xd2, 0x75, 0x48, 0x4a, 0x52, 0xbc, 0x38, 0x69, 0x99, 0x89, 0x75, 0x53, + 0x97, 0xca, 0xbe, 0xc4, 0xb7, 0x6f, 0x9a, 0xb8, 0x40, 0x71, 0x5d, 0x98, + 0x49, 0xa9, 0x53, 0xc1, 0x58, 0xa2, 0x86, 0x9b, 0x9c, 0x85, 0x68, 0x66, + 0x6f, 0xaf, 0x4d, 0x65, 0xc5, 0x95, 0x37, 0x8e, 0xce, 0x72, 0x6e, 0xbc, + 0x9d, 0xae, 0x4d, 0xba, 0xcb, 0x9b, 0x6e, 0x83, 0x96, 0xac, 0xc9, 0xc2, + 0x2f, 0xa5, 0x37, 0x57, 0x3e, 0xc3, 0x94, 0x42, 0x73, 0x48, 0xa4, 0x9e, + 0x39, 0x65, 0xa9, 0x53, 0x55, 0x4c, 0xc9, 0x8d, 0xba, 0x9f, 0xc2, 0x62, + 0x66, 0xc7, 0x8f, 0xc4, 0x81, 0xc9, 0x61, 0xbe, 0x7e, 0x3b, 0xa9, 0x52, + 0x6b, 0x7f, 0xc9, 0xce, 0xbf, 0xc8, 0x5f, 0xcb, 0x5d, 0x6b, 0xae, 0xa1, + 0x69, 0xd3, 0x34, 0x86, 0x45, 0xac, 0x96, 0x4e, 0xb9, 0x59, 0xdf, 0xaf, + 0x60, 0x43, 0x81, 0x6e, 0x46, 0xaf, 0x57, 0x3d, 0x94, 0x60, 0xbd, 0xc0, + 0x63, 0xb7, 0xa2, 0x64, 0xdb, 0x64, 0xd9, 0x7c, 0x67, 0x3c, 0x87, 0xc2, + 0x8d, 0xa8, 0x64, 0xd0, 0xd8, 0x45, 0x97, 0xad, 0x65, 0x63, 0xca, 0x7a, + 
0xd5, 0xbd, 0x38, 0xa7, 0xae, 0xa0, 0x52, 0x74, 0x92, 0x9c, 0x9d, 0x7b, + 0x5e, 0x5d, 0x9e, 0x88, 0x46, 0xd0, 0xc7, 0xbc, 0x42, 0x74, 0xa7, 0x8f, + 0xa2, 0x64, 0x3a, 0xcc, 0x96, 0xd6, 0xc8, 0x77, 0xd1, 0xbe, 0x36, 0x39, + 0x7c, 0xd6, 0xd6, 0xbf, 0xa2, 0xc1, 0x70, 0x74, 0x49, 0xc9, 0x3e, 0x6c, + 0x97, 0x48, 0x70, 0x54, 0x40, 0x67, 0x9d, 0x69, 0x73, 0x87, 0xa3, 0x65, + 0x3c, 0x4f, 0xa9, 0x55, 0x9f, 0xaa, 0x80, 0x6f, 0xaa, 0x82, 0xae, 0xcd, + 0x38, 0x7d, 0x6f, 0x46, 0xce, 0x32, 0x8b, 0x9b, 0x45, 0x62, 0xa7, 0x5e, + 0x85, 0x50, 0x6c, 0x52, 0x5c, 0x7b, 0x49, 0xb7, 0xa0, 0xc4, 0x3b, 0x33, + 0x60, 0xa3, 0x50, 0x43, 0x77, 0x97, 0x66, 0x70, 0x6f, 0xad, 0xad, 0xbc, + 0xc3, 0x61, 0xcf, 0x9f, 0x4c, 0x49, 0xbe, 0xb0, 0x6c, 0xbe, 0x79, 0xda, + 0x5e, 0x67, 0x50, 0x97, 0x47, 0xd0, 0x4c, 0xb3, 0xc1, 0x6f, 0x5d, 0x52, + 0x3f, 0x72, 0xc3, 0xa0, 0xa9, 0x3e, 0x77, 0x65, 0xd8, 0xbc, 0x71, 0x72, + 0x6e, 0x7b, 0x3c, 0x97, 0xb2, 0x3f, 0xdf, 0x8d, 0xcf, 0xbb, 0xd9, 0x35, + 0x8b, 0x88, 0x37, 0x79, 0x74, 0x39, 0x9c, 0x65, 0x80, 0x40, 0x53, 0xc3, + 0x42, 0x55, 0x7b, 0xc9, 0x99, 0xaf, 0xb1, 0x57, 0x75, 0x46, 0x79, 0x87, + 0xa6, 0x6e, 0xa7, 0x45, 0x3a, 0x40, 0xc5, 0xb9, 0xa5, 0x3e, 0x96, 0xb1, + 0x51, 0x6a, 0x5b, 0x2f, 0x4c, 0x56, 0x52, 0x4a, 0x3d, 0x47, 0x78, 0xa3, + 0x48, 0xce, 0x7d, 0xda, 0x4b, 0xa8, 0xc6, 0xa0, 0x48, 0x9a, 0x6e, 0xbd, + 0x96, 0x9d, 0x54, 0x6f, 0x45, 0x75, 0x4d, 0x71, 0x6f, 0xa1, 0x44, 0x67, + 0x80, 0x82, 0x58, 0xa1, 0x35, 0x44, 0x81, 0x8c, 0x49, 0x3c, 0xc1, 0x9f, + 0x6b, 0x99, 0xc5, 0x6e, 0x8d, 0xb2, 0x90, 0x40, 0x4f, 0x85, 0x63, 0x42, + 0x70, 0x16, 0x80, 0x55, 0xa6, 0x63, 0xd3, 0x75, 0x8a, 0x79, 0x99, 0x8a, + 0x60, 0x3a, 0x32, 0x45, 0xa5, 0x6d, 0x3e, 0x2a, 0x9a, 0x26, 0x38, 0xb9, + 0x52, 0x40, 0xbc, 0xb3, 0x68, 0xba, 0xcc, 0x9f, 0x33, 0xa4, 0x70, 0x58, + 0x64, 0x53, 0x7d, 0xbd, 0x46, 0x4b, 0x4d, 0x7f, 0xa0, 0xae, 0x71, 0x5b, + 0xcc, 0x38, 0x6d, 0x46, 0x80, 0x36, 0x9e, 0x45, 0x42, 0x65, 0xaa, 0x66, + 0x5f, 0xa2, 0xb1, 0x57, 0xb3, 0x33, 0x8e, 0xca, 0x3c, 0xa2, 0xc0, 0xbc, + 0x59, 0x34, 0x51, 0xc6, 0x75, 0x69, 0x56, 0x5b, 0xca, 0xc9, 0x89, 0x7d, + 0xd6, 0xb2, 0x37, 0x9f, 0xb3, 0x59, 0x5d, 0x6e, 0x57, 0x44, 0x60, 0x8c, + 0x91, 0x87, 0x53, 0x7a, 0xbb, 0x9f, 0xaf, 0xa8, 0xb4, 0xb0, 0x40, 0x31, + 0x43, 0xc7, 0xcc, 0x94, 0xa4, 0x8b, 0xc0, 0x3e, 0x59, 0xc9, 0xa6, 0x4f, + 0xd0, 0x57, 0xb8, 0x5b, 0xb5, 0xb7, 0x44, 0x38, 0x8b, 0xcc, 0x5c, 0x65, + 0xda, 0xd6, 0x49, 0x90, 0xb1, 0x98, 0x50, 0x3a, 0x35, 0xb2, 0x8e, 0x45, + 0x71, 0x85, 0x48, 0xa1, 0xaa, 0xa0, 0x82, 0x3b, 0x89, 0x82, 0x85, 0xc5, + 0x7b, 0x99, 0x8b, 0x34, 0xc5, 0xc2, 0x30, 0x78, 0x85, 0xc3, 0x50, 0x97, + 0x62, 0x65, 0xa1, 0x2a, 0x38, 0x76, 0x50, 0x73, 0x99, 0xda, 0xab, 0xac, + 0x82, 0x66, 0x83, 0x53, 0x97, 0x81, 0xb4, 0x58, 0xa1, 0x5d, 0x8e, 0x6e, + 0x85, 0x38, 0x3f, 0x84, 0x94, 0x61, 0x71, 0xb2, 0x7b, 0x38, 0x94, 0x39, + 0x88, 0xd5, 0xa3, 0xc9, 0x3f, 0x4c, 0x52, 0xa3, 0xaa, 0x44, 0xcd, 0x6d, + 0x88, 0x5a, 0x7a, 0x75, 0x9c, 0x66, 0x9a, 0x87, 0x9b, 0x72, 0xae, 0xcb, + 0x97, 0x98, 0xc7, 0x93, 0x90, 0xcb, 0x65, 0x58, 0xbd, 0x5c, 0x81, 0x6d, + 0xcb, 0x30, 0xb8, 0x49, 0xbb, 0xc0, 0xc7, 0x6f, 0x3b, 0x4e, 0x77, 0x5b, + 0x7b, 0xa3, 0x92, 0x39, 0xbf, 0xcf, 0x35, 0x60, 0x78, 0x78, 0x87, 0x95, + 0x6d, 0x8a, 0x3d, 0x4c, 0x6e, 0x49, 0x3f, 0x64, 0x53, 0x3a, 0x5a, 0x76, + 0x4b, 0x88, 0x5f, 0xcb, 0x8f, 0x40, 0x53, 0x33, 0xd4, 0x79, 0xac, 0xc1, + 0x53, 0xde, 0x44, 0xba, 0x48, 0x39, 0x77, 0xb8, 0x6c, 0x33, 0x68, 0xc0, + 0xa2, 0x5c, 0xc1, 0xc6, 0x3f, 0x6f, 0x7e, 0x9a, 0xbb, 0xba, 0x8a, 0x9f, + 0x3e, 0xae, 0xc3, 0xcc, 0x55, 0x84, 0x6a, 0x6b, 0x77, 0x78, 0xa7, 0xdc, + 
0x39, 0xbd, 0x5a, 0x61, 0x37, 0xbb, 0x72, 0xb0, 0x9f, 0x56, 0x75, 0x63, + 0x54, 0x68, 0xbc, 0xc8, 0x35, 0x94, 0x8a, 0x5f, 0x98, 0x55, 0xcb, 0xcc, + 0x3e, 0xa2, 0xc3, 0x4b, 0x4b, 0x8d, 0x91, 0x32, 0xad, 0x9f, 0x8a, 0xd4, + 0x73, 0xbb, 0x8f, 0x64, 0xbd, 0x66, 0x3b, 0xae, 0x31, 0x8d, 0xa9, 0xc5, + 0x6b, 0xc4, 0xc0, 0x4b, 0x9b, 0x9d, 0x2b, 0xa4, 0xc5, 0xb8, 0xb5, 0xa9, + 0x87, 0xa7, 0xcd, 0x93, 0x3a, 0xbf, 0xd3, 0xb0, 0xb3, 0x56, 0x39, 0x96, + 0xc7, 0x66, 0x2d, 0xad, 0x5e, 0xa0, 0x99, 0xbe, 0x8b, 0x81, 0x90, 0x3e, + 0x71, 0x89, 0x59, 0x63, 0x91, 0xbb, 0x4b, 0xd6, 0x61, 0xb3, 0xa1, 0x63, + 0x6c, 0xc5, 0xc9, 0x77, 0x7e, 0xa8, 0x62, 0x6b, 0x8a, 0x8b, 0xcd, 0x59, + 0x8f, 0xae, 0xaa, 0x8a, 0xa2, 0x81, 0x42, 0xc0, 0xa9, 0x66, 0x55, 0x61, + 0x30, 0x83, 0x36, 0x63, 0xd1, 0x7b, 0x56, 0xc9, 0xac, 0x70, 0x47, 0x54, + 0x59, 0x5a, 0x63, 0x72, 0x42, 0x38, 0x78, 0x63, 0x8b, 0x99, 0x84, 0x41, + 0x35, 0x89, 0x57, 0x7f, 0x4f, 0xa3, 0x41, 0xab, 0x76, 0xc8, 0x3e, 0x6b, + 0xaf, 0xb9, 0xab, 0xb0, 0xa2, 0xb5, 0xb6, 0x8a, 0xc5, 0x6f, 0xb7, 0x9c, + 0x8c, 0x43, 0x6e, 0x3b, 0x3b, 0x68, 0x46, 0xa3, 0x58, 0x71, 0xb8, 0x63, + 0xab, 0x40, 0x71, 0xa1, 0xc6, 0x6e, 0xac, 0x54, 0x55, 0x97, 0x78, 0x5d, + 0xb6, 0xa9, 0xb1, 0x4c, 0x9c, 0x87, 0xbd, 0x92, 0x83, 0xbd, 0x2c, 0x8e, + 0x91, 0x3b, 0x68, 0x63, 0xb7, 0x42, 0x3f, 0xd1, 0x51, 0xb7, 0xbb, 0x5f, + 0x5e, 0x66, 0x7b, 0x97, 0x50, 0x9c, 0xc1, 0x82, 0xb3, 0xca, 0xb8, 0xa7, + 0xba, 0xcd, 0x40, 0x6c, 0xb5, 0x64, 0xaf, 0x65, 0x97, 0x3d, 0xa8, 0xb9, + 0xbf, 0x64, 0x52, 0xb0, 0x59, 0x48, 0x82, 0xb5, 0x3b, 0x4e, 0x99, 0x9f, + 0x97, 0x46, 0x58, 0x8e, 0xac, 0x4a, 0x7c, 0xb6, 0x4f, 0x35, 0x3f, 0xc2, + 0xbc, 0x44, 0x71, 0xbd, 0x96, 0xc0, 0x6e, 0xa6, 0x33, 0x47, 0xb0, 0x6b, + 0x78, 0x8f, 0xa8, 0x85, 0x4b, 0x66, 0x51, 0xd3, 0xab, 0xa5, 0xcd, 0xd9, + 0x4a, 0xa7, 0xab, 0x25, 0x4c, 0x8d, 0x53, 0x40, 0x75, 0x70, 0x56, 0x42, + 0x81, 0xb6, 0x67, 0xce, 0xae, 0x25, 0xb7, 0x5f, 0x88, 0x99, 0x51, 0x6a, + 0xb4, 0xbc, 0x8d, 0x46, 0xac, 0x9a, 0x71, 0x9e, 0x5c, 0x53, 0x8d, 0x6a, + 0x64, 0x44, 0x54, 0x6b, 0x62, 0x62, 0x83, 0x31, 0x73, 0x99, 0xb9, 0x8c, + 0x3b, 0x9a, 0xac, 0x88, 0x87, 0x73, 0x55, 0x31, 0x83, 0xc9, 0xce, 0xa4, + 0x95, 0xae, 0xb9, 0x57, 0x4d, 0x84, 0x6a, 0x9b, 0xbc, 0x63, 0x2d, 0xa2, + 0x5e, 0x69, 0x73, 0x6a, 0x40, 0x4b, 0x94, 0x90, 0x6c, 0x9d, 0x5e, 0xa7, + 0x59, 0x81, 0x57, 0x91, 0x83, 0x94, 0xc4, 0x81, 0x78, 0x92, 0xdd, 0x9a, + 0x39, 0xad, 0x82, 0x50, 0xa2, 0xd5, 0xca, 0x4b, 0xd6, 0x34, 0x35, 0x89, + 0x71, 0xa2, 0xc3, 0xc1, 0x32, 0x6f, 0x9b, 0x3a, 0xba, 0x8d, 0x9a, 0x5b, + 0x46, 0x63, 0xc4, 0xac, 0x89, 0x83, 0x63, 0xa9, 0x6c, 0xab, 0x7d, 0xbc, + 0x75, 0x75, 0x67, 0x8d, 0xa4, 0x37, 0x74, 0x87, 0x5a, 0xb1, 0x63, 0x61, + 0x86, 0xb2, 0x86, 0x56, 0x67, 0x59, 0xa5, 0x3b, 0x67, 0x8e, 0xc3, 0xc2, + 0xb0, 0x4e, 0xa5, 0xc9, 0x5e, 0xbd, 0x4a, 0x3f, 0x83, 0x27, 0xb5, 0xca, + 0x3e, 0x72, 0x9f, 0x8f, 0x81, 0x5e, 0x7e, 0xce, 0x41, 0x87, 0x57, 0x8f, + 0xc8, 0x95, 0xdb, 0x51, 0x71, 0x8d, 0x47, 0x48, 0xa2, 0xa7, 0x3b, 0x73, + 0xbd, 0x67, 0x6a, 0xcc, 0xb1, 0x40, 0x4f, 0x68, 0xd6, 0x8f, 0xde, 0x52, + 0x4d, 0x43, 0x8b, 0x66, 0x64, 0x56, 0x80, 0x67, 0x6e, 0x37, 0x49, 0x39, + 0x83, 0xa8, 0xc8, 0xbc, 0x34, 0x9a, 0xcd, 0xd7, 0x37, 0x81, 0xb4, 0x46, + 0x51, 0x79, 0xcc, 0x67, 0xb7, 0x90, 0x9f, 0x36, 0xcc, 0x99, 0x71, 0xb9, + 0x68, 0xa8, 0x82, 0xab, 0x67, 0x8c, 0xc3, 0x9d, 0xcd, 0xb9, 0x55, 0x74, + 0x63, 0xc5, 0xb6, 0x3f, 0x4c, 0xc9, 0xc2, 0x57, 0x79, 0xc6, 0x46, 0x4e, + 0x74, 0x6f, 0x97, 0xc5, 0x9b, 0x98, 0x7c, 0xc8, 0x89, 0xd0, 0x93, 0x6b, + 0x9b, 0xc3, 0x59, 0x81, 0x30, 0x4c, 0xbf, 0x4d, 0x9c, 0xb8, 0x91, 0x63, + 
0x48, 0xc9, 0x79, 0xa9, 0x7b, 0x49, 0x90, 0x7b, 0x67, 0x43, 0xb6, 0xba, + 0x41, 0x93, 0xbf, 0x38, 0x72, 0x4b, 0xb4, 0xd6, 0xc9, 0x92, 0x9c, 0xab, + 0x51, 0x44, 0x56, 0x88, 0xd0, 0x83, 0xba, 0xbf, 0x9f, 0xa7, 0xad, 0xaa, + 0xce, 0x8e, 0xa5, 0x6c, 0x69, 0x3d, 0xae, 0xa4, 0x4b, 0x37, 0xd1, 0x68, + 0x84, 0xb1, 0x33, 0x89, 0x43, 0xd8, 0x6e, 0x91, 0x53, 0x87, 0x42, 0x61, + 0x7e, 0x79, 0x6c, 0xad, 0x7b, 0x56, 0xba, 0xa4, 0x69, 0x64, 0x5f, 0x2f, + 0x88, 0x58, 0xc7, 0xb6, 0x78, 0xb7, 0xa8, 0x48, 0xa8, 0x96, 0x3f, 0x69, + 0x60, 0xcd, 0xcc, 0xa3, 0xae, 0x4f, 0x3a, 0x94, 0x3e, 0x49, 0xa6, 0x36, + 0x46, 0x8f, 0xbd, 0x78, 0x8f, 0x6b, 0xaf, 0x40, 0x3e, 0xa5, 0x9a, 0x9f, + 0x49, 0x70, 0x9b, 0x82, 0x62, 0x39, 0x9d, 0x9d, 0x6a, 0x8e, 0x4f, 0x6c, + 0xa1, 0x70, 0x45, 0x35, 0xbb, 0x3d, 0x91, 0x72, 0xd4, 0xa8, 0x80, 0xc5, + 0x83, 0x88, 0xc7, 0xa0, 0x9a, 0x9e, 0xa9, 0x3d, 0x44, 0xaf, 0x80, 0x7d, + 0x59, 0xa9, 0x9c, 0x9d, 0x3f, 0x80, 0x62, 0x70, 0xc9, 0x40, 0x94, 0x32, + 0x5c, 0x8c, 0x3e, 0xbf, 0xcb, 0xcd, 0x4b, 0xbd, 0xc1, 0xc7, 0x51, 0xc5, + 0x5a, 0xb9, 0x68, 0xa1, 0xa9, 0x91, 0x4b, 0x55, 0xad, 0x95, 0xa4, 0x8e, + 0x4c, 0xc5, 0x93, 0x3f, 0x5a, 0xd6, 0x71, 0x36, 0xaa, 0x56, 0x9a, 0x90, + 0x9c, 0x93, 0x60, 0x75, 0x76, 0x7d, 0xa0, 0xa8, 0x9e, 0x95, 0xc2, 0xb4, + 0xac, 0xb0, 0x80, 0x9a, 0x68, 0x37, 0x86, 0x5b, 0x33, 0x3f, 0x4b, 0x99, + 0x84, 0xbf, 0xa8, 0xb6, 0x8e, 0xb1, 0xa1, 0x93, 0xc8, 0xad, 0x53, 0x98, + 0x9a, 0x90, 0xc9, 0x39, 0x8a, 0x5b, 0xa4, 0x37, 0xaa, 0xd0, 0x6e, 0xb7, + 0x49, 0x61, 0x9b, 0x76, 0xb9, 0x77, 0x36, 0x49, 0x59, 0x6f, 0xae, 0x87, + 0x53, 0xc8, 0xd6, 0x5c, 0x80, 0x9c, 0x6f, 0xcb, 0x86, 0xd7, 0x4c, 0x50, + 0x65, 0x73, 0x9a, 0x94, 0x88, 0xa5, 0xcd, 0x33, 0x9b, 0xa0, 0xc0, 0x60, + 0x35, 0x59, 0xb2, 0x66, 0x7f, 0xd3, 0x2f, 0x3f, 0xbe, 0x94, 0xb8, 0x83, + 0x92, 0x33, 0x7b, 0x7d, 0x31, 0x39, 0x5e, 0x40, 0x51, 0x98, 0x56, 0xce, + 0xcb, 0x69, 0x4a, 0xd0, 0x3c, 0x83, 0x3d, 0x8d, 0xb8, 0x45, 0x94, 0xbf, + 0x50, 0x5d, 0x6a, 0xb9, 0xa2, 0xc3, 0xba, 0x82, 0x58, 0x8e, 0x51, 0x33, + 0xc9, 0x3d, 0x4b, 0xa7, 0xc0, 0x51, 0x44, 0x4d, 0x32, 0x49, 0x66, 0x51, + 0x4b, 0xb9, 0xa9, 0x8e, 0xc7, 0x90, 0x81, 0x41, 0x9d, 0x76, 0xb2, 0x8d, + 0x39, 0x60, 0x6a, 0x6c, 0x62, 0xa1, 0x42, 0x72, 0x8a, 0x7e, 0xa1, 0xce, + 0x48, 0x8f, 0xb9, 0x5f, 0x6c, 0x49, 0x83, 0xbf, 0xaa, 0x44, 0x91, 0x47, + 0x5d, 0x3e, 0x69, 0xbb, 0x47, 0x88, 0x69, 0x6c, 0x6d, 0xc7, 0xb3, 0xae, + 0x7d, 0x5b, 0x81, 0xcc, 0x83, 0x75, 0x71, 0xba, 0xa7, 0x7f, 0x68, 0x5a, + 0x81, 0x3d, 0x43, 0x30, 0x43, 0xca, 0xbf, 0x38, 0xa4, 0xc4, 0x3e, 0x9b, + 0x71, 0xc8, 0xa4, 0x52, 0x79, 0xc4, 0x5f, 0x51, 0xaa, 0x79, 0x61, 0x46, + 0x4b, 0x87, 0xb9, 0x83, 0xce, 0x93, 0x5c, 0xcf, 0x45, 0x43, 0xae, 0x61, + 0x98, 0xb4, 0x5c, 0x54, 0x8d, 0xb8, 0x77, 0x89, 0x91, 0x32, 0x8c, 0x2f, + 0x90, 0x3b, 0xc2, 0x30, 0xcf, 0x69, 0x53, 0x8d, 0x44, 0xbd, 0x4c, 0x42, + 0x7a, 0x6a, 0x7c, 0xcf, 0x5e, 0xca, 0xbb, 0x9e, 0x78, 0x85, 0x81, 0xb0, + 0x7e, 0xc6, 0x86, 0x87, 0x6c, 0xc3, 0x91, 0x7a, 0x8f, 0x7c, 0x4a, 0xb6, + 0x40, 0x50, 0x85, 0xb2, 0xa4, 0xbc, 0xa7, 0x37, 0x4b, 0x5a, 0x50, 0x6b, + 0x33, 0x3b, 0x60, 0x6b, 0x8f, 0xba, 0x86, 0x30, 0x8a, 0x8c, 0x6e, 0xa4, + 0x9b, 0x9f, 0x89, 0xb8, 0x7e, 0x6d, 0x4e, 0x69, 0xc3, 0x6d, 0xc2, 0xba, + 0x8d, 0x81, 0x73, 0xcc, 0x91, 0xc2, 0xa2, 0x6b, 0x57, 0xa2, 0x4c, 0x9a, + 0xd2, 0xbf, 0x3e, 0x68, 0x5a, 0x98, 0xad, 0x6c, 0xbb, 0xcc, 0xa4, 0x57, + 0xc5, 0x67, 0x4d, 0xc4, 0x7b, 0xba, 0xc1, 0xc7, 0x99, 0xca, 0xbb, 0xd3, + 0xa7, 0x2f, 0x3f, 0x54, 0x54, 0x43, 0x81, 0x9f, 0x76, 0x4e, 0x3d, 0x4e, + 0x4c, 0xb2, 0x48, 0xb9, 0x9f, 0x4e, 0x4e, 0xb7, 0x5d, 0x5d, 0x76, 0xac, + 
0x9b, 0x38, 0x4a, 0xc8, 0x35, 0x4c, 0xb1, 0x61, 0x88, 0x90, 0xac, 0x82, + 0x79, 0x6d, 0x3a, 0x70, 0x81, 0x75, 0xa1, 0xc5, 0xba, 0xc9, 0xcc, 0x4a, + 0x75, 0x52, 0x9d, 0x43, 0x8c, 0x71, 0x76, 0x30, 0x76, 0xc0, 0xcf, 0x6c, + 0x7f, 0x90, 0x4e, 0xce, 0x8f, 0x79, 0x48, 0x7b, 0x31, 0x37, 0x9f, 0x32, + 0xb1, 0x49, 0x9c, 0x4a, 0x40, 0x6e, 0x66, 0xb5, 0xa4, 0x49, 0xd2, 0xc1, + 0x70, 0xba, 0x33, 0xc1, 0x5d, 0x97, 0xa9, 0x81, 0x6c, 0x7c, 0x74, 0xd1, + 0xc8, 0x39, 0x8d, 0x42, 0xc8, 0x61, 0xc4, 0xab, 0x3f, 0x8a, 0x4d, 0xc6, + 0xa6, 0xa7, 0xa1, 0xbb, 0x64, 0x9d, 0x67, 0x63, 0x9c, 0x66, 0xa3, 0x7e, + 0x47, 0x50, 0xc1, 0x64, 0x57, 0x50, 0x38, 0x79, 0xaa, 0x32, 0x6f, 0xae, + 0xc4, 0x3c, 0xa0, 0x6c, 0x82, 0x48, 0x7f, 0xc7, 0x49, 0x7e, 0x87, 0xc9, + 0xb4, 0x6b, 0x81, 0xb7, 0xa4, 0xc2, 0x7d, 0x54, 0x39, 0x60, 0x5b, 0xa5, + 0xc1, 0xd2, 0x4d, 0xca, 0x7a, 0x6b, 0x6c, 0xa7, 0x68, 0x73, 0x42, 0x7c, + 0x97, 0x35, 0x99, 0xc3, 0x8b, 0x94, 0x59, 0x39, 0x5e, 0x66, 0x75, 0x57, + 0x87, 0xb7, 0x97, 0x7b, 0x36, 0x7d, 0x62, 0x94, 0xaf, 0x9f, 0x9b, 0x7e, + 0x73, 0x40, 0x91, 0xa2, 0x8e, 0xba, 0x45, 0xcf, 0x4c, 0x9d, 0x56, 0x52, + 0xba, 0x83, 0xbd, 0x57, 0xb8, 0x88, 0x7d, 0x9e, 0x4b, 0x72, 0xb0, 0x56, + 0xa8, 0x41, 0x9d, 0x98, 0x60, 0x74, 0x35, 0xa7, 0xc7, 0x8b, 0x37, 0x3d, + 0xd2, 0xd5, 0xbf, 0x78, 0x97, 0xc9, 0xa5, 0x48, 0xae, 0xa9, 0x7b, 0xae, + 0x9b, 0x4a, 0x31, 0xa6, 0x95, 0xd1, 0x9a, 0x43, 0x97, 0xa2, 0x53, 0x8e, + 0xc4, 0x99, 0x79, 0x57, 0xbb, 0x6d, 0xad, 0x75, 0x44, 0xb1, 0x9d, 0x73, + 0x6c, 0x6f, 0x3c, 0xa3, 0x36, 0xb2, 0xad, 0x5f, 0xd2, 0x93, 0x92, 0x5a, + 0x54, 0xc3, 0x6b, 0x93, 0x59, 0x9f, 0x8d, 0x76, 0x8c, 0x41, 0x65, 0xa0, + 0xb2, 0x9b, 0x9a, 0xab, 0x6e, 0x84, 0x72, 0xd5, 0xbf, 0xce, 0x87, 0xbb, + 0xae, 0x55, 0x58, 0x36, 0x6f, 0xb8, 0x4d, 0x43, 0x30, 0xb9, 0xd9, 0xb2, + 0xa5, 0x69, 0x67, 0x55, 0x6b, 0xb5, 0x7a, 0x8d, 0xc7, 0x4e, 0x97, 0xc8, + 0x45, 0xdb, 0x52, 0x8c, 0x79, 0x56, 0x4d, 0x6c, 0xb9, 0x72, 0x5a, 0x98, + 0x5a, 0x68, 0x59, 0xd1, 0x81, 0xa2, 0x2b, 0xc5, 0x46, 0x91, 0x7c, 0x3b, + 0x50, 0xa8, 0x73, 0x6a, 0x9f, 0x31, 0x5e, 0x63, 0x43, 0x63, 0x90, 0x9c, + 0xbd, 0x55, 0x3d, 0x48, 0xac, 0x95, 0x97, 0x7c, 0x70, 0x9f, 0x41, 0x7a, + 0x7c, 0xbc, 0xa6, 0xbc, 0xbc, 0xb1, 0x45, 0x8e, 0xaa, 0x42, 0xac, 0x75, + 0xc1, 0x3a, 0x43, 0x97, 0x90, 0x35, 0x63, 0x89, 0x63, 0xcb, 0xbc, 0xb5, + 0xae, 0x86, 0x74, 0x64, 0x9f, 0xbf, 0xce, 0x53, 0x5a, 0x7f, 0xaf, 0x55, + 0x9c, 0x8c, 0xc8, 0x63, 0x48, 0x76, 0x90, 0x92, 0x37, 0xa0, 0x9b, 0x8d, + 0x8d, 0x91, 0x9f, 0x7c, 0xbd, 0x45, 0x86, 0xa0, 0x33, 0xcb, 0x85, 0x96, + 0xbb, 0xbd, 0x8b, 0xbb, 0x53, 0x90, 0xbe, 0xb6, 0x31, 0x87, 0x6b, 0x9f, + 0x93, 0x78, 0xc1, 0xd0, 0x5e, 0x79, 0x78, 0x5c, 0x6f, 0xb4, 0xc0, 0x3c, + 0x97, 0x32, 0x9c, 0x90, 0x3c, 0x4a, 0x99, 0x8e, 0xd1, 0x71, 0x73, 0xc5, + 0xc7, 0x45, 0x80, 0xb5, 0xb3, 0x61, 0xa0, 0x4f, 0x57, 0xb0, 0x58, 0xd4, + 0xc6, 0xd4, 0xbe, 0x3e, 0xb1, 0x4a, 0xac, 0x6e, 0x5f, 0x4d, 0x63, 0x57, + 0x52, 0xba, 0x4b, 0x6e, 0x9e, 0xc6, 0xb8, 0x7a, 0x65, 0x3f, 0x84, 0x5d, + 0x7f, 0x9f, 0x9e, 0x88, 0x6f, 0x50, 0x35, 0xa9, 0xa0, 0x56, 0x45, 0x5c, + 0xc6, 0xae, 0xad, 0x98, 0xaa, 0x35, 0x86, 0x5f, 0x44, 0x3c, 0x60, 0x91, + 0x86, 0x54, 0x38, 0xcf, 0x72, 0x40, 0xa9, 0x53, 0x56, 0x33, 0x80, 0xb6, + 0x98, 0x53, 0x96, 0xbd, 0x3b, 0xc0, 0x8b, 0x68, 0x4c, 0x62, 0x8b, 0xa1, + 0x97, 0xd1, 0xb2, 0x47, 0x63, 0x89, 0x39, 0x52, 0x34, 0xab, 0x3b, 0xc5, + 0x81, 0xcf, 0xc0, 0x5f, 0xa4, 0x83, 0x99, 0x6e, 0x6f, 0x9a, 0xbf, 0x5f, + 0x47, 0xa2, 0xa0, 0xc2, 0xbe, 0x6a, 0x50, 0x7c, 0x85, 0x37, 0xc0, 0x8d, + 0xc1, 0xa6, 0x54, 0x63, 0x49, 0xb6, 0x95, 0xa3, 0xae, 0xba, 0x8a, 0x5e, + 
0x7e, 0x47, 0x53, 0xb0, 0x38, 0x6a, 0x39, 0x75, 0x77, 0x52, 0x8e, 0x4e, + 0x3f, 0xcc, 0x6e, 0xa9, 0x51, 0x46, 0x87, 0x69, 0x45, 0x86, 0x56, 0x7e, + 0xcd, 0x54, 0x51, 0x37, 0x7d, 0xb5, 0x81, 0x79, 0x3b, 0x88, 0x4c, 0x73, + 0xc6, 0x9f, 0x58, 0x50, 0x95, 0x6a, 0x70, 0x3d, 0xc1, 0x6b, 0x3e, 0x3d, + 0xc9, 0x8d, 0x57, 0x72, 0x68, 0x93, 0xbd, 0x99, 0xac, 0x45, 0x88, 0xb9, + 0x8c, 0x57, 0xa0, 0x8a, 0x7a, 0xba, 0x9e, 0x56, 0x5b, 0x56, 0x3d, 0x66, + 0x94, 0xc5, 0x39, 0x90, 0xba, 0x37, 0xae, 0x51, 0x6c, 0xd3, 0xbb, 0xa5, + 0x51, 0xa5, 0x85, 0x9c, 0x80, 0x65, 0x4d, 0x8b, 0x33, 0xd2, 0xc2, 0x52, + 0xa1, 0x47, 0x7d, 0x53, 0x50, 0xd2, 0x8d, 0x77, 0xb5, 0x59, 0xcf, 0x71, + 0x8a, 0x36, 0x65, 0x39, 0xb7, 0xc3, 0x3c, 0x8e, 0x6b, 0xc8, 0x4e, 0xa7, + 0x98, 0xa1, 0x3a, 0x76, 0xc6, 0x93, 0x89, 0x8e, 0xd5, 0xbc, 0x87, 0x3c, + 0x69, 0x36, 0x51, 0x38, 0xcc, 0x70, 0xbb, 0x8d, 0x2f, 0xa8, 0x66, 0xc6, + 0x3f, 0x91, 0x7d, 0x66, 0x8f, 0x7a, 0x71, 0x84, 0x52, 0x4c, 0xb9, 0x5a, + 0x50, 0xb6, 0x6d, 0x37, 0xd6, 0x87, 0x9d, 0x51, 0xa4, 0xa1, 0x79, 0x64, + 0x76, 0x99, 0x70, 0xa3, 0x55, 0x49, 0x85, 0xc4, 0x98, 0x59, 0xa7, 0x65, + 0xc1, 0x9b, 0xc4, 0x4e, 0x88, 0x7f, 0x56, 0x71, 0xb2, 0xb8, 0x35, 0x35, + 0xa5, 0x99, 0xad, 0x42, 0x42, 0x86, 0xa2, 0x52, 0x64, 0x90, 0x80, 0x31, + 0xb8, 0xd2, 0xac, 0xb2, 0x5b, 0x39, 0x4f, 0xb1, 0xcf, 0xa2, 0x8b, 0x9e, + 0xbd, 0x95, 0x9c, 0x64, 0x6c, 0x7d, 0xd4, 0x94, 0x99, 0x2f, 0x67, 0x96, + 0x7a, 0x8e, 0xa3, 0xb8, 0x7a, 0x5c, 0x53, 0x65, 0xc2, 0xa3, 0xbf, 0x83, + 0x92, 0x8d, 0x9c, 0x5e, 0xc8, 0x7c, 0xca, 0x8b, 0xab, 0x81, 0x4d, 0x53, + 0x87, 0xa9, 0xa1, 0x71, 0xb6, 0xcb, 0xcc, 0xb1, 0x51, 0x92, 0x37, 0x35, + 0x37, 0xa3, 0x6d, 0x8a, 0x98, 0xa2, 0x37, 0xc2, 0x8b, 0x56, 0x75, 0xbe, + 0xa5, 0x66, 0x35, 0xce, 0x76, 0x6e, 0x6e, 0x34, 0x31, 0x42, 0x3c, 0x48, + 0xc8, 0xa9, 0x64, 0x76, 0x45, 0x79, 0x84, 0x57, 0x92, 0x68, 0xa7, 0xbb, + 0xb4, 0x62, 0x74, 0x81, 0x88, 0xa6, 0xc8, 0xd3, 0x82, 0xbe, 0xc1, 0x9d, + 0x34, 0xb3, 0xb0, 0x5c, 0x34, 0x75, 0x5f, 0x60, 0x63, 0x35, 0xa2, 0x30, + 0xa6, 0x4b, 0xaa, 0x81, 0x52, 0x74, 0xbb, 0xca, 0x3b, 0x9d, 0xbf, 0xa0, + 0xc5, 0x5b, 0x82, 0xc0, 0x44, 0x57, 0xcf, 0x3a, 0x46, 0x82, 0x8f, 0xc5, + 0x4d, 0xcd, 0x9f, 0x86, 0x34, 0x5e, 0xcc, 0x69, 0x84, 0x78, 0x90, 0x79, + 0x8f, 0xc9, 0x8d, 0xa7, 0xaf, 0x88, 0x3f, 0xc1, 0xa4, 0x69, 0x7f, 0x54, + 0x70, 0x56, 0xca, 0xae, 0xa1, 0xb3, 0x6e, 0x51, 0x52, 0x30, 0xbb, 0x4e, + 0x9b, 0x92, 0x36, 0xbc, 0x53, 0x9d, 0xac, 0x87, 0x46, 0xb9, 0x42, 0x58, + 0x8c, 0xc7, 0x5b, 0xc9, 0xc5, 0xa6, 0xba, 0x51, 0x82, 0xda, 0x77, 0x61, + 0x48, 0xce, 0x48, 0x69, 0x85, 0xb1, 0x68, 0x2b, 0x57, 0x43, 0x52, 0x99, + 0x8d, 0x8b, 0x95, 0xaf, 0xac, 0x53, 0x7d, 0x5b, 0x5a, 0xad, 0x2c, 0x37, + 0xa8, 0x7d, 0x48, 0xc7, 0x67, 0xb2, 0x6d, 0x75, 0x82, 0x88, 0xd0, 0x67, + 0x8f, 0xb5, 0x76, 0xa4, 0x66, 0x69, 0x71, 0xbc, 0xc2, 0xaf, 0xc2, 0xb0, + 0x5f, 0xcf, 0x69, 0x6a, 0xca, 0x61, 0x9c, 0xc9, 0x4f, 0x5b, 0x81, 0xce, + 0xa6, 0xb6, 0x8a, 0x38, 0xaa, 0x62, 0x7e, 0x6e, 0x56, 0x3f, 0x4b, 0xc6, + 0x3f, 0xab, 0x61, 0x73, 0x3b, 0x50, 0xd1, 0x5a, 0x8e, 0x90, 0x53, 0x5c, + 0xc6, 0x64, 0x7e, 0x87, 0x45, 0x5d, 0x57, 0xce, 0x6b, 0xa3, 0x89, 0xdf, + 0x3b, 0x4f, 0x9f, 0xc1, 0x4d, 0x21, 0xbc, 0x53, 0x99, 0x82, 0x61, 0x91, + 0xbb, 0x36, 0x54, 0x52, 0x54, 0x72, 0x65, 0xba, 0x46, 0x45, 0x71, 0xc4, + 0x2a, 0x2a, 0xc9, 0x86, 0xd0, 0xad, 0x7f, 0x7d, 0x5d, 0x9b, 0x78, 0x58, + 0xc6, 0x64, 0x6d, 0xc7, 0x45, 0x55, 0x44, 0x94, 0x6e, 0x5d, 0x32, 0x37, + 0x53, 0x36, 0x49, 0x66, 0xb0, 0x64, 0x3c, 0x7f, 0xbe, 0xc0, 0x5a, 0x2f, + 0xbc, 0x8a, 0x7a, 0x54, 0x4c, 0x6c, 0x40, 0x92, 0x37, 0x90, 0x4e, 0xb8, + 
0x55, 0x8d, 0x7a, 0xb7, 0x6c, 0x58, 0x43, 0xa5, 0xa0, 0x59, 0x7b, 0xad, + 0x79, 0xbf, 0x6e, 0xd3, 0x6d, 0x8b, 0x61, 0x81, 0x74, 0xa5, 0x6a, 0xd0, + 0x57, 0xc7, 0xa1, 0x36, 0x51, 0x51, 0x31, 0x61, 0x6a, 0x95, 0xb2, 0x3f, + 0x4d, 0x71, 0x37, 0x6f, 0xb3, 0x30, 0x6b, 0x4d, 0x62, 0x53, 0x3f, 0x83, + 0xa5, 0x6e, 0xc7, 0xc0, 0xa7, 0xb1, 0xb9, 0x81, 0xb7, 0x92, 0xd4, 0x4d, + 0x6c, 0x78, 0x8a, 0x9e, 0xbf, 0x3d, 0xbd, 0x5a, 0x3e, 0x85, 0x4a, 0x49, + 0x6f, 0xd0, 0xaf, 0x93, 0xc1, 0x55, 0x8b, 0x34, 0x78, 0x36, 0x75, 0x51, + 0xb4, 0x7d, 0x8f, 0xde, 0x70, 0x51, 0xb6, 0xb8, 0x61, 0x7a, 0xbc, 0xb6, + 0xaa, 0xa8, 0x47, 0x58, 0x43, 0x44, 0x7d, 0x75, 0xa8, 0x5d, 0x92, 0x33, + 0xad, 0x7f, 0x78, 0xd9, 0xc1, 0x57, 0x6b, 0x61, 0x60, 0x88, 0x8d, 0x71, + 0x9f, 0x65, 0x97, 0x56, 0x6b, 0xaa, 0x76, 0x7f, 0x6b, 0x8d, 0x6c, 0x8e, + 0x5a, 0x9c, 0x59, 0xa7, 0x6c, 0x5d, 0x55, 0x3c, 0x3e, 0xd1, 0x36, 0xa4, + 0xa1, 0xc4, 0xbb, 0xc7, 0x8f, 0x99, 0x7c, 0x57, 0x37, 0x44, 0xaa, 0x81, + 0x83, 0x36, 0x4f, 0x55, 0x8a, 0x83, 0x73, 0xb1, 0x9f, 0x9d, 0x6f, 0xbd, + 0x8f, 0x4c, 0x5e, 0x38, 0x88, 0x56, 0xd0, 0x5f, 0x49, 0xc8, 0x78, 0x72, + 0x51, 0xc2, 0x7b, 0xaa, 0x3a, 0x9d, 0xc5, 0x3c, 0x3e, 0xb2, 0x7e, 0x3f, + 0x82, 0xb9, 0xcf, 0xa3, 0x8c, 0x26, 0x62, 0xa3, 0x66, 0x2f, 0xc6, 0xbf, + 0x71, 0x6a, 0xbb, 0xce, 0x95, 0xab, 0x5c, 0x89, 0xcc, 0xba, 0xb8, 0x9b, + 0x9d, 0x64, 0x78, 0xc3, 0x61, 0x3c, 0xb2, 0xae, 0x74, 0x3d, 0x96, 0xae, + 0xbd, 0xc6, 0xc7, 0xb6, 0xbe, 0x72, 0x3d, 0x51, 0xa9, 0x77, 0x81, 0x43, + 0x5a, 0x71, 0x4d, 0xa6, 0x84, 0xa6, 0x7c, 0xbc, 0xc8, 0xa0, 0x61, 0x36, + 0x49, 0xc8, 0x9d, 0x9b, 0xab, 0xa4, 0x59, 0x6f, 0xb1, 0x94, 0xad, 0xae, + 0x5a, 0xbe, 0x85, 0xb7, 0x67, 0xa9, 0x74, 0x56, 0x5f, 0x9d, 0x92, 0xbc, + 0xa6, 0xc6, 0x8b, 0x5d, 0x5f, 0x66, 0x7a, 0x9e, 0x92, 0x6d, 0x6a, 0xb2, + 0x4c, 0x94, 0x78, 0x7f, 0xa9, 0x63, 0x9a, 0x7b, 0x4c, 0x6d, 0x56, 0x4d, + 0x5b, 0x89, 0xcb, 0x4d, 0x76, 0xb2, 0x73, 0x8f, 0x62, 0x36, 0x29, 0x4f, + 0x6c, 0xb1, 0x8b, 0xcc, 0x64, 0x4a, 0xb4, 0xa4, 0x41, 0xcc, 0x5c, 0xaf, + 0x77, 0xc9, 0x7c, 0xc7, 0x3d, 0x88, 0x4e, 0xcc, 0x82, 0x93, 0x4d, 0x4c, + 0xd1, 0xb0, 0x4c, 0x52, 0xbc, 0x5a, 0xd5, 0x47, 0x3e, 0xa8, 0xc1, 0x8b, + 0x97, 0x62, 0xbc, 0x88, 0xd4, 0x54, 0x7f, 0xa5, 0xb8, 0x97, 0x7f, 0x85, + 0xa0, 0xb2, 0x89, 0xc7, 0x73, 0x45, 0xba, 0x70, 0x33, 0xad, 0x88, 0x83, + 0x9b, 0x47, 0x8f, 0xdb, 0x32, 0xaf, 0x2d, 0x58, 0xaf, 0x98, 0xbd, 0x54, + 0x43, 0x5b, 0x48, 0xc9, 0x89, 0xbc, 0x8d, 0x52, 0x5e, 0x56, 0x67, 0x56, + 0xc5, 0x8a, 0x2c, 0x30, 0x40, 0xcc, 0x6e, 0x42, 0x74, 0xb6, 0xca, 0x77, + 0x5a, 0xa6, 0x2c, 0xc4, 0x64, 0x4e, 0xc2, 0x7f, 0xa9, 0x59, 0x5d, 0x44, + 0x40, 0xab, 0x4d, 0x9f, 0x4c, 0xb0, 0xc7, 0xc6, 0x55, 0x6e, 0x4d, 0x6c, + 0x38, 0x82, 0xaf, 0x3e, 0x90, 0xb5, 0x47, 0x70, 0xcb, 0x34, 0x49, 0x82, + 0x6a, 0xb9, 0xb3, 0xc3, 0xab, 0xbb, 0xa4, 0xc1, 0xc0, 0x50, 0x70, 0xc9, + 0x7c, 0x9e, 0x44, 0xb8, 0x90, 0x39, 0x5b, 0x7f, 0xb7, 0x84, 0xae, 0xa5, + 0x46, 0x60, 0x5e, 0x82, 0xb0, 0x35, 0x61, 0x7b, 0x88, 0x82, 0x49, 0xc1, + 0x75, 0x77, 0x3a, 0x40, 0xc0, 0xae, 0x5f, 0xa3, 0xce, 0x43, 0x8d, 0x7a, + 0x63, 0xd4, 0x48, 0x4c, 0x90, 0xd2, 0x7a, 0x4f, 0x66, 0x9d, 0x70, 0x3c, + 0x36, 0x84, 0xa2, 0x94, 0x7f, 0xa2, 0xa9, 0x68, 0x84, 0x6a, 0x77, 0x4e, + 0x8c, 0xcc, 0x82, 0x5c, 0x91, 0x6a, 0x2a, 0xab, 0x88, 0x59, 0x6c, 0x6d, + 0xab, 0x5a, 0x75, 0x2e, 0xc0, 0xaa, 0xc5, 0xaf, 0x5d, 0x90, 0x9b, 0xa5, + 0x34, 0x6b, 0xc8, 0x63, 0x8b, 0x63, 0x62, 0x84, 0x63, 0xc8, 0x85, 0xb8, + 0xae, 0x8b, 0x97, 0x83, 0x60, 0x30, 0x9b, 0x78, 0x42, 0x9d, 0x4d, 0x62, + 0xb2, 0x59, 0x94, 0x52, 0x99, 0x53, 0x64, 0x7e, 0x3f, 0x60, 0x75, 0x39, + 
0x7d, 0x76, 0xbc, 0xae, 0x47, 0x69, 0xb0, 0xb4, 0xba, 0x97, 0xca, 0x45, + 0x6b, 0xab, 0x32, 0x8b, 0xc1, 0x49, 0x50, 0x73, 0x7d, 0x6c, 0xce, 0xc7, + 0x5a, 0x6a, 0xab, 0x40, 0x72, 0x5d, 0x41, 0x78, 0x5f, 0x45, 0x40, 0xb6, + 0x59, 0x98, 0x56, 0x90, 0x45, 0x49, 0x6b, 0x8d, 0x77, 0x61, 0x7c, 0x9b, + 0x61, 0xc8, 0x89, 0xb2, 0x9d, 0x77, 0x47, 0xd5, 0x6c, 0xcf, 0x76, 0x53, + 0x77, 0x82, 0x80, 0x7c, 0xb0, 0x86, 0xa0, 0x9b, 0x2f, 0x87, 0x31, 0x83, + 0xca, 0x6b, 0x35, 0x85, 0xba, 0x49, 0x8b, 0xc9, 0x7c, 0x6a, 0xaf, 0xb0, + 0xb1, 0xbe, 0x5e, 0x75, 0x90, 0x94, 0x47, 0x8a, 0x6d, 0x38, 0xbd, 0x5c, + 0x6c, 0xa7, 0xc5, 0x66, 0x4c, 0x33, 0x9d, 0xb6, 0x79, 0xbd, 0x66, 0xaf, + 0x7e, 0x47, 0x6c, 0x54, 0xc4, 0x42, 0x6a, 0x4d, 0x76, 0xa1, 0xb8, 0x49, + 0x49, 0x42, 0xb4, 0x92, 0x58, 0x3e, 0x81, 0x3b, 0x49, 0x55, 0xaa, 0x3b, + 0x47, 0xb3, 0x71, 0x72, 0xaf, 0x69, 0x87, 0xc3, 0xcc, 0x92, 0x33, 0x89, + 0x61, 0x32, 0xa0, 0x6f, 0x95, 0xd0, 0xa4, 0x69, 0x7e, 0xaa, 0xcb, 0x7b, + 0x85, 0x8a, 0x47, 0xb8, 0x50, 0x82, 0x7e, 0x6e, 0x97, 0x2d, 0x5b, 0x92, + 0xb8, 0x81, 0x6c, 0xd0, 0x5e, 0x41, 0xa2, 0x74, 0xc7, 0xc1, 0x61, 0x3c, + 0x55, 0x59, 0x99, 0xd6, 0x8f, 0x42, 0xb2, 0xa8, 0x9c, 0x88, 0x3d, 0x5e, + 0x4b, 0xa8, 0xbb, 0x5b, 0xac, 0xa0, 0x4e, 0x42, 0x31, 0x88, 0xbc, 0xa6, + 0x4e, 0x92, 0x53, 0x65, 0x73, 0x3c, 0x66, 0xb1, 0x6f, 0xc0, 0x93, 0x8d, + 0x60, 0xca, 0x65, 0x87, 0xb4, 0x3b, 0x34, 0x96, 0xa3, 0x70, 0x69, 0x58, + 0xad, 0xa9, 0x9f, 0xc6, 0x5a, 0xad, 0x6e, 0xa8, 0x5b, 0x5e, 0xc4, 0x36, + 0x97, 0x40, 0xd0, 0xc2, 0x44, 0xba, 0x46, 0xc1, 0x53, 0x82, 0xcf, 0x59, + 0x98, 0x70, 0x33, 0xb9, 0xbd, 0x5f, 0xa9, 0xcc, 0xb6, 0x82, 0xc2, 0xa0, + 0xc4, 0x61, 0xb9, 0x43, 0xc8, 0xc2, 0x41, 0x95, 0x95, 0xd2, 0x36, 0x61, + 0xc0, 0xbc, 0xa3, 0xbf, 0x66, 0x5f, 0x6a, 0x9a, 0x3d, 0x37, 0xb9, 0x5f, + 0x8f, 0x79, 0x98, 0x56, 0x97, 0xa9, 0x66, 0xb1, 0x52, 0x84, 0xc4, 0x53, + 0x72, 0x97, 0x97, 0xba, 0x8b, 0xa4, 0xc4, 0x6d, 0xbe, 0x6b, 0xd0, 0xab, + 0x9c, 0x34, 0x3b, 0xb0, 0x7c, 0xb1, 0x9c, 0xc1, 0x35, 0x55, 0x58, 0x53, + 0xae, 0x6a, 0x88, 0x4d, 0x79, 0x91, 0x32, 0x67, 0x9a, 0xa0, 0x52, 0x83, + 0xaf, 0x9a, 0xa5, 0xca, 0x5e, 0xb6, 0x7f, 0x7e, 0x44, 0x30, 0x5c, 0x91, + 0x40, 0xbf, 0x9f, 0xb4, 0x62, 0x86, 0x69, 0x53, 0x77, 0xa6, 0x95, 0x48, + 0x57, 0x6f, 0x98, 0x2e, 0x83, 0x5d, 0x40, 0x35, 0x49, 0x83, 0xc2, 0xb2, + 0x67, 0xc7, 0x8a, 0x52, 0x41, 0xc8, 0x89, 0x74, 0x69, 0x4f, 0xc3, 0xc7, + 0xa0, 0xd0, 0x4f, 0x93, 0x80, 0x9f, 0x3d, 0x37, 0xb0, 0x93, 0xbe, 0x68, + 0x90, 0x9c, 0x96, 0x9f, 0x81, 0x85, 0xbf, 0x85, 0x6d, 0x52, 0x8c, 0x7f, + 0x42, 0x75, 0x6a, 0x97, 0x6a, 0x6d, 0x82, 0x91, 0x4e, 0x6d, 0xc2, 0x48, + 0x31, 0x44, 0xc6, 0x4b, 0x5d, 0x7e, 0xb9, 0xd0, 0xa4, 0xbf, 0x46, 0x48, + 0x68, 0x43, 0x5a, 0x8d, 0x5f, 0x5a, 0xab, 0xab, 0x9e, 0x63, 0xaa, 0xce, + 0x66, 0x90, 0x89, 0xb3, 0x5e, 0x89, 0x9b, 0x78, 0x51, 0xa0, 0xa8, 0x41, + 0x34, 0x9d, 0xbf, 0x3d, 0x5a, 0x3d, 0x88, 0xb0, 0x5a, 0xa2, 0xb6, 0x47, + 0x60, 0xcf, 0x95, 0x9c, 0x3c, 0xb8, 0x5d, 0x30, 0xbc, 0x94, 0x54, 0xbc, + 0x4a, 0x83, 0xaa, 0xd2, 0x99, 0xca, 0x95, 0xcb, 0x57, 0x5a, 0x47, 0x43, + 0xb1, 0x9f, 0x8a, 0x46, 0xb1, 0x6c, 0x65, 0x6b, 0x42, 0x69, 0x98, 0x9e, + 0x7d, 0x56, 0x4f, 0xa5, 0x82, 0x51, 0x6a, 0x98, 0xbe, 0xad, 0x98, 0x42, + 0x79, 0x92, 0x32, 0x35, 0x8f, 0x82, 0x5b, 0xc6, 0xa2, 0x2f, 0x71, 0x32, + 0x82, 0xac, 0x2e, 0x52, 0x9f, 0xb2, 0xba, 0x7d, 0x52, 0xaf, 0x82, 0x3a, + 0xc9, 0x41, 0x6e, 0xb9, 0xc6, 0xa0, 0xac, 0x86, 0xbe, 0x9d, 0xa4, 0xc1, + 0xbb, 0x3d, 0x4c, 0x5e, 0x30, 0xc1, 0x69, 0x34, 0x79, 0x7d, 0x6f, 0x36, + 0xbc, 0x3d, 0xc8, 0x3e, 0x57, 0xa5, 0x57, 0x75, 0x8a, 0xa9, 0x5f, 0x4a, + 
0x45, 0x35, 0x80, 0xad, 0x61, 0x48, 0x51, 0xce, 0x66, 0x56, 0x9f, 0x37, + 0xba, 0x40, 0x8d, 0x87, 0x52, 0x33, 0x47, 0xb7, 0xd1, 0xd3, 0xa2, 0x70, + 0x4d, 0x74, 0xaa, 0xcd, 0x84, 0xac, 0x71, 0x3e, 0xc4, 0xbe, 0x83, 0x36, + 0xba, 0x4e, 0x8d, 0x9b, 0x83, 0x97, 0xcf, 0x6c, 0xbb, 0x7f, 0x92, 0xbe, + 0x86, 0x86, 0x8f, 0xca, 0xae, 0x3b, 0x3d, 0xcc, 0xc0, 0xce, 0x40, 0x96, + 0x3a, 0x54, 0x78, 0x3e, 0xcd, 0x97, 0xcc, 0xba, 0x76, 0x9a, 0x57, 0xbf, + 0x6b, 0xac, 0xc9, 0x40, 0xb6, 0xb5, 0x4e, 0xc1, 0x9e, 0x58, 0xc4, 0xc7, + 0x5b, 0xb8, 0x9c, 0x48, 0xb0, 0x90, 0xc5, 0x5c, 0x4f, 0x62, 0x61, 0x4b, + 0x2e, 0x7b, 0xab, 0xb9, 0x6c, 0x89, 0xc5, 0x9e, 0x4b, 0x69, 0x5c, 0x30, + 0xbe, 0x6a, 0xa1, 0x91, 0x51, 0x78, 0x89, 0x6e, 0x38, 0x33, 0x61, 0x53, + 0xaf, 0x45, 0x4b, 0x69, 0x4e, 0xbc, 0xc5, 0xb9, 0x89, 0xc0, 0x31, 0x92, + 0xa7, 0x51, 0xd0, 0x48, 0xa9, 0x8f, 0xb2, 0x62, 0x43, 0x64, 0xc3, 0x48, + 0x82, 0xa9, 0x73, 0x7c, 0x75, 0x52, 0x89, 0x86, 0x62, 0xc7, 0x3b, 0x8b, + 0x5d, 0x8a, 0xce, 0x57, 0x50, 0xad, 0x50, 0x56, 0x6d, 0xbd, 0xc0, 0x4e, + 0x45, 0x93, 0xd1, 0x9b, 0xac, 0x53, 0x59, 0x33, 0x94, 0x95, 0x57, 0x6e, + 0x42, 0x87, 0x60, 0xb0, 0x91, 0x35, 0xb8, 0x94, 0x5e, 0x2b, 0x51, 0x98, + 0x35, 0x9b, 0x6b, 0x75, 0x47, 0x89, 0xaf, 0x83, 0x95, 0xca, 0xa8, 0xb9, + 0x6b, 0xab, 0xaf, 0x7f, 0x54, 0x35, 0x35, 0xc6, 0xcb, 0x81, 0x41, 0x78, + 0x85, 0x44, 0x51, 0x7c, 0x62, 0x78, 0x32, 0x30, 0x6a, 0x99, 0x31, 0x5b, + 0x9a, 0x6e, 0x98, 0xa4, 0xa9, 0x47, 0xa3, 0xcb, 0x95, 0x7e, 0xc2, 0x52, + 0x78, 0xbc, 0xba, 0x42, 0x48, 0xb8, 0x98, 0x82, 0x68, 0x65, 0xb6, 0xb4, + 0x7e, 0x89, 0xb7, 0xaf, 0x87, 0x34, 0xbc, 0xb2, 0xa1, 0xdb, 0x68, 0xac, + 0x45, 0xc5, 0x98, 0x63, 0xb6, 0xd3, 0x57, 0x7a, 0xb1, 0x61, 0x54, 0x3d, + 0x77, 0xa3, 0x67, 0x6f, 0x94, 0x3d, 0x32, 0x3d, 0xaa, 0x77, 0x62, 0x8d, + 0x2f, 0x67, 0xa8, 0x91, 0x53, 0x37, 0x64, 0xc5, 0x68, 0x60, 0x96, 0xae, + 0xaf, 0x96, 0x37, 0xa2, 0x64, 0xc8, 0xcd, 0xa0, 0x5f, 0xbf, 0x3a, 0x37, + 0xbc, 0x87, 0xb7, 0x47, 0x41, 0x52, 0xae, 0x70, 0xbf, 0x6a, 0xb0, 0x66, + 0xb3, 0x33, 0x89, 0x7c, 0xc7, 0x47, 0xc3, 0xa0, 0x45, 0x7c, 0x4e, 0xa9, + 0x7d, 0xaf, 0xc0, 0x31, 0xb8, 0x3d, 0x47, 0x64, 0xcf, 0x9b, 0x8b, 0x7c, + 0xc2, 0x99, 0xa5, 0xc9, 0x92, 0x40, 0x76, 0xbe, 0xb6, 0x4f, 0xa8, 0x2c, + 0xb4, 0xb3, 0x60, 0x42, 0xab, 0xbe, 0x4a, 0x86, 0x3e, 0x94, 0x73, 0x35, + 0xb7, 0x68, 0xc2, 0x4d, 0xb7, 0x85, 0x1d, 0x99, 0x78, 0x61, 0xbc, 0x69, + 0xa0, 0xd9, 0x80, 0x5a, 0x28, 0xba, 0x67, 0xaf, 0x4c, 0x34, 0x53, 0x91, + 0xab, 0x73, 0x96, 0x2c, 0xd3, 0xab, 0x6d, 0x76, 0xab, 0xaa, 0xd4, 0xc8, + 0x86, 0x63, 0x7b, 0x4e, 0xa6, 0x82, 0x6f, 0x9f, 0xc0, 0xa7, 0x92, 0xb0, + 0xbf, 0x56, 0xab, 0x6d, 0xc9, 0xb0, 0x55, 0xc9, 0x35, 0x81, 0xa2, 0xbc, + 0x48, 0xc3, 0x44, 0x6c, 0x74, 0xb8, 0x74, 0xd5, 0x5c, 0x56, 0x2c, 0x89, + 0x5f, 0x4a, 0xa0, 0xad, 0x62, 0x59, 0x99, 0x2d, 0xb7, 0xc2, 0x8c, 0xa1, + 0x5d, 0x61, 0xb0, 0x6c, 0xc7, 0x3d, 0xad, 0x84, 0x85, 0x38, 0xcc, 0xbb, + 0x91, 0x40, 0x64, 0xda, 0x43, 0x99, 0x67, 0x8f, 0x52, 0x91, 0x44, 0xa8, + 0xad, 0x92, 0x81, 0x6f, 0x88, 0x3d, 0x77, 0xa4, 0x72, 0x3c, 0xb7, 0x73, + 0x84, 0xcd, 0xa0, 0xcc, 0x81, 0xb3, 0x8d, 0x91, 0xca, 0x8e, 0x8c, 0xbf, + 0x78, 0x98, 0x70, 0x61, 0x71, 0x65, 0x4d, 0xa6, 0x45, 0x78, 0x8c, 0xe7, + 0xb9, 0x55, 0xb2, 0x9e, 0xce, 0x3b, 0x68, 0x8c, 0x52, 0x39, 0xc7, 0x44, + 0x81, 0x5e, 0xa0, 0xaa, 0x5f, 0xc5, 0x94, 0x74, 0x46, 0x90, 0x6a, 0xd4, + 0xa5, 0xb1, 0x6a, 0xc3, 0xb1, 0xb9, 0x48, 0xbb, 0xbf, 0xd7, 0x1d, 0x6f, + 0x45, 0x82, 0xaa, 0xc2, 0xb3, 0x57, 0x48, 0x4f, 0x52, 0xb4, 0x86, 0x69, + 0xa4, 0xa8, 0xb1, 0x38, 0xc0, 0x86, 0xdc, 0x7f, 0x45, 0x65, 0x87, 0x61, + 
0x35, 0x43, 0xc5, 0x96, 0x75, 0xc8, 0x29, 0x38, 0xc4, 0xb8, 0x31, 0xd6, + 0x8c, 0x66, 0x56, 0xa5, 0x42, 0xcb, 0xa4, 0x81, 0xa6, 0x49, 0x3d, 0x63, + 0x43, 0xd7, 0x6c, 0x57, 0xd3, 0x9e, 0xcd, 0xbe, 0x72, 0x5b, 0x45, 0x85, + 0x98, 0x75, 0xa3, 0x3e, 0x66, 0xb1, 0x4c, 0xbb, 0xc4, 0xb8, 0x40, 0x3d, + 0x9e, 0xbf, 0x5a, 0xa5, 0x3d, 0xce, 0x8d, 0x69, 0xbe, 0x53, 0x71, 0x82, + 0x8a, 0x72, 0xc1, 0x59, 0x83, 0xb1, 0x66, 0xb1, 0xb7, 0xb0, 0xb3, 0xc6, + 0x83, 0x4b, 0x3e, 0xa8, 0xad, 0x6b, 0x71, 0xa0, 0xd8, 0x9a, 0x91, 0xb9, + 0x96, 0xac, 0x61, 0xb4, 0xaf, 0xaf, 0x8c, 0xb1, 0x84, 0x6b, 0x7d, 0x4b, + 0x69, 0x60, 0xa2, 0xad, 0x46, 0x4b, 0x4b, 0xb2, 0x43, 0xbc, 0x42, 0xa3, + 0xc3, 0xa6, 0x3e, 0x2e, 0x5b, 0xd0, 0xb6, 0xd2, 0xa5, 0x46, 0x47, 0x7e, + 0x85, 0x77, 0x81, 0x52, 0x91, 0xac, 0x4a, 0x7f, 0x4d, 0x42, 0xcb, 0xa5, + 0x66, 0xb1, 0x62, 0xd4, 0xb7, 0x64, 0x70, 0x37, 0x8e, 0xa5, 0x45, 0x39, + 0x4a, 0x39, 0x50, 0x3f, 0x58, 0xb0, 0x89, 0x35, 0xb2, 0x3d, 0xc3, 0x5c, + 0x3a, 0xd7, 0x52, 0xd3, 0xbb, 0x39, 0xc4, 0x50, 0x6d, 0x3c, 0x53, 0x46, + 0xaa, 0x32, 0xae, 0xbe, 0xb4, 0x51, 0xb6, 0xb3, 0x62, 0x74, 0x3e, 0x82, + 0xca, 0xca, 0x3d, 0x83, 0x47, 0xd5, 0x9a, 0x55, 0xcc, 0x97, 0x8c, 0x66, + 0x77, 0x8f, 0x4d, 0x7a, 0xc0, 0x7a, 0x8d, 0xa3, 0x67, 0x46, 0xb6, 0x6e, + 0xb9, 0xbf, 0xdd, 0x9d, 0xbe, 0x4e, 0x94, 0xc2, 0x53, 0xa3, 0x78, 0x75, + 0x39, 0x73, 0x6c, 0xde, 0xd1, 0xbd, 0x45, 0x59, 0x3b, 0x53, 0xa4, 0x73, + 0x86, 0xba, 0x6d, 0x98, 0x41, 0xb3, 0x9d, 0x93, 0x1e, 0xa1, 0x93, 0x57, + 0x9e, 0x4c, 0x68, 0x5e, 0xa9, 0x58, 0xc3, 0xc8, 0xa4, 0xb0, 0x6d, 0x67, + 0x6e, 0x92, 0x70, 0x6d, 0x66, 0xbc, 0x86, 0x60, 0x48, 0x7f, 0xc4, 0x4d, + 0x54, 0x4d, 0x9f, 0x41, 0xc9, 0xca, 0x54, 0x8a, 0xab, 0x69, 0x42, 0x3e, + 0x69, 0xb6, 0x77, 0x50, 0x71, 0xbd, 0x41, 0xcb, 0x66, 0xc2, 0x88, 0x78, + 0x94, 0x7b, 0x6a, 0x74, 0x47, 0xd6, 0x89, 0xce, 0xb6, 0x51, 0xca, 0x80, + 0x44, 0xaa, 0x87, 0x81, 0x77, 0x7b, 0xc6, 0x59, 0xce, 0x6e, 0x5d, 0xbf, + 0x99, 0xdc, 0xa1, 0x6c, 0xad, 0x46, 0x35, 0x39, 0x92, 0xc0, 0x4f, 0x62, + 0x4e, 0x77, 0xb8, 0x30, 0x5d, 0x3f, 0x67, 0xa7, 0x6e, 0xd2, 0x32, 0x4a, + 0x66, 0xb9, 0x95, 0x95, 0xb8, 0x8e, 0xd0, 0xb8, 0x59, 0x8f, 0xbf, 0x69, + 0xc3, 0x90, 0x98, 0xc3, 0xbf, 0x84, 0xa1, 0xc6, 0xa9, 0xa0, 0xaa, 0x5b, + 0x9d, 0x75, 0x74, 0x7b, 0x3e, 0xc7, 0xb0, 0x3f, 0xa2, 0xb6, 0x33, 0xac, + 0x8b, 0x45, 0x88, 0x6b, 0x56, 0x8c, 0x7d, 0x45, 0x6a, 0x38, 0xaa, 0x9a, + 0xb3, 0x65, 0x47, 0x55, 0x8c, 0x33, 0xa1, 0x61, 0x6e, 0x60, 0x28, 0x39, + 0x79, 0xa5, 0xc1, 0x8c, 0x52, 0xca, 0x4b, 0x85, 0x48, 0xbd, 0x67, 0xc0, + 0x41, 0x3d, 0x96, 0x4f, 0xc3, 0xcb, 0xcc, 0x37, 0xc3, 0xa5, 0x50, 0x2e, + 0x85, 0xc9, 0xc0, 0xb0, 0x85, 0x63, 0xa1, 0xd6, 0xac, 0x62, 0x49, 0x9c, + 0xb8, 0xb4, 0xb9, 0x5d, 0x45, 0x7b, 0x5c, 0x5d, 0x3f, 0x89, 0x78, 0xc9, + 0x4c, 0x79, 0x8b, 0x65, 0x8c, 0x4d, 0x74, 0x61, 0x89, 0xac, 0x4a, 0xc0, + 0x42, 0x93, 0x98, 0x64, 0x83, 0x80, 0x5b, 0x64, 0x3f, 0x3f, 0x55, 0xbc, + 0x75, 0xad, 0x6a, 0x45, 0x7e, 0x3a, 0x47, 0x4c, 0x4d, 0x5a, 0xcf, 0x94, + 0xa2, 0x8c, 0x48, 0x9a, 0x47, 0x33, 0xa6, 0xd4, 0xd6, 0xa2, 0x53, 0x74, + 0xcb, 0x60, 0x8a, 0xb8, 0xd0, 0x8f, 0xd4, 0x59, 0x45, 0xca, 0xce, 0xa0, + 0xd1, 0x5b, 0x4b, 0x61, 0xb7, 0x50, 0x61, 0x90, 0xa7, 0x9c, 0x81, 0x97, + 0xa1, 0x3f, 0x56, 0x39, 0xb8, 0x5e, 0xca, 0xb2, 0x9d, 0x42, 0xc0, 0x51, + 0x3c, 0x5a, 0x96, 0x67, 0xc1, 0xa0, 0x78, 0xb9, 0x67, 0x91, 0xb3, 0x3f, + 0xd4, 0xc9, 0xda, 0x9c, 0x72, 0x61, 0xaf, 0x3b, 0x65, 0x71, 0x91, 0x6e, + 0xb6, 0x64, 0x44, 0x7d, 0xaf, 0xcb, 0xd0, 0xd1, 0x46, 0x4a, 0x3b, 0x48, + 0x38, 0x85, 0xb6, 0xc7, 0x3e, 0x85, 0x8b, 0xb2, 0x84, 0x5e, 0xab, 0x77, + 
0x47, 0xbd, 0xba, 0xb2, 0x88, 0x50, 0x49, 0xd9, 0x4f, 0x71, 0xa7, 0x60, + 0xcd, 0xb0, 0x66, 0xa5, 0xb3, 0x83, 0xb1, 0x9b, 0x9a, 0x98, 0xce, 0x59, + 0x48, 0xc5, 0x6c, 0xc8, 0x9d, 0x8c, 0xad, 0x6c, 0x8e, 0x9d, 0xa2, 0xac, + 0x75, 0x92, 0xa1, 0xd8, 0xd0, 0xac, 0x72, 0x98, 0xab, 0x3f, 0xb1, 0x7d, + 0xbf, 0x41, 0x3b, 0xb7, 0xb2, 0xc2, 0x4c, 0x41, 0x36, 0x8e, 0x53, 0xa3, + 0xce, 0x91, 0x2f, 0x96, 0xae, 0x88, 0x9d, 0xbf, 0xc2, 0x91, 0x61, 0x88, + 0x92, 0x9b, 0x93, 0x8d, 0x8f, 0x59, 0x9a, 0xd0, 0xa0, 0x2e, 0xa1, 0xb5, + 0x9b, 0x75, 0x67, 0x68, 0x6b, 0x84, 0x52, 0x60, 0xd5, 0xca, 0x4b, 0x4b, + 0xbe, 0x7a, 0x5c, 0x75, 0x89, 0x33, 0x88, 0x8b, 0x42, 0xa6, 0x57, 0x93, + 0xc2, 0xdb, 0xd1, 0xb9, 0x8c, 0x63, 0x94, 0x35, 0x4b, 0xc7, 0xce, 0x33, + 0xba, 0xd1, 0x65, 0xcb, 0xc2, 0x4b, 0x7c, 0x2e, 0x57, 0x67, 0xae, 0x9f, + 0x84, 0xb2, 0x3a, 0x78, 0xc3, 0xda, 0x84, 0x70, 0x5e, 0x90, 0x8d, 0x47, + 0x86, 0xbc, 0xc1, 0x34, 0x6e, 0xd4, 0x4e, 0x78, 0x7d, 0x9d, 0x3b, 0x69, + 0x9e, 0x8b, 0x67, 0x6e, 0x89, 0x95, 0x47, 0x55, 0x8a, 0x5c, 0xcb, 0xaa, + 0x5a, 0x84, 0x7f, 0xc2, 0x5b, 0x3d, 0xce, 0x83, 0x58, 0xbb, 0xa2, 0x84, + 0xca, 0x50, 0x97, 0xc8, 0x4f, 0x7f, 0x7a, 0x59, 0xc3, 0xa3, 0x80, 0xb1, + 0xa6, 0xbe, 0x4d, 0x63, 0xc4, 0xad, 0x64, 0xc5, 0x6c, 0xc1, 0xbd, 0xa0, + 0x41, 0x66, 0x8f, 0xc2, 0x47, 0xcc, 0xc5, 0x4e, 0x3a, 0xcd, 0x57, 0x78, + 0x96, 0xd2, 0x9f, 0x6d, 0x45, 0x6c, 0x97, 0x82, 0xb0, 0xa6, 0xd5, 0x35, + 0xb2, 0x4b, 0x8d, 0x8c, 0x6e, 0x7e, 0xd1, 0xc5, 0xc8, 0xcb, 0x4c, 0xd5, + 0x3d, 0x58, 0xa5, 0x62, 0x55, 0xc7, 0xbb, 0x95, 0xcf, 0x97, 0x6f, 0xca, + 0x39, 0x74, 0x4a, 0x43, 0x4b, 0x36, 0x94, 0x49, 0xb6, 0x7d, 0x6d, 0x83, + 0xd6, 0x87, 0x41, 0x93, 0x73, 0x5c, 0x3c, 0x8c, 0x5c, 0xbb, 0x9e, 0x7a, + 0x68, 0x50, 0xc2, 0x48, 0x63, 0xd0, 0x4e, 0x8f, 0x4d, 0x40, 0x9c, 0x9d, + 0x63, 0x8e, 0xbe, 0x6e, 0xc5, 0xa9, 0x8c, 0x91, 0xa9, 0xb0, 0x5d, 0x37, + 0x95, 0xa4, 0xb7, 0x6a, 0x4e, 0x3d, 0x58, 0x8a, 0xa9, 0xa5, 0xc5, 0xc1, + 0xab, 0x3d, 0x7d, 0x9c, 0x63, 0x38, 0xb0, 0xc0, 0xbd, 0xc0, 0xba, 0x79, + 0x9b, 0x8a, 0x8b, 0xc3, 0xa7, 0x4e, 0x8c, 0x4f, 0xc6, 0xa3, 0xd2, 0x58, + 0xc8, 0x8a, 0x50, 0x9c, 0x93, 0x4d, 0xad, 0x99, 0x86, 0x93, 0x4d, 0xcc, + 0x9a, 0x84, 0x8d, 0x3c, 0xb2, 0x8f, 0xc1, 0xaa, 0x4b, 0x6f, 0x6d, 0xa2, + 0xa2, 0xcf, 0x38, 0x6d, 0xb1, 0xb1, 0xc1, 0xae, 0x5a, 0x85, 0x4a, 0x7f, + 0x4e, 0x83, 0xc0, 0x8f, 0x93, 0x42, 0xd4, 0xbf, 0x9e, 0x5a, 0x55, 0x9a, + 0x42, 0x8a, 0x46, 0x34, 0x64, 0x6a, 0x61, 0xc9, 0x99, 0xcc, 0x7c, 0x65, + 0x8a, 0x61, 0x3e, 0x42, 0x52, 0x60, 0x8e, 0xa6, 0xaa, 0x71, 0x79, 0xad, + 0x8c, 0xc8, 0x72, 0x39, 0x49, 0x86, 0x6b, 0x90, 0xb0, 0x7d, 0x2f, 0xa8, + 0x80, 0x68, 0x85, 0xc8, 0x3f, 0x2d, 0xa0, 0x94, 0xc6, 0x64, 0x85, 0x8c, + 0x89, 0xbd, 0x43, 0x78, 0x5b, 0xbf, 0x40, 0x91, 0x31, 0xc1, 0x86, 0xc9, + 0x45, 0x7c, 0x72, 0x8e, 0x92, 0x6e, 0x31, 0x77, 0xaf, 0x57, 0x8b, 0x74, + 0x45, 0x64, 0x4d, 0x45, 0x3f, 0xc7, 0x5b, 0xa8, 0xdd, 0x53, 0xc4, 0xd8, + 0x48, 0xb8, 0xd2, 0x92, 0xc0, 0x59, 0x94, 0x6d, 0xb3, 0x4f, 0xbd, 0x57, + 0x83, 0x8d, 0xc8, 0x64, 0x40, 0x3a, 0x5f, 0xae, 0x39, 0xaf, 0x9f, 0xca, + 0x53, 0x6d, 0x80, 0xc9, 0xbb, 0x98, 0x4e, 0xc3, 0xc4, 0x9b, 0x63, 0x4c, + 0xc2, 0xb8, 0x83, 0x8f, 0xa8, 0xd2, 0x77, 0x46, 0x96, 0x6a, 0x50, 0x3d, + 0x8e, 0x76, 0x77, 0x41, 0x32, 0xbc, 0xc1, 0x30, 0x38, 0xbe, 0xd2, 0x8c, + 0x99, 0x82, 0xb5, 0xb6, 0xbd, 0xaf, 0x60, 0xd2, 0x70, 0xcd, 0x3c, 0xc2, + 0x7e, 0xb8, 0x84, 0xaf, 0x8c, 0x62, 0x72, 0x66, 0xca, 0xae, 0x4a, 0x94, + 0x42, 0x3f, 0xb5, 0xce, 0xb2, 0xa3, 0x77, 0xca, 0xbf, 0x9d, 0x6d, 0x9d, + 0xaa, 0x52, 0x8c, 0x7a, 0x37, 0xa8, 0xb7, 0x72, 0x6a, 0xa7, 0x42, 0x3f, + 
0xcf, 0x42, 0x97, 0x96, 0xb1, 0x84, 0x4d, 0xae, 0x94, 0x6e, 0x55, 0x94, + 0x73, 0x57, 0xb5, 0x3a, 0x30, 0x97, 0x48, 0x3f, 0xb5, 0x8d, 0x45, 0x3d, + 0x89, 0x58, 0xa5, 0x62, 0x76, 0x50, 0xb6, 0x8a, 0xd8, 0x94, 0xb7, 0xa7, + 0xd0, 0x4a, 0x7b, 0x73, 0x66, 0x92, 0x99, 0x69, 0x54, 0x58, 0x80, 0xc3, + 0xd2, 0x97, 0xb7, 0x83, 0x36, 0x99, 0x60, 0xd5, 0x7b, 0xa3, 0xa2, 0xd3, + 0xbe, 0x56, 0x49, 0x78, 0x7e, 0x43, 0x64, 0x5a, 0x60, 0x59, 0xce, 0x3f, + 0xc6, 0xb3, 0x9a, 0xce, 0x2d, 0x40, 0x3a, 0x78, 0x76, 0x4d, 0xbc, 0xa5, + 0xa0, 0x99, 0x79, 0x7c, 0x7c, 0x44, 0x56, 0xc6, 0x7a, 0x98, 0x47, 0x94, + 0x83, 0xa3, 0x47, 0x87, 0x74, 0x74, 0x48, 0x8f, 0x95, 0x58, 0x5b, 0xa9, + 0x35, 0xb0, 0x56, 0x94, 0x56, 0x87, 0x57, 0x8f, 0x8c, 0x94, 0xaf, 0x7e, + 0x8a, 0x6f, 0x5e, 0x6c, 0x77, 0xa2, 0xb9, 0x45, 0xa8, 0xa8, 0xb0, 0xae, + 0xcd, 0x69, 0xbc, 0x35, 0x30, 0x53, 0x6f, 0x75, 0xbe, 0xab, 0x3a, 0xc3, + 0xb2, 0xae, 0x6f, 0x44, 0x94, 0xb4, 0xc6, 0xac, 0x71, 0xd3, 0x90, 0x96, + 0xc3, 0x9a, 0x6c, 0xcb, 0x5a, 0x53, 0xc3, 0xad, 0x78, 0xce, 0x34, 0x35, + 0x92, 0xaa, 0x4e, 0x9b, 0xa0, 0x67, 0x95, 0x39, 0x3d, 0xc5, 0x53, 0x6e, + 0x3e, 0x9c, 0xbd, 0xcf, 0x57, 0x5a, 0xa6, 0x5f, 0x70, 0x98, 0x52, 0xb2, + 0x4a, 0x32, 0xbb, 0x51, 0xb3, 0x64, 0xb6, 0x74, 0xa6, 0xcc, 0x48, 0x39, + 0x38, 0x7f, 0x62, 0x46, 0x4a, 0xc8, 0x4a, 0xc9, 0x30, 0xa5, 0x7a, 0x45, + 0xaa, 0x42, 0xba, 0x8e, 0xbd, 0xcf, 0xa6, 0x53, 0x9a, 0x61, 0x53, 0xc4, + 0x5b, 0x6f, 0x51, 0xca, 0x48, 0x9c, 0x4c, 0x5c, 0x91, 0x87, 0xd1, 0x73, + 0x2f, 0x43, 0xbd, 0x5a, 0x4c, 0xb9, 0xb5, 0x5e, 0xae, 0xae, 0x45, 0x75, + 0x75, 0x62, 0x5e, 0x4b, 0x4a, 0xa1, 0x8a, 0x94, 0x48, 0x7b, 0x88, 0x8a, + 0x89, 0x72, 0xb8, 0x38, 0x2e, 0x97, 0xc7, 0x80, 0xa6, 0x6a, 0xa5, 0xb6, + 0x32, 0x61, 0x30, 0x66, 0x7e, 0x94, 0x3e, 0x8c, 0xb0, 0x5d, 0x66, 0x82, + 0x6c, 0xa1, 0x73, 0x2c, 0x3d, 0x92, 0x8e, 0x3d, 0x46, 0x92, 0xa8, 0xcc, + 0xaf, 0x5e, 0x4b, 0x49, 0xaf, 0x7b, 0x83, 0xbd, 0x5e, 0x71, 0x71, 0xc8, + 0x9e, 0xae, 0x39, 0x38, 0x6b, 0x81, 0xb7, 0x8d, 0x5d, 0x5c, 0x6c, 0x39, + 0xbd, 0x38, 0xaf, 0x46, 0xc0, 0xbc, 0xd0, 0x6d, 0xba, 0x5b, 0x6a, 0x5b, + 0x51, 0x57, 0x46, 0x91, 0xa1, 0x44, 0x33, 0x52, 0x44, 0x78, 0x45, 0x41, + 0x93, 0x90, 0x31, 0x6f, 0xb1, 0xcc, 0xd0, 0xce, 0x6d, 0xc0, 0x57, 0xc8, + 0xa3, 0x82, 0x5c, 0xa6, 0x75, 0x5d, 0xa8, 0x7a, 0x47, 0xa8, 0x56, 0x94, + 0xba, 0x35, 0x77, 0x46, 0xa9, 0x97, 0x6f, 0xc0, 0xc0, 0x44, 0x62, 0xac, + 0x9b, 0x64, 0x80, 0x6e, 0x57, 0xb9, 0x88, 0x89, 0xb5, 0xb8, 0x3f, 0x8d, + 0xd1, 0x37, 0x63, 0x76, 0x90, 0xcf, 0x31, 0x76, 0x6f, 0x84, 0x3c, 0xbc, + 0x51, 0xcc, 0x95, 0x48, 0x73, 0x38, 0x39, 0x45, 0x4d, 0x35, 0xa0, 0x8b, + 0x6f, 0x7a, 0x68, 0x9d, 0xcf, 0x38, 0xb8, 0x5d, 0xd1, 0xbd, 0x7f, 0xc3, + 0x37, 0x9b, 0x7a, 0x4d, 0x97, 0x48, 0x54, 0x88, 0xb8, 0x63, 0x8a, 0x42, + 0xaa, 0x2e, 0x97, 0xb4, 0x3d, 0x4e, 0x9f, 0x63, 0x5e, 0xb9, 0xa7, 0xa9, + 0xc2, 0x3e, 0x68, 0xb4, 0x4d, 0x81, 0x62, 0x7e, 0x68, 0x7b, 0x9a, 0xb6, + 0x77, 0x49, 0x3e, 0xa9, 0x52, 0x97, 0xa2, 0x31, 0x5f, 0x9c, 0x95, 0xb4, + 0xa9, 0x65, 0x8b, 0x7f, 0x47, 0x44, 0x48, 0x6a, 0xc3, 0x55, 0x43, 0x55, + 0x50, 0x59, 0x3b, 0x4e, 0xce, 0x8e, 0x56, 0x8f, 0x3a, 0x5b, 0xd1, 0x3f, + 0xc2, 0xca, 0x91, 0x62, 0xa8, 0xb3, 0x83, 0x6c, 0x5e, 0x60, 0xb6, 0xa8, + 0x70, 0x4d, 0x80, 0x9e, 0xcf, 0x90, 0x78, 0xd4, 0xba, 0x73, 0x90, 0xb3, + 0x66, 0xb8, 0xc4, 0x99, 0x54, 0x7c, 0xcc, 0x5b, 0x3b, 0x6f, 0xae, 0x3a, + 0x58, 0xca, 0x91, 0x4e, 0xc5, 0x5b, 0x7d, 0x37, 0x55, 0x40, 0x90, 0x7d, + 0x37, 0x58, 0x84, 0xb4, 0xb9, 0x9a, 0x58, 0xd1, 0x38, 0xad, 0xb1, 0x62, + 0x4c, 0x87, 0x59, 0xc2, 0x89, 0x6f, 0x5a, 0xad, 0x57, 0x73, 0x30, 0xcd, + 
0x54, 0x8f, 0x48, 0x5b, 0x3c, 0x54, 0x56, 0xc8, 0xbb, 0x77, 0x8a, 0x6a, + 0xc8, 0xc8, 0x7e, 0x9d, 0x86, 0xad, 0x8b, 0xc3, 0x8a, 0x38, 0x54, 0x7d, + 0x63, 0x6f, 0x7e, 0x98, 0x31, 0x68, 0x9d, 0x63, 0x33, 0xc0, 0x4d, 0x98, + 0xb7, 0x4c, 0xc1, 0xbf, 0x65, 0x8d, 0x92, 0x47, 0xa4, 0x39, 0x3a, 0x63, + 0x41, 0x6b, 0xaf, 0x63, 0xb7, 0x74, 0xba, 0x43, 0x3c, 0x6e, 0x57, 0x90, + 0xc7, 0x70, 0x96, 0xad, 0x42, 0x95, 0x7e, 0xa0, 0xca, 0x85, 0x41, 0x67, + 0x43, 0x43, 0x86, 0xab, 0xc9, 0x68, 0xb3, 0xb2, 0x59, 0x53, 0x92, 0xb8, + 0xc0, 0xc9, 0x6e, 0x84, 0xc0, 0xa3, 0x37, 0x47, 0x80, 0x37, 0x38, 0x86, + 0x63, 0x47, 0x9c, 0x30, 0x9b, 0xc2, 0xc9, 0xae, 0x97, 0x53, 0x85, 0x44, + 0x6c, 0x94, 0xa4, 0x3c, 0xa1, 0xb0, 0x77, 0x4c, 0x9f, 0x7a, 0x6b, 0x48, + 0xbd, 0x37, 0xa4, 0xaa, 0x71, 0x84, 0x32, 0x35, 0x5b, 0x89, 0xc9, 0x8e, + 0x90, 0x3d, 0x37, 0x90, 0x6a, 0xc1, 0x41, 0x88, 0xc1, 0x59, 0xb9, 0x84, + 0xb0, 0xa9, 0x8f, 0x3d, 0x5f, 0x8d, 0xaa, 0x37, 0xd0, 0xa6, 0x6f, 0x7c, + 0x69, 0x58, 0xba, 0xce, 0x6c, 0x64, 0x7a, 0x35, 0xb1, 0x35, 0xa3, 0xb1, + 0x85, 0xa2, 0x9f, 0xcc, 0xd0, 0xbc, 0x78, 0x5c, 0x7b, 0x42, 0x96, 0x59, + 0x44, 0x96, 0xc9, 0x94, 0xcc, 0x7c, 0x77, 0x3b, 0x30, 0xa8, 0x71, 0xb0, + 0xb6, 0x2b, 0x3a, 0x47, 0x54, 0x79, 0xbc, 0xa8, 0xcb, 0x71, 0xcb, 0xb1, + 0x3d, 0x97, 0x7f, 0x48, 0xce, 0x81, 0x9c, 0x53, 0xc4, 0x47, 0x8a, 0x2f, + 0xa5, 0x69, 0x6c, 0x94, 0x75, 0x47, 0x74, 0xb0, 0x96, 0x32, 0xca, 0x56, + 0xba, 0x47, 0x57, 0xaa, 0xca, 0x41, 0xa9, 0xc7, 0xc0, 0x76, 0x7b, 0x2f, + 0x51, 0x88, 0x8e, 0x41, 0xa8, 0x61, 0x38, 0x7e, 0x75, 0x41, 0x47, 0x46, + 0x6c, 0x31, 0x43, 0x39, 0xb8, 0x6e, 0x33, 0xb7, 0x62, 0x49, 0x86, 0xa5, + 0x77, 0x8c, 0xa2, 0x63, 0xd1, 0xa0, 0xba, 0x85, 0x6f, 0x4b, 0xb3, 0x4e, + 0x54, 0x78, 0xbd, 0xaf, 0x7d, 0x90, 0x6a, 0xd0, 0xca, 0xc9, 0x39, 0xab, + 0xb8, 0x57, 0x8b, 0x5e, 0x94, 0x9d, 0x7f, 0xcf, 0x79, 0x59, 0xbe, 0x38, + 0xcc, 0xae, 0xc7, 0xbe, 0xd0, 0x9d, 0x6b, 0xb8, 0x75, 0xb6, 0xbc, 0xa2, + 0x5b, 0xa7, 0x73, 0x4f, 0x63, 0xc4, 0x8d, 0xc6, 0x9b, 0x75, 0x91, 0xc3, + 0x52, 0x8b, 0x95, 0x51, 0x46, 0x7f, 0x5f, 0x92, 0xbc, 0x68, 0xc3, 0xaa, + 0x7b, 0xc2, 0x3a, 0xbf, 0x4e, 0xd9, 0x60, 0x3e, 0x7b, 0xb1, 0x7d, 0xc5, + 0x4a, 0x82, 0xaa, 0x9e, 0x57, 0x46, 0x5e, 0x40, 0xbe, 0xae, 0xbe, 0x3f, + 0xd7, 0x95, 0x3c, 0xab, 0x73, 0xc5, 0xd5, 0x89, 0x8c, 0xdc, 0x8a, 0x55, + 0x3c, 0xa7, 0x59, 0x78, 0x55, 0x53, 0x91, 0x6a, 0x47, 0x90, 0x9a, 0x3b, + 0xa2, 0xc2, 0x51, 0x7a, 0xb0, 0x5e, 0xa4, 0xc5, 0x48, 0x9a, 0x9c, 0x58, + 0x5c, 0x6b, 0x2f, 0xb6, 0xc0, 0xb8, 0x4d, 0x9a, 0xb7, 0x82, 0x44, 0x72, + 0xa0, 0xc9, 0x9e, 0xa3, 0xd9, 0x99, 0x2e, 0xa8, 0xc7, 0xaf, 0x97, 0x54, + 0x8c, 0x6d, 0xc7, 0x51, 0x77, 0x7a, 0xa1, 0x94, 0x39, 0xc9, 0xb9, 0xc5, + 0x78, 0xbc, 0xdc, 0xb4, 0xd4, 0xbd, 0x71, 0x3f, 0x31, 0x7b, 0x9f, 0xc6, + 0xbe, 0xb1, 0xa1, 0xdc, 0x78, 0x32, 0x96, 0x8c, 0x46, 0x7d, 0x45, 0x43, + 0x50, 0x42, 0x63, 0xc1, 0x55, 0x8c, 0xc1, 0x41, 0xc1, 0x99, 0xa2, 0xbb, + 0xbc, 0xc7, 0x45, 0x6a, 0x62, 0xc6, 0x79, 0xc2, 0x89, 0xcb, 0x5c, 0xa7, + 0xc9, 0x31, 0xa4, 0x51, 0xbc, 0xa8, 0x3d, 0x3b, 0x3f, 0x39, 0xbe, 0xb6, + 0x66, 0x3f, 0x68, 0x64, 0x6d, 0x2e, 0xb6, 0xcc, 0x7c, 0x6d, 0xa7, 0x89, + 0xd8, 0x63, 0x8d, 0x34, 0x2c, 0x9b, 0x9e, 0x44, 0xcf, 0xd6, 0xba, 0x8f, + 0x33, 0xa1, 0x94, 0x5d, 0x79, 0x97, 0x30, 0x85, 0xa2, 0x64, 0x7c, 0xa4, + 0xa5, 0x50, 0x8e, 0x6b, 0xa6, 0x33, 0x7e, 0x82, 0xb0, 0xc5, 0x6d, 0x5c, + 0xc5, 0x82, 0xc4, 0x4d, 0x2c, 0x34, 0x4d, 0xc0, 0x3d, 0x48, 0x44, 0xa9, + 0x36, 0x48, 0x87, 0x88, 0xc2, 0x87, 0x36, 0xa4, 0x4d, 0x96, 0x5f, 0x8c, + 0xc4, 0x43, 0x94, 0x75, 0x9d, 0x85, 0x5e, 0x67, 0xcc, 0x79, 0xa6, 0xc6, + 
0xc7, 0x9c, 0xbe, 0x62, 0xa6, 0xcf, 0x39, 0x88, 0x78, 0xa6, 0xce, 0xab, + 0xbe, 0x8e, 0x97, 0x29, 0x66, 0x5b, 0xa1, 0x46, 0xb0, 0x60, 0x75, 0xcc, + 0x82, 0x84, 0x47, 0x70, 0x85, 0x71, 0x7f, 0x44, 0x2c, 0x88, 0xb2, 0xae, + 0xc2, 0x52, 0xc4, 0xba, 0x3f, 0x58, 0xd5, 0x48, 0x58, 0x4e, 0x56, 0x6f, + 0xb8, 0x41, 0xb3, 0x42, 0x88, 0xd1, 0xa4, 0xc0, 0x5d, 0x6b, 0x4d, 0xc8, + 0xa8, 0xae, 0x5d, 0x74, 0x9a, 0x9e, 0x7f, 0x84, 0xcf, 0x56, 0x8e, 0xc5, + 0xae, 0x66, 0x7a, 0x82, 0x75, 0x47, 0x60, 0xac, 0x5b, 0x44, 0x74, 0x85, + 0xb1, 0x82, 0xa7, 0x5c, 0x50, 0xb7, 0xd2, 0x9b, 0x85, 0xbc, 0xe9, 0x67, + 0x91, 0x56, 0x92, 0x5b, 0xae, 0x42, 0xa2, 0xc4, 0x54, 0xc2, 0xb1, 0x3a, + 0xae, 0x9a, 0x29, 0x3a, 0x92, 0xd9, 0x3b, 0xd0, 0x77, 0xcc, 0x70, 0xb4, + 0x3d, 0x73, 0x98, 0xc5, 0x59, 0xab, 0x4a, 0xc6, 0xac, 0xac, 0x81, 0xb2, + 0x36, 0xb6, 0x8e, 0x39, 0x47, 0xb0, 0x4b, 0xd3, 0xc4, 0x55, 0xb4, 0x81, + 0x7e, 0x8a, 0x6b, 0x3b, 0xb4, 0xaf, 0xa8, 0xbe, 0x4b, 0x52, 0xb7, 0xcd, + 0x5b, 0x5f, 0x41, 0xd6, 0x61, 0xbb, 0xc8, 0x33, 0x50, 0xb8, 0x6a, 0xaa, + 0x3c, 0x5f, 0x75, 0x9a, 0x44, 0x3c, 0x83, 0x61, 0x64, 0x9e, 0x57, 0xc0, + 0x9a, 0x39, 0x69, 0x7e, 0xa8, 0x52, 0x86, 0x4d, 0x63, 0xb9, 0x79, 0x60, + 0x8d, 0xcf, 0xa7, 0x8d, 0xb9, 0x56, 0x74, 0x85, 0xb0, 0x39, 0x66, 0xbf, + 0x47, 0x5c, 0x8f, 0x4b, 0x99, 0x46, 0xac, 0x85, 0xcd, 0xcf, 0x9e, 0x5b, + 0x89, 0x52, 0x72, 0xb6, 0x79, 0x8e, 0x67, 0x54, 0x64, 0xc7, 0xd0, 0x69, + 0x78, 0x3c, 0xd3, 0xb4, 0xd0, 0x61, 0x70, 0x89, 0x7c, 0x88, 0x53, 0x69, + 0x92, 0x43, 0xbf, 0x4a, 0x55, 0xd1, 0x6f, 0xbf, 0x3b, 0x4f, 0x4e, 0x7d, + 0x64, 0xc5, 0x5b, 0xac, 0x71, 0x36, 0x68, 0xc8, 0x77, 0x3b, 0x58, 0x4e, + 0x3b, 0x7e, 0xcb, 0x9d, 0x49, 0x37, 0xc9, 0xb3, 0xbb, 0x51, 0xcd, 0x42, + 0x77, 0x39, 0xa3, 0x53, 0x9d, 0x7d, 0x2b, 0x53, 0x9d, 0xd5, 0x64, 0xc5, + 0x59, 0x97, 0xb6, 0x4a, 0x73, 0xb1, 0x4f, 0xcb, 0x8f, 0x8e, 0x71, 0x6e, + 0xb3, 0x5f, 0x5a, 0xc8, 0xb5, 0xb4, 0xa6, 0xc0, 0x9b, 0xbe, 0x64, 0x98, + 0x6b, 0x63, 0x91, 0x7d, 0x70, 0x7b, 0x62, 0x82, 0x56, 0x89, 0xa2, 0x5d, + 0xa6, 0xa3, 0x66, 0x8b, 0xc3, 0x7a, 0x57, 0xab, 0xbf, 0x3c, 0x60, 0x80, + 0x2a, 0x38, 0x6d, 0x84, 0x3e, 0x37, 0x80, 0xb1, 0x7a, 0x94, 0x8b, 0xc1, + 0xc5, 0x38, 0x64, 0x49, 0x7b, 0xc0, 0x9e, 0x9c, 0x5c, 0x85, 0xc2, 0x4e, + 0x77, 0xa8, 0x4e, 0x35, 0x40, 0x90, 0x3a, 0x95, 0x87, 0xaf, 0x2f, 0xb8, + 0x32, 0x68, 0x5d, 0x87, 0x63, 0x95, 0x50, 0x59, 0x82, 0x46, 0x75, 0x9c, + 0x91, 0x40, 0x3b, 0xb6, 0x3c, 0xd8, 0xd6, 0xc6, 0x73, 0x8e, 0xc8, 0x4f, + 0xcc, 0x72, 0xba, 0x96, 0xc5, 0x6e, 0x3e, 0xcd, 0x54, 0xb3, 0x57, 0x8b, + 0x3e, 0xa6, 0x4a, 0x8a, 0xc8, 0x65, 0xac, 0x2f, 0x5b, 0x65, 0xa1, 0x9b, + 0xba, 0xd1, 0x50, 0x37, 0xb6, 0x41, 0xc1, 0x55, 0x8e, 0xc2, 0xac, 0xaf, + 0x93, 0xc9, 0x83, 0x8d, 0x3a, 0x4d, 0x64, 0xcb, 0xa0, 0x92, 0x71, 0x9c, + 0x5a, 0xbc, 0xb9, 0xcb, 0x42, 0x50, 0x53, 0x4d, 0xa8, 0x7b, 0x52, 0x7e, + 0x46, 0x9e, 0x3a, 0xc5, 0x9c, 0x99, 0x57, 0xaa, 0x53, 0x41, 0xce, 0xce, + 0x8a, 0x5e, 0xa4, 0x81, 0x38, 0x53, 0x92, 0x85, 0x62, 0x52, 0x34, 0x82, + 0x4e, 0x4e, 0x58, 0x73, 0x69, 0xb8, 0x3e, 0xc0, 0x9f, 0xcb, 0xd1, 0x6c, + 0x34, 0x4a, 0xd9, 0x92, 0x7b, 0xc7, 0x9c, 0x3c, 0x7d, 0xa4, 0xa0, 0x5b, + 0x6d, 0xce, 0x95, 0x33, 0x66, 0xc7, 0x64, 0x3a, 0x30, 0x96, 0x9b, 0xa3, + 0xba, 0xa5, 0xb9, 0x3d, 0x4f, 0x6e, 0xdd, 0x83, 0xa2, 0xaf, 0x6f, 0xc8, + 0x56, 0x59, 0xc8, 0x85, 0x7e, 0xe7, 0xc6, 0x95, 0x58, 0x77, 0x92, 0x4f, + 0x97, 0x8e, 0xac, 0x66, 0x67, 0x47, 0x46, 0x4e, 0x77, 0xa3, 0x86, 0x6d, + 0x62, 0xa8, 0x73, 0xc9, 0x51, 0x96, 0xa5, 0x7e, 0xdb, 0x53, 0x31, 0xaa, + 0x36, 0x98, 0x37, 0x93, 0x79, 0xb6, 0x8a, 0x78, 0x46, 0x54, 0x49, 0x55, + 
0x3e, 0x5b, 0xc3, 0xae, 0x43, 0x6d, 0xb6, 0xad, 0xbc, 0xa8, 0x32, 0xaa, + 0x83, 0xca, 0x4a, 0x3c, 0x3b, 0x4f, 0xa4, 0x71, 0xcf, 0x40, 0x41, 0x77, + 0x66, 0xc9, 0x48, 0x92, 0x4c, 0xaf, 0x80, 0x6e, 0xda, 0x5c, 0xd8, 0x9e, + 0x7f, 0xb2, 0x4f, 0x48, 0x94, 0x3a, 0x5c, 0x71, 0x65, 0xa4, 0x95, 0x6d, + 0x77, 0x8d, 0xbf, 0x6e, 0xc0, 0x48, 0x80, 0xae, 0xb6, 0x5d, 0x9a, 0xd1, + 0xd1, 0x92, 0x7e, 0x73, 0xcb, 0x80, 0xbc, 0x55, 0x57, 0x9b, 0xab, 0x74, + 0x6b, 0xc7, 0x8d, 0xbd, 0x4e, 0x48, 0xc4, 0x5e, 0x87, 0x45, 0x6c, 0xa7, + 0x6d, 0xce, 0x8e, 0x83, 0xb9, 0x94, 0x71, 0x86, 0x7f, 0xa8, 0xc1, 0x60, + 0x5a, 0x92, 0x7e, 0xbf, 0x57, 0x8a, 0xa0, 0xdd, 0x3d, 0x73, 0xc7, 0x62, + 0x39, 0x8c, 0x71, 0x49, 0xa8, 0x9b, 0x43, 0x5c, 0x6b, 0xa3, 0xd0, 0xc7, + 0xba, 0x3c, 0x88, 0x7c, 0x4f, 0x90, 0x47, 0x72, 0xb9, 0x77, 0x41, 0xe1, + 0x1f, 0x88, 0x80, 0xc7, 0x97, 0x57, 0xbc, 0x87, 0x3a, 0x38, 0x8b, 0x42, + 0x70, 0x3e, 0x3d, 0x1b, 0x51, 0x69, 0x45, 0xbf, 0x55, 0xbe, 0x4c, 0x4a, + 0xc8, 0x5d, 0xc8, 0x7a, 0x67, 0x80, 0xbb, 0xa4, 0x42, 0x6c, 0x60, 0x79, + 0xb3, 0x75, 0x6d, 0x30, 0x90, 0xc8, 0xaa, 0x3f, 0x41, 0xbc, 0x3a, 0x21, + 0xce, 0x64, 0x58, 0x37, 0x9e, 0xd3, 0xcf, 0x67, 0x9b, 0x7f, 0x7a, 0x3f, + 0x8c, 0x39, 0xab, 0x30, 0x62, 0x7f, 0x51, 0x63, 0xae, 0xae, 0xaa, 0x33, + 0xcb, 0x7f, 0x76, 0x33, 0x55, 0x93, 0x6d, 0x95, 0xb9, 0x38, 0x5f, 0x60, + 0x98, 0x9a, 0x54, 0x5b, 0xbe, 0xc6, 0xa5, 0xc0, 0x32, 0xd7, 0x32, 0x3d, + 0xa7, 0x50, 0xb4, 0x58, 0x3d, 0xca, 0x4b, 0xd5, 0x72, 0xb1, 0x8e, 0x42, + 0x98, 0xdd, 0xb5, 0xc9, 0xa5, 0x92, 0xc8, 0x8f, 0x3b, 0x6a, 0xae, 0x45, + 0x3d, 0x45, 0x6e, 0x62, 0xa3, 0x8d, 0x60, 0x79, 0x84, 0xbf, 0x51, 0x9f, + 0x5e, 0xbb, 0x80, 0xac, 0xc9, 0xa5, 0x87, 0x5b, 0xbb, 0x95, 0xcb, 0xb7, + 0x77, 0xd1, 0xb4, 0x7c, 0x7b, 0x6f, 0x3f, 0x50, 0xc3, 0x8e, 0xaa, 0xa8, + 0xd1, 0x64, 0xd4, 0x9e, 0x9b, 0x3c, 0xc2, 0x59, 0x68, 0x91, 0xbe, 0x7f, + 0x43, 0xa7, 0x83, 0x38, 0x8c, 0x6d, 0x71, 0xa1, 0x94, 0x96, 0x5a, 0x84, + 0x9a, 0x7a, 0xb9, 0x41, 0x9f, 0x8b, 0x63, 0x9b, 0xa3, 0xde, 0x68, 0x7f, + 0xd3, 0xa9, 0x51, 0x9b, 0x7d, 0x58, 0xa9, 0x2d, 0x6f, 0x86, 0xbe, 0x4a, + 0x6a, 0x4c, 0xd2, 0x85, 0x94, 0x81, 0x90, 0x4e, 0xa1, 0x5f, 0x64, 0x62, + 0xbb, 0x6d, 0x44, 0x46, 0x43, 0x8d, 0x85, 0x37, 0xa9, 0x79, 0xa6, 0xcb, + 0xa7, 0xae, 0x96, 0x6f, 0xb0, 0x88, 0x3a, 0xc7, 0xa5, 0xc3, 0x49, 0xc1, + 0xc5, 0xb5, 0xa2, 0x81, 0x5f, 0x3e, 0x96, 0x63, 0x44, 0x69, 0x8f, 0x56, + 0xb8, 0xa0, 0x9d, 0x43, 0xd2, 0x45, 0x40, 0x6d, 0x73, 0xcf, 0x3c, 0x3c, + 0x97, 0xb1, 0xcf, 0x58, 0x61, 0xac, 0xc9, 0x86, 0x42, 0x70, 0x6b, 0xc1, + 0xc8, 0xc4, 0xd9, 0xd1, 0xb9, 0x87, 0x9b, 0x39, 0x74, 0xba, 0x57, 0xd6, + 0x35, 0x66, 0x9f, 0x57, 0x97, 0xd0, 0xba, 0xca, 0x87, 0x79, 0x7a, 0x97, + 0x21, 0x5a, 0x9b, 0xb2, 0x5d, 0x7e, 0x91, 0x46, 0x74, 0xca, 0xc5, 0x91, + 0x76, 0x90, 0x9a, 0xc6, 0x5a, 0x77, 0xd3, 0x77, 0x86, 0x51, 0x65, 0xc6, + 0xc9, 0xc7, 0x2f, 0x61, 0x5c, 0x71, 0x9a, 0x72, 0x9d, 0x4c, 0x64, 0x73, + 0x51, 0x37, 0x5a, 0x2e, 0x71, 0xa1, 0xb7, 0x4e, 0xb4, 0x41, 0xad, 0x84, + 0x58, 0x7b, 0x3a, 0x5e, 0xbb, 0xb2, 0x84, 0x98, 0xd0, 0x6b, 0x87, 0x41, + 0x3f, 0x85, 0x44, 0x89, 0x98, 0xba, 0x3f, 0x50, 0x65, 0x5b, 0x3e, 0x9b, + 0x9f, 0x61, 0x60, 0x6e, 0x66, 0x90, 0x7f, 0x8a, 0x96, 0x59, 0x60, 0x61, + 0xc2, 0xd1, 0x57, 0x56, 0x92, 0x32, 0x92, 0xb6, 0xab, 0x5f, 0x52, 0x2e, + 0x97, 0x72, 0x4c, 0x51, 0x92, 0x8b, 0x78, 0xa2, 0xb0, 0x39, 0x51, 0x3c, + 0xbf, 0x52, 0x62, 0xd2, 0xb5, 0x7b, 0x3d, 0x8b, 0x85, 0x37, 0xa8, 0xac, + 0xc7, 0x9e, 0x6e, 0x52, 0x34, 0x9c, 0x40, 0x69, 0x62, 0x86, 0x7b, 0x9d, + 0x82, 0x54, 0x64, 0x82, 0x3c, 0x38, 0x61, 0x52, 0xd5, 0xb1, 0x52, 0xcb, + 
0x86, 0xbd, 0xd1, 0xe1, 0x46, 0xc0, 0xb9, 0x91, 0x77, 0x6f, 0x49, 0x63, + 0x3f, 0xae, 0xbf, 0xd7, 0x4b, 0x6a, 0x71, 0xd3, 0x9c, 0x50, 0xc4, 0xd8, + 0x67, 0xae, 0x97, 0x59, 0x7d, 0x48, 0x4d, 0x8e, 0xba, 0x8a, 0xd5, 0x58, + 0xa0, 0x76, 0x67, 0xb6, 0x8b, 0x7c, 0x4b, 0x82, 0x63, 0xcb, 0xa0, 0x82, + 0xa1, 0x58, 0x6d, 0x67, 0x66, 0xb9, 0x7c, 0x94, 0x6e, 0x60, 0xb9, 0xa5, + 0xb2, 0xa2, 0x9e, 0x99, 0x71, 0x56, 0x96, 0x2d, 0x43, 0x6d, 0x56, 0x56, + 0xd2, 0x97, 0x3b, 0x7a, 0x54, 0x87, 0x3c, 0x2e, 0x51, 0x7b, 0xac, 0x91, + 0x3e, 0xbe, 0xc7, 0x70, 0x8c, 0x6d, 0xae, 0xd8, 0x93, 0x58, 0xac, 0x55, + 0x3d, 0x5c, 0xb3, 0x90, 0x5c, 0x9d, 0x34, 0xb1, 0x9f, 0x96, 0x4b, 0x70, + 0xbb, 0x9e, 0xc8, 0x59, 0x4e, 0x6a, 0x55, 0x8a, 0x59, 0x8e, 0x48, 0x9b, + 0x60, 0xa7, 0x6a, 0x9e, 0xbc, 0x9c, 0xd2, 0x83, 0x49, 0xa6, 0x8f, 0x8a, + 0xb8, 0xa0, 0x5a, 0x65, 0xa8, 0xb7, 0x71, 0x90, 0x4a, 0x5e, 0x43, 0x7b, + 0xb9, 0x67, 0x79, 0x97, 0xc7, 0xc1, 0x83, 0x3a, 0x66, 0xac, 0xa3, 0x8a, + 0x43, 0xcc, 0x36, 0x4d, 0xa9, 0x94, 0x39, 0x2c, 0xa9, 0x7e, 0xc3, 0x37, + 0xc8, 0x4a, 0x86, 0xc4, 0x81, 0x41, 0x1c, 0x28, 0x66, 0xb6, 0x70, 0x69, + 0x45, 0xa8, 0xa8, 0x7c, 0xc5, 0x6e, 0xc3, 0x5f, 0x65, 0xb0, 0x5a, 0x77, + 0x6e, 0x8c, 0x65, 0x89, 0x62, 0x9a, 0xc9, 0x6c, 0xb7, 0xaa, 0xc3, 0x2e, + 0x3e, 0x5c, 0x8e, 0x81, 0x89, 0x9b, 0xa8, 0x56, 0x80, 0x8c, 0x85, 0xc6, + 0x8b, 0xc7, 0x44, 0x64, 0x2e, 0xc8, 0xa5, 0x84, 0x83, 0x47, 0x93, 0xaf, + 0x40, 0x7b, 0x5a, 0x99, 0x5c, 0x48, 0xa6, 0xb7, 0x61, 0x42, 0x6d, 0x5a, + 0xd6, 0x98, 0x52, 0x66, 0xa5, 0xbe, 0xab, 0x5f, 0x56, 0x51, 0x6d, 0xc4, + 0xcb, 0x56, 0x77, 0x3b, 0x58, 0x98, 0x47, 0xa4, 0x50, 0xa4, 0x3c, 0x57, + 0xb3, 0xa8, 0x3d, 0x87, 0x36, 0xa7, 0x60, 0xb2, 0x6f, 0x44, 0xcc, 0xb5, + 0x62, 0xae, 0xa0, 0x8d, 0x9c, 0x91, 0x98, 0x7a, 0x5a, 0x84, 0x58, 0xd4, + 0xbb, 0x43, 0xaa, 0x80, 0x3b, 0x53, 0xbe, 0xd4, 0x6b, 0x7d, 0x83, 0x3e, + 0xd6, 0x6e, 0x8e, 0xb3, 0xae, 0xa5, 0x50, 0x88, 0x57, 0x99, 0xbd, 0x49, + 0xaa, 0x41, 0xc0, 0xb2, 0x3c, 0xb4, 0x9a, 0x7f, 0x6f, 0x9b, 0x33, 0x6e, + 0xa1, 0x7e, 0x4b, 0x5d, 0x47, 0xa9, 0x9f, 0x92, 0x8b, 0x60, 0x83, 0x36, + 0x30, 0x87, 0x9b, 0xcb, 0x3e, 0x58, 0x4e, 0xbf, 0x9b, 0x48, 0xa7, 0xbe, + 0x6d, 0xc7, 0x4b, 0x93, 0xa7, 0x2b, 0x54, 0xce, 0x94, 0xb5, 0x3c, 0x71, + 0xd2, 0xcc, 0x9e, 0x6c, 0xb5, 0x34, 0x6d, 0x5e, 0x3d, 0x42, 0xa1, 0xd1, + 0xa8, 0x77, 0x45, 0x6c, 0xa7, 0xa5, 0xd4, 0x9d, 0xa6, 0xc4, 0x54, 0x49, + 0xbe, 0xc5, 0xad, 0x7e, 0xad, 0x8e, 0x65, 0xce, 0xc8, 0x62, 0xcd, 0xa0, + 0x30, 0xbf, 0x3a, 0x67, 0xa6, 0x71, 0x87, 0x93, 0x8b, 0xa5, 0x4b, 0x6f, + 0x49, 0xba, 0x58, 0xb3, 0x6d, 0x94, 0x5b, 0xa2, 0x6c, 0xc8, 0x97, 0x49, + 0x41, 0xc2, 0xc1, 0x46, 0x7d, 0x5c, 0xa0, 0x6a, 0x94, 0x38, 0x5f, 0xc1, + 0xa0, 0x41, 0xc0, 0x96, 0xcb, 0xa2, 0x8e, 0x77, 0x8c, 0x33, 0xab, 0x4a, + 0x89, 0x34, 0x70, 0x6a, 0x3e, 0xb8, 0xbe, 0xaa, 0x67, 0xbd, 0x79, 0x85, + 0x6f, 0xc9, 0xa4, 0x82, 0x73, 0xbf, 0x97, 0x47, 0x35, 0x4f, 0xc8, 0x7c, + 0xca, 0x62, 0x36, 0x4f, 0xa0, 0xa7, 0x8d, 0x91, 0x5e, 0x34, 0x8e, 0xd6, + 0x5d, 0x36, 0x95, 0x7b, 0xb3, 0x7a, 0x62, 0x48, 0x6f, 0xbc, 0x77, 0xbf, + 0x56, 0x94, 0x69, 0x85, 0x5b, 0x2f, 0x6e, 0xcc, 0x46, 0xd4, 0x6c, 0xb8, + 0x6b, 0xb4, 0xa5, 0x4c, 0xa0, 0x41, 0xab, 0xa2, 0x5b, 0x66, 0x3b, 0x94, + 0xd1, 0x9a, 0xb6, 0x66, 0xab, 0xa2, 0x2e, 0xb7, 0xc1, 0x65, 0xab, 0x45, + 0x7d, 0x6a, 0x44, 0x89, 0x40, 0x2e, 0x8d, 0x93, 0x24, 0x86, 0xbd, 0x65, + 0x6c, 0xd3, 0x51, 0xc2, 0x4b, 0x96, 0xc4, 0x45, 0x50, 0xdb, 0xa6, 0x61, + 0x80, 0xca, 0xc0, 0x5d, 0x35, 0x5c, 0x75, 0x33, 0xc0, 0xdd, 0xa5, 0x5f, + 0x30, 0xcd, 0xa3, 0xd1, 0x79, 0x95, 0xba, 0xb8, 0x7d, 0xa8, 0x93, 0xcf, + 
0xa3, 0x4d, 0x69, 0xb9, 0x4e, 0x6e, 0x4f, 0x7e, 0xd1, 0x4f, 0x8b, 0x3b, + 0xc0, 0x77, 0x99, 0x8a, 0x86, 0xb9, 0x9d, 0xd0, 0xbf, 0x4e, 0x4d, 0x79, + 0xa8, 0x69, 0x82, 0xcf, 0x91, 0x87, 0x45, 0x87, 0xca, 0x4c, 0xaa, 0x5b, + 0x70, 0x9b, 0xca, 0x86, 0x41, 0x46, 0x4d, 0x85, 0x7c, 0x95, 0xaa, 0x88, + 0xb1, 0x4c, 0xbb, 0x48, 0xba, 0xb1, 0x89, 0x97, 0x79, 0x70, 0x4f, 0x44, + 0xcd, 0x31, 0x7e, 0x57, 0x55, 0x7b, 0x63, 0xd1, 0x8b, 0xd1, 0x87, 0x5e, + 0x5b, 0xb3, 0x91, 0xb3, 0x97, 0x69, 0x70, 0xca, 0x70, 0x50, 0x46, 0xca, + 0x31, 0x39, 0x7c, 0xc4, 0xbf, 0x82, 0xaf, 0xcd, 0x6e, 0x9a, 0xa1, 0x53, + 0x43, 0xb1, 0x87, 0xa6, 0xa8, 0xd5, 0xbf, 0x9c, 0xa7, 0xab, 0xc5, 0x8d, + 0x4c, 0x59, 0x3a, 0x5d, 0x7e, 0x3b, 0x48, 0xc6, 0xb3, 0xb0, 0x9f, 0x70, + 0xca, 0x7c, 0x42, 0x6a, 0xa2, 0xba, 0x3a, 0xba, 0x67, 0x60, 0xab, 0xa0, + 0x58, 0xb3, 0x96, 0xbc, 0x4d, 0x3d, 0x40, 0xcb, 0xa1, 0x67, 0x59, 0x49, + 0xa4, 0xc4, 0x33, 0x7d, 0x7a, 0x3e, 0x5d, 0xa4, 0x36, 0x75, 0x86, 0x54, + 0x3a, 0xa6, 0x8b, 0x6d, 0x45, 0x84, 0xb7, 0x40, 0x8e, 0x59, 0x71, 0x32, + 0xc1, 0x9b, 0xb6, 0x97, 0x7a, 0x86, 0x9f, 0x49, 0xc9, 0x97, 0x8c, 0x5e, + 0x45, 0x4b, 0x77, 0xd4, 0xb2, 0xb6, 0x53, 0x9c, 0x4e, 0xcb, 0x4d, 0xcf, + 0xad, 0x86, 0x42, 0x6e, 0x81, 0x5f, 0x6d, 0xa7, 0x85, 0xbe, 0x9b, 0xd5, + 0x87, 0x34, 0x7c, 0x54, 0xbd, 0x54, 0xa4, 0x5a, 0x41, 0x80, 0x4d, 0x91, + 0x5f, 0x6b, 0x40, 0xa3, 0x2e, 0xb8, 0x90, 0x94, 0x6b, 0xb2, 0x8d, 0x3b, + 0x89, 0xd0, 0xd5, 0xe0, 0x47, 0x8e, 0xbb, 0x99, 0x5f, 0x3b, 0x7a, 0xc7, + 0x2c, 0x9b, 0x55, 0x3c, 0xcc, 0x71, 0x66, 0x44, 0xa5, 0xb8, 0xa6, 0x8a, + 0xcc, 0x3f, 0x7a, 0xd3, 0x53, 0x91, 0x75, 0xcb, 0x76, 0x90, 0x49, 0x46, + 0x86, 0x7c, 0x5f, 0xd3, 0x8f, 0x86, 0x3e, 0x9b, 0xce, 0x53, 0x65, 0x97, + 0xb1, 0x38, 0x40, 0x78, 0xaf, 0x82, 0x91, 0xbb, 0x83, 0x53, 0x97, 0x68, + 0xc7, 0xbf, 0x9c, 0x89, 0x9c, 0x3c, 0x5c, 0x3c, 0x4c, 0x7d, 0x91, 0x40, + 0x53, 0xa2, 0x82, 0x51, 0x3e, 0x37, 0x97, 0x6d, 0xab, 0x51, 0x55, 0x6e, + 0xa1, 0xd2, 0x8c, 0x6c, 0xd9, 0x41, 0x5a, 0x57, 0x6c, 0x53, 0x42, 0x8a, + 0x79, 0x53, 0x48, 0xc0, 0x67, 0x4c, 0xaf, 0x48, 0x56, 0x3e, 0x51, 0x91, + 0x5c, 0xa0, 0xcf, 0x61, 0x52, 0xa3, 0x90, 0x37, 0x4f, 0x47, 0x87, 0x88, + 0x46, 0xb9, 0x37, 0x70, 0xad, 0xcc, 0x81, 0xc5, 0xd6, 0x66, 0x46, 0x40, + 0xb1, 0xd0, 0x5d, 0x43, 0x88, 0x6d, 0x3d, 0x9b, 0x32, 0xac, 0x9e, 0x46, + 0xd4, 0xc6, 0x52, 0x57, 0x98, 0x85, 0xb8, 0xa1, 0x7d, 0x9f, 0x70, 0x86, + 0x92, 0xd4, 0xae, 0xa4, 0x56, 0xa9, 0x4c, 0xb9, 0x48, 0x5a, 0x60, 0x3f, + 0x3f, 0x67, 0xca, 0xd4, 0xc6, 0x4c, 0xa7, 0xa7, 0x93, 0x9d, 0xd1, 0x9a, + 0x61, 0x85, 0x45, 0x9e, 0x8d, 0x28, 0x8f, 0x36, 0x98, 0x5c, 0x59, 0xae, + 0x4f, 0xce, 0x84, 0x6b, 0x91, 0x70, 0xaf, 0x93, 0x9c, 0x2e, 0x4a, 0x36, + 0x35, 0x77, 0xcf, 0xb4, 0xc3, 0x92, 0x7b, 0xcd, 0x76, 0x95, 0xc1, 0x52, + 0x5f, 0x9b, 0x3d, 0xb3, 0x90, 0x56, 0xb7, 0x91, 0xb0, 0xb9, 0x9b, 0xb5, + 0x7e, 0x5c, 0xa2, 0x9b, 0x53, 0xac, 0x6f, 0x7e, 0x4f, 0x5f, 0x3c, 0x7c, + 0x57, 0xc7, 0x56, 0x41, 0xc0, 0x31, 0xc7, 0x8f, 0x56, 0x57, 0x6e, 0x45, + 0x9f, 0x40, 0x6a, 0x69, 0x4c, 0x5a, 0x6c, 0x5b, 0xc4, 0x87, 0xb1, 0xb0, + 0x7d, 0x8b, 0x6f, 0xc9, 0x5c, 0xc0, 0x57, 0x83, 0x83, 0x40, 0xbb, 0xa7, + 0xb3, 0x5d, 0xaf, 0x58, 0xcc, 0xa9, 0x68, 0xce, 0x63, 0x73, 0x7b, 0x54, + 0xcf, 0xa4, 0x60, 0xc5, 0x8f, 0x7a, 0x2c, 0xbc, 0xc0, 0x7b, 0x61, 0x54, + 0x3f, 0x91, 0x49, 0xde, 0xc9, 0x99, 0x63, 0x8c, 0xa5, 0x98, 0xae, 0xae, + 0x48, 0xc4, 0xba, 0x44, 0x9c, 0x7c, 0xc7, 0x44, 0xca, 0x59, 0x60, 0x1f, + 0xd0, 0x42, 0x97, 0x65, 0xca, 0x88, 0x87, 0xa7, 0x7d, 0x52, 0x74, 0x30, + 0x7c, 0xac, 0xa8, 0x35, 0x29, 0xa0, 0x69, 0x8b, 0xbf, 0x85, 0x77, 0x48, + 
0x78, 0x8e, 0x6c, 0x4e, 0x39, 0x53, 0x47, 0x9c, 0xa6, 0x8e, 0xb6, 0x88, + 0x81, 0x9b, 0xad, 0x7f, 0x91, 0xd1, 0x52, 0xb4, 0x5c, 0xbb, 0x63, 0xa0, + 0x5f, 0x99, 0x76, 0xab, 0xc2, 0x9f, 0x8e, 0x68, 0x70, 0x97, 0xba, 0x68, + 0x7a, 0xb1, 0x76, 0xb1, 0x7a, 0xc2, 0x9a, 0x88, 0xb0, 0x92, 0xc0, 0x67, + 0x37, 0xb4, 0x3b, 0xb7, 0x9b, 0xaf, 0x7e, 0xd2, 0x39, 0x8a, 0xc5, 0xbe, + 0x83, 0x36, 0xbe, 0x5f, 0x58, 0x66, 0x60, 0x72, 0x52, 0x64, 0xc9, 0x47, + 0x7c, 0x63, 0xad, 0xb2, 0x6a, 0x62, 0xbb, 0xc0, 0x43, 0xaf, 0x2b, 0x9c, + 0x76, 0x54, 0x4c, 0x86, 0xc0, 0x8d, 0x71, 0xb0, 0x83, 0x72, 0xb4, 0x87, + 0xcb, 0xbf, 0x94, 0x86, 0xa3, 0x7c, 0x46, 0x6b, 0x5b, 0xc0, 0x62, 0xab, + 0x92, 0x46, 0xc8, 0xc5, 0x67, 0x56, 0x93, 0x89, 0xc8, 0x66, 0xba, 0xae, + 0xc2, 0xc9, 0xc7, 0x6c, 0xaf, 0xaa, 0x5d, 0x75, 0xbd, 0xb5, 0x8e, 0x45, + 0x72, 0x28, 0x8d, 0x53, 0x7a, 0xa0, 0x71, 0xc7, 0x43, 0x7e, 0x4d, 0xd0, + 0x76, 0xad, 0xa4, 0x4a, 0x80, 0x97, 0x9a, 0x98, 0x49, 0x46, 0x36, 0xac, + 0x45, 0x44, 0x90, 0x83, 0x5c, 0xcd, 0x38, 0x32, 0x88, 0xc8, 0x93, 0x7b, + 0x5f, 0x67, 0x4d, 0x75, 0x9e, 0x8b, 0xbc, 0x92, 0xb0, 0x63, 0x2a, 0x65, + 0x95, 0xa7, 0xd4, 0x64, 0x9d, 0xc3, 0xbc, 0x9d, 0x63, 0x6e, 0x8f, 0xba, + 0xc6, 0xcd, 0xa0, 0x6c, 0x5d, 0x7a, 0x69, 0x9e, 0x8c, 0x4a, 0x69, 0x93, + 0x76, 0x98, 0x58, 0x7a, 0xb3, 0x90, 0xc4, 0xa1, 0xcd, 0x41, 0x35, 0x9f, + 0x6f, 0x72, 0x46, 0xc1, 0x6a, 0xc0, 0x9e, 0x68, 0x8f, 0x65, 0xcd, 0x41, + 0x70, 0x6f, 0xbe, 0xcf, 0x3c, 0x8b, 0x2c, 0x45, 0xb6, 0x30, 0x87, 0x54, + 0x2e, 0x4a, 0xb9, 0x7d, 0xc8, 0xa9, 0x73, 0x72, 0xce, 0x92, 0x77, 0xde, + 0xb5, 0xc7, 0x76, 0x42, 0x5f, 0x68, 0xb3, 0x4e, 0x63, 0xc6, 0xc1, 0x56, + 0xa8, 0x3a, 0x5b, 0xcc, 0xbd, 0x82, 0x77, 0x49, 0x76, 0x35, 0x83, 0x3a, + 0x80, 0x9a, 0x89, 0x92, 0x89, 0x76, 0x47, 0x91, 0xbd, 0xbd, 0xa5, 0xb9, + 0x72, 0xab, 0x59, 0xb2, 0x67, 0xcf, 0x4c, 0x26, 0x77, 0xc3, 0x45, 0xaf, + 0x66, 0xbd, 0x52, 0xcf, 0xa3, 0x87, 0x3d, 0x78, 0x50, 0xaa, 0xbe, 0x51, + 0xc0, 0xb2, 0x85, 0x50, 0x92, 0xbb, 0xcc, 0xad, 0xcc, 0x6a, 0xab, 0xbd, + 0xb4, 0xc9, 0xbb, 0x36, 0x6b, 0xb5, 0x88, 0x3a, 0x77, 0xac, 0x87, 0xbe, + 0x5f, 0x49, 0xc2, 0x8d, 0x53, 0xc7, 0x9b, 0xa9, 0x46, 0x64, 0x3c, 0x54, + 0x44, 0x9f, 0x34, 0xc5, 0xbd, 0xcb, 0x78, 0xa4, 0xc4, 0xc1, 0x6c, 0xbf, + 0x59, 0xa6, 0x7f, 0x83, 0x52, 0xcb, 0x55, 0xaf, 0x35, 0x5b, 0x3c, 0xb0, + 0x3e, 0xcd, 0xd4, 0x93, 0x79, 0x5f, 0x3b, 0xc3, 0xa9, 0x6e, 0xa3, 0xa3, + 0x48, 0x51, 0x83, 0xcf, 0xc3, 0x4c, 0x71, 0x8e, 0x92, 0x3d, 0x90, 0xa3, + 0x73, 0x6e, 0xad, 0x4a, 0xbb, 0xd1, 0xb8, 0xc1, 0xb8, 0x91, 0x4a, 0x5c, + 0x90, 0x25, 0xbf, 0x9d, 0x9b, 0x59, 0xd0, 0x48, 0x63, 0x61, 0x8f, 0xd2, + 0xad, 0x4d, 0x86, 0x80, 0x45, 0x8e, 0x9d, 0xc1, 0x64, 0x75, 0xb4, 0x72, + 0x23, 0xbd, 0x76, 0x41, 0xb1, 0x6d, 0x4c, 0x52, 0x68, 0x3c, 0xb3, 0x77, + 0x47, 0x72, 0x36, 0xad, 0x7e, 0x53, 0x90, 0xa1, 0x7e, 0x9d, 0x82, 0x61, + 0x9b, 0x71, 0x47, 0x44, 0x7a, 0x6b, 0x88, 0x81, 0xac, 0x27, 0x94, 0x4a, + 0x7b, 0x41, 0x42, 0xb2, 0xc7, 0x87, 0x40, 0x44, 0x88, 0x4f, 0xa9, 0xd7, + 0x90, 0x2e, 0xa6, 0x82, 0x82, 0x5c, 0xaf, 0x98, 0x89, 0x5c, 0x75, 0x9e, + 0x9a, 0x4b, 0xb5, 0x50, 0xc8, 0x6e, 0x9b, 0x54, 0x85, 0xac, 0x44, 0x69, + 0x8c, 0x7e, 0x9c, 0x44, 0xc6, 0xd3, 0x4c, 0x3f, 0x98, 0x8a, 0xb8, 0xbd, + 0x93, 0x5f, 0x42, 0x92, 0x41, 0x95, 0xac, 0x3a, 0x8c, 0xce, 0x75, 0x99, + 0x5d, 0xbf, 0x74, 0x38, 0xb1, 0x42, 0x36, 0x77, 0x5f, 0x6d, 0x9e, 0x47, + 0x72, 0x77, 0xc4, 0xba, 0x8b, 0x6b, 0xac, 0x9e, 0xaf, 0x44, 0xd3, 0x3a, + 0x70, 0xcc, 0x7f, 0x37, 0x5e, 0x9a, 0x78, 0x4e, 0x98, 0x8e, 0x6f, 0x4e, + 0x65, 0x9e, 0x8a, 0x7b, 0xa3, 0x58, 0x62, 0x69, 0x3a, 0x9c, 0xb5, 0xb1, + 
0xac, 0xb2, 0x6e, 0x7f, 0x84, 0x76, 0x67, 0x94, 0x68, 0x8e, 0xa7, 0x74, + 0x9f, 0x53, 0x7b, 0x38, 0xb3, 0x44, 0x44, 0x5d, 0x3a, 0xc9, 0x68, 0x5d, + 0x58, 0x5a, 0x7e, 0xc5, 0x6b, 0x5d, 0x97, 0x35, 0x43, 0x8d, 0xb8, 0x29, + 0x93, 0x99, 0x29, 0x90, 0x99, 0xbf, 0x53, 0x3c, 0xb8, 0x79, 0x7a, 0x6d, + 0x7c, 0x66, 0xb4, 0x98, 0x51, 0xa6, 0x24, 0x49, 0x9d, 0xc2, 0x91, 0x92, + 0xa3, 0x9e, 0xa6, 0xd8, 0xca, 0x3f, 0x56, 0x3b, 0x3e, 0x4d, 0x99, 0x2d, + 0x6d, 0xc4, 0xcc, 0xa8, 0x58, 0x4e, 0x62, 0x83, 0x3a, 0x91, 0x59, 0x4a, + 0xa4, 0x6b, 0x70, 0x74, 0x33, 0x52, 0xa3, 0x4c, 0x61, 0x77, 0xc7, 0xb3, + 0xcb, 0xb4, 0x7f, 0x66, 0x6e, 0xa9, 0x3d, 0xab, 0xba, 0xb9, 0x6a, 0x2a, + 0x91, 0x8d, 0x6c, 0xbe, 0x5a, 0x9a, 0x65, 0x44, 0x87, 0x45, 0x4d, 0xcc, + 0x7e, 0x74, 0xab, 0x41, 0x4b, 0xbc, 0xc6, 0x9a, 0x96, 0x53, 0x43, 0x64, + 0x55, 0xb8, 0xb2, 0x4e, 0x47, 0xc3, 0xd3, 0x96, 0x81, 0xa5, 0x7f, 0x68, + 0x69, 0xb6, 0x44, 0xac, 0x5f, 0x98, 0x56, 0x77, 0x86, 0x64, 0x5d, 0xb2, + 0x63, 0x82, 0xc7, 0x3b, 0x4c, 0xaf, 0x73, 0xb3, 0x79, 0xbe, 0xb8, 0x9a, + 0x6a, 0x92, 0x8d, 0xaf, 0xcb, 0x4c, 0xcb, 0xca, 0x74, 0x35, 0x50, 0x3d, + 0xb9, 0xc8, 0xb1, 0x93, 0xc6, 0x6e, 0x51, 0x87, 0xc5, 0x9f, 0xcb, 0xa8, + 0x81, 0x95, 0x42, 0x40, 0xae, 0xa6, 0xc5, 0xb3, 0x57, 0x70, 0xc3, 0x8d, + 0x73, 0x71, 0x3c, 0x54, 0xa8, 0x7b, 0x50, 0xc4, 0xaa, 0xab, 0x80, 0x66, + 0xbf, 0x43, 0x53, 0x47, 0xa3, 0x2c, 0x9f, 0x43, 0x5c, 0x8e, 0x4a, 0xb7, + 0x6e, 0x9c, 0x7c, 0x5f, 0x89, 0xb2, 0x50, 0x4e, 0x50, 0xcc, 0x58, 0x45, + 0x97, 0xc1, 0xdd, 0x80, 0x41, 0x63, 0x78, 0xa7, 0x9b, 0x82, 0x87, 0xc6, + 0xa0, 0x45, 0x79, 0x3a, 0x63, 0x5f, 0xc9, 0x2e, 0xaa, 0x3a, 0x70, 0xce, + 0x49, 0x5c, 0x89, 0x77, 0xd1, 0x6e, 0x96, 0x3b, 0x42, 0x5f, 0x5f, 0x41, + 0x5c, 0x80, 0xc0, 0xb6, 0x90, 0x31, 0x53, 0xaa, 0x44, 0x4a, 0x44, 0x59, + 0xc8, 0x33, 0x2f, 0x89, 0x3f, 0xab, 0x7a, 0x30, 0x60, 0x8e, 0x7d, 0x94, + 0xc1, 0x85, 0x4e, 0x63, 0x85, 0x4a, 0xcc, 0xa0, 0x76, 0x9b, 0x36, 0x51, + 0xbf, 0x5d, 0x35, 0xa3, 0x72, 0xc3, 0x67, 0x87, 0x4f, 0x59, 0x8d, 0xc4, + 0x5f, 0x3f, 0xa0, 0x5a, 0x9c, 0xad, 0x55, 0x34, 0x45, 0x8b, 0xc7, 0x58, + 0x62, 0x8d, 0x80, 0x99, 0x95, 0x3c, 0x6c, 0x72, 0xc9, 0x4b, 0x73, 0x9a, + 0xd2, 0xc3, 0xd1, 0x6e, 0x7d, 0x96, 0xa3, 0x8f, 0x75, 0xd2, 0x51, 0x2f, + 0x81, 0xba, 0x9a, 0x4a, 0x8b, 0x47, 0xd4, 0x9d, 0x64, 0x36, 0x37, 0x53, + 0x98, 0x4b, 0x3a, 0xbf, 0x6f, 0x9d, 0xa2, 0x5d, 0x59, 0x6e, 0x9d, 0x88, + 0x68, 0x83, 0xd0, 0x61, 0x49, 0x81, 0x6d, 0x69, 0x40, 0x98, 0x24, 0x63, + 0x53, 0x81, 0xa9, 0x99, 0x38, 0x86, 0x7c, 0xd5, 0x7a, 0x2b, 0x67, 0xa3, + 0x50, 0x8b, 0xb1, 0xaa, 0xcb, 0xd1, 0x3e, 0x5d, 0xc3, 0x73, 0xc7, 0x6b, + 0x9a, 0x54, 0xb6, 0x50, 0x67, 0xad, 0x69, 0xd6, 0x9e, 0xd9, 0xc4, 0x70, + 0x5b, 0x57, 0x37, 0x7d, 0x25, 0xa6, 0x4e, 0xc4, 0x58, 0x6b, 0xa9, 0xa5, + 0x79, 0x54, 0xbb, 0xc2, 0x9c, 0xd8, 0x9b, 0x78, 0x43, 0x88, 0x33, 0x7e, + 0x4f, 0xc1, 0xae, 0x2a, 0x7e, 0x97, 0xa2, 0x6c, 0x92, 0x90, 0xbd, 0x8b, + 0x48, 0xa4, 0x84, 0xbf, 0xcb, 0xb9, 0xa7, 0xab, 0xb0, 0x60, 0x77, 0xc7, + 0xc7, 0x46, 0xc7, 0x87, 0x85, 0x8f, 0x87, 0x9b, 0x41, 0x68, 0x44, 0xbc, + 0x52, 0x4b, 0xa0, 0xa0, 0xb4, 0xbd, 0x4e, 0x83, 0x70, 0xc1, 0x75, 0xac, + 0x8a, 0xd4, 0xb6, 0x41, 0xc0, 0x4e, 0xb7, 0xa3, 0x65, 0x96, 0xa5, 0x48, + 0xa6, 0x7a, 0x51, 0x97, 0xc1, 0x79, 0x58, 0x72, 0x8b, 0x54, 0xa1, 0x9c, + 0xd1, 0xc2, 0xb3, 0x6f, 0xba, 0x67, 0xc2, 0x6d, 0x92, 0x85, 0x50, 0x42, + 0x75, 0xac, 0x8f, 0x50, 0x6d, 0x6e, 0x95, 0xae, 0x80, 0xcf, 0xa4, 0xa5, + 0x96, 0x8d, 0xa1, 0xb9, 0xc6, 0x7a, 0x68, 0x50, 0x3c, 0x60, 0x74, 0x82, + 0xcb, 0x59, 0xb7, 0x6b, 0x4e, 0x86, 0x5b, 0x9b, 0x72, 0xa7, 0xa6, 0x60, + 
0x96, 0x72, 0x9c, 0x7e, 0x41, 0xc2, 0xcc, 0xd5, 0x63, 0xbe, 0x7e, 0x8b, + 0xb9, 0xa8, 0x59, 0x4f, 0x49, 0xbd, 0x44, 0xc6, 0xa2, 0xc5, 0x2e, 0x3a, + 0x3f, 0x7e, 0xcb, 0xb7, 0x92, 0xc6, 0x52, 0xc0, 0x59, 0xaf, 0x59, 0xbb, + 0x83, 0x3e, 0x7f, 0xc6, 0x2c, 0x9b, 0xb6, 0x6d, 0xb8, 0x3e, 0x7e, 0x5b, + 0xb0, 0x66, 0xbb, 0x4d, 0x37, 0xcd, 0xce, 0x7d, 0x6c, 0xc0, 0x41, 0x6f, + 0x6e, 0x33, 0xc0, 0x89, 0x8b, 0x97, 0x57, 0xd4, 0xad, 0x98, 0xbc, 0x55, + 0xbd, 0x7e, 0x43, 0x89, 0x82, 0xb3, 0x40, 0x7e, 0x50, 0x9d, 0x9d, 0x37, + 0xcc, 0xa2, 0xa4, 0x69, 0x75, 0x9e, 0x6b, 0xab, 0x60, 0x44, 0xd5, 0x9d, + 0xbf, 0x4e, 0x84, 0x8f, 0x3b, 0x96, 0x52, 0x50, 0xc4, 0x7e, 0x9e, 0xa0, + 0xc7, 0xa1, 0x39, 0x76, 0xa9, 0x42, 0xa3, 0xcd, 0x53, 0x41, 0xca, 0x6d, + 0x55, 0x95, 0x9e, 0xb7, 0xcf, 0x5a, 0xc2, 0xb9, 0x3a, 0xa9, 0xc9, 0x7f, + 0x6f, 0xb7, 0x9e, 0xac, 0x66, 0x31, 0x57, 0x81, 0x7e, 0x90, 0xc7, 0x79, + 0x7d, 0xbd, 0x61, 0x93, 0x4e, 0xc8, 0x91, 0x6a, 0xd3, 0xd3, 0xc5, 0x8a, + 0x8a, 0x30, 0xbc, 0x89, 0xd6, 0xae, 0xcf, 0xb1, 0xa3, 0x5d, 0x4f, 0x99, + 0x95, 0xb0, 0x48, 0x41, 0x82, 0x72, 0x8a, 0x9e, 0x92, 0xcd, 0x9f, 0x60, + 0x8e, 0xbe, 0x39, 0x9b, 0x50, 0x40, 0xcd, 0x50, 0x35, 0x82, 0x9a, 0x33, + 0xdc, 0x56, 0x43, 0xb7, 0x3d, 0x9e, 0xc4, 0x5b, 0x47, 0x5d, 0x44, 0x5a, + 0xc3, 0xcf, 0x95, 0xa3, 0xba, 0x94, 0xaf, 0x2f, 0x87, 0x76, 0x47, 0x6c, + 0x65, 0x68, 0xb6, 0x67, 0x8e, 0xb1, 0x4a, 0x50, 0x69, 0xb7, 0x7a, 0x95, + 0x8c, 0x68, 0xc6, 0xa4, 0x94, 0x73, 0xcf, 0x50, 0xb3, 0xb4, 0xae, 0xc1, + 0x42, 0x96, 0x50, 0xb6, 0xa2, 0xac, 0x97, 0x9c, 0x2a, 0xb6, 0xc3, 0xbb, + 0x54, 0x61, 0x93, 0xc0, 0x8c, 0xcd, 0x91, 0x48, 0x81, 0xb2, 0x9e, 0xac, + 0x5e, 0x5f, 0xb5, 0x7a, 0x3b, 0x46, 0x6d, 0xcb, 0x8b, 0x5b, 0xac, 0xa7, + 0x7f, 0x5c, 0xce, 0xba, 0x36, 0xce, 0x44, 0xb5, 0x43, 0x7d, 0x89, 0xc7, + 0xa2, 0x7f, 0x5d, 0xae, 0xcb, 0x73, 0xa4, 0xb2, 0x3e, 0x78, 0x6b, 0xe0, + 0x67, 0xc7, 0xc0, 0xd3, 0x7b, 0x5f, 0x67, 0x6d, 0xb5, 0x71, 0x7c, 0x5a, + 0x5a, 0x69, 0xa3, 0x64, 0x52, 0xc1, 0x72, 0x93, 0xb3, 0x92, 0xbf, 0x34, + 0xb3, 0x8c, 0xa3, 0x9c, 0x39, 0xc4, 0x94, 0x52, 0x7f, 0x38, 0x5f, 0x90, + 0x44, 0x79, 0xb7, 0x3d, 0x75, 0x70, 0xbc, 0xa2, 0xa1, 0x70, 0x60, 0x92, + 0x90, 0xc8, 0x85, 0x44, 0x35, 0x91, 0x35, 0x45, 0x49, 0x4e, 0xca, 0x93, + 0x47, 0xb1, 0x3b, 0x6e, 0x50, 0xd6, 0x80, 0xc6, 0xa8, 0x7a, 0x2f, 0xc2, + 0x56, 0x73, 0x51, 0x85, 0x5e, 0x61, 0xc5, 0x8a, 0xc3, 0x7a, 0x59, 0x55, + 0xb6, 0x5c, 0x8f, 0x8b, 0x85, 0x64, 0xae, 0x49, 0x90, 0xb5, 0x44, 0x53, + 0xd2, 0x80, 0x94, 0x43, 0x67, 0x89, 0xad, 0x41, 0x60, 0xaa, 0x50, 0x7e, + 0x6c, 0x81, 0x76, 0xc6, 0x7f, 0x8f, 0x4d, 0x50, 0xba, 0xb1, 0xaf, 0x6e, + 0x98, 0x2f, 0x90, 0xc1, 0xa6, 0x2e, 0xae, 0x3a, 0xb7, 0xac, 0xbe, 0x52, + 0x69, 0x7a, 0x49, 0x8f, 0xa3, 0x25, 0x4b, 0x51, 0xc8, 0x5e, 0xbe, 0x96, + 0x63, 0x5e, 0x51, 0x83, 0x6f, 0x4b, 0x99, 0x67, 0xae, 0x6f, 0xc4, 0xb8, + 0x3f, 0x3e, 0x82, 0x70, 0xb0, 0x3c, 0xcc, 0x44, 0x31, 0x4a, 0x51, 0xc8, + 0x7a, 0x56, 0x2c, 0x8c, 0x63, 0x3a, 0x9b, 0x4b, 0x3e, 0x75, 0x55, 0x87, + 0x47, 0x90, 0x85, 0x8f, 0x4e, 0x79, 0xc7, 0x94, 0x91, 0xac, 0x3e, 0x90, + 0x57, 0x3c, 0x9e, 0xbf, 0x84, 0x6d, 0x91, 0x95, 0x7c, 0x9a, 0x3b, 0xb3, + 0x9c, 0x7f, 0xbe, 0xb6, 0xb0, 0x3e, 0x51, 0x3b, 0x90, 0x99, 0x82, 0x5b, + 0xa1, 0xa3, 0x62, 0x3e, 0xcd, 0xba, 0x60, 0x89, 0x73, 0x62, 0x76, 0x98, + 0x7c, 0x32, 0x64, 0x94, 0x63, 0xcc, 0x46, 0x56, 0x90, 0x3b, 0x76, 0x76, + 0x8d, 0x3a, 0x89, 0xa9, 0x96, 0xab, 0x82, 0x98, 0xaf, 0x50, 0x51, 0x73, + 0xa7, 0x3c, 0x89, 0xb4, 0x7c, 0x30, 0x37, 0x78, 0x3d, 0x50, 0x56, 0x83, + 0x75, 0x5f, 0x81, 0x5d, 0xab, 0xc7, 0x8b, 0xb8, 0xa9, 0x77, 0x58, 0x6b, + 
0xcd, 0xce, 0xb7, 0x94, 0xb4, 0x7e, 0x82, 0x53, 0x87, 0x6b, 0x53, 0x42, + 0x3f, 0xa8, 0x85, 0xc4, 0x3c, 0x40, 0x8a, 0xca, 0x42, 0xaf, 0xc8, 0x77, + 0xaf, 0x6e, 0x4f, 0x3b, 0xb6, 0x96, 0xb9, 0xb1, 0x5f, 0xd1, 0x36, 0x6a, + 0xd1, 0x56, 0x7d, 0x76, 0x5a, 0x66, 0x91, 0x6d, 0xd5, 0xbf, 0xa6, 0x4f, + 0xa8, 0x9a, 0x73, 0x4f, 0x37, 0x80, 0xbe, 0x4e, 0xad, 0x7e, 0x40, 0xbd, + 0x5d, 0x70, 0x41, 0xc6, 0x6f, 0x56, 0x46, 0xc4, 0x7e, 0xbb, 0x60, 0xb9, + 0xa1, 0x69, 0xcb, 0x57, 0x98, 0xae, 0x75, 0x5a, 0x9b, 0xa3, 0x44, 0x47, + 0xa5, 0x35, 0x66, 0x62, 0x7c, 0x6e, 0x89, 0xc9, 0x52, 0x78, 0x45, 0xa3, + 0x34, 0xa1, 0xca, 0x6d, 0xb5, 0x51, 0x90, 0x98, 0x3c, 0x73, 0x5e, 0xb9, + 0x48, 0xb8, 0x8e, 0x49, 0x40, 0xd0, 0x9f, 0x5e, 0xcd, 0x81, 0xa8, 0x3f, + 0x4c, 0xbd, 0x58, 0xa1, 0x96, 0xa3, 0xcd, 0x8b, 0xd3, 0x8c, 0x9e, 0x7e, + 0xab, 0x80, 0x38, 0xba, 0x9d, 0x78, 0x7d, 0x57, 0x3a, 0xbc, 0x92, 0x4b, + 0x89, 0xd0, 0x55, 0xb6, 0x58, 0x56, 0xcb, 0x94, 0xaa, 0x9e, 0x90, 0x67, + 0x55, 0xae, 0x3a, 0x5e, 0xb0, 0xc6, 0xa4, 0x58, 0x7d, 0xad, 0x61, 0x4c, + 0x66, 0xc7, 0x4b, 0x83, 0x74, 0xc3, 0xa8, 0x49, 0x64, 0x37, 0x4c, 0x87, + 0xac, 0x81, 0xc7, 0xb7, 0x8f, 0x98, 0x90, 0x6a, 0xc0, 0xaa, 0x8d, 0xa6, + 0xa2, 0xbf, 0x5b, 0xaa, 0x6e, 0x58, 0x53, 0xa5, 0x71, 0x8d, 0x65, 0x32, + 0x71, 0x4c, 0x6b, 0x62, 0xcf, 0x43, 0xbc, 0x8b, 0x60, 0x8a, 0x38, 0x66, + 0x76, 0xb2, 0x95, 0x48, 0x5a, 0x3f, 0x61, 0x93, 0x64, 0xd1, 0x52, 0xad, + 0x78, 0x8b, 0x82, 0xa4, 0xc3, 0xb9, 0xc8, 0x5c, 0x76, 0x4f, 0x62, 0x42, + 0x4c, 0x8c, 0xaf, 0x3c, 0x93, 0x6d, 0x7d, 0x9c, 0x84, 0x49, 0x6d, 0x70, + 0x49, 0x34, 0x9f, 0x64, 0x88, 0x84, 0xc2, 0x93, 0xa6, 0x9b, 0x8d, 0xa5, + 0xbd, 0xbc, 0xa5, 0x9b, 0xb0, 0xb9, 0x37, 0x9c, 0xbc, 0x8b, 0x72, 0x3c, + 0xbd, 0x43, 0xca, 0xca, 0xb2, 0x96, 0x37, 0xc9, 0xcd, 0x73, 0x49, 0x8e, + 0x9d, 0x40, 0xa5, 0x50, 0x6a, 0x60, 0x33, 0x3b, 0x47, 0x82, 0x88, 0x4c, + 0x73, 0x50, 0x61, 0x76, 0xaf, 0x63, 0xbe, 0xb3, 0xc6, 0x56, 0x78, 0xa3, + 0xa9, 0x55, 0x3d, 0x9b, 0xa3, 0x45, 0x57, 0x4b, 0x69, 0x5a, 0x7c, 0x71, + 0x85, 0x94, 0x84, 0xac, 0x4f, 0xc9, 0xc5, 0x41, 0x96, 0x86, 0x90, 0xb3, + 0x89, 0xc1, 0xc6, 0x90, 0x73, 0x66, 0x58, 0x4f, 0x57, 0xbc, 0xc3, 0x8b, + 0x4c, 0x84, 0x47, 0xbc, 0xd4, 0x51, 0x7b, 0x58, 0x6c, 0x5c, 0xc2, 0xa7, + 0xb0, 0x3c, 0xa7, 0xb1, 0xbe, 0x65, 0x88, 0x6d, 0x53, 0xa9, 0xbe, 0x42, + 0xc7, 0xc2, 0x30, 0x95, 0xb3, 0x87, 0x8a, 0xca, 0xaf, 0x37, 0x8a, 0x86, + 0x75, 0x36, 0x9d, 0x52, 0x7b, 0x3d, 0x7e, 0x7a, 0xc9, 0x76, 0x5c, 0xbb, + 0x6a, 0xb2, 0xb2, 0x6b, 0x50, 0xc2, 0x44, 0x78, 0xb2, 0x53, 0x46, 0xb0, + 0x8d, 0x57, 0x49, 0xd2, 0xb2, 0xad, 0x43, 0xc6, 0xa0, 0x56, 0x70, 0x56, + 0x74, 0x4f, 0xb9, 0x32, 0xcc, 0xd1, 0xbb, 0xce, 0x9c, 0x36, 0xb8, 0x52, + 0x3c, 0x3e, 0x42, 0x85, 0x4e, 0xbf, 0x50, 0x89, 0x74, 0x73, 0x4c, 0x98, + 0x77, 0x47, 0x59, 0x72, 0xbb, 0xb4, 0xa4, 0x52, 0xcc, 0x34, 0xc9, 0x57, + 0x7b, 0x44, 0x4b, 0x60, 0xc1, 0x63, 0x62, 0xcc, 0x45, 0x79, 0x97, 0x5a, + 0xae, 0x79, 0x77, 0x4a, 0x67, 0x76, 0x76, 0x54, 0xc4, 0x43, 0xac, 0x41, + 0x96, 0x44, 0x98, 0x36, 0x49, 0x86, 0xa3, 0x99, 0xad, 0x37, 0x76, 0x3c, + 0x55, 0x7f, 0x6a, 0xb7, 0x53, 0x64, 0xcd, 0x8a, 0x95, 0x7f, 0x58, 0x8b, + 0x98, 0x60, 0xae, 0x7d, 0x69, 0xc4, 0xaf, 0x6e, 0xbb, 0x7e, 0xad, 0x33, + 0x61, 0x78, 0x8f, 0x85, 0xc3, 0x98, 0xa2, 0x92, 0x36, 0x61, 0x51, 0xbd, + 0xb8, 0x8b, 0x40, 0x79, 0x48, 0x9e, 0x88, 0x6d, 0x8a, 0xd6, 0x83, 0x78, + 0x6b, 0xad, 0x8a, 0xd3, 0x5e, 0x4a, 0x80, 0x34, 0xb9, 0x4e, 0x91, 0x2f, + 0x55, 0x47, 0x84, 0xbf, 0x49, 0x38, 0x68, 0x84, 0x54, 0xca, 0x6f, 0x6e, + 0x5b, 0x9b, 0xd3, 0x3d, 0x8a, 0x89, 0x85, 0x7d, 0xc6, 0xcc, 0x3e, 0x94, + 
0x86, 0xd4, 0x77, 0xcd, 0xc0, 0x87, 0x63, 0xac, 0x83, 0x39, 0x66, 0x73, + 0x65, 0x44, 0x4e, 0x3f, 0x5f, 0x7d, 0xb9, 0xd3, 0x9a, 0xd0, 0xde, 0xdb, + 0xb8, 0x73, 0xac, 0xd9, 0x73, 0x44, 0xc0, 0xcc, 0x3d, 0x46, 0x55, 0x3a, + 0x53, 0x6a, 0xc5, 0x41, 0xce, 0x5c, 0x98, 0x4e, 0x93, 0xc1, 0x9e, 0x72, + 0x5b, 0x9d, 0x8e, 0xa1, 0xad, 0xc1, 0xcd, 0xc5, 0x60, 0x88, 0x97, 0x73, + 0x8b, 0x97, 0xd5, 0x81, 0xb6, 0x4e, 0x3d, 0x3a, 0x60, 0x88, 0xa3, 0xbb, + 0xaa, 0xa9, 0xd0, 0x63, 0xb9, 0xd3, 0x76, 0x69, 0x79, 0xcf, 0x6b, 0x48, + 0xc1, 0x9a, 0xad, 0x87, 0xee, 0xd7, 0xb4, 0x41, 0x78, 0xdb, 0x6a, 0xd6, + 0xa5, 0x85, 0xbb, 0x61, 0xc6, 0xb5, 0xb4, 0xac, 0xd1, 0x8c, 0x54, 0xae, + 0x71, 0xbf, 0xa9, 0x5c, 0x63, 0x5f, 0x82, 0xde, 0x9f, 0x6f, 0xa4, 0xca, + 0x7c, 0x51, 0x9c, 0x9a, 0xc4, 0x7b, 0xab, 0xb9, 0x9d, 0xae, 0xd1, 0x57, + 0xd5, 0xbb, 0x3a, 0x94, 0xbf, 0x64, 0x82, 0x7c, 0xc3, 0xb2, 0x58, 0x61, + 0xbe, 0xa4, 0xb3, 0x6b, 0xcb, 0xd2, 0x37, 0x55, 0x73, 0xac, 0x6a, 0xab, + 0x4f, 0x46, 0x8d, 0x39, 0x7e, 0xb2, 0x3a, 0x6d, 0x69, 0x99, 0xd6, 0xa5, + 0x5f, 0xbc, 0xba, 0x6a, 0x33, 0xad, 0x76, 0xc8, 0xbb, 0x87, 0x70, 0xa5, + 0x7a, 0x66, 0x38, 0x42, 0x98, 0x35, 0x7b, 0x57, 0x76, 0xaf, 0xce, 0x9d, + 0x6b, 0x88, 0xc3, 0x97, 0xbd, 0x4c, 0xc9, 0x75, 0xb2, 0xc9, 0x93, 0xb4, + 0xce, 0x3c, 0x3c, 0x59, 0x31, 0x67, 0x91, 0x4d, 0x90, 0x9c, 0x38, 0x8b, + 0x93, 0xba, 0xa9, 0x43, 0x8c, 0x3e, 0x39, 0x3e, 0x99, 0xab, 0xbc, 0x89, + 0x3b, 0xb3, 0xd3, 0xa6, 0x73, 0xd0, 0x6e, 0x44, 0x7c, 0xc0, 0xc8, 0x65, + 0x6b, 0x8c, 0x72, 0x9a, 0xcc, 0xd4, 0x40, 0x8e, 0xb1, 0x49, 0x77, 0xa9, + 0x53, 0xc0, 0xd2, 0x83, 0x6a, 0x58, 0xbf, 0x5d, 0xa6, 0x97, 0xa1, 0xaf, + 0x69, 0xa4, 0x70, 0x4f, 0xc8, 0x3f, 0x51, 0xce, 0x35, 0x5a, 0xad, 0x9b, + 0x65, 0x7c, 0x2b, 0xa6, 0x77, 0x4a, 0x4c, 0x69, 0xd5, 0x64, 0xb1, 0x73, + 0x3b, 0xb9, 0xba, 0xc7, 0xdd, 0xda, 0xd6, 0x3e, 0x90, 0x4d, 0x36, 0x52, + 0x9f, 0x70, 0x5a, 0x5f, 0x9d, 0x6d, 0xda, 0x74, 0x8c, 0x40, 0x79, 0xc5, + 0xd6, 0x66, 0x85, 0x99, 0x49, 0x36, 0x8d, 0x76, 0x6d, 0x93, 0xd5, 0x53, + 0x58, 0x98, 0x78, 0xb9, 0x48, 0x61, 0x81, 0xd1, 0xa8, 0x9c, 0x60, 0x9d, + 0xb2, 0xb2, 0xbe, 0xa4, 0xb8, 0x93, 0x5b, 0x7e, 0xc1, 0x61, 0x4f, 0x32, + 0xc4, 0xad, 0xaf, 0x4d, 0xb9, 0xcd, 0x89, 0xb8, 0x6c, 0xc5, 0x5d, 0x77, + 0x50, 0x83, 0xae, 0x37, 0x33, 0x46, 0xce, 0x5d, 0x97, 0x73, 0x99, 0x65, + 0x9f, 0x68, 0x42, 0x8d, 0x61, 0x84, 0x9a, 0x77, 0xa2, 0x45, 0xd1, 0xbd, + 0x9b, 0xc3, 0x3b, 0x71, 0xa4, 0xa9, 0x6c, 0x99, 0x58, 0x6f, 0x4c, 0x82, + 0x8d, 0x82, 0x95, 0x5c, 0xb2, 0x40, 0x72, 0xce, 0x44, 0x71, 0xc3, 0x55, + 0xb7, 0x9d, 0xda, 0x8f, 0x96, 0x42, 0xa2, 0x6a, 0xab, 0x90, 0x2d, 0x62, + 0xa1, 0xaf, 0x53, 0x3c, 0x30, 0x77, 0x39, 0x70, 0xb6, 0x6a, 0xbc, 0xaa, + 0x66, 0xb0, 0x73, 0x63, 0xd0, 0xbc, 0x73, 0x3b, 0x90, 0x7b, 0x6a, 0xb1, + 0x87, 0x8f, 0x43, 0x93, 0x95, 0x3d, 0xb5, 0x4a, 0x74, 0x35, 0xaf, 0x92, + 0x90, 0x4a, 0x55, 0x4d, 0x48, 0x3d, 0x8d, 0x6e, 0x92, 0x51, 0xbd, 0x38, + 0x47, 0x2a, 0xca, 0xb4, 0x3f, 0xa9, 0x66, 0x37, 0x41, 0x74, 0x51, 0xaf, + 0x57, 0xb9, 0xa0, 0x91, 0x74, 0x36, 0xa0, 0xa5, 0xa5, 0x50, 0xbb, 0x85, + 0xaf, 0x9e, 0x49, 0x89, 0x92, 0x8e, 0x37, 0xb8, 0x69, 0x6d, 0x76, 0xd5, + 0x5c, 0xb3, 0x4e, 0xcd, 0x6e, 0xb6, 0xa4, 0x99, 0x58, 0x4c, 0x8a, 0x90, + 0x67, 0xbd, 0x3a, 0xb8, 0x78, 0x87, 0x99, 0x69, 0x5b, 0x6e, 0x3c, 0x3a, + 0x90, 0xc7, 0xae, 0x8a, 0xa1, 0xd8, 0x72, 0xb0, 0x7b, 0xba, 0x68, 0x72, + 0x54, 0x8a, 0x9a, 0x90, 0x7b, 0xa5, 0xc1, 0xe0, 0xa3, 0xbb, 0x78, 0x53, + 0xbf, 0xbb, 0x96, 0x4e, 0x8b, 0xcd, 0x3b, 0x46, 0x5c, 0x44, 0x84, 0xc7, + 0xe0, 0xc2, 0x64, 0xca, 0x97, 0x71, 0xb9, 0x8e, 0xab, 0x38, 0x5b, 0x9c, + 
0x9f, 0x6b, 0xd0, 0x99, 0x7e, 0xb2, 0x5f, 0x85, 0x4b, 0x5b, 0x48, 0x8e, + 0x79, 0x4d, 0xd8, 0xa4, 0xcd, 0x42, 0x58, 0x8e, 0x80, 0xc5, 0x36, 0xb1, + 0xcc, 0xb9, 0x62, 0x6d, 0x7b, 0xb8, 0xae, 0xc1, 0xc2, 0x90, 0xb7, 0x80, + 0xbc, 0x3f, 0x5c, 0x41, 0x74, 0xa2, 0x41, 0x57, 0x83, 0xcc, 0x9b, 0xc9, + 0x5f, 0x31, 0x98, 0x99, 0x97, 0x3c, 0x71, 0x30, 0x94, 0xbd, 0x9f, 0xaf, + 0xa9, 0x84, 0xc9, 0x6e, 0xc3, 0xc6, 0x45, 0x81, 0x87, 0x7c, 0x80, 0x57, + 0x6c, 0x95, 0x46, 0x7a, 0x8e, 0x8e, 0x4c, 0x74, 0x7b, 0xa4, 0x61, 0xa5, + 0x98, 0x9b, 0xb4, 0xbe, 0xcf, 0xb5, 0x9a, 0x9f, 0x3f, 0x3f, 0xc1, 0xa5, + 0x7a, 0x46, 0xa4, 0x51, 0xc5, 0x75, 0x50, 0x6a, 0x82, 0x63, 0x73, 0x45, + 0x9c, 0x7a, 0xcf, 0x7a, 0x6c, 0x3e, 0x60, 0x91, 0xdb, 0xc0, 0x8c, 0xb8, + 0x69, 0x85, 0x71, 0x27, 0x97, 0xc8, 0x81, 0x44, 0x39, 0x86, 0x48, 0x94, + 0xbc, 0xb9, 0x5f, 0x9d, 0x59, 0xd7, 0x43, 0xb5, 0x98, 0xc0, 0x7d, 0xc2, + 0xa8, 0x41, 0xd2, 0x5d, 0xa5, 0x79, 0x7d, 0x6f, 0x39, 0xb0, 0xe1, 0x91, + 0x72, 0xcc, 0xbb, 0x9e, 0x49, 0x31, 0x4e, 0x7c, 0xd0, 0xdb, 0x66, 0x60, + 0x6e, 0xe3, 0x9d, 0x50, 0x86, 0xce, 0x90, 0x4a, 0x5f, 0x9e, 0x90, 0x41, + 0x5c, 0xb6, 0xcb, 0x4f, 0x45, 0x84, 0xa0, 0xcf, 0x4c, 0x8c, 0x93, 0x3b, + 0x92, 0x55, 0x8c, 0x60, 0xd2, 0x87, 0x86, 0xcb, 0xbb, 0xab, 0xae, 0x5e, + 0x7f, 0x88, 0x41, 0x90, 0x61, 0x2d, 0x9e, 0x92, 0x93, 0x44, 0x3c, 0x87, + 0x3f, 0x9b, 0x91, 0xbb, 0x88, 0x57, 0x4f, 0x37, 0xcf, 0x5e, 0x3e, 0x5b, + 0xb7, 0x39, 0x4c, 0x7e, 0x37, 0xb2, 0xb4, 0x7a, 0xba, 0x62, 0x44, 0xb4, + 0x5c, 0x55, 0x71, 0xac, 0xca, 0x43, 0x67, 0x9d, 0x8d, 0x2c, 0x63, 0xb7, + 0x7c, 0xca, 0xa2, 0x8d, 0x50, 0x8d, 0xce, 0x4a, 0x65, 0xb1, 0xbe, 0x52, + 0x90, 0xc7, 0xae, 0x59, 0x51, 0x97, 0x3d, 0xa0, 0x90, 0x55, 0x55, 0x91, + 0xae, 0x46, 0x79, 0x59, 0x47, 0x41, 0xc5, 0x59, 0x88, 0x2b, 0xd2, 0x6a, + 0x52, 0x6c, 0x47, 0x64, 0x5d, 0x37, 0x4d, 0x91, 0xba, 0xcb, 0xbe, 0xb7, + 0xb3, 0x3d, 0x44, 0x37, 0x71, 0x57, 0x3b, 0x7b, 0x4d, 0x5f, 0xb3, 0xba, + 0x40, 0xaa, 0xd0, 0xc6, 0x94, 0x6a, 0x72, 0xca, 0xc6, 0xe9, 0xae, 0xc4, + 0x44, 0x29, 0x42, 0x98, 0x4a, 0xba, 0x8e, 0xd3, 0x8d, 0xa6, 0x98, 0x6c, + 0x97, 0x4e, 0xb6, 0x9a, 0x98, 0x26, 0xc0, 0x6d, 0x5f, 0xaf, 0x7c, 0x57, + 0xc2, 0xa6, 0xa4, 0xb1, 0x7b, 0x66, 0x49, 0x63, 0x3e, 0x65, 0x91, 0x95, + 0xd4, 0xa9, 0x77, 0x5f, 0xc0, 0x6d, 0x50, 0x51, 0x60, 0x77, 0xb4, 0xac, + 0x4b, 0x5b, 0xaf, 0x5f, 0xc7, 0x43, 0xba, 0xb8, 0xca, 0xa8, 0x4d, 0x6a, + 0x7b, 0xc6, 0xc2, 0x8b, 0x7b, 0xb8, 0x45, 0x50, 0x60, 0xbc, 0x63, 0x61, + 0xbf, 0x84, 0xa9, 0x58, 0x30, 0x8e, 0x50, 0xc9, 0xc7, 0xc8, 0x37, 0x6d, + 0x55, 0x8b, 0xbd, 0xcc, 0x35, 0xbd, 0x4f, 0xa3, 0x3e, 0x5a, 0xb9, 0x6c, + 0x82, 0x94, 0x90, 0xc5, 0x9e, 0x65, 0x99, 0xd1, 0xb6, 0xd4, 0x2b, 0x3a, + 0x6b, 0xbf, 0xb3, 0xa2, 0x68, 0x54, 0x92, 0x6c, 0x79, 0x4e, 0x81, 0xa0, + 0x44, 0x38, 0xab, 0x97, 0xc6, 0x8a, 0xbf, 0x52, 0x79, 0xc7, 0x55, 0x9e, + 0xcd, 0xce, 0x5d, 0x54, 0xc2, 0x71, 0xbf, 0x4a, 0x6d, 0x92, 0xae, 0xc7, + 0xa8, 0x3d, 0x4a, 0x3b, 0x9f, 0x51, 0xcc, 0x75, 0x3d, 0x3d, 0x3e, 0xac, + 0xb0, 0xac, 0xa8, 0xcc, 0x32, 0x69, 0xc0, 0x7a, 0x9d, 0x66, 0xb8, 0x4c, + 0x5b, 0xcb, 0x4f, 0x5a, 0x4e, 0xb9, 0xcb, 0xbd, 0x4a, 0xbe, 0x39, 0x94, + 0x86, 0x63, 0xda, 0x3a, 0x36, 0x87, 0x82, 0x4d, 0x87, 0xb4, 0xd3, 0x65, + 0x96, 0xb0, 0x6a, 0x6f, 0xaa, 0x4f, 0xd0, 0x6d, 0x7e, 0x8e, 0xb3, 0x97, + 0x60, 0x7f, 0x74, 0x65, 0x69, 0xa7, 0x4a, 0x52, 0xd1, 0xa7, 0x6f, 0x61, + 0x36, 0x64, 0x96, 0x9e, 0x6c, 0x96, 0x4e, 0x46, 0x43, 0x85, 0x41, 0x96, + 0x53, 0xa2, 0xa9, 0xa6, 0xc1, 0xaf, 0x94, 0x4a, 0x5f, 0x3d, 0x98, 0xb2, + 0x5d, 0x87, 0xbe, 0xa1, 0xa6, 0x43, 0x92, 0x75, 0x54, 0x6d, 0x6b, 0x40, + 
0x37, 0x7d, 0x64, 0xb4, 0xb8, 0x3f, 0xb3, 0xb5, 0xb0, 0xbc, 0xd3, 0xba, + 0xcd, 0xd4, 0x4a, 0x7a, 0x45, 0xba, 0x6d, 0x90, 0xa9, 0x70, 0x75, 0xb5, + 0x5e, 0x78, 0x4f, 0x2f, 0x8a, 0x8e, 0x37, 0x6f, 0x50, 0x92, 0xcd, 0x44, + 0x8b, 0x95, 0x5e, 0x8d, 0x82, 0xa0, 0x50, 0x7e, 0x7c, 0x88, 0xa3, 0x9f, + 0x71, 0x79, 0xb7, 0x86, 0x3e, 0x83, 0x9d, 0x4c, 0x6b, 0xac, 0xae, 0xa0, + 0x32, 0xcd, 0x7e, 0x43, 0x69, 0x4e, 0xdc, 0x75, 0xd3, 0x6f, 0x67, 0x8e, + 0xbd, 0xb3, 0x4e, 0x49, 0x72, 0x7c, 0xc3, 0x92, 0x7d, 0x35, 0x9f, 0x7d, + 0x94, 0xd3, 0x6b, 0x76, 0xcd, 0x59, 0x94, 0xc8, 0x9b, 0x42, 0xa3, 0x86, + 0x94, 0x9e, 0xba, 0xc5, 0x85, 0x77, 0x9c, 0x76, 0x7d, 0x34, 0x26, 0x3e, + 0x5d, 0x4f, 0x5c, 0x39, 0x5d, 0xaa, 0x59, 0x82, 0xbf, 0x9b, 0x79, 0xa5, + 0x6f, 0xb1, 0xb3, 0x8f, 0x33, 0x8f, 0x5f, 0x7f, 0xb1, 0x6b, 0x63, 0x76, + 0x4b, 0x7b, 0xa4, 0x73, 0xa9, 0xbc, 0x6a, 0xcc, 0x7a, 0xd8, 0x7f, 0x46, + 0x6c, 0xae, 0xc1, 0x4d, 0x9a, 0x3c, 0x5b, 0x44, 0x77, 0xbe, 0xcb, 0x84, + 0x94, 0x33, 0x5f, 0xbe, 0xa6, 0xa6, 0xbd, 0xa7, 0xd1, 0xb8, 0x99, 0x8d, + 0x83, 0x4d, 0x6e, 0x49, 0x31, 0xca, 0xbd, 0x57, 0x7c, 0x8e, 0x58, 0x7c, + 0xb3, 0xbe, 0xc9, 0x59, 0x99, 0x48, 0x46, 0xa0, 0xb6, 0x59, 0x4b, 0xb9, + 0x8b, 0x8e, 0x46, 0x6b, 0x79, 0x8c, 0x92, 0x85, 0x68, 0x7d, 0x31, 0x67, + 0xa8, 0xd1, 0xe7, 0x48, 0x90, 0xde, 0x6e, 0xd4, 0x5a, 0x40, 0x65, 0x52, + 0x51, 0xa0, 0xa7, 0x75, 0xe4, 0xd0, 0xbb, 0x3c, 0x89, 0xa4, 0x42, 0xaf, + 0xb7, 0x33, 0x63, 0x46, 0x8d, 0xbc, 0x31, 0x39, 0x41, 0x60, 0xbc, 0x51, + 0x36, 0x91, 0x90, 0x47, 0x98, 0xbc, 0x9f, 0xbf, 0xd8, 0xa9, 0x3c, 0x92, + 0x36, 0xd5, 0x92, 0x91, 0xb6, 0x33, 0xcf, 0xb0, 0xca, 0x7a, 0x46, 0x57, + 0x50, 0xbb, 0xb4, 0x77, 0x88, 0x91, 0xa9, 0x6d, 0xd0, 0x5e, 0xb0, 0x7e, + 0xb6, 0x72, 0x56, 0x7b, 0x4a, 0x52, 0xcc, 0xce, 0x9d, 0x33, 0xa8, 0x2e, + 0xd5, 0x90, 0x6a, 0x68, 0x54, 0x4f, 0x4e, 0x48, 0xc6, 0x47, 0x4a, 0x55, + 0x89, 0x4b, 0x4b, 0xc3, 0x2f, 0x7d, 0x85, 0x53, 0xbf, 0x3c, 0xd2, 0x4e, + 0xb7, 0x9e, 0x8a, 0x7e, 0x71, 0xc7, 0x44, 0x5b, 0x2f, 0x5b, 0x88, 0x83, + 0x9b, 0xc6, 0x42, 0xad, 0xa4, 0x90, 0x8c, 0x55, 0x9d, 0x71, 0xd4, 0x94, + 0x76, 0xc7, 0x89, 0x6f, 0xc8, 0x98, 0x8b, 0x5f, 0x5a, 0x9d, 0x42, 0x5a, + 0x91, 0xb8, 0xc0, 0x47, 0x30, 0x50, 0xd3, 0xc8, 0x9e, 0xad, 0x8a, 0xbb, + 0x7d, 0x3a, 0x37, 0x6a, 0x7f, 0x3c, 0x3d, 0x46, 0x83, 0xda, 0x57, 0x9f, + 0x67, 0x93, 0x4a, 0x56, 0x65, 0xab, 0x9a, 0xa2, 0x2e, 0x55, 0x7a, 0x66, + 0x4c, 0x64, 0xab, 0x39, 0x5d, 0xbb, 0x55, 0x5d, 0xb5, 0xb9, 0x3c, 0x43, + 0x7e, 0xd6, 0x36, 0x60, 0x8b, 0xd4, 0xc0, 0xc0, 0x79, 0x60, 0x3a, 0xd3, + 0xba, 0xa1, 0xd2, 0xb7, 0xab, 0xbb, 0x57, 0x77, 0xbc, 0xb1, 0x8f, 0x31, + 0xb6, 0xc8, 0xca, 0x45, 0xaa, 0x8a, 0x51, 0xab, 0xc1, 0xcc, 0x4b, 0x58, + 0x8a, 0x68, 0x63, 0xa4, 0xa9, 0x61, 0xae, 0x66, 0x4b, 0x6c, 0x42, 0xc5, + 0x53, 0x40, 0x48, 0x5d, 0xd1, 0x31, 0xbf, 0x76, 0x6f, 0x7d, 0xa6, 0x72, + 0xcc, 0x75, 0x55, 0xba, 0xa5, 0xbc, 0xc8, 0xbb, 0xb3, 0x84, 0xcb, 0x71, + 0x8c, 0x87, 0x76, 0x59, 0x93, 0xd1, 0x90, 0xc0, 0x51, 0x35, 0x77, 0xb4, + 0x65, 0xbe, 0x56, 0x30, 0xb7, 0x48, 0x56, 0x80, 0x4a, 0x80, 0xaf, 0x9e, + 0xa1, 0xb7, 0x45, 0xcc, 0x96, 0x37, 0xc0, 0x87, 0x75, 0x42, 0x89, 0xaa, + 0xa3, 0x83, 0x6c, 0x74, 0x3e, 0x5e, 0x85, 0x50, 0xd8, 0x38, 0x9f, 0x3a, + 0x70, 0x5a, 0x48, 0x90, 0xb4, 0xb2, 0xd9, 0x96, 0xbe, 0x6c, 0x93, 0x64, + 0x5b, 0xcf, 0x83, 0x9a, 0x4d, 0x3f, 0xd3, 0x3e, 0x97, 0xe4, 0x7f, 0xd3, + 0xe6, 0x86, 0xb9, 0x74, 0x59, 0x8e, 0x76, 0x6d, 0xd1, 0xa7, 0x45, 0x29, + 0x91, 0x86, 0x81, 0xbc, 0xae, 0x7a, 0xd5, 0x41, 0x4e, 0x94, 0xc2, 0x48, + 0xcc, 0x42, 0x7b, 0x62, 0x9b, 0x24, 0xae, 0xab, 0x47, 0x65, 0x95, 0xca, + 
0x90, 0xc3, 0x4e, 0x87, 0xc8, 0xd4, 0x34, 0xcb, 0xba, 0x5e, 0xae, 0xc2, + 0x3b, 0xc8, 0xa8, 0xaf, 0xb5, 0x90, 0x41, 0x6e, 0xc2, 0x64, 0xc3, 0x32, + 0x59, 0x9c, 0x7b, 0xa4, 0x34, 0x64, 0x60, 0xbc, 0xa6, 0x42, 0xa4, 0xae, + 0x8a, 0x90, 0x50, 0xa7, 0x50, 0x90, 0x4c, 0x49, 0xb6, 0x38, 0x38, 0x3b, + 0x61, 0x53, 0x36, 0xac, 0xcd, 0x82, 0x8a, 0xa7, 0xa5, 0xb5, 0x9e, 0xd0, + 0x41, 0xa2, 0x69, 0x47, 0xba, 0x47, 0xca, 0x77, 0x95, 0xbe, 0x58, 0xae, + 0x3e, 0x70, 0x5b, 0x3d, 0x77, 0x55, 0x45, 0xc4, 0x3a, 0x78, 0x8c, 0x86, + 0x46, 0x4f, 0x32, 0x84, 0xab, 0x62, 0x70, 0x8a, 0x62, 0x82, 0x9f, 0x9d, + 0x53, 0xbc, 0x9a, 0x6a, 0xc5, 0x76, 0xc0, 0x69, 0x3d, 0xce, 0xbd, 0xc2, + 0xd5, 0x3e, 0x46, 0xb5, 0xa6, 0x84, 0x3b, 0x5b, 0x5f, 0x79, 0xc7, 0x5a, + 0xbb, 0x43, 0xa8, 0x99, 0x4e, 0xb1, 0xb3, 0xbe, 0x94, 0x4b, 0xac, 0xa4, + 0x4f, 0x72, 0x76, 0xb9, 0x9b, 0x4b, 0xc9, 0x5a, 0x50, 0x60, 0x7a, 0x8d, + 0x3b, 0x94, 0x8d, 0x59, 0x5e, 0x65, 0x5a, 0x6b, 0xb6, 0x68, 0xa7, 0x89, + 0x8d, 0x64, 0x6d, 0x8f, 0x40, 0x80, 0x7c, 0xb1, 0x81, 0xc5, 0xc1, 0xa6, + 0x91, 0x80, 0x35, 0x3a, 0x7b, 0xcc, 0x58, 0x45, 0x3d, 0x7b, 0xc1, 0x90, + 0x45, 0xc4, 0x6d, 0xa2, 0x3d, 0xcb, 0xa2, 0x3e, 0x2f, 0xac, 0x8e, 0x43, + 0x92, 0x77, 0x96, 0xc6, 0x88, 0xad, 0x57, 0x46, 0xcf, 0x55, 0xaf, 0x6f, + 0xcc, 0x4b, 0x33, 0x51, 0xa5, 0xa7, 0x93, 0xa5, 0x38, 0x76, 0x63, 0x82, + 0x63, 0x34, 0x5a, 0xb4, 0x5b, 0x8a, 0xb4, 0x3e, 0x9b, 0x63, 0xc1, 0x87, + 0xe0, 0x82, 0x55, 0xae, 0x5a, 0x2c, 0x4d, 0xb0, 0xaa, 0x51, 0xcd, 0x5c, + 0x7b, 0x41, 0x59, 0x76, 0x7a, 0x99, 0x52, 0x79, 0x32, 0x3b, 0xd8, 0x3c, + 0x35, 0x90, 0x82, 0x94, 0xd3, 0xa5, 0x5f, 0x48, 0x6e, 0xa2, 0x4f, 0x61, + 0x53, 0xa2, 0x58, 0xa3, 0x84, 0x53, 0x4e, 0x7b, 0x4b, 0x99, 0xaf, 0x4d, + 0xbe, 0x44, 0x39, 0x99, 0x68, 0x60, 0xd6, 0xb5, 0x92, 0xa1, 0xba, 0x55, + 0x48, 0x49, 0x73, 0x7d, 0x3e, 0x44, 0xbb, 0xad, 0x83, 0xb3, 0x3f, 0x4e, + 0x69, 0x8a, 0xca, 0x87, 0xb8, 0x3f, 0x47, 0x64, 0xb9, 0x6b, 0x43, 0xc6, + 0xac, 0x3c, 0x5c, 0x6b, 0x9e, 0xa6, 0x75, 0x60, 0x3b, 0x90, 0x5b, 0x98, + 0x73, 0x49, 0x32, 0x69, 0x9f, 0x42, 0xb6, 0xb2, 0x89, 0x86, 0x46, 0x49, + 0x42, 0xd7, 0xd5, 0xe0, 0x92, 0x70, 0xba, 0x26, 0x9d, 0x7a, 0x37, 0xba, + 0xa1, 0xc0, 0x4d, 0xc1, 0xba, 0xb1, 0x79, 0x5b, 0x46, 0xa7, 0xb8, 0x3a, + 0x27, 0x7e, 0xa0, 0xd7, 0x3a, 0xa8, 0x6e, 0x78, 0x3d, 0x8c, 0x4d, 0x85, + 0xd2, 0x9c, 0x74, 0x70, 0x9c, 0x47, 0x31, 0xc4, 0x86, 0x52, 0x4c, 0x56, + 0x89, 0x4a, 0x75, 0x74, 0x5c, 0x38, 0xbe, 0xad, 0xc1, 0xd3, 0x81, 0x4c, + 0x45, 0xba, 0x59, 0xbf, 0x98, 0x3e, 0x6b, 0x67, 0x75, 0x53, 0xb9, 0x7f, + 0x88, 0x58, 0x69, 0xaf, 0xbd, 0x45, 0x71, 0x6f, 0x59, 0xce, 0xbf, 0x7f, + 0x4c, 0x83, 0xc9, 0x86, 0x7b, 0xd3, 0x9e, 0x58, 0x4a, 0x47, 0x4c, 0x70, + 0x75, 0x81, 0x6e, 0x8b, 0x62, 0xc4, 0x3f, 0xcb, 0xb5, 0xd6, 0x36, 0x78, + 0x9f, 0x80, 0x2d, 0x67, 0x6d, 0x3e, 0x8f, 0x67, 0x3e, 0xc0, 0xcf, 0xce, + 0xa6, 0x48, 0x82, 0x4e, 0x44, 0xd1, 0x3f, 0x7b, 0x6c, 0x86, 0x89, 0x73, + 0x6e, 0xd5, 0x56, 0xb9, 0x87, 0xb1, 0x4a, 0xce, 0x4e, 0x94, 0xd0, 0xb6, + 0x67, 0x3d, 0x73, 0xb7, 0xa1, 0xc6, 0x44, 0x51, 0xba, 0xa1, 0xcb, 0xae, + 0x83, 0xba, 0x6e, 0x89, 0x74, 0x63, 0x7a, 0xbe, 0x82, 0x2d, 0xbc, 0x67, + 0xa9, 0xc1, 0x9e, 0x67, 0x79, 0x6b, 0xd4, 0xa7, 0x82, 0x90, 0xcc, 0xa9, + 0x7a, 0xd5, 0x62, 0x7c, 0xa5, 0x42, 0xb2, 0x2c, 0x9f, 0x4f, 0xbc, 0xae, + 0x6f, 0x61, 0xd8, 0xc1, 0x97, 0xc2, 0x54, 0x76, 0x8b, 0xc8, 0x83, 0x99, + 0x63, 0x37, 0x87, 0x78, 0x9b, 0xda, 0xa6, 0x9b, 0xc4, 0x73, 0xa2, 0xb8, + 0xb4, 0x73, 0xc1, 0x8d, 0x6e, 0x2d, 0x34, 0x7b, 0xd5, 0xa6, 0x8e, 0x82, + 0x65, 0xaa, 0x54, 0xab, 0x37, 0x6b, 0x40, 0x80, 0x84, 0x9d, 0xaf, 0x59, + 
0x4d, 0x97, 0x6a, 0xcd, 0x58, 0xa9, 0x84, 0x68, 0x90, 0x53, 0x7f, 0x54, + 0x5d, 0xac, 0xd9, 0xa6, 0x73, 0x91, 0xcc, 0x6d, 0x2a, 0xae, 0x50, 0xa3, + 0xac, 0xc1, 0x61, 0xa4, 0x5a, 0xb5, 0x8e, 0xbd, 0xd1, 0xa6, 0x74, 0x77, + 0x75, 0x82, 0x5e, 0x9c, 0x51, 0x5a, 0x71, 0xa4, 0x3c, 0x87, 0xb6, 0x41, + 0x6b, 0x7f, 0xc8, 0x8c, 0x3e, 0x78, 0x52, 0x49, 0xa3, 0x8f, 0xdc, 0x42, + 0xa1, 0xb2, 0xd0, 0xb9, 0x78, 0xba, 0x93, 0x96, 0x75, 0xc8, 0x7d, 0xa6, + 0xb6, 0x8f, 0x2a, 0x37, 0xa0, 0x4d, 0x3e, 0xb9, 0x46, 0xc8, 0xbb, 0xaa, + 0xa7, 0x4c, 0xae, 0x6a, 0x7a, 0x7a, 0x79, 0x8b, 0xb1, 0x7a, 0xd6, 0x7a, + 0xad, 0xc8, 0x7b, 0xba, 0x99, 0x9b, 0x5e, 0xad, 0x5f, 0xd4, 0xcb, 0x9d, + 0x4f, 0x44, 0xcb, 0x63, 0x87, 0xa9, 0xd2, 0x49, 0x31, 0xbd, 0x89, 0x91, + 0x74, 0xc4, 0x82, 0x4c, 0x86, 0x9e, 0xd6, 0x40, 0x4d, 0x70, 0x3c, 0x8c, + 0x9a, 0x5f, 0xac, 0xd3, 0x73, 0x4d, 0xbd, 0x9a, 0x59, 0xbe, 0x63, 0x63, + 0xc5, 0x85, 0x3a, 0x35, 0x8d, 0x36, 0x78, 0x78, 0x9e, 0xb5, 0x44, 0xca, + 0x44, 0x56, 0xa7, 0x6d, 0x62, 0x7c, 0xa9, 0xb5, 0xc1, 0x79, 0x95, 0x6e, + 0x60, 0xb0, 0xae, 0x3d, 0x6c, 0x71, 0x52, 0x9c, 0x66, 0x4e, 0xbb, 0x33, + 0xd3, 0x9b, 0x49, 0x97, 0x42, 0x42, 0x31, 0x5c, 0xc0, 0xd5, 0xa4, 0x5f, + 0xa9, 0x44, 0xb8, 0xc4, 0xb4, 0xd2, 0xb5, 0x5e, 0x60, 0xbe, 0x5b, 0x34, + 0x56, 0x69, 0x2e, 0x79, 0xda, 0x95, 0x82, 0x76, 0x7a, 0xa3, 0xaa, 0xa5, + 0xb2, 0xab, 0xbe, 0x57, 0x50, 0x41, 0x9b, 0x4e, 0x79, 0x83, 0xc8, 0x90, + 0xb0, 0x9a, 0x8e, 0xa0, 0x75, 0x67, 0x78, 0x4e, 0x42, 0xa5, 0x69, 0xb2, + 0x62, 0x50, 0xbc, 0x6d, 0x93, 0x91, 0xbe, 0x95, 0x79, 0x42, 0x6e, 0x7f, + 0xb0, 0x55, 0x38, 0x53, 0x70, 0xc6, 0xb3, 0x91, 0x6b, 0x9d, 0xa1, 0xdf, + 0x87, 0x70, 0x7d, 0x57, 0xbd, 0x52, 0x4a, 0x68, 0x41, 0x59, 0x8d, 0xc2, + 0x63, 0x61, 0x6e, 0x83, 0x57, 0x9c, 0x7a, 0xa7, 0xc2, 0xa8, 0x39, 0x86, + 0x32, 0x52, 0x98, 0x40, 0x49, 0x54, 0xbc, 0xb9, 0x53, 0xbb, 0x8a, 0x42, + 0xcd, 0x6c, 0xc5, 0x92, 0xbd, 0x56, 0xb2, 0x35, 0x6c, 0x2f, 0x47, 0xbd, + 0x32, 0x52, 0x98, 0x3e, 0xbc, 0xad, 0x59, 0x77, 0xa3, 0x5a, 0x48, 0x76, + 0xc6, 0x61, 0x3b, 0x7b, 0xa6, 0x62, 0x95, 0x88, 0x4c, 0x91, 0x46, 0x78, + 0x88, 0xbb, 0x36, 0x50, 0x86, 0xa9, 0x49, 0xb7, 0x8a, 0x90, 0x66, 0x96, + 0xc8, 0x77, 0x60, 0x46, 0xa0, 0x50, 0xc5, 0x9b, 0x6e, 0x87, 0xcb, 0x3b, + 0x8e, 0x6a, 0x8b, 0x88, 0x6a, 0x77, 0x44, 0x81, 0xa6, 0xb3, 0x48, 0x8e, + 0xab, 0x69, 0x36, 0x7e, 0xa5, 0xb6, 0x95, 0xa0, 0x4d, 0x7f, 0xd5, 0xca, + 0x41, 0x6e, 0x7f, 0xa5, 0xd4, 0xb9, 0x6c, 0xe1, 0xd9, 0x3e, 0xc3, 0x47, + 0x9e, 0x5a, 0x49, 0x74, 0xc6, 0x32, 0x9a, 0x97, 0x83, 0x9e, 0x46, 0xb1, + 0xa9, 0xa7, 0x6d, 0x63, 0x57, 0x91, 0x81, 0x87, 0x63, 0x74, 0x44, 0xa5, + 0x40, 0xb4, 0xa6, 0x48, 0x8f, 0x42, 0x59, 0x63, 0x42, 0xd5, 0xa0, 0x56, + 0xc4, 0xae, 0xb4, 0x3d, 0x89, 0x87, 0x84, 0x38, 0x75, 0x57, 0xa2, 0xc0, + 0x4a, 0x33, 0x65, 0x61, 0x59, 0x3d, 0x67, 0xaf, 0x92, 0x5d, 0x62, 0x87, + 0x47, 0x51, 0x92, 0xa3, 0xcb, 0x5b, 0x72, 0x77, 0x6f, 0x74, 0xcc, 0x84, + 0x61, 0xa0, 0x83, 0x6d, 0x75, 0x6a, 0x95, 0x94, 0x7e, 0x5d, 0x4b, 0xb9, + 0xd3, 0xc0, 0xcc, 0xb2, 0xa4, 0x5f, 0xe1, 0x5a, 0x69, 0xb8, 0x93, 0xb8, + 0x36, 0x80, 0xac, 0x36, 0xb9, 0x5f, 0xa1, 0x66, 0x9b, 0x9d, 0x9b, 0x60, + 0x49, 0x53, 0x58, 0xcc, 0x86, 0x70, 0x6b, 0x6c, 0xce, 0x45, 0x5e, 0x84, + 0x87, 0x84, 0x97, 0x85, 0x84, 0x48, 0xce, 0xb8, 0x51, 0xb3, 0x70, 0xc4, + 0xa3, 0x54, 0x6b, 0x5a, 0xa0, 0x8f, 0xc8, 0x60, 0x49, 0xb7, 0xc6, 0x3e, + 0x3c, 0x96, 0xc7, 0xa4, 0xb5, 0x4a, 0x3c, 0x98, 0x34, 0x4f, 0xd5, 0x4d, + 0x5e, 0x77, 0xa0, 0x90, 0xab, 0x81, 0x96, 0x68, 0x76, 0x57, 0x49, 0x94, + 0x4c, 0x5a, 0x71, 0x40, 0x41, 0x80, 0x8b, 0xd2, 0x47, 0xc9, 0xb5, 0x81, + 
0xa0, 0xbc, 0x57, 0xc1, 0x87, 0xae, 0xbb, 0x2c, 0x70, 0x71, 0xb5, 0x43, + 0xbd, 0x42, 0xba, 0xc4, 0xcf, 0x38, 0x4f, 0x33, 0x7f, 0x85, 0x3c, 0xb4, + 0xc2, 0xd8, 0x8f, 0x41, 0xcf, 0x94, 0x3f, 0x4c, 0x89, 0x68, 0xc4, 0x36, + 0x4a, 0xd4, 0x96, 0x65, 0xb8, 0x9b, 0x98, 0x8e, 0x4e, 0xd3, 0x63, 0x76, + 0x45, 0x31, 0x83, 0xc6, 0xae, 0x65, 0x66, 0x89, 0x6c, 0x49, 0x5a, 0xb3, + 0xdc, 0x9c, 0x5d, 0xd5, 0x50, 0x9c, 0x93, 0x4d, 0x45, 0x84, 0x5a, 0x87, + 0xc8, 0xb5, 0x8d, 0xd5, 0xd4, 0x51, 0x3e, 0xd4, 0xa8, 0x66, 0x4d, 0xcf, + 0xcc, 0x43, 0x85, 0x3f, 0x93, 0x7a, 0xb7, 0xc2, 0x49, 0x81, 0x9e, 0xcd, + 0x2b, 0x7e, 0x56, 0x50, 0x51, 0x69, 0x89, 0x57, 0x9a, 0x48, 0x9c, 0x43, + 0xb2, 0xbb, 0x46, 0xa7, 0xce, 0xbf, 0x3c, 0x4d, 0xba, 0x64, 0x66, 0x62, + 0xb5, 0x79, 0xbc, 0xb3, 0x97, 0x44, 0x3e, 0xbc, 0xb0, 0xb3, 0x81, 0xb7, + 0xb7, 0x91, 0xb4, 0xd6, 0x9c, 0x67, 0xbe, 0x67, 0x76, 0x3e, 0x5a, 0x3c, + 0xb0, 0x68, 0x36, 0x94, 0xbb, 0x43, 0x42, 0xa1, 0x81, 0xaf, 0x94, 0x38, + 0x9a, 0xc6, 0xac, 0xa0, 0xac, 0xc4, 0x91, 0x99, 0xa4, 0xd3, 0x5e, 0x46, + 0x5f, 0x51, 0x2d, 0x3a, 0x5f, 0x56, 0x50, 0x65, 0x3d, 0xc7, 0x3b, 0xbb, + 0x64, 0xc2, 0x6c, 0x7c, 0xaa, 0xc3, 0xdb, 0xab, 0x52, 0xb8, 0xb4, 0xc8, + 0xb6, 0x5a, 0x7e, 0x2e, 0xc2, 0x7e, 0x66, 0x2e, 0xca, 0xcd, 0x7a, 0x4c, + 0xb2, 0x85, 0xa6, 0x65, 0x6c, 0x53, 0x8e, 0xc0, 0x75, 0xd0, 0x85, 0x5a, + 0x55, 0xaf, 0x7c, 0x5e, 0xb3, 0xb0, 0x75, 0x64, 0xab, 0x72, 0x80, 0xc5, + 0x60, 0xbc, 0x8b, 0x9e, 0x7b, 0x88, 0x73, 0x43, 0xd0, 0xab, 0x8d, 0x72, + 0x5a, 0xcb, 0xcf, 0xdf, 0x3f, 0x47, 0x90, 0xab, 0xc5, 0x95, 0x94, 0xad, + 0x92, 0xb0, 0xc6, 0x39, 0x7f, 0xa3, 0x58, 0x35, 0xb7, 0x94, 0x89, 0x58, + 0xa4, 0x79, 0x4e, 0xbe, 0x6d, 0x42, 0xcd, 0xb1, 0x66, 0x93, 0x74, 0x46, + 0x5d, 0x57, 0x54, 0x35, 0xcf, 0x88, 0x63, 0x62, 0xa4, 0xa5, 0x98, 0xa1, + 0xa8, 0x3e, 0x8f, 0xce, 0x7c, 0x49, 0x6c, 0xc7, 0xcb, 0xbe, 0x8a, 0x31, + 0x4b, 0xc7, 0x38, 0xb7, 0x8e, 0x8a, 0x46, 0x3b, 0xbf, 0x40, 0x6e, 0x49, + 0xc1, 0x9d, 0xc7, 0x4e, 0x8f, 0x98, 0xa9, 0xc9, 0x5c, 0x81, 0xc7, 0x5b, + 0x7d, 0x45, 0x3c, 0x9d, 0x74, 0xaa, 0xb0, 0x33, 0x9b, 0x77, 0x76, 0x86, + 0xaa, 0x6d, 0xc6, 0x3c, 0x46, 0x96, 0x85, 0xce, 0xac, 0xbd, 0xac, 0x40, + 0xac, 0x70, 0xc5, 0x5b, 0xa6, 0xa9, 0xc5, 0xa7, 0x3e, 0xae, 0xbc, 0x77, + 0xbb, 0x91, 0x65, 0xc0, 0x7b, 0x67, 0x41, 0x75, 0xc6, 0x34, 0xb9, 0x9f, + 0x94, 0x5a, 0xaa, 0xab, 0x46, 0xbf, 0x90, 0xaf, 0x46, 0x85, 0xba, 0xbb, + 0x4d, 0x72, 0xb1, 0x53, 0xae, 0xb2, 0xd4, 0x48, 0xb4, 0x3c, 0xc2, 0x64, + 0x33, 0x40, 0x32, 0x4b, 0xcf, 0xa7, 0xa4, 0xd2, 0xa4, 0x9e, 0x8c, 0x6c, + 0xbc, 0x73, 0xc5, 0xc5, 0x77, 0x86, 0x60, 0x94, 0xc7, 0xa9, 0x73, 0x49, + 0x8c, 0x9d, 0x82, 0x5c, 0xb6, 0x4f, 0xb1, 0xd0, 0x7a, 0x8b, 0x44, 0x58, + 0x91, 0xd0, 0xae, 0x6f, 0xc9, 0x5a, 0x86, 0x42, 0xaa, 0x34, 0x44, 0x80, + 0x90, 0x87, 0xac, 0x91, 0x4e, 0x88, 0x31, 0x76, 0x81, 0xa5, 0x55, 0xc5, + 0x68, 0xa0, 0xc1, 0xa9, 0x38, 0x58, 0xb3, 0x82, 0xaa, 0x38, 0xb0, 0x6f, + 0x9d, 0x36, 0x69, 0x52, 0x91, 0x44, 0x57, 0xc1, 0xc9, 0x7e, 0xae, 0xa5, + 0x4c, 0xbe, 0xc6, 0xb1, 0x58, 0x86, 0x48, 0x57, 0x73, 0x77, 0xc1, 0x8f, + 0x64, 0x4e, 0xbf, 0xb1, 0x51, 0xa7, 0x75, 0x98, 0x9c, 0x97, 0x48, 0x7a, + 0xc4, 0x61, 0x36, 0xa4, 0xb1, 0xbf, 0x6f, 0x88, 0x87, 0x67, 0x37, 0x46, + 0x5d, 0xa1, 0x64, 0xcc, 0xb3, 0x50, 0x65, 0x36, 0xa3, 0x84, 0xcd, 0xa8, + 0x84, 0xca, 0x58, 0xc6, 0x8f, 0xcc, 0x34, 0xa1, 0x44, 0x91, 0x90, 0xc2, + 0xbe, 0xa7, 0x71, 0x76, 0x61, 0xb5, 0x97, 0x6e, 0x78, 0x90, 0x7e, 0x3b, + 0x7a, 0x38, 0x92, 0x8a, 0x5a, 0x51, 0xa5, 0xce, 0x3b, 0xae, 0xac, 0x8b, + 0xa4, 0x6e, 0xab, 0x4a, 0x82, 0xbb, 0x5e, 0x87, 0x65, 0x3a, 0x81, 0xa0, + 
0x4f, 0x68, 0x76, 0x31, 0x45, 0xa5, 0x36, 0x82, 0x70, 0x73, 0x68, 0x92, + 0x83, 0x8b, 0xbb, 0xb5, 0x5c, 0x39, 0x99, 0x35, 0x92, 0x48, 0x9b, 0x68, + 0xbc, 0x96, 0x67, 0xb0, 0x99, 0x6f, 0x5b, 0x30, 0x87, 0x6f, 0xa6, 0xc6, + 0xad, 0x83, 0xd5, 0xaa, 0x85, 0x52, 0x5b, 0x53, 0x57, 0x55, 0x90, 0x8e, + 0xb1, 0xa9, 0xa5, 0xc6, 0x66, 0x51, 0x8e, 0xb7, 0xae, 0x82, 0xcd, 0x2f, + 0x77, 0x3c, 0x54, 0xb4, 0x9d, 0x47, 0x3c, 0x3e, 0x4a, 0x31, 0x55, 0x95, + 0xa3, 0x5c, 0xc0, 0x45, 0xbb, 0x94, 0x56, 0x98, 0xac, 0x3c, 0xc8, 0xcf, + 0x84, 0x7d, 0x49, 0x41, 0x69, 0x8a, 0xb8, 0xb2, 0x92, 0x6a, 0xce, 0xca, + 0x84, 0x3b, 0x3c, 0xb2, 0x36, 0x37, 0x6b, 0x70, 0xb1, 0x47, 0x76, 0x57, + 0xac, 0x88, 0x72, 0x3b, 0x98, 0x33, 0x50, 0x4d, 0x74, 0x66, 0xbc, 0x87, + 0xb8, 0xc5, 0xcd, 0x8a, 0x59, 0x6f, 0x89, 0xb6, 0x68, 0x52, 0xc2, 0x84, + 0x81, 0x79, 0xd7, 0xbb, 0xbe, 0x8b, 0x3a, 0x99, 0x4f, 0x7a, 0x4f, 0x84, + 0x44, 0xae, 0xb8, 0x77, 0xc3, 0x6a, 0x3e, 0xa2, 0xa1, 0x3d, 0xa5, 0x31, + 0x62, 0x60, 0x68, 0x4f, 0x5a, 0xa0, 0x43, 0x31, 0xc6, 0x71, 0x5f, 0xbb, + 0x32, 0x40, 0x41, 0x5d, 0x92, 0x58, 0x42, 0x5e, 0xcc, 0x64, 0x33, 0x59, + 0xa7, 0x3b, 0xb5, 0x3f, 0x91, 0xad, 0x3e, 0x47, 0x9f, 0x67, 0x6b, 0xb7, + 0x8e, 0x72, 0x60, 0x87, 0x7d, 0x66, 0x78, 0x97, 0x7d, 0x71, 0x7b, 0xa0, + 0x77, 0x5a, 0x3a, 0xba, 0x6f, 0x30, 0x89, 0x6d, 0x7d, 0xaf, 0x82, 0xd1, + 0x8e, 0x99, 0x48, 0xc3, 0x8c, 0x48, 0x5b, 0x66, 0x36, 0xb6, 0x7d, 0x62, + 0x66, 0xd5, 0x91, 0xb3, 0x40, 0xd1, 0x42, 0x4f, 0xce, 0x65, 0x3a, 0x85, + 0x3a, 0xb4, 0xd0, 0x66, 0x44, 0xb0, 0x9f, 0xd3, 0xc5, 0x73, 0x47, 0x5a, + 0x86, 0xaf, 0x4d, 0x75, 0xae, 0x47, 0x6e, 0x87, 0x95, 0x9f, 0xc0, 0xc9, + 0x3c, 0x8f, 0x76, 0xb0, 0x4c, 0x8e, 0x46, 0x79, 0x97, 0x4b, 0xc6, 0xcb, + 0x3e, 0x6e, 0xbb, 0x4f, 0x43, 0x6b, 0x36, 0xd0, 0xc3, 0x39, 0x97, 0x9d, + 0x36, 0x67, 0x5c, 0x9a, 0xa5, 0x6f, 0x64, 0x86, 0xa0, 0x38, 0xd0, 0x6e, + 0x6b, 0xc8, 0x5d, 0x9e, 0xc7, 0xa6, 0xb3, 0x91, 0xba, 0xbc, 0x93, 0x32, + 0x5a, 0x48, 0x5d, 0x9f, 0x35, 0xc0, 0x4d, 0xbe, 0x3f, 0x5c, 0x75, 0x47, + 0x4f, 0xad, 0xb4, 0x73, 0xac, 0x43, 0x6b, 0x96, 0x37, 0x96, 0x38, 0x4d, + 0xc9, 0x61, 0xa6, 0x93, 0x3b, 0xac, 0x5b, 0x44, 0x5e, 0xb8, 0x46, 0xa5, + 0x3d, 0xa6, 0x50, 0x46, 0xa9, 0x56, 0x8b, 0xbb, 0xcc, 0x48, 0x33, 0xa1, + 0x64, 0x69, 0x81, 0xa4, 0x79, 0x69, 0xd2, 0x62, 0x76, 0xa7, 0x89, 0x3a, + 0x2d, 0xac, 0x5c, 0x51, 0xcc, 0x82, 0xd2, 0x78, 0x81, 0x87, 0xd2, 0x4a, + 0x5b, 0xa0, 0x4d, 0x5c, 0xb4, 0x3d, 0x9e, 0xa5, 0x59, 0x4a, 0x33, 0x5e, + 0x62, 0xb6, 0x9b, 0xa1, 0xc5, 0x43, 0xa1, 0x4b, 0xaf, 0x69, 0x4e, 0x64, + 0x97, 0x4a, 0x8f, 0x72, 0xaa, 0xc4, 0x57, 0x94, 0x40, 0x86, 0xc7, 0x9b, + 0xab, 0xa2, 0x84, 0x38, 0x50, 0xb6, 0x86, 0x99, 0x53, 0x4c, 0x8d, 0x8c, + 0x7d, 0xd3, 0x82, 0xa2, 0x46, 0x3d, 0x40, 0x7b, 0xd1, 0x78, 0xae, 0x90, + 0x73, 0xbf, 0xcc, 0xa9, 0x32, 0xb9, 0xbb, 0x6e, 0x8e, 0x46, 0xa0, 0x9f, + 0x9e, 0x2d, 0x65, 0xb8, 0xb9, 0x92, 0x69, 0x58, 0xcd, 0x47, 0xd2, 0x3a, + 0x5c, 0xb5, 0x9d, 0x77, 0x5e, 0x62, 0xa9, 0x9c, 0xd0, 0xa3, 0x5e, 0x77, + 0x61, 0x3f, 0x65, 0x6e, 0x51, 0x3f, 0x51, 0x91, 0x3f, 0xcf, 0xa2, 0xb8, + 0xcc, 0x3c, 0x91, 0x7a, 0x5b, 0x69, 0x63, 0x48, 0x6d, 0x51, 0x96, 0xd0, + 0x94, 0x76, 0x67, 0xc8, 0x63, 0x8a, 0x71, 0x38, 0x4b, 0x8e, 0xc9, 0xc0, + 0x39, 0x9d, 0x83, 0x3d, 0x9b, 0x8c, 0xbd, 0xbf, 0x59, 0xb8, 0x9c, 0x32, + 0xc8, 0x5c, 0x8a, 0xb9, 0x31, 0x40, 0xce, 0x50, 0xb2, 0x54, 0x6d, 0x4a, + 0x3d, 0x97, 0x7f, 0x96, 0x35, 0x94, 0x5a, 0x4b, 0x94, 0x47, 0x60, 0x6c, + 0x6c, 0xc2, 0xd3, 0x6d, 0x5c, 0x97, 0xa8, 0x8d, 0xd7, 0x90, 0x52, 0x90, + 0xb1, 0xc4, 0x2e, 0xb3, 0x36, 0xc4, 0xb3, 0x8b, 0xc5, 0x64, 0x9f, 0x60, + 
0xc1, 0xa5, 0xb7, 0x3f, 0x54, 0x85, 0x69, 0xca, 0x7b, 0xa9, 0xaf, 0xba, + 0x97, 0x2f, 0x68, 0x50, 0x4a, 0x6b, 0xa3, 0x3a, 0x6d, 0xaf, 0x30, 0x98, + 0xd2, 0xab, 0x81, 0x75, 0x77, 0x77, 0x7a, 0x48, 0x7a, 0xa9, 0x66, 0xcd, + 0xbe, 0x79, 0x92, 0x57, 0x2b, 0x27, 0x42, 0xd6, 0x9c, 0x7c, 0x6a, 0xbe, + 0xb8, 0x45, 0x6c, 0x8c, 0x38, 0x6a, 0x4c, 0xa4, 0xae, 0x5b, 0xa3, 0x44, + 0xaa, 0xac, 0x62, 0x6f, 0x5a, 0xa4, 0x85, 0x86, 0x6c, 0x5f, 0x28, 0x72, + 0x59, 0x7a, 0xca, 0xd1, 0xad, 0x55, 0xb0, 0x9d, 0x84, 0x52, 0x5b, 0xdf, + 0xa7, 0xa9, 0xd3, 0x87, 0x78, 0x4f, 0xbf, 0xa9, 0x7e, 0x78, 0xaa, 0xb7, + 0xac, 0x26, 0x5b, 0x6b, 0xca, 0x7d, 0x7d, 0xc8, 0x93, 0x7f, 0xb3, 0x56, + 0x62, 0x85, 0x39, 0x49, 0x8f, 0x6c, 0x9a, 0x7f, 0x9e, 0x6b, 0xa7, 0x5e, + 0x53, 0x44, 0x72, 0x79, 0xcf, 0x72, 0x93, 0x56, 0x5c, 0x48, 0xc0, 0x40, + 0xb2, 0x4f, 0x40, 0x7c, 0x5e, 0x54, 0x4b, 0x98, 0x62, 0x59, 0x5c, 0xce, + 0x33, 0xb5, 0xaa, 0x69, 0x9b, 0xc2, 0xb5, 0x93, 0x73, 0xab, 0x8d, 0xae, + 0x96, 0xc4, 0x37, 0x9e, 0x36, 0x6c, 0x92, 0x7a, 0x58, 0x48, 0x53, 0x40, + 0x74, 0xaa, 0xba, 0x59, 0x67, 0xbd, 0x7d, 0xad, 0x6a, 0xac, 0xc3, 0x80, + 0x30, 0x89, 0x2b, 0xbb, 0x3d, 0x3d, 0x46, 0x84, 0x54, 0xb0, 0x5f, 0xac, + 0x60, 0xb7, 0x52, 0x81, 0x8a, 0xac, 0xbf, 0x68, 0x66, 0xbc, 0xa1, 0x3d, + 0x4a, 0x52, 0x5b, 0x59, 0x77, 0x94, 0x50, 0x88, 0xae, 0x80, 0xb0, 0x8b, + 0xac, 0x97, 0x98, 0x3e, 0x9f, 0x60, 0x41, 0xd0, 0xbc, 0xa5, 0x77, 0x6f, + 0xbc, 0x41, 0xa8, 0x95, 0x97, 0x65, 0xb8, 0x3b, 0x77, 0x4c, 0xbc, 0x73, + 0xd7, 0x6d, 0xbf, 0x60, 0x3e, 0x9f, 0x4f, 0x45, 0x91, 0x79, 0xd3, 0xa6, + 0xcb, 0xb3, 0x5b, 0x60, 0x69, 0xb5, 0x29, 0xc3, 0x74, 0xbb, 0x55, 0x48, + 0x59, 0x6b, 0x2f, 0xa0, 0x60, 0x58, 0x7d, 0xaf, 0x66, 0xb1, 0xa2, 0xb3, + 0x4a, 0xb0, 0x75, 0xc9, 0xb8, 0xb4, 0x47, 0x8b, 0x6a, 0xc7, 0x4a, 0x41, + 0x76, 0xd1, 0xc7, 0xa0, 0x4a, 0xc8, 0x4a, 0x37, 0x90, 0x6c, 0x41, 0x5c, + 0x3c, 0x50, 0xbb, 0x7f, 0x2b, 0x9c, 0x76, 0xc9, 0x6c, 0xb5, 0x66, 0xb9, + 0xb8, 0x79, 0x5e, 0x6f, 0xbf, 0xa2, 0x6d, 0xd1, 0x8f, 0x78, 0x70, 0xbd, + 0x9e, 0x49, 0x6c, 0x7b, 0x53, 0x4d, 0x9e, 0xb3, 0x59, 0xb7, 0x9d, 0xc8, + 0xa2, 0xa0, 0x85, 0x7b, 0x73, 0x34, 0x3d, 0x40, 0x65, 0x93, 0x45, 0x79, + 0x57, 0x83, 0x8f, 0xc2, 0x67, 0x8d, 0x56, 0x84, 0xb3, 0x7e, 0x4e, 0xca, + 0xc4, 0x49, 0x9b, 0x7c, 0xa9, 0x77, 0xa5, 0x99, 0x3b, 0x19, 0x70, 0x6b, + 0x6a, 0x54, 0xb2, 0xc7, 0x6b, 0xd4, 0xa0, 0x4f, 0xb6, 0x50, 0x3f, 0xc0, + 0xbd, 0xc8, 0x53, 0xbc, 0x5a, 0xae, 0xd4, 0xa0, 0x48, 0xc3, 0x63, 0xcd, + 0xae, 0xab, 0xbe, 0xab, 0x59, 0x85, 0xcd, 0xa8, 0xa0, 0x5b, 0xc8, 0x4e, + 0x68, 0x48, 0x65, 0xd5, 0x9c, 0x77, 0xa0, 0x7f, 0x65, 0x55, 0x9c, 0xc2, + 0x97, 0x69, 0xe0, 0xc3, 0xce, 0xa6, 0x71, 0x69, 0x47, 0x93, 0x51, 0xa2, + 0xa6, 0x5a, 0x4d, 0x38, 0x5c, 0xa2, 0x53, 0xd4, 0xb9, 0xa4, 0x72, 0xb7, + 0x9d, 0xb1, 0x41, 0xb5, 0xde, 0x8d, 0xa5, 0x7e, 0x43, 0x7f, 0x52, 0x79, + 0x7d, 0xcf, 0x92, 0x63, 0x95, 0x4d, 0x6f, 0x74, 0xbc, 0xa4, 0x68, 0xa1, + 0x5b, 0xa2, 0x8d, 0x7b, 0xc1, 0x74, 0x86, 0x68, 0x71, 0xce, 0x88, 0x54, + 0x58, 0xb2, 0x85, 0x65, 0xc9, 0xa3, 0x49, 0x38, 0xd7, 0x6d, 0x5d, 0x5a, + 0x8d, 0xbc, 0x82, 0x75, 0xbc, 0x51, 0x80, 0x37, 0x41, 0x4d, 0x3c, 0x58, + 0xa0, 0x5d, 0x55, 0xcb, 0xca, 0x2f, 0xb3, 0xb5, 0x44, 0x95, 0xd9, 0x9d, + 0x98, 0x30, 0x5e, 0x40, 0xc9, 0xd4, 0x83, 0xc1, 0x66, 0x57, 0xa2, 0x65, + 0x9f, 0x2f, 0x68, 0x65, 0x75, 0x99, 0x83, 0x6e, 0x69, 0x85, 0xc4, 0xc4, + 0x69, 0x4b, 0x6a, 0xca, 0xa9, 0x8f, 0x4a, 0xce, 0x64, 0xab, 0xba, 0x8e, + 0x67, 0x96, 0x92, 0x37, 0x79, 0x68, 0x4c, 0xba, 0x81, 0x79, 0x58, 0xc9, + 0x44, 0x6a, 0x71, 0x59, 0x56, 0x65, 0x43, 0x4d, 0x8c, 0x75, 0xd3, 0x46, + 
0x5a, 0x50, 0x84, 0xc2, 0x51, 0xb9, 0x8d, 0x2f, 0xad, 0x94, 0xbc, 0xc0, + 0x7c, 0xca, 0x5a, 0x55, 0xa8, 0xc7, 0xd6, 0xad, 0x64, 0x3d, 0xb9, 0x55, + 0x93, 0x4c, 0x43, 0x35, 0xb1, 0x76, 0x4c, 0x49, 0xdf, 0x71, 0xcb, 0x6f, + 0xb3, 0x8a, 0x7d, 0xc4, 0x3d, 0xb3, 0x46, 0xce, 0x29, 0xc8, 0x7b, 0x7a, + 0x93, 0xa8, 0xae, 0x66, 0xc7, 0xd0, 0x49, 0xc7, 0xcd, 0x8f, 0x59, 0x68, + 0x74, 0x35, 0x38, 0x65, 0x66, 0x79, 0x61, 0x78, 0x93, 0x46, 0xa8, 0x6a, + 0x7a, 0xb8, 0x9a, 0x73, 0xcf, 0x3f, 0xb4, 0x33, 0x76, 0x51, 0xa3, 0x98, + 0xd2, 0xcd, 0x7f, 0x93, 0x9a, 0xa9, 0x93, 0x4b, 0x7f, 0x59, 0xa9, 0x51, + 0x8c, 0xa2, 0x70, 0x7a, 0x2c, 0xb8, 0x9f, 0xa6, 0x5d, 0x83, 0xc8, 0x7c, + 0x46, 0x65, 0x97, 0xd6, 0x85, 0x50, 0xd8, 0x3d, 0xc8, 0x41, 0xc2, 0x6b, + 0x47, 0x5e, 0xcc, 0x4d, 0xda, 0x2b, 0x86, 0xad, 0x9d, 0xb6, 0x44, 0xc4, + 0x7f, 0x37, 0x3c, 0x21, 0xd2, 0x31, 0x81, 0xa6, 0xb2, 0x9d, 0x59, 0x97, + 0xbb, 0xba, 0x4f, 0x5d, 0xcd, 0x41, 0x43, 0x9b, 0xad, 0x89, 0x4d, 0x47, + 0x72, 0x61, 0x83, 0xa1, 0xca, 0xb6, 0x66, 0x41, 0xbe, 0xd5, 0x7e, 0xa2, + 0x7c, 0x48, 0x96, 0x96, 0x39, 0xd4, 0xa9, 0x6d, 0x45, 0xaa, 0xad, 0x57, + 0xca, 0x33, 0xce, 0xd3, 0x45, 0xbf, 0x45, 0x64, 0x38, 0xac, 0xc5, 0x4b, + 0x87, 0x9b, 0x4c, 0xa0, 0x42, 0xa1, 0xaa, 0x4e, 0x66, 0x7d, 0x33, 0xc6, + 0x87, 0xbd, 0xc8, 0x7b, 0x79, 0xd9, 0xa9, 0x4f, 0x53, 0x81, 0x41, 0x4a, + 0x3c, 0x87, 0x4d, 0xbd, 0xcb, 0x68, 0x62, 0xbf, 0x64, 0x8d, 0x3c, 0xae, + 0x55, 0xb1, 0x8a, 0x43, 0x3e, 0xa2, 0x57, 0xae, 0xa5, 0x42, 0xa3, 0x87, + 0xc9, 0x9c, 0x8e, 0x55, 0x39, 0x51, 0x32, 0x97, 0x63, 0xc8, 0x51, 0xa1, + 0xc1, 0xc5, 0xa9, 0x91, 0xbd, 0x3f, 0xd1, 0xa5, 0x9b, 0x94, 0xcb, 0x6d, + 0x41, 0x8b, 0x52, 0xbe, 0x37, 0x7c, 0xbc, 0x8c, 0x61, 0x58, 0x39, 0xcb, + 0x71, 0x62, 0x96, 0x47, 0x40, 0x3d, 0x69, 0xbf, 0x57, 0xd0, 0xda, 0x84, + 0x35, 0xa1, 0xcc, 0xa8, 0xb9, 0xd1, 0x41, 0x92, 0x67, 0x44, 0x69, 0xa8, + 0x44, 0x5e, 0x32, 0x7d, 0x46, 0x4b, 0x7e, 0x73, 0xc1, 0x68, 0x61, 0xac, + 0x68, 0xa1, 0xd2, 0x50, 0x3e, 0x85, 0x6e, 0xb2, 0x4c, 0xbc, 0x40, 0xb3, + 0x52, 0x59, 0x50, 0xb8, 0xa9, 0xba, 0x50, 0x38, 0xc4, 0xd3, 0x8a, 0xc7, + 0x81, 0xd3, 0xa2, 0x99, 0xad, 0x78, 0xa2, 0x6c, 0xc3, 0x95, 0x47, 0x8f, + 0xa8, 0xae, 0x54, 0x63, 0x55, 0xc6, 0x63, 0x3d, 0x72, 0x35, 0x62, 0xa0, + 0xce, 0x80, 0xbc, 0xd2, 0xd0, 0x98, 0xba, 0x59, 0x5d, 0x67, 0x51, 0x4d, + 0x61, 0xa8, 0x7a, 0xd3, 0x60, 0x77, 0x46, 0x56, 0x5c, 0xb7, 0x88, 0x6a, + 0xa3, 0xd1, 0x83, 0x4e, 0x56, 0x72, 0xbf, 0x40, 0x41, 0x59, 0x79, 0x6d, + 0xac, 0x5d, 0x43, 0x64, 0xc3, 0x46, 0xd4, 0xb8, 0xae, 0xa8, 0x84, 0x98, + 0xb7, 0x3d, 0x69, 0xc8, 0x35, 0xd4, 0xc1, 0x72, 0x6b, 0x91, 0x6f, 0x52, + 0xa6, 0x4f, 0x97, 0xaa, 0x6a, 0x7f, 0x41, 0x4e, 0xa1, 0x55, 0x61, 0x62, + 0x92, 0x69, 0x3f, 0x88, 0xbc, 0x4a, 0x4f, 0xc9, 0xbf, 0x64, 0xc8, 0xb6, + 0x7b, 0x67, 0x49, 0x7b, 0x3e, 0x9a, 0xcf, 0x64, 0xb9, 0x63, 0xbc, 0x31, + 0x3c, 0x34, 0x60, 0x94, 0xbd, 0x93, 0xc8, 0x6e, 0xc0, 0x88, 0x6e, 0xc8, + 0x41, 0xe6, 0x62, 0x59, 0x92, 0x5e, 0x55, 0x74, 0xc4, 0x96, 0x79, 0xb2, + 0x5d, 0xbb, 0xbe, 0xbb, 0x66, 0x51, 0xb9, 0x57, 0xd2, 0x43, 0xcb, 0x97, + 0x3f, 0x6d, 0xcb, 0xbe, 0xab, 0x4b, 0xc1, 0xc3, 0xb1, 0x79, 0xa7, 0xa8, + 0x65, 0xc9, 0x84, 0x66, 0xbc, 0xcb, 0xca, 0x9b, 0xbe, 0xb2, 0x73, 0x9b, + 0x5f, 0x7a, 0x34, 0x8c, 0xa0, 0x83, 0x5f, 0x88, 0x41, 0x4f, 0xba, 0x35, + 0x81, 0xa2, 0x90, 0xa4, 0xc3, 0x89, 0x51, 0x9e, 0xcf, 0x6d, 0x44, 0x95, + 0x4c, 0x45, 0xd5, 0x69, 0xca, 0x7b, 0x7f, 0xc8, 0xa3, 0x6e, 0x5b, 0x8f, + 0x9e, 0x5b, 0xd0, 0x7e, 0x44, 0x91, 0xb9, 0x4c, 0x52, 0x96, 0x97, 0xcf, + 0x66, 0x4e, 0x85, 0x4a, 0xd8, 0x3a, 0xc4, 0x69, 0xc2, 0x9d, 0x8c, 0x9e, + 
0xa8, 0x43, 0x32, 0x67, 0x4a, 0xdc, 0x7c, 0x44, 0xc6, 0x7f, 0x5f, 0x6c, + 0x65, 0xbe, 0x5f, 0x51, 0xbd, 0x53, 0xb7, 0x8f, 0x64, 0x66, 0x9c, 0xd0, + 0x49, 0xb4, 0x42, 0x4f, 0xab, 0x5d, 0x4c, 0xb1, 0xab, 0x6a, 0xba, 0x5e, + 0xc1, 0x52, 0x9e, 0xbc, 0x7c, 0xcb, 0xb9, 0x5a, 0xd3, 0xab, 0x55, 0x7d, + 0x6e, 0x83, 0x73, 0xc0, 0x7c, 0x34, 0x49, 0x8a, 0xba, 0x5a, 0x91, 0x68, + 0x43, 0x6d, 0x73, 0x39, 0x72, 0x5e, 0xaf, 0x8a, 0xcb, 0x41, 0x96, 0x98, + 0x85, 0x91, 0xab, 0xca, 0x53, 0x7e, 0x2f, 0x87, 0x53, 0x67, 0xbf, 0x53, + 0xb2, 0x4e, 0xa0, 0xa8, 0x94, 0x98, 0x7f, 0x73, 0x89, 0x88, 0xae, 0x62, + 0xb7, 0x6a, 0x7d, 0x4a, 0xd1, 0xd2, 0xaa, 0x3a, 0x7e, 0xc6, 0x69, 0x47, + 0xd6, 0x6c, 0xa8, 0x97, 0x4a, 0xa5, 0xa2, 0x8e, 0x7d, 0x38, 0xb2, 0x5b, + 0x74, 0x39, 0x73, 0x79, 0xb1, 0xcb, 0x96, 0xa1, 0xbb, 0xd0, 0x75, 0xb8, + 0xa4, 0x48, 0x67, 0xb5, 0x68, 0x85, 0x5b, 0xa0, 0xaf, 0x39, 0x49, 0xa5, + 0x85, 0xa0, 0xc6, 0x39, 0x75, 0xb6, 0xcf, 0x45, 0x62, 0xad, 0x4a, 0xc2, + 0x7f, 0x6b, 0x9e, 0x9c, 0x45, 0x94, 0xa3, 0x5d, 0xa7, 0xb8, 0x9c, 0x7f, + 0xcd, 0x72, 0x49, 0x97, 0x84, 0x70, 0x73, 0x8f, 0xb6, 0xaa, 0xbd, 0x3f, + 0x53, 0x57, 0xd6, 0xcd, 0x4e, 0x84, 0x88, 0x93, 0x9e, 0x6e, 0x6f, 0x49, + 0xc0, 0xc5, 0x97, 0x81, 0x4d, 0x76, 0x48, 0x9d, 0x79, 0xb9, 0x8f, 0x6e, + 0xa4, 0xa9, 0xaa, 0x94, 0x52, 0xcb, 0xc3, 0xb0, 0x62, 0x35, 0x82, 0x99, + 0x6a, 0x7f, 0x38, 0x69, 0xa3, 0x75, 0xc9, 0x34, 0xad, 0xcf, 0xa1, 0xca, + 0x9a, 0xb3, 0xaf, 0xc4, 0x94, 0xce, 0xcb, 0x54, 0x65, 0x5d, 0xa8, 0x71, + 0xb3, 0x67, 0x9c, 0xb6, 0xae, 0xd0, 0x4b, 0x30, 0x7d, 0x50, 0x9b, 0x71, + 0xa9, 0xce, 0x98, 0x56, 0x69, 0x9f, 0xb7, 0x44, 0xb1, 0xb5, 0xb8, 0xcc, + 0x6b, 0xb4, 0x70, 0x7a, 0x5d, 0x7f, 0x92, 0xc0, 0xc4, 0x47, 0xd0, 0x3c, + 0xc1, 0x99, 0xba, 0xca, 0x96, 0xc5, 0x68, 0x3c, 0xad, 0x56, 0x55, 0xa8, + 0xb0, 0xa0, 0xa8, 0x69, 0xab, 0x99, 0x2e, 0x5c, 0x77, 0x8c, 0x6b, 0xba, + 0x89, 0x74, 0xc3, 0x8b, 0x5e, 0x73, 0x9e, 0xcb, 0x7f, 0x41, 0xc8, 0xcb, + 0x82, 0x8a, 0xb0, 0xa3, 0xcb, 0x33, 0x7c, 0x5e, 0x5d, 0x42, 0xcb, 0xcd, + 0x59, 0x77, 0xaf, 0x51, 0x53, 0xb7, 0x6c, 0x47, 0xb2, 0x9e, 0x95, 0x41, + 0x7f, 0x3d, 0xd8, 0x79, 0xd1, 0xbf, 0x9e, 0x8a, 0x53, 0x67, 0x41, 0xb8, + 0x35, 0x40, 0xb8, 0x5e, 0xd4, 0x85, 0x94, 0x74, 0xbd, 0x4c, 0x45, 0x4e, + 0x91, 0xbd, 0x5f, 0x8f, 0x4d, 0xbb, 0xd1, 0x5e, 0xa6, 0x92, 0x87, 0x39, + 0xc9, 0xcc, 0xb4, 0x33, 0x6e, 0xd4, 0xaf, 0x9b, 0x7d, 0x6b, 0x6d, 0x9a, + 0xba, 0x2f, 0x67, 0x45, 0xcb, 0x56, 0x7c, 0xa1, 0xc4, 0x5f, 0xa7, 0x35, + 0xb3, 0xa2, 0x4e, 0xbc, 0xd3, 0xac, 0x46, 0xbe, 0x50, 0xc1, 0x98, 0x61, + 0xbe, 0x6a, 0x5c, 0x53, 0xa4, 0xbf, 0x82, 0x87, 0xa9, 0x6a, 0xb1, 0xbc, + 0x56, 0x8c, 0x89, 0x5b, 0x46, 0xce, 0x7a, 0xab, 0x39, 0x4d, 0x9b, 0x4f, + 0x47, 0x57, 0xaf, 0x3e, 0x94, 0x7a, 0x7c, 0xa0, 0xae, 0x5c, 0x59, 0xb7, + 0x6f, 0x3b, 0xbc, 0xc6, 0x7a, 0xc8, 0xd4, 0x3c, 0x3f, 0x41, 0xb9, 0x9b, + 0xc3, 0x47, 0x74, 0x9e, 0xbb, 0x8e, 0x95, 0x92, 0xa1, 0xae, 0x48, 0x7b, + 0x8b, 0x50, 0xa9, 0x6a, 0x4a, 0x76, 0x93, 0xcb, 0x8d, 0x62, 0x99, 0x47, + 0xab, 0x6a, 0xa6, 0x41, 0xc7, 0x49, 0xc3, 0xc8, 0x73, 0x58, 0x58, 0xd1, + 0x98, 0x6b, 0xa4, 0x99, 0xc6, 0x81, 0x55, 0x6d, 0x4e, 0x9e, 0xd4, 0xbd, + 0x7b, 0xad, 0xb7, 0x48, 0xa5, 0xaf, 0x5b, 0xb5, 0x91, 0x8b, 0xa4, 0xc0, + 0xc3, 0xaa, 0x59, 0xe1, 0x59, 0x9c, 0x7b, 0xc3, 0x9a, 0xba, 0x44, 0xc4, + 0xb2, 0x2d, 0xcf, 0x8c, 0x99, 0x63, 0xbb, 0x90, 0x9e, 0xb1, 0x6f, 0x31, + 0xc6, 0x4d, 0xb6, 0x50, 0x44, 0x66, 0xbc, 0x73, 0xb7, 0x56, 0x4b, 0x74, + 0x57, 0x51, 0x94, 0x66, 0x64, 0xbf, 0x36, 0xd6, 0x6d, 0xc1, 0xa5, 0x92, + 0x7c, 0x95, 0x5a, 0x6c, 0x85, 0xac, 0xd1, 0x6d, 0xcd, 0x7e, 0x3d, 0x4e, + 
0xb2, 0x3b, 0x6c, 0xd3, 0x42, 0x6c, 0xbf, 0x88, 0xa0, 0x7a, 0x5f, 0x51, + 0x76, 0x83, 0x59, 0x5f, 0x92, 0xcf, 0x65, 0x46, 0x8b, 0x42, 0x7c, 0xa9, + 0xa8, 0x89, 0xc2, 0x5b, 0xce, 0xa9, 0x6c, 0x8a, 0xa8, 0x38, 0xc1, 0x3b, + 0x5b, 0x57, 0x77, 0x44, 0x3b, 0x74, 0x96, 0x43, 0x81, 0xa8, 0x35, 0xc1, + 0x6a, 0x3a, 0xb3, 0x69, 0x38, 0x44, 0xc7, 0x8f, 0x58, 0x5f, 0xc9, 0x3b, + 0x69, 0x3c, 0x2e, 0x33, 0x74, 0x6a, 0x8a, 0x76, 0x78, 0x56, 0x73, 0xb7, + 0x3c, 0x59, 0x41, 0xab, 0x41, 0x5d, 0x67, 0x6f, 0x5a, 0xa2, 0x43, 0xb5, + 0x4b, 0xad, 0x99, 0x94, 0xb4, 0xa5, 0x55, 0x4d, 0x65, 0x96, 0xbc, 0x3a, + 0x67, 0x6b, 0x95, 0x5c, 0x5e, 0x9a, 0x60, 0x85, 0x40, 0x7f, 0xb6, 0xba, + 0x70, 0x9b, 0xa3, 0x74, 0x49, 0x61, 0x87, 0x6b, 0xc9, 0x7f, 0x6e, 0x46, + 0xc0, 0x42, 0xca, 0x56, 0x79, 0xb5, 0x36, 0x3b, 0x42, 0x7d, 0xb5, 0x48, + 0x68, 0xb2, 0x76, 0x61, 0x88, 0x98, 0x97, 0xad, 0xca, 0xa4, 0x70, 0xb5, + 0x9b, 0xd4, 0xb9, 0x4d, 0xd4, 0xbd, 0x51, 0x47, 0x41, 0x8d, 0x9c, 0x78, + 0xc3, 0xc2, 0xb7, 0xad, 0x39, 0x72, 0x73, 0x43, 0x5a, 0xaf, 0xbf, 0x93, + 0xaf, 0x74, 0x94, 0x7e, 0xab, 0x53, 0x73, 0x7a, 0x7f, 0x91, 0x6b, 0x95, + 0x78, 0x66, 0x5e, 0xcf, 0x56, 0x8b, 0xca, 0xae, 0x5b, 0x7b, 0xa4, 0x9d, + 0xc0, 0xb8, 0x8c, 0xc1, 0xbb, 0x90, 0x33, 0x66, 0x77, 0x93, 0xa2, 0xc6, + 0x5f, 0x54, 0xb7, 0xc5, 0x53, 0x4e, 0x88, 0xb3, 0xc0, 0xb5, 0x2f, 0x7f, + 0xb4, 0x7a, 0x55, 0x56, 0xbd, 0xa8, 0x3f, 0x3b, 0x72, 0x50, 0xb2, 0xc2, + 0xaf, 0xd1, 0xa0, 0xa7, 0xcc, 0xb3, 0xd7, 0x6c, 0xcf, 0x59, 0xba, 0xd3, + 0x95, 0x55, 0x62, 0x69, 0xae, 0x5c, 0xaf, 0xca, 0xb8, 0x9b, 0x60, 0x79, + 0xc2, 0x61, 0x49, 0x65, 0xcd, 0xd2, 0x3c, 0x4e, 0xa7, 0x97, 0x63, 0xbe, + 0x45, 0x80, 0x67, 0x89, 0x39, 0x50, 0x35, 0x5f, 0x46, 0xd5, 0x52, 0x4c, + 0x60, 0x77, 0xce, 0x84, 0xa8, 0xa2, 0xb3, 0xc5, 0xa3, 0x7e, 0x70, 0xcb, + 0xba, 0xbb, 0x7c, 0xb6, 0x37, 0x74, 0x30, 0x48, 0x9a, 0xbd, 0x61, 0xb3, + 0xad, 0x59, 0x88, 0xca, 0x8f, 0x8b, 0x82, 0x95, 0xcc, 0xa0, 0x5e, 0x40, + 0x81, 0xc6, 0x88, 0x85, 0x9f, 0x3d, 0x76, 0x71, 0x8f, 0xab, 0x6d, 0x76, + 0xcc, 0x3f, 0xca, 0xb6, 0xa1, 0x47, 0x4f, 0xc6, 0x96, 0x59, 0x62, 0x72, + 0x9a, 0x70, 0x4b, 0x4c, 0x4c, 0x91, 0xc6, 0x51, 0x69, 0xc7, 0xa8, 0x7b, + 0x5b, 0x31, 0x37, 0x36, 0xa3, 0xa6, 0xc1, 0x88, 0x6f, 0x47, 0xbf, 0xb0, + 0x95, 0x9d, 0xd1, 0xc5, 0x65, 0x59, 0xbf, 0xb1, 0x6d, 0xa4, 0xb6, 0x68, + 0x7b, 0x9e, 0x42, 0xc3, 0x87, 0x4a, 0x36, 0xac, 0x90, 0x8c, 0x42, 0x66, + 0x76, 0x83, 0x44, 0xd2, 0x9e, 0x85, 0x74, 0xd3, 0xc8, 0x7f, 0xa9, 0x32, + 0xd3, 0x93, 0x5a, 0x7b, 0x4b, 0xc7, 0xb1, 0xab, 0xa6, 0x3d, 0x6b, 0x4d, + 0x96, 0x51, 0x91, 0x8a, 0x7e, 0x54, 0x8e, 0x62, 0xa6, 0x58, 0x4e, 0x92, + 0x98, 0x3b, 0x6d, 0xd4, 0x98, 0x32, 0x6a, 0xa8, 0xd2, 0x4b, 0x4a, 0x32, + 0x80, 0x94, 0xc0, 0x65, 0x40, 0x55, 0x81, 0xba, 0xb4, 0xab, 0x94, 0xb9, + 0x5b, 0x71, 0x71, 0x70, 0x9a, 0x64, 0x3f, 0x7c, 0x88, 0x30, 0x66, 0x4c, + 0xc5, 0xb1, 0xc3, 0x3b, 0x47, 0x56, 0x3c, 0x82, 0xb8, 0x4c, 0x5a, 0xbb, + 0xa1, 0xbd, 0xce, 0x50, 0x4d, 0xb1, 0x8d, 0x52, 0x96, 0x90, 0xca, 0x80, + 0xcb, 0x3d, 0xc3, 0x42, 0xb3, 0x9b, 0x8f, 0x9f, 0xa0, 0x7d, 0x9a, 0xce, + 0x63, 0x73, 0xcc, 0x89, 0x8c, 0x6e, 0x86, 0x69, 0x42, 0x4e, 0x95, 0x8b, + 0xbe, 0x4e, 0x70, 0x3c, 0xbc, 0xa0, 0x37, 0x3e, 0x73, 0x92, 0xc8, 0xce, + 0xb2, 0x9c, 0x34, 0x71, 0x8c, 0xc4, 0x8f, 0x93, 0x9d, 0x5c, 0x96, 0x54, + 0x98, 0x62, 0x9c, 0x9c, 0x54, 0x80, 0x80, 0x2c, 0x3d, 0x38, 0x8d, 0xce, + 0x7f, 0x54, 0x7a, 0x9f, 0xbb, 0x6d, 0xc4, 0x69, 0x61, 0xab, 0xe0, 0x78, + 0x9c, 0x85, 0xbd, 0x76, 0x6c, 0x6d, 0x52, 0x7a, 0x4c, 0xba, 0xc5, 0xc7, + 0x47, 0x7c, 0x8c, 0x88, 0xc7, 0x92, 0xb5, 0x79, 0x48, 0xb2, 0x55, 0xc6, + 
0x8c, 0x57, 0x81, 0x96, 0x5e, 0x74, 0x6d, 0x83, 0x9e, 0xaa, 0xac, 0x98, + 0x36, 0x95, 0x69, 0x43, 0x79, 0xc2, 0x45, 0x68, 0x9a, 0x45, 0x76, 0x69, + 0xd0, 0x83, 0xbd, 0x3e, 0x41, 0x81, 0x33, 0x5d, 0x6d, 0x65, 0x45, 0xa4, + 0x79, 0xbc, 0xc6, 0x99, 0xb1, 0x63, 0x88, 0x78, 0xba, 0x75, 0x55, 0x71, + 0x85, 0x58, 0x8a, 0x5b, 0xc1, 0xd9, 0x71, 0x68, 0x93, 0x5c, 0x4d, 0xa0, + 0xc6, 0x56, 0xcc, 0xa7, 0xc0, 0x83, 0xac, 0x9e, 0x20, 0x4f, 0x8f, 0x61, + 0xa5, 0x93, 0xb9, 0x60, 0xbf, 0x7e, 0x40, 0x8b, 0x67, 0xc1, 0xab, 0x36, + 0xa1, 0xd6, 0x36, 0x56, 0xab, 0x70, 0x42, 0x3f, 0xcc, 0x60, 0xca, 0xb5, + 0x76, 0x64, 0xc4, 0xc6, 0xc6, 0x6c, 0xdb, 0x52, 0x2c, 0xb3, 0x56, 0xbc, + 0x34, 0x49, 0xc2, 0xbf, 0xaa, 0xba, 0x9e, 0x6d, 0x48, 0xc2, 0x47, 0x95, + 0x61, 0x9a, 0x60, 0x48, 0x77, 0x72, 0x6f, 0x4a, 0x39, 0x6c, 0xb8, 0xd2, + 0xad, 0x7d, 0x6f, 0xe6, 0x2e, 0xc8, 0x68, 0x5f, 0xba, 0x3a, 0x7b, 0x4d, + 0x64, 0xb7, 0x8a, 0x77, 0x5c, 0x71, 0xb5, 0x33, 0x61, 0x5e, 0x31, 0x4c, + 0x57, 0x42, 0xae, 0xd4, 0x68, 0x7d, 0x7c, 0x44, 0x28, 0x5f, 0x73, 0xa3, + 0x60, 0x7c, 0x50, 0xa5, 0x6f, 0x9e, 0xaa, 0x9c, 0xaf, 0x7d, 0x82, 0xa4, + 0x4e, 0xa9, 0x8b, 0xbc, 0x54, 0x6e, 0x86, 0xc9, 0x8b, 0x6e, 0x6b, 0x56, + 0x2d, 0x83, 0x45, 0x75, 0x9a, 0x3d, 0x44, 0x7a, 0x32, 0x51, 0xc6, 0x8c, + 0x78, 0x6c, 0x5c, 0x8f, 0xc5, 0x65, 0x3f, 0x3f, 0x8e, 0xd3, 0x4e, 0x71, + 0x9e, 0x8d, 0x9f, 0x62, 0x9d, 0x2b, 0xc0, 0xd2, 0xc9, 0x64, 0x3b, 0x62, + 0x85, 0x8e, 0x42, 0x35, 0xa9, 0x5f, 0x41, 0x55, 0xb7, 0xdc, 0x2e, 0x69, + 0xd6, 0x34, 0x71, 0x7c, 0x9d, 0xbd, 0x7d, 0xc7, 0x78, 0x31, 0xb1, 0x61, + 0xa1, 0x33, 0xa7, 0x4f, 0x72, 0x56, 0x9a, 0x7b, 0xcb, 0xcf, 0x44, 0x8e, + 0x9e, 0xd3, 0x8f, 0xbf, 0x8a, 0xc5, 0x48, 0x7b, 0xbe, 0x95, 0x91, 0xae, + 0x43, 0x44, 0x42, 0x71, 0x44, 0x34, 0x9b, 0xad, 0x6f, 0xbf, 0xcf, 0x74, + 0xc4, 0x56, 0x56, 0x5d, 0x72, 0x6a, 0x4d, 0x45, 0xb6, 0x80, 0xa4, 0x4f, + 0x81, 0x8e, 0xba, 0xc8, 0xb2, 0xb1, 0xc0, 0x69, 0xc4, 0x77, 0x98, 0x67, + 0xcc, 0x4b, 0x68, 0xbb, 0x86, 0xad, 0xa9, 0x4f, 0x4c, 0xbe, 0x4c, 0xc4, + 0xcd, 0xb9, 0x3c, 0x4b, 0x80, 0x82, 0x86, 0x63, 0x68, 0x72, 0x59, 0xc8, + 0x84, 0x6b, 0x42, 0x3e, 0x47, 0xa3, 0xe1, 0x81, 0x69, 0x85, 0x52, 0xab, + 0x4d, 0x8a, 0xc3, 0x53, 0x81, 0x5b, 0x99, 0x59, 0x74, 0x5d, 0xc3, 0x8d, + 0x66, 0x8f, 0x69, 0x4c, 0x60, 0x3c, 0x63, 0x90, 0xb3, 0xbd, 0x84, 0xa2, + 0x3b, 0x3f, 0x9a, 0x4e, 0x39, 0xcf, 0x97, 0x97, 0x3a, 0x9e, 0x8d, 0xb2, + 0xb3, 0x97, 0x48, 0x52, 0x88, 0x4c, 0xc2, 0x27, 0x6a, 0x5a, 0xa3, 0x51, + 0xa4, 0xa7, 0x55, 0x83, 0xa5, 0x5e, 0xab, 0xd5, 0x7e, 0x4c, 0x35, 0x3f, + 0x70, 0x71, 0x8a, 0x41, 0xb2, 0xc7, 0x64, 0x5c, 0x37, 0x98, 0xc0, 0x50, + 0xb7, 0x64, 0x65, 0x69, 0xa1, 0x4e, 0x37, 0xda, 0x57, 0x47, 0x8c, 0x7b, + 0xc8, 0x58, 0xb9, 0x79, 0xc8, 0xca, 0xe6, 0x69, 0xd5, 0x97, 0x68, 0x68, + 0x77, 0x95, 0x7d, 0x9d, 0x51, 0x52, 0x73, 0x6b, 0x94, 0x37, 0xa8, 0x63, + 0xa1, 0xaf, 0xd6, 0xb3, 0xc1, 0xbe, 0x88, 0x33, 0x6a, 0x76, 0x52, 0x2a, + 0xb2, 0x62, 0xb1, 0x6e, 0x58, 0xc7, 0xa2, 0x5a, 0x9c, 0x89, 0x49, 0x85, + 0x6d, 0xcf, 0xd1, 0xb8, 0xa5, 0x65, 0x9f, 0x72, 0x7a, 0x70, 0x7e, 0x7c, + 0x94, 0xd4, 0x90, 0x79, 0xce, 0x77, 0x53, 0x50, 0xbf, 0x2a, 0x4d, 0x2d, + 0xcb, 0x67, 0x67, 0x43, 0x3e, 0x62, 0xcf, 0xa4, 0x62, 0x70, 0xda, 0xd4, + 0x7f, 0xb8, 0x7b, 0x72, 0x6a, 0x27, 0x38, 0x49, 0x5f, 0xbe, 0x7d, 0xbe, + 0x86, 0x47, 0x95, 0x60, 0xad, 0xc1, 0xb5, 0xc4, 0xb0, 0x5b, 0x99, 0x73, + 0x6a, 0x9f, 0x9c, 0xa1, 0xa0, 0x79, 0x44, 0x79, 0x57, 0xdc, 0x60, 0xae, + 0xac, 0x9c, 0x8d, 0x7c, 0xa4, 0x6c, 0xab, 0x60, 0x3f, 0x63, 0x9b, 0x5c, + 0x80, 0x5f, 0x6e, 0x79, 0x88, 0x8d, 0x5c, 0xbf, 0x42, 0x71, 0xd9, 0xd1, + 
0xbe, 0x99, 0xad, 0xd9, 0x3f, 0x99, 0x64, 0xa8, 0x3c, 0x27, 0x90, 0x87, + 0x25, 0xb3, 0x46, 0xba, 0x52, 0xcb, 0x78, 0x7b, 0x8b, 0x87, 0xb0, 0xb8, + 0x63, 0x47, 0xae, 0x7c, 0xc5, 0x37, 0x58, 0x9b, 0x9a, 0x6d, 0x49, 0x8b, + 0x8c, 0x93, 0xa0, 0x7b, 0x7c, 0x63, 0x78, 0x7b, 0xb5, 0x8d, 0xc6, 0x49, + 0xa7, 0xc6, 0x68, 0xbc, 0x63, 0x95, 0x47, 0xbc, 0x46, 0x9b, 0x6d, 0x9e, + 0xbb, 0xd4, 0x7e, 0xa0, 0xba, 0x6d, 0x31, 0x6e, 0x92, 0x8f, 0xc0, 0x89, + 0x37, 0x60, 0xaf, 0xe1, 0x85, 0x28, 0x44, 0x37, 0xa6, 0x65, 0xcc, 0x9f, + 0x6c, 0x89, 0x87, 0xa9, 0xb8, 0x5e, 0xa4, 0x6b, 0x99, 0xb2, 0x69, 0x42, + 0xcd, 0xb6, 0x77, 0xa4, 0xb0, 0xc7, 0x30, 0xac, 0x50, 0x56, 0xa6, 0x8d, + 0x52, 0x94, 0x73, 0xb4, 0xbb, 0xc9, 0x60, 0x5d, 0x86, 0xc8, 0xd0, 0xc6, + 0x4d, 0xbf, 0xc5, 0xb8, 0x41, 0xa9, 0x73, 0x43, 0x6b, 0x9b, 0x53, 0xab, + 0x65, 0x72, 0xa2, 0xa5, 0xd0, 0x83, 0xba, 0x9e, 0x7c, 0xb5, 0x50, 0x6c, + 0xd0, 0x32, 0x53, 0xb9, 0x85, 0x48, 0x57, 0xcc, 0x69, 0x8c, 0xa9, 0x9c, + 0x84, 0xd0, 0xdb, 0x36, 0x5e, 0x76, 0x9d, 0xa4, 0xca, 0x5c, 0x6b, 0x45, + 0x4e, 0xae, 0xa9, 0x47, 0xc5, 0x63, 0xcc, 0x58, 0x94, 0xb4, 0xa4, 0x80, + 0x86, 0x7a, 0x9e, 0x7a, 0x95, 0xce, 0xba, 0x84, 0xb7, 0xa3, 0xbe, 0xac, + 0x31, 0x60, 0x71, 0x64, 0x2c, 0x21, 0xb6, 0x54, 0x47, 0x95, 0xab, 0x72, + 0x86, 0x59, 0x8a, 0x75, 0x52, 0x96, 0x66, 0x68, 0xcb, 0x6e, 0xb5, 0x7a, + 0xc2, 0x63, 0x75, 0x7b, 0xae, 0x3c, 0x33, 0xa3, 0x80, 0x5a, 0x9c, 0x45, + 0x61, 0xb6, 0x7f, 0x51, 0x8a, 0xce, 0x47, 0x92, 0x62, 0x87, 0x73, 0x40, + 0x7e, 0x82, 0x39, 0x4a, 0x95, 0x91, 0xb5, 0xc9, 0x85, 0x7e, 0x70, 0xa4, + 0x81, 0xaa, 0x6b, 0x4f, 0x3f, 0x56, 0x6c, 0x92, 0xb9, 0xab, 0x7f, 0x9b, + 0xb6, 0x9e, 0x5d, 0xa1, 0x9d, 0xab, 0xb9, 0xb5, 0x74, 0x40, 0x9a, 0xc7, + 0x89, 0xc9, 0x8a, 0xb9, 0x99, 0x80, 0x39, 0xca, 0xd1, 0x99, 0x75, 0x53, + 0x7d, 0x3b, 0x9e, 0x85, 0xa2, 0x8c, 0x77, 0x53, 0x42, 0xb1, 0xc6, 0xa9, + 0x4c, 0x4f, 0xb8, 0xd6, 0x78, 0x85, 0x60, 0x6a, 0x7e, 0x8b, 0x4a, 0x7d, + 0x59, 0xb7, 0x2f, 0x94, 0x70, 0x2b, 0xb3, 0x39, 0x7f, 0xab, 0x99, 0x6e, + 0x7e, 0x72, 0xa8, 0xc8, 0x5f, 0x6d, 0x95, 0x9a, 0xdc, 0x72, 0xcb, 0x5b, + 0xa3, 0x8a, 0x6f, 0x6f, 0xcd, 0x8a, 0xd6, 0xac, 0x9a, 0x37, 0xce, 0x4e, + 0xa2, 0x51, 0x9d, 0x38, 0x65, 0x45, 0x8d, 0x8f, 0x90, 0x36, 0x66, 0xbc, + 0x9e, 0xb1, 0xbd, 0x92, 0x79, 0xb1, 0xb9, 0xb5, 0xa2, 0xd4, 0xa8, 0x44, + 0x46, 0xa9, 0x56, 0x3e, 0x99, 0x76, 0xca, 0x5d, 0x83, 0x8e, 0xc4, 0x82, + 0xcf, 0x66, 0xc2, 0xbd, 0x8f, 0xba, 0xaa, 0xb8, 0x48, 0xae, 0x8c, 0x4d, + 0xd8, 0x77, 0xa0, 0xba, 0xbe, 0xc3, 0x53, 0x3f, 0xca, 0x54, 0xa1, 0x75, + 0x5c, 0xd5, 0x3a, 0x95, 0x3b, 0xca, 0xa9, 0x36, 0x67, 0x67, 0x3b, 0x7f, + 0xd1, 0x95, 0x84, 0xab, 0x98, 0x67, 0x45, 0x3f, 0x70, 0x72, 0x73, 0xc2, + 0x66, 0x49, 0xbd, 0x4a, 0xcd, 0x6b, 0x49, 0x51, 0xaf, 0x9f, 0x5d, 0xa8, + 0xb4, 0x99, 0x42, 0x8c, 0x62, 0xc5, 0x5a, 0x6f, 0xad, 0x69, 0x75, 0x9a, + 0x77, 0x4a, 0x54, 0x8a, 0x98, 0x98, 0x4c, 0xc8, 0xb4, 0xd7, 0xa1, 0x43, + 0x50, 0xa1, 0x73, 0x5a, 0x57, 0x85, 0x84, 0x97, 0x59, 0x6b, 0xcc, 0x8f, + 0xa4, 0x3f, 0x82, 0x81, 0x4a, 0x9d, 0xa5, 0x39, 0xbd, 0x4e, 0x58, 0x41, + 0xab, 0x8f, 0x9a, 0xc9, 0x78, 0x6e, 0xb2, 0x83, 0xa4, 0x58, 0x3b, 0x7b, + 0xb4, 0xb1, 0x99, 0xcd, 0xc1, 0xac, 0xa3, 0xa8, 0x84, 0x5c, 0x87, 0x47, + 0x9d, 0x71, 0xa6, 0xcc, 0x9f, 0xaf, 0xb1, 0xa4, 0x98, 0x41, 0x8e, 0x8c, + 0x43, 0x9b, 0x37, 0xae, 0x9d, 0x75, 0xca, 0xa7, 0xa0, 0xbb, 0x7b, 0x6b, + 0x98, 0xcc, 0xaa, 0x32, 0x70, 0x4f, 0xb2, 0x7d, 0x8b, 0x8b, 0x7b, 0xd3, + 0x5b, 0x6c, 0xb0, 0x57, 0xab, 0xb3, 0xbc, 0xd0, 0x40, 0x68, 0x38, 0xd0, + 0x43, 0x5e, 0x52, 0xa0, 0x61, 0x5c, 0x97, 0xae, 0xc3, 0x4f, 0x93, 0x39, + 
0xc4, 0xc9, 0xc4, 0x99, 0x5c, 0x5d, 0x64, 0x4c, 0x5d, 0x8e, 0x9f, 0xad, + 0xb1, 0xc6, 0x3e, 0x54, 0x30, 0x5e, 0x92, 0xbb, 0x62, 0x64, 0x35, 0x57, + 0x71, 0x58, 0xad, 0x69, 0x41, 0x4c, 0xbd, 0x4c, 0xb0, 0x5e, 0x9e, 0x35, + 0xbc, 0x70, 0x77, 0x7e, 0x7c, 0x9f, 0xc5, 0xa3, 0x47, 0x75, 0x5f, 0x3e, + 0x5d, 0x79, 0x8b, 0xc0, 0xd2, 0xd4, 0x73, 0xc7, 0x43, 0x96, 0x95, 0x76, + 0xc7, 0x58, 0xa7, 0x94, 0x90, 0x55, 0x90, 0x62, 0xb1, 0x65, 0xab, 0x9d, + 0x71, 0x6f, 0xb9, 0x8a, 0x5d, 0xb6, 0xa8, 0xcd, 0xa7, 0x7c, 0xb0, 0x95, + 0x6d, 0x95, 0x70, 0x8d, 0x4d, 0x4a, 0x55, 0xa5, 0x5d, 0x6e, 0x46, 0xad, + 0xb9, 0x54, 0xc6, 0xc9, 0x8c, 0x89, 0x72, 0xa3, 0xcf, 0xc3, 0x9d, 0x4f, + 0x6c, 0x42, 0x5a, 0x89, 0x44, 0x84, 0x96, 0x7d, 0x72, 0xad, 0xc7, 0x99, + 0x64, 0x6a, 0x86, 0x55, 0x96, 0xa2, 0x82, 0x52, 0x78, 0x50, 0x86, 0x4e, + 0x6d, 0x8e, 0x4a, 0xc0, 0x71, 0x56, 0x94, 0x84, 0xc2, 0xa9, 0x9a, 0x5a, + 0x35, 0x6f, 0x4a, 0xd6, 0xaf, 0xc7, 0x9f, 0x49, 0x57, 0x6c, 0x77, 0xcc, + 0x79, 0x73, 0xd6, 0xa6, 0x55, 0xd6, 0x41, 0x53, 0x4a, 0xa2, 0x5c, 0xa9, + 0xb7, 0x68, 0xba, 0xa7, 0xcf, 0xa6, 0x91, 0x5a, 0x92, 0x75, 0x72, 0x44, + 0x4c, 0xaf, 0x8e, 0xb4, 0x95, 0x6f, 0x4c, 0x8a, 0x89, 0xb3, 0xd5, 0xab, + 0x63, 0x76, 0x6f, 0xba, 0x5f, 0xba, 0xa3, 0x45, 0x5d, 0x4d, 0x55, 0x42, + 0x50, 0x80, 0x93, 0x3e, 0x90, 0xa9, 0xc5, 0xc3, 0x78, 0x95, 0x40, 0x87, + 0x3e, 0xb6, 0x3d, 0x96, 0xc3, 0x94, 0xa9, 0x98, 0x3d, 0x90, 0xa8, 0xba, + 0xc3, 0x53, 0xcc, 0xbf, 0xc9, 0x81, 0x3b, 0x6f, 0xc6, 0xbe, 0x48, 0x7d, + 0x7c, 0x44, 0x80, 0x7a, 0xa2, 0x87, 0x3d, 0xbd, 0x6b, 0x51, 0x59, 0x80, + 0x87, 0x35, 0x98, 0xa6, 0xd8, 0x66, 0xa7, 0x6a, 0x53, 0xce, 0x51, 0x7b, + 0xc2, 0xa7, 0x4a, 0x8b, 0x34, 0xae, 0xa7, 0x43, 0xc2, 0x54, 0xd2, 0x9a, + 0x5a, 0xd4, 0x48, 0xd1, 0x3d, 0xa8, 0x3a, 0x3d, 0xa0, 0xcc, 0x97, 0xb2, + 0x6b, 0xd7, 0x95, 0xb0, 0x8b, 0x71, 0x43, 0x70, 0x7a, 0x68, 0x66, 0x53, + 0xb8, 0x57, 0x91, 0x68, 0x5a, 0xa5, 0x4d, 0x78, 0x39, 0x6d, 0xb8, 0x3e, + 0xb8, 0x74, 0x92, 0xa2, 0xc5, 0xb2, 0x86, 0xbe, 0x55, 0x69, 0x63, 0xbd, + 0x48, 0x48, 0x4e, 0x81, 0x5a, 0x4c, 0x7c, 0x6b, 0xc6, 0xc2, 0xd4, 0x79, + 0xc9, 0x77, 0xd4, 0x72, 0x47, 0x64, 0x4f, 0xbc, 0x80, 0xc5, 0x3f, 0xa9, + 0xcc, 0xa5, 0x66, 0x70, 0x9e, 0x5f, 0xa5, 0x39, 0x40, 0xda, 0x89, 0x40, + 0x4d, 0x5e, 0x8f, 0x4b, 0x69, 0x92, 0x5c, 0x8e, 0x4e, 0xce, 0x3a, 0xc3, + 0x68, 0xb7, 0x41, 0x3f, 0x72, 0x5b, 0x77, 0x4f, 0x92, 0x55, 0x79, 0x8d, + 0x8e, 0x48, 0xb2, 0x63, 0xd0, 0x9e, 0xab, 0xc5, 0x92, 0x2d, 0x5b, 0xcf, + 0x90, 0xc7, 0x41, 0x91, 0x9f, 0xa7, 0x39, 0x97, 0xdb, 0x5b, 0xb0, 0xa6, + 0x4d, 0x74, 0xc4, 0xbd, 0x9d, 0x5c, 0x8a, 0xba, 0x7e, 0xd9, 0x6b, 0x72, + 0xbd, 0x8a, 0x87, 0x4e, 0x9e, 0x71, 0x56, 0xa2, 0xc5, 0x41, 0x8a, 0xc0, + 0xcb, 0xa1, 0x56, 0x92, 0x44, 0x34, 0x5a, 0x3d, 0xd2, 0xce, 0x9e, 0x39, + 0x96, 0x9d, 0x92, 0x9f, 0x5b, 0x42, 0x54, 0xa3, 0x5b, 0x4b, 0x8d, 0x4c, + 0xc9, 0x88, 0x92, 0x31, 0x57, 0x4f, 0x82, 0x3a, 0xd4, 0xd6, 0x59, 0x9a, + 0x5c, 0xd3, 0xaa, 0x58, 0x3e, 0xc9, 0x70, 0xce, 0xab, 0xc3, 0x7b, 0x51, + 0xca, 0x92, 0xd5, 0x51, 0x61, 0xa7, 0x44, 0x87, 0x67, 0xd3, 0xc4, 0x6c, + 0xad, 0x86, 0x73, 0x5c, 0x7a, 0x47, 0x27, 0x8a, 0x57, 0x45, 0x62, 0x6f, + 0x3e, 0x38, 0xa2, 0xbc, 0x4f, 0xbd, 0xb8, 0xb8, 0x78, 0xa1, 0x5f, 0xb2, + 0x51, 0xaa, 0xb4, 0xc2, 0x57, 0xbe, 0x48, 0x3c, 0xa2, 0x4c, 0x3c, 0xb0, + 0x81, 0xd2, 0x35, 0xc2, 0x99, 0xca, 0xa2, 0xbb, 0x96, 0x2e, 0x74, 0xc1, + 0x7a, 0x3f, 0xbe, 0x64, 0x5b, 0x4a, 0x79, 0x4e, 0xc9, 0x40, 0xaf, 0x61, + 0x7d, 0xad, 0x63, 0xd4, 0xbe, 0x83, 0xc8, 0x68, 0xcd, 0x97, 0xaf, 0x7e, + 0x98, 0xd9, 0x89, 0xce, 0x6f, 0x43, 0x8b, 0x65, 0x34, 0xb5, 0x87, 0x44, + 
0x99, 0x86, 0x36, 0x41, 0x87, 0xcf, 0x46, 0xbf, 0x88, 0xa6, 0x9d, 0xd0, + 0xaa, 0xa2, 0x9a, 0xc7, 0x3c, 0x92, 0x41, 0x91, 0x7a, 0x3a, 0x5c, 0x2e, + 0x44, 0x76, 0xc4, 0x8b, 0x69, 0x91, 0x31, 0x75, 0x88, 0x5a, 0x4a, 0x89, + 0xcd, 0xbf, 0x9e, 0x62, 0x4c, 0xa3, 0x88, 0x8c, 0x73, 0x5e, 0x7f, 0x94, + 0xbb, 0x80, 0xa9, 0xbf, 0x46, 0xb3, 0x60, 0xbc, 0x32, 0x56, 0x31, 0x41, + 0xb7, 0xb4, 0x99, 0xaf, 0x51, 0xb5, 0x32, 0x7b, 0x87, 0xc6, 0x23, 0x33, + 0xcf, 0x67, 0xa9, 0xb7, 0xbd, 0xd4, 0x98, 0x79, 0x66, 0xa8, 0xa3, 0x9b, + 0x49, 0x76, 0xa8, 0x90, 0x41, 0x9d, 0x3c, 0xa7, 0x3a, 0xd1, 0xc1, 0xd9, + 0x3b, 0x3d, 0x90, 0x98, 0x55, 0x38, 0x50, 0x93, 0x62, 0x70, 0x69, 0x76, + 0xca, 0x75, 0x79, 0xc7, 0x9e, 0xa7, 0xa0, 0xcd, 0x3a, 0x48, 0xb2, 0x40, + 0xb9, 0x76, 0x5c, 0xa2, 0xb5, 0x92, 0x89, 0xcc, 0xcc, 0x79, 0x62, 0xb6, + 0x61, 0xb7, 0x42, 0xa3, 0xd4, 0x51, 0x8c, 0x61, 0x91, 0xb1, 0x38, 0x96, + 0xb7, 0xa8, 0x3d, 0x42, 0x7d, 0xd5, 0xd0, 0x47, 0xb4, 0x7f, 0x85, 0xbc, + 0x89, 0xac, 0x8b, 0x9f, 0x7c, 0xa9, 0xa4, 0x76, 0x7e, 0xb5, 0x97, 0xd1, + 0x33, 0x58, 0x32, 0x80, 0xca, 0x5a, 0x9f, 0x70, 0xbc, 0x78, 0xcf, 0xbd, + 0xd4, 0x9a, 0x77, 0x42, 0xb9, 0xcc, 0x45, 0xb7, 0xb0, 0xaa, 0x36, 0x82, + 0xaa, 0x4f, 0x9a, 0xc6, 0x37, 0x5c, 0x7a, 0xc7, 0x67, 0xbe, 0x94, 0x5a, + 0x74, 0x6f, 0x49, 0xcc, 0x2a, 0x27, 0x39, 0x46, 0x2c, 0x6f, 0x56, 0x79, + 0x6f, 0xad, 0x5a, 0x89, 0xbf, 0x81, 0x6f, 0x40, 0x88, 0xb5, 0xb9, 0x50, + 0x4f, 0x82, 0x48, 0x96, 0xc0, 0x45, 0xa1, 0x7c, 0xd9, 0x50, 0x4f, 0x77, + 0x65, 0x80, 0x63, 0xa2, 0x36, 0x63, 0x41, 0x39, 0x44, 0x97, 0x6c, 0xb1, + 0xba, 0xa9, 0xb8, 0x8d, 0x35, 0x83, 0x71, 0x59, 0xa1, 0x34, 0x4c, 0xa2, + 0xc3, 0x53, 0x45, 0xa6, 0x53, 0x8e, 0x8b, 0x8b, 0x47, 0x57, 0x95, 0xc9, + 0x3e, 0x3d, 0x76, 0xb1, 0x7d, 0xc3, 0x37, 0xb4, 0x98, 0xac, 0xbf, 0xc0, + 0x3a, 0xb9, 0x40, 0x39, 0x78, 0x65, 0x7f, 0xca, 0xd6, 0x74, 0x8e, 0x77, + 0xa2, 0xbb, 0x86, 0x89, 0x56, 0x50, 0xbd, 0x58, 0x76, 0x84, 0x3f, 0xac, + 0x46, 0x6a, 0x75, 0x6d, 0x43, 0xb0, 0xc6, 0x2f, 0x98, 0x64, 0x67, 0xa9, + 0xab, 0xad, 0xb3, 0x5a, 0x4a, 0x93, 0xa5, 0xc9, 0xc2, 0xae, 0x57, 0xc7, + 0x74, 0x6a, 0x32, 0x39, 0x6d, 0xc4, 0x63, 0xc3, 0xca, 0x63, 0x70, 0x2e, + 0x9b, 0xb7, 0x5d, 0x8a, 0x59, 0x7a, 0x88, 0xce, 0xc2, 0x89, 0x4d, 0x92, + 0xb5, 0x8d, 0x88, 0x9f, 0x45, 0x5d, 0x3f, 0x7f, 0xa2, 0x8a, 0xb9, 0xce, + 0x9a, 0x37, 0x78, 0x6e, 0x78, 0x40, 0xd0, 0x7d, 0xba, 0xbd, 0xa6, 0x82, + 0x40, 0xd2, 0xd8, 0xb9, 0x7c, 0xa3, 0x4a, 0xa5, 0x4f, 0xaf, 0x52, 0x25, + 0x7c, 0xbe, 0xb3, 0xd8, 0x46, 0x51, 0x59, 0x8b, 0x7e, 0x3d, 0xbf, 0x91, + 0x54, 0x4c, 0xbb, 0x33, 0x66, 0x80, 0xab, 0xd4, 0x5d, 0x30, 0xc6, 0x5d, + 0x58, 0x79, 0xd5, 0x4b, 0x7c, 0xaf, 0x2c, 0x9d, 0x35, 0x92, 0x89, 0x44, + 0xa4, 0xa2, 0x66, 0xa8, 0x6f, 0xa8, 0x5e, 0x38, 0xbb, 0x61, 0xae, 0x68, + 0xb3, 0xa2, 0x9e, 0xce, 0x9d, 0x68, 0xb7, 0x77, 0xd4, 0x43, 0x26, 0x42, + 0x8f, 0x73, 0x71, 0xa6, 0xcc, 0xda, 0x60, 0x49, 0xab, 0xad, 0x89, 0xbd, + 0x35, 0xb5, 0x7a, 0xc2, 0x70, 0xae, 0x90, 0x85, 0x9e, 0xc1, 0x79, 0xa8, + 0x69, 0xc0, 0xda, 0x8a, 0x3a, 0x37, 0x94, 0xa2, 0x41, 0xae, 0x7b, 0x86, + 0xae, 0x3c, 0x52, 0x6e, 0x6b, 0xa3, 0x86, 0xa6, 0x34, 0x9e, 0x92, 0x6c, + 0xba, 0x6d, 0xa5, 0x75, 0xb2, 0x78, 0x57, 0x6f, 0xce, 0xbc, 0xc2, 0x3c, + 0x97, 0xc6, 0x70, 0xa4, 0xa8, 0x5f, 0xb1, 0x7f, 0x7c, 0xc6, 0x4f, 0x9a, + 0x46, 0x8f, 0xc2, 0x3b, 0xb4, 0x56, 0x43, 0x27, 0x4f, 0x3d, 0xc8, 0xcc, + 0x94, 0xd3, 0x90, 0x7f, 0xc2, 0xc3, 0xc0, 0x99, 0x2e, 0xb5, 0xc7, 0x71, + 0x52, 0x60, 0xd1, 0xad, 0x66, 0x7b, 0xc1, 0x8d, 0xbe, 0xa9, 0xb4, 0x53, + 0x80, 0xc5, 0x41, 0x8d, 0x41, 0xaf, 0x99, 0x90, 0xc4, 0x49, 0xb3, 0x66, + 
0x8a, 0xc1, 0xba, 0x73, 0x86, 0xa5, 0xa7, 0x4e, 0x5e, 0x6a, 0x85, 0x87, + 0x62, 0xcc, 0x88, 0xb5, 0x50, 0xd0, 0x89, 0xa7, 0x99, 0x43, 0x7b, 0x69, + 0x94, 0x6e, 0x24, 0x8f, 0xad, 0x67, 0x7d, 0x94, 0xcc, 0x59, 0xe1, 0xc9, + 0xb3, 0x49, 0x63, 0x9a, 0x95, 0x7a, 0xc6, 0x7f, 0x72, 0xb1, 0x7c, 0x80, + 0xb0, 0x71, 0xa2, 0x4b, 0xae, 0xd3, 0x44, 0x79, 0x8a, 0x62, 0xa4, 0xba, + 0x8d, 0x8f, 0x9b, 0x49, 0x73, 0xba, 0x88, 0x90, 0x7a, 0x50, 0xb6, 0x3c, + 0x73, 0xac, 0x6e, 0x86, 0x34, 0xaa, 0x3a, 0x53, 0x4f, 0x77, 0x8e, 0xba, + 0xa6, 0x7b, 0xa8, 0x3c, 0x95, 0xa9, 0x60, 0x31, 0x52, 0x48, 0x79, 0x6b, + 0x6f, 0x3d, 0x64, 0xc0, 0xbc, 0xd1, 0x51, 0x44, 0x5c, 0x7f, 0x3e, 0x49, + 0x81, 0x6f, 0x94, 0x55, 0x86, 0x6d, 0x72, 0x36, 0x63, 0xa1, 0xca, 0xa8, + 0x9c, 0x41, 0xc7, 0xd0, 0x9b, 0x52, 0x89, 0x75, 0xd5, 0x3d, 0xcf, 0xcf, + 0xc8, 0x47, 0x9a, 0x45, 0x8d, 0xc0, 0x75, 0x3a, 0x66, 0x57, 0x45, 0x77, + 0x38, 0x7d, 0x95, 0xa9, 0xca, 0x7c, 0x6a, 0x6a, 0x93, 0x4d, 0xc3, 0x9b, + 0x87, 0x94, 0x62, 0xc3, 0xab, 0x62, 0x85, 0xb4, 0x94, 0x55, 0x70, 0xcc, + 0x6c, 0x5e, 0xb2, 0xb3, 0x41, 0x6f, 0x5e, 0x4c, 0x6b, 0x5b, 0x68, 0x85, + 0x3a, 0x7f, 0x80, 0x8f, 0x7f, 0x64, 0xae, 0xb9, 0xa3, 0xd7, 0x31, 0x6c, + 0x7b, 0x99, 0x5f, 0xb9, 0x63, 0x59, 0xc0, 0x7d, 0x49, 0xd3, 0x88, 0x37, + 0x4d, 0x48, 0x7c, 0xbc, 0x4d, 0xd0, 0x35, 0x68, 0x88, 0xb6, 0xa7, 0x80, + 0x3a, 0xd5, 0xbd, 0x7c, 0x6d, 0x45, 0x5d, 0x52, 0x42, 0x40, 0x49, 0x69, + 0xac, 0xc0, 0xcc, 0x8d, 0x97, 0xcf, 0x88, 0xb3, 0x92, 0xb2, 0x82, 0xc8, + 0x84, 0xad, 0x6f, 0xb5, 0x4e, 0xb4, 0x8c, 0x42, 0x88, 0x8b, 0x8d, 0xb2, + 0x3c, 0xb8, 0x84, 0xc6, 0x67, 0xc7, 0x4a, 0x30, 0x7a, 0xc9, 0x2a, 0xa5, + 0xda, 0x5a, 0x91, 0x70, 0x84, 0x70, 0xcd, 0x57, 0x45, 0x44, 0xc0, 0xa8, + 0x99, 0x56, 0x6e, 0xa8, 0x7d, 0xcb, 0x4c, 0x56, 0xa7, 0xb8, 0xd3, 0x7f, + 0x64, 0xb0, 0xa6, 0x6f, 0x66, 0x30, 0x3e, 0xae, 0x94, 0x6b, 0x37, 0x61, + 0x5d, 0x99, 0x52, 0xa9, 0xbe, 0x67, 0xc3, 0x76, 0x43, 0x57, 0x51, 0x6b, + 0xb4, 0xdf, 0x55, 0xba, 0x5b, 0x4c, 0xd5, 0x95, 0x77, 0x70, 0xaa, 0x6a, + 0xcd, 0x61, 0xab, 0xbb, 0x5d, 0x37, 0x4c, 0x7e, 0xcf, 0x43, 0x8f, 0x39, + 0x60, 0xce, 0x7a, 0x84, 0x48, 0xac, 0x40, 0x92, 0xd1, 0xce, 0xab, 0x99, + 0x98, 0x94, 0x66, 0x39, 0x3b, 0xb1, 0x85, 0xd0, 0x32, 0xaa, 0x60, 0xad, + 0x52, 0xd5, 0xd0, 0x3c, 0x91, 0x69, 0xa1, 0xc4, 0x5a, 0xce, 0x7a, 0x82, + 0x9e, 0x7f, 0xad, 0x62, 0xc8, 0x5e, 0x8f, 0x9c, 0x81, 0xab, 0x79, 0xc5, + 0xae, 0x60, 0x4a, 0x41, 0x8a, 0x4d, 0x5b, 0x5d, 0x47, 0x3e, 0x37, 0x5d, + 0xbb, 0x3e, 0x68, 0xc6, 0x72, 0xcf, 0x9e, 0x39, 0x93, 0x5b, 0x47, 0x42, + 0x6b, 0x4b, 0xb2, 0xb7, 0x69, 0xa0, 0xce, 0x5a, 0x4f, 0x5a, 0xa4, 0x77, + 0x5f, 0xc1, 0xb7, 0xab, 0xb3, 0x5f, 0xc1, 0xb6, 0x93, 0xb4, 0x87, 0x55, + 0x73, 0x68, 0x9b, 0x50, 0x3e, 0xb7, 0xda, 0x54, 0x5f, 0x98, 0x84, 0xa5, + 0xaa, 0xd6, 0x36, 0x41, 0x75, 0x5f, 0x71, 0x63, 0x8d, 0x58, 0xac, 0x62, + 0xc5, 0x88, 0xa6, 0x4c, 0x4f, 0x4c, 0xc3, 0x42, 0xc0, 0x4f, 0xa3, 0xb6, + 0xb5, 0xc3, 0x40, 0x6f, 0x4b, 0xa5, 0x4b, 0x72, 0x49, 0xd1, 0x49, 0xae, + 0x44, 0x45, 0x54, 0xd8, 0x5c, 0xbb, 0x8a, 0xc7, 0x86, 0x55, 0xaf, 0x76, + 0x8f, 0x6a, 0x81, 0xca, 0x45, 0x88, 0x58, 0xb0, 0xae, 0x31, 0x83, 0x97, + 0x61, 0x40, 0x5c, 0xc0, 0x71, 0xa2, 0xb1, 0x93, 0x3c, 0xce, 0xc3, 0xa5, + 0xa9, 0x9f, 0x9f, 0x46, 0xb6, 0xca, 0x4e, 0xbf, 0xc8, 0x68, 0x6e, 0x6a, + 0xa2, 0xa6, 0xc9, 0x46, 0x5c, 0xc4, 0xd1, 0x6d, 0xcd, 0x6b, 0x66, 0x8c, + 0x3f, 0x4b, 0x76, 0x43, 0xd8, 0x4c, 0xa6, 0xb7, 0x79, 0x65, 0x50, 0xa8, + 0x55, 0xa1, 0xd4, 0x8a, 0x57, 0x7b, 0x3d, 0x42, 0x9f, 0xb0, 0xca, 0x8f, + 0x5f, 0x4c, 0x5a, 0x94, 0x52, 0xb1, 0x9c, 0x6d, 0xaf, 0x89, 0x5f, 0xc3, + 
0xba, 0xa7, 0xdb, 0xca, 0x66, 0x8f, 0xcf, 0xae, 0xa1, 0x3c, 0xaa, 0x7f, + 0x8b, 0x68, 0xaa, 0x40, 0x3f, 0xc7, 0x80, 0xa0, 0x73, 0x80, 0xce, 0x78, + 0x52, 0x4c, 0x63, 0x6e, 0x87, 0x3e, 0x4f, 0x4a, 0xb7, 0x6a, 0x9d, 0x36, + 0x99, 0x58, 0x93, 0x35, 0x55, 0x58, 0x50, 0x6e, 0x9b, 0x81, 0x74, 0x6d, + 0x85, 0x3a, 0x67, 0x4f, 0xc4, 0x3f, 0x64, 0x6d, 0xa6, 0x4e, 0x36, 0x40, + 0xb3, 0x3a, 0x52, 0x95, 0x52, 0xa9, 0xa6, 0xb8, 0x50, 0xca, 0x7a, 0xcc, + 0x4a, 0xc1, 0xc9, 0x54, 0x67, 0x88, 0x82, 0xd3, 0x90, 0x4c, 0xa9, 0x8f, + 0x8b, 0xab, 0xa7, 0x9f, 0x48, 0xbb, 0xd5, 0x9c, 0x74, 0xd2, 0x76, 0xa4, + 0xc1, 0x7a, 0x95, 0x81, 0x57, 0xc2, 0x5c, 0x58, 0x41, 0x41, 0x6e, 0x72, + 0x59, 0xa1, 0xa6, 0x53, 0xc9, 0x68, 0xa4, 0x5a, 0xb7, 0xbb, 0xab, 0xb8, + 0x77, 0x5e, 0x90, 0xac, 0xcc, 0x5a, 0xc5, 0xa9, 0x55, 0x7d, 0x61, 0x83, + 0x41, 0x6f, 0x3d, 0x47, 0x97, 0x8a, 0x62, 0x59, 0x63, 0x3c, 0x6e, 0x6a, + 0xad, 0x59, 0xa6, 0x34, 0x3f, 0x39, 0x5b, 0xc2, 0xc1, 0x8e, 0x83, 0x4a, + 0xbb, 0x81, 0x48, 0x9f, 0x7c, 0x55, 0x2e, 0x33, 0x3e, 0x79, 0x95, 0x8b, + 0xbf, 0x64, 0xbd, 0x4d, 0xaf, 0xd5, 0x61, 0xcb, 0x40, 0x5f, 0x69, 0x42, + 0x8a, 0x99, 0x91, 0x89, 0xbe, 0x81, 0xb2, 0x97, 0x60, 0x75, 0xa3, 0xca, + 0x59, 0xca, 0x57, 0xa4, 0xcb, 0xd3, 0x39, 0x8d, 0xb7, 0x4d, 0x8a, 0xb3, + 0x9a, 0x54, 0x5c, 0xa4, 0xc9, 0xa4, 0x43, 0x41, 0x4a, 0xcd, 0xc1, 0x9c, + 0x9a, 0x8c, 0x8f, 0x90, 0x43, 0x62, 0xc4, 0x47, 0x81, 0x70, 0x9a, 0x7b, + 0xc0, 0xc1, 0x55, 0x38, 0x43, 0xa7, 0xc3, 0x96, 0xd5, 0x93, 0x40, 0x53, + 0x5c, 0x56, 0x83, 0x4b, 0x8e, 0x86, 0x94, 0x50, 0xa1, 0x91, 0x58, 0xaf, + 0x6e, 0x7e, 0x6a, 0x76, 0x65, 0x5d, 0x54, 0x8a, 0xc9, 0xb1, 0xa2, 0x94, + 0xb5, 0x89, 0x94, 0x64, 0x59, 0x6e, 0x8b, 0x97, 0xb2, 0x37, 0x60, 0x51, + 0x50, 0xb9, 0x92, 0x3c, 0x97, 0x54, 0x38, 0x5d, 0x42, 0x9c, 0x5a, 0x99, + 0x36, 0x48, 0x9c, 0x69, 0x51, 0xa4, 0x5d, 0x67, 0xa9, 0x88, 0xbc, 0x86, + 0x71, 0x6f, 0x9a, 0xa2, 0xb0, 0xb7, 0xb7, 0x75, 0x70, 0x60, 0x98, 0x40, + 0x92, 0x96, 0xd5, 0xa0, 0xd3, 0xdb, 0xbc, 0x3a, 0x6b, 0xc5, 0x9a, 0x7f, + 0x9f, 0x5e, 0x8b, 0xc2, 0x4e, 0xcb, 0xad, 0xcb, 0x47, 0xd4, 0x88, 0x5b, + 0xae, 0x59, 0x7b, 0x8b, 0x3c, 0x48, 0xad, 0x74, 0xaa, 0x9b, 0x8e, 0x6b, + 0x73, 0xcc, 0xd6, 0x9d, 0x93, 0x83, 0xb3, 0xa1, 0x4f, 0x9f, 0x8a, 0x53, + 0xbf, 0x69, 0xb5, 0xd2, 0x62, 0xb6, 0xd3, 0x86, 0x72, 0x80, 0xb0, 0x5a, + 0x4f, 0xbd, 0xbf, 0x89, 0x9e, 0x73, 0x8f, 0x4b, 0x95, 0xa5, 0x2e, 0x36, + 0x65, 0xa3, 0x44, 0x3a, 0x66, 0x87, 0x81, 0xac, 0x78, 0x53, 0xb2, 0x5b, + 0xc5, 0x9d, 0xc3, 0xad, 0x80, 0xa4, 0x67, 0x8f, 0x4a, 0xb7, 0x75, 0x3e, + 0x3b, 0xd0, 0x6d, 0xc8, 0x8f, 0x5e, 0x6c, 0x44, 0x70, 0x46, 0x8e, 0xc2, + 0x41, 0x7c, 0x84, 0x64, 0x8c, 0x5f, 0xd0, 0x44, 0x76, 0x54, 0xd4, 0xd3, + 0xb7, 0xab, 0xb1, 0x4a, 0x5b, 0xc8, 0x45, 0xcd, 0x4b, 0x57, 0xb3, 0xa3, + 0x94, 0x61, 0xc6, 0x3e, 0x6d, 0x49, 0x35, 0x93, 0xab, 0x74, 0x3f, 0xd3, + 0xc6, 0xb8, 0xb7, 0x73, 0x73, 0x81, 0x4d, 0x62, 0xbb, 0xa6, 0x9d, 0x50, + 0x39, 0x77, 0xbc, 0xa1, 0x5b, 0xd5, 0x9d, 0x7b, 0xb9, 0x7f, 0xaa, 0x9f, + 0xac, 0x69, 0x3c, 0x83, 0x3e, 0x75, 0x37, 0x94, 0x2f, 0x87, 0x55, 0xd9, + 0x65, 0xd7, 0xa8, 0xb5, 0x70, 0xc8, 0x76, 0x46, 0x7b, 0xa4, 0xb0, 0x74, + 0x5b, 0x88, 0x95, 0xa2, 0xa0, 0x27, 0x46, 0x72, 0xb8, 0xa1, 0x49, 0x5e, + 0x40, 0x6d, 0x8f, 0xb4, 0x99, 0x68, 0x60, 0xb7, 0x9e, 0xa6, 0x65, 0x3e, + 0xc7, 0x64, 0x7d, 0xb3, 0x8b, 0x73, 0x39, 0x68, 0x7d, 0x49, 0x48, 0x6c, + 0x3b, 0x47, 0x87, 0xbe, 0x51, 0xa6, 0x40, 0x51, 0x3b, 0xcf, 0x39, 0x90, + 0x4b, 0xbe, 0x4b, 0x96, 0x88, 0x32, 0x7f, 0x65, 0x4a, 0xa4, 0x43, 0x7f, + 0xb6, 0x70, 0x57, 0xbb, 0xbb, 0x98, 0xc1, 0x57, 0x89, 0x62, 0x51, 0xa3, + 
0xa6, 0xb3, 0x8f, 0x54, 0x72, 0x5c, 0x3a, 0x7c, 0xb3, 0x93, 0x47, 0x8d, + 0xb7, 0x72, 0x69, 0xbd, 0x33, 0x8a, 0x4f, 0x78, 0x60, 0xad, 0x3a, 0xbb, + 0x70, 0xbc, 0xc7, 0xaf, 0x99, 0x3b, 0xaa, 0x8e, 0x3c, 0x71, 0x42, 0x46, + 0x33, 0x70, 0x81, 0xb0, 0x49, 0x92, 0x75, 0x3f, 0x9c, 0x7f, 0x3d, 0x4b, + 0x79, 0xbf, 0x64, 0x8f, 0x98, 0x2f, 0x52, 0x83, 0x33, 0x3d, 0x6c, 0x8a, + 0xcc, 0x37, 0x75, 0x80, 0x47, 0x59, 0x69, 0x79, 0x6c, 0x7e, 0x65, 0x48, + 0x30, 0x91, 0x86, 0x9b, 0x83, 0x3d, 0x34, 0x48, 0xa3, 0x41, 0xca, 0x9c, + 0x6a, 0xa4, 0x8f, 0x96, 0xc5, 0x6c, 0xb5, 0xa9, 0x37, 0x92, 0x5f, 0xb9, + 0x31, 0x32, 0xa3, 0x80, 0xb0, 0x69, 0xd5, 0x52, 0xc2, 0x44, 0x7a, 0x89, + 0xc5, 0x3f, 0x47, 0x8f, 0x68, 0x55, 0x69, 0xcc, 0x3d, 0x75, 0xcf, 0x49, + 0x53, 0x60, 0x76, 0x7b, 0x6b, 0x87, 0x87, 0xb0, 0xbf, 0xac, 0x40, 0x35, + 0x94, 0x44, 0x7f, 0x4a, 0x8d, 0xac, 0xb0, 0x6e, 0x8f, 0xbf, 0x8e, 0x9d, + 0x72, 0x93, 0x4a, 0x63, 0x78, 0x7c, 0xa6, 0x77, 0x3d, 0x81, 0x8b, 0x6b, + 0x7c, 0x66, 0xa9, 0xbe, 0x3c, 0x6a, 0x6f, 0x42, 0xd1, 0xc1, 0x6e, 0x31, + 0x87, 0xa1, 0x8b, 0xaf, 0x5b, 0xc3, 0x34, 0x71, 0x46, 0x51, 0x6a, 0xb2, + 0x84, 0x58, 0x62, 0x6d, 0xa3, 0x99, 0xc6, 0x77, 0x3d, 0xa0, 0xa8, 0x9b, + 0x32, 0x7d, 0xd6, 0x8e, 0x8b, 0x39, 0xd2, 0x39, 0xbf, 0x92, 0xc3, 0x38, + 0x84, 0xc3, 0xb4, 0xb5, 0x6a, 0x48, 0x5f, 0x75, 0xba, 0x63, 0xcc, 0x79, + 0x7f, 0x8c, 0xa5, 0xb1, 0x69, 0xb5, 0xb7, 0x42, 0xb7, 0xc8, 0xd3, 0x63, + 0x78, 0xd0, 0x38, 0x93, 0x91, 0x56, 0x65, 0xac, 0xb6, 0xc0, 0x8d, 0xbf, + 0x35, 0x80, 0x4a, 0x77, 0x87, 0x53, 0xcb, 0x58, 0x9b, 0x4c, 0x69, 0x47, + 0xd0, 0x89, 0x8a, 0xbd, 0x55, 0xcf, 0xbd, 0x5a, 0x6c, 0x79, 0x5f, 0x85, + 0x3c, 0xa4, 0x67, 0xc6, 0x85, 0x3e, 0xcc, 0x67, 0x85, 0x57, 0x3d, 0xb2, + 0x4b, 0xac, 0x90, 0x41, 0x5f, 0x8c, 0xcb, 0x4a, 0x4f, 0x9b, 0x96, 0x69, + 0xaf, 0x8c, 0x89, 0x6d, 0x71, 0x55, 0x79, 0x4c, 0x82, 0x84, 0xc3, 0xcc, + 0x56, 0xc2, 0x96, 0x2d, 0xb2, 0x83, 0x84, 0x6e, 0xbd, 0x46, 0xbf, 0x51, + 0x68, 0x44, 0xa9, 0xa7, 0x3b, 0x60, 0xc0, 0x70, 0xc5, 0x65, 0x8e, 0xc6, + 0x9e, 0x36, 0xa4, 0xb8, 0xd6, 0x4f, 0x5c, 0x94, 0x51, 0x46, 0x82, 0xd4, + 0x72, 0x97, 0x50, 0xba, 0xae, 0x8e, 0x5f, 0x6d, 0x65, 0xbc, 0xc6, 0xb5, + 0xcb, 0x45, 0x72, 0x4b, 0x64, 0xab, 0x80, 0x39, 0x6d, 0x83, 0x63, 0x57, + 0x36, 0xb1, 0x86, 0x62, 0x37, 0x8e, 0x52, 0x5f, 0x4d, 0xd3, 0x9f, 0xb1, + 0x7e, 0x6b, 0x5c, 0x9a, 0x6b, 0x8b, 0xbf, 0x85, 0x45, 0x82, 0x9a, 0xc0, + 0x3d, 0x45, 0xa3, 0x88, 0xbe, 0x62, 0x60, 0x6d, 0xbf, 0x43, 0x5f, 0x70, + 0x5c, 0xcc, 0xcf, 0x43, 0x88, 0x95, 0x3e, 0xc5, 0xa8, 0x3c, 0x6d, 0x84, + 0x83, 0x94, 0xa7, 0xaa, 0xaa, 0x69, 0x61, 0x8d, 0x4d, 0x2d, 0x67, 0x44, + 0xcb, 0xc4, 0x4b, 0x40, 0x55, 0x3d, 0x7b, 0xac, 0x32, 0x91, 0x59, 0x35, + 0x5b, 0x94, 0xc3, 0x9f, 0x5f, 0x80, 0x6a, 0x80, 0x9f, 0x9f, 0xb1, 0x69, + 0x54, 0x99, 0x83, 0x69, 0xa8, 0x66, 0x37, 0x4f, 0x8c, 0xc1, 0xcc, 0xa6, + 0x81, 0xa5, 0x80, 0x57, 0x5e, 0xd3, 0xb9, 0xb4, 0x94, 0x3d, 0x7f, 0x3e, + 0xc0, 0xbe, 0x72, 0x6d, 0x53, 0x82, 0x64, 0x5c, 0x7c, 0x8b, 0x4e, 0xc5, + 0x3e, 0x4b, 0x5e, 0xb3, 0xcf, 0x98, 0x58, 0xb7, 0x6e, 0x81, 0x68, 0x50, + 0x76, 0x40, 0x75, 0xc1, 0x98, 0x57, 0x50, 0x2c, 0xa7, 0xca, 0x95, 0x92, + 0x8d, 0x55, 0xa1, 0x85, 0x3a, 0xaa, 0x5a, 0x49, 0xa7, 0xce, 0xd4, 0x75, + 0xcd, 0x58, 0xcf, 0xc8, 0x82, 0xa7, 0xaf, 0x84, 0x47, 0xa5, 0x5f, 0xc0, + 0x8d, 0xaa, 0x46, 0x7e, 0x94, 0x52, 0x42, 0xb2, 0xc1, 0xdc, 0x93, 0xc7, + 0x78, 0x94, 0x31, 0x7c, 0xab, 0xa2, 0x2b, 0xd0, 0xd4, 0x44, 0x84, 0xd1, + 0x64, 0x7f, 0x5b, 0x6b, 0x69, 0x3f, 0xa7, 0x75, 0x60, 0x71, 0x9e, 0xa3, + 0x58, 0xca, 0xbd, 0x77, 0x65, 0x88, 0x7d, 0x91, 0x6e, 0x5c, 0x6e, 0x3a, + 
0x4c, 0xbb, 0x54, 0x3e, 0xb5, 0x57, 0x3c, 0xa3, 0x8e, 0x9b, 0x32, 0x81, + 0x32, 0x99, 0x77, 0x72, 0x9b, 0x36, 0xba, 0x77, 0x6d, 0xcc, 0x6a, 0xbd, + 0x3a, 0xd5, 0xcd, 0x3c, 0x61, 0x69, 0xd4, 0xcd, 0x6e, 0x9d, 0x7d, 0xb6, + 0x63, 0x42, 0x7e, 0x4e, 0x7b, 0x3c, 0xa1, 0x5e, 0x85, 0x50, 0xcb, 0x8c, + 0x5f, 0x37, 0xa8, 0xa5, 0xb0, 0x98, 0xd3, 0x72, 0xb0, 0xb6, 0xcb, 0x41, + 0x67, 0x45, 0xc8, 0x97, 0x66, 0xc7, 0xb6, 0x69, 0x58, 0x88, 0x34, 0x61, + 0x72, 0x4b, 0x67, 0x87, 0xa6, 0xad, 0x7f, 0xc2, 0xb1, 0x92, 0xad, 0x62, + 0x43, 0x3b, 0x31, 0x76, 0x90, 0xab, 0xd1, 0xc3, 0xcc, 0xbe, 0x72, 0x8b, + 0x9b, 0x91, 0xc4, 0x44, 0xc7, 0x4c, 0x3b, 0xd3, 0xa6, 0xa2, 0x4b, 0xd1, + 0x7a, 0xaa, 0xb9, 0xa2, 0x3d, 0x8d, 0x8a, 0x5d, 0x84, 0xd5, 0xce, 0x98, + 0x85, 0x74, 0x7c, 0x54, 0x93, 0x64, 0x5b, 0x72, 0xb8, 0x40, 0xbf, 0x9f, + 0xbf, 0x3a, 0xbf, 0x5c, 0x95, 0x37, 0x2d, 0x87, 0x7a, 0x84, 0x3d, 0x4e, + 0xb0, 0x7d, 0x59, 0x91, 0xa6, 0x43, 0x2c, 0x6c, 0xcd, 0xcf, 0x74, 0x3f, + 0xc8, 0x91, 0x45, 0x37, 0x54, 0x3b, 0x4c, 0x3b, 0xa9, 0x61, 0x73, 0x70, + 0x89, 0x63, 0xa3, 0x7a, 0x8a, 0x9c, 0x55, 0x4b, 0x59, 0x8f, 0xc0, 0x3f, + 0x8e, 0x5f, 0x35, 0x33, 0xcc, 0xc1, 0xbe, 0x7b, 0xaa, 0x3e, 0x82, 0xa4, + 0x40, 0x3d, 0x94, 0x9b, 0x4f, 0xb2, 0x9a, 0x69, 0x81, 0xa7, 0xc3, 0x3e, + 0x6e, 0x88, 0x51, 0x81, 0x84, 0x31, 0x8b, 0xaa, 0x5f, 0xb2, 0xb8, 0xce, + 0xcf, 0x70, 0x80, 0x39, 0x38, 0x8b, 0x4a, 0x63, 0x74, 0x98, 0x31, 0xbc, + 0x83, 0x6b, 0x5d, 0x8c, 0xa1, 0x70, 0x5c, 0x55, 0x4d, 0x42, 0xd4, 0x73, + 0xb2, 0xc6, 0x9b, 0x31, 0x45, 0x49, 0x47, 0x3e, 0xcb, 0x5d, 0xab, 0x43, + 0xba, 0x61, 0x6a, 0x49, 0x84, 0xc3, 0x76, 0x45, 0xb3, 0x36, 0xbc, 0x58, + 0x46, 0xa3, 0x9f, 0x9f, 0x91, 0xc8, 0xaf, 0x6e, 0x73, 0xab, 0x59, 0x4e, + 0xb9, 0x83, 0xb2, 0x48, 0x99, 0x7e, 0xc3, 0x9c, 0x66, 0xa0, 0x57, 0xc6, + 0xa2, 0x5d, 0x91, 0x94, 0x90, 0x67, 0xa7, 0x5f, 0xb8, 0x7c, 0x98, 0x3c, + 0x9a, 0xa8, 0x45, 0x98, 0x7b, 0x90, 0x83, 0x54, 0x52, 0xca, 0xa1, 0x39, + 0x58, 0x9d, 0xcb, 0xb9, 0xbb, 0x45, 0x99, 0xbd, 0xcd, 0xb6, 0xa0, 0xa9, + 0x60, 0x5f, 0x81, 0x9d, 0xc5, 0x5a, 0xd0, 0x59, 0xc6, 0x45, 0xca, 0xc7, + 0xad, 0xa9, 0x7c, 0x83, 0x64, 0x57, 0x73, 0x6f, 0xbe, 0x9e, 0x55, 0x31, + 0x9a, 0x56, 0xcc, 0x2e, 0xad, 0xbe, 0x56, 0x46, 0x72, 0x5c, 0x70, 0x4f, + 0x42, 0x72, 0xc2, 0x8d, 0xc9, 0x8b, 0x6c, 0x4d, 0x49, 0x92, 0x5b, 0x92, + 0x99, 0x37, 0x51, 0xce, 0x58, 0x85, 0x61, 0x6c, 0x4f, 0x5d, 0x60, 0xd1, + 0x91, 0x77, 0x8c, 0x48, 0x73, 0x83, 0xa4, 0x80, 0x99, 0x33, 0x90, 0x84, + 0x6a, 0x4f, 0x6b, 0x94, 0xb1, 0x9d, 0x61, 0x77, 0xd2, 0x35, 0xcf, 0x35, + 0x7e, 0x66, 0x80, 0x96, 0xab, 0x6b, 0xba, 0xcd, 0x8a, 0x5e, 0xc8, 0x36, + 0xc9, 0xbc, 0x51, 0x86, 0x90, 0x33, 0xd5, 0x33, 0xc2, 0x65, 0xa2, 0x45, + 0x67, 0xa3, 0xcf, 0x9e, 0x41, 0xa4, 0x7f, 0x5b, 0x36, 0x53, 0xa8, 0x83, + 0xc0, 0x75, 0xcd, 0x58, 0x76, 0x67, 0x8f, 0xab, 0x44, 0x47, 0xa5, 0x85, + 0x9d, 0x39, 0xc9, 0x56, 0x33, 0xb8, 0x8a, 0x71, 0x42, 0x6a, 0x60, 0x39, + 0x7d, 0xad, 0x97, 0xbb, 0x5d, 0x94, 0x3b, 0xb9, 0x54, 0x9e, 0xbb, 0x85, + 0x3d, 0x8e, 0x7a, 0xcc, 0x81, 0xa8, 0xbb, 0x8a, 0x57, 0xb8, 0x7c, 0x5c, + 0xb6, 0x9e, 0x7a, 0xd0, 0x4e, 0x56, 0x5a, 0x33, 0x91, 0x8a, 0x9e, 0x7d, + 0x9e, 0x82, 0x51, 0xb3, 0xa3, 0x5b, 0xb5, 0x64, 0x47, 0x63, 0x57, 0x9b, + 0xb1, 0x6a, 0x82, 0x47, 0x89, 0x45, 0xd1, 0x3c, 0x94, 0x4c, 0x74, 0x94, + 0xbf, 0x36, 0x67, 0x83, 0xb2, 0xd0, 0xc6, 0x40, 0x74, 0x3e, 0xa2, 0xc0, + 0xcd, 0xa2, 0x4c, 0xa4, 0x44, 0xd0, 0xa6, 0xc3, 0x76, 0x9a, 0x4a, 0x5c, + 0x32, 0xaf, 0x9d, 0x84, 0x8a, 0x76, 0xa8, 0x4e, 0x78, 0xc2, 0x73, 0x59, + 0x49, 0x94, 0xb9, 0x88, 0xbf, 0x41, 0x7e, 0x75, 0x32, 0xb2, 0x92, 0x43, + 
0x6c, 0xbd, 0x3b, 0x53, 0x4d, 0xcc, 0x48, 0x58, 0xa3, 0xb5, 0x8f, 0x74, + 0x8a, 0x48, 0x46, 0xb5, 0xa1, 0x7b, 0x88, 0xa8, 0xb9, 0xa2, 0xb7, 0x6b, + 0x6c, 0xb9, 0x60, 0x4f, 0x4a, 0x91, 0x7d, 0xd4, 0x8a, 0x6f, 0x80, 0x54, + 0x51, 0x4e, 0x4f, 0x55, 0x5d, 0x6f, 0x4f, 0xa3, 0x94, 0x93, 0x83, 0x5a, + 0xa0, 0x3a, 0x79, 0x93, 0xa4, 0x46, 0xce, 0x8c, 0x91, 0xbd, 0xb6, 0x60, + 0x8e, 0xc1, 0x48, 0xcf, 0x91, 0x5e, 0x76, 0x4b, 0x3d, 0x3a, 0x4d, 0xbc, + 0x34, 0x38, 0x7a, 0xc7, 0x99, 0x4d, 0x53, 0xb8, 0x94, 0xa4, 0xac, 0xce, + 0xb6, 0xd1, 0x7b, 0x8c, 0x53, 0x64, 0x6a, 0xa3, 0x59, 0x37, 0xa0, 0x81, + 0xa3, 0xc3, 0x4c, 0xbc, 0x8c, 0x83, 0xaf, 0x99, 0x4b, 0x85, 0x8c, 0x41, + 0xc1, 0xbc, 0xa8, 0x41, 0x44, 0x82, 0xcf, 0x91, 0xab, 0x64, 0x5e, 0xcb, + 0xc5, 0xcd, 0xb7, 0xa8, 0x75, 0x5c, 0x81, 0x3f, 0x48, 0x40, 0xc4, 0x66, + 0x4a, 0xbe, 0x5f, 0x9c, 0x52, 0x9d, 0xaa, 0x38, 0x4d, 0x6f, 0x66, 0xd0, + 0x6c, 0x36, 0xb7, 0x96, 0x5b, 0xd0, 0x74, 0x85, 0xa7, 0x52, 0xbc, 0x72, + 0xcc, 0xce, 0xc9, 0xbf, 0x35, 0xa2, 0xba, 0x83, 0x63, 0x71, 0xd3, 0x6b, + 0xb8, 0xcf, 0x4b, 0x9e, 0x58, 0x70, 0xb6, 0x74, 0x87, 0x3a, 0x8f, 0xb1, + 0x8d, 0xac, 0x6e, 0x5e, 0xbd, 0x58, 0x4f, 0x3b, 0xb8, 0x5a, 0x7f, 0x35, + 0xc2, 0xa4, 0x7f, 0x80, 0x67, 0x54, 0x62, 0xb6, 0xcd, 0xb6, 0x64, 0x43, + 0x9e, 0xc0, 0x55, 0x8d, 0xcb, 0x5b, 0x86, 0xd2, 0x39, 0x6a, 0x85, 0xbf, + 0xae, 0xbb, 0x96, 0xc5, 0x4c, 0x85, 0xab, 0xa6, 0xc0, 0xd0, 0xb7, 0x9e, + 0xbb, 0x36, 0x4c, 0xa2, 0x50, 0x32, 0x58, 0x7a, 0xaa, 0x5b, 0x4a, 0xcc, + 0x5e, 0xa6, 0xa7, 0x3c, 0x7e, 0x34, 0x46, 0x41, 0xa7, 0x5e, 0xc2, 0xa4, + 0x64, 0xaf, 0xc0, 0x85, 0xa3, 0x9a, 0x56, 0xbb, 0x37, 0x3e, 0x9a, 0x70, + 0x8d, 0xd2, 0x82, 0x8c, 0xb3, 0x71, 0x6c, 0x62, 0xb4, 0x88, 0xaa, 0x3d, + 0x59, 0x8a, 0x7d, 0x42, 0xc9, 0x46, 0xa6, 0x90, 0xbf, 0x74, 0xcd, 0xc0, + 0x83, 0x73, 0x80, 0x3d, 0x6d, 0x37, 0x3e, 0x60, 0x9c, 0x58, 0x86, 0x73, + 0x74, 0xab, 0xb0, 0xcd, 0x38, 0xbe, 0xcd, 0x95, 0x95, 0x7f, 0x60, 0x66, + 0xa5, 0x48, 0x98, 0x40, 0x58, 0x78, 0x3d, 0x58, 0x2f, 0xbf, 0xbf, 0x4c, + 0x5e, 0x91, 0x98, 0x8b, 0xa0, 0x9a, 0x77, 0x5c, 0x94, 0x7d, 0x6d, 0xb6, + 0x98, 0x3e, 0xce, 0x70, 0x7b, 0x78, 0x2c, 0xb7, 0x81, 0xc8, 0x39, 0x9a, + 0xa1, 0x97, 0xc6, 0xad, 0x63, 0xc1, 0xb7, 0x52, 0xa7, 0xd2, 0x41, 0x90, + 0xc4, 0x71, 0x89, 0xbd, 0x53, 0xb3, 0xd2, 0xca, 0xa3, 0x86, 0xa5, 0xa7, + 0x6b, 0x57, 0x8e, 0x6a, 0x48, 0xb4, 0x8d, 0x6c, 0x51, 0x61, 0x82, 0xa2, + 0x63, 0x50, 0x66, 0x94, 0x6f, 0xc3, 0x45, 0xc0, 0x52, 0xab, 0xa4, 0xb1, + 0x91, 0xa9, 0xc9, 0xc5, 0x71, 0x5a, 0xac, 0xce, 0x59, 0x5a, 0x46, 0x71, + 0x5a, 0x9e, 0xb5, 0x7f, 0x6e, 0x39, 0x6d, 0x3c, 0x58, 0x8f, 0x8b, 0x95, + 0xd3, 0xa9, 0x68, 0x7c, 0x9d, 0x8a, 0x89, 0xcb, 0x86, 0x46, 0xc7, 0xb2, + 0x8c, 0xad, 0x35, 0x97, 0x72, 0xc3, 0xc8, 0x49, 0x6e, 0xba, 0x57, 0x82, + 0x8d, 0x6b, 0x54, 0x7c, 0xae, 0x35, 0x51, 0xbd, 0xb7, 0xd3, 0x99, 0x5b, + 0xa9, 0x99, 0xab, 0x72, 0x37, 0xa6, 0x88, 0xb5, 0x3e, 0x3a, 0x70, 0xcb, + 0x6c, 0x9c, 0x42, 0x8f, 0x57, 0xd4, 0xae, 0x74, 0x98, 0x6f, 0x8d, 0x77, + 0xad, 0x54, 0x5c, 0xc1, 0x61, 0x98, 0xb6, 0xca, 0xc2, 0xa0, 0x4b, 0x32, + 0xa5, 0x45, 0xc2, 0xb4, 0x50, 0x3e, 0x78, 0x67, 0x5c, 0x85, 0x3b, 0x5c, + 0x49, 0x8c, 0x45, 0x36, 0x35, 0x38, 0x7d, 0x36, 0x49, 0xc8, 0x6b, 0xa5, + 0x6f, 0x6f, 0x59, 0x54, 0x9a, 0xbf, 0xd3, 0x49, 0x3b, 0x90, 0x51, 0x76, + 0x73, 0x87, 0x6b, 0xaf, 0x9f, 0x74, 0x36, 0xa1, 0xb0, 0xd1, 0x7f, 0xa7, + 0xa2, 0x72, 0x86, 0xd5, 0x99, 0x9c, 0x53, 0x38, 0xb4, 0x3e, 0xb4, 0xb9, + 0xc1, 0xbb, 0x3a, 0xcc, 0x35, 0x3b, 0x95, 0x43, 0x40, 0x70, 0xc8, 0xc9, + 0x65, 0xbd, 0xc5, 0xc4, 0x37, 0x91, 0x44, 0x4a, 0x99, 0x4b, 0xb1, 0x76, + 
0x3b, 0x3d, 0xcf, 0x45, 0x7b, 0x77, 0x3b, 0xb1, 0x87, 0xd0, 0x53, 0x7f, + 0xb8, 0x96, 0x37, 0x84, 0x9d, 0xaf, 0x69, 0xaa, 0xa4, 0x92, 0x3b, 0xa2, + 0x5f, 0xbd, 0x3d, 0x93, 0x33, 0xab, 0x77, 0x89, 0x88, 0xba, 0x34, 0x40, + 0xac, 0x6b, 0xba, 0x88, 0x47, 0x4a, 0x53, 0x3d, 0x91, 0x2b, 0xb8, 0xd6, + 0x7e, 0xca, 0xaf, 0x96, 0x79, 0x87, 0x9a, 0x4f, 0xc5, 0x58, 0x73, 0x5b, + 0x80, 0x4f, 0xc0, 0xb3, 0xa2, 0x47, 0xa0, 0xa6, 0x7d, 0xa1, 0x60, 0x4a, + 0x82, 0x34, 0xb2, 0xbf, 0x41, 0xa3, 0xca, 0xc1, 0x39, 0x70, 0x3f, 0x66, + 0xc1, 0x96, 0x94, 0x83, 0xc8, 0x50, 0xb3, 0x7a, 0xad, 0xbb, 0xaf, 0x5e, + 0x65, 0xb8, 0x30, 0x6e, 0x97, 0xd2, 0x5f, 0x89, 0x9c, 0x3c, 0xb0, 0x7b, + 0xa5, 0x83, 0x5f, 0x8c, 0x53, 0xca, 0xc6, 0xd0, 0x9a, 0x72, 0x8b, 0x66, + 0x87, 0xb4, 0xb5, 0xb3, 0xa9, 0xc2, 0x3a, 0x50, 0xa4, 0xa2, 0x3b, 0x9c, + 0x7c, 0x38, 0xb9, 0xc9, 0xc2, 0x35, 0xa7, 0x7e, 0x66, 0x3e, 0x90, 0x56, + 0xcc, 0xbf, 0x6d, 0x91, 0x3d, 0xd5, 0x98, 0x67, 0x71, 0x6f, 0x61, 0x3d, + 0x55, 0x32, 0xa8, 0x35, 0x60, 0x76, 0x62, 0x7a, 0xd2, 0xc7, 0xb1, 0xa2, + 0x38, 0xcb, 0xb7, 0x53, 0xa0, 0x47, 0x57, 0xa9, 0x36, 0x2c, 0xaf, 0xa3, + 0xb5, 0xbe, 0x49, 0x80, 0xa8, 0xbf, 0x95, 0xc1, 0xa8, 0xc6, 0x64, 0xba, + 0x72, 0x8a, 0x6c, 0x69, 0x95, 0xd2, 0x65, 0xa2, 0x6b, 0x8e, 0x5e, 0xd2, + 0xb5, 0xc2, 0xc5, 0x41, 0xa4, 0x59, 0xd5, 0xd6, 0xa7, 0x6f, 0x6c, 0xc2, + 0x81, 0x4b, 0xbe, 0x61, 0xc1, 0x8d, 0x51, 0x8b, 0x7c, 0x63, 0x9e, 0x69, + 0x9d, 0xb5, 0xc7, 0xc3, 0x66, 0xc8, 0xba, 0x6e, 0x8c, 0x71, 0x66, 0x5c, + 0xb0, 0x3e, 0x68, 0xa7, 0xb7, 0x42, 0x5d, 0xb1, 0xca, 0x35, 0x33, 0x59, + 0x7b, 0x33, 0x9d, 0x7e, 0xc8, 0x96, 0xaf, 0xd3, 0x45, 0x8c, 0xaf, 0xbc, + 0x39, 0x95, 0x48, 0x35, 0x63, 0xbb, 0x35, 0xc5, 0x59, 0x55, 0x45, 0x48, + 0x59, 0x5a, 0x6d, 0x6b, 0xcc, 0x34, 0x63, 0x7b, 0xcb, 0x69, 0xd5, 0xd3, + 0x53, 0x9b, 0x74, 0x41, 0x72, 0x87, 0x76, 0xcb, 0x58, 0xd0, 0x3a, 0xa0, + 0x41, 0x71, 0x65, 0x5a, 0x4e, 0xa2, 0x84, 0x8c, 0x4e, 0xad, 0x9a, 0x86, + 0xd1, 0x3f, 0xc6, 0x9a, 0x46, 0xd8, 0x49, 0xbd, 0xd1, 0xb5, 0xcc, 0x9e, + 0x6f, 0xc6, 0x4a, 0xca, 0x70, 0xca, 0x8e, 0x6c, 0x3a, 0x40, 0x4c, 0x73, + 0x87, 0xc5, 0x55, 0x55, 0xbb, 0x51, 0xa1, 0xa0, 0x3c, 0xa6, 0x7c, 0xd3, + 0xc7, 0x62, 0xa3, 0x4f, 0x54, 0xb0, 0xbc, 0x69, 0x64, 0x7a, 0x84, 0x72, + 0x9d, 0xb1, 0xad, 0x64, 0x61, 0x49, 0xc7, 0x6d, 0x70, 0x53, 0xad, 0xd2, + 0xbc, 0x54, 0x57, 0xb1, 0x35, 0x7d, 0x91, 0x8e, 0xd1, 0xc1, 0x60, 0x4e, + 0xd5, 0x43, 0xb8, 0x4e, 0xc8, 0x46, 0x43, 0x68, 0x84, 0xc2, 0xbe, 0x85, + 0xd0, 0xd4, 0x94, 0x4f, 0x43, 0x58, 0xa7, 0x50, 0xc4, 0xba, 0x87, 0xcd, + 0x7a, 0x31, 0x6d, 0x98, 0x92, 0x95, 0x65, 0xb7, 0x54, 0x85, 0xb9, 0x8b, + 0xc2, 0xc4, 0x5f, 0x71, 0xd3, 0x80, 0x9c, 0x7d, 0x8a, 0xa2, 0x92, 0x88, + 0xb5, 0x8f, 0x9e, 0xb0, 0x31, 0x4e, 0x78, 0x80, 0xc4, 0x69, 0xab, 0x35, + 0x54, 0x87, 0x83, 0xbd, 0xa4, 0xc9, 0x68, 0xac, 0x4b, 0x9e, 0x7d, 0x38, + 0x89, 0x8d, 0xcd, 0x93, 0x96, 0xbc, 0x57, 0x8d, 0x62, 0x91, 0x5d, 0xb6, + 0x65, 0x4e, 0xad, 0xb1, 0x76, 0x87, 0x77, 0x64, 0x92, 0x6f, 0x92, 0x55, + 0x8d, 0x99, 0x5f, 0xb8, 0x8b, 0x68, 0x3c, 0x48, 0xa1, 0xaf, 0x41, 0x34, + 0x50, 0x3f, 0x36, 0x69, 0xc8, 0x99, 0x7d, 0xc0, 0x37, 0xb4, 0x53, 0x7d, + 0x8b, 0x34, 0xad, 0x32, 0x6d, 0xa8, 0xb9, 0xc0, 0x5c, 0xb9, 0x93, 0xa5, + 0x84, 0x8f, 0xaf, 0x8e, 0x72, 0x9f, 0x9f, 0xd7, 0x93, 0x35, 0x5f, 0x5b, + 0x4b, 0xcd, 0xb8, 0xc5, 0xbd, 0x5c, 0xcd, 0x59, 0xd6, 0x55, 0xa4, 0x59, + 0xaf, 0x63, 0x4a, 0xa8, 0x83, 0xa3, 0x7f, 0xb4, 0x93, 0x94, 0x5b, 0x7f, + 0x42, 0x5d, 0x7f, 0x72, 0x6c, 0x3f, 0x75, 0x80, 0x83, 0x72, 0x66, 0x68, + 0x74, 0x49, 0xc1, 0x36, 0xa4, 0xa9, 0x80, 0x50, 0x7d, 0xbd, 0xb3, 0x43, + 
0x6d, 0x64, 0xce, 0xaf, 0xaa, 0x6b, 0x84, 0x40, 0x78, 0xcb, 0x39, 0x8c, + 0x92, 0x71, 0x4c, 0xd5, 0x98, 0x5a, 0x86, 0x77, 0x37, 0xc1, 0x6b, 0xc2, + 0xc0, 0x2a, 0x3a, 0x57, 0x8c, 0x6d, 0xbd, 0x9d, 0xd6, 0xa2, 0xc1, 0x6f, + 0x94, 0xae, 0xa4, 0x72, 0xb4, 0x8c, 0x3e, 0x4f, 0x62, 0x43, 0x3b, 0x39, + 0x3c, 0x6a, 0x79, 0x45, 0xaa, 0x33, 0xab, 0x46, 0xa7, 0x2c, 0x3c, 0x42, + 0xbf, 0x79, 0x52, 0xa8, 0xbb, 0x5a, 0x4d, 0x78, 0x81, 0x56, 0x97, 0x50, + 0x96, 0x7b, 0x86, 0x4c, 0x58, 0x52, 0x9e, 0xb8, 0xb0, 0xc4, 0x3a, 0xaa, + 0x61, 0x6c, 0x7b, 0x37, 0x5d, 0xb8, 0x3d, 0x52, 0x63, 0xc1, 0x62, 0xa0, + 0x9c, 0xcb, 0xa7, 0x6e, 0x39, 0x55, 0x52, 0xcb, 0xbb, 0x52, 0x71, 0x9d, + 0xcf, 0xb0, 0x50, 0x99, 0xaa, 0xb7, 0xba, 0x55, 0xb3, 0x6d, 0x46, 0xd0, + 0x97, 0x75, 0x83, 0x9b, 0x6b, 0x66, 0x6e, 0x63, 0x40, 0x4c, 0x41, 0x2d, + 0xb5, 0x77, 0xaa, 0x3d, 0xba, 0x7c, 0x44, 0x9f, 0xc1, 0x52, 0x6e, 0x80, + 0x60, 0x8e, 0x3f, 0x5b, 0xb4, 0x67, 0x8c, 0x7a, 0xa4, 0xa5, 0x9e, 0x39, + 0x8a, 0x95, 0x9a, 0x33, 0xc4, 0x45, 0x82, 0x5e, 0x37, 0x95, 0xc5, 0xce, + 0xc0, 0xa0, 0x87, 0xa3, 0xc9, 0x3a, 0x5b, 0xcc, 0xac, 0x61, 0x86, 0x96, + 0x59, 0x2f, 0xd2, 0x5b, 0x7c, 0x5f, 0x3f, 0x79, 0x92, 0x9b, 0xc4, 0x80, + 0x80, 0xab, 0x92, 0x3b, 0x5b, 0x99, 0xb9, 0x9e, 0x49, 0x99, 0x5f, 0x83, + 0x8d, 0xc3, 0x7a, 0xcc, 0x6d, 0xc3, 0xaa, 0xb1, 0x66, 0x71, 0x35, 0x86, + 0x8b, 0x6b, 0x5c, 0x5e, 0xb3, 0x76, 0x9e, 0xbe, 0xa8, 0x62, 0xbc, 0x54, + 0x77, 0x43, 0x96, 0x6e, 0x73, 0xd1, 0x63, 0xc7, 0xcc, 0x46, 0x38, 0x6a, + 0x9f, 0x53, 0x98, 0x3a, 0xa2, 0xc5, 0x66, 0xc1, 0x65, 0xa1, 0x38, 0x93, + 0xbc, 0x42, 0x90, 0x69, 0xc4, 0xbd, 0x67, 0xbe, 0x73, 0xc4, 0xa5, 0x9c, + 0x4a, 0xba, 0x8c, 0x69, 0xc5, 0xa4, 0x3b, 0x3d, 0x46, 0x3e, 0x9f, 0xab, + 0x50, 0xb9, 0x8c, 0xd2, 0xbc, 0x6d, 0x59, 0x32, 0x6f, 0x66, 0x3c, 0x3c, + 0x75, 0x9c, 0x58, 0x56, 0x79, 0x72, 0x49, 0xb4, 0x38, 0x5b, 0x49, 0x89, + 0x7a, 0x64, 0xa6, 0x80, 0x8e, 0xa3, 0x75, 0xa8, 0x80, 0x60, 0xb2, 0xcc, + 0xb3, 0xb1, 0x8e, 0x9a, 0x44, 0x5c, 0xae, 0x9b, 0x63, 0xbf, 0x86, 0x3d, + 0xa8, 0x9c, 0x38, 0xcf, 0xb1, 0x81, 0xa1, 0x2d, 0xc1, 0xd1, 0x71, 0xae, + 0x9f, 0xac, 0x3e, 0x57, 0x32, 0x78, 0xd6, 0xb7, 0xc6, 0x42, 0xa0, 0xad, + 0x7f, 0xbf, 0xc0, 0x5e, 0x63, 0xba, 0x38, 0x41, 0x49, 0x90, 0x8b, 0x4d, + 0x44, 0x43, 0x5a, 0xab, 0x5a, 0x46, 0xcb, 0x38, 0x75, 0x74, 0x51, 0xb7, + 0x5d, 0x97, 0x3c, 0x93, 0xc6, 0x8d, 0x58, 0x66, 0x90, 0x62, 0xa2, 0x54, + 0xb6, 0xb3, 0x89, 0x96, 0xcf, 0x34, 0xd5, 0x93, 0x52, 0x42, 0x78, 0xd0, + 0xb8, 0x69, 0xac, 0x7f, 0xc2, 0xbe, 0xad, 0xba, 0xb9, 0x43, 0xbd, 0x36, + 0x84, 0x2e, 0x97, 0xa0, 0x4c, 0x69, 0x88, 0xaa, 0x58, 0xa1, 0xb9, 0xcd, + 0xce, 0xa3, 0xbb, 0x89, 0x55, 0x84, 0xa5, 0x29, 0x50, 0x67, 0xb9, 0xb3, + 0x3c, 0x9b, 0x50, 0x36, 0xb4, 0x92, 0xa8, 0x58, 0x7a, 0xc4, 0x46, 0xbc, + 0x4e, 0x46, 0x63, 0x4f, 0x63, 0x44, 0x76, 0x6a, 0x59, 0x83, 0x7d, 0x39, + 0x4c, 0x89, 0xa1, 0x61, 0xb6, 0x95, 0xc3, 0x97, 0x52, 0xa0, 0xc3, 0x5d, + 0xb8, 0x86, 0xc4, 0xcd, 0xd2, 0xd3, 0xc3, 0x47, 0x56, 0x99, 0x47, 0x59, + 0x45, 0xba, 0x8e, 0x42, 0x6c, 0x8d, 0xac, 0xc0, 0xb9, 0x8d, 0xa1, 0xd1, + 0x2e, 0xab, 0x31, 0x6c, 0x8a, 0x63, 0x68, 0xb7, 0x4f, 0x8e, 0xb8, 0x4b, + 0xb4, 0x33, 0x6a, 0x5c, 0xb7, 0xc6, 0xa2, 0xac, 0xd6, 0x4a, 0xce, 0xcf, + 0xad, 0xad, 0x57, 0x5f, 0x77, 0x49, 0xaa, 0x92, 0x69, 0x92, 0x61, 0x53, + 0x4b, 0x55, 0xbb, 0x97, 0x97, 0x92, 0x85, 0x7d, 0x54, 0x89, 0x46, 0x4b, + 0x64, 0xce, 0xca, 0xca, 0xae, 0x77, 0x67, 0xbd, 0x42, 0xab, 0xbf, 0x70, + 0x94, 0xa1, 0x53, 0x85, 0x68, 0xa8, 0x76, 0xc6, 0x80, 0x81, 0xa9, 0x8b, + 0xa2, 0xc3, 0x76, 0xb7, 0xbc, 0xb8, 0x5c, 0x6b, 0x47, 0x45, 0x83, 0x4b, + 
0x66, 0xbd, 0xbf, 0x8b, 0x7b, 0xb2, 0xd0, 0xaa, 0x4f, 0x97, 0x8c, 0x55, + 0x6a, 0x47, 0xb4, 0x94, 0x76, 0x4f, 0x46, 0x92, 0x36, 0xa4, 0x59, 0x43, + 0x88, 0x74, 0xc5, 0x9d, 0x7a, 0x41, 0x32, 0x7e, 0x3a, 0xb3, 0x42, 0xd4, + 0xad, 0xbe, 0x92, 0xce, 0xc1, 0x64, 0xcd, 0x7f, 0x91, 0x65, 0xbe, 0xbd, + 0x4c, 0x92, 0x62, 0x65, 0x52, 0x6a, 0x80, 0xb2, 0x53, 0x29, 0xbc, 0x8d, + 0xb1, 0x80, 0x98, 0x77, 0x76, 0x64, 0xb5, 0x8a, 0x52, 0x92, 0x5b, 0xcb, + 0x7e, 0x86, 0xbd, 0x40, 0x65, 0x30, 0xc5, 0x8d, 0x3a, 0x61, 0x37, 0x73, + 0xb2, 0x3a, 0xd0, 0xaa, 0xc5, 0xc1, 0x76, 0xb6, 0x55, 0xa0, 0x8a, 0xc6, + 0x80, 0xce, 0x7b, 0xab, 0x62, 0xb3, 0xc5, 0x74, 0x68, 0xb5, 0x5a, 0x84, + 0xce, 0xd8, 0xc3, 0x86, 0xa7, 0xaa, 0xac, 0x8c, 0x5e, 0x48, 0xa3, 0x56, + 0x78, 0x9a, 0x3d, 0x81, 0x32, 0xab, 0x80, 0xcf, 0x70, 0xa5, 0x88, 0x64, + 0x5e, 0x6f, 0x8e, 0x38, 0x93, 0x8c, 0x6b, 0x99, 0x34, 0xc7, 0x4b, 0x9e, + 0x92, 0xae, 0xcf, 0xd1, 0x85, 0xc3, 0xa6, 0xb9, 0x55, 0xaa, 0x68, 0x37, + 0x40, 0x92, 0x71, 0x8c, 0x55, 0x7a, 0x60, 0x84, 0xc0, 0x33, 0x3a, 0xca, + 0xcf, 0x45, 0x98, 0xc6, 0x61, 0x7d, 0x4e, 0x3a, 0x30, 0x9a, 0xab, 0x8c, + 0x4f, 0x47, 0x63, 0x26, 0x5e, 0x93, 0xbf, 0x51, 0x64, 0x91, 0x48, 0x82, + 0x46, 0x56, 0x6c, 0x2b, 0x46, 0xd0, 0xd2, 0x78, 0x3e, 0x98, 0x90, 0xa7, + 0x4c, 0x9c, 0x8a, 0x73, 0x9b, 0x5e, 0x96, 0x81, 0xc8, 0x97, 0x54, 0xdb, + 0x8c, 0x6a, 0x49, 0xa1, 0x6a, 0xb3, 0x78, 0x8c, 0x22, 0x49, 0x38, 0x6c, + 0xa3, 0xa5, 0xb4, 0x8b, 0x89, 0x84, 0x79, 0x3a, 0x6a, 0x63, 0x3e, 0x50, + 0x9e, 0x91, 0x91, 0xd1, 0x76, 0xaf, 0x56, 0x80, 0x55, 0x51, 0xc2, 0xbd, + 0x89, 0x65, 0x7c, 0x7b, 0x3d, 0x7a, 0xb0, 0x5e, 0xc2, 0x3c, 0x9f, 0xaf, + 0xa2, 0x73, 0x9d, 0x60, 0x9f, 0xd2, 0x98, 0x3b, 0x3c, 0xbb, 0x85, 0xcf, + 0xc1, 0x54, 0xc5, 0x53, 0x53, 0xd0, 0x38, 0x34, 0x4e, 0xac, 0xb2, 0xb8, + 0xcd, 0xb0, 0x4c, 0x6e, 0xb6, 0xa2, 0x98, 0x5e, 0x72, 0x2f, 0x8a, 0x66, + 0x35, 0x9f, 0x2c, 0xc7, 0xcd, 0x91, 0x83, 0xd3, 0xb4, 0xbf, 0x54, 0x38, + 0xb3, 0x69, 0x56, 0x59, 0xcd, 0x76, 0x45, 0x54, 0x52, 0x9e, 0x4e, 0x9f, + 0x6e, 0x88, 0x95, 0x3d, 0x32, 0xd3, 0x7f, 0x6c, 0x4a, 0x43, 0x83, 0xbd, + 0x91, 0x67, 0x6f, 0xab, 0x3a, 0x71, 0x6b, 0x82, 0xb7, 0x70, 0xd6, 0xb4, + 0x33, 0x71, 0x9a, 0xbb, 0x3c, 0xa7, 0xac, 0x84, 0x4d, 0x9e, 0x50, 0x60, + 0x46, 0xbe, 0x43, 0xa1, 0x9b, 0x77, 0x78, 0xb3, 0x3d, 0x85, 0xb5, 0xcd, + 0xd5, 0x54, 0xcb, 0x52, 0x80, 0xbe, 0x71, 0xab, 0x58, 0x56, 0x40, 0x58, + 0xa3, 0x97, 0xd3, 0x43, 0xa4, 0xab, 0x5c, 0x95, 0xc4, 0x69, 0x79, 0xb2, + 0xa3, 0x47, 0x93, 0x91, 0x5f, 0x87, 0xcc, 0x3e, 0xb1, 0xaa, 0xa2, 0x8e, + 0x7d, 0x6e, 0xaf, 0x3e, 0xac, 0x76, 0x33, 0x55, 0xa0, 0x44, 0x77, 0x96, + 0x81, 0x9b, 0xd0, 0xc9, 0x52, 0x38, 0x9f, 0x4a, 0x93, 0x3e, 0xc2, 0x89, + 0xe2, 0xb9, 0xa2, 0x78, 0x48, 0xc8, 0x3c, 0x9c, 0xcc, 0x50, 0x9e, 0x65, + 0x41, 0xa1, 0xb4, 0xc0, 0x4a, 0x68, 0xa4, 0xa3, 0x59, 0xab, 0x7d, 0x5a, + 0xdf, 0xc4, 0x38, 0x89, 0x53, 0x49, 0x85, 0x68, 0xa0, 0xa2, 0x7c, 0xbd, + 0xb7, 0xad, 0x6a, 0xa9, 0x8f, 0xa1, 0x5b, 0x6d, 0x73, 0x50, 0x7e, 0x35, + 0x3b, 0x63, 0x89, 0xad, 0x34, 0xcb, 0xbf, 0x3d, 0x5e, 0xba, 0x63, 0x74, + 0xc4, 0x95, 0x4e, 0xa7, 0x8a, 0xae, 0x94, 0x85, 0x4c, 0x85, 0xa8, 0x4c, + 0xc1, 0x3e, 0x4e, 0xa3, 0x3c, 0xc7, 0x66, 0x8b, 0x60, 0x85, 0xa2, 0x80, + 0x6f, 0x45, 0x74, 0xad, 0x84, 0x60, 0x96, 0x53, 0x3b, 0xbd, 0x60, 0x99, + 0xd7, 0x69, 0xc5, 0xbf, 0xa7, 0x87, 0x3a, 0x6a, 0x7d, 0x72, 0x7d, 0x56, + 0x65, 0x85, 0x9e, 0xb0, 0xad, 0x84, 0x50, 0x42, 0x7d, 0xa4, 0x5c, 0xc1, + 0x91, 0xb0, 0xc5, 0xc2, 0x3b, 0x85, 0x5c, 0xb5, 0x8c, 0xc2, 0x9d, 0xb6, + 0x30, 0x2d, 0x74, 0xa1, 0x70, 0x85, 0x97, 0x3f, 0xac, 0x49, 0x4f, 0x42, + 
0xc3, 0x3a, 0x7e, 0xc8, 0x54, 0xa7, 0x7d, 0x73, 0x86, 0x92, 0x35, 0xa5, + 0x75, 0xa2, 0x9e, 0xb6, 0x3e, 0x62, 0xbd, 0xbf, 0x9f, 0x32, 0x95, 0xb7, + 0x44, 0x6d, 0x97, 0x41, 0x82, 0x99, 0x9f, 0x70, 0x6f, 0x3a, 0xb0, 0xd6, + 0xc9, 0xc0, 0xd1, 0x93, 0x51, 0x5c, 0x42, 0x9a, 0x65, 0x8b, 0x60, 0xb8, + 0xda, 0xd0, 0x5b, 0x57, 0x97, 0xa2, 0xd2, 0x75, 0x96, 0xa4, 0x6e, 0x97, + 0xa7, 0x81, 0xc5, 0x7f, 0x73, 0x73, 0x7e, 0x45, 0xc7, 0xab, 0x7e, 0x9f, + 0xb1, 0x26, 0x85, 0x32, 0x8c, 0x9d, 0x8d, 0x2b, 0xcb, 0x71, 0x5f, 0x84, + 0xce, 0x99, 0x7c, 0x49, 0x62, 0x75, 0x97, 0xb5, 0xae, 0x4c, 0x70, 0x59, + 0x86, 0x8e, 0x56, 0x7e, 0x44, 0xc2, 0x8b, 0x5b, 0x8a, 0x40, 0x6e, 0x53, + 0x8a, 0x40, 0xb2, 0x95, 0xa7, 0x3e, 0xcc, 0x70, 0xd0, 0x75, 0xd1, 0x87, + 0x78, 0x9d, 0x43, 0xc4, 0x46, 0x36, 0x67, 0x75, 0x79, 0x62, 0xce, 0x79, + 0x3b, 0x5d, 0x8a, 0x7c, 0x79, 0x53, 0x95, 0xbb, 0xa6, 0xc6, 0xae, 0xc8, + 0xcb, 0x82, 0xc2, 0x6f, 0x5d, 0x82, 0x69, 0x9f, 0x54, 0x43, 0x58, 0x8b, + 0xb0, 0x9d, 0xa2, 0xbd, 0x66, 0xb7, 0x5b, 0x99, 0x4f, 0x4a, 0x38, 0x61, + 0x49, 0x37, 0xc7, 0x7b, 0x71, 0x93, 0x6d, 0xc5, 0x51, 0xa9, 0xba, 0xb1, + 0x56, 0x79, 0xce, 0x5d, 0xba, 0x7c, 0xda, 0x99, 0xbf, 0x7c, 0x62, 0xa2, + 0x75, 0xae, 0x7b, 0xbd, 0xb8, 0x67, 0x55, 0x45, 0x29, 0x5c, 0x41, 0x38, + 0x9c, 0x91, 0x84, 0x55, 0x9a, 0x8f, 0x64, 0xc7, 0x69, 0x73, 0xc1, 0x72, + 0x2f, 0x6c, 0xbd, 0x77, 0xaf, 0xbd, 0xb6, 0x87, 0x93, 0xa7, 0x64, 0xd1, + 0x87, 0x4c, 0x49, 0x4f, 0x3e, 0xa4, 0x6f, 0x67, 0x3e, 0xb7, 0x78, 0xd0, + 0x42, 0x80, 0xd1, 0x40, 0x39, 0x9b, 0x94, 0x4a, 0x2c, 0x8c, 0x3b, 0xca, + 0x53, 0x40, 0x87, 0x75, 0x89, 0xc0, 0xdb, 0x6d, 0x4d, 0x78, 0x62, 0x52, + 0x79, 0x7d, 0x91, 0x46, 0x60, 0x88, 0x5e, 0x3f, 0xc5, 0x4b, 0xa9, 0x4a, + 0x4e, 0xd9, 0xd3, 0xdc, 0x38, 0x5a, 0x34, 0xaa, 0x81, 0xbe, 0x27, 0xae, + 0xa5, 0xa7, 0x97, 0xcf, 0xd2, 0xaf, 0x66, 0x94, 0x79, 0x9c, 0xdb, 0x87, + 0x74, 0x23, 0x72, 0x55, 0x5c, 0xab, 0xa1, 0x76, 0x76, 0x7a, 0x87, 0x3e, + 0x99, 0x43, 0xb5, 0x75, 0xcd, 0xb0, 0x8c, 0xc9, 0x72, 0xa8, 0x90, 0xab, + 0xc5, 0x42, 0x8a, 0x6f, 0x81, 0x7a, 0xdf, 0x53, 0x6d, 0x3d, 0x8a, 0x38, + 0x4f, 0x85, 0x7f, 0x5f, 0x8b, 0x83, 0x32, 0x56, 0x51, 0x2e, 0x96, 0x56, + 0x2a, 0x9f, 0x94, 0x6e, 0x3a, 0x8d, 0x87, 0xd8, 0xb0, 0x35, 0x75, 0xd6, + 0xa1, 0xd1, 0x49, 0xbd, 0x91, 0x7e, 0x8f, 0x26, 0xc2, 0x72, 0x9f, 0x54, + 0xb7, 0xc6, 0x24, 0xb4, 0x3c, 0xb4, 0x95, 0xa3, 0x60, 0x6a, 0xd5, 0x45, + 0xcf, 0xaf, 0x82, 0x43, 0xb4, 0x93, 0x6e, 0xcc, 0xba, 0x38, 0x8e, 0x8c, + 0x50, 0xcc, 0x51, 0x52, 0xbb, 0x8f, 0x6b, 0x2f, 0x5a, 0xbd, 0x5f, 0xd9, + 0x3f, 0x81, 0x81, 0x88, 0x66, 0x89, 0xbf, 0x3f, 0xb7, 0x49, 0x6a, 0xa4, + 0x9e, 0xd7, 0x97, 0xa7, 0xc1, 0xc2, 0x8b, 0x85, 0xaf, 0x3b, 0x66, 0xaa, + 0x53, 0x95, 0x9f, 0xaf, 0x94, 0x6a, 0x9c, 0x5b, 0xbf, 0x7b, 0x48, 0x6c, + 0x43, 0xb3, 0x4a, 0x31, 0xad, 0xcb, 0x90, 0xae, 0x2f, 0x70, 0xc4, 0xdd, + 0xc0, 0x39, 0x85, 0x50, 0xa5, 0x85, 0x9c, 0xb1, 0x55, 0x73, 0x82, 0x9c, + 0xa0, 0x49, 0xa9, 0x44, 0xc9, 0x7f, 0xd9, 0x79, 0x4c, 0x8d, 0x68, 0xb5, + 0xa4, 0x7f, 0xd0, 0xcf, 0x70, 0x7f, 0xd4, 0x9c, 0xa4, 0x69, 0xaa, 0xc3, + 0xb0, 0xac, 0x4b, 0x83, 0x8d, 0xcb, 0x61, 0x96, 0xbb, 0x7a, 0x61, 0x89, + 0xc0, 0x65, 0x5f, 0xa8, 0xd4, 0x76, 0x44, 0x91, 0xb7, 0x7e, 0x38, 0xbd, + 0xc1, 0x81, 0x5c, 0xc3, 0x34, 0x65, 0xc0, 0x56, 0x87, 0xa0, 0xcb, 0x75, + 0x83, 0xbf, 0xc2, 0x85, 0x30, 0x85, 0x68, 0xd5, 0xa1, 0x8d, 0xb4, 0xa1, + 0x96, 0x71, 0x86, 0xa2, 0xd1, 0xa2, 0xa6, 0x69, 0x6d, 0x42, 0x60, 0x59, + 0x98, 0x65, 0x5b, 0x3f, 0x36, 0x92, 0x80, 0x5e, 0xbe, 0x96, 0x93, 0xa5, + 0xbc, 0x28, 0x72, 0x5b, 0x7a, 0x41, 0x3a, 0x97, 0x87, 0x60, 0xbe, 0x65, + 
0xa9, 0xa2, 0xcf, 0x9e, 0x43, 0xba, 0x3e, 0x96, 0xb4, 0x9a, 0x95, 0x8c, + 0x70, 0xa5, 0x36, 0x51, 0x46, 0x93, 0x8f, 0xb8, 0x65, 0x78, 0x9a, 0xc0, + 0xad, 0x83, 0x32, 0xb5, 0x4a, 0xb7, 0xb5, 0xc8, 0x6d, 0x43, 0x73, 0x8f, + 0x66, 0xb4, 0x85, 0x4e, 0xc3, 0xcb, 0x7b, 0x5d, 0xd5, 0xca, 0xab, 0x42, + 0x58, 0x29, 0x88, 0x8f, 0xae, 0xbd, 0x6a, 0x41, 0xd3, 0xcf, 0xb9, 0x49, + 0x10, 0x7e, 0x88, 0x57, 0x64, 0x51, 0x72, 0xa1, 0x4c, 0xbc, 0xb6, 0xa3, + 0x3d, 0xad, 0x77, 0x85, 0x7e, 0xcc, 0x7e, 0x55, 0x82, 0x41, 0x2f, 0x2d, + 0x6c, 0x94, 0xa3, 0x71, 0x76, 0xc7, 0x81, 0x78, 0xe1, 0xc6, 0xc5, 0xb2, + 0x7f, 0x7e, 0x7f, 0x4f, 0x4a, 0x9c, 0xbc, 0xb3, 0x32, 0xcc, 0x7c, 0x98, + 0x59, 0x65, 0x57, 0x40, 0x6b, 0x6e, 0x5c, 0x56, 0x49, 0x80, 0x52, 0x77, + 0xa3, 0x35, 0xc7, 0xa1, 0x40, 0x68, 0x95, 0x71, 0x3e, 0x9e, 0x92, 0x31, + 0x35, 0xd4, 0x82, 0x6e, 0x42, 0xbc, 0x94, 0x42, 0x63, 0x51, 0x9c, 0x47, + 0xd7, 0xad, 0x87, 0x7f, 0xbc, 0x57, 0xbb, 0xbf, 0x35, 0x4a, 0x7f, 0xb3, + 0x8a, 0x84, 0xa3, 0xde, 0x76, 0x65, 0x84, 0x55, 0xc7, 0x97, 0x46, 0x91, + 0x9e, 0xa2, 0x4e, 0x48, 0x7d, 0x63, 0xc6, 0x23, 0xc6, 0x8c, 0x9a, 0x88, + 0x4b, 0x64, 0x76, 0x5c, 0x36, 0xb9, 0xb7, 0x84, 0xc0, 0x97, 0x44, 0x7c, + 0x80, 0xd0, 0xad, 0x9c, 0xaa, 0x5b, 0x54, 0xc0, 0xcb, 0x72, 0xdb, 0x44, + 0x35, 0xb5, 0xa1, 0xa6, 0x75, 0xcb, 0x5c, 0x63, 0xc5, 0x72, 0x8d, 0x58, + 0xca, 0xbc, 0x69, 0x9e, 0x3c, 0x9a, 0xa9, 0x9a, 0xb2, 0xc0, 0xc3, 0x50, + 0xcd, 0x50, 0x6a, 0x95, 0xaf, 0x6d, 0x86, 0x4a, 0xbd, 0x5d, 0x6f, 0x4a, + 0x99, 0x6f, 0x37, 0xba, 0x47, 0xb1, 0x4c, 0x86, 0x93, 0xc1, 0x41, 0xd4, + 0x65, 0x82, 0xa8, 0xbc, 0x74, 0x3b, 0x88, 0x38, 0x70, 0xab, 0x33, 0x5f, + 0xa9, 0xa9, 0x86, 0x59, 0x48, 0x78, 0x9d, 0xb7, 0x62, 0x7f, 0x80, 0xdf, + 0xc5, 0x9a, 0x96, 0x8b, 0x44, 0xb5, 0x34, 0xd8, 0x42, 0xa2, 0x56, 0x9e, + 0x66, 0x7f, 0x52, 0xae, 0xa4, 0x3d, 0x8e, 0x3d, 0xc1, 0x58, 0xc1, 0xd2, + 0x4e, 0xb6, 0x7c, 0xb9, 0x77, 0xcb, 0x8f, 0x36, 0x32, 0x48, 0xbe, 0x42, + 0x47, 0x6f, 0x73, 0x49, 0xa5, 0x9f, 0x7b, 0x36, 0x41, 0x7c, 0xb2, 0x73, + 0x76, 0x58, 0x94, 0x25, 0x9a, 0x9c, 0x79, 0xbe, 0xce, 0x43, 0xd3, 0x8c, + 0xdd, 0xbc, 0x84, 0x4e, 0x5d, 0x64, 0x87, 0x3e, 0x2f, 0x88, 0x56, 0x81, + 0x91, 0x7f, 0x59, 0x7f, 0xcb, 0xc0, 0x9d, 0x5b, 0xb5, 0x7f, 0x97, 0xc9, + 0x78, 0x57, 0x59, 0xc4, 0x69, 0x77, 0xac, 0x66, 0x39, 0x6f, 0x59, 0x43, + 0x8c, 0xc9, 0xd6, 0x5d, 0x66, 0x5d, 0xc7, 0x49, 0xcc, 0x8d, 0x60, 0x62, + 0x59, 0x39, 0xa7, 0x41, 0x88, 0x8a, 0x7a, 0xa2, 0x3f, 0xa4, 0x42, 0x88, + 0xcc, 0x77, 0xc2, 0x42, 0x8b, 0xb4, 0x58, 0x5f, 0x7d, 0x93, 0x86, 0x4a, + 0xb3, 0xa5, 0x3e, 0x3b, 0x46, 0x8c, 0x5d, 0x33, 0xc3, 0x71, 0x4d, 0x87, + 0x98, 0x92, 0xb1, 0x4d, 0x73, 0xc4, 0x6f, 0x9b, 0xa2, 0x95, 0x90, 0x3d, + 0xcf, 0x3f, 0xa9, 0x65, 0xa6, 0x6f, 0x51, 0xbb, 0x74, 0x7a, 0x79, 0x66, + 0xb7, 0x75, 0xaa, 0xb7, 0xbd, 0xd8, 0xc5, 0xb0, 0x4e, 0x46, 0x6b, 0x5b, + 0x73, 0x3d, 0x3c, 0x89, 0x58, 0x65, 0x7c, 0x56, 0x8d, 0xb0, 0x7d, 0x9a, + 0xb5, 0x7d, 0x44, 0x96, 0xcb, 0x96, 0xaa, 0x54, 0xd2, 0x8e, 0xb9, 0x97, + 0x90, 0xc1, 0x7b, 0x93, 0x7c, 0x77, 0x78, 0x6c, 0xb8, 0x46, 0xb6, 0xb6, + 0x9c, 0xb7, 0xb7, 0x4b, 0xc4, 0x58, 0x93, 0xb3, 0xd9, 0x6d, 0x95, 0x6c, + 0x52, 0x8a, 0x9e, 0x70, 0x71, 0xa6, 0x62, 0x92, 0x4b, 0x45, 0x78, 0xbc, + 0x51, 0x7a, 0x7c, 0x75, 0x94, 0xbe, 0x94, 0x99, 0x6d, 0x78, 0x83, 0x73, + 0x4d, 0x34, 0x8f, 0x46, 0x7a, 0x65, 0xa2, 0x9f, 0x97, 0x50, 0x90, 0x86, + 0x83, 0xbc, 0x46, 0xc4, 0x7c, 0xcc, 0xa5, 0x4b, 0x48, 0x76, 0x57, 0xaf, + 0x86, 0x69, 0xc5, 0xae, 0x93, 0xc6, 0x60, 0x65, 0x75, 0x50, 0x70, 0x68, + 0x4a, 0x90, 0xb2, 0x98, 0x7a, 0x4b, 0x57, 0x8f, 0xab, 0x53, 0x95, 0x7b, + 
0x59, 0x42, 0x80, 0x64, 0x9f, 0x44, 0x7b, 0xca, 0x94, 0x9f, 0xcb, 0xac, + 0x71, 0x8c, 0x6e, 0x7c, 0x9b, 0x9c, 0x53, 0x6b, 0x99, 0x5c, 0x62, 0x8a, + 0x5e, 0x44, 0xb7, 0xba, 0x5a, 0xd1, 0x8d, 0x79, 0x40, 0xa5, 0x65, 0x70, + 0xa7, 0x65, 0x53, 0x77, 0x86, 0xac, 0x86, 0x7a, 0x72, 0xa5, 0x96, 0x76, + 0xbd, 0xb1, 0xa1, 0x4f, 0xc0, 0x54, 0x80, 0x55, 0x7e, 0x85, 0xb6, 0xc4, + 0x3c, 0x82, 0x7d, 0x6c, 0x3c, 0xb0, 0x5c, 0xc3, 0x82, 0x89, 0xad, 0x67, + 0x9a, 0x9a, 0x30, 0x2f, 0x4a, 0x75, 0x8d, 0x30, 0xb5, 0x58, 0x8b, 0xc6, + 0xa9, 0x70, 0x5e, 0x5f, 0xd5, 0xb7, 0x9e, 0x45, 0x55, 0xa0, 0x44, 0x75, + 0x60, 0x3a, 0xbb, 0x53, 0xc7, 0xc9, 0xad, 0x7a, 0xd3, 0x46, 0x3c, 0xa4, + 0x8f, 0x64, 0xab, 0x67, 0x73, 0xbd, 0x97, 0xb9, 0x72, 0x49, 0x35, 0x5e, + 0x4d, 0x4d, 0x53, 0xbd, 0x42, 0xa2, 0xca, 0xc1, 0xc3, 0x77, 0x99, 0xc1, + 0x7a, 0x6c, 0x85, 0x89, 0x8d, 0x58, 0x55, 0xb4, 0x69, 0x8c, 0x8d, 0x69, + 0x4f, 0x49, 0x7e, 0xb0, 0x81, 0x90, 0x7f, 0xa1, 0x91, 0x58, 0x82, 0xaf, + 0x5d, 0x8b, 0xb9, 0x9d, 0xc7, 0x4d, 0x7c, 0x8b, 0x50, 0x46, 0xbf, 0x8c, + 0x46, 0x85, 0xab, 0xaa, 0xa0, 0xc4, 0xad, 0xb0, 0x83, 0x96, 0x41, 0x92, + 0x3b, 0x6c, 0xc8, 0x3b, 0x89, 0xb2, 0x47, 0x3d, 0x40, 0x75, 0x4f, 0xc2, + 0x33, 0xb1, 0x55, 0x5c, 0x51, 0x66, 0xc3, 0x61, 0x32, 0xb2, 0x41, 0x63, + 0x69, 0x71, 0x81, 0x8e, 0x89, 0xab, 0x62, 0x68, 0x54, 0x98, 0xa2, 0xca, + 0x44, 0x46, 0xc1, 0x7e, 0x77, 0xa7, 0x92, 0x39, 0x88, 0xc4, 0x67, 0x71, + 0x86, 0x52, 0xb0, 0xba, 0x41, 0xc4, 0x65, 0x44, 0x35, 0xc4, 0x8e, 0x3b, + 0x43, 0xa4, 0x51, 0x43, 0x65, 0xd3, 0xad, 0x7b, 0xa5, 0x7b, 0x7b, 0xbe, + 0x73, 0xb6, 0x57, 0x35, 0x93, 0x3e, 0xbf, 0xcf, 0x61, 0x7d, 0xb6, 0x6c, + 0xc0, 0xc6, 0x75, 0x97, 0x59, 0x9a, 0xba, 0x74, 0xc2, 0x78, 0x77, 0x7f, + 0x36, 0x52, 0xc3, 0x3e, 0x71, 0x6d, 0xaf, 0xcd, 0x37, 0x3b, 0xbd, 0x52, + 0x6d, 0x73, 0x53, 0x3a, 0xa8, 0x63, 0x3f, 0xb3, 0xd1, 0x4b, 0xc6, 0x8c, + 0xb5, 0x4c, 0xc3, 0xc1, 0x3c, 0x33, 0x46, 0xad, 0x8e, 0x5e, 0x33, 0x39, + 0x51, 0x94, 0xa9, 0x9e, 0x56, 0xc7, 0x55, 0x97, 0x7a, 0x32, 0x4a, 0xa6, + 0x89, 0x68, 0xd2, 0x5c, 0x6d, 0x6f, 0x64, 0x73, 0x39, 0x6b, 0x50, 0xc7, + 0xb1, 0x52, 0x8b, 0x4a, 0x78, 0x5f, 0x70, 0x9d, 0x32, 0x88, 0x40, 0x61, + 0x49, 0x9f, 0x63, 0x45, 0x55, 0x9c, 0x85, 0x5a, 0x9b, 0xb8, 0x90, 0x9a, + 0x78, 0xc0, 0x8b, 0x5d, 0xcd, 0x9a, 0xc2, 0x4c, 0xc8, 0xa4, 0x9c, 0xd2, + 0x7e, 0x52, 0xad, 0xb9, 0x31, 0x36, 0xc7, 0xa8, 0xa5, 0x82, 0x87, 0xab, + 0xa1, 0x7e, 0x9f, 0x4b, 0xce, 0x7b, 0xce, 0x41, 0x64, 0xc6, 0x9d, 0xae, + 0xbf, 0x98, 0x59, 0xd3, 0xc9, 0x86, 0x5e, 0x45, 0xc8, 0x92, 0x98, 0x69, + 0xb4, 0xa9, 0x97, 0x97, 0xac, 0x9d, 0x7c, 0xca, 0x80, 0x6b, 0xca, 0xb9, + 0x4c, 0xc4, 0x40, 0x70, 0x31, 0xb1, 0x83, 0xc8, 0xb1, 0x64, 0x65, 0x62, + 0xa1, 0xb6, 0x7e, 0x7c, 0x8a, 0x99, 0xa2, 0x6a, 0x3b, 0xca, 0x90, 0x5e, + 0x8b, 0x79, 0xc3, 0x7b, 0xa8, 0x3c, 0xaf, 0x32, 0x3b, 0xbf, 0x37, 0xbd, + 0xb4, 0x42, 0x36, 0x2f, 0x9d, 0x81, 0xaa, 0xc0, 0xa9, 0x31, 0xcc, 0xa9, + 0x51, 0xcf, 0x90, 0x29, 0xaf, 0x86, 0x8b, 0x63, 0xce, 0x5b, 0x3b, 0xa6, + 0x45, 0xa2, 0x60, 0xce, 0xa4, 0xbe, 0xa5, 0x9f, 0x94, 0x70, 0xb2, 0xbd, + 0x80, 0x63, 0x8a, 0x89, 0x47, 0x33, 0xc0, 0x62, 0x9e, 0x82, 0x74, 0xb2, + 0x58, 0xc3, 0x7a, 0x74, 0xa8, 0xc3, 0x52, 0x60, 0x75, 0x70, 0xa1, 0x5d, + 0xce, 0xc0, 0xae, 0x50, 0x49, 0x85, 0x55, 0xa4, 0x43, 0xae, 0x5d, 0x5d, + 0x93, 0x3c, 0x71, 0x8c, 0xa4, 0xcc, 0xcf, 0x42, 0x61, 0x7a, 0xa6, 0x3e, + 0x80, 0xc1, 0xc7, 0x80, 0x48, 0xa2, 0xb1, 0x4a, 0xb9, 0x79, 0xc8, 0xd2, + 0x8e, 0xd3, 0xc3, 0x93, 0x56, 0xa4, 0x2d, 0xcb, 0x36, 0xa7, 0x60, 0x4e, + 0x64, 0x55, 0x89, 0x7b, 0x43, 0x77, 0xcf, 0xcc, 0x38, 0x59, 0x54, 0x8c, + 
0xb6, 0xd1, 0x78, 0xc6, 0x36, 0xb4, 0x50, 0x82, 0xb3, 0x56, 0xa1, 0x92, + 0xa4, 0x49, 0xb3, 0x7e, 0x82, 0x8c, 0xbe, 0x40, 0x7d, 0xba, 0xcb, 0x7d, + 0xae, 0x65, 0x37, 0x78, 0x5c, 0x7a, 0x6e, 0x35, 0x3e, 0xb2, 0x3b, 0x60, + 0x60, 0x85, 0x92, 0x74, 0x86, 0xb8, 0x76, 0x7b, 0x7f, 0xd0, 0x5a, 0x7b, + 0x65, 0x45, 0x4e, 0xa8, 0x93, 0x71, 0xb1, 0x99, 0xb7, 0x44, 0x5f, 0xab, + 0x92, 0x51, 0x38, 0x3d, 0x31, 0x58, 0x70, 0xad, 0xba, 0x9c, 0x99, 0xcb, + 0x76, 0x8a, 0x33, 0x6b, 0xbd, 0x91, 0x8f, 0x5c, 0x71, 0x92, 0xc0, 0x3d, + 0x4c, 0xb8, 0xb4, 0x94, 0xcf, 0x61, 0xc8, 0x64, 0x78, 0xc5, 0x96, 0x38, + 0xc0, 0x41, 0xc5, 0xc6, 0x43, 0xca, 0x46, 0x5c, 0x79, 0x5f, 0x89, 0x72, + 0xaa, 0xbe, 0x5f, 0x58, 0x71, 0x6d, 0xbb, 0xb2, 0xb6, 0x50, 0xb3, 0x4e, + 0x53, 0x7a, 0x85, 0x75, 0x79, 0x6e, 0x8b, 0x5e, 0x34, 0x49, 0x9b, 0x91, + 0xac, 0x7f, 0xc7, 0x3f, 0xc9, 0xd0, 0xc5, 0x3c, 0x58, 0x61, 0x31, 0xc5, + 0x70, 0xae, 0x3b, 0x41, 0xc4, 0x35, 0x57, 0x84, 0x74, 0xbc, 0x39, 0x4c, + 0x5f, 0x56, 0xbb, 0x67, 0x80, 0x41, 0xbf, 0x96, 0xb5, 0x59, 0x51, 0x4c, + 0x99, 0x5e, 0x7e, 0x7e, 0xcc, 0x6f, 0xa2, 0x9e, 0xa1, 0x49, 0x75, 0xd0, + 0xbc, 0xc0, 0x84, 0x3f, 0x35, 0x74, 0x4d, 0xc7, 0x40, 0xce, 0x7a, 0x91, + 0x9d, 0xc7, 0x68, 0x5b, 0x76, 0x4a, 0xa3, 0x64, 0x92, 0x82, 0x5f, 0x54, + 0x38, 0x7e, 0xbb, 0x88, 0xb6, 0x84, 0x46, 0xc7, 0x63, 0xa0, 0x3a, 0x8e, + 0x91, 0x7b, 0x79, 0x32, 0x35, 0x9f, 0x55, 0xc2, 0x5c, 0x42, 0x86, 0x36, + 0x4b, 0x34, 0xc5, 0x2f, 0xb3, 0xa3, 0x99, 0x9c, 0xa0, 0x95, 0x97, 0x6a, + 0x6e, 0x75, 0x66, 0xcb, 0x49, 0x95, 0xa4, 0xa9, 0x4b, 0x46, 0x76, 0xc9, + 0xb2, 0x39, 0x98, 0x74, 0x91, 0xd2, 0xad, 0x52, 0x4c, 0xaa, 0x40, 0x60, + 0x66, 0xaf, 0xbc, 0xbd, 0xce, 0xa4, 0x75, 0x7a, 0x58, 0x80, 0x94, 0x40, + 0x97, 0x3e, 0x81, 0x47, 0x66, 0x7d, 0x84, 0x62, 0xc9, 0x3c, 0x68, 0xb3, + 0x4e, 0x88, 0x79, 0x56, 0xa1, 0x34, 0xce, 0x86, 0x88, 0x8f, 0x3f, 0xa0, + 0x9f, 0xbf, 0x99, 0x70, 0x74, 0x54, 0xce, 0x39, 0x3e, 0x98, 0x34, 0x75, + 0x6a, 0xaa, 0x50, 0x43, 0x6e, 0x4e, 0x43, 0x5e, 0xae, 0x5a, 0x9f, 0x9c, + 0x4a, 0x90, 0x2f, 0xa2, 0x53, 0x34, 0x77, 0x81, 0xba, 0x98, 0xbb, 0xc2, + 0x38, 0x2e, 0xc5, 0x9e, 0xbb, 0xab, 0x45, 0x62, 0x99, 0x39, 0x79, 0x34, + 0x59, 0x37, 0x77, 0x7e, 0xa9, 0x5c, 0x82, 0x2c, 0x7e, 0x5b, 0x93, 0x4b, + 0xba, 0x69, 0xd2, 0x3f, 0xaa, 0x57, 0x55, 0x40, 0x84, 0x63, 0x4b, 0x62, + 0xa8, 0x86, 0xcc, 0x95, 0xac, 0x9d, 0xbe, 0xae, 0x94, 0x3a, 0x4e, 0x45, + 0xa4, 0xaa, 0xb1, 0xcc, 0x36, 0xb2, 0x99, 0xd3, 0x6d, 0xb5, 0x5a, 0xa1, + 0x5a, 0x71, 0x3e, 0x7c, 0x4c, 0xb0, 0xae, 0x3b, 0x40, 0x7e, 0x9c, 0x74, + 0x40, 0x87, 0x51, 0xd1, 0x56, 0x3a, 0x7b, 0xa6, 0x59, 0xa6, 0xc9, 0xc1, + 0xc7, 0x4a, 0x41, 0x8c, 0x37, 0x4f, 0x9c, 0xbf, 0xbb, 0x73, 0xa4, 0x3c, + 0xb8, 0x8c, 0xaf, 0x7c, 0x7b, 0x8b, 0x80, 0x35, 0x3e, 0xb7, 0xb0, 0xa2, + 0x83, 0x39, 0x82, 0x5b, 0xd4, 0x36, 0xbc, 0xbe, 0x3a, 0x61, 0x72, 0x60, + 0x97, 0x35, 0xb9, 0xb2, 0x49, 0x81, 0x3f, 0x62, 0x6e, 0x53, 0x8d, 0x72, + 0x89, 0xca, 0xbd, 0xca, 0x4b, 0x5b, 0x5b, 0x48, 0xb6, 0xbd, 0x85, 0x8f, + 0x6f, 0x93, 0x3f, 0xbc, 0x58, 0x8a, 0x7a, 0xce, 0x7c, 0xcc, 0x71, 0x8e, + 0xa2, 0x5e, 0x95, 0x72, 0x90, 0x60, 0x9d, 0x4c, 0x79, 0xcb, 0xc9, 0xc9, + 0xd3, 0x49, 0x34, 0x3a, 0xb5, 0xc8, 0xa1, 0xb0, 0xd5, 0xd4, 0x41, 0x33, + 0x51, 0x97, 0xa5, 0xa3, 0x96, 0x59, 0xb6, 0x83, 0xbf, 0xaf, 0x7d, 0x7e, + 0x5f, 0x99, 0xb1, 0xc2, 0x49, 0x41, 0x87, 0x9d, 0xb9, 0xa8, 0x8b, 0x58, + 0x48, 0xbd, 0x97, 0xbc, 0x7b, 0xa8, 0xbb, 0xc3, 0x42, 0x78, 0x69, 0x65, + 0x99, 0x57, 0x38, 0xce, 0x77, 0xce, 0x4c, 0xc3, 0xcf, 0xd1, 0x33, 0x65, + 0x9e, 0xa8, 0x46, 0x5c, 0xcb, 0x54, 0x6f, 0xb4, 0x5b, 0xce, 0x56, 0x7c, + 
0x51, 0x55, 0x93, 0x58, 0x3e, 0xa2, 0x38, 0xad, 0x79, 0x87, 0xb2, 0x73, + 0x8d, 0xaa, 0xcc, 0x4e, 0xb8, 0x74, 0x49, 0xae, 0x77, 0xc6, 0x7e, 0x3c, + 0x5b, 0x8c, 0x7a, 0x93, 0x3a, 0x90, 0x53, 0x7c, 0x3d, 0x7d, 0x65, 0x71, + 0x95, 0xc5, 0xad, 0xd2, 0xac, 0x53, 0xb1, 0xb2, 0xa3, 0x67, 0x83, 0x53, + 0x9a, 0x9a, 0xa5, 0x69, 0xb1, 0xc1, 0x30, 0x95, 0xd4, 0xa9, 0xbf, 0xb9, + 0x9f, 0x31, 0x73, 0x4c, 0x4e, 0x6b, 0x49, 0x9d, 0xbf, 0x46, 0x65, 0x3e, + 0xcd, 0x63, 0x5d, 0x60, 0xc6, 0x49, 0xce, 0xc0, 0x7f, 0x48, 0x5d, 0x63, + 0x52, 0x5d, 0x9d, 0x85, 0x57, 0xb6, 0x71, 0xba, 0xc0, 0x90, 0x4c, 0x79, + 0x4b, 0x74, 0x8a, 0xd3, 0x51, 0xa8, 0x72, 0xa3, 0x9b, 0x66, 0x47, 0x49, + 0xca, 0xa6, 0x97, 0x5a, 0xa6, 0x9e, 0x37, 0x45, 0xa4, 0xc0, 0x8d, 0x7e, + 0x6c, 0x48, 0x3d, 0xbb, 0x7c, 0x78, 0xc5, 0x6b, 0x6c, 0xab, 0x33, 0x89, + 0x5d, 0x59, 0x51, 0x9b, 0x88, 0x34, 0x88, 0x57, 0x75, 0x66, 0x86, 0x3f, + 0xa2, 0xc3, 0xaa, 0xad, 0xbe, 0xa8, 0x6c, 0x57, 0x99, 0x56, 0x32, 0x81, + 0x43, 0x63, 0x48, 0x5d, 0x6e, 0xaa, 0xad, 0xc8, 0x90, 0x7a, 0x9c, 0x43, + 0x3b, 0x91, 0xb3, 0x31, 0x79, 0xb7, 0x61, 0x6b, 0x38, 0x6b, 0x87, 0x74, + 0x54, 0x87, 0x41, 0x62, 0x76, 0x75, 0x81, 0x4e, 0xa4, 0xba, 0x4b, 0x8e, + 0x49, 0x73, 0x39, 0x40, 0x7b, 0xaa, 0xca, 0x4f, 0x90, 0x76, 0x5e, 0x74, + 0x79, 0x99, 0x8d, 0x97, 0xa0, 0x66, 0x7f, 0xba, 0xc0, 0xc1, 0x50, 0x6f, + 0x7b, 0xbe, 0xd1, 0x97, 0x9f, 0x86, 0x3c, 0xc1, 0xca, 0x63, 0x44, 0x93, + 0xbb, 0x99, 0x64, 0x58, 0x35, 0x83, 0x35, 0x46, 0x66, 0x4a, 0xd0, 0x69, + 0x94, 0x5e, 0x85, 0x8d, 0x55, 0x43, 0xb1, 0xbe, 0x39, 0xb9, 0x4e, 0x3d, + 0x63, 0x7b, 0xc2, 0x66, 0xb6, 0xc3, 0x97, 0x8b, 0x44, 0x4f, 0x7d, 0xca, + 0x59, 0x87, 0x7d, 0xa8, 0x5c, 0x5f, 0x54, 0x9c, 0x73, 0x41, 0x45, 0xd2, + 0x9f, 0xbc, 0xa0, 0x98, 0xc0, 0x3a, 0xc2, 0x4d, 0x64, 0x4c, 0x30, 0x67, + 0x59, 0xc8, 0x56, 0xd2, 0x8c, 0x85, 0x84, 0xa4, 0x6c, 0x42, 0x72, 0x97, + 0x86, 0x74, 0x64, 0x53, 0x3f, 0x81, 0x40, 0xa9, 0xd0, 0xb4, 0x87, 0x6c, + 0x5d, 0x56, 0x86, 0x7f, 0x4c, 0x52, 0xb7, 0x94, 0x48, 0x62, 0x46, 0x39, + 0xc7, 0x3c, 0x62, 0x6a, 0x3b, 0x85, 0x4c, 0x83, 0x71, 0x6f, 0x38, 0x39, + 0x93, 0xcf, 0x74, 0xa0, 0x51, 0x9d, 0x33, 0x72, 0xb7, 0xbf, 0x43, 0xb2, + 0xc9, 0xd1, 0xcf, 0xb3, 0x3d, 0x7d, 0x3a, 0x36, 0xb0, 0xbc, 0xc0, 0x87, + 0x37, 0x56, 0xbb, 0x86, 0xaa, 0x9d, 0x35, 0xcf, 0xc5, 0x47, 0x6e, 0xa3, + 0xb9, 0xa0, 0x96, 0xc9, 0x8d, 0x89, 0x84, 0x8c, 0xc4, 0x52, 0x73, 0x9f, + 0xa3, 0x40, 0x52, 0xbd, 0xc5, 0x52, 0xa7, 0x48, 0x37, 0x9a, 0x60, 0xb4, + 0xc5, 0xd1, 0x3e, 0x39, 0xc8, 0xa1, 0x8c, 0x9e, 0x64, 0x5f, 0x44, 0x72, + 0x5d, 0x49, 0x3f, 0xa7, 0x74, 0x68, 0x4d, 0xaf, 0x69, 0x37, 0x38, 0x7b, + 0xcb, 0x6e, 0x6e, 0x50, 0x6c, 0xbc, 0x4f, 0xb8, 0xb0, 0x8f, 0xd4, 0x9c, + 0xb5, 0x87, 0x8f, 0xcb, 0x84, 0x51, 0x47, 0x4c, 0x2f, 0xa0, 0x54, 0x73, + 0x6e, 0x52, 0xbb, 0x53, 0xcf, 0xd5, 0x51, 0xae, 0x5b, 0x57, 0x6b, 0x45, + 0x5b, 0x98, 0x90, 0x70, 0x41, 0x5a, 0x3c, 0xd3, 0x56, 0xbf, 0x5a, 0x94, + 0x69, 0x78, 0x91, 0x8e, 0x70, 0x97, 0xbc, 0xaf, 0x4e, 0xaa, 0xa5, 0x89, + 0x5f, 0x80, 0x3c, 0xb3, 0x41, 0x9d, 0x5d, 0x7b, 0xb4, 0x54, 0x3a, 0xbb, + 0x77, 0x42, 0x4d, 0xc1, 0xb4, 0x53, 0x4a, 0xbd, 0xa7, 0xad, 0x60, 0x48, + 0x2f, 0x8b, 0x7f, 0x4f, 0x6f, 0x35, 0xb7, 0x47, 0x8b, 0xbd, 0xa3, 0xb1, + 0xc2, 0x71, 0x71, 0x7a, 0x6f, 0x6e, 0x65, 0x97, 0x83, 0x51, 0xaf, 0x72, + 0xa5, 0x5c, 0x5c, 0xc2, 0x93, 0x46, 0xbe, 0x9f, 0xa0, 0x6b, 0xb2, 0x63, + 0x5f, 0x67, 0xad, 0x8f, 0x8d, 0xb3, 0xd2, 0x49, 0x7b, 0xb0, 0x6c, 0x6a, + 0x3f, 0x85, 0xcf, 0x7a, 0x9d, 0x3f, 0x94, 0x4d, 0x6e, 0xac, 0x8b, 0x31, + 0xb8, 0xce, 0x2d, 0x70, 0xbf, 0x66, 0x96, 0xb1, 0x40, 0xc8, 0x9d, 0x5c, + 
0x63, 0xb8, 0xb8, 0xa3, 0xc9, 0x80, 0xc3, 0x96, 0xa3, 0xc4, 0x7d, 0x4d, + 0x59, 0x38, 0x38, 0x38, 0x7b, 0x2f, 0x42, 0xab, 0x35, 0x3a, 0x36, 0xb5, + 0x96, 0x8a, 0x3d, 0xc5, 0x6d, 0x53, 0x3b, 0x72, 0x76, 0x46, 0x3b, 0x63, + 0xb3, 0x71, 0x8e, 0x9e, 0x7e, 0x6a, 0x42, 0x66, 0x84, 0x4a, 0x8d, 0xa1, + 0x35, 0x77, 0x54, 0x3d, 0xc8, 0xab, 0x99, 0x9b, 0x57, 0xbd, 0x8d, 0x87, + 0xbf, 0x83, 0x36, 0xb8, 0x93, 0x6e, 0x54, 0xbd, 0x62, 0x82, 0x41, 0x31, + 0x5a, 0xd2, 0xa1, 0xbc, 0xb6, 0x6f, 0xcb, 0x50, 0x7e, 0x6e, 0x45, 0xba, + 0x99, 0xaa, 0x58, 0x38, 0x8d, 0xc2, 0x34, 0xb0, 0x33, 0xa9, 0xba, 0x69, + 0x66, 0xc8, 0xb8, 0x91, 0xa6, 0xae, 0x68, 0xa8, 0x93, 0xa0, 0x5d, 0xa5, + 0x35, 0xd6, 0x95, 0xab, 0xcf, 0x41, 0x50, 0x66, 0x7b, 0xca, 0x4d, 0x61, + 0x64, 0x37, 0x30, 0xc7, 0xad, 0x6b, 0x8d, 0x3a, 0xc3, 0x3f, 0x57, 0x69, + 0x66, 0x69, 0x82, 0x91, 0x53, 0xc6, 0x4c, 0x7f, 0xa9, 0x4e, 0x44, 0x5c, + 0xcb, 0x8e, 0xc8, 0x5e, 0xbd, 0xc3, 0xbb, 0x68, 0x90, 0x97, 0xb1, 0x48, + 0xb4, 0xb0, 0xb9, 0x3a, 0x51, 0x52, 0x76, 0x99, 0x3e, 0x38, 0x94, 0x48, + 0xcd, 0xa5, 0xb2, 0x69, 0xdb, 0x9a, 0xba, 0x68, 0x44, 0xb4, 0xbd, 0x52, + 0xb3, 0xcb, 0x35, 0x86, 0xaf, 0x36, 0xc7, 0x9a, 0x92, 0x71, 0xc3, 0xac, + 0x36, 0x8a, 0x54, 0x6d, 0x49, 0xb9, 0x9c, 0x7e, 0x4e, 0x6d, 0x6b, 0x31, + 0x3b, 0x52, 0xbd, 0xc8, 0xa6, 0x8d, 0xa4, 0x44, 0x42, 0x9d, 0x8f, 0x8e, + 0xb8, 0x75, 0x45, 0x9c, 0x8d, 0x4e, 0xa3, 0x94, 0xa8, 0x33, 0x8b, 0x76, + 0xaa, 0xae, 0x43, 0x91, 0x49, 0x59, 0x5c, 0x45, 0x6f, 0x48, 0x95, 0x76, + 0x9a, 0xc9, 0x76, 0xad, 0xb6, 0xc1, 0x7f, 0xbf, 0x70, 0x72, 0x6a, 0x98, + 0xb4, 0x50, 0x45, 0x8c, 0x97, 0x34, 0xc5, 0x6b, 0x52, 0x91, 0x55, 0x70, + 0xcc, 0x4b, 0xc3, 0xa0, 0x5a, 0x66, 0x84, 0xc7, 0xc5, 0x53, 0x75, 0xc4, + 0xb0, 0x36, 0x88, 0x58, 0xbd, 0x39, 0x39, 0xba, 0xbb, 0x4c, 0x38, 0x9b, + 0x73, 0x67, 0xbf, 0x4b, 0xbd, 0x84, 0x38, 0xb5, 0x33, 0x4b, 0x4c, 0xc0, + 0x92, 0x92, 0x9b, 0xc1, 0x88, 0xb3, 0x53, 0x92, 0x77, 0x9d, 0xb7, 0x83, + 0x4e, 0x32, 0xac, 0xc8, 0x55, 0x46, 0x78, 0xcf, 0x38, 0x54, 0x9b, 0x34, + 0xcd, 0x48, 0xa9, 0x45, 0x8e, 0x81, 0x9b, 0x2f, 0x6d, 0x89, 0x98, 0x3a, + 0x3b, 0x70, 0x9f, 0x76, 0x6f, 0x93, 0x82, 0x3c, 0xcd, 0xd3, 0x45, 0x67, + 0x50, 0xc4, 0x46, 0x93, 0x6b, 0x84, 0xcf, 0x92, 0xb1, 0xa8, 0x35, 0x54, + 0xa4, 0x5a, 0x9f, 0x7c, 0x65, 0xb1, 0x8e, 0x62, 0x8e, 0xb8, 0xd6, 0xcf, + 0x9d, 0x5b, 0x82, 0x77, 0x64, 0xcc, 0x43, 0x71, 0x3c, 0xa4, 0x6b, 0x32, + 0xcf, 0xbe, 0x6f, 0xa3, 0x88, 0x5d, 0x34, 0x88, 0x70, 0x53, 0x5f, 0x4e, + 0x61, 0x8e, 0xbb, 0x84, 0x71, 0x9f, 0x6a, 0x65, 0xbf, 0x8b, 0xa9, 0x77, + 0xcb, 0x65, 0x6f, 0x34, 0x8a, 0xb1, 0x63, 0x9b, 0xbd, 0x4a, 0x63, 0x9b, + 0x56, 0x43, 0xba, 0xa7, 0x31, 0x39, 0x54, 0xb9, 0x7d, 0xd1, 0x8a, 0xc6, + 0x83, 0x6a, 0x9c, 0x8c, 0x84, 0x5a, 0xa8, 0x6c, 0x8d, 0x4c, 0xa6, 0x50, + 0x48, 0xba, 0x8b, 0x59, 0xb4, 0xc3, 0xa4, 0xbd, 0xb0, 0xb2, 0x70, 0x63, + 0x6e, 0x35, 0xcd, 0x7a, 0x7f, 0xc7, 0x9b, 0xbc, 0x4e, 0xb2, 0x79, 0x58, + 0xc3, 0x73, 0x7d, 0x37, 0xd6, 0x68, 0x60, 0xcc, 0xad, 0xba, 0x73, 0xa1, + 0x4d, 0x3e, 0x85, 0x4f, 0xac, 0x7c, 0x60, 0x4d, 0x37, 0xaa, 0x85, 0x35, + 0x4a, 0x76, 0xad, 0x46, 0x6d, 0x33, 0x93, 0x91, 0x84, 0xc2, 0xb3, 0x4c, + 0xbf, 0x43, 0x60, 0xaf, 0x30, 0x93, 0x4f, 0x82, 0x68, 0xcb, 0x77, 0x78, + 0x6f, 0x3e, 0x71, 0x79, 0x65, 0xa9, 0x83, 0x72, 0xba, 0x60, 0x69, 0xb6, + 0x33, 0x3f, 0x81, 0x2c, 0x51, 0x8f, 0x45, 0x4e, 0x39, 0xb7, 0x71, 0xb7, + 0x42, 0x85, 0xad, 0x91, 0x6c, 0x3a, 0x8b, 0x62, 0xb2, 0xcf, 0x55, 0xc6, + 0xd1, 0x7a, 0x8a, 0x8a, 0x65, 0x44, 0x33, 0x3b, 0xc5, 0x97, 0x3a, 0x93, + 0x66, 0x35, 0xbe, 0xaa, 0x69, 0x3c, 0xc5, 0xa3, 0x6e, 0x99, 0x7f, 0xd5, + 
0x9b, 0x5a, 0x38, 0x75, 0xb4, 0xaf, 0x52, 0x81, 0xbe, 0x38, 0x69, 0x46, + 0x59, 0x6d, 0x8e, 0x85, 0x84, 0xcb, 0x4c, 0x80, 0x5a, 0x4b, 0xbc, 0x6c, + 0x70, 0x66, 0x6a, 0x95, 0x56, 0xbe, 0x9c, 0xb3, 0x44, 0xcc, 0x77, 0xc5, + 0xae, 0xc6, 0x46, 0x89, 0x60, 0xaa, 0x55, 0xc9, 0x56, 0x6f, 0x9f, 0x79, + 0x71, 0x70, 0x6c, 0x9d, 0x6b, 0xbe, 0x9a, 0xcd, 0xa4, 0x95, 0xcd, 0xa7, + 0x3c, 0x67, 0x3b, 0x91, 0x7f, 0x71, 0x4b, 0xae, 0x61, 0x4e, 0x5d, 0x86, + 0x6f, 0xc7, 0xb3, 0x52, 0x4e, 0x43, 0x92, 0x7b, 0x41, 0x7b, 0x31, 0x4d, + 0x9d, 0x62, 0x41, 0x44, 0x51, 0xdb, 0x56, 0x7f, 0xa4, 0xa0, 0xb5, 0x51, + 0x75, 0xb2, 0x40, 0x5b, 0xd7, 0x95, 0xd0, 0xb1, 0xb4, 0xab, 0x3c, 0xc0, + 0xbe, 0xb8, 0xd5, 0x64, 0xbf, 0x6c, 0xa5, 0xbc, 0x87, 0x57, 0x3e, 0x92, + 0xb1, 0xcd, 0xa7, 0x35, 0xa9, 0x30, 0xc1, 0xbd, 0x71, 0x77, 0x78, 0xb0, + 0x81, 0x7f, 0xc9, 0x8e, 0xad, 0x89, 0xbd, 0xc5, 0xaa, 0x31, 0x70, 0x80, + 0x3b, 0x49, 0x3b, 0xd9, 0x41, 0x4d, 0xc6, 0x9a, 0x79, 0x62, 0x77, 0x5b, + 0xa7, 0x74, 0xad, 0x7e, 0x8c, 0x37, 0xd3, 0x95, 0x96, 0x65, 0x70, 0x52, + 0xb3, 0x58, 0x31, 0x37, 0x50, 0x84, 0xbb, 0xc1, 0xa1, 0x9a, 0xa3, 0x59, + 0x43, 0x90, 0x9d, 0x65, 0x7b, 0xb7, 0xdd, 0x5a, 0xd6, 0x32, 0xa8, 0x44, + 0x79, 0x1f, 0x39, 0x44, 0xd9, 0xc0, 0x43, 0x82, 0x6e, 0x6c, 0x91, 0xb7, + 0x90, 0x62, 0xa6, 0x48, 0x6f, 0xba, 0xd5, 0x53, 0x4c, 0x70, 0x96, 0xa1, + 0x65, 0xaa, 0x76, 0xb3, 0x7a, 0x6a, 0xd8, 0x33, 0x50, 0xc8, 0xc5, 0x9d, + 0x53, 0xd1, 0x30, 0x40, 0x32, 0xc5, 0x2c, 0xaf, 0xc2, 0x5f, 0x55, 0xa6, + 0x32, 0x92, 0xc9, 0x44, 0x94, 0xa3, 0x3d, 0xbf, 0xa3, 0xa2, 0x3f, 0x42, + 0xbd, 0xb0, 0x90, 0x3f, 0xab, 0x7a, 0x8a, 0xc8, 0xb8, 0x96, 0x7c, 0x53, + 0x4f, 0xb6, 0x8c, 0x7a, 0xd9, 0xa5, 0x5a, 0x3d, 0x7c, 0x30, 0x9b, 0x91, + 0x53, 0x7b, 0x6e, 0x94, 0xb1, 0x7d, 0x41, 0x8d, 0xcb, 0x7b, 0x60, 0xd3, + 0xd5, 0x5c, 0x82, 0x37, 0x7d, 0xa7, 0xb0, 0xc2, 0xc8, 0x52, 0x3f, 0x30, + 0x7a, 0xca, 0xa6, 0x70, 0xc7, 0xa3, 0xc8, 0x6c, 0x75, 0x7a, 0xad, 0x31, + 0x47, 0x38, 0x4c, 0x83, 0xc8, 0xcb, 0xa3, 0x91, 0xab, 0x43, 0xbb, 0x55, + 0xc1, 0x3e, 0x3d, 0xc4, 0x57, 0x4b, 0x3d, 0xa7, 0xcc, 0x4a, 0x47, 0xab, + 0x97, 0x43, 0xbf, 0xb4, 0xc8, 0x73, 0xac, 0xdb, 0x41, 0x5b, 0xa0, 0x7b, + 0x4a, 0xbc, 0xb4, 0x5b, 0x67, 0x43, 0x3d, 0x58, 0xa0, 0x88, 0x85, 0x4e, + 0x50, 0x8d, 0x5f, 0xc6, 0x80, 0xbe, 0xbf, 0xd2, 0x68, 0x3e, 0x86, 0xbf, + 0x7a, 0x74, 0x9b, 0x4c, 0x7b, 0x2b, 0x92, 0x5d, 0xc3, 0xd8, 0x40, 0x27, + 0xa2, 0x91, 0x8b, 0xaf, 0xba, 0x5c, 0x93, 0x85, 0x7a, 0xac, 0x92, 0xc4, + 0x35, 0x36, 0xc5, 0x8a, 0x3c, 0x69, 0x5e, 0xa3, 0x7a, 0x9f, 0xa5, 0x60, + 0xb4, 0x8f, 0x5a, 0x9f, 0x72, 0xca, 0x9d, 0x9d, 0x3f, 0x6a, 0xbd, 0x46, + 0x59, 0xb2, 0x5a, 0x98, 0x64, 0x5b, 0x96, 0xac, 0xb4, 0xc4, 0x82, 0x88, + 0x4e, 0xcd, 0x58, 0xb3, 0x69, 0x46, 0x7d, 0xb8, 0x58, 0x7f, 0xac, 0x72, + 0x89, 0x42, 0x49, 0xa5, 0xa8, 0x90, 0x67, 0xa3, 0x48, 0x97, 0x55, 0x88, + 0xb1, 0x7c, 0x2c, 0x6d, 0x63, 0x49, 0x80, 0x4a, 0xa3, 0xce, 0xc7, 0x98, + 0x91, 0x43, 0xa5, 0xbb, 0xaa, 0xb7, 0xcd, 0x35, 0xce, 0xa4, 0x81, 0x3a, + 0xd6, 0xb7, 0xaf, 0x59, 0xd8, 0x7f, 0x4f, 0x5f, 0xcc, 0x36, 0x2d, 0x4d, + 0x6a, 0x98, 0xcc, 0x80, 0x81, 0xc2, 0x48, 0x84, 0xd1, 0x3f, 0x8f, 0x94, + 0xd6, 0x6e, 0x39, 0x45, 0x82, 0x42, 0x55, 0xbb, 0x89, 0x4a, 0xae, 0xc6, + 0xb6, 0xcc, 0x5d, 0xaf, 0x8b, 0xd0, 0x9e, 0x5d, 0xd1, 0xcc, 0xa0, 0x3b, + 0xc6, 0xbf, 0x3c, 0x7d, 0xce, 0x9b, 0x37, 0x9d, 0xbc, 0xa6, 0x74, 0x40, + 0xb2, 0xb2, 0x58, 0xb0, 0x69, 0x5d, 0x59, 0x5d, 0xc7, 0x5f, 0xbf, 0x35, + 0x44, 0xbf, 0x39, 0x2e, 0xa9, 0xb2, 0x5e, 0xc6, 0xa7, 0xad, 0x5f, 0xc4, + 0x62, 0xb3, 0x3c, 0x4a, 0x8e, 0x43, 0x72, 0x98, 0x4c, 0x40, 0x6b, 0x3c, + 
0x4e, 0x48, 0xb1, 0x67, 0x74, 0x62, 0xa7, 0x3f, 0x71, 0x7e, 0x81, 0xc1, + 0x41, 0xa8, 0xb1, 0x96, 0xb4, 0xd6, 0x6b, 0x93, 0xaa, 0x43, 0x53, 0xb0, + 0x81, 0x3f, 0x7b, 0x70, 0x5f, 0x71, 0xcf, 0x3f, 0xc6, 0x7d, 0xc6, 0xd3, + 0x76, 0x8f, 0x74, 0x73, 0x69, 0xc5, 0x50, 0xa5, 0xc1, 0x8b, 0x9a, 0x6b, + 0x2f, 0xc5, 0xc6, 0x7f, 0xc7, 0xcd, 0x66, 0x59, 0xb2, 0x62, 0x90, 0x39, + 0x67, 0x5b, 0x3e, 0xc8, 0xa0, 0xa4, 0xbf, 0xc9, 0x66, 0x4c, 0xaa, 0xd7, + 0xab, 0xc3, 0x9d, 0x70, 0x53, 0x6a, 0x58, 0x89, 0x83, 0x4c, 0x5e, 0xd6, + 0xd7, 0xc8, 0x92, 0x9d, 0xd1, 0x91, 0x9a, 0xc1, 0x2d, 0x40, 0x38, 0x36, + 0x54, 0xbc, 0x6b, 0x79, 0x31, 0x37, 0xd6, 0x63, 0x74, 0xbe, 0x8d, 0xa1, + 0xbf, 0x86, 0x5f, 0x5d, 0x79, 0x83, 0x9d, 0x8f, 0x70, 0x56, 0x63, 0x36, + 0x53, 0x42, 0x6c, 0x82, 0x3b, 0x9b, 0x84, 0xcb, 0x9d, 0xce, 0x79, 0x5b, + 0x7f, 0x69, 0x96, 0xd1, 0x82, 0x3b, 0x78, 0x6e, 0xd4, 0x79, 0x65, 0xb0, + 0x67, 0x5c, 0x62, 0x45, 0x4c, 0x51, 0x37, 0xbf, 0xa3, 0xcd, 0x49, 0xb5, + 0x4b, 0xb7, 0x9e, 0x83, 0x4a, 0x91, 0x4c, 0xca, 0x6f, 0xce, 0xbc, 0x80, + 0x64, 0x38, 0x35, 0x9c, 0x3b, 0x81, 0xcb, 0xd5, 0x99, 0x7a, 0xa6, 0xa0, + 0xc1, 0x5f, 0x80, 0x60, 0x70, 0xb0, 0x5c, 0xc7, 0x6a, 0xc4, 0x59, 0x38, + 0xcf, 0xac, 0x7b, 0x91, 0x83, 0x91, 0x5e, 0x9f, 0x84, 0x63, 0xc1, 0x8c, + 0x70, 0x7c, 0x61, 0x8f, 0x80, 0x5d, 0x80, 0x51, 0x6b, 0x4a, 0xb5, 0xd3, + 0xe0, 0x69, 0x72, 0xd9, 0xb4, 0x7c, 0x60, 0x7a, 0x58, 0x72, 0x3c, 0xdf, + 0x97, 0x49, 0x98, 0x51, 0x76, 0x90, 0x9d, 0xbb, 0x57, 0x99, 0x83, 0x4a, + 0x35, 0xbe, 0x9a, 0xb6, 0xb5, 0x51, 0x64, 0x53, 0x8d, 0xce, 0x7a, 0xb5, + 0x9c, 0x45, 0xac, 0x5e, 0xd2, 0x90, 0x84, 0x37, 0xa5, 0x99, 0x50, 0xba, + 0xc5, 0x67, 0x3e, 0xd2, 0x9e, 0x3f, 0x38, 0x65, 0xa2, 0xb5, 0x76, 0xb3, + 0x83, 0x4d, 0x52, 0x72, 0x99, 0xcb, 0x72, 0xd6, 0x69, 0x40, 0x54, 0x60, + 0x32, 0x80, 0xdd, 0xae, 0xa8, 0x75, 0x42, 0x81, 0xc3, 0x84, 0xc5, 0x73, + 0x79, 0x47, 0xae, 0x83, 0xa5, 0xb6, 0x31, 0x95, 0x31, 0x5e, 0x54, 0x7c, + 0x57, 0x5e, 0x54, 0xac, 0xc3, 0x76, 0x99, 0x87, 0x58, 0xc6, 0x9f, 0x80, + 0x9a, 0xa8, 0xa0, 0x8c, 0xd0, 0x65, 0x8c, 0xb4, 0x70, 0x5f, 0xca, 0xb3, + 0xbc, 0x7c, 0xca, 0x6b, 0x35, 0x91, 0xcd, 0x74, 0x53, 0x97, 0x36, 0x65, + 0x84, 0x5d, 0x70, 0x7a, 0x9c, 0xbe, 0x4d, 0x80, 0xd4, 0xb3, 0x3f, 0xcd, + 0xb8, 0x70, 0x4b, 0x96, 0x7c, 0x72, 0x87, 0xbe, 0xa8, 0xa1, 0xb2, 0xce, + 0x91, 0xba, 0x2e, 0xc0, 0x88, 0x5e, 0x75, 0x6c, 0xc7, 0xb9, 0xb4, 0xa2, + 0x94, 0x72, 0x70, 0x8b, 0xb5, 0x8f, 0xc5, 0xbe, 0x67, 0x9c, 0x77, 0xd1, + 0x78, 0x53, 0x81, 0xc2, 0xd2, 0x5e, 0x36, 0x51, 0x3a, 0xa7, 0x8a, 0xb4, + 0xca, 0xe5, 0x46, 0xbf, 0x54, 0x77, 0x38, 0x69, 0x3b, 0x32, 0x3b, 0xa1, + 0x41, 0x78, 0x87, 0x49, 0x78, 0x8f, 0x5d, 0x31, 0x6b, 0xb4, 0x9b, 0x60, + 0xb7, 0x5e, 0x96, 0xd6, 0x60, 0x50, 0xc6, 0x4a, 0x52, 0x94, 0x51, 0xa0, + 0x3f, 0x88, 0x57, 0x3a, 0xb6, 0xd0, 0x7d, 0x44, 0x76, 0x89, 0xd1, 0x8a, + 0x3f, 0x6e, 0x5c, 0xc9, 0x94, 0xb1, 0x7f, 0xc4, 0x68, 0x78, 0x72, 0x6b, + 0xce, 0x44, 0x42, 0x93, 0x92, 0xa2, 0x3d, 0x44, 0xa8, 0x71, 0x8c, 0x81, + 0x76, 0x7c, 0x74, 0x6e, 0xab, 0x5c, 0xad, 0xa3, 0xa3, 0x37, 0x62, 0xe5, + 0x4d, 0x7d, 0xb6, 0x9a, 0x7e, 0x95, 0xa3, 0x65, 0x55, 0xc1, 0x91, 0x3b, + 0x58, 0x86, 0x76, 0xcb, 0xc4, 0x48, 0x77, 0x60, 0x7b, 0x7e, 0x68, 0x73, + 0xc5, 0x46, 0x91, 0xc7, 0x95, 0xbc, 0x9d, 0xd4, 0xa9, 0xce, 0x41, 0x71, + 0xa4, 0x4b, 0x65, 0x59, 0x88, 0x57, 0x74, 0x60, 0x92, 0x6f, 0xd0, 0x54, + 0x40, 0x5a, 0x85, 0x64, 0x70, 0xcd, 0xa0, 0xba, 0xa8, 0x64, 0xaf, 0x7f, + 0x7d, 0x9e, 0x74, 0x62, 0xbf, 0x83, 0x56, 0x5f, 0xc0, 0xcb, 0x7f, 0xa7, + 0x99, 0x81, 0xca, 0xc2, 0xa9, 0x80, 0xaf, 0x95, 0x8a, 0x3b, 0xcd, 0x9e, + 
0x36, 0x66, 0xa4, 0xad, 0xbf, 0x45, 0x49, 0x3c, 0x47, 0x2b, 0x75, 0x7a, + 0x4b, 0xb9, 0xca, 0x8f, 0xab, 0x26, 0x72, 0xaf, 0x74, 0x83, 0x9a, 0x40, + 0x75, 0x83, 0x89, 0x91, 0xeb, 0x9a, 0xa8, 0xc5, 0xb6, 0xeb, 0xbf, 0xb6, + 0xd6, 0xa1, 0x68, 0x98, 0x7e, 0x3a, 0xa1, 0x6b, 0x73, 0x9b, 0xbf, 0x6d, + 0x9b, 0x31, 0x3b, 0x5e, 0x42, 0xc2, 0x3c, 0x43, 0x3b, 0x62, 0xaf, 0x64, + 0x9b, 0xad, 0x90, 0xca, 0x40, 0x49, 0xc5, 0x60, 0x6b, 0x38, 0x8d, 0xc6, + 0xd2, 0xa4, 0x4c, 0x4b, 0x64, 0x82, 0xae, 0x77, 0x90, 0xd3, 0xb4, 0x2b, + 0x37, 0xc1, 0xa1, 0x85, 0xb8, 0x83, 0x97, 0x5a, 0xd3, 0x3c, 0x7f, 0x75, + 0x67, 0xbb, 0xa2, 0x82, 0xb5, 0x93, 0x87, 0x84, 0x77, 0xb9, 0x5a, 0xb0, + 0x24, 0xbb, 0x69, 0x5d, 0x6a, 0x78, 0xbd, 0x50, 0x63, 0xbf, 0x9e, 0x8f, + 0x73, 0xa3, 0x4a, 0xac, 0x4f, 0x46, 0xb8, 0x49, 0xb0, 0xb7, 0x27, 0xb0, + 0x96, 0x88, 0xb5, 0x93, 0x9c, 0x45, 0x83, 0x38, 0x66, 0x60, 0x54, 0x47, + 0x8d, 0x8b, 0xb4, 0x78, 0x7a, 0x4d, 0x70, 0x96, 0x70, 0xd1, 0x45, 0xb6, + 0x99, 0x40, 0x54, 0xc1, 0xa8, 0x41, 0x42, 0xcb, 0x6c, 0x80, 0xd7, 0x43, + 0x88, 0x54, 0xb0, 0x91, 0x47, 0x3d, 0xb0, 0x62, 0x46, 0x90, 0x4f, 0x72, + 0x5c, 0xa6, 0x94, 0x4f, 0xb5, 0xd9, 0xd1, 0xc9, 0x44, 0x72, 0xbe, 0x92, + 0x8a, 0x43, 0x78, 0xbe, 0xcc, 0x84, 0x65, 0x91, 0x56, 0xa1, 0xba, 0x39, + 0x92, 0x2d, 0x64, 0xbe, 0x25, 0x22, 0x92, 0xad, 0x4d, 0x7b, 0xd4, 0x90, + 0xc9, 0x7f, 0x8f, 0xb7, 0xcc, 0x82, 0x2d, 0x46, 0xb7, 0xc5, 0x92, 0xcb, + 0x35, 0x81, 0x9e, 0x70, 0x87, 0x4d, 0xcb, 0x86, 0x56, 0x6e, 0x66, 0x68, + 0x79, 0x2f, 0x66, 0x2e, 0xaf, 0x70, 0x4f, 0xc5, 0x7e, 0xba, 0xa6, 0x3d, + 0x3a, 0x69, 0xca, 0x72, 0x4d, 0xaf, 0x82, 0x66, 0xc1, 0x60, 0x49, 0xa8, + 0x53, 0xbc, 0x5a, 0x6a, 0x75, 0x94, 0xbd, 0x4b, 0xbf, 0x83, 0x54, 0x53, + 0x6f, 0xa6, 0xac, 0x3c, 0x94, 0x3c, 0xda, 0x9b, 0x85, 0x9d, 0x90, 0x58, + 0x28, 0x94, 0x8d, 0x85, 0x62, 0xbb, 0xc4, 0x29, 0x4b, 0xe2, 0x71, 0x9e, + 0x8a, 0x29, 0x42, 0x92, 0x50, 0x9c, 0x64, 0x94, 0x5d, 0x63, 0x8e, 0x90, + 0x56, 0xb8, 0xa3, 0x5b, 0x5c, 0x29, 0xae, 0x71, 0x5e, 0x80, 0x7f, 0xc2, + 0xcd, 0x81, 0x32, 0x73, 0xac, 0xbe, 0x47, 0x9f, 0xa4, 0xce, 0xcf, 0x58, + 0xcb, 0x91, 0xbc, 0xc7, 0x53, 0x79, 0x3b, 0x9a, 0xcb, 0x66, 0x61, 0x33, + 0x32, 0x4b, 0x68, 0x4b, 0xb0, 0xa8, 0x7f, 0xcd, 0xa5, 0x53, 0x82, 0x40, + 0xb7, 0x3d, 0x98, 0x70, 0xa7, 0x67, 0x7e, 0xc2, 0xa4, 0xbe, 0x54, 0x8e, + 0x3d, 0xc1, 0x6c, 0xb3, 0xc1, 0x88, 0xd1, 0x38, 0xc5, 0x9b, 0x59, 0xb9, + 0x91, 0x63, 0x96, 0x42, 0x56, 0x84, 0x60, 0xa8, 0x3e, 0x2e, 0x91, 0x76, + 0x2e, 0x96, 0xcc, 0x58, 0x33, 0xaf, 0x42, 0x83, 0x6f, 0xa2, 0x98, 0xb7, + 0x8d, 0x59, 0xc0, 0xc6, 0x96, 0xbc, 0xaa, 0x6a, 0x39, 0x74, 0x89, 0x65, + 0xa0, 0xcc, 0x9b, 0xa5, 0x9f, 0x7b, 0x7e, 0x4d, 0x5c, 0xac, 0x9e, 0x46, + 0x91, 0x69, 0xd7, 0x71, 0x9f, 0x80, 0xd2, 0xcf, 0x50, 0x2f, 0x79, 0xc3, + 0xb0, 0x48, 0xca, 0xc9, 0x23, 0x6e, 0x98, 0x55, 0xb2, 0x42, 0xbd, 0x63, + 0xab, 0x50, 0xaf, 0xa3, 0x78, 0xb3, 0x7f, 0x35, 0x64, 0xd4, 0xd3, 0x65, + 0x53, 0xb7, 0x8d, 0x56, 0xc6, 0xaa, 0x9d, 0xb7, 0x9b, 0x65, 0x8e, 0xe3, + 0xce, 0xa1, 0xad, 0xa4, 0x55, 0xd7, 0x51, 0xac, 0x59, 0x5b, 0x3a, 0x93, + 0x7a, 0x9f, 0xcd, 0x74, 0x69, 0x88, 0x89, 0x5c, 0xb2, 0xce, 0x7f, 0xb5, + 0xc2, 0x59, 0x44, 0x6b, 0x74, 0x81, 0x5c, 0x97, 0x69, 0xc1, 0x63, 0x45, + 0x77, 0x8a, 0xbe, 0x5a, 0x3a, 0x92, 0x4e, 0xa1, 0x64, 0x65, 0x76, 0x64, + 0x84, 0x5a, 0x89, 0x6e, 0xc9, 0xa9, 0x3e, 0x5c, 0x4d, 0x47, 0x86, 0x7d, + 0x94, 0x51, 0x43, 0xbb, 0x72, 0xae, 0x61, 0xb7, 0x55, 0x43, 0x7b, 0x59, + 0x99, 0xc5, 0xa7, 0xb6, 0x5e, 0x3f, 0x85, 0xca, 0x3d, 0x38, 0xa0, 0x49, + 0x4a, 0x48, 0x45, 0xa5, 0xa0, 0xc2, 0x4e, 0x9c, 0x39, 0x4c, 0xb8, 0x91, + 
0x54, 0x98, 0x5e, 0x4f, 0x64, 0x95, 0x62, 0xaf, 0x7a, 0x8e, 0x30, 0x69, + 0x5e, 0x75, 0x68, 0xaa, 0x4d, 0x82, 0x36, 0x3d, 0x89, 0x63, 0x89, 0x71, + 0x66, 0x76, 0xc4, 0xc4, 0x44, 0x95, 0x49, 0x9b, 0x60, 0x80, 0xcf, 0x41, + 0x49, 0x43, 0x60, 0xa5, 0x69, 0xad, 0x73, 0x7c, 0xc9, 0x5a, 0x5e, 0x33, + 0xbe, 0xbb, 0x92, 0xac, 0x3e, 0x59, 0x77, 0x67, 0x7a, 0xc6, 0x55, 0x9f, + 0x55, 0xa7, 0x5e, 0xa3, 0xcc, 0x6c, 0x83, 0x9a, 0x43, 0x49, 0xa3, 0x8b, + 0x8a, 0x45, 0x62, 0x63, 0x6d, 0xd0, 0xdb, 0xa5, 0x4a, 0x86, 0xc3, 0x71, + 0x55, 0x80, 0xba, 0xc9, 0x4f, 0x98, 0x9f, 0x86, 0x81, 0xbe, 0x35, 0xb0, + 0x47, 0x58, 0x44, 0xcd, 0x41, 0xa3, 0x73, 0x63, 0x86, 0x58, 0x5f, 0x82, + 0x66, 0x4f, 0x86, 0xcc, 0x89, 0x7b, 0xae, 0x8c, 0x3e, 0x8c, 0x7f, 0xb1, + 0x94, 0x8e, 0x49, 0x73, 0x2b, 0x54, 0x5d, 0xa2, 0xce, 0x6a, 0xba, 0x91, + 0x6d, 0xbb, 0x5e, 0x41, 0xc2, 0x44, 0x53, 0x74, 0x32, 0x65, 0x5e, 0xaa, + 0x5d, 0x5c, 0xbf, 0xc4, 0x4e, 0x41, 0x7d, 0xc6, 0x67, 0x9e, 0xa0, 0x8d, + 0x7c, 0x97, 0x6e, 0xa2, 0x54, 0xa6, 0x72, 0xb5, 0x42, 0x83, 0x37, 0x9d, + 0x4b, 0xc6, 0xa4, 0x92, 0x4f, 0x68, 0x39, 0x50, 0xba, 0x87, 0x43, 0x77, + 0xb4, 0x5f, 0xcd, 0x62, 0x52, 0x62, 0xc6, 0x63, 0x2b, 0xbe, 0x96, 0xc3, + 0x9a, 0x36, 0x50, 0x51, 0x38, 0xbc, 0x64, 0x51, 0x58, 0x36, 0x3f, 0xa2, + 0x92, 0xa8, 0x4d, 0x6c, 0x48, 0x90, 0x82, 0x3a, 0x9f, 0xb1, 0x4d, 0x75, + 0x87, 0xab, 0x63, 0x62, 0x69, 0x42, 0x98, 0x4e, 0x35, 0x69, 0x32, 0x7c, + 0x47, 0x59, 0x71, 0x46, 0xca, 0x9b, 0x6e, 0x4b, 0x5e, 0x77, 0x97, 0x65, + 0xc6, 0x30, 0xc5, 0x45, 0x87, 0xbf, 0x39, 0x8f, 0x8e, 0x59, 0x38, 0x94, + 0x54, 0xd1, 0x7a, 0x58, 0x80, 0x7d, 0xc8, 0xa3, 0x48, 0x79, 0x43, 0x91, + 0xba, 0x8a, 0xc6, 0x92, 0xae, 0x82, 0x67, 0x8f, 0x68, 0x4e, 0xa3, 0xd1, + 0x4c, 0x85, 0x7e, 0x7f, 0x9b, 0xb7, 0x2c, 0x53, 0xbc, 0x55, 0x46, 0x5a, + 0x66, 0x6e, 0xb3, 0xac, 0x42, 0xc1, 0x6d, 0xb0, 0x6f, 0x35, 0x70, 0xaf, + 0xb8, 0x87, 0x72, 0x6c, 0x9d, 0xb1, 0x45, 0x95, 0xb2, 0xc2, 0x58, 0xba, + 0x36, 0x9c, 0x45, 0x9e, 0xbd, 0xc2, 0x3b, 0xb4, 0xb2, 0x48, 0xa3, 0x54, + 0xc9, 0x6a, 0x70, 0x67, 0x46, 0xac, 0x45, 0xc9, 0xbc, 0xa3, 0x4d, 0x86, + 0x93, 0xaf, 0xb4, 0xc8, 0xa0, 0x53, 0xad, 0x53, 0x58, 0x41, 0xc5, 0x46, + 0x95, 0x4a, 0x93, 0x97, 0x93, 0xcf, 0x58, 0x5b, 0xdb, 0x78, 0x57, 0xbf, + 0xc8, 0x44, 0x7e, 0x4a, 0x6c, 0xc0, 0x96, 0x6a, 0x79, 0x4f, 0xb4, 0x40, + 0x3c, 0xba, 0x3c, 0x66, 0x58, 0x5a, 0x5a, 0x55, 0x32, 0x98, 0xb9, 0x43, + 0x37, 0x36, 0x36, 0xc3, 0xa4, 0xc5, 0x8d, 0x2c, 0x42, 0xb7, 0xbe, 0x55, + 0x62, 0x7f, 0x91, 0xa1, 0x5a, 0x71, 0x54, 0xc6, 0x6b, 0x97, 0x4f, 0x74, + 0x5e, 0x73, 0x6b, 0x57, 0x34, 0xc2, 0xa2, 0x6b, 0x36, 0x38, 0xb8, 0xc7, + 0x8d, 0xa4, 0x7c, 0xc4, 0x75, 0x44, 0x6f, 0xb6, 0x4c, 0x38, 0x60, 0x5f, + 0x9b, 0xa1, 0xa3, 0x9f, 0x72, 0xa8, 0xd4, 0x57, 0xa3, 0x74, 0x3a, 0x97, + 0xcc, 0x54, 0xba, 0xaf, 0xc7, 0xa5, 0xc2, 0x79, 0x53, 0xbb, 0xc1, 0xb5, + 0xa7, 0xb3, 0x4e, 0x66, 0x79, 0xa6, 0xc1, 0x3c, 0xc1, 0x7e, 0xc1, 0x77, + 0x55, 0xcb, 0x6e, 0x5a, 0x55, 0xaf, 0x68, 0x4a, 0xa7, 0xaf, 0xb3, 0x36, + 0xc9, 0x3c, 0x85, 0xcd, 0x86, 0x7c, 0x35, 0xbb, 0x88, 0x50, 0xb2, 0x9e, + 0xa9, 0x99, 0xbb, 0x8c, 0xc9, 0xb6, 0x41, 0x71, 0x55, 0x8b, 0xac, 0xc0, + 0xc1, 0xba, 0x93, 0xb1, 0x50, 0x6c, 0x77, 0xac, 0xd3, 0xcc, 0x92, 0x4f, + 0x4e, 0x38, 0x97, 0xc8, 0xba, 0x9b, 0xc0, 0x99, 0x36, 0x84, 0x97, 0x9c, + 0x5d, 0x46, 0x7e, 0xa6, 0xcf, 0x5e, 0x5c, 0x44, 0xcd, 0x9c, 0xce, 0x69, + 0xab, 0x5f, 0x4a, 0x82, 0xc4, 0x65, 0xb7, 0x80, 0x6e, 0x74, 0x4c, 0x9a, + 0xa4, 0xa9, 0x7a, 0xb9, 0x93, 0x84, 0x8e, 0x78, 0x99, 0x96, 0x70, 0x65, + 0x8c, 0xb8, 0x4d, 0xcd, 0x52, 0xae, 0x2f, 0x94, 0x55, 0xb8, 0x4a, 0x51, + 
0x33, 0x3d, 0xa1, 0xd3, 0xad, 0xb0, 0xb5, 0x86, 0x4e, 0xcc, 0x44, 0x8e, + 0x8a, 0xac, 0x4c, 0x72, 0x84, 0x92, 0x82, 0x9a, 0x6f, 0x44, 0xb3, 0x6b, + 0xbd, 0x3e, 0x78, 0xb9, 0x9f, 0x57, 0x40, 0x57, 0x9b, 0xcb, 0x4a, 0x50, + 0x4e, 0x76, 0xd2, 0xc2, 0x75, 0xa2, 0x30, 0x90, 0x3d, 0xaf, 0x51, 0xaf, + 0x89, 0x84, 0x93, 0x5f, 0x62, 0xc8, 0x70, 0x64, 0x9f, 0xc5, 0x99, 0x9d, + 0x81, 0x48, 0x96, 0x66, 0x97, 0xab, 0x96, 0xb0, 0x96, 0xcf, 0xae, 0x44, + 0x69, 0x4b, 0xd5, 0x7c, 0x46, 0x89, 0xa3, 0xb7, 0x44, 0xcd, 0x8a, 0x87, + 0xb3, 0x5e, 0x34, 0x34, 0x75, 0x62, 0x6b, 0x68, 0x67, 0x7c, 0xbf, 0x88, + 0x83, 0x68, 0x7c, 0xc9, 0x56, 0x5d, 0xd0, 0x99, 0xad, 0x6d, 0xcb, 0xac, + 0xca, 0xa8, 0x69, 0x91, 0xb6, 0x84, 0x92, 0xb1, 0x39, 0x2c, 0xc2, 0x5d, + 0xd4, 0xcf, 0x6d, 0x4d, 0xa5, 0x3d, 0x59, 0x4d, 0x9b, 0x91, 0x8e, 0x36, + 0x70, 0x6c, 0xc0, 0xc6, 0xc0, 0x88, 0xc5, 0x56, 0x51, 0xbf, 0x3c, 0x56, + 0xa3, 0x99, 0xce, 0x58, 0x88, 0x9d, 0x80, 0x93, 0xc1, 0x3e, 0x65, 0x82, + 0x8e, 0xa1, 0xad, 0x49, 0x99, 0x48, 0x74, 0x8d, 0xa8, 0xb4, 0xb1, 0x9a, + 0xcc, 0x41, 0xc0, 0xc4, 0x3c, 0x5a, 0x78, 0x6d, 0x7c, 0x95, 0xce, 0x58, + 0xc4, 0x49, 0x62, 0x61, 0x64, 0x57, 0x36, 0x9a, 0xa9, 0x77, 0x40, 0xb9, + 0x57, 0x35, 0x74, 0x9a, 0x3e, 0x7e, 0xb9, 0xb2, 0x99, 0x87, 0x6e, 0x4c, + 0xc3, 0xb3, 0xc7, 0x7e, 0x5f, 0xaf, 0x6e, 0x71, 0xd4, 0x63, 0xb6, 0x99, + 0x90, 0x92, 0x88, 0x5f, 0xb7, 0xd4, 0xbc, 0xc1, 0xc1, 0x9f, 0xa8, 0x5c, + 0x35, 0x30, 0x3e, 0x63, 0xc2, 0x79, 0xb2, 0xba, 0x91, 0x6b, 0xc5, 0x88, + 0x49, 0x99, 0x55, 0x98, 0xce, 0x8f, 0x66, 0x5d, 0x3d, 0xa9, 0x36, 0x68, + 0x4e, 0x4c, 0x57, 0x35, 0xd6, 0xa8, 0x5a, 0x8d, 0x8d, 0xcd, 0xaa, 0x43, + 0xc8, 0x42, 0xe1, 0x41, 0xa7, 0xaf, 0x8b, 0x94, 0x71, 0x4a, 0x69, 0x4d, + 0xac, 0xd7, 0x8b, 0xcb, 0x42, 0xe3, 0xb7, 0x51, 0xb1, 0x86, 0x56, 0x74, + 0x44, 0x39, 0x3e, 0x30, 0x97, 0x4d, 0x2e, 0x25, 0x2a, 0x7a, 0x8d, 0xda, + 0x84, 0x9d, 0x68, 0xc3, 0x97, 0x30, 0x47, 0xbb, 0x80, 0x7a, 0xcf, 0x7a, + 0x7d, 0x39, 0x92, 0x9c, 0x66, 0x52, 0x6b, 0xba, 0xac, 0xa0, 0xc7, 0x73, + 0x34, 0xbf, 0x7a, 0x9a, 0x5b, 0x73, 0x6e, 0x62, 0x57, 0x33, 0x34, 0xa9, + 0x7e, 0x91, 0x83, 0x7f, 0x99, 0x72, 0xbb, 0x8f, 0xc8, 0xa8, 0x89, 0x58, + 0x92, 0x62, 0xab, 0xaa, 0x8c, 0xb9, 0x6d, 0xb0, 0x64, 0xca, 0x57, 0x59, + 0x2f, 0x5e, 0xb9, 0x9a, 0x3e, 0xbe, 0x88, 0xa4, 0x81, 0x80, 0xd6, 0x6a, + 0x8a, 0x5e, 0x8f, 0xd6, 0x49, 0x3d, 0x5f, 0xa9, 0x44, 0x34, 0xc7, 0x75, + 0xd1, 0x95, 0x9d, 0x83, 0x46, 0x5c, 0x33, 0x8b, 0x98, 0x57, 0xc8, 0x67, + 0x3c, 0x4f, 0xaf, 0x98, 0x8a, 0x76, 0xc5, 0xa2, 0x69, 0xd2, 0xa2, 0x8d, + 0xc0, 0xaf, 0xbf, 0x53, 0x9d, 0x3f, 0xc4, 0x62, 0x6d, 0xc8, 0xbb, 0x65, + 0x5b, 0x6f, 0x48, 0xcc, 0x66, 0x9b, 0x67, 0x6e, 0x5b, 0xd8, 0xa6, 0x49, + 0x51, 0x40, 0x79, 0x3e, 0xbe, 0xd3, 0xc4, 0xa7, 0x4c, 0x89, 0x86, 0x91, + 0x94, 0x35, 0xbc, 0x85, 0xaa, 0x8e, 0x78, 0xcc, 0xce, 0x51, 0xa7, 0x4f, + 0x93, 0xc1, 0x5f, 0x6a, 0x63, 0x45, 0x82, 0x7d, 0xaa, 0x7d, 0xb3, 0x3f, + 0x78, 0x40, 0xc2, 0x78, 0x54, 0x69, 0x59, 0xc0, 0x6a, 0x98, 0xc3, 0x86, + 0xbd, 0xb3, 0x6f, 0xbe, 0x36, 0xa5, 0x4d, 0xc4, 0x6d, 0x4f, 0x9c, 0x7f, + 0x68, 0x96, 0x64, 0x5c, 0x53, 0xbc, 0xa9, 0xce, 0x4e, 0x6d, 0x77, 0x73, + 0x33, 0xd5, 0x77, 0xc0, 0xc0, 0x5d, 0x6d, 0x86, 0x6a, 0xc9, 0xa1, 0x3e, + 0x63, 0x7a, 0x9d, 0xb3, 0x5c, 0x7a, 0xd1, 0x92, 0xc0, 0x5f, 0x3e, 0x99, + 0x56, 0xd2, 0x65, 0x7a, 0xc9, 0x48, 0x73, 0xb1, 0x59, 0xce, 0x8f, 0xa9, + 0x97, 0x32, 0x97, 0x4a, 0xc2, 0x7f, 0x51, 0xa1, 0x2f, 0x84, 0x53, 0x8b, + 0x2b, 0x7f, 0x43, 0x36, 0x41, 0x5f, 0x98, 0x75, 0xaa, 0x85, 0xcf, 0xb5, + 0xc3, 0x8f, 0x8b, 0x5a, 0x9f, 0x6e, 0x74, 0x51, 0x9e, 0x6d, 0x66, 0x6c, + 
0x70, 0xa1, 0x76, 0x46, 0x6a, 0x4f, 0x48, 0x72, 0xb3, 0xb4, 0xc6, 0x70, + 0x45, 0x65, 0xa7, 0x79, 0xcb, 0x5b, 0xd3, 0x97, 0xb0, 0xbd, 0xc1, 0x58, + 0x6a, 0x71, 0x6b, 0x56, 0xd6, 0xc4, 0x74, 0x76, 0xa3, 0x95, 0x8d, 0x6f, + 0x4d, 0x95, 0x81, 0x5e, 0x90, 0x89, 0xa5, 0x3b, 0x9c, 0x47, 0x52, 0x58, + 0x97, 0x7d, 0x87, 0x93, 0x82, 0x78, 0xb3, 0xbf, 0x87, 0xb7, 0x28, 0x40, + 0x5e, 0x7a, 0x6d, 0xdd, 0x98, 0x6b, 0x38, 0xc9, 0x3f, 0xb6, 0x4e, 0x58, + 0x33, 0x6b, 0x33, 0x55, 0xae, 0x80, 0xc0, 0x80, 0xb1, 0x32, 0x3a, 0xc8, + 0x7d, 0x92, 0x45, 0x2b, 0xb3, 0x82, 0xad, 0x7a, 0xaf, 0x4e, 0x60, 0x7c, + 0x5a, 0xd1, 0xa7, 0x61, 0xab, 0x5c, 0xcd, 0x6b, 0x5d, 0x42, 0x74, 0x6b, + 0x72, 0x9c, 0xcd, 0x6e, 0xcf, 0x41, 0x79, 0x3c, 0x39, 0xc6, 0x93, 0x43, + 0xc0, 0xae, 0x5d, 0x9f, 0x99, 0xc7, 0x38, 0xcf, 0xa0, 0xc7, 0x68, 0x92, + 0x63, 0x7c, 0x95, 0x5b, 0x47, 0xc3, 0x45, 0x42, 0x8e, 0x3c, 0x6d, 0x65, + 0x86, 0x50, 0x7b, 0xb3, 0x6b, 0xc1, 0xb7, 0x84, 0xc3, 0x3c, 0xac, 0x7a, + 0xc2, 0x64, 0x57, 0x55, 0x3a, 0x5b, 0xc2, 0xb9, 0xb9, 0xc7, 0x8c, 0x4c, + 0x9e, 0xa1, 0x51, 0x95, 0x3c, 0xd4, 0x7f, 0x92, 0x57, 0x80, 0xac, 0xa6, + 0x62, 0x8c, 0x4a, 0x86, 0x32, 0x71, 0x99, 0x41, 0x2b, 0xb5, 0x5a, 0xb3, + 0xcf, 0x74, 0x65, 0x39, 0x96, 0x7f, 0xd0, 0x3e, 0x4b, 0xe1, 0x6e, 0x37, + 0x7c, 0x66, 0x51, 0x7f, 0x59, 0x78, 0x62, 0x56, 0x6d, 0x4e, 0x84, 0xb8, + 0xab, 0x72, 0xd0, 0x9b, 0x6c, 0x77, 0x80, 0x70, 0xb9, 0x66, 0x4f, 0x94, + 0xb3, 0x80, 0x42, 0x34, 0x47, 0x47, 0x81, 0xb2, 0x5c, 0x40, 0x7c, 0xc9, + 0x9a, 0xd2, 0xc8, 0xd1, 0x4d, 0x92, 0xa9, 0x83, 0xc1, 0x38, 0xc1, 0xc7, + 0x69, 0x6c, 0x9b, 0x44, 0xaf, 0x79, 0x78, 0x5d, 0x53, 0x8c, 0x84, 0x77, + 0x46, 0xbe, 0x4d, 0x97, 0x67, 0x35, 0x99, 0x88, 0x78, 0x3a, 0x92, 0xda, + 0x90, 0x3b, 0x8b, 0x38, 0x98, 0xa0, 0xc3, 0x7c, 0x79, 0x31, 0xd0, 0x62, + 0x7c, 0x75, 0x98, 0xc9, 0x38, 0x9a, 0x4e, 0x6d, 0x6b, 0x4d, 0xb8, 0x6d, + 0x89, 0x42, 0x70, 0xac, 0x89, 0x5d, 0xab, 0x7e, 0x41, 0x4b, 0x70, 0x98, + 0x36, 0x81, 0xa9, 0x4e, 0xbd, 0x6b, 0x40, 0x7d, 0xbb, 0x8d, 0xd2, 0x7e, + 0x9a, 0x8b, 0x8a, 0x39, 0xab, 0xc1, 0x86, 0xd4, 0x6c, 0xb7, 0x9d, 0x55, + 0x9a, 0x41, 0xc1, 0x73, 0x5d, 0x99, 0xbb, 0x7a, 0x63, 0xe7, 0x68, 0x77, + 0xa7, 0x83, 0x37, 0x4d, 0x82, 0xaa, 0x3e, 0x91, 0x44, 0x7b, 0x89, 0x7f, + 0x85, 0x86, 0x68, 0xbb, 0x7c, 0x93, 0xba, 0xbc, 0xd6, 0x77, 0x6a, 0x45, + 0x47, 0xb5, 0xa9, 0x82, 0xca, 0x8e, 0xd5, 0xa1, 0x50, 0x94, 0x5b, 0x75, + 0x5d, 0x4a, 0x69, 0xa2, 0x9d, 0xc4, 0x6e, 0xcb, 0x7d, 0xb1, 0xc9, 0xc3, + 0x76, 0x54, 0xb8, 0xc3, 0x85, 0xc7, 0x54, 0x69, 0x69, 0x67, 0xbd, 0x86, + 0xb8, 0x3c, 0xd2, 0x74, 0x37, 0xb7, 0x4e, 0x4d, 0xb0, 0xad, 0xb9, 0x43, + 0xd0, 0x8c, 0x48, 0x77, 0xcf, 0x8a, 0x5d, 0x3f, 0x38, 0x49, 0x6b, 0x77, + 0xb4, 0x4e, 0x7f, 0x88, 0x92, 0xab, 0xc0, 0xc0, 0x69, 0xca, 0x89, 0x40, + 0x5b, 0x5c, 0x4d, 0x8b, 0x99, 0x31, 0x46, 0xc7, 0x61, 0x62, 0x92, 0x5d, + 0x7c, 0x5b, 0x97, 0xa9, 0xb6, 0x7c, 0xa1, 0xb6, 0x41, 0x83, 0x76, 0xd6, + 0xa3, 0x5e, 0xbb, 0xb7, 0x30, 0x4d, 0x8d, 0x85, 0x33, 0x76, 0x9f, 0x8d, + 0x5b, 0xb4, 0x73, 0x47, 0x8a, 0x42, 0x54, 0x76, 0x75, 0x40, 0x2f, 0xa0, + 0x4d, 0x89, 0xa6, 0x91, 0xb6, 0xc9, 0x60, 0x8d, 0x64, 0x78, 0xb1, 0x7d, + 0xc5, 0xb2, 0xc4, 0x93, 0x51, 0x6e, 0x8d, 0x37, 0x35, 0x91, 0x8b, 0xb7, + 0x9d, 0xbf, 0xa6, 0x74, 0xb6, 0xa8, 0x69, 0x6d, 0x85, 0x71, 0xc8, 0x9c, + 0x8b, 0x3b, 0x9a, 0x48, 0x6a, 0xbb, 0x32, 0x69, 0x65, 0x9b, 0x3b, 0x52, + 0x71, 0x82, 0xce, 0x42, 0x4d, 0xaf, 0x7c, 0xbb, 0x62, 0x3d, 0x63, 0xc6, + 0x39, 0x71, 0x42, 0x51, 0xb6, 0xa1, 0xca, 0xbf, 0x66, 0xa9, 0x6f, 0xcf, + 0x38, 0x78, 0xc3, 0x6a, 0xa0, 0x54, 0x44, 0x60, 0x44, 0x52, 0x56, 0x40, + 
0x4b, 0x91, 0x94, 0x9b, 0x94, 0x87, 0x6d, 0xa9, 0x8f, 0x7f, 0xcc, 0x82, + 0x4a, 0x89, 0x55, 0x64, 0x7c, 0x69, 0x36, 0x9a, 0x71, 0x4b, 0x39, 0xa4, + 0xc7, 0x37, 0xbe, 0x39, 0x5e, 0x84, 0x71, 0xb4, 0x34, 0x53, 0x42, 0xaf, + 0xb7, 0x84, 0x48, 0x3b, 0x7e, 0x88, 0xd0, 0x8a, 0x87, 0x34, 0x7e, 0x45, + 0xd5, 0xb4, 0x98, 0x85, 0x33, 0x60, 0x54, 0xb9, 0x75, 0xd2, 0x6d, 0x86, + 0x69, 0x56, 0x8a, 0xa0, 0x97, 0x9f, 0xb1, 0xa1, 0x49, 0x3a, 0x78, 0x83, + 0x60, 0x56, 0xba, 0xc3, 0x94, 0xac, 0xb5, 0xd1, 0xae, 0xa4, 0x7f, 0x63, + 0xb7, 0xb4, 0x92, 0x59, 0x46, 0x97, 0xa1, 0x31, 0xb3, 0xb1, 0x47, 0xad, + 0x67, 0x47, 0x99, 0x66, 0x36, 0xbb, 0x87, 0xaf, 0x67, 0xb6, 0x55, 0xc3, + 0x81, 0x3a, 0x37, 0x30, 0xb1, 0xa2, 0x62, 0x8f, 0x4a, 0x3a, 0x90, 0xad, + 0x30, 0x3d, 0x5b, 0x6b, 0x74, 0x61, 0x33, 0x93, 0xcd, 0xc1, 0x35, 0xa6, + 0x42, 0x9d, 0x6c, 0xa9, 0x54, 0x78, 0xb7, 0xa2, 0x7e, 0xcd, 0xbb, 0x81, + 0x4f, 0xd4, 0xcc, 0x8b, 0xa0, 0xc2, 0x85, 0x4e, 0xd1, 0xc3, 0x73, 0x92, + 0x3f, 0xa8, 0x9f, 0x7c, 0xa2, 0xb6, 0x52, 0x81, 0xc6, 0x83, 0x83, 0x41, + 0x33, 0xc5, 0x4b, 0xb4, 0x77, 0x83, 0xb4, 0x9c, 0x99, 0x38, 0x28, 0x2e, + 0x57, 0xb2, 0x68, 0x6a, 0xd4, 0x80, 0x6a, 0x7d, 0x6c, 0xd4, 0x72, 0xc3, + 0x86, 0x5c, 0x89, 0x5c, 0x47, 0xd4, 0xcb, 0x61, 0xcc, 0x40, 0x41, 0xba, + 0xac, 0x93, 0xaa, 0x6c, 0x79, 0x7c, 0x99, 0x80, 0x42, 0x75, 0x7e, 0xc8, + 0x67, 0x2e, 0x9f, 0xb0, 0x62, 0xa6, 0xcd, 0x2e, 0x79, 0x7f, 0xa7, 0x4d, + 0x3f, 0x59, 0x3a, 0xaa, 0x6b, 0x48, 0xb7, 0xbc, 0xd9, 0xca, 0x34, 0x53, + 0xb2, 0x68, 0xbe, 0x7d, 0xa7, 0xa9, 0x4c, 0x75, 0x9b, 0x66, 0xbf, 0x8e, + 0x5b, 0x6a, 0xad, 0x4e, 0xc2, 0x35, 0xdc, 0x76, 0x7d, 0xb6, 0x55, 0xd2, + 0x58, 0x76, 0x41, 0x48, 0x53, 0xbd, 0xba, 0xab, 0x6c, 0x94, 0x51, 0x3e, + 0xb3, 0x54, 0x5c, 0x96, 0x73, 0x75, 0x54, 0x4c, 0xaf, 0x56, 0x60, 0xa1, + 0x44, 0x9f, 0xa1, 0x52, 0x80, 0xb9, 0x3d, 0xa7, 0x8d, 0x73, 0x35, 0x78, + 0x51, 0x3d, 0xaa, 0x8d, 0xcf, 0xa6, 0x7b, 0x49, 0x40, 0x77, 0x52, 0xa4, + 0xd3, 0x4b, 0x4a, 0x97, 0xd2, 0x3c, 0xcf, 0xc6, 0x80, 0xbf, 0xd4, 0xce, + 0x5c, 0xa2, 0x8e, 0xc8, 0x31, 0x6c, 0x80, 0x5d, 0x5a, 0x85, 0x4d, 0x2f, + 0x86, 0xb6, 0xaa, 0x4e, 0x74, 0xba, 0x36, 0x78, 0x32, 0x74, 0xc3, 0xaa, + 0xcb, 0x32, 0x8f, 0x6e, 0xd3, 0x44, 0xb9, 0x89, 0x81, 0x80, 0x87, 0x68, + 0x92, 0xd7, 0x8d, 0x36, 0x84, 0xc9, 0x5b, 0xb5, 0x94, 0x47, 0xca, 0x84, + 0x36, 0x84, 0x63, 0x81, 0xd0, 0xd3, 0x79, 0xbc, 0x79, 0x97, 0x31, 0x75, + 0x52, 0x85, 0xa5, 0x70, 0x6d, 0xcb, 0x53, 0x52, 0x9a, 0x51, 0x75, 0xb6, + 0xbf, 0x6e, 0xa9, 0x8e, 0xce, 0x8d, 0xc9, 0x60, 0x40, 0x97, 0xb6, 0xa0, + 0xb1, 0x94, 0x47, 0x47, 0xbd, 0xbc, 0xa5, 0x9e, 0x3f, 0x72, 0x5b, 0x62, + 0x73, 0x33, 0x51, 0x49, 0xca, 0x30, 0x47, 0x47, 0x73, 0x8e, 0x9a, 0x5b, + 0x67, 0x3d, 0x58, 0x72, 0xa2, 0x42, 0x42, 0xa0, 0x7e, 0x3b, 0x4f, 0xa6, + 0x6f, 0x45, 0x9f, 0x9c, 0xc5, 0xd3, 0x51, 0xb4, 0x8c, 0x6b, 0x85, 0x70, + 0x9d, 0x80, 0x9b, 0xbe, 0xd1, 0x70, 0x56, 0xc4, 0x70, 0x57, 0x5b, 0x8d, + 0x9b, 0x57, 0xb3, 0x43, 0x68, 0x8c, 0x73, 0x44, 0x65, 0x56, 0x71, 0x83, + 0x71, 0x47, 0x6f, 0x30, 0x90, 0x7f, 0xb6, 0x5b, 0xb6, 0xc9, 0x99, 0xb0, + 0xcd, 0x6b, 0x51, 0xb4, 0x9f, 0xd7, 0x8d, 0x69, 0x5f, 0x5e, 0x77, 0x48, + 0x9e, 0xa2, 0x6c, 0x59, 0x47, 0x99, 0x72, 0x90, 0xb2, 0x89, 0x71, 0xc6, + 0x39, 0x45, 0x8d, 0x35, 0x37, 0xcd, 0x3f, 0xd5, 0xb5, 0x70, 0xcc, 0xaa, + 0xd1, 0x4d, 0xc8, 0x96, 0xdb, 0xd9, 0xad, 0x31, 0x65, 0x85, 0x4b, 0x76, + 0x6e, 0x8b, 0x52, 0xd4, 0xad, 0xb0, 0x5f, 0x89, 0x51, 0x6f, 0xc9, 0x80, + 0xa0, 0x5a, 0xd5, 0xa2, 0x85, 0x73, 0x68, 0xb4, 0x77, 0xc3, 0x6e, 0x7c, + 0x52, 0x5e, 0x2f, 0x6a, 0x86, 0xd1, 0x59, 0xc3, 0x9d, 0xcd, 0x9d, 0x8d, + 
0x4f, 0xcd, 0x9f, 0xc5, 0xca, 0xc0, 0xa8, 0x99, 0xa2, 0x42, 0x6d, 0xca, + 0x52, 0x6f, 0x41, 0x8f, 0x45, 0xd1, 0x47, 0x2e, 0x89, 0xbb, 0xbe, 0x67, + 0x6a, 0x60, 0xd6, 0x7d, 0x53, 0xac, 0x5c, 0xaa, 0x5c, 0x82, 0xa7, 0xd3, + 0xc0, 0xa5, 0x55, 0x52, 0xb2, 0x67, 0xaa, 0x48, 0x52, 0xd2, 0x72, 0x3f, + 0xd6, 0x99, 0xbc, 0xa0, 0xae, 0xbf, 0xad, 0x82, 0x40, 0x38, 0x5c, 0x36, + 0xb2, 0x83, 0x71, 0xae, 0x3a, 0x7e, 0x98, 0x59, 0x65, 0x67, 0x7a, 0x49, + 0xaa, 0x5f, 0x97, 0x3c, 0x5e, 0x90, 0x50, 0x5c, 0x99, 0xd9, 0x96, 0xbd, + 0x40, 0xb1, 0x5f, 0xa9, 0x44, 0x92, 0xc6, 0xa8, 0xce, 0x3c, 0xa4, 0xcb, + 0x96, 0x38, 0xd9, 0x81, 0xbe, 0xb7, 0x8d, 0x57, 0x4d, 0xc9, 0x88, 0x89, + 0xb8, 0x3c, 0x83, 0x80, 0x7d, 0x90, 0x43, 0x6a, 0x71, 0x8b, 0x7d, 0xb0, + 0xbb, 0x36, 0x60, 0x73, 0xc2, 0x67, 0xab, 0xbc, 0x6d, 0x65, 0xa7, 0x58, + 0xa5, 0x8b, 0x89, 0xb6, 0x35, 0x7a, 0x4b, 0x4d, 0xb1, 0x63, 0x7f, 0x86, + 0x94, 0x57, 0x99, 0x86, 0x98, 0x96, 0x93, 0x96, 0x62, 0xa7, 0xba, 0x41, + 0xda, 0x7c, 0xa1, 0x50, 0x66, 0x39, 0x98, 0x79, 0x5c, 0xdc, 0x9c, 0xc5, + 0x51, 0x66, 0xa6, 0xab, 0x58, 0x9d, 0x60, 0xa9, 0x5a, 0xaa, 0x94, 0xec, + 0x7c, 0x4f, 0x53, 0x7e, 0x48, 0x73, 0x85, 0xc5, 0x5a, 0xd3, 0xa8, 0x7f, + 0x4d, 0x7f, 0x48, 0x8d, 0x80, 0x69, 0x40, 0x48, 0x9f, 0x48, 0x87, 0x48, + 0x69, 0x72, 0x75, 0x70, 0xa7, 0x83, 0x3d, 0x97, 0x65, 0x46, 0x84, 0x6a, + 0xb8, 0xad, 0xb7, 0x43, 0xcb, 0xb0, 0xa5, 0x6b, 0x9e, 0x7d, 0x34, 0xba, + 0x98, 0x32, 0x90, 0x97, 0x83, 0xb4, 0x9b, 0xbf, 0x9e, 0x67, 0x71, 0xd7, + 0x56, 0x92, 0x37, 0x36, 0x5d, 0x9c, 0xb6, 0x64, 0x9b, 0x74, 0x54, 0xb4, + 0xa1, 0x7b, 0x32, 0x4d, 0x9c, 0x68, 0x3e, 0x7c, 0xa6, 0x63, 0x9a, 0x71, + 0xab, 0x69, 0x41, 0x80, 0x8e, 0xaa, 0x84, 0xbb, 0xd5, 0x48, 0x84, 0xaa, + 0xc6, 0x42, 0x58, 0x66, 0xb9, 0xbb, 0x4b, 0xaa, 0x3c, 0xa4, 0xbd, 0x7a, + 0x38, 0x5d, 0x8f, 0x95, 0x50, 0xae, 0x8b, 0xa7, 0xae, 0xa4, 0x6b, 0x60, + 0xdb, 0x7d, 0x79, 0x8c, 0x9c, 0x51, 0xa1, 0x5f, 0x54, 0x53, 0xc2, 0x56, + 0x87, 0xa2, 0x40, 0x3d, 0xbf, 0x5b, 0x43, 0x45, 0x33, 0xbe, 0x7f, 0xae, + 0x52, 0x6f, 0x39, 0xb9, 0x89, 0x77, 0x38, 0xbc, 0xaa, 0x89, 0xa1, 0x4e, + 0xb2, 0x49, 0x96, 0x94, 0x4e, 0xcb, 0xac, 0xa5, 0xcc, 0x5a, 0xc9, 0x5c, + 0xb3, 0xb8, 0x98, 0x9d, 0x51, 0x9f, 0x29, 0x71, 0x52, 0x5f, 0x61, 0x4e, + 0x74, 0xb4, 0x25, 0xda, 0x25, 0xa4, 0x3f, 0x89, 0xc7, 0xb9, 0x33, 0x9a, + 0x7f, 0x5c, 0x84, 0xcc, 0x88, 0x89, 0x87, 0xa8, 0x7f, 0x73, 0xac, 0xaf, + 0x85, 0xcc, 0xc1, 0x79, 0x7a, 0x61, 0x60, 0x97, 0x99, 0x72, 0x9c, 0xbf, + 0x9c, 0xb7, 0xa6, 0xc4, 0x25, 0x9b, 0xc0, 0x46, 0x6c, 0x55, 0xa3, 0x89, + 0x6f, 0x46, 0x6e, 0xa4, 0xad, 0x69, 0x6c, 0xa2, 0x5d, 0x56, 0xda, 0x3c, + 0xb6, 0xce, 0x56, 0x45, 0x7e, 0x60, 0x75, 0xd1, 0x33, 0x23, 0x9b, 0xa6, + 0xc2, 0xc2, 0x39, 0xb4, 0xbf, 0xbf, 0xac, 0x94, 0xb6, 0x8a, 0x56, 0x56, + 0xa5, 0x6b, 0x38, 0x8a, 0x7c, 0x46, 0x78, 0x6e, 0xcc, 0xc8, 0x62, 0xbc, + 0xbb, 0x4e, 0xaf, 0xd9, 0x69, 0x9f, 0xa0, 0x93, 0x65, 0x2b, 0x5b, 0x39, + 0x3f, 0x9f, 0x85, 0xb9, 0x55, 0x8c, 0x70, 0xbd, 0x7d, 0x78, 0xa5, 0x2b, + 0xab, 0x8d, 0xb6, 0x61, 0x3c, 0x4a, 0x5e, 0x85, 0x8a, 0x6e, 0x49, 0xad, + 0xd1, 0xb1, 0x6e, 0xa2, 0xba, 0xa8, 0x97, 0xd0, 0x7a, 0x8f, 0x87, 0xc4, + 0x64, 0xd4, 0x8b, 0x6a, 0x5d, 0x73, 0x8b, 0x6c, 0xa6, 0xc5, 0x49, 0xb4, + 0xa2, 0x8b, 0x9c, 0xda, 0x7f, 0xc5, 0x53, 0xdc, 0x60, 0xcb, 0x79, 0x84, + 0x91, 0x4a, 0x4b, 0x8a, 0x43, 0x51, 0xa2, 0xca, 0x7f, 0x8b, 0x7f, 0x70, + 0x45, 0xac, 0x5a, 0x76, 0x98, 0x7e, 0xcb, 0x57, 0x66, 0x89, 0xc7, 0x41, + 0xcc, 0xd3, 0xa8, 0xc1, 0xc9, 0x8a, 0x4e, 0x7f, 0xdb, 0xdb, 0x84, 0x76, + 0xbb, 0x43, 0x7a, 0x4a, 0xa9, 0xa7, 0xa4, 0x90, 0x90, 0x90, 0x7c, 0x3d, + 
0x2d, 0xa5, 0xca, 0x7b, 0x97, 0xbd, 0xc2, 0x85, 0xc1, 0x96, 0xb8, 0x95, + 0x75, 0x3c, 0x7c, 0xde, 0x76, 0x37, 0x37, 0x5e, 0x9e, 0x63, 0xcf, 0xce, + 0x37, 0x5e, 0xac, 0xb9, 0x40, 0x78, 0x40, 0x96, 0x7c, 0xb4, 0x7d, 0x8c, + 0xba, 0x2e, 0xc6, 0xcf, 0x51, 0x55, 0x69, 0xa4, 0xd8, 0xc9, 0x26, 0xab, + 0x5a, 0x5d, 0x15, 0x66, 0xc3, 0xca, 0x9b, 0x75, 0x95, 0xa2, 0x97, 0xc2, + 0x32, 0xa5, 0xa7, 0x48, 0x23, 0x75, 0xa7, 0x53, 0x64, 0x54, 0xbb, 0xa8, + 0x53, 0x97, 0x4b, 0xd6, 0xb8, 0x35, 0xb9, 0x69, 0x4b, 0x87, 0x43, 0x90, + 0x89, 0xba, 0x87, 0x58, 0xb0, 0xc2, 0x65, 0xc2, 0x9d, 0x43, 0x61, 0x4e, + 0x71, 0x57, 0xd6, 0x60, 0x6d, 0xbe, 0x4f, 0x84, 0x8f, 0xdf, 0x39, 0x90, + 0x67, 0x49, 0x3d, 0xbc, 0x9d, 0x99, 0x7f, 0x8c, 0x83, 0xd4, 0x45, 0x42, + 0x92, 0x48, 0x73, 0xcd, 0x57, 0x4d, 0x5d, 0xcc, 0xa8, 0x9d, 0xa0, 0x65, + 0x80, 0x8b, 0x71, 0x87, 0xbf, 0x97, 0x9e, 0x39, 0x79, 0x92, 0xcd, 0x6b, + 0x94, 0x5b, 0x78, 0x67, 0x7f, 0xcd, 0xb3, 0xb1, 0x5f, 0xa5, 0xb0, 0xc3, + 0x5d, 0x3c, 0x37, 0x6c, 0x4d, 0xc6, 0x72, 0x5e, 0x83, 0xb3, 0xd7, 0xaf, + 0x59, 0x75, 0xdf, 0x69, 0x71, 0xa1, 0x78, 0x3e, 0x75, 0x71, 0xc8, 0x97, + 0x2a, 0x56, 0xac, 0xce, 0xa5, 0x49, 0xcc, 0xcc, 0xc1, 0xd2, 0x51, 0x99, + 0xcc, 0x61, 0x39, 0x75, 0xbc, 0xc8, 0x71, 0xa4, 0xb7, 0xb6, 0x74, 0x59, + 0xdb, 0x8d, 0x63, 0x7c, 0xa7, 0xcd, 0x65, 0xa3, 0x66, 0xb3, 0x97, 0x91, + 0x81, 0x33, 0xab, 0x33, 0xf0, 0xa4, 0x28, 0x37, 0x4f, 0x34, 0x63, 0x49, + 0x5d, 0xb6, 0x26, 0xa6, 0x5f, 0x28, 0x66, 0x48, 0x67, 0x50, 0x3c, 0x80, + 0x3b, 0x9f, 0x61, 0x49, 0x9e, 0x6a, 0x40, 0xaa, 0x5f, 0xaa, 0xcb, 0x41, + 0x94, 0x7c, 0xa1, 0x7f, 0xb8, 0x3f, 0x8b, 0x40, 0x43, 0x49, 0x38, 0x3d, + 0x4e, 0x57, 0xb1, 0x97, 0x6a, 0xb2, 0x60, 0x79, 0x8c, 0x8e, 0x44, 0x8e, + 0xb2, 0x54, 0x95, 0xd0, 0x61, 0x7c, 0x58, 0x77, 0x85, 0xc3, 0xbf, 0xcc, + 0x7b, 0x3a, 0x3b, 0x83, 0xb7, 0x4f, 0x7b, 0x44, 0xb5, 0xc8, 0x3e, 0xb3, + 0x5c, 0x5e, 0x2f, 0xd4, 0x87, 0x49, 0x54, 0x7b, 0x4a, 0xbb, 0x81, 0x54, + 0x41, 0x4e, 0x83, 0x37, 0x89, 0x67, 0xc4, 0x44, 0xcc, 0x48, 0x7d, 0x7c, + 0x94, 0x79, 0x93, 0xb6, 0x7c, 0xcb, 0x65, 0x45, 0x92, 0x61, 0x48, 0x5c, + 0xcd, 0x92, 0xb5, 0x67, 0x63, 0xb1, 0xd2, 0x9e, 0x99, 0xb5, 0xcb, 0x2c, + 0x58, 0xc1, 0x31, 0xc6, 0x72, 0x49, 0xd8, 0xc6, 0x76, 0xbe, 0x61, 0x59, + 0xaf, 0xc0, 0x4b, 0x3b, 0xa2, 0x5e, 0x35, 0xc3, 0xb5, 0x53, 0xc3, 0x7f, + 0xd5, 0xbc, 0xa9, 0xc5, 0x6a, 0x3a, 0xb9, 0xaa, 0x5f, 0x45, 0x46, 0xd3, + 0xb6, 0x5d, 0x81, 0xb4, 0xaf, 0x33, 0x35, 0xc6, 0x7c, 0x51, 0x35, 0x77, + 0x2f, 0xc1, 0x7a, 0xc1, 0x6c, 0xac, 0x94, 0x9b, 0x7d, 0x4c, 0xa4, 0xd1, + 0x4c, 0x6a, 0x58, 0x80, 0x4d, 0x95, 0x7a, 0x9a, 0x30, 0x66, 0x4f, 0x5e, + 0x7b, 0xa4, 0x74, 0x6e, 0x40, 0x4f, 0x3e, 0x6c, 0x9d, 0x40, 0x62, 0x8d, + 0xb7, 0xaf, 0xba, 0x4a, 0x3d, 0x95, 0x36, 0x53, 0x82, 0x78, 0x82, 0x81, + 0xab, 0x64, 0x5e, 0x8d, 0xcc, 0x92, 0xbd, 0x8d, 0x7f, 0x32, 0xd0, 0xa7, + 0x40, 0x2c, 0x39, 0xcd, 0x8d, 0x93, 0x85, 0x8d, 0x73, 0x7f, 0x9e, 0x58, + 0x73, 0x63, 0xa7, 0x69, 0xc5, 0x9f, 0xd0, 0xa3, 0x3f, 0x3f, 0x98, 0xba, + 0x88, 0xca, 0x34, 0xad, 0x9e, 0x83, 0x9e, 0x3b, 0x7e, 0x64, 0x8f, 0x5f, + 0x72, 0xb3, 0x8c, 0x66, 0x66, 0xab, 0x7c, 0x3e, 0x8b, 0x49, 0x97, 0x7a, + 0xc3, 0xc0, 0x51, 0x6a, 0x5a, 0x73, 0x72, 0xa1, 0x3a, 0xc1, 0x90, 0x98, + 0xb2, 0xa1, 0x65, 0x35, 0x9f, 0x84, 0xa2, 0xbd, 0x78, 0x87, 0xc3, 0x8f, + 0x32, 0x39, 0xa2, 0x8d, 0xc3, 0xb0, 0x3b, 0x95, 0xac, 0x3a, 0xc4, 0x4d, + 0xba, 0x52, 0x97, 0x4b, 0x3f, 0xb7, 0x6f, 0x51, 0xb8, 0x99, 0x39, 0x65, + 0x44, 0x82, 0xb4, 0x96, 0xc9, 0x47, 0x4a, 0xae, 0x5d, 0x9d, 0x78, 0x7d, + 0xa7, 0x84, 0xce, 0x95, 0x8d, 0xba, 0xcd, 0x6a, 0xaf, 0x81, 0x83, 0xa6, + 
0xa7, 0xb5, 0xa6, 0xd0, 0xbc, 0x7c, 0xb8, 0x8a, 0x46, 0xb6, 0x78, 0x4e, + 0x83, 0x38, 0x4e, 0x3e, 0x42, 0x7c, 0x9f, 0x78, 0x80, 0xc8, 0xba, 0x40, + 0x9f, 0xc8, 0x6c, 0x8a, 0x54, 0x2c, 0xc3, 0x38, 0x75, 0x97, 0x56, 0x69, + 0x54, 0x83, 0x95, 0xa6, 0x84, 0x9f, 0xc6, 0x5d, 0xbe, 0x4c, 0x6c, 0x6a, + 0x95, 0x72, 0x42, 0x63, 0x51, 0x5a, 0xaa, 0x48, 0x32, 0xcc, 0x89, 0xc5, + 0xc9, 0x5f, 0x38, 0x4f, 0x42, 0x3e, 0x44, 0xc5, 0x6d, 0x75, 0xcc, 0xc0, + 0x66, 0x6a, 0x3b, 0x57, 0x80, 0xc1, 0x47, 0xb6, 0xae, 0xb4, 0xaf, 0x59, + 0x6b, 0x82, 0x83, 0xc1, 0x99, 0x63, 0x54, 0x7f, 0xc8, 0xaa, 0xae, 0x69, + 0x83, 0x5f, 0x5e, 0xae, 0xa4, 0x4a, 0xb9, 0x72, 0xbb, 0xa6, 0xb7, 0x6f, + 0x42, 0x83, 0x43, 0x63, 0x62, 0xbe, 0xba, 0x39, 0x97, 0x4c, 0x58, 0xaf, + 0x3a, 0x51, 0xbb, 0x4f, 0x4c, 0xa5, 0x88, 0xbc, 0xac, 0x3b, 0x90, 0x3c, + 0x9f, 0x6e, 0xd0, 0x7d, 0x4a, 0x55, 0x7e, 0x9a, 0xa5, 0x44, 0x6d, 0x63, + 0xd0, 0x91, 0x57, 0xc4, 0x58, 0x30, 0x58, 0xa0, 0x79, 0x7f, 0xcb, 0x82, + 0x68, 0x5c, 0x7e, 0xac, 0x5e, 0x72, 0x46, 0xc1, 0x53, 0x84, 0xc3, 0x59, + 0x6a, 0x80, 0xb3, 0x7a, 0x6c, 0x99, 0x69, 0x78, 0xaa, 0x51, 0x34, 0x76, + 0x73, 0xa2, 0x39, 0x4f, 0x54, 0x80, 0x9a, 0xac, 0xd3, 0x6d, 0xc7, 0x86, + 0x88, 0x38, 0x35, 0x97, 0xb7, 0x7f, 0x35, 0x75, 0xb4, 0xc9, 0x62, 0xb2, + 0xac, 0xb9, 0xb3, 0xba, 0xdb, 0x56, 0x3f, 0xba, 0x3e, 0x89, 0x5d, 0x5b, + 0x98, 0x48, 0xb1, 0x54, 0x40, 0x7b, 0x8c, 0x9a, 0x4d, 0xb5, 0x79, 0x62, + 0xb6, 0x4d, 0x67, 0x71, 0xa3, 0x3b, 0x69, 0x6e, 0x30, 0x3f, 0xd4, 0x57, + 0x82, 0x3a, 0x7b, 0x72, 0xc1, 0x36, 0x74, 0xa8, 0x91, 0x90, 0xb5, 0x9b, + 0x73, 0x66, 0x7c, 0x33, 0xcf, 0xaa, 0x9e, 0xac, 0x38, 0x51, 0xce, 0xd0, + 0x7f, 0x8e, 0x53, 0x79, 0x69, 0x2e, 0x78, 0x41, 0x70, 0xb6, 0x98, 0xc5, + 0x8a, 0x66, 0xc9, 0x9c, 0x5d, 0x6f, 0xae, 0xa8, 0x26, 0x47, 0x5e, 0x8a, + 0x34, 0x5d, 0x56, 0x89, 0x60, 0xb3, 0x87, 0x58, 0x68, 0x8f, 0xc0, 0x51, + 0xb0, 0x93, 0x9e, 0x97, 0x54, 0xc7, 0xb3, 0x93, 0x9b, 0xc9, 0x5c, 0x5a, + 0x34, 0x80, 0xc0, 0x9f, 0xb4, 0x84, 0x4f, 0xc1, 0x89, 0x8c, 0xd5, 0x6d, + 0x42, 0xaf, 0x64, 0x74, 0x99, 0x5d, 0x7b, 0x75, 0x66, 0x80, 0x9a, 0xb6, + 0x9f, 0x45, 0xb8, 0xd3, 0x7b, 0x52, 0x5c, 0x98, 0x7a, 0xb1, 0x71, 0x97, + 0xc4, 0x59, 0xca, 0xb9, 0x79, 0x85, 0x38, 0x80, 0x76, 0x93, 0xc3, 0xbe, + 0xac, 0x50, 0xb1, 0xa0, 0xbb, 0x5d, 0xbb, 0x7a, 0x82, 0x3d, 0x8a, 0x7f, + 0x40, 0x48, 0x55, 0xaf, 0xc4, 0x3b, 0xcf, 0x66, 0x4d, 0x58, 0x53, 0xa9, + 0x8d, 0x42, 0x43, 0x8c, 0xad, 0x77, 0xa6, 0x95, 0x79, 0x32, 0x47, 0x9f, + 0x2f, 0xaf, 0xd1, 0x63, 0x92, 0xb4, 0x3d, 0x79, 0x85, 0xb5, 0x99, 0x8b, + 0x8d, 0x7c, 0xb2, 0x3f, 0x5f, 0x61, 0xb2, 0xcf, 0x88, 0x6d, 0xbe, 0xd2, + 0x67, 0x94, 0xd2, 0x64, 0x4c, 0x9a, 0xba, 0xbb, 0xa5, 0x90, 0x32, 0x38, + 0xac, 0x7b, 0x99, 0xbb, 0x5a, 0x4b, 0xbf, 0xb8, 0x34, 0x9d, 0xcf, 0xaf, + 0xbf, 0x97, 0x70, 0x52, 0xca, 0x94, 0x98, 0x44, 0x42, 0xa5, 0xc0, 0xca, + 0xb6, 0xba, 0x56, 0x92, 0x6d, 0x46, 0xd2, 0xb4, 0x44, 0x90, 0x61, 0x34, + 0xbf, 0x48, 0xd0, 0x6a, 0x9d, 0x43, 0xde, 0xa8, 0x8f, 0x2e, 0x68, 0xcb, + 0xa5, 0xce, 0xd3, 0xc7, 0xb6, 0xbf, 0xa0, 0x3e, 0x41, 0xd0, 0xaa, 0x9c, + 0x9e, 0x6b, 0x58, 0xcb, 0x9a, 0xb2, 0x6c, 0xc2, 0x39, 0xcb, 0x80, 0xa0, + 0x62, 0x70, 0xbd, 0x5e, 0x87, 0x57, 0x88, 0xc3, 0x9d, 0xa7, 0x26, 0x97, + 0xb5, 0x52, 0xbe, 0x78, 0x9c, 0x50, 0x57, 0x8a, 0x31, 0xa8, 0x35, 0xa1, + 0x87, 0x3a, 0x6d, 0x53, 0xb5, 0x40, 0xb8, 0x32, 0xb8, 0xa5, 0x9a, 0x7f, + 0x3e, 0xb6, 0xa5, 0x61, 0x90, 0xad, 0x32, 0x7f, 0x30, 0x6b, 0x41, 0x2e, + 0x3b, 0x34, 0x48, 0xa0, 0x5e, 0x84, 0xbf, 0xa4, 0x70, 0x59, 0xbf, 0x55, + 0xa9, 0x84, 0x4d, 0xbd, 0xb9, 0x6b, 0x4d, 0x78, 0x6a, 0x41, 0xbd, 0x41, + 
0x5a, 0x94, 0x66, 0x62, 0x89, 0xb5, 0x4f, 0x52, 0xc3, 0x51, 0x38, 0x3f, + 0x62, 0x5e, 0x94, 0xc2, 0x86, 0x88, 0x2a, 0x7a, 0xc2, 0x9e, 0x41, 0x75, + 0xc3, 0x94, 0x43, 0xa7, 0xd1, 0x4d, 0xa2, 0x49, 0x9c, 0xa9, 0x85, 0x5c, + 0x86, 0x40, 0x7f, 0x58, 0x9e, 0xb4, 0x44, 0x71, 0xa2, 0x5d, 0x2f, 0x77, + 0x93, 0x9b, 0xcd, 0xc2, 0x77, 0x80, 0xac, 0x4c, 0x95, 0x7a, 0xc9, 0x54, + 0xaf, 0x57, 0xdb, 0x48, 0x67, 0x4c, 0xbc, 0xc6, 0xae, 0xc7, 0x44, 0x7f, + 0x77, 0x85, 0xac, 0x5b, 0x75, 0x59, 0x52, 0x3c, 0x81, 0xa3, 0x67, 0xd2, + 0x66, 0x70, 0x89, 0xb3, 0x37, 0xbe, 0x52, 0x33, 0xc4, 0xb7, 0x40, 0x8c, + 0x94, 0xb6, 0x7b, 0x3d, 0xa8, 0x9a, 0x8f, 0x80, 0x6e, 0x91, 0xa1, 0xb8, + 0x5b, 0x2f, 0x62, 0x40, 0xb3, 0xa4, 0x55, 0xb3, 0x7f, 0x6e, 0x7e, 0x7b, + 0xb4, 0x3e, 0xbb, 0x5f, 0x32, 0x79, 0x99, 0x8a, 0x58, 0x65, 0x6d, 0x99, + 0xd4, 0xad, 0x9a, 0xbc, 0x85, 0x99, 0xae, 0x7f, 0xd6, 0xd1, 0x81, 0xb6, + 0x4b, 0xaf, 0x5c, 0x4c, 0x92, 0x53, 0xb5, 0x62, 0x92, 0x4d, 0xd2, 0x90, + 0x47, 0xdf, 0xe4, 0x45, 0x53, 0x95, 0xa9, 0xb9, 0x6f, 0xac, 0x57, 0xc9, + 0xb7, 0x61, 0x6c, 0x99, 0xd6, 0x45, 0x60, 0x40, 0x70, 0xc3, 0xb0, 0x71, + 0x66, 0xac, 0x91, 0xb5, 0x50, 0xb4, 0x74, 0x4a, 0x90, 0x32, 0x54, 0xa9, + 0x87, 0x44, 0x96, 0x91, 0xcd, 0xbe, 0xb8, 0x77, 0xb9, 0xa6, 0xc6, 0x8a, + 0xa7, 0x98, 0xc1, 0xca, 0xa1, 0x73, 0x82, 0x4f, 0x40, 0x3d, 0x9e, 0xab, + 0x42, 0x5c, 0x91, 0x77, 0xba, 0x76, 0xab, 0xba, 0xe1, 0xb2, 0x87, 0x6e, + 0x93, 0x7c, 0xae, 0x3d, 0x9e, 0xb1, 0x56, 0x9d, 0xc2, 0x39, 0xc2, 0x74, + 0xa4, 0x5f, 0x62, 0x75, 0x7a, 0x4a, 0x30, 0xb8, 0x73, 0x66, 0x51, 0xa0, + 0xa4, 0x5e, 0x56, 0xb3, 0xb0, 0x98, 0x9a, 0x3d, 0xb5, 0x4b, 0xad, 0x45, + 0x70, 0xa8, 0x84, 0x6c, 0xea, 0xc4, 0x7a, 0x44, 0x95, 0x84, 0x5d, 0xc2, + 0x37, 0xb2, 0x8a, 0x37, 0xc5, 0x77, 0xc7, 0x8e, 0x3d, 0x88, 0x50, 0x6a, + 0x41, 0x55, 0x93, 0x9c, 0x73, 0xc9, 0xcb, 0xc2, 0xc9, 0x62, 0x72, 0xbf, + 0x87, 0x6d, 0x82, 0x8e, 0x4d, 0x5c, 0x7e, 0x78, 0x67, 0xbf, 0x89, 0xc9, + 0xb5, 0xa8, 0x89, 0xc6, 0x59, 0x50, 0xa4, 0xc1, 0x3b, 0x46, 0x56, 0x3d, + 0x77, 0x9a, 0x69, 0x87, 0x3e, 0x26, 0x7d, 0x74, 0xa3, 0x6b, 0x68, 0x46, + 0x5f, 0x14, 0x31, 0x95, 0xa3, 0x60, 0x82, 0x8b, 0x4f, 0xb6, 0xb6, 0xca, + 0xc3, 0xb5, 0x49, 0xc4, 0x8f, 0x9e, 0x2c, 0x77, 0xbe, 0x64, 0x4a, 0xa3, + 0x79, 0x69, 0x99, 0x94, 0x55, 0x4a, 0x8d, 0x7f, 0x56, 0x52, 0x8e, 0x67, + 0x84, 0x40, 0x66, 0x9d, 0x50, 0x9a, 0xc8, 0x38, 0x62, 0xa0, 0x49, 0x52, + 0xa1, 0x39, 0xa0, 0x58, 0x73, 0xad, 0xbf, 0xdb, 0xc7, 0x7e, 0xa5, 0x82, + 0x30, 0x41, 0xb5, 0xb6, 0x4a, 0x4f, 0x28, 0xb2, 0xb5, 0x82, 0xc9, 0x44, + 0x5f, 0xc8, 0x2a, 0x53, 0x98, 0xd8, 0x60, 0xca, 0xb7, 0xd6, 0xba, 0x58, + 0x4a, 0x79, 0xc8, 0x3d, 0xc0, 0xd2, 0x6b, 0x83, 0x53, 0x47, 0x39, 0x51, + 0x67, 0x7a, 0xb5, 0x70, 0x9f, 0x7d, 0x2e, 0xbb, 0x27, 0x49, 0x8b, 0xa3, + 0x72, 0xb6, 0xa0, 0xc8, 0x38, 0x6d, 0x9f, 0x60, 0xa7, 0x97, 0x40, 0x43, + 0x49, 0x8c, 0x4e, 0x55, 0x7c, 0x9c, 0x4e, 0x3f, 0x83, 0xae, 0x7c, 0xa8, + 0xbb, 0x44, 0x4c, 0xc4, 0xd4, 0xa9, 0x74, 0xa3, 0x92, 0x2c, 0x93, 0x54, + 0x55, 0x25, 0x54, 0x38, 0xb8, 0xbd, 0x8a, 0x9e, 0x7d, 0x55, 0xa6, 0x79, + 0xcc, 0x65, 0x49, 0x7b, 0x73, 0x99, 0x43, 0x92, 0x65, 0x51, 0x84, 0x88, + 0x3a, 0x76, 0xa2, 0x80, 0x7a, 0x4d, 0x98, 0xc6, 0x76, 0x42, 0x50, 0x43, + 0x5e, 0xc1, 0x74, 0xc5, 0x77, 0x95, 0x4d, 0x8b, 0xbe, 0xbe, 0x84, 0x6b, + 0xac, 0x37, 0x62, 0x7a, 0x3a, 0x9e, 0x7e, 0x46, 0xc3, 0x95, 0x69, 0xb5, + 0x56, 0x58, 0x29, 0x5e, 0xa6, 0x43, 0xc8, 0x47, 0x8b, 0xaa, 0xb6, 0xbf, + 0xc6, 0x72, 0x35, 0x84, 0x7f, 0x9e, 0x5f, 0x6b, 0xb0, 0xa2, 0x9a, 0xcb, + 0xa3, 0x9f, 0x84, 0x7d, 0x56, 0x9e, 0x72, 0x5a, 0xae, 0x6a, 0x64, 0x60, + 
0x8a, 0x72, 0xcb, 0xe2, 0x88, 0x63, 0x37, 0xa2, 0x58, 0x82, 0xc7, 0xb6, + 0x3f, 0x3a, 0x6d, 0x98, 0xbf, 0x92, 0xa5, 0x3b, 0xbb, 0xcf, 0x8c, 0xc6, + 0xd6, 0xa7, 0x9b, 0x52, 0xc7, 0x86, 0x9c, 0xb0, 0x57, 0x5f, 0x7a, 0x70, + 0x42, 0xb4, 0x55, 0xd0, 0x7a, 0x34, 0xab, 0x95, 0x71, 0xbb, 0xab, 0x81, + 0x5e, 0x78, 0xb9, 0x49, 0x70, 0xc5, 0xd2, 0x5e, 0xc9, 0xbd, 0x83, 0x4e, + 0xa8, 0x47, 0x5d, 0x98, 0xa2, 0xbf, 0x63, 0x9c, 0x71, 0xaa, 0xc6, 0xc2, + 0x38, 0x7d, 0x76, 0x4d, 0x9f, 0xa7, 0x4b, 0x92, 0x6e, 0xaa, 0xef, 0xe5, + 0x70, 0xac, 0xb3, 0xb2, 0x53, 0x55, 0x90, 0x3c, 0xa7, 0xb4, 0x4c, 0x88, + 0xd0, 0x23, 0x67, 0x45, 0x57, 0x4f, 0x92, 0xc5, 0x7a, 0x8a, 0xd2, 0x83, + 0x53, 0xc3, 0x41, 0xce, 0x72, 0x7c, 0x22, 0xd6, 0xcd, 0xc9, 0xc8, 0xae, + 0x8d, 0x69, 0x44, 0x92, 0xab, 0xa6, 0x56, 0x6b, 0x7a, 0xa4, 0x5e, 0x79, + 0x3e, 0xcf, 0xcf, 0x37, 0x43, 0x51, 0xb9, 0xd3, 0x48, 0x4e, 0x87, 0x81, + 0xaa, 0x77, 0x44, 0x86, 0xb5, 0x6f, 0x3c, 0x4d, 0xa8, 0xa8, 0x83, 0x5d, + 0x3e, 0xbd, 0x4a, 0x49, 0xa1, 0x4e, 0x75, 0x96, 0x4e, 0x31, 0x94, 0xaa, + 0xc5, 0x37, 0x4c, 0x61, 0x3c, 0x45, 0x40, 0xcb, 0x53, 0x9a, 0xca, 0x37, + 0xa0, 0xcc, 0x73, 0x5e, 0xcd, 0x7f, 0x50, 0x5a, 0x3c, 0xc7, 0xc3, 0x48, + 0xb8, 0x52, 0x3f, 0xb4, 0x4a, 0x38, 0xa1, 0x50, 0x32, 0xab, 0xc8, 0x70, + 0x47, 0xbe, 0x9f, 0x39, 0xb8, 0xab, 0x5e, 0xd4, 0x54, 0x92, 0x3d, 0x3c, + 0x37, 0x7a, 0x94, 0xbe, 0xcf, 0x45, 0x50, 0xa0, 0x42, 0x34, 0x75, 0x57, + 0xa7, 0x97, 0x80, 0x89, 0x89, 0x58, 0xc2, 0x3f, 0x36, 0x9d, 0x40, 0x96, + 0xb5, 0xc0, 0x33, 0x91, 0x62, 0x44, 0x75, 0x90, 0x5b, 0x6c, 0x76, 0xc6, + 0x5c, 0x98, 0x4a, 0x9b, 0x43, 0x5c, 0x82, 0x89, 0xc2, 0x2e, 0x3b, 0xd4, + 0x74, 0xa6, 0x8d, 0x95, 0x6b, 0x8d, 0x6c, 0x82, 0x92, 0xcc, 0x9f, 0xba, + 0x5e, 0x5d, 0x7f, 0x6e, 0x3a, 0x31, 0x7a, 0x68, 0x98, 0xc5, 0x9d, 0x84, + 0x8f, 0x45, 0x97, 0x44, 0x78, 0x8e, 0xc1, 0x9c, 0x39, 0x7a, 0x35, 0x65, + 0x9a, 0x94, 0x76, 0xad, 0x92, 0x86, 0xd2, 0xad, 0xca, 0x97, 0x46, 0x9f, + 0xb4, 0x58, 0xc9, 0x37, 0x9f, 0xc3, 0x48, 0xce, 0x39, 0x36, 0xcf, 0x5b, + 0x3c, 0xb8, 0xce, 0xa8, 0x9b, 0x89, 0xca, 0x35, 0xac, 0x6e, 0x91, 0xbf, + 0xc2, 0x69, 0x5e, 0x35, 0x4a, 0x70, 0xa4, 0x72, 0x92, 0x44, 0x96, 0x6f, + 0x5b, 0xbb, 0x4a, 0xb8, 0x42, 0xc3, 0x76, 0xb7, 0x96, 0xd1, 0x68, 0x43, + 0x3e, 0xb3, 0x9a, 0x7b, 0xcd, 0x94, 0x57, 0x70, 0x5a, 0x38, 0x4c, 0x4d, + 0x7e, 0x40, 0x60, 0x73, 0x5f, 0x39, 0xc5, 0x92, 0x44, 0xc0, 0xa1, 0xa0, + 0x35, 0x47, 0xaa, 0x49, 0x32, 0x42, 0x40, 0xa4, 0x35, 0x7a, 0x91, 0x37, + 0x86, 0x69, 0xa4, 0x49, 0x8e, 0x61, 0x86, 0xb8, 0x6b, 0x7e, 0xbf, 0xc0, + 0xbd, 0x8b, 0x74, 0x6a, 0xba, 0x62, 0x8a, 0x8d, 0xce, 0x87, 0xa4, 0x91, + 0xb8, 0xca, 0xaa, 0xc7, 0x72, 0xc8, 0x3f, 0x48, 0x77, 0x99, 0x8e, 0x5f, + 0xa5, 0x52, 0x84, 0xb7, 0x5f, 0x9f, 0x36, 0x8c, 0x97, 0xa0, 0xba, 0xcb, + 0x5f, 0xc8, 0x4a, 0x3c, 0x4d, 0x96, 0x55, 0x3f, 0xa5, 0xab, 0xad, 0x5c, + 0x8a, 0x78, 0x55, 0xbc, 0x3c, 0x85, 0x3d, 0xa6, 0xc6, 0x49, 0x35, 0xcf, + 0x9d, 0x5a, 0x8c, 0x43, 0x43, 0xbf, 0xc2, 0xa8, 0x37, 0x4f, 0x31, 0x45, + 0x65, 0xc0, 0x60, 0xa4, 0x5e, 0xa7, 0x99, 0x48, 0x83, 0x36, 0x4b, 0x7f, + 0x9c, 0x43, 0xb5, 0xcf, 0x42, 0x3c, 0xd1, 0xb9, 0x69, 0xc6, 0x58, 0xbf, + 0x8c, 0x9c, 0xd4, 0x8a, 0x6a, 0xd0, 0x9e, 0xa7, 0xb9, 0x8e, 0xa2, 0xbe, + 0xbf, 0x50, 0x6c, 0xca, 0xa4, 0x9d, 0x53, 0xca, 0x68, 0xc1, 0x64, 0x4e, + 0x68, 0xb3, 0x8e, 0xc1, 0x5c, 0x8e, 0xd3, 0x3b, 0xad, 0x74, 0x54, 0xa4, + 0xbe, 0x6e, 0xa9, 0x3b, 0xbf, 0x96, 0x87, 0xc6, 0x71, 0x3c, 0x33, 0x43, + 0x5b, 0x62, 0x4f, 0x6f, 0x67, 0xc9, 0xa3, 0x4f, 0x47, 0x59, 0xbf, 0x9f, + 0x55, 0xa5, 0x9c, 0x5d, 0x5c, 0xa7, 0x7c, 0x5f, 0xb0, 0x91, 0x44, 0xaf, + 
0x8f, 0xad, 0xaf, 0x36, 0x85, 0xc0, 0x6a, 0x5f, 0x6d, 0xa9, 0xa9, 0x4b, + 0x97, 0x6f, 0x9f, 0x95, 0x8c, 0x2e, 0xcc, 0x61, 0x8e, 0x6f, 0x72, 0x38, + 0x51, 0xc1, 0xce, 0x48, 0x90, 0xab, 0x53, 0x78, 0x4f, 0xc0, 0x92, 0x5a, + 0xc9, 0x90, 0x81, 0x3e, 0x6b, 0xae, 0x9d, 0x7b, 0x72, 0x7e, 0xd1, 0xb5, + 0xcb, 0x30, 0x4b, 0x55, 0x96, 0x54, 0xba, 0x9d, 0xbc, 0x66, 0x32, 0xd0, + 0x58, 0xca, 0xcb, 0x8b, 0xb2, 0xbc, 0x72, 0xca, 0x79, 0x96, 0x3b, 0x6c, + 0xc3, 0x73, 0x52, 0x81, 0x9e, 0x78, 0x6c, 0x9f, 0x7e, 0x7a, 0x96, 0x51, + 0x96, 0x4f, 0x73, 0x4c, 0x36, 0x84, 0x4a, 0xcb, 0x7e, 0x6e, 0x43, 0x74, + 0x56, 0x3e, 0x3c, 0xd3, 0x44, 0x3c, 0x81, 0x76, 0xd3, 0xa0, 0x74, 0x6c, + 0xcc, 0xb3, 0xc7, 0xd4, 0x37, 0xc3, 0x5a, 0xc8, 0xbb, 0xae, 0x89, 0x5c, + 0xa0, 0x80, 0xc0, 0x3e, 0x3f, 0x7e, 0xa8, 0x88, 0x8c, 0x9d, 0x46, 0x3c, + 0x63, 0x9d, 0xaf, 0x50, 0xc4, 0x98, 0x52, 0xb8, 0x71, 0x9d, 0x34, 0x41, + 0xa1, 0x7b, 0x66, 0xd2, 0x77, 0x80, 0x69, 0xcb, 0x3a, 0xbf, 0x4a, 0x60, + 0xb9, 0x60, 0x5c, 0x5b, 0x95, 0x61, 0xc6, 0xc6, 0x98, 0x61, 0x81, 0x47, + 0xd4, 0xd4, 0x91, 0x7b, 0xcf, 0xad, 0xc5, 0x64, 0xc7, 0x5b, 0xc5, 0x3d, + 0x71, 0x9f, 0xb2, 0x37, 0x46, 0x8a, 0x90, 0x71, 0x87, 0x79, 0xc2, 0x4e, + 0x48, 0xb1, 0xb1, 0xd7, 0x88, 0x6e, 0x63, 0x81, 0x7f, 0x6e, 0x67, 0x43, + 0xd2, 0xbf, 0xc7, 0x84, 0x6a, 0x7c, 0x60, 0x70, 0x65, 0x53, 0x9a, 0x8b, + 0x87, 0x92, 0x96, 0x70, 0x91, 0x77, 0x42, 0x4c, 0x87, 0x86, 0x5c, 0x5f, + 0x5c, 0xa2, 0x37, 0x73, 0xad, 0x60, 0x38, 0x83, 0x74, 0x38, 0x43, 0x3b, + 0x54, 0x38, 0x4a, 0x37, 0x69, 0x7d, 0x3e, 0x75, 0xa7, 0x40, 0x4c, 0x77, + 0xaa, 0xcd, 0x86, 0x63, 0xcb, 0x7a, 0x52, 0x6e, 0xb9, 0x7d, 0xaf, 0xcf, + 0x5d, 0x6f, 0xa7, 0x52, 0x67, 0x50, 0x8b, 0x84, 0xc9, 0x96, 0x90, 0x87, + 0x8e, 0x8d, 0x5f, 0x4a, 0xc0, 0x5f, 0x86, 0x8f, 0x37, 0xaa, 0x62, 0x7a, + 0x9a, 0xcc, 0x41, 0xbb, 0x6c, 0x7f, 0x3c, 0x59, 0x7a, 0x92, 0x68, 0x4f, + 0x81, 0x56, 0x3a, 0x44, 0xa0, 0xb4, 0xcf, 0xc7, 0x38, 0x82, 0xa0, 0x85, + 0xb9, 0x92, 0xa0, 0x6c, 0x60, 0xa2, 0x98, 0xd3, 0x88, 0x83, 0x90, 0x4c, + 0x9a, 0x80, 0xcb, 0x8a, 0x42, 0xcc, 0xb4, 0xba, 0x62, 0x6c, 0x47, 0xa7, + 0x7f, 0x40, 0x58, 0x8d, 0x8a, 0x51, 0xb3, 0xc2, 0x6d, 0xd5, 0x68, 0x97, + 0xc7, 0x85, 0x7f, 0x5c, 0xbc, 0x43, 0x70, 0xb8, 0xb3, 0x87, 0xbd, 0x94, + 0x45, 0x51, 0x5f, 0xd1, 0x4c, 0xba, 0xd3, 0xc3, 0x99, 0x4e, 0x81, 0x6e, + 0xc3, 0x78, 0xca, 0xb7, 0xc5, 0x4a, 0x8b, 0x42, 0x7a, 0xc4, 0x5c, 0x89, + 0x52, 0x92, 0x96, 0x83, 0x80, 0x44, 0xb7, 0x6e, 0xae, 0x90, 0x83, 0x77, + 0x9d, 0x82, 0xc5, 0xb9, 0x8d, 0x97, 0xcf, 0xbb, 0xb3, 0xad, 0xc2, 0xd1, + 0x8d, 0x31, 0x4e, 0x64, 0x82, 0x83, 0x9f, 0x42, 0x77, 0x3d, 0xa4, 0xc0, + 0x84, 0x72, 0xa5, 0x4f, 0xa9, 0x6d, 0x3a, 0x4f, 0x89, 0x4b, 0xc0, 0xc8, + 0x72, 0x73, 0x6b, 0x78, 0x61, 0x87, 0x90, 0x36, 0xd3, 0xbc, 0x6b, 0x9a, + 0x9e, 0x9f, 0x4c, 0xaa, 0x2e, 0x9c, 0xb2, 0xc0, 0xa6, 0x38, 0x3c, 0x60, + 0x56, 0x60, 0x70, 0x71, 0xc5, 0x4f, 0x57, 0x3b, 0x53, 0x3d, 0xb2, 0xb3, + 0x6e, 0xb1, 0x4f, 0x70, 0xbe, 0x32, 0x55, 0x4c, 0x6b, 0x99, 0x62, 0x3a, + 0x5b, 0x81, 0xc2, 0x73, 0x8c, 0x6a, 0xb5, 0x5f, 0x59, 0x30, 0x88, 0x70, + 0x6a, 0x84, 0xb4, 0x68, 0x80, 0xa1, 0xbc, 0xd0, 0xc9, 0xaa, 0x40, 0xb3, + 0x8b, 0xe9, 0x73, 0x45, 0x52, 0xad, 0x82, 0x7f, 0x3a, 0x69, 0x72, 0xca, + 0x74, 0x42, 0x54, 0x4e, 0x72, 0x55, 0x56, 0xc4, 0x97, 0xa4, 0x38, 0x9c, + 0xc6, 0x96, 0x9c, 0x7f, 0xa0, 0x8b, 0x6c, 0xbc, 0xcc, 0xa4, 0x67, 0x4e, + 0x9c, 0x5c, 0x75, 0x4e, 0x61, 0xda, 0xc0, 0x6e, 0x79, 0xa6, 0x82, 0xbf, + 0x38, 0x51, 0xb7, 0x95, 0xb2, 0x56, 0xa0, 0x86, 0x7c, 0x8f, 0x6e, 0x52, + 0x3e, 0xc8, 0x3b, 0x38, 0x3f, 0x9e, 0xb1, 0xa3, 0x57, 0x47, 0x84, 0x5c, + 
0xbc, 0x9a, 0x3e, 0xca, 0x9a, 0x93, 0xc3, 0x90, 0xbb, 0x5d, 0x5d, 0x80, + 0xaa, 0x66, 0x74, 0x40, 0x9a, 0xd1, 0xb6, 0x96, 0x65, 0x59, 0x47, 0x3a, + 0x7d, 0x73, 0x5f, 0xa3, 0x3e, 0x5a, 0x6d, 0x84, 0xa9, 0xc5, 0x29, 0x44, + 0x3e, 0x66, 0x56, 0x98, 0xac, 0x3d, 0x5a, 0x63, 0x7f, 0xdd, 0x98, 0xb8, + 0x63, 0x68, 0x74, 0x74, 0x5c, 0x6d, 0xaa, 0x67, 0x37, 0x8a, 0xad, 0x68, + 0xbe, 0xb5, 0xcc, 0xc4, 0xa2, 0x61, 0x49, 0x56, 0xa0, 0xc6, 0xaa, 0x44, + 0xbb, 0x9e, 0x6c, 0x99, 0xa2, 0xbd, 0x3b, 0x7d, 0xc4, 0xa9, 0x63, 0xc9, + 0x6e, 0x90, 0x6c, 0xa9, 0xe0, 0x8f, 0x36, 0xad, 0x86, 0x2c, 0xbc, 0x7a, + 0x9c, 0xae, 0x35, 0x57, 0xa9, 0x40, 0x52, 0x3a, 0x6c, 0x9b, 0x95, 0x2f, + 0x6f, 0xbe, 0xa5, 0x2c, 0x4d, 0xaa, 0xdb, 0xbb, 0xbe, 0x55, 0x44, 0x9f, + 0x87, 0x71, 0x3f, 0x5a, 0x5e, 0x9a, 0x6a, 0x55, 0x3f, 0x72, 0xd1, 0x8a, + 0x5c, 0x46, 0x78, 0x70, 0xc8, 0x74, 0xb6, 0x70, 0x64, 0x7f, 0xaf, 0x4a, + 0x4c, 0xbf, 0xa2, 0x97, 0x88, 0x5a, 0x7f, 0x32, 0xb3, 0x5a, 0xbd, 0x31, + 0x42, 0x39, 0x51, 0x9d, 0xb1, 0xcc, 0x41, 0x97, 0xa6, 0xb7, 0x5e, 0x72, + 0x61, 0x64, 0xa0, 0x6a, 0xb0, 0x69, 0xbd, 0x93, 0x4c, 0xa7, 0x70, 0x3a, + 0x6a, 0x8d, 0xbc, 0x40, 0x79, 0x7a, 0xc5, 0x8a, 0xb5, 0x8d, 0x6c, 0x4a, + 0x3e, 0x97, 0xa6, 0xc0, 0x48, 0xbf, 0x74, 0x7c, 0xb6, 0x81, 0xb8, 0xbd, + 0x65, 0x72, 0x8d, 0xb0, 0xa9, 0x3c, 0x55, 0xbe, 0xc3, 0x6e, 0x72, 0x49, + 0xaa, 0x7f, 0x8d, 0x67, 0x9c, 0x93, 0x91, 0x85, 0x9c, 0x88, 0x39, 0x4a, + 0x9e, 0xa4, 0xb1, 0xb4, 0x35, 0xb3, 0x8f, 0xc6, 0x9e, 0x38, 0x3c, 0xc1, + 0x83, 0x7b, 0xce, 0xad, 0xc4, 0x54, 0x59, 0x48, 0xce, 0xa6, 0x3f, 0x34, + 0xc3, 0x9f, 0xd1, 0x39, 0xaa, 0x95, 0x4b, 0x62, 0x40, 0xac, 0x91, 0x4a, + 0x59, 0xc2, 0x36, 0x43, 0xad, 0xb8, 0x98, 0xa5, 0xa5, 0x7a, 0x97, 0x6d, + 0x53, 0x40, 0x8a, 0xb5, 0x7e, 0x36, 0xbd, 0xc4, 0x60, 0x87, 0x4a, 0x57, + 0x37, 0xa8, 0xb0, 0xd5, 0x84, 0xb9, 0xc7, 0x63, 0x95, 0x89, 0xc4, 0xd9, + 0x87, 0xbb, 0x75, 0xbb, 0x66, 0xc6, 0xbc, 0xdc, 0x84, 0x51, 0x58, 0x57, + 0x85, 0xd9, 0xc8, 0x3f, 0x89, 0xa8, 0x57, 0xa6, 0xc5, 0x52, 0x42, 0x7b, + 0x5f, 0x50, 0xbf, 0x81, 0x88, 0x6a, 0x99, 0xce, 0x31, 0x3e, 0x56, 0x36, + 0xbf, 0xb6, 0xcb, 0x58, 0xc2, 0x63, 0x90, 0x35, 0x95, 0x47, 0xa2, 0x59, + 0xa2, 0xc6, 0xaa, 0x27, 0xa6, 0x62, 0x4e, 0x6f, 0x9c, 0x59, 0x71, 0xa5, + 0x97, 0x4e, 0xcc, 0xaa, 0x72, 0xbc, 0x67, 0x6c, 0x4d, 0xa7, 0x88, 0xaf, + 0x35, 0x3c, 0xb8, 0xba, 0x5c, 0x9a, 0x99, 0x4f, 0x89, 0x9f, 0xcb, 0xa1, + 0xcc, 0x94, 0x3a, 0x8a, 0xcb, 0x81, 0x4e, 0x48, 0x33, 0x65, 0x44, 0xb0, + 0x44, 0x92, 0x44, 0x66, 0x3b, 0x9c, 0x85, 0x81, 0x9e, 0xb0, 0xb5, 0x6c, + 0x80, 0x43, 0x4d, 0x7e, 0xd5, 0x95, 0xd3, 0xc7, 0xd2, 0x7b, 0x4b, 0xcf, + 0xae, 0x8f, 0xb0, 0x82, 0xbf, 0xa9, 0xcc, 0x96, 0xb9, 0xd5, 0x64, 0xda, + 0xd2, 0x70, 0x8f, 0x76, 0x5c, 0x49, 0x61, 0x5a, 0xb3, 0x64, 0x80, 0x83, + 0x57, 0x4d, 0x8e, 0xcc, 0x55, 0x4d, 0xc4, 0x62, 0x8f, 0x70, 0x91, 0x5e, + 0x68, 0x4d, 0xa1, 0x3a, 0xcc, 0x74, 0x46, 0x6b, 0x5b, 0x3e, 0xd3, 0x69, + 0x55, 0xcf, 0x51, 0x8f, 0x4b, 0x6d, 0xc5, 0xcb, 0xc6, 0xcd, 0x39, 0x58, + 0x9c, 0x96, 0x8c, 0xc0, 0x88, 0xbf, 0x45, 0xc0, 0xaf, 0xb9, 0x87, 0xa8, + 0xbc, 0xb1, 0x68, 0xc4, 0x7c, 0xb9, 0xad, 0xb5, 0x6f, 0x79, 0x84, 0x72, + 0x8f, 0xb0, 0x98, 0x91, 0x43, 0x6c, 0xc4, 0x4d, 0xa1, 0x42, 0x83, 0x8f, + 0xc4, 0xce, 0x31, 0x35, 0xb6, 0x35, 0xce, 0x63, 0x40, 0x84, 0x5e, 0x4f, + 0x52, 0x57, 0x7c, 0xa1, 0x36, 0x58, 0xd7, 0xc3, 0x9d, 0x4f, 0xa9, 0x5c, + 0x33, 0x75, 0x82, 0x9f, 0x7c, 0xd6, 0x48, 0x91, 0x83, 0x57, 0xb7, 0xb3, + 0x33, 0x91, 0xc5, 0x7f, 0x37, 0x4a, 0x32, 0xa7, 0x52, 0x4a, 0x9d, 0x5b, + 0xac, 0x87, 0x38, 0x3d, 0xae, 0x7e, 0x59, 0x48, 0x36, 0xa4, 0x88, 0x42, + 
0x97, 0x9d, 0x37, 0x61, 0x94, 0x60, 0x72, 0x75, 0x95, 0xaf, 0x55, 0x89, + 0x5b, 0x6c, 0x94, 0x57, 0x65, 0x43, 0x83, 0xc5, 0x62, 0x7e, 0x6d, 0xa2, + 0x5f, 0xd1, 0x6d, 0x98, 0x93, 0xb5, 0xa6, 0xcb, 0x3c, 0x58, 0xae, 0x45, + 0x91, 0x31, 0x46, 0xce, 0x95, 0x70, 0x8f, 0x48, 0x4d, 0x4f, 0x54, 0xc8, + 0xca, 0xae, 0x32, 0x6b, 0x4c, 0x5d, 0xac, 0xbd, 0x72, 0x8d, 0x76, 0xa0, + 0x6f, 0x63, 0xc2, 0x6b, 0xc2, 0x5b, 0x84, 0xc2, 0xb0, 0x8b, 0x43, 0x7d, + 0xb7, 0x80, 0x4e, 0x44, 0xc1, 0xc9, 0xa9, 0xa2, 0x6d, 0x84, 0x7c, 0x6e, + 0xcc, 0x76, 0x82, 0x4c, 0x69, 0xc4, 0x8b, 0x46, 0x6a, 0x67, 0x89, 0x8d, + 0x8a, 0xd2, 0x39, 0x34, 0x46, 0x6b, 0x5e, 0xd7, 0x6e, 0x36, 0xc9, 0x7e, + 0xab, 0xcc, 0x83, 0x7c, 0xb3, 0xc5, 0xc7, 0x4e, 0xbf, 0xdb, 0xb6, 0x64, + 0x34, 0x9a, 0x45, 0x6f, 0x4a, 0x3d, 0x2d, 0x4c, 0x6e, 0x5d, 0xc9, 0x9b, + 0x62, 0xba, 0xc5, 0x7b, 0x56, 0xbe, 0x3f, 0x8d, 0x6e, 0x79, 0x97, 0x5d, + 0x7c, 0x89, 0x62, 0xc8, 0xd3, 0x43, 0x96, 0xb1, 0x4f, 0x36, 0xc2, 0xa0, + 0xc4, 0x98, 0x7b, 0xd7, 0x44, 0x81, 0x7d, 0x61, 0x9f, 0x74, 0x9d, 0x77, + 0x95, 0xad, 0x32, 0x50, 0xc9, 0x7a, 0xb9, 0x69, 0xc5, 0x6b, 0xba, 0xb1, + 0x5d, 0xb3, 0x41, 0x5b, 0x7d, 0x84, 0x6d, 0xab, 0x66, 0x5d, 0xa4, 0x7b, + 0x56, 0x85, 0xc4, 0x71, 0x42, 0x8c, 0x90, 0x7d, 0x79, 0xba, 0x67, 0xab, + 0x56, 0x77, 0x4c, 0x86, 0xcb, 0xa8, 0x56, 0x8c, 0x8b, 0x81, 0xaf, 0x47, + 0xc0, 0x74, 0x57, 0x61, 0x56, 0x86, 0x47, 0x65, 0x96, 0x4b, 0xc7, 0xcd, + 0xc6, 0xb6, 0xbb, 0xa0, 0x79, 0x3b, 0x7e, 0x7d, 0xb7, 0x77, 0xc0, 0x61, + 0xa2, 0x80, 0xb3, 0x70, 0xd1, 0x82, 0xa2, 0x86, 0x77, 0x88, 0x37, 0x9a, + 0x5d, 0x5c, 0xc8, 0x96, 0x38, 0x64, 0x54, 0x8a, 0x96, 0x57, 0x8b, 0x96, + 0x42, 0x56, 0x46, 0x53, 0x96, 0x96, 0x34, 0x65, 0x7c, 0x74, 0x4a, 0x7e, + 0x4f, 0xce, 0x6f, 0x34, 0x63, 0x5a, 0x56, 0x5a, 0x7e, 0x76, 0x56, 0x73, + 0x9a, 0xc4, 0x8a, 0xbd, 0x39, 0x3e, 0x8e, 0xc6, 0xaf, 0xbb, 0x5a, 0x42, + 0x77, 0xb1, 0xa7, 0x6a, 0x3c, 0x43, 0x59, 0xd2, 0xa8, 0x8d, 0x76, 0x75, + 0x93, 0xc1, 0xc5, 0x77, 0x80, 0x99, 0x99, 0xc3, 0x30, 0xd4, 0x8b, 0x91, + 0x3a, 0x6e, 0x81, 0xbf, 0xad, 0x42, 0x3b, 0xb2, 0x5d, 0x82, 0x6c, 0x8c, + 0x8c, 0x68, 0x6c, 0x93, 0x2c, 0x6b, 0x90, 0xaa, 0x93, 0xba, 0x51, 0x57, + 0x71, 0xa7, 0x3f, 0x3d, 0xa0, 0xcb, 0x60, 0x9d, 0x37, 0xd2, 0xb4, 0x6b, + 0x66, 0x95, 0x8d, 0x70, 0x43, 0x93, 0x9a, 0x85, 0x7f, 0x81, 0x74, 0xc8, + 0x91, 0x9d, 0x7e, 0x8f, 0x82, 0x44, 0x77, 0x4a, 0x5f, 0x48, 0xb7, 0x48, + 0x30, 0x8e, 0x78, 0x33, 0xb2, 0x87, 0x42, 0xd2, 0xaa, 0x97, 0x94, 0xa9, + 0x68, 0xce, 0x95, 0xbc, 0x3a, 0x4b, 0x5e, 0x65, 0x86, 0xd7, 0xb1, 0x93, + 0xbc, 0x52, 0x80, 0xb9, 0xb1, 0xc1, 0x4d, 0x3e, 0x94, 0x3d, 0x53, 0x47, + 0x73, 0xc0, 0x33, 0x92, 0xa7, 0x70, 0xcb, 0x6d, 0x84, 0x6f, 0x7e, 0x75, + 0xc4, 0x61, 0x6e, 0x52, 0x94, 0xa9, 0x82, 0xc3, 0xaa, 0x9e, 0x40, 0x70, + 0xb5, 0x4c, 0x90, 0x5f, 0x3f, 0x7e, 0xbd, 0xb3, 0xcb, 0xba, 0x83, 0x97, + 0xcc, 0x88, 0x4a, 0x57, 0x45, 0x56, 0x9e, 0xac, 0x91, 0x3b, 0x39, 0x51, + 0x53, 0x57, 0x7f, 0x98, 0x2e, 0xb5, 0x96, 0xbd, 0x9f, 0x64, 0xc7, 0x7d, + 0x63, 0x6a, 0x6a, 0x59, 0xcd, 0xb7, 0xa3, 0x68, 0x6f, 0x99, 0x6d, 0x77, + 0x8a, 0x5e, 0x8d, 0x79, 0x87, 0x9f, 0x72, 0x8d, 0x31, 0x33, 0x8f, 0x92, + 0xa3, 0xc9, 0xc1, 0xa9, 0x66, 0xb7, 0xaf, 0xce, 0xa3, 0xba, 0xc5, 0xbb, + 0xd2, 0xbf, 0x8f, 0x51, 0x52, 0x47, 0x4a, 0x39, 0xb3, 0x87, 0xab, 0x77, + 0x57, 0x80, 0xbb, 0x81, 0x50, 0xac, 0xaa, 0x3f, 0xd3, 0x37, 0x89, 0xb7, + 0x35, 0xc4, 0xb5, 0x44, 0x5a, 0xb7, 0xbb, 0x68, 0x47, 0x8d, 0xad, 0x37, + 0x26, 0x67, 0x3b, 0xb8, 0x4d, 0xa6, 0x60, 0x98, 0x71, 0xb4, 0x7a, 0xd8, + 0x2c, 0x38, 0x41, 0x31, 0x6a, 0x2c, 0x61, 0xa1, 0x9c, 0x72, 0xb7, 0x63, + 
0x82, 0xaa, 0x5c, 0xa1, 0x5c, 0x4e, 0x86, 0x41, 0x46, 0x74, 0xa1, 0x47, + 0xc7, 0x86, 0xa5, 0x83, 0x41, 0x37, 0x97, 0x96, 0x92, 0x3a, 0x53, 0x85, + 0xc9, 0x2e, 0x9a, 0x69, 0xb0, 0x49, 0xc4, 0x7e, 0xc7, 0x72, 0xac, 0x50, + 0x5b, 0x66, 0x52, 0xaf, 0xaa, 0x4f, 0x92, 0x60, 0x70, 0x2d, 0x48, 0xac, + 0xb3, 0x63, 0xa7, 0x59, 0x39, 0x90, 0xa2, 0x7e, 0x37, 0xc1, 0x69, 0xb7, + 0x78, 0x5a, 0x3e, 0x4f, 0xac, 0xaa, 0x45, 0x7a, 0xb1, 0x7b, 0x9a, 0x61, + 0xd1, 0x39, 0x42, 0xaa, 0xb1, 0xa9, 0x4a, 0x75, 0x71, 0xc4, 0xa2, 0x96, + 0x73, 0x37, 0x34, 0xcf, 0xca, 0x99, 0xc5, 0x3a, 0x97, 0xac, 0x65, 0x6e, + 0x6a, 0x3c, 0x4a, 0x5e, 0xb0, 0xc8, 0x90, 0x91, 0x33, 0xa5, 0xac, 0x3f, + 0xab, 0xcf, 0x92, 0x60, 0xab, 0xa3, 0x56, 0xd1, 0x33, 0xc2, 0x99, 0x90, + 0x5e, 0x5a, 0x59, 0xbb, 0x74, 0x47, 0x7f, 0x8a, 0x70, 0x59, 0x78, 0x74, + 0x7a, 0x5e, 0x59, 0x7b, 0xb2, 0x4b, 0xcf, 0x39, 0x74, 0x54, 0x7e, 0x98, + 0x71, 0x6a, 0xa5, 0xc8, 0x56, 0x45, 0x78, 0xc6, 0x89, 0x48, 0xaf, 0x5c, + 0x39, 0x9c, 0x66, 0x51, 0xc4, 0x7c, 0x4d, 0x8f, 0x80, 0x2f, 0xad, 0x49, + 0x65, 0x44, 0xb8, 0x6f, 0xa7, 0x63, 0x85, 0x8f, 0x4d, 0xb8, 0x2e, 0x7b, + 0xc8, 0xc9, 0x5d, 0x8b, 0xb7, 0x87, 0x3e, 0xc1, 0xc4, 0x92, 0xb9, 0x49, + 0x83, 0xb0, 0x72, 0x5f, 0x3f, 0x79, 0x39, 0x5a, 0x3c, 0x75, 0x57, 0xa3, + 0x69, 0x5d, 0xb5, 0x95, 0xa3, 0x56, 0x6a, 0x6d, 0x59, 0x93, 0x54, 0x5b, + 0x65, 0x6f, 0x60, 0xd2, 0x91, 0x93, 0x44, 0xa7, 0xb3, 0xba, 0x32, 0x7c, + 0x7c, 0xb8, 0x5a, 0x69, 0x75, 0xc4, 0x79, 0xa1, 0xa8, 0x52, 0xd6, 0x40, + 0x32, 0x48, 0x71, 0x8c, 0x7d, 0x2c, 0xb4, 0xb0, 0xb3, 0xc2, 0xb0, 0x62, + 0xb3, 0x9a, 0xb4, 0xae, 0x83, 0x5f, 0x59, 0x69, 0xc5, 0x5a, 0xc3, 0x61, + 0x62, 0xc4, 0x97, 0x62, 0xc4, 0x3b, 0xc3, 0x7d, 0x9d, 0x51, 0x9a, 0x2e, + 0x6a, 0x56, 0x6d, 0xa0, 0x84, 0x78, 0x29, 0x29, 0xbc, 0x82, 0x37, 0x63, + 0xae, 0x3d, 0xc5, 0xcf, 0x5e, 0x61, 0x3b, 0x69, 0x2a, 0x6f, 0x92, 0x3c, + 0xad, 0xd6, 0x48, 0xba, 0xc5, 0x9a, 0x5b, 0xd1, 0x99, 0x76, 0x5f, 0xac, + 0x9e, 0x6d, 0x9c, 0x45, 0x40, 0x3f, 0x2d, 0xbd, 0xc6, 0x9e, 0x4b, 0x6b, + 0x44, 0x81, 0xc2, 0x94, 0x38, 0x73, 0xb6, 0xb0, 0x59, 0x30, 0xd4, 0x9b, + 0x5e, 0x64, 0x8d, 0xc7, 0xa6, 0xa0, 0xac, 0xb1, 0x51, 0x6c, 0x67, 0x79, + 0x8e, 0x63, 0xb6, 0x5b, 0x60, 0xc8, 0x8a, 0x71, 0x69, 0x93, 0xd2, 0xc9, + 0x97, 0x46, 0xc3, 0x5a, 0x33, 0x95, 0xa5, 0xaa, 0x6a, 0x79, 0x3f, 0x84, + 0x73, 0x86, 0xa5, 0x7a, 0xc1, 0x3c, 0x43, 0x57, 0xb3, 0x4d, 0xa1, 0x7f, + 0x56, 0x5a, 0x5f, 0x7f, 0xa7, 0xb9, 0x7d, 0x5e, 0x94, 0xc7, 0x79, 0x72, + 0xaf, 0xa0, 0xa4, 0xb8, 0xbe, 0xa2, 0x2b, 0xc1, 0x32, 0xcd, 0x79, 0xd2, + 0x52, 0x78, 0x88, 0x58, 0xa6, 0x90, 0x5b, 0x80, 0x87, 0x76, 0x81, 0x4a, + 0x71, 0x4d, 0x40, 0x96, 0xaa, 0x89, 0x4d, 0x5f, 0x78, 0xa1, 0x4d, 0xb4, + 0xc1, 0x46, 0x61, 0x5c, 0x69, 0xb8, 0xbd, 0xc5, 0xaf, 0x31, 0x70, 0x4e, + 0xcd, 0x5e, 0x8b, 0x40, 0x66, 0x69, 0xc5, 0xca, 0x49, 0xa1, 0x3e, 0x57, + 0xae, 0x84, 0x7e, 0xb7, 0xae, 0x8e, 0xa5, 0x3e, 0xbb, 0x83, 0xc7, 0x5a, + 0xca, 0x5d, 0x3a, 0x3b, 0xc2, 0xbc, 0x59, 0x62, 0x69, 0x43, 0x9c, 0x6c, + 0x57, 0x4f, 0x87, 0x64, 0xa6, 0x83, 0x64, 0x93, 0x7e, 0xcf, 0x36, 0x6d, + 0xca, 0x83, 0x7c, 0x9a, 0x79, 0x8e, 0xb3, 0xb6, 0x88, 0xcb, 0x32, 0x85, + 0x4a, 0xc1, 0x6d, 0x85, 0x4a, 0xc5, 0x9c, 0xb0, 0x96, 0xd0, 0x7b, 0xb7, + 0xad, 0x79, 0xb9, 0xa4, 0xca, 0x53, 0xa1, 0xaf, 0xd5, 0x5f, 0xd4, 0xcc, + 0x67, 0x99, 0x75, 0x45, 0x7e, 0x7c, 0x61, 0xb9, 0xbd, 0x81, 0x79, 0xe4, + 0x31, 0xbc, 0x3d, 0x88, 0x8b, 0x4f, 0x45, 0x4f, 0x70, 0x38, 0x5e, 0x81, + 0x56, 0x42, 0xb7, 0x73, 0xa2, 0x52, 0xcc, 0x87, 0x6e, 0xa9, 0x88, 0xb9, + 0xb8, 0xb7, 0xb2, 0x5d, 0x94, 0x7d, 0x8f, 0xab, 0x4c, 0x3e, 0x54, 0x80, + 
0x46, 0x57, 0x80, 0x41, 0xab, 0x65, 0xb0, 0xb9, 0x5a, 0xa3, 0x87, 0x82, + 0x34, 0xa7, 0x8c, 0x81, 0xa8, 0x2d, 0x8b, 0xd3, 0x77, 0x4b, 0xbe, 0x9d, + 0x56, 0xc8, 0x88, 0xc0, 0xd2, 0xb2, 0x6e, 0xa6, 0x6d, 0x85, 0x74, 0xbc, + 0xa6, 0xb7, 0x53, 0xc8, 0xc6, 0x72, 0x72, 0xca, 0x48, 0x57, 0xc1, 0x56, + 0x50, 0x69, 0xa5, 0xca, 0x74, 0x60, 0x2c, 0x91, 0xb0, 0x5c, 0x30, 0xca, + 0x5b, 0x92, 0x6a, 0xab, 0xb4, 0xbd, 0xad, 0x8d, 0x99, 0x97, 0x37, 0x78, + 0x74, 0x4d, 0x51, 0xc1, 0xa5, 0x7c, 0x76, 0x5d, 0x66, 0x23, 0x7b, 0x26, + 0x57, 0x5b, 0xb6, 0x77, 0x74, 0x79, 0xc6, 0xda, 0x79, 0xc9, 0x65, 0xb6, + 0x42, 0x84, 0xcf, 0x8f, 0x76, 0x65, 0x9a, 0xa2, 0xd3, 0x88, 0xaf, 0x56, + 0x49, 0x9c, 0xba, 0xb0, 0x4e, 0xb3, 0xd0, 0x88, 0x73, 0x83, 0x6b, 0x7b, + 0x9a, 0x6d, 0xd6, 0xc3, 0x7c, 0x3b, 0xc8, 0x4e, 0x62, 0xd8, 0x73, 0x3a, + 0xbd, 0x8e, 0x8c, 0xbc, 0x3c, 0xd5, 0x9b, 0xb9, 0x85, 0x87, 0xca, 0x90, + 0x45, 0x61, 0xa1, 0xc9, 0x7a, 0xa7, 0x6a, 0x73, 0x44, 0x63, 0xc5, 0x4e, + 0x88, 0xba, 0xb3, 0xc0, 0x9b, 0x9c, 0x41, 0x98, 0x6c, 0xba, 0x6c, 0xcf, + 0x3e, 0x8a, 0xd7, 0xce, 0x8b, 0xa1, 0x81, 0xbd, 0xb1, 0xb8, 0x84, 0x92, + 0x82, 0xcf, 0x4c, 0xc0, 0xd3, 0x35, 0x55, 0x5c, 0xa0, 0x47, 0xad, 0xa0, + 0x67, 0x3e, 0x79, 0x53, 0x3b, 0xc2, 0x8e, 0x97, 0x29, 0xb7, 0xb1, 0xbb, + 0x7c, 0x49, 0xb5, 0x46, 0xa8, 0x65, 0x7d, 0x28, 0x97, 0xc8, 0xbd, 0x3a, + 0x41, 0xcb, 0x36, 0xa1, 0xcd, 0xa7, 0xbb, 0xd7, 0xc2, 0x54, 0xcb, 0xbe, + 0x69, 0x4b, 0x75, 0x3d, 0x4c, 0x97, 0x95, 0x5d, 0x92, 0xa2, 0xca, 0x45, + 0xcd, 0xe3, 0x5c, 0x3b, 0x49, 0xc9, 0x3a, 0x52, 0xa1, 0x57, 0x53, 0x91, + 0xba, 0xa6, 0x38, 0x4f, 0x4c, 0x3c, 0x73, 0xba, 0x5f, 0xa1, 0x90, 0x46, + 0x46, 0x57, 0x6d, 0x2d, 0xb6, 0x42, 0x71, 0x66, 0xae, 0xa0, 0x42, 0xdf, + 0x6b, 0x95, 0x6d, 0x9d, 0x4e, 0xa8, 0x66, 0xa2, 0xd6, 0xa2, 0x7a, 0x58, + 0x67, 0x2c, 0x7d, 0x66, 0xa9, 0x60, 0x40, 0x4e, 0xb6, 0xd3, 0x64, 0x9c, + 0xb7, 0x4e, 0x89, 0x4c, 0x60, 0x7e, 0xae, 0x7e, 0x5f, 0x7c, 0xb4, 0x5d, + 0xb8, 0xcc, 0xb3, 0x95, 0xad, 0x28, 0x68, 0xaa, 0xb2, 0x47, 0xb6, 0x7d, + 0x9b, 0x4a, 0xbe, 0xc2, 0x6d, 0x9f, 0x7a, 0x69, 0x94, 0x54, 0x9f, 0x80, + 0x95, 0xa0, 0x35, 0x71, 0xc4, 0x6e, 0x9c, 0x41, 0x32, 0x3e, 0x8d, 0xc6, + 0xca, 0x4f, 0x42, 0x73, 0xd6, 0x9c, 0x8a, 0x67, 0x44, 0xb6, 0x99, 0x95, + 0xb0, 0x91, 0x7e, 0x57, 0x56, 0x86, 0x57, 0x3e, 0x4f, 0xd1, 0x3c, 0x85, + 0x76, 0xdd, 0xab, 0x87, 0x65, 0x82, 0x57, 0xaf, 0x8c, 0x79, 0x64, 0x92, + 0xc0, 0x49, 0xa1, 0x34, 0xae, 0x64, 0xa8, 0x3b, 0x59, 0x57, 0x97, 0x4c, + 0x42, 0x79, 0xb3, 0x49, 0x5c, 0xab, 0x30, 0x53, 0xdf, 0x85, 0x68, 0x7c, + 0x4f, 0x7a, 0xc8, 0x66, 0x9f, 0x8f, 0xc9, 0xbf, 0x7d, 0xd2, 0x4e, 0x8c, + 0x55, 0xc2, 0x80, 0x84, 0x3f, 0xbb, 0x69, 0x52, 0x9b, 0xa0, 0x50, 0x55, + 0x4d, 0xb9, 0xbc, 0xd0, 0x3d, 0xaf, 0x53, 0xd3, 0xaa, 0x94, 0xb7, 0xa9, + 0x96, 0x40, 0x64, 0xd1, 0x6b, 0x6d, 0x3d, 0x54, 0x84, 0x7c, 0x75, 0x3f, + 0x8a, 0x5e, 0x98, 0x8e, 0x3b, 0x58, 0xcd, 0xb4, 0xba, 0xa3, 0x96, 0x74, + 0xa3, 0xbb, 0x5f, 0x95, 0x3d, 0xd2, 0x55, 0x71, 0x36, 0x6e, 0xc3, 0x56, + 0x55, 0xc8, 0x51, 0x7f, 0x5c, 0x8a, 0x68, 0x30, 0x69, 0xce, 0x89, 0x33, + 0x57, 0x7a, 0x3a, 0x34, 0x57, 0xc6, 0xcd, 0x9a, 0x67, 0x63, 0xbd, 0x5b, + 0x97, 0xc2, 0x8d, 0x7a, 0x60, 0xc2, 0xc2, 0x55, 0xba, 0x5e, 0x78, 0x49, + 0x4f, 0x45, 0x66, 0x75, 0x50, 0xcb, 0x9d, 0xc7, 0x35, 0x2e, 0xc7, 0x79, + 0x53, 0x7c, 0x77, 0x82, 0x35, 0x7b, 0x5f, 0x88, 0x82, 0xbe, 0xa9, 0xa0, + 0xa8, 0xa0, 0x41, 0x86, 0x67, 0xb9, 0x5a, 0x74, 0x93, 0x92, 0x48, 0x75, + 0x36, 0xc2, 0x74, 0x34, 0xa5, 0xbe, 0xa5, 0x2f, 0x49, 0xb2, 0x78, 0xb1, + 0x5e, 0x95, 0x55, 0x3d, 0x98, 0x9a, 0xb2, 0xbd, 0x5f, 0x59, 0x95, 0xa4, + 
0x47, 0x42, 0x7e, 0x66, 0x45, 0xb6, 0x44, 0x29, 0x64, 0x5f, 0x59, 0x8e, + 0xa8, 0x69, 0x7b, 0x89, 0x38, 0x3f, 0xad, 0xc1, 0xcd, 0xa0, 0x93, 0x58, + 0xa7, 0x5c, 0x95, 0x8c, 0x6a, 0xaa, 0x6c, 0x91, 0xb8, 0x3f, 0x3a, 0x45, + 0x7e, 0x7b, 0xca, 0x6b, 0x88, 0x66, 0x73, 0xb7, 0x69, 0xa9, 0x4e, 0x50, + 0x55, 0x8f, 0x97, 0x66, 0xbc, 0x3f, 0x8f, 0x42, 0x4a, 0x30, 0xaf, 0x8b, + 0xba, 0xb6, 0xba, 0xb1, 0x88, 0x91, 0xc7, 0x95, 0x5b, 0xca, 0x94, 0x7e, + 0x98, 0xc5, 0x85, 0xc9, 0x62, 0x45, 0xb2, 0xc5, 0xb9, 0x66, 0xa9, 0x69, + 0x65, 0xcf, 0x66, 0x9f, 0xc2, 0xa1, 0x78, 0x42, 0x76, 0x7b, 0xba, 0x49, + 0x59, 0xae, 0x72, 0x41, 0x7f, 0x78, 0x7c, 0x95, 0xb5, 0x36, 0xa2, 0xa3, + 0x8b, 0x83, 0x3c, 0x9b, 0xb0, 0xd1, 0x69, 0xb8, 0xb4, 0xb6, 0xd0, 0x6e, + 0xac, 0xcb, 0x78, 0x3d, 0x87, 0x4f, 0xbe, 0xba, 0x8e, 0x87, 0x90, 0xc5, + 0x89, 0x88, 0x47, 0x64, 0x61, 0x3e, 0xc3, 0xbf, 0x77, 0x84, 0x45, 0xa2, + 0xcd, 0x3e, 0xca, 0x4d, 0xbf, 0xb7, 0xb4, 0x32, 0x65, 0xbe, 0x48, 0x5a, + 0x4a, 0x9c, 0x3e, 0xd2, 0x53, 0x8a, 0x66, 0xb9, 0xc7, 0x88, 0xcd, 0xa5, + 0x34, 0x94, 0x44, 0x9f, 0xb8, 0x83, 0x5b, 0xa3, 0x68, 0xb3, 0x39, 0x57, + 0x9e, 0xa7, 0x6b, 0xcf, 0x9a, 0x47, 0x44, 0x5e, 0x91, 0xd5, 0x9b, 0x94, + 0x9e, 0x90, 0x95, 0x33, 0x71, 0x83, 0x77, 0x73, 0x96, 0xb9, 0x31, 0x8f, + 0x50, 0x68, 0x7d, 0xac, 0x3d, 0x9e, 0x8e, 0x5e, 0x66, 0x78, 0x94, 0x7f, + 0x5c, 0x90, 0xb8, 0x7e, 0x49, 0x68, 0x95, 0x95, 0x85, 0x70, 0xbb, 0x57, + 0xbd, 0xb6, 0x6d, 0xc4, 0x96, 0xd3, 0x59, 0x88, 0x7a, 0x49, 0x85, 0x4e, + 0x70, 0x47, 0xa8, 0x57, 0x90, 0x97, 0xa9, 0x4e, 0x41, 0x9d, 0xba, 0xb6, + 0x52, 0xbf, 0x37, 0xa8, 0x80, 0xaf, 0x81, 0x45, 0x9f, 0xc8, 0x46, 0x45, + 0x9b, 0xb1, 0xaa, 0x9e, 0xb8, 0x37, 0x77, 0x43, 0x60, 0x83, 0x7e, 0xbf, + 0x56, 0x8c, 0x95, 0x3f, 0xb3, 0x6e, 0x9d, 0xc2, 0x8d, 0xd9, 0x7a, 0x42, + 0xab, 0xcf, 0x3a, 0x41, 0x7e, 0x53, 0x94, 0x66, 0x42, 0xc0, 0x5b, 0x3d, + 0xc4, 0xd5, 0x78, 0xb5, 0x5c, 0x87, 0x5b, 0x4f, 0xb9, 0xb0, 0x62, 0x55, + 0xb3, 0xa1, 0xb0, 0x85, 0xb5, 0x53, 0x57, 0x6e, 0x27, 0x8e, 0x57, 0x3d, + 0xa8, 0x72, 0x6a, 0x3b, 0xd1, 0xcf, 0xc6, 0xa6, 0x51, 0xd2, 0x3d, 0x74, + 0x43, 0xa6, 0x89, 0x49, 0xce, 0xcb, 0xbe, 0x95, 0xb2, 0x8e, 0x54, 0x2d, + 0x4b, 0x79, 0x37, 0x7e, 0x8d, 0x40, 0x4e, 0xcf, 0xb6, 0x91, 0xb2, 0x75, + 0x9b, 0x47, 0x74, 0x99, 0x59, 0x41, 0x6f, 0x5c, 0xaa, 0x54, 0x39, 0x41, + 0x71, 0x31, 0x30, 0x60, 0x6a, 0xd1, 0x5c, 0xbc, 0x86, 0x77, 0x51, 0x72, + 0xb3, 0x79, 0x37, 0xaf, 0x7d, 0x3e, 0xd7, 0xa8, 0xa1, 0x44, 0x5f, 0x3c, + 0x2f, 0x42, 0x5d, 0x8e, 0x3f, 0x52, 0x8f, 0x8a, 0xa8, 0x89, 0xa1, 0x64, + 0x35, 0xc7, 0x36, 0x6e, 0x9a, 0x72, 0x59, 0x78, 0x4b, 0x32, 0x4a, 0x37, + 0xaa, 0x3a, 0xaa, 0x77, 0x36, 0xb5, 0x55, 0x6e, 0x62, 0x6f, 0x56, 0x60, + 0x3c, 0xbf, 0x45, 0xbe, 0xae, 0x7e, 0xbd, 0x55, 0x96, 0x43, 0xb6, 0xc1, + 0x77, 0x3a, 0x66, 0x5e, 0xd0, 0x38, 0xa6, 0x8b, 0xa6, 0x34, 0xa5, 0x87, + 0xb1, 0x49, 0x52, 0x62, 0x6a, 0xb6, 0x4f, 0xbc, 0x58, 0x5d, 0x35, 0x63, + 0x75, 0x4c, 0x90, 0x46, 0xbf, 0x47, 0x31, 0x9b, 0x9e, 0xbb, 0x8a, 0xa1, + 0x92, 0x3b, 0x3a, 0x3f, 0xac, 0xa1, 0xc5, 0xac, 0x38, 0xaa, 0x74, 0x91, + 0x51, 0x90, 0xcc, 0xba, 0x76, 0x55, 0x88, 0xaa, 0xa8, 0xb6, 0xb6, 0x47, + 0x9e, 0x64, 0xae, 0xac, 0xcb, 0x94, 0xa1, 0x89, 0x68, 0x56, 0xc5, 0xa6, + 0x52, 0x3b, 0x8c, 0x53, 0x69, 0x75, 0x92, 0x4f, 0x96, 0xbf, 0x78, 0x4c, + 0xc3, 0x9c, 0xa2, 0xb6, 0x97, 0x66, 0x33, 0x37, 0xc1, 0x71, 0x42, 0xb1, + 0x87, 0xb5, 0x61, 0x9f, 0x74, 0x67, 0x84, 0xa4, 0xa0, 0x3f, 0x34, 0xbc, + 0x6a, 0x54, 0x32, 0x63, 0xbc, 0x7c, 0x47, 0xa3, 0x2f, 0x45, 0x91, 0xb6, + 0x35, 0x67, 0xcf, 0x9e, 0xc6, 0xcc, 0x30, 0xab, 0x55, 0xb1, 0x5c, 0x54, + 
0x71, 0x46, 0x67, 0xb3, 0x87, 0x7e, 0x98, 0x36, 0x54, 0x99, 0xa0, 0x3c, + 0x60, 0xc1, 0x48, 0x87, 0xb0, 0x85, 0x5c, 0x86, 0x7b, 0xaf, 0xb0, 0x9c, + 0x8e, 0x51, 0x4f, 0x80, 0xb6, 0xcc, 0x9c, 0xaa, 0x9e, 0x42, 0x99, 0xbc, + 0xb9, 0x8c, 0x55, 0xa2, 0x84, 0x7b, 0xad, 0x78, 0x35, 0x9e, 0xa9, 0xc7, + 0x91, 0xbb, 0x3e, 0xa7, 0x5a, 0x8d, 0x46, 0xbd, 0x94, 0x51, 0x2f, 0x73, + 0xc1, 0x60, 0x71, 0x6d, 0x50, 0xb1, 0xac, 0x7d, 0x6c, 0x66, 0x3c, 0xb9, + 0x54, 0x67, 0x60, 0x7e, 0xae, 0xb2, 0xb5, 0xb0, 0x71, 0x9e, 0x95, 0xb8, + 0x7f, 0xc4, 0xb5, 0xa8, 0x40, 0x5c, 0x69, 0x47, 0x6e, 0x89, 0xab, 0xa6, + 0x90, 0xc9, 0x41, 0x67, 0x46, 0x38, 0xa7, 0x63, 0x66, 0x6f, 0xb2, 0x41, + 0x52, 0x84, 0xaa, 0xa1, 0x6b, 0x46, 0x3b, 0x9b, 0xa3, 0x63, 0xb0, 0x91, + 0xd0, 0x3c, 0x47, 0x36, 0x72, 0x54, 0x94, 0xc5, 0xa7, 0x9c, 0x6f, 0x6a, + 0x61, 0xc8, 0xb3, 0x47, 0x7d, 0xa9, 0x93, 0x47, 0xb5, 0x7e, 0x87, 0x83, + 0xac, 0xcb, 0x49, 0x73, 0x54, 0xd0, 0x40, 0xcf, 0x35, 0xbd, 0x4c, 0x9e, + 0x88, 0x74, 0xb3, 0xbe, 0x8b, 0x6e, 0x94, 0xa0, 0x61, 0x86, 0xd4, 0x8c, + 0x9a, 0xc4, 0xc7, 0x5d, 0x61, 0xba, 0x80, 0x40, 0x77, 0xc5, 0x51, 0x75, + 0xa5, 0xbb, 0xcd, 0x63, 0x5a, 0x64, 0x67, 0x6d, 0xc6, 0xbf, 0x7a, 0x58, + 0xc8, 0x56, 0xc4, 0xb7, 0x88, 0xb2, 0xc6, 0x4e, 0xb2, 0x30, 0x67, 0x3b, + 0x69, 0xa6, 0xac, 0x94, 0x6c, 0x92, 0x97, 0x50, 0xa7, 0xb5, 0xde, 0xd5, + 0xa8, 0x81, 0x78, 0xa1, 0x8a, 0x33, 0xa3, 0xa8, 0xc5, 0x95, 0x64, 0x40, + 0x3a, 0x4c, 0x5d, 0xa5, 0x6d, 0xca, 0xb1, 0xaf, 0x6e, 0x53, 0x81, 0x52, + 0x54, 0x43, 0xce, 0x73, 0x8d, 0x6f, 0xd2, 0x27, 0x80, 0x97, 0x4f, 0x72, + 0x6b, 0xb1, 0x77, 0x6d, 0xa2, 0x51, 0x95, 0x5f, 0x38, 0xa2, 0x4c, 0x55, + 0xb3, 0x9b, 0xa9, 0x5c, 0xcf, 0x69, 0x34, 0xc4, 0xb3, 0x66, 0x9e, 0x9e, + 0x36, 0x58, 0x88, 0x78, 0xcd, 0x59, 0xbc, 0xcf, 0xb4, 0xd9, 0x65, 0xb7, + 0xba, 0xc3, 0xbc, 0x5c, 0x45, 0x47, 0xb1, 0xa3, 0x9b, 0xc9, 0xc1, 0x4b, + 0x5c, 0xcc, 0x94, 0x86, 0x9a, 0x7f, 0xa9, 0x4d, 0x4c, 0x76, 0xb6, 0x66, + 0xb0, 0x3a, 0x73, 0xb8, 0x47, 0xd4, 0x7b, 0x43, 0x6c, 0xb6, 0xbd, 0xaa, + 0x98, 0xa4, 0xab, 0x68, 0xaf, 0xb2, 0x8b, 0x8c, 0x65, 0x75, 0x44, 0xb8, + 0x85, 0xaa, 0x2e, 0x70, 0xaf, 0x66, 0x84, 0x71, 0xb7, 0x46, 0x40, 0xab, + 0xa1, 0x60, 0xaf, 0x72, 0x87, 0x5e, 0x46, 0x34, 0x79, 0xa8, 0x3c, 0x58, + 0x88, 0xb7, 0xc7, 0x99, 0x84, 0x86, 0x5b, 0x38, 0xb6, 0xa8, 0xbc, 0x45, + 0xc2, 0xcf, 0x41, 0x5e, 0x89, 0xa7, 0xbd, 0x74, 0xaa, 0x5c, 0x80, 0x47, + 0x38, 0x66, 0x91, 0xa4, 0x5c, 0x37, 0x8a, 0x38, 0xa4, 0xc6, 0x34, 0xd2, + 0x9c, 0xce, 0x88, 0x5c, 0x57, 0xa7, 0xc0, 0xce, 0x5b, 0x48, 0x72, 0xb0, + 0x48, 0xae, 0xbe, 0xc2, 0x88, 0x74, 0x4f, 0x3a, 0x96, 0x3b, 0x78, 0x5e, + 0x33, 0xb9, 0x9f, 0x95, 0xad, 0x72, 0x33, 0x91, 0xcd, 0x4f, 0x9f, 0x37, + 0xc4, 0x75, 0x58, 0x53, 0x98, 0x91, 0x37, 0x8b, 0x5c, 0x88, 0x95, 0x9a, + 0x71, 0xca, 0x43, 0xb1, 0x7e, 0x59, 0xa2, 0x88, 0xcb, 0x90, 0x66, 0x83, + 0x78, 0x7a, 0xac, 0xb6, 0x43, 0x62, 0xcf, 0x95, 0x33, 0x51, 0x4a, 0xb4, + 0x7b, 0xcf, 0x3b, 0x3f, 0x7f, 0x5e, 0x9b, 0xa9, 0x85, 0xab, 0x74, 0x5f, + 0xa4, 0x47, 0x6c, 0x49, 0xa7, 0xb3, 0x41, 0x3d, 0x56, 0xcc, 0x8e, 0x9e, + 0xa0, 0xa4, 0xcc, 0x84, 0x63, 0x81, 0x47, 0x97, 0xa0, 0x70, 0x3c, 0xcd, + 0x71, 0x55, 0xa2, 0x39, 0xbc, 0x92, 0x72, 0x73, 0x8d, 0xb1, 0xbe, 0xa1, + 0x8b, 0x44, 0x35, 0x8a, 0x37, 0x72, 0x47, 0xcb, 0x55, 0x9f, 0x59, 0x6e, + 0xc8, 0x54, 0x4c, 0x81, 0x94, 0x88, 0x7c, 0x5e, 0x76, 0x5d, 0x4f, 0x6d, + 0xc4, 0xa2, 0x67, 0xc3, 0xaf, 0x99, 0x76, 0x81, 0x8f, 0x53, 0xce, 0x9b, + 0x8d, 0x38, 0x48, 0xc3, 0x5d, 0xb0, 0x4f, 0xa2, 0x87, 0x4c, 0x83, 0x88, + 0x68, 0xa3, 0xa3, 0x92, 0x62, 0x86, 0xc5, 0xc6, 0xb9, 0x7b, 0xa0, 0x86, + 
0x48, 0x70, 0x94, 0x62, 0x70, 0xa4, 0x31, 0x65, 0x3c, 0xa2, 0x9a, 0x79, + 0xb1, 0x77, 0xc1, 0xb0, 0x65, 0x87, 0x38, 0x49, 0x7e, 0x86, 0x32, 0xb6, + 0xc0, 0x6b, 0x31, 0x34, 0x84, 0x30, 0x93, 0xc0, 0xbf, 0x76, 0x6f, 0x35, + 0x98, 0xaf, 0x73, 0x97, 0x63, 0x34, 0x8e, 0xa9, 0x52, 0x45, 0x94, 0x95, + 0x41, 0xb1, 0xc0, 0x5d, 0x85, 0xce, 0x4e, 0x60, 0x82, 0x72, 0x44, 0x66, + 0x61, 0xc8, 0xca, 0xbd, 0x55, 0x93, 0x4d, 0x69, 0x3e, 0x8c, 0x91, 0xc9, + 0x9a, 0x2a, 0xbf, 0x94, 0x68, 0x75, 0x9f, 0x9d, 0x75, 0x79, 0xd3, 0x90, + 0x91, 0xc0, 0xd8, 0x42, 0xc7, 0x64, 0x5e, 0x74, 0xd1, 0x3d, 0x71, 0xb7, + 0xdb, 0xa2, 0x86, 0x81, 0x4c, 0x77, 0xa0, 0xa2, 0xd3, 0x4c, 0x86, 0x8e, + 0x7c, 0x4e, 0x3b, 0xc2, 0x73, 0xb3, 0x80, 0x68, 0x8e, 0xa5, 0xda, 0x80, + 0x76, 0xc0, 0xac, 0xb5, 0x36, 0x2b, 0x38, 0x90, 0x85, 0x7a, 0x8f, 0xcf, + 0x2f, 0x93, 0x52, 0x34, 0x5b, 0x48, 0x4a, 0x41, 0x8f, 0xc9, 0x5b, 0x38, + 0xd4, 0xd6, 0x84, 0x9a, 0x64, 0x8b, 0x9c, 0x58, 0x84, 0x84, 0xa0, 0x4d, + 0xaf, 0x42, 0xcf, 0x54, 0x41, 0xb1, 0x95, 0x6e, 0xa7, 0x41, 0x66, 0xb0, + 0xb0, 0xaf, 0x6d, 0x7d, 0xbf, 0xc5, 0x84, 0xaf, 0x46, 0x93, 0xd0, 0xc8, + 0xc8, 0x8b, 0xbf, 0xad, 0x6d, 0x4c, 0x50, 0x90, 0x65, 0x3d, 0xa5, 0xbb, + 0x96, 0x98, 0xb2, 0x3c, 0x9b, 0x37, 0x48, 0x60, 0x93, 0xb9, 0x87, 0xd3, + 0x8d, 0x95, 0x71, 0x54, 0xcd, 0xc6, 0xb7, 0xa1, 0xb2, 0xd8, 0xc0, 0xc6, + 0xd1, 0xc3, 0x7e, 0xb6, 0x61, 0xa8, 0xc5, 0x5d, 0x99, 0x6e, 0xc9, 0x3d, + 0xae, 0xc1, 0x7f, 0x87, 0xd2, 0xb8, 0x8c, 0x57, 0x7f, 0x49, 0x45, 0x39, + 0x6f, 0x83, 0xca, 0xcc, 0xb5, 0xa6, 0xaf, 0x5e, 0xa6, 0x5a, 0x60, 0xb6, + 0x91, 0x34, 0x38, 0x46, 0x32, 0x61, 0x54, 0x96, 0x5a, 0x43, 0x90, 0xa0, + 0xb4, 0x62, 0x71, 0x79, 0x3e, 0x8b, 0x4b, 0x79, 0x5f, 0x87, 0x71, 0x4d, + 0x9d, 0x7e, 0x78, 0xa7, 0x84, 0xb9, 0x4f, 0x7e, 0x7c, 0x71, 0x47, 0x8e, + 0xc4, 0x4b, 0xb7, 0x7f, 0x65, 0x9f, 0xe1, 0x4e, 0x6f, 0x80, 0x3e, 0xc2, + 0x4d, 0x36, 0xa0, 0x86, 0x3a, 0x8f, 0x53, 0x49, 0xbd, 0x66, 0x8a, 0x8b, + 0xb8, 0xc4, 0xd0, 0x46, 0x6c, 0x43, 0x7f, 0x49, 0xcf, 0xc6, 0x58, 0x3d, + 0xd4, 0xa1, 0xc9, 0x30, 0xda, 0x3d, 0xbb, 0x8a, 0x96, 0x6b, 0xb0, 0x4c, + 0x43, 0x4d, 0xa5, 0x8b, 0x88, 0x24, 0xb0, 0x9d, 0x53, 0x92, 0x3a, 0x3a, + 0xac, 0x99, 0xa2, 0xac, 0x9b, 0xba, 0xc2, 0xbf, 0x62, 0x52, 0xab, 0x51, + 0xb0, 0x45, 0x9f, 0x44, 0x41, 0xab, 0xb8, 0x98, 0x58, 0x66, 0xb0, 0x7d, + 0x5c, 0x40, 0x6b, 0x8e, 0x8c, 0xba, 0xb7, 0x8b, 0x92, 0x55, 0x94, 0xbc, + 0xa1, 0xa5, 0x67, 0x75, 0x56, 0x7e, 0x9a, 0xc5, 0x27, 0xb7, 0x45, 0x69, + 0x7a, 0x63, 0xbd, 0xa2, 0x40, 0x74, 0xb7, 0x9f, 0x47, 0x55, 0x91, 0x65, + 0x7d, 0x5d, 0x3e, 0xc5, 0x61, 0xce, 0x91, 0x45, 0x8b, 0x94, 0x5a, 0x5c, + 0xac, 0xbb, 0x59, 0x7d, 0x66, 0xbf, 0x93, 0x70, 0x83, 0x7c, 0x83, 0xc5, + 0x35, 0x87, 0x85, 0x6e, 0xd7, 0xb, 0x85, 0x82, 0x66, 0x30, 0x86, 0x71, + 0x6c, 0x55, 0x7e, 0x94, 0x60, 0x7f, 0xb0, 0xac, 0xbc, 0xae, 0x84, 0xa8, + 0xc8, 0x97, 0x97, 0xce, 0x70, 0xa6, 0x74, 0xa4, 0xa6, 0x53, 0xb9, 0xa3, + 0x4d, 0xc7, 0x77, 0x87, 0x8a, 0x4e, 0x5c, 0x78, 0xc2, 0x86, 0x7f, 0x95, + 0x67, 0x45, 0x46, 0x74, 0x4a, 0xcb, 0x8b, 0xc1, 0x97, 0x2e, 0x7c, 0x78, + 0xba, 0x82, 0x46, 0x58, 0x56, 0xd2, 0x46, 0xbe, 0x93, 0xbc, 0x5b, 0x97, + 0x54, 0x71, 0x80, 0xcc, 0xd0, 0x59, 0xb4, 0x93, 0x8c, 0x48, 0xb3, 0xbb, + 0x6e, 0x40, 0x60, 0x48, 0x80, 0x6d, 0xd7, 0x9e, 0x3b, 0x3b, 0x42, 0x38, + 0xb9, 0x95, 0x51, 0x7f, 0x90, 0x50, 0x48, 0x4f, 0x99, 0xc8, 0xa9, 0xc0, + 0x89, 0x4b, 0xb8, 0x83, 0x63, 0xc9, 0x6c, 0x99, 0x72, 0x8b, 0x5f, 0x58, + 0xa8, 0xc9, 0xa1, 0x7e, 0x85, 0x6b, 0x4e, 0x4d, 0x55, 0xb5, 0x72, 0x44, + 0x6b, 0x64, 0x46, 0xac, 0x61, 0x56, 0x38, 0x81, 0xc2, 0x73, 0x45, 0x72, + 
0x8e, 0x62, 0x61, 0xc1, 0x4d, 0x64, 0x41, 0x56, 0x8f, 0x62, 0xb1, 0x96, + 0x65, 0x91, 0x3a, 0x30, 0x62, 0x77, 0x3a, 0xae, 0xa8, 0x5d, 0xbf, 0x44, + 0xc4, 0x50, 0x6f, 0x75, 0x7a, 0x9a, 0xa3, 0xb2, 0x48, 0x89, 0x5e, 0xa4, + 0x3f, 0xa7, 0x6c, 0x53, 0xd2, 0x81, 0xb6, 0x5b, 0x3b, 0xa3, 0x81, 0x82, + 0x49, 0xa8, 0x34, 0xc1, 0xc0, 0xbd, 0x54, 0x32, 0x4a, 0xb4, 0x73, 0xbb, + 0xb4, 0x56, 0xa0, 0x8a, 0x57, 0x46, 0x6e, 0x9e, 0xc5, 0x91, 0xa5, 0xbb, + 0x48, 0xa1, 0xad, 0x3f, 0x4f, 0xb1, 0x4c, 0xc1, 0x75, 0x66, 0xb2, 0xca, + 0x93, 0x9c, 0x38, 0x77, 0xb4, 0x8f, 0xd6, 0x79, 0x41, 0xaa, 0x9e, 0xc4, + 0x4f, 0x9f, 0xc4, 0x49, 0x70, 0x64, 0xb7, 0xac, 0x65, 0x86, 0x67, 0x8f, + 0x90, 0x77, 0xa6, 0xa9, 0x37, 0x6f, 0x65, 0x81, 0xaa, 0x5c, 0x50, 0x6b, + 0x33, 0xbc, 0x92, 0x3b, 0x5a, 0x98, 0x77, 0xb6, 0x7f, 0x78, 0x79, 0x64, + 0x2c, 0x84, 0x9a, 0xa5, 0x4c, 0x86, 0x8b, 0x56, 0x82, 0xcd, 0xb2, 0x36, + 0xdd, 0x6f, 0xd4, 0xcd, 0xdc, 0x8b, 0x5a, 0x58, 0x72, 0xbc, 0x67, 0xac, + 0x89, 0xa5, 0xaf, 0xa3, 0xc6, 0x3b, 0x44, 0x70, 0xd4, 0xac, 0x84, 0xaa, + 0xa2, 0xc0, 0xb0, 0xb1, 0x6d, 0xbf, 0x58, 0x33, 0x9b, 0xd3, 0x60, 0xa9, + 0x7f, 0x43, 0x94, 0x6f, 0xb5, 0xa1, 0xaa, 0x5a, 0xad, 0x5d, 0x42, 0x52, + 0x67, 0x56, 0x57, 0xca, 0x31, 0x93, 0x7d, 0x49, 0x7f, 0x7f, 0xd1, 0xb9, + 0x4b, 0xc2, 0x41, 0x95, 0x79, 0x36, 0x9b, 0xa7, 0xca, 0x7c, 0x9d, 0xa1, + 0xa1, 0x62, 0xa3, 0x4d, 0x9e, 0xae, 0xc6, 0xa8, 0xb7, 0x39, 0x4a, 0x6c, + 0x73, 0x55, 0x69, 0xbb, 0x4e, 0x3e, 0x7f, 0x52, 0x9f, 0x43, 0xbb, 0x84, + 0x59, 0x6a, 0x33, 0x70, 0x91, 0xba, 0x71, 0x3b, 0xd6, 0x32, 0xb1, 0xc9, + 0xa7, 0xa8, 0xb6, 0x85, 0xa6, 0x5b, 0xbe, 0xaa, 0xcc, 0x9e, 0xb7, 0x87, + 0xae, 0x4f, 0x68, 0x71, 0x83, 0x6d, 0x7b, 0x47, 0x66, 0xb4, 0x78, 0x7c, + 0xa2, 0xae, 0x66, 0xcf, 0x32, 0xb8, 0x90, 0x35, 0x56, 0x83, 0x7f, 0x9e, + 0x4f, 0x98, 0x7f, 0x4d, 0x79, 0xcd, 0xe8, 0xba, 0x31, 0x6c, 0xc9, 0x9f, + 0x5d, 0x80, 0x5e, 0xb1, 0xbe, 0x8e, 0x86, 0x40, 0x6c, 0x42, 0x79, 0x50, + 0xc2, 0xb7, 0x7c, 0x7d, 0x6f, 0x66, 0x9d, 0x9f, 0x45, 0xb9, 0xbb, 0x6c, + 0x58, 0x69, 0x83, 0x91, 0x9c, 0xaf, 0x2f, 0x5b, 0x37, 0x6a, 0x4e, 0x5c, + 0xca, 0x90, 0x87, 0x4d, 0x30, 0x76, 0x85, 0xad, 0x55, 0x43, 0xa6, 0x44, + 0xcc, 0x84, 0x60, 0x87, 0x94, 0x74, 0x71, 0x8c, 0x8b, 0x3e, 0xcb, 0xb4, + 0x9c, 0xae, 0xb8, 0x88, 0xa2, 0xa9, 0x4f, 0xa2, 0x6c, 0x68, 0x9d, 0x41, + 0x4a, 0x4a, 0x69, 0x9f, 0x9e, 0x88, 0x97, 0x79, 0x7f, 0xae, 0xbc, 0xbf, + 0xce, 0x5c, 0x64, 0x47, 0xbc, 0xbf, 0x3a, 0x6b, 0x92, 0xa2, 0x77, 0xc6, + 0xad, 0xd1, 0x55, 0x4f, 0x5b, 0xb5, 0x40, 0xbe, 0xb4, 0xa4, 0x4f, 0x47, + 0x8b, 0xb5, 0x88, 0xc9, 0x52, 0x7c, 0xaa, 0xb0, 0x84, 0x58, 0xa5, 0x43, + 0x31, 0xc1, 0x70, 0xa8, 0x7c, 0x6e, 0x79, 0x60, 0xa6, 0x30, 0x78, 0xc6, + 0xc8, 0x7e, 0x3e, 0x8c, 0x80, 0x64, 0xa4, 0xb6, 0x59, 0xdc, 0x86, 0x49, + 0xa4, 0x41, 0x60, 0x75, 0xaf, 0x68, 0x38, 0x3f, 0xb2, 0x55, 0xb8, 0x8f, + 0x89, 0x2f, 0xc6, 0x2f, 0xa6, 0x41, 0x7f, 0xb3, 0xbe, 0xb2, 0x4c, 0x9b, + 0x5a, 0xc7, 0x94, 0xbf, 0x42, 0xd0, 0x79, 0x39, 0x6c, 0xb7, 0x71, 0x62, + 0x3e, 0x9a, 0x71, 0xba, 0x47, 0x30, 0xb4, 0x92, 0xac, 0xae, 0x74, 0x8d, + 0x56, 0x49, 0x80, 0x45, 0xc1, 0x3d, 0xc3, 0x7a, 0x75, 0x77, 0x6c, 0x6e, + 0x1c, 0x72, 0xad, 0x64, 0xab, 0x39, 0xc6, 0x5e, 0x4f, 0x89, 0x93, 0x7f, + 0xd3, 0xbe, 0xae, 0xc3, 0x45, 0x73, 0x88, 0x64, 0x46, 0x9e, 0x42, 0x79, + 0x99, 0xb3, 0x6a, 0x65, 0x6a, 0xae, 0xc9, 0x4d, 0xd2, 0xb1, 0x42, 0x50, + 0x3d, 0xce, 0x50, 0x9a, 0x3a, 0x85, 0x7c, 0x4d, 0xb8, 0xaf, 0x5a, 0xc2, + 0xbb, 0xae, 0x82, 0xc6, 0x61, 0x8d, 0x8d, 0x7d, 0x5f, 0x84, 0xab, 0xaf, + 0x81, 0x79, 0xc3, 0x47, 0x89, 0x6d, 0x49, 0x33, 0xc0, 0x9a, 0x54, 0xc8, + 
0x5c, 0x6a, 0x72, 0x42, 0x6b, 0x7d, 0x83, 0x47, 0x7d, 0xbc, 0x64, 0xa5, + 0x81, 0x50, 0xc5, 0x98, 0x49, 0xc7, 0x68, 0x7f, 0x5c, 0x56, 0x60, 0x7d, + 0x6a, 0x3f, 0xa4, 0xb6, 0xa5, 0x33, 0x4f, 0xd8, 0x6c, 0xa4, 0xa2, 0x81, + 0xaf, 0x9b, 0x61, 0x6f, 0x83, 0x41, 0x44, 0xbd, 0x7d, 0x75, 0x96, 0x53, + 0xad, 0xb6, 0xc5, 0x58, 0xc0, 0x8d, 0x90, 0xc7, 0xa3, 0xc8, 0x8e, 0x38, + 0xbd, 0x98, 0x89, 0x3d, 0x9f, 0xcf, 0xa8, 0x64, 0x3e, 0x9b, 0x49, 0x9c, + 0x8e, 0x4a, 0xd2, 0xc4, 0x68, 0xca, 0x65, 0x8f, 0x9b, 0x77, 0x45, 0x9a, + 0xd1, 0xad, 0xcf, 0x6a, 0xd0, 0xd8, 0x83, 0x87, 0x56, 0xac, 0xcc, 0x63, + 0x85, 0x61, 0xa4, 0x85, 0x43, 0xc3, 0xa0, 0xac, 0x3a, 0x42, 0x57, 0x88, + 0x50, 0x64, 0x5c, 0xcb, 0x9f, 0x9c, 0x7f, 0x3e, 0xc2, 0x9f, 0x69, 0xbf, + 0x9a, 0x9b, 0x72, 0xbe, 0x7d, 0xc0, 0x45, 0xb1, 0xa9, 0x4a, 0x5a, 0x9f, + 0x70, 0x78, 0xc9, 0xa5, 0xca, 0x31, 0x7d, 0xb3, 0x99, 0xa0, 0x65, 0x51, + 0xc2, 0x61, 0x84, 0xa4, 0x69, 0xc9, 0xa4, 0xa9, 0x91, 0xc5, 0x6a, 0x9d, + 0x69, 0xbe, 0x91, 0x47, 0x81, 0x65, 0xbf, 0x82, 0x8e, 0xab, 0x67, 0x98, + 0x40, 0x39, 0x8f, 0xcb, 0x7b, 0x3a, 0x87, 0x86, 0x7c, 0x55, 0x87, 0x6e, + 0x8c, 0x77, 0x8c, 0xc6, 0xc7, 0x65, 0x69, 0x31, 0x3b, 0x60, 0xb7, 0x9a, + 0x3d, 0xcc, 0xc4, 0x45, 0x9f, 0x52, 0x74, 0x84, 0xaf, 0xa3, 0xc6, 0x66, + 0x7c, 0x48, 0x9e, 0x65, 0xb7, 0xc4, 0x6d, 0x3a, 0x5d, 0x35, 0x9d, 0x65, + 0x90, 0x47, 0xc5, 0xb0, 0xc9, 0xc7, 0x51, 0xac, 0xd1, 0x9e, 0x61, 0x4d, + 0xa9, 0x49, 0x92, 0x73, 0x68, 0x6e, 0x8c, 0xd6, 0xaf, 0xb9, 0x37, 0x9a, + 0xcc, 0x90, 0x8d, 0xa1, 0x7f, 0xad, 0xc8, 0x6d, 0x37, 0x8a, 0xd0, 0xa5, + 0x72, 0x3e, 0xd2, 0xb3, 0x4d, 0x7b, 0xc7, 0x46, 0xd0, 0x33, 0xa2, 0x7a, + 0x8f, 0xc8, 0xd7, 0x75, 0xcf, 0x59, 0x73, 0x4b, 0xa2, 0x80, 0x45, 0x4f, + 0x8a, 0x59, 0x96, 0x69, 0x64, 0x67, 0x98, 0xaa, 0x6f, 0x8a, 0x57, 0x47, + 0x85, 0x94, 0xb0, 0x8c, 0x48, 0xd1, 0x33, 0x9f, 0x91, 0x36, 0x3f, 0xac, + 0xbb, 0xb7, 0x36, 0x79, 0x62, 0xca, 0x62, 0x67, 0x5d, 0xc1, 0xc9, 0x9a, + 0xaf, 0xb9, 0xa2, 0x6b, 0x31, 0x9e, 0x8c, 0x30, 0x5d, 0xaa, 0xc9, 0x4a, + 0x58, 0x9e, 0x7b, 0x8d, 0x78, 0x8a, 0xac, 0x5d, 0xb8, 0xca, 0x40, 0x69, + 0x3b, 0x8c, 0x86, 0x8d, 0xd3, 0x9e, 0xb2, 0x4f, 0x2e, 0xbb, 0x59, 0x59, + 0x35, 0x90, 0xd3, 0x5e, 0xc0, 0xa3, 0x9b, 0x62, 0x3a, 0xa7, 0xcb, 0x57, + 0x32, 0x72, 0x42, 0xb0, 0x5d, 0x59, 0x4b, 0x63, 0xba, 0x91, 0xae, 0x57, + 0x4b, 0x75, 0x5c, 0x56, 0x3e, 0xc2, 0xb5, 0xc9, 0xa6, 0xb0, 0xbf, 0xb2, + 0x33, 0x5c, 0x8e, 0xa8, 0x80, 0xaa, 0x33, 0x5f, 0xc8, 0x3d, 0xb0, 0x91, + 0x66, 0x79, 0xae, 0xa7, 0x46, 0x83, 0x71, 0xc7, 0x51, 0xcd, 0xa2, 0x51, + 0x4b, 0xc6, 0x8e, 0xc3, 0xcc, 0x6c, 0x77, 0xc4, 0xc4, 0xc7, 0xb7, 0x45, + 0x93, 0x93, 0x8b, 0x61, 0x5b, 0x3d, 0xd5, 0xae, 0xc2, 0x8c, 0x6c, 0x9f, + 0x79, 0x9b, 0x7d, 0x53, 0xd6, 0xd3, 0xaf, 0x53, 0xd3, 0x4e, 0x8c, 0x42, + 0xc5, 0x9b, 0x57, 0xc2, 0x4d, 0x49, 0x41, 0x69, 0x85, 0xb5, 0xc6, 0x35, + 0x70, 0x75, 0x70, 0xce, 0x54, 0xc5, 0x85, 0xb7, 0xa5, 0xad, 0x5b, 0x63, + 0x49, 0x3c, 0x98, 0xa1, 0x4f, 0x6e, 0x45, 0x47, 0x8d, 0x59, 0x56, 0x73, + 0xd1, 0x7c, 0x4c, 0x47, 0x4a, 0x56, 0x73, 0x92, 0x9f, 0x4d, 0x6c, 0xbc, + 0x72, 0xb6, 0x55, 0xa8, 0x4c, 0x84, 0x6d, 0x9a, 0xa8, 0x65, 0xc2, 0x6b, + 0xc6, 0xa4, 0x38, 0x64, 0x50, 0xc3, 0x47, 0xac, 0x91, 0x69, 0x5d, 0x49, + 0x90, 0x8e, 0x83, 0x91, 0xbe, 0xb8, 0x98, 0x54, 0x99, 0x6a, 0x5c, 0x93, + 0xa6, 0xb7, 0x4e, 0x84, 0x72, 0x7f, 0x93, 0x36, 0x91, 0x5d, 0xad, 0x4a, + 0x9c, 0x72, 0xb7, 0x48, 0x6b, 0x80, 0x98, 0xa3, 0x36, 0xc0, 0x41, 0xb5, + 0x31, 0x33, 0xbd, 0x96, 0x32, 0x8f, 0xb0, 0x30, 0x5e, 0x6b, 0xaf, 0x68, + 0xd2, 0x44, 0xbe, 0xaf, 0x36, 0xaa, 0x39, 0x41, 0x77, 0xcb, 0x68, 0x44, + 
0x79, 0x57, 0x78, 0x55, 0xc9, 0x7b, 0x7f, 0xaf, 0x61, 0x8b, 0x53, 0xaf, + 0x3b, 0xc8, 0xca, 0xd0, 0x83, 0x36, 0x6c, 0xc7, 0x85, 0xa2, 0xb9, 0x4b, + 0xa1, 0x45, 0x89, 0x61, 0x8c, 0x75, 0xc4, 0xcc, 0x80, 0x84, 0x82, 0x64, + 0x8f, 0x45, 0x5a, 0x8a, 0xbf, 0xc7, 0x6f, 0xce, 0xbf, 0xab, 0x79, 0xbd, + 0x44, 0x5e, 0x47, 0xce, 0x73, 0xa8, 0x66, 0xbd, 0x34, 0x84, 0x42, 0xce, + 0x6b, 0xa9, 0xb3, 0x6c, 0xd5, 0x7e, 0x3f, 0x61, 0x74, 0xbe, 0xac, 0x6e, + 0xcf, 0xb6, 0xbf, 0x61, 0xbb, 0x59, 0x9c, 0x63, 0xd3, 0xc8, 0x89, 0xbb, + 0x53, 0x45, 0x9e, 0x3c, 0x59, 0x56, 0x93, 0xc3, 0x64, 0x33, 0x79, 0x3f, + 0x80, 0x4d, 0x66, 0x3c, 0xa4, 0xaf, 0x8e, 0xa4, 0x80, 0x74, 0x6d, 0x7e, + 0xd4, 0x49, 0xa5, 0x6c, 0x46, 0x4c, 0x51, 0x45, 0xcf, 0x90, 0x5e, 0x5f, + 0x6b, 0x57, 0x6e, 0x57, 0xa8, 0x4d, 0xbd, 0x78, 0xb4, 0x8c, 0x4b, 0x66, + 0x97, 0x7d, 0xc8, 0x89, 0x9e, 0x88, 0x44, 0x45, 0xb2, 0x6d, 0x54, 0x47, + 0xda, 0xa2, 0x6e, 0x87, 0x65, 0x43, 0x64, 0xbc, 0x5a, 0xcd, 0xb0, 0x58, + 0x53, 0xa7, 0xc2, 0x91, 0x3a, 0x41, 0x2b, 0x6e, 0xa5, 0x85, 0x30, 0x77, + 0x5f, 0xb3, 0xad, 0x4b, 0xbd, 0x64, 0xa8, 0x37, 0xca, 0xa4, 0x30, 0x88, + 0x50, 0x8b, 0xb7, 0x72, 0xc2, 0x4d, 0xb1, 0xce, 0xca, 0x94, 0xbb, 0x9c, + 0x8e, 0xb3, 0x5b, 0x4b, 0x89, 0x38, 0xb6, 0x9d, 0x74, 0x67, 0xad, 0xb4, + 0xc8, 0x39, 0x9e, 0xb2, 0xb7, 0x34, 0x9d, 0x94, 0x87, 0x8d, 0x5c, 0xa7, + 0x9d, 0x9e, 0x7d, 0x80, 0x78, 0x6c, 0xcd, 0x44, 0x3d, 0x57, 0x84, 0xae, + 0x74, 0x9f, 0x4a, 0x8a, 0xab, 0x62, 0x8d, 0xc7, 0x63, 0x44, 0x5f, 0xb7, + 0x34, 0x52, 0x9c, 0xa0, 0x75, 0xaa, 0xc1, 0x4e, 0x6a, 0xc4, 0x78, 0x66, + 0x8f, 0x48, 0xb7, 0xc1, 0x44, 0x62, 0x35, 0x78, 0xa1, 0xa4, 0x7b, 0xb8, + 0xab, 0x4b, 0x4d, 0x85, 0x90, 0x49, 0xd0, 0xaf, 0xa6, 0x36, 0xc5, 0x57, + 0x81, 0x9c, 0x4b, 0xc7, 0x71, 0x62, 0x9f, 0x8f, 0x4f, 0x67, 0x6f, 0x8d, + 0x5b, 0xa2, 0x61, 0x5a, 0x8c, 0x3a, 0x82, 0x75, 0x3c, 0x5f, 0xb4, 0x34, + 0x54, 0x65, 0x64, 0xc4, 0x72, 0x86, 0x61, 0xbb, 0xdc, 0xe2, 0x3f, 0x77, + 0xb6, 0x64, 0xa7, 0xd9, 0x4d, 0xbb, 0x56, 0x82, 0x7f, 0xc9, 0x88, 0x44, + 0xa8, 0xb9, 0x9e, 0x5a, 0x63, 0xc3, 0xa6, 0x3d, 0xc1, 0x93, 0x37, 0xb2, + 0x9a, 0x76, 0x7f, 0xad, 0x6d, 0x30, 0x6a, 0x40, 0x99, 0x55, 0x86, 0xa0, + 0x5f, 0xc3, 0x82, 0xb1, 0x90, 0xb0, 0x42, 0x39, 0xb9, 0xaa, 0x9d, 0xbe, + 0x66, 0xb6, 0x36, 0xd6, 0x93, 0x86, 0x66, 0x80, 0x40, 0xc4, 0x9f, 0x98, + 0xa4, 0x6c, 0xc2, 0xab, 0x9d, 0x62, 0x47, 0x32, 0x90, 0x43, 0x65, 0xb9, + 0x6f, 0x9a, 0xa5, 0x52, 0x44, 0x67, 0xb6, 0xaf, 0xc6, 0x92, 0x3f, 0x46, + 0x62, 0x56, 0x88, 0x97, 0x5a, 0x68, 0x6d, 0x3d, 0x3a, 0x4c, 0xbd, 0x40, + 0xa0, 0x53, 0x42, 0x54, 0x5f, 0x5a, 0x58, 0x67, 0x57, 0x8f, 0x77, 0x82, + 0x47, 0xc3, 0xcb, 0x4b, 0xa7, 0xa5, 0x3c, 0x3f, 0x82, 0x54, 0xbd, 0x97, + 0x3a, 0x8a, 0x52, 0x55, 0xc8, 0xbd, 0x8d, 0x97, 0x4a, 0xbf, 0x3a, 0x4d, + 0x7f, 0xd3, 0xc7, 0x38, 0x9f, 0x9b, 0x7a, 0x98, 0xb6, 0xc1, 0x32, 0x38, + 0x68, 0x2b, 0xc1, 0x95, 0xd0, 0x53, 0xa5, 0x47, 0x80, 0x7f, 0xbc, 0xa1, + 0x40, 0xca, 0xa1, 0x80, 0x40, 0x33, 0x66, 0xbd, 0x55, 0xa5, 0x9a, 0x69, + 0x34, 0x51, 0xac, 0x3a, 0x50, 0x9b, 0xc5, 0x7e, 0x9e, 0xa9, 0xcb, 0x73, + 0x98, 0xa7, 0x49, 0x85, 0x9f, 0x80, 0x76, 0xbf, 0xa6, 0x65, 0xd6, 0x6c, + 0x97, 0x9c, 0x7e, 0x48, 0x4e, 0x9d, 0x96, 0x68, 0x93, 0x90, 0x59, 0x76, + 0x8e, 0x61, 0x4d, 0xb8, 0x31, 0x39, 0x79, 0x8e, 0x35, 0x3a, 0xb2, 0x7b, + 0x51, 0x34, 0x3b, 0x88, 0xa4, 0x71, 0xa8, 0xa6, 0x50, 0x82, 0x7b, 0x5f, + 0xa9, 0x87, 0x8a, 0x99, 0xa5, 0x47, 0xb7, 0x70, 0x74, 0x8c, 0xc9, 0xb3, + 0xcc, 0xcd, 0xb6, 0x33, 0x3f, 0x83, 0xcf, 0x60, 0x8c, 0x48, 0x44, 0xa2, + 0xc7, 0x52, 0x78, 0x38, 0x73, 0x8a, 0xc9, 0x66, 0x51, 0xb4, 0x66, 0x34, + 
0xa0, 0x3f, 0x4c, 0xd1, 0xb9, 0xbe, 0x52, 0x58, 0x32, 0x34, 0x38, 0x94, + 0x85, 0x4f, 0x65, 0xd2, 0xc4, 0xb2, 0x98, 0x33, 0xa4, 0x6e, 0xb3, 0xa4, + 0x79, 0x8c, 0x97, 0x5d, 0x4b, 0xd1, 0xac, 0x88, 0xd2, 0x62, 0x78, 0x98, + 0x76, 0x91, 0xc7, 0x37, 0x47, 0x88, 0x60, 0x5a, 0x69, 0x31, 0x37, 0xd5, + 0xb6, 0x3a, 0x98, 0x79, 0x83, 0x54, 0x77, 0xa8, 0x86, 0xb1, 0xaa, 0xab, + 0xc5, 0xc9, 0x34, 0x69, 0xbe, 0xb1, 0x75, 0x6c, 0xcd, 0xce, 0x91, 0x47, + 0x7a, 0x88, 0x59, 0x44, 0x89, 0xcb, 0xce, 0x5e, 0x9f, 0xb9, 0xa8, 0x78, + 0x33, 0x79, 0x33, 0x81, 0x30, 0xb8, 0x52, 0xd1, 0x4b, 0xc2, 0xb5, 0xb1, + 0x3b, 0x2a, 0x59, 0xd0, 0xcb, 0x2f, 0xc7, 0x96, 0x59, 0x99, 0x41, 0x4f, + 0xac, 0x73, 0x79, 0xcc, 0x2a, 0xbb, 0x60, 0x8b, 0x66, 0xbb, 0x46, 0xbe, + 0x79, 0xb7, 0x39, 0x44, 0x63, 0xb2, 0x61, 0x95, 0x8e, 0x40, 0xa6, 0xb6, + 0x51, 0xb5, 0x33, 0x9f, 0x8a, 0x71, 0x5c, 0x84, 0x65, 0xd4, 0x7e, 0x31, + 0x54, 0xd3, 0xc3, 0xcf, 0xcb, 0xba, 0x8c, 0x74, 0xc7, 0xb5, 0x4b, 0xa0, + 0x9f, 0x7c, 0x67, 0x3e, 0x6f, 0x5a, 0x70, 0x7e, 0x44, 0x6c, 0x76, 0x98, + 0x6b, 0x5d, 0x6f, 0xcf, 0x6f, 0x8d, 0x38, 0x70, 0x72, 0x2d, 0x45, 0x9b, + 0xbf, 0x58, 0x90, 0xae, 0x65, 0x47, 0xb3, 0x93, 0xa7, 0x97, 0x74, 0x71, + 0xc7, 0x54, 0x6d, 0x99, 0x65, 0x75, 0x56, 0x3a, 0x8f, 0x92, 0x5b, 0x83, + 0x41, 0xb5, 0x4f, 0xc4, 0xbc, 0xb8, 0xaa, 0xcb, 0xa7, 0xb8, 0x74, 0x4e, + 0x58, 0x71, 0x36, 0x9f, 0xa3, 0x48, 0x3f, 0xd0, 0x36, 0x96, 0x99, 0x5b, + 0x53, 0x9f, 0xa0, 0xd1, 0xc5, 0xcd, 0x78, 0x90, 0x41, 0xa0, 0x8e, 0xbf, + 0xcf, 0x6f, 0x83, 0x6d, 0x59, 0xc4, 0xa3, 0x84, 0x49, 0x42, 0xc4, 0x4a, + 0x6d, 0xa5, 0x90, 0x9a, 0xc2, 0x33, 0x43, 0x8f, 0x73, 0x68, 0x36, 0x93, + 0x54, 0xc9, 0x8d, 0x75, 0x60, 0x35, 0x8e, 0xb1, 0xbd, 0xad, 0x52, 0xb1, + 0x73, 0x57, 0x80, 0x32, 0x5b, 0x88, 0x75, 0x52, 0xc4, 0x8a, 0x74, 0x4e, + 0x52, 0xd6, 0xbc, 0xca, 0x4a, 0x66, 0xa7, 0xd3, 0xa3, 0x77, 0x70, 0x43, + 0x39, 0x83, 0xce, 0x90, 0xa3, 0xc7, 0xb5, 0x63, 0x6e, 0xb2, 0x62, 0xa7, + 0x39, 0x8a, 0x90, 0x44, 0x91, 0x6e, 0x71, 0x39, 0xc7, 0xc6, 0x3e, 0x5f, + 0x4b, 0x7f, 0x52, 0x94, 0x49, 0xc7, 0x4b, 0x98, 0x5e, 0xd3, 0xb8, 0x95, + 0xd3, 0xc3, 0xd0, 0x3e, 0x33, 0x55, 0x9c, 0xc8, 0x72, 0xa1, 0x5b, 0x6a, + 0x66, 0x5b, 0x4c, 0xc2, 0x81, 0xc3, 0x76, 0x38, 0x45, 0x44, 0xb6, 0x9e, + 0xc3, 0xa1, 0xcb, 0xd7, 0xae, 0x2f, 0x8b, 0x99, 0xbc, 0xae, 0x43, 0xbc, + 0xaa, 0x46, 0x3a, 0x61, 0x54, 0x44, 0xab, 0xc2, 0x91, 0xa5, 0xa8, 0x67, + 0xd0, 0x2c, 0x54, 0x53, 0x60, 0xa9, 0x5f, 0xc0, 0x52, 0x44, 0x72, 0x65, + 0xc6, 0x84, 0x61, 0x72, 0xad, 0x65, 0xa5, 0x9e, 0xb7, 0xb4, 0x3d, 0x8c, + 0x57, 0xd3, 0x44, 0x54, 0xaf, 0x6f, 0xcd, 0x65, 0x4c, 0x8e, 0x4e, 0x7b, + 0x6e, 0xc4, 0x89, 0xc8, 0x48, 0x82, 0x40, 0xbf, 0x7a, 0x61, 0xd1, 0xa6, + 0x83, 0x65, 0x52, 0x2e, 0x43, 0xc8, 0xbc, 0xbf, 0xd6, 0xa5, 0x50, 0xaa, + 0xc0, 0x97, 0xd4, 0x44, 0x88, 0x6c, 0x54, 0xd0, 0x52, 0xc1, 0x63, 0x3e, + 0x3a, 0x74, 0xc6, 0xc0, 0xd2, 0xc8, 0x3c, 0xab, 0xa3, 0xcc, 0xbb, 0x99, + 0x54, 0x5e, 0xc3, 0x80, 0xd2, 0x90, 0x6b, 0x6c, 0x5c, 0xa8, 0x6e, 0x54, + 0x64, 0xbb, 0x89, 0x95, 0x8e, 0xb1, 0x3f, 0x8b, 0x9d, 0x6e, 0x36, 0x7b, + 0x46, 0x94, 0x8c, 0x95, 0xcf, 0x54, 0x38, 0x3f, 0xa2, 0x86, 0x90, 0xe8, + 0x33, 0x9f, 0x79, 0x49, 0xc4, 0xa6, 0xbe, 0xb3, 0xd2, 0x7f, 0x38, 0xc6, + 0x76, 0x84, 0x7d, 0x9d, 0xb2, 0x72, 0x50, 0x56, 0x36, 0x3f, 0xaf, 0xd3, + 0x6a, 0x8a, 0xa8, 0x34, 0x66, 0x77, 0xc0, 0x61, 0x56, 0x85, 0x3e, 0x99, + 0x5b, 0xb7, 0x40, 0xa0, 0xb1, 0xab, 0xb4, 0xad, 0x68, 0x45, 0x3e, 0xa4, + 0xb2, 0x60, 0xac, 0x44, 0x39, 0xb9, 0x6c, 0x69, 0x93, 0xb8, 0x98, 0x5d, + 0x5a, 0x72, 0x3a, 0xbd, 0xb9, 0xd6, 0xd6, 0xbd, 0xb1, 0xa0, 0xdb, 0x52, + 
0xb1, 0xc9, 0x9a, 0x6e, 0x87, 0xd7, 0x6c, 0xc7, 0xc3, 0xb8, 0xb8, 0x5d, + 0x97, 0xc7, 0x9e, 0x37, 0x9e, 0xbf, 0x3e, 0x97, 0xa8, 0x3b, 0xcd, 0x82, + 0xc8, 0x3e, 0x4c, 0x9b, 0x94, 0xbb, 0xcf, 0x7f, 0xb9, 0x8b, 0xb7, 0x56, + 0xa5, 0x46, 0xb1, 0x80, 0x4e, 0x59, 0x93, 0x48, 0x97, 0x8d, 0x49, 0x3e, + 0xd0, 0xca, 0x85, 0x95, 0x4c, 0x53, 0x84, 0x32, 0x9d, 0x31, 0x32, 0xbd, + 0xb8, 0xbb, 0x5d, 0xc5, 0x7f, 0x4a, 0x49, 0xb2, 0xb9, 0x64, 0xa0, 0x43, + 0xa8, 0x6d, 0x6f, 0x3a, 0xca, 0x40, 0xcf, 0xb5, 0x68, 0x8c, 0xa1, 0x8b, + 0x33, 0x83, 0x7b, 0xc5, 0xd7, 0xb0, 0x61, 0xb1, 0x47, 0x30, 0x49, 0x85, + 0x85, 0x8c, 0x4b, 0x8d, 0x8d, 0x95, 0x54, 0xcc, 0x33, 0xcc, 0xb1, 0xcf, + 0xa0, 0x61, 0x38, 0x55, 0xd1, 0xcc, 0xce, 0x47, 0x39, 0x4c, 0xbd, 0x33, + 0x8f, 0x39, 0x8b, 0x86, 0xc9, 0xcc, 0x4c, 0x41, 0xad, 0xb6, 0x4c, 0x36, + 0x47, 0xbf, 0x53, 0x79, 0x91, 0xa1, 0x43, 0xc5, 0x7b, 0x65, 0xc2, 0x3a, + 0xbe, 0x33, 0xbe, 0x45, 0x5f, 0xa3, 0xb8, 0x55, 0xd4, 0x62, 0xb9, 0x5b, + 0x35, 0xb2, 0xa1, 0x7a, 0x2e, 0x79, 0xa5, 0x34, 0xb7, 0xb8, 0x2d, 0x97, + 0x76, 0x54, 0x79, 0x94, 0x40, 0x74, 0xb6, 0x3c, 0xa7, 0x71, 0x96, 0x62, + 0x9e, 0x55, 0xae, 0x60, 0x3f, 0x43, 0x90, 0x32, 0xdd, 0xc9, 0x9e, 0x77, + 0x6d, 0x88, 0x8b, 0x6c, 0xca, 0xae, 0x4d, 0x4d, 0x3d, 0xb2, 0x70, 0x75, + 0x7f, 0x5a, 0xa0, 0x9f, 0xcf, 0x9f, 0x4c, 0x3b, 0xa9, 0x9b, 0xd1, 0x31, + 0xc6, 0xcc, 0x58, 0x7e, 0xa5, 0xb0, 0x2f, 0x97, 0x9e, 0xb9, 0xbb, 0xca, + 0x31, 0x5a, 0x95, 0x2a, 0x63, 0x86, 0x4b, 0xc9, 0xbc, 0xa5, 0x42, 0x68, + 0x52, 0x74, 0x71, 0x9d, 0x96, 0x62, 0xae, 0xa1, 0x46, 0x3f, 0x68, 0x6e, + 0xc3, 0x77, 0x89, 0xb4, 0x8e, 0xd5, 0xc5, 0xbb, 0x5a, 0xcc, 0x48, 0x93, + 0xc6, 0x7d, 0x30, 0x8e, 0x57, 0x84, 0x87, 0x6e, 0xd1, 0x49, 0x9d, 0x82, + 0xbc, 0x87, 0x9a, 0x36, 0x92, 0x34, 0x59, 0x44, 0xb0, 0x45, 0xd5, 0x5d, + 0x61, 0x37, 0xc3, 0x97, 0x46, 0xb9, 0xc6, 0xa7, 0xa3, 0xc4, 0x91, 0xb4, + 0x3e, 0x47, 0xcc, 0x84, 0x8e, 0xa5, 0xad, 0x93, 0x66, 0x8b, 0x7c, 0x63, + 0x42, 0xab, 0x36, 0xc4, 0x45, 0x5d, 0xb9, 0x9d, 0x5d, 0xb9, 0xbe, 0x39, + 0x63, 0x43, 0xad, 0x61, 0xb0, 0x75, 0x5b, 0x64, 0x47, 0x38, 0x6c, 0xcf, + 0x4a, 0xa9, 0xc8, 0x76, 0x7f, 0x9d, 0x89, 0x3c, 0x68, 0xa4, 0x9f, 0x80, + 0xbd, 0xcd, 0x38, 0x9a, 0x43, 0x5b, 0x51, 0x9b, 0x8b, 0x37, 0x46, 0x5a, + 0x55, 0xb3, 0x87, 0x5f, 0xa9, 0x6a, 0xb1, 0x8d, 0x8d, 0x90, 0xbe, 0xc8, + 0x44, 0x53, 0x90, 0x65, 0x38, 0x2f, 0x9e, 0x5d, 0x86, 0xa0, 0x30, 0x80, + 0x75, 0x77, 0xcc, 0x9a, 0xa1, 0xaf, 0x86, 0x8d, 0xcb, 0x88, 0x97, 0x95, + 0x76, 0xc3, 0xc5, 0x9d, 0x9d, 0x87, 0x42, 0x81, 0xdd, 0xc6, 0x3e, 0x43, + 0x94, 0xc4, 0x6b, 0x49, 0x88, 0x5b, 0x3e, 0x4f, 0x40, 0xb9, 0x74, 0xae, + 0x57, 0xab, 0xb9, 0x96, 0x4f, 0x65, 0x6f, 0x5c, 0x7a, 0xb6, 0xcc, 0xaa, + 0x6e, 0xb0, 0x80, 0x41, 0xa8, 0xcb, 0x8a, 0xa5, 0x70, 0xaa, 0x48, 0x98, + 0x33, 0x74, 0xa6, 0x8e, 0xcf, 0x95, 0xba, 0x89, 0x45, 0x83, 0x70, 0x50, + 0x73, 0x83, 0x8c, 0xaa, 0x69, 0x95, 0xb0, 0xb4, 0xa4, 0x91, 0x6e, 0x77, + 0x52, 0xe5, 0x64, 0x5c, 0x9e, 0xc8, 0x86, 0x59, 0x90, 0x80, 0x6d, 0x58, + 0x56, 0xca, 0x8b, 0x90, 0xb8, 0x5d, 0xd4, 0xb0, 0x68, 0xc1, 0xa3, 0x31, + 0x31, 0xcb, 0xbf, 0x39, 0x81, 0x7d, 0xb1, 0x53, 0x66, 0x5b, 0x43, 0xb1, + 0x6c, 0x5f, 0x65, 0x4d, 0xc6, 0xa6, 0x9e, 0x34, 0x70, 0x8a, 0x84, 0x7f, + 0x4e, 0xab, 0x5c, 0x52, 0x3b, 0x7b, 0xc5, 0xac, 0x70, 0x38, 0xbd, 0xc3, + 0xab, 0x9a, 0x78, 0x7b, 0x58, 0x4a, 0x84, 0x39, 0x48, 0xd2, 0xbc, 0x2e, + 0x75, 0x93, 0x83, 0x44, 0x65, 0xdf, 0x76, 0x56, 0xba, 0x9f, 0x82, 0x82, + 0xa6, 0x59, 0x62, 0x90, 0x46, 0x67, 0x5c, 0x6d, 0x3b, 0xc6, 0x78, 0x9e, + 0x4e, 0xc0, 0x59, 0xb9, 0x81, 0xc4, 0x77, 0x5e, 0x6c, 0xbe, 0xc5, 0x6e, + 
0xa5, 0x3a, 0x98, 0xaf, 0xbf, 0x77, 0x43, 0x91, 0xc1, 0x89, 0x4f, 0x8b, + 0x7a, 0x82, 0x37, 0x8e, 0x51, 0x9f, 0x5d, 0x6c, 0x6d, 0x8b, 0x99, 0x93, + 0xae, 0xaf, 0xb5, 0x94, 0x34, 0x48, 0xc9, 0x87, 0xae, 0x54, 0xc3, 0x7e, + 0x58, 0x56, 0x98, 0x54, 0x88, 0xb2, 0xb9, 0x2a, 0xbc, 0xd0, 0x7e, 0x97, + 0x86, 0x61, 0xdf, 0xb8, 0x2e, 0x55, 0x8d, 0x40, 0x91, 0x63, 0xb0, 0x74, + 0x57, 0xac, 0x95, 0x85, 0x95, 0x33, 0xa6, 0xcf, 0x3e, 0xce, 0x63, 0xc3, + 0xa7, 0xcc, 0x76, 0xac, 0x6c, 0x3c, 0x7b, 0xc6, 0xbb, 0xcf, 0xb3, 0x5b, + 0x5f, 0x78, 0x50, 0x52, 0xbc, 0x5e, 0x71, 0xc6, 0x5f, 0xa3, 0x77, 0x79, + 0xac, 0x8e, 0xc5, 0xac, 0x2f, 0x9e, 0x64, 0x6f, 0xa5, 0x99, 0x60, 0xb8, + 0xde, 0xa3, 0x98, 0xa9, 0x44, 0xce, 0x8c, 0x30, 0x7a, 0xd6, 0x45, 0xb7, + 0x73, 0x42, 0x7f, 0xb8, 0x95, 0x61, 0x5c, 0x95, 0xc4, 0x61, 0x3e, 0x42, + 0x4a, 0x8b, 0x58, 0xd5, 0xad, 0xb7, 0xd3, 0x54, 0x7f, 0x38, 0x4a, 0xa0, + 0xb1, 0x3b, 0xb7, 0x42, 0xa9, 0xbd, 0xba, 0x23, 0xc5, 0xa7, 0xc5, 0x69, + 0xbb, 0x98, 0xa8, 0xbd, 0x69, 0x99, 0xa0, 0x55, 0x7a, 0xa7, 0xe1, 0x47, + 0xc9, 0xd1, 0x44, 0x97, 0xd3, 0x6a, 0x95, 0x89, 0x95, 0x4a, 0x52, 0x6c, + 0x90, 0x47, 0xdd, 0xe6, 0x60, 0x75, 0xc6, 0x79, 0x3d, 0x47, 0x9a, 0x5e, + 0x4d, 0x59, 0xcd, 0xbf, 0xc9, 0x69, 0x4a, 0xb0, 0xb9, 0x9d, 0xaf, 0xaa, + 0x4d, 0x9c, 0x5a, 0x3a, 0x7e, 0xcc, 0x6a, 0x90, 0x8e, 0x3d, 0xa6, 0x75, + 0x29, 0x4b, 0xc1, 0x5d, 0x4d, 0xaa, 0x6c, 0xae, 0x88, 0xb8, 0xc4, 0xd0, + 0x41, 0x79, 0x3f, 0x9e, 0x98, 0x7f, 0x76, 0xa7, 0xc6, 0xb2, 0x86, 0x57, + 0x6c, 0x6c, 0xba, 0xbe, 0xc0, 0xc8, 0xb7, 0xc5, 0xa3, 0xd1, 0xa0, 0x9c, + 0x8a, 0xbd, 0xbe, 0x90, 0x6e, 0xbd, 0x7a, 0xc8, 0xb1, 0xad, 0xa3, 0xb6, + 0x53, 0x7a, 0x66, 0x74, 0xab, 0x8d, 0x94, 0x47, 0x74, 0x6c, 0x8e, 0xa1, + 0x87, 0x81, 0xb6, 0x56, 0x5a, 0x3b, 0x91, 0xc1, 0x34, 0xc6, 0x9f, 0xc9, + 0x4d, 0x31, 0x62, 0xb1, 0x65, 0xa6, 0x32, 0xb2, 0xb5, 0x5e, 0x7b, 0xb4, + 0x9f, 0x3f, 0x9f, 0xa7, 0x34, 0x8c, 0x5c, 0x96, 0x5e, 0x63, 0x25, 0x48, + 0x3a, 0x64, 0xa6, 0x46, 0xca, 0xce, 0x96, 0xd3, 0x2f, 0xb3, 0xdf, 0xbf, + 0xc5, 0x7d, 0x69, 0x92, 0x4a, 0xa9, 0x59, 0xc1, 0xb6, 0x36, 0x68, 0xbb, + 0x7b, 0x59, 0x8c, 0x7d, 0x87, 0x46, 0x8b, 0x4e, 0x61, 0x9c, 0x9e, 0xbd, + 0x3b, 0x53, 0x75, 0x6f, 0x45, 0x4d, 0xa8, 0x4b, 0x73, 0x6f, 0x74, 0xcc, + 0x53, 0x7b, 0x5e, 0x38, 0xbf, 0x4f, 0xbe, 0x92, 0xb2, 0x8a, 0x98, 0xa3, + 0xaf, 0xda, 0x89, 0x5d, 0x61, 0x95, 0x8c, 0x59, 0xb3, 0xb6, 0x72, 0x58, + 0x3c, 0x26, 0x4a, 0x6d, 0xd6, 0x72, 0xd2, 0xb8, 0x98, 0xbc, 0xbf, 0x51, + 0x95, 0xb5, 0x9e, 0xb3, 0x69, 0x51, 0x99, 0x37, 0x92, 0x72, 0x6a, 0x64, + 0x79, 0x76, 0x4f, 0xc1, 0x5a, 0x59, 0x53, 0x36, 0x4b, 0xd5, 0xba, 0x45, + 0xa2, 0x4f, 0x3a, 0x2a, 0x9e, 0x52, 0x44, 0x4f, 0xb9, 0x7e, 0x49, 0x8e, + 0xb5, 0x8b, 0xb4, 0xd2, 0xd4, 0xb9, 0x38, 0x6e, 0x93, 0xba, 0xd0, 0x3d, + 0x4f, 0x3b, 0x92, 0x66, 0xa7, 0x6c, 0xad, 0x65, 0x3b, 0xb8, 0x89, 0x67, + 0x81, 0xa9, 0x8f, 0x77, 0x71, 0x37, 0xbc, 0x42, 0xd6, 0xc1, 0x5a, 0x59, + 0x7d, 0x7d, 0xc8, 0x57, 0x7f, 0x4d, 0x4f, 0x46, 0xc5, 0x5d, 0xbf, 0x72, + 0x3b, 0xbc, 0x78, 0xce, 0x9c, 0x35, 0xb9, 0xcd, 0xac, 0x53, 0xca, 0xbc, + 0x93, 0x64, 0x4f, 0xc2, 0x66, 0xc6, 0x9d, 0xb6, 0x69, 0xbe, 0x89, 0xcf, + 0xbd, 0x3d, 0x50, 0xa4, 0x99, 0x98, 0x40, 0x48, 0xc3, 0x74, 0x77, 0x55, + 0xcd, 0xac, 0xb9, 0x6f, 0x8a, 0x40, 0x68, 0xb4, 0xb3, 0x7c, 0xc1, 0xa9, + 0xc5, 0x59, 0x74, 0xa0, 0xbd, 0xa8, 0x7d, 0xa9, 0x87, 0xa9, 0x89, 0x80, + 0x91, 0xbe, 0x9f, 0x9d, 0x94, 0xbb, 0x7e, 0x3a, 0xba, 0x57, 0xce, 0xae, + 0x79, 0xbc, 0x8c, 0xde, 0x91, 0xa7, 0x4c, 0xa1, 0x91, 0x9a, 0x3f, 0xbb, + 0xde, 0x41, 0x6c, 0x86, 0x34, 0x7d, 0xd0, 0x3d, 0x74, 0x38, 0x3e, 0x9e, + 
0x5a, 0xcf, 0xc1, 0x7a, 0x5e, 0x55, 0x4b, 0x9b, 0x3f, 0x92, 0x6f, 0x93, + 0x47, 0xc4, 0x95, 0xbe, 0xcc, 0x5d, 0x96, 0x60, 0x91, 0x4a, 0xca, 0xb6, + 0x58, 0x9a, 0x3f, 0xc3, 0xd1, 0xc1, 0xd0, 0xbf, 0xb1, 0x81, 0xc5, 0x5d, + 0x42, 0x99, 0x7c, 0xac, 0x47, 0x26, 0xbb, 0x51, 0x80, 0x8f, 0xc5, 0x72, + 0xc4, 0x9a, 0x7e, 0x44, 0x76, 0x55, 0xc3, 0x96, 0x53, 0x6f, 0x98, 0xd6, + 0xb5, 0x4d, 0x7d, 0x8f, 0x8e, 0x44, 0x99, 0x4e, 0x90, 0xb9, 0xae, 0xc9, + 0xcf, 0x77, 0xb4, 0x49, 0x59, 0xb7, 0x94, 0x52, 0x91, 0x7a, 0x7d, 0x49, + 0x70, 0x7d, 0x7f, 0x66, 0x32, 0x6e, 0xc3, 0x80, 0x59, 0xc3, 0x3c, 0x8b, + 0xc1, 0xb8, 0xbc, 0x81, 0xcd, 0x92, 0x7d, 0xcd, 0x5d, 0x70, 0x82, 0xb4, + 0x4b, 0x53, 0x6e, 0x8b, 0xb7, 0x81, 0x5a, 0xc6, 0x39, 0xa7, 0x61, 0xa5, + 0x70, 0xb2, 0x8d, 0x7a, 0xb4, 0x59, 0xaa, 0x38, 0xd6, 0x4c, 0x39, 0x87, + 0x6d, 0xcd, 0x3c, 0x54, 0x4d, 0x5f, 0xa9, 0x3e, 0x40, 0x9f, 0xc1, 0x37, + 0xd5, 0xa1, 0x45, 0xb1, 0x5a, 0x3f, 0x98, 0xd2, 0x93, 0xb2, 0xd0, 0x5b, + 0x6a, 0xb3, 0x7e, 0x9a, 0xa6, 0x77, 0xd6, 0x3e, 0xa5, 0xa0, 0x7d, 0x85, + 0x32, 0x75, 0xd3, 0x6e, 0x58, 0xcf, 0xa9, 0xa3, 0xa3, 0x76, 0x69, 0x76, + 0x71, 0xa9, 0x5f, 0xa7, 0x46, 0xa3, 0xa2, 0xba, 0xc0, 0x78, 0xb1, 0xd0, + 0xd2, 0x71, 0xc9, 0xbb, 0x6b, 0x3e, 0x3f, 0x7e, 0xc6, 0xb3, 0x7c, 0x62, + 0xc4, 0x83, 0x41, 0xbe, 0x53, 0xac, 0x63, 0xab, 0xb1, 0x29, 0x6d, 0x83, + 0xc1, 0x6d, 0x71, 0x6e, 0x90, 0x61, 0xc0, 0xa1, 0xd6, 0xc5, 0x99, 0xad, + 0xa1, 0xab, 0xae, 0x55, 0x54, 0x9f, 0x74, 0x9d, 0x5c, 0x8a, 0x67, 0x6b, + 0xc0, 0x80, 0x52, 0x80, 0xbb, 0x8a, 0x8e, 0x7b, 0x4f, 0x57, 0x8a, 0xc5, + 0x7b, 0x91, 0x4b, 0x68, 0x70, 0xd0, 0x76, 0xd1, 0x5e, 0x31, 0xac, 0x77, + 0x37, 0x5a, 0x35, 0x70, 0xa9, 0x87, 0xa7, 0x5b, 0x76, 0x8a, 0xbc, 0x8e, + 0x3a, 0x95, 0x31, 0x9f, 0xa3, 0x6e, 0x92, 0x8d, 0x67, 0xba, 0x85, 0xb2, + 0x9e, 0x7e, 0x51, 0xa5, 0x83, 0x91, 0x6d, 0x7e, 0x7d, 0x7c, 0x91, 0x85, + 0xbd, 0xa6, 0x9a, 0x9c, 0x64, 0xc1, 0xc6, 0xb9, 0x3b, 0x36, 0xc9, 0xc6, + 0x77, 0x97, 0xc4, 0xc5, 0xce, 0x8c, 0xba, 0x73, 0x54, 0x33, 0x60, 0x73, + 0xa7, 0xbd, 0xc8, 0x91, 0xaa, 0xbc, 0x4f, 0x76, 0x43, 0xbb, 0x3e, 0xb4, + 0x39, 0xca, 0xba, 0x4c, 0x71, 0x91, 0x63, 0x6d, 0x5b, 0x66, 0x62, 0xbd, + 0x77, 0x37, 0x32, 0xbc, 0x87, 0x70, 0x4a, 0x9b, 0x78, 0x93, 0xbf, 0x44, + 0x73, 0x3b, 0xcb, 0x3f, 0x83, 0x60, 0x86, 0xb0, 0xc1, 0xcc, 0x99, 0x70, + 0x33, 0xbf, 0x8f, 0x34, 0x41, 0xa3, 0x39, 0x42, 0x59, 0xc9, 0x8c, 0x93, + 0x4b, 0x54, 0x95, 0x74, 0xb8, 0x34, 0xb7, 0x6e, 0x5c, 0x6c, 0x63, 0xc1, + 0x8b, 0x58, 0xc4, 0x3a, 0x81, 0x75, 0x93, 0x45, 0x90, 0x90, 0x9a, 0x79, + 0xb0, 0xbe, 0x78, 0x97, 0xd4, 0xb6, 0xa6, 0x44, 0xbf, 0xa1, 0x38, 0xd1, + 0x43, 0x7a, 0x82, 0x62, 0x7f, 0x61, 0xc9, 0x4d, 0x40, 0x5d, 0x88, 0xa6, + 0x94, 0x4a, 0xaf, 0x82, 0x8e, 0xb5, 0xba, 0xb0, 0xbc, 0x40, 0x4d, 0x5e, + 0xc9, 0xcd, 0x73, 0xcc, 0x91, 0x36, 0x44, 0x73, 0x70, 0xb8, 0x88, 0x3a, + 0x97, 0x64, 0xa8, 0x92, 0xc5, 0xae, 0xc4, 0x51, 0xc0, 0xbe, 0x6d, 0x56, + 0x88, 0xa9, 0x62, 0x81, 0x79, 0x3b, 0x6c, 0x91, 0xd0, 0xbd, 0x32, 0x77, + 0x7a, 0xc9, 0x54, 0x46, 0xcd, 0xc0, 0xcc, 0x52, 0x6a, 0x88, 0xb3, 0x69, + 0x70, 0x59, 0x64, 0x8f, 0xb7, 0x96, 0x92, 0x70, 0x65, 0x97, 0x65, 0xa6, + 0xaa, 0x35, 0xb3, 0xb2, 0x94, 0x52, 0xb7, 0x58, 0x5d, 0xbe, 0x8f, 0xd0, + 0x56, 0x50, 0x89, 0x44, 0xa8, 0xae, 0x84, 0xa9, 0x39, 0xbc, 0x53, 0x66, + 0x82, 0x43, 0x4a, 0x45, 0x54, 0x78, 0xcd, 0xa2, 0x6f, 0x69, 0x35, 0x37, + 0xa5, 0x85, 0xc7, 0x44, 0x7c, 0xc0, 0x9a, 0x62, 0x9b, 0x69, 0xb4, 0x83, + 0x99, 0x9e, 0x86, 0x56, 0x8b, 0x68, 0x74, 0xb9, 0x89, 0x43, 0x4d, 0x74, + 0x63, 0x9a, 0x3f, 0x3f, 0x33, 0x35, 0x9b, 0x71, 0xbc, 0x9c, 0x4a, 0x80, + 
0x99, 0x83, 0x4e, 0x84, 0x60, 0x59, 0x8c, 0x8c, 0x7c, 0x51, 0x55, 0x46, + 0x75, 0x5c, 0xa2, 0xcb, 0x47, 0x50, 0x9d, 0x6a, 0xca, 0x31, 0x84, 0x43, + 0xa4, 0xaa, 0x5d, 0x41, 0x45, 0x70, 0x34, 0x85, 0x59, 0x56, 0xb7, 0x70, + 0x9a, 0xbd, 0xc7, 0x46, 0x47, 0x39, 0x5d, 0x3f, 0x6c, 0x7f, 0xb6, 0x75, + 0x9e, 0x5a, 0xbf, 0x41, 0x6b, 0x7a, 0x93, 0x51, 0x44, 0x4c, 0x96, 0xbc, + 0xc6, 0xd3, 0xbd, 0xc4, 0x8a, 0x4e, 0x53, 0xc5, 0xc6, 0x4c, 0x41, 0xb5, + 0x58, 0xb0, 0x38, 0x5d, 0x89, 0x9f, 0x48, 0x73, 0x5b, 0x76, 0x3e, 0x53, + 0x9e, 0xb1, 0x5c, 0x59, 0x9d, 0x44, 0x80, 0x87, 0x5b, 0xa3, 0x31, 0x9f, + 0xc1, 0x52, 0x45, 0x3f, 0x62, 0x96, 0x9e, 0xc8, 0xbe, 0xaa, 0x7b, 0xbc, + 0x4b, 0x53, 0xab, 0x84, 0xc0, 0x86, 0x35, 0x5e, 0x69, 0x39, 0xc0, 0xbe, + 0x5a, 0x4e, 0x99, 0xae, 0xc4, 0xca, 0x63, 0x49, 0x69, 0x4c, 0x58, 0x39, + 0xa6, 0x5b, 0xd0, 0x77, 0x6f, 0xb5, 0x9a, 0x7b, 0x9d, 0x46, 0xab, 0x98, + 0x93, 0x47, 0x7a, 0x88, 0x46, 0x97, 0x35, 0xaf, 0x55, 0x73, 0x35, 0x58, + 0x58, 0x35, 0x84, 0x80, 0x32, 0x75, 0x96, 0x7d, 0x8d, 0xbd, 0xc0, 0x75, + 0x4c, 0xa2, 0xcc, 0x4a, 0x34, 0x88, 0x6f, 0x51, 0x6d, 0x35, 0xa9, 0x46, + 0x71, 0xcc, 0x3e, 0xc5, 0xd3, 0x72, 0x66, 0xa1, 0x78, 0xc2, 0x5b, 0x6e, + 0x6e, 0xaf, 0x6d, 0x7d, 0xc4, 0x7f, 0x91, 0x34, 0x30, 0xb0, 0xae, 0x44, + 0x9b, 0x4b, 0x42, 0x48, 0xca, 0x7f, 0x90, 0xc7, 0x48, 0x36, 0x57, 0xce, + 0x92, 0x47, 0xbc, 0x73, 0x9c, 0x4b, 0x82, 0x7d, 0x3d, 0x5c, 0xc7, 0x38, + 0x42, 0x76, 0xa8, 0x78, 0xc5, 0x72, 0xc9, 0x65, 0xaf, 0x8a, 0x6c, 0x5a, + 0xaf, 0x58, 0x59, 0x9d, 0x6f, 0x86, 0x3d, 0x4a, 0xc2, 0x67, 0x95, 0x5f, + 0x85, 0xb3, 0xb1, 0x5e, 0xaf, 0x70, 0x77, 0x84, 0x9f, 0x34, 0x66, 0x56, + 0x34, 0xce, 0x71, 0x75, 0xab, 0xa5, 0x57, 0x95, 0x76, 0x5a, 0x75, 0x48, + 0xd3, 0x66, 0xc1, 0x47, 0x31, 0xb7, 0x9e, 0x72, 0x99, 0xc4, 0xb0, 0x8b, + 0xb3, 0x69, 0x53, 0x5f, 0x56, 0x87, 0x78, 0x71, 0xd4, 0xbf, 0xa8, 0x87, + 0x3d, 0xbf, 0xd1, 0xce, 0x32, 0x9d, 0x86, 0x77, 0xa5, 0x34, 0x95, 0xcc, + 0xc0, 0x4e, 0x53, 0xc3, 0x84, 0xa7, 0xaa, 0x88, 0xb8, 0x51, 0x99, 0xc9, + 0xbf, 0x50, 0xb9, 0x8d, 0x7b, 0x8c, 0x92, 0x42, 0xd2, 0x95, 0x43, 0x56, + 0xcb, 0x3a, 0xce, 0xcd, 0x77, 0xc1, 0x80, 0x97, 0x5c, 0x78, 0x51, 0x97, + 0xcb, 0x5e, 0xb3, 0x7a, 0xcd, 0x35, 0xc6, 0x58, 0x46, 0x89, 0x3d, 0x80, + 0x4b, 0x68, 0x76, 0x45, 0x3d, 0xa2, 0x43, 0x8e, 0x48, 0x79, 0x3a, 0x6c, + 0x88, 0x54, 0x7e, 0x79, 0x8d, 0x5a, 0x8d, 0xce, 0x3c, 0x82, 0x52, 0x38, + 0x92, 0xb0, 0xa6, 0x78, 0xc5, 0x6e, 0xc5, 0xb8, 0x73, 0x95, 0x78, 0x85, + 0x8d, 0x61, 0x79, 0xc1, 0x4f, 0x60, 0x67, 0x37, 0x69, 0x54, 0x45, 0x5e, + 0x91, 0x8e, 0xd1, 0x9c, 0xbb, 0xb0, 0x62, 0x7c, 0x3e, 0x6a, 0xcd, 0x97, + 0x7f, 0x4b, 0x7a, 0x58, 0x56, 0xc4, 0xca, 0x68, 0x77, 0x58, 0xa7, 0xab, + 0x49, 0x4a, 0xd3, 0x78, 0x68, 0xbb, 0xc4, 0x9b, 0xa0, 0x73, 0x92, 0x6c, + 0x91, 0x5d, 0x84, 0x68, 0x4b, 0x69, 0x3d, 0xa0, 0x88, 0x6b, 0x71, 0x32, + 0x7b, 0xb9, 0x5d, 0x73, 0xd1, 0x3e, 0x3b, 0x9a, 0xb3, 0xc2, 0xa3, 0xcb, + 0x82, 0x6f, 0x7e, 0x9c, 0xb5, 0x76, 0xa5, 0xb0, 0x43, 0x52, 0xaf, 0xb1, + 0xa8, 0x82, 0xb8, 0x90, 0x69, 0x4e, 0xae, 0x9c, 0x64, 0x5d, 0x9c, 0xb3, + 0x34, 0x40, 0x32, 0x70, 0xa1, 0x50, 0xb1, 0x47, 0x97, 0x9b, 0x87, 0x3b, + 0x64, 0x95, 0x42, 0x58, 0xa1, 0xa4, 0x38, 0x48, 0xa5, 0x99, 0xc3, 0x9f, + 0xba, 0x43, 0xcd, 0xb1, 0x53, 0xa6, 0xba, 0x64, 0x75, 0x37, 0xa6, 0xd6, + 0x72, 0x38, 0x7b, 0x91, 0x87, 0x8d, 0x9c, 0x49, 0x6c, 0x51, 0x52, 0x70, + 0x7e, 0xcb, 0xaf, 0xc1, 0x30, 0x88, 0x8f, 0x8f, 0x84, 0x3e, 0x81, 0x42, + 0x74, 0x51, 0xd0, 0x47, 0x80, 0x4b, 0x56, 0xac, 0x3e, 0x37, 0x44, 0x90, + 0x55, 0x31, 0xcd, 0xbe, 0x74, 0x80, 0x5d, 0x8f, 0xad, 0x72, 0xca, 0x79, + 
0xc6, 0x66, 0x77, 0x30, 0x3d, 0x6c, 0xae, 0x87, 0xc2, 0x88, 0x3b, 0x87, + 0x7a, 0x82, 0xaf, 0xca, 0xb0, 0x81, 0x7c, 0x50, 0xc1, 0xcf, 0xb2, 0xba, + 0xb8, 0xbf, 0x55, 0xaa, 0xb9, 0xab, 0xd5, 0xbe, 0x93, 0x7b, 0xa9, 0x3d, + 0x58, 0x78, 0x61, 0x77, 0x5f, 0x87, 0x64, 0xc8, 0x94, 0x62, 0xa9, 0x65, + 0x92, 0xc3, 0xc1, 0x79, 0x79, 0x33, 0x25, 0x2b, 0x52, 0xbd, 0x3b, 0x6b, + 0xda, 0xad, 0x71, 0xd0, 0xb7, 0x55, 0xa0, 0x47, 0x93, 0xb5, 0xa5, 0xbb, + 0x64, 0xc5, 0x57, 0xa5, 0xa6, 0x93, 0xce, 0xb9, 0x6b, 0x66, 0x4c, 0x97, + 0xbb, 0xa7, 0xcd, 0x60, 0x7c, 0x97, 0xa4, 0x64, 0x8a, 0x80, 0xb6, 0x95, + 0x5f, 0x53, 0x86, 0x7e, 0x57, 0x56, 0x7b, 0x4e, 0xcd, 0x53, 0xaf, 0x62, + 0xc4, 0x8e, 0x4f, 0xd6, 0x8e, 0xbf, 0x69, 0x9c, 0x73, 0xa0, 0x79, 0xa3, + 0x4f, 0x5d, 0x4e, 0x66, 0xb8, 0x4e, 0xb6, 0x9e, 0x33, 0xb2, 0x3a, 0x74, + 0x65, 0xd9, 0xb0, 0xb4, 0xc0, 0x56, 0xdf, 0x75, 0x83, 0x6b, 0x4e, 0x82, + 0xaa, 0xbe, 0x83, 0x59, 0xc9, 0xa6, 0xb3, 0x6e, 0xcc, 0xb9, 0xbf, 0xc1, + 0xc8, 0x7f, 0x7c, 0xad, 0x6c, 0xb3, 0xab, 0x52, 0xbe, 0xac, 0x50, 0x9c, + 0x7f, 0x76, 0x40, 0x5c, 0xaf, 0x85, 0x95, 0xcc, 0x38, 0x3e, 0x39, 0x75, + 0x41, 0x36, 0x60, 0xb5, 0xbd, 0xae, 0xac, 0x3a, 0x39, 0xdb, 0x4f, 0x93, + 0xbf, 0x98, 0x37, 0x53, 0x42, 0x62, 0xcf, 0xd0, 0xbc, 0xa6, 0x8d, 0x97, + 0xaa, 0xaa, 0x5c, 0x4a, 0x8d, 0xae, 0xbf, 0xb5, 0x5e, 0x77, 0x4c, 0x4a, + 0x9f, 0xd3, 0xc9, 0x92, 0x2e, 0x70, 0x3b, 0x44, 0xd1, 0xcc, 0x4d, 0xae, + 0x44, 0x90, 0xd2, 0x4b, 0x5f, 0x5c, 0xd3, 0x3a, 0x61, 0x96, 0x6b, 0x57, + 0xc8, 0xad, 0x3c, 0xc7, 0x3f, 0x73, 0x9a, 0x2e, 0x72, 0x43, 0x63, 0xca, + 0xad, 0x81, 0x7d, 0x8d, 0x98, 0x36, 0x77, 0x63, 0xb1, 0xce, 0x9d, 0x67, + 0x9d, 0x60, 0x41, 0xa2, 0x74, 0x6c, 0xa2, 0x99, 0x88, 0xc9, 0xb8, 0xc5, + 0x47, 0xd0, 0xcc, 0x86, 0x6c, 0x49, 0xcc, 0x94, 0x59, 0x39, 0x40, 0x6f, + 0x96, 0x7a, 0xd5, 0xd7, 0xa0, 0x75, 0xaa, 0xb0, 0x8c, 0x42, 0x73, 0x46, + 0x9b, 0xa6, 0xd3, 0xb1, 0x77, 0xa4, 0x57, 0x73, 0x61, 0xa6, 0xaa, 0xc0, + 0x62, 0x87, 0x7c, 0xb9, 0x84, 0xcb, 0x4e, 0x9d, 0x87, 0x8a, 0x4b, 0x96, + 0x59, 0xc6, 0x97, 0xca, 0x5e, 0x8b, 0x74, 0x67, 0x62, 0x4d, 0xaa, 0xcf, + 0xb5, 0x5b, 0x40, 0xb4, 0x48, 0xa1, 0x80, 0xb2, 0xb9, 0x58, 0xab, 0xb8, + 0x37, 0x61, 0xa5, 0xb8, 0x6f, 0x60, 0x82, 0x85, 0xa4, 0x45, 0xae, 0x91, + 0x90, 0x7c, 0x95, 0x82, 0x9a, 0x9c, 0x99, 0x73, 0x85, 0x4d, 0x84, 0x76, + 0x7d, 0xa5, 0x57, 0xb7, 0x8e, 0xd3, 0x69, 0x6b, 0x89, 0x37, 0x50, 0xc4, + 0xcc, 0x7a, 0x4c, 0xb9, 0x9a, 0x4e, 0x8b, 0x3b, 0x99, 0x82, 0xc0, 0x59, + 0xc2, 0x59, 0x82, 0x41, 0xb0, 0x44, 0x7e, 0x75, 0x92, 0xa8, 0xc9, 0x82, + 0x52, 0xbb, 0xc0, 0xae, 0xcf, 0xb4, 0x68, 0x77, 0x9f, 0x87, 0x7b, 0xca, + 0xc8, 0x55, 0xcf, 0x83, 0xcf, 0x96, 0x90, 0x79, 0x4e, 0x4c, 0x5d, 0x4b, + 0xaa, 0xbc, 0x99, 0xba, 0x3c, 0x9f, 0x9f, 0x80, 0x51, 0x85, 0x45, 0xbc, + 0x6c, 0x66, 0xb7, 0x75, 0x5c, 0x77, 0x7e, 0x4b, 0x99, 0x6a, 0xd1, 0xae, + 0xcb, 0x8f, 0x82, 0xc7, 0xc6, 0x9d, 0xca, 0x88, 0xc8, 0x42, 0xcd, 0x43, + 0xa7, 0xbc, 0x5d, 0x82, 0x3f, 0x6c, 0x41, 0xcf, 0x3b, 0x74, 0x9a, 0x61, + 0x30, 0x54, 0x96, 0x86, 0x88, 0x4b, 0x59, 0x69, 0xc6, 0x93, 0x43, 0xc8, + 0x4c, 0xcd, 0x87, 0x30, 0x3a, 0xb9, 0x4d, 0xa9, 0xd3, 0xb6, 0xa6, 0xaf, + 0xa1, 0xa6, 0x6d, 0x6d, 0x31, 0xa1, 0x5c, 0x97, 0xbe, 0x38, 0xcf, 0x73, + 0xbe, 0xa4, 0x2b, 0x6c, 0x8b, 0x58, 0xc8, 0xc5, 0xa4, 0xaa, 0x67, 0xc3, + 0x3e, 0x4a, 0x7f, 0x37, 0x6a, 0x67, 0x5d, 0x97, 0x7a, 0xa2, 0x53, 0xd0, + 0x67, 0x46, 0x7b, 0x8b, 0x71, 0xaa, 0xb1, 0xc7, 0x93, 0x62, 0x2d, 0x40, + 0xd4, 0x63, 0x55, 0xc0, 0x65, 0x9c, 0x56, 0xc7, 0x6f, 0xa3, 0xc2, 0x5a, + 0xa4, 0x9d, 0x64, 0x5e, 0xac, 0x8e, 0xbd, 0x4b, 0x75, 0xc3, 0x8f, 0x6c, + 
0xba, 0x41, 0xb6, 0x89, 0x36, 0xa0, 0x40, 0xc2, 0xac, 0xc4, 0xc0, 0x45, + 0xa0, 0x3f, 0x82, 0x95, 0xbb, 0x58, 0x70, 0x66, 0x56, 0x6f, 0x74, 0xb2, + 0x83, 0xa3, 0xae, 0x66, 0x7d, 0x60, 0xc8, 0xc3, 0x51, 0x7f, 0x3b, 0xd7, + 0x82, 0x4d, 0x3c, 0xbe, 0x97, 0x33, 0x9d, 0xb2, 0xa5, 0x96, 0xa4, 0x7a, + 0xaa, 0x9f, 0xbf, 0x8f, 0x6b, 0x40, 0xc8, 0x73, 0x3e, 0x6a, 0x8b, 0x41, + 0x6c, 0x65, 0x6c, 0x78, 0x50, 0x45, 0x84, 0xc3, 0x64, 0xad, 0xa4, 0x42, + 0x86, 0x78, 0x4e, 0x8a, 0xa7, 0x88, 0x38, 0x58, 0xa7, 0xc3, 0xc1, 0xae, + 0x6c, 0x5a, 0x33, 0x42, 0xca, 0x69, 0x63, 0x57, 0x9d, 0xbf, 0x75, 0xaf, + 0x6f, 0x4e, 0x38, 0x90, 0x40, 0xb4, 0x89, 0x67, 0xc0, 0xa9, 0x9f, 0x73, + 0x44, 0x96, 0x56, 0x79, 0x33, 0x8d, 0x83, 0xab, 0x9a, 0x4c, 0x95, 0xa2, + 0x9d, 0x4d, 0xa4, 0x93, 0x6e, 0xa0, 0x48, 0xb8, 0x9d, 0xad, 0x76, 0xad, + 0x5e, 0x79, 0x3d, 0xa4, 0x82, 0xcc, 0xa2, 0x96, 0xd1, 0x9d, 0x84, 0x67, + 0xa9, 0x95, 0x9a, 0xa6, 0x9c, 0xd4, 0x71, 0x59, 0xa5, 0xd0, 0xaf, 0x66, + 0x86, 0x54, 0x79, 0x91, 0xbc, 0x48, 0xbb, 0x96, 0xce, 0xb9, 0x51, 0x40, + 0x9e, 0x75, 0x33, 0xd4, 0x97, 0xbe, 0xb1, 0xca, 0x35, 0x6d, 0x85, 0xaa, + 0xc2, 0x6b, 0x57, 0xbd, 0x92, 0x39, 0x58, 0x65, 0x79, 0x3f, 0xa0, 0xac, + 0xac, 0x53, 0x73, 0x7f, 0x72, 0x74, 0x4d, 0x84, 0x7c, 0xc0, 0x74, 0x69, + 0xae, 0x32, 0x39, 0x77, 0x4b, 0x8c, 0xb4, 0x61, 0xb7, 0x64, 0x38, 0xc4, + 0x8e, 0xa7, 0xa4, 0x75, 0x48, 0xbe, 0x6d, 0x8f, 0xd3, 0x5f, 0x51, 0x61, + 0xb5, 0xab, 0xb3, 0x33, 0x78, 0xad, 0xb1, 0x4d, 0xa0, 0xc7, 0x89, 0x9c, + 0x40, 0x32, 0x4c, 0x48, 0x50, 0x3b, 0x92, 0xc5, 0x7f, 0x67, 0xc8, 0x55, + 0xc3, 0xbf, 0x6c, 0x92, 0x93, 0x8d, 0xa8, 0x76, 0x3e, 0x4b, 0x86, 0x87, + 0x46, 0xbd, 0xa8, 0xc5, 0x3a, 0x50, 0x4e, 0x89, 0x59, 0x84, 0x62, 0xc4, + 0xd6, 0xc5, 0x95, 0x6e, 0xc2, 0x83, 0xb6, 0x5a, 0xd1, 0x3b, 0x68, 0xb6, + 0x3b, 0x87, 0x9f, 0x7a, 0xa9, 0xa9, 0x56, 0x67, 0x76, 0x35, 0xca, 0x4e, + 0x7a, 0x71, 0x3d, 0xd3, 0xc1, 0x91, 0x72, 0xa7, 0xaa, 0xab, 0x92, 0x76, + 0xa5, 0x69, 0x54, 0x87, 0x52, 0x51, 0x8d, 0xb7, 0xb6, 0x80, 0xae, 0xb8, + 0x84, 0xb5, 0x3d, 0xa0, 0x86, 0x72, 0xa8, 0x54, 0x42, 0x36, 0x8c, 0xb9, + 0x58, 0x9f, 0x77, 0xb5, 0x3b, 0xbb, 0xc2, 0x62, 0x87, 0x39, 0x40, 0x32, + 0x3c, 0x6a, 0x47, 0x5c, 0x8d, 0x80, 0x38, 0x7e, 0x72, 0x72, 0x7e, 0x6d, + 0x35, 0x7b, 0xab, 0x4c, 0xba, 0x9d, 0xa0, 0x38, 0xc4, 0x47, 0x7d, 0x79, + 0xbf, 0x35, 0xd2, 0xca, 0xa1, 0xb2, 0xbf, 0x82, 0x93, 0x8c, 0xd7, 0xbb, + 0x80, 0x3c, 0x78, 0xa8, 0xc0, 0xc7, 0x98, 0x6a, 0x93, 0x81, 0x78, 0xc7, + 0xac, 0xca, 0x6e, 0xb7, 0x38, 0xcf, 0x74, 0x7e, 0x76, 0x70, 0xd3, 0xba, + 0x64, 0xd1, 0xbd, 0x6d, 0x8e, 0x6d, 0xb3, 0x9a, 0xbc, 0x90, 0x4a, 0x46, + 0x8f, 0x9e, 0x5f, 0xa6, 0x6d, 0x7e, 0xb5, 0x5d, 0x60, 0xbc, 0xa4, 0x41, + 0x3f, 0x7d, 0x51, 0x8f, 0xae, 0xae, 0xb2, 0x45, 0xb1, 0x81, 0x96, 0xa5, + 0x86, 0x3d, 0x95, 0xd5, 0xc7, 0xce, 0x5c, 0x4b, 0x63, 0x5b, 0xb0, 0x54, + 0x83, 0xa5, 0xcc, 0x7a, 0x4d, 0x63, 0xbc, 0x67, 0xbe, 0xce, 0x63, 0x9d, + 0x80, 0xcc, 0x31, 0x53, 0x85, 0xdb, 0x55, 0x8d, 0xd5, 0xcd, 0xc9, 0x9f, + 0x85, 0xb3, 0xb1, 0xac, 0x66, 0x45, 0x3b, 0x8f, 0xac, 0xa6, 0x35, 0xcb, + 0x7a, 0x35, 0x45, 0xa9, 0xb5, 0x39, 0x73, 0xcb, 0x5b, 0x3a, 0x39, 0x9a, + 0x65, 0x80, 0xb2, 0xab, 0x67, 0x62, 0x7c, 0x7e, 0x47, 0xaa, 0x82, 0xbe, + 0x86, 0xcb, 0xb1, 0x52, 0x3a, 0xa7, 0x98, 0x61, 0x62, 0x93, 0x6e, 0xaf, + 0x6f, 0x70, 0x84, 0x84, 0x55, 0x9b, 0xbf, 0x7e, 0x6e, 0x89, 0xa9, 0x86, + 0xbc, 0x84, 0x75, 0xc4, 0x81, 0x7c, 0x71, 0x37, 0xa0, 0x74, 0xca, 0xbc, + 0xbf, 0x47, 0x90, 0x71, 0x65, 0x30, 0x40, 0xc4, 0xce, 0x35, 0x49, 0xc0, + 0x3e, 0x78, 0x79, 0x8c, 0xc8, 0x60, 0xa8, 0xce, 0x46, 0x85, 0x44, 0x3a, + 
0x81, 0xc6, 0xba, 0x35, 0xb3, 0xbe, 0x5c, 0x78, 0x2d, 0x8a, 0xc8, 0xab, + 0x31, 0x8e, 0x51, 0xc1, 0x72, 0xa9, 0x87, 0x4c, 0x5a, 0xa9, 0x3e, 0xbd, + 0x96, 0x5c, 0x59, 0x73, 0xcc, 0xc4, 0x76, 0x6e, 0x51, 0xb4, 0x64, 0xa9, + 0xb4, 0x57, 0x9b, 0xc3, 0x83, 0xd5, 0x42, 0xd6, 0x98, 0x59, 0xb4, 0xb6, + 0xb8, 0x36, 0x56, 0xcc, 0x7f, 0x63, 0xa4, 0x55, 0x63, 0x52, 0x3f, 0xa1, + 0x79, 0x68, 0x41, 0x3f, 0x84, 0xad, 0x91, 0xd7, 0x5b, 0x5d, 0x87, 0xbf, + 0x62, 0x92, 0x55, 0x42, 0xb2, 0x2f, 0x5f, 0x82, 0x46, 0x58, 0x4a, 0x7e, + 0xc1, 0x35, 0x4d, 0x45, 0x60, 0x9a, 0x3f, 0xae, 0x65, 0x83, 0x64, 0x30, + 0x9c, 0xbd, 0x94, 0xcf, 0x35, 0x8e, 0x4a, 0xbb, 0xda, 0xcc, 0x3a, 0x69, + 0x9d, 0xc3, 0x6a, 0x8d, 0xa6, 0x6f, 0x93, 0xc5, 0x34, 0x4a, 0x90, 0x67, + 0x6e, 0x6c, 0x72, 0x6a, 0xbf, 0x64, 0xc2, 0xc5, 0xb8, 0xa9, 0x7e, 0xd6, + 0x98, 0x61, 0x96, 0x5f, 0xbe, 0x9f, 0x53, 0x35, 0xcb, 0x4c, 0x3e, 0xd5, + 0xa3, 0x4a, 0xa5, 0xad, 0x4f, 0x67, 0xa4, 0xb3, 0x32, 0xcc, 0xb4, 0x8c, + 0x9b, 0x7e, 0x4a, 0x52, 0x71, 0x3f, 0x83, 0x4b, 0xb3, 0xa7, 0x47, 0xa7, + 0x75, 0x68, 0xa3, 0x32, 0x57, 0x28, 0x89, 0x5d, 0x3b, 0x2a, 0x3d, 0xc8, + 0xaa, 0x75, 0xd4, 0x91, 0x7f, 0x4a, 0x6e, 0x58, 0x53, 0xad, 0xb9, 0x73, + 0xc9, 0x79, 0x3c, 0xa1, 0x4d, 0x5c, 0x74, 0xcc, 0x69, 0x6f, 0x3e, 0x6a, + 0x4d, 0x48, 0x97, 0x73, 0xb6, 0x8a, 0xc1, 0x54, 0xb8, 0x6b, 0xb8, 0xd0, + 0x35, 0x6c, 0x99, 0x51, 0xc2, 0x68, 0x88, 0x67, 0x65, 0x8b, 0x5c, 0x37, + 0xd4, 0x99, 0x6b, 0x3f, 0x89, 0x37, 0x5a, 0x36, 0x8c, 0x73, 0x89, 0x3a, + 0xab, 0x50, 0x98, 0x96, 0x64, 0xc4, 0x8e, 0xa3, 0x57, 0x96, 0x67, 0x74, + 0x58, 0x5a, 0x63, 0xd6, 0x88, 0xd0, 0x82, 0xa2, 0x88, 0x74, 0xa1, 0x39, + 0x9e, 0x77, 0x34, 0x3e, 0x4e, 0x96, 0x7a, 0x4b, 0xbc, 0xc0, 0x3a, 0xc1, + 0xab, 0x58, 0xc0, 0x73, 0xa8, 0x7c, 0xa8, 0x7e, 0x45, 0xa1, 0x84, 0x61, + 0xaa, 0x39, 0x7f, 0x9d, 0x54, 0x47, 0xc5, 0x99, 0xbf, 0xb8, 0x97, 0x3e, + 0x9a, 0xaf, 0x3e, 0xa7, 0x71, 0x5f, 0x34, 0xc2, 0x43, 0x91, 0xd1, 0xa9, + 0x50, 0xb1, 0x4f, 0xb4, 0x4a, 0x78, 0x3c, 0x8f, 0x7a, 0x5b, 0xbb, 0x8a, + 0x6d, 0xb3, 0x41, 0x5e, 0x63, 0xcd, 0x3f, 0x3e, 0x51, 0x81, 0x89, 0x99, + 0x24, 0xce, 0x5b, 0x9d, 0x51, 0xbb, 0x53, 0x99, 0xc2, 0x4e, 0x97, 0x4f, + 0x94, 0xb2, 0x70, 0xcc, 0x75, 0xb1, 0x9f, 0x81, 0x69, 0x53, 0x6d, 0x5c, + 0xc5, 0x43, 0x4a, 0x9a, 0x4f, 0x87, 0xa0, 0x74, 0xce, 0xbb, 0xbc, 0x46, + 0xd4, 0x86, 0x6a, 0x8b, 0x64, 0xa2, 0x56, 0x76, 0x98, 0x56, 0x80, 0x54, + 0x8a, 0x6b, 0xa0, 0xb8, 0x6d, 0x68, 0x88, 0x46, 0xc5, 0xb4, 0x50, 0x92, + 0x8e, 0x64, 0x5c, 0x28, 0x93, 0x8c, 0xa9, 0xad, 0x35, 0x94, 0xb5, 0x62, + 0xbc, 0xd2, 0xd1, 0x85, 0xa9, 0xd4, 0x65, 0x77, 0x95, 0x74, 0xbe, 0x2e, + 0x89, 0xc5, 0x42, 0x8a, 0xa4, 0x73, 0x3a, 0x56, 0x32, 0xba, 0xa2, 0x6e, + 0xa3, 0x37, 0xc5, 0x93, 0xa1, 0x6a, 0x41, 0x69, 0x74, 0x8c, 0x5a, 0x4d, + 0x66, 0x7d, 0x8b, 0x49, 0xaa, 0x47, 0xb5, 0x78, 0x60, 0x62, 0x7e, 0x4b, + 0xa9, 0xd4, 0x90, 0xc2, 0x9e, 0xa2, 0x3e, 0xad, 0xd9, 0x6b, 0x8f, 0x61, + 0xa3, 0x83, 0xb5, 0xd5, 0x92, 0x6f, 0x3b, 0x9b, 0xcd, 0x80, 0x75, 0x53, + 0xc1, 0xca, 0x96, 0x49, 0x61, 0x72, 0x7a, 0x83, 0xd0, 0x3a, 0x6e, 0xa8, + 0x9e, 0x9c, 0xd0, 0x84, 0x80, 0x5f, 0x67, 0x5a, 0x7f, 0x82, 0x9a, 0x4d, + 0xcb, 0x76, 0x5d, 0x85, 0x77, 0xc5, 0x8e, 0x6c, 0x9d, 0xad, 0x4c, 0x4f, + 0xcc, 0x76, 0xc3, 0x71, 0xc1, 0x9c, 0xc1, 0x3c, 0x3f, 0x62, 0x75, 0x67, + 0xbf, 0xcf, 0x9e, 0x9d, 0x7b, 0xd0, 0x54, 0x92, 0x74, 0xc1, 0x2a, 0x80, + 0x39, 0x5b, 0x42, 0x5b, 0x48, 0xb9, 0xcd, 0x50, 0xda, 0xae, 0x60, 0x48, + 0x6a, 0x3b, 0xa8, 0xae, 0xcd, 0x9e, 0xce, 0x48, 0xa6, 0x42, 0x42, 0x7c, + 0x50, 0x8c, 0xab, 0x90, 0x8d, 0x46, 0x96, 0xd6, 0x92, 0x56, 0xaa, 0x62, + 
0x36, 0x7c, 0xca, 0xc4, 0x4d, 0x4c, 0x41, 0x60, 0x71, 0xd0, 0x95, 0xbd, + 0x82, 0x78, 0x49, 0xce, 0x67, 0x34, 0x3e, 0x97, 0x59, 0x3b, 0x7a, 0xbe, + 0x9f, 0x3c, 0x3b, 0x49, 0x66, 0x90, 0x5e, 0x64, 0x63, 0x4c, 0x90, 0x4a, + 0xcd, 0x9e, 0x91, 0x96, 0x81, 0x38, 0xc0, 0x61, 0x55, 0xd3, 0x37, 0xa7, + 0x6f, 0x8e, 0x99, 0x38, 0x78, 0x4a, 0x59, 0x45, 0x9f, 0xba, 0x9b, 0x47, + 0x63, 0xa9, 0xcd, 0x73, 0x3f, 0xb2, 0xce, 0x72, 0x9b, 0x57, 0x4a, 0xcd, + 0x74, 0x88, 0x48, 0x51, 0xb7, 0xab, 0x8e, 0x89, 0xaa, 0x93, 0x63, 0x5c, + 0x6e, 0xc6, 0x6b, 0x8c, 0x50, 0x5c, 0x89, 0xbb, 0x8b, 0x8f, 0x7e, 0x66, + 0x4d, 0x59, 0x4b, 0x8b, 0x6a, 0x6c, 0x72, 0xae, 0xca, 0xc3, 0x8b, 0x7f, + 0x98, 0xb2, 0x52, 0x93, 0x60, 0x5f, 0x83, 0xcc, 0x58, 0xcd, 0xa0, 0xc0, + 0x82, 0x2c, 0x41, 0xd4, 0xa5, 0x4c, 0x5b, 0x98, 0x36, 0xcf, 0xb3, 0x66, + 0x5f, 0x4e, 0x99, 0x76, 0xb1, 0x63, 0xc6, 0x47, 0x70, 0xd0, 0x5e, 0x87, + 0xa4, 0x32, 0x3f, 0x82, 0x50, 0xa5, 0xd9, 0x2c, 0x7f, 0x91, 0xc2, 0x2c, + 0x94, 0x66, 0xb3, 0x41, 0xc5, 0xd5, 0x72, 0x80, 0xbb, 0x76, 0xc6, 0x44, + 0xc4, 0xbb, 0x3a, 0x46, 0xb5, 0x49, 0x9b, 0x86, 0x6d, 0x75, 0xa5, 0xa8, + 0x43, 0x7b, 0xc6, 0x97, 0xc0, 0xa1, 0x42, 0x8d, 0xa9, 0x95, 0x51, 0xb0, + 0x4c, 0x44, 0x9f, 0x9e, 0x6a, 0x5f, 0x71, 0x62, 0x70, 0x68, 0x6a, 0xc3, + 0xda, 0xb0, 0x35, 0x83, 0x35, 0xce, 0x67, 0x50, 0x52, 0x33, 0x60, 0xc1, + 0x67, 0xa7, 0x6e, 0x5c, 0xb0, 0x99, 0xa2, 0x5f, 0xb4, 0x85, 0x9d, 0xa6, + 0xa2, 0x60, 0x6d, 0x45, 0x9e, 0x76, 0xd3, 0x80, 0x91, 0x7b, 0xbb, 0x96, + 0x4a, 0xb7, 0x76, 0xac, 0xcf, 0x7c, 0x9f, 0xb1, 0x40, 0x3d, 0xba, 0xbf, + 0x81, 0x68, 0xc1, 0xb4, 0x98, 0xc0, 0x4d, 0x2e, 0x6c, 0x8a, 0xd0, 0x99, + 0x92, 0xa6, 0x5b, 0xb3, 0x53, 0x93, 0x63, 0x88, 0xa8, 0x3c, 0x83, 0x90, + 0x52, 0x89, 0x4b, 0x4a, 0xce, 0x3e, 0xa7, 0x99, 0x62, 0x6f, 0x5d, 0x63, + 0x3c, 0x8f, 0x83, 0xac, 0xb4, 0x64, 0xc6, 0x9c, 0xbf, 0x7a, 0x92, 0xa4, + 0x34, 0xc0, 0x93, 0xbc, 0x39, 0xb8, 0x67, 0xc9, 0x74, 0xc4, 0x84, 0xb3, + 0x7c, 0x94, 0xc5, 0xcb, 0x97, 0x7b, 0x6e, 0xbb, 0xad, 0x45, 0xcf, 0xa2, + 0x41, 0x6c, 0x92, 0xa9, 0x75, 0x4e, 0xe0, 0xa6, 0x84, 0xcd, 0x42, 0x9e, + 0x2d, 0x7c, 0x59, 0xa0, 0xc6, 0xb7, 0x3a, 0xa0, 0x89, 0x41, 0x2e, 0xdd, + 0x90, 0x7b, 0x4f, 0x6c, 0x78, 0x6b, 0x75, 0x95, 0xca, 0x58, 0xa1, 0xad, + 0xb6, 0xa3, 0xbb, 0x81, 0xe4, 0x6a, 0x6e, 0x8e, 0x6f, 0x31, 0xcc, 0x5b, + 0xba, 0x9f, 0x7e, 0xd0, 0xad, 0x94, 0x85, 0x77, 0x7d, 0x4b, 0x79, 0x5f, + 0xcb, 0x82, 0x58, 0xb7, 0xaa, 0xc5, 0x84, 0x93, 0x9a, 0xb3, 0xc9, 0x3b, + 0xcd, 0xc8, 0x3f, 0x96, 0xcf, 0x31, 0x44, 0x67, 0x89, 0xa8, 0xc5, 0xb4, + 0x6b, 0xc4, 0xd0, 0x97, 0x47, 0x5a, 0xd4, 0xa4, 0x39, 0x9e, 0xa9, 0xb7, + 0x3d, 0x62, 0x6b, 0x9b, 0xaf, 0x7c, 0xba, 0xa0, 0x50, 0x61, 0x38, 0x5b, + 0xcd, 0x9b, 0x7c, 0x7b, 0x5d, 0x97, 0x8d, 0xbf, 0x60, 0xbe, 0xb3, 0x4f, + 0xc5, 0x80, 0xbd, 0xcd, 0x84, 0x8f, 0xcb, 0x4d, 0x81, 0x79, 0xca, 0x6c, + 0x60, 0xbb, 0x53, 0x91, 0xa2, 0x72, 0x64, 0xb6, 0x70, 0x4f, 0xd8, 0xc6, + 0x38, 0x32, 0xac, 0x53, 0x3e, 0xaf, 0x7e, 0x88, 0x48, 0x3f, 0xd7, 0x3f, + 0x5f, 0xda, 0xb5, 0xd6, 0x48, 0xaf, 0xd6, 0x9c, 0x49, 0x8c, 0x66, 0xa2, + 0x4f, 0x5b, 0x41, 0x6a, 0xa9, 0x35, 0xcb, 0x31, 0x3c, 0x73, 0x69, 0x85, + 0x9d, 0xdd, 0x51, 0x39, 0xc1, 0x97, 0xc0, 0xa1, 0xc6, 0x9b, 0x42, 0x74, + 0x7a, 0x9f, 0x8d, 0xbc, 0xc6, 0x4f, 0x8d, 0x63, 0x50, 0xc1, 0x74, 0x9a, + 0x95, 0xc2, 0x73, 0xd4, 0xb9, 0x86, 0x5f, 0x57, 0x45, 0x38, 0x4e, 0x2c, + 0xcb, 0x8b, 0xd9, 0x71, 0x4d, 0x63, 0x76, 0x7f, 0x7c, 0x77, 0xb7, 0xa7, + 0x58, 0x8f, 0x90, 0x63, 0x46, 0x37, 0x67, 0x9c, 0xc3, 0xcb, 0x63, 0x5d, + 0xb4, 0x9e, 0x49, 0xa1, 0x4b, 0xdc, 0x67, 0x6b, 0x88, 0x54, 0xc3, 0x34, + 
0x75, 0x88, 0x89, 0x6a, 0x3c, 0x5a, 0x5f, 0xca, 0x6c, 0x4f, 0xc3, 0xc8, + 0x46, 0x79, 0xc5, 0x6a, 0x52, 0x5d, 0x4c, 0x6b, 0x89, 0x46, 0xcc, 0xd5, + 0x5e, 0x52, 0xbe, 0x47, 0xd5, 0xb2, 0x55, 0x7b, 0x43, 0xb9, 0x92, 0xc1, + 0x67, 0x49, 0xbd, 0xa9, 0x53, 0x6f, 0x72, 0x75, 0x1e, 0x90, 0xc3, 0xa2, + 0x46, 0x81, 0xdc, 0x6e, 0x5c, 0xbd, 0x97, 0x7c, 0x6c, 0x63, 0x57, 0xd4, + 0xb0, 0xb8, 0x8f, 0xcf, 0x51, 0x59, 0xcd, 0x3d, 0x43, 0x45, 0xbf, 0x40, + 0xbb, 0xb3, 0xd0, 0x86, 0xbd, 0x72, 0x3a, 0xd6, 0xae, 0xc5, 0x5b, 0x8d, + 0x98, 0x9e, 0x9a, 0x87, 0x8e, 0x9f, 0xce, 0x60, 0x95, 0x2e, 0x70, 0xa6, + 0xa5, 0x34, 0x41, 0xd1, 0xa8, 0xa3, 0x9b, 0x88, 0xc4, 0xcd, 0xc2, 0x4a, + 0x96, 0x3f, 0x4a, 0xa8, 0x94, 0x6e, 0x78, 0xab, 0x68, 0x6b, 0x84, 0xa6, + 0x46, 0x40, 0x9b, 0x9b, 0xde, 0xa0, 0x75, 0xd3, 0x55, 0x5e, 0x98, 0x4a, + 0x39, 0x45, 0x9b, 0x76, 0x64, 0xcd, 0x50, 0x58, 0x48, 0xc1, 0x7d, 0x3d, + 0x94, 0x9e, 0x4a, 0xa7, 0xbf, 0xbf, 0x7e, 0x84, 0x83, 0x56, 0x5c, 0xa6, + 0x39, 0x51, 0xab, 0x53, 0xce, 0x61, 0xc3, 0x59, 0x46, 0x80, 0xa5, 0xdb, + 0xae, 0x86, 0xc4, 0x56, 0x88, 0x79, 0x80, 0x5a, 0xaa, 0xbd, 0xc2, 0x38, + 0x75, 0x5c, 0xc6, 0xe5, 0xcf, 0xc7, 0xa0, 0x3d, 0xb5, 0x72, 0x7a, 0xc4, + 0x32, 0x57, 0x4e, 0xd8, 0x8f, 0x46, 0x97, 0x53, 0xb7, 0x54, 0x59, 0x3c, + 0x6c, 0xa5, 0x93, 0x84, 0x8e, 0xd6, 0x59, 0x8f, 0xc5, 0x3f, 0x70, 0x4b, + 0x50, 0x8f, 0xbc, 0x52, 0x45, 0x3c, 0xd2, 0xbd, 0xd9, 0x93, 0xc5, 0xa1, + 0x6e, 0xb1, 0xa0, 0x7f, 0x81, 0xc2, 0x2a, 0x54, 0x9c, 0x42, 0x8f, 0xbc, + 0xa3, 0xad, 0x7a, 0xc1, 0x99, 0x4f, 0x69, 0x43, 0x65, 0xe0, 0x3e, 0xa4, + 0xaa, 0x6e, 0xd1, 0xcb, 0xc0, 0x2d, 0x40, 0xa6, 0xd2, 0xb9, 0xd5, 0xaa, + 0xa9, 0x86, 0xc2, 0xb9, 0x93, 0x9b, 0x46, 0x55, 0x7c, 0x89, 0x75, 0x38, + 0xaf, 0xbf, 0xb0, 0xc5, 0x8c, 0x45, 0x85, 0x64, 0xd3, 0x58, 0x42, 0x82, + 0xd1, 0xb4, 0xc9, 0x52, 0x91, 0xcf, 0x6c, 0xbd, 0x3d, 0x67, 0xc7, 0x92, + 0x8e, 0x94, 0xd8, 0x9b, 0x52, 0x94, 0xac, 0x87, 0x57, 0x6e, 0x82, 0x5a, + 0x74, 0x41, 0x6e, 0x97, 0xb8, 0x7e, 0x9f, 0xc7, 0xa0, 0x3d, 0x43, 0x73, + 0x96, 0x4a, 0x86, 0x89, 0x70, 0xb2, 0x36, 0x1d, 0x78, 0x8c, 0xcd, 0x40, + 0x76, 0x81, 0x79, 0x78, 0x34, 0x31, 0xe6, 0x64, 0x3a, 0xbf, 0x97, 0x8a, + 0x38, 0x27, 0x62, 0x91, 0x84, 0x5f, 0xef, 0xdd, 0x5b, 0xaa, 0x8d, 0x71, + 0x43, 0xd9, 0xcf, 0x85, 0x4e, 0x8d, 0xce, 0x48, 0x56, 0x46, 0x49, 0x82, + 0xc0, 0x79, 0x46, 0x47, 0x91, 0x3d, 0x6f, 0x70, 0xb3, 0xcb, 0x52, 0xb1, + 0x70, 0xa6, 0x93, 0xcd, 0x37, 0xa2, 0x65, 0x2e, 0x2e, 0x91, 0x9c, 0x79, + 0x41, 0x9e, 0xaa, 0x5d, 0x97, 0x51, 0xa5, 0xd8, 0xbb, 0x74, 0x8d, 0xd7, + 0x37, 0x75, 0x75, 0x59, 0x3a, 0x94, 0x75, 0x92, 0xdb, 0x8a, 0x42, 0x87, + 0xd4, 0xcd, 0xcd, 0xa5, 0xc4, 0xbb, 0x9e, 0x8b, 0x6a, 0x46, 0x76, 0x29, + 0xd1, 0xa7, 0x6d, 0x49, 0xb0, 0x8f, 0x83, 0xa7, 0x48, 0x90, 0xc1, 0xcd, + 0x36, 0xb4, 0x77, 0x37, 0xbe, 0x62, 0x8f, 0x97, 0x6d, 0xc5, 0xbf, 0x7b, + 0x62, 0x75, 0x65, 0x48, 0xb6, 0x71, 0x54, 0xa4, 0xc2, 0xd2, 0x7f, 0x61, + 0xc6, 0x64, 0xa8, 0x55, 0x54, 0xaf, 0x4e, 0x42, 0xbd, 0xae, 0x78, 0x9e, + 0x44, 0xcc, 0x76, 0xb2, 0x63, 0x98, 0xb2, 0xcf, 0x8a, 0xc1, 0xa3, 0xa5, + 0x82, 0x8e, 0x9c, 0x9b, 0x5e, 0x5d, 0xc1, 0xd9, 0x48, 0x3e, 0xcf, 0x6f, + 0x6d, 0x44, 0xb2, 0xb0, 0x52, 0x8e, 0x95, 0x57, 0x6c, 0x5b, 0xb8, 0x9f, + 0x3b, 0x55, 0x44, 0x96, 0xc8, 0x3d, 0x90, 0xd6, 0x4a, 0xed, 0x74, 0x98, + 0x50, 0xa9, 0x7a, 0x46, 0x7e, 0x71, 0x3f, 0x46, 0xab, 0xb8, 0x8f, 0xac, + 0x34, 0x36, 0x9f, 0xa4, 0xce, 0x79, 0xa9, 0x3f, 0x48, 0x44, 0x7c, 0x86, + 0x25, 0x5c, 0x63, 0x2d, 0xb3, 0xe1, 0x97, 0x3a, 0x45, 0x3c, 0xaf, 0x4b, + 0x51, 0x35, 0x76, 0xbe, 0x9d, 0x46, 0x5f, 0x8e, 0x86, 0x6b, 0x60, 0x3d, + 
0xcb, 0x3a, 0x66, 0x93, 0x9a, 0xb0, 0xe0, 0xd6, 0xb7, 0x74, 0x4f, 0x5f, + 0x32, 0xd5, 0xa0, 0xae, 0x52, 0x33, 0x70, 0x7d, 0x9e, 0xdb, 0xbc, 0xbc, + 0x8a, 0x60, 0xb7, 0x56, 0x4b, 0xbd, 0xb3, 0x40, 0xb3, 0x53, 0x42, 0xa1, + 0x6d, 0x75, 0xa8, 0xbf, 0x71, 0xb2, 0xa5, 0xb6, 0x64, 0x92, 0x95, 0x34, + 0xa7, 0x7c, 0xb9, 0x42, 0x8e, 0x9a, 0xc6, 0x56, 0x75, 0x57, 0x8a, 0x97, + 0xe1, 0xb2, 0xd3, 0x8d, 0xac, 0x77, 0xca, 0x92, 0x6e, 0x7f, 0x44, 0xaa, + 0xc4, 0x4f, 0x4a, 0xb5, 0x9b, 0x66, 0xbb, 0x35, 0x6e, 0x71, 0x43, 0x82, + 0x6b, 0x8d, 0x68, 0xb0, 0xd7, 0x62, 0xe9, 0x6f, 0x8e, 0x6f, 0x64, 0x44, + 0x7d, 0x62, 0x6b, 0x50, 0xc0, 0xb5, 0x9c, 0x73, 0x47, 0x4e, 0x41, 0xad, + 0x85, 0xc0, 0x6a, 0x78, 0x98, 0x5d, 0xcd, 0x64, 0x6d, 0xb4, 0x49, 0xc2, + 0x3d, 0xb0, 0xa4, 0xa2, 0x92, 0xc9, 0xda, 0x50, 0x89, 0xaf, 0xd6, 0x3a, + 0xd0, 0x8b, 0xc5, 0x40, 0xca, 0xaa, 0x53, 0xd2, 0x67, 0x81, 0x6f, 0x99, + 0xac, 0xab, 0x4a, 0x59, 0xa4, 0x52, 0x71, 0x92, 0xa7, 0xa3, 0x3f, 0xb8, + 0x41, 0x76, 0xdb, 0xbe, 0x3c, 0x80, 0x6c, 0x39, 0x8b, 0x38, 0x6d, 0x56, + 0xba, 0x5c, 0x86, 0x96, 0x72, 0x59, 0x5f, 0x6d, 0x6a, 0xc3, 0x5a, 0xcf, + 0xd4, 0x95, 0xb1, 0x5b, 0x3d, 0x6f, 0x91, 0x48, 0xc5, 0xdd, 0x9e, 0x43, + 0x9c, 0x68, 0xa5, 0x6f, 0xb2, 0x72, 0x8c, 0xa6, 0x7f, 0x73, 0xd3, 0x77, + 0xa3, 0x67, 0x8f, 0xa6, 0xb2, 0x89, 0x4e, 0x59, 0x97, 0xcb, 0x6b, 0x45, + 0xd2, 0xab, 0x50, 0x58, 0x42, 0x63, 0xae, 0xa4, 0x22, 0xbf, 0xb9, 0xc6, + 0x73, 0x72, 0xb0, 0xb0, 0x4a, 0xa4, 0xb1, 0x78, 0xb7, 0xa2, 0x9a, 0x6a, + 0xa1, 0x69, 0xbb, 0x59, 0x72, 0x3e, 0x6a, 0xc5, 0x5f, 0xaa, 0xc1, 0x71, + 0xb7, 0x35, 0x8a, 0x56, 0x5d, 0x6e, 0x8c, 0x77, 0x75, 0x2f, 0xb9, 0x37, + 0x77, 0x54, 0x35, 0x61, 0x8f, 0x8e, 0x84, 0xa9, 0x84, 0x81, 0xc2, 0xb1, + 0x33, 0x3c, 0x34, 0x39, 0xa1, 0x55, 0x60, 0xaa, 0xb2, 0x95, 0x59, 0x4c, + 0xd0, 0xc3, 0xbb, 0x8e, 0x40, 0x70, 0x57, 0x3a, 0x83, 0x6e, 0x70, 0x64, + 0xb3, 0x69, 0xd3, 0xb7, 0x8f, 0x51, 0x3d, 0x5d, 0xc6, 0x68, 0xad, 0xc6, + 0xb5, 0x9a, 0xba, 0xcc, 0x48, 0x99, 0x96, 0xb9, 0x72, 0xb3, 0xa9, 0xcf, + 0x5e, 0x5c, 0xc5, 0x77, 0x64, 0x55, 0x8b, 0xb6, 0x7a, 0x94, 0xab, 0x3f, + 0x76, 0x77, 0x42, 0x54, 0xb0, 0x67, 0xc1, 0x8b, 0x6a, 0x4a, 0x67, 0x79, + 0xa9, 0x63, 0x37, 0x3e, 0x9f, 0x9d, 0x5a, 0xb6, 0x82, 0x77, 0x40, 0x92, + 0x94, 0xd3, 0x85, 0xb8, 0x89, 0x66, 0x34, 0x4d, 0x9a, 0x46, 0xa6, 0x88, + 0x84, 0xba, 0x81, 0x49, 0x42, 0xa0, 0x74, 0xba, 0x4c, 0x69, 0x7b, 0xc0, + 0x61, 0x5a, 0xa4, 0x6b, 0x86, 0x4a, 0x63, 0x3a, 0x9e, 0x45, 0x6c, 0x8c, + 0x79, 0xbe, 0xd2, 0xb0, 0xa8, 0xaf, 0xcf, 0xd2, 0x59, 0x2e, 0x73, 0xa5, + 0x99, 0xc0, 0x35, 0xd6, 0xb9, 0x7e, 0xc3, 0x81, 0x51, 0xb7, 0xa1, 0x6c, + 0x48, 0x79, 0x49, 0x48, 0xc6, 0x70, 0x62, 0x88, 0xd2, 0x9a, 0x6d, 0x40, + 0xca, 0xbf, 0xac, 0xb9, 0x5d, 0x64, 0x46, 0x6d, 0x73, 0x56, 0x92, 0x7a, + 0x3b, 0x74, 0xc4, 0x5a, 0xcd, 0xb9, 0xc8, 0xba, 0x3c, 0x63, 0x69, 0xa8, + 0xa8, 0x90, 0x36, 0x37, 0xbb, 0x8f, 0xba, 0xca, 0xb2, 0x4b, 0xbb, 0xac, + 0x37, 0x71, 0x9c, 0x6d, 0x70, 0xaf, 0x7e, 0x8e, 0x74, 0xa0, 0xc8, 0x72, + 0xcf, 0xbd, 0xaf, 0x7b, 0x4f, 0x34, 0x9a, 0xb0, 0x5f, 0x7f, 0x82, 0xa2, + 0xd3, 0x50, 0x8d, 0x66, 0x81, 0xc0, 0xaf, 0x76, 0x7d, 0x3e, 0x3c, 0x5b, + 0x48, 0x83, 0xce, 0x59, 0x4f, 0x4f, 0x43, 0xbb, 0xca, 0xad, 0x70, 0x6a, + 0x58, 0x99, 0x84, 0x66, 0x87, 0x6b, 0x6e, 0x77, 0x71, 0x59, 0x51, 0xa5, + 0xa7, 0x82, 0x89, 0x69, 0xcf, 0x76, 0x57, 0x65, 0x93, 0xc2, 0xc8, 0x4c, + 0xa3, 0x4e, 0x43, 0x7d, 0xc1, 0xcb, 0xbb, 0x46, 0x32, 0x44, 0x53, 0x62, + 0x30, 0xad, 0x61, 0x75, 0x92, 0x44, 0xcd, 0xce, 0x3e, 0x9e, 0x5f, 0x76, + 0x61, 0x8e, 0x5e, 0xd4, 0x50, 0x4c, 0x36, 0x5f, 0x4b, 0xa1, 0x42, 0xb7, + 
0xb6, 0x52, 0x7c, 0x83, 0x65, 0x6c, 0x45, 0x3c, 0x58, 0x7c, 0x80, 0x47, + 0x63, 0xc8, 0x92, 0x5d, 0xcb, 0x69, 0x84, 0x85, 0x74, 0xae, 0xc1, 0x98, + 0x8a, 0xb1, 0x97, 0xbd, 0xae, 0x5f, 0xc0, 0x4b, 0xc6, 0x7b, 0x80, 0xa0, + 0x6b, 0x7c, 0xb1, 0x50, 0x5a, 0x5c, 0x9e, 0x56, 0xd1, 0x48, 0x33, 0x61, + 0x8c, 0x9b, 0x99, 0x69, 0x53, 0x65, 0x78, 0x96, 0x4d, 0x55, 0x92, 0x99, + 0x73, 0xa8, 0x51, 0x8a, 0xbe, 0xa6, 0x56, 0x60, 0x6e, 0x79, 0xcc, 0xb0, + 0x7b, 0x62, 0x7f, 0x4c, 0x75, 0x74, 0x5a, 0x7c, 0x97, 0x6b, 0x89, 0x80, + 0x99, 0xc0, 0x46, 0xa7, 0x79, 0x45, 0x69, 0x66, 0x50, 0xa4, 0x8e, 0x9f, + 0x72, 0x9c, 0xa9, 0x71, 0xa2, 0x41, 0x39, 0xa2, 0x7b, 0xcc, 0x3b, 0x5b, + 0x37, 0x50, 0x41, 0x54, 0xd3, 0x65, 0xa0, 0xb3, 0xc9, 0x8b, 0xce, 0x97, + 0x8c, 0x91, 0x9b, 0xaf, 0xaf, 0x51, 0x6a, 0xbe, 0x5d, 0xc2, 0x8a, 0xb3, + 0xaa, 0xa8, 0x6e, 0xca, 0x85, 0xb9, 0xa4, 0x3b, 0x63, 0x98, 0x47, 0x93, + 0x5f, 0xbd, 0x4d, 0x90, 0x38, 0xb8, 0x97, 0x70, 0x7c, 0x78, 0xa6, 0xc7, + 0x5d, 0x53, 0x37, 0xbd, 0x98, 0x43, 0x71, 0x52, 0x33, 0x54, 0x9a, 0x7d, + 0x30, 0xbb, 0xcf, 0x74, 0x82, 0x56, 0xc2, 0x2f, 0x40, 0x91, 0xb6, 0x58, + 0xc6, 0x6c, 0x58, 0x9e, 0x4a, 0x56, 0x58, 0xaa, 0x80, 0x74, 0x48, 0xa8, + 0x48, 0xd5, 0xce, 0x6a, 0xaf, 0xb4, 0x76, 0xd0, 0x54, 0x7d, 0xbf, 0x42, + 0x50, 0x89, 0x94, 0x49, 0x2e, 0x5f, 0x67, 0x7e, 0xc7, 0xcc, 0x87, 0x59, + 0x9e, 0x69, 0x8f, 0x7d, 0x63, 0x95, 0xa4, 0xa6, 0xd3, 0x83, 0xbe, 0xca, + 0x71, 0xcc, 0x90, 0x60, 0x7a, 0xce, 0xb3, 0x94, 0x3f, 0xa9, 0xcb, 0xbc, + 0xa5, 0x5a, 0xbf, 0x72, 0x8b, 0x96, 0x89, 0xcb, 0xa0, 0x51, 0xb6, 0x78, + 0x62, 0x70, 0x71, 0x32, 0x67, 0x85, 0x74, 0x63, 0x36, 0x51, 0x69, 0x98, + 0xa9, 0xa0, 0x83, 0x52, 0x2f, 0x79, 0x5f, 0xb5, 0x37, 0x3e, 0x89, 0x6b, + 0x55, 0x35, 0x3e, 0x33, 0x78, 0x33, 0xb4, 0xcd, 0x89, 0x52, 0xc3, 0x8e, + 0x38, 0x98, 0x53, 0xcb, 0x73, 0xce, 0xc9, 0x77, 0x48, 0x2e, 0x74, 0x59, + 0x4a, 0x5b, 0x66, 0x55, 0x57, 0xc4, 0x41, 0xb2, 0x84, 0xa7, 0x72, 0x97, + 0x8b, 0x61, 0xc9, 0x86, 0x6f, 0x7a, 0xc0, 0x81, 0xa3, 0xb7, 0x5a, 0x5e, + 0x9c, 0x51, 0xb6, 0x7a, 0x71, 0x71, 0xce, 0xab, 0x8a, 0x57, 0xc2, 0xb7, + 0xbb, 0x8b, 0xb5, 0x51, 0xbd, 0x73, 0x77, 0x77, 0xd3, 0x42, 0x9f, 0xd0, + 0x63, 0x7f, 0x43, 0x83, 0x4d, 0x67, 0x5d, 0x42, 0x5e, 0x5b, 0x7b, 0x5d, + 0x4a, 0x89, 0xc7, 0x32, 0x38, 0x34, 0x99, 0x38, 0xae, 0x62, 0x7a, 0xb7, + 0xc0, 0x8d, 0x9d, 0x88, 0xcc, 0x3c, 0x58, 0x8a, 0x35, 0x80, 0x8a, 0x9d, + 0xbb, 0xce, 0xca, 0x51, 0xcf, 0x6b, 0x4c, 0x7e, 0xaa, 0x48, 0x97, 0xc5, + 0x46, 0xc6, 0x4f, 0xb4, 0x2e, 0x44, 0x80, 0x35, 0x82, 0xb1, 0x47, 0x50, + 0x67, 0x85, 0x4c, 0x5f, 0x47, 0x96, 0xa2, 0xa2, 0x87, 0x5c, 0x81, 0x81, + 0x7f, 0xa3, 0xce, 0x53, 0xc6, 0x8b, 0x4c, 0x4a, 0x73, 0x63, 0xa5, 0xc1, + 0xc3, 0x66, 0x48, 0xba, 0xb0, 0x38, 0x9e, 0x42, 0x3e, 0x9a, 0xa6, 0x2f, + 0x55, 0x9b, 0xa3, 0x76, 0x68, 0x2c, 0xc3, 0x40, 0x4f, 0x4f, 0xcd, 0x87, + 0x67, 0x9e, 0xba, 0xb3, 0x7c, 0x5f, 0xbf, 0x42, 0x9a, 0x5d, 0x47, 0xa5, + 0x38, 0xbf, 0x53, 0xa8, 0x88, 0x97, 0xac, 0xa2, 0xad, 0x7f, 0x87, 0xb9, + 0x84, 0xa7, 0xa7, 0x96, 0xb8, 0xa1, 0xba, 0x61, 0x33, 0x47, 0x5e, 0x67, + 0xcb, 0x67, 0x4a, 0xcf, 0x9a, 0xca, 0x8e, 0x99, 0x8c, 0x6f, 0xc1, 0xcf, + 0x53, 0x87, 0x3e, 0x8d, 0x94, 0x88, 0x7e, 0x42, 0xad, 0xbc, 0x39, 0x99, + 0x5a, 0xbe, 0xd6, 0xd0, 0x56, 0x6e, 0xcc, 0x6d, 0x77, 0x40, 0xc1, 0x7f, + 0x9d, 0x4d, 0xd3, 0x81, 0x8e, 0xd6, 0xc3, 0x4c, 0xa3, 0x4b, 0x3f, 0x70, + 0x5e, 0x80, 0xb2, 0x4f, 0xc1, 0x74, 0xb0, 0x63, 0x3d, 0x7c, 0x8b, 0x5f, + 0xcb, 0x6a, 0x8f, 0xa0, 0xb0, 0x5c, 0x4e, 0xbe, 0xae, 0xb7, 0x91, 0x75, + 0xb9, 0xaf, 0x62, 0x37, 0x85, 0x48, 0x59, 0xb2, 0xd8, 0x6b, 0x70, 0x75, + 
0xa1, 0x85, 0x8d, 0x5a, 0x88, 0xb5, 0x37, 0x4e, 0x97, 0xa9, 0xdc, 0x5e, + 0x64, 0xda, 0xc3, 0x3f, 0xba, 0xa1, 0xcf, 0xb1, 0x5b, 0x98, 0xb6, 0x7c, + 0x85, 0x94, 0xb8, 0x44, 0xd2, 0xa1, 0x6b, 0x3d, 0x68, 0x90, 0x6d, 0x80, + 0x52, 0x52, 0x56, 0x76, 0x61, 0xdc, 0xd8, 0xcb, 0xd1, 0x94, 0xd9, 0x95, + 0x5c, 0xd1, 0xcc, 0xad, 0x69, 0xb2, 0xd1, 0x6c, 0x42, 0x65, 0x67, 0x65, + 0x66, 0x2f, 0x6b, 0x4d, 0x79, 0x6b, 0x41, 0x62, 0xc3, 0x58, 0xc7, 0x8e, + 0xa4, 0x85, 0xcf, 0x8e, 0xce, 0xc5, 0x49, 0x46, 0x97, 0x7f, 0x62, 0x92, + 0xd1, 0x3f, 0x43, 0x58, 0x53, 0x7c, 0x6d, 0x65, 0x52, 0x3e, 0x63, 0x80, + 0x71, 0x52, 0xa4, 0x91, 0xa6, 0xd2, 0xab, 0x7b, 0x48, 0x9e, 0x73, 0x6d, + 0xb0, 0xc7, 0x3a, 0xbc, 0xc7, 0x59, 0x31, 0x7b, 0x38, 0x7a, 0xaa, 0x64, + 0xa4, 0xce, 0x55, 0x92, 0x8e, 0x67, 0x38, 0x56, 0x4f, 0xc0, 0x8e, 0x57, + 0x80, 0x70, 0x54, 0x91, 0x54, 0x43, 0x4b, 0x71, 0x39, 0x7e, 0x88, 0xc5, + 0x4f, 0x73, 0xb3, 0xbf, 0xbe, 0x5b, 0x46, 0x9e, 0x41, 0xca, 0x8e, 0x83, + 0x74, 0xb2, 0xaf, 0xbf, 0x5d, 0x74, 0x48, 0xa2, 0x52, 0xcd, 0x6a, 0x6d, + 0xaa, 0x7f, 0x6d, 0x94, 0x9d, 0x82, 0x34, 0xc5, 0xb2, 0xc8, 0xb7, 0xbd, + 0xca, 0x88, 0x74, 0x47, 0xd8, 0x60, 0x7e, 0x9b, 0xad, 0x95, 0x71, 0x39, + 0x8e, 0x9c, 0x6d, 0x5c, 0xbc, 0xa1, 0xc6, 0x5e, 0x46, 0x3a, 0xc7, 0xb9, + 0x4f, 0x97, 0x4d, 0x53, 0x87, 0x89, 0x40, 0x36, 0x29, 0x4f, 0x7c, 0xd3, + 0x28, 0x74, 0x5e, 0x77, 0x7a, 0xb1, 0x49, 0x64, 0x6a, 0xc0, 0x49, 0x9e, + 0xb4, 0x97, 0x63, 0xb1, 0x85, 0xac, 0x59, 0x79, 0x72, 0x96, 0x80, 0x9a, + 0xaf, 0xae, 0xaf, 0x45, 0xc1, 0x73, 0xa2, 0xa4, 0x73, 0xce, 0xa2, 0x59, + 0x52, 0x76, 0xdd, 0x6c, 0x68, 0xc0, 0xcc, 0xcf, 0x46, 0x7c, 0x78, 0x97, + 0xd1, 0x6d, 0xb1, 0x70, 0x6d, 0x4a, 0xa1, 0x4f, 0xa0, 0x4c, 0xbc, 0x63, + 0x58, 0x59, 0x3a, 0x86, 0x76, 0xb3, 0xc5, 0xa9, 0xae, 0x5c, 0xcf, 0x3d, + 0x65, 0xd5, 0x92, 0xca, 0xca, 0xce, 0x93, 0x45, 0x4a, 0x9c, 0xb1, 0x3c, + 0x9e, 0x69, 0x73, 0x8d, 0x2f, 0xae, 0xb2, 0x9f, 0x96, 0x2b, 0x6b, 0x76, + 0x3d, 0xb7, 0x84, 0xbd, 0x37, 0xb3, 0x93, 0x8a, 0x3d, 0x74, 0xac, 0x7d, + 0x6f, 0x8f, 0x37, 0xa8, 0x59, 0x50, 0x85, 0x4d, 0xc4, 0x3d, 0x97, 0x42, + 0x4e, 0xb8, 0xc2, 0x39, 0xa0, 0x8b, 0x69, 0x7a, 0x2f, 0x96, 0xad, 0xcd, + 0x88, 0x4b, 0x2d, 0xa0, 0x53, 0xac, 0xdc, 0xbf, 0xbb, 0xa7, 0x76, 0xb5, + 0x51, 0x44, 0x8d, 0x70, 0x87, 0xc7, 0x49, 0xa7, 0xc6, 0xd5, 0xdb, 0xcd, + 0xc7, 0x55, 0xba, 0x95, 0xa3, 0xa6, 0xba, 0x78, 0x3c, 0x97, 0xb5, 0x91, + 0x3f, 0xa5, 0x54, 0x60, 0x50, 0xce, 0x73, 0x64, 0x8a, 0x97, 0x8d, 0xc2, + 0x76, 0x7d, 0x55, 0x5d, 0xd9, 0xdd, 0x93, 0x98, 0x4c, 0x86, 0x62, 0x66, + 0x74, 0x43, 0xcb, 0xab, 0xac, 0xc9, 0x55, 0xc2, 0xa1, 0xae, 0x66, 0x7c, + 0xc7, 0xbf, 0x71, 0x50, 0xa6, 0xc9, 0x88, 0xbe, 0x42, 0xcd, 0xa4, 0x64, + 0x26, 0x63, 0x6e, 0xac, 0xa6, 0x4e, 0xa2, 0x34, 0x47, 0x66, 0x97, 0x4c, + 0x7c, 0xa9, 0x2f, 0xc3, 0x79, 0x96, 0x70, 0x87, 0x9d, 0x81, 0xd3, 0x9c, + 0xc6, 0xd4, 0x50, 0xaf, 0xc5, 0xab, 0x7c, 0x82, 0xae, 0x9c, 0xc9, 0xe1, + 0x95, 0xbb, 0x94, 0xbb, 0x61, 0x61, 0x83, 0x5b, 0x70, 0x73, 0x61, 0x9d, + 0x65, 0xbb, 0x79, 0x87, 0x3a, 0x74, 0xd8, 0xd1, 0x7d, 0xbe, 0xa7, 0xd5, + 0x8c, 0x4b, 0xc6, 0x74, 0x9e, 0x88, 0x73, 0x7b, 0xb7, 0x55, 0xd0, 0xb9, + 0xd6, 0x54, 0x92, 0x92, 0x9e, 0x35, 0xb5, 0x5a, 0x97, 0xb2, 0x78, 0x92, + 0xc3, 0x9c, 0x72, 0xc0, 0xa8, 0xc3, 0x5e, 0xa2, 0x9e, 0x77, 0x8b, 0x77, + 0x3a, 0x71, 0x8b, 0x9c, 0x51, 0x92, 0xbc, 0xc2, 0x51, 0xba, 0xb8, 0x8d, + 0xa4, 0x82, 0x8f, 0x8e, 0x61, 0x55, 0x82, 0xca, 0x31, 0xb7, 0x54, 0xa2, + 0x3e, 0x7e, 0x8c, 0x61, 0x4a, 0x77, 0x8f, 0x93, 0x39, 0x8e, 0x9d, 0x70, + 0xb1, 0x86, 0x8c, 0x88, 0x69, 0xbe, 0xa2, 0x87, 0xc4, 0x80, 0x50, 0x74, + 
0x91, 0x7b, 0x9f, 0x69, 0x4d, 0x72, 0xad, 0x40, 0xba, 0x84, 0x9a, 0xc8, + 0x42, 0xc2, 0x3f, 0x42, 0xc7, 0x49, 0x58, 0xb0, 0xba, 0xac, 0x5b, 0x54, + 0xb6, 0x88, 0xb3, 0xb0, 0x8a, 0x71, 0xc8, 0x88, 0x49, 0xb8, 0x9a, 0x50, + 0x41, 0x4d, 0xcd, 0x58, 0xad, 0x9b, 0x7b, 0x9d, 0x56, 0x82, 0x9f, 0xd2, + 0xa8, 0x3f, 0x57, 0x50, 0x4a, 0x87, 0x48, 0x7f, 0x45, 0x85, 0x8d, 0x9f, + 0x61, 0x9a, 0x3f, 0xc4, 0x8b, 0x56, 0x6f, 0x7f, 0xbe, 0x78, 0x47, 0xbb, + 0xc1, 0xe7, 0x5a, 0x39, 0x44, 0x5a, 0x65, 0x6d, 0xa3, 0x46, 0x65, 0x72, + 0x50, 0x6b, 0x7a, 0x8f, 0x5e, 0xd1, 0x45, 0xbc, 0xd0, 0x4c, 0x47, 0x95, + 0x36, 0xab, 0xcc, 0x64, 0x71, 0x81, 0x5b, 0x4b, 0x4e, 0x97, 0x90, 0xae, + 0x44, 0x9d, 0x6c, 0xbd, 0x4d, 0x3f, 0xac, 0x66, 0x5e, 0x41, 0xc3, 0x7e, + 0xa8, 0xc0, 0x72, 0x34, 0xb7, 0x54, 0xaa, 0x42, 0xca, 0xc3, 0xa4, 0x37, + 0x8c, 0x5f, 0xa1, 0x4f, 0x77, 0x54, 0xcf, 0xa5, 0x4d, 0x33, 0xa2, 0xac, + 0x7c, 0x9c, 0xd0, 0x54, 0x75, 0x5b, 0x3e, 0xd1, 0x42, 0x5e, 0x31, 0x7c, + 0x76, 0x40, 0xc6, 0x63, 0x6c, 0xba, 0xc2, 0xa9, 0x50, 0x5f, 0xd4, 0x39, + 0xa7, 0x3a, 0x38, 0x70, 0x94, 0x3e, 0x90, 0xa8, 0x93, 0x91, 0xb6, 0xae, + 0x77, 0xc6, 0x81, 0xbd, 0xc9, 0x73, 0x51, 0x7f, 0xb5, 0x6b, 0x51, 0x69, + 0x54, 0x52, 0x7f, 0xc2, 0x8b, 0x84, 0x3e, 0xad, 0xa6, 0x56, 0x6b, 0x4f, + 0xc7, 0x5e, 0xbd, 0xa0, 0x59, 0x74, 0x42, 0x61, 0xaf, 0x58, 0xd3, 0xbc, + 0xc1, 0xa9, 0x31, 0x83, 0x8d, 0x76, 0xc9, 0x6a, 0x65, 0xa5, 0x37, 0x8a, + 0x7a, 0x41, 0x4b, 0x64, 0x4f, 0x79, 0xab, 0x82, 0x70, 0xd0, 0xbe, 0x99, + 0xb9, 0x68, 0x76, 0x3a, 0x3b, 0x6f, 0x71, 0x87, 0x3a, 0xbf, 0x9e, 0xa1, + 0xd1, 0x57, 0xae, 0xc0, 0xa5, 0x9d, 0xc2, 0x7f, 0x33, 0xb8, 0x79, 0x92, + 0x5e, 0xa0, 0xa3, 0xa9, 0x97, 0x60, 0xb2, 0x82, 0x85, 0x49, 0x5c, 0x37, + 0xd1, 0x77, 0x4d, 0x89, 0x7e, 0xaa, 0x39, 0xc1, 0xc5, 0xd0, 0x8c, 0x69, + 0x8f, 0x94, 0x30, 0x89, 0x4f, 0x96, 0x56, 0x89, 0x39, 0xca, 0x82, 0xbc, + 0x96, 0xc1, 0x73, 0x75, 0x3a, 0x79, 0xb4, 0x34, 0x94, 0x92, 0x8a, 0xce, + 0xc1, 0x41, 0x48, 0xa4, 0xc5, 0x52, 0x59, 0x59, 0x98, 0x3c, 0x60, 0x49, + 0x8e, 0xcc, 0xb6, 0x54, 0xa7, 0xbe, 0xcf, 0x5e, 0x59, 0x51, 0x5d, 0x44, + 0xac, 0xc9, 0x75, 0x91, 0x3d, 0x77, 0xca, 0x74, 0xbf, 0x3f, 0x4b, 0xa9, + 0xc1, 0x5b, 0x78, 0x89, 0x4d, 0x62, 0x83, 0x37, 0x8b, 0x9d, 0x5b, 0x62, + 0x67, 0xc1, 0x69, 0x87, 0x3e, 0x60, 0x5d, 0x95, 0xa0, 0x78, 0x72, 0x5d, + 0x63, 0x69, 0x9a, 0x89, 0x34, 0x62, 0x33, 0x48, 0x76, 0x87, 0x47, 0x64, + 0xb8, 0xce, 0x63, 0xc0, 0xc5, 0x36, 0x68, 0x45, 0xd0, 0x8f, 0x53, 0x9b, + 0xcf, 0x7a, 0xc0, 0x92, 0x7c, 0xa4, 0x62, 0x6c, 0x32, 0xb4, 0x75, 0x6a, + 0x40, 0xbb, 0x98, 0x91, 0x36, 0xc1, 0x35, 0x6c, 0xc2, 0xc4, 0x35, 0x3d, + 0xc1, 0xb8, 0x84, 0x8b, 0x60, 0x7f, 0xa9, 0x6a, 0xa9, 0x4f, 0xb6, 0xca, + 0x40, 0x7d, 0x59, 0xb6, 0xa7, 0xc6, 0xa7, 0x9e, 0x80, 0x80, 0x89, 0x8c, + 0xcc, 0x9a, 0x5f, 0xc0, 0x65, 0xc7, 0xa7, 0x60, 0xce, 0x93, 0x2f, 0xb2, + 0x4c, 0xc0, 0x7f, 0x3b, 0x51, 0x34, 0xd2, 0xc6, 0x53, 0x54, 0x7f, 0x52, + 0x82, 0xc5, 0x88, 0xc0, 0x9a, 0x4b, 0xbf, 0x63, 0x4f, 0x78, 0xb4, 0x4d, + 0x55, 0xd3, 0x42, 0xa3, 0x67, 0x37, 0xca, 0x50, 0xaa, 0xbe, 0xc5, 0x96, + 0xbe, 0xc3, 0x3d, 0xca, 0xce, 0xcb, 0x72, 0xc9, 0x4b, 0x45, 0x86, 0x9b, + 0xc2, 0x8d, 0xbd, 0x9e, 0x49, 0xcd, 0x5d, 0xa8, 0x6d, 0x97, 0x32, 0x99, + 0xaf, 0x60, 0xab, 0xa4, 0x54, 0x5d, 0xb0, 0xa0, 0x63, 0xcf, 0xaf, 0xd0, + 0x8c, 0x8e, 0x53, 0xa8, 0x77, 0xa3, 0x7f, 0x4c, 0x9c, 0x65, 0xa4, 0xc1, + 0xd0, 0x47, 0x5e, 0x89, 0x37, 0xc6, 0x37, 0xc0, 0x7a, 0xa6, 0x64, 0x7b, + 0x79, 0xaa, 0x4e, 0x8d, 0xba, 0x61, 0xcf, 0x4f, 0x3f, 0x9b, 0x51, 0x7e, + 0xbd, 0xc8, 0x48, 0x67, 0x98, 0x81, 0x4e, 0x6e, 0xc6, 0x39, 0xaf, 0x57, + 
0x80, 0x58, 0x72, 0xb6, 0x8d, 0x73, 0xc9, 0xc8, 0x6a, 0x67, 0xbc, 0x54, + 0xc6, 0xb8, 0x44, 0x52, 0x63, 0xb5, 0x42, 0x7c, 0x79, 0x44, 0x7b, 0xac, + 0x54, 0x88, 0x50, 0x6b, 0xa0, 0xab, 0x77, 0x98, 0x51, 0x55, 0x3d, 0xa6, + 0xb4, 0x61, 0x88, 0x52, 0x71, 0x8d, 0x79, 0x49, 0x8f, 0x4b, 0x66, 0x9c, + 0x9b, 0xb5, 0xaf, 0x68, 0x94, 0x6b, 0x3f, 0xa9, 0x51, 0x3e, 0x51, 0x72, + 0x79, 0x5a, 0x55, 0xa8, 0x3a, 0x9a, 0x5d, 0x76, 0xc1, 0x87, 0x6f, 0x99, + 0xb7, 0x93, 0x8b, 0x90, 0x9c, 0x39, 0xc8, 0xb7, 0x3c, 0x44, 0x6b, 0x81, + 0xb5, 0x39, 0x55, 0x8a, 0xbc, 0x96, 0xbe, 0x84, 0x67, 0x81, 0xb4, 0x79, + 0x4b, 0x7e, 0xc3, 0xcb, 0xa2, 0xa8, 0x77, 0xad, 0x73, 0xb9, 0x38, 0x7e, + 0x49, 0xba, 0x8d, 0x57, 0x7e, 0xce, 0xa2, 0x67, 0x72, 0xc3, 0xbf, 0x50, + 0x51, 0x55, 0x64, 0x4b, 0xcd, 0xd0, 0x51, 0x88, 0x40, 0x8f, 0x3f, 0x46, + 0x92, 0x7b, 0x3f, 0x3e, 0xaa, 0x73, 0xac, 0x85, 0xcc, 0x8b, 0xc4, 0x56, + 0xc9, 0x4d, 0x39, 0x9e, 0xcf, 0x58, 0x46, 0xc9, 0xa9, 0xa1, 0xbf, 0x76, + 0xbc, 0x9a, 0x90, 0xab, 0x48, 0xc7, 0x73, 0x40, 0xce, 0xbc, 0x39, 0x62, + 0x82, 0x8c, 0xcc, 0xc4, 0x41, 0x6e, 0x9f, 0x52, 0xa0, 0xcc, 0xb1, 0xba, + 0x80, 0x33, 0xbb, 0x91, 0x5a, 0x32, 0x98, 0x39, 0x5b, 0x4c, 0x6b, 0x3f, + 0x9e, 0x5a, 0x3c, 0x3d, 0xa2, 0x56, 0xb1, 0x88, 0xa7, 0xa0, 0x50, 0x4d, + 0xd4, 0x73, 0x58, 0x93, 0x8c, 0xa0, 0x95, 0xbe, 0x61, 0x7e, 0x61, 0x51, + 0xd2, 0x7a, 0x50, 0x56, 0x3f, 0x72, 0x3d, 0x9f, 0xb2, 0x85, 0x87, 0x74, + 0xa0, 0xd4, 0x41, 0xcb, 0xbe, 0x4f, 0xa4, 0x95, 0x55, 0xa8, 0xb1, 0x47, + 0xac, 0xc3, 0x79, 0xcf, 0x6b, 0x77, 0xbc, 0x91, 0x84, 0x96, 0x47, 0x77, + 0x72, 0xad, 0x92, 0x88, 0x40, 0xc0, 0xb1, 0xc5, 0x39, 0x8d, 0x46, 0x49, + 0x68, 0x6b, 0xb6, 0x30, 0x8e, 0xc3, 0x4d, 0x64, 0x46, 0x3c, 0xae, 0x82, + 0x40, 0x9b, 0x69, 0x82, 0x51, 0x8f, 0x3f, 0x5a, 0x9e, 0xd2, 0xd1, 0x85, + 0x49, 0x33, 0x91, 0x39, 0x8b, 0xc6, 0xc3, 0x51, 0x5e, 0x6d, 0x84, 0x5b, + 0x3b, 0x93, 0x51, 0x71, 0xa3, 0x83, 0x42, 0x3d, 0x6d, 0x50, 0x65, 0x66, + 0x36, 0x50, 0xb3, 0xab, 0xa4, 0x3d, 0xd1, 0x94, 0x58, 0x33, 0x42, 0x46, + 0x3c, 0x89, 0xc9, 0x66, 0x44, 0xd3, 0x95, 0x5c, 0xcf, 0x4f, 0x83, 0xa1, + 0x84, 0xa8, 0x34, 0x5b, 0xad, 0x78, 0x35, 0xcf, 0xbe, 0x47, 0x9b, 0x81, + 0x3a, 0x40, 0x8b, 0xa6, 0x37, 0x7e, 0x96, 0x67, 0x4a, 0x9b, 0xc4, 0x9b, + 0xc5, 0x37, 0x49, 0x34, 0x5b, 0x9e, 0x37, 0xd1, 0xc1, 0x8a, 0xad, 0xc0, + 0x36, 0x86, 0x6b, 0x84, 0x7b, 0xa3, 0x43, 0x7f, 0x6c, 0x56, 0x47, 0x98, + 0x3b, 0x3f, 0x55, 0x56, 0xc6, 0x75, 0x8b, 0xd4, 0x5a, 0x37, 0x59, 0x70, + 0xc4, 0x4e, 0x86, 0x3f, 0x32, 0xb5, 0x91, 0x50, 0x8d, 0x89, 0xa0, 0x71, + 0xc9, 0x70, 0xab, 0x8a, 0xbe, 0x49, 0xb3, 0x5c, 0x7a, 0x96, 0x6c, 0xab, + 0x5f, 0xc5, 0xc5, 0x65, 0xa4, 0x32, 0x7b, 0x53, 0x58, 0xd3, 0x8c, 0x72, + 0x66, 0x8f, 0x79, 0xcc, 0xd4, 0x44, 0x47, 0xcf, 0xcf, 0x7d, 0xb9, 0x5d, + 0xb2, 0x93, 0x99, 0x5c, 0xc0, 0x80, 0x40, 0x46, 0x35, 0x9a, 0xa0, 0xce, + 0x74, 0xbb, 0x64, 0x7b, 0x93, 0x39, 0x3d, 0xbd, 0x5d, 0x6f, 0x3b, 0x5f, + 0xd2, 0x73, 0x2f, 0x73, 0x36, 0x40, 0x79, 0x77, 0x7e, 0xbc, 0x53, 0xb3, + 0xc2, 0x68, 0x4b, 0x72, 0xc7, 0xb6, 0xc6, 0x7d, 0x78, 0x4c, 0x67, 0x4d, + 0x4f, 0x4c, 0x51, 0x9f, 0x61, 0x51, 0x4d, 0x3f, 0xbd, 0xc4, 0x44, 0xab, + 0x8f, 0x98, 0x8d, 0x7e, 0xd1, 0xb2, 0x8b, 0xc4, 0x31, 0x4f, 0xc8, 0xd8, + 0xa5, 0xaa, 0xa2, 0x5e, 0x7f, 0x77, 0x4e, 0xc4, 0xbe, 0xaf, 0xa3, 0x4a, + 0xba, 0x84, 0x3b, 0xd2, 0xc1, 0xa5, 0x91, 0x7f, 0x4d, 0x8f, 0xcc, 0x83, + 0x56, 0xcb, 0x93, 0x33, 0x91, 0xaf, 0x3f, 0x9b, 0x9e, 0x3c, 0x41, 0x98, + 0xad, 0x42, 0xcb, 0xc7, 0xc6, 0x7c, 0x61, 0x6a, 0x3a, 0x6b, 0x7b, 0xa4, + 0x8e, 0x9c, 0xb4, 0x83, 0xb8, 0xd0, 0x9f, 0x44, 0x3c, 0x94, 0x4b, 0x37, + 
0x33, 0x7b, 0xaf, 0x60, 0xb2, 0x44, 0x50, 0x52, 0xb6, 0xcc, 0x9d, 0x6b, + 0x5c, 0x6c, 0x67, 0x6f, 0xcc, 0x8c, 0x7d, 0x6c, 0x64, 0x4b, 0xcb, 0x60, + 0xa3, 0x60, 0xc7, 0xa1, 0x6e, 0x4e, 0xa9, 0x6c, 0x69, 0xc0, 0x41, 0xa6, + 0x9d, 0xb9, 0xb2, 0x53, 0x9a, 0x37, 0xd0, 0xbe, 0x6d, 0xa9, 0x66, 0x69, + 0x7d, 0xa7, 0x6e, 0x51, 0x76, 0x8f, 0x3d, 0xb4, 0xa9, 0xb5, 0x35, 0x9d, + 0x69, 0x98, 0xb3, 0x78, 0xbf, 0xb8, 0x41, 0x7c, 0xb3, 0x68, 0x59, 0x82, + 0x8a, 0xbb, 0x48, 0xcf, 0x95, 0xb6, 0x6f, 0x6e, 0x9d, 0x67, 0x80, 0x9d, + 0x3b, 0xcb, 0x92, 0xce, 0xc5, 0xd4, 0x88, 0x86, 0x5f, 0x6a, 0x43, 0xc3, + 0xa2, 0x3e, 0x52, 0xb4, 0x51, 0x45, 0x8c, 0x94, 0x7b, 0x73, 0x8b, 0x97, + 0x7b, 0x8e, 0x4c, 0x4f, 0xcf, 0x7a, 0x72, 0xa1, 0x91, 0x63, 0x3a, 0x44, + 0x3c, 0xc5, 0x3f, 0xa4, 0x8a, 0x94, 0xc2, 0x49, 0x41, 0x4f, 0x73, 0x40, + 0xa8, 0x4e, 0x41, 0xcc, 0xcc, 0xcf, 0x72, 0xb1, 0xa9, 0x6c, 0x73, 0xae, + 0x72, 0xb6, 0xaa, 0xcb, 0x5e, 0x8c, 0xc5, 0x54, 0xae, 0x44, 0x5d, 0xa4, + 0x8c, 0x42, 0x4b, 0x65, 0x6f, 0x38, 0x84, 0xc6, 0xc0, 0x63, 0x7c, 0x45, + 0xbb, 0x74, 0xbf, 0xac, 0x9e, 0x7b, 0xd9, 0xa2, 0x53, 0xb5, 0x6d, 0x87, + 0x95, 0x88, 0x9f, 0x3b, 0x99, 0xb0, 0x7c, 0x8a, 0xcf, 0xa7, 0x7c, 0xc9, + 0xac, 0xad, 0x72, 0xa7, 0x88, 0x5c, 0x56, 0x88, 0xa2, 0x32, 0xc0, 0x58, + 0x93, 0xa9, 0x5d, 0x68, 0x39, 0xab, 0xb6, 0x75, 0x35, 0x58, 0xcc, 0xa8, + 0x39, 0xc9, 0x87, 0x7d, 0x6b, 0x47, 0x7d, 0x98, 0x5e, 0x8b, 0xae, 0xd2, + 0xaa, 0x82, 0xd2, 0x56, 0x51, 0xa2, 0x43, 0xa3, 0x4b, 0xc4, 0xa7, 0xd4, + 0x94, 0xc8, 0x44, 0xab, 0x83, 0x8b, 0x68, 0x94, 0x50, 0xa0, 0xa6, 0xa0, + 0x34, 0x55, 0xa2, 0x69, 0x79, 0xbb, 0xcd, 0x3d, 0x62, 0xca, 0x45, 0xb8, + 0xb1, 0xb8, 0xc3, 0x8a, 0xac, 0x3c, 0x4a, 0xd4, 0x58, 0xa5, 0x49, 0x6f, + 0xbf, 0x6c, 0xc3, 0xaa, 0x4f, 0x8d, 0x81, 0x5e, 0x8b, 0xcb, 0x97, 0xb0, + 0xa5, 0x5b, 0x2f, 0x69, 0x4f, 0xa0, 0x48, 0x65, 0xad, 0x97, 0xc5, 0xa9, + 0x3a, 0x8c, 0xb8, 0x65, 0x6f, 0xd2, 0x93, 0x5a, 0xac, 0xca, 0xae, 0x58, + 0x5e, 0xce, 0x60, 0x3c, 0xc1, 0x4e, 0xae, 0xb2, 0x34, 0x43, 0x9e, 0x82, + 0x48, 0xa8, 0x64, 0xcc, 0xca, 0x6d, 0xbb, 0x94, 0x91, 0x8e, 0xcf, 0xc2, + 0x39, 0x48, 0x89, 0x9c, 0x87, 0x68, 0xc6, 0x89, 0xb1, 0x54, 0x9f, 0x98, + 0x49, 0x5b, 0xba, 0x5f, 0x67, 0x4f, 0x82, 0x2d, 0xb7, 0x68, 0x43, 0xbe, + 0x5f, 0xa5, 0xce, 0xa7, 0xb2, 0x99, 0x46, 0xd3, 0xbc, 0x35, 0x3b, 0x35, + 0xc9, 0xb1, 0xb6, 0xc8, 0x97, 0x74, 0x3d, 0x6b, 0x9b, 0xd8, 0xe1, 0xc6, + 0xaa, 0x59, 0x50, 0xba, 0x53, 0xb6, 0xcd, 0xb4, 0x8f, 0x99, 0xc6, 0x4e, + 0xd0, 0x69, 0x7d, 0x7e, 0x8b, 0x93, 0x4c, 0x58, 0x62, 0x43, 0x86, 0x5b, + 0x2f, 0x71, 0xc4, 0x32, 0x39, 0x62, 0x4a, 0xcd, 0xa9, 0x94, 0x45, 0x6e, + 0xca, 0x57, 0x52, 0xb4, 0x8a, 0x97, 0x35, 0x34, 0x52, 0xa7, 0x5e, 0x99, + 0x8a, 0x83, 0xa7, 0xc8, 0xb5, 0x6f, 0xcb, 0x81, 0x82, 0xc2, 0x72, 0xad, + 0xaa, 0x45, 0x6d, 0xc5, 0x45, 0xa7, 0x37, 0x81, 0x87, 0x94, 0x44, 0x64, + 0x4b, 0x86, 0x74, 0x6a, 0x5c, 0x3e, 0x3f, 0x91, 0x4d, 0x86, 0x98, 0x82, + 0xa5, 0x97, 0xb6, 0x79, 0x58, 0xb7, 0xa8, 0xac, 0xad, 0x40, 0x86, 0x8b, + 0x8f, 0x75, 0xbf, 0x49, 0xbc, 0x3d, 0xc0, 0x3c, 0xb8, 0xba, 0x5f, 0x80, + 0x43, 0x5d, 0x84, 0x53, 0x5f, 0xa8, 0xda, 0xb7, 0x4a, 0x94, 0x61, 0xc2, + 0xaa, 0xb8, 0x6c, 0x53, 0x38, 0x52, 0x49, 0x7b, 0xd1, 0x93, 0xc9, 0xb0, + 0x64, 0x6f, 0x40, 0x78, 0x4e, 0x7c, 0xc3, 0x91, 0x35, 0x2e, 0xc4, 0x74, + 0xae, 0x52, 0x39, 0x91, 0x4a, 0xcf, 0x58, 0x94, 0xce, 0x87, 0xcd, 0x63, + 0x31, 0xd0, 0x7b, 0xb9, 0xc6, 0xc3, 0x80, 0x34, 0xc1, 0xa7, 0xa3, 0xcc, + 0x41, 0xaf, 0xb2, 0xaf, 0xb3, 0x6f, 0xcb, 0xbe, 0x8f, 0x8c, 0x80, 0xbb, + 0x64, 0x96, 0x96, 0x51, 0x8c, 0x52, 0xa0, 0x86, 0xc8, 0x31, 0xce, 0xbe, + 
0xb7, 0xba, 0x40, 0x65, 0x87, 0xc5, 0x7f, 0x4e, 0x9c, 0x6a, 0x52, 0xbe, + 0x53, 0x30, 0xbb, 0x9c, 0x53, 0x86, 0x5a, 0x45, 0x34, 0x9e, 0x7a, 0x74, + 0xc1, 0xb3, 0x34, 0x95, 0xad, 0x74, 0x49, 0x4c, 0xcc, 0xb3, 0x4a, 0x51, + 0x86, 0x55, 0x40, 0x58, 0x4d, 0x57, 0x86, 0xc9, 0x48, 0x47, 0xc0, 0x8d, + 0x96, 0x8c, 0x7c, 0x83, 0xc8, 0xae, 0xbd, 0x35, 0x8b, 0xbf, 0x74, 0xc2, + 0xbb, 0x5c, 0x8a, 0x3d, 0x58, 0x33, 0x63, 0xcd, 0x67, 0x38, 0x4f, 0x66, + 0x72, 0x6c, 0xc3, 0xc2, 0x37, 0x39, 0xa1, 0xa0, 0x34, 0xab, 0x9f, 0x3f, + 0xc6, 0xb1, 0xac, 0x32, 0xb5, 0x43, 0xc4, 0xb7, 0x58, 0x70, 0x93, 0x50, + 0x3e, 0xc5, 0xc3, 0xb8, 0x71, 0x41, 0xaa, 0xb5, 0xb9, 0x34, 0x8c, 0xc8, + 0x87, 0xcb, 0x9a, 0x5e, 0x3c, 0xc7, 0xb0, 0xd6, 0x8f, 0x57, 0x58, 0x74, + 0x74, 0x70, 0xc6, 0x88, 0xb9, 0x4e, 0x44, 0x49, 0xa2, 0x7a, 0xc6, 0x88, + 0x85, 0x5a, 0x9a, 0xb9, 0x36, 0x8d, 0x38, 0x63, 0xc4, 0x53, 0xaa, 0x95, + 0xa0, 0x56, 0x59, 0x8e, 0xc5, 0x5e, 0x77, 0x8c, 0xa1, 0x49, 0x97, 0xb0, + 0x81, 0x4f, 0x40, 0x2d, 0x55, 0x60, 0x3d, 0x95, 0xc4, 0x7a, 0x58, 0x84, + 0x61, 0x9b, 0x7a, 0x80, 0x77, 0x37, 0x9d, 0x3a, 0x8e, 0x52, 0x53, 0x5e, + 0xc2, 0xbb, 0x3b, 0x4d, 0xb3, 0x7e, 0x4b, 0x81, 0xaa, 0x9d, 0x3c, 0xb5, + 0xaf, 0x9e, 0x42, 0xc8, 0x58, 0x94, 0xab, 0xd1, 0x94, 0xbf, 0xb8, 0xaa, + 0x85, 0x67, 0x54, 0xa8, 0x5c, 0xa5, 0x53, 0x7b, 0x5e, 0xbd, 0xac, 0x7d, + 0x86, 0x81, 0x73, 0x9d, 0x71, 0xab, 0x7e, 0xcc, 0x7c, 0xae, 0xa8, 0x8d, + 0xba, 0x60, 0x60, 0x64, 0x71, 0x6b, 0xa9, 0x88, 0x97, 0x62, 0xb8, 0xc8, + 0x47, 0x94, 0x3a, 0x34, 0x8d, 0x37, 0x3e, 0xa5, 0x5c, 0x97, 0x74, 0xb8, + 0x8e, 0x6c, 0xb9, 0x58, 0xae, 0x75, 0x80, 0xce, 0xac, 0x31, 0x8e, 0xb5, + 0xb0, 0x71, 0x96, 0x52, 0xcd, 0x99, 0xb6, 0x7d, 0xab, 0xc3, 0xce, 0xc0, + 0x3f, 0x48, 0x57, 0x73, 0x78, 0xd2, 0x58, 0x4c, 0xc2, 0x79, 0x98, 0x35, + 0xb1, 0xcb, 0x6f, 0x88, 0x74, 0x38, 0xa2, 0x68, 0xb4, 0x6e, 0xa9, 0xb6, + 0xc6, 0x55, 0xa9, 0x76, 0x72, 0x8a, 0xbb, 0xac, 0xa0, 0x62, 0x82, 0x66, + 0xbf, 0x8a, 0x91, 0x88, 0xb3, 0x78, 0x3c, 0x8e, 0xb7, 0x31, 0x40, 0xb3, + 0x76, 0xcc, 0x34, 0xd3, 0x3e, 0x80, 0x5b, 0xa5, 0xc4, 0xbe, 0xb8, 0x3e, + 0xc5, 0x86, 0x27, 0x57, 0x7f, 0x82, 0x7a, 0x94, 0x60, 0xce, 0x97, 0x56, + 0x8c, 0x62, 0x5f, 0x88, 0xce, 0xa3, 0x9e, 0xc1, 0xc9, 0x3d, 0x7c, 0x79, + 0x96, 0x68, 0x51, 0x98, 0xbb, 0x73, 0x5b, 0x3f, 0xb2, 0xb1, 0x38, 0xaa, + 0x44, 0x9d, 0xaf, 0x9c, 0xb7, 0x88, 0x61, 0x87, 0x74, 0x48, 0xcd, 0x9b, + 0xa5, 0x40, 0x3e, 0x5a, 0x7b, 0xac, 0xca, 0xa1, 0xc8, 0xc6, 0xa1, 0x41, + 0xbc, 0x78, 0x65, 0x81, 0xce, 0x8d, 0xb7, 0xab, 0x44, 0x7c, 0xa3, 0x84, + 0x5e, 0x95, 0x43, 0xab, 0x3b, 0x66, 0x35, 0x45, 0xba, 0x99, 0xb7, 0xd5, + 0x3f, 0xa2, 0xd7, 0xc9, 0xc4, 0xd2, 0xcc, 0xb3, 0x8f, 0x54, 0x6b, 0x5e, + 0x69, 0x5c, 0xa6, 0xaa, 0x53, 0x8a, 0x4d, 0x71, 0xab, 0xa2, 0x3c, 0x6e, + 0x99, 0xcc, 0xb5, 0x8a, 0x6b, 0x36, 0xa2, 0xcf, 0x70, 0xa5, 0x6f, 0xc8, + 0x82, 0xcd, 0xa6, 0x3e, 0x8b, 0x54, 0x35, 0xa3, 0x54, 0xc4, 0x70, 0xc7, + 0xa2, 0x93, 0x8d, 0x80, 0x55, 0x84, 0x5d, 0x36, 0xb2, 0x8b, 0xb9, 0xd0, + 0xa0, 0x8c, 0x83, 0x63, 0x3f, 0xa7, 0xd0, 0x99, 0xc9, 0x38, 0x57, 0xaf, + 0x53, 0x56, 0xa4, 0x3c, 0x37, 0x4e, 0xca, 0x45, 0x9d, 0xb9, 0x52, 0x3f, + 0x83, 0xb1, 0x75, 0x39, 0x68, 0x9f, 0x44, 0x8b, 0x86, 0xa9, 0x92, 0xa6, + 0x7b, 0x60, 0x5f, 0x59, 0xab, 0x79, 0x85, 0xa3, 0x80, 0xbe, 0x4a, 0x90, + 0x51, 0x72, 0xa6, 0xc0, 0x9f, 0x7e, 0x3a, 0x79, 0xce, 0xa3, 0x81, 0x3f, + 0xcc, 0xc8, 0xb3, 0x88, 0xbd, 0x72, 0x81, 0x51, 0x8e, 0xaa, 0x52, 0x5a, + 0x5a, 0xc3, 0x4f, 0x4a, 0xab, 0x39, 0x93, 0x65, 0xc9, 0x73, 0x96, 0xa9, + 0xc4, 0xa2, 0xb2, 0x58, 0x34, 0x7e, 0xb9, 0x65, 0xd2, 0x8a, 0xc1, 0x52, + 
0x75, 0xaf, 0x88, 0x91, 0xb8, 0x56, 0x7f, 0x9b, 0x56, 0x72, 0xac, 0x87, + 0xc8, 0x37, 0xa5, 0x61, 0x44, 0xac, 0xa6, 0x5d, 0x62, 0x8d, 0x7a, 0xd0, + 0x71, 0x8c, 0xba, 0x8c, 0x6c, 0xad, 0x99, 0xcb, 0x5f, 0xbc, 0x8b, 0xce, + 0xa6, 0x36, 0x5f, 0xb5, 0x57, 0xc3, 0x63, 0xbd, 0xc4, 0x3a, 0x48, 0xab, + 0xab, 0x36, 0xbd, 0x80, 0x6c, 0xb4, 0x83, 0x87, 0x36, 0xd4, 0x4e, 0x9f, + 0x9e, 0xa5, 0x65, 0x55, 0x92, 0xad, 0x3b, 0xd0, 0xd1, 0x7a, 0x82, 0x9e, + 0xb3, 0xce, 0x46, 0xc0, 0xc0, 0x61, 0x8b, 0x67, 0x41, 0xa0, 0x40, 0x56, + 0x8c, 0x48, 0x4c, 0x33, 0x8b, 0xcf, 0x5f, 0x83, 0x85, 0x5b, 0xa1, 0xb1, + 0x91, 0x49, 0x8f, 0x9b, 0x80, 0x98, 0xa5, 0xd3, 0x3b, 0x33, 0xc0, 0xc7, + 0xcb, 0xa6, 0x7a, 0x8c, 0x47, 0x4c, 0x35, 0x31, 0x3d, 0x34, 0xb8, 0x6c, + 0x5b, 0x74, 0xc5, 0x7b, 0x5d, 0x98, 0x6f, 0x5d, 0x8e, 0x37, 0x3b, 0x47, + 0x75, 0x92, 0x50, 0x9b, 0xb4, 0x2f, 0x86, 0x3c, 0xd1, 0x7a, 0x46, 0x86, + 0xbc, 0x40, 0x6e, 0xb0, 0x41, 0x69, 0x6e, 0xc1, 0xb8, 0xb8, 0xd4, 0x9c, + 0x44, 0x7d, 0x78, 0xcd, 0x78, 0x66, 0x99, 0x4a, 0x76, 0xa5, 0x41, 0x47, + 0xb3, 0x6f, 0xb8, 0x85, 0x67, 0x6f, 0x68, 0xb3, 0xae, 0x9f, 0x82, 0x32, + 0xb4, 0x7a, 0x39, 0x7d, 0x83, 0xa3, 0x9b, 0xa3, 0x72, 0xc6, 0xdc, 0x8e, + 0xa8, 0x7a, 0x92, 0x5b, 0x80, 0xc7, 0xbd, 0xc3, 0x6e, 0x39, 0xab, 0x51, + 0x9f, 0x9a, 0x46, 0x4c, 0x6e, 0xb8, 0xd9, 0xc6, 0xae, 0x40, 0xb8, 0x6d, + 0x37, 0xa1, 0xbd, 0x50, 0x3a, 0x38, 0x82, 0x7d, 0x5a, 0x9d, 0x4f, 0x85, + 0x91, 0xa2, 0x32, 0xc7, 0x9f, 0x37, 0x2e, 0xcd, 0x43, 0x78, 0x5b, 0xb2, + 0x40, 0x94, 0x53, 0xc9, 0x71, 0x63, 0xc0, 0x5e, 0xae, 0xa1, 0xb2, 0x55, + 0x42, 0x5a, 0x9e, 0x53, 0xb3, 0x3b, 0x85, 0x5f, 0xc6, 0xc8, 0x6e, 0x53, + 0x9a, 0xca, 0x84, 0x90, 0x33, 0x5a, 0x81, 0x93, 0xb5, 0x6b, 0x7a, 0xad, + 0xaf, 0xaa, 0x38, 0x5b, 0x49, 0x84, 0xcc, 0x6f, 0xab, 0xb4, 0x48, 0x7c, + 0x61, 0x49, 0xab, 0x36, 0x5f, 0x51, 0x70, 0xc3, 0x9b, 0xc8, 0xd1, 0xa3, + 0xc1, 0xd2, 0xc8, 0x58, 0xa7, 0x8a, 0xa7, 0xd3, 0xbc, 0xc8, 0x45, 0xa3, + 0x73, 0x46, 0x6d, 0x81, 0xaf, 0xa9, 0x54, 0x89, 0x89, 0x62, 0xb7, 0x96, + 0x7b, 0x8d, 0xb1, 0x48, 0xab, 0xca, 0xbb, 0x91, 0x7d, 0x57, 0x7f, 0xb0, + 0x5f, 0xd4, 0x58, 0x7a, 0x8f, 0x4a, 0x76, 0x64, 0x38, 0x89, 0xcb, 0x82, + 0x6a, 0x56, 0x90, 0x5f, 0x66, 0x84, 0x70, 0xba, 0x43, 0x92, 0x5f, 0x52, + 0xcd, 0xb2, 0x8a, 0x62, 0x72, 0xc9, 0x38, 0x6c, 0xcb, 0x79, 0xb4, 0xe0, + 0xac, 0xb9, 0xb9, 0xc9, 0x9b, 0xd6, 0x92, 0x5a, 0x57, 0xab, 0x87, 0xa5, + 0x41, 0x57, 0x88, 0x33, 0xbf, 0x88, 0x50, 0x95, 0xc2, 0x77, 0xc5, 0x5c, + 0x9c, 0x81, 0xc5, 0x64, 0xb5, 0x80, 0x3f, 0xc2, 0xa8, 0x89, 0x45, 0xa1, + 0xc3, 0x35, 0x7a, 0x98, 0xce, 0x9d, 0x65, 0x6d, 0x9f, 0x87, 0x9a, 0x31, + 0x8d, 0x92, 0xb9, 0x84, 0x29, 0xbf, 0x8e, 0x8d, 0x3c, 0x58, 0x5b, 0x6c, + 0xb2, 0x37, 0x34, 0x94, 0x86, 0x34, 0x95, 0x7a, 0x3c, 0x5d, 0xd7, 0xc6, + 0xb6, 0x9f, 0xd0, 0xc2, 0x9e, 0x6b, 0x3b, 0x30, 0xb5, 0xaf, 0x5e, 0x6a, + 0xaa, 0xbb, 0x9c, 0xaa, 0x31, 0xad, 0x59, 0xa7, 0x81, 0x35, 0x66, 0x6a, + 0xca, 0x9b, 0x85, 0xc1, 0xac, 0xc3, 0x9f, 0x2f, 0x70, 0xaf, 0x4a, 0x69, + 0xb3, 0xd9, 0xcf, 0x3c, 0xb2, 0x75, 0xa8, 0xc6, 0x78, 0x6e, 0x87, 0xc7, + 0x48, 0x50, 0x33, 0x6a, 0x75, 0x75, 0x58, 0x2b, 0xa2, 0x45, 0xb2, 0x53, + 0x74, 0x4a, 0x93, 0x97, 0x49, 0x6f, 0x51, 0x37, 0x35, 0x84, 0xce, 0x6e, + 0x63, 0x74, 0x44, 0xb9, 0x77, 0xae, 0x7e, 0xd0, 0x31, 0x87, 0x86, 0xd1, + 0x75, 0xa5, 0x9e, 0x9c, 0xd0, 0x6f, 0x56, 0x6e, 0xb8, 0xc4, 0xba, 0x72, + 0xaa, 0x71, 0xab, 0x33, 0x59, 0xa0, 0x5b, 0xb6, 0x4f, 0xcb, 0xbb, 0xd5, + 0x4d, 0x6c, 0x6f, 0x5b, 0x43, 0x3f, 0x52, 0x5c, 0xa5, 0x52, 0x34, 0x8c, + 0x9b, 0x6d, 0xac, 0x8a, 0xb6, 0x34, 0x93, 0x86, 0x9b, 0x6b, 0xc9, 0x8b, + 
0x6e, 0xc1, 0x84, 0x49, 0x98, 0x6e, 0x5d, 0xc2, 0x80, 0x60, 0x96, 0xc9, + 0xc6, 0x56, 0xa6, 0x78, 0xaa, 0xa5, 0x42, 0x6d, 0xc6, 0x7d, 0x8d, 0x38, + 0x5f, 0x98, 0x9d, 0x3c, 0x8c, 0xb1, 0x7d, 0xbe, 0x3d, 0x9a, 0x9e, 0x32, + 0x6e, 0xcb, 0xce, 0x99, 0x79, 0x2f, 0x75, 0x70, 0x39, 0x33, 0x6b, 0xb6, + 0x9f, 0x3c, 0xcc, 0x59, 0x60, 0x90, 0xbc, 0x90, 0x53, 0x60, 0xae, 0xc3, + 0xbd, 0x78, 0xc0, 0x76, 0x89, 0x3b, 0x75, 0x95, 0xb0, 0xb9, 0x91, 0xb3, + 0x7b, 0xbb, 0x98, 0x5b, 0x94, 0x3c, 0x73, 0x3f, 0x72, 0x9c, 0xd0, 0xab, + 0xa5, 0x4b, 0xc8, 0x54, 0xcf, 0x5d, 0xa6, 0x72, 0x95, 0x5b, 0xad, 0xc8, + 0x6c, 0x53, 0x57, 0x76, 0x50, 0x3d, 0x77, 0x6e, 0xc8, 0xbd, 0x78, 0xcf, + 0x67, 0xbf, 0x8e, 0x46, 0xc0, 0x46, 0x7d, 0x38, 0x8c, 0x2f, 0x3f, 0xb0, + 0x3d, 0x3d, 0xa4, 0x64, 0xb1, 0x9e, 0x5f, 0x46, 0xa6, 0x65, 0x68, 0x5d, + 0x4b, 0xbb, 0x58, 0xd0, 0xaa, 0xb2, 0x95, 0x64, 0xb0, 0x94, 0x92, 0x6c, + 0x9d, 0x87, 0x84, 0x46, 0xc6, 0xb3, 0xce, 0xc3, 0xaf, 0xa2, 0xbf, 0x75, + 0xc5, 0x38, 0x9d, 0x57, 0xac, 0x5b, 0x9e, 0xba, 0x9d, 0xb0, 0xaa, 0x4f, + 0x60, 0x3c, 0xad, 0xb0, 0xad, 0xb5, 0xa9, 0x86, 0x54, 0x96, 0x7b, 0xb7, + 0x42, 0x90, 0xc7, 0x33, 0x6d, 0x4b, 0xcd, 0x8e, 0xcd, 0x57, 0x99, 0xc4, + 0x44, 0x98, 0x94, 0x2e, 0x8a, 0x5e, 0x60, 0xa0, 0x89, 0xc8, 0x4e, 0x42, + 0x46, 0xaa, 0x4e, 0x7a, 0x4e, 0xbc, 0x52, 0x8e, 0x48, 0x63, 0x69, 0x57, + 0xa6, 0x46, 0xc3, 0x3e, 0x6c, 0x57, 0x65, 0xcc, 0xc4, 0x78, 0xc4, 0x3f, + 0x5f, 0x43, 0x79, 0x6e, 0x8c, 0xa0, 0x56, 0x4b, 0x88, 0x9e, 0xce, 0xab, + 0x50, 0x59, 0x9c, 0x3a, 0x57, 0x52, 0x86, 0xc0, 0xbe, 0xb6, 0x80, 0x4f, + 0x3f, 0xbf, 0x9e, 0xb2, 0xac, 0x64, 0xb8, 0xd3, 0x9c, 0x8b, 0xcc, 0xa6, + 0x85, 0xc6, 0x56, 0xb0, 0x52, 0xc5, 0xbe, 0x92, 0x8e, 0x4b, 0x5b, 0x49, + 0x3f, 0xb1, 0x4c, 0x6a, 0x9a, 0x91, 0xa8, 0xcc, 0x4a, 0xca, 0x87, 0x7b, + 0x49, 0x84, 0xc4, 0x7e, 0xb3, 0xcd, 0x54, 0x6f, 0x4e, 0x39, 0x76, 0x3e, + 0x8e, 0x75, 0x37, 0xab, 0x94, 0xc1, 0x65, 0x45, 0x7d, 0xa5, 0x29, 0x33, + 0x7b, 0x2d, 0xa5, 0x35, 0xb2, 0x62, 0x49, 0xc6, 0x56, 0x5d, 0x5b, 0x4c, + 0x51, 0x62, 0x5e, 0x44, 0x39, 0x36, 0x5e, 0x7f, 0xc5, 0x72, 0x2e, 0x85, + 0x36, 0x71, 0xc3, 0x70, 0x60, 0x55, 0xb2, 0x9a, 0x4a, 0xc5, 0x51, 0x57, + 0x3b, 0x72, 0xc8, 0xac, 0x59, 0xb6, 0x5d, 0x9b, 0x99, 0x35, 0xc9, 0xa5, + 0xc0, 0x98, 0x61, 0xc3, 0x9e, 0xd4, 0xd5, 0x31, 0xc3, 0x72, 0x71, 0xbe, + 0x6b, 0xd3, 0x4e, 0x8c, 0xcd, 0x43, 0x3d, 0x43, 0xbd, 0x7e, 0x3a, 0xaf, + 0x4c, 0xae, 0x5c, 0x8f, 0xab, 0xa8, 0x8b, 0xba, 0x92, 0x37, 0x83, 0x6b, + 0x44, 0x61, 0x7b, 0x61, 0x3f, 0x8d, 0xda, 0xc5, 0xa6, 0x9d, 0x90, 0xd3, + 0x84, 0xcf, 0x43, 0x84, 0x91, 0x93, 0x45, 0xce, 0x49, 0x74, 0x5b, 0x55, + 0x9c, 0xb4, 0x8b, 0xbc, 0x65, 0x4b, 0x4b, 0x4d, 0x61, 0xce, 0x7b, 0xc8, + 0x8b, 0x62, 0x47, 0x6d, 0xcf, 0xb7, 0x4d, 0x51, 0x98, 0xa8, 0xd1, 0x91, + 0xbc, 0x5f, 0x82, 0x86, 0x82, 0x6b, 0x4e, 0x5e, 0xa5, 0x81, 0xab, 0xc1, + 0x75, 0x43, 0x86, 0xa0, 0xd1, 0x31, 0x4f, 0xd3, 0x81, 0x7d, 0x4b, 0x3f, + 0x80, 0xbf, 0xcd, 0xa1, 0xb4, 0xb6, 0x5e, 0x92, 0x3b, 0x7c, 0x55, 0xbc, + 0x47, 0xa8, 0x6d, 0x89, 0xc0, 0xa8, 0xaf, 0xaf, 0xa3, 0x87, 0x34, 0x98, + 0x32, 0x46, 0x82, 0xae, 0x81, 0xa4, 0xb4, 0xcf, 0xac, 0x74, 0x64, 0xbb, + 0xa1, 0x73, 0x65, 0x63, 0x6e, 0x69, 0x86, 0x38, 0xd1, 0x52, 0xb1, 0x41, + 0xa6, 0x65, 0xa3, 0x37, 0xb4, 0xcc, 0x38, 0x65, 0x8d, 0xbe, 0xcf, 0x39, + 0x8c, 0x52, 0xb9, 0x5b, 0x4f, 0x97, 0x98, 0xcd, 0xcc, 0x4e, 0x49, 0xc0, + 0x8b, 0x73, 0xc7, 0x90, 0x50, 0x96, 0x46, 0xc2, 0xbf, 0x2c, 0xa4, 0x6f, + 0x4b, 0x4b, 0x47, 0xd2, 0xcc, 0x37, 0x82, 0x8c, 0xc8, 0xaf, 0x41, 0x7f, + 0x7b, 0xa6, 0xd7, 0x55, 0x58, 0x98, 0x45, 0xbb, 0xbe, 0x93, 0xde, 0xb1, + 
0x39, 0xa7, 0xbb, 0xaf, 0x47, 0x51, 0x60, 0x59, 0xc3, 0xce, 0xb9, 0x5c, + 0x57, 0x3d, 0x48, 0xc2, 0x83, 0x3f, 0x2d, 0x87, 0x90, 0xb5, 0x96, 0x2d, + 0xd2, 0x97, 0xa6, 0xbe, 0x74, 0x3d, 0x3f, 0x5a, 0x8c, 0x67, 0xd3, 0x4c, + 0x45, 0xa5, 0x80, 0x83, 0x3a, 0x5f, 0x53, 0x7f, 0xa7, 0x40, 0x4f, 0xcf, + 0x48, 0xb3, 0x80, 0x33, 0xab, 0x69, 0xc1, 0x81, 0x9d, 0xce, 0x49, 0x87, + 0x3f, 0x90, 0x5f, 0x97, 0x73, 0xa6, 0x5e, 0x3e, 0xc0, 0x31, 0xa1, 0x8a, + 0x58, 0x75, 0x90, 0x3e, 0xb0, 0xa6, 0x9f, 0x5d, 0x5e, 0xbe, 0x7f, 0x73, + 0x90, 0x83, 0xc3, 0x93, 0x4f, 0x5d, 0xa9, 0xd2, 0xcf, 0xbb, 0x64, 0x84, + 0x6e, 0x62, 0x3a, 0x32, 0x58, 0x3a, 0x9c, 0xce, 0xb2, 0x5d, 0xa9, 0xb5, + 0x90, 0x59, 0x81, 0xb5, 0x91, 0x9e, 0x69, 0xa4, 0xd0, 0xa9, 0x87, 0x52, + 0x38, 0xae, 0x5b, 0xb2, 0x3b, 0x3c, 0x52, 0x65, 0xb5, 0x3c, 0xc3, 0x5e, + 0x45, 0xa9, 0x97, 0xc7, 0x4d, 0xd0, 0x7a, 0x63, 0x97, 0xc3, 0x47, 0x60, + 0xd3, 0xba, 0x35, 0x4c, 0x3a, 0x8b, 0xd0, 0xad, 0x57, 0x5b, 0xa0, 0x50, + 0x5d, 0x58, 0xc9, 0x5f, 0x43, 0x70, 0x76, 0x4d, 0x8c, 0x85, 0xb5, 0xc7, + 0x4c, 0x5c, 0xa6, 0xa7, 0xd1, 0x9e, 0x99, 0x9b, 0xcd, 0x61, 0x35, 0x6a, + 0x99, 0x51, 0x8c, 0x64, 0xcf, 0x89, 0x81, 0xc8, 0x9a, 0x84, 0x86, 0x5e, + 0x84, 0xae, 0x9e, 0xaa, 0xae, 0x9b, 0xa9, 0x9d, 0xa8, 0x6d, 0x68, 0x68, + 0xb1, 0x42, 0xa0, 0x93, 0xaf, 0x70, 0x81, 0xb2, 0xbc, 0x74, 0x9d, 0x6d, + 0x5a, 0xd3, 0x69, 0xb9, 0x85, 0xa1, 0x8b, 0x76, 0xa9, 0x3a, 0x80, 0x82, + 0xc4, 0x39, 0x94, 0xb7, 0x61, 0x8d, 0x68, 0x77, 0x61, 0x98, 0xbd, 0x79, + 0xbe, 0x5e, 0x90, 0x2e, 0x37, 0x6f, 0x54, 0x70, 0x8c, 0x98, 0x9b, 0xc2, + 0x71, 0x5e, 0x3d, 0x9d, 0xa6, 0x73, 0xa6, 0x96, 0xcb, 0x86, 0x57, 0x72, + 0x94, 0xd2, 0x86, 0xac, 0xaf, 0xcb, 0xa3, 0x7e, 0xa8, 0x73, 0xb5, 0x59, + 0x5e, 0xb7, 0x88, 0x75, 0x74, 0x5a, 0x9a, 0xc7, 0xd2, 0xc3, 0x92, 0x52, + 0xd3, 0x7d, 0x4d, 0x4f, 0x91, 0x7c, 0x5e, 0x3e, 0xcd, 0x89, 0x4a, 0x5c, + 0x7e, 0x6d, 0x9c, 0xbe, 0x5d, 0x71, 0x71, 0x98, 0xd2, 0x52, 0x7d, 0x57, + 0x3a, 0x31, 0x47, 0xa5, 0xcf, 0x86, 0x4f, 0x50, 0xd0, 0x49, 0x48, 0x5f, + 0x56, 0xac, 0x97, 0xcf, 0x39, 0x73, 0x5e, 0x3b, 0x90, 0xd2, 0x93, 0x54, + 0x61, 0x5a, 0x69, 0x46, 0xd1, 0x86, 0xa9, 0x58, 0xc3, 0xaf, 0x41, 0x52, + 0x67, 0x83, 0x7d, 0x3b, 0xbb, 0xb1, 0x3b, 0x73, 0x6f, 0x8c, 0xc0, 0xd1, + 0xab, 0x56, 0x83, 0x83, 0x42, 0xc4, 0x62, 0x79, 0x50, 0x6d, 0x88, 0x49, + 0x6e, 0x53, 0x79, 0xd5, 0x74, 0x96, 0x71, 0x6a, 0x39, 0xb8, 0x5d, 0x3e, + 0x70, 0x68, 0x71, 0x58, 0xc7, 0x82, 0xc2, 0xd9, 0xaf, 0xa9, 0x78, 0x8c, + 0x3f, 0xc1, 0x40, 0xb8, 0x84, 0x42, 0x9e, 0x7d, 0x4d, 0x84, 0x9a, 0x9f, + 0x76, 0x41, 0x70, 0xbd, 0x73, 0xbc, 0x35, 0xc8, 0x81, 0x71, 0x7d, 0x41, + 0x9f, 0x71, 0xb2, 0x39, 0xa1, 0xbb, 0x89, 0x34, 0x91, 0xc2, 0xbf, 0xa5, + 0xa6, 0x8c, 0x9c, 0x70, 0xd4, 0x90, 0x62, 0x60, 0xb4, 0x4e, 0x40, 0x79, + 0x91, 0xa6, 0x8b, 0x27, 0x52, 0xac, 0x7b, 0xcd, 0x82, 0xb3, 0x7d, 0xa1, + 0xc3, 0x3e, 0x46, 0x53, 0xb8, 0x33, 0x6d, 0x85, 0x45, 0x70, 0xb5, 0x70, + 0x8a, 0x3a, 0xc6, 0x89, 0x49, 0x8d, 0x5d, 0x5a, 0x8c, 0x80, 0x76, 0x38, + 0x7e, 0x4e, 0x5d, 0x88, 0x4e, 0x45, 0x48, 0x68, 0x4c, 0x36, 0x2e, 0x91, + 0x92, 0x84, 0x3a, 0x54, 0x7c, 0xa4, 0x92, 0xc8, 0x76, 0xc6, 0x9a, 0x56, + 0x7c, 0x6b, 0x4f, 0x2b, 0x46, 0x8b, 0x7e, 0x9a, 0x55, 0x8c, 0x91, 0x63, + 0x7c, 0xcc, 0xa7, 0xb7, 0xc0, 0x35, 0x74, 0x38, 0xa5, 0x7b, 0x9d, 0x6d, + 0x87, 0xc0, 0x86, 0x72, 0xa6, 0x80, 0x86, 0x38, 0xcf, 0x5b, 0x9a, 0xcf, + 0x50, 0xae, 0x5c, 0x77, 0x3c, 0x4e, 0x80, 0x3c, 0x81, 0xa9, 0x39, 0xaa, + 0x77, 0xa0, 0xb2, 0x6f, 0x8a, 0xd1, 0xc4, 0x5a, 0x4e, 0x6f, 0xbc, 0x4c, + 0xaf, 0x94, 0x71, 0x70, 0x8b, 0x74, 0xd3, 0x7e, 0x46, 0xb2, 0x5c, 0x60, + 
0xa8, 0xb7, 0x6c, 0x58, 0xd4, 0x7d, 0x5b, 0xc5, 0x3c, 0x9d, 0x6b, 0xd0, + 0x74, 0xa0, 0xc9, 0xcb, 0xa4, 0x47, 0xd0, 0x84, 0xd4, 0xc0, 0xaf, 0xc1, + 0x81, 0x4d, 0x70, 0xc8, 0x80, 0xac, 0x2e, 0x66, 0x4b, 0x88, 0xbe, 0xb9, + 0x4f, 0x92, 0xb8, 0xbc, 0x7f, 0x36, 0x67, 0xd1, 0x43, 0x79, 0x56, 0x9b, + 0x98, 0x3c, 0x32, 0x6d, 0x86, 0xac, 0x45, 0xb3, 0xb1, 0x50, 0x7c, 0x28, + 0x5a, 0x8f, 0xb4, 0xb3, 0x60, 0x89, 0x3a, 0xdb, 0xc5, 0x3f, 0xd0, 0xcc, + 0x37, 0x47, 0x77, 0x4e, 0xb6, 0x4d, 0x8c, 0x30, 0x38, 0xb3, 0x43, 0x94, + 0x66, 0x4b, 0x98, 0x71, 0xa6, 0x5a, 0x89, 0xc9, 0xd6, 0xd0, 0x9e, 0x68, + 0x7b, 0x8b, 0x4c, 0x4b, 0x85, 0x89, 0x94, 0x49, 0xc3, 0xbc, 0x76, 0x82, + 0xb8, 0xcb, 0x43, 0xb4, 0xc6, 0x9d, 0x52, 0xaf, 0x54, 0x6e, 0x80, 0xcf, + 0xa2, 0xcf, 0x4b, 0xda, 0xb3, 0x55, 0x68, 0x74, 0xc2, 0x8b, 0x6a, 0x67, + 0x92, 0x36, 0x8b, 0x3d, 0x38, 0x40, 0x67, 0x9b, 0x72, 0x51, 0x47, 0x87, + 0xc0, 0xa2, 0x7c, 0xc6, 0xc2, 0x6b, 0x6a, 0x39, 0x50, 0x87, 0x8f, 0x43, + 0x47, 0x21, 0x7c, 0x86, 0x8e, 0x88, 0x99, 0xa2, 0xb4, 0x3b, 0x77, 0x8f, + 0x9d, 0x9c, 0x6c, 0x5c, 0xd6, 0xac, 0x9e, 0x9a, 0x72, 0xb0, 0x79, 0x70, + 0x6e, 0x2d, 0x84, 0x65, 0x7d, 0x25, 0x54, 0x71, 0xc0, 0x7e, 0x73, 0x92, + 0xb2, 0x35, 0xc7, 0x49, 0xab, 0xa9, 0x93, 0x81, 0x92, 0x9e, 0xd2, 0x67, + 0x9a, 0x85, 0x6e, 0x2e, 0x41, 0xc0, 0x48, 0xa4, 0x91, 0x27, 0x34, 0x52, + 0x6e, 0x3c, 0xc7, 0x86, 0xa4, 0x64, 0x68, 0x4d, 0xc4, 0xd4, 0xdb, 0xc1, + 0x49, 0x5e, 0x7e, 0xac, 0x38, 0xce, 0x8e, 0xc6, 0x44, 0xa6, 0x40, 0xd5, + 0x66, 0xd8, 0xb6, 0x59, 0x54, 0x4c, 0x6f, 0x79, 0x8f, 0x7a, 0x5f, 0x4e, + 0x64, 0x9e, 0x8f, 0x44, 0x47, 0x2e, 0x82, 0x81, 0x46, 0x72, 0xc6, 0x9b, + 0x57, 0x5e, 0xac, 0x70, 0x4d, 0x4a, 0xc8, 0x98, 0x85, 0x90, 0xb5, 0x99, + 0x39, 0x4a, 0x78, 0x57, 0x38, 0x46, 0xc6, 0x81, 0x69, 0x42, 0xbc, 0x8c, + 0x7a, 0x4c, 0x3b, 0x7f, 0x82, 0x61, 0x55, 0x70, 0x78, 0x9e, 0x3c, 0x56, + 0x60, 0x8d, 0xb1, 0xba, 0x87, 0xb3, 0x81, 0xa1, 0x99, 0xa0, 0xc6, 0xb8, + 0xca, 0x99, 0x34, 0x68, 0xc1, 0x8f, 0x56, 0x50, 0x8e, 0xb0, 0x88, 0x7a, + 0xa3, 0xbf, 0x36, 0xb6, 0xb0, 0xc5, 0x9e, 0xa0, 0xae, 0x4b, 0x47, 0xb6, + 0x84, 0x49, 0x2a, 0xad, 0x5d, 0x89, 0x73, 0x43, 0x52, 0x28, 0xa4, 0x8a, + 0xa9, 0xd0, 0x95, 0x7b, 0x81, 0x60, 0xba, 0xbb, 0x9c, 0x49, 0x56, 0xad, + 0x54, 0xa5, 0x69, 0x69, 0x8b, 0xbb, 0x69, 0x60, 0x3a, 0xa4, 0x64, 0xcd, + 0x5d, 0x5b, 0xd0, 0x4b, 0x7b, 0xd3, 0x65, 0x96, 0xc4, 0xca, 0x80, 0x99, + 0x53, 0x8d, 0x65, 0x68, 0xc7, 0xb2, 0xb0, 0xbd, 0xa8, 0xc8, 0x47, 0x6d, + 0x6d, 0x5b, 0xa7, 0x80, 0xbd, 0x60, 0x27, 0x65, 0xd0, 0xa2, 0x9f, 0x86, + 0x28, 0x42, 0xa7, 0xa6, 0x73, 0x35, 0x5a, 0x85, 0x40, 0xa4, 0x56, 0xeb, + 0x42, 0x51, 0x9e, 0x77, 0x9e, 0x4d, 0xa4, 0x7c, 0x6d, 0xcb, 0x4d, 0x3d, + 0x63, 0x5c, 0x2c, 0xa9, 0xb0, 0x91, 0x7f, 0xb7, 0x5d, 0x60, 0x77, 0xa8, + 0x8a, 0x37, 0xc6, 0xa7, 0x55, 0x74, 0xb7, 0x69, 0xbf, 0x56, 0x70, 0x39, + 0xcd, 0x82, 0x50, 0x88, 0xa6, 0xb4, 0xb3, 0x8c, 0xc9, 0x52, 0x67, 0x67, + 0xa2, 0x69, 0x58, 0x57, 0x77, 0x99, 0x79, 0x8f, 0x47, 0x56, 0x49, 0x65, + 0xbb, 0xbe, 0xca, 0xcf, 0x6b, 0x87, 0x94, 0xc5, 0xc0, 0x8c, 0x44, 0x5e, + 0x34, 0x47, 0xc6, 0x57, 0x53, 0xc3, 0xac, 0x64, 0x48, 0x4c, 0xa1, 0x78, + 0x6c, 0x70, 0x84, 0x40, 0x36, 0x36, 0x7a, 0x7d, 0xb9, 0xad, 0x5e, 0x57, + 0x87, 0x8a, 0x5b, 0x9f, 0xac, 0x4f, 0xb9, 0x3e, 0xac, 0xc8, 0x7b, 0x35, + 0x86, 0xaf, 0x90, 0x82, 0x99, 0x9d, 0x64, 0x4e, 0xaa, 0x9e, 0x93, 0x4a, + 0x7c, 0xad, 0x50, 0xa4, 0xab, 0x91, 0xbf, 0xae, 0xae, 0xd5, 0xa4, 0xbf, + 0x57, 0xaf, 0xc5, 0x4a, 0x7c, 0xbc, 0x84, 0x7a, 0x43, 0xa5, 0xa6, 0x85, + 0x3a, 0x66, 0xc3, 0x39, 0x42, 0x80, 0x5c, 0xc6, 0x43, 0x49, 0x50, 0xbd, + 
0x91, 0xd2, 0x9b, 0xb9, 0xba, 0x6e, 0x6b, 0x90, 0xb6, 0x93, 0xa1, 0x8d, + 0xb4, 0x77, 0x61, 0x39, 0xa5, 0x85, 0x4d, 0x3e, 0x79, 0xa7, 0x44, 0x6c, + 0x98, 0x91, 0x45, 0x7b, 0x8d, 0xca, 0x7d, 0xc7, 0x66, 0xb9, 0xe0, 0x49, + 0x46, 0xc3, 0x31, 0xb8, 0x92, 0x65, 0x71, 0x4d, 0x5d, 0xd6, 0x9d, 0xb0, + 0x81, 0x4a, 0x97, 0xad, 0x3c, 0x73, 0x97, 0x7f, 0x82, 0x40, 0x96, 0x39, + 0xb6, 0xc9, 0xbc, 0xa2, 0x9a, 0x63, 0x43, 0x40, 0xc9, 0x90, 0x8a, 0x49, + 0xd0, 0xa1, 0xc1, 0xd1, 0x4b, 0x70, 0x40, 0x96, 0x52, 0x57, 0xac, 0x42, + 0x8d, 0x7e, 0x4d, 0x61, 0x81, 0xc2, 0xd6, 0xcf, 0x90, 0xb4, 0x3c, 0x95, + 0x78, 0xb6, 0xc3, 0x2e, 0xb0, 0xb3, 0xbf, 0x7a, 0x44, 0xdc, 0xd8, 0x62, + 0xbf, 0x87, 0xaf, 0x74, 0x3a, 0xad, 0x3c, 0xc0, 0x82, 0xb1, 0x89, 0x9f, + 0xd4, 0x9a, 0x3b, 0x67, 0xc7, 0xb7, 0x9a, 0x65, 0x77, 0xc7, 0x6b, 0x47, + 0x9b, 0x63, 0x36, 0xc9, 0x94, 0x71, 0x3b, 0xa7, 0x62, 0xbe, 0x40, 0xa1, + 0x43, 0xae, 0x47, 0x30, 0x41, 0x86, 0xa6, 0xa0, 0xc1, 0xcb, 0x71, 0xb3, + 0x5e, 0x49, 0x4f, 0xbc, 0xb0, 0x94, 0x9b, 0x93, 0x4f, 0x49, 0xd2, 0xb1, + 0x9b, 0x59, 0x7d, 0x5e, 0x99, 0x48, 0x6e, 0x8a, 0x94, 0x95, 0xc8, 0xb2, + 0x5d, 0x30, 0x7e, 0x8d, 0x9f, 0x5a, 0x76, 0xa3, 0x73, 0xb3, 0xb8, 0x63, + 0x64, 0xe6, 0x74, 0x9a, 0x6e, 0x6c, 0xb3, 0x77, 0xcf, 0x34, 0x8a, 0xa0, + 0xae, 0xa9, 0xb3, 0x60, 0xaf, 0x54, 0xa0, 0xb2, 0x37, 0x94, 0x6c, 0xd4, + 0x40, 0xd2, 0xd3, 0xbd, 0x97, 0x9b, 0x5a, 0xb6, 0x56, 0x7c, 0x8f, 0xb8, + 0xa0, 0x26, 0xa7, 0x69, 0x4b, 0x63, 0x70, 0x98, 0x73, 0x81, 0x36, 0x7f, + 0x84, 0xbf, 0x4c, 0xb4, 0x53, 0x95, 0x3c, 0x6e, 0xc0, 0xbd, 0xca, 0x54, + 0x6d, 0x8e, 0xbb, 0x6a, 0x90, 0xbf, 0x77, 0x84, 0x63, 0x5e, 0x9c, 0x95, + 0x43, 0x90, 0xb9, 0x44, 0x7d, 0x96, 0x57, 0xb6, 0x6c, 0x6a, 0xb4, 0xc8, + 0x95, 0x90, 0xa2, 0x9c, 0x7a, 0x60, 0x48, 0xd2, 0xb9, 0xa0, 0x33, 0x97, + 0x8e, 0x7c, 0xb5, 0xb6, 0x79, 0xa7, 0x57, 0x42, 0x58, 0x73, 0x82, 0xcf, + 0x9b, 0xdb, 0xab, 0x3c, 0x59, 0x79, 0x54, 0x63, 0x75, 0xbd, 0x5c, 0x5c, + 0x49, 0x6a, 0x51, 0xb7, 0x77, 0x65, 0x9d, 0x78, 0x87, 0x63, 0xa0, 0x8b, + 0xc9, 0xc9, 0x8b, 0xcb, 0x6c, 0x94, 0xb6, 0x31, 0x78, 0x56, 0xa3, 0x82, + 0x8c, 0x8c, 0xbf, 0x7e, 0x64, 0x73, 0x3a, 0x70, 0x9d, 0x73, 0x37, 0xc3, + 0xc6, 0xb5, 0x97, 0x46, 0x3c, 0x85, 0x3a, 0x85, 0xa0, 0xaa, 0x83, 0x47, + 0x5c, 0x9e, 0x67, 0x9a, 0x3f, 0x8b, 0x59, 0x92, 0x93, 0xcb, 0x83, 0xaf, + 0x74, 0x74, 0x74, 0x58, 0x38, 0x92, 0x8b, 0xb4, 0xc1, 0x44, 0x5d, 0xd5, + 0x5d, 0x7c, 0x92, 0x68, 0x79, 0xb5, 0xbb, 0xd0, 0x7a, 0x32, 0xaa, 0x7f, + 0x91, 0x96, 0x5c, 0x79, 0x74, 0x2e, 0x37, 0xa3, 0xa3, 0xb1, 0xcb, 0xa2, + 0x58, 0xbb, 0x99, 0x5d, 0x32, 0x42, 0x86, 0x8e, 0xc3, 0x99, 0xc1, 0x47, + 0x81, 0x45, 0xc7, 0x76, 0x72, 0x5e, 0xb4, 0x5b, 0x7a, 0xcd, 0x5c, 0x5f, + 0x34, 0x6e, 0xb2, 0x52, 0x54, 0xbd, 0x61, 0x6b, 0x36, 0x89, 0xac, 0x87, + 0x3d, 0xaa, 0x4c, 0x3b, 0xb2, 0xd1, 0x4f, 0x45, 0x6d, 0x41, 0xb6, 0x5d, + 0xc4, 0x28, 0x4a, 0x6d, 0x3e, 0x72, 0xc8, 0x75, 0x6c, 0x46, 0xc1, 0x7d, + 0x56, 0xcc, 0x83, 0x75, 0xa5, 0xb1, 0xb3, 0x32, 0x7f, 0xab, 0x8a, 0x93, + 0xa3, 0x54, 0x67, 0xb4, 0x40, 0xd3, 0xa2, 0x69, 0x73, 0xb4, 0x8d, 0x9e, + 0x6c, 0x31, 0x78, 0x6f, 0xb6, 0x41, 0xb7, 0x5c, 0x72, 0x62, 0x81, 0x31, + 0xb3, 0xa6, 0x79, 0x45, 0xa3, 0xbf, 0x52, 0x31, 0x52, 0x42, 0x45, 0x9a, + 0x3c, 0x9b, 0xae, 0x4c, 0x92, 0x8b, 0xb4, 0xb1, 0x57, 0x79, 0x57, 0x49, + 0x93, 0xd3, 0x39, 0x43, 0x87, 0x68, 0xa0, 0xc1, 0xaa, 0xc0, 0x45, 0x58, + 0xbb, 0xc3, 0x5d, 0x6d, 0x71, 0x78, 0x8f, 0x35, 0x86, 0x3b, 0x8b, 0x63, + 0xaa, 0x2d, 0xb5, 0xd3, 0x8b, 0x65, 0x56, 0xa6, 0xb4, 0xcc, 0x92, 0xb9, + 0x71, 0x36, 0xce, 0xc2, 0x56, 0x42, 0xb0, 0x5c, 0xb3, 0x9d, 0x50, 0x42, + 
0x73, 0x4a, 0x33, 0x39, 0x70, 0x93, 0xcc, 0xab, 0x80, 0x8d, 0x89, 0x6a, + 0x85, 0x45, 0x43, 0x98, 0x96, 0x54, 0xa5, 0x5f, 0x67, 0xbe, 0x8a, 0x8c, + 0x3d, 0x67, 0xb9, 0x4e, 0x5f, 0x9f, 0xa2, 0x8e, 0x7c, 0x5a, 0x66, 0xc7, + 0xd0, 0x40, 0xbe, 0x85, 0x65, 0xad, 0xaa, 0x4b, 0xaf, 0x54, 0xc1, 0xb0, + 0xb1, 0x7a, 0x6c, 0xb1, 0x85, 0x3a, 0x4a, 0xb7, 0x8e, 0xbd, 0x97, 0x99, + 0xad, 0xa6, 0xa5, 0x82, 0x35, 0x81, 0x8d, 0x62, 0xba, 0x8c, 0xc7, 0x71, + 0x56, 0x9a, 0x5b, 0x7a, 0x67, 0x62, 0xc5, 0x8f, 0x4b, 0x77, 0x47, 0xa9, + 0x3f, 0x75, 0x75, 0x8d, 0x49, 0xaa, 0xd3, 0x37, 0x4a, 0x88, 0x9a, 0x7f, + 0x83, 0xa1, 0x90, 0x3a, 0x41, 0xa3, 0x5e, 0xb4, 0x47, 0xb6, 0xc7, 0x59, + 0x66, 0x43, 0xb9, 0x35, 0xca, 0x9a, 0xbb, 0xb5, 0xb8, 0x82, 0x7e, 0x8f, + 0xad, 0x54, 0x6e, 0xb4, 0xc5, 0x55, 0x96, 0x33, 0x70, 0x9c, 0xa5, 0x40, + 0x67, 0xa6, 0x40, 0x87, 0xc7, 0x6f, 0xc4, 0x64, 0x63, 0xc1, 0x3e, 0xa0, + 0x75, 0x8a, 0xaa, 0xb6, 0xa8, 0xb4, 0x42, 0x65, 0x82, 0x4a, 0x76, 0xad, + 0x39, 0x5a, 0x65, 0x50, 0x6d, 0x6f, 0x73, 0xb9, 0x81, 0x53, 0xa2, 0xd3, + 0x75, 0x4c, 0x57, 0x7b, 0x36, 0x95, 0xaa, 0xc0, 0x38, 0xcb, 0x37, 0x59, + 0x78, 0x69, 0xb0, 0x3b, 0xbb, 0x58, 0xb5, 0x5a, 0x32, 0xaf, 0xb5, 0x8e, + 0x81, 0xb7, 0x9f, 0xa3, 0x93, 0x73, 0x9c, 0x46, 0xa6, 0x8a, 0x89, 0x86, + 0x53, 0x52, 0x94, 0x55, 0x51, 0x7c, 0xad, 0x4e, 0x85, 0xc7, 0xaa, 0x8c, + 0xd9, 0xa3, 0xc8, 0x89, 0xad, 0x8c, 0x8d, 0x37, 0x2f, 0x47, 0x95, 0x58, + 0x78, 0xcf, 0x5c, 0x3d, 0x84, 0x63, 0xc8, 0x4f, 0x4a, 0x34, 0x66, 0x3f, + 0x47, 0x65, 0x34, 0x83, 0x3d, 0x39, 0xc3, 0x90, 0x8f, 0xce, 0x44, 0xa8, + 0xb0, 0x71, 0x35, 0x91, 0xb7, 0xab, 0xb9, 0xc7, 0xa0, 0x69, 0x34, 0x4b, + 0x9e, 0x63, 0x9c, 0x9b, 0x35, 0xcd, 0x3d, 0xa4, 0xaf, 0x62, 0xc5, 0xb0, + 0xc0, 0x8d, 0xae, 0xba, 0x86, 0xc0, 0x98, 0x6b, 0x47, 0xcb, 0x9a, 0x42, + 0x47, 0xbf, 0x48, 0x80, 0xc5, 0x81, 0xaf, 0x9e, 0x55, 0xd5, 0x43, 0x36, + 0x6f, 0x68, 0x38, 0x5d, 0x9e, 0x35, 0x4e, 0x4e, 0x85, 0xc2, 0x6e, 0xd1, + 0x79, 0x8b, 0xcc, 0x33, 0x87, 0x30, 0x74, 0x57, 0x50, 0x47, 0x64, 0x5c, + 0x57, 0xb2, 0x43, 0x44, 0x98, 0xd5, 0x9c, 0x5a, 0xd3, 0x4d, 0x84, 0x55, + 0x7c, 0x8b, 0x70, 0xa2, 0x75, 0x43, 0xa9, 0x94, 0x78, 0x65, 0xdb, 0xbd, + 0xb7, 0x72, 0x45, 0x5d, 0x65, 0xcd, 0x61, 0x56, 0x84, 0x6a, 0xd3, 0x8b, + 0x8a, 0x38, 0x6f, 0x41, 0xba, 0x7e, 0x47, 0xc8, 0x91, 0x62, 0xcb, 0x86, + 0x7d, 0x46, 0x44, 0x75, 0x90, 0x4d, 0x9a, 0x7c, 0x3f, 0x5f, 0x74, 0x89, + 0x95, 0xa5, 0x6b, 0x53, 0x42, 0x2d, 0x8a, 0x7e, 0x62, 0xc9, 0x61, 0x8f, + 0x3e, 0x48, 0x58, 0xc8, 0xbb, 0xc6, 0x8f, 0xcc, 0x39, 0x99, 0xd2, 0x88, + 0xc9, 0xd0, 0x74, 0x32, 0xc4, 0x61, 0x40, 0x7e, 0xbf, 0x53, 0xbc, 0x79, + 0x5a, 0xb3, 0x7f, 0x6d, 0x37, 0x46, 0xb7, 0x88, 0x3e, 0x50, 0xb7, 0x6b, + 0x64, 0xb7, 0x8a, 0x30, 0xcb, 0x40, 0x40, 0x38, 0x93, 0x3e, 0x6f, 0xc8, + 0x92, 0x5d, 0x32, 0xc4, 0xb0, 0xb9, 0xae, 0xd0, 0x38, 0x6b, 0x5c, 0xc4, + 0xd6, 0xb3, 0x91, 0x3d, 0x6d, 0xcc, 0x43, 0xb3, 0xa9, 0x3f, 0x4e, 0xcc, + 0xa7, 0xbc, 0xaa, 0x95, 0x85, 0xca, 0x43, 0x98, 0x33, 0x7c, 0x92, 0x4d, + 0x5e, 0xca, 0x97, 0xa8, 0xb8, 0x6e, 0x5b, 0x9e, 0xca, 0x65, 0x81, 0x60, + 0xb2, 0xc2, 0xcb, 0x9f, 0x35, 0x5f, 0x3c, 0x7d, 0x9a, 0xbb, 0x91, 0x80, + 0xaa, 0x58, 0xad, 0x8b, 0x56, 0xe1, 0xbb, 0xb2, 0x61, 0x96, 0xa4, 0x85, + 0x5f, 0xad, 0xa0, 0x91, 0x50, 0x8f, 0xa6, 0x3b, 0xc4, 0x7e, 0x9a, 0x84, + 0x84, 0xc8, 0x3e, 0x9a, 0x54, 0x31, 0x40, 0x58, 0x7e, 0x98, 0x4b, 0xa5, + 0xd1, 0x68, 0xcd, 0x9a, 0x86, 0x6a, 0x8f, 0xb4, 0x59, 0x9f, 0x9a, 0x5e, + 0x76, 0x36, 0xc2, 0xaf, 0x67, 0xcb, 0x8d, 0x7b, 0x67, 0xba, 0x7c, 0x3d, + 0x6f, 0xb6, 0x87, 0xc1, 0x51, 0x42, 0x73, 0xc7, 0x7b, 0x74, 0x7d, 0x98, + 
0xd6, 0x89, 0xbd, 0xcd, 0x44, 0xd0, 0xd2, 0x40, 0xc5, 0x3a, 0x87, 0xc5, + 0x55, 0xb3, 0x43, 0x32, 0x86, 0x75, 0x35, 0x48, 0x65, 0x53, 0xba, 0x81, + 0x59, 0x3a, 0x5e, 0x8e, 0x40, 0x5c, 0x85, 0xab, 0x40, 0x8c, 0x4a, 0xa3, + 0x36, 0x88, 0x3b, 0x5e, 0x33, 0xb2, 0xc8, 0x8e, 0x6a, 0xa3, 0xc0, 0x7d, + 0x9d, 0xb7, 0xbe, 0xc0, 0xc2, 0x76, 0x34, 0x7b, 0x4c, 0x7a, 0x61, 0x28, + 0x34, 0x3f, 0x2f, 0x2b, 0x45, 0x99, 0x64, 0x7c, 0x94, 0x42, 0x40, 0xb0, + 0x54, 0x48, 0x48, 0x41, 0xb3, 0x92, 0x3c, 0x57, 0x63, 0xd2, 0x64, 0x56, + 0x5d, 0x89, 0x9c, 0x61, 0x7d, 0x97, 0x57, 0xa8, 0xbb, 0x8f, 0x46, 0xd2, + 0xb1, 0xa0, 0x84, 0xbf, 0xc6, 0x44, 0xbf, 0x9a, 0x61, 0x2e, 0x75, 0x41, + 0x8f, 0x77, 0x44, 0xc2, 0xb1, 0xa2, 0x49, 0xcf, 0x67, 0xa5, 0xb4, 0xb4, + 0x37, 0x93, 0xa8, 0x69, 0x51, 0xcd, 0x7a, 0x65, 0xaf, 0x4d, 0x49, 0xa6, + 0x9d, 0x70, 0x48, 0xca, 0x35, 0xb0, 0x6b, 0xd1, 0x9c, 0x86, 0xab, 0x9d, + 0x71, 0x95, 0xb0, 0xab, 0x7d, 0xa4, 0xa3, 0xa1, 0x96, 0x35, 0x77, 0x9a, + 0xd3, 0xb1, 0x3d, 0x6b, 0x5c, 0x94, 0x4b, 0xaa, 0x8f, 0x77, 0x4c, 0xbe, + 0x41, 0xc4, 0x3c, 0x9d, 0x57, 0xab, 0x44, 0xa1, 0x37, 0x93, 0x5a, 0x3f, + 0x50, 0x8c, 0xb5, 0xa2, 0xaa, 0xaf, 0x60, 0x67, 0xc5, 0x89, 0x8f, 0x6c, + 0x60, 0x7c, 0xc4, 0xa9, 0xd0, 0x59, 0x2f, 0xc6, 0x85, 0xb8, 0x48, 0x5c, + 0x6c, 0x3b, 0x78, 0x92, 0x7c, 0x82, 0xc1, 0x47, 0x55, 0x88, 0x6c, 0x68, + 0xce, 0x85, 0x8a, 0x70, 0xcf, 0x7a, 0xb7, 0xab, 0xaf, 0x48, 0x55, 0x89, + 0xbb, 0xa4, 0x8e, 0xbc, 0x8b, 0x5c, 0x39, 0x5e, 0xc1, 0x8d, 0xc8, 0x51, + 0x8b, 0x3d, 0x69, 0x62, 0x58, 0x92, 0x83, 0x81, 0x3a, 0x62, 0x40, 0x98, + 0xcb, 0x6c, 0x7c, 0x8e, 0x8e, 0xaf, 0x3f, 0x8d, 0x87, 0xa8, 0x6d, 0xad, + 0xbb, 0xb7, 0x84, 0x5e, 0x62, 0xb0, 0x4d, 0xb1, 0x9c, 0x4c, 0x6c, 0x35, + 0x47, 0x44, 0xc6, 0x86, 0x6b, 0x70, 0xc3, 0x4c, 0xa0, 0x96, 0x76, 0x96, + 0xa7, 0x70, 0xa1, 0x5e, 0x73, 0x8e, 0x8d, 0x7d, 0x39, 0x4a, 0x52, 0x38, + 0xce, 0xa5, 0xaa, 0x61, 0xac, 0x91, 0x40, 0x91, 0x67, 0x33, 0x90, 0x80, + 0x63, 0x97, 0x4b, 0x82, 0x73, 0x7a, 0x2c, 0x3a, 0x97, 0x94, 0x81, 0x50, + 0x9e, 0x74, 0x38, 0x42, 0xa1, 0x7e, 0xa2, 0xcd, 0xa0, 0x5a, 0x81, 0x70, + 0x83, 0x3e, 0x7a, 0xb2, 0x9e, 0x8c, 0xb9, 0xab, 0xc6, 0x7e, 0x9d, 0x46, + 0x88, 0x70, 0x40, 0xa8, 0xab, 0xc4, 0x94, 0xc0, 0x41, 0x8c, 0xbd, 0x44, + 0xba, 0x5c, 0xd0, 0x98, 0xa9, 0xb4, 0x4f, 0x92, 0x2a, 0x49, 0xcb, 0xcd, + 0x6c, 0x83, 0x8e, 0x83, 0x3b, 0xa4, 0xbf, 0xc4, 0xc0, 0xd0, 0x6c, 0x48, + 0xb4, 0x6e, 0xa6, 0x4a, 0x39, 0x44, 0x4c, 0x4c, 0x77, 0x7a, 0x58, 0x49, + 0x67, 0xaa, 0x73, 0xc9, 0x62, 0x9c, 0xb9, 0x3f, 0x35, 0xb8, 0x35, 0x73, + 0xa8, 0x4f, 0x6a, 0x5f, 0x40, 0x88, 0x35, 0xc5, 0xbb, 0x41, 0xb5, 0x3d, + 0x82, 0x6c, 0x56, 0xce, 0x79, 0x5a, 0x50, 0x48, 0x41, 0x44, 0x6f, 0x4f, + 0x37, 0x45, 0xa3, 0x51, 0xbf, 0x6c, 0x91, 0x8a, 0x98, 0x9f, 0xaa, 0x7b, + 0xbb, 0xbf, 0x40, 0x39, 0x8b, 0x89, 0xa2, 0x6d, 0x9c, 0x68, 0x88, 0x8b, + 0x76, 0x61, 0x48, 0x6f, 0xaf, 0x33, 0x48, 0x59, 0xce, 0x89, 0xb7, 0x8c, + 0x64, 0x47, 0xa1, 0x48, 0x48, 0x50, 0x7f, 0xc2, 0xb5, 0xca, 0x87, 0x62, + 0x4d, 0x65, 0x56, 0x64, 0x77, 0x8b, 0xa0, 0x62, 0xae, 0x47, 0xbf, 0xb9, + 0xd1, 0x85, 0x68, 0x51, 0xc2, 0x4e, 0x5f, 0x81, 0x9d, 0xb2, 0x7a, 0xb2, + 0x3f, 0xce, 0xc8, 0xcb, 0x9d, 0x65, 0x4e, 0xc5, 0x6b, 0x40, 0x87, 0x2d, + 0x9a, 0x93, 0xac, 0xb0, 0x85, 0x89, 0x2e, 0xa0, 0x61, 0x63, 0x9a, 0x59, + 0xd2, 0x3c, 0x8b, 0x67, 0x48, 0xbd, 0x63, 0x94, 0x3d, 0x9c, 0x74, 0x93, + 0x36, 0x4a, 0x67, 0x95, 0x98, 0xa3, 0xc3, 0xa4, 0x64, 0xd0, 0x68, 0x94, + 0xd0, 0x63, 0xc5, 0xd2, 0x3d, 0x62, 0x5d, 0xdd, 0x6f, 0x54, 0xac, 0x67, + 0x56, 0x36, 0x3f, 0xa2, 0xb1, 0xcc, 0x3d, 0x3e, 0x51, 0x7b, 0xce, 0xac, + 
0x59, 0xa9, 0x5a, 0x6f, 0x43, 0x91, 0x66, 0x8b, 0x5f, 0xc2, 0x8b, 0x97, + 0x94, 0x4e, 0xa4, 0x9c, 0x70, 0xd8, 0xc7, 0xa5, 0x55, 0x69, 0x30, 0x4c, + 0xaa, 0xc4, 0xa3, 0xd4, 0x9d, 0x82, 0x52, 0x33, 0x8d, 0xa6, 0x66, 0x3f, + 0x3c, 0x95, 0x91, 0x47, 0x5e, 0x62, 0x61, 0x79, 0xa0, 0x6e, 0x6c, 0xbd, + 0x77, 0xab, 0x8d, 0xad, 0x38, 0x70, 0x96, 0x80, 0x5c, 0x48, 0xa7, 0xc0, + 0x78, 0xae, 0xa2, 0x85, 0x84, 0x66, 0x47, 0xa4, 0x39, 0x4e, 0xbf, 0xab, + 0x7d, 0xa3, 0x7e, 0x4c, 0x92, 0x3c, 0x6a, 0x92, 0xa4, 0x7a, 0x52, 0xca, + 0x89, 0x41, 0x96, 0x76, 0xca, 0x3b, 0x40, 0x45, 0x7f, 0x5a, 0x55, 0x62, + 0xa3, 0x3e, 0x75, 0xc6, 0x7e, 0x9f, 0x7d, 0x97, 0x80, 0xc0, 0x4d, 0x42, + 0x3b, 0xd0, 0x92, 0xbd, 0x41, 0x58, 0x8d, 0x41, 0x8d, 0x38, 0xce, 0x98, + 0x37, 0x51, 0xc7, 0x8e, 0x78, 0xc5, 0xab, 0x53, 0x9c, 0xa0, 0xbb, 0xc7, + 0x81, 0x3d, 0xbc, 0x41, 0xc9, 0xd6, 0x80, 0xb8, 0xc8, 0x8d, 0x9d, 0x5a, + 0x35, 0x43, 0x54, 0x48, 0x7c, 0xb0, 0xaf, 0x80, 0x82, 0x62, 0xa2, 0x44, + 0x56, 0xa8, 0x89, 0x96, 0x74, 0x5d, 0x92, 0x72, 0x6e, 0x5e, 0x94, 0x60, + 0x80, 0x93, 0x6f, 0x74, 0x9f, 0xb5, 0xc9, 0x92, 0x88, 0x4a, 0x93, 0xa4, + 0x6e, 0x9a, 0x81, 0xa7, 0x67, 0x35, 0x9c, 0x72, 0xad, 0xb7, 0x88, 0x6c, + 0x80, 0x40, 0x35, 0x38, 0x58, 0x54, 0x58, 0x70, 0x4c, 0x44, 0x9d, 0xc8, + 0x3a, 0xb0, 0xc2, 0x95, 0x39, 0x65, 0x4f, 0xd0, 0xd2, 0x80, 0x98, 0x83, + 0xb7, 0x39, 0xab, 0x48, 0xce, 0x35, 0xaf, 0x35, 0xd5, 0x42, 0x9f, 0xb3, + 0x83, 0x33, 0xbe, 0x79, 0x9d, 0x43, 0x7b, 0x78, 0xce, 0xa9, 0xae, 0x5a, + 0x75, 0xc7, 0x99, 0x9c, 0x6b, 0x67, 0x74, 0x6a, 0xc5, 0xaa, 0xaf, 0x98, + 0x93, 0xd3, 0x41, 0x6e, 0x94, 0x6c, 0x50, 0x47, 0xa9, 0x85, 0xc0, 0x97, + 0x40, 0x72, 0x64, 0x41, 0x41, 0x9a, 0xca, 0xbe, 0xc0, 0xb7, 0x41, 0xbc, + 0x74, 0x9b, 0xb4, 0x35, 0x46, 0x94, 0x7d, 0xd2, 0xcd, 0x69, 0xbb, 0xb1, + 0xd5, 0x90, 0x5f, 0x41, 0x62, 0xc0, 0x6c, 0x4a, 0x64, 0xae, 0x9d, 0x46, + 0x47, 0x4a, 0x46, 0xc0, 0x52, 0xbd, 0x68, 0x8a, 0x85, 0x5a, 0x7b, 0x87, + 0x6d, 0xa1, 0x69, 0x6d, 0xc0, 0x9b, 0xb2, 0xae, 0x72, 0xa0, 0x5b, 0xb5, + 0x4a, 0x80, 0xa7, 0xc5, 0x5e, 0xcf, 0xc0, 0xbe, 0x32, 0x6b, 0x9d, 0x46, + 0x97, 0x54, 0x31, 0x97, 0x73, 0x38, 0x6e, 0x38, 0x66, 0xc3, 0x74, 0x40, + 0xd7, 0xb9, 0x37, 0xa4, 0x90, 0x8d, 0xb3, 0x4a, 0x4f, 0xd0, 0x71, 0x5b, + 0x61, 0xaa, 0x33, 0x95, 0x97, 0xc6, 0xbe, 0xc4, 0x5e, 0x8f, 0x46, 0xbd, + 0x5c, 0xa7, 0x9e, 0x74, 0x6c, 0xd6, 0x41, 0x4f, 0x9b, 0x36, 0x9e, 0x97, + 0x81, 0xb2, 0xb5, 0x7b, 0x3d, 0x49, 0x4e, 0x63, 0x8a, 0x61, 0x36, 0xc5, + 0xc2, 0x51, 0x93, 0x57, 0x45, 0x37, 0x7a, 0x84, 0x35, 0x61, 0x83, 0x47, + 0xd2, 0xa3, 0x8d, 0xb2, 0x7c, 0x3e, 0xa1, 0x5e, 0x81, 0xc9, 0xc8, 0x7e, + 0x31, 0x7b, 0xc5, 0x51, 0x76, 0x42, 0x6d, 0x5a, 0xa8, 0xcd, 0x5e, 0x7f, + 0x88, 0xab, 0xb4, 0x6a, 0x80, 0x68, 0x49, 0x4d, 0x55, 0xbb, 0x51, 0x81, + 0x98, 0x96, 0x53, 0x6e, 0x44, 0xa3, 0x5a, 0x7d, 0x8d, 0x5f, 0x83, 0xb3, + 0xac, 0xa6, 0x73, 0x50, 0xc5, 0x7a, 0xc4, 0x5e, 0x52, 0x6f, 0x67, 0xc2, + 0x8d, 0x3c, 0xad, 0x50, 0x58, 0x37, 0x84, 0x40, 0xd0, 0xc9, 0x68, 0xa9, + 0x3d, 0x9b, 0x6f, 0x5d, 0x44, 0x57, 0x5a, 0x65, 0x8b, 0x76, 0x41, 0xa2, + 0x83, 0x9c, 0x60, 0x3b, 0x65, 0x44, 0x99, 0xd1, 0x59, 0x3e, 0x54, 0xc2, + 0xc7, 0x5c, 0x9f, 0xca, 0x9b, 0x3f, 0x7f, 0xbb, 0x80, 0x82, 0x6e, 0xca, + 0x77, 0xcf, 0x4d, 0x7e, 0x9b, 0xcd, 0x97, 0xc2, 0x37, 0xd1, 0x8d, 0x3c, + 0xcd, 0x8d, 0x46, 0x72, 0xc1, 0xd3, 0x7d, 0xce, 0x41, 0xb7, 0xb2, 0x5e, + 0x7a, 0x3b, 0x69, 0xcb, 0x34, 0x6d, 0x60, 0x97, 0x9f, 0xca, 0x39, 0x48, + 0x51, 0x9c, 0x42, 0xa3, 0x74, 0x66, 0x64, 0x9f, 0xae, 0x33, 0x3a, 0x3b, + 0x6e, 0xb8, 0xca, 0x92, 0x63, 0xc0, 0x42, 0xc5, 0x9f, 0x55, 0x54, 0x5a, + 
0x33, 0x9c, 0x89, 0x3b, 0xa7, 0x63, 0xaf, 0xab, 0xce, 0x44, 0x3d, 0x3f, + 0xcf, 0x3b, 0x76, 0x4d, 0xaf, 0x3d, 0x35, 0xc0, 0xc4, 0xc2, 0x5d, 0x45, + 0xcf, 0xbc, 0xaa, 0x58, 0x69, 0x89, 0x76, 0xab, 0xb1, 0x9d, 0x9f, 0x66, + 0xbd, 0x49, 0x61, 0x6c, 0x44, 0x39, 0x58, 0x8f, 0x60, 0x46, 0x4c, 0x95, + 0x83, 0xc6, 0xc6, 0x69, 0xc5, 0xc8, 0xa3, 0x61, 0xc9, 0x65, 0x79, 0x7c, + 0x8c, 0xc8, 0x99, 0x7e, 0xcd, 0xab, 0x38, 0x41, 0x4b, 0x60, 0xcc, 0x9c, + 0x71, 0x52, 0x59, 0x9c, 0x9d, 0x53, 0x88, 0xc7, 0x44, 0xb6, 0x44, 0x40, + 0x51, 0x44, 0x81, 0x4c, 0xd0, 0x84, 0xaa, 0xbb, 0x8d, 0x6e, 0x9c, 0x66, + 0x32, 0xc1, 0x35, 0xd1, 0x4c, 0xb4, 0xa7, 0x32, 0x83, 0xa4, 0x71, 0x45, + 0xa9, 0x53, 0xac, 0xc8, 0x94, 0x9d, 0x55, 0x7f, 0xcf, 0x5f, 0x51, 0x43, + 0x6d, 0x4d, 0x8e, 0xaf, 0xc7, 0x80, 0x83, 0x33, 0x5c, 0x64, 0x37, 0x43, + 0x9b, 0xc4, 0xaf, 0xc5, 0xa7, 0x47, 0xd0, 0x5c, 0xbb, 0xab, 0x70, 0x72, + 0x56, 0x47, 0x9c, 0x8d, 0x91, 0xc7, 0xc2, 0xcd, 0x6c, 0xad, 0x61, 0x91, + 0x4d, 0x90, 0x69, 0xa6, 0x73, 0x84, 0x5c, 0x54, 0x5f, 0x7b, 0x3b, 0xc0, + 0x87, 0xc0, 0x66, 0xa9, 0xab, 0x5c, 0x68, 0xd5, 0x76, 0x5d, 0x8f, 0x6a, + 0xb0, 0x66, 0x92, 0x8c, 0x3f, 0xb5, 0xb8, 0xbb, 0xbd, 0x52, 0xa3, 0xb1, + 0x71, 0x5e, 0xa5, 0xb5, 0x5c, 0xae, 0x60, 0xab, 0x3b, 0x57, 0x8d, 0xa4, + 0x51, 0xb1, 0x5d, 0x50, 0xa6, 0x74, 0x31, 0x48, 0x76, 0x8d, 0x9e, 0xb3, + 0x7e, 0xb3, 0x62, 0x93, 0x62, 0xcc, 0x48, 0x50, 0x57, 0xca, 0xc9, 0x57, + 0x79, 0xb6, 0x6b, 0x69, 0x9a, 0xd1, 0x7f, 0x8c, 0xc5, 0x5e, 0xcb, 0xa7, + 0x85, 0x3f, 0x95, 0x83, 0x67, 0xc3, 0x88, 0x9f, 0x7c, 0x60, 0x5b, 0x4e, + 0x35, 0x9b, 0xc7, 0x73, 0x90, 0x9b, 0xc3, 0x7d, 0xa9, 0x7f, 0x34, 0x72, + 0x93, 0x7e, 0x36, 0x3a, 0x3f, 0x81, 0x3b, 0x75, 0x67, 0xa9, 0x82, 0xc9, + 0x78, 0x79, 0x74, 0x5e, 0x6f, 0x91, 0x75, 0x36, 0x5d, 0x42, 0x8d, 0x8e, + 0x4b, 0x7c, 0x60, 0xc2, 0xa4, 0x50, 0x3d, 0x8c, 0xc0, 0x5c, 0xb5, 0x49, + 0x58, 0xc7, 0x78, 0x8b, 0x47, 0xcb, 0x44, 0x5a, 0x56, 0x7d, 0x7d, 0xc0, + 0x65, 0x80, 0x9d, 0x83, 0xc0, 0x6e, 0x4a, 0x41, 0x63, 0x6c, 0x4b, 0xc6, + 0x9d, 0xa7, 0xc3, 0x4c, 0x7d, 0x84, 0x6b, 0xc2, 0x7b, 0x44, 0x8e, 0xc7, + 0x5e, 0x70, 0x5e, 0x7e, 0x4f, 0x39, 0x32, 0xbf, 0x41, 0xc9, 0x65, 0xa5, + 0x4d, 0x7f, 0x3c, 0xa0, 0x49, 0xb8, 0x4d, 0x74, 0xb3, 0x59, 0xa2, 0x6a, + 0x33, 0x59, 0x53, 0xae, 0xd0, 0x92, 0x69, 0x74, 0x6c, 0xaf, 0x9b, 0xac, + 0xca, 0x88, 0x5b, 0x85, 0x7f, 0x50, 0x45, 0x5d, 0x79, 0x90, 0x36, 0x66, + 0x82, 0x3f, 0x3f, 0x53, 0xcc, 0xcc, 0x46, 0x8f, 0x9c, 0x97, 0x49, 0xbb, + 0xbd, 0x95, 0x61, 0x90, 0x67, 0x7a, 0x75, 0x4f, 0xcf, 0x35, 0x37, 0xad, + 0x9d, 0xa5, 0x9f, 0x95, 0xa0, 0x50, 0x69, 0xa1, 0x41, 0x58, 0xd7, 0x3e, + 0x3d, 0xd4, 0xb1, 0x49, 0x75, 0xb8, 0xa4, 0x63, 0xa9, 0xb3, 0x48, 0xb5, + 0x71, 0x4e, 0xca, 0xcd, 0x3a, 0x92, 0x75, 0x4b, 0x6a, 0x81, 0x92, 0x93, + 0x41, 0x55, 0x4c, 0xd3, 0xb6, 0x9d, 0x25, 0x95, 0xa8, 0x39, 0x69, 0x35, + 0xb7, 0x3a, 0x85, 0xa1, 0x4d, 0x98, 0x54, 0xba, 0xa6, 0x78, 0xd3, 0x90, + 0xb8, 0x6f, 0xca, 0xd0, 0xbd, 0x9b, 0x92, 0x3c, 0xd5, 0xd2, 0x6c, 0xb1, + 0x5d, 0x3c, 0x8b, 0xba, 0xc7, 0x63, 0x50, 0x89, 0x56, 0xb2, 0xb4, 0x74, + 0x2c, 0x7a, 0x7b, 0x86, 0x4b, 0x95, 0x99, 0xa2, 0x7e, 0x51, 0x94, 0x80, + 0x5b, 0x4a, 0xbd, 0x36, 0x69, 0x7b, 0x7a, 0xab, 0xc6, 0x8d, 0x5a, 0x7f, + 0xa6, 0xae, 0x26, 0x77, 0x73, 0xc4, 0xad, 0xaf, 0x4d, 0x9c, 0x4f, 0x4d, + 0x74, 0x81, 0x62, 0xd1, 0x69, 0xbf, 0x7d, 0x91, 0xc1, 0x46, 0x80, 0xc9, + 0xb4, 0xa2, 0xca, 0xd0, 0x35, 0x21, 0x75, 0x74, 0x9d, 0x74, 0x9d, 0xa2, + 0xad, 0x4a, 0x35, 0x4c, 0xb3, 0x5a, 0x4d, 0x43, 0xc2, 0x65, 0xb6, 0xa6, + 0xbb, 0x77, 0x9a, 0x9d, 0x7a, 0x47, 0xc7, 0x54, 0xa9, 0xc1, 0x9e, 0xb7, + 
0xaf, 0x31, 0x93, 0x2c, 0xcc, 0xd9, 0xbc, 0x50, 0x8b, 0xad, 0xb3, 0x86, + 0x61, 0x45, 0xce, 0x77, 0x3a, 0x7a, 0xc6, 0xd4, 0xce, 0x3d, 0xa2, 0xba, + 0x86, 0x81, 0xc2, 0x78, 0x9f, 0x5b, 0x41, 0xb6, 0x6d, 0x6a, 0xba, 0x95, + 0x9a, 0x73, 0xcb, 0x91, 0x61, 0xb0, 0xb9, 0xce, 0xa8, 0x69, 0x8d, 0x8c, + 0xa7, 0x5b, 0xa8, 0xbe, 0x4d, 0x74, 0x92, 0x41, 0x82, 0x2f, 0xc5, 0xa2, + 0xbd, 0x94, 0xc4, 0xaa, 0xa3, 0x9f, 0x5e, 0xc5, 0xab, 0xad, 0x3b, 0xc6, + 0xc0, 0x85, 0x72, 0xb9, 0x5f, 0xb7, 0x7e, 0xbf, 0xb8, 0x3d, 0x8a, 0xa3, + 0x6c, 0x3d, 0xcb, 0x3b, 0x7b, 0x50, 0x99, 0x86, 0xa4, 0x9a, 0xbf, 0xc5, + 0x2c, 0x66, 0xa1, 0xac, 0xb7, 0x9d, 0x3b, 0xb6, 0x51, 0xc0, 0x6f, 0x8b, + 0x8e, 0x9f, 0xc6, 0x40, 0xb3, 0x58, 0x5d, 0x9e, 0x67, 0x68, 0x2f, 0x88, + 0xb8, 0x8c, 0xba, 0x6a, 0x38, 0x9d, 0x6d, 0xb3, 0xac, 0x7a, 0xa6, 0xcf, + 0xa4, 0x95, 0xc4, 0x41, 0x59, 0x58, 0x6e, 0x35, 0x99, 0x3f, 0xa3, 0x40, + 0xa5, 0xc0, 0x54, 0xa3, 0x9c, 0x46, 0x6a, 0xae, 0xa7, 0x93, 0x4b, 0x87, + 0x96, 0x8d, 0x5c, 0xc9, 0xc3, 0xc0, 0xac, 0x68, 0x56, 0x46, 0x4b, 0x6f, + 0x8a, 0x3f, 0xc1, 0x93, 0x23, 0x86, 0xbc, 0x4b, 0x52, 0x4c, 0x64, 0x4b, + 0x7c, 0x7c, 0x90, 0xd3, 0x69, 0x73, 0x77, 0x44, 0xb7, 0x39, 0xaa, 0x85, + 0x96, 0xc4, 0x97, 0xa1, 0x93, 0xc5, 0x7a, 0x44, 0x4b, 0xd7, 0x5e, 0x94, + 0xa8, 0x65, 0x34, 0x4a, 0x77, 0x7d, 0x89, 0x3f, 0x3f, 0xa6, 0x73, 0x4f, + 0xbb, 0x3b, 0x3c, 0x57, 0xad, 0x74, 0x83, 0xb6, 0xa4, 0x33, 0x52, 0xbb, + 0x9b, 0x31, 0xa0, 0x56, 0x8f, 0x92, 0x52, 0x85, 0x35, 0x37, 0xd2, 0x90, + 0xa8, 0x7b, 0x56, 0x9e, 0x4a, 0xbc, 0xc9, 0xbc, 0x8d, 0xd7, 0xa8, 0xac, + 0x5f, 0x73, 0xc8, 0x9f, 0x6f, 0xb1, 0x85, 0x6f, 0xd1, 0x4b, 0x94, 0x37, + 0x5b, 0x56, 0x65, 0x9d, 0x5b, 0x9e, 0x99, 0x53, 0x3a, 0x29, 0x40, 0xb7, + 0xbd, 0x84, 0xd4, 0x3c, 0x76, 0x53, 0x5b, 0xab, 0x3e, 0x38, 0x64, 0x3d, + 0x73, 0x50, 0x60, 0x37, 0xa1, 0x62, 0x47, 0x6b, 0x49, 0x71, 0x77, 0x6d, + 0x44, 0x51, 0x62, 0x7a, 0x8c, 0xb7, 0x8f, 0x68, 0xbd, 0x8d, 0x94, 0x31, + 0x3d, 0xd6, 0x4f, 0x88, 0xbf, 0x57, 0x2c, 0xae, 0x28, 0x8d, 0xce, 0x9d, + 0x7d, 0x88, 0xd0, 0x72, 0xba, 0xcd, 0x36, 0x5c, 0xaf, 0x4e, 0x7e, 0x65, + 0x63, 0x4e, 0xbc, 0xa2, 0x6b, 0xc5, 0x79, 0xcc, 0xb9, 0xa1, 0x5b, 0xb2, + 0x54, 0x49, 0x7f, 0xc8, 0x5a, 0xbb, 0x72, 0x76, 0xab, 0x50, 0xc0, 0x73, + 0x55, 0xad, 0xbd, 0x59, 0x70, 0xd1, 0x79, 0xba, 0xe1, 0x84, 0x33, 0x9e, + 0x3f, 0x35, 0x73, 0x31, 0x82, 0x5c, 0xaf, 0x4b, 0x5d, 0x5e, 0xac, 0x8f, + 0xc5, 0x7b, 0x56, 0xc0, 0x58, 0xc3, 0x7a, 0x24, 0x35, 0x84, 0x5f, 0x2e, + 0x9a, 0x65, 0x2a, 0xc0, 0x7d, 0x2b, 0x7b, 0xd3, 0xcb, 0x30, 0xb1, 0x70, + 0x61, 0x63, 0x71, 0x25, 0xae, 0x3f, 0xb0, 0xb9, 0x56, 0x3d, 0xac, 0x7b, + 0x96, 0x75, 0x7f, 0x65, 0xc6, 0x7c, 0x60, 0x8e, 0xa0, 0x45, 0x49, 0x71, + 0xb6, 0x1c, 0x2a, 0x7c, 0x68, 0xb7, 0x4f, 0x57, 0x8a, 0x70, 0x83, 0x58, + 0x44, 0x3e, 0x91, 0x7e, 0x81, 0xa2, 0x79, 0x35, 0x6a, 0xbf, 0x48, 0x8d, + 0x72, 0x37, 0xcc, 0x42, 0x52, 0x58, 0xb7, 0xa5, 0xae, 0xd0, 0xc2, 0xa3, + 0xb4, 0x9e, 0xbe, 0x44, 0xa7, 0x6b, 0x7a, 0x9b, 0x47, 0xb2, 0x63, 0x88, + 0x45, 0x8c, 0x86, 0x88, 0x83, 0x65, 0x9f, 0x5b, 0xbe, 0x7e, 0x7e, 0x4e, + 0x92, 0x88, 0x7e, 0x90, 0xd8, 0x3d, 0xad, 0x4b, 0x88, 0x3f, 0x3c, 0xc4, + 0x67, 0xc3, 0x93, 0x4a, 0xcc, 0x84, 0xc8, 0x4d, 0x4d, 0xd1, 0x58, 0x6f, + 0xc0, 0x6a, 0x49, 0x3b, 0x64, 0x39, 0x5b, 0x96, 0xc5, 0x8f, 0x7a, 0x7f, + 0xa0, 0x57, 0x6d, 0x91, 0x6a, 0xc9, 0x98, 0x4c, 0xd4, 0xd1, 0x47, 0x5c, + 0x99, 0x6e, 0x5a, 0x59, 0x7e, 0x7d, 0x60, 0x5f, 0x43, 0xae, 0x97, 0x6e, + 0x53, 0x6e, 0xb6, 0xba, 0x93, 0x4d, 0x3f, 0x85, 0xb7, 0x41, 0x60, 0xca, + 0xc4, 0x2f, 0x52, 0xb7, 0x7e, 0x95, 0x5b, 0x65, 0xa7, 0x85, 0xbb, 0x57, + 
0x5d, 0x68, 0xc1, 0x4c, 0x68, 0xc1, 0x92, 0x7d, 0xb7, 0x81, 0x6a, 0xa5, + 0x41, 0x75, 0xae, 0x6e, 0x32, 0xd6, 0xb1, 0x6e, 0xa4, 0xb4, 0x39, 0xbb, + 0xc3, 0x47, 0xb3, 0xc7, 0xc8, 0xa3, 0x95, 0x78, 0x95, 0x54, 0x26, 0x54, + 0xd2, 0x40, 0x51, 0xcc, 0x8f, 0x2e, 0xd8, 0x94, 0xa5, 0x70, 0x8e, 0xc6, + 0xca, 0x82, 0x34, 0xc7, 0x40, 0x6b, 0x55, 0x58, 0x4b, 0x5e, 0xbc, 0xca, + 0xa4, 0x87, 0x59, 0x73, 0x8b, 0xc6, 0x79, 0x32, 0x62, 0x79, 0x68, 0x43, + 0x78, 0xd6, 0xbb, 0x45, 0x55, 0x7a, 0x64, 0x7f, 0xc2, 0x52, 0x9a, 0x46, + 0x34, 0x5c, 0x55, 0x69, 0x5d, 0x73, 0xa3, 0x82, 0xb0, 0x45, 0x52, 0xbc, + 0x66, 0x5e, 0x82, 0x4a, 0x2d, 0x64, 0xbc, 0x52, 0x73, 0xbb, 0x3e, 0x97, + 0xbd, 0x52, 0x44, 0x5e, 0x50, 0x91, 0xc3, 0xb2, 0x99, 0x53, 0xca, 0xa9, + 0xd1, 0x98, 0xbb, 0x95, 0x3d, 0x83, 0xbe, 0x79, 0x31, 0x29, 0x60, 0xa4, + 0x31, 0x6d, 0xc9, 0x4e, 0x9a, 0x6f, 0x82, 0xd4, 0x66, 0x9a, 0xa6, 0xba, + 0xb0, 0x48, 0x3a, 0x7b, 0x63, 0x62, 0x48, 0xb0, 0xa8, 0x4b, 0xca, 0xbf, + 0xb4, 0xb3, 0x5d, 0x80, 0x8a, 0x93, 0x95, 0x71, 0xbe, 0xb1, 0x7b, 0x77, + 0x7e, 0xa7, 0x4e, 0x84, 0x64, 0x95, 0x60, 0x2e, 0x92, 0x3b, 0x37, 0xb4, + 0xb1, 0x3a, 0x95, 0xc2, 0x6f, 0x95, 0xcd, 0xc7, 0x8d, 0x77, 0x54, 0x64, + 0x8e, 0xc8, 0x6d, 0x98, 0x3e, 0x57, 0x52, 0x4b, 0x69, 0xa6, 0x96, 0x57, + 0x73, 0x39, 0xbd, 0x53, 0x94, 0x9e, 0x79, 0xbf, 0x5d, 0xd2, 0xc8, 0x6d, + 0x82, 0x44, 0xa1, 0x90, 0x81, 0x8d, 0xc0, 0x52, 0x6b, 0x70, 0x4e, 0x74, + 0x4f, 0x3d, 0x5d, 0xbd, 0xa6, 0x62, 0xce, 0x58, 0xd0, 0x9d, 0x2d, 0x5a, + 0x7d, 0xab, 0x74, 0x9b, 0xa1, 0xd3, 0x39, 0x75, 0x38, 0xbb, 0x72, 0x91, + 0xa9, 0x3e, 0x8e, 0x64, 0x75, 0xb9, 0xc0, 0xc9, 0x40, 0x55, 0x54, 0xa8, + 0xba, 0x96, 0x42, 0x82, 0x4e, 0xb7, 0x87, 0xc5, 0x45, 0x7b, 0xaf, 0x63, + 0x43, 0x68, 0x79, 0xd1, 0x99, 0x5e, 0xb1, 0xac, 0x8f, 0x50, 0xc6, 0x3a, + 0x3e, 0x79, 0x5b, 0xb2, 0x80, 0xbb, 0xd4, 0x6d, 0xd3, 0x9c, 0x36, 0xb8, + 0xb6, 0x47, 0x5c, 0x8b, 0xbe, 0xcb, 0x84, 0x6c, 0x7a, 0x9f, 0x61, 0x5a, + 0x69, 0xab, 0x45, 0x38, 0xb1, 0x9f, 0xc5, 0x43, 0xc5, 0xcd, 0x7b, 0xaa, + 0x70, 0x58, 0x71, 0x5d, 0xa1, 0x73, 0xb4, 0x41, 0x40, 0x56, 0x8a, 0x98, + 0x8a, 0xcd, 0x85, 0xd2, 0x8c, 0x4e, 0x54, 0x75, 0x57, 0x5d, 0x49, 0xb5, + 0x9c, 0xbe, 0xab, 0x41, 0x8f, 0x59, 0x4a, 0xd0, 0xaf, 0xb6, 0x70, 0x46, + 0xcf, 0xa4, 0x69, 0xcd, 0xaf, 0xc7, 0x40, 0x68, 0xa4, 0x3f, 0x4f, 0x5d, + 0x62, 0x5d, 0x4d, 0x4e, 0x4e, 0x97, 0x2c, 0x3d, 0x62, 0x49, 0x7c, 0x39, + 0x5c, 0x44, 0x7c, 0x67, 0x5e, 0xba, 0xbd, 0x82, 0xdd, 0x58, 0xce, 0xc6, + 0x33, 0x8b, 0xcb, 0x3f, 0xab, 0x9a, 0x9d, 0xc0, 0x84, 0xbd, 0x8b, 0xba, + 0xa4, 0x3c, 0x45, 0x8c, 0x44, 0x86, 0x72, 0xc0, 0x57, 0xc8, 0xd7, 0x86, + 0xad, 0xc5, 0x8a, 0xa8, 0xc1, 0xcd, 0x7f, 0x94, 0x93, 0x7f, 0x62, 0xc7, + 0x95, 0x35, 0x3d, 0xa1, 0x42, 0xb1, 0x6d, 0x85, 0x73, 0xab, 0x6a, 0x73, + 0xbd, 0xd0, 0x80, 0x4d, 0x3c, 0xa2, 0x77, 0x87, 0xb6, 0x5d, 0x39, 0x87, + 0x9a, 0xd0, 0x9d, 0x40, 0xbd, 0x42, 0x3f, 0xb8, 0x84, 0x3d, 0x7d, 0x9c, + 0xae, 0x77, 0x96, 0x89, 0x9e, 0xca, 0xd1, 0x89, 0xc5, 0x49, 0x8f, 0x74, + 0x65, 0x5e, 0x6c, 0x66, 0xd1, 0x97, 0x9d, 0xc7, 0x9c, 0x91, 0x8b, 0x97, + 0xd1, 0x69, 0xa4, 0x86, 0x51, 0xda, 0xbe, 0xa0, 0xc9, 0xce, 0x7b, 0x77, + 0x8a, 0xba, 0x6b, 0x63, 0x66, 0x7e, 0x59, 0x6b, 0x4a, 0x71, 0x99, 0x46, + 0xce, 0x88, 0x9b, 0x74, 0x7c, 0x97, 0x4e, 0x6d, 0x88, 0x4d, 0x7f, 0x7e, + 0xa5, 0xb3, 0x67, 0x56, 0x4b, 0x66, 0x79, 0x79, 0x3b, 0xc3, 0x7d, 0xac, + 0xa5, 0x4c, 0xdb, 0x69, 0x5d, 0x82, 0xac, 0xab, 0xb1, 0xb7, 0x6b, 0x41, + 0xbb, 0x5a, 0x93, 0x6e, 0xc7, 0xcc, 0xa0, 0x40, 0x98, 0x5b, 0xa6, 0x5e, + 0x49, 0xc5, 0xc5, 0xb2, 0x8d, 0x50, 0xb8, 0xb2, 0x88, 0xab, 0xa4, 0x42, + 
0x54, 0x9b, 0x9b, 0x8e, 0x4a, 0x87, 0xad, 0x6f, 0xab, 0x45, 0x37, 0x46, + 0xa2, 0x97, 0x55, 0x94, 0x57, 0x8f, 0xbb, 0x68, 0x6a, 0x4b, 0xcc, 0x94, + 0x9d, 0x45, 0x9d, 0x4f, 0xcd, 0x80, 0xac, 0xa7, 0xab, 0x90, 0xca, 0xa6, + 0x95, 0x61, 0x9a, 0xaa, 0x5b, 0x84, 0xc4, 0x6a, 0xba, 0xcd, 0x81, 0x5d, + 0x2c, 0x67, 0xaf, 0x96, 0x90, 0x4d, 0x86, 0xc8, 0x9f, 0x95, 0x50, 0xd6, + 0x90, 0xa7, 0xcc, 0x9d, 0x8a, 0xb4, 0x89, 0x58, 0x6f, 0x47, 0x76, 0x66, + 0xb7, 0x9e, 0x50, 0x50, 0x44, 0x8b, 0xdf, 0x66, 0xbb, 0x78, 0x5f, 0x6a, + 0xcb, 0xbf, 0xc3, 0x75, 0xa7, 0x5b, 0xc2, 0xa9, 0x53, 0x8d, 0x44, 0xb2, + 0x9b, 0x4c, 0x4c, 0x87, 0x35, 0x6f, 0xb7, 0xab, 0x3a, 0x31, 0x92, 0x94, + 0xa0, 0xd1, 0x67, 0x56, 0x4e, 0x37, 0x65, 0x4d, 0x82, 0xbb, 0x6b, 0x3a, + 0x52, 0xcb, 0x59, 0x83, 0x6e, 0x62, 0xc2, 0xd5, 0xae, 0x65, 0xa7, 0xd1, + 0x3f, 0xc4, 0x54, 0x68, 0x3f, 0x48, 0x93, 0x38, 0x55, 0x61, 0x8a, 0x6d, + 0xd0, 0x5d, 0x71, 0x43, 0x7e, 0x5c, 0x8d, 0x9e, 0x2c, 0x7c, 0x35, 0x63, + 0xb1, 0xba, 0x58, 0xc5, 0x6e, 0x41, 0x7f, 0xd1, 0x7f, 0x53, 0xb6, 0x69, + 0x3f, 0x83, 0xd1, 0x8a, 0x3d, 0x59, 0xb9, 0xa7, 0x9f, 0x51, 0xb1, 0xc6, + 0x5a, 0xbc, 0x9c, 0x92, 0x97, 0x65, 0xcb, 0x42, 0x4a, 0x7f, 0xac, 0xb3, + 0x59, 0x37, 0x85, 0x50, 0xb1, 0x48, 0x35, 0xc8, 0x97, 0x77, 0x8c, 0x9b, + 0xba, 0x69, 0x4b, 0x46, 0x49, 0x6a, 0x4c, 0x49, 0xa1, 0x77, 0x9d, 0x36, + 0x6d, 0xa8, 0xbd, 0xcd, 0xa5, 0xc9, 0x4f, 0x9a, 0xa1, 0x38, 0x7c, 0xa1, + 0x7e, 0xa6, 0x6c, 0x73, 0x5e, 0x5a, 0x2d, 0xb5, 0x6e, 0xb4, 0x5f, 0x8a, + 0x70, 0x70, 0x9c, 0xd4, 0x34, 0xce, 0x56, 0xb6, 0x9e, 0x4e, 0xa7, 0x99, + 0xbb, 0xb4, 0xc8, 0xb0, 0x35, 0x5c, 0x48, 0x98, 0x93, 0x75, 0xc2, 0x49, + 0x9f, 0xbf, 0x5a, 0x2e, 0xc6, 0x70, 0x68, 0xb6, 0x45, 0xc7, 0x47, 0x34, + 0x9c, 0x9c, 0x6b, 0xd1, 0x74, 0x49, 0x55, 0xb6, 0xcf, 0x8a, 0xd6, 0xc2, + 0xc0, 0x45, 0x3b, 0xa4, 0x68, 0x90, 0x36, 0xa5, 0x39, 0x4c, 0x9b, 0x93, + 0x4f, 0x2e, 0xae, 0xa8, 0xa6, 0x9b, 0xa1, 0x36, 0xa6, 0x50, 0x59, 0x69, + 0x66, 0x6d, 0x92, 0x94, 0x7b, 0x4c, 0x6e, 0x7f, 0x5c, 0xcb, 0x93, 0x55, + 0xc9, 0x38, 0x37, 0x85, 0x88, 0x74, 0x6e, 0xb6, 0x51, 0x42, 0x65, 0x6c, + 0xc8, 0x5e, 0xc0, 0x9e, 0x9b, 0x4f, 0xb0, 0xb3, 0x5b, 0x4b, 0x96, 0x5a, + 0x4d, 0x94, 0x3d, 0x70, 0x4a, 0x32, 0x8e, 0xcf, 0x6b, 0x69, 0x8f, 0xbe, + 0xc3, 0x91, 0x82, 0x6d, 0x89, 0x3d, 0xac, 0x84, 0x70, 0xc9, 0x46, 0x9b, + 0x4a, 0x44, 0x92, 0xba, 0x56, 0x95, 0x29, 0xa3, 0xdb, 0x7f, 0x61, 0x7b, + 0x50, 0x5b, 0xa6, 0x70, 0x80, 0x3e, 0x65, 0xa6, 0x5d, 0x3b, 0x76, 0x70, + 0x80, 0xa2, 0xa6, 0xc4, 0x67, 0x3f, 0x42, 0xa4, 0x57, 0x89, 0, 0x26, + 0xd1, 0xe3, 0x6b, 0xb3, 0x73, 0x96, 0xd4, 0x66, 0x4f, 0xa6, 0x80, 0x76, + 0x9f, 0x5b, 0x92, 0x5e, 0xb8, 0xd9, 0x8b, 0x5b, 0xb1, 0x60, 0xb4, 0x66, + 0xcd, 0xaf, 0xae, 0x4e, 0x8d, 0x6e, 0x52, 0x35, 0x87, 0xb9, 0x59, 0xd7, + 0xa3, 0x6f, 0xca, 0xa7, 0x40, 0xaf, 0x81, 0x8d, 0xd0, 0xae, 0x83, 0x43, + 0xd5, 0x52, 0xcb, 0xdb, 0xa2, 0x4d, 0x80, 0x46, 0x37, 0xcb, 0x4d, 0xc3, + 0x38, 0x97, 0x86, 0x5c, 0x94, 0x54, 0x5e, 0xd1, 0x66, 0x49, 0xc6, 0x5f, + 0x57, 0x78, 0x36, 0x74, 0xa6, 0x78, 0x90, 0x91, 0x37, 0xc9, 0x98, 0xac, + 0x60, 0x88, 0x8d, 0x4e, 0x4b, 0x77, 0x3a, 0xbb, 0xc4, 0x67, 0xa5, 0x3f, + 0xb7, 0xa0, 0xb3, 0xca, 0xc8, 0x7a, 0xbf, 0x5a, 0x97, 0x9a, 0x71, 0x37, + 0x79, 0x2d, 0x42, 0xb7, 0xab, 0x66, 0x4e, 0x83, 0x98, 0xa0, 0x96, 0xd7, + 0x8b, 0x61, 0xa1, 0x47, 0x36, 0x37, 0x6b, 0x4d, 0xd0, 0xae, 0x69, 0xb9, + 0x80, 0xa8, 0x94, 0xa7, 0xa3, 0x3d, 0xba, 0x38, 0x95, 0x8a, 0x85, 0x85, + 0xa7, 0x40, 0xb2, 0xa7, 0x9e, 0xc4, 0x4e, 0x7d, 0x78, 0x9f, 0x5b, 0x87, + 0x5b, 0xac, 0x9b, 0x2e, 0x71, 0xaa, 0x85, 0x45, 0xb3, 0x36, 0xb8, 0xaa, + 0x23, 
0xcb, 0x43, 0x35, 0x98, 0x4d, 0x80, 0x38, 0x2f, 0x48, 0x22, 0x3d, + 0x43, 0xb4, 0xbc, 0x38, 0x35, 0x84, 0x31, 0xd2, 0x47, 0xb9, 0xbb, 0x8a, + 0x9d, 0x58, 0x7b, 0xd0, 0x9a, 0x5c, 0x91, 0x93, 0x88, 0xd3, 0x54, 0x33, + 0x82, 0x60, 0x43, 0x9f, 0x51, 0x44, 0xa7, 0xd9, 0xb9, 0x2b, 0xc8, 0x6f, + 0xce, 0x62, 0x81, 0x6d, 0xb6, 0x5e, 0x3f, 0x7b, 0x66, 0xd5, 0x87, 0x2d, + 0x58, 0x60, 0x66, 0x6e, 0xab, 0x73, 0xa8, 0x93, 0x7a, 0x94, 0xbb, 0xca, + 0x4a, 0xae, 0x78, 0x5f, 0x91, 0xad, 0x7b, 0x77, 0x3a, 0x6d, 0x53, 0xcf, + 0x5e, 0xdf, 0x7f, 0x98, 0x4b, 0x63, 0xb4, 0xb4, 0x91, 0x9b, 0x8c, 0x55, + 0x4f, 0x8a, 0xcb, 0xab, 0x63, 0xbd, 0x79, 0x3f, 0xd6, 0xdb, 0x89, 0xc4, + 0xc1, 0x75, 0x4a, 0x59, 0x28, 0x8a, 0xb9, 0x60, 0x30, 0x2c, 0x33, 0x8f, + 0x65, 0x6b, 0xa3, 0x41, 0xa9, 0xaa, 0x79, 0xc1, 0x70, 0x5a, 0x81, 0xc0, + 0x9d, 0x8b, 0x1c, 0x3f, 0x4c, 0x61, 0xcc, 0x41, 0xc1, 0xa1, 0x8f, 0x95, + 0xcb, 0xbc, 0xa7, 0x4b, 0x74, 0x98, 0x51, 0xb6, 0xa8, 0x66, 0x76, 0xb3, + 0x6e, 0x80, 0x8f, 0x9c, 0xb9, 0x70, 0xce, 0x4e, 0xa6, 0x86, 0xbd, 0x5e, + 0x86, 0xd5, 0xc4, 0x74, 0x3c, 0x92, 0x6a, 0x8a, 0x3e, 0x91, 0xd3, 0xc2, + 0x86, 0xcb, 0xd7, 0x36, 0x34, 0xc8, 0x72, 0xd7, 0x60, 0x59, 0xbd, 0x59, + 0xc2, 0x40, 0x42, 0xa0, 0xc7, 0xe0, 0xe7, 0xa4, 0x52, 0x7d, 0xce, 0xa3, + 0x95, 0xae, 0x59, 0xa5, 0xc7, 0xb4, 0xa4, 0x8c, 0x32, 0x54, 0x91, 0xbf, + 0xbd, 0x74, 0x5b, 0x7b, 0xbf, 0xbf, 0x25, 0x35, 0x3f, 0xc4, 0x3b, 0x5e, + 0x98, 0xb5, 0xc2, 0x85, 0x58, 0x9e, 0xb5, 0x72, 0x6d, 0x4f, 0xad, 0x50, + 0x6b, 0x52, 0x75, 0x4c, 0xb7, 0x95, 0xa4, 0xc0, 0x6f, 0x63, 0x85, 0xdb, + 0x81, 0x5f, 0x5c, 0x51, 0x84, 0x7d, 0x61, 0x71, 0x96, 0xcc, 0xb4, 0xa1, + 0xd6, 0xea, 0xcd, 0x3b, 0x75, 0x8f, 0x2e, 0x5b, 0x8a, 0xb2, 0x86, 0x5b, + 0xab, 0x76, 0x88, 0x3d, 0xde, 0x87, 0xaf, 0x6f, 0xb4, 0x2f, 0xac, 0xa8, + 0x70, 0x51, 0x89, 0x83, 0xa1, 0x45, 0x4d, 0xc4, 0x7f, 0x77, 0xc1, 0xd0, + 0x65, 0xbf, 0x53, 0x66, 0xbd, 0x89, 0x9c, 0x44, 0x61, 0xa5, 0x49, 0x6a, + 0x6e, 0xce, 0xb7, 0x8e, 0x8d, 0x8e, 0x7e, 0x87, 0xb7, 0x69, 0x72, 0x51, + 0xb0, 0x86, 0xab, 0x6b, 0x89, 0x50, 0x66, 0xb3, 0x3b, 0xb5, 0x50, 0x86, + 0x3d, 0x64, 0x5f, 0xc4, 0xe0, 0xb2, 0xcf, 0x81, 0xe0, 0xcc, 0xda, 0xbc, + 0x47, 0x7d, 0x3f, 0x8e, 0x3b, 0xd3, 0xb1, 0x75, 0x8e, 0x6e, 0x89, 0x80, + 0x6d, 0x85, 0x9a, 0x30, 0xb0, 0xcb, 0xac, 0x3a, 0x94, 0x70, 0x77, 0xd3, + 0xcb, 0x4c, 0x57, 0x9f, 0xb9, 0x9f, 0xa3, 0x27, 0xc6, 0xd1, 0x53, 0x73, + 0xd6, 0x55, 0x63, 0x3f, 0x88, 0x88, 0x70, 0x90, 0x4a, 0xa3, 0xb7, 0x3f, + 0x30, 0x97, 0xaf, 0x3d, 0x3d, 0xd4, 0x38, 0x43, 0x5d, 0x45, 0x8d, 0x99, + 0xba, 0x4b, 0x47, 0x4a, 0x51, 0x80, 0x75, 0x84, 0x6d, 0x35, 0x6c, 0x5c, + 0xc5, 0xbe, 0xc9, 0xa8, 0xd3, 0x9e, 0x52, 0xa8, 0xb5, 0xc6, 0xc7, 0xb4, + 0x64, 0x88, 0x91, 0xa3, 0xec, 0xff, 0xbf, 0x24, 0xb6, 0x67, 0xaf, 0x4b, + 0x7a, 0xbc, 0x64, 0x46, 0x93, 0xa7, 0x64, 0xba, 0x72, 0x70, 0x66, 0x51, + 0x96, 0xc7, 0x9e, 0x40, 0xd8, 0xb5, 0xae, 0x39, 0x66, 0xb9, 0x6c, 0xc9, + 0x98, 0xe1, 0x42, 0x8a, 0x55, 0x65, 0xa5, 0x72, 0x7f, 0xb8, 0x83, 0x9b, + 0x45, 0x3b, 0xce, 0xc1, 0xb6, 0x7f, 0x5a, 0x1e, 0x53, 0x6c, 0x6c, 0x60, + 0xdd, 0x4e, 0x45, 0x52, 0x45, 0x56, 0x5e, 0x7d, 0x40, 0xbf, 0x84, 0x56, + 0x7d, 0xa7, 0x4c, 0x53, 0x44, 0xbb, 0x55, 0xb9, 0x6d, 0x23, 0x8e, 0x5a, + 0x96, 0xd6, 0xac, 0xb3, 0x7f, 0xb2, 0xb4, 0x91, 0xb2, 0x98, 0xb5, 0x7c, + 0x58, 0xad, 0xa8, 0x1f, 0x85, 0x9e, 0x86, 0xbb, 0x9e, 0xaa, 0x46, 0x7e, + 0x7c, 0x94, 0xbd, 0xb1, 0x44, 0x45, 0xad, 0xae, 0x52, 0x2c, 0xe1, 0xe1, + 0x96, 0x6e, 0x99, 0xb5, 0xd2, 0x61, 0xa1, 0xb0, 0x7e, 0x3a, 0x86, 0x6d, + 0xcc, 0x69, 0x61, 0x8a, 0x84, 0xd3, 0x24, 0x64, 0x68, 0x5d, 0x9d, 0x77, + 0x59, 
0x9c, 0x89, 0x92, 0x50, 0x52, 0x77, 0x41, 0x87, 0xa9, 0x75, 0x31, + 0xbb, 0x6f, 0x94, 0x79, 0xaa, 0xb1, 0x31, 0x92, 0x64, 0xb0, 0x93, 0x66, + 0x5e, 0x41, 0xd1, 0xa1, 0x1f, 0x8a, 0x85, 0xa7, 0xbb, 0x92, 0x75, 0xd4, + 0x52, 0xb3, 0x4a, 0x7b, 0xbb, 0x52, 0xb2, 0x3d, 0x9f, 0x6a, 0x34, 0x93, + 0x70, 0x6d, 0x96, 0x7c, 0x99, 0x71, 0x60, 0x6e, 0x37, 0xbb, 0x9e, 0x3f, + 0xab, 0x1e, 0x69, 0xb5, 0xd4, 0x76, 0x98, 0xb0, 0x6e, 0xb1, 0xc7, 0x68, + 0x2d, 0xb2, 0x8e, 0xa2, 0x4b, 0x63, 0xcc, 0x64, 0x78, 0x51, 0xb5, 0x88, + 0x2d, 0xc6, 0x43, 0x16, 0x87, 0x55, 0x81, 0x8c, 0x41, 0x4f, 0x97, 0xb2, + 0x2c, 0x63, 0x3a, 0x4f, 0x71, 0xa3, 0xaa, 0x90, 0xb0, 0xb3, 0x35, 0x5e, + 0xd9, 0xdd, 0xbe, 0x96, 0x82, 0x95, 0x77, 0x57, 0x35, 0xac, 0x4f, 0x7d, + 0x4f, 0x34, 0x8a, 0xa7, 0x5a, 0xb4, 0xdf, 0x3d, 0xb6, 0x6e, 0xbd, 0xc1, + 0xa9, 0x6a, 0x68, 0xc6, 0x70, 0x3c, 0xcf, 0xa9, 0xdc, 0x90, 0x7f, 0xa1, + 0x48, 0x5b, 0xbe, 0x7c, 0xb6, 0x6a, 0xbe, 0x54, 0xa9, 0x81, 0xa8, 0x81, + 0xb4, 0x8b, 0x92, 0x51, 0xb1, 0xb4, 0x5a, 0xaf, 0x66, 0x4b, 0x3e, 0xb1, + 0x40, 0x83, 0xba, 0x8d, 0x63, 0xc8, 0x60, 0xae, 0x5e, 0x6e, 0xa9, 0x54, + 0x8e, 0xb3, 0x6b, 0x73, 0xc5, 0x81, 0xac, 0x3b, 0xa5, 0x83, 0x76, 0x90, + 0x44, 0x56, 0xd1, 0xc1, 0x6c, 0xad, 0x41, 0xa1, 0x71, 0xab, 0xc7, 0x62, + 0x37, 0x69, 0x68, 0x8c, 0x26, 0x6e, 0x74, 0xc7, 0xc7, 0x48, 0x39, 0x57, + 0xb0, 0x50, 0x61, 0x41, 0xc8, 0x7f, 0x70, 0xd7, 0xa3, 0xbf, 0x52, 0xa9, + 0x50, 0x34, 0xa6, 0x4e, 0x59, 0x89, 0x26, 0xb6, 0x4d, 0xbc, 0x31, 0xc7, + 0xd8, 0x75, 0x45, 0xca, 0x70, 0x42, 0x30, 0x71, 0xc3, 0xae, 0xb6, 0x65, + 0x38, 0x28, 0x8b, 0x2f, 0x5d, 0x7a, 0x7f, 0x78, 0x36, 0x76, 0x76, 0xc7, + 0xbc, 0x7b, 0x85, 0x49, 0xdd, 0x94, 0x42, 0x53, 0x7a, 0xa2, 0x87, 0x77, + 0xce, 0x44, 0xa3, 0xba, 0x7b, 0x6e, 0xbb, 0x4e, 0x3f, 0x56, 0x72, 0xb3, + 0x44, 0x67, 0x5e, 0x8f, 0x82, 0xad, 0xac, 0x60, 0x91, 0x89, 0xaa, 0x6b, + 0x34, 0x65, 0x8a, 0x50, 0xbc, 0xa7, 0xcc, 0xbf, 0x4a, 0x70, 0xc8, 0x81, + 0xa8, 0x55, 0x5b, 0xa8, 0x5a, 0x69, 0x6b, 0x67, 0xca, 0x85, 0xa7, 0x42, + 0x91, 0x47, 0x6e, 0x94, 0xbc, 0xa5, 0xcd, 0x91, 0x9e, 0x52, 0x2e, 0x8a, + 0x51, 0x41, 0x3d, 0x8b, 0x41, 0x8c, 0xa8, 0x51, 0x6d, 0xcb, 0x6f, 0x6e, + 0x67, 0x7e, 0x69, 0x3a, 0xc8, 0x52, 0x69, 0x35, 0x21, 0x55, 0xb7, 0xbf, + 0xbc, 0x57, 0x3c, 0xdd, 0x51, 0x60, 0xc0, 0xb4, 0xb4, 0x47, 0x74, 0xb7, + 0x87, 0x31, 0xb6, 0x33, 0x4e, 0x68, 0x1e, 0x49, 0xbd, 0xd7, 0x8e, 0x3a, + 0xb4, 0xb1, 0xa8, 0x65, 0xa1, 0xa8, 0x83, 0xd1, 0x87, 0xa5, 0x8e, 0x61, + 0x7a, 0xbe, 0x45, 0xaa, 0x9e, 0x6c, 0x78, 0xb2, 0xaa, 0x69, 0x4f, 0x48, + 0x5a, 0x99, 0xc0, 0xe0, 0xd2, 0xde, 0xb0, 0x99, 0xcd, 0xa2, 0x9f, 0xa4, + 0xb3, 0xac, 0x74, 0xce, 0x3e, 0x99, 0x57, 0xea, 0xb7, 0x76, 0xca, 0x73, + 0xc1, 0xc9, 0x4f, 0x6a, 0xcd, 0xac, 0x42, 0x85, 0xb0, 0xd1, 0x5a, 0xb1, + 0x7b, 0x4c, 0x78, 0x42, 0xaa, 0x4c, 0x73, 0x48, 0xae, 0x46, 0x2c, 0x96, + 0x85, 0x96, 0x57, 0x66, 0x57, 0x9c, 0x66, 0x42, 0xcc, 0x65, 0x3c, 0xd0, + 0x96, 0xd3, 0x5d, 0x9b, 0x73, 0x76, 0xbe, 0x8d, 0x50, 0xc5, 0xdb, 0x76, + 0x41, 0xb5, 0xab, 0xc4, 0x8a, 0x4a, 0x48, 0xd0, 0x43, 0x48, 0xbc, 0x98, + 0x2d, 0x85, 0x8a, 0x9b, 0x46, 0x7a, 0x4c, 0x9e, 0x9c, 0x92, 0xbf, 0x51, + 0x7a, 0x36, 0x76, 0x94, 0x78, 0x44, 0x45, 0x84, 0x91, 0x8d, 0x99, 0x72, + 0x9f, 0x3a, 0x3e, 0x93, 0xb4, 0x21, 0x61, 0xd7, 0xd2, 0xc0, 0x99, 0xb8, + 0x66, 0x5f, 0xab, 0x2e, 0xa7, 0x9b, 0x4d, 0x83, 0x3d, 0x4f, 0x8b, 0x4c, + 0x59, 0x66, 0x3d, 0xc8, 0x8e, 0xd9, 0x7b, 0x8d, 0xa9, 0x76, 0x57, 0x9c, + 0x44, 0x7a, 0xa6, 0xab, 0xd0, 0x8c, 0xad, 0x5c, 0x94, 0x49, 0x43, 0xb8, + 0x6c, 0xbb, 0xb5, 0x6f, 0x75, 0x56, 0xe4, 0xd6, 0x8c, 0xa0, 0x6b, 0x39, + 0x97, 
0x31, 0x46, 0x46, 0xc5, 0xcf, 0x72, 0x4c, 0x98, 0xbc, 0x9d, 0x54, + 0x76, 0x45, 0xa9, 0x96, 0xcb, 0x9b, 0x9f, 0x6f, 0x89, 0x85, 0xbc, 0x66, + 0x88, 0xde, 0x49, 0xa6, 0x2c, 0xbf, 0x80, 0xce, 0x4e, 0xd3, 0xc8, 0xa8, + 0xc2, 0x46, 0x51, 0x66, 0x8c, 0x71, 0x9e, 0x8e, 0xbc, 0xa8, 0xa8, 0x32, + 0xca, 0x9e, 0x4a, 0x7e, 0x89, 0xd3, 0x32, 0x60, 0x26, 0x2e, 0x73, 0x3a, + 0x95, 0x3a, 0x98, 0x65, 0x70, 0xbe, 0xa5, 0x64, 0x4a, 0x7b, 0xb2, 0x32, + 0x68, 0x90, 0x46, 0x30, 0xbc, 0xc0, 0x38, 0xb2, 0x77, 0xcb, 0xb6, 0xa3, + 0x35, 0xd1, 0xa3, 0x93, 0x37, 0x6b, 0x94, 0x95, 0xbc, 0x97, 0x8a, 0xa0, + 0xaa, 0x4f, 0xad, 0x4d, 0xc4, 0xb6, 0x52, 0x85, 0x30, 0xdd, 0x55, 0x8c, + 0xb4, 0x53, 0x7a, 0x47, 0x82, 0x98, 0x98, 0x86, 0x4f, 0x5d, 0x48, 0x93, + 0x2b, 0x1c, 0x86, 0x5e, 0xa8, 0x22, 0x67, 0x3b, 0xe6, 0x42, 0xa0, 0x90, + 0x96, 0x87, 0x3c, 0xa1, 0x4c, 0x3d, 0x3f, 0x8a, 0x6e, 0x89, 0x4a, 0x90, + 0x5e, 0x9f, 0x7e, 0x66, 0x6a, 0x88, 0x98, 0x39, 0x69, 0x83, 0x6d, 0x58, + 0x9d, 0x82, 0xa6, 0xc1, 0xbd, 0x82, 0xb2, 0x45, 0xc7, 0x67, 0x33, 0x31, + 0x8b, 0x48, 0xb5, 0xac, 0x3e, 0x43, 0xc0, 0x9e, 0x50, 0x84, 0x60, 0x57, + 0xba, 0x7f, 0x40, 0x85, 0x90, 0x8e, 0xc4, 0x4f, 0xbf, 0x70, 0x3b, 0x7d, + 0x9a, 0xd1, 0x72, 0xc8, 0xb5, 0x65, 0x98, 0x88, 0x65, 0x62, 0xa4, 0x68, + 0x63, 0xc5, 0x34, 0x97, 0xad, 0x4f, 0x4c, 0xae, 0x91, 0x45, 0x8c, 0x37, + 0x90, 0x3b, 0x88, 0x4a, 0xcf, 0xac, 0x70, 0x9a, 0x82, 0x91, 0x4a, 0x30, + 0x51, 0xb6, 0x34, 0x9a, 0x58, 0x52, 0x5f, 0x5c, 0xc3, 0x7d, 0x3a, 0xc0, + 0xbf, 0x74, 0x9b, 0x8a, 0x36, 0x36, 0x86, 0xc6, 0x8a, 0x69, 0x95, 0x65, + 0x3a, 0x4b, 0x83, 0x7c, 0x86, 0xb4, 0x7d, 0x48, 0xb4, 0x97, 0x80, 0x38, + 0x93, 0x4f, 0xcb, 0x3d, 0x69, 0xb0, 0x97, 0x48, 0x43, 0x54, 0xa0, 0x5d, + 0x33, 0x36, 0xa4, 0x8d, 0x60, 0x97, 0x62, 0x7a, 0x63, 0x57, 0xc9, 0xc1, + 0x7f, 0x74, 0x90, 0xb9, 0xc3, 0x4a, 0xcb, 0x60, 0x70, 0x9e, 0x48, 0x3e, + 0x98, 0xb0, 0x59, 0x58, 0x31, 0x7c, 0x88, 0xc2, 0x6f, 0x9a, 0x3a, 0xa4, + 0xa0, 0x81, 0x7f, 0x63, 0x68, 0x85, 0x9a, 0xa0, 0xd2, 0x9d, 0x44, 0xb6, + 0x36, 0x6d, 0x9a, 0xc4, 0x9c, 0xb5, 0x4f, 0x4b, 0xd4, 0xc9, 0x60, 0x56, + 0x3c, 0x8e, 0x32, 0xa4, 0x56, 0x4b, 0x43, 0x43, 0x96, 0xa5, 0x40, 0xc6, + 0xb2, 0x49, 0x53, 0xb2, 0x72, 0xbe, 0xbe, 0x92, 0x75, 0x52, 0x54, 0x5a, + 0xa0, 0x9d, 0x56, 0xae, 0x95, 0xc7, 0x67, 0xb9, 0xc1, 0x79, 0x78, 0xb0, + 0xc2, 0x53, 0xb6, 0x92, 0x3e, 0x76, 0x59, 0x3e, 0x82, 0x96, 0x94, 0xbf, + 0x37, 0x45, 0x3c, 0x3e, 0xad, 0x5c, 0x44, 0x40, 0x93, 0x37, 0x4e, 0xcb, + 0x9f, 0xa7, 0x62, 0x46, 0x85, 0xa4, 0x9e, 0xb6, 0xcc, 0xaa, 0xc0, 0xaa, + 0x8b, 0x7a, 0xae, 0x6a, 0x8f, 0xd0, 0x7b, 0x53, 0xca, 0xb6, 0xa7, 0x41, + 0xbd, 0x71, 0xba, 0x57, 0x32, 0xa1, 0xa3, 0x30, 0xc4, 0x33, 0x7d, 0xbe, + 0x54, 0x59, 0x31, 0xb6, 0xa8, 0xd1, 0x61, 0x5a, 0xb2, 0x96, 0xc3, 0x50, + 0xb5, 0x37, 0xc0, 0x57, 0x96, 0x58, 0x52, 0xb7, 0x87, 0x79, 0xac, 0xa3, + 0xa0, 0xac, 0x8a, 0x9c, 0x8e, 0x42, 0x88, 0x7c, 0xb1, 0xbd, 0x8b, 0x65, + 0x43, 0x38, 0x90, 0x56, 0x72, 0xa3, 0x5f, 0x5e, 0x7c, 0x48, 0x84, 0x63, + 0x5e, 0xcb, 0xbc, 0x4e, 0xb4, 0x4c, 0x87, 0x4e, 0x93, 0xac, 0x65, 0x47, + 0xcf, 0xcc, 0x58, 0x99, 0x7a, 0x5b, 0x87, 0x6d, 0xb9, 0xab, 0x31, 0x33, + 0x59, 0x43, 0x57, 0x9f, 0xbe, 0xcc, 0x79, 0xa9, 0x8b, 0x6e, 0x63, 0x66, + 0x3b, 0x98, 0xbd, 0xbe, 0x36, 0xcd, 0x32, 0x57, 0x4e, 0x37, 0xab, 0x8b, + 0x3e, 0xb6, 0x4d, 0x83, 0xaa, 0x3b, 0x77, 0x97, 0x5a, 0x6b, 0x35, 0x87, + 0x70, 0x92, 0x7f, 0xc7, 0xcb, 0xbf, 0x86, 0x7d, 0x99, 0x60, 0x9c, 0xc5, + 0xcd, 0xab, 0x4e, 0x36, 0x4e, 0x7d, 0x6b, 0x3e, 0x3e, 0x94, 0x56, 0xb5, + 0xb2, 0xa0, 0x3e, 0xc7, 0x9c, 0x60, 0x3d, 0x9d, 0x58, 0x96, 0xbe, 0xc9, + 0xca, 
0x98, 0xa0, 0x3b, 0x72, 0x84, 0x42, 0x8a, 0x5c, 0x90, 0x78, 0xb9, + 0x99, 0x6e, 0x52, 0x57, 0x49, 0xbd, 0x8b, 0x5b, 0x92, 0xcd, 0x44, 0xc8, + 0x95, 0xab, 0x6e, 0x45, 0xce, 0xce, 0x90, 0x77, 0x9f, 0xb8, 0xba, 0x37, + 0x9e, 0x7b, 0xb4, 0xa4, 0x9a, 0x47, 0x9a, 0x8d, 0xa4, 0xa7, 0xd0, 0x5a, + 0x63, 0x41, 0x8d, 0x66, 0x43, 0x54, 0x9f, 0x9f, 0x7f, 0x45, 0xb6, 0x6f, + 0x4e, 0x61, 0xa2, 0xac, 0x53, 0x99, 0x59, 0xc7, 0x7e, 0x58, 0x76, 0x4e, + 0xb1, 0x5b, 0x69, 0x54, 0x3c, 0x8a, 0x31, 0xce, 0x9b, 0x3c, 0x34, 0x3e, + 0x89, 0x36, 0xaf, 0x79, 0x33, 0xac, 0x57, 0xaa, 0xc7, 0x9a, 0x38, 0xa3, + 0x4f, 0x78, 0xb5, 0x3c, 0x4d, 0x85, 0x64, 0x59, 0x70, 0x96, 0x39, 0x5a, + 0x87, 0x77, 0x3b, 0xa2, 0x86, 0x38, 0x7d, 0x43, 0xae, 0x7b, 0x8e, 0x4c, + 0xc5, 0xa0, 0xcb, 0x6f, 0x61, 0x9d, 0x99, 0x85, 0x7b, 0x93, 0x48, 0x63, + 0xa1, 0x9d, 0x9f, 0x9e, 0xad, 0x51, 0x4e, 0x42, 0x4c, 0x53, 0x85, 0xc5, + 0x57, 0x48, 0x83, 0x8b, 0x88, 0xae, 0x73, 0x6b, 0x74, 0x5a, 0x73, 0x93, + 0x6d, 0x78, 0x6d, 0xcd, 0x3f, 0x86, 0x95, 0x3f, 0x9a, 0x5d, 0x88, 0xd2, + 0x6e, 0x8b, 0x45, 0x75, 0x77, 0x9f, 0x39, 0xc8, 0x45, 0xc2, 0x97, 0x58, + 0xbb, 0x9c, 0xc0, 0x63, 0xa8, 0x3a, 0xd1, 0x9d, 0x95, 0xb2, 0x6b, 0x7c, + 0xc8, 0x7a, 0x5d, 0x42, 0x54, 0x33, 0x91, 0xbd, 0x72, 0x48, 0xbd, 0xa2, + 0x3c, 0x6c, 0xa9, 0xce, 0xc4, 0x5a, 0x94, 0x88, 0x6f, 0x48, 0xb2, 0x6d, + 0x99, 0x42, 0x3e, 0x60, 0x5d, 0x6e, 0x5d, 0x78, 0x59, 0x93, 0xae, 0x6b, + 0x3a, 0xb6, 0x41, 0x65, 0xcb, 0xbe, 0x59, 0x80, 0x65, 0x51, 0x54, 0x6c, + 0xbc, 0x57, 0xc2, 0xca, 0x39, 0xb9, 0x6f, 0x68, 0x37, 0x7d, 0x54, 0x98, + 0x8f, 0xc3, 0xa3, 0x6a, 0x78, 0xd0, 0xca, 0x78, 0x94, 0x58, 0x88, 0x3f, + 0xcd, 0x6b, 0xa9, 0xcb, 0x66, 0x69, 0x96, 0x61, 0x31, 0x8a, 0x82, 0x50, + 0x5b, 0xae, 0x50, 0x7f, 0x61, 0xba, 0x5f, 0xc2, 0x34, 0x45, 0x7a, 0x4d, + 0x31, 0x9a, 0x63, 0x6a, 0x8e, 0x7a, 0x6a, 0x2f, 0xd0, 0x99, 0xae, 0x7b, + 0xcd, 0x99, 0x64, 0x59, 0xaf, 0x4f, 0x59, 0x47, 0xc7, 0x77, 0xb8, 0xd3, + 0x33, 0x81, 0x97, 0xa8, 0xc8, 0x80, 0xb0, 0x5a, 0xbb, 0x96, 0xa9, 0x7f, + 0x56, 0x97, 0x70, 0x49, 0xc4, 0x6a, 0x7e, 0xad, 0x52, 0x9f, 0xbb, 0xbd, + 0x38, 0x36, 0x5b, 0x3e, 0x4a, 0xc1, 0x59, 0x8d, 0xa0, 0x7b, 0xc4, 0xa1, + 0x33, 0xaa, 0x8a, 0x91, 0x4a, 0xc6, 0xbc, 0xab, 0xc1, 0xce, 0xab, 0x3e, + 0xa0, 0xce, 0xbb, 0xca, 0xa2, 0x9b, 0xb0, 0xb4, 0x86, 0x72, 0x3b, 0x68, + 0x3b, 0xac, 0xa1, 0x3f, 0xa4, 0x59, 0x59, 0xa1, 0xc6, 0x48, 0xa2, 0x8a, + 0x90, 0xb4, 0x92, 0x69, 0xc7, 0x34, 0xab, 0x7f, 0xa7, 0x63, 0xc2, 0x7b, + 0xa0, 0xb6, 0x7d, 0xaa, 0xbf, 0x48, 0x3c, 0x98, 0x48, 0x66, 0xc6, 0x58, + 0x39, 0x33, 0xc9, 0x34, 0xac, 0xc5, 0x6d, 0xb7, 0xb3, 0x9e, 0x5b, 0xc2, + 0x7d, 0x4d, 0x39, 0xc1, 0xa1, 0xb0, 0xa7, 0xbf, 0xc2, 0x35, 0x4e, 0x47, + 0xc4, 0x3a, 0x76, 0x5e, 0x51, 0x90, 0xbd, 0x9c, 0x34, 0x54, 0x8f, 0xa7, + 0x67, 0x86, 0xd3, 0x8f, 0x8c, 0xcc, 0x3b, 0xba, 0x89, 0x32, 0x82, 0x32, + 0x65, 0x4d, 0x77, 0x84, 0xb7, 0x9f, 0x3a, 0x3b, 0x36, 0xcf, 0xa1, 0x75, + 0x52, 0x5b, 0x85, 0x58, 0xad, 0x47, 0xb7, 0x48, 0x52, 0x40, 0xc9, 0x4c, + 0xc3, 0x7a, 0x45, 0x2e, 0x7a, 0x52, 0x7a, 0xca, 0xd0, 0x60, 0x59, 0xbd, + 0x87, 0x61, 0x5d, 0xbb, 0xaf, 0x92, 0xc1, 0xa7, 0xb3, 0xdc, 0xb9, 0x92, + 0xc6, 0x5c, 0xcd, 0x46, 0x5a, 0x4f, 0x97, 0x31, 0x53, 0xa8, 0x36, 0xa0, + 0x7b, 0x72, 0xa5, 0xd8, 0xb6, 0x70, 0x8e, 0x9f, 0x46, 0x7d, 0xb6, 0x67, + 0xa4, 0xc9, 0xae, 0x6a, 0x58, 0x3b, 0xb6, 0xcc, 0x3e, 0x77, 0xc0, 0x66, + 0x95, 0x80, 0xa9, 0x60, 0x56, 0xb8, 0x48, 0x74, 0x99, 0x58, 0xab, 0x65, + 0x46, 0xc4, 0x4f, 0x55, 0xa1, 0x4d, 0x85, 0x95, 0x4f, 0xac, 0x9a, 0x7a, + 0xa0, 0xd2, 0x54, 0x44, 0xb4, 0xd4, 0x51, 0x46, 0x6f, 0xab, 0x6b, 0x9c, + 0x7f, 
0x63, 0xa4, 0x58, 0x53, 0xc2, 0x7a, 0xad, 0x34, 0xac, 0xc5, 0x9a, + 0xcd, 0x73, 0x61, 0x75, 0x37, 0x77, 0x70, 0x31, 0x48, 0x49, 0xc9, 0x7b, + 0x6c, 0x9f, 0xcc, 0xcd, 0x88, 0x3b, 0x62, 0x58, 0xb9, 0xc5, 0x77, 0x46, + 0x96, 0x59, 0x4e, 0x3c, 0x3a, 0x7e, 0x9c, 0x63, 0x9f, 0xa5, 0x4a, 0xb9, + 0xbe, 0xd1, 0x84, 0x76, 0x81, 0xb6, 0x52, 0x35, 0xb6, 0xcd, 0xbc, 0x96, + 0xaf, 0x58, 0x83, 0x42, 0x38, 0xca, 0xd6, 0x99, 0x3d, 0xae, 0x3f, 0xa8, + 0xd1, 0x39, 0x58, 0x6a, 0x8b, 0x92, 0x70, 0xae, 0x88, 0x91, 0x81, 0x49, + 0x6a, 0xc4, 0x72, 0xc6, 0xd2, 0x71, 0x6f, 0x55, 0x6a, 0x47, 0xca, 0x57, + 0xa0, 0x58, 0x2d, 0x5c, 0x4d, 0x8b, 0x61, 0x68, 0x60, 0x8d, 0x82, 0x6d, + 0xaf, 0x7d, 0xb1, 0xa8, 0xc4, 0x6e, 0xcc, 0xbc, 0x50, 0x42, 0x63, 0x4e, + 0xd0, 0x88, 0x75, 0x3d, 0x67, 0xa4, 0xa0, 0xdf, 0x33, 0xb8, 0xc1, 0x53, + 0x53, 0xc3, 0xa0, 0xa6, 0x9c, 0xb2, 0xa0, 0x78, 0x53, 0x69, 0x73, 0xc9, + 0x3d, 0x90, 0x89, 0x5f, 0x99, 0xb1, 0x7f, 0x9d, 0xbf, 0xb0, 0xd3, 0x48, + 0x28, 0x66, 0xd4, 0xb0, 0x5b, 0x4a, 0x92, 0xae, 0xc3, 0x58, 0x65, 0x54, + 0x8c, 0x35, 0x56, 0x4c, 0x86, 0x6f, 0x5a, 0x87, 0x84, 0x61, 0xb1, 0x7d, + 0x4e, 0xa6, 0x4c, 0xac, 0xaf, 0x99, 0xb0, 0xa0, 0xb9, 0xcd, 0xc3, 0x8d, + 0xb1, 0x34, 0xbf, 0x7a, 0xa0, 0x71, 0x2e, 0xa4, 0x74, 0xad, 0x5c, 0x3d, + 0x5f, 0xa1, 0x4e, 0x5a, 0xd0, 0xab, 0x3b, 0x65, 0xbc, 0xd0, 0xd8, 0xa5, + 0x90, 0x95, 0x85, 0x73, 0x98, 0x3c, 0x84, 0x47, 0x90, 0x59, 0x66, 0xb4, + 0x39, 0x86, 0x33, 0xa6, 0x39, 0xd5, 0x86, 0x70, 0x6f, 0x5b, 0x97, 0x75, + 0x4f, 0x82, 0x90, 0xcb, 0xc2, 0x37, 0xce, 0x5f, 0x36, 0x7e, 0x4a, 0xb8, + 0xc0, 0x3a, 0xc2, 0x67, 0xb2, 0xcc, 0xc0, 0x79, 0xaf, 0xb5, 0xc6, 0x55, + 0x6c, 0xc2, 0x65, 0xb4, 0x71, 0x72, 0xc7, 0x86, 0x91, 0x48, 0x56, 0xa1, + 0x65, 0x59, 0xa7, 0x98, 0xd1, 0xb5, 0xbb, 0x9c, 0xa2, 0x95, 0x8a, 0x54, + 0xd4, 0x8e, 0x7a, 0x72, 0x71, 0x3c, 0xb7, 0xc2, 0x5f, 0xb7, 0x7d, 0x5d, + 0x68, 0xd1, 0xc5, 0x80, 0x69, 0x5f, 0xce, 0xb7, 0x52, 0x5c, 0x38, 0x8e, + 0xc8, 0x4a, 0x4d, 0x6a, 0x75, 0x3f, 0x9c, 0x74, 0x88, 0xa2, 0x37, 0x6b, + 0xbb, 0x6b, 0x8d, 0xb8, 0x95, 0x2a, 0xde, 0x5e, 0x65, 0xcc, 0x60, 0x47, + 0x8e, 0xb7, 0x9d, 0x89, 0xb1, 0xad, 0x41, 0xb7, 0x57, 0xd2, 0xb5, 0x5a, + 0x80, 0x66, 0x75, 0x77, 0x3d, 0x4d, 0xc6, 0xce, 0xb0, 0x9b, 0x3c, 0xa3, + 0xa9, 0x77, 0x4a, 0x45, 0x37, 0xb7, 0x9b, 0xc3, 0x7f, 0x97, 0x87, 0x6b, + 0x49, 0x81, 0x94, 0x2f, 0xa4, 0x87, 0x6f, 0xaa, 0xd8, 0x50, 0x45, 0xae, + 0x7f, 0x78, 0xcc, 0x87, 0xb8, 0x58, 0x73, 0x8c, 0x5f, 0xb1, 0x62, 0x41, + 0xb3, 0x6a, 0xab, 0xb5, 0xb7, 0xa1, 0xc1, 0xa7, 0x98, 0xba, 0x64, 0xcc, + 0xb2, 0x5d, 0xb3, 0x7f, 0x65, 0x67, 0x7a, 0xc6, 0xa3, 0x62, 0x86, 0x38, + 0xd1, 0x87, 0x39, 0x81, 0x75, 0x86, 0x5c, 0x3e, 0x75, 0x57, 0x7c, 0x30, + 0xd6, 0x5a, 0xa7, 0xcb, 0x95, 0xaf, 0x6a, 0x4d, 0x67, 0xb2, 0xc4, 0xa9, + 0x81, 0x63, 0xc6, 0x90, 0x7f, 0x81, 0x41, 0x4e, 0x70, 0x43, 0xba, 0x43, + 0x98, 0xd3, 0xc3, 0xa4, 0x8f, 0xcd, 0xad, 0xa4, 0x55, 0x6b, 0x79, 0xae, + 0x63, 0xb3, 0xa9, 0x60, 0x47, 0xcc, 0x73, 0xbe, 0xbf, 0x51, 0x37, 0x44, + 0xa1, 0x72, 0xd3, 0xb7, 0xa0, 0x5d, 0xb4, 0x3a, 0x83, 0xd1, 0x67, 0x7b, + 0x79, 0x55, 0x8d, 0x9b, 0x52, 0x4a, 0xcc, 0x64, 0x7e, 0x90, 0x76, 0x53, + 0x56, 0x40, 0xcf, 0x5e, 0xb2, 0x97, 0x6e, 0x3d, 0x6c, 0x3a, 0x3b, 0x68, + 0x69, 0x65, 0x6a, 0xa4, 0x32, 0xc5, 0x70, 0x71, 0x9a, 0x68, 0x5f, 0xcb, + 0xa4, 0x7c, 0x88, 0x9c, 0x95, 0x31, 0x37, 0x48, 0x7b, 0xcf, 0xb2, 0x73, + 0x92, 0x87, 0xbe, 0x6b, 0x2f, 0x39, 0x60, 0x8c, 0x62, 0xb5, 0x86, 0xb8, + 0x76, 0x96, 0xbc, 0x77, 0x2d, 0x57, 0x70, 0x6e, 0x3b, 0x7e, 0x86, 0x74, + 0x5c, 0x75, 0xaa, 0x77, 0x8e, 0xaf, 0x57, 0x4c, 0x87, 0x83, 0xb4, 0x58, + 0x4a, 
0x88, 0xd0, 0x95, 0x49, 0x5c, 0xc5, 0xbb, 0x63, 0x3f, 0xcb, 0xa9, + 0x72, 0x5f, 0xbb, 0x69, 0xc4, 0x90, 0xb1, 0x87, 0xa7, 0x88, 0x3a, 0x85, + 0x50, 0x6d, 0x7f, 0x9f, 0x67, 0x53, 0xa7, 0xc5, 0x47, 0x38, 0xcc, 0xa6, + 0x80, 0x7f, 0xce, 0x91, 0x9e, 0x83, 0xcb, 0x84, 0x6f, 0xce, 0x75, 0xc3, + 0xc0, 0x59, 0xb6, 0x87, 0xae, 0x73, 0xc5, 0x3d, 0x81, 0x79, 0xb1, 0xaa, + 0x7d, 0xa7, 0x65, 0x96, 0x5c, 0x90, 0x59, 0x5e, 0x33, 0x66, 0xd0, 0x92, + 0xbb, 0x86, 0x44, 0xc6, 0xc9, 0x39, 0xb5, 0x82, 0xa8, 0xc1, 0x6c, 0xb2, + 0x48, 0x39, 0x35, 0x5f, 0x38, 0x62, 0x81, 0x82, 0x68, 0x80, 0x4a, 0xc1, + 0x4e, 0x4a, 0xb7, 0x8d, 0x3a, 0x78, 0x56, 0xae, 0x6e, 0xbe, 0x4d, 0xc7, + 0xd6, 0xb6, 0x65, 0xb7, 0x6a, 0xab, 0x85, 0xd1, 0xad, 0x61, 0x93, 0x9e, + 0x92, 0x91, 0xc9, 0x9a, 0x97, 0x52, 0x4e, 0xb5, 0x88, 0x61, 0x71, 0x65, + 0x86, 0xbb, 0x3a, 0x4e, 0xb0, 0x5a, 0x2a, 0x3b, 0x6d, 0x82, 0x63, 0xd0, + 0x78, 0x52, 0x54, 0x85, 0xac, 0x88, 0x1b, 0x9e, 0x71, 0xc4, 0x50, 0x83, + 0x7d, 0xa3, 0x97, 0x8f, 0x58, 0xb2, 0x69, 0x49, 0x4f, 0x89, 0xa4, 0x37, + 0x66, 0x76, 0xd0, 0x52, 0xca, 0x9e, 0x3e, 0x52, 0x46, 0x6a, 0xa0, 0x6b, + 0xb3, 0x34, 0xbb, 0x84, 0x3b, 0x32, 0x9e, 0x9a, 0xc9, 0x4e, 0x60, 0x75, + 0xac, 0x47, 0x84, 0x20, 0x67, 0x82, 0xac, 0x93, 0xcb, 0x59, 0x4d, 0x79, + 0xbe, 0x84, 0xc2, 0xab, 0xb3, 0x3c, 0x64, 0x31, 0x91, 0xc1, 0xa4, 0xad, + 0x3b, 0x6d, 0xd5, 0xcb, 0x57, 0xb2, 0x59, 0xd2, 0xc7, 0x48, 0x5b, 0x66, + 0x96, 0x41, 0x4b, 0xa1, 0x48, 0x78, 0x8f, 0xba, 0x97, 0x59, 0x67, 0xb1, + 0x46, 0x90, 0xc2, 0x9e, 0xbe, 0xb7, 0xcd, 0x6d, 0x9d, 0x6d, 0x4b, 0xd2, + 0x3f, 0x5e, 0x8f, 0xc2, 0xc4, 0x71, 0xd5, 0xd3, 0xb0, 0xbb, 0x98, 0x57, + 0x42, 0xb6, 0xb6, 0x5d, 0x49, 0x70, 0x62, 0x46, 0x9a, 0xb2, 0x8f, 0x85, + 0x7e, 0x94, 0xcc, 0xa0, 0x49, 0x90, 0xe3, 0xbc, 0x58, 0x7b, 0x40, 0x8d, + 0x46, 0x78, 0xb1, 0x64, 0x86, 0xab, 0x9f, 0xa0, 0xa1, 0xd1, 0x84, 0x51, + 0x50, 0x6f, 0x5b, 0x4d, 0xa0, 0xa4, 0x78, 0x6c, 0x8d, 0x6e, 0x73, 0x3f, + 0x85, 0x7b, 0xb5, 0xa6, 0x86, 0xc5, 0x63, 0x4d, 0x2a, 0xaa, 0x69, 0x36, + 0xbc, 0x2f, 0xad, 0x98, 0x5f, 0xb1, 0x65, 0x7a, 0xbe, 0x32, 0xa6, 0x75, + 0x7f, 0x89, 0x8d, 0x60, 0x30, 0xa0, 0x39, 0x60, 0x51, 0x4f, 0x4b, 0xd3, + 0xc4, 0xb0, 0xe3, 0x3d, 0x37, 0x67, 0x82, 0x98, 0x68, 0x38, 0xae, 0xa7, + 0x47, 0xa6, 0x97, 0x66, 0xc2, 0x48, 0x3b, 0x76, 0xc6, 0x74, 0xce, 0xc3, + 0xa0, 0xbb, 0xb2, 0x98, 0x88, 0xa4, 0x8a, 0x3e, 0x99, 0x3e, 0xaf, 0xbc, + 0x93, 0x3d, 0x5d, 0x77, 0xa3, 0x60, 0xa3, 0x5d, 0xa5, 0xa5, 0x76, 0x4b, + 0x5d, 0xb7, 0x98, 0x4b, 0xb8, 0xbf, 0x63, 0x91, 0x36, 0x63, 0x43, 0x94, + 0x6d, 0xce, 0xc1, 0xbb, 0xb0, 0xa7, 0x6e, 0x40, 0x41, 0x9a, 0x71, 0x7d, + 0x3d, 0x6f, 0x8b, 0x46, 0xe2, 0xd5, 0x92, 0x59, 0xb6, 0xb5, 0x9a, 0xb6, + 0x6d, 0xb0, 0xc6, 0x7b, 0x40, 0x24, 0x92, 0x2c, 0x51, 0x79, 0x5e, 0x36, + 0x61, 0x5d, 0x82, 0x4a, 0x9a, 0x60, 0x5a, 0xb7, 0xa7, 0x9f, 0x61, 0xc2, + 0xb5, 0xc3, 0x78, 0xce, 0x59, 0xb3, 0xa2, 0x9f, 0x83, 0xa2, 0x8c, 0x93, + 0xc3, 0x9b, 0x99, 0x44, 0x9f, 0x34, 0xad, 0x71, 0x33, 0x47, 0x75, 0xaf, + 0x7d, 0x75, 0x92, 0x32, 0x8f, 0xac, 0x6b, 0x82, 0xcb, 0x5a, 0xba, 0x8b, + 0xcb, 0x52, 0x3f, 0x3e, 0xb1, 0xb4, 0x6a, 0x78, 0xb7, 0xc1, 0x68, 0x98, + 0x5b, 0xdd, 0xcd, 0x86, 0x7c, 0xa1, 0xb8, 0x2e, 0x31, 0x3d, 0x40, 0x65, + 0xac, 0xbe, 0x84, 0x88, 0xc9, 0x88, 0x68, 0x65, 0x60, 0x9d, 0xb2, 0x90, + 0x64, 0x61, 0xbf, 0x8d, 0xcc, 0x4f, 0xc1, 0xd1, 0x64, 0x42, 0xba, 0x42, + 0x6a, 0x65, 0x57, 0x35, 0x54, 0x6e, 0xd3, 0x97, 0xad, 0x70, 0x6f, 0x86, + 0x8d, 0x8c, 0x47, 0xcd, 0xb8, 0x30, 0x52, 0x46, 0x3f, 0x77, 0xb3, 0x76, + 0x63, 0x81, 0x7d, 0x7d, 0x38, 0xa7, 0x95, 0x8e, 0x4c, 0x69, 0x80, 0x9c, + 0xb9, 
0x5f, 0xb1, 0xb1, 0xe1, 0x58, 0x56, 0x51, 0xab, 0xb9, 0x9b, 0x97, + 0x9b, 0x3d, 0x59, 0x8e, 0x9b, 0xd8, 0xaa, 0x3c, 0x82, 0xa6, 0x68, 0xa8, + 0x89, 0x8f, 0xa4, 0x6c, 0x6a, 0xd4, 0x9c, 0x6f, 0x65, 0xd7, 0xe3, 0xc4, + 0x50, 0x50, 0x7f, 0xa7, 0x7d, 0xa1, 0xc1, 0x61, 0x84, 0xbf, 0x55, 0x89, + 0x5f, 0x79, 0x48, 0xca, 0x6b, 0x9e, 0x7b, 0xab, 0x73, 0x40, 0xc1, 0x8e, + 0x55, 0x4a, 0x80, 0xa3, 0x4e, 0x56, 0xba, 0x53, 0x75, 0x39, 0x38, 0x4e, + 0x4d, 0x93, 0xaf, 0x38, 0xd1, 0x5b, 0xaf, 0xb4, 0x96, 0x95, 0x6e, 0xc2, + 0xa5, 0xa0, 0x59, 0x8d, 0x98, 0x8b, 0xa6, 0x77, 0xa8, 0x69, 0x77, 0x97, + 0x4c, 0x75, 0xaf, 0x32, 0xb3, 0x90, 0x9a, 0xd2, 0x47, 0xaf, 0x53, 0xaf, + 0xac, 0x44, 0xc1, 0x59, 0x8c, 0x6c, 0x46, 0x60, 0x45, 0xb3, 0x99, 0xc7, + 0xb1, 0xbe, 0xb2, 0x6a, 0xc3, 0x89, 0xcf, 0x68, 0x7f, 0x9f, 0xc8, 0xad, + 0xb5, 0x8e, 0xcb, 0x6c, 0x42, 0x7b, 0x50, 0x65, 0xce, 0x42, 0xb8, 0xa1, + 0x89, 0x8a, 0x38, 0xbc, 0x75, 0xc4, 0x58, 0xb2, 0xd5, 0x4d, 0x6b, 0x95, + 0x85, 0xa3, 0x83, 0xb4, 0x36, 0x86, 0x95, 0x69, 0x4c, 0x81, 0xc2, 0x48, + 0x9f, 0x3c, 0x8b, 0xa7, 0x8e, 0xa5, 0x90, 0x44, 0x87, 0x3b, 0x7a, 0x7b, + 0x89, 0x58, 0x92, 0x34, 0xaa, 0xa7, 0xad, 0x8f, 0x6a, 0x9d, 0x4e, 0xaa, + 0x81, 0xb8, 0xad, 0x7a, 0x6b, 0x48, 0x5e, 0x6e, 0x62, 0x55, 0x86, 0xc6, + 0xbf, 0xb0, 0xa8, 0x5b, 0x64, 0x49, 0x98, 0xab, 0x86, 0x27, 0x70, 0xa9, + 0x5c, 0xc8, 0x66, 0x73, 0x45, 0x47, 0x41, 0x99, 0x47, 0x77, 0xda, 0x88, + 0x5e, 0x8d, 0x74, 0x3e, 0x3e, 0xa7, 0x57, 0x41, 0xa0, 0xc6, 0x9d, 0x81, + 0x54, 0x33, 0x89, 0x92, 0x7b, 0x76, 0x99, 0x54, 0x7c, 0x33, 0xc8, 0x77, + 0x38, 0xc3, 0x38, 0x52, 0x8b, 0x5a, 0xbe, 0xa8, 0xa2, 0x63, 0x71, 0x68, + 0x5e, 0x3b, 0xc6, 0x89, 0x72, 0x9e, 0x86, 0x64, 0xa5, 0x7b, 0x68, 0x49, + 0x87, 0x89, 0xc4, 0x70, 0xb8, 0xb3, 0x2f, 0x95, 0x5e, 0xaf, 0xa9, 0x48, + 0xc2, 0xca, 0x72, 0x3e, 0x9c, 0x72, 0xcf, 0x83, 0x72, 0x64, 0x90, 0xbc, + 0xa5, 0x7c, 0x9a, 0x83, 0x3b, 0x31, 0xc3, 0x45, 0x63, 0xcc, 0x99, 0x46, + 0xb0, 0xbe, 0x70, 0xd5, 0x8a, 0xce, 0x4f, 0x8c, 0x39, 0xa2, 0x40, 0x60, + 0x8d, 0x9e, 0x64, 0xb3, 0x3e, 0x93, 0x5f, 0x52, 0xaa, 0xaa, 0x50, 0x45, + 0x72, 0xa2, 0x4f, 0x50, 0xac, 0xc6, 0x30, 0x9a, 0x38, 0xb6, 0x44, 0x81, + 0xa8, 0x51, 0xc3, 0xcf, 0x95, 0xd0, 0x97, 0x7f, 0xbe, 0x84, 0x68, 0x72, + 0xae, 0x95, 0xd4, 0x99, 0x99, 0x9c, 0x69, 0xa6, 0x59, 0x96, 0x75, 0x8f, + 0xa2, 0x62, 0xb3, 0x37, 0xa5, 0xba, 0xc4, 0xc2, 0xd0, 0x5e, 0x8e, 0x70, + 0x31, 0x9f, 0x6f, 0x52, 0x6b, 0xbe, 0x53, 0x7c, 0x62, 0x51, 0xb4, 0x75, + 0xc9, 0xb3, 0x60, 0x77, 0x46, 0x64, 0xc5, 0x5a, 0x78, 0x7e, 0x43, 0x48, + 0x5b, 0x9e, 0x34, 0xbb, 0xc3, 0x9a, 0x3c, 0xd0, 0x96, 0xb7, 0x5e, 0xbf, + 0xcf, 0x4b, 0x32, 0x41, 0x98, 0xd5, 0x74, 0x41, 0xa7, 0x5e, 0x81, 0x84, + 0x9e, 0x8a, 0xc5, 0xba, 0x5a, 0x4b, 0x2a, 0x3d, 0x29, 0x41, 0xa9, 0xc2, + 0xa0, 0x5c, 0x72, 0xac, 0xab, 0x4d, 0x3b, 0x86, 0xb3, 0x90, 0x8f, 0xb5, + 0x46, 0x9e, 0xc1, 0x4e, 0x88, 0x87, 0x8f, 0x3f, 0x4a, 0x6b, 0x48, 0x44, + 0x6d, 0x2c, 0xa8, 0xa3, 0x54, 0xb1, 0xcb, 0x3f, 0x8e, 0xb7, 0xc7, 0xa7, + 0xc2, 0xa3, 0x96, 0xd2, 0x9a, 0xad, 0x6e, 0x8b, 0x94, 0x4b, 0x92, 0xd3, + 0x36, 0xa4, 0x8e, 0x3f, 0x44, 0xae, 0x49, 0x6b, 0xb4, 0x49, 0xb6, 0x86, + 0x36, 0x87, 0x3b, 0x8e, 0x5d, 0xa7, 0x5f, 0xc8, 0x56, 0x49, 0x52, 0x87, + 0x47, 0x69, 0x5a, 0xbc, 0x55, 0x3e, 0x88, 0x88, 0x65, 0x6d, 0x52, 0xbc, + 0x88, 0x79, 0x6e, 0x67, 0xc5, 0x74, 0x94, 0xc0, 0x8b, 0x7c, 0x83, 0xb1, + 0xcc, 0xa6, 0x61, 0xc6, 0x39, 0xcc, 0xb9, 0xc6, 0xc3, 0x62, 0xab, 0x71, + 0x80, 0xbc, 0x49, 0x94, 0x4b, 0x68, 0x3f, 0xa7, 0xb8, 0xc0, 0x6d, 0x55, + 0x74, 0x98, 0x9a, 0x8b, 0x44, 0x77, 0x93, 0x60, 0x51, 0x40, 0xc2, 0x31, + 0x8a, 
0x54, 0xc4, 0x38, 0x60, 0xbd, 0xc0, 0xd8, 0x3f, 0x43, 0x7c, 0xae, + 0x98, 0x30, 0xa3, 0x48, 0xbc, 0x5f, 0x36, 0xb1, 0xa7, 0x7a, 0x63, 0x9d, + 0x4f, 0x80, 0xcd, 0xb9, 0xa1, 0x73, 0x41, 0x46, 0x62, 0xd1, 0x3e, 0x41, + 0x88, 0x98, 0x63, 0xac, 0x8d, 0x6c, 0x5e, 0xa1, 0x3c, 0xc4, 0x7a, 0x86, + 0x48, 0x6c, 0x33, 0x54, 0xbb, 0x6b, 0x65, 0x3b, 0x78, 0x64, 0xb9, 0x67, + 0x79, 0x7c, 0x72, 0x42, 0x65, 0x8a, 0x84, 0x61, 0x51, 0xae, 0x61, 0x8d, + 0x4f, 0x8a, 0xc1, 0x7c, 0xcd, 0x60, 0x48, 0x65, 0x92, 0x62, 0x8c, 0x6a, + 0x6b, 0x9f, 0x9f, 0x70, 0x8c, 0x4d, 0x7d, 0xb0, 0x9c, 0x8e, 0x42, 0xac, + 0x9b, 0x83, 0x30, 0xd0, 0x52, 0x3c, 0x5b, 0xac, 0xc8, 0x5e, 0x46, 0xbf, + 0x42, 0x72, 0xbb, 0x7b, 0x98, 0x89, 0xbb, 0x67, 0x38, 0xc0, 0x31, 0x4a, + 0xac, 0x86, 0x5d, 0x81, 0x50, 0x7c, 0x42, 0x8c, 0x5c, 0x4b, 0x7a, 0x6e, + 0x59, 0xc3, 0x85, 0x74, 0x90, 0x73, 0x45, 0x84, 0x9f, 0x69, 0x7c, 0xaa, + 0x85, 0x4e, 0x52, 0x56, 0xa3, 0x46, 0x42, 0x5c, 0x63, 0xc6, 0x7b, 0x5c, + 0x81, 0xc5, 0x44, 0xd0, 0x3c, 0x7c, 0x45, 0x64, 0xa9, 0xa9, 0x40, 0x2c, + 0x90, 0x56, 0x4a, 0x63, 0x45, 0xc1, 0xbd, 0xa1, 0xa9, 0xb9, 0x8c, 0x87, + 0x3c, 0xad, 0xd3, 0x8e, 0x53, 0x4c, 0x96, 0x6e, 0x49, 0xc2, 0x8e, 0xae, + 0x9e, 0x96, 0x8f, 0x9a, 0x52, 0x7b, 0xa1, 0x93, 0x31, 0x45, 0xbd, 0xa8, + 0x61, 0xa1, 0x76, 0x79, 0x45, 0x2e, 0x8a, 0x85, 0xcb, 0x8d, 0x45, 0x5b, + 0x78, 0xd2, 0x9c, 0xc6, 0x81, 0x8c, 0x95, 0x8e, 0x45, 0x67, 0xa0, 0x9f, + 0x5c, 0x46, 0x50, 0xbb, 0xbc, 0x79, 0x92, 0xd1, 0x78, 0x96, 0xbb, 0x71, + 0x3b, 0x44, 0x96, 0x61, 0xd1, 0x52, 0xc1, 0xbc, 0xa4, 0xa3, 0x33, 0xd8, + 0x5a, 0x81, 0x80, 0x4f, 0x95, 0x45, 0x67, 0xd2, 0x7e, 0x48, 0x52, 0x6f, + 0x49, 0xa3, 0x87, 0x57, 0x40, 0x45, 0x53, 0x79, 0x62, 0x61, 0x93, 0x50, + 0x52, 0x8d, 0x2e, 0x93, 0x83, 0x74, 0xba, 0xa3, 0x5b, 0x3a, 0x43, 0x95, + 0xaa, 0x90, 0x49, 0xb6, 0xbd, 0x95, 0x2f, 0xa2, 0xb9, 0x76, 0xb4, 0xad, + 0x30, 0xd0, 0xa2, 0xc6, 0x65, 0x76, 0xc5, 0xd4, 0x9f, 0x21, 0x46, 0x69, + 0x5f, 0x4d, 0x43, 0x33, 0x5a, 0xb4, 0x86, 0x5d, 0xbd, 0xb1, 0xcf, 0x98, + 0xc1, 0x40, 0x4c, 0x3a, 0xb0, 0xaf, 0xb1, 0x6c, 0x3d, 0x39, 0xcd, 0x60, + 0xc5, 0x7b, 0x96, 0x75, 0x76, 0x3d, 0x47, 0xa9, 0xc0, 0xce, 0x3a, 0x9f, + 0x3a, 0x71, 0xa1, 0xa6, 0xbd, 0x45, 0x78, 0x99, 0x95, 0x8c, 0x90, 0xad, + 0x73, 0x55, 0x9c, 0x44, 0x73, 0x41, 0x9f, 0x86, 0xd3, 0xa8, 0xac, 0x98, + 0x9f, 0x56, 0x83, 0x3d, 0xab, 0xb5, 0x85, 0x86, 0x78, 0x77, 0x3f, 0x2f, + 0xb6, 0x74, 0x3d, 0xb6, 0x85, 0xb9, 0xa4, 0x91, 0xd2, 0xbe, 0x5e, 0x55, + 0x77, 0x6a, 0x43, 0x92, 0x49, 0xbd, 0xc7, 0x75, 0x96, 0x69, 0x34, 0xae, + 0xa0, 0x67, 0x77, 0x45, 0x70, 0x43, 0xac, 0x4b, 0xc7, 0x88, 0x57, 0x8f, + 0xc2, 0x35, 0xa5, 0x7a, 0xac, 0xc6, 0x38, 0x57, 0x9e, 0x70, 0x9c, 0x3e, + 0x47, 0x67, 0x3c, 0x36, 0x67, 0x5e, 0xb4, 0x48, 0x7a, 0xaa, 0xba, 0x58, + 0x76, 0xcf, 0x70, 0x2d, 0xac, 0x9b, 0x68, 0x8f, 0xba, 0x5b, 0xb2, 0x2f, + 0x8c, 0x9a, 0xb4, 0x79, 0x38, 0x34, 0x73, 0xac, 0x85, 0x79, 0x43, 0x38, + 0x6d, 0xb2, 0x75, 0x80, 0x56, 0xbf, 0xa0, 0x4f, 0x4a, 0xcb, 0xcb, 0xad, + 0x59, 0x6c, 0x55, 0xae, 0x89, 0x91, 0xcf, 0x3a, 0x38, 0x60, 0xa6, 0xae, + 0x77, 0x6e, 0xa4, 0xa0, 0xa9, 0x6a, 0x5c, 0x42, 0x55, 0x75, 0x57, 0x5e, + 0x2f, 0xbc, 0x32, 0xd1, 0xba, 0xa0, 0xb5, 0x46, 0xa2, 0x8d, 0x36, 0x8e, + 0x80, 0x2b, 0x33, 0x5e, 0x72, 0xd0, 0xab, 0x72, 0x64, 0x97, 0x65, 0xae, + 0x49, 0xd9, 0x88, 0x6e, 0x51, 0xa1, 0x3b, 0x9c, 0xcd, 0xbf, 0x4c, 0x64, + 0xbb, 0x4f, 0x83, 0xc1, 0xb2, 0x83, 0x93, 0xa8, 0x72, 0xbf, 0x3e, 0x40, + 0x8b, 0x66, 0xa1, 0x89, 0x91, 0x6c, 0x73, 0x66, 0x5a, 0x53, 0xba, 0xa7, + 0x78, 0x88, 0x67, 0xbc, 0x4a, 0x9c, 0x4d, 0xaa, 0x46, 0xc3, 0x6f, 0x45, + 0x69, 
0xa2, 0x7e, 0x4f, 0x43, 0xbc, 0x7c, 0x58, 0x81, 0x8b, 0x31, 0x9e, + 0x4c, 0x63, 0x95, 0x61, 0xba, 0x80, 0xbe, 0x80, 0x5c, 0x79, 0x96, 0x81, + 0x1c, 0x3c, 0x93, 0x9e, 0x81, 0x21, 0x69, 0x8b, 0x8c, 0xc6, 0xd2, 0x4c, + 0x67, 0x9a, 0x74, 0xbb, 0x32, 0xa0, 0xa7, 0x63, 0x54, 0xaa, 0x69, 0x99, + 0x9f, 0xa9, 0x89, 0x6d, 0xa6, 0xd5, 0x5d, 0x78, 0x5c, 0x3e, 0xc6, 0xa4, + 0x8f, 0x5b, 0x7d, 0x58, 0x98, 0x9a, 0x82, 0x4c, 0x66, 0x5d, 0x70, 0xda, + 0x6f, 0xb1, 0x3e, 0x40, 0xba, 0x7b, 0x78, 0x27, 0x3c, 0xb8, 0x90, 0xac, + 0xbc, 0x9d, 0xb2, 0xd3, 0x8e, 0x5d, 0x9c, 0x5f, 0x77, 0xaa, 0xdc, 0x6e, + 0x61, 0xb9, 0xc1, 0x5e, 0x3b, 0xa1, 0x9d, 0x3b, 0x61, 0x40, 0x97, 0xa3, + 0x78, 0x5e, 0xc8, 0x65, 0xc1, 0x8b, 0xa6, 0x3b, 0x8c, 0x4f, 0x5c, 0x8d, + 0xcb, 0xa4, 0xad, 0xcc, 0xa6, 0xaf, 0x8d, 0xaa, 0x71, 0x87, 0x2f, 0x88, + 0xbe, 0x92, 0x32, 0x68, 0x7d, 0x42, 0x5d, 0x8d, 0x88, 0x5c, 0x5c, 0xcc, + 0x42, 0x27, 0x4a, 0x77, 0x35, 0xa2, 0x85, 0xaf, 0x3d, 0x4b, 0x6b, 0x63, + 0xc4, 0x9a, 0x7b, 0xc9, 0x7d, 0x66, 0xa7, 0x79, 0xd1, 0x7e, 0x48, 0x39, + 0x77, 0xcc, 0x94, 0x55, 0xc0, 0xc7, 0x39, 0xad, 0xb4, 0x95, 0x6e, 0xb5, + 0x58, 0x56, 0x83, 0x69, 0xa1, 0xbb, 0xc8, 0xd3, 0xd8, 0x72, 0x59, 0xa4, + 0x80, 0x94, 0x67, 0x8e, 0xb8, 0xc4, 0x85, 0xb0, 0xdb, 0xce, 0x70, 0xbd, + 0x5d, 0xa8, 0xa9, 0xc7, 0x64, 0x64, 0x4a, 0x6b, 0xce, 0x67, 0xb9, 0x64, + 0x74, 0x6c, 0x57, 0x3e, 0x78, 0x5a, 0xbc, 0x1b, 0x6f, 0xb8, 0x8d, 0xbd, + 0x79, 0x80, 0xd2, 0x6f, 0xb4, 0x66, 0x3e, 0xd2, 0xc4, 0xca, 0x5a, 0x86, + 0x3c, 0x62, 0x3c, 0x7a, 0x41, 0xbf, 0x4a, 0x67, 0x79, 0xab, 0x5f, 0x4f, + 0x98, 0x96, 0x79, 0xc2, 0xb3, 0x87, 0x68, 0x52, 0x54, 0xb0, 0x73, 0x59, + 0x82, 0x93, 0x9d, 0x2b, 0x8b, 0x9c, 0x60, 0xb6, 0x33, 0x62, 0xc8, 0x68, + 0x48, 0x64, 0xd5, 0xb2, 0xb1, 0x64, 0xb9, 0x60, 0xb3, 0x93, 0xc2, 0x3f, + 0xaf, 0x61, 0x59, 0x34, 0x57, 0x89, 0x66, 0x8d, 0xad, 0x7a, 0x82, 0x9a, + 0x51, 0xc9, 0x77, 0x5f, 0xae, 0x5c, 0xb9, 0xb2, 0xa2, 0x5c, 0x8b, 0x60, + 0xa0, 0x8d, 0x73, 0x8f, 0x6a, 0x35, 0x6a, 0x35, 0xb7, 0x6a, 0x84, 0xba, + 0x83, 0x4a, 0x74, 0x54, 0xc5, 0x31, 0x4c, 0x79, 0x40, 0xae, 0x8f, 0x49, + 0x56, 0xaf, 0xc0, 0x2b, 0xc9, 0x8e, 0x8a, 0x4b, 0x62, 0x85, 0xa7, 0x53, + 0x90, 0x66, 0x52, 0x66, 0x56, 0x75, 0xce, 0x5d, 0x3a, 0x4e, 0x8e, 0x92, + 0xc1, 0x69, 0x4a, 0x3a, 0x4a, 0x60, 0xb4, 0x71, 0xa1, 0x4e, 0x9d, 0x8d, + 0x83, 0xcd, 0x28, 0xa4, 0xc0, 0xb3, 0x39, 0x87, 0xa1, 0x98, 0x63, 0x3e, + 0x85, 0x7a, 0xdc, 0x3a, 0xa9, 0x80, 0xa5, 0x90, 0xb3, 0x6e, 0x38, 0x83, + 0x70, 0xd3, 0x84, 0xb2, 0x4c, 0xc2, 0x57, 0x90, 0x82, 0xd2, 0x82, 0xb0, + 0xd4, 0x9c, 0x9d, 0x2b, 0x55, 0x76, 0xca, 0x9c, 0x4f, 0x36, 0xbd, 0x39, + 0xa8, 0xbd, 0x4e, 0x97, 0x89, 0xa8, 0xb8, 0x5e, 0xbc, 0x5c, 0xaa, 0x8c, + 0xce, 0x90, 0x49, 0x60, 0x9f, 0x3c, 0x6f, 0x8b, 0x3b, 0x3f, 0xbe, 0xcd, + 0xb7, 0xb4, 0x40, 0x4a, 0x45, 0x50, 0x2d, 0xac, 0xaa, 0xab, 0x85, 0xc7, + 0xbb, 0x72, 0x60, 0xdf, 0xbe, 0xc7, 0xbb, 0xb0, 0xa3, 0x8e, 0x82, 0x42, + 0x7a, 0xab, 0x5c, 0x59, 0x55, 0xa3, 0x5a, 0xa1, 0x81, 0x79, 0x6b, 0xaf, + 0xb5, 0xab, 0x69, 0x1c, 0x35, 0x7b, 0x77, 0xce, 0x79, 0xb7, 0x50, 0x8d, + 0xc1, 0x45, 0x6b, 0x42, 0x80, 0xae, 0x58, 0x5e, 0x77, 0x58, 0xa5, 0x7e, + 0x51, 0x4f, 0xa8, 0x52, 0xa3, 0xc1, 0xaa, 0x33, 0xa9, 0x50, 0x9b, 0xbe, + 0x56, 0xab, 0x93, 0xbb, 0xb7, 0x7d, 0xda, 0x80, 0x7f, 0xc3, 0x8d, 0x3d, + 0x7a, 0xb0, 0xd8, 0x78, 0x38, 0xd3, 0x67, 0xe5, 0xaf, 0x78, 0xa6, 0xd7, + 0x66, 0x61, 0xc0, 0x9c, 0xaf, 0x89, 0xbd, 0x3c, 0x82, 0x8f, 0x83, 0x5e, + 0xbd, 0xd1, 0x5f, 0xba, 0x56, 0x6a, 0xb3, 0x61, 0x5e, 0x65, 0xba, 0xb1, + 0x3a, 0x77, 0x8c, 0x92, 0xb2, 0x96, 0xb1, 0x45, 0x50, 0x53, 0x55, 0xa7, + 0x67, 
0x7b, 0x83, 0x8c, 0x3d, 0x72, 0x5b, 0xa0, 0x62, 0x4b, 0x51, 0x41, + 0xda, 0xac, 0x95, 0x64, 0x6b, 0x96, 0x8f, 0xc0, 0x43, 0x5a, 0x8a, 0x4c, + 0x4a, 0xbc, 0xcc, 0x3c, 0x66, 0x99, 0x5f, 0xc0, 0x6d, 0x60, 0x67, 0x96, + 0xce, 0x71, 0xb2, 0x4c, 0x9d, 0x89, 0xc4, 0x5a, 0x79, 0xa2, 0x71, 0x9b, + 0x42, 0xcc, 0x5d, 0x91, 0x4e, 0xd8, 0xd6, 0x8b, 0xc6, 0x45, 0x91, 0x80, + 0xb3, 0x8d, 0x56, 0x3a, 0x9a, 0x4f, 0xb0, 0x77, 0x9e, 0x76, 0x88, 0x43, + 0x1f, 0x66, 0xd2, 0xc1, 0x51, 0xb1, 0x91, 0xd1, 0x8f, 0xac, 0x7b, 0x6b, + 0xaf, 0xb4, 0x6b, 0x7e, 0x38, 0x60, 0x65, 0x89, 0xc8, 0x6b, 0x6a, 0x45, + 0x69, 0x67, 0x68, 0x80, 0x99, 0x54, 0x62, 0x90, 0xa9, 0xc0, 0x8e, 0x4e, + 0x57, 0xb3, 0x7c, 0x38, 0xa2, 0xa3, 0xc8, 0x9d, 0x47, 0x98, 0x51, 0x62, + 0x5b, 0x82, 0x58, 0x8f, 0xac, 0x7f, 0x61, 0x36, 0x7a, 0x5e, 0xb0, 0x6e, + 0x91, 0x7c, 0x8e, 0x9b, 0x8e, 0x7c, 0xce, 0x49, 0xc8, 0x88, 0x77, 0xbc, + 0x8b, 0x47, 0xa9, 0x74, 0x8e, 0xc5, 0xa1, 0xd3, 0xa9, 0xa1, 0x45, 0x3c, + 0xcb, 0xa3, 0x41, 0x95, 0x82, 0x32, 0x9a, 0x78, 0xb1, 0x90, 0xbd, 0x7d, + 0x8b, 0xb0, 0x5c, 0x3c, 0xbf, 0x62, 0xaf, 0xba, 0x33, 0x85, 0x9c, 0xb3, + 0x3e, 0x51, 0x2d, 0x29, 0xcc, 0xcf, 0xc2, 0xa2, 0x52, 0x54, 0x7d, 0x2d, + 0x58, 0x84, 0x90, 0x7a, 0x9d, 0x41, 0xdb, 0x79, 0x32, 0x65, 0xb5, 0x3b, + 0xa6, 0xd1, 0x7d, 0x55, 0x88, 0xb3, 0x64, 0x88, 0x5f, 0xcf, 0x74, 0x5b, + 0x5d, 0xb3, 0xc3, 0xa6, 0x56, 0x43, 0x4a, 0xa6, 0x53, 0x97, 0xc3, 0xa6, + 0x59, 0x6a, 0xce, 0x90, 0x38, 0x84, 0x2e, 0x7f, 0xa2, 0x50, 0x95, 0x9b, + 0x66, 0xa6, 0x56, 0xb3, 0xa2, 0xa5, 0x94, 0x45, 0x1c, 0x98, 0x98, 0x65, + 0x64, 0x1b, 0xc0, 0x7f, 0x58, 0x70, 0x65, 0xa9, 0xc9, 0x82, 0xd9, 0x75, + 0x59, 0xb6, 0xd1, 0xa0, 0x49, 0xbd, 0xd4, 0x92, 0x5b, 0xbc, 0x57, 0x7c, + 0xb4, 0x69, 0xa7, 0xa8, 0xa6, 0xdb, 0xa2, 0xbf, 0x41, 0x69, 0x9e, 0xdb, + 0x24, 0x4d, 0xb5, 0xe9, 0x70, 0x98, 0xcf, 0x3f, 0xa9, 0x73, 0xac, 0x96, + 0xc9, 0x75, 0xa0, 0x78, 0xce, 0xb7, 0x83, 0x4a, 0x33, 0x67, 0x89, 0x74, + 0xcf, 0x8f, 0xc2, 0x86, 0x46, 0x6d, 0x90, 0x5e, 0xac, 0x44, 0x5f, 0x5e, + 0x5b, 0x72, 0x50, 0x7e, 0x8b, 0x75, 0xa7, 0xa6, 0x47, 0xa8, 0x80, 0x37, + 0xa7, 0x8c, 0xaf, 0xaa, 0xc3, 0xcc, 0x81, 0x8f, 0xb2, 0xcf, 0x6c, 0x5a, + 0x60, 0xb2, 0x3c, 0x3d, 0xa6, 0x7e, 0x81, 0xaa, 0x9b, 0x5d, 0xb9, 0x3d, + 0xd9, 0x5d, 0x9d, 0xd2, 0x91, 0x90, 0x65, 0x8b, 0xa3, 0xb0, 0x5a, 0xa1, + 0x90, 0x7a, 0xcb, 0x82, 0xc5, 0x38, 0xa6, 0xc8, 0x1f, 0xb0, 0x8c, 0xa9, + 0xa1, 0x97, 0x7a, 0xb0, 0x81, 0x74, 0x81, 0xc5, 0x6b, 0xd3, 0x4a, 0x3a, + 0x58, 0x53, 0x93, 0xc8, 0xb3, 0x62, 0x67, 0x65, 0x37, 0xce, 0xc4, 0xd1, + 0x79, 0xa4, 0x59, 0x40, 0x44, 0x72, 0x5f, 0x44, 0x7c, 0xb1, 0x42, 0xc6, + 0x8f, 0x53, 0xa4, 0x29, 0x49, 0xae, 0x38, 0x31, 0xc0, 0x7c, 0x47, 0x63, + 0x60, 0x41, 0x92, 0x89, 0x6e, 0xdb, 0xba, 0x74, 0xb4, 0x73, 0xaa, 0x50, + 0xb3, 0x9a, 0xa3, 0x79, 0xa6, 0xa5, 0xba, 0x95, 0xc2, 0x5e, 0x6c, 0x67, + 0x61, 0x8f, 0x40, 0xcb, 0x58, 0x91, 0x9b, 0x41, 0x89, 0xba, 0x92, 0xb8, + 0x6f, 0x88, 0x96, 0x98, 0x61, 0xb9, 0xa1, 0x4c, 0x67, 0x9f, 0xb7, 0xa5, + 0xa4, 0x35, 0xb6, 0xaf, 0x75, 0x40, 0x46, 0x78, 0x2c, 0xc9, 0x74, 0x74, + 0xb2, 0x45, 0x4d, 0x4a, 0xb3, 0xc8, 0x4d, 0x55, 0x45, 0x96, 0x40, 0x68, + 0x8f, 0xc3, 0x5a, 0x99, 0x7b, 0x62, 0x4c, 0x84, 0x7b, 0x99, 0x73, 0x48, + 0x3c, 0x6d, 0xbe, 0x44, 0x8e, 0x78, 0x63, 0x33, 0xe5, 0x3b, 0xa6, 0x6b, + 0x77, 0xc8, 0x28, 0x1e, 0x7e, 0x43, 0xbb, 0x37, 0x55, 0x55, 0x75, 0x8b, + 0x43, 0xa5, 0xa9, 0x79, 0xc9, 0x7b, 0xba, 0xb4, 0x76, 0xc1, 0x5e, 0x86, + 0x2a, 0x58, 0x42, 0xd9, 0xb6, 0x9c, 0x2b, 0x66, 0xd3, 0xb3, 0x89, 0x55, + 0xa7, 0x6e, 0xba, 0xbc, 0x97, 0x66, 0x45, 0x33, 0x56, 0xb0, 0xb2, 0x37, + 0x9e, 
0xac, 0xb6, 0xc5, 0x80, 0x91, 0x76, 0x77, 0xb4, 0x52, 0xa7, 0xaf, + 0xd7, 0xd9, 0x82, 0x78, 0x84, 0x67, 0x70, 0x75, 0xb7, 0x78, 0x46, 0x36, + 0x99, 0xbe, 0xaa, 0x91, 0x4e, 0x87, 0x6e, 0x7a, 0xbe, 0x81, 0x6f, 0x2c, + 0xbb, 0x93, 0xa1, 0x3e, 0x94, 0xb3, 0x4a, 0x7a, 0x48, 0xc1, 0x5d, 0xb3, + 0x7e, 0x50, 0xb0, 0x2d, 0x9e, 0x6f, 0x8e, 0x95, 0x60, 0xcd, 0x3f, 0x55, + 0x85, 0xd4, 0x6c, 0x43, 0x8c, 0xc8, 0x50, 0x54, 0x75, 0x90, 0xaf, 0xbe, + 0x8c, 0x45, 0x77, 0xc4, 0xa1, 0x75, 0xcd, 0x7b, 0x9a, 0xc3, 0xba, 0x8f, + 0xa8, 0xbf, 0x6e, 0x31, 0xa4, 0x9c, 0xcd, 0xac, 0xce, 0x88, 0xc4, 0x3a, + 0x50, 0x7e, 0x97, 0x79, 0x69, 0x4a, 0xdb, 0xcf, 0xa8, 0x73, 0x75, 0xd5, + 0x35, 0xa0, 0x43, 0x84, 0xaa, 0xbe, 0x8e, 0x24, 0x75, 0x8c, 0xcc, 0x68, + 0x63, 0x3b, 0xec, 0x8d, 0xc8, 0x87, 0x62, 0x84, 0x51, 0x3c, 0x53, 0x38, + 0xbb, 0x43, 0xba, 0x95, 0x4d, 0x2c, 0x84, 0x3c, 0xae, 0x4f, 0x86, 0xb7, + 0xe2, 0xa5, 0x76, 0x4a, 0x54, 0x58, 0xaa, 0x97, 0x96, 0x97, 0xcf, 0xc3, + 0x98, 0x8e, 0xb4, 0x65, 0x52, 0x6a, 0xaf, 0x59, 0x83, 0xc9, 0xbb, 0x7a, + 0xb8, 0xa7, 0x73, 0xb5, 0xaf, 0xbb, 0x36, 0x69, 0x46, 0x74, 0xc3, 0x42, + 0x7b, 0x85, 0xf3, 0xda, 0x3b, 0xbb, 0xcd, 0x87, 0x8e, 0xa1, 0xa9, 0x33, + 0x9d, 0x50, 0x8a, 0x7f, 0xb2, 0x67, 0x89, 0xd0, 0x53, 0x87, 0x61, 0x5b, + 0xcc, 0xac, 0x53, 0x9f, 0x3d, 0x9d, 0xb6, 0x61, 0xc1, 0x87, 0xb9, 0xc5, + 0x95, 0xb8, 0x7e, 0x5d, 0x4c, 0x51, 0x4b, 0x64, 0x81, 0xa3, 0xc2, 0x3c, + 0x33, 0x37, 0x4b, 0x8e, 0x6c, 0x5b, 0xd1, 0x75, 0xd0, 0xd1, 0x51, 0x7f, + 0x50, 0xb4, 0x41, 0xc4, 0x46, 0x6d, 0xa1, 0x34, 0x6c, 0x80, 0x85, 0xd6, + 0xb9, 0x73, 0x70, 0x51, 0x61, 0x73, 0x53, 0xb6, 0x5c, 0x73, 0xb7, 0x92, + 0xb7, 0x7b, 0x70, 0x78, 0x6c, 0x46, 0x3d, 0x72, 0xc3, 0xce, 0x93, 0x3e, + 0x8b, 0x9a, 0xba, 0xce, 0x3a, 0x5e, 0x5d, 0xd9, 0xb4, 0x39, 0xc9, 0xcb, + 0xd8, 0x5f, 0xbf, 0x36, 0xd2, 0x98, 0x99, 0xea, 0x80, 0xd4, 0x95, 0x4a, + 0xab, 0x44, 0x50, 0x30, 0x6e, 0x3d, 0x3a, 0xc1, 0x90, 0xaa, 0x72, 0xb7, + 0xd0, 0x83, 0xaa, 0x7f, 0xb6, 0x73, 0x58, 0x5c, 0x51, 0x68, 0x8b, 0xb9, + 0x5b, 0x60, 0x79, 0xc2, 0x43, 0x98, 0x69, 0xd4, 0x62, 0xa4, 0x6b, 0xa5, + 0x9a, 0x34, 0x8f, 0x89, 0x7e, 0x81, 0xcc, 0x45, 0x63, 0xb7, 0x72, 0x87, + 0x57, 0x75, 0x35, 0x77, 0x6c, 0xa8, 0x89, 0xac, 0x5d, 0x61, 0xc2, 0xc7, + 0x35, 0x30, 0x6c, 0x8e, 0x83, 0x80, 0x5c, 0xc3, 0x78, 0xcf, 0x4c, 0x90, + 0xb6, 0xac, 0x4d, 0xa2, 0x7a, 0x39, 0x85, 0x5c, 0xb8, 0x62, 0xc2, 0x36, + 0x95, 0x69, 0x46, 0x38, 0x3c, 0xc7, 0x90, 0xb9, 0x6f, 0x9c, 0x7c, 0x99, + 0xac, 0xb1, 0x64, 0x96, 0xc7, 0x69, 0x3e, 0x6d, 0xb0, 0x7e, 0xdb, 0xbc, + 0xa5, 0x6b, 0x4c, 0x51, 0xc7, 0x37, 0x66, 0xc3, 0xbb, 0xb3, 0x80, 0x87, + 0x89, 0x32, 0x4f, 0x9d, 0x8c, 0x34, 0x52, 0xb6, 0x82, 0x5c, 0x43, 0xd8, + 0x6f, 0xa2, 0xc3, 0xae, 0xba, 0x9f, 0x54, 0x2b, 0x37, 0x2c, 0x3f, 0x68, + 0xbc, 0x3c, 0xd1, 0x75, 0xcf, 0xa4, 0x5b, 0x76, 0x4c, 0x3f, 0xda, 0x3f, + 0xc5, 0x50, 0x48, 0xaf, 0x36, 0xd4, 0xc3, 0xa3, 0x37, 0x82, 0x4d, 0x89, + 0x57, 0x8e, 0x67, 0x88, 0x77, 0xac, 0x79, 0x47, 0x5d, 0xa6, 0x5a, 0x3d, + 0x4f, 0xad, 0x99, 0x92, 0x3c, 0x9a, 0x76, 0xc6, 0x6b, 0x2e, 0x34, 0xa0, + 0x7f, 0xd3, 0x68, 0x73, 0xb5, 0x56, 0x58, 0x52, 0xb3, 0x92, 0x4c, 0xce, + 0x3d, 0x78, 0x9e, 0xce, 0x45, 0x90, 0x4e, 0xc1, 0x79, 0x82, 0xc2, 0x49, + 0x9b, 0x6f, 0x4f, 0x96, 0x64, 0x92, 0x39, 0x54, 0xcc, 0xb6, 0x46, 0xca, + 0xc2, 0x36, 0xc8, 0x52, 0x66, 0x66, 0x50, 0xc7, 0x74, 0xa7, 0xa6, 0x39, + 0x8e, 0xb2, 0x6e, 0x78, 0x42, 0x93, 0xcf, 0xb2, 0xac, 0xd8, 0x49, 0x4c, + 0x40, 0x70, 0x7c, 0xc1, 0xca, 0x7d, 0xa5, 0x99, 0x26, 0x4e, 0x4d, 0xc5, + 0x34, 0x2d, 0x9b, 0xa3, 0x40, 0x3f, 0x51, 0x8e, 0x97, 0x85, 0xa6, 0xb0, + 0x99, 
0x5b, 0x4c, 0xd2, 0xa5, 0x2e, 0xa4, 0x7a, 0xc3, 0xa0, 0x80, 0x76, + 0x3e, 0x66, 0xa8, 0xb9, 0xc7, 0x83, 0x83, 0xb3, 0xc2, 0x79, 0x4e, 0x40, + 0xaf, 0x73, 0xaf, 0x3d, 0xa5, 0x85, 0x70, 0x81, 0xd9, 0x8c, 0xb2, 0x5d, + 0x3d, 0x55, 0x70, 0x6e, 0x6f, 0x45, 0x35, 0x7e, 0xae, 0xc3, 0xbc, 0x7b, + 0x42, 0xce, 0x87, 0x8a, 0xc1, 0x7e, 0xbc, 0xb1, 0xd0, 0x49, 0x67, 0x8e, + 0x51, 0x4a, 0xce, 0xa7, 0x79, 0x33, 0xbd, 0x32, 0x3c, 0x88, 0xc3, 0x92, + 0x32, 0x8d, 0xc4, 0x4f, 0x93, 0x5e, 0x86, 0x7e, 0xbf, 0x88, 0x45, 0xbc, + 0x31, 0x61, 0xba, 0xa6, 0xba, 0x96, 0x9e, 0x68, 0xb6, 0xbb, 0xb6, 0x34, + 0xa8, 0x89, 0x57, 0xc6, 0x3a, 0xa6, 0x60, 0x51, 0x3c, 0x74, 0x34, 0xbe, + 0x32, 0xb0, 0x4c, 0x6e, 0x37, 0x8a, 0xaf, 0x62, 0x26, 0x6b, 0xa7, 0xcb, + 0xc2, 0x50, 0x73, 0x56, 0xb1, 0x55, 0xa8, 0x3c, 0xa0, 0xbc, 0x6b, 0x98, + 0x9d, 0x41, 0x81, 0xbb, 0x9e, 0x8f, 0x9a, 0xcc, 0x66, 0xc4, 0x7b, 0x8e, + 0x85, 0xbb, 0xd4, 0xc5, 0xa1, 0x96, 0x53, 0x55, 0x84, 0x32, 0x7d, 0x81, + 0xb9, 0x34, 0x46, 0x59, 0x4e, 0xb4, 0x35, 0xaa, 0x9a, 0x5e, 0x56, 0x7c, + 0x76, 0x94, 0xbe, 0x3d, 0xb8, 0xb3, 0xcc, 0xb6, 0x25, 0x52, 0x6e, 0x65, + 0x4c, 0x7e, 0x9c, 0x49, 0xc7, 0x6d, 0x5f, 0x81, 0x82, 0x98, 0x73, 0x32, + 0x91, 0x58, 0x98, 0x91, 0xbf, 0x9f, 0x67, 0x82, 0x54, 0xc4, 0xad, 0x56, + 0x83, 0x6f, 0x25, 0x5e, 0xb4, 0x43, 0x8a, 0x98, 0xc9, 0x54, 0x96, 0xbc, + 0xc8, 0x70, 0x70, 0x56, 0xc0, 0x50, 0x63, 0xa0, 0x74, 0x89, 0xd3, 0x8b, + 0x72, 0x79, 0x8b, 0xaa, 0x87, 0x93, 0xa5, 0x72, 0x31, 0x77, 0x6f, 0x88, + 0x4d, 0x75, 0x5d, 0x9d, 0x82, 0xca, 0x6b, 0x3e, 0x63, 0xa1, 0xbb, 0x47, + 0xb8, 0xb5, 0x52, 0x6b, 0x8e, 0xa8, 0xce, 0x40, 0x86, 0x57, 0x4a, 0xd2, + 0x82, 0x55, 0x4b, 0x4c, 0x84, 0xbf, 0xbe, 0x4d, 0x5d, 0xb7, 0x4e, 0x38, + 0x53, 0x36, 0x4e, 0x3f, 0x61, 0x30, 0x8f, 0xa6, 0xb4, 0x79, 0xa6, 0x5f, + 0xa8, 0x98, 0x57, 0xd3, 0x4f, 0xac, 0x50, 0xa7, 0xa1, 0x98, 0x8d, 0xab, + 0x49, 0x4a, 0xcb, 0xb6, 0xba, 0x42, 0x5e, 0xb7, 0xb4, 0x60, 0xce, 0xc5, + 0xa9, 0xc8, 0x55, 0xb4, 0xab, 0x95, 0x90, 0x8f, 0x5a, 0x31, 0x62, 0x70, + 0x70, 0x7c, 0x65, 0x6a, 0xb8, 0x5a, 0x3c, 0xd2, 0x51, 0x98, 0x28, 0x5f, + 0x42, 0x38, 0x47, 0x79, 0x9c, 0xc4, 0x86, 0x4b, 0xd3, 0xde, 0x38, 0x7b, + 0x75, 0xa2, 0x9c, 0x85, 0xb4, 0x81, 0x56, 0xad, 0x7f, 0x52, 0x4d, 0x45, + 0x3a, 0xc6, 0xa8, 0x77, 0x85, 0xc7, 0x83, 0x34, 0xa9, 0x95, 0x45, 0x37, + 0x6b, 0x45, 0x39, 0xcd, 0x81, 0x6d, 0xb7, 0x80, 0x61, 0x8f, 0xca, 0xbe, + 0x34, 0x2f, 0x3b, 0x53, 0xa5, 0x64, 0xae, 0x48, 0x33, 0xa3, 0xce, 0x81, + 0x7c, 0x6d, 0x86, 0x8d, 0xaf, 0x50, 0x3a, 0x72, 0x9b, 0xc7, 0x2f, 0xbc, + 0xbd, 0x64, 0xb5, 0xc5, 0x46, 0x9a, 0xa9, 0x98, 0x77, 0xc5, 0xc0, 0x69, + 0x5f, 0x69, 0xb0, 0x82, 0xbb, 0xbe, 0x6b, 0xc4, 0x90, 0x47, 0x56, 0xc0, + 0xb4, 0xcb, 0x7f, 0x9f, 0xac, 0xa4, 0xc4, 0x87, 0xcb, 0x5a, 0x8d, 0x62, + 0xc2, 0xb5, 0xb9, 0x70, 0x45, 0x76, 0x97, 0x33, 0x41, 0xb4, 0x9d, 0x7e, + 0xd5, 0x31, 0xba, 0x7c, 0xa2, 0x2a, 0xac, 0x58, 0x3b, 0x7e, 0x3c, 0xbf, + 0xb7, 0xb9, 0x60, 0x3d, 0xcd, 0x4b, 0xc1, 0xa8, 0x9f, 0xa5, 0xa3, 0xb9, + 0x9e, 0x69, 0xa2, 0xc3, 0xc3, 0x2f, 0x3b, 0x44, 0x3a, 0x7b, 0xae, 0x4c, + 0xc4, 0xc3, 0x40, 0xcb, 0x7b, 0xa3, 0x92, 0x83, 0x89, 0x9a, 0x70, 0xcb, + 0xa4, 0xb5, 0xa7, 0x5d, 0x6f, 0x8a, 0xc6, 0xa6, 0x27, 0xb2, 0x92, 0xc0, + 0xb0, 0x2d, 0x72, 0xad, 0x4a, 0x9d, 0x30, 0x39, 0x52, 0x49, 0x6a, 0x81, + 0x9f, 0xbf, 0x8f, 0x65, 0x89, 0x3b, 0x43, 0xa3, 0x4f, 0x89, 0x41, 0xc3, + 0x8b, 0xa6, 0x46, 0x75, 0xbe, 0x36, 0x53, 0x8a, 0x6b, 0xbb, 0x4d, 0x95, + 0xbb, 0xd0, 0x64, 0x44, 0xdd, 0x91, 0x37, 0x70, 0x3d, 0x46, 0xc4, 0xb8, + 0x82, 0x4f, 0x5b, 0x89, 0x93, 0xac, 0x5a, 0x8f, 0xa6, 0x4c, 0x80, 0x71, + 0xa2, 
0x97, 0x54, 0xa7, 0xc3, 0xc7, 0x48, 0x41, 0x64, 0x7b, 0x80, 0x45, + 0x45, 0xaf, 0x87, 0x56, 0x69, 0xcb, 0xb5, 0x54, 0x9f, 0x98, 0x39, 0xc2, + 0x8d, 0x94, 0x9a, 0xc6, 0x39, 0x50, 0x33, 0x64, 0xb6, 0xad, 0x51, 0x5a, + 0x54, 0x64, 0xab, 0x90, 0xae, 0xbd, 0xb4, 0x48, 0xb2, 0x82, 0xbf, 0x3f, + 0x42, 0x6c, 0xaa, 0xb0, 0x44, 0xc2, 0xc6, 0x52, 0x47, 0xa2, 0xa9, 0x55, + 0x89, 0xd3, 0x8e, 0xc1, 0xa8, 0x6a, 0x78, 0x7b, 0x3f, 0xbb, 0x4a, 0x6e, + 0x96, 0xd4, 0xc6, 0xaa, 0x44, 0xb5, 0xb4, 0x67, 0x41, 0xa5, 0x4d, 0xcb, + 0x48, 0xc8, 0xc5, 0xa1, 0x6b, 0x9f, 0xc4, 0x75, 0x81, 0x63, 0xc9, 0x64, + 0x73, 0x81, 0x39, 0x41, 0x86, 0xe1, 0x9c, 0xc2, 0x6e, 0xaa, 0x93, 0x9d, + 0x55, 0xb3, 0x71, 0xa6, 0xc4, 0xa2, 0xcf, 0x35, 0x79, 0x3e, 0xa5, 0xc8, + 0x6e, 0x3d, 0x2e, 0xa6, 0x80, 0x58, 0x6e, 0x71, 0xe2, 0xbf, 0xa6, 0x83, + 0xb5, 0x53, 0x71, 0x84, 0xa4, 0x50, 0x5d, 0xdb, 0x46, 0x3b, 0x7d, 0xb2, + 0x39, 0x5d, 0xc1, 0x21, 0xa5, 0xd1, 0x4e, 0xb7, 0x7a, 0x5f, 0xb6, 0x5b, + 0x35, 0xbb, 0xd8, 0xae, 0x4d, 0x8f, 0xac, 0x61, 0x5d, 0x4b, 0xb3, 0x2d, + 0x74, 0x7d, 0x58, 0x7c, 0x7c, 0xb1, 0xb9, 0xad, 0xc6, 0xab, 0x59, 0xa1, + 0xa1, 0x49, 0x60, 0x92, 0x5a, 0x65, 0xad, 0x5a, 0xd9, 0x77, 0x8c, 0x81, + 0x3e, 0x4f, 0x48, 0xa5, 0xb4, 0xc5, 0x9c, 0xa7, 0xd2, 0x8c, 0xcb, 0x9b, + 0x8b, 0xc6, 0xd8, 0xb7, 0xd7, 0xa2, 0x7b, 0xa4, 0x81, 0x8f, 0xce, 0x83, + 0xc5, 0xa1, 0x9d, 0x78, 0x69, 0x5c, 0xb1, 0xce, 0x6b, 0x7f, 0xcd, 0xc7, + 0x87, 0x7d, 0x7d, 0xa6, 0x49, 0x70, 0x63, 0x8c, 0x98, 0x50, 0x90, 0xad, + 0x5d, 0x79, 0xd5, 0xb7, 0xa4, 0xad, 0x90, 0x55, 0x7b, 0x4b, 0xa9, 0xdf, + 0x7f, 0xbb, 0x93, 0x67, 0x2f, 0x92, 0x7d, 0x41, 0x49, 0x5c, 0x45, 0x77, + 0x60, 0xd9, 0x6f, 0xb7, 0xa6, 0xbb, 0x6a, 0xc8, 0xcb, 0xca, 0x8c, 0x6a, + 0xc1, 0x8e, 0x43, 0x6a, 0x56, 0xac, 0x67, 0x5e, 0x34, 0x56, 0xb3, 0xa8, + 0xb4, 0xc0, 0xab, 0xa2, 0x78, 0x71, 0x7e, 0x4d, 0xd6, 0x7f, 0xb9, 0x6a, + 0x78, 0x74, 0x64, 0x75, 0x4a, 0x59, 0x8f, 0x35, 0x55, 0xe2, 0xb7, 0x44, + 0x4c, 0xba, 0x97, 0x42, 0xaa, 0xbb, 0x58, 0x98, 0x65, 0xba, 0x51, 0x35, + 0xb6, 0x5d, 0x4a, 0xc1, 0xd5, 0xb5, 0x78, 0xaf, 0x5f, 0x7f, 0xac, 0x6c, + 0x9b, 0x37, 0xc9, 0xb4, 0x94, 0x83, 0x9e, 0x4f, 0xb6, 0xb9, 0x6f, 0x9d, + 0xd0, 0x87, 0x95, 0x48, 0xc2, 0xca, 0x87, 0xc1, 0x94, 0x60, 0x98, 0x2c, + 0x81, 0x7d, 0xbf, 0xc6, 0x2b, 0xc2, 0xbf, 0x93, 0x81, 0x46, 0x4e, 0xca, + 0x8b, 0x2f, 0x5b, 0x67, 0xb4, 0x96, 0x6d, 0xcb, 0x94, 0xc6, 0xb1, 0x4f, + 0x5e, 0x36, 0x83, 0x5c, 0x63, 0x7c, 0x76, 0xaa, 0x8b, 0x6f, 0x72, 0xb5, + 0xd2, 0x8b, 0x56, 0x9a, 0x51, 0x50, 0xb3, 0xc2, 0xb5, 0xc0, 0x6e, 0xb0, + 0x73, 0x6e, 0xbe, 0x5f, 0x70, 0x3b, 0x64, 0xad, 0x8a, 0x94, 0xb5, 0xcb, + 0x52, 0xd3, 0x85, 0xb5, 0x28, 0xa4, 0x9c, 0x75, 0x67, 0x3a, 0x56, 0x46, + 0x5d, 0x8d, 0xa2, 0x75, 0x9c, 0xab, 0x74, 0x89, 0x4e, 0xc0, 0x97, 0xbe, + 0x6a, 0x9a, 0xc7, 0x50, 0x51, 0x6c, 0xd0, 0x49, 0x9e, 0x4c, 0x48, 0xb9, + 0x5b, 0x40, 0x55, 0xb7, 0x52, 0x7f, 0x8c, 0xa1, 0xab, 0xc9, 0x62, 0xcb, + 0x9d, 0x5d, 0xc6, 0xaa, 0x37, 0xbd, 0x86, 0xc6, 0x8c, 0xb3, 0xcd, 0xcc, + 0x5d, 0x7c, 0x79, 0xad, 0xb3, 0x4c, 0xc7, 0xaa, 0x41, 0x2d, 0x3e, 0xb1, + 0x9a, 0xa0, 0x8b, 0xaf, 0x76, 0x5d, 0xd0, 0x3c, 0x49, 0xad, 0x94, 0x6c, + 0x76, 0xc1, 0xb9, 0x6a, 0x69, 0x73, 0x46, 0x97, 0x8c, 0xc9, 0xc6, 0x99, + 0xd7, 0x39, 0xad, 0xbb, 0x7e, 0x54, 0x41, 0x7a, 0x8c, 0x7b, 0xa2, 0x5b, + 0x5f, 0xb1, 0x69, 0x6b, 0x43, 0x84, 0xb0, 0x63, 0x44, 0x7f, 0x31, 0x53, + 0xa5, 0x8b, 0x86, 0xb6, 0x3a, 0x3c, 0xcb, 0x9f, 0x8a, 0x7a, 0xc7, 0xbd, + 0x7a, 0xb9, 0xb6, 0x47, 0xc8, 0x63, 0x74, 0x65, 0x5a, 0x89, 0xa5, 0xb8, + 0x4e, 0x9d, 0xd0, 0xbf, 0x80, 0x89, 0xc2, 0x63, 0x64, 0x9d, 0x59, 0x46, + 0x99, 
0x75, 0x6f, 0xbd, 0xa0, 0xb2, 0x7b, 0x77, 0x2e, 0x84, 0x95, 0x42, + 0x8d, 0x4f, 0x8d, 0xd0, 0x6e, 0x4a, 0x95, 0x3c, 0xb3, 0x82, 0xcd, 0xbd, + 0x86, 0xae, 0x52, 0x9f, 0x71, 0x8f, 0x93, 0x8f, 0x4e, 0xca, 0x7d, 0xa9, + 0x71, 0xd2, 0xc3, 0xa4, 0xca, 0x88, 0x8b, 0x59, 0x87, 0x34, 0xb0, 0x93, + 0x9b, 0x3c, 0xa9, 0xcf, 0xc0, 0x70, 0x96, 0x9c, 0x51, 0xaf, 0x63, 0xd5, + 0x73, 0xb5, 0x55, 0xc9, 0x96, 0x9e, 0x51, 0x6d, 0x9c, 0x71, 0xaa, 0x94, + 0xcd, 0x49, 0x92, 0xa6, 0x37, 0x52, 0x47, 0x67, 0x4f, 0xd7, 0x4d, 0x76, + 0xae, 0xde, 0x88, 0x54, 0x69, 0xa8, 0x9b, 0x66, 0x6d, 0x34, 0x7c, 0x9f, + 0x57, 0x56, 0x5c, 0x99, 0x2a, 0xbb, 0x38, 0x59, 0x77, 0x7e, 0xa6, 0xaa, + 0x95, 0x3c, 0x3d, 0x8e, 0x3e, 0xbf, 0x4a, 0x91, 0x5e, 0x45, 0x76, 0x63, + 0x3b, 0x9e, 0xb3, 0x6d, 0x58, 0x66, 0x5c, 0x7c, 0x8c, 0x7a, 0xb0, 0xa0, + 0xc1, 0x76, 0x3d, 0xd5, 0x55, 0x80, 0x5b, 0x87, 0x5d, 0x58, 0x5f, 0xb6, + 0xaf, 0xbc, 0x82, 0x5b, 0x43, 0xd2, 0xa0, 0x87, 0x60, 0xc6, 0xbf, 0x59, + 0xb3, 0x4a, 0x3c, 0xa3, 0xcd, 0xa5, 0x85, 0xc5, 0xc5, 0x7b, 0x75, 0xa8, + 0x6a, 0x4e, 0xa3, 0xa0, 0x96, 0x3b, 0x79, 0x27, 0x99, 0x9c, 0x8c, 0x3e, + 0xa5, 0x5a, 0x9e, 0x6f, 0xd2, 0x2c, 0xb2, 0x63, 0x41, 0x74, 0xc8, 0x8a, + 0x65, 0x62, 0xcd, 0xa2, 0x7a, 0x71, 0xdb, 0xca, 0x48, 0x5b, 0xbd, 0x8e, + 0xc6, 0x8d, 0xc5, 0xb0, 0xc5, 0x7f, 0x54, 0x66, 0x83, 0x5b, 0x6a, 0x96, + 0x4e, 0x7f, 0xb0, 0x4f, 0x9d, 0xc4, 0x4f, 0x3d, 0x9c, 0x6c, 0x40, 0x87, + 0x6e, 0x4b, 0x4d, 0x82, 0x5d, 0xa4, 0x7e, 0xa8, 0x8f, 0x7d, 0x7e, 0x83, + 0x8e, 0xcd, 0xc7, 0x61, 0xbf, 0x71, 0x3e, 0x4f, 0x8f, 0xbd, 0x57, 0x2e, + 0x8e, 0x82, 0x64, 0x4c, 0xc5, 0xbd, 0x9c, 0xc7, 0xd0, 0x83, 0x45, 0x9f, + 0x60, 0xb1, 0x6b, 0x8a, 0x93, 0xa3, 0xb3, 0x8e, 0xa6, 0xba, 0x8e, 0x35, + 0x2e, 0xaf, 0x7a, 0x33, 0x4f, 0x78, 0xc3, 0x3d, 0xa7, 0x47, 0x51, 0x68, + 0xba, 0x65, 0x3b, 0x3c, 0xb6, 0xc9, 0xd5, 0xc4, 0x89, 0xb2, 0x6d, 0x46, + 0xcb, 0xca, 0x79, 0x68, 0xc1, 0x3f, 0x6c, 0x5b, 0x91, 0xa9, 0xbf, 0x33, + 0x8f, 0x73, 0xc6, 0x72, 0x2e, 0x74, 0x31, 0x9c, 0xaa, 0xc1, 0x62, 0x8a, + 0x55, 0x37, 0x5d, 0x58, 0xc1, 0xa4, 0x65, 0xa1, 0x97, 0xa9, 0x39, 0x8b, + 0xa5, 0x82, 0x36, 0xa8, 0x42, 0x92, 0xda, 0x85, 0xc8, 0xc3, 0xcc, 0x5a, + 0x56, 0x50, 0x3b, 0xab, 0x67, 0xb2, 0x94, 0x9c, 0xcf, 0xcf, 0xa0, 0x56, + 0xa4, 0xce, 0xad, 0x89, 0x9e, 0xa7, 0x3d, 0x87, 0xbe, 0x41, 0x9e, 0x35, + 0x65, 0x41, 0xbe, 0x57, 0x59, 0x41, 0xad, 0x39, 0x82, 0x93, 0xad, 0x47, + 0x70, 0xb1, 0xad, 0x5b, 0xa8, 0x4e, 0x3a, 0x3b, 0x64, 0x8c, 0xcd, 0x3f, + 0xc7, 0x9f, 0xa2, 0x4c, 0x9b, 0x56, 0x71, 0x5f, 0x8c, 0x72, 0x34, 0x83, + 0x90, 0xd9, 0xc3, 0x7a, 0xb2, 0xc4, 0x3d, 0x8e, 0x8f, 0x2d, 0xb8, 0xbe, + 0x33, 0xc7, 0xa2, 0x9e, 0x7f, 0xb8, 0xa4, 0x73, 0x6c, 0x44, 0xd3, 0xae, + 0x60, 0xa4, 0x6e, 0xbe, 0x5d, 0xd2, 0x5b, 0xd1, 0xb0, 0xb2, 0xcb, 0x89, + 0xb5, 0xac, 0x93, 0x8a, 0xb5, 0x84, 0x4a, 0x38, 0x8b, 0xbe, 0xbd, 0xa2, + 0x61, 0x72, 0xa8, 0xc9, 0xb0, 0xc7, 0x67, 0x58, 0x48, 0x92, 0x85, 0xa6, + 0x7e, 0xc7, 0xac, 0x8e, 0xb7, 0x49, 0xa5, 0x32, 0xbd, 0xb4, 0x6c, 0x4f, + 0xad, 0x83, 0x37, 0xbd, 0x3c, 0x59, 0xcb, 0xb0, 0x46, 0x37, 0x62, 0x58, + 0xbe, 0x7f, 0x2d, 0x44, 0x61, 0x84, 0xcf, 0x72, 0x93, 0xcb, 0x8e, 0xd1, + 0xc6, 0x45, 0xc9, 0x4e, 0xa0, 0xb4, 0x7e, 0x83, 0xd6, 0x73, 0xd2, 0x3a, + 0xa1, 0xb9, 0x64, 0x94, 0xba, 0xa5, 0x5b, 0xc3, 0x48, 0xa4, 0x53, 0x79, + 0xb2, 0x53, 0x5a, 0x80, 0x83, 0x6c, 0x2e, 0x72, 0x3b, 0x78, 0xbd, 0x45, + 0xcb, 0x9f, 0x95, 0x73, 0xc0, 0xc2, 0xcc, 0xb8, 0x51, 0xa6, 0xb6, 0x99, + 0x36, 0x59, 0x8d, 0x8c, 0x63, 0x9c, 0x40, 0x70, 0x72, 0x5a, 0x9a, 0x33, + 0x8c, 0x9d, 0x4e, 0x78, 0xad, 0x6d, 0x79, 0xc9, 0x58, 0x3d, 0x9e, 0x8a, + 0x59, 
0x57, 0x5a, 0xa7, 0xc7, 0x40, 0x86, 0x72, 0xb8, 0x5c, 0x75, 0x44, + 0xb9, 0xcb, 0x34, 0x6a, 0x7b, 0x47, 0xc9, 0x86, 0x91, 0x75, 0x60, 0xa2, + 0x8e, 0x4e, 0x4e, 0xca, 0xbe, 0x79, 0x91, 0x61, 0x30, 0x80, 0xb5, 0x6f, + 0xbc, 0xae, 0x5f, 0x75, 0xbf, 0x85, 0x4c, 0xae, 0x31, 0x79, 0x5a, 0x8c, + 0xc9, 0x9b, 0xd3, 0x6f, 0x58, 0xbc, 0xa4, 0x91, 0x75, 0x32, 0x3d, 0x8f, + 0x59, 0x4d, 0x6c, 0x82, 0xb6, 0xd3, 0x88, 0x70, 0x88, 0xbe, 0x99, 0x5d, + 0x58, 0xb7, 0xae, 0x70, 0x4b, 0x73, 0x63, 0x4f, 0xbf, 0xa7, 0xb4, 0x5b, + 0x7f, 0xad, 0xc6, 0x48, 0x80, 0x59, 0x92, 0xcd, 0xbb, 0xc9, 0x93, 0xa7, + 0xc7, 0x94, 0x8b, 0x47, 0xc5, 0xb7, 0x4f, 0x64, 0x83, 0x7e, 0x3c, 0x2e, + 0x8d, 0x4a, 0xb2, 0x65, 0x6c, 0x4a, 0x63, 0x4d, 0xbf, 0x32, 0x43, 0x76, + 0xca, 0x5a, 0x33, 0xbb, 0x59, 0x5b, 0xb8, 0xc7, 0x93, 0xa6, 0x58, 0x5d, + 0x7d, 0x48, 0xa7, 0x90, 0xc1, 0xb2, 0xcf, 0xc9, 0x36, 0xa0, 0xb5, 0xb1, + 0x8f, 0x31, 0x97, 0x5b, 0xb7, 0x47, 0xae, 0xc0, 0x40, 0xce, 0x82, 0x84, + 0x8f, 0x3c, 0x9e, 0x87, 0x62, 0x86, 0x98, 0xca, 0xc5, 0x6f, 0xca, 0x2f, + 0x69, 0x82, 0xb5, 0x97, 0xbc, 0xd2, 0x97, 0x7e, 0xbe, 0x70, 0x8f, 0xcd, + 0x98, 0x9a, 0xb4, 0x63, 0x52, 0x6e, 0x8c, 0x83, 0x5b, 0x66, 0xca, 0x77, + 0x64, 0x4e, 0x68, 0xc6, 0x56, 0x97, 0x85, 0x95, 0x63, 0x59, 0x36, 0xcb, + 0x47, 0xc0, 0xc3, 0x4a, 0xd9, 0xa9, 0x47, 0xa5, 0xc6, 0x73, 0x70, 0x4b, + 0xae, 0x4a, 0x95, 0x7c, 0xa6, 0x55, 0x7f, 0x96, 0x66, 0x5f, 0x8c, 0xd3, + 0x85, 0x5c, 0x5d, 0x9a, 0xd5, 0x5a, 0x45, 0x43, 0x34, 0x94, 0x79, 0xba, + 0x64, 0x41, 0x4c, 0xd6, 0xb6, 0x5f, 0x4b, 0xb7, 0x6f, 0xcb, 0x55, 0x48, + 0xce, 0xb0, 0xd5, 0x6c, 0x61, 0xb9, 0x9c, 0xd0, 0xb5, 0xa9, 0x5f, 0x68, + 0x95, 0x5b, 0x79, 0x60, 0x4e, 0x6a, 0x84, 0xa7, 0x63, 0xc3, 0x9b, 0x3e, + 0x9c, 0x8c, 0x5c, 0x80, 0x9c, 0x7d, 0x57, 0xb3, 0xcb, 0x3a, 0xb0, 0x60, + 0xce, 0x57, 0x30, 0x58, 0x72, 0x3b, 0x96, 0xb9, 0x7a, 0x7c, 0x82, 0x95, + 0xa2, 0x88, 0x5b, 0x36, 0x9c, 0x78, 0x6f, 0xbd, 0xa2, 0x45, 0x77, 0x39, + 0xb3, 0x79, 0xbc, 0x7d, 0x3c, 0x3d, 0x3b, 0x4f, 0x91, 0x48, 0x77, 0xc8, + 0x68, 0x50, 0xc0, 0xbf, 0x9f, 0x6b, 0x8d, 0x39, 0x5d, 0x74, 0x5c, 0x8b, + 0xbd, 0x96, 0x55, 0x5c, 0x9a, 0x80, 0xd0, 0x8e, 0x80, 0x88, 0x4a, 0x98, + 0x5d, 0x75, 0x93, 0xbc, 0x79, 0x86, 0x71, 0xbc, 0x8d, 0xb5, 0x41, 0x4b, + 0x57, 0x68, 0x64, 0x5e, 0xa3, 0x6a, 0x70, 0x3f, 0x7b, 0xa7, 0x67, 0xb0, + 0x88, 0x7b, 0x7b, 0x83, 0x39, 0x47, 0x94, 0xc4, 0xae, 0x6e, 0xa0, 0xcf, + 0x95, 0x63, 0x80, 0xcd, 0x64, 0x41, 0x84, 0x77, 0x85, 0x9b, 0x69, 0x4c, + 0xa2, 0x87, 0x2c, 0x77, 0x97, 0xa7, 0x87, 0x99, 0x95, 0xab, 0x79, 0xdf, + 0xad, 0x50, 0x56, 0x65, 0x4c, 0x72, 0x4e, 0xba, 0x5e, 0x96, 0x65, 0xce, + 0x39, 0x8b, 0x48, 0xc4, 0x33, 0x69, 0x52, 0x9c, 0x49, 0x8a, 0xd6, 0xc6, + 0x88, 0xca, 0x95, 0xc7, 0x76, 0x6c, 0x3c, 0xa8, 0x61, 0x6a, 0xcf, 0xc9, + 0x80, 0x94, 0xa7, 0x99, 0xad, 0x94, 0x4f, 0x85, 0x5e, 0x53, 0x77, 0xa6, + 0x39, 0x50, 0x8f, 0x34, 0xc6, 0x6d, 0x96, 0xbe, 0x9c, 0xcf, 0x7c, 0x9b, + 0x4f, 0xd3, 0x4c, 0xc8, 0x80, 0x89, 0xbf, 0x82, 0xcb, 0x4a, 0x58, 0x35, + 0x91, 0x7f, 0x95, 0xa3, 0xb5, 0x59, 0xa3, 0x8e, 0xc4, 0x8f, 0x89, 0x5d, + 0x62, 0x5d, 0x9f, 0x40, 0x93, 0x9c, 0xbd, 0x37, 0xaa, 0xc3, 0x72, 0x59, + 0xc4, 0x61, 0x2b, 0xb3, 0x84, 0x78, 0xaa, 0x63, 0xcf, 0x6a, 0x54, 0xa9, + 0xc5, 0x5b, 0xb2, 0x2f, 0x4c, 0x3c, 0x4e, 0x72, 0x4d, 0x8e, 0x95, 0x8e, + 0xa9, 0xa3, 0x44, 0x57, 0x7e, 0xca, 0x7d, 0x6f, 0xdc, 0xbc, 0xbc, 0x85, + 0x71, 0xd8, 0xb7, 0x41, 0x65, 0x3d, 0x7c, 0x91, 0xbb, 0xc4, 0xa1, 0xb3, + 0x72, 0x82, 0xb7, 0x5b, 0xbe, 0x78, 0x3a, 0x41, 0xc6, 0x36, 0x55, 0x7a, + 0x8b, 0xcc, 0x6a, 0x1a, 0xab, 0x65, 0x61, 0x53, 0x7c, 0x2d, 0x8b, 0x97, + 0x6e, 
0x98, 0x93, 0x97, 0x92, 0x58, 0x2c, 0x7a, 0x4b, 0x6d, 0x4e, 0xb4, + 0x3f, 0x42, 0x93, 0xc1, 0x43, 0x6a, 0x68, 0xb6, 0x72, 0x33, 0xc2, 0xa6, + 0x4b, 0xa6, 0x56, 0x7d, 0xa8, 0x6a, 0x67, 0xab, 0xc5, 0xc2, 0x67, 0xa0, + 0xbb, 0x54, 0x73, 0xc3, 0x99, 0x37, 0x68, 0x9e, 0x3e, 0xb6, 0x8e, 0xa0, + 0x4a, 0x7e, 0xd8, 0xc1, 0xaa, 0x3d, 0xac, 0x9e, 0x34, 0xb9, 0x30, 0x7c, + 0x3d, 0x6c, 0x95, 0x75, 0x7b, 0xa2, 0x59, 0x74, 0x7b, 0x7e, 0x65, 0x5a, + 0x2c, 0xa7, 0x3e, 0x59, 0x87, 0x4e, 0x90, 0x9f, 0x92, 0x3f, 0x73, 0x85, + 0xd5, 0x7d, 0x45, 0x3a, 0x9e, 0xbb, 0xb6, 0x49, 0x8c, 0x89, 0x6d, 0x70, + 0x8a, 0xbd, 0x85, 0x88, 0x9b, 0xbc, 0x38, 0xc5, 0x49, 0x56, 0x8e, 0x8c, + 0xb8, 0x9e, 0x64, 0xbd, 0xcb, 0xb5, 0xce, 0x82, 0x54, 0x97, 0x5c, 0x52, + 0x72, 0x51, 0x49, 0xb4, 0x4b, 0x8a, 0x89, 0x7c, 0xc0, 0xa6, 0x8a, 0x98, + 0x4c, 0x53, 0x55, 0x45, 0x42, 0x91, 0x4f, 0x4d, 0x6d, 0x3d, 0x3e, 0x4e, + 0x7d, 0xae, 0x4d, 0xb8, 0x5e, 0xdc, 0x77, 0x9d, 0x5a, 0xa4, 0x80, 0xd1, + 0xb2, 0xbf, 0xba, 0x93, 0x84, 0x7f, 0x3e, 0x43, 0xa5, 0x60, 0x40, 0x38, + 0x37, 0xa8, 0x80, 0x41, 0xb8, 0x60, 0x8d, 0x95, 0x8a, 0xa0, 0x52, 0x7c, + 0x97, 0x9a, 0x56, 0xab, 0xb4, 0x82, 0x80, 0x2c, 0xc9, 0xa7, 0x42, 0xae, + 0xb7, 0xb6, 0xbf, 0xa8, 0xb3, 0x75, 0x94, 0x88, 0x75, 0x92, 0xb0, 0x58, + 0x45, 0x3a, 0xa7, 0x6e, 0xd1, 0x69, 0x99, 0x6f, 0x50, 0xcd, 0xb7, 0xb4, + 0x59, 0xa7, 0x6a, 0xd1, 0x45, 0x5c, 0xc8, 0xbb, 0xae, 0x4a, 0x6b, 0x67, + 0x67, 0x9b, 0x7e, 0x4f, 0xd7, 0xb0, 0x2d, 0x50, 0x3c, 0xaf, 0x81, 0x4f, + 0xaf, 0x64, 0x9e, 0xab, 0xba, 0x3f, 0xc1, 0xbe, 0x91, 0x75, 0x5c, 0x8c, + 0xae, 0x72, 0x9c, 0x6b, 0x61, 0xad, 0x6c, 0x54, 0xa3, 0x62, 0xb0, 0x75, + 0x64, 0xb9, 0x44, 0x59, 0x2b, 0x8b, 0x8c, 0x3a, 0x5f, 0x5b, 0x6b, 0x50, + 0x87, 0x7d, 0x79, 0x85, 0xa3, 0xa0, 0x4e, 0x59, 0x41, 0x53, 0x97, 0xa9, + 0xb7, 0x85, 0x2c, 0x67, 0x6f, 0x30, 0xa0, 0x55, 0x7a, 0xb5, 0x37, 0xc8, + 0x97, 0x77, 0xcc, 0x4f, 0x9e, 0x44, 0xd6, 0x47, 0x74, 0xa7, 0x6e, 0x49, + 0xaf, 0x77, 0xb0, 0x98, 0x7a, 0x75, 0x3c, 0x90, 0x81, 0xa6, 0x4f, 0xbd, + 0x41, 0x99, 0x99, 0x95, 0x6b, 0xb3, 0xb4, 0xbb, 0xa2, 0x77, 0xbb, 0x7a, + 0x85, 0xb4, 0x94, 0xc1, 0x67, 0x80, 0xcb, 0x71, 0x69, 0x63, 0xce, 0x86, + 0x62, 0x9d, 0xb2, 0x7a, 0x86, 0x9e, 0xb7, 0x9c, 0x8c, 0x39, 0xa1, 0xb5, + 0x82, 0x65, 0x64, 0x74, 0xa5, 0x82, 0x96, 0x43, 0x7e, 0xa3, 0x47, 0x7d, + 0x6c, 0x6a, 0x6c, 0x5b, 0x85, 0x4b, 0x98, 0x47, 0x85, 0xaf, 0xbe, 0x85, + 0x94, 0xce, 0xbc, 0xd8, 0x75, 0xb8, 0x9d, 0x77, 0xa0, 0xb7, 0x57, 0xce, + 0xae, 0xb3, 0x4e, 0x6a, 0xb4, 0x82, 0x58, 0x60, 0x4d, 0xa8, 0xbe, 0xb6, + 0x4c, 0xb1, 0x63, 0x6e, 0x5b, 0x41, 0x8e, 0xbc, 0xa4, 0xc1, 0x71, 0x50, + 0xd1, 0xb3, 0x96, 0x3c, 0x98, 0xa0, 0xaf, 0xb4, 0x89, 0x78, 0xc4, 0x84, + 0x38, 0xb0, 0x6d, 0xc0, 0xd2, 0x9b, 0x7a, 0x55, 0x3a, 0x98, 0xa6, 0x6f, + 0x7b, 0x70, 0xb8, 0x6b, 0x8a, 0xa3, 0x94, 0xb3, 0x78, 0x83, 0x36, 0x9a, + 0x90, 0x84, 0x78, 0x4c, 0x9c, 0x37, 0x77, 0x38, 0x43, 0xa0, 0x8e, 0x64, + 0xa5, 0x9b, 0x4e, 0xaa, 0x5e, 0x5c, 0x47, 0x95, 0xb3, 0x79, 0x37, 0x62, + 0x42, 0x62, 0x84, 0x68, 0xa4, 0xb0, 0x4f, 0xa2, 0x8f, 0x76, 0xc1, 0x9e, + 0x7b, 0x40, 0x3f, 0xb3, 0xc2, 0xc9, 0x8d, 0xcd, 0x6f, 0xb9, 0x7b, 0x31, + 0x77, 0x3b, 0x83, 0xa8, 0x73, 0x93, 0xa2, 0xb4, 0x99, 0x68, 0xc5, 0x5b, + 0x32, 0x47, 0x8f, 0xc2, 0x8f, 0xb6, 0x94, 0xc4, 0xaf, 0x52, 0x86, 0x65, + 0xcd, 0x37, 0x8f, 0x86, 0x65, 0x6d, 0x3f, 0xd6, 0x88, 0x68, 0x88, 0x41, + 0x71, 0xde, 0x67, 0xb2, 0xd3, 0x61, 0x7d, 0x47, 0x5e, 0xb0, 0x4e, 0x9f, + 0xb0, 0x8b, 0xca, 0xad, 0xa2, 0x34, 0x4f, 0xcb, 0xa4, 0xa1, 0x89, 0xc0, + 0x8c, 0xc5, 0xa1, 0x6a, 0xd1, 0x6f, 0xc2, 0x86, 0x8f, 0x45, 0x58, 0xc8, + 0xc6, 
0x80, 0xc7, 0x9e, 0x72, 0xc6, 0x8b, 0x51, 0xb2, 0x98, 0x3d, 0xd4, + 0xa7, 0xbc, 0x41, 0x9d, 0x98, 0xb7, 0xad, 0x44, 0xc7, 0xa5, 0xad, 0x5e, + 0x42, 0xa1, 0xbd, 0x52, 0x6f, 0xa9, 0x69, 0x72, 0x64, 0x8d, 0xa0, 0x5c, + 0xc1, 0x8d, 0x59, 0x55, 0x4f, 0xcf, 0xbb, 0xcf, 0x54, 0x3c, 0xa1, 0xa1, + 0x32, 0xaf, 0x82, 0x91, 0x83, 0x71, 0x76, 0xce, 0x70, 0x39, 0x97, 0xb3, + 0x6f, 0x4c, 0x7d, 0x71, 0xa3, 0x77, 0x44, 0xd5, 0x97, 0x63, 0x6f, 0xaa, + 0xaf, 0x72, 0xb7, 0x83, 0x3f, 0xaf, 0x67, 0x9e, 0xb9, 0x8a, 0x35, 0x3d, + 0x3c, 0x54, 0xc6, 0x4b, 0x6e, 0xce, 0x4e, 0x78, 0x38, 0x4b, 0xc0, 0x8e, + 0x89, 0x95, 0xbf, 0x4c, 0x7b, 0xa4, 0x57, 0x53, 0x49, 0xb4, 0xa3, 0xc1, + 0xb4, 0x5a, 0x9a, 0xd3, 0x71, 0x80, 0x57, 0x6e, 0x40, 0xb6, 0xc1, 0x5a, + 0xa9, 0x95, 0x67, 0xc8, 0xa0, 0x68, 0x68, 0xac, 0x6a, 0x55, 0x39, 0xa8, + 0xb3, 0x89, 0xaa, 0xad, 0xc2, 0x93, 0x38, 0x7f, 0x57, 0x49, 0x82, 0xbf, + 0x53, 0xca, 0x71, 0xa3, 0x9f, 0x72, 0x20, 0xc7, 0x82, 0x5d, 0xaa, 0x95, + 0xac, 0x32, 0x66, 0x3b, 0xc0, 0x9d, 0x83, 0xab, 0xb6, 0x66, 0x88, 0x64, + 0x77, 0x83, 0x9e, 0x83, 0x4c, 0x4e, 0x51, 0x52, 0xb9, 0x4c, 0x60, 0x8f, + 0x5b, 0x9f, 0x6d, 0x9e, 0xa7, 0xc8, 0x9d, 0x64, 0x58, 0x3a, 0xb4, 0x91, + 0xc9, 0x82, 0x8d, 0xbb, 0x49, 0x8d, 0x5c, 0x63, 0xd1, 0x3d, 0x89, 0x6a, + 0x6f, 0x9d, 0xb9, 0xbe, 0xd6, 0x63, 0x4f, 0xb5, 0x38, 0x48, 0xa3, 0x85, + 0x91, 0xa3, 0x85, 0x39, 0x51, 0x6f, 0x4f, 0x4d, 0x5a, 0xbd, 0x9b, 0x6d, + 0xd1, 0xb8, 0xdf, 0x80, 0x67, 0x4b, 0xc2, 0x7b, 0xcc, 0x47, 0x9a, 0x7b, + 0xc9, 0xcc, 0x44, 0x7b, 0x87, 0x70, 0x71, 0xab, 0xc7, 0x46, 0x75, 0xbb, + 0x59, 0x3f, 0xb4, 0xc9, 0x4a, 0xb0, 0xca, 0x5a, 0x77, 0x90, 0x65, 0x7f, + 0xb2, 0x60, 0x98, 0x7e, 0x65, 0xb6, 0xd5, 0x9e, 0x6d, 0x49, 0x5c, 0x7c, + 0x4f, 0x40, 0x45, 0xa8, 0x71, 0x63, 0x79, 0x4b, 0x76, 0x82, 0x9f, 0xb3, + 0x91, 0x3d, 0x6f, 0xca, 0x4f, 0x36, 0x45, 0x74, 0xc5, 0x79, 0x6f, 0x9e, + 0x5f, 0x2f, 0x99, 0x7a, 0xae, 0xba, 0x40, 0x4e, 0xa7, 0x3f, 0x32, 0xc3, + 0xc0, 0x67, 0x6b, 0xa9, 0xc0, 0x54, 0x66, 0x8c, 0x8c, 0x65, 0xb7, 0x66, + 0xb2, 0x97, 0xae, 0x84, 0xa3, 0xbb, 0x64, 0x8c, 0x97, 0x64, 0x4b, 0x3b, + 0x76, 0xbf, 0x8f, 0xba, 0xc0, 0xb0, 0x49, 0x39, 0x64, 0x1f, 0x71, 0x91, + 0xb1, 0xbb, 0x91, 0xaf, 0xa2, 0x44, 0xc9, 0xe4, 0x49, 0x43, 0x89, 0x6f, + 0x3b, 0xca, 0xb8, 0x6d, 0x83, 0x32, 0x76, 0x53, 0xa4, 0x97, 0x59, 0x92, + 0x3c, 0xb5, 0xb1, 0x5a, 0x7e, 0x82, 0x4c, 0x42, 0x9a, 0xb9, 0xb2, 0xc0, + 0x72, 0x96, 0xb5, 0xbf, 0x33, 0x5d, 0xa1, 0xc9, 0x75, 0xca, 0x70, 0x92, + 0xbf, 0x76, 0x65, 0x38, 0x8f, 0xb2, 0x83, 0x48, 0x70, 0x66, 0x7d, 0x4f, + 0x85, 0xbb, 0x67, 0xc0, 0x2f, 0xd5, 0x30, 0xc2, 0x61, 0x4c, 0xae, 0xb6, + 0x70, 0x65, 0x33, 0xc1, 0x3c, 0x82, 0x5c, 0xcd, 0x3e, 0x68, 0x78, 0x95, + 0xbe, 0x41, 0x66, 0x4a, 0x65, 0x72, 0xd1, 0xa0, 0x96, 0x63, 0x38, 0x65, + 0xaa, 0x51, 0x6a, 0x69, 0xaf, 0x41, 0x5f, 0x2c, 0xc0, 0xb3, 0x73, 0x9c, + 0x55, 0x41, 0x3c, 0xa5, 0xba, 0xad, 0x7a, 0x61, 0x38, 0xc2, 0x87, 0x5f, + 0xbc, 0xca, 0x6a, 0x60, 0x49, 0x8c, 0xcf, 0x50, 0x7f, 0xb7, 0xa9, 0xb7, + 0xc6, 0x9f, 0x8a, 0x70, 0x4b, 0x6a, 0x9a, 0x79, 0xb0, 0x48, 0x7c, 0xc7, + 0x78, 0x3b, 0xb2, 0x6e, 0xc1, 0xa2, 0x6f, 0x83, 0xaa, 0x91, 0xb5, 0x73, + 0x3c, 0xc7, 0x6b, 0x3e, 0x9d, 0x92, 0xae, 0xc4, 0x3a, 0x86, 0x9a, 0xb4, + 0xa9, 0x96, 0x69, 0x98, 0xc4, 0x4a, 0x8d, 0xc9, 0x83, 0x49, 0x4b, 0x78, + 0x4e, 0x49, 0x40, 0x80, 0xcd, 0xc1, 0xc6, 0x91, 0xb9, 0x3d, 0x69, 0x60, + 0xa2, 0x67, 0xb0, 0x9a, 0x57, 0x54, 0xcc, 0x5c, 0x5c, 0x7b, 0x4e, 0xd2, + 0x98, 0x53, 0x7d, 0x58, 0x6f, 0x7a, 0xdf, 0xcd, 0x81, 0x82, 0xa7, 0xc8, + 0x89, 0xa6, 0x6a, 0x70, 0x8c, 0x82, 0x51, 0x74, 0x71, 0x9c, 0x8d, 0x60, + 0x9f, 
0x39, 0x51, 0x48, 0x69, 0x5f, 0xbe, 0x98, 0x68, 0xd5, 0x58, 0xaf, + 0x85, 0x90, 0x66, 0x7b, 0xaa, 0x3f, 0x5b, 0x36, 0x9e, 0x71, 0x76, 0xbf, + 0x70, 0xb4, 0x9e, 0xbb, 0x67, 0xb7, 0xb5, 0xc1, 0x92, 0xd6, 0xbd, 0x96, + 0x74, 0xc6, 0x44, 0x57, 0x93, 0x49, 0x93, 0x98, 0xd3, 0x91, 0xcb, 0x92, + 0x4e, 0x7e, 0xa8, 0x52, 0x43, 0xb4, 0x3a, 0xb0, 0x96, 0x64, 0x7a, 0x73, + 0x47, 0x60, 0x5d, 0xd5, 0x8d, 0xbb, 0x8a, 0x4a, 0x6e, 0xad, 0xc1, 0xb7, + 0x76, 0xb9, 0x97, 0x77, 0x99, 0x94, 0xb0, 0x76, 0x95, 0xc4, 0x5c, 0x84, + 0x5d, 0x81, 0xbe, 0xb0, 0x41, 0x42, 0xdc, 0xbc, 0x83, 0x89, 0x7d, 0xc4, + 0x74, 0x35, 0x46, 0x42, 0x46, 0x84, 0x78, 0x5c, 0x30, 0x97, 0xbe, 0xc6, + 0x3e, 0x5e, 0xba, 0x4b, 0x62, 0x87, 0x79, 0x85, 0x82, 0x4b, 0xb2, 0x45, + 0x64, 0x41, 0xd7, 0x7e, 0xdd, 0x6d, 0x4e, 0x5d, 0x2d, 0x5d, 0x58, 0x4f, + 0xa7, 0xc7, 0x38, 0xb7, 0x67, 0x87, 0x54, 0x50, 0x73, 0x46, 0xa2, 0x80, + 0xaa, 0x52, 0x8e, 0x9d, 0x4d, 0x4b, 0xb7, 0x97, 0x88, 0x7e, 0x7e, 0xad, + 0x3e, 0x98, 0xa9, 0x62, 0x52, 0x4b, 0x7f, 0x57, 0x64, 0x3c, 0x69, 0x71, + 0x88, 0xd3, 0x8b, 0x3b, 0xd0, 0xae, 0x94, 0x95, 0x92, 0x7a, 0x51, 0x4f, + 0xac, 0x94, 0xd5, 0x83, 0x7f, 0xc9, 0x74, 0x84, 0x7c, 0xb5, 0xcc, 0x4b, + 0x7c, 0xee, 0xa0, 0xca, 0x49, 0xb5, 0x3b, 0x40, 0x3a, 0xba, 0x62, 0x70, + 0x73, 0xc7, 0x37, 0x74, 0xac, 0xb0, 0x70, 0x2d, 0x39, 0x37, 0x51, 0x96, + 0x48, 0xce, 0xc5, 0xd0, 0x99, 0xba, 0xb6, 0xa3, 0x65, 0xcc, 0x36, 0x57, + 0xbc, 0x94, 0x9f, 0x96, 0x55, 0xcb, 0x62, 0x4d, 0x9b, 0x64, 0xbf, 0x43, + 0x9b, 0x9c, 0xbd, 0xd1, 0xc3, 0x9f, 0xaa, 0x66, 0xd0, 0x9b, 0xaf, 0xb4, + 0xca, 0xd0, 0x91, 0x72, 0x6f, 0x60, 0x6f, 0x73, 0x5f, 0x5d, 0x8d, 0xbf, + 0xb3, 0x4d, 0x74, 0x8e, 0x79, 0xbe, 0x78, 0x62, 0xd8, 0x7b, 0xa8, 0x5f, + 0x49, 0x8f, 0x66, 0xb1, 0xc4, 0x52, 0x99, 0x94, 0x86, 0x33, 0x34, 0x67, + 0x35, 0x9c, 0xa6, 0xce, 0xbd, 0x4f, 0xcc, 0x4c, 0x5b, 0xc9, 0x52, 0x84, + 0x70, 0xab, 0xae, 0x54, 0xad, 0x34, 0x8b, 0x50, 0x9e, 0xc0, 0x63, 0x66, + 0xb2, 0x9a, 0x45, 0x37, 0xb7, 0x6d, 0xb9, 0x89, 0xab, 0xdd, 0x71, 0x9c, + 0xbc, 0xaa, 0x95, 0xa3, 0xc2, 0x6e, 0x62, 0x43, 0x69, 0x84, 0x86, 0x6b, + 0xce, 0x9d, 0x46, 0x4d, 0xc5, 0x7c, 0xc3, 0xaa, 0xb8, 0x7f, 0xaa, 0x64, + 0xa9, 0xc7, 0x8c, 0xce, 0xa5, 0x56, 0x73, 0x36, 0x61, 0x98, 0xcb, 0x38, + 0x72, 0xd0, 0x93, 0x4e, 0x60, 0x75, 0x67, 0x8f, 0x60, 0xc1, 0xb2, 0x58, + 0x8f, 0xd8, 0xa1, 0xaf, 0x96, 0x3a, 0x3b, 0x85, 0x91, 0xc1, 0x8b, 0x34, + 0xb2, 0xaf, 0xc2, 0x4c, 0xc8, 0x7f, 0xc7, 0xb4, 0xae, 0xa8, 0x92, 0xa4, + 0x6f, 0x63, 0xce, 0x44, 0x8a, 0xdb, 0x65, 0xcf, 0x91, 0x9b, 0xa5, 0x35, + 0xb2, 0x4e, 0xc9, 0x8e, 0x51, 0x53, 0xbd, 0x5a, 0xbf, 0xaa, 0x9c, 0x3c, + 0xab, 0xcb, 0xba, 0x40, 0xb6, 0x6a, 0x6c, 0x50, 0x70, 0xaa, 0x86, 0xba, + 0xb5, 0x92, 0x64, 0xbb, 0xcb, 0xb9, 0x44, 0x73, 0x57, 0x4a, 0x59, 0x5f, + 0x2a, 0x7b, 0x7e, 0x8f, 0xa6, 0xce, 0x52, 0x5d, 0x73, 0x99, 0x5b, 0x7f, + 0x48, 0xab, 0xd0, 0x85, 0x5a, 0x45, 0xaf, 0xa0, 0x97, 0x9e, 0x86, 0xac, + 0x95, 0x43, 0xd9, 0xa1, 0x2d, 0xca, 0x60, 0xa9, 0x96, 0x6d, 0x3e, 0x50, + 0xa6, 0x4a, 0x7a, 0x3e, 0xa9, 0x79, 0xc4, 0xb3, 0x4d, 0x43, 0x9d, 0xa8, + 0x81, 0xae, 0x3c, 0x86, 0xcf, 0x42, 0xcc, 0xc1, 0x8b, 0xb4, 0xae, 0x40, + 0xc9, 0x90, 0x6f, 0xa1, 0x65, 0xb5, 0xd4, 0x53, 0xc8, 0x49, 0x4e, 0x5e, + 0x38, 0x8f, 0x99, 0x6b, 0x8a, 0x56, 0x7a, 0xb2, 0x62, 0x40, 0x36, 0x54, + 0x7f, 0xa1, 0xa8, 0x40, 0x31, 0x78, 0x3b, 0x60, 0x83, 0xb9, 0xc6, 0x50, + 0x54, 0x5c, 0x2f, 0xa7, 0x57, 0x4a, 0xa0, 0x38, 0x7d, 0x5a, 0x40, 0xb9, + 0x56, 0x98, 0x92, 0x81, 0xa0, 0x40, 0x3a, 0x99, 0x3f, 0xa0, 0x4a, 0xb8, + 0x5f, 0xcb, 0xb9, 0xb5, 0x79, 0xbd, 0xa5, 0xaa, 0xa8, 0x98, 0x9e, 0x6d, + 0xc8, 
0xc4, 0xb5, 0x8c, 0xd1, 0xd2, 0x89, 0xa2, 0xd5, 0xaf, 0x85, 0x89, + 0x6b, 0x35, 0xb6, 0xc6, 0xae, 0xc1, 0xbf, 0xb3, 0x8e, 0x3b, 0xd7, 0xb6, + 0xd1, 0x71, 0x56, 0x44, 0x94, 0x73, 0xb4, 0x53, 0x47, 0xa4, 0x6d, 0x8e, + 0xc9, 0x6b, 0x5d, 0x46, 0xce, 0x4b, 0x49, 0x44, 0xc6, 0xaa, 0xd4, 0x66, + 0x86, 0x7a, 0x7b, 0x98, 0x56, 0x38, 0xb3, 0xbe, 0x61, 0x84, 0x2f, 0x25, + 0x9c, 0x78, 0x3f, 0x75, 0x43, 0x3a, 0x7c, 0x64, 0x36, 0x5b, 0x69, 0x98, + 0x71, 0x76, 0xac, 0xaf, 0x4d, 0x84, 0x84, 0x5b, 0x6e, 0x69, 0x39, 0x76, + 0x79, 0x2a, 0x2d, 0x99, 0x50, 0x94, 0x3b, 0x7b, 0x77, 0x4e, 0x35, 0x9a, + 0x82, 0x8d, 0x43, 0xa4, 0x71, 0x5f, 0x43, 0xa3, 0xd6, 0x47, 0x9e, 0x69, + 0x5f, 0x66, 0xad, 0x6e, 0x43, 0x69, 0x7e, 0x80, 0xa1, 0x8f, 0x75, 0xaf, + 0x80, 0x7c, 0x38, 0x4d, 0xcb, 0x6e, 0x4e, 0x91, 0x78, 0xb9, 0x34, 0x7c, + 0x40, 0x86, 0x82, 0x4d, 0x81, 0x3e, 0x4b, 0x3d, 0x9c, 0x85, 0x3e, 0xb7, + 0xab, 0xba, 0x5a, 0x60, 0xb4, 0x5b, 0x56, 0x90, 0x89, 0x91, 0x5d, 0xbb, + 0xcc, 0xb1, 0xad, 0x48, 0x7b, 0x49, 0x5f, 0xbc, 0x30, 0xbc, 0xa4, 0x5a, + 0x76, 0x68, 0xd1, 0xc4, 0xaf, 0x5b, 0x6f, 0x58, 0x5d, 0xaa, 0x72, 0x78, + 0x51, 0x55, 0xc4, 0x42, 0x62, 0xd4, 0x3a, 0x39, 0x7b, 0x9f, 0x89, 0x3b, + 0x67, 0xac, 0x80, 0xb4, 0xcd, 0x56, 0xc8, 0xa1, 0xba, 0xb7, 0x96, 0x4b, + 0x4c, 0xa8, 0xa3, 0x33, 0xcf, 0xce, 0x77, 0xae, 0x4d, 0xc0, 0x7b, 0x45, + 0x9a, 0xbc, 0x43, 0x67, 0x94, 0x5b, 0xa0, 0x7e, 0xc3, 0xc0, 0xc9, 0x72, + 0xc4, 0x6b, 0xaa, 0x4f, 0x46, 0xa5, 0xaa, 0xb6, 0x51, 0x37, 0x2e, 0x5d, + 0x83, 0x36, 0xbc, 0x54, 0x91, 0x87, 0x61, 0x32, 0x8c, 0x51, 0x6d, 0x3a, + 0x79, 0xa5, 0xb2, 0x6c, 0x91, 0x3e, 0x84, 0x4d, 0x5c, 0x78, 0x75, 0xc5, + 0xbd, 0xa4, 0x79, 0xad, 0x5d, 0x9b, 0xcd, 0xcd, 0xb1, 0xb7, 0x5a, 0xd4, + 0x7e, 0x5b, 0x90, 0x93, 0x8b, 0x5b, 0x62, 0x85, 0x47, 0x79, 0x83, 0x3d, + 0x77, 0x3e, 0x4d, 0x94, 0x36, 0xd8, 0x86, 0xd0, 0xb7, 0x45, 0x96, 0xb6, + 0x99, 0x3e, 0x4d, 0xb6, 0x89, 0x55, 0xbc, 0x97, 0xbb, 0x7a, 0xa5, 0x7a, + 0x84, 0xc7, 0x4a, 0x6e, 0x68, 0x3f, 0x4e, 0xb1, 0xc1, 0x5a, 0x73, 0x5f, + 0x54, 0x60, 0xda, 0x55, 0xcc, 0xca, 0x72, 0x81, 0x9b, 0xba, 0x7d, 0xa3, + 0xc7, 0x97, 0xa0, 0xd1, 0x51, 0xd7, 0x5b, 0x84, 0xc3, 0xb5, 0x86, 0xd3, + 0x39, 0xb8, 0x37, 0x82, 0xa7, 0x7c, 0xae, 0x9b, 0xab, 0x59, 0x81, 0x91, + 0x6c, 0x91, 0x5e, 0xcd, 0x9e, 0xa3, 0x6b, 0x9b, 0xae, 0x50, 0xb4, 0x5d, + 0x77, 0xb4, 0xc8, 0xa3, 0x78, 0xa1, 0x35, 0x9b, 0x9a, 0xb7, 0x93, 0xc7, + 0xb0, 0x40, 0x76, 0xb4, 0xc3, 0x4a, 0x8d, 0x5f, 0x3e, 0x31, 0x42, 0xa1, + 0xae, 0xc9, 0xbe, 0x6e, 0x61, 0x61, 0x40, 0x93, 0x8e, 0x62, 0xb4, 0x64, + 0x32, 0xc9, 0x61, 0xa3, 0x3e, 0xd6, 0xc0, 0x8f, 0xa3, 0x3f, 0x9c, 0x81, + 0xa7, 0x5c, 0x82, 0x4c, 0xa8, 0x50, 0x43, 0xb2, 0x75, 0x70, 0x32, 0x7c, + 0xc9, 0x84, 0xcf, 0xaa, 0x72, 0xb2, 0x71, 0x91, 0x6a, 0x56, 0x73, 0xca, + 0x42, 0x5a, 0xc9, 0xab, 0x8e, 0x6d, 0x75, 0xbf, 0xb3, 0xc3, 0x5e, 0xb0, + 0x96, 0x93, 0xbb, 0xba, 0xd5, 0x71, 0x7f, 0xa2, 0x58, 0x32, 0xbd, 0xcf, + 0x8c, 0x75, 0x31, 0x6a, 0xb0, 0x4b, 0x2e, 0xbd, 0x67, 0x4e, 0x7e, 0x82, + 0xad, 0xa3, 0x5b, 0x4c, 0x75, 0x30, 0x64, 0xc8, 0x3d, 0x8f, 0x9a, 0xcd, + 0xd3, 0xbd, 0x68, 0x71, 0x4d, 0x55, 0x2e, 0xd1, 0xa3, 0x6c, 0x9d, 0x92, + 0xbd, 0x68, 0xbf, 0x48, 0x35, 0x5f, 0x52, 0x34, 0x81, 0x70, 0xd1, 0x99, + 0x64, 0x8c, 0x71, 0xb8, 0x77, 0xae, 0x81, 0x9c, 0xc1, 0xa9, 0xb8, 0x5a, + 0x48, 0xbe, 0x82, 0xac, 0xab, 0xbb, 0x86, 0x4f, 0xb5, 0xc3, 0x3c, 0x4b, + 0x82, 0x76, 0x42, 0xc9, 0xb8, 0x4e, 0x8f, 0x3e, 0xaa, 0xac, 0xb5, 0xaf, + 0x61, 0xb4, 0xad, 0xb9, 0x38, 0x50, 0x72, 0xbf, 0xad, 0x81, 0x67, 0x2b, + 0x67, 0xc9, 0xb4, 0xaf, 0x9b, 0x79, 0x46, 0xae, 0x7d, 0xa7, 0xb7, 0x38, + 0xc4, 
0x97, 0x80, 0xcd, 0xbd, 0x58, 0x68, 0xcc, 0x75, 0xd3, 0xc0, 0x93, + 0x9c, 0x86, 0x8d, 0xa2, 0xc7, 0x7d, 0xbe, 0xb4, 0x3b, 0x57, 0x7a, 0x39, + 0x70, 0x64, 0xb2, 0xc9, 0x2e, 0xcc, 0xd5, 0x6e, 0x84, 0x5d, 0x2e, 0xae, + 0x57, 0x82, 0xb9, 0xc6, 0x4a, 0xca, 0x97, 0xbe, 0x6a, 0x62, 0x95, 0x4e, + 0x59, 0xd0, 0x64, 0xab, 0x44, 0x8b, 0xa4, 0x78, 0x7e, 0x4b, 0xde, 0x45, + 0xa9, 0xac, 0x3b, 0x4c, 0xd4, 0xc1, 0x64, 0xca, 0xa8, 0x58, 0x93, 0x50, + 0xa5, 0x34, 0xb2, 0x99, 0x91, 0xc9, 0x70, 0xc6, 0x52, 0x86, 0x8c, 0x39, + 0x42, 0x32, 0x54, 0x71, 0xae, 0x41, 0x79, 0x54, 0x58, 0x9c, 0x48, 0xbb, + 0x4d, 0x70, 0xb2, 0x9a, 0x5d, 0x1f, 0x80, 0x33, 0x84, 0xc7, 0x9c, 0x41, + 0x4d, 0x4c, 0xb6, 0x2c, 0xa8, 0x2b, 0x54, 0x6b, 0xb7, 0xb2, 0xa9, 0xcd, + 0xc4, 0x32, 0x9d, 0x84, 0x4b, 0x8e, 0xc5, 0x6d, 0xb0, 0x3b, 0xbb, 0x88, + 0x49, 0x8e, 0xc3, 0x73, 0x62, 0x60, 0x41, 0x75, 0x7a, 0x5e, 0x7c, 0x5a, + 0x56, 0xb6, 0x3f, 0xc8, 0x3f, 0xa9, 0x49, 0xb0, 0x96, 0x5d, 0x88, 0x30, + 0x4e, 0xbb, 0x8e, 0xa4, 0xa5, 0x5f, 0x2d, 0xa1, 0xbb, 0xae, 0x5f, 0xa4, + 0x79, 0x40, 0x62, 0x9d, 0xb6, 0xd8, 0x62, 0x60, 0x50, 0xa0, 0x64, 0x9e, + 0x81, 0xa5, 0x3c, 0x71, 0x41, 0x6c, 0x89, 0xcf, 0x38, 0x45, 0xbb, 0x8b, + 0xca, 0x36, 0x3e, 0xa0, 0xac, 0xa6, 0x66, 0xa7, 0x86, 0x3e, 0xaf, 0xdc, + 0x91, 0xd3, 0x4e, 0x35, 0x32, 0xb7, 0xc1, 0x9e, 0xbd, 0xd7, 0x39, 0x6a, + 0x57, 0x42, 0x56, 0x8d, 0xd1, 0x52, 0xca, 0x5f, 0x77, 0xb0, 0xba, 0x98, + 0x42, 0x43, 0x7e, 0x54, 0xb1, 0x5a, 0xb1, 0xcb, 0x49, 0x8e, 0x99, 0x50, + 0x40, 0x76, 0xd7, 0xae, 0x2e, 0x4e, 0x9b, 0x9d, 0x45, 0xb5, 0x8d, 0xc8, + 0xc4, 0x70, 0x6d, 0x44, 0x9d, 0x50, 0x5f, 0x74, 0xd5, 0x38, 0x50, 0xba, + 0xce, 0x82, 0x2f, 0x8a, 0xba, 0x50, 0x42, 0x77, 0x72, 0x95, 0x5c, 0x56, + 0x51, 0x54, 0x83, 0x66, 0x53, 0x36, 0xd7, 0xa6, 0x36, 0x58, 0xad, 0x77, + 0xbf, 0x40, 0x8c, 0x73, 0x5f, 0x50, 0x37, 0x72, 0xb6, 0xbd, 0x38, 0xcd, + 0x7f, 0x6e, 0x77, 0x61, 0x64, 0x80, 0x59, 0xcf, 0xc7, 0xa3, 0x78, 0x91, + 0x78, 0xd8, 0xa4, 0x53, 0x5d, 0xb1, 0x65, 0x60, 0x35, 0x4b, 0x98, 0x4a, + 0xad, 0xc4, 0x88, 0x4e, 0xb1, 0xcf, 0xb7, 0xca, 0xa3, 0xa8, 0xb8, 0x61, + 0x75, 0xc0, 0x89, 0xc7, 0xaf, 0x9d, 0xb0, 0x37, 0xba, 0xcf, 0x4d, 0xa1, + 0xa5, 0x7a, 0x63, 0x5b, 0xac, 0x8e, 0xb0, 0x9a, 0x94, 0x97, 0x45, 0xcd, + 0x9e, 0xbc, 0x88, 0x6f, 0x75, 0x85, 0xa0, 0x3f, 0xae, 0xb8, 0x6a, 0x5a, + 0xb8, 0xba, 0xba, 0x52, 0x7e, 0x95, 0xac, 0x35, 0x4d, 0xad, 0xad, 0x58, + 0xbc, 0x8a, 0x92, 0x54, 0xa0, 0x56, 0xb5, 0x3f, 0x51, 0x46, 0x51, 0xbd, + 0x98, 0x5c, 0x4f, 0xb7, 0x8c, 0x7a, 0x36, 0x8d, 0xa2, 0x61, 0x44, 0xd9, + 0xa1, 0xb2, 0x49, 0x6b, 0x3b, 0xc4, 0xd9, 0x28, 0xb7, 0xa7, 0x9f, 0x55, + 0x7d, 0xc1, 0xc4, 0x9e, 0xd2, 0x6d, 0x65, 0x6b, 0x40, 0x9a, 0x41, 0x7f, + 0xac, 0x99, 0x7a, 0x42, 0x62, 0x95, 0x83, 0x79, 0xba, 0x50, 0x43, 0x3f, + 0xca, 0x4d, 0x48, 0x6e, 0xcd, 0xcb, 0xb1, 0x43, 0x54, 0x65, 0x41, 0x5a, + 0x91, 0x33, 0x8a, 0x8c, 0xb8, 0xbc, 0xe5, 0x71, 0x95, 0xa2, 0x4a, 0xb9, + 0xa3, 0x55, 0x40, 0x60, 0x90, 0x9b, 0xc9, 0x47, 0x48, 0xca, 0xae, 0x41, + 0x64, 0x81, 0x5a, 0xc5, 0xb5, 0x99, 0x76, 0x93, 0x38, 0x44, 0x99, 0x9e, + 0x91, 0x94, 0x75, 0x71, 0x93, 0x3a, 0xb7, 0x3c, 0x69, 0xbe, 0x7b, 0x70, + 0x47, 0xbf, 0xd8, 0xb1, 0x5f, 0x79, 0xc8, 0xb2, 0x2e, 0x84, 0x76, 0x90, + 0xb7, 0x68, 0x5e, 0x33, 0xa8, 0x9b, 0x95, 0x56, 0x73, 0xc4, 0xa5, 0xc1, + 0x4b, 0x80, 0x4e, 0x83, 0x7e, 0xb6, 0x3c, 0x84, 0x55, 0x1f, 0x3e, 0x51, + 0x4f, 0x8a, 0xd0, 0xd2, 0x67, 0xc2, 0x74, 0x54, 0x59, 0x84, 0x90, 0x8e, + 0xd1, 0x3b, 0xb6, 0x71, 0xd0, 0xb9, 0x69, 0x85, 0x8a, 0x68, 0xca, 0x5a, + 0x88, 0x98, 0x91, 0x69, 0xc5, 0xbb, 0x42, 0x57, 0xcc, 0x75, 0xd7, 0xa5, + 0xc1, 
0xac, 0xbf, 0x66, 0xb2, 0x49, 0xba, 0x52, 0xc5, 0x5d, 0xab, 0xd4, + 0x46, 0x42, 0x91, 0x5e, 0xd5, 0x77, 0xbf, 0xac, 0x6a, 0x5a, 0x68, 0x6e, + 0xb4, 0xae, 0xb7, 0x71, 0x45, 0x4b, 0x70, 0x2e, 0x3c, 0xaf, 0xcb, 0x98, + 0x3a, 0x61, 0x83, 0x8f, 0x40, 0x53, 0x94, 0xa0, 0x38, 0x9e, 0x6f, 0x5d, + 0xb3, 0xad, 0x47, 0x43, 0xa9, 0xd1, 0xb6, 0x82, 0xb3, 0x41, 0x3b, 0x40, + 0x9a, 0x88, 0x7e, 0x92, 0x90, 0xde, 0xcf, 0x31, 0x9c, 0x52, 0x42, 0x92, + 0x7d, 0x55, 0xaf, 0x52, 0x9a, 0x4e, 0x67, 0xab, 0x3d, 0x2a, 0x34, 0x66, + 0x6e, 0x8e, 0xba, 0x79, 0x41, 0x44, 0x45, 0x98, 0x47, 0x88, 0xb7, 0xab, + 0xc5, 0x9d, 0x52, 0x8a, 0x8a, 0x84, 0x4c, 0x4a, 0x71, 0x69, 0x8a, 0x93, + 0xbb, 0x65, 0xb2, 0xc2, 0x8a, 0x58, 0x77, 0xba, 0x4f, 0x4a, 0xa8, 0x70, + 0x8d, 0x6f, 0x74, 0x54, 0xae, 0xcd, 0x4d, 0xa2, 0x84, 0x30, 0xb4, 0x29, + 0x4a, 0x36, 0x67, 0x76, 0x3a, 0x4c, 0xc6, 0x8a, 0x6a, 0x1b, 0x78, 0x55, + 0x97, 0xa9, 0x3a, 0xc8, 0xb1, 0x88, 0x7e, 0x82, 0xc9, 0xad, 0x75, 0x73, + 0x51, 0x75, 0x7f, 0xad, 0x70, 0x86, 0x93, 0x93, 0x98, 0x6f, 0xbd, 0x75, + 0xc8, 0xa0, 0x49, 0x58, 0x66, 0x88, 0x6c, 0x83, 0x67, 0x4e, 0x78, 0x75, + 0x3c, 0x4f, 0x6c, 0xa3, 0x3f, 0xcd, 0x76, 0xa4, 0x7b, 0x8b, 0x96, 0xbf, + 0x41, 0x9a, 0x43, 0x6f, 0xbc, 0x68, 0x6c, 0x88, 0xbe, 0x37, 0xb1, 0x37, + 0x91, 0x3c, 0xb6, 0xa7, 0x36, 0x89, 0x60, 0xad, 0x79, 0x43, 0x3b, 0x33, + 0x57, 0xb9, 0x5b, 0x35, 0x96, 0xbe, 0xc8, 0xb4, 0x8d, 0xaa, 0x6a, 0xc8, + 0x40, 0x38, 0x80, 0x9f, 0xa2, 0xa6, 0x6c, 0x2f, 0x65, 0x72, 0xac, 0x3e, + 0xca, 0x95, 0x39, 0x45, 0x3f, 0xba, 0xd4, 0xd1, 0x9c, 0x40, 0x47, 0x3a, + 0xaf, 0x6a, 0x3b, 0x84, 0x8b, 0x61, 0x8d, 0x7a, 0x4d, 0xa2, 0xba, 0x88, + 0x3e, 0x7c, 0x42, 0x88, 0xa6, 0x91, 0xb4, 0x3b, 0xac, 0xc4, 0x39, 0x37, + 0x7c, 0x49, 0xa5, 0xb3, 0x94, 0x62, 0xc3, 0xb8, 0xa8, 0x78, 0xd0, 0x7c, + 0x84, 0x60, 0xaf, 0xd2, 0x45, 0x51, 0xca, 0xa8, 0x49, 0xa4, 0x94, 0x8e, + 0x98, 0x3a, 0x32, 0x98, 0x46, 0x79, 0xb2, 0x5b, 0x55, 0xcc, 0x84, 0xaa, + 0x73, 0xa4, 0xa1, 0x6f, 0xbb, 0x82, 0xbc, 0x3e, 0xc8, 0xb2, 0x6b, 0x3f, + 0x3d, 0x49, 0x4a, 0x86, 0xc2, 0xbf, 0xc0, 0x87, 0x4a, 0x47, 0xb7, 0xbf, + 0xc3, 0xa4, 0x7d, 0xc2, 0x85, 0x3a, 0x4a, 0x80, 0x3a, 0x98, 0xc8, 0x43, + 0x3f, 0x6c, 0x34, 0x79, 0x41, 0xb2, 0x47, 0x2e, 0xa1, 0x90, 0xb2, 0x43, + 0x49, 0x3b, 0xa5, 0x55, 0xb4, 0x6f, 0x30, 0x66, 0x7c, 0x68, 0x5c, 0x67, + 0x9b, 0x6d, 0x58, 0x8b, 0xa9, 0x39, 0x49, 0x3e, 0x9c, 0x83, 0x5e, 0x95, + 0x75, 0xcb, 0x31, 0x80, 0xa5, 0xb0, 0xa6, 0x40, 0x3c, 0xbe, 0xa5, 0x9d, + 0xb3, 0x5e, 0x94, 0x8e, 0x56, 0x97, 0x5d, 0x68, 0x32, 0xb6, 0x87, 0xa0, + 0xc9, 0x39, 0xcf, 0x6a, 0x7c, 0x6c, 0x92, 0x8e, 0x56, 0x68, 0xae, 0x47, + 0x5d, 0x31, 0x41, 0x5d, 0x6f, 0xaf, 0x2f, 0x7e, 0xc7, 0x52, 0x8b, 0x9e, + 0xa4, 0x43, 0x3b, 0x45, 0xc7, 0x97, 0x92, 0x84, 0x72, 0x94, 0x36, 0x3f, + 0xb4, 0xc4, 0xb4, 0xb7, 0x3c, 0x41, 0xc3, 0x58, 0x4a, 0x6e, 0xcf, 0x81, + 0xd2, 0xcd, 0x48, 0x5a, 0x8f, 0x48, 0x3e, 0xba, 0x83, 0x77, 0x9a, 0x44, + 0x50, 0x94, 0x95, 0x88, 0x8f, 0x4a, 0x3e, 0x41, 0xad, 0x9b, 0x40, 0x94, + 0x8f, 0x4b, 0xa4, 0xa1, 0xd1, 0x3c, 0x4a, 0xbf, 0xc3, 0x30, 0x75, 0x61, + 0x52, 0xb5, 0x85, 0x4f, 0x73, 0xd1, 0x84, 0x64, 0x4a, 0x77, 0x9f, 0x59, + 0x43, 0x32, 0x9a, 0x93, 0x64, 0x60, 0x5c, 0xa3, 0xcb, 0x7a, 0x65, 0x56, + 0x76, 0xaf, 0x66, 0xa8, 0x62, 0xb5, 0xa3, 0x8a, 0x45, 0xbb, 0x7a, 0x64, + 0x32, 0x76, 0x35, 0xb2, 0xba, 0x75, 0x9b, 0x6f, 0x40, 0x58, 0xc8, 0x6e, + 0xb9, 0x80, 0x5e, 0xc9, 0x2a, 0x98, 0xb7, 0x6b, 0x37, 0x95, 0xa1, 0x92, + 0x55, 0xbb, 0xa7, 0x54, 0x3e, 0x72, 0x4c, 0x60, 0xce, 0x6a, 0xa1, 0x6c, + 0x86, 0x3b, 0x66, 0x5c, 0x96, 0x3c, 0x93, 0xb2, 0xa2, 0x69, 0x5d, 0xd0, + 0x5f, 
0x74, 0xc5, 0x99, 0x56, 0x48, 0xb5, 0x9c, 0x48, 0x46, 0xd0, 0x78, + 0xa7, 0x43, 0x73, 0x56, 0x95, 0x80, 0x5a, 0x8a, 0x85, 0xad, 0xb6, 0xb3, + 0x69, 0x75, 0x64, 0x5a, 0x89, 0x3b, 0xcb, 0xbb, 0x89, 0x9e, 0x9c, 0x8a, + 0xb3, 0x71, 0xbd, 0xbe, 0x59, 0xad, 0x30, 0x46, 0x42, 0x45, 0xaf, 0xbb, + 0x5f, 0xba, 0x79, 0x52, 0x46, 0xba, 0xce, 0x67, 0xc3, 0xb3, 0x58, 0x77, + 0x44, 0x77, 0x92, 0xcd, 0x77, 0x45, 0x3a, 0x60, 0x89, 0x80, 0x36, 0xb0, + 0xac, 0x44, 0x65, 0x68, 0x3f, 0x90, 0x92, 0xa1, 0xc4, 0x54, 0xa5, 0xab, + 0x7e, 0xbb, 0xba, 0x93, 0x7e, 0x46, 0x3c, 0x64, 0x69, 0x60, 0x55, 0x47, + 0x68, 0x9d, 0x86, 0x56, 0xc6, 0x8f, 0x8c, 0x2d, 0x34, 0x90, 0x61, 0x5f, + 0xcd, 0x91, 0xbb, 0x81, 0x91, 0xb8, 0x88, 0x6a, 0xcf, 0x6c, 0x99, 0x41, + 0x6a, 0x2f, 0xc1, 0x9c, 0xb7, 0xab, 0x9e, 0xc6, 0xc6, 0xcc, 0xa4, 0x78, + 0xb2, 0x55, 0x7e, 0x82, 0x87, 0x69, 0x88, 0x63, 0x33, 0x64, 0x74, 0x93, + 0xc0, 0x38, 0x70, 0x97, 0x7a, 0x56, 0x89, 0xca, 0xc7, 0x81, 0x88, 0xa4, + 0x85, 0x44, 0x4f, 0xb0, 0xca, 0x3d, 0x3e, 0xb6, 0x9d, 0x9c, 0x84, 0xad, + 0x84, 0x9f, 0x67, 0xc8, 0x3b, 0xbf, 0x88, 0x87, 0x6b, 0x36, 0x49, 0x5d, + 0x92, 0x39, 0xc9, 0xb8, 0x98, 0x4a, 0x8d, 0xa3, 0x3f, 0x49, 0x47, 0xcc, + 0x72, 0xce, 0x46, 0xc9, 0xcc, 0x85, 0xa5, 0xb8, 0x35, 0x76, 0x6b, 0xd9, + 0x92, 0x89, 0xca, 0x3a, 0xad, 0x6d, 0x3e, 0xaa, 0xb4, 0xab, 0xa8, 0x47, + 0xce, 0x7d, 0x38, 0x43, 0x6c, 0xae, 0x5f, 0x36, 0x5a, 0xcd, 0x72, 0x84, + 0x8a, 0xc2, 0x76, 0x38, 0x8d, 0xab, 0x5e, 0x3a, 0x71, 0x3f, 0xa6, 0x8b, + 0x8e, 0x49, 0x8a, 0x62, 0x91, 0x61, 0xb2, 0xa3, 0xa1, 0x82, 0xb1, 0xb5, + 0x71, 0x92, 0x49, 0x72, 0x51, 0x5e, 0x2e, 0x44, 0x59, 0x94, 0xac, 0x3b, + 0x81, 0x51, 0x81, 0x7a, 0x71, 0x55, 0x95, 0x94, 0x72, 0x9d, 0x73, 0x83, + 0xa5, 0x52, 0x90, 0x53, 0x79, 0x37, 0x74, 0x8e, 0x6b, 0x55, 0xb1, 0x5f, + 0x84, 0xc6, 0x86, 0xc7, 0x45, 0x9e, 0x39, 0x6b, 0x33, 0xd0, 0x5a, 0x84, + 0x73, 0xab, 0x76, 0x40, 0x3f, 0x3e, 0xc7, 0x3b, 0xae, 0x8c, 0x4b, 0x8b, + 0xa4, 0x7b, 0x89, 0x9d, 0x36, 0xa2, 0xa0, 0x38, 0xa2, 0x3f, 0x99, 0x5a, + 0x44, 0x35, 0xbc, 0x87, 0xbe, 0xc7, 0x8a, 0x86, 0xb0, 0x86, 0xb8, 0xc1, + 0x78, 0x37, 0x55, 0x91, 0xb8, 0xb2, 0xa5, 0x79, 0x5a, 0x6d, 0xa5, 0xb7, + 0xab, 0xac, 0x8e, 0x94, 0x61, 0x9e, 0x38, 0x87, 0x47, 0x80, 0x31, 0x50, + 0x49, 0x42, 0x7e, 0x93, 0xb2, 0x95, 0x4f, 0x9a, 0xa1, 0x7a, 0xb0, 0xbc, + 0x9f, 0x62, 0xcd, 0x6f, 0x61, 0x85, 0xb4, 0x54, 0x5d, 0xab, 0xb1, 0xa2, + 0x35, 0xb2, 0x6f, 0x8a, 0x5d, 0x47, 0x58, 0x66, 0x6b, 0x76, 0xcf, 0x41, + 0x9f, 0x7b, 0x9d, 0x94, 0xa5, 0x68, 0xce, 0x44, 0xc3, 0x8e, 0x35, 0xab, + 0xa7, 0xa5, 0x4f, 0x87, 0xa1, 0x7a, 0xbc, 0x96, 0xc9, 0x43, 0xd2, 0x7d, + 0x62, 0xc7, 0x39, 0x3d, 0xbe, 0xcd, 0x5d, 0xc0, 0xbf, 0x6b, 0xd1, 0x82, + 0xa6, 0x99, 0x76, 0x36, 0x63, 0x7d, 0x46, 0x31, 0xad, 0x56, 0x31, 0x56, + 0xa6, 0x41, 0xc8, 0xc4, 0x49, 0x9f, 0x35, 0x3c, 0x80, 0x59, 0xa5, 0xa9, + 0x92, 0x6c, 0xad, 0x81, 0x94, 0x97, 0x78, 0x56, 0x76, 0x9d, 0xcc, 0x5b, + 0x8f, 0x68, 0x86, 0x50, 0xd1, 0x4a, 0x6a, 0xb2, 0x7e, 0x36, 0x6a, 0x7e, + 0x4a, 0x6e, 0x67, 0x89, 0x87, 0x9b, 0x88, 0x77, 0x4f, 0x56, 0x96, 0x39, + 0x79, 0x93, 0x98, 0x33, 0x5e, 0x49, 0x59, 0x47, 0xc3, 0x79, 0x52, 0x3c, + 0x61, 0xba, 0x66, 0x67, 0x4f, 0x51, 0x93, 0xb7, 0x9a, 0xb1, 0xc7, 0xd4, + 0x4d, 0xb5, 0xab, 0x42, 0x8a, 0x82, 0xb9, 0xa6, 0x8a, 0x82, 0xc7, 0x83, + 0x8a, 0x99, 0xa9, 0xac, 0x94, 0xa7, 0x85, 0x64, 0x6a, 0x75, 0x7e, 0x7e, + 0x54, 0x40, 0x52, 0x3a, 0xc6, 0x80, 0xa1, 0xc6, 0x99, 0xb0, 0x15, 0xc1, + 0xa3, 0xa1, 0x90, 0x75, 0x53, 0x69, 0xb7, 0x71, 0x59, 0x3b, 0x55, 0x5c, + 0x8e, 0x6d, 0x52, 0xbf, 0x99, 0xaa, 0xd1, 0xc1, 0x8e, 0xaa, 0x9a, 0x9f, + 0xcc, 
0xa5, 0x4a, 0x5f, 0x6e, 0x66, 0x84, 0xc3, 0x95, 0x36, 0x68, 0x48, + 0x7a, 0x55, 0x9a, 0x97, 0xaf, 0xb8, 0x71, 0xc5, 0xc0, 0x72, 0x67, 0x54, + 0xbc, 0x86, 0x4d, 0xa6, 0x8f, 0x46, 0x85, 0x88, 0xba, 0x61, 0xb1, 0x53, + 0x84, 0x5d, 0x95, 0x75, 0x59, 0xbc, 0x33, 0x54, 0x5f, 0x99, 0xd1, 0x64, + 0x3f, 0x86, 0x59, 0x6c, 0xbe, 0x8a, 0xa8, 0xad, 0x96, 0xa0, 0x38, 0x7f, + 0x60, 0xc1, 0xb4, 0x2e, 0x92, 0x52, 0xa9, 0x49, 0x42, 0x51, 0x72, 0xaf, + 0x6b, 0xce, 0xa9, 0xcd, 0x47, 0xa6, 0x37, 0x31, 0xba, 0x7d, 0xc0, 0x9f, + 0x7c, 0x9f, 0x30, 0x7e, 0xc8, 0x64, 0x42, 0xbc, 0xad, 0x2d, 0x73, 0xa5, + 0x80, 0x42, 0x7c, 0x62, 0xb5, 0x9e, 0x7f, 0x4a, 0x49, 0xc5, 0x33, 0xca, + 0x33, 0x5f, 0x3d, 0x81, 0xab, 0x2e, 0xb5, 0xd0, 0xbd, 0x71, 0xa3, 0xa7, + 0x5f, 0xc7, 0x22, 0xa0, 0xcf, 0xb1, 0x37, 0xa6, 0x95, 0x4c, 0x79, 0x9a, + 0xcb, 0xa8, 0xc7, 0xb3, 0xd8, 0xda, 0x97, 0x55, 0x3a, 0x65, 0x66, 0xa0, + 0x94, 0x40, 0x7b, 0x81, 0x79, 0x87, 0x9c, 0x49, 0xb8, 0x2d, 0x58, 0xc1, + 0x9f, 0xa1, 0x6d, 0x77, 0x29, 0x2b, 0x47, 0x6e, 0x79, 0x75, 0x80, 0xa4, + 0x78, 0x79, 0xa2, 0x65, 0x7f, 0xa3, 0x8f, 0xb4, 0x49, 0xbc, 0x73, 0x38, + 0x41, 0xab, 0xa0, 0xce, 0x73, 0xb9, 0x64, 0x6f, 0x9b, 0x9c, 0x74, 0x7b, + 0xa6, 0x5d, 0x55, 0x9c, 0x53, 0x6e, 0xa6, 0x36, 0x69, 0x46, 0x4d, 0x74, + 0xb1, 0xa8, 0xc8, 0x92, 0x33, 0x88, 0x79, 0x9d, 0xb2, 0x39, 0x68, 0x6d, + 0xc0, 0x65, 0xac, 0x94, 0x73, 0xa1, 0x50, 0xa7, 0xa0, 0x4d, 0xb5, 0xb7, + 0x8d, 0x5b, 0x71, 0x6b, 0xb9, 0xbf, 0x5f, 0x6e, 0xb4, 0x8e, 0xbc, 0x83, + 0x61, 0x58, 0x88, 0xa6, 0x62, 0xab, 0x64, 0xc7, 0xbd, 0xa9, 0xa9, 0x4d, + 0x76, 0xb7, 0x52, 0xa6, 0x7c, 0xa8, 0x49, 0xb8, 0xb4, 0x89, 0x71, 0xd9, + 0xa1, 0xa6, 0x60, 0xc1, 0xa1, 0x3b, 0x8e, 0x50, 0x4a, 0x3f, 0x87, 0x82, + 0xd1, 0x55, 0x83, 0x66, 0x64, 0xc9, 0x76, 0x4d, 0x7b, 0xe8, 0x84, 0x88, + 0x9c, 0xcc, 0xa5, 0x6a, 0x7e, 0x7f, 0x80, 0xc1, 0x3b, 0x46, 0xa1, 0x4f, + 0x7a, 0x9f, 0x3b, 0x88, 0x6e, 0xa5, 0x4c, 0x82, 0xba, 0xc9, 0x7d, 0x4f, + 0xcf, 0x67, 0x7e, 0x89, 0x96, 0x7a, 0x3a, 0xcc, 0xac, 0x87, 0xdb, 0x77, + 0xbf, 0xa2, 0x71, 0xa3, 0x70, 0x6a, 0x24, 0x64, 0xab, 0xda, 0x9e, 0xc1, + 0x77, 0x51, 0xaf, 0x63, 0x99, 0x94, 0x98, 0x47, 0x77, 0x72, 0x4a, 0x63, + 0xbf, 0x5c, 0xce, 0x57, 0x6c, 0xd1, 0x63, 0x57, 0x8b, 0xbd, 0x50, 0x51, + 0x97, 0x96, 0x7a, 0x68, 0x75, 0x9a, 0x38, 0x82, 0x30, 0xa3, 0x50, 0x65, + 0xaf, 0xa6, 0x66, 0x40, 0xb2, 0xb5, 0x36, 0x99, 0xbb, 0x31, 0x4b, 0x8e, + 0x31, 0x61, 0x3f, 0x39, 0x5c, 0xa9, 0x42, 0x53, 0x6e, 0x89, 0x8c, 0x53, + 0x88, 0x7e, 0x9f, 0x82, 0x68, 0x31, 0xca, 0x77, 0xc1, 0x92, 0xc4, 0x57, + 0x56, 0x97, 0x5a, 0x50, 0x86, 0xcd, 0x7e, 0x8d, 0x55, 0xb3, 0xcf, 0xb2, + 0x9c, 0xa2, 0x34, 0xb7, 0xaa, 0x6f, 0xa5, 0x4c, 0xa2, 0xb7, 0x6f, 0xa4, + 0x2f, 0x8d, 0xb7, 0x82, 0x98, 0x40, 0xc3, 0x48, 0x8d, 0x59, 0x95, 0x74, + 0xe1, 0xbf, 0xcf, 0x51, 0x3a, 0xa8, 0x6a, 0x51, 0x8d, 0x36, 0x43, 0x66, + 0xd5, 0xa1, 0x90, 0x91, 0x50, 0x47, 0xca, 0xb0, 0xad, 0x47, 0x90, 0x83, + 0x6f, 0x5b, 0x6f, 0xb8, 0xc3, 0x91, 0x4a, 0x95, 0x8d, 0x60, 0xb9, 0x6b, + 0xb9, 0x49, 0x76, 0x77, 0xa4, 0x7b, 0x48, 0x49, 0xbf, 0xb8, 0x46, 0xc5, + 0xc9, 0x53, 0x7c, 0x5c, 0xa8, 0x82, 0x5d, 0x9f, 0x3f, 0x87, 0x65, 0xa3, + 0x94, 0x40, 0x96, 0x51, 0xcf, 0x89, 0x73, 0x9a, 0xbe, 0x81, 0x5d, 0x8e, + 0xc4, 0xc8, 0xbc, 0xa2, 0xc9, 0xb0, 0x6d, 0xa2, 0xa9, 0x5e, 0x8d, 0x3e, + 0xaf, 0x5a, 0x6b, 0xbc, 0x6e, 0x41, 0x75, 0x40, 0xb1, 0x75, 0xbf, 0x87, + 0x5f, 0x50, 0xd6, 0x69, 0xb3, 0x89, 0x50, 0x96, 0x4c, 0x9f, 0x81, 0x78, + 0x69, 0x75, 0xcd, 0xa7, 0xa5, 0xbf, 0xc4, 0x73, 0xa7, 0xac, 0x54, 0x9b, + 0x9d, 0x8f, 0x3c, 0xc7, 0x89, 0x3f, 0xb2, 0x9a, 0xb1, 0x43, 0x9f, 0x2f, + 0x57, 
0x98, 0x9f, 0x63, 0xaf, 0x5b, 0x55, 0x61, 0x33, 0xbe, 0x3d, 0x87, + 0x8d, 0xc9, 0x44, 0x49, 0x5f, 0xaf, 0x5e, 0x8d, 0x74, 0x34, 0xcc, 0xc0, + 0xb8, 0xa1, 0x3c, 0x5d, 0x88, 0x6c, 0xa7, 0x7f, 0x68, 0xba, 0x3d, 0x7e, + 0x60, 0x2a, 0x92, 0x60, 0x51, 0xa8, 0x4f, 0x43, 0x93, 0x7b, 0x3d, 0x85, + 0x2f, 0x70, 0x75, 0xb6, 0x8d, 0xd2, 0x9c, 0x53, 0x93, 0x53, 0x86, 0x9e, + 0x80, 0xd0, 0x4b, 0x93, 0x86, 0x71, 0x9e, 0x55, 0x4a, 0xb2, 0x39, 0xb9, + 0x6f, 0xbd, 0x39, 0xc7, 0x9e, 0x49, 0x5c, 0xa5, 0xc9, 0x79, 0x48, 0x8c, + 0x38, 0xb8, 0x33, 0x3d, 0xa0, 0x4e, 0xa9, 0x62, 0xb8, 0x2f, 0x5b, 0x5a, + 0xb8, 0x8a, 0x8f, 0xb7, 0xa2, 0x3b, 0x62, 0x5a, 0xcf, 0xba, 0x7c, 0xa8, + 0x37, 0xb8, 0x5c, 0x73, 0x30, 0x98, 0xaf, 0x99, 0x9e, 0xd3, 0x5b, 0x83, + 0xaa, 0xd0, 0xbf, 0x76, 0xa7, 0x84, 0xc8, 0x5f, 0xbf, 0x3b, 0x9c, 0x7e, + 0x30, 0xd1, 0x38, 0xb7, 0xcb, 0xab, 0x58, 0x47, 0xc7, 0x72, 0x97, 0x9b, + 0x4e, 0x7b, 0x9c, 0xca, 0xb7, 0x46, 0x92, 0x8e, 0x95, 0x96, 0x97, 0xac, + 0x8d, 0xcc, 0x5d, 0xb8, 0x72, 0x41, 0xc4, 0x5f, 0x5c, 0x92, 0x89, 0x99, + 0xa3, 0xd2, 0x35, 0x47, 0x8d, 0x53, 0x8b, 0x3f, 0x5d, 0x72, 0x36, 0x44, + 0x84, 0x55, 0x9e, 0x64, 0x94, 0xb5, 0x77, 0x9a, 0xaf, 0x2b, 0x4e, 0xa0, + 0xc5, 0x4f, 0xc7, 0x73, 0x8a, 0x78, 0xc6, 0x9e, 0x99, 0x3f, 0x56, 0x30, + 0x56, 0x3e, 0x87, 0x53, 0x7b, 0x77, 0xa6, 0x91, 0x7e, 0xcc, 0x37, 0x78, + 0x4c, 0x6f, 0xa2, 0x6f, 0xc6, 0x70, 0xbf, 0xa5, 0x9b, 0xb6, 0xb2, 0x53, + 0x51, 0x4e, 0x94, 0x5f, 0xcd, 0xad, 0x83, 0xb3, 0x64, 0xa7, 0x80, 0x90, + 0x3d, 0x87, 0x63, 0x7a, 0x90, 0x8c, 0x85, 0x88, 0x78, 0x51, 0x83, 0x40, + 0x49, 0x9f, 0xce, 0x69, 0x31, 0xc3, 0x8d, 0xaa, 0x5c, 0xaa, 0x4a, 0x5f, + 0x97, 0xba, 0x48, 0x67, 0xa4, 0x59, 0xa2, 0xb2, 0x79, 0xa9, 0x76, 0xb6, + 0x3e, 0x6d, 0x7f, 0x7e, 0x4e, 0x73, 0x45, 0x5d, 0x8e, 0x52, 0x86, 0xa8, + 0x67, 0x48, 0x63, 0x81, 0x86, 0xb1, 0x9e, 0x8f, 0xbd, 0x38, 0x9c, 0xd3, + 0xd3, 0x40, 0x44, 0xcc, 0x62, 0x5c, 0x95, 0xc0, 0x46, 0x96, 0x86, 0x99, + 0xcc, 0x69, 0xac, 0x63, 0x56, 0x99, 0xc8, 0x82, 0x5d, 0x6d, 0xac, 0x59, + 0x5f, 0x43, 0xbf, 0x53, 0x92, 0x57, 0x34, 0x4a, 0x83, 0x8a, 0x8c, 0x87, + 0x85, 0x91, 0x66, 0x43, 0x47, 0x53, 0x8c, 0x73, 0xb9, 0x34, 0xc0, 0x62, + 0x7e, 0x87, 0x64, 0x91, 0xcc, 0x37, 0xb3, 0xad, 0x82, 0xb2, 0x6c, 0xc8, + 0x5e, 0xc7, 0xb6, 0x89, 0xc3, 0x5e, 0xab, 0x9f, 0xae, 0x56, 0x48, 0x39, + 0xc4, 0x9d, 0xcd, 0x99, 0x44, 0x90, 0xca, 0xc2, 0xa0, 0xc3, 0x5b, 0x3f, + 0x90, 0x8a, 0x84, 0x8d, 0x56, 0x88, 0xba, 0x83, 0xa8, 0x5a, 0x5a, 0x78, + 0x7f, 0xac, 0x8b, 0xc6, 0x48, 0x3e, 0x81, 0xb6, 0xcc, 0x54, 0x4f, 0x55, + 0x73, 0x7f, 0xae, 0xc2, 0x4a, 0x91, 0x79, 0xc7, 0xca, 0x72, 0x8b, 0x75, + 0x5a, 0xa0, 0x76, 0xa7, 0x6c, 0x39, 0x8a, 0x56, 0x79, 0xad, 0x98, 0x45, + 0x82, 0x7d, 0xcc, 0x6f, 0x89, 0xc7, 0x6e, 0xc2, 0xa0, 0x55, 0x35, 0xa8, + 0xaa, 0x8d, 0x48, 0x6e, 0x3a, 0xae, 0x58, 0x87, 0x31, 0xb1, 0x44, 0xad, + 0x41, 0x3d, 0x80, 0x8d, 0x52, 0x9c, 0x8f, 0x35, 0xb2, 0x38, 0x6b, 0xc5, + 0xcc, 0xac, 0x70, 0x4d, 0x56, 0x37, 0x97, 0x50, 0x32, 0xc9, 0x49, 0x8c, + 0x66, 0x54, 0x5d, 0xb4, 0x61, 0xca, 0xb4, 0x92, 0x5f, 0x49, 0x45, 0x5e, + 0x57, 0xce, 0xa8, 0x73, 0x38, 0x83, 0x5a, 0xad, 0xa3, 0x67, 0x4d, 0x8b, + 0xca, 0xa6, 0x3e, 0xa8, 0x6c, 0x54, 0xb2, 0x72, 0x44, 0x86, 0x96, 0x68, + 0x74, 0xa6, 0x8a, 0xb5, 0x9c, 0x45, 0xd1, 0x55, 0xae, 0xbb, 0x55, 0xad, + 0x5f, 0x6a, 0xb0, 0x41, 0x98, 0x58, 0x79, 0x51, 0x6f, 0xbb, 0x98, 0x39, + 0x67, 0x61, 0x3a, 0x95, 0x46, 0x65, 0xc6, 0xcf, 0xc7, 0xcf, 0x4b, 0x95, + 0x8a, 0x3e, 0x83, 0x3a, 0x76, 0x4b, 0x6a, 0x90, 0x33, 0x71, 0x89, 0x6a, + 0x5d, 0x49, 0x81, 0x84, 0x53, 0xc7, 0xb0, 0x55, 0x5a, 0x61, 0x5c, 0x7b, + 0x93, 
0x56, 0xd2, 0x36, 0x93, 0x46, 0xb6, 0x32, 0x4e, 0x93, 0x56, 0x53, + 0xbc, 0x7a, 0xc9, 0x6f, 0x50, 0x45, 0x97, 0xac, 0x58, 0xb7, 0x69, 0xaa, + 0x50, 0x7f, 0x52, 0x63, 0x8b, 0xbb, 0x8b, 0xb9, 0x83, 0x64, 0x9d, 0xc6, + 0x66, 0x36, 0x36, 0xce, 0x45, 0x53, 0x4c, 0x5f, 0x50, 0x4f, 0x5b, 0x84, + 0xa1, 0x3f, 0x8b, 0x38, 0x4b, 0x39, 0xb2, 0x95, 0x9c, 0x46, 0x46, 0x60, + 0x6d, 0x45, 0xba, 0x59, 0x7b, 0xb2, 0x49, 0xba, 0x7e, 0xb7, 0x4f, 0x93, + 0x6f, 0x58, 0x4e, 0x66, 0x9a, 0x5f, 0x3f, 0x3d, 0x72, 0x83, 0x87, 0x97, + 0x62, 0x87, 0x88, 0x54, 0xbb, 0x6c, 0x2b, 0x8d, 0x3b, 0xac, 0x39, 0xb2, + 0xa9, 0x38, 0xcc, 0xcd, 0x58, 0x84, 0x87, 0x36, 0x51, 0x96, 0x73, 0xb6, + 0xaf, 0x94, 0xbd, 0x82, 0x42, 0x8a, 0x45, 0x5a, 0x83, 0x8f, 0x4d, 0x51, + 0x4a, 0x45, 0x5f, 0x82, 0x66, 0x80, 0x59, 0x32, 0xa5, 0xbd, 0x4e, 0x67, + 0x59, 0x94, 0x80, 0xbb, 0xb2, 0x88, 0x8c, 0xbd, 0x38, 0x5d, 0x39, 0x78, + 0x54, 0x99, 0x34, 0x3c, 0x7b, 0x89, 0xd9, 0x71, 0x61, 0xa7, 0x38, 0xaa, + 0x6c, 0x54, 0x44, 0xc9, 0x66, 0x66, 0xa4, 0x9c, 0x56, 0x4a, 0x42, 0x3d, + 0xa8, 0x65, 0x47, 0x76, 0x70, 0x47, 0x80, 0xbb, 0x55, 0x8d, 0x43, 0x3a, + 0xac, 0x6d, 0xb3, 0x8b, 0x80, 0x44, 0x86, 0x4f, 0x48, 0x62, 0x64, 0xa1, + 0x80, 0x65, 0x5a, 0x9d, 0x97, 0x4c, 0xa8, 0x72, 0xce, 0xc3, 0x8b, 0x3d, + 0x85, 0xd5, 0xb1, 0x6a, 0x39, 0x9c, 0x58, 0x71, 0x50, 0xc6, 0xc0, 0x61, + 0x6c, 0x7b, 0x33, 0x68, 0x42, 0xb6, 0x65, 0xa8, 0x8a, 0x93, 0xc8, 0x57, + 0x67, 0x72, 0x81, 0x96, 0x88, 0x34, 0x3a, 0x99, 0x64, 0xbf, 0x92, 0x75, + 0x4d, 0x68, 0x4a, 0x6a, 0xb3, 0x53, 0x6a, 0xb7, 0xcb, 0xc6, 0x40, 0x30, + 0xd3, 0x44, 0x79, 0xbe, 0x50, 0xba, 0xba, 0xba, 0xbb, 0x83, 0x33, 0x8b, + 0x4e, 0x97, 0x5a, 0x59, 0xb8, 0xa2, 0x5b, 0xa3, 0x65, 0x5f, 0x43, 0x9e, + 0xc9, 0x3f, 0xc8, 0xb2, 0x4c, 0xa7, 0x38, 0x63, 0x4f, 0xb7, 0x35, 0x74, + 0xa1, 0x8b, 0xbf, 0xb1, 0x98, 0x79, 0x64, 0x8d, 0x50, 0x97, 0x97, 0x65, + 0x60, 0xb1, 0x9d, 0x48, 0x45, 0x93, 0x85, 0x7f, 0xd2, 0x77, 0x75, 0x7a, + 0xbd, 0x3c, 0x6d, 0x6a, 0x98, 0xc2, 0x97, 0x89, 0xa3, 0xab, 0x65, 0x6e, + 0x4e, 0xcd, 0x4f, 0x8a, 0xb6, 0x57, 0xab, 0x4b, 0x84, 0x98, 0x98, 0x40, + 0x75, 0x63, 0xbc, 0xca, 0xba, 0xd0, 0x79, 0x69, 0x85, 0x82, 0x41, 0x66, + 0x5f, 0xc5, 0x77, 0xb5, 0xce, 0x99, 0x34, 0xb4, 0x50, 0x68, 0x94, 0xca, + 0x30, 0x7a, 0xd2, 0x63, 0x88, 0x66, 0x3c, 0x90, 0x59, 0x9e, 0x95, 0x65, + 0xd2, 0x7d, 0x44, 0xb1, 0x88, 0x46, 0xb8, 0xbb, 0xda, 0x49, 0xcc, 0x4b, + 0x9a, 0x43, 0xad, 0xaa, 0x72, 0x46, 0xa4, 0xe9, 0x55, 0x41, 0x47, 0xd1, + 0x8a, 0xbb, 0xa2, 0x4e, 0x84, 0x32, 0x7c, 0x6a, 0x9b, 0xb0, 0xb9, 0x92, + 0x83, 0x67, 0x4d, 0x26, 0x9b, 0x9b, 0x72, 0x35, 0xaf, 0xab, 0x5b, 0xb0, + 0xc8, 0x8d, 0x42, 0xe0, 0x96, 0xd2, 0xa2, 0xb6, 0x94, 0x42, 0xa8, 0x6d, + 0x53, 0x98, 0xae, 0x60, 0x36, 0xa5, 0xa7, 0x3c, 0x37, 0x2e, 0xdb, 0x82, + 0xa9, 0x78, 0x76, 0x6a, 0xbd, 0x3d, 0xa8, 0x9b, 0xd1, 0x4d, 0x94, 0x51, + 0x39, 0xaa, 0x64, 0x7c, 0x65, 0xc8, 0xcc, 0x7e, 0x85, 0x72, 0xa2, 0x63, + 0x72, 0x44, 0xc0, 0xab, 0xd1, 0xd2, 0xdc, 0xd1, 0x7d, 0x54, 0xb7, 0x9b, + 0xa1, 0x93, 0xaa, 0x74, 0x92, 0x99, 0x4d, 0x84, 0xcb, 0x97, 0x37, 0x5e, + 0x7f, 0x6f, 0x3b, 0xb2, 0xc7, 0xc1, 0xcd, 0xa4, 0x79, 0x66, 0xb3, 0x42, + 0x78, 0x39, 0x8c, 0xab, 0x5a, 0x87, 0xa5, 0xcc, 0x8a, 0x44, 0xa3, 0xaf, + 0x8a, 0xd9, 0x63, 0x90, 0x8a, 0x82, 0x75, 0x87, 0x47, 0x6e, 0x84, 0xc1, + 0x5e, 0xd9, 0x4e, 0x47, 0x3e, 0xa4, 0x58, 0x75, 0x45, 0x46, 0x4e, 0x3e, + 0xb6, 0x60, 0x49, 0x6c, 0xd4, 0xc1, 0xd7, 0x5e, 0xa2, 0x64, 0x69, 0xa4, + 0x37, 0xc8, 0x9c, 0x4a, 0x80, 0xa4, 0x7c, 0xa1, 0x89, 0x43, 0xdc, 0x46, + 0x99, 0x56, 0xa4, 0x4b, 0x4a, 0x62, 0x6c, 0xb2, 0x74, 0x9c, 0xa7, 0x87, + 0x8b, 
0x9d, 0xb9, 0xb2, 0xb4, 0x4b, 0x80, 0xb5, 0x74, 0x7c, 0x54, 0xb3, + 0x3e, 0xc3, 0x60, 0x3b, 0xbf, 0x94, 0x8b, 0xce, 0x94, 0x8f, 0xe4, 0xc2, + 0x80, 0xa9, 0x76, 0xa9, 0x60, 0x8a, 0x92, 0x5b, 0x58, 0x96, 0x7d, 0xb6, + 0xb2, 0x94, 0xa0, 0x45, 0x88, 0x69, 0xcd, 0x8f, 0x78, 0x8e, 0xa0, 0xbf, + 0x88, 0x35, 0x87, 0x4c, 0x39, 0x3c, 0x49, 0xa8, 0x69, 0xb6, 0xc0, 0x43, + 0x4b, 0x92, 0x59, 0x7a, 0xb7, 0x3a, 0x3d, 0xa5, 0xae, 0x77, 0x6d, 0x74, + 0x82, 0x6c, 0x85, 0x6c, 0x3d, 0x9a, 0x7b, 0x82, 0xb5, 0x5a, 0x60, 0x9b, + 0x7a, 0x87, 0x6d, 0xcc, 0xbd, 0x6c, 0xb0, 0xae, 0xac, 0x8f, 0x8f, 0x86, + 0x65, 0x70, 0xc3, 0x7f, 0x51, 0x95, 0xc3, 0xc9, 0x3f, 0x6f, 0xc2, 0x72, + 0xcd, 0x7f, 0xd5, 0x9b, 0xb7, 0x48, 0x47, 0x3d, 0x5f, 0xa0, 0xa0, 0xb5, + 0xd1, 0x40, 0x96, 0xb1, 0xc4, 0x68, 0x60, 0x8f, 0x95, 0x44, 0xd0, 0x6e, + 0x6a, 0x51, 0xd8, 0xb3, 0xc9, 0xcb, 0x9d, 0xdd, 0xbd, 0x90, 0x93, 0x4c, + 0x74, 0x4e, 0x69, 0x9b, 0xa7, 0xc0, 0x7b, 0xc4, 0xae, 0xb4, 0x9e, 0x6c, + 0x51, 0xb6, 0xdc, 0xa3, 0x52, 0x6b, 0x78, 0xc6, 0x37, 0xc2, 0x6f, 0x96, + 0x98, 0x7a, 0x99, 0x5b, 0xb2, 0x9c, 0x54, 0xab, 0x74, 0xcb, 0x9d, 0x91, + 0x4d, 0x97, 0x62, 0x93, 0xcd, 0x7d, 0x3b, 0xae, 0x99, 0x92, 0xb4, 0x79, + 0x6b, 0xae, 0x4b, 0x82, 0xa0, 0x42, 0x3a, 0x8f, 0x90, 0xb8, 0x3e, 0x5a, + 0xcc, 0xbc, 0x74, 0x91, 0x9b, 0x9d, 0x7e, 0x42, 0x79, 0x80, 0x57, 0x67, + 0x56, 0x6b, 0xaf, 0x3e, 0x58, 0x9f, 0x51, 0x50, 0x49, 0x77, 0xb5, 0x96, + 0x4c, 0xc4, 0x7d, 0x71, 0x44, 0xdb, 0xb1, 0x59, 0xb7, 0x51, 0x6c, 0x7f, + 0x51, 0xb8, 0x9d, 0xa9, 0x5d, 0x99, 0x3e, 0x47, 0x74, 0x8f, 0x65, 0xca, + 0x40, 0x5c, 0x6d, 0x6d, 0x83, 0x82, 0xa6, 0x69, 0x5c, 0xc0, 0x7f, 0xb9, + 0x7a, 0x42, 0x85, 0x64, 0xc0, 0x4b, 0xb4, 0x38, 0x8a, 0x9b, 0x5f, 0xb7, + 0xa7, 0x42, 0x50, 0x82, 0x4c, 0x63, 0x48, 0x8d, 0x57, 0x50, 0xcd, 0x89, + 0xc6, 0xdc, 0xc7, 0x73, 0xc6, 0x57, 0x3f, 0x98, 0x88, 0x75, 0xad, 0x4f, + 0xa2, 0xc9, 0x62, 0x93, 0x45, 0x46, 0x72, 0x90, 0xae, 0xb8, 0x4d, 0xc7, + 0x42, 0x84, 0x8f, 0x4e, 0xe1, 0x52, 0xe6, 0xc4, 0x7d, 0x34, 0xa5, 0x3d, + 0x6a, 0x8e, 0x39, 0x8e, 0x98, 0x6b, 0x45, 0xbe, 0x43, 0xa3, 0x57, 0x6f, + 0xd2, 0x96, 0x7e, 0x38, 0x29, 0x20, 0x9b, 0x9d, 0xb9, 0x72, 0x50, 0xc0, + 0xc8, 0xc6, 0x37, 0x8a, 0xb1, 0x85, 0x64, 0x2b, 0xbb, 0x51, 0x80, 0xb0, + 0x89, 0x6c, 0x4b, 0x8e, 0x65, 0x76, 0x9a, 0x8f, 0xb0, 0x98, 0x9a, 0xa0, + 0x79, 0x63, 0x3b, 0x49, 0x96, 0x31, 0x70, 0xab, 0x2d, 0x67, 0xbc, 0xa8, + 0x80, 0xd7, 0x68, 0x5a, 0x34, 0x7b, 0x55, 0x83, 0x5c, 0xa4, 0x8e, 0x65, + 0x4e, 0xc1, 0x53, 0x43, 0x7c, 0xaf, 0x90, 0x66, 0x51, 0xbd, 0x41, 0x72, + 0x81, 0x83, 0xa8, 0x84, 0xac, 0xb8, 0x89, 0x88, 0x8b, 0x43, 0x99, 0xae, + 0x51, 0x87, 0x8f, 0x7b, 0x8f, 0xb7, 0x7f, 0x68, 0x6b, 0x5b, 0xb8, 0x9f, + 0x77, 0x4f, 0xe4, 0xa7, 0xba, 0x20, 0xa0, 0x9b, 0x77, 0xa7, 0x65, 0xca, + 0x3e, 0x81, 0x3f, 0x65, 0x9a, 0x6d, 0x87, 0x42, 0x75, 0x4b, 0x56, 0x8c, + 0x93, 0x60, 0x45, 0xa4, 0x37, 0xac, 0xa2, 0x97, 0x81, 0xa7, 0xc3, 0x98, + 0x8f, 0xdc, 0x4e, 0x6e, 0x5f, 0xb7, 0x58, 0xcc, 0x5c, 0x7f, 0x9c, 0xa3, + 0xcf, 0xbc, 0x7c, 0x46, 0xa5, 0x4c, 0xd5, 0x95, 0x77, 0x74, 0x46, 0x4f, + 0x74, 0xf4, 0xb0, 0xd2, 0x5f, 0x7d, 0x56, 0xd7, 0x79, 0x74, 0x81, 0x99, + 0x39, 0xd1, 0x91, 0x89, 0xa9, 0x65, 0x45, 0xa2, 0xd2, 0xcc, 0xd2, 0x31, + 0x4d, 0x51, 0xb3, 0xad, 0x8e, 0xc5, 0x45, 0xa9, 0xaa, 0xc4, 0x30, 0x90, + 0x93, 0x94, 0xb3, 0x4b, 0x44, 0xcc, 0xb2, 0xbc, 0x79, 0x4c, 0x7a, 0x2f, + 0x51, 0x7c, 0x85, 0xc7, 0x71, 0xcb, 0x76, 0xa9, 0x63, 0x3d, 0xbb, 0x88, + 0xcf, 0xca, 0x7d, 0x84, 0x54, 0x91, 0x8d, 0x93, 0xd0, 0x4a, 0xac, 0x5b, + 0x73, 0xb8, 0x78, 0x84, 0x72, 0x69, 0xcf, 0x92, 0xc7, 0x98, 0xc3, 0x66, + 0x43, 
0x65, 0xaa, 0xa1, 0x4b, 0x61, 0x38, 0x4f, 0x7e, 0x44, 0xc0, 0xb1, + 0xad, 0x7f, 0x81, 0x3c, 0x7d, 0x9a, 0xb4, 0x80, 0x74, 0x58, 0x55, 0xc5, + 0xc2, 0xcd, 0xb7, 0x4d, 0x32, 0xa9, 0xa2, 0x34, 0xbf, 0x93, 0xc6, 0x53, + 0x7b, 0x98, 0xb1, 0xc7, 0x89, 0x32, 0x83, 0xc8, 0xaf, 0x9f, 0x9a, 0xe2, + 0x7c, 0xa6, 0x6e, 0xbd, 0x8c, 0x6a, 0xb4, 0x9a, 0x53, 0x66, 0xd6, 0x42, + 0x26, 0x87, 0xc8, 0x92, 0x2a, 0x66, 0x37, 0x5e, 0x4d, 0x62, 0xb9, 0x7d, + 0x97, 0x42, 0x7d, 0x18, 0x5c, 0x37, 0x87, 0x91, 0x3a, 0x4c, 0xd2, 0x55, + 0x41, 0x54, 0xbc, 0xab, 0x76, 0x60, 0xd5, 0x60, 0x96, 0x6e, 0x3d, 0x85, + 0x9a, 0xc5, 0x64, 0x35, 0x5f, 0x7c, 0x89, 0x39, 0x51, 0xc4, 0x65, 0x6f, + 0xba, 0xd1, 0xd7, 0x39, 0x6d, 0x9b, 0x66, 0xb6, 0x9d, 0x91, 0xb8, 0xbd, + 0x5d, 0x9b, 0x81, 0xa9, 0x97, 0x46, 0xbe, 0x32, 0x71, 0xd8, 0xce, 0x85, + 0x98, 0x45, 0xda, 0xb7, 0x95, 0x3a, 0xa2, 0xc4, 0x2e, 0xb6, 0x9e, 0x38, + 0xc3, 0x9c, 0x41, 0x5a, 0xca, 0xa8, 0x4f, 0x80, 0x80, 0x57, 0x79, 0x87, + 0x57, 0x9d, 0xa8, 0x3f, 0x58, 0x98, 0x95, 0x6b, 0x37, 0xce, 0x9e, 0x49, + 0x7a, 0xa2, 0x7e, 0xab, 0xaf, 0xa3, 0xae, 0xc1, 0x75, 0x5d, 0x4b, 0x5c, + 0x6a, 0xa1, 0xd0, 0x50, 0x6a, 0x9a, 0x2a, 0x7d, 0xb7, 0xb9, 0xa1, 0x6d, + 0x6a, 0x75, 0x4f, 0x61, 0x99, 0x38, 0x76, 0x49, 0xc3, 0x62, 0x7e, 0x44, + 0x49, 0x8f, 0xaf, 0x64, 0x4d, 0x9c, 0x4a, 0xc1, 0x82, 0xd2, 0x40, 0x9b, + 0x6f, 0xc4, 0x8b, 0xa0, 0x96, 0xad, 0x6b, 0x7a, 0x46, 0xb2, 0x37, 0xce, + 0x5d, 0x46, 0x46, 0xc8, 0x51, 0xaf, 0x90, 0xb9, 0xc7, 0x8b, 0x74, 0x3c, + 0x30, 0xa8, 0xb4, 0xbf, 0xb9, 0xc4, 0x4c, 0x54, 0x69, 0x8b, 0x6c, 0xa5, + 0x9b, 0x8b, 0x31, 0xbc, 0xc5, 0x9d, 0x30, 0xc9, 0x96, 0xcd, 0x4b, 0x54, + 0x3d, 0x55, 0xa9, 0xbf, 0x7b, 0x6e, 0x37, 0x69, 0x5b, 0x7b, 0xac, 0xae, + 0xa4, 0xc3, 0xb7, 0x8a, 0x61, 0x69, 0xa3, 0x3f, 0x4a, 0xab, 0x9d, 0x5a, + 0xb8, 0x58, 0x49, 0x4e, 0xdb, 0xc3, 0x90, 0x51, 0x47, 0x48, 0xc9, 0x7b, + 0x48, 0xa3, 0x46, 0xab, 0x32, 0xaa, 0x32, 0x64, 0x56, 0x4e, 0xb2, 0xb8, + 0x47, 0x4a, 0xb0, 0x3a, 0xc6, 0xc7, 0xab, 0x45, 0x53, 0x99, 0x44, 0xbf, + 0x90, 0x6b, 0x7a, 0x90, 0x5c, 0xbb, 0x4c, 0xb7, 0x8c, 0x76, 0x7b, 0x54, + 0x84, 0x88, 0xc9, 0x62, 0x8e, 0x5a, 0xce, 0x75, 0xc0, 0x41, 0x59, 0x98, + 0x5e, 0xbe, 0x5d, 0x8b, 0xaf, 0x73, 0xaf, 0x3e, 0x65, 0x6a, 0x75, 0xa3, + 0x8b, 0x72, 0x69, 0x42, 0x45, 0x6a, 0x64, 0x3d, 0xb3, 0xb1, 0xdd, 0x52, + 0x9c, 0x8c, 0x74, 0x74, 0xab, 0x38, 0xa0, 0x4c, 0xb1, 0x55, 0x8c, 0x53, + 0xbc, 0x84, 0x2e, 0xdc, 0x54, 0x48, 0x8e, 0x63, 0x52, 0x59, 0x38, 0x66, + 0x9e, 0x76, 0xcd, 0x8e, 0x80, 0x81, 0xa1, 0x71, 0x46, 0x5f, 0x4d, 0xa7, + 0xb5, 0x64, 0x48, 0x4d, 0xbb, 0x7f, 0x94, 0x8b, 0x9e, 0x5e, 0x9d, 0x99, + 0x58, 0xce, 0x96, 0x7d, 0x1e, 0xa7, 0x53, 0x5d, 0x37, 0xc3, 0x4d, 0x2f, + 0xa0, 0xde, 0xaf, 0xe0, 0xd1, 0x49, 0xd2, 0xa1, 0x6d, 0x2b, 0x65, 0xbf, + 0xbe, 0x58, 0x62, 0x68, 0x4d, 0x7e, 0xc5, 0x8d, 0x5d, 0xa5, 0xdd, 0xc7, + 0x5a, 0x3b, 0x7e, 0xe9, 0xba, 0x6c, 0xb3, 0x78, 0xbc, 0x43, 0xc9, 0xbc, + 0x8d, 0x51, 0x81, 0x79, 0x6e, 0x41, 0xa7, 0x8a, 0xa0, 0xbe, 0xbd, 0x82, + 0x86, 0xdc, 0xc8, 0x7a, 0x97, 0xd0, 0x48, 0x3d, 0x6d, 0xc9, 0x3a, 0x88, + 0x72, 0x79, 0x9f, 0x92, 0x4f, 0x80, 0xa8, 0x43, 0x9d, 0xd0, 0xac, 0x4b, + 0xb6, 0xc4, 0xa8, 0x83, 0x35, 0xa5, 0x93, 0x28, 0xb1, 0x72, 0xce, 0x71, + 0x92, 0x92, 0x50, 0x73, 0xb8, 0x97, 0xc9, 0x46, 0x9e, 0x45, 0x64, 0xa3, + 0x36, 0x95, 0xa3, 0x85, 0x77, 0x5d, 0x48, 0x8f, 0x69, 0x74, 0x95, 0x3e, + 0x64, 0x5d, 0x58, 0x38, 0xc5, 0x76, 0xb3, 0xa5, 0xcb, 0xb6, 0x60, 0x7f, + 0x9f, 0x4f, 0x57, 0x3d, 0x53, 0x93, 0x48, 0xa9, 0xa3, 0x3e, 0x5a, 0x41, + 0x4a, 0x8d, 0x88, 0x73, 0xb5, 0xbb, 0x5e, 0x80, 0x5f, 0x39, 0xca, 0xd7, + 0xbe, 
0xb1, 0x9f, 0x7a, 0xd1, 0xa8, 0x31, 0xa1, 0x7e, 0x47, 0x8c, 0x7e, + 0x84, 0x51, 0x9c, 0xb2, 0x55, 0x28, 0x38, 0x53, 0x3f, 0xc1, 0x32, 0x77, + 0x85, 0xca, 0x50, 0xb1, 0xc4, 0x9e, 0x69, 0x91, 0x7b, 0xa4, 0x64, 0xd3, + 0x9c, 0x9b, 0x37, 0xd3, 0x89, 0xc2, 0xb1, 0x37, 0xac, 0xa6, 0x6b, 0xa2, + 0x64, 0x40, 0x76, 0x6f, 0xcf, 0xd5, 0x9d, 0x65, 0x7b, 0xbe, 0xcf, 0xc3, + 0xbd, 0xbb, 0x58, 0x9b, 0x9c, 0x8d, 0x86, 0xae, 0x81, 0xca, 0x48, 0xa1, + 0x3c, 0xdb, 0x7a, 0x6c, 0x5b, 0x52, 0xbf, 0xd6, 0x91, 0x37, 0xa5, 0xba, + 0x3e, 0x43, 0x84, 0x94, 0x73, 0xa0, 0x54, 0x26, 0x73, 0x75, 0x52, 0x85, + 0x3d, 0xa3, 0xd8, 0xa4, 0x8b, 0x41, 0x6b, 0x24, 0x5d, 0x5a, 0x5a, 0x68, + 0xae, 0xd9, 0x3b, 0xad, 0x59, 0x5d, 0xc2, 0x7b, 0xaa, 0xbb, 0xc0, 0x4e, + 0xca, 0x61, 0xcc, 0xd0, 0x39, 0xa9, 0x99, 0x76, 0xb5, 0xb4, 0xb8, 0xab, + 0x97, 0xcb, 0x60, 0x58, 0x52, 0xaa, 0xb8, 0x4d, 0x23, 0x33, 0x62, 0xb5, + 0x85, 0x74, 0x44, 0x8c, 0xc0, 0x4d, 0xd4, 0x51, 0xcf, 0x7f, 0x77, 0xaa, + 0x60, 0x62, 0x97, 0xab, 0x5d, 0xb4, 0x62, 0xaa, 0x44, 0x2b, 0xa4, 0x54, + 0xc0, 0xb5, 0x5f, 0x36, 0x80, 0x7c, 0xc9, 0x59, 0x3a, 0x4d, 0x42, 0x8d, + 0x65, 0x8b, 0x41, 0xba, 0x4c, 0x50, 0x4c, 0xb5, 0x42, 0x32, 0x70, 0x99, + 0xd0, 0xa8, 0x4d, 0x9a, 0x4a, 0x33, 0xba, 0x7c, 0x90, 0xc6, 0x56, 0x76, + 0x96, 0x90, 0x4c, 0x7b, 0x79, 0x49, 0x45, 0x5d, 0xcb, 0xc4, 0xdb, 0xd4, + 0x97, 0xab, 0x58, 0xa1, 0x77, 0x36, 0x55, 0xbf, 0x71, 0xc1, 0x95, 0xc6, + 0x84, 0x50, 0x7d, 0xc8, 0x54, 0xad, 0x46, 0x8b, 0xc0, 0x72, 0xa9, 0xa6, + 0x5e, 0xbe, 0x93, 0x43, 0x70, 0xbe, 0x67, 0xad, 0xbb, 0xa0, 0xc7, 0xb8, + 0x76, 0x9b, 0x7b, 0x37, 0x83, 0xac, 0x81, 0x55, 0x59, 0x36, 0x7a, 0xb5, + 0x9f, 0xa2, 0x7b, 0x38, 0x51, 0xcf, 0x89, 0xaa, 0x8c, 0x73, 0x7f, 0x65, + 0x97, 0x63, 0x81, 0x68, 0x97, 0xcb, 0xbd, 0x95, 0x33, 0x65, 0xa8, 0x84, + 0xb4, 0xb5, 0x88, 0x8d, 0x3f, 0xa6, 0x26, 0x40, 0x7f, 0x53, 0x3f, 0x7c, + 0x4b, 0x4c, 0x3c, 0x9d, 0x46, 0x61, 0xbc, 0xc5, 0x72, 0xa6, 0xd9, 0x9f, + 0xa7, 0x93, 0x7c, 0x64, 0x49, 0xa1, 0x93, 0x68, 0xbf, 0xcc, 0x8a, 0xc3, + 0x73, 0x44, 0x8c, 0xac, 0xb5, 0x85, 0x81, 0x34, 0xb8, 0x8a, 0x7b, 0xbd, + 0x41, 0x84, 0x3c, 0x7f, 0x99, 0xcd, 0x8e, 0xbd, 0x47, 0x65, 0x79, 0x32, + 0xb0, 0x99, 0xa9, 0xa1, 0x60, 0x99, 0xb4, 0xc2, 0x5b, 0x4f, 0x6f, 0x56, + 0xce, 0x9d, 0x47, 0x4a, 0x73, 0x56, 0x64, 0x93, 0x4e, 0x6a, 0xc0, 0x8f, + 0x7c, 0x4e, 0xa1, 0x43, 0x62, 0xc2, 0xbe, 0x88, 0x42, 0x3b, 0x45, 0x83, + 0xa3, 0x66, 0x34, 0xa8, 0xcb, 0xba, 0xd7, 0x4c, 0x31, 0x96, 0xcd, 0x46, + 0x37, 0xa2, 0x79, 0xb7, 0x85, 0x86, 0x68, 0x74, 0xd3, 0x74, 0x79, 0x97, + 0xb6, 0xa2, 0x8c, 0x43, 0xb1, 0x6d, 0xa6, 0xbd, 0x46, 0xaa, 0x35, 0x67, + 0x48, 0xd9, 0x4e, 0x3a, 0xd2, 0x3b, 0x4b, 0x58, 0x71, 0x6e, 0x95, 0x55, + 0x5e, 0x91, 0x83, 0x93, 0x54, 0x5e, 0xad, 0x79, 0x60, 0xa9, 0x93, 0x9a, + 0x41, 0x4f, 0x50, 0x8a, 0xbc, 0xc1, 0x50, 0x4d, 0x7d, 0x4b, 0x59, 0xa2, + 0x98, 0xbf, 0x31, 0xb7, 0xa4, 0x94, 0xa3, 0xd2, 0x49, 0xca, 0xc3, 0x3d, + 0x4b, 0x9c, 0xcb, 0xd4, 0xc7, 0x3d, 0x58, 0x33, 0x7e, 0x5b, 0x39, 0xbb, + 0x3b, 0xb6, 0x75, 0xb0, 0xb0, 0x46, 0x78, 0x9c, 0x86, 0x8a, 0x56, 0x3a, + 0x60, 0x49, 0x2d, 0x61, 0x93, 0xd2, 0x6a, 0x80, 0x61, 0x45, 0xb7, 0xb2, + 0x79, 0xad, 0xc0, 0x49, 0xa1, 0xb0, 0x9f, 0xbb, 0x93, 0x8d, 0x82, 0x8f, + 0xbb, 0xbc, 0x74, 0x8d, 0x97, 0x5f, 0xbb, 0x78, 0x53, 0x6c, 0x87, 0xb1, + 0xae, 0xca, 0x4d, 0x91, 0xa6, 0x69, 0xba, 0x90, 0x98, 0x86, 0x85, 0xc5, + 0x50, 0xb0, 0xb3, 0xc9, 0x86, 0xa3, 0xa9, 0x86, 0xb1, 0xa0, 0xce, 0xa1, + 0x42, 0x5a, 0xc7, 0x78, 0x39, 0x83, 0x58, 0x64, 0x72, 0xce, 0x80, 0x91, + 0x3e, 0x7f, 0x41, 0x65, 0x35, 0x9a, 0xa0, 0xac, 0xc0, 0xa7, 0x7d, 0x91, + 0x5d, 
0x6e, 0x6a, 0x30, 0x5f, 0x78, 0x88, 0xa7, 0x8a, 0x90, 0x4e, 0x45, + 0xa4, 0x41, 0x99, 0x9b, 0x68, 0x95, 0xd6, 0x45, 0xcf, 0xba, 0x84, 0x92, + 0xa1, 0x9a, 0x5f, 0x99, 0x33, 0x98, 0xab, 0x60, 0x9f, 0x76, 0x70, 0x6c, + 0x47, 0x4f, 0x54, 0x8c, 0xb4, 0x9e, 0x60, 0xa3, 0x69, 0x77, 0x79, 0xc4, + 0x38, 0x5a, 0x81, 0xcd, 0x9d, 0xc5, 0xae, 0xc8, 0x92, 0x74, 0xb5, 0x50, + 0x68, 0xb3, 0x68, 0x99, 0x3e, 0xbd, 0xbd, 0x93, 0x49, 0x59, 0x80, 0xab, + 0x47, 0x8a, 0x4a, 0x5b, 0x94, 0x49, 0xcd, 0x56, 0x8a, 0x65, 0xc6, 0x88, + 0x8e, 0xc2, 0x8f, 0x96, 0x32, 0x7f, 0xc9, 0x73, 0xc7, 0x99, 0x84, 0x90, + 0x86, 0x91, 0x57, 0x63, 0x99, 0x41, 0x4f, 0xa5, 0x4c, 0x53, 0x78, 0x54, + 0xa1, 0x83, 0x4d, 0x5d, 0x4d, 0xc1, 0x8c, 0xc5, 0xaf, 0xc3, 0xc8, 0x3c, + 0x32, 0xb8, 0x5a, 0xca, 0x4a, 0x3b, 0xb5, 0xb6, 0x5e, 0x6a, 0x6c, 0x38, + 0x56, 0x56, 0xc2, 0x6a, 0xa8, 0x4e, 0x7a, 0xbc, 0x4a, 0x66, 0xc7, 0x6d, + 0xc8, 0x7e, 0xbb, 0x35, 0x5c, 0x57, 0x3d, 0xdb, 0x82, 0x3f, 0x42, 0x56, + 0x87, 0x4b, 0x93, 0xa6, 0x4f, 0xbc, 0xa6, 0x6c, 0x97, 0x6b, 0x90, 0x63, + 0x3c, 0xb4, 0x5e, 0xc6, 0x56, 0xa8, 0x96, 0x3f, 0xa7, 0xb2, 0x84, 0x3b, + 0x41, 0xd7, 0xb8, 0x9f, 0x7a, 0x7b, 0x70, 0x6c, 0x61, 0x8c, 0x6c, 0xba, + 0x7c, 0xbe, 0x3c, 0xb7, 0xa4, 0xcb, 0x3b, 0xbe, 0x44, 0xa0, 0x5f, 0xd1, + 0x70, 0x43, 0x95, 0xb1, 0x3e, 0x69, 0x5a, 0x6b, 0xb1, 0x71, 0x73, 0x46, + 0x7a, 0x55, 0xbd, 0x43, 0xc2, 0x39, 0x8b, 0x3e, 0xce, 0x6a, 0x38, 0x50, + 0x41, 0xcc, 0xb0, 0xb9, 0x37, 0x3a, 0x72, 0xc3, 0x5a, 0xcd, 0xa7, 0x64, + 0x99, 0xc3, 0x6a, 0x5d, 0xac, 0x76, 0xb7, 0x92, 0x8e, 0x40, 0x9a, 0x4d, + 0xae, 0xa0, 0xce, 0x3c, 0x7f, 0x3d, 0x9a, 0x77, 0xaa, 0x7e, 0x42, 0x8b, + 0x6b, 0x92, 0xb7, 0xbe, 0xb9, 0x63, 0x8a, 0xa0, 0x45, 0x4d, 0x96, 0xd7, + 0x40, 0x46, 0x7e, 0x40, 0x80, 0xbe, 0xb7, 0x54, 0x80, 0x92, 0x63, 0x79, + 0xa4, 0xb5, 0xad, 0xce, 0xaa, 0xab, 0x49, 0x5b, 0x73, 0x89, 0x89, 0x79, + 0x56, 0x44, 0xc5, 0x83, 0xa3, 0x69, 0xb1, 0xba, 0x67, 0x2f, 0x2e, 0x50, + 0x5b, 0x9e, 0x3d, 0xac, 0x72, 0xd0, 0x57, 0x9b, 0xb4, 0x7a, 0xa3, 0x37, + 0x65, 0x46, 0x61, 0x98, 0x75, 0xc6, 0xb6, 0x90, 0x52, 0x9d, 0xa3, 0xb7, + 0x38, 0x96, 0xb0, 0x3b, 0xca, 0xc3, 0x5d, 0x9e, 0xda, 0x7c, 0x4c, 0x79, + 0x97, 0xaf, 0xbc, 0xbc, 0x9c, 0x56, 0x95, 0x3e, 0xc8, 0x54, 0xc5, 0xd5, + 0xb6, 0xc9, 0xa9, 0x6b, 0xaf, 0xbd, 0x92, 0xcd, 0xbd, 0x7d, 0xa0, 0x45, + 0xcf, 0x57, 0xc8, 0x56, 0x6a, 0x8b, 0xcf, 0xb1, 0x71, 0xc3, 0x47, 0x3a, + 0x82, 0x8a, 0x85, 0x65, 0xb3, 0x5d, 0xc0, 0xd2, 0x80, 0xc1, 0xc2, 0x80, + 0xae, 0xab, 0xc4, 0xae, 0x47, 0xa6, 0x66, 0x53, 0x6f, 0x6d, 0xa1, 0xa1, + 0x46, 0x96, 0x54, 0x9b, 0xac, 0x43, 0xc2, 0x9e, 0xc8, 0x81, 0xbf, 0x73, + 0x58, 0x90, 0xa9, 0x8e, 0x3d, 0x40, 0xa0, 0x5c, 0x37, 0x6d, 0x37, 0x45, + 0x74, 0x95, 0x8c, 0xb2, 0x7b, 0xbb, 0x3e, 0x37, 0x7d, 0x4a, 0x8c, 0xa6, + 0xc4, 0xa4, 0x34, 0xc3, 0x8e, 0x5f, 0x47, 0x9d, 0x4a, 0x81, 0xc8, 0xcf, + 0x8b, 0x6f, 0x6b, 0x5f, 0x9a, 0x33, 0x6e, 0xc2, 0xa3, 0x48, 0x6e, 0xa7, + 0x6e, 0xc2, 0x3d, 0xaa, 0x3d, 0x75, 0x5d, 0xa0, 0x89, 0xb4, 0x96, 0x49, + 0x6b, 0xaf, 0x9b, 0x64, 0x5c, 0xce, 0x9a, 0x67, 0x37, 0x55, 0xcc, 0x42, + 0x35, 0x5d, 0x81, 0x89, 0xc1, 0xc6, 0x4b, 0x77, 0xb4, 0x86, 0x41, 0xcc, + 0x40, 0x66, 0x94, 0x6c, 0x69, 0x73, 0xa0, 0xbe, 0x5f, 0xad, 0x4e, 0x8d, + 0x5c, 0xb3, 0x6a, 0xb5, 0x9a, 0x76, 0x63, 0xb8, 0x9a, 0x46, 0x3a, 0xaa, + 0x3e, 0x3d, 0xcc, 0x8e, 0x6e, 0x43, 0x6f, 0xc2, 0xa7, 0xc2, 0x77, 0x2e, + 0x2d, 0x62, 0x8f, 0x3d, 0xb1, 0x94, 0x67, 0x42, 0x54, 0x4e, 0x88, 0x57, + 0x48, 0x31, 0xc3, 0x3b, 0x67, 0xd3, 0x4c, 0x6c, 0x58, 0x9c, 0x38, 0x96, + 0x48, 0x91, 0x60, 0x77, 0xb4, 0xa6, 0x3f, 0xba, 0x49, 0x32, 0xc8, 0x67, + 0x8b, 
0x8c, 0x44, 0xc7, 0xcf, 0x90, 0x50, 0x76, 0xb3, 0xac, 0xb9, 0x62, + 0x5d, 0xbc, 0xb1, 0x41, 0xc5, 0x2b, 0x60, 0x44, 0xc9, 0xd1, 0x48, 0x49, + 0x35, 0x44, 0x68, 0x43, 0x5a, 0x67, 0xcf, 0x94, 0x4b, 0x3b, 0x6a, 0x4e, + 0x97, 0xc6, 0x9d, 0xa1, 0x45, 0xcf, 0xab, 0xb7, 0x7f, 0xc6, 0xc9, 0xa2, + 0x79, 0x81, 0x41, 0xc3, 0x82, 0xac, 0x5e, 0x93, 0x5c, 0xc0, 0x96, 0xba, + 0xc7, 0xca, 0x49, 0x68, 0xc4, 0x86, 0x37, 0x96, 0x5e, 0x41, 0x9f, 0xa0, + 0xaa, 0xaa, 0x6d, 0x66, 0x85, 0x37, 0x84, 0xcc, 0x7e, 0xb2, 0xb5, 0xb1, + 0x4f, 0x68, 0xd4, 0x64, 0x38, 0xa0, 0x4b, 0xba, 0x2e, 0x37, 0x3a, 0xcc, + 0x89, 0x3b, 0x5e, 0x90, 0xb5, 0x7c, 0x7c, 0x55, 0x59, 0x6c, 0x9c, 0x57, + 0xca, 0xcf, 0x3f, 0xbb, 0x36, 0x5b, 0x98, 0x4a, 0x88, 0xcd, 0x86, 0xab, + 0x5a, 0x7d, 0x45, 0xad, 0x68, 0xa0, 0xa8, 0x7d, 0x74, 0x97, 0x77, 0x6f, + 0x6b, 0xba, 0xc7, 0x76, 0x89, 0xac, 0xd1, 0x71, 0x5a, 0x71, 0x4d, 0x3a, + 0x92, 0x6e, 0xd1, 0x43, 0xab, 0x95, 0x5e, 0x8a, 0x9d, 0x4a, 0x87, 0x49, + 0x44, 0x5c, 0xbc, 0x5b, 0x7b, 0x96, 0x4c, 0x73, 0x4d, 0x98, 0x5a, 0x4d, + 0xd3, 0xae, 0x6c, 0x7e, 0xa9, 0x76, 0x91, 0xae, 0x3e, 0xac, 0x89, 0xc6, + 0x4e, 0xa7, 0x9f, 0xa9, 0xc3, 0xad, 0x33, 0x52, 0x5b, 0x50, 0x37, 0xaa, + 0x44, 0x7b, 0xbf, 0x69, 0xbc, 0x9c, 0x74, 0x4d, 0x5f, 0x52, 0x3f, 0xa0, + 0x2f, 0x70, 0x7a, 0x64, 0x91, 0x52, 0xc2, 0x3b, 0xa5, 0xa0, 0x44, 0xd7, + 0x50, 0x6f, 0x49, 0xab, 0xcf, 0xb3, 0x5c, 0x43, 0xc3, 0xb1, 0x64, 0xa2, + 0x89, 0xbe, 0x5a, 0x7f, 0x7b, 0x94, 0x3b, 0x59, 0x4d, 0x89, 0xb1, 0x6a, + 0x67, 0xc5, 0x63, 0x33, 0x6e, 0x44, 0x3e, 0x4c, 0x91, 0x42, 0x38, 0xa2, + 0xc5, 0xa9, 0x88, 0x9f, 0x44, 0x70, 0x69, 0x92, 0x33, 0xa2, 0x5e, 0x58, + 0x7f, 0x35, 0xb4, 0x47, 0xad, 0x36, 0x54, 0x56, 0x4f, 0x6b, 0x40, 0x58, + 0x4d, 0xb8, 0x34, 0x9d, 0xb3, 0x77, 0x33, 0xd0, 0xc6, 0xc2, 0xa4, 0x36, + 0x8c, 0x3a, 0x5b, 0x3a, 0x60, 0x9d, 0x7f, 0x42, 0x54, 0x61, 0x32, 0x77, + 0x5f, 0x29, 0xc7, 0xc3, 0x39, 0x68, 0xa6, 0xa5, 0x7f, 0x95, 0x8f, 0x63, + 0xae, 0xa1, 0xbf, 0xc4, 0x75, 0x55, 0x3d, 0x2d, 0x30, 0x8d, 0xb2, 0xa3, + 0x34, 0xce, 0x45, 0x80, 0xa9, 0x43, 0x74, 0xc0, 0x9e, 0x5d, 0xbe, 0x45, + 0x84, 0x2f, 0x6a, 0xc6, 0x5a, 0x7f, 0x99, 0x4b, 0x60, 0x7c, 0xcd, 0x9b, + 0xcd, 0x5b, 0x77, 0x75, 0xad, 0x4d, 0xb7, 0xc8, 0x63, 0x56, 0xd6, 0xa2, + 0x5a, 0xc6, 0xaf, 0x51, 0x94, 0x4a, 0xb5, 0xae, 0xa8, 0x65, 0xc4, 0x91, + 0xc9, 0xbf, 0x6b, 0x4c, 0x93, 0x56, 0x7b, 0x88, 0x62, 0x83, 0x9f, 0x4f, + 0x32, 0x60, 0x95, 0xc0, 0xa8, 0xda, 0x64, 0xca, 0x2c, 0x77, 0x4e, 0x75, + 0xbe, 0xa8, 0x98, 0x5d, 0x49, 0x89, 0x91, 0x66, 0x40, 0xdc, 0x90, 0x3d, + 0xc9, 0x4a, 0x39, 0xcf, 0x7a, 0xc9, 0xa7, 0x9d, 0x55, 0x6c, 0x6a, 0x64, + 0x99, 0xa1, 0xa0, 0x42, 0x69, 0xa1, 0x65, 0xab, 0xb6, 0x84, 0x91, 0xba, + 0x59, 0xa9, 0x7c, 0xb6, 0xc8, 0xa9, 0x95, 0x56, 0x7c, 0x7b, 0x3a, 0x58, + 0xa2, 0x5b, 0x7d, 0x97, 0xac, 0x58, 0xcc, 0x86, 0x95, 0xa6, 0xad, 0x74, + 0xb1, 0x9e, 0x78, 0x63, 0x7c, 0x30, 0x3e, 0xb3, 0x80, 0xaa, 0x96, 0x3e, + 0xce, 0x3e, 0xba, 0xc8, 0x68, 0xce, 0xb8, 0x71, 0x34, 0xa3, 0x93, 0x9f, + 0x9e, 0xcd, 0x2e, 0xa1, 0x45, 0x84, 0x35, 0xa4, 0x7f, 0x61, 0x47, 0x3a, + 0xa2, 0x8c, 0xab, 0x75, 0x81, 0x9c, 0xca, 0x9e, 0x7b, 0x56, 0x80, 0x5a, + 0x7e, 0x2d, 0x9f, 0x5d, 0xd0, 0x31, 0x3e, 0xab, 0x91, 0x5b, 0x39, 0xa1, + 0xa1, 0x54, 0xc1, 0xcd, 0x99, 0xc3, 0x85, 0x65, 0x43, 0xc8, 0xab, 0x9a, + 0x7f, 0xa2, 0x90, 0xcc, 0x58, 0x36, 0x83, 0xcb, 0x48, 0x76, 0x6e, 0xb9, + 0x81, 0xbe, 0x54, 0xb3, 0xc7, 0x92, 0x46, 0x91, 0x71, 0x54, 0x32, 0x96, + 0x64, 0x68, 0x4b, 0x51, 0x79, 0x72, 0x36, 0x76, 0x9e, 0x86, 0x8a, 0x51, + 0x65, 0xc7, 0xbf, 0xaa, 0x85, 0x90, 0xa4, 0xd9, 0x9c, 0x3e, 0xc1, 0x73, + 0xb3, 
0x98, 0xa4, 0x58, 0xab, 0x96, 0x5f, 0x51, 0x97, 0x90, 0xc4, 0xc3, + 0xb0, 0x88, 0xb7, 0xb6, 0x8f, 0x4c, 0xdb, 0x96, 0xcf, 0xbe, 0xa9, 0x8f, + 0xc4, 0x67, 0x83, 0x4d, 0x9b, 0xbf, 0x8e, 0x32, 0xb9, 0x84, 0x9d, 0x46, + 0xb4, 0x58, 0xa1, 0x3c, 0x57, 0x37, 0x5d, 0x76, 0xbc, 0x45, 0x49, 0xcb, + 0x2c, 0x6c, 0xae, 0x5d, 0x43, 0xa4, 0x62, 0xac, 0xd1, 0x9c, 0x77, 0x85, + 0x47, 0x59, 0xd1, 0xc6, 0xd2, 0xd0, 0xca, 0x3c, 0xa8, 0xce, 0x76, 0xcf, + 0xa1, 0x4a, 0x70, 0x89, 0xa2, 0x6f, 0x35, 0x4f, 0x30, 0xc9, 0x5c, 0xcc, + 0x4e, 0x8d, 0x66, 0x8e, 0xa5, 0x97, 0x44, 0x54, 0x5c, 0x73, 0xa5, 0x64, + 0x7c, 0x9d, 0xd0, 0x78, 0x32, 0x89, 0x81, 0xb7, 0x9c, 0xb4, 0x5b, 0x55, + 0xc1, 0xa5, 0x68, 0x47, 0x69, 0x59, 0x50, 0x4f, 0xb5, 0x5a, 0xc0, 0x9a, + 0x96, 0x93, 0x7c, 0x98, 0x43, 0x4e, 0x52, 0x6e, 0xcb, 0x66, 0xd1, 0x3c, + 0x79, 0xce, 0x8b, 0x57, 0xbf, 0x56, 0xb5, 0xcf, 0x54, 0x60, 0x70, 0xc2, + 0x8a, 0xa6, 0xb9, 0x71, 0x52, 0x4a, 0x5e, 0xd0, 0x56, 0x34, 0x84, 0xbc, + 0x7b, 0x9b, 0xa8, 0xa3, 0x93, 0x5f, 0xa8, 0x8c, 0x95, 0xb2, 0x7e, 0xd6, + 0x43, 0xaf, 0xcb, 0x7d, 0x9c, 0xc1, 0x2b, 0xd0, 0x7c, 0x4a, 0xc7, 0x73, + 0xc8, 0xaa, 0xc2, 0xbd, 0x99, 0xaf, 0x47, 0x70, 0x8e, 0x83, 0xaa, 0x71, + 0x74, 0x59, 0x58, 0x71, 0xbd, 0x9d, 0x84, 0x8d, 0xcf, 0xb7, 0xa8, 0x9c, + 0x85, 0x83, 0xa4, 0xc8, 0x3f, 0x9d, 0xb9, 0x4b, 0xb6, 0x79, 0xc6, 0xb1, + 0xbf, 0xb8, 0x84, 0xcc, 0x73, 0x86, 0x91, 0x82, 0x7e, 0xb2, 0xcd, 0x3c, + 0x40, 0xc1, 0x45, 0x63, 0x4c, 0xc2, 0x73, 0x70, 0xd0, 0x68, 0xc0, 0x6c, + 0x58, 0xb4, 0x32, 0x80, 0xaa, 0x59, 0x8c, 0x89, 0xc8, 0x6e, 0x54, 0x62, + 0xd2, 0xc0, 0x72, 0xaf, 0xcc, 0x31, 0xd0, 0x81, 0x5f, 0xc4, 0xc2, 0x81, + 0xbe, 0x4e, 0x72, 0x44, 0x58, 0x99, 0x39, 0x45, 0xaa, 0x4e, 0x9d, 0x86, + 0xc2, 0x64, 0x74, 0x97, 0x93, 0x3c, 0x7e, 0xab, 0xcd, 0xa5, 0xa6, 0x6f, + 0x4d, 0x86, 0x51, 0x6a, 0xbd, 0xd0, 0xa0, 0xa8, 0xa9, 0x40, 0x97, 0x65, + 0x34, 0x98, 0x76, 0x42, 0x68, 0xbc, 0xa4, 0x9d, 0x39, 0x47, 0x8e, 0xc5, + 0x43, 0x7f, 0xa6, 0x79, 0xaa, 0x9b, 0xa4, 0x9d, 0x94, 0x4b, 0xbb, 0x45, + 0x92, 0xce, 0x73, 0xbb, 0x72, 0x71, 0x54, 0xc8, 0xcf, 0xa8, 0x79, 0xc3, + 0x6e, 0x5f, 0x51, 0x37, 0x39, 0x7c, 0x63, 0x7e, 0x79, 0x86, 0xbd, 0x95, + 0x68, 0xa6, 0x6e, 0x79, 0x43, 0x9a, 0x8c, 0x65, 0x80, 0x9d, 0xd1, 0x71, + 0x5d, 0xb3, 0x74, 0x9c, 0xc2, 0x32, 0x98, 0x85, 0x7b, 0x55, 0x68, 0xb9, + 0xb8, 0xc2, 0xb4, 0x32, 0xac, 0x64, 0xa3, 0x8a, 0xc8, 0x3b, 0x59, 0x57, + 0xab, 0xa3, 0x7f, 0x48, 0xc9, 0x7d, 0xc2, 0xb5, 0xbb, 0xa6, 0x35, 0xa7, + 0xb8, 0x76, 0x6a, 0x5c, 0x69, 0xd2, 0x32, 0x55, 0xc6, 0xcd, 0x70, 0x3c, + 0x6e, 0x75, 0x9b, 0xca, 0x77, 0xc5, 0xd0, 0x52, 0xa0, 0xce, 0x8a, 0xc3, + 0x77, 0xae, 0x8a, 0x7d, 0x8d, 0xa1, 0x6e, 0x5f, 0xa7, 0x72, 0x7a, 0x9c, + 0x6e, 0x3d, 0x49, 0xa7, 0xb4, 0x70, 0xd2, 0x91, 0x44, 0x5a, 0x56, 0x91, + 0x8a, 0xbe, 0x3b, 0x4a, 0xc8, 0xbe, 0x8e, 0x7d, 0xb1, 0x9b, 0xa4, 0x6f, + 0x5c, 0x89, 0x47, 0x4e, 0x41, 0x9a, 0xd4, 0x96, 0x6b, 0x80, 0xc9, 0x67, + 0x80, 0xc5, 0x4f, 0x48, 0x3e, 0x8f, 0xc5, 0x69, 0x33, 0x39, 0x89, 0x33, + 0xa2, 0xa7, 0xae, 0x6d, 0x4b, 0xb2, 0x5b, 0x60, 0x56, 0x5b, 0x34, 0x3f, + 0xc3, 0x7d, 0xad, 0x50, 0xa7, 0x8f, 0x3c, 0x8c, 0x82, 0x73, 0x37, 0x62, + 0x90, 0xc5, 0x5e, 0xac, 0xc2, 0x35, 0x96, 0x9b, 0xb4, 0x9e, 0x79, 0x72, + 0x41, 0x87, 0x58, 0x3b, 0x45, 0x76, 0x5e, 0xa5, 0xa8, 0x90, 0xc1, 0x59, + 0x69, 0x2d, 0xbc, 0x32, 0xac, 0xb5, 0xa4, 0x86, 0x65, 0x33, 0x47, 0x51, + 0x54, 0x60, 0x95, 0xc4, 0x92, 0x48, 0x83, 0x67, 0x46, 0x5f, 0xad, 0x81, + 0x86, 0xb7, 0x9b, 0x60, 0xb4, 0x5a, 0x6b, 0x40, 0xc3, 0xa1, 0x38, 0xb2, + 0x56, 0x69, 0x56, 0x7a, 0x51, 0x65, 0x38, 0x70, 0x3f, 0xce, 0x9e, 0x7a, + 0xcb, 
0xcb, 0x46, 0x7a, 0xaa, 0xbb, 0xa8, 0xa8, 0x5b, 0x32, 0xab, 0xb5, + 0x47, 0xcf, 0x4e, 0xc2, 0xa0, 0x5c, 0xa3, 0x75, 0x6b, 0xd3, 0x67, 0xaf, + 0x9b, 0xcb, 0x33, 0x88, 0x42, 0xaa, 0x55, 0x3e, 0xc9, 0x83, 0x63, 0x42, + 0x78, 0xad, 0x4b, 0x8c, 0xa2, 0x54, 0x71, 0x93, 0x42, 0x6b, 0x40, 0xbf, + 0x83, 0x61, 0x98, 0xd3, 0x5f, 0x58, 0xbe, 0x37, 0x5b, 0xce, 0x4e, 0xbd, + 0x3e, 0x8f, 0xa9, 0x4e, 0x7d, 0xc2, 0x59, 0x92, 0x58, 0x3b, 0xb8, 0x90, + 0x40, 0x37, 0x2d, 0x48, 0x5f, 0x98, 0x69, 0xb1, 0x45, 0xa5, 0x8b, 0xcb, + 0xac, 0x60, 0x40, 0x7d, 0x94, 0x70, 0xaa, 0x9d, 0x7b, 0x6a, 0x71, 0xac, + 0x9b, 0xb6, 0x54, 0x52, 0x51, 0x52, 0xba, 0x8c, 0x84, 0x43, 0x6e, 0xbf, + 0x53, 0x39, 0x85, 0xcc, 0x70, 0x2f, 0x75, 0xb3, 0x76, 0xb2, 0xb3, 0x69, + 0x58, 0x5a, 0x78, 0x86, 0x55, 0x75, 0x8c, 0x67, 0x64, 0xba, 0x8e, 0x57, + 0x69, 0x55, 0xb3, 0xac, 0x80, 0xbf, 0xa2, 0x9c, 0x99, 0x3f, 0x86, 0xcc, + 0x6f, 0x6e, 0x37, 0x8d, 0x59, 0x48, 0xd1, 0x5c, 0x40, 0x41, 0xa7, 0xcd, + 0x94, 0x51, 0x3d, 0x4b, 0x58, 0xc8, 0x5b, 0xa6, 0x53, 0xaa, 0x9e, 0x9d, + 0x8d, 0x5e, 0x79, 0xaf, 0xa2, 0xd0, 0xb2, 0x41, 0x70, 0x86, 0x5f, 0x99, + 0x89, 0x94, 0x5d, 0x61, 0x77, 0x91, 0xa0, 0x8b, 0x45, 0xc7, 0x78, 0x7c, + 0xa0, 0x6f, 0x58, 0x55, 0x7c, 0x39, 0xc7, 0x8d, 0x37, 0xb9, 0x33, 0xab, + 0x56, 0x80, 0x4d, 0x62, 0x70, 0xb4, 0x90, 0xbd, 0xc2, 0x69, 0x95, 0xc4, + 0x8c, 0x94, 0x91, 0x89, 0x55, 0x7f, 0x90, 0x7f, 0xca, 0xaf, 0x84, 0x64, + 0x3b, 0x52, 0x40, 0xb1, 0xb2, 0x5c, 0x81, 0xd3, 0xb6, 0x8f, 0xdc, 0x73, + 0x56, 0x5d, 0xc9, 0x78, 0xa2, 0x3f, 0x4c, 0x4f, 0x8f, 0x8c, 0x7a, 0xb3, + 0xc4, 0xa4, 0xa2, 0x8a, 0x4b, 0x8e, 0xbb, 0xa0, 0xbd, 0x66, 0xb9, 0xc3, + 0x7c, 0x6a, 0x64, 0x60, 0xa4, 0xae, 0x47, 0xad, 0x3e, 0x38, 0x79, 0x56, + 0x9b, 0xbc, 0x47, 0xcf, 0x6d, 0xa5, 0x57, 0x69, 0xd1, 0x5c, 0xc7, 0x71, + 0xaa, 0x80, 0x6f, 0x58, 0x87, 0x9b, 0xb1, 0x8a, 0x56, 0x94, 0xb2, 0xac, + 0x6b, 0xac, 0x8a, 0x47, 0x91, 0x68, 0x37, 0xb9, 0x63, 0x5b, 0x42, 0x3c, + 0x79, 0x45, 0x81, 0x9d, 0x71, 0x7f, 0x60, 0xa6, 0xc4, 0x56, 0x50, 0xc7, + 0xc8, 0xa6, 0x8d, 0x68, 0x9d, 0x83, 0x3d, 0xba, 0x7b, 0x48, 0xb8, 0xad, + 0x49, 0xa8, 0x64, 0x6b, 0x53, 0xb0, 0x55, 0x93, 0x88, 0x91, 0xb7, 0x44, + 0x74, 0x5a, 0xc3, 0x84, 0xa6, 0x8a, 0xab, 0x30, 0x98, 0x9d, 0xb9, 0x35, + 0x85, 0xd3, 0x39, 0x55, 0x93, 0x85, 0x74, 0x87, 0x38, 0x8c, 0x65, 0xa9, + 0x6a, 0x32, 0x55, 0x39, 0x5e, 0x3f, 0x81, 0xbc, 0x3d, 0x6c, 0xa2, 0x7b, + 0xc8, 0x77, 0x9e, 0x6c, 0xa8, 0x6a, 0x97, 0xcf, 0xc1, 0xb1, 0x9d, 0x66, + 0x57, 0xb0, 0x5b, 0x40, 0x7e, 0xbd, 0xa2, 0x62, 0x77, 0xc1, 0x58, 0xb7, + 0x8a, 0xd1, 0xae, 0xa0, 0x8f, 0xb5, 0x50, 0x48, 0x47, 0x4d, 0xdc, 0x53, + 0x8b, 0x7d, 0x97, 0xaf, 0xc5, 0x64, 0xa2, 0x51, 0x7a, 0x43, 0x83, 0xac, + 0x73, 0x3d, 0xb0, 0xae, 0x59, 0x7e, 0x95, 0x9f, 0x34, 0x82, 0x4b, 0x56, + 0xbe, 0xc5, 0xc7, 0x5a, 0x8c, 0x64, 0xbf, 0x7d, 0xc4, 0xbe, 0xa7, 0x6c, + 0x67, 0x96, 0x94, 0x92, 0x91, 0x67, 0x3f, 0x99, 0xca, 0xc5, 0x74, 0x42, + 0x9c, 0x93, 0x82, 0x9b, 0xb2, 0xc9, 0x9b, 0x36, 0x46, 0x77, 0x6a, 0xcd, + 0xb5, 0x28, 0x41, 0x7a, 0x8c, 0xb7, 0x57, 0x4e, 0xd9, 0xb7, 0x5e, 0xa6, + 0xa2, 0x6c, 0x8e, 0xcc, 0x82, 0xbb, 0x5a, 0xd2, 0xc8, 0x60, 0x76, 0xc2, + 0x3a, 0xb8, 0x9a, 0x61, 0x87, 0x7c, 0x51, 0x94, 0xbc, 0x3e, 0xb2, 0x4d, + 0x5f, 0x4f, 0x2d, 0x79, 0x65, 0x2a, 0x91, 0xc9, 0x5f, 0x9b, 0x58, 0xbc, + 0xbb, 0x3a, 0x64, 0x89, 0x36, 0x5b, 0xca, 0x8e, 0x90, 0xca, 0x5d, 0x92, + 0x46, 0xc9, 0xa4, 0x3a, 0xc7, 0x4b, 0x4f, 0x90, 0x69, 0x5b, 0x33, 0x4b, + 0x7f, 0x98, 0x6c, 0x80, 0xbb, 0x87, 0x58, 0x62, 0x52, 0x95, 0x47, 0x5d, + 0x3b, 0x79, 0x31, 0xa7, 0x7f, 0x47, 0xce, 0x86, 0x4a, 0xcd, 0x9f, 0x82, + 0x2a, 
0xbf, 0xda, 0x64, 0x96, 0xa6, 0x8e, 0x67, 0xb8, 0x7c, 0x53, 0x8f, + 0x91, 0x34, 0xb1, 0x7e, 0x3b, 0x99, 0x7b, 0xcb, 0x5a, 0x99, 0xbc, 0x59, + 0xc2, 0x6b, 0x79, 0x40, 0x8c, 0xce, 0x38, 0x43, 0x87, 0xa6, 0x8e, 0x47, + 0x4d, 0xb6, 0x7c, 0x9b, 0x6e, 0xc9, 0x4d, 0xca, 0x49, 0x43, 0x7a, 0x4b, + 0x6b, 0xd7, 0x3b, 0xc9, 0xab, 0xcc, 0xcc, 0x3a, 0x9f, 0x58, 0x74, 0xbb, + 0x4f, 0x84, 0xa0, 0xb7, 0xb5, 0x9c, 0x44, 0x53, 0x88, 0xab, 0xc8, 0x40, + 0xd3, 0x8d, 0xb2, 0x43, 0xa8, 0x6c, 0x95, 0x38, 0x60, 0x68, 0x2f, 0x5a, + 0x86, 0x6f, 0x40, 0xb6, 0x6e, 0x92, 0x73, 0xa6, 0x26, 0x65, 0x2b, 0x7b, + 0xa5, 0x65, 0xab, 0xaf, 0x62, 0x4f, 0x92, 0x6c, 0x80, 0x8f, 0x42, 0x57, + 0xc7, 0x48, 0xa4, 0xad, 0x6e, 0x8d, 0x9b, 0x6b, 0x42, 0xbb, 0xd6, 0x71, + 0x40, 0xcb, 0xb0, 0x92, 0xaa, 0x62, 0x9b, 0xaf, 0x70, 0x42, 0xb2, 0x86, + 0x96, 0x74, 0x51, 0x70, 0xc8, 0x3c, 0x4a, 0xda, 0x55, 0xa1, 0x6a, 0x87, + 0x93, 0x57, 0x34, 0xca, 0xbb, 0xab, 0x8d, 0x48, 0x78, 0xbf, 0x98, 0x77, + 0x86, 0x44, 0x5c, 0xbf, 0xd7, 0xc9, 0xa9, 0xad, 0x69, 0x88, 0x53, 0xdb, + 0x51, 0xab, 0xb1, 0x8f, 0x3c, 0xcc, 0xc1, 0xc4, 0x69, 0xc9, 0xac, 0x8e, + 0x5a, 0xdb, 0x52, 0x3f, 0x42, 0xa6, 0xb8, 0x6e, 0x9e, 0xc9, 0xa7, 0x9a, + 0x99, 0x84, 0x44, 0xca, 0xa8, 0x5f, 0x6b, 0xac, 0x91, 0xa2, 0x75, 0xc8, + 0x78, 0xa8, 0xd0, 0x4c, 0x58, 0x4b, 0x6f, 0xc0, 0xc8, 0x47, 0x55, 0x60, + 0x8c, 0x89, 0x38, 0x8e, 0x5e, 0x54, 0x61, 0xa7, 0x37, 0x6d, 0xb5, 0x62, + 0x40, 0x54, 0x3d, 0x49, 0x31, 0x67, 0xbf, 0xd2, 0x71, 0x35, 0x5f, 0xa4, + 0x95, 0x4d, 0xb9, 0x84, 0xcc, 0x79, 0x85, 0x87, 0x92, 0x62, 0x9d, 0x6f, + 0x82, 0xb9, 0x37, 0xab, 0x36, 0xc1, 0xbf, 0xc3, 0x94, 0x98, 0x76, 0x63, + 0x55, 0x82, 0x6f, 0xa4, 0x74, 0x5f, 0xb9, 0xc4, 0x73, 0xb0, 0x8a, 0x71, + 0xa9, 0x38, 0x2b, 0x2d, 0x7d, 0x40, 0x9c, 0x3f, 0xb6, 0xa8, 0x47, 0x85, + 0xc5, 0x52, 0xa7, 0x8b, 0x50, 0x60, 0x53, 0x9e, 0x44, 0xa4, 0xb5, 0xb8, + 0x66, 0xa0, 0x7b, 0x55, 0x9d, 0xaa, 0xce, 0x3a, 0x66, 0x84, 0x43, 0x69, + 0xb4, 0x75, 0xc4, 0xc8, 0x4e, 0xce, 0x35, 0x4d, 0x6d, 0x75, 0x93, 0x30, + 0x3f, 0x92, 0x40, 0xc0, 0xa3, 0x36, 0xce, 0x7c, 0x87, 0xb7, 0x90, 0x3c, + 0xd0, 0x8b, 0x9a, 0x67, 0x65, 0x33, 0x9f, 0x80, 0xb2, 0x79, 0xb4, 0x73, + 0x58, 0x67, 0x4c, 0x7e, 0x68, 0x49, 0x73, 0x55, 0x93, 0x47, 0x3a, 0x34, + 0x71, 0xc0, 0x71, 0xd5, 0xb0, 0xc0, 0xbf, 0x8d, 0xa1, 0x9c, 0x78, 0x46, + 0xd4, 0xbc, 0x89, 0x89, 0xbf, 0x6b, 0x6b, 0x86, 0x51, 0x7c, 0xd6, 0x66, + 0x3c, 0x7b, 0x3e, 0x94, 0x56, 0xb1, 0x87, 0x9b, 0x9e, 0xc1, 0xa6, 0x87, + 0x70, 0xe0, 0x6c, 0x43, 0xca, 0x62, 0x86, 0xba, 0x48, 0xd8, 0x86, 0x48, + 0x80, 0xa5, 0xbe, 0xc8, 0x39, 0x7c, 0x60, 0x6a, 0xad, 0x69, 0xcc, 0x30, + 0x5c, 0x38, 0x76, 0x71, 0xa0, 0xb5, 0x3b, 0x4f, 0x7e, 0xcc, 0x46, 0x8f, + 0x60, 0xde, 0xc7, 0x4c, 0x9e, 0x6f, 0x93, 0x37, 0x36, 0x4e, 0xc2, 0xaf, + 0x3e, 0x36, 0x69, 0x8f, 0x6b, 0xae, 0x2d, 0x8a, 0xd3, 0xb6, 0x4d, 0x74, + 0x84, 0x79, 0x81, 0x5a, 0x8e, 0xa0, 0x91, 0xa1, 0x7e, 0x5b, 0x4c, 0x64, + 0xab, 0x41, 0x88, 0xa8, 0x8a, 0x4a, 0xbf, 0xaf, 0x99, 0xba, 0x8a, 0x8b, + 0x5c, 0xa6, 0x87, 0x3c, 0xb8, 0x53, 0x7b, 0xd3, 0x82, 0x87, 0x77, 0x8d, + 0xb2, 0xd4, 0x8b, 0x9b, 0x86, 0xa2, 0xc4, 0x90, 0x49, 0x9d, 0x5c, 0x77, + 0x97, 0x88, 0x5b, 0xbe, 0x8a, 0x96, 0xbf, 0x48, 0x58, 0x6b, 0xbb, 0xb4, + 0x8c, 0x92, 0xa5, 0x3e, 0xba, 0x93, 0x5b, 0xd3, 0x2f, 0x7f, 0xc8, 0x54, + 0x99, 0xcd, 0x76, 0x73, 0xb3, 0xb3, 0x52, 0x75, 0x8d, 0xa6, 0x4b, 0x39, + 0x81, 0x6e, 0xc0, 0xbb, 0x44, 0x51, 0xa3, 0x89, 0x4c, 0xa2, 0x51, 0x7f, + 0x60, 0x8d, 0xae, 0x89, 0x9e, 0x9a, 0x54, 0x4d, 0x7b, 0xac, 0x60, 0x95, + 0xba, 0x4a, 0xa7, 0x59, 0xab, 0xb5, 0x57, 0x44, 0x65, 0x80, 0xca, 0x45, + 0xa7, 
0x92, 0x47, 0x9f, 0x78, 0xcf, 0x72, 0xac, 0xcb, 0x75, 0x60, 0x9b, + 0x44, 0xaf, 0xbf, 0x7c, 0x78, 0xac, 0x6c, 0x9a, 0xb8, 0x92, 0x65, 0x5f, + 0x33, 0xbe, 0x75, 0xc4, 0x8c, 0x3d, 0x5b, 0x3a, 0x6f, 0x98, 0x79, 0xbf, + 0xb8, 0x57, 0x63, 0xb6, 0xb3, 0x49, 0x6c, 0x97, 0x4f, 0x9a, 0x36, 0x7f, + 0x6b, 0x2b, 0x59, 0x3b, 0x2e, 0x48, 0xec, 0xbd, 0x5e, 0xa7, 0xcd, 0x73, + 0x69, 0xcc, 0x3e, 0x43, 0xa2, 0x2d, 0xc6, 0x42, 0x62, 0x3a, 0x54, 0x3a, + 0xa3, 0x51, 0x87, 0x32, 0x45, 0x6d, 0x61, 0x9d, 0x41, 0x65, 0x91, 0x5e, + 0x46, 0xb9, 0x3a, 0x58, 0x70, 0xd8, 0x2f, 0x38, 0x78, 0xb8, 0x5b, 0xb7, + 0x39, 0x76, 0x5d, 0x49, 0x57, 0xb1, 0x5a, 0x84, 0x96, 0x3d, 0x4a, 0xbd, + 0x69, 0x7b, 0x40, 0x5d, 0xd4, 0xcf, 0x6a, 0x49, 0x3e, 0xa4, 0x46, 0x60, + 0x8e, 0x8c, 0x85, 0xa3, 0x56, 0x4d, 0xc8, 0x58, 0xc7, 0x70, 0x8f, 0x73, + 0x99, 0x84, 0x9d, 0x5b, 0x55, 0x4a, 0xa7, 0x83, 0xc7, 0x57, 0xa4, 0x89, + 0xb5, 0x93, 0x66, 0xc0, 0x91, 0xbb, 0x78, 0x7e, 0xad, 0x9b, 0x8b, 0xc7, + 0x2f, 0x83, 0xb5, 0xc4, 0x2d, 0x56, 0xb6, 0x8a, 0x71, 0xb5, 0x91, 0xb3, + 0xa5, 0xca, 0xa9, 0x57, 0x99, 0x4b, 0xc1, 0xb4, 0x44, 0x99, 0xa4, 0x9b, + 0x47, 0x4a, 0xba, 0xe5, 0xa6, 0x7a, 0x39, 0xa4, 0xbf, 0xc5, 0x46, 0xbd, + 0xbb, 0x32, 0x9d, 0xc2, 0x6f, 0x58, 0x4e, 0x96, 0xab, 0x4b, 0x3a, 0x73, + 0x81, 0xb4, 0x35, 0x3e, 0xa7, 0xd0, 0x39, 0x32, 0x6a, 0x51, 0x64, 0xb5, + 0x6b, 0xb2, 0x7d, 0x3a, 0xba, 0x6c, 0xa9, 0x97, 0x57, 0x77, 0xc9, 0x4b, + 0x6f, 0x75, 0x48, 0x59, 0x95, 0xb1, 0x59, 0x8c, 0xa0, 0x2f, 0xa5, 0x6d, + 0x70, 0xbc, 0x6d, 0x50, 0x70, 0x8c, 0xa3, 0x28, 0x30, 0x31, 0x98, 0x73, + 0x71, 0x48, 0xb1, 0x6d, 0x65, 0xd2, 0x46, 0x88, 0x64, 0x90, 0xb2, 0x5b, + 0x68, 0x50, 0xa7, 0xc1, 0x7b, 0xde, 0x68, 0xb0, 0x4a, 0x82, 0x75, 0x8e, + 0x8b, 0xa8, 0x30, 0x8c, 0xbb, 0x5c, 0x68, 0x69, 0x68, 0x7d, 0xa5, 0x54, + 0x56, 0x4c, 0x34, 0x39, 0x5b, 0xad, 0x43, 0xb3, 0xb0, 0xbb, 0x57, 0x8d, + 0xb4, 0x72, 0x5f, 0xb4, 0xde, 0x6e, 0x30, 0x75, 0x79, 0xc4, 0x9f, 0xa5, + 0x94, 0x7b, 0x34, 0x30, 0x6a, 0x77, 0x57, 0x8a, 0x3d, 0xa7, 0x87, 0x30, + 0x2d, 0x94, 0x84, 0x53, 0x48, 0x44, 0xd7, 0x50, 0x6e, 0x8c, 0x92, 0xdc, + 0x8a, 0x61, 0x32, 0xb8, 0x58, 0x35, 0x40, 0xd0, 0x63, 0xd0, 0x67, 0xae, + 0x49, 0x71, 0xc4, 0x5d, 0x47, 0x5f, 0x35, 0xc1, 0x98, 0x75, 0xa1, 0x32, + 0x8b, 0x9d, 0xa0, 0x31, 0xd5, 0x64, 0x7b, 0xa7, 0x9d, 0x6b, 0xb2, 0x6c, + 0xd1, 0xa8, 0x8b, 0xad, 0xa4, 0x8b, 0x44, 0x90, 0x7d, 0x45, 0xcb, 0x6c, + 0x92, 0x9a, 0xc7, 0xc5, 0x8f, 0x7d, 0xb6, 0xc6, 0x41, 0x41, 0xb9, 0x78, + 0x49, 0x3b, 0x3f, 0x84, 0x8f, 0x91, 0x8b, 0xc9, 0x9f, 0xb7, 0xbc, 0xb6, + 0x41, 0x45, 0x76, 0x76, 0xac, 0x3e, 0xd0, 0xa5, 0x96, 0x45, 0x89, 0xb8, + 0x46, 0x58, 0x66, 0x55, 0x8d, 0xa0, 0xc4, 0xa4, 0xbc, 0x78, 0x64, 0x75, + 0x61, 0x72, 0xb4, 0x37, 0x8b, 0xb6, 0x92, 0xb1, 0x8c, 0x71, 0xd6, 0x4a, + 0x43, 0x4a, 0x6d, 0xb8, 0xbb, 0x7e, 0x71, 0x5a, 0xc1, 0x37, 0x68, 0xac, + 0x89, 0x25, 0xbd, 0x43, 0x52, 0x91, 0x69, 0xcb, 0x87, 0x66, 0x51, 0x51, + 0x5a, 0x98, 0xb9, 0xcd, 0x96, 0x80, 0x50, 0x4e, 0x88, 0x91, 0xae, 0xba, + 0x62, 0x24, 0x58, 0x97, 0x9a, 0xa5, 0xc5, 0x7f, 0xd4, 0x49, 0x8e, 0x34, + 0x86, 0x42, 0x59, 0xc4, 0xcb, 0xb9, 0xa8, 0x7f, 0x9b, 0x4d, 0xcb, 0x63, + 0x8e, 0x29, 0x8e, 0xbd, 0x71, 0x63, 0x56, 0x90, 0xc1, 0xa9, 0x68, 0x2f, + 0xca, 0x7b, 0x62, 0x3e, 0x8e, 0x40, 0x48, 0x70, 0x33, 0x39, 0x3f, 0x43, + 0xa1, 0x4b, 0x99, 0xbd, 0x93, 0x62, 0x5d, 0xab, 0x83, 0x25, 0x79, 0xdb, + 0x2a, 0x91, 0x85, 0x6a, 0xb0, 0x5d, 0x3b, 0x91, 0x91, 0x49, 0x60, 0x28, + 0x77, 0xd2, 0xbf, 0xa3, 0x8a, 0x33, 0x83, 0x9e, 0xac, 0xac, 0xca, 0x33, + 0xc0, 0xd8, 0xbb, 0xa7, 0x61, 0x7e, 0x41, 0xc3, 0xc5, 0x9a, 0xb5, 0x70, + 0x6c, 
0x40, 0x5c, 0xc0, 0x3a, 0x88, 0xb6, 0x69, 0x98, 0x53, 0x54, 0xa6, + 0x4f, 0x5c, 0x94, 0x8d, 0x85, 0xc7, 0xa6, 0xb3, 0x60, 0xb1, 0x9a, 0x3f, + 0xb3, 0xbc, 0x58, 0x44, 0xbd, 0x77, 0x39, 0x85, 0xa8, 0xbf, 0x7c, 0xb7, + 0xcd, 0xd2, 0xa0, 0x3f, 0xcf, 0x81, 0x5d, 0xa5, 0x4a, 0xaa, 0x70, 0x9e, + 0x8d, 0x72, 0xd2, 0x67, 0x5e, 0x8f, 0xb5, 0xab, 0x7a, 0xbb, 0x9f, 0xa8, + 0xc4, 0x75, 0x9b, 0xd1, 0x35, 0xbf, 0x60, 0x77, 0x5d, 0xc4, 0x45, 0x82, + 0x79, 0x98, 0x69, 0x49, 0x46, 0x81, 0xc0, 0x46, 0xb1, 0x7d, 0x76, 0x6b, + 0xd1, 0x81, 0xaf, 0x83, 0xa7, 0x78, 0xac, 0xa8, 0x4b, 0x9e, 0x62, 0x40, + 0xbd, 0x7b, 0xbd, 0x3b, 0x49, 0x7e, 0x52, 0xdc, 0x5b, 0x77, 0xb4, 0x66, + 0x51, 0x4f, 0x80, 0x58, 0xb7, 0x3b, 0x32, 0x89, 0xce, 0x9a, 0x81, 0x83, + 0xaf, 0x63, 0x72, 0x8b, 0x5c, 0x79, 0x39, 0x69, 0xbd, 0x6d, 0x9b, 0x65, + 0x38, 0xc5, 0x37, 0x6a, 0x84, 0x8d, 0xc0, 0x72, 0x87, 0x98, 0x9d, 0x4f, + 0x42, 0x83, 0x34, 0x24, 0x9d, 0x83, 0x3d, 0x39, 0x97, 0xb8, 0xad, 0xa6, + 0x37, 0x98, 0x94, 0xc2, 0x4d, 0x73, 0x47, 0xd8, 0x81, 0x9f, 0x8d, 0x51, + 0xc2, 0x62, 0x42, 0x48, 0x7f, 0x32, 0x9f, 0x72, 0x39, 0xa2, 0xbf, 0xa9, + 0x79, 0x94, 0x8c, 0x59, 0xbf, 0x47, 0x3f, 0xae, 0x75, 0x57, 0x5e, 0xd0, + 0x6e, 0x49, 0xd7, 0xca, 0xc1, 0x8c, 0x4e, 0x53, 0xb9, 0xce, 0xcd, 0x86, + 0x6b, 0x8f, 0x70, 0xa2, 0x78, 0x40, 0xa8, 0x3e, 0x6c, 0x90, 0xc0, 0x5d, + 0x80, 0x38, 0x87, 0x4b, 0x4f, 0x55, 0x90, 0x35, 0x82, 0xca, 0x84, 0xc2, + 0xc9, 0xc2, 0xbf, 0x6c, 0x37, 0x98, 0x49, 0x60, 0x52, 0xa9, 0x46, 0x6b, + 0x93, 0xa0, 0x83, 0xc9, 0xa0, 0xd1, 0x3b, 0xc2, 0x43, 0x62, 0xa5, 0x78, + 0xaa, 0xaa, 0x86, 0xbc, 0x78, 0x74, 0xa3, 0xd1, 0x9f, 0xcd, 0x7c, 0x5d, + 0x6d, 0xb0, 0x92, 0x4b, 0x92, 0xa8, 0x5c, 0xc6, 0xbc, 0x71, 0xac, 0x3b, + 0x94, 0x58, 0x86, 0x9a, 0xcd, 0x35, 0xb6, 0x67, 0xcf, 0x71, 0x8b, 0xbc, + 0x99, 0x55, 0xd2, 0x8f, 0xb5, 0x6f, 0x54, 0x38, 0x66, 0x62, 0x8c, 0x40, + 0x8f, 0x45, 0xbc, 0x36, 0x60, 0x29, 0xb6, 0x56, 0x9f, 0x70, 0x36, 0x36, + 0x7b, 0x7c, 0x56, 0x7f, 0x77, 0xa4, 0xc6, 0x41, 0x7c, 0xa5, 0xbd, 0xb3, + 0x38, 0x98, 0x9c, 0x88, 0x6c, 0xc8, 0xc2, 0x9c, 0x8e, 0x8c, 0x92, 0xb2, + 0x9b, 0x4c, 0x64, 0xb6, 0x66, 0x91, 0x9b, 0x70, 0x6e, 0xab, 0x81, 0x76, + 0x92, 0xcc, 0xbd, 0x44, 0xd0, 0xb0, 0xbb, 0xa8, 0xba, 0xbe, 0x6e, 0x5b, + 0xb1, 0x35, 0x90, 0x3b, 0x5a, 0x8f, 0xa3, 0x92, 0x7f, 0xa1, 0x85, 0x39, + 0x4e, 0x8f, 0x57, 0x55, 0x49, 0xbe, 0x46, 0x9d, 0x55, 0xaa, 0x93, 0xc3, + 0x3f, 0xa7, 0x63, 0x6b, 0x5f, 0x41, 0x3a, 0x6b, 0x88, 0xc1, 0x5b, 0x70, + 0x59, 0x33, 0x66, 0x34, 0xa9, 0x4e, 0x64, 0x93, 0x84, 0x75, 0x5d, 0x4a, + 0x8c, 0x80, 0x81, 0xd0, 0x99, 0x3e, 0x3d, 0x4d, 0x7d, 0x40, 0x81, 0x79, + 0x80, 0xba, 0x4f, 0x91, 0xa9, 0x63, 0x6b, 0xa6, 0x99, 0xa2, 0x44, 0x72, + 0x90, 0x79, 0x31, 0x92, 0x7a, 0x49, 0xc5, 0x59, 0xa4, 0xaa, 0x96, 0xa7, + 0x62, 0x7f, 0x97, 0xaa, 0xa3, 0xad, 0x3d, 0x48, 0xd4, 0xb7, 0x9a, 0xb3, + 0x43, 0xad, 0xac, 0x9c, 0x44, 0x46, 0x72, 0xa7, 0xc2, 0x97, 0x75, 0xc7, + 0x55, 0x80, 0x91, 0x7c, 0x82, 0x68, 0x4f, 0x49, 0x66, 0x4b, 0x53, 0xb5, + 0x39, 0xac, 0x47, 0x72, 0xa8, 0x3e, 0x52, 0x9d, 0x96, 0x62, 0xa9, 0x8c, + 0x92, 0x39, 0x4e, 0xb2, 0xb0, 0x9c, 0x3d, 0x8a, 0x73, 0xaa, 0x95, 0x8b, + 0x79, 0x4f, 0x86, 0x91, 0x40, 0xcd, 0x75, 0x5a, 0x69, 0xca, 0x30, 0xa2, + 0xc6, 0x40, 0xa9, 0xd1, 0x48, 0x5e, 0xac, 0x3a, 0xb9, 0x87, 0x53, 0xd2, + 0xbc, 0x4c, 0xc4, 0xb5, 0x4d, 0xb0, 0xc0, 0xbd, 0xaf, 0x5c, 0x50, 0x93, + 0xb6, 0x75, 0x86, 0xa1, 0x7a, 0xcc, 0x80, 0x41, 0xa0, 0x9c, 0xba, 0x62, + 0x40, 0x9b, 0x67, 0x73, 0xcc, 0xa6, 0xb6, 0x54, 0xa0, 0xc3, 0xc5, 0x43, + 0xa3, 0xb9, 0xc9, 0x47, 0x7f, 0x86, 0x90, 0x39, 0xaa, 0x6f, 0xb0, 0x6f, + 0x8b, 
0x93, 0x41, 0x43, 0xa1, 0x77, 0x71, 0xaf, 0x62, 0xc1, 0x76, 0xce, + 0xab, 0x59, 0x46, 0x35, 0x30, 0xa6, 0xbf, 0xc1, 0xb3, 0x80, 0xb0, 0xa4, + 0x50, 0x66, 0x95, 0x36, 0x9f, 0xb3, 0xa8, 0xb1, 0x43, 0xc1, 0x39, 0xcc, + 0x5c, 0x5e, 0x98, 0xc5, 0xa5, 0x2f, 0x7e, 0xc5, 0xcc, 0xaa, 0xc5, 0x40, + 0x48, 0x53, 0x9b, 0x62, 0x54, 0x6b, 0xac, 0x62, 0xa9, 0xc1, 0x2c, 0x7a, + 0x45, 0x42, 0x6b, 0x64, 0xa8, 0x35, 0x6b, 0x73, 0x3a, 0x99, 0xce, 0x56, + 0x75, 0xbd, 0x75, 0x88, 0xd1, 0x72, 0x85, 0x77, 0x52, 0x63, 0x6e, 0x7d, + 0x64, 0x65, 0x58, 0x5c, 0x82, 0x49, 0x5b, 0xae, 0x8d, 0x6a, 0x5e, 0x5c, + 0x36, 0xa3, 0x66, 0x34, 0x4e, 0x7c, 0x5a, 0x37, 0xa8, 0x70, 0xb8, 0x84, + 0x66, 0x32, 0x8b, 0x97, 0x3c, 0xc4, 0x7a, 0x95, 0x91, 0x62, 0x7f, 0x77, + 0xbd, 0xca, 0x57, 0x84, 0x6c, 0x8c, 0x31, 0x66, 0x7f, 0x85, 0x71, 0xd5, + 0x55, 0xab, 0xc8, 0xc5, 0x8f, 0x94, 0xa3, 0xca, 0x46, 0x44, 0x34, 0x84, + 0x3e, 0x6f, 0xb4, 0x46, 0xbe, 0xa5, 0x60, 0x84, 0xae, 0xaf, 0xac, 0xae, + 0x83, 0xce, 0x64, 0x59, 0x8f, 0x53, 0x3a, 0x5b, 0xb9, 0x5e, 0xbc, 0x36, + 0x57, 0x5a, 0xa6, 0x4c, 0x38, 0x4d, 0xc1, 0x5a, 0xc1, 0x36, 0x57, 0xb2, + 0x55, 0x7a, 0x76, 0xb7, 0xac, 0x5a, 0x37, 0xa7, 0x3c, 0x3b, 0x59, 0x5b, + 0x6d, 0x29, 0x74, 0x86, 0xaa, 0x60, 0xb7, 0x55, 0xc8, 0x97, 0xa3, 0xc8, + 0x45, 0xc6, 0xa3, 0x4f, 0x5e, 0x7f, 0x27, 0x99, 0x52, 0x50, 0x9d, 0xc5, + 0x46, 0x64, 0x32, 0xa8, 0x2c, 0x3a, 0x9b, 0xb9, 0xa8, 0x30, 0x6c, 0xb5, + 0x97, 0x74, 0x6e, 0x8f, 0x76, 0xb6, 0x87, 0xc7, 0x92, 0x6d, 0x59, 0xb7, + 0xba, 0x62, 0xb1, 0xaa, 0xae, 0xc3, 0xc2, 0xc2, 0x76, 0x97, 0xc0, 0xac, + 0xc5, 0x91, 0x41, 0xbe, 0xa8, 0x6e, 0xc3, 0xc0, 0x58, 0x40, 0x62, 0x50, + 0x45, 0x32, 0x8b, 0x53, 0x44, 0x61, 0x32, 0x89, 0x6b, 0xa2, 0x86, 0x3b, + 0x40, 0xc7, 0x4b, 0xbc, 0xc9, 0xa5, 0x55, 0x59, 0x91, 0x6f, 0x69, 0xb7, + 0x69, 0x9d, 0x3c, 0x5b, 0x83, 0x4f, 0xc8, 0x5b, 0x35, 0x7e, 0x79, 0x58, + 0x4f, 0xbd, 0x67, 0x99, 0xb3, 0x5b, 0x55, 0x60, 0xa3, 0x66, 0x68, 0x9d, + 0xc7, 0x43, 0xc5, 0xb6, 0x79, 0x6f, 0x92, 0xc4, 0x9d, 0x7d, 0x9d, 0xa0, + 0x61, 0x47, 0xb7, 0xc8, 0xc8, 0x84, 0x57, 0x7c, 0x91, 0x4a, 0xa2, 0x81, + 0x6a, 0x9f, 0x6f, 0xe2, 0xae, 0x9c, 0x97, 0xa5, 0x34, 0xa3, 0x53, 0x9b, + 0x69, 0xa1, 0xc3, 0x77, 0x7c, 0x3c, 0x92, 0x4a, 0x50, 0xc8, 0x35, 0xb3, + 0x47, 0x63, 0x73, 0x8f, 0x8d, 0xa3, 0xbf, 0x49, 0x39, 0x49, 0x9e, 0x93, + 0x89, 0x6e, 0x83, 0xce, 0x7d, 0xdd, 0xa7, 0x7a, 0x3b, 0x7e, 0x9c, 0xba, + 0x60, 0x8d, 0xa6, 0x5e, 0x9a, 0x67, 0x4a, 0x8a, 0x87, 0x4a, 0x84, 0x4b, + 0xc0, 0x61, 0xca, 0xba, 0xa4, 0x46, 0xd1, 0x63, 0x56, 0x40, 0x6c, 0x60, + 0x8b, 0xde, 0x5d, 0x4c, 0xd7, 0x70, 0x46, 0x4e, 0x41, 0x8b, 0x83, 0x5b, + 0x4e, 0x46, 0xa0, 0x9a, 0xc3, 0x41, 0xaf, 0xc4, 0x2e, 0x73, 0x45, 0x99, + 0xb7, 0xd1, 0xc4, 0x4e, 0x57, 0x69, 0x31, 0x65, 0x39, 0x57, 0x3c, 0x47, + 0x8e, 0xc3, 0x63, 0x47, 0x87, 0xac, 0xa7, 0x39, 0x7b, 0x43, 0x9e, 0xb1, + 0x73, 0x7f, 0xc3, 0x55, 0xb5, 0x99, 0xaf, 0x50, 0x5d, 0xc1, 0x55, 0xca, + 0x75, 0x37, 0x8b, 0xd4, 0x6a, 0x48, 0x77, 0xa2, 0x9c, 0xcd, 0x84, 0xb0, + 0x66, 0x7f, 0x9d, 0x9f, 0x94, 0xa1, 0x86, 0xd1, 0x7f, 0x9e, 0x3d, 0x8d, + 0x92, 0xc8, 0xc6, 0xcd, 0x70, 0xc2, 0x87, 0xc2, 0x5e, 0x97, 0x78, 0x66, + 0xe1, 0xcb, 0x57, 0x8a, 0x7c, 0xad, 0xaa, 0xb0, 0x86, 0xb6, 0x6e, 0x5f, + 0xcc, 0x3b, 0xa3, 0x8a, 0x5f, 0x61, 0xa7, 0x70, 0x34, 0xaa, 0xa8, 0xb0, + 0xc6, 0x70, 0xc7, 0x4a, 0xc6, 0x79, 0xcc, 0x91, 0x6d, 0x76, 0x55, 0xd8, + 0xd7, 0x75, 0xb1, 0x56, 0x65, 0x8b, 0x3a, 0x94, 0xbf, 0x88, 0x7f, 0x3b, + 0x6b, 0xcf, 0xce, 0xaa, 0xa0, 0x2d, 0x62, 0x40, 0x40, 0x63, 0xb7, 0x3d, + 0x8b, 0x3f, 0x7f, 0x8b, 0x30, 0x4c, 0xb2, 0x40, 0x9a, 0x3a, 0x89, 0xba, + 0x80, 
0x4a, 0x34, 0x9f, 0x85, 0x98, 0x5f, 0x6b, 0x5b, 0x47, 0x38, 0x38, + 0x91, 0x93, 0xbb, 0x77, 0xd3, 0x5e, 0x53, 0x73, 0xc9, 0x76, 0x51, 0x96, + 0x49, 0xcb, 0x69, 0x62, 0xb2, 0x5f, 0xb5, 0x29, 0x57, 0x6b, 0x72, 0x92, + 0x8c, 0x6c, 0x51, 0x66, 0xcb, 0xac, 0xb8, 0x6d, 0xdb, 0x59, 0x2d, 0x44, + 0x81, 0x4d, 0x42, 0x4a, 0x39, 0x42, 0x72, 0x9c, 0xb8, 0x91, 0xa5, 0xa9, + 0x78, 0x71, 0xc0, 0xac, 0x80, 0x60, 0xa1, 0xa0, 0x7c, 0xa3, 0x60, 0x9b, + 0xb6, 0x8e, 0xaa, 0xcb, 0x5c, 0x86, 0x78, 0xae, 0xc6, 0x80, 0x7b, 0x3d, + 0xce, 0xc1, 0xa2, 0x25, 0xd1, 0x8d, 0xb4, 0x35, 0x64, 0x65, 0xda, 0x97, + 0x53, 0xbe, 0x41, 0x7b, 0x40, 0x85, 0x6d, 0xac, 0x36, 0x74, 0xa0, 0x41, + 0xb2, 0x59, 0xbc, 0x97, 0xa1, 0xbf, 0x64, 0x51, 0xaa, 0xbb, 0x45, 0xa8, + 0x7e, 0x3c, 0x5f, 0x8e, 0xb2, 0xcb, 0x85, 0xba, 0xb6, 0x87, 0x9a, 0xd0, + 0xa5, 0x91, 0x50, 0x89, 0x88, 0xa8, 0x49, 0x75, 0xd7, 0x6f, 0x7b, 0x5b, + 0xb9, 0x87, 0x81, 0xc5, 0xa1, 0xb0, 0x66, 0x59, 0x2f, 0x80, 0x5b, 0x6b, + 0x2d, 0x9e, 0xc0, 0xd6, 0x91, 0x7d, 0x42, 0xa8, 0x8a, 0xb1, 0x3b, 0x6c, + 0x80, 0xa5, 0x9e, 0x64, 0x8e, 0x3c, 0x64, 0xc3, 0xac, 0xb0, 0xb1, 0x95, + 0xcf, 0xb2, 0x6a, 0xb8, 0x52, 0x2e, 0x63, 0x71, 0x76, 0x41, 0x44, 0xc4, + 0x9d, 0x65, 0x46, 0xaa, 0xb3, 0x61, 0x7c, 0x73, 0xc4, 0xad, 0x35, 0x6e, + 0x98, 0x2e, 0xb8, 0x4c, 0xa0, 0xbf, 0x9c, 0xd8, 0x7e, 0xc8, 0xc3, 0x54, + 0xa8, 0xa2, 0xd0, 0x44, 0x5c, 0x2c, 0xa6, 0xb8, 0xb9, 0x75, 0x40, 0x32, + 0x3b, 0xca, 0xa5, 0x99, 0xc3, 0x35, 0xc7, 0x3d, 0xa7, 0x67, 0x6c, 0x65, + 0x6e, 0xae, 0x54, 0xb4, 0xcd, 0x95, 0x3d, 0xb2, 0xd1, 0xb6, 0xc5, 0x5e, + 0xa8, 0x86, 0xa7, 0x34, 0xc5, 0xa1, 0x6b, 0x3e, 0xd7, 0x8f, 0xcc, 0x41, + 0x3f, 0x41, 0x8d, 0x6c, 0x71, 0x7d, 0xab, 0xc8, 0xb2, 0xa5, 0x6d, 0x3d, + 0x94, 0x92, 0x79, 0x55, 0x9f, 0xa0, 0xa0, 0x71, 0xb1, 0x8e, 0x87, 0x71, + 0x60, 0xa3, 0x9f, 0x48, 0xd0, 0x71, 0x90, 0x45, 0x85, 0x71, 0x9b, 0xa6, + 0xb5, 0x52, 0x35, 0x35, 0x9b, 0x97, 0x55, 0xae, 0x5b, 0x81, 0xc7, 0x32, + 0x5e, 0x40, 0x4e, 0x6e, 0x9a, 0x8d, 0x5d, 0x37, 0x98, 0x8c, 0x9b, 0x67, + 0x96, 0x7c, 0xac, 0xa5, 0xc7, 0x97, 0x54, 0xb6, 0x6a, 0x51, 0x5e, 0xa5, + 0x90, 0x8c, 0xa4, 0xbd, 0x67, 0xa5, 0x3d, 0x76, 0x43, 0x8f, 0xc5, 0xa7, + 0x70, 0x95, 0xa5, 0x58, 0x87, 0x5d, 0x62, 0xdf, 0xb6, 0x4f, 0x92, 0x51, + 0x8a, 0x8f, 0xb5, 0x7a, 0xa6, 0x7b, 0x9d, 0xd3, 0xa4, 0xd1, 0x3c, 0xd7, + 0xae, 0x56, 0xd1, 0xb3, 0x63, 0x98, 0x67, 0x6d, 0x16, 0xa7, 0xc9, 0xa2, + 0x79, 0xc5, 0xcc, 0x6a, 0xcd, 0x8b, 0x3d, 0xcf, 0x57, 0x9f, 0x51, 0xa6, + 0x7f, 0x48, 0x6e, 0x7e, 0x64, 0xcb, 0x91, 0x4d, 0xad, 0x4c, 0x66, 0x8a, + 0x49, 0x48, 0xb4, 0x56, 0x32, 0xa5, 0xa3, 0x7c, 0x50, 0xa7, 0x51, 0xb6, + 0xcf, 0x84, 0x94, 0xc9, 0xbe, 0x6c, 0x6b, 0xb5, 0x7d, 0x6d, 0xa4, 0x8e, + 0xd0, 0x67, 0xc4, 0xc5, 0x8f, 0xce, 0x76, 0xcb, 0xc5, 0x5d, 0x33, 0x76, + 0x9e, 0x2d, 0xd5, 0x83, 0x3b, 0xce, 0xc4, 0xb3, 0xc0, 0xcf, 0x91, 0x43, + 0x54, 0xd5, 0x53, 0x82, 0x8f, 0xc4, 0x47, 0xb1, 0x68, 0x87, 0x5b, 0x7d, + 0x71, 0x3d, 0x81, 0x46, 0xa0, 0x9e, 0xc2, 0x59, 0xc2, 0x73, 0x5e, 0x75, + 0xba, 0x63, 0x4e, 0x6b, 0xc7, 0xac, 0xc9, 0x67, 0x9e, 0x72, 0x81, 0x4e, + 0x32, 0x7c, 0x65, 0xc8, 0x3b, 0xb3, 0xa5, 0x41, 0x7e, 0x6a, 0x97, 0x63, + 0x3f, 0x60, 0x79, 0xb5, 0x5f, 0x61, 0x82, 0xc7, 0x71, 0x2b, 0xb7, 0xcb, + 0x86, 0x8e, 0x9a, 0x54, 0xc1, 0x71, 0x43, 0xb6, 0x76, 0xd0, 0xc2, 0x61, + 0x7c, 0x57, 0x43, 0xcc, 0xc0, 0x8a, 0x72, 0x62, 0x94, 0x6a, 0x90, 0x78, + 0xa6, 0xcb, 0x42, 0x99, 0x39, 0xbc, 0x90, 0x71, 0x5f, 0x61, 0x61, 0xb3, + 0x30, 0x43, 0xd3, 0xbe, 0x96, 0x35, 0xc6, 0xdb, 0xc6, 0xa3, 0xb9, 0xb2, + 0x39, 0xb6, 0xaa, 0x94, 0x56, 0xcc, 0x5a, 0x7a, 0xaa, 0x5e, 0xd5, 0x68, + 0x86, 
0x72, 0x9c, 0x88, 0x99, 0x95, 0x60, 0xa3, 0xa8, 0x51, 0x2d, 0x32, + 0x73, 0x54, 0x46, 0x8e, 0x63, 0x91, 0xa3, 0x8b, 0x5c, 0x6d, 0xa2, 0x72, + 0xcb, 0xb0, 0x9e, 0x82, 0xb0, 0xc7, 0x65, 0x88, 0x4e, 0x58, 0x32, 0x94, + 0xaf, 0xa7, 0x84, 0xbe, 0xab, 0x90, 0xc0, 0xc6, 0xb7, 0x8d, 0xd1, 0xa0, + 0x4f, 0x8c, 0xbf, 0x73, 0x93, 0x59, 0x5d, 0x3b, 0xba, 0x72, 0xa1, 0x84, + 0x5a, 0xbf, 0x4a, 0x78, 0xa2, 0xc3, 0xca, 0x85, 0x57, 0xc0, 0x3b, 0x61, + 0x98, 0x9f, 0x96, 0xa8, 0xc9, 0x8d, 0xc6, 0x40, 0x3c, 0x55, 0x44, 0x8f, + 0x8e, 0xc9, 0x82, 0x3e, 0x64, 0x4f, 0x9f, 0xb3, 0x68, 0xa4, 0x5b, 0xb6, + 0x32, 0x9e, 0x76, 0x88, 0x36, 0x67, 0x6f, 0x5b, 0xb9, 0xd0, 0xaa, 0xb5, + 0xa5, 0x80, 0x7d, 0x70, 0x81, 0x3d, 0x6a, 0xa1, 0xb3, 0xb3, 0xbe, 0x7f, + 0x34, 0x9a, 0x72, 0x9a, 0x7f, 0x65, 0x69, 0xa9, 0x6f, 0x75, 0x93, 0x51, + 0xbd, 0xa3, 0xbf, 0x3a, 0xa4, 0x50, 0x3b, 0x42, 0x81, 0x49, 0x3a, 0x41, + 0x33, 0x74, 0xa6, 0x75, 0xc7, 0x3b, 0x70, 0xa6, 0xc5, 0x8d, 0x72, 0xaf, + 0x39, 0x5a, 0xc5, 0xba, 0x4e, 0x4c, 0xb8, 0xe0, 0x50, 0x68, 0xba, 0x9a, + 0x37, 0x4e, 0x3d, 0x6f, 0xb5, 0xb3, 0xcd, 0x6e, 0x3c, 0x98, 0x38, 0x7b, + 0x81, 0x8a, 0x2b, 0xa0, 0x65, 0xd4, 0xa2, 0xc6, 0x84, 0xca, 0x53, 0xa7, + 0x39, 0x83, 0xbc, 0x5c, 0xc2, 0x59, 0x78, 0xa3, 0x39, 0x6c, 0xba, 0xbf, + 0xd8, 0x76, 0x8a, 0xd4, 0x78, 0x45, 0x81, 0x94, 0xd0, 0x66, 0x62, 0x98, + 0xa1, 0x38, 0x69, 0xae, 0x3e, 0x3c, 0x45, 0xcc, 0x99, 0xa8, 0x80, 0x4c, + 0x60, 0x7a, 0x5b, 0xaa, 0xbb, 0x93, 0x7e, 0x64, 0xc9, 0x98, 0x63, 0x78, + 0xb2, 0x9f, 0x64, 0xce, 0x9d, 0x44, 0xa3, 0x4a, 0xcb, 0xcc, 0xbf, 0x38, + 0x53, 0xb5, 0x58, 0x39, 0x72, 0x53, 0x6c, 0x99, 0x3f, 0x46, 0xb5, 0xbf, + 0xaa, 0x78, 0x7a, 0xa9, 0xaf, 0x56, 0xb0, 0x7a, 0x7c, 0xb7, 0x9c, 0x3c, + 0xd0, 0x3b, 0x81, 0x6b, 0xad, 0x55, 0x69, 0x46, 0x7d, 0x95, 0x74, 0x97, + 0x3f, 0xb4, 0x8e, 0x9f, 0x40, 0x5d, 0x3d, 0x3b, 0x3c, 0x8d, 0x51, 0xa2, + 0x6d, 0xa9, 0x45, 0x69, 0x49, 0x8e, 0x3a, 0x70, 0x8d, 0x78, 0xcf, 0x4e, + 0x4a, 0x3f, 0x89, 0xc9, 0x8f, 0x81, 0xb3, 0xc3, 0x6e, 0xa6, 0x39, 0x62, + 0x61, 0x4f, 0xa9, 0x89, 0x36, 0x33, 0xb5, 0x6e, 0xc2, 0x9f, 0x5b, 0x86, + 0x42, 0xa8, 0x89, 0xad, 0xc5, 0x49, 0x83, 0x85, 0xb2, 0x41, 0x88, 0xa2, + 0xb3, 0xac, 0xcb, 0x82, 0x61, 0xae, 0xa6, 0x8a, 0x8f, 0x7d, 0x7f, 0xc4, + 0x9c, 0xdb, 0xa8, 0x86, 0xc0, 0x6a, 0xca, 0xbe, 0xc5, 0x3c, 0x3e, 0xb8, + 0x67, 0xc8, 0xb0, 0x80, 0xb7, 0x5a, 0xb6, 0x70, 0x5e, 0xcf, 0xa7, 0x89, + 0x4c, 0xbc, 0xca, 0x76, 0xbd, 0x8a, 0x98, 0xa7, 0x6a, 0xb8, 0x77, 0x59, + 0xcb, 0x6b, 0x86, 0x57, 0x7c, 0xa3, 0x48, 0x9a, 0xb7, 0x76, 0x52, 0x51, + 0xa5, 0x97, 0x6d, 0xad, 0xa8, 0x45, 0x74, 0xbf, 0x30, 0xb5, 0x5a, 0x5d, + 0x64, 0x9f, 0x99, 0xb6, 0xa8, 0x32, 0x9c, 0x4f, 0x85, 0x6d, 0x84, 0xaa, + 0xb9, 0x6c, 0xc1, 0xc4, 0xb9, 0xad, 0xba, 0x7f, 0x56, 0x53, 0x2b, 0xad, + 0x8e, 0x98, 0x8a, 0x9b, 0x85, 0xbd, 0x7d, 0xa7, 0xc2, 0x9a, 0x57, 0x6e, + 0xbd, 0x83, 0x55, 0xc7, 0xa2, 0xbb, 0xbc, 0x4a, 0x98, 0xca, 0x79, 0x84, + 0x3d, 0xaa, 0x47, 0xa3, 0x9b, 0x30, 0xa7, 0x9a, 0x36, 0xd6, 0x2f, 0x5f, + 0x35, 0x41, 0x95, 0x5e, 0xa9, 0x28, 0x44, 0xc6, 0x99, 0xbf, 0x30, 0x41, + 0x8d, 0x8e, 0x8f, 0x66, 0x31, 0x67, 0xb4, 0x65, 0x82, 0x3a, 0x4e, 0xad, + 0x6f, 0xcd, 0xbf, 0x9f, 0xb7, 0xa8, 0x34, 0x48, 0x54, 0x4a, 0x9b, 0x6c, + 0x63, 0x58, 0x56, 0x34, 0x8d, 0xb3, 0x78, 0xbe, 0x80, 0x88, 0x7c, 0x64, + 0xca, 0x6b, 0x9b, 0xc2, 0x9d, 0xc3, 0x53, 0xae, 0x3d, 0x7b, 0x62, 0xc7, + 0x57, 0x32, 0xda, 0x74, 0xa4, 0x7a, 0xca, 0x7e, 0xb7, 0x93, 0x45, 0xd6, + 0x88, 0x93, 0xba, 0xb8, 0x9d, 0x64, 0xad, 0x43, 0x4e, 0xa9, 0x84, 0xca, + 0x65, 0xac, 0xc5, 0xb3, 0x36, 0xa4, 0x3d, 0x54, 0xb8, 0xb5, 0x86, 0x63, + 0x84, 
0x6c, 0x6b, 0xd4, 0xb5, 0x9b, 0x59, 0x81, 0x4b, 0xa7, 0x87, 0x5c, + 0x78, 0xa8, 0xa9, 0xa2, 0xb9, 0x2d, 0x2b, 0x54, 0x46, 0x42, 0xa9, 0x48, + 0x9a, 0xb2, 0x60, 0x65, 0x34, 0x89, 0xa3, 0x7b, 0x88, 0x6c, 0x4a, 0x73, + 0x91, 0x48, 0x53, 0x57, 0xc9, 0xb6, 0x55, 0x4f, 0xa5, 0x55, 0xb0, 0xa2, + 0x84, 0xa1, 0xb8, 0xcc, 0x7e, 0xb1, 0x43, 0x43, 0x37, 0xba, 0xc6, 0x92, + 0xa7, 0xbf, 0xa1, 0x46, 0x78, 0x53, 0x98, 0x5f, 0x8f, 0x3e, 0x6d, 0xb0, + 0x97, 0x9a, 0x3b, 0x33, 0x9a, 0x5c, 0xae, 0x7a, 0x7c, 0xce, 0x44, 0x5b, + 0x77, 0x6e, 0x5e, 0x35, 0x46, 0xb5, 0x48, 0x82, 0x76, 0x6b, 0x5b, 0x97, + 0x68, 0xba, 0x3f, 0x43, 0xab, 0x8b, 0x83, 0x90, 0x90, 0xca, 0x48, 0x7c, + 0x49, 0x7a, 0x50, 0xa2, 0xad, 0x2a, 0x6f, 0xa2, 0x7f, 0x89, 0x92, 0xb0, + 0xa0, 0x33, 0xc2, 0x6c, 0x81, 0x9d, 0x9b, 0xb8, 0x2f, 0xb3, 0xb0, 0x83, + 0x90, 0x3e, 0x47, 0x7d, 0xc5, 0xc6, 0xbf, 0x5a, 0xa7, 0x5e, 0xa2, 0x6a, + 0xba, 0x7e, 0x52, 0x82, 0x2f, 0x29, 0x74, 0x5e, 0xc3, 0x87, 0x2e, 0xa4, + 0x5d, 0x8b, 0x35, 0xbf, 0x26, 0x6d, 0xa9, 0x44, 0x92, 0x54, 0x46, 0xae, + 0x93, 0x4f, 0x3b, 0x35, 0xb2, 0xb5, 0x3b, 0x41, 0x3c, 0x4d, 0xa2, 0xad, + 0x3c, 0x99, 0x63, 0xd8, 0x9d, 0x2e, 0x24, 0xa3, 0xc3, 0xb5, 0x4e, 0x50, + 0x59, 0x2e, 0x18, 0xcb, 0x62, 0x97, 0x85, 0xc8, 0xa0, 0x9e, 0x62, 0xdd, + 0x5b, 0x88, 0x51, 0xbe, 0x50, 0x86, 0x63, 0xaa, 0x9b, 0x72, 0x62, 0x32, + 0x66, 0x27, 0xd3, 0xbf, 0xcd, 0x7b, 0xa3, 0x46, 0x53, 0x89, 0x4d, 0x7a, + 0x7f, 0x52, 0x78, 0x63, 0xae, 0xb8, 0xcb, 0x91, 0xc9, 0x8d, 0xd9, 0xb3, + 0x5a, 0x46, 0x8d, 0x51, 0xb8, 0xa3, 0x4c, 0x40, 0x80, 0x5f, 0x90, 0xbb, + 0xb9, 0xad, 0x99, 0x45, 0x98, 0x77, 0xc1, 0xb7, 0x96, 0x55, 0x9c, 0x34, + 0x59, 0x70, 0x5b, 0x9d, 0x54, 0x2b, 0x43, 0xb8, 0x56, 0xb7, 0x98, 0xd5, + 0x5f, 0xaf, 0xc6, 0xd0, 0xc3, 0x79, 0x7b, 0xbd, 0x82, 0x70, 0x84, 0x48, + 0x34, 0x79, 0x50, 0x37, 0xb5, 0x5e, 0xc6, 0x88, 0xa2, 0x69, 0xb2, 0x7d, + 0x5c, 0x8f, 0x4d, 0x97, 0xcb, 0xc5, 0x7d, 0xce, 0x9e, 0xc3, 0xd2, 0x74, + 0xc7, 0x74, 0xae, 0xbe, 0x60, 0x38, 0x57, 0x68, 0xb7, 0x2f, 0x88, 0x91, + 0x7a, 0x28, 0x91, 0xb7, 0xc5, 0x31, 0x8a, 0xc4, 0x61, 0x3b, 0x6d, 0xc2, + 0x5b, 0xa0, 0x84, 0xa3, 0x53, 0x9c, 0xb3, 0xb5, 0x75, 0x35, 0xbf, 0x7e, + 0xbf, 0xbe, 0xa5, 0x62, 0xe0, 0xa3, 0x71, 0x76, 0x72, 0x3a, 0xa8, 0x69, + 0x6f, 0x83, 0x5a, 0x2f, 0x3a, 0x6f, 0x56, 0x41, 0x65, 0xa6, 0x5b, 0x92, + 0x30, 0x8f, 0xc3, 0xc9, 0xc1, 0xb4, 0x8a, 0x97, 0x98, 0x6c, 0x76, 0x9d, + 0x96, 0x59, 0x85, 0x68, 0x67, 0xd7, 0xb3, 0x9b, 0x52, 0x82, 0x8c, 0xc9, + 0x3d, 0x6d, 0x9d, 0x71, 0x7d, 0x47, 0x89, 0xb1, 0x47, 0x9d, 0x99, 0x65, + 0x61, 0x40, 0xc5, 0x67, 0xad, 0x65, 0x98, 0xb3, 0x4e, 0xa9, 0xa7, 0x4b, + 0xc0, 0x43, 0xa5, 0xce, 0xa0, 0x93, 0x2e, 0xcf, 0xc9, 0x85, 0x28, 0x42, + 0xa8, 0x8f, 0xa3, 0x6d, 0x85, 0xaf, 0xc3, 0xd2, 0x59, 0x8c, 0x6e, 0xce, + 0x92, 0x3e, 0x54, 0x91, 0x49, 0xa1, 0x9d, 0x77, 0x98, 0xc5, 0x6e, 0x5b, + 0x5b, 0xaa, 0xad, 0x51, 0xc4, 0xa3, 0x52, 0x7a, 0xaf, 0x70, 0xa7, 0x92, + 0x93, 0x5d, 0x55, 0x71, 0x76, 0x64, 0x59, 0x5a, 0x51, 0xb4, 0x58, 0x33, + 0x77, 0xa4, 0x8b, 0x6f, 0x9e, 0x85, 0x66, 0x4e, 0xb6, 0xcd, 0x44, 0x78, + 0x8a, 0xb5, 0xbd, 0x62, 0x7b, 0x75, 0x7e, 0xc6, 0x9b, 0x75, 0x44, 0x4f, + 0x6d, 0xc5, 0xcf, 0x47, 0xc9, 0xc0, 0x4a, 0x4c, 0xcb, 0x9c, 0x7b, 0xab, + 0x84, 0xd5, 0x66, 0x74, 0x6f, 0xa6, 0x8a, 0xc9, 0x2f, 0x50, 0x32, 0x42, + 0x40, 0xca, 0x3c, 0x7e, 0x77, 0x58, 0x48, 0xca, 0xb2, 0x79, 0xa0, 0x46, + 0x39, 0x36, 0x7d, 0x85, 0x55, 0x8b, 0xa1, 0x40, 0xcb, 0xba, 0x59, 0x94, + 0xc8, 0x86, 0x71, 0xb3, 0x8d, 0x99, 0x5d, 0x40, 0xa1, 0xa3, 0x9a, 0x46, + 0xaf, 0x7f, 0xc0, 0xba, 0x31, 0x7c, 0xc7, 0x51, 0x82, 0x6f, 0x6b, 0x9b, + 0x96, 
0x5c, 0x91, 0x7a, 0x7e, 0x6b, 0xbe, 0x75, 0xba, 0x95, 0x40, 0xcb, + 0x7a, 0x93, 0x5c, 0x4b, 0x84, 0x78, 0x74, 0x39, 0x3e, 0xad, 0x8e, 0x40, + 0x3d, 0x99, 0x49, 0x37, 0xd3, 0x9c, 0x9e, 0xb6, 0x7e, 0x32, 0x64, 0x34, + 0x6d, 0x63, 0xa2, 0x8e, 0xcc, 0xa4, 0x62, 0x9c, 0xbf, 0x81, 0x65, 0x77, + 0x7b, 0xbc, 0xb9, 0x41, 0xc6, 0x8a, 0xd5, 0x4e, 0x34, 0xd0, 0x70, 0xba, + 0x6f, 0x38, 0xa6, 0x90, 0x43, 0xa0, 0x50, 0x7d, 0x6f, 0x78, 0x42, 0x8f, + 0x4f, 0x41, 0x54, 0x7d, 0x5e, 0x95, 0x8f, 0x37, 0xa8, 0xbb, 0x56, 0x7b, + 0xce, 0x4d, 0x59, 0x3a, 0x77, 0xc9, 0x72, 0x65, 0x57, 0xa1, 0x71, 0x42, + 0xa2, 0xc5, 0x75, 0x71, 0x88, 0x7d, 0xa2, 0xa2, 0x53, 0x9d, 0x7e, 0x4f, + 0xa4, 0x66, 0x6e, 0x92, 0xb4, 0x56, 0x49, 0x7f, 0xca, 0x96, 0x66, 0x62, + 0x92, 0x70, 0x50, 0x74, 0x9c, 0x60, 0x97, 0x8e, 0xbd, 0x36, 0x78, 0x34, + 0x2e, 0xcf, 0x39, 0x79, 0xab, 0x78, 0x8d, 0x8f, 0x3f, 0xb2, 0x66, 0x66, + 0xc2, 0x9c, 0x91, 0x61, 0x65, 0x5e, 0x65, 0x3e, 0x46, 0xac, 0x2f, 0xbe, + 0x8d, 0x82, 0x3c, 0x6a, 0x81, 0x92, 0x48, 0xcc, 0x54, 0x72, 0x7c, 0x83, + 0x6a, 0x99, 0x4f, 0x63, 0x88, 0xa1, 0x5c, 0x53, 0xb1, 0x80, 0x64, 0x7a, + 0x8b, 0x50, 0x35, 0x6d, 0x3c, 0x7d, 0xb7, 0xcb, 0x7e, 0x63, 0xd2, 0x39, + 0x8d, 0xb0, 0x3b, 0x84, 0x50, 0x95, 0x54, 0xc4, 0xcc, 0xbd, 0xbb, 0x44, + 0x92, 0xa0, 0x4e, 0x80, 0x74, 0x74, 0x90, 0xc6, 0xb3, 0x4a, 0x9a, 0xbe, + 0x50, 0xd0, 0x88, 0x53, 0x77, 0xce, 0x5b, 0x95, 0xa4, 0xa0, 0x92, 0x46, + 0xb7, 0x44, 0xc3, 0xa5, 0x61, 0xc4, 0xb1, 0x7c, 0x8e, 0xe7, 0x30, 0xce, + 0x78, 0x5d, 0x49, 0x4e, 0x6d, 0x80, 0x71, 0x7e, 0x8b, 0xab, 0xa5, 0x3b, + 0xc1, 0x87, 0xc8, 0xc2, 0x73, 0x3d, 0x74, 0x2f, 0x79, 0x38, 0x58, 0xcf, + 0xae, 0x86, 0xb3, 0x3e, 0x81, 0x77, 0xc8, 0xa7, 0x83, 0x9e, 0x4f, 0x50, + 0x52, 0x9e, 0xa8, 0x7d, 0x65, 0x31, 0x9c, 0xa1, 0x91, 0x3e, 0x47, 0xa2, + 0x9b, 0xb4, 0x72, 0x88, 0x9d, 0x5b, 0x79, 0xab, 0x39, 0xbe, 0xab, 0x6b, + 0x71, 0x53, 0x97, 0x39, 0xd3, 0xad, 0xce, 0x5b, 0x59, 0x93, 0xaf, 0x88, + 0x7f, 0xca, 0x53, 0x72, 0x8d, 0x98, 0xbf, 0x78, 0x4b, 0x54, 0x8e, 0x6e, + 0xa9, 0x87, 0x7f, 0xb2, 0xaa, 0xa6, 0x6d, 0x82, 0xc9, 0x9c, 0xce, 0x5a, + 0x54, 0x42, 0xa4, 0x88, 0xba, 0xc8, 0xc0, 0xc3, 0xab, 0xd8, 0xd1, 0x88, + 0xd1, 0x52, 0x6d, 0x8d, 0x53, 0x9b, 0xc1, 0xc6, 0xac, 0x54, 0x8b, 0xc9, + 0x60, 0x66, 0xd5, 0x7a, 0x45, 0x39, 0x4c, 0x48, 0x8d, 0x76, 0x77, 0xae, + 0x6a, 0xa2, 0x71, 0xa4, 0xd4, 0x70, 0xa1, 0x33, 0xcc, 0x85, 0xc7, 0xb3, + 0x64, 0x7f, 0x9d, 0x73, 0x56, 0x52, 0x4c, 0xc8, 0x69, 0x8c, 0x75, 0x74, + 0xb6, 0x6a, 0xa3, 0x97, 0x8c, 0x2e, 0xb9, 0x95, 0x4a, 0x93, 0x7b, 0x46, + 0x83, 0x80, 0x59, 0x97, 0xc2, 0x72, 0x51, 0x90, 0x78, 0x9c, 0x65, 0xb1, + 0x41, 0xc5, 0x4e, 0xc4, 0x9b, 0x49, 0xc8, 0x8b, 0x39, 0xa4, 0x46, 0x8d, + 0xc0, 0x64, 0xcc, 0x80, 0x8b, 0x97, 0x7a, 0xcf, 0x37, 0x46, 0x4a, 0x5e, + 0xc1, 0x40, 0x4e, 0xcf, 0x4c, 0x5d, 0x58, 0x30, 0x68, 0x37, 0x49, 0xc2, + 0x4f, 0x6c, 0x43, 0x70, 0x45, 0x9a, 0x42, 0xb6, 0x48, 0x3a, 0x3d, 0x55, + 0xc3, 0xd3, 0x63, 0x9d, 0x8f, 0x6f, 0xa7, 0x3e, 0x42, 0x56, 0xa1, 0xab, + 0xb9, 0xcd, 0xc7, 0xa6, 0x58, 0x64, 0x95, 0x46, 0x9a, 0xcb, 0x4f, 0x9a, + 0x49, 0x2d, 0xcb, 0xbb, 0xa5, 0x94, 0x45, 0x50, 0x5d, 0xd2, 0x39, 0x68, + 0x83, 0x5e, 0x35, 0xca, 0x6b, 0x93, 0x82, 0x90, 0x4d, 0xb5, 0xa0, 0xb9, + 0x80, 0x53, 0x7f, 0xb8, 0xaa, 0x80, 0x36, 0x5c, 0x6d, 0xa3, 0x88, 0xa9, + 0x7d, 0xb9, 0x6f, 0x6e, 0x62, 0x39, 0xc0, 0x62, 0x3c, 0x55, 0xc3, 0x4d, + 0xc9, 0x3d, 0x46, 0xc4, 0x6a, 0x8e, 0xc3, 0x9b, 0x44, 0xb9, 0x7d, 0xb1, + 0x46, 0x3a, 0x8a, 0x46, 0xc3, 0x7f, 0x79, 0x71, 0x51, 0x44, 0x53, 0xb4, + 0x5e, 0xa9, 0x5d, 0x4b, 0x83, 0x41, 0x6f, 0x4e, 0x5c, 0xd2, 0x66, 0xce, + 0x99, 
0x68, 0x71, 0xa8, 0xb7, 0xc3, 0x33, 0xb3, 0xc9, 0x35, 0x81, 0x3d, + 0xcf, 0xae, 0x72, 0xb2, 0x90, 0xa6, 0xa1, 0xd1, 0x49, 0x91, 0x35, 0x52, + 0x3d, 0x6f, 0x92, 0xc3, 0xa9, 0xd5, 0xb0, 0x66, 0xc5, 0x8e, 0x95, 0x31, + 0xa3, 0xba, 0x86, 0x63, 0x9d, 0x5b, 0x44, 0x92, 0xc9, 0xa1, 0x35, 0x4e, + 0x90, 0x9b, 0xb9, 0xa7, 0xd0, 0x4d, 0x62, 0x79, 0xcf, 0x35, 0x80, 0x71, + 0x8a, 0x38, 0x83, 0xc3, 0xa9, 0x41, 0x31, 0x62, 0xa7, 0x56, 0x98, 0x3a, + 0x89, 0xab, 0x35, 0x7e, 0x5a, 0x3b, 0x63, 0x52, 0x4a, 0x5b, 0x95, 0xce, + 0x8b, 0x72, 0x7d, 0x65, 0x34, 0x7d, 0x90, 0xa7, 0x8f, 0x44, 0x38, 0x85, + 0xb0, 0x90, 0x80, 0x75, 0x3a, 0xcb, 0x85, 0x45, 0x8a, 0x3c, 0x32, 0xcc, + 0xba, 0x37, 0x45, 0x68, 0x76, 0xab, 0x7f, 0x3b, 0x7f, 0x7c, 0xbf, 0x61, + 0x5e, 0x8e, 0xa0, 0x6d, 0x67, 0xb5, 0x5c, 0xd1, 0xc7, 0x51, 0x45, 0x80, + 0x36, 0x7c, 0x68, 0x68, 0x3c, 0x94, 0x5a, 0x39, 0x6a, 0x6a, 0x62, 0xad, + 0xcc, 0xc1, 0xc8, 0xb8, 0xb5, 0xb8, 0x6e, 0x5b, 0x8f, 0x92, 0x92, 0x8b, + 0x50, 0x7d, 0xa6, 0x6e, 0xd1, 0xb3, 0x46, 0xb7, 0x43, 0x41, 0x6e, 0x8c, + 0x99, 0x69, 0x72, 0x57, 0xc6, 0x56, 0xa0, 0x4e, 0x54, 0xbe, 0xc7, 0xa9, + 0x4d, 0x4f, 0xd0, 0xc8, 0x67, 0x9a, 0x8b, 0x83, 0x34, 0x90, 0x30, 0xc6, + 0x3f, 0x87, 0xb7, 0x77, 0xa2, 0x63, 0x87, 0x5b, 0xcc, 0xb0, 0x5e, 0x45, + 0x6f, 0x76, 0x4a, 0x80, 0x6b, 0xa2, 0x6a, 0x7c, 0x93, 0x3e, 0x74, 0x55, + 0xa6, 0x98, 0x74, 0x3c, 0x45, 0x7f, 0x9b, 0xa3, 0x88, 0x4b, 0x37, 0x75, + 0x50, 0xb4, 0x93, 0x3a, 0x6c, 0x81, 0x63, 0x64, 0x32, 0xc4, 0x4a, 0x47, + 0xcb, 0xa7, 0x75, 0x91, 0xad, 0xb7, 0xb9, 0x9e, 0x30, 0x4a, 0xbd, 0x6a, + 0x71, 0xc2, 0x8e, 0x9e, 0x8b, 0xa0, 0xb2, 0x92, 0xc5, 0x42, 0x74, 0xcf, + 0x9c, 0xae, 0x4d, 0x91, 0x36, 0x86, 0x56, 0xac, 0x91, 0x7c, 0xac, 0xa5, + 0x3d, 0x47, 0x3a, 0x89, 0x98, 0x90, 0x98, 0x94, 0x93, 0xc2, 0xa6, 0x67, + 0x63, 0x3d, 0xa2, 0x39, 0x80, 0xb0, 0xc2, 0x69, 0x70, 0x38, 0x71, 0xc2, + 0x38, 0x94, 0xae, 0xa0, 0x6e, 0x51, 0x8e, 0x3e, 0xa6, 0x4e, 0x4f, 0x42, + 0xad, 0xaf, 0x53, 0x9e, 0x62, 0xc9, 0x79, 0xc1, 0x85, 0x50, 0x36, 0xad, + 0x88, 0xa1, 0xc8, 0xa1, 0xbe, 0x3a, 0xcd, 0x86, 0x7e, 0xa4, 0x37, 0x53, + 0xc5, 0x4c, 0xb8, 0x69, 0xb8, 0xae, 0xa7, 0x79, 0x34, 0x84, 0x74, 0x59, + 0xb9, 0x48, 0x9c, 0x9a, 0x65, 0x36, 0xae, 0x97, 0x70, 0x35, 0xab, 0x5e, + 0x88, 0x8a, 0x8d, 0x9e, 0x69, 0x39, 0x9b, 0x73, 0xbf, 0xc9, 0xc4, 0xa9, + 0xad, 0x49, 0x49, 0x9f, 0xa9, 0x70, 0x6e, 0x45, 0xcc, 0x5b, 0x41, 0x6d, + 0x34, 0x96, 0xcf, 0xce, 0x80, 0x7a, 0xc2, 0x71, 0x33, 0x64, 0x35, 0x74, + 0x6e, 0x4a, 0x49, 0xbe, 0x4e, 0x50, 0x31, 0x6a, 0x4c, 0x76, 0x6f, 0xbb, + 0x35, 0x8a, 0x9c, 0x9f, 0x96, 0x6a, 0xa1, 0x8e, 0x9d, 0xc4, 0xb7, 0x45, + 0x59, 0x5a, 0xbe, 0x70, 0x83, 0x46, 0x41, 0xb1, 0x75, 0xb7, 0x8f, 0xd0, + 0xab, 0x3f, 0x99, 0x64, 0x4f, 0x9f, 0xbc, 0xd7, 0xd2, 0x4b, 0x66, 0x5f, + 0x3f, 0x72, 0xa6, 0x9f, 0x57, 0x5b, 0x94, 0x3d, 0x7f, 0x8e, 0x6b, 0xca, + 0x50, 0x97, 0xd3, 0xc1, 0xd0, 0x6f, 0xaf, 0x4b, 0x33, 0x81, 0xa7, 0x72, + 0x3f, 0x62, 0x46, 0xcc, 0xbb, 0x42, 0x64, 0x54, 0xcf, 0x9b, 0x92, 0x51, + 0xab, 0x64, 0xa1, 0x9b, 0xd0, 0xb9, 0xcb, 0xd2, 0x36, 0x60, 0x95, 0xd2, + 0x52, 0x69, 0x39, 0x6b, 0x5a, 0x97, 0xcb, 0x62, 0xd1, 0xa7, 0xbc, 0x5b, + 0x9e, 0x71, 0x9d, 0x39, 0x54, 0x32, 0x97, 0xbb, 0x6f, 0x56, 0x88, 0xbb, + 0x41, 0x3d, 0xac, 0x94, 0x8a, 0x88, 0x3a, 0x87, 0xcd, 0x4e, 0x49, 0x8e, + 0x32, 0xc6, 0xa7, 0x7b, 0xa6, 0x3b, 0xc1, 0x4a, 0xc2, 0x8c, 0x36, 0x30, + 0x9a, 0xa8, 0x3e, 0x55, 0x74, 0xb1, 0x76, 0x67, 0x3f, 0x9b, 0x43, 0x57, + 0x8b, 0x3c, 0x6a, 0x60, 0x39, 0x9d, 0xb4, 0xca, 0x43, 0x84, 0x44, 0x51, + 0x71, 0x3f, 0x94, 0x84, 0x6b, 0x72, 0x77, 0x4d, 0x96, 0x76, 0x62, 0xba, + 0xc5, 
0x9d, 0x42, 0xbd, 0xb9, 0x81, 0xb1, 0x67, 0x49, 0xb3, 0xb8, 0x63, + 0x7b, 0x6f, 0x52, 0x58, 0x94, 0x64, 0x5c, 0x62, 0x83, 0x54, 0x7d, 0x97, + 0x85, 0xb8, 0xc1, 0x30, 0x6e, 0xb8, 0x35, 0x34, 0x7f, 0xb7, 0x9e, 0xc3, + 0xbe, 0x84, 0xc1, 0x63, 0xba, 0xbf, 0x69, 0x47, 0x46, 0xcd, 0x8e, 0x8b, + 0xb8, 0x86, 0x3d, 0x37, 0x84, 0x75, 0xbe, 0xd0, 0x52, 0xa2, 0xac, 0x4f, + 0x41, 0xb9, 0x81, 0xa1, 0x96, 0x70, 0x63, 0xce, 0x35, 0x79, 0x4f, 0x9f, + 0x49, 0xc3, 0x7c, 0x60, 0x6d, 0x88, 0xa6, 0x53, 0xb9, 0x5a, 0x4b, 0xaa, + 0x7e, 0xbf, 0xa9, 0x71, 0x62, 0x4d, 0xb0, 0xc5, 0xa0, 0xb1, 0x39, 0x7e, + 0xce, 0xac, 0xce, 0x7a, 0x3d, 0x68, 0x84, 0x80, 0x93, 0xb9, 0x3e, 0xa4, + 0x6d, 0x84, 0x66, 0x65, 0xd0, 0x99, 0xb2, 0x7a, 0xc8, 0xae, 0xa1, 0x7c, + 0xa8, 0x61, 0x75, 0x6c, 0x51, 0x53, 0xbb, 0x53, 0xcd, 0x44, 0xd2, 0x65, + 0x75, 0x7d, 0xbb, 0x86, 0xb3, 0x36, 0x7c, 0x55, 0xb2, 0x46, 0x32, 0x30, + 0x6d, 0x86, 0xbd, 0xbf, 0xbd, 0x7e, 0x48, 0x58, 0x9a, 0xbe, 0xab, 0xad, + 0x73, 0xab, 0xb9, 0x94, 0xcd, 0xad, 0xad, 0x6d, 0x60, 0xd5, 0x5d, 0x92, + 0x40, 0x9c, 0x85, 0x86, 0x77, 0xc0, 0x5f, 0x35, 0x78, 0x51, 0x50, 0xd6, + 0x74, 0x97, 0x87, 0xb2, 0xa0, 0x2f, 0x7b, 0x89, 0x5f, 0x9b, 0x64, 0x7b, + 0xa3, 0x9d, 0x97, 0x42, 0x52, 0xb8, 0x6c, 0xba, 0x25, 0xce, 0x2e, 0x86, + 0x48, 0xaa, 0x6d, 0x31, 0x3a, 0x91, 0xbf, 0x93, 0x96, 0x43, 0x42, 0x87, + 0x6e, 0xb9, 0xa1, 0x78, 0x91, 0xc0, 0x69, 0x42, 0x6b, 0x68, 0xcd, 0xb3, + 0x9f, 0x74, 0x91, 0x5c, 0x8c, 0x42, 0xd1, 0xc4, 0x4d, 0x85, 0x76, 0x7d, + 0xcc, 0x65, 0x79, 0x9c, 0x8b, 0x64, 0xb6, 0x8f, 0x2c, 0x52, 0x64, 0xc5, + 0x71, 0x9c, 0xbd, 0x8a, 0x8a, 0x57, 0x46, 0xd4, 0x4b, 0x82, 0x51, 0x51, + 0x92, 0x39, 0x84, 0x9c, 0xbc, 0x7d, 0x7c, 0x51, 0x46, 0xaa, 0xcb, 0x89, + 0x70, 0x97, 0x83, 0x4d, 0x83, 0x22, 0x72, 0x7a, 0xcd, 0x75, 0xa1, 0xaa, + 0x47, 0x35, 0x3b, 0xa6, 0x77, 0xa3, 0x27, 0xb3, 0x78, 0x98, 0xb3, 0x8e, + 0x64, 0xd6, 0x40, 0xc5, 0xce, 0xda, 0x50, 0x9f, 0x60, 0xc1, 0x84, 0x5e, + 0x64, 0x8b, 0x99, 0x96, 0x66, 0x8a, 0x93, 0xaa, 0x4c, 0x5d, 0x60, 0x46, + 0x32, 0x57, 0x89, 0x5c, 0xb0, 0xc0, 0x61, 0x5f, 0x5c, 0x8e, 0x41, 0xd7, + 0x55, 0x36, 0x99, 0xce, 0xb0, 0xa6, 0x8f, 0x36, 0x9c, 0x66, 0x47, 0x43, + 0x7e, 0x30, 0xce, 0x6d, 0x4d, 0xa3, 0xc4, 0x9e, 0x57, 0xb1, 0x5a, 0x69, + 0x7f, 0x2f, 0xaa, 0x4f, 0x37, 0x3c, 0x94, 0x8d, 0xc5, 0xc2, 0x4c, 0x98, + 0xd1, 0x5b, 0x7c, 0xbb, 0x7b, 0x6a, 0x60, 0x7b, 0x9e, 0x4a, 0x6c, 0xb3, + 0x34, 0x5a, 0x7a, 0x6a, 0xa1, 0xb4, 0xab, 0xc6, 0x6e, 0xb9, 0x3a, 0x69, + 0x7b, 0x93, 0xc3, 0x7d, 0x32, 0x83, 0x52, 0xb5, 0x78, 0x63, 0xd5, 0xa2, + 0x94, 0x49, 0x7b, 0x68, 0x61, 0x45, 0x7a, 0x66, 0x4d, 0xa8, 0x73, 0x45, + 0x76, 0xbc, 0x9a, 0x68, 0xb3, 0xbe, 0xa9, 0x85, 0xa4, 0x7d, 0xbf, 0xa6, + 0x9e, 0xa1, 0x9c, 0x52, 0x97, 0x86, 0xc5, 0x9f, 0x46, 0xb0, 0x87, 0x95, + 0x92, 0x84, 0x92, 0xc8, 0xc6, 0x61, 0x78, 0xc2, 0x96, 0xb8, 0x7c, 0x96, + 0x64, 0x45, 0x75, 0x64, 0xa2, 0x79, 0x6a, 0xbe, 0x6d, 0x7f, 0xc8, 0xc2, + 0x98, 0x4e, 0xb2, 0x7a, 0xa1, 0xbb, 0x8f, 0x8e, 0xa7, 0x35, 0xa9, 0x76, + 0x33, 0x42, 0xa0, 0x9e, 0xc4, 0x35, 0x9f, 0xab, 0x9d, 0x3f, 0x82, 0x68, + 0xca, 0x53, 0x85, 0xa8, 0xb7, 0xbc, 0x39, 0x91, 0x85, 0x89, 0x3a, 0x76, + 0x32, 0xac, 0x8a, 0xc3, 0x69, 0x45, 0x87, 0x5c, 0x6b, 0x96, 0xd9, 0xa7, + 0x36, 0xcd, 0x3d, 0x4d, 0x82, 0x6f, 0x8c, 0xa8, 0x7c, 0x7a, 0xae, 0xa8, + 0xb1, 0x77, 0x6f, 0x53, 0xb0, 0xb8, 0x88, 0x57, 0x8d, 0x73, 0x62, 0xb6, + 0xba, 0xb9, 0xbb, 0x73, 0x37, 0xa5, 0xb7, 0x5d, 0x46, 0x73, 0x66, 0x69, + 0xc2, 0x5b, 0x9b, 0x55, 0xc3, 0x88, 0x71, 0xa8, 0x45, 0xc9, 0x4d, 0xbc, + 0x65, 0x44, 0x32, 0x36, 0xbc, 0x45, 0x66, 0xc0, 0x34, 0x74, 0xa1, 0x53, + 0x3b, 
0x84, 0xc5, 0x25, 0x7b, 0x69, 0xa8, 0x52, 0x7b, 0x99, 0x7a, 0xbf, + 0x52, 0x45, 0xc6, 0x67, 0xab, 0x51, 0x74, 0xac, 0x67, 0xc7, 0xd3, 0xac, + 0xb6, 0xa6, 0xc1, 0x70, 0x93, 0x3b, 0xc1, 0x7a, 0xbd, 0xb9, 0x52, 0x76, + 0x93, 0xa3, 0xd2, 0x93, 0x94, 0x53, 0x6c, 0xb6, 0x51, 0xca, 0x64, 0xc1, + 0xc1, 0x55, 0x82, 0x61, 0x6d, 0x41, 0xaf, 0xd4, 0x52, 0x99, 0x88, 0x53, + 0x79, 0x83, 0x8c, 0x9f, 0xbc, 0x4a, 0x4d, 0xcc, 0x39, 0xd3, 0x9a, 0x9b, + 0xb3, 0x31, 0x48, 0xa0, 0xa9, 0x99, 0x71, 0xa6, 0xb4, 0xb7, 0x57, 0x61, + 0xcb, 0xa7, 0xc1, 0x4d, 0x25, 0x29, 0x7c, 0x60, 0x7d, 0xa7, 0x77, 0x88, + 0xcd, 0xcf, 0x84, 0x87, 0x8a, 0xb3, 0x87, 0xc0, 0x2e, 0x2e, 0xa6, 0x3b, + 0xc2, 0x4c, 0x5e, 0x79, 0x39, 0x8c, 0x47, 0x9d, 0x85, 0xb2, 0xba, 0x96, + 0x89, 0xbd, 0xbb, 0x66, 0xa5, 0xc8, 0x39, 0x35, 0xc0, 0x96, 0x4a, 0x50, + 0x95, 0x4d, 0x55, 0x6e, 0xa6, 0xc5, 0xc7, 0x5a, 0x9c, 0xa8, 0x76, 0x7c, + 0x9d, 0x55, 0x57, 0x5c, 0x51, 0x7e, 0x49, 0xae, 0xbe, 0x78, 0xb9, 0x36, + 0x76, 0x83, 0xa5, 0x38, 0x46, 0xd2, 0x36, 0x9e, 0x7f, 0x9f, 0x86, 0x88, + 0xc3, 0x98, 0x45, 0x61, 0x61, 0x7d, 0x8d, 0x3a, 0x43, 0xd8, 0xb3, 0xb9, + 0xb1, 0x63, 0x8f, 0x87, 0x29, 0x7a, 0x9f, 0x3e, 0xa8, 0x3b, 0x30, 0x2d, + 0xad, 0x5b, 0x50, 0x62, 0x89, 0x71, 0xa2, 0xb9, 0xb0, 0xc6, 0x5c, 0xc3, + 0x90, 0xbb, 0x94, 0x41, 0x98, 0x39, 0x44, 0xcc, 0xd3, 0x50, 0x61, 0x71, + 0xae, 0x92, 0x44, 0xbf, 0x26, 0x52, 0xd9, 0x74, 0x60, 0xd4, 0x3f, 0x35, + 0x3e, 0x8f, 0x92, 0x92, 0x61, 0x9a, 0x5d, 0x57, 0xb6, 0xc9, 0x49, 0x84, + 0x89, 0xbd, 0x7a, 0x6c, 0x7d, 0x9b, 0xac, 0x50, 0x1f, 0xc8, 0x54, 0x9b, + 0x80, 0x4f, 0x75, 0x66, 0x7f, 0x52, 0x8f, 0xa3, 0x9e, 0x85, 0x8a, 0x9c, + 0x62, 0x60, 0x90, 0x98, 0x8e, 0x52, 0xc7, 0x51, 0x90, 0xc0, 0x4c, 0x45, + 0x6d, 0xc0, 0x37, 0x79, 0x96, 0x6b, 0x8d, 0x3f, 0x80, 0x3d, 0x6d, 0xa6, + 0x92, 0xb2, 0x35, 0xab, 0x50, 0x53, 0x69, 0x42, 0x78, 0xb0, 0x8e, 0xa2, + 0xde, 0x97, 0x73, 0xaa, 0x98, 0x46, 0x92, 0xcd, 0x7f, 0x4d, 0xc5, 0x66, + 0x71, 0x44, 0xc1, 0xa0, 0x58, 0x9c, 0x79, 0x5e, 0x73, 0x8e, 0x41, 0x69, + 0x6b, 0xb3, 0xa2, 0xb8, 0x65, 0x90, 0xbf, 0x77, 0x62, 0xb8, 0x4a, 0x3b, + 0x43, 0x5d, 0x83, 0x4d, 0x6e, 0x3b, 0x58, 0xa9, 0x64, 0x73, 0x51, 0xa6, + 0xcd, 0xb5, 0x93, 0x99, 0x37, 0x49, 0x73, 0x84, 0xc7, 0x4a, 0x34, 0x4a, + 0xad, 0x83, 0x75, 0xa4, 0x50, 0xcb, 0xca, 0xbb, 0x8c, 0x5f, 0xc5, 0x90, + 0xb5, 0x79, 0xa7, 0x3d, 0x6f, 0x2f, 0xa0, 0x73, 0x79, 0x34, 0x6c, 0x4e, + 0xc4, 0x45, 0xb1, 0x67, 0x76, 0x46, 0x5f, 0x4c, 0x8b, 0x6f, 0xaf, 0xad, + 0xb0, 0x52, 0x88, 0x79, 0x3a, 0xce, 0xac, 0x89, 0xb7, 0x98, 0x9c, 0xba, + 0xaa, 0x41, 0xc0, 0x3b, 0x58, 0x66, 0xbb, 0xc6, 0x3c, 0x75, 0x65, 0x9e, + 0xce, 0xb3, 0x91, 0xa3, 0x71, 0x75, 0x78, 0xb5, 0x4b, 0x7f, 0xa1, 0x69, + 0x6b, 0x78, 0x61, 0x8f, 0x9f, 0x55, 0xbc, 0x93, 0xb6, 0x78, 0xb3, 0xad, + 0xb4, 0x72, 0x3a, 0xa1, 0x2f, 0xce, 0xbd, 0xba, 0x52, 0x9d, 0x83, 0x6c, + 0x53, 0xbb, 0xaa, 0x72, 0xd1, 0xcd, 0x53, 0xd2, 0x5c, 0x69, 0x47, 0x38, + 0x65, 0x44, 0xa3, 0x40, 0x39, 0x50, 0x93, 0x75, 0x7c, 0xc1, 0x8c, 0x47, + 0x74, 0x44, 0x9b, 0x8f, 0x84, 0x46, 0x91, 0xc3, 0x40, 0xbb, 0xae, 0x61, + 0x74, 0xbd, 0x6d, 0x9e, 0x7c, 0x58, 0x89, 0xc2, 0xc3, 0x97, 0x3d, 0x45, + 0xb7, 0x52, 0x5d, 0x3f, 0x38, 0x6f, 0x9c, 0x4d, 0x73, 0x54, 0x62, 0x52, + 0xb1, 0xab, 0x97, 0x35, 0x61, 0x94, 0x89, 0xbf, 0x6e, 0x80, 0x71, 0x7f, + 0x32, 0x37, 0xb9, 0x35, 0xa7, 0xc4, 0x62, 0xc5, 0xcd, 0xb1, 0xc9, 0x8b, + 0x49, 0x5f, 0xcd, 0x6e, 0x59, 0x71, 0x57, 0x72, 0xc4, 0xa8, 0xbf, 0x9a, + 0x46, 0x48, 0xa7, 0x95, 0xd3, 0x64, 0x80, 0xaa, 0x4b, 0xc2, 0x3f, 0x9d, + 0x49, 0x49, 0x98, 0x98, 0x3a, 0x7c, 0xbb, 0xc9, 0x84, 0x62, 0xc6, 0x96, + 0x7a, 
0xa1, 0x6d, 0x9b, 0x99, 0x65, 0x3d, 0x95, 0x8e, 0x45, 0xa8, 0x46, + 0x8e, 0xc5, 0xc5, 0x36, 0xbb, 0xc9, 0xc7, 0x95, 0x83, 0x5d, 0x3f, 0xc0, + 0xd2, 0x8a, 0x7c, 0x6f, 0xa6, 0x82, 0x8d, 0x70, 0x5c, 0xbb, 0x3e, 0x49, + 0x52, 0x45, 0x67, 0x98, 0x4f, 0x83, 0xbb, 0x9c, 0xb6, 0x34, 0x3b, 0x54, + 0xc9, 0x81, 0xd0, 0x76, 0xbe, 0xb2, 0xb9, 0x47, 0x31, 0x46, 0x88, 0x93, + 0xaa, 0x72, 0xb7, 0xc1, 0x88, 0x64, 0x7d, 0x37, 0x59, 0xca, 0xcb, 0x4a, + 0x2d, 0x31, 0x7f, 0x31, 0x8c, 0xb8, 0xc7, 0x7e, 0x9d, 0x90, 0x93, 0xcc, + 0x7a, 0x9c, 0xd3, 0xba, 0x6f, 0xab, 0x2f, 0x43, 0xb1, 0x9e, 0x78, 0x38, + 0x74, 0x61, 0xa4, 0x63, 0x56, 0x88, 0x35, 0xce, 0x80, 0xbf, 0x35, 0xc5, + 0x5a, 0xc9, 0x87, 0x9b, 0x66, 0xc1, 0x50, 0x56, 0x7d, 0x8d, 0x5b, 0x94, + 0xc8, 0xbc, 0xbd, 0x4f, 0x8f, 0xb0, 0x69, 0x7e, 0x56, 0x35, 0x62, 0x76, + 0xa4, 0xb0, 0xad, 0x70, 0x83, 0x31, 0xaa, 0xcd, 0x78, 0x94, 0x46, 0x33, + 0x75, 0x6d, 0x35, 0xaf, 0x92, 0x41, 0x64, 0x4f, 0xb3, 0x48, 0x40, 0x81, + 0xbd, 0x45, 0x4d, 0x41, 0x61, 0x6d, 0xa4, 0xa9, 0xc4, 0xa1, 0x3a, 0xd0, + 0x52, 0x67, 0x66, 0xa9, 0x61, 0xd0, 0xb9, 0x4d, 0x94, 0xac, 0xc6, 0x54, + 0x5a, 0x79, 0xa2, 0x9f, 0x7c, 0x7c, 0x3d, 0x59, 0xc7, 0x9c, 0x97, 0xd1, + 0x6d, 0x2f, 0xd2, 0x91, 0x93, 0xaf, 0x60, 0xca, 0x5d, 0x46, 0x4f, 0x6c, + 0xb7, 0x51, 0xc2, 0x6e, 0x3b, 0x9a, 0x6b, 0xd0, 0xa5, 0x8f, 0xae, 0x4d, + 0x37, 0x5b, 0x7b, 0x8f, 0x64, 0x5c, 0x83, 0x6c, 0x4a, 0x6d, 0x54, 0x9e, + 0xd2, 0x6c, 0x38, 0x6e, 0xc2, 0xbc, 0x81, 0x6a, 0x4e, 0x6c, 0x96, 0x34, + 0xbd, 0xd0, 0x7a, 0x8b, 0x3c, 0x60, 0x45, 0x52, 0x65, 0xba, 0x6b, 0x6e, + 0x7b, 0x39, 0x35, 0x57, 0xb1, 0x96, 0xa9, 0xce, 0x73, 0xcc, 0xd6, 0xcc, + 0x81, 0x91, 0x5d, 0xbe, 0x6d, 0x5a, 0xb4, 0x88, 0x96, 0x36, 0x3f, 0x36, + 0xcc, 0x54, 0x48, 0x57, 0x3f, 0x9c, 0x60, 0x54, 0x7d, 0x62, 0x90, 0x94, + 0x66, 0x96, 0xa7, 0xab, 0xb5, 0xbe, 0x7c, 0x81, 0x44, 0xc0, 0x49, 0xaf, + 0x2e, 0x8e, 0x41, 0x3b, 0xd0, 0x90, 0x88, 0x4a, 0x78, 0x88, 0x78, 0x8e, + 0x46, 0x6b, 0xb2, 0x59, 0x9f, 0xb5, 0x53, 0xaa, 0x72, 0x7c, 0x53, 0xa5, + 0x7c, 0xd8, 0x8e, 0x3c, 0x65, 0x6a, 0x9b, 0x5e, 0xca, 0x5d, 0x8f, 0x71, + 0x43, 0xd3, 0xc8, 0x6c, 0xc3, 0x5c, 0xd2, 0xb3, 0x48, 0x94, 0xb9, 0x9e, + 0x4e, 0x4c, 0x88, 0xc0, 0x9d, 0x63, 0xba, 0x69, 0xa0, 0x95, 0xb0, 0x8e, + 0x9b, 0x71, 0x81, 0x51, 0x7e, 0x6e, 0x76, 0x83, 0x53, 0xbd, 0x80, 0x59, + 0x6f, 0x88, 0x69, 0xad, 0xba, 0x4f, 0x60, 0xaf, 0xac, 0x9c, 0xa7, 0x45, + 0x7e, 0x3b, 0x4c, 0xc1, 0xcf, 0x3b, 0x78, 0x54, 0x75, 0x80, 0x97, 0x64, + 0xb3, 0xbb, 0x7b, 0x79, 0x4b, 0xaf, 0xc5, 0x88, 0x62, 0x57, 0xb0, 0x8d, + 0xb8, 0x5a, 0xa1, 0xd3, 0x6f, 0xb4, 0x41, 0xb8, 0xac, 0xb7, 0x64, 0xa6, + 0x9d, 0x8b, 0x36, 0x95, 0xb2, 0xd1, 0x61, 0xc0, 0x37, 0x58, 0xbd, 0xa7, + 0x82, 0x6a, 0x4e, 0x36, 0x4a, 0x8c, 0xd3, 0x67, 0x6e, 0x46, 0x5e, 0xa4, + 0x65, 0x67, 0x90, 0xae, 0x8c, 0x38, 0x7d, 0x4a, 0xcf, 0xad, 0x73, 0x45, + 0xad, 0x94, 0x89, 0xcf, 0x5a, 0x7f, 0x48, 0x80, 0x9d, 0x6a, 0x6d, 0x69, + 0x92, 0x7a, 0x3f, 0x60, 0xc2, 0x39, 0xb4, 0x3a, 0x49, 0x5e, 0x9b, 0x7e, + 0x89, 0x74, 0x6b, 0x67, 0xa7, 0xae, 0x71, 0x50, 0xb3, 0x43, 0x66, 0x5f, + 0x55, 0x8b, 0xb0, 0xca, 0x69, 0xa3, 0x8d, 0xb9, 0x49, 0x7b, 0x94, 0x6e, + 0x41, 0xa3, 0xaa, 0xae, 0x4f, 0x89, 0x7d, 0x80, 0xb2, 0xb1, 0x8a, 0x6c, + 0x9a, 0x40, 0x41, 0x77, 0xd0, 0x6c, 0xb4, 0x89, 0x6c, 0xa0, 0x4e, 0x35, + 0xac, 0x79, 0x94, 0x57, 0x75, 0x73, 0xa4, 0x54, 0xc9, 0xb8, 0xad, 0x50, + 0xce, 0xb7, 0x3b, 0x76, 0xa3, 0xa2, 0x5b, 0xa7, 0x80, 0xa0, 0x49, 0xb6, + 0x77, 0xab, 0xc4, 0x57, 0x45, 0x5f, 0xc4, 0x27, 0x52, 0x49, 0x83, 0x9c, + 0xd6, 0x56, 0x73, 0x40, 0x8a, 0x48, 0xc7, 0x97, 0x39, 0x95, 0x99, 0x7e, + 0x43, 
0xb4, 0x88, 0xc9, 0x7a, 0x70, 0x60, 0x87, 0x37, 0x30, 0x87, 0x3d, + 0x30, 0x63, 0x8e, 0x4a, 0x6f, 0x4f, 0xbd, 0x76, 0x52, 0x81, 0x61, 0x7c, + 0x88, 0x39, 0x58, 0x93, 0xa7, 0xb2, 0x66, 0x79, 0x9a, 0x76, 0xb5, 0xa1, + 0x46, 0x99, 0x7f, 0x47, 0xc6, 0x3a, 0x58, 0xdb, 0xb1, 0x35, 0xd0, 0xd6, + 0xb5, 0xae, 0xa2, 0xce, 0xa3, 0xc1, 0x9b, 0x39, 0x8b, 0x9a, 0x81, 0x6c, + 0x40, 0xa5, 0x93, 0x7c, 0x70, 0xaf, 0x69, 0x6e, 0x62, 0x6e, 0x8b, 0x6d, + 0x6d, 0x7d, 0x4e, 0x95, 0x38, 0x7d, 0x3e, 0xa1, 0x63, 0x77, 0x83, 0x7b, + 0x3c, 0x63, 0x5a, 0x72, 0x72, 0x94, 0x7f, 0x6f, 0x98, 0x5d, 0x39, 0x9a, + 0x44, 0x75, 0x57, 0x64, 0x8a, 0x51, 0x3c, 0x7a, 0x6c, 0x4e, 0xa3, 0x4e, + 0x43, 0x40, 0x81, 0x4f, 0x80, 0x37, 0xaf, 0xc8, 0x4b, 0x6d, 0xa7, 0xd0, + 0xa5, 0x52, 0x46, 0xb3, 0x52, 0xbe, 0x5c, 0x8a, 0xc9, 0xbd, 0xa9, 0xa0, + 0xc2, 0x60, 0x48, 0xbe, 0x6d, 0x4a, 0x66, 0xa0, 0xda, 0x6b, 0x82, 0x79, + 0xa8, 0xbe, 0x9d, 0x7a, 0x7e, 0x5b, 0x4a, 0xbe, 0x64, 0xb6, 0x6a, 0x77, + 0x80, 0xb0, 0xc0, 0x65, 0x66, 0x4f, 0x8e, 0xa7, 0xaa, 0xcc, 0xcd, 0x41, + 0xbe, 0x84, 0x8b, 0xd0, 0xb7, 0x4d, 0x58, 0x65, 0x34, 0x8b, 0x52, 0x97, + 0x39, 0x33, 0x43, 0x4b, 0x2f, 0x52, 0x5e, 0xac, 0x7c, 0x7c, 0x31, 0x67, + 0xc9, 0x7b, 0x73, 0x76, 0x2f, 0x7d, 0xbe, 0x85, 0x66, 0x45, 0x6d, 0x53, + 0x95, 0x73, 0x85, 0xb1, 0x93, 0x7a, 0x48, 0x50, 0x54, 0xa3, 0x44, 0xb2, + 0xa8, 0xba, 0xc2, 0x36, 0xde, 0xa3, 0x83, 0x4a, 0x6a, 0x66, 0x66, 0x5d, + 0x85, 0x97, 0x89, 0x7b, 0x4b, 0xd1, 0x38, 0x9b, 0x9c, 0x5f, 0x99, 0xcd, + 0x48, 0x9d, 0x83, 0x39, 0x54, 0x76, 0xab, 0xbf, 0x8e, 0xb6, 0x70, 0xad, + 0xa5, 0xa0, 0xb3, 0x82, 0x39, 0x4d, 0x83, 0xa3, 0x67, 0xcb, 0x76, 0x89, + 0xa1, 0x7b, 0x73, 0xc5, 0xc5, 0xb3, 0x49, 0x8c, 0x64, 0xc3, 0x4f, 0x46, + 0xc1, 0x7c, 0x98, 0x93, 0x94, 0x87, 0x67, 0x44, 0xcc, 0xa9, 0x32, 0x31, + 0x9a, 0x94, 0x50, 0x50, 0x52, 0xb6, 0x9a, 0x98, 0xc4, 0xd9, 0x47, 0x3f, + 0x43, 0x40, 0x80, 0xa2, 0xb7, 0x73, 0x90, 0x95, 0x63, 0x8c, 0x70, 0xae, + 0xb8, 0x3c, 0x45, 0xbc, 0x2b, 0xc0, 0x8b, 0x6b, 0x67, 0x50, 0x50, 0xaa, + 0x5e, 0x8c, 0xa2, 0xb9, 0x5b, 0x79, 0xb7, 0xb0, 0x7a, 0xd8, 0x58, 0x97, + 0x5b, 0x5f, 0x72, 0x90, 0x6d, 0xcb, 0x56, 0x8a, 0xd3, 0xcf, 0xa5, 0x88, + 0xa8, 0xa1, 0x6c, 0x85, 0xa1, 0x3f, 0xb4, 0x5c, 0x7b, 0xb5, 0xc9, 0xa2, + 0x68, 0x44, 0xb2, 0xcd, 0xbf, 0xd8, 0xa9, 0x82, 0xa2, 0x49, 0x7a, 0xa7, + 0x34, 0xdb, 0x60, 0x6a, 0xa7, 0x8a, 0x85, 0xb0, 0x7b, 0xba, 0xc1, 0xa6, + 0xa5, 0xc8, 0x7e, 0xcd, 0xc1, 0x46, 0x38, 0xac, 0xd1, 0xad, 0x6f, 0x5e, + 0x94, 0x38, 0x3e, 0xac, 0xd6, 0x65, 0xaf, 0x45, 0x72, 0xcb, 0xc1, 0xa4, + 0xc4, 0xc4, 0xb2, 0x50, 0xa7, 0x9f, 0x9b, 0xbd, 0x6e, 0x87, 0xc7, 0x3c, + 0x85, 0x3e, 0xaf, 0x45, 0xbb, 0x59, 0x99, 0xb6, 0x91, 0x57, 0x57, 0x2c, + 0x8a, 0xca, 0x59, 0x90, 0x7a, 0x4b, 0x42, 0x53, 0x64, 0xca, 0x2d, 0x60, + 0xb7, 0xb4, 0x77, 0xab, 0x65, 0x94, 0xb4, 0x59, 0x31, 0xa7, 0x42, 0x59, + 0xa0, 0x67, 0x9d, 0x51, 0xa4, 0xc4, 0x36, 0x47, 0x2d, 0x5c, 0x6a, 0x93, + 0x8d, 0x55, 0x38, 0x59, 0x8d, 0xb1, 0xcc, 0x96, 0xb1, 0x93, 0x8d, 0xb2, + 0x83, 0x84, 0xd6, 0x52, 0x52, 0xe4, 0x30, 0x85, 0x73, 0x44, 0xce, 0x42, + 0x5a, 0x5a, 0x99, 0x7e, 0x32, 0xaf, 0x5e, 0xcc, 0xb9, 0x8a, 0x48, 0xd4, + 0x95, 0xa2, 0xc5, 0x57, 0x42, 0xcb, 0xd6, 0xb2, 0x89, 0xb8, 0xa3, 0xc4, + 0x47, 0xaf, 0x54, 0x88, 0x87, 0x8b, 0xb4, 0x82, 0x71, 0x9b, 0x34, 0x95, + 0x92, 0x3e, 0x3a, 0x2b, 0xcc, 0x49, 0x3d, 0x56, 0x8f, 0x25, 0x97, 0x42, + 0x79, 0x6e, 0x3a, 0x55, 0x42, 0x9a, 0x50, 0x71, 0x55, 0x5a, 0x36, 0x73, + 0xac, 0xc2, 0xaf, 0x59, 0x81, 0xc5, 0xb0, 0xaa, 0xb3, 0x89, 0x88, 0x9e, + 0xbf, 0xce, 0x53, 0x7c, 0x3e, 0x31, 0x9c, 0xcc, 0x6e, 0xce, 0x49, 0xcb, + 0x75, 
0xa4, 0x8f, 0x68, 0x70, 0x44, 0xc6, 0x60, 0xcc, 0x7a, 0x4e, 0x44, + 0xd1, 0x2e, 0x9f, 0x4f, 0x7b, 0x56, 0xbc, 0x2f, 0x38, 0x97, 0x9b, 0x79, + 0x38, 0xb1, 0x5b, 0x75, 0x3c, 0x85, 0xa9, 0x4a, 0x5d, 0x99, 0xc2, 0x3d, + 0xc8, 0x9b, 0x3d, 0x6b, 0x65, 0x69, 0x7d, 0x9e, 0x4f, 0x9e, 0xb5, 0x8e, + 0x49, 0x9d, 0x3b, 0xdf, 0x96, 0x5d, 0x47, 0x7a, 0x3a, 0x73, 0x93, 0x67, + 0xc8, 0xbc, 0xab, 0x4a, 0x66, 0x7f, 0x54, 0x78, 0x3c, 0xc5, 0x77, 0x2f, + 0x74, 0x7d, 0x98, 0x7d, 0x4b, 0xb8, 0xb5, 0xcd, 0xcf, 0xa5, 0xd4, 0x51, + 0x34, 0x49, 0x66, 0x9f, 0x2d, 0xc0, 0x9f, 0x29, 0x46, 0x7d, 0x5e, 0x7a, + 0xb9, 0x61, 0x9a, 0xa0, 0xb4, 0xa4, 0xc0, 0xc5, 0xa6, 0x36, 0x73, 0x7e, + 0x87, 0xa8, 0x94, 0xa9, 0x44, 0x4b, 0xb1, 0x53, 0xad, 0xb3, 0x62, 0x64, + 0x40, 0x61, 0xa3, 0x33, 0x67, 0xd3, 0x45, 0x90, 0x2a, 0x57, 0x90, 0xa5, + 0x28, 0xa9, 0xad, 0x6f, 0x92, 0x91, 0x68, 0x42, 0xb2, 0x8d, 0x85, 0xa4, + 0xbe, 0xb3, 0x83, 0xa4, 0x77, 0xc3, 0x4b, 0x95, 0x84, 0x8e, 0x37, 0x3d, + 0x66, 0xe5, 0x87, 0xc5, 0x4b, 0x58, 0xa8, 0xa4, 0x85, 0xbe, 0x44, 0x9d, + 0xcc, 0x6b, 0x42, 0x8f, 0x62, 0x98, 0x41, 0xba, 0xca, 0x66, 0x80, 0x47, + 0xb3, 0xab, 0x43, 0x3f, 0x9a, 0x65, 0x8a, 0x67, 0x62, 0x85, 0x79, 0xac, + 0x46, 0x69, 0x95, 0xa1, 0x52, 0x65, 0xb1, 0xa0, 0x6d, 0xbe, 0xcc, 0xa4, + 0x9b, 0xb5, 0xc3, 0x7d, 0xca, 0x3c, 0x65, 0x98, 0x85, 0x42, 0xb9, 0x66, + 0x8c, 0x6a, 0x40, 0xae, 0xa0, 0xbc, 0x5e, 0x60, 0x4a, 0x66, 0x9e, 0x6f, + 0x7c, 0x90, 0xb7, 0xa9, 0xb0, 0xc5, 0xbe, 0x95, 0x6c, 0x47, 0x68, 0xcf, + 0xaa, 0xa4, 0x3a, 0xb2, 0x65, 0x47, 0x5c, 0x48, 0x4a, 0x63, 0xbf, 0x97, + 0xc0, 0x8d, 0xb3, 0xc9, 0x64, 0xb9, 0x6c, 0x8e, 0xa3, 0x93, 0x5d, 0xae, + 0xc7, 0x32, 0x65, 0xaf, 0xa8, 0x76, 0xa9, 0x62, 0x7a, 0x63, 0x54, 0x77, + 0x77, 0x9b, 0x6c, 0x65, 0xb7, 0x7b, 0x2f, 0xcf, 0xe3, 0x40, 0x47, 0xb4, + 0xa4, 0x9a, 0xc5, 0xb3, 0xb3, 0x50, 0xd6, 0x68, 0x71, 0xd7, 0x9a, 0xb0, + 0x3b, 0x53, 0x9e, 0xaa, 0xb5, 0x60, 0xca, 0xc4, 0xaf, 0x8e, 0x5e, 0x6c, + 0x58, 0xc3, 0x8f, 0xc0, 0xb3, 0xa3, 0x7e, 0x6a, 0x89, 0x93, 0x81, 0x5e, + 0x53, 0x6b, 0x73, 0xaa, 0x3e, 0x81, 0x57, 0x36, 0x92, 0xe1, 0x51, 0xb2, + 0x46, 0xc0, 0x6a, 0x4c, 0x37, 0x7f, 0x5a, 0x9b, 0x49, 0xbf, 0x8b, 0x5b, + 0x34, 0x3a, 0x64, 0x7a, 0x39, 0x83, 0x89, 0x99, 0x9f, 0x6a, 0x4e, 0x77, + 0xbf, 0x92, 0xaf, 0x94, 0xbf, 0xca, 0x2f, 0xa1, 0x7e, 0x5f, 0x9b, 0x89, + 0x43, 0x8b, 0xca, 0x6e, 0xaf, 0xc8, 0x6e, 0xca, 0x36, 0xbc, 0x4b, 0xc0, + 0xab, 0x85, 0x8e, 0x8b, 0x9a, 0x39, 0xb9, 0x68, 0x41, 0x61, 0x65, 0x6e, + 0x49, 0x9b, 0xbc, 0x6b, 0x58, 0x6d, 0xbc, 0xb0, 0xb3, 0x58, 0xa9, 0x87, + 0x81, 0x83, 0xb0, 0xc2, 0x58, 0x2f, 0xa1, 0x9d, 0x79, 0x44, 0x4d, 0xb6, + 0x7c, 0xbf, 0xd2, 0x96, 0xbc, 0xc1, 0x3b, 0x6d, 0x7c, 0x2f, 0x3b, 0x7c, + 0xc8, 0x86, 0x39, 0x71, 0xb7, 0xb9, 0xcf, 0x36, 0xc9, 0x50, 0x4f, 0x6d, + 0x52, 0x67, 0x93, 0x99, 0x53, 0x6a, 0x72, 0x9e, 0x44, 0x39, 0x7b, 0xa3, + 0x60, 0x3e, 0xaf, 0x4b, 0xb2, 0xa8, 0x85, 0x9d, 0x75, 0xc7, 0x3c, 0x68, + 0x44, 0x36, 0x5a, 0x5b, 0x68, 0x93, 0x44, 0x3d, 0xc9, 0xc4, 0x52, 0xac, + 0xd0, 0x84, 0x80, 0xa3, 0x4c, 0xcc, 0x89, 0x87, 0x57, 0x4d, 0x5e, 0xbd, + 0x7e, 0x70, 0x6d, 0x8f, 0x61, 0x58, 0xbf, 0x68, 0xce, 0xba, 0x4c, 0x5c, + 0xc8, 0x50, 0x8f, 0x92, 0x31, 0x8d, 0xc1, 0xd4, 0x23, 0x2a, 0x63, 0x92, + 0xb7, 0xa1, 0xa7, 0x6c, 0xb2, 0xb5, 0x62, 0x38, 0x93, 0xa0, 0x35, 0x73, + 0xb6, 0x62, 0x53, 0x5c, 0x68, 0x52, 0x63, 0xc7, 0x96, 0x60, 0x4c, 0x44, + 0xc2, 0x64, 0x76, 0x45, 0x5e, 0x95, 0x52, 0x9e, 0x41, 0x76, 0x5d, 0x38, + 0x68, 0x63, 0x65, 0x4a, 0x37, 0x2d, 0xc8, 0x90, 0xc1, 0x4b, 0x8e, 0x6d, + 0x38, 0x71, 0x3e, 0x8e, 0x7a, 0x68, 0xae, 0x9c, 0x48, 0x52, 0xbb, 0x91, + 0xcd, 
0xd8, 0x4e, 0x84, 0x9e, 0x7f, 0x68, 0x54, 0x8b, 0xba, 0x9d, 0x46, + 0x64, 0xad, 0x9e, 0x5e, 0xcf, 0xd0, 0x72, 0x67, 0xbf, 0x64, 0x98, 0xc0, + 0xac, 0xbd, 0xad, 0xbc, 0x63, 0x50, 0xa9, 0xca, 0x2d, 0x93, 0xcf, 0x8e, + 0x80, 0x31, 0x95, 0xc2, 0xd5, 0xbc, 0x60, 0x3d, 0xa5, 0x9a, 0x3c, 0xa4, + 0x55, 0xbe, 0x58, 0x56, 0xaa, 0x9c, 0xcb, 0xad, 0x58, 0xc4, 0xd7, 0xcd, + 0x7b, 0x7b, 0xa1, 0xaa, 0x9b, 0x58, 0x65, 0x6c, 0x51, 0x93, 0xbc, 0x83, + 0x9f, 0x78, 0x85, 0x35, 0x9c, 0x3f, 0xae, 0x99, 0x87, 0xaf, 0x8f, 0x35, + 0x96, 0xaf, 0x6c, 0x74, 0x65, 0x9e, 0xaa, 0x46, 0x5a, 0x5b, 0x6e, 0x7f, + 0xb5, 0x9f, 0xc1, 0x8a, 0xb5, 0xbc, 0x61, 0x3c, 0xae, 0x5b, 0x4f, 0xd4, + 0x7b, 0x61, 0x68, 0x63, 0x59, 0xe6, 0x55, 0x59, 0x6b, 0x53, 0x47, 0x5f, + 0x38, 0x87, 0x62, 0x66, 0xc1, 0xd6, 0x73, 0x66, 0x4c, 0x92, 0xda, 0x9c, + 0x5e, 0xb8, 0xcf, 0x7f, 0x4d, 0x70, 0x92, 0x97, 0x6c, 0x53, 0xb8, 0x3c, + 0xbf, 0x3b, 0x6f, 0xd8, 0x4f, 0x2e, 0x3a, 0xaa, 0x6b, 0x6a, 0x53, 0x8e, + 0x5b, 0xad, 0xa3, 0x74, 0x81, 0x6f, 0x37, 0xb7, 0x79, 0xb5, 0x9e, 0x59, + 0xa6, 0x65, 0x7c, 0xbd, 0x3a, 0x4e, 0x44, 0x85, 0xd2, 0xd7, 0xcf, 0xa7, + 0x65, 0x95, 0xad, 0x98, 0x69, 0xb1, 0x94, 0xcf, 0xa0, 0xad, 0x42, 0x60, + 0xe4, 0xc8, 0x51, 0x75, 0x9e, 0x71, 0x46, 0x56, 0x80, 0xa5, 0x90, 0xa5, + 0x3c, 0xb1, 0xbf, 0x92, 0x42, 0x6d, 0x68, 0xb1, 0x7b, 0x2b, 0x3c, 0x4a, + 0x6c, 0x37, 0x40, 0x6d, 0xe0, 0x46, 0xb1, 0xb3, 0x56, 0xcf, 0xbd, 0x77, + 0xc3, 0x8e, 0x92, 0x84, 0x60, 0x4e, 0xa3, 0xd5, 0x67, 0xa9, 0xb6, 0x8a, + 0x6c, 0x50, 0x3d, 0x39, 0x78, 0x8a, 0x38, 0xaa, 0x60, 0x61, 0x86, 0xbf, + 0xc3, 0x70, 0x62, 0x78, 0xaa, 0xdc, 0x4a, 0xb2, 0x5c, 0xa7, 0x50, 0x8a, + 0x87, 0x42, 0x8a, 0xc0, 0xa0, 0x73, 0xbc, 0x4d, 0x80, 0xa6, 0x51, 0x67, + 0xb5, 0x62, 0xc8, 0xbb, 0x6d, 0xc7, 0x7d, 0x3b, 0xca, 0x56, 0xbb, 0x93, + 0x8c, 0x55, 0x53, 0x49, 0xb3, 0x76, 0x6b, 0xc0, 0xc1, 0xc4, 0x75, 0xbe, + 0xa5, 0x69, 0x84, 0x99, 0x7f, 0x46, 0x3d, 0x42, 0x59, 0x72, 0x4f, 0x5d, + 0xa8, 0x51, 0x77, 0xa5, 0x7c, 0xb3, 0x83, 0x6e, 0xcb, 0xaa, 0x5b, 0x79, + 0xb7, 0x74, 0x2d, 0x9c, 0x44, 0x47, 0x33, 0x7b, 0x59, 0x93, 0x6d, 0x4f, + 0x62, 0xc6, 0x5c, 0x85, 0xae, 0x80, 0x3a, 0x68, 0x46, 0x69, 0x75, 0x4e, + 0x4a, 0xb8, 0xa3, 0x8a, 0x55, 0x70, 0x4e, 0x8e, 0xa8, 0x32, 0x70, 0x91, + 0x98, 0xba, 0x83, 0xaf, 0xdb, 0x9d, 0xe3, 0x9e, 0x60, 0x50, 0x73, 0x61, + 0xb3, 0xc7, 0xbf, 0x53, 0x7c, 0x6b, 0xab, 0xbb, 0x6d, 0x28, 0x87, 0x39, + 0x61, 0x52, 0x5e, 0x40, 0xd2, 0xb2, 0x7d, 0x3e, 0xad, 0x67, 0x42, 0xca, + 0x9b, 0x8b, 0xa1, 0x43, 0x95, 0x37, 0xc9, 0xb6, 0x69, 0xc6, 0x63, 0xcb, + 0x5d, 0x62, 0x3d, 0x9a, 0xa2, 0xb5, 0x42, 0xc8, 0x43, 0x81, 0x69, 0x3f, + 0x9f, 0x69, 0x7d, 0x61, 0xca, 0x87, 0x48, 0x4c, 0xbb, 0x74, 0x8a, 0x7a, + 0xb3, 0xb3, 0x4a, 0x6b, 0x51, 0x35, 0x3f, 0x4e, 0x7e, 0x5b, 0x6b, 0x66, + 0x63, 0xad, 0x8a, 0xa3, 0x3a, 0x7c, 0x7e, 0xa3, 0x8b, 0x68, 0x66, 0x57, + 0x84, 0x3a, 0x3f, 0x5a, 0xbf, 0x4d, 0xdc, 0x4d, 0x34, 0x3d, 0x40, 0x72, + 0xbf, 0x97, 0xae, 0x44, 0xb6, 0xad, 0x9a, 0xb2, 0x86, 0x41, 0xcd, 0x44, + 0x89, 0x89, 0x6f, 0x58, 0xb7, 0x33, 0x5c, 0x40, 0xa6, 0x94, 0xa9, 0x48, + 0x8e, 0xb2, 0x84, 0x4a, 0x65, 0x39, 0x81, 0xa2, 0xd1, 0xca, 0xc8, 0xa8, + 0x62, 0x5b, 0xc8, 0x8c, 0xab, 0x43, 0x63, 0x4b, 0x6f, 0x2e, 0x7d, 0x48, + 0x7b, 0x79, 0x47, 0xb2, 0xb3, 0x2c, 0xb8, 0x5a, 0x78, 0xa9, 0x75, 0x8a, + 0x33, 0x7c, 0x66, 0xa6, 0x48, 0x88, 0xdd, 0xb0, 0xcb, 0x65, 0x58, 0xde, + 0xab, 0xc4, 0x2b, 0xcd, 0x7f, 0xbe, 0x65, 0xbf, 0xd8, 0x6e, 0xc5, 0xc2, + 0xa8, 0xbc, 0x6c, 0x8a, 0x3c, 0xd0, 0xdb, 0xa0, 0x6c, 0x48, 0xc1, 0x3e, + 0x94, 0x93, 0xca, 0x52, 0x48, 0x3b, 0x93, 0x8f, 0xd1, 0xc4, 0x93, 0xa2, + 0x48, 
0x64, 0x4b, 0x3a, 0x8e, 0x7f, 0x79, 0xc2, 0x60, 0x3f, 0x49, 0x33, + 0x45, 0x58, 0x58, 0x75, 0xc2, 0xb5, 0xcd, 0xbb, 0x7f, 0x3e, 0x77, 0x65, + 0xd3, 0xc8, 0x9f, 0xba, 0x41, 0xc8, 0x91, 0xd3, 0x50, 0x7e, 0xa7, 0x9e, + 0x81, 0x6c, 0x7a, 0x75, 0x6c, 0x98, 0x31, 0x33, 0x5d, 0xd6, 0x42, 0x74, + 0x38, 0x6e, 0xb6, 0xd0, 0x47, 0x6c, 0x34, 0xb1, 0x75, 0xb5, 0xae, 0x91, + 0x57, 0xb6, 0xc9, 0x78, 0xa2, 0xca, 0x50, 0x6e, 0x86, 0x9b, 0xd5, 0x88, + 0x94, 0x3c, 0x3e, 0x57, 0x3a, 0xb4, 0x86, 0x32, 0x54, 0x34, 0x92, 0xbc, + 0x82, 0x7f, 0xc0, 0x89, 0x49, 0x70, 0x37, 0x82, 0x8e, 0x2c, 0x5b, 0x71, + 0x53, 0x8d, 0x6c, 0xaf, 0x51, 0x73, 0x92, 0x67, 0x6f, 0x3c, 0x40, 0xc9, + 0x9f, 0x7a, 0x71, 0x4d, 0xad, 0xd1, 0x79, 0xb4, 0xa9, 0x99, 0xa1, 0x88, + 0x4c, 0x60, 0x69, 0x4e, 0x83, 0x89, 0x8d, 0xb9, 0x8f, 0x83, 0x76, 0x7a, + 0x5a, 0xbe, 0x85, 0x65, 0x6e, 0x34, 0xa8, 0x5f, 0x72, 0x9a, 0x41, 0x93, + 0x77, 0xb7, 0x81, 0x88, 0x76, 0xbb, 0x6f, 0xcd, 0x34, 0xb7, 0xa1, 0x46, + 0xca, 0x98, 0x7d, 0x9a, 0xa6, 0x82, 0x8b, 0x48, 0x3c, 0x76, 0xa1, 0x53, + 0x3c, 0xc1, 0xc9, 0x3c, 0x9c, 0xa0, 0x69, 0x4f, 0xa5, 0xcd, 0xbd, 0xa8, + 0x55, 0x97, 0x59, 0x4e, 0xc8, 0x7f, 0x3f, 0x8a, 0x4a, 0x84, 0x67, 0xdb, + 0x61, 0xaf, 0xb9, 0x90, 0x8e, 0xc1, 0xab, 0x52, 0xc5, 0x36, 0x87, 0x83, + 0x57, 0xa7, 0x6d, 0x60, 0x49, 0x75, 0xbb, 0x94, 0xa6, 0xa3, 0x58, 0x47, + 0xc8, 0x97, 0xc4, 0x96, 0x7e, 0xae, 0x4e, 0x50, 0xa1, 0x3e, 0xce, 0x93, + 0x7f, 0x6a, 0x9c, 0xcb, 0xc6, 0x47, 0xb9, 0x57, 0x47, 0x93, 0xd7, 0x89, + 0x53, 0x6b, 0x91, 0x9c, 0x76, 0x8b, 0x77, 0x92, 0xaa, 0xbc, 0x39, 0xa4, + 0xb3, 0x73, 0x36, 0xc0, 0x94, 0xa2, 0x8a, 0xc0, 0x6e, 0x35, 0x97, 0x31, + 0x48, 0xae, 0xb3, 0x76, 0xc4, 0xc6, 0x8c, 0x77, 0xa7, 0x7b, 0xb2, 0x83, + 0x40, 0xa2, 0xc3, 0x95, 0xcd, 0x98, 0xce, 0x8a, 0xa7, 0x5e, 0x73, 0x96, + 0x56, 0x63, 0x88, 0x59, 0x3e, 0xc7, 0x8b, 0xa8, 0xb7, 0x4c, 0x78, 0xa7, + 0x94, 0x5e, 0x6c, 0x53, 0x83, 0x6e, 0x82, 0xb8, 0x9c, 0x93, 0xa0, 0x86, + 0xa3, 0x8a, 0x68, 0xaa, 0x93, 0xad, 0x72, 0xaf, 0x9c, 0x6f, 0x40, 0x4b, + 0xb9, 0xbc, 0x3a, 0x65, 0x37, 0x45, 0x50, 0x31, 0xa0, 0xae, 0xc3, 0x94, + 0x91, 0x5d, 0x98, 0x96, 0x6b, 0xc4, 0x9c, 0x55, 0x53, 0x8d, 0xd6, 0x65, + 0xb8, 0x66, 0xc0, 0x81, 0x79, 0x4a, 0x5e, 0x71, 0x9a, 0x86, 0x6f, 0xbb, + 0xbf, 0x62, 0x3e, 0x46, 0x39, 0x51, 0xae, 0x7d, 0x53, 0x5e, 0x55, 0x86, + 0x70, 0x6f, 0x82, 0xc4, 0x64, 0x54, 0x37, 0x67, 0x79, 0xc5, 0xcf, 0xaf, + 0x59, 0x57, 0x74, 0x79, 0xba, 0x62, 0x7c, 0x96, 0x8c, 0x4b, 0x57, 0x42, + 0x8d, 0xaa, 0x30, 0xa1, 0xcc, 0xbf, 0x51, 0x8d, 0x9e, 0x8b, 0x3c, 0x54, + 0x89, 0x68, 0x9e, 0xc0, 0x77, 0xa2, 0x4e, 0x44, 0x74, 0x57, 0xd2, 0x79, + 0x6b, 0x74, 0x73, 0x9e, 0xa6, 0xbb, 0xae, 0xa7, 0xb0, 0xbd, 0x68, 0x3f, + 0x80, 0x8d, 0xa5, 0x8c, 0x88, 0xa0, 0x88, 0xa5, 0x43, 0xcf, 0x5b, 0x76, + 0x96, 0x93, 0xb8, 0x85, 0x41, 0xac, 0xb7, 0x5f, 0x42, 0x6c, 0x7a, 0x3a, + 0x92, 0x94, 0x6d, 0x42, 0x91, 0x94, 0x87, 0xbf, 0x88, 0x6e, 0x5c, 0xd3, + 0x97, 0xa6, 0x4d, 0x39, 0xb6, 0x52, 0xab, 0x32, 0x53, 0xbf, 0x53, 0xcb, + 0x80, 0x65, 0x9e, 0xd1, 0x65, 0x98, 0x6a, 0xaa, 0x88, 0xb6, 0x5d, 0x80, + 0x8e, 0x3f, 0xa3, 0x5d, 0x53, 0x82, 0x47, 0x68, 0xd2, 0x60, 0xaa, 0x41, + 0xb4, 0x61, 0x60, 0x90, 0x68, 0x9c, 0x73, 0x60, 0x39, 0x76, 0xa9, 0xb1, + 0x76, 0x99, 0xa3, 0x54, 0x64, 0x3d, 0xbf, 0xa1, 0x69, 0x41, 0x94, 0x4c, + 0x64, 0xb1, 0x7b, 0x46, 0x8f, 0x9c, 0x7c, 0x57, 0xa1, 0x3c, 0xa3, 0x59, + 0xa0, 0xc7, 0x67, 0x50, 0x73, 0x89, 0x80, 0x7f, 0xb5, 0xd2, 0x51, 0xb3, + 0x4f, 0x92, 0xd8, 0x8d, 0x42, 0xad, 0x9a, 0x42, 0xb0, 0x5e, 0xcf, 0x62, + 0x5d, 0xba, 0x68, 0xae, 0x89, 0xba, 0x60, 0x8c, 0x7d, 0xaf, 0x69, 0x57, + 0x87, 
0xcb, 0xc6, 0x57, 0x73, 0xbc, 0x50, 0x66, 0xd7, 0x4b, 0x83, 0x6e, + 0x5b, 0x97, 0x73, 0x99, 0x56, 0x55, 0x2d, 0x36, 0x98, 0x62, 0xa4, 0x4c, + 0xa0, 0xbe, 0x91, 0x3a, 0x3f, 0x68, 0x81, 0xb8, 0x3c, 0x92, 0x9c, 0x59, + 0x6b, 0xa7, 0x43, 0xa5, 0xc7, 0xbb, 0xc0, 0x47, 0x5d, 0xa3, 0x69, 0xc3, + 0xa3, 0x8f, 0xcf, 0x6f, 0x48, 0x5a, 0xd2, 0x3b, 0x3d, 0x42, 0x63, 0xa0, + 0x74, 0x70, 0x66, 0x39, 0x39, 0x3e, 0x79, 0x34, 0x9a, 0xd4, 0x3f, 0x42, + 0xbf, 0x70, 0x8a, 0xd3, 0x66, 0x76, 0xc4, 0xd9, 0x5c, 0xc4, 0xd9, 0x62, + 0xad, 0x97, 0xa2, 0x8c, 0x47, 0x78, 0x82, 0xc6, 0x5f, 0x89, 0x85, 0x68, + 0x90, 0x33, 0x3e, 0x86, 0xac, 0x8c, 0x54, 0x86, 0x46, 0x76, 0x9f, 0x45, + 0x95, 0x69, 0x44, 0x45, 0xbf, 0x5f, 0x3d, 0x7d, 0x82, 0xab, 0xd4, 0x71, + 0xc0, 0xd0, 0x47, 0x6b, 0x8c, 0x7e, 0x47, 0x88, 0xb7, 0x4c, 0x55, 0x65, + 0x49, 0x79, 0xcd, 0x39, 0x5b, 0xce, 0xb7, 0xcb, 0x4d, 0x5b, 0xb9, 0x5f, + 0xcb, 0x95, 0xd6, 0x51, 0x99, 0xbf, 0x65, 0x7d, 0x5d, 0x54, 0x3f, 0x42, + 0xab, 0x40, 0xba, 0x78, 0x36, 0x4a, 0x84, 0xa4, 0x57, 0xd2, 0x7e, 0xd6, + 0x8d, 0xc8, 0xc7, 0x3f, 0x8f, 0x68, 0x76, 0x7a, 0x99, 0xc4, 0x8e, 0xae, + 0x75, 0xa2, 0x7e, 0x63, 0x33, 0x9c, 0x55, 0xd3, 0x27, 0x3c, 0x74, 0x45, + 0x4b, 0x56, 0x8f, 0x7a, 0x63, 0xab, 0x18, 0x26, 0xc2, 0xb7, 0xd4, 0x78, + 0x7f, 0x2d, 0xb4, 0xc1, 0x56, 0x85, 0x94, 0xbf, 0x68, 0xd6, 0x5e, 0x6d, + 0x86, 0xc3, 0xb1, 0x56, 0x48, 0x40, 0xb9, 0x63, 0xc0, 0xa1, 0x6b, 0xca, + 0x7a, 0xac, 0x76, 0x4e, 0xc0, 0xc2, 0x42, 0x41, 0x4a, 0x5d, 0x82, 0xb9, + 0xa4, 0x3d, 0xa6, 0xb1, 0x5a, 0xaa, 0x3c, 0x2d, 0x56, 0x65, 0xc7, 0x7b, + 0x91, 0xda, 0x61, 0xb0, 0xab, 0x69, 0xbe, 0xa0, 0x40, 0x55, 0xab, 0x78, + 0x7e, 0x2f, 0x39, 0xc7, 0x57, 0x7f, 0x37, 0x3c, 0xb8, 0xad, 0x4a, 0xd1, + 0x76, 0xb6, 0x47, 0xcd, 0x4c, 0x54, 0x64, 0x7f, 0xb3, 0x93, 0x58, 0x3f, + 0x9e, 0xd8, 0x72, 0xc8, 0xc7, 0x7f, 0x71, 0x89, 0x42, 0x6f, 0x90, 0x3e, + 0x93, 0xab, 0x6b, 0x6f, 0x91, 0xbf, 0xc1, 0xab, 0x5d, 0x50, 0x87, 0xcd, + 0xcb, 0xd6, 0x7f, 0x4a, 0xa0, 0x54, 0xbd, 0x9c, 0x9b, 0x53, 0xae, 0xcd, + 0x61, 0xc3, 0x63, 0x69, 0x4f, 0x4c, 0xab, 0xcd, 0xb8, 0xcc, 0x3e, 0xab, + 0x7e, 0x69, 0x9e, 0xcc, 0x77, 0xa7, 0xb3, 0x8f, 0xaf, 0x79, 0x55, 0x88, + 0xab, 0x7f, 0xcf, 0x8f, 0xb4, 0x4c, 0x7f, 0x63, 0x37, 0x6e, 0xd0, 0x78, + 0x51, 0x3f, 0xcd, 0xaf, 0x75, 0x52, 0x67, 0xcf, 0xdb, 0x52, 0xa1, 0x58, + 0x4b, 0xcf, 0x76, 0xea, 0xc2, 0xd9, 0xbd, 0xcd, 0x94, 0xaf, 0xcb, 0x55, + 0x7c, 0xac, 0x7e, 0x3b, 0x34, 0x56, 0x5a, 0x9f, 0xca, 0x57, 0x68, 0x2c, + 0xb8, 0x80, 0xa4, 0x78, 0x70, 0xcd, 0x63, 0x34, 0x97, 0x33, 0x24, 0xbe, + 0x37, 0xd1, 0x8e, 0xdc, 0x52, 0x27, 0x4f, 0x93, 0x64, 0xcf, 0xcf, 0x5b, + 0x37, 0x52, 0x6d, 0x7b, 0xc8, 0x8d, 0xad, 0x4c, 0x51, 0xc2, 0x80, 0x4a, + 0x6a, 0xd1, 0x65, 0xbe, 0x63, 0x56, 0xc1, 0xb8, 0x6c, 0x65, 0x5f, 0xca, + 0x6d, 0x9e, 0x57, 0x7d, 0x73, 0xaf, 0x70, 0x98, 0x64, 0x3d, 0x36, 0x8d, + 0x88, 0x43, 0x43, 0x71, 0x8b, 0x69, 0x63, 0x8b, 0x4d, 0x72, 0xbe, 0x46, + 0x5c, 0x8c, 0xb0, 0x4c, 0xa6, 0x84, 0x9e, 0x77, 0x56, 0x3a, 0x35, 0xce, + 0xac, 0xb3, 0xad, 0x84, 0x59, 0x2a, 0x69, 0xad, 0x72, 0x69, 0x4f, 0x52, + 0x94, 0x48, 0x72, 0xa5, 0x54, 0xaa, 0x8b, 0x8a, 0xa1, 0x90, 0x78, 0x93, + 0xd3, 0xbd, 0xa3, 0x64, 0x99, 0x92, 0x3f, 0x32, 0xc3, 0x53, 0x97, 0xbd, + 0x67, 0xb2, 0x63, 0x37, 0xc5, 0x5b, 0xb3, 0x4f, 0x42, 0x55, 0xbc, 0x99, + 0xd5, 0x9d, 0xda, 0x6c, 0x9f, 0xc9, 0x4a, 0x3a, 0x84, 0x49, 0x4b, 0x42, + 0x9a, 0xaf, 0xca, 0xca, 0x76, 0x59, 0xce, 0x52, 0xae, 0x3d, 0x39, 0x5c, + 0x95, 0x78, 0x87, 0x87, 0x8a, 0xbe, 0xa1, 0x9d, 0x7a, 0x47, 0x39, 0x51, + 0x79, 0x3b, 0x2e, 0xb9, 0xbf, 0xa9, 0xa8, 0x96, 0xa4, 0x95, 0xd1, 0xc9, + 0x5f, 
0xaa, 0x48, 0x91, 0xd7, 0xa8, 0xcd, 0x52, 0xc6, 0xba, 0x75, 0xb5, + 0xdc, 0x93, 0xac, 0x41, 0x42, 0xae, 0x95, 0xa6, 0x97, 0x40, 0x8c, 0x7d, + 0x9b, 0x88, 0x9d, 0x6e, 0x57, 0x84, 0x76, 0xa9, 0xb1, 0x7d, 0x40, 0xbd, + 0xc1, 0x85, 0x84, 0xd7, 0x3f, 0x90, 0x3a, 0x76, 0x75, 0x77, 0x7f, 0x8e, + 0x9b, 0x5e, 0xc0, 0xbd, 0x8c, 0x6c, 0xd1, 0xa6, 0x24, 0x2d, 0x5e, 0x3e, + 0xce, 0x9a, 0x9a, 0x47, 0x90, 0x51, 0x44, 0x79, 0xb1, 0xb0, 0x99, 0x50, + 0x4a, 0xab, 0x52, 0xbc, 0x5b, 0x4c, 0x94, 0x48, 0x80, 0x43, 0x70, 0x75, + 0xb4, 0x83, 0x96, 0xd3, 0x89, 0xb8, 0x32, 0x29, 0x5b, 0x59, 0xb0, 0x90, + 0x6e, 0xd2, 0x3a, 0xae, 0xb7, 0x87, 0x54, 0x99, 0x7b, 0x53, 0x83, 0x94, + 0xcd, 0x56, 0xc8, 0x32, 0xb3, 0x60, 0x86, 0xaa, 0x86, 0x9c, 0x94, 0xa8, + 0xc1, 0xa7, 0xb1, 0x73, 0x5f, 0x5d, 0x31, 0xb1, 0x72, 0x2d, 0xb7, 0x81, + 0xb0, 0x3f, 0x99, 0x5c, 0x3c, 0xac, 0x54, 0xbf, 0xc1, 0x8f, 0x84, 0x81, + 0x91, 0x63, 0x69, 0xa4, 0xa6, 0x90, 0xb9, 0x74, 0x46, 0x86, 0x81, 0x45, + 0x35, 0x98, 0xc5, 0xd1, 0xac, 0x58, 0x80, 0xcf, 0x2e, 0x2c, 0x64, 0x47, + 0xa0, 0xb0, 0xb1, 0x54, 0x8f, 0x83, 0x66, 0x3a, 0x75, 0x64, 0x62, 0x63, + 0x3c, 0xb7, 0x7b, 0xa9, 0xa9, 0xbf, 0x6f, 0x83, 0x45, 0x90, 0x78, 0x85, + 0x36, 0x93, 0x8b, 0x52, 0x75, 0xd1, 0x65, 0x83, 0x71, 0x3d, 0x9e, 0x71, + 0x72, 0x68, 0xb7, 0x46, 0x8e, 0xb7, 0x55, 0x34, 0xba, 0xa9, 0x47, 0xa6, + 0x69, 0x31, 0x42, 0x58, 0x60, 0x3f, 0xa3, 0xc1, 0x57, 0x89, 0x70, 0x8b, + 0x82, 0x66, 0x3b, 0x57, 0x59, 0x98, 0x67, 0x59, 0x7e, 0x77, 0xba, 0x9d, + 0x34, 0x69, 0xcb, 0xbd, 0x5f, 0xde, 0xad, 0x84, 0x5b, 0xd4, 0x34, 0xa5, + 0x61, 0xa3, 0x59, 0x8c, 0x6b, 0x45, 0x2f, 0x83, 0x61, 0x8b, 0xcc, 0xb8, + 0x94, 0x4d, 0x32, 0x38, 0x2d, 0x4e, 0xa1, 0x74, 0xe3, 0xd5, 0x8f, 0x92, + 0x5e, 0xa9, 0x89, 0xc8, 0x63, 0xb4, 0x33, 0x94, 0xcd, 0x66, 0xb6, 0xa7, + 0x9a, 0xc2, 0x49, 0xb3, 0xad, 0x45, 0x58, 0xb0, 0x33, 0xa0, 0x91, 0x72, + 0x9e, 0x64, 0x4a, 0x42, 0x8b, 0x64, 0xae, 0xb3, 0x82, 0x77, 0x35, 0x5f, + 0x3f, 0xd0, 0x7e, 0xb4, 0xbf, 0x69, 0xa1, 0x78, 0x62, 0xa5, 0x9c, 0x82, + 0x51, 0x43, 0x4b, 0x7f, 0x88, 0x93, 0x9e, 0x9f, 0xa1, 0x4e, 0xa2, 0x77, + 0x78, 0x55, 0xc4, 0xa5, 0x75, 0x9e, 0x38, 0x61, 0x68, 0x62, 0x37, 0x5c, + 0x8d, 0xbc, 0x82, 0x84, 0x70, 0x56, 0xc7, 0xa2, 0x66, 0xb1, 0x60, 0x96, + 0x4a, 0x77, 0x75, 0xac, 0xb2, 0x66, 0xb4, 0x81, 0xa6, 0x90, 0x3f, 0x39, + 0x50, 0x76, 0x83, 0xc4, 0x85, 0x81, 0x98, 0x4c, 0x9c, 0x6c, 0xc4, 0x6f, + 0xa0, 0x54, 0x47, 0x4c, 0x97, 0xc8, 0x5a, 0xa0, 0x6e, 0xa2, 0x3b, 0x6b, + 0xd6, 0x36, 0xb4, 0xaa, 0xb4, 0xaa, 0x99, 0x7d, 0xb3, 0xd6, 0x31, 0xa5, + 0xb3, 0xbc, 0xbe, 0x8f, 0xb1, 0xb5, 0x7c, 0x80, 0xbe, 0x57, 0x4f, 0x4a, + 0x36, 0x49, 0x72, 0xad, 0x5c, 0x94, 0x7a, 0x74, 0x5f, 0xac, 0x50, 0x63, + 0x4d, 0x81, 0xa0, 0x69, 0x40, 0x71, 0x74, 0x5d, 0x88, 0x8c, 0xb0, 0x36, + 0x72, 0x3d, 0x58, 0xb0, 0xc5, 0x43, 0x39, 0x33, 0x7d, 0xbf, 0x9d, 0x47, + 0xbe, 0x71, 0x68, 0xaf, 0x59, 0xaf, 0x6e, 0x4a, 0xa9, 0x5b, 0x54, 0x98, + 0x52, 0x75, 0xa5, 0x36, 0x7f, 0x44, 0xce, 0x65, 0x94, 0x8c, 0x92, 0x84, + 0x70, 0xd9, 0x7f, 0xd5, 0x96, 0x62, 0x42, 0x59, 0x37, 0xce, 0x70, 0x3e, + 0x5f, 0x7e, 0x89, 0x7c, 0x37, 0xba, 0xc6, 0xc9, 0x4f, 0xa8, 0xcc, 0x52, + 0x33, 0xb2, 0xb3, 0x42, 0x83, 0x50, 0x91, 0xc2, 0x9c, 0x39, 0xc2, 0x4a, + 0x9c, 0x67, 0xb9, 0xb3, 0x2e, 0xc1, 0xaa, 0x68, 0xcf, 0x46, 0x4a, 0xc5, + 0xb7, 0xa3, 0x4e, 0xaa, 0x37, 0x4a, 0xa2, 0x5e, 0x47, 0xa4, 0xc6, 0x4f, + 0x83, 0x8c, 0x42, 0xcd, 0x6f, 0x87, 0x32, 0x43, 0xbe, 0xd3, 0x36, 0x64, + 0x59, 0x44, 0x5e, 0xba, 0x59, 0x46, 0x9c, 0xc2, 0x89, 0x96, 0x95, 0x7d, + 0x7d, 0x91, 0x5a, 0x75, 0x86, 0x8b, 0xc2, 0xca, 0x5a, 0x54, 0x61, 0xce, + 0x55, 
0xab, 0xa3, 0xb8, 0x77, 0xcb, 0xc1, 0x78, 0xa4, 0xce, 0x80, 0xd6, + 0xab, 0xc3, 0xae, 0x6f, 0x96, 0x4f, 0x75, 0xbc, 0xba, 0x32, 0x72, 0x66, + 0x4e, 0x4d, 0xc9, 0xa5, 0xdd, 0xd8, 0xbf, 0x77, 0x77, 0xb4, 0xbb, 0x88, + 0x4f, 0x3a, 0x65, 0x5f, 0x56, 0xd6, 0xad, 0x78, 0x88, 0xb7, 0xd1, 0x86, + 0x35, 0xab, 0x9b, 0xcd, 0x9e, 0x91, 0x8c, 0x80, 0x32, 0x78, 0x4d, 0xa2, + 0xb2, 0x77, 0xca, 0x7a, 0x99, 0xc3, 0xa5, 0x93, 0xc2, 0x67, 0x93, 0xcf, + 0xd7, 0x70, 0xb0, 0x72, 0xc6, 0xbd, 0xb9, 0x47, 0xc0, 0x75, 0xa4, 0xab, + 0x40, 0x5b, 0x55, 0x77, 0xc1, 0x50, 0x54, 0x3b, 0x84, 0xc1, 0x37, 0xb4, + 0x7e, 0xbe, 0x40, 0xc9, 0xd3, 0xc1, 0x46, 0xcc, 0x49, 0x89, 0x91, 0x3e, + 0x6a, 0xd0, 0xaa, 0x98, 0xc2, 0x59, 0x7f, 0xaf, 0x6c, 0xc3, 0x98, 0x8a, + 0x52, 0x52, 0x7e, 0xb0, 0xa8, 0x44, 0x87, 0x95, 0x91, 0x3e, 0x78, 0x96, + 0x46, 0x86, 0x84, 0x9f, 0x97, 0xc0, 0x66, 0x7e, 0x9a, 0x7f, 0x9e, 0xab, + 0x68, 0xa9, 0x96, 0xbb, 0x96, 0x5e, 0xc0, 0x6c, 0xb3, 0x4a, 0xae, 0x8c, + 0xa1, 0x73, 0xbb, 0xa9, 0x69, 0xb6, 0x78, 0x9c, 0xa7, 0x9c, 0x4e, 0x73, + 0xcc, 0x6c, 0x9f, 0xa5, 0x66, 0x5c, 0x31, 0x72, 0x7a, 0xcc, 0x4b, 0x44, + 0x83, 0x72, 0x3b, 0x3f, 0x84, 0x7b, 0x9a, 0x4d, 0x44, 0x49, 0x88, 0x38, + 0x66, 0x45, 0x88, 0x55, 0xaa, 0xb7, 0x6d, 0x85, 0x4c, 0x85, 0x69, 0x9f, + 0x4c, 0x2f, 0x4a, 0x91, 0x64, 0x9c, 0x41, 0xab, 0x9b, 0x83, 0x40, 0x37, + 0x7c, 0x47, 0x78, 0x93, 0x64, 0x57, 0x34, 0x87, 0x57, 0xac, 0x79, 0x6f, + 0x3f, 0xaa, 0x80, 0x38, 0xa6, 0x38, 0xcc, 0x59, 0x74, 0xb3, 0xb0, 0x50, + 0xb3, 0xa6, 0x9e, 0x7e, 0x3b, 0x5c, 0xcd, 0x7e, 0xa4, 0x78, 0xc2, 0x40, + 0x6e, 0xad, 0x6e, 0x58, 0x71, 0x78, 0xbc, 0x47, 0xb9, 0x6c, 0x3e, 0x91, + 0x46, 0x5e, 0x56, 0x35, 0xa1, 0x9c, 0x9e, 0x49, 0x7b, 0x90, 0x48, 0x57, + 0x64, 0xcd, 0xb3, 0x6d, 0x92, 0x8a, 0xa1, 0x91, 0x57, 0x74, 0x89, 0x9c, + 0xb3, 0x71, 0x4c, 0x67, 0x48, 0x4c, 0x7f, 0x49, 0x67, 0xd0, 0xb4, 0x6a, + 0x56, 0x2f, 0x70, 0x3f, 0x3e, 0xbe, 0x90, 0x4f, 0xc3, 0xcf, 0x9e, 0x49, + 0x7c, 0x92, 0x40, 0x66, 0x4c, 0xab, 0x4e, 0x8b, 0x42, 0x90, 0xc7, 0x80, + 0x4f, 0xc2, 0x53, 0x91, 0x72, 0x53, 0x71, 0xc7, 0x5b, 0x8a, 0xc7, 0x44, + 0x3c, 0x4d, 0x60, 0x41, 0x3d, 0x53, 0x8f, 0xc1, 0x6d, 0x5c, 0xa6, 0xc6, + 0x52, 0x87, 0xaa, 0x8f, 0xc6, 0xba, 0xa1, 0x7c, 0x82, 0x49, 0x68, 0x6f, + 0x68, 0xd0, 0x98, 0x96, 0x85, 0x62, 0xa1, 0x94, 0x7e, 0x4a, 0x8f, 0xbb, + 0x3a, 0xb5, 0xab, 0xcc, 0x60, 0x81, 0x97, 0x6d, 0xc7, 0x4b, 0x57, 0xbb, + 0x82, 0x64, 0x73, 0xc1, 0x9a, 0x6c, 0x9e, 0x4f, 0x6c, 0x60, 0xcf, 0xb7, + 0xb5, 0x81, 0x48, 0xa5, 0x66, 0x45, 0x64, 0x75, 0xaf, 0xab, 0x32, 0xa2, + 0xcf, 0x7a, 0x9f, 0x4b, 0xc0, 0x77, 0x45, 0x72, 0x6e, 0x6e, 0x72, 0xb0, + 0x67, 0x95, 0x3d, 0x80, 0x37, 0x8e, 0x6c, 0x39, 0x74, 0x91, 0x3f, 0xc9, + 0xaf, 0xac, 0x6d, 0x3a, 0x6b, 0x30, 0x36, 0x9b, 0x34, 0x92, 0x7f, 0xc2, + 0x70, 0x52, 0x7d, 0x79, 0xb4, 0x8d, 0x36, 0xa0, 0x86, 0x56, 0x84, 0x80, + 0xa6, 0x46, 0x62, 0x8a, 0x7f, 0x3e, 0xc8, 0x33, 0x31, 0x64, 0x5c, 0x59, + 0x4d, 0xc2, 0x67, 0x65, 0xb1, 0x64, 0x74, 0xc2, 0x71, 0x52, 0xd3, 0x7d, + 0x3b, 0x72, 0x90, 0x7a, 0xb7, 0x77, 0x3d, 0x94, 0x66, 0x79, 0x6c, 0x89, + 0x66, 0x7c, 0x8b, 0xbe, 0xb3, 0x5b, 0x6c, 0x48, 0xcd, 0xa4, 0x6b, 0x4e, + 0xb7, 0x72, 0x61, 0x37, 0x99, 0x7f, 0xce, 0xcc, 0x3d, 0xab, 0xbe, 0xa7, + 0x8d, 0xcc, 0xb2, 0xaa, 0x8a, 0x6d, 0x91, 0x96, 0xa2, 0x9d, 0x85, 0x90, + 0xc7, 0x47, 0x71, 0xc4, 0x49, 0xca, 0x86, 0x64, 0x57, 0x82, 0xbd, 0xaf, + 0x6d, 0x79, 0xd2, 0x8a, 0xb1, 0xb8, 0x8a, 0x40, 0x98, 0x5b, 0xbb, 0x7c, + 0x6e, 0x40, 0xb3, 0x32, 0xbd, 0x58, 0x91, 0xa7, 0x3b, 0x5a, 0x91, 0x81, + 0xa2, 0xc1, 0x88, 0xd7, 0x6a, 0x5e, 0x80, 0x97, 0x39, 0xcf, 0x83, 0x85, + 0xb5, 
0x7b, 0xac, 0xcd, 0x93, 0x80, 0x40, 0x8c, 0xab, 0x61, 0xbd, 0x5b, + 0xb2, 0xc1, 0x9d, 0x79, 0x5f, 0x60, 0xb7, 0xca, 0xc0, 0x8a, 0x97, 0xca, + 0xca, 0xbb, 0x9c, 0xb0, 0xb3, 0x6e, 0x3d, 0x50, 0x88, 0x76, 0x5a, 0xd8, + 0xbb, 0xa9, 0xc1, 0x33, 0x37, 0x79, 0x45, 0x5c, 0x36, 0xaa, 0xa1, 0x64, + 0xaf, 0x90, 0x89, 0x2e, 0x98, 0x2e, 0x42, 0xae, 0x49, 0xc4, 0xdc, 0x48, + 0xc2, 0x80, 0xdf, 0xe3, 0xb7, 0xa5, 0xbc, 0xd9, 0xaa, 0x86, 0x77, 0x5c, + 0x3d, 0xa3, 0x73, 0x58, 0xc7, 0x90, 0x65, 0x69, 0xb2, 0xa9, 0xac, 0xdc, + 0xc3, 0xaa, 0x39, 0x4c, 0x3e, 0x52, 0x85, 0x99, 0x7c, 0xa3, 0xca, 0x66, + 0x4b, 0xc7, 0x43, 0x24, 0xcb, 0x91, 0xca, 0xa1, 0xba, 0x6d, 0x83, 0x72, + 0x9e, 0x5b, 0x57, 0x59, 0x78, 0x42, 0x6b, 0xb7, 0x37, 0x43, 0x5c, 0x4c, + 0x46, 0x5e, 0xd2, 0x98, 0x5b, 0xb0, 0xbf, 0x54, 0xc3, 0x3b, 0x7e, 0x57, + 0x9b, 0x3e, 0xc5, 0xaf, 0xb9, 0xd2, 0x78, 0xb5, 0xae, 0xc1, 0x64, 0x65, + 0x69, 0x73, 0x68, 0x9d, 0x76, 0xad, 0x87, 0x30, 0x6c, 0xce, 0x91, 0xb9, + 0x3f, 0x97, 0x8c, 0x37, 0x48, 0xa9, 0xb7, 0x91, 0x96, 0x7a, 0x43, 0x6c, + 0x77, 0x3b, 0x87, 0x85, 0x42, 0xa2, 0x59, 0x35, 0xa0, 0x50, 0x89, 0x5d, + 0xb9, 0xb0, 0x4c, 0x9b, 0xc3, 0x6d, 0x36, 0x8e, 0xce, 0x43, 0x47, 0x9c, + 0xb4, 0xc1, 0x50, 0x39, 0x54, 0x93, 0xcc, 0xac, 0x48, 0x6b, 0x52, 0x5d, + 0x83, 0x34, 0x6d, 0x96, 0xb5, 0xa1, 0xb4, 0x3f, 0xc1, 0x5d, 0x8f, 0xab, + 0x34, 0x74, 0x5e, 0xc9, 0x73, 0x7a, 0x3b, 0xaa, 0x63, 0x48, 0x77, 0x3d, + 0xb5, 0xb4, 0xde, 0x58, 0xb8, 0x6e, 0x6f, 0xb4, 0xa2, 0x98, 0x47, 0x3c, + 0xb7, 0xba, 0x3e, 0xcb, 0x96, 0x97, 0x6d, 0xb9, 0x58, 0x9a, 0xb3, 0x9a, + 0x5c, 0x36, 0x4e, 0x5a, 0x39, 0x55, 0xa6, 0xb5, 0xaa, 0xc0, 0xdd, 0x5f, + 0x62, 0x3e, 0xc8, 0x97, 0x4a, 0x2d, 0xbc, 0xb0, 0xa0, 0xbe, 0x5d, 0x4c, + 0xc1, 0x98, 0x8b, 0x40, 0x6d, 0x5b, 0xb9, 0xc0, 0x8a, 0x5c, 0x65, 0x44, + 0xc1, 0x3e, 0x4e, 0x91, 0x6a, 0x43, 0x3c, 0xd2, 0x85, 0x3c, 0x7d, 0x45, + 0xb3, 0xab, 0x56, 0xa6, 0xb2, 0xa0, 0x52, 0x35, 0x7d, 0x86, 0x82, 0x6d, + 0xb8, 0xab, 0x64, 0xbc, 0x8d, 0x91, 0x69, 0xc8, 0x3c, 0x4b, 0xae, 0x51, + 0x48, 0x5b, 0xac, 0x98, 0x9c, 0x9c, 0x65, 0x80, 0x99, 0x80, 0x73, 0xc2, + 0xbd, 0x64, 0x95, 0xb9, 0xcc, 0xc4, 0x56, 0xc3, 0x74, 0x61, 0xb6, 0x48, + 0xd2, 0x67, 0x9b, 0x57, 0x81, 0x91, 0x79, 0x94, 0xa1, 0x7f, 0x68, 0x2c, + 0x6c, 0x9f, 0x6e, 0xc0, 0x49, 0xa4, 0xc6, 0xb6, 0xae, 0xaa, 0xd2, 0x9a, + 0x91, 0x78, 0x9d, 0x93, 0x88, 0x34, 0x8c, 0x7b, 0x76, 0xae, 0xca, 0x4f, + 0x50, 0x44, 0x84, 0x7b, 0xbe, 0x68, 0xa8, 0xc9, 0x9e, 0x4d, 0x9d, 0xac, + 0x50, 0xb1, 0x63, 0x43, 0x8e, 0x46, 0x76, 0x7a, 0x46, 0x8f, 0x6e, 0xca, + 0x7e, 0x55, 0x76, 0x96, 0x4e, 0x84, 0xcb, 0xbd, 0x8f, 0x97, 0xa9, 0x86, + 0xa6, 0x6b, 0x46, 0x8f, 0xd6, 0x44, 0x55, 0xcd, 0x49, 0xc6, 0x97, 0x35, + 0xb0, 0xcd, 0xc0, 0x85, 0xce, 0x98, 0x49, 0x41, 0xb4, 0x35, 0xb2, 0xa8, + 0xc1, 0xcb, 0x53, 0x5b, 0x45, 0xb6, 0xad, 0xbd, 0x69, 0xa6, 0xd0, 0xcf, + 0xbc, 0x7d, 0x83, 0x68, 0x9a, 0x64, 0x2f, 0x3b, 0x5c, 0x47, 0xa6, 0x8c, + 0x39, 0x75, 0x8b, 0xd7, 0xd3, 0x4c, 0x2d, 0x7e, 0x8c, 0x4e, 0x91, 0xd1, + 0x4c, 0xc3, 0x80, 0x83, 0x7d, 0x48, 0x60, 0x67, 0xad, 0xdc, 0x62, 0x69, + 0x80, 0x5e, 0xba, 0x8f, 0xa4, 0x44, 0x34, 0x5a, 0xc0, 0xb3, 0x8d, 0x51, + 0x67, 0x8f, 0x87, 0x93, 0x88, 0x4e, 0xa5, 0xd1, 0x48, 0x55, 0x7d, 0x90, + 0xac, 0x95, 0xd5, 0x68, 0x8c, 0x82, 0x80, 0xb5, 0x6a, 0x8c, 0x45, 0xd2, + 0x3a, 0x91, 0x4c, 0xbe, 0x4e, 0xd7, 0xbc, 0x3d, 0x68, 0x74, 0x7a, 0xb2, + 0x46, 0x64, 0x48, 0x68, 0x88, 0x64, 0x3b, 0x4b, 0x45, 0xab, 0xac, 0x40, + 0xce, 0x8d, 0x3a, 0xcc, 0x4e, 0xb9, 0x6d, 0x69, 0xc4, 0xb5, 0x4a, 0xb7, + 0x88, 0xc2, 0xab, 0x56, 0xd4, 0xb9, 0xa2, 0x41, 0x9e, 0xb8, 0x7c, 0x5b, + 0xaf, 
0x4a, 0xb8, 0xc1, 0x51, 0xc0, 0xc9, 0xd3, 0xbe, 0x9d, 0xd0, 0x4f, + 0x43, 0x42, 0x6c, 0xd2, 0x97, 0x97, 0xc8, 0xcf, 0x33, 0x82, 0x77, 0x64, + 0x6f, 0x9e, 0x75, 0x70, 0x36, 0xd5, 0x83, 0x60, 0x46, 0x90, 0xb4, 0xd2, + 0x4f, 0xdc, 0xc7, 0x4d, 0x33, 0xba, 0x52, 0x90, 0xc7, 0xbf, 0xa6, 0x51, + 0x2d, 0x8c, 0x63, 0x42, 0x8e, 0x71, 0xa3, 0xd5, 0x8e, 0xcf, 0xb3, 0xc7, + 0xbe, 0xac, 0x3a, 0xc7, 0x85, 0x3a, 0xb2, 0x7c, 0x6f, 0x4f, 0xa5, 0x96, + 0x73, 0x8c, 0x3b, 0xc7, 0xca, 0x4d, 0x8d, 0x72, 0x46, 0xcd, 0x8f, 0x47, + 0xcf, 0x5e, 0x9d, 0x5c, 0x51, 0xa1, 0x8e, 0xca, 0xb0, 0x72, 0x45, 0x4c, + 0xb6, 0x40, 0x8e, 0x6e, 0x82, 0x7c, 0x90, 0x58, 0x6f, 0x38, 0x50, 0x87, + 0xc0, 0x72, 0x5a, 0x5d, 0xdd, 0xa7, 0x81, 0xd2, 0x5c, 0xcb, 0x9a, 0x51, + 0x34, 0x4b, 0xd2, 0xc2, 0xc6, 0x71, 0x94, 0xc5, 0x43, 0x37, 0x74, 0xa6, + 0xa1, 0x94, 0x5f, 0x66, 0x82, 0x9d, 0x9a, 0xca, 0xab, 0x9b, 0x50, 0x61, + 0x56, 0x3a, 0x5e, 0x33, 0x68, 0x38, 0x7f, 0xb8, 0x75, 0x9d, 0x4d, 0x92, + 0xaf, 0xcb, 0xc0, 0xa3, 0xb6, 0x59, 0x44, 0x70, 0x8c, 0x91, 0x6a, 0x62, + 0x59, 0x63, 0x51, 0xb0, 0xa4, 0x9d, 0x9d, 0x45, 0x59, 0x2e, 0xcd, 0x6a, + 0x79, 0xb6, 0x69, 0x90, 0xae, 0xcc, 0x42, 0x9c, 0xc7, 0xb5, 0x4e, 0xb2, + 0x3a, 0x65, 0x96, 0x99, 0xd3, 0x63, 0x72, 0x80, 0xcb, 0x9f, 0xaf, 0x77, + 0xcf, 0x62, 0x4e, 0x6e, 0x34, 0x37, 0x95, 0xc9, 0x79, 0xc7, 0x4b, 0x69, + 0x70, 0xa7, 0x4b, 0x60, 0x8b, 0x9e, 0x7a, 0x72, 0x99, 0xc7, 0xb8, 0x81, + 0x7b, 0x38, 0x7a, 0x65, 0x8d, 0x85, 0x67, 0xa2, 0xa6, 0xbb, 0x5b, 0x44, + 0x86, 0x4b, 0x79, 0xb4, 0x8b, 0x99, 0x41, 0x3d, 0x9d, 0x90, 0x93, 0x63, + 0x39, 0xb2, 0xc5, 0x8c, 0x49, 0x50, 0xcb, 0xb5, 0x52, 0x92, 0x8a, 0x92, + 0xa7, 0x74, 0x91, 0x9b, 0x34, 0xa6, 0x91, 0x67, 0x44, 0xb9, 0x4b, 0x89, + 0x6e, 0x72, 0x7e, 0xbb, 0x57, 0xc0, 0x89, 0x62, 0xd3, 0x2a, 0x8a, 0x51, + 0x8c, 0x32, 0x98, 0xc0, 0x5a, 0x53, 0x7f, 0x41, 0xb3, 0x41, 0x99, 0x66, + 0x2f, 0x5d, 0x9a, 0x5f, 0xa3, 0x4c, 0x9c, 0x8d, 0xa1, 0x30, 0x94, 0xcf, + 0x59, 0x9a, 0x4e, 0x59, 0x44, 0xd7, 0x48, 0x76, 0x8b, 0x94, 0x8b, 0x84, + 0x35, 0x3e, 0x3f, 0x98, 0x8c, 0xcf, 0x3c, 0xb8, 0x78, 0x6f, 0x3e, 0x8b, + 0x81, 0x31, 0x4d, 0xb9, 0x33, 0x3e, 0x98, 0x9b, 0x74, 0xb4, 0x8b, 0xc6, + 0xbb, 0xbd, 0x50, 0xbc, 0xaa, 0x31, 0x50, 0x61, 0x78, 0xc7, 0x73, 0x74, + 0x91, 0x88, 0x8d, 0x73, 0xd1, 0x61, 0xc1, 0x46, 0xc8, 0x4b, 0x6c, 0x81, + 0xa7, 0x86, 0x62, 0x9f, 0x81, 0x41, 0xc0, 0xa5, 0xcb, 0x76, 0x82, 0x31, + 0x39, 0x6c, 0xb2, 0xcf, 0x96, 0x4a, 0x95, 0xce, 0xc1, 0xc8, 0x52, 0x43, + 0x60, 0xb0, 0x80, 0x87, 0x90, 0x7c, 0x5a, 0x84, 0x6d, 0x33, 0x8a, 0x63, + 0x70, 0x5e, 0xd0, 0xa1, 0xb3, 0x9e, 0x72, 0x67, 0x45, 0x49, 0xad, 0x89, + 0x8a, 0x79, 0xb6, 0x77, 0xd3, 0x68, 0x90, 0xca, 0x44, 0x59, 0x56, 0x67, + 0xa0, 0xb5, 0xa0, 0x72, 0xa2, 0xb8, 0x65, 0x3a, 0xac, 0xbd, 0x4d, 0x56, + 0x94, 0x41, 0x57, 0x94, 0x34, 0x8b, 0xa0, 0x40, 0x99, 0x7a, 0x3b, 0x9a, + 0x62, 0x2f, 0x55, 0x5d, 0xb6, 0xaa, 0x87, 0x79, 0x99, 0x64, 0x78, 0x7c, + 0x6a, 0xc4, 0xbc, 0x53, 0x93, 0x3c, 0x61, 0x42, 0x87, 0xc1, 0x72, 0x3a, + 0xba, 0x9a, 0xa5, 0x64, 0x95, 0x38, 0x73, 0x34, 0x6d, 0x57, 0x69, 0xb8, + 0x7a, 0x60, 0x3a, 0x70, 0x3e, 0x66, 0xb7, 0x93, 0xaf, 0x77, 0x9f, 0xaf, + 0x5a, 0x88, 0x80, 0x36, 0x93, 0x30, 0x85, 0x41, 0x30, 0x88, 0x3d, 0xd1, + 0x94, 0xcd, 0xa3, 0x81, 0x32, 0x3b, 0x42, 0xb7, 0x75, 0x6e, 0xa6, 0x82, + 0x9b, 0x35, 0x98, 0x68, 0x8b, 0xca, 0x71, 0x76, 0x6a, 0xbc, 0x53, 0x77, + 0x72, 0xbf, 0xb8, 0xca, 0x2e, 0x35, 0x30, 0xb9, 0xd5, 0xb4, 0x5f, 0x58, + 0x39, 0xc9, 0x77, 0xac, 0x33, 0x64, 0xb9, 0xc0, 0x4a, 0x75, 0xb6, 0x8f, + 0x70, 0x7c, 0x74, 0x89, 0x58, 0x88, 0x41, 0xb5, 0xa4, 0x47, 0x98, 0xc6, + 0x98, 
0x4d, 0x82, 0xbb, 0x70, 0x9f, 0x4a, 0x91, 0x6b, 0x85, 0xbb, 0x93, + 0x38, 0xbe, 0xa4, 0x7a, 0x4f, 0x92, 0xd2, 0x39, 0x4a, 0x5a, 0x91, 0x62, + 0xb3, 0x94, 0x7e, 0x4a, 0x62, 0x4a, 0x58, 0x96, 0x59, 0x36, 0xc0, 0xcc, + 0xca, 0x9f, 0x8e, 0x67, 0x3a, 0xa6, 0x3f, 0x75, 0xac, 0xbc, 0x9d, 0x8a, + 0x7a, 0x91, 0x66, 0x93, 0xae, 0x4c, 0x6b, 0x33, 0xc3, 0x43, 0x90, 0x7e, + 0x4c, 0x5a, 0xc5, 0x59, 0x69, 0x58, 0x88, 0xae, 0x40, 0x72, 0xa3, 0x8f, + 0xba, 0x6b, 0xbe, 0xca, 0x77, 0xae, 0xc4, 0xa5, 0x92, 0x84, 0x6e, 0xca, + 0x56, 0x74, 0xc4, 0x87, 0xc7, 0xbc, 0x66, 0x48, 0xba, 0x37, 0x7a, 0x80, + 0xce, 0xb5, 0x62, 0x81, 0xb8, 0x3f, 0xa7, 0xcd, 0x59, 0x38, 0x6d, 0x41, + 0x6e, 0xc7, 0x7e, 0x6c, 0x7f, 0x77, 0x40, 0xcf, 0x39, 0xbb, 0xad, 0x7b, + 0xa2, 0x45, 0x6d, 0x7f, 0xa5, 0x3a, 0xaf, 0xcb, 0x76, 0x83, 0xc4, 0x8c, + 0xb6, 0xc8, 0x75, 0x6e, 0x85, 0x6c, 0x9c, 0xb3, 0x3b, 0x4f, 0x5b, 0x70, + 0xc3, 0x41, 0x9f, 0x64, 0xaa, 0x5c, 0xb5, 0x6b, 0xa6, 0x34, 0x5e, 0x6e, + 0x42, 0x85, 0xb4, 0x9c, 0x43, 0xb7, 0x79, 0x97, 0xcb, 0x7f, 0xa4, 0x30, + 0x84, 0xce, 0xc8, 0xb6, 0x86, 0xa5, 0x3e, 0xa9, 0x70, 0x2e, 0x3b, 0x42, + 0x5d, 0xd2, 0xc3, 0x71, 0xd0, 0x83, 0x40, 0x6a, 0x8e, 0x95, 0xa5, 0x5c, + 0xb6, 0x73, 0x98, 0x47, 0x93, 0x50, 0x64, 0x67, 0x51, 0xcd, 0x8b, 0xc6, + 0x76, 0x75, 0x65, 0x44, 0x58, 0x38, 0x91, 0x6a, 0xa8, 0xad, 0x87, 0x52, + 0x90, 0x9a, 0xd0, 0x8a, 0x90, 0x88, 0x63, 0xad, 0x58, 0xb0, 0xa8, 0x83, + 0x48, 0xa9, 0x31, 0x94, 0x6b, 0x3b, 0x46, 0x6d, 0x8b, 0x52, 0xbe, 0x83, + 0xa8, 0xc0, 0xc2, 0x4a, 0xaa, 0x7f, 0x59, 0x66, 0x7a, 0x6f, 0xb3, 0x8c, + 0x63, 0xba, 0xa4, 0x35, 0x88, 0xa3, 0x84, 0x3b, 0x57, 0x45, 0x3d, 0xa9, + 0x76, 0xbb, 0x7c, 0x98, 0xa9, 0x6a, 0x40, 0x6b, 0x87, 0xbb, 0x48, 0x9f, + 0x94, 0xcf, 0xbc, 0x98, 0xc7, 0x64, 0x4c, 0x5c, 0x7f, 0x63, 0xb7, 0xbc, + 0x3b, 0x90, 0xcc, 0x64, 0x54, 0xc1, 0x7f, 0xb7, 0x45, 0xb6, 0xbe, 0x78, + 0x87, 0xc0, 0x62, 0xb0, 0x7d, 0x83, 0x45, 0xc3, 0x64, 0xd6, 0xac, 0xcf, + 0x87, 0x84, 0x92, 0xaf, 0x40, 0x53, 0x92, 0x3c, 0x4d, 0x54, 0xc4, 0x78, + 0xad, 0xa5, 0x64, 0x5e, 0xcc, 0x5b, 0x54, 0x67, 0xd2, 0xc8, 0x37, 0x39, + 0x96, 0x77, 0x8e, 0x69, 0xaa, 0x51, 0xb0, 0x6b, 0xb5, 0x99, 0x60, 0x96, + 0x9b, 0xb2, 0x5a, 0x99, 0x66, 0x34, 0x61, 0x52, 0x45, 0x6e, 0x84, 0xb1, + 0xc9, 0xac, 0xc4, 0xab, 0x6c, 0x7a, 0x39, 0x8e, 0x9c, 0x8d, 0xc8, 0x8e, + 0x57, 0xa8, 0x8f, 0x9e, 0x76, 0xb4, 0x5e, 0x6c, 0xb8, 0x8c, 0x6b, 0x76, + 0x94, 0xcb, 0x7e, 0x7f, 0x32, 0x44, 0x8c, 0xd2, 0x7a, 0x6d, 0x9b, 0xcb, + 0x4b, 0xc6, 0xc2, 0x66, 0x89, 0xa9, 0x7f, 0x3d, 0x9a, 0xb0, 0xc6, 0xcf, + 0xca, 0xa6, 0x98, 0x3b, 0xae, 0xac, 0x7d, 0xad, 0x9c, 0x69, 0x60, 0x63, + 0x8e, 0xaf, 0xb3, 0x3e, 0x52, 0xc6, 0x72, 0x54, 0xc8, 0xc4, 0xa2, 0xa2, + 0x64, 0x81, 0x70, 0x7a, 0x52, 0xaf, 0xc4, 0x61, 0x51, 0xca, 0x5f, 0xa3, + 0x7a, 0xc2, 0x7e, 0x71, 0x4a, 0x9e, 0xae, 0x6d, 0xbe, 0x94, 0x5b, 0xa8, + 0x32, 0xbc, 0xab, 0xb2, 0x63, 0x82, 0x39, 0x9f, 0x56, 0x45, 0x6b, 0x5a, + 0xb2, 0x5b, 0x81, 0xa0, 0xcf, 0x98, 0x3f, 0xaf, 0x3f, 0x50, 0x6e, 0x41, + 0x61, 0x8b, 0x93, 0x8f, 0xa5, 0x44, 0xb6, 0x7c, 0x9e, 0x8c, 0xb1, 0x83, + 0xbf, 0xcb, 0x4b, 0xb2, 0x71, 0x78, 0x39, 0x4e, 0x6b, 0x9f, 0xd1, 0x93, + 0x9b, 0x95, 0x78, 0x62, 0xcd, 0xa1, 0xbe, 0x27, 0x7e, 0xa8, 0x4a, 0x5e, + 0xca, 0x59, 0x42, 0x4a, 0xc7, 0x95, 0x47, 0xcf, 0xb6, 0xbe, 0xaa, 0x50, + 0x9a, 0xa6, 0x60, 0x65, 0x88, 0x9d, 0x5e, 0x66, 0x37, 0x3f, 0x30, 0xbf, + 0x96, 0xb2, 0x72, 0xc4, 0xa2, 0x50, 0x44, 0xd0, 0x52, 0xa2, 0x7b, 0x67, + 0x75, 0x65, 0x68, 0x5b, 0x71, 0xca, 0x5e, 0x5a, 0x38, 0xa1, 0x9b, 0xc8, + 0xb4, 0x48, 0xa6, 0x97, 0x5e, 0xbb, 0x3b, 0x8d, 0x50, 0x92, 0x83, 0x71, + 0x3f, 
0x92, 0x62, 0xd6, 0x3b, 0xad, 0xd1, 0x64, 0x6e, 0x80, 0x75, 0x54, + 0x44, 0x38, 0x78, 0x33, 0x46, 0xb3, 0x63, 0x85, 0x8f, 0xa6, 0x3d, 0x9b, + 0xa3, 0x9b, 0x34, 0xce, 0x77, 0x41, 0x49, 0xa3, 0x47, 0x6c, 0x51, 0x95, + 0x49, 0x87, 0x48, 0x6e, 0x3c, 0x5f, 0x66, 0x37, 0xb7, 0x82, 0x9f, 0xab, + 0x8f, 0x70, 0x5c, 0x57, 0x8a, 0x4c, 0x46, 0x92, 0x9c, 0xd5, 0x4e, 0xbb, + 0x8c, 0x9b, 0x22, 0x60, 0x83, 0xca, 0x56, 0x72, 0x85, 0x9c, 0xa8, 0x9a, + 0xca, 0xbc, 0x9f, 0x6f, 0xab, 0x86, 0xbd, 0x3c, 0x42, 0x9a, 0xb9, 0x3a, + 0xcb, 0x31, 0x67, 0x55, 0xad, 0x4c, 0x37, 0x72, 0x94, 0x4c, 0x91, 0xcc, + 0xad, 0x7d, 0xcb, 0x67, 0x8f, 0x7d, 0x9a, 0xc4, 0xcc, 0x97, 0xbe, 0x6a, + 0xb7, 0xb9, 0xd6, 0x94, 0xd5, 0x41, 0x99, 0x33, 0x58, 0x42, 0xbe, 0xb7, + 0xaf, 0x58, 0x60, 0x3e, 0x53, 0x89, 0xd2, 0xc7, 0x5b, 0x8d, 0xc4, 0x60, + 0xb8, 0x74, 0xaa, 0xa2, 0x76, 0xb0, 0xca, 0x53, 0x57, 0x62, 0xca, 0x3c, + 0x54, 0x3f, 0x70, 0xd3, 0xbb, 0xbc, 0x96, 0x81, 0x81, 0xbd, 0x60, 0xe1, + 0x63, 0x9a, 0x80, 0xc1, 0xc1, 0x41, 0x8d, 0x96, 0x44, 0xb6, 0xc5, 0xd2, + 0x4e, 0x7e, 0xdd, 0xce, 0x88, 0x85, 0x8b, 0xba, 0x80, 0x9b, 0x3a, 0x90, + 0xc8, 0x33, 0x47, 0x8d, 0x94, 0x7b, 0x8e, 0x80, 0xa8, 0xa8, 0xb7, 0x82, + 0x67, 0x36, 0x84, 0x9f, 0x3b, 0x68, 0x5f, 0x4b, 0x30, 0x3b, 0x78, 0x60, + 0xb7, 0xa6, 0x55, 0xc7, 0x72, 0x59, 0xa7, 0x5a, 0x85, 0x8a, 0xcb, 0x41, + 0xb6, 0x4c, 0x79, 0x89, 0xc1, 0xb5, 0x90, 0xb5, 0x90, 0xbf, 0x8f, 0x94, + 0x36, 0x48, 0xa9, 0x64, 0x89, 0xa5, 0x81, 0x54, 0x8f, 0x2c, 0xa2, 0xa2, + 0x51, 0x7a, 0x5f, 0xb9, 0x6b, 0x47, 0x6b, 0xbb, 0x74, 0xa8, 0x3b, 0xcf, + 0x83, 0xb7, 0xc4, 0xce, 0x7f, 0x78, 0x8d, 0x44, 0x52, 0xc8, 0xb8, 0x51, + 0x7b, 0xb0, 0xc9, 0xbe, 0x83, 0x79, 0x4c, 0x7e, 0x71, 0x5d, 0x7a, 0xbe, + 0x4d, 0x91, 0x72, 0x42, 0x45, 0x46, 0x6e, 0xb2, 0x78, 0xb6, 0x6f, 0x63, + 0x59, 0x45, 0x67, 0x75, 0xaa, 0xb6, 0x83, 0x2f, 0xb3, 0xc1, 0x80, 0x3b, + 0x9a, 0x78, 0x57, 0x41, 0x83, 0x39, 0x84, 0x4b, 0x99, 0x8a, 0xca, 0xb8, + 0x54, 0x71, 0x42, 0x97, 0xd5, 0xc1, 0x9a, 0x5f, 0x86, 0xa0, 0x4e, 0x3f, + 0x69, 0x80, 0xae, 0x8a, 0x5d, 0xbb, 0xc8, 0x42, 0xaf, 0xc6, 0x85, 0x47, + 0x7f, 0x97, 0xcd, 0xb5, 0xcd, 0x81, 0xb7, 0x75, 0x69, 0x90, 0x78, 0x5c, + 0x5a, 0x81, 0x9e, 0x4b, 0x81, 0xaa, 0x9b, 0x79, 0x80, 0x6a, 0x2e, 0x67, + 0xc4, 0x4a, 0x5f, 0xbc, 0xc0, 0x55, 0x40, 0x49, 0xa9, 0x81, 0x48, 0x61, + 0xc0, 0x95, 0x71, 0x71, 0x82, 0x67, 0xc8, 0x1f, 0x48, 0x8a, 0x3e, 0x62, + 0xbb, 0xae, 0x8b, 0x5c, 0x8f, 0x6f, 0xcd, 0x64, 0x88, 0x92, 0xd3, 0x5b, + 0xcb, 0xc0, 0x92, 0x54, 0xad, 0x57, 0xa8, 0x41, 0x89, 0x6c, 0xd2, 0x44, + 0xa4, 0x87, 0x39, 0x92, 0x84, 0x9e, 0xa6, 0xa8, 0x8f, 0x47, 0xbb, 0x3e, + 0x97, 0x54, 0x95, 0x5c, 0x82, 0x46, 0x80, 0xb0, 0x7a, 0xce, 0x59, 0x78, + 0x8e, 0x79, 0xae, 0x6e, 0xbb, 0x6d, 0xc3, 0x94, 0x6d, 0xbc, 0x9b, 0x86, + 0x77, 0x9b, 0x57, 0x64, 0x98, 0x40, 0x87, 0xad, 0xa6, 0xa8, 0xa5, 0xd3, + 0xbf, 0x37, 0x52, 0x8b, 0x60, 0xc1, 0x3a, 0x75, 0x47, 0x79, 0x84, 0xb2, + 0x5a, 0x93, 0x5e, 0x4c, 0x8b, 0xce, 0xcc, 0x72, 0x74, 0x50, 0x40, 0xa2, + 0x82, 0x5b, 0xcc, 0xbc, 0x58, 0x9f, 0x8b, 0x56, 0x52, 0x91, 0xcb, 0xa8, + 0xac, 0x89, 0x49, 0xa9, 0xa3, 0xc1, 0xa4, 0xab, 0x9e, 0x8b, 0xcb, 0xb0, + 0xad, 0xa6, 0x3d, 0x98, 0xd0, 0x8d, 0x83, 0x3d, 0x4b, 0xc6, 0x85, 0x45, + 0x4b, 0xb1, 0x5a, 0x8f, 0x9d, 0x65, 0x69, 0x73, 0x48, 0xc3, 0x7d, 0x8a, + 0x55, 0xaf, 0x79, 0x7b, 0x8e, 0xc9, 0x59, 0x85, 0x54, 0x57, 0x44, 0x2f, + 0x89, 0xac, 0xc1, 0xa8, 0xad, 0x4e, 0xa9, 0x8f, 0xce, 0x4d, 0x78, 0x4c, + 0x68, 0x53, 0xaa, 0x86, 0x28, 0xa4, 0xad, 0x39, 0xa2, 0x63, 0x94, 0x95, + 0xa6, 0xc3, 0x82, 0xda, 0xc7, 0x57, 0xb6, 0xc0, 0x3e, 0xd5, 0xd6, 0x82, + 0xa3, 
0xc0, 0xc4, 0xd8, 0xbd, 0xc5, 0xad, 0x59, 0x7a, 0x81, 0x5a, 0x59, + 0x32, 0x96, 0xb5, 0x3a, 0xba, 0x8e, 0x8b, 0xbe, 0x52, 0x61, 0xcc, 0x73, + 0xb1, 0xa6, 0x57, 0xaa, 0x82, 0xac, 0x31, 0x42, 0xa5, 0x48, 0x5c, 0xc3, + 0x4e, 0x45, 0x59, 0xb9, 0x83, 0xb0, 0xb3, 0x75, 0x4e, 0x4f, 0x39, 0xbd, + 0x71, 0xa8, 0xb5, 0x96, 0x78, 0x52, 0xa6, 0x50, 0xb4, 0xd3, 0x39, 0x53, + 0x81, 0xa8, 0x80, 0xa0, 0x5f, 0x37, 0x74, 0x52, 0x6d, 0x42, 0x64, 0x60, + 0x8c, 0x42, 0x85, 0xdb, 0xbd, 0x4c, 0xd0, 0x3b, 0xc8, 0xb8, 0xd4, 0xb4, + 0xa1, 0x8c, 0xab, 0x47, 0xc1, 0xd3, 0x39, 0x31, 0x33, 0x92, 0x85, 0x37, + 0x75, 0x9c, 0xdd, 0x55, 0x63, 0xa5, 0xd1, 0xae, 0x9b, 0x99, 0xd5, 0x95, + 0xbd, 0x95, 0x5a, 0x9b, 0x6f, 0x81, 0x52, 0x8c, 0xcf, 0xb0, 0x9a, 0x58, + 0x7a, 0xa6, 0x82, 0xdc, 0xac, 0x5f, 0x71, 0xb6, 0xba, 0xa3, 0x3d, 0xc2, + 0x50, 0xa8, 0x68, 0x38, 0x44, 0x77, 0x6d, 0xa1, 0x74, 0xbd, 0x4f, 0x70, + 0x4e, 0xc5, 0x49, 0xa5, 0x2c, 0x3d, 0x8e, 0xaf, 0x37, 0xb9, 0xc3, 0xc3, + 0x4d, 0x92, 0x2c, 0xa2, 0x92, 0x78, 0x30, 0x9d, 0xd5, 0x6a, 0xc2, 0xdb, + 0x51, 0xd5, 0x6d, 0xe9, 0x49, 0x81, 0x98, 0xd4, 0xb4, 0x43, 0xca, 0xae, + 0x92, 0x49, 0xdf, 0x9e, 0x7d, 0x69, 0xc1, 0x39, 0x4b, 0x62, 0x89, 0x6a, + 0x9c, 0xaa, 0x8d, 0x8e, 0xb5, 0x94, 0x9f, 0xd3, 0xc2, 0x9e, 0x76, 0xa6, + 0xba, 0x4b, 0x72, 0x8f, 0xcf, 0x8a, 0x94, 0xae, 0xa1, 0x9f, 0x78, 0xc4, + 0x67, 0x93, 0x30, 0xa7, 0xa4, 0x45, 0x53, 0x54, 0xa7, 0xba, 0x9f, 0x5d, + 0x71, 0x77, 0x3f, 0xa1, 0xce, 0x9a, 0x89, 0x33, 0xd6, 0x31, 0xa0, 0xd8, + 0x86, 0x62, 0x32, 0x61, 0xa3, 0x9a, 0xb1, 0xb8, 0x51, 0x9a, 0xb4, 0x51, + 0x7b, 0xa0, 0xbe, 0xb4, 0xca, 0x61, 0x96, 0x56, 0x51, 0x7e, 0x48, 0x9f, + 0xbd, 0xb7, 0xcc, 0x66, 0x7f, 0xbd, 0x41, 0x4f, 0x5a, 0xb2, 0x35, 0xb7, + 0x7b, 0x9e, 0x62, 0x3e, 0xb4, 0x52, 0xa3, 0x5a, 0x45, 0xc5, 0x77, 0xbd, + 0xc4, 0x67, 0x85, 0x76, 0x9a, 0x55, 0x9c, 0x56, 0x96, 0x81, 0xbb, 0x56, + 0x9c, 0xc7, 0xa0, 0x4e, 0x38, 0x8f, 0x62, 0x9a, 0x64, 0xc0, 0x59, 0x8e, + 0x7a, 0x68, 0x50, 0x4b, 0xc9, 0x4c, 0x7c, 0x9f, 0x95, 0x9a, 0x45, 0x49, + 0xda, 0x39, 0x74, 0xa9, 0xb4, 0xc1, 0x69, 0xc3, 0xbb, 0x66, 0x5a, 0x77, + 0x94, 0x4f, 0x81, 0x5c, 0xba, 0xca, 0x8f, 0x80, 0x44, 0xb3, 0x8f, 0x33, + 0xb6, 0x8c, 0x4b, 0xb4, 0x34, 0x4e, 0xa3, 0x49, 0xbb, 0x62, 0xb1, 0xd7, + 0x6c, 0x32, 0x42, 0x3f, 0x61, 0x83, 0x8b, 0xa6, 0x74, 0x77, 0xb9, 0xbf, + 0x5c, 0x41, 0x32, 0x6b, 0x3b, 0x6d, 0xdc, 0xd4, 0x61, 0x6f, 0xaa, 0xae, + 0x91, 0xb4, 0x31, 0x81, 0x68, 0xd8, 0xc9, 0x33, 0x7f, 0x9c, 0xac, 0x4b, + 0x77, 0xbb, 0x6e, 0x3f, 0x6e, 0x78, 0xb3, 0x7b, 0x8b, 0x6e, 0xd3, 0xd1, + 0x83, 0xca, 0x9c, 0xc0, 0x5f, 0x56, 0xd8, 0x85, 0x88, 0x65, 0x9f, 0x47, + 0x53, 0x95, 0x35, 0x69, 0x6f, 0xbb, 0xc2, 0xd2, 0xa5, 0x8e, 0x96, 0x78, + 0xd8, 0x83, 0x45, 0x7b, 0x94, 0x7f, 0x5c, 0x63, 0x4e, 0x8e, 0xcb, 0x42, + 0xab, 0x38, 0x94, 0x5b, 0x4b, 0x93, 0x4f, 0x7a, 0x5f, 0xb8, 0xcf, 0x48, + 0xd3, 0x7f, 0xb3, 0xca, 0x83, 0x62, 0x48, 0x82, 0xd1, 0xba, 0x50, 0x6d, + 0x88, 0xca, 0xc8, 0xa2, 0x7e, 0x83, 0x82, 0x50, 0xc0, 0x6f, 0x4b, 0x39, + 0x4b, 0x90, 0x4c, 0x77, 0x40, 0xc1, 0x96, 0xde, 0xd0, 0xd2, 0x97, 0x8a, + 0xd9, 0x57, 0x92, 0x59, 0x6a, 0x8c, 0xb5, 0xba, 0x5e, 0x9f, 0xc0, 0x4c, + 0x5e, 0x83, 0xa6, 0x49, 0x6a, 0x82, 0xd2, 0xb5, 0x7a, 0xab, 0x5f, 0x7a, + 0x5c, 0x9b, 0xb2, 0xdb, 0x96, 0xc2, 0xb4, 0x58, 0xa5, 0xd7, 0xbb, 0x5c, + 0x74, 0x3d, 0xba, 0xae, 0x59, 0xad, 0x50, 0x3d, 0x40, 0x8c, 0x89, 0xb9, + 0x8f, 0x64, 0x8b, 0x8e, 0xc3, 0xc5, 0x87, 0xa5, 0xc1, 0x3d, 0x53, 0x50, + 0x77, 0xb0, 0x7b, 0x9f, 0x3c, 0x53, 0x49, 0xc9, 0x77, 0x5b, 0x42, 0x68, + 0x82, 0xcf, 0x50, 0xa8, 0x6e, 0x74, 0x30, 0x75, 0x70, 0x89, 0xaf, 0x7a, + 0xcd, 
0x47, 0x47, 0x92, 0x6f, 0xc0, 0xc3, 0xd6, 0x45, 0x80, 0xcc, 0xb6, + 0x3b, 0x74, 0x9d, 0x70, 0x4b, 0x57, 0xbe, 0x56, 0x7b, 0xc9, 0xc3, 0x7b, + 0xca, 0xb7, 0x84, 0xd2, 0x6f, 0xa6, 0x77, 0x46, 0x85, 0x2e, 0xb1, 0xba, + 0xd3, 0x54, 0x6d, 0x4d, 0xdc, 0x7d, 0xbd, 0x45, 0x7a, 0x65, 0x7d, 0x5a, + 0xa6, 0x68, 0x5f, 0x82, 0x90, 0x40, 0xde, 0x63, 0xac, 0x9e, 0x5d, 0x97, + 0x8d, 0x2a, 0x54, 0x5a, 0x8d, 0x92, 0x89, 0xbb, 0xb3, 0xc3, 0x78, 0xb2, + 0x9b, 0xa3, 0xa1, 0x83, 0x6b, 0x89, 0x7f, 0x92, 0xac, 0xa0, 0x8d, 0x66, + 0xb8, 0x46, 0x7b, 0xe3, 0x74, 0x68, 0xc8, 0xcf, 0x67, 0x53, 0x80, 0xc8, + 0x6b, 0x6f, 0x53, 0x8a, 0xa6, 0xbf, 0x54, 0x76, 0x91, 0x6d, 0xae, 0xbd, + 0xc1, 0x67, 0xab, 0x9b, 0xba, 0x7b, 0x8e, 0xaf, 0x55, 0xac, 0x52, 0xba, + 0x7a, 0xc7, 0x7d, 0x86, 0xa3, 0x9e, 0x77, 0x47, 0xae, 0x9e, 0x93, 0x5c, + 0xd5, 0xbf, 0x70, 0x74, 0x99, 0x45, 0x8c, 0xae, 0x6a, 0x70, 0x84, 0x3e, + 0xc1, 0x83, 0x83, 0xae, 0x6d, 0x71, 0xa5, 0xdc, 0xca, 0x78, 0xb5, 0x9a, + 0xbd, 0x70, 0xad, 0x83, 0xbe, 0xa3, 0x5d, 0xae, 0x56, 0x96, 0xc2, 0x8a, + 0xac, 0x3c, 0x47, 0x54, 0xad, 0x40, 0xb3, 0x6c, 0xad, 0x8b, 0x53, 0x77, + 0x82, 0x86, 0xc7, 0xa3, 0x79, 0x88, 0xa7, 0xc8, 0x34, 0x7a, 0xa3, 0x72, + 0xd2, 0x97, 0xde, 0xbb, 0x6c, 0xd1, 0x8c, 0x57, 0x58, 0x71, 0xc9, 0xa5, + 0x62, 0x38, 0x41, 0x60, 0xc5, 0x34, 0x4d, 0x3f, 0x57, 0xda, 0xa5, 0x7c, + 0x59, 0x86, 0x52, 0xb1, 0xe3, 0x86, 0x9f, 0xcb, 0xad, 0x73, 0x47, 0x4a, + 0xcd, 0x55, 0xbb, 0xd9, 0xd9, 0x8e, 0x7c, 0x43, 0x6c, 0x72, 0x27, 0x35, + 0x4b, 0x81, 0x16, 0x79, 0xbc, 0x3d, 0x86, 0xc7, 0x95, 0x62, 0xbe, 0x45, + 0xbb, 0xcc, 0x47, 0x58, 0xcf, 0x77, 0x74, 0x68, 0x48, 0xd8, 0xae, 0x5f, + 0x81, 0xaf, 0xa9, 0x5f, 0x7b, 0x9c, 0xa2, 0xc7, 0x50, 0x74, 0x81, 0x9f, + 0xba, 0xa7, 0x72, 0x86, 0xa0, 0x9b, 0x9b, 0x36, 0x74, 0xd2, 0x33, 0x4b, + 0x44, 0xc3, 0x34, 0xbe, 0x99, 0xb1, 0x8c, 0x34, 0x82, 0xbb, 0x96, 0x4d, + 0xc6, 0xc4, 0x6f, 0x5b, 0xd3, 0x8a, 0xc9, 0x4a, 0x7d, 0x37, 0x4b, 0x69, + 0x68, 0xd9, 0x9d, 0xa5, 0x73, 0x9b, 0x6c, 0x4a, 0x89, 0xd6, 0x6e, 0xae, + 0x44, 0xb4, 0x54, 0x3f, 0x7f, 0x75, 0x6b, 0x50, 0xaf, 0x75, 0x4d, 0x60, + 0xaf, 0x8a, 0x90, 0xc9, 0x87, 0xa5, 0x32, 0xc4, 0xc9, 0x8f, 0xa5, 0xd4, + 0xcb, 0x95, 0x8f, 0x9c, 0x75, 0x71, 0xd1, 0xb3, 0x9c, 0xa7, 0x63, 0x35, + 0xd1, 0x70, 0x7a, 0x8c, 0x6b, 0xb1, 0x95, 0x65, 0x6f, 0x7f, 0x7d, 0x4c, + 0xa1, 0x67, 0x8d, 0x56, 0x84, 0xca, 0x99, 0x9e, 0x71, 0x55, 0x64, 0x47, + 0xc1, 0xab, 0x83, 0x54, 0xb8, 0x90, 0x86, 0x92, 0x3e, 0xce, 0xc7, 0x57, + 0x46, 0x60, 0x7e, 0x74, 0x60, 0x6a, 0x9a, 0xa9, 0xc9, 0xac, 0x90, 0x9d, + 0x95, 0x9d, 0x8d, 0x43, 0x84, 0x92, 0x85, 0x58, 0x4b, 0xc7, 0xc3, 0x42, + 0xb4, 0x62, 0xc5, 0xb8, 0xa5, 0x51, 0x48, 0x65, 0xad, 0xb3, 0x9f, 0x54, + 0xc0, 0xb6, 0xab, 0x84, 0x48, 0x34, 0x8f, 0x54, 0x3d, 0xd8, 0xb9, 0x43, + 0x3b, 0xbe, 0x66, 0x41, 0xcb, 0x52, 0x77, 0x52, 0x30, 0xbd, 0x51, 0xcf, + 0xbe, 0x47, 0x8d, 0xa9, 0xba, 0xbe, 0xc3, 0xc4, 0x8b, 0xbd, 0xba, 0xce, + 0x6e, 0xb9, 0x66, 0x3a, 0xb4, 0xb3, 0xd4, 0x68, 0xb6, 0x84, 0x3b, 0x83, + 0x3b, 0x38, 0xa4, 0x39, 0xab, 0x48, 0x47, 0x3d, 0x85, 0x5d, 0x88, 0xad, + 0xc9, 0x4c, 0x89, 0x5f, 0xa7, 0xac, 0x7b, 0x9b, 0xc3, 0xca, 0xa7, 0x8b, + 0x46, 0xad, 0x34, 0x90, 0x3b, 0x37, 0x41, 0x5a, 0x81, 0x61, 0xae, 0x69, + 0xac, 0x8a, 0x44, 0x96, 0xcf, 0x7d, 0x4b, 0x66, 0xa2, 0x69, 0xd4, 0xac, + 0xc4, 0x52, 0xb0, 0x40, 0xc3, 0xc3, 0x9e, 0xaf, 0x87, 0xc5, 0x5b, 0x8d, + 0x65, 0xa6, 0x78, 0xae, 0x87, 0xc7, 0x90, 0x7b, 0x87, 0xab, 0x9f, 0x81, + 0x71, 0xa8, 0xca, 0x6b, 0xca, 0x4c, 0xd0, 0xaa, 0x86, 0x82, 0xa4, 0x88, + 0xa7, 0xe4, 0x62, 0x35, 0x83, 0xc5, 0x89, 0x22, 0x3d, 0x3d, 0x97, 0xa0, + 0x8d, 
0xad, 0x9b, 0x9d, 0xb7, 0x8e, 0xd7, 0x6f, 0x59, 0xad, 0x4e, 0x56, + 0x54, 0xc0, 0x3f, 0x8b, 0x57, 0xb0, 0x65, 0x41, 0xb1, 0x3a, 0x70, 0x82, + 0xc0, 0x49, 0xd1, 0x5b, 0x2f, 0x4e, 0x6f, 0xbf, 0x44, 0xb0, 0xb1, 0xb1, + 0xb0, 0x37, 0x83, 0x98, 0xcd, 0x4d, 0x50, 0xad, 0xaf, 0x87, 0x52, 0x4d, + 0x3c, 0x80, 0x5d, 0x41, 0xa8, 0xc5, 0x93, 0xa4, 0xb9, 0x4f, 0x66, 0x99, + 0x63, 0x67, 0x59, 0x74, 0xb2, 0x82, 0xa0, 0xab, 0x75, 0xaf, 0xca, 0xb0, + 0x61, 0xd6, 0xad, 0x65, 0x38, 0xa0, 0x3a, 0x84, 0xcd, 0xb4, 0x77, 0x99, + 0x50, 0x82, 0x83, 0x37, 0xbd, 0x80, 0x38, 0x42, 0x66, 0x6c, 0x54, 0xcc, + 0x73, 0x6f, 0xa1, 0x42, 0xcd, 0x4f, 0xe9, 0x7d, 0x47, 0xbc, 0xb2, 0x78, + 0x8e, 0x67, 0x58, 0x4a, 0xd8, 0xd9, 0xb3, 0x4d, 0xaa, 0xb3, 0x9d, 0xb7, + 0xb7, 0xab, 0xc2, 0x9d, 0x44, 0xa5, 0x5e, 0x7e, 0xa6, 0x73, 0xab, 0xbb, + 0xb9, 0xaa, 0xb3, 0x4d, 0x84, 0x67, 0xcb, 0x85, 0xb3, 0x91, 0xab, 0x4f, + 0x9d, 0x84, 0xaf, 0x9e, 0x50, 0x73, 0x38, 0xc0, 0xce, 0xaa, 0x92, 0x82, + 0x54, 0x53, 0xbe, 0xd2, 0x35, 0x3a, 0x34, 0x44, 0x37, 0x49, 0xb3, 0xa3, + 0x43, 0x71, 0x79, 0xcd, 0x4a, 0xb7, 0x58, 0xaf, 0x40, 0x38, 0x2e, 0x5d, + 0xb2, 0x64, 0x72, 0x5e, 0x38, 0x5b, 0xb4, 0xc6, 0x6e, 0x5c, 0x6c, 0x88, + 0x6c, 0x9f, 0xbe, 0x6a, 0xa9, 0x82, 0x48, 0x9a, 0xac, 0x9d, 0x3a, 0x6f, + 0x6d, 0xbd, 0xc8, 0xa0, 0x70, 0xaf, 0xce, 0x32, 0x40, 0xb7, 0xbe, 0xd5, + 0x59, 0xc7, 0x87, 0x48, 0x9d, 0x7d, 0xaa, 0x91, 0xaf, 0x43, 0x4d, 0x50, + 0x6e, 0x85, 0x66, 0xdd, 0x54, 0x3a, 0xc1, 0xa5, 0xc6, 0x73, 0x58, 0x54, + 0x80, 0xac, 0x92, 0xcd, 0xb7, 0x60, 0xa2, 0xd9, 0x99, 0x54, 0x5b, 0x40, + 0x77, 0x3b, 0x78, 0xb1, 0xc6, 0x8b, 0x48, 0x9d, 0x6f, 0xa7, 0x97, 0x7e, + 0x9d, 0x31, 0xa0, 0x5b, 0xbb, 0x99, 0x62, 0x8e, 0xa7, 0xb3, 0x6e, 0x48, + 0x48, 0x5d, 0x92, 0x94, 0x58, 0x36, 0xbd, 0xad, 0x40, 0x34, 0xa5, 0x73, + 0x63, 0x5e, 0x46, 0x76, 0xba, 0x9f, 0x3f, 0x58, 0x78, 0xc1, 0x40, 0xd7, + 0x4d, 0x67, 0xaf, 0x3b, 0x4c, 0x88, 0x43, 0x57, 0xb9, 0x82, 0xb4, 0x8f, + 0x72, 0x85, 0xda, 0xd1, 0x5a, 0xa3, 0x28, 0x60, 0xb9, 0x8b, 0x68, 0x6a, + 0x80, 0xa0, 0x97, 0x75, 0x37, 0xc9, 0x49, 0x71, 0x77, 0x48, 0x63, 0x68, + 0x7d, 0x5b, 0xcc, 0x87, 0xa9, 0x5f, 0x4c, 0xc6, 0x55, 0x88, 0xb3, 0x79, + 0x90, 0xe0, 0x41, 0x36, 0xc2, 0xad, 0xaf, 0x4d, 0x88, 0x43, 0xc5, 0xa6, + 0xb8, 0xbc, 0x41, 0x61, 0x8a, 0xc2, 0x99, 0x94, 0xc5, 0x80, 0x98, 0x62, + 0xa1, 0xc2, 0xc1, 0x74, 0xb6, 0xbd, 0x66, 0xba, 0xa5, 0x68, 0x34, 0x3f, + 0x54, 0x6a, 0xc1, 0xb1, 0x52, 0x4f, 0xb2, 0x53, 0x93, 0xaa, 0x86, 0x37, + 0x80, 0x8d, 0xbf, 0x68, 0xb4, 0x4b, 0xc9, 0x63, 0x31, 0x4c, 0x9d, 0xbb, + 0xa0, 0xac, 0xb4, 0x64, 0x4c, 0x99, 0x33, 0x63, 0xd4, 0x5e, 0x3c, 0xb3, + 0x9b, 0xbb, 0x43, 0x4f, 0xa0, 0x45, 0xe0, 0x73, 0xac, 0x87, 0x83, 0x9f, + 0xb2, 0x3e, 0x3b, 0x2c, 0x62, 0xb8, 0xd2, 0x92, 0xd9, 0x45, 0x60, 0x56, + 0x44, 0x8a, 0x77, 0x49, 0xc2, 0x6f, 0x5d, 0x4e, 0x3a, 0x64, 0x62, 0xcf, + 0xa2, 0xcf, 0x64, 0xca, 0x40, 0xc7, 0x52, 0x52, 0x7c, 0x65, 0x55, 0x52, + 0xc8, 0xbd, 0x57, 0xe2, 0x35, 0x87, 0x78, 0x9c, 0xaa, 0x7c, 0x49, 0x54, + 0x4a, 0x8a, 0x65, 0x99, 0xb9, 0x89, 0x6f, 0x34, 0x78, 0xca, 0x5c, 0x40, + 0xa7, 0x71, 0x64, 0xc5, 0x5c, 0x43, 0xc0, 0x9e, 0xa9, 0x36, 0x98, 0x9d, + 0x47, 0x56, 0x6b, 0x90, 0x71, 0x8e, 0x4e, 0x98, 0xa9, 0x84, 0x3b, 0x82, + 0x2d, 0x86, 0xc5, 0xcf, 0xbf, 0xc0, 0x90, 0x69, 0xa2, 0xd4, 0x2c, 0x83, + 0x38, 0xa0, 0x73, 0xc9, 0xc9, 0xa9, 0x72, 0x2f, 0xb8, 0x92, 0x42, 0x32, + 0xb9, 0x89, 0x94, 0x3a, 0x40, 0xa2, 0x6e, 0xc8, 0x94, 0x6f, 0x6f, 0x45, + 0x91, 0x66, 0x69, 0x6f, 0x97, 0x48, 0x70, 0x99, 0x85, 0xb0, 0x3b, 0xbb, + 0xb0, 0x3c, 0xad, 0xa4, 0x62, 0xb1, 0xb4, 0x81, 0xbf, 0x47, 0xac, 0xd7, + 0xb8, 
0x52, 0xde, 0x55, 0x9e, 0xb3, 0xaf, 0x55, 0x95, 0x86, 0x5c, 0x81, + 0x3a, 0xb1, 0xb2, 0xc9, 0x42, 0xc1, 0xa2, 0xb8, 0x5b, 0xb6, 0x75, 0x95, + 0x64, 0x4e, 0xb2, 0x4d, 0x4f, 0x53, 0x3a, 0xce, 0x48, 0x32, 0x44, 0x69, + 0x85, 0x6f, 0x61, 0xaf, 0x46, 0x9a, 0x59, 0xd6, 0x89, 0x3e, 0x58, 0x40, + 0x99, 0x79, 0x84, 0x90, 0xc2, 0x96, 0x6d, 0x49, 0xc8, 0x9c, 0xcb, 0xae, + 0x34, 0x53, 0x59, 0xaf, 0x7a, 0xbb, 0xda, 0x5b, 0xa2, 0xa8, 0xcc, 0xa7, + 0xc1, 0x65, 0x77, 0xc8, 0x99, 0x54, 0x8e, 0xa2, 0x88, 0x71, 0xd8, 0xb7, + 0xb1, 0x54, 0xcc, 0x9e, 0x82, 0x9a, 0x94, 0xb6, 0xa1, 0x94, 0xd5, 0x96, + 0x36, 0x74, 0x71, 0x62, 0x97, 0x62, 0x9b, 0x83, 0x77, 0xb9, 0xd0, 0xc0, + 0xbd, 0xc2, 0xb5, 0x77, 0xcc, 0x85, 0x58, 0x93, 0xb4, 0xca, 0xb7, 0x66, + 0x6b, 0xba, 0x97, 0x7c, 0xc6, 0xbc, 0x98, 0xa0, 0x99, 0xad, 0x74, 0x9d, + 0x77, 0x9d, 0x43, 0xc2, 0x4e, 0x8c, 0x43, 0x8f, 0xba, 0x87, 0xb3, 0xd9, + 0xc9, 0xc2, 0x78, 0xa7, 0x7d, 0x60, 0xad, 0x9a, 0x74, 0x3c, 0x94, 0x80, + 0x97, 0x36, 0x52, 0x5c, 0x81, 0x4f, 0x7b, 0x89, 0xad, 0x3e, 0x28, 0x82, + 0x5e, 0x47, 0xd3, 0xce, 0x3d, 0x35, 0x75, 0x6d, 0x34, 0x76, 0x48, 0x3a, + 0x69, 0x44, 0x5c, 0x64, 0x71, 0x82, 0x44, 0xb3, 0x88, 0x3f, 0x9c, 0x5e, + 0x7b, 0xb7, 0x77, 0xa4, 0x36, 0x47, 0x66, 0x83, 0xd7, 0x64, 0x62, 0x4d, + 0xd7, 0xa3, 0x47, 0xa5, 0xa3, 0x6f, 0x61, 0x3e, 0xc2, 0x7d, 0x3d, 0x20, + 0x93, 0xa5, 0xca, 0x2a, 0x47, 0x70, 0xa7, 0x7d, 0x5b, 0x35, 0x9c, 0x5f, + 0x39, 0x2e, 0x3f, 0xc6, 0x52, 0x9e, 0xb9, 0x47, 0x5d, 0x63, 0x82, 0xb4, + 0xd8, 0xd2, 0x61, 0x77, 0x5e, 0xbc, 0x8d, 0x4a, 0x70, 0x85, 0xb2, 0x58, + 0xa0, 0xaf, 0x70, 0x56, 0xb0, 0x4c, 0x2d, 0xb7, 0x7f, 0xc5, 0xba, 0x53, + 0x89, 0x83, 0x9e, 0x7a, 0x77, 0xa1, 0xa2, 0xa0, 0xb6, 0x53, 0x63, 0x98, + 0x94, 0x64, 0x46, 0x9e, 0x39, 0x45, 0xcc, 0xb6, 0xab, 0x99, 0xb0, 0x47, + 0x8e, 0xcd, 0x98, 0xbb, 0x6b, 0x4e, 0xa2, 0x41, 0x98, 0xc4, 0x5e, 0xc5, + 0x53, 0xbc, 0x7f, 0x80, 0xd6, 0x4f, 0x71, 0x6b, 0xb6, 0xce, 0x55, 0x65, + 0x95, 0xc5, 0x96, 0x46, 0x71, 0x6e, 0x49, 0xc1, 0x9c, 0x96, 0x45, 0x5c, + 0xac, 0xd7, 0x9a, 0x5f, 0xc4, 0x66, 0xa1, 0x8f, 0x5e, 0x82, 0x70, 0xc9, + 0x60, 0xa0, 0xd3, 0xb1, 0x92, 0x47, 0x71, 0xc2, 0x8e, 0x7f, 0x99, 0x3b, + 0xbf, 0xba, 0x6b, 0xcd, 0x9b, 0x99, 0x5b, 0xd5, 0xc0, 0x79, 0xc9, 0x61, + 0x51, 0xbd, 0x4b, 0xd4, 0xc3, 0x5b, 0x34, 0x90, 0xdc, 0x92, 0x52, 0x89, + 0x9a, 0xa3, 0x61, 0xd3, 0x7a, 0x4e, 0x87, 0xbb, 0x53, 0x43, 0x71, 0xc7, + 0x57, 0xab, 0x8c, 0x8c, 0x3d, 0xc8, 0xb9, 0x94, 0x79, 0x63, 0x99, 0x82, + 0xc5, 0x79, 0x70, 0x6b, 0xab, 0xd7, 0x72, 0x28, 0xcc, 0x9f, 0x2d, 0xad, + 0x57, 0x4e, 0x88, 0x3a, 0x6c, 0x8d, 0x69, 0x90, 0xa0, 0x8d, 0x9e, 0x80, + 0x4c, 0x40, 0x51, 0xa1, 0xab, 0x5d, 0x4b, 0xb9, 0x9e, 0xd2, 0x5b, 0x42, + 0x70, 0x68, 0x5b, 0xd3, 0x41, 0x37, 0x7e, 0x46, 0x33, 0xce, 0x42, 0xb6, + 0x9f, 0xc5, 0x9b, 0x5a, 0x7d, 0x6b, 0x38, 0x38, 0x61, 0xb7, 0x7e, 0x6f, + 0x4e, 0x3d, 0x8a, 0xba, 0xa4, 0x8c, 0x91, 0x91, 0x74, 0xb4, 0x66, 0xa1, + 0xaa, 0xe4, 0x5a, 0xc3, 0xa4, 0x2a, 0xcc, 0xd7, 0xa1, 0x9b, 0xbb, 0x61, + 0x7a, 0x66, 0x84, 0xbd, 0x7c, 0xc1, 0xa0, 0xdb, 0x8a, 0x45, 0x98, 0x89, + 0xa2, 0x8a, 0x4c, 0x54, 0x44, 0x2c, 0x40, 0x46, 0x92, 0x46, 0xb9, 0x80, + 0xcc, 0x99, 0xd3, 0x66, 0x99, 0x65, 0x96, 0xc7, 0x69, 0xb1, 0x72, 0xc8, + 0x95, 0x84, 0xaf, 0xac, 0xc9, 0x58, 0xad, 0x48, 0x31, 0x46, 0xbc, 0x7a, + 0x99, 0x6c, 0x93, 0x56, 0x91, 0x86, 0xe8, 0x70, 0x9b, 0xae, 0xc3, 0x36, + 0x84, 0xab, 0x71, 0x39, 0xea, 0x71, 0x75, 0x64, 0x94, 0xa8, 0xbe, 0xc8, + 0xce, 0x53, 0xbd, 0x44, 0xa2, 0xce, 0x92, 0xa9, 0x6b, 0x70, 0x4d, 0x7b, + 0xbe, 0xb6, 0x52, 0xb9, 0x8e, 0x43, 0xb2, 0xa6, 0x9b, 0x65, 0x47, 0x99, + 0xd2, 
0xa6, 0x90, 0x9a, 0x32, 0xc1, 0xab, 0xb6, 0x81, 0xbb, 0xa0, 0x44, + 0x79, 0x86, 0x4c, 0x43, 0xc8, 0x8d, 0x48, 0xbe, 0xcc, 0x86, 0xcf, 0x43, + 0x9f, 0x4c, 0x64, 0xd3, 0x3b, 0x73, 0x8e, 0xa2, 0xb7, 0x9a, 0x66, 0x9d, + 0x8e, 0x4b, 0xb2, 0x66, 0x4d, 0x80, 0x71, 0x8d, 0x46, 0xa5, 0x4b, 0xd3, + 0x56, 0x52, 0xc8, 0x45, 0xc0, 0x98, 0x8a, 0x40, 0x78, 0x58, 0xb3, 0xb0, + 0xbc, 0x64, 0xa7, 0x9a, 0xc6, 0xc3, 0xa1, 0xa3, 0x56, 0xa0, 0x77, 0x8d, + 0x37, 0x6e, 0x9a, 0x47, 0x4a, 0xa5, 0x76, 0x36, 0x97, 0x57, 0xa2, 0xb6, + 0x6c, 0xc5, 0x7b, 0x7f, 0x6d, 0xa6, 0x69, 0xaf, 0x86, 0x24, 0x93, 0xa5, + 0x9b, 0x26, 0xb9, 0x9d, 0xa4, 0x4c, 0xb8, 0xc5, 0x55, 0x38, 0x90, 0x32, + 0x69, 0xb4, 0xbd, 0x8e, 0x71, 0x20, 0x70, 0xd2, 0xd6, 0xfe, 0xff, 0xff, + 0xc4, 0xff, 0xff, 0xff, 0x3a, 0, 0, 0, 0, 0, 0, 0, + 0xff, 0xff, 0xff, 0xff, 0x10, 0x1, 0, 0, 0x8c, 0xff, 0xff, 0xff, + 0x49, 0, 0, 0, 0x5d, 0x1, 0, 0, 0x2a, 0x1, 0, 0, + 0xd4, 0xff, 0xff, 0xff, 0x16, 0, 0, 0, 0x25, 0x1, 0, 0, + 0x84, 0xff, 0xff, 0xff, 0x3, 0, 0, 0, 0xe, 0x1, 0, 0, + 0x37, 0, 0, 0, 0xca, 0xff, 0xff, 0xff, 0x72, 0xff, 0xff, 0xff, + 0x5f, 0, 0, 0, 0x7, 0, 0, 0, 0x3a, 0, 0, 0, + 0x9d, 0xff, 0xff, 0xff, 0x36, 0, 0, 0, 0xba, 0, 0, 0, + 0x83, 0, 0, 0, 0xed, 0xff, 0xff, 0xff, 0x1d, 0, 0, 0, + 0x61, 0x1, 0, 0, 0x36, 0xff, 0xff, 0xff, 0x7d, 0xff, 0xff, 0xff, + 0xe, 0xff, 0xff, 0xff, 0x69, 0xff, 0xff, 0xff, 0xf6, 0, 0, 0, + 0x8a, 0xff, 0xff, 0xff, 0x4d, 0, 0, 0, 0x85, 0xff, 0xff, 0xff, + 0xb2, 0xff, 0xff, 0xff, 0x1b, 0, 0, 0, 0xed, 0, 0, 0, + 0x42, 0, 0, 0, 0x8, 0xff, 0xff, 0xff, 0xb2, 0xff, 0xff, 0xff, + 0xee, 0xff, 0xff, 0xff, 0xd1, 0xff, 0xff, 0xff, 0xd3, 0xff, 0xff, 0xff, + 0x52, 0, 0, 0, 0x65, 0, 0, 0, 0x65, 0xff, 0xff, 0xff, + 0xbc, 0, 0, 0, 0x2c, 0x1, 0, 0, 0x44, 0, 0, 0, + 0xed, 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xff, 0xf8, 0, 0, 0, + 0x12, 0x1, 0, 0, 0x3f, 0, 0, 0, 0x5e, 0x1, 0, 0, + 0x64, 0xff, 0xff, 0xff, 0x54, 0xfe, 0xff, 0xff, 0xd2, 0, 0, 0, + 0x3f, 0, 0, 0, 0xb7, 0xff, 0xff, 0xff, 0xcf, 0, 0, 0, + 0xe, 0x1, 0, 0, 0x32, 0xff, 0xff, 0xff, 0x8a, 0, 0, 0, + 0x39, 0, 0, 0, 0xd1, 0xff, 0xff, 0xff, 0x87, 0xff, 0xff, 0xff, + 0xe2, 0xff, 0xff, 0xff, 0x4c, 0xff, 0xff, 0xff, 0xa8, 0xff, 0xff, 0xff, + 0x23, 0, 0, 0, 0x3e, 0x1, 0, 0, 0xb7, 0xff, 0xff, 0xff, + 0x84, 0, 0, 0, 0xc1, 0xff, 0xff, 0xff, 0xb7, 0, 0, 0, + 0x13, 0x2, 0, 0, 0x22, 0, 0, 0, 0x73, 0x1, 0, 0, + 0xcb, 0, 0, 0, 0xd6, 0, 0, 0, 0xc6, 0, 0, 0, + 0xe3, 0xff, 0xff, 0xff, 0x98, 0x1, 0, 0, 0x21, 0x1, 0, 0, + 0x6c, 0xff, 0xff, 0xff, 0xe2, 0xfe, 0xff, 0xff, 0x2b, 0, 0, 0, + 0x60, 0, 0, 0, 0xcb, 0xff, 0xff, 0xff, 0x6c, 0, 0, 0, + 0xf8, 0xfe, 0xff, 0xff, 0xc9, 0, 0, 0, 0x3, 0, 0, 0, + 0x30, 0, 0, 0, 0xb8, 0xff, 0xff, 0xff, 0x1c, 0xff, 0xff, 0xff, + 0x59, 0, 0, 0, 0x58, 0, 0, 0, 0x8f, 0, 0, 0, + 0xc0, 0xff, 0xff, 0xff, 0x9a, 0xff, 0xff, 0xff, 0x14, 0xfe, 0xff, 0xff, + 0x88, 0xff, 0xff, 0xff, 0x38, 0, 0, 0, 0x36, 0xff, 0xff, 0xff, + 0x4a, 0, 0, 0, 0x61, 0, 0, 0, 0xf2, 0xff, 0xff, 0xff, + 0x68, 0xff, 0xff, 0xff, 0xf7, 0xff, 0xff, 0xff, 0x25, 0, 0, 0, + 0x6d, 0x1, 0, 0, 0x15, 0x1, 0, 0, 0x4f, 0x1, 0, 0, + 0xed, 0x1, 0, 0, 0xd1, 0xfe, 0xff, 0xff, 0xa7, 0xff, 0xff, 0xff, + 0xf2, 0, 0, 0, 0xba, 0, 0, 0, 0x92, 0, 0, 0, + 0xc8, 0, 0, 0, 0x4d, 0, 0, 0, 0x54, 0xff, 0xff, 0xff, + 0xa8, 0xff, 0xff, 0xff, 0xf3, 0xff, 0xff, 0xff, 0x38, 0, 0, 0, + 0xd1, 0, 0, 0, 0xd7, 0xfe, 0xff, 0xff, 0x21, 0x1, 0, 0, + 0xce, 0, 0, 0, 0xfa, 0xff, 0xff, 0xff, 0x3a, 0, 0, 0, + 0x42, 0, 0, 0, 0xce, 0xff, 0xff, 0xff, 0x23, 0, 0, 0, + 0x20, 0x1, 0, 0, 0x93, 0xff, 0xff, 0xff, 0x8a, 0xfe, 0xff, 0xff, + 0x9d, 0, 0, 0, 0x9e, 0, 0, 0, 0x18, 
0xff, 0xff, 0xff, + 0x83, 0xff, 0xff, 0xff, 0xf7, 0, 0, 0, 0xea, 0, 0, 0, + 0x55, 0, 0, 0, 0xf9, 0xff, 0xff, 0xff, 0x37, 0x1, 0, 0, + 0xfe, 0xff, 0xff, 0xff, 0x39, 0x1, 0, 0, 0x7, 0, 0, 0, + 0xc2, 0xff, 0xff, 0xff, 0x37, 0, 0, 0, 0x4, 0x1, 0, 0, + 0x72, 0x1, 0, 0, 0xfb, 0xff, 0xff, 0xff, 0xc9, 0x1, 0, 0, + 0x44, 0xff, 0xff, 0xff, 0x5b, 0, 0, 0, 0x24, 0, 0, 0, + 0x16, 0, 0, 0, 0x74, 0, 0, 0, 0x41, 0x1, 0, 0, + 0x6f, 0xff, 0xff, 0xff, 0xb6, 0xff, 0xff, 0xff, 0x19, 0x1, 0, 0, + 0x85, 0xff, 0xff, 0xff, 0xb6, 0xff, 0xff, 0xff, 0x2d, 0, 0, 0, + 0x6b, 0, 0, 0, 0xf3, 0x1, 0, 0, 0xde, 0, 0, 0, + 0x7a, 0xff, 0xff, 0xff, 0xc, 0xff, 0xff, 0xff, 0xad, 0xff, 0xff, 0xff, + 0x57, 0xff, 0xff, 0xff, 0xad, 0xff, 0xff, 0xff, 0x8a, 0xff, 0xff, 0xff, + 0xc2, 0xff, 0xff, 0xff, 0xb5, 0, 0, 0, 0xa1, 0xff, 0xff, 0xff, + 0x76, 0, 0, 0, 0x3d, 0xfe, 0xff, 0xff, 0x1, 0x1, 0, 0, + 0x30, 0, 0, 0, 0xe6, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, + 0x52, 0, 0, 0, 0x1a, 0, 0, 0, 0x91, 0xff, 0xff, 0xff, + 0x29, 0, 0, 0, 0x78, 0, 0, 0, 0x7e, 0xff, 0xff, 0xff, + 0x50, 0xff, 0xff, 0xff, 0xb6, 0xff, 0xff, 0xff, 0x7a, 0xff, 0xff, 0xff, + 0xcc, 0xff, 0xff, 0xff, 0xa4, 0xfe, 0xff, 0xff, 0x7d, 0, 0, 0, + 0x61, 0, 0, 0, 0x98, 0xff, 0xff, 0xff, 0xa3, 0xfe, 0xff, 0xff, + 0xc1, 0xff, 0xff, 0xff, 0x95, 0xff, 0xff, 0xff, 0x66, 0, 0, 0, + 0x6b, 0x1, 0, 0, 0xd, 0, 0, 0, 0x2d, 0, 0, 0, + 0x6b, 0xff, 0xff, 0xff, 0x57, 0x1, 0, 0, 0xcf, 0xff, 0xff, 0xff, + 0x35, 0xff, 0xff, 0xff, 0x9e, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xb0, 0, 0, 0, 0x8b, 0xff, 0xff, 0xff, 0x27, 0, 0, 0, + 0xd7, 0xff, 0xff, 0xff, 0x7d, 0xff, 0xff, 0xff, 0xac, 0xfe, 0xff, 0xff, + 0xd7, 0xff, 0xff, 0xff, 0x68, 0, 0, 0, 0x39, 0xff, 0xff, 0xff, + 0x1, 0, 0, 0, 0x59, 0, 0, 0, 0xa9, 0xfe, 0xff, 0xff, + 0x38, 0xff, 0xff, 0xff, 0xd8, 0, 0, 0, 0x6a, 0xff, 0xff, 0xff, + 0xc4, 0, 0, 0, 0x5, 0xff, 0xff, 0xff, 0x32, 0, 0, 0, + 0xfc, 0xfe, 0xff, 0xff, 0x8f, 0xff, 0xff, 0xff, 0xac, 0xfe, 0xff, 0xff, + 0xaa, 0x2, 0, 0, 0x2f, 0, 0, 0, 0xa3, 0, 0, 0, + 0x16, 0, 0, 0, 0x5, 0, 0, 0, 0xa9, 0, 0, 0, + 0xee, 0, 0, 0, 0x96, 0, 0, 0, 0x2d, 0x1, 0, 0, + 0xb1, 0xff, 0xff, 0xff, 0x77, 0xff, 0xff, 0xff, 0x48, 0x1, 0, 0, + 0x8, 0x1, 0, 0, 0x9a, 0, 0, 0, 0, 0xff, 0xff, 0xff, + 0xff, 0xfe, 0xff, 0xff, 0xe4, 0, 0, 0, 0x1a, 0x1, 0, 0, + 0xb0, 0, 0, 0, 0xdf, 0xff, 0xff, 0xff, 0x7, 0x1, 0, 0, + 0x5, 0, 0, 0, 0x13, 0, 0, 0, 0x43, 0xff, 0xff, 0xff, + 0x8, 0, 0, 0, 0x1, 0x1, 0, 0, 0x3e, 0, 0, 0, + 0x44, 0xff, 0xff, 0xff, 0x4c, 0x1, 0, 0, 0xde, 0, 0, 0, + 0x2, 0, 0, 0, 0xf2, 0xfe, 0xff, 0xff, 0xee, 0xfe, 0xff, 0xff, + 0xc9, 0, 0, 0, 0xee, 0xff, 0xff, 0xff, 0xbd, 0xff, 0xff, 0xff, + 0x92, 0, 0, 0, 0x87, 0xff, 0xff, 0xff, 0xd6, 0x1, 0, 0, + 0x28, 0xff, 0xff, 0xff, 0xd8, 0xff, 0xff, 0xff, 0x7b, 0, 0, 0, + 0xd0, 0xff, 0xff, 0xff, 0x16, 0xff, 0xff, 0xff, 0xae, 0, 0, 0, + 0x40, 0xff, 0xff, 0xff, 0xab, 0xff, 0xff, 0xff, 0x5d, 0, 0, 0, + 0x2f, 0x1, 0, 0, 0xa2, 0, 0, 0, 0xc6, 0, 0, 0, + 0x64, 0, 0, 0, 0xf, 0, 0, 0, 0x7, 0x1, 0, 0, + 0xe8, 0xfe, 0xff, 0xff, 0x62, 0xff, 0xff, 0xff, 0xe9, 0, 0, 0, + 0x31, 0, 0, 0, 0x55, 0, 0, 0, 0xae, 0xff, 0xff, 0xff, + 0x3b, 0, 0, 0, 0x63, 0, 0, 0, 0x73, 0xff, 0xff, 0xff, + 0x97, 0, 0, 0, 0xe9, 0, 0, 0, 0xd8, 0xff, 0xff, 0xff, + 0xc8, 0, 0, 0, 0xa9, 0xff, 0xff, 0xff, 0x91, 0xff, 0xff, 0xff, + 0xf8, 0xff, 0xff, 0xff, 0xd9, 0, 0, 0, 0x23, 0, 0, 0, + 0x9a, 0, 0, 0, 0x63, 0, 0, 0, 0x69, 0xff, 0xff, 0xff, + 0xbd, 0, 0, 0, 0xda, 0xff, 0xff, 0xff, 0xfa, 0xff, 0xff, 0xff, + 0x37, 0xff, 0xff, 0xff, 0x66, 0xff, 0xff, 0xff, 0x6b, 0x1, 0, 0, + 0xdf, 0, 0, 0, 0x45, 0, 0, 0, 0x5a, 0xff, 0xff, 
0xff, + 0xb0, 0, 0, 0, 0xe3, 0xff, 0xff, 0xff, 0x95, 0x1, 0, 0, + 0x48, 0xff, 0xff, 0xff, 0x4c, 0, 0, 0, 0x2c, 0x1, 0, 0, + 0x16, 0xff, 0xff, 0xff, 0xb3, 0, 0, 0, 0x1c, 0, 0, 0, + 0xd1, 0xff, 0xff, 0xff, 0x33, 0, 0, 0, 0x2, 0, 0, 0, + 0xb9, 0xfe, 0xff, 0xff, 0x4e, 0, 0, 0, 0x91, 0x1, 0, 0, + 0x48, 0, 0, 0, 0x67, 0, 0, 0, 0x96, 0xfe, 0xff, 0xff, + 0x7, 0x1, 0, 0, 0xf0, 0xff, 0xff, 0xff, 0xed, 0, 0, 0, + 0x2e, 0, 0, 0, 0x8a, 0, 0, 0, 0x33, 0x1, 0, 0, + 0x5f, 0xff, 0xff, 0xff, 0x22, 0x1, 0, 0, 0x7b, 0x1, 0, 0, + 0x24, 0x1, 0, 0, 0xc2, 0x1, 0, 0, 0xe0, 0xfe, 0xff, 0xff, + 0x9c, 0xff, 0xff, 0xff, 0xc5, 0xff, 0xff, 0xff, 0x25, 0xff, 0xff, 0xff, + 0x8b, 0xff, 0xff, 0xff, 0xf2, 0xff, 0xff, 0xff, 0x7d, 0x1, 0, 0, + 0xcf, 0xff, 0xff, 0xff, 0x6b, 0, 0, 0, 0x94, 0, 0, 0, + 0x7e, 0x1, 0, 0, 0x6e, 0xff, 0xff, 0xff, 0xe5, 0xff, 0xff, 0xff, + 0x8d, 0xff, 0xff, 0xff, 0xa9, 0xff, 0xff, 0xff, 0x56, 0xff, 0xff, 0xff, + 0x24, 0, 0, 0, 0x80, 0xff, 0xff, 0xff, 0x6a, 0xff, 0xff, 0xff, + 0x4, 0x1, 0, 0, 0x7e, 0, 0, 0, 0xd7, 0xff, 0xff, 0xff, + 0x44, 0xff, 0xff, 0xff, 0xd4, 0, 0, 0, 0xec, 0xff, 0xff, 0xff, + 0xe8, 0, 0, 0, 0x90, 0xff, 0xff, 0xff, 0x9, 0xff, 0xff, 0xff, + 0xd9, 0, 0, 0, 0xb6, 0, 0, 0, 0xab, 0xff, 0xff, 0xff, + 0xf2, 0xfe, 0xff, 0xff, 0x70, 0, 0, 0, 0x5, 0x1, 0, 0, + 0x54, 0xff, 0xff, 0xff, 0x42, 0x1, 0, 0, 0x19, 0, 0, 0, + 0x74, 0xff, 0xff, 0xff, 0x41, 0xff, 0xff, 0xff, 0xef, 0, 0, 0, + 0x37, 0, 0, 0, 0xe8, 0xfe, 0xff, 0xff, 0x39, 0x1, 0, 0, + 0xc1, 0xff, 0xff, 0xff, 0x31, 0, 0, 0, 0x28, 0, 0, 0, + 0x9d, 0, 0, 0, 0xfd, 0xfe, 0xff, 0xff, 0x78, 0xff, 0xff, 0xff, + 0x70, 0, 0, 0, 0xdd, 0, 0, 0, 0x37, 0x1, 0, 0, + 0xcf, 0xff, 0xff, 0xff, 0x76, 0xff, 0xff, 0xff, 0x5d, 0, 0, 0, + 0x4, 0, 0, 0, 0xdb, 0x1, 0, 0, 0x44, 0, 0, 0, + 0x75, 0xfe, 0xff, 0xff, 0xb6, 0, 0, 0, 0x7f, 0, 0, 0, + 0xf9, 0xff, 0xff, 0xff, 0xc, 0, 0, 0, 0xed, 0xff, 0xff, 0xff, + 0xa0, 0, 0, 0, 0x57, 0xff, 0xff, 0xff, 0xa0, 0xff, 0xff, 0xff, + 0xe8, 0xff, 0xff, 0xff, 0x29, 0, 0, 0, 0x14, 0, 0, 0, + 0xcf, 0xff, 0xff, 0xff, 0x2d, 0xff, 0xff, 0xff, 0xac, 0, 0, 0, + 0x61, 0x1, 0, 0, 0x6b, 0x2, 0, 0, 0xbb, 0xff, 0xff, 0xff, + 0xb, 0xff, 0xff, 0xff, 0xee, 0xff, 0xff, 0xff, 0x6c, 0, 0, 0, + 0x7e, 0, 0, 0, 0x1f, 0, 0, 0, 0x51, 0xff, 0xff, 0xff, + 0x41, 0, 0, 0, 0xda, 0, 0, 0, 0xdc, 0, 0, 0, + 0xb6, 0xff, 0xff, 0xff, 0x6b, 0, 0, 0, 0x8c, 0, 0, 0, + 0xd3, 0xff, 0xff, 0xff, 0xd, 0, 0, 0, 0xb1, 0, 0, 0, + 0x6b, 0x1, 0, 0, 0x50, 0x2, 0, 0, 0x22, 0, 0, 0, + 0xd, 0x1, 0, 0, 0x35, 0, 0, 0, 0x51, 0, 0, 0, + 0x78, 0xff, 0xff, 0xff, 0x2b, 0xff, 0xff, 0xff, 0x4f, 0, 0, 0, + 0x19, 0, 0, 0, 0x2b, 0, 0, 0, 0x8c, 0xff, 0xff, 0xff, + 0x33, 0, 0, 0, 0x79, 0, 0, 0, 0xda, 0xff, 0xff, 0xff, + 0x3d, 0xff, 0xff, 0xff, 0x9d, 0, 0, 0, 0x26, 0xff, 0xff, 0xff, + 0x54, 0, 0, 0, 0x72, 0, 0, 0, 0xe3, 0xff, 0xff, 0xff, + 0x2b, 0, 0, 0, 0x49, 0x1, 0, 0, 0xd0, 0xff, 0xff, 0xff, + 0x64, 0, 0, 0, 0x92, 0xff, 0xff, 0xff, 0x46, 0, 0, 0, + 0xb8, 0, 0, 0, 0xd9, 0xff, 0xff, 0xff, 0x76, 0, 0, 0, + 0x31, 0xff, 0xff, 0xff, 0xba, 0xff, 0xff, 0xff, 0xcc, 0, 0, 0, + 0x17, 0x2, 0, 0, 0x3d, 0, 0, 0, 0xf1, 0xff, 0xff, 0xff, + 0xa5, 0xfe, 0xff, 0xff, 0xb, 0xff, 0xff, 0xff, 0xa1, 0xff, 0xff, 0xff, + 0xe7, 0xff, 0xff, 0xff, 0xbc, 0xfe, 0xff, 0xff, 0xaa, 0xff, 0xff, 0xff, + 0xf2, 0xfe, 0xff, 0xff, 0xeb, 0, 0, 0, 0x58, 0xff, 0xff, 0xff, + 0x46, 0, 0, 0, 0x4, 0, 0, 0, 0xcf, 0, 0, 0, + 0xa7, 0, 0, 0, 0x78, 0xff, 0xff, 0xff, 0x75, 0, 0, 0, + 0x7a, 0xff, 0xff, 0xff, 0x1b, 0, 0, 0, 0xb8, 0xff, 0xff, 0xff, + 0xa9, 0xff, 0xff, 0xff, 0x48, 0x1, 0, 0, 0x8b, 0x1, 0, 0, + 0x8f, 0, 0, 0, 0x85, 0x64, 0x9b, 0xb3, 
0x91, 0xd8, 0x90, 0x5f, + 0x84, 0xb0, 0x78, 0x62, 0x7d, 0x6b, 0xb6, 0x59, 0x9f, 0x62, 0xc4, 0x54, + 0x96, 0x4f, 0x79, 0x90, 0xa5, 0x64, 0x57, 0x6a, 0x5e, 0x8d, 0xae, 0x7c, + 0x87, 0x8c, 0x75, 0x90, 0xa7, 0x8b, 0x79, 0x53, 0x8b, 0x8c, 0xab, 0x61, + 0x73, 0x82, 0x77, 0xb9, 0x7f, 0x59, 0x7f, 0xbe, 0x61, 0x8e, 0x94, 0x5a, + 0xa7, 0x99, 0xa3, 0xb4, 0x90, 0xcb, 0x73, 0x66, 0xb8, 0xad, 0x6c, 0x83, + 0x8c, 0x9c, 0xa0, 0x7a, 0x9d, 0xe2, 0x8c, 0x5e, 0x8e, 0xa9, 0xc1, 0xc9, + 0xa0, 0x62, 0x98, 0x64, 0x5d, 0x5e, 0x51, 0x98, 0xb4, 0xa8, 0x5c, 0x81, + 0xab, 0x64, 0x73, 0x72, 0x92, 0xa8, 0xb2, 0xbb, 0xa5, 0x8f, 0x5f, 0x85, + 0xc0, 0xae, 0x73, 0x8b, 0x65, 0x74, 0x88, 0x6f, 0xd9, 0x74, 0xaa, 0x73, + 0x9e, 0xad, 0x8a, 0x7d, 0x9d, 0x4e, 0x7f, 0x67, 0xaa, 0xdb, 0x8a, 0x87, + 0x44, 0x58, 0x51, 0x79, 0x97, 0x63, 0x9c, 0x61, 0x5d, 0xa8, 0x80, 0x7f, + 0xc0, 0x67, 0xc6, 0x7e, 0xa6, 0x74, 0x74, 0x9b, 0x4e, 0xa2, 0xcc, 0x92, + 0xc8, 0x7b, 0x71, 0xc5, 0x33, 0xb1, 0xa4, 0x80, 0xd4, 0xa0, 0x73, 0x6f, + 0xcb, 0xa0, 0x6b, 0x98, 0x58, 0x94, 0x53, 0x7e, 0x79, 0x82, 0xdc, 0x90, + 0x74, 0x6b, 0x2a, 0x7c, 0x97, 0x90, 0x61, 0xb2, 0x6d, 0x87, 0xc4, 0x80, + 0x9a, 0x62, 0x8a, 0x5b, 0xa9, 0xb5, 0x7a, 0x83, 0x7b, 0x5c, 0xa8, 0x6c, + 0x5f, 0xbe, 0x91, 0x54, 0x96, 0x83, 0xa4, 0x5b, 0xd1, 0x40, 0x78, 0x53, + 0xbe, 0x91, 0x89, 0x9c, 0xa1, 0xa2, 0x8b, 0x9f, 0xbf, 0xb7, 0x9c, 0xaf, + 0x85, 0x5e, 0x94, 0x88, 0x60, 0xe4, 0x98, 0x68, 0x87, 0x6f, 0xa6, 0xb4, + 0x97, 0x90, 0x9c, 0xbc, 0x81, 0x44, 0x58, 0xb0, 0xe9, 0xd3, 0xbb, 0x80, + 0xbe, 0x58, 0xad, 0x6b, 0x82, 0x96, 0x46, 0xae, 0x8c, 0x61, 0x9f, 0xb9, + 0xa8, 0xb5, 0xa2, 0xbb, 0x58, 0xad, 0x8d, 0x52, 0x5a, 0xa7, 0x84, 0x75, + 0x4b, 0x6c, 0xb2, 0x6e, 0x41, 0x90, 0x70, 0x4e, 0x6f, 0x99, 0x9d, 0xbe, + 0x86, 0x5e, 0x66, 0x49, 0x84, 0x67, 0x93, 0x55, 0x9b, 0x8c, 0x23, 0xae, + 0x83, 0x69, 0x58, 0xa3, 0x71, 0xb2, 0x8d, 0xaa, 0x86, 0xb3, 0x94, 0x67, + 0x56, 0xd8, 0xa2, 0x74, 0x8c, 0xad, 0x80, 0x73, 0x87, 0xaf, 0x7f, 0xae, + 0x80, 0x66, 0x82, 0x98, 0x4d, 0x60, 0x8f, 0x5e, 0xa6, 0x75, 0x64, 0x84, + 0x77, 0x9b, 0x6e, 0x56, 0x92, 0x54, 0xa5, 0x62, 0x92, 0x7e, 0x90, 0x69, + 0x41, 0x34, 0xa3, 0xb0, 0x87, 0x7e, 0x51, 0x66, 0xa6, 0xae, 0x70, 0x72, + 0x6b, 0x57, 0x71, 0x5d, 0x8c, 0x4d, 0x64, 0xb2, 0xa5, 0x66, 0x81, 0x7d, + 0x85, 0x96, 0x6a, 0xa9, 0x8c, 0xb9, 0x4c, 0x69, 0x93, 0x82, 0x7b, 0x61, + 0x9c, 0x62, 0x5b, 0xdd, 0xb6, 0x77, 0x79, 0x64, 0x6a, 0x8c, 0x8e, 0xa5, + 0x5e, 0x79, 0x77, 0x71, 0xb5, 0x81, 0x94, 0x92, 0x53, 0x6b, 0xbd, 0x8b, + 0x95, 0xb9, 0x92, 0x84, 0x9c, 0x3b, 0x58, 0x51, 0xa9, 0xbb, 0x86, 0x85, + 0x69, 0x57, 0x6b, 0x65, 0x86, 0xd5, 0xa7, 0x62, 0x7f, 0x9f, 0x9e, 0xde, + 0x59, 0x93, 0xa8, 0x45, 0x63, 0x5e, 0xad, 0x9e, 0xa4, 0x7e, 0x7a, 0x74, + 0xa6, 0xbc, 0x77, 0x53, 0x9c, 0x82, 0x70, 0xb7, 0xa0, 0xbd, 0x63, 0x7f, + 0x7e, 0x6b, 0x91, 0xc9, 0x5e, 0x7a, 0x48, 0xb0, 0xba, 0x6d, 0x72, 0x3f, + 0x3b, 0x8f, 0x5d, 0x8d, 0xb0, 0x88, 0x52, 0x4d, 0x90, 0x64, 0xc3, 0x3b, + 0xcd, 0x73, 0x8a, 0x95, 0xbf, 0x8a, 0x6f, 0xc2, 0x80, 0x8d, 0x6e, 0x9b, + 0x8b, 0x78, 0x66, 0xbe, 0x65, 0xab, 0xb3, 0x7e, 0xa7, 0x81, 0x7f, 0xab, + 0x8a, 0xb0, 0xd1, 0x5f, 0xae, 0xaf, 0x60, 0x6e, 0x56, 0x94, 0xb1, 0x90, + 0x80, 0x6a, 0x65, 0xa4, 0x5b, 0x87, 0x83, 0x8e, 0xaf, 0x6c, 0x66, 0x45, + 0x87, 0xa2, 0x8f, 0x69, 0xd3, 0x48, 0xb1, 0xa7, 0x4f, 0xd0, 0x58, 0xd1, + 0x69, 0x96, 0x8d, 0xb1, 0x52, 0x72, 0xa2, 0x98, 0x67, 0x69, 0x6e, 0x98, + 0x6f, 0xc7, 0x8a, 0x83, 0x97, 0x82, 0xc5, 0xa9, 0xb1, 0x84, 0xb3, 0x8e, + 0x86, 0x97, 0x92, 0x83, 0xcc, 0xb1, 0x9c, 0xbf, 0xab, 0xb0, 0x70, 0xad, + 0xbf, 0x52, 0xc0, 0x79, 0x4e, 0x9a, 0xa5, 0x93, 
0x8c, 0x93, 0x56, 0x67, + 0x96, 0xb0, 0x6f, 0x92, 0x78, 0x8f, 0x7b, 0x70, 0x45, 0x97, 0x64, 0x60, + 0x7f, 0x77, 0x4c, 0x3f, 0x42, 0xba, 0xbf, 0x43, 0x7b, 0x63, 0x5d, 0x77, + 0x8f, 0xc9, 0x97, 0x7c, 0xa1, 0x55, 0x7e, 0xcf, 0x69, 0x8b, 0xc5, 0x51, + 0xaf, 0x7e, 0xaa, 0xc9, 0xb2, 0x50, 0xab, 0x80, 0x73, 0xc0, 0xdf, 0x67, + 0x7c, 0xef, 0x80, 0xad, 0xb5, 0xad, 0xae, 0xbd, 0x55, 0xb1, 0x6b, 0xaf, + 0x56, 0xa5, 0xc1, 0xc3, 0x6f, 0x80, 0xc8, 0x54, 0x79, 0x70, 0x7b, 0x6c, + 0xca, 0x5c, 0xc9, 0xad, 0xb3, 0x7f, 0x85, 0x6c, 0xaa, 0xac, 0x6a, 0x98, + 0xaa, 0x7c, 0x9f, 0x76, 0x93, 0xc4, 0x7c, 0xb3, 0x5f, 0x8a, 0x49, 0x70, + 0x9d, 0xb4, 0xa5, 0xa9, 0xa7, 0x60, 0x57, 0xb6, 0x44, 0x75, 0x75, 0x69, + 0xa4, 0xc1, 0x87, 0x58, 0x64, 0xaf, 0x66, 0x5f, 0x38, 0x54, 0x7b, 0x62, + 0x87, 0x73, 0x88, 0xe5, 0x9e, 0x95, 0xb3, 0x68, 0x88, 0x74, 0xb3, 0x8e, + 0xa8, 0x96, 0x72, 0x82, 0x95, 0xab, 0x74, 0x6d, 0x87, 0x95, 0xaa, 0x89, + 0xb1, 0x5c, 0x6c, 0x77, 0x8f, 0x6a, 0x7c, 0x7c, 0xa1, 0x9a, 0xac, 0x47, + 0xaf, 0x73, 0x85, 0xc0, 0x7a, 0xc6, 0x5d, 0xa9, 0xab, 0x86, 0x57, 0x84, + 0x91, 0x64, 0x89, 0xb2, 0xa9, 0x91, 0x89, 0x7d, 0x70, 0xb0, 0x8f, 0x80, + 0x91, 0x77, 0x82, 0xb3, 0x85, 0x9e, 0x7a, 0xc6, 0x6c, 0x91, 0x7d, 0x6c, + 0xb4, 0xd6, 0x88, 0xa7, 0x9a, 0x3a, 0x77, 0x52, 0x92, 0x43, 0x94, 0x8b, + 0x84, 0xa0, 0x67, 0x80, 0x61, 0x61, 0x56, 0xac, 0x69, 0x5a, 0x79, 0xb5, + 0x69, 0x73, 0x99, 0x80, 0x92, 0x59, 0x61, 0x6b, 0x7f, 0x78, 0xa4, 0xa0, + 0x4b, 0x98, 0xaf, 0x85, 0x94, 0x7a, 0x6f, 0x7d, 0xb4, 0xa9, 0x6e, 0x4d, + 0xbe, 0x53, 0x9a, 0xcc, 0x47, 0xd7, 0x7c, 0xa2, 0x45, 0x53, 0x67, 0x73, + 0x60, 0x83, 0x61, 0x66, 0xa8, 0xaa, 0xd7, 0x5d, 0x76, 0xc6, 0x55, 0xc7, + 0x7e, 0x22, 0xbe, 0x7b, 0xaa, 0xaf, 0x27, 0x69, 0x91, 0xbb, 0xa7, 0x84, + 0x89, 0xa1, 0x89, 0x74, 0x7a, 0x73, 0x9d, 0x58, 0x8d, 0x89, 0x50, 0xb5, + 0x95, 0x4e, 0xac, 0x64, 0xd0, 0x72, 0x34, 0xa8, 0x5a, 0x74, 0x62, 0xab, + 0x35, 0x71, 0xbe, 0x5e, 0xa1, 0x99, 0xa5, 0x75, 0xb3, 0xab, 0x9a, 0x84, + 0xba, 0xb6, 0x60, 0xc4, 0xb0, 0xb3, 0x89, 0x83, 0x61, 0x94, 0x67, 0x99, + 0x64, 0x45, 0xb4, 0x78, 0xb3, 0x6b, 0x7c, 0x94, 0x54, 0x7d, 0xaa, 0x6f, + 0x96, 0xae, 0x78, 0xa4, 0xb6, 0xb5, 0x81, 0x7c, 0x54, 0x70, 0xa1, 0x5d, + 0x74, 0x9c, 0x98, 0x8a, 0x99, 0xb4, 0x5e, 0x75, 0x5c, 0x5b, 0xc2, 0x89, + 0x99, 0x70, 0x71, 0xb0, 0x7a, 0x8c, 0x6a, 0xaa, 0x83, 0xba, 0xb6, 0xa0, + 0xc7, 0x6a, 0x55, 0x89, 0x5d, 0x80, 0x70, 0x78, 0x5b, 0xce, 0x68, 0x84, + 0x2f, 0x51, 0xb1, 0xb2, 0xb0, 0x69, 0x86, 0xc5, 0x8a, 0xab, 0xaf, 0x8d, + 0x59, 0x75, 0x73, 0x4c, 0x6f, 0xcf, 0x7b, 0xaa, 0x63, 0xeb, 0x77, 0x7a, + 0x9e, 0xa5, 0xbf, 0x8e, 0x9f, 0x35, 0x7c, 0x3e, 0x67, 0x6d, 0x64, 0x9c, + 0xb9, 0x5b, 0xb4, 0x51, 0x49, 0xc4, 0x63, 0x6e, 0xaf, 0xaf, 0xbe, 0x61, + 0x69, 0x7e, 0x9e, 0x4e, 0x81, 0x87, 0x87, 0x6a, 0x5e, 0xc1, 0x9c, 0x78, + 0x9e, 0xc3, 0x83, 0x68, 0x86, 0x95, 0x74, 0x3b, 0x84, 0xa7, 0x7d, 0xa4, + 0x55, 0xd9, 0x87, 0x97, 0x52, 0xa1, 0x58, 0xaa, 0xa8, 0xbe, 0x5c, 0x9c, + 0xb5, 0x81, 0xc8, 0xbd, 0xaa, 0x54, 0xa7, 0x79, 0x63, 0xa7, 0x7a, 0xb9, + 0xd4, 0x99, 0xb8, 0xa5, 0xb3, 0xab, 0xa6, 0x9d, 0xa0, 0x7c, 0x22, 0x97, + 0xcc, 0x9d, 0x6c, 0x8d, 0x53, 0x97, 0xad, 0x93, 0x63, 0x7a, 0x67, 0xbb, + 0x66, 0x8a, 0x90, 0x6a, 0x96, 0x51, 0x66, 0x5a, 0xb0, 0x5b, 0x64, 0x70, + 0x91, 0x67, 0x8c, 0xae, 0x80, 0x78, 0x89, 0xbf, 0x75, 0x8e, 0xae, 0x6e, + 0x6e, 0x9d, 0x5c, 0x80, 0xa9, 0x63, 0xc7, 0x54, 0x3a, 0x41, 0x6d, 0x54, + 0xa3, 0x8d, 0x75, 0x89, 0x49, 0x58, 0x8f, 0x98, 0x97, 0x9c, 0xa9, 0xc7, + 0xc0, 0xa1, 0xab, 0xa6, 0x8f, 0x56, 0xc8, 0x76, 0x87, 0x98, 0x97, 0xb2, + 0x87, 0x60, 0xd2, 0xeb, 0x83, 0x8b, 0xb2, 0x8d, 
0x51, 0x40, 0x61, 0x59, + 0x6a, 0x68, 0x7a, 0xb8, 0xc5, 0xd7, 0x8c, 0x79, 0xb6, 0x69, 0x8e, 0x73, + 0xab, 0xa7, 0x84, 0x8c, 0xa2, 0x8a, 0x8d, 0x79, 0xa1, 0x7e, 0xd5, 0xae, + 0x92, 0x62, 0xa8, 0x6d, 0xb7, 0x7d, 0x84, 0x72, 0x70, 0x8f, 0xab, 0x66, + 0xbe, 0x94, 0x8d, 0x82, 0x91, 0x59, 0xba, 0x7e, 0x7a, 0x82, 0x78, 0xdd, + 0x3e, 0xc0, 0xa8, 0x89, 0x9b, 0x70, 0xac, 0x6d, 0xc5, 0x9e, 0xa0, 0x5f, + 0xb6, 0x9b, 0x8e, 0x67, 0xba, 0x5d, 0x6b, 0x4e, 0x88, 0x83, 0xac, 0xad, + 0x7c, 0x9a, 0x86, 0x5f, 0x7d, 0x5e, 0xb2, 0x7a, 0xa7, 0x57, 0x6d, 0x90, + 0x9b, 0xab, 0x5f, 0x9a, 0x5c, 0x94, 0x74, 0x6d, 0xc4, 0x6d, 0x79, 0xae, + 0x7f, 0x8a, 0x9c, 0x7e, 0x70, 0x5c, 0x7b, 0x93, 0xac, 0xcb, 0x4e, 0xa3, + 0x4d, 0x9c, 0x5d, 0xbb, 0x5f, 0x90, 0x74, 0x88, 0x65, 0x61, 0x63, 0x94, + 0xdc, 0xaf, 0x4e, 0x76, 0xbc, 0x47, 0xa6, 0xa2, 0xc3, 0x7d, 0x82, 0xc4, + 0x93, 0x8c, 0xd5, 0x86, 0x7e, 0xa0, 0xac, 0x89, 0x5c, 0x4b, 0xa3, 0xab, + 0x71, 0x8e, 0xca, 0x7c, 0x91, 0x68, 0x64, 0x5b, 0x9a, 0x30, 0x8c, 0x79, + 0xa6, 0xb0, 0xa5, 0x6b, 0xce, 0x9b, 0xab, 0x75, 0x8a, 0xa2, 0x7b, 0xcd, + 0x92, 0x90, 0x50, 0x6e, 0xac, 0x8d, 0x87, 0xa0, 0x81, 0xd7, 0x86, 0xb1, + 0x52, 0x86, 0x76, 0x66, 0xac, 0x72, 0xac, 0x91, 0x8a, 0x71, 0x6b, 0x79, + 0xb5, 0x6c, 0xa2, 0x82, 0x85, 0x4d, 0x74, 0xb5, 0x89, 0x67, 0x51, 0x9b, + 0x84, 0x6f, 0x79, 0x8e, 0x80, 0x4b, 0x46, 0x78, 0x87, 0xa0, 0xae, 0x9e, + 0x4c, 0xd7, 0x8e, 0x88, 0xcb, 0x78, 0x91, 0x63, 0x94, 0x7f, 0x93, 0x55, + 0x75, 0x80, 0x55, 0x70, 0x90, 0xb9, 0x64, 0xb9, 0xb8, 0x8f, 0x72, 0xbd, + 0x6f, 0xc8, 0xcb, 0x7d, 0xa4, 0x8f, 0x91, 0x35, 0xa0, 0xaa, 0x9a, 0x66, + 0x60, 0xa1, 0xb5, 0xbc, 0x33, 0x96, 0xa5, 0x80, 0xd6, 0xca, 0x9e, 0x78, + 0xa9, 0xac, 0x75, 0xea, 0xab, 0x50, 0x5d, 0x47, 0xbb, 0xb8, 0x75, 0x8b, + 0x70, 0x84, 0x4d, 0x8a, 0xab, 0x92, 0xc6, 0x17, 0x85, 0xc2, 0x93, 0x62, + 0x83, 0x8b, 0xcc, 0x84, 0xa8, 0xa2, 0xa5, 0xba, 0x8f, 0xdb, 0x59, 0x97, + 0x85, 0xc4, 0x75, 0x46, 0x3f, 0x75, 0x79, 0x95, 0xb4, 0x77, 0x90, 0x3b, + 0xa1, 0x68, 0x61, 0x2f, 0x93, 0x95, 0x7e, 0x7d, 0x77, 0x61, 0x66, 0x9e, + 0x80, 0xb1, 0x84, 0x39, 0xa7, 0x74, 0x4c, 0xca, 0x60, 0x8f, 0xbd, 0x8e, + 0x86, 0x95, 0x75, 0x82, 0x90, 0x66, 0x84, 0x40, 0x58, 0x98, 0x7b, 0x7a, + 0x93, 0xb5, 0x97, 0xa4, 0x9f, 0x6b, 0xb7, 0x4f, 0x82, 0xe2, 0x8e, 0x5f, + 0x66, 0x6e, 0x73, 0x47, 0x6f, 0x43, 0x76, 0xa9, 0xd8, 0x9d, 0x94, 0xa6, + 0x79, 0x6d, 0xbd, 0x79, 0xa4, 0xb2, 0xb1, 0xbd, 0x63, 0x92, 0x6d, 0x72, + 0x5e, 0x63, 0x92, 0x58, 0x7f, 0x3b, 0x84, 0x9a, 0x86, 0xb4, 0x4b, 0x80, + 0x88, 0x92, 0x61, 0xe0, 0xa4, 0x7b, 0x5a, 0x8d, 0x76, 0xb0, 0x90, 0x85, + 0xa3, 0x85, 0x8b, 0xd4, 0x8b, 0x4e, 0xc4, 0xa6, 0x7c, 0x62, 0xbe, 0x9a, + 0x6b, 0x45, 0x7a, 0x94, 0x50, 0x70, 0xa0, 0x81, 0x68, 0x6e, 0xd1, 0x76, + 0x7a, 0xab, 0xa8, 0xbc, 0x84, 0x51, 0x7e, 0x6d, 0x7a, 0x57, 0xa1, 0xbe, + 0x9b, 0x8b, 0x60, 0x9c, 0x5f, 0xa1, 0x83, 0xa7, 0x76, 0x7a, 0x3b, 0xa0, + 0x8c, 0xb3, 0x85, 0x82, 0xab, 0x9b, 0x8a, 0xc9, 0x7a, 0x88, 0x5c, 0xd1, + 0x54, 0x6c, 0x71, 0x61, 0x60, 0x33, 0x96, 0x96, 0x8d, 0x91, 0x75, 0x43, + 0x90, 0x57, 0xa9, 0x59, 0x67, 0xac, 0xb4, 0x74, 0xaa, 0x7a, 0x5f, 0x97, + 0x4b, 0x59, 0x56, 0xb2, 0x5a, 0x82, 0x6a, 0x54, 0x90, 0x88, 0x8c, 0x87, + 0x9e, 0xc8, 0x82, 0x8c, 0x4a, 0x6d, 0x72, 0x81, 0x8f, 0x45, 0x68, 0x8e, + 0x7d, 0x63, 0xcd, 0x9a, 0xd9, 0x63, 0xc9, 0x98, 0x87, 0x67, 0x52, 0xa8, + 0x5f, 0xad, 0xa5, 0x77, 0x9c, 0x97, 0xac, 0x81, 0x82, 0xf1, 0x97, 0x76, + 0xc2, 0x79, 0x5b, 0xb9, 0x75, 0x8a, 0xad, 0xc5, 0x44, 0xa3, 0x5f, 0xc4, + 0x87, 0xac, 0xbe, 0x57, 0xa7, 0x63, 0x7f, 0x72, 0x67, 0x77, 0xa2, 0x9c, + 0x78, 0x5f, 0x91, 0x72, 0x7f, 0x49, 0x58, 0xbd, 
0xb4, 0x9d, 0x6d, 0x9b, + 0x84, 0xa0, 0x83, 0x3e, 0x89, 0xa0, 0xc0, 0x9f, 0xa5, 0x74, 0xbc, 0x8c, + 0xa3, 0x8c, 0x67, 0xb0, 0x9a, 0x77, 0x2a, 0x94, 0x64, 0x8c, 0x53, 0x57, + 0xa1, 0x54, 0x75, 0xa0, 0x5c, 0x72, 0x89, 0x4e, 0xa0, 0xac, 0x7c, 0x6a, + 0x7e, 0xaf, 0x76, 0xa1, 0x75, 0x85, 0x7f, 0xcd, 0x97, 0x76, 0x73, 0x6d, + 0x85, 0x70, 0xa1, 0x86, 0xa3, 0x81, 0x5a, 0xbf, 0x68, 0x88, 0xb4, 0xae, + 0x68, 0x7d, 0xc1, 0x90, 0x5d, 0x66, 0x56, 0x48, 0x85, 0x8c, 0xcd, 0x87, + 0x6d, 0x77, 0x7d, 0xa7, 0xa1, 0x5c, 0xb5, 0x8c, 0x61, 0x6d, 0x49, 0x5b, + 0x84, 0xae, 0x59, 0x7c, 0x49, 0xd2, 0x82, 0x81, 0x62, 0x5c, 0x95, 0x66, + 0x90, 0xab, 0xcd, 0x5b, 0x94, 0x5b, 0x3d, 0x96, 0x8b, 0x70, 0x57, 0x66, + 0x95, 0xb3, 0x75, 0x6c, 0x8f, 0x3b, 0x5f, 0xa9, 0xb4, 0x65, 0xba, 0xa0, + 0xf6, 0x82, 0x9f, 0x72, 0x56, 0xd1, 0x94, 0x60, 0x72, 0xb2, 0x8b, 0xaa, + 0x9d, 0x5a, 0x44, 0x8b, 0x8b, 0x8d, 0x8e, 0x73, 0x88, 0x5f, 0xc9, 0x57, + 0x50, 0x9e, 0x66, 0x7f, 0x78, 0xab, 0x57, 0xae, 0xa4, 0xa3, 0xae, 0x6c, + 0x9a, 0x3a, 0x61, 0xa0, 0x9e, 0xb3, 0xac, 0x9e, 0xc1, 0xa6, 0x74, 0x6f, + 0x92, 0x5f, 0xb0, 0x6e, 0xd3, 0xd1, 0x6e, 0xb6, 0x5f, 0x88, 0x90, 0x6f, + 0xb4, 0x67, 0x8b, 0x59, 0x70, 0x48, 0xab, 0x9f, 0x87, 0x70, 0x88, 0x71, + 0x79, 0x75, 0x75, 0x83, 0x85, 0x65, 0x46, 0xb5, 0x61, 0xbc, 0x7c, 0x81, + 0x76, 0x62, 0x9b, 0xe8, 0xa5, 0xcf, 0x5e, 0x83, 0xab, 0x76, 0x5e, 0xda, + 0x5c, 0x56, 0xa6, 0x42, 0x8a, 0xf0, 0x5c, 0xb2, 0x7c, 0x98, 0x6f, 0x8e, + 0x91, 0x9d, 0x50, 0x66, 0x97, 0xb5, 0x7f, 0x81, 0x79, 0xa8, 0x72, 0x71, + 0x39, 0xa2, 0x4e, 0x7c, 0xcd, 0x5d, 0x8d, 0x47, 0x55, 0x69, 0x7e, 0x95, + 0x9c, 0x4b, 0xab, 0x7d, 0x70, 0x52, 0xaa, 0x87, 0x52, 0x94, 0x48, 0x7b, + 0xd4, 0x76, 0x84, 0x71, 0xbf, 0xb1, 0x93, 0xc0, 0x85, 0x70, 0xa7, 0xa2, + 0x60, 0x9f, 0x94, 0x4f, 0x5e, 0x46, 0x6d, 0xa7, 0x5e, 0x76, 0xc0, 0x85, + 0x5c, 0xaa, 0x6b, 0x93, 0x6e, 0x82, 0xa3, 0xad, 0xb1, 0xb7, 0x83, 0xb6, + 0x9d, 0x99, 0x47, 0x3d, 0xa2, 0x82, 0x70, 0x7d, 0xb8, 0x85, 0x9d, 0x98, + 0xdd, 0x8f, 0x6a, 0xbd, 0x77, 0x97, 0x62, 0x71, 0x75, 0x64, 0x56, 0x45, + 0x58, 0x8d, 0x5e, 0xb3, 0xbb, 0x93, 0x8d, 0x87, 0x6d, 0xae, 0x66, 0x63, + 0x9d, 0x2f, 0x9c, 0x90, 0x71, 0x92, 0x50, 0x7f, 0x7f, 0xb4, 0xa7, 0x88, + 0xaa, 0x6d, 0x4d, 0x45, 0xb1, 0xab, 0x52, 0xad, 0x5e, 0x5a, 0x5b, 0x6d, + 0x7f, 0x86, 0xaa, 0xb2, 0x7c, 0x8e, 0x70, 0x4e, 0xbe, 0x9a, 0x59, 0x8c, + 0xab, 0x90, 0x54, 0xd2, 0xc5, 0x56, 0x82, 0x54, 0x6a, 0x64, 0x4c, 0x5d, + 0x8f, 0x57, 0x93, 0x5e, 0x87, 0x4d, 0x6f, 0x86, 0xab, 0x85, 0xb2, 0xd0, + 0xa3, 0xb0, 0x71, 0x9a, 0xaa, 0xae, 0xb3, 0x6e, 0x4a, 0x74, 0x3c, 0x88, + 0x70, 0x48, 0x9d, 0x67, 0xcb, 0x86, 0x76, 0xb8, 0xc8, 0x8e, 0x68, 0xb5, + 0x9b, 0xa2, 0xaa, 0x56, 0x80, 0x2d, 0xa3, 0xa3, 0xae, 0x53, 0xa8, 0xad, + 0x70, 0xcd, 0x8d, 0x7a, 0x76, 0x83, 0x74, 0x5b, 0x6b, 0xa4, 0x7f, 0x6d, + 0x83, 0x6a, 0x68, 0x9e, 0x91, 0xbc, 0x4b, 0x58, 0x7d, 0x65, 0x9e, 0xb6, + 0xae, 0x58, 0x8b, 0xaa, 0x73, 0x6d, 0x59, 0x70, 0x9c, 0x8c, 0x93, 0x8b, + 0x76, 0xa2, 0xb9, 0x87, 0x82, 0x8e, 0x52, 0x28, 0x9a, 0xbf, 0x60, 0x8a, + 0x49, 0xb7, 0x5f, 0x76, 0x7a, 0x6e, 0xaa, 0x9a, 0x70, 0x75, 0x9e, 0x80, + 0x96, 0x66, 0x8c, 0x68, 0x59, 0x72, 0x78, 0xa2, 0x6d, 0x8b, 0x5c, 0x72, + 0x72, 0x46, 0xd1, 0x8a, 0x79, 0xaa, 0x8f, 0xa6, 0x81, 0xae, 0xb5, 0x9f, + 0x87, 0xa5, 0x87, 0xab, 0xa5, 0x94, 0x78, 0x9f, 0x89, 0xa2, 0xad, 0x87, + 0x73, 0x88, 0x6a, 0xe2, 0xce, 0x97, 0x86, 0x55, 0xcc, 0x85, 0x5a, 0x51, + 0xc6, 0x5f, 0xbb, 0x97, 0x40, 0x65, 0x41, 0xad, 0xaa, 0x79, 0x8c, 0x88, + 0x67, 0x75, 0x5a, 0x3b, 0xac, 0x7c, 0x39, 0x94, 0x96, 0xc2, 0x62, 0xb5, + 0xb4, 0xa9, 0x4f, 0x57, 0x92, 0x79, 0x86, 0xbe, 
0x99, 0x7b, 0x50, 0xb8, + 0x73, 0xa9, 0x67, 0x9e, 0x88, 0xac, 0x64, 0xab, 0x2d, 0xb4, 0xb6, 0xbc, + 0x62, 0x59, 0xad, 0x41, 0xc2, 0x95, 0x93, 0xb2, 0x9b, 0x93, 0xb1, 0x52, + 0x9b, 0x87, 0xc3, 0x6b, 0xe1, 0x5d, 0x98, 0xa1, 0x90, 0xa7, 0x7e, 0xf2, + 0xa2, 0x9c, 0xa9, 0x6f, 0x4d, 0x9a, 0xc4, 0xa2, 0x80, 0x6e, 0x89, 0x68, + 0x57, 0x50, 0x96, 0x34, 0x90, 0x86, 0x93, 0xc9, 0xad, 0xab, 0xcc, 0x69, + 0x93, 0xc8, 0xa6, 0xa4, 0xb2, 0x3f, 0x98, 0x9e, 0x93, 0x97, 0xab, 0x82, + 0x81, 0xaa, 0x55, 0xae, 0x45, 0x5c, 0x44, 0xc5, 0x76, 0x3f, 0xb5, 0x61, + 0x58, 0x5b, 0x9a, 0x64, 0xbf, 0x7c, 0x4c, 0x75, 0x84, 0x9d, 0x5a, 0xcc, + 0x8d, 0xac, 0xb1, 0x9b, 0x97, 0xa9, 0xae, 0x98, 0x75, 0x6e, 0xa0, 0x7a, + 0x6b, 0x7b, 0xb2, 0x6b, 0x71, 0x2c, 0x80, 0x4b, 0xa1, 0xd4, 0xa5, 0xb8, + 0x6c, 0x87, 0x67, 0x5f, 0x62, 0x68, 0x87, 0x81, 0x8c, 0x9b, 0x50, 0x7a, + 0x8f, 0x93, 0x64, 0x79, 0x67, 0x85, 0x5e, 0x3f, 0x5f, 0x7e, 0x92, 0xa0, + 0xa7, 0x41, 0x71, 0x6c, 0x9f, 0x5a, 0xbe, 0x6a, 0x4c, 0x5a, 0x9a, 0xa7, + 0xa0, 0x4b, 0x9d, 0xce, 0x72, 0xd1, 0x9b, 0xc1, 0xef, 0x9b, 0x72, 0x8a, + 0x7b, 0xac, 0x5f, 0x6c, 0x76, 0x93, 0x7d, 0x8f, 0xae, 0xa5, 0xbe, 0xa4, + 0xb3, 0xba, 0x63, 0x83, 0xbf, 0x39, 0x48, 0x24, 0xa2, 0x9b, 0x91, 0x55, + 0x93, 0x48, 0xae, 0xe7, 0x81, 0xb4, 0x9a, 0xb1, 0x73, 0xb4, 0xaf, 0xd5, + 0xec, 0x65, 0x52, 0x43, 0xaa, 0x7c, 0x8e, 0xad, 0xb0, 0x9e, 0x98, 0x8f, + 0xa8, 0x63, 0x4c, 0x6f, 0x51, 0x56, 0x91, 0x9c, 0x7c, 0x89, 0xae, 0xb0, + 0xa0, 0x57, 0x89, 0x8d, 0x66, 0x85, 0xa0, 0x91, 0x5c, 0x67, 0xcd, 0x78, + 0xc2, 0xad, 0x9d, 0x85, 0x58, 0x9d, 0xaf, 0x95, 0x60, 0x44, 0xb2, 0x61, + 0x49, 0x4f, 0x39, 0x9e, 0xc5, 0x72, 0x68, 0x89, 0x9b, 0xe3, 0x85, 0x55, + 0xa6, 0xaa, 0x72, 0xbe, 0xa9, 0x6f, 0x61, 0xab, 0x89, 0x7b, 0xe4, 0xcc, + 0x6b, 0x5e, 0xa6, 0xa6, 0x6d, 0x8f, 0x7f, 0xa8, 0x4e, 0x97, 0x7f, 0x61, + 0xa2, 0x64, 0xc5, 0xa1, 0x4c, 0xa2, 0xa6, 0x96, 0x7f, 0x73, 0xb7, 0x7c, + 0x4f, 0x6b, 0x8d, 0x77, 0x8d, 0xbf, 0xa2, 0x69, 0xb3, 0xac, 0xbc, 0xc6, + 0x4a, 0x4d, 0x56, 0x78, 0x9d, 0x90, 0x7a, 0xa6, 0x94, 0x83, 0x78, 0xaa, + 0x71, 0x6e, 0xa4, 0x79, 0x65, 0x76, 0xab, 0xbb, 0xa3, 0x3e, 0x9d, 0xa1, + 0xac, 0x8c, 0x8a, 0x5c, 0x7d, 0xa7, 0x6c, 0x96, 0x80, 0xf7, 0x75, 0xd3, + 0x62, 0xbf, 0xb2, 0x77, 0x48, 0x4a, 0x6f, 0x70, 0x9d, 0xaf, 0x77, 0xb0, + 0x7b, 0xcf, 0x76, 0xa3, 0x64, 0x7e, 0xe6, 0xb0, 0x5d, 0x73, 0x7b, 0x88, + 0x3b, 0x8f, 0x8a, 0x90, 0x8b, 0x96, 0x92, 0x74, 0x53, 0x98, 0x9d, 0xa3, + 0x37, 0x8f, 0x7a, 0x9a, 0xa6, 0xd1, 0xae, 0x6c, 0x78, 0x8d, 0x58, 0x3d, + 0xa6, 0x91, 0x74, 0x90, 0x9c, 0x51, 0xa4, 0x8a, 0x7b, 0x5e, 0x5a, 0xb4, + 0x7d, 0x58, 0x73, 0x94, 0x9e, 0x95, 0xca, 0x52, 0x7b, 0x7d, 0xb0, 0x8a, + 0x55, 0x92, 0x64, 0x69, 0x90, 0x8b, 0xd4, 0x90, 0x79, 0x96, 0xae, 0xa7, + 0x44, 0x5a, 0x59, 0x9f, 0x92, 0x61, 0xac, 0xa1, 0x8a, 0xa2, 0x98, 0xb7, + 0x92, 0x63, 0xc4, 0x93, 0x58, 0xb1, 0x78, 0x70, 0xcd, 0x5a, 0x6f, 0x40, + 0xa1, 0x6d, 0x6a, 0x54, 0xb9, 0x53, 0x9c, 0x72, 0x99, 0x84, 0x77, 0x53, + 0xa5, 0x53, 0xd6, 0x45, 0x4b, 0xb8, 0x94, 0x9e, 0x56, 0xc0, 0x94, 0x8a, + 0xb2, 0x75, 0x99, 0x95, 0x9a, 0x50, 0x92, 0x84, 0xa8, 0xc8, 0x8d, 0x59, + 0xc2, 0x5f, 0xa3, 0x46, 0x3c, 0xb5, 0x51, 0x95, 0x9f, 0x27, 0x5d, 0x81, + 0x9c, 0x8b, 0x90, 0xd0, 0x8e, 0x6f, 0x43, 0x9c, 0x71, 0xdf, 0x9b, 0x69, + 0x82, 0xad, 0x9f, 0xa6, 0xaf, 0xbb, 0x51, 0x7f, 0xf0, 0xa5, 0x85, 0x7e, + 0, 0x7c, 0x8f, 0x70, 0x60, 0x4e, 0x62, 0x5d, 0xd3, 0xbc, 0xc6, 0x75, + 0x5d, 0xa0, 0x5b, 0x98, 0xc4, 0x87, 0x9f, 0x68, 0x84, 0x68, 0x58, 0x9e, + 0x6b, 0x9f, 0x6c, 0x58, 0x80, 0x61, 0x6a, 0xba, 0x94, 0x97, 0xd2, 0x77, + 0x6d, 0x7b, 0x6a, 0xa2, 0x78, 0x5e, 0x94, 0x47, 0xc0, 
0x94, 0x9c, 0x6d, + 0x6f, 0x92, 0xbb, 0xa5, 0x8d, 0x60, 0x74, 0xb2, 0x60, 0xab, 0xa0, 0x58, + 0x65, 0x84, 0xac, 0x9c, 0x8a, 0xb9, 0xc1, 0xbc, 0x8f, 0x85, 0x4c, 0x7c, + 0x5b, 0x9f, 0x9f, 0x92, 0xa7, 0x7b, 0x84, 0xad, 0x8c, 0x53, 0xa3, 0x6f, + 0x77, 0xb7, 0x4e, 0x82, 0xaa, 0x70, 0xab, 0x72, 0xb1, 0xc3, 0xc8, 0x49, + 0x62, 0x58, 0xd0, 0x75, 0x75, 0x51, 0x96, 0xdd, 0x80, 0xb7, 0x95, 0x96, + 0xbc, 0x78, 0x3f, 0xde, 0xa6, 0x97, 0x8e, 0x42, 0xa5, 0x78, 0xe9, 0x92, + 0x7f, 0x8b, 0xae, 0xc7, 0xb6, 0x8e, 0xa2, 0xb7, 0x81, 0xb3, 0xb5, 0x73, + 0x59, 0x9b, 0x5d, 0xb9, 0x65, 0x7c, 0x8a, 0xa6, 0xb1, 0x77, 0x5c, 0x82, + 0x77, 0x5b, 0x73, 0xad, 0xa5, 0x7c, 0x9f, 0x5d, 0x4f, 0x9a, 0x6e, 0x51, + 0x45, 0xd8, 0xb2, 0xb8, 0x5a, 0xaf, 0xb7, 0xb8, 0xa6, 0x72, 0x72, 0x9b, + 0x32, 0x3b, 0x52, 0xa7, 0x9b, 0x8b, 0x57, 0x9b, 0x6e, 0x3c, 0x80, 0x5f, + 0xbc, 0xa9, 0x93, 0xa6, 0xae, 0x67, 0x90, 0x91, 0x5f, 0x99, 0x9e, 0x96, + 0xe2, 0x54, 0x51, 0x6b, 0x76, 0x9b, 0xa9, 0x90, 0xe2, 0x80, 0x72, 0x6b, + 0x5b, 0xa3, 0x8f, 0x7c, 0x5f, 0x94, 0xbb, 0x39, 0x9e, 0x58, 0x79, 0x99, + 0x76, 0x89, 0x57, 0x57, 0xa9, 0xcb, 0x74, 0x7d, 0xa0, 0x92, 0x97, 0x63, + 0x9c, 0xab, 0x6d, 0x78, 0x73, 0xdc, 0x42, 0xab, 0x90, 0x95, 0xa5, 0x78, + 0x71, 0x8a, 0x56, 0x54, 0xa4, 0x66, 0x30, 0xa4, 0x6e, 0x8e, 0x9a, 0xad, + 0xbd, 0x88, 0xaf, 0x87, 0x7a, 0xa5, 0xa9, 0x59, 0x37, 0x8e, 0x41, 0x43, + 0x4f, 0x9e, 0x96, 0x56, 0x74, 0xa4, 0x92, 0x57, 0x69, 0x90, 0xbc, 0x98, + 0xb2, 0x59, 0x3a, 0x95, 0x90, 0x6c, 0x90, 0x97, 0x6d, 0x7b, 0x6b, 0x79, + 0x87, 0x98, 0x6c, 0x96, 0x5b, 0x95, 0x5b, 0x55, 0x87, 0x8c, 0x75, 0x96, + 0x8d, 0x8f, 0x62, 0xda, 0x96, 0x7c, 0xa3, 0x7d, 0x89, 0x73, 0x69, 0x48, + 0x98, 0xac, 0xa9, 0x87, 0x65, 0x94, 0xa2, 0xbd, 0x6d, 0x4f, 0x5b, 0x70, + 0xa6, 0x61, 0xa2, 0x68, 0x60, 0xb4, 0xab, 0x90, 0x53, 0xd5, 0x77, 0xa3, + 0xb7, 0x83, 0xa7, 0x64, 0xa1, 0x7e, 0x63, 0x88, 0x84, 0x5e, 0x6f, 0x97, + 0xad, 0xa6, 0xd2, 0x7b, 0x8e, 0x98, 0x81, 0x64, 0x17, 0x7d, 0x9b, 0xaa, + 0x67, 0xb7, 0xb0, 0xab, 0x6d, 0xb0, 0x3b, 0x98, 0xc2, 0x5b, 0x9c, 0x4c, + 0x53, 0xa5, 0x89, 0x7d, 0x94, 0x72, 0x83, 0x6d, 0x59, 0xaf, 0x83, 0xa6, + 0x74, 0xb0, 0x83, 0x5e, 0x6c, 0x99, 0x99, 0x8e, 0x64, 0x72, 0x9a, 0xa9, + 0xa6, 0x75, 0x72, 0x86, 0xae, 0x4a, 0x70, 0x82, 0x99, 0xa7, 0x76, 0x90, + 0xae, 0xcd, 0x6e, 0x93, 0x8b, 0xa8, 0x68, 0xb6, 0x74, 0x64, 0xa8, 0xc4, + 0x5b, 0x86, 0xa2, 0x5e, 0x85, 0x9f, 0x39, 0x85, 0xb4, 0x8b, 0x8f, 0x57, + 0x39, 0x91, 0x99, 0x6d, 0x63, 0x50, 0x8f, 0x37, 0x8a, 0x94, 0x96, 0xc9, + 0x93, 0x54, 0xd0, 0x49, 0x64, 0xaa, 0x98, 0x4b, 0x8e, 0x95, 0x59, 0x58, + 0x89, 0x83, 0x66, 0x6a, 0xa6, 0x9d, 0xd0, 0x9e, 0x79, 0x97, 0x93, 0x64, + 0x72, 0xd1, 0x9d, 0x52, 0x7b, 0x84, 0x91, 0xaa, 0x62, 0xa8, 0x7c, 0x80, + 0xa4, 0xa8, 0x6b, 0x94, 0xac, 0x50, 0xb6, 0x55, 0x7d, 0x97, 0xaa, 0xa4, + 0x5e, 0xae, 0x7a, 0xa0, 0x5e, 0x73, 0x6d, 0xad, 0xa0, 0x77, 0x82, 0x80, + 0x4a, 0xda, 0x82, 0x59, 0xae, 0xf4, 0xa2, 0xb0, 0x9f, 0x84, 0xb2, 0x6c, + 0x41, 0xd8, 0x6c, 0x5a, 0x6f, 0x80, 0x9a, 0x75, 0x88, 0x78, 0x75, 0x7f, + 0x61, 0x6a, 0xb2, 0x64, 0x49, 0x4a, 0x8e, 0xb1, 0x60, 0x95, 0x73, 0x87, + 0x5d, 0x6c, 0x5f, 0x8f, 0xad, 0x94, 0x64, 0x6b, 0xc8, 0xa9, 0xb5, 0x54, + 0xa0, 0xdb, 0x88, 0x95, 0x92, 0x9b, 0x63, 0x8d, 0x6e, 0xa7, 0x78, 0x88, + 0x88, 0xb2, 0x9a, 0xd0, 0x95, 0x78, 0x92, 0xbb, 0x6d, 0x6c, 0x9f, 0xc6, + 0x4c, 0x93, 0x7f, 0x5d, 0xb6, 0x9f, 0x74, 0xe3, 0x59, 0xb1, 0xd7, 0xc1, + 0xa5, 0x87, 0x86, 0x40, 0x7b, 0xaa, 0xc4, 0x9f, 0xc2, 0x6a, 0x7a, 0x6f, + 0x37, 0xbd, 0x6f, 0xb2, 0x60, 0x9d, 0x7c, 0x98, 0xa9, 0xb6, 0x9f, 0x6c, + 0xaf, 0x6f, 0x66, 0x79, 0x71, 0x1f, 0x7a, 0x79, 0x7a, 
0x42, 0x6f, 0x8c, + 0xaf, 0x64, 0xc3, 0x8a, 0xb3, 0x91, 0x97, 0x5c, 0x94, 0x5a, 0xb1, 0x45, + 0x6f, 0xb6, 0x9c, 0x7c, 0x67, 0xe1, 0xb0, 0xb7, 0xaf, 0x7a, 0xa9, 0x88, + 0x7b, 0x51, 0x46, 0x63, 0xff, 0x7e, 0x6e, 0x7c, 0x89, 0xd4, 0xc0, 0xac, + 0x70, 0x68, 0x63, 0x76, 0x59, 0x7a, 0xa2, 0xac, 0xae, 0xc4, 0xab, 0x94, + 0x52, 0x66, 0x94, 0x7b, 0xa7, 0x8c, 0x9d, 0xa6, 0x5e, 0xe6, 0xb9, 0x6d, + 0x87, 0x71, 0x9d, 0xc6, 0x50, 0x82, 0x89, 0x61, 0x97, 0xc9, 0xaa, 0x62, + 0x9b, 0x75, 0x7c, 0x54, 0x8a, 0xa6, 0x96, 0xae, 0xa5, 0x89, 0x6b, 0xb7, + 0x72, 0x42, 0x8f, 0xaa, 0x76, 0x74, 0x73, 0x77, 0xb7, 0xa4, 0x9e, 0x5c, + 0xbd, 0x5e, 0xab, 0x3c, 0xc9, 0x70, 0xc2, 0x98, 0x59, 0x7c, 0x9e, 0x92, + 0x7c, 0x7f, 0xa2, 0xb9, 0x5d, 0x69, 0xab, 0x3c, 0x67, 0xc8, 0x8f, 0xbc, + 0x94, 0x83, 0xb7, 0x3f, 0x94, 0xea, 0x31, 0xa4, 0x3c, 0xb5, 0x62, 0x51, + 0x8c, 0x6e, 0x90, 0x94, 0x5a, 0x80, 0x6f, 0xc2, 0x62, 0x63, 0x66, 0x86, + 0x86, 0x5f, 0x9f, 0x94, 0x6b, 0x9d, 0xdd, 0xac, 0x61, 0x15, 0xb8, 0x75, + 0xba, 0x5a, 0xca, 0x81, 0x65, 0x57, 0x78, 0x3e, 0x67, 0x91, 0x8d, 0x57, + 0xbc, 0xa9, 0x7b, 0x79, 0x56, 0xa7, 0x8a, 0x53, 0xcc, 0x7f, 0x74, 0x52, + 0x9a, 0x98, 0x5a, 0xb9, 0xa2, 0x67, 0xaf, 0x90, 0x68, 0xa0, 0x9b, 0xbf, + 0xb7, 0x7b, 0xa8, 0x83, 0xad, 0xb3, 0xa1, 0x8c, 0x7a, 0x9a, 0x42, 0xb6, + 0xbb, 0x5b, 0x93, 0x96, 0x85, 0x58, 0x76, 0x78, 0x6d, 0xa3, 0x50, 0xaa, + 0x7d, 0xaf, 0x56, 0x5e, 0xd1, 0xa3, 0x8a, 0x60, 0x5e, 0xb0, 0x58, 0x59, + 0xa8, 0xa1, 0xa1, 0xc2, 0xb5, 0xa1, 0x97, 0x89, 0xab, 0x7f, 0x85, 0x6f, + 0x6e, 0x7a, 0x8c, 0x5c, 0xbc, 0xd9, 0x85, 0x47, 0xb1, 0x86, 0x79, 0xb3, + 0xc9, 0x78, 0x5f, 0x56, 0x70, 0x81, 0xa2, 0xc0, 0x5e, 0x7a, 0x82, 0x9b, + 0x76, 0xc5, 0x67, 0x89, 0x75, 0x9a, 0x69, 0xa7, 0x90, 0xa0, 0x69, 0x95, + 0x56, 0x9e, 0x86, 0x98, 0xb6, 0x89, 0xc7, 0x99, 0x89, 0x57, 0x78, 0xb4, + 0xe4, 0x7c, 0x88, 0x5e, 0xda, 0xae, 0xbe, 0xa9, 0x60, 0xaa, 0xa0, 0xc6, + 0xc5, 0xaf, 0xc0, 0x66, 0x52, 0x71, 0x71, 0xb5, 0x9f, 0x69, 0xad, 0xbf, + 0xba, 0x80, 0x96, 0x85, 0xae, 0x87, 0x91, 0x71, 0x6d, 0x74, 0x57, 0x78, + 0xe2, 0x80, 0x5e, 0x83, 0x73, 0x7b, 0xc1, 0x82, 0xae, 0x58, 0xae, 0x84, + 0x58, 0x69, 0x86, 0x8d, 0x74, 0xb3, 0x80, 0xc1, 0xa3, 0x2d, 0xba, 0x52, + 0x76, 0x4b, 0x8c, 0x5d, 0x9c, 0x96, 0x50, 0x82, 0xb7, 0x61, 0xbc, 0x51, + 0xae, 0x54, 0x97, 0x63, 0x49, 0x83, 0x65, 0x9d, 0xb0, 0x89, 0x8e, 0x99, + 0x8b, 0x5f, 0x59, 0x82, 0xbf, 0x85, 0x8c, 0xa1, 0xd7, 0xa8, 0x70, 0x9f, + 0x5a, 0x94, 0x6e, 0xa6, 0x7c, 0x8d, 0x48, 0x60, 0xc4, 0x63, 0xa5, 0x65, + 0x78, 0xbc, 0x5c, 0xb8, 0x69, 0xa0, 0xa5, 0x87, 0x8a, 0x83, 0x67, 0x4d, + 0x80, 0x83, 0x67, 0xa7, 0x55, 0xac, 0xb6, 0xae, 0xe4, 0x9b, 0xa4, 0xa0, + 0x5f, 0x71, 0x6b, 0x6c, 0xb3, 0x5a, 0x3b, 0xc4, 0x88, 0x81, 0xd5, 0x77, + 0x58, 0x60, 0x83, 0xb2, 0x8f, 0xc7, 0x95, 0x73, 0x44, 0x7f, 0xbe, 0x24, + 0xb5, 0x4c, 0xa9, 0xba, 0x96, 0x74, 0x6a, 0x56, 0x76, 0xa5, 0x7d, 0x9b, + 0xc7, 0x85, 0xb3, 0x58, 0xb0, 0x59, 0x64, 0x63, 0x8b, 0x92, 0x53, 0xc6, + 0x99, 0xc9, 0x84, 0xa9, 0x8c, 0x82, 0x70, 0xa2, 0xa1, 0x7b, 0xab, 0x9f, + 0x5e, 0xa9, 0x68, 0xab, 0xa6, 0x5a, 0x8b, 0x5e, 0xa5, 0x8e, 0x64, 0x6d, + 0x4b, 0x6f, 0x4d, 0xaf, 0xb2, 0x78, 0x85, 0x57, 0x91, 0x72, 0x96, 0xba, + 0x54, 0x7c, 0x6b, 0x41, 0x6a, 0x63, 0x77, 0x9e, 0x8b, 0x72, 0x62, 0xa7, + 0x84, 0x7f, 0xb2, 0x96, 0xb5, 0x98, 0x52, 0x66, 0x7a, 0x5c, 0x58, 0xb0, + 0xb4, 0x83, 0xa2, 0x9c, 0xbe, 0x70, 0xa4, 0x50, 0x84, 0x7f, 0x8a, 0x7a, + 0xb1, 0xb7, 0x79, 0x99, 0x6a, 0x45, 0x5e, 0x79, 0x57, 0x74, 0xa1, 0x4a, + 0x4c, 0xc6, 0x71, 0x93, 0x9f, 0x6a, 0x88, 0xc9, 0x71, 0x52, 0x55, 0x9c, + 0x40, 0xb7, 0x68, 0xad, 0x87, 0x88, 0x94, 0x65, 0xad, 
0x8b, 0x6f, 0x97, + 0x6e, 0x9e, 0x88, 0x7a, 0x88, 0x99, 0xaf, 0x73, 0xa3, 0x3f, 0x9e, 0xaa, + 0x76, 0x53, 0xc6, 0xa7, 0xb6, 0xba, 0xa2, 0x63, 0xaa, 0x70, 0x8a, 0xaa, + 0x4a, 0x76, 0xb2, 0xa5, 0x95, 0x80, 0x69, 0x64, 0x5d, 0x94, 0x84, 0x6f, + 0xb1, 0x85, 0xa3, 0xcb, 0x87, 0x3c, 0xa4, 0x9a, 0xbb, 0x6f, 0x72, 0xc3, + 0xb6, 0x6e, 0x83, 0x5e, 0xb7, 0xa7, 0x78, 0x96, 0x8e, 0x7e, 0x95, 0x58, + 0x77, 0x8f, 0x91, 0x65, 0xa4, 0x6f, 0x97, 0x52, 0x5c, 0x91, 0xbb, 0x34, + 0xac, 0x8c, 0x77, 0x68, 0xa8, 0xb6, 0xce, 0x96, 0xac, 0x9e, 0x9a, 0x6a, + 0x8c, 0xa5, 0xa6, 0xc6, 0x86, 0x97, 0x86, 0x59, 0xa0, 0x36, 0x8d, 0x9f, + 0x96, 0x49, 0x39, 0x55, 0x6e, 0x93, 0xa1, 0x49, 0x94, 0x8c, 0x72, 0x98, + 0x45, 0x6b, 0x41, 0x98, 0x76, 0x5c, 0x69, 0x73, 0x54, 0x9a, 0x6a, 0x5a, + 0xac, 0x4a, 0x9a, 0x6c, 0xa0, 0x92, 0x76, 0xb4, 0xb7, 0x8e, 0x81, 0x5e, + 0x61, 0xbe, 0xc0, 0x8e, 0x6e, 0x95, 0x79, 0x81, 0xd1, 0xb4, 0x98, 0x9c, + 0xd3, 0x83, 0xd6, 0x68, 0xa2, 0x8c, 0x72, 0x74, 0x95, 0x74, 0x59, 0x58, + 0xaa, 0x9b, 0x9b, 0x60, 0x8f, 0x9c, 0x6b, 0x7e, 0xa7, 0x36, 0x52, 0x77, + 0x52, 0x91, 0x6d, 0xdb, 0x77, 0x84, 0x91, 0xbb, 0xa7, 0x76, 0xd4, 0x6e, + 0x4b, 0x7f, 0xa1, 0x79, 0xb7, 0x9c, 0x81, 0x63, 0x78, 0xb4, 0xa0, 0xb5, + 0xc7, 0x9b, 0x8c, 0xa8, 0x89, 0x84, 0x9d, 0x6e, 0x58, 0x83, 0x6c, 0x9a, + 0x43, 0x65, 0x8e, 0xa3, 0x84, 0x89, 0x74, 0x69, 0xbb, 0x5d, 0xba, 0x62, + 0x30, 0x88, 0x54, 0x9c, 0x9d, 0x78, 0x5c, 0x50, 0x93, 0xa5, 0xc7, 0xd0, + 0xcf, 0xa8, 0x7a, 0x90, 0x89, 0x96, 0x57, 0x52, 0x4a, 0x5e, 0x6f, 0x70, + 0x74, 0x4e, 0xab, 0xa1, 0x90, 0x98, 0xb3, 0xcd, 0x58, 0x86, 0x5b, 0xba, + 0x92, 0x61, 0xa5, 0x6d, 0x78, 0xbd, 0x6b, 0x9c, 0x9e, 0x9f, 0xbf, 0xca, + 0xae, 0x73, 0x8a, 0x56, 0x67, 0xb5, 0xab, 0x6e, 0xac, 0xcc, 0x48, 0x76, + 0x6a, 0x61, 0x7e, 0x52, 0x5a, 0xb2, 0xa6, 0x50, 0xd4, 0x6f, 0x78, 0x6b, + 0xa3, 0x88, 0x91, 0x86, 0x83, 0x63, 0x5d, 0xa0, 0x94, 0x51, 0xb8, 0xa8, + 0xae, 0x5b, 0xc2, 0xb1, 0x65, 0x91, 0x5d, 0x9f, 0x77, 0x55, 0x7e, 0xa5, + 0xac, 0xbd, 0xc5, 0x9a, 0x2b, 0x53, 0x6f, 0x6c, 0xb5, 0x62, 0x71, 0x6f, + 0x82, 0x7c, 0x9c, 0x64, 0xb1, 0x94, 0xc5, 0x48, 0x45, 0x4f, 0x66, 0x99, + 0xac, 0xc4, 0xcd, 0xa5, 0x88, 0x7b, 0x5e, 0xa3, 0xb1, 0xca, 0x90, 0x65, + 0x68, 0x62, 0xa3, 0x6b, 0xb3, 0xa4, 0x45, 0x54, 0xce, 0x99, 0x98, 0x37, + 0x61, 0x6c, 0x4a, 0x91, 0xd1, 0x6f, 0x91, 0xa7, 0x74, 0x8e, 0x8c, 0x9d, + 0xa3, 0xac, 0x4d, 0x6a, 0x8e, 0x6e, 0xb4, 0x64, 0xb0, 0x93, 0x84, 0x6d, + 0x79, 0xcd, 0x81, 0x95, 0x8d, 0x9c, 0x8b, 0xa1, 0xa9, 0x7d, 0xa5, 0x58, + 0xa1, 0x73, 0x78, 0x75, 0x84, 0xaa, 0x60, 0x50, 0x5e, 0x60, 0x81, 0xa3, + 0x58, 0x45, 0x67, 0xa8, 0xa1, 0x78, 0xbc, 0xaa, 0x60, 0x80, 0xd3, 0x64, + 0x5f, 0x94, 0x4b, 0xab, 0xaa, 0xbf, 0x75, 0x8a, 0xa4, 0xae, 0x5c, 0xa9, + 0x8e, 0xb3, 0x76, 0x91, 0x4e, 0x79, 0x69, 0x84, 0xaf, 0xa9, 0x8e, 0x56, + 0xb9, 0xaa, 0x89, 0x84, 0x87, 0x75, 0x5f, 0x5f, 0x93, 0x83, 0x5b, 0x9a, + 0x6a, 0x9b, 0x60, 0x93, 0xa8, 0x71, 0x87, 0x98, 0x5d, 0xeb, 0x99, 0x3c, + 0xfd, 0x3a, 0x4f, 0xb1, 0x56, 0x58, 0xa5, 0x78, 0x99, 0x97, 0x69, 0xd0, + 0x76, 0x52, 0x74, 0x9c, 0x82, 0x5a, 0xa5, 0x63, 0x9d, 0x48, 0x60, 0x8b, + 0x9d, 0x84, 0x4b, 0x8d, 0xab, 0xb1, 0x5c, 0x74, 0x66, 0x4c, 0x8c, 0x62, + 0x92, 0xae, 0x62, 0x80, 0x8f, 0x70, 0x52, 0xae, 0xb9, 0xa3, 0x70, 0x79, + 0x77, 0xbd, 0x78, 0x9c, 0x7c, 0xbe, 0x95, 0x9c, 0x3e, 0x97, 0xba, 0x7d, + 0x50, 0x69, 0xab, 0xb9, 0x64, 0x74, 0x88, 0xb9, 0x8c, 0x8f, 0x97, 0x2e, + 0x86, 0x58, 0xa3, 0x8b, 0xa6, 0x83, 0xa4, 0x9f, 0x83, 0xb7, 0x82, 0xc2, + 0x7a, 0xcd, 0x5f, 0xa1, 0x4a, 0x85, 0x69, 0xa1, 0xda, 0x7f, 0xc6, 0x80, + 0x6c, 0x76, 0x80, 0x47, 0xa6, 0xc2, 0xbf, 0xa7, 0xa8, 
0x40, 0x64, 0x32, + 0x60, 0xa3, 0x3b, 0x63, 0xac, 0xbd, 0xa7, 0x72, 0x62, 0xa3, 0x68, 0x3c, + 0x3c, 0x6e, 0x94, 0x81, 0x90, 0x49, 0x93, 0xad, 0x9b, 0x8a, 0xc9, 0x85, + 0xb2, 0x67, 0x68, 0x9c, 0x79, 0x6b, 0x5e, 0x92, 0xaf, 0xb3, 0x3f, 0x2a, + 0x7a, 0x9c, 0x91, 0x7b, 0xc7, 0x8c, 0x53, 0x86, 0xba, 0x7a, 0x65, 0x8a, + 0x92, 0x4f, 0x6e, 0x86, 0x5a, 0xb0, 0xa4, 0x8f, 0x7d, 0x57, 0x73, 0x9e, + 0x8d, 0x70, 0xba, 0x7f, 0x9e, 0x70, 0x3b, 0x60, 0xa7, 0x95, 0x64, 0x73, + 0x58, 0x61, 0x3f, 0x4f, 0x57, 0x6b, 0x74, 0x7f, 0x7e, 0x6a, 0x76, 0x5b, + 0x6b, 0xa5, 0xb1, 0x76, 0x8f, 0xb5, 0xc8, 0x89, 0x6d, 0x6e, 0x7b, 0xdf, + 0xa6, 0x7c, 0x77, 0x65, 0xba, 0x6d, 0xc0, 0xc1, 0x80, 0x74, 0x6d, 0x7b, + 0x3c, 0xa5, 0x63, 0xe1, 0x84, 0x4c, 0xd0, 0x4c, 0x9b, 0x83, 0x93, 0x90, + 0x91, 0x59, 0x53, 0xd6, 0x76, 0xaf, 0xa0, 0x2c, 0x7c, 0xbc, 0x90, 0xa2, + 0x8b, 0x35, 0x38, 0x80, 0xb7, 0x62, 0xa7, 0x57, 0xb0, 0xb0, 0x76, 0x4d, + 0x92, 0xa0, 0x8b, 0x40, 0x88, 0x72, 0xb4, 0xa1, 0xa6, 0x8f, 0x9c, 0x48, + 0xb3, 0x5a, 0x75, 0x80, 0x70, 0xa9, 0x8b, 0xc7, 0x35, 0xc6, 0xa1, 0x5c, + 0x74, 0x72, 0x3c, 0xc2, 0xc6, 0xac, 0xab, 0x92, 0x52, 0x6c, 0x56, 0x92, + 0xc0, 0xa6, 0xc7, 0x4e, 0x65, 0x9c, 0x87, 0x7a, 0x4c, 0x92, 0x79, 0xd3, + 0x78, 0xa5, 0x84, 0xb2, 0x6c, 0xcb, 0xb0, 0xbf, 0x81, 0xa2, 0x5c, 0x63, + 0xb4, 0x69, 0x54, 0x6f, 0x7a, 0xa9, 0x42, 0x70, 0x98, 0x8f, 0x8b, 0x8f, + 0x88, 0x86, 0x85, 0xb4, 0xa2, 0xaf, 0xb2, 0x91, 0x6d, 0xbe, 0x2a, 0xae, + 0xb6, 0x45, 0xd3, 0x57, 0x60, 0xa0, 0x45, 0x80, 0x42, 0xa2, 0x86, 0x64, + 0x66, 0x81, 0x82, 0x6f, 0x9a, 0xa7, 0xab, 0xc6, 0x7b, 0xda, 0x9c, 0x5f, + 0xa6, 0xbd, 0x7d, 0x9c, 0xe8, 0x78, 0x5b, 0x82, 0xa3, 0x6b, 0x5d, 0xa3, + 0xb5, 0x68, 0x5b, 0x78, 0xd6, 0x72, 0x73, 0x91, 0x8b, 0x9c, 0xb8, 0x90, + 0x9c, 0x87, 0xb6, 0x7a, 0xa7, 0x67, 0x84, 0x5b, 0xa9, 0x43, 0x72, 0xa4, + 0x37, 0xb0, 0x53, 0xa3, 0x81, 0x70, 0x59, 0x72, 0x82, 0x66, 0x6d, 0x45, + 0xd0, 0x99, 0xae, 0x4c, 0x51, 0x67, 0x67, 0x51, 0x45, 0x3d, 0xad, 0x55, + 0x6f, 0xad, 0x72, 0xcc, 0x64, 0x67, 0xc3, 0x85, 0x7d, 0xcd, 0x50, 0x75, + 0x6b, 0x64, 0x7f, 0x8b, 0x93, 0x50, 0x69, 0x7c, 0xa3, 0x96, 0x79, 0x56, + 0x5f, 0x9f, 0x7d, 0x2d, 0x7d, 0xca, 0xbf, 0xa8, 0x77, 0xb0, 0x65, 0x9f, + 0xa0, 0x50, 0xa7, 0x50, 0x94, 0x79, 0x89, 0xbc, 0x6e, 0x8a, 0xb2, 0x7c, + 0xc1, 0x66, 0x9f, 0x73, 0xc4, 0x66, 0x71, 0xa4, 0x66, 0x61, 0x65, 0xde, + 0x64, 0x6d, 0x99, 0x69, 0x69, 0x74, 0x66, 0x61, 0x9b, 0x4f, 0x8a, 0xa2, + 0xf3, 0xfb, 0xff, 0xff, 0x75, 0x7, 0, 0, 0xdc, 0xfd, 0xff, 0xff, + 0xc3, 0xfd, 0xff, 0xff, 0xc1, 0xfc, 0xff, 0xff, 0x68, 0x4, 0, 0, + 0x20, 0xfd, 0xff, 0xff, 0x4f, 0x1, 0, 0, 0x2d, 0x2, 0, 0, + 0x33, 0xff, 0xff, 0xff, +}; diff --git a/src/tim/vx/context.cc b/src/tim/vx/context.cc new file mode 100644 index 0000000..df8b694 --- /dev/null +++ b/src/tim/vx/context.cc @@ -0,0 +1,53 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#include "tim/vx/context.h"
+
+#include "context_private.h"
+#include "graph_private.h"
+#include "tim/vx/graph.h"
+#include "vsi_nn_pub.h"
+
+namespace tim {
+namespace vx {
+
+ContextImpl::ContextImpl() : context_(vsi_nn_CreateContext()) {}
+
+ContextImpl::~ContextImpl() {
+  if (context_) {
+    vsi_nn_ReleaseContext(&context_);
+  }
+}
+
+vsi_nn_context_t ContextImpl::context() { return context_; }
+
+std::shared_ptr<Context> Context::Create() {
+  return std::make_shared<ContextImpl>();
+}
+
+std::shared_ptr<Graph> ContextImpl::CreateGraph() {
+  return std::make_shared<GraphImpl>(this);
+}
+
+} // namespace vx
+} // namespace tim
\ No newline at end of file
diff --git a/src/tim/vx/context_private.h b/src/tim/vx/context_private.h
new file mode 100644
index 0000000..23b2807
--- /dev/null
+++ b/src/tim/vx/context_private.h
@@ -0,0 +1,46 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef TIM_VX_CONTEXT_PRIVATE_H_
+#define TIM_VX_CONTEXT_PRIVATE_H_
+#include "tim/vx/context.h"
+#include "vsi_nn_pub.h"
+
+namespace tim {
+namespace vx {
+
+class ContextImpl : public Context {
+ public:
+  ContextImpl();
+  ~ContextImpl();
+  vsi_nn_context_t context();
+  std::shared_ptr<Graph> CreateGraph();
+
+ protected:
+  vsi_nn_context_t context_;
+};
+
+} // namespace vx
+} // namespace tim
+
+#endif /* TIM_VX_CONTEXT_PRIVATE_H_ */
\ No newline at end of file
diff --git a/src/tim/vx/graph.cc b/src/tim/vx/graph.cc
new file mode 100644
index 0000000..c9852f7
--- /dev/null
+++ b/src/tim/vx/graph.cc
@@ -0,0 +1,90 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#include "tim/vx/graph.h"
+
+#include <algorithm>
+
+#include "context_private.h"
+#include "graph_private.h"
+#include "tensor_private.h"
+#include "tim/vx/context.h"
+#include "vsi_nn_pub.h"
+
+namespace tim {
+namespace vx {
+
+GraphImpl::GraphImpl(ContextImpl* context)
+    : context_(context),
+      graph_(vsi_nn_CreateGraph(context_->context(), 0, 0)),
+      tensor_placeholder_(nullptr),
+      compiled_(false) {}
+
+GraphImpl::~GraphImpl() { vsi_nn_ReleaseGraph(&graph_); }
+
+vsi_nn_graph_t* GraphImpl::graph() { return graph_; }
+
+void GraphImpl::AddInput(vsi_nn_tensor_id_t id) {
+  if (inputs_.end() == std::find(inputs_.begin(), inputs_.end(), id)) {
+    inputs_.push_back(id);
+  }
+}
+
+void GraphImpl::AddOutput(vsi_nn_tensor_id_t id) {
+  if (outputs_.end() == std::find(outputs_.begin(), outputs_.end(), id)) {
+    outputs_.push_back(id);
+  }
+}
+
+std::shared_ptr<Tensor> GraphImpl::CreateTensor(const TensorSpec& spec,
+                                                const void* data) {
+  return std::make_shared<TensorImpl>(this, spec, data);
+}
+
+std::shared_ptr<Tensor> GraphImpl::CreateTensorPlaceHolder() {
+  if (!tensor_placeholder_) {
+    tensor_placeholder_ = std::make_shared<TensorPlaceholder>(this);
+  }
+
+  return tensor_placeholder_;
+}
+
+bool GraphImpl::Compile() {
+  compiled_ = true;
+
+  vsi_nn_SetGraphInputs(graph_, inputs_.data(), inputs_.size());
+  vsi_nn_SetGraphOutputs(graph_, outputs_.data(), outputs_.size());
+
+  return (VSI_SUCCESS == vsi_nn_SetupGraph(graph_, true) &&
+          VSI_SUCCESS == vsi_nn_VerifyGraph(graph_));
+}
+
+bool GraphImpl::Run() {
+  if (!compiled_ && !Compile()) {
+    return false;
+  }
+  return (VSI_SUCCESS == vsi_nn_RunGraph(graph_));
+}
+
+} // namespace vx
+} // namespace tim
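The Context/Graph implementations above are thin RAII wrappers over the ovxlib C objects, exposed through shared_ptr factories. The sketch below illustrates the intended call order only; it assumes that the public headers under include/tim/vx declare the same CreateGraph/CreateTensor/Compile/Run entry points that the implementations here provide, and that a TensorSpec and the operation bindings are built from parts of the tree not shown in this chunk.

```cpp
// Minimal usage sketch, not part of the commit. Assumes the declarations in
// include/tim/vx mirror the implementations in context.cc / graph.cc above.
#include <memory>
#include <vector>

#include "tim/vx/context.h"
#include "tim/vx/graph.h"

bool RunSimpleGraph(const tim::vx::TensorSpec& input_spec,
                    const std::vector<float>& input_data) {
  auto context = tim::vx::Context::Create();  // ContextImpl -> vsi_nn_CreateContext()
  auto graph = context->CreateGraph();        // GraphImpl   -> vsi_nn_CreateGraph()

  // CreateTensor() optionally takes initial data; nullptr leaves it empty.
  auto input = graph->CreateTensor(input_spec, input_data.data());

  // ... create operations and bind them between tensors here (not in this chunk) ...

  // Compile() calls vsi_nn_SetGraphInputs/Outputs, SetupGraph and VerifyGraph;
  // Run() compiles lazily if needed and then calls vsi_nn_RunGraph.
  return graph->Compile() && graph->Run();
}
```

A false return from Run() therefore covers both a failed lazy compile and a failed vsi_nn_RunGraph call.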
diff --git a/src/tim/vx/graph_private.h b/src/tim/vx/graph_private.h
new file mode 100644
index 0000000..e4e2820
--- /dev/null
+++ b/src/tim/vx/graph_private.h
@@ -0,0 +1,65 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef TIM_VX_GRAPH_PRIVATE_H_
+#define TIM_VX_GRAPH_PRIVATE_H_
+#include <vector>
+
+#include "context_private.h"
+#include "tim/vx/graph.h"
+#include "vsi_nn_pub.h"
+
+namespace tim {
+namespace vx {
+
+class GraphImpl : public Graph {
+ public:
+  GraphImpl(ContextImpl* context);
+  ~GraphImpl();
+
+  /// Return the low-level graph object
+  vsi_nn_graph_t* graph();
+
+  void AddInput(vsi_nn_tensor_id_t id);
+  void AddOutput(vsi_nn_tensor_id_t id);
+
+  /// Implement parents' virtual functions
+  std::shared_ptr<Tensor> CreateTensor(const TensorSpec& spec,
+                                       const void* data = nullptr);
+  std::shared_ptr<Tensor> CreateTensorPlaceHolder();
+  bool Compile();
+  bool Run();
+
+ protected:
+  ContextImpl* context_;
+  vsi_nn_graph_t* graph_;
+  std::shared_ptr<Tensor> tensor_placeholder_;
+  bool compiled_;
+  std::vector<vsi_nn_tensor_id_t> inputs_;
+  std::vector<vsi_nn_tensor_id_t> outputs_;
+};
+
+} // namespace vx
+} // namespace tim
+
+#endif /* TIM_VX_GRAPH_PRIVATE_H_ */
\ No newline at end of file
diff --git a/src/tim/vx/internal/.clang-format b/src/tim/vx/internal/.clang-format
new file mode 100644
index 0000000..bcbd2bb
--- /dev/null
+++ b/src/tim/vx/internal/.clang-format
@@ -0,0 +1,26 @@
+#
+# Copyright (C) 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+BasedOnStyle: Google
+CommentPragmas: NOLINT:.*
+DerivePointerAlignment: false
+AllowShortFunctionsOnASingleLine: Inline
+ColumnLimit: 80
+TabWidth: 4
+UseTab: Never
+IndentWidth: 4
+BinPackArguments: false
+BinPackParameters: false
diff --git a/src/tim/vx/internal/.gitignore b/src/tim/vx/internal/.gitignore
new file mode 100644
index 0000000..6858186
--- /dev/null
+++ b/src/tim/vx/internal/.gitignore
@@ -0,0 +1,338 @@
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files +*.suo +*.user +*.userosscache +*.sln.docstates + +*-[Dd]ebug/ +*-[Dd]ebugPublic/ +*-[Rr]elease/ +*-[Rr]eleases/ + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs +.bazelrc +.config_wksp.bzl + +# Build results +*.o +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +NNApi0.3/ +NNApi0.4/ +OpenVX1.2/ +lib/ +bazel-bin +bazel-genfiles +bazel-out +bazel-ovxlib +bazel-testlogs + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# VS code +.vscode + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUNIT +*.VisualState.xml +TestResult.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ +**/Properties/launchSettings.json + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_i.h +*.ilk +*.meta +*.obj +*.pch +*.pdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# JustCode is a .NET coding add-in +.JustCode + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. 
+!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# TypeScript v1 declaration files +typings/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) +*.vbw + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# JetBrains Rider +.idea/ +*.sln.iml + +# CodeRush +.cr/ + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# IDE +.settings/ diff --git a/src/tim/vx/internal/BUILD b/src/tim/vx/internal/BUILD new file mode 100644 index 0000000..ae21b3d --- /dev/null +++ b/src/tim/vx/internal/BUILD @@ -0,0 +1,223 @@ +# Description: +# VSI OVX wrapper logic + +package( + default_visibility = ["//visibility:public"], + features = ["-parse_headers"], +) + +load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_tar") + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + "external/*" + ], + ), +) + +filegroup( + name = "kernel_hdrs", + srcs = glob([ + "include/kernel/cl/*.h", + "include/kernel/evis/*.h", + "include/kernel/cpu/*.h", + ]) +) + +filegroup( + name = "kernel_srcs", + srcs = glob([ + "src/kernel/cl/*.c", + "src/kernel/evis/*.c", + "src/kernel/cpu/*.c", + "src/kernel/vx/*.c", + ]) +) + +filegroup( + name = "operation_srcs", + srcs = glob([ + "src/ops/*.c", + ]) +) + +filegroup( + name = "operation_hdrs", + srcs = glob([ + "include/ops/*.h", + ]) +) + +filegroup( + name = "custom_hdrs", + srcs = glob([ + 
"include/custom/ops/*.h", + ]) + [ + #custom + "include/custom/custom_node_type.def", + "include/custom/custom_ops.def", + "include/custom/vsi_nn_custom_node_type.h", + ] +) + +filegroup( + name = "custom_srcs", + srcs = glob([ + "src/custom/ops/*.c", + "src/custom/ops/kernel/*.c", + ]) +) + +cc_library( + name = "ovxlibimpl", + copts = [ + "-Werror", "-Wmisleading-indentation", + "-fvisibility=hidden", '-DOVXLIB_API=__attribute__((visibility(\\"default\\")))', + ], + linkopts = ["-ldl", "-lm"], + alwayslink=True, + linkstatic = True, + includes = [ + "include", + ], + hdrs = [ + "include/vsi_nn_pub.h", + "include/vsi_nn_ops.h", + "include/vsi_nn_log.h", + "include/vsi_nn_context.h", + "include/vsi_nn_node_attr_template.h", + "include/vsi_nn_tensor.h", + "include/vsi_nn_prv.h", + "include/vsi_nn_types.h", + "include/vsi_nn_node.h", + "include/vsi_nn_node_type.h", + "include/vsi_nn_client_op.h", + "include/vsi_nn_graph.h", + "include/vsi_nn_test.h", + "include/vsi_nn_tensor_util.h", + "include/vsi_nn_version.h", + "include/vsi_nn_compatibility.h", + "include/vsi_nn_assert.h", + "include/vsi_nn_feature.h", + "include/vsi_nn_rnn.h", + "include/vsi_nn_rnn_helper.h", + "include/vsi_nn_rnn_prv.h", + "include/vsi_nn_internal_node.h", + "include/vsi_nn_daemon.h", + "include/vsi_nn_pre_post_process.h", + "include/vsi_nn_graph_optimization.h", + "include/utils/vsi_nn_link_list.h", + "include/utils/vsi_nn_math.h", + "include/utils/vsi_nn_util.h", + "include/utils/vsi_nn_code_generator.h", + "include/utils/vsi_nn_binary_tree.h", + "include/utils/vsi_nn_map.h", + "include/utils/vsi_nn_hashmap.h", + "include/utils/vsi_nn_limits.h", + "include/utils/vsi_nn_dtype_util.h", + "include/utils/vsi_nn_dtype_util_prv.h", + "include/utils/vsi_nn_vdata.h", + "include/utils/vsi_nn_tensor_op.h", + "include/utils/vsi_nn_shape_util.h", + "include/utils/vsi_nn_constraint_check.h", + "include/quantization/vsi_nn_asymmetric_affine.h", + "include/quantization/vsi_nn_dynamic_fixed_point.h", + "include/quantization/vsi_nn_perchannel_symmetric_affine.h", + "include/client/vsi_nn_vxkernel.h", + "include/interface/ops.def", + "include/kernel/vsi_nn_kernel.h", + "include/kernel/vsi_nn_gpu.h", + "include/kernel/vsi_nn_gpu_config.h", + "include/kernel/vsi_nn_kernel_eltwise.h", + "include/kernel/vsi_nn_kernel_node.h", + "include/kernel/vsi_nn_kernel_gpu_shape_optimize.h", + "include/vsi_nn_error.h", + + # libnnext + "include/libnnext/vx_lib_nnext.h", + "include/libnnext/vsi_nn_libnnext_resource.h", + + #internal + "include/internal/internal_ops.def", + "include/vsi_nn_feature_config.h" + ] + [":kernel_hdrs"] + + [":operation_hdrs"] + + [":custom_hdrs"] + + [ + "include/vsi_nn_platform.h", + ], + srcs = [ + "src/vsi_nn_graph.c", + "src/vsi_nn_ops.c", + "src/vsi_nn_context.c", + "src/vsi_nn_node.c", + "src/vsi_nn_tensor.c", + "src/vsi_nn_client_op.c", + "src/vsi_nn_node_attr_template.c", + "src/vsi_nn_version.c", + "src/vsi_nn_rnn.c", + "src/vsi_nn_rnn_helper.c", + "src/vsi_nn_log.c", + "src/vsi_nn_internal_node.c", + "src/vsi_nn_daemon.c", + "src/vsi_nn_graph_optimization.c", + "src/vsi_nn_pre_post_process.c", + "src/client/vsi_nn_vxkernel.c", + "src/utils/vsi_nn_link_list.c", + "src/utils/vsi_nn_util.c", + "src/utils/vsi_nn_math.c", + "src/utils/vsi_nn_code_generator.c", + "src/utils/vsi_nn_binary_tree.c", + "src/utils/vsi_nn_map.c", + "src/utils/vsi_nn_hashmap.c", + "src/utils/vsi_nn_limits.c", + "src/utils/vsi_nn_dtype_util.c", + "src/utils/vsi_nn_vdata.c", + "src/utils/vsi_nn_tensor_op.c", + "src/utils/vsi_nn_shape_util.c", + 
"src/utils/vsi_nn_dtype.c", + "src/utils/vsi_nn_constraint_check.c", + "src/quantization/vsi_nn_asymmetric_affine.c", + "src/quantization/vsi_nn_dynamic_fixed_point.c", + "src/quantization/vsi_nn_perchannel_symmetric_affine.c", + "src/kernel/vsi_nn_kernel.c", + "src/kernel/vsi_nn_kernel_util.c", + "src/kernel/vsi_nn_kernel_backend.c", + "src/kernel/vsi_nn_kernel_eltwise.c", + "src/kernel/vsi_nn_kernel_selector.c", + "src/kernel/vsi_nn_kernel_node.c", + "src/kernel/vsi_nn_kernel_param.c", + "src/kernel/vsi_nn_gpu.c", + "src/kernel/vsi_nn_kernel_gpu_shape_optimize.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_crop.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_resize.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_scale.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_topk.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c", + "src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c", + "src/libnnext/vsi_nn_libnnext_resource.c", + ] + [":kernel_srcs"] + + [":operation_srcs"] + + [":custom_srcs"], + deps = ["//prebuilt-sdk:VIV_SDK_LIB"] +) + diff --git a/src/tim/vx/internal/include/client/vsi_nn_vxkernel.h b/src/tim/vx/internal/include/client/vsi_nn_vxkernel.h new file mode 100644 index 0000000..e486949 --- /dev/null +++ b/src/tim/vx/internal/include/client/vsi_nn_vxkernel.h @@ -0,0 +1,132 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_VXKERNEL_H +#define _VSI_NN_VXKERNEL_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum _vx_kernel_type_e +{ + VX_KERNEL_TYPE_CPU, + VX_KERNEL_TYPE_VX, + VX_KERNEL_TYPE_BIN +} vx_kernel_type_e; + +typedef struct vsi_nn_kernel_info +{ + char **resource_name; + uint8_t resource_num; + vx_kernel_type_e type; + vx_kernel_description_t ** kernel; + uint8_t kernel_index; + uint8_t init_index; +} vsi_nn_kernel_info_t; + +uint8_t * vsi_nn_LoadBinarySource + ( + uint8_t * file, + int32_t * sz + ); + +vsi_status vsi_nn_RegisterClientKernel + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_info_t * kernel_info + ); + +/* + * Deprecated(vsi_nn_RegisterClientKernelAndCreateNode): use vsi_nn_RegisterClientKernelAndNewNode() insteatd. +*/ +OVXLIB_API vx_node vsi_nn_RegisterClientKernelAndCreateNode + ( + vsi_nn_graph_t * graph, + vx_kernel_description_t * kernel + ); + +OVXLIB_API vx_node vsi_nn_RegisterClientKernelAndNewNode + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_info_t * kernel_info + ); + +OVXLIB_API vsi_status vsi_nn_ClientNodePassParameters + ( + vx_node node, + vx_reference * params, + uint32_t num + ); + +OVXLIB_API vsi_status VX_CALLBACK vsi_nn_KernelValidator + ( + vx_node node, + const vx_reference parameters[], + uint32_t num, + vx_meta_format metas[] + ); + +OVXLIB_API vsi_status VX_CALLBACK vsi_nn_KernelInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + uint32_t paraNum + ); + +OVXLIB_API vsi_status VX_CALLBACK vsi_nn_KernelDeinitializer + ( + vx_node nodObj, + const vx_reference *paraObj, + uint32_t paraNum + ); + +OVXLIB_API const char * vsi_nn_VxResourceGetPath(); + +OVXLIB_API void vsi_nn_VxResourceSetPath + ( + char* path + ); + +OVXLIB_API const uint8_t * vsi_nn_VxBinResourceGetResource + ( + char* name, + vx_size *len + ); + +OVXLIB_API vx_kernel_type_e vsi_nn_GetVXKernelTypeForShader(); + +OVXLIB_API vx_bool vsi_nn_is_do_vx_op_pre_init + ( + vx_kernel_type_e type + ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/custom/custom_node_type.def b/src/tim/vx/internal/include/custom/custom_node_type.def new file mode 100644 index 0000000..034c37f --- /dev/null +++ b/src/tim/vx/internal/include/custom/custom_node_type.def @@ -0,0 +1,4 @@ +/* + custom op data struct def +*/ +DEF_NODE_TYPE(custom_softmax) diff --git a/src/tim/vx/internal/include/custom/custom_ops.def b/src/tim/vx/internal/include/custom/custom_ops.def new file mode 100644 index 0000000..8ef4d50 --- /dev/null +++ b/src/tim/vx/internal/include/custom/custom_ops.def @@ -0,0 +1,4 @@ +/* + Add custom ops to the end. 
+*/ +DEF_OP(CUSTOM_SOFTMAX) diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_softmax.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_softmax.h new file mode 100644 index 0000000..037d489 --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_softmax.h @@ -0,0 +1,35 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CUSTOM_SOFTMAX_H +#define _VSI_NN_OP_CUSTOM_SOFTMAX_H + +#include "vsi_nn_platform.h" +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_custom_softmax_param +{ + int32_t axis; +} vsi_nn_custom_softmax_param; + +#endif diff --git a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h new file mode 100644 index 0000000..16d3d0c --- /dev/null +++ b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h @@ -0,0 +1,31 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
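The custom_ops.def and custom_node_type.def files above are X-macro lists: a consumer defines DEF_OP or DEF_NODE_TYPE to the expansion it needs and then includes the .def file, which is also why the comments insist that new entries go at the end (earlier entries keep their generated positions). A hedged illustration of how such a list is typically consumed; the enum and prefix names below are illustrative, not necessarily what ovxlib generates:

```cpp
// Illustrative only: the X-macro pattern behind the *.def files. The real
// expansions live elsewhere in ovxlib and may differ in naming.
typedef enum {
#define DEF_OP(NAME) SAMPLE_OP_##NAME,
#include "interface/ops.def"      // ADD, MULTIPLY, CONV2D, ...
#include "custom/custom_ops.def"  // CUSTOM_SOFTMAX appended after the built-ins
#undef DEF_OP
  SAMPLE_OP_NUM
} sample_op_e;
```

The same mechanism presumably feeds the node-parameter definitions behind vsi_nn_custom_node_type.h, so adding a custom operation touches the two .def files plus a parameter struct such as vsi_nn_custom_softmax_param.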
+* +*****************************************************************************/ +#ifndef _VSI_NN_CUSTOM_NODE_TYPE_H_ +#define _VSI_NN_CUSTOM_NODE_TYPE_H_ +/* + custom op head files +*/ +#include "custom/ops/vsi_nn_op_custom_softmax.h" + +#endif diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def new file mode 100644 index 0000000..0a5077b --- /dev/null +++ b/src/tim/vx/internal/include/interface/ops.def @@ -0,0 +1,146 @@ +/* + Add new ops to the end. +*/ +DEF_OP(ADD) +DEF_OP(MULTIPLY) +DEF_OP(CONV2D) +DEF_OP(CONV_RELU) +DEF_OP(CONV_RELU_POOL) +DEF_OP(FCL) +DEF_OP(FCL_RELU) +DEF_OP(SOFTMAX) +DEF_OP(POOL) +DEF_OP(LEAKY_RELU) +DEF_OP(LRN) +DEF_OP(CONCAT) +DEF_OP(SPLIT) +DEF_OP(NOOP) +DEF_OP(ROI_POOL) +DEF_OP(BATCH_NORM) +DEF_OP(PROPOSAL) +DEF_OP(DECONVOLUTION) +DEF_OP(RESHAPE) +DEF_OP(PERMUTE) +DEF_OP(PRELU) +DEF_OP(UPSAMPLE) +DEF_OP(RELU) +DEF_OP(RELUN) +DEF_OP(LSTM) +DEF_OP(REORG) +DEF_OP(VARIABLE) +DEF_OP(L2_NORMALIZE) +DEF_OP(FCL2) +DEF_OP(POOLWITHARGMAX) +DEF_OP(ARGMAX) +DEF_OP(MAXIMUM) +DEF_OP(L2NORMALIZESCALE) +DEF_OP(CROP) +DEF_OP(SUBTRACT) +DEF_OP(RELU6) +DEF_OP(SIGMOID) +DEF_OP(TANH) +DEF_OP(SQRT) +DEF_OP(RSQRT) +DEF_OP(SOFTRELU) +DEF_OP(DIVIDE) +DEF_OP(DROPOUT) +DEF_OP(SHUFFLECHANNEL) +DEF_OP(RESIZE) +DEF_OP(REVERSE) +DEF_OP(DEPTH2SPACE) +DEF_OP(SPACE2DEPTH) +DEF_OP(DATACONVERT) +DEF_OP(SCALE) +DEF_OP(SLICE) +DEF_OP(ELU) +DEF_OP(BATCH2SPACE) +DEF_OP(SPACE2BATCH) +DEF_OP(PAD) +DEF_OP(IMAGEPROCESS) +DEF_OP(MATRIXMUL) +DEF_OP(LSTMUNIT) +DEF_OP(LAYER_NORM) +DEF_OP(REDUCE) +DEF_OP(INSTANCE_NORM) +DEF_OP(TENSORSTACKCONCAT) +DEF_OP(STRIDED_SLICE) +DEF_OP(SIGNAL_FRAME) +DEF_OP(A_TIMES_B_PLUS_C) +DEF_OP(SVDF) +DEF_OP(ABS) +DEF_OP(CONV1D) +DEF_OP(NBG) +DEF_OP(CONCATSHIFT) +DEF_OP(LRN2) +DEF_OP(RELATIONAL_OPS) +DEF_OP(SYNC_HOST) +DEF_OP(POW) +DEF_OP(FLOORDIV) +DEF_OP(MINIMUM) +DEF_OP(SPATIAL_TRANSFORMER) +DEF_OP(LOGICAL_OPS) +DEF_OP(SELECT) +DEF_OP(LSTMUNIT_ACTIVATION) +DEF_OP(LSTMUNIT_OVXLIB) +DEF_OP(TENSOR_ADD_MEAN_STDDEV_NORM) +DEF_OP(RELU1) +DEF_OP(STACK) +DEF_OP(FLOOR) +DEF_OP(SQUARE) +DEF_OP(NEG) +DEF_OP(EXP) +DEF_OP(LSTM_OVXLIB) +DEF_OP(PRE_PROCESS_TENSOR) +DEF_OP(HASHTABLE_LOOKUP) +DEF_OP(EMBEDDING_LOOKUP) +DEF_OP(LSH_PROJECTION) +DEF_OP(RNN) +DEF_OP(CLIP) +DEF_OP(POST_PROCESS) +DEF_OP(PRE_PROCESS_GRAY) +DEF_OP(UNSTACK) +DEF_OP(PRE_PROCESS_RGB) +DEF_OP(PRE_PROCESS) +DEF_OP(ADDN) +DEF_OP(PRE_PROCESS_YUV420) +DEF_OP(EXTRA_ENDING) +DEF_OP(GATHER) +DEF_OP(TILE) +DEF_OP(GROUPED_CONV2D) +DEF_OP(TOPK) +DEF_OP(PRE_PROCESS_BGRA) +DEF_OP(LOGICAL_NOT) +DEF_OP(SIN) +DEF_OP(LOG) +DEF_OP(ARGMIN) +DEF_OP(ROI_ALIGN) +DEF_OP(HEATMAP_MAX_KEYPOINT) +DEF_OP(AXIS_ALIGNED_BBOX_TRANSFORM) +DEF_OP(BOX_WITH_NMS_LIMIT) +DEF_OP(GENERATE_PROPOSALS) +DEF_OP(DETECTION_POSTPROCESS) +DEF_OP(RANDOM_MULTINOMIAL) +DEF_OP(LOG_SOFTMAX) +DEF_OP(RELU_KERAS) +DEF_OP(GRU_OVXLIB) +DEF_OP(GRUCELL_OVXLIB) +DEF_OP(UNIDIRECTIONAL_SEQUENCE_RNN) +DEF_OP(QUANTIZED_16BIT_LSTM) +DEF_OP(BIDIRECTIONAL_SEQUENCE_RNN) +DEF_OP(BIDIRECTIONAL_SEQUENCE_LSTM) +DEF_OP(RNNCELL_OVXLIB) +DEF_OP(SWISH) +DEF_OP(DEPTHWISE_CONV1D) +DEF_OP(GATHER_ND) +DEF_OP(CAST) +DEF_OP(LINEAR) +DEF_OP(BATCHNORM_SINGLE) +DEF_OP(MOMENTS) +DEF_OP(SQUEEZE) +DEF_OP(HARD_SIGMOID) +DEF_OP(MISH) +DEF_OP(EXPAND_BROADCAST) +DEF_OP(PRE_PROCESS_YUV444) +DEF_OP(PRE_PROCESS_NV12) +DEF_OP(SCATTER_ND) +DEF_OP(DECONVOLUTION1D) diff --git a/src/tim/vx/internal/include/internal/internal_ops.def b/src/tim/vx/internal/include/internal/internal_ops.def new file mode 100644 index 0000000..2a1ac9e --- /dev/null +++ 
b/src/tim/vx/internal/include/internal/internal_ops.def @@ -0,0 +1,16 @@ +/* + Add internal ops to the end. +*/ +DEF_OP(SOFTMAX_INTERNAL) +DEF_OP(RELU_KERAS_INTERNAL) +DEF_OP(REDUCESUM_INTERNAL) +DEF_OP(REDUCEMAX_INTERNAL) +DEF_OP(REDUCEMIN_INTERNAL) +DEF_OP(REDUCEPROD_INTERNAL) +DEF_OP(REDUCEALL_INTERNAL) +DEF_OP(REDUCEANY_INTERNAL) +DEF_OP(RESIZE_INTERNAL) +DEF_OP(RESIZE_NEAREST_INTERNAL) +DEF_OP(DEPTH2SPACE_INTERNAL) +DEF_OP(GRUCELL_ACTIVATION_INTERNAL) +DEF_OP(GRUCELL_ACTIVATION_INTERNAL_SMA) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_gpu.h b/src/tim/vx/internal/include/kernel/vsi_nn_gpu.h new file mode 100644 index 0000000..3dc44d5 --- /dev/null +++ b/src/tim/vx/internal/include/kernel/vsi_nn_gpu.h @@ -0,0 +1,100 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_GPU_H +#define _VSI_NN_GPU_H + +#include "vsi_nn_gpu_config.h" + +#define gpu_min(x, y) (((x) <= (y)) ? (x) : (y)) + +#define gpu_max(x, y) (((x) >= (y)) ? (x) : (y)) + +#define gpu_postshift( x ) (gpu_min( x, GPU_MAX_POST_SHIFT_BITS)) +#define gpu_multiplier( x ) (gpu_min( x, GPU_MAX_MULTIPLIER_NUM)) + +// Alignment with a power of two value. +#define gpu_align_np2(n, align) (((n) + (align) - 1) - (((n) + (align) - 1) % (align))) + +#define gpu_align_np2_safe(n, align) \ +( \ + (gpu_align_np2((n) & ~0ULL, (align) & ~0ULL) ^ gpu_align_np2(n, align)) ? 
\ + (n) : gpu_align_np2(n, align) \ +) +#define gpu_align_p2(n, align) ((n) + ((align) - 1)) & ~((align) - 1) + +#define GPU_MAX_DIMENSION_SIZE (3) + +typedef enum +{ + GPU_DP_TYPE_16, + GPU_DP_TYPE_32, +} gpu_dp_type_e; + +typedef struct +{ + // 512 byte data + uint32_t data[16]; + gpu_dp_type_e type; +} gpu_dp_inst_t; + +typedef struct +{ + uint32_t dim; + size_t global_offset[GPU_MAX_DIMENSION_SIZE]; + size_t global_scale[GPU_MAX_DIMENSION_SIZE]; + size_t local_size[GPU_MAX_DIMENSION_SIZE]; + size_t global_size[GPU_MAX_DIMENSION_SIZE]; +} gpu_param_t; + +void gpu_dp_inst_update_postshfit + ( + gpu_dp_inst_t * dp_inst, + int32_t shift + ); + +void gpu_dp_inst_update_multiplier + ( + gpu_dp_inst_t * dp_inst, + int32_t start, + int32_t end, + int32_t multiplier + ); + +void gpu_quantize_multiplier_16bit + ( + double double_multipier, + uint16_t * quantize_multiplier, + int32_t * shift + ); + +void gpu_quantize_multiplier_32bit + ( + double double_multipier, + uint32_t * quantize_multiplier, + int32_t * shift + ); + +#endif + diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_gpu_config.h b/src/tim/vx/internal/include/kernel/vsi_nn_gpu_config.h new file mode 100644 index 0000000..a7ce5e3 --- /dev/null +++ b/src/tim/vx/internal/include/kernel/vsi_nn_gpu_config.h @@ -0,0 +1,34 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
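The alignment helpers in vsi_nn_gpu.h above come in two flavors: gpu_align_p2 rounds up to a power-of-two alignment with a mask, while gpu_align_np2 rounds up to an arbitrary alignment using a modulo, and gpu_align_np2_safe appears to fall back to the unaligned value when the 64-bit and native computations disagree (an overflow guard). A small standalone check of the arithmetic, restating the two basic macros exactly as defined above:

```cpp
// Standalone restatement of two macros from vsi_nn_gpu.h, to check the math.
#define gpu_align_np2(n, align) (((n) + (align) - 1) - (((n) + (align) - 1) % (align)))
#define gpu_align_p2(n, align) ((n) + ((align) - 1)) & ~((align) - 1)

static_assert(gpu_align_np2(13, 6) == 18, "rounds 13 up to a multiple of 6");
static_assert(gpu_align_np2(12, 6) == 12, "already-aligned values are unchanged");
static_assert((gpu_align_p2(13, 4)) == 16, "power-of-two variant uses a mask");
```

Note that gpu_align_p2 does not parenthesize its whole expansion, so comparisons against its result need the extra parentheses shown above.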
+* +*****************************************************************************/ + +#ifndef _VSI_NN_GPU_CONFIG_H +#define _VSI_NN_GPU_CONFIG_H + +#define GPU_TENSOR_MAX_WIDTH (65536) +#define GPU_MAX_MULTIPLIER_NUM (65535) +#define GPU_MAX_POST_SHIFT_BITS (31) +#define GPU_TENSOR_DIM_2 (2) + +#endif + diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h new file mode 100644 index 0000000..c5c8b2c --- /dev/null +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -0,0 +1,908 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_KERNEL_H +#define _VSI_NN_KERNEL_H + +#include +#include "vsi_nn_log.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_daemon.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_shape_util.h" +#include "utils/vsi_nn_hashmap.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_gpu.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/** Kernel types */ +typedef enum +{ + VSI_NN_KERNEL_TYPE_CPU = 0, + VSI_NN_KERNEL_TYPE_EVIS, + VSI_NN_KERNEL_TYPE_CL, + VSI_NN_KERNEL_TYPE_VX, + VSI_NN_KERNEL_TYPE_NUM, + VSI_NN_KERNEL_TYPE_NONE = VSI_NN_KERNEL_TYPE_NUM +} vsi_nn_kernel_type_e; + +/** Kernel pirority */ +enum +{ + VSI_NN_KERNEL_PIRORITY_DISABLE = 0, + VSI_NN_KERNEL_PIRORITY_NORMAL_LIMIT = 0x1FFFFFFF, + VSI_NN_KERNEL_PIRORITY_FORCE_EXEC = 0x20000000, +}; + +/** Kernel internal data type */ +typedef enum +{ + I8 = 0, + I16, + I32, + I64, + U8, + U16, + U32, + U64, + F16, + F32, + F64, + BF16, + BOOL8 +} vsi_nn_kernel_dtype_e; + +typedef enum +{ + VSI_NN_KERNEL_QUANT_NONE, + VSI_NN_KERNEL_QUANT_DFP, + VSI_NN_KERNEL_QUANT_ASYMM, + VSI_NN_KERNEL_QUANT_ASYMM_PERCHANNEL, + VSI_NN_KERNEL_QUANT_SYMM, + VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL, + VSI_NN_KERNEL_QUANT_TYPE_NUM +} vsi_nn_kernel_quant_type_e; + +/** GPU source format */ +typedef enum +{ + VSI_NN_GPU_SOURCE_FMT_CODE = 0, + VSI_NN_GPU_SOURCE_FMT_EXECUTABLE = 1, + VSI_NN_GPU_SOURCE_FMT_NUM +} vsi_nn_gpu_source_fmt_e; + +typedef char * vsi_nn_kernel_source_t; +typedef uint32_t vsi_nn_kernel_unique_id_t; + +typedef struct +{ + char * data; +} vsi_nn_kernel_build_option_t; + +typedef struct +{ + size_t num; + vsi_nn_kernel_source_t * 
data; + vsi_nn_kernel_build_option_t build_option; +} vsi_nn_kernel_source_info_t; + +typedef struct +{ + vsi_nn_kernel_type_e type; + vsi_nn_kernel_unique_id_t unique_id; + vx_kernel_description_t info; + struct + { + vsi_nn_kernel_source_info_t sources[VSI_NN_GPU_SOURCE_FMT_NUM]; + vsi_nn_gpu_source_fmt_e active_source_fmt; + } gpu; +} vsi_nn_kernel_t; + +typedef struct +{ + int32_t fl; +} vsi_nn_kernel_quant_dfp_t; + +typedef struct +{ + float scale; + int32_t zero_point; +} vsi_nn_kernel_quant_asymm_t; + +typedef struct +{ + vsi_float_array_t * scale; + vsi_int_array_t * zero_point; + int32_t channel_dim; +} vsi_nn_kernel_quant_asymm_perchannel_t; + +typedef struct +{ + vsi_nn_kernel_dtype_e dtype; + vsi_int_array_t * shape; + vsi_nn_kernel_quant_type_e quant; + union + { + vsi_nn_kernel_quant_dfp_t dfp; + vsi_nn_kernel_quant_asymm_t asymm; + vsi_nn_kernel_quant_asymm_perchannel_t asymm_v; + }; +} vsi_nn_kernel_tensor_attr_t; + +typedef struct +{ + vsi_nn_kernel_type_e kernel_type; + int32_t fps; +} vsi_nn_kernel_pirority_t; + +typedef struct +{ + vsi_nn_kernel_pirority_t pirority[VSI_NN_KERNEL_TYPE_NUM]; + int32_t allow_kernel_num; +} vsi_nn_kernel_selector_t; + +typedef void * vsi_nn_kernel_node_param_t; + +typedef void * vsi_nn_kernel_tensor_t; + +typedef void * vsi_nn_kernel_node_t; + +typedef void * vsi_nn_kernel_graph_t; + +typedef void * vsi_nn_kernel_scalar_t; + +typedef vsi_nn_hashmap_t vsi_nn_kernel_param_t; + +typedef vsi_nn_kernel_node_t (* vsi_nn_kernel_setup_func_t) + ( + vsi_nn_graph_t *, + vsi_nn_tensor_t **, + size_t input_num, + vsi_nn_tensor_t **, + size_t output_num, + const vsi_nn_kernel_param_t *, + vsi_nn_kernel_t * + ); + +typedef vsi_status (* vsi_nn_kernel_selector_func_t) + ( + vsi_nn_graph_t *, + vsi_nn_tensor_t **, + size_t input_num, + vsi_nn_tensor_t **, + size_t output_num, + const vsi_nn_kernel_param_t *, + vsi_nn_kernel_selector_t * + ); + +typedef struct +{ + vsi_nn_kernel_unique_id_t unique_id; + vsi_nn_kernel_setup_func_t setup[VSI_NN_KERNEL_TYPE_NUM]; + vsi_nn_kernel_selector_func_t select; +} vsi_nn_kernel_backend_t; + +vsi_nn_kernel_param_t * vsi_nn_kernel_param_create(); + +void vsi_nn_kernel_param_release( vsi_nn_kernel_param_t ** params ); + +void vsi_nn_kernel_param_clear( vsi_nn_kernel_param_t * params ); + +vsi_bool vsi_nn_kernel_param_add_int32 + ( vsi_nn_kernel_param_t * params, const char * key, int32_t value); + +int32_t vsi_nn_kernel_param_get_int32 + ( const vsi_nn_kernel_param_t * params, const char * key); + +vsi_bool vsi_nn_kernel_param_add_int64 + ( vsi_nn_kernel_param_t * params, const char * key, int64_t value); + +int64_t vsi_nn_kernel_param_get_int64 + ( const vsi_nn_kernel_param_t * params, const char * key); + +vsi_bool vsi_nn_kernel_param_add_float32 + ( vsi_nn_kernel_param_t * params, const char * key, float value); + +float vsi_nn_kernel_param_get_float32 + ( const vsi_nn_kernel_param_t * params, const char * key); + +vsi_bool vsi_nn_kernel_param_add_str + ( vsi_nn_kernel_param_t * params, const char * key, const char * str); + +const char * vsi_nn_kernel_param_get_str + ( const vsi_nn_kernel_param_t * params, const char * key); + +vsi_bool vsi_nn_kernel_param_add_buffer + ( vsi_nn_kernel_param_t * params, const char * key, void * buf, size_t size); + +void * vsi_nn_kernel_param_get_buffer + ( const vsi_nn_kernel_param_t * params, const char * key, size_t * size); + +/** Kernel register */ +#define REGISTER_KERNEL_BACKEND(kernel_name, kernel_type, func) \ + _INITIALIZER(_register_kernel_##kernel_name##_##kernel_type) \ + 
{ \ + vsi_nn_kernel_backend_register( \ + ""#kernel_name, \ + VSI_NN_KERNEL_TYPE_##kernel_type, func ); \ + } +#define REGISTER_KERNEL_SELECTOR(kernel_name, func) \ + _INITIALIZER(_register_kernel_##kernel_name##_selector) \ + { \ + vsi_nn_kernel_selector_register( \ + ""#kernel_name, func ); \ + } + +#if 0 + typedef struct + { + const char* name; + vsi_nn_op_t op; + vsi_nn_kernel_type_e kernel_type; + vsi_nn_kernel_setup_func_t func; + } vsi_nn_kernel_section_meta_t; + #define REGISTER_KERNEL_BACKEND(operation, kernel_type, func) \ + static vsi_nn_kernel_section_meta_t _kernel_meta = \ + {""#operation, VSI_NN_OP_##operation, VSI_NN_KERNEL_TYPE_##kernel_type, func}; \ + static vsi_nn_kernel_section_meta_t* _kernel_meta_ptr \ + __attribute__((section("kernel_meta_section"))) = &_kernel_meta; +#endif +#if 0 + #define REGISTER_KERNEL_BACKEND(operation, kernel_type, func) \ + vsi_status func##_(vsi_nn_graph_t* graph, \ + vsi_nn_tensor_t** inputs, size_t input_num, \ + vsi_nn_tensor_t** outputs, size_t output_num) {\ + return func(graph, inputs, input_num, outputs, output_num); \ + } + + #define REGISTER_KERNEL_BACKEND_MANUALLY(operation, kernel_type, func) \ + extern vsi_status func##_(vsi_nn_graph_t*, \ + vsi_nn_tensor_t** inputs, size_t input_num, \ + vsi_nn_tensor_t** outputs, size_t output_num); \ + vsi_nn_kernel_backend_register( ""#operation, \ + VSI_NN_KERNEL_TYPE_##kernel_type, func##_ ); +#endif + +#define REGISTER_BACKEND_CL(operation, func) \ + REGISTER_KERNEL_BACKEND(operation, CL, func) +#define REGISTER_BACKEND_EVIS(operation, func) \ + REGISTER_KERNEL_BACKEND(operation, EVIS, func) +#define REGISTER_BACKEND_CPU(operation, func) \ + REGISTER_KERNEL_BACKEND(operation, CPU, func) +#define REGISTER_BACKEND_OPENVX(operation, func) \ + REGISTER_KERNEL_BACKEND(operation, VX, func) + +#define DEF_KERNEL_BASE_CALLBACK( NAME ) \ + static vsi_status NAME##_impl( vsi_nn_kernel_node_t node, \ + const vsi_nn_kernel_node_param_t * param, \ + size_t param_size ); \ + static vx_status VX_CALLBACK NAME( \ + vx_node node, const vx_reference * param,\ + vx_uint32 param_size) {\ + return (vx_status)NAME##_impl( \ + (vsi_nn_kernel_node_t)node, \ + (const vsi_nn_kernel_node_param_t *)param, \ + (uint32_t)param_size \ + ); \ + } \ + static vsi_status NAME##_impl + +#define DEF_KERNEL_INITIALIZER( NAME ) DEF_KERNEL_BASE_CALLBACK( NAME ) +#define DEF_KERNEL_EXECUTOR( NAME ) DEF_KERNEL_BASE_CALLBACK( NAME ) +#define DEF_KERNEL_DEINITIALIZER( NAME ) DEF_KERNEL_BASE_CALLBACK( NAME ) + +void vsi_nn_kernel_backend_register + ( + const char * kernel_name, + vsi_nn_kernel_type_e kernel_type, + vsi_nn_kernel_setup_func_t setup_func + ); + +const vsi_nn_kernel_backend_t * vsi_nn_kernel_backend_get + ( const char * ); + +vsi_status vsi_nn_kernel_backend_init( void ); + +void vsi_nn_kernel_backend_deinit( void ); + +void vsi_nn_kernel_selector_register + ( + const char * kernel_name, + vsi_nn_kernel_selector_func_t selecotr_func + ); + +vsi_status vsi_nn_kernel_pirority_set + ( + vsi_nn_kernel_selector_t * selector, + const vsi_nn_kernel_pirority_t * pirority, + size_t pirority_size + ); + +vsi_nn_kernel_t * vsi_nn_kernel_create + ( + vsi_nn_kernel_type_e type + ); + +void vsi_nn_kernel_reset + ( + vsi_nn_kernel_t * kernel, + vsi_nn_kernel_type_e type + ); + +void vsi_nn_kernel_release + ( + vsi_nn_kernel_t ** kernel + ); + +void vsi_nn_kernel_add_source + ( + vsi_nn_kernel_t * kernel, + vsi_nn_gpu_source_fmt_e fmt, + size_t source_num, + ... 
+ ); + +void vsi_nn_kernel_add_build_option + ( + vsi_nn_kernel_t * kernel, + const char * option + ); + +vsi_nn_kernel_tensor_t vsi_nn_kernel_tensor_create + ( + vsi_nn_kernel_graph_t graph, + const vsi_nn_kernel_tensor_attr_t* attr, + vsi_bool is_virtual + ); + +void vsi_nn_kernel_tensor_release + ( + vsi_nn_kernel_tensor_t * tensor + ); + +vsi_nn_kernel_tensor_t vsi_nn_kernel_tensor_reshape + ( + vsi_nn_kernel_tensor_t tensor, + int32_t * shape, + uint32_t rank + ); + +vsi_status vsi_nn_kernel_node_pass_param + ( + vsi_nn_kernel_node_t node, + vsi_nn_kernel_node_param_t * params, + size_t num + ); + +static inline void vsi_nn_kernel_node_release + ( + vsi_nn_kernel_node_t * node + ) +{ + if( node && *node ) + { + vxReleaseNode( (vx_node*)node ); + } +} + +static inline void vsi_nn_kernel_node_pack_io + ( + vsi_nn_kernel_node_param_t * params, + size_t param_num, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num + ) +{ + size_t i; + size_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < input_num && cnt < param_num; i ++, cnt ++ ) + { + if( inputs[i] ) + { + params[cnt] = (vsi_nn_kernel_node_param_t)(inputs[i]->t); + } + else + { + params[cnt] = NULL; + } + } + + /* Set outputs */ + for( i = 0; i < output_num && cnt < param_num; i ++, cnt ++ ) + { + if( outputs[i] ) + { + params[cnt] = (vsi_nn_kernel_node_param_t)(outputs[i]->t); + } + else + { + params[cnt] = NULL; + } + } +} /* vsi_nn_kernel_node_pack_io() */ + +/** Kernel selector */ +vsi_nn_kernel_node_t vsi_nn_kernel_selector + ( + vsi_nn_graph_t * graph, + const char * kernel_name, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params + ); + +/** Map data type to gpu internal dtype. 
*/ +static inline vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype + ( + vsi_nn_type_e dtype + ) +{ + switch( dtype ) + { + case VSI_NN_TYPE_INT8: + return I8; + case VSI_NN_TYPE_BOOL8: + return BOOL8; + case VSI_NN_TYPE_INT16: + return I16; + case VSI_NN_TYPE_INT32: + return I32; + case VSI_NN_TYPE_INT64: + return I64; + case VSI_NN_TYPE_UINT8: + return U8; + case VSI_NN_TYPE_UINT16: + return U16; + case VSI_NN_TYPE_UINT32: + return U32; + case VSI_NN_TYPE_FLOAT16: + return F16; + case VSI_NN_TYPE_BFLOAT16: + return BF16; + case VSI_NN_TYPE_FLOAT32: + return F32; + default: + VSILOGE("error data type %d", dtype); + break; + } + return I8; +} /* vsi_nn_kernel_map_dtype() */ + +static inline vsi_nn_type_e vsi_nn_dtype_map_kernel + ( + vsi_nn_kernel_dtype_e dtype + ) +{ + switch( dtype ) + { + case I8: + return VSI_NN_TYPE_INT8; + case BOOL8: + return VSI_NN_TYPE_BOOL8; + case I16: + return VSI_NN_TYPE_INT16; + case I32: + return VSI_NN_TYPE_INT32; + case I64: + return VSI_NN_TYPE_INT64; + case U8: + return VSI_NN_TYPE_UINT8; + case U16: + return VSI_NN_TYPE_UINT16; + case U32: + return VSI_NN_TYPE_UINT32; + case F16: + return VSI_NN_TYPE_FLOAT16; + case BF16: + return VSI_NN_TYPE_BFLOAT16; + case F32: + return VSI_NN_TYPE_FLOAT32; + default: + VSILOGE("error data type %d", dtype); + break; + } + return VSI_NN_TYPE_INT8; +} /* vsi_nn_kernel_map_dtype() */ + +static inline size_t vsi_nn_kernel_dtype_get_bytes + ( + vsi_nn_kernel_dtype_e dtype + ) +{ + switch( dtype ) + { + case I8: + case U8: + case BOOL8: + return sizeof(int8_t); + case I16: + case U16: + case F16: + case BF16: + return sizeof(int16_t); + case I32: + case U32: + case F32: + return sizeof(int32_t); + case I64: + return sizeof(int64_t); + default: + VSILOGE("Error data type %d", dtype); + break; + } + return 0; +} /* vsi_nn_kernel_dtype_get_bytes() */ + +static inline vsi_nn_kernel_quant_type_e vsi_nn_kernel_map_quant_type + ( vsi_nn_qnt_type_e quant_type ) +{ + switch( quant_type ) + { + case VSI_NN_QNT_TYPE_DFP: + return VSI_NN_KERNEL_QUANT_DFP; + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + return VSI_NN_KERNEL_QUANT_ASYMM; + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: + return VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL; + default: + break; + } + return VSI_NN_KERNEL_QUANT_NONE; +} /* vsi_nn_kernel_map_quant_type() */ + +vsi_nn_kernel_node_t vsi_nn_kernel_create_node + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_t * kernel + ); + +vsi_status vsi_nn_kernel_node_set_border + (vsi_nn_kernel_node_t node, + vx_border_t* border); + +vsi_nn_kernel_scalar_t vsi_nn_kernel_scalar_create + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_dtype_e dtype, + const void * data + ); + +static inline void vsi_nn_kernel_scalar_release + ( vsi_nn_kernel_scalar_t * scalar ) +{ + if( scalar && *scalar ) + { + vxReleaseScalar( (vx_scalar*)scalar ); + } +} /* vsi_nn_kernel_scalar_relase() */ + +vsi_status vsi_nn_kernel_scalar_read_int8 + ( vsi_nn_kernel_scalar_t scalar, int8_t * out_data ); + +vsi_status vsi_nn_kernel_scalar_read_int32 + ( vsi_nn_kernel_scalar_t scalar, int32_t * out_data ); + +vsi_status vsi_nn_kernel_scalar_read_int64 + ( vsi_nn_kernel_scalar_t scalar, int64_t * out_data ); + +vsi_status vsi_nn_kernel_scalar_read_uint8 + ( vsi_nn_kernel_scalar_t scalar, uint8_t * out_data ); + +vsi_status vsi_nn_kernel_scalar_read_uint32 + ( vsi_nn_kernel_scalar_t scalar, uint32_t * out_data ); + +vsi_status vsi_nn_kernel_scalar_read_float32 + ( vsi_nn_kernel_scalar_t scalar, float * out_data ); + +vsi_status vsi_nn_kernel_scalar_read_float64 + ( 
vsi_nn_kernel_scalar_t scalar, double * out_data ); + +vsi_status vsi_nn_kernel_scalar_write_int8 + ( vsi_nn_kernel_scalar_t scalar, int8_t out_data ); + +vsi_status vsi_nn_kernel_scalar_write_int32 + ( vsi_nn_kernel_scalar_t scalar, int32_t out_data ); + +vsi_status vsi_nn_kernel_scalar_write_int64 + ( vsi_nn_kernel_scalar_t scalar, int64_t out_data ); + +vsi_status vsi_nn_kernel_scalar_write_uint8 + ( vsi_nn_kernel_scalar_t scalar, uint8_t out_data ); + +vsi_status vsi_nn_kernel_scalar_write_uint32 + ( vsi_nn_kernel_scalar_t scalar, uint32_t out_data ); + +vsi_status vsi_nn_kernel_scalar_write_float32 + ( vsi_nn_kernel_scalar_t scalar, float out_data ); + +vsi_status vsi_nn_kernel_scalar_write_float64 + ( vsi_nn_kernel_scalar_t scalar, double out_data ); + +vsi_status vsi_nn_kernel_scalar_get_dtype + ( + vsi_nn_kernel_scalar_t scalar, + vsi_nn_kernel_dtype_e * dtype + ); + +vsi_status vsi_nn_kernel_register + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_t * kernel + ); + +vsi_bool vsi_nn_kernel_gpu_check_shape + ( const int32_t * shape, size_t rank ); + +vsi_status vsi_nn_kernel_gpu_add_param + ( + vsi_nn_kernel_node_t node, + const char * param_key, + void * data + ); + +vsi_status vsi_nn_kernel_gpu_config + ( + vsi_nn_kernel_node_t node, + const gpu_param_t * gpu_param + ); + +vsi_nn_kernel_tensor_attr_t * vsi_nn_kernel_tensor_attr_create + ( vsi_nn_kernel_tensor_t tensor ); + +void vsi_nn_kernel_tensor_attr_release + ( vsi_nn_kernel_tensor_attr_t ** attr ); + +/* + * Create a buffer with a copy of tensor data. + * attr is optional + */ +void * vsi_nn_kernel_tensor_create_buffer + ( + vsi_nn_kernel_tensor_t tensor, + const vsi_nn_kernel_tensor_attr_t * attr, + vsi_bool convert_to_float + ); + +/* + * Read tensor data to buffer. + * attr is optional + */ +vsi_status vsi_nn_kernel_tensor_read + ( + vsi_nn_kernel_tensor_t tensor, + const vsi_nn_kernel_tensor_attr_t * attr, + void * out_buffer, + size_t out_buffer_size + ); + +/* + * Write float data to tensor. + * attr is optional + */ +vsi_status vsi_nn_kernel_tensor_write_from_float + ( + vsi_nn_kernel_tensor_t tensor, + const vsi_nn_kernel_tensor_attr_t * attr, + const float * float_buffer, + size_t size + ); + +/* + * Write data to tensor. 
+ * attr is optional + */ +vsi_status vsi_nn_kernel_tensor_write + ( + vsi_nn_kernel_tensor_t tensor, + const vsi_nn_kernel_tensor_attr_t * attr, + const void * buffer, + size_t size + ); + +static inline size_t vsi_nn_kernel_tensor_attr_get_size + ( const vsi_nn_kernel_tensor_attr_t * attr ) +{ + if( !attr ) + { + return 0; + } + return vsi_nn_shape_get_size( attr->shape->data, attr->shape->size ); +} /* vsi_nn_kernel_tensor_attr_get_size() */ + +static inline size_t vsi_nn_kernel_tensor_attr_get_bytes + ( const vsi_nn_kernel_tensor_attr_t * attr ) +{ + size_t size; + size_t type_bytes; + if( !attr ) + { + return 0; + } + size = vsi_nn_kernel_tensor_attr_get_size( attr ); + type_bytes = vsi_nn_kernel_dtype_get_bytes( attr->dtype ); + return size * type_bytes; +} /* vsi_nn_kernel_tensor_attr_get_bytes() */ + +static inline void vsi_nn_kernel_tensor_attr_get_stride + ( const vsi_nn_kernel_tensor_attr_t * attr, size_t * out_stride) +{ + if( !attr || !out_stride ) + { + return; + } + vsi_nn_shape_get_stride( attr->shape->data, attr->shape->size, out_stride ); +} /* vsi_nn_kernel_tensor_attr_get_size() */ + +static inline vsi_bool vsi_nn_kernel_tensor_attr_is_quantized + ( const vsi_nn_kernel_tensor_attr_t * attr ) +{ + return ( attr && attr->quant > VSI_NN_KERNEL_QUANT_NONE + && attr->quant < VSI_NN_KERNEL_QUANT_TYPE_NUM + && attr->dtype != F16 + && attr->dtype != BF16 + && attr->dtype != F32 + && attr->dtype != F64 ); +} /* vsi_nn_kernel_tensor_attr_is_quantized() */ + +//TODO: Make vsi_nn_kernel_dtype_e to public and move dtype functions to vsi_nn_dtype.h +vsi_bool vsi_nn_dtype_convert_float_to_dtype + ( + const float * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + void * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_float_to_quantize_asymm + ( + const float * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + float scale, int32_t zero_point, + void * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_float_to_quantize_dfp + ( + const float * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + int32_t fl, + void * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm + ( + const float * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + float scale, int32_t zero_point, + void * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm_perchannel + ( + const float * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + const int32_t * shape, size_t rank, + const float * scale, size_t scale_size, + const int32_t * zero_point, size_t zero_point_size, + int32_t channel_dim, + void * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_dtype_to_float + ( + const void * buffer, + size_t size, + vsi_nn_kernel_dtype_e dtype, + float * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_quantize_asymm_to_float + ( + const void * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + float scale, int32_t zero_point, + float * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_quantize_dfp_to_float + ( + const void * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + int32_t fl, + float * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_quantize_symm_to_float + ( + const void * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + float scale, int32_t zero_point, + float * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_quantize_symm_perchannel_to_float + ( + const void * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + const int32_t * shape, size_t rank, + const float * scale, size_t scale_size, + const int32_t * zero_point, size_t zero_point_size, + 
int32_t channel_dim, + float * out_buffer + ); + +vsi_nn_tensor_t* vsi_nn_pad_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + int32_t * pad_front, + int32_t * pad_end, + size_t pad_size, + vsi_nn_pad_mode_e mode, + float pad_value + ); + +vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias + ); + +static inline const char* vsi_nn_kernel_type_str + ( + vsi_nn_kernel_type_e type + ) +{ + switch( type ) + { + case VSI_NN_KERNEL_TYPE_CPU: + return "CPU"; + case VSI_NN_KERNEL_TYPE_EVIS: + return "EVIS"; + case VSI_NN_KERNEL_TYPE_CL: + return "CL"; + case VSI_NN_KERNEL_TYPE_VX: + return "OPENVX"; + default: + break; + } + return "None"; +} /* vsi_nn_kernel_type_str() */ + +__END_DECLS + +#endif diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_eltwise.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_eltwise.h new file mode 100644 index 0000000..fee8075 --- /dev/null +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_eltwise.h @@ -0,0 +1,49 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
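vsi_nn_kernel.h above exposes vsi_nn_kernel_param_t as a small typed key-value store (backed by vsi_nn_hashmap_t) that callers use to hand operation attributes to a kernel setup function. A minimal sketch of the create/add/get/release cycle, using placeholder key names rather than keys any particular kernel defines:

```cpp
// Sketch of the vsi_nn_kernel_param_t accessors declared above; the key names
// "axis" and "scale" are placeholders, real kernels define their own keys.
#include "kernel/vsi_nn_kernel.h"

void kernel_param_roundtrip(void) {
  vsi_nn_kernel_param_t* params = vsi_nn_kernel_param_create();

  vsi_nn_kernel_param_add_int32(params, "axis", 2);
  vsi_nn_kernel_param_add_float32(params, "scale", 0.5f);

  int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
  float scale = vsi_nn_kernel_param_get_float32(params, "scale");
  (void)axis;
  (void)scale;

  vsi_nn_kernel_param_release(&params);
}
```

The same header also shows how backends register themselves: REGISTER_BACKEND_CPU/CL/EVIS/OPENVX expand to REGISTER_KERNEL_BACKEND, which calls vsi_nn_kernel_backend_register() from an _INITIALIZER hook, and vsi_nn_kernel_selector() later appears to pick among the registered kernel types according to the priority table.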
+* +*****************************************************************************/ + +#ifndef _VSI_NN_KERNEL_ELTWISE_H +#define _VSI_NN_KERNEL_ELTWISE_H + +#include +#include "kernel/vsi_nn_kernel.h" + +vsi_bool vsi_nn_kernel_optimize_eltwise_shape + ( + const int32_t* shape_x, const size_t rank_x, + const int32_t* shape_y, const size_t rank_y, + const int32_t* shape_output, const size_t rank_output, + int32_t* out_shape_x, int32_t* out_shape_y, + int32_t* out_shape_output, uint32_t* out_rank_output + ); + +vsi_bool vsi_nn_kernel_optimize_broadcast_shape + ( + const int32_t** shape_in, const size_t* rank_in, + const int32_t input_num, + const int32_t* shape_output, const size_t rank_output, + int32_t** out_shape_in, + int32_t* out_shape_output, uint32_t* out_rank_output + ); + +#endif diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h new file mode 100644 index 0000000..bf2b95d --- /dev/null +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h @@ -0,0 +1,62 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
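
`vsi_nn_kernel_optimize_eltwise_shape`, declared above, collapses broadcast-compatible dimensions so an element-wise GPU kernel can run over the smallest equivalent shapes. A minimal calling sketch follows; the function name `example_collapse_eltwise_shapes`, the sample shapes, and the 8-element output arrays are illustrative assumptions.

```c
#include <stdint.h>
#include "kernel/vsi_nn_kernel_eltwise.h"

/* Sketch: fold a broadcasting element-wise op into merged shapes before
 * binding GPU kernel parameters. Shapes are example data only. */
void example_collapse_eltwise_shapes( void )
{
    const int32_t shape_x[]   = { 32, 1, 16, 2 };
    const int32_t shape_y[]   = { 32, 8, 16, 2 };
    const int32_t shape_out[] = { 32, 8, 16, 2 };
    int32_t  opt_x[8], opt_y[8], opt_out[8];  /* 8 dims chosen for illustration */
    uint32_t opt_rank = 0;

    if( vsi_nn_kernel_optimize_eltwise_shape(
            shape_x, 4, shape_y, 4, shape_out, 4,
            opt_x, opt_y, opt_out, &opt_rank ) )
    {
        /* opt_x/opt_y/opt_out now hold the merged shapes of rank opt_rank. */
    }
}
```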
+* +*****************************************************************************/ + +#ifndef _VSI_NN_KERNEL_GPU_SHAPE_OPTIMIZE_H +#define _VSI_NN_KERNEL_GPU_SHAPE_OPTIMIZE_H + +#include +#include "kernel/vsi_nn_kernel.h" + +vsi_bool vsi_nn_kernel_optimize_reduce_shape + ( + const int32_t* shape_x, const size_t rank_x, + const int32_t *axis, const size_t axis_size, + const int32_t* shape_output, const size_t rank_output, + int32_t* out_shape_x, uint32_t* out_rank_x, + int32_t* out_shape_output, uint32_t* out_rank_output, + int32_t* out_axis, uint32_t* out_axis_size + ); + +vsi_bool vsi_nn_kernel_optimize_element_shape + ( + const int32_t* shape_x, const size_t rank_x, + int32_t* out_shape_x, int32_t* out_rank_x + ); + +vsi_bool vsi_nn_kernel_optimize_softmax_shape + ( + const int32_t* shape_x, const size_t rank_x, const int32_t axis, + int32_t* out_shape_x, uint32_t* out_rank_x,int32_t* out_axis + ); + +vsi_bool vsi_nn_kernel_optimize_tile_shape + ( + const int32_t* shape_x, const size_t rank_x, + const int32_t* multiples, const size_t rank, + const int32_t* shape_output, const size_t rank_output, + int32_t* out_shape_x, int32_t* out_shape_y, + int32_t* out_shape_output, uint32_t* out_rank_output + ); + +#endif diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_node.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_node.h new file mode 100644 index 0000000..e682641 --- /dev/null +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_node.h @@ -0,0 +1,48 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
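
Of the shape optimizers above, `vsi_nn_kernel_optimize_softmax_shape` squeezes a softmax input down to the dimensions the GPU kernel actually needs and remaps the axis accordingly. A minimal calling sketch, with illustrative shapes, array sizes, and function name, is shown below.

```c
#include <stdint.h>
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

/* Sketch: collapse a rank-4 softmax input (axis 0) to an optimized shape. */
void example_optimize_softmax_shape( void )
{
    const int32_t shape[] = { 10, 1, 1, 4 };  /* example input shape */
    int32_t  opt_shape[8];                    /* 8 dims chosen for illustration */
    uint32_t opt_rank = 0;
    int32_t  opt_axis = 0;

    if( vsi_nn_kernel_optimize_softmax_shape(
            shape, 4, 0, opt_shape, &opt_rank, &opt_axis ) )
    {
        /* opt_shape/opt_rank/opt_axis describe the collapsed problem. */
    }
}
```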
+* +*****************************************************************************/ + +#ifndef _VSI_NN_KERNEL_NODE_H +#define _VSI_NN_KERNEL_NODE_H + +#include +#include +#include +#include "vsi_nn_prv.h" +#include "vsi_nn_types.h" +#include "kernel/vsi_nn_kernel.h" + +vsi_nn_kernel_tensor_t kernel_pad_node + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_tensor_t tensor, + int32_t * pad_front, + int32_t * pad_end, + size_t pad_size, + vsi_nn_pad_mode_e mode, + int32_t pad_value, + vsi_nn_kernel_node_t * out_node + ); + +#endif + diff --git a/src/tim/vx/internal/include/libnnext/vsi_nn_libnnext_resource.h b/src/tim/vx/internal/include/libnnext/vsi_nn_libnnext_resource.h new file mode 100644 index 0000000..9535133 --- /dev/null +++ b/src/tim/vx/internal/include/libnnext/vsi_nn_libnnext_resource.h @@ -0,0 +1,50 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +/* WARNING! AUTO-GENERATED, DO NOT MODIFY MANUALLY */ + +#ifndef _VSI_NN_LIBNNEXT_RESOURCE_H +#define _VSI_NN_LIBNNEXT_RESOURCE_H + +#include "kernel/vsi_nn_kernel.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Load gpu source code + */ +const char* vsi_nn_resource_load_source_code + ( + const char* source_name, + size_t* size, + vsi_nn_kernel_type_e type + ); + +#ifdef __cplusplus +} +#endif + +#endif /* _VSI_NN_LIBNNEXT_RESOURCE_H */ diff --git a/src/tim/vx/internal/include/libnnext/vx_bin/vxc_binaries.h b/src/tim/vx/internal/include/libnnext/vx_bin/vxc_binaries.h new file mode 100644 index 0000000..3ae95af --- /dev/null +++ b/src/tim/vx/internal/include/libnnext/vx_bin/vxc_binaries.h @@ -0,0 +1,48 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
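
`vsi_nn_resource_load_source_code`, declared above, returns the embedded GPU program text registered under a given name for a given kernel type. A minimal lookup sketch follows; the resource name `"example_kernel_source"` and the wrapper function name are placeholders, not real entries.

```c
#include <stddef.h>
#include "libnnext/vsi_nn_libnnext_resource.h"

/* Sketch: fetch embedded EVIS program source by name and check the result. */
void example_load_kernel_source( void )
{
    size_t size = 0;
    const char * source = vsi_nn_resource_load_source_code(
            "example_kernel_source", &size, VSI_NN_KERNEL_TYPE_EVIS );

    if( source && size > 0 )
    {
        /* Hand the program text to the OpenVX program/kernel creation path. */
    }
}
```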
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +/* WARNING! AUTO-GENERATED, DO NOT MODIFY MANUALLY */ + +#ifndef __VXC_BINARIES_H__ +#define __VXC_BINARIES_H__ + +#ifndef _cnt_of_array +#define _cnt_of_array( arr ) (sizeof( arr )/sizeof( arr[0] )) +#endif + +typedef struct _vsi_nn_vx_bin_resource_item_type +{ + char const* name; + uint8_t const* data; + uint32_t len; +} vsi_nn_vx_bin_resource_item_type; + +const vsi_nn_vx_bin_resource_item_type vx_bin_resource_items[] = +{ + {NULL, NULL, 0}, +}; + +const int vx_bin_resource_items_cnt = _cnt_of_array(vx_bin_resource_items); + +#endif diff --git a/src/tim/vx/internal/include/libnnext/vx_lib_nnext.h b/src/tim/vx/internal/include/libnnext/vx_lib_nnext.h new file mode 100644 index 0000000..4941769 --- /dev/null +++ b/src/tim/vx/internal/include/libnnext/vx_lib_nnext.h @@ -0,0 +1,858 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#pragma once +#ifndef _OPENVX_EXT_LIBNNEXT_H_ +#define _OPENVX_EXT_LIBNNEXT_H_ +#include +#include + +#define gcoMATH_MIN(X, Y) (((X) < (Y))?(X):(Y)) +#define gcoMATH_MAX(X, Y) (((X) > (Y))?(X):(Y)) +#define DIM_SIZE 4 + +#ifdef __cplusplus +extern "C" { +#endif + +#define VIVANTE_NAMESPACE "com.vivantecorp.extension" +#define CVIVANTE_NAMESPACE(str) (VIVANTE_NAMESPACE "." str) + +/** + * Assigned from Khronos, vendors control their own + */ +#define VX_LIBRARY_LIBNNEXT (0x3) +#define KERNEL_ID_OVXLIB_START (VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + 0x1) +#define KERNEL_ID_OVXLIB_RESERVED (KERNEL_ID_OVXLIB_START + 0xFFF) +/** + * Use kernel id placeholder to tell ovxlib + * generate a unique id for this kernel. + */ +#define KERNEL_ID_PLACEHOLDER (0x1000) + +/*! 
if there are more than 1 kernel in solution +the KERNEL_ENUM_LIBNNEXT_OFFSET must be modified keep different for any kernel +*/ +enum vx_kernel_libnnext_offset_e +{ + KERNEL_ENUM_LIBNNEXT_OFFSET = 1, + KERNEL_ENUM_PREMUTE_OFFSET, + KERNEL_ENUM_PRIORBOX_OFFSET = 2 + KERNEL_ENUM_PREMUTE_OFFSET, + KERNEL_ENUM_FLATTEN_OFFSET, + KERNEL_ENUM_L2NORMALIZESCALE_OFFSET, + KERNEL_ENUM_PARAMETRICRELU_OFFSET, + KERNEL_ENUM_PREBBOX_OFFSET = 3 + KERNEL_ENUM_PARAMETRICRELU_OFFSET, + KERNEL_ENUM_ADD_RELU_KERNEL_OFFSET, + KERNEL_ENUM_POOLING_WITH_ARGMAX_OFFSET, + KERNEL_ENUM_UNPOOLING_OFFSET = 2 + KERNEL_ENUM_POOLING_WITH_ARGMAX_OFFSET, + KERNEL_ENUM_ARGMAX_OFFSET = 2 + KERNEL_ENUM_UNPOOLING_OFFSET, + KERNEL_ENUM_ALEXNET_GEMM_OFFSET = 2 + KERNEL_ENUM_ARGMAX_OFFSET, + KERNEL_ENUM_IMG2COL_DILATED_OFFSET, + KERNEL_ENUM_IMG2COL_DILATED_INT8_OFFSET, + KERNEL_ENUM_ALEXNET_GEMM_INT8_OFFSET, + KERNEL_ENUM_ELTWISE_MAX, + KERNEL_ENUM_FULLYCONNECTED_AXIS2, + KERNEL_ENUM_TENSORCROP_INT16, + KERNEL_ENUM_TENSORCROP_INT8, + KERNEL_ENUM_TENSORCROP_INT16_FP16, + KERNEL_ENUM_DROPOUT, + KERNEL_ENUM_SHUFFLECHANNEL, + KERNEL_ENUM_RESIZE, + KERNEL_ENUM_REVERSE, + KERNEL_ENUM_RESIZE_16BITS_DOWNSAMPLE_QUARTER, + KERNEL_ENUM_RESIZE_8BITS_DOWNSAMPLE_QUARTER, + KERNEL_ENUM_SCALE, + KERNEL_ENUM_TENSORREVERSE, + KERNEL_ENUM_TENSORELU_OFFSET, + KERNEL_ENUM_SPACE2BATCH, + KERNEL_ENUM_BATCH2SPACE, + KERNEL_ENUM_SPACE2DEPTH, + KERNEL_ENUM_IMAGEPROCESS, + KERNEL_ENUM_SCALETOTENSOR, + KERNEL_ENUM_GEMM, + KERNEL_ENUM_LAYERNORM, + KERNEL_ENUM_LAYERNORMFP16TOU8_OFFSET, + KERNEL_ENUM_REDUCE, + KERNEL_ENUM_INSTANCENORM, + KERNEL_ENUM_TENSORSTACKCONCAT, + KERNEL_ENUM_TENSORSTACKCONCAT8BITS_OFFSET, + KERNEL_ENUM_SIGNALFRAME, + KERNEL_ENUM_RELATIONALOPS, + KERNEL_ENUM_SYNC_HOST, + KERNEL_ENUM_POW, + KERNEL_ENUM_FLOORDIV, + KERNEL_ENUM_SPATIAL_TRANSFORMER, + KERNEL_ENUM_LOGICAL_OPS, + KERNEL_ENUM_SELECT, + KERNEL_ENUM_LSTMUNIT_ACTIVATION, + KERNEL_ENUM_TENSOR_ADD_MEAN_STDDEV_NORM, + KERNEL_ENUM_STACK, + KERNEL_ENUM_GRAYSCALETOTENSOR, + KERNEL_ENUM_NEG, + KERNEL_ENUM_EXP, + KERNEL_ENUM_CLIP, + KERNEL_ENUM_PRE_PROCESS_GRAY, + KERNEL_ENUM_UNSTACK, + KERNEL_ENUM_PRE_PROCESS_RGB, + KERNEL_ENUM_ADDN, + KERNEL_ENUM_PRE_PROCESS_YUV420, + KERNEL_ENUM_CONV2D, + KERNEL_ENUM_EXTRA_ENDING, + KERNEL_ENUM_GATHER, + KERNEL_ENUM_TILE, + KERNEL_ENUM_TOPK, + KERNEL_ENUM_PRE_PROCESS_BGRA, + KERNEL_ENUM_LOGICAL_NOT, + KERNEL_ENUM_SIN, + KERNEL_ENUM_LOG, + KERNEL_ENUM_ARGMIN, + KERNEL_ENUM_ROI_ALIGN, + KERNEL_ENUM_HEATMAP_MAX_KEYPOINT, + KERNEL_ENUM_AXIS_ALIGNED_BBOX_TRANSFORM, + KERNEL_ENUM_BOX_WITH_NMS_LIMIT, + KERNEL_ENUM_GENERATE_PROPOSALS, + KERNEL_ENUM_DETECTION_POSTPROCESS, + KERNEL_ENUM_RANDOM_MULTINOMIAL, + KERNEL_ENUM_LOG_SOFTMAX, + KERNEL_ENUM_RELU_KERAS_INTERNAL, + KERNEL_ENUM_DECONV2D, + KERNEL_ENUM_REDUCEMAX_INTERNAL, + KERNEL_ENUM_REDUCEMIN_INTERNAL, + KERNEL_ENUM_REDUCEPROD_INTERNAL, + KERNEL_ENUM_REDUCEALL_INTERNAL, + KERNEL_ENUM_REDUCEANY_INTERNAL, + KERNEL_ENUM_RESIZE_INTERNAL, + KERNEL_ENUM_RESIZE_NEAREST_INTERNAL, + KERNEL_ENUM_PRE_PROCESS_YUV444, +}; + +//! [KERNEL NAME] +#define VX_KERNEL_NAME_PERMUTECWH VIVANTE_NAMESPACE ".vxcPermuteCWH" +#define VX_KERNEL_NAME_PERMUTECHW VIVANTE_NAMESPACE ".vxcPermuteCWH" +#define VX_KERNEL_NAME_PRIORBOX VIVANTE_NAMESPACE ".vxcPriorBox" +#define VX_KERNEL_NAME_FLATTEN VIVANTE_NAMESPACE ".flatten" +//! 
l2normalizscale kernel +#define VX_KERNEL_NAME_L2NORMALIZESCALE VIVANTE_NAMESPACE ".vxcL2NormalizeScale" +#define VX_KERNEL_NAME_L2NORMSCALE_SUMRSQRT_AXI1_F16_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_SumRsqrt_axis1_F16_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_SUMRSQRT_AXI1_I8_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_SumRsqrt_axis1_I8_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_SUMRSQRT_AXI1_U8_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_SumRsqrt_axis1_U8_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_SUMRSQRT_AXI1_I16_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_SumRsqrt_axis1_I16_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_AXI1_F16TOF16_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_axis1_F16toF16_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_AXI1_I8TOI8_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_axis1_I8toI8_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_AXI1_I8TOF16_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_axis1_I8toF16_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_AXI1_U8TOU8_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_axis1_U8toU8_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_AXI1_U8TOF16_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_axis1_U8toF16_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_AXI1_I16TOI16_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_axis1_I16toI16_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_AXI1_I16TOF16_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_axis1_I16toF16_2D" + +#define VX_KERNEL_NAME_L2NORMSCALE_SUMRSQRT_AXI0_F16_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_SumRsqrt_axis0_F16_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_SUMRSQRT_AXI0_I8_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_SumRsqrt_axis0_I8_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_SUMRSQRT_AXI0_U8_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_SumRsqrt_axis0_U8_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_SUMRSQRT_AXI0_I16_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_SumRsqrt_axis0_I16_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_AXI0_F16TOF16_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_axis0_F16toF16_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_AXI0_I8TOI8_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_axis0_I8toI8_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_AXI0_I8TOF16_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_axis0_I8toF16_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_AXI0_U8TOU8_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_axis0_U8toU8_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_AXI0_U8TOF16_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_axis0_U8toF16_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_AXI0_I16TOI16_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_axis0_I16toI16_2D" +#define VX_KERNEL_NAME_L2NORMSCALE_AXI0_I16TOF16_2D \ + VIVANTE_NAMESPACE ".vxcL2NormScale_axis0_I16toF16_2D" +//! 
Prelu Kernel +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_I8F16TOI8 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_I8F16toI8" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_I8F16TOF16 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_I8F16toF16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_I16F16TOI16 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_I16F16toI16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_I16F16TOF16 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_I16F16toF16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_U8F16TOU8 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_U8F16toU8" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_U8F16TOF16 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_U8F16toF16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_F16F16TOF16 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_F16F16toF16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_F16F16TOU8 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_F16F16toU8" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_F16F16TOI8 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_F16F16toI8" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_F16F16TOI16 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_F16F16toI16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_BF16F16TOBF16 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_BF16F16toBF16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_BF16BF16TOBF16 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_BF16BF16toBF16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_I8F16TOI8_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_I8F16toI8_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_I8F16TOF16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_I8F16toF16_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_I16F16TOI16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_I16F16toI16_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_I16F16TOF16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_I16F16toF16_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_U8F16TOU8_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_U8F16toU8_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_U8F16TOF16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_U8F16toF16_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_F16F16TOF16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_F16F16toF16_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_F16F16TOU8_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_F16F16toU8_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_F16F16TOI8_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_F16F16toI8_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_F16F16TOI16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_F16F16toI16_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_BF16F16TOBF16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_BF16F16toBF16_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI0_BF16BF16TOBF16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis0_BF16BF16toBF16_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_I8F16TOI8 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_I8F16toI8" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_I8F16TOF16 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_I8F16toF16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_I16F16TOI16 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_I16F16toI16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_I16F16TOF16 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_I16F16toF16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_U8F16TOU8 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_U8F16toU8" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_U8F16TOF16 \ + VIVANTE_NAMESPACE 
".vxcParametricRelu_axis1_U8F16toF16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_F16F16TOF16 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_F16F16toF16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_F16F16TOU8 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_F16F16toU8" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_F16F16TOI8 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_F16F16toI8" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_F16F16TOI16 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_F16F16toI16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_BF16F16TOBF16 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_BF16F16toBF16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_BF16BF16TOBF16 \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_BF16BF16toBF16" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_I8F16TOI8_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_I8F16toI8_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_I8F16TOF16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_I8F16toF16_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_I16F16TOI16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_I16F16toI16_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_I16F16TOF16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_I16F16toF16_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_U8F16TOU8_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_U8F16toU8_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_U8F16TOF16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_U8F16toF16_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_F16F16TOF16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_F16F16toF16_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_F16F16TOU8_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_F16F16toU8_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_F16F16TOI8_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_F16F16toI8_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_F16F16TOI16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_F16F16toI16_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_BF16F16TOBF16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_BF16F16toBF16_2D" +#define VX_KERNEL_NAME_PARAMETRICRELU_AXI1_BF16BF16TOBF16_2D \ + VIVANTE_NAMESPACE ".vxcParametricRelu_axis1_BF16BF16toBF16_2D" + +#define VX_KERNEL_NAME_PREBBOX VIVANTE_NAMESPACE ".preBBoxVXC" +#define VX_KERNEL_NAME_ADD_RELU_KERNEL VIVANTE_NAMESPACE ".addRelu" +#define VX_KERNEL_NAME_POOLING_WITH_ARGMAX VIVANTE_NAMESPACE ".poolingWithArgmax" +#define VX_KERNEL_NAME_POOLING_WITH_ARGMAX_INT8 VIVANTE_NAMESPACE ".poolingWithArgmaxInt8" +#define VX_KERNEL_NAME_POOLING_WITH_ARGMAX_INT8_OPT VIVANTE_NAMESPACE ".poolingWithArgmaxInt8_Int8_opt" +#define VX_KERNEL_NAME_POOLING_WITH_ARGMAX_INT8_INT8 VIVANTE_NAMESPACE ".poolingWithArgmaxInt8_Int8" +#define VX_KERNEL_NAME_POOLING_WITH_ARGMAX_INT16 VIVANTE_NAMESPACE ".poolingWithArgmaxInt16_s2k2p0" +#define VX_KERNEL_NAME_POOLING_WITH_ARGMAX_INT16_INT16 \ + VIVANTE_NAMESPACE ".poolingWithArgmaxInt16_int16_s2k2p0" +#define VX_KERNEL_NAME_POOLING_WITH_ARGMAX_INT16_OPT \ + VIVANTE_NAMESPACE ".poolingWithArgmaxInt16_s2k2p0_opt" +#define VX_KERNEL_NAME_POOLING_WITH_ARGMAX_INT16_FP16 \ + VIVANTE_NAMESPACE ".poolingWithArgmaxInt16_fp16_s2k2p0" +#define VX_KERNEL_NAME_POOLING_WITH_ARGMAX_INT16_AXINT16 \ + VIVANTE_NAMESPACE ".poolingWithArgmaxInt16_axI16_s2k2p0" +#define VX_KERNEL_NAME_POOLING_WITH_ARGMAX_UINT8 VIVANTE_NAMESPACE ".poolingWithArgmaxUint8_s2k2p0" +#define VX_KERNEL_NAME_POOLING_WITH_ARGMAX_UINT8_FP16 \ + VIVANTE_NAMESPACE ".poolingWithArgmaxUint8_fp16_s2k2p0" +#define 
VX_KERNEL_NAME_POOLING_WITH_ARGMAX_UINT8_FP16_FP16 \ + VIVANTE_NAMESPACE ".poolingWithArgmaxUint8_fp16_fp16_s2k2p0" +#define VX_KERNEL_NAME_POOLING_WITH_ARGMAX_INT8_FP16 \ + VIVANTE_NAMESPACE ".poolingWithArgmaxInt8_fp16_s2k2p0" +#define VX_KERNEL_NAME_POOLING_WITH_ARGMAX_UINT8_2D VIVANTE_NAMESPACE ".poolingWithArgmaxU8_s2k2p0_2D" +#define VX_KERNEL_NAME_UNPOOLING VIVANTE_NAMESPACE ".unpooling" +#define VX_KERNEL_NAME_UNPOOLING_INT8 VIVANTE_NAMESPACE ".unpoolingInt8" +#define VX_KERNEL_NAME_UNPOOLING_INT8_INT8 VIVANTE_NAMESPACE ".unpoolingInt8_Int8" +#define VX_KERNEL_NAME_UNPOOLING_INT8_INT8_OPT VIVANTE_NAMESPACE ".unpoolingInt8_Int8_opt" +#define VX_KERNEL_NAME_UNPOOLING_UINT8 VIVANTE_NAMESPACE ".unpoolingUint8_Uint8" +#define VX_KERNEL_NAME_UNPOOLING_INT16_INT16 VIVANTE_NAMESPACE ".unpoolingInt16_Int16" +#define VX_KERNEL_NAME_UNPOOLING_INT16_INT16_OPT VIVANTE_NAMESPACE ".unpoolingInt16_Int16_opt" +#define VX_KERNEL_NAME_UNPOOLING_INT16_INT16_AXINT16 VIVANTE_NAMESPACE ".unpoolingInt16_Int16_axI16" +#define VX_KERNEL_NAME_UNPOOLING_INT16_FP16_AXINT16 VIVANTE_NAMESPACE ".unpoolingInt16_Fp16_axI16" +#define VX_KERNEL_NAME_UNPOOLING_INT16_FP16 VIVANTE_NAMESPACE ".unpoolingInt16_Fp16" +#define VX_KERNEL_NAME_UNPOOLING_FP16_UINT8 VIVANTE_NAMESPACE ".unpoolingFp16_Uint8" +#define VX_KERNEL_NAME_UNPOOLING_FP16_INT8 VIVANTE_NAMESPACE ".unpoolingFp16_Int8" +#define VX_KERNEL_NAME_UNPOOLING_FP16_INT16 VIVANTE_NAMESPACE ".unpoolingFp16_Int16" +#define VX_KERNEL_NAME_UNPOOLING_FP16FP16_UINT8 VIVANTE_NAMESPACE ".unpoolingFp16Fp16_Uint8" +#define VX_KERNEL_NAME_UNPOOLING_UINT8_FP16 VIVANTE_NAMESPACE ".unpoolingUint8_Fp16" +#define VX_KERNEL_NAME_UNPOOLING_INT8_FP16 VIVANTE_NAMESPACE ".unpoolingInt8_Fp16" +#define VX_KERNEL_NAME_UNPOOLING_UINT8_2D VIVANTE_NAMESPACE ".unpoolingUint8_Uint8_2D" +#define VX_KERNEL_NAME_UNPOOLING_FP16_UINT8_2D VIVANTE_NAMESPACE ".unpoolingFp16_Uint8_2D" +#define VX_KERNEL_NAME_ALEXNET_GEMM VIVANTE_NAMESPACE ".alexNet_gemmVXC" +#define VX_KERNEL_NAME_IMG2COL_DILATED VIVANTE_NAMESPACE ".img2col_dilatedVXC" +#define VX_KERNEL_NAME_IMG2COL_DILATED_INT8 VIVANTE_NAMESPACE ".img2col_dilated_int8VXC" +#define VX_KERNEL_NAME_ALEXNET_GEMM_INT8 VIVANTE_NAMESPACE ".alexNet_gemm_int8VXC" +#define VX_KERNEL_NAME_FULLYCONNECTED_AXIS2 VIVANTE_NAMESPACE ".vxcFullyConnected_Axis2" +#define VX_KERNEL_NAME_TENSORCROP_INT16 VIVANTE_NAMESPACE ".vxcTensorCrop_Int16" +#define VX_KERNEL_NAME_TENSORCROP_INT8 VIVANTE_NAMESPACE ".vxcTensorCrop_Int8" +#define VX_KERNEL_NAME_TENSORCROP_INT16_FP16 VIVANTE_NAMESPACE ".vxcTensorCrop_Int16_Fp16" +#define VX_KERNEL_NAME_SHUFFLECHANNEL VIVANTE_NAMESPACE ".shuffleChannelVXC" +#define VX_KERNEL_NAME_SHUFFLECHANNEL8BITS VIVANTE_NAMESPACE ".shuffleChannel8BitsVXC" +#define VX_KERNEL_NAME_SHUFFLECHANNEL16BITS_AXIS1 VIVANTE_NAMESPACE ".shuffleChannel16Bits_Axis1" +#define VX_KERNEL_NAME_SHUFFLECHANNEL8BITS_AXIS1 VIVANTE_NAMESPACE ".shuffleChannel8Bits_Axis1" +#define VX_KERNEL_NAME_RESIZE_16BITS_DOWNSAMPLE_QUARTER \ + VIVANTE_NAMESPACE ".resize_16bits_downsample_quarter" +#define VX_KERNEL_NAME_RESIZE_8BITS_DOWNSAMPLE_QUARTER \ + VIVANTE_NAMESPACE ".resize_8bits_downsample_quarter" +#define VX_KERNEL_NAME_SCALE_FP16 VIVANTE_NAMESPACE ".scale_fp16" +#define VX_KERNEL_NAME_TENSORREVERSE VIVANTE_NAMESPACE ".tensorReverse_axis0_fp16" +#define VX_KERNEL_NAME_SPACE2DEPTH_INT16_INT16 VIVANTE_NAMESPACE ".vxcReorg2_fp16_fp16_sx2_sy1" +#define VX_KERNEL_NAME_SCALETOTENSOR_FP16 VIVANTE_NAMESPACE ".ScaletoTensor_Fp16" +#define VX_KERNEL_NAME_SCALETOTENSOR_INT8 
VIVANTE_NAMESPACE ".ScaletoTensor_Int8" +#define VX_KERNEL_NAME_SCALETOTENSOR_FP16_COPY VIVANTE_NAMESPACE ".ScaletoTensor_Fp16_copy" +#define VX_KERNEL_NAME_SCALETOTENSOR_INT8_COPY VIVANTE_NAMESPACE ".ScaletoTensor_Int8_copy" +#define VX_KERNEL_NAME_SCALETOTENSOR_INT16 VIVANTE_NAMESPACE ".ScaletoTensor_Int16" +#define VX_KERNEL_NAME_SCALETOTENSOR_INT16_COPY VIVANTE_NAMESPACE ".ScaletoTensor_Int16_copy" +#define VX_KERNEL_NAME_SCALETOTENSOR_UINT8 VIVANTE_NAMESPACE ".ScaletoTensor_UInt8" +#define VX_KERNEL_NAME_SCALETOTENSOR_UINT8_COPY VIVANTE_NAMESPACE ".ScaletoTensor_UInt8_copy" +#define VX_KERNEL_NAME_GRAYSCALETOTENSOR_FP16 VIVANTE_NAMESPACE ".GrayScaletoTensor_Fp16" +#define VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT8 VIVANTE_NAMESPACE ".GrayScaletoTensor_Int8" +#define VX_KERNEL_NAME_GRAYSCALETOTENSOR_FP16_COPY VIVANTE_NAMESPACE ".GrayScaletoTensor_Fp16_copy" +#define VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT8_COPY VIVANTE_NAMESPACE ".GrayScaletoTensor_Int8_copy" +#define VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT16 VIVANTE_NAMESPACE ".GrayScaletoTensor_Int16" +#define VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT16_COPY VIVANTE_NAMESPACE ".GrayScaletoTensor_Int16_copy" +#define VX_KERNEL_NAME_GRAYSCALETOTENSOR_UINT8 VIVANTE_NAMESPACE ".GrayScaletoTensor_UInt8" +#define VX_KERNEL_NAME_GRAYSCALETOTENSOR_UINT8_COPY VIVANTE_NAMESPACE ".GrayScaletoTensor_UInt8_copy" +#define VX_KERNEL_NAME_LAYERNORM VIVANTE_NAMESPACE ".vxcLayerNorm" +#define VX_KERNEL_NAME_LAYERNORM_UINT8 VIVANTE_NAMESPACE ".vxcLayerNorm_u8" +#define VX_KERNEL_NAME_LAYERNORM_FP16TOU8 VIVANTE_NAMESPACE ".vxcLayerNormFP16toU8" +#define VX_KERNEL_NAME_LAYERNORM_U8TOFP16 VIVANTE_NAMESPACE ".vxcLayerNormU8toFp16" +#define VX_KERNEL_NAME_TENSORSTACKCONCAT VIVANTE_NAMESPACE ".vxcTensorStackConcat" +#define VX_KERNEL_NAME_TENSORSTACKCONCAT8BITS VIVANTE_NAMESPACE ".vxcTensorStackConcat8Bits" +#define VX_KERNEL_NAME_SIGNALFRAME_WIDTH VIVANTE_NAMESPACE ".vxcSignalFrame_width" +#define VX_KERNEL_NAME_SIGNALFRAME_HEIGHT VIVANTE_NAMESPACE ".vxcSignalFrame_height" +#define VX_KERNEL_NAME_SIGNALFRAME_CHANNEL VIVANTE_NAMESPACE ".vxcSignalFrame_channel" +#define VX_KERNEL_NAME_SIGNALFRAME_WIDTH_8BITS VIVANTE_NAMESPACE ".vxcSignalFrame_width_8bit" +#define VX_KERNEL_NAME_SIGNALFRAME_HEIGHT_8BITS VIVANTE_NAMESPACE ".vxcSignalFrame_height_8bit" +#define VX_KERNEL_NAME_SIGNALFRAME_CHANNEL_8BITS VIVANTE_NAMESPACE ".vxcSignalFrame_channel_8bit" +#define VX_KERNEL_NAME_FLOORDIV_FP16 VIVANTE_NAMESPACE ".vxcTensorFloorDiv_Fp16" +#define VX_KERNEL_NAME_FLOORDIV_INT16 VIVANTE_NAMESPACE ".vxcTensorFloorDiv_Int16" +#define VX_KERNEL_NAME_FLOORDIV_INT8 VIVANTE_NAMESPACE ".vxcTensorFloorDiv_Int8" +#define VX_KERNEL_NAME_FLOORDIV_UINT8 VIVANTE_NAMESPACE ".vxcTensorFloorDiv_Uint8" +#define VX_KERNEL_NAME_SPATIAL_TRANSFORMER VIVANTE_NAMESPACE ".vxcTransform_Gemm_F16toF16" +#define VX_KERNEL_NAME_TRANSFORM_SETUP_THRES_F16TOF16 VIVANTE_NAMESPACE ".vxcTransform_setupThres_F16toF16" +#define VX_KERNEL_NAME_TRANSFORM_INTERP_F16TOF16_2D VIVANTE_NAMESPACE ".vxcTransform_InterP_F16toF16_2D" +#define VX_KERNEL_NAME_TRANSFORM_INTERP_F16TOF16 VIVANTE_NAMESPACE ".vxcTransform_InterP_F16toF16" +#define VX_KERNEL_NAME_LOGICAL_OR_INT16 VIVANTE_NAMESPACE ".vxcTensorLogical_or_int16" +#define VX_KERNEL_NAME_LOGICAL_OR_INT8 VIVANTE_NAMESPACE ".vxcTensorLogical_or_int8" +#define VX_KERNEL_NAME_LOGICAL_OR_UINT8 VIVANTE_NAMESPACE ".vxcTensorLogical_or_uint8" +#define VX_KERNEL_NAME_LOGICAL_OR_FP16 VIVANTE_NAMESPACE ".vxcTensorLogical_or_fp16" +#define VX_KERNEL_NAME_LOGICAL_AND_INT16 VIVANTE_NAMESPACE 
".vxcTensorLogical_and_int16" +#define VX_KERNEL_NAME_LOGICAL_AND_INT8 VIVANTE_NAMESPACE ".vxcTensorLogical_and_int8" +#define VX_KERNEL_NAME_LOGICAL_AND_UINT8 VIVANTE_NAMESPACE ".vxcTensorLogical_and_uint8" +#define VX_KERNEL_NAME_LOGICAL_AND_FP16 VIVANTE_NAMESPACE ".vxcTensorLogical_and_fp16" +#define VX_KERNEL_NAME_SELECT_UINT8 VIVANTE_NAMESPACE ".vxcTensorSelect_uint8" +#define VX_KERNEL_NAME_SELECT_BOOL_INT8 VIVANTE_NAMESPACE ".vxcTensorSelect_bool_int8" +#define VX_KERNEL_NAME_SELECT_BOOL_INT16 VIVANTE_NAMESPACE ".vxcTensorSelect_bool_int16" +#define VX_KERNEL_NAME_LSTMUNIT_ACTIVATION VIVANTE_NAMESPACE ".vxcLSTMUnit_Activation_SW" +#define VX_KERNEL_NAME_TENSORADD_MEAN_STDDEV_NORM_FP16 VIVANTE_NAMESPACE ".vxcTensorAddMeanStdNorm_fp16" +#define VX_KERNEL_NAME_TENSORADD_MEAN_STDDEV_NORM_U8_FP16 VIVANTE_NAMESPACE ".vxcTensorAddMeanStdNorm_u8_fp16" +#define VX_KERNEL_NAME_TENSORADD_MEAN_STDDEV_NORM_I16_FP16 VIVANTE_NAMESPACE ".vxcTensorAddMeanStdNorm_i16_fp16" +#define VX_KERNEL_NAME_TENSORADD_MEAN_STDDEV_NORM_I16_FP16 VIVANTE_NAMESPACE ".vxcTensorAddMeanStdNorm_i16_fp16" +#define VX_KERNEL_NAME_STACK VIVANTE_NAMESPACE ".vxcStack" +//! clip kernel +#define VX_KERNEL_NAME_CLIP_F16TOF16_2D VIVANTE_NAMESPACE ".vxcTensorClip_F16toF16_2D" +#define VX_KERNEL_NAME_CLIP_F16TOF16 VIVANTE_NAMESPACE ".vxcTensorClip_F16toF16" +#define VX_KERNEL_NAME_CLIP_F16TOI16_2D VIVANTE_NAMESPACE ".vxcTensorClip_F16toI16_2D" +#define VX_KERNEL_NAME_CLIP_F16TOI16 VIVANTE_NAMESPACE ".vxcTensorClip_F16toI16" +#define VX_KERNEL_NAME_CLIP_F16TOI8_2D VIVANTE_NAMESPACE ".vxcTensorClip_F16toI8_2D" +#define VX_KERNEL_NAME_CLIP_F16TOI8 VIVANTE_NAMESPACE ".vxcTensorClip_F16toI8" +#define VX_KERNEL_NAME_CLIP_F16TOU8_2D VIVANTE_NAMESPACE ".vxcTensorClip_F16toU8_2D" +#define VX_KERNEL_NAME_CLIP_F16TOU8 VIVANTE_NAMESPACE ".vxcTensorClip_F16toU8" +#define VX_KERNEL_NAME_CLIP_U8TOF16_2D VIVANTE_NAMESPACE ".vxcTensorClip_U8toF16_2D" +#define VX_KERNEL_NAME_CLIP_U8TOF16 VIVANTE_NAMESPACE ".vxcTensorClip_U8toF16" +#define VX_KERNEL_NAME_CLIP_I8TOF16_2D VIVANTE_NAMESPACE ".vxcTensorClip_I8toF16_2D" +#define VX_KERNEL_NAME_CLIP_I8TOF16 VIVANTE_NAMESPACE ".vxcTensorClip_I8toF16" +#define VX_KERNEL_NAME_CLIP_I16TOF16_2D VIVANTE_NAMESPACE ".vxcTensorClip_I16toF16_2D" +#define VX_KERNEL_NAME_CLIP_I16TOF16 VIVANTE_NAMESPACE ".vxcTensorClip_I16toF16" +#define VX_KERNEL_NAME_CLIP_I16TOI16_2D VIVANTE_NAMESPACE ".vxcTensorClip_I16toI16_2D" +#define VX_KERNEL_NAME_CLIP_I16TOI16 VIVANTE_NAMESPACE ".vxcTensorClip_I16toI16" +#define VX_KERNEL_NAME_CLIP_I8TOI8_2D VIVANTE_NAMESPACE ".vxcTensorClip_I8toI8_2D" +#define VX_KERNEL_NAME_CLIP_I8TOI8 VIVANTE_NAMESPACE ".vxcTensorClip_I8toI8" +#define VX_KERNEL_NAME_CLIP_U8TOU8_2D VIVANTE_NAMESPACE ".vxcTensorClip_U8toU8_2D" +#define VX_KERNEL_NAME_CLIP_U8TOU8 VIVANTE_NAMESPACE ".vxcTensorClip_U8toU8" +//! 
pre process gray kernel +#define VX_KERNEL_NAME_UNSTACK VIVANTE_NAMESPACE ".vxcUnstack" +#define VX_KERNEL_NAME_ADDN VIVANTE_NAMESPACE ".vxcAddn" +#define VX_KERNEL_NAME_EXTRA_ENDING_I16 VIVANTE_NAMESPACE ".vxcExtra_ending_i16" +#define VX_KERNEL_NAME_EXTRA_ENDING_I8 VIVANTE_NAMESPACE ".vxcExtra_ending_i8" +#define VX_KERNEL_NAME_EXTRA_ENDING_U8 VIVANTE_NAMESPACE ".vxcExtra_ending_u8" +#define VX_KERNEL_NAME_TOPK VIVANTE_NAMESPACE ".vxcTopk" +#define VX_KERNEL_NAME_LOGICAL_NOT_INT8 VIVANTE_NAMESPACE ".vxcLogical_not_i8" +#define VX_KERNEL_NAME_LOGICAL_NOT_INT16 VIVANTE_NAMESPACE ".vxcLogical_not_i16" +#define VX_KERNEL_NAME_LOGICAL_NOT_UINT8 VIVANTE_NAMESPACE ".vxcLogical_not_u8" +#define VX_KERNEL_NAME_ROI_ALIGN VIVANTE_NAMESPACE ".vxcRoi_align" +#define VX_KERNEL_NAME_HEATMAP_MAX_KEYPOINT VIVANTE_NAMESPACE ".vxcHeatmap_max_keypoint" +#define VX_KERNEL_NAME_AXIS_ALIGNED_BBOX_TRANSFORM VIVANTE_NAMESPACE ".vxcAxis_aligned_bbox_transform" +#define VX_KERNEL_NAME_BOX_WITH_NMS_LIMIT VIVANTE_NAMESPACE ".vxcBox_with_nms_limit" +#define VX_KERNEL_NAME_GENERATE_PROPOSALS VIVANTE_NAMESPACE ".vxcGenerate_proposals" +#define VX_KERNEL_NAME_DETECTION_POSTPROCESS VIVANTE_NAMESPACE ".vxcDetection_postprocess" +#define VX_KERNEL_NAME_RANDOM_MULTINOMIAL VIVANTE_NAMESPACE ".vxcRandom_multinomial" +#define VX_KERNEL_NAME_RANDOM_GENERATE VIVANTE_NAMESPACE ".vxcRandom_generate" +#define VX_KERNEL_NAME_RANDOM_SUM_FP16 VIVANTE_NAMESPACE ".vxcRandom_sum_fp16" +#define VX_KERNEL_NAME_RANDOM_SUM_FP32 VIVANTE_NAMESPACE ".vxcRandom_sum_fp32" + +//! reducemax kernel +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_F16TOF16 VIVANTE_NAMESPACE ".vxcReducemaxAxis0_F16toF16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_F16TOI16 VIVANTE_NAMESPACE ".vxcReducemaxAxis0_F16toI16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_F16TOI8 VIVANTE_NAMESPACE ".vxcReducemaxAxis0_F16toI8" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_F16TOU8 VIVANTE_NAMESPACE ".vxcReducemaxAxis0_F16toU8" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_F16TOF16_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis0_F16toF16_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_F16TOI16_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis0_F16toI16_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_F16TOI8_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis0_F16toI8_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_F16TOU8_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis0_F16toU8_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_I16TOI16 VIVANTE_NAMESPACE ".vxcReducemaxAxis0_I16toI16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_I16TOF16 VIVANTE_NAMESPACE ".vxcReducemaxAxis0_I16toF16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_I16TOI16_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis0_I16toI16_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_I16TOF16_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis0_I16toF16_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_I8TOI8 VIVANTE_NAMESPACE ".vxcReducemaxAxis0_I8toI8" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_I8TOF16 VIVANTE_NAMESPACE ".vxcReducemaxAxis0_I8toF16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_I8TOI8_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis0_I8toI8_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_I8TOF16_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis0_I8toF16_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_U8TOU8 VIVANTE_NAMESPACE ".vxcReducemaxAxis0_U8toU8" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_U8TOF16 VIVANTE_NAMESPACE ".vxcReducemaxAxis0_U8toF16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_U8TOU8_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis0_U8toU8_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI0_U8TOF16_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis0_U8toF16_2D" +#define 
VX_KERNEL_NAME_REDUCEMAX_AXI1_F16TOF16 VIVANTE_NAMESPACE ".vxcReducemaxAxis1_F16toF16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_F16TOI16 VIVANTE_NAMESPACE ".vxcReducemaxAxis1_F16toI16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_F16TOI8 VIVANTE_NAMESPACE ".vxcReducemaxAxis1_F16toI8" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_F16TOU8 VIVANTE_NAMESPACE ".vxcReducemaxAxis1_F16toU8" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_F16TOF16_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis1_F16toF16_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_F16TOI16_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis1_F16toI16_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_F16TOI8_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis1_F16toI8_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_F16TOU8_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis1_F16toU8_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_I16TOI16 VIVANTE_NAMESPACE ".vxcReducemaxAxis1_I16toI16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_I16TOF16 VIVANTE_NAMESPACE ".vxcReducemaxAxis1_I16toF16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_I16TOI16_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis1_I16toI16_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_I16TOF16_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis1_I16toF16_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_I8TOI8 VIVANTE_NAMESPACE ".vxcReducemaxAxis1_I8toI8" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_I8TOF16 VIVANTE_NAMESPACE ".vxcReducemaxAxis1_I8toF16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_I8TOI8_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis1_I8toI8_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_I8TOF16_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis1_I8toF16_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_U8TOU8 VIVANTE_NAMESPACE ".vxcReducemaxAxis1_U8toU8" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_U8TOF16 VIVANTE_NAMESPACE ".vxcReducemaxAxis1_U8toF16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_U8TOU8_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis1_U8toU8_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI1_U8TOF16_2D VIVANTE_NAMESPACE ".vxcReducemaxAxis1_U8toF16_2D" +#define VX_KERNEL_NAME_REDUCEMAX_AXI2_F16TOF16 VIVANTE_NAMESPACE ".vxcReducemaxAxis2_F16toF16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI2_F16TOI16 VIVANTE_NAMESPACE ".vxcReducemaxAxis2_F16toI16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI2_F16TOI8 VIVANTE_NAMESPACE ".vxcReducemaxAxis2_F16toI8" +#define VX_KERNEL_NAME_REDUCEMAX_AXI2_F16TOU8 VIVANTE_NAMESPACE ".vxcReducemaxAxis2_F16toU8" +#define VX_KERNEL_NAME_REDUCEMAX_AXI2_I16TOI16 VIVANTE_NAMESPACE ".vxcReducemaxAxis2_I16toI16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI2_I16TOF16 VIVANTE_NAMESPACE ".vxcReducemaxAxis2_I16toF16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI2_I8TOI8 VIVANTE_NAMESPACE ".vxcReducemaxAxis2_I8toI8" +#define VX_KERNEL_NAME_REDUCEMAX_AXI2_I8TOF16 VIVANTE_NAMESPACE ".vxcReducemaxAxis2_I8toF16" +#define VX_KERNEL_NAME_REDUCEMAX_AXI2_U8TOU8 VIVANTE_NAMESPACE ".vxcReducemaxAxis2_U8toU8" +#define VX_KERNEL_NAME_REDUCEMAX_AXI2_U8TOF16 VIVANTE_NAMESPACE ".vxcReducemaxAxis2_U8toF16" +//! 
reducemin kernel +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_F16TOF16 VIVANTE_NAMESPACE ".vxcReduceminAxis0_F16toF16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_F16TOI16 VIVANTE_NAMESPACE ".vxcReduceminAxis0_F16toI16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_F16TOI8 VIVANTE_NAMESPACE ".vxcReduceminAxis0_F16toI8" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_F16TOU8 VIVANTE_NAMESPACE ".vxcReduceminAxis0_F16toU8" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_F16TOF16_2D VIVANTE_NAMESPACE ".vxcReduceminAxis0_F16toF16_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_F16TOI16_2D VIVANTE_NAMESPACE ".vxcReduceminAxis0_F16toI16_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_F16TOI8_2D VIVANTE_NAMESPACE ".vxcReduceminAxis0_F16toI8_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_F16TOU8_2D VIVANTE_NAMESPACE ".vxcReduceminAxis0_F16toU8_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_I16TOI16 VIVANTE_NAMESPACE ".vxcReduceminAxis0_I16toI16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_I16TOF16 VIVANTE_NAMESPACE ".vxcReduceminAxis0_I16toF16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_I16TOI16_2D VIVANTE_NAMESPACE ".vxcReduceminAxis0_I16toI16_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_I16TOF16_2D VIVANTE_NAMESPACE ".vxcReduceminAxis0_I16toF16_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_I8TOI8 VIVANTE_NAMESPACE ".vxcReduceminAxis0_I8toI8" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_I8TOF16 VIVANTE_NAMESPACE ".vxcReduceminAxis0_I8toF16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_I8TOI8_2D VIVANTE_NAMESPACE ".vxcReduceminAxis0_I8toI8_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_I8TOF16_2D VIVANTE_NAMESPACE ".vxcReduceminAxis0_I8toF16_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_U8TOU8 VIVANTE_NAMESPACE ".vxcReduceminAxis0_U8toU8" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_U8TOF16 VIVANTE_NAMESPACE ".vxcReduceminAxis0_U8toF16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_U8TOU8_2D VIVANTE_NAMESPACE ".vxcReduceminAxis0_U8toU8_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI0_U8TOF16_2D VIVANTE_NAMESPACE ".vxcReduceminAxis0_U8toF16_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_F16TOF16 VIVANTE_NAMESPACE ".vxcReduceminAxis1_F16toF16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_F16TOI16 VIVANTE_NAMESPACE ".vxcReduceminAxis1_F16toI16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_F16TOI8 VIVANTE_NAMESPACE ".vxcReduceminAxis1_F16toI8" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_F16TOU8 VIVANTE_NAMESPACE ".vxcReduceminAxis1_F16toU8" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_F16TOF16_2D VIVANTE_NAMESPACE ".vxcReduceminAxis1_F16toF16_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_F16TOI16_2D VIVANTE_NAMESPACE ".vxcReduceminAxis1_F16toI16_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_F16TOI8_2D VIVANTE_NAMESPACE ".vxcReduceminAxis1_F16toI8_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_F16TOU8_2D VIVANTE_NAMESPACE ".vxcReduceminAxis1_F16toU8_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_I16TOI16 VIVANTE_NAMESPACE ".vxcReduceminAxis1_I16toI16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_I16TOF16 VIVANTE_NAMESPACE ".vxcReduceminAxis1_I16toF16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_I16TOI16_2D VIVANTE_NAMESPACE ".vxcReduceminAxis1_I16toI16_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_I16TOF16_2D VIVANTE_NAMESPACE ".vxcReduceminAxis1_I16toF16_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_I8TOI8 VIVANTE_NAMESPACE ".vxcReduceminAxis1_I8toI8" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_I8TOF16 VIVANTE_NAMESPACE ".vxcReduceminAxis1_I8toF16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_I8TOI8_2D VIVANTE_NAMESPACE ".vxcReduceminAxis1_I8toI8_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_I8TOF16_2D VIVANTE_NAMESPACE ".vxcReduceminAxis1_I8toF16_2D" +#define 
VX_KERNEL_NAME_REDUCEMIN_AXI1_U8TOU8 VIVANTE_NAMESPACE ".vxcReduceminAxis1_U8toU8" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_U8TOF16 VIVANTE_NAMESPACE ".vxcReduceminAxis1_U8toF16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_U8TOU8_2D VIVANTE_NAMESPACE ".vxcReduceminAxis1_U8toU8_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI1_U8TOF16_2D VIVANTE_NAMESPACE ".vxcReduceminAxis1_U8toF16_2D" +#define VX_KERNEL_NAME_REDUCEMIN_AXI2_F16TOF16 VIVANTE_NAMESPACE ".vxcReduceminAxis2_F16toF16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI2_F16TOI16 VIVANTE_NAMESPACE ".vxcReduceminAxis2_F16toI16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI2_F16TOI8 VIVANTE_NAMESPACE ".vxcReduceminAxis2_F16toI8" +#define VX_KERNEL_NAME_REDUCEMIN_AXI2_F16TOU8 VIVANTE_NAMESPACE ".vxcReduceminAxis2_F16toU8" +#define VX_KERNEL_NAME_REDUCEMIN_AXI2_I16TOI16 VIVANTE_NAMESPACE ".vxcReduceminAxis2_I16toI16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI2_I16TOF16 VIVANTE_NAMESPACE ".vxcReduceminAxis2_I16toF16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI2_I8TOI8 VIVANTE_NAMESPACE ".vxcReduceminAxis2_I8toI8" +#define VX_KERNEL_NAME_REDUCEMIN_AXI2_I8TOF16 VIVANTE_NAMESPACE ".vxcReduceminAxis2_I8toF16" +#define VX_KERNEL_NAME_REDUCEMIN_AXI2_U8TOU8 VIVANTE_NAMESPACE ".vxcReduceminAxis2_U8toU8" +#define VX_KERNEL_NAME_REDUCEMIN_AXI2_U8TOF16 VIVANTE_NAMESPACE ".vxcReduceminAxis2_U8toF16" +//! reduceprod kernel +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_F16TOF16 VIVANTE_NAMESPACE ".vxcReduceProdAxis0_F16toF16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_F16TOI16 VIVANTE_NAMESPACE ".vxcReduceProdAxis0_F16toI16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_F16TOI8 VIVANTE_NAMESPACE ".vxcReduceProdAxis0_F16toI8" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_F16TOU8 VIVANTE_NAMESPACE ".vxcReduceProdAxis0_F16toU8" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_F16TOF16_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis0_F16toF16_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_F16TOI16_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis0_F16toI16_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_F16TOI8_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis0_F16toI8_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_F16TOU8_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis0_F16toU8_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_I16TOI16 VIVANTE_NAMESPACE ".vxcReduceProdAxis0_I16toI16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_I16TOF16 VIVANTE_NAMESPACE ".vxcReduceProdAxis0_I16toF16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_I16TOI16_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis0_I16toI16_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_I16TOF16_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis0_I16toF16_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_I8TOI8 VIVANTE_NAMESPACE ".vxcReduceProdAxis0_I8toI8" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_I8TOF16 VIVANTE_NAMESPACE ".vxcReduceProdAxis0_I8toF16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_I8TOI8_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis0_I8toI8_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_I8TOF16_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis0_I8toF16_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_U8TOU8 VIVANTE_NAMESPACE ".vxcReduceProdAxis0_U8toU8" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_U8TOF16 VIVANTE_NAMESPACE ".vxcReduceProdAxis0_U8toF16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_U8TOU8_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis0_U8toU8_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_U8TOF16_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis0_U8toF16_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_BF16TOBF16 VIVANTE_NAMESPACE ".vxcReduceProdAxis0_BF16toBF16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI0_BF16TOBF16_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis0_BF16toBF16_2D" +#define 
VX_KERNEL_NAME_REDUCEPROD_AXI1_F16TOF16 VIVANTE_NAMESPACE ".vxcReduceProdAxis1_F16toF16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_F16TOI16 VIVANTE_NAMESPACE ".vxcReduceProdAxis1_F16toI16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_F16TOI8 VIVANTE_NAMESPACE ".vxcReduceProdAxis1_F16toI8" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_F16TOU8 VIVANTE_NAMESPACE ".vxcReduceProdAxis1_F16toU8" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_F16TOF16_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis1_F16toF16_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_F16TOI16_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis1_F16toI16_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_F16TOI8_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis1_F16toI8_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_F16TOU8_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis1_F16toU8_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_I16TOI16 VIVANTE_NAMESPACE ".vxcReduceProdAxis1_I16toI16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_I16TOF16 VIVANTE_NAMESPACE ".vxcReduceProdAxis1_I16toF16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_I16TOI16_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis1_I16toI16_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_I16TOF16_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis1_I16toF16_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_I8TOI8 VIVANTE_NAMESPACE ".vxcReduceProdAxis1_I8toI8" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_I8TOF16 VIVANTE_NAMESPACE ".vxcReduceProdAxis1_I8toF16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_I8TOI8_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis1_I8toI8_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_I8TOF16_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis1_I8toF16_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_U8TOU8 VIVANTE_NAMESPACE ".vxcReduceProdAxis1_U8toU8" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_U8TOF16 VIVANTE_NAMESPACE ".vxcReduceProdAxis1_U8toF16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_U8TOU8_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis1_U8toU8_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_U8TOF16_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis1_U8toF16_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_BF16TOBF16 VIVANTE_NAMESPACE ".vxcReduceProdAxis1_BF16toBF16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI1_BF16TOBF16_2D VIVANTE_NAMESPACE ".vxcReduceProdAxis1_BF16toBF16_2D" +#define VX_KERNEL_NAME_REDUCEPROD_AXI2_F16TOF16 VIVANTE_NAMESPACE ".vxcReduceProdAxis2_F16toF16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI2_F16TOI16 VIVANTE_NAMESPACE ".vxcReduceProdAxis2_F16toI16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI2_F16TOI8 VIVANTE_NAMESPACE ".vxcReduceProdAxis2_F16toI8" +#define VX_KERNEL_NAME_REDUCEPROD_AXI2_F16TOU8 VIVANTE_NAMESPACE ".vxcReduceProdAxis2_F16toU8" +#define VX_KERNEL_NAME_REDUCEPROD_AXI2_I16TOI16 VIVANTE_NAMESPACE ".vxcReduceProdAxis2_I16toI16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI2_I16TOF16 VIVANTE_NAMESPACE ".vxcReduceProdAxis2_I16toF16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI2_I8TOI8 VIVANTE_NAMESPACE ".vxcReduceProdAxis2_I8toI8" +#define VX_KERNEL_NAME_REDUCEPROD_AXI2_I8TOF16 VIVANTE_NAMESPACE ".vxcReduceProdAxis2_I8toF16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI2_U8TOU8 VIVANTE_NAMESPACE ".vxcReduceProdAxis2_U8toU8" +#define VX_KERNEL_NAME_REDUCEPROD_AXI2_U8TOF16 VIVANTE_NAMESPACE ".vxcReduceProdAxis2_U8toF16" +#define VX_KERNEL_NAME_REDUCEPROD_AXI2_BF16TOBF16 VIVANTE_NAMESPACE ".vxcReduceProdAxis2_BF16toBF16" +//! 
reduceall kernel +#define VX_KERNEL_NAME_REDUCEALL_AXI0_I8TOI8 VIVANTE_NAMESPACE ".vxcReduceallAxis0_I8toI8" +#define VX_KERNEL_NAME_REDUCEALL_AXI0_I8TOI8_2D VIVANTE_NAMESPACE ".vxcReduceallAxis0_I8toI8_2D" +#define VX_KERNEL_NAME_REDUCEALL_AXI1_I8TOI8 VIVANTE_NAMESPACE ".vxcReduceallAxis1_I8toI8" +#define VX_KERNEL_NAME_REDUCEALL_AXI1_I8TOI8_2D VIVANTE_NAMESPACE ".vxcReduceallAxis1_I8toI8_2D" +#define VX_KERNEL_NAME_REDUCEALL_AXI2_I8TOI8 VIVANTE_NAMESPACE ".vxcReduceallAxis2_I8toI8" +//! reduceany kernel +#define VX_KERNEL_NAME_REDUCEANY_AXI0_I8TOI8 VIVANTE_NAMESPACE ".vxcReduceanyAxis0_I8toI8" +#define VX_KERNEL_NAME_REDUCEANY_AXI0_I8TOI8_2D VIVANTE_NAMESPACE ".vxcReduceanyAxis0_I8toI8_2D" +#define VX_KERNEL_NAME_REDUCEANY_AXI1_I8TOI8 VIVANTE_NAMESPACE ".vxcReduceanyAxis1_I8toI8" +#define VX_KERNEL_NAME_REDUCEANY_AXI1_I8TOI8_2D VIVANTE_NAMESPACE ".vxcReduceanyAxis1_I8toI8_2D" +#define VX_KERNEL_NAME_REDUCEANY_AXI2_I8TOI8 VIVANTE_NAMESPACE ".vxcReduceanyAxis2_I8toI8" +//! bilinear +#define VX_KERNEL_NAME_RESIZE_INTERNAL_I8TOI8_UP VIVANTE_NAMESPACE ".vxcResize_I8toI8_up" +#define VX_KERNEL_NAME_RESIZE_INTERNAL_I8TOI8 VIVANTE_NAMESPACE ".vxcResize_I8toI8" +#define VX_KERNEL_NAME_RESIZE_INTERNAL_I16TOI16_UP VIVANTE_NAMESPACE ".vxcResize_I16toI16_up" +#define VX_KERNEL_NAME_RESIZE_INTERNAL_I16TOI16 VIVANTE_NAMESPACE ".vxcResize_I16toI16" +#define VX_KERNEL_NAME_RESIZE_INTERNAL_U8TOF16 VIVANTE_NAMESPACE ".vxcResize_U8toF16" +#define VX_KERNEL_NAME_RESIZE_INTERNAL_U8TOU8_UP VIVANTE_NAMESPACE ".vxcResize_U8toU8_up" +#define VX_KERNEL_NAME_RESIZE_INTERNAL_U8TOU8 VIVANTE_NAMESPACE ".vxcResize_U8toU8" +#define VX_KERNEL_NAME_RESIZE_INTERNAL_F16TOF16_UP VIVANTE_NAMESPACE ".vxcResize_F16toF16_up" +#define VX_KERNEL_NAME_RESIZE_INTERNAL_F16TOF16 VIVANTE_NAMESPACE ".vxcResize_F16toF16" +#define VX_KERNEL_NAME_RESIZE_INTERNAL_F16TOU8 VIVANTE_NAMESPACE ".vxcResize_F16toU8" +#define VX_KERNEL_NAME_RESIZE_INTERNAL_BF16TOBF16_UP VIVANTE_NAMESPACE ".vxcResize_BF16toBF16_up" +#define VX_KERNEL_NAME_RESIZE_INTERNAL_BF16TOBF16 VIVANTE_NAMESPACE ".vxcResize_BF16toBF16" +//! nearest resize +#define VX_KERNEL_NAME_NEAREST_INTERNAL_I16TOI16 VIVANTE_NAMESPACE ".vxcResize_nearest_I16toI16" +#define VX_KERNEL_NAME_NEAREST_INTERNAL_I16TOI16_OP VIVANTE_NAMESPACE ".vxcResize_nearest_I16toI16_op" +#define VX_KERNEL_NAME_NEAREST_INTERNAL_F16TOF16 VIVANTE_NAMESPACE ".vxcResize_nearest_F16toF16" +#define VX_KERNEL_NAME_NEAREST_INTERNAL_F16TOF16_OP VIVANTE_NAMESPACE ".vxcResize_nearest_F16toF16_op" +#define VX_KERNEL_NAME_NEAREST_INTERNAL_U8TOU8 VIVANTE_NAMESPACE ".vxcResize_nearest_U8toU8" +#define VX_KERNEL_NAME_NEAREST_INTERNAL_U8TOU8_OP VIVANTE_NAMESPACE ".vxcResize_nearest_U8toU8_op" +#define VX_KERNEL_NAME_NEAREST_INTERNAL_I8TOI8 VIVANTE_NAMESPACE ".vxcResize_nearest_I8toI8" +#define VX_KERNEL_NAME_NEAREST_INTERNAL_I8TOI8_OP VIVANTE_NAMESPACE ".vxcResize_nearest_I8toI8_op" + +/*! \brief The list of Example Kernels. + * \ingroup group_xyz_ext + */ +//! [KERNEL ENUM] +enum vx_kernel_libnnext_ext_e +{ + /*! 
\brief The Example Kernel */ + VX_KERNEL_ENUM_LIBNNEXT = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_LIBNNEXT_OFFSET, + VX_KERNEL_ENUM_PERMUTECWH = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_PREMUTE_OFFSET, + VX_KERNEL_ENUM_PERMUTECHW = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_PREMUTE_OFFSET + 1, + VX_KERNEL_ENUM_PRIORBOX = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_PRIORBOX_OFFSET, + VX_KERNEL_ENUM_FLATTEN = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_FLATTEN_OFFSET, + VX_KERNEL_ENUM_L2NORMALIZESCALE = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_L2NORMALIZESCALE_OFFSET, + VX_KERNEL_ENUM_L2NORMSCALE_SUMRSQRT = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_L2NORMALIZESCALE_OFFSET + 1, + VX_KERNEL_ENUM_L2NORMSCALE_MULSCALE = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_L2NORMALIZESCALE_OFFSET + 2, + VX_KERNEL_ENUM_PARAMETRICRELU = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_PARAMETRICRELU_OFFSET, + VX_KERNEL_ENUM_PREBBOX = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_PREBBOX_OFFSET, + VX_KERNEL_ENUM_ADD_RELU_KERNEL = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_ADD_RELU_KERNEL_OFFSET, + VX_KERNEL_ENUM_POOLING_WITH_ARGMAX = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_POOLING_WITH_ARGMAX_OFFSET, + VX_KERNEL_ENUM_UNPOOLING = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_UNPOOLING_OFFSET, + VX_KERNEL_ENUM_ARGMAX = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_ARGMAX_OFFSET, + VX_KERNEL_ENUM_ALEXNET_GEMM = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_ALEXNET_GEMM_OFFSET, + VX_KERNEL_ENUM_IMG2COL_DILATED = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_IMG2COL_DILATED_OFFSET, + VX_KERNEL_ENUM_IMG2COL_DILATED_INT8 = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_IMG2COL_DILATED_INT8_OFFSET, + VX_KERNEL_ENUM_ALEXNET_GEMM_INT8 = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_ALEXNET_GEMM_INT8_OFFSET, + VX_KERNEL_ENUM_MAXIMUM = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_ELTWISE_MAX, + VX_KERNEL_ENUM_FULLYCONNECTED_AXIS2 = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_FULLYCONNECTED_AXIS2, + VX_KERNEL_ENUM_TENSORCROP_INT16 = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_TENSORCROP_INT16, + VX_KERNEL_ENUM_TENSORCROP_INT8 = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_TENSORCROP_INT8, + VX_KERNEL_ENUM_TENSORCROP_INT16_FP16 = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_TENSORCROP_INT16_FP16, + VX_KERNEL_ENUM_DROPOUT = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_DROPOUT, + VX_KERNEL_ENUM_SHUFFLECHANNEL = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_SHUFFLECHANNEL, + VX_KERNEL_ENUM_RESIZE = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_RESIZE, + VX_KERNEL_ENUM_REVERSE = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_REVERSE, + VX_KERNEL_ENUM_RESIZE_16BITS_DOWNSAMPLE_QUARTER = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_RESIZE_16BITS_DOWNSAMPLE_QUARTER, + VX_KERNEL_ENUM_RESIZE_8BITS_DOWNSAMPLE_QUARTER = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + 
KERNEL_ENUM_RESIZE_8BITS_DOWNSAMPLE_QUARTER, + VX_KERNEL_ENUM_SCALE = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_SCALE, + VX_KERNEL_ENUM_TENSORREVERSE = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_TENSORREVERSE, + VX_KERNEL_ENUM_TENSORELU = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_TENSORELU_OFFSET, + VX_KERNEL_ENUM_SPACE2BATCH = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_SPACE2BATCH, + VX_KERNEL_ENUM_BATCH2SPACE = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_BATCH2SPACE, + VX_KERNEL_ENUM_SPACE2DEPTH = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_SPACE2DEPTH, + VX_KERNEL_ENUM_IMAGEPROCESS = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_IMAGEPROCESS, + VX_KERNEL_ENUM_SCALETOTENSOR = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_SCALETOTENSOR, + VX_KERNEL_ENUM_GRAYSCALETOTENSOR = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_GRAYSCALETOTENSOR, + VX_KERNEL_ENUM_GEMM = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_GEMM, + VX_KERNEL_ENUM_LAYERNORM = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_LAYERNORM, + VX_KERNEL_ENUM_LAYERNORM_FP16TOU8 = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_LAYERNORMFP16TOU8_OFFSET, + VX_KERNEL_ENUM_REDUCE = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_REDUCE, + VX_KERNEL_ENUM_INSTANCENORM = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_INSTANCENORM, + VX_KERNEL_ENUM_TENSORSTACKCONCAT = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_TENSORSTACKCONCAT, + VX_KERNEL_ENUM_TENSORSTACKCONCAT8BITS = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_TENSORSTACKCONCAT8BITS_OFFSET, + VX_KERNEL_ENUM_SIGNALFRAME = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_SIGNALFRAME, + VX_KERNEL_ENUM_RELATIONALOPS = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_RELATIONALOPS, + VX_KERNEL_ENUM_POW = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_POW, + VX_KERNEL_ENUM_FLOORDIV = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_FLOORDIV, + VX_KERNEL_ENUM_SPATIAL_TRANSFORMER = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_SPATIAL_TRANSFORMER, + VX_KERNEL_ENUM_LOGICAL_OPS = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_LOGICAL_OPS, + VX_KERNEL_ENUM_SELECT = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_SELECT, + VX_KERNEL_ENUM_LSTMUNIT_ACTIVATION = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_LSTMUNIT_ACTIVATION, + VX_KERNEL_ENUM_TENSOR_ADD_MEAN_STDDEV_NORM = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_TENSOR_ADD_MEAN_STDDEV_NORM, + VX_KERNEL_ENUM_STACK = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_STACK, + VX_KERNEL_ENUM_NEG = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_NEG, + VX_KERNEL_ENUM_TENSOR_EXP = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_EXP, + VX_KERNEL_ENUM_CLIP = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_CLIP, + VX_KERNEL_ENUM_PRE_PROCESS_GRAY = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_PRE_PROCESS_GRAY, + VX_KERNEL_ENUM_UNSTACK = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_UNSTACK, + VX_KERNEL_ENUM_PRE_PROCESS_RGB = + 
VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_PRE_PROCESS_RGB, + VX_KERNEL_ENUM_ADDN = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_ADDN, + VX_KERNEL_ENUM_PRE_PROCESS_YUV420 = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_PRE_PROCESS_YUV420, + VX_KERNEL_ENUM_CONV2D = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_CONV2D, + VX_KERNEL_ENUM_EXTRA_ENDING = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_EXTRA_ENDING, + VX_KERNEL_ENUM_GATHER = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_GATHER, + VX_KERNEL_ENUM_TILE = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_TILE, + VX_KERNEL_ENUM_TOPK = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_TOPK, + VX_KERNEL_ENUM_PRE_PROCESS_BGRA = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_PRE_PROCESS_BGRA, + VX_KERNEL_ENUM_LOGICAL_NOT = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_LOGICAL_NOT, + VX_KERNEL_ENUM_TENSOR_SIN = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_SIN, + VX_KERNEL_ENUM_TENSOR_LOG = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_LOG, + VX_KERNEL_ENUM_ARGMIN = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_ARGMIN, + VX_KERNEL_ENUM_ROI_ALIGN = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_ROI_ALIGN, + VX_KERNEL_ENUM_HEATMAP_MAX_KEYPOINT = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_HEATMAP_MAX_KEYPOINT, + VX_KERNEL_ENUM_AXIS_ALIGNED_BBOX_TRANSFORM = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_AXIS_ALIGNED_BBOX_TRANSFORM, + VX_KERNEL_ENUM_BOX_WITH_NMS_LIMIT = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_BOX_WITH_NMS_LIMIT, + VX_KERNEL_ENUM_GENERATE_PROPOSALS = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_GENERATE_PROPOSALS, + VX_KERNEL_ENUM_DETECTION_POSTPROCESS = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_DETECTION_POSTPROCESS, + VX_KERNEL_ENUM_RANDOM_MULTINOMIAL = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_RANDOM_MULTINOMIAL, + VX_KERNEL_ENUM_LOG_SOFTMAX = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_LOG_SOFTMAX, + VX_KERNEL_ENUM_RELU_KERAS_INTERNAL = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_RELU_KERAS_INTERNAL, + VX_KERNEL_ENUM_DECONV2D = VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_DECONV2D, + VX_KERNEL_ENUM_REDUCEMAX_INTERNAL = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_REDUCEMAX_INTERNAL, + VX_KERNEL_ENUM_REDUCEMIN_INTERNAL = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_REDUCEMIN_INTERNAL, + VX_KERNEL_ENUM_REDUCEPROD_INTERNAL = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_REDUCEPROD_INTERNAL, + VX_KERNEL_ENUM_REDUCEALL_INTERNAL = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_REDUCEALL_INTERNAL, + VX_KERNEL_ENUM_REDUCEANY_INTERNAL = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_REDUCEANY_INTERNAL, + VX_KERNEL_ENUM_RESIZE_INTERNAL = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_RESIZE_INTERNAL, + VX_KERNEL_ENUM_RESIZE_NEAREST_INTERNAL = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + KERNEL_ENUM_RESIZE_NEAREST_INTERNAL, + VX_KERNEL_ENUM_PRE_PROCESS_YUV444 = + VX_KERNEL_BASE(VX_ID_DEFAULT, VX_LIBRARY_LIBNNEXT) + 
KERNEL_ENUM_PRE_PROCESS_YUV444, + // up to 0xFFF kernel enums can be created. +}; + + + +/* Assigned from Khronos, for custom */ +#define VX_LIBRARY_CUSTOM (0x4) +enum vx_kernel_custom_id_e +{ + _VX_CLIENT_ID_START = VX_KERNEL_BASE( VX_ID_DEFAULT, VX_LIBRARY_CUSTOM ), +#define DEF_OP( name ) VX_CLIENT_ID_##name, + #include "custom/custom_ops.def" +#undef DEF_OP +}; +#define VX_KERNEL_ID( name ) VX_CLIENT_ID_##name + +#ifndef gvxOBJ_CHECK +#define gvxOBJ_CHECK(ref) \ + do \ + { \ + status = vxGetStatus((vx_reference)ref); \ + if (ref == 0 || status != VX_SUCCESS) \ + { \ + printf("Obj ERROR: status=%d @ %s(%d)\n", status, __FUNCTION__, __LINE__); \ + } \ + } \ + while (0) +#endif +#ifndef gvxSTATUS_CHECK +#define gvxSTATUS_CHECK(status) \ + do \ + { \ + if (status != VX_SUCCESS) \ + { \ + printf("status ERROR: status=%d @ %s(%d)\n", status, __FUNCTION__, __LINE__); \ + } \ + } \ + while (0) +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_activations.h b/src/tim/vx/internal/include/ops/vsi_nn_op_activations.h new file mode 100644 index 0000000..b5a8bf4 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_activations.h @@ -0,0 +1,46 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
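/*
 * Illustrative usage sketch for the gvxOBJ_CHECK / gvxSTATUS_CHECK helpers
 * defined above (the function name is hypothetical; vxCreateContext,
 * vxGetStatus and vxReleaseContext are standard OpenVX calls). Both macros
 * only print a diagnostic and fall through, and gvxOBJ_CHECK expects a
 * vx_status variable named `status` to already exist in the enclosing scope.
 */
static vx_status example_check_context(void)
{
    vx_status status = VX_SUCCESS;
    vx_context context = vxCreateContext();

    gvxOBJ_CHECK(context);              /* reports a NULL or invalid context */

    status = vxReleaseContext(&context);
    gvxSTATUS_CHECK(status);            /* reports any non-VX_SUCCESS status */

    return status;
}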
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_ACTIVATIONS_H +#define _VSI_NN_OP_ACTIVATIONS_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_activation_param +{ + union + { + float leaky_ratio; + }; +} vsi_nn_activation_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_addn.h b/src/tim/vx/internal/include/ops/vsi_nn_op_addn.h new file mode 100644 index 0000000..4e4f827 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_addn.h @@ -0,0 +1,50 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_ADDN_H +#define _VSI_NN_OP_ADDN_H + +#include "vsi_nn_platform.h" +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_addn_lcl_data +{ + uint32_t input_num; +} vsi_nn_addn_lcl_data; + +typedef struct _vsi_nn_addn_param +{ + /* local data must be the first. */ + vsi_nn_addn_lcl_data * lcl_data; +} vsi_nn_addn_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_argmax.h b/src/tim/vx/internal/include/ops/vsi_nn_op_argmax.h new file mode 100644 index 0000000..2ec4172 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_argmax.h @@ -0,0 +1,122 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_ARGMAX_H +#define _VSI_NN_OP_ARGMAX_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define VSI_NN_ARGMAX_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ + VSI_NN_ARGMAX_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, + +enum { + ARGMAX_CPU_KERNEL, + + /* axis 0 */ + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, I8, U8, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, I8, I16, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, U8, U8, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, U8, I16, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, I16, U8, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, I16, I16, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, F16, U8, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, F16, I16, IMAGE_ARRAY) + + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, I8, U8, IMAGE_2D) + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, I8, I16, IMAGE_2D) + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, U8, U8, IMAGE_2D) + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, U8, I16, IMAGE_2D) + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, I16, U8, IMAGE_2D) + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, I16, I16, IMAGE_2D) + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, F16, U8, IMAGE_2D) + VSI_NN_ARGMAX_SH_KERNEL_IDX(0, F16, I16, IMAGE_2D) + + /* axis 1 */ + VSI_NN_ARGMAX_SH_KERNEL_IDX(1, I8, U8, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(1, I8, I16, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(1, U8, U8, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(1, U8, I16, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(1, I16, U8, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(1, I16, I16, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(1, F16, U8, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(1, F16, I16, IMAGE_ARRAY) + + /* axis 2 */ + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, I8, U8, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, I8, I16, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, U8, U8, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, U8, I16, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, I16, U8, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, I16, I16, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, F16, U8, IMAGE_ARRAY) + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, F16, I16, IMAGE_ARRAY) + + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, I8, U8, IMAGE_2D) + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, I8, I16, IMAGE_2D) + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, U8, U8, IMAGE_2D) + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, U8, I16, IMAGE_2D) + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, I16, U8, IMAGE_2D) + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, I16, I16, IMAGE_2D) + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, F16, U8, IMAGE_2D) + VSI_NN_ARGMAX_SH_KERNEL_IDX(2, F16, I16, IMAGE_2D) +}; + +enum { + ARGMAX_INPUT = 0, + + ARGMAX_INPUTS_COUNT, + + ARGMAX_OUTPUT = 0, + + ARGMAX_OUTPUTS_COUNT, + + ARGMAX_PARAM_COUT = ARGMAX_INPUTS_COUNT + ARGMAX_OUTPUTS_COUNT, +}; + +#define _VSI_NN_ARGMAX_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_argmax_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_ARGMAX_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_argmax_lcl_data; + +typedef struct _vsi_nn_argmax_param +{ + /* argmax layer local data structure */ + vsi_nn_argmax_lcl_data local; + int32_t axis; +} vsi_nn_argmax_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git 
a/src/tim/vx/internal/include/ops/vsi_nn_op_argmin.h b/src/tim/vx/internal/include/ops/vsi_nn_op_argmin.h new file mode 100644 index 0000000..7f43ec8 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_argmin.h @@ -0,0 +1,114 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_ARGMIN_H +#define _VSI_NN_OP_ARGMIN_H + +#include "vsi_nn_types.h" + + +#define VSI_NN_ARGMIN_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ + VSI_NN_ARGMIN_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, + +enum { + ARGMIN_CPU_KERNEL, + + /* axis 0 */ + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, I8, U8, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, I8, I16, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, U8, U8, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, U8, I16, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, I16, U8, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, I16, I16, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, F16, U8, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, F16, I16, IMAGE_ARRAY) + + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, I8, U8, IMAGE_2D) + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, I8, I16, IMAGE_2D) + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, U8, U8, IMAGE_2D) + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, U8, I16, IMAGE_2D) + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, I16, U8, IMAGE_2D) + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, I16, I16, IMAGE_2D) + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, F16, U8, IMAGE_2D) + VSI_NN_ARGMIN_SH_KERNEL_IDX(0, F16, I16, IMAGE_2D) + + /* axis 1 */ + VSI_NN_ARGMIN_SH_KERNEL_IDX(1, I8, U8, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(1, I8, I16, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(1, U8, U8, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(1, U8, I16, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(1, I16, U8, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(1, I16, I16, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(1, F16, U8, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(1, F16, I16, IMAGE_ARRAY) + + /* axis 2 */ + VSI_NN_ARGMIN_SH_KERNEL_IDX(2, I8, U8, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(2, I8, I16, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(2, U8, U8, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(2, U8, I16, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(2, I16, U8, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(2, I16, I16, IMAGE_ARRAY) + VSI_NN_ARGMIN_SH_KERNEL_IDX(2, F16, U8, IMAGE_ARRAY) + 
VSI_NN_ARGMIN_SH_KERNEL_IDX(2, F16, I16, IMAGE_ARRAY) + + VSI_NN_ARGMIN_SH_KERNEL_IDX(2, I8, U8, IMAGE_2D) + VSI_NN_ARGMIN_SH_KERNEL_IDX(2, I8, I16, IMAGE_2D) + VSI_NN_ARGMIN_SH_KERNEL_IDX(2, U8, U8, IMAGE_2D) + VSI_NN_ARGMIN_SH_KERNEL_IDX(2, U8, I16, IMAGE_2D) + VSI_NN_ARGMIN_SH_KERNEL_IDX(2, I16, U8, IMAGE_2D) + VSI_NN_ARGMIN_SH_KERNEL_IDX(2, I16, I16, IMAGE_2D) + VSI_NN_ARGMIN_SH_KERNEL_IDX(2, F16, U8, IMAGE_2D) + VSI_NN_ARGMIN_SH_KERNEL_IDX(2, F16, I16, IMAGE_2D) +}; + +enum { + ARGMIN_INPUT = 0, + + ARGMIN_INPUTS_COUNT, + + ARGMIN_OUTPUT = 0, + + ARGMIN_OUTPUTS_COUNT, + + ARGMIN_PARAM_COUT = ARGMIN_INPUTS_COUNT + ARGMIN_OUTPUTS_COUNT, +}; + +#define _VSI_NN_ARGMIN_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_argmin_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_ARGMIN_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_argmin_lcl_data; + +typedef struct _vsi_nn_argmin_param +{ + /* argmin layer local data structure */ + vsi_nn_argmin_lcl_data local; + int32_t axis; +} vsi_nn_argmin_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_axis_aligned_bbox_transform.h b/src/tim/vx/internal/include/ops/vsi_nn_op_axis_aligned_bbox_transform.h new file mode 100644 index 0000000..fcdd425 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_axis_aligned_bbox_transform.h @@ -0,0 +1,35 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
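/*
 * Worked expansion of the VSI_NN_ARGMAX_SH_KERNEL_IDX / VSI_NN_ARGMIN_SH_KERNEL_IDX
 * helpers used in the two enums above. The macros token-paste their arguments, e.g.
 *
 *     VSI_NN_ARGMIN_SH_KERNEL_IDX(0, I8, U8, IMAGE_2D)
 * expands to
 *     VSI_NN_ARGMIN_AXIS0_I8TOU8_IMAGE_2D_KERNEL,
 *
 * so each enum defines one shader-kernel index per (axis, input type, output
 * type, image layout) combination, while the leading *_CPU_KERNEL entry
 * (value 0) presumably selects the software fallback path.
 */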
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_AXIS_ALIGNED_BBOX_TRANSFORM_H +#define _VSI_NN_OP_AXIS_ALIGNED_BBOX_TRANSFORM_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_axis_aligned_bbox_transform_param +{ + vsi_enum type; +} vsi_nn_axis_aligned_bbox_transform_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_batch2space.h b/src/tim/vx/internal/include/ops/vsi_nn_op_batch2space.h new file mode 100644 index 0000000..dce07c5 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_batch2space.h @@ -0,0 +1,53 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_BATCH2SPACE_H +#define _VSI_NN_OP_BATCH2SPACE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_batch2space_lcl_data_t +{ + vsi_nn_tensor_t *block_size_tensor; +} vsi_nn_batch2space_lcl_data_t; + +typedef struct _vsi_nn_batch2space_param +{ + /* local data must be the first. */ + vsi_nn_batch2space_lcl_data_t local; + + const int32_t *block_size; + uint32_t block_size_num; + int32_t crop[4]; // [left, right, top, bottom] +} vsi_nn_batch2space_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_batch_norm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_batch_norm.h new file mode 100644 index 0000000..c4dd840 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_batch_norm.h @@ -0,0 +1,50 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_BATCH_NORM_H +#define _VSI_NN_OP_BATCH_NORM_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_batcnnorm_lcl_data +{ + vsi_nn_tensor_t *reshaped_input; + vsi_nn_tensor_t *reshaped_output; +} vsi_nn_batcnnorm_lcl_data; + +typedef struct _vsi_nn_batch_norm_param +{ + float eps; + vsi_nn_batcnnorm_lcl_data *local; +} vsi_nn_batch_norm_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_batchnorm_single.h b/src/tim/vx/internal/include/ops/vsi_nn_op_batchnorm_single.h new file mode 100644 index 0000000..36ccbfc --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_batchnorm_single.h @@ -0,0 +1,54 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_BATCHNORM_SINGLE_H +#define _VSI_NN_OP_BATCHNORM_SINGLE_H + +#include "vsi_nn_types.h" + +/* enum for inputs/outputs */ +enum +{ + BATCHNORM_INPUT = 0, + BATCHNORM_INPUT_MEAN = 1, + + BATCHNORM_INPUT_VARIANCE = 2, + BATCHNORM_INPUT_GAMMA = 3, + BATCHNORM_INPUT_BETA = 4, + + BATCHNORM_INPUT_CNT, + + BATCHNORM_OUTPUT = 0, + + BATCHNORM_OUTPUT_CNT +}; + +typedef struct _vsi_nn_batchnorm_single_param +{ + // Add parameters here + float eps; +} vsi_nn_batchnorm_single_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h new file mode 100644 index 0000000..b183d9a --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h @@ -0,0 +1,136 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
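/*
 * For reference: the BATCHNORM_* inputs enumerated above follow the textbook
 * batch-normalization formula (stated here as a reading aid, not taken from
 * the original sources):
 *
 *     OUTPUT = GAMMA * (INPUT - MEAN) / sqrt(VARIANCE + eps) + BETA
 *
 * where eps is the vsi_nn_batchnorm_single_param.eps field declared above.
 */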
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_BIDIRECTIONAL_SEQUENCE_LSTM_H +#define _VSI_NN_OP_BIDIRECTIONAL_SEQUENCE_LSTM_H + +#include "vsi_nn_types.h" + +enum +{ + BI_LSTM_INPUT_INPUT = 0, + + BI_LSTM_FW_INPUT_WEIGHT_I2I = 1, + BI_LSTM_FW_INPUT_WEIGHT_I2F = 2, + BI_LSTM_FW_INPUT_WEIGHT_I2C = 3, + BI_LSTM_FW_INPUT_WEIGHT_I2O = 4, + + BI_LSTM_FW_INPUT_WEIGHT_R2I = 5, + BI_LSTM_FW_INPUT_WEIGHT_R2F = 6, + BI_LSTM_FW_INPUT_WEIGHT_R2C = 7, + BI_LSTM_FW_INPUT_WEIGHT_R2O = 8, + + BI_LSTM_FW_INPUT_WEIGHT_C2I = 9, + BI_LSTM_FW_INPUT_WEIGHT_C2F = 10, + BI_LSTM_FW_INPUT_WEIGHT_C2O = 11, + + BI_LSTM_FW_INPUT_BIAS_I = 12, + BI_LSTM_FW_INPUT_BIAS_F = 13, + BI_LSTM_FW_INPUT_BIAS_C = 14, + BI_LSTM_FW_INPUT_BIAS_O = 15, + + BI_LSTM_FW_INPUT_WEIGHT_PROJ = 16, + BI_LSTM_FW_INPUT_BIAS_PROJ = 17, + + BI_LSTM_BW_INPUT_WEIGHT_I2I = 18, + BI_LSTM_BW_INPUT_WEIGHT_I2F = 19, + BI_LSTM_BW_INPUT_WEIGHT_I2C = 20, + BI_LSTM_BW_INPUT_WEIGHT_I2O = 21, + + BI_LSTM_BW_INPUT_WEIGHT_R2I = 22, + BI_LSTM_BW_INPUT_WEIGHT_R2F = 23, + BI_LSTM_BW_INPUT_WEIGHT_R2C = 24, + BI_LSTM_BW_INPUT_WEIGHT_R2O = 25, + + BI_LSTM_BW_INPUT_WEIGHT_C2I = 26, + BI_LSTM_BW_INPUT_WEIGHT_C2F = 27, + BI_LSTM_BW_INPUT_WEIGHT_C2O = 28, + + BI_LSTM_BW_INPUT_BIAS_I = 29, + BI_LSTM_BW_INPUT_BIAS_F = 30, + BI_LSTM_BW_INPUT_BIAS_C = 31, + BI_LSTM_BW_INPUT_BIAS_O = 32, + + BI_LSTM_BW_INPUT_WEIGHT_PROJ = 33, + BI_LSTM_BW_INPUT_BIAS_PROJ = 34, + + BI_LSTM_FW_INPUT_H_STATE = 35, + BI_LSTM_FW_INPUT_C_STATE = 36, + + BI_LSTM_BW_INPUT_H_STATE = 37, + BI_LSTM_BW_INPUT_C_STATE = 38, + + BI_LSTM_AUX_INPUT = 39, + + BI_LSTM_FW_AUX_INPUT_WEIGHT_I2I = 40, + BI_LSTM_FW_AUX_INPUT_WEIGHT_I2F = 41, + BI_LSTM_FW_AUX_INPUT_WEIGHT_I2C = 42, + BI_LSTM_FW_AUX_INPUT_WEIGHT_I2O = 43, + + BI_LSTM_BW_AUX_INPUT_WEIGHT_I2I = 44, + BI_LSTM_BW_AUX_INPUT_WEIGHT_I2F = 45, + BI_LSTM_BW_AUX_INPUT_WEIGHT_I2C = 46, + BI_LSTM_BW_AUX_INPUT_WEIGHT_I2O = 47, + + BI_LSTM_FW_INPUT_LAYERNORM_I = 48, + BI_LSTM_FW_INPUT_LAYERNORM_F = 49, + BI_LSTM_FW_INPUT_LAYERNORM_C = 50, + BI_LSTM_FW_INPUT_LAYERNORM_O = 51, + + BI_LSTM_BW_INPUT_LAYERNORM_I = 52, + BI_LSTM_BW_INPUT_LAYERNORM_F = 53, + BI_LSTM_BW_INPUT_LAYERNORM_C = 54, + BI_LSTM_BW_INPUT_LAYERNORM_O = 55, + + BI_LSTM_INPUT_CNT, + + BI_LSTM_FW_OUTPUT_OUTPUT = 0, + BI_LSTM_BW_OUTPUT_OUTPUT = 1, + BI_LSTM_OUTPUT_CNT +}; + +typedef struct _vsi_nn_bidirectional_sequence_lstm_lcl_data_t +{ + vsi_bool use_cifg; + vsi_bool use_layer_norm; + vsi_bool use_projection; + vsi_bool use_projection_bias; + vsi_bool use_hybrid; + vsi_bool multi_batch; +} vsi_nn_bidirectional_sequence_lstm_lcl_data_t; + +typedef struct _vsi_nn_bidirectional_sequence_lstm_param +{ + vsi_nn_bidirectional_sequence_lstm_lcl_data_t *local; + vsi_bool time_major; + vsi_bool merge_outputs; + vsi_nn_activation_e activation; + float cell_clip; + float proj_clip; + float forget_bias; + vsi_nn_activation_e recurrent_activation; + vsi_nn_dtype_t *internal_dtype; +} vsi_nn_bidirectional_sequence_lstm_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h new file mode 100644 index 0000000..1c59ee3 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h @@ -0,0 +1,66 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this 
software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_BIDIRECTIONAL_SEQUENCE_RNN_H +#define _VSI_NN_OP_BIDIRECTIONAL_SEQUENCE_RNN_H + +#include "vsi_nn_types.h" +#include "vsi_nn_op_rnn.h" + +/* enum for inputs/outputs */ +enum +{ + BI_RNN_INPUT_INPUT = 0, + + BI_RNN_FW_INPUT_WEIGHT_I = 1, + BI_RNN_FW_INPUT_WEIGHT_H = 2, + BI_RNN_FW_INPUT_BIAS = 3, + BI_RNN_FW_INPUT_H_STATE = 4, + + BI_RNN_BW_INPUT_WEIGHT_I = 5, + BI_RNN_BW_INPUT_WEIGHT_H = 6, + BI_RNN_BW_INPUT_BIAS = 7, + BI_RNN_BW_INPUT_H_STATE = 8, + + BI_RNN_AUX_INPUT = 9, + BI_RNN_FW_AUX_INPUT_WEIGHT = 10, + BI_RNN_BW_AUX_INPUT_WEIGHT = 11, + + BI_RNN_INPUT_CNT, + + BI_RNN_FW_OUTPUT_OUTPUT = 0, + BI_RNN_BW_OUTPUT_OUTPUT = 1, + BI_RNN_OUTPUT_CNT +}; + + +typedef struct _vsi_nn_bidirectional_sequence_rnn_param +{ + vsi_bool time_major; + vsi_bool merge_outputs; + vsi_nn_activation_e activation; + vsi_nn_dtype_t* internal_dtype; +} vsi_nn_bidirectional_sequence_rnn_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_box_with_nms_limit.h b/src/tim/vx/internal/include/ops/vsi_nn_op_box_with_nms_limit.h new file mode 100644 index 0000000..b4af7e4 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_box_with_nms_limit.h @@ -0,0 +1,40 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_BOX_WITH_NMS_LIMIT_H +#define _VSI_NN_OP_BOX_WITH_NMS_LIMIT_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_box_with_nms_limit_param +{ + float score_threshold; + int32_t max_num_bbox; + int32_t nms_kernel_method; + float iou_threshold; + float sigma; + float nms_score_threshold; +} vsi_nn_box_with_nms_limit_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_cast.h b/src/tim/vx/internal/include/ops/vsi_nn_op_cast.h new file mode 100644 index 0000000..86fa568 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_cast.h @@ -0,0 +1,37 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CAST_H +#define _VSI_NN_OP_CAST_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_cast_param +{ + // Add parameters here + int32_t nothing; +} vsi_nn_cast_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_clip.h b/src/tim/vx/internal/include/ops/vsi_nn_op_clip.h new file mode 100644 index 0000000..4c4061d --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_clip.h @@ -0,0 +1,106 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIP_H +#define _VSI_NN_OP_CLIP_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define VSI_NN_CLIP_SH_KERNEL_IDX(_INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ + VSI_NN_CLIP_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, + + +enum { + CLIP_CPU_KERNEL, + VSI_NN_CLIP_SH_KERNEL_IDX(F16, F16, IMAGE) + VSI_NN_CLIP_SH_KERNEL_IDX(F16, I16, IMAGE) + VSI_NN_CLIP_SH_KERNEL_IDX(F16, I8, IMAGE) + VSI_NN_CLIP_SH_KERNEL_IDX(F16, U8, IMAGE) + VSI_NN_CLIP_SH_KERNEL_IDX(I16, F16, IMAGE) + VSI_NN_CLIP_SH_KERNEL_IDX(I8, F16, IMAGE) + VSI_NN_CLIP_SH_KERNEL_IDX(U8, F16, IMAGE) + VSI_NN_CLIP_SH_KERNEL_IDX(I16, I16, IMAGE) + VSI_NN_CLIP_SH_KERNEL_IDX(I8, I8, IMAGE) + VSI_NN_CLIP_SH_KERNEL_IDX(U8, U8, IMAGE) + VSI_NN_CLIP_SH_KERNEL_IDX(F16, F16, IMAGE_2D) + VSI_NN_CLIP_SH_KERNEL_IDX(F16, I16, IMAGE_2D) + VSI_NN_CLIP_SH_KERNEL_IDX(F16, I8, IMAGE_2D) + VSI_NN_CLIP_SH_KERNEL_IDX(F16, U8, IMAGE_2D) + VSI_NN_CLIP_SH_KERNEL_IDX(I16, F16, IMAGE_2D) + VSI_NN_CLIP_SH_KERNEL_IDX(I8, F16, IMAGE_2D) + VSI_NN_CLIP_SH_KERNEL_IDX(U8, F16, IMAGE_2D) + VSI_NN_CLIP_SH_KERNEL_IDX(I16, I16, IMAGE_2D) + VSI_NN_CLIP_SH_KERNEL_IDX(I8, I8, IMAGE_2D) + VSI_NN_CLIP_SH_KERNEL_IDX(U8, U8, IMAGE_2D) + CLIP_KERNEL_COUNTS, +}; + +enum { + CLIP_INPUT = 0, + + CLIP_INPUTS_COUNT, + + CLIP_OUTPUT = 0, + + CLIP_OUTPUTS_COUNT, + + CLIP_PARAM_COUT = CLIP_INPUTS_COUNT + CLIP_OUTPUTS_COUNT, +}; + +#define _VSI_NN_CLIP_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_clip_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_CLIP_LOCAL_TENSOR_NUM]; +} vsi_nn_clip_lcl_data; + +typedef struct _vsi_nn_clip_lcl2_data +{ + uint32_t hash_idx; + vsi_bool execute_on_sw; + vsi_bool enable_image_2d; + uint32_t sizes0[VSI_NN_MAX_DIM_NUM]; + uint32_t sizes1[VSI_NN_MAX_DIM_NUM]; + uint32_t dim_num; +} vsi_nn_clip_lcl2_data; + +typedef struct _vsi_nn_clip_param +{ + /* local data must be the first. */ + vsi_nn_clip_lcl_data local; + float min; + float max; + vsi_nn_clip_lcl2_data *local2; +} vsi_nn_clip_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_concat.h b/src/tim/vx/internal/include/ops/vsi_nn_op_concat.h new file mode 100644 index 0000000..b597b81 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_concat.h @@ -0,0 +1,67 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
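/*
 * Hypothetical illustration of filling the vsi_nn_clip_param declared above:
 * clamp values into [0, 6], i.e. a ReLU6-style clip. The `local` and `local2`
 * bookkeeping members are left zero-initialized here.
 */
static const vsi_nn_clip_param example_clip_param = {
    .min = 0.0f,   /* lower clamp bound */
    .max = 6.0f,   /* upper clamp bound */
};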
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CONCAT_H +#define _VSI_NN_OP_CONCAT_H + +#include "vsi_nn_platform.h" +#include "vsi_nn_types.h" +#include "utils/vsi_nn_link_list.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_concat_lcl_data +{ + vsi_nn_link_list_t link_list; + union + { + /* used for optimze concat to tensor view */ + struct + { + vx_node cp_node; + vx_tensor src_tensor; + vx_tensor dst_tensor; + }; + /* used for vxConcatIndefiniteLayer */ + struct + { + vx_object_array array; + }; + }; +} vsi_nn_concat_lcl_data; + +typedef struct _vsi_nn_concat_param +{ + /* local data must be the first. */ + vsi_nn_concat_lcl_data * lcl_data; + uint32_t axis; +} vsi_nn_concat_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_concatshift.h b/src/tim/vx/internal/include/ops/vsi_nn_op_concatshift.h new file mode 100644 index 0000000..1e8f184 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_concatshift.h @@ -0,0 +1,57 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CONCATSHIFT_H +#define _VSI_NN_OP_CONCATSHIFT_H + +#include "vsi_nn_platform.h" +#include "vsi_nn_types.h" +#include "vsi_nn_op_concat.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_concatshift_lcl_data +{ + /* used for optimze concat to tensor view */ + vx_node cp_node; + vx_tensor src_tensor; +} vsi_nn_concatshift_lcl_data; + +typedef struct _vsi_nn_concatshift_param +{ + /* local data must be the first. 
*/ + vsi_nn_concat_lcl_data * concat_lcl_data; + uint32_t axis; + + vsi_nn_concatshift_lcl_data * lcl_data; + uint32_t keep_size; +} vsi_nn_concatshift_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h new file mode 100644 index 0000000..9d216ff --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h @@ -0,0 +1,62 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CONV1D_H +#define _VSI_NN_OP_CONV1D_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_conv1d_lcl_data_t +{ + vx_tensor input_tensor; + vx_tensor weight_tensor; + vx_tensor output_tensor; +} vsi_nn_conv1d_lcl_data_t; + +typedef struct _vsi_nn_conv1d_param +{ + /* local data must be the first. */ + vsi_nn_conv1d_lcl_data_t local; + + uint32_t ksize; + uint32_t stride; + /* Pad left, right */ + uint32_t pad[2]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; + uint32_t weights; + uint32_t group; + uint32_t dilation; + int32_t multiplier; +} vsi_nn_conv1d_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d.h new file mode 100644 index 0000000..282c988 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d.h @@ -0,0 +1,52 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CONV2D_H +#define _VSI_NN_OP_CONV2D_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_conv2d_param +{ + uint32_t ksize[2]; + uint32_t stride[2]; + /* Pad left, right, top, bottom */ + uint32_t pad[4]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; + uint32_t weights; + uint32_t group; + uint32_t dilation[2]; + int32_t multiplier; +} vsi_nn_conv2d_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv_relu.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv_relu.h new file mode 100644 index 0000000..205520d --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv_relu.h @@ -0,0 +1,50 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
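/*
 * Hypothetical sketch of filling the vsi_nn_conv2d_param declared above for a
 * 3x3, stride-1 convolution with 64 output channels. Field meanings follow the
 * comments in the struct; VSI_NN_PAD_AUTO and the reading of `weights` as the
 * output-channel count are assumptions made for illustration only.
 */
static const vsi_nn_conv2d_param example_conv2d_param = {
    .ksize      = { 3, 3 },        /* kernel width, height */
    .stride     = { 1, 1 },
    .pad        = { 1, 1, 1, 1 },  /* left, right, top, bottom */
    .pad_type   = VSI_NN_PAD_AUTO, /* "default value shall be AUTO" per the comment above */
    .weights    = 64,              /* output channel count (assumption) */
    .group      = 1,
    .dilation   = { 1, 1 },
    .multiplier = 0,               /* non-zero only for depthwise use (assumption) */
};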
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CONV_RELU_H +#define _VSI_NN_OP_CONV_RELU_H + +#include "vsi_nn_node.h" +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +vsi_status vsi_nn_InitConvReluPoolParameter + ( + vsi_nn_node_t * node, + vx_nn_convolution_relu_pooling_params_ext2_t * param, + vsi_bool has_pool + ); + +void vsi_nn_DeinitConvReluPoolParameter + ( + vx_nn_convolution_relu_pooling_params_ext2_t * param + ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_crop.h b/src/tim/vx/internal/include/ops/vsi_nn_op_crop.h new file mode 100644 index 0000000..5a74974 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_crop.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_CROP_H +#define _VSI_NN_OP_CLIENT_CROP_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_crop_param +{ + int32_t axis; + uint32_t dims; + uint32_t offset[VSI_NN_MAX_DIM_NUM]; +} vsi_nn_crop_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_dataconvert.h b/src/tim/vx/internal/include/ops/vsi_nn_op_dataconvert.h new file mode 100644 index 0000000..bf4cfcb --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_dataconvert.h @@ -0,0 +1,50 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_DATACONVERT_H +#define _VSI_NN_OP_DATACONVERT_H + +#include "vsi_nn_platform.h" +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_dataconvert_lcl_data +{ + vsi_bool use_reshape; +} vsi_nn_dataconvert_lcl_data; + +typedef struct _vsi_nn_dataconvert_param +{ + /* local data must be the first. */ + vsi_nn_dataconvert_lcl_data * lcl_data; +} vsi_nn_dataconvert_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_deconvolution.h b/src/tim/vx/internal/include/ops/vsi_nn_op_deconvolution.h new file mode 100644 index 0000000..f8bc670 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_deconvolution.h @@ -0,0 +1,51 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_DECONVOLUTION_H +#define _VSI_NN_OP_DECONVOLUTION_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_deconv_param +{ + uint32_t ksize[2]; + uint32_t stride[2]; + /* Pad left, right, top, bottom */ + uint32_t pad[4]; + /* Pad type default value shall be AUTO */ + uint32_t pad_type; + uint32_t weights; + uint32_t group; + uint32_t output_padding[2]; +} vsi_nn_deconv_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_deconvolution1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_deconvolution1d.h new file mode 100644 index 0000000..62ffe24 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_deconvolution1d.h @@ -0,0 +1,57 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_DECONVOLUTION1D_H +#define _VSI_NN_OP_DECONVOLUTION1D_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_deconvolution1d_lcl_data +{ + vsi_nn_tensor_t *reshaped_weight; +} vsi_nn_deconvolution1d_lcl_data; + +typedef struct _vsi_nn_deconvolution1d_param +{ + /* local data must be the first. 
*/ + vsi_nn_deconvolution1d_lcl_data* local; + uint32_t ksize; + uint32_t stride; + /* Pad left, right */ + uint32_t pad[2]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; + uint32_t weights; + uint32_t group; + uint32_t output_padding; +} vsi_nn_deconvolution1d_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_depth2space.h b/src/tim/vx/internal/include/ops/vsi_nn_op_depth2space.h new file mode 100644 index 0000000..87c4e80 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_depth2space.h @@ -0,0 +1,52 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_DEPTH2SPACE_H +#define _VSI_NN_OP_DEPTH2SPACE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_depth2space_lcl_data_t +{ + vsi_nn_tensor_t *block_size_tensor; +} vsi_nn_depth2space_lcl_data_t; + +typedef struct _vsi_nn_depth2space_param +{ + /* local data must be the first. */ + vsi_nn_depth2space_lcl_data_t local; + + int32_t block_size; + vsi_nn_depth2space_mode_e mode; +} vsi_nn_depth2space_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_depth2space_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_depth2space_internal.h new file mode 100644 index 0000000..e1c47cd --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_depth2space_internal.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
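A minimal sketch for the vsi_nn_deconvolution1d_param block above, assuming a kernel-size-4, stride-2 transposed 1D convolution; the values, include path, and VSI_NN_PAD_AUTO enumerator are assumptions.

```c
#include <string.h>
#include "ops/vsi_nn_op_deconvolution1d.h"   /* include path is an assumption */

static void fill_deconv1d_param(vsi_nn_deconvolution1d_param *p)
{
    memset(p, 0, sizeof(*p));              /* 'local' stays NULL; the op manages it */
    p->ksize          = 4;
    p->stride         = 2;
    p->pad[0]         = 1;                 /* left  */
    p->pad[1]         = 1;                 /* right */
    p->pad_type       = VSI_NN_PAD_AUTO;   /* assumed enumerator; header says default is AUTO */
    p->weights        = 32;                /* number of output channels */
    p->group          = 1;
    p->output_padding = 0;
}
```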
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_DEPTH2SPACE_INTERNAL_H +#define _VSI_NN_OP_DEPTH2SPACE_INTERNAL_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_depth2space_internal_param +{ + int32_t block_size; + vsi_nn_depth2space_mode_e mode; +} vsi_nn_depth2space_internal_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_depthwise_conv1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_depthwise_conv1d.h new file mode 100644 index 0000000..7f7f66f --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_depthwise_conv1d.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_DEPTHWISE_CONV1D_H +#define _VSI_NN_OP_DEPTHWISE_CONV1D_H + +#include "vsi_nn_types.h" +#include "vsi_nn_prv.h" + +__BEGIN_DECLS + +typedef struct _vsi_nn_depthwise_conv1d_param +{ + uint32_t stride; + /* Pad front, end */ + uint32_t pad[2]; + uint32_t dilation; + int32_t multiplier; +} vsi_nn_depthwise_conv1d_param; + +__END_DECLS + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_detection_postprocess.h b/src/tim/vx/internal/include/ops/vsi_nn_op_detection_postprocess.h new file mode 100644 index 0000000..90fa87e --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_detection_postprocess.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_DETECTION_POSTPROCESS_H +#define _VSI_NN_OP_DETECTION_POSTPROCESS_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_detection_postprocess_param +{ + float dy; + float dx; + float dh; + float dw; + int32_t nms_type; + int32_t max_num_detections; + int32_t maximum_class_per_detection; + int32_t maximum_detection_per_class; + float score_threshold; + float iou_threshold; + int32_t is_bg_in_label; +} vsi_nn_detection_postprocess_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_divide.h b/src/tim/vx/internal/include/ops/vsi_nn_op_divide.h new file mode 100644 index 0000000..533566d --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_divide.h @@ -0,0 +1,43 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
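A sketch of typical SSD-style settings for the vsi_nn_detection_postprocess_param block above. The values mirror common TFLite/NNAPI detection post-processing defaults and are assumptions, as is the encoding of nms_type.

```c
#include "ops/vsi_nn_op_detection_postprocess.h"   /* include path is an assumption */

static void fill_detection_postprocess_param(vsi_nn_detection_postprocess_param *p)
{
    p->dy = 10.0f;                        /* box-center y scale */
    p->dx = 10.0f;                        /* box-center x scale */
    p->dh = 5.0f;                         /* box-height scale   */
    p->dw = 5.0f;                         /* box-width scale    */
    p->nms_type = 0;                      /* assumed: 0 selects regular NMS */
    p->max_num_detections = 10;
    p->maximum_class_per_detection = 1;
    p->maximum_detection_per_class = 10;
    p->score_threshold = 0.3f;
    p->iou_threshold = 0.6f;
    p->is_bg_in_label = 0;                /* class 0 is a real class, not background */
}
```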
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_DIVIDE_H +#define _VSI_NN_OP_DIVIDE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_divide_param +{ + float scale; +} vsi_nn_divide_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_dropout.h b/src/tim/vx/internal/include/ops/vsi_nn_op_dropout.h new file mode 100644 index 0000000..60b61d6 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_dropout.h @@ -0,0 +1,42 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_DROPOUT_H +#define _VSI_NN_OP_CLIENT_DROPOUT_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_dropout_param +{ + float ratio; +} vsi_nn_dropout_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_elu.h b/src/tim/vx/internal/include/ops/vsi_nn_op_elu.h new file mode 100644 index 0000000..87cf3aa --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_elu.h @@ -0,0 +1,54 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_ELU_H +#define _VSI_NN_OP_ELU_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_ELU_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_elu_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_ELU_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_elu_lcl_data; + +typedef struct _vsi_nn_elu_param +{ + /* elu layer local data structure */ + vsi_nn_elu_lcl_data local; +} vsi_nn_elu_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_embedding_lookup.h b/src/tim/vx/internal/include/ops/vsi_nn_op_embedding_lookup.h new file mode 100644 index 0000000..a1a7b58 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_embedding_lookup.h @@ -0,0 +1,51 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_EMBEDDING_LOOKUP_H +#define _VSI_NN_OP_EMBEDDING_LOOKUP_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_embedding_lookup_lcl_data_t +{ + vx_tensor input_tensor; + vx_tensor lut_tensor; + vx_tensor output_tensor; +} vsi_nn_embedding_lookup_lcl_data_t; + +typedef struct _vsi_nn_embedding_lookup_param +{ + /* local data must be the first. */ + vsi_nn_embedding_lookup_lcl_data_t local; +} vsi_nn_embedding_lookup_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_exp.h b/src/tim/vx/internal/include/ops/vsi_nn_op_exp.h new file mode 100644 index 0000000..be7de22 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_exp.h @@ -0,0 +1,46 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_EXP_H +#define _VSI_NN_OP_EXP_H + +#include "vsi_nn_types.h" + + +#define _VSI_NN_EXP_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_exp_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_EXP_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_exp_lcl_data; + +typedef struct _vsi_nn_exp_param +{ + /* exp layer local data structure */ + vsi_nn_exp_lcl_data local; +} vsi_nn_exp_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_expand_broadcast.h b/src/tim/vx/internal/include/ops/vsi_nn_op_expand_broadcast.h new file mode 100644 index 0000000..93e1d3c --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_expand_broadcast.h @@ -0,0 +1,43 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_EXPAND_BROADCAST_H +#define _VSI_NN_OP_EXPAND_BROADCAST_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_expand_broadcast_param +{ + uint32_t *shape; + uint32_t dim_num; +} vsi_nn_expand_broadcast_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_extra_ending.h b/src/tim/vx/internal/include/ops/vsi_nn_op_extra_ending.h new file mode 100644 index 0000000..4eff2d0 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_extra_ending.h @@ -0,0 +1,48 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
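A sketch for vsi_nn_expand_broadcast_param above: the caller supplies the target shape array and its rank, and the array must stay alive while the node uses it. The lowest-dimension-first ordering and the ownership convention are assumptions.

```c
#include "ops/vsi_nn_op_expand_broadcast.h"   /* include path is an assumption */

static uint32_t g_broadcast_shape[3] = { 4, 3, 2 };   /* assumed lowest-dimension-first order */

static void fill_expand_broadcast_param(vsi_nn_expand_broadcast_param *p)
{
    p->shape   = g_broadcast_shape;   /* not copied; assumed to need to outlive the node */
    p->dim_num = 3;
}
```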
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_EXTRA_ENDING_H +#define _VSI_NN_OP_EXTRA_ENDING_H + +#include "vsi_nn_types.h" + +#define _VSI_NN_EXTRA_ENDING_LOCAL_TENSOR_NUM 3 + +typedef struct _vsi_nn_extra_ending_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_EXTRA_ENDING_LOCAL_TENSOR_NUM]; +} vsi_nn_extra_ending_lcl_data; + + +typedef struct _vsi_nn_extra_ending_param +{ + vsi_nn_extra_ending_lcl_data local; + union { + int64_t value; + int8_t bytes[64]; + }; + int length; +} vsi_nn_extra_ending_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_floor.h b/src/tim/vx/internal/include/ops/vsi_nn_op_floor.h new file mode 100644 index 0000000..4066939 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_floor.h @@ -0,0 +1,35 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_FLOOR_H +#define _VSI_NN_OP_FLOOR_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_floor_param +{ + vsi_enum type; +} vsi_nn_floor_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_floordiv.h b/src/tim/vx/internal/include/ops/vsi_nn_op_floordiv.h new file mode 100644 index 0000000..597f94a --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_floordiv.h @@ -0,0 +1,43 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_FLOORDIV_H +#define _VSI_NN_OP_CLIENT_FLOORDIV_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_floordiv_param +{ + void *reserve; +} vsi_nn_floordiv_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_fullconnect.h b/src/tim/vx/internal/include/ops/vsi_nn_op_fullconnect.h new file mode 100644 index 0000000..5071c1a --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_fullconnect.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_FCL_H +#define _VSI_NN_OP_FCL_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_fcl_param +{ + uint32_t weights; + uint32_t axis; +} vsi_nn_fcl_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_gather.h b/src/tim/vx/internal/include/ops/vsi_nn_op_gather.h new file mode 100644 index 0000000..0d76800 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_gather.h @@ -0,0 +1,51 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
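A sketch for the vsi_nn_fcl_param block above, assuming a fully connected layer with 1000 output units; the reading of axis (dimensions up to and including it are flattened into the FC input) is an assumption.

```c
#include "ops/vsi_nn_op_fullconnect.h"   /* include path is an assumption */

static void fill_fcl_param(vsi_nn_fcl_param *p)
{
    p->weights = 1000;   /* number of output units */
    p->axis    = 0;      /* assumed: flatten dimensions 0..axis into the FC input */
}
```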
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_GATHER_H +#define _VSI_NN_OP_GATHER_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_GATHER_LOCAL_TENSOR_NUM 3 + +typedef struct _vsi_nn_gather_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_GATHER_LOCAL_TENSOR_NUM]; +} vsi_nn_gather_lcl_data; + +typedef struct _vsi_nn_gather_param +{ + vsi_nn_gather_lcl_data local; + int32_t axis; +} vsi_nn_gather_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_generate_proposals.h b/src/tim/vx/internal/include/ops/vsi_nn_op_generate_proposals.h new file mode 100644 index 0000000..1d5a365 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_generate_proposals.h @@ -0,0 +1,41 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_GENERATE_PROPOSALS_H +#define _VSI_NN_OP_GENERATE_PROPOSALS_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_generate_proposals_param +{ + float height_stride; + float width_stride; + int32_t pre_nms_top_n; + int32_t post_nms_top_n; + float iou_threshold; + float min_size; + int32_t type; +} vsi_nn_generate_proposals_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h new file mode 100644 index 0000000..721ebbc --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
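A sketch for vsi_nn_gather_param above: the embedded local block is filled in by the operation itself, so the caller only zeroes it and sets the gather axis.

```c
#include <string.h>
#include "ops/vsi_nn_op_gather.h"   /* include path is an assumption */

static void fill_gather_param(vsi_nn_gather_param *p)
{
    memset(p, 0, sizeof(*p));   /* clears local.local_tensor[] */
    p->axis = 0;                /* gather along the first dimension */
}
```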
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_GROUPED_CONV2D_H +#define _VSI_NN_OP_GROUPED_CONV2D_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_grouped_conv2d_param +{ + uint32_t ksize[2]; + uint32_t stride[2]; + /* Pad left, right, top, bottom */ + uint32_t pad[4]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; + uint32_t weights; + uint32_t group; + uint32_t dilation[2]; + int32_t multiplier; + void* local; +} vsi_nn_grouped_conv2d_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_gru_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_gru_ovxlib.h new file mode 100644 index 0000000..b4da1fc --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_gru_ovxlib.h @@ -0,0 +1,78 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
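A sketch for vsi_nn_grouped_conv2d_param above, assuming 4 groups and 128 total output channels (32 channels per group); the values and the VSI_NN_PAD_AUTO enumerator are assumptions.

```c
#include <string.h>
#include "ops/vsi_nn_op_grouped_conv2d.h"   /* include path is an assumption */

static void fill_grouped_conv2d_param(vsi_nn_grouped_conv2d_param *p)
{
    memset(p, 0, sizeof(*p));               /* 'local' is managed by the op */
    p->ksize[0]    = 3;  p->ksize[1]    = 3;
    p->stride[0]   = 1;  p->stride[1]   = 1;
    p->pad_type    = VSI_NN_PAD_AUTO;       /* assumed enumerator */
    p->weights     = 128;                   /* total output channels */
    p->group       = 4;                     /* weights must be divisible by group */
    p->dilation[0] = 1;  p->dilation[1] = 1;
    p->multiplier  = 0;
}
```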
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_GRU_OVXLIB_H +#define _VSI_NN_OP_GRU_OVXLIB_H + +#include "vsi_nn_types.h" +#include "vsi_nn_op_grucell_ovxlib.h" + +/* enum for inputs/outputs */ +enum +{ + GRU_INPUT_INPUT = 0, + GRU_INPUT_H_STATE = 1, + + GRU_INPUT_WEIGHT_I2R = 2, + GRU_INPUT_WEIGHT_I2Z = 3, + + GRU_INPUT_WEIGHT_H2R = 4, + GRU_INPUT_WEIGHT_H2Z = 5, + + GRU_INPUT_BIAS_I2R = 6, + GRU_INPUT_BIAS_I2Z = 7, + + GRU_INPUT_BIAS_H2R = 8, + GRU_INPUT_BIAS_H2Z = 9, + + GRU_INPUT_WEIGHT_I2C = 10, + GRU_INPUT_WEIGHT_H2C = 11, + + GRU_INPUT_BIAS_I2C = 12, + GRU_INPUT_BIAS_H2C = 13, + + GRU_INPUT_CNT, + + GRU_OUTPUT_OUTPUT = 0, + GRU_OUTPUT_H_STATE = 1, + + GRU_OUTPUT_CNT +}; + +typedef struct _vsi_nn_gru_ovxlib_param +{ + uint32_t num_units; + vsi_bool time_major; + vsi_nn_activation_e activation; + vsi_nn_activation_e recurrent_activation; + vsi_bool return_sequences; + uint32_t linear_before_reset; + vsi_nn_dtype_t internal_dtype[GRUCELL_QUANTIZE_PARAM_COUNT]; + + struct _gru_ovxlib_local_data_t *local; + vsi_bool use_cudnn_implementation; + uint32_t cudnn_implementation_version; +} vsi_nn_gru_ovxlib_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal.h new file mode 100644 index 0000000..fe11a36 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal.h @@ -0,0 +1,87 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
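A sketch for vsi_nn_gru_ovxlib_param above, assuming a 128-unit, time-major GRU that returns the full output sequence. The activation enumerators are assumptions about the ovxlib activation enum; everything else uses fields declared in this header.

```c
#include <string.h>
#include "ops/vsi_nn_op_gru_ovxlib.h"   /* include path is an assumption */

static void fill_gru_param(vsi_nn_gru_ovxlib_param *p)
{
    memset(p, 0, sizeof(*p));                      /* internal_dtype[] left at defaults */
    p->num_units            = 128;
    p->time_major           = 1;                   /* input layout [time, batch, feature] */
    p->activation           = VSI_NN_ACT_TANH;     /* assumed enumerator */
    p->recurrent_activation = VSI_NN_ACT_SIGMOID;  /* assumed enumerator */
    p->return_sequences     = 1;                   /* emit every time step, not just the last */
    p->linear_before_reset  = 0;
    p->use_cudnn_implementation = 0;
}
```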
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL_H +#define _VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL_H + +#include "vsi_nn_types.h" + +enum { + GRUCELL_ACTIVATION_INPUT_ZT_ = 0, + GRUCELL_ACTIVATION_INPUT_HT__ = 1, + GRUCELL_ACTIVATION_INPUT_HT_1 = 2, + + GRUCELL_ACTIVATION_INPUT_H_STATE = 0, + GRUCELL_ACTIVATION_INPUT_INPUT_FC_R = 1, + GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z = 2, + GRUCELL_ACTIVATION_INPUT_INPUT_FC_C = 3, + GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_R = 4, + GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_Z = 5, + GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_C = 6, + GRUCELL_ACTIVATION_INPUT_BIAS_R = 7, + GRUCELL_ACTIVATION_INPUT_BIAS_Z = 8, + GRUCELL_ACTIVATION_INPUT_BIAS_C = 9, + GRUCELL_ACTIVATION_INPUT_COND_R = 10, + GRUCELL_ACTIVATION_INPUT_COND_Z = 11, + GRUCELL_ACTIVATION_INPUT_COND_C = 12, + + GRUCELL_ACTIVATION_INPUT_COUNT, + + GRUCELL_ACTIVATION_OUTPUT_OUTPUT = 0, + GRUCELL_ACTIVATION_OUTPUT_H_STATE = 1, + GRUCELL_ACTIVATION_OUTPUT_COUNT +}; + +enum { + GRUCELL_INPUT_CATEGORY_DEFAULT, + GRUCELL_INPUT_CATEGORY_CUDNN, + + GRUCELL_INPUT_CATEGORY_COUNT +}; + +typedef int32_t grucell_activation_input_layout_e; enum +{ + GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_NC, + GRUCELL_ACTIVATION_INPUT_LAYOUT_INPUT_NC_FC_CN, + GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_CN, + + GRUCELL_ACTIVATION_INPUT_LAYOUT_COUNT +}; + +typedef struct _vsi_nn_grucell_activation_internal_local { + uint32_t placeholder; +} vsi_nn_grucell_activation_internal_local; + +typedef struct _vsi_nn_grucell_activation_internal_param +{ + vsi_nn_grucell_activation_internal_local* local; + vsi_nn_activation_e gate_activation; + vsi_nn_activation_e candidate_activation; + int32_t input_category; + vsi_bool use_cudnn_implementation; + grucell_activation_input_layout_e input_layout; +} vsi_nn_grucell_activation_internal_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal_sma.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal_sma.h new file mode 100644 index 0000000..51d76a4 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal_sma.h @@ -0,0 +1,51 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL_SMA_H +#define _VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL_SMA_H + +#include "vsi_nn_types.h" + +enum { + GRUCELL_ACTIVATION_SMA_INPUT_H_STATE = 0, + GRUCELL_ACTIVATION_SMA_INPUT_H_T_ = 1, + GRUCELL_ACTIVATION_SMA_INPUT_Z_T = 2, + GRUCELL_ACTIVATION_SMA_INPUT_COUNT, + + GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT = 0, + GRUCELL_ACTIVATION_SMA_OUTPUT_H_STATE = 1, + GRUCELL_ACTIVATION_SMA_OUTPUT_COUNT +}; + +typedef struct _vsi_nn_grucell_activation_internal_sma_local { + uint32_t placeholder; +} vsi_nn_grucell_activation_internal_sma_local; + +typedef struct _vsi_nn_grucell_activation_internal_sma_param +{ + vsi_nn_grucell_activation_internal_sma_local* local; +} vsi_nn_grucell_activation_internal_sma_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_ovxlib.h new file mode 100644 index 0000000..6006952 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_ovxlib.h @@ -0,0 +1,106 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_GRUCELL_OVXLIB_H +#define _VSI_NN_OP_GRUCELL_OVXLIB_H + +#include "vsi_nn_types.h" +#include "vsi_nn_op_grucell_ovxlib.h" + +#define GRUCELL_RZ_GATE_COUNT 2 + +/* enum for inputs/outputs */ +enum +{ + GRUCELL_INPUT_INPUT = 0, + GRUCELL_INPUT_H_STATE = 1, + + GRUCELL_INPUT_WEIGHT_I2R = 2, + GRUCELL_INPUT_WEIGHT_I2Z = 3, + + GRUCELL_INPUT_WEIGHT_H2R = 4, + GRUCELL_INPUT_WEIGHT_H2Z = 5, + + GRUCELL_INPUT_BIAS_I2R = 6, + GRUCELL_INPUT_BIAS_I2Z = 7, + + GRUCELL_INPUT_BIAS_H2R = 8, + GRUCELL_INPUT_BIAS_H2Z = 9, + + GRUCELL_INPUT_WEIGHT_I2C = 10, + GRUCELL_INPUT_WEIGHT_H2C = 11, + + GRUCELL_INPUT_BIAS_I2C = 12, + GRUCELL_INPUT_BIAS_H2C = 13, + + GRUCELL_INPUT_COND_RESET = 14, + GRUCELL_INPUT_COND_UPDATE = 15, + GRUCELL_INPUT_COND_CANDIDATE = 16, + + GRUCELL_INPUT_CNT, + + GRUCELL_OUTPUT_OUTPUT = 0, + GRUCELL_OUTPUT_H_STATE = 1, + + GRUCELL_OUTPUT_CNT +}; + +enum +{ + GRUCELL_QUANTIZE_PARAM_I2R, + GRUCELL_QUANTIZE_PARAM_I2Z, + GRUCELL_QUANTIZE_PARAM_H2R, + GRUCELL_QUANTIZE_PARAM_H2Z, + GRUCELL_QUANTIZE_PARAM_I2C, + GRUCELL_QUANTIZE_PARAM_H2C, + + GRUCELL_QUANTIZE_PARAM_COUNT, + + GRUCELL_CUDNN_QUANTIZE_PARAM_INPUT = 0, + GRUCELL_CUDNN_QUANTIZE_PARAM_HIDDEN, + GRUCELL_CUDNN_QUANTIZE_PARAM_COUNT +}; + +enum +{ + GRUCELL_GATE_R = 0, + GRUCELL_GATE_Z = 1, + + GRUCELL_GATE_COUNT +}; + +typedef struct _vsi_nn_grucell_ovxlib_param +{ + struct _grucell_ovxlib_local_data_t* local; + uint32_t num_units; + vsi_nn_activation_e activation; + vsi_nn_activation_e recurrent_activation; + uint32_t linear_before_reset; + vsi_bool use_cudnn_implementation; + uint32_t cudnn_implementation_version; + vsi_nn_dtype_t internal_dtype[GRUCELL_QUANTIZE_PARAM_COUNT]; +} vsi_nn_grucell_ovxlib_param; +_compiler_assert(offsetof(vsi_nn_grucell_ovxlib_param, local) == 0, \ + vsi_nn_vsi_nn_grucell_ovxlib_h ); + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_heatmap_max_keypoint.h b/src/tim/vx/internal/include/ops/vsi_nn_op_heatmap_max_keypoint.h new file mode 100644 index 0000000..d1bdf04 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_heatmap_max_keypoint.h @@ -0,0 +1,35 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_HEATMAP_MAX_KEYPOINT_H +#define _VSI_NN_OP_HEATMAP_MAX_KEYPOINT_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_heatmap_max_keypoint_param +{ + vsi_enum type; +} vsi_nn_heatmap_max_keypoint_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_imageprocess.h b/src/tim/vx/internal/include/ops/vsi_nn_op_imageprocess.h new file mode 100644 index 0000000..7b68724 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_imageprocess.h @@ -0,0 +1,123 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_IMAGEPROCESS_H +#define _VSI_NN_OP_IMAGEPROCESS_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint32_t vsi_nn_imageprocess_resize_type_e; enum +{ + VSI_NN_IMAGEPROCESS_RESIZE_NONE = 0, + VSI_NN_IMAGEPROCESS_RESIZE_BILINEAR, +}; + +typedef uint32_t vsi_nn_imageprocess_mean_type_e; enum +{ + VSI_NN_IMAGEPROCESS_MEAN_NONE = 0, + VSI_NN_IMAGEPROCESS_MEAN_CHANNEL, + VSI_NN_IMAGEPROCESS_MEAN_PIXEL +}; + +typedef struct _vsi_nn_imageprocess_param +{ + vsi_nn_platform_e platform_type; + struct + { + vx_bool enable; + int32_t dim_num; + int32_t* start; + int32_t* length; + } crop; + struct + { + vsi_nn_imageprocess_resize_type_e type; + int32_t dim_num; + int32_t* length; + } resize; + vx_bool reverse_channel; + struct + { + vsi_nn_imageprocess_mean_type_e type; + float scale; + int32_t mean_value_size; + float* mean_value; + } mean; +} vsi_nn_imageprocess_param; + +/** +* Insert imageprocess op for image pre process +* @deprecated +* @see vsi_nn_InsertImageprocessSingleNode +* +* @param[in] graph. +* @param[in] the attr of the input tensor of graph. +* @param[in] the parameters of imageprocess. +* @param[in] bmp buffer of input image. +* @param[in] output tensor. +*/ +OVXLIB_API vsi_status vsi_nn_op_imageprocess_single_node + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_attr_t *attr, + vsi_nn_imageprocess_param *p, + uint8_t *data, + vsi_nn_tensor_t *tensor_out + ); + +/** +* Insert imageprocess op for image pre process. +* +* @param[in] graph. +* @param[in] the attr of the input tensor of graph. +* @param[in] the parameters of imageprocess. +* @param[in] bmp buffer of input image. 
+* @param[in] output tensor. +* @param[in] id. There may be multiple models in a process. Each one has a unique id.\n +* But repeatedly running one model with different images should share the same id. +*/ +OVXLIB_API vsi_status vsi_nn_InsertImageprocessSingleNode + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_attr_t *attr, + vsi_nn_imageprocess_param *p, + uint8_t *data, + vsi_nn_tensor_t *tensor_out, + int32_t id + ); + +/** +* Release the resources of the imageprocess op. +*/ +OVXLIB_API vsi_status vsi_nn_ReleaseImageprocessSingleNode(); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_instancenormalize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_instancenormalize.h new file mode 100644 index 0000000..5ec359b --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_instancenormalize.h @@ -0,0 +1,94 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE.
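A hedged usage sketch for the image pre-processing helpers declared in vsi_nn_op_imageprocess.h above: bilinear resize plus per-channel mean subtraction on one BMP buffer, feeding the graph's input tensor. The resize/mean values, the include path, vx_true_e, and VSI_SUCCESS are assumptions.

```c
#include <string.h>
#include "ops/vsi_nn_op_imageprocess.h"   /* include path is an assumption */

static vsi_status preprocess_image(vsi_nn_graph_t *graph,
                                   vsi_nn_tensor_attr_t *input_attr,
                                   uint8_t *bmp_data,
                                   vsi_nn_tensor_t *input_tensor)
{
    vsi_nn_imageprocess_param p;
    int32_t resize_length[2] = { 224, 224 };              /* target size (assumed order) */
    float mean_value[3] = { 103.94f, 116.78f, 123.68f };  /* per-channel means (assumed) */
    vsi_status status;

    memset(&p, 0, sizeof(p));                  /* crop disabled, no platform override */
    p.resize.type          = VSI_NN_IMAGEPROCESS_RESIZE_BILINEAR;
    p.resize.dim_num       = 2;
    p.resize.length        = resize_length;
    p.reverse_channel      = vx_true_e;        /* assumed OpenVX boolean enumerator */
    p.mean.type            = VSI_NN_IMAGEPROCESS_MEAN_CHANNEL;
    p.mean.scale           = 1.0f;
    p.mean.mean_value_size = 3;
    p.mean.mean_value      = mean_value;

    /* 'id' distinguishes models within one process; reuse it across images of one model. */
    status = vsi_nn_InsertImageprocessSingleNode(graph, input_attr, &p,
                                                 bmp_data, input_tensor, 0);
    if (status != VSI_SUCCESS)   /* assumed status enumerator */
    {
        return status;
    }
    /* Typically called once at teardown; shown here only to pair the calls. */
    return vsi_nn_ReleaseImageprocessSingleNode();
}
```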
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_INSTANCENORMALIZE_H +#define _VSI_NN_OP_CLIENT_INSTANCENORMALIZE_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define VSI_NN_SUMSQR_SH_KERNEL_IDX(_INPUT0_TYPE, _RESHAPE_FLAG) \ + VSI_NN_SUMSQR_##_INPUT0_TYPE##_RESHAPE_FLAG##_KERNEL, + +#define VSI_NN_INSTANCENORM_SH_KERNEL_IDX(_INPUT0_TYPE, _OUTPUT_TYPE, _RESHAPE_FLAG) \ + VSI_NN_INSTANCENORM_##_INPUT0_TYPE##TO##_OUTPUT_TYPE##_RESHAPE_FLAG##_KERNEL, + +enum { + INSTANCENORM_CPU_KERNEL, + VSI_NN_SUMSQR_SH_KERNEL_IDX(U8, 1) + VSI_NN_SUMSQR_SH_KERNEL_IDX(I8, 1) + VSI_NN_SUMSQR_SH_KERNEL_IDX(I16, 1) + VSI_NN_SUMSQR_SH_KERNEL_IDX(F16, 1) + VSI_NN_SUMSQR_SH_KERNEL_IDX(U8, 0) + VSI_NN_SUMSQR_SH_KERNEL_IDX(I8, 0) + VSI_NN_SUMSQR_SH_KERNEL_IDX(I16, 0) + VSI_NN_SUMSQR_SH_KERNEL_IDX(F16, 0) + VSI_NN_INSTANCENORM_SH_KERNEL_IDX(U8, U8, 1) + VSI_NN_INSTANCENORM_SH_KERNEL_IDX(U8, F16, 1) + VSI_NN_INSTANCENORM_SH_KERNEL_IDX(I8, I8, 1) + VSI_NN_INSTANCENORM_SH_KERNEL_IDX(I8, F16, 1) + VSI_NN_INSTANCENORM_SH_KERNEL_IDX(I16, I16, 1) + VSI_NN_INSTANCENORM_SH_KERNEL_IDX(I16, F16, 1) + VSI_NN_INSTANCENORM_SH_KERNEL_IDX(F16, F16, 1) + VSI_NN_INSTANCENORM_SH_KERNEL_IDX(U8, U8, 0) + VSI_NN_INSTANCENORM_SH_KERNEL_IDX(U8, F16, 0) + VSI_NN_INSTANCENORM_SH_KERNEL_IDX(I8, I8, 0) + VSI_NN_INSTANCENORM_SH_KERNEL_IDX(I8, F16, 0) + VSI_NN_INSTANCENORM_SH_KERNEL_IDX(I16, I16, 0) + VSI_NN_INSTANCENORM_SH_KERNEL_IDX(I16, F16, 0) + VSI_NN_INSTANCENORM_SH_KERNEL_IDX(F16, F16, 0) +}; + +#define _VSI_NN_INSTANCENORM_LOCAL_TENSOR_NUM 5 + +typedef struct _vsi_nn_instancenorm_lcl_data2 +{ + uint32_t reshapeFlg; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_instancenorm_lcl_data2; + +typedef struct _vsi_nn_instancenorm_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_INSTANCENORM_LOCAL_TENSOR_NUM]; +} vsi_nn_instancenorm_lcl_data; + +typedef struct _vsi_nn_instancenormalize_param +{ + /* local data must be the first. */ + vsi_nn_instancenorm_lcl_data local; + float eps; + int axis_num; + int* axis; + vsi_nn_instancenorm_lcl_data2* lcl2_data; +} vsi_nn_instancenormalize_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_l2_normalize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_l2_normalize.h new file mode 100644 index 0000000..87e1171 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_l2_normalize.h @@ -0,0 +1,43 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
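/* A minimal sketch (not part of this commit): filling the user-facing fields of
 * vsi_nn_instancenormalize_param defined above. The local/lcl2_data members are
 * internal and managed by the implementation; values here are illustrative. */
static void fill_instancenorm_param_example(vsi_nn_instancenormalize_param *p)
{
    static int axes[2] = {0, 1};   /* normalize over the spatial axes */
    p->eps      = 1e-5f;           /* small constant to avoid division by zero */
    p->axis_num = 2;
    p->axis     = axes;            /* array must outlive the node */
}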
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_L2_NORMALIZ_H +#define _VSI_NN_OP_CLIENT_L2_NORMALIZ_H + +#include "vsi_nn_platform.h" +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_l2_normalize_param +{ + int32_t axis; +} vsi_nn_l2_normalize_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_l2normalizescale.h b/src/tim/vx/internal/include/ops/vsi_nn_op_l2normalizescale.h new file mode 100644 index 0000000..b15ee4e --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_l2normalizescale.h @@ -0,0 +1,104 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_L2NORMALIZESCALE_H +#define _VSI_NN_OP_CLIENT_L2NORMALIZESCALE_H + +#include "vsi_nn_platform.h" +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define VSI_NN_L2NORMSACLE_SQRTSUM_KERNEL_IDX(_AXIS, _INPUT_TYPE, _IMAGE_DIMS) \ + VSI_NN_L2NORMSACLE_SQRTSUM_AXIS##_AXIS##_##_INPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, + +#define VSI_NN_L2NORMSACLE_MULSCALE_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ + VSI_NN_L2NORMSACLE_MULSCALE_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, + +enum { + L2NORMSACLE_CPU_KERNEL, + VSI_NN_L2NORMSACLE_SQRTSUM_KERNEL_IDX(1, F16, IMAGE_2D) + VSI_NN_L2NORMSACLE_SQRTSUM_KERNEL_IDX(1, I8, IMAGE_2D) + VSI_NN_L2NORMSACLE_SQRTSUM_KERNEL_IDX(1, U8, IMAGE_2D) + VSI_NN_L2NORMSACLE_SQRTSUM_KERNEL_IDX(1, I16, IMAGE_2D) + VSI_NN_L2NORMSACLE_MULSCALE_KERNEL_IDX(1, F16, F16, IMAGE_2D) + VSI_NN_L2NORMSACLE_MULSCALE_KERNEL_IDX(1, I8, I8, IMAGE_2D) + VSI_NN_L2NORMSACLE_MULSCALE_KERNEL_IDX(1, I8, F16, IMAGE_2D) + VSI_NN_L2NORMSACLE_MULSCALE_KERNEL_IDX(1, U8, U8, IMAGE_2D) + VSI_NN_L2NORMSACLE_MULSCALE_KERNEL_IDX(1, U8, F16, IMAGE_2D) + VSI_NN_L2NORMSACLE_MULSCALE_KERNEL_IDX(1, I16, I16, IMAGE_2D) + VSI_NN_L2NORMSACLE_MULSCALE_KERNEL_IDX(1, I16, F16, IMAGE_2D) + + VSI_NN_L2NORMSACLE_SQRTSUM_KERNEL_IDX(0, F16, IMAGE_2D) + VSI_NN_L2NORMSACLE_SQRTSUM_KERNEL_IDX(0, I8, IMAGE_2D) + VSI_NN_L2NORMSACLE_SQRTSUM_KERNEL_IDX(0, U8, IMAGE_2D) + VSI_NN_L2NORMSACLE_SQRTSUM_KERNEL_IDX(0, I16, IMAGE_2D) + VSI_NN_L2NORMSACLE_MULSCALE_KERNEL_IDX(0, F16, F16, IMAGE_2D) + VSI_NN_L2NORMSACLE_MULSCALE_KERNEL_IDX(0, I8, I8, IMAGE_2D) + VSI_NN_L2NORMSACLE_MULSCALE_KERNEL_IDX(0, I8, F16, IMAGE_2D) + VSI_NN_L2NORMSACLE_MULSCALE_KERNEL_IDX(0, U8, U8, IMAGE_2D) + VSI_NN_L2NORMSACLE_MULSCALE_KERNEL_IDX(0, U8, F16, IMAGE_2D) + VSI_NN_L2NORMSACLE_MULSCALE_KERNEL_IDX(0, I16, I16, IMAGE_2D) + VSI_NN_L2NORMSACLE_MULSCALE_KERNEL_IDX(0, I16, F16, IMAGE_2D) + L2NORMSACLE_KERNEL_COUNTS, +}; + +enum { + L2NORMSACLE_INPUT = 0, + + L2NORMSACLE_INPUT1, + + L2NORMSACLE_INPUTS_COUNT, + + L2NORMSACLE_OUTPUT = 0, + + L2NORMSACLE_OUTPUTS_COUNT, + + L2NORMSACLE_PARAM_COUT = L2NORMSACLE_INPUTS_COUNT + L2NORMSACLE_OUTPUTS_COUNT, +}; + + +#define _VSI_NN_L2NORMALIZESCALE_LOCAL_TENSOR_NUM 3 + +typedef struct _vsi_nn_l2normalizescale_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_L2NORMALIZESCALE_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_l2normalizescale_lcl_data; + +typedef struct _vsi_nn_l2normalizescale_param +{ + int32_t dims; + /* l2normalizescale layer local data structure */ + vsi_nn_l2normalizescale_lcl_data local; + int32_t axis; +} vsi_nn_l2normalizescale_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_layernormalize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_layernormalize.h new file mode 100644 index 0000000..91501bb --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_layernormalize.h @@ -0,0 +1,52 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the 
Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_LAYERNORMALIZE_H +#define _VSI_NN_OP_CLIENT_LAYERNORMALIZE_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_LAYERNORM_LOCAL_TENSOR_NUM 4 + +typedef struct _vsi_nn_layernorm_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_LAYERNORM_LOCAL_TENSOR_NUM]; +} vsi_nn_layernorm_lcl_data; + +typedef struct _vsi_nn_layernormalize_param +{ + /* local data must be the first. */ + vsi_nn_layernorm_lcl_data local; + float eps; +} vsi_nn_layernormalize_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_linear.h b/src/tim/vx/internal/include/ops/vsi_nn_op_linear.h new file mode 100644 index 0000000..fb8ede4 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_linear.h @@ -0,0 +1,46 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
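/* A minimal sketch (not part of this commit): the only user-facing field of
 * vsi_nn_layernormalize_param defined above is eps; the local tensors are managed
 * internally. The value shown is illustrative. */
static void fill_layernorm_param_example(vsi_nn_layernormalize_param *p)
{
    p->eps = 1e-5f;   /* numerical-stability constant added to the variance */
}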
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_LINEAR_H +#define _VSI_NN_OP_LINEAR_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_linear_param +{ + void* reserve; + vx_float32 a; + vx_float32 b; +} vsi_nn_linear_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_log.h b/src/tim/vx/internal/include/ops/vsi_nn_op_log.h new file mode 100644 index 0000000..362f4da --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_log.h @@ -0,0 +1,118 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
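/* A minimal sketch (not part of this commit): vsi_nn_linear_param defined above carries
 * the coefficients of an element-wise affine map (presumably y = a * x + b); the values
 * below are illustrative. */
static void fill_linear_param_example(vsi_nn_linear_param *p)
{
    p->a = 0.5f;   /* slope */
    p->b = 1.0f;   /* offset */
}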
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_LOG_H +#define _VSI_NN_OP_LOG_H + +#include "vsi_nn_types.h" + + +#define VSI_NN_LOG_SH_KERNEL_IDX(_INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ + VSI_NN_LOG_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, + +enum { + LOG_CPU_KERNEL, + + VSI_NN_LOG_SH_KERNEL_IDX(F16, F16, IMAGE_3D) + VSI_NN_LOG_SH_KERNEL_IDX(F16, I16, IMAGE_3D) + VSI_NN_LOG_SH_KERNEL_IDX(F16, I8, IMAGE_3D) + VSI_NN_LOG_SH_KERNEL_IDX(F16, U8, IMAGE_3D) + VSI_NN_LOG_SH_KERNEL_IDX(I16, I16, IMAGE_3D) + VSI_NN_LOG_SH_KERNEL_IDX(I16, F16, IMAGE_3D) + VSI_NN_LOG_SH_KERNEL_IDX(I8, I8, IMAGE_3D) + VSI_NN_LOG_SH_KERNEL_IDX(I8, F16, IMAGE_3D) + VSI_NN_LOG_SH_KERNEL_IDX(U8, U8, IMAGE_3D) + VSI_NN_LOG_SH_KERNEL_IDX(U8, F16, IMAGE_3D) + VSI_NN_LOG_SH_KERNEL_IDX(BF16, BF16, IMAGE_3D) + + VSI_NN_LOG_SH_KERNEL_IDX(F16, F16, IMAGE_2D) + VSI_NN_LOG_SH_KERNEL_IDX(F16, I16, IMAGE_2D) + VSI_NN_LOG_SH_KERNEL_IDX(F16, I8, IMAGE_2D) + VSI_NN_LOG_SH_KERNEL_IDX(F16, U8, IMAGE_2D) + VSI_NN_LOG_SH_KERNEL_IDX(I16, I16, IMAGE_2D) + VSI_NN_LOG_SH_KERNEL_IDX(I16, F16, IMAGE_2D) + VSI_NN_LOG_SH_KERNEL_IDX(I8, I8, IMAGE_2D) + VSI_NN_LOG_SH_KERNEL_IDX(I8, F16, IMAGE_2D) + VSI_NN_LOG_SH_KERNEL_IDX(U8, U8, IMAGE_2D) + VSI_NN_LOG_SH_KERNEL_IDX(U8, F16, IMAGE_2D) + VSI_NN_LOG_SH_KERNEL_IDX(BF16, BF16, IMAGE_2D) +}; + +enum { + TENSOR_LOG_INPUT, + + TENSOR_LOG_INPUTS_COUNT, + + TENSOR_LOG_OUTPUT = 0, + + TENSOR_LOG_OUTUTS_COUNT, + + TENSOR_LOG_PARAM_COUT = TENSOR_LOG_INPUTS_COUNT + TENSOR_LOG_OUTUTS_COUNT, +}; + +enum { + TENSOR_LOG_CPU_KERNEL, + + TENSOR_LOG_F16TOF16_KERNEL, + TENSOR_LOG_F16TOI16_KERNEL, + TENSOR_LOG_F16TOI8_KERNEL, + TENSOR_LOG_F16TOU8_KERNEL, + TENSOR_LOG_I16TOI16_KERNEL, + TENSOR_LOG_I16TOF16_KERNEL, + TENSOR_LOG_I8TOI8_KERNEL, + TENSOR_LOG_I8TOF16_KERNEL, + TENSOR_LOG_U8TOU8_KERNEL, + TENSOR_LOG_U8TOF16_KERNEL, + + TENSOR_LOG_F16TOF16_2D_KERNEL, + TENSOR_LOG_F16TOI16_2D_KERNEL, + TENSOR_LOG_F16TOI8_2D_KERNEL, + TENSOR_LOG_F16TOU8_2D_KERNEL, + TENSOR_LOG_I16TOI16_2D_KERNEL, + TENSOR_LOG_I16TOF16_2D_KERNEL, + TENSOR_LOG_I8TOI8_2D_KERNEL, + TENSOR_LOG_I8TOF16_2D_KERNEL, + TENSOR_LOG_U8TOU8_2D_KERNEL, + TENSOR_LOG_U8TOF16_2D_KERNEL, + + TENSOR_LOG_KERNEL_COUNTS, +}; + +#define _VSI_NN_LOG_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_log_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_EXP_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_log_lcl_data; + +typedef struct _vsi_nn_log_param +{ + /* log layer local data structure */ + vsi_nn_log_lcl_data local; +} vsi_nn_log_param; + + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_log_softmax.h b/src/tim/vx/internal/include/ops/vsi_nn_op_log_softmax.h new file mode 100644 index 0000000..26f3baf --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_log_softmax.h @@ -0,0 +1,154 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or 
substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_LOG_SOFTMAX_H +#define _VSI_NN_OP_LOG_SOFTMAX_H + +#include "vsi_nn_types.h" + +#define VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ + VSI_NN_LOGSOFTMAX_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, + +enum { + LOGSOFTMAX_CPU_KERNEL, + + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, F16, F16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, F16, I16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, F16, I8, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, F16, U8, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, F16, F32, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, I16, I16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, I8, I8, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, U8, U8, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, I16, F16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, I8, F16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, U8, F16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, I16, F32, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, I8, F32, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, U8, F32, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, BF16, F32, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, BF16, F16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, BF16, BF16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, F16, F16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, F16, I16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, F16, I8, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, F16, U8, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, F16, F32, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, I16, I16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, I8, I8, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, U8, U8, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, I16, F16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, I8, F16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, U8, F16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, I16, F32, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, I8, F32, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, U8, F32, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, BF16, F32, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, BF16, F16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(0, BF16, BF16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, F16, F16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, F16, I16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, F16, I8, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, F16, U8, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, F16, F32, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, I16, I16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, I8, I8, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, U8, U8, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, I16, F16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, I8, F16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, U8, F16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, I16, F32, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, I8, F32, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, U8, F32, IMAGE) + 
VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, BF16, F32, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, BF16, F16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, BF16, BF16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, F16, F16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, F16, I16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, F16, I8, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, F16, U8, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, F16, F32, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, I16, I16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, I8, I8, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, U8, U8, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, I16, F16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, I8, F16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, U8, F16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, I16, F32, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, I8, F32, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, U8, F32, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, BF16, F32, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, BF16, F16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(1, BF16, BF16, IMAGE_2D) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, F16, F16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, F16, I16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, F16, I8, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, F16, U8, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, F16, F32, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, I16, I16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, I8, I8, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, U8, U8, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, I16, F16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, I8, F16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, U8, F16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, I16, F32, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, I8, F32, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, U8, F32, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, BF16, F32, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, BF16, F16, IMAGE) + VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(2, BF16, BF16, IMAGE) + + LOGSOFTMAX_KERNEL_COUNTS, +}; + +enum { + LOGSOFTMAX_INPUT = 0, + + LOGSOFTMAX_INPUTS_COUNT, + + LOGSOFTMAX_OUTPUT = 0, + + LOGSOFTMAX_OUTPUTS_COUNT, + + LOGSOFTMAX_PARAM_COUT = LOGSOFTMAX_INPUTS_COUNT + LOGSOFTMAX_OUTPUTS_COUNT, +}; + +#define _VSI_NN_LOGSOFTMAX_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_logsoftmax_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_LOGSOFTMAX_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_logsoftmax_lcl_data; + + +typedef struct _vsi_nn_log_softmax_param +{ + vsi_nn_logsoftmax_lcl_data local; + vx_float32 betaValue; + int32_t axis; +} vsi_nn_log_softmax_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_logical_not.h b/src/tim/vx/internal/include/ops/vsi_nn_op_logical_not.h new file mode 100644 index 0000000..96ee408 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_logical_not.h @@ -0,0 +1,43 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright 
notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_LOGICAL_NOT_H +#define _VSI_NN_OP_LOGICAL_NOT_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_logical_not_param +{ + void *reserve; +} vsi_nn_logical_not_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_logical_ops.h b/src/tim/vx/internal/include/ops/vsi_nn_op_logical_ops.h new file mode 100644 index 0000000..321c8fc --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_logical_ops.h @@ -0,0 +1,58 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_LOGICAL_OPS_H +#define _VSI_NN_OP_LOGICAL_OPS_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_LOGICAL_OPS_LOCAL_TENSOR_NUM 3 + +typedef uint32_t vsi_nn_logical_ops_type_t; enum +{ + VSI_NN_LOGICAL_OR = 0, + VSI_NN_LOGICAL_AND, + VSI_NN_LOGICAL_XOR, +}; + +typedef struct _vsi_nn_logical_ops_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_LOGICAL_OPS_LOCAL_TENSOR_NUM]; +} vsi_nn_logical_ops_lcl_data; + +typedef struct _vsi_nn_logical_ops_param +{ + vsi_nn_logical_ops_lcl_data local; + vsi_nn_logical_ops_type_t op; +} vsi_nn_logical_ops_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lrn.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lrn.h new file mode 100644 index 0000000..ea4df84 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lrn.h @@ -0,0 +1,48 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
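/* A minimal sketch (not part of this commit): selecting the element-wise logical
 * operation through vsi_nn_logical_ops_param defined above. */
static void fill_logical_ops_param_example(vsi_nn_logical_ops_param *p)
{
    p->op = VSI_NN_LOGICAL_AND;   /* or VSI_NN_LOGICAL_OR / VSI_NN_LOGICAL_XOR */
}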
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_LRN_H +#define _VSI_NN_OP_LRN_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_lrn_param +{ + vsi_enum type; + uint32_t size; + float alpha; + float beta; + float bias; + int32_t axis; +} vsi_nn_lrn_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lsh_projection.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lsh_projection.h new file mode 100644 index 0000000..099c645 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lsh_projection.h @@ -0,0 +1,41 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_LSH_PROJECTION_H +#define _VSI_NN_OP_LSH_PROJECTION_H + +#include "vsi_nn_types.h" + +typedef enum + { + VSI_NN_LSH_PROJECTION_SPARSE = 1, + VSI_NN_LSH_PROJECTION_DENSE = 2, + } vsi_nn_lsh_projection_type_e; + +typedef struct _vsi_nn_lsh_projection_param + { + vsi_nn_lsh_projection_type_e type; + } vsi_nn_lsh_projection_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lstm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lstm.h new file mode 100644 index 0000000..5d3fbb5 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lstm.h @@ -0,0 +1,59 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
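/* A minimal sketch (not part of this commit): typical values for the LRN parameters
 * defined above. Numeric values are illustrative; p->type is a vsi_enum selecting the
 * normalization mode and its values are defined elsewhere, so it is left untouched. */
static void fill_lrn_param_example(vsi_nn_lrn_param *p)
{
    p->size  = 5;       /* normalization window size */
    p->alpha = 1e-4f;
    p->beta  = 0.75f;
    p->bias  = 2.0f;
    p->axis  = 2;       /* normalize across the channel axis */
}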
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_LSTM_H +#define _VSI_NN_OP_LSTM_H + +#include "vsi_nn_types.h" +#include "vsi_nn_op_lstmunit.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_lstm_lcl_data_t +{ + vsi_nn_tensor_t *activation_tensor; + vsi_nn_tensor_t *forget_bias_tensor; + vsi_nn_tensor_t *cell_clip_tensor; + vsi_nn_tensor_t *proj_clip_tensor; +} vsi_nn_lstm_lcl_data_t; + +typedef struct _vsi_nn_lstm_param +{ + vsi_nn_lstm_lcl_data_t local; + + float cell_clip; + float proj_clip; + vsi_nn_activation_e activation; + float forget_bias; + + uint32_t weights; +} vsi_nn_lstm_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h new file mode 100644 index 0000000..cf0ed9f --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h @@ -0,0 +1,104 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
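/* A minimal sketch (not part of this commit): common settings for the user-facing fields
 * of vsi_nn_lstm_param defined above. The activation enum values come from
 * vsi_nn_types.h and are not shown here, so activation is left at its default. */
static void fill_lstm_param_example(vsi_nn_lstm_param *p)
{
    p->cell_clip   = 0.0f;   /* 0 disables cell-state clipping */
    p->proj_clip   = 0.0f;   /* 0 disables projection clipping */
    p->forget_bias = 1.0f;   /* common forget-gate bias initialization */
}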
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_LSTM_OVXLIB_H +#define _VSI_NN_OP_LSTM_OVXLIB_H + +#include "vsi_nn_types.h" +#include "vsi_nn_op_lstmunit.h" + +enum +{ + LSTM_INPUT_INPUT = 0, + LSTM_INPUT_H_STATE = 1, + LSTM_INPUT_C_STATE = 2, + + LSTM_INPUT_WEIGHT_I2I = 3, + LSTM_INPUT_WEIGHT_I2F = 4, + LSTM_INPUT_WEIGHT_I2C = 5, + LSTM_INPUT_WEIGHT_I2O = 6, + + LSTM_INPUT_WEIGHT_R2I = 7, + LSTM_INPUT_WEIGHT_R2F = 8, + LSTM_INPUT_WEIGHT_R2C = 9, + LSTM_INPUT_WEIGHT_R2O = 10, + + LSTM_INPUT_WEIGHT_C2I = 11, + LSTM_INPUT_WEIGHT_C2F = 12, + LSTM_INPUT_WEIGHT_C2O = 13, + + LSTM_INPUT_BIAS_I = 14, + LSTM_INPUT_BIAS_F = 15, + LSTM_INPUT_BIAS_C = 16, + LSTM_INPUT_BIAS_O = 17, + + LSTM_INPUT_WEIGHT_PROJ = 18, + LSTM_INPUT_BIAS_PROJ = 19, + + LSTM_INPUT_LAYERNORM_I = 20, + LSTM_INPUT_LAYERNORM_F = 21, + LSTM_INPUT_LAYERNORM_C = 22, + LSTM_INPUT_LAYERNORM_O = 23, + + LSTM_INPUT_AUX_INPUT = 24, + LSTM_INPUT_AUX_WEIGHT_I2I = 25, + LSTM_INPUT_AUX_WEIGHT_I2F = 26, + LSTM_INPUT_AUX_WEIGHT_I2C = 27, + LSTM_INPUT_AUX_WEIGHT_I2O = 28, + + LSTM_INPUT_CNT, + + LSTM_OUTPUT_OUTPUT = 0, + LSTM_OUTPUT_H_STATE = 1, + LSTM_OUTPUT_C_STATE = 2, + + LSTM_OUTPUT_CNT +}; + +typedef struct _vsi_nn_lstm_ovxlib_lcl_data_t +{ + vsi_bool use_cifg; + vsi_bool use_layer_norm; + vsi_bool use_projection; + vsi_bool use_projection_bias; + vsi_bool use_hybrid; + vsi_bool multi_batch; +} vsi_nn_lstm_ovxlib_lcl_data_t; + +typedef struct _vsi_nn_lstm_ovxlib_param +{ + vsi_nn_lstm_ovxlib_lcl_data_t local; + + float cell_clip; + float proj_clip; + vsi_nn_activation_e activation; + float forget_bias; + vsi_bool time_major; + vsi_nn_dtype_t internal_dtype[LSTMUNIT_QUANTIZE_PARAM_COUNT]; + vsi_nn_activation_e recurrent_activation; + vsi_bool return_sequences; + uint32_t weights; /* compatible with LSTM, NOT used */ +} vsi_nn_lstm_ovxlib_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit.h new file mode 100644 index 0000000..b76fd3d --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit.h @@ -0,0 +1,58 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
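/* A minimal sketch (not part of this commit): the main user-facing switches of
 * vsi_nn_lstm_ovxlib_param defined above; values are illustrative. */
static void fill_lstm_ovxlib_param_example(vsi_nn_lstm_ovxlib_param *p)
{
    p->time_major       = (vsi_bool)1;   /* input laid out as [time, batch, feature] */
    p->return_sequences = (vsi_bool)1;   /* emit an output for every timestep */
    p->forget_bias      = 1.0f;
    p->cell_clip        = 0.0f;          /* 0 disables clipping */
    p->proj_clip        = 0.0f;
}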
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_LSTMUNIT_H +#define _VSI_NN_OP_LSTMUNIT_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_lstmunit_lcl_data_t +{ + vsi_nn_tensor_t *activation_tensor; + vsi_nn_tensor_t *cell_clip_tensor; + vsi_nn_tensor_t *proj_clip_tensor; + vsi_nn_tensor_t *scratch_tensor; + + vsi_nn_tensor_attr_t scratch_attr; + vsi_nn_tensor_t *forget_bias_tensor; +} vsi_nn_lstmunit_lcl_data_t; + +typedef struct _vsi_nn_lstmunit_param_t +{ + vsi_nn_lstmunit_lcl_data_t local; + + float cell_clip; + float proj_clip; + vsi_nn_activation_e activation; + float forget_bias; +} vsi_nn_lstmunit_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_activation.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_activation.h new file mode 100644 index 0000000..08a9254 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_activation.h @@ -0,0 +1,100 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_LSTMUNIT_ACTIVATION_H +#define _VSI_NN_OP_LSTMUNIT_ACTIVATION_H + +#include "vsi_nn_types.h" +#include "vsi_nn_op_lstmunit.h" + +/* c -> cifg, l -> layer norm, p -> projection, h -> peephole, b -> hybrid bias fp32, s -> standard*/ + +enum { + LSTMUNIT_ACT_INPUT_FC_I, //optional + LSTMUNIT_ACT_INPUT_FC_F, + LSTMUNIT_ACT_INPUT_FC_C, + LSTMUNIT_ACT_INPUT_FC_O, + LSTMUNIT_ACT_CSTATE_IN, + LSTMUNIT_ACT_HSTATE_FC_I, //optional + LSTMUNIT_ACT_HSTATE_FC_F, //optional + LSTMUNIT_ACT_HSTATE_FC_C, //optional + LSTMUNIT_ACT_HSTATE_FC_O, //optional + LSTMUNIT_ACT_DATA_BI, //optional + LSTMUNIT_ACT_DATA_BF, //optional + LSTMUNIT_ACT_DATA_BC, //optional + LSTMUNIT_ACT_DATA_BO, //optional + LSTMUNIT_ACT_LN_WI, //optional + LSTMUNIT_ACT_LN_WF, //optional + LSTMUNIT_ACT_LN_WC, //optional + LSTMUNIT_ACT_LN_WO, //optional + + LSTMUNIT_ACT_INPUTS_COUNT, + + LSTMUNIT_ACT_OUTPUT = 0, + LSTMUNIT_ACT_CSTATE_OUT, + LSTMUNIT_ACT_HSTATE_OUT, //optional + + LSTMUNIT_ACT_OUTUTS_COUNT, + + LSTMUNIT_ACT_PARAM_COUT = LSTMUNIT_ACT_INPUTS_COUNT + LSTMUNIT_ACT_OUTUTS_COUNT, +}; + + +enum +{ + LSTMUNIT_ACT_TENSOR_BI, + LSTMUNIT_ACT_TENSOR_BF, + LSTMUNIT_ACT_TENSOR_BC, + LSTMUNIT_ACT_TENSOR_BO, + LSTMUNIT_ACT_TENSOR_LN_WI, + LSTMUNIT_ACT_TENSOR_LN_WF, + LSTMUNIT_ACT_TENSOR_LN_WC, + LSTMUNIT_ACT_TENSOR_LN_WO, + + LSTMUNIT_ACT_TENSOR_CNT +}; + +typedef struct _vsi_nn_lstmunit_activation_lcl_data_t +{ + vsi_nn_tensor_t* tensors[LSTMUNIT_ACT_TENSOR_CNT]; + uint32_t hash_idx; + vsi_bool execute_on_sw; + vsi_nn_tensor_t *lstmunit_param; +} vsi_nn_lstmunit_activation_lcl_data_t; + +typedef struct _vsi_nn_lstmunit_activation_param +{ + vsi_nn_lstmunit_activation_lcl_data_t local; + float cell_clip; + float proj_clip; + float forget_bias; + uint8_t is_cifg; + uint8_t is_projection; + uint8_t is_layer_norm; + uint8_t is_peephole; /* not supported now*/ + uint8_t is_hybrid; /*hybrid mode and bias format is fp32 */ + vsi_nn_activation_e recurrent_activation; +} vsi_nn_lstmunit_activation_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h new file mode 100644 index 0000000..eaac01d --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h @@ -0,0 +1,277 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
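/* A minimal sketch (not part of this commit): configuring vsi_nn_lstmunit_activation_param
 * defined above for a standard (non-CIFG, non-projection) cell with layer normalization
 * enabled; peephole is documented above as not supported and is left off. */
static void fill_lstmunit_act_param_example(vsi_nn_lstmunit_activation_param *p)
{
    p->is_cifg       = 0;
    p->is_projection = 0;
    p->is_layer_norm = 1;
    p->is_hybrid     = 0;
    p->cell_clip     = 0.0f;   /* 0 disables clipping */
    p->forget_bias   = 1.0f;
}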
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_LSTMUNIT_OVXLIB_H +#define _VSI_NN_OP_LSTMUNIT_OVXLIB_H + +#include "vsi_nn_tensor.h" +#include "vsi_nn_types.h" +#include "vsi_nn_op_lstmunit.h" + +#define LSTMUNIT_IFCO_GATE_COUNT 4 + +/* enum for inputs/outputs */ +enum +{ + LSTMUNIT_INPUT_INPUT = 0, + LSTMUNIT_INPUT_H_STATE = 1, + LSTMUNIT_INPUT_C_STATE = 2, + + LSTMUNIT_INPUT_WEIGHT_I2I = 3, + LSTMUNIT_INPUT_WEIGHT_I2F = 4, + LSTMUNIT_INPUT_WEIGHT_I2C = 5, + LSTMUNIT_INPUT_WEIGHT_I2O = 6, + + LSTMUNIT_INPUT_WEIGHT_R2I = 7, + LSTMUNIT_INPUT_WEIGHT_R2F = 8, + LSTMUNIT_INPUT_WEIGHT_R2C = 9, + LSTMUNIT_INPUT_WEIGHT_R2O = 10, + + LSTMUNIT_INPUT_WEIGHT_C2I = 11, + LSTMUNIT_INPUT_WEIGHT_C2F = 12, + LSTMUNIT_INPUT_WEIGHT_C2O = 13, + + LSTMUNIT_INPUT_BIAS_I = 14, + LSTMUNIT_INPUT_BIAS_F = 15, + LSTMUNIT_INPUT_BIAS_C = 16, + LSTMUNIT_INPUT_BIAS_O = 17, + + LSTMUNIT_INPUT_WEIGHT_PROJ = 18, + LSTMUNIT_INPUT_BIAS_PROJ = 19, + + LSTMUNIT_INPUT_LAYERNORM_I = 20, + LSTMUNIT_INPUT_LAYERNORM_F = 21, + LSTMUNIT_INPUT_LAYERNORM_C = 22, + LSTMUNIT_INPUT_LAYERNORM_O = 23, + + LSTMUNIT_INPUT_AUX_INPUT = 24, + LSTMUNIT_INPUT_AUX_WEIGHT_I2I = 25, + LSTMUNIT_INPUT_AUX_WEIGHT_I2F = 26, + LSTMUNIT_INPUT_AUX_WEIGHT_I2C = 27, + LSTMUNIT_INPUT_AUX_WEIGHT_I2O = 28, + + LSTMUNIT_INPUT_CNT, + + LSTMUNIT_OUTPUT_OUTPUT = 0, + LSTMUNIT_OUTPUT_H_STATE = 1, + LSTMUNIT_OUTPUT_C_STATE = 2, + LSTMUNIT_OUTPUT_SCRATCH = 3, + + LSTMUNIT_OUTPUT_CNT +}; + +typedef int32_t vsi_nnlstmunit_ovxlib_internal_node_index_t; enum +{ + LSTMUNIT_NODE_INVALID = -1, + /* Add internal node def */ + + LSTMUNIT_NODE_RESHAPE_INPUT_FC_INPUT, + LSTMUNIT_NODE_TRANS_INPUT_FC_INPUT, + LSTMUNIT_NODE_FC_I2I, + LSTMUNIT_NODE_FC_I2F, + LSTMUNIT_NODE_FC_I2C, + LSTMUNIT_NODE_FC_I2O, + + LSTMUNIT_NODE_TEST_NODE, + + LSTMUNIT_NODE_RESHAPE_RECURRENT_FC_INPUT, + LSTMUNIT_NODE_TRANS_RECURRENT_FC_INPUT, + + LSTMUNIT_NODE_FC_R2I, + LSTMUNIT_NODE_FC_R2F, + LSTMUNIT_NODE_FC_R2C, + LSTMUNIT_NODE_FC_R2O, + + LSTMUNIT_NODE_NN_TRANSPOSE_I2I, + LSTMUNIT_NODE_NN_TRANSPOSE_I2F, + LSTMUNIT_NODE_NN_TRANSPOSE_I2C, + LSTMUNIT_NODE_NN_TRANSPOSE_I2O, + LSTMUNIT_NODE_NN_RESHAPE_I2I, + LSTMUNIT_NODE_NN_RESHAPE_I2F, + LSTMUNIT_NODE_NN_RESHAPE_I2C, + LSTMUNIT_NODE_NN_RESHAPE_I2O, + LSTMUNIT_NODE_NN_TRANSPOSE_R2I, + LSTMUNIT_NODE_NN_TRANSPOSE_R2F, + LSTMUNIT_NODE_NN_TRANSPOSE_R2C, + LSTMUNIT_NODE_NN_TRANSPOSE_R2O, + LSTMUNIT_NODE_NN_RESHAPE_R2I, + LSTMUNIT_NODE_NN_RESHAPE_R2F, + LSTMUNIT_NODE_NN_RESHAPE_R2C, + LSTMUNIT_NODE_NN_RESHAPE_R2O, + + LSTMUNIT_NODE_INPUT_FC_OUTPUTS_CONCAT, + LSTMUNIT_NODE_RECURRENT_FC_OUTPUTS_CONCAT, + LSTMUNIT_NODE_LAYER_NORM, + LSTMUNIT_NODE_LAYER_NORM_SPLIT, + + LSTMUNIT_NODE_LAYER_NORM_I, + LSTMUNIT_NODE_LAYER_NORM_F, + LSTMUNIT_NODE_LAYER_NORM_C, + LSTMUNIT_NODE_LAYER_NORM_O, + + LSTMUNIT_NODE_ACTIVATIONS, /* Activations */ + + LSTMUNIT_NODE_RESHAPE_PROJECTION_FC_INPUT, + LSTMUNIT_NODE_FC_PROJ, + LSTMUNIT_NODE_ADD_PROJ, + LSTMUNIT_NODE_RESHAPE_FC_PROJ, + + LSTMUNIT_NODE_CNT +}; + +enum +{ + LSTMUNIT_TENSOR_RESHAPRE_INPUT_FC_INPUT, + LSTMUNIT_TENSOR_TRANS_INPUT_FC_INPUT, + + LSTMUNIT_TENSOR_ZERO_BIAS_I2I, + LSTMUNIT_TENSOR_ZERO_BIAS_I2F, + LSTMUNIT_TENSOR_ZERO_BIAS_I2C, + LSTMUNIT_TENSOR_ZERO_BIAS_I2O, + + 
LSTMUNIT_TENSOR_RESHAPRE_RECURRENT_FC_INPUT, + LSTMUNIT_TENSOR_TRANS_RECURRENT_FC_INPUT, + + LSTMUNIT_TENSOR_ZERO_BIAS_R2I, + LSTMUNIT_TENSOR_ZERO_BIAS_R2F, + LSTMUNIT_TENSOR_ZERO_BIAS_R2C, + LSTMUNIT_TENSOR_ZERO_BIAS_R2O, + + LSTMUNIT_TENSOR_CONCATED_BIAS, + LSTMUNIT_TENSOR_CONCATED_LN_W, + + LSTMUNIT_TENSOR_OUTPUT_I2I, + LSTMUNIT_TENSOR_OUTPUT_I2F, + LSTMUNIT_TENSOR_OUTPUT_I2C, + LSTMUNIT_TENSOR_OUTPUT_I2O, + LSTMUNIT_TENSOR_OUTPUT_R2I, + LSTMUNIT_TENSOR_OUTPUT_R2F, + LSTMUNIT_TENSOR_OUTPUT_R2C, + LSTMUNIT_TENSOR_OUTPUT_R2O, + + LSTMUNIT_TENSOR_OUTPUT_NN_I2I, + LSTMUNIT_TENSOR_OUTPUT_NN_I2F, + LSTMUNIT_TENSOR_OUTPUT_NN_I2C, + LSTMUNIT_TENSOR_OUTPUT_NN_I2O, + LSTMUNIT_TENSOR_OUTPUT_NN_R2I, + LSTMUNIT_TENSOR_OUTPUT_NN_R2F, + LSTMUNIT_TENSOR_OUTPUT_NN_R2C, + LSTMUNIT_TENSOR_OUTPUT_NN_R2O, + + LSTMUNIT_TENSOR_OUTPUT_NN_TRANS_I2I, + LSTMUNIT_TENSOR_OUTPUT_NN_TRANS_I2F, + LSTMUNIT_TENSOR_OUTPUT_NN_TRANS_I2C, + LSTMUNIT_TENSOR_OUTPUT_NN_TRANS_I2O, + LSTMUNIT_TENSOR_OUTPUT_NN_TRANS_R2I, + LSTMUNIT_TENSOR_OUTPUT_NN_TRANS_R2F, + LSTMUNIT_TENSOR_OUTPUT_NN_TRANS_R2C, + LSTMUNIT_TENSOR_OUTPUT_NN_TRANS_R2O, + + LSTMUNIT_TENSOR_RESHAPED_WEIGHT_I2I, + LSTMUNIT_TENSOR_RESHAPED_WEIGHT_I2F, + LSTMUNIT_TENSOR_RESHAPED_WEIGHT_I2C, + LSTMUNIT_TENSOR_RESHAPED_WEIGHT_I2O, + LSTMUNIT_TENSOR_RESHAPED_WEIGHT_R2I, + LSTMUNIT_TENSOR_RESHAPED_WEIGHT_R2F, + LSTMUNIT_TENSOR_RESHAPED_WEIGHT_R2C, + LSTMUNIT_TENSOR_RESHAPED_WEIGHT_R2O, + + LSTMUNIT_TENSOR_INPUT_FC_OUTPUTS, + LSTMUNIT_TENSOR_RECURRENT_FC_OUTPUTS, + LSTMUNIT_TENSOR_LAYER_NORM_OUTPUT, + + LSTMUNIT_TENSOR_LAYER_NORM_OUTPUT_I, + LSTMUNIT_TENSOR_LAYER_NORM_OUTPUT_F, + LSTMUNIT_TENSOR_LAYER_NORM_OUTPUT_C, + LSTMUNIT_TENSOR_LAYER_NORM_OUTPUT_O, + + LSTMUNIT_TENSOR_ACTIVATION_OUTPUT, + LSTMUNIT_TENSOR_RESHAPE_PROJECTION_FC_INPUT, /* reshape projection input */ + LSTMUNIT_TENSOR_ZERO_BIAS_PROJECTION, + LSTMUNIT_TENSOR_PROJECTION_FC_NN_OUTPUT, + LSTMUNIT_TENSOR_PROJECTION_FC_OUTPUT, + + LSTMUNIT_TENSOR_CNT +}; + +enum +{ + LSTMUNIT_QUANTIZE_PARAM_I2I, + LSTMUNIT_QUANTIZE_PARAM_I2F, + LSTMUNIT_QUANTIZE_PARAM_I2C, + LSTMUNIT_QUANTIZE_PARAM_I2O, + + LSTMUNIT_QUANTIZE_PARAM_R2I, + LSTMUNIT_QUANTIZE_PARAM_R2F, + LSTMUNIT_QUANTIZE_PARAM_R2C, + LSTMUNIT_QUANTIZE_PARAM_R2O, + + LSTMUNIT_QUANTIZE_PARAM_COUNT +}; + +enum +{ + LSTMUNIT_QUANTIZE_PARAM_AUX_I2I, + LSTMUNIT_QUANTIZE_PARAM_AUX_I2F, + LSTMUNIT_QUANTIZE_PARAM_AUX_I2C, + LSTMUNIT_QUANTIZE_PARAM_AUX_I2O, + + LSTMUNIT_QUANTIZE_PARAM_AUX_COUNT +}; + +typedef struct _vsi_nn_lstmunit_ovxlib_lcl_data_t +{ + vsi_bool use_cifg; + vsi_bool use_layer_norm; + vsi_bool use_projection; + vsi_bool use_projection_bias; + vsi_bool use_hybrid; + vsi_bool multi_batch; + vsi_bool use_peephole; +} vsi_nn_lstmunit_ovxlib_lcl_data_t; + +typedef struct _vsi_nn_lstmunit_ovxlib_param +{ + union + { + vsi_nn_lstmunit_ovxlib_lcl_data_t *local; + struct { /* for ABI compatible */ + vsi_bool pad0; + vsi_bool pad1; + vsi_bool pad2; + vsi_bool pad3; + vsi_bool pad4; + vsi_bool pad5; + }; + }; + + float cell_clip; + float proj_clip; + vsi_nn_activation_e activation; + float forget_bias; + vsi_nn_dtype_t internal_dtype[LSTMUNIT_QUANTIZE_PARAM_COUNT]; + vsi_nn_activation_e recurrent_activation; + vsi_nn_dtype_t *internal_dtype_aux; +} vsi_nn_lstmunit_ovxlib_param; + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_matrixmul.h b/src/tim/vx/internal/include/ops/vsi_nn_op_matrixmul.h new file mode 100644 index 0000000..d17593a --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_matrixmul.h @@ -0,0 +1,43 @@ 
+/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_MATRIXMUL_H +#define _VSI_NN_OP_MATRIXMUL_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_matrixmul_param +{ + vx_bool transpose[2]; + vx_bool adjoint[2]; +} vsi_nn_matrixmul_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_maximum.h b/src/tim/vx/internal/include/ops/vsi_nn_op_maximum.h new file mode 100644 index 0000000..7031482 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_maximum.h @@ -0,0 +1,106 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
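/* A minimal sketch (not part of this commit): vsi_nn_matrixmul_param defined above holds
 * per-operand transpose and adjoint flags; index 0 refers to the first input, index 1 to
 * the second. */
static void fill_matrixmul_param_example(vsi_nn_matrixmul_param *p)
{
    p->transpose[0] = vx_false_e;
    p->transpose[1] = vx_true_e;    /* multiply by the transposed second operand */
    p->adjoint[0]   = vx_false_e;
    p->adjoint[1]   = vx_false_e;
}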
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_MAXIMUM_H +#define _VSI_NN_OP_CLIENT_MAXIMUM_H + +#include "vsi_nn_platform.h" +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +#define VSI_NN_MAXIMUM_SH_KERNEL_IDX(_INPUT0_TYPE, _INPUT1_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ + VSI_NN_MAXIMUM_##_INPUT0_TYPE##_INPUT1_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, + +enum { + MAXIMUM_CPU_KERNEL, + + VSI_NN_MAXIMUM_SH_KERNEL_IDX(F16, F16, F16, IMAGE_3D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(I8, F16, I8, IMAGE_3D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(I8, F16, F16, IMAGE_3D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(U8, F16, U8, IMAGE_3D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(U8, F16, F16, IMAGE_3D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(I8, I8, I8, IMAGE_3D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(U8, U8, U8, IMAGE_3D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(I16, I16, I16, IMAGE_3D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(I16, F16, I16, IMAGE_3D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(I16, F16, F16, IMAGE_3D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(F16, F16, U8, IMAGE_3D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(F16, F16, I8, IMAGE_3D) + + VSI_NN_MAXIMUM_SH_KERNEL_IDX(F16, F16, F16, IMAGE_2D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(I8, F16, I8, IMAGE_2D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(I8, F16, F16, IMAGE_2D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(U8, F16, U8, IMAGE_2D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(U8, F16, F16, IMAGE_2D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(I8, I8, I8, IMAGE_2D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(U8, U8, U8, IMAGE_2D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(I16, I16, I16, IMAGE_2D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(I16, F16, I16, IMAGE_2D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(I16, F16, F16, IMAGE_2D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(F16, F16, U8, IMAGE_2D) + VSI_NN_MAXIMUM_SH_KERNEL_IDX(F16, F16, I8, IMAGE_2D) +}; + +enum { + MAXIMUM_INPUT0 = 0, + MAXIMUM_INPUT1, + + MAXIMUM_INPUTS_COUNT, + + MAXIMUM_OUTPUT = 0, + + MAXIMUM_OUTPUTS_COUNT, + + MAXIMUM_PARAM_COUT = MAXIMUM_INPUTS_COUNT + MAXIMUM_OUTPUTS_COUNT, +}; + + +#define _VSI_NN_MAXIMUM_LOCAL_TENSOR_NUM 3 + +typedef struct _vsi_nn_maximum_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_MAXIMUM_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; + vsi_bool enable_image_2d; + uint32_t sizes0[VSI_NN_MAX_DIM_NUM]; + uint32_t sizes1[VSI_NN_MAX_DIM_NUM]; + uint32_t sizes2[VSI_NN_MAX_DIM_NUM]; + uint32_t dim_num; +} vsi_nn_maximum_lcl_data; + +typedef struct _vsi_nn_maximum_param +{ + /* maximum layer local data structure */ + vsi_nn_maximum_lcl_data *local; +} vsi_nn_maximum_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_moments.h b/src/tim/vx/internal/include/ops/vsi_nn_op_moments.h new file mode 100644 index 0000000..c9f39ed --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_moments.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_MOMENTS_H +#define _VSI_NN_OP_MOMENTS_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_moments_param +{ + int32_t* axis; + int32_t axis_num; + vsi_bool keep_dim; +} vsi_nn_moments_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_multiply.h b/src/tim/vx/internal/include/ops/vsi_nn_op_multiply.h new file mode 100644 index 0000000..3b815b7 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_multiply.h @@ -0,0 +1,43 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_MULTIPLY_H +#define _VSI_NN_OP_MULTIPLY_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_multiply_param +{ + float scale; +} vsi_nn_multiply_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_nbg.h b/src/tim/vx/internal/include/ops/vsi_nn_op_nbg.h new file mode 100644 index 0000000..b2273b0 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_nbg.h @@ -0,0 +1,58 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_NBG_H +#define _VSI_NN_OP_NBG_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum _vsi_nn_nbg_type +{ + VSI_NN_NBG_FILE, + VSI_NN_NBG_FOLDER, + VSI_NN_NBG_LABEL, + VSI_NN_NBG_POINTER +}vsi_nn_nbg_type_e; + +typedef struct _vsi_nn_nbg_lcl_data +{ + vx_kernel kernel; +} vsi_nn_nbg_lcl_data; + +typedef struct _vsi_nn_nbg_param +{ + vsi_nn_nbg_lcl_data local; + vsi_nn_nbg_type_e type; + const char *url; +} vsi_nn_nbg_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_neg.h b/src/tim/vx/internal/include/ops/vsi_nn_op_neg.h new file mode 100644 index 0000000..ea85174 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_neg.h @@ -0,0 +1,38 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_NEG_H +#define _VSI_NN_OP_NEG_H + +#include "vsi_nn_types.h" + + +#define _VSI_NN_ELU_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_neg_param +{ + vx_tensor local_tensor[_VSI_NN_ELU_LOCAL_TENSOR_NUM]; +} vsi_nn_neg_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pad.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pad.h new file mode 100644 index 0000000..7e7d5d1 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pad.h @@ -0,0 +1,53 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_PAD_H +#define _VSI_NN_OP_PAD_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + VSI_NN_PAD_MODE_CONSTANT, + VSI_NN_PAD_MODE_REPLICATE, + VSI_NN_PAD_MODE_SYMMETRIC, + VSI_NN_PAD_MODE_REFLECT, +}vsi_nn_pad_mode_e; + +typedef struct _vsi_nn_pad_param +{ + const uint32_t * front_size; + const uint32_t * back_size; + uint8_t dim_num; + int32_t const_val; + vsi_nn_pad_mode_e mode; +} vsi_nn_pad_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_permute.h b/src/tim/vx/internal/include/ops/vsi_nn_op_permute.h new file mode 100644 index 0000000..9a6c3f2 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_permute.h @@ -0,0 +1,51 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_PERMUTE_H +#define _VSI_NN_OP_PERMUTE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_permute_lcl_data +{ + vsi_bool initialized; +} vsi_nn_permute_lcl_data; + +typedef struct _vsi_nn_permute_param +{ + const uint32_t * perm; + uint32_t dim_num; + + /* reshape layer local data structure */ + vsi_nn_permute_lcl_data local; +} vsi_nn_permute_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pool.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pool.h new file mode 100644 index 0000000..979d22c --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pool.h @@ -0,0 +1,60 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_POOL_H +#define _VSI_NN_OP_POOL_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM 3 + +typedef struct _vsi_nn_poolwithargmax_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM]; +} vsi_nn_poolwithargmax_lcl_data; + +typedef struct _vsi_nn_pool_param +{ + vsi_enum type; + /* round_type is used to calculate the output shape */ + vsi_nn_round_type_e round_type; + uint32_t ksize[2]; + uint32_t stride[2]; + /* Pad left, right, top, bottom */ + uint32_t pad[4]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; + /* poolwithargmax layer local data structure */ + vsi_nn_poolwithargmax_lcl_data local; +} vsi_nn_pool_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_post_process.h b/src/tim/vx/internal/include/ops/vsi_nn_op_post_process.h new file mode 100644 index 0000000..3ffe93f --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_post_process.h @@ -0,0 +1,57 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_POST_PROCESS_H +#define _VSI_NN_OP_POST_PROCESS_H + +#include "vsi_nn_types.h" + +enum +{ + POST_PROCESS_INPUT = 0, + + POST_PROCESS_INPUT_CNT, + + POST_PROCESS_OUTPUT = 0, + + POST_PROCESS_OUTPUT_CNT +}; + +typedef struct _vsi_nn_post_process_lcl_data +{ + vsi_bool initialized; + vsi_bool enable_data_conv; + vsi_bool enable_perm; +} vsi_nn_post_process_lcl_data; + +typedef struct _vsi_nn_post_process_param +{ + uint32_t * perm; + uint32_t dim_num; + + /* post process layer local data structure */ + vsi_nn_post_process_lcl_data local; +} vsi_nn_post_process_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pow.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pow.h new file mode 100644 index 0000000..bda2ec4 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pow.h @@ -0,0 +1,53 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_POW_H +#define _VSI_NN_OP_CLIENT_POW_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_POW_LOCAL_TENSOR_NUM 3 + +typedef struct _vsi_nn_pow_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_POW_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_pow_lcl_data; + +typedef struct _vsi_nn_pow_param +{ + /* local data must be the first. 
*/ + vsi_nn_pow_lcl_data local; +} vsi_nn_pow_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h new file mode 100644 index 0000000..c06bf7d --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h @@ -0,0 +1,84 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_PRE_PROCESS_H +#define _VSI_NN_OP_PRE_PROCESS_H + +#include "vsi_nn_types.h" +#include "vsi_nn_pre_post_process.h" + +typedef vsi_nn_preprocess_source_format_e vsi_nn_pre_process_type_e; + +enum +{ + PRE_PROCESS_INPUT0 = 0, + + PRE_PROCESS_INPUT1, + PRE_PROCESS_INPUT2, + + PRE_PROCESS_INPUT_CNT, + + PRE_PROCESS_OUTPUT = 0, + + PRE_PROCESS_OUTPUT_CNT +}; + +#define _VSI_NN_PRE_PROCESS_LOCAL_TENSOR_NUM 10 +typedef struct _vsi_nn_pre_process_lcl_data +{ + vsi_nn_tensor_t *local_tensor[_VSI_NN_PRE_PROCESS_LOCAL_TENSOR_NUM]; +} vsi_nn_pre_process_lcl_data; + +typedef struct _vsi_nn_pre_process_param +{ + struct + { + uint32_t left; + uint32_t top; + uint32_t width; + uint32_t height; + } rect; + + struct + { + uint32_t *size; + uint32_t dim_num; + } output_attr; + + uint32_t * perm; + uint32_t dim_num; + + struct + { + float mean[3]; + float scale; + } norm; + + vsi_bool reverse_channel; + + vsi_nn_pre_process_type_e type; + + vsi_nn_pre_process_lcl_data *local; +} vsi_nn_pre_process_param; +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h new file mode 100644 index 0000000..ee246b3 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h @@ -0,0 +1,69 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above 
copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_PRE_PROCESS_BGRA_H +#define _VSI_NN_OP_PRE_PROCESS_BGRA_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_pre_process_bgra_lcl_data +{ + int32_t scale_x; + int32_t scale_y; + vsi_bool enable_copy; + vsi_bool enable_perm; + vx_tensor local_tensor; +} vsi_nn_pre_process_bgra_lcl_data; + +typedef struct _vsi_nn_pre_process_bgra_param +{ + struct + { + uint32_t left; + uint32_t top; + uint32_t width; + uint32_t height; + } rect; + + struct + { + uint32_t *size; + uint32_t dim_num; + } output_attr; + + uint32_t * perm; + uint32_t dim_num; + + float r_mean; + float g_mean; + float b_mean; + float rgb_scale; + + vsi_bool reverse_channel; + + /* pre process rgb layer local data structure */ + vsi_nn_pre_process_bgra_lcl_data local; +} vsi_nn_pre_process_bgra_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_gray.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_gray.h new file mode 100644 index 0000000..57abf78 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_gray.h @@ -0,0 +1,71 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_PRE_PROCESS_GRAY_H +#define _VSI_NN_OP_PRE_PROCESS_GRAY_H + +#include "vsi_nn_types.h" + +enum +{ + PRE_PROCESS_GRAY_INPUT = 0, + + PRE_PROCESS_GRAY_INPUT_CNT, + + PRE_PROCESS_GRAY_OUTPUT = 0, + + PRE_PROCESS_GRAY_OUTPUT_CNT +}; + +typedef struct _vsi_nn_pre_process_gray_lcl_data +{ + int32_t scale_x; + int32_t scale_y; + vsi_bool enable_copy; +} vsi_nn_pre_process_gray_lcl_data; + +typedef struct _vsi_nn_pre_process_gray_param +{ + struct + { + uint32_t left; + uint32_t top; + uint32_t width; + uint32_t height; + } rect; + + struct + { + uint32_t *size; + uint32_t dim_num; + } output_attr; + + float mean; + float scale; + + /* pre process gray layer local data structure */ + vsi_nn_pre_process_gray_lcl_data local; +} vsi_nn_pre_process_gray_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h new file mode 100644 index 0000000..a62bddb --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h @@ -0,0 +1,78 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_PRE_PROCESS_NV12_H +#define _VSI_NN_OP_PRE_PROCESS_NV12_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_PRE_PROCESS_NV12_LOCAL_TENSOR_NUM 3 + +typedef struct _vsi_nn_pre_process_nv12_lcl_data +{ + int32_t scale_x; + int32_t scale_y; + vsi_bool enable_copy; + vsi_bool enable_perm; + vx_tensor local_tensor[_VSI_NN_PRE_PROCESS_NV12_LOCAL_TENSOR_NUM]; +} vsi_nn_pre_process_nv12_lcl_data; + +typedef struct _vsi_nn_pre_process_nv12_param +{ + struct + { + uint32_t left; + uint32_t top; + uint32_t width; + uint32_t height; + } rect; + + struct + { + uint32_t *size; + uint32_t dim_num; + } output_attr; + + uint32_t * perm; + uint32_t dim_num; + + float r_mean; + float g_mean; + float b_mean; + float rgb_scale; + + vsi_bool reverse_channel; + + vsi_nn_pre_process_nv12_lcl_data* local; +} vsi_nn_pre_process_nv12_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h new file mode 100644 index 0000000..f62bfe6 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h @@ -0,0 +1,81 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_PRE_PROCESS_RGB_H +#define _VSI_NN_OP_PRE_PROCESS_RGB_H + +#include "vsi_nn_types.h" + + +enum +{ + PRE_PROCESS_RGB_INPUT = 0, + + PRE_PROCESS_RGB_INPUT_CNT, + + PRE_PROCESS_RGB_OUTPUT = 0, + + PRE_PROCESS_RGB_OUTPUT_CNT +}; + +typedef struct _vsi_nn_pre_process_rgb_lcl_data +{ + int32_t scale_x; + int32_t scale_y; + vsi_bool enable_copy; + vsi_bool enable_perm; + vx_tensor local_tensor; +} vsi_nn_pre_process_rgb_lcl_data; + +typedef struct _vsi_nn_pre_process_rgb_param +{ + struct + { + uint32_t left; + uint32_t top; + uint32_t width; + uint32_t height; + } rect; + + struct + { + uint32_t *size; + uint32_t dim_num; + } output_attr; + + uint32_t * perm; + uint32_t dim_num; + + float r_mean; + float g_mean; + float b_mean; + float rgb_scale; + + vsi_bool reverse_channel; + + /* pre process rgb layer local data structure */ + vsi_nn_pre_process_rgb_lcl_data local; +} vsi_nn_pre_process_rgb_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_tensor.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_tensor.h new file mode 100644 index 0000000..b70094f --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_tensor.h @@ -0,0 +1,57 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_PRE_PROCESS_TENSOR_H +#define _VSI_NN_OP_PRE_PROCESS_TENSOR_H + +#include "vsi_nn_types.h" + +enum +{ + PRE_PROCESS_TENSOR_INPUT = 0, + + PRE_PROCESS_TENSOR_INPUT_CNT, + + PRE_PROCESS_TENSOR_OUTPUT = 0, + + PRE_PROCESS_TENSOR_OUTPUT_CNT +}; + +typedef struct _vsi_nn_pre_process_tensor_lcl_data +{ + vsi_bool initialized; + vsi_bool enable_data_conv; + vsi_bool enable_perm; +} vsi_nn_pre_process_tensor_lcl_data; + +typedef struct _vsi_nn_pre_process_tensor_param +{ + uint32_t * perm; + uint32_t dim_num; + + /* pre process tensor layer local data structure */ + vsi_nn_pre_process_tensor_lcl_data local; +} vsi_nn_pre_process_tensor_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h new file mode 100644 index 0000000..149520a --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h @@ -0,0 +1,78 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_PRE_PROCESS_YUV420_H +#define _VSI_NN_OP_PRE_PROCESS_YUV420_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_PRE_PROCESS_YUV420_LOCAL_TENSOR_NUM 4 + +typedef struct _vsi_nn_pre_process_yuv420_lcl_data +{ + int32_t scale_x; + int32_t scale_y; + vsi_bool enable_copy; + vsi_bool enable_perm; + vx_tensor local_tensor[_VSI_NN_PRE_PROCESS_YUV420_LOCAL_TENSOR_NUM]; +} vsi_nn_pre_process_yuv420_lcl_data; + +typedef struct _vsi_nn_pre_process_yuv420_param +{ + struct + { + uint32_t left; + uint32_t top; + uint32_t width; + uint32_t height; + } rect; + + struct + { + uint32_t *size; + uint32_t dim_num; + } output_attr; + + uint32_t * perm; + uint32_t dim_num; + + float r_mean; + float g_mean; + float b_mean; + float rgb_scale; + + vsi_bool reverse_channel; + /* local data must be the first. 
*/ + vsi_nn_pre_process_yuv420_lcl_data local; +} vsi_nn_pre_process_yuv420_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h new file mode 100644 index 0000000..fec700f --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h @@ -0,0 +1,78 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_PRE_PROCESS_YUV444_H +#define _VSI_NN_OP_PRE_PROCESS_YUV444_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_PRE_PROCESS_YUV444_LOCAL_TENSOR_NUM 4 + +typedef struct _vsi_nn_pre_process_yuv444_lcl_data +{ + int32_t scale_x; + int32_t scale_y; + vsi_bool enable_copy; + vsi_bool enable_perm; + vx_tensor local_tensor[_VSI_NN_PRE_PROCESS_YUV444_LOCAL_TENSOR_NUM]; +} vsi_nn_pre_process_yuv444_lcl_data; + +typedef struct _vsi_nn_pre_process_yuv444_param +{ + struct + { + uint32_t left; + uint32_t top; + uint32_t width; + uint32_t height; + } rect; + + struct + { + uint32_t *size; + uint32_t dim_num; + } output_attr; + + uint32_t * perm; + uint32_t dim_num; + + float r_mean; + float g_mean; + float b_mean; + float rgb_scale; + + vsi_bool reverse_channel; + /* local data must be the first. 
*/ + vsi_nn_pre_process_yuv444_lcl_data* local; +} vsi_nn_pre_process_yuv444_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_prelu.h b/src/tim/vx/internal/include/ops/vsi_nn_op_prelu.h new file mode 100644 index 0000000..84da943 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_prelu.h @@ -0,0 +1,132 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_PRELU_H +#define _VSI_NN_OP_PRELU_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define VSI_NN_PRELU_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _ALPHA_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ + VSI_NN_PRELU_AXIS##_AXIS##_##_INPUT_TYPE##_ALPHA_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, + +enum { + PRELU_CPU_KERNEL, + VSI_NN_PRELU_SH_KERNEL_IDX(0, BF16, F16, BF16, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(0, BF16, BF16, BF16, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(0, F16, F16, F16, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(0, F16, F16, I16, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(0, F16, F16, I8, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(0, F16, F16, U8, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(0, I16, F16, I16, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(0, I8, F16, I8, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(0, U8, F16, U8, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(0, I16, F16, F16, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(0, I8, F16, F16, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(0, U8, F16, F16, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(0, BF16, F16, BF16, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(0, BF16, BF16, BF16, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(0, F16, F16, F16, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(0, F16, F16, I16, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(0, F16, F16, I8, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(0, F16, F16, U8, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(0, I16, F16, I16, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(0, I8, F16, I8, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(0, U8, F16, U8, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(0, I16, F16, F16, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(0, I8, F16, F16, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(0, U8, F16, F16, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(1, BF16, F16, BF16, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(1, BF16, BF16, BF16, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(1, F16, F16, F16, IMAGE) + 
VSI_NN_PRELU_SH_KERNEL_IDX(1, F16, F16, I16, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(1, F16, F16, I8, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(1, F16, F16, U8, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(1, I16, F16, I16, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(1, I8, F16, I8, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(1, U8, F16, U8, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(1, I16, F16, F16, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(1, I8, F16, F16, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(1, U8, F16, F16, IMAGE) + VSI_NN_PRELU_SH_KERNEL_IDX(1, BF16, F16, BF16, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(1, BF16, BF16, BF16, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(1, F16, F16, F16, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(1, F16, F16, I16, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(1, F16, F16, I8, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(1, F16, F16, U8, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(1, I16, F16, I16, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(1, I8, F16, I8, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(1, U8, F16, U8, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(1, I16, F16, F16, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(1, I8, F16, F16, IMAGE_2D) + VSI_NN_PRELU_SH_KERNEL_IDX(1, U8, F16, F16, IMAGE_2D) + PRELU_KERNEL_COUNTS, +}; + +enum { + PRELLU_INPUT = 0, + + PRELLU_INPUT1, + + PRELLU_INPUTS_COUNT, + + PRELLU_OUTPUT = 0, + + PRELLU_OUTPUTS_COUNT, + + PRELLU_PARAM_COUT = PRELLU_INPUTS_COUNT + PRELLU_OUTPUTS_COUNT, +}; + +#define _VSI_NN_PRELU_LOCAL_TENSOR_NUM 3 + +enum { + PRELLU_STYLE_ORIGINAL = 0, + PRELLU_STYLE_ANDROID_NN = 1, + PRELLU_STYLE_CAN_TRANS_ORIGINAL = 2, +}; + +typedef struct _vsi_nn_prelu_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_PRELU_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; + uint32_t style; +} vsi_nn_prelu_lcl_data; + +typedef struct _vsi_nn_prelu_param +{ + /* prelu layer local data structure */ + vsi_nn_prelu_lcl_data *local; + int32_t axis; +} vsi_nn_prelu_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_proposal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_proposal.h new file mode 100644 index 0000000..58cc59f --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_proposal.h @@ -0,0 +1,71 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_PROPOSAL_H +#define _VSI_NN_OP_PROPOSAL_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_proposal_lcl_data +{ + vx_tensor rois; + vx_tensor score; +} vsi_nn_proposal_lcl_data; + +typedef struct _vsi_nn_proposal_anchor +{ + float * ratio; + float * scale; + int32_t ratio_num; + int32_t scale_num; + int32_t base_size; +} vsi_nn_proposal_anchor; + +typedef struct _vsi_nn_proposal_im_info +{ + float size[2]; + float scale[2]; +} vsi_nn_proposal_im_info; + +typedef struct _vsi_nn_proposal_param +{ + vsi_nn_proposal_lcl_data local; + vsi_nn_proposal_anchor anchor; + vsi_nn_proposal_im_info im_info; + uint32_t feat_stride; + uint32_t pre_nms_topn; + uint32_t post_nms_topn; + float nms_thresh; + uint32_t min_size; +} vsi_nn_proposal_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_quantized_16bit_lstm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_quantized_16bit_lstm.h new file mode 100644 index 0000000..ec127d2 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_quantized_16bit_lstm.h @@ -0,0 +1,64 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_QUANTIZED_16BIT_LSTM_H +#define _VSI_NN_OP_QUANTIZED_16BIT_LSTM_H + +#include "vsi_nn_types.h" + +enum +{ + Q16_LSTM_INPUT_INPUT = 0, + + Q16_LSTM_INPUT_WEIGHT_I2I = 1, + Q16_LSTM_INPUT_WEIGHT_I2F = 2, + Q16_LSTM_INPUT_WEIGHT_I2C = 3, + Q16_LSTM_INPUT_WEIGHT_I2O = 4, + + Q16_LSTM_INPUT_WEIGHT_R2I = 5, + Q16_LSTM_INPUT_WEIGHT_R2F = 6, + Q16_LSTM_INPUT_WEIGHT_R2C = 7, + Q16_LSTM_INPUT_WEIGHT_R2O = 8, + + Q16_LSTM_INPUT_BIAS_I = 9, + Q16_LSTM_INPUT_BIAS_F = 10, + Q16_LSTM_INPUT_BIAS_C = 11, + Q16_LSTM_INPUT_BIAS_O = 12, + + Q16_LSTM_INPUT_C_STATE = 13, + Q16_LSTM_INPUT_H_STATE = 14, + + Q16_LSTM_INPUT_CNT, + + Q16_LSTM_OUTPUT_C_STATE = 0, + Q16_LSTM_OUTPUT_OUTPUT = 1, + Q16_LSTM_OUTPUT_CNT +}; + +typedef struct _vsi_nn_quantized_16bit_lstm_param +{ + void* local; +} vsi_nn_quantized_16bit_lstm_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_random_multinomial.h b/src/tim/vx/internal/include/ops/vsi_nn_op_random_multinomial.h new file mode 100644 index 0000000..34b7769 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_random_multinomial.h @@ -0,0 +1,35 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_RANDOM_MULTINOMIAL_H +#define _VSI_NN_OP_RANDOM_MULTINOMIAL_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_random_multinomial_param +{ + int32_t sample_num; +} vsi_nn_random_multinomial_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reduce.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reduce.h new file mode 100644 index 0000000..cf7bb8b --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reduce.h @@ -0,0 +1,65 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_REDUCE_H +#define _VSI_NN_OP_REDUCE_H + +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef vx_uint32 vsi_nn_reduce_t; enum +{ + VSI_NN_REDUCE_MEAN = 1, + VSI_NN_REDUCE_MAX, + VSI_NN_REDUCE_MIN, + VSI_NN_REDUCE_SUM, + VSI_NN_REDUCE_ALL, + VSI_NN_REDUCE_ANY, + VSI_NN_REDUCE_PROD, +}; + +typedef struct _vsi_nn_reduce_lcl_data_t +{ + vsi_nn_tensor_t *axis_tensor; +} vsi_nn_reduce_lcl_data_t; + +typedef struct _vsi_nn_reduce_param +{ + /* local data must be the first. 
*/ + vsi_nn_reduce_lcl_data_t local; + vx_enum type; + const int32_t *axis; + vx_uint32 axis_num; + vx_bool keep_dim; + struct _vsi_nn_reduce_lcl2_data_t* local2; +} vsi_nn_reduce_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reduceall_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceall_internal.h new file mode 100644 index 0000000..24cca15 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceall_internal.h @@ -0,0 +1,72 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_REDUCEALL_INTERNAL_H +#define _VSI_NN_OP_REDUCEALL_INTERNAL_H + +#include "vsi_nn_types.h" + +#define VSI_NN_REDUCEALL_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ + VSI_NN_REDUCEALL_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, + +enum { + REDUCEALL_CPU_KERNEL, + VSI_NN_REDUCEALL_SH_KERNEL_IDX(0, I8, I8, IMAGE) + VSI_NN_REDUCEALL_SH_KERNEL_IDX(0, I8, I8, IMAGE_2D) + VSI_NN_REDUCEALL_SH_KERNEL_IDX(1, I8, I8, IMAGE) + VSI_NN_REDUCEALL_SH_KERNEL_IDX(1, I8, I8, IMAGE_2D) + VSI_NN_REDUCEALL_SH_KERNEL_IDX(2, I8, I8, IMAGE) + REDUCEALL_KERNEL_COUNTS, +}; + +enum { + REDUCEALL_INPUT = 0, + + REDUCEALL_INPUTS_COUNT, + + REDUCEALL_OUTPUT = 0, + + REDUCEALL_OUTPUTS_COUNT, + + REDUCEALL_PARAM_COUT = REDUCEALL_INPUTS_COUNT + REDUCEALL_OUTPUTS_COUNT, +}; + +#define _VSI_NN_REDUCEALL_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_reduceall_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_REDUCEALL_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_reduceall_lcl_data; + +typedef struct _vsi_nn_reduceall_internal_param +{ + vsi_nn_reduceall_lcl_data local; + vx_int32 *axis; + vx_uint32 axis_num; + vx_bool keep_dim; +} vsi_nn_reduceall_internal_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reduceany_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceany_internal.h new file mode 100644 index 0000000..a316c82 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceany_internal.h @@ -0,0 +1,72 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this 
software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_REDUCEANY_INTERNAL_H +#define _VSI_NN_OP_REDUCEANY_INTERNAL_H + +#include "vsi_nn_types.h" + +#define VSI_NN_REDUCEANY_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ + VSI_NN_REDUCEANY_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, + +enum { + REDUCEANY_CPU_KERNEL, + VSI_NN_REDUCEANY_SH_KERNEL_IDX(0, I8, I8, IMAGE) + VSI_NN_REDUCEANY_SH_KERNEL_IDX(0, I8, I8, IMAGE_2D) + VSI_NN_REDUCEANY_SH_KERNEL_IDX(1, I8, I8, IMAGE) + VSI_NN_REDUCEANY_SH_KERNEL_IDX(1, I8, I8, IMAGE_2D) + VSI_NN_REDUCEANY_SH_KERNEL_IDX(2, I8, I8, IMAGE) + REDUCEANY_KERNEL_COUNTS, +}; + +enum { + REDUCEANY_INPUT = 0, + + REDUCEANY_INPUTS_COUNT, + + REDUCEANY_OUTPUT = 0, + + REDUCEANY_OUTPUTS_COUNT, + + REDUCEANY_PARAM_COUT = REDUCEANY_INPUTS_COUNT + REDUCEANY_OUTPUTS_COUNT, +}; + +#define _VSI_NN_REDUCEANY_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_reduceany_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_REDUCEANY_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_reduceany_lcl_data; + +typedef struct _vsi_nn_reduceany_internal_param +{ + vsi_nn_reduceany_lcl_data local; + vx_int32 *axis; + vx_uint32 axis_num; + vx_bool keep_dim; +} vsi_nn_reduceany_internal_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reducemax_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reducemax_internal.h new file mode 100644 index 0000000..9219983 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reducemax_internal.h @@ -0,0 +1,117 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_REDUCEMAX_INTERNAL_H +#define _VSI_NN_OP_REDUCEMAX_INTERNAL_H + +#include "vsi_nn_types.h" + +#define VSI_NN_REDUCEMAX_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ + VSI_NN_REDUCEMAX_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, + +enum { + REDUCEMAX_CPU_KERNEL, + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, F16, F16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, F16, I16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, F16, I8, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, F16, U8, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, I16, I16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, I8, I8, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, U8, U8, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, I16, F16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, I8, F16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, U8, F16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, F16, F16, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, F16, I16, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, F16, I8, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, F16, U8, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, I16, I16, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, I8, I8, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, U8, U8, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, I16, F16, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, I8, F16, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, U8, F16, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, F16, F16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, F16, I16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, F16, I8, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, F16, U8, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, I16, I16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, I8, I8, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, U8, U8, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, I16, F16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, I8, F16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, U8, F16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, F16, F16, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, F16, I16, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, F16, I8, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, F16, U8, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, I16, I16, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, I8, I8, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, U8, U8, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, I16, F16, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, I8, F16, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(1, U8, F16, IMAGE_2D) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(2, F16, F16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(2, F16, I16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(2, F16, I8, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(2, F16, U8, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(2, I16, I16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(2, I8, I8, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(2, U8, U8, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(2, I16, F16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(2, I8, F16, IMAGE) + VSI_NN_REDUCEMAX_SH_KERNEL_IDX(2, U8, F16, IMAGE) + REDUCEMAX_KERNEL_COUNTS, +}; + +enum { + REDUCEMAX_INPUT = 0, + + REDUCEMAX_INPUTS_COUNT, + + REDUCEMAX_OUTPUT = 0, + + REDUCEMAX_OUTPUTS_COUNT, + + REDUCEMAX_PARAM_COUT = REDUCEMAX_INPUTS_COUNT + 
REDUCEMAX_OUTPUTS_COUNT, +}; + +#define _VSI_NN_REDUCEMAX_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_reducemax_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_REDUCEMAX_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_reducemax_lcl_data; + +typedef struct _vsi_nn_reducemax_internal_param +{ + vsi_nn_reducemax_lcl_data local; + vx_int32 *axis; + vx_uint32 axis_num; + vx_bool keep_dim; +} vsi_nn_reducemax_internal_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reducemin_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reducemin_internal.h new file mode 100644 index 0000000..ee32dd1 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reducemin_internal.h @@ -0,0 +1,117 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_REDUCEMIN_INTERNAL_H +#define _VSI_NN_OP_REDUCEMIN_INTERNAL_H + +#include "vsi_nn_types.h" + +#define VSI_NN_REDUCEMIN_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ + VSI_NN_REDUCEMIN_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, + +enum { + REDUCEMIN_CPU_KERNEL, + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, F16, F16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, F16, I16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, F16, I8, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, F16, U8, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, I16, I16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, I8, I8, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, U8, U8, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, I16, F16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, I8, F16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, U8, F16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, F16, F16, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, F16, I16, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, F16, I8, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, F16, U8, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, I16, I16, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, I8, I8, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, U8, U8, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, I16, F16, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, I8, F16, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(0, U8, F16, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, F16, F16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, F16, I16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, F16, I8, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, F16, U8, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, I16, I16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, I8, I8, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, U8, U8, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, I16, F16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, I8, F16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, U8, F16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, F16, F16, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, F16, I16, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, F16, I8, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, F16, U8, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, I16, I16, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, I8, I8, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, U8, U8, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, I16, F16, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, I8, F16, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(1, U8, F16, IMAGE_2D) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(2, F16, F16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(2, F16, I16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(2, F16, I8, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(2, F16, U8, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(2, I16, I16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(2, I8, I8, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(2, U8, U8, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(2, I16, F16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(2, I8, F16, IMAGE) + VSI_NN_REDUCEMIN_SH_KERNEL_IDX(2, U8, F16, IMAGE) + REDUCEMIN_KERNEL_COUNTS, +}; + +enum { + REDUCEMIN_INPUT = 0, + + REDUCEMIN_INPUTS_COUNT, + + REDUCEMIN_OUTPUT = 0, + + REDUCEMIN_OUTPUTS_COUNT, + + REDUCEMIN_PARAM_COUT = REDUCEMIN_INPUTS_COUNT + REDUCEMIN_OUTPUTS_COUNT, +}; + +#define _VSI_NN_REDUCEMIN_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_reducemin_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_REDUCEMIN_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_reducemin_lcl_data; + +typedef 
struct _vsi_nn_reducemin_internal_param +{ + vsi_nn_reducemin_lcl_data local; + vx_int32 *axis; + vx_uint32 axis_num; + vx_bool keep_dim; +} vsi_nn_reducemin_internal_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reduceprod_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceprod_internal.h new file mode 100644 index 0000000..b2c830d --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceprod_internal.h @@ -0,0 +1,122 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_REDUCEPROD_INTERNAL_H +#define _VSI_NN_OP_REDUCEPROD_INTERNAL_H + +#include "vsi_nn_types.h" + +#define VSI_NN_REDUCEPROD_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ + VSI_NN_REDUCEPROD_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, + +enum { + REDUCEPROD_CPU_KERNEL, + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, F16, F16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, F16, I16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, F16, I8, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, F16, U8, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, I16, I16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, I8, I8, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, U8, U8, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, I16, F16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, I8, F16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, U8, F16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, BF16, BF16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, F16, F16, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, F16, I16, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, F16, I8, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, F16, U8, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, I16, I16, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, I8, I8, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, U8, U8, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, I16, F16, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, I8, F16, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, U8, F16, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(0, BF16, BF16, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, F16, F16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, F16, I16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, F16, I8, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, F16, U8, IMAGE ) + 
VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, I16, I16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, I8, I8, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, U8, U8, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, I16, F16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, I8, F16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, U8, F16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, BF16, BF16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, F16, F16, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, F16, I16, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, F16, I8, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, F16, U8, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, I16, I16, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, I8, I8, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, U8, U8, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, I16, F16, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, I8, F16, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, U8, F16, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(1, BF16, BF16, IMAGE_2D) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(2, F16, F16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(2, F16, I16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(2, F16, I8, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(2, F16, U8, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(2, I16, I16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(2, I8, I8, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(2, U8, U8, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(2, I16, F16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(2, I8, F16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(2, U8, F16, IMAGE ) + VSI_NN_REDUCEPROD_SH_KERNEL_IDX(2, BF16, BF16, IMAGE ) + REDUCEPROD_KERNEL_COUNTS, +}; + +enum { + REDUCEPROD_INPUT = 0, + + REDUCEPROD_INPUTS_COUNT, + + REDUCEPROD_OUTPUT = 0, + + REDUCEPROD_OUTPUTS_COUNT, + + REDUCEPROD_PARAM_COUT = REDUCEPROD_INPUTS_COUNT + REDUCEPROD_OUTPUTS_COUNT, +}; + +#define _VSI_NN_REDUCEPROD_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_reduceprod_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_REDUCEPROD_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_reduceprod_lcl_data; + +typedef struct _vsi_nn_reduceprod_internal_param +{ + vsi_nn_reduceprod_lcl_data local; + vx_int32 *axis; + vx_uint32 axis_num; + vx_bool keep_dim; +} vsi_nn_reduceprod_internal_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reducesum_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reducesum_internal.h new file mode 100644 index 0000000..69ca355 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reducesum_internal.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_REDUCESUM_INTERNAL_H +#define _VSI_NN_OP_REDUCESUM_INTERNAL_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_reducesum_lcl_data_t +{ + vsi_nn_tensor_t *reshaped_input; + vsi_nn_tensor_t *reshaped_output; +} vsi_nn_reducesum_lcl_data_t; + +typedef struct _vsi_nn_reducesum_internal_param +{ + vx_int32 *axis; + vx_uint32 axis_num; + vx_bool keep_dim; + vsi_nn_reducesum_lcl_data_t* local; +} vsi_nn_reducesum_internal_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_relational_ops.h b/src/tim/vx/internal/include/ops/vsi_nn_op_relational_ops.h new file mode 100644 index 0000000..47f6003 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_relational_ops.h @@ -0,0 +1,62 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_RELATIONAL_OPS_H +#define _VSI_NN_OP_CLIENT_RELATIONAL_OPS_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_RELATIONAL_OPS_LOCAL_TENSOR_NUM 3 + +typedef uint32_t vsi_nn_relational_ops_type_t; enum +{ + VSI_NN_RELATIONAL_OPS_GREAT = 0, + VSI_NN_RELATIONAL_OPS_GREAT_EQUAL, + VSI_NN_RELATIONAL_OPS_LESS, + VSI_NN_RELATIONAL_OPS_LESS_EQUAL, + VSI_NN_RELATIONAL_OPS_NOT_EQUAL, + VSI_NN_RELATIONAL_OPS_EQUAL, +}; + +typedef struct _vsi_nn_relational_ops_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_RELATIONAL_OPS_LOCAL_TENSOR_NUM]; +} vsi_nn_relational_ops_lcl_data; + +typedef struct _vsi_nn_relational_ops_param +{ + /* local data must be the first. 
*/ + vsi_nn_relational_ops_lcl_data local; + vsi_nn_relational_ops_type_t op; +} vsi_nn_relational_ops_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras.h b/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras.h new file mode 100644 index 0000000..fdc582d --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras.h @@ -0,0 +1,37 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_RELU_KERAS_H +#define _VSI_NN_OP_RELU_KERAS_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_relu_keras_param +{ + float alpha; + float max_value; + float threshold; +} vsi_nn_relu_keras_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras_internal.h new file mode 100644 index 0000000..4e30fb9 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras_internal.h @@ -0,0 +1,48 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_RELU_KERAS_INTERNAL_H +#define _VSI_NN_OP_RELU_KERAS_INTERNAL_H + +#include "vsi_nn_types.h" + +#define _VSI_NN_RELU_KERAS_INTERNAL_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_relu_keras_internal_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_RELU_KERAS_INTERNAL_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_relu_keras_internal_lcl_data; + +typedef struct _vsi_nn_relu_keras_internal_param +{ + vsi_nn_relu_keras_internal_lcl_data local; + + float alpha; + float max_value; + float threshold; +} vsi_nn_relu_keras_internal_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_relun.h b/src/tim/vx/internal/include/ops/vsi_nn_op_relun.h new file mode 100644 index 0000000..3f59395 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_relun.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_RELUN_H +#define _VSI_NN_OP_RELUN_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_relun_param +{ + float relu_clamp_top; + float relu_clamp_bottom; +} vsi_nn_relun_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reorg.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reorg.h new file mode 100644 index 0000000..74d5851 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reorg.h @@ -0,0 +1,43 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_REORG_H +#define _VSI_NN_OP_REORG_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_reorg_param +{ + uint32_t stride; +} vsi_nn_reorg_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reshape.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reshape.h new file mode 100644 index 0000000..1b5ca0b --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reshape.h @@ -0,0 +1,52 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_RESHAPE_H +#define _VSI_NN_OP_RESHAPE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_reshape_lcl_data +{ + vsi_bool initialized; +} vsi_nn_reshape_lcl_data; + +typedef struct _vsi_nn_reshape_param +{ + const uint32_t * size; + uint32_t dim_num; + + /* reshape layer local data structure */ + vsi_nn_reshape_lcl_data local; +} vsi_nn_reshape_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h new file mode 100644 index 0000000..50270a1 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h @@ -0,0 +1,65 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_RESIZE_H +#define _VSI_NN_OP_RESIZE_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_RESIZE_LOCAL_TENSOR_NUM 2 + +typedef uint32_t vsi_nn_interpolation_type_t; enum +{ + VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR = 0, + VSI_NN_INTERPOLATION_BILINEAR, + VSI_NN_INTERPOLATION_AREA +}; + +typedef struct _vsi_nn_resize_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_RESIZE_LOCAL_TENSOR_NUM]; +} vsi_nn_resize_lcl_data; + +typedef struct _vsi_nn_resize_param +{ + vsi_enum type; + float factor; + int32_t size[2]; + + /* resize layer local data structure */ + vsi_nn_resize_lcl_data local; + vsi_bool align_corners; + vsi_bool half_pixel_centers; +} vsi_nn_resize_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h new file mode 100644 index 0000000..578d943 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_RESIZE_INTERNAL_H +#define _VSI_NN_OP_RESIZE_INTERNAL_H + +#include "vsi_nn_types.h" + + +typedef struct _vsi_nn_resize_in_lcl_data +{ + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_resize_in_lcl_data; + +typedef struct _vsi_nn_resize_internal_param +{ + vsi_nn_resize_in_lcl_data *lcl_data_ptr; + vsi_bool align_corners; + vsi_bool half_pixel_centers; + float factor; +} vsi_nn_resize_internal_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_nearest_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_nearest_internal.h new file mode 100644 index 0000000..b700334 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_nearest_internal.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_RESIZE_NEAREST_INTERNAL_H +#define _VSI_NN_OP_RESIZE_NEAREST_INTERNAL_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_resize_nearest_in_lcl_data +{ + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_resize_nearest_in_lcl_data; + +typedef struct _vsi_nn_resize_nearest_internal_param +{ + vsi_nn_resize_nearest_in_lcl_data *lcl_data_ptr; + vsi_bool align_corners; + vsi_bool half_pixel_centers; + float factor; +} vsi_nn_resize_nearest_internal_param; + + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reverse.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reverse.h new file mode 100644 index 0000000..3e167a8 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reverse.h @@ -0,0 +1,55 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_REVERSE_H +#define _VSI_NN_OP_REVERSE_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_REVERSE_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_reverse_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_REVERSE_LOCAL_TENSOR_NUM]; +} vsi_nn_reverse_lcl_data; + +typedef struct _vsi_nn_reverse_param +{ + const int32_t *axis; + uint32_t axis_num; + + /* reverse layer local data structure */ + vsi_nn_reverse_lcl_data local; +} vsi_nn_reverse_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_rnn.h b/src/tim/vx/internal/include/ops/vsi_nn_op_rnn.h new file mode 100644 index 0000000..0083c78 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_rnn.h @@ -0,0 +1,71 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_RNN_H +#define _VSI_NN_OP_RNN_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* enum for inputs/outputs */ +enum +{ + RNNCELL_INPUT_INPUT = 0, + RNNCELL_INPUT_WEIGHT_I = 1, + RNNCELL_INPUT_WEIGHT_H = 2, + RNNCELL_INPUT_BIAS = 3, + RNNCELL_INPUT_H_STATE = 4, + + RNNCELL_INPUT_AUX_INPUT = 5, + RNNCELL_INPUT_AUX_WEIGHT = 6, + RNNCELL_INPUT_CNT, + + RNNCELL_OUTPUT_H_STATE = 0, + RNNCELL_OUTPUT_OUTPUT = 1, + RNNCELL_OUTPUT_CNT +}; + +enum +{ + RNNCELL_QUANTIZE_PARAM_I, + RNNCELL_QUANTIZE_PARAM_H, + RNNCELL_QUANTIZE_PARAM_AUX, + + RNNCELL_QUANTIZE_PARAM_COUNT +}; + +typedef struct _vsi_nn_rnn_param +{ + vsi_nn_activation_e activation; +} vsi_nn_rnn_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_rnncell_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_rnncell_ovxlib.h new file mode 100644 index 0000000..1f48989 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_rnncell_ovxlib.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_RNNCELL_OVXLIB_H +#define _VSI_NN_OP_RNNCELL_OVXLIB_H + +#include "vsi_nn_types.h" +#include "vsi_nn_op_rnn.h" + +typedef struct _vsi_nn_rnncell_ovxlib_lcl_data_t +{ + vsi_bool multi_batch; +} vsi_nn_rnncell_ovxlib_lcl_data_t; + +typedef struct _vsi_nn_rnncell_ovxlib_param +{ + vsi_nn_rnncell_ovxlib_lcl_data_t* local; + + vsi_nn_activation_e activation; + vsi_nn_dtype_t* internal_dtype; +} vsi_nn_rnncell_ovxlib_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_roi_align.h b/src/tim/vx/internal/include/ops/vsi_nn_op_roi_align.h new file mode 100644 index 0000000..e61a33b --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_roi_align.h @@ -0,0 +1,40 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_ROI_ALIGN_H +#define _VSI_NN_OP_ROI_ALIGN_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_roi_align_param +{ + int32_t output_height; + int32_t output_width; + float height_ratio; + float width_ratio; + int32_t height_sample_num; + int32_t width_sample_num; +} vsi_nn_roi_align_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_roi_pool.h b/src/tim/vx/internal/include/ops/vsi_nn_op_roi_pool.h new file mode 100644 index 0000000..f93dfa6 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_roi_pool.h @@ -0,0 +1,51 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_ROI_POOL_H +#define _VSI_NN_OP_ROI_POOL_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_roi_pool_lcl_data +{ + vx_tensor rois; +} vsi_nn_roi_pool_lcl_data; + +typedef struct _vsi_nn_roi_pool_param +{ + vsi_nn_roi_pool_lcl_data local; + vsi_enum type; + uint32_t size[2]; + float scale; +} vsi_nn_roi_pool_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_scale.h b/src/tim/vx/internal/include/ops/vsi_nn_op_scale.h new file mode 100644 index 0000000..1feef4f --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_scale.h @@ -0,0 +1,55 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SCALE_H +#define _VSI_NN_OP_SCALE_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_SCALE_LOCAL_TENSOR_NUM 4 + +typedef struct _vsi_nn_scale_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_SCALE_LOCAL_TENSOR_NUM]; +} vsi_nn_scale_lcl_data; + +typedef struct _vsi_nn_scale_param +{ + int32_t axis; + float bias; + + /* scale layer local data structure */ + vsi_nn_scale_lcl_data local; +} vsi_nn_scale_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_nd.h b/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_nd.h new file mode 100644 index 0000000..9464f76 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_nd.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SCATTER_ND_H +#define _VSI_NN_OP_SCATTER_ND_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_scatter_nd_param +{ + uint32_t dim_num; + uint32_t* shape; +} vsi_nn_scatter_nd_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_select.h b/src/tim/vx/internal/include/ops/vsi_nn_op_select.h new file mode 100644 index 0000000..cd52219 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_select.h @@ -0,0 +1,43 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SELECT_H +#define _VSI_NN_OP_SELECT_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_select_param +{ + void *reserve; +} vsi_nn_select_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_shufflechannel.h b/src/tim/vx/internal/include/ops/vsi_nn_op_shufflechannel.h new file mode 100644 index 0000000..0eaae1b --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_shufflechannel.h @@ -0,0 +1,51 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_SHUFFLECHANNEL_H +#define _VSI_NN_OP_CLIENT_SHUFFLECHANNEL_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_shufflechannel_lcl_data_t +{ + vx_tensor input_tensor; + vx_tensor output_tensor; + vsi_nn_tensor_t *block_size_tensor; +} vsi_nn_shufflechannel_lcl_data_t; + +typedef struct _vsi_nn_shufflechannel_param +{ + int32_t group_number; + int32_t axis; + vsi_nn_shufflechannel_lcl_data_t *local; +} vsi_nn_shufflechannel_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_signalframe.h b/src/tim/vx/internal/include/ops/vsi_nn_op_signalframe.h new file mode 100644 index 0000000..6ef0871 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_signalframe.h @@ -0,0 +1,64 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_SIGNALFRAME_H +#define _VSI_NN_OP_CLIENT_SIGNALFRAME_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_SIGNALFRAME_LOCAL_TENSOR_NUM 7 + +typedef struct _vsi_nn_signalframe_lcl_data +{ + //vsi_nn_tensor_t *signal_tensor; + //vsi_nn_tensor_t *frame_tensor; + vsi_nn_tensor_t *window_length_tensor; + vsi_nn_tensor_t *step_tensor; + vsi_nn_tensor_t *pad_end_tensor; + vsi_nn_tensor_t *pad_tensor; + vsi_nn_tensor_t *axis_tensor; + vx_tensor local_tensor[_VSI_NN_SIGNALFRAME_LOCAL_TENSOR_NUM]; +} vsi_nn_signalframe_lcl_data; + +typedef struct _vsi_nn_signalframe_param +{ + /* local data must be the first. 
*/ + vsi_nn_signalframe_lcl_data local; + + uint32_t window_length; + uint32_t step; + uint32_t pad_end; + uint32_t pad; + uint32_t axis; +} vsi_nn_signalframe_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_sin.h b/src/tim/vx/internal/include/ops/vsi_nn_op_sin.h new file mode 100644 index 0000000..719e520 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_sin.h @@ -0,0 +1,46 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SIN_H +#define _VSI_NN_OP_SIN_H + +#include "vsi_nn_types.h" + + +#define _VSI_NN_SIN_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_sin_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_SIN_LOCAL_TENSOR_NUM]; + uint32_t hash_idx; + vsi_bool execute_on_sw; +} vsi_nn_sin_lcl_data; + +typedef struct _vsi_nn_sin_param +{ + /* sin layer local data structure */ + vsi_nn_sin_lcl_data local; +} vsi_nn_sin_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_slice.h b/src/tim/vx/internal/include/ops/vsi_nn_op_slice.h new file mode 100644 index 0000000..a8fc8b7 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_slice.h @@ -0,0 +1,53 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_SLICE_H +#define _VSI_NN_OP_CLIENT_SLICE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_slice_lcl_data +{ + vx_int32 begin_dims[VSI_NN_MAX_DIM_NUM]; + vx_int32 end_dims[VSI_NN_MAX_DIM_NUM]; + vx_int32 stride_dims[VSI_NN_MAX_DIM_NUM]; +} vsi_nn_slice_lcl_data; + +typedef struct _vsi_nn_slice_param +{ + uint32_t dims; + const uint32_t * start; + const uint32_t * length; + + vsi_nn_slice_lcl_data *lcl_data; +} vsi_nn_slice_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_softmax.h b/src/tim/vx/internal/include/ops/vsi_nn_op_softmax.h new file mode 100644 index 0000000..4e72399 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_softmax.h @@ -0,0 +1,52 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
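/*
 * A minimal sketch, not from this commit: vsi_nn_slice_param above carries a
 * per-dimension start offset and output length. Assuming that reading, the
 * output extent of dimension d is simply length[d] and the op copies
 * input[start[d] .. start[d] + length[d] - 1]. The helper name is illustrative.
 */
#include <stdint.h>

static int slice_request_is_valid(const uint32_t *in_shape,
                                  const uint32_t *start,
                                  const uint32_t *length,
                                  uint32_t dims)
{
    for (uint32_t d = 0; d < dims; ++d) {
        if (start[d] + length[d] > in_shape[d]) {
            return 0; /* slice would read past the input extent */
        }
    }
    return 1;
}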
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SOFTMAX_H +#define _VSI_NN_OP_SOFTMAX_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_softmax_lcl_data +{ + vsi_nn_tensor_t *reshaped_input; + vsi_nn_tensor_t *reshaped_output; +} vsi_nn_softmax_lcl_data; + +typedef struct _vsi_nn_softmax_param +{ + void* placeholder;/* reserved field for ABI test */ + float beta; + int32_t axis; + vsi_nn_softmax_lcl_data local; +} vsi_nn_softmax_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_softmax_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_softmax_internal.h new file mode 100644 index 0000000..6e12636 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_softmax_internal.h @@ -0,0 +1,46 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
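/*
 * A minimal sketch, not from this commit: vsi_nn_softmax_param above carries a
 * scaling factor `beta` and an `axis`. Assuming the conventional definition
 * y_i = exp(beta * x_i) / sum_j exp(beta * x_j), a reference computation over
 * one axis-sized vector, written in a numerically stable form:
 */
#include <math.h>
#include <stddef.h>

static void softmax_ref(const float *x, float *y, size_t n, float beta)
{
    float max_v = x[0];
    for (size_t i = 1; i < n; ++i) {
        if (x[i] > max_v) max_v = x[i];
    }
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        y[i] = expf(beta * (x[i] - max_v)); /* subtract the max for stability */
        sum += y[i];
    }
    for (size_t i = 0; i < n; ++i) {
        y[i] /= sum;
    }
}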
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SOFTMAX_INTERNAL_H +#define _VSI_NN_OP_SOFTMAX_INTERNAL_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "utils/vsi_nn_link_list.h" + +typedef struct _vsi_nn_softmax_internal_lcl_data +{ + vsi_nn_link_list_t link_list; + vx_node node; + vx_tensor src_tensor; + vx_tensor dst_tensor; +} vsi_nn_softmax_internal_lcl_data; + +typedef struct _vsi_nn_softmax_internal_param +{ + vsi_nn_softmax_internal_lcl_data *data; + float beta; +} vsi_nn_softmax_internal_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_space2batch.h b/src/tim/vx/internal/include/ops/vsi_nn_op_space2batch.h new file mode 100644 index 0000000..2961a5a --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_space2batch.h @@ -0,0 +1,54 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SPACE2BATCH_H +#define _VSI_NN_OP_SPACE2BATCH_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_space2batch_lcl_data_t +{ + vsi_nn_tensor_t *block_size_tensor; + vsi_nn_tensor_t *pad_tensor; +} vsi_nn_space2batch_lcl_data_t; + +typedef struct _vsi_nn_space2batch_param +{ + /* local data must be the first. 
*/ + vsi_nn_space2batch_lcl_data_t local; + + const int32_t *block_size; + uint32_t block_size_num; + int32_t pad[4]; // [left, right, top, bottom] +} vsi_nn_space2batch_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_space2depth.h b/src/tim/vx/internal/include/ops/vsi_nn_op_space2depth.h new file mode 100644 index 0000000..417ce46 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_space2depth.h @@ -0,0 +1,51 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SPACE2DEPTH_H +#define _VSI_NN_OP_SPACE2DEPTH_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_space2depth_lcl_data_t +{ + vsi_nn_tensor_t *block_size_tensor; +} vsi_nn_space2depth_lcl_data_t; + +typedef struct _vsi_nn_space2depth_param +{ + /* local data must be the first. */ + vsi_nn_space2depth_lcl_data_t local; + + int32_t block_size[2]; +} vsi_nn_space2depth_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_spatial_transformer.h b/src/tim/vx/internal/include/ops/vsi_nn_op_spatial_transformer.h new file mode 100644 index 0000000..0716fa9 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_spatial_transformer.h @@ -0,0 +1,67 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SPATIAL_TRANSFORMER_H +#define _VSI_NN_OP_SPATIAL_TRANSFORMER_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_SPATIAL_TRANSFORMER_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_spatial_transformer_lcl_data +{ + vsi_nn_tensor_t *local_tensor; + vx_scalar scl; +} vsi_nn_spatial_transformer_lcl_data; + + +typedef struct _vsi_nn_spatial_transformer_param +{ + int32_t output_H; + int32_t output_W; + int32_t has_theta_1_1; + int32_t has_theta_1_2; + int32_t has_theta_1_3; + int32_t has_theta_2_1; + int32_t has_theta_2_2; + int32_t has_theta_2_3; + float theta_1_1; + float theta_1_2; + float theta_1_3; + float theta_2_1; + float theta_2_2; + float theta_2_3; + + vsi_nn_spatial_transformer_lcl_data lcl; +} vsi_nn_spatial_transformer_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_split.h b/src/tim/vx/internal/include/ops/vsi_nn_op_split.h new file mode 100644 index 0000000..400b845 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_split.h @@ -0,0 +1,56 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SPLIT_H +#define _VSI_NN_OP_SPLIT_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "utils/vsi_nn_link_list.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_split_lcl_data +{ + vx_int32 *begin_dims; + vx_int32 *end_dims; + vx_int32 *stride_dims; +} vsi_nn_split_lcl_data; + +typedef struct _vsi_nn_split_param +{ + /* local data must be the first. 
*/ + vsi_nn_split_lcl_data * lcl_data; + uint32_t axis; + const uint32_t *slices; + uint32_t slices_num; +} vsi_nn_split_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_squeeze.h b/src/tim/vx/internal/include/ops/vsi_nn_op_squeeze.h new file mode 100644 index 0000000..249ce2a --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_squeeze.h @@ -0,0 +1,38 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_SQUEEZE_H +#define _VSI_NN_OP_SQUEEZE_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_squeeze_param +{ + // Add parameters here + vx_uint32 *axis; + vx_uint32 axis_num; +} vsi_nn_squeeze_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_stack.h b/src/tim/vx/internal/include/ops/vsi_nn_op_stack.h new file mode 100644 index 0000000..c75702a --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_stack.h @@ -0,0 +1,67 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
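/*
 * A minimal sketch, not from this commit: vsi_nn_split_param above names a
 * split axis and a list of per-output extents (slices). Assuming that
 * reading, the listed extents must add up to the input extent along the
 * split axis; the check below captures that invariant.
 */
#include <stdint.h>

static int split_sizes_cover_axis(uint32_t axis_extent,
                                  const uint32_t *slices,
                                  uint32_t slices_num)
{
    uint32_t total = 0;
    for (uint32_t i = 0; i < slices_num; ++i) {
        total += slices[i];
    }
    return total == axis_extent;
}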
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_STACK_H +#define _VSI_NN_OP_STACK_H + +#include "vsi_nn_types.h" +#define VSI_NN_STACK_MAX_INPUTS (16) + +typedef struct _vsi_nn_stack_lcl_data +{ + vsi_nn_link_list_t link_list; + union + { + /* used for optimze concat to tensor view */ + struct + { + vx_node cp_node; + vx_tensor src_tensor; + vx_tensor dst_tensor; + vsi_nn_tensor_t * src_in; + }; + /* used for vxConcatIndefiniteLayer */ + struct + { + vx_object_array array; + }; + }; +} vsi_nn_stack_lcl_data; + +typedef struct _vsi_nn_stack_lcl +{ + uint32_t block_size; + uint32_t block_num; + vx_tensor local_tensor[VSI_NN_STACK_MAX_INPUTS]; +} vsi_nn_stack_lcl; + +typedef struct _vsi_nn_stack_param +{ + /* local data must be the first. */ + vsi_nn_stack_lcl_data * lcl_data; + vsi_nn_stack_lcl local; + uint32_t axis; +} vsi_nn_stack_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_strided_slice.h b/src/tim/vx/internal/include/ops/vsi_nn_op_strided_slice.h new file mode 100644 index 0000000..ec5b6c9 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_strided_slice.h @@ -0,0 +1,89 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_STRIDED_SLICE_H +#define _VSI_NN_OP_CLIENT_STRIDED_SLICE_H + +#include "vsi_nn_platform.h" +#include "vsi_nn_types.h" +#include "utils/vsi_nn_link_list.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_strided_slice_lcl_data2 +{ + vsi_nn_link_list_t link_list; + /* used for optimze strided slice to tensor view */ + struct + { + vx_node cp_node; + vx_tensor src_tensor; + vx_tensor dst_tensor; + }; + + struct + { + int32_t *begin_dims; + int32_t *end_dims; + int32_t *stride_dims; + int32_t begin_mask; + int32_t end_mask; + int32_t shrink_axis_mask; + }; + + vsi_bool is_dataconvert_op; + vsi_bool is_optimized; +} vsi_nn_strided_slice_lcl_data2; + +typedef struct _vsi_nn_strided_slice_lcl_data_t +{ + vsi_nn_tensor_t *begin_dims_tensor; + vsi_nn_tensor_t *end_dims_tensor; + vsi_nn_tensor_t *stride_dims_tensor; +} vsi_nn_strided_slice_lcl_data_t; + +typedef struct _vsi_nn_strided_slice_param +{ + /* local data must be the first. 
*/ + vsi_nn_strided_slice_lcl_data_t local; + + const vx_int32 *begin_dims; + vx_uint32 begin_dims_num; + const vx_int32 *end_dims; + vx_uint32 end_dims_num; + const vx_int32 *stride_dims; + vx_uint32 stride_dims_num; + vx_int32 begin_mask; + vx_int32 end_mask; + vx_int32 shrink_axis_mask; + + vsi_nn_strided_slice_lcl_data2 * lcl2_data; +} vsi_nn_strided_slice_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_svdf.h b/src/tim/vx/internal/include/ops/vsi_nn_op_svdf.h new file mode 100644 index 0000000..5b99f4a --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_svdf.h @@ -0,0 +1,52 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SVDF_H +#define _VSI_NN_OP_SVDF_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_svdf_lcl_data +{ + vsi_nn_tensor_t *act_tensor; + vsi_nn_tensor_t *rank_tensor; +} vsi_nn_svdf_lcl_data; + +typedef struct _vsi_nn_svdf_param +{ + vsi_nn_svdf_lcl_data local; + int32_t rank; + int32_t num_units; + int32_t spectrogram_length; +} vsi_nn_svdf_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_swish.h b/src/tim/vx/internal/include/ops/vsi_nn_op_swish.h new file mode 100644 index 0000000..a71277c --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_swish.h @@ -0,0 +1,50 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
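/*
 * A minimal sketch, not from this commit: the strided-slice fields above are
 * assumed here to follow the usual TensorFlow-style convention, where a set
 * bit in begin_mask / end_mask means "ignore the supplied bound and take the
 * full range", and the output extent of a kept dimension is
 * ceil((end - begin) / stride) for a positive stride.
 */
#include <stdint.h>

static int32_t strided_slice_out_extent(int32_t extent, int32_t begin,
                                        int32_t end, int32_t stride, int d,
                                        int32_t begin_mask, int32_t end_mask)
{
    if (begin_mask & (1 << d)) begin = 0;       /* take from the start */
    if (end_mask & (1 << d))   end = extent;    /* take to the end */
    if (begin < 0) begin += extent;             /* negative indices wrap */
    if (end < 0)   end += extent;
    if (stride <= 0 || end <= begin) return 0;  /* only positive strides here */
    return (end - begin + stride - 1) / stride; /* ceiling division */
}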
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SWISH_H +#define _VSI_NN_OP_SWISH_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum _vsi_nn_swish_type +{ + VSI_NN_SWISH = 0, + VSI_NN_HSWISH, +} vsi_nn_swish_type; + +typedef struct _vsi_nn_swish_param +{ + float beta; + vsi_nn_swish_type type; +} vsi_nn_swish_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_tanh.h b/src/tim/vx/internal/include/ops/vsi_nn_op_tanh.h new file mode 100644 index 0000000..b93c3e9 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_tanh.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
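/*
 * A minimal sketch, not from this commit: vsi_nn_swish_param above selects
 * between the two variants in vsi_nn_swish_type. Assuming the usual
 * definitions, VSI_NN_SWISH computes x * sigmoid(beta * x) and VSI_NN_HSWISH
 * computes x * relu6(x + 3) / 6.
 */
#include <math.h>

static float swish_ref(float x, float beta)
{
    return x / (1.0f + expf(-beta * x)); /* x * sigmoid(beta * x) */
}

static float hswish_ref(float x)
{
    float t = x + 3.0f;
    if (t < 0.0f) t = 0.0f;              /* relu6(x + 3) */
    if (t > 6.0f) t = 6.0f;
    return x * t / 6.0f;
}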
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_TANH_H +#define _VSI_NN_OP_TANH_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_tanh_param +{ + float scale_a; + float scale_b; +} vsi_nn_tanh_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_tensor_add_mean_stddev_norm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_tensor_add_mean_stddev_norm.h new file mode 100644 index 0000000..3ce2d49 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_tensor_add_mean_stddev_norm.h @@ -0,0 +1,43 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_TENSOR_ADD_MEAN_STDDEV_NORM_H +#define _VSI_NN_OP_TENSOR_ADD_MEAN_STDDEV_NORM_H + +#include "vsi_nn_types.h" + +#define _VSI_NN_TENSORADD_MEANSTDNORM_LOCAL_TENSOR_NUM 3 + +typedef struct _vsi_nn_tensoradd_meanstdnorm_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_TENSORADD_MEANSTDNORM_LOCAL_TENSOR_NUM]; +} vsi_nn_tensoradd_meanstdnorm_lcl_data; + +typedef struct _vsi_nn_tensor_add_mean_stddev_norm_param +{ + vsi_nn_tensoradd_meanstdnorm_lcl_data local; + float eps; +} vsi_nn_tensor_add_mean_stddev_norm_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_tensorstackconcat.h b/src/tim/vx/internal/include/ops/vsi_nn_op_tensorstackconcat.h new file mode 100644 index 0000000..5272644 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_tensorstackconcat.h @@ -0,0 +1,51 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
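/*
 * A minimal sketch, not from this commit: scale_a and scale_b in
 * vsi_nn_tanh_param above are assumed here to follow the common scaled-tanh
 * activation form out = scale_a * tanh(scale_b * in).
 */
#include <math.h>

static float scaled_tanh_ref(float x, float scale_a, float scale_b)
{
    return scale_a * tanhf(scale_b * x);
}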
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_TENSORSTACKCONCAT_H +#define _VSI_NN_OP_CLIENT_TENSORSTACKCONCAT_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_STACKCONCAT_LOCAL_TENSOR_NUM 3 + +typedef struct _vsi_nn_stackconcat_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_STACKCONCAT_LOCAL_TENSOR_NUM]; +} vsi_nn_stackconcat_lcl_data; + +typedef struct _vsi_nn_tensorstackconcat_param +{ + /* local data must be the first. */ + vsi_nn_stackconcat_lcl_data *local; + int32_t axis; +} vsi_nn_tensorstackconcat_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_tile.h b/src/tim/vx/internal/include/ops/vsi_nn_op_tile.h new file mode 100644 index 0000000..258d696 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_tile.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_TILE_H +#define _VSI_NN_OP_TILE_H + +#include "vsi_nn_types.h" + +#define _VSI_NN_TILE_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_tile_lcl_data_t +{ + vsi_nn_tensor_t *multiples_tensor; + vx_tensor local_tensor[_VSI_NN_TILE_LOCAL_TENSOR_NUM]; +} vsi_nn_tile_lcl_data_t; + +typedef struct _vsi_nn_tile_param +{ + /* local data must be the first. 
*/ + vsi_nn_tile_lcl_data_t local; + + const int32_t* multiples; + uint32_t multiples_num; +} vsi_nn_tile_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h new file mode 100644 index 0000000..11fc2c4 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h @@ -0,0 +1,35 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_TOPK_H +#define _VSI_NN_OP_TOPK_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_topk_param +{ + uint32_t k; +} vsi_nn_topk_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h b/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h new file mode 100644 index 0000000..a7281f9 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h @@ -0,0 +1,52 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
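/*
 * A minimal sketch, not from this commit: tile repeats the input
 * multiples[d] times along each dimension d, so the output shape is the
 * element-wise product of the input shape and the multiples array held in
 * vsi_nn_tile_param above.
 */
#include <stdint.h>

static void tile_out_shape(const uint32_t *in_shape, const int32_t *multiples,
                           uint32_t dims, uint32_t *out_shape)
{
    for (uint32_t d = 0; d < dims; ++d) {
        out_shape[d] = in_shape[d] * (uint32_t)multiples[d];
    }
}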
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_UNIDIRECTIONAL_SEQUENCE_RNN_H +#define _VSI_NN_OP_UNIDIRECTIONAL_SEQUENCE_RNN_H + +#include "vsi_nn_types.h" +#include "vsi_nn_op_rnn.h" + +/* enum for inputs/outputs */ +enum +{ + RNN_INPUT_INPUT = 0, + RNN_INPUT_WEIGHT_I = 1, + RNN_INPUT_WEIGHT_H = 2, + RNN_INPUT_BIAS = 3, + RNN_INPUT_H_STATE = 4, + RNN_INPUT_CNT, + + RNN_OUTPUT_OUTPUT = 0, + RNN_OUTPUT_CNT +}; + +typedef struct _vsi_nn_unidirectional_sequence_rnn_param +{ + vsi_bool time_major; + vsi_nn_activation_e activation; + vsi_nn_dtype_t internal_dtype[RNNCELL_QUANTIZE_PARAM_COUNT]; +} vsi_nn_unidirectional_sequence_rnn_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_unstack.h b/src/tim/vx/internal/include/ops/vsi_nn_op_unstack.h new file mode 100644 index 0000000..1ee8220 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_unstack.h @@ -0,0 +1,43 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_UNSTACK_H +#define _VSI_NN_OP_UNSTACK_H + +#include "vsi_nn_types.h" +#define VSI_NN_UNSTACK_MAX_OUTPUTS (16) + +typedef struct _vsi_nn_unstack_lcl_data +{ + vx_object_array array; +} vsi_nn_unstack_lcl_data; + +typedef struct _vsi_nn_unstack_param +{ + /* local data must be the first. 
*/ + vsi_nn_unstack_lcl_data * lcl_data; + uint32_t axis; +} vsi_nn_unstack_param; + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_upsample.h b/src/tim/vx/internal/include/ops/vsi_nn_op_upsample.h new file mode 100644 index 0000000..112f633 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_upsample.h @@ -0,0 +1,53 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_UPSAMPLE_H +#define _VSI_NN_OP_UPSAMPLE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_UPSAMPLE_LOCAL_TENSOR_NUM 3 + +typedef struct _vsi_nn_upsample_lcl_data +{ + vx_tensor local_tensor[_VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM]; +} vsi_nn_upsample_lcl_data; + +typedef struct _vsi_nn_upsample_param +{ + uint32_t scale[2]; + uint32_t size[2]; + /* upsample layer local data structure */ + vsi_nn_upsample_lcl_data local; +} vsi_nn_upsample_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_variable.h b/src/tim/vx/internal/include/ops/vsi_nn_op_variable.h new file mode 100644 index 0000000..d00a295 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_variable.h @@ -0,0 +1,48 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_VARIABLE_H +#define _VSI_NN_OP_VARIABLE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_variable_lcl_data +{ + vsi_bool initialized; +} vsi_nn_variable_lcl_data; + +typedef struct _vsi_nn_variable_param +{ + vsi_nn_variable_lcl_data *local; +} vsi_nn_variable_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/quantization/vsi_nn_asymmetric_affine.h b/src/tim/vx/internal/include/quantization/vsi_nn_asymmetric_affine.h new file mode 100644 index 0000000..3dbe782 --- /dev/null +++ b/src/tim/vx/internal/include/quantization/vsi_nn_asymmetric_affine.h @@ -0,0 +1,53 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_ASYMMETRIC_AFFINE_H +#define _VSI_NN_ASYMMETRIC_AFFINE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +OVXLIB_API vsi_status vsi_nn_QuantAffineCalParam + ( + vsi_nn_type_e type, + float max_data, + float min_data, + float * scale, + int32_t * zero_point + ); + +OVXLIB_API vsi_bool vsi_nn_QuantAffineCheck + ( + vsi_nn_tensor_t *input, + vsi_nn_tensor_t *weight, + vsi_nn_tensor_t *bias + ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/quantization/vsi_nn_dynamic_fixed_point.h b/src/tim/vx/internal/include/quantization/vsi_nn_dynamic_fixed_point.h new file mode 100644 index 0000000..9128922 --- /dev/null +++ b/src/tim/vx/internal/include/quantization/vsi_nn_dynamic_fixed_point.h @@ -0,0 +1,52 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
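/*
 * A minimal sketch, not from this commit: a conventional way to derive the
 * (scale, zero_point) pair that vsi_nn_QuantAffineCalParam reports, assuming
 * an asymmetric uint8 quantizer with the usual mapping
 * real = (q - zero_point) * scale over the observed range [min_data, max_data].
 */
#include <math.h>
#include <stdint.h>

static void affine_u8_params(float min_data, float max_data,
                             float *scale, int32_t *zero_point)
{
    /* the representable range must contain zero */
    if (min_data > 0.0f) min_data = 0.0f;
    if (max_data < 0.0f) max_data = 0.0f;

    *scale = (max_data - min_data) / 255.0f;
    if (*scale <= 0.0f) {
        *scale = 1.0f;      /* degenerate all-zero range */
        *zero_point = 0;
        return;
    }
    int32_t zp = (int32_t)lroundf(-min_data / *scale);
    if (zp < 0)   zp = 0;   /* clamp into the uint8 domain */
    if (zp > 255) zp = 255;
    *zero_point = zp;
}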
+* +*****************************************************************************/ +#ifndef _VSI_NN_DYNAMIC_FIXED_POINT_H +#define _VSI_NN_DYNAMIC_FIXED_POINT_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +OVXLIB_API vsi_status vsi_nn_QuantDFPCalParam + ( + vsi_nn_type_e dtype, + float max_data, + float min_data, + int8_t * fl + ); + +OVXLIB_API vsi_bool vsi_nn_QuantDFPCheck + ( + vsi_nn_tensor_t *input, + vsi_nn_tensor_t *weight, + vsi_nn_tensor_t *bias + ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/quantization/vsi_nn_perchannel_symmetric_affine.h b/src/tim/vx/internal/include/quantization/vsi_nn_perchannel_symmetric_affine.h new file mode 100644 index 0000000..8f0ae20 --- /dev/null +++ b/src/tim/vx/internal/include/quantization/vsi_nn_perchannel_symmetric_affine.h @@ -0,0 +1,53 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
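/*
 * A minimal sketch, not from this commit: dynamic fixed point represents real
 * values as real ~= q * 2^(-fl). One common way to pick the fractional length
 * that vsi_nn_QuantDFPCalParam returns, assumed here for a signed type of
 * `bits` width, is to keep just enough integer bits to cover the largest
 * magnitude in [min_data, max_data] (boundary values may saturate).
 */
#include <math.h>
#include <stdint.h>

static int8_t dfp_fraction_length(float min_data, float max_data, int bits)
{
    float max_abs = fabsf(max_data) > fabsf(min_data) ? fabsf(max_data)
                                                      : fabsf(min_data);
    if (max_abs <= 0.0f) {
        return (int8_t)(bits - 1);        /* all bits can be fractional */
    }
    int int_bits = (int)ceilf(log2f(max_abs));
    return (int8_t)(bits - 1 - int_bits); /* may be negative for large data */
}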
+* +*****************************************************************************/ +#ifndef _VSI_NN_PERCHANNEL_SYMMETRIC_AFFINE_H +#define _VSI_NN_PERCHANNEL_SYMMETRIC_AFFINE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +OVXLIB_API vsi_status vsi_nn_QuantAffinePerchannelCalParam + ( + vsi_nn_type_e type, + float max_data, + float min_data, + float * scales + //int32_t * zero_point + ); + +OVXLIB_API vsi_bool vsi_nn_QuantAffinePerchannelCheck + ( + vsi_nn_tensor_t *input, + vsi_nn_tensor_t *weight, + vsi_nn_tensor_t *bias + ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_binary_tree.h b/src/tim/vx/internal/include/utils/vsi_nn_binary_tree.h new file mode 100644 index 0000000..186f381 --- /dev/null +++ b/src/tim/vx/internal/include/utils/vsi_nn_binary_tree.h @@ -0,0 +1,68 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
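/*
 * A minimal sketch, not from this commit: per-channel symmetric quantization
 * keeps zero_point fixed at 0 and derives one scale per output channel, which
 * matches the scales array filled by vsi_nn_QuantAffinePerchannelCalParam
 * above. Assuming signed 8-bit weights, a common choice is
 * scale[c] = max(|min_c|, |max_c|) / 127.
 */
#include <math.h>
#include <stddef.h>

static void symmetric_i8_scales(const float *min_per_ch,
                                const float *max_per_ch,
                                float *scales, size_t channels)
{
    for (size_t c = 0; c < channels; ++c) {
        float m = fabsf(max_per_ch[c]) > fabsf(min_per_ch[c])
                      ? fabsf(max_per_ch[c]) : fabsf(min_per_ch[c]);
        scales[c] = (m > 0.0f) ? m / 127.0f : 1.0f; /* avoid a zero scale */
    }
}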
+* +*****************************************************************************/ +#ifndef _VSI_NN_BINARY_TREE_H +#define _VSI_NN_BINARY_TREE_H + +#if defined(__cplusplus) +extern "C"{ +#endif + +#include + +typedef int64_t vsi_nn_binary_tree_key_t; + +#define vsi_nn_BinaryTreeInitRoot(n) do{n = NULL;} while (0); + +typedef struct _vsi_nn_binary_tree +{ + struct _vsi_nn_binary_tree * left; + struct _vsi_nn_binary_tree * right; + vsi_nn_binary_tree_key_t key; + void * data_ptr; +} vsi_nn_binary_tree_t; + +OVXLIB_API void vsi_nn_BinaryTreeRemoveNode + ( + vsi_nn_binary_tree_t ** root, + vsi_nn_binary_tree_key_t key + ); + +OVXLIB_API void vsi_nn_BinaryTreeNewNode + ( + vsi_nn_binary_tree_t ** root, + vsi_nn_binary_tree_key_t key, + void * data + ); + +OVXLIB_API void * vsi_nn_BinaryTreeGetNode + ( + vsi_nn_binary_tree_t ** root, + vsi_nn_binary_tree_key_t key + ); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_code_generator.h b/src/tim/vx/internal/include/utils/vsi_nn_code_generator.h new file mode 100644 index 0000000..d5a4119 --- /dev/null +++ b/src/tim/vx/internal/include/utils/vsi_nn_code_generator.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
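/*
 * A minimal usage sketch, not from this commit, using only the declarations
 * in vsi_nn_binary_tree.h above (assumed to be included): the helper is a
 * small key/value map keyed by a 64-bit integer.
 */
#include <stdio.h>

static void binary_tree_demo(void)
{
    vsi_nn_binary_tree_t *root;
    int payload = 42;

    vsi_nn_BinaryTreeInitRoot(root);                /* sets root = NULL */
    vsi_nn_BinaryTreeNewNode(&root, 0x1234, &payload);

    int *found = (int *)vsi_nn_BinaryTreeGetNode(&root, 0x1234);
    if (found) {
        printf("key 0x1234 -> %d\n", *found);
    }
    vsi_nn_BinaryTreeRemoveNode(&root, 0x1234);
}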
+* +*****************************************************************************/ +#ifndef _VSI_NN_CODE_GENERATOR_H +#define _VSI_NN_CODE_GENERATOR_H + +#include "vsi_nn_graph.h" + +#ifdef __cplusplus +extern "C" { +#endif + +OVXLIB_API void vsi_nn_GenGraphCCode + ( + vsi_nn_graph_t * graph, + const char * netpath, + const char * datapath + ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h new file mode 100644 index 0000000..3bb7c5d --- /dev/null +++ b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h @@ -0,0 +1,152 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
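/*
 * A minimal usage sketch, not from this commit: judging by its signature,
 * vsi_nn_GenGraphCCode above emits C source describing a built graph
 * (netpath) together with its data (datapath). Both file names below are
 * hypothetical.
 */
static void dump_graph_as_c(vsi_nn_graph_t *graph)
{
    vsi_nn_GenGraphCCode(graph, "lenet_net.c", "lenet_weights.bin");
}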
+* +*****************************************************************************/ +/** @file */ +#ifndef _VSI_NN_CONSTRAINT_CHECK_H +#define _VSI_NN_CONSTRAINT_CHECK_H + +/*------------------------------------------- + Includes +-------------------------------------------*/ +#include "vsi_nn_tensor.h" +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* short alias for dtype */ +enum { + D_NONE = VSI_NN_TYPE_NONE, + D_I8 = VSI_NN_TYPE_INT8, + D_I16 = VSI_NN_TYPE_INT16, + D_I32 = VSI_NN_TYPE_INT32, + D_I64 = VSI_NN_TYPE_INT64, + D_U8 = VSI_NN_TYPE_UINT8, + D_U16 = VSI_NN_TYPE_UINT16, + D_U32 = VSI_NN_TYPE_UINT32, + D_U64 = VSI_NN_TYPE_UINT64, + D_F16 = VSI_NN_TYPE_FLOAT16, + D_F32 = VSI_NN_TYPE_FLOAT32, + D_F64 = VSI_NN_TYPE_FLOAT64, + D_BF16 = VSI_NN_TYPE_BFLOAT16, + D_BOOL8 = VSI_NN_TYPE_BOOL8 +}; + +/* short alias for qtype */ +enum { + Q_SHIFT = 8, + Q_NONE = VSI_NN_QNT_TYPE_NONE << Q_SHIFT, + Q_DFP = VSI_NN_QNT_TYPE_DFP << Q_SHIFT, + Q_ASYM = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC << Q_SHIFT, + Q_SYM_PC = VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC << Q_SHIFT, + Q_SYM = VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC << Q_SHIFT, +}; + +typedef struct { + uint32_t reg_input_num; + uint32_t reg_output_num; + uint32_t io_types_item_size; + uint32_t io_types_item_count; + const void* types; +} op_constraint_reg_type; + +vsi_bool is_const_tensor + ( + const vsi_nn_tensor_t* tensor + ); + +vsi_bool validate_op_io_types + ( + vsi_nn_node_t* self, + vsi_nn_tensor_t** inputs, + int inputs_num, + vsi_nn_tensor_t** outputs, + int outputs_num, + const op_constraint_reg_type* op_constraint_reg, + const char* name + ); + +void print_op_io_types + ( + const char* name, + const op_constraint_reg_type* op_constraint_reg + ); + +char* generate_op_io_types_desc + ( + vsi_nn_tensor_t** inputs, + int inputs_num, + vsi_nn_tensor_t** outputs, + int outputs_num + ); + +void destroy_op_io_types_desc + ( + char* desc + ); + +vsi_bool is_item_in_array + ( + const void* item, + const void* items, + int item_size, + int item_count + ); + +#define IO_TYPE(...) 
{{__VA_ARGS__}}, +#define BEGIN_IO_TYPE_DECL(NAME, INPUT_COUNT, OUTPUT_COUNT) \ +enum { NAME##_INPUT_COUNT = INPUT_COUNT, \ + NAME##_OUTPUT_COUNT = OUTPUT_COUNT, \ + NAME##_IO_COUNT = NAME##_INPUT_COUNT + NAME##_OUTPUT_COUNT}; \ +static const struct {vsi_nn_type_e types[NAME##_IO_COUNT];} \ +NAME##_supported_io_types[] = { + +#define DECL_OP_CONSTRAINT_REG(NAME) \ +static const op_constraint_reg_type NAME##_REG = { \ + NAME##_INPUT_COUNT, \ + NAME##_OUTPUT_COUNT, \ + sizeof(NAME##_supported_io_types[0]), \ + _cnt_of_array(NAME##_supported_io_types), \ + NAME##_supported_io_types \ +} + +#ifdef OUTPUT_OP_CONSTRAINT +#define END_IO_TYPE_DECL(NAME) }; \ + DECL_OP_CONSTRAINT_REG(NAME); + print_op_io_types(#NAME, NAME##_IO_COUNT, NAME##_supported_io_types, \ + _cnt_of_array(NAME##_supported_io_types)); +#else +#define END_IO_TYPE_DECL(NAME) }; \ +DECL_OP_CONSTRAINT_REG(NAME); +#endif + +#define VALIDATE_OP_IO_TYPES(NAME, SELF, INPUTS, INPUTS_NUM, OUTPUTS, OUTPUTS_NUM) \ + validate_op_io_types(SELF, INPUTS, INPUTS_NUM, OUTPUTS, OUTPUTS_NUM, \ + &NAME##_REG, #NAME) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util.h new file mode 100644 index 0000000..62b8e8b --- /dev/null +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util.h @@ -0,0 +1,241 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_DTYPE_UTIL_H +#define _VSI_NN_DTYPE_UTIL_H + +#include "vsi_nn_platform.h" +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_math.h" + +#ifdef __cplusplus +extern "C" { +#endif + +OVXLIB_API void vsi_nn_TypeGetRange + ( + vsi_nn_type_e type, + double * max_range, + double * min_range + ); + +OVXLIB_API vsi_bool vsi_nn_TypeIsInteger + ( + const vsi_nn_type_e type + ); + +OVXLIB_API vsi_bool vsi_nn_TypeIsSigned + ( + const vsi_nn_type_e type + ); + +OVXLIB_API vsi_status vsi_nn_DtypeConvert + ( + uint8_t * src, + const vsi_nn_dtype_t * src_dtype, + uint8_t * dst, + const vsi_nn_dtype_t * dst_dtype + ); + +/* + * Deprecated: use vsi_nn_TypeGetBytes() insteatd. 
+ */ +OVXLIB_API uint32_t vsi_nn_GetTypeBytes + ( + const vsi_nn_type_e type + ); + +OVXLIB_API uint32_t vsi_nn_TypeGetBytes + ( + const vsi_nn_type_e type + ); + +OVXLIB_API uint16_t vsi_nn_Fp32ToFp16 + ( + float in + ); +/* + * Deprecated: Use vsi_nn_Fp32ToFp16() instead + */ +#define vsi_nn_Fp32toFp16(in) vsi_nn_Fp32ToFp16( in ) + +OVXLIB_API float vsi_nn_Fp16ToFp32 + ( + int16_t in + ); +/* + * Deprecated: Use vsi_nn_Fp16ToFp32() instead + */ +#define vsi_nn_Fp16toFp32(in) vsi_nn_Fp16ToFp32( in ) + +OVXLIB_API float vsi_nn_BFp16ToFp32 + ( + int16_t in + ); + +OVXLIB_API uint16_t vsi_nn_Fp32ToBFp16 + ( + float in + ); + +OVXLIB_API vsi_status vsi_nn_IntegerConvert + ( + const void * src, + vsi_nn_type_e src_type, + void * dest, + vsi_nn_type_e dest_type + ); + +OVXLIB_API int32_t vsi_nn_Fp32ToDFP + ( + const float in, + const int8_t fl, + const vsi_nn_type_e type + ); + +OVXLIB_API float vsi_nn_DFPToFp32 + ( + const int32_t val, + const int8_t fl, + const vsi_nn_type_e type + ); + +OVXLIB_API int32_t vsi_nn_Fp32ToAffine + ( + const float in, + const float scale, + const int32_t zero_point, + const vsi_nn_type_e type + ); + +OVXLIB_API float vsi_nn_AffineToFp32 + ( + const int32_t val, + const float scale, + const int32_t zero_point, + const vsi_nn_type_e type + ); + +/* + * Deprecated: Use vsi_nn_DtypeToFloat32() instead + */ +OVXLIB_API vsi_status vsi_nn_DtypeToFp32 + ( + void * src, + float * dst, + uint32_t index, + const vsi_nn_dtype_t * src_dtype + ); + +/* + * Deprecated: Use vsi_nn_Float32ToDtype() instead + */ +OVXLIB_API vsi_status vsi_nn_Fp32toDtype + ( + float src, + void * dst, + uint32_t index, + const vsi_nn_dtype_t * dst_dtype + ); + +OVXLIB_API vsi_status vsi_nn_DtypeToFloat32 + ( + uint8_t * src, + float * dst, + const vsi_nn_dtype_t * src_dtype + ); + +OVXLIB_API vsi_status vsi_nn_Float32ToDtype + ( + float src, + uint8_t * dst, + const vsi_nn_dtype_t * dst_dtype + ); + +OVXLIB_API int32_t vsi_nn_DtypeConvertRawData + ( + uint8_t * src, + int32_t src_bytes, + const vsi_nn_dtype_t * src_dtype, + uint8_t * dst, + int32_t dst_bytes, + const vsi_nn_dtype_t * dst_dtype + ); + +OVXLIB_API int32_t vsi_nn_DtypeConvertRawDataToFloat32 + ( + uint8_t * src, + int32_t src_bytes, + const vsi_nn_dtype_t * src_dtype, + float * dst, + int32_t dst_size + ); + +OVXLIB_API int32_t vsi_nn_DtypeConvertFloat32ToRawData + ( + float * src, + int32_t src_size, + uint8_t * dst, + int32_t dst_bytes, + const vsi_nn_dtype_t * dst_dtype + ); + +OVXLIB_API vsi_bool vsi_nn_QuantCheck + ( + vsi_nn_tensor_t *input, + vsi_nn_tensor_t *weight, + vsi_nn_tensor_t *bias + ); + +OVXLIB_API vsi_bool vsi_nn_DtypeCompare + ( + vsi_nn_dtype_t *dtype0, + vsi_nn_dtype_t *dtype1 + ); + +OVXLIB_API vsi_status vsi_nn_vxConvertTensorToFloat32Data + ( + vx_context context, + vx_tensor tensor, + vsi_nn_tensor_attr_t *attr, + float *f32_data, + uint32_t f32_data_sz + ); + +OVXLIB_API vsi_status vsi_nn_vxConvertFloat32DataToTensor + ( + vx_context context, + vx_tensor tensor, + vsi_nn_tensor_attr_t *attr, + float *f32_data, + uint32_t f32_data_sz + ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h new file mode 100644 index 0000000..d85fafd --- /dev/null +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -0,0 +1,562 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, 
free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_DTYPE_UTIL_PRV_H +#define _VSI_NN_DTYPE_UTIL_PRV_H + +#include "vsi_nn_types.h" +#include "vsi_nn_math.h" +#include "vsi_nn_tensor.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static inline vsi_bool type_is_integer + ( + const vsi_nn_type_e type + ) +{ + vsi_bool ret; + ret = FALSE; + switch( type ) + { + case VSI_NN_TYPE_INT8: + case VSI_NN_TYPE_INT16: + case VSI_NN_TYPE_INT32: + case VSI_NN_TYPE_INT64: + case VSI_NN_TYPE_UINT8: + case VSI_NN_TYPE_UINT16: + case VSI_NN_TYPE_UINT32: + case VSI_NN_TYPE_UINT64: + case VSI_NN_TYPE_BOOL8: + ret = TRUE; + break; + default: + break; + } + return ret; +} /* type_is_integer() */ + +static inline vsi_bool type_is_signed + ( + const vsi_nn_type_e type + ) +{ + vsi_bool ret; + ret = FALSE; + switch( type ) + { + case VSI_NN_TYPE_INT8: + case VSI_NN_TYPE_INT16: + case VSI_NN_TYPE_INT32: + case VSI_NN_TYPE_INT64: + case VSI_NN_TYPE_FLOAT16: + case VSI_NN_TYPE_FLOAT32: + case VSI_NN_TYPE_FLOAT64: + case VSI_NN_TYPE_BFLOAT16: + ret = TRUE; + break; + default: + break; + } + return ret; +} /* type_is_signed() */ + +static inline uint32_t type_get_bytes + ( + const vsi_nn_type_e type + ) +{ + switch( type ) + { + case VSI_NN_TYPE_INT8: + case VSI_NN_TYPE_UINT8: + case VSI_NN_TYPE_BOOL8: + return 1; + case VSI_NN_TYPE_INT16: + case VSI_NN_TYPE_UINT16: + case VSI_NN_TYPE_FLOAT16: + case VSI_NN_TYPE_BFLOAT16: + return 2; + case VSI_NN_TYPE_INT32: + case VSI_NN_TYPE_UINT32: + case VSI_NN_TYPE_FLOAT32: + return 4; + case VSI_NN_TYPE_INT64: + case VSI_NN_TYPE_UINT64: + case VSI_NN_TYPE_FLOAT64: + return 8; + default: + return 0; + } +} /* type_get_bytes() */ + +static inline void type_get_range + ( + vsi_nn_type_e type, + double * max_range, + double * min_range + ) +{ + int32_t bits; + double from, to; + from = 0.0; + to = 0.0; + bits = type_get_bytes( type ) * 8; + if( type_is_integer( type ) ) + { + if( type_is_signed( type ) ) + { + from = (double)(-(1L << (bits - 1))); + to = (double)((1UL << (bits - 1)) - 1); + } + else + { + from = 0.0; + to = (double)((1UL << bits) - 1); + } + } + else + { + // TODO: Add float + } + if( NULL != max_range ) + { + *max_range = to; + } + if( NULL != min_range ) + { + *min_range = from; + } +} /* type_get_range() */ + +static inline int32_t fp32_to_affine + ( + const float in, + const float scale, + const int32_t zero_point, + const vsi_nn_type_e type + ) +{ + int32_t data; + double max_range; + double 
min_range; + type_get_range( type, &max_range, &min_range ); + data = (int32_t)(vsi_rint( in / scale ) + zero_point ); + data = vsi_nn_max( (int32_t)min_range, vsi_nn_min( (int32_t)max_range , data ) ); + return data; +} /* fp32_to_affine() */ + +static inline float affine_to_fp32 + ( + const int32_t val, + const float scale, + const int32_t zero_point, + const vsi_nn_type_e type + ) +{ + float data; + data = ( (float)val - zero_point ) * scale; + return data; +} /* affine_to_fp32() */ + +static inline int32_t fp32_to_dfp + ( + const float in, + const int8_t fl, + const vsi_nn_type_e type + ) +{ + int32_t data; + double max_range; + double min_range; + type_get_range( type, &max_range, &min_range ); + if( fl > 0 ) + { + data = (int32_t)vsi_rint( in * (float)( (int64_t)1 << fl ) ); + } + else + { + data = (int32_t)vsi_rint( in * ( 1.0f / (float)( (int64_t)1 << -fl ) ) ); + } + data = vsi_nn_min( data, (int32_t)max_range ); + data = vsi_nn_max( data, (int32_t)min_range ); + return data; +} /* fp32_to_dfp() */ + +static inline float dfp_to_fp32 + ( + const int32_t val, + const int8_t fl, + const vsi_nn_type_e type + ) +{ + float result; + if( fl > 0 ) + { + result = (float)val * ( 1.0f / ( (float) ( (int64_t)1 << fl ) ) ); + } + else + { + result = (float)val * ( (float) ( (int64_t)1 << -fl ) ); + } + return result; +} /* dfp_to_fp32() */ + +static inline vsi_status integer_convert + ( + const void * src, + vsi_nn_type_e src_type, + void * dest, + vsi_nn_type_e dest_type + ) +{ + vsi_status status = VSI_SUCCESS; + if( type_is_integer( src_type ) && type_is_integer( dest_type ) ) + { + uint8_t all_zeros[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + uint8_t all_ones[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + uint32_t src_sz = type_get_bytes( src_type ); + uint32_t dest_sz = type_get_bytes( dest_type ); + uint8_t* buffer = all_zeros; + if( type_is_signed( src_type ) && (((int8_t *)src)[src_sz - 1] & 0x80) ) + { + buffer = all_ones; + } + memcpy( buffer, src, src_sz ); + memcpy( dest, buffer, dest_sz ); + } + else + { + status = VSI_FAILURE; + } + return status; +} /* integer_convert() */ + +typedef union +{ + uint32_t u; + float f; +} _fp32_t; + +static inline float fp16_to_fp32 + ( + int16_t in + ) +{ + const _fp32_t magic = { (254 - 15) << 23 }; + const _fp32_t infnan = { (127 + 16) << 23 }; + _fp32_t o; + // Non-sign bits + o.u = ( in & 0x7fff ) << 13; + o.f *= magic.f; + if(o.f >= infnan.f) + { + o.u |= 255 << 23; + } + //Sign bit + o.u |= ( in & 0x8000 ) << 16; + return o.f; +} /* fp16_to_fp32() */ + +static inline float bfp16_to_fp32 + ( + int16_t in + ) +{ + int32_t t1, t2, t3; + float out; + + t1 = in & 0x00FF; // Mantissa + t2 = in & 0xFF00; // Sign bit + Exponent + t3 = in & 0x7F00; // Exponent + + t1 <<= 16; + t2 <<= 16; // Shift (sign + Exponent) bit into position + t1 |= t2; // Re-insert (sign + Exponent) bit + + *((uint32_t*)&out) = t1; + + return t3 == 0 ? 0 : out; +} /* bfp16_to_fp32() */ + +static inline uint16_t fp32_to_fp16 + ( + float in + ) +{ + uint32_t fp32 = *((uint32_t *) &in); + uint32_t t1 = (fp32 & 0x80000000u) >> 16; /* sign bit. */ + uint32_t t2 = (fp32 & 0x7F800000u) >> 13; /* Exponent bits */ + uint32_t t3 = (fp32 & 0x007FE000u) >> 13; /* Mantissa bits, no rounding */ + uint32_t fp16 = 0u; + if( t2 >= 0x023c00u ) + { + fp16 = t1 | 0x7BFF; /* Don't round to infinity. 
*/ + } + else if( t2 <= 0x01c000u ) + { + fp16 = t1; + } + else + { + t2 -= 0x01c000u; + fp16 = t1 | t2 | t3; + } + return (uint16_t) fp16; +} /* fp32_to_fp16() */ + +static inline uint16_t fp32_to_bfp16 + ( + float in + ) +{ + uint32_t fp32 = *((unsigned int *) &in); + uint32_t t1 = fp32 >> 16; + + return (uint16_t) t1; +} /* fp32_to_bfp16() */ + +static inline uint16_t fp32_to_bfp16_rtne + ( + float in + ) +{ + /* + Convert a float point to bfloat16, with round-nearest-to-even as rounding method. + */ + + uint32_t fp32 = *((unsigned int *) &in); + uint16_t out; + + uint32_t lsb = (fp32 >> 16) & 1; /* Least significant bit of resulting bfloat. */ + uint32_t rounding_bias = 0x7fff + lsb; + + if ( VSI_NN_FLOAT32_NAN == in ) + { + out = 0x7fc0; + } + else + { + fp32 += rounding_bias; + out = (uint16_t) (fp32 >> 16); + } + + return out; +} /* fp32_to_bfp16_rtne */ + +static inline vsi_status dtype_to_float32 + ( + uint8_t *src, + float *dst, + const vsi_nn_dtype_t * src_dtype + ) +{ + switch( src_dtype->vx_type ) + { + case VSI_NN_TYPE_FLOAT32: + *dst = *(float *)src; + break; + case VSI_NN_TYPE_FLOAT16: + *dst = fp16_to_fp32( *(int16_t *)src ); + break; + case VSI_NN_TYPE_BFLOAT16: + *dst = bfp16_to_fp32( *(int16_t *)src ); + break; + case VSI_NN_TYPE_INT8: + case VSI_NN_TYPE_BOOL8: + case VSI_NN_TYPE_UINT8: + case VSI_NN_TYPE_INT16: + case VSI_NN_TYPE_INT32: + { + int32_t src_value = 0; + integer_convert(src, src_dtype->vx_type, &src_value, VSI_NN_TYPE_INT32 ); + switch( src_dtype->qnt_type ) + { + case VSI_NN_QNT_TYPE_DFP: + *dst = dfp_to_fp32( src_value, src_dtype->fl, src_dtype->vx_type ); + break; + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + *dst = affine_to_fp32( src_value, + src_dtype->scale, src_dtype->zero_point, src_dtype->vx_type ); + break; + case VSI_NN_QNT_TYPE_NONE: + *dst = (float)src_value; + break; + default: + break; + } + } + break; + default: + return VSI_FAILURE; + } + return VSI_SUCCESS; +} + +static inline vsi_status float32_to_dtype + ( + float src, + uint8_t *dst, + const vsi_nn_dtype_t * dst_dtype + ) +{ + switch( dst_dtype->vx_type ) + { + case VSI_NN_TYPE_FLOAT32: + *(float *)dst = src; + break; + case VSI_NN_TYPE_FLOAT16: + *(int16_t *)dst = fp32_to_fp16( src ); + break; + case VSI_NN_TYPE_BFLOAT16: + *(int16_t *)dst = fp32_to_bfp16_rtne( src ); + break; + case VSI_NN_TYPE_INT8: + case VSI_NN_TYPE_BOOL8: + case VSI_NN_TYPE_UINT8: + case VSI_NN_TYPE_INT16: + case VSI_NN_TYPE_INT32: + { + int32_t dst_value = 0; + switch( dst_dtype->qnt_type ) + { + case VSI_NN_QNT_TYPE_DFP: + dst_value = fp32_to_dfp( src, dst_dtype->fl, dst_dtype->vx_type ); + break; + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + dst_value = fp32_to_affine( src, + dst_dtype->scale, dst_dtype->zero_point, dst_dtype->vx_type ); + break; + case VSI_NN_QNT_TYPE_NONE: + dst_value = (int32_t)src; + break; + default: + break; + } + integer_convert( &dst_value, VSI_NN_TYPE_INT32, dst, dst_dtype->vx_type ); + } + break; + default: + return VSI_FAILURE; + } + return VSI_SUCCESS; +} + +#ifdef __cplusplus +} +#endif + +vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm8 + ( + const float * buffer, size_t size, + float scale, int32_t zero_point, + int8_t * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm16 + ( + const float * buffer, size_t size, + float scale, int32_t zero_point, + int16_t * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm32 + ( + const float * buffer, size_t size, + float scale, int32_t zero_point, + int32_t * out_buffer + ); + +vsi_bool 
vsi_nn_dtype_convert_float_to_quantize_symm64 + ( + const float * buffer, size_t size, + float scale, int32_t zero_point, + int64_t * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_float_to_quantize_asymm8 + ( + const float * buffer, size_t size, + float scale, int32_t zero_point, + uint8_t * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm8_perchannel + ( + const float * buffer, size_t size, + const int32_t * shape, size_t rank, + const float * scale, size_t scale_size, + const int32_t * zero_point, size_t zero_point_size, + int32_t channel_dim, + int8_t * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_quantize_symm8_to_float + ( + const int8_t * buffer, size_t size, + float scale, int32_t zero_point, + float * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_quantize_symm16_to_float + ( + const int16_t * buffer, size_t size, + float scale, int32_t zero_point, + float * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_quantize_symm32_to_float + ( + const int32_t * buffer, size_t size, + float scale, int32_t zero_point, + float * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_quantize_symm64_to_float + ( + const int64_t * buffer, size_t size, + float scale, int32_t zero_point, + float * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_quantize_asymm8_to_float + ( + const uint8_t * buffer, size_t size, + float scale, int32_t zero_point, + float * out_buffer + ); + +vsi_bool vsi_nn_dtype_convert_quantize_symm8_perchannel_to_float + ( + const int8_t * buffer, size_t size, + const int32_t * shape, size_t rank, + const float * scale, size_t scale_size, + const int32_t * zero_point, size_t zero_point_size, + int32_t channel_dim, + float * out_buffer + ); + + +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_hashmap.h b/src/tim/vx/internal/include/utils/vsi_nn_hashmap.h new file mode 100644 index 0000000..c8b3324 --- /dev/null +++ b/src/tim/vx/internal/include/utils/vsi_nn_hashmap.h @@ -0,0 +1,92 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
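/*
 * Illustrative sketch, not part of this commit: round-tripping one value through the
 * inline float32_to_dtype()/dtype_to_float32() helpers declared above. The scale and
 * zero_point are made-up values; the expected results follow from fp32_to_affine()
 * and affine_to_fp32().
 */
#include <string.h>
#include "utils/vsi_nn_dtype_util_prv.h"

static void dtype_roundtrip_example( void )
{
    vsi_nn_dtype_t dtype;
    uint8_t quantized = 0;
    float restored = 0.0f;

    memset( &dtype, 0, sizeof( dtype ) );
    dtype.vx_type    = VSI_NN_TYPE_UINT8;
    dtype.qnt_type   = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
    dtype.scale      = 0.5f;
    dtype.zero_point = 128;

    float32_to_dtype( 3.0f, &quantized, &dtype );      /* rint(3.0 / 0.5) + 128 = 134 */
    dtype_to_float32( &quantized, &restored, &dtype ); /* (134 - 128) * 0.5 = 3.0     */
} /* dtype_roundtrip_example() */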
+* +*****************************************************************************/ +#ifndef _VSI_NN_HASHMAP_H +#define _VSI_NN_HASHMAP_H + +#include +#include "vsi_nn_types.h" +#include "vsi_nn_link_list.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +#define VSI_NN_HASHMAP_KEY_SIZE (21) +typedef struct +{ + vsi_nn_link_list_t link_list; + char * hash_key; + void * data; +} vsi_nn_hashmap_item_t; + +typedef struct +{ + vsi_nn_hashmap_item_t * items; + void * values; + size_t size; +} vsi_nn_hashmap_t; + +vsi_nn_hashmap_t * vsi_nn_hashmap_create(); + +void vsi_nn_hashmap_release + ( vsi_nn_hashmap_t ** map_ptr ); + +void vsi_nn_hashmap_clear + ( vsi_nn_hashmap_t * map ); + +void* vsi_nn_hashmap_get + ( + const vsi_nn_hashmap_t * map, + const char * key + ); + +size_t vsi_nn_hashmap_get_size( const vsi_nn_hashmap_t * map ); + +void vsi_nn_hashmap_add + ( + vsi_nn_hashmap_t * map, + const char * key, + void * value + ); + +void vsi_nn_hashmap_remove + ( + vsi_nn_hashmap_t * map, + const char * key + ); + +vsi_bool vsi_nn_hashmap_has + ( + vsi_nn_hashmap_t * map, + const char * key + ); + +vsi_nn_hashmap_item_t* vsi_nn_hashmap_iter + ( vsi_nn_hashmap_t* map, vsi_nn_hashmap_item_t* item ); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_limits.h b/src/tim/vx/internal/include/utils/vsi_nn_limits.h new file mode 100644 index 0000000..f6b3d8a --- /dev/null +++ b/src/tim/vx/internal/include/utils/vsi_nn_limits.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
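/*
 * Illustrative sketch, not part of this commit: typical use of the string-keyed
 * hashmap declared above. The key and value are made up; the map stores the void*
 * value as-is, so the pointed-to data must outlive the entry.
 */
#include "utils/vsi_nn_hashmap.h"

static void hashmap_example( void )
{
    int answer = 42;
    vsi_nn_hashmap_t * map = vsi_nn_hashmap_create();

    vsi_nn_hashmap_add( map, "answer", &answer );
    if( vsi_nn_hashmap_has( map, "answer" ) )
    {
        int * value = (int *)vsi_nn_hashmap_get( map, "answer" );
        (void)value; /* *value == 42 */
    }
    vsi_nn_hashmap_remove( map, "answer" );
    vsi_nn_hashmap_release( &map ); /* map is reset to NULL */
} /* hashmap_example() */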
+* +*****************************************************************************/ +#ifndef _VSI_NN_LIMITS_H +#define _VSI_NN_LIMITS_H + +#include "vsi_nn_types.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +OVXLIB_API void vsi_nn_TypeGetRange + ( + vsi_nn_type_e type, + double * max_range, + double * min_range + ); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_link_list.h b/src/tim/vx/internal/include/utils/vsi_nn_link_list.h new file mode 100644 index 0000000..e16d9e8 --- /dev/null +++ b/src/tim/vx/internal/include/utils/vsi_nn_link_list.h @@ -0,0 +1,117 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_LINK_LIST_H +#define _VSI_NN_LINK_LIST_H + +#include "vsi_nn_types.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +#define vsi_nn_LinkListInitRoot(n) do{n = NULL;} while (0); + +typedef struct _vsi_nn_link_list +{ + struct _vsi_nn_link_list * prev; + struct _vsi_nn_link_list * next; +} vsi_nn_link_list_t; + +typedef void ( * vsi_nn_link_list_init_t ) + ( + vsi_nn_link_list_t * node + ); + +typedef void ( * vsi_nn_link_list_deinit_t ) + ( + vsi_nn_link_list_t * node + ); + +OVXLIB_API vsi_nn_link_list_t * vsi_nn_LinkListPopStart + ( + vsi_nn_link_list_t ** root + ); + +OVXLIB_API vsi_nn_link_list_t * vsi_nn_LinkListPopEnd + ( + vsi_nn_link_list_t ** root + ); + +OVXLIB_API void vsi_nn_LinkListPushStart + ( + vsi_nn_link_list_t ** root, + vsi_nn_link_list_t * nodes + ); + +OVXLIB_API void vsi_nn_LinkListPushEnd + ( + vsi_nn_link_list_t ** root, + vsi_nn_link_list_t * nodes + ); + +OVXLIB_API vsi_nn_link_list_t * vsi_nn_LinkListNext + ( + vsi_nn_link_list_t * iter + ); + +OVXLIB_API vsi_nn_link_list_t * vsi_nn_LinkListNewNode + ( + size_t sz, + vsi_nn_link_list_init_t init + ); + +OVXLIB_API void vsi_nn_LinkListRemoveNode + ( + vsi_nn_link_list_t ** root, + vsi_nn_link_list_t * nodes + ); + +OVXLIB_API void vsi_nn_LinkListDeinit + ( + vsi_nn_link_list_t * root, + vsi_nn_link_list_deinit_t deinit + ); + +OVXLIB_API vsi_nn_link_list_t *vsi_nn_LinkListGetIndexNode + ( + vsi_nn_link_list_t * root, + uint32_t index + ); + +OVXLIB_API void vsi_nn_LinkListDelIndexNode + ( + vsi_nn_link_list_t ** root, + uint32_t index + ); + +OVXLIB_API uint32_t vsi_nn_LinkListGetNodeNumber + ( + vsi_nn_link_list_t * root + ); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_map.h b/src/tim/vx/internal/include/utils/vsi_nn_map.h new file mode 100644 index 0000000..33ac22a --- /dev/null +++ b/src/tim/vx/internal/include/utils/vsi_nn_map.h @@ -0,0 +1,84 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
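/*
 * Illustrative sketch, not part of this commit: embedding vsi_nn_link_list_t as the
 * first member of a node type so the generic link-list helpers above can manage it.
 * The node type and payload are hypothetical; passing NULL assumes the default
 * init/deinit behaviour is acceptable for plain malloc'ed nodes.
 */
#include "utils/vsi_nn_link_list.h"

typedef struct
{
    vsi_nn_link_list_t link; /* must be the first member */
    int payload;
} example_node_t;

static void link_list_example( void )
{
    vsi_nn_link_list_t * root = NULL;
    example_node_t * node = NULL;

    vsi_nn_LinkListInitRoot( root );

    node = (example_node_t *)vsi_nn_LinkListNewNode( sizeof( example_node_t ), NULL );
    node->payload = 1;
    vsi_nn_LinkListPushEnd( &root, (vsi_nn_link_list_t *)node );

    vsi_nn_LinkListDeinit( root, NULL ); /* releases every node in the list */
} /* link_list_example() */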
+* +*****************************************************************************/ +#ifndef _VSI_NN_MAP_H +#define _VSI_NN_MAP_H + +#include "vsi_nn_types.h" +#include "vsi_nn_link_list.h" +#include "vsi_nn_binary_tree.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +typedef vsi_nn_binary_tree_key_t vsi_nn_map_key_t; + +typedef struct _vsi_nn_map_key_list +{ + vsi_nn_link_list_t link_list; + vsi_nn_map_key_t val; +} vsi_nn_map_key_list_t; + +typedef struct _vsi_nn_map +{ + int size; + vsi_nn_map_key_list_t * keys; + vsi_nn_binary_tree_t * values; +} vsi_nn_map_t; + +OVXLIB_API void vsi_nn_MapInit + ( + vsi_nn_map_t * map + ); + +OVXLIB_API void * vsi_nn_MapGet + ( + vsi_nn_map_t * map, + vsi_nn_map_key_t key + ); + +OVXLIB_API void vsi_nn_MapAdd + ( + vsi_nn_map_t * map, + vsi_nn_map_key_t key, + void * value + ); + +OVXLIB_API void vsi_nn_MapRemove + ( + vsi_nn_map_t * map, + vsi_nn_map_key_t key + ); + +OVXLIB_API vsi_bool vsi_nn_MapHasKey + ( + vsi_nn_map_t * map, + vsi_nn_map_key_t key + ); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_math.h b/src/tim/vx/internal/include/utils/vsi_nn_math.h new file mode 100644 index 0000000..e53066b --- /dev/null +++ b/src/tim/vx/internal/include/utils/vsi_nn_math.h @@ -0,0 +1,211 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_MATH_H +#define _VSI_NN_MATH_H + +#include +#include +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define vsi_nn_abs(x) (((x) < 0) ? -(x) : (x)) +#define vsi_nn_max(a,b) (((a) > (b)) ? (a) : (b)) +#define vsi_nn_min(a,b) (((a) < (b)) ? (a) : (b)) +#define vsi_nn_clamp(x, min, max) (((x) < (min)) ? (min) : \ + ((x) > (max)) ? (max) : (x)) +#define vsi_nn_float_compare(a,b,diff) (vsi_nn_abs((a) - (b)) < (diff) ? 
TRUE : FALSE) +#define vsi_abs(x) vsi_nn_abs(x) +#define vsi_clamp(x, min, max) vsi_nn_clamp(x, min, max) +#define vsi_rtne(x) vsi_rint(x) + +#define VSI_NN_FLOAT32_INF (0x7F800000) +#define VSI_NN_FLOAT32_NAN (0x7FC00000) +#define VSI_NN_FLOAT64_INF (0x7FF0000000000000) +#define VSI_NN_FLOAT64_NAN (0x7FF8000000000000) + + +#define DEFINE_ARRAY_TYPE( NAME, TYPE ) \ + typedef struct { \ + size_t size; \ + TYPE data[0]; \ + } vsi_##NAME##_array_t; \ + static inline vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \ + vsi_##NAME##_array_t * array = (vsi_##NAME##_array_t *)malloc( \ + sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \ + array->size = size; \ + return array; \ + } \ + static inline void vsi_##NAME##_array_release( vsi_##NAME##_array_t ** array ) \ + { \ + if( array && *array ) { \ + free( *array ); \ + *array = NULL; \ + } \ + } +DEFINE_ARRAY_TYPE( int, int32_t ) +DEFINE_ARRAY_TYPE( float, float ) + +#undef DEFINE_ARRAY_TYPE + +OVXLIB_API void vsi_nn_Transpose + ( + uint8_t * dst, + uint8_t * data, + uint32_t * shape, + uint32_t dim_num, + uint32_t * perm, + vsi_nn_type_e type + ); + +OVXLIB_API void vsi_nn_Permute + ( + uint8_t * dst, + uint8_t * data, + uint32_t * shape, + uint32_t dim_num, + uint32_t * perm, + vsi_nn_type_e type + ); + +OVXLIB_API void vsi_nn_SqueezeShape + ( + uint32_t * shape, + uint32_t * dim_num + ); + +OVXLIB_API uint32_t vsi_nn_ShapeProduct + ( + uint32_t * shape, + uint32_t dim_num + ); + +//shape: row first <--> column first +OVXLIB_API void vsi_nn_InvertShape + ( + uint32_t * in, + uint32_t dim_num, + uint32_t * out + ); + +//Permute shape: row first <--> column first +OVXLIB_API void vsi_nn_InvertPermuteShape + ( + uint32_t * in, + uint32_t dim_num, + uint32_t * out + ); + +OVXLIB_API double vsi_nn_Rint + ( + double x + ); + +/** +* Set Seeds for philox_4x32_10 algorithm +* philox_4x32_10 algorithm need 2 uint32_t as seeds. +* +* @param[in] the low uint32_t of the seed. +* @param[in] the high uint32_t of the seed. +*/ +OVXLIB_API void vsi_nn_random_init_for_philox_4x32_10 + ( + uint32_t low, + uint32_t high + ); + +/** +* Random Number Generator By philox_4x32_10 algorithm +* Random Number(uint32_t) Generator By philox_4x32_10 algorithm +* +* @param[out] the buffer for RNG output. +* @param[in] the number of generated random numbers. +*/ +OVXLIB_API void vsi_nn_random_generate_by_philox_4x32_10 + ( + uint32_t *random_buf, + uint32_t len + ); + +/** +* Uniform Transform +* Transform the random uint32_t to Uniform float in [0, 1). +* +* @param[in] the buffer for random uint32_t. +* @param[out] the buffer for uniform float in [0, 1). +* @param[in] the number of random numbers. +*/ +OVXLIB_API void vsi_nn_random_uniform_transform + ( + uint32_t *random_buf, + float *uniform_buf, + uint32_t len + ); + +static inline double copy_sign + ( + double number, + double sign + ) +{ + double value = vsi_nn_abs(number); + return (sign > 0) ? 
value : (-value); +} /* copy_sign() */ + +static inline float simple_round + ( + float x + ) +{ + return (float) copy_sign(floorf(fabsf(x) + 0.5f), x); +} /* simple_round() */ + +static inline double vsi_rint + ( + double x + ) +{ +#define _EPSILON 1e-8 + double decimal; + double inter; + + decimal = modf((double)x, &inter); + if( vsi_nn_abs((vsi_nn_abs(decimal) - 0.5f)) < _EPSILON ) + { + inter += (int32_t)(inter) % 2; + } + else + { + return simple_round( (float)x ); + } + return inter; +} /* vsi_rint() */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_shape_util.h b/src/tim/vx/internal/include/utils/vsi_nn_shape_util.h new file mode 100644 index 0000000..51c0c27 --- /dev/null +++ b/src/tim/vx/internal/include/utils/vsi_nn_shape_util.h @@ -0,0 +1,43 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef __VSI_NN_SHAPE_UTIL_H +#define __VSI_NN_SHAPE_UTIL_H + +#include +#include + +void vsi_nn_shape_get_stride + ( + const int32_t * shape, + size_t rank, + size_t * out_stride + ); + +size_t vsi_nn_shape_get_size + ( + const int32_t * shape, + size_t rank + ); + +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_tensor_op.h b/src/tim/vx/internal/include/utils/vsi_nn_tensor_op.h new file mode 100644 index 0000000..ccd6059 --- /dev/null +++ b/src/tim/vx/internal/include/utils/vsi_nn_tensor_op.h @@ -0,0 +1,54 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
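/*
 * Illustrative sketch, not part of this commit: the vsi_int_array_t helper generated
 * by DEFINE_ARRAY_TYPE above, and the round-half-to-even behaviour implemented by
 * vsi_rint(). The sample values are made up.
 */
#include "utils/vsi_nn_math.h"

static void math_helpers_example( void )
{
    vsi_int_array_t * shape = vsi_int_array_create( 4 );
    shape->data[0] = 1;
    vsi_int_array_release( &shape ); /* shape is reset to NULL */

    /* vsi_rint() rounds exact halves to the nearest even integer:      */
    /* vsi_rint(0.5) == 0.0, vsi_rint(1.5) == 2.0, vsi_rint(2.3) == 2.0 */
} /* math_helpers_example() */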
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef __VSI_NN_TENSOR_OP_H__ +#define __VSI_NN_TENSOR_OP_H__ + +#include "vsi_nn_graph.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_types.h" + +vsi_nn_tensor_t* vsi_nn_Concat + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t** tensors, + uint32_t tensor_num, + uint32_t axis + ); + +vsi_nn_tensor_t* vsi_nn_ConvertTensorDtype + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor, + const vsi_nn_dtype_t* dst_dtype + ); + +vsi_nn_tensor_t* vsi_nn_TensorAdd + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t** tensors, + uint32_t tensor_num, + vsi_nn_tensor_attr_t output_attr + ); + +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h new file mode 100644 index 0000000..93180c7 --- /dev/null +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -0,0 +1,377 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +/** @file */ +#ifndef _VSI_NN_UTIL_H +#define _VSI_NN_UTIL_H + +/*------------------------------------------- + Includes +-------------------------------------------*/ +#include "vsi_nn_platform.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_types.h" +#include "vsi_nn_context.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*------------------------------------------- + Macros and Variables +-------------------------------------------*/ + +#ifndef _cnt_of_array +#define _cnt_of_array( arr ) (sizeof( arr )/sizeof( arr[0] )) +#endif + +#define vsi_nn_safe_free( _PTR ) if( _PTR ){ \ + free( _PTR ); _PTR = NULL; } + +#define vsi_safe_release_tensor(_t) if(_t){vsi_nn_ReleaseTensor(&(_t)); _t = NULL;} + +#define END_OF_VARIADIC_ARGUMENTS 0xbadcaffe +#define FOREACH_ARGS(_args, _next, _arg_type) \ + while(((_arg_type)((size_t)END_OF_VARIADIC_ARGUMENTS)) != (_next = va_arg(_args, _arg_type))) + +/*------------------------------------------- + Functions +-------------------------------------------*/ + +/** + * Load binary data from file + * Load binary data from file, it will malloc the buffer to store + * the data, user need to free it with vsi_nn_Free(). + * @see vsi_nn_Free + * + * @param[in] filename Binary data file path. + * @param[out] sz Size(bytes) of data. + * + * @return Data buffer on success, or NULL otherwise. + */ +OVXLIB_API uint8_t * vsi_nn_LoadBinaryData + ( + const char * filename, + uint32_t * sz + ); + +OVXLIB_API uint32_t vsi_nn_GetStrideSize + ( + vsi_nn_tensor_attr_t * attr, + uint32_t * stride + ); + +OVXLIB_API uint32_t vsi_nn_GetStrideSizeBySize + ( + uint32_t * size, + uint32_t dim_num, + vsi_nn_type_e type, + uint32_t * stride + ); + +OVXLIB_API uint32_t vsi_nn_GetTotalBytesBySize + ( + uint32_t * size, + uint32_t dim_num, + vsi_nn_type_e type + ); + +/** + * Convert data to float32 + * Convert data from any type to float32. + * + * @param[in] data The scalar data address. + * @param[in] type Data type. + * + * @return Converted float32 data. 
+ */ +OVXLIB_API float vsi_nn_DataAsFloat32 + ( + uint8_t * data, + vsi_nn_type_e type + ); + +OVXLIB_API void vsi_nn_UpdateTensorDims + ( + vsi_nn_tensor_attr_t * attr + ); + +OVXLIB_API uint32_t vsi_nn_ComputeFilterSize + ( + uint32_t i_size, + uint32_t ksize, + uint32_t * pad, + uint32_t stride, + uint32_t dilation, + vsi_nn_round_type_e rounding + ); + +OVXLIB_API void vsi_nn_InitTensorsId + ( + vsi_nn_tensor_id_t * ids, + int num + ); + +OVXLIB_API void vsi_nn_ComputePadWithPadType + ( + uint32_t * in_shape, + uint32_t in_dim_num, + uint32_t * ksize, + uint32_t * stride, + vsi_nn_pad_e pad_type, + vsi_nn_round_type_e rounding, + uint32_t * out_pad + ); + +OVXLIB_API void vsi_nn_ComputePadWithPadTypeForConv1D + ( + uint32_t * in_shape, + uint32_t in_dim_num, + uint32_t * ksize, + uint32_t * stride, + vsi_nn_pad_e pad_type, + vsi_nn_round_type_e rounding, + uint32_t * out_pad + ); + +OVXLIB_API void vsi_nn_GetPadForOvx + ( + uint32_t * in_pad, + uint32_t * out_pad + ); + +OVXLIB_API vsi_bool vsi_nn_CreateTensorGroup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * in_tensor, + uint32_t axis, + vsi_nn_tensor_t ** out_tensors, + uint32_t group_number + ); + +OVXLIB_API uint32_t vsi_nn_ShapeToString + ( + uint32_t * shape, + uint32_t dim_num, + char * buf, + uint32_t buf_sz, + vsi_bool for_print + ); + +OVXLIB_API int32_t vsi_nn_Access + ( + const char *path, + int32_t mode + ); + +OVXLIB_API int32_t vsi_nn_Mkdir + ( + const char *path, + int32_t mode + ); + +OVXLIB_API vsi_bool vsi_nn_CheckFilePath + ( + const char *path + ); + +OVXLIB_API void vsi_nn_GetFP32MultiAndPostShift + ( + vx_float32 mult, + vx_uint16 *M0, + vx_int8 *N + ); + +/** + * Malloc aligned buffer + * Malloc address and size aligned buffer. + * + * @param[in] mem_size Buffer size to malloc. + * @param[in] align_start_size Address aligned bytes. + * @param[in] align_block_size Buffer size aligned bytes. + * + * @return The aligned buffer address on success, or NULL otherwise. + */ +OVXLIB_API uint8_t * vsi_nn_MallocAlignedBuffer + ( + uint32_t mem_size, + uint32_t align_start_size, + uint32_t align_block_size + ); + +/** + * Free aligned buffer + * Free aligend buffer malloc with vsi_nn_MallocAlignedBuffer(). + * + * @param[in] handle Buffer handle to free. 
+ * @see vsi_nn_MallocAlignedBuffer + */ +OVXLIB_API void vsi_nn_FreeAlignedBuffer + ( + uint8_t* handle + ); + +OVXLIB_API vsi_bool vsi_nn_IsBufferAligned + ( + uint8_t * buf, + uint32_t align_start_size + ); + +OVXLIB_API void vsi_nn_FormatToString + ( + vsi_nn_tensor_t *tensor, + char *buf, + uint32_t buf_sz + ); + +OVXLIB_API const char* vsi_nn_DescribeStatus + ( + vsi_status status + ); + +uint32_t vsi_nn_compute_filter_shape + ( + vsi_nn_pad_e padding_type, + uint32_t image_size, + uint32_t ksize, + uint32_t stride, + uint32_t dilation_rate + ); + +void vsi_nn_compute_padding + ( + uint32_t * in_shape, + uint32_t * ksize, + uint32_t * stride, + uint32_t * dilation, + vsi_nn_pad_e pad_type, + uint32_t * out_pad + ); + +void vsi_nn_compute_padding_conv1d + ( + uint32_t * in_shape, + uint32_t * ksize, + uint32_t * stride, + uint32_t * dilation, + vsi_nn_pad_e pad_type, + uint32_t * out_pad + ); + +void vsi_nn_OptimizedEltOPShape + ( + vsi_nn_tensor_t * input, + uint32_t sizes[VSI_NN_MAX_DIM_NUM], + uint32_t * num_of_dims + ); + +vsi_bool vsi_nn_OptimizedEltWiseOPShape + ( + vsi_nn_tensor_t * input0, + vsi_nn_tensor_t * input1, + vsi_nn_tensor_t * output, + uint32_t sizes0[VSI_NN_MAX_DIM_NUM], + uint32_t sizes1[VSI_NN_MAX_DIM_NUM], + uint32_t sizes2[VSI_NN_MAX_DIM_NUM], + uint32_t * dim_num + ); + +vsi_bool vsi_nn_IsEVISFeatureAvaiable + ( + vsi_nn_context_t context + ); + +int32_t vsi_nn_compareVersion + ( + vsi_nn_graph_t * graph, + uint32_t version_major, + uint32_t version_minor, + uint32_t version_patch + ); + +typedef uint32_t(*comp_func)(void* data, int32_t left, int32_t right); + +/** + * the meta function for sort/partial sort + * This function is the key meta function of qsort, which can be used in sort/partial sort. + * But you can NOT use this function directly to sort/partial sort. + * This function do NOT sort data itself, but sort its index. + * + * @param[in] buffer of data which will be sorted. + * @param[in] the left(start) index of data. + * @param[in] the right(end) index of data. + * @param[in] compare function. the meaning of return value is as same as std::sort. + * @param[in] recursively execute vsi_nn_partition. + * @param[out] the sorted index of data. + */ +OVXLIB_API int32_t vsi_nn_partition + ( + void* data, + int32_t left, + int32_t right, + comp_func func, + vsi_bool is_recursion, + uint32_t* indices + ); + +/** + * Reorder tensors + * + * @param[in] tensors Tensor list to reorder. + * @param[in] order New orders. + * @param[in] num Number of tensors. 
+ * @param[out] out_tensors Ordered tensors + * */ +static inline void vsi_nn_reorder_tensor + ( + vsi_nn_tensor_t** tensors, + const int32_t* order, + size_t num, + vsi_nn_tensor_t** out_tensors + ) +{ + size_t i; + for( i = 0; i < num; i++ ) + { + out_tensors[i] = tensors[order[i]]; + } +} + +void vsi_nn_print_int_array( int32_t* array, size_t size ); + +float vsi_nn_activation + ( + float value, + vsi_nn_activation_e activation + ); + +vsi_bool vsi_nn_is_same_type + ( + vsi_nn_tensor_t * src, + vsi_nn_tensor_t * dst + ); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_vdata.h b/src/tim/vx/internal/include/utils/vsi_nn_vdata.h new file mode 100644 index 0000000..a0f295f --- /dev/null +++ b/src/tim/vx/internal/include/utils/vsi_nn_vdata.h @@ -0,0 +1,56 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_VDATA_H +#define _VSI_NN_VDATA_H + +#include +#include + +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_tensor.h" + +#ifdef __cplusplus +extern "C" { +#endif + +OVXLIB_API uint8_t * vsi_nn_VdataCreate + ( + vsi_nn_graph_t * graph, + vsi_nn_node_t * node, + uint32_t * p_stream_size + ); + +OVXLIB_API vsi_nn_tensor_t * vsi_nn_CreateVDataTensor + ( + vsi_nn_graph_t * graph, + uint8_t * stream, + vsi_nn_tensor_attr_t * attr + ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_assert.h b/src/tim/vx/internal/include/vsi_nn_assert.h new file mode 100644 index 0000000..43012ef --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_assert.h @@ -0,0 +1,42 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
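/*
 * Illustrative sketch, not part of this commit: the array-count and aligned-buffer
 * helpers declared in vsi_nn_util.h above. The buffer size and alignment values are
 * made up.
 */
#include "utils/vsi_nn_util.h"

static void util_example( void )
{
    static const int32_t axes[] = { 0, 1, 2 };
    uint32_t axis_num = _cnt_of_array( axes ); /* == 3 */
    uint8_t * buffer = vsi_nn_MallocAlignedBuffer( 1024, 64, 64 );

    if( buffer )
    {
        /* vsi_nn_IsBufferAligned( buffer, 64 ) is expected to return TRUE here */
        vsi_nn_FreeAlignedBuffer( buffer );
    }
    (void)axis_num;
} /* util_example() */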
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_ASSERT_H_ +#define _VSI_NN_ASSERT_H_ + +#if defined(__cplusplus) +extern "C"{ +#endif + +#ifndef _compiler_assert +#define _compiler_assert(cond, msg) _assert_impl(cond, __LINE__, msg) +#define _assert_paste(msg, line) msg##line +#define _assert_impl(cond, line, msg) \ + typedef char _assert_paste(assert_failed_##msg##_, line)[2*!!(cond)-1]; +#endif + +#if defined(__cplusplus) +} +#endif + +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/include/vsi_nn_client_op.h b/src/tim/vx/internal/include/vsi_nn_client_op.h new file mode 100644 index 0000000..856f81b --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_client_op.h @@ -0,0 +1,78 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
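/*
 * Illustrative sketch, not part of this commit: _compiler_assert from vsi_nn_assert.h
 * above is a C89-style static assert. The condition must be an integer constant
 * expression and the message must be a plain identifier fragment.
 */
#include <stdint.h>
#include "vsi_nn_assert.h"

_compiler_assert( sizeof( uint32_t ) == 4, uint32_t_is_4_bytes );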
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_H +#define _VSI_NN_OP_CLIENT_H + +/*------------------------------------ + Includes + -----------------------------------*/ + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_ops.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +/*------------------------------------ + Types + -----------------------------------*/ + +/*------------------------------------ + Macros + -----------------------------------*/ + +#define VSI_NN_DEF_CLIENT_OPS( ops, idx ) ( VSI_NN_OP_##ops## = VSI_NN_OP_CLIENT + idx ) + +/*------------------------------------ + Functions + -----------------------------------*/ + +OVXLIB_API vsi_bool vsi_nn_OpIsRegistered + ( + vsi_nn_op_t op + ); + +OVXLIB_API vsi_bool vsi_nn_OpRegisterClient + ( + vsi_nn_op_t op, + vsi_nn_op_proc_t * proc + ); + +OVXLIB_API vsi_nn_op_proc_t * vsi_nn_OpGetClient + ( + vsi_nn_op_t op + ); + +OVXLIB_API void vsi_nn_OpRemoveClient + ( + vsi_nn_op_t op + ); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_compatibility.h b/src/tim/vx/internal/include/vsi_nn_compatibility.h new file mode 100644 index 0000000..bcf2f25 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_compatibility.h @@ -0,0 +1,120 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +/** @file */ +#ifndef _VSI_NN_COMPATIBILITY_H_ +#define _VSI_NN_COMPATIBILITY_H_ + +#if defined(__cplusplus) +extern "C"{ +#endif + +/* + keep the backward compatibility with spec 1.1 for standard nn kernels +*/ +#define VX_KERNEL_NN_SOFTMAX_LAYER VX_KERNEL_SOFTMAX_LAYER +#define VX_KERNEL_NN_NORMALIZATION_LAYER VX_KERNEL_NORMALIZATION_LAYER +#define VX_KERNEL_NN_POOLING_LAYER VX_KERNEL_POOLING_LAYER +#define VX_KERNEL_NN_FULLY_CONNECTED_LAYER VX_KERNEL_FULLY_CONNECTED_LAYER +#define VX_KERNEL_NN_ACTIVATION_LAYER VX_KERNEL_ACTIVATION_LAYER +#define VX_KERNEL_NN_ROIPOOL VX_KERNEL_ROI_POOLING_LAYER +#define VX_KERNEL_NN_CONVOLUTION_LAYER VX_KERNEL_CONVOLUTION_LAYER +#define VX_KERNEL_NN_DECONVOLUTION_LAYER VX_KERNEL_DECONVOLUTION_LAYER + +/* + keep the backward compatibility with spec 1.1 for vx_tensor_attribute_e +*/ +#define VX_TENSOR_NUM_OF_DIMS VX_TENSOR_NUMBER_OF_DIMS +#define VX_TENSOR_FIXED_POINT_POS VX_TENSOR_FIXED_POINT_POSITION + +/* + keep the backward compatibility with spec 1.1 from vx_convolutional_network_rounding_type_e to vx_nn_rounding_type_e +*/ +typedef enum vx_nn_rounding_type_e vx_convolutional_network_rounding_type_e; +#define VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR VX_NN_DS_SIZE_ROUNDING_FLOOR +#define VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_CEILING VX_NN_DS_SIZE_ROUNDING_CEILING + +/* + keep the backward compatibility with spec 1.1 from vx_convolutional_network_pooling_type_e to vx_nn_pooling_type_e +*/ +typedef enum vx_nn_pooling_type_e vx_convolutional_network_pooling_type_e; +#define VX_CONVOLUTIONAL_NETWORK_POOLING_MAX VX_NN_POOLING_MAX +#define VX_CONVOLUTIONAL_NETWORK_POOLING_AVG VX_NN_POOLING_AVG +#define VX_CONVOLUTIONAL_NETWORK_POOLING_L2 VX_NN_POOLING_L2 +#define VX_CONVOLUTIONAL_NETWORK_POOLING_AVG_ANDROID VX_NN_POOLING_AVG_ANDROID + +/* + keep the backward compatibility with spec 1.1 from vx_convolutional_network_norm_type_e to vx_nn_norm_type_e +*/ +typedef enum vx_nn_norm_type_e vx_convolutional_network_norm_type_e; +#define VX_CONVOLUTIONAL_NETWORK_NORM_SAME_MAP VX_NN_NORMALIZATION_SAME_MAP +#define VX_CONVOLUTIONAL_NETWORK_NORM_ACROSS_MAPS VX_NN_NORMALIZATION_ACROSS_MAPS + +/* + keep the backward compatibility with spec 1.1 from vx_convolutional_network_layer_type_e to vx_nn_layer_type_e +*/ +typedef enum vx_nn_layer_type_e vx_convolutional_network_layer_type_e; +#define VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER VX_NN_CONVOLUTION_LAYER +#define VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER VX_NN_FULLYCONNECTED_LAYER + +/* + keep the backward compatibility with spec 1.1 from vx_convolutional_network_activation_func_e to + vx_nn_activation_function_e +*/ +typedef enum vx_nn_activation_function_e vx_convolutional_network_activation_func_e; +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LOGISTIC VX_NN_ACTIVATION_LOGISTIC +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HYPERBOLIC_TAN VX_NN_ACTIVATION_HYPERBOLIC_TAN +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU VX_NN_ACTIVATION_RELU +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_BRELU VX_NN_ACTIVATION_BRELU +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SOFTRELU VX_NN_ACTIVATION_SOFTRELU +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_ABS VX_NN_ACTIVATION_ABS +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SQUARE VX_NN_ACTIVATION_SQUARE +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SQRT VX_NN_ACTIVATION_SQRT +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LINEAR VX_NN_ACTIVATION_LINEAR +#define 
VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LEAKYRELU VX_NN_ACTIVATION_LEAKYRELU +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU6 VX_NN_ACTIVATION_RELU6 +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU1 VX_NN_ACTIVATION_RELU1 +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RSQRT VX_NN_ACTIVATION_RSQRT +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LEAKYRELU_MAX_POOLING VX_NN_ACTIVATION_LEAKYRELU_MAX_POOLING +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_NONE VX_NN_ACTIVATION_NONE +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SWISH VX_NN_ACTIVATION_SWISH +#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HWISH VX_NN_ACTIVATION_HSWISH + +/* + keep the backward compatibility with spec 1.1 for vxCopyTensorPatch_11 +*/ +VX_API_ENTRY vx_status VX_API_CALL +vxCopyTensorPatch_11( + vx_tensor tensor, + vx_tensor_view view, + vx_tensor_addressing user_addr, + void *user_ptr, + vx_enum usage, + vx_enum user_mem_type +); +//#define vxCopyTensorPatch vxCopyTensorPatch_11 + +#if defined(__cplusplus) +} +#endif +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h new file mode 100644 index 0000000..9a571b9 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -0,0 +1,98 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +/** @file */ +#ifndef _VSI_NN_CONTEXT_H +#define _VSI_NN_CONTEXT_H + +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** Max size of HW target name */ +#define VSI_NN_MAX_TARGET_NAME 32 + +/** + * Hardware evis version. + */ +typedef enum +{ + VSI_NN_HW_EVIS_NONE, + VSI_NN_HW_EVIS_1, + VSI_NN_HW_EVIS_2 +} vsi_nn_hw_evis_version_e; + +/** + * Structure to store hardware evis version. + */ +typedef struct _vsi_nn_hw_evis_t +{ + vsi_nn_hw_evis_version_e ver; +} vsi_nn_hw_evis_t; + +/** + * Hardware config. + * It stores hardware name and evis version. + */ +typedef struct _vsi_nn_hw_config_t +{ + char target_name[VSI_NN_MAX_TARGET_NAME]; + vsi_nn_hw_evis_t evis; +#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT + uint32_t subGroupSize; +#endif +} vsi_nn_hw_config_t; + +/** + * Ovxlib NN runtime context. + */ +typedef struct _vsi_nn_context_t +{ + vx_context c; + vsi_nn_hw_config_t config; +} *vsi_nn_context_t; + +/** + * Create context + * Create ovxlib NN runtime context. 
+ * @return Context handle on success, or NULL otherwise. + */ +OVXLIB_API vsi_nn_context_t vsi_nn_CreateContext + ( void ); + +/** + * Release context + * Release ovxlib NN runtime resource and reset context handle to NULL. + * + * @param[in] ctx Pointer to context handle. + */ +OVXLIB_API void vsi_nn_ReleaseContext + ( vsi_nn_context_t * ctx ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_daemon.h b/src/tim/vx/internal/include/vsi_nn_daemon.h new file mode 100644 index 0000000..e005466 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_daemon.h @@ -0,0 +1,66 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_DAEMON_H +#define _VSI_NN_DAEMON_H + +#ifdef __cplusplus + #define _INITIALIZER(f) \ + static void f(void); \ + struct f##_t_{ f##_t_(void) { f(); }}; static f##_t_ f##_; \ + static void f(void) + + #define _DEINITIALIZER(f) \ + static void f(void); \ + struct f##_t_{ ~f##_t_(void) { f(); }}; static f##_t_ f##_; \ + static void f(void) + +#elif defined(_MSC_VER) + #pragma section(".CRT$XCU", read) + #define _INITIALIZER2(f, p) \ + static void f(void); \ + __declspec(allocate(".CRT$XCU")) void (*f##_)(void) = f; \ + __pragma(comment(linker, "/include:" p #f "_")) \ + static void f(void) + #ifdef _WIN64 + #define _INITIALIZER(f) _INITIALIZER2(f, "") + #else + #define _INITIALIZER(f) _INITIALIZER2(f, "_") + #endif + + #define _DEINITIALIZER(f) \ + static void f(void) + +#elif defined(__linux__) + #define _INITIALIZER(f) \ + static void f(void) __attribute__((constructor)); \ + static void f(void) + + #define _DEINITIALIZER(f) \ + static void f(void) __attribute__((destructor)); \ + static void f(void) + +#else + #error: Unsupport compiler. 
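/*
 * Illustrative sketch (not part of the original header): the _INITIALIZER /
 * _DEINITIALIZER macros defined above are used to run a function
 * automatically at library load / unload time, e.g.
 *
 *   _INITIALIZER(ovxlib_startup)
 *   {
 *       // one-time setup executed when the library is loaded
 *   }
 *
 *   _DEINITIALIZER(ovxlib_shutdown)
 *   {
 *       // cleanup executed when the library is unloaded
 *   }
 */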
+#endif +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_error.h b/src/tim/vx/internal/include/vsi_nn_error.h new file mode 100644 index 0000000..7b55aa5 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_error.h @@ -0,0 +1,63 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_ERROR_H +#define _VSI_NN_ERROR_H + +#include +#include "vsi_nn_log.h" +#include "utils/vsi_nn_util.h" + +#define VSI_ASSERT( cond ) assert(cond) + +#define VSI_CHECK_PTR( pointer, msg, retval ) \ + do { \ + if( pointer == NULL ) { \ + VSILOGD("%s",msg); \ + VSI_ASSERT(FALSE); \ + } \ + } while(0) + + +#define CHECK_STATUS_FAIL_GOTO( stat, lbl ) do {\ + if( VSI_SUCCESS != stat ) {\ + VSILOGE("CHECK STATUS(%d:%s)", (stat), vsi_nn_DescribeStatus(stat));\ + goto lbl;\ + }\ +} while(0) + +#define CHECK_STATUS( stat ) do {\ + if( VSI_SUCCESS != stat ) {\ + VSILOGE("CHECK STATUS(%d:%s)", (stat), vsi_nn_DescribeStatus(stat));\ + }\ +} while(0) + +#define CHECK_PTR_FAIL_GOTO( pointer, msg, lbl ) \ + do { \ + if( pointer == NULL ) { \ + VSILOGD("CHECK POINTER %s", msg); \ + goto lbl; \ + } \ + } while(0) + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_feature.h b/src/tim/vx/internal/include/vsi_nn_feature.h new file mode 100644 index 0000000..2ebb367 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_feature.h @@ -0,0 +1,35 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_FEATURE_H_ +#define _VSI_NN_FEATURE_H_ + +#include "vsi_nn_types.h" +#include "vsi_nn_prv.h" + +static inline vsi_bool vsi_nn_feature_conv_max_kernel_size() +{ + return 11; +} + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_feature_config.h b/src/tim/vx/internal/include/vsi_nn_feature_config.h new file mode 100644 index 0000000..8906a96 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h @@ -0,0 +1,7 @@ +/*****Auto generated header file, Please DO NOT modify manually!*****/ +#ifndef _VSI_NN_FEATURE_CONFIG_H +#define _VSI_NN_FEATURE_CONFIG_H + +#define VSI_PERCHANNEL_QUANTIZATION_SUPPORT + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h new file mode 100644 index 0000000..6e3e6fd --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -0,0 +1,718 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +/** @file */ + +#ifndef _VSI_NN_GRAPH_H +#define _VSI_NN_GRAPH_H + +#include "vsi_nn_platform.h" +#include "vsi_nn_context.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_types.h" +#include "vsi_nn_rnn.h" +#include "utils/vsi_nn_map.h" + +/** + * Default max node input or output tensors' number. + * This value may be changed if some node's IO transcent + * it. + * @see vsi_nn_AddNode + * */ +#define VSI_NN_MAX_IO_NUM 32 + +/** + * Default preprocess and postprocess node base uid. + * When add new preprocess node in + * graph, node uid is set based on it. + * @see vsi_nn_AddPreprocNode + * */ +#define VSI_NN_PREPROC_NODE_UID_BASE 10000 + +/** + * Default postprocess node base uid. + * When add new postprocess node in + * graph, node uid is set based on it. + * @see vsi_nn_AddPostprocNode + * */ +#define VSI_NN_POSTPROC_NODE_UID_BASE 20000 + +/** + * Default data convert node base uid. + * When add new data convert node in + * graph, node uid is set based on it. 
+ * @see vsi_nn_AddPreprocNode + * */ +#define VSI_NN_DATACONVERT_NODE_UID_BASE 30000 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Graph structure + */ +struct _vsi_nn_graph +{ + /** Context */ + vsi_nn_context_t ctx; + /** OpenVX graph */ + vx_graph g; + /** Tensor list of this graph */ + union + { + /** @deprecated Never use tensors. */ + vsi_nn_tensor_t ** tensors; + /** Tensor table */ + vsi_nn_map_t * tensor_table; + }; + union + { + /** Current tensor id */ + uint32_t cur_tid; + /** Tensor number */ + uint32_t tensor_num; + }; + /** @deprecated Max tensor number */ + uint32_t max_tensor_num; + /** Node list of this graph */ + union + { + /** @deprecated: Never use nodes. */ + vsi_nn_node_t ** nodes; + /** Node table */ + vsi_nn_map_t * node_table; + }; + union + { + /** Current node id */ + uint32_t cur_nid; + /** Node number */ + uint32_t node_num; + }; + /** @deprecated Max node number */ + uint32_t max_node_num; + /** Max node input or output number */ + uint32_t max_node_io; + /** Inputs to the graph */ + struct + { + vsi_nn_tensor_id_t * tensors; + uint32_t num; + } input; + + /** Outputs to the graph */ + struct + { + vsi_nn_tensor_id_t * tensors; + uint32_t num; + } output; + + /** workspace for RNN */ + void* rnn_wksp; + + /** Handle manager */ + vsi_nn_handle_manager_t handle_manager; + + /** graph version */ + struct + { + uint32_t major; + uint32_t minor; + uint32_t patch; + } version; + + /** Complete signal */ + struct + { + /** Flag to indicate whether a complete signal needs to be appended. */ + vsi_bool exists; + union + { + /** Value to be sent. */ + int64_t value; + /** Reserve some more bytes for future features. */ + uint8_t _bytes[64]; + }; + /** Length is not used yet; currently it is always 8 bytes. */ + int32_t length; + /** COMPLETE signal write address. */ + void* write_address; + /** Pointer that stores the complete signal tensor; + * it will be created automatically after graph setup, + * so please keep it NULL.*/ + vsi_nn_tensor_t* tensor; + } complete_signal; +}; + +/** + * Create graph + * Create a new graph. + * + * @param[in] ctx Context to handle the graph. + * @param[in] tensor_num Number of tensors to be created, + * set it to 0 if it is unknown. + * @param[in] node_num Number of nodes to be created, + * set it to 0 if it is unknown. + * + * @return Graph handle, or NULL on failure. + * + */ +OVXLIB_API vsi_nn_graph_t * vsi_nn_CreateGraph + ( + vsi_nn_context_t ctx, + uint32_t tensor_num, + uint32_t node_num + ); + +/** + * Release graph + * Release the graph and set the graph handle to NULL. + * + * @param[in] graph Graph handle pointer to release. + * + */ +OVXLIB_API void vsi_nn_ReleaseGraph + ( + vsi_nn_graph_t ** graph + ); + +/** + * Setup graph + * Build the graph with OpenVX tensors and nodes. + * + * @param[in] graph Graph handle + * @param[in] sort Whether to sort the nodes. + * + * @return VSI_SUCCESS on success, or appropriate error code otherwise + */ +OVXLIB_API vsi_status vsi_nn_SetupGraph + ( + vsi_nn_graph_t * graph, + vsi_bool sort + ); + +/** + * Verify graph + * Verify the graph; this must be called after graph setup. + * + * @param[in] graph Graph handle + * + * @return VSI_SUCCESS on success, or appropriate error code otherwise + * */ +OVXLIB_API vsi_status vsi_nn_VerifyGraph + ( + vsi_nn_graph_t * graph + ); + +/** + * Run graph + * Invoke all nodes in the graph.
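 *
 * A typical graph lifecycle, sketched only from the declarations in this
 * header (error handling elided; tensor and node construction is shown with
 * the corresponding APIs below):
 * @code
 * vsi_nn_graph_t * graph = vsi_nn_CreateGraph( ctx, 0, 0 );
 * // ... add tensors and nodes, set graph inputs/outputs ...
 * vsi_nn_SetupGraph( graph, TRUE );  // build OpenVX tensors/nodes, sort nodes
 * vsi_nn_VerifyGraph( graph );       // must follow setup
 * vsi_nn_RunGraph( graph );          // invoke all nodes
 * vsi_nn_ReleaseGraph( &graph );
 * @endcode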
+ * + * @param[in] graph Graph handle + * + * @return VSI_SUCCESS on success, or appropriate error code otherwise + */ +OVXLIB_API vsi_status vsi_nn_RunGraph + ( + const vsi_nn_graph_t * graph + ); + +/** + * Generate NBG cache + * Generate the Network Binary Graph (NBG) cache. + * + * @param[in] graph Graph handle + * @param[in] nbg_buffer NBG buffer pointer + * @param[in] size NBG buffer size + * @return VSI_SUCCESS on success, or appropriate error code otherwise + */ +OVXLIB_API vsi_status vsi_nn_GenerateNBG( + vsi_nn_graph_t * graph, + void * nbg_buffer, + size_t * size + ); + +/** + * Run graph asynchronously + * Invoke all nodes in the graph asynchronously. + * + * @param[in] graph Graph handle + * @return VSI_SUCCESS on success, or appropriate error code otherwise + */ +OVXLIB_API vsi_status vsi_nn_AsyncRunGraph + ( + vsi_nn_graph_t * graph + ); + +OVXLIB_API vsi_status vsi_nn_AsyncRunWait + ( + vsi_nn_graph_t * graph + ); + +/** + * Set graph version + * Set the specific ovxlib version; this is used to fetch the + * implementations of different ovxlib versions. + * + * @param[in] graph Graph handle + * @param[in] major Ovxlib major version bound to the graph + * @param[in] minor Ovxlib minor version bound to the graph + * @param[in] patch Ovxlib patch version bound to the graph + * + * @return VSI_SUCCESS on success, or appropriate error code otherwise + */ +OVXLIB_API vsi_status vsi_nn_SetGraphVersion + ( + vsi_nn_graph_t * graph, + uint32_t major, + uint32_t minor, + uint32_t patch + ); + +/** + * Get graph version + * Get the ovxlib version bound to the graph. + * + * @param[in] graph Graph handle + * @param[out] major Ovxlib major version bound to the graph. + * @param[out] minor Ovxlib minor version bound to the graph. + * @param[out] patch Ovxlib patch version bound to the graph. + * + * @return VSI_SUCCESS on success, or appropriate error code otherwise + */ +OVXLIB_API vsi_status vsi_nn_GetGraphVersion + ( + vsi_nn_graph_t * graph, + uint32_t *major, + uint32_t *minor, + uint32_t *patch + ); + +/** + * Add a new tensor to graph + * Create a new tensor and add it to the graph. + * + * @param[in] graph Graph handle + * @param[in] id Optional id to the tensor, set it to VSI_NN_TENSOR_ID_AUTO, + * and a new id will be generated. + * @param[in] attr Tensor attributes for the new tensor. + * @param[in] data Optional data for the new tensor; if it is not NULL, + * the memory will be copied into the tensor memory. + * + * @return The new tensor id on success, or VSI_NN_TENSOR_ID_NA otherwise. + */ +OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t id, + vsi_nn_tensor_attr_t * attr, + /* Optional */ + uint8_t * data + ); + +/** + * Add a new tensor from handle + * Create a new tensor from a memory handle and add it to the graph. + * + * @param[in] graph Graph handle + * @param[in] id Optional id to the tensor, set it to VSI_NN_TENSOR_ID_AUTO, + * and a new id will be generated. + * @param[in] attr Tensor attributes for the new tensor. + * @param[in] data Optional memory handle for the new tensor; the new + * tensor will use this memory as its own handle, + * and the handle must be 64-byte aligned. + * If it is set to NULL, a new 64-byte aligned handle will + * be allocated automatically. + * + * @return The new tensor id on success, or VSI_NN_TENSOR_ID_NA otherwise. + */ +OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromHandle + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t id, + vsi_nn_tensor_attr_t * attr, + uint8_t * data + ); + +/** + * Attach tensor to graph + * Attach an existing tensor to the graph.
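 *
 * Sketch of creating a tensor with vsi_nn_AddTensor declared above (the
 * vsi_nn_tensor_attr_t fields used here, dim_num/size/dtype, are assumed
 * from vsi_nn_tensor.h):
 * @code
 * vsi_nn_tensor_attr_t attr;
 * memset( &attr, 0, sizeof( attr ) );
 * attr.dim_num = 2;                 // assumed attribute layout
 * attr.size[0] = 1000;
 * attr.size[1] = 1;
 * // fill attr.dtype as required
 * vsi_nn_tensor_id_t output_id = vsi_nn_AddTensor(
 *     graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL );
 * @endcode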
+ * + * @param[in] graph Graph handle + * @param[in] id Optional id to the tensor, set it to VSI_NN_TENSOR_ID_AUTO, + * and a new id will be generated. + * @param[in] tensor Tensor attach to the graph. + * + * @return The new tensor id on success, or VSI_NN_TENSOR_ID_NA otherwise. + */ +vsi_nn_tensor_id_t vsi_nn_AttachTensorToGraph + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t id, + vsi_nn_tensor_t * tensor + ); + +/** + * @deprecated + * @see vsi_nn_RemoveTensor + */ +void vsi_nn_DeleteTensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t id + ); + +/** + * Get tensor + * Get tensor from graph. + * + * @param[in] graph Graph handle + * @param[in] tensor_id Tensor's id + * + * @return Tensor's handle on success, or NULL otherwise. + */ +OVXLIB_API vsi_nn_tensor_t * vsi_nn_GetTensor + ( + const vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t tensor_id + ); + +/** + * Get node + * Get node from graph. + * + * @param[in] graph Graph handle + * @param[in] id Node's id + * + * @return Node's handle on success, or NULL otherwise. + */ +OVXLIB_API vsi_nn_node_t * vsi_nn_GetNode + ( + const vsi_nn_graph_t * graph, + vsi_nn_node_id_t id + ); + +/** + * Get tensors + * Get multi tensors from graph. + * + * @param[in] graph Graph handle + * @param[in] tensors_id Tensors' id + * @param[in] num Number of tensors + * @param[out] tensors Tensor handles on success, or NULL otherwise. + */ +OVXLIB_API void vsi_nn_GetTensors + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t * tensors_id, + uint32_t num, + vsi_nn_tensor_t ** tensors + ); + +/** + * Add node + * Create a new node and attach it to graph. + * + * @param[in] graph Graph handle + * @param[in] op Node operation. + * @param[in] input_num Number of inputs to this node. + * @param[in] output_num Number of outputs to this node. + * @param[out] node_id A handle to get the id of new node, + * pass it to NULL to get nothing. + * + * @return The node handle on success, or NULL otherwise. + */ +OVXLIB_API vsi_nn_node_t * vsi_nn_AddNode + ( + vsi_nn_graph_t * graph, + vsi_nn_op_t op, + uint32_t input_num, + uint32_t output_num, + vsi_nn_node_id_t * node_id + ); + +/** + * @deprecated + * @see vsi_nn_AddNode + */ +OVXLIB_API vsi_nn_node_t * vsi_nn_AppendNode + ( + vsi_nn_graph_t * graph, + vsi_nn_op_t op, + vsi_nn_node_id_t * node_id + ); + +/** + * Set graph inputs + * Set inputs to the graph + * + * @param[in] graph Graph handle + * @param[in] tensors_id Input tensors id to the graph. + * @param[in] tensor_num Input tensors number. + * + * @return TRUE on success, or FALSE otherwise. + */ +OVXLIB_API vsi_bool vsi_nn_SetGraphInputs + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t * tensors_id, + uint32_t tensor_num + ); + +/** + * Set graph outputs + * Set outputs to the graph + * + * @param[in] graph Graph handle + * @param[in] tensors_id Output tensors id to the graph. + * @param[in] tensor_num Output tensors number. + * + * @return TRUE on success, or FALSE otherwise. + */ +OVXLIB_API vsi_bool vsi_nn_SetGraphOutputs + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t * tensors_id, + uint32_t tensor_num + ); + +/** + * Remove node + * Remove a node from graph. Please NOTE that, to remove a node + * will break the connections of the node, so it is only used + * when release a graph. + * + * @param[in] graph Graph handle + * @param[in] id Node id to be removed. + */ +void vsi_nn_RemoveNode + ( + vsi_nn_graph_t * graph, + vsi_nn_node_id_t id + ); + +/** + * Sort graph node + * Sort the nodes with the execution sequence. 
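 *
 * Sketch of wiring one node between two tensor ids, using vsi_nn_AddNode and
 * the graph input/output setters declared above (VSI_NN_OP_SOFTMAX is assumed
 * to be one of the operations generated from interface/ops.def):
 * @code
 * vsi_nn_node_t * node = vsi_nn_AddNode( graph, VSI_NN_OP_SOFTMAX, 1, 1, NULL );
 * node->input.tensors[0]  = input_id;   // ids returned by vsi_nn_AddTensor
 * node->output.tensors[0] = output_id;
 * vsi_nn_SetGraphInputs( graph, &input_id, 1 );
 * vsi_nn_SetGraphOutputs( graph, &output_id, 1 );
 * @endcode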
+ * + * @param[in] graph Graph handle + * + * @return Sorted nodes id. The node id buffer is malloc internal, + * the need to release it by user. + */ +vsi_nn_node_id_t * vsi_nn_SortGraphNode + ( + vsi_nn_graph_t * graph + ); + +/** + * Get Nodes by uids + * Get number of nodes with uids. + * + * @param[in] graph Graph handle + * @param[in] node_uids Uids of nodes. + * @param[in] node_uids_size Number of nodes. + * @param[out] nodes Buffer to return node handles. + * @param[in] nodes_num Why we need this? Number of nodes, + * it must be equal to node_uids_size. + * + * @return Number of return node. + */ +OVXLIB_API uint32_t vsi_nn_GetNodesByUids + ( + vsi_nn_graph_t * graph, + uint32_t * node_uids, + uint32_t node_uids_size, + vsi_nn_node_id_t * nodes, + uint32_t nodes_num + ); + +/** + * Dump node outputs + * Dump outputs of given nodes. + * + * @param[in] graph Graph handle + * @param[in] path A path to directory, all results will dump into it, + * @param[in] node_uids Uids of dump nodes. + * @param[in] node_uids_size Number of dump nodes. + * @param[in] force_fp32 TRUE if all results needs to be converted to float32. + * @param[in] data_fmt Not implemented. + */ +OVXLIB_API void vsi_nn_DumpGraphNodeOutputs + ( + vsi_nn_graph_t * graph, + const char * path, + uint32_t * node_uids, + uint32_t node_uids_size, + vsi_bool force_fp32, + vsi_nn_dim_fmt_e data_fmt + ); + +/** + * Dump node outputs + * Dump outputs of given nodes. + * + * @param[in] graph Graph handle + * @param[in] path A path to directory, all results will dump into it, + * @param[in] prefix A prefix of dump nodes. + * @param[in] node_uids Uids of dump nodes. + * @param[in] node_uids_size Number of dump nodes. + * @param[in] force_fp32 TRUE if all results needs to be converted to float32. + * @param[in] data_fmt Not implemented. + */ +OVXLIB_API void vsi_nn_DumpGraphNodeOutputsEx + ( + vsi_nn_graph_t * graph, + const char * path, + const char * prefix, + uint32_t * node_uids, + uint32_t node_uids_size, + vsi_bool force_fp32, + vsi_nn_dim_fmt_e data_fmt + ); + +/** + * Print graph + * Print basic info of a graph. + * + * @param[in] graph Graph handle + */ +OVXLIB_API void vsi_nn_PrintGraph + ( + vsi_nn_graph_t * graph + ); + +/** + * Dump graph to json + * Dump basic info of a graph to json + * + * @param[in] graph Graph handle + */ +OVXLIB_API void vsi_nn_DumpGraphToJson + ( + vsi_nn_graph_t *graph + ); + +/** + * Setup RNN Connections + * + * @param[in] graph Graph handle + * @param[in] connections connections of RNN + * @param[in] connections_count Number of connections + * @see vsi_nn_rnn_external_connection_t + * + * @return VSI_SUCCESS on success, or appropriate error code otherwise + */ +OVXLIB_API vsi_status vsi_nn_SetupRNNConnections + ( + vsi_nn_graph_t* graph, + const vsi_nn_rnn_external_connection_t* connections, + uint32_t connections_count + ); + +/** + * Reset RNN Buffers + * Reset RNN buffers in graph + * + * @param[in] graph Graph handle + * + * @return VSI_SUCCESS on success, or appropriate error code otherwise. + */ +OVXLIB_API vsi_status vsi_nn_ResetRNNBuffers + ( + vsi_nn_graph_t* graph + ); + +/** + * Has RNN + * Check if graph is a RNN + * + * @param[in] graph Graph handle + * + * @return TRUE if graph has RNN, or FALSE if not. + */ +OVXLIB_API vsi_bool vsi_nn_HasRNN + ( + const vsi_nn_graph_t* graph + ); + +/** + * Remove tensor + * Remove tensor from graph. 
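 *
 * Debugging sketch using the dump helpers declared above (VSI_NN_DIM_FMT_NCHW
 * is assumed from vsi_nn_types.h; node uids are the user-assigned uid values):
 * @code
 * uint32_t uids[] = { 1, 2 };
 * vsi_nn_DumpGraphNodeOutputs( graph, "./dump", uids, 2,
 *     TRUE,                  // force_fp32: convert results to float32
 *     VSI_NN_DIM_FMT_NCHW ); // data_fmt (not implemented yet)
 * @endcode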
+ * + * @param[in] graph Graph handle + * @param[in] id Tensor id + */ +void vsi_nn_RemoveTensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t id + ); + +OVXLIB_API vsi_status vsi_nn_TrySetupCompleteSignalNode + ( + vsi_nn_graph_t* graph + ); + +void vsi_nn_get_tensor_consumers + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_id_t tensor_id, + vsi_nn_node_t** nodes, + uint32_t* count + ); + +void vsi_nn_get_tensor_provider + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_id_t tensor_id, + vsi_nn_node_t** node + ); + +OVXLIB_API vsi_status vsi_nn_SetGraphPreloadSize + ( + vsi_nn_graph_t* graph, + vsi_nn_graph_attr_preload_type_e attr, + uint32_t size + ); + +vsi_nn_tensor_id_t vsi_nn_get_tensor_id + ( + vsi_nn_graph_t* graph, + const vsi_nn_tensor_t * tensor + ); + +OVXLIB_API vsi_status vsi_nn_SetGraphPriority + ( + vsi_nn_graph_t* graph, + uint32_t priority + ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_graph_optimization.h b/src/tim/vx/internal/include/vsi_nn_graph_optimization.h new file mode 100644 index 0000000..bdf2e5a --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_graph_optimization.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef VSI_NN_GRAPH_OPTIMIZATION_H +#define VSI_NN_GRAPH_OPTIMIZATION_H + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +vsi_status vsi_nn_OptimizeGraph + ( + vsi_nn_graph_t* graph, + vsi_bool *dirty + ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_internal_node.h b/src/tim/vx/internal/include/vsi_nn_internal_node.h new file mode 100644 index 0000000..4c8113c --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_internal_node.h @@ -0,0 +1,178 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_INTRNAL_NODE_H +#define _VSI_NN_INTRNAL_NODE_H + +#include "vsi_nn_platform.h" +#include "vsi_nn_context.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_types.h" +#include "vsi_nn_rnn.h" +#include "utils/vsi_nn_map.h" +#include "utils/vsi_nn_link_list.h" + +/********************************************************** +* MACROS +**********************************************************/ +#define INTERNAL_NODE_DEBUG FALSE + +/********************************************************** +* TYPES +**********************************************************/ +typedef struct _vsi_nn_internal_node_param_t +{ + vsi_nn_link_list_t link_list; + uint8_t param[1]; +} vsi_nn_internal_node_param_t; + +typedef struct _vsi_nn_internal_node_t +{ + vsi_nn_link_list_t link_list; + + vsi_nn_node_t* node; + vsi_nn_tensor_t** inputs; + vsi_nn_tensor_t** outputs; + vsi_nn_internal_node_param_t* param; + + #if( INTERNAL_NODE_DEBUG ) + char name[32]; + #endif +} vsi_nn_internal_node_t; + +typedef struct _vsi_nn_internal_tensor_t +{ + vsi_nn_link_list_t link_list; + + vsi_nn_tensor_t* t; + + #if( INTERNAL_NODE_DEBUG ) + char name[32]; + #endif +} vsi_nn_internal_tensor_t; + +typedef struct _vsi_nn_internal_node_wksp_t +{ + vsi_nn_internal_node_t* nodes; + vsi_nn_internal_tensor_t* tensors; + int curr_node_uid; +} vsi_nn_internal_node_wksp_t; + +/********************************************************** +* PUBLIC FUNCTIONS +**********************************************************/ +vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor + ( + vsi_nn_node_t* node, + vsi_nn_tensor_attr_t* input_attr, + vsi_nn_tensor_attr_t* weight_attr + ); + +vsi_status vsi_nn_internal_deinit_node + ( + vsi_nn_node_t* node + ); + +vsi_status vsi_nn_internal_deinit_node_wksp + ( + vsi_nn_node_t* node + ); + +void vsi_nn_internal_dump_node_output + ( + vsi_nn_graph_t* graph, + const char* path, + const char* filename_prefix, + vsi_bool force_fp32, + vsi_nn_node_t* node + ); + +vsi_nn_internal_node_t* vsi_nn_internal_get_node_by_uid + ( + vsi_nn_node_t* node, + int uid + ); + +vsi_status vsi_nn_internal_init_node_wksp + ( + vsi_nn_node_t* node + ); + +void vsi_nn_internal_init_tensor_attr + ( + vsi_nn_tensor_attr_t* attr, + const vsi_nn_dtype_t* dtype, + vsi_bool use_virtual_tensor + ); + +vsi_nn_internal_node_t* vsi_nn_internal_new_node + ( + vsi_nn_node_t* node, + vsi_nn_op_t op, + uint32_t input_num, + uint32_t output_num + ); + +void* vsi_nn_internal_new_node_param + ( + vsi_nn_internal_node_t* inode, + size_t size /* in bytes */ + ); + +vsi_nn_internal_tensor_t* vsi_nn_internal_new_tensor + ( + vsi_nn_node_t* node, + vsi_nn_tensor_attr_t* attr, + float default_value + ); + +vsi_status vsi_nn_internal_release_node + ( + vsi_nn_internal_node_t** node + ); + +vsi_status vsi_nn_internal_release_tensor + ( + vsi_nn_internal_tensor_t** tensor + ); + +vsi_bool vsi_nn_internal_setup_node + ( + vsi_nn_node_t* node, + vsi_nn_internal_node_t* inode + ); + +vsi_status vsi_nn_internal_compute_node + ( + vsi_nn_node_t * node + ); + +vsi_status vsi_nn_internal_optimize_node + ( + vsi_nn_node_t * node, + vsi_nn_opt_direction_e direction + ); + +#endif /* _VSI_NN_INTRNAL_NODE_H */ diff --git a/src/tim/vx/internal/include/vsi_nn_log.h b/src/tim/vx/internal/include/vsi_nn_log.h new file mode 100644 index 0000000..cf9c04c --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_log.h @@ -0,0 
+1,71 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_LOG_H +#define _VSI_NN_LOG_H +#include + +#if defined(__cplusplus) +extern "C"{ +#endif + +#ifdef _MSC_VER +#define snprintf _snprintf +#endif + +typedef enum _vsi_nn_log_level_e +{ + VSI_NN_LOG_UNINIT = -1, + VSI_NN_LOG_CLOSE, + VSI_NN_LOG_ERROR, + VSI_NN_LOG_WARN, + VSI_NN_LOG_INFO, + VSI_NN_LOG_DEBUG +}vsi_nn_log_level_e; + +#define VSI_NN_MAX_DEBUG_BUFFER_LEN 1024 +#define VSILOGE( fmt, ... ) \ + vsi_nn_LogMsg(VSI_NN_LOG_ERROR, "E [%s:%d]" fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#define VSILOGW( fmt, ... ) \ + vsi_nn_LogMsg(VSI_NN_LOG_WARN, "W [%s:%d]" fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#define VSILOGI( fmt, ... ) \ + vsi_nn_LogMsg(VSI_NN_LOG_INFO, "I [%s:%d]" fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#define VSILOGD( fmt, ... ) \ + vsi_nn_LogMsg(VSI_NN_LOG_DEBUG, "D [%s:%d]" fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#define _LOG_( fmt, ... ) \ + vsi_nn_LogMsg(VSI_NN_LOG_DEBUG, "[%s:%d]" fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__) + +OVXLIB_API void vsi_nn_LogMsg + ( + vsi_nn_log_level_e level, + const char *fmt, + ... + ); + +#if defined(__cplusplus) +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/vsi_nn_node.h b/src/tim/vx/internal/include/vsi_nn_node.h new file mode 100644 index 0000000..9b3e302 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_node.h @@ -0,0 +1,191 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +/** @file */ +#ifndef _VSI_NN_NODE_H +#define _VSI_NN_NODE_H + +/*------------------------------------ + Includes + -----------------------------------*/ +#include "vsi_nn_platform.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_types.h" +#include "vsi_nn_node_type.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +/*------------------------------------ + Macros + -----------------------------------*/ + +/** Node invalid id */ +#define VSI_NN_NODE_ID_NA ((uint32_t)-1) +/** Node invalid uid */ +#define VSI_NN_NODE_UID_NA ((uint32_t)-1) + +/*------------------------------------ + Types + -----------------------------------*/ +typedef struct _vsi_nn_node_attr_t +{ + int32_t const_tensor_preload_type; + int32_t enable_op_constraint_check; + int32_t reserved[6]; +} vsi_nn_node_attr_t; + +/** Node structure */ +struct _vsi_nn_node +{ + /** + * Graph handle + * @see vsi_nn_graph_t + */ + vsi_nn_graph_t * graph; + /** OpenVX node */ + vx_node n; + /** + * Operation type + * @see vsi_nn_op_t + */ + vsi_nn_op_t op; + /** Node inputs */ + struct + { + vsi_nn_tensor_id_t * tensors; + uint32_t num; + } input; + /** Node outputs */ + struct + { + vsi_nn_tensor_id_t * tensors; + uint32_t num; + } output; + /** Operation parameters */ + vsi_nn_nn_param_t nn_param; + /** Vx parameters */ + vsi_nn_vx_param_t vx_param; + /** + * User specific ID + * This is for debug only. + */ + uint32_t uid; + /** Node's internal node wksp */ + void* internal_node_wksp; + vsi_nn_node_attr_t attr; +}; + +/*------------------------------------ + Functions + -----------------------------------*/ +/** + * New node + * Create a new node with given input and output number. + * + * @param[in] graph Graph handle. + * @param[in] op Operation type. + * @param[in] input_num Input tensor number, set to 0 to use default value. + * @param[in] output_num Output tensor number, set to 0 to use default value. + * @see vei_nn_op_t + * + * @return Node handle on success, or NULL otherwise. + */ +OVXLIB_API vsi_nn_node_t * vsi_nn_NewNode + ( + vsi_nn_graph_t * graph, + vsi_nn_op_t op, + uint32_t input_num, + uint32_t output_num + ); + +/** + * @deprecated + * @see vsi_nn_NewNode + */ +OVXLIB_API vsi_nn_node_t * vsi_nn_CreateNode + ( + vsi_nn_graph_t * graph, + vsi_nn_op_t op + ); + +/** + * Release node + * Release a node and set the handle to NULL. + * + * param[in] node Node handle. + */ +OVXLIB_API void vsi_nn_ReleaseNode + ( + vsi_nn_node_t ** node + ); + +/** + * Print node + * Print brief info of a node. + * + * @param[in] node Node handle. + * @param[in] id Node id. + */ +OVXLIB_API void vsi_nn_PrintNode + ( + vsi_nn_node_t * node, + vsi_nn_node_id_t id + ); + +/** + * Update node attribute + * Update openvx node attribute based on ovxlib's node attribute + * + * @param[in] node Node handle. 
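 *
 * Sketch: the ovxlib-side attributes live in node->attr (vsi_nn_node_attr_t
 * above), and this helper applies them to the underlying OpenVX node:
 * @code
 * node->attr.enable_op_constraint_check = 1;  // exact value semantics assumed
 * vsi_nn_update_node_attr( node );
 * @endcode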
+ */ +vsi_status vsi_nn_update_node_attr + ( + vsi_nn_node_t *node + ); + +/** + * Set node inputs and outputs + * + * @param[in] node Node to set IO + * @param[in] inputs Input tensors + * @param[in] input_num Input tensors' number + * @param[in] outputs Output tensors + * @param[in] output_num Output tensors' number + */ +vsi_status vsi_nn_SetNodeInputsAndOutputs + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t * const inputs[], + int input_num, + vsi_nn_tensor_t * const outputs[], + int output_num + ); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_node_attr_template.h b/src/tim/vx/internal/include/vsi_nn_node_attr_template.h new file mode 100644 index 0000000..a555404 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_node_attr_template.h @@ -0,0 +1,40 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_NODE_ATTR_TEMPLATE_H +#define _VSI_NN_NODE_ATTR_TEMPLATE_H + +#include "vsi_nn_node.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +void vsi_nn_apply_node_attr_template(vsi_nn_node_t * node); + +#if defined(__cplusplus) +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h new file mode 100644 index 0000000..6304280 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -0,0 +1,334 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +/** @file */ +#ifndef _VSI_NN_NODE_TYPES_H_ +#define _VSI_NN_NODE_TYPES_H_ + +#include "vsi_nn_types.h" +#include "vsi_nn_assert.h" +#include "ops/vsi_nn_op_activations.h" +#include "ops/vsi_nn_op_batch_norm.h" +#include "ops/vsi_nn_op_multiply.h" +#include "ops/vsi_nn_op_concat.h" +#include "ops/vsi_nn_op_split.h" +#include "ops/vsi_nn_op_conv2d.h" +#include "ops/vsi_nn_op_deconvolution.h" +#include "ops/vsi_nn_op_fullconnect.h" +#include "ops/vsi_nn_op_lrn.h" +#include "ops/vsi_nn_op_permute.h" +#include "ops/vsi_nn_op_pool.h" +#include "ops/vsi_nn_op_proposal.h" +#include "ops/vsi_nn_op_reshape.h" +#include "ops/vsi_nn_op_roi_pool.h" +#include "ops/vsi_nn_op_upsample.h" +#include "ops/vsi_nn_op_resize.h" +#include "ops/vsi_nn_op_lstm.h" +#include "ops/vsi_nn_op_reorg.h" +#include "ops/vsi_nn_op_l2normalizescale.h" +#include "ops/vsi_nn_op_crop.h" +#include "ops/vsi_nn_op_relun.h" +#include "ops/vsi_nn_op_divide.h" +#include "ops/vsi_nn_op_tanh.h" +#include "ops/vsi_nn_op_dropout.h" +#include "ops/vsi_nn_op_shufflechannel.h" +#include "ops/vsi_nn_op_prelu.h" +#include "ops/vsi_nn_op_elu.h" +#include "ops/vsi_nn_op_reverse.h" +#include "ops/vsi_nn_op_space2depth.h" +#include "ops/vsi_nn_op_depth2space.h" +#include "ops/vsi_nn_op_depth2space_internal.h" +#include "ops/vsi_nn_op_maximum.h" +#include "ops/vsi_nn_op_scale.h" +#include "ops/vsi_nn_op_slice.h" +#include "ops/vsi_nn_op_space2batch.h" +#include "ops/vsi_nn_op_batch2space.h" +#include "ops/vsi_nn_op_pad.h" +#include "ops/vsi_nn_op_imageprocess.h" +#include "ops/vsi_nn_op_matrixmul.h" +#include "ops/vsi_nn_op_lstmunit.h" +#include "ops/vsi_nn_op_layernormalize.h" +#include "ops/vsi_nn_op_reduce.h" +#include "ops/vsi_nn_op_softmax.h" +#include "ops/vsi_nn_op_instancenormalize.h" +#include "ops/vsi_nn_op_tensorstackconcat.h" +#include "ops/vsi_nn_op_strided_slice.h" +#include "ops/vsi_nn_op_signalframe.h" +#include "ops/vsi_nn_op_argmax.h" +#include "ops/vsi_nn_op_svdf.h" +#include "ops/vsi_nn_op_conv1d.h" +#include "ops/vsi_nn_op_nbg.h" +#include "ops/vsi_nn_op_spatial_transformer.h" +#include "ops/vsi_nn_op_logical_ops.h" +#include "ops/vsi_nn_op_select.h" +#include "ops/vsi_nn_op_concatshift.h" +#include "ops/vsi_nn_op_relational_ops.h" +#include "ops/vsi_nn_op_pow.h" +#include "ops/vsi_nn_op_floordiv.h" +#include "ops/vsi_nn_op_lstmunit_activation.h" +#include "ops/vsi_nn_op_lstmunit_ovxlib.h" +#include "ops/vsi_nn_op_tensor_add_mean_stddev_norm.h" +#include "ops/vsi_nn_op_lstm_ovxlib.h" +#include "ops/vsi_nn_op_lsh_projection.h" +#include "ops/vsi_nn_op_rnn.h" +#include "ops/vsi_nn_op_stack.h" +#include "ops/vsi_nn_op_floor.h" +#include "ops/vsi_nn_op_neg.h" +#include "ops/vsi_nn_op_exp.h" +#include "ops/vsi_nn_op_clip.h" +#include "ops/vsi_nn_op_pre_process_tensor.h" +#include "ops/vsi_nn_op_post_process.h" +#include "ops/vsi_nn_op_pre_process_gray.h" +#include "ops/vsi_nn_op_unstack.h" +#include "ops/vsi_nn_op_pre_process_rgb.h" +#include "ops/vsi_nn_op_pre_process.h" +#include "ops/vsi_nn_op_addn.h" +#include "ops/vsi_nn_op_softmax_internal.h" +#include "ops/vsi_nn_op_pre_process_yuv420.h" +#include "ops/vsi_nn_op_pre_process_yuv444.h" +#include 
"ops/vsi_nn_op_pre_process_nv12.h" +#include "ops/vsi_nn_op_extra_ending.h" +#include "ops/vsi_nn_op_gather.h" +#include "ops/vsi_nn_op_scatter_nd.h" +#include "ops/vsi_nn_op_tile.h" +#include "ops/vsi_nn_op_grouped_conv2d.h" +#include "ops/vsi_nn_op_topk.h" +#include "ops/vsi_nn_op_pre_process_bgra.h" +#include "ops/vsi_nn_op_logical_not.h" +#include "ops/vsi_nn_op_sin.h" +#include "ops/vsi_nn_op_log.h" +#include "ops/vsi_nn_op_argmin.h" +#include "ops/vsi_nn_op_roi_align.h" +#include "ops/vsi_nn_op_heatmap_max_keypoint.h" +#include "ops/vsi_nn_op_axis_aligned_bbox_transform.h" +#include "ops/vsi_nn_op_box_with_nms_limit.h" +#include "ops/vsi_nn_op_generate_proposals.h" +#include "ops/vsi_nn_op_detection_postprocess.h" +#include "ops/vsi_nn_op_random_multinomial.h" +#include "ops/vsi_nn_op_log_softmax.h" +#include "ops/vsi_nn_op_relu_keras.h" +#include "ops/vsi_nn_op_relu_keras_internal.h" +#include "ops/vsi_nn_op_reducesum_internal.h" +#include "ops/vsi_nn_op_reducemax_internal.h" +#include "ops/vsi_nn_op_reducemin_internal.h" +#include "ops/vsi_nn_op_gru_ovxlib.h" +#include "ops/vsi_nn_op_grucell_ovxlib.h" +#include "ops/vsi_nn_op_embedding_lookup.h" +#include "ops/vsi_nn_op_reduceprod_internal.h" +#include "ops/vsi_nn_op_reduceall_internal.h" +#include "ops/vsi_nn_op_reduceany_internal.h" +#include "ops/vsi_nn_op_unidirectional_sequence_rnn.h" +#include "ops/vsi_nn_op_quantized_16bit_lstm.h" +#include "ops/vsi_nn_op_bidirectional_sequence_rnn.h" +#include "ops/vsi_nn_op_bidirectional_sequence_lstm.h" +#include "ops/vsi_nn_op_resize_internal.h" +#include "ops/vsi_nn_op_resize_nearest_internal.h" +#include "ops/vsi_nn_op_variable.h" +#include "ops/vsi_nn_op_rnncell_ovxlib.h" +#include "ops/vsi_nn_op_l2_normalize.h" +#include "ops/vsi_nn_op_dataconvert.h" +#include "ops/vsi_nn_op_swish.h" +#include "ops/vsi_nn_op_cast.h" +#include "ops/vsi_nn_op_depthwise_conv1d.h" +#include "ops/vsi_nn_op_grucell_activation_internal.h" +#include "ops/vsi_nn_op_grucell_activation_internal_sma.h" +#include "ops/vsi_nn_op_linear.h" +#include "ops/vsi_nn_op_batchnorm_single.h" +#include "ops/vsi_nn_op_moments.h" +#include "ops/vsi_nn_op_squeeze.h" +#include "ops/vsi_nn_op_expand_broadcast.h" +#include "ops/vsi_nn_op_deconvolution1d.h" +/* custom node head define define */ +#include "custom/vsi_nn_custom_node_type.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +/** Operation attributes */ +typedef union _vsi_nn_nn_param +{ + struct + { + vsi_nn_conv2d_param conv2d; + vsi_nn_pool_param pool; + }; + vsi_nn_fcl_param fcl; + vsi_nn_activation_param activation; + vsi_nn_lrn_param lrn; + vsi_nn_concat_param concat; + vsi_nn_split_param split; + vsi_nn_roi_pool_param roi_pool; + vsi_nn_batch_norm_param batch_norm; + vsi_nn_multiply_param multiply; + vsi_nn_proposal_param proposal; + vsi_nn_deconv_param deconv; + vsi_nn_reshape_param reshape; + vsi_nn_permute_param permute; + vsi_nn_upsample_param upsample; + vsi_nn_resize_param resize; + vsi_nn_lstm_param lstm; + vsi_nn_reorg_param reorg; + vsi_nn_l2normalizescale_param l2normalizescale; + vsi_nn_crop_param crop; + vsi_nn_relun_param relun; + vsi_nn_divide_param divide; + vsi_nn_tanh_param tanh; + vsi_nn_dropout_param dropout; + vsi_nn_shufflechannel_param shufflechannel; + vsi_nn_prelu_param prelu; + vsi_nn_elu_param elu; + vsi_nn_reverse_param reverse; + vsi_nn_space2depth_param space2depth; + vsi_nn_depth2space_param depth2space; + vsi_nn_depth2space_internal_param depth2space_internal; + vsi_nn_maximum_param maximum; + vsi_nn_scale_param scale; + 
vsi_nn_slice_param slice; + vsi_nn_space2batch_param space2batch; + vsi_nn_batch2space_param batch2space; + vsi_nn_pad_param pad; + vsi_nn_imageprocess_param imageprocess; + vsi_nn_matrixmul_param matrixmul; + vsi_nn_lstmunit_param lstmunit; + vsi_nn_layernormalize_param layernorm; + vsi_nn_reduce_param reduce; + vsi_nn_instancenormalize_param instancenorm; + vsi_nn_tensorstackconcat_param tensorstackconcat; + vsi_nn_softmax_param softmax; + vsi_nn_strided_slice_param strided_slice; + vsi_nn_signalframe_param signalframe; + vsi_nn_svdf_param svdf; + vsi_nn_conv1d_param conv1d; + vsi_nn_nbg_param nbg; + vsi_nn_concatshift_param concatshift; + vsi_nn_relational_ops_param relational_ops; + vsi_nn_pow_param pow; + vsi_nn_floordiv_param floordiv; + vsi_nn_spatial_transformer_param spatial_transformer; + vsi_nn_logical_ops_param logical_ops; + vsi_nn_select_param select; + vsi_nn_lstmunit_activation_param lstmunit_activation; + vsi_nn_lstmunit_ovxlib_param lstmunit_ovxlib; + vsi_nn_tensor_add_mean_stddev_norm_param tensor_add_mean_stddev_norm; + vsi_nn_lstm_ovxlib_param lstm_ovxlib; + vsi_nn_lsh_projection_param lsh_projection; + vsi_nn_rnn_param rnn; + vsi_nn_stack_param stack; + vsi_nn_floor_param floor; + vsi_nn_neg_param neg; + vsi_nn_exp_param exp; + vsi_nn_clip_param clip; + vsi_nn_pre_process_tensor_param pre_process_tensor; + vsi_nn_post_process_param post_process; + vsi_nn_pre_process_gray_param pre_process_gray; + vsi_nn_unstack_param unstack; + vsi_nn_pre_process_rgb_param pre_process_rgb; + vsi_nn_pre_process_param pre_process; + vsi_nn_addn_param addn; + vsi_nn_softmax_internal_param softmax_internal; + vsi_nn_pre_process_yuv420_param pre_process_yuv420; + vsi_nn_pre_process_yuv444_param pre_process_yuv444; + vsi_nn_pre_process_nv12_param pre_process_nv12; + vsi_nn_extra_ending_param extra_ending; + vsi_nn_gather_param gather; + vsi_nn_scatter_nd_param scatter_nd; + vsi_nn_tile_param tile; + vsi_nn_grouped_conv2d_param grouped_conv2d; + vsi_nn_topk_param topk; + vsi_nn_pre_process_bgra_param pre_process_bgra; + vsi_nn_logical_not_param logical_not; + vsi_nn_argmax_param argmax; + vsi_nn_sin_param sin; + vsi_nn_log_param log; + vsi_nn_argmin_param argmin; + vsi_nn_roi_align_param roi_align; + vsi_nn_heatmap_max_keypoint_param heatmap_max_keypoint; + vsi_nn_axis_aligned_bbox_transform_param axis_aligned_bbox_transform; + vsi_nn_box_with_nms_limit_param box_with_nms_limit; + vsi_nn_generate_proposals_param generate_proposals; + vsi_nn_detection_postprocess_param detection_postprocess; + vsi_nn_random_multinomial_param random_multinomial; + vsi_nn_log_softmax_param log_softmax; + vsi_nn_relu_keras_param relu_keras; + vsi_nn_relu_keras_internal_param relu_keras_internal; + vsi_nn_reducesum_internal_param reducesum_internal; + vsi_nn_reducemax_internal_param reducemax_internal; + vsi_nn_reducemin_internal_param reducemin_internal; + vsi_nn_gru_ovxlib_param gru_ovxlib; + vsi_nn_grucell_ovxlib_param grucell_ovxlib; + vsi_nn_embedding_lookup_param embedding_lookup; + vsi_nn_reduceprod_internal_param reduceprod_internal; + vsi_nn_reduceall_internal_param reduceall_internal; + vsi_nn_reduceany_internal_param reduceany_internal; + vsi_nn_unidirectional_sequence_rnn_param unidirectional_sequence_rnn; + vsi_nn_quantized_16bit_lstm_param quantized_16bit_lstm; + vsi_nn_bidirectional_sequence_rnn_param bidirectional_sequence_rnn; + vsi_nn_bidirectional_sequence_lstm_param bidirectional_sequence_lstm; + vsi_nn_resize_internal_param resize_internal; + vsi_nn_resize_nearest_internal_param 
resize_nearest_internal; + vsi_nn_variable_param variable; + vsi_nn_rnncell_ovxlib_param rnncell_ovxlib; + vsi_nn_l2_normalize_param l2_normalize; + vsi_nn_depthwise_conv1d_param depthwise_conv1d; + vsi_nn_cast_param cast; + vsi_nn_swish_param swish; + vsi_nn_dataconvert_param dataconvert; + vsi_nn_grucell_activation_internal_param grucell_activation_internal; + vsi_nn_grucell_activation_internal_sma_param grucell_activation_internal_sma; + vsi_nn_linear_param linear; + vsi_nn_batchnorm_single_param batchnorm_single; + vsi_nn_moments_param moments; + vsi_nn_squeeze_param squeeze; + vsi_nn_expand_broadcast_param expand_broadcast; + vsi_nn_deconvolution1d_param deconvolution1d; + uint8_t client_param[128]; + + /* custom node data struct define */ +#define DEF_NODE_TYPE( NAME ) vsi_nn_##NAME##_param NAME; + #include "custom/custom_node_type.def" +#undef DEF_NODE_TYPE +} vsi_nn_nn_param_t; + +/** + * Number 576 is the size of `vsi_nn_nn_param_t` from V1.1.2 + * We this check to avoid application binary interface(ABI) compatibility issue. + */ +_compiler_assert( sizeof(vsi_nn_nn_param_t) <= 576, vsi_nn_node_type_h_potential_abi_compatibility_issue ); + +/** Node params for openvx attributes */ +typedef struct _vsi_nn_vx_param +{ + vsi_enum overflow_policy; + vsi_enum rounding_policy; + vsi_enum down_scale_size_rounding; + vsi_bool has_relu; + uint32_t accumulator_bits; + vsi_nn_platform_e platform; +} vsi_nn_vx_param_t; + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_ops.h b/src/tim/vx/internal/include/vsi_nn_ops.h new file mode 100644 index 0000000..23a750d --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_ops.h @@ -0,0 +1,353 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +/** @file */ +#ifndef _VSI_NN_OPS_H +#define _VSI_NN_OPS_H + +/*------------------------------------ + Includes + -----------------------------------*/ +#include "vsi_nn_platform.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_types.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +/*------------------------------------ + Types + -----------------------------------*/ + +/** + * Ovxlib operation type index. + * Custom ID is start from VSI_NN_OP_CUSTOM_START. + * Ovxlib internal id is start from VSI_NN_OP_INTERNAL_START. 
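 *
 * The enumeration below is generated by X-macro expansion of the *.def files;
 * for illustration, an entry such as the (hypothetical) line
 * @code
 * DEF_OP( CONV2D )
 * @endcode
 * in interface/ops.def expands to the enumerator VSI_NN_OP_CONV2D here.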
+ * @see include/interface/ops.def + * @see include/custom/custom_ops.def + * @see include/internal/internal_ops.def + */ +typedef uint32_t vsi_nn_op_t; enum +{ +#define DEF_OP( NAME, ... ) VSI_NN_OP_##NAME, + #include "interface/ops.def" +#undef DEF_OP + VSI_NN_OP_NUM, + VSI_NN_OP_NA = VSI_NN_OP_NUM, + VSI_NN_OP_CLIENT = VSI_NN_OP_NA + 1, + // add operation alias for compilation + VSI_NN_OP_ELTWISEMAX = VSI_NN_OP_MAXIMUM, + + VSI_NN_OP_CUSTOM_START = 0x10000, +#define DEF_OP( NAME, ... ) VSI_NN_OP_##NAME, + #include "custom/custom_ops.def" +#undef DEF_OP + VSI_NN_OP_CUSTOM_END, + VSI_NN_OP_CUSTOM_NUM = VSI_NN_OP_CUSTOM_END - VSI_NN_OP_CUSTOM_START - 1, + + VSI_NN_OP_INTERNAL_START = 0x10000000, +#define DEF_OP( NAME, ... ) VSI_NN_OP_##NAME, + #include "internal/internal_ops.def" +#undef DEF_OP + VSI_NN_OP_INTERNAL_END, + VSI_NN_OP_INTERNAL_NUM = VSI_NN_OP_INTERNAL_END - VSI_NN_OP_INTERNAL_START - 1 +}; + +/** Operation initialization function handler */ +typedef vsi_status ( * vsi_nn_op_init_t ) + ( vsi_nn_node_t * ); + +/** Operation computation function handler */ +typedef vsi_status ( * vsi_nn_op_compute_t ) + ( + vsi_nn_node_t *, + vsi_nn_tensor_t **, + vsi_nn_tensor_t ** + ); + +/** Operation deinitialization function handler */ +typedef vsi_status ( * vsi_nn_op_deinit_t ) + ( vsi_nn_node_t * ); + +/** Operation validation function handler */ +typedef vsi_bool ( * vsi_nn_op_check_t ) + ( + vsi_nn_node_t *, + vsi_nn_tensor_t **, + vsi_nn_tensor_t ** + ); + +/** Operation setup function handler */ +typedef vsi_bool ( * vsi_nn_op_setup_t ) + ( + vsi_nn_node_t *, + vsi_nn_tensor_t **, + vsi_nn_tensor_t ** + ); + +/** Operation optimization function handler */ +typedef vsi_status ( * vsi_nn_op_optimize_t ) + ( + vsi_nn_node_t *, + vsi_nn_tensor_t **, + vsi_nn_tensor_t **, + vsi_nn_opt_direction_e + ); + +/** Operation runtime interface. */ +typedef struct _vsi_nn_op_proc +{ + vsi_nn_op_init_t init; + vsi_nn_op_compute_t compute; + vsi_nn_op_deinit_t deinit; + vsi_nn_op_check_t check; + vsi_nn_op_setup_t setup; + vsi_nn_op_optimize_t optimize; + uint32_t input_num; + uint32_t output_num; +} vsi_nn_op_proc_t; + +/*------------------------------------ + Functions + -----------------------------------*/ + +OVXLIB_API vsi_status vsi_nn_op_common_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ); + +OVXLIB_API vsi_status vsi_nn_op_common_deinit + ( + vsi_nn_node_t * self + ); + +OVXLIB_API vsi_bool vsi_nn_op_common_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ); + +/** + * Setup for eltwise binary op + * Setup for eltwise binary op, which need broadcast shape. + * + * @param[in] node Node handle. + * @param[in] inputs input tensors. + * @param[in] outputs output tensors. + * + * @return VSI_SUCCESS on success, or error code otherwise. + */ +OVXLIB_API vsi_bool vsi_nn_op_eltwise_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ); + +OVXLIB_API vsi_status vsi_nn_op_common_optimize + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ); + +vsi_bool vsi_nn_OpIsValid + ( + vsi_nn_op_t op + ); + +const vsi_nn_op_proc_t * vsi_nn_OpGetProc + ( + vsi_nn_op_t op + ); + +/** + * Init operation + * Call operation init process. + * + * @param[in] op Operation id. + * @param[in] node Node handle. + * + * @return VSI_SUCCESS on success, or error code otherwise. 
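 *
 * Together with the other vsi_nn_Op* wrappers below, an operation is
 * typically driven through its callbacks in roughly this order (sketch only):
 * @code
 * vsi_nn_OpInit( node->op, node );
 * vsi_nn_OpCheck( node->op, node, inputs, outputs );    // validate
 * vsi_nn_OpSetup( node->op, node, inputs, outputs );    // infer output shapes
 * vsi_nn_OpCompute( node->op, node, inputs, outputs );  // build the vx node
 * vsi_nn_OpDeinit( node->op, node );
 * @endcode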
+ */ +vsi_status vsi_nn_OpInit + ( + vsi_nn_op_t op, + vsi_nn_node_t * node + ); + +/** + * Build operation with vx backend + * Call operation compute process, it will build the node with vx backend. + * + * @param[in] op Operation id. + * @param[in] node Node handle. + * @param[in] inputs Input tensors' handle.. + * @param[in] outputs Output tensors' handle. + * + * @return VSI_SUCCESS on success, or error code otherwise. + */ +vsi_status vsi_nn_OpCompute + ( + vsi_nn_op_t op, + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ); + +/** + * Deinit operation + * Call operation deinit process, free some resource. + * + * @param[in] op Operation id. + * @param[in] node Node handle. + * + * @return VSI_SUCCESS on success, or error code otherwise. + */ +vsi_status vsi_nn_OpDeinit + ( + vsi_nn_op_t op, + vsi_nn_node_t * node + ); + +/** + * Optimize operation + * Call operation optimize process. + * @see vsi_nn_opt_direction_e + * + * @param[in] op Operation id. + * @param[in] node Node handle. + * @param[in] inputs Input tensors' handle. + * @param[in] outputs Output tensors' handle. + * @param[in] driection Current loop direction, use this param to implement + * different optimizations. + * + * @return VSI_SUCCESS on success, or error code otherwise. + */ +vsi_status vsi_nn_OpOptimize + ( + vsi_nn_op_t op, + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ); + +/** + * Validate operation + * Call operation check process. + * + * @param[in] op Operation id. + * @param[in] node Node handle. + * @param[in] inputs Input tensors' handle. + * @param[in] outputs Output tensors' handle. + * + * @return VSI_SUCCESS on success, or error code otherwise. + */ +vsi_bool vsi_nn_OpCheck + ( + vsi_nn_op_t op, + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ); + +void vsi_nn_OpGetIoNum + ( + vsi_nn_op_t op, + vsi_nn_node_t * node, + uint32_t * input_num, + uint32_t * output_num + ); + +OVXLIB_API vsi_bool vsi_nn_OpGenerateTensor + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ); + +/** + * Setup operation + * Call operation setup process, it runs before computation, + * Ovxlib usually computes output shapes in this process. + * + * @param[in] op Operation id. + * @param[in] node Node handle. + * @param[in] inputs Input tensors' handle. + * @param[in] outputs Output tensors' handle. + * + * @return VSI_SUCCESS on success, or error code otherwise. + */ +vsi_bool vsi_nn_OpSetup + ( + vsi_nn_op_t op, + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ); + +vsi_bool vsi_nn_OpRegisterOvxInit + ( + vsi_nn_op_t op, + vsi_nn_op_compute_t compute + ); + +/** + * Get operation name + * Get operation name string by operation id. + * + * @param[in] op Operation id. + * + * @return Operation name on success, or NULL otherwise. + */ +OVXLIB_API const char * vsi_nn_OpGetName + ( + vsi_nn_op_t op + ); + +#if defined(__cplusplus) +} +#endif + +/** + * Declare an operation with process functions. 
+ */ +#define DEF_OP_REG(op,init,compute,deinit,check,setup,optimize,in,out) \ + vsi_nn_op_proc_t vsi_nn_op_##op =\ +{\ + /* init */ init,\ + /* compute */ compute,\ + /* deinit */ deinit,\ + /* check */ check,\ + /* setup */ setup,\ + /* optimize */ optimize,\ + /* input_num */ in,\ + /* output_num */ out \ +}; + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_platform.h b/src/tim/vx/internal/include/vsi_nn_platform.h new file mode 100644 index 0000000..6c00bd9 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_platform.h @@ -0,0 +1,50 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_PLATFORM_H +#define _VSI_NN_PLATFORM_H + +#include +#include +#include +#include +#include +#include +#if defined(VX_KHR_COMPATIBILITY) && (0x1==VX_KHR_COMPATIBILITY) +#include +#endif + +/* + This is a compatibility head file for backward compatibility OpenVX 1.1 spec +*/ +#include "vsi_nn_compatibility.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_post.h b/src/tim/vx/internal/include/vsi_nn_post.h new file mode 100644 index 0000000..61fe75f --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_post.h @@ -0,0 +1,30 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_POST_H +#define _VSI_NN_POST_H + +#include "post/vsi_nn_post_fasterrcnn.h" +#include "post/vsi_nn_post_cmupose.h" + +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h new file mode 100644 index 0000000..501fca3 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h @@ -0,0 +1,236 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef VSI_NN_PRE_POST_PROCESS_H +#define VSI_NN_PRE_POST_PROCESS_H + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" + +#define VSI_NN_PREPROCESS_IMMATERIAL (0) + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Preprocess type + */ +typedef enum +{ + VSI_NN_PREPROCESS_SOURCE_LAYOUT = 0, + VSI_NN_PREPROCESS_SET_SOURCE_FORMAT, + VSI_NN_PREPROCESS_IMAGE_SIZE, + VSI_NN_PREPROCESS_CROP, + VSI_NN_PREPROCESS_MEAN_AND_SCALE, + VSI_NN_PREPROCESS_PERMUTE, + VSI_NN_PREPROCESS_REVERSE_CHANNEL, + VSI_NN_PREPROCESS_IMAGE_RESIZE_BILINEAR, + VSI_NN_PREPROCESS_IMAGE_RESIZE_NEAREST, + VSI_NN_PREPROCESS_DTYPE_CONVERT, +} vsi_nn_preprocess_type_e; + +/** + * Postprocess type + */ +typedef enum +{ + VSI_NN_POSTPROCESS_PERMUTE = 0, + VSI_NN_POSTPROCESS_DTYPE_CONVERT, +} vsi_nn_postprocess_type_e; + +typedef enum +{ + VSI_NN_SOURCE_LAYOUT_NHWC = 0, + VSI_NN_SOURCE_LAYOUT_NCHW, +} vsi_nn_preprocess_source_layout_e; + +/** + * Input source format + */ +typedef enum +{ + VSI_NN_SOURCE_FORMAT_TENSOR = 0, + VSI_NN_SOURCE_FORMAT_IMAGE_GRAY, + VSI_NN_SOURCE_FORMAT_IMAGE_RGB, + VSI_NN_SOURCE_FORMAT_IMAGE_YUV420, + VSI_NN_SOURCE_FORMAT_IMAGE_BGRA, + VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR, + VSI_NN_SOURCE_FORMAT_IMAGE_YUV444, + VSI_NN_SOURCE_FORMAT_IMAGE_NV12, +} vsi_nn_preprocess_source_format_e; + +/** + * Preprocess base structure + */ +typedef struct +{ + /** Preprocess type*/ + vsi_nn_preprocess_type_e type; + /** Preprocess paramters */ + void* param; +} vsi_nn_preprocess_base_t; + +/** + * Postprocess base structure + */ +typedef struct +{ + /** Postprocess type*/ + vsi_nn_postprocess_type_e type; + /** Postrocess paramters */ + void* param; +} vsi_nn_postprocess_base_t; + +/** + * Process dtype convert parameter structure + */ +typedef struct { + vsi_nn_dtype_t dtype; + /** Reserve some more btyes for future features. 
*/ + char reserved[40]; +} vsi_nn_process_dtype_convert_t; + +typedef vsi_nn_process_dtype_convert_t vsi_nn_preprocess_dtype_convert_t; +typedef vsi_nn_process_dtype_convert_t vsi_nn_postprocess_dtype_convert_t; + +/** + * Process crop parameter structure + */ +typedef struct +{ + /** Crop begin for each dim */ + int32_t* begin; + /** Crop size for each dim */ + int32_t* size; + /** Image dim */ + int32_t dim; +}vsi_nn_preprocess_crop_t; + +/** + * Process mean and scale parameter structure + */ +typedef struct +{ + /** Mean value for each channel */ + float* channel_mean; + /*Channel length */ + int32_t channel_len; + /** Scale value */ + float scale; +}vsi_nn_process_mean_and_scale_t; + +typedef vsi_nn_process_mean_and_scale_t vsi_nn_preprocess_mean_and_scale_t; +typedef vsi_nn_process_mean_and_scale_t vsi_nn_postprocess_mean_and_scale_t; + +/** + * Process permute parameter structure + */ +typedef struct +{ + /** Permute value for each channel */ + int32_t* perm; + /** Permute dim */ + int32_t dim; +}vsi_nn_process_permute_t; + +typedef vsi_nn_process_permute_t vsi_nn_preprocess_permute_t; +typedef vsi_nn_process_permute_t vsi_nn_postprocess_permute_t; + +/** + * Preprocess image resize parameter structure + */ +typedef struct { + /** Width */ + uint32_t w; + /** Height */ + uint32_t h; + /** Channel */ + uint32_t c; +} vsi_nn_preprocess_image_resize_t; + +typedef vsi_nn_preprocess_image_resize_t vsi_nn_preprocess_image_size_t; + +vsi_status vsi_nn_add_single_preproc_node + ( + vsi_nn_graph_t* graph, + uint32_t input_idx, + vsi_nn_node_t** first_node, + uint32_t nodes_count, + vsi_nn_preprocess_base_t* preprocess, + uint32_t proc_count + ); + +vsi_status vsi_nn_add_single_postproc_node + ( + vsi_nn_graph_t* graph, + uint32_t output_idx, + vsi_nn_node_t* last_node, + vsi_nn_postprocess_base_t* postprocess, + uint32_t proc_count + ); + +/** + * Add preprocess node in for specified input + * + * @param[in] graph Graph to be added node in. + * @param[in] input_idx Input tensor port. + * @param[in] preprocess Preprocess task handle. + * @param[in] count Preprocess task count. + * + * @return VSI_SUCCESS on success, or appropriate error code otherwise. + * + */ +OVXLIB_API vsi_status vsi_nn_AddGraphPreProcess + ( + vsi_nn_graph_t* graph, + uint32_t input_idx, + vsi_nn_preprocess_base_t* preprocess, + uint32_t count + ); + +/** + * Add postprocess node in for specified output + * + * @param[in] graph Graph to be added node in. + * @param[in] input_idx Input tensor port. + * @param[in] postprocess Postprocess task handle. + * @param[in] count Postprocess task count. + * + * @return VSI_SUCCESS on success, or appropriate error code otherwise. 
+ * + */ +OVXLIB_API vsi_status vsi_nn_AddGraphPostProcess + ( + vsi_nn_graph_t* graph, + uint32_t output_idx, + vsi_nn_postprocess_base_t* postprocess, + uint32_t count + ); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_prv.h b/src/tim/vx/internal/include/vsi_nn_prv.h new file mode 100644 index 0000000..9535632 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_prv.h @@ -0,0 +1,65 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_PRV_H_ +#define _VSI_NN_PRV_H_ + +#if defined(__cplusplus) +extern "C"{ +#endif + +#define VSI_NN_MAX_PATH 256 + +#ifdef __linux__ +#define VSI_NN_STD_CALL +#else +#define VSI_NN_STD_CALL __stdcall +#endif + +typedef enum _vsi_nn_broad_cast_bits_e +{ + VSI_NN_BROAD_CAST_BITS_0 = 0x01, + VSI_NN_BROAD_CAST_BITS_1 = 0x02, + VSI_NN_BROAD_CAST_BITS_2 = 0x04, + VSI_NN_BROAD_CAST_BITS_4 = 0x08, +} vsi_nn_broad_cast_bits_e; + +#define REQUIRED_IO( _IOPORT ) ( (_IOPORT) != NULL ? (_IOPORT)->t : \ + ( VSILOGE("Required IO port: %s", #_IOPORT), (_IOPORT)->t ) ) +#define OPTIONAL_IO( _IOPORT ) ( (_IOPORT) != NULL ? (_IOPORT)->t : NULL) + +#ifndef __BEGIN_DECLS + #if defined(__cplusplus) + #define __BEGIN_DECLS extern "C" { + #define __END_DECLS } + #else + #define __BEGIN_DECLS + #define __END_DECLS + #endif +#endif + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_pub.h b/src/tim/vx/internal/include/vsi_nn_pub.h new file mode 100644 index 0000000..5e9194e --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_pub.h @@ -0,0 +1,57 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
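
/* A minimal sketch of attaching a mean/scale preprocess task to graph
 * input 0 with the structures declared in vsi_nn_pre_post_process.h above.
 * The graph handle is assumed to be created elsewhere by the application,
 * and the mean/scale values are illustrative only; error handling is
 * omitted. */
#include "vsi_nn_pub.h"

static vsi_status add_mean_scale_preproc(vsi_nn_graph_t* graph)
{
    /* Per-channel mean plus a single scale, as vsi_nn_process_mean_and_scale_t expects. */
    static float mean[3] = {127.5f, 127.5f, 127.5f};
    vsi_nn_preprocess_mean_and_scale_t mean_scale;
    vsi_nn_preprocess_base_t tasks[1];

    mean_scale.channel_mean = mean;
    mean_scale.channel_len  = 3;
    mean_scale.scale        = 1.0f / 127.5f;

    tasks[0].type  = VSI_NN_PREPROCESS_MEAN_AND_SCALE;
    tasks[0].param = &mean_scale;

    /* Attach the one-entry task list to input tensor port 0. */
    return vsi_nn_AddGraphPreProcess(graph, 0, tasks, 1);
}
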
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_PUB_H +#define _VSI_NN_PUB_H + +#if !defined(OVXLIB_API) + #if defined(_WIN32) + #define OVXLIB_API __declspec(dllimport) + #else + #define OVXLIB_API __attribute__((visibility("default"))) + #endif +#endif + +#include "vsi_nn_log.h" +#include "vsi_nn_context.h" +#include "vsi_nn_client_op.h" +#include "vsi_nn_node.h" +#include "vsi_nn_node_attr_template.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_types.h" +#include "vsi_nn_version.h" +#include "vsi_nn_assert.h" +#include "vsi_nn_rnn.h" +#include "vsi_nn_test.h" +#include "vsi_nn_pre_post_process.h" +#include "utils/vsi_nn_code_generator.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_dtype_util.h" +#include "quantization/vsi_nn_asymmetric_affine.h" +#include "quantization/vsi_nn_dynamic_fixed_point.h" +#endif + diff --git a/src/tim/vx/internal/include/vsi_nn_rnn.h b/src/tim/vx/internal/include/vsi_nn_rnn.h new file mode 100644 index 0000000..519d783 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_rnn.h @@ -0,0 +1,111 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_RNN_H_ +#define _VSI_NN_RNN_H_ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_internal_node.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +/********************************************************** +* MACROS +**********************************************************/ +#define VSI_NN_MAX_RNN_CONNECTION_INPUTS 16 + +/********************************************************** +* TYPES +**********************************************************/ +typedef struct +{ + vsi_nn_tensor_id_t output; + vsi_nn_tensor_id_t inputs[VSI_NN_MAX_RNN_CONNECTION_INPUTS]; +} vsi_nn_rnn_external_connection_t; + +/*------------------------------------------- +Procedure to prepare input data, return FALSE +to end loop +-------------------------------------------*/ +typedef vsi_bool (*vsi_nn_rnn_prepare_input_func_t) + ( + vsi_nn_graph_t* graph, + uint32_t iteration, + void* user_data + ); + +/*------------------------------------------- +Procedure to process output data, return FALSE +to end loop +-------------------------------------------*/ +typedef vsi_bool (*vsi_rnn_rnn_process_output_func_t) + ( + vsi_nn_graph_t* graph, + uint32_t iteration, + void* user_data + ); + +/********************************************************** +* PUBLIC FUNCTIONS +**********************************************************/ +vsi_status vsi_nn_rnn_feed_internal_state + ( + const vsi_nn_graph_t* graph + ); + +vsi_status vsi_nn_rnn_save_internal_state + ( + const vsi_nn_graph_t* graph + ); + +vsi_status vsi_nn_rnn_DeinitWksp + ( + vsi_nn_graph_t* graph + ); + +vsi_status vsi_nn_rnn_InitWksp + ( + vsi_nn_graph_t* graph, + const vsi_nn_rnn_external_connection_t* connections, + uint32_t connections_count, + void* user_data + ); + +OVXLIB_API vsi_status vsi_nn_rnn_ResetBuffers + ( + vsi_nn_graph_t* graph + ); + +OVXLIB_API vsi_status vsi_nn_rnn_RunGraph + ( + vsi_nn_graph_t* graph + ); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_rnn_helper.h b/src/tim/vx/internal/include/vsi_nn_rnn_helper.h new file mode 100644 index 0000000..e9191ca --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_rnn_helper.h @@ -0,0 +1,254 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
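
/* A sketch of one plausible call order for the RNN workspace API declared
 * in vsi_nn_rnn.h above: set up external connections once, reset the
 * internal state buffers, run the graph, then tear the workspace down.
 * The connection array, its count, and the exact sequencing an
 * application needs are assumptions for illustration. */
#include "vsi_nn_pub.h"

static vsi_status run_rnn_once(vsi_nn_graph_t* graph,
                               const vsi_nn_rnn_external_connection_t* conns,
                               uint32_t conn_count)
{
    vsi_status status = vsi_nn_rnn_InitWksp(graph, conns, conn_count, NULL);
    if (status != VSI_SUCCESS) {
        return status;
    }
    /* Reset internal state buffers before the first sequence. */
    status = vsi_nn_rnn_ResetBuffers(graph);
    if (status == VSI_SUCCESS) {
        /* One step; applications typically loop this per input in the sequence. */
        status = vsi_nn_rnn_RunGraph(graph);
    }
    vsi_nn_rnn_DeinitWksp(graph);
    return status;
}
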
+* +*****************************************************************************/ +#ifndef _VSI_NN_RNN_HELPER_H +#define _VSI_NN_RNN_HELPER_H + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_util.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +/** + * find the best kernel size on HW + * + * @param[in] is multi batch. + * @param[in] input size. + * @param[out] the height of the best kernel size. + * @param[out] the width of the best kernel size. + */ +vsi_bool vsi_nn_rnn_find_best_kernel_size + ( + vsi_bool multi_batch, + uint32_t input_size, + uint32_t* p_kernel_h, + uint32_t* p_kernel_w + ); + +vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_bool multi_batch, + uint32_t kernel_h, + uint32_t kernel_w, + int32_t use_virtual_tensor + ); + +vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_bool multi_batch, + uint32_t kernel_h, + uint32_t kernel_w, + int32_t use_virtual_tensor + ); + +vsi_bool vsi_nn_rnn_process_output_for_nn_fc2 + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output, + vsi_bool multi_batch, + uint32_t kernel_h, + uint32_t kernel_w, + int32_t use_virtual_tensor + ); + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tp_fc + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias, + const vsi_nn_dtype_t* output_dtype, + vsi_bool use_virtual_tensor + ); + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias, + uint32_t kernel_h, + uint32_t kernel_w, + const vsi_nn_dtype_t* output_dtype, + vsi_bool use_virtual_tensor + ); + +vsi_nn_internal_tensor_t* vsi_nn_rnn_prepare_weight_for_nn_fc + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * weight, + uint32_t kernel_h, + uint32_t kernel_w + ); + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc_relu + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias, + uint32_t kernel_h, + uint32_t kernel_w, + vsi_bool has_relu, + const vsi_nn_dtype_t* output_dtype, + vsi_bool use_virtual_tensor + ); + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_add + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input1, + vsi_nn_tensor_t * input2, + const vsi_nn_dtype_t* output_dtype, + vsi_bool use_virtual_tensor + ); + +vsi_nn_op_t vsi_nn_rnn_get_act_op_type + ( + vsi_nn_activation_e type + ); + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_activation + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_activation_e act_type, + const vsi_nn_dtype_t* output_dtype, + vsi_bool use_virtual_tensor + ); + +vsi_nn_internal_tensor_t* vsi_nn_rnn_transpose_time_major + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output, + vsi_bool use_virtual_tensor + ); + +void vsi_nn_rnn_split_input_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t ** output, + uint32_t time_step, + vsi_bool use_virtual_tensor + ); + +void vsi_nn_rnn_data_check_aligned + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** input, + uint32_t time_step, + vsi_bool use_virtual_tensor + ); + +vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_split_output + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + uint32_t batch_size, + vsi_bool use_virtual_tensor + ); + +vsi_nn_internal_tensor_t* 
vsi_nn_rnn_reshape_cell_output + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + uint32_t batch_size, + vsi_bool use_virtual_tensor + ); + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_binary_operator + ( + vsi_nn_node_t* self, + vsi_nn_op_t op, + vsi_nn_tensor_t* operand1, + vsi_nn_tensor_t* operand2, + const vsi_nn_dtype_t* output_dtype, + vsi_bool use_virtual_tensor + ); + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_concat_impl + ( + vsi_nn_node_t* self, + uint32_t axis, + vsi_bool use_virtual_tensor, + vsi_nn_tensor_t* tensor, + ... + ); +#define vsi_nn_rnn_create_concat(_node, _axis, _virtual, _tensor, ...) \ + vsi_nn_rnn_create_concat_impl(_node, _axis, _virtual, _tensor, __VA_ARGS__, END_OF_VARIADIC_ARGUMENTS) + +vsi_nn_internal_tensor_t** vsi_nn_create_split + ( + vsi_nn_node_t* self, + vsi_nn_tensor_t* tensor, + uint32_t axis, + uint32_t slices_num, + uint32_t* slices, + vsi_bool use_virtual_tensor + ); + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_reshape + ( + vsi_nn_node_t* self, + vsi_nn_tensor_t* input_tensor, + vsi_nn_tensor_t* output_tensor, + uint32_t* size, + uint32_t dim_num, + vsi_bool use_virtual_tensor + ); + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_permute + ( + vsi_nn_node_t* self, + vsi_nn_tensor_t* input_tensor, + vsi_nn_tensor_t* output_tensor, + uint32_t* perm, + uint32_t dim_num, + vsi_bool use_virtual_tensor + ); + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_copy + ( + vsi_nn_node_t* self, + vsi_nn_tensor_t* input_tensor, + vsi_nn_tensor_t* output_tensor, + vsi_nn_dtype_t* dtype, + vsi_bool use_virtual_tensor + ); + +#if defined(__cplusplus) +} +#endif + +#endif /* _VSI_NN_RNN_HELPER_H */ \ No newline at end of file diff --git a/src/tim/vx/internal/include/vsi_nn_rnn_prv.h b/src/tim/vx/internal/include/vsi_nn_rnn_prv.h new file mode 100644 index 0000000..11b1c47 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_rnn_prv.h @@ -0,0 +1,62 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_RNN_PRV_H_ +#define _VSI_NN_RNN_PRV_H_ +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_link_list.h" +#include "vsi_nn_rnn.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +typedef struct +{ + vsi_nn_tensor_attr_t attr; + uint8_t* data; + size_t data_size; /* in bytes */ +} vsi_nn_rnn_internal_buffer_t; + +typedef struct +{ + vsi_nn_link_list_t link_list; + vsi_nn_rnn_external_connection_t connection; + vsi_nn_rnn_internal_buffer_t buffer; + uint32_t connection_inputs_count; + vsi_bool tensor_swappable; +} vsi_nn_rnn_connection_t; + +typedef struct +{ + vsi_nn_rnn_connection_t* external_connection_list; + void* user_data; + vsi_bool is_first_run; +} vsi_nn_rnn_wksp_t; + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_tensor.h b/src/tim/vx/internal/include/vsi_nn_tensor.h new file mode 100644 index 0000000..4dcde2c --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_tensor.h @@ -0,0 +1,205 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +/** @file */ +#ifndef _VSI_NN_TENSOR_H +#define _VSI_NN_TENSOR_H + +#include "vsi_nn_platform.h" +#include "vsi_nn_types.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +/** + * Maximum dimension number + * @todo We will make this dynamic in the future. + */ +#define VSI_NN_MAX_DIM_NUM (8) + +/** Invalid tensor id */ +#define VSI_NN_TENSOR_ID_NA ((uint32_t)-1) + +/** A special value to tell some APIs the id can be automatic generated. + * @see vsi_nn_AddTensor + * @see vsi_nn_AddTensorFromHandle + * @see vsi_nn_AttachTensorToGraph + */ +#define VSI_NN_TENSOR_ID_AUTO (VSI_NN_TENSOR_ID_NA - 1) + +/** + * A special value to tell node to compute the output shape itself. + */ +#define VSI_NN_DIM_AUTO (0) + +/** + * Dimension format + * @todo We haven't use it yet. + */ +typedef enum +{ + VSI_NN_DIM_FMT_NCHW = 0x00, + VSI_NN_DIM_FMT_NHWC = 0x01, + VSI_NN_DIM_FMT_NA = 0xFF, + VSI_NN_DIM_FMT_AUTO = VSI_NN_DIM_FMT_NA - 1, +} vsi_nn_dim_fmt_e; + +/** + * Quantization type. 
+ */ +typedef enum +{ + /** none quantized */ + VSI_NN_QNT_TYPE_NONE = 0, + /** dynamic fixed point */ + VSI_NN_QNT_TYPE_DFP = VX_QUANT_DYNAMIC_FIXED_POINT, + /** affine asymmetric */ + VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC = VX_QUANT_AFFINE_SCALE, + /** affine perchannel symmetric */ + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC = 0x3,/*VX_QUANT_AFFINE_SCALE_PER_CHANNEL*/ + /** affine symmetric */ + VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC = VX_QUANT_AFFINE_SCALE, + /** undefined type */ + VSI_NN_QNT_TYPE_NA = 0xff, +} vsi_nn_qnt_type_e; + +/** + * Data type structure + */ +typedef struct vsi_nn_dtype +{ + /** @see vsi_nn_dim_fmt_e */ + vsi_nn_dim_fmt_e fmt; + /** Data type */ + vsi_nn_type_e vx_type; + struct + { + /** @see vsi_nn_qnt_type_e */ + vsi_nn_qnt_type_e qnt_type; + union + { + /** Meanful in dynamic fixed point */ + struct + { + int8_t fl; + }; + /** Meanful in affine asymmetric */ + struct + { + int32_t zero_point; + float scale; + }; +#ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT + /** Meanful in AFFINE_PERCHANNEL_SYMMETRIC */ + struct + { + const float * scales; + int32_t scale_dim; + int32_t channel_dim; + const int32_t * zero_points; + int32_t zero_points_dim; + }; +#endif + }; + }; +} vsi_nn_dtype_t; + +/** + * Tensor Attribute + * @see vsi_nn_AddTensor + */ +typedef struct vsi_nn_tensor_attr +{ + /** Tensor shape */ + uint32_t size[VSI_NN_MAX_DIM_NUM]; + /** Dimension number */ + uint32_t dim_num; + /** If it's virtual tensor*/ + vsi_bool vtl; + /** If it's const tensor */ + vsi_bool is_const; + /** Data type + * @see vsi_nn_dtype_t + */ + vsi_nn_dtype_t dtype; + vsi_bool is_created_from_handle; + vsi_bool is_handle_malloc_by_ovxlib; +#ifdef VX_CREATE_TENSOR_SUPPORT_PHYSICAL + vsi_memory_type_e vsi_memory_type; +#endif +} vsi_nn_tensor_attr_t; + + +/** + * Tensor structure + */ +struct _vsi_nn_tensor +{ + /** Tensor attributes */ + vsi_nn_tensor_attr_t attr; + /** OVX tensor */ + vx_tensor t; + /** Optimized weight bias tensor */ + vx_weights_biases_parameter wb; + /** Mark tensor swapped by vxSwapTensor */ + int8_t is_swapped; +}; + +/** +* Handle Manager +* The starting memory address of vx_handle MUST be aligned with `align_start_size` bytes. +* And the memory size of vx_handle MUST be multiple of `align_block_size` bytes. 
+*/ +typedef struct vsi_nn_handle_manager +{ + uint32_t align_start_size; + uint32_t align_block_size; +} vsi_nn_handle_manager_t; + +typedef struct _vsi_nn_tensor_rel_table +{ + vsi_nn_node_id_t node; + uint32_t index; +} vsi_nn_tensor_rel_table_t; + +typedef struct _vsi_nn_tensor_rel +{ + struct + { + vsi_nn_tensor_rel_table_t *table; + uint32_t num; + } input; + struct + { + vsi_nn_tensor_rel_table_t *table; + uint32_t num; + } output; +} vsi_nn_tensor_rel_t; + +#if defined(__cplusplus) +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/vsi_nn_tensor_util.h b/src/tim/vx/internal/include/vsi_nn_tensor_util.h new file mode 100644 index 0000000..895307e --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_tensor_util.h @@ -0,0 +1,728 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +/** @file */ +#ifndef _VSI_NN_TENSOR_UTIL_H +#define _VSI_NN_TENSOR_UTIL_H + +/*------------------------------------------- + Includes +-------------------------------------------*/ +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_types.h" +#include "utils/vsi_nn_util.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*------------------------------------------- + Types +-------------------------------------------*/ + +/** Openvx tensor attribute IDs */ +typedef enum +{ + VSI_NN_TENSOR_ATTR_DIM_NUM = 0x1, + VSI_NN_TENSOR_ATTR_DTYPE = 0x2, + VSI_NN_TENSOR_ATTR_SIZE = 0x4, + VSI_NN_TENSOR_ATTR_FIXED_POINT_POS = 0x8, + VSI_NN_TENSOR_ATTR_CONST = 0x10, + VSI_NN_TENSOR_ATTR_HIGH_PRECISION = 0x20, + VSI_NN_TENSOR_ATTR_ALL = 0xFF +} vsi_nn_vxtensor_attr_t; + +/*------------------------------------------- + Macros and Variables +-------------------------------------------*/ + +/** Check attribute bit, + * @see vsi_nn_vxtensor_attr_t + */ +#define vsi_nn_hasattr( mask, attr ) (( mask & attr ) != 0) + +/*------------------------------------------- + Functions +-------------------------------------------*/ + +/** + * Create a new tensor + * Create a new tensor with given attributes. + * + * @param[in] graph Graph handle + * @param[in] attr Tensor attributes + * + * @return Tensor handle on success, or NULL otherwise. 
+ */ +OVXLIB_API vsi_nn_tensor_t * vsi_nn_CreateTensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_attr_t * attr + ); + +/** + * Reinit openvx tensor handle + * Free an exist openvx tensor handle and create a new for current tensor. + * + * @param[in] graph Graph handle + * @param[in] tensor Tensor handle to reinit + * + * @return TRUE if on success, or FALSE otherwise. + */ +vsi_bool vsi_nn_TensorReinit + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor + ); + +/** + * Release tensor + * Relase current tensor and set the handle to NULL. + * + * @param[in] tensor Tensor to release + */ +OVXLIB_API void vsi_nn_ReleaseTensor + ( + vsi_nn_tensor_t ** tensor + ); + +/** + * Set tensor's vx attribute + * The value should be type of vsi_nn_vxtensor_attr_t. + * + * @param[in] tensor Tensor handle + * @param[in] attrs New attributes to update + * @see vsi_nn_vxtensor_attr_t + * + * @return VSI_SUCCESS on success, or error core otherwise. + */ +OVXLIB_API vsi_status vsi_nn_SetTensorAttr + ( + vsi_nn_tensor_t * tensor, + const vsi_nn_vxtensor_attr_t attrs + ); + +/** + * Query tensor attribute + * Query vxtensor attribute and update current tensor attributes. + * + * @param[in] tensor Tensor handle to query and update + * @param[in] attrs VxAttributes to query and update + * @see vsi_nn_vxtensor_attr_t + * + * @return VSI_SUCCESS on success, or error core otherwise. + */ +OVXLIB_API vsi_status vsi_nn_QueryTensorAttr + ( + vsi_nn_tensor_t * tensor, + const vsi_nn_vxtensor_attr_t attrs + ); + +/** + * Convert tensor to data + * Read tensor memory to a user space buffer and return it. + * @note User should free the malloc buffer with vsi_nn_Free. + * @see vsi_nn_Free + * + * @param[in] graph Graph handle + * @param[in] tensor Tensor handle + * + * @return Data buffer address. + */ +OVXLIB_API uint8_t * vsi_nn_ConvertTensorToData + ( + const vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor + ); + +/** + * Convert tensor to data + * Read tensor memory to a user space buffer + * and return it by float32 format. + * @note User should free the malloc buffer with vsi_nn_Free. + * @see vsi_nn_Free + * + * @param[in] graph Graph handle + * @param[in] tensor Tensor handle + * + * @return Data buffer address. + */ +OVXLIB_API float * vsi_nn_ConvertTensorToFloat32Data + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_t *tensor + ); + +/* + * @deprecated + * @see vsi_nn_ConvertRawTensorToData2 + */ +OVXLIB_API uint8_t * vsi_nn_ConvertRawTensorToData + ( + vx_context context, + vx_tensor tensor, + uint32_t * dim, + vx_enum * data_format, + uint32_t * size, + uint32_t * stride_size, + vx_tensor_addressing * addr, + vx_enum accessor + ); + +/** + * Convert vxTensor to data + * Read vxTensor memory to user space buffer + * @note User should free the malloc buffer with vsi_nn_Free. + * @see vsi_nn_Free + * @todo Remove context, it can be returned by vx APIs. + * + * @param[in] context vxContext + * @param[in] tensor VxTensor + * @param[in] Ovxlib tensor attribute + * @param[out] addr vxTensor addressing + * @param[in] accessor Access mode + * + * @return Data buffer address. + */ +OVXLIB_API uint8_t * vsi_nn_ConvertRawTensorToData2 + ( + vx_context context, + vx_tensor tensor, + vsi_nn_tensor_attr_t * attr, + uint32_t * stride_size, + vx_tensor_addressing * addr, + vx_enum accessor + ); + +/** + * Save tensor to text + * Save tensor to a text file with given path. + * + * @param[in] graph Graph handle. + * @param[in] tensor Tensor handle. + * @param[in] filename Filename to save. 
+ * @param[in] seperator Characters used to seperate the data. + */ +OVXLIB_API void vsi_nn_SaveTensorToText + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + const char * filename, + char * seperator + ); + +/** + * Save tensor to text by float32 format + * Save tensor to a text file with given path, all data will + * be converted to float32. + * + * @param[in] graph Graph handle. + * @param[in] tensor Tensor handle. + * @param[in] filename Filename to save. + * @param[in] seperator Characters used to seperate the data. + * @see vsi_nn_SaveTensorToText + */ +OVXLIB_API void vsi_nn_SaveTensorToTextByFp32 + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + const char * filename, + char * seperator + ); + +/** + * Save data to text + * Save data to a text file with given path + * + * @param[in] filename Filename to save. + * @param[in] data Data buffer address. + * @param[in] data_szie Size of data buffer. + * @param[in] data_format Data type. + * @param[in] seperator Characters used to seperate the data. + */ +OVXLIB_API void vsi_nn_SaveDataToText + ( + const char * filename, + uint8_t * data, + uint32_t data_size, + vsi_nn_type_e data_format, + char * seperator + ); + +/** + * Save tensor to binary file + * Save tensor to a binary file with given path. + * + * @param[in] graph Graph handle. + * @param[in] tensor Tensor handle. + * @param[in] filename Filename to save. + */ +OVXLIB_API void vsi_nn_SaveTensorToBinary + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + const char * filename + ); + +/** + * Create tensor from data buffer + * Create a new tensor and copy data to the tensor memory. + * + * @param[in] graph Graph handle. + * @param[in] data Data buffer address. + * @param[in] attr Tensor attributes. + * + * @return Tensor handle on success, or NULL otherwise. + */ +OVXLIB_API vsi_nn_tensor_t * vsi_nn_CreateTensorFromData + ( + vsi_nn_graph_t * graph, + uint8_t * data, + vsi_nn_tensor_attr_t * attr + ); + +/** + * Copy data to tensor + * Copy data from buffer to tensor memory. + * + * @param[in] graph Graph handle. + * @param[in] tensor Tensor handle. + * @param[in] data Data buffer address. + * + * @return VSI_SUCCESS on success, or error core otherwise. + */ +OVXLIB_API vsi_status vsi_nn_CopyDataToTensor + ( + const vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + void * data + ); + +/** + * Flush Handle + * If you swap the handle of the tensor, you should flush it. + * + * @param[in] tensor Tensor handle. + * + * @return VSI_SUCCESS on success, or error core otherwise. + */ +OVXLIB_API vsi_status vsi_nn_FlushHandle + ( + const vsi_nn_tensor_t * tensor + ); + +/** + * Get Tensor Handle + * Get the handle of the tensor + * + * @param[in] tensor Tensor. + * @param[out] ptr The handle of the tensor. + * + * @return VSI_SUCCESS on success, or error core otherwise. + */ +OVXLIB_API vsi_status vsi_nn_GetTensorHandle + ( + vsi_nn_tensor_t * tensor, + void** ptr + ); + +OVXLIB_API vsi_status vsi_nn_CopyRawDataToTensor + ( + vsi_nn_graph_t* graph, + uint8_t* src_data, + const vsi_nn_dtype_t* src_dtype, + vsi_nn_tensor_t* tensor + ); + +OVXLIB_API uint32_t vsi_nn_CopyTensorToBuffer + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + void * buffer + ); + +/** + * Print node inputs and outputs + * Print a brief info of a node inputs and outputs. + * @todo move this to vsi_nn_node.h + * + * @param[in] graph Graph handle. + * @param[in] node Node handle. 
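
/* A minimal sketch of filling vsi_nn_tensor_attr_t for a quantized uint8
 * tensor, creating it with vsi_nn_CreateTensor, and loading host data with
 * vsi_nn_CopyDataToTensor, all declared above. The graph handle and the
 * host buffer are assumed to come from the application; the shape and
 * quantization parameters are illustrative values only. */
#include <string.h>
#include "vsi_nn_pub.h"

static vsi_nn_tensor_t* create_and_fill_uint8(vsi_nn_graph_t* graph, uint8_t* host_data)
{
    vsi_nn_tensor_attr_t attr;
    vsi_nn_tensor_t* tensor;

    memset(&attr, 0, sizeof(attr));
    /* Illustrative 4-D shape. */
    attr.size[0] = 3;
    attr.size[1] = 224;
    attr.size[2] = 224;
    attr.size[3] = 1;
    attr.dim_num = 4;
    attr.vtl = FALSE;       /* a real (non-virtual) tensor */
    attr.is_const = FALSE;

    /* Affine asymmetric uint8 quantization parameters. */
    attr.dtype.vx_type = VSI_NN_TYPE_UINT8;
    attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
    attr.dtype.scale = 0.007843f;
    attr.dtype.zero_point = 128;

    tensor = vsi_nn_CreateTensor(graph, &attr);
    if (tensor != NULL && vsi_nn_CopyDataToTensor(graph, tensor, host_data) != VSI_SUCCESS) {
        /* Copy failed: release the tensor again. */
        vsi_nn_ReleaseTensor(&tensor);
    }
    return tensor;
}
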
+ */ +OVXLIB_API void vsi_nn_PrintNodeIO + ( + vsi_nn_graph_t *graph, + vsi_nn_node_t *node + ); + +/** + * Print tensor + * Print a brief info of a tensor. + * + * @param[in] tensor Tensor handle. + * @param[in] id Tensor id. + */ +OVXLIB_API void vsi_nn_PrintTensor + ( + vsi_nn_tensor_t * tensor, + vsi_nn_tensor_id_t id + ); + +OVXLIB_API void vsi_nn_TransposeTensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + uint32_t * perm, + uint32_t dim_num, + uint32_t * as_shape + ); + +OVXLIB_API void vsi_nn_PermuteTensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + uint32_t * perm, + uint32_t dim_num + ); + +OVXLIB_API vsi_bool vsi_nn_CalcReshapeTensor + ( + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output, + uint32_t * shape, + uint32_t dim_num + ); + +OVXLIB_API vsi_bool vsi_nn_ReshapeTensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output, + const uint32_t * shape, + uint32_t dim_num + ); + +/** + * Get element number of a tensor + * + * @param[in] tensor Tensor handle. + * @return Element number of the tensor. + */ +OVXLIB_API uint32_t vsi_nn_GetElementNum + ( + const vsi_nn_tensor_t * tensor + ); + +/** + * Get tensor size + * The size is the bytes of the tensor memory reserved. + * + * @param[in] shape Shape handle. + * @param[in] dim_num Dimension number. + * @param[in] dtype Datatype. + * @see vsi_nn_type_e + * + * @return Size of the tensor. + */ +OVXLIB_API uint32_t vsi_nn_GetTensorSize + ( + const uint32_t * shape, + uint32_t dim_num, + vsi_nn_type_e dtype + ); + +/** + * Create a tensor by a scalar + * + * @todo Changed to vsi_nn_ScalarToTensor + * @param[in] self Node handle ???? + * @param[in] data Scalar address. + * @param[in] type Scalar data type. + * @see vsi_nn_type_e + * + */ +OVXLIB_API vsi_nn_tensor_t * vsi_nn_VariableToTensor + ( + vsi_nn_node_t * self, + uint8_t * data, + vsi_nn_type_e type + ); + +/** + * Malloc a buffer + * + * @param[in] size Size to malloc. + * + * @return Buffer address. + */ +OVXLIB_API void *vsi_nn_Malloc + ( + size_t size + ); + +/** + * Free a buffer + * This API is used to free ovxlib malloc buffers. + * + * @param[in] data Data buffer address. + */ +OVXLIB_API void vsi_nn_Free + ( + void * data + ); + +/** + * Create view vxTensor from an exist tensor + * The new tensor is created from a tensor view of current tensor. + * + * @param[in] graph Graph handle. + * @param[in] start View start region. + * @param[in] end View end region. + * @param[in] tensor Tensor handle to create the view. + * + * @return vxTensor from the view. 
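
/* A minimal sketch of reading tensor contents back to host memory with
 * vsi_nn_ConvertTensorToData and releasing the buffer with vsi_nn_Free,
 * as the comments above require. The graph and tensor handles are assumed
 * to come from the application. */
#include <stdio.h>
#include "vsi_nn_pub.h"

static void dump_first_bytes(const vsi_nn_graph_t* graph, vsi_nn_tensor_t* tensor)
{
    uint32_t i;
    uint32_t elements = vsi_nn_GetElementNum(tensor);
    uint8_t* data = vsi_nn_ConvertTensorToData(graph, tensor);

    if (data == NULL) {
        return;
    }
    /* Print a handful of raw bytes from the copied-out buffer. */
    for (i = 0; i < elements && i < 8; i++) {
        printf("%u ", (unsigned)data[i]);
    }
    printf("\n");
    /* Buffers returned by vsi_nn_ConvertTensorToData must be freed with vsi_nn_Free. */
    vsi_nn_Free(data);
}
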
+ */ +OVXLIB_API vx_tensor vsi_nn_CreateViewTensor + ( + vsi_nn_graph_t *graph, + uint32_t *start, + uint32_t *end, + vsi_nn_tensor_t *tensor + ); + +OVXLIB_API void vsi_nn_ReleaseTensorRelevance + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_rel_t *tensor_ref + ); + +OVXLIB_API vsi_nn_tensor_rel_t *vsi_nn_CreateTensorRelevance + ( + vsi_nn_graph_t *graph + ); + +OVXLIB_API vsi_nn_tensor_t * vsi_nn_CreateTensorFromHandle + ( + vsi_nn_graph_t * graph, + uint8_t * data, + vsi_nn_tensor_attr_t * attr + ); + +OVXLIB_API vsi_status vsi_nn_SwapTensorHandle + ( + vsi_nn_tensor_t * tensor0, + vsi_nn_tensor_t * tensor1 + ); + +OVXLIB_API uint32_t vsi_nn_vxGetTensorElementNum + ( + vsi_nn_tensor_attr_t *attr + ); + +OVXLIB_API vsi_status vsi_nn_vxGetTensorAttr + ( + vx_tensor tensor, + vsi_nn_tensor_attr_t *attr + ); + +OVXLIB_API uint8_t *vsi_nn_vxCopyTensorToData + ( + vx_context context, + vx_tensor tensor, + vsi_nn_tensor_attr_t *attr + ); + +OVXLIB_API vsi_status vsi_nn_vxCopyDataToTensor + ( + vx_context context, + vx_tensor tensor, + vsi_nn_tensor_attr_t *attr, + uint8_t *data + ); + +/** +* Get offset by tensor coods +* Get offset by tensor coods. +* +* @param[in] tensor's attr +* @param[in] coords +* +* @return the offset from the beginning of the tensor(offset unit: element) +*/ +OVXLIB_API uint32_t vsi_nn_GetOffsetByCoords + ( + vsi_nn_tensor_attr_t *attr, + uint32_t *coords + ); + +/** + * Create a tensor with attr and default value + * the tensor content will be initialized with default value + * + * @param[in] graph Graph handle. + * @param[in] tensor attr. + * @param[in] default value to be assigned to tensor content. + * + * @return new tensor on success, or NULL otherwise. + */ +OVXLIB_API vsi_nn_tensor_t * vsi_nn_CreateTensorWithDefault + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_attr_t * attr, + float defualt_value + ); + +/** + * Fill tensor with specified value + * + * @param[in] graph Graph handle. + * @param[in] target tensor. + * @param[in] value to be assigned to tensor content. + * + * @return VSI_SUCCESS on success, or error core otherwise. + */ +vsi_status vsi_nn_FillTensorWithValue + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + float value + ); + +void vsi_nn_print_node_io + ( + vsi_nn_graph_t *graph, + vsi_nn_node_t *node, + int32_t type + ); + +vsi_nn_tensor_t *vsi_nn_reshape_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + uint32_t * shape, + uint32_t dim_num + ); + +/** + * OVXLIB internal tensor util api + * A wrapper api for OpenVX vxCopyTensorPatch + * Allows the application to copy a view patch from/into an tensor object . + * + * @param[in] tensor OpenVX Tensor handle. + * @param[in] attr OVXLIB Tensor attr. + * @param[in] user_ptr The address of the memory location where to store the requested data. + * @param[in] start View start region. + * @param[in] end View end region. + * @param[in] stride Array of user memory strides in each dimension. + * @param[in] usage This declares the effect of the copy with regard to the tensor object + * support VX_READ_ONLY or VX_WRITE_ONLY + * @param[in] user_memory_type A, refer vx_memory_type_e + * @return VSI_SUCCESS on success, or error core otherwise. 
+ */ +vsi_status vsi_nn_copy_tensor_veiw_patch + ( + vx_tensor tensor, + vsi_nn_tensor_attr_t *attr, + void *user_ptr, + uint32_t *start, + uint32_t *end, + uint32_t *stride, + vsi_enum usage, + vsi_enum user_memory_type + ); + +/** + * OVXLIB internal tensor util api + * A wrapper api for OpenVX vxCopyTensorPatch + * Allows the application to copy whole tensor patch from/into an tensor object. + * + * @param[in] tensor OpenVX Tensor handle. + * @param[in] attr OVXLIB Tensor attr. + * @param[in] user_ptr The address of the memory location where to store the requested data. + * @param[in] usage This declares the effect of the copy with regard to the tensor object + * support VX_READ_ONLY or VX_WRITE_ONLY + * @return VSI_SUCCESS on success, or error core otherwise. + */ +vsi_status vsi_nn_copy_tensor_patch + ( + vx_tensor tensor, + vsi_nn_tensor_attr_t *attr, + void * user_ptr, + vsi_enum usage + ); + +/** + * OVXLIB internal tensor util api + * Rotate 180 degrees in width*height*channel dims for weights data + * + * @param[in] graph Graph handle. + * @param[in] weights tensor. + */ +void vsi_nn_reshuffle_weight_data + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * weights + ); + +vsi_nn_tensor_t* vsi_nn_ConcatTensor_impl + ( + vsi_nn_graph_t* graph, + uint32_t axis, + ... + ); +#define vsi_nn_ConcatTensor(_graph, _axis, ...) \ + vsi_nn_ConcatTensor_impl(_graph, _axis, __VA_ARGS__, END_OF_VARIADIC_ARGUMENTS) + +/** + * Add multiple constant tensor + * All the input and output tensors must have the same shape. + * + * @param[in] graph Graph handle. + * @param[in] tensor attr. + * @param[in] input constant tensors. + * + * @return new constant tensor on success, or NULL otherwise. + */ +vsi_nn_tensor_t* vsi_nn_ConstTensorAdd_impl + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_attr_t output_attr, + ... + ); +#define vsi_nn_ConstTensorAdd(_graph, _output_attr, ...) \ + vsi_nn_ConstTensorAdd_impl(_graph, _output_attr, __VA_ARGS__, END_OF_VARIADIC_ARGUMENTS) + +vsi_status vsi_nn_SwapHandle + ( + vsi_nn_tensor_t * tensor, + void * new_ptr, + void ** old_ptr + ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_test.h b/src/tim/vx/internal/include/vsi_nn_test.h new file mode 100644 index 0000000..8f5df6e --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_test.h @@ -0,0 +1,60 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_TEST_H +#define _VSI_NN_TEST_H + +#include "vsi_nn_log.h" +#include "utils/vsi_nn_util.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +#define TEST_CHECK_TENSOR_ID( id, lbl ) do {\ + if( VSI_NN_TENSOR_ID_NA == id ) {\ + VSILOGE("CHECK TENSOR ID %d", __LINE__);\ + goto lbl;\ + }\ + } while(0) + +#define TEST_CHECK_PTR( ptr, lbl ) do {\ + if( NULL == ptr ) {\ + VSILOGE("CHECK PTR %d", __LINE__);\ + goto lbl;\ + }\ +} while(0) + +#define TEST_CHECK_STATUS( stat, lbl ) do {\ + if( VSI_SUCCESS != stat ) {\ + VSILOGE("CHECK STATUS(%d:%s)", (stat), vsi_nn_DescribeStatus(stat));\ + goto lbl;\ + }\ +} while(0) + +#if defined(__cplusplus) +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/vsi_nn_types.h b/src/tim/vx/internal/include/vsi_nn_types.h new file mode 100644 index 0000000..cb92928 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_types.h @@ -0,0 +1,205 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +/** @file */ +#ifndef _VSI_NN_TYPES_H_ +#define _VSI_NN_TYPES_H_ + +#include +#include "vsi_nn_platform.h" +#include "vsi_nn_feature_config.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +#ifdef _WIN32 +#define inline __inline +#endif + +/** Enumuration type */ +typedef int32_t vsi_enum; +/** Status type */ +typedef int32_t vsi_status; +/** Bool type */ +typedef int32_t vsi_bool; +/** Half */ +typedef uint16_t vsi_float16; +/** Truncate float16 */ +typedef uint16_t vsi_bfloat16; + +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +/** Status enum */ +typedef enum +{ + VSI_FAILURE = VX_FAILURE, + VSI_SUCCESS = VX_SUCCESS, +}vsi_nn_status_e; + +/** Pad enum */ +typedef enum +{ + VSI_NN_PAD_AUTO, + VSI_NN_PAD_VALID, + VSI_NN_PAD_SAME +} vsi_nn_pad_e; + +/** + * @deprecated Platform enum + * @see vsi_nn_dim_fmt_e + */ +typedef enum +{ + VSI_NN_PLATFORM_CAFFE, + VSI_NN_PLATFORM_TENSORFLOW +} vsi_nn_platform_e; + +/** Round type enum */ +typedef enum +{ + VSI_NN_ROUND_CEIL, + VSI_NN_ROUND_FLOOR +} vsi_nn_round_type_e; + +/** Optimize driction */ +typedef enum +{ + VSI_NN_OPTIMIZE_FORWARD, + VSI_NN_OPTIMIZE_BACKWARD +} vsi_nn_opt_direction_e; +#ifdef VX_CREATE_TENSOR_SUPPORT_PHYSICAL +typedef enum +{ + VSI_MEMORY_TYPE_NONE = VX_MEMORY_TYPE_NONE, + VSI_MEMORY_TYPE_HOST = VX_MEMORY_TYPE_HOST, + VSI_MEMORY_TYPE_DMABUF = VX_MEMORY_TYPE_DMABUF, + VSI_MEMORY_TYPE_INERNAL = VX_MEMORY_TYPE_INTERNAL, + VSI_MEMORY_TYPE_UNCACHED = VX_MEMORY_TYPE_HOST_UNCACHED, + VSI_MEMORY_TYPE_PHYSICAL = VX_MEMORY_TYPE_HOST_PHYSICAL, +}vsi_memory_type_e; +#endif +/** Type enum */ +typedef enum +{ + VSI_NN_TYPE_NONE = VX_TYPE_INVALID, + VSI_NN_TYPE_INT8 = VX_TYPE_INT8, + VSI_NN_TYPE_INT16 = VX_TYPE_INT16, + VSI_NN_TYPE_INT32 = VX_TYPE_INT32, + VSI_NN_TYPE_INT64 = VX_TYPE_INT64, + VSI_NN_TYPE_UINT8 = VX_TYPE_UINT8, + VSI_NN_TYPE_UINT16 = VX_TYPE_UINT16, + VSI_NN_TYPE_UINT32 = VX_TYPE_UINT32, + VSI_NN_TYPE_UINT64 = VX_TYPE_UINT64, + VSI_NN_TYPE_FLOAT16 = VX_TYPE_FLOAT16, + VSI_NN_TYPE_FLOAT32 = VX_TYPE_FLOAT32, + VSI_NN_TYPE_FLOAT64 = VX_TYPE_FLOAT64, +#ifdef VSI_BOOL8_SUPPORT + VSI_NN_TYPE_BOOL8 = VX_TYPE_BOOL8, +#else + VSI_NN_TYPE_BOOL8 = 0x011, +#endif +#ifdef VSI_BFLOAT16_SUPPORT + VSI_NN_TYPE_BFLOAT16 = VX_TYPE_BFLOAT16, +#else + VSI_NN_TYPE_BFLOAT16 = 0x81A, +#endif + VSI_NN_TYPE_VDATA = VX_TYPE_USER_STRUCT_START + 0x1, +}vsi_nn_type_e; + +typedef int32_t vsi_nn_activation_e; enum +{ + VSI_NN_ACT_NONE = 0, + VSI_NN_ACT_RELU = 1, + VSI_NN_ACT_RELU1 = 2, + VSI_NN_ACT_RELU6 = 3, + VSI_NN_ACT_TANH = 4, + VSI_NN_ACT_SIGMOID = 6, + + VSI_NN_ACT_HARD_SIGMOID = 31, /* temporary use 31*/ + + //Deprecated enum, reversed only for old code + VSI_NN_LSTMUNIT_ACT_NONE = 0, + VSI_NN_LSTMUNIT_ACT_RELU = 1, + VSI_NN_LSTMUNIT_ACT_RELU6 = 3, + VSI_NN_LSTMUNIT_ACT_TANH = 4, + VSI_NN_LSTMUNIT_ACT_SIGMOID = 6, + + VSI_NN_LSTMUNIT_ACT_HARD_SIGMOID = 31, + + VSI_NN_GRU_ACT_NONE = 0, + VSI_NN_GRU_ACT_RELU = 1, + VSI_NN_GRU_ACT_RELU6 = 3, + VSI_NN_GRU_ACT_TANH = 4, + VSI_NN_GRU_ACT_SIGMOID = 6, + + VSI_NN_GRU_ACT_HARD_SIGMOID = 31 +}; + +typedef enum +{ + VSI_NN_DEPTH2SPACE_DCR = 0, + VSI_NN_DEPTH2SPACE_CRD +} vsi_nn_depth2space_mode_e; + +typedef enum +{ + VSI_NN_GRAPH_PRELOAD_VIPSRAM, + VSI_NN_GRAPH_PRELOAD_AXISRAM +} vsi_nn_graph_attr_preload_type_e; + +typedef enum _vsi_nn_node_attr_preload_type_e +{ + VSI_NN_NODE_PRELOAD_NONE, + VSI_NN_NODE_PRELOAD_VIPSRAM, + VSI_NN_NODE_PRELOAD_AXISRAM +} 
vsi_nn_node_attr_preload_type_e; + +/** Deprecated */ +typedef uint32_t vsi_nn_size_t; + +/** Tensor id type */ +typedef uint32_t vsi_nn_tensor_id_t; + +/** Node id type */ +typedef uint32_t vsi_nn_node_id_t; + +/** @see _vsi_nn_graph */ +typedef struct _vsi_nn_graph vsi_nn_graph_t; + +/** @see _vsi_nn_node */ +typedef struct _vsi_nn_node vsi_nn_node_t; + +/** @see _vsi_nn_tensor */ +typedef struct _vsi_nn_tensor vsi_nn_tensor_t; + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h new file mode 100644 index 0000000..5e68dec --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -0,0 +1,74 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +/** @file */ +#ifndef _VSI_NN_VERSION_H_ +#define _VSI_NN_VERSION_H_ + +#include "vsi_nn_types.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +#define VSI_NN_VERSION_MAJOR 1 +#define VSI_NN_VERSION_MINOR 1 +#define VSI_NN_VERSION_PATCH 28 +#define VSI_NN_VERSION \ + (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) + +/** + * Ovxlib version check + * Ovxlib will check the suitable version at compile time. + * @note Ovxlib version should be always greater or equal to case version. + */ +#define _version_assert _compiler_assert + +/** + * Get ovxlib version + * Get ovxlib version string. + */ +OVXLIB_API const char *vsi_nn_GetVersion(void); + +/** + * Get ovxlib version major + * Get ovxlib version major, return integer value. + */ +OVXLIB_API uint32_t vsi_nn_GetVersionMajor(void); + +/** + * Get ovxlib version minor + * Get ovxlib version minor, return integer value. + */ +OVXLIB_API uint32_t vsi_nn_GetVersionMinor(void); + +/** + * Get ovxlib version patch + * Get ovxlib version patch, return integer value. + */ +OVXLIB_API uint32_t vsi_nn_GetVersionPatch(void); + +#if defined(__cplusplus) +} +#endif +#endif diff --git a/src/tim/vx/internal/src/Android.mk b/src/tim/vx/internal/src/Android.mk new file mode 100644 index 0000000..235a845 --- /dev/null +++ b/src/tim/vx/internal/src/Android.mk @@ -0,0 +1,132 @@ +# +# Build Vivante chipinfo for android. 
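As a worked example of the VSI_NN_VERSION packing defined in vsi_nn_version.h above, this release (1.1.28) encodes to 1*10000 + 1*100 + 28 = 10128. A client that requires at least this OVXLIB could, hypothetically, guard itself with an ordinary preprocessor check; the snippet is illustrative only and not part of the surrounding Android.mk.

#include "vsi_nn_version.h"

#if VSI_NN_VERSION < 10128  /* i.e. older than 1.1.28 */
#error "This integration expects OVXLIB 1.1.28 or newer"
#endif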
+# +LOCAL_PATH:= $(call my-dir) +include $(CLEAR_VARS) + +ifeq ($(AQROOT),) +$(error Please set AQROOT env first) +endif + +include $(AQROOT)/Android.mk.def + +ifeq ($(PLATFORM_VENDOR),1) +LOCAL_VENDOR_MODULE := true +endif + +LOCAL_SRC_FILES := \ + vsi_nn_context.c \ + vsi_nn_client_op.c \ + vsi_nn_graph.c \ + vsi_nn_node_attr_template.c \ + vsi_nn_node.c \ + vsi_nn_ops.c \ + vsi_nn_daemon.c \ + vsi_nn_tensor.c \ + vsi_nn_version.c \ + vsi_nn_rnn.c \ + vsi_nn_rnn_helper.c \ + vsi_nn_internal_node.c \ + vsi_nn_log.c \ + vsi_nn_graph_optimization.c \ + vsi_nn_pre_post_process.c + + +LOCAL_SRC_FILES += \ + client/vsi_nn_vxkernel.c + +LOCAL_SRC_FILES += \ + utils/vsi_nn_code_generator.c \ + utils/vsi_nn_binary_tree.c \ + utils/vsi_nn_map.c \ + utils/vsi_nn_hashmap.c \ + utils/vsi_nn_link_list.c \ + utils/vsi_nn_math.c \ + utils/vsi_nn_dtype.c \ + utils/vsi_nn_dtype_util.c \ + utils/vsi_nn_shape_util.c \ + utils/vsi_nn_limits.c \ + utils/vsi_nn_vdata.c \ + utils/vsi_nn_tensor_op.c \ + utils/vsi_nn_util.c \ + utils/vsi_nn_constraint_check.c + + +LOCAL_SRC_FILES += \ + quantization/vsi_nn_dynamic_fixed_point.c \ + quantization/vsi_nn_asymmetric_affine.c \ + quantization/vsi_nn_perchannel_symmetric_affine.c \ + + +LOCAL_SRC_FILES += \ + pycc/vsi_pycc_interface.c + + +LOCAL_SRC_FILES += \ + post/vsi_nn_post_fasterrcnn.c \ + post/vsi_nn_post_cmupose.c + +LOCAL_SRC_FILES += libnnext/vsi_nn_libnnext_resource.c + +LOCAL_SRC_FILES += kernel/vsi_nn_kernel.c \ + kernel/vsi_nn_kernel_util.c \ + kernel/vsi_nn_kernel_backend.c \ + kernel/vsi_nn_kernel_eltwise.c \ + kernel/vsi_nn_kernel_selector.c \ + kernel/vsi_nn_kernel_node.c \ + kernel/vsi_nn_kernel_param.c \ + kernel/vsi_nn_kernel_gpu_shape_optimize.c \ + kernel/vsi_nn_gpu.c + +LIBNNEXT_KERNEL_SOURCES := $(wildcard $(LOCAL_PATH)/libnnext/ops/kernel/*.c) +LOCAL_SRC_FILES += $(LIBNNEXT_KERNEL_SOURCES:$(LOCAL_PATH)/%=%) + +KERNEL_SOURCES := $(wildcard $(LOCAL_PATH)/kernel/cl/*.c) +KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/kernel/cpu/*.c) +KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/kernel/evis/*.c) +KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/kernel/vx/*.c) +KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/custom/ops/*.c) +KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/custom/ops/kernel/*.c) +LOCAL_SRC_FILES += $(KERNEL_SOURCES:$(LOCAL_PATH)/%=%) + +OPERATION_SOURCES := $(wildcard $(LOCAL_PATH)/ops/*.c) +LOCAL_SRC_FILES += $(OPERATION_SOURCES:$(LOCAL_PATH)/%=%) + + +LOCAL_SHARED_LIBRARIES := \ + liblog \ + libjpeg \ + libGAL \ + libOpenVX \ + libVSC \ + libdl + +LOCAL_C_INCLUDES += \ + external/libjpeg-turbo \ + $(AQROOT)/sdk/inc/CL \ + $(AQROOT)/sdk/inc/VX \ + $(AQROOT)/sdk/inc/ \ + $(AQROOT)/sdk/inc/HAL \ + $(LOCAL_PATH)/../include \ + $(LOCAL_PATH)/../include/ops \ + $(LOCAL_PATH)/../include/utils \ + $(LOCAL_PATH)/../include/infernce \ + $(LOCAL_PATH)/../include/client \ + $(LOCAL_PATH)/../include/libnnext + +LOCAL_CFLAGS := \ + -DLINUX \ + -D'OVXLIB_API=__attribute__((visibility("default")))' \ + -Wno-sign-compare \ + -Wno-implicit-function-declaration \ + -Wno-sometimes-uninitialized \ + -Wno-unused-parameter \ + -Wno-enum-conversion \ + -Wno-missing-field-initializers \ + -Wno-tautological-compare \ + -Wno-missing-braces + +LOCAL_MODULE:= libovxlib +LOCAL_MODULE_TAGS := optional +LOCAL_PRELINK_MODULE := false +include $(BUILD_SHARED_LIBRARY) diff --git a/src/tim/vx/internal/src/client/vsi_nn_vxkernel.c b/src/tim/vx/internal/src/client/vsi_nn_vxkernel.c new file mode 100644 index 0000000..60c05b6 --- /dev/null +++ 
b/src/tim/vx/internal/src/client/vsi_nn_vxkernel.c @@ -0,0 +1,574 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "VX/vx.h" +#include "VX/vxu.h" +#include "VX/vx_ext_program.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vsi_nn_libnnext_resource.h" +#if VSI_USE_VXC_BINARY +/*this header can be only included once in all *.c files*/ +#include "libnnext/vx_bin/vxc_binaries.h" +#endif + +static char s_vx_resource_path[VSI_NN_MAX_PATH] = "VX"; + +uint8_t * vsi_nn_LoadBinarySource + ( + uint8_t * file, + int32_t * sz + ) +{ + int32_t len; + int32_t n; + FILE *fp; + uint8_t *buf; + + buf = NULL; + + fp = fopen( (char *)file, "rb" ); + + VSILOGI( "Loading program from binary file." ); + if( NULL == fp ) + { + VSILOGE( "Open program file fail." ); + return buf; + } + + fseek( fp, 0, SEEK_END ); + len = ftell( fp ); + fseek( fp, 0, SEEK_SET ); + + buf = (uint8_t *)malloc( len + 1 ); + n = (int32_t)fread( buf, 1, len, fp ); + fclose( fp ); + + if( n != len ) + { + VSILOGE( "Read source file error(%d/%d).", n, len ); + } + + buf[len] = 0; + + if( NULL != sz ) + { + *sz = len; + } + return buf; +} /* vsi_nn_LoadBinarySource() */ + +static vsi_status vsi_nn_InitKernel + ( + vx_kernel_description_t * kernel, + vx_kernel obj + ) +{ + vsi_status status; + uint32_t i; + + status = VSI_SUCCESS; + for( i = 0; i < kernel->numParams; i ++ ) + { + status = vxAddParameterToKernel( + obj, + i, + kernel->parameters[i].direction, + kernel->parameters[i].data_type, + kernel->parameters[i].state + ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Add parameter %d to kernel %s fail. 
with %d.", + i, kernel->name, status ); + break; + } + } + + if( VSI_SUCCESS == status ) + { + status = vxFinalizeKernel( obj ); + } + + if( VSI_SUCCESS != status ) + { + VSILOGE( "Finalize kernel %s fail with %d.", + kernel->name, status ); + status = vxRemoveKernel( obj ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Remove kernel %s fail with %d.", + kernel->name, status ); + } + } + return status; +} + +static vsi_status vsi_nn_RegisterCPUKernel + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_info_t * kernel_info + ) +{ + vsi_status status; + vx_kernel obj; + vx_context ctx; + vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index]; + + status = VSI_FAILURE; + ctx = vxGetContext( (vx_reference)graph->g ); + + obj = vxAddUserKernel( + ctx, + kernel->name, + kernel->enumeration, + kernel->function, + kernel->numParams, + kernel->validate, + kernel->initialize, + kernel->deinitialize + ); + + if( NULL != obj ) + { + status = vsi_nn_InitKernel(kernel,obj); + vxReleaseKernel( &obj ); + } + else + { + VSILOGE( "Add kernel %s fail.", kernel->name ); + } + return status; +} /* vsi_nn_RegisterCPUKernel() */ + +static const char * vsi_nn_LoadVxResourceFromFile + ( + char * resource_name, + vx_size * program_len + ) +{ + char resource_path[VSI_NN_MAX_PATH]; + const char * vx_resource_path = vsi_nn_VxResourceGetPath(); + + if (strncmp(vx_resource_path, "", VSI_NN_MAX_PATH) == 0) + { + VSILOGE("No Valid VX Resource Path Error!\n"); + } + snprintf(resource_path, VSI_NN_MAX_PATH, "%s/%s.vx", vx_resource_path, resource_name); + return (char *)vsi_nn_LoadBinarySource((uint8_t *)resource_path, (int32_t *)program_len); +} + +static vsi_status vsi_nn_RegisterVXKernel + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_info_t * kernel_info + ) +{ + vsi_status status = VSI_FAILURE; + vx_kernel obj = NULL; + vx_program program = NULL; + vx_size * program_len = NULL; + const char **program_src = NULL; + vx_context ctx = NULL; + vsi_nn_context_t context = NULL; + vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index]; + uint8_t i = 0; + vsi_bool load_from_file = FALSE; + +#define MAX_BUILDPROGRAM_LEN 128 + char cmd[MAX_BUILDPROGRAM_LEN] = {0}; + int32_t evis = 0; + + memset(cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN); + status = VSI_FAILURE; + ctx = vxGetContext( (vx_reference)graph->g ); + context = graph->ctx; + evis = context->config.evis.ver; + + program_src = (const char**)malloc(kernel_info->resource_num * sizeof(char *)); + program_len = (vx_size*)malloc(kernel_info->resource_num * sizeof(vx_size)); + for (i = 0; i < kernel_info->resource_num; i++) + { + program_src[i] = vsi_nn_resource_load_source_code( + kernel_info->resource_name[i], &program_len[i], VSI_NN_KERNEL_TYPE_EVIS); + if (program_src[i] == NULL) + { + VSILOGI("Try to Load VX Resource from file...\n"); + + program_src[i] = vsi_nn_LoadVxResourceFromFile(kernel_info->resource_name[i], &program_len[i]); + load_from_file = TRUE; + } + } + + program = vxCreateProgramWithSource(ctx, kernel_info->resource_num, program_src, (vx_size *)program_len); + status = vxGetStatus((vx_reference)program); + if(VSI_SUCCESS != status) + { + VSILOGE("[%s : %d] vxCreateProgramWithSource() Error!\n", __FILE__, __LINE__); + status = VSI_FAILURE; + goto OnError; + } + + if(evis == VSI_NN_HW_EVIS_NONE) + { + // set default evis version is 2 + sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=2"); + } + else + { + sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=%d", evis); + } + status = vxBuildProgram(program, cmd); + + 
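    /* Editorial note: `cmd` assembled above carries the VeriSilicon OpenCL front-end
     * options -- "-cl-viv-vx-extension" enables the VX C extensions used by the .vx
     * sources, and VX_VERSION selects the EVIS instruction level taken from
     * context->config.evis.ver (defaulted to 2 when the device reports none). */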
if(VSI_SUCCESS != status) + { + VSILOGE("[%s : %d] vxBuildProgram() Error!\n", __FILE__, __LINE__); + } + + obj = vxAddKernelInProgram(program, + kernel->name, + kernel->enumeration, + kernel->numParams, + kernel->validate, + kernel->initialize, + kernel->deinitialize + ); + + if( NULL != obj ) + { + status = vsi_nn_InitKernel(kernel,obj); + vxReleaseProgram(&program); + } + else + { + VSILOGE( "Add kernel %s fail.", kernel->name ); + } +OnError: + for (i = 0; i < kernel_info->resource_num; i++) + { + if (program_src[i] && load_from_file) + { + free((char *)program_src[i]); + } + } + if(program_src) free((char**)program_src); + if(program_len) free(program_len); + return status; +} + +static vsi_status vsi_nn_RegisterBinKernel + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_info_t * kernel_info + ) +{ + vsi_status status; + vx_kernel obj; + vx_program program = NULL; + vx_size program_len = 0; + const uint8_t *program_ptr = NULL; + vx_context ctx; + vsi_nn_context_t context; + vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index]; + +#define MAX_BUILDPROGRAM_LEN 128 + char cmd[MAX_BUILDPROGRAM_LEN]; + int32_t evis; + + memset(cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN); + status = VSI_FAILURE; + + ctx = vxGetContext( (vx_reference)graph->g ); + context = graph->ctx; + evis = context->config.evis.ver; + + program_ptr = vsi_nn_VxBinResourceGetResource(kernel_info->resource_name[kernel_info->resource_num - 1], &program_len); + program = vxCreateProgramWithBinary(ctx, (const vx_uint8 *)program_ptr, program_len); + + status = vxGetStatus((vx_reference)program); + if(VSI_SUCCESS != status) + { + VSILOGE("[%s : %d] vxCreateProgramWithSource() Error!\n", __FILE__, __LINE__); + return status; + } + +#if 1 + if(evis == VSI_NN_HW_EVIS_NONE) + { + // set default evis version is 2 + sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=2"); + } + else + { + sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=%d", evis); + } +#else + sprintf(cmd, "-cl-viv-vx-extension"); +#endif + status = vxBuildProgram(program, cmd); + + if(VSI_SUCCESS != status) + { + VSILOGE("[%s : %d] vxBuildProgram() Error!\n", __FILE__, __LINE__); + return status; + } + + obj = vxAddKernelInProgram(program, + kernel->name, + kernel->enumeration, + kernel->numParams, + kernel->validate, + kernel->initialize, + kernel->deinitialize + ); + + if( NULL != obj ) + { + status = vsi_nn_InitKernel(kernel,obj); + vxReleaseProgram(&program); + } + else + { + VSILOGE( "Add kernel %s fail.", kernel->name ); + } + + return status; +} + +vsi_status vsi_nn_RegisterClientKernel + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_info_t * kernel_info + ) +{ + vsi_status status = VSI_SUCCESS; + switch (kernel_info->type) + { + case VX_KERNEL_TYPE_VX: + status = vsi_nn_RegisterVXKernel(graph, kernel_info); + break; + + case VX_KERNEL_TYPE_CPU: + status = vsi_nn_RegisterCPUKernel(graph, kernel_info); + break; + + case VX_KERNEL_TYPE_BIN: + status = vsi_nn_RegisterBinKernel(graph, kernel_info); + break; + + default: + status = VSI_FAILURE; + } + return status; +} /* vsi_nn_RegisterClientKernel() */ + +vx_node vsi_nn_RegisterClientKernelAndNewNode + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_info_t * kernel_info + ) +{ + vsi_status status; + vx_context ctx; + vx_kernel obj; + vx_node node; + vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index]; + + ctx = vxGetContext( (vx_reference)graph->g ); + + /* Load kernel */ + obj = vxGetKernelByName( ctx, kernel->name ); + status = vxGetStatus( (vx_reference)obj ); + 
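    /* Editorial note: the logic below is a "get or register" pattern -- if the kernel
     * is not yet known to this vx_context, vxGetKernelByName() fails, the kernel is
     * registered through vsi_nn_RegisterClientKernel(), and the lookup is retried
     * before a generic node is created from it. */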
if (VSI_SUCCESS != status) + { + /* Register kernel */ + status = vsi_nn_RegisterClientKernel( graph, kernel_info); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Register client kernel %s fail with %d.", + kernel->name, status ); + return NULL; + } + else + { + VSILOGI( "Register client kernel %s successfully.", + kernel->name ); + } + + /* Load kernel */ + obj = vxGetKernelByName( ctx, kernel->name ); + status = vxGetStatus( (vx_reference)obj ); + } + + if( VSI_SUCCESS != status ) + { + VSILOGE( "Load client kernel %s fail with %d.", + kernel->name, status ); + return NULL; + } + + /* Create node */ + node = vxCreateGenericNode( graph->g, obj ); + vxReleaseKernel( &obj ); + status = vxGetStatus( (vx_reference)node ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Load client node from kernel %s fail with %d.", + kernel->name, status ); + return NULL; + } + return node; +} /* vsi_nn_RegisterClientKernelAndNewNode() */ + +vx_node vsi_nn_RegisterClientKernelAndCreateNode + ( + vsi_nn_graph_t * graph, + vx_kernel_description_t * kernel + ) +{ + /* + * Deprecated: use vsi_nn_RegisterClientKernelAndNewNode() insteatd. + */ + vsi_nn_kernel_info_t kernel_info; + char *resource_name[1] = {NULL}; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + kernel_info.resource_name = resource_name; + kernel_info.resource_name[0] = "old_client_interface"; + kernel_info.type = VX_KERNEL_TYPE_CPU; + kernel_info.kernel = &kernel; + kernel_info.kernel_index = 0; + kernel_info.init_index = 0; + + return vsi_nn_RegisterClientKernelAndNewNode(graph, &kernel_info); +} /* vsi_nn_RegisterClientKernelAndCreateNode() */ + +vsi_status vsi_nn_ClientNodePassParameters + ( + vx_node node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + uint8_t i; + + status = VSI_FAILURE; + for( i = 0; i < num; i++ ) + { + status = vxSetParameterByIndex( node, i, params[i] ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Set %d parameter fail.", i ); + break; + } + } + return status; +} /* vsi_nn_ClientKernelPassParameters() */ + +vsi_status VX_CALLBACK vsi_nn_KernelValidator + ( + vx_node node, + const vx_reference parameters[], + uint32_t num, + vx_meta_format metas[] +) +{ + return VSI_SUCCESS; +} /* vsi_nn_KernelValidator() */ + +vsi_status VX_CALLBACK vsi_nn_KernelInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + uint32_t paraNum + ) +{ + return VSI_SUCCESS; +} /* vsi_nn_KernelInitializer() */ + +vsi_status VX_CALLBACK vsi_nn_KernelDeinitializer + ( + vx_node nodObj, + const vx_reference *paraObj, + uint32_t paraNum + ) +{ + return VSI_SUCCESS; +} /* vsi_nn_KernelDeinitializer() */ + +const char * vsi_nn_VxResourceGetPath() +{ + return s_vx_resource_path; +} /* vsi_nn_VxResourceGetPath() */ + +void vsi_nn_VxResourceSetPath + ( + char* path + ) +{ + strncpy(s_vx_resource_path, path, VSI_NN_MAX_PATH - 1); +} /* vsi_nn_VxResourceSetPath() */ + +const uint8_t * vsi_nn_VxBinResourceGetResource + ( + char* name, + vx_size *len + ) +{ +#if VSI_USE_VXC_BINARY + int i; + for (i = 0; i < vx_bin_resource_items_cnt; i++) + { + if (strncmp(vx_bin_resource_items[i].name, name, VSI_NN_MAX_PATH) == 0) + { + *len = vx_bin_resource_items[i].len; + return vx_bin_resource_items[i].data; + } + } +#endif + return NULL; +} /* vsi_nn_VxResourceGetBinResource() */ + +vx_kernel_type_e vsi_nn_GetVXKernelTypeForShader() +{ +#if VSI_USE_VXC_BINARY + return VX_KERNEL_TYPE_BIN; +#else + return VX_KERNEL_TYPE_VX; +#endif +} + +vx_bool vsi_nn_is_do_vx_op_pre_init + ( + vx_kernel_type_e type + ) +{ + return 
(vx_bool)(type == VX_KERNEL_TYPE_VX || type == VX_KERNEL_TYPE_BIN); +} diff --git a/src/tim/vx/internal/src/custom/ops/kernel/vsi_nn_kernel_custom_softmax.c b/src/tim/vx/internal/src/custom/ops/kernel/vsi_nn_kernel_custom_softmax.c new file mode 100644 index 0000000..0230420 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/vsi_nn_kernel_custom_softmax.c @@ -0,0 +1,231 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_ID VX_KERNEL_ID(CUSTOM_SOFTMAX) +#define _VX_KERNEL_VAR_CPU (vx_client_kernel_CUSTOM_SOFTMAX_CPU) +#define _VX_KERNEL_VAR_VX (vx_client_kernel_CUSTOM_SOFTMAX_VX) +#define _VX_KERNEL_NAME ("com.vivantecorp.extension.CustomSoftmaxVXC") +#define _VX_KERNEL_FUNC_KERNEL (vxCustomSoftmaxKernel) + +static vsi_status VX_CALLBACK vxCustomSoftmaxKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ + vsi_status status = VX_SUCCESS; + vx_tensor input = NULL,output = NULL; + float *f32_in_buffer = NULL,*f32_out_buffer=NULL; + vx_context context = NULL; + vsi_nn_tensor_attr_t in_attr,out_attr; + uint32_t i,in_elements,out_elements; + int32_t sf_axis; + float fMax = 0.0; + float fProbSum = 0.0f; + + context = vxGetContext((vx_reference)node); + input = (vx_tensor)paramObj[0]; + output = (vx_tensor)paramObj[1]; + vxCopyScalar((vx_scalar)paramObj[2], &(sf_axis),VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + /* Fill input & output attribute data struct */ + status = vsi_nn_vxGetTensorAttr(input, &in_attr); + TEST_CHECK_STATUS(status, final); + status = vsi_nn_vxGetTensorAttr(output, &out_attr); + TEST_CHECK_STATUS(status, final); + + in_elements = vsi_nn_vxGetTensorElementNum(&in_attr); + out_elements = vsi_nn_vxGetTensorElementNum(&out_attr); + + /* alloc the float32 data buffer */ + f32_in_buffer = (float *)malloc(in_elements * sizeof(float)); + f32_out_buffer= (float *)malloc(out_elements * sizeof(float)); + memset(f32_in_buffer, 0, in_elements * sizeof(float)); + 
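    /* Editorial note: this CPU reference path works entirely in float32 -- the input
     * tensor is converted into the f32 buffers allocated above, the loops below
     * compute the numerically stable softmax
     *     y[i] = expf(x[i] - max(x)) / sum_j expf(x[j] - max(x)),
     * and the result is converted back to the output tensor's dtype. */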
memset(f32_out_buffer, 0, out_elements * sizeof(float)); + + /* Copy tensor to buffer, and convert bufer to float32 format */ + status = vsi_nn_vxConvertTensorToFloat32Data( + context, input, &in_attr, f32_in_buffer, in_elements * sizeof(float)); + TEST_CHECK_STATUS(status, final); + + /* Softmax implement */ + for ( i = 0; i < out_elements; i++) + { + fMax = f32_in_buffer[i] > fMax ? f32_in_buffer[i] : fMax; + } + + for ( i = 0; i < out_elements; i++) + { + f32_out_buffer[i] = (float)expf(f32_in_buffer[i] - fMax); + fProbSum += f32_out_buffer[i]; + } + for ( i = 0; i < out_elements; i++) + { + f32_out_buffer[i] = f32_out_buffer[i]/ fProbSum; + } + status = vsi_nn_vxConvertFloat32DataToTensor( + context, output, &out_attr, f32_out_buffer, out_elements * sizeof(float)); + +final: + if(f32_in_buffer)free(f32_in_buffer); + if(f32_out_buffer)free(f32_out_buffer); + return status; +} + +static vx_status VX_CALLBACK vxCustomSoftmaxInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + vx_uint32 paraNum + ) +{ + vx_status status = VX_SUCCESS; + /*TODO: Add initial code for VX program*/ + // Alignment with a power of two value. +#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) + vx_kernel_execution_parameters_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + int input_size[6] = {1, 1, 1, 1, 1, 1}; + int sf_size; + uint32_t input_dims; + uint32_t i; + vsi_nn_tensor_attr_t input_attr; + + memset(&input_attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + status = vsi_nn_vxGetTensorAttr((vx_tensor)paramObj[0], &input_attr); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); + return status; + } + + input_dims = input_attr.dim_num; + for (i = 0; i < input_dims; i++) + { + input_size[i] = input_attr.size[i]; + } + + sf_size = input_size[0]; + + shaderParam.globalWorkOffset[0] = 0; + shaderParam.globalWorkOffset[1] = 0; + shaderParam.globalWorkScale[0] = 1; + shaderParam.globalWorkScale[1] = 1; + shaderParam.localWorkSize[0] = 1; + shaderParam.localWorkSize[1] = 1; + shaderParam.globalWorkSize[0] = + gcmALIGN((1 + shaderParam.globalWorkScale[0] - 1) / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); + shaderParam.globalWorkSize[1] = + gcmALIGN((1 + shaderParam.globalWorkScale[1] - 1) / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); + { + vx_uint32 Uni4x4_Fp16ToFp32[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }; + + vxSetNodeUniform(nodObj, "Uni4x4_Fp16ToFp32", 1, Uni4x4_Fp16ToFp32); + vxSetNodeUniform(nodObj, "sf_size", 1, &sf_size); + } + status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + + if(status < 0) + { + VSILOGE("Initializer failure!"); + } + + return status; +} + +static vx_param_description_t s_params[] = +{ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, +}; + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t _VX_KERNEL_VAR_CPU = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + _VX_KERNEL_FUNC_KERNEL, + s_params, + _cnt_of_array( s_params ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t _VX_KERNEL_VAR_VX = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + NULL, + s_params, + _cnt_of_array( s_params ), + vsi_nn_KernelValidator, + NULL, + NULL, + vxCustomSoftmaxInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_CUSTOM_SOFTMAX_list[] = +{ + &_VX_KERNEL_VAR_CPU, + &_VX_KERNEL_VAR_VX, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_softmax.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_softmax.c new file mode 100644 index 0000000..215334e --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_softmax.c @@ -0,0 +1,299 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" + +#define _ARG_NUM (1) +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +extern vx_kernel_description_t * vx_kernel_CUSTOM_SOFTMAX_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_custom_softmax_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.custom_softmax); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ +#define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_INT32, axis ); +#undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + //vsi_nn_tensor_attr_t attr; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /*TODO: Add code if need to change your parameter*/ + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); +#if 0 + memcpy(&attr, &(inputs[0]->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[0] = attr.size[0]; + attr.size[1] = 1; + attr.dim_num = 2; + params[0] = (vx_reference)vxReshapeTensor(inputs[0]->t, (int32_t*)(attr.size), attr.dim_num); + params[1] = (vx_reference)vxReshapeTensor(outputs[0]->t, (int32_t*)(attr.size), attr.dim_num); +#endif + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); +#if 0 + vxReleaseTensor((vx_tensor*)¶ms[0]); + vxReleaseTensor((vx_tensor*)¶ms[1]); +#endif + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute, + NULL +}; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_kernel_info_t kernel_info; + char *path = NULL; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + status = VSI_FAILURE; + kernel_info.type = VX_KERNEL_TYPE_CPU; + kernel_info.kernel = vx_kernel_CUSTOM_SOFTMAX_list; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_custom_softmax"; + path = getenv("USER_VX_SOURCE_PATH"); + if(path) + { + vsi_nn_VxResourceSetPath(path); + } + + if( kernel_info.type == VX_KERNEL_TYPE_VX) + { + kernel_info.kernel_index = 1; + kernel_info.init_index = 1; + } + else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ + { + kernel_info.kernel_index = 0; + kernel_info.init_index = 0; + } + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) + { + free(kernel_info.resource_name); + } + if( NULL == self->n ) + { + return VSI_FAILURE; + } + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check input tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Compute output tensor shape. 
*/ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_SOFTMAX, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/custom/ops/vx/vsi_nn_kernel_custom_softmax.vx b/src/tim/vx/internal/src/custom/ops/vx/vsi_nn_kernel_custom_softmax.vx new file mode 100644 index 0000000..fce529d --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/vx/vsi_nn_kernel_custom_softmax.vx @@ -0,0 +1,72 @@ +/* + ============================================================================ + Name : Softmax2.vx + Author : VSI + Version : + Copyright : Your copyright notice + Description : + ============================================================================ + */ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32; +_viv_uniform int sf_size; + #define F_MAX(a,b) ((a)>(b)?(a):(b)) +__kernel void Softmax2VXC + ( + image2d_array_t input, + image2d_array_t output, + int axis + ) +{ + + int4 coord_in = (int4)(0,0,0,0); + float fMax = 0.0; + for (int i = 0; i < sf_size; i++) + { + vxc_char8 val; + coord_in.x = i; + VXC_ReadImage2DArray(val, input, coord_in, VXC_5BITOFFSET_XY(0,0), VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); + float fval; + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); + + fMax = F_MAX(fMax, fval); + } + + float fProbSum = 0.0f; + for (int i = 0; i < sf_size; i++) + { + vxc_char8 val; + + coord_in.x = i; + VXC_ReadImage2DArray(val, input, coord_in, VXC_5BITOFFSET_XY(0,0), VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); + float fval; + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); + + float fOut = (float)exp(fval - fMax); + fProbSum += fOut; + half hVal; + _viv_asm(CONV,hVal,fOut); + VXC_WriteImage2DArray(output, coord_in, hVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } + + for (int i = 0; i < sf_size; i++) + { + vxc_short8 val; + vxc_half8 val_h; + coord_in.x = i; + VXC_ReadImage2DArray(val, output, coord_in, VXC_5BITOFFSET_XY(0,0), VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); + float fval; + _viv_asm(COPY, val_h,val, 16); + VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); + + float fOut =fval/fProbSum; + half hVal; + _viv_asm(CONV,hVal,fOut); + VXC_WriteImage2DArray(output, coord_in, hVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } + +} + + + diff --git a/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c new file mode 100644 index 0000000..163ebe1 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c @@ -0,0 +1,287 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal 
in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _ADD_MEAN_STD_NORM_KERNEL_SOURCE "add_mean_std_norm" + + +// Add kernel hashtable here +#define ADD_MEAN_STD_NORM_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + ((IN0_DTYPE << 20) | ( IN1_DTYPE << 12 ) | ( OUT_DTYPE << 4) ) + +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { ADD_MEAN_STD_NORM_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.add_mean_std_norm_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE), \ + _ADD_MEAN_STD_NORM_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _add_mean_std_norm_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32, F32 ), + PACK_KERNEL_MAP( U8 , U8 , F32 ), + PACK_KERNEL_MAP( U8 , U8 , U8 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _add_mean_std_norm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _ADD_MEAN_STD_NORM_PARAM_NUM _cnt_of_array( _add_mean_std_norm_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + // Alignment with a power of two value. 
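    /* Editorial note: gpu_param_t mirrors the shader execution parameters -- work
     * dimension, global work offset, per-thread scale, local work-group size and
     * global work size -- and is filled in below from the first input's shape. */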
+ gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vx_tensor input0 = (vx_tensor)param[0]; + vsi_nn_kernel_tensor_attr_t *input0_attr = NULL; + vsi_int_array_t *input_shape = NULL; + + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); + CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + input_shape = input0_attr->shape; + + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.global_size[0] = 16; + gpu_param.global_size[1] = gpu_align_p2( (input_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1], gpu_param.local_size[1] ); + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (input0_attr) + { + vsi_nn_kernel_tensor_attr_release(&input0_attr); + } + return status; +} /* _add_mean_std_norm_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _add_mean_std_norm_kernel_map; + size_t kernel_map_size = _cnt_of_array( _add_mean_std_norm_kernel_map ); + vx_param_description_t * param_def = _add_mean_std_norm_kernel_param_def; + size_t param_def_size = _cnt_of_array( _add_mean_std_norm_kernel_param_def ); + vx_kernel_initialize_f initializer = _add_mean_std_norm_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in0_dtype) + { + in0_dtype = F32; + } + + if (F16 == in1_dtype) + { + in1_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + + key = ADD_MEAN_STD_NORM_HASH_KEY( in0_dtype, in1_dtype, out_dtype ); + + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ADD_MEAN_STD_NORM_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + float rsEps = (float)(1.0f / sqrtf(eps)); + float dimRatio = (float)(1.0f / (inputs[0]->attr.size[0])); + float input0Scale = inputs[0]->attr.dtype.scale; + float input0Tail = 
(float)inputs[0]->attr.dtype.zero_point * input0Scale; + float input1Scale = inputs[1]->attr.dtype.scale; + float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; + float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 0.0f : 1.0f / outputs[0]->attr.dtype.scale; + float outputZP = (float)outputs[0]->attr.dtype.zero_point; + int32_t width = (int32_t)inputs[0]->attr.size[0]; + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = 0; + border.constant_value.S16 = 0; + border.constant_value.U8 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vsi_nn_kernel_node_set_border( node, &border ); + VSI_ASSERT( status == VSI_SUCCESS ); + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ADD_MEAN_STD_NORM_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[_CPU_IO_NUM] = vsi_nn_kernel_scalar_create( graph, F32, &rsEps ); + node_params[_CPU_IO_NUM + 1] = vsi_nn_kernel_scalar_create( graph, F32, &dimRatio ); + node_params[_CPU_IO_NUM + 2] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale ); + node_params[_CPU_IO_NUM + 3] = vsi_nn_kernel_scalar_create( graph, F32, &input0Tail ); + node_params[_CPU_IO_NUM + 4] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale ); + node_params[_CPU_IO_NUM + 5] = vsi_nn_kernel_scalar_create( graph, F32, &input1Tail ); + node_params[_CPU_IO_NUM + 6] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[_CPU_IO_NUM + 7] = vsi_nn_kernel_scalar_create( graph, F32, &outputZP ); + node_params[_CPU_IO_NUM + 8] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ADD_MEAN_STD_NORM_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM] ); + vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM + 1] ); + vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM + 2] ); + vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM + 3] ); + vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM + 4] ); + vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM + 5] ); + vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM + 6] ); + vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM + 7] ); + vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM + 8] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( add_mean_std_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c new file mode 100644 index 0000000..6311201 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c @@ -0,0 +1,283 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + + +/* + * Define kernel meta. 
+ */ +#define HASH_ARGMAX_KEY(_axis, _input_type, _output_type, _image_2d) \ + ((_axis << 20) | (_input_type << 12) | (_output_type << 4) | (_image_2d)) + + #define VSI_NN_GEN_ARGMAX_KERNEL_SOURCE_NAME(_axis) \ + "argmax_axis"#_axis + +#define HASH_ARGMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("argmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE) + +#define TENSOR_ARGMAX_KERNELS(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_ARGMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \ + HASH_ARGMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ + VSI_NN_GEN_ARGMAX_KERNEL_SOURCE_NAME(AXIS) }, + +#define TENSOR_ARGMAX_FLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_ARGMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \ + HASH_ARGMAX_SH_KERNEL_NAME(AXIS, F32, OUT_TYPE), \ + VSI_NN_GEN_ARGMAX_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_ARGMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("argmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_ARGMAX_KERNELS_2D(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_ARGMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \ + HASH_ARGMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ + VSI_NN_GEN_ARGMAX_KERNEL_SOURCE_NAME(AXIS) }, + +#define TENSOR_ARGMAX_FLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_ARGMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \ + HASH_ARGMAX_SH_KERNEL_2D_NAME(AXIS, F32, OUT_TYPE), \ + VSI_NN_GEN_ARGMAX_KERNEL_SOURCE_NAME(AXIS) }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_ARGMAX_FLOAT(0, F32, I32) + TENSOR_ARGMAX_FLOAT(1, F32, I32) + TENSOR_ARGMAX_FLOAT(2, F32, I32) + TENSOR_ARGMAX_FLOAT(0, F16, I32) + TENSOR_ARGMAX_FLOAT(1, F16, I32) + TENSOR_ARGMAX_FLOAT(2, F16, I32) + + TENSOR_ARGMAX_FLOAT_2D(0, F32, I32) + TENSOR_ARGMAX_FLOAT_2D(1, F32, I32) + TENSOR_ARGMAX_FLOAT_2D(2, F32, I32) + TENSOR_ARGMAX_FLOAT_2D(0, F16, I32) + TENSOR_ARGMAX_FLOAT_2D(1, F16, I32) + TENSOR_ARGMAX_FLOAT_2D(2, F16, I32) + + TENSOR_ARGMAX_KERNELS(0, I32, I32) + TENSOR_ARGMAX_KERNELS(1, I32, I32) + TENSOR_ARGMAX_KERNELS(2, I32, I32) + TENSOR_ARGMAX_KERNELS(0, U8, I32) + TENSOR_ARGMAX_KERNELS(1, U8, I32) + TENSOR_ARGMAX_KERNELS(2, U8, I32) + + TENSOR_ARGMAX_KERNELS_2D(0, I32, I32) + TENSOR_ARGMAX_KERNELS_2D(1, I32, I32) + TENSOR_ARGMAX_KERNELS_2D(2, I32, I32) + TENSOR_ARGMAX_KERNELS_2D(0, U8, I32) + TENSOR_ARGMAX_KERNELS_2D(1, U8, I32) + TENSOR_ARGMAX_KERNELS_2D(2, U8, I32) +}; + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) +#define SCALAR_AXIS_SIZE_VALUE (2) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_argmax_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer 
fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) vsi_nn_kernel_tensor_attr_release( &attr[0] ); + if (attr[1]) vsi_nn_kernel_tensor_attr_release( &attr[1] ); + + return status; +} /* _argmax_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + int32_t axis, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if(input_dtype == I8) + { + input_dtype = I32; + } + + key = HASH_ARGMAX_KEY( axis, input_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _argmax_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + int32_t axis_size = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + + axis_size = inputs[0]->attr.size[axis]; + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( inputs, outputs, axis, image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if( node ) + { + vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, + inputs, 1, outputs, 1 ); + node_params[SCALAR_AXIS_SIZE_VALUE] = vsi_nn_kernel_scalar_create( + graph, I32, &axis_size ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, OnError ); + + } + } + +OnError: + if (node_params[SCALAR_AXIS_SIZE_VALUE]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_AXIS_SIZE_VALUE] ); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( argmax, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c new file mode 100644 index 0000000..7afa3b6 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c @@ -0,0 +1,276 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + + +/* + * Define kernel meta. 
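+ *
+ * Each kernel_map entry below is looked up by a key that packs the reduced
+ * axis, input dtype, output dtype and a 2D-image flag into one uint32_t
+ * (axis << 20 | input << 12 | output << 4 | image_2d, see HASH_ARGMIN_KEY),
+ * and maps it to a CL function name plus the source file implementing it.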
+ */ +#define HASH_ARGMIN_KEY(_axis, _input_type, _output_type, _image_2d) \ + ((_axis << 20) | (_input_type << 12) | (_output_type << 4) | (_image_2d)) + + #define VSI_NN_GEN_ARGMIN_KERNEL_SOURCE_NAME(_axis) \ + "argmin_axis"#_axis + +#define HASH_ARGMIN_SH_KERNEL_NAME(AXIS, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("argmin_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE) + +#define TENSOR_ARGMIN_KERNELS(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_ARGMIN_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \ + HASH_ARGMIN_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ + VSI_NN_GEN_ARGMIN_KERNEL_SOURCE_NAME(AXIS) }, + +#define TENSOR_ARGMIN_FLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_ARGMIN_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \ + HASH_ARGMIN_SH_KERNEL_NAME(AXIS, F32, OUT_TYPE), \ + VSI_NN_GEN_ARGMIN_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_ARGMIN_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("argmin_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_ARGMIN_KERNELS_2D(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_ARGMIN_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \ + HASH_ARGMIN_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ + VSI_NN_GEN_ARGMIN_KERNEL_SOURCE_NAME(AXIS) }, + +#define TENSOR_ARGMIN_FLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_ARGMIN_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \ + HASH_ARGMIN_SH_KERNEL_2D_NAME(AXIS, F32, OUT_TYPE), \ + VSI_NN_GEN_ARGMIN_KERNEL_SOURCE_NAME(AXIS) }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_ARGMIN_FLOAT(0, F32, I32) + TENSOR_ARGMIN_FLOAT(1, F32, I32) + TENSOR_ARGMIN_FLOAT(2, F32, I32) + TENSOR_ARGMIN_FLOAT(0, F16, I32) + TENSOR_ARGMIN_FLOAT(1, F16, I32) + TENSOR_ARGMIN_FLOAT(2, F16, I32) + + TENSOR_ARGMIN_FLOAT_2D(0, F32, I32) + TENSOR_ARGMIN_FLOAT_2D(1, F32, I32) + TENSOR_ARGMIN_FLOAT_2D(2, F32, I32) + TENSOR_ARGMIN_FLOAT_2D(0, F16, I32) + TENSOR_ARGMIN_FLOAT_2D(1, F16, I32) + TENSOR_ARGMIN_FLOAT_2D(2, F16, I32) + + TENSOR_ARGMIN_KERNELS(0, I32, I32) + TENSOR_ARGMIN_KERNELS(1, I32, I32) + TENSOR_ARGMIN_KERNELS(2, I32, I32) + TENSOR_ARGMIN_KERNELS(0, U8, I32) + TENSOR_ARGMIN_KERNELS(1, U8, I32) + TENSOR_ARGMIN_KERNELS(2, U8, I32) + + TENSOR_ARGMIN_KERNELS_2D(0, I32, I32) + TENSOR_ARGMIN_KERNELS_2D(1, I32, I32) + TENSOR_ARGMIN_KERNELS_2D(2, I32, I32) + TENSOR_ARGMIN_KERNELS_2D(0, U8, I32) + TENSOR_ARGMIN_KERNELS_2D(1, U8, I32) + TENSOR_ARGMIN_KERNELS_2D(2, U8, I32) +}; + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) +#define SCALAR_AXIS_SIZE_VALUE (2) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_argmin_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer 
fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) vsi_nn_kernel_tensor_attr_release( &attr[0] ); + if (attr[1]) vsi_nn_kernel_tensor_attr_release( &attr[1] ); + + return status; +} /* _argmin_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + int32_t axis, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_ARGMIN_KEY( axis, input_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _argmin_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + int32_t axis_size = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + + axis_size = inputs[0]->attr.size[axis]; + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( inputs, outputs, axis, image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if( node ) + { + vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, + inputs, 1, outputs, 1 ); + node_params[SCALAR_AXIS_SIZE_VALUE] = vsi_nn_kernel_scalar_create( + graph, I32, &axis_size ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, OnError ); + + } + } + +OnError: + if (node_params[SCALAR_AXIS_SIZE_VALUE]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_AXIS_SIZE_VALUE] ); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( argmin, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c new file mode 100644 index 0000000..3278730 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c @@ -0,0 +1,310 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + + +/* + * Define kernel meta. 
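+ *
+ * Kernels are keyed by (input dtype, output dtype, image_2d) packed as
+ * input << 12 | output << 4 | image_2d (HASH_BATCH_NORM_KEY); both the float
+ * and quantized variants are generated from the single "batchnorm_single"
+ * source file.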
+ */ +#define HASH_BATCH_NORM_KEY( _input_type, _output_type, _image_2d) \ + ( (_input_type << 12) | (_output_type << 4) | (_image_2d)) + + #define VSI_NN_GEN_BATCH_NORM_KERNEL_SOURCE_NAME \ + "batchnorm_single" + +#define HASH_BATCH_NORM_SH_KERNEL_NAME( SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.batch_norm_"#SRC_TYPE"to"#DST_TYPE) + +#define TENSOR_BATCH_NORM_KERNELS( SRC_TYPE, OUT_TYPE) \ + { HASH_BATCH_NORM_KEY( SRC_TYPE, OUT_TYPE, 0), \ + HASH_BATCH_NORM_SH_KERNEL_NAME( SRC_TYPE, OUT_TYPE), \ + VSI_NN_GEN_BATCH_NORM_KERNEL_SOURCE_NAME }, + +#define TENSOR_BATCH_NORM_FLOAT( SRC_TYPE, OUT_TYPE) \ + { HASH_BATCH_NORM_KEY( SRC_TYPE, OUT_TYPE, 0), \ + HASH_BATCH_NORM_SH_KERNEL_NAME( F32, F32), \ + VSI_NN_GEN_BATCH_NORM_KERNEL_SOURCE_NAME }, + +#define HASH_BATCH_NORM_SH_KERNEL_2D_NAME( SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("batch_norm_"#SRC_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_BATCH_NORM_KERNELS_2D( SRC_TYPE, OUT_TYPE) \ + { HASH_BATCH_NORM_KEY( SRC_TYPE, OUT_TYPE, 1), \ + HASH_BATCH_NORM_SH_KERNEL_2D_NAME( SRC_TYPE, OUT_TYPE), \ + VSI_NN_GEN_BATCH_NORM_KERNEL_SOURCE_NAME }, + +#define TENSOR_BATCH_NORM_FLOAT_2D( SRC_TYPE, OUT_TYPE) \ + { HASH_BATCH_NORM_KEY( SRC_TYPE, OUT_TYPE, 1), \ + HASH_BATCH_NORM_SH_KERNEL_2D_NAME( F32, F32), \ + VSI_NN_GEN_BATCH_NORM_KERNEL_SOURCE_NAME }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_BATCH_NORM_FLOAT(F32, F32) + TENSOR_BATCH_NORM_FLOAT(F32, F32) + TENSOR_BATCH_NORM_FLOAT(F32, F32) + TENSOR_BATCH_NORM_FLOAT(F16, F16) + TENSOR_BATCH_NORM_FLOAT(F16, F16) + TENSOR_BATCH_NORM_FLOAT(F16, F16) + + TENSOR_BATCH_NORM_FLOAT_2D(F32, F32) + TENSOR_BATCH_NORM_FLOAT_2D(F32, F32) + TENSOR_BATCH_NORM_FLOAT_2D(F16, F16) + TENSOR_BATCH_NORM_FLOAT_2D(F16, F16) + + TENSOR_BATCH_NORM_KERNELS(U8, U8) + TENSOR_BATCH_NORM_KERNELS(U8, U8) + TENSOR_BATCH_NORM_KERNELS(U8, U8) + TENSOR_BATCH_NORM_KERNELS(U8, U8) + TENSOR_BATCH_NORM_KERNELS(U8, U8) + TENSOR_BATCH_NORM_KERNELS(U8, U8) + + TENSOR_BATCH_NORM_KERNELS_2D(U8, U8) + TENSOR_BATCH_NORM_KERNELS_2D(U8, U8) + TENSOR_BATCH_NORM_KERNELS_2D(U8, U8) + TENSOR_BATCH_NORM_KERNELS_2D(U8, U8) +}; + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) +#define SCALAR_INPUT_EPS (6) +#define SCALAR_INPUT_SCALE (7) +#define SCALAR_INPUT_TAIL (8) +#define SCALAR_OUTPUT_SCALE (9) +#define SCALAR_OUTPUT_ZP (10) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_log_softmax_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // 
localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_int_array_t * in_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + in_shape = attr[0]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = in_shape->data[0]; + gpu_param.global_size[1] = in_shape->data[1]; + gpu_param.global_size[2] = in_shape->size > 2 ? in_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + } + + return status; +} /* _log_softmax_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_BATCH_NORM_KEY( input_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _log_softmax_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + float input_scale = 1.0f; + float input_tail = 0; + float output_scale = 1.0f; + float output_zp = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float eps = vsi_nn_kernel_param_get_float32(params, "eps"); + + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) + { + input_scale = inputs[0]->attr.dtype.scale; + input_tail = 0 - (float)inputs[0]->attr.dtype.zero_point * inputs[0]->attr.dtype.scale; + } + + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) + { + input_scale = 1.0f / outputs[0]->attr.dtype.scale; + output_zp = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + } + + if ( (inputs[1]->attr.is_const && inputs[2]->attr.is_const) + || ( inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 + && inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32 ) + || ( inputs[2]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 + && inputs[2]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32) + || ( inputs[3]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 + && inputs[3]->attr.dtype.vx_type 
!= VSI_NN_TYPE_FLOAT32) + || ( inputs[4]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 + && inputs[4]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32 ) + ) + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + ) + { + return NULL; + } + + image_2d = ((inputs[0]->attr.dim_num == 2) || (inputs[0]->attr.size[2] == 1)); + status = _query_kernel( inputs, outputs, image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if( node ) + { + vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, + inputs, 5, outputs, 1 ); + + node_params[SCALAR_INPUT_EPS] = vsi_nn_kernel_scalar_create( + graph, F32, &eps ); + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &input_scale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &input_tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &output_scale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &output_zp ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_EPS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( batchnorm_single, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/cast_cl.c b/src/tim/vx/internal/src/kernel/cl/cast_cl.c new file mode 100644 index 0000000..d89849a --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/cast_cl.c @@ -0,0 +1,283 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +typedef enum _internal_img_dim_e +{ + IMAGE = 0, + IMAGE_2D, +} internal_img_dim_e; + +#define _CAST_KERNEL_SOURCE "cast" + +#define STR(a) #a +// Add kernel hashtable here +#define CAST_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (_image_2d)) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, SOURCE ) \ + { CAST_HASH_KEY( IN_DTYPE, OUT_DTYPE, IMAGE ), \ + CVIVANTE_NAMESPACE("cl.cast_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + SOURCE } + +#define PACK_KERNEL_MAP_2D( IN_DTYPE, OUT_DTYPE, SOURCE ) \ + { CAST_HASH_KEY( IN_DTYPE, OUT_DTYPE, IMAGE_2D ), \ + CVIVANTE_NAMESPACE("cl.cast_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _cast_kernel_map[] = +{ + PACK_KERNEL_MAP( F32, F32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( F32, I32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( F32, U32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I32, F32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I32, I32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I32, U32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U32, F32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U32, I32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U32, U32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( F32, BOOL8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I32, BOOL8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U32, BOOL8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( F32, F32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( F32, I32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( F32, U32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I32, F32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I32, I32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I32, U32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( U32, F32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( U32, I32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( U32, U32, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( F32, BOOL8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I32, BOOL8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( U32, BOOL8, _CAST_KERNEL_SOURCE ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _cast_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CAST_PARAM_NUM _cnt_of_array( _cast_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_cast_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_int_array_t * out_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 
1; + + gpu_param.dim = out_shape->size < 3 ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _cast_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _cast_kernel_map; + size_t kernel_map_size = _cnt_of_array( _cast_kernel_map ); + vx_param_description_t * param_def = _cast_kernel_param_def; + size_t param_def_size = _cnt_of_array( _cast_kernel_param_def ); + vx_kernel_initialize_f initializer = _cast_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + else if ((I8 == in_dtype) || (I16 == in_dtype)) + { + in_dtype = I32; + } + else if ((U8 == in_dtype) || (U16 == in_dtype)) + { + in_dtype = U32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + else if ((I8 == out_dtype) || (I16 == out_dtype)) + { + out_dtype = I32; + } + else if ((U8 == out_dtype) || (U16 == out_dtype)) + { + out_dtype = U32; + } + + key = CAST_HASH_KEY( in_dtype, out_dtype, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CAST_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, image_2d ); + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CAST_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
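+             * Cast needs no extra scalar arguments, so only the packed
+             * input/output tensors are forwarded to the kernel.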
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CAST_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( cast, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/clip_cl.c b/src/tim/vx/internal/src/kernel/cl/clip_cl.c new file mode 100644 index 0000000..d8cb733 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/clip_cl.c @@ -0,0 +1,306 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define _CLIP_KERNEL_SOURCE(_input_type) "clip_"#_input_type + +#define STR(a) #a +// Add kernel hashtable here +#define CLIP_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (_image_2d)) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { CLIP_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0 ), \ + CVIVANTE_NAMESPACE("cl.clip_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _CLIP_KERNEL_SOURCE(IN_DTYPE) } + +#define PACK_KERNEL_MAP_2D( IN_DTYPE, OUT_DTYPE ) \ + { CLIP_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1 ), \ + CVIVANTE_NAMESPACE("cl.clip_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _CLIP_KERNEL_SOURCE(IN_DTYPE) } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _clip_kernel_map[] = +{ + PACK_KERNEL_MAP(F32, F32), + PACK_KERNEL_MAP(F32, U8), + PACK_KERNEL_MAP(U8, U8), + PACK_KERNEL_MAP(U8, F32), + PACK_KERNEL_MAP_2D(F32, F32), + PACK_KERNEL_MAP_2D(F32, U8), + PACK_KERNEL_MAP_2D(U8, U8), + PACK_KERNEL_MAP_2D(U8, F32), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _clip_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, 
VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CLIP_PARAM_NUM _cnt_of_array( _clip_kernel_param_def ) + +#define SCALAR_MIN_VALUE (2) +#define SCALAR_MAX_VALUE (3) +#define SCALAR_INPUT_SCALE (4) +#define SCALAR_INPUT_TAIL (5) +#define SCALAR_OUTPUT_SCALE (6) +#define SCALAR_OUTPUT_TAIL (7) + +#define CLIP_PARAM_NUM 4 +#define CLIP_QUANT_PARAM_NUM _cnt_of_array( _clip_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_clip_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_int_array_t * out_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _clip_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d, + vsi_bool *is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _clip_kernel_map; + size_t kernel_map_size = _cnt_of_array( _clip_kernel_map ); + vx_param_description_t * param_def = _clip_kernel_param_def; + size_t param_def_size = _cnt_of_array( _clip_kernel_param_def ); + vx_kernel_initialize_f initializer = _clip_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if ((U8 == in_dtype) || (U8 == out_dtype)) + { + param_def_size = CLIP_QUANT_PARAM_NUM; + *is_use_u8_kernel = TRUE; + } + else + { + param_def_size = CLIP_PARAM_NUM; + *is_use_u8_kernel = FALSE; + } + + key = CLIP_HASH_KEY( in_dtype, out_dtype, image_2d ); + + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + 
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; + +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CLIP_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; + float outputTail = (float)outputs[0]->attr.dtype.zero_point; + float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; + float inputTail = (float)inputs[0]->attr.dtype.zero_point; + vsi_bool is_use_u8_kernel = FALSE; + float min_value = vsi_nn_kernel_param_get_float32( params, "min_value" ); + float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); + + outputScale = 1.0f / outputScale; + inputTail = -(inputTail * inputScale); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + + status = _query_kernel( kernel, inputs, outputs, image_2d, &is_use_u8_kernel); + + if( VSI_SUCCESS == status) + { + size_t node_params_num = CLIP_PARAM_NUM; + + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CLIP_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_MIN_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &min_value ); + node_params[SCALAR_MAX_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &max_value ); + if (is_use_u8_kernel) + { + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail ); + node_params_num = CLIP_QUANT_PARAM_NUM; + } + /* Pass parameters to node. 
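+             * The min/max scalars are always passed; the quantization scale/tail
+             * scalars are appended only for the U8 kernels, which is why
+             * node_params_num switches between CLIP_PARAM_NUM and
+             * CLIP_QUANT_PARAM_NUM.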
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MIN_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_VALUE] ); + if (is_use_u8_kernel) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] ); + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( clip, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c new file mode 100644 index 0000000..856042d --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c @@ -0,0 +1,382 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/** Comparisons Kernel internal type */ +typedef enum +{ + COMP_GREAT = VSI_NN_RELATIONAL_OPS_GREAT, + COMP_GREAT_EQUAL = VSI_NN_RELATIONAL_OPS_GREAT_EQUAL, + COMP_LESS = VSI_NN_RELATIONAL_OPS_LESS, + COMP_LESS_EQUAL = VSI_NN_RELATIONAL_OPS_LESS_EQUAL, + COMP_NOT_EQUAL = VSI_NN_RELATIONAL_OPS_NOT_EQUAL, + COMP_EQUAL = VSI_NN_RELATIONAL_OPS_EQUAL, +} relational_type_e; + +/* + * Define kernel meta. 
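+ *
+ * Kernels are selected by a key that packs the relational op, both input
+ * dtypes, the BOOL8 output dtype and a 2D flag (op << 28 | in0 << 20 |
+ * in1 << 12 | out << 2 | image_2d, see HASH_COMPARISONS_KEY); F16, U8 and
+ * BOOL8 entries reuse the F32-, U32- and I32-named CL kernels respectively.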
+ */ +#define HASH_COMPARISONS_KEY(_type, _input0_type, _input1_type, _output_type, _image_2d) \ + ((_type << 28) | (_input0_type << 20) | (_input1_type << 12) | (_output_type << 2) | (_image_2d)) + +#define KERNEL_SOURCE_2D "relational_ops", +#define KERNEL_SOURCE_3D "relational_ops", + +#define HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, SRC0_TYPE, SRC1_TYPE) \ + CVIVANTE_NAMESPACE("cl."#FUNC_NAME"_"#SRC0_TYPE#SRC1_TYPE"toBOOL8") + +#define COMPARISONS_KERNELS_U32(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ + { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 0), \ + HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, U32, U32), \ + SOURCE }, + +#define HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, SRC0_TYPE, SRC1_TYPE) \ + CVIVANTE_NAMESPACE("cl."#FUNC_NAME"_"#SRC0_TYPE#SRC1_TYPE"toBOOL8_2D") + +#define COMPARISONS_KERNELS_U32_2D(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ + { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 1), \ + HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, U32, U32), \ + SOURCE }, + +#define COMPARISONS_KERNELS_I32(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ + { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 0), \ + HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, I32, I32), \ + SOURCE }, + +#define COMPARISONS_KERNELS_I32_2D(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ + { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 1), \ + HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, I32, I32), \ + SOURCE }, + +#define COMPARISONS_KERNELS_F32(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ + { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 0), \ + HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, F32, F32), \ + SOURCE }, + +#define COMPARISONS_KERNELS_F32_2D(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ + { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 1), \ + HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, F32, F32), \ + SOURCE }, + +#define LESS_OP less +#define GREAT_OP great +#define LESS_EQUAL_OP less_equal +#define GREAT_EQUAL_OP great_equal +#define EQUAL_OP equal +#define NOT_EQUAL_OP not_equal + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } _comparisons_cl_kernel_map[] = +{ + COMPARISONS_KERNELS_F32(LESS_OP, COMP_LESS, F16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_F32(LESS_OP, COMP_LESS, F32, F32, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_U32(LESS_OP, COMP_LESS, U8, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_I32(LESS_OP, COMP_LESS, I32, I32, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_I32(LESS_OP, COMP_LESS, BOOL8, BOOL8, KERNEL_SOURCE_3D) + + COMPARISONS_KERNELS_F32_2D(LESS_OP, COMP_LESS, F16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_F32_2D(LESS_OP, COMP_LESS, F32, F32, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_U32_2D(LESS_OP, COMP_LESS, U8, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_I32_2D(LESS_OP, COMP_LESS, I32, I32, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_I32_2D(LESS_OP, COMP_LESS, BOOL8, BOOL8, KERNEL_SOURCE_2D) + + COMPARISONS_KERNELS_F32(GREAT_OP, COMP_GREAT, F16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_F32(GREAT_OP, COMP_GREAT, F32, F32, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_U32(GREAT_OP, COMP_GREAT, U8, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_I32(GREAT_OP, COMP_GREAT, I32, I32, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_I32(GREAT_OP, COMP_GREAT, BOOL8, BOOL8, KERNEL_SOURCE_3D) + + COMPARISONS_KERNELS_F32_2D(GREAT_OP, COMP_GREAT, F16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_F32_2D(GREAT_OP, COMP_GREAT, F32, F32, KERNEL_SOURCE_2D) + 
COMPARISONS_KERNELS_U32_2D(GREAT_OP, COMP_GREAT, U8, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_I32_2D(GREAT_OP, COMP_GREAT, I32, I32, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_I32_2D(GREAT_OP, COMP_GREAT, BOOL8, BOOL8, KERNEL_SOURCE_2D) + + COMPARISONS_KERNELS_F32(LESS_EQUAL_OP, COMP_LESS_EQUAL, F16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_F32(LESS_EQUAL_OP, COMP_LESS_EQUAL, F32, F32, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_U32(LESS_EQUAL_OP, COMP_LESS_EQUAL, U8, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_I32(LESS_EQUAL_OP, COMP_LESS_EQUAL, I32, I32, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_I32(LESS_EQUAL_OP, COMP_LESS_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_3D) + + COMPARISONS_KERNELS_F32_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, F16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_F32_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, F32, F32, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_U32_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, U8, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_I32_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, I32, I32, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_I32_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_2D) + + COMPARISONS_KERNELS_F32(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, F16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_F32(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, F32, F32, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_U32(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, U8, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_I32(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, I32, I32, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_I32(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_3D) + + COMPARISONS_KERNELS_F32_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, F16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_F32_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, F32, F32, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_U32_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, U8, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_I32_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, I32, I32, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_I32_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_2D) + + COMPARISONS_KERNELS_F32(EQUAL_OP, COMP_EQUAL, F16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_F32(EQUAL_OP, COMP_EQUAL, F32, F32, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_U32(EQUAL_OP, COMP_EQUAL, U8, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_I32(EQUAL_OP, COMP_EQUAL, I32, I32, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_I32(EQUAL_OP, COMP_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_3D) + + COMPARISONS_KERNELS_F32_2D(EQUAL_OP, COMP_EQUAL, F16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_F32_2D(EQUAL_OP, COMP_EQUAL, F32, F32, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_U32_2D(EQUAL_OP, COMP_EQUAL, U8, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_I32_2D(EQUAL_OP, COMP_EQUAL, I32, I32, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_I32_2D(EQUAL_OP, COMP_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_2D) + + COMPARISONS_KERNELS_F32(NOT_EQUAL_OP, COMP_NOT_EQUAL, F16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_F32(NOT_EQUAL_OP, COMP_NOT_EQUAL, F32, F32, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_U32(NOT_EQUAL_OP, COMP_NOT_EQUAL, U8, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_I32(NOT_EQUAL_OP, COMP_NOT_EQUAL, I32, I32, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_I32(NOT_EQUAL_OP, COMP_NOT_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_3D) + + COMPARISONS_KERNELS_F32_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, F16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_F32_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, F32, F32, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_U32_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, U8, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_I32_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, I32, 
I32, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_I32_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_2D) +}; + +#undef LESS_OP +#undef GREAT_OP +#undef LESS_EQUAL_OP +#undef GREAT_EQUAL_OP +#undef EQUAL_OP +#undef NOT_EQUAL_OP + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define SCALAR_INPUT0_SCALE (3) +#define SCALAR_INPUT0_TAIL (4) +#define SCALAR_INPUT1_SCALE (5) +#define SCALAR_INPUT1_TAIL (6) +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_comparisons_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_shape = attr[2]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} /* _comparisons_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + int32_t operation, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input0_dtype; + vsi_nn_kernel_dtype_e input1_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_COMPARISONS_KEY( operation, input0_dtype, input1_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(_comparisons_cl_kernel_map); i ++ ) + { + if( _comparisons_cl_kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(_comparisons_cl_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _comparisons_cl_kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _comparisons_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + _comparisons_cl_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _comparisons_cl_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + int32_t operation = 0; + + float input0Scale = inputs[0]->attr.dtype.scale; + float input0Tail = (float)inputs[0]->attr.dtype.zero_point * input0Scale; + float input1Scale = inputs[1]->attr.dtype.scale; + float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + operation = vsi_nn_kernel_param_get_int32( params, "operation" ); + + image_2d = (outputs[0]->attr.dim_num == 2); + status = _query_kernel( inputs, outputs, operation, image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if( node ) + { + vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, + inputs, 2, outputs, 1 ); + node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &input0Scale ); + node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &input0Tail ); + node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &input1Scale ); + node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &input1Tail ); + + /* Pass parameters to node. 
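+             * Besides the two inputs and the BOOL8 output, per-input scale and
+             * tail scalars are passed, presumably so quantized operands can be
+             * converted to float inside the CL kernel before comparison.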
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] ); + } + } + return node; +} /* _setup() */ + + +REGISTER_BACKEND_CL( relational_ops, _setup ) + +__END_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c new file mode 100644 index 0000000..2fe07d1 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c @@ -0,0 +1,314 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
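+ * Kernels are registered in a flat map keyed by DETECT_POST_BOX_HASH_KEY, which
+ * packs the two input dtypes and the output dtype into a single integer;
+ * _query_kernel rebuilds the same key from the runtime tensor dtypes to select
+ * the matching CL program.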
+ */ +typedef enum +{ + INTERNAL_KERNEL_DETECT_POST_BOX, +} _internal_kernel_e; + +#define _DETECT_POST_BOX_KERNEL_SOURCE "detect_post_box" + +#define STR(a) #a +// Add kernel hashtable here +#define DETECT_POST_BOX_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + ((IN0_DTYPE << 18) | ( IN1_DTYPE << 11 ) | ( OUT_DTYPE << 4)) + +#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { DETECT_POST_BOX_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ + CVIVANTE_NAMESPACE("cl.detect_post_box_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ + _DETECT_POST_BOX_KERNEL_SOURCE} + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _detect_post_box_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32, F32 ), + PACK_KERNEL_MAP( U8, U8, F32 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _detect_post_box_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _DETECT_POST_BOX_PARAM_NUM _cnt_of_array( _detect_post_box_kernel_param_def ) + +#define _DETECT_POST_BOX_F32_PARAM_NUM 8 + +#define SCALAR_SCALE_Y (3) +#define SCALAR_SCALE_X (4) +#define SCALAR_SCALE_H (5) +#define SCALAR_SCALE_W (6) +#define SCALAR_LOG_E (7) +#define SCALAR_TAIL0 (8) +#define SCALAR_TAIL1 (9) +#define SCALAR_SCALE0 (10) +#define SCALAR_SCALE1 (11) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_int_array_t * in_shape = NULL; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + in_shape = input_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = 2; + gpu_param.global_size[0] = ( + (in_shape->data[1] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]); + gpu_param.global_size[1] = ( + (in_shape->data[2] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(input_attr); + + return status; +} /* _detect_post_box_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool *is_use_u8_kernel + ) +{ + vsi_status 
status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _detect_post_box_kernel_map; + size_t kernel_map_size = _cnt_of_array( _detect_post_box_kernel_map ); + vx_param_description_t * param_def = _detect_post_box_kernel_param_def; + size_t param_def_size = _cnt_of_array( _detect_post_box_kernel_param_def ); + vx_kernel_initialize_f initializer = _detect_post_box_initializer; + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + + if ((U8 == in0_dtype) && (U8 == in1_dtype)) + { + *is_use_u8_kernel = TRUE; + param_def_size = _DETECT_POST_BOX_PARAM_NUM; + } + else + { + *is_use_u8_kernel = FALSE; + param_def_size = _DETECT_POST_BOX_F32_PARAM_NUM; + + } + + key = DETECT_POST_BOX_HASH_KEY( in0_dtype, in1_dtype, out_dtype ); + + for ( i = 0; i < kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (vx_uint32)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_DETECT_POST_BOX_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + float logE = (float)(log10(exp(1.0f)) / log10(2.0f)); + float inv_scale_y = vsi_nn_kernel_param_get_float32( params, "inv_scale_y" ); + float inv_scale_x = vsi_nn_kernel_param_get_float32( params, "inv_scale_x" ); + float inv_scale_h = vsi_nn_kernel_param_get_float32( params, "inv_scale_h" ); + float inv_scale_w = vsi_nn_kernel_param_get_float32( params, "inv_scale_w" ); + vsi_bool is_use_u8_kernel = FALSE; + float input0Scale = 1.0f; + float input0Zp = 0.0f; + float input0Tail = 0.0f; + float input1Scale = 1.0f; + float input1Zp = 0.0f; + float input1Tail = 0.0f; + + status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel ); + + if ( inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) + { + input0Zp = (float)inputs[0]->attr.dtype.zero_point;; + input0Scale = inputs[0]->attr.dtype.scale; + input0Tail = -input0Zp * input0Scale; + } + + if ( inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) + { + input1Zp = (float)inputs[1]->attr.dtype.zero_point;; + input1Scale = inputs[1]->attr.dtype.scale; + input1Tail = -input1Zp * input1Scale; + } + + if ( VSI_SUCCESS == status ) + { + size_t node_params_num = _DETECT_POST_BOX_F32_PARAM_NUM; + + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _DETECT_POST_BOX_PARAM_NUM, + inputs, input_num, outputs, output_num ); + 
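/* SCALAR_SCALE_Y..SCALAR_LOG_E are always set (the box decode scales plus
+             * log2(e), computed above as log10(e)/log10(2)); the U8 branch below
+             * additionally passes each input's tail (-zero_point * scale) and scale. */
+            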
node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_y ); + node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_x ); + node_params[SCALAR_SCALE_H] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_h ); + node_params[SCALAR_SCALE_W] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_w ); + node_params[SCALAR_LOG_E] = vsi_nn_kernel_scalar_create( graph, F32, &logE ); + if (is_use_u8_kernel) + { + node_params[SCALAR_TAIL0] = vsi_nn_kernel_scalar_create( graph, F32, &input0Tail ); + node_params[SCALAR_TAIL1] = vsi_nn_kernel_scalar_create( graph, F32, &input1Tail ); + node_params[SCALAR_SCALE0] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale ); + node_params[SCALAR_SCALE1] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale ); + node_params_num = _DETECT_POST_BOX_PARAM_NUM; + } + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_H] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_W] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_LOG_E] ); + if (is_use_u8_kernel) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL0] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL1] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE0] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE1] ); + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( detect_post_box, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c new file mode 100644 index 0000000..193be18 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c @@ -0,0 +1,191 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS +#if 0 +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_DETECT_POST_NMS, +} _internal_kernel_e; + +#define _DETECT_POST_NMS_KERNEL_SOURCE "detect_post_nms" +#define _DETECT_POST_NMS_KERNEL_NAME CVIVANTE_NAMESPACE("cl.detect_post_nms") + +// Add kernel hashtable here +#define DETECT_POST_NMS_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, SOURCE ) \ + { DETECT_POST_NMS_HASH_KEY( IN_DTYPE, OUT_DTYPE ), _DETECT_POST_NMS_KERNEL_NAME, SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _detect_post_nms_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32, _DETECT_POST_NMS_KERNEL_SOURCE ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _detect_post_nms_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _DETECT_POST_NMS_PARAM_NUM _cnt_of_array( _detect_post_nms_kernel_param_def ) + +#define SCALAR_NMS_TYPE (6) +#define SCALAR_MAX_NUM (7) +#define SCALAR_MAX_CLASS (8) +#define SCALAR_MAX_DETECT (9) +#define SCALAR_SCORE_TH (10) +#define SCALAR_IOU_TH (11) +#define SCALAR_IS_BG (12) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_detect_post_nms_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + + return status; +} /* _detect_post_nms_initializer() */ + + + +/* + * Query kernel + */ + +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _detect_post_nms_kernel_map; + size_t kernel_map_size = _cnt_of_array( _detect_post_nms_kernel_map ); + vx_param_description_t * param_def = _detect_post_nms_kernel_param_def; + size_t param_def_size = _cnt_of_array( _detect_post_nms_kernel_param_def ); + vx_kernel_initialize_f initializer = _detect_post_nms_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = 
DETECT_POST_NMS_HASH_KEY( in_dtype, out_dtype ); + + for ( i = 0; i < kernel_map_size; i++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ +#endif + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_nn_kernel_node_t node = NULL; + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( detect_post_nms, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c new file mode 100644 index 0000000..b2afa0a --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -0,0 +1,401 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/** Unary Kernel internal type */ +typedef enum +{ + UNARY_SIN, + UNARY_EXP, + UNARY_LOG, + UNARY_ELU, + UNARY_NEG, + UNARY_HSIGMOID, + UNARY_MISH, +} unary_type_e; + +/* + * Define kernel meta. 
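+ * Entries are keyed by HASH_UNARY_KEY(op, input dtype, output dtype, 2D flag).
+ * The *_FLOAT table macros reuse the F32-named CL function for F16 tensors, so
+ * half-precision shapes share the float kernels; U8 and I32 get their own entries.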
+ */ +#define HASH_UNARY_KEY(_type, _input_type, _output_type, _image_2d) \ + ((_type << 20) | (_input_type << 12) | (_output_type << 4) | (_image_2d)) + + #define VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() \ + "eltwise_unary" + +#define HASH_UNARY_SH_KERNEL_NAME(FUNC_NAME, SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl."#FUNC_NAME"_"#SRC_TYPE"to"#DST_TYPE) + +#define TENSOR_UNARY_KERNELS(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(TYPE, SRC_TYPE, OUT_TYPE, 0), \ + HASH_UNARY_SH_KERNEL_NAME(FUNC_NAME, SRC_TYPE, OUT_TYPE), \ + VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() }, + +#define HASH_UNARY_SH_KERNEL_2D_NAME(FUNC_NAME, SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl."#FUNC_NAME"_"#SRC_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_UNARY_KERNELS_2D(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(TYPE, SRC_TYPE, OUT_TYPE, 1), \ + HASH_UNARY_SH_KERNEL_2D_NAME(FUNC_NAME, SRC_TYPE, OUT_TYPE), \ + VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() }, + +#define TENSOR_UNARY_KERNELS_FLOAT(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(TYPE, SRC_TYPE, OUT_TYPE, 0), \ + HASH_UNARY_SH_KERNEL_NAME(FUNC_NAME, F32, F32), \ + VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() }, + +#define TENSOR_UNARY_KERNELS_FLOAT_2D(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(TYPE, SRC_TYPE, OUT_TYPE, 1), \ + HASH_UNARY_SH_KERNEL_2D_NAME(FUNC_NAME, F32, F32), \ + VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() }, + +#define SIN_OPERATION sin +#define EXP_OPERATION exp +#define LOG_OPERATION log +#define ELU_OPERATION elu +#define NEG_OPERATION neg +#define HSIGMOID_OPERATION hard_sigmoid +#define MISH_OPERATION mish + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_UNARY_KERNELS_FLOAT(SIN_OPERATION, UNARY_SIN, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT(SIN_OPERATION, UNARY_SIN, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT(EXP_OPERATION, UNARY_EXP, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT(EXP_OPERATION, UNARY_EXP, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT(LOG_OPERATION, UNARY_LOG, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT(LOG_OPERATION, UNARY_LOG, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT(ELU_OPERATION, UNARY_ELU, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT(ELU_OPERATION, UNARY_ELU, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT(NEG_OPERATION, UNARY_NEG, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT(NEG_OPERATION, UNARY_NEG, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F16, F16) + + TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT_2D(EXP_OPERATION, UNARY_EXP, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT_2D(EXP_OPERATION, UNARY_EXP, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT_2D(LOG_OPERATION, UNARY_LOG, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT_2D(LOG_OPERATION, UNARY_LOG, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT_2D(ELU_OPERATION, UNARY_ELU, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT_2D(ELU_OPERATION, UNARY_ELU, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT_2D(NEG_OPERATION, UNARY_NEG, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT_2D(NEG_OPERATION, UNARY_NEG, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, 
UNARY_MISH, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, UNARY_MISH, F16, F16) + + TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8) + TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8) + TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, U8, U8) + TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, U8, U8) + TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, U8, U8) + TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8) + TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, U8, U8) + + TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8) + TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8) + TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8) + TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, U8, U8) + TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8) + TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8) + TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8) + + TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I32, I32) + + TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I32, I32) +}; + +#undef SIN_OPERATION +#undef EXP_OPERATION +#undef LOG_OPERATION +#undef ELU_OPERATION +#undef NEG_OPERATION +#undef HSIGMOID_OPERATION +#undef MISH_OPERATION +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define SCALAR_INPUT_SCALE (2) +#define SCALAR_INPUT_TAIL (3) +#define SCALAR_OUTPUT_SCALE (4) +#define SCALAR_OUTPUT_ZP (5) +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _eltwise_unary_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + int32_t type, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_UNARY_KEY( type, input_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _eltwise_unary_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel, + const unary_type_e unary_type + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* rs_tensors[2] = { NULL }; + int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_rank = 0; + vsi_bool ret; + + float inputScale = inputs[0]->attr.dtype.scale; + float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; + float outputScale = outputs[0]->attr.dtype.scale; + float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + + ret = vsi_nn_kernel_optimize_element_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); + if( ret ) + { + rs_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shape, new_rank ); + rs_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shape, new_rank ); + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[0]->attr.size, + rs_tensors[0]->attr.dim_num ) ) + { + return NULL; + } + + outputScale = vsi_abs(outputScale) < 1e-5 ? 
0.0f : 1.0f / outputScale; + + image_2d = (rs_tensors[0]->attr.dim_num == 2 || rs_tensors[0]->attr.size[2] == 1); + status = _query_kernel( rs_tensors, &rs_tensors[1], unary_type, image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if( node ) + { + vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, + rs_tensors, 1, &rs_tensors[1], 1 ); + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &inputScale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &inputTail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &outputZP ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, OnError ); + } + } + +OnError: + if (rs_tensors[0]) + { + vsi_nn_ReleaseTensor( &rs_tensors[0] ); + } + + if (rs_tensors[1]) + { + vsi_nn_ReleaseTensor( &rs_tensors[1] ); + } + + if (node_params[SCALAR_INPUT_SCALE]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + } + + if (node_params[SCALAR_INPUT_TAIL]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + } + + if (node_params[SCALAR_OUTPUT_SCALE]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + } + + if (node_params[SCALAR_OUTPUT_ZP]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + + return node; +} /* _setup() */ + +#define REGISTER_ELTWISE_UNARY_BACKEND_CL(KERNEL_NAME, UNARY_TYPE) \ + static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) \ + { \ + return _setup(graph, inputs, input_num, outputs, output_num, \ + params, kernel, UNARY_TYPE); \ + } \ + REGISTER_BACKEND_CL( KERNEL_NAME, _##KERNEL_NAME##_setup ) + + +REGISTER_ELTWISE_UNARY_BACKEND_CL( sin, UNARY_SIN ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( exp, UNARY_EXP ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( log, UNARY_LOG ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( elu, UNARY_ELU ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( neg, UNARY_NEG ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_sigmoid, UNARY_HSIGMOID ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( mish, UNARY_MISH ) + +__END_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c new file mode 100644 index 0000000..831e27c --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c @@ -0,0 +1,314 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define FLOORDIV_HASH_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) + + + #define FLOORDIV_KERNEL_SOURCE_NAME \ + "floordiv" + +#define FLOORDIV_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + { FLOORDIV_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + CVIVANTE_NAMESPACE("cl.floordiv_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE), \ + FLOORDIV_KERNEL_SOURCE_NAME }, + +#define FLOORDIV_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + { FLOORDIV_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + CVIVANTE_NAMESPACE("cl.floordiv_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE"_2D"), \ + FLOORDIV_KERNEL_SOURCE_NAME }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _floordiv_kernel_map[] = +{ + // Register kernel here + FLOORDIV_KERNELS( F32, F32, F32 ) + FLOORDIV_KERNELS( I32, I32, I32 ) + FLOORDIV_KERNELS( U8, U8, U8 ) + + FLOORDIV_KERNELS_2D( F32, F32, F32 ) + FLOORDIV_KERNELS_2D( I32, I32, I32 ) + FLOORDIV_KERNELS_2D( U8, U8, U8 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _floordiv_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _FLOORDIV_PARAM_NUM _cnt_of_array( _floordiv_kernel_param_def ) + +#define SCALAR_INPUT0_SCALE (3) +#define SCALAR_INPUT0_TAIL (4) +#define SCALAR_INPUT1_SCALE (5) +#define SCALAR_INPUT1_TAIL (6) +#define SCALAR_OUTPUT_SCALE (7) +#define SCALAR_OUTPUT_TAIL (8) + +#define FLOORDIV_PARAM_NUM 3 +#define FLOORDIV_QUANT_PARAM_NUM _cnt_of_array( _floordiv_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_floordiv_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vx_status status = VX_FAILURE; + vx_tensor output = (vx_tensor)param[2]; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t *output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + 
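/* Work sizes are derived from the output shape below: dim 0 is rounded up to a
+     * multiple of 4 via gpu_align_p2, and dim 2 collapses to 1 when the output is 2D. */
+    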
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + + gpu_param.dim = output_shape->size < 3 ? 2 : 3; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = output_shape->size > 2 ? output_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + + return status; +} /* _floordiv_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d, + vsi_bool *is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _floordiv_kernel_map; + size_t kernel_map_size = _cnt_of_array( _floordiv_kernel_map ); + vx_param_description_t * param_def = _floordiv_kernel_param_def; + size_t param_def_size = _cnt_of_array( _floordiv_kernel_param_def ); + vx_kernel_initialize_f initializer = _floordiv_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in0_dtype) + { + in0_dtype = F32; + } + + if (F16 == in1_dtype) + { + in1_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if ((U8 == in0_dtype) || (U8 == in1_dtype) || (U8 == out_dtype)) + { + param_def_size = FLOORDIV_QUANT_PARAM_NUM; + *is_use_u8_kernel = TRUE; + } + else + { + param_def_size = FLOORDIV_PARAM_NUM; + *is_use_u8_kernel = FALSE; + } + + key = FLOORDIV_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_FLOORDIV_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 
1.0f : outputs[0]->attr.dtype.scale; + float outputTail = (float)outputs[0]->attr.dtype.zero_point; + float input0Scale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; + float input0Tail = (float)inputs[0]->attr.dtype.zero_point; + float input1Scale = inputs[1]->attr.dtype.scale == 0.0f ? 1.0f : inputs[1]->attr.dtype.scale; + float input1Tail = (float)inputs[1]->attr.dtype.zero_point; + vsi_bool is_use_u8_kernel = FALSE; + + outputScale = 1.0f / outputScale; + input0Tail = -(input0Tail * input0Scale); + input1Tail = -(input1Tail * input1Scale); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2); + + status = _query_kernel( kernel, inputs, outputs, image_2d, &is_use_u8_kernel); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + size_t node_params_num = FLOORDIV_PARAM_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _FLOORDIV_PARAM_NUM, + inputs, input_num, outputs, output_num ); + if (is_use_u8_kernel) + { + node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale ); + node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0Tail ); + node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale ); + node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input1Tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail ); + node_params_num = FLOORDIV_QUANT_PARAM_NUM; + } + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + if (is_use_u8_kernel) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] ); + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( floordiv, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c new file mode 100644 index 0000000..111f66f --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -0,0 +1,328 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_GATHER, +} _internal_kernel_e; + +#define _GATHER_KERNEL_SOURCE "gather" + +// Add kernel hashtable here +#define VX_KERNEL_NAME_GATHER_U8TOU8 CVIVANTE_NAMESPACE("cl.gather_U8toU8") +#define VX_KERNEL_NAME_GATHER_F16TOF16 CVIVANTE_NAMESPACE("cl.gather_F16toF16") +#define VX_KERNEL_NAME_GATHER_I32TOI32 CVIVANTE_NAMESPACE("cl.gather_I32toI32") +#define VX_KERNEL_NAME_GATHER_F32TOF32 CVIVANTE_NAMESPACE("cl.gather_F32toF32") + +// Add kernel hashtable here +#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) + +#define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0), \ + VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } gather_map[] = +{ + TENSOR_GATHER_KERNELS(U8, I32, U8, _GATHER_KERNEL_SOURCE) + TENSOR_GATHER_KERNELS(F16, I32, F16, _GATHER_KERNEL_SOURCE) + TENSOR_GATHER_KERNELS(I32, I32, I32, _GATHER_KERNEL_SOURCE) + TENSOR_GATHER_KERNELS(F32, I32, F32, _GATHER_KERNEL_SOURCE) +}; + +/* + * Kernel params + */ +static vx_param_description_t _gather_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def ) + +static vsi_status cal_gather_tensor_reshape_size + ( + vsi_nn_tensor_t ** inputs, + int32_t sizes[VSI_NN_MAX_DIM_NUM], + uint32_t block_size, + uint32_t idxFlg + ) +{ + vsi_status status = VSI_FAILURE; + uint32_t dims_num = inputs[0]->attr.dim_num; + uint32_t *input_size = inputs[0]->attr.size; + uint32_t i = 0; + uint32_t elementCnt = 1; +#define VSI_NN_MAX_IMAGE_WIDTH (65536) + + for(i = 0; i < dims_num; ++i) + { + elementCnt *= input_size[i]; + } + + for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + { + sizes[i] = 1; + } + + if(idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH) + { + sizes[0] = elementCnt; + sizes[1] = 1; + status = VSI_SUCCESS; + } + else + { + if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + { + sizes[0] = block_size; + sizes[1] = elementCnt / 
block_size; + status = VSI_SUCCESS; + } + } +#undef VSI_NN_MAX_IMAGE_WIDTH + + return status; +} /* _get_EltOP_tensor_reshape_size */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_gather_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * input1_shape = NULL; + int32_t block_size = 0; + int32_t block_num = 0; + int32_t indices_num = 1; + uint32_t input_dims1 = 0; + vx_uint32 i = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num); + CHECK_STATUS_FAIL_GOTO(status, final ); + + input1_shape = attr[1]->shape; + input_dims1 = (uint32_t)input1_shape->size; + for (i = 0; i < input_dims1; i++) + { + indices_num *= input1_shape->data[i]; + } + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = indices_num; + gpu_param.global_size[2] = block_num; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + return status; +} /* _gather_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0 ); + + for( i = 0; i < _cnt_of_array(gather_map); i ++ ) + { + if( gather_map[i].key == key ) + { + break; + } + } + + if( i < _cnt_of_array(gather_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", gather_map[i].function_name ); + kernel->info.parameters = _gather_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _gather_kernel_param_def ); + kernel->info.initialize = _gather_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + gather_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + gather_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + 
size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GATHER_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); + int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); + int32_t indices_num = vsi_nn_kernel_param_get_int32( params, "indices_num" ); + + status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0); + status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1); + status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0); + if(status != VSI_SUCCESS) + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 0; +#define RESHAPE_DIM 2 + /* Pass parameters to node. */ + node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], RESHAPE_DIM ); + node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[1], RESHAPE_DIM ); + node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], RESHAPE_DIM ); +#undef RESHAPE_DIM + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &indices_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GATHER_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &node_params[0] ); + vsi_nn_kernel_tensor_release( &node_params[1] ); + vsi_nn_kernel_tensor_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( gather, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c new file mode 100644 index 0000000..1927146 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c @@ -0,0 +1,334 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define KERNEL_SOURCE_1 "gather_nd" +#define KERNEL_SOURCE_2 "gather_nd_3d" + + typedef enum +{ + _1D = 0, + _2D, + _3D +} vsi_nn_kernel_coord_type_e; + +#define HASH_GATHER_ND_KEY(_input0_type, _input1_type, _output_type, _coord_dim) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_coord_dim)) + +#define HASH_GATHER_ND_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \ + CVIVANTE_NAMESPACE("cl.gather_nd_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE) + +#define TENSOR_GATHER_ND_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \ + { HASH_GATHER_ND_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE), \ + HASH_GATHER_ND_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } gather_nd_map[] = +{ + TENSOR_GATHER_ND_KERNELS(U8, I32, U8, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_KERNELS(F16, I32, F16, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_KERNELS(I32, I32, I32, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_KERNELS(F32, I32, F32, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_KERNELS(I32, I32, I32, _2D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_KERNELS(F32, I32, F32, _2D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_KERNELS(U8, I32, U8, _3D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_KERNELS(F16, I32, F16, _3D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_KERNELS(I32, I32, I32, _3D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_KERNELS(F32, I32, F32, _3D, KERNEL_SOURCE_2) +}; + +/* + * Kernel params + */ +static vx_param_description_t _gather_nd_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _GATHER_ND_PARAM_NUM _cnt_of_array(_gather_nd_kernel_param_def) + +static vsi_status cal_gather_nd_tensor_reshape_size + ( + vsi_nn_tensor_t ** inputs, + int32_t sizes[VSI_NN_MAX_DIM_NUM], + uint32_t block_size, + uint32_t coordDim, + int32_t* newDim + ) +{ + vsi_status status = VSI_FAILURE; + uint32_t dims_num = inputs[0]->attr.dim_num; + uint32_t *input_size = inputs[0]->attr.size; + uint32_t i = 0; + uint32_t elementCnt = 1; +#define VSI_NN_MAX_IMAGE_WIDTH (65536) + + newDim[0] = 0; + for(i = 0; i < dims_num; ++i) + { + elementCnt *= input_size[i]; + } + + for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) 
+ { + sizes[i] = 1; + } + + if(coordDim) // input reshape + { + uint32_t offset = dims_num - coordDim + 1; + for(i = coordDim-1; i > 0; i--) + { + sizes[i] = input_size[i + offset - 1]; + } + for(i = 0; i < offset; i++) + { + sizes[0] *= input_size[i]; + } + + newDim[0] = coordDim; + if(coordDim == 1) + { + newDim[0] = 2; + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + } + + status = VSI_SUCCESS; + } + else // indices&output reshape + { + if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + { + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + status = VSI_SUCCESS; + newDim[0] = 2; + } + } +#undef VSI_NN_MAX_IMAGE_WIDTH + + return status; +} /* _get_EltOP_tensor_reshape_size */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_gather_nd_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t block_size = 0; + int32_t indices_num = 1; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size); + CHECK_STATUS_FAIL_GOTO(status, final ); + + indices_num = attr[0]->shape->data[1]; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = indices_num; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _gather_nd_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t coord_dim + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_kernel_coord_type_e coord_type = _1D; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if(coord_dim == 1) + { + coord_type = _1D; + } + else if(coord_dim == 2) + { + coord_type = _2D; + } + else if(coord_dim == 3) + { + coord_type = _3D; + } + + key = HASH_GATHER_ND_KEY( input0_dtype, I32, output_dtype, coord_type ); + + for( i = 0; i < _cnt_of_array(gather_nd_map); i ++ ) + { + if( gather_nd_map[i].key == key ) + { + break; + } + } + + if( i < _cnt_of_array(gather_nd_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", gather_nd_map[i].function_name ); + kernel->info.parameters = _gather_nd_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _gather_nd_kernel_param_def ); + kernel->info.initialize = _gather_nd_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + gather_nd_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + gather_nd_map[i].source_name ); + status = VSI_SUCCESS; + } + return 
status; +} /* _query_kernel() */ +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GATHER_ND_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + + status = cal_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim); + status |= cal_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim); + status |= cal_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim); + if(status != VSI_SUCCESS) + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, coord_dim ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 0; + /* Pass parameters to node. */ + node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], rs_in_dim ); + node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[1], rs_idx_dim ); + node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GATHER_ND_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &node_params[0] ); + vsi_nn_kernel_tensor_release( &node_params[1] ); + vsi_nn_kernel_tensor_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( gather_nd, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c new file mode 100644 index 0000000..88cd40f --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c @@ -0,0 +1,213 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
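Before the GRU-cell activation kernel that starts in the next file, it is worth illustrating the idiom the gather_nd backend above relies on: the input, index, and output dtypes plus the coordinate rank are packed into one 32-bit key, and that key is matched against a static table of prebuilt CL kernels. The following is a minimal standalone C sketch of that idiom; the enum values, table entries, and kernel-name strings here are illustrative only and are not part of this commit or the library API.

/* Standalone sketch of the key-packing / table-lookup pattern used above. */
#include <stdint.h>
#include <stdio.h>

enum { DT_U8 = 0, DT_I32 = 1, DT_F16 = 2, DT_F32 = 3 };   /* hypothetical dtype codes */
enum { COORD_1D = 0, COORD_2D, COORD_3D };                 /* coordinate rank */

#define PACK_KEY(in0, in1, out, coord) \
    (((uint32_t)(in0) << 24) | ((uint32_t)(in1) << 16) | ((uint32_t)(out) << 8) | (uint32_t)(coord))

static const struct { uint32_t key; const char *kernel_name; } table[] = {
    { PACK_KEY(DT_U8,  DT_I32, DT_U8,  COORD_1D), "cl.gather_nd_U8toU8_1D"   },
    { PACK_KEY(DT_F32, DT_I32, DT_F32, COORD_3D), "cl.gather_nd_F32toF32_3D" },
};

static const char *lookup(uint32_t key)
{
    size_t i;
    for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        if (table[i].key == key) return table[i].kernel_name;
    }
    return NULL; /* no specialized kernel registered for this combination */
}

int main(void)
{
    printf("%s\n", lookup(PACK_KEY(DT_F32, DT_I32, DT_F32, COORD_3D)));
    return 0;
}

A linear scan is sufficient here because the tables hold at most a few dozen entries and the lookup runs once per node creation, not per inference.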
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_GRUCELL_ACTIVATION, +} _internal_kernel_e; + +#define _GRUCELL_ACTIVATION_KERNEL_SOURCE "grucell_activation" +#define _GRUCELL_ACTIVATION_KERNEL_NAME CVIVANTE_NAMESPACE("cl.grucell_activation") + +// Add kernel hashtable here +#define GRUCELL_ACTIVATION_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, SOURCE ) \ + { GRUCELL_ACTIVATION_HASH_KEY( IN_DTYPE, OUT_DTYPE ), _GRUCELL_ACTIVATION_KERNEL_NAME, SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_activation_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32, _GRUCELL_ACTIVATION_KERNEL_SOURCE ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_activation_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GRUCELL_ACTIVATION_PARAM_NUM _cnt_of_array( _grucell_activation_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_activation_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + // vsi_nn_kernel_tensor_attr * attr[2] = { NULL }; + // attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + // attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + + // Add initializer + + // vsi_nn_kernel_tensor_attr_release( &attr[0] ); + // vsi_nn_kernel_tensor_attr_release( &attr[1] ); + return status; +} /* _grucell_activation_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _grucell_activation_kernel_map; + size_t kernel_map_size = _cnt_of_array( _grucell_activation_kernel_map ); + vx_param_description_t * param_def = _grucell_activation_kernel_param_def; + size_t param_def_size = _cnt_of_array( _grucell_activation_kernel_param_def ); + vx_kernel_initialize_f initializer = _grucell_activation_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = GRUCELL_ACTIVATION_HASH_KEY( in_dtype, out_dtype ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; 
+ } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + /* + // Check if gpu can support the size + if( !vsi_nn_kernel_gpu_check_shape( + (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) + { + return NULL; + } + */ + + /* + * Use kernel param + * int32_t integer = vsi_nn_kernel_param_get_int32( param, "data_key_i32" ); + * int64_t integer = vsi_nn_kernel_param_get_int64( param, "hashkey" ); + * float fp = vsi_nn_kernel_param_get_float32( param, "data_key_f32" ); + * const char * str = vsi_nn_kernel_param_get_char( param, "padding" ); + * size_t buffer_size; + * int * buffer = (int*)vsi_nn_kernel_param_get_buffer( param, "padding", &buffer_size ); + */ + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_ACTIVATION_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_PARAM_NUM ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( grucell_activation, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c new file mode 100644 index 0000000..9fb557f --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c @@ -0,0 +1,213 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
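The GRU-cell activation backend above, like every other kernel file in this commit, sizes its parameter tables with `_cnt_of_array`. That macro is defined elsewhere in the ovxlib utility headers; assuming it is the conventional element-count macro, the idiom looks like this small sketch (names here are illustrative, not the library's):

/* Sketch of the element-count idiom behind _cnt_of_array (assumed definition). */
#include <stdio.h>

#define CNT_OF_ARRAY(arr) (sizeof(arr) / sizeof((arr)[0]))

int main(void)
{
    int param_slots[2 + 3];                     /* e.g. 2 tensors + 3 scalars */
    printf("%zu\n", CNT_OF_ARRAY(param_slots)); /* prints 5 */
    return 0;
}

Note that the trick only works on true arrays visible in the current scope; applied to a pointer it would silently return sizeof(pointer)/sizeof(element).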
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_GRUCELL_ACTIVATION_SMA, +} _internal_kernel_e; + +#define _GRUCELL_ACTIVATION_SMA_KERNEL_SOURCE "grucell_activation_sma" +#define _GRUCELL_ACTIVATION_SMA_KERNEL_NAME CVIVANTE_NAMESPACE("cl.grucell_activation_sma") + +// Add kernel hashtable here +#define GRUCELL_ACTIVATION_SMA_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, SOURCE ) \ + { GRUCELL_ACTIVATION_SMA_HASH_KEY( IN_DTYPE, OUT_DTYPE ), _GRUCELL_ACTIVATION_SMA_KERNEL_NAME, SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_activation_sma_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32, _GRUCELL_ACTIVATION_SMA_KERNEL_SOURCE ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_activation_sma_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GRUCELL_ACTIVATION_SMA_PARAM_NUM _cnt_of_array( _grucell_activation_sma_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + // vsi_nn_kernel_tensor_attr * attr[2] = { NULL }; + // attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + // attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + + // Add initializer + + // vsi_nn_kernel_tensor_attr_release( &attr[0] ); + // vsi_nn_kernel_tensor_attr_release( &attr[1] ); + return status; +} /* _grucell_activation_sma_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _grucell_activation_sma_kernel_map; + size_t kernel_map_size = _cnt_of_array( _grucell_activation_sma_kernel_map ); + vx_param_description_t * param_def = _grucell_activation_sma_kernel_param_def; + size_t param_def_size = _cnt_of_array( _grucell_activation_sma_kernel_param_def ); + vx_kernel_initialize_f initializer = _grucell_activation_sma_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = GRUCELL_ACTIVATION_SMA_HASH_KEY( in_dtype, out_dtype ); + + for( i 
= 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_SMA_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + /* + // Check if gpu can support the size + if( !vsi_nn_kernel_gpu_check_shape( + (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) + { + return NULL; + } + */ + + /* + * Use kernel param + * int32_t integer = vsi_nn_kernel_param_get_int32( param, "data_key_i32" ); + * int64_t integer = vsi_nn_kernel_param_get_int64( param, "hashkey" ); + * float fp = vsi_nn_kernel_param_get_float32( param, "data_key_f32" ); + * const char * str = vsi_nn_kernel_param_get_char( param, "padding" ); + * size_t buffer_size; + * int * buffer = (int*)vsi_nn_kernel_param_get_buffer( param, "padding", &buffer_size ); + */ + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_ACTIVATION_SMA_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_SMA_PARAM_NUM ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( grucell_activation_sma, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c new file mode 100644 index 0000000..1b73c36 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c @@ -0,0 +1,667 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
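The instance_normalization_cl.c file that begins above splits the operator into two CL kernels: a first pass that accumulates per-channel mean and variance, and a second pass that normalizes and applies gamma and beta. As a reference for the math only (not the CL implementation, which also handles quantized inputs and reshaped layouts), a plain-C float sketch over a single CHW image:

/* Reference instance normalization over one CHW image, float only. */
#include <math.h>
#include <stddef.h>

void instance_norm_chw(const float *x, float *y,
                       size_t C, size_t H, size_t W,
                       const float *gamma, const float *beta, float eps)
{
    size_t c, i, n = H * W;
    for (c = 0; c < C; c++) {
        const float *xc = x + c * n;
        float *yc = y + c * n;
        float mean = 0.0f, var = 0.0f;
        for (i = 0; i < n; i++) mean += xc[i];          /* pass 1: mean */
        mean /= (float)n;
        for (i = 0; i < n; i++) {
            float d = xc[i] - mean;
            var += d * d;                               /* pass 1: variance */
        }
        var /= (float)n;
        {
            float scale = 1.0f / sqrtf(var + eps);
            for (i = 0; i < n; i++)                     /* pass 2: normalize + affine */
                yc[i] = (xc[i] - mean) * scale * gamma[c] + beta[c];
        }
    }
}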
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_MEAN_VARI, + INTERNAL_KERNEL_NORM, +} _internal_kernel_e; + +#define KERNEL_SOURCE_1 "instance_normalization_u8" +#define KERNEL_SOURCE_2 "instance_normalization_f16" +#define KERNEL_SOURCE_3 "instance_normalization_i32" +#define KERNEL_SOURCE_4 "instance_normalization_f32" + +// Add kernel hashtable here +#define HASH_INSTANCENORM_MEAN_VARI_KERNEL_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("cl.instance_norm_meanvari_"#SRC0_TYPE) + +#define HASH_INSTANCENORM_MEAN_VARI_KERNEL_2D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("cl.instance_norm_meanvari_"#SRC0_TYPE"_2D") + +#define HASH_INSTANCENORM_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.instance_norm_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_INSTANCENORM_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.instance_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +// Add kernel hashtable here +// mean vari +#define HASH_INSTANCENORM_MEAN_VARI_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_INSTANCENORM_MEAN_VARI_KERNEL_NAME(IN0_TYPE), \ + SOURCE }, + +#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_INSTANCENORM_MEAN_VARI_KERNEL_2D_NAME(IN0_TYPE), \ + SOURCE }, + +// normalization +#define HASH_INSTANCENORM_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_INSTANCENORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_INSTANCENORM_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_INSTANCENORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_INSTANCENORM_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _instancenorm_mean_vari_kernel_map[] = +{ + // Register kernel here + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( U8, F32, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F16, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( I32, F32, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F32, F32, KERNEL_SOURCE_4 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F32, F32, KERNEL_SOURCE_4 ) 
+}; + +static const _kernel_map_type _instancenorm_kernel_map[] = +{ + // Register kernel here + TENSOR_INSTANCENORM_KERNELS( U8, U8, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_1 ) + + TENSOR_INSTANCENORM_KERNELS( F16, F16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_2 ) + + TENSOR_INSTANCENORM_KERNELS( I32, I32, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_KERNELS_2D( I32, I32, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_KERNELS( I32, F32, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 ) + + TENSOR_INSTANCENORM_KERNELS( F32, F32, KERNEL_SOURCE_4 ) + TENSOR_INSTANCENORM_KERNELS_2D( F32, F32, KERNEL_SOURCE_4 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _INSTANCENORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _instancenorm_mean_vari_kernel_param_def ) + +static vx_param_description_t _instancenorm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _INSTANCENORM_PARAM_NUM _cnt_of_array( _instancenorm_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * input_shape = NULL; + int32_t rsFlg = 0; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + 
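The mean/variance initializer that continues just below launches one 16-thread work-group row per channel and rounds the global width up to a multiple of the local size with `(width + 15) / 16 * 16`. That is the usual integer ceil-to-multiple idiom, sketched here as a standalone helper (the function name is hypothetical):

/* Round-up-to-multiple idiom used for the GPU global work sizes below. */
#include <assert.h>
#include <stdint.h>

static uint32_t round_up(uint32_t x, uint32_t multiple)
{
    return (x + multiple - 1) / multiple * multiple;
}

int main(void)
{
    assert(round_up(33, 16) == 48);  /* (33 + 15) / 16 * 16 */
    assert(round_up(32, 16) == 32);  /* already aligned */
    return 0;
}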
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &rsFlg); + CHECK_STATUS_FAIL_GOTO(status, final ); + + input_shape = attr[0]->shape; + width = input_shape->data[0]; + height = input_shape->data[1]; + chn = attr[1]->shape->data[1]; + if(rsFlg) + { + height = height / chn; + } + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.local_size[2] = 1; + gpu_param.global_size[0] = (width + 15) / 16 * 16; + gpu_param.global_size[1] = chn; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + return status; +} /* _instance_normalization_mean_vari_initializer() */ + +DEF_KERNEL_INITIALIZER(_instancenorm_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * input_shape = NULL; + int32_t rsFlg = 0; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &rsFlg); + CHECK_STATUS_FAIL_GOTO(status, final ); + + input_shape = attr[0]->shape; + width = input_shape->data[0]; + height = input_shape->data[1]; + chn = attr[1]->shape->data[1]; + if(rsFlg) + { + height = height / chn; + } + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.local_size[2] = 1; + gpu_param.global_size[0] = (width + 15) / 16 * 16; + gpu_param.global_size[1] = chn; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + return status; +} /* _instancenorm_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + const uint32_t hashkey, + _internal_kernel_e kernel_id + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vx_kernel_initialize_f initializer = NULL; + vx_param_description_t * param_def = NULL; + const _kernel_map_type* kernel_map; + size_t kernel_map_size = 0; + size_t param_size = 0; + uint32_t i = 0; + + switch( kernel_id ) + { + case INTERNAL_KERNEL_MEAN_VARI: + initializer = _instancenorm_mean_vari_initializer; + kernel_map = _instancenorm_mean_vari_kernel_map; + kernel_map_size = _cnt_of_array( _instancenorm_mean_vari_kernel_map ); + param_def = _instancenorm_mean_vari_kernel_param_def; + param_size = 
_INSTANCENORM_MEAN_VARI_PARAM_NUM; + break; + case INTERNAL_KERNEL_NORM: + initializer = _instancenorm_initializer; + kernel_map = _instancenorm_kernel_map; + kernel_map_size = _cnt_of_array( _instancenorm_kernel_map ); + param_def = _instancenorm_kernel_param_def; + param_size = _INSTANCENORM_PARAM_NUM; + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == hashkey ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ +#define INTERNAL_KERNEL_SIZE (1) +#define MEAN_VARI_INDEX (0) + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t mean_vari_node_params[_INSTANCENORM_MEAN_VARI_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_INSTANCENORM_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_dtype_e in0_dtype = U8; + vsi_nn_kernel_dtype_e out_dtype = U8; + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; + uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; + uint32_t hashkey = 0; + int32_t i = 0; + + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + int32_t reshape_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" ); + + int32_t width = inputs[0]->attr.size[0]; + int32_t height = inputs[0]->attr.size[1]; + int32_t group_num = (width + 15) / 16; + int32_t input_zp = inputs[0]->attr.dtype.zero_point; + float input_scale = inputs[0]->attr.dtype.scale; + int32_t input_fl = inputs[0]->attr.dtype.fl; + int32_t output_zp = outputs[0]->attr.dtype.zero_point; + float output_scale = outputs[0]->attr.dtype.scale; + int32_t output_fl = outputs[0]->attr.dtype.fl; + float in_fl_scale = 1.0f, out_fl_scale = 1.0; + float dim_ratio = (float)1.0 / (float)(width * height); + + if(inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 + || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 + || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + { + if (input_fl > 0) + { + in_fl_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); + } + else + { + in_fl_scale = ((float) ((int64_t)1 << -input_fl)); + } + } + + if(outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 + || outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 + || outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + { + if (output_fl > 0) + { + out_fl_scale = (float)((int64_t)1 << output_fl); + } + else + { + out_fl_scale = (1.0f / (float)((int64_t)1 << -output_fl)); + } + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + 
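The code just above converts the dynamic fixed-point position `fl` of INT8/INT16/INT32 tensors into a multiplicative scale: 1/2^fl when fl is positive, 2^-fl otherwise, i.e. a real value of raw * 2^(-fl). A small standalone sketch of that conversion and the resulting dequantization (the variable names here are illustrative, not the library's tensor fields):

/* Dynamic fixed-point dequantization sketch: value = raw * 2^(-fl). */
#include <stdint.h>
#include <stdio.h>

static float dfp_scale(int32_t fl)
{
    if (fl > 0)
        return 1.0f / (float)((int64_t)1 << fl);  /* e.g. fl =  7 -> 1/128 */
    return (float)((int64_t)1 << -fl);            /* e.g. fl = -2 -> 4.0   */
}

int main(void)
{
    int8_t raw = 64;                           /* quantized INT8 sample */
    int32_t fl = 7;                            /* fractional length     */
    printf("%f\n", raw * dfp_scale(fl));       /* 64 * 1/128 = 0.5      */
    return 0;
}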
for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL ); + // Assign unique_id + ikernels[i]->unique_id = kernel->unique_id; + } + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + + attr.size[0] = ((inputs[0]->attr.size[0] + 15) / 16) * 4; + attr.size[1] = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + attr.size[2] = 1; + attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + attr.dim_num = 4; + tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_MEAN_VARI_KEY( in0_dtype, F32, reshape_flg ); + hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg ); + + status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); + if( VSI_SUCCESS != status ) + { + goto final; + } + status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM ); + if( VSI_SUCCESS != status ) + { + goto final; + } + + if(reshape_flg) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[0]->attr.size[0]; + shape[1] = inputs[0]->attr.size[1] * inputs[0]->attr.size[2]; + shape[2] = 1; + shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); + + shape[0] = outputs[0]->attr.size[0]; + shape[1] = outputs[0]->attr.size[1] * outputs[0]->attr.size[2]; + shape[2] = 1; + shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; + rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); + } + if(inputs[1]->attr.dim_num < 2) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[1]->attr.size[0]; + shape[1] = 1; + shape[2] = 1; + shape[3] = 1; + rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 ); + } + if(inputs[2]->attr.dim_num < 2) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[2]->attr.size[0]; + shape[1] = 1; + shape[2] = 1; + shape[3] = 1; + rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 4 ); + } + // Mean Vari + { + node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); + if(node) + { + uint32_t index = 0; + if(reshape_flg) + { + mean_vari_node_params[index++] = rs_input; + } + else + { + mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; + } + mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp ); + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_fl_scale ); + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + + status = vsi_nn_kernel_node_pass_param( node, mean_vari_node_params, + _INSTANCENORM_MEAN_VARI_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] ); + vsi_nn_kernel_scalar_release( 
&mean_vari_node_params[3] ); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[4] ); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[5] ); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[6] ); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[7] ); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[8] ); + vsi_nn_kernel_node_release( &node ); + } + } + + // Nomalization + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if(node) + { + uint32_t index = 0; + if(reshape_flg) + { + node_params[index++] = rs_input; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; + } + if(inputs[1]->attr.dim_num < 2) + { + node_params[index++] = rs_beta; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + } + if(inputs[2]->attr.dim_num < 2) + { + node_params[index++] = rs_gamma; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; + } + node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; + if(reshape_flg) + { + node_params[index++] = rs_output; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; + } + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_fl_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_zp ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &out_fl_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &group_num ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _INSTANCENORM_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); + vsi_nn_kernel_scalar_release( &node_params[14] ); + vsi_nn_kernel_scalar_release( &node_params[15] ); + vsi_nn_kernel_scalar_release( &node_params[16] ); + } + } + + /* Pass parameters to node. 
*/ +final: + if(rs_beta) + { + vsi_nn_kernel_tensor_release( &rs_beta ); + } + if(rs_gamma) + { + vsi_nn_kernel_tensor_release( &rs_gamma ); + } + if(reshape_flg) + { + vsi_nn_kernel_tensor_release( &rs_input ); + vsi_nn_kernel_tensor_release( &rs_output ); + } + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + if( ikernels[i] ) + { + vsi_nn_kernel_release( &ikernels[i] ); + } + if( tensors[i] ) + { + vsi_nn_ReleaseTensor( &tensors[i] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( instance_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c new file mode 100644 index 0000000..c604edb --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c @@ -0,0 +1,334 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 28) | (IN1_DTYPE << 20) | (IN0_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + #define HASH_L2NORMALIZESCALE_KERNEL_SOURCE_NAME(AXIS) \ + "l2normalizescale_axis"#AXIS + +#define HASH_L2NORMALIZESCALE_KERNELS_2D( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("cl.l2normalizescale_axis"#AXIS"_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D"), \ + HASH_L2NORMALIZESCALE_KERNEL_SOURCE_NAME(AXIS) }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _l2normalizescale_kernel_map[] = +{ + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, F32, F32, F32 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F32, U8 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, F32, F32, F32 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F32, U8 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _l2normalizescale_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _L2NORMALIZESCALE_PARAM_NUM _cnt_of_array( _l2normalizescale_kernel_param_def ) + +#define SCALAR_INPUT_AXIS (3) +#define SCALAR_AXIS_SIZE (4) +#define SCALAR_EPS_VALUE (5) +#define SCALAR_INPUT_SCALE (6) +#define SCALAR_INPUT_TAIL (7) +#define SCALAR_OUTPUT_SCALE (8) +#define SCALAR_OUTPUT_TAIL (9) + +#define L2NORMSCALE_PARAM_NUM 6 +#define L2NORMSCALE_QUANT_PARAM_NUM _cnt_of_array( _l2normalizescale_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + int32_t axis = 0; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t * output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + output_shape = output_attr->shape; + + if (1 == axis) + { + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.local_size[0] = 1; + gpu_param.local_size[1] = 16; + 
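The l2normalizescale kernels whose initializer continues below normalize the input along one axis and multiply by a learned scale vector; as a reference for the math only, the float computation is y = x * scale / sqrt(sum(x^2) + eps), with eps keeping an all-zero slice finite. A plain-C sketch over the innermost axis (the CL kernels additionally fold in the quantization scales and zero points seen later in this file):

/* Reference L2-normalize-with-scale along the last axis, float only. */
#include <math.h>
#include <stddef.h>

void l2norm_scale_lastaxis(const float *x, const float *scale, float *y,
                           size_t rows, size_t cols, float eps)
{
    size_t r, c;
    for (r = 0; r < rows; r++) {
        const float *xr = x + r * cols;
        float *yr = y + r * cols;
        float sum_sq = 0.0f;
        for (c = 0; c < cols; c++) sum_sq += xr[c] * xr[c];
        {
            float inv_norm = 1.0f / sqrtf(sum_sq + eps);   /* eps guards the all-zero row */
            for (c = 0; c < cols; c++) yr[c] = xr[c] * inv_norm * scale[c];
        }
    }
}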
gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = 16; + } + else if (0 == axis) + { + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.global_size[0] = 16; + gpu_param.global_size[1] = output_shape->data[1]; + } + else + { + status = VSI_FAILURE; + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return status; +} /* _l2normalizescale_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + vsi_bool image_2d, + vsi_bool *is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _l2normalizescale_kernel_map; + size_t kernel_map_size = _cnt_of_array( _l2normalizescale_kernel_map ); + vx_param_description_t * param_def = _l2normalizescale_kernel_param_def; + size_t param_def_size = _cnt_of_array( _l2normalizescale_kernel_param_def ); + vx_kernel_initialize_f initializer = _l2normalizescale_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in0_dtype) + { + in0_dtype = F32; + } + + if (F16 == in1_dtype) + { + in1_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if ((U8 == in0_dtype) || (U8 == out_dtype)) + { + param_def_size = L2NORMSCALE_QUANT_PARAM_NUM; + *is_use_u8_kernel = TRUE; + } + else + { + param_def_size = L2NORMSCALE_PARAM_NUM; + *is_use_u8_kernel = FALSE; + } + + key = HASH_L2NORMALIZESCALE_HASH_KEY(axis, in0_dtype, in1_dtype, out_dtype, image_2d); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_L2NORMALIZESCALE_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t axis = 0; + int32_t axis_size = 0; + float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 
1.0f : outputs[0]->attr.dtype.scale; + float outputTail = (float)outputs[0]->attr.dtype.zero_point; + float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; + float inputTail = (float)inputs[0]->attr.dtype.zero_point; + float epsilon = (float)10e-12; + float rsEps = 1.0f / sqrtf(epsilon); + vsi_bool is_use_u8_kernel = FALSE; + + outputScale = 1.0f / outputScale; + inputTail = -(inputTail * inputScale); + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, axis, image_2d, &is_use_u8_kernel ); + axis_size = inputs[0]->attr.size[axis]; + + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + size_t node_params_num = L2NORMSCALE_PARAM_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _L2NORMALIZESCALE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + node_params[SCALAR_AXIS_SIZE] = vsi_nn_kernel_scalar_create( + graph, I32, &axis_size ); + node_params[SCALAR_EPS_VALUE] = vsi_nn_kernel_scalar_create( + graph, F32, &rsEps ); + if (is_use_u8_kernel) + { + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail ); + node_params_num = L2NORMSCALE_QUANT_PARAM_NUM; + } + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_AXIS_SIZE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_EPS_VALUE] ); + if (is_use_u8_kernel) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] ); + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( l2normalizescale, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c new file mode 100644 index 0000000..8d1e439 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c @@ -0,0 +1,300 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + + +/* + * Define kernel meta. 
+ */ +#define HASH_LOG_SOFTMAX_KEY(_axis, _input_type, _output_type, _image_2d) \ + ((_axis << 20) | (_input_type << 12) | (_output_type << 4) | (_image_2d)) + + #define VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_axis) \ + "log_softmax_axis"#_axis + +#define HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE) + +#define TENSOR_LOG_SOFTMAX_KERNELS(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \ + HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ + VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, + +#define TENSOR_LOG_SOFTMAX_FLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \ + HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, F32, F32), \ + VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_LOG_SOFTMAX_KERNELS_2D(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \ + HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ + VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, + +#define TENSOR_LOG_SOFTMAX_FLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \ + HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, F32, F32), \ + VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_LOG_SOFTMAX_FLOAT(0, F32, F32) + TENSOR_LOG_SOFTMAX_FLOAT(1, F32, F32) + TENSOR_LOG_SOFTMAX_FLOAT(2, F32, F32) + TENSOR_LOG_SOFTMAX_FLOAT(0, F16, F16) + TENSOR_LOG_SOFTMAX_FLOAT(1, F16, F16) + TENSOR_LOG_SOFTMAX_FLOAT(2, F16, F16) + + TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F32, F32) + TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F32, F32) + TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F16, F16) + TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F16, F16) + + TENSOR_LOG_SOFTMAX_KERNELS(0, U8, U8) + TENSOR_LOG_SOFTMAX_KERNELS(1, U8, U8) + TENSOR_LOG_SOFTMAX_KERNELS(2, U8, U8) + TENSOR_LOG_SOFTMAX_KERNELS(0, U8, U8) + TENSOR_LOG_SOFTMAX_KERNELS(1, U8, U8) + TENSOR_LOG_SOFTMAX_KERNELS(2, U8, U8) + + TENSOR_LOG_SOFTMAX_KERNELS_2D(0, U8, U8) + TENSOR_LOG_SOFTMAX_KERNELS_2D(1, U8, U8) + TENSOR_LOG_SOFTMAX_KERNELS_2D(0, U8, U8) + TENSOR_LOG_SOFTMAX_KERNELS_2D(1, U8, U8) +}; + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) +#define SCALAR_INPUT_AXIS (2) +#define SCALAR_INPUT_BETA (3) +#define SCALAR_INPUT_SCALE (4) +#define SCALAR_OUTPUT_SCALE (5) +#define SCALAR_OUTPUT_ZP (6) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_log_softmax_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by 
a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * out_shape = NULL; + int32_t axis = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + out_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = axis == 0 ? 1 : out_shape->data[0]; + gpu_param.global_size[1] = axis == 1 ? 1 : out_shape->data[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? (axis == 2 ? 1 : out_shape->data[2]) : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + } + + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + } + + return status; +} /* _log_softmax_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + int32_t axis, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_LOG_SOFTMAX_KEY( axis, input_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _log_softmax_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + float beta = 0; + float inputScale = + inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ? 
inputs[0]->attr.dtype.scale : 1.0f; + float outputScale = 1.0f / outputs[0]->attr.dtype.scale; + float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float scaleValue = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + beta = vsi_nn_kernel_param_get_float32(params, "beta"); + + scaleValue = scaleValue * beta * inputScale; + beta = beta * inputScale; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + + image_2d = ((inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1) + && axis != 2); + status = _query_kernel( inputs, outputs, axis, image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if( node ) + { + vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, + inputs, 1, outputs, 1 ); + + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + node_params[SCALAR_INPUT_BETA] = vsi_nn_kernel_scalar_create( + graph, F32, &beta ); + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &scaleValue ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &outputZP ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_BETA] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( log_softmax, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c b/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c new file mode 100644 index 0000000..67cf6e8 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c @@ -0,0 +1,243 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +typedef enum _internal_img_dim_e +{ + IMAGE = 0, + IMAGE_2D, +} internal_img_dim_e; + +#define _LOGICAL_OPS_KERNEL_SOURCE "logical_not" + +#define STR(a) #a + +// Add kernel hashtable here +#define LOGICAL_NOT_HASH_KEY(IN_DTYPE, OUT_DTYPE, _image_2d) \ + (( IN_DTYPE << 12 ) | ( OUT_DTYPE << 4) | (_image_2d)) + +#define PACK_KERNEL_MAP(IN_DTYPE, OUT_DTYPE) \ + { LOGICAL_NOT_HASH_KEY(IN_DTYPE, OUT_DTYPE, IMAGE), \ + CVIVANTE_NAMESPACE("cl.logical_not_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _LOGICAL_OPS_KERNEL_SOURCE} + +#define PACK_KERNEL_MAP_2D(IN_DTYPE, OUT_DTYPE) \ + { LOGICAL_NOT_HASH_KEY(IN_DTYPE, OUT_DTYPE, IMAGE_2D), \ + CVIVANTE_NAMESPACE("cl.logical_not_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _LOGICAL_OPS_KERNEL_SOURCE} + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _logical_not_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( I8, I8), + PACK_KERNEL_MAP_2D(I8, I8), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _logical_not_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _LOGICAL_NOT_PARAM_NUM _cnt_of_array( _logical_not_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_logical_not_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vx_status status = VX_FAILURE; + vx_tensor output = (vx_tensor)param[1]; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t *output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + + gpu_param.dim = output_shape->size < 3 ? 2 : 3; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = output_shape->size > 2 ? 
output_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + return status; +} /* _logical_not_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _logical_not_kernel_map; + size_t kernel_map_size = _cnt_of_array( _logical_not_kernel_map ); + vx_param_description_t * param_def = _logical_not_kernel_param_def; + size_t param_def_size = _cnt_of_array( _logical_not_kernel_param_def ); + vx_kernel_initialize_f initializer = _logical_not_initializer; + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (BOOL8 == in_dtype && BOOL8 == out_dtype) + { + in_dtype = I8; + out_dtype = I8; + } + + key = LOGICAL_NOT_HASH_KEY( in_dtype, out_dtype, image_2d); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LOGICAL_NOT_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2 || outputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, image_2d); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _LOGICAL_NOT_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _LOGICAL_NOT_PARAM_NUM ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( logical_not, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c b/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c new file mode 100644 index 0000000..c02e1c1 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c @@ -0,0 +1,261 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +typedef enum _internal_img_dim_e +{ + IMAGE = 0, + IMAGE_2D, +} internal_img_dim_e; + +#define _LOGICAL_OPS_KERNEL_SOURCE "logical_ops" + +#define STR(a) #a + +// Add kernel hashtable here +#define LOGICAL_OPS_HASH_KEY(OP_TYPE, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((OP_TYPE << 20) | ( IN_DTYPE << 12 ) | ( OUT_DTYPE << 4) | (_image_2d)) + +#define PACK_KERNEL_MAP(OP_TYPE, IN_DTYPE, OUT_DTYPE, op_name) \ + { LOGICAL_OPS_HASH_KEY(OP_TYPE, IN_DTYPE, OUT_DTYPE, IMAGE), \ + CVIVANTE_NAMESPACE("cl.logical_"op_name"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _LOGICAL_OPS_KERNEL_SOURCE} + +#define PACK_KERNEL_MAP_2D(OP_TYPE, IN_DTYPE, OUT_DTYPE, op_name) \ + { LOGICAL_OPS_HASH_KEY(OP_TYPE, IN_DTYPE, OUT_DTYPE, IMAGE_2D), \ + CVIVANTE_NAMESPACE("cl.logical_"op_name"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _LOGICAL_OPS_KERNEL_SOURCE} + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _logical_ops_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP(VSI_NN_LOGICAL_OR, I8, I8, "or"), + PACK_KERNEL_MAP(VSI_NN_LOGICAL_AND, I8, I8, "and"), + PACK_KERNEL_MAP(VSI_NN_LOGICAL_XOR, I8, I8, "xor"), + PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_OR, I8, I8, "or"), + PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_AND, I8, I8, "and"), + PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_XOR, I8, I8, "xor"), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _logical_ops_kernel_param_def[] = +{ + {VX_INPUT, 
VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _LOGICAL_OPS_PARAM_NUM _cnt_of_array( _logical_ops_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_logical_ops_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vx_status status = VX_FAILURE; + vx_tensor output = (vx_tensor)param[2]; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t *output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + + gpu_param.dim = output_shape->size < 3 ? 2 : 3; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = output_shape->size > 2 ? output_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + + return status; +} /* _logical_ops_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d, + vsi_nn_logical_ops_type_t op_type + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _logical_ops_kernel_map; + size_t kernel_map_size = _cnt_of_array( _logical_ops_kernel_map ); + vx_param_description_t * param_def = _logical_ops_kernel_param_def; + size_t param_def_size = _cnt_of_array( _logical_ops_kernel_param_def ); + vx_kernel_initialize_f initializer = _logical_ops_initializer; + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (in_dtype != in1_dtype) + { + return VSI_FAILURE; + } + + if (BOOL8 == in_dtype && BOOL8 == out_dtype) + { + in_dtype = I8; + out_dtype = I8; + } + + key = LOGICAL_OPS_HASH_KEY(op_type, in_dtype, out_dtype, image_2d); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + 
vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LOGICAL_OPS_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + uint32_t ops_type = vsi_nn_kernel_param_get_int32( params, "ops_type" ); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2); + + status = _query_kernel( kernel, inputs, outputs, image_2d, (vsi_nn_logical_ops_type_t)ops_type); + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( node_params, _LOGICAL_OPS_PARAM_NUM, + inputs, input_num, outputs, output_num ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _LOGICAL_OPS_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( logical_ops, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c new file mode 100644 index 0000000..667cca5 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c @@ -0,0 +1,1648 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum _LSTMUNIT_nn_activation_type_e +{ + SIGMOID = VSI_NN_ACT_SIGMOID, + HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, +}LSTMUNIT_nn_activation_type_e; + +typedef enum _LSTMUNIT_nn_activation_e +{ + CLP_E = 0x1C, + LP_E = 0x14, + CL_E = 0x18, + L_E = 0x10, + BP_E = 0x06, + B_E = 0x02, + CBP_E = 0x0E, + CB_E = 0x0A, + SP_E = 0x04, + S_E = 0x00, + CSP_E = 0x0C, + CS_E = 0x08, +}LSTMUNIT_nn_activation_e; + + +#define LSTMUNIT_ACTIVATION_HASH_KEY(_is_ln, _is_cifg, _is_proj, _is_hybrid, _is_peephole, \ +_input_type, _output_type, _cell_type, _rec_act) \ +((_is_ln << 31) | (_is_cifg << 30) | (_is_proj << 29) | (_is_hybrid << 28) | (_is_peephole << 27) \ +| (_input_type << 23) | (_output_type << 19) | (_cell_type << 15) | (_rec_act << 10)) + +#define LSTMUNIT_ACTIVATION_SOURCE_NAME(_ln_cifg_proj_hybrid_, _input_type) \ + "lstmunit_activation_"#_ln_cifg_proj_hybrid_"_"#_input_type + +#define GEN_LSTMUNIT_STRUCT_ITEMS(_is_ln, _is_cifg, _is_proj, _is_hybrid, _is_peephole, _input_type, _output_type, \ + _cell_type, _rec_act, _ln_cifg_proj_hybrid_) \ + { LSTMUNIT_ACTIVATION_HASH_KEY(_is_ln, _is_cifg, _is_proj, _is_hybrid, _is_peephole, \ + _input_type, _output_type, _cell_type, _rec_act), \ + CVIVANTE_NAMESPACE(\ + "cl.lstmunit_activation_"#_ln_cifg_proj_hybrid_"_"#_input_type"to"#_output_type"_"#_cell_type"_"#_rec_act), \ + LSTMUNIT_ACTIVATION_SOURCE_NAME(_ln_cifg_proj_hybrid_, _input_type) }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _lstmunit_activation_kernel_map[] = +{ + /* layer norm + cifg + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F32, F32, F32, SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F32, F32, F32, HARD_SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, U8 , U8 , F32, SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, U8 , U8 , F32, HARD_SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F32, U8 , F32, SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F32, U8 , F32, HARD_SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, U8 , F32, F32, SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, U8 , F32, F32, HARD_SIGMOID, CLP) + /* layer norm + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F32, F32, F32, SIGMOID, LP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F32, F32, F32, HARD_SIGMOID, LP) + /* layer norm + cifg */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F32, F32, F32, SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F32, F32, F32, HARD_SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, U8 , U8 , F32, SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, U8 , U8 , F32, HARD_SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F32, U8 , F32, SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F32, U8 , F32, HARD_SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, U8 , F32, F32, SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, U8 , F32, F32, HARD_SIGMOID, CL) + /* layer norm */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F32, F32, F32, SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F32, F32, F32, HARD_SIGMOID, L) + /* hybrid + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F32, F32, F32, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F32, F32, F32, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8 , U8 , F32, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8 , U8 , F32, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, 
F32, U8 , F32, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F32, U8 , F32, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8 , F32, F32, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8 , F32, F32, HARD_SIGMOID, BP) + /* hybrid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F32, F32, F32, SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F32, F32, F32, HARD_SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, U8 , U8 , F32, SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, U8 , U8 , F32, HARD_SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F32, U8 , F32, SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F32, U8 , F32, HARD_SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, U8 , F32, F32, SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, U8 , F32, F32, HARD_SIGMOID, B) + /* cifg + hybrid + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F32, F32, F32, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F32, F32, F32, HARD_SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8 , U8 , F32, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8 , U8 , F32, HARD_SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F32, U8 , F32, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F32, U8 , F32, HARD_SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8 , F32, F32, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8 , F32, F32, HARD_SIGMOID, CBP) + /* cifg + hybrid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F32, F32, F32, SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F32, F32, F32, HARD_SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, U8 , U8 , F32, SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, U8 , U8 , F32, HARD_SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F32, U8 , F32, SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F32, U8 , F32, HARD_SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, U8 , F32, F32, SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, U8 , F32, F32, HARD_SIGMOID, CB) + /* standard + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F32, F32, F32, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F32, F32, F32, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8 , U8 , F32, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8 , U8 , F32, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F32, U8 , F32, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F32, U8 , F32, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8 , F32, F32, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8 , F32, F32, HARD_SIGMOID, SP) + /* standard */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F32, F32, F32, SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F32, F32, F32, HARD_SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, U8 , U8 , F32, SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, U8 , U8 , F32, HARD_SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F32, U8 , F32, SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F32, U8 , F32, HARD_SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, U8 , F32, F32, SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, U8 , F32, F32, HARD_SIGMOID, S) + /* cifg + standard + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F32, F32, F32, SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F32, F32, F32, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8 , U8 , F32, SIGMOID, 
CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8 , U8 , F32, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F32, U8 , F32, SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F32, U8 , F32, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8 , F32, F32, SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8 , F32, F32, HARD_SIGMOID, CSP) + /* cifg + standard */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F32, F32, F32, SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F32, F32, F32, HARD_SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, U8 , U8 , F32, SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, U8 , U8 , F32, HARD_SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F32, U8 , F32, SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F32, U8 , F32, HARD_SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, U8 , F32, F32, SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, U8 , F32, F32, HARD_SIGMOID, CS) +}; + + +typedef enum _lstmunit_cifg_ln_proj_e +{ + CLP_INPUT_FC_F, + CLP_INPUT_FC_C, + CLP_INPUT_FC_O, + CLP_CSTATE_IN, + CLP_BIASES_F, + CLP_BIASES_C, + CLP_BIASES_O, + CLP_LN_WF, + CLP_LN_WC, + CLP_LN_WO, + CLP_OUTPUT, + CLP_CSTATE_OUT, + CLP_PARAM +} lstmunit_cifg_ln_proj_e; + +static vx_param_description_t vxLSTMUNIT_CLP_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_CLP_PARAM_NUM _cnt_of_array( vxLSTMUNIT_CLP_Param ) +#define _LSTMUNIT_ACTIVATION_CLP_IN_OUT_NUM 12 + +typedef enum 
_lstmunit_cifg_ln_e +{ + CL_INPUT_FC_F, + CL_INPUT_FC_C, + CL_INPUT_FC_O, + CL_CSTATE_IN, + CL_BIASES_F, + CL_BIASES_C, + CL_BIASES_O, + CL_LN_WF, + CL_LN_WC, + CL_LN_WO, + CL_OUTPUT, + CL_CSTATE_OUT, + CL_HSTATE_OUT, + CL_LSTMUNIT_PARAM, +} lstmunit_cifg_ln_e; + +static vx_param_description_t vxLSTMUNIT_CL_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_CL_PARAM_NUM _cnt_of_array( vxLSTMUNIT_CL_Param ) +#define _LSTMUNIT_ACTIVATION_CL_IN_OUT_NUM 13 + +typedef enum _lstmunit_ln_proj_e +{ + LP_INPUT_FC_I, + LP_INPUT_FC_F, + LP_INPUT_FC_C, + LP_INPUT_FC_O, + LP_CSTATE_IN, + LP_BIASES_I, + LP_BIASES_F, + LP_BIASES_C, + LP_BIASES_O, + LP_LN_WI, + LP_LN_WF, + LP_LN_WC, + LP_LN_WO, + LP_OUTPUT, + LP_CSTATE_OUT, + LP_PARAM +} lstmunit_ln_proj_e; + +static vx_param_description_t vxLSTMUNIT_LP_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, 
VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_LP_PARAM_NUM _cnt_of_array( vxLSTMUNIT_LP_Param ) +#define _LSTMUNIT_ACTIVATION_LP_IN_OUT_NUM 15 + +typedef enum _lstmunit_ln_e +{ + L_INPUT_FC_I, + L_INPUT_FC_F, + L_INPUT_FC_C, + L_INPUT_FC_O, + L_CSTATE_IN, + L_BIASES_I, + L_BIASES_F, + L_BIASES_C, + L_BIASES_O, + L_LN_WI, + L_LN_WF, + L_LN_WC, + L_LN_WO, + L_OUTPUT, + L_CSTATE_OUT, + L_HSTATE_OUT, + L_LSTMUNIT_PARAM, +} lstmunit_ln_e; + +static vx_param_description_t vxLSTMUNIT_L_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + 
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_L_PARAM_NUM _cnt_of_array( vxLSTMUNIT_L_Param ) +#define _LSTMUNIT_ACTIVATION_L_IN_OUT_NUM 16 + +typedef enum _lstmunit_hybrid_proj_e +{ + BP_INPUT_FC_I, + BP_INPUT_FC_F, + BP_INPUT_FC_C, + BP_INPUT_FC_O, + BP_CSTATE_IN, + BP_HSTATE_FC_I, + BP_HSTATE_FC_F, + BP_HSTATE_FC_C, + BP_HSTATE_FC_O, + BP_BIASES_I, + BP_BIASES_F, + BP_BIASES_C, + BP_BIASES_O, + BP_OUTPUT, + BP_CSTATE_OUT, + BP_PARAM +} lstmunit_hybrid_proj_e; + +static vx_param_description_t vxLSTMUNIT_BP_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, 
VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_BP_PARAM_NUM _cnt_of_array( vxLSTMUNIT_BP_Param ) +#define _LSTMUNIT_ACTIVATION_BP_IN_OUT_NUM 15 + +typedef enum _lstmunit_hybrid_e +{ + B_INPUT_FC_I, + B_INPUT_FC_F, + B_INPUT_FC_C, + B_INPUT_FC_O, + B_CSTATE_IN, + B_HSTATE_FC_I, + B_HSTATE_FC_F, + B_HSTATE_FC_C, + B_HSTATE_FC_O, + B_BIASES_I, + B_BIASES_F, + B_BIASES_C, + B_BIASES_O, + B_OUTPUT, + B_CSTATE_OUT, + B_HSTATE_OUT, + B_LSTMUNIT_PARAM, +} lstmunit_hybrid_e; + +static vx_param_description_t vxLSTMUNIT_B_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_B_PARAM_NUM _cnt_of_array( vxLSTMUNIT_B_Param ) +#define _LSTMUNIT_ACTIVATION_B_IN_OUT_NUM 16 + +typedef enum _lstmunit_cifg_hybrid_proj_e +{ + CBP_INPUT_FC_F, + CBP_INPUT_FC_C, + CBP_INPUT_FC_O, + CBP_CSTATE_IN, + CBP_HSTATE_FC_F, + CBP_HSTATE_FC_C, + CBP_HSTATE_FC_O, + CBP_BIASES_F, + CBP_BIASES_C, + CBP_BIASES_O, + CBP_OUTPUT, + CBP_CSTATE_OUT, + CBP_PARAM +} lstmunit_cifg_hybrid_proj_e; + +static vx_param_description_t vxLSTMUNIT_CBP_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, 
VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_CBP_PARAM_NUM _cnt_of_array( vxLSTMUNIT_CBP_Param ) +#define _LSTMUNIT_ACTIVATION_CBP_IN_OUT_NUM 12 + +typedef enum _lstmunit_cifg_hybrid_e +{ + CB_INPUT_FC_F, + CB_INPUT_FC_C, + CB_INPUT_FC_O, + CB_CSTATE_IN, + CB_HSTATE_FC_F, + CB_HSTATE_FC_C, + CB_HSTATE_FC_O, + CB_BIASES_F, + CB_BIASES_C, + CB_BIASES_O, + CB_OUTPUT, + CB_CSTATE_OUT, + CB_HSTATE_OUT, + CB_LSTMUNIT_PARAM, +} lstmunit_cifg_hybrid_e; + +static vx_param_description_t vxLSTMUNIT_CB_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + 
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_CB_PARAM_NUM _cnt_of_array( vxLSTMUNIT_CB_Param ) +#define _LSTMUNIT_ACTIVATION_CB_IN_OUT_NUM 13 + +typedef enum _lstmunit_standard_proj_e +{ + SP_INPUT_FC_I, + SP_INPUT_FC_F, + SP_INPUT_FC_C, + SP_INPUT_FC_O, + SP_CSTATE_IN, + SP_HSTATE_FC_I, + SP_HSTATE_FC_F, + SP_HSTATE_FC_C, + SP_HSTATE_FC_O, + SP_OUTPUT, + SP_CSTATE_OUT, + SP_PARAM +} lstmunit_standard_proj_e; + +static vx_param_description_t vxLSTMUNIT_SP_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_SP_PARAM_NUM _cnt_of_array( 
vxLSTMUNIT_SP_Param ) +#define _LSTMUNIT_ACTIVATION_SP_IN_OUT_NUM 11 + +typedef enum _lstmunit_standard_e +{ + S_INPUT_FC_I, + S_INPUT_FC_F, + S_INPUT_FC_C, + S_INPUT_FC_O, + S_CSTATE_IN, + S_HSTATE_FC_I, + S_HSTATE_FC_F, + S_HSTATE_FC_C, + S_HSTATE_FC_O, + S_OUTPUT, + S_CSTATE_OUT, + S_HSTATE_OUT, + S_LSTMUNIT_PARAM, +} lstmunit_standard_e; + +static vx_param_description_t vxLSTMUNIT_S_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_S_PARAM_NUM _cnt_of_array( vxLSTMUNIT_S_Param ) +#define _LSTMUNIT_ACTIVATION_S_IN_OUT_NUM 12 + +typedef enum _lstmunit_cifg_standard_proj_e +{ + CSP_INPUT_FC_F, + CSP_INPUT_FC_C, + CSP_INPUT_FC_O, + CSP_CSTATE_IN, + CSP_HSTATE_FC_F, + CSP_HSTATE_FC_C, + CSP_HSTATE_FC_O, + CSP_OUTPUT, + CSP_CSTATE_OUT, + CSP_PARAM +} lstmunit_cifg_standard_proj_e; + +static vx_param_description_t vxLSTMUNIT_CSP_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, 
VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_CSP_PARAM_NUM _cnt_of_array( vxLSTMUNIT_CSP_Param ) +#define _LSTMUNIT_ACTIVATION_CSP_IN_OUT_NUM 9 + +typedef enum _lstmunit_cifg_standard_e +{ + CS_INPUT_FC_F, + CS_INPUT_FC_C, + CS_INPUT_FC_O, + CS_CSTATE_IN, + CS_HSTATE_FC_F, + CS_HSTATE_FC_C, + CS_HSTATE_FC_O, + CS_OUTPUT, + CS_CSTATE_OUT, + CS_HSTATE_OUT, + CS_LSTMUNIT_PARAM, +} lstmunit_cifg_standard_e; + +static vx_param_description_t vxLSTMUNIT_CS_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, 
VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_CS_PARAM_NUM _cnt_of_array( vxLSTMUNIT_CS_Param ) +#define _LSTMUNIT_ACTIVATION_CS_IN_OUT_NUM 10 + +#define _LSTMUINT_PARAM_NUM 23 + +#define _LSTMUNIT_ACTIVATION_MAX_PARAM_NUM (LSTMUNIT_ACT_PARAM_COUT + _LSTMUINT_PARAM_NUM) + + + +DEF_KERNEL_INITIALIZER(_lstmunit_activation_CL_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t output = NULL; + vsi_nn_kernel_tensor_attr_t* output_attr; + + output = (vsi_nn_kernel_tensor_t)param[CL_OUTPUT]; + + output_attr = vsi_nn_kernel_tensor_attr_create( output ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_attr->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_attr->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release( &output_attr ); + } + + return status; +} /* _lstmunit_activation_initializer() */ + +DEF_KERNEL_INITIALIZER(_lstmunit_activation_CB_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t output = NULL; + vsi_nn_kernel_tensor_attr_t* output_attr; + + output = (vsi_nn_kernel_tensor_t)param[CB_OUTPUT]; + output_attr = vsi_nn_kernel_tensor_attr_create( output ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_attr->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_attr->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release( &output_attr ); + } + + return status; +} /* _lstmunit_activation_initializer() */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_lstmunit_activation_CS_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t output = NULL; + vsi_nn_kernel_tensor_attr_t* output_attr; + + output = (vsi_nn_kernel_tensor_t)param[CS_OUTPUT]; + + output_attr = vsi_nn_kernel_tensor_attr_create( output ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_attr->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + 
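/* How the GPU work size is derived here (illustrative note; gpu_align_p2(n, a) is
 * assumed to round n up to the next multiple of a): with both global_scale values
 * fixed at 1, the two expressions reduce to one work-item per output element, with
 * the x dimension padded to a multiple of 4. For example, an output of shape
 * 150 x 32 would give
 *     global_size[0] = gpu_align_p2((150 + 1 - 1) / 1, 4) = 152
 *     global_size[1] = (32 + 1 - 1) / 1                   = 32
 */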
gpu_param.global_size[1] = (output_attr->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release( &output_attr ); + } + + return status; +} /* _lstmunit_activation_initializer() */ + + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_lstmunit_activation_L_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t output = NULL; + vsi_nn_kernel_tensor_attr_t* output_attr; + + output = (vsi_nn_kernel_tensor_t)param[L_OUTPUT]; + + output_attr = vsi_nn_kernel_tensor_attr_create( output ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_attr->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_attr->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release( &output_attr ); + } + + return status; +} /* _lstmunit_activation_initializer() */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_lstmunit_activation_B_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t output = NULL; + vsi_nn_kernel_tensor_attr_t* output_attr; + + output = (vsi_nn_kernel_tensor_t)param[B_OUTPUT]; + + output_attr = vsi_nn_kernel_tensor_attr_create( output ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_attr->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_attr->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release( &output_attr ); + } + + return status; +} /* _lstmunit_activation_initializer() */ + + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_lstmunit_activation_S_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t output = NULL; + vsi_nn_kernel_tensor_attr_t* output_attr; + + output = (vsi_nn_kernel_tensor_t)param[S_OUTPUT]; + + output_attr = vsi_nn_kernel_tensor_attr_create( output ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_attr->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 
4); + gpu_param.global_size[1] = (output_attr->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release( &output_attr ); + } + + return status; +} /* _lstmunit_activation_initializer() */ + + +static size_t get_param_num(LSTMUNIT_nn_activation_e lstm_activation) +{ + size_t lstm_activation_param_num = LSTMUNIT_ACT_PARAM_COUT; + + switch (lstm_activation) + { + case CLP_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_CLP_PARAM_NUM; + break; + case LP_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_LP_PARAM_NUM; + break; + case CL_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_CL_PARAM_NUM; + break; + case L_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_L_PARAM_NUM; + break; + case BP_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_BP_PARAM_NUM; + break; + case B_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_B_PARAM_NUM; + break; + case CBP_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_CBP_PARAM_NUM; + break; + case CB_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_CB_PARAM_NUM; + break; + case SP_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_SP_PARAM_NUM; + break; + case S_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_S_PARAM_NUM; + break; + case CSP_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_CSP_PARAM_NUM; + break; + case CS_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_CS_PARAM_NUM; + break; + default: + break; + } + return lstm_activation_param_num; +} + +static size_t get_in_out_num(LSTMUNIT_nn_activation_e lstm_activation) +{ + size_t lstm_activation_in_out_num = LSTMUNIT_ACT_PARAM_COUT; + + switch (lstm_activation) + { + case CLP_E: + lstm_activation_in_out_num = _LSTMUNIT_ACTIVATION_CLP_IN_OUT_NUM; + break; + case LP_E: + lstm_activation_in_out_num = _LSTMUNIT_ACTIVATION_LP_IN_OUT_NUM; + break; + case CL_E: + lstm_activation_in_out_num = _LSTMUNIT_ACTIVATION_CL_IN_OUT_NUM; + break; + case L_E: + lstm_activation_in_out_num = _LSTMUNIT_ACTIVATION_L_IN_OUT_NUM; + break; + case BP_E: + lstm_activation_in_out_num = _LSTMUNIT_ACTIVATION_BP_IN_OUT_NUM; + break; + case B_E: + lstm_activation_in_out_num = _LSTMUNIT_ACTIVATION_B_IN_OUT_NUM; + break; + case CBP_E: + lstm_activation_in_out_num = _LSTMUNIT_ACTIVATION_CBP_IN_OUT_NUM; + break; + case CB_E: + lstm_activation_in_out_num = _LSTMUNIT_ACTIVATION_CB_IN_OUT_NUM; + break; + case SP_E: + lstm_activation_in_out_num = _LSTMUNIT_ACTIVATION_SP_IN_OUT_NUM; + break; + case S_E: + lstm_activation_in_out_num = _LSTMUNIT_ACTIVATION_S_IN_OUT_NUM; + break; + case CSP_E: + lstm_activation_in_out_num = _LSTMUNIT_ACTIVATION_CSP_IN_OUT_NUM; + break; + case CS_E: + lstm_activation_in_out_num = _LSTMUNIT_ACTIVATION_CS_IN_OUT_NUM; + break; + default: + break; + } + return lstm_activation_in_out_num; +} + +static void set_vx_param_description_t(LSTMUNIT_nn_activation_e lstm_activation, vx_param_description_t ** param_def) +{ + switch (lstm_activation) + { + case CLP_E: + *param_def = vxLSTMUNIT_CLP_Param; + break; + case LP_E: + *param_def = vxLSTMUNIT_LP_Param; + break; + case CL_E: + *param_def = vxLSTMUNIT_CL_Param; + break; + case L_E: + *param_def = vxLSTMUNIT_L_Param; + break; + case BP_E: + *param_def = vxLSTMUNIT_BP_Param; + break; + case B_E: + *param_def = vxLSTMUNIT_B_Param; + break; + case CBP_E: + *param_def = vxLSTMUNIT_CBP_Param; + break; + case CB_E: + *param_def = 
vxLSTMUNIT_CB_Param; + break; + case SP_E: + *param_def = vxLSTMUNIT_SP_Param; + break; + case S_E: + *param_def = vxLSTMUNIT_S_Param; + break; + case CSP_E: + *param_def = vxLSTMUNIT_CSP_Param; + break; + case CS_E: + *param_def = vxLSTMUNIT_CS_Param; + break; + default: + break; + } +} + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t _is_ln, + int32_t _is_cifg, + int32_t _is_proj, + int32_t _is_hybrid, + int32_t _is_peephole, + int32_t recurrent_activation, + vsi_bool *is_u8_type, + LSTMUNIT_nn_activation_e lstm_activation + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e cell_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _lstmunit_activation_kernel_map; + size_t kernel_map_size = _cnt_of_array( _lstmunit_activation_kernel_map ); + vx_param_description_t * param_def = NULL; + size_t param_def_size = _LSTMUNIT_ACTIVATION_MAX_PARAM_NUM; + vx_kernel_initialize_f initializer = NULL; + uint32_t key; + uint32_t i; + + set_vx_param_description_t( lstm_activation, ¶m_def ); + + if (NULL == param_def) + { + status = VSI_FAILURE; + return status; + } + + if (_is_cifg) + { + if (_is_ln) + initializer = _lstmunit_activation_CL_initializer; + else if (_is_hybrid) + initializer = _lstmunit_activation_CB_initializer; + else + initializer = _lstmunit_activation_CS_initializer; + } + else + { + if (_is_ln) + initializer = _lstmunit_activation_L_initializer; + else if (_is_hybrid) + initializer = _lstmunit_activation_B_initializer; + else + initializer = _lstmunit_activation_S_initializer; + } + + param_def_size = get_param_num(lstm_activation); + in_dtype = vsi_nn_kernel_map_dtype( inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.dtype.vx_type ); + cell_dtype = vsi_nn_kernel_map_dtype( inputs[LSTMUNIT_ACT_CSTATE_IN]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + + if (F16 == cell_dtype) + { + cell_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if ((U8 == in_dtype) || (U8 == out_dtype)) + { + *is_u8_type = TRUE; + } + else + { + *is_u8_type = FALSE; + } + + key = LSTMUNIT_ACTIVATION_HASH_KEY(_is_ln, _is_cifg, _is_proj, _is_hybrid, _is_peephole, \ + in_dtype, out_dtype, cell_dtype, recurrent_activation); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; + +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LSTMUNIT_ACTIVATION_MAX_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = 
NULL; + vsi_nn_tensor_t* in_tensor[LSTMUNIT_ACT_INPUTS_COUNT] = {NULL}; + vsi_nn_tensor_t* out_tensor[LSTMUNIT_ACT_OUTUTS_COUNT] = {NULL}; + size_t input_cnt = 0; + size_t output_cnt = 0; + int32_t _is_ln= 0; + int32_t _is_cifg= 0; + int32_t _is_proj= 0; + int32_t _is_hybrid= 0; + int32_t _is_peephole= 0; + int32_t recurrent_activation; + float cell_clip; + float forget_bias; + float logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); + float twoLogE = 2 * logE; + uint32_t uint_min = 0xFBFFFFFF; + uint32_t uint_max = 0x7BFFFFFF; + float float_min = *(vx_float32 *)&uint_min; + float float_max = *(vx_float32 *)&uint_max; + float scale_val[9] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; + float tail_val[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; + vsi_bool is_u8_type = FALSE; + LSTMUNIT_nn_activation_e lstm_activation; + size_t lstm_activation_param_num = _LSTMUNIT_ACTIVATION_MAX_PARAM_NUM; + size_t lstm_activation_in_out_num = 0; + uint32_t i; + + _is_ln = vsi_nn_kernel_param_get_int32( params, "_is_ln" ); + _is_cifg = vsi_nn_kernel_param_get_int32( params, "_is_cifg" ); + _is_proj = vsi_nn_kernel_param_get_int32( params, "_is_proj" ); + _is_hybrid = vsi_nn_kernel_param_get_int32( params, "_is_hybrid" ); + _is_peephole = vsi_nn_kernel_param_get_int32( params, "_is_peephole" ); + recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + cell_clip = vsi_nn_kernel_param_get_float32(params, "cell_clip"); + forget_bias = vsi_nn_kernel_param_get_float32(params, "forget_bias"); + + lstm_activation = (LSTMUNIT_nn_activation_e)((_is_ln << 4) |\ + (_is_cifg << 3) | (_is_proj << 2) | (_is_hybrid << 1) | (_is_peephole)); + status = _query_kernel( kernel, inputs, outputs, _is_ln, _is_cifg,\ + _is_proj, _is_hybrid, _is_peephole, recurrent_activation, &is_u8_type, lstm_activation); + + if (cell_clip > 0) + { + float_max = cell_clip; + float_min = -cell_clip; + } + + + if (inputs[LSTMUNIT_ACT_INPUT_FC_I] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_INPUT_FC_I]->attr.dtype.vx_type) + { + scale_val[0] = inputs[LSTMUNIT_ACT_INPUT_FC_I]->attr.dtype.scale; + tail_val[0] = \ + -inputs[LSTMUNIT_ACT_INPUT_FC_I]->attr.dtype.scale * inputs[LSTMUNIT_ACT_INPUT_FC_I]->attr.dtype.zero_point; + } + + if (inputs[LSTMUNIT_ACT_INPUT_FC_F] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.dtype.vx_type) + { + scale_val[1] = inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.dtype.scale; + tail_val[1] = \ + -inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.dtype.scale * inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.dtype.zero_point; + } + + if (inputs[LSTMUNIT_ACT_INPUT_FC_C] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_INPUT_FC_C]->attr.dtype.vx_type) + { + scale_val[2] = inputs[LSTMUNIT_ACT_INPUT_FC_C]->attr.dtype.scale; + tail_val[2] = \ + -inputs[LSTMUNIT_ACT_INPUT_FC_C]->attr.dtype.scale * inputs[LSTMUNIT_ACT_INPUT_FC_C]->attr.dtype.zero_point; + } + + if (inputs[LSTMUNIT_ACT_INPUT_FC_O] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_INPUT_FC_O]->attr.dtype.vx_type) + { + scale_val[3] = inputs[LSTMUNIT_ACT_INPUT_FC_O]->attr.dtype.scale; + tail_val[3] = \ + -inputs[LSTMUNIT_ACT_INPUT_FC_O]->attr.dtype.scale * inputs[LSTMUNIT_ACT_INPUT_FC_O]->attr.dtype.zero_point; + } + + + if (inputs[LSTMUNIT_ACT_HSTATE_FC_I] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_HSTATE_FC_I]->attr.dtype.vx_type) + { + scale_val[4] = inputs[LSTMUNIT_ACT_HSTATE_FC_I]->attr.dtype.scale; + tail_val[4] = \ + -inputs[LSTMUNIT_ACT_HSTATE_FC_I]->attr.dtype.scale * inputs[LSTMUNIT_ACT_HSTATE_FC_I]->attr.dtype.zero_point; + } + + if 
(inputs[LSTMUNIT_ACT_HSTATE_FC_F] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_HSTATE_FC_F]->attr.dtype.vx_type) + { + scale_val[5] = inputs[LSTMUNIT_ACT_HSTATE_FC_F]->attr.dtype.scale; + tail_val[5] = \ + -inputs[LSTMUNIT_ACT_HSTATE_FC_F]->attr.dtype.scale * inputs[LSTMUNIT_ACT_HSTATE_FC_F]->attr.dtype.zero_point; + } + + if (inputs[LSTMUNIT_ACT_HSTATE_FC_C] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_HSTATE_FC_C]->attr.dtype.vx_type) + { + scale_val[6] = inputs[LSTMUNIT_ACT_HSTATE_FC_C]->attr.dtype.scale; + tail_val[6] = \ + -inputs[LSTMUNIT_ACT_HSTATE_FC_C]->attr.dtype.scale * inputs[LSTMUNIT_ACT_HSTATE_FC_C]->attr.dtype.zero_point; + } + + if (inputs[LSTMUNIT_ACT_HSTATE_FC_O] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_HSTATE_FC_O]->attr.dtype.vx_type) + { + scale_val[7] = inputs[LSTMUNIT_ACT_HSTATE_FC_O]->attr.dtype.scale; + tail_val[7] = \ + -inputs[LSTMUNIT_ACT_HSTATE_FC_O]->attr.dtype.scale * inputs[LSTMUNIT_ACT_HSTATE_FC_O]->attr.dtype.zero_point; + } + + if (outputs[LSTMUNIT_ACT_OUTPUT] && VSI_NN_TYPE_UINT8 == outputs[LSTMUNIT_ACT_OUTPUT]->attr.dtype.vx_type) + { + scale_val[8] = 1.0f / outputs[LSTMUNIT_ACT_OUTPUT]->attr.dtype.scale; + tail_val[8] = (float)(outputs[LSTMUNIT_ACT_OUTPUT]->attr.dtype.zero_point); + } + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + for (i = 0; i < input_num; i++) + { + if (inputs[i]) + { + in_tensor[input_cnt] = inputs[i]; + input_cnt++; + } + } + for (i = 0; i < output_num; i++) + { + if (outputs[i]) + { + out_tensor[output_cnt] = outputs[i]; + output_cnt++; + } + } + lstm_activation_param_num = get_param_num(lstm_activation); + if (!is_u8_type) + { + lstm_activation_param_num = lstm_activation_param_num - 18; + } + lstm_activation_in_out_num = get_in_out_num(lstm_activation); + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, lstm_activation_param_num, + in_tensor, input_cnt, out_tensor, output_cnt ); + node_params[lstm_activation_in_out_num] = vsi_nn_kernel_scalar_create( + graph, F32, &logE ); + node_params[lstm_activation_in_out_num + 1] = vsi_nn_kernel_scalar_create( + graph, F32, &twoLogE ); + node_params[lstm_activation_in_out_num + 2] = vsi_nn_kernel_scalar_create( + graph, F32, &forget_bias ); + node_params[lstm_activation_in_out_num + 3] = vsi_nn_kernel_scalar_create( + graph, F32, &float_max ); + node_params[lstm_activation_in_out_num + 4] = vsi_nn_kernel_scalar_create( + graph, F32, &float_min ); + if (is_u8_type) + { + for (i = 0; i < 9; i++) + { + node_params[lstm_activation_in_out_num + 5 + 2 * i] = vsi_nn_kernel_scalar_create( + graph, F32, &scale_val[i] ); + node_params[lstm_activation_in_out_num + 6 + 2 * i] = vsi_nn_kernel_scalar_create( + graph, F32, &tail_val[i] ); + } + } + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, lstm_activation_param_num ); + vsi_nn_kernel_scalar_release( &node_params[lstm_activation_in_out_num] ); + vsi_nn_kernel_scalar_release( &node_params[lstm_activation_in_out_num + 1] ); + vsi_nn_kernel_scalar_release( &node_params[lstm_activation_in_out_num + 2] ); + vsi_nn_kernel_scalar_release( &node_params[lstm_activation_in_out_num + 3] ); + vsi_nn_kernel_scalar_release( &node_params[lstm_activation_in_out_num + 4] ); + if (is_u8_type) + { + for (i = 0; i < 18; i++) + { + vsi_nn_kernel_scalar_release( &node_params[lstm_activation_in_out_num + 5 + i] ); + } + } + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( lstmunit_activation, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c new file mode 100644 index 0000000..272d2b0 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c @@ -0,0 +1,294 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define KERNEL_SOURCE_1 "matrixmul" +#define KERNEL_SOURCE_2 "matrixmul_transA" + + typedef enum +{ + _2D = 0, + _3D +} vsi_nn_kernel_image_dim_type_e; + +#define HASH_MATRIXMUL_KEY(_input0_type, _input1_type, _output_type, _image_dim, _trans_a) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_dim << 4) | (_trans_a)) + +#define HASH_MATRIXMUL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ + CVIVANTE_NAMESPACE("cl.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) + +#define HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ + CVIVANTE_NAMESPACE("cl.gemm_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) + +#define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ + { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0), \ + HASH_MATRIXMUL_SH_KERNEL_NAME(F32, F32, F32, IMAGE_DIM), \ + SOURCE }, + +#define TENSOR_MATRIXMUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ + { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1), \ + HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(F32, F32, F32, IMAGE_DIM), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } matrixmul_map[] = +{ + TENSOR_MATRIXMUL_KERNELS(F16, F16, F16, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_KERNELS(F16, F16, F16, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSA_KERNELS(F16, F16, F16, _2D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSA_KERNELS(F16, F16, F16, _3D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_2) +}; + +/* + * Kernel params + */ +static vx_param_description_t _matrixmul_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _MATRIXMUL_PARAM_NUM _cnt_of_array(_matrixmul_kernel_param_def) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_matrixmul_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + width = attr[0]->shape->data[0]; + height = attr[0]->shape->data[0]; + chn = attr[0]->shape->size > 2 ? 
attr[0]->shape->data[2] : 1; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = gpu_align_p2((height + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1], 4); + gpu_param.global_size[2] = chn; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _matrixmul_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t depth, + int32_t transa + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e input1_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_kernel_image_dim_type_e dim_type = _2D; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if(depth > 1) + { + dim_type = _3D; + } + + key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa ); + + for( i = 0; i < _cnt_of_array(matrixmul_map); i ++ ) + { + if( matrixmul_map[i].key == key ) + { + break; + } + } + + if( i < _cnt_of_array(matrixmul_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", matrixmul_map[i].function_name ); + kernel->info.parameters = _matrixmul_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _matrixmul_kernel_param_def ); + kernel->info.initialize = _matrixmul_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + matrixmul_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + matrixmul_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_MATRIXMUL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" ); + int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" ); + uint32_t M = inputs[0]->attr.size[1]; + uint32_t K = inputs[0]->attr.size[0]; + uint32_t N = inputs[1]->attr.size[0]; + uint32_t depth = outputs[0]->attr.dim_num > 2 ? 
outputs[0]->attr.size[2] : 1; + uint32_t ac2zero = 0; + uint32_t bc2zero = 0; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + if(transposeB) + { + return NULL; + } + + if(transposeA) + { + K = inputs[0]->attr.size[1]; + M = inputs[0]->attr.size[0]; + } + + if((inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) || + (inputs[0]->attr.size[2] > inputs[1]->attr.size[2] + && inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2)) + { + bc2zero = 1; + } + else if((inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) || + (inputs[1]->attr.size[2] > inputs[0]->attr.size[2] + && inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2)) + { + ac2zero = 1; + } + + status = _query_kernel( kernel, inputs, outputs, depth, transposeA ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 3; + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( node_params, _MATRIXMUL_PARAM_NUM, + inputs, 2, outputs, 1 ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &M ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &K ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &N ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ac2zero ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &bc2zero ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _MATRIXMUL_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( matrixmul, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c new file mode 100644 index 0000000..d4ad975 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c @@ -0,0 +1,297 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "math.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define KERNEL_SOURCE_1 "maximum", +#define KERNEL_SOURCE_2 "maximum_fp16", +#define KERNEL_SOURCE_3 "maximum_i16" + +#define HASH_MAXIMUM_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) + +#define HASH_MAXIMUM_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.maximum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) + +#define TENSOR_MAX_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MAXIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + HASH_MAXIMUM_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_MAX_KERNELS_FLOAT(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MAXIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + HASH_MAXIMUM_SH_KERNEL_NAME(FP32, FP32, FP32), \ + SOURCE }, + + +#define HASH_MAXIMUM_SH_KERNEL_2D_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.maximum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_MAX_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MAXIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + HASH_MAXIMUM_SH_KERNEL_2D_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_MAX_KERNELS_2D_FLOAT(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MAXIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + HASH_MAXIMUM_SH_KERNEL_2D_NAME(FP32, FP32, FP32), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_MAX_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS_FLOAT(F16, F16, F16, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS(I32, I32, I32, KERNEL_SOURCE_1) + + TENSOR_MAX_KERNELS_2D_FLOAT(F32, F32, F32, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS_2D_FLOAT(F16, F16, F16, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS_2D(I32, I32, I32, KERNEL_SOURCE_1) +}; + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define SCALAR_INPUT0_SCALE (3) +#define SCALAR_INPUT0_TAIL (4) +#define SCALAR_INPUT1_SCALE (5) +#define SCALAR_INPUT1_TAIL (6) +#define SCALAR_OUTPUT_SCALE (7) +#define SCALAR_OUTPUT_ZP (8) +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_maximum_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 
0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_shape = attr[2]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} /* _maximum_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input0_dtype; + vsi_nn_kernel_dtype_e input1_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_MAXIMUM_KEY( input0_dtype, input1_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _maximum_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + + float input0Scale = inputs[0]->attr.dtype.scale; + float input0Tail = (float)inputs[0]->attr.dtype.zero_point * input0Scale; + float input1Scale = inputs[1]->attr.dtype.scale; + float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; + float outputScale = outputs[0]->attr.dtype.scale; + 
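/* Quantization folding (illustrative sketch; the exact kernel-side formula is an
 * assumption): a U8 input dequantizes as real = (q - zero_point) * scale, which the
 * kernel can evaluate as q * inputScale - inputTail using the scalars computed above.
 * On the output side the result is requantized as q = real * outputScale + outputZP,
 * so outputScale is inverted below (with a guard against a near-zero scale) and the
 * 0.5f added to the zero point bakes round-to-nearest into the float-to-int cast.
 * E.g. scale = 0.5, zero_point = 128: real 3.0 maps to 3.0 * 2.0 + 128.5 -> 134.
 */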
float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + + outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2); + status = _query_kernel( inputs, outputs, image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if( node ) + { + vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, + inputs, 2, outputs, 1 ); + node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &input0Scale ); + node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &input0Tail ); + node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &input1Scale ); + node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &input1Tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &outputZP ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( maximum, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c new file mode 100644 index 0000000..d4f05cb --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c @@ -0,0 +1,296 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define KERNEL_SOURCE_1 "minimum", +#define KERNEL_SOURCE_2 "minimum_fp16", +#define KERNEL_SOURCE_3 "minimum_i16" + +#define HASH_MINIMUM_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) + +#define HASH_MINIMUM_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.minimum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) + +#define TENSOR_MIN_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + HASH_MINIMUM_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_MIN_KERNELS_FLOAT(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + HASH_MINIMUM_SH_KERNEL_NAME(FP32, FP32, FP32), \ + SOURCE }, + + +#define HASH_MINIMUM_SH_KERNEL_2D_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.minimum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_MIN_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + HASH_MINIMUM_SH_KERNEL_2D_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_MIN_KERNELS_2D_FLOAT(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + HASH_MINIMUM_SH_KERNEL_2D_NAME(FP32, FP32, FP32), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_MIN_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS_FLOAT(F16, F16, F16, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS(I32, I32, I32, KERNEL_SOURCE_1) + + TENSOR_MIN_KERNELS_2D_FLOAT(F32, F32, F32, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS_2D_FLOAT(F16, F16, F16, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS_2D(I32, I32, I32, KERNEL_SOURCE_1) +}; + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define SCALAR_INPUT0_SCALE (3) +#define SCALAR_INPUT0_TAIL (4) +#define SCALAR_INPUT1_SCALE (5) +#define SCALAR_INPUT1_TAIL (6) +#define SCALAR_OUTPUT_SCALE (7) +#define SCALAR_OUTPUT_ZP (8) +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_minimum_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + 
{0, 0, 0} + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_shape = attr[2]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} /* _minimum_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input0_dtype; + vsi_nn_kernel_dtype_e input1_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_MINIMUM_KEY( input0_dtype, input1_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _minimum_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + + float input0Scale = inputs[0]->attr.dtype.scale; + float input0Tail = (float)inputs[0]->attr.dtype.zero_point * input0Scale; + float input1Scale = inputs[1]->attr.dtype.scale; + float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; + float outputScale = outputs[0]->attr.dtype.scale; + float outputZP = 
(float)outputs[0]->attr.dtype.zero_point + 0.5f; + + outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2); + status = _query_kernel( inputs, outputs, image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if( node ) + { + vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, + inputs, 2, outputs, 1 ); + node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &input0Scale ); + node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &input0Tail ); + node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &input1Scale ); + node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &input1Tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &outputZP ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( minimum, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/moments_cl.c b/src/tim/vx/internal/src/kernel/cl/moments_cl.c new file mode 100644 index 0000000..8a72060 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/moments_cl.c @@ -0,0 +1,481 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + AXIS = 0, + AXIS_NUM, + ZP, + SCALE, + WIDTH, + HEIGHT, + CHN, + DIMRATIO +} vsi_nn_kernel_param_id_e; + +#define KERNEL_SOURCE_1 "moments_axis0" +#define KERNEL_SOURCE_2 "moments_axis1" +#define KERNEL_SOURCE_3 "moments_axis2" +#define KERNEL_SOURCE_4 "moments_axis01" +#define KERNEL_SOURCE_5 "moments_axis012" + +// Add kernel hashtable here +#define HASH_MOMENTS_KEY(_input0_type, _output_type, _axis_num, _axis0, _axis1, _axis2, _image_2d) \ + ((_input0_type<<24) | (_output_type<<20) | (_axis_num<<16) | (_axis0<<12) | (_axis1<<8) | (_axis2<<4)|(_image_2d)) + +#define HASH_MOMENTS_SH_KERNEL_NAME(AXIS0, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.moments_axis"#AXIS0"_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_MOMENTS_TWO_AXIS_SH_KERNEL_NAME(AXIS0, AXIS1, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.moments_axis"#AXIS0#AXIS1"_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_MOMENTS_THREE_AXIS_SH_KERNEL_NAME(AXIS0, AXIS1, AXIS2, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.moments_axis"#AXIS0#AXIS1#AXIS2"_"#SRC0_TYPE"to"#DST_TYPE) + +#define TENSOR_MOMENTS_KERNELS(IN0_TYPE, OUT_TYPE, AXIS0, SOURCE) \ + { HASH_MOMENTS_KEY(IN0_TYPE, OUT_TYPE, 1, AXIS0, 0, 0, 0), \ + HASH_MOMENTS_SH_KERNEL_NAME(AXIS0, IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_MOMENTS_TWO_AXIS_KERNELS(IN0_TYPE, OUT_TYPE, AXIS0, AXIS1, SOURCE) \ + { HASH_MOMENTS_KEY(IN0_TYPE, OUT_TYPE, 2, AXIS0, AXIS1, 0, 0), \ + HASH_MOMENTS_TWO_AXIS_SH_KERNEL_NAME(AXIS0, AXIS1, IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_MOMENTS_THREE_AXIS_KERNELS(IN0_TYPE, OUT_TYPE, AXIS0, AXIS1, AXIS2, SOURCE) \ + { HASH_MOMENTS_KEY(IN0_TYPE, OUT_TYPE, 3, AXIS0, AXIS1, AXIS2, 0), \ + HASH_MOMENTS_THREE_AXIS_SH_KERNEL_NAME(AXIS0, AXIS1, AXIS2, IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type moments_map[] = +{ + // Register kernel here + TENSOR_MOMENTS_KERNELS(U8, F16, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS(F16, F16, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS(F32, F32, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS(I32, F32, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS(U8, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS(F16, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS(F32, F32, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS(I32, F32, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS(U8, F16, 2, KERNEL_SOURCE_3) + TENSOR_MOMENTS_KERNELS(F16, F16, 2, KERNEL_SOURCE_3) + TENSOR_MOMENTS_KERNELS(F32, F32, 2, KERNEL_SOURCE_3) + TENSOR_MOMENTS_KERNELS(I32, F32, 2, KERNEL_SOURCE_3) + TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, F16, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS(F16, F16, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS(F32, F32, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS(I32, F32, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F16, 0, 1, 2, KERNEL_SOURCE_5) + TENSOR_MOMENTS_THREE_AXIS_KERNELS(F16, F16, 0, 1, 2, KERNEL_SOURCE_5) + TENSOR_MOMENTS_THREE_AXIS_KERNELS(F32, F32, 0, 1, 
2, KERNEL_SOURCE_5) + TENSOR_MOMENTS_THREE_AXIS_KERNELS(I32, F32, 0, 1, 2, KERNEL_SOURCE_5) +}; + +/* + * Kernel params + */ +static vx_param_description_t _moments_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kernel parameters here +}; +#define _MOMENTS_PARAM_NUM _cnt_of_array( _moments_kernel_param_def ) + +static int32_t set_constant_border + ( + vsi_nn_kernel_node_t node, + int32_t value + ) +{ + vsi_status status = VSI_FAILURE; + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.S32 = value; + border.constant_value.U32 = (vx_uint32)value; + border.constant_value.S16 = (vx_int16)value; + border.constant_value.U8 = (vx_uint8)value; + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + return status; +} + +static int32_t get_moments_output_reshape_size + ( + vsi_nn_tensor_t ** outputs, + int32_t sizes[VSI_NN_MAX_DIM_NUM], + int32_t* axis, + int32_t axis_num + ) +{ + uint32_t out_dims_num = outputs[0]->attr.dim_num; + uint32_t *output_size = outputs[0]->attr.size; + uint32_t i = 0; + int32_t out_rs_flg = 0; + + for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + { + sizes[i] = 1; + } + sizes[3] = out_dims_num > 3 ? output_size[3] : 1; + + if(axis_num == 1 && axis[0] == 0) + { + sizes[0] = output_size[1]; + sizes[1] = out_dims_num > 2 ? output_size[2] : 1; + out_rs_flg = 1; + } + else if(axis_num == 1 && axis[0] == 1) + { + sizes[0] = output_size[0]; + sizes[1] = out_dims_num > 2 ? output_size[2] : 1; + out_rs_flg = 1; + } + else if(axis_num == 2 && axis[0] == 0 && axis[1] == 1) + { + sizes[0] = out_dims_num > 2 ? output_size[2] : 1; + out_rs_flg = 1; + } + + return out_rs_flg; +} /* get_moments_output_reshape_size() */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_moments_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_int_array_t * input_shape = NULL; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + int32_t axis = 0; + int32_t axis_num = 1; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis_num); + CHECK_STATUS_FAIL_GOTO(status, final ); + + input_shape = attr[0]->shape; + width = input_shape->data[0]; + height = input_shape->data[1]; + chn = input_shape->size > 2 ? 
input_shape->data[2] : 1; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + if(axis_num == 1 && axis == 0) + { + gpu_param.global_size[0] = gpu_align_p2((height + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = chn; + } + else if(axis_num == 1 && axis == 1) + { + gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = chn; + } + else if(axis_num == 1 && axis == 2) + { + gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = height; + } + else if(axis_num == 2) + { + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.local_size[2] = 1; + gpu_param.global_size[0] = 16; + gpu_param.global_size[1] = chn; + } + else if(axis_num == 3) + { + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.local_size[2] = 1; + gpu_param.global_size[0] = 16; + gpu_param.global_size[1] = 1; + } + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _moments_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params, + int32_t* axis, + int32_t axis_num, + int32_t rs_flg + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_MOMENTS_KEY( input0_dtype, output_dtype, axis_num, axis[0], axis[1], axis[2], rs_flg ); + + for( i = 0; i < _cnt_of_array(moments_map); i ++ ) + { + if( moments_map[i].key == key ) + { + break; + } + } + + if( i < _cnt_of_array(moments_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", moments_map[i].function_name ); + kernel->info.parameters = _moments_kernel_param_def; + kernel->info.numParams = _MOMENTS_PARAM_NUM; + kernel->info.initialize = _moments_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + moments_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + moments_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ +#define INTERNAL_MOMENTS_SCALAR_NUM (8) + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_MOMENTS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t out_shape[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t out_rs_flg = 0; + int32_t axis_num = 0; + int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", (size_t*)&axis_num); + int32_t keep_dim = vsi_nn_kernel_param_get_int32( params, "keep_dim" ); + int32_t first_axis = axis[0]; + int32_t i = 0; + vsi_nn_kernel_scalar_t 
scalar_list[INTERNAL_MOMENTS_SCALAR_NUM] = {NULL}; + + int32_t width = inputs[0]->attr.size[0]; + int32_t height = inputs[0]->attr.size[1]; + int32_t chn = inputs[0]->attr.size[2]; + int32_t input_zp = inputs[0]->attr.dtype.zero_point; + float input_scale = inputs[0]->attr.dtype.scale; + float dim_ratio = (float)1.0 / (float)(width * height); + + if(inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + if (inputs[0]->attr.dtype.fl > 0) + { + input_scale = (1.0f / ((float) ((int64_t)1 << inputs[0]->attr.dtype.fl))); + } + else + { + input_scale = ((float) ((int64_t)1 << -inputs[0]->attr.dtype.fl)); + } + input_zp = 0; + } + + if(axis_num == 1 && axis[0] == 0) + { + dim_ratio = (float)1.0 / (float)(width); + } + else if(axis_num == 1 && axis[0] == 1) + { + dim_ratio = (float)1.0 / (float)(height); + } + else if(axis_num == 1 && axis[0] == 2) + { + dim_ratio = (float)1.0 / (float)(chn); + } + else if(axis_num == 2 && axis[0] == 0 && axis[1] == 1) + { + dim_ratio = (float)1.0 / (float)(width * height); + } + else if(axis_num == 3) + { + dim_ratio = (float)1.0 / (float)(width * height * chn); + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + if(keep_dim) + { + out_rs_flg = get_moments_output_reshape_size(&outputs[0], out_shape, axis, axis_num); + } + + scalar_list[AXIS] = vsi_nn_kernel_scalar_create( graph, I32, &first_axis ); + scalar_list[AXIS_NUM] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); + scalar_list[ZP] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp ); + scalar_list[SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + scalar_list[WIDTH] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + scalar_list[HEIGHT] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + scalar_list[CHN] = vsi_nn_kernel_scalar_create( graph, I32, &chn ); + scalar_list[DIMRATIO] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio ); + + status = _query_kernel( inputs, outputs, kernel, params, axis, axis_num, 0 ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 0; + /* Pass parameters to node. */ + node_params[index++] = (vsi_nn_kernel_node_param_t)(inputs[0]->t); + if(out_rs_flg) + { + node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, out_shape, 4 ); + node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[1]->t, out_shape, 4 ); + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)(outputs[0]->t); + node_params[index++] = (vsi_nn_kernel_node_param_t)(outputs[1]->t); + } + node_params[index++] = scalar_list[AXIS]; + node_params[index++] = scalar_list[AXIS_NUM]; + node_params[index++] = scalar_list[ZP]; + node_params[index++] = scalar_list[SCALE]; + node_params[index++] = scalar_list[WIDTH]; + node_params[index++] = scalar_list[HEIGHT]; + node_params[index++] = scalar_list[CHN]; + node_params[index++] = scalar_list[DIMRATIO]; + status = vsi_nn_kernel_node_pass_param( node, node_params, _MOMENTS_PARAM_NUM ); + CHECK_STATUS(status); + if(out_rs_flg) + { + vsi_nn_kernel_tensor_release( &node_params[1] ); + vsi_nn_kernel_tensor_release( &node_params[2] ); + } + status = set_constant_border(node, inputs[0]->attr.dtype.zero_point); + CHECK_STATUS(status); + } + } + + /* Pass parameters to node. 
*/ + for( i = 0; i < INTERNAL_MOMENTS_SCALAR_NUM; i ++ ) + { + if(scalar_list[i]) + { + vsi_nn_kernel_scalar_release( &scalar_list[i] ); + } + } +#undef INTERNAL_MOMENTS_SCALAR_NUM + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( moments, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c b/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c new file mode 100644 index 0000000..5c1363c --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c @@ -0,0 +1,315 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + + +#define _POOLWITHARGMAX_KERNEL_SOURCE "poolwithargmax" + +#define POOLWITHARGMAX_HASH_KEY( IN_DTYPE, OUT0_DTYPE, OUT1_DTYPE, _image_2d ) \ + ((IN_DTYPE << 20) | (OUT0_DTYPE << 12) | (OUT1_DTYPE << 4) | (_image_2d)) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT0_DTYPE, OUT1_DTYPE ) \ + { POOLWITHARGMAX_HASH_KEY( IN_DTYPE, OUT0_DTYPE, OUT1_DTYPE, 0 ), \ + CVIVANTE_NAMESPACE("cl.poolwithargmax_"#IN_DTYPE"to_"#OUT0_DTYPE"_"#OUT1_DTYPE), \ + _POOLWITHARGMAX_KERNEL_SOURCE } + +#define PACK_KERNEL_MAP_2D( IN_DTYPE, OUT0_DTYPE, OUT1_DTYPE ) \ + { POOLWITHARGMAX_HASH_KEY( IN_DTYPE, OUT0_DTYPE, OUT1_DTYPE, 1 ), \ + CVIVANTE_NAMESPACE("cl.poolwithargmax_"#IN_DTYPE"to_"#OUT0_DTYPE"_"#OUT1_DTYPE"_2D"), \ + _POOLWITHARGMAX_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _poolwithargmax_kernel_map[] = +{ + PACK_KERNEL_MAP( F32, F32, U8 ), + PACK_KERNEL_MAP( F32, U8, U8 ), + PACK_KERNEL_MAP( U8, F32, U8 ), + PACK_KERNEL_MAP( U8, U8, U8 ), + PACK_KERNEL_MAP( I32, I32, U8 ), + PACK_KERNEL_MAP_2D( F32, F32, U8 ), + PACK_KERNEL_MAP_2D( F32, U8, U8 ), + PACK_KERNEL_MAP_2D( U8, F32, U8 ), + PACK_KERNEL_MAP_2D( U8, U8, U8 ), + PACK_KERNEL_MAP_2D( I32, I32, U8 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _poolwithargmax_kernel_param_def[] = +{ + {VX_INPUT, 
VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _POOLWITHARGMAX_PARAM_NUM _cnt_of_array( _poolwithargmax_kernel_param_def ) + +#define SCALAR_SCALE (3) +#define SCALAR_TAIL (4) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vx_status status = VX_FAILURE; + vx_tensor output = (vx_tensor)param[1]; + vsi_nn_kernel_tensor_attr_t * attr_out = NULL; + vsi_int_array_t * out_shape = NULL; + vsi_bool image_2d = FALSE; + + attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + out_shape = attr_out->shape; + image_2d = (vsi_bool)(out_shape->size < 3 || 1 == out_shape->data[2]); + + gpu_param.dim = image_2d ? 2 : 3; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = image_2d ? 1 : out_shape->data[2]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (attr_out) + { + vsi_nn_kernel_tensor_attr_release(&attr_out); + } + + return status; +} /* _poolwithargmax_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d, + vsi_bool *is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out0_dtype; + vsi_nn_kernel_dtype_e out1_dtype; + const _kernel_map_type * kernel_map = _poolwithargmax_kernel_map; + size_t kernel_map_size = _cnt_of_array( _poolwithargmax_kernel_map ); + vx_param_description_t * param_def = _poolwithargmax_kernel_param_def; + size_t param_def_size = _cnt_of_array( _poolwithargmax_kernel_param_def ); + vx_kernel_initialize_f initializer = _poolwithargmax_initializer; + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out0_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + out1_dtype = vsi_nn_kernel_map_dtype( outputs[1]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + + if (F16 == out0_dtype) + { + out0_dtype = F32; + } + + if ((U8 != in_dtype) && (U8 != out0_dtype)) + { + *is_use_u8_kernel = FALSE; + param_def_size = param_def_size - 2; + } + else + { + *is_use_u8_kernel = TRUE; + } + + key = POOLWITHARGMAX_HASH_KEY( in_dtype, out0_dtype, out1_dtype, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + 
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_POOLWITHARGMAX_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t ksize_x = 0; + int32_t ksize_y = 0; + int32_t stride_x = 0; + int32_t stride_y = 0; + int32_t pad_x = 0; + int32_t pad_y = 0; + vsi_bool image_2d = FALSE; + float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; + float outputTail = (float)outputs[0]->attr.dtype.zero_point; + float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; + float inputTail = (float)inputs[0]->attr.dtype.zero_point; + float scale_value = 1.0f; + float tail_value = 0.0f; + vsi_bool is_use_u8_kernel = FALSE; + + ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x"); + ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y"); + stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x"); + stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y"); + pad_x = vsi_nn_kernel_param_get_int32(params, "pad_x"); + pad_y = vsi_nn_kernel_param_get_int32(params, "pad_y"); + + if ((2 != ksize_x) || (2 != ksize_y) || (2 != stride_x) || (2 != stride_y) || (0 != pad_x) || (0 != pad_y)) + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[1]->attr.size, + outputs[1]->attr.dim_num )) + { + return NULL; + } + + scale_value = inputScale / outputScale; + tail_value = outputTail - inputTail * inputScale / outputScale; + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, image_2d, &is_use_u8_kernel); + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + size_t node_params_num = _POOLWITHARGMAX_PARAM_NUM - 2; + + if (is_use_u8_kernel) + { + node_params[SCALAR_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &scale_value ); + node_params[SCALAR_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &tail_value ); + node_params_num = _POOLWITHARGMAX_PARAM_NUM; + } + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, node_params_num, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + if (is_use_u8_kernel) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL] ); + } + + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( poolwithargmax, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/pow_cl.c b/src/tim/vx/internal/src/kernel/cl/pow_cl.c new file mode 100644 index 0000000..dea0bb0 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/pow_cl.c @@ -0,0 +1,251 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define KERNEL_SOURCE_1 "pow" + +#define HASH_POW_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) + +#define HASH_POW_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.pow_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) + +#define TENSOR_POW_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + HASH_POW_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_POW_KERNELS_FLOAT(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + HASH_POW_SH_KERNEL_NAME(FP32, FP32, FP32), \ + SOURCE }, + + +#define HASH_POW_SH_KERNEL_2D_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.pow_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_POW_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + HASH_POW_SH_KERNEL_2D_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_POW_KERNELS_2D_FLOAT(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + HASH_POW_SH_KERNEL_2D_NAME(FP32, FP32, FP32), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } pow_map[] = +{ + TENSOR_POW_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1) + TENSOR_POW_KERNELS_FLOAT(F16, F16, F16, KERNEL_SOURCE_1) + + TENSOR_POW_KERNELS_2D_FLOAT(F32, F32, F32, KERNEL_SOURCE_1) + TENSOR_POW_KERNELS_2D_FLOAT(F16, F16, F16, KERNEL_SOURCE_1) +}; + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_pow_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_shape = attr[2]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} /* _pow_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input0_dtype; + vsi_nn_kernel_dtype_e input1_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_POW_KEY( input0_dtype, input1_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(pow_map); i ++ ) + { + if( pow_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(pow_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pow_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _pow_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + pow_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + pow_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2); + status = _query_kernel( inputs, outputs, image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if( node ) + { + vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, + inputs, 2, outputs, 1 ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( pow, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c new file mode 100644 index 0000000..488eed9 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c @@ -0,0 +1,332 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define KERNEL_SOURCE_1 "prelu", + +#define HASH_MINIMUM_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) + +#define HASH_MINIMUM_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.prelu_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) + +#define PRELU_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + HASH_MINIMUM_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +#define PRELU_KERNELS_FLOAT(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + HASH_MINIMUM_SH_KERNEL_NAME(FP32, FP32, FP32), \ + SOURCE }, + + +#define HASH_MINIMUM_SH_KERNEL_2D_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.prelu_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_2D") + +#define PRELU_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + HASH_MINIMUM_SH_KERNEL_2D_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +#define PRELU_KERNELS_2D_FLOAT(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + HASH_MINIMUM_SH_KERNEL_2D_NAME(FP32, FP32, FP32), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + PRELU_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1) + PRELU_KERNELS_FLOAT(F16, F16, F16, KERNEL_SOURCE_1) + PRELU_KERNELS(U8, U8, U8, KERNEL_SOURCE_1) + PRELU_KERNELS(I32, I32, I32, KERNEL_SOURCE_1) + + PRELU_KERNELS_2D_FLOAT(F32, F32, F32, KERNEL_SOURCE_1) + PRELU_KERNELS_2D_FLOAT(F16, F16, F16, KERNEL_SOURCE_1) + PRELU_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_1) + PRELU_KERNELS_2D(I32, I32, I32, KERNEL_SOURCE_1) +}; + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define SCALAR_INPUT0_SCALE (3) +#define SCALAR_INPUT0_TAIL (4) +#define SCALAR_INPUT1_SCALE (5) +#define SCALAR_INPUT1_TAIL (6) +#define SCALAR_OUTPUT_SCALE (7) +#define SCALAR_OUTPUT_ZP (8) +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_prelu_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", 
final ); + + out_shape = attr[2]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} /* _prelu_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input0_dtype; + vsi_nn_kernel_dtype_e input1_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_MINIMUM_KEY( input0_dtype, input1_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _prelu_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t new_rank = 0; + vsi_bool ret; + + float input0Scale = inputs[0]->attr.dtype.scale; + float input0Tail = (float)inputs[0]->attr.dtype.zero_point * input0Scale; + float input1Scale = inputs[1]->attr.dtype.scale; + float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; + float outputScale = outputs[0]->attr.dtype.scale; + float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + int32_t is_per_channel_alpha = 0; + + is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha"); + + if (is_per_channel_alpha) + { + return NULL; + } + + outputScale = vsi_abs(outputScale) < 1e-5 ? 
0.0f : 1.0f / outputScale; + + ret = vsi_nn_kernel_optimize_eltwise_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + (int32_t *)inputs[1]->attr.size, inputs[1]->attr.dim_num, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if (ret) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], (uint32_t*)shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes[2], new_rank ); + } + else + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[2]->attr.size, + reshape_tensors[2]->attr.dim_num ) ) + { + goto final; + } + + image_2d = (outputs[0]->attr.dim_num == 2); + status = _query_kernel( reshape_tensors, &reshape_tensors[2], image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if( node ) + { + vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, + reshape_tensors, 2, &reshape_tensors[2], 1 ); + node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &input0Scale ); + node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &input0Tail ); + node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &input1Scale ); + node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &input1Tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &outputZP ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + } + +final: + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + vsi_nn_ReleaseTensor( &reshape_tensors[2] ); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( prelu, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c b/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c new file mode 100644 index 0000000..e209157 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c @@ -0,0 +1,463 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +typedef enum +{ + INTERNAL_CL_KERNEL_SEED, + INTERNAL_CL_KERNEL_CDF, + INTERNAL_CL_KERNEL_MULTINOMIAL, +} _internal_kernel_e; + +/* + * Define kernel meta. + */ +#define _MULTINOMIAL_KERNEL_SOURCE "random_multinomial" +#define _MULTINOMIAL_KERNEL_NAME CVIVANTE_NAMESPACE("cl.random_multinomial") +#define _CDF_KERNEL_SOURCE "random_multinomial" +#define _SEED_KERNEL_SOURCE "random_multinomial" +#define _SEED_KERNEL_NAME CVIVANTE_NAMESPACE("cl.random_seed") + +// Add kernel hashtable here +#define MULTINOMIAL_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + ((IN0_DTYPE << 16) | ( IN1_DTYPE << 8 ) | ( OUT_DTYPE )) +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ + { MULTINOMIAL_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), \ + _MULTINOMIAL_KERNEL_NAME, SOURCE } + +#define CDF_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) +#define CDF_PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, SOURCE ) \ + { CDF_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.random_multinomial_cdf_"#IN_DTYPE), \ + SOURCE } + +#define SEED_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) +#define SEED_PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, SOURCE ) \ + { SEED_HASH_KEY( IN_DTYPE, OUT_DTYPE ), _SEED_KERNEL_NAME, SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _seed_kernel_map[] = +{ + // Register kernel here + SEED_PACK_KERNEL_MAP( I32, F32, _SEED_KERNEL_SOURCE ), +}; + +static const _kernel_map_type _cdf_kernel_map[] = +{ + // Register kernel here + CDF_PACK_KERNEL_MAP( F16, F32, _CDF_KERNEL_SOURCE ), + CDF_PACK_KERNEL_MAP( F32, F32, _CDF_KERNEL_SOURCE ), +}; + +static const _kernel_map_type _kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32, I32, _MULTINOMIAL_KERNEL_SOURCE ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _PARAM_NUM _cnt_of_array( _kernel_param_def ) + +static vx_param_description_t _cdf_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _CDF_PARAM_NUM _cnt_of_array( _cdf_kernel_param_def ) + +static vx_param_description_t _seed_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + 
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define SCALAR_ITER_NUM (2) +#define SCALAR_RE_RAND_MAX (3) +#define _SEED_PARAM_NUM _cnt_of_array( _seed_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_multinomial_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr = NULL; + vsi_int_array_t * in_shape = NULL; + + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); + + in_shape = attr->shape; + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (in_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = in_shape->data[1]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + if (attr) + { + vsi_nn_kernel_tensor_attr_release( &attr ); + attr = NULL; + } + + return status; +} /* _multinomial_initializer() */ + +DEF_KERNEL_INITIALIZER(_cdf_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr = NULL; + vsi_int_array_t * in_shape = NULL; + uint32_t batch = 0; + + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); + + in_shape = attr->shape; + batch = in_shape->data[1]; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = 1; + gpu_param.global_size[1] = batch; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + if (attr) + { + vsi_nn_kernel_tensor_attr_release( &attr ); + attr = NULL; + } + + return status; +} /* _cdf_initializer() */ + +DEF_KERNEL_INITIALIZER(_seed_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = 1; + gpu_param.global_size[1] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + return status; +} /* _seed_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + const uint32_t hashkey, + _internal_kernel_e kernel_id + /* Add extra params */ + ) +{ + vx_kernel_initialize_f initializer = NULL; + vx_param_description_t * param_def; + vsi_status status = VSI_FAILURE; + const _kernel_map_type* kernel_map; + size_t kernel_map_size; + size_t param_size; + uint32_t i; + + switch( kernel_id ) + { + case INTERNAL_CL_KERNEL_SEED: + initializer = _seed_initializer; + kernel_map = _seed_kernel_map; + kernel_map_size = _cnt_of_array( _seed_kernel_map ); + param_def = _seed_kernel_param_def; + 
param_size = _SEED_PARAM_NUM; + break; + case INTERNAL_CL_KERNEL_CDF: + initializer = _cdf_initializer; + kernel_map = _cdf_kernel_map; + kernel_map_size = _cnt_of_array( _cdf_kernel_map ); + param_def = _cdf_kernel_param_def; + param_size = _CDF_PARAM_NUM; + break; + case INTERNAL_CL_KERNEL_MULTINOMIAL: + initializer = _multinomial_initializer; + kernel_map = _kernel_map; + kernel_map_size = _cnt_of_array( _kernel_map ); + param_def = _kernel_param_def; + param_size = _PARAM_NUM; + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == hashkey ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_size; + kernel->info.initialize = initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ +#define INTERNAL_KERNEL_SIZE (3) +#define SEED_INDEX (0) +#define CDF_INDEX (1) +#define SEEDS_INDEX (2) + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t cdf_node_params[_CDF_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t seed_node_params[_SEED_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; + uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; + uint32_t hashkey = 0; + int32_t i; + uint32_t iteration = (outputs[0]->attr.size[0] + 3) / 4; + float rand_max = (float)(pow(2.0,32)); + float re_rand_max = 1 / rand_max; + + // Check if gpu can support the size + if( !vsi_nn_kernel_gpu_check_shape( + (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL ); + // Assign unique_id + ikernels[i]->unique_id = kernel->unique_id; + } + + memcpy( &attr, &(outputs[0]->attr), sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + tensors[SEED_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + + attr.size[0] = inputs[0]->attr.size[0]; + attr.size[1] = inputs[0]->attr.size[1]; + attr.dim_num = 2; + tensors[CDF_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + + memcpy( &attr, &(inputs[1]->attr), sizeof(vsi_nn_tensor_attr_t) ); + attr.size[1] = 1; + attr.dim_num = 2; + tensors[SEEDS_INDEX] = vsi_nn_reshape_tensor( graph, + inputs[1], (uint32_t*)attr.size, attr.dim_num ); + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + hashkeys[SEED_INDEX]= SEED_HASH_KEY( in1_dtype, F32 ); + 
hashkeys[CDF_INDEX] = CDF_HASH_KEY( in0_dtype, F32 ); + hashkey = MULTINOMIAL_HASH_KEY( F32, F32, out_dtype ); + + status = _query_kernel( ikernels[SEED_INDEX], hashkeys[SEED_INDEX], INTERNAL_CL_KERNEL_SEED ); + if( VSI_SUCCESS != status ) + { + goto final; + } + status = _query_kernel( ikernels[CDF_INDEX], hashkeys[CDF_INDEX], INTERNAL_CL_KERNEL_CDF ); + if( VSI_SUCCESS != status ) + { + goto final; + } + status = _query_kernel( kernel, hashkey, INTERNAL_CL_KERNEL_MULTINOMIAL ); + if( VSI_SUCCESS != status ) + { + goto final; + } + + // Seed + node = vsi_nn_kernel_create_node( graph, ikernels[SEED_INDEX] ); + VSI_ASSERT( node != NULL ); + vsi_nn_kernel_node_pack_io( seed_node_params, _SEED_PARAM_NUM, + &tensors[SEEDS_INDEX], 1, &tensors[SEED_INDEX], 1 ); + seed_node_params[SCALAR_ITER_NUM] = vsi_nn_kernel_scalar_create( graph, I32, &iteration ); + seed_node_params[SCALAR_RE_RAND_MAX] = vsi_nn_kernel_scalar_create( graph, F32, &re_rand_max ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, seed_node_params, _SEED_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &seed_node_params[SCALAR_ITER_NUM] ); + vsi_nn_kernel_scalar_release( &seed_node_params[SCALAR_RE_RAND_MAX] ); + vsi_nn_kernel_node_release( &node ); + + // CDF + node = vsi_nn_kernel_create_node( graph, ikernels[CDF_INDEX] ); + VSI_ASSERT( node != NULL ); + vsi_nn_kernel_node_pack_io( cdf_node_params, _CDF_PARAM_NUM, + &inputs[0], 1, &tensors[CDF_INDEX], 1 ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, cdf_node_params, _CDF_PARAM_NUM ); + + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_node_release( &node ); + + // Multinomial + node = vsi_nn_kernel_create_node( graph, kernel ); + VSI_ASSERT( node != NULL ); + vsi_nn_kernel_node_pack_io( node_params, _PARAM_NUM, tensors, 2, outputs, 1 ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + + /* Pass parameters to node. */ +final: + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + if( ikernels[i] ) + { + vsi_nn_kernel_release( &ikernels[i] ); + } + if( tensors[i] ) + { + vsi_nn_ReleaseTensor( &tensors[i] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( random_multinomial, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c new file mode 100644 index 0000000..09d16b8 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c @@ -0,0 +1,247 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define HASH_REDUCEALL_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + #define HASH_REDUCEALL_KERNEL_SOURCE_NAME(AXIS) \ + "reduceall_internal_axis"#AXIS + +#define HASH_REDUCEALL_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEALL_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("cl.reduceall_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ + HASH_REDUCEALL_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_REDUCEALL_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEALL_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("cl.reduceall_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + HASH_REDUCEALL_KERNEL_SOURCE_NAME(AXIS) }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _reduceall_internal_kernel_map[] = +{ + HASH_REDUCEALL_KERNELS( 0, I8, I8 ) + HASH_REDUCEALL_KERNELS( 1, I8, I8 ) + HASH_REDUCEALL_KERNELS( 2, I8, I8 ) + + HASH_REDUCEALL_KERNELS_2D( 0, I8, I8 ) + HASH_REDUCEALL_KERNELS_2D( 1, I8, I8 ) + HASH_REDUCEALL_KERNELS_2D( 2, I8, I8 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _reduceall_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _REDUCEALL_INTERNAL_PARAM_NUM _cnt_of_array( _reduceall_internal_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_reduceall_internal_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t * output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + output_shape = output_attr->shape; + + gpu_param.dim = 2; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return status; +} /* _reduceall_internal_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + 
vsi_nn_tensor_t * const * const outputs, + int32_t axis, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _reduceall_internal_kernel_map; + size_t kernel_map_size = _cnt_of_array( _reduceall_internal_kernel_map ); + vx_param_description_t * param_def = _reduceall_internal_kernel_param_def; + size_t param_def_size = _cnt_of_array( _reduceall_internal_kernel_param_def ); + vx_kernel_initialize_f initializer = _reduceall_internal_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if ( BOOL8 == in_dtype ) + { + in_dtype = I8; + } + + if ( BOOL8 == out_dtype ) + { + out_dtype = I8; + } + + key = HASH_REDUCEALL_HASH_KEY( axis, in_dtype, out_dtype, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REDUCEALL_INTERNAL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, axis, image_2d ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _REDUCEALL_INTERNAL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEALL_INTERNAL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( reduceall_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c new file mode 100644 index 0000000..e2d6bf8 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c @@ -0,0 +1,247 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define HASH_REDUCEANY_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + #define HASH_REDUCEANY_KERNEL_SOURCE_NAME(AXIS) \ + "reduceany_internal_axis"#AXIS + +#define HASH_REDUCEANY_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEANY_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("cl.reduceany_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ + HASH_REDUCEANY_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_REDUCEANY_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEANY_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("cl.reduceany_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + HASH_REDUCEANY_KERNEL_SOURCE_NAME(AXIS) }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _reduceany_internal_kernel_map[] = +{ + HASH_REDUCEANY_KERNELS( 0, I8, I8 ) + HASH_REDUCEANY_KERNELS( 1, I8, I8 ) + HASH_REDUCEANY_KERNELS( 2, I8, I8 ) + + HASH_REDUCEANY_KERNELS_2D( 0, I8, I8 ) + HASH_REDUCEANY_KERNELS_2D( 1, I8, I8 ) + HASH_REDUCEANY_KERNELS_2D( 2, I8, I8 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _reduceany_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define 
_REDUCEANY_INTERNAL_PARAM_NUM _cnt_of_array( _reduceany_internal_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_reduceany_internal_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t * output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + output_shape = output_attr->shape; + + gpu_param.dim = 2; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return status; +} /* _reduceany_internal_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _reduceany_internal_kernel_map; + size_t kernel_map_size = _cnt_of_array( _reduceany_internal_kernel_map ); + vx_param_description_t * param_def = _reduceany_internal_kernel_param_def; + size_t param_def_size = _cnt_of_array( _reduceany_internal_kernel_param_def ); + vx_kernel_initialize_f initializer = _reduceany_internal_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if ( BOOL8 == in_dtype ) + { + in_dtype = I8; + } + + if ( BOOL8 == out_dtype ) + { + out_dtype = I8; + } + + key = HASH_REDUCEANY_HASH_KEY( axis, in_dtype, out_dtype, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REDUCEANY_INTERNAL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t axis = 0; + + axis = 
vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, axis, image_2d ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _REDUCEANY_INTERNAL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEANY_INTERNAL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( reduceany_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c new file mode 100644 index 0000000..ee5b0a4 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c @@ -0,0 +1,276 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define HASH_REDUCEMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + #define HASH_REDUCEMAX_KERNEL_SOURCE_NAME(AXIS) \ + "reducemax_internal_axis"#AXIS + +#define HASH_REDUCEMAX_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("cl.reducemax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ + HASH_REDUCEMAX_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_REDUCEMAX_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("cl.reducemax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + HASH_REDUCEMAX_KERNEL_SOURCE_NAME(AXIS) }, + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _reducemax_internal_kernel_map[] = +{ + HASH_REDUCEMAX_KERNELS( 0, F32, F32 ) + HASH_REDUCEMAX_KERNELS( 0, I32, I32 ) + HASH_REDUCEMAX_KERNELS( 0, U8, U8 ) + HASH_REDUCEMAX_KERNELS( 1, F32, F32 ) + HASH_REDUCEMAX_KERNELS( 1, I32, I32 ) + HASH_REDUCEMAX_KERNELS( 1, U8, U8 ) + HASH_REDUCEMAX_KERNELS( 2, F32, F32 ) + HASH_REDUCEMAX_KERNELS( 2, I32, I32 ) + HASH_REDUCEMAX_KERNELS( 2, U8, U8 ) + + HASH_REDUCEMAX_KERNELS_2D( 0, F32, F32 ) + HASH_REDUCEMAX_KERNELS_2D( 0, I32, I32 ) + HASH_REDUCEMAX_KERNELS_2D( 0, U8, U8 ) + HASH_REDUCEMAX_KERNELS_2D( 1, F32, F32 ) + HASH_REDUCEMAX_KERNELS_2D( 1, I32, I32 ) + HASH_REDUCEMAX_KERNELS_2D( 1, U8, U8 ) + +}; + + +/* + * Kernel params + */ +static vx_param_description_t _reducemax_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _REDUCEMAX_INTERNAL_PARAM_NUM _cnt_of_array( _reducemax_internal_kernel_param_def ) + +#define SCALAR_INPUT_SCALE (2) +#define SCALAR_INPUT_TAIL (3) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t * output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + output_shape = output_attr->shape; + + gpu_param.dim = 2; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + 
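+    /* Note: the kernel is dispatched as a 2D grid over the output tensor;
+       global_size[0] is the ceil-divided width rounded up to a multiple of 4
+       by gpu_align_p2, global_size[1] covers the height, and the reduction
+       along the chosen axis is left to the CL kernel body. */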
+final: + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return status; +} /* _reducemax_internal_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _reducemax_internal_kernel_map; + size_t kernel_map_size = _cnt_of_array( _reducemax_internal_kernel_map ); + vx_param_description_t * param_def = _reducemax_internal_kernel_param_def; + size_t param_def_size = _cnt_of_array( _reducemax_internal_kernel_param_def ); + vx_kernel_initialize_f initializer = _reducemax_internal_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + + key = HASH_REDUCEMAX_HASH_KEY( axis, in_dtype, out_dtype, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REDUCEMAX_INTERNAL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t axis = 0; + float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; + float outputZP = (float)outputs[0]->attr.dtype.zero_point; + float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 
1.0f : inputs[0]->attr.dtype.scale; + float inputTail = (float)inputs[0]->attr.dtype.zero_point; + + inputScale = inputScale / outputScale; + inputTail = outputZP - inputTail * inputScale; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, axis, image_2d ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _REDUCEMAX_INTERNAL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEMAX_INTERNAL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( reducemax_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c new file mode 100644 index 0000000..fb8fd84 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c @@ -0,0 +1,273 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define HASH_REDUCEMIN_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + #define HASH_REDUCEMIN_KERNEL_SOURCE_NAME(AXIS) \ + "reducemin_internal_axis"#AXIS + +#define HASH_REDUCEMIN_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEMIN_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("cl.reducemin_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ + HASH_REDUCEMIN_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_REDUCEMIN_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEMIN_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("cl.reducemin_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + HASH_REDUCEMIN_KERNEL_SOURCE_NAME(AXIS) }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _reducemin_internal_kernel_map[] = +{ + HASH_REDUCEMIN_KERNELS( 0, F32, F32 ) + HASH_REDUCEMIN_KERNELS( 0, I32, I32 ) + HASH_REDUCEMIN_KERNELS( 0, U8, U8 ) + HASH_REDUCEMIN_KERNELS( 1, F32, F32 ) + HASH_REDUCEMIN_KERNELS( 1, I32, I32 ) + HASH_REDUCEMIN_KERNELS( 1, U8, U8 ) + HASH_REDUCEMIN_KERNELS( 2, F32, F32 ) + HASH_REDUCEMIN_KERNELS( 2, I32, I32 ) + HASH_REDUCEMIN_KERNELS( 2, U8, U8 ) + + HASH_REDUCEMIN_KERNELS_2D( 0, F32, F32 ) + HASH_REDUCEMIN_KERNELS_2D( 0, I32, I32 ) + HASH_REDUCEMIN_KERNELS_2D( 0, U8, U8 ) + HASH_REDUCEMIN_KERNELS_2D( 1, F32, F32 ) + HASH_REDUCEMIN_KERNELS_2D( 1, I32, I32 ) + HASH_REDUCEMIN_KERNELS_2D( 1, U8, U8 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _reducemin_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _REDUCEMIN_INTERNAL_PARAM_NUM _cnt_of_array( _reducemin_internal_kernel_param_def ) + +#define SCALAR_INPUT_SCALE (2) +#define SCALAR_INPUT_TAIL (3) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t * output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + output_shape = output_attr->shape; + + gpu_param.dim = 2; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + 
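+    /* Note: as in the other reduce initializers, both the success path and
+       the CHECK_PTR_FAIL_GOTO error path fall through to final:, which
+       releases the output attr created at the top of this function; status
+       still carries the result of vsi_nn_kernel_gpu_config(). */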
+final: + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return status; +} /* _reducemin_internal_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _reducemin_internal_kernel_map; + size_t kernel_map_size = _cnt_of_array( _reducemin_internal_kernel_map ); + vx_param_description_t * param_def = _reducemin_internal_kernel_param_def; + size_t param_def_size = _cnt_of_array( _reducemin_internal_kernel_param_def ); + vx_kernel_initialize_f initializer = _reducemin_internal_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + + key = HASH_REDUCEMIN_HASH_KEY( axis, in_dtype, out_dtype, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REDUCEMIN_INTERNAL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t axis = 0; + float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; + float outputZP = (float)outputs[0]->attr.dtype.zero_point; + float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 
1.0f : inputs[0]->attr.dtype.scale; + float inputTail = (float)inputs[0]->attr.dtype.zero_point; + + inputScale = inputScale / outputScale; + inputTail = outputZP - inputTail * inputScale; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, axis, image_2d ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _REDUCEMIN_INTERNAL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEMIN_INTERNAL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( reducemin_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c new file mode 100644 index 0000000..8972e7b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c @@ -0,0 +1,307 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define HASH_REDUCEPROD_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + #define HASH_REDUCEPROD_KERNEL_SOURCE_NAME(AXIS) \ + "reduceprod_internal_axis"#AXIS + +#define HASH_REDUCEPROD_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEPROD_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("cl.reduceprod_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ + HASH_REDUCEPROD_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_REDUCEPROD_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEPROD_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("cl.reduceprod_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + HASH_REDUCEPROD_KERNEL_SOURCE_NAME(AXIS) }, + + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _reduceprod_internal_kernel_map[] = +{ + HASH_REDUCEPROD_KERNELS( 0, F32, F32 ) + HASH_REDUCEPROD_KERNELS( 0, I32, I32 ) + HASH_REDUCEPROD_KERNELS( 0, U8, U8 ) + HASH_REDUCEPROD_KERNELS( 1, F32, F32 ) + HASH_REDUCEPROD_KERNELS( 1, I32, I32 ) + HASH_REDUCEPROD_KERNELS( 1, U8, U8 ) + HASH_REDUCEPROD_KERNELS( 2, F32, F32 ) + HASH_REDUCEPROD_KERNELS( 2, I32, I32 ) + HASH_REDUCEPROD_KERNELS( 2, U8, U8 ) + + HASH_REDUCEPROD_KERNELS_2D( 0, F32, F32 ) + HASH_REDUCEPROD_KERNELS_2D( 0, I32, I32 ) + HASH_REDUCEPROD_KERNELS_2D( 0, U8, U8 ) + HASH_REDUCEPROD_KERNELS_2D( 1, F32, F32 ) + HASH_REDUCEPROD_KERNELS_2D( 1, I32, I32 ) + HASH_REDUCEPROD_KERNELS_2D( 1, U8, U8 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _reduceprod_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _REDUCEPROD_INTERNAL_PARAM_NUM _cnt_of_array( _reduceprod_internal_kernel_param_def ) + +#define SCALAR_INPUT_SCALE (2) +#define SCALAR_INPUT_TAIL (3) +#define SCALAR_OUTPUT_SCALE (4) +#define SCALAR_OUTPUT_TAIL (5) + +#define REDUCEPROD_PARAM_NUM 2 +#define REDUCEPROD_QUANT_PARAM_NUM _cnt_of_array( _reduceprod_internal_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t * output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + output_shape = output_attr->shape; + + gpu_param.dim = 2; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + 
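+    /* Work size below: each output dimension is ceil-divided by its
+       global_scale, and the x dimension is additionally padded to a multiple
+       of 4 via gpu_align_p2. */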
gpu_param.global_size[0] = gpu_align_p2( + (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return status; +} /* _reduceprod_internal_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + vsi_bool image_2d, + vsi_bool *is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _reduceprod_internal_kernel_map; + size_t kernel_map_size = _cnt_of_array( _reduceprod_internal_kernel_map ); + vx_param_description_t * param_def = _reduceprod_internal_kernel_param_def; + size_t param_def_size = _cnt_of_array( _reduceprod_internal_kernel_param_def ); + vx_kernel_initialize_f initializer = _reduceprod_internal_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if ((U8 == in_dtype) || (U8 == out_dtype)) + { + param_def_size = REDUCEPROD_QUANT_PARAM_NUM; + *is_use_u8_kernel = TRUE; + } + else + { + param_def_size = REDUCEPROD_PARAM_NUM; + *is_use_u8_kernel = FALSE; + } + + key = HASH_REDUCEPROD_HASH_KEY( axis, in_dtype, out_dtype, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REDUCEPROD_INTERNAL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t axis = 0; + float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; + float outputTail = (float)outputs[0]->attr.dtype.zero_point; + float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 
1.0f : inputs[0]->attr.dtype.scale; + float inputTail = (float)inputs[0]->attr.dtype.zero_point; + vsi_bool is_use_u8_kernel = FALSE; + + outputScale = 1.0f / outputScale; + inputTail = -(inputTail * inputScale); + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, axis, image_2d, &is_use_u8_kernel); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + size_t node_params_num = REDUCEPROD_PARAM_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _REDUCEPROD_INTERNAL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + if (is_use_u8_kernel) + { + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail ); + node_params_num = REDUCEPROD_QUANT_PARAM_NUM; + } + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + if (is_use_u8_kernel) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] ); + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( reduceprod_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c b/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c new file mode 100644 index 0000000..31d3925 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c @@ -0,0 +1,325 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define _RELU_KERAS_KERNEL_SOURCE "relu_keras" + +#define STR(a) #a +// Add kernel hashtable here +#define RELU_KERAS_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (_image_2d)) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { RELU_KERAS_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0 ), \ + CVIVANTE_NAMESPACE("cl.relu_keras_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _RELU_KERAS_KERNEL_SOURCE } + +#define PACK_KERNEL_MAP_2D( IN_DTYPE, OUT_DTYPE ) \ + { RELU_KERAS_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1 ), \ + CVIVANTE_NAMESPACE("cl.relu_keras_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _RELU_KERAS_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _relu_keras_kernel_map[] = +{ + PACK_KERNEL_MAP(F32, F32), + PACK_KERNEL_MAP(F32, U8), + PACK_KERNEL_MAP(U8, U8), + PACK_KERNEL_MAP(U8, F32), + PACK_KERNEL_MAP_2D(F32, F32), + PACK_KERNEL_MAP_2D(F32, U8), + PACK_KERNEL_MAP_2D(U8, U8), + PACK_KERNEL_MAP_2D(U8, F32), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _relu_keras_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define SCALAR_ALPHA (2) +#define SCALAR_MAX_VALUE (3) +#define SCALAR_THRESHOLD (4) +#define SCALAR_OFFSET (5) +#define SCALAR_INPUT_SCALE (6) +#define SCALAR_INPUT_TAIL (7) +#define SCALAR_OUTPUT_SCALE (8) +#define SCALAR_OUTPUT_TAIL (9) + +#define _RELU_KERAS_PARAM_NUM 6 +#define _RELU_KERAS_QUANT_PARAM_NUM _cnt_of_array( _relu_keras_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_relu_keras_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_int_array_t * out_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 
2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _relu_keras_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d, + vsi_bool *is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _relu_keras_kernel_map; + size_t kernel_map_size = _cnt_of_array( _relu_keras_kernel_map ); + vx_param_description_t * param_def = _relu_keras_kernel_param_def; + size_t param_def_size = _cnt_of_array( _relu_keras_kernel_param_def ); + vx_kernel_initialize_f initializer = _relu_keras_initializer; + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if ((U8 == in_dtype) || (U8 == out_dtype)) + { + param_def_size = _RELU_KERAS_QUANT_PARAM_NUM; + *is_use_u8_kernel = TRUE; + } + else + { + param_def_size = _RELU_KERAS_PARAM_NUM; + *is_use_u8_kernel = FALSE; + } + + key = RELU_KERAS_HASH_KEY( in_dtype, (uint32_t)out_dtype, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; + +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RELU_KERAS_QUANT_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + float outputScale = 1.0f; + float outputTail = 0.0f; + float inputScale = 1.0f; + float inputTail = 0.0f; + vsi_bool is_use_u8_kernel = FALSE; + float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); + float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); + float threshold = vsi_nn_kernel_param_get_float32( params, "threshold" ); + float offset = -alpha * threshold; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) ) + { + return NULL; + } + + if (VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC == 
inputs[0]->attr.dtype.qnt_type) + { + inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; + inputTail = -((float)inputs[0]->attr.dtype.zero_point * inputScale); + } + + if (VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC == outputs[0]->attr.dtype.qnt_type) + { + outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; + outputScale = 1.0f / outputScale; + outputTail = (float)outputs[0]->attr.dtype.zero_point; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, image_2d, &is_use_u8_kernel ); + + if( VSI_SUCCESS == status) + { + size_t node_params_num = _RELU_KERAS_PARAM_NUM; + + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RELU_KERAS_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_ALPHA] = vsi_nn_kernel_scalar_create( graph, F32, &alpha ); + node_params[SCALAR_MAX_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &max_value ); + node_params[SCALAR_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &threshold ); + node_params[SCALAR_OFFSET] = vsi_nn_kernel_scalar_create( graph, F32, &offset ); + if (is_use_u8_kernel) + { + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail ); + node_params_num = _RELU_KERAS_QUANT_PARAM_NUM; + } + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALPHA] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_THRESHOLD] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OFFSET] ); + if (is_use_u8_kernel) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] ); + } + } + } + + return node; + +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( relu_keras, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c new file mode 100644 index 0000000..26f5051 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c @@ -0,0 +1,322 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define _RESIZE_BILINEAR_KERNEL_SOURCE() "resize_bilinear" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) ) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _RESIZE_BILINEAR_KERNEL_SOURCE() } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_bilinear_kernel_map[] = +{ + PACK_KERNEL_MAP( F32, F32), + PACK_KERNEL_MAP( U8, U8), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _resize_bilinear_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define SCALAR_SCALE_X (2) +#define SCALAR_SCALE_Y (3) +#define SCALAR_HALF_PIXEL (4) +#define SCALAR_INPUT_SCALE (5) +#define SCALAR_INPUT_TAIL (6) +#define SCALAR_OUTPUT_SCALE (7) +#define SCALAR_OUTPUT_TAIL (8) + + +#define RESIZE_BILINEAR_NUM 5 +#define RESIZE_BILINEAR_QUANT_NUM _cnt_of_array( _resize_bilinear_kernel_param_def ) + + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_int_array_t * out_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 
2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _resize_bilinear_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool *is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _resize_bilinear_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_bilinear_kernel_map ); + vx_param_description_t * param_def = _resize_bilinear_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_bilinear_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_bilinear_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if ((U8 == in_dtype) || (U8 == out_dtype)) + { + param_def_size = RESIZE_BILINEAR_QUANT_NUM; + *is_use_u8_kernel = TRUE; + } + else + { + param_def_size = RESIZE_BILINEAR_NUM; + *is_use_u8_kernel = FALSE; + } + + key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype ); + + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; + +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[RESIZE_BILINEAR_QUANT_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + int32_t in_width = inputs[0]->attr.size[0]; + int32_t in_height = inputs[0]->attr.size[1]; + int32_t out_width = outputs[0]->attr.size[0]; + int32_t out_height = outputs[0]->attr.size[1]; + float input_zp = (float)inputs[0]->attr.dtype.zero_point; + float input_scale = inputs[0]->attr.dtype.scale; + float input_tail = -(input_zp * input_scale); + float output_zp = (float)outputs[0]->attr.dtype.zero_point; + float output_scale = (0 == 
outputs[0]->attr.dtype.scale) ? 1.0f : 1.0f / outputs[0]->attr.dtype.scale; + float half_pixel_value = 0.0f; + float scale_factor_x = 0.0f; + float scale_factor_y = 0.0f; + vsi_bool is_use_u8_kernel = FALSE; + + if (align_corners && out_width > 1) + { + scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + } + else + { + scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + } + + if (align_corners && out_height > 1) + { + scale_factor_y = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1); + } + else + { + scale_factor_y = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + + status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + size_t node_params_num = RESIZE_BILINEAR_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, RESIZE_BILINEAR_QUANT_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x ); + node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create(graph, F32, &scale_factor_y ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value ); + if (is_use_u8_kernel) + { + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input_tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &output_zp ); + node_params_num = RESIZE_BILINEAR_QUANT_NUM; + } + /* Pass parameters to node. 
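The base kernel receives scale_x, scale_y and the half-pixel offset; the U8 path additionally passes the input scale/tail and output scale/zero-point so the kernel can dequantize on load and requantize on store.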
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + if (is_use_u8_kernel) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] ); + } + } + } + + return node; + +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( resize_bilinear, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c new file mode 100644 index 0000000..b071fdf --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c @@ -0,0 +1,330 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
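+ * The hash key packs the input dtype into the high byte and the output dtype into the low byte;
+ * PACK_KERNEL_MAP pairs each supported combination with its CL function name and source file.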
+ */ +typedef enum +{ + INTERNAL_KERNEL_RESIZE_NEAREST, +} _internal_kernel_e; + +#define _RESIZE_NEAREST_KERNEL_SOURCE "resize_nearest" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.resize_nearest_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _RESIZE_NEAREST_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_nearest_kernel_map[] = +{ + PACK_KERNEL_MAP( F32, F32), + PACK_KERNEL_MAP( U8, U8), +}; + + + + +/* + * Kernel params + */ +static vx_param_description_t _resize_nearest_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _RESIZE_NEAREST_PARAM_NUM 6 +#define _RESIZE_NEAREST_QUANT_NUM _cnt_of_array( _resize_nearest_kernel_param_def ) + +#define SCALAR_SCALE_X (2) +#define SCALAR_SCALE_Y (3) +#define SCALAR_HALF_PIXEL (4) +#define SCALAR_ROUND_VALUE (5) +#define SCALAR_SCALE_VALUE (6) +#define SCALAR_TAIL_VALUE (7) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_nearest_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_int_array_t * out_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _resize_nearest_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool *is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _resize_nearest_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_nearest_kernel_map ); + vx_param_description_t * param_def = _resize_nearest_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_nearest_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_nearest_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if ((U8 == in_dtype) || (U8 == out_dtype)) + { + param_def_size = _RESIZE_NEAREST_QUANT_NUM; + *is_use_u8_kernel = TRUE; + } + else + { + param_def_size = _RESIZE_NEAREST_PARAM_NUM; + *is_use_u8_kernel = FALSE; + } + + key = RESIZE_NEAREST_HASH_KEY( in_dtype, out_dtype ); + + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RESIZE_NEAREST_QUANT_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + int32_t in_width = inputs[0]->attr.size[0]; + int32_t in_height = inputs[0]->attr.size[1]; + int32_t out_width = outputs[0]->attr.size[0]; + int32_t out_height = outputs[0]->attr.size[1]; + float input_zp = (float)inputs[0]->attr.dtype.zero_point; + float input_scale = inputs[0]->attr.dtype.scale; + float output_scale = (0 == outputs[0]->attr.dtype.scale) ? 
\ + input_scale : input_scale / outputs[0]->attr.dtype.scale; + float output_tail = (float)outputs[0]->attr.dtype.zero_point - input_zp * output_scale; + float half_pixel_value = 0.0f; + float round_value = 0.0f; + float scale_factor_x = 0.0f; + float scale_factor_y = 0.0f; + vsi_bool is_use_u8_kernel = FALSE; + + if (align_corners && out_width > 1) + { + scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + } + else + { + scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + } + + if (align_corners && out_height > 1) + { + scale_factor_y = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1); + } + else + { + scale_factor_y = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height; + } + + if (align_corners) + { + round_value = 0.5f; + } + else + { + round_value = 0.0f; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + size_t node_params_num = _RESIZE_NEAREST_PARAM_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_NEAREST_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x ); + node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create(graph, F32, &scale_factor_y ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value ); + node_params[SCALAR_ROUND_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &round_value ); + if (is_use_u8_kernel) + { + node_params[SCALAR_SCALE_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[SCALAR_TAIL_VALUE] = vsi_nn_kernel_scalar_create(graph, F32, &output_tail ); + node_params_num = _RESIZE_NEAREST_QUANT_NUM; + } + /* Pass parameters to node. 
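Every variant receives scale_x, scale_y, the half-pixel offset and the rounding value; the U8 variant also gets a combined requantization scale and a tail (output zero-point minus the rescaled input zero-point).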
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ROUND_VALUE] ); + if (is_use_u8_kernel) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL_VALUE] ); + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( resize_nearest, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c new file mode 100644 index 0000000..21f82b8 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c @@ -0,0 +1,336 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
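+ * The hash key packs the two input dtypes, the output dtype and the coordinate rank (1D/2D/3D)
+ * into one 32-bit value, so a single lookup table covers every supported scatter_nd variant.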
+ */ +#define KERNEL_SOURCE_1 "scatter_nd" + + typedef enum +{ + _1D = 0, + _2D, + _3D +} vsi_nn_kernel_coord_type_e; + +#define HASH_SCATTER_ND_KEY(_input0_type, _input1_type, _output_type, _coord_dim) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_coord_dim)) + +#define HASH_SCATTER_ND_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \ + CVIVANTE_NAMESPACE("cl.scatter_nd_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE) + +#define TENSOR_SCATTER_ND_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \ + { HASH_SCATTER_ND_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE), \ + HASH_SCATTER_ND_SH_KERNEL_NAME(IN1_TYPE, OUT_TYPE, COORD_TYPE), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } scatter_nd_map[] = +{ + TENSOR_SCATTER_ND_KERNELS(I32, I32, I32, _1D, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_KERNELS(I32, I32, I32, _2D, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_KERNELS(I32, I32, I32, _3D, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_KERNELS(I32, U32, U32, _1D, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_KERNELS(I32, U32, U32, _2D, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_KERNELS(I32, U32, U32, _3D, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_KERNELS(I32, F32, F32, _1D, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_KERNELS(I32, F32, F32, _2D, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_KERNELS(I32, F32, F32, _3D, KERNEL_SOURCE_1) +}; + +/* + * Kernel params + */ +static vx_param_description_t _scatter_nd_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _SCATTER_ND_PARAM_NUM _cnt_of_array(_scatter_nd_kernel_param_def) + +static vsi_status cal_scatter_nd_tensor_reshape_size + ( + vsi_nn_tensor_t ** inputs, + int32_t sizes[VSI_NN_MAX_DIM_NUM], + uint32_t block_size, + uint32_t coordDim, + uint32_t* width, + uint32_t* area, + int32_t* newDim + ) +{ + vsi_status status = VSI_FAILURE; + uint32_t dims_num = inputs[0]->attr.dim_num; + uint32_t *input_size = inputs[0]->attr.size; + uint32_t i = 0; + uint32_t elementCnt = 1; + + if(coordDim != 0 && (width == NULL || area == NULL)) + { + return status; + } + +#define VSI_NN_MAX_IMAGE_WIDTH (65536) + + newDim[0] = 0; + for(i = 0; i < dims_num; ++i) + { + elementCnt *= input_size[i]; + } + + for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + { + sizes[i] = 1; + } + + if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + { + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + status = VSI_SUCCESS; + newDim[0] = 2; + } + else + { + return status; + } + + if(coordDim == 1) // index shape + { + *width = 0; + *area = 0; + } + else if(coordDim == 2) + { + *width = input_size[dims_num - 2]; + *area = 0; + } + else if(coordDim == 3) + { + *width = input_size[dims_num - 3]; + *area = input_size[dims_num - 3] * input_size[dims_num - 2]; + } +#undef VSI_NN_MAX_IMAGE_WIDTH + + return status; +} /* _get_EltOP_tensor_reshape_size */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_scatter_nd_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + 
int32_t block_size = 0; + int32_t height = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + block_size = attr[0]->shape->data[0]; + height = attr[0]->shape->data[1]; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = block_size; + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _scatter_nd_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t coord_dim + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input1_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_kernel_coord_type_e coord_type = _1D; + uint32_t key = 0; + int i = 0; + + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if(coord_dim == 1) + { + coord_type = _1D; + } + else if(coord_dim == 2) + { + coord_type = _2D; + } + else if(coord_dim == 3) + { + coord_type = _3D; + } + + key = HASH_SCATTER_ND_KEY( I32, input1_dtype, output_dtype, coord_type ); + + for( i = 0; i < _cnt_of_array(scatter_nd_map); i ++ ) + { + if( scatter_nd_map[i].key == key ) + { + break; + } + } + + if( i < _cnt_of_array(scatter_nd_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_map[i].function_name ); + kernel->info.parameters = _scatter_nd_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _scatter_nd_kernel_param_def ); + kernel->info.initialize = _scatter_nd_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + scatter_nd_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" ); + int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + uint32_t width = 0, area = 0; + + status = cal_scatter_nd_tensor_reshape_size(&inputs[0], shapes[0], coord_dim, 0, NULL, NULL, &rs_in_dim); + status |= cal_scatter_nd_tensor_reshape_size(&inputs[1], shapes[1], block_size, 0, NULL, NULL, &rs_idx_dim); + status |= cal_scatter_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim, + &width, &area, &rs_out_dim); + if(status != VSI_SUCCESS) + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + 
outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, coord_dim ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 0; + /* Pass parameters to node. */ + node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], rs_in_dim ); + node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[1], rs_idx_dim ); + node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &idx_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SCATTER_ND_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &node_params[0] ); + vsi_nn_kernel_tensor_release( &node_params[1] ); + vsi_nn_kernel_tensor_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( scatter_nd, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/select_cl.c b/src/tim/vx/internal/src/kernel/cl/select_cl.c new file mode 100644 index 0000000..42c0caa --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/select_cl.c @@ -0,0 +1,292 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +typedef enum _internal_img_dim_e +{ + IMAGE = 0, + IMAGE_2D, +} internal_img_dim_e; + +#define _SELECT_KERNEL_SOURCE "select" + +#define STR(a) #a + +// Add kernel hashtable here +#define SELECT_HASH_KEY(COND_DTYPE, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, _image_2d) \ + ((COND_DTYPE << 25) | (IN0_DTYPE << 18) | ( IN1_DTYPE << 11 ) | ( OUT_DTYPE << 4) | (_image_2d)) + +#define PACK_KERNEL_MAP(COND_DTYPE, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { SELECT_HASH_KEY(COND_DTYPE, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMAGE), \ + CVIVANTE_NAMESPACE("cl.select_"STR(COND_DTYPE)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ + _SELECT_KERNEL_SOURCE} + +#define PACK_KERNEL_MAP_2D(COND_DTYPE, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { SELECT_HASH_KEY(COND_DTYPE, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMAGE_2D), \ + CVIVANTE_NAMESPACE("cl.select_"STR(COND_DTYPE)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _SELECT_KERNEL_SOURCE} + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _select_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP(I8, U8, U8, U8), + PACK_KERNEL_MAP(I8, I32, I32, I32), + PACK_KERNEL_MAP(I8, F32, F32, F32), + PACK_KERNEL_MAP_2D(I8, U8, U8, U8), + PACK_KERNEL_MAP_2D(I8, I32, I32, I32), + PACK_KERNEL_MAP_2D(I8, F32, F32, F32), +}; + +/* + * Kernel params + */ +static vx_param_description_t _select_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define SCALAR_INPUT0_SCALE (4) +#define SCALAR_INPUT0_TAIL (5) +#define SCALAR_INPUT1_SCALE (6) +#define SCALAR_INPUT1_TAIL (7) +#define _SELECT_PARAM_NUM _cnt_of_array( _select_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_select_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_SUCCESS; + // Alignment with a power of two value. + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vx_tensor output = (vx_tensor)param[3]; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t *output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + + gpu_param.dim = output_shape->size < 3 ? 
2 : 3; + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + gpu_param.global_offset[2] = 0; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = output_shape->size > 2 ? + (output_shape->data[2] + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + + return status; +} /* _select_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e cond_dtype; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _select_kernel_map; + size_t kernel_map_size = _cnt_of_array( _select_kernel_map ); + vx_param_description_t * param_def = _select_kernel_param_def; + size_t param_def_size = _cnt_of_array( _select_kernel_param_def ); + vx_kernel_initialize_f initializer = _select_initializer; + + uint32_t key; + uint32_t i; + + cond_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in0_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + cond_dtype = (BOOL8 == cond_dtype) ? I8 : cond_dtype; + in0_dtype = (BOOL8 == in0_dtype) ? I32 : in0_dtype; + in1_dtype = (BOOL8 == in1_dtype) ? I32 : in1_dtype; + out_dtype = (BOOL8 == out_dtype) ? I32 : out_dtype; + + in0_dtype = (F16 == in0_dtype) ? F32 : in0_dtype; + in1_dtype = (F16 == in1_dtype) ? F32 : in1_dtype; + out_dtype = (F16 == out_dtype) ? F32 : out_dtype; + + key = SELECT_HASH_KEY(cond_dtype, in0_dtype, in1_dtype, out_dtype, image_2d); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SELECT_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 
1.0f : outputs[0]->attr.dtype.scale; + float outputZP = (float)outputs[0]->attr.dtype.zero_point; + float input0Scale = inputs[1]->attr.dtype.scale == 0.0f ? 1.0f : inputs[1]->attr.dtype.scale; + float input0Tail = (float)inputs[1]->attr.dtype.zero_point; + float input1Scale = inputs[2]->attr.dtype.scale == 0.0f ? 1.0f : inputs[2]->attr.dtype.scale; + float input1Tail = (float)inputs[2]->attr.dtype.zero_point; + + input0Scale = input0Scale / outputScale; + input1Scale = input1Scale / outputScale; + input0Tail = outputZP - input0Tail * input0Scale; + input1Tail = outputZP - input1Tail * input1Scale; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2 || outputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, image_2d); + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SELECT_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale ); + node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0Tail ); + node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale ); + node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input1Tail ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SELECT_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( select, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/swish_cl.c b/src/tim/vx/internal/src/kernel/cl/swish_cl.c new file mode 100644 index 0000000..9012c93 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/swish_cl.c @@ -0,0 +1,362 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +typedef enum _internal_img_dim_e +{ + IMAGE = 0, + IMAGE_2D, +} internal_img_dim_e; + + +#define _SWISH_KERNEL_SOURCE "swish", +#define _HSWISH_KERNEL_SOURCE "hswish", + +#define STR(a) #a +// Add kernel hashtable here +#define SWISH_HASH_KEY(SWISH_TYPE, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((SWISH_TYPE << 20) | ( IN_DTYPE << 12 ) | ( OUT_DTYPE << 4) | (_image_2d)) + +#define SWISH_PACK_KERNEL_FLOAT_MAP( IN_DTYPE, OUT_DTYPE) \ + { SWISH_HASH_KEY(VSI_NN_SWISH, IN_DTYPE, OUT_DTYPE, IMAGE), \ + CVIVANTE_NAMESPACE("cl.swish_F32toF32"), \ + _SWISH_KERNEL_SOURCE } + +#define HSWISH_PACK_KERNEL_FLOAT_MAP( IN_DTYPE, OUT_DTYPE) \ + { SWISH_HASH_KEY(VSI_NN_HSWISH, IN_DTYPE, OUT_DTYPE, IMAGE), \ + CVIVANTE_NAMESPACE("cl.hswish_F32toF32"), \ + _HSWISH_KERNEL_SOURCE } + +#define SWISH_PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE) \ + { SWISH_HASH_KEY(VSI_NN_SWISH, IN_DTYPE, OUT_DTYPE, IMAGE), \ + CVIVANTE_NAMESPACE("cl.swish_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _SWISH_KERNEL_SOURCE } + +#define HSWISH_PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE) \ + { SWISH_HASH_KEY(VSI_NN_HSWISH, IN_DTYPE, OUT_DTYPE, IMAGE), \ + CVIVANTE_NAMESPACE("cl.hswish_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _HSWISH_KERNEL_SOURCE } + + +#define SWISH_PACK_KERNEL_FLOAT_MAP_2D( IN_DTYPE, OUT_DTYPE) \ + { SWISH_HASH_KEY(VSI_NN_SWISH, IN_DTYPE, OUT_DTYPE, IMAGE_2D), \ + CVIVANTE_NAMESPACE("cl.swish_F32toF32_2D"), \ + _SWISH_KERNEL_SOURCE } + +#define HSWISH_PACK_KERNEL_FLOAT_MAP_2D( IN_DTYPE, OUT_DTYPE) \ + { SWISH_HASH_KEY(VSI_NN_HSWISH, IN_DTYPE, OUT_DTYPE, IMAGE_2D), \ + CVIVANTE_NAMESPACE("cl.hswish_F32toF32_2D"), \ + _HSWISH_KERNEL_SOURCE } + +#define SWISH_PACK_KERNEL_MAP_2D( IN_DTYPE, OUT_DTYPE) \ + { SWISH_HASH_KEY(VSI_NN_SWISH, IN_DTYPE, OUT_DTYPE, IMAGE_2D), \ + CVIVANTE_NAMESPACE("cl.swish_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _SWISH_KERNEL_SOURCE } + +#define HSWISH_PACK_KERNEL_MAP_2D( IN_DTYPE, OUT_DTYPE) \ + { SWISH_HASH_KEY(VSI_NN_HSWISH, IN_DTYPE, OUT_DTYPE, IMAGE_2D), \ + CVIVANTE_NAMESPACE("cl.hswish_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _HSWISH_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _swish_kernel_map[] = +{ + SWISH_PACK_KERNEL_FLOAT_MAP(F32, F32), + SWISH_PACK_KERNEL_FLOAT_MAP_2D(F32, F32), + SWISH_PACK_KERNEL_FLOAT_MAP(F16, F16), + SWISH_PACK_KERNEL_FLOAT_MAP_2D(F16, F16), + SWISH_PACK_KERNEL_MAP(U8, U8), + SWISH_PACK_KERNEL_MAP_2D(U8, U8), + SWISH_PACK_KERNEL_MAP(I32, I32), + SWISH_PACK_KERNEL_MAP_2D(I32, I32), + HSWISH_PACK_KERNEL_FLOAT_MAP(F32, F32), + HSWISH_PACK_KERNEL_FLOAT_MAP_2D(F32, F32), + HSWISH_PACK_KERNEL_FLOAT_MAP(F16, F16), + HSWISH_PACK_KERNEL_FLOAT_MAP_2D(F16, F16), + HSWISH_PACK_KERNEL_MAP(U8, U8), + HSWISH_PACK_KERNEL_MAP_2D(U8, U8), + HSWISH_PACK_KERNEL_MAP(I32, I32), + HSWISH_PACK_KERNEL_MAP_2D(I32, I32), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _swish_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + 
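+    /* Six scalar parameters follow: input scale/tail, output scale/zero-point, then beta and log2(e),
+       which are created only for plain swish (hswish drops the last two). */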
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define SCALAR_INPUT_SCALE (2) +#define SCALAR_INPUT_TAIL (3) +#define SCALAR_OUTPUT_SCALE (4) +#define SCALAR_OUTPUT_ZP (5) +#define SCALAR_BETA (6) +#define SCALAR_LOGE (7) +#define _SWISH_PARAM_NUM _cnt_of_array( _swish_kernel_param_def ) +#define _HSWISH_PARAM_NUM _SWISH_PARAM_NUM - 2 +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_swish_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vx_status status = VX_FAILURE; + vx_tensor output = (vx_tensor)param[1]; + vsi_nn_kernel_tensor_attr_t * attr_out = NULL; + vsi_int_array_t * out_shape = NULL; + + attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + out_shape = attr_out->shape; + + gpu_param.dim = out_shape->size < 3 ? 2 : 3; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (attr_out) + { + vsi_nn_kernel_tensor_attr_release(&attr_out); + } + + return status; +} /* _swish_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d, + vsi_nn_swish_type swish_type + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _swish_kernel_map; + size_t kernel_map_size = _cnt_of_array( _swish_kernel_map ); + vx_param_description_t * param_def = _swish_kernel_param_def; + size_t param_def_size = _SWISH_PARAM_NUM; + vx_kernel_initialize_f initializer = _swish_initializer; + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = SWISH_HASH_KEY(swish_type, in_dtype, out_dtype, image_2d); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if (VSI_NN_SWISH == (vsi_nn_swish_type)swish_type) + { + param_def_size = _SWISH_PARAM_NUM; + } + else + { + param_def_size = _HSWISH_PARAM_NUM; + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + 
kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SWISH_PARAM_NUM] = {NULL}; + int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + uint32_t new_rank = 0; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + int32_t swish_type = vsi_nn_kernel_param_get_int32( params, "type" ); + float beta = 1.0f; + float inputScale = inputs[0]->attr.dtype.scale; + float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; + float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 0.0f : 1.0f / outputs[0]->attr.dtype.scale; + float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + vx_float32 logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); + +#if (VX_ACTIVATION_EXT_SUPPORT) + if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) + { + return NULL; + } +#endif + + vsi_nn_OptimizedEltOPShape(inputs[0], (uint32_t *)(shapes[0]), &new_rank); + vsi_nn_OptimizedEltOPShape(outputs[0], (uint32_t *)(shapes[1]), &new_rank); + + if( !vsi_nn_kernel_gpu_check_shape( shapes[0], new_rank ) ) + { + return NULL; + } + + image_2d = (new_rank == 2); + + if (VSI_NN_HSWISH == (vsi_nn_swish_type)swish_type) + { + beta = 1.0f / 6.0f; + } + else + { + beta = vsi_nn_kernel_param_get_float32( params, "beta" ); + } + + status = _query_kernel( kernel, inputs, outputs, image_2d, (vsi_nn_swish_type)swish_type); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + size_t node_params_num = _SWISH_PARAM_NUM; + node_params[0] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], new_rank ); + node_params[1] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], new_rank ); + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(graph, F32, &outputZP ); + if (VSI_NN_SWISH == (vsi_nn_swish_type)swish_type) + { + node_params[SCALAR_BETA] = vsi_nn_kernel_scalar_create( graph, F32, &beta ); + node_params[SCALAR_LOGE] = vsi_nn_kernel_scalar_create( graph, F32, &logE ); + } + else + { + node_params_num = _HSWISH_PARAM_NUM; + } + + /* Pass parameters to node. 
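Slots 0 and 1 hold the reshaped input and output tensors, followed by the four quantization scalars; beta and log2(e) are appended only for plain swish, so hswish passes the shorter parameter list.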
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + + vsi_nn_kernel_tensor_release( &node_params[0] ); + vsi_nn_kernel_tensor_release( &node_params[1] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + if (VSI_NN_SWISH == (vsi_nn_swish_type)swish_type) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_BETA] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_LOGE] ); + } + + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( swish, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/tile_cl.c b/src/tim/vx/internal/src/kernel/cl/tile_cl.c new file mode 100644 index 0000000..be790f2 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/tile_cl.c @@ -0,0 +1,378 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + + +/* + * Define kernel meta. 
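+ * The hash key combines the input dtype, the output dtype and a 2D-image flag; 8/16/32-bit integer
+ * tensors share the I32/U32 kernels and F16 shares the F32 kernel, each with a dedicated _2D variant.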
+ */ +#define HASH_TILE_AXIS0_KEY(_input_type, _output_type, _image_2d) \ + ((_input_type << 12) | (_output_type << 4) | (_image_2d)) + +#define KERNEL_SOURCE "tile", + +#define HASH_TILE_AXIS0_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.tile_"#SRC0_TYPE"to"#DST_TYPE) + +#define TENSOR_TILE_AXIS0_KERNELS(SRC0_TYPE, OUT_TYPE) \ + { HASH_TILE_AXIS0_KEY(SRC0_TYPE, OUT_TYPE, 0), \ + HASH_TILE_AXIS0_SH_KERNEL_NAME(SRC0_TYPE, OUT_TYPE), \ + KERNEL_SOURCE }, + +#define TENSOR_TILE_AXIS0_FLOAT(SRC0_TYPE, OUT_TYPE) \ + { HASH_TILE_AXIS0_KEY(SRC0_TYPE, OUT_TYPE, 0), \ + HASH_TILE_AXIS0_SH_KERNEL_NAME(F32, F32), \ + KERNEL_SOURCE }, + +#define HASH_TILE_AXIS0_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.tile_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_TILE_AXIS0_KERNELS_2D(SRC0_TYPE, OUT_TYPE) \ + { HASH_TILE_AXIS0_KEY(SRC0_TYPE, OUT_TYPE, 1), \ + HASH_TILE_AXIS0_SH_KERNEL_2D_NAME(SRC0_TYPE, OUT_TYPE), \ + KERNEL_SOURCE }, + +#define TENSOR_TILE_AXIS0_FLOAT_2D(SRC0_TYPE, OUT_TYPE) \ + { HASH_TILE_AXIS0_KEY(SRC0_TYPE, OUT_TYPE, 1), \ + HASH_TILE_AXIS0_SH_KERNEL_2D_NAME(F32, F32), \ + KERNEL_SOURCE }, + +#define TENSOR_TILE_AXIS0_INT32(SRC0_TYPE, OUT_TYPE) \ + { HASH_TILE_AXIS0_KEY(SRC0_TYPE, OUT_TYPE, 0), \ + HASH_TILE_AXIS0_SH_KERNEL_NAME(I32, I32), \ + KERNEL_SOURCE }, + +#define TENSOR_TILE_AXIS0_UINT32(SRC0_TYPE, OUT_TYPE) \ + { HASH_TILE_AXIS0_KEY(SRC0_TYPE, OUT_TYPE, 0), \ + HASH_TILE_AXIS0_SH_KERNEL_NAME(U32, U32), \ + KERNEL_SOURCE }, + +#define TENSOR_TILE_AXIS0_INT32_2D(SRC0_TYPE, OUT_TYPE) \ + { HASH_TILE_AXIS0_KEY(SRC0_TYPE, OUT_TYPE, 1), \ + HASH_TILE_AXIS0_SH_KERNEL_2D_NAME(I32, I32), \ + KERNEL_SOURCE }, + +#define TENSOR_TILE_AXIS0_UINT32_2D(SRC0_TYPE, OUT_TYPE) \ + { HASH_TILE_AXIS0_KEY(SRC0_TYPE, OUT_TYPE, 1), \ + HASH_TILE_AXIS0_SH_KERNEL_2D_NAME(U32, U32), \ + KERNEL_SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_TILE_AXIS0_INT32(I8, I8) + TENSOR_TILE_AXIS0_INT32(I16, I16) + TENSOR_TILE_AXIS0_INT32(I32, I32) + TENSOR_TILE_AXIS0_UINT32(U8, U8) + TENSOR_TILE_AXIS0_UINT32(U32, U32) + TENSOR_TILE_AXIS0_FLOAT(F16, F16) + TENSOR_TILE_AXIS0_FLOAT(F32, F32) + + TENSOR_TILE_AXIS0_INT32_2D(I8, I8) + TENSOR_TILE_AXIS0_INT32_2D(I16, I16) + TENSOR_TILE_AXIS0_INT32_2D(I32, I32) + TENSOR_TILE_AXIS0_UINT32_2D(U8, U8) + TENSOR_TILE_AXIS0_UINT32_2D(U32, U32) + TENSOR_TILE_AXIS0_FLOAT_2D(F16, F16) + TENSOR_TILE_AXIS0_FLOAT_2D(F32, F32) +}; + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) +#define SCALAR_INPUT_BATCH_IN (2) +#define SCALAR_INPUT_DEPTH_IN (3) +#define SCALAR_INPUT_DEPTH_OUT (4) +#define SCALAR_INPUT_MULTIPLES_0 (5) +#define SCALAR_INPUT_MULTIPLES_1 (6) +#define SCALAR_INPUT_MULTIPLES_2 (7) +#define SCALAR_INPUT_MULTIPLES_3 (8) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_tile_initializer) + ( + vsi_nn_kernel_node_t node, + const 
vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * in_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + in_shape = attr[0]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = in_shape->data[0]; + gpu_param.global_size[1] = in_shape->data[1]; + gpu_param.global_size[2] = in_shape->size > 2 ? in_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + } + + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + } + + return status; +} /* _tile_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_TILE_AXIS0_KEY( input_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _tile_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_bool _is_supported_axis(int32_t* multiples, uint32_t multiples_num) +{ + uint32_t i = 0; + + if ( multiples_num < 4) + { + return TRUE; + } + else if ( multiples_num > 4) + { + return FALSE; + } + + for ( i = 3; i < multiples_num; i++) + { + if (multiples[i] > 1) + { + return FALSE; + } + } + + return TRUE; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t i = 0; + uint32_t new_rank = 0; + vsi_bool ret = FALSE; + uint32_t dim = 
inputs[0]->attr.dim_num; + int32_t multiples[VSI_NN_MAX_DIM_NUM] = { 0 }; + + for ( i = 0; i < dim; i++) + { + multiples[i] = outputs[0]->attr.size[i] / inputs[0]->attr.size[i]; + } + + ret = vsi_nn_kernel_optimize_tile_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + (int32_t *)multiples, inputs[0]->attr.dim_num, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if (ret) + { + if ( _is_supported_axis(shapes[1], new_rank) == FALSE) + { + return NULL; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes[2], new_rank ); + } + else + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[1]->attr.size, + outputs[0]->attr.dim_num )) + { + goto final; + } + + image_2d = ((reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1)); + status = _query_kernel( &reshape_tensors[0], &reshape_tensors[1], image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if( node ) + { + uint32_t depthIn = new_rank > 2 ? reshape_tensors[0]->attr.size[2] : 1; + uint32_t depthOut = new_rank > 2 ? reshape_tensors[1]->attr.size[2] : 1; + uint32_t batchIn = new_rank > 3 ? reshape_tensors[0]->attr.size[3] : 1; + + vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, + &reshape_tensors[0], 1, &reshape_tensors[1], 1 ); + + /* Pass parameters to node. */ + node_params[SCALAR_INPUT_BATCH_IN] = vsi_nn_kernel_scalar_create( + graph, I32, &batchIn ); + node_params[SCALAR_INPUT_DEPTH_IN] = vsi_nn_kernel_scalar_create( + graph, I32, &depthIn ); + node_params[SCALAR_INPUT_DEPTH_OUT] = vsi_nn_kernel_scalar_create( + graph, I32, &depthOut ); + node_params[SCALAR_INPUT_MULTIPLES_0] = vsi_nn_kernel_scalar_create( + graph, I32, &multiples[0] ); + node_params[SCALAR_INPUT_MULTIPLES_1] = vsi_nn_kernel_scalar_create( + graph, I32, &multiples[1] ); + node_params[SCALAR_INPUT_MULTIPLES_2] = vsi_nn_kernel_scalar_create( + graph, I32, &multiples[2] ); + node_params[SCALAR_INPUT_MULTIPLES_3] = vsi_nn_kernel_scalar_create( + graph, I32, &multiples[3] ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_BATCH_IN] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_DEPTH_IN] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_DEPTH_OUT] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_0] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_1] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_2] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_3] ); + } + } + +final: + if (reshape_tensors[0] != inputs[0]) + { + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + } + + if (reshape_tensors[1] != outputs[0]) + { + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( tile, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/upsample_cl.c b/src/tim/vx/internal/src/kernel/cl/upsample_cl.c new file mode 100644 index 0000000..3124f7b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/upsample_cl.c @@ -0,0 +1,323 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante 
Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_UPSAMPLE, +} _internal_kernel_e; + +#define _UPSAMPLE_KERNEL_SOURCE "upsample" + +#define UPSAMPLE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, _image_2d ) \ + ((IN0_DTYPE << 20) | (IN1_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { UPSAMPLE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 ), \ + CVIVANTE_NAMESPACE("cl.upsample_"#IN0_DTYPE"_"#IN1_DTYPE"to_"#OUT_DTYPE), \ + _UPSAMPLE_KERNEL_SOURCE } + + +#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { UPSAMPLE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \ + CVIVANTE_NAMESPACE("cl.upsample_"#IN0_DTYPE"_"#IN1_DTYPE"to_"#OUT_DTYPE"_2D"), \ + _UPSAMPLE_KERNEL_SOURCE } + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _upsample_kernel_map[] = +{ + PACK_KERNEL_MAP( F32, U8, F32 ), + PACK_KERNEL_MAP( F32, U8, U8 ), + PACK_KERNEL_MAP( U8, U8, F32 ), + PACK_KERNEL_MAP( U8, U8, U8 ), + PACK_KERNEL_MAP( I32, U8, I32 ), + PACK_KERNEL_MAP_2D( F32, U8, F32 ), + PACK_KERNEL_MAP_2D( F32, U8, U8 ), + PACK_KERNEL_MAP_2D( U8, U8, F32 ), + PACK_KERNEL_MAP_2D( U8, U8, U8 ), + PACK_KERNEL_MAP_2D( I32, U8, I32 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _upsample_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _UPSAMPLE_PARAM_NUM _cnt_of_array( _upsample_kernel_param_def ) + +#define SCALAR_SCALE (3) +#define SCALAR_TAIL (4) +#define SCALAR_IN_ZP (5) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_upsample_initializer) + ( + vsi_nn_kernel_node_t node, + 
const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vx_status status = VX_FAILURE; + vx_tensor input = (vx_tensor)param[0]; + vsi_nn_kernel_tensor_attr_t * attr_in = NULL; + vsi_int_array_t * in_shape = NULL; + vsi_bool image_2d = FALSE; + + attr_in = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input ); + CHECK_PTR_FAIL_GOTO( attr_in, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + in_shape = attr_in->shape; + image_2d = (vsi_bool)(in_shape->size < 3 || 1 == in_shape->data[2]); + + gpu_param.dim = image_2d ? 2 : 3; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2((in_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (in_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = image_2d ? 1 : in_shape->data[2]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (attr_in) + { + vsi_nn_kernel_tensor_attr_release(&attr_in); + } + + return status; +} /* _upsample_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d, + vsi_bool *is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _upsample_kernel_map; + size_t kernel_map_size = _cnt_of_array( _upsample_kernel_map ); + vx_param_description_t * param_def = _upsample_kernel_param_def; + size_t param_def_size = _cnt_of_array( _upsample_kernel_param_def ); + vx_kernel_initialize_f initializer = _upsample_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in0_dtype) + { + in0_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if ((U8 != in0_dtype) && (U8 != out_dtype)) + { + *is_use_u8_kernel = FALSE; + param_def_size = param_def_size - 3; + } + else + { + *is_use_u8_kernel = TRUE; + } + + key = UPSAMPLE_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; + +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t 
node_params[_UPSAMPLE_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t scale_x = 0; + int32_t scale_y = 0; + vsi_bool image_2d = FALSE; + vsi_bool is_use_u8_kernel = FALSE; + float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; + float outputTail = (float)outputs[0]->attr.dtype.zero_point; + float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; + float inputTail = (float)inputs[0]->attr.dtype.zero_point; + int32_t outputZp = outputs[0]->attr.dtype.zero_point; + float scale_value = 1.0f; + float tail_value = 0.0f; + + scale_x = vsi_nn_kernel_param_get_int32(params, "scale_x"); + scale_y = vsi_nn_kernel_param_get_int32(params, "scale_y"); + + if (2 != scale_x || 2 != scale_y) + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[1]->attr.size, + inputs[1]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num )) + { + return NULL; + } + + scale_value = inputScale / outputScale; + tail_value = outputTail - inputTail * inputScale / outputScale; + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, image_2d, &is_use_u8_kernel); + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + size_t node_params_num = _UPSAMPLE_PARAM_NUM - 3; + if (is_use_u8_kernel) + { + node_params[SCALAR_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &scale_value ); + node_params[SCALAR_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &tail_value ); + node_params[SCALAR_IN_ZP] = vsi_nn_kernel_scalar_create(graph, I32, &outputZp ); + node_params_num = _UPSAMPLE_PARAM_NUM; + } + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, node_params_num, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + if (is_use_u8_kernel) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_IN_ZP] ); + } + } + } + + return node; + +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( upsample, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/add_mean_std_norm_cpu.c b/src/tim/vx/internal/src/kernel/cpu/add_mean_std_norm_cpu.c new file mode 100644 index 0000000..d750765 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/add_mean_std_norm_cpu.c @@ -0,0 +1,246 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.add_mean_std_norm") + + +/* + * Kernel params + */ +static vx_param_description_t _add_mean_std_norm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _ADD_MEAN_STD_NORM_PARAM_NUM _cnt_of_array( _add_mean_std_norm_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + int32_t i; + float mean = .0f, stddev_inv = .0f, variance = .0f, input_d = .0f, data = .0f, eps = .0f; + int32_t v_size, n_batch, batch; + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[_CPU_IO_NUM], &(eps)); + v_size = in_attr[0]->shape->data[0]; + n_batch = in_attr[0]->shape->data[1]; + + 
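+    /* Per batch row: accumulate sum and sum_sq of (input0 + input1) over
+     * v_size elements, derive mean and variance, then write the normalized
+     * value (x - mean) * stddev_inv, where stddev_inv falls back to
+     * 1 / sqrt(eps) when the variance is zero. */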
for (batch = 0; batch < n_batch; ++batch) + { + float sum = 0.0f; + float sum_sq = 0.0f; + int32_t index_base = batch * v_size; + for (i = 0; i < v_size; ++i) + { + int32_t index = i + index_base; + input_d = f32_in_buffer[0][index] + f32_in_buffer[1][index]; + sum += input_d; + sum_sq += input_d * input_d; + } + + mean = sum / v_size; + stddev_inv = 0.0f; + variance = sum_sq / v_size - mean * mean; + + if (variance == 0) + { + stddev_inv = (float)(1.0f / sqrt(eps)); + } + else + { + stddev_inv = (float)(1.0f / sqrt(variance)); + } + + for (i = 0; i < v_size; ++i) + { + int32_t index = i + index_base; + input_d = f32_in_buffer[0][index] + f32_in_buffer[1][index]; + data = (input_d - mean) * stddev_inv; + f32_out_buffer[0][index] = data; + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _add_mean_std_norm_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _add_mean_std_norm_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ADD_MEAN_STD_NORM_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ADD_MEAN_STD_NORM_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[_CPU_IO_NUM] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ADD_MEAN_STD_NORM_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( add_mean_std_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c new file mode 100644 index 0000000..be3424b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c @@ -0,0 +1,213 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "client/vsi_nn_vxkernel.h" +__BEGIN_DECLS + +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("argmax_sw") + +DEF_KERNEL_EXECUTOR(_argmax_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + int32_t i; + int32_t axis = 0; + int32_t outerSize = 1; + int32_t axisSize = 1; + int32_t innerSize = 1; + int32_t inner = 0; + int32_t outer = 0; + + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + out_elements = 
vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + for (i = 0; i < axis; i++) + { + innerSize *= attr[0]->shape->data[i]; + } + + axisSize = attr[0]->shape->data[axis]; + + for (i = axis + 1; i < (int32_t)attr[0]->shape->size; i++) + { + outerSize *= attr[0]->shape->data[i]; + } + + for ( outer = 0; outer < outerSize; ++outer) + { + for ( inner = 0; inner < innerSize; ++inner) + { + float minMaxValue = buffer[0][outer * axisSize * innerSize + inner]; + int32_t minMaxIndex = 0; + for (i = 1; i < axisSize; ++i) + { + float value = buffer[0][(outer * axisSize + i) * innerSize + inner]; + if (value > minMaxValue) + { + minMaxValue = value; + minMaxIndex = i; + } + } + buffer[1][outer * innerSize + inner] = (float)minMaxIndex; + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _minimum_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _argmax_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +#define SCALAR_INPUT_AXIS (2) + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( argmax, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c b/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c new file mode 100644 index 0000000..7b0f1dd --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c @@ -0,0 +1,214 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("argmin_sw") + +DEF_KERNEL_EXECUTOR(_argmin_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + int32_t i; + int32_t axis = 0; + int32_t outerSize = 1; + int32_t axisSize = 1; + int32_t innerSize = 1; + int32_t inner = 0; + int32_t outer = 0; + + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + out_elements = 
vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + for (i = 0; i < axis; i++) + { + innerSize *= attr[0]->shape->data[i]; + } + + axisSize = attr[0]->shape->data[axis]; + + for (i = axis + 1; i < (int32_t)attr[0]->shape->size; i++) + { + outerSize *= attr[0]->shape->data[i]; + } + + for ( outer = 0; outer < outerSize; ++outer) + { + for ( inner = 0; inner < innerSize; ++inner) + { + float minMaxValue = buffer[0][outer * axisSize * innerSize + inner]; + int32_t minMaxIndex = 0; + for (i = 1; i < axisSize; ++i) + { + float value = buffer[0][(outer * axisSize + i) * innerSize + inner]; + if (value < minMaxValue) + { + minMaxValue = value; + minMaxIndex = i; + } + } + buffer[1][outer * innerSize + inner] = (float)minMaxIndex; + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _minimum_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _argmin_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +#define SCALAR_INPUT_AXIS (2) + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( argmin, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c b/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c new file mode 100644 index 0000000..cd4f594 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c @@ -0,0 +1,234 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (5) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("batch_norm_sw") + +static int32_t _expand_offset + ( + int32_t index, + int32_t * shape, size_t rank, + size_t * strides, int32_t * out_shape + ) +{ + uint32_t i; + int32_t offset = 0; + + for( i = 0; i < rank && index; i ++ ) + { + if( shape[i] == out_shape[i] ) + { + offset += (int32_t)strides[i] * ( index % out_shape[i] ); + } + index /= out_shape[i]; + } + return offset; +} + +DEF_KERNEL_EXECUTOR(_batch_norm_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_SUCCESS; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + float eps = 0.f; + + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &eps); + CHECK_STATUS_FAIL_GOTO(status, final ); + + for ( i = 0; i < _CPU_INPUT_NUM; i++) + { + tensors[i] = 
(vsi_nn_kernel_tensor_t)param[i]; + attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] ); + + vsi_nn_kernel_tensor_attr_get_stride( attr[i], stride_size[i] ); + buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[i], "Create input buffer fail.", final ); + } + + tensors[5] = (vsi_nn_kernel_tensor_t)param[5]; + attr[5] = vsi_nn_kernel_tensor_attr_create( tensors[5] ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[5] ); + + buffer[5] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[5], "Create output buffer fail.", final ); + memset( buffer[5], 0, out_elements * sizeof(float) ); + + for( i = 0; i < out_elements; i ++ ) + { + int32_t in_offset[5] = {0}; + int32_t j = 0; + float src = 0.f; + float mean = 0.f; + float variance = 0.f; + float beta = 0.f; + float gamma = 0.f; + + for ( j = 0; j < 5; j++) + { + in_offset[j] = _expand_offset( i, attr[j]->shape->data, attr[j]->shape->size, + stride_size[j], attr[5]->shape->data ); + } + + src = buffer[0][in_offset[0]]; + mean = buffer[1][in_offset[1]]; + variance = buffer[2][in_offset[2]]; + gamma = buffer[3][in_offset[3]]; + beta = buffer[4][in_offset[4]]; + + + buffer[5][i] = (src - mean) * gamma/ sqrtf(variance + eps) + beta; + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[5], attr[5], + buffer[5], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _batch_norm_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define SCALAR_INPUT_EPS (6) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _batch_norm_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + float eps = 0; + + eps = vsi_nn_kernel_param_get_float32(params, "eps"); + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + /* Pass parameters to node. 
*/ + backend_params[SCALAR_INPUT_EPS] = vsi_nn_kernel_scalar_create( + graph, F32, &eps ); + + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_EPS] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( batchnorm_single, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/cast_cpu.c b/src/tim/vx/internal/src/kernel/cpu/cast_cpu.c new file mode 100644 index 0000000..c716d94 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/cast_cpu.c @@ -0,0 +1,220 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "utils/vsi_nn_dtype_util_prv.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.cast") + +/* + * Kernel params + */ +static vx_param_description_t _cast_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CAST_PARAM_NUM _cnt_of_array( _cast_kernel_param_def ) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + double max_value = 0.0f, min_value = 0.0f; + vsi_bool clamp_flag = FALSE; + vsi_nn_type_e out_type; + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + in_attr[i]->quant = VSI_NN_KERNEL_QUANT_NONE; + in_attr[i]->dfp.fl = 0; + in_attr[i]->asymm.scale = 1.0f; + in_attr[i]->asymm.zero_point = 0; + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + out_type = vsi_nn_dtype_map_kernel(out_attr[0]->dtype); + + if( type_is_integer( out_type ) ) + { + clamp_flag = TRUE; + type_get_range(out_type, &max_value, &min_value); + } + + for (i = 0; i < out_elements[0]; i++) + { + float val = f32_in_buffer[0][i]; + if (clamp_flag) + { + val = vsi_nn_clamp(val, (float)min_value, (float)max_value); + } + f32_out_buffer[0][i] = val; + } + + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + out_attr[i]->quant = VSI_NN_KERNEL_QUANT_NONE; + out_attr[i]->dfp.fl = 0; + out_attr[i]->asymm.scale = 1.0f; + out_attr[i]->asymm.zero_point = 0; + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + 
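+    /* Registers the _compute executor defined above: values are copied as
+     * float, and when the destination dtype is an integer type they are
+     * clamped to that type's representable range via type_get_range(). */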
vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _cast_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _cast_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CAST_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CAST_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CAST_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( cast, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/clip_cpu.c b/src/tim/vx/internal/src/kernel/cpu/clip_cpu.c new file mode 100644 index 0000000..ea416a4 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/clip_cpu.c @@ -0,0 +1,221 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.clip") + + +/* + * Kernel params + */ +static vx_param_description_t _clip_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CLIP_PARAM_NUM _cnt_of_array( _clip_kernel_param_def ) + +#define SCALAR_MIN_VALUE (2) +#define SCALAR_MAX_VALUE (3) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + float min_value = 0.0f; + float max_value = 0.0f; + + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MIN_VALUE], &(min_value)); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MAX_VALUE], &(max_value)); + + for (i = 0; i < out_elements[0]; i++) + { + f32_out_buffer[0][i] = vsi_nn_clamp(f32_in_buffer[0][i], min_value, max_value); + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + 
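+    /* _compute clamps every element to [min_value, max_value]; the bounds
+     * are read from the two F32 scalar node parameters created in _setup(). */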
kernel->info.parameters = _clip_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _clip_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CLIP_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + float min_value = vsi_nn_kernel_param_get_float32( params, "min_value" ); + float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CLIP_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_MIN_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &min_value ); + node_params[SCALAR_MAX_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &max_value ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CLIP_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MIN_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_VALUE] ); + } + } + + return node; + +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( clip, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c b/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c new file mode 100644 index 0000000..f64f102 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c @@ -0,0 +1,280 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + + +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (2) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("comparisons_sw") + +typedef enum +{ + COMP_GREAT = VSI_NN_RELATIONAL_OPS_GREAT, + COMP_GREAT_EQUAL = VSI_NN_RELATIONAL_OPS_GREAT_EQUAL, + COMP_LESS = VSI_NN_RELATIONAL_OPS_LESS, + COMP_LESS_EQUAL = VSI_NN_RELATIONAL_OPS_LESS_EQUAL, + COMP_NOT_EQUAL = VSI_NN_RELATIONAL_OPS_NOT_EQUAL, + COMP_EQUAL = VSI_NN_RELATIONAL_OPS_EQUAL, +} relational_type_e; + + +static int32_t _expand_offset + ( + int32_t index, + int32_t * shape, size_t rank, + size_t * strides, int32_t * out_shape + ) +{ + uint32_t i; + int32_t offset = 0; + + for( i = 0; i < rank && index; i ++ ) + { + if( shape[i] == out_shape[i] ) + { + offset += (int32_t)strides[i] * ( index % out_shape[i] ); + } + index /= out_shape[i]; + } + return offset; +} + +DEF_KERNEL_EXECUTOR(_comparisons_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t i = 0; + int32_t operation = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &operation); + CHECK_STATUS_FAIL_GOTO(status, final ); + + + vsi_nn_kernel_tensor_attr_get_stride( attr[0], stride_size[0] ); + vsi_nn_kernel_tensor_attr_get_stride( attr[1], stride_size[1] ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); + + buffer[2] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); + memset( buffer[2], 0, out_elements * sizeof(float) ); + + for (i = 0; i < (int32_t)out_elements; i++) + { + int32_t in0_offset = 0; + int32_t in1_offset = 0; + float val1 = 0.f; + float val2 = 0.f; + vsi_bool data = 0; + + in0_offset = _expand_offset( i, attr[0]->shape->data, attr[0]->shape->size, + stride_size[0], attr[2]->shape->data ); + in1_offset = _expand_offset( i, attr[1]->shape->data, 
attr[1]->shape->size, + stride_size[1], attr[2]->shape->data ); + + val1 = buffer[0][in0_offset]; + val2 = buffer[1][in1_offset]; + + switch (operation) + { + case COMP_GREAT: + data = val1 > val2; + break; + case COMP_GREAT_EQUAL: + data = val1 >= val2; + break; + case COMP_LESS: + data = val1 < val2; + break; + case COMP_LESS_EQUAL: + data = val1 <= val2; + break; + case COMP_EQUAL: + data = val1 == val2; + break; + case COMP_NOT_EQUAL: + data = val1 != val2; + break; + default: + break; + } + buffer[2][i] = (float)data; + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + buffer[i] = NULL; + } + } + return status; +} /* _comparisons_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define INPUT_FUNC_OP (3) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _comparisons_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t operation = 0; + + operation = vsi_nn_kernel_param_get_int32( params, "operation" ); + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[INPUT_FUNC_OP] = vsi_nn_kernel_scalar_create( + graph, I32, &operation ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &backend_params[INPUT_FUNC_OP] ); + } + else + { + status = VSI_FAILURE; + } + } + + return node; +} /* _setup() */ + + +__END_DECLS + +REGISTER_BACKEND_CPU( relational_ops, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c new file mode 100644 index 0000000..138c6e4 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c @@ -0,0 +1,223 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.depth2space_crd") + +DEF_KERNEL_EXECUTOR(_depth2space_crd_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[2] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + int32_t block_size = 1; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + { + uint32_t output_batch = attr[1]->shape->size > 3 ? attr[1]->shape->data[3] : 1; + uint32_t output_depth = attr[1]->shape->data[2]; + uint32_t output_height = attr[1]->shape->data[1]; + uint32_t output_width = attr[1]->shape->data[0]; + uint32_t input_depth = attr[0]->shape->data[2]; + uint32_t input_height = attr[0]->shape->data[1]; + uint32_t input_width = attr[0]->shape->data[0]; + uint32_t batch = 0, out_h = 0, out_w = 0; + + for (batch = 0; batch < output_batch; ++ batch) + { + uint32_t output_batch_index = batch * output_height * output_width * output_depth; + uint32_t input_batch_index = batch * input_height * input_width * input_depth; + uint32_t out_d = 0; + uint32_t block_e2 = block_size * block_size; + + for (out_d = 0; out_d < output_depth; out_d ++) + { + for (out_h = 0; out_h < output_height; ++ out_h) + { + for (out_w = 0; out_w < output_width; out_w ++) + { + uint32_t in_w = out_w / block_size; + uint32_t in_h = out_h / block_size; + uint32_t in_d = (out_w % block_size) + (out_h % block_size) * block_size + out_d * block_e2; + + uint32_t in_index = in_w + in_h * input_width + in_d * input_width * input_height + + input_batch_index; + uint32_t out_index = out_w + out_h * output_width + out_d * output_height * output_width + + output_batch_index; + + buffer[1][out_index] = buffer[0][in_index]; + } + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + if( buffer[i] ) + { + free( buffer[i] ); + } + } + return status; +} /* _depth2space_crd_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _depth2space_crd_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, 
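
The triple loop above realizes the CRD (column-row-depth) depth-to-space rearrangement: output pixel (out_w, out_h) in channel out_d reads input pixel (out_w/block, out_h/block) from channel (out_w % block) + (out_h % block) * block + out_d * block². A minimal standalone check of that index mapping, with hypothetical sizes:

```c
/* Check of the CRD depth-to-space index mapping used above.
 * Input: width=2, height=2, depth=4; block_size=2 -> output: 4x4x1. */
#include <stdio.h>

int main(void)
{
    const int block = 2, in_w_sz = 2, in_h_sz = 2;
    const int out_w_sz = in_w_sz * block, out_h_sz = in_h_sz * block, out_d_sz = 1;
    int out_w, out_h, out_d;

    for (out_d = 0; out_d < out_d_sz; out_d++)
        for (out_h = 0; out_h < out_h_sz; out_h++)
            for (out_w = 0; out_w < out_w_sz; out_w++)
            {
                int in_w = out_w / block;
                int in_h = out_h / block;
                int in_d = (out_w % block) + (out_h % block) * block
                         + out_d * block * block;
                int in_index  = in_w + in_h * in_w_sz + in_d * in_w_sz * in_h_sz;
                int out_index = out_w + out_h * out_w_sz
                              + out_d * out_w_sz * out_h_sz;
                printf("out[%2d] <- in[%2d]\n", out_index, in_index);
            }
    return 0;
}
```
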
VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _DEPTH2SPACE_CRD_PARAM_NUM _cnt_of_array( _depth2space_crd_kernel_param_def ) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _depth2space_crd_exec, + _depth2space_crd_kernel_param_def, + _cnt_of_array( _depth2space_crd_kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 2; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + + backend_params[index] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[2] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( depth2space_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/detect_post_box_cpu.c b/src/tim/vx/internal/src/kernel/cpu/detect_post_box_cpu.c new file mode 100644 index 0000000..72d9dd7 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/detect_post_box_cpu.c @@ -0,0 +1,255 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.detect_post_box") + + +/* + * Kernel params + */ +static vx_param_description_t _detect_post_box_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _DETECT_POST_BOX_PARAM_NUM _cnt_of_array( _detect_post_box_kernel_param_def ) + +#define SCALAR_SCALE_Y (3) +#define SCALAR_SCALE_X (4) +#define SCALAR_SCALE_H (5) +#define SCALAR_SCALE_W (6) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + uint32_t n, a, numBatches, numAnchors, lengthBoxEncoding; + uint32_t kRoiDim = 4; + float inv_scale_y = 0.0f; + float inv_scale_x = 0.0f; + float inv_scale_h = 0.0f; + float inv_scale_w = 0.0f; + + /* prepare data */ + for ( i = 0; i < _INPUT_NUM; i++ ) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for ( i = 0; i < _OUTPUT_NUM; i++ ) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_Y], &(inv_scale_y)); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_X], &(inv_scale_x)); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_H], &(inv_scale_h)); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_W], &(inv_scale_w)); + + numBatches = in_attr[0]->shape->data[2]; + numAnchors = 
in_attr[0]->shape->data[1]; + lengthBoxEncoding = in_attr[0]->shape->data[0]; + + for ( n = 0; n < numBatches; n++ ) + { + int32_t batch_in_offset = n * numAnchors * lengthBoxEncoding; + int32_t batch_out_offset = n * numAnchors * kRoiDim; + for ( a = 0; a < numAnchors; a++ ) + { + float yCtr = f32_in_buffer[1][a * kRoiDim] + f32_in_buffer[1][a * kRoiDim + 2] + * f32_in_buffer[0][batch_in_offset + a * lengthBoxEncoding] * inv_scale_y; + float xCtr = f32_in_buffer[1][a * kRoiDim + 1] + f32_in_buffer[1][a * kRoiDim + 3] + * f32_in_buffer[0][batch_in_offset + a * lengthBoxEncoding + 1] * inv_scale_x; + float hHalf = f32_in_buffer[1][a * kRoiDim + 2] * + (float)exp(f32_in_buffer[0][batch_in_offset + a * lengthBoxEncoding + 2] * inv_scale_h) * 0.5f; + float wHalf = f32_in_buffer[1][a * kRoiDim + 3] * + (float)exp(f32_in_buffer[0][batch_in_offset + a * lengthBoxEncoding + 3] * inv_scale_w) * 0.5f; + f32_out_buffer[0][batch_out_offset + a * kRoiDim] = yCtr - hHalf; + f32_out_buffer[0][batch_out_offset + a * kRoiDim + 1] = xCtr - wHalf; + f32_out_buffer[0][batch_out_offset + a * kRoiDim + 2] = yCtr + hHalf; + f32_out_buffer[0][batch_out_offset + a * kRoiDim + 3] = xCtr + wHalf; + } + } + + + /* save data */ + for ( i = 0; i < _OUTPUT_NUM; i++ ) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for ( i = 0; i < _INPUT_NUM; i++ ) + { + if ( f32_in_buffer[i] ) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if ( in_attr[i] ) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for ( i = 0; i < _OUTPUT_NUM; i++ ) + { + if ( f32_out_buffer[i] ) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if ( out_attr[i] ) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _detect_post_box_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _detect_post_box_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_DETECT_POST_BOX_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + float inv_scale_y = vsi_nn_kernel_param_get_float32( params, "inv_scale_y" ); + float inv_scale_x = vsi_nn_kernel_param_get_float32( params, "inv_scale_x" ); + float inv_scale_h = vsi_nn_kernel_param_get_float32( params, "inv_scale_h" ); + float inv_scale_w = vsi_nn_kernel_param_get_float32( params, "inv_scale_w" ); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status ) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _DETECT_POST_BOX_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_y ); + 
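
The decode loop above converts each anchor-relative box encoding back to absolute corners: the anchor center is shifted by the scaled deltas, the anchor size is re-scaled through an exponential, and half-extents are subtracted and added to form (y1, x1, y2, x2). A standalone sketch of the same arithmetic (function and variable names are hypothetical):

```c
/* Sketch of the per-anchor box decoding performed above. */
#include <math.h>
#include <stdio.h>

/* anchor: {y_center, x_center, height, width}
 * delta:  {dy, dx, dh, dw} box encoding
 * out:    {y1, x1, y2, x2} axis-aligned corners */
static void decode_box(const float anchor[4], const float delta[4],
                       float inv_scale_y, float inv_scale_x,
                       float inv_scale_h, float inv_scale_w, float out[4])
{
    float y_ctr  = anchor[0] + anchor[2] * delta[0] * inv_scale_y;
    float x_ctr  = anchor[1] + anchor[3] * delta[1] * inv_scale_x;
    float h_half = anchor[2] * expf(delta[2] * inv_scale_h) * 0.5f;
    float w_half = anchor[3] * expf(delta[3] * inv_scale_w) * 0.5f;

    out[0] = y_ctr - h_half;   /* y1 */
    out[1] = x_ctr - w_half;   /* x1 */
    out[2] = y_ctr + h_half;   /* y2 */
    out[3] = x_ctr + w_half;   /* x2 */
}

int main(void)
{
    float anchor[4] = { 0.5f, 0.5f, 1.0f, 1.0f };
    float delta[4]  = { 0.0f, 0.0f, 0.0f, 0.0f };  /* zero delta keeps the anchor */
    float box[4];

    decode_box(anchor, delta, 0.1f, 0.1f, 0.2f, 0.2f, box);
    printf("y1=%.2f x1=%.2f y2=%.2f x2=%.2f\n", box[0], box[1], box[2], box[3]);
    return 0;   /* expected: 0.00 0.00 1.00 1.00 */
}
```
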
node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_x ); + node_params[SCALAR_SCALE_H] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_h ); + node_params[SCALAR_SCALE_W] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_w ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _DETECT_POST_BOX_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_H] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_W] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( detect_post_box, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/detect_post_nms_cpu.c b/src/tim/vx/internal/src/kernel/cpu/detect_post_nms_cpu.c new file mode 100644 index 0000000..eaa01d2 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/detect_post_nms_cpu.c @@ -0,0 +1,527 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (4) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.detect_post_nms") + + +/* + * Kernel params + */ +static vx_param_description_t _detect_post_nms_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _DETECT_POST_NMS_PARAM_NUM _cnt_of_array( _detect_post_nms_kernel_param_def ) + +#define SCALAR_NMS_TYPE (6) +#define SCALAR_MAX_NUM (7) +#define SCALAR_MAX_CLASS (8) +#define SCALAR_MAX_DETECT (9) +#define SCALAR_SCORE_TH (10) +#define SCALAR_IOU_TH (11) +#define SCALAR_IS_BG (12) + +static void _swap_element + ( + uint32_t* list, + uint32_t first, + uint32_t second + ) +{ + uint32_t temp = list[first]; + list[first] = list[second]; + list[second] = temp; +} + +static uint32_t _max_element + ( + float* data, + uint32_t* index_list, + uint32_t len + ) +{ + uint32_t i; + uint32_t max_index = 0; + float max_val = data[index_list[0]]; + for ( i = 1; i < len; i++ ) + { + float val = data[index_list[i]]; + if ( max_val < val ) + { + max_val = val; + max_index = i; + } + } + return max_index; +} + +static float _getIoUAxisAligned + ( + const float* roi1, + const float* roi2 + ) +{ + const float area1 = (roi1[2] - roi1[0]) * (roi1[3] - roi1[1]); + const float area2 = (roi2[2] - roi2[0]) * (roi2[3] - roi2[1]); + const float x1 = vsi_nn_max(roi1[0], roi2[0]); + const float x2 = vsi_nn_min(roi1[2], roi2[2]); + const float y1 = vsi_nn_max(roi1[1], roi2[1]); + const float y2 = vsi_nn_min(roi1[3], roi2[3]); + const float w = vsi_nn_max(x2 - x1, 0.0f); + const float h = vsi_nn_max(y2 - y1, 0.0f); + const float areaIntersect = w * h; + const float areaUnion = area1 + area2 - areaIntersect; + return areaIntersect / areaUnion; +} + +static uint32_t _max_comp_func + ( + void* data, + int32_t left, + int32_t right + ) +{ + float* fdata = (float*)data; + return fdata[left] >= fdata[right]; +} + +static void _sort_element_by_score + ( + float* data, + uint32_t* index_list, + uint32_t len + ) +{ + vsi_nn_partition(data, 0, len - 1, _max_comp_func, TRUE, index_list); +} + +static float _max_element_value + ( + float* data, + uint32_t len + ) +{ + uint32_t i; + float max_val = data[0]; + for ( i = 1; i < len; i++ ) + { + float val = data[i]; + if ( max_val < val ) + { + max_val = val; + } + } + return max_val; +} + +static void _iota + ( + int32_t * data, + uint32_t len, + int32_t value + ) +{ + uint32_t i; + for ( i = 0; i < len; i++ ) + { + data [i] = value; + value++; + } +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + 
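
`_getIoUAxisAligned` is the standard intersection-over-union for axis-aligned boxes: clamp the overlap extents at zero, then divide the intersection area by the union. A quick numeric check with two hypothetical 2x2 boxes offset by one unit in each direction:

```c
/* Numeric check of axis-aligned IoU; boxes are {y1, x1, y2, x2}. */
#include <stdio.h>

static float iou_axis_aligned(const float *roi1, const float *roi2)
{
    const float area1 = (roi1[2] - roi1[0]) * (roi1[3] - roi1[1]);
    const float area2 = (roi2[2] - roi2[0]) * (roi2[3] - roi2[1]);
    const float y1 = roi1[0] > roi2[0] ? roi1[0] : roi2[0];
    const float x1 = roi1[1] > roi2[1] ? roi1[1] : roi2[1];
    const float y2 = roi1[2] < roi2[2] ? roi1[2] : roi2[2];
    const float x2 = roi1[3] < roi2[3] ? roi1[3] : roi2[3];
    const float h  = y2 - y1 > 0.f ? y2 - y1 : 0.f;
    const float w  = x2 - x1 > 0.f ? x2 - x1 : 0.f;
    const float inter = w * h;
    return inter / (area1 + area2 - inter);
}

int main(void)
{
    float a[4] = { 0.f, 0.f, 2.f, 2.f };
    float b[4] = { 1.f, 1.f, 3.f, 3.f };
    printf("IoU = %f\n", iou_axis_aligned(a, b));   /* 1 / 7 ~= 0.142857 */
    return 0;
}
```

The two boxes overlap in a 1x1 region, so the expected ratio is 1 / (4 + 4 - 1) = 1/7.
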
float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i, j; + uint32_t n, a, c, b, numBatches, numAnchors, numClasses; + int32_t nms_type = 0; + int32_t max_num_detections = 0; + int32_t maximum_class_per_detection = 0; + int32_t maximum_detection_per_class = 0; + float score_threshold = 0.0f; + float iou_threshold = 0.0f; + int32_t is_bg_in_label = 0; + uint32_t numOutDetection = 0; + + /* prepare data */ + for ( i = 0; i < _INPUT_NUM; i++ ) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for ( i = 0; i < _OUTPUT_NUM; i++ ) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_NMS_TYPE], &(nms_type)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_MAX_NUM], &(max_num_detections)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_MAX_CLASS], &(maximum_class_per_detection)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_MAX_DETECT], &(maximum_detection_per_class)); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCORE_TH], &(score_threshold)); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_IOU_TH], &(iou_threshold)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_IS_BG], &(is_bg_in_label)); + + numBatches = in_attr[0]->shape->data[2]; + numAnchors = in_attr[0]->shape->data[1]; + numClasses = in_attr[0]->shape->data[0]; + numOutDetection = out_attr[0]->shape->data[0]; + + { + uint32_t scores_index = 0; + uint32_t scores_out_index = 0; + uint32_t kRoiDim = 4; + uint32_t roi_out_index = 0; + uint32_t class_out_index = 0; + uint32_t* select = (uint32_t*)malloc(numAnchors * numClasses * sizeof(uint32_t)); + float* maxScores = (float*)malloc(numAnchors * sizeof(float)); + uint32_t* scoreInds = (uint32_t*)malloc((numClasses - 1) * sizeof(uint32_t)); + + for ( n = 0; n < numBatches; n++ ) + { + float* roiBuffer = &(f32_in_buffer[1][n * numAnchors * kRoiDim]); + if (nms_type) + { + uint32_t select_size = 0; + uint32_t select_start = 0; + uint32_t select_len = 0; + uint32_t numDetections = 0; + for ( c = 1; c < numClasses; c++ ) + { + select_start = select_size; + for ( b = 0; b < numAnchors; b++ ) + { + const uint32_t index = b * numClasses + c; + float score = f32_in_buffer[0][scores_index + index]; + if (score > score_threshold) { + select[select_size] = index; + select_size++; + } + } + select_len = select_size - select_start; + + if ( maximum_detection_per_class < 0 ) + { + maximum_detection_per_class = select_len; + } + numDetections = 
0; + for ( j = 0; (j < select_len && numDetections < (uint32_t)maximum_detection_per_class); j++ ) + { + // find max score and swap to the front. + int32_t max_index = _max_element(&(f32_in_buffer[0][scores_index]), + &(select[select_start]), select_len); + _swap_element(&(select[select_start]), max_index, j); + + // Calculate IoU of the rest, swap to the end (disgard) if needed. + for ( i = j + 1; i < select_len; i++ ) + { + int32_t roiBase0 = (select[select_start + i] / numClasses) * kRoiDim; + int32_t roiBase1 = (select[select_start + j] / numClasses) * kRoiDim; + float iou = _getIoUAxisAligned(&(roiBuffer[roiBase0]), + &(roiBuffer[roiBase1])); + + if ( iou >= iou_threshold ) + { + _swap_element(&(select[select_start]), i, select_len - 1); + i--; + select_len--; + } + } + numDetections++; + } + select_size = select_start + numDetections; + } + + select_len = select_size; + select_start = 0; + + // Take top maxNumDetections. + _sort_element_by_score(&(f32_in_buffer[0][scores_index]), + &(select[select_start]), select_len); + + for ( i = 0; i < select_len; i++ ) + { + uint32_t ind = select[i]; + f32_out_buffer[0][scores_out_index + i] = + f32_in_buffer[0][scores_index + ind]; + memcpy(&(f32_out_buffer[1][roi_out_index + i * kRoiDim]), + &roiBuffer[(ind / numClasses) * kRoiDim], kRoiDim * sizeof(float)); + f32_out_buffer[2][class_out_index + i] = (float)((ind % numClasses) + - (is_bg_in_label ? 0 : 1)); + } + f32_out_buffer[3][n] = (float)(select_len); + } + else + { + uint32_t numOutClasses = vsi_nn_min(numClasses - 1, (uint32_t)maximum_class_per_detection); + uint32_t select_size = 0; + uint32_t select_start = 0; + uint32_t select_len = 0; + uint32_t numDetections = 0; + for ( a = 0; a < numAnchors; a++ ) + { + // exclude background class: 0 + maxScores[a] = _max_element_value(&(f32_in_buffer[0] + [scores_index + a * numClasses + 1]), numClasses - 1); + if (maxScores[a] > score_threshold) + { + select[select_size] = a; + select_size++; + } + } + select_len = select_size - select_start; + + if ( max_num_detections < 0 ) + { + max_num_detections = select_len; + } + for ( j = 0; (j < select_len && numDetections < (uint32_t)max_num_detections); j++ ) + { + // find max score and swap to the front. + int32_t max_index = _max_element(maxScores, + &(select[select_start + j]), select_len - j); + _swap_element(&(select[select_start]), max_index + j, j); + + // Calculate IoU of the rest, swap to the end (disgard) if needed. 
+ for ( i = j + 1; i < select_len; i++ ) + { + int32_t roiBase0 = select[select_start + i] * kRoiDim; + int32_t roiBase1 = select[select_start + j] * kRoiDim; + float iou = _getIoUAxisAligned(&(roiBuffer[roiBase0]), + &(roiBuffer[roiBase1])); + if ( iou >= iou_threshold ) + { + _swap_element(&(select[select_start]), i, select_len - 1); + i--; + select_len--; + } + } + numDetections++; + } + select_size = select_start + numDetections; + select_len = select_size; + + for ( i = 0; i < select_len; i++ ) + { + _iota((int32_t*)scoreInds, numClasses - 1, 1); + _sort_element_by_score(&(f32_in_buffer[0][scores_index + select[i] * numClasses]), + scoreInds, numClasses - 1); + for (c = 0; c < numOutClasses; c++) + { + f32_out_buffer[0][scores_out_index + i * numOutClasses + c] = + f32_in_buffer[0][scores_index + select[i] * numClasses + scoreInds[c]]; + memcpy(&(f32_out_buffer[1][roi_out_index + (i * numOutClasses + c) + * kRoiDim]), &roiBuffer[select[i] * kRoiDim], kRoiDim * sizeof(float)); + f32_out_buffer[2][class_out_index + i * numOutClasses + c] + = (float)(scoreInds[c] - (is_bg_in_label ? 0 : 1)); + } + } + f32_out_buffer[3][n] = (float)select_len; + } + scores_index += numAnchors * numClasses; + scores_out_index += numOutDetection; + roi_out_index += numOutDetection * kRoiDim; + class_out_index += numOutDetection; + } + + if (select) free(select); + if (maxScores) free(maxScores); + if (scoreInds) free(scoreInds); + } + /* save data */ + for ( i = 0; i < _OUTPUT_NUM; i++ ) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for ( i = 0; i < _OUTPUT_NUM; i++ ) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _detect_post_nms_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _detect_post_nms_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_DETECT_POST_NMS_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t nms_type = vsi_nn_kernel_param_get_int32( params, "nms_type" ); + int32_t max_num_detections = vsi_nn_kernel_param_get_int32( params, "max_num_detections" ); + int32_t maximum_class_per_detection = vsi_nn_kernel_param_get_int32( params, "maximum_class_per_detection" ); + int32_t maximum_detection_per_class = vsi_nn_kernel_param_get_int32( params, "maximum_detection_per_class" ); + float score_threshold = vsi_nn_kernel_param_get_float32( params, "score_threshold" ); + float 
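
Both branches of the kernel run the same greedy suppression: candidates are visited in descending score order, and any remaining candidate whose IoU with the current keeper reaches the threshold is swapped past the end of the active range and never revisited. A compact standalone sketch of that selection loop with made-up boxes and scores (pre-sorted by score to keep the example short, whereas the kernel selects the max each pass):

```c
/* Minimal greedy NMS over a flat candidate list, mirroring the
 * swap-to-the-end scheme used above. Data and sizes are hypothetical. */
#include <stdio.h>

#define NUM_BOXES 4

static float iou(const float *a, const float *b)
{
    float y1 = a[0] > b[0] ? a[0] : b[0], x1 = a[1] > b[1] ? a[1] : b[1];
    float y2 = a[2] < b[2] ? a[2] : b[2], x2 = a[3] < b[3] ? a[3] : b[3];
    float h = y2 - y1 > 0.f ? y2 - y1 : 0.f, w = x2 - x1 > 0.f ? x2 - x1 : 0.f;
    float inter  = w * h;
    float area_a = (a[2] - a[0]) * (a[3] - a[1]);
    float area_b = (b[2] - b[0]) * (b[3] - b[1]);
    return inter / (area_a + area_b - inter);
}

int main(void)
{
    float boxes[NUM_BOXES][4] = {
        { 0.f, 0.f, 2.f, 2.f }, { 0.1f, 0.1f, 2.f, 2.f },
        { 5.f, 5.f, 7.f, 7.f }, { 0.f, 0.f, 2.1f, 2.1f },
    };
    float scores[NUM_BOXES] = { 0.9f, 0.8f, 0.7f, 0.6f };
    int   select[NUM_BOXES] = { 0, 1, 2, 3 };   /* already ordered by score */
    int   len = NUM_BOXES, kept = 0, i, j;
    const float iou_threshold = 0.5f;

    for (j = 0; j < len; j++)
    {
        for (i = j + 1; i < len; i++)
        {
            if (iou(boxes[select[i]], boxes[select[j]]) >= iou_threshold)
            {
                int tmp = select[i];            /* drop by swapping past the end */
                select[i] = select[len - 1];
                select[len - 1] = tmp;
                i--;
                len--;
            }
        }
        kept++;
    }
    for (i = 0; i < kept; i++)
        printf("kept box %d (score %.1f)\n", select[i], scores[select[i]]);
    return 0;   /* keeps boxes 0 and 2; 1 and 3 are suppressed by box 0 */
}
```
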
iou_threshold = vsi_nn_kernel_param_get_float32( params, "iou_threshold" ); + int32_t is_bg_in_label = vsi_nn_kernel_param_get_int32( params, "is_bg_in_label" ); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status ) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _DETECT_POST_NMS_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_NMS_TYPE] = vsi_nn_kernel_scalar_create( graph, I32, &nms_type ); + node_params[SCALAR_MAX_NUM] = vsi_nn_kernel_scalar_create( graph, I32, &max_num_detections ); + node_params[SCALAR_MAX_CLASS] = vsi_nn_kernel_scalar_create( graph, I32, &maximum_class_per_detection ); + node_params[SCALAR_MAX_DETECT] = vsi_nn_kernel_scalar_create( graph, I32, &maximum_detection_per_class ); + node_params[SCALAR_SCORE_TH] = vsi_nn_kernel_scalar_create( graph, F32, &score_threshold ); + node_params[SCALAR_IOU_TH] = vsi_nn_kernel_scalar_create( graph, F32, &iou_threshold ); + node_params[SCALAR_IS_BG] = vsi_nn_kernel_scalar_create( graph, I32, &is_bg_in_label ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _DETECT_POST_NMS_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_NMS_TYPE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_NUM] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_CLASS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_DETECT] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCORE_TH] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_IOU_TH] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_IS_BG] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( detect_post_nms, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c new file mode 100644 index 0000000..f52f367 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c @@ -0,0 +1,291 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/** Unary Kernel internal type */ +typedef enum +{ + UNARY_SIN, + UNARY_EXP, + UNARY_LOG, + UNARY_ELU, + UNARY_NEG, + UNARY_HSIGMOID, + UNARY_MISH, +} unary_type_e; + + +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("eltwise_unary_sw") + +static float exp_eval(float data) +{ + return expf(data); +} + +static float sin_eval(float data) +{ + return sinf(data); +} + +static float log_eval(float data) +{ + return logf(data); +} + +static float elu_eval(float data) +{ + return data >=0 ? data : expf(data) - 1; +} + +static float neg_eval(float data) +{ + return data * -1.0f; +} + +static float hsigmoid_eval(float data) +{ + data = (float)(0.2 * data + 0.5); + data = vsi_nn_clamp(data, 0, 1); + + return data; +} + +static float soft_plus_eval(float data) +{ + return log_eval(exp_eval(data) + 1); +} + +static float mish_eval(float data) +{ + data = (float)(data * tanh(soft_plus_eval(data))); + + return data; +} + +DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + int32_t i; + int32_t unary_type = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &unary_type); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + for ( i = 0; i < (int32_t)out_elements; ++i) + { + float data = buffer[0][i]; + + switch (unary_type) + { + case UNARY_SIN: + data = sin_eval(data); + break; + case UNARY_EXP: + data = exp_eval(data); + break; + case UNARY_LOG: + data = log_eval(data); + break; + case UNARY_ELU: + data = elu_eval(data); + break; + case UNARY_NEG: + data = neg_eval(data); + break; + case UNARY_HSIGMOID: + data = hsigmoid_eval(data); + break; + case UNARY_MISH: + data = mish_eval(data); + break; + default: + break; + } + buffer[1][i] = (float)data; + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR 
) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + buffer[i] = NULL; + } + } + return status; +} /* _eltwise_unary_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define INPUT_FUNC_TYPE (2) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _eltwise_unary_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel, + const unary_type_e unary_type + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[INPUT_FUNC_TYPE] = vsi_nn_kernel_scalar_create( + graph, I32, &unary_type ); + /* Pass parameters to node. 
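
The evaluators registered by this kernel are plain scalar functions, so a few spot checks make them easy to verify by hand. A throwaway harness, restating the same formulas as `elu_eval`, `hsigmoid_eval` and `mish_eval` above (not part of the patch):

```c
#include <math.h>
#include <stdio.h>

static float elu_eval(float x)
{
    return x >= 0.f ? x : expf(x) - 1.f;
}

static float hsigmoid_eval(float x)
{
    x = 0.2f * x + 0.5f;
    return x < 0.f ? 0.f : (x > 1.f ? 1.f : x);
}

static float mish_eval(float x)
{
    return x * tanhf(logf(expf(x) + 1.f));   /* x * tanh(softplus(x)) */
}

int main(void)
{
    printf("elu(-1)     = %.4f\n", elu_eval(-1.f));      /* exp(-1)-1 ~= -0.6321 */
    printf("hsigmoid(0) = %.4f\n", hsigmoid_eval(0.f));  /* 0.2*0+0.5  =  0.5    */
    printf("hsigmoid(4) = %.4f\n", hsigmoid_eval(4.f));  /* clamped to    1.0    */
    printf("mish(0)     = %.4f\n", mish_eval(0.f));      /* 0*tanh(ln 2) = 0     */
    return 0;
}
```
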
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &backend_params[INPUT_FUNC_TYPE] ); + } + else + { + status = VSI_FAILURE; + } + } + + return node; +} /* _setup() */ + +#define REGISTER_ELTWISE_UNARY_BACKEND_CPU(KERNEL_NAME, UNARY_TYPE) \ + static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) \ + { \ + return _setup(graph, inputs, input_num, outputs, output_num, \ + params, kernel, UNARY_TYPE); \ + } \ + REGISTER_BACKEND_CPU( KERNEL_NAME, _##KERNEL_NAME##_setup ) + +__END_DECLS + +REGISTER_ELTWISE_UNARY_BACKEND_CPU( sin, UNARY_SIN ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( exp, UNARY_EXP ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( log, UNARY_LOG ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( elu, UNARY_ELU ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( neg, UNARY_NEG ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_sigmoid, UNARY_HSIGMOID ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( mish, UNARY_MISH ) diff --git a/src/tim/vx/internal/src/kernel/cpu/floordiv_cpu.c b/src/tim/vx/internal/src/kernel/cpu/floordiv_cpu.c new file mode 100644 index 0000000..8b3f27d --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/floordiv_cpu.c @@ -0,0 +1,239 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
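
`REGISTER_ELTWISE_UNARY_BACKEND_CPU` stamps out one thin `_setup` wrapper per operator name so that a single kernel body can back several registrations. For instance, the `sin` line expands to roughly the following (shown in isolation here; it relies on the declarations earlier in that file):

```c
/* Approximate preprocessor expansion of
 * REGISTER_ELTWISE_UNARY_BACKEND_CPU( sin, UNARY_SIN ) */
static vsi_nn_kernel_node_t _sin_setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    return _setup( graph, inputs, input_num, outputs, output_num,
                   params, kernel, UNARY_SIN );
}
REGISTER_BACKEND_CPU( sin, _sin_setup )
```
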
+ */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.floordiv") + + +/* + * Kernel params + */ +static vx_param_description_t _floordiv_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _FLOORDIV_PARAM_NUM _cnt_of_array( _floordiv_kernel_param_def ) + +static int32_t _expand_offset + ( + int32_t index, + int32_t * shape, size_t rank, + size_t * strides, int32_t * out_shape + ) +{ + uint32_t i; + int32_t offset = 0; + + for( i = 0; i < rank && index; i ++ ) + { + if( shape[i] == out_shape[i] ) + { + offset += (int32_t)strides[i] * ( index % out_shape[i] ); + } + index /= out_shape[i]; + } + return offset; +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + for (i = 0; i < out_elements[0]; i++) + { + int32_t in0_offset = 0; + int32_t in1_offset = 0; + float in0 = 0; + float in1 = 0; + + in0_offset = _expand_offset( i, in_attr[0]->shape->data, in_attr[0]->shape->size, + in_stride_size[0], out_attr[0]->shape->data ); + in1_offset = _expand_offset( i, in_attr[1]->shape->data, in_attr[1]->shape->size, + in_stride_size[1], out_attr[0]->shape->data ); + in0 = f32_in_buffer[0][in0_offset]; + in1 = f32_in_buffer[1][in1_offset]; + f32_out_buffer[0][i] = (float)floor(in0 / in1); + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + + for(i = 0; i < _OUTPUT_NUM; i++) + { + if 
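
Note that the kernel computes floor(in0 / in1) on floats, which rounds toward negative infinity, unlike C's truncating integer division. A quick check with illustrative values:

```c
/* floor-division semantics used by the kernel vs. C integer division. */
#include <math.h>
#include <stddef.h>
#include <stdio.h>

int main(void)
{
    float pairs[][2] = { { 7.f, 2.f }, { -7.f, 2.f }, { 7.f, -2.f } };
    size_t i;

    for (i = 0; i < sizeof(pairs) / sizeof(pairs[0]); i++)
    {
        float a = pairs[i][0], b = pairs[i][1];
        printf("floor(%5.1f / %5.1f) = %5.1f   (C int division: %d)\n",
               a, b, floorf(a / b), (int)a / (int)b);
    }
    return 0;
    /* floor( 7/ 2) =  3,  floor(-7/ 2) = -4,  floor( 7/-2) = -4 */
}
```
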
(f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _floordiv_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _floordiv_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_FLOORDIV_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( kernel, inputs, outputs); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _FLOORDIV_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _FLOORDIV_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( floordiv, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c new file mode 100644 index 0000000..c234a51 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c @@ -0,0 +1,238 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _CPU_ARG_NUM (3) +#define _CPU_INPUT_NUM (2) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.gather") + +DEF_KERNEL_EXECUTOR(_gather_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[2] = { NULL }; + uint32_t* buffer_idx = NULL; + size_t in_elements = 0, out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0, j = 0; + int32_t block_size = 1, block_num = 1, indices_num = 1, axis_num = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + in_elements = vsi_nn_kernel_tensor_attr_get_size( attr[0] ); + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis_num); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer_idx = (uint32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer_idx, "Create input1 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + { + for(i = 0; i < attr[1]->shape->size; ++i) + { + indices_num *= attr[1]->shape->data[i]; + } + + for(i = 0; i < (uint32_t)block_num; i++) + { + for(j = 0; j < (uint32_t)indices_num; j++) + { + uint32_t indice = buffer_idx[j]; + uint32_t in_index = (i * axis_num + indice) * block_size; + if(in_index < in_elements) + { + uint32_t out_index = (i * indices_num + j) * block_size; + memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float)); + } + else + { + status = VX_FAILURE; + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if( 
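
The copy loop above treats the input as a (block_num, axis_num, block_size) volume: for each outer block it copies, per gathered index, one contiguous run of block_size elements starting at (i * axis_num + indices[j]) * block_size. A standalone sketch with hypothetical sizes and indices:

```c
/* Illustration of the (block_num, axis_num, block_size) gather copy above.
 * Gathers rows {2, 0} along an axis of size 3. */
#include <stdio.h>
#include <string.h>

int main(void)
{
    const int block_num = 2, axis_num = 3, block_size = 2, indices_num = 2;
    /* input laid out as block_num x axis_num x block_size = 2 x 3 x 2 */
    float in[12] = {  0,  1,  2,  3,  4,  5,
                     10, 11, 12, 13, 14, 15 };
    unsigned idx[2] = { 2, 0 };
    float out[8] = { 0 };
    int i, j, k;

    for (i = 0; i < block_num; i++)
        for (j = 0; j < indices_num; j++)
        {
            int in_index  = (i * axis_num + (int)idx[j]) * block_size;
            int out_index = (i * indices_num + j) * block_size;
            memcpy(&out[out_index], &in[in_index], block_size * sizeof(float));
        }

    for (k = 0; k < block_num * indices_num * block_size; k++)
        printf("%g ", out[k]);
    printf("\n");   /* expected: 4 5 0 1 14 15 10 11 */
    return 0;
}
```
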
buffer_idx ) + { + free( buffer_idx ); + } + for( i = 0; i < 2; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _gather_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _gather_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def ) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _gather_exec, + _gather_kernel_param_def, + _cnt_of_array( _gather_kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 3; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); + int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); + /* Pass parameters to node. 
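               The three I32 scalars packed at indices 3..5 carry block_size, block_num and
               axis_num. Once vsi_nn_kernel_node_pass_param() has handed them to the node,
               the local scalar handles are released below; the node presumably keeps its
               own reference to them.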
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[3] ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + vsi_nn_kernel_scalar_release( &backend_params[5] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( gather, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c new file mode 100644 index 0000000..cb22732 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c @@ -0,0 +1,239 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
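 * The CPU reference kernel takes two tensor inputs (data and indices), one tensor
 * output and two I32 scalars (block_size and coord_dim). Each group of coord_dim
 * index values is treated as one coordinate into the data tensor (coord_dim <= 3 is
 * supported) and selects a contiguous block of block_size floats that is copied to
 * the output, as implemented in _gather_nd_exec() below.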
+ */ +#define _CPU_ARG_NUM (2) +#define _CPU_INPUT_NUM (2) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.gather_nd") + +DEF_KERNEL_EXECUTOR(_gather_nd_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[2] = { NULL }; + uint32_t* buffer_idx = NULL; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + int32_t i = 0; + int32_t block_size = 1, indices_num = 1; + int32_t coord_stride = 1; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &(block_size)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &(coord_stride)); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer_idx = (uint32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer_idx, "Create input1 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + // index number + for(i = 0; i < (int32_t)attr[1]->shape->size; ++i) + { + indices_num *= attr[1]->shape->data[i]; + } + indices_num /= coord_stride; + + if(coord_stride <= 3) // reshape 3D + { + int32_t stride[3] = {block_size, 0, 0}; + for(i = 1; i < coord_stride; ++i) + { + stride[i] = stride[i - 1] * attr[0]->shape->data[i]; + } + + for(i = 0; i < indices_num; i++) + { + uint32_t out_index = i * block_size; + uint32_t coord[3] = {0}; + uint32_t in_index = 0; + int32_t j = 0; + + for(j = 0; j < coord_stride; j++) + { + coord[j] = buffer_idx[i * coord_stride + j]; + } + in_index = coord[2] * stride[2] + coord[1] * stride[1] + coord[0] * stride[0]; + memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float)); + } + } + else + { + status = VSI_FAILURE; + CHECK_STATUS_FAIL_GOTO( status, final ); + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if( buffer_idx ) + { + free( buffer_idx ); + } + for( i = 0; i < 2; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _pre_process_yuv420_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _gather_nd_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, 
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GATHER_ND_PARAM_NUM _cnt_of_array( _gather_nd_kernel_param_def ) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _gather_nd_exec, + _gather_nd_kernel_param_def, + _cnt_of_array( _gather_nd_kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 3; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[3] ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( gather_nd, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_cpu.c new file mode 100644 index 0000000..e4fd5cb --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_cpu.c @@ -0,0 +1,515 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.grucell_activation") + +/* + * Kernel params + */ +static vx_param_description_t _grucell_activation_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _GRUCELL_ACTIVATION_PARAM_NUM _cnt_of_array( _grucell_activation_kernel_param_def ) + +#define _IO_COUNT_DEFAULT (5) + +static vx_param_description_t _grucell_activation_separated_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _GRUCELL_ACTIVATION_SEPARATED_PARAM_NUM _cnt_of_array( _grucell_activation_separated_kernel_param_def ) +#define _IO_COUNT_SEPARATED (15) +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + int32_t i = 0; + int32_t batch = 0; + int32_t hidden_units = 0; + float * buffer[_IO_COUNT_DEFAULT] = { NULL }; + vsi_status status = VSI_FAILURE; + vsi_nn_activation_e gate_activation; + vsi_nn_activation_e candidate_activation; + vsi_nn_kernel_tensor_t tensors[_IO_COUNT_DEFAULT] = { NULL }; + vsi_nn_kernel_tensor_attr_t* attr[_IO_COUNT_DEFAULT] = { NULL }; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; + tensors[4] = 
(vsi_nn_kernel_tensor_t)param[4]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + attr[4] = vsi_nn_kernel_tensor_attr_create( tensors[4] ); + + /* z{t_} */ + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create input buffer fail.", final ); + buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create input buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &gate_activation); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &candidate_activation); + CHECK_STATUS_FAIL_GOTO(status, final); + + batch = attr[0]->shape->data[1]; + hidden_units = attr[0]->shape->data[0]; + + for( i = 0; i < batch * hidden_units; i++ ) + { + float zt = vsi_nn_activation(buffer[0][i], gate_activation); + float ht_ = vsi_nn_activation(buffer[1][i], candidate_activation); + float ht_1 = buffer[2][i]; + float ht = zt * (ht_1 - ht_) + ht_; + + buffer[3][i] = ht; + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + buffer[3], batch * hidden_units ); + CHECK_STATUS_FAIL_GOTO( status, final ); + status = vsi_nn_kernel_tensor_write_from_float( tensors[4], attr[4], + buffer[3], batch * hidden_units ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < 5; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _compute() */ + +DEF_KERNEL_EXECUTOR(_compute_separated) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + int32_t i = 0, j = 0; + int32_t batch = 0; + int32_t hidden_units = 0; + float * buffer[_IO_COUNT_SEPARATED] = { NULL }; + vsi_status status = VSI_FAILURE; + vsi_nn_activation_e gate_activation; + vsi_nn_activation_e candidate_activation; + vsi_bool use_cudnn_implementation; + grucell_activation_input_layout_e input_layout = GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_NC; + vsi_nn_kernel_tensor_t tensors[_IO_COUNT_SEPARATED] = { NULL }; + vsi_nn_kernel_tensor_attr_t* attr[_IO_COUNT_SEPARATED] = { NULL }; + float *i_r_base = NULL, *i_c_base = NULL, *i_u_base = NULL; + float *r_r_base = NULL, *r_u_base = NULL, *r_c_base = NULL; + float cond_reset = 0.f, cond_update = 0.f, cond_candidate = 0.f; + float i_r = 0.f, i_u = 0.f, i_c = 0.f, r_r = 0.f, r_u = 0.f, r_c = 0.f; + float bias_r = 0.f, bias_u = 0.f, bias_c = 0.f; + float r = 0.f, u = 0.f, c = 0.f, state = 0.f; + + for(i = 0; i < _IO_COUNT_SEPARATED; i++) + { + tensors[i] = (vsi_nn_kernel_tensor_t)param[i]; + attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] ); + } + + /* z{t_} */ + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + 
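    /*
     * Buffer layout used by the separated path below: buffer[0] holds what appears to
     * be the previous state h(t-1); buffer[1..3] are the input FC results for the
     * reset/update/candidate gates (all three fused into buffer[1] when buffer[2] is
     * NULL); buffer[4..6] are the recurrent FC results; buffer[7..9] are the gate
     * biases; buffer[10..12] are optional condition terms; buffer[13] receives the
     * new state, which is also written to the second output, tensor 14.
     */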
CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); + buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], TRUE ); + + buffer[4] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[4], attr[4], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[4], "Create input buffer fail.", final ); + buffer[5] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[5], attr[5], TRUE ); + buffer[6] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[6], attr[6], TRUE ); + + buffer[7] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[7], attr[7], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[7], "Create input buffer fail.", final ); + buffer[8] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[8], attr[8], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[8], "Create input buffer fail.", final ); + buffer[9] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[9], attr[9], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[9], "Create input buffer fail.", final ); + + buffer[10] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[10], attr[10], TRUE ); + buffer[11] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[11], attr[11], TRUE ); + buffer[12] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[12], attr[12], TRUE ); + + buffer[13] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[13], attr[13], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[13], "Create input buffer fail.", final ); + buffer[14] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[14], attr[14], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[14], "Create input buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[15], &gate_activation); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[16], &candidate_activation); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[17], &use_cudnn_implementation); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[18], &input_layout); + CHECK_STATUS_FAIL_GOTO(status, final); + + if(GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_NC == input_layout) + { + batch = attr[1]->shape->data[1]; + hidden_units = attr[1]->shape->data[0]; + + if(buffer[2] == NULL) + { + hidden_units = hidden_units / 3; + } + + for( i = 0; i < batch; i++ ) + { + float* input_base = buffer[0] + i * hidden_units; + float* output_base = buffer[13] + i * hidden_units; + + if(buffer[2] == NULL) + { + float* input_fc_base = buffer[1] + i * hidden_units * 3; + float* recurrent_fc_base = buffer[4] + i * hidden_units * 3; + + i_r_base = input_fc_base + 0 * hidden_units; + i_u_base = input_fc_base + 1 * hidden_units; + i_c_base = input_fc_base + 2 * hidden_units; + + r_r_base = recurrent_fc_base + 0 * hidden_units; + r_u_base = recurrent_fc_base + 1 * hidden_units; + r_c_base = recurrent_fc_base + 2 * hidden_units; + } + else + { + i_r_base = buffer[1] + i * hidden_units; + i_u_base = buffer[2] + i * hidden_units; + i_c_base = buffer[3] + i * hidden_units; + r_r_base = buffer[4] + i * hidden_units; + r_u_base = buffer[5] + i * hidden_units; + r_c_base = buffer[6] + i * hidden_units; + } + + for( j = 0; j < hidden_units; j++ ) + { + cond_reset = buffer[10] ? buffer[10][j] : cond_reset; + cond_update = buffer[11] ? buffer[11][j] : cond_update; + cond_candidate = buffer[12] ? 
buffer[12][j] : cond_candidate; + + bias_r = buffer[7][j]; + bias_u = buffer[8][j]; + bias_c = buffer[9][j]; + + i_r = i_r_base[j]; + i_u = i_u_base[j]; + i_c = i_c_base[j]; + + r_r = r_r_base[j]; + r_u = r_u_base[j]; + r_c = r_c_base[j]; + + r = vsi_nn_activation(i_r + cond_reset + r_r + bias_r, gate_activation); + u = vsi_nn_activation(i_u + cond_update + r_u + bias_u, gate_activation); + c = vsi_nn_activation(i_c + cond_candidate + r * (r_c + bias_c), candidate_activation); + state = u * (input_base[j] - c) + c; + + output_base[j] = state; + } + } + } + else + { + vsi_bool input_transposed = FALSE; + float* input_base = buffer[0]; + float* output_base = buffer[13]; + float* curr_input = NULL; + float* curr_output = NULL; + + batch = attr[1]->shape->data[0]; + hidden_units = attr[1]->shape->data[1]; + + if(buffer[2] == NULL) + { + hidden_units = hidden_units / 3; + i_r_base = buffer[1] + 0 * hidden_units * batch; + i_u_base = buffer[1] + 1 * hidden_units * batch; + i_c_base = buffer[1] + 2 * hidden_units * batch; + r_r_base = buffer[4] + 0 * hidden_units * batch; + r_u_base = buffer[4] + 1 * hidden_units * batch; + r_c_base = buffer[4] + 2 * hidden_units * batch; + } + else + { + i_r_base = buffer[1]; + i_u_base = buffer[2]; + i_c_base = buffer[3]; + r_r_base = buffer[4]; + r_u_base = buffer[5]; + r_c_base = buffer[6]; + } + + if(GRUCELL_ACTIVATION_INPUT_LAYOUT_INPUT_NC_FC_CN == input_layout) + { + input_transposed = FALSE; + } + else + { + input_transposed = TRUE; + } + + for( i = 0; i < hidden_units; i++ ) + { + cond_reset = buffer[10] ? buffer[10][i] : cond_reset; + cond_update = buffer[11] ? buffer[11][i] : cond_update; + cond_candidate = buffer[12] ? buffer[12][i] : cond_candidate; + bias_r = buffer[7][i]; + bias_u = buffer[8][i]; + bias_c = buffer[9][i]; + + for( j = 0; j < batch; j++ ) + { + if(input_transposed) + { + curr_input = &input_base[i * batch + j]; + curr_output = &output_base[i * batch + j]; + } + else + { + curr_input = &input_base[j * hidden_units + i]; + curr_output = &output_base[j * hidden_units + i]; + } + + i_r = i_r_base[i * batch + j]; + i_u = i_u_base[i * batch + j]; + i_c = i_c_base[i * batch + j]; + r_r = r_r_base[i * batch + j]; + r_u = r_u_base[i * batch + j]; + r_c = r_c_base[i * batch + j]; + + r = vsi_nn_activation(i_r + cond_reset + r_r + bias_r, gate_activation); + u = vsi_nn_activation(i_u + cond_update + r_u + bias_u, gate_activation); + c = vsi_nn_activation(i_c + cond_candidate + r * (r_c + bias_c), candidate_activation); + state = u * (*curr_input - c) + c; + + *curr_output = state; + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[13], attr[13], + buffer[13], batch * hidden_units ); + CHECK_STATUS_FAIL_GOTO( status, final ); + status = vsi_nn_kernel_tensor_write_from_float( tensors[14], attr[14], + buffer[13], batch * hidden_units ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _IO_COUNT_SEPARATED; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _compute() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t gate_activation, + int32_t candidate_activation, + int32_t input_category, + vsi_bool use_cudnn_implementation, + int32_t* param_count, + int32_t* input_count, + int32_t* output_count + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + if(input_category == 
0) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _grucell_activation_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _grucell_activation_kernel_param_def ); + *param_count = _GRUCELL_ACTIVATION_PARAM_NUM; + *input_count = 3; + *output_count = 2; + status = VSI_SUCCESS; + } + else + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute_separated; + kernel->info.parameters = _grucell_activation_separated_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _grucell_activation_separated_kernel_param_def ); + *param_count = _GRUCELL_ACTIVATION_SEPARATED_PARAM_NUM; + *input_count = 13; + *output_count = 2; + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t* node_params = NULL; + vsi_nn_kernel_node_t node = NULL; + int32_t i = 0; + int32_t j = 0; + int32_t param_count = 0; + int32_t input_count = 0; + int32_t output_count = 0; + int32_t gate_activation = vsi_nn_kernel_param_get_int32( params, "gate_activation" ); + int32_t candidate_activation = vsi_nn_kernel_param_get_int32( params, "candidate_activation" ); + int32_t input_category = vsi_nn_kernel_param_get_int32( params, "input_category" ); + int32_t use_cudnn_implementation = vsi_nn_kernel_param_get_int32( params, "use_cudnn_implementation" ); + grucell_activation_input_layout_e input_layout = vsi_nn_kernel_param_get_int32( params, "input_layout" ); + vsi_nn_tensor_t** _inputs = NULL; + + status = _query_kernel( kernel, inputs, outputs, gate_activation, candidate_activation, + input_category, use_cudnn_implementation, ¶m_count, &input_count, &output_count ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + _inputs = (vsi_nn_tensor_t**)malloc(input_count * sizeof(vsi_nn_tensor_t**)); + node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count); + for(i = 0; i < input_count; i++) + { + _inputs[i] = inputs[i]; + } + + j = input_count + output_count; + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, param_count, + _inputs, input_count, outputs, output_count ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &gate_activation ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &candidate_activation ); + if(input_category != 0) + { + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &use_cudnn_implementation ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &input_layout ); + } + /* Pass parameters to node. 
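               The two (or, for the separated path, four) scalars appended above end at
               index j; after vsi_nn_kernel_node_pass_param() they are released in reverse
               order by walking j back down, which keeps the release count in step with
               however many scalars the chosen input_category actually created.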
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, param_count ); + if(input_category != 0) + { + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + } + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + } + } + + vsi_nn_safe_free(_inputs); + vsi_nn_safe_free(node_params); + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( grucell_activation, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_sma_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_sma_cpu.c new file mode 100644 index 0000000..5b6f715 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_sma_cpu.c @@ -0,0 +1,182 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
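 * The "sma" kernel takes three tensor inputs and two tensor outputs and computes,
 * element by element, out = (in0 - in1) * in2 + in1, i.e. a linear interpolation
 * between in1 and in0 weighted by in2; the same result is written to both outputs.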
+ */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (2) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.grucell_activation_sma") + +/* + * Kernel params + */ +static vx_param_description_t _grucell_activation_sma_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _GRUCELL_ACTIVATION_SMA_PARAM_NUM _cnt_of_array( _grucell_activation_sma_kernel_param_def ) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + int32_t i = 0; + int32_t batch = 0; + int32_t hidden_units = 0; + float * buffer[_IO_NUM] = { NULL }; + vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL }; + vsi_nn_kernel_tensor_attr_t* attr[_IO_NUM] = { NULL }; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; + tensors[4] = (vsi_nn_kernel_tensor_t)param[4]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + attr[4] = vsi_nn_kernel_tensor_attr_create( tensors[4] ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create input buffer fail.", final ); + buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create input buffer fail.", final ); + + batch = attr[0]->shape->data[1]; + hidden_units = attr[0]->shape->data[0]; + + for( i = 0; i < batch * hidden_units; i++ ) + { + buffer[3][i] = (buffer[0][i] - buffer[1][i]) * buffer[2][i] + buffer[1][i]; + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + buffer[3], batch * hidden_units ); + CHECK_STATUS_FAIL_GOTO( status, final ); + status = vsi_nn_kernel_tensor_write_from_float( tensors[4], attr[4], + buffer[3], batch * hidden_units ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _compute() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _grucell_activation_sma_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _grucell_activation_sma_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* 
_query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_SMA_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_ACTIVATION_SMA_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_SMA_PARAM_NUM ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( grucell_activation_sma, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c new file mode 100644 index 0000000..b1e9860 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c @@ -0,0 +1,259 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
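 * The CPU reference kernel takes three tensor inputs (data, bias and scale), one
 * tensor output and a single F32 scalar (eps). For every (batch, channel) plane it
 * computes the mean and variance over height * width, then writes
 * (x - mean) / sqrt(variance + eps) * scale[c] + bias[c] for each element, as
 * implemented in _instance_norm_exec() below.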
+ */ +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (3) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.instance_norm") + +DEF_KERNEL_EXECUTOR(_instance_norm_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + float eps = .0f; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); + + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &eps); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); + + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create input1 buffer fail.", final ); + + buffer[3] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); + memset( buffer[3], 0, out_elements * sizeof(float) ); + + { + uint32_t b = 0, c = 0, h = 0, w = 0; + uint32_t height = attr[0]->shape->data[1]; + uint32_t width = attr[0]->shape->data[0]; + uint32_t ch = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; + uint32_t bh = attr[0]->shape->size > 3 ? 
attr[0]->shape->data[3] : 1; + + for (b = 0; b < bh; b++) + { + for (c = 0; c < ch; c++) + { + uint32_t page = c * (height * width) + b * (height * width * ch); + float sum = .0f; + float sumsq = .0f; + float mean = .0f; + float vari = .0f; + float data = 0; + float scaleVal = buffer[2][c]; + float biasVal = buffer[1][c]; + + for (h = 0; h < height; h++) + { + uint32_t len = page + h * width; + + for (w = 0; w < width; w++) + { + uint32_t index = len + w; + sum += buffer[0][index]; + } + } + mean = sum / (width * height); + for (h = 0; h < height; h++) + { + uint32_t len = page + h * width; + for (w = 0; w < width; w++) + { + uint32_t index = len + w; + data = buffer[0][index] - mean; + sumsq += data * data; + } + } + vari = sumsq / (width * height); + vari = (float)(1.0 / sqrtf(vari + eps)); + for (h = 0; h < height; h++) + { + uint32_t len = page + h * width; + for (w = 0; w < width; w++) + { + float normVal = 0; + uint32_t index = len + w; + data = buffer[0][index] - mean; + + normVal = data * vari * scaleVal + biasVal; + buffer[3][index] = normVal; + } + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + buffer[3], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _pre_process_yuv420_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _instance_normalization_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _INSTANCE_NORMALIZATION_PARAM_NUM _cnt_of_array( _instance_normalization_kernel_param_def ) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _instance_norm_exec, + _instance_normalization_kernel_param_def, + _cnt_of_array( _instance_normalization_kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + + /* Pass parameters to node. 
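               eps is read from the kernel params as a float and passed as the F32 scalar
               at parameter index 4; the local handle is released once the node has taken it.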
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( instance_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/l2normalizescale_cpu.c b/src/tim/vx/internal/src/kernel/cpu/l2normalizescale_cpu.c new file mode 100644 index 0000000..f3f096a --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/l2normalizescale_cpu.c @@ -0,0 +1,248 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
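 * The kernel takes two tensor inputs (data and a scale vector indexed along the
 * normalization axis), one tensor output and an I32 axis scalar. Along the chosen
 * axis each slice is divided by its L2 norm (floored by a small epsilon) and
 * multiplied by the corresponding scale value, as implemented in _compute() below.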
+ */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.l2normalizescale") + + +/* + * Kernel params + */ +static vx_param_description_t _l2normalizescale_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _L2NORMALIZESCALE_PARAM_NUM _cnt_of_array( _l2normalizescale_kernel_param_def ) + +#define SCALAR_INPUT_AXIS (3) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i, index; + int32_t axis = 0; + int32_t outerSize = 1; + int32_t axisSize = 1; + int32_t innerSize = 1; + int32_t inner = 0; + int32_t outer = 0; + float rsqrt = 0.0f, scaleValue = 0.0f; + float epsilon = (float)10e-12; + float l2Value = 0.0f, tmpValue = 0.0f; + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + for (i = 0; i < (uint32_t)axis; i++) + { + innerSize *= in_attr[0]->shape->data[i]; + } + + axisSize = in_attr[0]->shape->data[axis]; + + for (i = (uint32_t)axis + 1; i < in_attr[0]->shape->size; i++) + { + outerSize *= in_attr[0]->shape->data[i]; + } + + for (outer = 0; outer < outerSize; ++outer) { + for (inner = 0; inner < innerSize; ++inner) { + float sum = 0.0f; + + for (i = 0; i < (uint32_t)axisSize; ++i) { + index = (outer * axisSize + i) * innerSize + inner; + tmpValue = f32_in_buffer[0][index]; + sum += tmpValue * tmpValue; + } + rsqrt = 1.0f / sqrtf(vsi_nn_max(sum, epsilon)); + for (i = 0; i < (uint32_t)axisSize; ++i) { + index = (outer * axisSize + i) * innerSize + inner; + tmpValue = f32_in_buffer[0][index];; + scaleValue = f32_in_buffer[1][i];; + l2Value = tmpValue * rsqrt * scaleValue; + f32_out_buffer[0][index] = l2Value; + } + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], 
out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _l2normalizescale_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _l2normalizescale_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_L2NORMALIZESCALE_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _L2NORMALIZESCALE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _L2NORMALIZESCALE_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( l2normalizescale, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c new file mode 100644 index 0000000..4f56938 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c @@ -0,0 +1,235 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "client/vsi_nn_vxkernel.h" +__BEGIN_DECLS + +#define _CPU_ARG_NUM (2) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("log_softmax_sw") + +DEF_KERNEL_EXECUTOR(_log_softmax_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + int32_t axis = 0; + float beta = 0; + int32_t outerSize = 1; + int32_t axisSize = 1; + int32_t innerSize = 1; + int32_t i = 0; + int32_t inner = 0; + int32_t outer = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &beta); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + for (i = 0; i < axis; i++) + { + innerSize *= attr[0]->shape->data[i]; + } + + axisSize = attr[0]->shape->data[axis]; + + for (i = axis + 1; i < (int32_t)attr[0]->shape->size; i++) + { + outerSize *= attr[0]->shape->data[i]; + } + + for ( outer = 0; outer < outerSize; ++outer) + { + for ( inner = 0; inner < innerSize; ++inner) + { + // We subtract the maximum value from each element to ensure + // numerical stability, taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + float sum = 0; + float logSum = 0; + float maxValue = buffer[0][outer * axisSize * innerSize + inner]; + for (i = 1; i < axisSize; ++i) + { + maxValue = vsi_nn_max(maxValue, buffer[0][(outer * axisSize + i) * innerSize + inner]); + } + + sum = 0; + for (i = 0; i < axisSize; ++i) + { + sum += expf((buffer[0][(outer * axisSize + i) * innerSize + inner] - maxValue) * beta); + } + + logSum = logf(sum); + for (i = 0; i < axisSize; ++i) + { + buffer[1][(outer * axisSize + i) * innerSize + inner] = + (buffer[0][(outer * axisSize + i) * innerSize + 
inner] - maxValue) * beta - + logSum; + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _log_softmax_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _log_softmax_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +#define SCALAR_INPUT_AXIS (2) +#define SCALAR_INPUT_BETA (3) + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + float beta = 1.0f; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + beta = vsi_nn_kernel_param_get_float32(params, "beta"); + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + backend_params[SCALAR_INPUT_BETA] = vsi_nn_kernel_scalar_create( + graph, F32, &beta ); + + /* Pass parameters to node. 
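               axis goes in as the I32 scalar at index 2 and beta as the F32 scalar at
               index 3; both local handles are released after the node has received them.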
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] ); + vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_BETA] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( log_softmax, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/logical_not_cpu.c b/src/tim/vx/internal/src/kernel/cpu/logical_not_cpu.c new file mode 100644 index 0000000..c3917a6 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/logical_not_cpu.c @@ -0,0 +1,200 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
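+ * The logical_not CPU reference takes one tensor input and produces one tensor output;
+ * _KERNEL_NAME below is the name that _query_kernel installs for the software executor.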
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.logical_not") + + +/* + * Kernel params + */ +static vx_param_description_t _logical_not_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _LOGICAL_NOT_PARAM_NUM _cnt_of_array( _logical_not_kernel_param_def ) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + for (i = 0; i < out_elements[0]; i++) + { + f32_out_buffer[0][i] = (float)(!f32_in_buffer[0][i]); + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _logical_not_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _logical_not_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + 
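+    /* Setup flow: _query_kernel fills in the kernel description, a node is created on the
+       graph, then the input and output tensors are packed into node_params and passed to
+       the node with vsi_nn_kernel_node_pass_param. */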
vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LOGICAL_NOT_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( kernel, inputs, outputs); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _LOGICAL_NOT_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _LOGICAL_NOT_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( logical_not, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/logical_ops_cpu.c b/src/tim/vx/internal/src/kernel/cpu/logical_ops_cpu.c new file mode 100644 index 0000000..8e89e7a --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/logical_ops_cpu.c @@ -0,0 +1,267 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
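+ * logical_ops is the shared CPU reference for the binary logical operators: the two tensor
+ * inputs are broadcast against the output shape via _expand_offset, and the ops_type
+ * scalar selects OR, AND or XOR at runtime.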
+ */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.logical_ops") + +/* + * Kernel params + */ +static vx_param_description_t _logical_ops_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _LOGICAL_OPS_PARAM_NUM _cnt_of_array( _logical_ops_kernel_param_def ) + + +static int32_t _expand_offset + ( + int32_t index, + int32_t * shape, size_t rank, + size_t * strides, int32_t * out_shape + ) +{ + uint32_t i; + int32_t offset = 0; + + for( i = 0; i < rank && index; i ++ ) + { + if( shape[i] == out_shape[i] ) + { + offset += (int32_t)strides[i] * ( index % out_shape[i] ); + } + index /= out_shape[i]; + } + return offset; +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + uint32_t ops_type_int = 0; + vsi_nn_logical_ops_type_t ops_type = VSI_NN_LOGICAL_OR; + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + vsi_nn_kernel_scalar_read_uint32((vsi_nn_kernel_scalar_t)param[_CPU_IO_NUM], &(ops_type_int)); + ops_type = (vsi_nn_logical_ops_type_t)ops_type_int; + if (!(VSI_NN_LOGICAL_OR == ops_type || VSI_NN_LOGICAL_AND == ops_type || VSI_NN_LOGICAL_XOR == ops_type)) + { + status = VSI_FAILURE; + goto final; + } + + for (i = 0; i < out_elements[0]; i++) + { + int32_t in0_offset = 0; + int32_t in1_offset = 0; + int32_t in0 = 0; + int32_t in1 = 0; + + + in0_offset = _expand_offset( i, in_attr[0]->shape->data, in_attr[0]->shape->size, + in_stride_size[0], out_attr[0]->shape->data ); + in1_offset = _expand_offset( i, in_attr[1]->shape->data, in_attr[1]->shape->size, + in_stride_size[1], out_attr[0]->shape->data ); + in0 = (!!(f32_in_buffer[0][in0_offset])); + in1 = (!!(f32_in_buffer[1][in1_offset])); + if 
(VSI_NN_LOGICAL_OR == ops_type) + { + f32_out_buffer[0][i] = (float)(in0 || in1); + } + else if (VSI_NN_LOGICAL_AND == ops_type) + { + f32_out_buffer[0][i] = (float)(in0 && in1); + } + else if (VSI_NN_LOGICAL_XOR == ops_type) + { + f32_out_buffer[0][i] = (float)(in0 ^ in1); + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _logical_ops_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _logical_ops_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LOGICAL_OPS_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + uint32_t ops_type = vsi_nn_kernel_param_get_int32( params, "ops_type" ); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _LOGICAL_OPS_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[_CPU_IO_NUM] = vsi_nn_kernel_scalar_create( graph, U32, &ops_type ); + /* Pass parameters to node. 
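+               The ops_type scalar occupies slot _CPU_IO_NUM (3), matching the final
+               VX_TYPE_SCALAR entry of _logical_ops_kernel_param_def, and is released once
+               the parameters have been passed to the node.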
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _LOGICAL_OPS_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( logical_ops, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/lstmunit_activation_cpu.c b/src/tim/vx/internal/src/kernel/cpu/lstmunit_activation_cpu.c new file mode 100644 index 0000000..8bd6d3b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/lstmunit_activation_cpu.c @@ -0,0 +1,406 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
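+ * Input and output counts are taken from the LSTMUNIT_ACT_*_COUNT enums, so the parameter
+ * list below follows the lstmunit activation operation definition; most tensor parameters
+ * are optional because the layer-norm, CIFG, projection and hybrid variants consume
+ * different subsets of them.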
+ */ +#define _INPUT_NUM (LSTMUNIT_ACT_INPUTS_COUNT) +#define _OUTPUT_NUM (LSTMUNIT_ACT_OUTUTS_COUNT) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.lstmunit_activation") + + +/* + * Kernel params + */ +static vx_param_description_t _lstmunit_activation_kernel_param_def[] = +{ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*0 input_fc_i */ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, /*1 input_fc_f */ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, /*2 input_fc_c */ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, /*3 input_fc_o */ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, /*4 cs_in */ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*5 hstate_fc_i */ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*6 hstate_fc_f */ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*7 hstate_fc_c */ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*8 hstate_fc_o */ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*9 biases_i*/ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*10 biases_f*/ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*11 biases_c*/ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*12 biases_o*/ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*13 ln_w_i*/ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*14 ln_w_f*/ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*15 ln_w_c*/ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*16 ln_w_o*/ + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, /*17 output*/ + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, /*18 cs_out*/ + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*19 hs_out*/ + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, /*20 _is_ln*/ + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, /*21 _is_cifg*/ + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, /*22 _is_proj*/ + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, /*23 _is_hybrid*/ + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, /*24 recurrent_activation*/ + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, /*25 forget_bias*/ +}; +#define _LSTMUNIT_ACTIVATION_PARAM_NUM _cnt_of_array( _lstmunit_activation_kernel_param_def ) + +#define SCALAR_IS_LN (20) +#define SCALAR_IS_CIFG (21) +#define SCALAR_IS_PROG (22) +#define SCALAR_IS_HYBRID (23) +#define SCALAR_ACTIVATION (24) +#define SCALAR_FORGET_BIAS (25) + +static float activationFunctor(float a, vsi_nn_activation_e act_) +{ + switch (act_) + { + case VSI_NN_ACT_NONE: + return a; + case VSI_NN_ACT_RELU: + return a < 0.f ? 0.f : a; + case VSI_NN_ACT_RELU6: + return vsi_nn_max(0.f, vsi_nn_min(a, 6.f)); + case VSI_NN_ACT_TANH: + return (float)tanh(a); + case VSI_NN_ACT_SIGMOID: + return (float)(1.0f / (1.0f + exp(-a))); + case VSI_NN_ACT_HARD_SIGMOID: + a = a * 0.2f + 0.5f; + return vsi_nn_max(0.f, vsi_nn_min(a, 1.f)); + default: + // TODO(aselle): More informative fatal error! 
+ exit(1); + } +} + +#define gcoMATH_Exp(X) (float)(expf((X))) +#define gcoMATH_TangentH(X) (float)(tanhf((X))) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; + size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i, b; + int32_t _is_ln = 0; + int32_t _is_cifg = 0; + int32_t _is_proj = 0; + int32_t _is_hybrid = 0; + int32_t recurrent_activation; + vsi_nn_activation_e activation_mode; + uint32_t n_batch = 0; + uint32_t n_cell = 0; + float forget_bias; + /* prepare data */ + for( i = 0; i < _INPUT_NUM; i++ ) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + if (input[i]) + { + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + + } + + for( i = 0; i < _OUTPUT_NUM; i++ ) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + if (output[i]) + { + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + } + + status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[SCALAR_IS_LN], &_is_ln ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[SCALAR_IS_CIFG], &_is_cifg ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[SCALAR_IS_PROG], &_is_proj ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[SCALAR_IS_HYBRID], &_is_hybrid ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ACTIVATION], &recurrent_activation ); + CHECK_STATUS_FAIL_GOTO(status, final ); + activation_mode = (vsi_nn_activation_e)recurrent_activation; + status = vsi_nn_kernel_scalar_read_float32( (vsi_nn_kernel_scalar_t)param[SCALAR_FORGET_BIAS], &forget_bias ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + n_cell = in_attr[LSTMUNIT_ACT_CSTATE_IN]->shape->data[0]; + n_batch = in_attr[LSTMUNIT_ACT_CSTATE_IN]->shape->data[1]; + + for (b = 0; b < n_batch; b ++) + { + for (i = 0; i < n_cell; i++) + { + uint32_t index = i + n_cell * b; + float data_i_t = 0; + float data_f_t = 0; + float data_g_t = 0; + float data_o_t = 0; + float data_c_t = 0; + float data_h_t = 0; + + data_i_t = _is_cifg ? 
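+            /* Gate math for one cell: i, f, g and o accumulate the input FC results (plus
+               hidden-state FC results when layer norm is off, and layer-norm scale/bias
+               terms when it is on), then c_t = f * c_prev + i * g and h_t = o * tanh(c_t);
+               with CIFG the input gate is coupled as i = 1 - f. */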
0 : f32_in_buffer[LSTMUNIT_ACT_INPUT_FC_I][index]; + data_f_t = f32_in_buffer[LSTMUNIT_ACT_INPUT_FC_F][index]; + data_g_t = f32_in_buffer[LSTMUNIT_ACT_INPUT_FC_C][index]; + data_o_t = f32_in_buffer[LSTMUNIT_ACT_INPUT_FC_O][index]; + data_c_t = f32_in_buffer[LSTMUNIT_ACT_CSTATE_IN][index]; + + if (!_is_ln) + { + data_i_t += _is_cifg ? 0 : f32_in_buffer[LSTMUNIT_ACT_HSTATE_FC_I][index]; + data_f_t += f32_in_buffer[LSTMUNIT_ACT_HSTATE_FC_F][index]; + data_g_t += f32_in_buffer[LSTMUNIT_ACT_HSTATE_FC_C][index]; + data_o_t += f32_in_buffer[LSTMUNIT_ACT_HSTATE_FC_O][index]; + } + + if (!_is_cifg) + { + if (_is_ln) + { + data_i_t *= f32_in_buffer[LSTMUNIT_ACT_LN_WI][i]; + data_i_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BI][i]; + } + else if (_is_hybrid) + { + data_i_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BI][i]; + } + } + + if (_is_ln) + { + data_f_t *= f32_in_buffer[LSTMUNIT_ACT_LN_WF][i]; + data_f_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BF][i]; + data_g_t *= f32_in_buffer[LSTMUNIT_ACT_LN_WC][i]; + data_g_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BC][i]; + data_o_t *= f32_in_buffer[LSTMUNIT_ACT_LN_WO][i]; + data_o_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BO][i]; + } + else if (_is_hybrid) + { + data_f_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BF][i]; + data_g_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BC][i]; + data_o_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BO][i]; + } + + data_f_t += forget_bias; + data_f_t = activationFunctor(data_f_t, activation_mode); + + if (_is_cifg) + data_i_t = 1 - data_f_t; + else + data_i_t = activationFunctor(data_i_t, activation_mode); + data_g_t = gcoMATH_TangentH(data_g_t); + data_o_t = activationFunctor(data_o_t, activation_mode); + data_c_t = data_f_t * data_c_t + data_i_t * data_g_t; + data_h_t = data_o_t * gcoMATH_TangentH(data_c_t); + + f32_out_buffer[LSTMUNIT_ACT_CSTATE_OUT][index] = data_c_t; + f32_out_buffer[LSTMUNIT_ACT_OUTPUT][index] = data_h_t; + + if (!_is_proj) + { + f32_out_buffer[LSTMUNIT_ACT_HSTATE_OUT][index] = data_h_t; + } + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (output[i]) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; + +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _lstmunit_activation_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _lstmunit_activation_kernel_param_def ); + status = VSI_SUCCESS; + + return status; + +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t 
node_params[_LSTMUNIT_ACTIVATION_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t _is_ln= 0; + int32_t _is_cifg= 0; + int32_t _is_proj= 0; + int32_t _is_hybrid= 0; + int32_t recurrent_activation; + float forget_bias; + + _is_ln = vsi_nn_kernel_param_get_int32( params, "_is_ln" ); + _is_cifg = vsi_nn_kernel_param_get_int32( params, "_is_cifg" ); + _is_proj = vsi_nn_kernel_param_get_int32( params, "_is_proj" ); + _is_hybrid = vsi_nn_kernel_param_get_int32( params, "_is_hybrid" ); + recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + forget_bias = vsi_nn_kernel_param_get_float32(params, "forget_bias"); + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _LSTMUNIT_ACTIVATION_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_IS_LN] = vsi_nn_kernel_scalar_create( + graph, I32, &_is_ln ); + node_params[SCALAR_IS_CIFG] = vsi_nn_kernel_scalar_create( + graph, I32, &_is_cifg ); + node_params[SCALAR_IS_PROG] = vsi_nn_kernel_scalar_create( + graph, I32, &_is_proj ); + node_params[SCALAR_IS_HYBRID] = vsi_nn_kernel_scalar_create( + graph, I32, &_is_hybrid ); + node_params[SCALAR_ACTIVATION] = vsi_nn_kernel_scalar_create( + graph, I32, &recurrent_activation ); + node_params[SCALAR_FORGET_BIAS] = vsi_nn_kernel_scalar_create( + graph, F32, &forget_bias ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _LSTMUNIT_ACTIVATION_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_IS_LN] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_IS_CIFG] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_IS_PROG] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_IS_HYBRID] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ACTIVATION] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_FORGET_BIAS] ); + } + } + + return node; + +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( lstmunit_activation, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c b/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c new file mode 100644 index 0000000..4e8097d --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c @@ -0,0 +1,264 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _CPU_ARG_NUM (2) +#define _CPU_INPUT_NUM (2) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.matrixmul") + +DEF_KERNEL_EXECUTOR(_matrixmul_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[3] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + int32_t i = 0; + int32_t M = 0, K = 0, N = 0; + int32_t transposeA = 0, transposeB = 0; + vx_size strides0[2] = {0, 0}, strides1[2] = {0, 0}; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &transposeA); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &transposeB); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); + + buffer[2] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); + memset( buffer[2], 0, out_elements * sizeof(float) ); + + K = attr[0]->shape->data[0]; + M = attr[2]->shape->data[1]; + N = attr[2]->shape->data[0]; + + if(transposeA) + { + K = attr[0]->shape->data[1]; + } + + strides0[0] = transposeA? 1:K; + strides0[1] = transposeA? M:1; + + strides1[0] = transposeB? 1:N; + strides1[1] = transposeB? K:1; + + { + int32_t batch = attr[2]->shape->size > 3 ? attr[2]->shape->data[3] : 1; + int32_t depth = attr[2]->shape->size > 2 ? attr[2]->shape->data[2] : 1; + int32_t a_depth = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; + int32_t b_depth = attr[1]->shape->size > 2 ? 
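+        /* ac2zero / bc2zero zero the per-depth offset of whichever input has the smaller
+           rank or depth, so that input is broadcast across the other input's depth and
+           batch loops. */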
attr[1]->shape->data[2] : 1; + int32_t b = 0, c = 0, j = 0, y = 0; + int32_t offsetA = 0, offsetB = 0, offsetD = 0; + int32_t ac2zero = 1; + int32_t bc2zero = 1; + + if((attr[0]->shape->size > attr[1]->shape->size) || + (attr[0]->shape->data[2] > attr[1]->shape->data[2] + && attr[0]->shape->size > 2 && attr[1]->shape->size > 2)) + { + bc2zero = 0; + } + else if((attr[1]->shape->size > attr[0]->shape->size) || + (attr[1]->shape->data[2] > attr[0]->shape->data[2] + && attr[0]->shape->size > 2 && attr[1]->shape->size > 2)) + { + ac2zero = 0; + } + + for(b = 0; b < batch; b++) + { + for(c = 0; c < depth; c++) + { + offsetA = c * M * K * ac2zero + b * M * K * a_depth; + offsetB = c * N * K * bc2zero + b * N * K * b_depth; + offsetD = c * M * N + b * M * N * depth; + for(i = 0 ; i < M; i++) + { + for(j = 0; j < N; j++) + { + float sum = 0; + for(y = 0; y < K; y++) + { + float dataA = buffer[0][i * strides0[0] + y * strides0[1] + offsetA]; + float dataB = buffer[1][y * strides1[0] + j * strides1[1] + offsetB]; + + sum += dataA * dataB; + } + buffer[2][j + i * N + offsetD] = sum; + } + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[2], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < 3; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _pre_process_yuv420_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _matrixmul_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _MATIRXMUL_PARAM_NUM _cnt_of_array( _matrixmul_kernel_param_def ) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _matrixmul_exec, + _matrixmul_kernel_param_def, + _cnt_of_array( _matrixmul_kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" ); + int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" ); + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 3; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeA ); + 
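+            /* Index 4 carries the transposeB flag; both scalars correspond to the two
+               VX_TYPE_SCALAR entries of _matrixmul_kernel_param_def and are released after
+               vsi_nn_kernel_node_pass_param. */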
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeB ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[3] ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( matrixmul, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c new file mode 100644 index 0000000..61bba5c --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c @@ -0,0 +1,214 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +#define _CPU_ARG_NUM (0) +#define _CPU_INPUT_NUM (2) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("maximum_sw") + +static int32_t _expand_offset + ( + int32_t index, + int32_t * shape, size_t rank, + size_t * strides, int32_t * out_shape + ) +{ + uint32_t i; + int32_t offset = 0; + + for( i = 0; i < rank && index; i ++ ) + { + if( shape[i] == out_shape[i] ) + { + offset += (uint32_t)strides[i] * ( index % out_shape[i] ); + } + index /= out_shape[i]; + } + return offset; +} + +DEF_KERNEL_EXECUTOR(_maximum_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_SUCCESS; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + + vsi_nn_kernel_tensor_attr_get_stride( attr[0], stride_size[0] ); + vsi_nn_kernel_tensor_attr_get_stride( attr[1], stride_size[1] ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); + + buffer[2] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); + memset( buffer[2], 0, out_elements * sizeof(float) ); + + for (i = 0; i < out_elements; i++) + { + int32_t in0_offset = 0; + int32_t in1_offset = 0; + float val1 = 0.f; + float val2 = 0.f; + + in0_offset = _expand_offset( i, attr[0]->shape->data, attr[0]->shape->size, + stride_size[0], attr[2]->shape->data ); + in1_offset = _expand_offset( i, attr[1]->shape->data, attr[1]->shape->size, + stride_size[1], attr[2]->shape->data ); + + val1 = buffer[0][in0_offset]; + val2 = buffer[1][in1_offset]; + + buffer[2][i] = vsi_nn_max( val1, val2 ); + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[2], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _maximum_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, 
VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; + + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _maximum_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( maximum, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c new file mode 100644 index 0000000..1a63797 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c @@ -0,0 +1,210 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +#define _CPU_ARG_NUM (0) +#define _CPU_INPUT_NUM (2) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("minimum_sw") + +static int32_t _expand_offset + ( + int32_t index, + int32_t * shape, size_t rank, + size_t * strides, int32_t * out_shape + ) +{ + uint32_t i; + int32_t offset = 0; + + for( i = 0; i < rank && index; i ++ ) + { + if( shape[i] == out_shape[i] ) + { + offset += (uint32_t)strides[i] * ( index % out_shape[i] ); + } + index /= out_shape[i]; + } + return offset; +} + +DEF_KERNEL_EXECUTOR(_minimum_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_SUCCESS; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + + vsi_nn_kernel_tensor_attr_get_stride( attr[0], stride_size[0] ); + vsi_nn_kernel_tensor_attr_get_stride( attr[1], stride_size[1] ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); + + buffer[2] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); + memset( buffer[2], 0, out_elements * sizeof(float) ); + + for( i = 0; i < out_elements; i ++ ) + { + int32_t in0_offset = 0; + int32_t in1_offset = 0; + float val1 = 0.f; + float val2 = 0.f; + + in0_offset = _expand_offset( i, attr[0]->shape->data, attr[0]->shape->size, + stride_size[0], attr[2]->shape->data ); + in1_offset = _expand_offset( i, attr[1]->shape->data, attr[1]->shape->size, + stride_size[1], attr[2]->shape->data ); + + val1 = buffer[0][in0_offset]; + val2 = buffer[1][in1_offset]; + + buffer[2][i] = vsi_nn_min( val1, val2 ); + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[2], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _minimum_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; + + 
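+/* Standard CPU reference wiring: kernel_param_def declares the VX parameter signature,
+   _kernel_info binds _minimum_exec together with the generic validator and initializer
+   callbacks, _query_kernel copies that description into the kernel object, and _setup
+   packs the tensors and passes them to the created node. */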
+static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _minimum_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( minimum, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c b/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c new file mode 100644 index 0000000..b9450c9 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c @@ -0,0 +1,315 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
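+ * moments produces two outputs, the mean and the variance of the input over the reduction
+ * axes; when the axes form a contiguous run the outer/axis/inner decomposition is used,
+ * otherwise a bitmask of axes selects the alternative path in _moments_exec.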
+ */ +#define _CPU_ARG_NUM (3) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (2) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.moments") + +DEF_KERNEL_EXECUTOR(_moments_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + int32_t axis_first = 0; + int32_t axis_num = 0; + uint32_t mask = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis_first); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis_num); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_uint32((vsi_nn_kernel_scalar_t)param[5], &mask); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + buffer[2] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); + memset( buffer[2], 0, out_elements * sizeof(float) ); + + if(mask == 0) + { + int32_t outerSize = 1; + int32_t axisSize = 1; + int32_t innerSize = 1; + int32_t inner = 0; + int32_t outer = 0; + + for (i = 0; i < (uint32_t)axis_first; i++) + { + innerSize *= attr[0]->shape->data[i]; + } + + for(i = 0; i < (uint32_t)axis_num; i++) + { + axisSize *= attr[0]->shape->data[axis_first + i]; + } + + for (i = (uint32_t)axis_first + axis_num; i < attr[0]->shape->size; i++) + { + outerSize *= attr[0]->shape->data[i]; + } + + for ( outer = 0; outer < outerSize; ++outer) + { + for ( inner = 0; inner < innerSize; ++inner) + { + float sum = .0f; + float sumsq = .0f; + float mean = .0f; + float vari = .0f; + + for (i = 0; i < (uint32_t)axisSize; ++i) + { + float value = buffer[0][(outer * axisSize + i) * innerSize + inner]; + sum += value; + sumsq += (value * value); + } + mean = sum / (axisSize); + vari = sumsq / (axisSize) - mean * mean; + buffer[1][outer * innerSize + inner] = (float)mean; + buffer[2][outer * innerSize + inner] = (float)vari; + } + } + } + else + { + int32_t width = attr[0]->shape->data[0]; + int32_t height = attr[0]->shape->size > 1 ? attr[0]->shape->data[1] : 1; + int32_t channel = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; + int32_t batch = attr[0]->shape->size > 3 ? 
attr[0]->shape->data[3] : 1; + int32_t width_o = attr[1]->shape->data[0]; + int32_t height_o = attr[1]->shape->size > 1 ? attr[1]->shape->data[1] : 1; + int32_t channel_o = attr[1]->shape->size > 2 ? attr[1]->shape->data[2] : 1; + int32_t b = 0, c = 0, h = 0; + int32_t wh_offset = width * height; + int32_t axisSize = width * channel; + int32_t vol = width_o * height_o * channel_o; + + for(b = 0; b < batch; b++) + { + for(h = 0; h < height; h++) + { + float sum = .0f; + float sumsq = .0f; + float mean = .0f; + float vari = .0f; + int h_offset = h * width; + for(c = 0; c < channel; c++) + { + int offset = h_offset + c * wh_offset; + for(i = 0; i < (uint32_t)width; i++) + { + float value = buffer[0][i + offset]; + sum += value; + sumsq += (value * value); + } + } + mean = sum / (axisSize); + vari = sumsq / (axisSize) - mean * mean; + buffer[1][b * vol + h] = (float)mean; + buffer[2][b * vol + h] = (float)vari; + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + status |= vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[2], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _pre_process_yuv420_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _moments_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _MOMENTS_PARAM_NUM _cnt_of_array( _moments_kernel_param_def ) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _moments_exec, + _moments_kernel_param_def, + _cnt_of_array( _moments_kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis_num = 0; + int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", (size_t*)&axis_num); + vsi_bool is_continue_axis = TRUE; + uint32_t mask = 0; + int32_t i = 0; + + for ( i = 1; i < axis_num; i++) + { + if ( axis[i] != (axis[i - 1] + 1) && axis[0] == 0) + { + is_continue_axis = FALSE; + break; + } + } + + if (is_continue_axis == FALSE) + { + for(i = 0; i < axis_num; i++) + { + mask |= (1 << axis[i]); + } + } + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) 
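+        /* With the node created, the tensors go into slots 0-2 and axis_first, axis_num
+           and the axis mask go into scalar slots 3-5 of _moments_kernel_param_def. */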
+ { + int32_t axis_first = axis[0]; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &axis_first ); + backend_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); + backend_params[5] = vsi_nn_kernel_scalar_create( graph, U32, &mask ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[3] ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + vsi_nn_kernel_scalar_release( &backend_params[5] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( moments, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/poolwithargmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/poolwithargmax_cpu.c new file mode 100644 index 0000000..84b0ff8 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/poolwithargmax_cpu.c @@ -0,0 +1,338 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (2) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.poolwithargmax") + + +/* + * Kernel params + */ +static vx_param_description_t _poolwithargmax_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; +#define _POOLWITHARGMAX_PARAM_NUM _cnt_of_array( _poolwithargmax_kernel_param_def ) + +#define SCALAR_KSZIE_X (3) +#define SCALAR_KSZIE_Y (4) +#define SCALAR_STRIDE_X (5) +#define SCALAR_STRIDE_Y (6) +#define SCALAR_PAD_X (7) +#define SCALAR_PAD_Y (8) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + int32_t i, j, b, p; + int32_t batch, depth_v, height_o, width_o, height, width; + int32_t ksize_x = 0; + int32_t ksize_y = 0; + int32_t stride_x = 0; + int32_t stride_y = 0; + int32_t pad_x = 0; + int32_t pad_y = 0; + int32_t output_base = 0; + int32_t input_base = 0; + int32_t max_index = 0; + vsi_nn_kernel_dtype_e out1_dtype; + vsi_bool is_relative_coord = FALSE; + + + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_KSZIE_X], &ksize_x); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_KSZIE_Y], &ksize_y); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_STRIDE_X], &stride_x); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_STRIDE_Y], &stride_y); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_PAD_X], &pad_x); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_PAD_Y], &pad_y); + + 
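+    /* Note: the loop below scans each ksize_x * ksize_y pooling window,
+     * tracking the maximum value and where it was found. When the argmax
+     * output tensor is an 8/16-bit integer type, the recorded index is
+     * relative to the window origin (row-major within the window);
+     * otherwise it is the flattened offset into the input tensor.
+     */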
CHECK_STATUS_FAIL_GOTO(status, final ); + + batch = out_attr[0]->shape->size > 3 ? out_attr[0]->shape->data[3] : 1; + depth_v = out_attr[0]->shape->size > 2 ? out_attr[0]->shape->data[2] : 1; + height_o = out_attr[0]->shape->data[1]; + width_o = out_attr[0]->shape->data[0]; + width = in_attr[0]->shape->data[0]; + height = in_attr[0]->shape->data[1]; + + out1_dtype = out_attr[1]->dtype; + + if ((I8 == out1_dtype) || (U8 == out1_dtype) || (I16 == out1_dtype)) + { + is_relative_coord = TRUE; + } + + for(b = 0; b < batch; b++) + { + for (p = 0; p < depth_v; p ++) + { + output_base = b * depth_v * height_o * width_o + p * height_o * width_o; + input_base = b * depth_v * height * width + p * height * width; + for (j = 0; j < height_o; j ++) + { + for (i = 0; i < width_o; i ++) + { + int32_t hstart = j * stride_y - pad_y; + int32_t wstart = i * stride_x - pad_x; + int32_t hoffset = 0; + int32_t woffset = 0; + int32_t hend = vsi_nn_min(hstart + ksize_y, height); + int32_t wend = vsi_nn_min(wstart + ksize_x, width); + int32_t pool_index = 0; + int32_t h, w = 0; + int32_t cur_index = 0; + float d_f32 = 0.0f; + + if (hstart < 0) + { + hoffset = -hstart; + } + + if (wstart < 0) + { + woffset = -wstart; + } + + hstart = vsi_nn_max(hstart, 0); + wstart = vsi_nn_max(wstart, 0); + + pool_index = output_base + j * width_o + i; + max_index = is_relative_coord ? 0 : (input_base + hstart * width + wstart); + d_f32 = f32_in_buffer[0][input_base + hstart * width + wstart]; + for (h = hstart; h < hend; ++ h) + { + cur_index = (h - hstart + hoffset) * ksize_x + woffset; + for (w = wstart; w < wend; ++ w) + { + int32_t index = input_base + h * width + w; + float d; + + d = f32_in_buffer[0][index]; + if (d > d_f32) + { + d_f32 = d; + max_index = is_relative_coord ? cur_index : index; + } + cur_index++; + } + } + f32_out_buffer[0][pool_index] = d_f32; + f32_out_buffer[1][pool_index] = (float)max_index; + } + } + } + } + out_attr[1]->quant = VSI_NN_KERNEL_QUANT_NONE; + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _poolwithargmax_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _poolwithargmax_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_POOLWITHARGMAX_PARAM_NUM] = {NULL}; + 
vsi_nn_kernel_node_t node = NULL; + int32_t ksize_x = 0; + int32_t ksize_y = 0; + int32_t stride_x = 0; + int32_t stride_y = 0; + int32_t pad_x = 0; + int32_t pad_y = 0; + + ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x"); + ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y"); + stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x"); + stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y"); + pad_x = vsi_nn_kernel_param_get_int32(params, "pad_x"); + pad_y = vsi_nn_kernel_param_get_int32(params, "pad_y"); + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _POOLWITHARGMAX_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_KSZIE_X] = vsi_nn_kernel_scalar_create( + graph, I32, &ksize_x ); + node_params[SCALAR_KSZIE_Y] = vsi_nn_kernel_scalar_create( + graph, I32, &ksize_y ); + node_params[SCALAR_STRIDE_X] = vsi_nn_kernel_scalar_create( + graph, I32, &stride_x ); + node_params[SCALAR_STRIDE_Y] = vsi_nn_kernel_scalar_create( + graph, I32, &stride_y ); + node_params[SCALAR_PAD_X] = vsi_nn_kernel_scalar_create( + graph, I32, &pad_x ); + node_params[SCALAR_PAD_Y] = vsi_nn_kernel_scalar_create( + graph, I32, &pad_y ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _POOLWITHARGMAX_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_KSZIE_X] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_KSZIE_Y] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_STRIDE_X] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_STRIDE_Y] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_PAD_X] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_PAD_Y] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( poolwithargmax, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c new file mode 100644 index 0000000..1f7c2eb --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c @@ -0,0 +1,213 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +#define _CPU_ARG_NUM (0) +#define _CPU_INPUT_NUM (2) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("pow_sw") + +static int32_t _expand_offset + ( + int32_t index, + int32_t * shape, size_t rank, + size_t * strides, int32_t * out_shape + ) +{ + uint32_t i = 0; + int32_t offset = 0; + + for( i = 0; i < rank && index; i ++ ) + { + if( shape[i] == out_shape[i] ) + { + offset += (int32_t)strides[i] * ( index % out_shape[i] ); + } + index /= out_shape[i]; + } + return offset; +} + +DEF_KERNEL_EXECUTOR(_pow_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_SUCCESS; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + vsi_nn_kernel_tensor_attr_get_stride( attr[0], stride_size[0] ); + vsi_nn_kernel_tensor_attr_get_stride( attr[1], stride_size[1] ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); + + buffer[2] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); + memset( buffer[2], 0, out_elements * sizeof(float) ); + + for( i = 0; i < out_elements; i ++ ) + { + int32_t in0_offset = 0; + int32_t in1_offset = 0; + float val1 = 0.f; + float val2 = 0.f; + + in0_offset = _expand_offset( i, attr[0]->shape->data, attr[0]->shape->size, + stride_size[0], attr[2]->shape->data ); + in1_offset = _expand_offset( i, attr[1]->shape->data, attr[1]->shape->size, + stride_size[1], attr[2]->shape->data ); + + val1 = buffer[0][in0_offset]; + val2 = buffer[1][in1_offset]; + + buffer[2][i] = (float)pow( val1, val2 ); + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[2], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _pow_exec() */ + 
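+/*
+ * Kernel params: two input tensors (base and exponent) and a single output
+ * tensor; the pow kernel takes no scalar arguments.
+ */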
+static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; + + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _pow_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( pow, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c new file mode 100644 index 0000000..3be8fc9 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c @@ -0,0 +1,384 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +#define _CPU_ARG_NUM (10) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_bgra_sw") + +#define DESCALE(x) (((x) + (1<<19)) >> 20) + +DEF_KERNEL_EXECUTOR(_pre_process_bgra_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + float * outBuffer = NULL; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; + float rMean = 0, gMean = 0, bMean = 0, var = 0; + int32_t order = 0, trans = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + i = 2; + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &rMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &gMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &bMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &var); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &order); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &trans); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + if(trans) + { + outBuffer = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( outBuffer, "Create output buffer fail.", final ); + memset( outBuffer, 0, out_elements * 
sizeof(float) ); + } + + { + int32_t elementSize = 4; + int32_t rline1[2], rline2[2]; + int32_t gline1[2], gline2[2]; + int32_t bline1[2], bline2[2]; + int32_t dx = 0, dy = 0, dz = 0; + int32_t src_stride = attr[0]->shape->data[0]; + int32_t src_width = src_stride / elementSize; + int32_t src_height = attr[0]->shape->data[1]; + int32_t dst_width = trans ? attr[1]->shape->data[1] : attr[1]->shape->data[0]; + int32_t dst_height = trans ? attr[1]->shape->data[2] : attr[1]->shape->data[1]; + int32_t stride = dst_width * dst_height; + int32_t bOffset = 0; + int32_t gOffset = 1 * stride; + int32_t rOffset = 2 * stride; + uint8_t R = 0, G = 0, B = 0; + + if(order) + { + bOffset = 2 * stride; + rOffset = 0; + } + + for ( dz = 0; dz < 1; dz ++) + { + for ( dy = 0; dy < (int32_t)dst_height; dy ++) + { + for ( dx = 0; dx < (int32_t)dst_width; dx ++) + { + int32_t source_index = 0; + int32_t output_index = dx + dy * dst_width; + int32_t dstR_idx = output_index + rOffset; + int32_t dstG_idx = output_index + gOffset; + int32_t dstB_idx = output_index + bOffset; + float finalVal = 0; + + if(xRatio != (1 << 15) || yRatio != (1 << 15)) + { + int32_t fx = (dx * xRatio + (xRatio >> 1)) - (1 << 14); + int32_t sx = fx & 0xffff8000; // Floor + int32_t fy = 0, sy = 0; + int32_t temp1 = 0, temp2 = 0; + + fx -= sx; + sx = sx >> 15; + + sx = sx < 0 ? 0 : sx; + sx = sx > src_width ? src_width - 1: sx; + + fx = (fx +(1 << 4)) >> 5; + + // for y + fy = (dy * yRatio + (yRatio >> 1)) - (1<< 14); + sy = fy & 0xffff8000; // Floor + fy -= sy; + sy = sy >> 15; + + sy = sy < 0 ? 0 : sy; + fy = fy < 0 ? 0 : fy; + + fy = (fy + (1<< 4)) >> 5; + + sx += xOffset; + sy += yOffset; + source_index = (sx + sy * src_width + dz * src_width * src_height) * elementSize; + + bline1[0] = (int32_t)buffer[0][source_index]; + bline1[1] = (int32_t)buffer[0][source_index + elementSize]; + bline2[0] = (int32_t)buffer[0][source_index + src_stride]; + bline2[1] = (int32_t)buffer[0][source_index + src_stride + elementSize]; + + gline1[0] = (int32_t)buffer[0][source_index + 1]; + gline1[1] = (int32_t)buffer[0][source_index + elementSize + 1]; + gline2[0] = (int32_t)buffer[0][source_index + src_stride + 1]; + gline2[1] = (int32_t)buffer[0][source_index + src_stride + elementSize + 1]; + + rline1[0] = (int32_t)buffer[0][source_index + 2]; + rline1[1] = (int32_t)buffer[0][source_index + elementSize + 2]; + rline2[0] = (int32_t)buffer[0][source_index + src_stride + 2]; + rline2[1] = (int32_t)buffer[0][source_index + src_stride + elementSize + 2]; + + // B + temp1 = fx * (bline1[1] - bline1[0]) + (bline1[0] << 10); + temp2 = fx * (bline2[1] - bline2[0]) + (bline2[0] << 10); + temp1 = fy * (temp2 - temp1) + (temp1 << 10); + B = (uint8_t)(DESCALE(temp1)); + finalVal = (B - bMean) * var; + buffer[1][dstB_idx] = finalVal; + + // R + temp1 = fx * (rline1[1] - rline1[0]) + (rline1[0] << 10); + temp2 = fx * (rline2[1] - rline2[0]) + (rline2[0] << 10); + temp1 = fy * (temp2 - temp1) + (temp1 << 10); + R = (uint8_t)(DESCALE(temp1)); + finalVal = (R - rMean) * var; + buffer[1][dstR_idx] = finalVal; + + // G + temp1 = fx * (gline1[1] - gline1[0]) + (gline1[0] << 10); + temp2 = fx * (gline2[1] - gline2[0]) + (gline2[0] << 10); + temp1 = fy * (temp2 - temp1) + (temp1 << 10); + G = (uint8_t)(DESCALE(temp1)); + finalVal = (G - gMean) * var; + buffer[1][dstG_idx] = finalVal; + } + else //copy + { + int32_t offset = xOffset + yOffset * src_width; + source_index = (dx + dy * src_width + offset) * elementSize; + + finalVal = (buffer[0][source_index] - bMean) * var; + 
buffer[1][dstB_idx] = finalVal; + + finalVal = (buffer[0][source_index + 1] - gMean) * var; + buffer[1][dstG_idx] = finalVal; + + finalVal = (buffer[0][source_index + 2] - rMean) * var; + buffer[1][dstR_idx] = finalVal; + } + } + } + } + } + + if(trans) + { + uint32_t shape[] = {attr[1]->shape->data[0], attr[1]->shape->data[1], attr[1]->shape->data[2], 1}; + uint32_t perm[] = {1, 2, 0, 3}; + vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[1], + shape, (uint32_t)attr[1]->shape->size, perm, VSI_NN_TYPE_FLOAT32); + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + outBuffer, out_elements ); + } + else + { + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + } + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if(outBuffer) + { + free(outBuffer); + } + + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _pre_process_bgra_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _pre_process_bgra_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 2; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float bgra_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + int32_t reverse = 
vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &bgra_scale ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[2] ); + vsi_nn_kernel_scalar_release( &backend_params[3] ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + vsi_nn_kernel_scalar_release( &backend_params[5] ); + vsi_nn_kernel_scalar_release( &backend_params[6] ); + vsi_nn_kernel_scalar_release( &backend_params[7] ); + vsi_nn_kernel_scalar_release( &backend_params[8] ); + vsi_nn_kernel_scalar_release( &backend_params[9] ); + vsi_nn_kernel_scalar_release( &backend_params[10] ); + vsi_nn_kernel_scalar_release( &backend_params[11] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( pre_process_bgra, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c new file mode 100644 index 0000000..644add0 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c @@ -0,0 +1,283 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +#define _CPU_ARG_NUM (6) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_gray_sw") + +#define DESCALE(x) (((x) + (1<<19)) >> 20) + +DEF_KERNEL_EXECUTOR(_pre_process_gray_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; + float mean = 0, scale = 1; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + i = 2; + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &mean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &scale); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + { + int32_t line1[2], line2[2]; + int32_t dx = 0, dy = 0, dz = 0; + int32_t src_width = attr[0]->shape->data[0]; + int32_t src_height = attr[0]->shape->data[1]; + int32_t dst_width = attr[1]->shape->data[0]; + int32_t dst_height = attr[1]->shape->data[1]; + uint8_t result = 0; + + for ( dz = 0; dz < 1; dz ++) + { + for ( dy = 0; dy < (int32_t)dst_height; dy ++) + { + for ( dx = 0; dx < (int32_t)dst_width; dx ++) + { + int32_t source_index = 0; + int32_t output_index = dx + dy * dst_width; + float finalVal = 0.0f; + + if(xRatio != (1 << 15) || yRatio != (1 << 15)) + { + int32_t fx = (dx * xRatio + (xRatio >> 1)) - (1 << 14); + int32_t sx = fx & 0xffff8000; // Floor + int32_t fy = 0, sy = 0; + int32_t temp1 = 0; + int32_t temp2 = 0; + + fx -= sx; + sx = sx >> 15; + + sx = sx < 0 ? 
0 : sx; + sx = sx > src_width ? src_width - 1: sx; + + fx = (fx +(1 << 4)) >> 5; + + // for y + fy = (dy * yRatio + (yRatio >> 1)) - (1<< 14); + sy = fy & 0xffff8000; // Floor + fy -= sy; + sy = sy >> 15; + + sy = sy < 0 ? 0 : sy; + fy = fy < 0 ? 0 : fy; + + fy = (fy + (1<< 4)) >> 5; + + sx += xOffset; + sy += yOffset; + source_index = (sx + sy * src_width + dz * src_width * src_height); + + line1[0] = (int32_t)buffer[0][source_index]; + line1[1] = (int32_t)buffer[0][source_index + 1]; + line2[0] = (int32_t)buffer[0][source_index + src_width]; + line2[1] = (int32_t)buffer[0][source_index + src_width + 1]; + + temp1 = fx * (line1[1] - line1[0]) + (line1[0] << 10); + temp2 = fx * (line2[1] - line2[0]) + (line2[0] << 10); + temp1 = fy * (temp2 - temp1) + (temp1 << 10); + result = (uint8_t)(DESCALE(temp1)); + finalVal = (result - mean) * scale; + buffer[1][output_index] = finalVal; + } + else + { + int32_t offset = xOffset + yOffset * src_width; + source_index = dx + dy * src_width + offset; + finalVal = (buffer[0][source_index] - mean) * scale; + buffer[1][output_index] = finalVal; + } + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _pre_process_gray_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _pre_process_gray_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 2; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float mean = vsi_nn_kernel_param_get_float32( params, "mean" ); + float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); + + /* 
Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[2] ); + vsi_nn_kernel_scalar_release( &backend_params[3] ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + vsi_nn_kernel_scalar_release( &backend_params[5] ); + vsi_nn_kernel_scalar_release( &backend_params[6] ); + vsi_nn_kernel_scalar_release( &backend_params[7] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( pre_process_gray, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c new file mode 100644 index 0000000..2417d0e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c @@ -0,0 +1,357 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +#define _CPU_ARG_NUM (10) +#define _CPU_INPUT_NUM (2) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_nv12_sw") + +#define DESCALE(x) (((x) + (1<<19)) >> 20) + +DEF_KERNEL_EXECUTOR(_pre_process_nv12_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + float * outBuffer = NULL; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; + float rMean = 0, gMean = 0, bMean = 0, var = 0; + int32_t order = 0, trans = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); + + i = 3; + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &rMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &gMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &bMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &var); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &order); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &trans); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); + + buffer[2] = (float *)malloc( out_elements * 
sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); + memset( buffer[2], 0, out_elements * sizeof(float) ); + + if(trans) + { + outBuffer = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( outBuffer, "Create output buffer fail.", final ); + memset( outBuffer, 0, out_elements * sizeof(float) ); + } + + { + int32_t dx, dy, dz; + int32_t src_width = attr[0]->shape->data[0]; + int32_t src_height = attr[0]->shape->data[1]; + int32_t dst_width = trans ? attr[2]->shape->data[1] : attr[2]->shape->data[0]; + int32_t dst_height = trans ? attr[2]->shape->data[2] : attr[2]->shape->data[1]; + int32_t stride = dst_width * dst_height; + int32_t rOffset = 0; + int32_t gOffset = 1 * stride; + int32_t bOffset = 2 * stride; + float D, E; + float R, G, B; + float min = 0; + float max = 255; + float* src_y_slice = NULL; + float* src_uv_yScanline = NULL; + + + uint32_t xrIntFloat_16 = (src_width << 16) / dst_width + 1; + uint32_t yrIntFloat_16 = (src_height << 16) / dst_height + 1; + uint32_t srcy = 0, srcx = 0; + + if(attr[2]->dtype == I8) + { + min = -128; + max = 127; + } + else if(attr[2]->dtype == I16 || attr[2]->dtype == F16) + { + min = -65536; + max = 65535; + } + + if(order) + { + rOffset = 2 * stride; + bOffset = 0; + } + + for ( dz = 0; dz < 1; dz ++) + { + for ( dy = 0; dy < (int32_t)dst_height; dy ++) + { + srcy = (((uint32_t)dy * yrIntFloat_16) >> 16) + yOffset; + src_y_slice = buffer[0] + (srcy) * src_width; + src_uv_yScanline = buffer[1] + (srcy / 2) * src_width; + + for ( dx = 0; dx < (int32_t)dst_width; dx ++) + { + float finalVal = 0; + int32_t output_index = 0; + int32_t dstR_idx = 0, dstG_idx = 0, dstB_idx = 0; + float tmpY = 0.0f; + float tmpU = 0.0f; + float tmpV = 0.0f; + + srcx = (((uint32_t)dx * xrIntFloat_16) >> 16) + xOffset; + tmpY = src_y_slice[srcx]; + tmpU = src_uv_yScanline[(srcx / 2) * 2]; + tmpV = src_uv_yScanline[(srcx / 2) * 2 + 1]; + + D = (tmpU - 128); + E = (tmpV - 128); + + // B + B = (float)vsi_clamp((tmpY + (1.7790 * D)), min, max); + //G + G = (float)vsi_clamp((tmpY - 0.3455 * D - 0.7169 * E), min, max); + //R + R = (float)vsi_clamp((tmpY + 1.4065 * E), min, max); + + output_index = dx + dy * dst_width; + + dstR_idx = output_index + rOffset; + dstG_idx = output_index + gOffset; + dstB_idx = output_index + bOffset; + + finalVal = (B - bMean) * var; + buffer[2][dstB_idx] = finalVal; + + finalVal = (G - gMean) * var; + buffer[2][dstG_idx] = finalVal; + + finalVal = (R - rMean) * var; + buffer[2][dstR_idx] = finalVal; + } + } + } + } + + if(trans) + { + uint32_t shape[] = {attr[2]->shape->data[0], attr[2]->shape->data[1], attr[2]->shape->data[2], 1}; + uint32_t perm[] = {1, 2, 0, 3}; + vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[2], + shape, (uint32_t)attr[2]->shape->size, perm, VSI_NN_TYPE_FLOAT32); + + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + outBuffer, out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + else + { + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[2], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + if(outBuffer) + { + free(outBuffer); + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _pre_process_nv12_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, 
VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _pre_process_nv12_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 3; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[3] ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + vsi_nn_kernel_scalar_release( &backend_params[5] ); + vsi_nn_kernel_scalar_release( &backend_params[6] ); + vsi_nn_kernel_scalar_release( &backend_params[7] ); + vsi_nn_kernel_scalar_release( &backend_params[8] ); + vsi_nn_kernel_scalar_release( &backend_params[9] ); + vsi_nn_kernel_scalar_release( &backend_params[10] ); + vsi_nn_kernel_scalar_release( &backend_params[11] ); + vsi_nn_kernel_scalar_release( &backend_params[12] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( pre_process_nv12, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c new file mode 100644 index 0000000..2be7273 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c @@ -0,0 +1,383 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +#define _CPU_ARG_NUM (10) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_rgb_sw") + +#define DESCALE(x) (((x) + (1<<19)) >> 20) + +DEF_KERNEL_EXECUTOR(_pre_process_rgb_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + float * outBuffer = NULL; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; + float rMean = 0, gMean = 0, bMean = 0, var = 0; + int32_t order = 0, trans = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + i = 2; + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &rMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &gMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &bMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &var); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &order); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &trans); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + if(trans) + { + outBuffer = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( outBuffer, "Create output buffer fail.", final ); + memset( outBuffer, 0, out_elements * 
sizeof(float) ); + } + + { + int32_t rline1[2], rline2[2]; + int32_t gline1[2], gline2[2]; + int32_t bline1[2], bline2[2]; + int32_t dx = 0, dy = 0, dz = 0; + int32_t src_stride = attr[0]->shape->data[0]; + int32_t src_width = src_stride / 3; + int32_t src_height = attr[0]->shape->data[1]; + int32_t dst_width = trans ? attr[1]->shape->data[1] : attr[1]->shape->data[0]; + int32_t dst_height = trans ? attr[1]->shape->data[2] : attr[1]->shape->data[1]; + int32_t stride = dst_width * dst_height; + int32_t rOffset = 0; + int32_t gOffset = 1 * stride; + int32_t bOffset = 2 * stride; + uint8_t R = 0, G = 0, B = 0; + + if(order) + { + rOffset = 2 * stride; + bOffset = 0; + } + + for ( dz = 0; dz < 1; dz ++) + { + for ( dy = 0; dy < (int32_t)dst_height; dy ++) + { + for ( dx = 0; dx < (int32_t)dst_width; dx ++) + { + int32_t source_index = 0; + int32_t output_index = dx + dy * dst_width; + int32_t dstR_idx = output_index + rOffset; + int32_t dstG_idx = output_index + gOffset; + int32_t dstB_idx = output_index + bOffset; + float finalVal = 0; + + if(xRatio != (1 << 15) || yRatio != (1 << 15)) + { + int32_t fx = (dx * xRatio + (xRatio >> 1)) - (1 << 14); + int32_t sx = fx & 0xffff8000; // Floor + int32_t fy = 0, sy = 0; + int32_t temp1 = 0, temp2 = 0; + + fx -= sx; + sx = sx >> 15; + + sx = sx < 0 ? 0 : sx; + sx = sx > src_width ? src_width - 1: sx; + + fx = (fx +(1 << 4)) >> 5; + + // for y + fy = (dy * yRatio + (yRatio >> 1)) - (1<< 14); + sy = fy & 0xffff8000; // Floor + fy -= sy; + sy = sy >> 15; + + sy = sy < 0 ? 0 : sy; + fy = fy < 0 ? 0 : fy; + + fy = (fy + (1<< 4)) >> 5; + + sx += xOffset; + sy += yOffset; + source_index = (sx + sy * src_width + dz * src_width * src_height) * 3; + + rline1[0] = (int32_t)buffer[0][source_index]; + rline1[1] = (int32_t)buffer[0][source_index + 3]; + rline2[0] = (int32_t)buffer[0][source_index + src_stride]; + rline2[1] = (int32_t)buffer[0][source_index + src_stride + 3]; + + gline1[0] = (int32_t)buffer[0][source_index + 1]; + gline1[1] = (int32_t)buffer[0][source_index + 4]; + gline2[0] = (int32_t)buffer[0][source_index + src_stride + 1]; + gline2[1] = (int32_t)buffer[0][source_index + src_stride + 4]; + + bline1[0] = (int32_t)buffer[0][source_index + 2]; + bline1[1] = (int32_t)buffer[0][source_index + 5]; + bline2[0] = (int32_t)buffer[0][source_index + src_stride + 2]; + bline2[1] = (int32_t)buffer[0][source_index + src_stride + 5]; + + // R + temp1 = fx * (rline1[1] - rline1[0]) + (rline1[0] << 10); + temp2 = fx * (rline2[1] - rline2[0]) + (rline2[0] << 10); + temp1 = fy * (temp2 - temp1) + (temp1 << 10); + R = (uint8_t)(DESCALE(temp1)); + finalVal = (R - rMean) * var; + buffer[1][dstR_idx] = finalVal; + + //G + temp1 = fx * (gline1[1] - gline1[0]) + (gline1[0] << 10); + temp2 = fx * (gline2[1] - gline2[0]) + (gline2[0] << 10); + temp1 = fy * (temp2 - temp1) + (temp1 << 10); + G = (uint8_t)(DESCALE(temp1)); + finalVal = (G - gMean) * var; + buffer[1][dstG_idx] = finalVal; + + //B + temp1 = fx * (bline1[1] - bline1[0]) + (bline1[0] << 10); + temp2 = fx * (bline2[1] - bline2[0]) + (bline2[0] << 10); + temp1 = fy * (temp2 - temp1) + (temp1 << 10); + B = (uint8_t)(DESCALE(temp1)); + finalVal = (B - bMean) * var; + buffer[1][dstB_idx] = finalVal; + } + else //copy + { + int32_t offset = xOffset + yOffset * src_width; + source_index = (dx + dy * src_width + offset) * 3; + + finalVal = (buffer[0][source_index] - rMean) * var; + buffer[1][dstR_idx] = finalVal; + + finalVal = (buffer[0][source_index + 1] - gMean) * var; + buffer[1][dstG_idx] = finalVal; + + finalVal = 
(buffer[0][source_index + 2] - bMean) * var; + buffer[1][dstB_idx] = finalVal; + } + } + } + } + } + + if(trans) + { + uint32_t shape[] = {attr[1]->shape->data[0], attr[1]->shape->data[1], attr[1]->shape->data[2], 1}; + uint32_t perm[] = {1, 2, 0, 3}; + vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[1], + shape, (uint32_t)attr[1]->shape->size, perm, VSI_NN_TYPE_FLOAT32); + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + outBuffer, out_elements ); + } + else + { + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + } + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if(outBuffer) + { + free(outBuffer); + } + + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _pre_process_rgb_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _pre_process_rgb_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 2; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + + /* Set inputs and outputs */ + 
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[2] ); + vsi_nn_kernel_scalar_release( &backend_params[3] ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + vsi_nn_kernel_scalar_release( &backend_params[5] ); + vsi_nn_kernel_scalar_release( &backend_params[6] ); + vsi_nn_kernel_scalar_release( &backend_params[7] ); + vsi_nn_kernel_scalar_release( &backend_params[8] ); + vsi_nn_kernel_scalar_release( &backend_params[9] ); + vsi_nn_kernel_scalar_release( &backend_params[10] ); + vsi_nn_kernel_scalar_release( &backend_params[11] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( pre_process_rgb, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c new file mode 100644 index 0000000..6749f29 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c @@ -0,0 +1,432 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
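Editorial note: when `scale_x`/`scale_y` differ from `1 << 15`, the RGB pre-process kernel above resizes with fixed-point bilinear interpolation — destination coordinates are mapped through Q15 ratios, the fractional parts are reduced to 10-bit weights, and the two lerp stages leave the result scaled by 2^20, which `DESCALE` rounds back to 8 bits. The sketch below isolates that arithmetic for a single channel; it omits the clamping and crop offsets of the real loop, and it assumes the ratios are produced as `(src_dim << 15) / dst_dim` by the calling operation, which is not part of this file.

```c
/* Minimal sketch of the Q15/10-bit bilinear step used by the pre-process
 * kernels (single channel, hypothetical inputs; not the committed code). */
#include <stdint.h>

#define DESCALE(x) (((x) + (1 << 19)) >> 20)

static uint8_t bilinear_q15(const uint8_t *src, int src_stride,
                            int dx, int dy, int x_ratio, int y_ratio)
{
    /* Map the destination pixel centre into the source in Q15. */
    int fx = (dx * x_ratio + (x_ratio >> 1)) - (1 << 14);
    int sx = fx & 0xffff8000;              /* floor: integer source column (Q15) */
    int fy, sy, top, bottom;

    fx -= sx;
    sx >>= 15;
    fx = (fx + (1 << 4)) >> 5;             /* 15-bit fraction -> 10-bit weight */

    fy = (dy * y_ratio + (y_ratio >> 1)) - (1 << 14);
    sy = fy & 0xffff8000;                  /* floor: integer source row (Q15) */
    fy -= sy;
    sy >>= 15;
    fy = (fy + (1 << 4)) >> 5;

    /* Horizontal lerp of two neighbouring rows, each result scaled by 2^10. */
    top    = fx * (src[sy * src_stride + sx + 1] - src[sy * src_stride + sx])
           + (src[sy * src_stride + sx] << 10);
    bottom = fx * (src[(sy + 1) * src_stride + sx + 1] - src[(sy + 1) * src_stride + sx])
           + (src[(sy + 1) * src_stride + sx] << 10);

    /* Vertical lerp brings the scale to 2^20; DESCALE rounds it back to 8 bits. */
    return (uint8_t)DESCALE(fy * (bottom - top) + (top << 10));
}
```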
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +#define _CPU_ARG_NUM (10) +#define _CPU_INPUT_NUM (3) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_yuv420_sw") + +#define DESCALE(x) (((x) + (1<<19)) >> 20) + +DEF_KERNEL_EXECUTOR(_pre_process_yuv420_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + float * outBuffer = NULL; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; + float rMean = 0, gMean = 0, bMean = 0, var = 0; + int32_t order = 0, trans = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); + + i = 4; + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &rMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &gMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &bMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &var); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &order); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &trans); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = 
(float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); + + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create input2 buffer fail.", final ); + + buffer[3] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); + memset( buffer[3], 0, out_elements * sizeof(float) ); + + if(trans) + { + outBuffer = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( outBuffer, "Create output buffer fail.", final ); + memset( outBuffer, 0, out_elements * sizeof(float) ); + } + + { + uint8_t rline1[2], rline2[2]; + uint8_t gline1[2], gline2[2]; + uint8_t bline1[2], bline2[2]; + int32_t dx, dy, dz; + int32_t src_width = attr[0]->shape->data[0]; + int32_t src_height = attr[0]->shape->data[1]; + int32_t subWidth = src_width >> 1; + int32_t subHeight = src_height >> 1; + int32_t dst_width = trans ? attr[3]->shape->data[1] : attr[3]->shape->data[0]; + int32_t dst_height = trans ? attr[3]->shape->data[2] : attr[3]->shape->data[1]; + int32_t stride = dst_width * dst_height; + int32_t rOffset = 0; + int32_t gOffset = 1 * stride; + int32_t bOffset = 2 * stride; + int32_t subIdx = 0; + int32_t C, D, E; + uint8_t R, G, B; + int32_t min = 0; + int32_t max = 255; + + if(order) + { + rOffset = 2 * stride; + bOffset = 0; + } + + for ( dz = 0; dz < 1; dz ++) + { + for ( dy = 0; dy < (int32_t)dst_height; dy ++) + { + for ( dx = 0; dx < (int32_t)dst_width; dx ++) + { + int32_t source_index = 0; + int32_t output_index = dx + dy * dst_width; + int32_t dstR_idx = output_index + rOffset; + int32_t dstG_idx = output_index + gOffset; + int32_t dstB_idx = output_index + bOffset; + float finalVal = 0; + + if(xRatio != (1 << 15) || yRatio != (1 << 15)) + { + int32_t fx = (dx * xRatio + (xRatio >> 1)) - (1 << 14); + int32_t sx = fx & 0xffff8000; // Floor + int32_t fy = 0, sy = 0; + int32_t temp1 = 0, temp2 = 0; + + fx -= sx; + sx = sx >> 15; + + sx = sx < 0 ? 0 : sx; + sx = sx > src_width ? src_width - 1: sx; + + fx = (fx +(1 << 4)) >> 5; + + // for y + fy = (dy * yRatio + (yRatio >> 1)) - (1<< 14); + sy = fy & 0xffff8000; // Floor + fy -= sy; + sy = sy >> 15; + + sy = sy < 0 ? 0 : sy; + fy = fy < 0 ? 
0 : fy; + + fy = (fy + (1<< 4)) >> 5; + + sx += xOffset; + sy += yOffset; + source_index = (sx + sy * src_width + dz * src_width * src_height + 0); + subIdx = ((sx >> 1) + (sy >> 1) * subWidth + dz * subWidth * subHeight + 0); + + /*C = ySrc[source_index] - 16; + D = uSrc[subIdx] - 128; + E = vSrc[subIdx] - 128;*/ + C = (int)buffer[0][source_index] - 16; + D = (int)buffer[1][subIdx] - 128; + E = (int)buffer[2][subIdx] - 128; + + rline1[0] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); + gline1[0] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); + bline1[0] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); + + // right + subIdx = (((sx + 1) >> 1) + (sy >> 1) * subWidth + dz * subWidth * subHeight); + C = (int)buffer[0][source_index + 1] - 16; + D = (int)buffer[1][subIdx] - 128; + E = (int)buffer[2][subIdx] - 128; + + rline1[1] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); + gline1[1] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); + bline1[1] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); + + // below + subIdx = (((sx + 0) >> 1) + ((sy + 1) >> 1) * subWidth + dz * subWidth * subHeight); + C = (int)buffer[0][source_index + src_width] - 16; + D = (int)buffer[1][subIdx] - 128; + E = (int)buffer[2][subIdx] - 128; + + rline2[0] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); + gline2[0] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); + bline2[0] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); + + // below right + //C = ySrc[source_index + src_width + 1] - 16; + subIdx = (((sx + 1) >> 1) + ((sy + 1) >> 1) * subWidth + dz * subWidth * subHeight); + C = (int)buffer[0][source_index + src_width + 1] - 16; + D = (int)buffer[1][subIdx] - 128; + E = (int)buffer[2][subIdx] - 128; + + rline2[1] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); + gline2[1] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); + bline2[1] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); + + //B + temp1 = fx * (bline1[1] - bline1[0]) + (bline1[0] << 10); + temp2 = fx * (bline2[1] - bline2[0]) + (bline2[0] << 10); + temp1 = fy * (temp2 - temp1) + (temp1 << 10); + B = (uint8_t)(DESCALE(temp1)); + finalVal = (B - bMean) * var; + buffer[3][dstB_idx] = finalVal; + + //G + temp1 = fx * (gline1[1] - gline1[0]) + (gline1[0] << 10); + temp2 = fx * (gline2[1] - gline2[0]) + (gline2[0] << 10); + temp1 = fy * (temp2 - temp1) + (temp1 << 10); + + G = (uint8_t)(DESCALE(temp1)); + finalVal = (G - gMean) * var; + buffer[3][dstG_idx] = finalVal; + + // R + temp1 = fx * (rline1[1] - rline1[0]) + (rline1[0] << 10); + temp2 = fx * (rline2[1] - rline2[0]) + (rline2[0] << 10); + temp1 = fy * (temp2 - temp1) + (temp1 << 10); + R = (uint8_t)(DESCALE(temp1)); + finalVal = (R - rMean) * var; + buffer[3][dstR_idx] = finalVal; + } + else + { + // do conversion + C = (int)buffer[0][source_index] - 16; + D = (int)buffer[1][subIdx] - 128; + E = (int)buffer[2][subIdx] - 128; + + R = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); + G = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); + B = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); + + buffer[3][dstB_idx] = (B - bMean) * var; + buffer[3][dstG_idx] = (G - gMean) * var; + buffer[3][dstR_idx] = (R - rMean) * var; + } + } + } + } + } + + if(trans) + { + uint32_t shape[] = {attr[3]->shape->data[0], attr[3]->shape->data[1], 
attr[3]->shape->data[2], 1}; + uint32_t perm[] = {1, 2, 0, 3}; + vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[3], + shape, (uint32_t)attr[3]->shape->size, perm, VSI_NN_TYPE_FLOAT32); + + status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + outBuffer, out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + else + { + status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + buffer[3], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + if(outBuffer) + { + free(outBuffer); + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _pre_process_yuv420_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _pre_process_yuv420_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 4; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( 
backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + vsi_nn_kernel_scalar_release( &backend_params[5] ); + vsi_nn_kernel_scalar_release( &backend_params[6] ); + vsi_nn_kernel_scalar_release( &backend_params[7] ); + vsi_nn_kernel_scalar_release( &backend_params[8] ); + vsi_nn_kernel_scalar_release( &backend_params[9] ); + vsi_nn_kernel_scalar_release( &backend_params[10] ); + vsi_nn_kernel_scalar_release( &backend_params[11] ); + vsi_nn_kernel_scalar_release( &backend_params[12] ); + vsi_nn_kernel_scalar_release( &backend_params[13] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( pre_process_yuv420, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c new file mode 100644 index 0000000..6894957 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c @@ -0,0 +1,426 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
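Editorial note: the YUV420 kernel above (and the YUV444 one that follows) converts each pixel with the same integer formula — luma offset by 16, chroma by 128, BT.601 video-range coefficients applied in 8.8 fixed point, then a shift by 8 and a clamp to [0, 255]. A standalone sketch of the per-pixel conversion, with an illustrative worked value in the trailing comment:

```c
/* Per-pixel integer YUV -> RGB as used by the yuv420/yuv444 kernels above
 * (BT.601 video range, 8.8 fixed point). Standalone sketch, not the
 * committed code. */
#include <stdint.h>

static int clamp_u8(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

static void yuv_to_rgb(uint8_t y, uint8_t u, uint8_t v,
                       uint8_t *r, uint8_t *g, uint8_t *b)
{
    int c = (int)y - 16;
    int d = (int)u - 128;
    int e = (int)v - 128;

    *r = (uint8_t)clamp_u8((298 * c + 409 * e + 128) >> 8);
    *g = (uint8_t)clamp_u8((298 * c - 100 * d - 208 * e + 128) >> 8);
    *b = (uint8_t)clamp_u8((298 * c + 516 * d + 128) >> 8);
}

/* Example: Y=81, U=90, V=240 (a saturated red in BT.601 video range)
 * gives C=65, D=-38, E=112, so
 *   R = (298*65 + 409*112 + 128) >> 8        = 65306 >> 8 -> 255
 *   G = (298*65 - 100*(-38) - 208*112 + 128) >> 8 =    2 >> 8 ->   0
 *   B = (298*65 + 516*(-38) + 128) >> 8      =  -110 >> 8 -> clamped to 0
 */
```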
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +#define _CPU_ARG_NUM (10) +#define _CPU_INPUT_NUM (3) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_yuv444_sw") + +#define DESCALE(x) (((x) + (1<<19)) >> 20) + +DEF_KERNEL_EXECUTOR(_pre_process_yuv444_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + float * outBuffer = NULL; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; + float rMean = 0, gMean = 0, bMean = 0, var = 0; + int32_t order = 0, trans = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); + + i = 4; + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &rMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &gMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &bMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &var); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &order); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &trans); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = 
(float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); + + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create input2 buffer fail.", final ); + + buffer[3] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); + memset( buffer[3], 0, out_elements * sizeof(float) ); + + if(trans) + { + outBuffer = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( outBuffer, "Create output buffer fail.", final ); + memset( outBuffer, 0, out_elements * sizeof(float) ); + } + + { + uint8_t rline1[2], rline2[2]; + uint8_t gline1[2], gline2[2]; + uint8_t bline1[2], bline2[2]; + int32_t dx, dy, dz; + int32_t src_width = attr[0]->shape->data[0]; + int32_t src_height = attr[0]->shape->data[1]; + int32_t dst_width = trans ? attr[3]->shape->data[1] : attr[3]->shape->data[0]; + int32_t dst_height = trans ? attr[3]->shape->data[2] : attr[3]->shape->data[1]; + int32_t stride = dst_width * dst_height; + int32_t rOffset = 0; + int32_t gOffset = 1 * stride; + int32_t bOffset = 2 * stride; + int32_t C, D, E; + uint8_t R, G, B; + int32_t min = 0; + int32_t max = 255; + + if(order) + { + rOffset = 2 * stride; + bOffset = 0; + } + + for ( dz = 0; dz < 1; dz ++) + { + for ( dy = 0; dy < (int32_t)dst_height; dy ++) + { + for ( dx = 0; dx < (int32_t)dst_width; dx ++) + { + int32_t source_index = 0; + int32_t output_index = dx + dy * dst_width; + int32_t dstR_idx = output_index + rOffset; + int32_t dstG_idx = output_index + gOffset; + int32_t dstB_idx = output_index + bOffset; + float finalVal = 0; + + if(xRatio != (1 << 15) || yRatio != (1 << 15)) + { + int32_t fx = (dx * xRatio + (xRatio >> 1)) - (1 << 14); + int32_t sx = fx & 0xffff8000; // Floor + int32_t fy = 0, sy = 0; + int32_t temp1 = 0, temp2 = 0; + + fx -= sx; + sx = sx >> 15; + + sx = sx < 0 ? 0 : sx; + sx = sx > src_width ? src_width - 1: sx; + + fx = (fx +(1 << 4)) >> 5; + + // for y + fy = (dy * yRatio + (yRatio >> 1)) - (1<< 14); + sy = fy & 0xffff8000; // Floor + fy -= sy; + sy = sy >> 15; + + sy = sy < 0 ? 0 : sy; + fy = fy < 0 ? 
0 : fy; + + fy = (fy + (1<< 4)) >> 5; + + sx += xOffset; + sy += yOffset; + source_index = (sx + sy * src_width + dz * src_width * src_height + 0); + + /*C = ySrc[source_index] - 16; + D = uSrc[subIdx] - 128; + E = vSrc[subIdx] - 128;*/ + C = (int)buffer[0][source_index] - 16; + D = (int)buffer[1][source_index] - 128; + E = (int)buffer[2][source_index] - 128; + + rline1[0] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); + gline1[0] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); + bline1[0] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); + + // right + C = (int)buffer[0][source_index + 1] - 16; + D = (int)buffer[1][source_index + 1] - 128; + E = (int)buffer[2][source_index + 1] - 128; + + rline1[1] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); + gline1[1] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); + bline1[1] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); + + // below + C = (int)buffer[0][source_index + src_width] - 16; + D = (int)buffer[1][source_index + src_width] - 128; + E = (int)buffer[2][source_index + src_width] - 128; + + rline2[0] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); + gline2[0] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); + bline2[0] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); + + // below right + //C = ySrc[source_index + src_width + 1] - 16; + C = (int)buffer[0][source_index + src_width + 1] - 16; + D = (int)buffer[1][source_index + src_width + 1] - 128; + E = (int)buffer[2][source_index + src_width + 1] - 128; + + rline2[1] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); + gline2[1] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); + bline2[1] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); + + //B + temp1 = fx * (bline1[1] - bline1[0]) + (bline1[0] << 10); + temp2 = fx * (bline2[1] - bline2[0]) + (bline2[0] << 10); + temp1 = fy * (temp2 - temp1) + (temp1 << 10); + B = (uint8_t)(DESCALE(temp1)); + finalVal = (B - bMean) * var; + buffer[3][dstB_idx] = finalVal; + + //G + temp1 = fx * (gline1[1] - gline1[0]) + (gline1[0] << 10); + temp2 = fx * (gline2[1] - gline2[0]) + (gline2[0] << 10); + temp1 = fy * (temp2 - temp1) + (temp1 << 10); + + G = (uint8_t)(DESCALE(temp1)); + finalVal = (G - gMean) * var; + buffer[3][dstG_idx] = finalVal; + + // R + temp1 = fx * (rline1[1] - rline1[0]) + (rline1[0] << 10); + temp2 = fx * (rline2[1] - rline2[0]) + (rline2[0] << 10); + temp1 = fy * (temp2 - temp1) + (temp1 << 10); + R = (uint8_t)(DESCALE(temp1)); + finalVal = (R - rMean) * var; + buffer[3][dstR_idx] = finalVal; + } + else + { + // do conversion + C = (int)buffer[0][source_index] - 16; + D = (int)buffer[1][source_index] - 128; + E = (int)buffer[2][source_index] - 128; + + R = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); + G = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); + B = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); + + buffer[3][dstB_idx] = (B - bMean) * var; + buffer[3][dstG_idx] = (G - gMean) * var; + buffer[3][dstR_idx] = (R - rMean) * var; + } + } + } + } + } + + if(trans) + { + uint32_t shape[] = {attr[3]->shape->data[0], attr[3]->shape->data[1], attr[3]->shape->data[2], 1}; + uint32_t perm[] = {1, 2, 0, 3}; + vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[3], + shape, (uint32_t)attr[3]->shape->size, perm, VSI_NN_TYPE_FLOAT32); + + status = 
vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + outBuffer, out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + else + { + status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + buffer[3], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + if(outBuffer) + { + free(outBuffer); + } + + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _pre_process_yuv444_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _pre_process_yuv444_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 4; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + backend_params[index++] = 
vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + vsi_nn_kernel_scalar_release( &backend_params[5] ); + vsi_nn_kernel_scalar_release( &backend_params[6] ); + vsi_nn_kernel_scalar_release( &backend_params[7] ); + vsi_nn_kernel_scalar_release( &backend_params[8] ); + vsi_nn_kernel_scalar_release( &backend_params[9] ); + vsi_nn_kernel_scalar_release( &backend_params[10] ); + vsi_nn_kernel_scalar_release( &backend_params[11] ); + vsi_nn_kernel_scalar_release( &backend_params[12] ); + vsi_nn_kernel_scalar_release( &backend_params[13] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( pre_process_yuv444, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c b/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c new file mode 100644 index 0000000..fa433dc --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c @@ -0,0 +1,219 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
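Editorial note: all of the pre-process kernels above write their float output plane by plane — the R, G and B planes are `dst_width * dst_height` elements apart, each value is normalised as `(channel - mean) * rgb_scale`, and the `reverse` parameter (read back as `order` in the executors) swaps the R and B plane offsets to produce BGR output. A standalone sketch of that store step, with hypothetical dimensions and values:

```c
/* How the pre-process kernels above place one normalised pixel into the
 * planar float output (sketch only, not the committed code). */
#include <stdint.h>

static void store_pixel(float *dst, int32_t dx, int32_t dy,
                        int32_t dst_width, int32_t dst_height,
                        uint8_t r, uint8_t g, uint8_t b,
                        float r_mean, float g_mean, float b_mean,
                        float scale, int32_t reverse)
{
    int32_t stride = dst_width * dst_height;           /* one colour plane */
    int32_t idx    = dx + dy * dst_width;
    int32_t r_off  = reverse ? 2 * stride : 0;         /* reverse swaps R and B planes */
    int32_t b_off  = reverse ? 0 : 2 * stride;

    dst[idx + r_off]  = ((float)r - r_mean) * scale;   /* mean/scale normalisation */
    dst[idx + stride] = ((float)g - g_mean) * scale;
    dst[idx + b_off]  = ((float)b - b_mean) * scale;
}
```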
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +#define _CPU_ARG_NUM (0) +#define _CPU_INPUT_NUM (2) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("prelu_sw") + +static int32_t _expand_offset + ( + int32_t index, + int32_t * shape, size_t rank, + size_t * strides, int32_t * out_shape + ) +{ + uint32_t i; + int32_t offset = 0; + + for( i = 0; i < rank && index; i ++ ) + { + if( shape[i] == out_shape[i] ) + { + offset += (int32_t)strides[i] * ( index % out_shape[i] ); + } + index /= out_shape[i]; + } + return offset; +} + +DEF_KERNEL_EXECUTOR(_prelu_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_SUCCESS; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + + vsi_nn_kernel_tensor_attr_get_stride( attr[0], stride_size[0] ); + vsi_nn_kernel_tensor_attr_get_stride( attr[1], stride_size[1] ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); + + buffer[2] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); + memset( buffer[2], 0, out_elements * sizeof(float) ); + + for( i = 0; i < out_elements; i ++ ) + { + int32_t in0_offset = 0; + int32_t in1_offset = 0; + float val1 = 0.f; + float val2 = 0.f; + + in0_offset = _expand_offset( i, attr[0]->shape->data, attr[0]->shape->size, + stride_size[0], attr[2]->shape->data ); + in1_offset = _expand_offset( i, attr[1]->shape->data, attr[1]->shape->size, + stride_size[1], attr[2]->shape->data ); + + val1 = buffer[0][in0_offset]; + val2 = buffer[1][in1_offset]; + + + buffer[2][i] = val1 >= 0 ? 
val1 : val1 * val2; + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[2], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _prelu_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; + + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _prelu_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t is_per_channel_alpha = 0; + + is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha"); + + if (is_per_channel_alpha) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( prelu, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c b/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c new file mode 100644 index 0000000..3b21033 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c @@ -0,0 +1,262 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
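Editorial note: the PReLU executor above is simply `x >= 0 ? x : x * alpha`, but it gathers `alpha` through `_expand_offset`, which walks the flat output index dimension by dimension and only accumulates an input stride where the input actually has that dimension — a size-1 dimension contributes nothing, giving NumPy-style broadcasting. (Shapes in this code store the innermost, fastest-varying dimension first.) The standalone sketch below copies that helper and traces a small hypothetical broadcast:

```c
/* Sketch of the broadcasting offset logic used by _prelu_exec above
 * (standalone copy of _expand_offset with hypothetical shapes). */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static int32_t expand_offset(int32_t index,
                             const int32_t *shape, size_t rank,
                             const size_t *strides, const int32_t *out_shape)
{
    int32_t offset = 0;
    size_t i;
    for (i = 0; i < rank && index; i++)
    {
        /* Only dimensions the input really has contribute to its offset;
           broadcast (size-1) dimensions are skipped. */
        if (shape[i] == out_shape[i])
        {
            offset += (int32_t)strides[i] * (index % out_shape[i]);
        }
        index /= out_shape[i];
    }
    return offset;
}

int main(void)
{
    /* Output 4x3 (innermost axis first); alpha broadcast along that axis. */
    int32_t out_shape[]    = { 4, 3 };
    int32_t alpha_shape[]  = { 1, 3 };
    size_t  alpha_stride[] = { 1, 1 };   /* element strides of the {1, 3} tensor */
    int32_t i;

    for (i = 0; i < 12; i++)
    {
        /* Each sweep of the innermost axis reuses one alpha element:
           indices 0..3 -> alpha[0], 4..7 -> alpha[1], 8..11 -> alpha[2]. */
        printf("%d -> alpha[%d]\n",
               i, expand_offset(i, alpha_shape, 2, alpha_stride, out_shape));
    }
    return 0;
}
```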
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _CPU_ARG_NUM (0) +#define _CPU_INPUT_NUM (2) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("random_multinomial_sw") + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _PARAM_NUM _cnt_of_array( kernel_param_def ) + +/* + * Kernel function + */ +static int upper_bound(float* a, int n, float x) { + int l = 0; + int h = n; + while (l < h) { + int mid = (l + h) / 2; + if (x >= a[mid]) { + l = mid + 1; + } else { + h = mid; + } + } + return l; +} /* upper_bound() */ + +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t *random_integer = NULL; + float *random_float = NULL; + float *cdf = NULL; + uint32_t i = 0; + uint32_t n = 0; + uint32_t batch = 0; + uint32_t class_size = 0; + int32_t sample_num = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + sample_num = attr[2]->shape->data[0]; + batch = attr[0]->shape->data[1]; + class_size = attr[0]->shape->data[0]; + + vsi_nn_kernel_tensor_attr_get_stride( attr[0], stride_size[0] ); + vsi_nn_kernel_tensor_attr_get_stride( attr[1], stride_size[1] ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); + + buffer[2] = (float *)malloc( 
out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); + memset( buffer[2], 0, out_elements * sizeof(float) ); + + random_integer = (uint32_t *)malloc(out_elements * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO( random_integer, "Create buffer fail.", final ); + random_float = (float *)malloc(out_elements * sizeof(float)); + CHECK_PTR_FAIL_GOTO( random_float, "Create buffer fail.", final ); + cdf = (float *)malloc(class_size * sizeof(float)); + CHECK_PTR_FAIL_GOTO( cdf, "Create buffer fail.", final ); + + vsi_nn_random_init_for_philox_4x32_10((uint32_t)(buffer[1][0]), + (uint32_t)(buffer[1][1])); + vsi_nn_random_generate_by_philox_4x32_10(random_integer, (uint32_t)out_elements); + vsi_nn_random_uniform_transform(random_integer, + random_float, (uint32_t)out_elements); + + for (n = 0; n < batch; n++) + { + uint32_t c = 0; + float batch_max = -FLT_MAX; + float total = 0; + for(c = 0; c < class_size; c++) + { + uint32_t index = n * class_size + c; + batch_max = vsi_nn_max(batch_max, buffer[0][index]); + } + + for(c = 0; c < class_size; c++) + { + uint32_t index = n * class_size + c; + total += (float)(exp(buffer[0][index] - batch_max)); + cdf[c] = total; + } + + for(c = 0; c < (uint32_t)sample_num; c++) + { + uint32_t index = n * sample_num + c; + float target = random_float[index] * total; + uint32_t out_class = upper_bound(cdf, class_size, target); + buffer[2][index] = (float)out_class; + } + } + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[2], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + + if (cdf) + { + free(cdf); + cdf = NULL; + } + if (random_integer) + { + free(random_integer); + random_integer = NULL; + } + if (random_float) + { + free(random_float); + random_float = NULL; + } + + return status; +} /* _compute() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _PARAM_NUM ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( random_multinomial, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/reduceall_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reduceall_internal_cpu.c new file mode 100644 index 0000000..de13bba --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/reduceall_internal_cpu.c @@ -0,0 +1,239 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
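Editorial note: the sampling loop in the random_multinomial kernel above never normalises the distribution — it subtracts the per-batch max for numerical stability, accumulates `exp(logit - max)` into an unnormalised CDF, scales a uniform draw by the CDF total, and lets `upper_bound` (a binary search) return the first class whose cumulative mass exceeds the draw. A small self-contained sketch of that sampling step, with hypothetical logits and a fixed draw:

```c
/* Sketch of the CDF + binary-search sampling used by the
 * random_multinomial CPU kernel above (hypothetical inputs). */
#include <math.h>
#include <stdio.h>

static int upper_bound(const float *a, int n, float x)
{
    int l = 0, h = n;
    while (l < h)
    {
        int mid = (l + h) / 2;
        if (x >= a[mid]) l = mid + 1;
        else             h = mid;
    }
    return l;
}

int main(void)
{
    float logits[4] = { 1.0f, 2.0f, 0.5f, 3.0f };
    float cdf[4];
    float max = logits[0], total = 0.0f;
    int c;

    for (c = 1; c < 4; c++) max = logits[c] > max ? logits[c] : max;
    for (c = 0; c < 4; c++)
    {
        total += (float)exp(logits[c] - max);   /* unnormalised softmax mass */
        cdf[c] = total;
    }

    /* A uniform draw in [0, 1) scaled by the total picks a class in
       proportion to exp(logit); 0.9 * total lands in class 3 here. */
    printf("sampled class = %d\n", upper_bound(cdf, 4, 0.9f * total));
    return 0;
}
```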
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.reduceall_internal") + + +/* + * Kernel params + */ +static vx_param_description_t _reduceall_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _REDUCEALL_INTERNAL_PARAM_NUM _cnt_of_array( _reduceall_internal_kernel_param_def ) + +#define SCALAR_INPUT_AXIS (2) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + int32_t axis = 0; + int32_t outerSize = 1; + int32_t axisSize = 1; + int32_t innerSize = 1; + int32_t inner = 0; + int32_t outer = 0; + int32_t all_result = 0; + + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + for (i = 0; i < (uint32_t)axis; i++) + { + innerSize *= in_attr[0]->shape->data[i]; + } + + axisSize = in_attr[0]->shape->data[axis]; + + for (i = (uint32_t)axis + 1; i < in_attr[0]->shape->size; i++) + { + outerSize *= in_attr[0]->shape->data[i]; + } + + for ( outer = 0; outer < outerSize; ++outer) + { + for ( inner = 0; inner < innerSize; ++inner) + { + all_result = (!!(f32_in_buffer[0][outer * axisSize * innerSize + inner])); + for (i = 1; i < (uint32_t)axisSize; ++i) + { + int32_t value = (!!(f32_in_buffer[0][(outer * axisSize + i) * innerSize + inner])); + all_result = all_result && value; + } + f32_out_buffer[0][outer * innerSize + inner] = (float)all_result; + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if 
(f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _reduceall_internal_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _reduceall_internal_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REDUCEALL_INTERNAL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _REDUCEALL_INTERNAL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEALL_INTERNAL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( reduceall_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/reduceany_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reduceany_internal_cpu.c new file mode 100644 index 0000000..25cfc86 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/reduceany_internal_cpu.c @@ -0,0 +1,239 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.reduceany_internal") + + +/* + * Kernel params + */ +static vx_param_description_t _reduceany_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _REDUCEANY_INTERNAL_PARAM_NUM _cnt_of_array( _reduceany_internal_kernel_param_def ) + +#define SCALAR_INPUT_AXIS (2) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + int32_t axis = 0; + int32_t outerSize = 1; + int32_t axisSize = 1; + int32_t innerSize = 1; + int32_t inner = 0; + int32_t outer = 0; + int32_t any_result = 0; + + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + for (i = 0; i < (uint32_t)axis; i++) + { + innerSize *= in_attr[0]->shape->data[i]; + } + + axisSize = in_attr[0]->shape->data[axis]; + + for (i = (uint32_t)axis + 1; i < in_attr[0]->shape->size; i++) + { + outerSize *= in_attr[0]->shape->data[i]; + } + + for ( outer = 0; outer < outerSize; ++outer) + { + for ( inner = 0; inner < innerSize; ++inner) + { + any_result = (!!(f32_in_buffer[0][outer * axisSize * innerSize + inner])); + for (i = 1; i < (uint32_t)axisSize; ++i) + { + int32_t value = (!!(f32_in_buffer[0][(outer * axisSize + i) * innerSize + inner])); + any_result = any_result || value; + } + f32_out_buffer[0][outer * innerSize + inner] = (float)any_result; + } + } + + /* save data */ 
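+ /* Each reduced element is 1.0f if any value along the axis was non-zero, 0.0f otherwise; the float working buffer is written back to the output tensor below. */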
+ for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _reduceany_internal_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _reduceany_internal_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REDUCEANY_INTERNAL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _REDUCEANY_INTERNAL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEANY_INTERNAL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( reduceany_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/reducemax_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reducemax_internal_cpu.c new file mode 100644 index 0000000..643e126 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/reducemax_internal_cpu.c @@ -0,0 +1,239 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.reducemax_internal") + + +/* + * Kernel params + */ +static vx_param_description_t _reducemax_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; +#define _REDUCEMAX_INTERNAL_PARAM_NUM _cnt_of_array( _reducemax_internal_kernel_param_def ) + +#define SCALAR_INPUT_AXIS (2) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + int32_t axis = 0; + int32_t outerSize = 1; + int32_t axisSize = 1; + int32_t innerSize = 1; + int32_t inner = 0; + int32_t outer = 0; + float maxValue = 0.0f; + + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + for (i = 0; i < (uint32_t)axis; i++) + { + innerSize *= in_attr[0]->shape->data[i]; + } + + axisSize = in_attr[0]->shape->data[axis]; + + for (i = (uint32_t)axis + 1; i < in_attr[0]->shape->size; i++) + { + outerSize *= 
in_attr[0]->shape->data[i]; + } + + for ( outer = 0; outer < outerSize; ++outer) + { + for ( inner = 0; inner < innerSize; ++inner) + { + maxValue = f32_in_buffer[0][outer * axisSize * innerSize + inner]; + for (i = 1; i < (uint32_t)axisSize; ++i) + { + float value = f32_in_buffer[0][(outer * axisSize + i) * innerSize + inner]; + maxValue = vsi_nn_max(maxValue, value); + } + f32_out_buffer[0][outer * innerSize + inner] = (float)maxValue; + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _reducemax_internal_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _reducemax_internal_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REDUCEMAX_INTERNAL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _REDUCEMAX_INTERNAL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + /* Pass parameters to node. 
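+ The I32 axis scalar created above is only needed while the parameters are bound; it is released right after the call.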
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEMAX_INTERNAL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( reducemax_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/reducemin_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reducemin_internal_cpu.c new file mode 100644 index 0000000..8f3728d --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/reducemin_internal_cpu.c @@ -0,0 +1,240 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
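+ * CPU reference for reduce-min: for every (outer, inner) position the selected
+ * axis is scanned with vsi_nn_min and the smallest value is kept.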
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.reducemin_internal") + + +/* + * Kernel params + */ +static vx_param_description_t _reducemin_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; +#define _REDUCEMIN_INTERNAL_PARAM_NUM _cnt_of_array( _reducemin_internal_kernel_param_def ) + +#define SCALAR_INPUT_AXIS (2) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + int32_t axis = 0; + int32_t outerSize = 1; + int32_t axisSize = 1; + int32_t innerSize = 1; + int32_t inner = 0; + int32_t outer = 0; + float minValue = 0.0f; + + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + for (i = 0; i < (uint32_t)axis; i++) + { + innerSize *= in_attr[0]->shape->data[i]; + } + + axisSize = in_attr[0]->shape->data[axis]; + + for (i = (uint32_t)axis + 1; i < in_attr[0]->shape->size; i++) + { + outerSize *= in_attr[0]->shape->data[i]; + } + + for ( outer = 0; outer < outerSize; ++outer) + { + for ( inner = 0; inner < innerSize; ++inner) + { + minValue = f32_in_buffer[0][outer * axisSize * innerSize + inner]; + for (i = 1; i < (uint32_t)axisSize; ++i) + { + float value = f32_in_buffer[0][(outer * axisSize + i) * innerSize + inner]; + minValue = vsi_nn_min(minValue, value); + } + f32_out_buffer[0][outer * innerSize + inner] = (float)minValue; + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if 
(f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _reducemin_internal_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _reducemin_internal_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REDUCEMIN_INTERNAL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _REDUCEMIN_INTERNAL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEMIN_INTERNAL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( reducemin_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/reduceprod_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reduceprod_internal_cpu.c new file mode 100644 index 0000000..3e59616 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/reduceprod_internal_cpu.c @@ -0,0 +1,238 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.reduceprod_internal") + + +/* + * Kernel params + */ +static vx_param_description_t _reduceprod_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _REDUCEPROD_INTERNAL_PARAM_NUM _cnt_of_array( _reduceprod_internal_kernel_param_def ) + +#define SCALAR_INPUT_AXIS (2) +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + int32_t axis = 0; + int32_t outerSize = 1; + int32_t axisSize = 1; + int32_t innerSize = 1; + int32_t inner = 0; + int32_t outer = 0; + float prodValue = 0.0f; + + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + for (i = 0; i < (uint32_t)axis; i++) + { + innerSize *= in_attr[0]->shape->data[i]; + } + + axisSize = in_attr[0]->shape->data[axis]; + + for (i = (uint32_t)axis + 1; i < in_attr[0]->shape->size; i++) + { + outerSize *= in_attr[0]->shape->data[i]; + } + + for ( outer = 0; outer < outerSize; ++outer) + { + for ( inner = 0; inner < innerSize; ++inner) + { + prodValue = f32_in_buffer[0][outer * axisSize * innerSize + inner]; + for (i = 1; i < (uint32_t)axisSize; ++i) + { + float value = f32_in_buffer[0][(outer * axisSize + i) * innerSize + inner]; + prodValue = prodValue * value; + } + f32_out_buffer[0][outer * innerSize + inner] = (float)prodValue; + } + } + + /* save data */ + for(i = 0; i < 
_OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _reduceprod_internal_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _reduceprod_internal_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REDUCEPROD_INTERNAL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _REDUCEPROD_INTERNAL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEPROD_INTERNAL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( reduceprod_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/relu_keras_cpu.c b/src/tim/vx/internal/src/kernel/cpu/relu_keras_cpu.c new file mode 100644 index 0000000..ecedffe --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/relu_keras_cpu.c @@ -0,0 +1,229 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.relu_keras") + + +/* + * Kernel params + */ +static vx_param_description_t _relu_keras_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RELU_KERAS_PARAM_NUM _cnt_of_array( _relu_keras_kernel_param_def ) + +#define SCALAR_ALPHA (2) +#define SCALAR_MAX_VALUE (3) +#define SCALAR_THRESHOLD (4) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + float alpha = 0.0f; + float max_value = 0.0f; + float threshold = 0.0f; + + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_ALPHA], &(alpha)); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MAX_VALUE], &(max_value)); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_THRESHOLD], 
&(threshold)); + + for (i = 0; i < out_elements[0]; i++) + { + float data = f32_in_buffer[0][i]; + + data = data >= max_value ? max_value : data; + data = data < threshold ? alpha * (data - threshold) : data; + f32_out_buffer[0][i] = data; + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _relu_keras_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _relu_keras_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RELU_KERAS_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); + float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); + float threshold = vsi_nn_kernel_param_get_float32( params, "threshold" ); + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RELU_KERAS_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_ALPHA] = vsi_nn_kernel_scalar_create( graph, F32, &alpha ); + node_params[SCALAR_MAX_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &max_value ); + node_params[SCALAR_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &threshold ); + /* Pass parameters to node. 
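+ The three F32 scalars (alpha, max_value, threshold) created above are released again once they have been bound to the node.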
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _RELU_KERAS_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALPHA] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_THRESHOLD] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( relu_keras, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c new file mode 100644 index 0000000..f735695 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c @@ -0,0 +1,314 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
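+ * CPU reference for bilinear resize: the per-axis scales honour the
+ * align_corners and half_pixel_centers flags, and every output pixel blends its
+ * four source neighbours weighted by the fractional source coordinates.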
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.resize_bilinear") + + +/* + * Kernel params + */ +static vx_param_description_t _resize_bilinear_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RESIZE_BILINEAR_PARAM_NUM _cnt_of_array( _resize_bilinear_kernel_param_def ) + +#define SCALAR_ALIGN_CORNERS (2) +#define SCALAR_HALF_PIXEL (3) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + int32_t align_corners; + int32_t half_pixel_centers; + float width_scale; + float height_scale; + uint32_t input_width, output_width, input_height, output_height; + uint32_t b = 0, d = 0, w = 0, h = 0; + uint32_t output_depth, input_depth; + uint32_t output_batch; + uint32_t output_dims, input_dims; + float data00 = .0f, data01 = .0f, data10 = .0f, data11 = .0f, interpolation = .0f; + uint32_t input_width_orig; + uint32_t output_width_orig; + uint32_t index; + + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_HALF_PIXEL], &(half_pixel_centers)); + input_width = in_attr[0]->shape->data[0]; + input_height = in_attr[0]->shape->data[1]; + output_width = out_attr[0]->shape->data[0]; + output_height = out_attr[0]->shape->data[1]; + output_dims = (uint32_t)out_attr[0]->shape->size; + output_depth = output_dims > 2 ? out_attr[0]->shape->data[2] : 1; + output_batch = output_dims > 3 ? out_attr[0]->shape->data[3] : 1; + input_dims = (uint32_t)in_attr[0]->shape->size; + input_depth = input_dims > 2 ? 
in_attr[0]->shape->data[2] : 1; + input_width_orig = input_width; + output_width_orig = output_width; + + if (align_corners && output_width > 1) + { + width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(output_width - 1); + } + else + { + width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)output_width; + } + + if (align_corners && output_height > 1) + { + height_scale = ((vx_float32)(input_height - 1) * 1.0f) / (vx_float32)(output_height - 1); + } + else + { + height_scale = ((vx_float32)input_height * 1.0f) / (vx_float32)output_height; + } + + for (b = 0; b < output_batch; b ++) + { + for (d = 0; d < output_depth; d ++) + { + vx_int32 input_base = b * input_depth * input_width_orig * input_height \ + + d * input_width_orig * input_height; + vx_int32 output_base = b * output_depth * output_width_orig * output_height \ + + d * output_width_orig * output_height; + + for (h = 0; h < output_height; h ++) + { + vx_float32 input_h = h * height_scale; + vx_uint32 h0; + vx_uint32 h1; + + if (half_pixel_centers) + { + input_h = ((vx_float32)h + 0.5f) * height_scale - 0.5f; + } + else + { + input_h = h * height_scale; + } + h0 = (vx_int32)input_h; + h1 = input_h < 0 ? 0 : vsi_nn_min(h0 + 1, input_height - 1); + for (w = 0; w < output_width; w ++) + { + vx_float32 input_w; + vx_int32 w0; + vx_int32 w1; + if (half_pixel_centers) + { + input_w = ((vx_float32)w + 0.5f) * width_scale - 0.5f; + } + else + { + input_w = w * width_scale; + } + w0 = (vx_int32)input_w; + w1 = input_w < 0 ? 0 : vsi_nn_min(w0 + 1, (vx_int32)(input_width - 1)); + index = input_base + h0 * input_width_orig + w0; + data00 = f32_in_buffer[0][index]; + index = input_base + h0 * input_width_orig + w1; + data01 = f32_in_buffer[0][index]; + index = input_base + h1 * input_width_orig + w0; + data10 = f32_in_buffer[0][index]; + index = input_base + h1 * input_width_orig + w1; + data11 = f32_in_buffer[0][index]; + + interpolation = data00 * (1 - (input_h - h0)) * (1 - (input_w - w0)) + + data10 * (input_h - h0) * (1 - (input_w - w0)) + + data01 * (1 - (input_h - h0)) * (input_w - w0) + + data11 * (input_h - h0) * (input_w - w0); + index = output_base + h * output_width_orig + w; + f32_out_buffer[0][index] = interpolation; + } + } + } + } + + + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _resize_bilinear_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _resize_bilinear_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + 
vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RESIZE_BILINEAR_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_BILINEAR_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_BILINEAR_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( resize_bilinear, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_nearest_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_nearest_cpu.c new file mode 100644 index 0000000..7b2aeda --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/resize_nearest_cpu.c @@ -0,0 +1,319 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
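+ * CPU reference for nearest-neighbour resize: source coordinates are scaled the
+ * same way as in the bilinear kernel, then rounded (align_corners) or floored to
+ * select a single source pixel.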
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.resize_nearest") + + +/* + * Kernel params + */ +static vx_param_description_t _resize_nearest_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RESIZE_NEAREST_PARAM_NUM _cnt_of_array( _resize_nearest_kernel_param_def ) + +#define SCALAR_ALIGN_CORNERS (2) +#define SCALAR_HALF_PIXEL (3) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + int32_t align_corners; + int32_t half_pixel_centers; + float width_scale; + float height_scale; + uint32_t input_width, output_width, input_height, output_height; + uint32_t b = 0, d = 0, w = 0, h = 0; + uint32_t output_depth, input_depth; + uint32_t output_batch; + uint32_t output_dims, input_dims; + uint32_t input_width_orig; + uint32_t output_width_orig; + + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_HALF_PIXEL], &(half_pixel_centers)); + input_width = in_attr[0]->shape->data[0]; + input_height = in_attr[0]->shape->data[1]; + output_width = out_attr[0]->shape->data[0]; + output_height = out_attr[0]->shape->data[1]; + output_dims = (uint32_t)out_attr[0]->shape->size; + output_depth = output_dims > 2 ? out_attr[0]->shape->data[2] : 1; + output_batch = output_dims > 3 ? out_attr[0]->shape->data[3] : 1; + input_dims = (uint32_t)in_attr[0]->shape->size; + input_depth = input_dims > 2 ? 
in_attr[0]->shape->data[2] : 1; + input_width_orig = input_width; + output_width_orig = output_width; + + if (align_corners && output_width > 1) + { + width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(output_width - 1); + } + else + { + width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)output_width; + } + + if (align_corners && output_height > 1) + { + height_scale = ((vx_float32)(input_height - 1) * 1.0f) / (vx_float32)(output_height - 1); + } + else + { + height_scale = ((vx_float32)input_height * 1.0f) / (vx_float32)output_height; + } + + for (b = 0; b < output_batch; b ++) + { + for (d = 0; d < output_depth; d ++) + { + int32_t input_base = b * input_depth * input_width_orig * input_height \ + + d * input_width_orig * input_height; + int32_t output_base = b * output_depth * output_width_orig * output_height \ + + d * output_width_orig * output_height; + + for (h = 0; h < output_height; h ++) + { + float input_h; + uint32_t in_y; + + if (half_pixel_centers) + { + input_h = ((float)h + 0.5f) * height_scale; + } + else + { + input_h = h * height_scale; + } + if (align_corners) + { + in_y = vsi_nn_min((uint32_t)simple_round(input_h), input_height - 1); + } + else + { + in_y = vsi_nn_min((uint32_t)floorf(input_h), input_height - 1); + } + + for (w = 0; w < output_width; w ++) + { + float input_w; + uint32_t in_x; + int32_t in_index; + int32_t out_index; + + if (half_pixel_centers) + { + input_w = ((float)w + 0.5f) * width_scale; + } + else + { + input_w = w * width_scale; + } + if (align_corners) + { + in_x = vsi_nn_min((uint32_t)simple_round(input_w), input_width - 1); + } + else + { + in_x = vsi_nn_min((uint32_t)floorf(input_w), input_width - 1); + } + in_index = in_x + in_y * input_width_orig + input_base; + out_index = w + h * output_width_orig + output_base; + f32_out_buffer[0][out_index] = f32_in_buffer[0][in_index]; + } + } + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; + +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _resize_nearest_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _resize_nearest_kernel_param_def ); + status = VSI_SUCCESS; + + return status; + +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RESIZE_NEAREST_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = 
vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_NEAREST_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_NEAREST_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + } + } + + return node; + +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( resize_nearest, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c new file mode 100644 index 0000000..1369867 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c @@ -0,0 +1,259 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
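+ * The CPU reference kernel for scatter_nd takes an index tensor and an update
+ * tensor, produces a single output tensor, and receives three scalar arguments
+ * (block_size, coord_dim, indices_num).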
+ */ +#define _CPU_ARG_NUM (3) +#define _CPU_INPUT_NUM (2) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.scatter_nd") + +DEF_KERNEL_EXECUTOR(_scatter_nd_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + uint32_t * para_buffer[1] = { NULL }; + float * buffer[2] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + int32_t i = 0, j = 0; + int32_t block_size = 1, indices_num = 1; + int32_t coord_dim = 1; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; // idx int + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; // update + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; // output + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); + + para_buffer[0] = (uint32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], FALSE ); + CHECK_PTR_FAIL_GOTO( para_buffer[0], "Create input0 buffer fail.", final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input1 buffer fail.", final ); + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &(block_size)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &(coord_dim)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &(indices_num)); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + if(coord_dim <= 3) + { + int32_t stride[3] = {0, 0, 0}; + int32_t new_shape[3] = {1, 1, 1}; + int32_t merge_dim = (int32_t)attr[2]->shape->size - coord_dim + 1; + + for(i = 0; i < merge_dim; ++i) + { + new_shape[0] *= attr[2]->shape->data[i]; + } + stride[0] = new_shape[0] / block_size; + + for(i = 1; i < coord_dim; ++i) + { + new_shape[i] = attr[2]->shape->data[merge_dim + i - 1]; + + stride[i] = stride[i - 1] * new_shape[i]; + } + + for(i = 0; i < indices_num; i++) + { + uint32_t in_index = i * block_size; + uint32_t out_index = 0; + uint32_t coord[3] = {0}; + int32_t byd_flg = 0; + + for(j = 0; j < coord_dim; j++) + { + coord[j] = para_buffer[0][i * coord_dim + coord_dim - j - 1]; + if(coord[j] >= (uint32_t)new_shape[j]) + { + byd_flg = 1; + break; + } + } + if(byd_flg) + { + continue; + } + + out_index = (coord[2] * stride[1] + coord[1] * stride[0] + coord[0]) * block_size; + for(j = 0; j < block_size; j++) + { + buffer[1][out_index + j] += buffer[0][in_index + j]; + } + } + } + else + { + status = VSI_FAILURE; + CHECK_STATUS_FAIL_GOTO( status, final ); + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if( para_buffer[0] ) + { + free( para_buffer[0] ); + } + for( i = 0; i < 2; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] 
); + } + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _scatter_nd_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _scatter_nd_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _scatter_nd_exec, + _scatter_nd_kernel_param_def, + _cnt_of_array( _scatter_nd_kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" ); + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 3; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &idx_num ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[3] ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + vsi_nn_kernel_scalar_release( &backend_params[5] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( scatter_nd, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/select_cpu.c b/src/tim/vx/internal/src/kernel/cpu/select_cpu.c new file mode 100644 index 0000000..d6804bd --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/select_cpu.c @@ -0,0 +1,239 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
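+ * The CPU reference kernel for select performs an element-wise choice between
+ * two value tensors based on a condition tensor (input 0); broadcasting of the
+ * three inputs against the output shape is handled by _expand_offset().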
+ */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.select") + +/* + * Kernel params + */ +static vx_param_description_t _select_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _SELECT_PARAM_NUM _cnt_of_array( _select_kernel_param_def ) + +static int32_t _expand_offset + ( + int32_t index, + int32_t * shape, size_t rank, + size_t * strides, int32_t * out_shape + ) +{ + uint32_t i; + int32_t offset = 0; + + for( i = 0; i < rank && index; i ++ ) + { + if( shape[i] == out_shape[i] ) + { + offset += (uint32_t)strides[i] * ( index % out_shape[i] ); + } + index /= out_shape[i]; + } + return offset; +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + for (i = 0; i < out_elements[0]; i++) + { + int32_t in0_offset = 0; + int32_t in1_offset = 0; + int32_t in2_offset = 0; + + in0_offset = _expand_offset( i, in_attr[0]->shape->data, in_attr[0]->shape->size, + in_stride_size[0], out_attr[0]->shape->data ); + in1_offset = _expand_offset( i, in_attr[1]->shape->data, in_attr[1]->shape->size, + in_stride_size[1], out_attr[0]->shape->data ); + in2_offset = _expand_offset( i, in_attr[2]->shape->data, in_attr[2]->shape->size, + in_stride_size[2], out_attr[0]->shape->data ); + + f32_out_buffer[0][i] = (f32_in_buffer[0][in0_offset]) ? 
+ f32_in_buffer[1][in1_offset] : f32_in_buffer[2][in2_offset]; + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _select_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _select_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SELECT_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( kernel, inputs, outputs); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SELECT_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SELECT_PARAM_NUM ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( select, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/swish_cpu.c b/src/tim/vx/internal/src/kernel/cpu/swish_cpu.c new file mode 100644 index 0000000..5b09ff7 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/swish_cpu.c @@ -0,0 +1,298 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.swish") + + +/* + * Kernel params + */ +static vx_param_description_t _swish_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _SWISH_PARAM_NUM _cnt_of_array( _swish_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_swish_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + float beta = 1.0f; + uint32_t i; + + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[_CPU_IO_NUM], &(beta)); + + /* TODO: Add CPU kernel implement */ + /* example code : copy data form input tensor to output tensor*/ + + for (i = 0; i < out_elements[0]; i++) + { + float val = f32_in_buffer[0][i]; + f32_out_buffer[0][i] = val * 1.0f / (1.0f + (float)exp(beta * val * (-1.0f))); + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { 
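+        /* Release the per-input staging float buffer and tensor attribute; this
+         * cleanup runs on the success path as well as on the error paths that
+         * jump to the final label. */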
+ if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + + return status; +} /* _swish_compute() */ + +DEF_KERNEL_EXECUTOR(_hswish_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + uint32_t i; + + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + f32_out_buffer[i] = (float *)malloc( out_elements[i] * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_elements[i] * sizeof(float) ); + } + + /* TODO: Add CPU kernel implement */ + /* example code : copy data form input tensor to output tensor*/ + + + for (i = 0; i < out_elements[0]; i++) + { + float val = f32_in_buffer[0][i]; + f32_out_buffer[0][i] = val * vsi_nn_clamp((val + 3.0f), 0.0f, 6.0f) / 6.0f; + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + + return status; +} /* _hswish_compute() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_nn_swish_type swish_type + /* Add extra params */ + ) +{ + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + if (VSI_NN_SWISH == swish_type) + { + kernel->info.function = _swish_compute; + } + else + { + kernel->info.function = _hswish_compute; + } + kernel->info.parameters = _swish_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _swish_kernel_param_def ); + + return VSI_SUCCESS; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, 
+ size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SWISH_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + int32_t swish_type = vsi_nn_kernel_param_get_int32( params, "type" ); + float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); + + status = _query_kernel( kernel, inputs, outputs, (vsi_nn_swish_type)swish_type); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SWISH_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[_CPU_IO_NUM] = vsi_nn_kernel_scalar_create( graph, F32, &beta ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SWISH_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( swish, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c b/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c new file mode 100644 index 0000000..63c2f4c --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c @@ -0,0 +1,223 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "client/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +#define _CPU_ARG_NUM (0) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("tile_sw") + +void copyMultipleTimes(const float* in_data, int32_t in_size, int32_t multiplier, float* out_data) +{ + int i = 0; + + for ( i = 0; i < multiplier; ++i) + { + memcpy(out_data, in_data, in_size * sizeof(float)); + out_data += in_size; + } +} + +void tileOneDimension(const vsi_int_array_t* input_shape, const float* in_data, + const uint32_t* multipliers, float* out_data, int dimension, + int *stride_size, int *tiled_stride_size) +{ + int i = 0; + const int dimension_size = input_shape->data[dimension]; + int total_stride_size = 0, total_tiled_stride_size = 0; + const float* copy_from_data = in_data; + float* copy_to_data = out_data; + + if (dimension == 0) + { + copyMultipleTimes(in_data, dimension_size, multipliers[dimension], out_data); + *stride_size = dimension_size; + *tiled_stride_size = dimension_size * multipliers[dimension]; + return ; + } + + for (i = 0; i < dimension_size; ++i) + { + tileOneDimension( + input_shape, copy_from_data, multipliers, copy_to_data, dimension - 1, stride_size, tiled_stride_size); + copy_from_data += *stride_size; + copy_to_data += *tiled_stride_size; + total_stride_size += *stride_size; + total_tiled_stride_size += *tiled_stride_size; + } + + copyMultipleTimes(out_data, total_tiled_stride_size, multipliers[dimension] - 1, + out_data + total_tiled_stride_size); + + *stride_size = total_stride_size; + *tiled_stride_size = total_tiled_stride_size * multipliers[dimension]; + return ; +} + + +DEF_KERNEL_EXECUTOR(_tile_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_SUCCESS; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + uint32_t multiples[VSI_NN_MAX_DIM_NUM] = {0}; + int stride_size = 0, tiled_stride_size = 0; + + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + for (i = 0; i < attr[0]->shape->size; i++) + { + multiples[i] = attr[1]->shape->data[i] / attr[0]->shape->data[i]; + } + + tileOneDimension(attr[0]->shape, buffer[0], multiples, buffer[1], + (int32_t)attr[0]->shape->size - 1, &stride_size, &tiled_stride_size); + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + 
CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + + return status; +} /* _tile_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; + + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _tile_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +REGISTER_BACKEND_CPU( tile, _setup ) + +__END_DECLS + + + diff --git a/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c b/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c new file mode 100644 index 0000000..7dd0a16 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c @@ -0,0 +1,270 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.upsample") + + +/* + * Kernel params + */ +static vx_param_description_t _upsample_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _UPSAMPLE_PARAM_NUM _cnt_of_array( _upsample_kernel_param_def ) + +#define SCALAR_KSZIE_X (3) +#define SCALAR_KSZIE_Y (4) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + int32_t i, j, b, p; + int32_t batch, depth, height, width, height_o, width_o; + int32_t input_base = 0; + int32_t output_base = 0; + int32_t ksize_x = 0; + int32_t ksize_y = 0; + vsi_bool is_relative_coord = FALSE; + vsi_nn_kernel_dtype_e input1_dtype; + + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + if (1 == i) + { + in_attr[1]->quant = VSI_NN_KERNEL_QUANT_NONE; + } + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_KSZIE_X], &ksize_x); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_KSZIE_Y], &ksize_y); + + batch = in_attr[0]->shape->size > 3 ? in_attr[0]->shape->data[3] : 1; + depth = in_attr[0]->shape->size > 2 ? 
in_attr[0]->shape->data[2] : 1; + width = in_attr[0]->shape->data[0]; + height = in_attr[0]->shape->data[1]; + width_o = out_attr[0]->shape->data[0]; + height_o = out_attr[0]->shape->data[1]; + input1_dtype = in_attr[1]->dtype; + + if ((I8 == input1_dtype) || (U8 == input1_dtype) || (I16 == input1_dtype)) + { + is_relative_coord = TRUE; + } + + + for(b = 0; b < batch; b++) + { + for (p = 0; p < depth; p ++) + { + input_base = b * depth * height * width + p * height * width; + output_base = b * depth * height_o * width_o + p * height_o * width_o; + for (j = 0; j < height; j ++) + { + for (i = 0; i < width; i ++) + { + int32_t in_index = input_base + j * width + i; + float in_value = f32_in_buffer[0][in_index]; + int32_t up_index = (int32_t)f32_in_buffer[1][in_index]; + int32_t out_index = up_index; + if (is_relative_coord) + { + int32_t relative_y = up_index / ksize_x; + int32_t relative_x = up_index % ksize_x; + out_index = output_base + ((j * ksize_y) + relative_y) * width_o + i * ksize_x + relative_x; + } + f32_out_buffer[0][out_index] = in_value; + } + } + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _upsample_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _upsample_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_UPSAMPLE_PARAM_NUM] = {NULL}; + int32_t scale_x = 0; + int32_t scale_y = 0; + vsi_nn_kernel_node_t node = NULL; + + scale_x = vsi_nn_kernel_param_get_int32(params, "scale_x"); + scale_y = vsi_nn_kernel_param_get_int32(params, "scale_y"); + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _UPSAMPLE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_KSZIE_X] = vsi_nn_kernel_scalar_create( + graph, I32, &scale_x ); + node_params[SCALAR_KSZIE_Y] = vsi_nn_kernel_scalar_create( + graph, I32, &scale_y ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _UPSAMPLE_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_KSZIE_X] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_KSZIE_Y] ); + } + } + + return node; + +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( upsample, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c b/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c new file mode 100644 index 0000000..4401d24 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c @@ -0,0 +1,361 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +typedef enum _internal_img_dim_e +{ + IMAGE = 0, + IMAGE_2D, +} internal_img_dim_e; + +#define _A_TIMES_B_PLUS_C_KERNEL_SOURCE "a_times_b_plus_c" + +#define STR(a) #a + +// Add kernel hashtable here +#define A_TIMES_B_PLUS_C_HASH_KEY(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, _image_2d) \ + ((IN2_DTYPE << 24) | (IN1_DTYPE << 16) | ( IN0_DTYPE << 8 ) | ( OUT_DTYPE << 1) | (_image_2d)) + +#define A_TIMES_B_PLUS_C_SH_KERNEL_NAME(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE) \ +CVIVANTE_NAMESPACE("evis.a_times_b_plus_c_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"_"STR(IN2_DTYPE)"to"STR(OUT_DTYPE)) + +#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE) \ + { A_TIMES_B_PLUS_C_HASH_KEY(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, IMAGE), \ + A_TIMES_B_PLUS_C_SH_KERNEL_NAME(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE), \ + _A_TIMES_B_PLUS_C_KERNEL_SOURCE} + +#define A_TIMES_B_PLUS_C_SH_KERNEL_2D_NAME(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE) \ +CVIVANTE_NAMESPACE("evis.a_times_b_plus_c_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"_"STR(IN2_DTYPE)"to"STR(OUT_DTYPE)"_2D") + +#define PACK_KERNEL_MAP_2D(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE) \ + { A_TIMES_B_PLUS_C_HASH_KEY(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, IMAGE_2D), \ + A_TIMES_B_PLUS_C_SH_KERNEL_2D_NAME(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE), \ + _A_TIMES_B_PLUS_C_KERNEL_SOURCE} + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _a_times_b_plus_c_kernel_map[] = +{ + PACK_KERNEL_MAP(F16, F16, F16, F16), + + PACK_KERNEL_MAP_2D(F16, F16, F16, F16), +}; + +/* + * Kernel params + */ +static vx_param_description_t _a_times_b_plus_c_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _A_TIMES_B_PLUS_C_PARAM_NUM _cnt_of_array( _a_times_b_plus_c_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ +#define _PACK_A_TIMES_B_PLUS_C_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \ + (( IN1_TYPE << 24) | ( IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE)) + vsi_status status = VX_SUCCESS; + // Alignment with a power of two value. 
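+    // Each work-item is scaled to cover 8 elements along the innermost dimension
+    // (global_scale[0] = 8), and the x dimension of the global size is rounded up
+    // with gpu_align_p2() below.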
+ gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vx_tensor input0 = (vx_tensor)param[0]; + vx_tensor input1 = (vx_tensor)param[1]; + vx_tensor input2 = (vx_tensor)param[2]; + vx_tensor output = (vx_tensor)param[3]; + uint32_t i = 0; + + vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL, NULL, NULL, NULL }; + vsi_int_array_t *output_shape = NULL; + uint32_t pack_key = 0; + + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); + CHECK_PTR_FAIL_GOTO( attr[0], "vsi_nn_kernel_tensor_attr_create fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1); + CHECK_PTR_FAIL_GOTO( attr[1], "vsi_nn_kernel_tensor_attr_create fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input2); + CHECK_PTR_FAIL_GOTO( attr[2], "vsi_nn_kernel_tensor_attr_create fail.", final ); + attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); + CHECK_PTR_FAIL_GOTO( attr[3], "vsi_nn_kernel_tensor_attr_create fail.", final ); + + + pack_key = _PACK_A_TIMES_B_PLUS_C_KEY( attr[0]->dtype, attr[1]->dtype, attr[2]->dtype, attr[3]->dtype ); + + output_shape = attr[3]->shape; + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = output_shape->data[1]; + gpu_param.global_size[2] = output_shape->size > 2 ? output_shape->data[2] : 1; + + + switch( pack_key ) + { + case _PACK_A_TIMES_B_PLUS_C_KEY( F16, F16, F16, F16 ): + { + gpu_dp_inst_t uniA_Times_B_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x11111111, // BSelt + 0x03020100, 0x07060504, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniA_Plus_B_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x33221100, 0x77665544, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniA_Times_B_2x8", &uniA_Times_B_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniA_Plus_B_2x8", &uniA_Plus_B_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + for ( i = 0; i < 4; i++) + { + if (attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + } + +#undef _PACK_A_TIMES_B_PLUS_C_KEY + return status; +} /* _a_times_b_plus_c_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e in2_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _a_times_b_plus_c_kernel_map; + size_t kernel_map_size = _cnt_of_array( _a_times_b_plus_c_kernel_map ); + vx_param_description_t * param_def = 
_a_times_b_plus_c_kernel_param_def; + size_t param_def_size = _cnt_of_array( _a_times_b_plus_c_kernel_param_def ); + vx_kernel_initialize_f initializer = _a_times_b_plus_c_initializer; + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = A_TIMES_B_PLUS_C_HASH_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype, image_2d); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_A_TIMES_B_PLUS_C_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + int32_t* shapes_in[_INPUT_NUM]; + size_t rank_in[_INPUT_NUM]; + int32_t* shapes_ptr[_IO_NUM]; + int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + uint32_t new_rank = 0; + int32_t i = 0; + vsi_bool ret = FALSE; + vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + + for (i = 0; i < _IO_NUM; i++) + { + shapes_ptr[i] = shapes[i]; + } + + for (i = 0; i < _INPUT_NUM; i++) + { + shapes_in[i] = (int32_t *)inputs[i]->attr.size; + rank_in[i] = (size_t)inputs[i]->attr.dim_num; + } + + ret = vsi_nn_kernel_optimize_broadcast_shape( + (const int32_t**)shapes_in, (const size_t*)rank_in, _INPUT_NUM, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes_ptr, shapes[_INPUT_NUM], &new_rank); + + if( ret ) + { + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = vsi_nn_reshape_tensor( graph, + inputs[i], (uint32_t*)shapes[i], new_rank ); + } + + for (i = 0; i < _OUTPUT_NUM; i++) + { + reshape_tensors[i + _INPUT_NUM] = vsi_nn_reshape_tensor( graph, + outputs[i], (uint32_t*)shapes[i + _INPUT_NUM], new_rank ); + } + } + else + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[_INPUT_NUM]->attr.size, + reshape_tensors[_INPUT_NUM]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (reshape_tensors[_INPUT_NUM]->attr.dim_num == 2 || reshape_tensors[_INPUT_NUM]->attr.size[2] == 1); + status = _query_kernel( kernel, reshape_tensors, &reshape_tensors[_INPUT_NUM], image_2d); + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _A_TIMES_B_PLUS_C_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num ); 
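+            /* node_params now holds the reshaped inputs followed by the reshaped
+             * output, in the order given by the kernel parameter definition. */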
+ /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _A_TIMES_B_PLUS_C_PARAM_NUM ); + } + } + + for (i = 0; i < _IO_NUM; i++) + { + if (reshape_tensors[i]) + { + vsi_nn_ReleaseTensor( &reshape_tensors[i] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( a_times_b_plus_c, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c new file mode 100644 index 0000000..501e860 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c @@ -0,0 +1,493 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _ADD_MEAN_STD_NORM_KERNEL_SOURCE "add_mean_std_norm" + +// Add kernel hashtable here +#define ADD_MEAN_STD_NORM_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + ((IN0_DTYPE << 20) | ( IN1_DTYPE << 12 ) | ( OUT_DTYPE << 4) ) + +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { ADD_MEAN_STD_NORM_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("evis.add_mean_std_norm_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE), \ + _ADD_MEAN_STD_NORM_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _add_mean_std_norm_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F16, F16, F16 ), + PACK_KERNEL_MAP( U8 , U8 , F16 ), + PACK_KERNEL_MAP( I16, I16, F16 ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _add_mean_std_norm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _ADD_MEAN_STD_NORM_PARAM_NUM _cnt_of_array( _add_mean_std_norm_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + // Alignment with a power of two value. 
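+    // gpu_param describes the 2-D work geometry; its sizes are filled in
+    // further below, after the tensor attributes and quantization
+    // parameters have been read.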
+ gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vx_tensor input0 = (vx_tensor)param[0]; + vx_tensor input1 = (vx_tensor)param[1]; + vx_tensor output = (vx_tensor)param[2]; + vsi_nn_kernel_dtype_e input_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; + vsi_nn_kernel_tensor_attr_t *input0_attr = NULL, *input1_attr = NULL, *output_attr = NULL; + vsi_int_array_t *input_shape = NULL; + float scaleIn = 1.0f; + int32_t input_ZP = 0; + float scaleIn1 = 1.0f; + int32_t input_ZP1 = 0; + float scaleOut = 1.0f; + int32_t output_ZP = 0; + int32_t fixpoint = 0, fixpoint1 = 0, fixpoint_out = 0; + float inScale_dfp, inScale_dfp1; + float eps = 0.0f; + float rsEps = 0.0f; + float dimRatio = 0.0f; + int32_t width = 0; + + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); + CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1); + CHECK_PTR_FAIL_GOTO( input1_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + input_shape = input0_attr->shape; + input_dtype = input0_attr->dtype; + output_dtype = output_attr->dtype; + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[_CPU_IO_NUM], &(eps)); + rsEps = (float)(1.0f / sqrtf(eps)); + dimRatio = (float)(1.0 / (input_shape->data[0])); + + + if ( VSI_NN_KERNEL_QUANT_DFP == input0_attr->quant ) + { + fixpoint = input0_attr->dfp.fl; + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == input0_attr->quant ) + { + input_ZP = input0_attr->asymm.zero_point; + scaleIn = input0_attr->asymm.scale; + } + else + { + input_ZP = 0; + scaleIn = 1.0f; + } + + //input1 + if ( VSI_NN_KERNEL_QUANT_DFP == input1_attr->quant ) + { + fixpoint1 = input1_attr->dfp.fl; + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == input1_attr->quant ) + { + input_ZP1 = input1_attr->asymm.zero_point; + scaleIn1 = input1_attr->asymm.scale; + } + else + { + input_ZP1 = 0; + scaleIn1 = 1.0f; + } + + //output + if ( VSI_NN_KERNEL_QUANT_DFP == output_attr->quant ) + { + fixpoint_out = output_attr->dfp.fl; + if (fixpoint_out >= 0) + { + scaleOut = 1.0f / (vx_float32) ((int64_t)1 << fixpoint_out); + } + else + { + scaleOut = (vx_float32) ((int64_t)1 << -fixpoint_out); + } + output_ZP = 0; + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) + { + output_ZP = output_attr->asymm.zero_point; + scaleOut = output_attr->asymm.scale; + } + else + { + output_ZP = 0; + scaleOut = 1.0f; + } + + if (fixpoint >= 0) + { + inScale_dfp = 1.0f / (vx_float32) ((int64_t)1 << fixpoint); + } + else + { + inScale_dfp = (vx_float32) ((int64_t)1 << -fixpoint); + } + + if (fixpoint1 >= 0) + { + inScale_dfp1 = 1.0f / (vx_float32) ((int64_t)1 << fixpoint1); + } + else + { + inScale_dfp1 = (vx_float32) ((int64_t)1 << -fixpoint1); + } + + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.global_size[0] = 16; + gpu_param.global_size[1] = gpu_align_p2( (input_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1], gpu_param.local_size[1] ); + + { + gpu_dp_inst_t uniAddFp16_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x33221100, 0x77665544, // ABin + 0xaaaaaaaa, // BSelt + 
0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniAddFp16toFp32Lo_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniAddFp16toFp32Hi_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00550044, 0x00770066, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + status = vsi_nn_kernel_gpu_add_param(node, "uniAddFp16_2x8", &uniAddFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniAddFp16toFp32Lo_4x4", &uniAddFp16toFp32Lo_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniAddFp16toFp32Hi_4x4", &uniAddFp16toFp32Hi_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if( U8 == input_dtype && F16 == output_dtype ) + { + vx_uint16 M0 = 0; + vx_int8 postShift = 0; + vx_uint32 multAndoutZP0[2] = {0}; + vx_uint32 multAndoutZP1[2] = {0}; + + gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8MulAndPostShift_1_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + vsi_nn_GetFP32MultiAndPostShift(scaleIn / scaleOut, &M0, &postShift); + multAndoutZP0[0] = (vx_uint32)(M0); + multAndoutZP0[1] = (vx_uint32)((output_ZP << postShift) - input_ZP * M0); + uniU8MulAndPostShift_0_Lo_2x8.data[7] |= (postShift & 0x1F); + + vsi_nn_GetFP32MultiAndPostShift(scaleIn1 / scaleOut, &M0, &postShift); + multAndoutZP1[0] = (vx_uint32)(M0); + multAndoutZP1[1] = (vx_uint32)((output_ZP 
<< postShift) - input_ZP1 * M0); + uniU8MulAndPostShift_1_Lo_2x8.data[7] |= (postShift & 0x1F); + + status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift_1_Lo_2x8", &uniU8MulAndPostShift_1_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if( I16 == input_dtype && F16 == output_dtype ) + { + gpu_dp_inst_t uniConvertInt16ScaleToFp32Fst_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x01010101, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertInt16ScaleToFp32Sec_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x01010101, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16ScaleToFp32Fst_4x4", + &uniConvertInt16ScaleToFp32Fst_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16ScaleToFp32Sec_4x4", + &uniConvertInt16ScaleToFp32Sec_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inScale_i16", &inScale_dfp); + status |= vsi_nn_kernel_gpu_add_param(node, "inScale1_i16", &inScale_dfp1); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + width = input_shape->data[0]; + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); + status |= vsi_nn_kernel_gpu_add_param(node, "rsEps", &rsEps); + CHECK_STATUS_FAIL_GOTO(status, final ); + + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + if (input0_attr) + { + vsi_nn_kernel_tensor_attr_release(&input0_attr); + } + if (input1_attr) + { + vsi_nn_kernel_tensor_attr_release(&input1_attr); + } + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + return status; + +} /* _add_mean_std_norm_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _add_mean_std_norm_kernel_map; + size_t kernel_map_size = _cnt_of_array( _add_mean_std_norm_kernel_map ); + vx_param_description_t * param_def = _add_mean_std_norm_kernel_param_def; + size_t param_def_size = _cnt_of_array( _add_mean_std_norm_kernel_param_def ); + vx_kernel_initialize_f initializer = _add_mean_std_norm_initializer; + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = ADD_MEAN_STD_NORM_HASH_KEY( in0_dtype, in1_dtype, out_dtype ); + + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + 
{ + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; + +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ADD_MEAN_STD_NORM_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + + status = _query_kernel( kernel, inputs, outputs ); + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = 0; + border.constant_value.S16 = 0; + border.constant_value.U8 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vsi_nn_kernel_node_set_border( node, &border ); + VSI_ASSERT( status == VSI_SUCCESS ); + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ADD_MEAN_STD_NORM_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[_CPU_IO_NUM] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ADD_MEAN_STD_NORM_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM] ); + } + } + + return node; + +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( add_mean_std_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c new file mode 100644 index 0000000..f7ad8f2 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c @@ -0,0 +1,434 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +#define HASH_ARGMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + #define HASH_ARGMAX_KERNEL_SOURCE_NAME(AXIS) \ + "argmax_axis"#AXIS + +#define HASH_ARGMAX_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_ARGMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.argmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ + HASH_ARGMAX_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_ARGMAX_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_ARGMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("evis.argmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + HASH_ARGMAX_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_ARGMAX_KERNELS_HALF( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_ARGMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.argmax_axis"#AXIS"_F16to"#OUT_DTYPE), \ + HASH_ARGMAX_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_ARGMAX_KERNELS_HALF_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_ARGMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("evis.argmax_axis"#AXIS"_F16to"#OUT_DTYPE"_2D"), \ + HASH_ARGMAX_KERNEL_SOURCE_NAME(AXIS) }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } _argmax_evis_kernel_map[] = +{ + HASH_ARGMAX_KERNELS_HALF(0, F16, U8) + HASH_ARGMAX_KERNELS_HALF(0, F16, I16) + HASH_ARGMAX_KERNELS_HALF(0, BF16, U8) + HASH_ARGMAX_KERNELS_HALF(0, BF16, I16) + HASH_ARGMAX_KERNELS_HALF(1, F16, U8) + HASH_ARGMAX_KERNELS_HALF(1, F16, I16) + HASH_ARGMAX_KERNELS_HALF(1, BF16, U8) + HASH_ARGMAX_KERNELS_HALF(1, BF16, I16) + HASH_ARGMAX_KERNELS_HALF(2, F16, U8) + HASH_ARGMAX_KERNELS_HALF(2, F16, I16) + HASH_ARGMAX_KERNELS_HALF(2, BF16, U8) + HASH_ARGMAX_KERNELS_HALF(2, BF16, I16) + + HASH_ARGMAX_KERNELS_HALF_2D(0, F16, U8) + HASH_ARGMAX_KERNELS_HALF_2D(0, F16, I16) + HASH_ARGMAX_KERNELS_HALF_2D(0, BF16, U8) + HASH_ARGMAX_KERNELS_HALF_2D(0, BF16, I16) + HASH_ARGMAX_KERNELS_HALF_2D(1, F16, U8) + HASH_ARGMAX_KERNELS_HALF_2D(1, F16, I16) + HASH_ARGMAX_KERNELS_HALF_2D(1, BF16, U8) + HASH_ARGMAX_KERNELS_HALF_2D(1, BF16, I16) + HASH_ARGMAX_KERNELS_HALF_2D(2, F16, U8) + HASH_ARGMAX_KERNELS_HALF_2D(2, F16, I16) + HASH_ARGMAX_KERNELS_HALF_2D(2, BF16, U8) + HASH_ARGMAX_KERNELS_HALF_2D(2, BF16, I16) + + HASH_ARGMAX_KERNELS(0, I8, U8) + HASH_ARGMAX_KERNELS(0, I8, I16) + HASH_ARGMAX_KERNELS(0, U8, U8) + HASH_ARGMAX_KERNELS(0, U8, I16) + HASH_ARGMAX_KERNELS(0, I16, U8) + HASH_ARGMAX_KERNELS(0, I16, I16) + HASH_ARGMAX_KERNELS(1, I8, U8) + HASH_ARGMAX_KERNELS(1, I8, I16) + HASH_ARGMAX_KERNELS(1, U8, U8) + HASH_ARGMAX_KERNELS(1, U8, I16) + HASH_ARGMAX_KERNELS(1, I16, U8) + HASH_ARGMAX_KERNELS(1, I16, I16) + HASH_ARGMAX_KERNELS(2, I8, U8) + HASH_ARGMAX_KERNELS(2, I8, I16) + HASH_ARGMAX_KERNELS(2, U8, U8) + HASH_ARGMAX_KERNELS(2, U8, I16) + HASH_ARGMAX_KERNELS(2, I16, U8) + HASH_ARGMAX_KERNELS(2, 
I16, I16) + + HASH_ARGMAX_KERNELS_2D(0, I8, U8) + HASH_ARGMAX_KERNELS_2D(0, I8, I16) + HASH_ARGMAX_KERNELS_2D(0, U8, U8) + HASH_ARGMAX_KERNELS_2D(0, U8, I16) + HASH_ARGMAX_KERNELS_2D(0, I16, U8) + HASH_ARGMAX_KERNELS_2D(0, I16, I16) + HASH_ARGMAX_KERNELS_2D(1, I8, U8) + HASH_ARGMAX_KERNELS_2D(1, I8, I16) + HASH_ARGMAX_KERNELS_2D(1, U8, U8) + HASH_ARGMAX_KERNELS_2D(1, U8, I16) + HASH_ARGMAX_KERNELS_2D(1, I16, U8) + HASH_ARGMAX_KERNELS_2D(1, I16, I16) + HASH_ARGMAX_KERNELS_2D(2, I8, U8) + HASH_ARGMAX_KERNELS_2D(2, I8, I16) + HASH_ARGMAX_KERNELS_2D(2, U8, U8) + HASH_ARGMAX_KERNELS_2D(2, U8, I16) + HASH_ARGMAX_KERNELS_2D(2, I16, U8) + HASH_ARGMAX_KERNELS_2D(2, I16, I16) +}; + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) + +#define SCALAR_INPUT_AXIS (2) + +DEF_KERNEL_INITIALIZER(_argmax_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + int32_t axis = 0; + uint32_t argLenSub1 = 0; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; + vsi_int_array_t * input_shape = NULL; + vsi_int_array_t * output_shape = NULL; + uint32_t packedArgIdx[4] = {0}; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + input_shape = attr[0]->shape; + output_shape = attr[1]->shape; + + if (axis == 2 && input_shape->data[2] == 1) + { + argLenSub1 = input_shape->data[1] - 1; + } + else + { + if (axis == 2) + argLenSub1 = input_shape->data[2] - 1; + else if (axis == 1) + argLenSub1 = input_shape->data[1] - 1; + } + + if (axis == 0) + { + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + if (attr[0]->dtype == F16 || attr[0]->dtype == BF16) + { + packedArgIdx[0] = 0x00000000; + packedArgIdx[1] = 0x00000001; + packedArgIdx[2] = 0x00000002; + packedArgIdx[3] = 0x00000003; + } + else if (attr[1]->dtype == I8 || attr[1]->dtype == U8) + { + packedArgIdx[0] = 0x03020100; + packedArgIdx[1] = 0x07060504; + packedArgIdx[2] = 0x0b0a0908; + packedArgIdx[3] = 0x0f0e0d0c; + } + else + { + packedArgIdx[0] = 0x00010000; + packedArgIdx[1] = 0x00030002; + packedArgIdx[2] = 0x00050004; + packedArgIdx[3] = 0x00070006; + } + } + else + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + packedArgIdx[0] = packedArgIdx[1] = (argLenSub1 << 16) | (argLenSub1 & 0xFFFF); + packedArgIdx[2] = packedArgIdx[3] = (argLenSub1 << 16) | (argLenSub1 & 0xFFFF); + + if (attr[0]->dtype == I8 || + attr[0]->dtype == U8) + { + if ( attr[1]->dtype == I8 || + attr[1]->dtype == U8) + { + uint32_t pack = ((argLenSub1 & 0xFF) << 24) | ((argLenSub1 & 0xFF) << 16) + | ((argLenSub1 & 0xFF) << 8) | (argLenSub1 & 0xFF); + packedArgIdx[0] = packedArgIdx[1] = pack; + packedArgIdx[2] = packedArgIdx[3] = pack; + } + } + } + + 
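+    /* Work size: dim 0 is divided by global_scale[0] (8 elements per
+     * work-item for axis 1/2, 1 for axis 0) and rounded up to a multiple
+     * of 4; dims 1 and 2 follow the output shape directly. */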
gpu_param.global_size[0] = gpu_align_p2( + (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = output_shape->size > 2 ? output_shape->data[2] : 1; + + switch( axis ) + { + case 0: + { + gpu_dp_inst_t uniPackedIdxAddSat_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x33221100, 0x77665544, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0xffff0001, 0xffff0001, 0xffff0001, 0xffff0001, + 0xffff0001, 0xffff0001, 0xffff0001, 0xffff0001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSrcT2DstT_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, + 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalf2Float32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if( attr[0]->dtype == F16 || attr[0]->dtype == BF16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertHalf2Float32_4x4", &uniConvertHalf2Float32_4x4 ); + } + else + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniPackedIdxAddSat_2x8", &uniPackedIdxAddSat_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSrcT2DstT_2x8", &uniSrcT2DstT_2x8 ); + } + status |= vsi_nn_kernel_gpu_add_param( node, + "inputWidth", &input_shape->data[0] ); + status |= vsi_nn_kernel_gpu_add_param( node, + "packedArgIdx", packedArgIdx ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case 1: + case 2: + { + gpu_dp_inst_t uniExtractData_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtractData_2x8", &uniExtractData_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "argLenSub1", &argLenSub1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "packedArgIdx", packedArgIdx ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) vsi_nn_kernel_tensor_attr_release( &attr[0] ); + if (attr[1]) vsi_nn_kernel_tensor_attr_release( &attr[1] ); + + return status; +} /* _argmax_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + int32_t axis, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = 
HASH_ARGMAX_HASH_KEY( axis, input_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(_argmax_evis_kernel_map); i ++ ) + { + if( _argmax_evis_kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(_argmax_evis_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _argmax_evis_kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _argmax_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _argmax_evis_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _argmax_evis_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( inputs, outputs, axis, image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( node_params, _EVIS_PARAM_NUM, + inputs, 1, outputs, 1 ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, _EVIS_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( argmax, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/argmin_evis.c b/src/tim/vx/internal/src/kernel/evis/argmin_evis.c new file mode 100644 index 0000000..ae94cfd --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/argmin_evis.c @@ -0,0 +1,434 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +#define HASH_ARGMIN_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + #define HASH_ARGMIN_KERNEL_SOURCE_NAME(AXIS) \ + "argmin_axis"#AXIS + +#define HASH_ARGMIN_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_ARGMIN_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.argmin_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ + HASH_ARGMIN_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_ARGMIN_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_ARGMIN_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("evis.argmin_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + HASH_ARGMIN_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_ARGMIN_KERNELS_HALF( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_ARGMIN_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.argmin_axis"#AXIS"_F16to"#OUT_DTYPE), \ + HASH_ARGMIN_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_ARGMIN_KERNELS_HALF_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_ARGMIN_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("evis.argmin_axis"#AXIS"_F16to"#OUT_DTYPE"_2D"), \ + HASH_ARGMIN_KERNEL_SOURCE_NAME(AXIS) }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } _argmin_evis_kernel_map[] = +{ + HASH_ARGMIN_KERNELS_HALF(0, F16, U8) + HASH_ARGMIN_KERNELS_HALF(0, F16, I16) + HASH_ARGMIN_KERNELS_HALF(0, BF16, U8) + HASH_ARGMIN_KERNELS_HALF(0, BF16, I16) + HASH_ARGMIN_KERNELS_HALF(1, F16, U8) + HASH_ARGMIN_KERNELS_HALF(1, F16, I16) + HASH_ARGMIN_KERNELS_HALF(1, BF16, U8) + HASH_ARGMIN_KERNELS_HALF(1, BF16, I16) + HASH_ARGMIN_KERNELS_HALF(2, F16, U8) + HASH_ARGMIN_KERNELS_HALF(2, F16, I16) + HASH_ARGMIN_KERNELS_HALF(2, BF16, U8) + HASH_ARGMIN_KERNELS_HALF(2, BF16, I16) + + HASH_ARGMIN_KERNELS_HALF_2D(0, F16, U8) + HASH_ARGMIN_KERNELS_HALF_2D(0, F16, I16) + HASH_ARGMIN_KERNELS_HALF_2D(0, BF16, U8) + HASH_ARGMIN_KERNELS_HALF_2D(0, BF16, I16) + HASH_ARGMIN_KERNELS_HALF_2D(1, F16, U8) + HASH_ARGMIN_KERNELS_HALF_2D(1, F16, I16) + HASH_ARGMIN_KERNELS_HALF_2D(1, BF16, U8) + HASH_ARGMIN_KERNELS_HALF_2D(1, BF16, I16) + HASH_ARGMIN_KERNELS_HALF_2D(2, F16, U8) + HASH_ARGMIN_KERNELS_HALF_2D(2, F16, I16) + HASH_ARGMIN_KERNELS_HALF_2D(2, BF16, U8) + HASH_ARGMIN_KERNELS_HALF_2D(2, BF16, I16) + + HASH_ARGMIN_KERNELS(0, I8, U8) + HASH_ARGMIN_KERNELS(0, I8, I16) + HASH_ARGMIN_KERNELS(0, U8, U8) + HASH_ARGMIN_KERNELS(0, U8, I16) + HASH_ARGMIN_KERNELS(0, I16, U8) + HASH_ARGMIN_KERNELS(0, I16, I16) + HASH_ARGMIN_KERNELS(1, I8, U8) + HASH_ARGMIN_KERNELS(1, I8, I16) + HASH_ARGMIN_KERNELS(1, U8, U8) + HASH_ARGMIN_KERNELS(1, U8, I16) + HASH_ARGMIN_KERNELS(1, I16, U8) + HASH_ARGMIN_KERNELS(1, I16, I16) + HASH_ARGMIN_KERNELS(2, I8, U8) + HASH_ARGMIN_KERNELS(2, I8, I16) + HASH_ARGMIN_KERNELS(2, U8, U8) + HASH_ARGMIN_KERNELS(2, U8, I16) + HASH_ARGMIN_KERNELS(2, I16, U8) + HASH_ARGMIN_KERNELS(2, 
I16, I16) + + HASH_ARGMIN_KERNELS_2D(0, I8, U8) + HASH_ARGMIN_KERNELS_2D(0, I8, I16) + HASH_ARGMIN_KERNELS_2D(0, U8, U8) + HASH_ARGMIN_KERNELS_2D(0, U8, I16) + HASH_ARGMIN_KERNELS_2D(0, I16, U8) + HASH_ARGMIN_KERNELS_2D(0, I16, I16) + HASH_ARGMIN_KERNELS_2D(1, I8, U8) + HASH_ARGMIN_KERNELS_2D(1, I8, I16) + HASH_ARGMIN_KERNELS_2D(1, U8, U8) + HASH_ARGMIN_KERNELS_2D(1, U8, I16) + HASH_ARGMIN_KERNELS_2D(1, I16, U8) + HASH_ARGMIN_KERNELS_2D(1, I16, I16) + HASH_ARGMIN_KERNELS_2D(2, I8, U8) + HASH_ARGMIN_KERNELS_2D(2, I8, I16) + HASH_ARGMIN_KERNELS_2D(2, U8, U8) + HASH_ARGMIN_KERNELS_2D(2, U8, I16) + HASH_ARGMIN_KERNELS_2D(2, I16, U8) + HASH_ARGMIN_KERNELS_2D(2, I16, I16) +}; + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) + +#define SCALAR_INPUT_AXIS (2) + +DEF_KERNEL_INITIALIZER(_argmin_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + int32_t axis = 0; + uint32_t argLenSub1 = 0; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; + vsi_int_array_t * input_shape = NULL; + vsi_int_array_t * output_shape = NULL; + uint32_t packedArgIdx[4] = {0}; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + input_shape = attr[0]->shape; + output_shape = attr[1]->shape; + + if (axis == 2 && input_shape->data[2] == 1) + { + argLenSub1 = input_shape->data[1] - 1; + } + else + { + if (axis == 2) + argLenSub1 = input_shape->data[2] - 1; + else if (axis == 1) + argLenSub1 = input_shape->data[1] - 1; + } + + if (axis == 0) + { + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + if (attr[0]->dtype == F16 || attr[0]->dtype == BF16) + { + packedArgIdx[0] = 0x00000000; + packedArgIdx[1] = 0x00000001; + packedArgIdx[2] = 0x00000002; + packedArgIdx[3] = 0x00000003; + } + else if (attr[1]->dtype == I8 || attr[1]->dtype == U8) + { + packedArgIdx[0] = 0x03020100; + packedArgIdx[1] = 0x07060504; + packedArgIdx[2] = 0x0b0a0908; + packedArgIdx[3] = 0x0f0e0d0c; + } + else + { + packedArgIdx[0] = 0x00010000; + packedArgIdx[1] = 0x00030002; + packedArgIdx[2] = 0x00050004; + packedArgIdx[3] = 0x00070006; + } + } + else + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + packedArgIdx[0] = packedArgIdx[1] = (argLenSub1 << 16) | (argLenSub1 & 0xFFFF); + packedArgIdx[2] = packedArgIdx[3] = (argLenSub1 << 16) | (argLenSub1 & 0xFFFF); + + if (attr[0]->dtype == I8 || + attr[0]->dtype == U8) + { + if ( attr[1]->dtype == I8 || + attr[1]->dtype == U8) + { + uint32_t pack = ((argLenSub1 & 0xFF) << 24) | ((argLenSub1 & 0xFF) << 16) + | ((argLenSub1 & 0xFF) << 8) | (argLenSub1 & 0xFF); + packedArgIdx[0] = packedArgIdx[1] = pack; + packedArgIdx[2] = packedArgIdx[3] = pack; + } + } + } + + 
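+    /* packedArgIdx is handed to the shader as a packed per-lane constant:
+     * for axis 0 it enumerates lane indices, otherwise every lane is
+     * pre-loaded with argLenSub1 (per byte when both input and output are
+     * 8-bit, per 16-bit lane otherwise). */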
gpu_param.global_size[0] = gpu_align_p2( + (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = output_shape->size > 2 ? output_shape->data[2] : 1; + + switch( axis ) + { + case 0: + { + gpu_dp_inst_t uniPackedIdxAddSat_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x33221100, 0x77665544, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0xffff0001, 0xffff0001, 0xffff0001, 0xffff0001, + 0xffff0001, 0xffff0001, 0xffff0001, 0xffff0001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSrcT2DstT_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, + 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalf2Float32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if( attr[0]->dtype == F16 || attr[0]->dtype == BF16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertHalf2Float32_4x4", &uniConvertHalf2Float32_4x4 ); + } + else + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniPackedIdxAddSat_2x8", &uniPackedIdxAddSat_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSrcT2DstT_2x8", &uniSrcT2DstT_2x8 ); + } + status |= vsi_nn_kernel_gpu_add_param( node, + "inputWidth", &input_shape->data[0] ); + status |= vsi_nn_kernel_gpu_add_param( node, + "packedArgIdx", packedArgIdx ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case 1: + case 2: + { + gpu_dp_inst_t uniExtractData_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtractData_2x8", &uniExtractData_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "argLenSub1", &argLenSub1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "packedArgIdx", packedArgIdx ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) vsi_nn_kernel_tensor_attr_release( &attr[0] ); + if (attr[1]) vsi_nn_kernel_tensor_attr_release( &attr[1] ); + + return status; +} /* _argmin_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + int32_t axis, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = 
HASH_ARGMIN_HASH_KEY( axis, input_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(_argmin_evis_kernel_map); i ++ ) + { + if( _argmin_evis_kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(_argmin_evis_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _argmin_evis_kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _argmin_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _argmin_evis_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _argmin_evis_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( inputs, outputs, axis, image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( node_params, _EVIS_PARAM_NUM, + inputs, 1, outputs, 1 ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, _EVIS_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( argmin, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c new file mode 100644 index 0000000..ec0213c --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c @@ -0,0 +1,433 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +typedef enum _internal_img_dim_e +{ + IMAGE = 0, + IMAGE_2D, +} internal_img_dim_e; + +#define _BATCH_NORM_KERNEL_SOURCE "batchnorm_single" + +#define STR(a) #a + +// Add kernel hashtable here +#define BATCH_NORM_HASH_KEY(IN_DTYPE, OUT_DTYPE, BRDCST, _image_2d) \ + ( ( IN_DTYPE << 16 ) | ( OUT_DTYPE << 3) | ( BRDCST << 1) | (_image_2d) ) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, BRDCST) \ + { BATCH_NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, BRDCST, IMAGE), \ + CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_brdcst"#BRDCST), \ + _BATCH_NORM_KERNEL_SOURCE} + +#define PACK_KERNEL_MAP_2D( IN_DTYPE, OUT_DTYPE, BRDCST) \ + { BATCH_NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, BRDCST, IMAGE_2D), \ + CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_brdcst"#BRDCST"_2D"), \ + _BATCH_NORM_KERNEL_SOURCE} + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _batch_norm_kernel_map[] = +{ + PACK_KERNEL_MAP(F16, F16, 0), + PACK_KERNEL_MAP(F16, I16, 0), + PACK_KERNEL_MAP(F16, U8, 0), + PACK_KERNEL_MAP(F16, I8, 0), + PACK_KERNEL_MAP(U8, U8, 0), + PACK_KERNEL_MAP(U8, F16, 0), + PACK_KERNEL_MAP(I8, I8, 0), + PACK_KERNEL_MAP(I8, F16, 0), + PACK_KERNEL_MAP(I16, I16, 0), + PACK_KERNEL_MAP(I16, F16, 0), + PACK_KERNEL_MAP(F16, F16, 1), + PACK_KERNEL_MAP(F16, I16, 1), + PACK_KERNEL_MAP(F16, U8, 1), + PACK_KERNEL_MAP(F16, I8, 1), + PACK_KERNEL_MAP(U8, U8, 1), + PACK_KERNEL_MAP(U8, F16, 1), + PACK_KERNEL_MAP(I8, I8, 1), + PACK_KERNEL_MAP(I8, F16, 1), + PACK_KERNEL_MAP(I16, I16, 1), + PACK_KERNEL_MAP(I16, F16, 1), + + PACK_KERNEL_MAP_2D(F16, F16, 0), + PACK_KERNEL_MAP_2D(F16, I16, 0), + PACK_KERNEL_MAP_2D(F16, U8 , 0), + PACK_KERNEL_MAP_2D(F16, I8 , 0), + PACK_KERNEL_MAP_2D(U8, U8 , 0), + PACK_KERNEL_MAP_2D(U8, F16, 0), + PACK_KERNEL_MAP_2D(I8, I8, 0), + PACK_KERNEL_MAP_2D(I8, F16, 0), + PACK_KERNEL_MAP_2D(I16, I16, 0), + PACK_KERNEL_MAP_2D(I16, F16, 0), + PACK_KERNEL_MAP_2D(F16, F16, 1), + PACK_KERNEL_MAP_2D(F16, I16, 1), + PACK_KERNEL_MAP_2D(F16, U8 , 1), + PACK_KERNEL_MAP_2D(F16, I8 , 1), + PACK_KERNEL_MAP_2D(U8, U8 , 1), + PACK_KERNEL_MAP_2D(U8, F16, 1), + PACK_KERNEL_MAP_2D(I8, I8, 1), + PACK_KERNEL_MAP_2D(I8, F16, 1), + PACK_KERNEL_MAP_2D(I16, I16, 1), + PACK_KERNEL_MAP_2D(I16, F16, 1), +}; + +/* + * Kernel params + */ +static vx_param_description_t _batch_norm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; 
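+// Parameter layout: five input tensors (the data tensor plus the per-channel
+// parameters, presumably mean/variance/gamma/beta; see the BATCHNORM_INPUT_*
+// indices used below), one output tensor, and the eps scalar at index 6
+// (SCALAR_INPUT_EPS).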
+#define _BATCH_NORM_PARAM_NUM _cnt_of_array( _batch_norm_kernel_param_def ) +#define SCALAR_INPUT_EPS (6) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_batch_norm_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ +#define _PACK_BATCH_NORM_KEY( IN_TYPE, OUT_TYPE ) \ + ( ( IN_TYPE << 16) | ( OUT_TYPE ) ) + + vsi_status status = VX_SUCCESS; + // Alignment with a power of two value. + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vx_tensor input = (vx_tensor)param[BATCHNORM_INPUT]; + vx_tensor output = (vx_tensor)param[BATCHNORM_INPUT_CNT]; + vsi_nn_kernel_tensor_attr_t *input_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t *output_shape = NULL; + float input_scale = 1.0f; + float input_tail = 0; + float output_scale = 1.0f; + float output_zp = 0; + uint32_t pack_key = 0; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); + CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = input_attr->dfp.fl; + if (fl > 0) + { + input_scale = 1.0f / (float) ((int64_t)1 << fl); + } + else + { + input_scale = (float)((int64_t)1 << -fl); + } + } + else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + input_scale = input_attr->asymm.scale; + input_tail = 0 - input_scale * (float)input_attr->asymm.zero_point; + } + + if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = output_attr->dfp.fl; + if (fl > 0) + { + output_scale = (float) ((int64_t)1 << fl); + } + else + { + output_scale = 1.0f / (float)((int64_t)1 << -fl); + } + } + else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + output_scale = 1.0f / output_attr->asymm.scale; + output_zp = (float)output_attr->asymm.zero_point; + } + + pack_key = _PACK_BATCH_NORM_KEY( input_attr->dtype, output_attr->dtype ); + + output_shape = output_attr->shape; + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = output_shape->data[1]; + gpu_param.global_size[2] = output_shape->size > 2 ? 
output_shape->data[2] : 1; + + switch( pack_key ) + { + case _PACK_BATCH_NORM_KEY( F16, F16 ): + case _PACK_BATCH_NORM_KEY( F16, I16 ): + case _PACK_BATCH_NORM_KEY( F16, U8 ): + case _PACK_BATCH_NORM_KEY( F16, I8 ): + case _PACK_BATCH_NORM_KEY( I16, I16 ): + case _PACK_BATCH_NORM_KEY( I16, F16 ): + case _PACK_BATCH_NORM_KEY( U8, U8 ): + case _PACK_BATCH_NORM_KEY( U8, F16 ): + case _PACK_BATCH_NORM_KEY( I8, I8 ): + case _PACK_BATCH_NORM_KEY( I8, F16 ): + { + gpu_dp_inst_t uniDatatoF32_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDatatoF32_1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniDatatoF32_0_4x4", &uniDatatoF32_0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniDatatoF32_1_4x4", &uniDatatoF32_1_4x4 ); + if (output_attr->dtype == F16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractHalf8_2x8 ); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); + } + status |= vsi_nn_kernel_gpu_add_param( node, + "input_scale", &input_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_tail", &input_tail ); + status |= vsi_nn_kernel_gpu_add_param( node, + "output_scale", &output_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "output_zp", &output_zp ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + if (input_attr) + { + vsi_nn_kernel_tensor_attr_release(&input_attr); + } + + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + +#undef _PACK_BATCH_NORM_KEY + return status; +} /* _batch_norm_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _batch_norm_kernel_map; + size_t kernel_map_size = _cnt_of_array( _batch_norm_kernel_map ); + 
vx_param_description_t * param_def = _batch_norm_kernel_param_def; + size_t param_def_size = _cnt_of_array( _batch_norm_kernel_param_def ); + vx_kernel_initialize_f initializer = _batch_norm_initializer; + uint32_t key = 0; + uint32_t i = 0; + uint32_t brdcst = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (inputs[BATCHNORM_INPUT]->attr.size[0] != 1 && inputs[BATCHNORM_INPUT_BETA]->attr.size[0] == 1) + { + brdcst = 1; + } + + key = BATCH_NORM_HASH_KEY(in_dtype, out_dtype, brdcst, image_2d); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_BATCH_NORM_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + float eps = 0; + + eps = vsi_nn_kernel_param_get_float32(params, "eps"); + + if ( (inputs[1]->attr.is_const && inputs[2]->attr.is_const) + || (inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16) + || (inputs[2]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16) + || (inputs[3]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16) + || (inputs[4]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32) ) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num < 3 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, image_2d); + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _BATCH_NORM_PARAM_NUM, + inputs, input_num, outputs, output_num ); + + node_params[SCALAR_INPUT_EPS] = vsi_nn_kernel_scalar_create( + graph, F32, &eps ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _BATCH_NORM_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_EPS] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( batchnorm_single, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/cast_evis.c b/src/tim/vx/internal/src/kernel/evis/cast_evis.c new file mode 100644 index 0000000..2d25883 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/cast_evis.c @@ -0,0 +1,319 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +typedef enum _internal_img_dim_e +{ + IMAGE = 0, + IMAGE_2D, +} internal_img_dim_e; + +#define _CAST_KERNEL_SOURCE "cast" + +#define STR(a) #a +// Add kernel hashtable here +#define CAST_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (_image_2d)) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, SOURCE ) \ + { CAST_HASH_KEY( IN_DTYPE, OUT_DTYPE, IMAGE ), \ + CVIVANTE_NAMESPACE("evis.cast_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + SOURCE } + +#define PACK_KERNEL_MAP_2D( IN_DTYPE, OUT_DTYPE, SOURCE ) \ + { CAST_HASH_KEY( IN_DTYPE, OUT_DTYPE, IMAGE_2D ), \ + CVIVANTE_NAMESPACE("evis.cast_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _cast_kernel_map[] = +{ + PACK_KERNEL_MAP( F16, I16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( F16, I8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( F16, U8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( F16, BOOL8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I16, F16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I16, I8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I16, U8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I16, BOOL8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I8, F16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I8, I16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I8, U8, 
_CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I8, BOOL8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U8, F16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U8, I16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U8, I8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U8, BOOL8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( F32, I16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( F32, I8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( F32, U8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I32, I16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I32, I8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I32, U8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( F16, I16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( F16, I8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( F16, U8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( F16, BOOL8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I16, F16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I16, I8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I16, U8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I16, BOOL8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I8, F16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I8, I16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I8, U8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I8, BOOL8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( U8, F16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( U8, I16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( U8, I8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( U8, BOOL8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( F32, I16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( F32, I8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( F32, U8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I32, I16, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I32, I8, _CAST_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I32, U8, _CAST_KERNEL_SOURCE ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _cast_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CAST_PARAM_NUM _cnt_of_array( _cast_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_cast_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_int_array_t * out_shape = NULL; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = out_shape->size < 3 ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
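+        /* use the depth dimension when the output is 3D, otherwise process a single slice */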
out_shape->data[2] : 1; + + if ((F32 == input_attr->dtype) || (I32 == input_attr->dtype)) + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + } + else + { + gpu_dp_inst_t uniDataConvert_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniDataConvert_2x8", &uniDataConvert_2x8 ); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + SAFE_FREE_TENSOR_ATTR(input_attr); + + return status; +} /* _cast_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _cast_kernel_map; + size_t kernel_map_size = _cnt_of_array( _cast_kernel_map ); + vx_param_description_t * param_def = _cast_kernel_param_def; + size_t param_def_size = _cnt_of_array( _cast_kernel_param_def ); + vx_kernel_initialize_f initializer = _cast_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = CAST_HASH_KEY( in_dtype, out_dtype, image_2d ); + + for( i = 0; i < kernel_map_size; i++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CAST_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || 
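+        /* a 3D tensor whose depth is 1 can also be handled by the 2D shader variant */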
inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CAST_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CAST_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( cast, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/clip_evis.c b/src/tim/vx/internal/src/kernel/evis/clip_evis.c new file mode 100644 index 0000000..78d52fc --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/clip_evis.c @@ -0,0 +1,613 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "utils/vsi_nn_dtype_util.h" + +__BEGIN_DECLS + + +#define _CLIP_KERNEL_SOURCE(_input_type) "clip_"#_input_type + +#define STR(a) #a +// Add kernel hashtable here +#define CLIP_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (_image_2d)) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { CLIP_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0 ), \ + CVIVANTE_NAMESPACE("evis.clip_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _CLIP_KERNEL_SOURCE(IN_DTYPE) } + +#define PACK_KERNEL_MAP_2D( IN_DTYPE, OUT_DTYPE ) \ + { CLIP_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1 ), \ + CVIVANTE_NAMESPACE("evis.clip_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _CLIP_KERNEL_SOURCE(IN_DTYPE) } + + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _clip_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP(F16, F16), + PACK_KERNEL_MAP(F16, I16), + PACK_KERNEL_MAP(F16, I8), + PACK_KERNEL_MAP(F16, U8), + PACK_KERNEL_MAP(I16, F16), + PACK_KERNEL_MAP(I8, F16), + PACK_KERNEL_MAP(U8, F16), + PACK_KERNEL_MAP(I16, I16), + PACK_KERNEL_MAP(I8, I8), + PACK_KERNEL_MAP(U8, U8), + PACK_KERNEL_MAP_2D(F16, F16), + PACK_KERNEL_MAP_2D(F16, I16), + PACK_KERNEL_MAP_2D(F16, I8), + PACK_KERNEL_MAP_2D(F16, U8), + PACK_KERNEL_MAP_2D(I16, F16), + PACK_KERNEL_MAP_2D(I8, F16), + PACK_KERNEL_MAP_2D(U8, F16), + PACK_KERNEL_MAP_2D(I16, I16), + PACK_KERNEL_MAP_2D(I8, I8), + PACK_KERNEL_MAP_2D(U8, U8), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _clip_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CLIP_PARAM_NUM _cnt_of_array( _clip_kernel_param_def ) + +#define SCALAR_MIN_VALUE (2) +#define SCALAR_MAX_VALUE (3) + +#define MAX_MULTIPLIER_NUM (65535) +#define MAX_POST_SHIFT_BITS (31) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_clip_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_int_array_t * out_shape = NULL; + vsi_nn_kernel_dtype_e input_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; + float minVal = 1.0f; + float maxVal = 1.0f; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + int32_t output_ZP = 0; + int32_t input_ZP = 0; + int32_t srcFixPointPos = 0; + int32_t dstFixPointPos = 0; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + out_shape = output_attr->shape; + 
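+    /* The clip bounds arrive as F32 scalars (SCALAR_MIN_VALUE / SCALAR_MAX_VALUE).
+     * Below they are re-encoded to match the output format (packed FP16/BF16 pairs for
+     * float outputs, DFP or affine-quantized values for integer outputs) so that the
+     * shader can clamp directly in its native data type. */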
vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MIN_VALUE], &(minVal)); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MAX_VALUE], &(maxVal)); + input_dtype = input_attr->dtype; + output_dtype = output_attr->dtype; + + if ((F16 == input_dtype) + || (I16 == input_dtype) + || (BF16 == input_dtype) + ) + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + gpu_param.dim = out_shape->size < 3 ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + + if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + { + srcFixPointPos = input_attr->dfp.fl; + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant) + { + input_ZP = input_attr->asymm.zero_point; + scaleIn = input_attr->asymm.scale; + } + + if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) + { + dstFixPointPos = output_attr->dfp.fl; + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) + { + output_ZP = output_attr->asymm.zero_point; + scaleOut = output_attr->asymm.scale; + } + + if ((F16 == input_dtype && + (F16 == output_dtype || I8 == output_dtype + || I16 == output_dtype || U8 == output_dtype)) + || (BF16 == input_dtype && BF16 == output_dtype) + ) + { + uint16_t minTmp = 0; + uint16_t maxTmp = 0; + uint32_t packedMin = 0; + uint32_t packedMax = 0; + uint32_t packedMinData_FP16[4]; + uint32_t packedMaxData_FP16[4]; + uint32_t i; + + if (BF16 == input_dtype) + { + minTmp = vsi_nn_Fp32ToBFp16(minVal); + maxTmp = vsi_nn_Fp32ToBFp16(maxVal); + } + else + { + minTmp = vsi_nn_Fp32toFp16(minVal); + maxTmp = vsi_nn_Fp32toFp16(maxVal); + } + + packedMin = (minTmp << 16) | (minTmp); + packedMax = (maxTmp << 16) | (maxTmp); + + for (i = 0;i < 4; i++) + { + packedMinData_FP16[i] = packedMin; + packedMaxData_FP16[i] = packedMax; + } + + status = vsi_nn_kernel_gpu_add_param( node, "packedMinData_FP16", packedMinData_FP16); + status |= vsi_nn_kernel_gpu_add_param( node, "packedMaxData_FP16", packedMaxData_FP16); + if (I8 == output_dtype || I16 == output_dtype) + { + gpu_dp_inst_t uniConvertF16toInt_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + if (dstFixPointPos <= 0) + { + uniConvertF16toInt_2x8.data[7] |= vsi_nn_min((-dstFixPointPos) & 0x1F, MAX_POST_SHIFT_BITS); + } + else + { + uint32_t lo_part = vsi_nn_min(((int64_t)1 << dstFixPointPos), MAX_MULTIPLIER_NUM); + uint32_t multiplier = lo_part; + uint32_t j = 0; + + for (j = 0; j < 8; j++) + { + uniConvertF16toInt_2x8.data[j + 8] = multiplier; + } + } + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertF16toInt_2x8", &uniConvertF16toInt_2x8); + } + else if (U8 == output_dtype) + { + uint32_t multAndoutZP[2] = {0}; + uint16_t M0 = 0; + int8_t postShift = 0; + gpu_dp_inst_t uniDataMulAndPostShift_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111119, // BSelt + 
0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + vsi_nn_GetFP32MultiAndPostShift(scaleIn / scaleOut, &M0, &postShift); + + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = (uint32_t)(output_ZP << postShift ); + + uniDataMulAndPostShift_2x8.data[7] |= (postShift & 0x1F); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP); + status |= vsi_nn_kernel_gpu_add_param( node, "uniDataMulAndPostShift_2x8", &uniDataMulAndPostShift_2x8); + + } + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (I8 == input_dtype + && (I8 == output_dtype || F16 == output_dtype)) + { + int32_t packedMin = 0; + int32_t packedMax = 0; + int32_t packedMinData[4]; + int32_t packedMaxData[4]; + gpu_dp_inst_t uniConvertIntegerLo_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertIntegerHi_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + if (srcFixPointPos > dstFixPointPos) + { + int32_t postshift = vsi_nn_min(srcFixPointPos - dstFixPointPos, MAX_POST_SHIFT_BITS); + + uniConvertIntegerLo_2x8.data[7] |= (postshift & 0x1F); + uniConvertIntegerHi_2x8.data[7] |= (postshift & 0x1F); + } + else + { + uint32_t multiplier = vsi_nn_min((int64_t)1 << (dstFixPointPos - srcFixPointPos), MAX_MULTIPLIER_NUM); + uint32_t i = 0; + + for (i = 0; i < 8; i++) + { + uniConvertIntegerLo_2x8.data[i + 8] = multiplier; + uniConvertIntegerHi_2x8.data[i + 8] = multiplier; + } + } + + if (F16 == output_dtype) + { + uint16_t minData = 0; + uint16_t maxData = 0; + minData = vsi_nn_Fp32toFp16(minVal); + maxData = vsi_nn_Fp32toFp16(maxVal); + packedMin = (minData << 16) | (minData); + packedMax = (maxData << 16) | (maxData); + } + else + { + uint8_t minData = 0; + uint8_t maxData = 0; + minData = (uint8_t)vsi_nn_Fp32ToDFP(minVal, (int8_t)dstFixPointPos, VSI_NN_TYPE_INT8); + maxData = (uint8_t)vsi_nn_Fp32ToDFP(maxVal, (int8_t)dstFixPointPos, VSI_NN_TYPE_INT8); + packedMin = (minData << 24) | (minData << 16) | (minData << 8) | (minData); + packedMax = (maxData << 24) | (maxData << 16) | (maxData << 8) | (maxData); + } + + packedMinData[0] = packedMinData[1] = packedMinData[2] = packedMinData[3] = packedMin; + packedMaxData[0] = packedMaxData[1] = packedMaxData[2] = packedMaxData[3] = packedMax; + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertIntegerLo_2x8", &uniConvertIntegerLo_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertIntegerHi_2x8", &uniConvertIntegerHi_2x8); + + status |= vsi_nn_kernel_gpu_add_param( node, "packedMinData", packedMinData); + status |= vsi_nn_kernel_gpu_add_param( node, "packedMaxData", packedMaxData); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (I16 == input_dtype + && (I16 == output_dtype || F16 == output_dtype)) + { + uint16_t minData = 0; + uint16_t maxData = 0; + int32_t packedMin = (minData << 16) | 
(minData); + int32_t packedMax = (maxData << 16) | (maxData); + int32_t packedMinData[4]; + int32_t packedMaxData[4]; + gpu_dp_inst_t uniConvertIntegerLo_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + if (F16 == output_dtype) + { + minData = vsi_nn_Fp32toFp16(minVal); + maxData = vsi_nn_Fp32toFp16(maxVal); + } + else + { + minData = (uint16_t)vsi_nn_Fp32ToDFP(minVal, (int8_t)dstFixPointPos, VSI_NN_TYPE_INT16); + maxData = (uint16_t)vsi_nn_Fp32ToDFP(maxVal, (int8_t)dstFixPointPos, VSI_NN_TYPE_INT16); + } + + packedMin = (minData << 16) | (minData); + packedMax = (maxData << 16) | (maxData); + + packedMinData[0] = packedMinData[1] = packedMinData[2] = packedMinData[3] = packedMin; + packedMaxData[0] = packedMaxData[1] = packedMaxData[2] = packedMaxData[3] = packedMax; + + if (srcFixPointPos > dstFixPointPos) + { + int32_t postshift = vsi_nn_min(srcFixPointPos - dstFixPointPos, MAX_POST_SHIFT_BITS); + + uniConvertIntegerLo_2x8.data[7] |= (postshift & 0x1F); + } + else + { + uint32_t multiplier = vsi_nn_min((int64_t)1 << (dstFixPointPos - srcFixPointPos), MAX_MULTIPLIER_NUM); + uint32_t i = 0; + + for (i = 0; i < 8; i++) + { + uniConvertIntegerLo_2x8.data[i + 8] = multiplier; + } + } + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertIntegerLo_2x8", &uniConvertIntegerLo_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "packedMinData", packedMinData); + status |= vsi_nn_kernel_gpu_add_param( node, "packedMaxData", packedMaxData); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (U8 == input_dtype + && (U8 == output_dtype || F16 == output_dtype)) + { + int32_t packedMin = 0; + int32_t packedMax = 0; + int32_t packedMinData[4]; + int32_t packedMaxData[4]; + float uint8Scale = scaleIn / scaleOut; + uint16_t M0 = 0; + int8_t postShift = 0; + uint32_t multAndoutZP[2] = {0}; + gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x1b1a1918, 0x1f1e1d1c, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + vsi_nn_GetFP32MultiAndPostShift(uint8Scale, &M0, &postShift); + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = (uint32_t)((output_ZP << postShift) - input_ZP * M0); + + uniU8MulAndPostShift_Lo_2x8.data[7] |= (postShift & 0x1F); + uniU8MulAndPostShift_Hi_2x8.data[7] |= (postShift & 0x1F); + + if (F16 == output_dtype) + { + uint16_t minData = 0; + uint16_t maxData = 0; + minData = vsi_nn_Fp32toFp16(minVal); + maxData = vsi_nn_Fp32toFp16(maxVal); + packedMin = (minData << 16) | (minData); + packedMax = (maxData << 16) | (maxData); + } + else + { + uint8_t minData = 0; + uint8_t maxData = 0; + minData = (uint8_t)vsi_nn_Fp32ToAffine(minVal, scaleOut, output_ZP, VSI_NN_TYPE_UINT8); + 
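+                /* affine quantization of the bound, roughly q = clamp(round(v / scaleOut) + output_ZP, 0, 255) */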
maxData = (uint8_t)vsi_nn_Fp32ToAffine(maxVal, scaleOut, output_ZP, VSI_NN_TYPE_UINT8); + packedMin = (minData << 24) | (minData << 16) | (minData << 8) | (minData); + packedMax = (maxData << 24) | (maxData << 16) | (maxData << 8) | (maxData); + } + + packedMinData[0] = packedMinData[1] = packedMinData[2] = packedMinData[3] = packedMin; + packedMaxData[0] = packedMaxData[1] = packedMaxData[2] = packedMaxData[3] = packedMax; + + status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP); + + status |= vsi_nn_kernel_gpu_add_param( node, "packedMinData", packedMinData); + status |= vsi_nn_kernel_gpu_add_param( node, "packedMaxData", packedMaxData); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + SAFE_FREE_TENSOR_ATTR(input_attr); + return status; + +} /* _clip_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _clip_kernel_map; + size_t kernel_map_size = _cnt_of_array( _clip_kernel_map ); + vx_param_description_t * param_def = _clip_kernel_param_def; + size_t param_def_size = _cnt_of_array( _clip_kernel_param_def ); + vx_kernel_initialize_f initializer = _clip_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = CLIP_HASH_KEY( in_dtype, out_dtype, image_2d ); + + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CLIP_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + float min_value = vsi_nn_kernel_param_get_float32( params, "min_value" ); + float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || 
inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CLIP_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_MIN_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &min_value ); + node_params[SCALAR_MAX_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &max_value ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CLIP_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MIN_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_VALUE] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( clip, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c new file mode 100644 index 0000000..fd32271 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c @@ -0,0 +1,527 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/** Comparisons Kernel internal type */ +typedef enum +{ + COMP_GREAT = VSI_NN_RELATIONAL_OPS_GREAT, + COMP_GREAT_EQUAL = VSI_NN_RELATIONAL_OPS_GREAT_EQUAL, + COMP_LESS = VSI_NN_RELATIONAL_OPS_LESS, + COMP_LESS_EQUAL = VSI_NN_RELATIONAL_OPS_LESS_EQUAL, + COMP_NOT_EQUAL = VSI_NN_RELATIONAL_OPS_NOT_EQUAL, + COMP_EQUAL = VSI_NN_RELATIONAL_OPS_EQUAL, +} relational_type_e; + +/* + * Define kernel meta. 
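+ * The lookup key packs the relational op, both input dtypes, the output dtype and a
+ * 2D flag into a single 32-bit value; for example HASH_COMPARISONS_KEY(COMP_LESS, F16,
+ * F16, BOOL8, 1) resolves to the evis.less_F16F16toBOOL8_2D shader.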
+ */ +#define HASH_COMPARISONS_KEY(_type, _input0_type, _input1_type, _output_type, _image_2d) \ + ((_type << 28) | (_input0_type << 20) | (_input1_type << 12) | (_output_type << 2) | (_image_2d)) + +#define KERNEL_SOURCE_2D "relational_ops_2d", +#define KERNEL_SOURCE_3D "relational_ops_3d", + +#define HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, SRC0_TYPE, SRC1_TYPE) \ + CVIVANTE_NAMESPACE("evis."#FUNC_NAME"_"#SRC0_TYPE#SRC1_TYPE"toBOOL8") + +#define COMPARISONS_KERNELS(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ + { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 0), \ + HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, SRC0_TYPE, SRC1_TYPE), \ + SOURCE }, + +#define HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, SRC0_TYPE, SRC1_TYPE) \ + CVIVANTE_NAMESPACE("evis."#FUNC_NAME"_"#SRC0_TYPE#SRC1_TYPE"toBOOL8_2D") + +#define COMPARISONS_KERNELS_2D(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ + { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 1), \ + HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, SRC0_TYPE, SRC1_TYPE), \ + SOURCE }, + +#define COMPARISONS_KERNELS_INT8(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ + { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 0), \ + HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, I8, I8), \ + SOURCE }, + +#define COMPARISONS_KERNELS_INT8_2D(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ + { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 1), \ + HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, I8, I8), \ + SOURCE }, + +#define COMPARISONS_KERNELS_HALF(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ + { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 0), \ + HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, F16, F16), \ + SOURCE }, + +#define COMPARISONS_KERNELS_HALF_2D(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ + { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 1), \ + HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, F16, F16), \ + SOURCE }, + +#define LESS_OP less +#define GREAT_OP great +#define LESS_EQUAL_OP less_equal +#define GREAT_EQUAL_OP great_equal +#define EQUAL_OP equal +#define NOT_EQUAL_OP not_equal + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } _comparisons_evis_kernel_map[] = +{ + COMPARISONS_KERNELS(LESS_OP, COMP_LESS, F16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_OP, COMP_LESS, F16, I16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_OP, COMP_LESS, F16, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_OP, COMP_LESS, F16, I8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_OP, COMP_LESS, I16, I16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_OP, COMP_LESS, I16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_OP, COMP_LESS, U8, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_OP, COMP_LESS, U8, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_OP, COMP_LESS, I8, I8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_OP, COMP_LESS, I8, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_HALF(LESS_OP, COMP_LESS, BF16, BF16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_INT8(LESS_OP, COMP_LESS, BOOL8, BOOL8, KERNEL_SOURCE_3D) + + COMPARISONS_KERNELS_2D(LESS_OP, COMP_LESS, F16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_OP, COMP_LESS, F16, I16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_OP, COMP_LESS, F16, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_OP, COMP_LESS, F16, I8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_OP, COMP_LESS, I16, I16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_OP, COMP_LESS, I16, F16, KERNEL_SOURCE_2D) + 
COMPARISONS_KERNELS_2D(LESS_OP, COMP_LESS, U8, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_OP, COMP_LESS, U8, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_OP, COMP_LESS, I8, I8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_OP, COMP_LESS, I8, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_HALF_2D(LESS_OP, COMP_LESS, BF16, BF16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_INT8_2D(LESS_OP, COMP_LESS, BOOL8, BOOL8, KERNEL_SOURCE_2D) + + COMPARISONS_KERNELS(GREAT_OP, COMP_GREAT, F16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_OP, COMP_GREAT, F16, I16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_OP, COMP_GREAT, F16, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_OP, COMP_GREAT, F16, I8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_OP, COMP_GREAT, I16, I16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_OP, COMP_GREAT, I16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_OP, COMP_GREAT, U8, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_OP, COMP_GREAT, U8, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_OP, COMP_GREAT, I8, I8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_OP, COMP_GREAT, I8, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_HALF(GREAT_OP, COMP_GREAT, BF16, BF16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_INT8(GREAT_OP, COMP_GREAT, BOOL8, BOOL8, KERNEL_SOURCE_3D) + + COMPARISONS_KERNELS_2D(GREAT_OP, COMP_GREAT, F16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_OP, COMP_GREAT, F16, I16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_OP, COMP_GREAT, F16, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_OP, COMP_GREAT, F16, I8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_OP, COMP_GREAT, I16, I16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_OP, COMP_GREAT, I16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_OP, COMP_GREAT, U8, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_OP, COMP_GREAT, U8, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_OP, COMP_GREAT, I8, I8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_OP, COMP_GREAT, I8, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_HALF_2D(GREAT_OP, COMP_GREAT, BF16, BF16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_INT8_2D(GREAT_OP, COMP_GREAT, BOOL8, BOOL8, KERNEL_SOURCE_2D) + + COMPARISONS_KERNELS(LESS_EQUAL_OP, COMP_LESS_EQUAL, F16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_EQUAL_OP, COMP_LESS_EQUAL, F16, I16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_EQUAL_OP, COMP_LESS_EQUAL, F16, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_EQUAL_OP, COMP_LESS_EQUAL, F16, I8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_EQUAL_OP, COMP_LESS_EQUAL, I16, I16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_EQUAL_OP, COMP_LESS_EQUAL, I16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_EQUAL_OP, COMP_LESS_EQUAL, U8, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_EQUAL_OP, COMP_LESS_EQUAL, U8, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_EQUAL_OP, COMP_LESS_EQUAL, I8, I8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(LESS_EQUAL_OP, COMP_LESS_EQUAL, I8, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_HALF(LESS_EQUAL_OP, COMP_LESS_EQUAL, BF16, BF16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_INT8(LESS_EQUAL_OP, COMP_LESS_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_3D) + + COMPARISONS_KERNELS_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, F16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, F16, I16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, F16, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, F16, I8, 
KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, I16, I16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, I16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, U8, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, U8, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, I8, I8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, I8, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_HALF_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, BF16, BF16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_INT8_2D(LESS_EQUAL_OP, COMP_LESS_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_2D) + + COMPARISONS_KERNELS(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, F16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, F16, I16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, F16, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, F16, I8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, I16, I16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, I16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, U8, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, U8, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, I8, I8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, I8, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_HALF(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, BF16, BF16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_INT8(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_3D) + + COMPARISONS_KERNELS_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, F16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, F16, I16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, F16, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, F16, I8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, I16, I16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, I16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, U8, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, U8, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, I8, I8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, I8, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_HALF_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, BF16, BF16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_INT8_2D(GREAT_EQUAL_OP, COMP_GREAT_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_2D) + + COMPARISONS_KERNELS(EQUAL_OP, COMP_EQUAL, F16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(EQUAL_OP, COMP_EQUAL, F16, I16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(EQUAL_OP, COMP_EQUAL, F16, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(EQUAL_OP, COMP_EQUAL, F16, I8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(EQUAL_OP, COMP_EQUAL, I16, I16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(EQUAL_OP, COMP_EQUAL, I16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(EQUAL_OP, COMP_EQUAL, U8, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(EQUAL_OP, COMP_EQUAL, U8, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(EQUAL_OP, COMP_EQUAL, I8, I8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(EQUAL_OP, COMP_EQUAL, I8, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_HALF(EQUAL_OP, COMP_EQUAL, BF16, BF16, 
KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_INT8(EQUAL_OP, COMP_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_3D) + + COMPARISONS_KERNELS_2D(EQUAL_OP, COMP_EQUAL, F16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(EQUAL_OP, COMP_EQUAL, F16, I16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(EQUAL_OP, COMP_EQUAL, F16, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(EQUAL_OP, COMP_EQUAL, F16, I8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(EQUAL_OP, COMP_EQUAL, I16, I16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(EQUAL_OP, COMP_EQUAL, I16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(EQUAL_OP, COMP_EQUAL, U8, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(EQUAL_OP, COMP_EQUAL, U8, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(EQUAL_OP, COMP_EQUAL, I8, I8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(EQUAL_OP, COMP_EQUAL, I8, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_HALF_2D(EQUAL_OP, COMP_EQUAL, BF16, BF16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_INT8_2D(EQUAL_OP, COMP_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_2D) + + COMPARISONS_KERNELS(NOT_EQUAL_OP, COMP_NOT_EQUAL, F16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(NOT_EQUAL_OP, COMP_NOT_EQUAL, F16, I16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(NOT_EQUAL_OP, COMP_NOT_EQUAL, F16, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(NOT_EQUAL_OP, COMP_NOT_EQUAL, F16, I8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(NOT_EQUAL_OP, COMP_NOT_EQUAL, I16, I16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(NOT_EQUAL_OP, COMP_NOT_EQUAL, I16, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(NOT_EQUAL_OP, COMP_NOT_EQUAL, U8, U8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(NOT_EQUAL_OP, COMP_NOT_EQUAL, U8, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(NOT_EQUAL_OP, COMP_NOT_EQUAL, I8, I8, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS(NOT_EQUAL_OP, COMP_NOT_EQUAL, I8, F16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_HALF(NOT_EQUAL_OP, COMP_NOT_EQUAL, BF16, BF16, KERNEL_SOURCE_3D) + COMPARISONS_KERNELS_INT8(NOT_EQUAL_OP, COMP_NOT_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_3D) + + COMPARISONS_KERNELS_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, F16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, F16, I16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, F16, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, F16, I8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, I16, I16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, I16, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, U8, U8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, U8, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, I8, I8, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, I8, F16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_HALF_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, BF16, BF16, KERNEL_SOURCE_2D) + COMPARISONS_KERNELS_INT8_2D(NOT_EQUAL_OP, COMP_NOT_EQUAL, BOOL8, BOOL8, KERNEL_SOURCE_2D) +}; + +#undef LESS_OP +#undef GREAT_OP +#undef LESS_EQUAL_OP +#undef GREAT_EQUAL_OP +#undef EQUAL_OP +#undef NOT_EQUAL_OP + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_comparisons_initializer) + ( + vsi_nn_kernel_node_t node, + 
const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL, NULL, NULL }; + vsi_int_array_t * out_shape = NULL; + float input0Scale = 1.0f; + float input0Tail = 0; + float input1Scale = 1.0f; + float input1Tail = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_shape = attr[2]->shape; + + if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = attr[0]->dfp.fl; + if (fl > 0) + { + input0Scale = 1.0f / (float) ((int64_t)1 << fl); + } + else + { + input0Scale = (float)((int64_t)1 << -fl); + } + } + else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + input0Scale = attr[0]->asymm.scale; + input0Tail = 0 - attr[0]->asymm.zero_point * input0Scale; + } + + if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = attr[1]->dfp.fl; + if (fl > 0) + { + input1Scale = 1.0f / (float) ((int64_t)1 << fl); + } + else + { + input1Scale = (float)((int64_t)1 << -fl); + } + } + else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + input1Scale = attr[1]->asymm.scale; + input1Tail = 0 - attr[1]->asymm.zero_point * input1Scale; + } + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + if (1) + { + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDatatoFp32Part0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDatatoFp32Part1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniDatatoFp32Part0_4x4", &uniDatatoFp32Part0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniDatatoFp32Part1_4x4", &uniDatatoFp32Part1_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input0Scale", &input0Scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input0Tail", &input0Tail ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input1Scale", &input1Scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input1Tail", &input1Tail ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} /* _comparisons_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + int32_t operation, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input0_dtype; + vsi_nn_kernel_dtype_e input1_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_COMPARISONS_KEY( operation, input0_dtype, input1_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(_comparisons_evis_kernel_map); i ++ ) + { + if( _comparisons_evis_kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(_comparisons_evis_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _comparisons_evis_kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _comparisons_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + _comparisons_evis_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, 
VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _comparisons_evis_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + int32_t operation = 0; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + operation = vsi_nn_kernel_param_get_int32( params, "operation" ); + + image_2d = (outputs[0]->attr.dim_num == 2); + status = _query_kernel( inputs, outputs, operation, image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( node_params, _EVIS_PARAM_NUM, + inputs, 2, outputs, 1 ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, _EVIS_PARAM_NUM ); + } + } + return node; +} /* _setup() */ + +REGISTER_BACKEND_EVIS( relational_ops, _setup ) + +__END_DECLS diff --git a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c new file mode 100644 index 0000000..9a57aee --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c @@ -0,0 +1,339 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
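+ * These kernels implement the CRD (column-row-depth) ordering of depth-to-space.
+ * HASH_DEPTH2SPACE_CRD_KEY packs the input and output dtypes into the lookup key,
+ * and block_size is passed to the shader as an I32 scalar.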
+ */ +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toU8") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toI8") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toI16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toF16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toF16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toF16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI8") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toF16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toU8") + +#define KERNEL_SOURCE_1 "depth2space_crd" + +// Add kernel hashtable here +#define HASH_DEPTH2SPACE_CRD_KEY(_input0_type, _output_type, _quant_type) \ + ((_input0_type << 24) | (_output_type << 16) | (_quant_type << 8)) + +#define TENSOR_DEPTH2SPACE_CRD_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_DEPTH2SPACE_CRD_KEY(IN0_TYPE, OUT_TYPE, 0), \ + VX_KERNEL_NAME_DEPTH2SPACE_CRD_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } depth2space_crd_map[] = +{ + TENSOR_DEPTH2SPACE_CRD_KERNELS(U8, U8, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_KERNELS(I8, I8, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_KERNELS(I16, I16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_KERNELS(F16, F16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_KERNELS(I8, F16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_KERNELS(I16, F16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_KERNELS(F16, I8, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_KERNELS(F16, I16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_KERNELS(U8, F16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_KERNELS(F16, U8, KERNEL_SOURCE_1) +}; + +/* + * Kernel params + */ +static vx_param_description_t _depth2space_crd_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _DEPTH2SPACE_CRD_PARAM_NUM _cnt_of_array( _depth2space_crd_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + uint32_t output_dims = 0; + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + int32_t output_width = 0; + int32_t output_height = 0; + int32_t output_chn = 0; + int32_t src0ZP = 0; + float src0Scale = 0; + int32_t dstZP = 0; + float dstScale = 0; + + uint32_t pack_key = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + 
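+    /* attr[1] below describes the output tensor; its scale and zero-point, together with
+     * the input's, feed the multiplier/post-shift constants computed further down. */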
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + src0ZP = attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + dstZP = attr[1]->asymm.zero_point; + dstScale = attr[1]->asymm.scale; + if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + } + else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + src0Scale = 1; + } + + if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[1]->dfp.fl > 0) + { + dstScale = (float)((int64_t)1 << attr[1]->dfp.fl); + } + else + { + dstScale = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); + } + dstScale = 1.0f/dstScale; + } + else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + dstScale = 1; + } + + output_dims = (uint32_t)attr[1]->shape->size; + output_width = attr[1]->shape->data[0]; + output_height = attr[1]->shape->data[1]; + output_chn = output_dims > 2 ? attr[1]->shape->data[2] : 1; + + shaderParam.global_scale[0] = 1; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((output_width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = output_height; + shaderParam.global_size[2] = output_chn; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE) \ + (IN0_TYPE | (OUT_TYPE << 8)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype); + + { + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP0[2] = {0}; + gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, F16): + case _PACK_SELECT_KEY( I8, F16): + case _PACK_SELECT_KEY( I16, F16): + case _PACK_SELECT_KEY( F16, U8): + case _PACK_SELECT_KEY( F16, I8): + case _PACK_SELECT_KEY( F16, I16): + case _PACK_SELECT_KEY( U8, U8): + case _PACK_SELECT_KEY( I8, I8): + case _PACK_SELECT_KEY( I16, I16): + { + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift); + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0); + + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } +#undef _PACK_SELECT_KEY + + CHECK_STATUS_FAIL_GOTO(status, OnError ); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const 
vsi_nn_kernel_param_t * params + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, 0 ); + + for( i = 0; i < _cnt_of_array(depth2space_crd_map); i ++ ) + { + if( depth2space_crd_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(depth2space_crd_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", depth2space_crd_map[i].function_name ); + kernel->info.parameters = _depth2space_crd_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _depth2space_crd_kernel_param_def ); + kernel->info.initialize = _depth2space_crd_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + depth2space_crd_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + depth2space_crd_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_DEPTH2SPACE_CRD_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + vsi_nn_kernel_node_pack_io( tmp_params, _DEPTH2SPACE_CRD_PARAM_NUM, inputs, 1, outputs, 1 ); + tmp_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _DEPTH2SPACE_CRD_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[2] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( depth2space_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c new file mode 100644 index 0000000..9c8cbab --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c @@ -0,0 +1,779 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + KN = 0, + K40, + K56, + K64, + K80, + K88, +} _internal_kernel_size_e; + + +typedef enum +{ + D0 = 0, + D1, + D2, +} _internal_dilation_e; + +typedef enum +{ + PARAM_INPUT = 0, + PARAM_WEIGHT, + PARAM_BIAS, + PARAM_OUTPUT, + PARAM_PAD, + PARAM_STRIDE, + PARAM_DILATION, +} param_index_e; + +#define _DEPTHWISE_CONV1D_KERNEL_SOURCE0 "depthwise_conv1d_src0" +#define _DEPTHWISE_CONV1D_KERNEL_SOURCE1 "depthwise_conv1d_src1" +#define _DEPTHWISE_CONV1D_KERNEL_SOURCE2 "depthwise_conv1d_src2" +#define _DEPTHWISE_CONV1D_KERNEL_SOURCE3 "depthwise_conv1d_src3" + +#define STR(a) #a + +// Add kernel hashtable here +#define DEPTHWISE_CONV1D_HASH_KEY( SRC_TYPE, WEIGHT_TYPE, DST_TYPE, KERNEL_SIZE, DILATION ) \ + ((DILATION << 23) | (KERNEL_SIZE << 15) | (WEIGHT_TYPE << 10) | ( SRC_TYPE << 5 ) | ( DST_TYPE )) + +#define PACK_KERNEL_MAP( SRC_TYPE, DST_TYPE, WEIGHT_TYPE, KERNEL_SIZE, DILATION, SOURCE) \ + { DEPTHWISE_CONV1D_HASH_KEY( SRC_TYPE, WEIGHT_TYPE, DST_TYPE, KERNEL_SIZE, DILATION ),\ + CVIVANTE_NAMESPACE("evis.vxDW_Conv1D_"STR(SRC_TYPE)"to"STR(DST_TYPE)"_"STR(KERNEL_SIZE)"_"STR(DILATION)) \ + , SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _depthwise_conv1d_kernel_map[] = +{ + PACK_KERNEL_MAP( U8, U8, U8, KN, D1, _DEPTHWISE_CONV1D_KERNEL_SOURCE0 ), + PACK_KERNEL_MAP( U8, U8, U8, KN, D2, _DEPTHWISE_CONV1D_KERNEL_SOURCE0 ), + PACK_KERNEL_MAP( U8, U8, U8, K40, D1, _DEPTHWISE_CONV1D_KERNEL_SOURCE1 ), + PACK_KERNEL_MAP( U8, U8, U8, K56, D1, _DEPTHWISE_CONV1D_KERNEL_SOURCE1 ), + PACK_KERNEL_MAP( U8, U8, U8, K64, D1, _DEPTHWISE_CONV1D_KERNEL_SOURCE2 ), + PACK_KERNEL_MAP( U8, U8, U8, K80, D1, _DEPTHWISE_CONV1D_KERNEL_SOURCE2 ), + PACK_KERNEL_MAP( U8, U8, U8, K88, D2, _DEPTHWISE_CONV1D_KERNEL_SOURCE3 ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _depthwise_conv1d_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _DEPTHWISE_CONV1D_PARAM_NUM _cnt_of_array( _depthwise_conv1d_kernel_param_def ) + +static _internal_kernel_size_e get_kernel_size(uint32_t k_size, uint32_t dilation, + uint32_t stride, uint32_t evis_version) +{ +#define 
_PACK_SELECT_KEY( kernel_size, dilation, stride, evis_version ) \ + ( (uint64_t)kernel_size | ((uint64_t)dilation << 16) \ + | ( (uint64_t)stride << 32) | ((uint64_t)evis_version << 48)) + + _internal_kernel_size_e ks_e = KN; + uint64_t pack_key = _PACK_SELECT_KEY(k_size, dilation, stride, evis_version); + + switch (pack_key) + { + case _PACK_SELECT_KEY(40, D1, 1, VSI_NN_HW_EVIS_2): + case _PACK_SELECT_KEY(40, D1, 2, VSI_NN_HW_EVIS_2): + ks_e = K40; + break; + case _PACK_SELECT_KEY(56, D1, 1, VSI_NN_HW_EVIS_2): + ks_e = K56; + break; + case _PACK_SELECT_KEY(64, D1, 1, VSI_NN_HW_EVIS_2): + ks_e = K64; + break; + case _PACK_SELECT_KEY(80, D1, 1, VSI_NN_HW_EVIS_2): + ks_e = K80; + break; + case _PACK_SELECT_KEY(88, D2, 1, VSI_NN_HW_EVIS_2): + ks_e = K88; + break; + default: + ks_e = KN; + break; + } + +#undef _PACK_SELECT_KEY + + return ks_e; +} + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_depthwise_conv1d_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vx_tensor input = (vx_tensor)param[0]; + vx_tensor weight = (vx_tensor)param[1]; + vx_tensor output = (vx_tensor)param[3]; + int32_t stride = 0; + int32_t dilation = 0; + vsi_nn_kernel_tensor_attr_t *input_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_nn_kernel_tensor_attr_t *weight_attr = NULL; + vsi_int_array_t *output_shape = NULL; + int32_t weightZP = 0; + float outputScale = 1.0f; + float outputZP = 0; + vx_hardware_caps_params_t hw_param; + _internal_kernel_size_e ks = KN; + uint32_t kernel_size = 0; + uint32_t kernel_size_x16 = 0; + uint32_t kernel_size_x8 = 0; + uint32_t evis_version = 0; + vx_context ctx = vxGetContext((vx_reference)node); + uint64_t pack_key = 0; + + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t)); + status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t)); + CHECK_STATUS_FAIL_GOTO(status, final); + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); + CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + weight_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)weight); + CHECK_PTR_FAIL_GOTO( weight_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &stride); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &dilation); + CHECK_STATUS_FAIL_GOTO(status, final ); + kernel_size = weight_attr->shape->data[0]; + + if(hw_param.evis1 == TRUE && hw_param.evis2 == FALSE) + { + evis_version = VSI_NN_HW_EVIS_1; + } + else if(hw_param.evis1 == FALSE && hw_param.evis2 == TRUE) + { + evis_version = VSI_NN_HW_EVIS_2; + } + + ks = get_kernel_size(kernel_size, dilation, stride, evis_version); + + output_shape = output_attr->shape; + gpu_param.dim = 2; + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + + if (KN == ks) + { + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + } + else + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + } + + gpu_param.local_size[0] = 8; + gpu_param.local_size[1] = 1; + gpu_param.global_size[0] = 
gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], gpu_param.local_size[0]); + gpu_param.global_size[1] = gpu_align_p2((output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1], gpu_param.local_size[1]); + + outputScale = input_attr->asymm.scale; + + outputScale *= weight_attr->asymm.scale; + weightZP = weight_attr->asymm.zero_point; + outputScale /= output_attr->asymm.scale; + outputZP = (float)output_attr->asymm.zero_point + 0.5f; + +#define _PACK_SELECT_KEY( kernel_size, dilation, evis_version ) \ + ((uint64_t)kernel_size | ((uint64_t)dilation << 16) | ((uint64_t)evis_version << 32)) + + pack_key = _PACK_SELECT_KEY(ks, dilation, evis_version); + + switch (pack_key) + { + case _PACK_SELECT_KEY(KN, D1, VSI_NN_HW_EVIS_1): + case _PACK_SELECT_KEY(KN, D2, VSI_NN_HW_EVIS_1): + case _PACK_SELECT_KEY(KN, D1, VSI_NN_HW_EVIS_2): + case _PACK_SELECT_KEY(KN, D2, VSI_NN_HW_EVIS_2): + { + gpu_dp_inst_t uniU8SubZp_lo_2x8= {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniU8SubZp_hi_2x8= {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8ConvS16_align8_step0_16x1 = {{ + 0x00005555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x00000000, // ABin + 0x00005555, // BSelt + 0x76543210, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8ConvS16_align8_step1_16x1 = {{ + 0x00005555, // TCfg + 0x00000000, // ASelt + 0xfedcba98, 0x00000000, // ABin + 0x00005555, // BSelt + 0x76543210, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniU8ConvS16_align8_dial_step_16x1 = {{ + 0x00005555, // TCfg + 0x00000000, // ASelt + 0xeca86420, 0x00000000, // ABin + 0x00005555, // BSelt + 0x76543210, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + if (1 == dilation) + { + kernel_size_x16 = (uint32_t)(kernel_size / 16) * 16; + kernel_size_x8 = kernel_size - kernel_size_x16; + status = vsi_nn_kernel_gpu_add_param(node, "uniU8ConvS16_align8_step0_16x1", + &uniU8ConvS16_align8_step0_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8ConvS16_align8_step1_16x1", + &uniU8ConvS16_align8_step1_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "kernel_size_x16", &kernel_size_x16); + status |= vsi_nn_kernel_gpu_add_param(node, "kernel_size_x8", &kernel_size_x8); + } + else if (2 == dilation) + { + kernel_size_x8 = (uint32_t)(kernel_size / 8) * 8; + status = vsi_nn_kernel_gpu_add_param(node, "uniU8ConvS16_align8_step0_16x1", + 
&uniU8ConvS16_align8_dial_step_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "kernel_size_x8", &kernel_size_x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZp_lo_2x8", &uniU8SubZp_lo_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZp_hi_2x8", &uniU8SubZp_hi_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "weightZP", &weightZP); + status |= vsi_nn_kernel_gpu_add_param(node, "scale", &outputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP); + + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY(K40, D1, VSI_NN_HW_EVIS_2): + case _PACK_SELECT_KEY(K56, D1, VSI_NN_HW_EVIS_2): + case _PACK_SELECT_KEY(K64, D1, VSI_NN_HW_EVIS_2): + case _PACK_SELECT_KEY(K80, D1, VSI_NN_HW_EVIS_2): + { + gpu_dp_inst_t uniU8ConvS16_Stpe0_8x2b= {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x98765432, // ABin + 0x00000000, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8ConvS16_Stpe1_8x2b= {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0xba987654, 0xdcba9876, // ABin + 0x00000000, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8ConvS16_Stpe2_8x2b= {{ + 0x55555555, // TCfg + 0x50000000, // ASelt + 0xfedcba98, 0x10fedcba, // ABin + 0x00000000, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8ConvS16_Stpe3_8x2b= {{ + 0x55555555, // TCfg + 0x55505500, // ASelt + 0x3210fedc, 0x543210fe, // ABin + 0x00000000, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8ConvS16_Stpe4_8x2b= {{ + 0x55555555, // TCfg + 0x50000000, // ASelt + 0xfedcba98, 0x10fedcba, // ABin + 0x00000000, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8ConvS16_Stpe5_8x2b= {{ + 0x55555555, // TCfg + 0x55505500, // ASelt + 0x3210fedc, 0x543210fe, // ABin + 0x00000000, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniU8ConvS16_Stpe6_8x2b= {{ + 0x55555555, // TCfg + 0x55555555, // ASelt + 0x76543210, 0x98765432, // ABin + 0x00000000, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8ConvS16_Stpe7_8x2b= {{ + 0x55555555, // TCfg + 0x55555555, // ASelt + 0xba987654, 0xdcba9876, // ABin + 0x00000000, // BSelt + 0x76543210, 0x76543210, // BBin + 
0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniU8SubZp_lo_2x8= {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniU8SubZp_hi_2x8= {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractInteger_2x8= {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + if (stride == 1) + { + uniU8ConvS16_Stpe0_8x2b.data[3] = 0x87654321; + uniU8ConvS16_Stpe1_8x2b.data[2] = 0x98765432; + uniU8ConvS16_Stpe1_8x2b.data[3] = 0xa9876543; + uniU8ConvS16_Stpe2_8x2b.data[1] = 0x40000000; + uniU8ConvS16_Stpe2_8x2b.data[3] = 0x0fedcba9; + uniU8ConvS16_Stpe3_8x2b.data[1] = 0x54005000; + uniU8ConvS16_Stpe3_8x2b.data[2] = 0x10fedcba; + uniU8ConvS16_Stpe3_8x2b.data[3] = 0x210fedcb; + uniU8ConvS16_Stpe4_8x2b.data[1] = 0x00000000; + uniU8ConvS16_Stpe4_8x2b.data[2] = 0xba987654; + uniU8ConvS16_Stpe4_8x2b.data[3] = 0xcba98765; + uniU8ConvS16_Stpe5_8x2b.data[1] = 0x00000000; + uniU8ConvS16_Stpe5_8x2b.data[2] = 0xdcba9876; + uniU8ConvS16_Stpe5_8x2b.data[3] = 0xedcba987; + uniU8ConvS16_Stpe6_8x2b.data[1] = 0x55405500; + uniU8ConvS16_Stpe6_8x2b.data[2] = 0x3210fedc; + uniU8ConvS16_Stpe6_8x2b.data[3] = 0x43210fed; + uniU8ConvS16_Stpe7_8x2b.data[1] = 0x55545550; + uniU8ConvS16_Stpe7_8x2b.data[2] = 0x543210fe; + uniU8ConvS16_Stpe7_8x2b.data[3] = 0x6543210f; + } + status = vsi_nn_kernel_gpu_add_param(node, "uniU8ConvS16_Stpe0_8x2b", &uniU8ConvS16_Stpe0_8x2b); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8ConvS16_Stpe1_8x2b", &uniU8ConvS16_Stpe1_8x2b); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8ConvS16_Stpe2_8x2b", &uniU8ConvS16_Stpe2_8x2b); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8ConvS16_Stpe3_8x2b", &uniU8ConvS16_Stpe3_8x2b); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZp_lo_2x8", &uniU8SubZp_lo_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZp_hi_2x8", &uniU8SubZp_hi_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractInteger_2x8", &uniExtractInteger_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8ConvS16_Stpe4_8x2b", &uniU8ConvS16_Stpe4_8x2b); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8ConvS16_Stpe5_8x2b", &uniU8ConvS16_Stpe5_8x2b); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8ConvS16_Stpe6_8x2b", &uniU8ConvS16_Stpe6_8x2b); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8ConvS16_Stpe7_8x2b", &uniU8ConvS16_Stpe7_8x2b); + status |= vsi_nn_kernel_gpu_add_param(node, "weightZP", &weightZP); + status |= vsi_nn_kernel_gpu_add_param(node, "scale", &outputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP); 
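+            /* The Stpe* programs above select the per-output-pixel input
+             * windows (their defaults match stride 2 and are rewritten for
+             * stride 1); weightZP, scale and outputZP carry the requantization
+             * constants consumed by the shader. */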
+ CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY(K88, D2, VSI_NN_HW_EVIS_2): + { + gpu_dp_inst_t uniExtractInteger_2x8= {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8ConvS16_Stpe0_8x2b= {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0xeca86420, 0xfdb97531, // ABin + 0x00000000, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8ConvS16_Stpe1_8x2b= {{ + 0x55555555, // TCfg + 0x40004000, // ASelt + 0x0eca8642, 0x1fdb9753, // ABin + 0x00000000, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8ConvS16_Stpe4_8x2b= {{ + 0x55555555, // TCfg + 0x50005000, // ASelt + 0x20eca864, 0x31fdb975, // ABin + 0x00000000, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8ConvS16_Stpe5_8x2b= {{ + 0x55555555, // TCfg + 0x54005400, // ASelt + 0x420eca86, 0x531fdb97, // ABin + 0x00000000, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZp_hi_2x8= {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZp_lo_2x8= {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtractInteger_2x8", &uniExtractInteger_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8ConvS16_Stpe0_8x2b", &uniU8ConvS16_Stpe0_8x2b); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8ConvS16_Stpe1_8x2b", &uniU8ConvS16_Stpe1_8x2b); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8ConvS16_Stpe4_8x2b", &uniU8ConvS16_Stpe4_8x2b); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8ConvS16_Stpe5_8x2b", &uniU8ConvS16_Stpe5_8x2b); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZp_hi_2x8", &uniU8SubZp_hi_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZp_lo_2x8", &uniU8SubZp_lo_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "weightZP", &weightZP); + status |= vsi_nn_kernel_gpu_add_param(node, "scale", &outputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", 
&outputZP); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + VSILOGE("unsupport kernel size:%d/dilation:%d/evis version:%d", kernel_size, dilation, evis_version); + break; + } + +#undef _PACK_SELECT_KEY + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + if (input_attr) + { + vsi_nn_kernel_tensor_attr_release(&input_attr); + } + if (weight_attr) + { + vsi_nn_kernel_tensor_attr_release(&weight_attr); + } + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + + return status; +} /* _depthwise_conv1d_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t dilation, + _internal_kernel_size_e kernel_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e weight_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _depthwise_conv1d_kernel_map; + size_t kernel_map_size = _cnt_of_array( _depthwise_conv1d_kernel_map ); + vx_param_description_t * param_def = _depthwise_conv1d_kernel_param_def; + size_t param_def_size = _cnt_of_array( _depthwise_conv1d_kernel_param_def ); + vx_kernel_initialize_f initializer = _depthwise_conv1d_initializer; + _internal_dilation_e dilation_e = D0; + uint32_t key = 0; + size_t i = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + weight_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + switch (dilation) + { + case 1: + dilation_e = D1; + break; + case 2: + dilation_e = D2; + break; + default: + dilation_e = D0; + break; + } + + key = DEPTHWISE_CONV1D_HASH_KEY( in_dtype, weight_dtype, out_dtype, kernel_size, dilation_e); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_DEPTHWISE_CONV1D_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t weight_pad_front[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t weight_pad_end[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_nn_tensor_t * weights = NULL; + vsi_nn_tensor_t * biases = NULL; + vsi_nn_tensor_t *temp_tensor[3] = {NULL}; + int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); + int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" ); + int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); + int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); + _internal_kernel_size_e ks = 
KN; + + weight_pad_end[0] = gpu_align_np2_safe(inputs[1]->attr.size[0], 8) - inputs[1]->attr.size[0]; + + weights = vsi_nn_pad_tensor(graph, inputs[1], weight_pad_front, weight_pad_end, inputs[1]->attr.dim_num, + VSI_NN_PAD_MODE_CONSTANT, 0); + + biases = vsi_nn_merge_input_zeropoint_to_bias(graph, inputs[0], inputs[1], inputs[2]); + + temp_tensor[0] = inputs[0]; + temp_tensor[1] = weights; + temp_tensor[2] = biases; + + ks = get_kernel_size(weights->attr.size[0], dilation, stride, graph->ctx->config.evis.ver); + + status = _query_kernel( kernel, temp_tensor, outputs, dilation, ks); + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + if( pad_front != 0 && pad_end != 0) + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + } + + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _DEPTHWISE_CONV1D_PARAM_NUM, + temp_tensor, input_num, outputs, output_num ); + node_params[PARAM_PAD] = vsi_nn_kernel_scalar_create( graph, I32, &pad_front ); + node_params[PARAM_STRIDE] = vsi_nn_kernel_scalar_create( graph, I32, &stride ); + node_params[PARAM_DILATION] = vsi_nn_kernel_scalar_create( graph, I32, &dilation ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _DEPTHWISE_CONV1D_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[PARAM_PAD] ); + vsi_nn_kernel_scalar_release( &node_params[PARAM_STRIDE] ); + vsi_nn_kernel_scalar_release( &node_params[PARAM_DILATION] ); + } + } + + if (weights) + { + vsi_nn_ReleaseTensor(&weights); + } + + if (biases) + { + vsi_nn_ReleaseTensor(&biases); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( depthwise_conv1d, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c new file mode 100644 index 0000000..4d67bf3 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c @@ -0,0 +1,338 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_DETECT_POST_BOX, +} _internal_kernel_e; + +#define _DETECT_POST_BOX_KERNEL_SOURCE "detect_post_box" + +#define STR(a) #a +// Add kernel hashtable here +#define DETECT_POST_BOX_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + ((IN0_DTYPE << 18) | ( IN1_DTYPE << 11 ) | ( OUT_DTYPE << 4)) + +#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { DETECT_POST_BOX_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ + CVIVANTE_NAMESPACE("evis.detect_post_box_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ + _DETECT_POST_BOX_KERNEL_SOURCE} + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _detect_post_box_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32, F32 ), + PACK_KERNEL_MAP( U8, U8, F32 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _detect_post_box_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _DETECT_POST_BOX_PARAM_NUM _cnt_of_array( _detect_post_box_kernel_param_def ) + +#define SCALAR_SCALE_Y (3) +#define SCALAR_SCALE_X (4) +#define SCALAR_SCALE_H (5) +#define SCALAR_SCALE_W (6) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input1_attr = NULL; + vsi_int_array_t * in_shape = NULL; + float logE = (float)(log10(exp(1.0f)) / log10(2.0f)); + float scaleIn0 = 1.0f; + float scaleIn1 = 1.0f; + int32_t input1_ZP = 0; + int32_t input0_ZP = 0; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + + input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( input1_attr, "Create tensor attr buffer fail.", final ); + + in_shape = input_attr->shape; + + status = vsi_nn_kernel_gpu_add_param( node, "logE", &logE); + CHECK_STATUS_FAIL_GOTO(status, final ); + + if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) + { + input0_ZP = input_attr->asymm.zero_point; + scaleIn0 = input_attr->asymm.scale; + } + + if ( VSI_NN_KERNEL_QUANT_ASYMM == input1_attr->quant ) + { + input1_ZP = input1_attr->asymm.zero_point; + scaleIn1 = input1_attr->asymm.scale; + } + + if ((F32 == input_attr->dtype) || (F32 == input1_attr->dtype)) + { + gpu_dp_inst_t uniDataMerge_4x4 = {{ + 0x03030303, // 
TCfg + 0x01010000, // ASelt + 0x00010000, 0x00010000, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00005400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param( node, + "uniDataMerge_4x4", &uniDataMerge_4x4 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if ((U8 == input_attr->dtype) || (U8 == input1_attr->dtype)) + { + uint16_t M0 = 0; + int8_t postShift0 = 0; + uint16_t M1 = 0; + int8_t postShift1 = 0; + uint32_t i = 0; + gpu_dp_inst_t uniU8SubZptoF32Conv0_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8SubZptoF32Conv1_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + vsi_nn_GetFP32MultiAndPostShift(scaleIn0, &M0, &postShift0); + vsi_nn_GetFP32MultiAndPostShift(scaleIn1, &M1, &postShift1); + uniU8SubZptoF32Conv0_4x4.data[7] |= (postShift0 & 0x1F); + uniU8SubZptoF32Conv1_4x4.data[7] |= (postShift1 & 0x1F); + for ( i = 0; i < 8; i++ ) + { + uniU8SubZptoF32Conv0_4x4.data[8 + i] = (((uint32_t)M0 << 16) | M0); + uniU8SubZptoF32Conv1_4x4.data[8 + i] = (((uint32_t)M1 << 16) | M1); + } + + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8SubZptoF32Conv0_4x4", &uniU8SubZptoF32Conv0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8SubZptoF32Conv1_4x4", &uniU8SubZptoF32Conv1_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &input0_ZP); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &input1_ZP); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = 2; + gpu_param.global_size[0] = ( + (in_shape->data[1] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]); + gpu_param.global_size[1] = ( + (in_shape->data[2] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(input_attr); + + return status; +} /* _detect_post_box_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _detect_post_box_kernel_map; + size_t kernel_map_size = _cnt_of_array( _detect_post_box_kernel_map ); + vx_param_description_t * param_def = _detect_post_box_kernel_param_def; + size_t param_def_size = _cnt_of_array( _detect_post_box_kernel_param_def ); + vx_kernel_initialize_f initializer = _detect_post_box_initializer; + uint32_t 
key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = DETECT_POST_BOX_HASH_KEY( in0_dtype, in1_dtype, out_dtype ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (vx_uint32)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_DETECT_POST_BOX_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + float inv_scale_y = vsi_nn_kernel_param_get_float32( params, "inv_scale_y" ); + float inv_scale_x = vsi_nn_kernel_param_get_float32( params, "inv_scale_x" ); + float inv_scale_h = vsi_nn_kernel_param_get_float32( params, "inv_scale_h" ); + float inv_scale_w = vsi_nn_kernel_param_get_float32( params, "inv_scale_w" ); + + status = _query_kernel( kernel, inputs, outputs); + + if ( VSI_SUCCESS == status ) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _DETECT_POST_BOX_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_y ); + node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_x ); + node_params[SCALAR_SCALE_H] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_h ); + node_params[SCALAR_SCALE_W] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_w ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _DETECT_POST_BOX_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_H] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_W] ); + } + } + + return node; + +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( detect_post_box, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/detect_post_nms_evis.c b/src/tim/vx/internal/src/kernel/evis/detect_post_nms_evis.c new file mode 100644 index 0000000..bc849b4 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/detect_post_nms_evis.c @@ -0,0 +1,155 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS +#if 0 +/* + * Define kernel meta. 
+ */ +typedef enum +{ + INTERNAL_KERNEL_DETECT_POST_NMS, +} _internal_kernel_e; + +#define _DETECT_POST_NMS_KERNEL_SOURCE "detect_post_nms" +#define _DETECT_POST_NMS_KERNEL_NAME CVIVANTE_NAMESPACE("evis.detect_post_nms") + +// Add kernel hashtable here +#define DETECT_POST_NMS_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, SOURCE ) \ + { DETECT_POST_NMS_HASH_KEY( IN_DTYPE, OUT_DTYPE ), _DETECT_POST_NMS_KERNEL_NAME, SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _detect_post_nms_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32, _DETECT_POST_NMS_KERNEL_SOURCE ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _detect_post_nms_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _DETECT_POST_NMS_PARAM_NUM _cnt_of_array( _detect_post_nms_kernel_param_def ) + +#define SCALAR_NMS_TYPE (6) +#define SCALAR_MAX_NUM (7) +#define SCALAR_MAX_CLASS (8) +#define SCALAR_MAX_DETECT (9) +#define SCALAR_SCORE_TH (10) +#define SCALAR_IOU_TH (11) +#define SCALAR_IS_BG (12) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_detect_post_nms_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + + return status; +} /* _detect_post_nms_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + + return status; +} /* _query_kernel() */ + +#endif + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + + return NULL; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( detect_post_nms, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c new file mode 100644 index 0000000..995455c --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -0,0 +1,641 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, 
distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/** Unary Kernel internal type */ +typedef enum +{ + UNARY_SIN, + UNARY_EXP, + UNARY_LOG, + UNARY_ELU, + UNARY_NEG, + UNARY_HSIGMOID, + UNARY_MISH, +} unary_type_e; + +/* + * Define kernel meta. + */ +#define HASH_UNARY_KEY(_type, _input_type, _output_type, _image_2d) \ + ((_type << 20) | (_input_type << 12) | (_output_type << 4) | (_image_2d)) + +#define KERNEL_SOURCE_2D "eltwise_unary_2d", +#define KERNEL_SOURCE_3D "eltwise_unary_3d", + +#define HASH_UNARY_SH_KERNEL_NAME(FUNC_NAME, SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis."#FUNC_NAME"_"#SRC_TYPE"to"#DST_TYPE) + +#define TENSOR_UNARY_KERNELS(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE, SOURCE) \ + { HASH_UNARY_KEY(TYPE, SRC_TYPE, OUT_TYPE, 0), \ + HASH_UNARY_SH_KERNEL_NAME(FUNC_NAME, SRC_TYPE, OUT_TYPE), \ + SOURCE }, + +#define HASH_UNARY_SH_KERNEL_2D_NAME(FUNC_NAME, SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis."#FUNC_NAME"_"#SRC_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_UNARY_KERNELS_2D(FUNC_NAME, TYPE, SRC_TYPE, OUT_TYPE, SOURCE) \ + { HASH_UNARY_KEY(TYPE, SRC_TYPE, OUT_TYPE, 1), \ + HASH_UNARY_SH_KERNEL_2D_NAME(FUNC_NAME, SRC_TYPE, OUT_TYPE), \ + SOURCE }, + +#define SIN_OPERATION sin +#define EXP_OPERATION exp +#define LOG_OPERATION log +#define ELU_OPERATION elu +#define NEG_OPERATION neg +#define HSIGMOID_OPERATION hard_sigmoid +#define MISH_OPERATION mish + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } _eltwise_unary_evis_kernel_map[] = +{ + TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, F16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, F16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, F16, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, F16, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I8, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, BF16, BF16 , 
KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, I16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, I16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, I8, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, I8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, BF16, BF16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, F16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, F16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, F16, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, F16, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, I16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, I16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, U8, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, U8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, I8, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, I8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, BF16, BF16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, F16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, F16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, F16, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, F16, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, I16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, I16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, U8, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, U8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, I8, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, I8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, BF16, BF16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, F16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, F16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, F16, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, F16, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, U8, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, U8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I8, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, BF16, BF16 , KERNEL_SOURCE_3D) + + TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, 
UNARY_SIN, F16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, BF16, BF16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, BF16, BF16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, BF16, BF16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, F16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, F16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, F16, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, F16, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, I16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, I16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, U8, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, U8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, I8, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, I8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, BF16, BF16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, 
F16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, BF16, BF16 , KERNEL_SOURCE_2D) + + TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16 , KERNEL_SOURCE_3D) + + TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16 , KERNEL_SOURCE_2D) + + TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, F16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, F16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, F16, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, F16, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, I16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, I16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, U8, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, U8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, I8, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, I8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(MISH_OPERATION, 
UNARY_MISH, BF16, BF16 , KERNEL_SOURCE_3D) + + TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, BF16, BF16 , KERNEL_SOURCE_2D) +}; + +#undef SIN_OPERATION +#undef EXP_OPERATION +#undef LOG_OPERATION +#undef ELU_OPERATION +#undef NEG_OPERATION +#undef HSIGMOID_OPERATION +#undef MISH_OPERATION + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define INPUT_FUNC_TYPE (2) +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + int32_t type = 0; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; + vsi_int_array_t * out_shape = NULL; + float inputScale = 1.0f; + float inputTail = 0; + float outputScale = 1.0f; + float outputZP = 0; + uint32_t pack_key; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &type); + CHECK_STATUS_FAIL_GOTO(status, final ); + + out_shape = attr[1]->shape; + + if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = attr[0]->dfp.fl; + if (fl > 0) + { + inputScale = 1.0f / (float) ((int64_t)1 << fl); + } + else + { + inputScale = (float)((int64_t)1 << -fl); + } + } + else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + inputScale = attr[0]->asymm.scale; + inputTail = 0 - attr[0]->asymm.zero_point * inputScale; + } + + if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = attr[1]->dfp.fl; + if (fl > 0) + { + outputScale = (float)((int64_t)1 << fl); + } + else + { + outputScale = (float)1.0f / (float) ((int64_t)1 << -fl); + } + } + else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + outputScale = (float)1.0f / attr[1]->asymm.scale; + outputZP = (float)attr[1]->asymm.zero_point; + } + +#define _PACK_SELECT_KEY( TYPE, IN_TYPE, OUT_TYPE ) \ + (( TYPE << 24) | ( IN_TYPE << 16) | ( OUT_TYPE << 8)) + + pack_key = _PACK_SELECT_KEY( type, attr[0]->dtype, attr[1]->dtype ); + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + 
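    /*
     * Editorial sketch, not part of the original sources: a worked example of
     * the quantization parameters derived above, using hypothetical values.
     *
     *   DFP input,   fl =  7              -> inputScale = 1 / 2^7 = 0.0078125
     *   DFP input,   fl = -2              -> inputScale = 2^2     = 4.0
     *   ASYMM input,  scale = 0.5,  zp = 128
     *       -> inputScale = 0.5, inputTail = -128 * 0.5 = -64.0,
     *          so float value = raw * inputScale + inputTail
     *   ASYMM output, scale = 0.25, zp = 10
     *       -> outputScale = 1 / 0.25 = 4.0, outputZP = 10,
     *          so raw output = float * outputScale + outputZP
     *
     * _PACK_SELECT_KEY(type, in, out) then packs the operator id and the two
     * dtypes into one 32-bit switch key: (type << 24) | (in << 16) | (out << 8).
     */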
gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + switch( pack_key ) + { + case _PACK_SELECT_KEY( UNARY_SIN, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_EXP, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_LOG, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_ELU, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_NEG, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_HSIGMOID, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_MISH, BF16, BF16 ): + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + { + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDatatoFp32Part0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDatatoFp32Part1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, 
// AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniDatatoFp32Part0_4x4", &uniDatatoFp32Part0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniDatatoFp32Part1_4x4", &uniDatatoFp32Part1_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "inputTail", &inputTail ); + status |= vsi_nn_kernel_gpu_add_param( node, + "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "outputZP", &outputZP ); + + if (attr[1]->dtype == F16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractHalf8_2x8 ); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + } + +#undef _PACK_SELECT_KEY + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _eltwise_unary_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + int32_t type, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_UNARY_KEY( type, input_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(_eltwise_unary_evis_kernel_map); i ++ ) + { + if( _eltwise_unary_evis_kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(_eltwise_unary_evis_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _eltwise_unary_evis_kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _eltwise_unary_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + _eltwise_unary_evis_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _eltwise_unary_evis_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel, + const unary_type_e unary_type + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* rs_tensors[2] = { NULL }; + int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_rank = 0; + vsi_bool ret; + + ret = vsi_nn_kernel_optimize_element_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); + if( ret ) + { + rs_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shape, new_rank ); + rs_tensors[1] = vsi_nn_reshape_tensor( 
graph, + outputs[0], (uint32_t*)shape, new_rank ); + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[0]->attr.size, + rs_tensors[0]->attr.dim_num ) ) + { + goto OnError; + } + + image_2d = (rs_tensors[0]->attr.dim_num == 2 || rs_tensors[0]->attr.size[2] == 1); + status = _query_kernel( rs_tensors, &rs_tensors[1], unary_type, image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if( node ) + { + vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, + rs_tensors, 1, &rs_tensors[1], 1 ); + node_params[INPUT_FUNC_TYPE] = vsi_nn_kernel_scalar_create( + graph, I32, &unary_type ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, OnError ); + } + } + +OnError: + if (rs_tensors[0]) + { + vsi_nn_ReleaseTensor( &rs_tensors[0] ); + } + + if (rs_tensors[1]) + { + vsi_nn_ReleaseTensor( &rs_tensors[1] ); + } + + if (node_params[INPUT_FUNC_TYPE]) + { + vsi_nn_kernel_scalar_release( &node_params[INPUT_FUNC_TYPE] ); + } + + return node; +} /* _setup() */ + +#define REGISTER_ELTWISE_UNARY_BACKEND_EVIS(KERNEL_NAME, UNARY_TYPE) \ + static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) \ + { \ + return _setup(graph, inputs, input_num, outputs, output_num, \ + params, kernel, UNARY_TYPE); \ + } \ + REGISTER_BACKEND_EVIS( KERNEL_NAME, _##KERNEL_NAME##_setup ) + +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sin, UNARY_SIN ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( exp, UNARY_EXP ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( log, UNARY_LOG ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( elu, UNARY_ELU ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( neg, UNARY_NEG ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_sigmoid, UNARY_HSIGMOID ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( mish, UNARY_MISH ) + + +__END_DECLS diff --git a/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c new file mode 100644 index 0000000..733b71c --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c @@ -0,0 +1,442 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define FLOORDIV_HASH_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) + + + #define FLOORDIV_KERNEL_SOURCE_NAME \ + "floordiv" + +#define FLOORDIV_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + { FLOORDIV_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + CVIVANTE_NAMESPACE("evis.floordiv_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE), \ + FLOORDIV_KERNEL_SOURCE_NAME }, + +#define FLOORDIV_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + { FLOORDIV_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + CVIVANTE_NAMESPACE("evis.floordiv_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE"_2D"), \ + FLOORDIV_KERNEL_SOURCE_NAME }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _floordiv_kernel_map[] = +{ + // Register kernel here + FLOORDIV_KERNELS( F16, F16, F16 ) + FLOORDIV_KERNELS( F16, F16, I16 ) + FLOORDIV_KERNELS( F16, F16, I8 ) + FLOORDIV_KERNELS( F16, F16, U8 ) + FLOORDIV_KERNELS( I16, I16, I16 ) + FLOORDIV_KERNELS( I8, I8, I8 ) + FLOORDIV_KERNELS( U8, U8, U8 ) + FLOORDIV_KERNELS( I16, I16, F16 ) + FLOORDIV_KERNELS( I8, I8, F16 ) + FLOORDIV_KERNELS( U8, U8, F16 ) + FLOORDIV_KERNELS( BF16, BF16, BF16 ) + + FLOORDIV_KERNELS_2D( F16, F16, F16 ) + FLOORDIV_KERNELS_2D( F16, F16, I16 ) + FLOORDIV_KERNELS_2D( F16, F16, I8 ) + FLOORDIV_KERNELS_2D( F16, F16, U8 ) + FLOORDIV_KERNELS_2D( I16, I16, I16 ) + FLOORDIV_KERNELS_2D( I8, I8, I8 ) + FLOORDIV_KERNELS_2D( U8, U8, U8 ) + FLOORDIV_KERNELS_2D( I16, I16, F16 ) + FLOORDIV_KERNELS_2D( I8, I8, F16 ) + FLOORDIV_KERNELS_2D( U8, U8, F16 ) + FLOORDIV_KERNELS_2D( BF16, BF16, BF16 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _floordiv_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _FLOORDIV_PARAM_NUM _cnt_of_array( _floordiv_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_floordiv_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vx_status status = VX_FAILURE; + vx_tensor input0 = (vx_tensor)param[0]; + vx_tensor input1 = (vx_tensor)param[1]; + vx_tensor output = (vx_tensor)param[2]; + vsi_nn_kernel_tensor_attr_t *input0_attr = NULL; + vsi_nn_kernel_tensor_attr_t *input1_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t *output_shape = NULL; + vsi_nn_kernel_dtype_e input0_dtype = F16; + int32_t input0_fl = 0; + int32_t input1_fl = 0; + int32_t output_fl = 0; + float inScale0 = 0; + float inScale1 = 0; + float outScale = 0; + float in0Tail = 0; + float in1Tail = 0; + float outZp = 0; + + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0 ); + CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + input1_attr = 
vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1 ); + CHECK_PTR_FAIL_GOTO( input1_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + input0_dtype = input0_attr->dtype; + + gpu_param.dim = output_shape->size < 3 ? 2 : 3; + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + gpu_param.global_offset[2] = 0; + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = output_shape->size > 2 ? + (output_shape->data[2] + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2] : 1; + + if( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + input0_fl = input0_attr->dfp.fl; + if (input0_fl > 0) + { + inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl); + } + else + { + inScale0 = (float)((int64_t)1 << -input0_fl); + } + status = vsi_nn_kernel_gpu_add_param( node, "in_scale0", &inScale0 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + inScale0 = input0_attr->asymm.scale; + in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point); + status = vsi_nn_kernel_gpu_add_param( node, "in_scale0", &inScale0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "in0Tail", &in0Tail ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + input1_fl = input1_attr->dfp.fl; + if (input1_fl > 0) + { + inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl); + } + else + { + inScale1 = (float)((int64_t)1 << -input1_fl); + } + status = vsi_nn_kernel_gpu_add_param( node, "in_scale1", &inScale1 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + inScale1 = input1_attr->asymm.scale; + in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point); + status = vsi_nn_kernel_gpu_add_param( node, "in_scale1", &inScale1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "in1Tail", &in1Tail ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + output_fl = output_attr->dfp.fl; + if (output_fl > 0) + { + outScale = (float) ((int64_t)1 << output_fl); + } + else + { + outScale = 1.0f / (float)((int64_t)1 << -output_fl); + } + status = vsi_nn_kernel_gpu_add_param( node, "out_scale", &outScale ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + outScale = 1.0f / output_attr->asymm.scale; + outZp = (float)(output_attr->asymm.zero_point); + status = vsi_nn_kernel_gpu_add_param( node, "out_scale", &outScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_zp", &outZp ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if (BF16 == input0_dtype) + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 
0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniConvertFstToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniConvertSecToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertFstToFp32_4x4", &uniConvertFstToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertSecToFp32_4x4", &uniConvertSecToFp32_4x4 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (input0_attr) + { + vsi_nn_kernel_tensor_attr_release(&input0_attr); + } + if (input1_attr) + { + vsi_nn_kernel_tensor_attr_release(&input1_attr); + } + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + return status; +} /* _floordiv_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _floordiv_kernel_map; + size_t kernel_map_size = _cnt_of_array( _floordiv_kernel_map ); + vx_param_description_t * param_def = _floordiv_kernel_param_def; + size_t 
param_def_size = _cnt_of_array( _floordiv_kernel_param_def ); + vx_kernel_initialize_f initializer = _floordiv_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = FLOORDIV_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_FLOORDIV_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2); + + status = _query_kernel( kernel, inputs, outputs, image_2d); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _FLOORDIV_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _FLOORDIV_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( floordiv, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c new file mode 100644 index 0000000..0c8273e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -0,0 +1,455 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define VX_KERNEL_NAME_GATHER_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8") +#define VX_KERNEL_NAME_GATHER_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8") +#define VX_KERNEL_NAME_GATHER_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16") +#define VX_KERNEL_NAME_GATHER_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_F16toF16") +#define VX_KERNEL_NAME_GATHER_I8TOF16 CVIVANTE_NAMESPACE("evis.gather_I8toF16") +#define VX_KERNEL_NAME_GATHER_I16TOF16 CVIVANTE_NAMESPACE("evis.gather_I16toF16") +#define VX_KERNEL_NAME_GATHER_F16TOI8 CVIVANTE_NAMESPACE("evis.gather_F16toI8") +#define VX_KERNEL_NAME_GATHER_F16TOI16 CVIVANTE_NAMESPACE("evis.gather_F16toI16") +#define VX_KERNEL_NAME_GATHER_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16") +#define VX_KERNEL_NAME_GATHER_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8") + +#define KERNEL_SOURCE_1 "gather" +#define KERNEL_SOURCE_2 "gather_mix" + +// Add kernel hashtable here +#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _quant_type) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_quant_type)) + +#define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0), \ + VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } gather_map[] = +{ + TENSOR_GATHER_KERNELS(U8, I32, U8, KERNEL_SOURCE_1) + TENSOR_GATHER_KERNELS(I8, I32, I8, KERNEL_SOURCE_1) + TENSOR_GATHER_KERNELS(I16, I32, I16, KERNEL_SOURCE_1) + TENSOR_GATHER_KERNELS(F16, I32, F16, KERNEL_SOURCE_1) + TENSOR_GATHER_KERNELS(I8, I32, F16, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(I16, I32, F16, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(F16, I32, I8, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(F16, I32, I16, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(U8, I32, F16, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(F16, I32, U8, KERNEL_SOURCE_2) +}; + +/* + * Kernel params + */ +static vx_param_description_t _gather_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def ) + +static vsi_status get_gather_tensor_reshape_size + ( + vsi_nn_tensor_t ** inputs, + int32_t sizes[VSI_NN_MAX_DIM_NUM], + uint32_t block_size, + uint32_t idxFlg + ) +{ + vsi_status status = VSI_FAILURE; + uint32_t dims_num = inputs[0]->attr.dim_num; + uint32_t *input_size = inputs[0]->attr.size; + uint32_t i = 0; + 
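    /*
     * Editorial sketch, not part of the original sources: what the flattening
     * below computes, using hypothetical shapes.
     *
     *   Data tensor  4 x 5 x 10, block_size = 10, idxFlg = 0:
     *       elementCnt = 200 -> sizes = {10, 20, 1, ...}  (block_size x the rest)
     *   Index tensor 3 x 7,      block_size = 1,  idxFlg = 1:
     *       elementCnt = 21  -> sizes = {21, 1, 1, ...}   (flattened to one row)
     *
     * Either path only reports VSI_SUCCESS while the checked extent stays
     * under the VSI_NN_MAX_IMAGE_WIDTH (65536) bound.
     */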
uint32_t elementCnt = 1; +#define VSI_NN_MAX_IMAGE_WIDTH (65536) + + for(i = 0; i < dims_num; ++i) + { + elementCnt *= input_size[i]; + } + + for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + { + sizes[i] = 1; + } + + if(idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH) + { + sizes[0] = elementCnt; + sizes[1] = 1; + status = VSI_SUCCESS; + } + else + { + if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + { + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + status = VSI_SUCCESS; + } + } +#undef VSI_NN_MAX_IMAGE_WIDTH + + return status; +} /* _get_EltOP_tensor_reshape_size */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_gather_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + int32_t block_size = 0; + int32_t block_num = 0; + int32_t indices_num = 1; + uint32_t input_dims1 = 0; + vx_uint32 i = 0; + vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; + vsi_int_array_t * input1_shape = NULL; + int32_t src0ZP = 0; + float src0Scale = 0; + int32_t dstZP = 0; + float dstScale = 0; + + uint32_t pack_key = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + src0ZP = attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + dstZP = attr[2]->asymm.zero_point; + dstScale = attr[2]->asymm.scale; + if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + } + else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + src0Scale = 1; + } + + if( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[2]->dfp.fl > 0) + { + dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + dstScale = 1.0f/dstScale; + } + else if( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + dstScale = 1; + } + + input1_shape = attr[1]->shape; + input_dims1 = (uint32_t)input1_shape->size; + for (i = 0; i < input_dims1; i++) + { + indices_num *= input1_shape->data[i]; + } + + shaderParam.global_scale[0] = 16; + if(attr[0]->dtype == I16 || attr[0]->dtype == F16) + { + shaderParam.global_scale[0] = 8; + } + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((block_size + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = 
indices_num; + shaderParam.global_size[2] = block_num; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE) \ + (IN0_TYPE | (OUT_TYPE << 8)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype); + + { + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP0[2] = {0}; + uint32_t multAndoutZP1[2] = {0}; + gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertFp16toU8_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, F16): + case _PACK_SELECT_KEY( I8, F16): + case _PACK_SELECT_KEY( I16, F16): + { + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift); + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0); + + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, U8): + case _PACK_SELECT_KEY( F16, I8): + case _PACK_SELECT_KEY( F16, I16): + { + int32_t postShift0 = 0; + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0); + + multAndoutZP1[0] = (uint32_t)(M0); + multAndoutZP1[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0); + + gpu_dp_inst_update_postshfit( &uniConvertFp16toU8_2x8, postShift0 ); + status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertFp16toU8_2x8", &uniConvertFp16toU8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } +#undef _PACK_SELECT_KEY + + status = vsi_nn_kernel_gpu_add_param(node, "indices_num", &indices_num); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0 ); + + for( i = 
0; i < _cnt_of_array(gather_map); i ++ ) + { + if( gather_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(gather_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", gather_map[i].function_name ); + kernel->info.parameters = _gather_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _gather_kernel_param_def ); + kernel->info.initialize = _gather_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + gather_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + gather_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_GATHER_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); + int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); + + status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0); + status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1); + status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0); + if(status != VSI_SUCCESS) + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 0; +#define RESHAPE_DIM 2 + /* Pass parameters to node. 
*/ + tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], RESHAPE_DIM ); + tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[1], RESHAPE_DIM ); + tmp_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], RESHAPE_DIM ); +#undef RESHAPE_DIM + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _GATHER_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &tmp_params[0] ); + vsi_nn_kernel_tensor_release( &tmp_params[1] ); + vsi_nn_kernel_tensor_release( &tmp_params[2] ); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + vsi_nn_kernel_scalar_release( &tmp_params[5] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( gather, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c new file mode 100644 index 0000000..c38b90e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c @@ -0,0 +1,491 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define KERNEL_SOURCE_1 "gather_nd" +#define KERNEL_SOURCE_2 "gather_nd_2d" +#define KERNEL_SOURCE_3 "gather_nd_3d" +#define KERNEL_SOURCE_4 "gather_nd_mix" +#define KERNEL_SOURCE_5 "gather_nd_2d_mix" +#define KERNEL_SOURCE_6 "gather_nd_3d_mix" + + typedef enum +{ + _1D = 0, + _2D, + _3D +} vsi_nn_kernel_coord_type_e; + +#define HASH_GATHER_ND_KEY(_input0_type, _output_type, _coord_dim, _quant_type) \ + ((_input0_type << 24) | (_output_type << 16) | (_coord_dim << 8) | (_quant_type)) + +#define HASH_GATHER_ND_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \ + CVIVANTE_NAMESPACE("evis.gather_nd_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE) + +#define TENSOR_GATHER_ND_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \ + { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0), \ + HASH_GATHER_ND_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } gather_nd_map[] = +{ + TENSOR_GATHER_ND_KERNELS(I8, I32, I8, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_KERNELS(U8, I32, U8, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_KERNELS(I16, I32, I16, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_KERNELS(F16, I32, F16, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_KERNELS(I8, I32, I8, _2D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_KERNELS(I16, I32, I16, _2D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_KERNELS(I8, I32, I8, _3D, KERNEL_SOURCE_3) + TENSOR_GATHER_ND_KERNELS(U8, I32, U8, _3D, KERNEL_SOURCE_3) + TENSOR_GATHER_ND_KERNELS(I16, I32, I16, _3D, KERNEL_SOURCE_3) + TENSOR_GATHER_ND_KERNELS(F16, I32, F16, _3D, KERNEL_SOURCE_3) + + TENSOR_GATHER_ND_KERNELS(I8, I32, F16, _1D, KERNEL_SOURCE_4) + TENSOR_GATHER_ND_KERNELS(I16, I32, F16, _1D, KERNEL_SOURCE_4) + TENSOR_GATHER_ND_KERNELS(F16, I32, I8, _1D, KERNEL_SOURCE_4) + TENSOR_GATHER_ND_KERNELS(F16, I32, I16, _1D, KERNEL_SOURCE_4) + TENSOR_GATHER_ND_KERNELS(U8, I32, F16, _1D, KERNEL_SOURCE_4) + TENSOR_GATHER_ND_KERNELS(F16, I32, U8, _1D, KERNEL_SOURCE_4) + + TENSOR_GATHER_ND_KERNELS(I8, I32, F16, _2D, KERNEL_SOURCE_5) + TENSOR_GATHER_ND_KERNELS(I16, I32, F16, _2D, KERNEL_SOURCE_5) + TENSOR_GATHER_ND_KERNELS(F16, I32, I8, _2D, KERNEL_SOURCE_5) + TENSOR_GATHER_ND_KERNELS(F16, I32, I16, _2D, KERNEL_SOURCE_5) + TENSOR_GATHER_ND_KERNELS(U8, I32, F16, _2D, KERNEL_SOURCE_5) + TENSOR_GATHER_ND_KERNELS(F16, I32, U8, _2D, KERNEL_SOURCE_5) + + TENSOR_GATHER_ND_KERNELS(I8, I32, F16, _3D, KERNEL_SOURCE_6) + TENSOR_GATHER_ND_KERNELS(I16, I32, F16, _3D, KERNEL_SOURCE_6) + TENSOR_GATHER_ND_KERNELS(F16, I32, I8, _3D, KERNEL_SOURCE_6) + TENSOR_GATHER_ND_KERNELS(F16, I32, I16, _3D, KERNEL_SOURCE_6) + TENSOR_GATHER_ND_KERNELS(U8, I32, F16, _3D, KERNEL_SOURCE_6) + TENSOR_GATHER_ND_KERNELS(F16, I32, U8, _3D, KERNEL_SOURCE_6) +}; + +/* + * Kernel params + */ +static vx_param_description_t _gather_nd_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GATHER_ND_PARAM_NUM _cnt_of_array( _gather_nd_kernel_param_def ) + +static vsi_status get_gather_nd_tensor_reshape_size + ( + vsi_nn_tensor_t ** inputs, + int32_t sizes[VSI_NN_MAX_DIM_NUM], + uint32_t block_size, + 
uint32_t coordDim, + int32_t* newDim + ) +{ + vsi_status status = VSI_FAILURE; + uint32_t dims_num = inputs[0]->attr.dim_num; + uint32_t *input_size = inputs[0]->attr.size; + uint32_t i = 0; + uint32_t elementCnt = 1; +#define VSI_NN_MAX_IMAGE_WIDTH (65536) + + newDim[0] = 0; + for(i = 0; i < dims_num; ++i) + { + elementCnt *= input_size[i]; + } + + for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + { + sizes[i] = 1; + } + + if(coordDim) // input reshape + { + uint32_t offset = dims_num - coordDim + 1; + for(i = coordDim-1; i > 0; i--) + { + sizes[i] = input_size[i + offset - 1]; + } + for(i = 0; i < offset; i++) + { + sizes[0] *= input_size[i]; + } + + newDim[0] = coordDim; + if(coordDim == 1) + { + newDim[0] = 2; + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + } + + status = VSI_SUCCESS; + } + else // indices&output reshape + { + if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + { + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + status = VSI_SUCCESS; + newDim[0] = 2; + } + } +#undef VSI_NN_MAX_IMAGE_WIDTH + + return status; +} /* _get_EltOP_tensor_reshape_size */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_gather_nd_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + int32_t block_size = 0; + int32_t indices_num = 1; + + int32_t src0ZP = 0; + float src0Scale = 0; + int32_t dstZP = 0; + float dstScale = 0; + + uint32_t pack_key = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + src0ZP = attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + dstZP = attr[2]->asymm.zero_point; + dstScale = attr[2]->asymm.scale; + if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + } + else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + src0Scale = 1; + } + + if( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[2]->dfp.fl > 0) + { + dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + dstScale = 1.0f / dstScale; + } + else if( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + dstScale = 1; + } + + indices_num = attr[1]->shape->data[1]; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = indices_num; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE) 
\ + (IN0_TYPE | (OUT_TYPE << 8)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); + { + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP0[2] = {0}; + uint32_t multAndoutZP1[2] = {0}; + gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertFp16toU8_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, F16 ): + case _PACK_SELECT_KEY( I8, F16 ): + case _PACK_SELECT_KEY( I16, F16 ): + { + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift); + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0); + + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, U8 ): + case _PACK_SELECT_KEY( F16, I8 ): + case _PACK_SELECT_KEY( F16, I16 ): + { + int32_t postShift0 = 0; + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0); + + multAndoutZP1[0] = (uint32_t)(M0); + multAndoutZP1[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0); + + gpu_dp_inst_update_postshfit( &uniConvertFp16toU8_2x8, postShift0 ); + status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertFp16toU8_2x8", &uniConvertFp16toU8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } +#undef _PACK_SELECT_KEY + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + return status; +} /* _gather_nd_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + int32_t coord_dim + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_kernel_coord_type_e coord_type = _1D; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if(coord_dim == 1) + { + coord_type = _1D; + } + else if(coord_dim == 2) + { + coord_type = _2D; + } + else if(coord_dim == 3) + { + coord_type = _3D; + } + + key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, 0 ); + + for( i = 0; i < _cnt_of_array(gather_nd_map); i ++ ) + { + if( gather_nd_map[i].key == key ) + { + break; + 
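        /*
         * Editorial sketch, not part of the original sources: how the
         * requantization above folds into the packed shader constants, with
         * hypothetical numbers.  gpu_quantize_multiplier_16bit() approximates
         * the ratio src0Scale / dstScale as M0 / 2^postShift, which lets the
         * shader evaluate (conceptually)
         *
         *     dst = (src * M0 + multAndoutZP[1]) >> postShift
         *
         * where multAndoutZP[1] = (dstZP << postShift) - src0ZP * M0 absorbs
         * both zero points.  Example: src0Scale = 0.5, dstScale = 0.25,
         * src0ZP = 3, dstZP = 10 gives a ratio of 2.0; taking the simple
         * factorization M0 = 2, postShift = 0 (the real helper may normalize
         * M0 into 16-bit range instead), the constants become {2, 10 - 6} =
         * {2, 4}, and src = 7 maps to (7 * 2 + 4) >> 0 = 18, matching
         * (7 - 3) * 0.5 / 0.25 + 10 = 18.
         */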
} + } + if( i < _cnt_of_array(gather_nd_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", gather_nd_map[i].function_name ); + kernel->info.parameters = _gather_nd_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _gather_nd_kernel_param_def ); + kernel->info.initialize = _gather_nd_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + gather_nd_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + gather_nd_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_GATHER_ND_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + + status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim); + status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim); + status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim); + if(status != VSI_SUCCESS) + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, coord_dim ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 0; + /* Pass parameters to node. 
*/ + tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], rs_in_dim ); + tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[1], rs_idx_dim ); + tmp_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _GATHER_ND_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &tmp_params[0] ); + vsi_nn_kernel_tensor_release( &tmp_params[1] ); + vsi_nn_kernel_tensor_release( &tmp_params[2] ); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( gather_nd, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c new file mode 100644 index 0000000..d83d149 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c @@ -0,0 +1,909 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
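+ * Note: the kernel hash key below packs the three input dtypes, the output dtype, the gate and
+ * candidate activations, and the batch layout selector into a single 64-bit lookup key.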
+ */ +typedef enum +{ + INTERNAL_KERNEL_GRUCELL_ACTIVATION, +} _internal_kernel_e; + +#define _GRUCELL_ACTIVATION_KERNEL_SOURCE "grucell_activation" +#define _GRUCELL_ACTIVATION_KERNEL_NAME CVIVANTE_NAMESPACE("evis.grucell_activation") + +#define _CDNN_KERNEL_SOURCE0 "grucell_cdnn_activation" +#define _CDNN_KERNEL_SOURCE1 "grucell_cdnn_activation_u8" +#define _GRUCELL_ACTIVATION_CDNN_KERNEL_NAME CVIVANTE_NAMESPACE("evis.grucell_cdnn_activation") + +typedef enum _batch_fisrt_layerout_e +{ + NC, + CN, + CN_FULL, +} batch_fisrt_layerout_e; + +typedef enum _gru_activation_type_e +{ + sigmoid = VSI_NN_ACT_SIGMOID, + hsigmoid = VSI_NN_ACT_HARD_SIGMOID, +} gru_activation_type_e; + +#define STR(a) #a +// Add kernel hashtable here +#define GRUCELL_ACTIVATION_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, GATE_ACT, CAND_ACT, LAYER_OUT ) \ + ((uint64_t)IN0_DTYPE | ( (uint64_t)IN1_DTYPE << 8 ) | ( (uint64_t)IN2_DTYPE << 16 ) \ + | ( (uint64_t)OUT_DTYPE << 24 ) | ( (uint64_t)GATE_ACT << 32 ) \ + | ( (uint64_t)CAND_ACT << 40 ) | ( (uint64_t)LAYER_OUT << 48 )) + +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, GATE_ACT, CAND_ACT, LAYER_OUT ) \ +{ GRUCELL_ACTIVATION_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, GATE_ACT, CAND_ACT, LAYER_OUT ), \ + CVIVANTE_NAMESPACE("evis.grucell_activation_"#IN0_DTYPE"_"#IN1_DTYPE"_"#IN2_DTYPE"_to_"#OUT_DTYPE"_"#GATE_ACT), \ + _GRUCELL_ACTIVATION_KERNEL_SOURCE } + +#define PACK_KERNEL_CDNN_SEP_MAP( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, GATE_ACT, CAND_ACT, LAYER_OUT, SOURCE ) \ + { GRUCELL_ACTIVATION_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, GATE_ACT, CAND_ACT, LAYER_OUT ), \ + CVIVANTE_NAMESPACE("evis.grucell_activation_cdnn_sep_"#IN0_DTYPE"_"#IN1_DTYPE"_" \ + #IN2_DTYPE"_to_"#OUT_DTYPE"_"STR(LAYER_OUT)), \ + SOURCE } + +#define PACK_KERNEL_CDNN_MAP( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, GATE_ACT, CAND_ACT, LAYER_OUT, SOURCE ) \ + { GRUCELL_ACTIVATION_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, GATE_ACT, CAND_ACT, LAYER_OUT ), \ + CVIVANTE_NAMESPACE("evis.grucell_activation_cdnn_"#IN0_DTYPE"_"#IN1_DTYPE"_"#IN2_DTYPE"_to_"#OUT_DTYPE), \ + SOURCE } + +typedef struct +{ + uint64_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_activation_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, U8, U8, U8, sigmoid, VSI_NN_ACT_TANH, NC), + PACK_KERNEL_MAP( F16, F16, F16, F16, sigmoid, VSI_NN_ACT_TANH, NC), + PACK_KERNEL_MAP( F16, F16, F16, U8, sigmoid, VSI_NN_ACT_TANH, NC), + + PACK_KERNEL_MAP( U8, U8, U8, U8, sigmoid, VSI_NN_ACT_TANH, CN), + PACK_KERNEL_MAP( F16, F16, F16, F16, sigmoid, VSI_NN_ACT_TANH, CN), + PACK_KERNEL_MAP( F16, F16, F16, U8, sigmoid, VSI_NN_ACT_TANH, CN), + + PACK_KERNEL_MAP( U8, U8, U8, U8, hsigmoid, VSI_NN_ACT_TANH, NC), + PACK_KERNEL_MAP( F16, F16, F16, F16, hsigmoid, VSI_NN_ACT_TANH, NC), + PACK_KERNEL_MAP( F16, F16, F16, U8, hsigmoid, VSI_NN_ACT_TANH, NC), + + PACK_KERNEL_MAP( U8, U8, U8, U8, hsigmoid, VSI_NN_ACT_TANH, CN), + PACK_KERNEL_MAP( F16, F16, F16, F16, hsigmoid, VSI_NN_ACT_TANH, CN), + PACK_KERNEL_MAP( F16, F16, F16, U8, hsigmoid, VSI_NN_ACT_TANH, CN), +}; + +static const _kernel_map_type _grucell_cunn_sep_activation_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_CDNN_SEP_MAP( F16, F16, F16, F16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, NC, _CDNN_KERNEL_SOURCE0 ), + + PACK_KERNEL_CDNN_SEP_MAP( F16, F16, F16, F16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _CDNN_KERNEL_SOURCE0 ), + + 
PACK_KERNEL_CDNN_SEP_MAP( F16, F16, F16, F16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN_FULL, _CDNN_KERNEL_SOURCE0 ), + + PACK_KERNEL_CDNN_SEP_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, NC, _CDNN_KERNEL_SOURCE1 ), + + PACK_KERNEL_CDNN_SEP_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _CDNN_KERNEL_SOURCE1 ), + + PACK_KERNEL_CDNN_SEP_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN_FULL, _CDNN_KERNEL_SOURCE1 ), +}; + +static const _kernel_map_type _grucell_cunn_activation_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_CDNN_MAP( F16, F16, F16, F16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, NC, _CDNN_KERNEL_SOURCE0 ), + + PACK_KERNEL_CDNN_MAP( F16, F16, F16, F16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _CDNN_KERNEL_SOURCE0 ), + + PACK_KERNEL_CDNN_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, NC, _CDNN_KERNEL_SOURCE1 ), + + PACK_KERNEL_CDNN_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _CDNN_KERNEL_SOURCE1 ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _grucell_activation_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GRUCELL_ACTIVATION_PARAM_NUM _cnt_of_array( _grucell_activation_kernel_param_def ) + +static vx_param_description_t _grucell_activation_separated_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _GRUCELL_CDNN_SEP_ACTIVATION_PARAM_NUM _cnt_of_array( _grucell_activation_separated_kernel_param_def ) + +static vx_param_description_t _grucell_activation_cdnn_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_OUTPUT, 
VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _GRUCELL_CDNN_ACTIVATION_PARAM_NUM _cnt_of_array( _grucell_activation_cdnn_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_activation_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + float tensorScale[4] = {1.0f, 1.0f, 1.0f, 1.0f}; + float tensorZP[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + uint32_t i = 0; + uint32_t pack_key = 0; + vsi_int_array_t * output_shape = NULL; + vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL, NULL, NULL, NULL }; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + + for (i = 0; i < 4; i++) + { + if( attr[i]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[i]->quant == VSI_NN_KERNEL_QUANT_SYMM) + { + tensorZP[i] = (float)attr[i]->asymm.zero_point; + tensorScale[i] = attr[i]->asymm.scale; + } + } + + tensorZP[0] = tensorScale[0] * tensorZP[0]; + tensorZP[1] = tensorScale[1] * tensorZP[1]; + tensorZP[2] = tensorScale[2] * tensorZP[2]; + tensorScale[3] = 1.0f / tensorScale[3]; + + output_shape = attr[3]->shape; + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = + gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = output_shape->data[1]; + +#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (IN1_TYPE << 8) | (IN2_TYPE << 16) | ( OUT_TYPE << 24)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, + attr[2]->dtype, attr[3]->dtype ); + + switch (pack_key) + { + case _PACK_SELECT_KEY( F16, F16, F16, F16 ): + case _PACK_SELECT_KEY( F16, F16, F16, U8 ): + case _PACK_SELECT_KEY( U8, U8, U8, U8 ): + { + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvDatatoFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 
0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if (attr[3]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractHalf8_2x8 ); + } + else + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); + } + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvDatatoFp32_4x4", &uniConvDatatoFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "tensorZP", &tensorZP ); + status |= vsi_nn_kernel_gpu_add_param( node, + "tensorScale", &tensorScale ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + default: + break; + } + + status |= vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + for (i = 0; i < 4; i++) + { + if (attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + } + + return status; +} /* _grucell_activation_initializer() */ + +DEF_KERNEL_INITIALIZER(_grucell_activation_cdnn_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + uint32_t i = 0; + uint32_t pack_key = 0; + int32_t layer_out = 1; + int32_t input_size = 1; + int32_t batch = 1; + float input_scale = 1.0f; + float input_tail = 0; + float output_scale = 1.0f; + float output_zp = 0; + float input_r_scale = 1.0f; + float input_r_tail = 0; + float recur_r_scale = 1.0f; + float recur_r_tail = 0; + float input_z_scale = 1.0f; + float input_z_tail = 0; + float recur_z_scale = 1.0f; + float recur_z_tail = 0; + float input_c_scale = 1.0f; + float input_c_tail = 0; + float recur_c_scale = 1.0f; + float recur_c_tail = 0; + vsi_int_array_t * output_shape = NULL; + vsi_nn_kernel_tensor_attr_t * attr[8] = { NULL }; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + if ( param_size == _GRUCELL_CDNN_SEP_ACTIVATION_PARAM_NUM ) + { + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[param_size - 4] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + + attr[4] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[4], "Create tensor attr buffer fail.", final ); + attr[5] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[5] ); + CHECK_PTR_FAIL_GOTO( attr[5], "Create tensor attr buffer fail.", final ); + attr[6] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[6], "Create tensor attr buffer fail.", final ); + attr[7] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[6] ); + CHECK_PTR_FAIL_GOTO( attr[7], "Create tensor attr buffer fail.", final ); + } + else + { + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr 
buffer fail.", final ); + attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[param_size - 4] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 1], &layer_out); + CHECK_STATUS_FAIL_GOTO(status, final ); + + output_shape = attr[3]->shape; + + if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM ) + { + input_scale = attr[0]->asymm.scale; + input_tail = 0 - input_scale * (float)attr[0]->asymm.zero_point; + } + + if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM ) + { + input_r_scale = attr[1]->asymm.scale; + input_r_tail = 0 - input_r_scale * (float)attr[1]->asymm.zero_point; + } + + if( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM ) + { + recur_r_scale = attr[2]->asymm.scale; + recur_r_tail = 0 - recur_r_scale * (float)attr[2]->asymm.zero_point; + } + + if( attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[3]->quant == VSI_NN_KERNEL_QUANT_SYMM ) + { + output_scale = 1.0f / attr[3]->asymm.scale; + output_zp = (float)attr[3]->asymm.zero_point; + } + + if ( param_size == _GRUCELL_CDNN_SEP_ACTIVATION_PARAM_NUM ) + { + if( attr[4]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[4]->quant == VSI_NN_KERNEL_QUANT_SYMM ) + { + input_z_scale = attr[4]->asymm.scale; + input_z_tail = 0 - input_z_scale * (float)attr[4]->asymm.zero_point; + } + + if( attr[5]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[5]->quant == VSI_NN_KERNEL_QUANT_SYMM ) + { + recur_z_scale = attr[5]->asymm.scale; + recur_z_tail = 0 - recur_z_scale * (float)attr[5]->asymm.zero_point; + } + + if( attr[6]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[6]->quant == VSI_NN_KERNEL_QUANT_SYMM ) + { + input_c_scale = attr[6]->asymm.scale; + input_c_tail = 0 - input_c_scale * (float)attr[6]->asymm.zero_point; + } + + if( attr[5]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[5]->quant == VSI_NN_KERNEL_QUANT_SYMM ) + { + recur_c_scale = attr[7]->asymm.scale; + recur_c_tail = 0 - recur_c_scale * (float)attr[7]->asymm.zero_point; + } + } + + if (layer_out == 1 || layer_out == 2) + { + input_size = attr[1]->shape->data[0]; + batch = attr[1]->shape->data[1]; + } + else + { + input_size = output_shape->data[0]; + batch = output_shape->data[1]; + } + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = (input_size + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0]; + gpu_param.global_size[1] = batch; + +#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (IN1_TYPE << 8) | (IN2_TYPE << 16) | ( OUT_TYPE << 24)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, + attr[2]->dtype, attr[3]->dtype ); + + switch (pack_key) + { + case _PACK_SELECT_KEY( F16, F16, F16, F16 ): + { + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvDatatoFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 
0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uiF16AddF16_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractHalf8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvDatatoFp32_4x4", &uniConvDatatoFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uiF16AddF16_4x4", &uiF16AddF16_4x4 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( U8, U8, U8, U8 ): + { + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvDatatoFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvDatatoFp32_4x4", &uniConvDatatoFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_scale", &input_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_tail", &input_tail ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_r_scale", &input_r_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_r_tail", &input_r_tail ); + status |= vsi_nn_kernel_gpu_add_param( node, + "recur_r_scale", &recur_r_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "recur_r_tail", &recur_r_tail ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_z_scale", &input_z_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_z_tail", &input_z_tail ); + status |= vsi_nn_kernel_gpu_add_param( node, + "recur_z_scale", &recur_z_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "recur_z_tail", &recur_z_tail ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_z_scale", &input_z_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_c_scale", &input_c_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_c_tail", &input_c_tail ); + status |= vsi_nn_kernel_gpu_add_param( node, + "recur_c_scale", &recur_c_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "recur_c_tail", &recur_c_tail ); + status |= vsi_nn_kernel_gpu_add_param( node, + "output_scale", &output_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "output_zp", &output_zp ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + + status |= vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + for (i = 0; i < 8; i++) + { + if (attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + } + + return status; +} /* 
_grucell_activation_cdnn_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t gate_activation, + int32_t candidate_activation, + int32_t input_category, + int32_t input_layout, + int32_t use_cudnn, + int32_t* param_count, + int32_t* input_count, + int32_t* output_count + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e in2_dtype; + vsi_nn_kernel_dtype_e out_dtype; + batch_fisrt_layerout_e layer_out; + _kernel_map_type * kernel_map = (_kernel_map_type *)_grucell_activation_kernel_map; + vx_param_description_t * param_def = _grucell_activation_kernel_param_def; + vx_kernel_initialize_f initializer = _grucell_activation_initializer; + int32_t numParams = 0; + int32_t kernel_num = 0; + uint64_t key = 0; + int32_t i = 0; + + layer_out = input_layout == GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_NC ? NC : + input_layout == GRUCELL_ACTIVATION_INPUT_LAYOUT_INPUT_NC_FC_CN ? CN : CN_FULL; + + if (use_cudnn) + { + in0_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACTIVATION_INPUT_H_STATE]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_R]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (input_category == GRUCELL_INPUT_CATEGORY_CUDNN) + { + *param_count = _GRUCELL_CDNN_SEP_ACTIVATION_PARAM_NUM; + *input_count = 13; + *output_count = 2; + + numParams = _GRUCELL_CDNN_SEP_ACTIVATION_PARAM_NUM; + kernel_num = _cnt_of_array(_grucell_cunn_sep_activation_kernel_map); + kernel_map = (_kernel_map_type *)_grucell_cunn_sep_activation_kernel_map; + param_def = _grucell_activation_separated_kernel_param_def; + initializer = _grucell_activation_cdnn_initializer; + } + else + { + *param_count = _GRUCELL_CDNN_ACTIVATION_PARAM_NUM; + *input_count = 9; + *output_count = 2; + + numParams = _GRUCELL_CDNN_ACTIVATION_PARAM_NUM; + kernel_num = _cnt_of_array(_grucell_cunn_activation_kernel_map); + kernel_map = (_kernel_map_type *)_grucell_cunn_activation_kernel_map; + param_def = _grucell_activation_cdnn_kernel_param_def; + initializer = _grucell_activation_cdnn_initializer; + } + } + else + { + *param_count = _GRUCELL_ACTIVATION_PARAM_NUM; + *input_count = 3; + *output_count = 2; + + numParams = _GRUCELL_ACTIVATION_PARAM_NUM; + kernel_num = _cnt_of_array(_grucell_activation_kernel_map); + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + } + + key = GRUCELL_ACTIVATION_HASH_KEY( in0_dtype, in1_dtype, in2_dtype, out_dtype, + gate_activation, candidate_activation, layer_out ); + + for( i = 0; i < kernel_num; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_num ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)numParams; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + 
"vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t* node_params = NULL; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t** _inputs = NULL; + vsi_nn_tensor_t* _biases[6] = {NULL}; + vsi_nn_tensor_t* _fc_r[2] = {NULL}; + vsi_nn_tensor_t* cond_zeros = NULL; + int32_t i = 0; + int32_t j = 0; + int32_t k = 0; + int32_t input_size = inputs[0]->attr.size[0]; + int32_t batch = inputs[0]->attr.size[1]; + int32_t param_count = 0; + int32_t input_count = 0; + int32_t output_count = 0; + int32_t gate_activation = 0; + int32_t candidate_activation = 0; + int32_t input_category = vsi_nn_kernel_param_get_int32( params, "input_category" ); + int32_t use_cudnn = vsi_nn_kernel_param_get_int32( params, "use_cudnn_implementation" ); + int32_t input_layout = vsi_nn_kernel_param_get_int32( params, "input_layout" ); + + gate_activation = vsi_nn_kernel_param_get_int32( params, "gate_activation" ); + candidate_activation = vsi_nn_kernel_param_get_int32( params, "candidate_activation" ); + + if (use_cudnn && inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] == NULL) + { + input_category = GRUCELL_INPUT_CATEGORY_DEFAULT; + } + else if (use_cudnn && inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z]) + { + input_category = GRUCELL_INPUT_CATEGORY_CUDNN; + } + + status = _query_kernel( kernel, inputs, outputs, gate_activation, candidate_activation, + input_category, input_layout, use_cudnn, ¶m_count, &input_count, &output_count ); + + if( VSI_SUCCESS == status) + { + _inputs = (vsi_nn_tensor_t**)malloc(input_count * sizeof(vsi_nn_tensor_t**)); + node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count); + + if (use_cudnn) + { + vsi_nn_tensor_attr_t attr; + + if (input_category == GRUCELL_INPUT_CATEGORY_DEFAULT) + { + memcpy(&attr, &inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R]->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.size[0] = input_size; + attr.size[1] = 3 * batch; + attr.dim_num = 2; + + _fc_r[0] = vsi_nn_reshape_tensor(graph, + inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R], attr.size, attr.dim_num); + inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = _fc_r[0]; + _fc_r[1] = vsi_nn_reshape_tensor(graph, + inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_R], attr.size, attr.dim_num); + inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_R] = _fc_r[1]; + } + + memcpy(&attr, &inputs[GRUCELL_ACTIVATION_INPUT_BIAS_R]->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + for ( i = 0; i < 3; i++) + { + _biases[i] = vsi_nn_reshape_tensor(graph, + inputs[GRUCELL_ACTIVATION_INPUT_BIAS_R + i], attr.size, attr.dim_num); + inputs[GRUCELL_ACTIVATION_INPUT_BIAS_R + i] = _biases[i]; + } + + if(!inputs[GRUCELL_ACTIVATION_INPUT_COND_R] || !inputs[GRUCELL_ACTIVATION_INPUT_COND_Z] + || !inputs[GRUCELL_ACTIVATION_INPUT_COND_C]) + { + cond_zeros = vsi_nn_CreateTensorWithDefault(graph, &attr, 0.0); + } + for ( i = 0; i < 3; i++) + { + if (inputs[GRUCELL_ACTIVATION_INPUT_COND_R + i]) + { + /* Shader kernel cannot take 1-d tensors as inptus + reshape them to 2-d to workaround */ + if(1 == 
inputs[GRUCELL_ACTIVATION_INPUT_COND_R + i]->attr.dim_num) + { + _biases[i + 3] = vsi_nn_reshape_tensor(graph, + inputs[GRUCELL_ACTIVATION_INPUT_COND_R + i], attr.size, attr.dim_num); + inputs[GRUCELL_ACTIVATION_INPUT_COND_R + i] = _biases[i + 3]; + } + else + { + /* high level had done the workaround */ + } + } + else + { + inputs[GRUCELL_ACTIVATION_INPUT_COND_R + i] = cond_zeros; + } + } + } + + for(i = 0, k = 0; k < input_count; i++) + { + if (inputs[i]) + { + _inputs[k ++] = inputs[i]; + } + } + + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + j = input_count + output_count; + + vsi_nn_kernel_node_pack_io( node_params, param_count, + _inputs, input_count, outputs, output_num ); + /* Pass parameters to node. */ + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &gate_activation ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &candidate_activation ); + if( use_cudnn ) + { + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &input_layout ); + } + status = vsi_nn_kernel_node_pass_param( node, node_params, param_count ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + if( use_cudnn ) + { + vsi_nn_kernel_scalar_release( &node_params[--j] ); + } + } + if(cond_zeros) + { + vsi_nn_ReleaseTensor(&cond_zeros); + } + + for ( i = 0; i < 6; i++) + { + if (_biases[i]) + { + vsi_nn_ReleaseTensor(&_biases[i]); + } + } + + for ( i = 0; i < 2; i++) + { + if (_fc_r[i]) + { + vsi_nn_ReleaseTensor(&_fc_r[i]); + } + } + } + + vsi_nn_safe_free(_inputs); + vsi_nn_safe_free(node_params); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( grucell_activation, _setup ) + + diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c new file mode 100644 index 0000000..65b1767 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c @@ -0,0 +1,375 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +typedef enum _internal_img_dim_e +{ + IMAGE = 0, + IMAGE_2D, +} internal_img_dim_e; + +#define _A_GRUCELL_ACTIVATION_SMA_KERNEL_SOURCE "grucell_activation_sma" + +#define STR(a) #a + +// Add kernel hashtable here +#define A_GRUCELL_ACTIVATION_SMA_HASH_KEY(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, _image_2d) \ + ((IN2_DTYPE << 24) | (IN1_DTYPE << 16) | ( IN0_DTYPE << 8 ) | ( OUT_DTYPE << 1) | (_image_2d)) + +#define A_GRUCELL_ACTIVATION_SMA_SH_KERNEL_NAME(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("evis.grucell_activation_sma_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE) \ + "_"STR(IN2_DTYPE)"to"STR(OUT_DTYPE)) + +#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE) \ + { A_GRUCELL_ACTIVATION_SMA_HASH_KEY(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, IMAGE), \ + A_GRUCELL_ACTIVATION_SMA_SH_KERNEL_NAME(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE), \ + _A_GRUCELL_ACTIVATION_SMA_KERNEL_SOURCE} + +#define A_GRUCELL_ACTIVATION_SMA_SH_KERNEL_2D_NAME(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("evis.grucell_activation_sma_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE) \ + "_"STR(IN2_DTYPE)"to"STR(OUT_DTYPE)"_2D") + +#define PACK_KERNEL_MAP_2D(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE) \ + { A_GRUCELL_ACTIVATION_SMA_HASH_KEY(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, IMAGE_2D), \ + A_GRUCELL_ACTIVATION_SMA_SH_KERNEL_2D_NAME(IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE), \ + _A_GRUCELL_ACTIVATION_SMA_KERNEL_SOURCE} + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_activation_sma_kernel_map[] = +{ + PACK_KERNEL_MAP(F16, F16, F16, F16), + + PACK_KERNEL_MAP_2D(F16, F16, F16, F16), +}; + +/* + * Kernel params + */ +static vx_param_description_t _grucell_activation_sma_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _A_GRUCELL_ACTIVATION_SMA_PARAM_NUM _cnt_of_array( _grucell_activation_sma_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ +#define _PACK_A_GRUCELL_ACTIVATION_SMA_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \ + (( IN1_TYPE << 24) | ( IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE)) + vsi_status status = VX_SUCCESS; + // Alignment with a power of two value. 
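+    // Each work-item processes 8 elements along dim0; the global size is rounded up with gpu_align_p2 below.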
+ gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vx_tensor input0 = (vx_tensor)param[0]; + vx_tensor input1 = (vx_tensor)param[1]; + vx_tensor input2 = (vx_tensor)param[2]; + vx_tensor output = (vx_tensor)param[3]; + uint32_t i = 0; + + vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL, NULL, NULL, NULL }; + vsi_int_array_t *output_shape = NULL; + uint32_t pack_key = 0; + + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); + CHECK_PTR_FAIL_GOTO( attr[0], "vsi_nn_kernel_tensor_attr_create fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1); + CHECK_PTR_FAIL_GOTO( attr[1], "vsi_nn_kernel_tensor_attr_create fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input2); + CHECK_PTR_FAIL_GOTO( attr[2], "vsi_nn_kernel_tensor_attr_create fail.", final ); + attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); + CHECK_PTR_FAIL_GOTO( attr[3], "vsi_nn_kernel_tensor_attr_create fail.", final ); + + + pack_key = _PACK_A_GRUCELL_ACTIVATION_SMA_KEY( attr[0]->dtype, attr[1]->dtype, attr[2]->dtype, attr[3]->dtype ); + + output_shape = attr[3]->shape; + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = output_shape->data[1]; + gpu_param.global_size[2] = output_shape->size > 2 ? output_shape->data[2] : 1; + + + switch( pack_key ) + { + case _PACK_A_GRUCELL_ACTIVATION_SMA_KEY( F16, F16, F16, F16 ): + { + gpu_dp_inst_t uniA_Times_B_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x11111111, // BSelt + 0x03020100, 0x07060504, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniA_Plus_B_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x33221100, 0x77665544, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniA_Minus_B_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x33221100, 0x77665544, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniA_Times_B_2x8", &uniA_Times_B_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniA_Plus_B_2x8", &uniA_Plus_B_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniA_Minus_B_2x8", &uniA_Minus_B_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + for ( i = 0; i < 4; i++) + { + if (attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + } + +#undef _PACK_A_GRUCELL_ACTIVATION_SMA_KEY + return status; +} /* _grucell_activation_sma_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + 
vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e in2_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _grucell_activation_sma_kernel_map; + size_t kernel_map_size = _cnt_of_array( _grucell_activation_sma_kernel_map ); + vx_param_description_t * param_def = _grucell_activation_sma_kernel_param_def; + size_t param_def_size = _cnt_of_array( _grucell_activation_sma_kernel_param_def ); + vx_kernel_initialize_f initializer = _grucell_activation_sma_initializer; + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = A_GRUCELL_ACTIVATION_SMA_HASH_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype, image_2d); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (2) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_A_GRUCELL_ACTIVATION_SMA_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + int32_t* shapes_in[_INPUT_NUM]; + size_t rank_in[_INPUT_NUM]; + int32_t* shapes_ptr[_IO_NUM]; + int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + uint32_t new_rank = 0; + int32_t i = 0; + vsi_bool ret = FALSE; + vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + + for (i = 0; i < _IO_NUM; i++) + { + shapes_ptr[i] = shapes[i]; + } + + for (i = 0; i < _INPUT_NUM; i++) + { + shapes_in[i] = (int32_t *)inputs[i]->attr.size; + rank_in[i] = (size_t)inputs[i]->attr.dim_num; + } + + ret = vsi_nn_kernel_optimize_broadcast_shape( + (const int32_t**)shapes_in, (const size_t*)rank_in, _INPUT_NUM, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes_ptr, shapes[_INPUT_NUM], &new_rank); + + if( ret ) + { + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = vsi_nn_reshape_tensor( graph, + inputs[i], (uint32_t*)shapes[i], new_rank ); + } + + for (i = 0; i < _OUTPUT_NUM; i++) + { + reshape_tensors[i + _INPUT_NUM] = vsi_nn_reshape_tensor( graph, + outputs[i], (uint32_t*)shapes[_INPUT_NUM], new_rank ); + } + } + else + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[_INPUT_NUM]->attr.size, + reshape_tensors[_INPUT_NUM]->attr.dim_num ) ) + 
{ + return NULL; + } + + image_2d = (reshape_tensors[_INPUT_NUM]->attr.dim_num == 2 || reshape_tensors[_INPUT_NUM]->attr.size[2] == 1); + status = _query_kernel( kernel, reshape_tensors, &reshape_tensors[_INPUT_NUM], image_2d); + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _A_GRUCELL_ACTIVATION_SMA_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _A_GRUCELL_ACTIVATION_SMA_PARAM_NUM ); + } + } + + for (i = 0; i < _IO_NUM; i++) + { + if (reshape_tensors[i]) + { + vsi_nn_ReleaseTensor( &reshape_tensors[i] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( grucell_activation_sma, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c new file mode 100644 index 0000000..b1f413c --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -0,0 +1,1052 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
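+ * Instance normalization is split into two internal kernels: a mean/variance reduction pass
+ * followed by the normalization pass.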
+ */ + +typedef enum +{ + INTERNAL_KERNEL_MEAN_VARI, + INTERNAL_KERNEL_NORM, +} _internal_kernel_e; + +#define KERNEL_SOURCE_1 "instance_normalization_i8" +#define KERNEL_SOURCE_2 "instance_normalization_u8" +#define KERNEL_SOURCE_3 "instance_normalization_i16" +#define KERNEL_SOURCE_4 "instance_normalization_f16" + +#define HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.instance_norm_meanvari_"#SRC0_TYPE) + +#define HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_2D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.instance_norm_meanvari_"#SRC0_TYPE"_2D") + +#define HASH_INSTANCENORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_INSTANCENORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +// Add kernel hashtable here +// mean vari +#define HASH_INSTANCENORM_MEAN_VARI_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_NAME(IN0_TYPE), \ + SOURCE }, + +#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_2D_NAME(IN0_TYPE), \ + SOURCE }, + +// normalization +#define HASH_INSTANCENORM_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_INSTANCENORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_INSTANCENORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_INSTANCENORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_INSTANCENORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _instancenorm_mean_vari_kernel_map[] = +{ + // Register kernel here + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( I8, F32, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I8, F32, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( U8, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( U8, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( I16, F32, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I16, F32, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F16, F32, KERNEL_SOURCE_4 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F16, F32, KERNEL_SOURCE_4 ) +}; + +static const _kernel_map_type _instancenorm_kernel_map[] = +{ + // Register kernel here + TENSOR_INSTANCENORM_KERNELS( I8, I8, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_KERNELS_2D( I8, I8, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_KERNELS( I8, F16, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_KERNELS_2D( I8, F16, KERNEL_SOURCE_1 ) + + TENSOR_INSTANCENORM_KERNELS( U8, U8, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_2 ) + + TENSOR_INSTANCENORM_KERNELS( I16, I16, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_KERNELS( I16, F16, 
KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_KERNELS_2D( I16, F16, KERNEL_SOURCE_3 ) + + TENSOR_INSTANCENORM_KERNELS( F16, F16, KERNEL_SOURCE_4 ) + TENSOR_INSTANCENORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_4 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _INSTANCENORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _instancenorm_mean_vari_kernel_param_def ) + +static vx_param_description_t _instancenorm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _INSTANCENORM_PARAM_NUM _cnt_of_array( _instancenorm_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + vsi_int_array_t * input_shape = NULL; + float scaleIn = 0; + int32_t input_zp = 0; + vx_uint32 iter = 0; + int32_t sumInZp = 0; + int32_t tmpZp1 = 0; + float tmpZp2 = 0; + float e2InScale = 0; + float rowSumScale = 0; + int32_t rsFlg = 0; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + float in_scale_fl = 1, inFlScale_s2 = 1; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &rsFlg); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + input_shape = attr[0]->shape; + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + + if(attr[0]->dtype == I8 || attr[0]->dtype == I16) + { + if (attr[0]->dfp.fl > 0) + { + in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + inFlScale_s2 = in_scale_fl * in_scale_fl; + } + + width = input_shape->data[0]; + height = input_shape->data[1]; + chn = attr[1]->shape->data[1]; + if(rsFlg) + { + height = height / chn; + } + iter = height * 16; + + if(attr[0]->dtype == U8) + { + sumInZp = input_zp * iter * (-1); + tmpZp1 = (-2) * input_zp; + e2InScale = scaleIn * scaleIn; + tmpZp2 = input_zp * input_zp * e2InScale; + rowSumScale = height * 16 * tmpZp2; + } + + shaderParam.global_scale[0] = 1; + 
shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.local_size[0] = 16; + shaderParam.local_size[1] = 1; + shaderParam.local_size[2] = 1; + + if(attr[0]->dtype == I8 || attr[0]->dtype == U8) + { + shaderParam.global_size[0] = (width + 255) / 256 * 16; + } + else if(attr[0]->dtype == I16 || attr[0]->dtype == F16) + { + shaderParam.global_size[0] = (width + 127) / 128 * 16; + } + shaderParam.global_size[1] = chn; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + if(attr[0]->dtype == U8) + { + gpu_dp_inst_t uniSumU8_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSqrSum_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x55555555, // BSelt + 0x76543210, 0xfedcba98, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if(attr[0]->dtype == I8) + { + gpu_dp_inst_t uniSumInt8_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSqrSumInt8_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x55555555, // BSelt + 0x76543210, 0xfedcba98, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniSumInt8_16x1", &uniSumInt8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSumInt8_16x1", &uniSqrSumInt8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if(attr[0]->dtype == I16) + { + gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2); + status |= 
vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if(attr[0]->dtype == F16) + { + gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} + +DEF_KERNEL_INITIALIZER(_instancenorm_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; + vsi_int_array_t * input_shape = NULL; + float scaleIn = 0; + float scaleOut = 0; + float reScaleOut_u8 = 0; + float scale_inOut = 0; + int32_t output_zp = 0; + int32_t input_zp = 0; + float in_scale_fl = 1, out_scale_fl = 1, inOut_fl_scale = 1; + float dimRatio = 0; + vx_uint32 group_num = 0; + vx_int32 height = 0, width = 0, chn = 0; + int32_t rsFlg = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &rsFlg); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + input_shape = attr[0]->shape; + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + output_zp = attr[2]->asymm.zero_point; + scaleOut = attr[2]->asymm.scale; + + if(attr[0]->dtype == I8 || attr[0]->dtype == I16) + { + if (attr[0]->dfp.fl > 0) + { + in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + } + + if(attr[2]->dtype == I8 || attr[2]->dtype == I16) + { + if (attr[2]->dfp.fl > 0) + { + out_scale_fl = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + out_scale_fl = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + } + + if((attr[2]->dtype == I8 || attr[2]->dtype == I16) + && (attr[0]->dtype == I8 || attr[0]->dtype == I16)) + { + inOut_fl_scale = in_scale_fl * out_scale_fl; + } + + width = input_shape->data[0]; + height = 
input_shape->data[1]; + chn = attr[1]->shape->data[1]; + if(rsFlg) + { + height = height / chn; + } + + if(attr[2]->dtype == U8) + { + reScaleOut_u8 = 1 / scaleOut; + } + dimRatio = (float)(1.0 / (width * height)); + + group_num = (width + 255) / 256; + + shaderParam.global_scale[0] = 16; + if(attr[0]->dtype == I16 || attr[0]->dtype == F16) + { + shaderParam.global_scale[0] = 8; + group_num = (width + 127) / 128; + } + + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = (chn + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1]; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertEndInt16Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert3rdUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00090008, 0x000b000a, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert4thUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x000d000c, 0x000f000e, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt16Fp32Fst_4x4 = {{ + 
0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt16Fp32Secd_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toInt16_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertDirUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertEndUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertTrdUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00090008, 0x000b000a, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertFthUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x000d000c, 0x000f000e, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + + uint32_t pack_key = 0; +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (OUT_TYPE << 8)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); + + status = vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); + status |= vsi_nn_kernel_gpu_add_param(node, "group_num", &group_num); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", &uniConvertHalfToFp16_2x8); + 
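The switch that follows dispatches on `pack_key`, which `_PACK_SELECT_KEY(in, out)` builds as `in | (out << 8)` from the input and output dtype enums, so each quantization combination can select its own set of DP uniforms. A small self-contained illustration of that dispatch pattern; the enum values below are made-up stand-ins for `vsi_nn_kernel_dtype_e`, not the real definitions:

```c
#include <stdint.h>
#include <stdio.h>

enum { I8 = 1, U8 = 2, I16 = 3, F16 = 4 };   /* illustrative values only */

#define PACK_SELECT_KEY(in, out)  ((uint32_t)(in) | ((uint32_t)(out) << 8))

int main(void)
{
    uint32_t key = PACK_SELECT_KEY(U8, F16);
    switch (key)
    {
        case PACK_SELECT_KEY(U8, U8):
        case PACK_SELECT_KEY(U8, F16):
            printf("asymmetric uint8 input path\n");
            break;
        case PACK_SELECT_KEY(I16, I16):
        case PACK_SELECT_KEY(I16, F16):
            printf("dynamic fixed point int16 input path\n");
            break;
        case PACK_SELECT_KEY(F16, F16):
            printf("pure fp16 path\n");
            break;
        default:
            printf("unsupported combination\n");
            break;
    }
    return 0;
}
```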
CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( I8, I8 ): + case _PACK_SELECT_KEY( I8, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertDirInt8Fp32_4x4", + &uniConvertDirUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt8Fp32_4x4", + &uniConvertEndUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertTrdInt8Fp32_4x4", + &uniConvertTrdUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFthInt8Fp32_4x4", + &uniConvertFthUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + + status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl); + status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, U8 ): + case _PACK_SELECT_KEY( U8, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", + &uniConvert3rdUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", + &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &reScaleOut_u8); + + scale_inOut = reScaleOut_u8 * scaleIn; + status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, I16 ): + case _PACK_SELECT_KEY( I16, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4", + &uniConvertInt16Fp32Fst_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4", + &uniConvertInt16Fp32Secd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8", + &uniConvertInt32toInt16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl); + + status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt16Fp32_4x4", + &uniConvertEndInt16Fp32_4x4); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } +#undef _PACK_SELECT_KEY + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + const uint32_t hashkey, + 
_internal_kernel_e kernel_id + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vx_kernel_initialize_f initializer = NULL; + vx_param_description_t * param_def = NULL; + const _kernel_map_type* kernel_map; + size_t kernel_map_size = 0; + size_t param_size = 0; + uint32_t i = 0; + + switch( kernel_id ) + { + case INTERNAL_KERNEL_MEAN_VARI: + initializer = _instancenorm_mean_vari_initializer; + kernel_map = _instancenorm_mean_vari_kernel_map; + kernel_map_size = _cnt_of_array( _instancenorm_mean_vari_kernel_map ); + param_def = _instancenorm_mean_vari_kernel_param_def; + param_size = _INSTANCENORM_MEAN_VARI_PARAM_NUM; + break; + case INTERNAL_KERNEL_NORM: + initializer = _instancenorm_initializer; + kernel_map = _instancenorm_kernel_map; + kernel_map_size = _cnt_of_array( _instancenorm_kernel_map ); + param_def = _instancenorm_kernel_param_def; + param_size = _INSTANCENORM_PARAM_NUM; + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == hashkey ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ +#define INTERNAL_KERNEL_SIZE (1) +#define MEAN_VARI_INDEX (0) + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t mean_vari_node_params[_INSTANCENORM_MEAN_VARI_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_INSTANCENORM_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t tmp_node = NULL; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_dtype_e in0_dtype = U8; + vsi_nn_kernel_dtype_e out_dtype = U8; + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; + uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; + uint32_t hashkey = 0; + int32_t i = 0; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + int32_t reshape_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" ); + + // Check if gpu can support the size + if( !vsi_nn_kernel_gpu_check_shape( + (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + // Assign unique_id + ikernels[i]->unique_id = kernel->unique_id; + } + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + + attr.size[0] = ((inputs[0]->attr.size[0] + 255) / 256) * 4; + + if( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 + || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) + { + attr.size[0] = 
((inputs[0]->attr.size[0] + 127) / 128) * 4; + } + attr.size[1] = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + attr.size[2] = 1; + attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + attr.dim_num = 4; + tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_MEAN_VARI_KEY( in0_dtype, F32, reshape_flg ); + hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg ); + + status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); + if( VSI_SUCCESS != status ) + { + goto final; + } + status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM ); + if( VSI_SUCCESS != status ) + { + goto final; + } + + if(reshape_flg) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[0]->attr.size[0]; + shape[1] = inputs[0]->attr.size[1] * inputs[0]->attr.size[2]; + shape[2] = 1; + shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); + + shape[0] = outputs[0]->attr.size[0]; + shape[1] = outputs[0]->attr.size[1] * outputs[0]->attr.size[2]; + shape[2] = 1; + shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; + rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); + } + if(inputs[1]->attr.dim_num < 2) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[1]->attr.size[0]; + shape[1] = 1; + shape[2] = 1; + shape[3] = 1; + rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 ); + } + if(inputs[2]->attr.dim_num < 2) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[2]->attr.size[0]; + shape[1] = 1; + shape[2] = 1; + shape[3] = 1; + rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 4 ); + } + // Mean Vari + { + tmp_node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); + if(tmp_node) + { + uint32_t index = 0; + if(reshape_flg) + { + mean_vari_node_params[index++] = rs_input; + vsi_nn_kernel_node_pack_io( &mean_vari_node_params[index], + _INSTANCENORM_MEAN_VARI_PARAM_NUM, NULL, 0, tensors, 1 ); + } + else + { + vsi_nn_kernel_node_pack_io( mean_vari_node_params, + _INSTANCENORM_MEAN_VARI_PARAM_NUM, inputs, 1, tensors, 1 ); + } + index = 2; + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); + + status = vsi_nn_kernel_node_pass_param( tmp_node, mean_vari_node_params, + _INSTANCENORM_MEAN_VARI_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] ); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] ); + { + // Set default border mode. 
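+                // For VSI_NN_TYPE_UINT8 inputs the constant border is set to the
+                // tensor's zero point below, so any out-of-bounds pixel the shader
+                // reads dequantizes to 0.0f.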
+ vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if(inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + } + + // Nomalization + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if(node) + { + uint32_t index = 0; + if(reshape_flg) + { + node_params[index++] = rs_input; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; + } + if(inputs[1]->attr.dim_num < 2) + { + node_params[index++] = rs_beta; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + } + if(inputs[2]->attr.dim_num < 2) + { + node_params[index++] = rs_gamma; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; + } + node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; + if(reshape_flg) + { + node_params[index++] = rs_output; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; + } + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _INSTANCENORM_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if(outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)outputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + } + + /* Pass parameters to node. 
*/ +final: + if(rs_beta) + { + vsi_nn_kernel_tensor_release( &rs_beta ); + } + if(rs_gamma) + { + vsi_nn_kernel_tensor_release( &rs_gamma ); + } + if(reshape_flg) + { + vsi_nn_kernel_tensor_release( &rs_input ); + vsi_nn_kernel_tensor_release( &rs_output ); + } + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + if( ikernels[i] ) + { + vsi_nn_kernel_release( &ikernels[i] ); + } + if( tensors[i] ) + { + vsi_nn_ReleaseTensor( &tensors[i] ); + } + } + if(tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( instance_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c new file mode 100644 index 0000000..095edb1 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c @@ -0,0 +1,576 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 28) | (IN1_DTYPE << 20) | (IN0_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + #define HASH_L2NORMALIZESCALE_KERNEL_SOURCE_NAME(AXIS) \ + "l2normalizescale_axis"#AXIS + +#define HASH_L2NORMALIZESCALE_KERNELS_2D( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("evis.l2normalizescale_axis"#AXIS"_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D"), \ + HASH_L2NORMALIZESCALE_KERNEL_SOURCE_NAME(AXIS) }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _l2normalizescale_kernel_map[] = +{ + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, F16, F16, F16 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, I8 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, F16 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, U8 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, F16 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, I16 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, F16 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, F16, F16, F16 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, I8 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, F16 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, U8 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, F16 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, I16 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, F16 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _l2normalizescale_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _L2NORMALIZESCALE_PARAM_NUM _cnt_of_array( _l2normalizescale_kernel_param_def ) + +#define SCALAR_INPUT_AXIS (3) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + int32_t axis = 0; + vsi_nn_kernel_tensor_attr_t *input_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t * output_shape = NULL; + vsi_nn_kernel_dtype_e input_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; + int32_t input_fl = 0; + int32_t inputZP = 0; + float inputScale = 1.0f; + int32_t output_fl = 0; + int32_t outputZP = 0; + float outputScale = 1.0f; + float r_inputScale = 1.0f; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + status = 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + output_shape = output_attr->shape; + input_dtype = input_attr->dtype; + output_dtype = output_attr->dtype; + + if ( VSI_NN_KERNEL_QUANT_DFP == input_attr->quant ) + { + input_fl = input_attr->dfp.fl; + if (input_fl >= 0) + { + inputScale = 1.0f / (float) ((int64_t)1 << input_fl); + } + else + { + inputScale = (float) ((int64_t)1 << -input_fl); + } + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) + { + inputZP = input_attr->asymm.zero_point; + inputScale = input_attr->asymm.scale; + } + + if ( VSI_NN_KERNEL_QUANT_DFP == output_attr->quant ) + { + output_fl = output_attr->dfp.fl; + if (output_fl >= 0) + { + outputScale = (float) ((int64_t)1 << output_fl); + } + else + { + outputScale = 1.0f / (float) ((int64_t)1 << -output_fl); + } + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) + { + outputZP = output_attr->asymm.zero_point; + outputScale = 1.0f / output_attr->asymm.scale; + } + + + r_inputScale = 1.0f / inputScale; + + if (1 == axis) + { + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = 1; + } + else if (0 == axis) + { + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 1; + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.global_size[0] = 16; + gpu_param.global_size[1] = output_shape->data[1]; + } + else + { + status = VSI_FAILURE; + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + { + gpu_dp_inst_t UniFp16MulLo_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x01010101, // BSelt + 0x00010000, 0x00030002, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t UniFp16MulHi_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x01010101, // BSelt + 0x00050004, 0x00070006, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniIntegerSquareLo_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00010000, 0x00030002, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniIntegerSquareHi_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x00000000, // BSelt + 0x00050004, 0x00070006, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDataSquareAddU32Lo_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x00000000, // BSelt + 0x00010000, 0x00030002, // BBin + 0x00005400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 
0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDataSquareAddU32Hi_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00150004, 0x00370026, // ABin + 0x00000000, // BSelt + 0x00050004, 0x00070006, // BBin + 0x00005400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniUInt8SquareLo_4x4 = {{ + 0x69696969, // TCfg + 0x40404040, // ASelt + 0x01110000, 0x03330222, // ABin + 0x54545454, // BSelt + 0x00010000, 0x00030002, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniUInt8SquareHi_4x4 = {{ + 0x69696969, // TCfg + 0x40404040, // ASelt + 0x05550444, 0x07770666, // ABin + 0x54545454, // BSelt + 0x00050004, 0x00070006, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniSumSqrt_16x1 = {{ + 0x55555555, // TCfg + 0x55550000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x55550000, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniSumAll_16x1 = {{ + 0x55555555, // TCfg + 0x55550000, // ASelt + 0x76543210, 0x76543210, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + + if (1 == axis) + { + int32_t L2NorS_depth = output_shape->data[1]; + status = vsi_nn_kernel_gpu_add_param( node, "L2NorS_depth", &L2NorS_depth); + if(F16 == input_dtype) + { + status |= vsi_nn_kernel_gpu_add_param( node, "UniFp16MulLo_dp4x4", &UniFp16MulLo_dp4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "UniFp16MulHi_dp4x4", &UniFp16MulHi_dp4x4); + } + else if(I8 == input_dtype) + { + status |= vsi_nn_kernel_gpu_add_param( node, "r_inputScale", &r_inputScale); + status |= vsi_nn_kernel_gpu_add_param( node, "uniDataSquareAddU32Lo_4x4", &uniDataSquareAddU32Lo_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniDataSquareAddU32Hi_4x4", &uniDataSquareAddU32Hi_4x4); + } + else if(I16 == input_dtype) + { + status |= vsi_nn_kernel_gpu_add_param( node, "r_inputScale", &r_inputScale); + status |= vsi_nn_kernel_gpu_add_param( node, "uniIntegerSquareLo_4x4", &uniIntegerSquareLo_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniIntegerSquareHi_4x4", &uniIntegerSquareHi_4x4); + } + else if(U8 == input_dtype) + { + status |= vsi_nn_kernel_gpu_add_param( node, "r_inputScale", &r_inputScale); + status |= vsi_nn_kernel_gpu_add_param( node, "inputZP", &inputZP); + status |= vsi_nn_kernel_gpu_add_param( node, "uniUInt8SquareLo_4x4", &uniUInt8SquareLo_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniUInt8SquareHi_4x4", &uniUInt8SquareHi_4x4); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (0 == axis) + { + int32_t inputWidth, inputWidthCount, inputWidthRemain256; + inputWidth = output_shape->data[0]; + inputWidthRemain256 = output_shape->data[0] % 256; + inputWidthCount = 
output_shape->data[0] / 256; + vsi_nn_kernel_gpu_add_param( node, "inputWidth", &inputWidth); + vsi_nn_kernel_gpu_add_param( node, "inputWidthRemain256", &inputWidthRemain256); + vsi_nn_kernel_gpu_add_param( node, "inputWidthCount", &inputWidthCount); + vsi_nn_kernel_gpu_add_param( node, "uniSumSqrt_16x1", &uniSumSqrt_16x1); + if (I16 == input_dtype || I8 == input_dtype) + { + status = vsi_nn_kernel_gpu_add_param( node, "r_inputScale", &r_inputScale); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if(U8 == input_dtype) + { + float zP2x = 2 * (float)inputZP; + float zpSqrt16x = 16 * (float)inputZP * (float)inputZP; + status = vsi_nn_kernel_gpu_add_param( node, "r_inputScale", &r_inputScale); + status |= vsi_nn_kernel_gpu_add_param( node, "zP2x", &zP2x); + status |= vsi_nn_kernel_gpu_add_param( node, "zpSqrt16x", &zpSqrt16x); + status |= vsi_nn_kernel_gpu_add_param( node, "uniSumAll_16x1", &uniSumAll_16x1); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + } + + { + float IntergerScale = inputScale; + float output_ZP = (float)outputZP; + gpu_dp_inst_t uniExtact8Bin_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDataSubZPtoFp32Part0_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDataSubZPtoFp32Part1_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniFp16toFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniFp16toFp32Hi_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + IntergerScale = IntergerScale * outputScale; + + status = vsi_nn_kernel_gpu_add_param( node, "IntergerScale", &IntergerScale); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &output_ZP); + status |= vsi_nn_kernel_gpu_add_param( node, "inputZP", &inputZP); + status |= 
vsi_nn_kernel_gpu_add_param( node, "uniDataSubZPtoFp32Part0_4x4", &uniDataSubZPtoFp32Part0_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniDataSubZPtoFp32Part1_4x4", &uniDataSubZPtoFp32Part1_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_4x4", &uniFp16toFp32_4x4); + if (0 == axis) + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32Hi_4x4", &uniFp16toFp32Hi_4x4); + } + + if(F16 == output_dtype) + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bin_2x8", &uniExtractHalf8_2x8); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bin_2x8", &uniExtact8Bin_2x8); + } + + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + + return status; + +} /* _l2normalizescale_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _l2normalizescale_kernel_map; + size_t kernel_map_size = _cnt_of_array( _l2normalizescale_kernel_map ); + vx_param_description_t * param_def = _l2normalizescale_kernel_param_def; + size_t param_def_size = _cnt_of_array( _l2normalizescale_kernel_param_def ); + vx_kernel_initialize_f initializer = _l2normalizescale_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_L2NORMALIZESCALE_HASH_KEY(axis, in0_dtype, in1_dtype, out_dtype, image_2d); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_L2NORMALIZESCALE_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 
1); + status = _query_kernel( kernel, inputs, outputs, axis, image_2d ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = 0; + border.constant_value.S16 = 0; + border.constant_value.U8 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vsi_nn_kernel_node_set_border( node, &border ); + VSI_ASSERT( status == VSI_SUCCESS ); + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _L2NORMALIZESCALE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _L2NORMALIZESCALE_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( l2normalizescale, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c new file mode 100644 index 0000000..bfe4a96 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c @@ -0,0 +1,569 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +#define HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + #define HASH_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_suffix) \ + "log_softmax_axis"#_suffix + +#define HASH_LOG_SOFTMAX_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, _suffix) \ + { HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.log_softmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ + HASH_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_suffix) }, + +#define HASH_LOG_SOFTMAX_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, _suffix ) \ + { HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("evis.log_softmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + HASH_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_suffix) }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } _log_softmax_evis_kernel_map[] = +{ + HASH_LOG_SOFTMAX_KERNELS(0, F16, F16, 0) + HASH_LOG_SOFTMAX_KERNELS(0, F16, I16, 0) + HASH_LOG_SOFTMAX_KERNELS(0, F16, U8, 0) + HASH_LOG_SOFTMAX_KERNELS(0, F16, I8, 0) + HASH_LOG_SOFTMAX_KERNELS(0, I16, I16, 0) + HASH_LOG_SOFTMAX_KERNELS(0, I16, F16, 0) + HASH_LOG_SOFTMAX_KERNELS(0, BF16, BF16, 0_BF16) + HASH_LOG_SOFTMAX_KERNELS(0, BF16, F32, 0_BF16) + HASH_LOG_SOFTMAX_KERNELS(0, BF16, F16, 0_BF16) + HASH_LOG_SOFTMAX_KERNELS(0, U8, U8, 0) + HASH_LOG_SOFTMAX_KERNELS(0, U8, F16, 0) + HASH_LOG_SOFTMAX_KERNELS(0, I8, I8, 0) + HASH_LOG_SOFTMAX_KERNELS(0, I8, F16, 0) + HASH_LOG_SOFTMAX_KERNELS(1, F16, F16, 1) + HASH_LOG_SOFTMAX_KERNELS(1, F16, I16, 1) + HASH_LOG_SOFTMAX_KERNELS(1, F16, U8, 1) + HASH_LOG_SOFTMAX_KERNELS(1, F16, I8, 1) + HASH_LOG_SOFTMAX_KERNELS(1, I16, I16, 1) + HASH_LOG_SOFTMAX_KERNELS(1, I16, F16, 1) + HASH_LOG_SOFTMAX_KERNELS(1, BF16, BF16, 1_BF16) + HASH_LOG_SOFTMAX_KERNELS(1, BF16, F32, 1_BF16) + HASH_LOG_SOFTMAX_KERNELS(1, BF16, F16, 1_BF16) + HASH_LOG_SOFTMAX_KERNELS(1, U8, U8, 1) + HASH_LOG_SOFTMAX_KERNELS(1, U8, F16, 1) + HASH_LOG_SOFTMAX_KERNELS(1, I8, I8, 1) + HASH_LOG_SOFTMAX_KERNELS(1, I8, F16, 1) + HASH_LOG_SOFTMAX_KERNELS(2, F16, F16, 2) + HASH_LOG_SOFTMAX_KERNELS(2, F16, I16, 2) + HASH_LOG_SOFTMAX_KERNELS(2, F16, U8, 2) + HASH_LOG_SOFTMAX_KERNELS(2, F16, I8, 2) + HASH_LOG_SOFTMAX_KERNELS(2, I16, I16, 2) + HASH_LOG_SOFTMAX_KERNELS(2, I16, F16, 2) + HASH_LOG_SOFTMAX_KERNELS(2, BF16, BF16, 2) + HASH_LOG_SOFTMAX_KERNELS(2, U8, U8, 2) + HASH_LOG_SOFTMAX_KERNELS(2, U8, F16, 2) + HASH_LOG_SOFTMAX_KERNELS(2, I8, I8, 2) + HASH_LOG_SOFTMAX_KERNELS(2, I8, F16, 2) + + HASH_LOG_SOFTMAX_KERNELS_2D(0, F16, F16, 0) + HASH_LOG_SOFTMAX_KERNELS_2D(0, F16, I16, 0) + HASH_LOG_SOFTMAX_KERNELS_2D(0, F16, U8, 0) + HASH_LOG_SOFTMAX_KERNELS_2D(0, F16, I8, 0) + HASH_LOG_SOFTMAX_KERNELS_2D(0, I16, I16, 0) + HASH_LOG_SOFTMAX_KERNELS_2D(0, I16, F16, 0) + HASH_LOG_SOFTMAX_KERNELS_2D(0, BF16, BF16, 0_BF16) + HASH_LOG_SOFTMAX_KERNELS_2D(0, BF16, F32, 0_BF16) + HASH_LOG_SOFTMAX_KERNELS_2D(0, BF16, F16, 0_BF16) + HASH_LOG_SOFTMAX_KERNELS_2D(0, U8, U8, 0) + HASH_LOG_SOFTMAX_KERNELS_2D(0, U8, F16, 0) + HASH_LOG_SOFTMAX_KERNELS_2D(0, I8, I8, 0) + HASH_LOG_SOFTMAX_KERNELS_2D(0, I8, F16, 0) + HASH_LOG_SOFTMAX_KERNELS_2D(1, 
F16, F16, 1) + HASH_LOG_SOFTMAX_KERNELS_2D(1, F16, I16, 1) + HASH_LOG_SOFTMAX_KERNELS_2D(1, F16, U8, 1) + HASH_LOG_SOFTMAX_KERNELS_2D(1, F16, I8, 1) + HASH_LOG_SOFTMAX_KERNELS_2D(1, I16, I16, 1) + HASH_LOG_SOFTMAX_KERNELS_2D(1, I16, F16, 1) + HASH_LOG_SOFTMAX_KERNELS_2D(1, BF16, BF16, 1_BF16) + HASH_LOG_SOFTMAX_KERNELS_2D(1, BF16, F32, 1_BF16) + HASH_LOG_SOFTMAX_KERNELS_2D(1, BF16, F16, 1_BF16) + HASH_LOG_SOFTMAX_KERNELS_2D(1, U8, U8, 1) + HASH_LOG_SOFTMAX_KERNELS_2D(1, U8, F16, 1) + HASH_LOG_SOFTMAX_KERNELS_2D(1, I8, I8, 1) + HASH_LOG_SOFTMAX_KERNELS_2D(1, I8, F16, 1) + +}; + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) + +#define SCALAR_INPUT_AXIS (2) +#define SCALAR_INPUT_BETA (3) + +DEF_KERNEL_INITIALIZER(_log_softmax_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + int32_t axis = 0; + float beta = 0; + float input_scale = 0; + float output_scale = 0; + int32_t outputZP = 0; + uint32_t inputWidth = 0; + uint32_t inputWidthRemain4 = 0; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; + vsi_int_array_t * output_shape = NULL; + float logE = (float)(log10(exp(1.0f)) / log10(2.0f)); + float rlogE = (float)(log10(2.0f) / log10(exp(1.0f))); + float scaleLogE = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &beta); + CHECK_STATUS_FAIL_GOTO(status, final ); + + scaleLogE = logE * beta; + + output_shape = attr[1]->shape; + + gpu_param.dim = 2; + switch (axis) + { + case 0: + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = output_shape->data[1]; + gpu_param.global_size[1] = output_shape->size > 2 ? output_shape->data[2] : 1; + break; + case 1: + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = + gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = output_shape->size > 2 ? 
output_shape->data[2] : 1; + break; + case 2: + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = + gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = output_shape->data[1]; + break; + default: + break; + } + + { + gpu_dp_inst_t uniGetSubData0to3_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniGetSubData4to7_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackMaxData_2x8 = {{ + 0x00000111, // TCfg + 0x00000000, // ASelt + 0x00050300, 0x00000000, // ABin + 0x00000222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00004400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf4_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGetSubLoData_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGetSubHiData_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00550044, 0x00770066, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 
0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + switch( axis ) + { + case 0: + { + inputWidth = output_shape->data[axis] / 4 * 4; + inputWidthRemain4 = output_shape->data[axis] % 4; + + status = vsi_nn_kernel_gpu_add_param( node, + "inputWidth", &inputWidth ); + status |= vsi_nn_kernel_gpu_add_param( node, + "inputWidthRemain4", &inputWidthRemain4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniPackMaxData_2x8", &uniPackMaxData_2x8 ); + if (attr[0]->dtype == BF16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractHalf4_4x4", &uniExtractHalf4_4x4 ); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGetSubData0to3_4x4", &uniGetSubData0to3_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGetSubData4to7_4x4", &uniGetSubData4to7_4x4 ); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case 1: + case 2: + { + if (attr[0]->dtype == BF16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + } + else + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGetSubLoData_4x4", &uniGetSubLoData_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGetSubHiData_4x4", &uniGetSubHiData_4x4 ); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + } + + if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = attr[1]->dfp.fl; + + if (fl > 0) + { + output_scale = (float)((int64_t)1 << fl); + } + else + { + output_scale = (float)1.0f / (float) ((int64_t)1 << -fl); + } + + status = vsi_nn_kernel_gpu_add_param( node, + "outputScale", &output_scale ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + float output_offset_asymmetric = 0; + outputZP = attr[1]->asymm.zero_point; + output_scale = 1.0f / (float)(attr[1]->asymm.scale); + output_offset_asymmetric = (float)outputZP; + + status = vsi_nn_kernel_gpu_add_param( node, + "outputScale", &output_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "output_offset_asymmetric", &output_offset_asymmetric ); + 
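The constants wired up in this initializer (`logE`, `rlogE`, `scaleLogE`, and `beta` later multiplied by the input scale) fit a shader that evaluates log-softmax through base-2 `exp2`/`log2`. A host-side sketch of that identity with assumed input values, only to show why `scaleLogE = log2(e) * beta` and `rlogE = ln(2)` appear; it is not taken from the kernel source:

```c
#include <math.h>
#include <stdio.h>

int main(void)
{
    const float beta  = 1.0f;                                    /* assumed */
    const float logE  = (float)(log10(exp(1.0)) / log10(2.0));   /* log2(e) */
    const float rlogE = (float)(log10(2.0) / log10(exp(1.0)));   /* ln(2)   */
    const float scaleLogE = logE * beta;

    const float x[4] = { 0.3f, -1.2f, 2.5f, 0.0f };              /* assumed */
    const int n = 4;

    float m = x[0];
    for (int i = 1; i < n; i++) if (x[i] > m) m = x[i];

    float sum_e = 0.0f, sum_2 = 0.0f;
    for (int i = 0; i < n; i++)
    {
        sum_e += expf(beta * (x[i] - m));              /* reference            */
        sum_2 += exp2f(scaleLogE * (x[i] - m));        /* exp(b*t)=exp2(logE*b*t) */
    }
    for (int i = 0; i < n; i++)
    {
        float ref   = beta * (x[i] - m) - logf(sum_e);
        float base2 = beta * (x[i] - m) - rlogE * log2f(sum_2);   /* ln(S)=rlogE*log2(S) */
        printf("%f  %f\n", ref, base2);
    }
    return 0;
}
```

Multiplying `beta` and `scaleLogE` by `input_scale`, as the code does further down, is sufficient even for asymmetric inputs, because the zero point cancels in the `x_i - max` differences.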
CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + output_scale = 1; + outputZP = 0; + } + + if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = attr[0]->dfp.fl; + if (fl > 0) + { + input_scale = 1.0f / (float) ((int64_t)1 << fl); + } + else + { + input_scale = (float)((int64_t)1 << -fl); + } + } + else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + input_scale = attr[0]->asymm.scale; + } + else + { + input_scale = 1.0f; + } + + scaleLogE = scaleLogE * input_scale; + beta = beta * input_scale; + + status = vsi_nn_kernel_gpu_add_param( node, + "rlogE", &rlogE ); + status |= vsi_nn_kernel_gpu_add_param( node, + "betaValue", &beta ); + status |= vsi_nn_kernel_gpu_add_param( node, + "scaleLogE", &scaleLogE ); + status |= vsi_nn_kernel_gpu_add_param( node, + "axisSize", &output_shape->data[axis] ); + + status |= vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + } + + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + } + + return status; +} /* _log_softmax_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + int32_t axis, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_LOG_SOFTMAX_HASH_KEY( axis, input_dtype, output_dtype, image_2d ); + + for( i = 0; i < _cnt_of_array(_log_softmax_evis_kernel_map); i ++ ) + { + if( _log_softmax_evis_kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(_log_softmax_evis_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _log_softmax_evis_kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _log_softmax_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _log_softmax_evis_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _log_softmax_evis_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + float beta = 1.0f; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + beta = vsi_nn_kernel_param_get_float32(params, "beta"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( inputs, outputs, axis, image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Pass parameters to node. 
*/ + vsi_nn_kernel_node_pack_io( node_params, _EVIS_PARAM_NUM, + inputs, 1, outputs, 1 ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + node_params[SCALAR_INPUT_BETA] = vsi_nn_kernel_scalar_create( + graph, F32, &beta ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, _EVIS_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_BETA] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( log_softmax, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c b/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c new file mode 100644 index 0000000..f2f915f --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c @@ -0,0 +1,252 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +typedef enum _internal_img_dim_e +{ + IMAGE = 0, + IMAGE_2D, +} internal_img_dim_e; + +#define _LOGICAL_OPS_KERNEL_SOURCE "logical_not" + +#define STR(a) #a + +// Add kernel hashtable here +#define LOGICAL_NOT_HASH_KEY(IN_DTYPE, OUT_DTYPE, _image_2d) \ + (( IN_DTYPE << 12 ) | ( OUT_DTYPE << 4) | (_image_2d)) + +#define PACK_KERNEL_MAP(IN_DTYPE, OUT_DTYPE) \ + { LOGICAL_NOT_HASH_KEY(IN_DTYPE, OUT_DTYPE, IMAGE), \ + CVIVANTE_NAMESPACE("evis.logical_not_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _LOGICAL_OPS_KERNEL_SOURCE} + +#define PACK_KERNEL_MAP_2D(IN_DTYPE, OUT_DTYPE) \ + { LOGICAL_NOT_HASH_KEY(IN_DTYPE, OUT_DTYPE, IMAGE_2D), \ + CVIVANTE_NAMESPACE("evis.logical_not_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _LOGICAL_OPS_KERNEL_SOURCE} + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _logical_not_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( I8, I8), + PACK_KERNEL_MAP_2D(I8, I8), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _logical_not_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _LOGICAL_NOT_PARAM_NUM _cnt_of_array( _logical_not_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_logical_not_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_SUCCESS; + // Alignment with a power of two value. + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vx_tensor output = (vx_tensor)param[1]; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t *output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + + gpu_param.dim = output_shape->size < 3 ? 2 : 3; + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + gpu_param.global_offset[2] = 0; + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = output_shape->size > 2 ? 
+ (output_shape->data[2] + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + return status; +} /* _logical_not_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _logical_not_kernel_map; + size_t kernel_map_size = _cnt_of_array( _logical_not_kernel_map ); + vx_param_description_t * param_def = _logical_not_kernel_param_def; + size_t param_def_size = _cnt_of_array( _logical_not_kernel_param_def ); + vx_kernel_initialize_f initializer = _logical_not_initializer; + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (BOOL8 == in_dtype && BOOL8 == out_dtype) + { + in_dtype = I8; + out_dtype = I8; + } + + key = LOGICAL_NOT_HASH_KEY( in_dtype, out_dtype, image_2d); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LOGICAL_NOT_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2 || outputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, image_2d); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _LOGICAL_NOT_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _LOGICAL_NOT_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( logical_not, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c new file mode 100644 index 0000000..38e9df3 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c @@ -0,0 +1,293 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +typedef enum _internal_img_dim_e +{ + IMAGE = 0, + IMAGE_2D, +} internal_img_dim_e; + +#define _LOGICAL_OPS_KERNEL_SOURCE "logical_ops" + +#define STR(a) #a + +// Add kernel hashtable here +#define LOGICAL_OPS_HASH_KEY(OP_TYPE, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((OP_TYPE << 20) | ( IN_DTYPE << 12 ) | ( OUT_DTYPE << 4) | (_image_2d)) + +#define PACK_KERNEL_MAP(OP_TYPE, IN_DTYPE, OUT_DTYPE, op_name) \ + { LOGICAL_OPS_HASH_KEY(OP_TYPE, IN_DTYPE, OUT_DTYPE, IMAGE), \ + CVIVANTE_NAMESPACE("evis.logical_"op_name"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _LOGICAL_OPS_KERNEL_SOURCE} + +#define PACK_KERNEL_MAP_2D(OP_TYPE, IN_DTYPE, OUT_DTYPE, op_name) \ + { LOGICAL_OPS_HASH_KEY(OP_TYPE, IN_DTYPE, OUT_DTYPE, IMAGE_2D), \ + CVIVANTE_NAMESPACE("evis.logical_"op_name"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _LOGICAL_OPS_KERNEL_SOURCE} + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _logical_ops_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP(VSI_NN_LOGICAL_OR, I8, I8, "or"), + PACK_KERNEL_MAP(VSI_NN_LOGICAL_AND, I8, I8, "and"), + PACK_KERNEL_MAP(VSI_NN_LOGICAL_XOR, I8, I8, "xor"), + PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_OR, I8, I8, "or"), + PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_AND, I8, I8, "and"), + PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_XOR, I8, I8, "xor"), +}; + + +/* + * Kernel params + */ +static vx_param_description_t 
_logical_ops_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _LOGICAL_OPS_PARAM_NUM _cnt_of_array( _logical_ops_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_logical_ops_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + // Alignment with a power of two value. + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vx_tensor input = (vx_tensor)param[0]; + vx_tensor output = (vx_tensor)param[2]; + vsi_nn_kernel_dtype_e input_dtype = F16; + vsi_nn_kernel_tensor_attr_t *input_attr = NULL, *output_attr = NULL; + vsi_int_array_t *output_shape = NULL; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); + CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + input_dtype = input_attr->dtype; + + gpu_param.dim = output_shape->size < 3 ? 2 : 3; + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + gpu_param.global_offset[2] = 0; + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = output_shape->size > 2 ? 
+ (output_shape->data[2] + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2] : 1; + + if(F16 == input_dtype) + { + gpu_dp_inst_t uniMulShortMinus1toFp16_2x8 = {{ + 0x22222222, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniMulShortMinus1toFp16_2x8", &uniMulShortMinus1toFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + if (input_attr) + { + vsi_nn_kernel_tensor_attr_release(&input_attr); + } + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + return status; +} /* _logical_ops_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d, + vsi_nn_logical_ops_type_t op_type + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _logical_ops_kernel_map; + size_t kernel_map_size = _cnt_of_array( _logical_ops_kernel_map ); + vx_param_description_t * param_def = _logical_ops_kernel_param_def; + size_t param_def_size = _cnt_of_array( _logical_ops_kernel_param_def ); + vx_kernel_initialize_f initializer = _logical_ops_initializer; + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (in_dtype != in1_dtype) + { + return VSI_FAILURE; + } + + if (BOOL8 == in_dtype && BOOL8 == out_dtype) + { + in_dtype = I8; + out_dtype = I8; + } + + key = LOGICAL_OPS_HASH_KEY(op_type, in_dtype, out_dtype, image_2d); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LOGICAL_OPS_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + uint32_t ops_type = vsi_nn_kernel_param_get_int32( params, "ops_type" ); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2); + + 
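+    /* Descriptive comment (not in original source): look up the pre-built EVIS
+     * kernel matching the requested logical op (AND/OR/XOR), the input/output
+     * dtypes and the 2D/3D image layout; BOOL8 tensors are remapped to the I8
+     * kernels inside _query_kernel(). */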
status = _query_kernel( kernel, inputs, outputs, image_2d, (vsi_nn_logical_ops_type_t)ops_type); + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( node_params, _LOGICAL_OPS_PARAM_NUM, + inputs, input_num, outputs, output_num ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _LOGICAL_OPS_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( logical_ops, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c new file mode 100644 index 0000000..03262c3 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c @@ -0,0 +1,1526 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +typedef enum _LSTMUNIT_nn_activation_type_e +{ + SIGMOID = VSI_NN_ACT_SIGMOID, + HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, +}LSTMUNIT_nn_activation_type_e; + +typedef enum _LSTMUNIT_nn_activation_e +{ + CLP_E = 0x1C, + LP_E = 0x14, + CL_E = 0x18, + L_E = 0x10, + BP_E = 0x06, + B_E = 0x02, + CBP_E = 0x0E, + CB_E = 0x0A, + SP_E = 0x04, + S_E = 0x00, + CSP_E = 0x0C, + CS_E = 0x08, +}LSTMUNIT_nn_activation_e; + + +#define LSTMUNIT_ACTIVATION_HASH_KEY(_is_ln, _is_cifg, _is_proj, _is_hybrid, _is_peephole, \ +_input_type, _output_type, _cell_type, _rec_act) \ +((_is_ln << 31) | (_is_cifg << 30) | (_is_proj << 29) | (_is_hybrid << 28) | (_is_peephole << 27) \ +| (_input_type << 23) | (_output_type << 19) | (_cell_type << 15) | (_rec_act << 10)) + +#define LSTMUNIT_ACTIVATION_SOURCE_NAME(_ln_cifg_proj_hybrid_, _input_type) \ + "lstmunit_activation_"#_ln_cifg_proj_hybrid_"_"#_input_type + +#define GEN_LSTMUNIT_STRUCT_ITEMS(_is_ln, _is_cifg, _is_proj, _is_hybrid, _is_peephole, _input_type, _output_type, \ + _cell_type, _rec_act, _ln_cifg_proj_hybrid_) \ + { LSTMUNIT_ACTIVATION_HASH_KEY(_is_ln, _is_cifg, _is_proj, _is_hybrid, _is_peephole, \ + _input_type, _output_type, _cell_type, _rec_act), \ + CVIVANTE_NAMESPACE(\ + "evis.lstmunit_activation_"#_ln_cifg_proj_hybrid_"_"#_input_type"to"#_output_type"_"#_cell_type"_"#_rec_act), \ + LSTMUNIT_ACTIVATION_SOURCE_NAME(_ln_cifg_proj_hybrid_, _input_type) }, + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _lstmunit_activation_kernel_map[] = +{ + /* layer norm + cifg + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, F16, F32, SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, F16, F16, SIGMOID, CLP) + /* layer norm + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, F16, F32, SIGMOID, LP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, F16, F16, SIGMOID, LP) + /* layer norm + cifg */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, F16, F16, SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, I16, F16, SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, U8, F16, SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, I8, F16, SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, F16, F32, SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, I16, F32, SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, U8, F32, SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, I8, F32, SIGMOID, CL) + /* layer norm */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, F16, F16, SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, I16, F16, SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, U8, F16, SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, I8, F16, SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, F16, F32, SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, I16, F32, SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, U8, F32, SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, I8, F32, SIGMOID, L) + /* 
layer norm + cifg + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, I16, F32, SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, I8, F32, SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, U8, F32, SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, I16, F16, SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, I8, F16, SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, U8, F16, SIGMOID, CLP) + /* layer norm + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, I16, F32, SIGMOID, LP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, I8, F32, SIGMOID, LP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, U8, F32, SIGMOID, LP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, I16, F16, SIGMOID, LP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, I8, F16, SIGMOID, LP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, U8, F16, SIGMOID, LP) + /* hybrid + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, F16, F32, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, F16, F16, SIGMOID, BP) + /* hybrid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, F16, F16, SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, I16, F16, SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, U8, F16, SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, I8, F16, SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, F16, F32, SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, I16, F32, SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, U8, F32, SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, I8, F32, SIGMOID, B) + /* cifg + hybrid + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, F16, F32, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, F16, F16, SIGMOID, CBP) + /* cifg + hybrid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, F16, F16, SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, I16, F16, SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, U8, F16, SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, I8, F16, SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, F16, F32, SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, I16, F32, SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, U8, F32, SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, I8, F32, SIGMOID, CB) + /* cifg + hybrid + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, I16, F32, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, I8, F32, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, U8, F32, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, I16, F16, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, I8, F16, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, U8, F16, SIGMOID, CBP) + /* hybrid + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, I16, F32, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, I8, F32, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, U8, F32, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, I16, F16, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, I8, F16, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, U8, F16, SIGMOID, BP) + /* hybrid + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, F16, F32, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, F16, F16, SIGMOID, BP) + /* hybrid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 
1, 0, U8, F16, F16, SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, U8, U8, F16, SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, U8, F16, F32, SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, U8, U8, F32, SIGMOID, B) + /* cifg + hybrid + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, F16, F32, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, F16, F16, SIGMOID, CBP) + /* cifg + hybrid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, U8, F16, F16, SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, U8, U8, F16, SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, U8, F16, F32, SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, U8, U8, F32, SIGMOID, CB) + /* cifg + hybrid + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, I16, F32, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, I8, F32, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, U8, F32, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, I16, F16, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, I8, F16, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, U8, F16, SIGMOID, CBP) + /* hybrid + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, I16, F32, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, I8, F32, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, U8, F32, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, I16, F16, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, I8, F16, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, U8, F16, SIGMOID, BP) + + /* standard + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, F16, F32, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, F16, F16, SIGMOID, SP) + /* standard */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, F16, F16, SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, I16, F16, SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, U8, F16, SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, I8, F16, SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, F16, F32, SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, I16, F32, SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, U8, F32, SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, I8, F32, SIGMOID, S) + /* cifg + standard + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, F16, F32, SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, F16, F16, SIGMOID, CSP) + /* cifg + standard */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, F16, F16, SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, I16, F16, SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, U8, F16, SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, I8, F16, SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, F16, F32, SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, I16, F32, SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, U8, F32, SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, I8, F32, SIGMOID, CS) + /* cifg + standard + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, I16, F32, SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, I8, F32, SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, U8, F32, SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, I16, F16, SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, I8, F16, SIGMOID, CSP) + 
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, U8, F16, SIGMOID, CSP) + /* standard + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, I16, F32, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, I8, F32, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, U8, F32, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, I16, F16, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, I8, F16, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, U8, F16, SIGMOID, SP) + /* standard + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, F16, F32, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, F16, F16, SIGMOID, SP) + /* standard */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, U8, F16, F16, SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, U8, U8, F16, SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, U8, F16, F32, SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, U8, U8, F32, SIGMOID, S) + /* cifg + standard + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, F16, F32, SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, F16, F16, SIGMOID, CSP) + /* cifg + standard */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, U8, F16, F16, SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, U8, U8, F16, SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, U8, F16, F32, SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, U8, U8, F32, SIGMOID, CS) + /* cifg + standard + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, I16, F32, SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, I8, F32, SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, U8, F32, SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, I16, F16, SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, I8, F16, SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, U8, F16, SIGMOID, CSP) + /* standard + projection */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, I16, F32, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, I8, F32, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, U8, F32, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, I16, F16, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, I8, F16, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, U8, F16, SIGMOID, SP) + /* layer norm + cifg + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, F16, F32, HARD_SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, F16, F16, HARD_SIGMOID, CLP) + /* layer norm + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, F16, F32, HARD_SIGMOID, LP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, F16, F16, HARD_SIGMOID, LP) + /* layer norm + cifg + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, F16, F16, HARD_SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, I16, F16, HARD_SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, U8, F16, HARD_SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, I8, F16, HARD_SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, F16, F32, HARD_SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, I16, F32, HARD_SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, U8, F32, HARD_SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, F16, I8, F32, HARD_SIGMOID, CL) + /* layer norm + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, F16, F16, HARD_SIGMOID, L) + 
GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, I16, F16, HARD_SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, U8, F16, HARD_SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, I8, F16, HARD_SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, F16, F32, HARD_SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, I16, F32, HARD_SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, U8, F32, HARD_SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, F16, I8, F32, HARD_SIGMOID, L) + /* layer norm + cifg + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, I16, F32, HARD_SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, I8, F32, HARD_SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, U8, F32, HARD_SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, I16, F16, HARD_SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, I8, F16, HARD_SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, F16, U8, F16, HARD_SIGMOID, CLP) + /* layer norm + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, I16, F32, HARD_SIGMOID, LP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, I8, F32, HARD_SIGMOID, LP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, U8, F32, HARD_SIGMOID, LP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, I16, F16, HARD_SIGMOID, LP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, I8, F16, HARD_SIGMOID, LP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, F16, U8, F16, HARD_SIGMOID, LP) + /* hybrid + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, F16, F32, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, F16, F16, HARD_SIGMOID, BP) + /* hybrid + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, F16, F16, HARD_SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, I16, F16, HARD_SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, U8, F16, HARD_SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, I8, F16, HARD_SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, F16, F32, HARD_SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, I16, F32, HARD_SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, U8, F32, HARD_SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, F16, I8, F32, HARD_SIGMOID, B) + /* cifg + hybrid + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, F16, F32, HARD_SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, F16, F16, HARD_SIGMOID, CBP) + /* cifg + hybrid + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, F16, F16, HARD_SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, I16, F16, HARD_SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, U8, F16, HARD_SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, I8, F16, HARD_SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, F16, F32, HARD_SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, I16, F32, HARD_SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, U8, F32, HARD_SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, F16, I8, F32, HARD_SIGMOID, CB) + /* cifg + hybrid + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, I16, F32, HARD_SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, I8, F32, HARD_SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, U8, F32, HARD_SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, I16, F16, HARD_SIGMOID, CBP) + 
GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, I8, F16, HARD_SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, F16, U8, F16, HARD_SIGMOID, CBP) + /* hybrid + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, I16, F32, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, I8, F32, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, U8, F32, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, I16, F16, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, I8, F16, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, F16, U8, F16, HARD_SIGMOID, BP) + /* hybrid + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, F16, F32, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, F16, F16, HARD_SIGMOID, BP) + /* hybrid + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, U8, F16, F16, HARD_SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, U8, U8, F16, HARD_SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, U8, F16, F32, HARD_SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, U8, U8, F32, HARD_SIGMOID, B) + /* cifg + hybrid + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, F16, F32, HARD_SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, F16, F16, HARD_SIGMOID, CBP) + /* cifg + hybrid + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, U8, F16, F16, HARD_SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, U8, U8, F16, HARD_SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, U8, F16, F32, HARD_SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, U8, U8, F32, HARD_SIGMOID, CB) + /* cifg + hybrid + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, I16, F32, HARD_SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, I8, F32, HARD_SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, U8, F32, HARD_SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, I16, F16, HARD_SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, I8, F16, HARD_SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, U8, U8, F16, HARD_SIGMOID, CBP) + /* hybrid + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, I16, F32, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, I8, F32, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, U8, F32, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, I16, F16, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, I8, F16, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, U8, U8, F16, HARD_SIGMOID, BP) + + /* standard + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, F16, F32, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, F16, F16, HARD_SIGMOID, SP) + /* standard + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, F16, F16, HARD_SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, I16, F16, HARD_SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, U8, F16, HARD_SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, I8, F16, HARD_SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, F16, F32, HARD_SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, I16, F32, HARD_SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, U8, F32, HARD_SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, F16, I8, F32, HARD_SIGMOID, S) + /* cifg + standard + projection + 
hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, F16, F32, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, F16, F16, HARD_SIGMOID, CSP) + /* cifg + standard + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, F16, F16, HARD_SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, I16, F16, HARD_SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, U8, F16, HARD_SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, I8, F16, HARD_SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, F16, F32, HARD_SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, I16, F32, HARD_SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, U8, F32, HARD_SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, F16, I8, F32, HARD_SIGMOID, CS) + /* cifg + standard + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, I16, F32, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, I8, F32, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, U8, F32, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, I16, F16, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, I8, F16, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, F16, U8, F16, HARD_SIGMOID, CSP) + /* standard + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, I16, F32, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, I8, F32, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, U8, F32, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, I16, F16, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, I8, F16, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, F16, U8, F16, HARD_SIGMOID, SP) + /* standard + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, F16, F32, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, F16, F16, HARD_SIGMOID, SP) + /* standard + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, U8, F16, F16, HARD_SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, U8, U8, F16, HARD_SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, U8, F16, F32, HARD_SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, U8, U8, F32, HARD_SIGMOID, S) + /* cifg + standard + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, F16, F32, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, F16, F16, HARD_SIGMOID, CSP) + /* cifg + standard + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, U8, F16, F16, HARD_SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, U8, U8, F16, HARD_SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, U8, F16, F32, HARD_SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, U8, U8, F32, HARD_SIGMOID, CS) + /* cifg + standard + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, I16, F32, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, I8, F32, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, U8, F32, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, I16, F16, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, I8, F16, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, U8, U8, F16, HARD_SIGMOID, CSP) + /* standard + projection + hard_sigmoid */ + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, I16, F32, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, 
I8, F32, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, U8, F32, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, I16, F16, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, I8, F16, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, U8, F16, HARD_SIGMOID, SP) +}; + + +typedef enum _lstmunit_cifg_ln_proj_e +{ + CLP_INPUT_FC_F, + CLP_INPUT_FC_C, + CLP_INPUT_FC_O, + CLP_CSTATE_IN, + CLP_BIASES_F, + CLP_BIASES_C, + CLP_BIASES_O, + CLP_LN_WF, + CLP_LN_WC, + CLP_LN_WO, + CLP_OUTPUT, + CLP_CSTATE_OUT, + CLP_PARAM +} lstmunit_cifg_ln_proj_e; + +static vx_param_description_t vxLSTMUNIT_CLP_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_CLP_PARAM_NUM _cnt_of_array( vxLSTMUNIT_CLP_Param ) + +typedef enum _lstmunit_cifg_ln_e +{ + CL_INPUT_FC_F, + CL_INPUT_FC_C, + CL_INPUT_FC_O, + CL_CSTATE_IN, + CL_BIASES_F, + CL_BIASES_C, + CL_BIASES_O, + CL_LN_WF, + CL_LN_WC, + CL_LN_WO, + CL_OUTPUT, + CL_CSTATE_OUT, + CL_HSTATE_OUT, + CL_LSTMUNIT_PARAM, +} lstmunit_cifg_ln_e; + +static vx_param_description_t vxLSTMUNIT_CL_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_CL_PARAM_NUM _cnt_of_array( vxLSTMUNIT_CL_Param ) + +typedef enum _lstmunit_ln_proj_e +{ + LP_INPUT_FC_I, + LP_INPUT_FC_F, + LP_INPUT_FC_C, + LP_INPUT_FC_O, + LP_CSTATE_IN, + LP_BIASES_I, + LP_BIASES_F, + LP_BIASES_C, + LP_BIASES_O, + LP_LN_WI, + LP_LN_WF, + LP_LN_WC, + LP_LN_WO, + LP_OUTPUT, + LP_CSTATE_OUT, + LP_PARAM +} lstmunit_ln_proj_e; + +static 
vx_param_description_t vxLSTMUNIT_LP_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_LP_PARAM_NUM _cnt_of_array( vxLSTMUNIT_LP_Param ) + +typedef enum _lstmunit_ln_e +{ + L_INPUT_FC_I, + L_INPUT_FC_F, + L_INPUT_FC_C, + L_INPUT_FC_O, + L_CSTATE_IN, + L_BIASES_I, + L_BIASES_F, + L_BIASES_C, + L_BIASES_O, + L_LN_WI, + L_LN_WF, + L_LN_WC, + L_LN_WO, + L_OUTPUT, + L_CSTATE_OUT, + L_HSTATE_OUT, + L_LSTMUNIT_PARAM, +} lstmunit_ln_e; + +static vx_param_description_t vxLSTMUNIT_L_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_L_PARAM_NUM _cnt_of_array( vxLSTMUNIT_L_Param ) + +typedef enum _lstmunit_hybrid_proj_e +{ + BP_INPUT_FC_I, + BP_INPUT_FC_F, + BP_INPUT_FC_C, + BP_INPUT_FC_O, + BP_CSTATE_IN, + BP_HSTATE_FC_I, + BP_HSTATE_FC_F, + BP_HSTATE_FC_C, + BP_HSTATE_FC_O, + BP_BIASES_I, + BP_BIASES_F, + BP_BIASES_C, + BP_BIASES_O, + BP_OUTPUT, + BP_CSTATE_OUT, + BP_PARAM +} lstmunit_hybrid_proj_e; + +static vx_param_description_t vxLSTMUNIT_BP_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, 
VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_BP_PARAM_NUM _cnt_of_array( vxLSTMUNIT_BP_Param ) + +typedef enum _lstmunit_hybrid_e +{ + B_INPUT_FC_I, + B_INPUT_FC_F, + B_INPUT_FC_C, + B_INPUT_FC_O, + B_CSTATE_IN, + B_HSTATE_FC_I, + B_HSTATE_FC_F, + B_HSTATE_FC_C, + B_HSTATE_FC_O, + B_BIASES_I, + B_BIASES_F, + B_BIASES_C, + B_BIASES_O, + B_OUTPUT, + B_CSTATE_OUT, + B_HSTATE_OUT, + B_LSTMUNIT_PARAM, +} lstmunit_hybrid_e; + +static vx_param_description_t vxLSTMUNIT_B_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_B_PARAM_NUM _cnt_of_array( vxLSTMUNIT_B_Param ) + +typedef enum _lstmunit_cifg_hybrid_proj_e +{ + CBP_INPUT_FC_F, + CBP_INPUT_FC_C, + CBP_INPUT_FC_O, + CBP_CSTATE_IN, + CBP_HSTATE_FC_F, + CBP_HSTATE_FC_C, + CBP_HSTATE_FC_O, + CBP_BIASES_F, + CBP_BIASES_C, + CBP_BIASES_O, + CBP_OUTPUT, + CBP_CSTATE_OUT, + CBP_PARAM +} lstmunit_cifg_hybrid_proj_e; + +static vx_param_description_t vxLSTMUNIT_CBP_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + 
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_CBP_PARAM_NUM _cnt_of_array( vxLSTMUNIT_CBP_Param ) + +typedef enum _lstmunit_cifg_hybrid_e +{ + CB_INPUT_FC_F, + CB_INPUT_FC_C, + CB_INPUT_FC_O, + CB_CSTATE_IN, + CB_HSTATE_FC_F, + CB_HSTATE_FC_C, + CB_HSTATE_FC_O, + CB_BIASES_F, + CB_BIASES_C, + CB_BIASES_O, + CB_OUTPUT, + CB_CSTATE_OUT, + CB_HSTATE_OUT, + CB_LSTMUNIT_PARAM, +} lstmunit_cifg_hybrid_e; + +static vx_param_description_t vxLSTMUNIT_CB_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_CB_PARAM_NUM _cnt_of_array( vxLSTMUNIT_CB_Param ) + +typedef enum _lstmunit_standard_proj_e +{ + SP_INPUT_FC_I, + SP_INPUT_FC_F, + SP_INPUT_FC_C, + SP_INPUT_FC_O, + SP_CSTATE_IN, + SP_HSTATE_FC_I, + SP_HSTATE_FC_F, + SP_HSTATE_FC_C, + SP_HSTATE_FC_O, + SP_OUTPUT, + SP_CSTATE_OUT, + SP_PARAM +} lstmunit_standard_proj_e; + +static vx_param_description_t vxLSTMUNIT_SP_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define 
_LSTMUNIT_ACTIVATION_SP_PARAM_NUM _cnt_of_array( vxLSTMUNIT_SP_Param ) + +typedef enum _lstmunit_standard_e +{ + S_INPUT_FC_I, + S_INPUT_FC_F, + S_INPUT_FC_C, + S_INPUT_FC_O, + S_CSTATE_IN, + S_HSTATE_FC_I, + S_HSTATE_FC_F, + S_HSTATE_FC_C, + S_HSTATE_FC_O, + S_OUTPUT, + S_CSTATE_OUT, + S_HSTATE_OUT, + S_LSTMUNIT_PARAM, +} lstmunit_standard_e; + +static vx_param_description_t vxLSTMUNIT_S_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_S_PARAM_NUM _cnt_of_array( vxLSTMUNIT_S_Param ) + +typedef enum _lstmunit_cifg_standard_proj_e +{ + CSP_INPUT_FC_F, + CSP_INPUT_FC_C, + CSP_INPUT_FC_O, + CSP_CSTATE_IN, + CSP_HSTATE_FC_F, + CSP_HSTATE_FC_C, + CSP_HSTATE_FC_O, + CSP_OUTPUT, + CSP_CSTATE_OUT, + CSP_PARAM +} lstmunit_cifg_standard_proj_e; + +static vx_param_description_t vxLSTMUNIT_CSP_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_CSP_PARAM_NUM _cnt_of_array( vxLSTMUNIT_CSP_Param ) + +typedef enum _lstmunit_cifg_standard_e +{ + CS_INPUT_FC_F, + CS_INPUT_FC_C, + CS_INPUT_FC_O, + CS_CSTATE_IN, + CS_HSTATE_FC_F, + CS_HSTATE_FC_C, + CS_HSTATE_FC_O, + CS_OUTPUT, + CS_CSTATE_OUT, + CS_HSTATE_OUT, + CS_LSTMUNIT_PARAM, +} lstmunit_cifg_standard_e; + +static vx_param_description_t vxLSTMUNIT_CS_Param[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, 
VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, +}; + +#define _LSTMUNIT_ACTIVATION_CS_PARAM_NUM _cnt_of_array( vxLSTMUNIT_CS_Param ) + +#define _LSTMUINT_PARAM_NUM 5 + +#define _LSTMUNIT_ACTIVATION_MAX_PARAM_NUM (LSTMUNIT_ACT_PARAM_COUT + _LSTMUINT_PARAM_NUM) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t cell_state_in = NULL; + vsi_nn_kernel_tensor_t output = NULL; + float cell_clip = 0.0f; + float forget_bias = 0.0f; + float outputScale = 1.0f; + float outputZP = 0; + int32_t dstZP = 0; + float dstScale = 1.0f; + vsi_nn_kernel_dtype_e cellFormat = F16; + vsi_nn_kernel_dtype_e dstFormat = F16; + vsi_nn_kernel_quant_type_e dstQuantType = VSI_NN_KERNEL_QUANT_NONE; + int32_t dstFixPointPos = 0; + float logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); + float twoLogE = 2 * logE; + uint32_t uint_min = 0xFBFFFFFF; + uint32_t uint_max = 0x7BFFFFFF; + float float_min = *(vx_float32 *)&uint_min; + float float_max = *(vx_float32 *)&uint_max; + float clip_Min_F[4] = {0}; + float clip_Max_F[4] = {0}; + uint32_t i = 0; + int32_t input0Array_ZP[4] = {0}; + int32_t input1Array_ZP[4] = {0}; + float input0Array_Scale[4] = {1.0f}; + float input1Array_Scale[4] = {1.0f}; + int32_t _is_ln = 0; + int32_t _is_cifg = 0; + int32_t _is_hybrid = 0; + vsi_nn_kernel_tensor_attr_t* input_attr[9]; + vsi_nn_kernel_tensor_attr_t* attr[2]; + + status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 5], &_is_ln ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 4], &_is_cifg ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 3], &_is_hybrid ); + CHECK_STATUS_FAIL_GOTO(status, final ); + if (_is_cifg) + { + cell_state_in = (vsi_nn_kernel_tensor_t)param[CL_CSTATE_IN]; + if (_is_ln) + output = (vsi_nn_kernel_tensor_t)param[CL_OUTPUT]; + else if (_is_hybrid) + output = (vsi_nn_kernel_tensor_t)param[CB_OUTPUT]; + else + output = (vsi_nn_kernel_tensor_t)param[CS_OUTPUT]; + } + else + { + cell_state_in = (vsi_nn_kernel_tensor_t)param[L_CSTATE_IN]; + if (_is_ln) + output = (vsi_nn_kernel_tensor_t)param[L_OUTPUT]; + else if (_is_hybrid) + output = (vsi_nn_kernel_tensor_t)param[B_OUTPUT]; + else + output = (vsi_nn_kernel_tensor_t)param[S_OUTPUT]; + } + + for (i = 0; i < 9; i++) + { + input_attr[i] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[i] ); + CHECK_PTR_FAIL_GOTO( input_attr[i], "Create tensor attr buffer fail.", final ); + } + attr[0] = vsi_nn_kernel_tensor_attr_create( cell_state_in ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( output ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_float32( (vsi_nn_kernel_scalar_t)param[param_size - 2], 
&cell_clip ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32( (vsi_nn_kernel_scalar_t)param[param_size - 1], &forget_bias ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + cellFormat = attr[0]->dtype; + dstFormat = attr[1]->dtype; + + dstQuantType = attr[1]->quant; + + if ( VSI_NN_KERNEL_QUANT_DFP == dstQuantType ) + { + dstFixPointPos = (int8_t)attr[1]->dfp.fl; + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == dstQuantType ) + { + dstZP = attr[1]->asymm.zero_point; + dstScale = attr[1]->asymm.scale; + } + + outputZP = (vx_float32)dstZP; + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((attr[1]->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (attr[1]->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + + if (cell_clip > 0) + { + float_max = cell_clip; + float_min = -cell_clip; + } + + for (i = 0; i < 4; i++) + { + clip_Min_F[i] = float_min; + clip_Max_F[i] = float_max; + } + + { + gpu_dp_inst_t uniFp16toFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractHalf4_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniFp16AddFp16toFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8AddS32_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + if (dstQuantType == VSI_NN_KERNEL_QUANT_DFP) + { + if (dstFixPointPos >= 0) + outputScale *= (vx_float32)((int64_t)1 << dstFixPointPos); + else if (dstFixPointPos < 0) + outputScale *= 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos); + + outputZP = 0; + } + else if 
(dstQuantType == VSI_NN_KERNEL_QUANT_ASYMM) + { + outputScale = 1.0f / dstScale; + } + + if ( cellFormat == F16 ) + { + vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_4x4", &uniExtractHalf4_4x4); + } + + if ( dstFormat == F16 ) + { + vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); + } + else + { + vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + } + + vsi_nn_kernel_gpu_add_param(node, "uniFp16toFp32_4x4", &uniFp16toFp32_4x4); + vsi_nn_kernel_gpu_add_param(node, "logE", &logE); + vsi_nn_kernel_gpu_add_param(node, "twoLogE", &twoLogE); + vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale); + vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP); + vsi_nn_kernel_gpu_add_param(node, "forget_bias", &forget_bias); + vsi_nn_kernel_gpu_add_param(node, "clip_Min_F", clip_Min_F); + vsi_nn_kernel_gpu_add_param(node, "clip_Max_F", clip_Max_F); + + if ( !_is_ln && input_attr[S_INPUT_FC_F]->dtype == F16 ) + { + vsi_nn_kernel_gpu_add_param(node, "uniFp16AddFp16toFp32_4x4", &uniFp16AddFp16toFp32_4x4); + } + + if (input_attr[S_INPUT_FC_F]->dtype == U8 && + input_attr[S_INPUT_FC_F]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + if (_is_cifg) + { + input0Array_ZP[1] = 0 - input_attr[CS_INPUT_FC_F]->asymm.zero_point; + input0Array_ZP[2] = 0 - input_attr[CS_INPUT_FC_C]->asymm.zero_point; + input0Array_ZP[3] = 0 - input_attr[CS_INPUT_FC_O]->asymm.zero_point; + + input0Array_Scale[1] = input_attr[CS_INPUT_FC_F]->asymm.scale; + input0Array_Scale[2] = input_attr[CS_INPUT_FC_C]->asymm.scale; + input0Array_Scale[3] = input_attr[CS_INPUT_FC_O]->asymm.scale; + + if ( !_is_ln ) + { + input1Array_ZP[1] = 0 - input_attr[CS_HSTATE_FC_F]->asymm.zero_point; + input1Array_ZP[2] = 0 - input_attr[CS_HSTATE_FC_C]->asymm.zero_point; + input1Array_ZP[3] = 0 - input_attr[CS_HSTATE_FC_O]->asymm.zero_point; + + input1Array_Scale[1] = input_attr[CS_HSTATE_FC_F]->asymm.scale; + input1Array_Scale[2] = input_attr[CS_HSTATE_FC_C]->asymm.scale; + input1Array_Scale[3] = input_attr[CS_HSTATE_FC_O]->asymm.scale; + } + } + else + { + input0Array_ZP[0] = 0 - input_attr[S_INPUT_FC_I]->asymm.zero_point; + input0Array_ZP[1] = 0 - input_attr[S_INPUT_FC_F]->asymm.zero_point; + input0Array_ZP[2] = 0 - input_attr[S_INPUT_FC_C]->asymm.zero_point; + input0Array_ZP[3] = 0 - input_attr[S_INPUT_FC_O]->asymm.zero_point; + + input0Array_Scale[0] = input_attr[S_INPUT_FC_I]->asymm.scale; + input0Array_Scale[1] = input_attr[S_INPUT_FC_F]->asymm.scale; + input0Array_Scale[2] = input_attr[S_INPUT_FC_C]->asymm.scale; + input0Array_Scale[3] = input_attr[S_INPUT_FC_O]->asymm.scale; + + if ( !_is_ln ) + { + input1Array_ZP[0] = 0 - input_attr[S_HSTATE_FC_I]->asymm.zero_point; + input1Array_ZP[1] = 0 - input_attr[S_HSTATE_FC_F]->asymm.zero_point; + input1Array_ZP[2] = 0 - input_attr[S_HSTATE_FC_C]->asymm.zero_point; + input1Array_ZP[3] = 0 - input_attr[S_HSTATE_FC_O]->asymm.zero_point; + + input1Array_Scale[0] = input_attr[S_HSTATE_FC_I]->asymm.scale; + input1Array_Scale[1] = input_attr[S_HSTATE_FC_F]->asymm.scale; + input1Array_Scale[2] = input_attr[S_HSTATE_FC_C]->asymm.scale; + input1Array_Scale[3] = input_attr[S_HSTATE_FC_O]->asymm.scale; + } + } + + vsi_nn_kernel_gpu_add_param(node, "input0Array_ZP", input0Array_ZP); + vsi_nn_kernel_gpu_add_param(node, "input0Array_Scale", input0Array_Scale); + vsi_nn_kernel_gpu_add_param(node, "input1Array_ZP", input1Array_ZP); + vsi_nn_kernel_gpu_add_param(node, "input1Array_Scale", input1Array_Scale); + vsi_nn_kernel_gpu_add_param(node, 
"uniU8AddS32_4x4", &uniU8AddS32_4x4); + } + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + for (i = 0; i < 9; i++) + { + if (input_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &input_attr[i] ); + } + } + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + } + + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + } + + return status; + +} /* _lstmunit_activation_initializer() */ + + +static size_t get_param_num(LSTMUNIT_nn_activation_e lstm_activation) +{ + size_t lstm_activation_param_num = LSTMUNIT_ACT_PARAM_COUT; + + switch (lstm_activation) + { + case CLP_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_CLP_PARAM_NUM; + break; + case LP_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_LP_PARAM_NUM; + break; + case CL_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_CL_PARAM_NUM; + break; + case L_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_L_PARAM_NUM; + break; + case BP_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_BP_PARAM_NUM; + break; + case B_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_B_PARAM_NUM; + break; + case CBP_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_CBP_PARAM_NUM; + break; + case CB_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_CB_PARAM_NUM; + break; + case SP_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_SP_PARAM_NUM; + break; + case S_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_S_PARAM_NUM; + break; + case CSP_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_CSP_PARAM_NUM; + break; + case CS_E: + lstm_activation_param_num = _LSTMUNIT_ACTIVATION_CS_PARAM_NUM; + break; + default: + break; + } + return lstm_activation_param_num; +} + +static void set_vx_param_description_t(LSTMUNIT_nn_activation_e lstm_activation, vx_param_description_t ** param_def) +{ + switch (lstm_activation) + { + case CLP_E: + *param_def = vxLSTMUNIT_CLP_Param; + break; + case LP_E: + *param_def = vxLSTMUNIT_LP_Param; + break; + case CL_E: + *param_def = vxLSTMUNIT_CL_Param; + break; + case L_E: + *param_def = vxLSTMUNIT_L_Param; + break; + case BP_E: + *param_def = vxLSTMUNIT_BP_Param; + break; + case B_E: + *param_def = vxLSTMUNIT_B_Param; + break; + case CBP_E: + *param_def = vxLSTMUNIT_CBP_Param; + break; + case CB_E: + *param_def = vxLSTMUNIT_CB_Param; + break; + case SP_E: + *param_def = vxLSTMUNIT_SP_Param; + break; + case S_E: + *param_def = vxLSTMUNIT_S_Param; + break; + case CSP_E: + *param_def = vxLSTMUNIT_CSP_Param; + break; + case CS_E: + *param_def = vxLSTMUNIT_CS_Param; + break; + default: + break; + } +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t _is_ln, + int32_t _is_cifg, + int32_t _is_proj, + int32_t _is_hybrid, + int32_t _is_peephole, + int32_t recurrent_activation, + LSTMUNIT_nn_activation_e lstm_activation + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e cell_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _lstmunit_activation_kernel_map; + size_t kernel_map_size = _cnt_of_array( _lstmunit_activation_kernel_map ); + vx_param_description_t * param_def = NULL; + size_t param_def_size = _LSTMUNIT_ACTIVATION_MAX_PARAM_NUM; + vx_kernel_initialize_f initializer = _lstmunit_activation_initializer; + uint32_t key; + uint32_t i; + + set_vx_param_description_t( lstm_activation, 
&param_def ); + + if (NULL == param_def) + { + status = VSI_FAILURE; + return status; + } + + param_def_size = get_param_num(lstm_activation); + in_dtype = vsi_nn_kernel_map_dtype( inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.dtype.vx_type ); + cell_dtype = vsi_nn_kernel_map_dtype( inputs[LSTMUNIT_ACT_CSTATE_IN]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = LSTMUNIT_ACTIVATION_HASH_KEY(_is_ln, _is_cifg, _is_proj, _is_hybrid, _is_peephole, \ + in_dtype, out_dtype, cell_dtype, recurrent_activation); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; + +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LSTMUNIT_ACTIVATION_MAX_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* in_tensor[LSTMUNIT_ACT_INPUTS_COUNT] = {NULL}; + vsi_nn_tensor_t* out_tensor[LSTMUNIT_ACT_OUTUTS_COUNT] = {NULL}; + size_t input_cnt = 0; + size_t output_cnt = 0; + int32_t _is_ln= 0; + int32_t _is_cifg= 0; + int32_t _is_proj= 0; + int32_t _is_hybrid= 0; + int32_t _is_peephole= 0; + int32_t recurrent_activation; + float cell_clip; + float forget_bias; + LSTMUNIT_nn_activation_e lstm_activation; + size_t lstm_activation_param_num = _LSTMUNIT_ACTIVATION_MAX_PARAM_NUM; + + _is_ln = vsi_nn_kernel_param_get_int32( params, "_is_ln" ); + _is_cifg = vsi_nn_kernel_param_get_int32( params, "_is_cifg" ); + _is_proj = vsi_nn_kernel_param_get_int32( params, "_is_proj" ); + _is_hybrid = vsi_nn_kernel_param_get_int32( params, "_is_hybrid" ); + _is_peephole = vsi_nn_kernel_param_get_int32( params, "_is_peephole" ); + recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + cell_clip = vsi_nn_kernel_param_get_float32(params, "cell_clip"); + forget_bias = vsi_nn_kernel_param_get_float32(params, "forget_bias"); + + lstm_activation = (LSTMUNIT_nn_activation_e)((_is_ln << 4) | \ + (_is_cifg << 3) | (_is_proj << 2) | (_is_hybrid << 1) | (_is_peephole)); + status = _query_kernel( kernel, inputs, outputs, \ + _is_ln, _is_cifg, _is_proj, _is_hybrid, _is_peephole, recurrent_activation, lstm_activation); + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t i; + + for (i = 0; i < input_num; i++) + { + if (inputs[i]) + { + in_tensor[input_cnt] = inputs[i]; + input_cnt++; + } + } + for (i = 0; i < output_num; i++) + { + if (outputs[i]) + { + out_tensor[output_cnt] = outputs[i]; + output_cnt++; + } + } + lstm_activation_param_num = get_param_num(lstm_activation); + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, lstm_activation_param_num, + in_tensor, 
input_cnt, out_tensor, output_cnt ); + node_params[lstm_activation_param_num - _LSTMUINT_PARAM_NUM] = vsi_nn_kernel_scalar_create( + graph, I32, &_is_ln ); + node_params[lstm_activation_param_num - _LSTMUINT_PARAM_NUM + 1] = vsi_nn_kernel_scalar_create( + graph, I32, &_is_cifg ); + node_params[lstm_activation_param_num - _LSTMUINT_PARAM_NUM + 2] = vsi_nn_kernel_scalar_create( + graph, I32, &_is_hybrid ); + node_params[lstm_activation_param_num - _LSTMUINT_PARAM_NUM + 3] = vsi_nn_kernel_scalar_create( + graph, F32, &cell_clip ); + node_params[lstm_activation_param_num - _LSTMUINT_PARAM_NUM + 4] = vsi_nn_kernel_scalar_create( + graph, F32, &forget_bias ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, lstm_activation_param_num ); + vsi_nn_kernel_scalar_release( &node_params[lstm_activation_param_num - _LSTMUINT_PARAM_NUM] ); + vsi_nn_kernel_scalar_release( &node_params[lstm_activation_param_num - _LSTMUINT_PARAM_NUM + 1] ); + vsi_nn_kernel_scalar_release( &node_params[lstm_activation_param_num - _LSTMUINT_PARAM_NUM + 2] ); + vsi_nn_kernel_scalar_release( &node_params[lstm_activation_param_num - _LSTMUINT_PARAM_NUM + 3] ); + vsi_nn_kernel_scalar_release( &node_params[lstm_activation_param_num - _LSTMUINT_PARAM_NUM + 4] ); + } + } + + return node; + +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( lstmunit_activation, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c new file mode 100644 index 0000000..12bac55 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -0,0 +1,1169 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define KERNEL_SOURCE_1 "matrixmul_f16u8_f16" +#define KERNEL_SOURCE_2 "matrixmul_f16" +#define KERNEL_SOURCE_3 "matrixmul_transB_f16" +#define KERNEL_SOURCE_4 "matrixmul_transB_f16_mix" +#define KERNEL_SOURCE_5 "matrixmul_transB_u8_mix" +#define KERNEL_SOURCE_6 "matrixmul_u8f16_u8" +#define KERNEL_SOURCE_7 "matrixmul_transA" +#define KERNEL_SOURCE_8 "matrixmul_u8f16_f16" +#define KERNEL_SOURCE_9 "matrixmul_u8" +#define KERNEL_SOURCE_10 "matrixmul_f16u8_u8" +#define KERNEL_SOURCE_11 "matrixmul_f16f16_u8" +#define KERNEL_SOURCE_12 "matrixmul_u8u8_f16" +#define KERNEL_SOURCE_13 "matrixmul_i16" +#define KERNEL_SOURCE_14 "matrixmul_f16i16_i16" + +#define HASH_MATRIX_MUL_KEY(_input0_type, _input1_type, _output_type, _trans_a, _trans_b) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_trans_a << 4) | (_trans_b)) + +#define HASH_MATRIX_MUL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) + +#define HASH_MATRIX_MUL_TRANSB_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.gemm_transb_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) + +#define HASH_MATRIX_MUL_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.gemm_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) + +#define TENSOR_MATRIX_MUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 0), \ + HASH_MATRIX_MUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_MATRIX_MUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 1), \ + HASH_MATRIX_MUL_TRANSB_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_MATRIX_MUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1, 0), \ + HASH_MATRIX_MUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } matrix_mul_map[] = +{ + TENSOR_MATRIX_MUL_KERNELS(U8, U8, U8, KERNEL_SOURCE_9) + TENSOR_MATRIX_MUL_KERNELS(I8, I8, I8, KERNEL_SOURCE_9) + TENSOR_MATRIX_MUL_KERNELS(I16, I16, I16, KERNEL_SOURCE_13) + TENSOR_MATRIX_MUL_KERNELS(F16, U8, F16, KERNEL_SOURCE_1) + TENSOR_MATRIX_MUL_KERNELS(F16, I8, F16, KERNEL_SOURCE_1) + TENSOR_MATRIX_MUL_KERNELS(F16, I16, F16, KERNEL_SOURCE_1) + TENSOR_MATRIX_MUL_KERNELS(F16, U8, U8, KERNEL_SOURCE_10) + TENSOR_MATRIX_MUL_KERNELS(F16, I8, I8, KERNEL_SOURCE_10) + TENSOR_MATRIX_MUL_KERNELS(F16, I16, I16, KERNEL_SOURCE_14) + TENSOR_MATRIX_MUL_KERNELS(U8, F16, U8, KERNEL_SOURCE_6) + TENSOR_MATRIX_MUL_KERNELS(I8, F16, I8, KERNEL_SOURCE_6) + TENSOR_MATRIX_MUL_KERNELS(I16, F16, I16, KERNEL_SOURCE_6) + TENSOR_MATRIX_MUL_KERNELS(U8, U8, F16, KERNEL_SOURCE_12) + TENSOR_MATRIX_MUL_KERNELS(I8, I8, F16, KERNEL_SOURCE_12) + TENSOR_MATRIX_MUL_KERNELS(I16, I16, F16, KERNEL_SOURCE_12) + TENSOR_MATRIX_MUL_KERNELS(U8, F16, F16, KERNEL_SOURCE_8) + TENSOR_MATRIX_MUL_KERNELS(I8, F16, F16, KERNEL_SOURCE_8) + TENSOR_MATRIX_MUL_KERNELS(I16, F16, F16, KERNEL_SOURCE_8) + TENSOR_MATRIX_MUL_KERNELS(F16, F16, F16, KERNEL_SOURCE_2) + TENSOR_MATRIX_MUL_KERNELS(F16, F16, U8, KERNEL_SOURCE_11) + TENSOR_MATRIX_MUL_KERNELS(F16, F16, I8, KERNEL_SOURCE_11) + TENSOR_MATRIX_MUL_KERNELS(F16, F16, I16, KERNEL_SOURCE_11) + TENSOR_MATRIX_MUL_KERNELS(F32, F32, F32, KERNEL_SOURCE_2) + TENSOR_MATRIX_MUL_TRANSB_KERNELS(F16, F16, F16, KERNEL_SOURCE_3) + 
TENSOR_MATRIX_MUL_TRANSB_KERNELS(F16, U8, F16, KERNEL_SOURCE_4) + TENSOR_MATRIX_MUL_TRANSB_KERNELS(F16, U8, U8, KERNEL_SOURCE_4) + TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, F16, KERNEL_SOURCE_5) + TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, U8, KERNEL_SOURCE_5) + TENSOR_MATRIX_MUL_TRANSA_KERNELS(U8, U8, U8, KERNEL_SOURCE_7) + TENSOR_MATRIX_MUL_TRANSA_KERNELS(I8, I8, I8, KERNEL_SOURCE_7) + TENSOR_MATRIX_MUL_TRANSA_KERNELS(I16, I16, I16, KERNEL_SOURCE_7) + TENSOR_MATRIX_MUL_TRANSA_KERNELS(U8, F16, U8, KERNEL_SOURCE_7) + TENSOR_MATRIX_MUL_TRANSA_KERNELS(I8, F16, I8, KERNEL_SOURCE_7) + TENSOR_MATRIX_MUL_TRANSA_KERNELS(I16, F16, I16, KERNEL_SOURCE_7) + TENSOR_MATRIX_MUL_TRANSA_KERNELS(F16, F16, F16, KERNEL_SOURCE_7) +}; + +/* + * Kernel params + */ +static vx_param_description_t _matrix_mul_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _MATRIX_MUL_PARAM_NUM _cnt_of_array( _matrix_mul_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + int32_t transA = 0; + int32_t transB = 0; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + + int32_t src0ZP = 0; + float src0Scale = 0; + int32_t src1ZP = 0; + float src1Scale = 0; + float dstZP = 0; + float dstScale = 0; + uint16_t M0 = 0; + uint16_t M1 = 0; + int8_t postShift0 = 0; + int8_t postShift1 = 0; + + uint32_t pack_key = 0; + int32_t ac2zero = 0; + int32_t bc2zero = 0; + + float mulKIn0In1Zp = 0; + float inOutScale = 0; + int32_t K = 0; + + uint32_t evis2 = 0; + vx_context ctx = vxGetContext((vx_reference)node); + vx_hardware_caps_params_t hw_param; + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t)); + status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t)); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + if (hw_param.evis2 == TRUE) + { + evis2 = 1; + } + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &transA); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &transB); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &K); + 
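+    /*
+     * The remainder of this initializer derives the dequantize uniforms handed to the
+     * GPU program from the tensor attributes captured above: DFP tensors use
+     * scale = 2^-fl with a zero point of 0, asymmetric tensors use their stored
+     * scale/zero_point pair, and vsi_nn_GetFP32MultiAndPostShift converts each input
+     * scale into a 16-bit multiplier plus post-shift (M0/postShift0, M1/postShift1)
+     * that gets patched into the DP instruction constants. mulKIn0In1Zp pre-computes
+     * ((K + 3) / 4 * 4) * src0ZP * src1ZP, the K dimension rounded up to a multiple of
+     * four (seemingly matching the 4-element dot-product step), so the kernel can fold
+     * the cross-zero-point term into the accumulation once per output element.
+     */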
CHECK_STATUS_FAIL_GOTO(status, OnError ); + + src0ZP = attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + src1ZP = attr[1]->asymm.zero_point; + src1Scale = attr[1]->asymm.scale; + dstZP = (float)attr[2]->asymm.zero_point; + dstScale = attr[2]->asymm.scale; + + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + src0ZP = 0; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + src0Scale = 1; + src0ZP = 0; + } + + if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[1]->dfp.fl > 0) + { + src1Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl))); + } + else + { + src1Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl)); + } + src1ZP = 0; + } + else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + src1Scale = 1; + src1ZP = 0; + } + + if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[2]->dfp.fl > 0) + { + dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + dstScale = 1.0f / dstScale; + dstZP = 0.0f; + } + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + dstScale = 1; + dstZP = 0.0f; + } + vsi_nn_GetFP32MultiAndPostShift(src0Scale / 1.0f, &M0, &postShift0); + vsi_nn_GetFP32MultiAndPostShift(src1Scale / 1.0f, &M1, &postShift1); + + mulKIn0In1Zp = (float)((int)(K + 3) / 4 * 4 * src1ZP * src0ZP); + inOutScale = src0Scale * src1Scale / dstScale; + + if ((attr[0]->shape->size > attr[1]->shape->size) || + (attr[0]->shape->data[2] > attr[1]->shape->data[2] + && attr[0]->shape->size > 2 && attr[1]->shape->size > 2)) + { + bc2zero = 1; + } + else if ((attr[1]->shape->size > attr[0]->shape->size) || + (attr[1]->shape->data[2] > attr[0]->shape->data[2] + && attr[0]->shape->size > 2 && attr[1]->shape->size > 2)) + { + ac2zero = 1; + } + + width = attr[2]->shape->data[0]; + height = attr[2]->shape->data[1]; + chn = attr[2]->shape->size > 2 ? 
attr[2]->shape->data[2] : 1; + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 4; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = gpu_align_p2((height + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1], 4); + gpu_param.global_size[2] = chn; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE, TRANSA, TRANSB, EVIS2) \ + ((IN0_TYPE << 24) | (IN1_TYPE << 16) | (OUT_TYPE << 8) | (TRANSA << 4) | (TRANSB << 2) | (EVIS2)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[2]->dtype, transA, transB, evis2); + { + gpu_dp_inst_t uniU8SubZptoFp16_dp2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniFp16MulFp16AddtoFp32_dp8x2 = {{ + 0x00005555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x00000000, // ABin + 0x00005555, // BSelt + 0x76543210, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertUint8SubZpToFp32B_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert1stFp16ToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniGemmFp16toFp32Row0Lo_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00100010, 0x00100010, // ABin + 0x05050505, // BSelt + 0x00510040, 0x00730062, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmFp16toFp32Row0Hi_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00320032, 0x00320032, // ABin + 
0x05050505, // BSelt + 0x00510040, 0x00730062, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmFp16toFp32Row1Lo_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00540054, 0x00540054, // ABin + 0x05050505, // BSelt + 0x00510040, 0x00730062, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmFp16toFp32Row1Hi_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00760076, 0x00760076, // ABin + 0x05050505, // BSelt + 0x00510040, 0x00730062, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniGemmU8F16toF32Lo_4x4b = {{ + 0x55555555, // TCfg + 0x50505050, // ASelt + 0x51514040, 0x73736262, // ABin + 0x00000000, // BSelt + 0x32103210, 0x32103210, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmU8F16toF32Hi_4x4b = {{ + 0x55555555, // TCfg + 0x50505050, // ASelt + 0x51514040, 0x73736262, // ABin + 0x00000000, // BSelt + 0x76547654, 0x76547654, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmF16I16toF32Lo_4x4b = {{ + 0x55555555, // TCfg + 0x50505050, // ASelt + 0x51514040, 0x73736262, // ABin + 0x00000000, // BSelt + 0x32103210, 0x32103210, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmF16I16toF32Hi_4x4b = {{ + 0x55555555, // TCfg + 0x50505050, // ASelt + 0x51514040, 0x73736262, // ABin + 0x00000000, // BSelt + 0x76547654, 0x76547654, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniGemm1stU8F16toF32Lo_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00100010, 0x00100010, // ABin + 0x05050505, // BSelt + 0x00510040, 0x00730062, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemm2ndU8F16toF32Lo_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00320032, 0x00320032, // ABin + 0x05050505, // BSelt + 0x00510040, 0x00730062, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemm1stU8F16toF32Hi_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00540054, 0x00540054, // ABin + 0x05050505, // BSelt + 0x00510040, 0x00730062, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemm2ndU8F16toF32Hi_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00760076, 0x00760076, // ABin + 0x05050505, // BSelt + 0x00510040, 0x00730062, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmFp16MulZptoFp32_4x4 = {{ + 0xaaaaaaaa, // TCfg + 0x50505050, // ASelt + 0x51514040, 0x73736262, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00020001, 0x00010001, 0x00020001, 0x00010001, 0x00020001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniGemmU8U8toFp32Block4_4x4 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x32103210, 0x32103210, // ABin + 0x55555555, // BSelt + 0xd951c840, 0xfb73ea62, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmU8U8MulZptoFp32_8x4 = {{ + 0xaaaaaaaa, 0xaaaaaaaa, // TCfg + 0xf02a0600, 0x2a8620e0, 0x0640e8f2, 0x60f0f42b, 0xf8f62b86, // BinSelect + 0x00000700, // AccumType, ConstantType, and PostShift + 0x03020302, 0x03020302, 0x03020302, 0x03020302, 0x03020302, 0x03020302, 0x03020302, 0x03020302 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniGemmF16U8toF32_4x4 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x32103210, 0x32103210, // ABin + 0x55555555, // BSelt + 0xd951c840, 0xfb73ea62, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmF16U8toF32Hi_4x4 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76547654, 0x76547654, // ABin + 0x55555555, // BSelt + 0xd951c840, 0xfb73ea62, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmFp16U8MulZptoFp32_4x4 = {{ + 0xaaaaaaaa, // TCfg + 0x55550000, // ASelt + 0x76543210, 0x76543210, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00020002, 0x00020002, 0x00030003, 0x00030003, 0x00040004, 0x00040004 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmFp16I16MulZptoFp32_4x4 = {{ + 0xaaaaaaaa, // TCfg + 0x55550000, // ASelt + 0x76543210, 0x76543210, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00020002, 0x00020002, 0x00020002, 0x00020002, 0x00020002, 0x00020002, 0x00020002, 0x00020002 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmF16I16toF32A_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00100010, 0x00100010, // ABin + 0x05050505, // BSelt + 0x00510040, 0x00730062, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmF16I16toF32B_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00320032, 0x00320032, // ABin + 0x05050505, // BSelt + 0x00510040, 0x00730062, // BBin + 
0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmF16I16toF32C_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00540054, 0x00540054, // ABin + 0x05050505, // BSelt + 0x00510040, 0x00730062, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmF16I16toF32D_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00760076, 0x00760076, // ABin + 0x05050505, // BSelt + 0x00510040, 0x00730062, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + float scaleIn0divOut = src0Scale / dstScale; + float scaleIn1divOut = src1Scale / dstScale; + float inScaleMul = src0Scale * src1Scale; + float reScaleOut = 1 / dstScale; + float inScaledivOut = inScaleMul / dstScale; + uint32_t multiplierA = (M0 << 16) | M0; + uint32_t multiplierB = (M1 << 16) | M1; + uint32_t multiplierZpA = (src0ZP << 16) | src0ZP; + uint32_t multiplierZpB = (src1ZP << 16) | src1ZP; + uint32_t multiplierU8ZpAB = (src0ZP << 24) | (src1ZP << 16) | (src0ZP << 8) | (src1ZP); + int32_t i = 8; + uniConvertUint8SubZpToFp32_4x4.data[7] |= (postShift0 & 0x1F); + uniConvertUint8SubZpToFp32B_4x4.data[7] |= (postShift1 & 0x1F); + for( i = 8; i < 16; i += 2) + { + uniConvertUint8SubZpToFp32_4x4.data[i] = multiplierA; + uniConvertUint8SubZpToFp32B_4x4.data[i] = multiplierB; + } + for( i = 8; i < 16; i++) + { + uniGemmFp16MulZptoFp32_4x4.data[i] = multiplierZpA; + uniGemmU8U8MulZptoFp32_8x4.data[i] = multiplierU8ZpAB; + uniGemmFp16U8MulZptoFp32_4x4.data[i] = multiplierZpB; + uniGemmFp16I16MulZptoFp32_4x4.data[i] = multiplierZpB; + } + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, U8, F16, 0, 1, 0 ): + case _PACK_SELECT_KEY( U8, U8, U8, 0, 1, 0 ): + case _PACK_SELECT_KEY( U8, U8, F16, 0, 1, 1 ): + case _PACK_SELECT_KEY( U8, U8, U8, 0, 1, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8SubZptoFp16_dp2x8", &uniU8SubZptoFp16_dp2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniFp16MulFp16AddtoFp32_dp8x2", &uniFp16MulFp16AddtoFp32_dp8x2 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &src0ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "inScaleMul", &inScaleMul ); + status |= vsi_nn_kernel_gpu_add_param( node, "inScaledivOut", &inScaledivOut ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, U8, F16, 0, 1, 0 ): + case _PACK_SELECT_KEY( F16, U8, U8, 0, 1, 0 ): + case _PACK_SELECT_KEY( F16, U8, F16, 0, 1, 1 ): + case _PACK_SELECT_KEY( F16, U8, U8, 0, 1, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8SubZptoFp16_dp2x8", &uniU8SubZptoFp16_dp2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniFp16MulFp16AddtoFp32_dp8x2", &uniFp16MulFp16AddtoFp32_dp8x2 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input1Scale", 
&src1Scale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "scaleIn2divOut", &scaleIn1divOut ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, F16, F16, 0, 1, 0 ): + case _PACK_SELECT_KEY( F16, F16, F16, 0, 1, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniFp16MulFp16AddtoFp32_dp8x2", &uniFp16MulFp16AddtoFp32_dp8x2 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, F16, U8, 0, 0, 0 ): + case _PACK_SELECT_KEY( F16, F16, I8, 0, 0, 0 ): + case _PACK_SELECT_KEY( F16, F16, I16, 0, 0, 0 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmFp16toFp32Row0Lo_4x4", &uniGemmFp16toFp32Row0Lo_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmFp16toFp32Row0Hi_4x4", &uniGemmFp16toFp32Row0Hi_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmFp16toFp32Row1Lo_4x4", &uniGemmFp16toFp32Row1Lo_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmFp16toFp32Row1Hi_4x4", &uniGemmFp16toFp32Row1Hi_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "outputScale", &reScaleOut ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, F16, U8, 0, 0, 1 ): + case _PACK_SELECT_KEY( F16, F16, I8, 0, 0, 1 ): + case _PACK_SELECT_KEY( F16, F16, I16, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8F16toF32Lo_4x4b", &uniGemmU8F16toF32Lo_4x4b ); + status |= vsi_nn_kernel_gpu_add_param( node, "outputScale", &reScaleOut ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, U8, U8, 0, 0, 0 ): + case _PACK_SELECT_KEY( I8, I8, I8, 0, 0, 0 ): + case _PACK_SELECT_KEY( U8, U8, U8, 0, 0, 1 ): + case _PACK_SELECT_KEY( I8, I8, I8, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8U8toFp32Block4_4x4", &uniGemmU8U8toFp32Block4_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8U8MulZptoFp32_8x4", &uniGemmU8U8MulZptoFp32_8x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "mulKIn0In1Zp", &mulKIn0In1Zp ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, I16, I16, 0, 0, 0 ): + case _PACK_SELECT_KEY( I16, I16, I16, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertUint8SubZpToFp32_4x4", &uniConvertUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertUint8SubZpToFp32B_4x4", &uniConvertUint8SubZpToFp32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &src0ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + status |= 
vsi_nn_kernel_gpu_add_param( node, "outputScale", &reScaleOut ); + } + break; + case _PACK_SELECT_KEY( F16, U8, F16, 0, 0, 0 ): + case _PACK_SELECT_KEY( F16, I8, F16, 0, 0, 0 ): + case _PACK_SELECT_KEY( F16, U8, F16, 0, 0, 1 ): + case _PACK_SELECT_KEY( F16, I8, F16, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniGemmF16U8toF32_4x4", &uniGemmF16U8toF32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmF16U8toF32Hi_4x4", &uniGemmF16U8toF32Hi_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmFp16U8MulZptoFp32_4x4", &uniGemmFp16U8MulZptoFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input1Scale", &src1Scale ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, I16, F16, 0, 0, 0 ): + case _PACK_SELECT_KEY( F16, I16, F16, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertUint8SubZpToFp32B_4x4", &uniConvertUint8SubZpToFp32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvert1stFp16ToFp32_4x4", &uniConvert1stFp16ToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, I16, I16, 0, 0, 0 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniGemmF16I16toF32A_4x4", &uniGemmF16I16toF32A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmF16I16toF32B_4x4", &uniGemmF16I16toF32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmF16I16toF32C_4x4", &uniGemmF16I16toF32C_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmF16I16toF32D_4x4", &uniGemmF16I16toF32D_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmFp16I16MulZptoFp32_4x4", &uniGemmFp16I16MulZptoFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "in1outScale", &scaleIn1divOut ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, I16, I16, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniGemmF16I16toF32Lo_4x4b", &uniGemmF16I16toF32Lo_4x4b ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmF16I16toF32Hi_4x4b", &uniGemmF16I16toF32Hi_4x4b ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmFp16I16MulZptoFp32_4x4", &uniGemmFp16I16MulZptoFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "in1outScale", &scaleIn1divOut ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, U8, U8, 0, 0, 0 ): + case _PACK_SELECT_KEY( F16, U8, U8, 0, 0, 1 ): + case _PACK_SELECT_KEY( F16, I8, I8, 0, 0, 0 ): + case _PACK_SELECT_KEY( F16, I8, I8, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniGemmF16U8toF32_4x4", &uniGemmF16U8toF32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmF16U8toF32Hi_4x4", &uniGemmF16U8toF32Hi_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmFp16U8MulZptoFp32_4x4", &uniGemmFp16U8MulZptoFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "in1outScale", &scaleIn1divOut ); + status |= vsi_nn_kernel_gpu_add_param( 
node, "output_ZP", &dstZP ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, U8, U8, 1, 0, 0 ): + case _PACK_SELECT_KEY( I8, I8, I8, 1, 0, 0 ): + case _PACK_SELECT_KEY( I16, I16, I16, 1, 0, 0 ): + case _PACK_SELECT_KEY( F16, F16, F16, 1, 0, 0 ): + case _PACK_SELECT_KEY( U8, F16, U8, 1, 0, 0 ): + case _PACK_SELECT_KEY( I8, F16, I8, 1, 0, 0 ): + case _PACK_SELECT_KEY( I16, F16, I16, 1, 0, 0 ): + case _PACK_SELECT_KEY( U8, U8, U8, 1, 0, 1 ): + case _PACK_SELECT_KEY( I8, I8, I8, 1, 0, 1 ): + case _PACK_SELECT_KEY( I16, I16, I16, 1, 0, 1 ): + case _PACK_SELECT_KEY( F16, F16, F16, 1, 0, 1 ): + case _PACK_SELECT_KEY( U8, F16, U8, 1, 0, 1 ): + case _PACK_SELECT_KEY( I8, F16, I8, 1, 0, 1 ): + case _PACK_SELECT_KEY( I16, F16, I16, 1, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertUint8SubZpToFp32_4x4", &uniConvertUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertUint8SubZpToFp32B_4x4", &uniConvertUint8SubZpToFp32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvert1stFp16ToFp32_4x4", &uniConvert1stFp16ToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &src0ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "outputScale", &reScaleOut ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, F16, U8, 0, 0, 0 ): + case _PACK_SELECT_KEY( I8, F16, I8, 0, 0, 0 ): + case _PACK_SELECT_KEY( I16, F16, I16, 0, 0, 0 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertUint8SubZpToFp32_4x4", &uniConvertUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvert1stFp16ToFp32_4x4", &uniConvert1stFp16ToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &src0ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "outputScale", &reScaleOut ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, F16, U8, 0, 0, 1 ): + case _PACK_SELECT_KEY( I8, F16, I8, 0, 0, 1 ): + case _PACK_SELECT_KEY( I16, F16, I16, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmFp16MulZptoFp32_4x4", &uniGemmFp16MulZptoFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8F16toF32Lo_4x4b", &uniGemmU8F16toF32Lo_4x4b ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8F16toF32Hi_4x4b", &uniGemmU8F16toF32Hi_4x4b ); + status |= vsi_nn_kernel_gpu_add_param( node, "in0outScale", &scaleIn0divOut ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, U8, F16, 0, 0, 0 ): + case _PACK_SELECT_KEY( I8, I8, F16, 0, 0, 0 ): + case _PACK_SELECT_KEY( U8, U8, F16, 0, 0, 1 ): + case _PACK_SELECT_KEY( I8, I8, F16, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8U8toFp32Block4_4x4", &uniGemmU8U8toFp32Block4_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8U8MulZptoFp32_8x4", 
&uniGemmU8U8MulZptoFp32_8x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input01Scale", &inScaleMul ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, I16, F16, 0, 0, 0 ): + case _PACK_SELECT_KEY( I16, I16, F16, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertUint8SubZpToFp32_4x4", &uniConvertUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertUint8SubZpToFp32B_4x4", &uniConvertUint8SubZpToFp32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &src0ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, F16, F16, 0, 0, 0 ): + case _PACK_SELECT_KEY( I8, F16, F16, 0, 0, 0 ): + case _PACK_SELECT_KEY( I16, F16, F16, 0, 0, 0 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniGemm1stU8F16toF32Lo_4x4", &uniGemm1stU8F16toF32Lo_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemm2ndU8F16toF32Lo_4x4", &uniGemm2ndU8F16toF32Lo_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemm1stU8F16toF32Hi_4x4", &uniGemm1stU8F16toF32Hi_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemm2ndU8F16toF32Hi_4x4", &uniGemm2ndU8F16toF32Hi_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmFp16MulZptoFp32_4x4", &uniGemmFp16MulZptoFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input0Scale", &src0Scale ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, F16, F16, 0, 0, 1 ): + case _PACK_SELECT_KEY( I8, F16, F16, 0, 0, 1 ): + case _PACK_SELECT_KEY( I16, F16, F16, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8F16toF32Lo_4x4b", &uniGemmU8F16toF32Lo_4x4b ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8F16toF32Hi_4x4b", &uniGemmU8F16toF32Hi_4x4b ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmFp16MulZptoFp32_4x4", &uniGemmFp16MulZptoFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input0Scale", &src0Scale ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, F16, F16, 0, 0, 0 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniGemmFp16toFp32Row0Lo_4x4", &uniGemmFp16toFp32Row0Lo_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmFp16toFp32Row0Hi_4x4", &uniGemmFp16toFp32Row0Hi_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmFp16toFp32Row1Lo_4x4", &uniGemmFp16toFp32Row1Lo_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmFp16toFp32Row1Hi_4x4", &uniGemmFp16toFp32Row1Hi_4x4 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, F16, F16, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8F16toF32Lo_4x4b", &uniGemmU8F16toF32Lo_4x4b ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F32, F32, F32, 0, 0, 0 ): + case _PACK_SELECT_KEY( F32, F32, F32, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + status = vsi_nn_kernel_gpu_add_param( node, "ac2zero", &ac2zero ); + status |= vsi_nn_kernel_gpu_add_param( node, "bc2zero", &bc2zero ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } +#undef _PACK_SELECT_KEY + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) 
+ { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + return status; +} /* _matrix_mul_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + int32_t transa, + int32_t transb + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e input1_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_MATRIX_MUL_KEY( input0_dtype, input1_dtype, output_dtype, transa, transb ); + + for( i = 0; i < _cnt_of_array(matrix_mul_map); i ++ ) + { + if ( matrix_mul_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(matrix_mul_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", matrix_mul_map[i].function_name ); + kernel->info.parameters = _matrix_mul_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _matrix_mul_kernel_param_def ); + kernel->info.initialize = _matrix_mul_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + matrix_mul_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + matrix_mul_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_MATRIX_MUL_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" ); + int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" ); + int32_t adjointA = vsi_nn_kernel_param_get_int32( params, "adjointA" ); + int32_t adjointB = vsi_nn_kernel_param_get_int32( params, "adjointB" ); + uint32_t M = inputs[0]->attr.size[1]; + uint32_t K = inputs[0]->attr.size[0]; + uint32_t N = inputs[1]->attr.size[0]; + uint32_t depthA = 1, depthB = 1; + + if ((inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 + && inputs[1]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 + && outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32) + &&(M % 4 != 0 || K % 4 != 0 || N %4 != 0)) + { + return NULL; + } + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + if (transposeA) + { + K = inputs[0]->attr.size[1]; + M = inputs[0]->attr.size[0]; + } + else if (transposeB) + { + N = inputs[1]->attr.size[1]; + } + + depthA = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + depthB = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1; + if (M == 1 && depthB == 1 && depthA > 1) + { + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + shape[0] = inputs[0]->attr.size[0]; + shape[1] = inputs[0]->attr.size[2]; + shape[2] = 1; + shape[3] = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; + rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); + + shape[0] = outputs[0]->attr.size[0]; + shape[1] = outputs[0]->attr.size[2]; + shape[2] = 1; + shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; + rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); + } + + status = _query_kernel( inputs, outputs, kernel, transposeA, transposeB ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 3; + /* Pass parameters to node. */ + if (rs_input) + { + tmp_params[0] = rs_input; + tmp_params[1] = (vsi_nn_kernel_node_param_t)(inputs[1]->t); + tmp_params[2] = rs_output; + } + else + { + vsi_nn_kernel_node_pack_io( tmp_params, _MATRIX_MUL_PARAM_NUM, + inputs, 2, outputs, 1 ); + } + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeA ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeB ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &adjointA ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &adjointB ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &M ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &K ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &N ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _MATRIX_MUL_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + vsi_nn_kernel_tensor_release( &tmp_params[5] ); + vsi_nn_kernel_tensor_release( &tmp_params[6] ); + vsi_nn_kernel_tensor_release( &tmp_params[7] ); + vsi_nn_kernel_tensor_release( &tmp_params[8] ); + vsi_nn_kernel_tensor_release( &tmp_params[9] ); + { + // Set default border mode. 
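+            // The constant border value defaults to 0 for every element type; for a
+            // U8 input A it is overridden with that tensor's quantization zero point
+            // so padded reads decode to quantized zero. When both K and N are
+            // multiples of 4, replicate mode is selected instead of the constant border.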
+ vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.S16 = 0; + border.constant_value.U16 = 0; + border.constant_value.S32 = 0; + border.constant_value.U32 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + if (K % 4 == 0 && N % 4 == 0) + { + border.mode = VX_BORDER_REPLICATE; + } + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + } + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( matrixmul, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c new file mode 100644 index 0000000..5b896c6 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c @@ -0,0 +1,763 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +#define KERNEL_NAME_MAXIMUM_F16F16TOF16 CVIVANTE_NAMESPACE("evis.maximum_F16F16toF16") +#define KERNEL_NAME_MAXIMUM_F16F16TOF16_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toF16_2D") +#define KERNEL_NAME_MAXIMUM_I8I8TOI8 CVIVANTE_NAMESPACE("evis.maximum_I8I8toI8") +#define KERNEL_NAME_MAXIMUM_I8I8TOI8_2D CVIVANTE_NAMESPACE("evis.maximum_I8I8toI8_2D") +#define KERNEL_NAME_MAXIMUM_I8F16TOI8 CVIVANTE_NAMESPACE("evis.maximum_I8F16toI8") +#define KERNEL_NAME_MAXIMUM_I8F16TOI8_2D CVIVANTE_NAMESPACE("evis.maximum_I8F16toI8_2D") +#define KERNEL_NAME_MAXIMUM_I8F16TOF16 CVIVANTE_NAMESPACE("evis.maximum_I8F16toF16") +#define KERNEL_NAME_MAXIMUM_I8F16TOF16_2D CVIVANTE_NAMESPACE("evis.maximum_I8F16toF16_2D") +#define KERNEL_NAME_MAXIMUM_U8F16TOF16 CVIVANTE_NAMESPACE("evis.maximum_U8F16toF16") +#define KERNEL_NAME_MAXIMUM_U8F16TOF16_2D CVIVANTE_NAMESPACE("evis.maximum_U8F16toF16_2D") +#define KERNEL_NAME_MAXIMUM_U8F16TOU8 CVIVANTE_NAMESPACE("evis.maximum_U8F16toU8") +#define KERNEL_NAME_MAXIMUM_U8F16TOU8_2D CVIVANTE_NAMESPACE("evis.maximum_U8F16toU8_2D") +#define KERNEL_NAME_MAXIMUM_U8U8TOU8 CVIVANTE_NAMESPACE("evis.maximum_U8U8toU8") +#define KERNEL_NAME_MAXIMUM_U8U8TOU8_2D CVIVANTE_NAMESPACE("evis.maximum_U8U8toU8_2D") +#define KERNEL_NAME_MAXIMUM_I16I16TOI16 CVIVANTE_NAMESPACE("evis.maximum_I16I16toI16") +#define KERNEL_NAME_MAXIMUM_I16I16TOI16_2D CVIVANTE_NAMESPACE("evis.maximum_I16I16toI16_2D") +#define KERNEL_NAME_MAXIMUM_I16F16TOI16 CVIVANTE_NAMESPACE("evis.maximum_I16F16toI16") +#define KERNEL_NAME_MAXIMUM_I16F16TOI16_2D CVIVANTE_NAMESPACE("evis.maximum_I16F16toI16_2D") +#define KERNEL_NAME_MAXIMUM_I16F16TOF16 CVIVANTE_NAMESPACE("evis.maximum_I16F16toF16") +#define KERNEL_NAME_MAXIMUM_I16F16TOF16_2D CVIVANTE_NAMESPACE("evis.maximum_I16F16toF16_2D") +#define KERNEL_NAME_MAXIMUM_F16F16TOU8 CVIVANTE_NAMESPACE("evis.maximum_F16F16toU8") +#define KERNEL_NAME_MAXIMUM_F16F16TOU8_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toU8_2D") +#define KERNEL_NAME_MAXIMUM_F16F16TOI8 CVIVANTE_NAMESPACE("evis.maximum_F16F16toI8") +#define KERNEL_NAME_MAXIMUM_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toI8_2D") +#define KERNEL_NAME_MAXIMUM_F16F16TOI16 CVIVANTE_NAMESPACE("evis.maximum_F16F16toI16") +#define KERNEL_NAME_MAXIMUM_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toI16_2D") + +#define KERNEL_SOURCE_1 "maximum", +#define KERNEL_SOURCE_2 "maximum_fp16", +#define KERNEL_SOURCE_3 "maximum_i16" + +#define HASH_MAXIMUM_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) + +#define TENSOR_MAX_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MAXIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + KERNEL_NAME_MAXIMUM_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +#define TENSOR_MAX_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MAXIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + KERNEL_NAME_MAXIMUM_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE##_2D, \ + SOURCE }, + +#define HASH_MAXIMUM_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + 
CVIVANTE_NAMESPACE("evis.maximum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) + +#define TENSOR_MAX_KERNELS_HALF(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MAXIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + HASH_MAXIMUM_SH_KERNEL_NAME(F16, F16, F16), \ + SOURCE }, + +#define HASH_MAXIMUM_SH_KERNEL_2D_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.maximum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_MAX_KERNELS_2D_HALF(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MAXIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + HASH_MAXIMUM_SH_KERNEL_2D_NAME(F16, F16, F16), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_MAX_KERNELS_HALF(F16, F16, F16, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS(F16, F16, I8, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS(I8, I8, I8, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS(I16, I16, I16, KERNEL_SOURCE_1) + + TENSOR_MAX_KERNELS(F16, F16, U8, KERNEL_SOURCE_2) + TENSOR_MAX_KERNELS(I8, F16, I8, KERNEL_SOURCE_2) + TENSOR_MAX_KERNELS(I8, F16, F16, KERNEL_SOURCE_2) + TENSOR_MAX_KERNELS(U8, F16, U8, KERNEL_SOURCE_2) + TENSOR_MAX_KERNELS(U8, F16, F16, KERNEL_SOURCE_2) + + TENSOR_MAX_KERNELS(I16, F16, I16, KERNEL_SOURCE_3) + TENSOR_MAX_KERNELS(I16, F16, F16, KERNEL_SOURCE_3) + TENSOR_MAX_KERNELS(F16, F16, I16, KERNEL_SOURCE_3) + + TENSOR_MAX_KERNELS_2D_HALF(F16, F16, F16, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS_2D_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_1) + + TENSOR_MAX_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_2) + TENSOR_MAX_KERNELS_2D(I8, F16, I8, KERNEL_SOURCE_2) + TENSOR_MAX_KERNELS_2D(I8, F16, F16, KERNEL_SOURCE_2) + TENSOR_MAX_KERNELS_2D(U8, F16, U8, KERNEL_SOURCE_2) + TENSOR_MAX_KERNELS_2D(U8, F16, F16, KERNEL_SOURCE_2) + + TENSOR_MAX_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_3) + TENSOR_MAX_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_3) + TENSOR_MAX_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3) +}; + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; +#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) + +DEF_KERNEL_INITIALIZER(_maximum_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + uint8_t in0_fl = 0; + int32_t src0ZP = 0; + float src0Scale = 1.0f; + uint8_t in1_fl = 0; + int32_t src1ZP = 0; + float src1Scale = 1.0f; + uint8_t out_fl = 0; + int32_t dstZP = 0; + float dstScale = 1.0f; + float output_zp = 0.0f; + + int32_t shift0 = 0; + int32_t shift1 = 0; + + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + vsi_int_array_t * out_shape = NULL; + uint32_t pack_key; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] 
= vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_shape = attr[2]->shape; + + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + in0_fl = (uint8_t)attr[0]->dfp.fl; + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM) + { + src0ZP = attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + } + + if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + in1_fl = (uint8_t)attr[1]->dfp.fl; + } + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM) + { + src1ZP = attr[1]->asymm.zero_point; + src1Scale = attr[1]->asymm.scale; + } + + if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + out_fl = (uint8_t)attr[2]->dfp.fl; + if (out_fl > 0) + { + dstScale = (float) ((int64_t)1 << out_fl); + } + else + { + dstScale = 1.0f / (float)((int64_t)1 << -out_fl); + } + dstZP = 0; + } + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM) + { + dstZP = attr[2]->asymm.zero_point; + dstScale = attr[2]->asymm.scale; + } + output_zp = (float)dstZP; + + shift0 = in0_fl - out_fl; + shift1 = in1_fl - out_fl; + +#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, + attr[1]->dtype, attr[2]->dtype ); + + if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16) + || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) ) + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + switch( pack_key ) + { + case _PACK_SELECT_KEY( I8, I8, I8 ): + case _PACK_SELECT_KEY( I8, F16, I8 ): + { + gpu_dp_inst_t uniConvertI8toI8_0_part0_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertI8toI8_0_part1_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertI8toI8_1_part0_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertI8toI8_1_part1_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uniConvertI8toI8_0_part0_2x8, shift0 ); + gpu_dp_inst_update_postshfit( &uniConvertI8toI8_0_part1_2x8, shift0 ); + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertI8toI8_0_part0_2x8", &uniConvertI8toI8_0_part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertI8toI8_0_part1_2x8", &uniConvertI8toI8_0_part1_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + if ( attr[1]->dtype == F16 ) + { + gpu_dp_inst_t uinConvertFp16ToInt8_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uinConvertFp16ToInt8_2x8, shift1 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uinConvertFp16ToInt8_2x8", &uinConvertFp16ToInt8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + gpu_dp_inst_update_postshfit( &uniConvertI8toI8_1_part0_2x8, shift1 ); + gpu_dp_inst_update_postshfit( &uniConvertI8toI8_1_part1_2x8, shift1 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertI8toI8_1_part0_2x8", &uniConvertI8toI8_1_part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertI8toI8_1_part1_2x8", &uniConvertI8toI8_1_part1_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + break; + case _PACK_SELECT_KEY( I16, I16, I16 ): + { + gpu_dp_inst_t uniConvertI16toI16_0_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + 
gpu_dp_inst_t uniConvertI16toI16_1_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uniConvertI16toI16_0_2x8, shift0 ); + gpu_dp_inst_update_postshfit( &uniConvertI16toI16_1_2x8, shift1 ); + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertI16toI16_0_2x8", &uniConvertI16toI16_0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertI16toI16_1_2x8", &uniConvertI16toI16_1_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( U8, U8, U8 ): + case _PACK_SELECT_KEY( U8, F16, U8 ): + case _PACK_SELECT_KEY( F16, F16, U8 ): + { + uint16_t M0 = 0; + uint16_t M1 = 0; + int32_t postShift0 = 0; + int32_t postShift1 = 0; + uint32_t multAndoutZP0[2] = {0}; + uint32_t multAndoutZP1[2] = {0}; + + gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x1b1a1918, 0x1f1e1d1c, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0); + gpu_quantize_multiplier_16bit( (double)src1Scale / dstScale, &M1, &postShift1); + + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0); + multAndoutZP1[0] = (uint32_t)(M1); + multAndoutZP1[1] = (uint32_t)((dstZP << postShift1) - src1ZP * M1); + + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift0 ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift0 ); + + status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + if (attr[0]->dtype == U8) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if ( attr[1]->dtype == F16 ) + { + gpu_dp_inst_t uniConvertFp16toU8_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uniConvertFp16toU8_2x8, postShift1 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertFp16toU8_2x8", &uniConvertFp16toU8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + gpu_dp_inst_update_postshfit( 
&uniU8MulAndPostShift_Lo_2x8, postShift1 ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + break; + case _PACK_SELECT_KEY( I8, F16, F16 ): + { + gpu_dp_inst_t uniConvertInt8toFp16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uniConvertInt8toFp16_2x8, shift0 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt8toFp16_2x8", &uniConvertInt8toFp16_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( U8, F16, F16 ): + { + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP0[2] = {0}; + gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift); + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0); + + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( I16, F16, I16 ): + { + gpu_dp_inst_t uniConvertI16toI16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uinConvertFp16ToInt16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uniConvertI16toI16_2x8, shift0 ); + gpu_dp_inst_update_postshfit( &uinConvertFp16ToInt16_2x8, shift1 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertI16toI16_2x8", &uniConvertI16toI16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uinConvertFp16ToInt16_2x8", &uinConvertFp16ToInt16_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( F16, F16, I16 ): + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 
0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert1stFp16ToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert2ndFp16ToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM) + { + dstScale = 1.0f / dstScale; + } + + status = vsi_nn_kernel_gpu_add_param( node, + "outputScale", &dstScale ); + status = vsi_nn_kernel_gpu_add_param( node, + "output_zp", &output_zp ); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniConvert1stFp16ToFp32_4x4", &uniConvert1stFp16ToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniConvert2ndFp16ToFp32_4x4", &uniConvert2ndFp16ToFp32_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( I16, F16, F16 ): + { + gpu_dp_inst_t uniConvertInt16toFp16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uniConvertInt16toFp16_2x8, shift0 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt16toFp16_2x8", &uniConvertInt16toFp16_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( F16, F16, I8 ): + { + gpu_dp_inst_t uinConvertFp16ToInt8_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uinConvertFp16ToInt8_2x8, shift0 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uinConvertFp16ToInt8_2x8", &uinConvertFp16ToInt8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + +#undef _PACK_SELECT_KEY + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} /* _maxmum_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input0_dtype; + vsi_nn_kernel_dtype_e input1_dtype; + 
vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_MAXIMUM_KEY( input0_dtype, input1_dtype, output_dtype, image_2d ); + + for ( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _maximum_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* tmp_inputs[2] = { NULL }; + vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type; + vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type; + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + // Reorder tensor + if ( dtype1 != dtype2 && dtype1 == VSI_NN_TYPE_FLOAT16 ) + { + int32_t order[2] = {1, 0}; + vsi_nn_reorder_tensor( inputs, order, 2, tmp_inputs ); + } + else + { + memmove( tmp_inputs, inputs, sizeof(vsi_nn_tensor_t*) * 2 ); + } + + image_2d = (outputs[0]->attr.dim_num == 2); + status = _query_kernel( tmp_inputs, outputs, image_2d, kernel ); + if ( VSI_SUCCESS == status ) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM, + tmp_inputs, 2, outputs, 1 ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM ); + + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( maximum, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c new file mode 100644 index 0000000..356b93f --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c @@ -0,0 +1,763 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +#define KERNEL_NAME_MINIMUM_F16F16TOF16 CVIVANTE_NAMESPACE("evis.minimum_F16F16toF16") +#define KERNEL_NAME_MINIMUM_F16F16TOF16_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toF16_2D") +#define KERNEL_NAME_MINIMUM_I8I8TOI8 CVIVANTE_NAMESPACE("evis.minimum_I8I8toI8") +#define KERNEL_NAME_MINIMUM_I8I8TOI8_2D CVIVANTE_NAMESPACE("evis.minimum_I8I8toI8_2D") +#define KERNEL_NAME_MINIMUM_I8F16TOI8 CVIVANTE_NAMESPACE("evis.minimum_I8F16toI8") +#define KERNEL_NAME_MINIMUM_I8F16TOI8_2D CVIVANTE_NAMESPACE("evis.minimum_I8F16toI8_2D") +#define KERNEL_NAME_MINIMUM_I8F16TOF16 CVIVANTE_NAMESPACE("evis.minimum_I8F16toF16") +#define KERNEL_NAME_MINIMUM_I8F16TOF16_2D CVIVANTE_NAMESPACE("evis.minimum_I8F16toF16_2D") +#define KERNEL_NAME_MINIMUM_U8F16TOF16 CVIVANTE_NAMESPACE("evis.minimum_U8F16toF16") +#define KERNEL_NAME_MINIMUM_U8F16TOF16_2D CVIVANTE_NAMESPACE("evis.minimum_U8F16toF16_2D") +#define KERNEL_NAME_MINIMUM_U8F16TOU8 CVIVANTE_NAMESPACE("evis.minimum_U8F16toU8") +#define KERNEL_NAME_MINIMUM_U8F16TOU8_2D CVIVANTE_NAMESPACE("evis.minimum_U8F16toU8_2D") +#define KERNEL_NAME_MINIMUM_U8U8TOU8 CVIVANTE_NAMESPACE("evis.minimum_U8U8toU8") +#define KERNEL_NAME_MINIMUM_U8U8TOU8_2D CVIVANTE_NAMESPACE("evis.minimum_U8U8toU8_2D") +#define KERNEL_NAME_MINIMUM_I16I16TOI16 CVIVANTE_NAMESPACE("evis.minimum_I16I16toI16") +#define KERNEL_NAME_MINIMUM_I16I16TOI16_2D CVIVANTE_NAMESPACE("evis.minimum_I16I16toI16_2D") +#define KERNEL_NAME_MINIMUM_I16F16TOI16 CVIVANTE_NAMESPACE("evis.minimum_I16F16toI16") +#define KERNEL_NAME_MINIMUM_I16F16TOI16_2D CVIVANTE_NAMESPACE("evis.minimum_I16F16toI16_2D") +#define KERNEL_NAME_MINIMUM_I16F16TOF16 CVIVANTE_NAMESPACE("evis.minimum_I16F16toF16") +#define KERNEL_NAME_MINIMUM_I16F16TOF16_2D CVIVANTE_NAMESPACE("evis.minimum_I16F16toF16_2D") +#define KERNEL_NAME_MINIMUM_F16F16TOU8 CVIVANTE_NAMESPACE("evis.minimum_F16F16toU8") +#define KERNEL_NAME_MINIMUM_F16F16TOU8_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toU8_2D") +#define KERNEL_NAME_MINIMUM_F16F16TOI8 CVIVANTE_NAMESPACE("evis.minimum_F16F16toI8") +#define KERNEL_NAME_MINIMUM_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toI8_2D") +#define KERNEL_NAME_MINIMUM_F16F16TOI16 CVIVANTE_NAMESPACE("evis.minimum_F16F16toI16") +#define KERNEL_NAME_MINIMUM_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toI16_2D") + +#define KERNEL_SOURCE_1 "minimum", +#define KERNEL_SOURCE_2 "minimum_fp16", +#define KERNEL_SOURCE_3 "minimum_i16" + +#define HASH_MINIMUM_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) + +#define 
TENSOR_MIN_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + KERNEL_NAME_MINIMUM_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +#define TENSOR_MIN_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + KERNEL_NAME_MINIMUM_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE##_2D, \ + SOURCE }, + +#define HASH_MINIMUM_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.minimum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) + +#define TENSOR_MIN_KERNELS_HALF(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + HASH_MINIMUM_SH_KERNEL_NAME(F16, F16, F16), \ + SOURCE }, + +#define HASH_MINIMUM_SH_KERNEL_2D_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.minimum_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_MIN_KERNELS_2D_HALF(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MINIMUM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + HASH_MINIMUM_SH_KERNEL_2D_NAME(F16, F16, F16), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_MIN_KERNELS_HALF(F16, F16, F16, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS(F16, F16, I8, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS(I8, I8, I8, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS(I16, I16, I16, KERNEL_SOURCE_1) + + TENSOR_MIN_KERNELS(F16, F16, U8, KERNEL_SOURCE_2) + TENSOR_MIN_KERNELS(I8, F16, I8, KERNEL_SOURCE_2) + TENSOR_MIN_KERNELS(I8, F16, F16, KERNEL_SOURCE_2) + TENSOR_MIN_KERNELS(U8, F16, U8, KERNEL_SOURCE_2) + TENSOR_MIN_KERNELS(U8, F16, F16, KERNEL_SOURCE_2) + + TENSOR_MIN_KERNELS(I16, F16, I16, KERNEL_SOURCE_3) + TENSOR_MIN_KERNELS(I16, F16, F16, KERNEL_SOURCE_3) + TENSOR_MIN_KERNELS(F16, F16, I16, KERNEL_SOURCE_3) + + TENSOR_MIN_KERNELS_2D_HALF(F16, F16, F16, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS_2D_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_1) + + TENSOR_MIN_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_2) + TENSOR_MIN_KERNELS_2D(I8, F16, I8, KERNEL_SOURCE_2) + TENSOR_MIN_KERNELS_2D(I8, F16, F16, KERNEL_SOURCE_2) + TENSOR_MIN_KERNELS_2D(U8, F16, U8, KERNEL_SOURCE_2) + TENSOR_MIN_KERNELS_2D(U8, F16, F16, KERNEL_SOURCE_2) + + TENSOR_MIN_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_3) + TENSOR_MIN_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_3) + TENSOR_MIN_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3) +}; + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; +#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) + +DEF_KERNEL_INITIALIZER(_minimum_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + uint8_t in0_fl = 0; + int32_t src0ZP = 0; + float src0Scale = 1.0f; + uint8_t in1_fl = 0; + int32_t src1ZP = 0; + float src1Scale = 1.0f; + uint8_t out_fl = 0; + int32_t dstZP = 0; + float dstScale = 1.0f; + float output_zp = 0.0f; + + int32_t 
shift0 = 0; + int32_t shift1 = 0; + + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + vsi_int_array_t * out_shape = NULL; + uint32_t pack_key; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_shape = attr[2]->shape; + + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + in0_fl = (uint8_t)attr[0]->dfp.fl; + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM) + { + src0ZP = attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + } + + if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + in1_fl = (uint8_t)attr[1]->dfp.fl; + } + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM) + { + src1ZP = attr[1]->asymm.zero_point; + src1Scale = attr[1]->asymm.scale; + } + + if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + out_fl = (uint8_t)attr[2]->dfp.fl; + if (out_fl > 0) + { + dstScale = (float) ((int64_t)1 << out_fl); + } + else + { + dstScale = 1.0f / (float)((int64_t)1 << -out_fl); + } + dstZP = 0; + } + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM) + { + dstZP = attr[2]->asymm.zero_point; + dstScale = attr[2]->asymm.scale; + } + output_zp = (float)dstZP; + + shift0 = in0_fl - out_fl; + shift1 = in1_fl - out_fl; + +#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, + attr[1]->dtype, attr[2]->dtype ); + + if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16) + || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) ) + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + switch( pack_key ) + { + case _PACK_SELECT_KEY( I8, I8, I8 ): + case _PACK_SELECT_KEY( I8, F16, I8 ): + { + gpu_dp_inst_t uniConvertI8toI8_0_part0_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertI8toI8_0_part1_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertI8toI8_1_part0_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertI8toI8_1_part1_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uniConvertI8toI8_0_part0_2x8, shift0 ); + gpu_dp_inst_update_postshfit( &uniConvertI8toI8_0_part1_2x8, shift0 ); + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertI8toI8_0_part0_2x8", &uniConvertI8toI8_0_part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertI8toI8_0_part1_2x8", &uniConvertI8toI8_0_part1_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + if ( attr[1]->dtype == F16 ) + { + gpu_dp_inst_t uinConvertFp16ToInt8_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uinConvertFp16ToInt8_2x8, shift1 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uinConvertFp16ToInt8_2x8", &uinConvertFp16ToInt8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + gpu_dp_inst_update_postshfit( &uniConvertI8toI8_1_part0_2x8, shift1 ); + gpu_dp_inst_update_postshfit( &uniConvertI8toI8_1_part1_2x8, shift1 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertI8toI8_1_part0_2x8", &uniConvertI8toI8_1_part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertI8toI8_1_part1_2x8", &uniConvertI8toI8_1_part1_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + break; + case _PACK_SELECT_KEY( I16, I16, I16 ): + { + gpu_dp_inst_t uniConvertI16toI16_0_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + 
gpu_dp_inst_t uniConvertI16toI16_1_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uniConvertI16toI16_0_2x8, shift0 ); + gpu_dp_inst_update_postshfit( &uniConvertI16toI16_1_2x8, shift1 ); + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertI16toI16_0_2x8", &uniConvertI16toI16_0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertI16toI16_1_2x8", &uniConvertI16toI16_1_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( U8, U8, U8 ): + case _PACK_SELECT_KEY( U8, F16, U8 ): + case _PACK_SELECT_KEY( F16, F16, U8 ): + { + uint16_t M0 = 0; + uint16_t M1 = 0; + int32_t postShift0 = 0; + int32_t postShift1 = 0; + uint32_t multAndoutZP0[2] = {0}; + uint32_t multAndoutZP1[2] = {0}; + + gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x1b1a1918, 0x1f1e1d1c, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0); + gpu_quantize_multiplier_16bit( (double)src1Scale / dstScale, &M1, &postShift1); + + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0); + multAndoutZP1[0] = (uint32_t)(M1); + multAndoutZP1[1] = (uint32_t)((dstZP << postShift1) - src1ZP * M1); + + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift0 ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift0 ); + + status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + if (attr[0]->dtype == U8) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if ( attr[1]->dtype == F16 ) + { + gpu_dp_inst_t uniConvertFp16toU8_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uniConvertFp16toU8_2x8, postShift1 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertFp16toU8_2x8", &uniConvertFp16toU8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + gpu_dp_inst_update_postshfit( 
&uniU8MulAndPostShift_Lo_2x8, postShift1 ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + break; + case _PACK_SELECT_KEY( I8, F16, F16 ): + { + gpu_dp_inst_t uniConvertInt8toFp16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uniConvertInt8toFp16_2x8, shift0 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt8toFp16_2x8", &uniConvertInt8toFp16_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( U8, F16, F16 ): + { + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP0[2] = {0}; + gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift); + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0); + + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( I16, F16, I16 ): + { + gpu_dp_inst_t uniConvertI16toI16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uinConvertFp16ToInt16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uniConvertI16toI16_2x8, shift0 ); + gpu_dp_inst_update_postshfit( &uinConvertFp16ToInt16_2x8, shift1 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertI16toI16_2x8", &uniConvertI16toI16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uinConvertFp16ToInt16_2x8", &uinConvertFp16ToInt16_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( F16, F16, I16 ): + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 
0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert1stFp16ToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert2ndFp16ToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM) + { + dstScale = 1.0f / dstScale; + } + + status = vsi_nn_kernel_gpu_add_param( node, + "outputScale", &dstScale ); + status = vsi_nn_kernel_gpu_add_param( node, + "output_zp", &output_zp ); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniConvert1stFp16ToFp32_4x4", &uniConvert1stFp16ToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniConvert2ndFp16ToFp32_4x4", &uniConvert2ndFp16ToFp32_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( I16, F16, F16 ): + { + gpu_dp_inst_t uniConvertInt16toFp16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uniConvertInt16toFp16_2x8, shift0 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt16toFp16_2x8", &uniConvertInt16toFp16_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( F16, F16, I8 ): + { + gpu_dp_inst_t uinConvertFp16ToInt8_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uinConvertFp16ToInt8_2x8, shift0 ); + status = vsi_nn_kernel_gpu_add_param( node, + "uinConvertFp16ToInt8_2x8", &uinConvertFp16ToInt8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + +#undef _PACK_SELECT_KEY + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} /* _minmum_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input0_dtype; + vsi_nn_kernel_dtype_e input1_dtype; + 
vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_MINIMUM_KEY( input0_dtype, input1_dtype, output_dtype, image_2d ); + + for ( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _minimum_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* tmp_inputs[2] = { NULL }; + vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type; + vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type; + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + // Reorder tensor + if ( dtype1 != dtype2 && dtype1 == VSI_NN_TYPE_FLOAT16 ) + { + int32_t order[2] = {1, 0}; + vsi_nn_reorder_tensor( inputs, order, 2, tmp_inputs ); + } + else + { + memmove( tmp_inputs, inputs, sizeof(vsi_nn_tensor_t*) * 2 ); + } + + image_2d = (outputs[0]->attr.dim_num == 2); + status = _query_kernel( tmp_inputs, outputs, image_2d, kernel ); + if ( VSI_SUCCESS == status ) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM, + tmp_inputs, 2, outputs, 1 ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM ); + + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( minimum, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/moments_evis.c b/src/tim/vx/internal/src/kernel/evis/moments_evis.c new file mode 100644 index 0000000..032d473 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/moments_evis.c @@ -0,0 +1,682 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +#define KERNEL_SOURCE_1 "moments_axis0" +#define KERNEL_SOURCE_2 "moments_axis1" +#define KERNEL_SOURCE_3 "moments_axis2" +#define KERNEL_SOURCE_4 "moments_axis01" +#define KERNEL_SOURCE_5 "moments_axis012" + +// Add kernel hashtable here +#define HASH_MOMENTS_KEY(_input0_type, _output_type, _axis_num, _axis0, _axis1, _axis2, _image_2d) \ + ((_input0_type<<24) | (_output_type<<20) | (_axis_num<<16) | (_axis0<<12) | (_axis1<<8) | (_axis2<<4)|(_image_2d)) + +#define HASH_MOMENTS_SH_KERNEL_NAME(AXIS0, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.moments_axis"#AXIS0"_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_MOMENTS_SH_KERNEL_2D_NAME(AXIS0, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.moments_axis"#AXIS0"_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +#define HASH_MOMENTS_TWO_AXIS_SH_KERNEL_NAME(AXIS0, AXIS1, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.moments_axis"#AXIS0#AXIS1"_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_MOMENTS_TWO_AXIS_SH_KERNEL_2D_NAME(AXIS0, AXIS1, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.moments_axis"#AXIS0#AXIS1"_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +#define HASH_MOMENTS_THREE_AXIS_SH_KERNEL_NAME(AXIS0, AXIS1, AXIS2, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.moments_axis"#AXIS0#AXIS1#AXIS2"_"#SRC0_TYPE"to"#DST_TYPE) + +#define TENSOR_MOMENTS_KERNELS(IN0_TYPE, OUT_TYPE, AXIS0, SOURCE) \ + { HASH_MOMENTS_KEY(IN0_TYPE, OUT_TYPE, 1, AXIS0, 0, 0, 0), \ + HASH_MOMENTS_SH_KERNEL_NAME(AXIS0, IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_MOMENTS_KERNELS_2D(IN0_TYPE, OUT_TYPE, AXIS0, SOURCE) \ + { HASH_MOMENTS_KEY(IN0_TYPE, OUT_TYPE, 1, AXIS0, 0, 0, 1), \ + HASH_MOMENTS_SH_KERNEL_2D_NAME(AXIS0, IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_MOMENTS_TWO_AXIS_KERNELS(IN0_TYPE, OUT_TYPE, AXIS0, AXIS1, SOURCE) \ + { HASH_MOMENTS_KEY(IN0_TYPE, OUT_TYPE, 2, AXIS0, AXIS1, 0, 0), \ + HASH_MOMENTS_TWO_AXIS_SH_KERNEL_NAME(AXIS0, AXIS1, IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(IN0_TYPE, OUT_TYPE, AXIS0, AXIS1, SOURCE) \ + { HASH_MOMENTS_KEY(IN0_TYPE, OUT_TYPE, 2, AXIS0, AXIS1, 0, 1), \ + HASH_MOMENTS_TWO_AXIS_SH_KERNEL_2D_NAME(AXIS0, AXIS1, IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_MOMENTS_THREE_AXIS_KERNELS(IN0_TYPE, OUT_TYPE, AXIS0, AXIS1, AXIS2, SOURCE) \ + { HASH_MOMENTS_KEY(IN0_TYPE, OUT_TYPE, 3, AXIS0, AXIS1, AXIS2, 0), \ + HASH_MOMENTS_THREE_AXIS_SH_KERNEL_NAME(AXIS0, AXIS1, AXIS2, IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } moments_map[] = +{ + TENSOR_MOMENTS_KERNELS(U8, F16, 0, KERNEL_SOURCE_1) + 
TENSOR_MOMENTS_KERNELS(I8, F16, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS(I16, F16, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS(F16, F16, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS(U8, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS(I8, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS(I16, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS(F16, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS(U8, F16, 2, KERNEL_SOURCE_3) + TENSOR_MOMENTS_KERNELS(I8, F16, 2, KERNEL_SOURCE_3) + TENSOR_MOMENTS_KERNELS(I16, F16, 2, KERNEL_SOURCE_3) + TENSOR_MOMENTS_KERNELS(F16, F16, 2, KERNEL_SOURCE_3) + TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, F16, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS(I8, F16, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS(I16, F16, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS(F16, F16, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F16, 0, 1, 2, KERNEL_SOURCE_5) + TENSOR_MOMENTS_THREE_AXIS_KERNELS(I8, F16, 0, 1, 2, KERNEL_SOURCE_5) + TENSOR_MOMENTS_THREE_AXIS_KERNELS(I16, F16, 0, 1, 2, KERNEL_SOURCE_5) + TENSOR_MOMENTS_THREE_AXIS_KERNELS(F16, F16, 0, 1, 2, KERNEL_SOURCE_5) + TENSOR_MOMENTS_KERNELS_2D(U8, F16, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS_2D(I8, F16, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS_2D(I16, F16, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS_2D(F16, F16, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS_2D(U8, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS_2D(I8, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS_2D(I16, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS_2D(F16, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(U8, F16, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I8, F16, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I16, F16, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(F16, F16, 0, 1, KERNEL_SOURCE_4) +}; + +/* + * Kernel params + */ + +static vx_param_description_t _moments_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _MOMENTS_PARAM_NUM _cnt_of_array( _moments_kernel_param_def ) + +static int32_t set_constant_border + ( + vsi_nn_kernel_node_t node, + int32_t value + ) +{ + vsi_status status = VSI_FAILURE; + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.S32 = value; + border.constant_value.U32 = (vx_uint32)value; + border.constant_value.S16 = (vx_int16)value; + border.constant_value.U8 = (vx_uint8)value; + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + return status; +} + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_moments_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL}; + vsi_int_array_t * input_shape = NULL; + float scaleIn = 0; + int32_t input_zp = 0; + 
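+    /* The variables below are correction terms derived from the input zero-point
+     * and scale; they are passed to the shader so it can turn raw integer sums and
+     * sums-of-squares into the floating-point mean and variance outputs. */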
vx_uint32 iter = 0; + int32_t sumInZp = 0; + int32_t tmpZp1 = 0; + float tmpZp2 = 0; + float e2InScale = 0; + float rowSumScale = 0; + int32_t axis = 0; + int32_t axis_num = 0; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + float dimRatio = 1.0; + int32_t iterSize = 16; + float zpScaleSqr_i16 = 0.0f; + float zpScale2_i16 = 0.0f; + float sumScale_i16 = 0.0f; + + uint32_t pack_key = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis_num); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + input_shape = attr[0]->shape; + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + + if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + + input_zp = 0; + } + else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + input_zp = 0; + scaleIn = 1; + } + + if(attr[0]->dtype == I16) + { + iterSize = 8; + } + + width = input_shape->data[0]; + height = input_shape->size > 1 ? input_shape->data[1] : 1; + chn = input_shape->size > 2 ? input_shape->data[2] : 1; + + shaderParam.global_scale[0] = 1; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + + if(axis_num == 1 && axis == 0) + { + iter = width; + dimRatio = (float)(1.0 / (width)); + + shaderParam.global_size[0] = height; + shaderParam.global_size[1] = chn; + shaderParam.global_size[2] = 1; + } + else if(axis_num == 1 && axis == 1) + { + iter = height; + dimRatio = (float)(1.0 / (height)); + + shaderParam.global_scale[0] = 4; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = chn; + shaderParam.global_size[2] = 1; + } + else if(axis_num == 1 && axis == 2) + { + iter = chn; + dimRatio = (float)(1.0 / (chn)); + + shaderParam.global_scale[0] = 4; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = height; + shaderParam.global_size[2] = 1; + } + else if(axis_num == 2) + { + iter = height * iterSize; + dimRatio = (float)(1.0 / (width * height)); + + shaderParam.local_size[0] = 16; + shaderParam.local_size[1] = 1; + shaderParam.local_size[2] = 1; + shaderParam.global_size[0] = 16; + shaderParam.global_size[1] = chn; + shaderParam.global_size[2] = 1; + } + else if(axis_num == 3) + { + iter = height * iterSize; + dimRatio = (float)(1.0 / (width * height * chn)); + + shaderParam.local_size[0] = 16; + shaderParam.local_size[1] = 1; + shaderParam.local_size[2] = 1; + shaderParam.global_size[0] = 16; + shaderParam.global_size[1] = 1; + shaderParam.global_size[2] = 1; + } + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + tmpZp1 = (-2) * input_zp; + e2InScale = scaleIn * scaleIn; + tmpZp2 = input_zp * input_zp * e2InScale; + sumInZp = input_zp * iter * (-1); + rowSumScale = iter * tmpZp2; + + zpScaleSqr_i16 = 8 * tmpZp2; + zpScale2_i16 = tmpZp1 * e2InScale; + sumScale_i16 = sumInZp * scaleIn; + +#define _PACK_SELECT_KEY( IN0_TYPE, AXIS_NUM, FIRST_AXIS ) \ + (IN0_TYPE | 
(AXIS_NUM << 8) | (FIRST_AXIS << 16)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, axis_num, axis); + + { + gpu_dp_inst_t uniSumU8_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSqrSum_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x55555555, // BSelt + 0x76543210, 0xfedcba98, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, 1, 0): + case _PACK_SELECT_KEY( I8, 1, 0): + case _PACK_SELECT_KEY( I16, 1, 0): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "zpScaleSqr_i16", &zpScaleSqr_i16); + status |= vsi_nn_kernel_gpu_add_param(node, "zpScale2_i16", &zpScale2_i16); + status |= vsi_nn_kernel_gpu_add_param(node, "sumScale_i16", &sumScale_i16); + status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, 1, 0): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + 
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, 1, 1): + case _PACK_SELECT_KEY( I8, 1, 1): + case _PACK_SELECT_KEY( I16, 1, 1): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, 1, 1): + { + status = vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, 1, 2): + case _PACK_SELECT_KEY( I8, 1, 2): + case _PACK_SELECT_KEY( I16, 1, 2): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, 1, 2): + { + status = vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, 2, 0): + case _PACK_SELECT_KEY( I8, 2, 0): + case _PACK_SELECT_KEY( I16, 2, 0): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "zpScaleSqr_i16", &zpScaleSqr_i16); + status |= vsi_nn_kernel_gpu_add_param(node, "zpScale2_i16", &zpScale2_i16); + status |= vsi_nn_kernel_gpu_add_param(node, "sumScale_i16", &sumScale_i16); + status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, 3, 0): + case _PACK_SELECT_KEY( I8, 3, 0): + case _PACK_SELECT_KEY( I16, 3, 0): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); + 
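+            /* Full-tensor reduction (axes 0,1,2): width, height and channel are
+             * also passed so the shader can walk every remaining dimension. */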
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn); + status |= vsi_nn_kernel_gpu_add_param(node, "zpScaleSqr_i16", &zpScaleSqr_i16); + status |= vsi_nn_kernel_gpu_add_param(node, "zpScale2_i16", &zpScale2_i16); + status |= vsi_nn_kernel_gpu_add_param(node, "sumScale_i16", &sumScale_i16); + status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, 2, 0): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, 3, 0): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + VSI_ASSERT( FALSE ); + break; + } + status = vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", &uniConvertHalftoFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } +#undef _PACK_SELECT_KEY + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + + return status; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params, + int32_t* axis, + int32_t axis_num, + int32_t rs_flg + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_MOMENTS_KEY( input0_dtype, output_dtype, axis_num, axis[0], axis[1], axis[2], rs_flg ); + + for( i = 0; i < _cnt_of_array(moments_map); i++ ) + { + if( moments_map[i].key == key ) + { + break; + } + } + + if( i < _cnt_of_array(moments_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", moments_map[i].function_name ); + kernel->info.parameters = _moments_kernel_param_def; + kernel->info.numParams = _MOMENTS_PARAM_NUM; + kernel->info.initialize = _moments_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + moments_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + moments_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_MOMENTS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t axis_num = 0; + int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( 
params, "axis", (size_t*)&axis_num); + int32_t axis_first = axis[0]; + int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 1, 1, 1, 1 } }; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + + int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t i = 0; + uint32_t axis_size = 0; + uint32_t rank_in = 0; + uint32_t rank_out = 0; + vsi_bool ret = FALSE; + vsi_bool image_2d = FALSE; + vsi_bool is_continue_axis = TRUE; + + for ( i = 1; i < axis_num; i++) + { + if ( axis[i] != (axis[i - 1] + 1) && axis[0] == 0) + { + is_continue_axis = FALSE; + break; + } + } + + if (is_continue_axis == FALSE) + { + return NULL; + } + + ret = vsi_nn_kernel_optimize_reduce_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + axis, axis_num, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], &rank_in, shapes[1], &rank_out, + new_axis, &axis_size); + + if ( ret == FALSE || axis_size > 2) + { + return NULL; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes[1], rank_out ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[1], (uint32_t*)shapes[1], rank_out ); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[1]->attr.size, + reshape_tensors[1]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1); + axis_first = new_axis[0]; + + status = _query_kernel( inputs, outputs, kernel, params, new_axis, axis_size, image_2d ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 3; + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( node_params, _MOMENTS_PARAM_NUM, + reshape_tensors, 1, &reshape_tensors[1], 2 ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_first ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_size ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _MOMENTS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + status = set_constant_border(node, inputs[0]->attr.dtype.zero_point); + CHECK_STATUS(status); + } + } + + for(i = 0; i < 3; i++) + { + if(reshape_tensors[i]) + { + vsi_nn_ReleaseTensor(&reshape_tensors[i]); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( moments, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c new file mode 100644 index 0000000..db0aea8 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c @@ -0,0 +1,658 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define _POOLWITHARGMAX_KERNEL_SOURCE(suffix) "poolwithargmax_"#suffix + +// Add kernel hashtable here +#define POOLWITHARGMAX_HASH_KEY( IN_DTYPE, OUT0_DTYPE, OUT1_DTYPE, same_type_flag, _image_2d ) \ + ((IN_DTYPE << 20) | (OUT0_DTYPE << 12) | (OUT1_DTYPE << 4) | (same_type_flag << 2) | (_image_2d)) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT0_DTYPE, OUT1_DTYPE ) \ + { POOLWITHARGMAX_HASH_KEY( IN_DTYPE, OUT0_DTYPE, OUT1_DTYPE, 0, 0 ), \ + CVIVANTE_NAMESPACE("evis.poolwithargmax_"#IN_DTYPE"to_"#OUT0_DTYPE"_"#OUT1_DTYPE), \ + _POOLWITHARGMAX_KERNEL_SOURCE(IN_DTYPE) } + +#define PACK_KERNEL_MAP_SAME_TYPE( IN_DTYPE, OUT0_DTYPE, OUT1_DTYPE ) \ + { POOLWITHARGMAX_HASH_KEY( IN_DTYPE, OUT0_DTYPE, OUT1_DTYPE, 1, 0 ), \ + CVIVANTE_NAMESPACE("evis.poolwithargmax_"#IN_DTYPE"to_"#OUT0_DTYPE"_"#OUT1_DTYPE"_SAME"), \ + _POOLWITHARGMAX_KERNEL_SOURCE(IN_DTYPE) } + +#define PACK_KERNEL_MAP_2D( IN_DTYPE, OUT0_DTYPE, OUT1_DTYPE ) \ + { POOLWITHARGMAX_HASH_KEY( IN_DTYPE, OUT0_DTYPE, OUT1_DTYPE, 0, 1 ), \ + CVIVANTE_NAMESPACE("evis.poolwithargmax_"#IN_DTYPE"to_"#OUT0_DTYPE"_"#OUT1_DTYPE"_2D"), \ + _POOLWITHARGMAX_KERNEL_SOURCE(IN_DTYPE) } + +#define PACK_KERNEL_MAP_SAME_TYPE_2D( IN_DTYPE, OUT0_DTYPE, OUT1_DTYPE ) \ + { POOLWITHARGMAX_HASH_KEY( IN_DTYPE, OUT0_DTYPE, OUT1_DTYPE, 1, 1 ), \ + CVIVANTE_NAMESPACE("evis.poolwithargmax_"#IN_DTYPE"to_"#OUT0_DTYPE"_"#OUT1_DTYPE"_SAME_2D"), \ + _POOLWITHARGMAX_KERNEL_SOURCE(IN_DTYPE) } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _poolwithargmax_kernel_map[] = +{ + PACK_KERNEL_MAP( F16, F16, U8 ), + PACK_KERNEL_MAP( F16, I16, U8 ), + PACK_KERNEL_MAP( U8, U8, U8 ), + PACK_KERNEL_MAP( U8, F16, U8 ), + PACK_KERNEL_MAP( U8, F16, I16 ), + PACK_KERNEL_MAP( I8, I8, U8 ), + PACK_KERNEL_MAP( I8, F16, U8 ), + PACK_KERNEL_MAP( I16, I16, U8 ), + PACK_KERNEL_MAP( I16, I16, I16 ), + PACK_KERNEL_MAP( I16, F16, U8 ), + PACK_KERNEL_MAP_SAME_TYPE( I8, I8, U8 ), + PACK_KERNEL_MAP_SAME_TYPE( I16, I16, U8 ), + PACK_KERNEL_MAP_2D( F16, F16, U8 ), + PACK_KERNEL_MAP_2D( F16, I16, U8 ), + PACK_KERNEL_MAP_2D( U8, U8, U8 ), + PACK_KERNEL_MAP_2D( U8, F16, U8 ), + PACK_KERNEL_MAP_2D( U8, F16, I16 ), + PACK_KERNEL_MAP_2D( I8, I8, U8 ), + PACK_KERNEL_MAP_2D( I8, F16, U8 ), + PACK_KERNEL_MAP_2D( I16, I16, U8 ), + PACK_KERNEL_MAP_2D( I16, I16, I16 ), + PACK_KERNEL_MAP_2D( I16, F16, U8 ), + PACK_KERNEL_MAP_SAME_TYPE_2D( I8, I8, U8 ), + PACK_KERNEL_MAP_SAME_TYPE_2D( I16, I16, U8 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _poolwithargmax_kernel_param_def[] = +{ + {VX_INPUT, 
VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _POOLWITHARGMAX_PARAM_NUM _cnt_of_array( _poolwithargmax_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t *input_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t * input_shape = NULL; + vsi_nn_kernel_dtype_e src_dtype = F16; + vsi_nn_kernel_dtype_e dst_dtype = F16; + int32_t input_fl = 0; + int32_t output_fl = 0; + uint16_t M0 = 0; + int8_t postShift = 0; + float inputScale = 1.0f; + int32_t input_ZP = 0; + float outputScale = 1.0f; + int32_t output_ZP = 0; + vsi_bool image_2d = FALSE; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + input_shape = input_attr->shape; + src_dtype = input_attr->dtype; + dst_dtype = output_attr->dtype; + + if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + input_fl = input_attr->dfp.fl; + if (input_fl > 0) + { + inputScale = 1.0f / (float) ((int64_t)1 << input_fl); + } + else + { + inputScale = (float)((int64_t)1 << -input_fl); + } + } + else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + inputScale = input_attr->asymm.scale; + input_ZP = input_attr->asymm.zero_point; + } + + if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + output_fl = output_attr->dfp.fl; + if (output_fl > 0) + { + outputScale = 1.0f / (float) ((int64_t)1 << output_fl); + } + else + { + outputScale = (float)((int64_t)1 << -output_fl); + } + } + else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + outputScale = output_attr->asymm.scale; + output_ZP = output_attr->asymm.zero_point; + } + + if ( ( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + && ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) ) + { + vsi_nn_GetFP32MultiAndPostShift(inputScale / outputScale, &M0, &postShift); + } + + image_2d = (vsi_bool)(input_shape->size < 3 || 1 == input_shape->data[2]); + + + if (BF16 == src_dtype && BF16 == dst_dtype) + { + src_dtype = F16; + dst_dtype = F16; + } + + + if (I8 == src_dtype || U8 == src_dtype) + { + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 2; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 2; + gpu_param.global_scale[2] = 1; + } + + gpu_param.dim = image_2d ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (input_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (input_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = image_2d ? 
1 : ( + (input_shape->data[2] + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2]); + + + if(I8 == src_dtype || U8 == src_dtype) + { + gpu_dp_inst_t poolingEncodeInt8_0 = {{ + 0x55555555, // TCfg + 0x50505050, // ASelt + 0x32321010, 0x76765454, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000700, // AccumType, ConstantType, and PostShift + 0x00400080, 0x00100020, 0x00400080, 0x00100020, + 0x00400080, 0x00100020, 0x00400080, 0x00100020 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t poolingEncodeInt8_1 = {{ + 0x55555555, // TCfg + 0x50505050, // ASelt + 0xbaba9898, 0xfefedcdc, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000700, // AccumType, ConstantType, and PostShift + 0x00400080, 0x00100020, 0x00400080, 0x00100020, + 0x00400080, 0x00100020, 0x00400080, 0x00100020 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8EvenBinSubZP_MulM_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x06040200, 0x0e0c0a08, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00020001, 0x00020001, 0x00020001, 0x00020001, + 0x00020001, 0x00020001, 0x00020001, 0x00020001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniEncodeUint8_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x8628c020, 0x6ad0a49c, 0xe128bd8e, 0xacde96ac, 0xff9eeef1, // BinSelect + 0x00000700, // AccumType, ConstantType, and PostShift + 0x10204080, 0x10204080, 0x10204080, 0x10204080, + 0x10204080, 0x10204080, 0x10204080, 0x10204080 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniS16AddOutZP_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + + if ((U8 == src_dtype && U8 == dst_dtype) + || (I8 == src_dtype && I8 == dst_dtype)) + { + vx_uint32 idx = 0; + vx_uint32 packed_outputZP[4] = {0}; + + for (idx = 0; idx < 4; idx ++) + { + vx_uint8 zp = (vx_uint8)(output_ZP & 0xFF); + packed_outputZP[idx] = (zp << 24) | (zp << 16) | (zp << 8) | zp; + } + + uniU8EvenBinSubZP_MulM_2x8.data[7] |= postShift; + + for (idx = 8; idx < 16; idx ++) + { + uniU8EvenBinSubZP_MulM_2x8.data[idx] = (vx_uint32)((M0 << 16) | M0); + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniU8EvenBinSubZP_MulM_2x8", + &uniU8EvenBinSubZP_MulM_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniS16AddOutZP_2x8", + &uniS16AddOutZP_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "packed_outputZP", packed_outputZP); + status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP", &input_ZP); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if(U8 == src_dtype) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniEncodeUint8_4x8", + &uniEncodeUint8_4x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "poolingEncodeInt8_0", + &poolingEncodeInt8_0); + status |= vsi_nn_kernel_gpu_add_param(node, "poolingEncodeInt8_1", + &poolingEncodeInt8_1); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if(F16 == dst_dtype) + { + gpu_dp_inst_t uniConvertUint8ToFp32_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, 
+ 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertSubZpUint8Fp32_4x4 = {{ + 0x09090905, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0xbc003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniPackHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertEvenU8ToFp32_4x4 = {{ + 0x09090905, // TCfg + 0x04040404, // ASelt + 0x00020000, 0x00060004, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertEvenU8SubZpToFp32_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x000a0008, 0x000e000c, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniPackHalf8_2x8", + &uniPackHalf8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertEvenU8ToFp32_4x4", + &uniConvertEvenU8ToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertEvenU8SubZpToFp32_4x4", + &uniConvertEvenU8SubZpToFp32_4x4); + if(U8 == src_dtype) + { + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8ToFp32_4x4", + &uniConvertUint8ToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSubZpUint8Fp32_4x4", + &uniConvertSubZpUint8Fp32_4x4); + } + status |= vsi_nn_kernel_gpu_add_param(node, "inputScale", &inputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP", &input_ZP); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + else + { + gpu_dp_inst_t poolingEncode = {{ + 0x55555555, // TCfg + 0x50505050, // ASelt + 0x32321010, 0x76765454, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000700, // AccumType, ConstantType, and PostShift + 0x00400080, 0x00100020, 0x00400080, 0x00100020, + 0x00400080, 0x00100020, 0x00400080, 0x00100020 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniConvertDirInt16Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertEndInt16Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniPackHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 
0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniQuantInOutInt16Even_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + if(F16 == src_dtype) + { + status = vsi_nn_kernel_gpu_add_param(node, "poolingEncode", &poolingEncode); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if(I16 == src_dtype) + { + if(F16 == dst_dtype) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniPackHalf8_2x8_2", + &uniPackHalf8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale_i16", &inputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertDirInt16Fp32_4x4", + &uniConvertDirInt16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt16Fp32_4x4", + &uniConvertEndInt16Fp32_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + status = vsi_nn_kernel_gpu_add_param(node, "poolingEncode2", &poolingEncode); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if (I16 == dst_dtype) + { + if(input_fl > output_fl) + { + uniQuantInOutInt16Even_4x4.data[7] = uniQuantInOutInt16Even_4x4.data[7] | (input_fl - output_fl); + } + else + { + vx_uint32 multiply = ((int64_t)1 << (output_fl - input_fl)); + vx_uint32 i = 0; + + for (i = 8; i < 16; i+=2) + { + uniQuantInOutInt16Even_4x4.data[i] = multiply; + } + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniQuantInOutInt16Even_4x4", &uniQuantInOutInt16Even_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + + return status; + +} /* _poolwithargmax_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out0_dtype; + vsi_nn_kernel_dtype_e out1_dtype; + const _kernel_map_type * kernel_map = _poolwithargmax_kernel_map; + size_t kernel_map_size = _cnt_of_array( _poolwithargmax_kernel_map ); + vx_param_description_t * param_def = _poolwithargmax_kernel_param_def; + size_t param_def_size = _cnt_of_array( _poolwithargmax_kernel_param_def ); + vx_kernel_initialize_f initializer = _poolwithargmax_initializer; + uint32_t key; + uint32_t i; + vsi_bool is_same_type = FALSE; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out0_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + out1_dtype = vsi_nn_kernel_map_dtype( outputs[1]->attr.dtype.vx_type ); + + if ((BF16 == in_dtype) && (BF16 == out0_dtype)) + { + in_dtype = F16; + out0_dtype = F16; + } + + if (I8 == out1_dtype) + { + out1_dtype = U8; + } + + if (((I8 == in_dtype) && (I8 == out0_dtype)) || ((I16 == in_dtype) && (I16 == out0_dtype))) + { + if ((inputs[0]->attr.dtype.fl == outputs[0]->attr.dtype.fl + && inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP + && 
outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + || ((inputs[0]->attr.dtype.zero_point == outputs[0]->attr.dtype.zero_point) + && (inputs[0]->attr.dtype.scale == outputs[0]->attr.dtype.scale) + && inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC + && outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)) + { + is_same_type = TRUE; + } + } + + key = POOLWITHARGMAX_HASH_KEY( in_dtype, out0_dtype, out1_dtype, is_same_type, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; + +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_POOLWITHARGMAX_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t ksize_x = 0; + int32_t ksize_y = 0; + int32_t stride_x = 0; + int32_t stride_y = 0; + int32_t pad_x = 0; + int32_t pad_y = 0; + vsi_bool image_2d = FALSE; + + ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x"); + ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y"); + stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x"); + stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y"); + pad_x = vsi_nn_kernel_param_get_int32(params, "pad_x"); + pad_y = vsi_nn_kernel_param_get_int32(params, "pad_y"); + + if ((2 != ksize_x) || (2 != ksize_y) || (2 != stride_x) || (2 != stride_y) || (0 != pad_x) || (0 != pad_y)) + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[1]->attr.size, + outputs[1]->attr.dim_num )) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, image_2d); + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _POOLWITHARGMAX_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _POOLWITHARGMAX_PARAM_NUM ); + } + } + + return node; + +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( poolwithargmax, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pow_evis.c b/src/tim/vx/internal/src/kernel/evis/pow_evis.c new file mode 100644 index 0000000..701d23e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/pow_evis.c @@ -0,0 +1,658 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +#define VX_KERNEL_NAME_POW_F16F16TOF16 CVIVANTE_NAMESPACE("evis.pow_F16F16toF16") +#define VX_KERNEL_NAME_POW_F16F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toF16_2D") +#define VX_KERNEL_NAME_POW_F16F16TOU8 CVIVANTE_NAMESPACE("evis.pow_F16F16toU8") +#define VX_KERNEL_NAME_POW_F16F16TOU8_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toU8_2D") +#define VX_KERNEL_NAME_POW_F16F16TOI8 CVIVANTE_NAMESPACE("evis.pow_F16F16toI8") +#define VX_KERNEL_NAME_POW_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toI8_2D") +#define VX_KERNEL_NAME_POW_F16F16TOI16 CVIVANTE_NAMESPACE("evis.pow_F16F16toI16") +#define VX_KERNEL_NAME_POW_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toI16_2D") +#define VX_KERNEL_NAME_POW_F16U8TOF16 CVIVANTE_NAMESPACE("evis.pow_F16U8toF16") +#define VX_KERNEL_NAME_POW_F16U8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16U8toF16_2D") +#define VX_KERNEL_NAME_POW_F16I8TOF16 CVIVANTE_NAMESPACE("evis.pow_F16I8toF16") +#define VX_KERNEL_NAME_POW_F16I8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16I8toF16_2D") +#define VX_KERNEL_NAME_POW_F16I16TOF16 CVIVANTE_NAMESPACE("evis.pow_F16I16toF16") +#define VX_KERNEL_NAME_POW_F16I16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16I16toF16_2D") +#define VX_KERNEL_NAME_POW_F16U8TOU8 CVIVANTE_NAMESPACE("evis.pow_F16U8toU8") +#define VX_KERNEL_NAME_POW_F16U8TOU8_2D CVIVANTE_NAMESPACE("evis.pow_F16U8toU8_2D") +#define VX_KERNEL_NAME_POW_F16I8TOI8 CVIVANTE_NAMESPACE("evis.pow_F16I8toI8") +#define VX_KERNEL_NAME_POW_F16I8TOI8_2D 
CVIVANTE_NAMESPACE("evis.pow_F16I8toI8_2D") +#define VX_KERNEL_NAME_POW_F16I16TOI16 CVIVANTE_NAMESPACE("evis.pow_F16I16toI16") +#define VX_KERNEL_NAME_POW_F16I16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_F16I16toI16_2D") +#define VX_KERNEL_NAME_POW_U8F16TOF16 CVIVANTE_NAMESPACE("evis.pow_U8F16toF16") +#define VX_KERNEL_NAME_POW_U8F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_U8F16toF16_2D") +#define VX_KERNEL_NAME_POW_I8F16TOF16 CVIVANTE_NAMESPACE("evis.pow_I8F16toF16") +#define VX_KERNEL_NAME_POW_I8F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_I8F16toF16_2D") +#define VX_KERNEL_NAME_POW_I16F16TOF16 CVIVANTE_NAMESPACE("evis.pow_I16F16toF16") +#define VX_KERNEL_NAME_POW_I16F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_I16F16toF16_2D") +#define VX_KERNEL_NAME_POW_U8F16TOU8 CVIVANTE_NAMESPACE("evis.pow_U8F16toU8") +#define VX_KERNEL_NAME_POW_U8F16TOU8_2D CVIVANTE_NAMESPACE("evis.pow_U8F16toU8_2D") +#define VX_KERNEL_NAME_POW_I8F16TOI8 CVIVANTE_NAMESPACE("evis.pow_I8F16toI8") +#define VX_KERNEL_NAME_POW_I8F16TOI8_2D CVIVANTE_NAMESPACE("evis.pow_I8F16toI8_2D") +#define VX_KERNEL_NAME_POW_I16F16TOI16 CVIVANTE_NAMESPACE("evis.pow_I16F16toI16") +#define VX_KERNEL_NAME_POW_I16F16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_I16F16toI16_2D") +#define VX_KERNEL_NAME_POW_U8U8TOU8 CVIVANTE_NAMESPACE("evis.pow_U8U8toU8") +#define VX_KERNEL_NAME_POW_U8U8TOU8_2D CVIVANTE_NAMESPACE("evis.pow_U8U8toU8_2D") +#define VX_KERNEL_NAME_POW_I8I8TOI8 CVIVANTE_NAMESPACE("evis.pow_I8I8toI8") +#define VX_KERNEL_NAME_POW_I8I8TOI8_2D CVIVANTE_NAMESPACE("evis.pow_I8I8toI8_2D") +#define VX_KERNEL_NAME_POW_I16I16TOI16 CVIVANTE_NAMESPACE("evis.pow_I16I16toI16") +#define VX_KERNEL_NAME_POW_I16I16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_I16I16toI16_2D") +#define VX_KERNEL_NAME_POW_BF16BF16TOBF16 CVIVANTE_NAMESPACE("evis.pow_BF16BF16toBF16") +#define VX_KERNEL_NAME_POW_BF16BF16TOBF16_2D CVIVANTE_NAMESPACE("evis.pow_BF16BF16toBF16_2D") +#define VX_KERNEL_NAME_POW_U8U8TOF16 CVIVANTE_NAMESPACE("evis.pow_U8U8toF16") +#define VX_KERNEL_NAME_POW_U8U8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_U8U8toF16_2D") + +#define KERNEL_SOURCE_1 "pow_fp16", +#define KERNEL_SOURCE_2 "pow_fp16_i8", +#define KERNEL_SOURCE_3 "pow_fp16_i16", +#define KERNEL_SOURCE_4 "pow_u8", +#define KERNEL_SOURCE_5 "pow_i8", +#define KERNEL_SOURCE_6 "pow_i16" + + +#define HASH_POW_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) + +#define TENSOR_POW_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + VX_KERNEL_NAME_POW_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +#define TENSOR_POW_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + VX_KERNEL_NAME_POW_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE##_2D, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } pow_map[] = +{ + TENSOR_POW_KERNELS(F16, F16, F16, KERNEL_SOURCE_1) + TENSOR_POW_KERNELS(F16, F16, U8, KERNEL_SOURCE_1) + TENSOR_POW_KERNELS(F16, U8, F16, KERNEL_SOURCE_1) + TENSOR_POW_KERNELS(F16, U8, U8, KERNEL_SOURCE_1) + + TENSOR_POW_KERNELS(F16, F16, I8, KERNEL_SOURCE_2) + TENSOR_POW_KERNELS(F16, I8, F16, KERNEL_SOURCE_2) + TENSOR_POW_KERNELS(F16, I8, I8, KERNEL_SOURCE_2) + + TENSOR_POW_KERNELS(F16, F16, I16, KERNEL_SOURCE_3) + TENSOR_POW_KERNELS(F16, I16, F16, KERNEL_SOURCE_3) + TENSOR_POW_KERNELS(F16, I16, I16, KERNEL_SOURCE_3) + + TENSOR_POW_KERNELS(U8, F16, F16, KERNEL_SOURCE_4) + 
TENSOR_POW_KERNELS(U8, F16, U8, KERNEL_SOURCE_4) + TENSOR_POW_KERNELS(U8, U8, U8, KERNEL_SOURCE_4) + TENSOR_POW_KERNELS(U8, U8, F16, KERNEL_SOURCE_4) + + TENSOR_POW_KERNELS(I8, F16, F16, KERNEL_SOURCE_5) + TENSOR_POW_KERNELS(I8, F16, I8, KERNEL_SOURCE_5) + TENSOR_POW_KERNELS(I8, I8, I8, KERNEL_SOURCE_5) + + TENSOR_POW_KERNELS(I16, F16, F16, KERNEL_SOURCE_6) + TENSOR_POW_KERNELS(I16, F16, I16, KERNEL_SOURCE_6) + TENSOR_POW_KERNELS(I16, I16, I16, KERNEL_SOURCE_6) + TENSOR_POW_KERNELS(BF16, BF16, BF16, KERNEL_SOURCE_3) + + TENSOR_POW_KERNELS_2D(F16, F16, F16, KERNEL_SOURCE_1) + TENSOR_POW_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_1) + TENSOR_POW_KERNELS_2D(F16, U8, F16, KERNEL_SOURCE_1) + TENSOR_POW_KERNELS_2D(F16, U8, U8, KERNEL_SOURCE_1) + + TENSOR_POW_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_2) + TENSOR_POW_KERNELS_2D(F16, I8, F16, KERNEL_SOURCE_2) + TENSOR_POW_KERNELS_2D(F16, I8, I8, KERNEL_SOURCE_2) + + TENSOR_POW_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3) + TENSOR_POW_KERNELS_2D(F16, I16, F16, KERNEL_SOURCE_3) + TENSOR_POW_KERNELS_2D(F16, I16, I16, KERNEL_SOURCE_3) + + TENSOR_POW_KERNELS_2D(U8, F16, F16, KERNEL_SOURCE_4) + TENSOR_POW_KERNELS_2D(U8, F16, U8, KERNEL_SOURCE_4) + TENSOR_POW_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_4) + TENSOR_POW_KERNELS_2D(U8, U8, F16, KERNEL_SOURCE_4) + + TENSOR_POW_KERNELS_2D(I8, F16, F16, KERNEL_SOURCE_5) + TENSOR_POW_KERNELS_2D(I8, F16, I8, KERNEL_SOURCE_5) + TENSOR_POW_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_5) + + TENSOR_POW_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_6) + TENSOR_POW_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_6) + TENSOR_POW_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_6) + TENSOR_POW_KERNELS_2D(BF16, BF16, BF16, KERNEL_SOURCE_3) +}; + +static vx_param_description_t vxPowKernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; +#define _EVIS_POW_PARAM_NUM _cnt_of_array(vxPowKernel_param_def) + +DEF_KERNEL_INITIALIZER(_pow_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + int8_t in0_fl = 0; + int32_t src0ZP = 0; + float src0Scale = 1.0f; + int8_t in1_fl = 0; + int32_t src1ZP = 0; + float src1Scale = 1.0f; + int8_t out_fl = 0; + float dstZP = 0; + float dstScale = 1.0f; + + int8_t postshift0 = 0; + int8_t postshift1 = 0; + float outScale_fl = 1; + + uint16_t M0 = 0; + uint16_t M1 = 0; + + uint32_t zAx = 1; + uint32_t pack_key = 0; + // dim number ??? 
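+    /* The attribute reads below drive three things: (1) per-tensor quantization
+     * parameters (a DFP fractional length becomes a post-shift; an asymmetric
+     * scale/zero-point pair is folded into a 16-bit multiplier M plus post-shift
+     * via vsi_nn_GetFP32MultiAndPostShift); (2) pack_key, which combines the two
+     * input dtypes and the output dtype through _PACK_SELECT_KEY; and (3) the GPU
+     * dispatch size, aligned from the output shape with gpu_align_p2. The switch
+     * on pack_key further down uploads only the uniform tables that the selected
+     * shader variant consumes. */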
+ vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + out_shape = attr[2]->shape; + + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + in0_fl = (int8_t)attr[0]->dfp.fl; + postshift0 = in0_fl - 0; + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM ) + { + src0ZP = attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + + vsi_nn_GetFP32MultiAndPostShift(src0Scale / 1.0f, &M0, &postshift0); + } + + if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + in1_fl = (int8_t)attr[1]->dfp.fl; + postshift1 = in1_fl - 0; + } + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM) + { + src1ZP = attr[1]->asymm.zero_point; + src1Scale = attr[1]->asymm.scale; + + vsi_nn_GetFP32MultiAndPostShift(src1Scale / 1.0f, &M1, &postshift1); + } + + if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + out_fl = (int8_t)attr[2]->dfp.fl; + if (out_fl > 0) + { + outScale_fl = (vx_float32)((int64_t)1 << out_fl); + } + else + { + outScale_fl = (1.0f / (vx_float32)((int64_t)1 << -out_fl)); + } + } + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM + || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM ) + { + dstZP = (float)attr[2]->asymm.zero_point; + dstScale = 1.0f / attr[2]->asymm.scale; + } + + if ( out_shape->size < 3 ) + { + zAx = 1; + } + else + { + zAx = out_shape->data[2]; + } + +#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, + attr[1]->dtype, attr[2]->dtype ); + + shaderParam.global_scale[0] = 8; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((out_shape->data[0] + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = gpu_align_p2((out_shape->data[1] + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 2); + shaderParam.global_size[2] = gpu_align_p2((zAx + shaderParam.global_scale[2] - 1) + / shaderParam.global_scale[2], 1); + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertFstDataToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertSecDataToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertFstDataToFp32_4x4_2 = {{ + 0x01010101, // TCfg + 
0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertSecDataToFp32_4x4_2 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertSecUint8SubZpToFp32_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4_2 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertSecUint8SubZpToFp32_4x4_2 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and 
PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + uint32_t multiplierA = (M0 << 16) | M0; + uint32_t multiplierB = (M1 << 16) | M1; + int32_t i = 8; + + uniConvertUint8SubZpToFp32_4x4.data[7] |= (postshift0 & 0x1F); + uniConvertSecUint8SubZpToFp32_4x4.data[7] |= (postshift0 & 0x1F); + uniConvertUint8SubZpToFp32_4x4_2.data[7] |= (postshift1 & 0x1F); + uniConvertSecUint8SubZpToFp32_4x4_2.data[7] |= (postshift1 & 0x1F); + for ( i = 8; i < 16; i += 2 ) + { + uniConvertUint8SubZpToFp32_4x4.data[i] = multiplierA; + uniConvertSecUint8SubZpToFp32_4x4.data[i] = multiplierA; + uniConvertUint8SubZpToFp32_4x4_2.data[i] = multiplierB; + uniConvertSecUint8SubZpToFp32_4x4_2.data[i] = multiplierB; + } + + if ( attr[0]->dtype == I8 || attr[0]->dtype == I16 ) + { + gpu_dp_inst_update_postshfit( &uniConvertFstDataToFp32_4x4, postshift0 ); + gpu_dp_inst_update_postshfit( &uniConvertSecDataToFp32_4x4, postshift0 ); + } + + if ( attr[1]->dtype == I8 || attr[1]->dtype == I16 ) + { + gpu_dp_inst_update_postshfit( &uniConvertFstDataToFp32_4x4_2, postshift1 ); + gpu_dp_inst_update_postshfit( &uniConvertSecDataToFp32_4x4_2, postshift1 ); + } + + switch( pack_key ) + { + case _PACK_SELECT_KEY( F16, F16, I8 ): + case _PACK_SELECT_KEY( F16, I8, F16 ): + case _PACK_SELECT_KEY( F16, I8, I8 ): + case _PACK_SELECT_KEY( F16, F16, I16 ): + case _PACK_SELECT_KEY( F16, I16, F16 ): + case _PACK_SELECT_KEY( F16, I16, I16 ): + case _PACK_SELECT_KEY( I8, F16, F16 ): + case _PACK_SELECT_KEY( I8, F16, I8 ): + case _PACK_SELECT_KEY( I8, I8, I8 ): + case _PACK_SELECT_KEY( I16, F16, F16 ): + case _PACK_SELECT_KEY( I16, F16, I16 ): + case _PACK_SELECT_KEY( I16, I16, I16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4", + &uniConvertFstDataToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4", + &uniConvertSecDataToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4_2", + &uniConvertFstDataToFp32_4x4_2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4_2", + &uniConvertSecDataToFp32_4x4_2); + status |= vsi_nn_kernel_gpu_add_param(node, "outScale_fl", &outScale_fl); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, F16, F16 ): + case _PACK_SELECT_KEY( U8, F16, U8 ): + case _PACK_SELECT_KEY( U8, U8, U8 ): + case _PACK_SELECT_KEY( U8, U8, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4", + &uniConvertUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4", + &uniConvertSecUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4_2", + 
&uniConvertFstDataToFp32_4x4_2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4_2", + &uniConvertSecDataToFp32_4x4_2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4_2", + &uniConvertUint8SubZpToFp32_4x4_2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4_2", + &uniConvertSecUint8SubZpToFp32_4x4_2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", + &uniConvertHalftoFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP0", &src0ZP); + status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP1", &src1ZP); + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &dstZP); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, F16, F16 ): + case _PACK_SELECT_KEY( F16, F16, U8 ): + case _PACK_SELECT_KEY( F16, U8, F16 ): + case _PACK_SELECT_KEY( F16, U8, U8 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4", + &uniConvertFstDataToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4", + &uniConvertSecDataToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4_2", + &uniConvertUint8SubZpToFp32_4x4_2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4_2", + &uniConvertSecUint8SubZpToFp32_4x4_2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP1", &src1ZP); + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &dstZP); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( BF16, BF16, BF16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", + &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", + &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", + &uniExtractOddData_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + default: + break; + } +#undef _PACK_SELECT_KEY + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if ( attr[0] ) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if ( attr[1] ) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if ( attr[2] ) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + return status; +} /* _pow_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input0_dtype; + vsi_nn_kernel_dtype_e input1_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_POW_KEY( input0_dtype, input1_dtype, output_dtype, image_2d ); + + for ( i = 0; i < _cnt_of_array(pow_map); i ++ ) + { + if ( pow_map[i].key == key ) + { + break; + } + } + if 
( i < _cnt_of_array(pow_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pow_map[i].function_name ); + kernel->info.parameters = vxPowKernel_param_def; + kernel->info.numParams = _cnt_of_array( vxPowKernel_param_def ); + kernel->info.initialize = _pow_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + pow_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + pow_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_POW_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2); + status = _query_kernel( inputs, outputs, image_2d, kernel ); + if ( VSI_SUCCESS == status ) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_POW_PARAM_NUM, + inputs, 2, outputs, 1 ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_POW_PARAM_NUM ); + + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( pow, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c new file mode 100644 index 0000000..e46ea14 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c @@ -0,0 +1,568 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +#define VX_KERNEL_NAME_PRE_PROCESS_BGRA_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_bgra_scale_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_BGRA_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_bgra_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_BGRA_SCALE_NHWC_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_bgra_scale_nhwc_U8toU8") + +#define KERNEL_SOURCE_1 "pre_process_bgra", +#define KERNEL_SOURCE_2 "pre_process_bgra_trans", + +typedef enum +{ + COPY = 0, + SCALE, + SCALE_NHWC +} vsi_nn_kernel_convert_type_e; + +#define HASH_PRE_PROCESS_BGRA_KEY(_input0_type, _output_type, _convert_type, _image_2d) \ + ((_input0_type << 24) | (_output_type << 16) | (_convert_type << 8) | (_image_2d)) + +#define TENSOR_PRE_PROCESS_BGRA_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \ + { HASH_PRE_PROCESS_BGRA_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 0), \ + VX_KERNEL_NAME_PRE_PROCESS_BGRA_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } pre_process_bgra_map[] = +{ + TENSOR_PRE_PROCESS_BGRA_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_BGRA_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_BGRA_KERNELS(U8, U8, SCALE_NHWC, KERNEL_SOURCE_2) +}; + +static vx_param_description_t vxPreProcessBgraKernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _EVIS_PRE_PROCESS_BGRA_PARAM_NUM _cnt_of_array(vxPreProcessBgraKernel_param_def) + +DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + int32_t dstZP = 0; + float outputScale = 1; + int32_t reorder = 0; + int32_t trans = 0; + int32_t xRatio = 0; + int32_t yRatio = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; + int32_t enable_copy= 0; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( 
attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &xRatio); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &yRatio); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &trans); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + dstZP = attr[0]->asymm.zero_point; + outputScale = attr[0]->asymm.scale; + width = out_shape->data[0]; + height = out_shape->data[1]; + + if(trans) + { + width = width / 3; + } + + if(reorder != 0) + { + reorder = 2; + order1 = 0; + } + enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15)); + + if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + outputScale = (float)((int64_t)1 << attr[0]->dfp.fl); + } + else + { + outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); + } + dstZP = 0; + } + else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + outputScale = 1.0f/outputScale; + } + else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + outputScale = 1; + dstZP = 0; + } + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = height; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + // trans + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniBilinearTmp1BgraShort_4x4 = {{ + 0x19191919, // TCfg + 0x00000000, // ASelt + 0x01150004, 0x03370226, // ABin + 0x25252525, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinearTmp2BgraShort_4x4 = {{ + 0x19191919, // TCfg + 0x00000000, // ASelt + 0x099d088c, 0x0bbf0aae, // ABin + 0x25252525, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinearTmp3BgraShort_4x4 = {{ + 0x19191919, // TCfg + 0x00000000, // ASelt + 0x01150004, 0x03370226, // ABin + 0x25252525, // BSelt + 0x00110011, 0x00110011, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinearTmp4BgraShort_4x4 = {{ + 0x19191919, // TCfg + 0x00000000, // ASelt + 0x099d088c, 0x0bbf0aae, // ABin + 0x25252525, // BSelt + 0x00110011, 0x00110011, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000400, 
0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinearTmp5BgraShort_4x4 = {{ + 0x19191919, // TCfg + 0x00000000, // ASelt + 0x01150004, 0x03370226, // ABin + 0x25252525, // BSelt + 0x00220022, 0x00220022, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinearTmp6BgraShort_4x4 = {{ + 0x19191919, // TCfg + 0x00000000, // ASelt + 0x099d088c, 0x0bbf0aae, // ABin + 0x25252525, // BSelt + 0x00220022, 0x00220022, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinearTmp7BgraShort_4x4 = {{ + 0x19191919, // TCfg + 0x00000000, // ASelt + 0x01150004, 0x03370226, // ABin + 0x25252525, // BSelt + 0x00330033, 0x00330033, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinearTmp8BgraShort_4x4 = {{ + 0x19191919, // TCfg + 0x00000000, // ASelt + 0x099d088c, 0x0bbf0aae, // ABin + 0x25252525, // BSelt + 0x00330033, 0x00330033, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniDescaleU8_4x4 = {{ + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002614, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertIntergetoF32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniExtractInt32BgraToU8Bgr_2x8 = {{ + 0x00333333, // TCfg + 0x00111000, // ASelt + 0x00020100, 0x00000201, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + // copy + gpu_dp_inst_t uniExtractBfromBgra_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00040000, 0x000c0008, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractGfromBgra_4x4 = {{ + 0x01010401, // TCfg + 0x00000000, // ASelt + 0x00500001, 0x000d0009, // ABin + 0x02020802, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00010000, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractRfromBgra_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // 
ASelt + 0x00060002, 0x000e000a, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + // scale + gpu_dp_inst_t uniExtractInt32BgraToU8_2x8 = {{ + 0x33333333, // TCfg + 0x10101010, // ASelt + 0x01010000, 0x03030202, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniExchangeBgra_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x09080100, 0x0b0a0302, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExchangeBgra2_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x0d0c0504, 0x0f0e0706, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + if(trans) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtractInt32BgraToU8Bgr_2x8", + &uniExtractInt32BgraToU8Bgr_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp1BgraShort_4x4", &uniBilinearTmp1BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp2BgraShort_4x4", &uniBilinearTmp2BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp3BgraShort_4x4", &uniBilinearTmp3BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp4BgraShort_4x4", &uniBilinearTmp4BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp5BgraShort_4x4", &uniBilinearTmp5BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp6BgraShort_4x4", &uniBilinearTmp6BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp7BgraShort_4x4", &uniBilinearTmp7BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp8BgraShort_4x4", &uniBilinearTmp8BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDescaleU8_4x4", &uniDescaleU8_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + else if(enable_copy) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtractBfromBgra_4x4", &uniExtractBfromBgra_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGfromBgra_4x4", &uniExtractGfromBgra_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRfromBgra_4x4", &uniExtractRfromBgra_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &reorder); + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &order1); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp1BgraShort_4x4", &uniBilinearTmp1BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp2BgraShort_4x4", &uniBilinearTmp2BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp3BgraShort_4x4", &uniBilinearTmp3BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, 
"uniBilinearTmp4BgraShort_4x4", &uniBilinearTmp4BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp5BgraShort_4x4", &uniBilinearTmp5BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp6BgraShort_4x4", &uniBilinearTmp6BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp7BgraShort_4x4", &uniBilinearTmp7BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp8BgraShort_4x4", &uniBilinearTmp8BgraShort_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDescaleU8_4x4", &uniDescaleU8_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractInt32BgraToU8_2x8", &uniExtractInt32BgraToU8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExchangeBgra_2x8", &uniExchangeBgra_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExchangeBgra2_2x8", &uniExchangeBgra2_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &reorder); + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &order1); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "zp", &dstZP); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_bgra_copy_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_kernel_convert_type_e convert_type = SCALE; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if(enable_perm) + { + convert_type = SCALE_NHWC; + } + else if(enable_copy) + { + convert_type = COPY; + } + else + { + convert_type = SCALE; + } + + key = HASH_PRE_PROCESS_BGRA_KEY( input0_dtype, output_dtype, convert_type, 0 ); + + for( i = 0; i < _cnt_of_array(pre_process_bgra_map); i ++ ) + { + if( pre_process_bgra_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(pre_process_bgra_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_bgra_map[i].function_name ); + kernel->info.parameters = vxPreProcessBgraKernel_param_def; + kernel->info.numParams = _cnt_of_array( vxPreProcessBgraKernel_param_def ); + kernel->info.initialize = _pre_process_bgra_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + pre_process_bgra_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + pre_process_bgra_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const 
vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_BGRA_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; + int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 2; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float bgra_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + + /* Pass parameters to node. */ + if(trans) + { + shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1]; + shapes[1] = outputs[0]->attr.size[2]; + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num); + + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_BGRA_PARAM_NUM, + inputs, 1, &reshape_tensors[0], 1 ); + } + else + { + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_BGRA_PARAM_NUM, + inputs, 1, outputs, 1 ); + } + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &bgra_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_BGRA_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[2] ); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + vsi_nn_kernel_scalar_release( &tmp_params[5] ); + vsi_nn_kernel_scalar_release( &tmp_params[6] ); + vsi_nn_kernel_scalar_release( &tmp_params[7] ); + vsi_nn_kernel_scalar_release( &tmp_params[8] ); + vsi_nn_kernel_scalar_release( &tmp_params[9] ); + vsi_nn_kernel_scalar_release( &tmp_params[10] ); + vsi_nn_kernel_scalar_release( &tmp_params[11] ); + } + } + + if(reshape_tensors[0]) + { + vsi_nn_ReleaseTensor(&reshape_tensors[0]); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( pre_process_bgra, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c 
b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c new file mode 100644 index 0000000..f54396a --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c @@ -0,0 +1,484 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +#define VX_KERNEL_NAME_PRE_PROCESS_GRAY_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_gray_scale_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_GRAY_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_gray_scale_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_GRAY_SCALE_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_gray_scale_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_GRAY_SCALE_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_gray_scale_U8toF16") +#define VX_KERNEL_NAME_PRE_PROCESS_GRAY_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_gray_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_GRAY_COPY_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_gray_copy_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_GRAY_COPY_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_gray_copy_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_GRAY_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_gray_copy_U8toF16") + +#define KERNEL_SOURCE_1 "pre_process_gray", +#define KERNEL_SOURCE_2 "pre_process_gray_copy" + +typedef enum +{ + COPY = 0, + SCALE +} vsi_nn_gray_convert_type_e; + +#define HASH_PRE_PROCESS_GRAY_KEY(_input0_type, _output_type, _convert_type, _image_2d) \ + ((_input0_type << 24) | (_output_type << 16) | (_convert_type << 8) | (_image_2d)) + +#define TENSOR_PRE_PROCESS_GRAY_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \ + { HASH_PRE_PROCESS_GRAY_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 0), \ + VX_KERNEL_NAME_PRE_PROCESS_GRAY_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } pre_process_gray_map[] = +{ + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I8, 
SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I8, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I16, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, F16, COPY, KERNEL_SOURCE_2) +}; + +static vx_param_description_t vxPreProcessGrayKernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _EVIS_PRE_PROCESS_GRAY_PARAM_NUM _cnt_of_array(vxPreProcessGrayKernel_param_def) + +DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + float dstZP = 0; + float outputScale = 1; + uint32_t width = 0; + uint32_t height = 0; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + out_shape = attr[0]->shape; + dstZP = (float)attr[0]->asymm.zero_point; + outputScale = attr[0]->asymm.scale; + width = out_shape->data[0]; + height = out_shape->data[1]; + + if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + outputScale = (float)((int64_t)1 << attr[0]->dfp.fl); + } + else + { + outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); + } + dstZP = 0.0f; + } + else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + outputScale = 1.0f/outputScale; + } + else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + outputScale = 1; + dstZP = 0.0f; + } + + shaderParam.global_scale[0] = 16; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = height; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniDataMeanStddevLo_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDataMeanStddevHi_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, 
ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniDataMeanStddevLo_2x8", &uniDataMeanStddevLo_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataMeanStddevHi_2x8", &uniDataMeanStddevHi_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", &dstZP); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_gray_copy_initializer() */ + +DEF_KERNEL_INITIALIZER(_pre_process_gray_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + float dstZP = 0; + float outputScale = 1; + uint32_t width = 0; + uint32_t height = 0; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + out_shape = attr[0]->shape; + dstZP = (float)attr[0]->asymm.zero_point; + outputScale = attr[0]->asymm.scale; + width = out_shape->data[0]; + height = out_shape->data[1]; + + if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + outputScale = (float)((int64_t)1 << attr[0]->dfp.fl); + } + else + { + outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); + } + dstZP = 0.0f; + } + else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + outputScale = 1.0f/outputScale; + } + else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + outputScale = 1; + dstZP = 0.0f; + } + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = height; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniVecShift10 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAddRShift = {{ + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002405, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGetTempVal = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00230001, 0x00670045, // ABin + 0x05050505, // BSelt + 0x00110000, 0x00330022, // BBin + 0x00000400, // AccumType, ConstantType, 
and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractBytes = {{ + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002414, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDataMulAlpha_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x01010101, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDataSubMean_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00007100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertIntergetoF32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtactInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002300, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10); + status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift); + status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtactInteger_2x8", &uniExtactInteger_2x8); + + status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", &dstZP); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale); + + if(attr[0]->dtype == F16) + { + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataMulAlpha_4x4", &uniDataMulAlpha_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataSubMean_4x4", &uniDataSubMean_4x4); + } + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_gray_copy_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_gray_convert_type_e convert_type = SCALE; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); 
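+    /* Kernel selection: HASH_PRE_PROCESS_GRAY_KEY packs (input dtype, output dtype,
+     * convert type, 2D flag) into a single key, which is then looked up in
+     * pre_process_gray_map. For example, a U8 input copied to an F16 output resolves
+     * to evis.pre_process_gray_copy_U8toF16, built from the "pre_process_gray_copy"
+     * source, and uses _pre_process_gray_copy_initializer rather than the scaling
+     * initializer. */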
+ + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if(enable_copy) + { + convert_type = COPY; + } + else + { + convert_type = SCALE; + } + + key = HASH_PRE_PROCESS_GRAY_KEY( input0_dtype, output_dtype, convert_type, 0 ); + + for( i = 0; i < _cnt_of_array(pre_process_gray_map); i ++ ) + { + if( pre_process_gray_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(pre_process_gray_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_gray_map[i].function_name ); + kernel->info.parameters = vxPreProcessGrayKernel_param_def; + kernel->info.numParams = _cnt_of_array( vxPreProcessGrayKernel_param_def ); + + if(enable_copy) + { + kernel->info.initialize = _pre_process_gray_copy_initializer; + } + else + { + kernel->info.initialize = _pre_process_gray_initializer; + } + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + pre_process_gray_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + pre_process_gray_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_GRAY_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 2; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float mean = vsi_nn_kernel_param_get_float32( params, "mean" ); + float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); + + /* Pass parameters to node. 
*/ + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_GRAY_PARAM_NUM, + inputs, 1, outputs, 1 ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_GRAY_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[2] ); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + vsi_nn_kernel_scalar_release( &tmp_params[5] ); + vsi_nn_kernel_scalar_release( &tmp_params[6] ); + vsi_nn_kernel_scalar_release( &tmp_params[7] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( pre_process_gray, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c new file mode 100644 index 0000000..6976058 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c @@ -0,0 +1,727 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
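This kernel consumes two input tensors, a full-resolution Y plane and a half-resolution interleaved UV plane, which is the standard NV12 layout. As background for the addressing the shaders perform, here is a plain-C reminder of how one source pixel is fetched from NV12 data (general NV12 layout knowledge, not the kernel's exact EVIS access pattern):

```c
#include <stdint.h>

/* NV12: plane 0 holds width*height luma bytes; plane 1 holds height/2 rows of
 * interleaved U/V byte pairs, so one UV pair is shared by a 2x2 block of pixels. */
typedef struct {
    const uint8_t* y;    /* width * height bytes        */
    const uint8_t* uv;   /* width * (height / 2) bytes  */
    int            width;
} nv12_image;

static void nv12_sample(const nv12_image* img, int x, int y,
                        uint8_t* out_y, uint8_t* out_u, uint8_t* out_v)
{
    const uint8_t* uv_row;

    *out_y = img->y[y * img->width + x];
    uv_row = img->uv + (y / 2) * img->width;   /* each UV row covers two luma rows */
    *out_u = uv_row[(x / 2) * 2 + 0];
    *out_v = uv_row[(x / 2) * 2 + 1];
}
```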
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toF16") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_trans_U8toU8") + +// greater than a quarter +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8_gq") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOF16_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toF16_gq") + +#define KERNEL_SOURCE_1 "pre_process_nv12_scale_8bits", +#define KERNEL_SOURCE_2 "pre_process_nv12_scale", +#define KERNEL_SOURCE_3 "pre_process_nv12_trans_u8", +#define KERNEL_SOURCE_4 "pre_process_nv12_scale_mix" + +typedef enum +{ + COPY = 0, + SCALE, + TRANS +} vsi_nn_kernel_convert_type_e; + +#define HASH_PRE_PROCESS_NV12_KEY(_input0_type, _output_type, _convert_type, _greater_quarter) \ + ((_input0_type << 24) | (_output_type << 16) | (_convert_type << 8) | (_greater_quarter)) + +#define TENSOR_PRE_PROCESS_NV12_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \ + { HASH_PRE_PROCESS_NV12_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 0), \ + VX_KERNEL_NAME_PRE_PROCESS_NV12_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +#define TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \ + { HASH_PRE_PROCESS_NV12_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 1), \ + VX_KERNEL_NAME_PRE_PROCESS_NV12_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE##_GQ, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } pre_process_nv12_map[] = +{ + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, TRANS, KERNEL_SOURCE_3) + TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_4) + TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_4) +}; + +static vx_param_description_t vxPreProcessNv12Kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, 
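+    /* Scalar slots 3..12, in the order _setup packs them below: scale_x, scale_y,
+     * left, top, r_mean, g_mean, b_mean, rgb_scale, reverse, trans. */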
VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _EVIS_PRE_PROCESS_NV12_PARAM_NUM _cnt_of_array(vxPreProcessNv12Kernel_param_def) + +DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + int32_t dstZP = 0; + float dstScale = 1; + int32_t reorder = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f; + float outputScaleVar = 0.0f; + float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &gMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &var); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + dstZP = attr[0]->asymm.zero_point; + dstScale = attr[0]->asymm.scale; + width = out_shape->data[0]; + height = out_shape->data[1]; + + if(reorder != 0) + { + reorder = 2; + order1 = 0; + } + + if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + dstScale = 1.0f / dstScale; + } + else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + dstScale = (vx_float32)((int64_t)1 << attr[0]->dfp.fl); + } + else + { + dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[0]->dfp.fl)); + } + dstZP = 0; + } + else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + dstScale = 1; + dstZP = 0; + } + + outputScaleVar = dstScale * var; + bMeanScaleVarZp = (float)dstZP - bMean * outputScaleVar; + gMeanScaleVarZp = (float)dstZP - gMean * outputScaleVar; + rMeanScaleVarZp = (float)dstZP - rMean * outputScaleVar; + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 2); + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + 
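The constants computed above fold mean subtraction, the caller's `rgb_scale` (read here as `var`) and output quantization into one affine transform per channel, so each output value is `pixel * outputScaleVar + <channel>MeanScaleVarZp`, which is algebraically the same as `((pixel - mean) * var) * dstScale + dstZP`. The half-float constants in the `uniConvertNV12to{B,G,R}_4x4` tables defined just below look close to the full-range BT.601 YUV-to-RGB coefficients (about 1.772, 0.344/0.714 and 1.402). Under those assumptions, a plain-C sketch of what one output channel works out to:

```c
#include <math.h>
#include <stdint.h>

/* Assumed full-range BT.601 coefficients; the fp16 constants in the DP tables
 * below appear to approximate these values. */
static void yuv_to_rgb(float y, float u, float v, float* r, float* g, float* b)
{
    *r = y + 1.402f * (v - 128.0f);
    *g = y - 0.344f * (u - 128.0f) - 0.714f * (v - 128.0f);
    *b = y + 1.772f * (u - 128.0f);
}

/* Mirrors outputScaleVar = dstScale * var and
 * xMeanScaleVarZp = dstZP - mean * outputScaleVar from the initializer above.
 * The clamp/round here is only illustrative; in the real kernel the DP
 * instruction performs the final conversion and saturation. */
static uint8_t normalize_and_quantize(float channel, float mean, float var,
                                      float dstScale, int dstZP)
{
    float outputScaleVar = dstScale * var;
    float meanScaleVarZp = (float)dstZP - mean * outputScaleVar;
    float q = channel * outputScaleVar + meanScaleVarZp;

    if (q < 0.0f)   q = 0.0f;
    if (q > 255.0f) q = 255.0f;
    return (uint8_t)lroundf(q);
}
```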
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertNV12toB_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00210000, 0x00630042, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000, + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertNV12toG_4x4 = {{ + 0x29292929, // TCfg + 0x14141414, // ASelt + 0x03210100, 0x07630542, // ABin + 0x2a2a2a2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc, + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertNV12toR_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00310010, 0x00730052, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x01000100, 0x03020302, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); + status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_nv12_copy_initializer() */ + +DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image 
size in thread + + int32_t dstZP = 0; + float dstScale = 1; + int32_t reorder = 0; + int32_t trans = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; + uint32_t xrIntFloat_16 = 0; + uint32_t yrIntFloat_16 = 0; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f; + float outputScaleVar = 0.0f; + float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; + float resize = 0.0f; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &gMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &var); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &trans); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[1]->shape; + dstZP = attr[1]->asymm.zero_point; + dstScale = attr[1]->asymm.scale; + width = out_shape->data[0]; + height = out_shape->data[1]; + + if(reorder != 0) + { + reorder = 2; + order1 = 0; + } + if(trans) + { + width = width / 3; + } + resize = (float)width / attr[0]->shape->data[0]; + xrIntFloat_16 = (attr[0]->shape->data[0] << 16) / width + 1; + yrIntFloat_16 = (attr[0]->shape->data[1] << 16) / height + 1; + + if(attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + dstScale = 1.0f / dstScale; + } + else if(attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[1]->dfp.fl > 0) + { + dstScale = (vx_float32)((int64_t)1 << attr[1]->dfp.fl); + } + else + { + dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[1]->dfp.fl)); + } + dstZP = 0; + } + else if(attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + dstScale = 1; + dstZP = 0; + } + + outputScaleVar = dstScale * var; + bMeanScaleVarZp = (float)dstZP - bMean * outputScaleVar; + gMeanScaleVarZp = (float)dstZP - gMean * outputScaleVar; + rMeanScaleVarZp = (float)dstZP - rMean * outputScaleVar; + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 2); + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 
0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertNV12toB_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00210000, 0x00630042, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000, + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertNV12toG_4x4 = {{ + 0x29292929, // TCfg + 0x14141414, // ASelt + 0x03210100, 0x07630542, // ABin + 0x2a2a2a2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc, + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertNV12toR_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00310010, 0x00730052, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertUVtoCharSub128_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + + //trans + gpu_dp_inst_t uniTransPackBgr1st_2x8 = {{ + 0x11311311, // TCfg + 0x00100100, // ASelt + 0x01000400, 0x06020105, // ABin + 0x22022022, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000000, 0x00000001, + 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniTransPackBgr2nd_2x8 = {{ + 0x00003113, // TCfg + 0x00001001, // ASelt + 0x03070302, 0x00000000, // ABin + 0x00000220, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000001, 0x00000001, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateYShift_2x8 = {{ + 0x00009999, // TCfg + 0x00000000, // ASelt + 0x06040200, 0x00000000, // ABin + 0x00005555, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateUVShift_2x8 = {{ + 0x51515151, // TCfg + 0x40404040, // ASelt + 0x02020000, 0x06060404, // ABin + 0x91919191, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00010000, 0x00000000, 0x00010000, + 0x00000000, 0x00010000, 0x00000000, 0x00010000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, 
"uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUVtoCharSub128_2x8", &uniConvertUVtoCharSub128_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16); + status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); + status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); + if(resize >= 0.25 && (attr[1]->dtype == U8 || attr[1]->dtype == F16) && !trans) + { + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8); + } + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + if(trans && attr[1]->dtype == U8) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr1st_2x8", &uniTransPackBgr1st_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr2nd_2x8", &uniTransPackBgr2nd_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else + { + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); + } + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( attr[1]->dtype ) + { + case U8: + case I8: + case I16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case F16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", &uniConvertHalftoFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + return status; +} /* _pre_process_nv12_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_kernel_convert_type_e convert_type = SCALE; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + uint32_t srcWidth = inputs[0]->attr.size[0]; + uint32_t dstWidth = enable_perm ? 
outputs[0]->attr.size[1] : outputs[0]->attr.size[0]; + float scaleVal = (float)dstWidth / srcWidth; + uint32_t optFlg = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if(enable_perm) + { + convert_type = TRANS; + } + else if(enable_copy && output_dtype == U8) + { + convert_type = COPY; + } + else + { + convert_type = SCALE; + } + + if(scaleVal >= 0.25 && (output_dtype == U8 || output_dtype == F16) && convert_type == SCALE) + { + optFlg = 1; + } + + key = HASH_PRE_PROCESS_NV12_KEY( input0_dtype, output_dtype, convert_type, optFlg ); + + for( i = 0; i < _cnt_of_array(pre_process_nv12_map); i ++ ) + { + if( pre_process_nv12_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(pre_process_nv12_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_nv12_map[i].function_name ); + kernel->info.parameters = vxPreProcessNv12Kernel_param_def; + kernel->info.numParams = _cnt_of_array( vxPreProcessNv12Kernel_param_def ); + + if(convert_type == COPY) + { + kernel->info.initialize = _pre_process_nv12_copy_initializer; + } + else + { + kernel->info.initialize = _pre_process_nv12_initializer; + } + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + pre_process_nv12_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + pre_process_nv12_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_NV12_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; + int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 3; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + + /* Pass parameters to node. 
*/ + if(trans) + { + shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1]; + shapes[1] = outputs[0]->attr.size[2]; + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num); + + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM, + inputs, 2, &reshape_tensors[0], 1 ); + } + else + { + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM, + inputs, 2, outputs, 1 ); + } + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + vsi_nn_kernel_scalar_release( &tmp_params[5] ); + vsi_nn_kernel_scalar_release( &tmp_params[6] ); + vsi_nn_kernel_scalar_release( &tmp_params[7] ); + vsi_nn_kernel_scalar_release( &tmp_params[8] ); + vsi_nn_kernel_scalar_release( &tmp_params[9] ); + vsi_nn_kernel_scalar_release( &tmp_params[10] ); + vsi_nn_kernel_scalar_release( &tmp_params[11] ); + vsi_nn_kernel_scalar_release( &tmp_params[12] ); + } + } + if(reshape_tensors[0]) + { + vsi_nn_ReleaseTensor(&reshape_tensors[0]); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( pre_process_nv12, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c new file mode 100644 index 0000000..c5ea1c5 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c @@ -0,0 +1,780 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
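When `enable_perm` is set, the NV12 `_setup` above (and the RGB `_setup` later in this file) does not hand the kernel the raw permuted output tensor; it creates a 2-D reshaped view with `shapes[0] = size[0] * size[1]` and `shapes[1] = size[2]`, so the shader can write the interleaved channel data as one wide row per output line. A worked example, assuming the permuted output dimensions are ordered `{channels, width, height}` as the `dstWidth = outputs[0]->attr.size[1]` read in `_query_kernel` suggests:

```c
#include <stdio.h>

int main(void)
{
    /* Hypothetical 224x224 RGB output in the permuted (NHWC-style) layout. */
    unsigned size[3] = { 3, 224, 224 };   /* assumed order: {channels, width, height} */
    unsigned shapes[2];

    shapes[0] = size[0] * size[1];        /* 3 * 224 = 672 interleaved values per row */
    shapes[1] = size[2];                  /* 224 rows */

    printf("kernel sees a %u x %u image\n", shapes[0], shapes[1]);
    return 0;
}
```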
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_SCALE_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_rgb_scale_U8toF16") +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_SCALE_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_rgb_scale_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_rgb_scale_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_rgb_scale_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_rgb_copy_U8toF16") +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_COPY_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_rgb_copy_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_rgb_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_COPY_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_rgb_copy_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_SCALE_NHWC_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_rgb_scale_nhwc_U8toF16") +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_SCALE_NHWC_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_rgb_scale_nhwc_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_SCALE_NHWC_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_rgb_scale_nhwc_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_SCALE_NHWC_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_rgb_scale_nhwc_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_COPY_NHWC_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_rgb_copy_nhwc_U8toF16") +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_COPY_NHWC_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_rgb_copy_nhwc_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_COPY_NHWC_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_rgb_copy_nhwc_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_RGB_COPY_NHWC_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_rgb_copy_nhwc_U8toI8") + +#define KERNEL_SOURCE_1 "pre_process_rgb", +#define KERNEL_SOURCE_2 "pre_process_rgb_copy", +#define KERNEL_SOURCE_3 "pre_process_rgb_trans", +#define KERNEL_SOURCE_4 "pre_process_rgb_copy_trans", + +typedef enum +{ + COPY = 0, + SCALE, + COPY_NHWC, + SCALE_NHWC +} vsi_nn_kernel_convert_type_e; + +#define HASH_PRE_PROCESS_RGB_KEY(_input0_type, _output_type, _convert_type, _image_2d) \ + ((_input0_type << 24) | (_output_type << 16) | (_convert_type << 8) | (_image_2d)) + +#define TENSOR_PRE_PROCESS_RGB_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \ + { HASH_PRE_PROCESS_RGB_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 0), \ + VX_KERNEL_NAME_PRE_PROCESS_RGB_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } pre_process_rgb_map[] = +{ + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, F16, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, COPY, 
KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, F16, SCALE_NHWC, KERNEL_SOURCE_3) + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, SCALE_NHWC, KERNEL_SOURCE_3) + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, SCALE_NHWC, KERNEL_SOURCE_3) + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, SCALE_NHWC, KERNEL_SOURCE_3) + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, F16, COPY_NHWC, KERNEL_SOURCE_4) + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I16, COPY_NHWC, KERNEL_SOURCE_4) + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, U8, COPY_NHWC, KERNEL_SOURCE_4) + TENSOR_PRE_PROCESS_RGB_KERNELS(U8, I8, COPY_NHWC, KERNEL_SOURCE_4) +}; + +static vx_param_description_t vxPreProcessRgbKernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _EVIS_PRE_PROCESS_RGB_PARAM_NUM _cnt_of_array(vxPreProcessRgbKernel_param_def) + +DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + float outputZP = 0; + float outputScale = 1; + int32_t reorder = 0; + int32_t trans = 0; + int32_t xRatio = 0; + int32_t yRatio = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; + int32_t enable_copy= 0; + uint32_t pack_key = 0; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &xRatio); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &yRatio); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &trans); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + outputZP = (float)attr[0]->asymm.zero_point; + outputScale = attr[0]->asymm.scale; + width = out_shape->data[0]; + height = out_shape->data[1]; + + if(reorder != 0) + { + reorder = 2; + order1 = 0; + } + enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15)); + + if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + outputScale = (float)((int64_t)1 << attr[0]->dfp.fl); + } + else + { + 
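+                /* DFP output: a float result is mapped into the fixed-point domain by
+                 * multiplying with 2^fl.  The fl > 0 branch above shifts directly;
+                 * this branch handles fl <= 0 by taking the reciprocal of 2^-fl. */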
outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); + } + outputZP = 0; + } + else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + outputScale = 1.0f / outputScale; + } + else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + outputScale = 1; + outputZP = 0; + } + +#define _PACK_SELECT_KEY( COPY_FLAG, REVERSE_FLAG, TRANS_FLAG) \ + (COPY_FLAG | (REVERSE_FLAG << 24) | (TRANS_FLAG << 16) ) + + pack_key = _PACK_SELECT_KEY( enable_copy, reorder, trans); + { + // trans and copy + gpu_dp_inst_t uniNormilizationLo_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x45002142, 0x27480324, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniNormilizationHi_2x8 = {{ + 0x09999999, // TCfg + 0x04444444, // ASelt + 0x092a4b06, 0x000c2d4e, // ABin + 0x09999999, // BSelt + 0x06060606, 0x00060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniNormilizationLo_NHWC_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03422100, 0x27064524, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniNormilizationHi_NHWC_2x8 = {{ + 0x09999999, // TCfg + 0x04444444, // ASelt + 0x4b2a0948, 0x004e2d0c, // ABin + 0x09999999, // BSelt + 0x06060606, 0x00060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + // copy + gpu_dp_inst_t uniExtractRtoF32_part0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00030000, 0x00090006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractRtoF32_part1_4x4 = {{ + 0x01010101, // TCfg + 0x01010100, // ASelt + 0x0000000c, 0x00060003, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractGtoF32_part0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00040001, 0x000a0007, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractGtoF32_part1_4x4 = {{ + 0x01010101, // TCfg + 0x01010100, // ASelt + 0x0001000d, 0x00070004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractBtoF32_part0_4x4 = {{ + 
0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050002, 0x000b0008, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractBtoF32_part1_4x4 = {{ + 0x01010101, // TCfg + 0x01010100, // ASelt + 0x0002000e, 0x00080005, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + // scale and trans + gpu_dp_inst_t uniVecShift10 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000400, 0x00000000, 0x00000400, 0x00000000, + 0x00000400, 0x00000000, 0x00000400, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAddRShift = {{ + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002405, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGetTempVal = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00230001, 0x00670045, // ABin + 0x05050505, // BSelt + 0x00110000, 0x00330022, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractBytes = {{ + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002414, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniUnpackToR = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x09060300, 0x09060300, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00007400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniUnpackToG = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x0a070401, 0x0a070401, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00007400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 
0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniUnpackToB = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x0b080502, 0x0b080502, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00007400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertIntergetoF32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniRePackRGBLo_2x8 = {{ + 0x00111111, // TCfg + 0x00001001, // ASelt + 0x01000400, 0x00000105, // ABin + 0x00222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniRePackRGBHi_2x8 = {{ + 0x00111111, // TCfg + 0x00001001, // ASelt + 0x03020602, 0x00000307, // ABin + 0x00222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniRePackRGBLo_NHWC_2x8 = {{ + 0x00111111, // TCfg + 0x00100100, // ASelt + 0x01000400, 0x00000105, // ABin + 0x00222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniRePackRGBHi_NHWC_2x8 = {{ + 0x00111111, // TCfg + 0x00100100, // ASelt + 0x03020602, 0x00000307, // ABin + 0x00222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + switch( pack_key ) + { + case _PACK_SELECT_KEY( 1, 0, 1): // copy trans + { + shaderParam.global_scale[0] = 15; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = height; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_add_param(node, "uniNormilizationLo_2x8", &uniNormilizationLo_NHWC_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniNormilizationHi_2x8", &uniNormilizationHi_NHWC_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + break; + case _PACK_SELECT_KEY( 1, 2, 1): // copy reorder trans + { + shaderParam.global_scale[0] = 15; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = height; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_add_param(node, "uniNormilizationLo_2x8", &uniNormilizationLo_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniNormilizationHi_2x8", &uniNormilizationHi_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + break; + case 
_PACK_SELECT_KEY( 1, 0, 0): // copy + case _PACK_SELECT_KEY( 1, 2, 0): // copy reorder + { + shaderParam.global_scale[0] = 8; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = height; + shaderParam.global_size[2] = 1; + + if(attr[0]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part0_4x4", &uniExtractRtoF32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part1_4x4", &uniExtractRtoF32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part0_4x4", &uniExtractGtoF32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part1_4x4", &uniExtractGtoF32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part0_4x4", &uniExtractBtoF32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part1_4x4", &uniExtractBtoF32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "r_order", &reorder); + status |= vsi_nn_kernel_gpu_add_param(node, "b_order", &order1); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + break; + case _PACK_SELECT_KEY( 0, 0, 0): // + case _PACK_SELECT_KEY( 0, 2, 0): // reorder + { + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = height; + shaderParam.global_size[2] = 1; + + if(attr[0]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToR", &uniUnpackToR); + status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToG", &uniUnpackToG); + status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToB", &uniUnpackToB); + status |= vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10); + status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift); + status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes); + status |= vsi_nn_kernel_gpu_add_param(node, "r_order", &reorder); + status |= vsi_nn_kernel_gpu_add_param(node, "b_order", &order1); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + break; + case _PACK_SELECT_KEY( 0, 0, 1): // trans + { + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width / 3 + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = height; + shaderParam.global_size[2] = 1; + + if(attr[0]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + } + status |= 
vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToR", &uniUnpackToR); + status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToG", &uniUnpackToG); + status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToB", &uniUnpackToB); + status |= vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10); + status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift); + status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes); + status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBLo_2x8", &uniRePackRGBLo_NHWC_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBHi_2x8", &uniRePackRGBHi_NHWC_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + break; + case _PACK_SELECT_KEY( 0, 2, 1): // reorder trans + { + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width / 3 + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = height; + shaderParam.global_size[2] = 1; + + if(attr[0]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToR", &uniUnpackToR); + status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToG", &uniUnpackToG); + status |= vsi_nn_kernel_gpu_add_param(node, "uniUnpackToB", &uniUnpackToB); + status |= vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10); + status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift); + status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes); + status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBLo_2x8", &uniRePackRGBLo_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniRePackRGBHi_2x8", &uniRePackRGBHi_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + break; + default: + break; + } + + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_rgb_copy_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_kernel_convert_type_e convert_type = SCALE; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = 
vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if(enable_copy && enable_perm) + { + convert_type = COPY_NHWC; + } + else if(enable_copy) + { + convert_type = COPY; + } + else if(enable_perm) + { + convert_type = SCALE_NHWC; + } + else + { + convert_type = SCALE; + } + + key = HASH_PRE_PROCESS_RGB_KEY( input0_dtype, output_dtype, convert_type, 0 ); + + for( i = 0; i < _cnt_of_array(pre_process_rgb_map); i ++ ) + { + if( pre_process_rgb_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(pre_process_rgb_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_rgb_map[i].function_name ); + kernel->info.parameters = vxPreProcessRgbKernel_param_def; + kernel->info.numParams = _cnt_of_array( vxPreProcessRgbKernel_param_def ); + kernel->info.initialize = _pre_process_rgb_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + pre_process_rgb_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + pre_process_rgb_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_RGB_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; + int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 2; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + + /* Pass parameters to node. 
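When enable_perm is set, the output is first viewed as a 2-D (width * height, channels) tensor through vsi_nn_reshape_tensor; the input and (possibly reshaped) output fill parameter slots 0-1, the ten config scalars (scale_x, scale_y, left, top, r/g/b means, rgb_scale, reverse, trans) fill slots 2-11, and the scalars are released once vsi_nn_kernel_node_pass_param has handed them to the node.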
*/ + if(trans) + { + shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1]; + shapes[1] = outputs[0]->attr.size[2]; + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num); + + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_RGB_PARAM_NUM, + inputs, 1, &reshape_tensors[0], 1 ); + } + else + { + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_RGB_PARAM_NUM, + inputs, 1, outputs, 1 ); + } + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_RGB_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[2] ); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + vsi_nn_kernel_scalar_release( &tmp_params[5] ); + vsi_nn_kernel_scalar_release( &tmp_params[6] ); + vsi_nn_kernel_scalar_release( &tmp_params[7] ); + vsi_nn_kernel_scalar_release( &tmp_params[8] ); + vsi_nn_kernel_scalar_release( &tmp_params[9] ); + vsi_nn_kernel_scalar_release( &tmp_params[10] ); + vsi_nn_kernel_scalar_release( &tmp_params[11] ); + } + } + + if(reshape_tensors[0]) + { + vsi_nn_ReleaseTensor(&reshape_tensors[0]); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( pre_process_rgb, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c new file mode 100644 index 0000000..b7617ae --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c @@ -0,0 +1,1191 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toF16") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_trans_U8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_trans_U8toU8") + +#define KERNEL_SOURCE_1 "pre_process_yuv420_scale_u8", +#define KERNEL_SOURCE_2 "pre_process_yuv420_copy_u8", +#define KERNEL_SOURCE_3 "pre_process_yuv420_scale_fp16", +#define KERNEL_SOURCE_4 "pre_process_yuv420_scale_i16", +#define KERNEL_SOURCE_5 "pre_process_yuv420_scale_i8", +#define KERNEL_SOURCE_6 "pre_process_yuv420_trans_u8" + +typedef enum +{ + COPY = 0, + SCALE, + TRANS, + COPY_TRANS +} vsi_nn_kernel_convert_type_e; + +#define HASH_PRE_PROCESS_YUV420_KEY(_input0_type, _output_type, _convert_type, _image_2d) \ + ((_input0_type << 24) | (_output_type << 16) | (_convert_type << 8) | (_image_2d)) + +#define TENSOR_PRE_PROCESS_YUV420_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \ + { HASH_PRE_PROCESS_YUV420_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 0), \ + VX_KERNEL_NAME_PRE_PROCESS_YUV420_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } pre_process_yuv420_map[] = +{ + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_3) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_4) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_5) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY_TRANS, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, TRANS, KERNEL_SOURCE_6) +}; + +static vx_param_description_t vxPreProcessYuv420Kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, 
VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _EVIS_PRE_PROCESS_YUV420_PARAM_NUM _cnt_of_array(vxPreProcessYuv420Kernel_param_def) + +DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + int32_t dstZP = 0; + float dstScale = 1; + int32_t reorder = 0; + int32_t trans = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + dstZP = attr[0]->asymm.zero_point; + dstScale = attr[0]->asymm.scale; + width = out_shape->data[0]; + height = out_shape->data[1]; + + if(reorder != 0) + { + reorder = 2; + order1 = 0; + } + + if(trans) + { + width = width / 3; + } + + if(attr[0]->dtype == U8) + { + dstScale = 1.0f / dstScale; + } + + shaderParam.global_scale[0] = 16; + if(attr[0]->dtype == I16 || attr[0]->dtype == F16) + { + shaderParam.global_scale[0] = 8; + } + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 2); + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniPackBG0_2x8 = {{ + 0x11011011, // TCfg + 0x10010010, // ASelt + 0x01000000, 0x02020001, // ABin + 0x22022022, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000000, 0x00000001, + 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniPackTmpAndR_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x03000100, 0x07060104, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniPackRB0_2x8 = {{ + 0x11011011, // TCfg + 0x10010010, // ASelt + 0x03000302, 0x05040004, // ABin + 0x22022022, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000000, 0x00000001, + 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackTmp0AndG_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x03030100, 0x07060404, // ABin + 0x22222222, 
// BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniPackGR1_2x8 = {{ + 0x11011011, // TCfg + 0x10010010, // ASelt + 0x06000505, 0x07070006, // ABin + 0x22022022, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000000, 0x00000001, + 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackTmp1AndB_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x03060100, 0x07060704, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackBG1_2x8 = {{ + 0x11011011, // TCfg + 0x10010010, // ASelt + 0x09000808, 0x0a0a0009, // ABin + 0x22022022, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000000, 0x00000001, + 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackTmp1AndR_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x03080100, 0x07060904, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniPackRB2_2x8 = {{ + 0x11011011, // TCfg + 0x10010010, // ASelt + 0x0b000b0a, 0x0d0c000c, // ABin + 0x22022022, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000000, 0x00000001, + 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackTmp2AndG_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x030b0100, 0x07060c04, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackGR2_2x8 = {{ + 0x11011011, // TCfg + 0x10010010, // ASelt + 0x0e000d0d, 0x0f0f000e, // ABin + 0x22022022, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000000, 0x00000001, + 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackTmp2AndB_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x030e0100, 0x07060f04, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpR1st_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00130012, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t 
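/* the 16-bit constants in these DP tables correspond to the fixed-point (x256) BT.601 YUV-to-RGB coefficients: 0x012a = 298 (Y), 0x0199 = 409 (V term of R), 0x00d0 = 208 and 0x0064 = 100 (V and U terms of G), 0x0204 = 516 (U term of B) */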
uniCalculateTmpR2nd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00250024, 0x00370036, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpR3rd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00490048, 0x005b005a, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpR4th_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x006d006c, 0x007f007e, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateR1st_4x4 = {{ + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002608, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpG1st_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00130012, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpG2nd_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00250024, 0x00370036, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpG3rd_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00490048, 0x005b005a, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpG4th_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x006d006c, 0x007f007e, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpGbyU_2x8 = {{ + 0x66666666, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010064, 0x00010064, 0x00010064, 0x00010064, + 0x00010064, 0x00010064, 0x00010064, 0x00010064 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateG1st_4x4 = {{ + 0x07070707, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00130012, // ABin + 0x08080808, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002608, // AccumType, ConstantType, and 
PostShift + 0x00010000, 0x00000000, 0x00010000, 0x00000000, + 0x00010000, 0x00000000, 0x00010000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateG2nd_4x4 = {{ + 0x07130707, // TCfg + 0x04100404, // ASelt + 0x00210020, 0x00330302, // ABin + 0x08200808, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002608, // AccumType, ConstantType, and PostShift + 0x00010000, 0x00000000, 0x00010000, 0x00000000, + 0x00000000, 0x00000001, 0x00010000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateG3rd_4x4 = {{ + 0x07130707, // TCfg + 0x04100404, // ASelt + 0x00410040, 0x00530502, // ABin + 0x08200808, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002608, // AccumType, ConstantType, and PostShift + 0x00010000, 0x00000000, 0x00010000, 0x00000000, + 0x00000000, 0x00000001, 0x00010000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateG4th_4x4 = {{ + 0x07070707, // TCfg + 0x04040404, // ASelt + 0x00610060, 0x00730072, // ABin + 0x08080808, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002608, // AccumType, ConstantType, and PostShift + 0x00010000, 0x00000000, 0x00010000, 0x00000000, + 0x00010000, 0x00000000, 0x00010000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpB1st_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00130012, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpB2nd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00250024, 0x00370036, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpB3rd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00490048, 0x005b005a, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpB4th_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x006d006c, 0x007f007e, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniQuantU8toU8LoB_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniQuantU8toU8HiB_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniQuantU8toU8LoG_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, 
// ASelt + 0x23222120, 0x27262524, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniQuantU8toU8HiG_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x2b2a2928, 0x2f2e2d2c, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniQuantU8toU8LoR_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x43424140, 0x47464544, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniQuantU8toU8HiR_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x4b4a4948, 0x4f4e4d4c, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + switch( attr[0]->dtype ) + { + case U8: + { + // R + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR1st_4x4", &uniCalculateTmpR1st_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR2nd_4x4", &uniCalculateTmpR2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR3rd_4x4", &uniCalculateTmpR3rd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR4th_4x4", &uniCalculateTmpR4th_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateR1st_4x4", &uniCalculateR1st_4x4); + + //G + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpG1st_4x4", &uniCalculateTmpG1st_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpG2nd_4x4", &uniCalculateTmpG2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpG3rd_4x4", &uniCalculateTmpG3rd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpG4th_4x4", &uniCalculateTmpG4th_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU_2x8", &uniCalculateTmpGbyU_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateG1st_4x4", &uniCalculateG1st_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateG2nd_4x4", &uniCalculateG2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateG3rd_4x4", &uniCalculateG3rd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateG4th_4x4", &uniCalculateG4th_4x4); + + //B + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB1st_4x4", &uniCalculateTmpB1st_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB2nd_4x4", &uniCalculateTmpB2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB3rd_4x4", &uniCalculateTmpB3rd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB4th_4x4", &uniCalculateTmpB4th_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateB1st_4x4", &uniCalculateR1st_4x4); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG0_2x8", &uniPackBG0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmpAndR_2x8", &uniPackTmpAndR_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, 
"uniPackRB0_2x8", &uniPackRB0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp0AndG_2x8", &uniPackTmp0AndG_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR1_2x8", &uniPackGR1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndB_2x8", &uniPackTmp1AndB_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG1_2x8", &uniPackBG1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndR_2x8", &uniPackTmp1AndR_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB2_2x8", &uniPackRB2_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndG_2x8", &uniPackTmp2AndG_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR2_2x8", &uniPackGR2_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndB_2x8", &uniPackTmp2AndB_2x8); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoB_2x8", &uniQuantU8toU8LoB_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiB_2x8", &uniQuantU8toU8HiB_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoG_2x8", &uniQuantU8toU8LoG_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiG_2x8", &uniQuantU8toU8HiG_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoR_2x8", &uniQuantU8toU8LoR_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiR_2x8", &uniQuantU8toU8HiR_2x8); + + status |= vsi_nn_kernel_gpu_add_param(node, "zp", &dstZP); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_yuv420_copy_initializer() */ + +DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + int32_t dstZP = 0; + float dstScale = 1; + int32_t reorder = 0; + int32_t trans = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + dstZP = attr[0]->asymm.zero_point; + dstScale = attr[0]->asymm.scale; + width = out_shape->data[0]; + height = out_shape->data[1]; + + if(reorder != 0) + { + reorder = 2; + order1 = 0; + } + if(trans) + { + width = width / 3; + } + + if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + dstScale = (vx_float32)((int64_t)1 << attr[0]->dfp.fl); + } + else + { + 
dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[0]->dfp.fl)); + } + dstZP = 0; + } + else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + dstScale = 1.0f/dstScale; + } + else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + dstScale = 1; + dstZP = 0; + } + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 2); + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpRWise_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpRWise2nd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00550044, 0x00770066, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpRWise3rd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00990088, 0x00bb00aa, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpRWise4th_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00dd00cc, 0x00ff00ee, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateR1st_4x4 = {{ + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002608, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpGWise_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpGWise2nd_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00550044, 0x00770066, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // 
BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpGWise3rd_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00990088, 0x00bb00aa, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpGWise4th_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00dd00cc, 0x00ff00ee, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpGbyU_2x8 = {{ + 0x66666666, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpGbyU2nd_2x8 = {{ + 0x66666666, // TCfg + 0x44444444, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateGWise_4x4 = {{ + 0x07070707, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x08080808, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002608, // AccumType, ConstantType, and PostShift + 0x00010000, 0x00000000, 0x00010000, 0x00000000, 0x00010000, 0x00000000, 0x00010000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateGWise2nd_4x4 = {{ + 0x07070707, // TCfg + 0x04040404, // ASelt + 0x00510040, 0x00730062, // ABin + 0x08080808, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002608, // AccumType, ConstantType, and PostShift + 0x00010000, 0x00000000, 0x00010000, 0x00000000, 0x00010000, 0x00000000, 0x00010000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpBWise_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpBWise2nd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00550044, 0x00770066, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpBWise3rd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00990088, 0x00bb00aa, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t 
uniCalculateTmpBWise4th_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00dd00cc, 0x00ff00ee, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniDescaleU8_4x4 = {{ + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002614, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniBilinearTmp1st_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00450001, 0x00cd0089, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinearTmp2nd_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00040000, 0x000c0008, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinearTmp3rd_4x4 = {{ + 0x69696969, // TCfg + 0x00000000, // ASelt + 0x45670123, 0xcdef89ab, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinearTmp4th_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00460002, 0x00ce008a, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x04000400, 0x00000000, 0x04000400, 0x00000000, 0x04000400, 0x00000000, 0x04000400, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + //trans + gpu_dp_inst_t uniTransPackBgr1st_2x8 = {{ + 0x11311311, // TCfg + 0x00100100, // ASelt + 0x01000400, 0x06020105, // ABin + 0x22022022, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniTransPackBgr2nd_2x8 = {{ + 0x00003113, // TCfg + 0x00001001, // ASelt + 0x03070302, 0x00000000, // ABin + 0x00000220, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniCalculateR1st_4x4", &uniCalculateR1st_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU_2x8", &uniCalculateTmpGbyU_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, 
"uniCalculateTmpGbyU2nd_2x8", &uniCalculateTmpGbyU2nd_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateB1st_4x4", &uniCalculateR1st_4x4); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniDescaleU8_4x4", &uniDescaleU8_4x4); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpRWise_4x4", &uniCalculateTmpRWise_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpRWise2nd_4x4", &uniCalculateTmpRWise2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpRWise3rd_4x4", &uniCalculateTmpRWise3rd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpRWise4th_4x4", &uniCalculateTmpRWise4th_4x4); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGWise_4x4", &uniCalculateTmpGWise_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGWise2nd_4x4", &uniCalculateTmpGWise2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGWise3rd_4x4", &uniCalculateTmpGWise3rd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGWise4th_4x4", &uniCalculateTmpGWise4th_4x4); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpBWise_4x4", &uniCalculateTmpBWise_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpBWise2nd_4x4", &uniCalculateTmpBWise2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpBWise3rd_4x4", &uniCalculateTmpBWise3rd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpBWise4th_4x4", &uniCalculateTmpBWise4th_4x4); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp1st_4x4", &uniBilinearTmp1st_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp2nd_4x4", &uniBilinearTmp2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp3rd_4x4", &uniBilinearTmp3rd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp4th_4x4", &uniBilinearTmp4th_4x4); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise_4x4", &uniCalculateGWise_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise2nd_4x4", &uniCalculateGWise2nd_4x4); + + if(trans) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr1st_2x8", &uniTransPackBgr1st_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr2nd_2x8", &uniTransPackBgr2nd_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else + { + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); + } + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( attr[0]->dtype ) + { + case U8: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); + status |= vsi_nn_kernel_gpu_add_param(node, "zp", &dstZP); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case I8: + case I16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case F16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", &uniConvertHalftoFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_yuv420_initializer() */ + +static 
vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_kernel_convert_type_e convert_type = SCALE; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if(enable_perm && enable_copy) + { + convert_type = COPY_TRANS; + } + else if(enable_perm) + { + convert_type = TRANS; + } + else if(enable_copy && output_dtype == U8) + { + convert_type = COPY; + } + else + { + convert_type = SCALE; + } + + key = HASH_PRE_PROCESS_YUV420_KEY( input0_dtype, output_dtype, convert_type, 0 ); + + for( i = 0; i < _cnt_of_array(pre_process_yuv420_map); i ++ ) + { + if( pre_process_yuv420_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(pre_process_yuv420_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_yuv420_map[i].function_name ); + kernel->info.parameters = vxPreProcessYuv420Kernel_param_def; + kernel->info.numParams = _cnt_of_array( vxPreProcessYuv420Kernel_param_def ); + + if(enable_copy && output_dtype == U8) + { + kernel->info.initialize = _pre_process_yuv420_copy_initializer; + } + else + { + kernel->info.initialize = _pre_process_yuv420_initializer; + } + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + pre_process_yuv420_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + pre_process_yuv420_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_YUV420_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; + int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 4; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + + /* Pass parameters to node. 
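Same packing scheme as the RGB kernel: with enable_perm the output is reshaped to (width * height, channels); the three input planes and the output occupy parameter slots 0-3, the ten config scalars fill slots 4-13, and they are released after vsi_nn_kernel_node_pass_param has passed them to the node.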
*/ + if(trans) + { + shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1]; + shapes[1] = outputs[0]->attr.size[2]; + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num); + + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM, + inputs, 3, &reshape_tensors[0], 1 ); + } + else + { + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM, + inputs, 3, outputs, 1 ); + } + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + vsi_nn_kernel_scalar_release( &tmp_params[5] ); + vsi_nn_kernel_scalar_release( &tmp_params[6] ); + vsi_nn_kernel_scalar_release( &tmp_params[7] ); + vsi_nn_kernel_scalar_release( &tmp_params[8] ); + vsi_nn_kernel_scalar_release( &tmp_params[9] ); + vsi_nn_kernel_scalar_release( &tmp_params[10] ); + vsi_nn_kernel_scalar_release( &tmp_params[11] ); + vsi_nn_kernel_scalar_release( &tmp_params[12] ); + vsi_nn_kernel_scalar_release( &tmp_params[13] ); + } + } + if(reshape_tensors[0]) + { + vsi_nn_ReleaseTensor(&reshape_tensors[0]); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( pre_process_yuv420, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c new file mode 100644 index 0000000..adb16cb --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c @@ -0,0 +1,1174 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toF16") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_trans_U8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_TRANS_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_trans_U8toU8") + +#define KERNEL_SOURCE_1 "pre_process_yuv444_scale", +#define KERNEL_SOURCE_2 "pre_process_yuv444_trans_u8", +#define KERNEL_SOURCE_3 "pre_process_yuv444_scale_fp16", +#define KERNEL_SOURCE_4 "pre_process_yuv444_copy_u8", + +typedef enum +{ + COPY = 0, + SCALE, + TRANS, + COPY_TRANS +} vsi_nn_kernel_convert_type_e; + +#define HASH_PRE_PROCESS_YUV444_KEY(_input0_type, _output_type, _convert_type, _image_2d) \ + ((_input0_type << 24) | (_output_type << 16) | (_convert_type << 8) | (_image_2d)) + +#define TENSOR_PRE_PROCESS_YUV444_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \ + { HASH_PRE_PROCESS_YUV444_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 0), \ + VX_KERNEL_NAME_PRE_PROCESS_YUV444_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } pre_process_yuv444_map[] = +{ + TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_3) + TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, COPY, KERNEL_SOURCE_4) + TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, COPY_TRANS, KERNEL_SOURCE_4) + TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, TRANS, KERNEL_SOURCE_2) +}; + +static vx_param_description_t vxPreProcessYuv444Kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define 
_EVIS_PRE_PROCESS_YUV444_PARAM_NUM _cnt_of_array(vxPreProcessYuv444Kernel_param_def) + +DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + int32_t dstZP = 0; + float dstScale = 1; + int32_t reorder = 0; + int32_t trans = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + dstZP = attr[0]->asymm.zero_point; + dstScale = attr[0]->asymm.scale; + width = out_shape->data[0]; + height = out_shape->data[1]; + + if(reorder != 0) + { + reorder = 2; + order1 = 0; + } + + if(trans) + { + width = width / 3; + } + + if(attr[0]->dtype == U8) + { + dstScale = 1.0f / dstScale; + } + + shaderParam.global_scale[0] = 16; + if(attr[0]->dtype == I16 || attr[0]->dtype == F16) + { + shaderParam.global_scale[0] = 8; + } + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 2); + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniPackBG0_2x8 = {{ + 0x11011011, // TCfg + 0x10010010, // ASelt + 0x01000000, 0x02020001, // ABin + 0x22022022, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000000, 0x00000001, + 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniPackTmpAndR_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x03000100, 0x07060104, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniPackRB0_2x8 = {{ + 0x11011011, // TCfg + 0x10010010, // ASelt + 0x03000302, 0x05040004, // ABin + 0x22022022, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000000, 0x00000001, + 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackTmp0AndG_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x03030100, 0x07060404, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 
0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniPackGR1_2x8 = {{ + 0x11011011, // TCfg + 0x10010010, // ASelt + 0x06000505, 0x07070006, // ABin + 0x22022022, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000000, 0x00000001, + 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackTmp1AndB_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x03060100, 0x07060704, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackBG1_2x8 = {{ + 0x11011011, // TCfg + 0x10010010, // ASelt + 0x09000808, 0x0a0a0009, // ABin + 0x22022022, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000000, 0x00000001, + 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackTmp1AndR_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x03080100, 0x07060904, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniPackRB2_2x8 = {{ + 0x11011011, // TCfg + 0x10010010, // ASelt + 0x0b000b0a, 0x0d0c000c, // ABin + 0x22022022, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000000, 0x00000001, + 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackTmp2AndG_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x030b0100, 0x07060c04, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackGR2_2x8 = {{ + 0x11011011, // TCfg + 0x10010010, // ASelt + 0x0e000d0d, 0x0f0f000e, // ABin + 0x22022022, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000000, 0x00000001, + 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackTmp2AndB_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x030e0100, 0x07060f04, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpR1st_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpR2nd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00550044, 0x00770066, // ABin + 
0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpR3rd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00990088, 0x00bb00aa, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpR4th_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00dd00cc, 0x00ff00ee, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateR1st_4x4 = {{ + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002608, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpG1st_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpG2nd_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00550044, 0x00770066, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpG3rd_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00990088, 0x00bb00aa, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpG4th_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00dd00cc, 0x00ff00ee, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpGbyU_2x8 = {{ + 0x66666666, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010064, 0x00010064, 0x00010064, 0x00010064, + 0x00010064, 0x00010064, 0x00010064, 0x00010064 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpGbyU2_2x8 = {{ + 0x66666666, // TCfg + 0x44444444, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010064, 0x00010064, 0x00010064, 0x00010064, + 0x00010064, 0x00010064, 0x00010064, 0x00010064 
// Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateG1st_4x4 = {{ + 0x07070707, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x08080808, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002608, // AccumType, ConstantType, and PostShift + 0x00010000, 0x00000000, 0x00010000, 0x00000000, + 0x00010000, 0x00000000, 0x00010000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateG2nd_4x4 = {{ + 0x07070707, // TCfg + 0x04040404, // ASelt + 0x00510040, 0x00730062, // ABin + 0x08080808, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002608, // AccumType, ConstantType, and PostShift + 0x00010000, 0x00000000, 0x00010000, 0x00000000, + 0x00010000, 0x00000000, 0x00010000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpB1st_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpB2nd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00550044, 0x00770066, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpB3rd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00990088, 0x00bb00aa, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpB4th_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00dd00cc, 0x00ff00ee, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniQuantU8toU8LoB_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniQuantU8toU8HiB_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniQuantU8toU8LoG_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x23222120, 0x27262524, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniQuantU8toU8HiG_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x2b2a2928, 0x2f2e2d2c, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 
0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniQuantU8toU8LoR_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x43424140, 0x47464544, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniQuantU8toU8HiR_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x4b4a4948, 0x4f4e4d4c, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + switch( attr[0]->dtype ) + { + case U8: + { + // R + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR1st_4x4", &uniCalculateTmpR1st_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR2nd_4x4", &uniCalculateTmpR2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR3rd_4x4", &uniCalculateTmpR3rd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR4th_4x4", &uniCalculateTmpR4th_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateR1st_4x4", &uniCalculateR1st_4x4); + + //G + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpG1st_4x4", &uniCalculateTmpG1st_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpG2nd_4x4", &uniCalculateTmpG2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpG3rd_4x4", &uniCalculateTmpG3rd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpG4th_4x4", &uniCalculateTmpG4th_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU_2x8", &uniCalculateTmpGbyU_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU2_2x8", &uniCalculateTmpGbyU2_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateG1st_4x4", &uniCalculateG1st_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateG2nd_4x4", &uniCalculateG2nd_4x4); + + //B + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB1st_4x4", &uniCalculateTmpB1st_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB2nd_4x4", &uniCalculateTmpB2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB3rd_4x4", &uniCalculateTmpB3rd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpB4th_4x4", &uniCalculateTmpB4th_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateB1st_4x4", &uniCalculateR1st_4x4); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG0_2x8", &uniPackBG0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmpAndR_2x8", &uniPackTmpAndR_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB0_2x8", &uniPackRB0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp0AndG_2x8", &uniPackTmp0AndG_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR1_2x8", &uniPackGR1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndB_2x8", &uniPackTmp1AndB_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackBG1_2x8", &uniPackBG1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp1AndR_2x8", &uniPackTmp1AndR_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackRB2_2x8", &uniPackRB2_2x8); + 
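+            /* Note (descriptive comment, inferred from the constants above): the
+             * 0x012a / 0x0199 / 0x00d0+0x0064 / 0x0204 values in the uniCalculate*
+             * instructions match the usual 8.8 fixed-point BT.601 YUV->RGB
+             * coefficients (1.164, 1.596, 0.813 / 0.391, 2.018 scaled by 256);
+             * the uniPack* / uniQuant* instructions then re-interleave the planar
+             * R/G/B lanes and requantize them for the U8 output. */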
status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndG_2x8", &uniPackTmp2AndG_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackGR2_2x8", &uniPackGR2_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniPackTmp2AndB_2x8", &uniPackTmp2AndB_2x8); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoB_2x8", &uniQuantU8toU8LoB_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiB_2x8", &uniQuantU8toU8HiB_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoG_2x8", &uniQuantU8toU8LoG_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiG_2x8", &uniQuantU8toU8HiG_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoR_2x8", &uniQuantU8toU8LoR_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiR_2x8", &uniQuantU8toU8HiR_2x8); + + status |= vsi_nn_kernel_gpu_add_param(node, "zp", &dstZP); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_yuv444_copy_initializer() */ + +DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + int32_t dstZP = 0; + float dstScale = 1; + int32_t reorder = 0; + int32_t trans = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &trans); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + dstZP = attr[0]->asymm.zero_point; + dstScale = attr[0]->asymm.scale; + width = out_shape->data[0]; + height = out_shape->data[1]; + + if(reorder != 0) + { + reorder = 2; + order1 = 0; + } + if(trans) + { + width = width / 3; + } + + if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + dstScale = (vx_float32)((int64_t)1 << attr[0]->dfp.fl); + } + else + { + dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[0]->dfp.fl)); + } + dstZP = 0; + } + else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + dstScale = 1.0f/dstScale; + } + else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + dstScale = 1; + dstZP = 0; + } + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = gpu_align_p2((height + 
shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 2); + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpRWise_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpRWise2nd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00550044, 0x00770066, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpRWise3rd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00990088, 0x00bb00aa, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpRWise4th_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00dd00cc, 0x00ff00ee, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000, 0x0199012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateR1st_4x4 = {{ + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002608, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpGWise_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpGWise2nd_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00550044, 0x00770066, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpGWise3rd_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00990088, 0x00bb00aa, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, 
GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpGWise4th_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00dd00cc, 0x00ff00ee, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000, 0x00d0012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpGbyU_2x8 = {{ + 0x66666666, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpGbyU2nd_2x8 = {{ + 0x66666666, // TCfg + 0x44444444, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064, 0x00010064 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateGWise_4x4 = {{ + 0x07070707, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x08080808, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002608, // AccumType, ConstantType, and PostShift + 0x00010000, 0x00000000, 0x00010000, 0x00000000, 0x00010000, 0x00000000, 0x00010000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateGWise2nd_4x4 = {{ + 0x07070707, // TCfg + 0x04040404, // ASelt + 0x00510040, 0x00730062, // ABin + 0x08080808, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002608, // AccumType, ConstantType, and PostShift + 0x00010000, 0x00000000, 0x00010000, 0x00000000, 0x00010000, 0x00000000, 0x00010000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniCalculateTmpBWise_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpBWise2nd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00550044, 0x00770066, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpBWise3rd_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00990088, 0x00bb00aa, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateTmpBWise4th_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00dd00cc, 0x00ff00ee, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000, 0x0204012a, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniDescaleU8_4x4 = {{ + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002614, // 
AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniBilinearTmp1st_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00450001, 0x00cd0089, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinearTmp2nd_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00040000, 0x000c0008, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000, 0x00000400, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinearTmp3rd_4x4 = {{ + 0x69696969, // TCfg + 0x00000000, // ASelt + 0x45670123, 0xcdef89ab, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinearTmp4th_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00460002, 0x00ce008a, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x04000400, 0x00000000, 0x04000400, 0x00000000, 0x04000400, 0x00000000, 0x04000400, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + //trans + gpu_dp_inst_t uniTransPackBgr1st_2x8 = {{ + 0x11311311, // TCfg + 0x00100100, // ASelt + 0x01000400, 0x06020105, // ABin + 0x22022022, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniTransPackBgr2nd_2x8 = {{ + 0x00003113, // TCfg + 0x00001001, // ASelt + 0x03070302, 0x00000000, // ABin + 0x00000220, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000001, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniCalculateR1st_4x4", &uniCalculateR1st_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU_2x8", &uniCalculateTmpGbyU_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGbyU2nd_2x8", &uniCalculateTmpGbyU2nd_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateB1st_4x4", &uniCalculateR1st_4x4); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniDescaleU8_4x4", &uniDescaleU8_4x4); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpRWise_4x4", &uniCalculateTmpRWise_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpRWise2nd_4x4", &uniCalculateTmpRWise2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpRWise3rd_4x4", 
&uniCalculateTmpRWise3rd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpRWise4th_4x4", &uniCalculateTmpRWise4th_4x4); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGWise_4x4", &uniCalculateTmpGWise_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGWise2nd_4x4", &uniCalculateTmpGWise2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGWise3rd_4x4", &uniCalculateTmpGWise3rd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpGWise4th_4x4", &uniCalculateTmpGWise4th_4x4); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpBWise_4x4", &uniCalculateTmpBWise_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpBWise2nd_4x4", &uniCalculateTmpBWise2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpBWise3rd_4x4", &uniCalculateTmpBWise3rd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpBWise4th_4x4", &uniCalculateTmpBWise4th_4x4); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp1st_4x4", &uniBilinearTmp1st_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp2nd_4x4", &uniBilinearTmp2nd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp3rd_4x4", &uniBilinearTmp3rd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinearTmp4th_4x4", &uniBilinearTmp4th_4x4); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise_4x4", &uniCalculateGWise_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateGWise2nd_4x4", &uniCalculateGWise2nd_4x4); + + if(trans) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr1st_2x8", &uniTransPackBgr1st_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniTransPackBgr2nd_2x8", &uniTransPackBgr2nd_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else + { + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); + } + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( attr[0]->dtype ) + { + case U8: + case I8: + case I16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); + status |= vsi_nn_kernel_gpu_add_param(node, "zp", &dstZP); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case F16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", &uniConvertHalftoFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_yuv444_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_kernel_convert_type_e convert_type = SCALE; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + vsi_bool enable_perm = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if(enable_perm && enable_copy) + { + convert_type = COPY_TRANS; + } + 
else if(enable_perm) + { + convert_type = TRANS; + } + else if(enable_copy && output_dtype == U8) + { + convert_type = COPY; + } + else + { + convert_type = SCALE; + } + + key = HASH_PRE_PROCESS_YUV444_KEY( input0_dtype, output_dtype, convert_type, 0 ); + + for( i = 0; i < _cnt_of_array(pre_process_yuv444_map); i ++ ) + { + if( pre_process_yuv444_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(pre_process_yuv444_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_yuv444_map[i].function_name ); + kernel->info.parameters = vxPreProcessYuv444Kernel_param_def; + kernel->info.numParams = _cnt_of_array( vxPreProcessYuv444Kernel_param_def ); + + if(enable_copy && output_dtype == U8) + { + kernel->info.initialize = _pre_process_yuv444_copy_initializer; + } + else + { + kernel->info.initialize = _pre_process_yuv444_initializer; + } + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + pre_process_yuv444_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + pre_process_yuv444_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_YUV444_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; + int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 4; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + + /* Pass parameters to node. 
*/ + if(trans) + { + shapes[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1]; + shapes[1] = outputs[0]->attr.size[2]; + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num); + + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM, + inputs, 3, &reshape_tensors[0], 1 ); + } + else + { + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM, + inputs, 3, outputs, 1 ); + } + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + vsi_nn_kernel_scalar_release( &tmp_params[5] ); + vsi_nn_kernel_scalar_release( &tmp_params[6] ); + vsi_nn_kernel_scalar_release( &tmp_params[7] ); + vsi_nn_kernel_scalar_release( &tmp_params[8] ); + vsi_nn_kernel_scalar_release( &tmp_params[9] ); + vsi_nn_kernel_scalar_release( &tmp_params[10] ); + vsi_nn_kernel_scalar_release( &tmp_params[11] ); + vsi_nn_kernel_scalar_release( &tmp_params[12] ); + vsi_nn_kernel_scalar_release( &tmp_params[13] ); + } + } + if(reshape_tensors[0]) + { + vsi_nn_ReleaseTensor(&reshape_tensors[0]); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( pre_process_yuv444, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c new file mode 100644 index 0000000..e74fb72 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c @@ -0,0 +1,646 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + + +#define KERNEL_SOURCE0 "prelu", +#define KERNEL_SOURCE1 "prelu_BF16", + + typedef enum +{ + _3D = 0, + _2D, + _2D_OPT, +} vsi_nn_shader_type_e; + +#define HASH_PRELU_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) + +#define PRELU_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_PRELU_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + CVIVANTE_NAMESPACE("evis.prelu_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE), \ + SOURCE }, + +#define PRELU_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SH_TYPE, SOURCE) \ + { HASH_PRELU_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, SH_TYPE), \ + CVIVANTE_NAMESPACE("evis.prelu_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE#SH_TYPE), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + PRELU_KERNELS(BF16, BF16, BF16, KERNEL_SOURCE1) + PRELU_KERNELS(BF16, F16, BF16, KERNEL_SOURCE1) + PRELU_KERNELS(F16, F16, F16, KERNEL_SOURCE0) + PRELU_KERNELS(F16, F16, I16, KERNEL_SOURCE0) + PRELU_KERNELS(F16, F16, U8, KERNEL_SOURCE0) + PRELU_KERNELS(F16, F16, I8, KERNEL_SOURCE0) + PRELU_KERNELS(I16, F16, I16, KERNEL_SOURCE0) + PRELU_KERNELS(I16, F16, F16, KERNEL_SOURCE0) + PRELU_KERNELS(U8, F16, U8, KERNEL_SOURCE0) + PRELU_KERNELS(U8, F16, F16, KERNEL_SOURCE0) + PRELU_KERNELS(I8, F16, I8, KERNEL_SOURCE0) + PRELU_KERNELS(I8, F16, F16, KERNEL_SOURCE0) + PRELU_KERNELS(I8, F16, F16, KERNEL_SOURCE0) + + PRELU_KERNELS_2D(BF16, BF16, BF16, _2D, KERNEL_SOURCE1) + PRELU_KERNELS_2D(BF16, F16, BF16, _2D, KERNEL_SOURCE1) + PRELU_KERNELS_2D(F16, F16, F16, _2D, KERNEL_SOURCE0) + PRELU_KERNELS_2D(F16, F16, I16, _2D, KERNEL_SOURCE0) + PRELU_KERNELS_2D(F16, F16, U8, _2D, KERNEL_SOURCE0) + PRELU_KERNELS_2D(F16, F16, I8, _2D, KERNEL_SOURCE0) + PRELU_KERNELS_2D(I16, F16, I16, _2D, KERNEL_SOURCE0) + PRELU_KERNELS_2D(I16, F16, I16, _2D_OPT, KERNEL_SOURCE0) + PRELU_KERNELS_2D(I16, F16, F16, _2D, KERNEL_SOURCE0) + PRELU_KERNELS_2D(U8, F16, U8, _2D, KERNEL_SOURCE0) + PRELU_KERNELS_2D(U8, F16, F16, _2D, KERNEL_SOURCE0) + PRELU_KERNELS_2D(I8, F16, I8, _2D, KERNEL_SOURCE0) + PRELU_KERNELS_2D(I8, F16, I8, _2D_OPT, KERNEL_SOURCE0) + PRELU_KERNELS_2D(I8, F16, F16, _2D, KERNEL_SOURCE0) + PRELU_KERNELS_2D(U8, U8, U8, _2D, KERNEL_SOURCE0) + PRELU_KERNELS_2D(U8, U8, F16, _2D, KERNEL_SOURCE0) + +}; + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; +#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) + +DEF_KERNEL_INITIALIZER(_prelu_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + int8_t in0_fl = 0; + int32_t inputZP0 = 0; + float input_scale0 = 1.0f; + int32_t inputZP1 = 0; + float input_scale1 = 1.0f; + int8_t out_fl = 0; + float outputZP = 0; + + int32_t shift0 = 0; + 
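+    /* shift0 / is_ge_fl below: for DFP-quantized tensors the rescale factor is a
+     * power of two (2^fl), so when in0_fl - out_fl >= 0 the whole requantization
+     * can be folded into the DP-unit post-shift (see gpu_dp_inst_update_postshfit
+     * in the I8/I16 cases) instead of a floating-point multiply. */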
vsi_bool is_ge_fl = FALSE; + vsi_bool is_2d_img = FALSE; + uint32_t evis_version = 0; + + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + vsi_int_array_t * out_shape = NULL; + uint32_t pack_key; + vx_context ctx = vxGetContext((vx_reference)node); + vx_hardware_caps_params_t hw_param; + + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t)); + status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t)); + CHECK_STATUS_FAIL_GOTO(status, final); + + if (hw_param.evis1 == TRUE && hw_param.evis2 == FALSE) + { + evis_version = 1; + } + else if (hw_param.evis1 == FALSE && hw_param.evis2 == TRUE) + { + evis_version = 2; + } + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_shape = attr[2]->shape; + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + in0_fl = (int8_t)attr[0]->dfp.fl; + if (in0_fl >= 0) + { + input_scale0 = 1.0f / (vx_float32) ((int64_t)1 << in0_fl); + } + else if (in0_fl < 0) + { + input_scale0 = (vx_float32) ((int64_t)1 << -in0_fl); + } + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + inputZP0 = attr[0]->asymm.zero_point; + input_scale0 = attr[0]->asymm.scale; + } + + if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + inputZP1 = attr[1]->asymm.zero_point; + input_scale1 = attr[1]->asymm.scale; + } + + if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + out_fl = (int8_t)attr[2]->dfp.fl; + + if (out_fl >= 0) + input_scale0 *= (vx_float32)((int64_t)1 << out_fl); + else if (out_fl < 0) + input_scale0 *= 1.0f / (vx_float32) ((int64_t)1 << -out_fl); + } + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + outputZP = (float)attr[2]->asymm.zero_point; + input_scale0 = input_scale0 / attr[2]->asymm.scale; + } + shift0 = in0_fl - out_fl; + + is_2d_img = (out_shape->size < 3) || (out_shape->data[2] == 1); + is_ge_fl = shift0 >= 0; + +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, GE_FL, IMG_2D, EVIS2 ) \ + (IN0_TYPE | ( OUT_TYPE << 16) | (GE_FL << 24) | (IMG_2D << 25) | (EVIS2 << 26)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype, is_ge_fl, is_2d_img, evis_version ); + + if ( attr[0]->dtype == I8 && attr[2]->dtype == I8 && is_ge_fl) + { + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + switch( pack_key ) + { + case _PACK_SELECT_KEY( I8, I8, 1, 1, 2 ): + case _PACK_SELECT_KEY( I16, I16, 1, 1, 2 ): + { + gpu_dp_inst_t uniPreluDFPLo_2x8b = {{ + 0x77777777, // TCfg + 0x44444444, // ASelt + 0x33221100, 0x77665544, // ABin + 0x00000000, // BSelt + 0x30201000, 0x70605040, // BBin + 0x00004000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPreluDFPHi_2x8b = {{ + 0x77777777, // TCfg + 0x44444444, // ASelt + 0xbbaa9988, 0xffeeddcc, // ABin + 0x00000000, // BSelt + 0x30201000, 0x70605040, // BBin + 0x00004000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if ( attr[0]->dtype == I16 ) + { + uniPreluDFPLo_2x8b.data[7] = 0x00003000; + uniPreluDFPHi_2x8b.data[7] = 0x00003000; + } + + gpu_dp_inst_update_postshfit( &uniPreluDFPLo_2x8b, shift0 ); + gpu_dp_inst_update_postshfit( &uniPreluDFPHi_2x8b, shift0 ); + + status = vsi_nn_kernel_gpu_add_param( node, + "uniPreluDFPLo_2x8b", &uniPreluDFPLo_2x8b ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniPreluDFPHi_2x8b", &uniPreluDFPHi_2x8b ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( I8, I8, 1, 1, 1 ): + case _PACK_SELECT_KEY( I16, I16, 1, 1, 1 ): + { + gpu_dp_inst_t uniPreluInt8_2x8 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0xb3a29180, 0xf7e6d5c4, // ABin + 0x66666666, // BSelt + 0x30201000, 0x70605040, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPreluInt16_part0_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00510040, 0x00730062, // ABin + 0x06060606, // BSelt + 0x00100000, 0x00300020, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPreluInt16_part1_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00510040, 0x00730062, // ABin + 0x06060606, // BSelt + 0x00500040, 0x00700060, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uniPreluInt8_2x8, shift0 ); + gpu_dp_inst_update_postshfit( &uniPreluInt16_part0_4x4, shift0 ); + gpu_dp_inst_update_postshfit( &uniPreluInt16_part1_4x4, shift0 ); + + status = vsi_nn_kernel_gpu_add_param( node, + "uniPreluInt8_2x8", &uniPreluInt8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniPreluInt16_part0_4x4", &uniPreluInt16_part0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniPreluInt16_part1_4x4", &uniPreluInt16_part1_4x4 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 1 ): + case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 2 ): + case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 1 ): + case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 2 ): + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01010000, 0x03030202, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and 
PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvF16toF32_Part0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvF16toF32_Part1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackedBF16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + if (attr[1]->dtype == F16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvF16toF32_Part0_4x4", &uniConvF16toF32_Part0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvF16toF32_Part1_4x4", &uniConvF16toF32_Part1_4x4 ); + } + status |= vsi_nn_kernel_gpu_add_param( node, + "uniPackedBF16_2x8", &uniPackedBF16_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + { + gpu_dp_inst_t uniConvF16toF32_part0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvF16toF32_part1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // 
BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDataSubZPtoFp32Part0_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDataSubZPtoFp32Part1_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniDataSubZPtoFp32Part0_4x4", &uniDataSubZPtoFp32Part0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniDataSubZPtoFp32Part1_4x4", &uniDataSubZPtoFp32Part1_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvF16toF32_part0_4x4", &uniConvF16toF32_part0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvF16toF32_part1_4x4", &uniConvF16toF32_part1_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "inputZP0", &inputZP0 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_scale0", &input_scale0 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "inputZP1", &inputZP1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_scale1", &input_scale1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "outputZP", &outputZP ); + if (attr[2]->dtype == F16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtact8Bin_2x8", &uniExtractHalf8_2x8 ); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtact8Bin_2x8", &uniExtractInteger_2x8 ); + } + } + break; + } + +#undef _PACK_SELECT_KEY + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} /* _prelu_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_bool image_2d, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input0_dtype; + vsi_nn_kernel_dtype_e input1_dtype; + vsi_nn_kernel_dtype_e output_dtype; + int8_t input_fl = inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP ? + inputs[0]->attr.dtype.fl : 0; + int8_t output_fl = outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP ? + outputs[0]->attr.dtype.fl : 1; + vsi_nn_shader_type_e sh_type = image_2d ? (input_fl >= output_fl ? 
_2D_OPT : _2D) : _3D; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_PRELU_KEY( input0_dtype, input1_dtype, output_dtype, sh_type ); + + for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _prelu_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t new_rank = 0; + vsi_bool ret; + int32_t is_per_channel_alpha = 0; + + is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha"); + + if (is_per_channel_alpha) + { + return NULL; + } + + ret = vsi_nn_kernel_optimize_eltwise_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + (int32_t *)inputs[1]->attr.size, inputs[1]->attr.dim_num, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if (ret) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], (uint32_t*)shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes[2], new_rank ); + } + else + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[2]->attr.size, + reshape_tensors[2]->attr.dim_num ) ) + { + goto final; + } + + // Reorder tensor + image_2d = (reshape_tensors[2]->attr.dim_num == 2); + status = _query_kernel( reshape_tensors, &reshape_tensors[2], image_2d, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Pass parameters to node. 
*/ + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM, + reshape_tensors, 2, &reshape_tensors[2], 1 ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM ); + + } + } + +final: + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + vsi_nn_ReleaseTensor( &reshape_tensors[2] ); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( prelu, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c b/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c new file mode 100644 index 0000000..721b835 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c @@ -0,0 +1,538 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +typedef enum +{ + INTERNAL_KERNEL_SEED, + INTERNAL_KERNEL_CDF, + INTERNAL_KERNEL_MULTINOMIAL, +} _internal_kernel_e; + +/* + * Define kernel meta. 
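+ * The multinomial op is decomposed into three internal kernels: a seed generator,
+ * a CDF builder and the sampling kernel itself.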
+ */ +#define _MULTINOMIAL_KERNEL_SOURCE "random_multinomial" +#define _MULTINOMIAL_KERNEL_NAME CVIVANTE_NAMESPACE("evis.random_multinomial") +#define _CDF_KERNEL_SOURCE "random_multinomial" +#define _SEED_KERNEL_SOURCE "random_multinomial" +#define _SEED_KERNEL_NAME CVIVANTE_NAMESPACE("evis.random_seed") + +// Add kernel hashtable here +#define MULTINOMIAL_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + ((IN0_DTYPE << 16) | ( IN1_DTYPE << 8 ) | ( OUT_DTYPE )) +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ + { MULTINOMIAL_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), \ + _MULTINOMIAL_KERNEL_NAME, SOURCE } + +#define CDF_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) +#define CDF_PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, SOURCE ) \ + { CDF_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("evis.random_multinomial_cdf_"#IN_DTYPE), \ + SOURCE } + +#define SEED_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) +#define SEED_PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, SOURCE ) \ + { SEED_HASH_KEY( IN_DTYPE, OUT_DTYPE ), _SEED_KERNEL_NAME, SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _seed_kernel_map[] = +{ + // Register kernel here + SEED_PACK_KERNEL_MAP( I32, F32, _SEED_KERNEL_SOURCE ), +}; + +static const _kernel_map_type _cdf_kernel_map[] = +{ + // Register kernel here + CDF_PACK_KERNEL_MAP( F16, F32, _CDF_KERNEL_SOURCE ), + CDF_PACK_KERNEL_MAP( F32, F32, _CDF_KERNEL_SOURCE ), +}; + +static const _kernel_map_type _kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32, I32, _MULTINOMIAL_KERNEL_SOURCE ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define SCALAR_CLASS_SIZE (3) +#define _PARAM_NUM _cnt_of_array( _kernel_param_def ) + +static vx_param_description_t _cdf_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _CDF_PARAM_NUM _cnt_of_array( _cdf_kernel_param_def ) + +static vx_param_description_t _seed_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _SEED_PARAM_NUM _cnt_of_array( _seed_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_multinomial_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr = NULL; + vsi_int_array_t * in_shape = NULL; + + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); + + in_shape = attr->shape; + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (in_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = in_shape->data[1]; + + 
status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + if (attr) + { + vsi_nn_kernel_tensor_attr_release( &attr ); + attr = NULL; + } + + return status; +} /* _multinomial_initializer() */ + +DEF_KERNEL_INITIALIZER(_cdf_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr = NULL; + vsi_int_array_t * in_shape = NULL; + uint32_t class_max_iter = 0; + uint32_t class_size = 0; + uint32_t batch = 0; + + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); + + in_shape = attr->shape; + + class_size = in_shape->data[0]; + batch = in_shape->data[1]; + if (attr->dtype == F32) + { + class_max_iter = (class_size + 3) >> 2; + } + else + { + class_max_iter = (class_size + 7) >> 3; + } + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = 1; + gpu_param.global_size[1] = batch; + + + if (attr->dtype == F16) + { + gpu_dp_inst_t uniPackMaxData_2x8 = {{ + 0x00000111, // TCfg + 0x00000000, // ASelt + 0x00050300, 0x00000000, // ABin + 0x00000222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00004400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGetSubData0to3_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x05050505, // BSelt + 0x00110011, 0x00110011, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGetSubData4to7_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniPackMaxData_2x8", &uniPackMaxData_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniGetSubData0to3_4x4", &uniGetSubData0to3_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniGetSubData4to7_4x4", &uniGetSubData4to7_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_add_param(node, "class_max_iter", &class_max_iter); + status |= vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + if (attr) + { + vsi_nn_kernel_tensor_attr_release( &attr ); + attr = NULL; + } + + return status; +} /* _cdf_initializer() */ + +DEF_KERNEL_INITIALIZER(_seed_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr = NULL; + vsi_int_array_t * out_shape = NULL; + uint32_t stride = 0; + uint32_t iter = 8; + float rand_max = (float)(pow(2.0,32)); + float re_rand_max = 1 / rand_max; + + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + 
CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); + + out_shape = attr->shape; + iter = (out_shape->data[0] + 3) / 4; + + stride = iter * 4; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = 1; + gpu_param.global_size[1] = 1; + + status = vsi_nn_kernel_gpu_add_param(node, "stride", &stride); + status |= vsi_nn_kernel_gpu_add_param(node, "iter", &iter); + status |= vsi_nn_kernel_gpu_add_param(node, "re_rand_max", &re_rand_max); + status |= vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + if (attr) + { + vsi_nn_kernel_tensor_attr_release( &attr ); + attr = NULL; + } + + return status; +} /* _seed_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + const uint32_t hashkey, + _internal_kernel_e kernel_id + /* Add extra params */ + ) +{ + vx_kernel_initialize_f initializer = NULL; + vx_param_description_t * param_def; + vsi_status status = VSI_FAILURE; + const _kernel_map_type* kernel_map; + size_t kernel_map_size; + size_t param_size; + uint32_t i; + + switch( kernel_id ) + { + case INTERNAL_KERNEL_SEED: + initializer = _seed_initializer; + kernel_map = _seed_kernel_map; + kernel_map_size = _cnt_of_array( _seed_kernel_map ); + param_def = _seed_kernel_param_def; + param_size = _SEED_PARAM_NUM; + break; + case INTERNAL_KERNEL_CDF: + initializer = _cdf_initializer; + kernel_map = _cdf_kernel_map; + kernel_map_size = _cnt_of_array( _cdf_kernel_map ); + param_def = _cdf_kernel_param_def; + param_size = _CDF_PARAM_NUM; + break; + case INTERNAL_KERNEL_MULTINOMIAL: + initializer = _multinomial_initializer; + kernel_map = _kernel_map; + kernel_map_size = _cnt_of_array( _kernel_map ); + param_def = _kernel_param_def; + param_size = _PARAM_NUM; + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == hashkey ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ +#define INTERNAL_KERNEL_SIZE (3) +#define SEED_INDEX (0) +#define CDF_INDEX (1) +#define SEEDS_INDEX (2) + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t cdf_node_params[_CDF_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t seed_node_params[_SEED_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; + int32_t class_max_stride = 0; + int32_t 
class_size = 0; + uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; + uint32_t hashkey = 0; + int32_t i; + + // Check if gpu can support the size + if( !vsi_nn_kernel_gpu_check_shape( + (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + // Assign unique_id + ikernels[i]->unique_id = kernel->unique_id; + } + if( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + class_max_stride = (int32_t)gpu_align_p2(inputs[0]->attr.size[0], 4); + } + else + { + class_max_stride = (int32_t)gpu_align_p2(inputs[0]->attr.size[0], 8); + } + class_size = inputs[0]->attr.size[0]; + + memcpy( &attr, &(outputs[0]->attr), sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + tensors[SEED_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + + attr.size[0] = class_max_stride * inputs[0]->attr.size[1]; + attr.size[1] = inputs[0]->attr.size[1]; + attr.dim_num = 2; + tensors[CDF_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + + memcpy( &attr, &(inputs[1]->attr), sizeof(vsi_nn_tensor_attr_t) ); + attr.size[1] = 1; + attr.dim_num = 2; + tensors[SEEDS_INDEX] = vsi_nn_reshape_tensor( graph, + inputs[1], (uint32_t*)attr.size, attr.dim_num ); + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + hashkeys[SEED_INDEX]= SEED_HASH_KEY( in1_dtype, F32 ); + hashkeys[CDF_INDEX] = CDF_HASH_KEY( in0_dtype, F32 ); + hashkey = MULTINOMIAL_HASH_KEY( F32, F32, out_dtype ); + + status = _query_kernel( ikernels[SEED_INDEX], hashkeys[SEED_INDEX], INTERNAL_KERNEL_SEED ); + if( VSI_SUCCESS != status ) + { + goto final; + } + status = _query_kernel( ikernels[CDF_INDEX], hashkeys[CDF_INDEX], INTERNAL_KERNEL_CDF ); + if( VSI_SUCCESS != status ) + { + goto final; + } + status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_MULTINOMIAL ); + if( VSI_SUCCESS != status ) + { + goto final; + } + + // Seed + node = vsi_nn_kernel_create_node( graph, ikernels[SEED_INDEX] ); + VSI_ASSERT( node != NULL ); + vsi_nn_kernel_node_pack_io( seed_node_params, _SEED_PARAM_NUM, + &tensors[SEEDS_INDEX], 1, &tensors[SEED_INDEX], 1 ); + status = vsi_nn_kernel_node_pass_param( node, seed_node_params, _SEED_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_node_release( &node ); + + // CDF + node = vsi_nn_kernel_create_node( graph, ikernels[CDF_INDEX] ); + VSI_ASSERT( node != NULL ); + vsi_nn_kernel_node_pack_io( cdf_node_params, _CDF_PARAM_NUM, + &inputs[0], 1, &tensors[CDF_INDEX], 1 ); + status = vsi_nn_kernel_node_pass_param( node, cdf_node_params, _CDF_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_node_release( &node ); + + // Multinomial + node = vsi_nn_kernel_create_node( graph, kernel ); + VSI_ASSERT( node != NULL ); + vsi_nn_kernel_node_pack_io( node_params, _PARAM_NUM, tensors, 2, outputs, 1 ); + node_params[SCALAR_CLASS_SIZE] = vsi_nn_kernel_scalar_create( graph, I32, &class_size ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_CLASS_SIZE] ); + + /* Pass parameters to node. 
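+    All three kernel nodes (seed, CDF, multinomial) have been created and parameterized above; the cleanup below only releases the temporary kernels and tensors.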
*/ +final: + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + if( ikernels[i] ) + { + vsi_nn_kernel_release( &ikernels[i] ); + } + if( tensors[i] ) + { + vsi_nn_ReleaseTensor( &tensors[i] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( random_multinomial, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c new file mode 100644 index 0000000..845a692 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c @@ -0,0 +1,295 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define HASH_REDUCEALL_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + #define HASH_REDUCEALL_KERNEL_SOURCE_NAME(AXIS) \ + "reduceall_internal_axis"#AXIS + +#define HASH_REDUCEALL_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEALL_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.reduceall_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ + HASH_REDUCEALL_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_REDUCEALL_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEALL_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("evis.reduceall_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + HASH_REDUCEALL_KERNEL_SOURCE_NAME(AXIS) }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _reduceall_internal_kernel_map[] = +{ + HASH_REDUCEALL_KERNELS( 0, I8, I8 ) + HASH_REDUCEALL_KERNELS( 1, I8, I8 ) + HASH_REDUCEALL_KERNELS( 2, I8, I8 ) + + HASH_REDUCEALL_KERNELS_2D( 0, I8, I8 ) + HASH_REDUCEALL_KERNELS_2D( 1, I8, I8 ) + HASH_REDUCEALL_KERNELS_2D( 2, I8, I8 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _reduceall_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _REDUCEALL_INTERNAL_PARAM_NUM _cnt_of_array( _reduceall_internal_kernel_param_def ) + +#define SCALAR_INPUT_AXIS (2) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_reduceall_internal_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + int32_t axis = 0; + vsi_nn_kernel_tensor_attr_t *input_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t * input_shape = NULL; + vsi_int_array_t * output_shape = NULL; + int32_t axisSize = 0; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + output_shape = output_attr->shape; + input_shape = input_attr->shape; + + if (axis == 0) + { + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + gpu_param.dim = 2; + gpu_param.global_size[0] = gpu_align_p2( + (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (output_shape->data[1] + 
gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = 1; + axisSize = input_shape->data[axis]; + + { + gpu_dp_inst_t uniS8AddAll_16x1 = {{ + 0xffffffff, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00004400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + if (0 == axis) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniS8AddAll_16x1", &uniS8AddAll_16x1); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + + status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + + return status; +} /* _reduceall_internal_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _reduceall_internal_kernel_map; + size_t kernel_map_size = _cnt_of_array( _reduceall_internal_kernel_map ); + vx_param_description_t * param_def = _reduceall_internal_kernel_param_def; + size_t param_def_size = _cnt_of_array( _reduceall_internal_kernel_param_def ); + vx_kernel_initialize_f initializer = _reduceall_internal_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if ( BOOL8 == in_dtype ) + { + in_dtype = I8; + } + + if ( BOOL8 == out_dtype ) + { + out_dtype = I8; + } + + key = HASH_REDUCEALL_HASH_KEY( axis, in_dtype, out_dtype, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REDUCEALL_INTERNAL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; 
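+        /* Shape not supported by the EVIS shader path, or the reduction axis lies beyond the first three dimensions. */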
+ } + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, axis, image_2d ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _REDUCEALL_INTERNAL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEALL_INTERNAL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( reduceall_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c new file mode 100644 index 0000000..4773ee5 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c @@ -0,0 +1,295 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define HASH_REDUCEANY_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + #define HASH_REDUCEANY_KERNEL_SOURCE_NAME(AXIS) \ + "reduceany_internal_axis"#AXIS + +#define HASH_REDUCEANY_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEANY_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.reduceany_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ + HASH_REDUCEANY_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_REDUCEANY_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEANY_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("evis.reduceany_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + HASH_REDUCEANY_KERNEL_SOURCE_NAME(AXIS) }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _reduceany_internal_kernel_map[] = +{ + HASH_REDUCEANY_KERNELS( 0, I8, I8 ) + HASH_REDUCEANY_KERNELS( 1, I8, I8 ) + HASH_REDUCEANY_KERNELS( 2, I8, I8 ) + + HASH_REDUCEANY_KERNELS_2D( 0, I8, I8 ) + HASH_REDUCEANY_KERNELS_2D( 1, I8, I8 ) + HASH_REDUCEANY_KERNELS_2D( 2, I8, I8 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _reduceany_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _REDUCEANY_INTERNAL_PARAM_NUM _cnt_of_array( _reduceany_internal_kernel_param_def ) + +#define SCALAR_INPUT_AXIS (2) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_reduceany_internal_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + int32_t axis = 0; + vsi_nn_kernel_tensor_attr_t *input_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t * input_shape = NULL; + vsi_int_array_t * output_shape = NULL; + int32_t axisSize = 0; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + output_shape = output_attr->shape; + input_shape = input_attr->shape; + + if (axis == 0) + { + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + gpu_param.dim = 2; + gpu_param.global_size[0] = gpu_align_p2( + (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (output_shape->data[1] + 
gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = 1; + axisSize = input_shape->data[axis]; + + { + gpu_dp_inst_t uniS8AddAll_16x1 = {{ + 0xffffffff, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00004400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + if (0 == axis) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniS8AddAll_16x1", &uniS8AddAll_16x1); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + + status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + + return status; +} /* _reduceany_internal_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _reduceany_internal_kernel_map; + size_t kernel_map_size = _cnt_of_array( _reduceany_internal_kernel_map ); + vx_param_description_t * param_def = _reduceany_internal_kernel_param_def; + size_t param_def_size = _cnt_of_array( _reduceany_internal_kernel_param_def ); + vx_kernel_initialize_f initializer = _reduceany_internal_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if ( BOOL8 == in_dtype ) + { + in_dtype = I8; + } + + if ( BOOL8 == out_dtype ) + { + out_dtype = I8; + } + + key = HASH_REDUCEANY_HASH_KEY( axis, in_dtype, out_dtype, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REDUCEANY_INTERNAL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; 
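+        /* Shape not supported by the EVIS shader path, or the reduction axis lies beyond the first three dimensions. */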
+ } + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, axis, image_2d ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _REDUCEANY_INTERNAL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEANY_INTERNAL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( reduceany_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c new file mode 100644 index 0000000..604bacc --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c @@ -0,0 +1,425 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define HASH_REDUCEMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + #define HASH_REDUCEMAX_KERNEL_SOURCE_NAME(AXIS) \ + "reducemax_internal_axis"#AXIS + +#define HASH_REDUCEMAX_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.reducemax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ + HASH_REDUCEMAX_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_REDUCEMAX_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("evis.reducemax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + HASH_REDUCEMAX_KERNEL_SOURCE_NAME(AXIS) }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _reducemax_internal_kernel_map[] = +{ + HASH_REDUCEMAX_KERNELS( 0, F16, F16 ) + HASH_REDUCEMAX_KERNELS( 0, F16, I16 ) + HASH_REDUCEMAX_KERNELS( 0, F16, I8 ) + HASH_REDUCEMAX_KERNELS( 0, F16, U8 ) + HASH_REDUCEMAX_KERNELS( 0, I16, I16 ) + HASH_REDUCEMAX_KERNELS( 0, I8, I8 ) + HASH_REDUCEMAX_KERNELS( 0, U8, U8 ) + HASH_REDUCEMAX_KERNELS( 0, I16, F16 ) + HASH_REDUCEMAX_KERNELS( 0, I8, F16 ) + HASH_REDUCEMAX_KERNELS( 0, U8, F16 ) + HASH_REDUCEMAX_KERNELS( 1, F16, F16 ) + HASH_REDUCEMAX_KERNELS( 1, F16, I16 ) + HASH_REDUCEMAX_KERNELS( 1, F16, I8 ) + HASH_REDUCEMAX_KERNELS( 1, F16, U8 ) + HASH_REDUCEMAX_KERNELS( 1, I16, I16 ) + HASH_REDUCEMAX_KERNELS( 1, I8, I8 ) + HASH_REDUCEMAX_KERNELS( 1, U8, U8 ) + HASH_REDUCEMAX_KERNELS( 1, I16, F16 ) + HASH_REDUCEMAX_KERNELS( 1, I8, F16 ) + HASH_REDUCEMAX_KERNELS( 1, U8, F16 ) + HASH_REDUCEMAX_KERNELS( 2, F16, F16 ) + HASH_REDUCEMAX_KERNELS( 2, F16, I16 ) + HASH_REDUCEMAX_KERNELS( 2, F16, I8 ) + HASH_REDUCEMAX_KERNELS( 2, F16, U8 ) + HASH_REDUCEMAX_KERNELS( 2, I16, I16 ) + HASH_REDUCEMAX_KERNELS( 2, I8, I8 ) + HASH_REDUCEMAX_KERNELS( 2, U8, U8 ) + HASH_REDUCEMAX_KERNELS( 2, I16, F16 ) + HASH_REDUCEMAX_KERNELS( 2, I8, F16 ) + HASH_REDUCEMAX_KERNELS( 2, U8, F16 ) + + HASH_REDUCEMAX_KERNELS_2D( 0, F16, F16 ) + HASH_REDUCEMAX_KERNELS_2D( 0, F16, I16 ) + HASH_REDUCEMAX_KERNELS_2D( 0, F16, I8 ) + HASH_REDUCEMAX_KERNELS_2D( 0, F16, U8 ) + HASH_REDUCEMAX_KERNELS_2D( 0, I16, I16 ) + HASH_REDUCEMAX_KERNELS_2D( 0, I8, I8 ) + HASH_REDUCEMAX_KERNELS_2D( 0, U8, U8 ) + HASH_REDUCEMAX_KERNELS_2D( 0, I16, F16 ) + HASH_REDUCEMAX_KERNELS_2D( 0, I8, F16 ) + HASH_REDUCEMAX_KERNELS_2D( 0, U8, F16 ) + HASH_REDUCEMAX_KERNELS_2D( 1, F16, F16 ) + HASH_REDUCEMAX_KERNELS_2D( 1, F16, I16 ) + HASH_REDUCEMAX_KERNELS_2D( 1, F16, I8 ) + HASH_REDUCEMAX_KERNELS_2D( 1, F16, U8 ) + HASH_REDUCEMAX_KERNELS_2D( 1, I16, I16 ) + HASH_REDUCEMAX_KERNELS_2D( 1, I8, I8 ) + HASH_REDUCEMAX_KERNELS_2D( 1, U8, U8 ) + HASH_REDUCEMAX_KERNELS_2D( 1, I16, F16 ) + HASH_REDUCEMAX_KERNELS_2D( 1, I8, F16 ) + HASH_REDUCEMAX_KERNELS_2D( 1, U8, F16 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _reducemax_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, 
VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _REDUCEMAX_INTERNAL_PARAM_NUM _cnt_of_array( _reducemax_internal_kernel_param_def ) + +#define SCALAR_INPUT_AXIS (2) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + int32_t axis = 0; + vsi_nn_kernel_tensor_attr_t *input_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t * input_shape = NULL; + vsi_int_array_t * output_shape = NULL; + int32_t input_fl = 0, output_fl = 0; + int32_t axisSize = 0; + float inputScale = 1.0f; + float input_offset_asymmetric = 0.0f; + float outputScale = 1.0f; + float output_offset_asymmetric = 0.0f; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + output_shape = output_attr->shape; + input_shape = input_attr->shape; + + if (axis == 0) + { + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + gpu_param.dim = 2; + gpu_param.global_size[0] = gpu_align_p2( + (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = 1; + axisSize = input_shape->data[axis]; + + { + gpu_dp_inst_t uniPackMaxData_2x8 = {{ + 0x00000111, // TCfg + 0x00000000, // ASelt + 0x00050300, 0x00000000, // ABin + 0x00000222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00004400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetLoData_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetHiData_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + if 
(0 == axis) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniPackMaxData_2x8", &uniPackMaxData_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGetLoData_4x4", &uniGetLoData_4x4 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (1 == axis || 2 == axis) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniGetLoData_4x4", &uniGetLoData_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGetHiData_4x4", &uniGetHiData_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + + if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + input_fl = input_attr->dfp.fl; + if (input_fl > 0) + { + inputScale = 1.0f / (float) ((int64_t)1 << input_fl); + } + else + { + inputScale = (float)((int64_t)1 << -input_fl); + } + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + inputScale = input_attr->asymm.scale; + input_offset_asymmetric = (float)(input_attr->asymm.zero_point); + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + output_fl = output_attr->dfp.fl; + if (output_fl > 0) + { + outputScale = (float) ((int64_t)1 << output_fl); + } + else + { + outputScale = 1.0f / (float)((int64_t)1 << -output_fl); + } + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + outputScale = 1.0f / output_attr->asymm.scale; + output_offset_asymmetric = (float)(output_attr->asymm.zero_point); + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + + return status; +} /* _reducemax_internal_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _reducemax_internal_kernel_map; + size_t kernel_map_size = _cnt_of_array( _reducemax_internal_kernel_map ); + vx_param_description_t * param_def = _reducemax_internal_kernel_param_def; + size_t param_def_size = _cnt_of_array( _reducemax_internal_kernel_param_def ); + vx_kernel_initialize_f initializer = _reducemax_internal_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if ((BF16 == in_dtype) && (BF16 == out_dtype)) + { + in_dtype = F16; + out_dtype = F16; + } + + key = HASH_REDUCEMAX_HASH_KEY( axis, 
in_dtype, out_dtype, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REDUCEMAX_INTERNAL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, axis, image_2d ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _REDUCEMAX_INTERNAL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEMAX_INTERNAL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( reducemax_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c new file mode 100644 index 0000000..e5eedc4 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c @@ -0,0 +1,429 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define HASH_REDUCEMIN_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + #define HASH_REDUCEMIN_KERNEL_SOURCE_NAME(AXIS) \ + "reducemin_internal_axis"#AXIS + +#define HASH_REDUCEMIN_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEMIN_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.reducemin_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ + HASH_REDUCEMIN_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_REDUCEMIN_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEMIN_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("evis.reducemin_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + HASH_REDUCEMIN_KERNEL_SOURCE_NAME(AXIS) }, + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _reducemin_internal_kernel_map[] = +{ + HASH_REDUCEMIN_KERNELS( 0, F16, F16 ) + HASH_REDUCEMIN_KERNELS( 0, F16, I16 ) + HASH_REDUCEMIN_KERNELS( 0, F16, I8 ) + HASH_REDUCEMIN_KERNELS( 0, F16, U8 ) + HASH_REDUCEMIN_KERNELS( 0, I16, I16 ) + HASH_REDUCEMIN_KERNELS( 0, I8, I8 ) + HASH_REDUCEMIN_KERNELS( 0, U8, U8 ) + HASH_REDUCEMIN_KERNELS( 0, I16, F16 ) + HASH_REDUCEMIN_KERNELS( 0, I8, F16 ) + HASH_REDUCEMIN_KERNELS( 0, U8, F16 ) + HASH_REDUCEMIN_KERNELS( 1, F16, F16 ) + HASH_REDUCEMIN_KERNELS( 1, F16, I16 ) + HASH_REDUCEMIN_KERNELS( 1, F16, I8 ) + HASH_REDUCEMIN_KERNELS( 1, F16, U8 ) + HASH_REDUCEMIN_KERNELS( 1, I16, I16 ) + HASH_REDUCEMIN_KERNELS( 1, I8, I8 ) + HASH_REDUCEMIN_KERNELS( 1, U8, U8 ) + HASH_REDUCEMIN_KERNELS( 1, I16, F16 ) + HASH_REDUCEMIN_KERNELS( 1, I8, F16 ) + HASH_REDUCEMIN_KERNELS( 1, U8, F16 ) + HASH_REDUCEMIN_KERNELS( 2, F16, F16 ) + HASH_REDUCEMIN_KERNELS( 2, F16, I16 ) + HASH_REDUCEMIN_KERNELS( 2, F16, I8 ) + HASH_REDUCEMIN_KERNELS( 2, F16, U8 ) + HASH_REDUCEMIN_KERNELS( 2, I16, I16 ) + HASH_REDUCEMIN_KERNELS( 2, I8, I8 ) + HASH_REDUCEMIN_KERNELS( 2, U8, U8 ) + HASH_REDUCEMIN_KERNELS( 2, I16, F16 ) + HASH_REDUCEMIN_KERNELS( 2, I8, F16 ) + HASH_REDUCEMIN_KERNELS( 2, U8, F16 ) + + HASH_REDUCEMIN_KERNELS_2D( 0, F16, F16 ) + HASH_REDUCEMIN_KERNELS_2D( 0, F16, I16 ) + HASH_REDUCEMIN_KERNELS_2D( 0, F16, I8 ) + HASH_REDUCEMIN_KERNELS_2D( 0, F16, U8 ) + HASH_REDUCEMIN_KERNELS_2D( 0, I16, I16 ) + HASH_REDUCEMIN_KERNELS_2D( 0, I8, I8 ) + HASH_REDUCEMIN_KERNELS_2D( 0, U8, U8 ) + HASH_REDUCEMIN_KERNELS_2D( 0, I16, F16 ) + HASH_REDUCEMIN_KERNELS_2D( 0, I8, F16 ) + HASH_REDUCEMIN_KERNELS_2D( 0, U8, F16 ) + HASH_REDUCEMIN_KERNELS_2D( 1, F16, F16 ) + HASH_REDUCEMIN_KERNELS_2D( 1, F16, I16 ) + HASH_REDUCEMIN_KERNELS_2D( 1, F16, I8 ) + HASH_REDUCEMIN_KERNELS_2D( 1, F16, U8 ) + HASH_REDUCEMIN_KERNELS_2D( 1, I16, I16 ) + HASH_REDUCEMIN_KERNELS_2D( 1, I8, I8 ) + HASH_REDUCEMIN_KERNELS_2D( 1, U8, U8 ) + HASH_REDUCEMIN_KERNELS_2D( 1, I16, F16 
) + HASH_REDUCEMIN_KERNELS_2D( 1, I8, F16 ) + HASH_REDUCEMIN_KERNELS_2D( 1, U8, F16 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _reducemin_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _REDUCEMIN_INTERNAL_PARAM_NUM _cnt_of_array( _reducemin_internal_kernel_param_def ) + +#define SCALAR_INPUT_AXIS (2) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + int32_t axis = 0; + vsi_nn_kernel_tensor_attr_t *input_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t * input_shape = NULL; + vsi_int_array_t * output_shape = NULL; + int32_t input_fl = 0, output_fl = 0; + int32_t axisSize = 0; + float inputScale = 1.0f; + float input_offset_asymmetric = 0.0f; + float outputScale = 1.0f; + float output_offset_asymmetric = 0.0f; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + output_shape = output_attr->shape; + input_shape = input_attr->shape; + + if (axis == 0) + { + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + gpu_param.dim = 2; + gpu_param.global_size[0] = gpu_align_p2( + (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = 1; + axisSize = input_shape->data[axis]; + + { + gpu_dp_inst_t uniPackMaxData_2x8 = {{ + 0x00000111, // TCfg + 0x00000000, // ASelt + 0x00050300, 0x00000000, // ABin + 0x00000222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00004400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetLoData_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetHiData_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 
0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + if (0 == axis) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniPackMaxData_2x8", &uniPackMaxData_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGetLoData_4x4", &uniGetLoData_4x4 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (1 == axis || 2 == axis) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniGetLoData_4x4", &uniGetLoData_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGetHiData_4x4", &uniGetHiData_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + + if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + input_fl = input_attr->dfp.fl; + if (input_fl > 0) + { + inputScale = 1.0f / (float) ((int64_t)1 << input_fl); + } + else + { + inputScale = (float)((int64_t)1 << -input_fl); + } + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + inputScale = input_attr->asymm.scale; + input_offset_asymmetric = (float)(input_attr->asymm.zero_point); + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + output_fl = output_attr->dfp.fl; + if (output_fl > 0) + { + outputScale = (float) ((int64_t)1 << output_fl); + } + else + { + outputScale = 1.0f / (float)((int64_t)1 << -output_fl); + } + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + outputScale = 1.0f / output_attr->asymm.scale; + output_offset_asymmetric = (float)(output_attr->asymm.zero_point); + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + + return status; +} /* _reducemin_internal_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _reducemin_internal_kernel_map; + size_t kernel_map_size = _cnt_of_array( _reducemin_internal_kernel_map ); + vx_param_description_t * param_def = _reducemin_internal_kernel_param_def; + size_t param_def_size = _cnt_of_array( _reducemin_internal_kernel_param_def ); + vx_kernel_initialize_f initializer = _reducemin_internal_initializer; + + uint32_t key; + uint32_t i; + + 
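+    /* Kernel lookup: the launch configuration is packed into a 32-bit key,
+     *   key = (axis << 20) | (in_dtype << 12) | (out_dtype << 4) | image_2d,
+     * via HASH_REDUCEMIN_HASH_KEY, and _reducemin_internal_kernel_map is then
+     * scanned linearly for a matching entry. A BF16-to-BF16 request is mapped
+     * onto the F16 kernels below, since the map carries no BF16 entries; if no
+     * entry matches, status stays VSI_FAILURE and _setup() creates no node. */
+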
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if ((BF16 == in_dtype) && (BF16 == out_dtype)) + { + in_dtype = F16; + out_dtype = F16; + } + + key = HASH_REDUCEMIN_HASH_KEY( axis, in_dtype, out_dtype, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REDUCEMIN_INTERNAL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, axis, image_2d ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _REDUCEMIN_INTERNAL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + /* Pass parameters to node. 
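               The first two slots were filled with the input and output tensors by
               vsi_nn_kernel_node_pack_io(); slot SCALAR_INPUT_AXIS carries the reduction
               axis as an I32 scalar and is released again once the parameters have been
               handed over below.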
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEMIN_INTERNAL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( reducemin_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c new file mode 100644 index 0000000..d665ac7 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c @@ -0,0 +1,511 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define HASH_REDUCEPROD_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + + #define HASH_REDUCEPROD_KERNEL_SOURCE_NAME(AXIS) \ + "reduceprod_internal_axis"#AXIS + +#define HASH_REDUCEPROD_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEPROD_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.reduceprod_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ + HASH_REDUCEPROD_KERNEL_SOURCE_NAME(AXIS) }, + +#define HASH_REDUCEPROD_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_REDUCEPROD_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("evis.reduceprod_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + HASH_REDUCEPROD_KERNEL_SOURCE_NAME(AXIS) }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _reduceprod_internal_kernel_map[] = +{ + HASH_REDUCEPROD_KERNELS( 0, F16, F16 ) + HASH_REDUCEPROD_KERNELS( 0, F16, I16 ) + HASH_REDUCEPROD_KERNELS( 0, F16, I8 ) + HASH_REDUCEPROD_KERNELS( 0, F16, U8 ) + HASH_REDUCEPROD_KERNELS( 0, I16, I16 ) + HASH_REDUCEPROD_KERNELS( 0, I8, I8 ) + HASH_REDUCEPROD_KERNELS( 0, U8, U8 ) + HASH_REDUCEPROD_KERNELS( 0, I16, F16 ) + HASH_REDUCEPROD_KERNELS( 0, I8, F16 ) + HASH_REDUCEPROD_KERNELS( 0, U8, F16 ) + HASH_REDUCEPROD_KERNELS( 0, BF16, BF16 ) + HASH_REDUCEPROD_KERNELS( 1, F16, F16 ) + HASH_REDUCEPROD_KERNELS( 1, F16, I16 ) + HASH_REDUCEPROD_KERNELS( 1, F16, I8 ) + HASH_REDUCEPROD_KERNELS( 1, F16, U8 ) + HASH_REDUCEPROD_KERNELS( 1, I16, I16 ) + HASH_REDUCEPROD_KERNELS( 1, I8, I8 ) + HASH_REDUCEPROD_KERNELS( 1, U8, U8 ) + HASH_REDUCEPROD_KERNELS( 1, I16, F16 ) + HASH_REDUCEPROD_KERNELS( 1, I8, F16 ) + HASH_REDUCEPROD_KERNELS( 1, U8, F16 ) + HASH_REDUCEPROD_KERNELS( 1, BF16, BF16 ) + HASH_REDUCEPROD_KERNELS( 2, F16, F16 ) + HASH_REDUCEPROD_KERNELS( 2, F16, I16 ) + HASH_REDUCEPROD_KERNELS( 2, F16, I8 ) + HASH_REDUCEPROD_KERNELS( 2, F16, U8 ) + HASH_REDUCEPROD_KERNELS( 2, I16, I16 ) + HASH_REDUCEPROD_KERNELS( 2, I8, I8 ) + HASH_REDUCEPROD_KERNELS( 2, U8, U8 ) + HASH_REDUCEPROD_KERNELS( 2, I16, F16 ) + HASH_REDUCEPROD_KERNELS( 2, I8, F16 ) + HASH_REDUCEPROD_KERNELS( 2, U8, F16 ) + HASH_REDUCEPROD_KERNELS( 2, BF16, BF16 ) + + HASH_REDUCEPROD_KERNELS_2D( 0, F16, F16 ) + HASH_REDUCEPROD_KERNELS_2D( 0, F16, I16 ) + HASH_REDUCEPROD_KERNELS_2D( 0, F16, I8 ) + HASH_REDUCEPROD_KERNELS_2D( 0, F16, U8 ) + HASH_REDUCEPROD_KERNELS_2D( 0, I16, I16 ) + HASH_REDUCEPROD_KERNELS_2D( 0, I8, I8 ) + HASH_REDUCEPROD_KERNELS_2D( 0, U8, U8 ) + HASH_REDUCEPROD_KERNELS_2D( 0, I16, F16 ) + HASH_REDUCEPROD_KERNELS_2D( 0, I8, F16 ) + HASH_REDUCEPROD_KERNELS_2D( 0, U8, F16 ) + HASH_REDUCEPROD_KERNELS_2D( 0, BF16, BF16 ) + HASH_REDUCEPROD_KERNELS_2D( 1, F16, F16 ) + HASH_REDUCEPROD_KERNELS_2D( 1, F16, I16 ) + HASH_REDUCEPROD_KERNELS_2D( 1, F16, I8 ) + HASH_REDUCEPROD_KERNELS_2D( 1, F16, U8 ) + HASH_REDUCEPROD_KERNELS_2D( 1, I16, I16 ) + HASH_REDUCEPROD_KERNELS_2D( 1, I8, I8 ) + HASH_REDUCEPROD_KERNELS_2D( 1, U8, U8 ) + HASH_REDUCEPROD_KERNELS_2D( 1, I16, F16 ) + HASH_REDUCEPROD_KERNELS_2D( 1, I8, 
F16 ) + HASH_REDUCEPROD_KERNELS_2D( 1, U8, F16 ) + HASH_REDUCEPROD_KERNELS_2D( 1, BF16, BF16 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _reduceprod_internal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _REDUCEPROD_INTERNAL_PARAM_NUM _cnt_of_array( _reduceprod_internal_kernel_param_def ) + +#define SCALAR_INPUT_AXIS (2) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + int32_t axis = 0; + vsi_nn_kernel_tensor_attr_t *input_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t * input_shape = NULL; + vsi_int_array_t * output_shape = NULL; + vsi_nn_kernel_dtype_e src_dtype = F16; + vsi_nn_kernel_dtype_e dst_dtype = F16; + int32_t input_fl = 0, output_fl = 0; + int32_t axisSize = 0; + float inputScale = 1.0f; + float input_offset_asymmetric = 0.0f; + float outputScale = 1.0f; + float output_offset_asymmetric = 0.0f; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + output_shape = output_attr->shape; + input_shape = input_attr->shape; + src_dtype = input_attr->dtype; + dst_dtype = output_attr->dtype; + + if (axis == 0) + { + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + gpu_param.dim = 2; + gpu_param.global_size[0] = gpu_align_p2( + (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = 1; + axisSize = input_shape->data[axis]; + + { + gpu_dp_inst_t uniGetLoData_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetHiData_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetEndLoData_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 
0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetEndHiData_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + uint32_t uniGetEndConfig[4] = {0x11111111, 0x11111100, 0x11110000, 0x11000000}; + + if (0 == axis) + { + int32_t inputWidth = 0; + int32_t inputWidthRemain8 = 0; + if ((axisSize % 8) == 0 ) + { + inputWidth = (axisSize / 8 - 1) * 8; + } + else + { + inputWidth = axisSize / 8 * 8; + inputWidthRemain8 = axisSize % 8; + if (inputWidthRemain8 >= 4) + { + inputWidthRemain8 = inputWidthRemain8 - 4; + uniGetEndHiData_2x8.data[1] = uniGetEndConfig[inputWidthRemain8]; + } + else + { + uniGetEndLoData_2x8.data[1] = uniGetEndConfig[inputWidthRemain8]; + uniGetEndHiData_2x8.data[1] = uniGetEndConfig[0]; + } + } + status = vsi_nn_kernel_gpu_add_param( node, + "inputWidth", &inputWidth ); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniGetEndLoData_2x8", &uniGetEndLoData_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniGetEndHiData_2x8", &uniGetEndHiData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (1 == axis || 2 == axis) + { + status = vsi_nn_kernel_gpu_add_param(node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "axisSize", &axisSize); + if (dst_dtype == BF16) + { + status |= vsi_nn_kernel_gpu_add_param(node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if (src_dtype == BF16) + { + status = vsi_nn_kernel_gpu_add_param(node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniGetLoData_4x4", 
&uniGetLoData_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniGetHiData_4x4", &uniGetHiData_4x4); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + input_fl = input_attr->dfp.fl; + if (input_fl > 0) + { + inputScale = 1.0f / (float) ((int64_t)1 << input_fl); + } + else + { + inputScale = (float)((int64_t)1 << -input_fl); + } + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + inputScale = input_attr->asymm.scale; + input_offset_asymmetric = (float)(input_attr->asymm.zero_point); + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + output_fl = output_attr->dfp.fl; + if (output_fl > 0) + { + outputScale = (float) ((int64_t)1 << output_fl); + } + else + { + outputScale = 1.0f / (float)((int64_t)1 << -output_fl); + } + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + outputScale = 1.0f / output_attr->asymm.scale; + output_offset_asymmetric = (float)(output_attr->asymm.zero_point); + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + + return status; +} /* _reduceprod_internal_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _reduceprod_internal_kernel_map; + size_t kernel_map_size = _cnt_of_array( _reduceprod_internal_kernel_map ); + vx_param_description_t * param_def = _reduceprod_internal_kernel_param_def; + size_t param_def_size = _cnt_of_array( _reduceprod_internal_kernel_param_def ); + vx_kernel_initialize_f initializer = _reduceprod_internal_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_REDUCEPROD_HASH_KEY( axis, in_dtype, out_dtype, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + 
kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REDUCEPROD_INTERNAL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || axis > 2) + { + return NULL; + } + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, axis, image_2d ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _REDUCEPROD_INTERNAL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEPROD_INTERNAL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( reduceprod_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c b/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c new file mode 100644 index 0000000..e454e80 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c @@ -0,0 +1,475 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define _RELU_KERAS_KERNEL_SOURCE "relu_keras" + +#define STR(a) #a +// Add kernel hashtable here +#define RELU_KERAS_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (_image_2d)) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { RELU_KERAS_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0 ), \ + CVIVANTE_NAMESPACE("evis.relu_keras_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_3D"), \ + _RELU_KERAS_KERNEL_SOURCE } + +#define PACK_KERNEL_MAP_2D( IN_DTYPE, OUT_DTYPE ) \ + { RELU_KERAS_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1 ), \ + CVIVANTE_NAMESPACE("evis.relu_keras_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _RELU_KERAS_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _relu_keras_kernel_map[] = +{ + PACK_KERNEL_MAP(BF16, BF16), + PACK_KERNEL_MAP(F16, F16), + PACK_KERNEL_MAP(F16, I16), + PACK_KERNEL_MAP(F16, I8), + PACK_KERNEL_MAP(F16, U8), + PACK_KERNEL_MAP(I16, I16), + PACK_KERNEL_MAP(I16, F16), + PACK_KERNEL_MAP(I8, I8), + PACK_KERNEL_MAP(I8, F16), + PACK_KERNEL_MAP(U8, U8), + PACK_KERNEL_MAP(U8, F16), + PACK_KERNEL_MAP_2D(BF16, BF16), + PACK_KERNEL_MAP_2D(F16, F16), + PACK_KERNEL_MAP_2D(F16, I16), + PACK_KERNEL_MAP_2D(F16, I8), + PACK_KERNEL_MAP_2D(F16, U8), + PACK_KERNEL_MAP_2D(I16, I16), + PACK_KERNEL_MAP_2D(I16, F16), + PACK_KERNEL_MAP_2D(I8, I8), + PACK_KERNEL_MAP_2D(I8, F16), + PACK_KERNEL_MAP_2D(U8, U8), + PACK_KERNEL_MAP_2D(U8, F16), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _relu_keras_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RELU_KERAS_PARAM_NUM _cnt_of_array( _relu_keras_kernel_param_def ) + +#define SCALAR_ALPHA (2) +#define SCALAR_MAX_VALUE (3) +#define SCALAR_THRESHOLD (4) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_relu_keras_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_int_array_t * out_shape = NULL; + vsi_nn_kernel_dtype_e input_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; + float alpha = 0.0f; + float threshold = 0.0f; + float offset = 0.0f; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + float inputTail = 0.0f; + float output_ZP = 0; + float input_ZP = 0; + int32_t srcFixPointPos = 0; + int32_t dstFixPointPos = 0; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( 
output_attr, "Create tensor attr buffer fail.", final ); + out_shape = output_attr->shape; + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_ALPHA], &(alpha)); + vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_THRESHOLD], &(threshold)); + input_dtype = input_attr->dtype; + output_dtype = output_attr->dtype; + offset = alpha * threshold; + + if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + { + srcFixPointPos = input_attr->dfp.fl; + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant) + { + input_ZP = (float)(input_attr->asymm.zero_point); + scaleIn = input_attr->asymm.scale; + } + + if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) + { + dstFixPointPos = output_attr->dfp.fl; + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) + { + output_ZP = (float)(output_attr->asymm.zero_point); + scaleOut = 1.0f / output_attr->asymm.scale; + } + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.dim = out_shape->size < 3 ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant) + { + inputTail = -input_ZP * scaleIn; + status = vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "inputTail", &inputTail); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + { + if (srcFixPointPos >=0 ) + scaleIn = 1.0f / (float) ((int64_t)1 << srcFixPointPos); + else + scaleIn = (float) ((int64_t)1 << -srcFixPointPos); + + status = vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) + { + status = vsi_nn_kernel_gpu_add_param(node, "output_scale", &scaleOut); + status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", &output_ZP); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) + { + if (dstFixPointPos >=0 ) + scaleOut = (float) ((int64_t)1 << dstFixPointPos); + else + scaleOut = 1.0f / (float) ((int64_t)1 << -dstFixPointPos); + + status = vsi_nn_kernel_gpu_add_param(node, "output_scale", &scaleOut); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if (F16 == input_dtype) + { + gpu_dp_inst_t uniConvIntegertoFP32_Lo_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvIntegertoFP32_Hi_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvFP16toFP32_Lo_4x4", &uniConvIntegertoFP32_Lo_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvFP16toFP32_Hi_4x4", 
&uniConvIntegertoFP32_Hi_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (BF16 == input_dtype) + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + gpu_dp_inst_t uniConvIntegertoFP32_Lo_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvIntegertoFP32_Hi_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvIntegertoFP32_Lo_4x4", &uniConvIntegertoFP32_Lo_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvIntegertoFP32_Hi_4x4", &uniConvIntegertoFP32_Hi_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if (F16 == output_dtype) + { + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (BF16 == output_dtype) + { + gpu_dp_inst_t uniPackedBF16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniPackedBF16_2x8", &uniPackedBF16_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 
0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtractInteger_2x8", &uniExtractInteger_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_add_param(node, "offset", &offset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + SAFE_FREE_TENSOR_ATTR(input_attr); + return status; + +} /* _relu_keras_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _relu_keras_kernel_map; + size_t kernel_map_size = _cnt_of_array( _relu_keras_kernel_map ); + vx_param_description_t * param_def = _relu_keras_kernel_param_def; + size_t param_def_size = _cnt_of_array( _relu_keras_kernel_param_def ); + vx_kernel_initialize_f initializer = _relu_keras_initializer; + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = RELU_KERAS_HASH_KEY( in_dtype, out_dtype, image_2d ); + + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; + +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RELU_KERAS_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); + float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); + float threshold = vsi_nn_kernel_param_get_float32( params, "threshold" ); + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RELU_KERAS_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_ALPHA] = vsi_nn_kernel_scalar_create( graph, F32, &alpha ); + node_params[SCALAR_MAX_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &max_value ); + 
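+            /* Note: _relu_keras_initializer above only reads back SCALAR_ALPHA and
+             * SCALAR_THRESHOLD (it folds them into offset = alpha * threshold);
+             * max_value travels unchanged in slot SCALAR_MAX_VALUE, presumably for
+             * the shader program itself to clamp against. */
+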
node_params[SCALAR_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &threshold ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _RELU_KERAS_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALPHA] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_THRESHOLD] ); + } + } + + return node; + +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( relu_keras, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c new file mode 100644 index 0000000..52fb9d4 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -0,0 +1,1110 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "utils/vsi_nn_dtype_util_prv.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
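+ * The _internal_scale_e values below feed the scale_flag field of
+ * RESIZE_BILINEAR_HASH_KEY and select a shader variant by name suffix:
+ * DOWN -> "_DOWN", UP -> "_UP", UP_OPT -> "_UP_opt" (built from the "_opt"
+ * source) and UP_2X_HALF -> "_UP_2X_half", the dedicated 2x upscale chosen
+ * when is_use_2x_up_half_kernel holds in the initializer (U8 data, matching
+ * quantization, half_pixel_centers set, align_corners off, exact 2x ratio).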
+ */ +typedef enum +{ + DOWN = 0, + UP, + UP_OPT, + UP_2X_HALF, +} _internal_scale_e; + +#define _RESIZE_BILINEAR_KERNEL_SOURCE(_input_type) "resize_bilinear_"#_input_type +#define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt" +#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_2X(_input_type) "resize_bilinear_"#_input_type"_UP_2X" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (scale_flag)) + +#define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_DOWN"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_opt"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_2X_half"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_2X(IN_DTYPE) } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_bilinear_kernel_map[] = +{ + PACK_KERNEL_MAP_DOWN(I8, I8), + PACK_KERNEL_MAP_DOWN(I16, I16), + PACK_KERNEL_MAP_DOWN(U8, F16), + PACK_KERNEL_MAP_DOWN(U8, U8), + PACK_KERNEL_MAP_DOWN(F16, F16), + PACK_KERNEL_MAP_DOWN(F16, U8), + PACK_KERNEL_MAP_DOWN(BF16, BF16), + PACK_KERNEL_MAP_UP(I8, I8), + PACK_KERNEL_MAP_UP(I16, I16), + PACK_KERNEL_MAP_UP(U8, U8), + PACK_KERNEL_MAP_UP(F16, F16), + PACK_KERNEL_MAP_UP(BF16, BF16), + PACK_KERNEL_MAP_UP_OPT(U8, U8), + PACK_KERNEL_MAP_UP_2X_HALF(U8, U8), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _resize_bilinear_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RESIZE_BILINEAR_PARAM_NUM _cnt_of_array( _resize_bilinear_kernel_param_def ) +#define _RESIZE_NO_SCALE_PARAM_NUM 4 + +#define SCALAR_ALIGN_CORNERS (2) +#define SCALAR_HALF_PIXEL (3) +#define SCALAR_TENSOR_SCALE (4) + +static vsi_bool _is_same_quant + ( + vsi_nn_kernel_tensor_attr_t * input0_attr, + vsi_nn_kernel_tensor_attr_t * input1_attr + ) +{ + if (NULL == input0_attr || NULL == input1_attr) + { + return FALSE; + } + + if (input0_attr->dtype != input1_attr->dtype || input0_attr->quant != input1_attr->quant) + { + return FALSE; + } + if (input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (input0_attr->dfp.fl != input1_attr->dfp.fl) + { + return FALSE; + } + } + else if (input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + const float diff = (float)1e-5; + if (input0_attr->asymm.zero_point != input1_attr->asymm.zero_point) + { + return FALSE; + } + if(vsi_nn_float_compare(input0_attr->asymm.scale, 
input1_attr->asymm.scale, diff) == FALSE) + { + return FALSE; + } + } + + return TRUE; +} /* _is_same_quant */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_int_array_t * out_shape = NULL; + vsi_int_array_t * in_shape = NULL; + vsi_nn_kernel_dtype_e input_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; + int32_t align_corners; + int32_t half_pixel_centers; + + uint32_t depth = 0; + int32_t srcFixPointPos = 0; + int32_t dstFixPointPos = 0; + float input_scale = 1.0; + int32_t inputZP = 0; + float output_scale = 1.0; + int32_t outputZP = 0; + float scale_factor[2]; + uint32_t in_width; + uint32_t in_height; + uint32_t out_width; + uint32_t out_height; + float half_pixel_value = 0.0f; + vsi_bool is_use_scale_kernel = (vsi_bool)(_RESIZE_BILINEAR_PARAM_NUM == param_size); + vsi_bool is_use_2x_up_half_kernel = FALSE; + + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &align_corners); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &half_pixel_centers); + CHECK_STATUS_FAIL_GOTO(status, final ); + + out_shape = output_attr->shape; + in_shape = input_attr->shape; + input_dtype = input_attr->dtype; + output_dtype = output_attr->dtype; + + in_width = in_shape->data[0]; + in_height = in_shape->data[1]; + depth = in_shape->data[2]; + out_width = out_shape->data[0]; + out_height = out_shape->data[1]; + + if (align_corners && out_width > 1) + { + scale_factor[0] = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + } + else + { + scale_factor[0] = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + } + + if (align_corners && out_height > 1) + { + scale_factor[1] = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1); + } + else + { + scale_factor[1] = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr))) + { + is_use_2x_up_half_kernel = (!align_corners) && (half_pixel_centers); + is_use_2x_up_half_kernel = is_use_2x_up_half_kernel && \ + (2 * in_width == out_width) && (2 * in_height == out_height); + } + + if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) + { + input_scale = input_attr->asymm.scale; + inputZP = input_attr->asymm.zero_point; + } + else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + { + srcFixPointPos = input_attr->dfp.fl; + if (srcFixPointPos >= 0) + { + input_scale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); + } + else if (srcFixPointPos < 0) + { + input_scale = (vx_float32)((int64_t)1 << -srcFixPointPos); + } + inputZP = 0; + } + else + { + input_scale = 1.0f; + inputZP = 0; + } + + if (U8 == output_dtype && 
VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) + { + output_scale = output_attr->asymm.scale; + outputZP = output_attr->asymm.zero_point; + } + else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) + { + dstFixPointPos = output_attr->dfp.fl; + if (dstFixPointPos >= 0) + { + output_scale = (vx_float32) ((int64_t)1 << dstFixPointPos); + } + else if (dstFixPointPos < 0) + { + output_scale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos); + } + outputZP = 0; + } + else + { + output_scale = 1.0; + outputZP = 0; + } + + if (is_use_2x_up_half_kernel) + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + } + else + { + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + if (is_use_2x_up_half_kernel) + { + gpu_dp_inst_t uniResize2xUp_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect + 0x00000400, // AccumType, ConstantType, and PostShift + 0x09030301, 0x03090103, 0x09030301, 0x03090103, + 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize2xUpRound_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000704, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_4x8", &uniResize2xUp_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUpRound_2x8", &uniResize2xUpRound_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + { + float dfpScale = input_scale * output_scale; + gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtact8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + if (I8 == input_dtype && I8 == output_dtype && out_width > in_width) + { + gpu_dp_inst_t uniConvertI32toI16_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetMaskShift_2x8 = {{ + 0x99999999, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x55555555, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertDFP2FP32_part1_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 
0x00150004, 0x00370026, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_part1_4x4", + &uniConvertDFP2FP32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); + CHECK_STATUS_FAIL_GOTO(status, final ); + + gpu_param.global_scale[2] = depth; + } + else if (I16 == input_dtype && I16 == output_dtype && out_width > in_width) + { + gpu_dp_inst_t uniConvertI32toI16_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetMaskShift_2x8 = {{ + 0x99999999, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x55555555, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertDFP2FP32_part1_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00150004, 0x00370026, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_part1_4x4", + &uniConvertDFP2FP32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); + CHECK_STATUS_FAIL_GOTO(status, final ); + + gpu_param.global_scale[2] = depth; + } + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertDFP2FP32_4x4", &uniConvertDFP2FP32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); + status |= vsi_nn_kernel_gpu_add_param( node, "dfpScale", &dfpScale); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (U8 == input_dtype && (U8 == output_dtype || F16 == output_dtype)) + { + float uint8Scale = input_scale / output_scale; + float uint8ZP_out = (float)outputZP; + gpu_dp_inst_t uniExtact8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZPtoFp32_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 
0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + if (F16 == output_dtype) + { + status = vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &input_scale); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + if (out_width > in_width) + { + gpu_dp_inst_t uniConvertI32toI16_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetMaskShift_2x8 = {{ + 0x99999999, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x55555555, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZPtoFp32_part1_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00150004, 0x00370026, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBilinear_4x4_b = {{ + 0x55555555, // TCfg + 0x55550000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x00000000, // BSelt + 0xd951c840, 0xfb73ea62, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); + if (is_use_scale_kernel) + { + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_4x4_b", &uniBilinear_4x4_b); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8SubZPtoFp32_part1_4x4", &uniU8SubZPtoFp32_part1_4x4); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + + gpu_param.global_scale[2] = depth; + } + if (!is_use_scale_kernel) + { + status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &uint8Scale); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &uint8ZP_out); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + status = vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); + if (!is_use_scale_kernel) + { + status = vsi_nn_kernel_gpu_add_param( node, "uniU8SubZPtoFp32_4x4", &uniU8SubZPtoFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "input_ZP", &inputZP); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype)) + { + float uint8Scale = 1.0f / output_scale; + float uint8ZP_out = (vx_float32)outputZP; + gpu_dp_inst_t uniExtact8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t 
uniFp16toFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniRightSubLeft_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtactHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + + if (F16 == input_dtype && F16 == output_dtype && out_width > in_width) + { + gpu_dp_inst_t uniConvertI32toI16_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetMaskShift_2x8 = {{ + 0x99999999, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x55555555, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniFp16toFp32_part1_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00150004, 0x00370026, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); + CHECK_STATUS_FAIL_GOTO(status, final ); + + gpu_param.global_scale[2] = depth; + } + else if (F16 == output_dtype) + { + status = vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uint8Scale", &uint8Scale); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &uint8ZP_out); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_add_param( node, "uniFp16toFp32_4x4", &uniFp16toFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); + CHECK_STATUS_FAIL_GOTO(status, 
final ); + } + else if (BF16 == input_dtype && BF16 == output_dtype) + { + if (out_width > in_width) + { + gpu_dp_inst_t uniConvertI32toI16_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetMaskShift_2x8 = {{ + 0x99999999, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x55555555, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + status = vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); + CHECK_STATUS_FAIL_GOTO(status, final ); + + gpu_param.global_scale[2] = depth; + } + else + { + gpu_dp_inst_t uniConvBF16toF32_odd_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x02050004, 0x06070406, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_even_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x03050104, 0x07070506, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvBF16toF32_odd_2x8", &uniConvBF16toF32_odd_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvBF16toF32_even_2x8", &uniConvBF16toF32_even_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + else + { + VSILOGE("input or output's format is not support"); + status = VSI_FAILURE; + goto final; + } + + if (!is_use_2x_up_half_kernel) + { + status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value); + 
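/* half_pixel_value is 0.5f when half_pixel_centers is enabled and 0.0f otherwise (same convention as the nearest-neighbour kernel); it is not passed to the dedicated 2x-upscale half-pixel kernel variant. */ +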
CHECK_STATUS_FAIL_GOTO(status, final ); + } + + if (is_use_2x_up_half_kernel) + { + gpu_param.global_size[0] = gpu_align_p2((out_width + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = depth; + gpu_param.dim = 2; + } + else + { + gpu_param.global_size[0] = gpu_align_p2((out_width + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]; + gpu_param.global_size[2] = depth / gpu_param.global_scale[2]; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return status; +} /* _resize_bilinear_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool is_same_type, + vsi_bool is_evis2, + vsi_bool is_2x_up_half, + vsi_bool *is_run_opt_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _resize_bilinear_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_bilinear_kernel_map ); + vx_param_description_t * param_def = _resize_bilinear_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_bilinear_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_bilinear_initializer; + uint32_t key; + uint32_t i; + _internal_scale_e scale_flag = UP; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0]) + { + if (is_2x_up_half) + { + scale_flag = UP_2X_HALF; + } + else if (is_same_type && is_evis2) + { + scale_flag = UP_OPT; + } + else + { + scale_flag = UP; + } + } + else + { + scale_flag = DOWN; + } + + key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if ((UP_2X_HALF == scale_flag) && (i >= kernel_map_size)) + { + scale_flag = UP_OPT; + key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + } + + if ((UP_OPT == scale_flag) && (i >= kernel_map_size)) + { + scale_flag = UP; + key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + } + + if ((UP == scale_flag) && (i >= kernel_map_size)) + { + scale_flag = DOWN; + key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + } + + if( i < kernel_map_size ) + { + if (UP_OPT == scale_flag) + { + param_def_size = _RESIZE_BILINEAR_PARAM_NUM; + *is_run_opt_kernel = TRUE; + } + else + { + param_def_size = _RESIZE_NO_SCALE_PARAM_NUM; + *is_run_opt_kernel = FALSE; + } + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + 
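// (FMT_CODE registers vsi_nn_kernel_header plus the kernel source for run-time compilation; the FMT_EXECUTABLE registration below covers the matching prebuilt binary form, when available) +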
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; + +} /* _query_kernel() */ + +static vsi_nn_tensor_t* _create_scale_tensor + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_t *input, + vsi_nn_tensor_t *output, + int32_t align_corners, + int32_t half_pixel_centers + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* scale = NULL; + uint32_t dims = output->attr.dim_num; + uint32_t width = output->attr.size[0]; + uint32_t height = output->attr.size[1]; + uint32_t batch = dims > 3 ? output->attr.size[3] : 1; + uint32_t sizes[4] = {width * 4, height, 1, batch}; + uint32_t item_count = width * 4 * height * batch; + uint32_t input_width = input->attr.size[0]; + uint32_t input_height = input->attr.size[1]; + uint32_t x = 0; + uint32_t y = 0; + uint32_t b = 0; + float width_scale = 1.0f; + float height_scale = 1.0f; + uint16_t *scale_data_ptr = NULL; + + if (align_corners && width > 1) + { + width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(width - 1); + } + else + { + width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)width; + } + + if (align_corners && height > 1) + { + height_scale = ((vx_float32)(input_height - 1) * 1.0f) / (vx_float32)(height - 1); + } + else + { + height_scale = ((vx_float32)input_height * 1.0f) / (vx_float32)height; + } + + + scale_data_ptr = (uint16_t *)malloc(item_count * sizeof(uint16_t)); + if (scale_data_ptr == NULL) + { + VSILOGE("allocate memory fail at function %s line %d", __FUNCTION__, __LINE__); + goto OnError; + } + memset(scale_data_ptr, 0, item_count * sizeof(vx_uint16)); + + for (b = 0; b < batch; b ++) + { + for (y = 0; y < height; y ++) + { + float input_h = 0.0f; + int32_t h0 = 0; + if (half_pixel_centers) + { + input_h = ((vx_float32)y + 0.5f) * height_scale - 0.5f; + } + else + { + input_h = y * height_scale; + } + h0 = (int32_t)input_h; + for (x = 0; x < width; x ++) + { + float input_w = 0.0f; + int32_t w0 = 0; + uint32_t idx = b * width * 4 * height + y * width * 4 + x * 4; + float tl = 0.0f; + float tr = 0.0f; + float bl = 0.0f; + float br = 0.0f; + if (half_pixel_centers) + { + input_w = ((vx_float32)x + 0.5f) * width_scale - 0.5f; + } + else + { + input_w = x * width_scale; + } + w0 = (vx_int32)input_w; + tl = (1 - (input_h - h0)) * (1 - (input_w - w0)); + tr = (1 - (input_h - h0)) * (input_w - w0); + bl = (input_h - h0) * (1 - (input_w - w0)); + br = (input_h - h0) * (input_w - w0); + + scale_data_ptr[idx + 0] = fp32_to_fp16(tl); + scale_data_ptr[idx + 1] = fp32_to_fp16(tr); + scale_data_ptr[idx + 2] = fp32_to_fp16(bl); + scale_data_ptr[idx + 3] = fp32_to_fp16(br); + } + } + } + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + attr.size[0] = sizes[0]; + attr.size[1] = sizes[1]; + attr.size[2] = sizes[2]; + attr.size[3] = sizes[3]; + attr.dim_num = dims; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.vtl = FALSE; + + scale = vsi_nn_CreateTensorFromData(graph, (uint8_t *)scale_data_ptr, &attr); + if (scale_data_ptr) + { + free(scale_data_ptr); + scale_data_ptr = NULL; + } + +OnError: + return scale; +} + + + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = 
VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RESIZE_BILINEAR_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + vsi_bool is_same_type = vsi_nn_is_same_type(inputs[0], outputs[0]); + vsi_bool is_evis2 = (vsi_bool)(graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_2); + vsi_bool is_run_opt_kernel = FALSE; + vsi_bool is_2x_up_half = FALSE; + vsi_nn_tensor_t* scale = NULL; + + is_2x_up_half = is_same_type && (!align_corners) && (half_pixel_centers); + is_2x_up_half = is_2x_up_half && (2 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ + && (2 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); + status = _query_kernel( kernel, inputs, outputs, is_same_type, is_evis2, + is_2x_up_half, &is_run_opt_kernel); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + size_t node_params_num = _RESIZE_NO_SCALE_PARAM_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_BILINEAR_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + if (is_run_opt_kernel) + { + scale = _create_scale_tensor(graph, inputs[0], outputs[0], align_corners, half_pixel_centers); + node_params[SCALAR_TENSOR_SCALE] = (vsi_nn_kernel_node_param_t)(scale->t); + node_params_num = _RESIZE_BILINEAR_PARAM_NUM; + } + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + if (is_run_opt_kernel) + { + if (scale) + { + vsi_nn_ReleaseTensor(&scale); + } + } + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( resize_bilinear, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c new file mode 100644 index 0000000..9d17244 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c @@ -0,0 +1,541 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + LARGE = 0, + SMALL +} _internal_nearest_e; + +#define _RESIZE_NEAREST_KERNEL_SOURCE "resize_nearest" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE, mode ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (mode)) + + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE, LARGE ), \ + CVIVANTE_NAMESPACE("evis.resize_nearest_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _RESIZE_NEAREST_KERNEL_SOURCE } + +#define PACK_KERNEL_MAP_OPT( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE, SMALL ), \ + CVIVANTE_NAMESPACE("evis.resize_nearest_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_op"), \ + _RESIZE_NEAREST_KERNEL_SOURCE } + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_nearest_kernel_map[] = +{ + PACK_KERNEL_MAP(F16, F16), + PACK_KERNEL_MAP(I16, I16), + PACK_KERNEL_MAP(I8, I8), + PACK_KERNEL_MAP(U8, U8), + PACK_KERNEL_MAP_OPT(F16, F16), + PACK_KERNEL_MAP_OPT(I16, I16), + PACK_KERNEL_MAP_OPT(I8, I8), + PACK_KERNEL_MAP_OPT(U8, U8), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _resize_nearest_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RESIZE_NEAREST_PARAM_NUM _cnt_of_array( _resize_nearest_kernel_param_def ) + +#define SCALAR_ALIGN_CORNERS (2) +#define SCALAR_HALF_PIXEL (3) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_nearest_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ +#define MAX_POST_SHIFT_BITS (31) +#define MAX_MULTIPLIER_NUM (65535) + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_int_array_t * out_shape = NULL; + vsi_int_array_t * in_shape = NULL; + vsi_nn_kernel_dtype_e input_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; + int32_t align_corners; + int32_t half_pixel_centers; + uint32_t depth = 0; + int32_t srcFixPointPos = 0; + int32_t dstFixPointPos = 0; + float input_scale = 1.0; + int32_t inputZP = 0; + float output_scale = 1.0; + int32_t outputZP = 0; + float scale_factor[2]; + uint32_t in_width; + uint32_t in_height; + uint32_t out_width; + uint32_t out_height; + float half_pixel_value = 0.0f; + float round_value = 0.0f; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + 
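/* param[0]/param[1] are the input and output tensors packed by vsi_nn_kernel_node_pack_io(); param[2]/param[3] carry the align_corners and half_pixel_centers scalars. */ +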
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &align_corners); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &half_pixel_centers); + CHECK_STATUS_FAIL_GOTO(status, final ); + out_shape = output_attr->shape; + in_shape = input_attr->shape; + input_dtype = input_attr->dtype; + output_dtype = output_attr->dtype; + + in_width = in_shape->data[0]; + in_height = in_shape->data[1]; + depth = in_shape->data[2]; + out_width = out_shape->data[0]; + out_height = out_shape->data[1]; + + if (BF16 == input_dtype && output_dtype == BF16) + { + input_dtype = F16; + output_dtype = F16; + } + if (align_corners && out_width > 1) + { + scale_factor[0] = ((float)(in_width - 1) * 1.0f) / (float)(out_width - 1); + } + else + { + scale_factor[0] = ((float)in_width * 1.0f) / (float)out_width; + } + + if (align_corners && out_height > 1) + { + scale_factor[1] = ((float)(in_height - 1) * 1.0f) / (float)(out_height - 1); + } + else + { + scale_factor[1] = ((float)in_height * 1.0f) / (float)out_height; + } + + if (align_corners) + { + round_value = 0.5f; + } + else + { + round_value = 0.0f; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) + { + input_scale = input_attr->asymm.scale; + inputZP = input_attr->asymm.zero_point; + } + else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + { + srcFixPointPos = input_attr->dfp.fl; + if (srcFixPointPos >= 0) + { + input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos); + } + else if (srcFixPointPos < 0) + { + input_scale = (float)((int64_t)1 << -srcFixPointPos); + } + inputZP = 0; + } + else + { + input_scale = 1.0f; + inputZP = 0; + } + + if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) + { + output_scale = 1.0f / output_attr->asymm.scale; + outputZP = output_attr->asymm.zero_point; + } + else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) + { + dstFixPointPos = output_attr->dfp.fl; + if (dstFixPointPos >= 0) + { + output_scale = (float) ((int64_t)1 << dstFixPointPos); + } + else if (dstFixPointPos < 0) + { + output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos); + } + outputZP = 0; + } + else + { + output_scale = 1.0; + outputZP = 0; + } + + if (F16 == input_dtype && F16 == output_dtype) + { + gpu_dp_inst_t uniGetExtractData_2x8 = {{ + 0x00009999, // TCfg + 0x00000000, // ASelt + 0x06040200, 0x00000000, // ABin + 0x0000aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00100010, 0x00100010, 0x00100010, 0x00100010, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + if (scale_factor[0] < 4.0f) + { + status = vsi_nn_kernel_gpu_add_param( node, "uniGetExtractData_2x8", &uniGetExtractData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + status = vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if ( input_dtype == output_dtype && (I8 == input_dtype || I16 == input_dtype)) + { + gpu_dp_inst_t 
uniGetExtractData_2x8 = {{ + 0x00009999, // TCfg + 0x00000000, // ASelt + 0x06040200, 0x00000000, // ABin + 0x0000aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00080008, 0x00080008, 0x00080008, 0x00080008, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertI8toI8_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + if (I16 == input_dtype) + { + uniGetExtractData_2x8.data[8] = 0x00100010; + uniGetExtractData_2x8.data[9] = 0x00100010; + uniGetExtractData_2x8.data[10] = 0x00100010; + uniGetExtractData_2x8.data[11] = 0x00100010; + uniGetExtractData_2x8.data[12] = 0x00100010; + uniGetExtractData_2x8.data[13] = 0x00100010; + uniGetExtractData_2x8.data[14] = 0x00100010; + uniGetExtractData_2x8.data[15] = 0x00100010; + } + + if (srcFixPointPos > dstFixPointPos) + { + int32_t postshift = vsi_nn_min(srcFixPointPos - dstFixPointPos, MAX_POST_SHIFT_BITS); + + uniConvertI8toI8_2x8.data[7] |= (postshift & 0x1F); + } + else + { + uint32_t multiplier = vsi_nn_min((int64_t)1 << (dstFixPointPos - srcFixPointPos), MAX_MULTIPLIER_NUM); + uint32_t i = 0; + + for (i = 0; i < 8; i++) + { + uniConvertI8toI8_2x8.data[i + 8] = multiplier; + } + } + + if (scale_factor[0] < 4.0f) + { + status = vsi_nn_kernel_gpu_add_param( node, "uniGetExtractData_2x8", &uniGetExtractData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + status = vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertI8toI8_2x8", &uniConvertI8toI8_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (U8 == input_dtype && U8 == output_dtype) + { + uint16_t M0 = 0; + vx_int8 postShift = 0; + uint32_t multAndoutZP[2] = {0}; + gpu_dp_inst_t uniMultiplyAndPostShift_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniGetExtractData_2x8 = {{ + 0x00009999, // TCfg + 0x00000000, // ASelt + 0x06040200, 0x00000000, // ABin + 0x0000aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00080008, 0x00080008, 0x00080008, 0x00080008, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + vsi_nn_GetFP32MultiAndPostShift(input_scale * output_scale, &M0, &postShift); + + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = (uint32_t)((outputZP << postShift) - inputZP * M0); + + uniMultiplyAndPostShift_2x8.data[7] |= (postShift & 0x1F); + + if (scale_factor[0] < 4.0f) + { + status = vsi_nn_kernel_gpu_add_param( node, "uniGetExtractData_2x8", &uniGetExtractData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + + status = vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); + status |= 
vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP); + status |= vsi_nn_kernel_gpu_add_param( node, "uniMultiplyAndPostShift_2x8", &uniMultiplyAndPostShift_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value); + status |= vsi_nn_kernel_gpu_add_param( node, "round_value", &round_value); + CHECK_STATUS_FAIL_GOTO(status, final ); + + gpu_param.global_size[0] = gpu_align_p2((out_width + gpu_param.global_scale[0] - 1)\ + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]; + gpu_param.global_size[2] = depth; + + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +#undef MAX_MULTIPLIER_NUM +#undef MAX_POST_SHIFT_BITS +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return status; +} /* _resize_nearest_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t align_corners + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _resize_nearest_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_nearest_kernel_map ); + vx_param_description_t * param_def = _resize_nearest_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_nearest_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_nearest_initializer; + uint32_t key; + uint32_t i; + uint32_t inputWidth = inputs[0]->attr.size[0]; + uint32_t outputWidth = outputs[0]->attr.size[0]; + float scale_factor; + _internal_nearest_e resize_mode = LARGE; + + if (align_corners && outputWidth > 1) + { + scale_factor = (vx_float32)(inputWidth - 1) / (vx_float32)(outputWidth - 1); + } + else + { + scale_factor = (vx_float32)inputWidth / (vx_float32)outputWidth; + } + + if (scale_factor < 4.0f) + { + resize_mode = SMALL; + } + else + { + resize_mode = LARGE; + } + + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (BF16 == in_dtype && BF16 == out_dtype) + { + in_dtype = F16; + out_dtype = F16; + } + + key = RESIZE_NEAREST_HASH_KEY( in_dtype, out_dtype, resize_mode); + + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t 
node_params[_RESIZE_NEAREST_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + + status = _query_kernel( kernel, inputs, outputs, align_corners); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_NEAREST_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_NEAREST_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( resize_nearest, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c new file mode 100644 index 0000000..287fc73 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c @@ -0,0 +1,517 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
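+ * Two shader variants exist per dtype: the plain scatter_nd kernels and the _big ones, which are selected when any reshaped tensor's second dimension (element count / block_size) reaches VSI_NN_MAX_IMAGE_WIDTH (65536).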
+ */ +#define KERNEL_SOURCE_1 "scatter_nd" +#define KERNEL_SOURCE_2 "scatter_nd_big" + +#define HASH_SCATTER_ND_KEY(_input0_type, _output_type, _coord_dim, _reshape_type) \ + ((_input0_type << 24) | (_output_type << 16) | (_coord_dim << 8) | (_reshape_type)) + +#define HASH_SCATTER_ND_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_SCATTER_ND_SH_KERNEL_BIG_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_"#SRC0_TYPE"to"#DST_TYPE"_big") + +#define TENSOR_SCATTER_ND_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_KEY(IN1_TYPE, OUT_TYPE, 0, 0), \ + HASH_SCATTER_ND_SH_KERNEL_NAME(IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_BIG_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_KEY(IN1_TYPE, OUT_TYPE, 0, 1), \ + HASH_SCATTER_ND_SH_KERNEL_BIG_NAME(IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } scatter_nd_map[] = +{ + TENSOR_SCATTER_ND_KERNELS(I32, I8, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_KERNELS(I32, U8, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_KERNELS(I32, I16, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_KERNELS(I32, F16, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_BIG_KERNELS(I32, I8, I8, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_BIG_KERNELS(I32, U8, U8, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_BIG_KERNELS(I32, I16, I16, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_BIG_KERNELS(I32, F16, F16, KERNEL_SOURCE_2) +}; + +/* + * Kernel params + */ +static vx_param_description_t _scatter_nd_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _SCATTER_ND_PARAM_NUM _cnt_of_array( _scatter_nd_kernel_param_def ) + +static vsi_status get_scatter_nd_tensor_reshape_size + ( + vsi_nn_tensor_t ** inputs, + int32_t sizes[VSI_NN_MAX_DIM_NUM], + uint32_t block_size, + uint32_t coordDim, + uint32_t* width, + uint32_t* area, + int32_t* newDim, + int32_t* isBig + ) +{ + vsi_status status = VSI_FAILURE; + uint32_t dims_num = inputs[0]->attr.dim_num; + uint32_t *input_size = inputs[0]->attr.size; + uint32_t i = 0; + uint32_t elementCnt = 1; + + if(coordDim != 0 && (width == NULL || area == NULL)) + { + return status; + } + +#define VSI_NN_MAX_IMAGE_WIDTH (65536) + + newDim[0] = 0; + for(i = 0; i < dims_num; ++i) + { + elementCnt *= input_size[i]; + } + + for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + { + sizes[i] = 1; + } + + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + newDim[0] = 2; + + if((elementCnt / block_size) >= VSI_NN_MAX_IMAGE_WIDTH) + { + isBig[0] |= 1; + } + + if(coordDim == 1) // index shape + { + *width = 0; + *area = 0; + } + else if(coordDim == 2) + { + *width = input_size[dims_num - 2]; + *area = 0; + } + else if(coordDim == 3) + { + *width = input_size[dims_num - 3]; + *area = input_size[dims_num - 3] * input_size[dims_num - 2]; + } +#undef VSI_NN_MAX_IMAGE_WIDTH + + return VSI_SUCCESS; +} /* _get_EltOP_tensor_reshape_size */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_scatter_nd_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + 
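/* Initializer for the non-big scatter_nd kernel: the work size is derived from the reshaped output (8 elements per work-item along the block dimension, one output row per work-item in Y), and the coord_dim-dependent offsetX/Y/Z addressing constants are passed to the shader. */ +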
vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + int32_t block_size = 1; + int32_t height = 1; + int32_t index_num = 1; + int32_t output_zp = 0; + int32_t width = 0, area = 0; + int32_t coord_dim = 0; + int32_t offsetX = 0, offsetY = 0, offsetZ = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &width); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &area); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &coord_dim); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + block_size = attr[2]->shape->data[0]; + height = attr[2]->shape->data[1]; + index_num = attr[0]->shape->data[1]; + output_zp = attr[2]->asymm.zero_point; + + if(coord_dim == 3) + { + offsetX = area; + offsetY = width; + offsetZ = 1; + } + else if(coord_dim == 2) + { + offsetX = width; + offsetY = 1; + offsetZ = 0; + } + else if(coord_dim == 1) + { + offsetX = 1; + offsetY = 0; + offsetZ = 0; + } + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniAccumulateSum_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x33221100, 0x77665544, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniAccumulateSum_2x8", &uniAccumulateSum_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "index_num", &index_num ); + status |= vsi_nn_kernel_gpu_add_param( node, "zeropoint", &output_zp ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + return status; +} /* _scatter_nd_initializer() */ + +DEF_KERNEL_INITIALIZER(_scatter_nd_big_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + 
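/* attr[0] = indices, attr[1] = updates, attr[2] = output (see _scatter_nd_kernel_param_def). */ +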
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + int32_t block_size = 1; + int32_t height = 1; + int32_t index_num = 1; + int32_t output_zp = 0; + int32_t width = 0, area = 0; + int32_t coord_dim = 0; + int32_t offsetX = 0, offsetY = 0, offsetZ = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &width); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &area); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &coord_dim); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + block_size = attr[2]->shape->data[0]; + height = attr[2]->shape->data[1]; + index_num = attr[0]->shape->data[1]; + output_zp = attr[2]->asymm.zero_point; + + if(coord_dim == 3) + { + offsetX = area; + offsetY = width; + offsetZ = 1; + } + else if(coord_dim == 2) + { + offsetX = width; + offsetY = 1; + offsetZ = 0; + } + else if(coord_dim == 1) + { + offsetX = 1; + offsetY = 0; + offsetZ = 0; + } + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = block_size; + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniAccumulateSum_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x33221100, 0x77665544, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniAccumulateSum_2x8", &uniAccumulateSum_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "index_num", &index_num ); + status |= vsi_nn_kernel_gpu_add_param( node, "update_width", &block_size ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); + status |= vsi_nn_kernel_gpu_add_param( node, "zeropoint", &output_zp ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + return status; +} /* _scatter_nd_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + int32_t coord_dim, + int32_t isBig + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input1_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; 
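+ /* Kernel selection depends only on the updates/output dtypes and the isBig flag; coord_dim is passed to the shader as a scalar. */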
+ uint32_t key = 0; + int i = 0; + + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_SCATTER_ND_KEY( input1_dtype, output_dtype, 0, isBig ); + + for( i = 0; i < _cnt_of_array(scatter_nd_map); i ++ ) + { + if( scatter_nd_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(scatter_nd_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_map[i].function_name ); + kernel->info.parameters = _scatter_nd_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _scatter_nd_kernel_param_def ); + if(isBig) + { + kernel->info.initialize = _scatter_nd_big_initializer; + } + else + { + kernel->info.initialize = _scatter_nd_initializer; + } + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + scatter_nd_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_SCATTER_ND_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + uint32_t width = 0, area = 0; + int32_t big_flg = 0; + + status = get_scatter_nd_tensor_reshape_size(&inputs[0], shapes[0], coord_dim, 0, + NULL, NULL, &rs_idx_dim, &big_flg); + status |= get_scatter_nd_tensor_reshape_size(&inputs[1], shapes[1], block_size, 0, + NULL, NULL, &rs_in_dim, &big_flg); + status |= get_scatter_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim, + &width, &area, &rs_out_dim, &big_flg); + if(status != VSI_SUCCESS) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, coord_dim, big_flg); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + uint32_t index = 0; + /* Pass parameters to node. 
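Order: reshaped indices, updates and output tensors first, then the width / area / coord_dim scalars.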
*/ + tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], rs_in_dim ); + tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[1], rs_idx_dim ); + //tmp_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; + tmp_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _SCATTER_ND_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &tmp_params[0] ); + vsi_nn_kernel_tensor_release( &tmp_params[1] ); + vsi_nn_kernel_tensor_release( &tmp_params[2] ); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + vsi_nn_kernel_scalar_release( &tmp_params[5] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( scatter_nd, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/select_evis.c b/src/tim/vx/internal/src/kernel/evis/select_evis.c new file mode 100644 index 0000000..9e95f5d --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/select_evis.c @@ -0,0 +1,504 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +typedef enum _internal_img_dim_e +{ + IMAGE = 0, + IMAGE_2D, +} internal_img_dim_e; + +#define _SELECT_KERNEL_SOURCE "select" + +#define STR(a) #a + +// Add kernel hashtable here +#define SELECT_HASH_KEY(COND_DTYPE, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, _image_2d) \ + ((COND_DTYPE << 25) | (IN0_DTYPE << 18) | ( IN1_DTYPE << 11 ) | ( OUT_DTYPE << 4) | (_image_2d)) + +#define PACK_KERNEL_MAP(COND_DTYPE, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { SELECT_HASH_KEY(COND_DTYPE, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMAGE), \ + CVIVANTE_NAMESPACE("evis.select_"STR(COND_DTYPE)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ + _SELECT_KERNEL_SOURCE} + +#define PACK_KERNEL_MAP_2D(COND_DTYPE, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { SELECT_HASH_KEY(COND_DTYPE, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMAGE_2D), \ + CVIVANTE_NAMESPACE("evis.select_"STR(COND_DTYPE)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _SELECT_KERNEL_SOURCE} + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _select_kernel_map[] = +{ + PACK_KERNEL_MAP(I8, I8, I8, I8), + PACK_KERNEL_MAP(I8, U8, U8, U8), + PACK_KERNEL_MAP(I8, I16, I16, I16), + PACK_KERNEL_MAP(I8, F16, F16, F16), + PACK_KERNEL_MAP_2D(I8, I8, I8, I8), + PACK_KERNEL_MAP_2D(I8, U8, U8, U8), + PACK_KERNEL_MAP_2D(I8, I16, I16, I16), + PACK_KERNEL_MAP_2D(I8, F16, F16, F16), +}; + +/* + * Kernel params + */ +static vx_param_description_t _select_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _SELECT_PARAM_NUM _cnt_of_array( _select_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_select_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ +#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ + (( IN0_TYPE << 24) | ( IN1_TYPE << 16) | ( OUT_TYPE << 8)) +#define MAX_MULTIPLIER_NUM (65535) +#define MAX_POST_SHIFT_BITS (31) + vsi_status status = VX_SUCCESS; + // Alignment with a power of two value. 
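+ // Eight elements are processed per work-item along axis 0; the DP parameters set below rescale DFP or asymmetric-U8 inputs into the output tensor's quantization before selection.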
+ gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vx_tensor input0 = (vx_tensor)param[1]; + vx_tensor input1 = (vx_tensor)param[2]; + vx_tensor output = (vx_tensor)param[3]; + vsi_nn_kernel_tensor_attr_t *input0_attr = NULL; + vsi_nn_kernel_tensor_attr_t *input1_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_int_array_t *output_shape = NULL; + int32_t input0_fl = 0, input1_fl = 0, output_fl = 0; + float input0Scale = 1.0f; + int32_t input0Zp = 0; + float input1Scale = 1.0f; + int32_t input1Zp = 0; + float outputScale = 1.0f; + int32_t outputZP = 0; + uint16_t in0_M0 = 0; + int8_t in0_postShift = 0; + uint16_t in1_M0 = 0; + int8_t in1_postShift = 0; + uint32_t pack_key = 0; + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); + CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1); + CHECK_PTR_FAIL_GOTO( input1_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + if( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + input0_fl = input0_attr->dfp.fl; + if (input0_fl > 0) + { + input0Scale = 1.0f / (float) ((int64_t)1 << input0_fl); + } + else + { + input0Scale = (float)((int64_t)1 << -input0_fl); + } + } + else if( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + input0Scale = input0_attr->asymm.scale; + input0Zp = input0_attr->asymm.zero_point; + } + + if( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + input1_fl = input1_attr->dfp.fl; + if (input1_fl > 0) + { + input1Scale = 1.0f / (float) ((int64_t)1 << input1_fl); + } + else + { + input1Scale = (float)((int64_t)1 << -input1_fl); + } + } + else if( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + input1Scale = input1_attr->asymm.scale; + input1Zp = input1_attr->asymm.zero_point; + } + + if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + output_fl = output_attr->dfp.fl; + if (output_fl > 0) + { + outputScale = 1.0f / (float) ((int64_t)1 << output_fl); + } + else + { + outputScale = (float)((int64_t)1 << -output_fl); + } + } + else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + outputScale = output_attr->asymm.scale; + outputZP = output_attr->asymm.zero_point; + } + + vsi_nn_GetFP32MultiAndPostShift(input0Scale / outputScale, &in0_M0, &in0_postShift); + vsi_nn_GetFP32MultiAndPostShift(input1Scale / outputScale, &in1_M0, &in1_postShift); + + pack_key = _PACK_SELECT_KEY( input0_attr->dtype, input1_attr->dtype, output_attr->dtype ); + + output_shape = output_attr->shape; + gpu_param.dim = output_shape->size < 3 ? 2 : 3; + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + gpu_param.global_offset[2] = 0; + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = output_shape->size > 2 ? 
+ (output_shape->data[2] + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2] : 1; + + + switch( pack_key ) + { + case _PACK_SELECT_KEY( I8, I8, I8 ): + case _PACK_SELECT_KEY( I16, I16, I16 ): + { + gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvIntIn0toDst_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvIntIn1toDst_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + if (input0_fl >= output_fl) + { + uint8_t postshift = (uint8_t)gpu_min(input0_fl - output_fl, MAX_POST_SHIFT_BITS); + uniConvIntIn0toDst_2x8.data[7] = uniConvIntIn0toDst_2x8.data[7] | (postshift & 0x1F); + } + else + { + uint32_t idx = 0; + uint32_t multiplier = gpu_min((int64_t)1 << (output_fl - input0_fl), MAX_MULTIPLIER_NUM); + for (idx = 8; idx < 16; idx ++) + { + uniConvIntIn0toDst_2x8.data[idx] = (uint32_t)(multiplier << 16) | (multiplier & 0xffff); + } + } + + + if (input1_fl >= output_fl) + { + uint8_t postshift = (uint8_t)gpu_min(input1_fl - output_fl, MAX_POST_SHIFT_BITS); + uniConvIntIn1toDst_2x8.data[7] = uniConvIntIn1toDst_2x8.data[7] | (postshift & 0x1F); + } + else + { + uint32_t idx = 0; + uint32_t multiplier = gpu_min((int64_t)1 << (output_fl - input1_fl), MAX_MULTIPLIER_NUM); + for (idx = 8; idx < 16; idx ++) + { + uniConvIntIn1toDst_2x8.data[idx] = (uint32_t)(multiplier << 16) | (multiplier & 0xffff); + } + } + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvIntIn0toDst_2x8", &uniConvIntIn0toDst_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvIntIn1toDst_2x8", &uniConvIntIn1toDst_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvConditiontoDst_2x8", &uniConvConditiontoDst_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( F16, F16, F16 ): + { + gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvConditiontoDst_2x8", &uniConvConditiontoDst_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( U8, U8, U8 ): + { + uint32_t idx = 0; + gpu_dp_inst_t uniU8SubZP_MulM_PStoF16In0_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 
0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8SubZP_MulM_PStoF16In1_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8AddZP_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + + uniU8SubZP_MulM_PStoF16In0_2x8.data[7] |= (in0_postShift & 0x1F); + uniU8SubZP_MulM_PStoF16In1_2x8.data[7] |= (in1_postShift & 0x1F); + + for (idx = 8; idx < 16; idx ++) + { + uniU8SubZP_MulM_PStoF16In0_2x8.data[idx] = (vx_uint32)(in0_M0 << 16) | in0_M0; + uniU8SubZP_MulM_PStoF16In1_2x8.data[idx] = (vx_uint32)(in1_M0 << 16) | in1_M0; + } + + status = vsi_nn_kernel_gpu_add_param( node, + "uniU8SubZP_MulM_PStoF16In0_2x8", &uniU8SubZP_MulM_PStoF16In0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8SubZP_MulM_PStoF16In1_2x8", &uniU8SubZP_MulM_PStoF16In1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8AddZP_2x8", &uniU8AddZP_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input0Zp", &input0Zp ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input1Zp", &input1Zp ); + status |= vsi_nn_kernel_gpu_add_param( node, + "outputZP", &outputZP ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + if (input0_attr) + { + vsi_nn_kernel_tensor_attr_release(&input0_attr); + } + if (input1_attr) + { + vsi_nn_kernel_tensor_attr_release(&input1_attr); + } + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + +#undef _PACK_SELECT_KEY +#undef MAX_MULTIPLIER_NUM +#undef MAX_POST_SHIFT_BITS + return status; +} /* _select_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e cond_dtype; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _select_kernel_map; + size_t kernel_map_size = _cnt_of_array( _select_kernel_map ); + vx_param_description_t * param_def = _select_kernel_param_def; + size_t param_def_size = _cnt_of_array( _select_kernel_param_def ); + vx_kernel_initialize_f initializer = _select_initializer; + uint32_t key; + uint32_t i; + + cond_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in0_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + cond_dtype = (BOOL8 == cond_dtype || U8 == cond_dtype) ? I8 : cond_dtype; + in0_dtype = (BOOL8 == in0_dtype) ? I8 : in0_dtype; + in1_dtype = (BOOL8 == in1_dtype) ? I8 : in1_dtype; + out_dtype = (BOOL8 == out_dtype) ? 
I8 : out_dtype; + + key = SELECT_HASH_KEY(cond_dtype, in0_dtype, in1_dtype, out_dtype, image_2d); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SELECT_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2); + status = _query_kernel( kernel, inputs, outputs, image_2d); + + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SELECT_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SELECT_PARAM_NUM ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( select, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/swish_evis.c b/src/tim/vx/internal/src/kernel/evis/swish_evis.c new file mode 100644 index 0000000..7b1dbb5 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/swish_evis.c @@ -0,0 +1,701 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +typedef enum _internal_img_dim_e +{ + IMAGE = 0, + IMAGE_2D, +} internal_img_dim_e; + +#define _SWISH_KERNEL_SOURCE "swish", +#define _HSWISH_KERNEL_SOURCE "hswish", + + +#define STR(a) #a +// Add kernel hashtable here +#define SWISH_HASH_KEY(SWISH_TYPE, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((SWISH_TYPE << 20) | ( IN_DTYPE << 12 ) | ( OUT_DTYPE << 4) | (_image_2d)) + +#define SWISH_PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE) \ + { SWISH_HASH_KEY(VSI_NN_SWISH, IN_DTYPE, OUT_DTYPE, IMAGE), \ + CVIVANTE_NAMESPACE("evis.swish_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _SWISH_KERNEL_SOURCE } + +#define HSWISH_PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE) \ + { SWISH_HASH_KEY(VSI_NN_HSWISH, IN_DTYPE, OUT_DTYPE, IMAGE), \ + CVIVANTE_NAMESPACE("evis.hswish_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _HSWISH_KERNEL_SOURCE } + + +#define SWISH_PACK_KERNEL_MAP_2D( IN_DTYPE, OUT_DTYPE) \ + { SWISH_HASH_KEY(VSI_NN_SWISH, IN_DTYPE, OUT_DTYPE, IMAGE_2D), \ + CVIVANTE_NAMESPACE("evis.swish_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _SWISH_KERNEL_SOURCE } + +#define HSWISH_PACK_KERNEL_MAP_2D( IN_DTYPE, OUT_DTYPE) \ + { SWISH_HASH_KEY(VSI_NN_HSWISH, IN_DTYPE, OUT_DTYPE, IMAGE_2D), \ + CVIVANTE_NAMESPACE("evis.hswish_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _HSWISH_KERNEL_SOURCE } +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _swish_kernel_map[] = +{ + // Register kernel here + SWISH_PACK_KERNEL_MAP( F16, F16), + SWISH_PACK_KERNEL_MAP( F16, I16), + SWISH_PACK_KERNEL_MAP( F16, I8), + SWISH_PACK_KERNEL_MAP( F16, U8), + SWISH_PACK_KERNEL_MAP( I16, I16), + SWISH_PACK_KERNEL_MAP( I16, F16), + SWISH_PACK_KERNEL_MAP( I8, I8), + SWISH_PACK_KERNEL_MAP( I8, F16), + SWISH_PACK_KERNEL_MAP( U8, U8), + SWISH_PACK_KERNEL_MAP( U8, F16), + SWISH_PACK_KERNEL_MAP( BF16, BF16), + SWISH_PACK_KERNEL_MAP_2D( F16, F16), + SWISH_PACK_KERNEL_MAP_2D( F16, I16), + SWISH_PACK_KERNEL_MAP_2D( F16, I8), + SWISH_PACK_KERNEL_MAP_2D( F16, U8), + SWISH_PACK_KERNEL_MAP_2D( I16, I16), + SWISH_PACK_KERNEL_MAP_2D( I16, F16), + SWISH_PACK_KERNEL_MAP_2D( I8, I8), + SWISH_PACK_KERNEL_MAP_2D( I8, F16), + SWISH_PACK_KERNEL_MAP_2D( U8, U8), + SWISH_PACK_KERNEL_MAP_2D( U8, F16), + SWISH_PACK_KERNEL_MAP_2D( BF16, BF16), + HSWISH_PACK_KERNEL_MAP( F16, F16), + HSWISH_PACK_KERNEL_MAP( F16, I16), + HSWISH_PACK_KERNEL_MAP( F16, I8), + HSWISH_PACK_KERNEL_MAP( F16, U8), + HSWISH_PACK_KERNEL_MAP( I16, I16), + HSWISH_PACK_KERNEL_MAP( I16, F16), + HSWISH_PACK_KERNEL_MAP( I8, I8), + HSWISH_PACK_KERNEL_MAP( I8, F16), + HSWISH_PACK_KERNEL_MAP( U8, U8), + HSWISH_PACK_KERNEL_MAP( U8, F16), + HSWISH_PACK_KERNEL_MAP( BF16, BF16), + HSWISH_PACK_KERNEL_MAP_2D( F16, F16), + HSWISH_PACK_KERNEL_MAP_2D( F16, I16), + HSWISH_PACK_KERNEL_MAP_2D( F16, I8), + HSWISH_PACK_KERNEL_MAP_2D( F16, U8), + HSWISH_PACK_KERNEL_MAP_2D( I16, I16), + HSWISH_PACK_KERNEL_MAP_2D( I16, F16), + HSWISH_PACK_KERNEL_MAP_2D( I8, I8), + HSWISH_PACK_KERNEL_MAP_2D( I8, F16), + HSWISH_PACK_KERNEL_MAP_2D( U8, U8), + HSWISH_PACK_KERNEL_MAP_2D( 
U8, F16), + HSWISH_PACK_KERNEL_MAP_2D( BF16, BF16), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _swish_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _SWISH_PARAM_NUM _cnt_of_array( _swish_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_swish_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + // Alignment with a power of two value. + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vx_tensor input = (vx_tensor)param[0]; + vx_tensor output = (vx_tensor)param[1]; + int8_t srcFixPointPos = 0; + int8_t dstFixPointPos = 0; + vx_float32 inputTail = 0; + vx_float32 inputScale = 1.0f; + vx_float32 outputZP = 0; + vx_float32 outputScale = 1.0f; + vx_float32 logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); + vsi_nn_kernel_tensor_attr_t *input_attr = NULL, *output_attr = NULL; + vsi_int_array_t *out_shape = NULL; + uint32_t pack_key = 0; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); + CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + out_shape = output_attr->shape; + + if (input_attr->quant == VSI_NN_KERNEL_QUANT_DFP) + { + srcFixPointPos = (int8_t)input_attr->dfp.fl; + if (srcFixPointPos > 0) + { + inputScale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); + } + else + { + inputScale = (vx_float32)((int64_t)1 << -srcFixPointPos); + } + } + else if (input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || input_attr->quant == VSI_NN_KERNEL_QUANT_SYMM) + { + inputScale = input_attr->asymm.scale; + inputTail = 0 - input_attr->asymm.zero_point * inputScale; + } + + if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP) + { + dstFixPointPos = (int8_t)output_attr->dfp.fl; + if (dstFixPointPos > 0) + { + outputScale = (vx_float32) ((int64_t)1 << dstFixPointPos); + } + else + { + outputScale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos); + } + } + else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || output_attr->quant == VSI_NN_KERNEL_QUANT_SYMM) + { + outputScale = 1.0f / output_attr->asymm.scale; + outputZP = (vx_float32)(output_attr->asymm.zero_point); + } +#define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \ + (IN_TYPE | ( OUT_TYPE << 16)) + + pack_key = _PACK_SELECT_KEY(input_attr->dtype, output_attr->dtype ); + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = out_shape->size < 3 ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + { + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDatatoFp32Part0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDatatoFp32Part1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + switch (pack_key) + { + case _PACK_SELECT_KEY(BF16, BF16): + { + status = vsi_nn_kernel_gpu_add_param(node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default : + { + if (F16 == output_attr->dtype) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "inputScale", &inputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "inputTail", &inputTail); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale); + status |= 
vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDatatoFp32Part0_4x4", &uniDatatoFp32Part0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDatatoFp32Part1_4x4", &uniDatatoFp32Part1_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + } + + status = vsi_nn_kernel_gpu_add_param(node, "logE", &logE); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); + +#undef _PACK_SELECT_KEY +final: + if (input_attr) + { + vsi_nn_kernel_tensor_attr_release(&input_attr); + } + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + + return status; +} /* _swish_initializer() */ + +DEF_KERNEL_INITIALIZER(_hswish_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + // Alignment with a power of two value. + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vx_tensor input = (vx_tensor)param[0]; + vx_tensor output = (vx_tensor)param[1]; + int8_t srcFixPointPos = 0; + int8_t dstFixPointPos = 0; + vx_float32 inputTail = 0; + vx_float32 inputScale = 1.0f; + vx_float32 outputZP = 0; + vx_float32 outputScale = 1.0f; + vsi_nn_kernel_tensor_attr_t *input_attr = NULL, *output_attr = NULL; + vsi_int_array_t *out_shape = NULL; + uint32_t pack_key = 0; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); + CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + out_shape = output_attr->shape; + + if (input_attr->quant == VSI_NN_KERNEL_QUANT_DFP) + { + srcFixPointPos = (int8_t)input_attr->dfp.fl; + if (srcFixPointPos > 0) + { + inputScale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); + } + else + { + inputScale = (vx_float32)((int64_t)1 << -srcFixPointPos); + } + } + else if (input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || input_attr->quant == VSI_NN_KERNEL_QUANT_SYMM) + { + inputScale = input_attr->asymm.scale; + inputTail = 0 - input_attr->asymm.zero_point * inputScale; + } + + if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP) + { + dstFixPointPos = (int8_t)output_attr->dfp.fl; + if (dstFixPointPos > 0) + { + outputScale = (vx_float32) ((int64_t)1 << dstFixPointPos); + } + else + { + outputScale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos); + } + } + else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || output_attr->quant == VSI_NN_KERNEL_QUANT_SYMM) + { + outputScale = 1.0f / output_attr->asymm.scale; + outputZP = (vx_float32)(output_attr->asymm.zero_point); + } +#define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \ + (IN_TYPE | ( OUT_TYPE << 16)) + + pack_key = _PACK_SELECT_KEY(input_attr->dtype, output_attr->dtype ); + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = out_shape->size < 3 ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + { + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDatatoFp32Part0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDatatoFp32Part1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + switch (pack_key) + { + case _PACK_SELECT_KEY(BF16, BF16): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default : + { + if (F16 == output_attr->dtype) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "inputScale", &inputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "inputTail", &inputTail); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale); + status |= 
vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDatatoFp32Part0_4x4", &uniDatatoFp32Part0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDatatoFp32Part1_4x4", &uniDatatoFp32Part1_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + } + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); + +#undef _PACK_SELECT_KEY +final: + if (input_attr) + { + vsi_nn_kernel_tensor_attr_release(&input_attr); + } + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + return status; +} /* _hswish_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d, + vsi_nn_swish_type swish_type + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _swish_kernel_map; + size_t kernel_map_size = _cnt_of_array( _swish_kernel_map ); + vx_param_description_t * param_def = _swish_kernel_param_def; + size_t param_def_size = _cnt_of_array( _swish_kernel_param_def ); + vx_kernel_initialize_f initializer = _swish_initializer; + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = SWISH_HASH_KEY(swish_type, in_dtype, out_dtype, image_2d); + + if (VSI_NN_HSWISH == swish_type) + { + initializer = _hswish_initializer; + } + else + { + initializer = _swish_initializer; + } + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SWISH_PARAM_NUM] = {NULL}; + int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + uint32_t new_rank = 0; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + int32_t swish_type = vsi_nn_kernel_param_get_int32( params, "type" ); + float beta = 1.0f; +#if (VX_ACTIVATION_EXT_SUPPORT) + if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) + { + return NULL; + } +#endif + vsi_nn_OptimizedEltOPShape(inputs[0], (uint32_t *)(shapes[0]), &new_rank); + vsi_nn_OptimizedEltOPShape(outputs[0], (uint32_t *)(shapes[1]), &new_rank); + + if( !vsi_nn_kernel_gpu_check_shape( shapes[0], new_rank ) ) + { + return NULL; + } + + image_2d = (new_rank == 2); + + if (VSI_NN_HSWISH == (vsi_nn_swish_type)swish_type) + { + beta = 1.0f / 6.0f; + } + else + { + beta = vsi_nn_kernel_param_get_float32( params, "beta" ); + } 
+ + status = _query_kernel( kernel, inputs, outputs, image_2d, (vsi_nn_swish_type)swish_type); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + node_params[0] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], new_rank ); + node_params[1] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], new_rank ); + node_params[2] = vsi_nn_kernel_scalar_create( graph, F32, &beta ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SWISH_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_tensor_release( &node_params[0] ); + vsi_nn_kernel_tensor_release( &node_params[1] ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( swish, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/tile_evis.c b/src/tim/vx/internal/src/kernel/evis/tile_evis.c new file mode 100644 index 0000000..7fa9215 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/tile_evis.c @@ -0,0 +1,557 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define HASH_TILE_KEY(_input_type, _output_type, _image_2d, _remainder) \ + ((_input_type << 18) | (_output_type << 4) | (_image_2d << 3) | (_remainder)) + +#define KERNEL_SOURCE "tile", +#define KERNEL_SOURCE1 "tile_mix", + +#define STR(a) #a + +#define HASH_TILE_SH_KERNEL_NAME(SRC_TYPE, DST_TYPE, REMAINDER) \ + CVIVANTE_NAMESPACE("evis.tile_remain"STR(REMAINDER)"_"#SRC_TYPE"to"#DST_TYPE) + +#define TENSOR_TILE_KERNELS(SRC_TYPE, OUT_TYPE, REMAINDER) \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 0, REMAINDER), \ + HASH_TILE_SH_KERNEL_NAME(SRC_TYPE, OUT_TYPE, REMAINDER), \ + KERNEL_SOURCE1 }, + +#define HASH_TILE_SH_KERNEL_2D_NAME(SRC_TYPE, DST_TYPE, REMAINDER) \ + CVIVANTE_NAMESPACE("evis.tile_remain"STR(REMAINDER)"_"#SRC_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_TILE_KERNELS_2D(SRC_TYPE, OUT_TYPE, REMAINDER) \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, REMAINDER), \ + HASH_TILE_SH_KERNEL_2D_NAME(SRC_TYPE, OUT_TYPE, REMAINDER), \ + KERNEL_SOURCE1 }, + +#define TENSOR_TILE_8BITS_KERNELS(SRC_TYPE, OUT_TYPE, REMAINDER) \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 0, REMAINDER), \ + HASH_TILE_SH_KERNEL_NAME(U8, U8, REMAINDER), \ + KERNEL_SOURCE }, + +#define TENSOR_TILE_16BITS_KERNELS(SRC_TYPE, OUT_TYPE, REMAINDER) \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 0, REMAINDER), \ + HASH_TILE_SH_KERNEL_NAME(I16, I16, REMAINDER), \ + KERNEL_SOURCE }, + + #define TENSOR_TILE_8BITS_2D_KERNELS(SRC_TYPE, OUT_TYPE, REMAINDER) \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, REMAINDER), \ + HASH_TILE_SH_KERNEL_2D_NAME(U8, U8, REMAINDER), \ + KERNEL_SOURCE }, + + #define TENSOR_TILE_16BITS_2D_KERNELS(SRC_TYPE, OUT_TYPE, REMAINDER) \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, REMAINDER), \ + HASH_TILE_SH_KERNEL_2D_NAME(I16, I16, REMAINDER), \ + KERNEL_SOURCE }, + + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } _tile_evis_kernel_map[] = +{ + TENSOR_TILE_8BITS_KERNELS( I8, I8, 0) + TENSOR_TILE_8BITS_KERNELS( I8, I8, 1) + TENSOR_TILE_8BITS_KERNELS( I8, I8, 2) + TENSOR_TILE_8BITS_KERNELS( I8, I8, 3) + TENSOR_TILE_8BITS_KERNELS( I8, I8, 4) + TENSOR_TILE_8BITS_KERNELS( I8, I8, 5) + TENSOR_TILE_8BITS_KERNELS( I8, I8, 6) + TENSOR_TILE_8BITS_KERNELS( I8, I8, 7) + TENSOR_TILE_8BITS_KERNELS( U8, U8, 0) + TENSOR_TILE_8BITS_KERNELS( U8, U8, 1) + TENSOR_TILE_8BITS_KERNELS( U8, U8, 2) + TENSOR_TILE_8BITS_KERNELS( U8, U8, 3) + TENSOR_TILE_8BITS_KERNELS( U8, U8, 4) + TENSOR_TILE_8BITS_KERNELS( U8, U8, 5) + TENSOR_TILE_8BITS_KERNELS( U8, U8, 6) + TENSOR_TILE_8BITS_KERNELS( U8, U8, 7) + + TENSOR_TILE_16BITS_KERNELS( I16, I16, 0) + TENSOR_TILE_16BITS_KERNELS( I16, I16, 1) + TENSOR_TILE_16BITS_KERNELS( I16, I16, 2) + TENSOR_TILE_16BITS_KERNELS( I16, I16, 3) + TENSOR_TILE_16BITS_KERNELS( I16, I16, 4) + TENSOR_TILE_16BITS_KERNELS( I16, I16, 5) + TENSOR_TILE_16BITS_KERNELS( I16, I16, 6) + TENSOR_TILE_16BITS_KERNELS( I16, I16, 7) + TENSOR_TILE_16BITS_KERNELS( F16, F16, 0) + TENSOR_TILE_16BITS_KERNELS( F16, F16, 1) + TENSOR_TILE_16BITS_KERNELS( F16, F16, 2) + TENSOR_TILE_16BITS_KERNELS( F16, F16, 3) + TENSOR_TILE_16BITS_KERNELS( F16, F16, 4) + TENSOR_TILE_16BITS_KERNELS( F16, F16, 5) + TENSOR_TILE_16BITS_KERNELS( F16, F16, 6) + TENSOR_TILE_16BITS_KERNELS( F16, F16, 7) + TENSOR_TILE_16BITS_KERNELS( BF16, BF16, 0) + TENSOR_TILE_16BITS_KERNELS( BF16, BF16, 1) + TENSOR_TILE_16BITS_KERNELS( BF16, BF16, 2) + TENSOR_TILE_16BITS_KERNELS( BF16, BF16, 3) + TENSOR_TILE_16BITS_KERNELS( BF16, BF16, 4) + TENSOR_TILE_16BITS_KERNELS( BF16, BF16, 5) + TENSOR_TILE_16BITS_KERNELS( BF16, BF16, 6) + TENSOR_TILE_16BITS_KERNELS( 
BF16, BF16, 7) + + TENSOR_TILE_8BITS_2D_KERNELS( I8, I8, 0) + TENSOR_TILE_8BITS_2D_KERNELS( I8, I8, 1) + TENSOR_TILE_8BITS_2D_KERNELS( I8, I8, 2) + TENSOR_TILE_8BITS_2D_KERNELS( I8, I8, 3) + TENSOR_TILE_8BITS_2D_KERNELS( I8, I8, 4) + TENSOR_TILE_8BITS_2D_KERNELS( I8, I8, 5) + TENSOR_TILE_8BITS_2D_KERNELS( I8, I8, 6) + TENSOR_TILE_8BITS_2D_KERNELS( I8, I8, 7) + TENSOR_TILE_8BITS_2D_KERNELS( U8, U8, 0) + TENSOR_TILE_8BITS_2D_KERNELS( U8, U8, 1) + TENSOR_TILE_8BITS_2D_KERNELS( U8, U8, 2) + TENSOR_TILE_8BITS_2D_KERNELS( U8, U8, 3) + TENSOR_TILE_8BITS_2D_KERNELS( U8, U8, 4) + TENSOR_TILE_8BITS_2D_KERNELS( U8, U8, 5) + TENSOR_TILE_8BITS_2D_KERNELS( U8, U8, 6) + TENSOR_TILE_8BITS_2D_KERNELS( U8, U8, 7) + + TENSOR_TILE_16BITS_2D_KERNELS( I16, I16, 0) + TENSOR_TILE_16BITS_2D_KERNELS( I16, I16, 1) + TENSOR_TILE_16BITS_2D_KERNELS( I16, I16, 2) + TENSOR_TILE_16BITS_2D_KERNELS( I16, I16, 3) + TENSOR_TILE_16BITS_2D_KERNELS( I16, I16, 4) + TENSOR_TILE_16BITS_2D_KERNELS( I16, I16, 5) + TENSOR_TILE_16BITS_2D_KERNELS( I16, I16, 6) + TENSOR_TILE_16BITS_2D_KERNELS( I16, I16, 7) + TENSOR_TILE_16BITS_2D_KERNELS( F16, F16, 0) + TENSOR_TILE_16BITS_2D_KERNELS( F16, F16, 1) + TENSOR_TILE_16BITS_2D_KERNELS( F16, F16, 2) + TENSOR_TILE_16BITS_2D_KERNELS( F16, F16, 3) + TENSOR_TILE_16BITS_2D_KERNELS( F16, F16, 4) + TENSOR_TILE_16BITS_2D_KERNELS( F16, F16, 5) + TENSOR_TILE_16BITS_2D_KERNELS( F16, F16, 6) + TENSOR_TILE_16BITS_2D_KERNELS( F16, F16, 7) + TENSOR_TILE_16BITS_2D_KERNELS( BF16, BF16, 0) + TENSOR_TILE_16BITS_2D_KERNELS( BF16, BF16, 1) + TENSOR_TILE_16BITS_2D_KERNELS( BF16, BF16, 2) + TENSOR_TILE_16BITS_2D_KERNELS( BF16, BF16, 3) + TENSOR_TILE_16BITS_2D_KERNELS( BF16, BF16, 4) + TENSOR_TILE_16BITS_2D_KERNELS( BF16, BF16, 5) + TENSOR_TILE_16BITS_2D_KERNELS( BF16, BF16, 6) + TENSOR_TILE_16BITS_2D_KERNELS( BF16, BF16, 7) + + TENSOR_TILE_KERNELS( U8, F16, 0) + TENSOR_TILE_KERNELS( U8, F16, 1) + TENSOR_TILE_KERNELS( U8, F16, 2) + TENSOR_TILE_KERNELS( U8, F16, 3) + TENSOR_TILE_KERNELS( U8, F16, 4) + TENSOR_TILE_KERNELS( U8, F16, 5) + TENSOR_TILE_KERNELS( U8, F16, 6) + TENSOR_TILE_KERNELS( U8, F16, 7) + + TENSOR_TILE_KERNELS_2D( U8, F16, 0) + TENSOR_TILE_KERNELS_2D( U8, F16, 1) + TENSOR_TILE_KERNELS_2D( U8, F16, 2) + TENSOR_TILE_KERNELS_2D( U8, F16, 3) + TENSOR_TILE_KERNELS_2D( U8, F16, 4) + TENSOR_TILE_KERNELS_2D( U8, F16, 5) + TENSOR_TILE_KERNELS_2D( U8, F16, 6) + TENSOR_TILE_KERNELS_2D( U8, F16, 7) +}; + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) + +#define SCALAR_INPUT_BATCH_IN (2) +#define SCALAR_INPUT_DEPTH_IN (3) +#define SCALAR_INPUT_DEPTH_OUT (4) +#define SCALAR_INPUT_MULTIPLES_0 (5) +#define SCALAR_INPUT_MULTIPLES_1 (6) +#define SCALAR_INPUT_MULTIPLES_2 (7) +#define SCALAR_INPUT_MULTIPLES_3 (8) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_tile_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status 
status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; + vsi_int_array_t * in_shape = NULL; + uint32_t pack_key; + int32_t lastWorkItem = 0; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + int32_t output_ZP = 0; + int32_t input_ZP = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + in_shape = attr[0]->shape; + + if (VSI_NN_KERNEL_QUANT_DFP == attr[0]->quant) + { + if (attr[0]->dfp.fl > 0) + { + scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + input_ZP = 0; + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[0]->quant) + { + input_ZP = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + + if (VSI_NN_KERNEL_QUANT_DFP == attr[1]->quant) + { + if (attr[1]->dfp.fl > 0) + { + scaleOut = (float)((int64_t)1 << attr[1]->dfp.fl); + } + else + { + scaleOut = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); + } + output_ZP = 0; + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[1]->quant) + { + output_ZP = attr[1]->asymm.zero_point; + scaleOut = attr[1]->asymm.scale; + } + +#define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \ + (( IN_TYPE << 16) | ( OUT_TYPE)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype ); + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = (in_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]; + gpu_param.global_size[1] = in_shape->data[1]; + gpu_param.global_size[2] = in_shape->size > 2 ? 
in_shape->data[2] : 1; + + lastWorkItem = ((int32_t)gpu_param.global_size[0] - 1) * ((int32_t)gpu_param.global_scale[0]); + + { + float uint8Scale = scaleIn / scaleOut; + uint16_t M0 = 0; + int8_t postShift = 0; + uint32_t multAndoutZP[2] = {0}; + + gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + vsi_nn_GetFP32MultiAndPostShift(uint8Scale, &M0, &postShift); + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = (uint32_t)((output_ZP << postShift) - input_ZP * M0); + + uniU8MulAndPostShift_Lo_2x8.data[7] |= (postShift & 0x1F); + + switch( pack_key ) + { + case _PACK_SELECT_KEY(U8, U8 ): + case _PACK_SELECT_KEY(I8, I8 ): + case _PACK_SELECT_KEY(I16, I16 ): + case _PACK_SELECT_KEY(F16, F16 ): + case _PACK_SELECT_KEY(BF16, BF16 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "lastWorkItem", &lastWorkItem ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY(U8, F16 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "lastWorkItem", &lastWorkItem ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + } + +#undef _PACK_SELECT_KEY + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _tile_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_bool image_2d, + vx_uint32 remainder, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + int i; + + input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_TILE_KEY( input_dtype, output_dtype, image_2d, remainder); + + for( i = 0; i < _cnt_of_array(_tile_evis_kernel_map); i ++ ) + { + if( _tile_evis_kernel_map[i].key == key ) + { + break; + } + } + if( i < _cnt_of_array(_tile_evis_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _tile_evis_kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _tile_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + _tile_evis_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _tile_evis_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_bool _is_supported_axis(int32_t* multiples, uint32_t multiples_num) +{ + uint32_t i = 0; + + if ( multiples_num < 4) + { + return TRUE; + } + else if ( multiples_num > 4) + { + return FALSE; + } + + for ( i = 3; i < multiples_num; i++) + { + if (multiples[i] > 1) + { + return FALSE; + } + } + + 
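+    /* Reaching this point means multiples_num == 4 and the 4th dimension is
+     * not tiled (its multiple is 1), so the kernels can still handle it. */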
return TRUE; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL}; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_node_t node = NULL; + vx_uint32 remainder = inputs[0]->attr.size[0] % 8; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + uint32_t i = 0; + uint32_t new_rank = 0; + vsi_bool ret = FALSE; + uint32_t dim = inputs[0]->attr.dim_num; + int32_t multiples[VSI_NN_MAX_DIM_NUM] = { 0 }; + + for ( i = 0; i < dim; i++) + { + multiples[i] = outputs[0]->attr.size[i] / inputs[0]->attr.size[i]; + } + + ret = vsi_nn_kernel_optimize_tile_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + (int32_t *)multiples, inputs[0]->attr.dim_num, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if (ret) + { + if ( _is_supported_axis(shapes[1], new_rank) == FALSE) + { + return NULL; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes[2], new_rank ); + } + else + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[1]->attr.size, + outputs[0]->attr.dim_num )) + { + goto final; + } + + remainder = reshape_tensors[0]->attr.size[0] % 8; + image_2d = (reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1); + status = _query_kernel( &reshape_tensors[0], &reshape_tensors[1], image_2d, remainder, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Pass parameters to node. */ + uint32_t depthIn = new_rank > 2 ? reshape_tensors[0]->attr.size[2] : 1; + uint32_t depthOut = new_rank > 2 ? reshape_tensors[1]->attr.size[2] : 1; + uint32_t batchIn = new_rank > 3 ? 
reshape_tensors[0]->attr.size[3] : 1; + + + vsi_nn_kernel_node_pack_io( node_params, _EVIS_PARAM_NUM, + &reshape_tensors[0], 1, &reshape_tensors[1], 1 ); + + node_params[SCALAR_INPUT_BATCH_IN] = vsi_nn_kernel_scalar_create( + graph, I32, &batchIn ); + node_params[SCALAR_INPUT_DEPTH_IN] = vsi_nn_kernel_scalar_create( + graph, I32, &depthIn ); + node_params[SCALAR_INPUT_DEPTH_OUT] = vsi_nn_kernel_scalar_create( + graph, I32, &depthOut ); + node_params[SCALAR_INPUT_MULTIPLES_0] = vsi_nn_kernel_scalar_create( + graph, I32, &shapes[1][0] ); + node_params[SCALAR_INPUT_MULTIPLES_1] = vsi_nn_kernel_scalar_create( + graph, I32, &shapes[1][1] ); + node_params[SCALAR_INPUT_MULTIPLES_2] = vsi_nn_kernel_scalar_create( + graph, I32, &shapes[1][2] ); + node_params[SCALAR_INPUT_MULTIPLES_3] = vsi_nn_kernel_scalar_create( + graph, I32, &shapes[1][3] ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, _EVIS_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_BATCH_IN] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_DEPTH_IN] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_DEPTH_OUT] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_0] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_1] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_2] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_3] ); + } + } + +final: + if (reshape_tensors[0] != inputs[0]) + { + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + } + + if (reshape_tensors[1] != outputs[0]) + { + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + } + + return node; +} /* _setup() */ + +REGISTER_BACKEND_EVIS( tile, _setup ) + +__END_DECLS diff --git a/src/tim/vx/internal/src/kernel/evis/upsample_evis.c b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c new file mode 100644 index 0000000..1f96b93 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c @@ -0,0 +1,906 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_UPSAMPLE, +} _internal_kernel_e; + +#define _UPSAMPLE_KERNEL_SOURCE(suffix) "upsample_"#suffix + +// Add kernel hashtable here +#define UPSAMPLE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, same_type_flag, _image_2d ) \ + ((IN0_DTYPE << 20) | (IN1_DTYPE << 12) | (OUT_DTYPE << 4) | (same_type_flag << 2) | (_image_2d)) + +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { UPSAMPLE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 0 ), \ + CVIVANTE_NAMESPACE("evis.upsample_"#IN0_DTYPE"_"#IN1_DTYPE"to_"#OUT_DTYPE), \ + _UPSAMPLE_KERNEL_SOURCE(IN0_DTYPE) } + +#define PACK_KERNEL_MAP_SAME_TYPE( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { UPSAMPLE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 0 ), \ + CVIVANTE_NAMESPACE("evis.upsample_"#IN0_DTYPE"_"#IN1_DTYPE"to_"#OUT_DTYPE"_SAME"), \ + _UPSAMPLE_KERNEL_SOURCE(IN0_DTYPE) } + +#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { UPSAMPLE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 1 ), \ + CVIVANTE_NAMESPACE("evis.upsample_"#IN0_DTYPE"_"#IN1_DTYPE"to_"#OUT_DTYPE"_2D"), \ + _UPSAMPLE_KERNEL_SOURCE(IN0_DTYPE) } + +#define PACK_KERNEL_MAP_SAME_TYPE_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { UPSAMPLE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 1 ), \ + CVIVANTE_NAMESPACE("evis.upsample_"#IN0_DTYPE"_"#IN1_DTYPE"to_"#OUT_DTYPE"_SAME_2D"), \ + _UPSAMPLE_KERNEL_SOURCE(IN0_DTYPE) } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _upsample_kernel_map[] = +{ + PACK_KERNEL_MAP( I16, U8, I16), + PACK_KERNEL_MAP( I16, I16, I16), + PACK_KERNEL_MAP( I16, U8, F16), + PACK_KERNEL_MAP( I16, I16, F16), + PACK_KERNEL_MAP( I8, U8, I8), + PACK_KERNEL_MAP( I8, U8, F16), + PACK_KERNEL_MAP( U8, U8, U8), + PACK_KERNEL_MAP( U8, U8, F16), + PACK_KERNEL_MAP( F16, U8, U8), + PACK_KERNEL_MAP( F16, I16, U8), + PACK_KERNEL_MAP( F16, U8, I16), + PACK_KERNEL_MAP( F16, U8, I8), + PACK_KERNEL_MAP_SAME_TYPE( I16, U8, I16), + PACK_KERNEL_MAP_SAME_TYPE( I8, U8, I8), + PACK_KERNEL_MAP_SAME_TYPE( U8, U8, U8), + PACK_KERNEL_MAP_2D( I16, U8, I16), + PACK_KERNEL_MAP_2D( I16, I16, I16), + PACK_KERNEL_MAP_2D( I16, U8, F16), + PACK_KERNEL_MAP_2D( I16, I16, F16), + PACK_KERNEL_MAP_2D( I8, U8, I8), + PACK_KERNEL_MAP_2D( I8, U8, F16), + PACK_KERNEL_MAP_2D( U8, U8, U8), + PACK_KERNEL_MAP_2D( U8, U8, F16), + PACK_KERNEL_MAP_2D( F16, U8, U8), + PACK_KERNEL_MAP_2D( F16, I16, U8), + PACK_KERNEL_MAP_2D( F16, U8, I16), + PACK_KERNEL_MAP_2D( F16, U8, I8), + PACK_KERNEL_MAP_SAME_TYPE_2D( I16, U8, I16), + PACK_KERNEL_MAP_SAME_TYPE_2D( I8, U8, I8), + PACK_KERNEL_MAP_SAME_TYPE_2D( U8, U8, U8), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _upsample_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _UPSAMPLE_PARAM_NUM _cnt_of_array( _upsample_kernel_param_def ) + +/* + * Kernel initializer + */ 
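+/* The initializer derives the requantization constants from the tensor
+ * attributes before configuring the GPU work sizes:
+ *   - DFP tensors: scale = 2^-fl for fl > 0, otherwise 2^|fl|
+ *     (e.g. fl = 7 gives an input scale of 1/128).
+ *   - Asymmetric tensors: scale and zero point are taken as-is.
+ * vsi_nn_GetFP32MultiAndPostShift() then factors inputScale / outputScale
+ * into a 16-bit multiplier M0 and a postShift (scale ratio ~= M0 * 2^-postShift),
+ * so the rescale can run as (x - input_ZP) * M0 >> postShift before adding
+ * output_ZP. */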
+DEF_KERNEL_INITIALIZER(_upsample_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t *input_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_nn_kernel_tensor_attr_t *axis_attr = NULL; + vsi_int_array_t * input_shape = NULL; + vsi_nn_kernel_dtype_e src_dtype = F16; + vsi_nn_kernel_dtype_e dst_dtype = F16; + vsi_nn_kernel_dtype_e axis_dtype = F16; + int32_t input_fl = 0; + int32_t output_fl = 0; + uint16_t M0 = 0; + int8_t postShift = 0; + float inputScale = 1.0f; + int32_t input_ZP = 0; + float outputScale = 1.0f; + int32_t output_ZP = 0; + float factorOut = 1.0f; + vsi_bool image_2d = FALSE; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + axis_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( axis_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + input_shape = input_attr->shape; + src_dtype = input_attr->dtype; + dst_dtype = output_attr->dtype; + axis_dtype = axis_attr->dtype; + + if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + input_fl = input_attr->dfp.fl; + if (input_fl > 0) + { + inputScale = 1.0f / (float) ((int64_t)1 << input_fl); + } + else + { + inputScale = (float)((int64_t)1 << -input_fl); + } + } + else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + inputScale = input_attr->asymm.scale; + input_ZP = input_attr->asymm.zero_point; + } + + if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + output_fl = output_attr->dfp.fl; + if (output_fl > 0) + { + outputScale = 1.0f / (float) ((int64_t)1 << output_fl); + } + else + { + outputScale = (float)((int64_t)1 << -output_fl); + } + } + else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + outputScale = output_attr->asymm.scale; + output_ZP = output_attr->asymm.zero_point; + } + + factorOut = 1.0f / outputScale; + + + vsi_nn_GetFP32MultiAndPostShift(inputScale / outputScale, &M0, &postShift); + + image_2d = (vsi_bool)(input_shape->size < 3 || 1 == input_shape->data[2]); + + if (BF16 == src_dtype && BF16 == dst_dtype) + { + src_dtype = F16; + dst_dtype = F16; + } + + if ((F16 == src_dtype) && (F16 == dst_dtype)) + { + src_dtype = I16; + dst_dtype = I16; + } + + if (I8 == axis_dtype) + { + axis_dtype = U8; + } + + if (I8 == src_dtype || U8 == src_dtype + || (F16 == src_dtype && U8 == dst_dtype && U8 == axis_dtype)) + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + gpu_param.dim = image_2d ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (input_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (input_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = image_2d ? 
1 : ( + (input_shape->data[2] + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2]); + + if((I8 == src_dtype && I8 == dst_dtype) + || (U8 == src_dtype && U8 == dst_dtype) + || (U8 == src_dtype && F16 == dst_dtype) + || (F16 == src_dtype && U8 == dst_dtype && U8 == axis_dtype) + ) + { + vx_bool is_same_quant_type = (vx_bool)((input_attr->dfp.fl == output_attr->dfp.fl + && input_attr->quant == VSI_NN_KERNEL_QUANT_DFP + && output_attr->quant == VSI_NN_KERNEL_QUANT_DFP) + || ((input_attr->asymm.zero_point == output_attr->asymm.zero_point) + && (input_attr->asymm.scale == output_attr->asymm.scale) + && input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM + && output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)); + + if (is_same_quant_type) + { + status = vsi_nn_kernel_gpu_add_param(node, "input_ZP", &input_ZP); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if ((I8 == src_dtype && I8 == dst_dtype) + || (U8 == src_dtype && U8 == dst_dtype)) + { + gpu_dp_inst_t uniU8SubZP_MulM_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZP_MulM_Hi_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniS16AddOutZP_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + vx_uint32 uniS16MoveValue_2x8[16] = { + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }; + + uint32_t idx = 0; + uint32_t packed_outputZP[4] = {0}; + + for (idx = 0; idx < 4; idx ++) + { + vx_uint16 zp = (vx_uint16)(output_ZP & 0xFFFF); + packed_outputZP[idx] = (zp << 16) | zp; + } + + uniU8SubZP_MulM_2x8.data[7] |= postShift; + uniU8SubZP_MulM_Hi_2x8.data[7] |= postShift; + + for (idx = 8; idx < 16; idx ++) + { + uniU8SubZP_MulM_2x8.data[idx] = (uint32_t)((M0 << 16) | M0); + uniU8SubZP_MulM_Hi_2x8.data[idx] = (uint32_t)((M0 << 16) | M0); + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniU8SubZP_MulM_2x8", &uniU8SubZP_MulM_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZP_MulM_Hi_2x8", &uniU8SubZP_MulM_Hi_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniS16AddOutZP_2x8", &uniS16AddOutZP_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniS16MoveValue_2x8", &uniS16MoveValue_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "packed_outputZP", packed_outputZP); + status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP", &input_ZP); + CHECK_STATUS_FAIL_GOTO(status, final ); + + } + else if (!is_same_quant_type) + { + // uniforms + gpu_dp_inst_t uniConvertDirUint8Fp32_4x4 = {{ + 0x01010101, 
// TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertEndUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertTrdUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00090008, 0x000b000a, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertFthUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x000d000c, 0x000f000e, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniMulMinusZpUint8_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x01010101, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniMulMinusZp2Uint8_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x01010101, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniMulMinusZp3Uint8_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00090008, 0x000b000a, // ABin + 0x01010101, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniMulMinusZp4Uint8_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x000d000c, 0x000f000e, // ABin + 0x01010101, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniF16MulMultipiler_PostShft_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 
0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniS16AddOutZP_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + + if(F16 == src_dtype && U8 == dst_dtype) + { + uint32_t idx = 0; + uint32_t packed_outputZP[4] = {0}; + + for (idx = 0; idx < 4; idx ++) + { + vx_uint8 zp = (vx_uint8)(output_ZP & 0xFF); + packed_outputZP[idx] = (zp << 24) | (zp << 16) | (zp << 8) | zp; + } + + uniF16MulMultipiler_PostShft_2x8.data[7] |= postShift; + + for (idx = 8; idx < 16; idx ++) + { + uniF16MulMultipiler_PostShft_2x8.data[idx] = (uint32_t)(M0); + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniF16MulMultipiler_PostShft_2x8", + &uniF16MulMultipiler_PostShft_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniS16AddOutZP_2x8", + &uniS16AddOutZP_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "packed_outputZP", packed_outputZP); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if(U8 == src_dtype && F16 == dst_dtype) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertDirUint8Fp32_4x4", + &uniConvertDirUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertEndUint8Fp32_4x4", + &uniConvertEndUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertTrdUint8Fp32_4x4", + &uniConvertTrdUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFthUint8Fp32_4x4", + &uniConvertFthUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "scaleU8Fp16", &inputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "zpU8Fp16", &input_ZP); + status |= vsi_nn_kernel_gpu_add_param(node, "uniMulMinusZpUint8_4x4", &uniMulMinusZpUint8_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniMulMinusZp2Uint8_4x4", &uniMulMinusZp2Uint8_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniMulMinusZp3Uint8_4x4", &uniMulMinusZp3Uint8_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniMulMinusZp4Uint8_4x4", &uniMulMinusZp4Uint8_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + } + else if((I16 == src_dtype && I16 == dst_dtype) + || (I16 == src_dtype && F16 == dst_dtype)) + { + // uniforms + gpu_dp_inst_t uniConvertDirInt16Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniQuantInOutInt16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t ucharMulShort_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x11111111, // BSelt + 0x03020100, 0x07060504, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 
0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertU8toI16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + if(I16 == src_dtype && I16 == dst_dtype ) + { + status = vsi_nn_kernel_gpu_add_param(node, "ucharMulShort_2x8", + &ucharMulShort_2x8); + if (input_fl != output_fl || I16 == axis_dtype) + { + if(input_fl > output_fl) + { + uniQuantInOutInt16_2x8.data[7] = uniQuantInOutInt16_2x8.data[7] | (input_fl - output_fl); + } + else + { + uint32_t multiply = ((int64_t)1 << (output_fl - input_fl)); + uint32_t i = 0; + + for (i = 8; i < 16; i++) + { + uniQuantInOutInt16_2x8.data[i] = multiply; + } + } + + status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantInOutInt16_2x8", + &uniQuantInOutInt16_2x8); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if(I16 == src_dtype && F16 == dst_dtype) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertDirInt16Fp32_4x4", + &uniConvertDirInt16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertU8toI16_2x8", + &uniConvertU8toI16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "inScaleInt16", &inputScale); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + else if( F16 == src_dtype + && ( (U8 == dst_dtype && I16 == axis_dtype) || I8 == dst_dtype || I16 == dst_dtype )) + { + // uniforms + gpu_dp_inst_t shortMulShort_8x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x11111111, // BSelt + 0x03020100, 0x07060504, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t ucharMulShort_8x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x11111111, // BSelt + 0x03020100, 0x07060504, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertFstFp16Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertSecFp16Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + 
&uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFstFp16Fp32_4x4", + &uniConvertFstFp16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", + &uniConvertSecFp16Fp32_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + if(U8 == dst_dtype && I16 == axis_dtype) + { + status = vsi_nn_kernel_gpu_add_param(node, "upOutput_Scale", &outputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "shortMulShort_8x8", &shortMulShort_8x8); + status |= vsi_nn_kernel_gpu_add_param(node, "upOutput_ZP", &output_ZP); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if(I8 == dst_dtype) + { + float scale_out_f = 1.0f / outputScale; + float output_ZP_f = (float)output_ZP; + status = vsi_nn_kernel_gpu_add_param(node, "scaleOut", &scale_out_f); + status |= vsi_nn_kernel_gpu_add_param(node, "outputZp", &output_ZP_f); + status |= vsi_nn_kernel_gpu_add_param(node, "ucharMulShort_8x8_2", &ucharMulShort_8x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if(I16 == dst_dtype) + { + status = vsi_nn_kernel_gpu_add_param(node, "up_outFlScale_i16", &factorOut); + status |= vsi_nn_kernel_gpu_add_param(node, "ucharMulShort_8x8_2", &ucharMulShort_8x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + } + else if( I8 == src_dtype && F16 == dst_dtype ) + { + gpu_dp_inst_t uniConvertDirUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertEndUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertTrdUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00090008, 0x000b000a, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertFthUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x000d000c, 0x000f000e, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + float inputTail = (float)input_ZP * inputScale * (-1.0f); + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertDirUint8Fp32_4x4_2", &uniConvertDirUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertEndUint8Fp32_4x4_2", &uniConvertEndUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertTrdUint8Fp32_4x4_2", 
&uniConvertTrdUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFthUint8Fp32_4x4_2", &uniConvertFthUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8_2", &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "scaleIn", &inputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "inputTail", &inputTail); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + if (axis_attr) vsi_nn_kernel_tensor_attr_release( &axis_attr ); + return status; + +} /* _upsample_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _upsample_kernel_map; + size_t kernel_map_size = _cnt_of_array( _upsample_kernel_map ); + vx_param_description_t * param_def = _upsample_kernel_param_def; + size_t param_def_size = _cnt_of_array( _upsample_kernel_param_def ); + vx_kernel_initialize_f initializer = _upsample_initializer; + uint32_t key; + uint32_t i; + vsi_bool is_same_type = FALSE; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if ((BF16 == in0_dtype) && (BF16 == out_dtype)) + { + in0_dtype = F16; + out_dtype = F16; + } + + if (I8 == in1_dtype) + { + in1_dtype = U8; + } + + if (((I8 == in0_dtype) && (I8 == out_dtype)) + || ((I16 == in0_dtype) && (I16 == out_dtype)) + || ((U8 == in0_dtype) && (U8 == out_dtype))) + { + if ((inputs[0]->attr.dtype.fl == outputs[0]->attr.dtype.fl + && inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP + && outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + || ((inputs[0]->attr.dtype.zero_point == outputs[0]->attr.dtype.zero_point) + && (inputs[0]->attr.dtype.scale == outputs[0]->attr.dtype.scale) + && inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC + && outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)) + { + is_same_type = TRUE; + } + } + + if ((F16 == in0_dtype) && (F16 == out_dtype)) + { + in0_dtype = I16; + out_dtype = I16; + is_same_type = TRUE; + } + + key = UPSAMPLE_HASH_KEY( in0_dtype, in1_dtype, out_dtype, is_same_type, image_2d ); + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + + + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; + +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + 
size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_UPSAMPLE_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t scale_x = 0; + int32_t scale_y = 0; + vsi_bool image_2d = FALSE; + + scale_x = vsi_nn_kernel_param_get_int32(params, "scale_x"); + scale_y = vsi_nn_kernel_param_get_int32(params, "scale_y"); + + if (2 != scale_x || 2 != scale_y) + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[1]->attr.size, + inputs[1]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num )) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _UPSAMPLE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _UPSAMPLE_PARAM_NUM ); + } + } + + return node; + +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( upsample, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_gpu.c b/src/tim/vx/internal/src/kernel/vsi_nn_gpu.c new file mode 100644 index 0000000..34d4408 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vsi_nn_gpu.c @@ -0,0 +1,139 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include "kernel/vsi_nn_gpu.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_error.h" + +void gpu_dp_inst_update_postshfit + ( + gpu_dp_inst_t * dp_inst, + int32_t shift + ) +{ + if( !dp_inst ) + { + return; + } + VSI_ASSERT( dp_inst->type == GPU_DP_TYPE_16 ); + if( shift < 0 ) + { + const uint32_t multiplier = gpu_multiplier( 1 << (-shift) ); + gpu_dp_inst_update_multiplier( dp_inst, 0, 8, multiplier ); + } + else + { + const int32_t index = 7; + const uint8_t postshift = (uint8_t)gpu_postshift( shift ); + // clear postshift + dp_inst->data[index] &= ~((uint32_t)0x1F); + // set postshift + dp_inst->data[index] |= (postshift & 0x1F); + } +} /* gpu_dp_inst_update_postshfit() */ + +void gpu_dp_inst_update_multiplier + ( + gpu_dp_inst_t * dp_inst, + int32_t start, + int32_t end, + int32_t multiplier + ) +{ + const int32_t multiplier_pos = 8; + const int32_t start_pos = multiplier_pos + start; + const int32_t end_pos = multiplier_pos + end; + int32_t i; + + for( i = start_pos; i < end_pos; i ++ ) + { + dp_inst->data[i] = multiplier; + } +} + +void gpu_quantize_multiplier_32bit + ( + double double_multiplier, + uint32_t * quantize_multiplier, + int32_t * shift + ) +{ + double q; + int64_t q_fixed; + const int32_t bit = 32; + if( vsi_abs(double_multiplier - 0.0) < 1e-5 ) + { + *quantize_multiplier = 0; + *shift = bit - 0; + } + else + { + q = frexp( double_multiplier, shift ); + q_fixed = (int64_t)(vsi_rint(q * (1ll << 31))); + VSI_ASSERT( q_fixed <= (1ll << 31) ); + if( q_fixed == (1ll << 31) ) + { + q_fixed /= 2; + *shift += 1; + } + if( *shift < -31 ) + { + *shift = 0; + q_fixed = 0; + } + *quantize_multiplier = (uint32_t)q_fixed; + } + if( 0 == *quantize_multiplier ) + { + *shift = 0; + } + else + { + *shift = bit - *shift; + } +} /* gpu_quantize_multiplier_32_bit() */ + +void gpu_quantize_multiplier_16bit + ( + double double_multiplier, + uint16_t * quantize_multiplier, + int32_t * shift + ) +{ + uint32_t multiplier_32bit = 0; + const int32_t bit = 16; + gpu_quantize_multiplier_32bit( double_multiplier, &multiplier_32bit, shift ); + *quantize_multiplier = (uint16_t)(multiplier_32bit >> (bit - 1)); + if( *quantize_multiplier == 0 ) + { + *shift = 0; + } + else + { + *shift -= bit; + } +} /* gpu_quantize_multiplier_16bit() */ + diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c new file mode 100644 index 0000000..af3b91b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -0,0 +1,1250 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include +#include "vsi_nn_context.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_types.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_math.h" + +#include "libnnext/vsi_nn_libnnext_resource.h" +#if VSI_USE_VXC_BINARY +/*this header can be only included once in all *.c files*/ +#include "libnnext/vx_bin/vxc_binaries.h" +#endif + +typedef struct +{ + size_t size; + const void* data; + void* reserve_mem; +} kernel_program_info_t; + +static vsi_status _kernel_init_obj + ( + vx_kernel_description_t* info, + vx_kernel obj + ); + +static vsi_status _cpu_register + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel + ); + +static vsi_status _gpu_register + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel + ); + +static vx_program _create_program_from_executable + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel + ); + +static vx_program _create_program_from_code + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel + ); + +static const void* _load_internal_executable + ( + const char* source_name, + size_t* size + ); + +static char* _load_source_code_from_file + ( + const char* source_name, + size_t* size + ); + +static vx_program _create_program + ( + vx_context ctx, + kernel_program_info_t *program_info, + size_t size + ); + +static void _kernel_clear_source + ( vsi_nn_kernel_t * kernel ); + +static vsi_bool _check_shader_support(vsi_nn_graph_t* graph); + +static vsi_status VX_CALLBACK _kernel_validator + ( + vx_node node, + const vx_reference parameters[], + uint32_t num, + vx_meta_format metas[] + ) +{ + return VSI_SUCCESS; +} /* _kernel_validator() */ + +static vsi_status VX_CALLBACK _kernel_initializer + ( + vx_node nodObj, + const vx_reference *paramObj, + uint32_t paraNum + ) +{ + return VSI_SUCCESS; +} /* _kernel_initializer() */ + +static vsi_status VX_CALLBACK _kernel_deinitializer + ( + vx_node nodObj, + const vx_reference *paraObj, + uint32_t paraNum + ) +{ + return VSI_SUCCESS; +} /* _kernel_deinitializer() */ + +static void _kernel_clear_build_option + ( + vsi_nn_kernel_source_info_t * source + ) +{ + vsi_nn_kernel_build_option_t * option; + if( !source ) + { + return; + } + option = &source->build_option; + if( option->data ) + { + free( option->data ); + option->data = NULL; + } +} /* _kernel_clear_build_option() */ + +static void _kernel_clear_source + ( vsi_nn_kernel_t * kernel ) +{ + size_t i; + if( !kernel ) + { + return; + } + for( i = 0; i < VSI_NN_GPU_SOURCE_FMT_NUM; i ++ ) + { + vsi_nn_kernel_source_info_t* source = &(kernel->gpu.sources[i]); + if( source->data ) + { + size_t j; + for( j = 0; j < source->num; j ++ ) + { + if( source->data[j] ) + { + free( source->data[j] ); + source->data[j] = NULL; + } + } + free( source->data ); + source->data = NULL; + _kernel_clear_build_option( source ); + } + } +} /* _kernel_clear_source() */ + +static vsi_status _cpu_register + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel + ) +{ + vsi_status status; + vx_kernel_description_t* info; + vx_kernel obj; + + status = VSI_FAILURE; + info = &kernel->info; + + obj = vxAddUserKernel( + graph->ctx->c, + 
info->name, + info->enumeration, + info->function, + info->numParams, + info->validate, + info->initialize, + info->deinitialize + ); + if( NULL != obj ) + { + status = _kernel_init_obj( info, obj ); + //vxReleaseKernel( &obj ); + } + else + { + VSILOGE( "Add kernel %s fail.", info->name ); + } + return status; +} /* _cpu_register() */ + +static const void* _load_internal_executable + ( + const char* source_name, + size_t* size + ) +{ +#if VSI_USE_VXC_BINARY + int i; + for( i = 0; i < vx_bin_resource_items_cnt; i++ ) + { + if( strncmp( vx_bin_resource_items[i].name, source_name, VSI_NN_MAX_PATH ) == 0 ) + { + *size = (size_t)vx_bin_resource_items[i].len; + return vx_bin_resource_items[i].data; + } + } +#endif + return NULL; +} /* _load_internal_executable() */ + +static char* _load_source_code_from_file + ( + const char* source_name, + size_t* size + ) +{ + char* source; + FILE* fp; + size_t total_bytes; + size_t read_bytes; + source = NULL; + //TODO: Pack new name + fp = fopen( source_name, "rb" ); + if( NULL == fp ) + { + VSILOGE("Open program file %s fail.", source_name); + *size = 0; + goto final; + } + fseek( fp, 0, SEEK_END ); + total_bytes = ftell( fp ); + fseek( fp, 0, SEEK_SET ); + if( total_bytes == 0 ) + { + VSILOGE("Program file %s is empty.", source_name); + *size = 0; + goto final; + } + source = (char*)malloc( total_bytes + 1 ); + if( source ) + { + read_bytes = 0; + while( total_bytes - read_bytes > 0 ) + { + read_bytes += fread( &source[read_bytes], 1, total_bytes - read_bytes, fp ); + } + source[read_bytes] = 0; + *size = read_bytes; + } +final: + fclose( fp ); + return source; +} /* _load_source_code_from_file() */ + +static vx_program _create_program + ( + vx_context ctx, + kernel_program_info_t *program_info, + size_t num + ) +{ + vx_char** sources; + vx_size* source_sizes; + size_t i; + vsi_status status; + vx_program program; + program = NULL; + + sources = (vx_char**)malloc( sizeof(vx_char*) * num ); + source_sizes = (vx_size*)malloc( sizeof(vx_size) * num ); + + for( i = 0; i < num; i ++ ) + { + sources[i] = (vx_char*)program_info[i].data; + source_sizes[i] = (vx_size)program_info[i].size; + } + program = vxCreateProgramWithSource( ctx, (vx_uint32)num, + (const vx_char**)sources, source_sizes ); + status = vxGetStatus( (vx_reference)program ); + if(VSI_SUCCESS != status) + { + VSILOGE("Create program from source fail!"); + } + if( sources ) + { + free( sources ); + } + if( source_sizes ) + { + free( source_sizes ); + } + return program; +} /* _create_program() */ + +static vx_program _create_program_from_code + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel + ) +{ + const vsi_nn_kernel_source_info_t* source_info; + kernel_program_info_t* program_info; + size_t i; + vx_program program = NULL; + source_info = &kernel->gpu.sources[VSI_NN_GPU_SOURCE_FMT_CODE]; + + if( source_info->num == 0 ) + { + VSILOGE("Not executable source found in kernel."); + return NULL; + } + program_info = (kernel_program_info_t*)malloc( + source_info->num * sizeof(kernel_program_info_t) ); + if( !program_info ) + { + VSILOGE("Malloc program memory fail."); + return NULL; + } + memset( program_info, 0, source_info->num * sizeof(kernel_program_info_t) ); + + for( i = 0; i < source_info->num; i ++ ) + { + program_info[i].data = (const void*)vsi_nn_resource_load_source_code( + source_info->data[i], &program_info[i].size, kernel->type ); + if( !program_info[i].data ) + { + program_info[i].reserve_mem = (void*)_load_source_code_from_file( + source_info->data[i], &program_info[i].size ); + 
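/* Source was not found in the embedded resource table, so fall back to loading it from the filesystem. */ +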
program_info[i].data = (const void*)program_info[i].reserve_mem; + } + } + program = _create_program( graph->ctx->c, program_info, source_info->num ); + if( program_info ) + { + for( i = 0; i < source_info->num; i ++ ) + { + if( program_info[i].reserve_mem ) + { + free( program_info[i].reserve_mem ); + } + } + free( program_info ); + } + return program; +} /* _create_program_from_code() */ + +static vx_program _create_program_from_executable + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel + ) +{ + const vsi_nn_kernel_source_info_t* source_info; + kernel_program_info_t program_info; + vx_program program = NULL; + source_info = &kernel->gpu.sources[VSI_NN_GPU_SOURCE_FMT_EXECUTABLE]; + + if( source_info->num == 0 ) + { + VSILOGE("Not executable source found in kernel."); + return NULL; + } + + VSI_ASSERT( source_info->num == 1 ); + memset( &program_info, 0, sizeof( kernel_program_info_t ) ); + + program_info.data = _load_internal_executable( + source_info->data[0], &program_info.size); + program = vxCreateProgramWithBinary( graph->ctx->c, + (const vx_uint8 *)program_info.data, program_info.size ); + return program; +} /* _create_program_from_executable() */ + +static vsi_status _gpu_register + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel + ) +{ + vsi_status status; + vx_kernel_description_t* info; + vx_kernel obj; + vsi_nn_context_t context; + vx_program program = NULL; + const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt; + +#define MAX_BUILDPROGRAM_LEN 1024 + char cmd[MAX_BUILDPROGRAM_LEN] = { 0 }; + size_t cost_bytes = 0; + + memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN ); + context = graph->ctx; + + status = VSI_FAILURE; + info = &(kernel->info); + + switch( active_fmt ) + { + case VSI_NN_GPU_SOURCE_FMT_CODE: + program = _create_program_from_code( graph, kernel ); + break; + case VSI_NN_GPU_SOURCE_FMT_EXECUTABLE: + program = _create_program_from_executable( graph, kernel ); + break; + default: + VSILOGE("Unknown source format %d", kernel->gpu.active_source_fmt); + break; + } + if( NULL == program ) + { + return status; + } + + if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE ) + { + // set default evis version is 2 + if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type ) + { + cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, + "-cl-viv-vx-extension -D VX_VERSION=2" ); + } + } + else + { + cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, + "-cl-viv-vx-extension -D VX_VERSION=%d", + context->config.evis.ver ); + } + // Pack build option + if( kernel->gpu.sources[active_fmt].build_option.data ) + { + vsi_nn_kernel_build_option_t * option = &kernel->gpu.sources[active_fmt].build_option; + if( MAX_BUILDPROGRAM_LEN - cost_bytes > strlen( option->data ) + 1 ) + { + snprintf( &cmd[cost_bytes], MAX_BUILDPROGRAM_LEN - cost_bytes, + " %s", option->data ); + } + else + { + VSILOGE("Build option is too long!"); + VSI_ASSERT( FALSE ); + } + } + + status = vxBuildProgram( program, cmd ); + + if( VSI_SUCCESS != status ) + { + VSILOGE("Build program fail."); + return status; + } + + obj = vxAddKernelInProgram( + program, + info->name, + info->enumeration, + info->numParams, + info->validate, + info->initialize, + info->deinitialize + ); + + if( obj ) + { + status = _kernel_init_obj( info, obj ); + //vxReleaseKernel( &obj ); + } + else + { + VSILOGE( "Add kernel %s fail.", info->name ); + } + if( program ) + { + vxReleaseProgram( &program ); + } + return status; +} /* _gpu_register() */ + +static vsi_status _kernel_init_obj + ( + vx_kernel_description_t* info, + 
vx_kernel obj + ) +{ + vsi_status status; + uint32_t i; + + status = VSI_SUCCESS; + for( i = 0; i < info->numParams; i ++ ) + { + status = vxAddParameterToKernel( + obj, + i, + info->parameters[i].direction, + info->parameters[i].data_type, + info->parameters[i].state + ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Add parameter %d to kernel %s fail. with %d.", + i, info->name, status ); + break; + } + } + + if( VSI_SUCCESS == status ) + { + status = vxFinalizeKernel( obj ); + } + + if( VSI_SUCCESS != status ) + { + VSILOGE( "Finalize kernel %s fail with %d.", + info->name, status ); + status = vxRemoveKernel( obj ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Remove kernel %s fail with %d.", + info->name, status ); + } + } + return status; +} /* _kernel_init_obj() */ + +vsi_status vsi_nn_kernel_register + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel + ) +{ + vsi_status status; + status = VSI_FAILURE; + switch( kernel->type ) + { + case VSI_NN_KERNEL_TYPE_CPU: + status = _cpu_register( graph, kernel ); + break; + case VSI_NN_KERNEL_TYPE_EVIS: + case VSI_NN_KERNEL_TYPE_CL: + status = _gpu_register( graph, kernel ); + break; + case VSI_NN_KERNEL_TYPE_VX: + VSILOGE("Openvx node no need to register."); + break; + default: + VSILOGE("Unknown kernel %d.", kernel->type); + break; + } + return status; +} /* vsi_nn_kernel_register() */ + +vsi_nn_kernel_node_t vsi_nn_kernel_create_node + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel + ) +{ + vsi_status status; + vx_context ctx; + vx_kernel obj; + vx_node node; + vx_kernel_description_t* info; + + info = &(kernel->info); + // Validate kernel + if( !info->initialize ) + { + VSILOGE("Kernel %s initializer is NULL", info->name); + return NULL; + } + if( !info->validate ) + { + VSILOGE("Kernel %s validator is NULL", info->name); + return NULL; + } + if( !info->deinitialize ) + { + VSILOGE("Kernel %s deinitializer is NULL", info->name); + return NULL; + } + if( info->enumeration == KERNEL_ID_PLACEHOLDER ) + { + //VSILOGD("Kernel id: %#x, %#x", kernel->unique_id, info->enumeration); + info->enumeration = (vx_enum)kernel->unique_id; + } + if( info->enumeration > KERNEL_ID_OVXLIB_RESERVED ) + { + VSILOGE("Kernel id is invalid %#x(max: %#x)", + info->enumeration, KERNEL_ID_OVXLIB_RESERVED); + return NULL; + } + + ctx = vxGetContext( (vx_reference)graph->g ); + + obj = vxGetKernelByName( ctx, info->name ); + status = vxGetStatus( (vx_reference)obj ); + if (VSI_SUCCESS != status) + { + fprintf(stderr, "\n"); // TODO: This is a hack for driver msg + /* Register kernel */ + status = vsi_nn_kernel_register( graph, kernel ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Register client kernel %s fail with %d.", + info->name, status ); + return NULL; + } + else + { + VSILOGD( "Register client kernel %s successfully.", + info->name ); + } + + /* Load kernel */ + obj = vxGetKernelByName( ctx, info->name ); + status = vxGetStatus( (vx_reference)obj ); + } + if( VSI_SUCCESS != status ) + { + VSILOGE( "Load client kernel %s fail with %d.", + info->name, status ); + return NULL; + } + node = vxCreateGenericNode( graph->g, obj ); + vxReleaseKernel( &obj ); + status = vxGetStatus( (vx_reference)node ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Load client node from kernel %s fail with %d.", + info->name, status ); + return NULL; + } + if( node ) + { + // Set default border mode. 
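+        // Replicate clamps out-of-bounds reads to the nearest edge element rather than a constant value.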
+ vx_border_t border; + border.mode = VX_BORDER_REPLICATE; + border.constant_value.U32 = 0; + status |= vxSetNodeAttribute( node, VX_NODE_BORDER, &border, sizeof(border) ); + } + return (vsi_nn_kernel_node_t)node; +} /* vsi_nn_kernel_create_node() */ + + +vsi_status vsi_nn_kernel_node_set_border + (vsi_nn_kernel_node_t node, + vx_border_t* border) +{ + vsi_status status = VSI_FAILURE; + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, border, sizeof(vx_border_t) ); + return status; +} + +vsi_status vsi_nn_kernel_node_pass_param + ( + vsi_nn_kernel_node_t node, + vsi_nn_kernel_node_param_t * params, + size_t num + ) +{ + vsi_status status; + uint32_t i; + + status = VSI_FAILURE; + for( i = 0; i < num; i++ ) + { + status = vxSetParameterByIndex( (vx_node)node, i, (vx_reference)params[i] ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Set %d parameter fail.", i ); + break; + } + } + return status; +} /* vsi_nn_kernel_node_pass_param() */ + +vsi_nn_kernel_tensor_t vsi_nn_kernel_tensor_reshape + ( + vsi_nn_kernel_tensor_t tensor, + int32_t* shape, + uint32_t rank + ) +{ + return (vsi_nn_kernel_tensor_t)vxReshapeTensor((vx_tensor)tensor, shape, rank); +} /* vsi_nn_kernel_tensor_reshape() */ + +void vsi_nn_kernel_tensor_release + ( + vsi_nn_kernel_tensor_t* tensor + ) +{ + vxReleaseTensor( (vx_tensor*)tensor ); + *tensor = NULL; +} /* vsi_nn_kernel_tensor_release() */ + +void vsi_nn_kernel_add_source + ( + vsi_nn_kernel_t* kernel, + vsi_nn_gpu_source_fmt_e fmt, + size_t source_num, + ... + ) +{ + va_list arg; + size_t i; + vsi_nn_kernel_source_info_t* source; + if( source_num == 0 ) + { + return; + } + if( fmt >= VSI_NN_GPU_SOURCE_FMT_NUM ) + { + VSILOGE("Unknown source type %d", fmt); + return; + } + if( kernel->gpu.sources[fmt].data ) + { + VSILOGE("Kernel source %d has been attached!", fmt); + return; + } + source = &(kernel->gpu.sources[fmt]); + va_start( arg, source_num ); + if( source_num > 0 ) + { + const size_t mem_size = sizeof(vsi_nn_kernel_source_t) * source_num; + source->data = (vsi_nn_kernel_source_t*)malloc( mem_size ); + if( !source->data ) + { + VSILOGE("Out of memory, create kernel source fail."); + return; + } + memset( source->data, 0, mem_size ); + } + for( i = 0; i < source_num; i ++ ) + { + vsi_nn_kernel_source_t src = va_arg( arg, vsi_nn_kernel_source_t ); + size_t size = strlen( src ); + source->data[i] = (vsi_nn_kernel_source_t)malloc( size * sizeof(char) + 1 ); + if( source->data[i] ) + { + memcpy( source->data[i], src, size ); + source->data[i][size] = 0; + } + else + { + VSILOGE("Malloc source memory fail."); + return; + } + } + source->num = source_num; + va_end(arg); +} /* vsi_nn_kernel_add_source() */ + +void vsi_nn_kernel_add_build_option + ( + vsi_nn_kernel_t * kernel, + const char * option + ) +{ + const vsi_nn_gpu_source_fmt_e fmt = VSI_NN_GPU_SOURCE_FMT_CODE; + vsi_nn_kernel_build_option_t * build_option; + size_t new_size; + size_t item_size; + size_t org_size; + char * buf = NULL; + if( !kernel || !option ) + { + VSILOGW("Get NULL pointer."); + return; + } + build_option = &kernel->gpu.sources[fmt].build_option; + buf = build_option->data; + item_size = strlen( option ); + org_size = 0; + if( buf ) + { + org_size = strlen( buf ); + } + new_size = org_size + item_size; + buf = (char*)realloc( buf, new_size + 2 ); // Space and terminator + if( !buf ) + { + VSILOGE("Out of memory"); + return; + } + snprintf( &buf[org_size], item_size + 2, " %s", option ); + build_option->data = buf; + +} /* vsi_nn_kernel_add_build_option() */ + +void 
vsi_nn_kernel_release + ( + vsi_nn_kernel_t ** kernel + ) +{ + if( kernel && *kernel ) + { + _kernel_clear_source( *kernel ); + free( *kernel ); + *kernel = NULL; + } +} /* vsi_nn_kernel_release() */ + +vsi_nn_kernel_t* vsi_nn_kernel_create + ( + vsi_nn_kernel_type_e type + ) +{ + vsi_nn_kernel_t* kernel = (vsi_nn_kernel_t*)malloc( sizeof(vsi_nn_kernel_t) ); + if( !kernel ) + { + VSILOGE("Out of memory, create kernel fail."); + return NULL; + } + /* + * !!!WARNING!!! + * Here must reset memory to 0, or vsi_nn_kernel_reset will crash. + */ + memset( kernel, 0, sizeof(vsi_nn_kernel_t) ); + vsi_nn_kernel_reset( kernel, type ); + return kernel; +} /* vsi_nn_kernel_create() */ + +void vsi_nn_kernel_reset + ( + vsi_nn_kernel_t * kernel, + vsi_nn_kernel_type_e type + ) +{ + if( kernel ) + { + _kernel_clear_source( kernel ); + memset( kernel, 0, sizeof(vsi_nn_kernel_t) ); + kernel->type = type; + // TODO: Choose binary +#if VSI_USE_VXC_BINARY + kernel->gpu.active_source_fmt = VSI_NN_GPU_SOURCE_FMT_EXECUTABLE; +#else + kernel->gpu.active_source_fmt = VSI_NN_GPU_SOURCE_FMT_CODE; +#endif + // Set default functions. + kernel->info.enumeration = KERNEL_ID_PLACEHOLDER; + kernel->info.initialize = _kernel_initializer; + kernel->info.validate = _kernel_validator; + kernel->info.deinitialize = _kernel_deinitializer; + } +} /* vsi_nn_kernel_reset() */ + +vsi_nn_kernel_node_t vsi_nn_kernel_selector + ( + vsi_nn_graph_t* graph, + const char* kernel_name, + vsi_nn_tensor_t** inputs, + size_t input_num, + vsi_nn_tensor_t** outputs, + size_t output_num, + const vsi_nn_kernel_param_t* params + ) +{ + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_t * kernel; + const vsi_nn_kernel_backend_t* backend; + vsi_nn_kernel_selector_t selector; + vsi_status status = VSI_SUCCESS; + if( !kernel_name ) + { + VSI_ASSERT( FALSE ); + return NULL; + } + if( !graph ) + { + VSI_ASSERT( FALSE ); + return NULL; + } + + backend = vsi_nn_kernel_backend_get( kernel_name ); + if( !backend ) + { + VSILOGW("Not found kernel \"%s\"", kernel_name); + return NULL; + } + kernel = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_NONE ); + if( !kernel ) + { + return NULL; + } + memset( &selector, 0, sizeof(vsi_nn_kernel_selector_t) ); + if( backend->select ) + { + status = backend->select( graph, inputs, input_num, outputs, output_num, + params, &selector ); + VSI_ASSERT( status == VSI_SUCCESS ); + } + else + { + vsi_nn_kernel_pirority_t default_pirority[] = { + { VSI_NN_KERNEL_TYPE_EVIS, 4 }, + { VSI_NN_KERNEL_TYPE_CL, 3 }, + { VSI_NN_KERNEL_TYPE_VX, 2 }, + { VSI_NN_KERNEL_TYPE_CPU, 1 }, + }; + vsi_nn_kernel_pirority_set( &selector, + default_pirority, _cnt_of_array(default_pirority) ); + } + /** + * All kernels for one operation will share the same id. 
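+     * The id comes from the backend registry: every backend gets a process-wide unique id, and the kernel enum becomes KERNEL_ID_OVXLIB_START + unique_id (see the assignment below).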
+ */ + + { + uint32_t i; + vsi_nn_kernel_type_e type; + vsi_nn_kernel_setup_func_t kernel_func = NULL;; + for( i = 0; i < (uint32_t)selector.allow_kernel_num; i ++ ) + { + type = selector.pirority[i].kernel_type; + + //Skip evis and cl when disable shader + if ( (type == VSI_NN_KERNEL_TYPE_EVIS || type == VSI_NN_KERNEL_TYPE_CL) + && _check_shader_support(graph) == FALSE) + { + continue; + } + // Skip evis if not support + if( type == VSI_NN_KERNEL_TYPE_EVIS + && graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_NONE ) + { + continue; + } + kernel_func = backend->setup[type]; + // Skip no kernel func + if( NULL == kernel_func ) + { + continue; + } + vsi_nn_kernel_reset( kernel, type ); + kernel->unique_id = KERNEL_ID_OVXLIB_START + backend->unique_id; + node = kernel_func( graph, inputs, input_num, + outputs, output_num, params, kernel ); + // If node created, break the loop + if( node ) + { + VSILOGD("Instance %s node with kernel \"%s\" ", + vsi_nn_kernel_type_str(type), kernel_name); + break; + } + } + } + if( !node ) + { + VSILOGW("No valid kernel for %s", kernel_name); + } + vsi_nn_kernel_release( &kernel ); + + return node; +} /* vsi_nn_kernel_selector() */ + +vsi_bool vsi_nn_kernel_gpu_check_shape + ( const int32_t * shape, size_t rank ) +{ + size_t i; + vsi_bool ret = TRUE; + const size_t channel_dim = 2; + for( i = 0; i < vsi_nn_min(rank, channel_dim); i++ ) + { + if( shape[i] == 0 + || shape[i] >= GPU_TENSOR_MAX_WIDTH ) + { + ret = FALSE; + break; + } + } + return ret; +} /* vsi_nn_kernel_gpu_check_shape() */ + +vsi_nn_kernel_scalar_t vsi_nn_kernel_scalar_create + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_dtype_e dtype, + const void * data + ) +{ + vx_enum vxtype = VX_TYPE_FLOAT32; + if( !graph || !data ) + { + return NULL; + } + switch( dtype ) + { + case I8: + vxtype = VX_TYPE_INT8; + break; + case I16: + vxtype = VX_TYPE_INT16; + break; + case I32: + vxtype = VX_TYPE_INT32; + break; + case I64: + vxtype = VX_TYPE_INT64; + break; + case U8: + vxtype = VX_TYPE_UINT8; + break; + case U16: + vxtype = VX_TYPE_UINT16; + break; + case U32: + vxtype = VX_TYPE_UINT32; + break; + case U64: + vxtype = VX_TYPE_UINT64; + break; + case F16: + vxtype = VX_TYPE_FLOAT16; + break; + case F32: + vxtype = VX_TYPE_FLOAT32; + break; + case BF16: + case BOOL8: + default: + VSILOGW("Unsupport dtype %d", dtype); + return NULL; + } + return vxCreateScalar( graph->ctx->c, vxtype, (void*)data ); +} /* vsi_nn_kernel_scalar_create() */ + +vsi_status vsi_nn_kernel_gpu_add_param + ( + vsi_nn_kernel_node_t node, + const char * param_key, + void * data + ) +{ + return vxSetNodeUniform( (vx_node)node, param_key, 1, data ); +} /* vsi_nn_kernel_gpu_add_param() */ + +vsi_status vsi_nn_kernel_gpu_config + ( + vsi_nn_kernel_node_t node, + const gpu_param_t * gpu_param + ) +{ + vsi_status status; + vx_kernel_execution_parameters_t param; + param.workDim = gpu_param->dim; + memcpy( param.globalWorkOffset, gpu_param->global_offset, + sizeof(size_t) * GPU_MAX_DIMENSION_SIZE ); + memcpy( param.globalWorkScale, gpu_param->global_scale, + sizeof(size_t) * GPU_MAX_DIMENSION_SIZE ); + memcpy( param.localWorkSize, gpu_param->local_size, + sizeof(size_t) * GPU_MAX_DIMENSION_SIZE ); + memcpy( param.globalWorkSize, gpu_param->global_size, + sizeof(size_t) * GPU_MAX_DIMENSION_SIZE ); + status = vxSetNodeAttribute( (vx_node)node, + VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + ¶m, sizeof(vx_kernel_execution_parameters_t) ); + return status; +} /* vsi_nn_kernel_gpu_config() */ + +vsi_nn_kernel_tensor_attr_t * 
vsi_nn_kernel_tensor_attr_create + ( vsi_nn_kernel_tensor_t tensor ) +{ + vsi_nn_kernel_tensor_attr_t * attr; + vsi_status status; + uint32_t dim_num; + vsi_nn_type_e dtype = VSI_NN_TYPE_FLOAT16; + vsi_nn_qnt_type_e quant_type = VSI_NN_QNT_TYPE_NONE; + attr = (vsi_nn_kernel_tensor_attr_t *)malloc( + sizeof(vsi_nn_kernel_tensor_attr_t) ); + if( !attr ) + { + VSILOGE("Out of memory, create tensor attr fail!"); + return NULL; + } + memset( attr, 0, sizeof(vsi_nn_kernel_tensor_attr_t) ); + + status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_NUM_OF_DIMS, + &dim_num, sizeof(uint32_t)); + CHECK_STATUS( status ); + if( status == VSI_SUCCESS ) + { + vsi_int_array_t * shape = vsi_int_array_create( dim_num ); + if( shape ) + { + status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_DIMS, + shape->data, sizeof(int32_t) * dim_num); + attr->shape = shape; + CHECK_STATUS( status ); + } + } + status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_DATA_TYPE, + &dtype, sizeof(vsi_enum)); + CHECK_STATUS( status ); + attr->dtype = vsi_nn_kernel_map_dtype( dtype ); + + status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_QUANT_FORMAT, + &quant_type, sizeof(uint32_t)); + CHECK_STATUS( status ); + attr->quant = vsi_nn_kernel_map_quant_type( quant_type ); + + switch( attr->quant ) + { + case VSI_NN_KERNEL_QUANT_DFP: + { + int8_t fl = 0; + status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_FIXED_POINT_POS, + &fl, sizeof(int8_t)); + CHECK_STATUS( status ); + attr->dfp.fl = (int32_t)fl; + } + break; + case VSI_NN_KERNEL_QUANT_ASYMM: + { + status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_ZERO_POINT, + &(attr->asymm.zero_point), sizeof(int32_t)); + CHECK_STATUS( status ); + status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_SCALE, + &(attr->asymm.scale), sizeof(float)); + CHECK_STATUS( status ); + // Reset scale to 1 + if( (attr->asymm.scale - 0.f) < 1e-5 ) + { + attr->asymm.scale = 1.0f; + attr->asymm.zero_point = 0; + } + } + break; + default: + break; + } + return attr; +} /* vsi_nn_kernel_tensor_attr_create() */ + +void vsi_nn_kernel_tensor_attr_release + ( vsi_nn_kernel_tensor_attr_t ** p_attr ) +{ + if( p_attr && *p_attr ) + { + vsi_nn_kernel_tensor_attr_t * attr = *p_attr; + vsi_int_array_release( &attr->shape ); + if( attr->quant == VSI_NN_KERNEL_QUANT_ASYMM_PERCHANNEL ) + { + vsi_float_array_release( &attr->asymm_v.scale ); + vsi_int_array_release( &attr->asymm_v.zero_point ); + } + else if( attr->quant == VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL ) + { + //TODO: + } + free( attr ); + *p_attr = NULL; + } +} /* vsi_nn_kernel_tensor_attr_release() */ + +vsi_status vsi_nn_kernel_pirority_set + ( + vsi_nn_kernel_selector_t * selector, + const vsi_nn_kernel_pirority_t * pirority, + size_t pirority_size + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_pirority_t tmp; + uint32_t i, j, k; + VSI_ASSERT( pirority_size <= VSI_NN_KERNEL_TYPE_NUM ); + VSI_ASSERT( pirority_size > 0 ); + VSI_ASSERT( pirority != NULL ); + VSI_ASSERT( selector != NULL ); + memcpy( selector->pirority, pirority, + pirority_size * sizeof(vsi_nn_kernel_pirority_t) ); + selector->allow_kernel_num = (int32_t)pirority_size; + + for( i = 0; i < pirority_size; i ++ ) + { + k = i; + VSI_ASSERT( selector->pirority[k].fps <= VSI_NN_KERNEL_PIRORITY_NORMAL_LIMIT ); + for( j = i; j < pirority_size; j ++ ) + { + if( selector->pirority[k].fps < selector->pirority[j].fps ) + { + k = j; + } + } + if( k != i ) + { + memcpy( &tmp, &selector->pirority[i], + sizeof( vsi_nn_kernel_pirority_t ) ); + memcpy( &selector->pirority[i], &selector->pirority[k], + 
sizeof( vsi_nn_kernel_pirority_t ) ); + memcpy( &selector->pirority[k], &tmp, + sizeof( vsi_nn_kernel_pirority_t ) ); + } + } + return status; +} /* vsi_nn_kernel_pirority_set() */ + +static vsi_bool _check_shader_support(vsi_nn_graph_t* graph) +{ + char *envctrl; + int32_t enableShader = 1; + + envctrl = getenv("VIV_VX_ENABLE_SHADER"); + if (envctrl) + { + enableShader = atoi(envctrl); + } + +#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT + if ( graph->ctx->config.subGroupSize == 0 ) + { + return FALSE; + } +#endif + + if(enableShader == 1) + { + return TRUE; + } + + return FALSE; +} + diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_backend.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_backend.c new file mode 100644 index 0000000..872369f --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_backend.c @@ -0,0 +1,154 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include "vsi_nn_prv.h" +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_ops.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_hashmap.h" + +static vsi_nn_kernel_unique_id_t _global_id() + { + static vsi_nn_kernel_unique_id_t global_id = 0; + return global_id ++; + } /* _global_id() */ + +static vsi_nn_hashmap_t* _backends() + { + static vsi_nn_hashmap_t* backends = NULL; + if( !backends ) + { + backends = vsi_nn_hashmap_create(); + } + return backends; + } /* _backends() */ + +static vsi_nn_kernel_backend_t* _get_or_new_backend + ( const char* kernel_name ) +{ + vsi_nn_kernel_backend_t* backend = NULL; + vsi_nn_hashmap_t* backends = _backends(); + if( vsi_nn_hashmap_has( backends, kernel_name ) ) + { + backend = vsi_nn_hashmap_get( backends, kernel_name ); + } + else + { + backend = (vsi_nn_kernel_backend_t*)malloc( sizeof(vsi_nn_kernel_backend_t) ); + if( !backend ) + { + VSILOGE("Out of memory, register backend fail."); + VSI_ASSERT( FALSE ); + } + memset( backend, 0, sizeof(vsi_nn_kernel_backend_t) ); + vsi_nn_hashmap_add( backends, kernel_name, backend ); + backend->unique_id = _global_id(); + } + return backend; +} /* _get_or_new_backend() */ + +void vsi_nn_kernel_backend_register + ( + const char* kernel_name, + vsi_nn_kernel_type_e kernel_type, + vsi_nn_kernel_setup_func_t setup_func + ) +{ + vsi_nn_kernel_backend_t* backend = NULL; + backend = _get_or_new_backend( kernel_name ); + VSI_ASSERT( backend != NULL ); + if( backend->setup[kernel_type] ) + { + VSILOGE("Kernel %s backend %d has been registered!", kernel_name, kernel_type); + VSI_ASSERT( FALSE ); + } + backend->setup[kernel_type] = setup_func; +} /* vsi_nn_register_backend() */ + +void vsi_nn_kernel_selector_register + ( + const char* kernel_name, + vsi_nn_kernel_selector_func_t selector_func + ) +{ + vsi_nn_kernel_backend_t* backend = NULL; + backend = _get_or_new_backend( kernel_name ); + VSI_ASSERT( backend != NULL ); + backend->select = selector_func; +} /* vsi_nn_kernel_selector_register() */ + +const vsi_nn_kernel_backend_t* vsi_nn_kernel_backend_get( const char* key ) +{ + vsi_nn_hashmap_t* backends = _backends(); + const vsi_nn_kernel_backend_t* backend = NULL; + backend = (const vsi_nn_kernel_backend_t*)vsi_nn_hashmap_get( backends, key ); + return backend; +} /* vsi_nn_backend_get() */ + +vsi_status vsi_nn_kernel_backend_init( void ) +{ + vsi_status status = VSI_SUCCESS; + // TODO: Multi-thread support + if( _backends() != NULL ) + { + return status; + } +#if defined(__linux__) +#if 0 + extern vsi_nn_kernel_section_meta_t* __start_kernel_meta_section; + extern vsi_nn_kernel_section_meta_t* __stop_kernel_meta_section; + vsi_nn_kernel_section_meta_t** iter = &__start_kernel_meta_section; + for( ; iter < &__stop_kernel_meta_section; iter ++ ) + { + vsi_nn_kernel_backend_register((*iter)->name, + (*iter)->kernel_type, (*iter)->func ); + } +#endif +#if 0 + REGISTER_KERNEL_BACKEND_MANUALLY( MINIMUM, CL, cl_minimum_setup ); + REGISTER_KERNEL_BACKEND_MANUALLY( MINIMUM, EVIS, evis_minimum_setup ); + REGISTER_KERNEL_BACKEND_MANUALLY( MINIMUM, CPU, cpu_minimum_setup ); + //REGISTER_KERNEL_BACKEND_MANUALLY( MINIMUM, VX, vx_minimum_setup ); +#endif +#endif + return status; +} + +void vsi_nn_kernel_backend_deinit() +{ + vsi_nn_hashmap_t* backends = _backends(); + vsi_nn_hashmap_item_t* p = vsi_nn_hashmap_iter( backends, NULL ); + vsi_nn_hashmap_item_t* next; + while( p ) + { 
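+        /* Free each registered backend descriptor (stored in p->data); the hashmap entries themselves are released by vsi_nn_hashmap_release() below. */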
+ next = vsi_nn_hashmap_iter( backends, p ); + free( p->data ); + p = next; + } + vsi_nn_hashmap_release( &backends ); +} /* vsi_nn_kernel_backend_deinit() */ + diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c new file mode 100644 index 0000000..7be998c --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c @@ -0,0 +1,564 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include "vsi_nn_tensor.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +typedef enum +{ + ELTWISE_BROADCAST_STATE_BROADCAST_X = 0, + ELTWISE_BROADCAST_STATE_BROADCAST_Y = 1, + ELTWISE_BROADCAST_STATE_BROADCAST_XY = 2, + ELTWISE_BROADCAST_STATE_NO_BROADCAST = 4, + ELTWISE_BROADCAST_STATE_EMPTY = 8, +} eltwise_broadcast_state_e; + +#if 0 +static size_t vsi_nn_compute_element_num + ( const int32_t* shape, const size_t rank); +#endif + +static size_t eltwise_fill_dim + ( + int32_t* shape_x, int32_t* shape_y, + int32_t* shape_output, size_t rank, + size_t max_rank, int32_t size_x, int32_t size_y, + int32_t size_output + ); + +static vsi_bool compute_gpu_divisor + ( + const int32_t input_value, + const int32_t limit, + const int32_t gcd, + int32_t* divisor + ); + +static size_t eltwise_fill_dim + ( + int32_t* shape_x, int32_t* shape_y, + int32_t* shape_output, size_t rank, + size_t max_rank, int32_t size_x, int32_t size_y, + int32_t size_output + ); + +#if 0 +static size_t vsi_nn_compute_element_num + ( const int32_t* shape, const size_t rank) +{ + size_t i; + size_t element = 1; + for( i = 0; i < rank; i ++ ) + { + element *= shape[i]; + } + return element; +} +#endif + +static vsi_bool compute_gpu_divisor + ( + const int32_t input_value, + const int32_t limit, + const int32_t gcd, + int32_t* divisor + ) +{ + int32_t i = 0; + for( i = vsi_nn_min( input_value, limit - 1 ); i > 0; i -- ) + { + if( ( i % gcd == 0 ) && ( input_value % i == 0 ) ) + { + *divisor = i; + return TRUE; + } + } + return FALSE; +} /* compute_gpu_divisor */ + +static size_t eltwise_fill_dim + ( + int32_t* shape_x, int32_t* shape_y, + int32_t* shape_output, size_t rank, + size_t max_rank, int32_t size_x, int32_t size_y, + int32_t size_output + ) +{ + size_t cost_size = 1; + 
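+    /* Write the next dimension of each shape at index `rank`, splitting it
+     * into two dimensions when size_output would otherwise exceed
+     * GPU_TENSOR_MAX_WIDTH and a suitable divisor exists; the return value
+     * is the number of dimensions written (1 or 2). */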
VSI_ASSERT( rank <= max_rank ); + VSI_ASSERT( size_output >= (int32_t)((int64_t)(0xFFFFFFFF) - 1) ); + if( size_output < GPU_TENSOR_MAX_WIDTH ) + { + shape_x[rank] = size_x; + shape_y[rank] = size_y; + shape_output[rank] = size_output; + } + else + { + int32_t divisor = 0; + int32_t remainder = 0; + compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); + remainder = size_output / divisor; + if( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) + { + // Cannot optimize. + shape_x[rank] = size_x; + shape_y[rank] = size_y; + shape_output[rank] = size_output; + } + else + { + /* + * We've limit the max size to 2^32 -1(Almost 4G * sizeof(data type)), + * so it should be always 2. + */ + cost_size = 2; + if( size_x > 1 ) + { + shape_x[rank] = divisor; + shape_x[rank + 1] = remainder; + } + else + { + shape_x[rank] = 1; + shape_x[rank + 1] = 1; + } + if( size_y > 1 ) + { + shape_y[rank] = divisor; + shape_y[rank + 1] = remainder; + } + else + { + shape_y[rank] = 1; + shape_y[rank + 1] = 1; + } + shape_output[rank] = divisor; + shape_output[rank + 1] = remainder; + } + } + return cost_size; +} /* eltwise_fill_dim() */ + +vsi_bool vsi_nn_kernel_optimize_eltwise_shape + ( + const int32_t* shape_x, const size_t rank_x, + const int32_t* shape_y, const size_t rank_y, + const int32_t* shape_output, const size_t rank_output, + int32_t* out_shape_x, int32_t* out_shape_y, + int32_t* out_shape_output, uint32_t* out_rank_output + ) +{ + vsi_bool ret = TRUE; + vsi_bool append_dim = FALSE; + size_t i = 0; + size_t dims = 0; + int32_t effective_size_x = 1; + int32_t effective_size_y = 1; + int32_t tmp_sz = 0; + int32_t sx = 0; + int32_t sy = 0; + eltwise_broadcast_state_e state = ELTWISE_BROADCAST_STATE_EMPTY; + eltwise_broadcast_state_e prv_state = ELTWISE_BROADCAST_STATE_EMPTY; + +#define _swap_size(a, b, tmp) \ + do { \ + tmp = a; \ + a = b; \ + b = tmp; \ + } while(0) + for( i = 0; i < rank_output; i++ ) + { + sx = i < rank_x ? shape_x[i] : 1; + sy = i < rank_y ? shape_y[i] : 1; + + /* + * Skip dim if the size is equal to 1 + * Also skip if( sx == 1 && sy == 1 ) + */ + if( shape_output[i] == 1 ) + { + continue; + } + // Invalid shape for broadcasting + if( sx != sy && sx > 1 && sy > 1 ) + { + ret = FALSE; + break; + } + // Update state + state = ELTWISE_BROADCAST_STATE_EMPTY; + if( sx == sy ) + { + state = ELTWISE_BROADCAST_STATE_NO_BROADCAST; + } + else if( sx == 1 ) + { + state = ELTWISE_BROADCAST_STATE_BROADCAST_X; + } + else if( sy == 1 ) + { + state = ELTWISE_BROADCAST_STATE_BROADCAST_Y; + } + else + { + VSI_ASSERT( FALSE ); + } + if( prv_state == ELTWISE_BROADCAST_STATE_EMPTY ) + { + effective_size_x *= sx; + effective_size_y *= sy; + prv_state = state; + continue; + } + append_dim = FALSE; +#define _pack_state( prev_state, cur_state ) (prev_state << 16 | cur_state) + switch( _pack_state( prv_state, state ) ) + { + /* + * ...,x1,x2,... + * ...,y1,y2,... + */ + case _pack_state( ELTWISE_BROADCAST_STATE_NO_BROADCAST, ELTWISE_BROADCAST_STATE_NO_BROADCAST ): + effective_size_x *= sx; + effective_size_y *= sy; + break; + /* + * ..., 1, 1,... + * ...,y1,y2,... + */ + case _pack_state( ELTWISE_BROADCAST_STATE_BROADCAST_X, ELTWISE_BROADCAST_STATE_BROADCAST_X ): + effective_size_y *= sy; + break; + /* + * ...,x1,x2,... + * ..., 1, 1,... + */ + case _pack_state( ELTWISE_BROADCAST_STATE_BROADCAST_Y, ELTWISE_BROADCAST_STATE_BROADCAST_Y ): + effective_size_x *= sx; + break; + + /* + * ...,x1, 1,... + * ...,y1,y2,... + * + * ...,x1,x2,... + * ...,y1, 1,... + * + * ..., 1,x2,... 
+ * ...,y1, 1,... + * + * ..., 1,x2,... + * ...,y1,y2,... + * + * ...,x1, 1,... + * ..., 1,y2,... + * + * ...,x1,x2,... + * ..., 1,y2,... + */ + case _pack_state( ELTWISE_BROADCAST_STATE_NO_BROADCAST, ELTWISE_BROADCAST_STATE_BROADCAST_X ): + case _pack_state( ELTWISE_BROADCAST_STATE_NO_BROADCAST, ELTWISE_BROADCAST_STATE_BROADCAST_Y ): + case _pack_state( ELTWISE_BROADCAST_STATE_BROADCAST_X, ELTWISE_BROADCAST_STATE_BROADCAST_Y ): + case _pack_state( ELTWISE_BROADCAST_STATE_BROADCAST_X, ELTWISE_BROADCAST_STATE_NO_BROADCAST ): + case _pack_state( ELTWISE_BROADCAST_STATE_BROADCAST_Y, ELTWISE_BROADCAST_STATE_BROADCAST_X ): + case _pack_state( ELTWISE_BROADCAST_STATE_BROADCAST_Y, ELTWISE_BROADCAST_STATE_NO_BROADCAST ): + _swap_size(sx, effective_size_x, tmp_sz); + _swap_size(sy, effective_size_y, tmp_sz); + append_dim = TRUE; + break; + default: + VSILOGE("Get error state (%d -> %d) while computing broadcast shape.", + prv_state, state); + VSI_ASSERT( FALSE ); + break; + } +#undef _pack_state + prv_state = state; + if( append_dim ) + { + dims += eltwise_fill_dim( out_shape_x, out_shape_y, out_shape_output, + dims, VSI_NN_MAX_DIM_NUM, sx, sy, vsi_nn_max( sx, sy ) ); + } + } + if( ret ) + { + /* Append the last dim */ + if( i == rank_output ) + { + sx = effective_size_x; + sy = effective_size_y; + dims += eltwise_fill_dim( out_shape_x, out_shape_y, out_shape_output, + dims, VSI_NN_MAX_DIM_NUM, sx, sy, vsi_nn_max( sx, sy ) ); + } + /* Avoid 1D shape*/ + if( 1 == dims ) + { + out_shape_x[1] = 1; + out_shape_y[1] = 1; + out_shape_output[1] = 1; + dims = 2; + } + /* For debug */ +#if DEBUG + vsi_nn_print_int_array( out_shape_x, dims ); + vsi_nn_print_int_array( out_shape_y, dims ); + vsi_nn_print_int_array( out_shape_output, dims ); +#endif + *out_rank_output = (uint32_t)dims; + } +#undef _swap_size + return ret; +} /* vsi_nn_kernel_optimize_eltwise_shape() */ + + + +static size_t broadcast_fill_dim + ( + int32_t** shape_in, int32_t input_num, + int32_t* shape_output, size_t rank, + size_t max_rank, int32_t* size_in, + int32_t size_output + ) +{ + int32_t i = 0; + size_t cost_size = 1; + VSI_ASSERT( rank <= max_rank ); + VSI_ASSERT( size_output >= (int32_t)((int64_t)(0xFFFFFFFF) - 1) ); + if( size_output < GPU_TENSOR_MAX_WIDTH ) + { + for (i = 0; i < input_num; i++) + { + shape_in[i][rank] = size_in[i]; + } + shape_output[rank] = size_output; + } + else + { + int32_t divisor = 0; + int32_t remainder = 0; + compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); + remainder = size_output / divisor; + if( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) + { + // Cannot optimize. + for (i = 0; i < input_num; i++) + { + shape_in[i][rank] = size_in[i]; + } + shape_output[rank] = size_output; + } + else + { + /* + * We've limit the max size to 2^32 -1(Almost 4G * sizeof(data type)), + * so it should be always 2. 
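+             * Every input whose size is greater than one receives the same
+             * {divisor, remainder} pair, while size-one inputs stay {1, 1},
+             * so the broadcast relationship with the output is preserved.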
+ */ + cost_size = 2; + for (i = 0; i < input_num; i++) + { + if (size_in[i] > 1) + { + shape_in[i][rank] = divisor; + shape_in[i][rank + 1] = remainder; + } + else + { + shape_in[i][rank] = 1; + shape_in[i][rank + 1] = 1; + } + } + shape_output[rank] = divisor; + shape_output[rank + 1] = remainder; + } + } + return cost_size; +} /* broadcast_fill_dim() */ + +vsi_bool vsi_nn_kernel_optimize_broadcast_shape + ( + const int32_t** shape_in, const size_t* rank_in, + const int32_t input_num, + const int32_t* shape_output, const size_t rank_output, + int32_t** out_shape_in, + int32_t* out_shape_output, uint32_t* out_rank_output + ) +{ +#define MAX_INPUT_NUM 30 + vsi_bool ret = TRUE; + vsi_bool append_dim = FALSE; + size_t i = 0; + size_t j = 0; + size_t k = 0; + size_t dims = 0; + int32_t effective_size[MAX_INPUT_NUM] = {1}; + int32_t tmp_sz = 0; + int32_t size_in[MAX_INPUT_NUM] = {0}; + int32_t state_mask = 0; + int32_t prv_state_mask = -1; + +#define _swap_size(a, b, tmp) \ + do { \ + tmp = a; \ + a = b; \ + b = tmp; \ + } while(0) + + if (input_num > MAX_INPUT_NUM) + { + VSILOGE("Max support input num is %d, while input num is %d.", + MAX_INPUT_NUM, input_num); + ret = FALSE; + goto final; + } + + for (i = 0; i < (size_t)input_num; i++) + { + effective_size[i] = 1; + } + + for( i = 0; i < rank_output; i++ ) + { + for (j = 0; j < (size_t)input_num; j++) + { + size_in[j] = i < rank_in[j] ? shape_in[j][i] : 1; + } + /* + * Skip dim if the size is equal to 1 + */ + if( shape_output[i] == 1 ) + { + continue; + } + + // Invalid shape for broadcasting + k = 0; + for (j = 0; j < (size_t)input_num; j++) + { + if (size_in[k] > 1) + { + k = j; + break; + } + } + + for (j = 0; j < (uint32_t)input_num; j++) + { + if ((size_in[k] != size_in[j]) + && (size_in[j] > 1)) + { + ret = FALSE; + goto final; + } + } + + state_mask = 0; + for (j = 0; j < (size_t)input_num; j++) + { + if (1 == size_in[j]) + { + state_mask |= (1 << j); + } + } + + append_dim = FALSE; + + if ((-1 == prv_state_mask) || (state_mask == prv_state_mask)) + { + for (j = 0; j < (size_t)input_num; j++) + { + effective_size[j] *= size_in[j]; + } + } + else + { + for (j = 0; j < (size_t)input_num; j++) + { + _swap_size(size_in[j], effective_size[j], tmp_sz); + } + append_dim = TRUE; + } + + prv_state_mask = state_mask; + + if( append_dim ) + { + int32_t size_output; + size_output = size_in[0]; + for (j = 1; j < (size_t)input_num; j++) + { + size_output = vsi_nn_max(size_output, size_in[j]); + } + dims += broadcast_fill_dim(out_shape_in, input_num, out_shape_output, + dims, VSI_NN_MAX_DIM_NUM, size_in, size_output); + } + } + + if( ret ) + { + /* Append the last dim */ + if( i == rank_output ) + { + int32_t size_output; + size_output = effective_size[0]; + for (j = 1; j < (size_t)input_num; j++) + { + size_output = vsi_nn_max(size_output, effective_size[j]); + } + dims += broadcast_fill_dim(out_shape_in, input_num, out_shape_output, + dims, VSI_NN_MAX_DIM_NUM, effective_size, size_output); + } + /* Avoid 1D shape*/ + if( 1 == dims ) + { + for (j = 0; j < (size_t)input_num; j++) + { + out_shape_in[j][1] = 1; + } + out_shape_output[1] = 1; + dims = 2; + } + else + { + for (j = 0; j < (size_t)input_num; j++) + { + for ( i = 0; i < dims; i++) + { + if ( out_shape_in[j][i] == 0 ) + out_shape_in[j][i] = 1; + } + } + } + + *out_rank_output = (uint32_t)dims; + } + +#undef _swap_size +#undef MAX_INPUT_NUM +final: + return ret; +} /* vsi_nn_kernel_optimize_broadcast_shape() */ \ No newline at end of file diff --git 
a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c new file mode 100644 index 0000000..f8d23a0 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c @@ -0,0 +1,510 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include "vsi_nn_tensor.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +static vsi_bool compute_gpu_divisor + ( + const int32_t input_value, + const int32_t limit, + const int32_t gcd, + int32_t* divisor + ); + +static size_t element_fill_dim + ( + int32_t* shape_x, size_t rank_x, + size_t max_rank, int32_t size_x + ); + +static vsi_bool compute_gpu_divisor + ( + const int32_t input_value, + const int32_t limit, + const int32_t gcd, + int32_t* divisor + ) +{ + int32_t i = 0; + for( i = vsi_nn_min( input_value, limit - 1 ); i > 0; i -- ) + { + if( ( i % gcd == 0 ) && ( input_value % i == 0 ) ) + { + *divisor = i; + return TRUE; + } + } + return FALSE; +} /* compute_gpu_divisor */ + +static size_t element_fill_dim + ( + int32_t* shape_x, size_t rank_x, + size_t max_rank, int32_t size_x + ) +{ + size_t cost_size = 1; + VSI_ASSERT( rank_x <= max_rank ); + VSI_ASSERT( size_x >= (int32_t)((int64_t)(0xFFFFFFFF) - 1) ); + + if (size_x == 1) + return 0; + + if( size_x < GPU_TENSOR_MAX_WIDTH) + { + shape_x[rank_x] = size_x; + } + else + { + int32_t divisor = 0; + int32_t remainder = 0; + compute_gpu_divisor( size_x, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); + remainder = size_x / divisor; + if( remainder > GPU_TENSOR_MAX_WIDTH || rank_x >= max_rank) + { + // Cannot optimize. + shape_x[rank_x] = size_x; + } + else + { + /* + * We've limit the max size to 2^32 -1(Almost 4G * sizeof(data type)), + * so it should be always 2. 
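+             * For illustration only, assuming GPU_TENSOR_MAX_WIDTH were
+             * 65536: a flattened size_x of 1048576 would be split into
+             * divisor 32768 and remainder 32, i.e. dimensions {32768, 32}.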
+ */ + cost_size = 2; + if( size_x > 1 ) + { + shape_x[rank_x] = divisor; + shape_x[rank_x + 1] = remainder; + } + else + { + shape_x[rank_x] = 1; + shape_x[rank_x + 1] = 1; + } + } + } + return cost_size; +} /* element_fill_dim() */ + +/*only for continuous axises or one axis*/ +vsi_bool vsi_nn_kernel_optimize_reduce_shape + ( + const int32_t* shape_x, const size_t rank_x, + const int32_t *axis, const size_t axis_size, + const int32_t* shape_output, const size_t rank_output, + int32_t* out_shape_x, uint32_t* out_rank_x, + int32_t* out_shape_output, uint32_t* out_rank_output, + int32_t* out_axis, uint32_t* out_axis_size + ) +{ + vsi_bool ret = TRUE; + size_t i = 0; + size_t rank_in = 0; + size_t rank_out = 0; + size_t dims = 0; + int32_t innerSize = 1; + int32_t outerSize = 1; + int32_t axisSize = 1; + + for (i = 0; i < axis_size; i++) + { + axisSize *= shape_x[axis[i]]; + } + + for (i = 0; i < (size_t)axis[0]; i++) + { + innerSize *= shape_x[i]; + } + + for (i = axis[axis_size - 1] + 1; i < rank_x; i++) + { + outerSize *= shape_x[i]; + } + + rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, innerSize); + rank_out += element_fill_dim(out_shape_output, rank_out, GPU_TENSOR_MAX_WIDTH, innerSize); + dims = element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, axisSize); + if (dims == 0) + { + out_axis[0] = (int32_t)rank_in; + *out_axis_size = 1; + out_shape_x[rank_in ++] = 1; + } + else + { + *out_axis_size = (uint32_t)dims; + for (i = 0; i < dims; i++) + { + out_axis[i] = (int32_t)rank_in + (int32_t)i; + } + } + + rank_in += dims; + + rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, outerSize); + rank_out += element_fill_dim(out_shape_output, rank_out, GPU_TENSOR_MAX_WIDTH, outerSize); + + if( 0 == rank_in ) + { + out_shape_x[0] = 1; + out_shape_x[1] = 1; + rank_in = 2; + } + else if( 1 == rank_in ) + { + out_shape_x[1] = 1; + rank_in = 2; + } + + if( 0 == rank_out ) + { + out_shape_output[0] = 1; + out_shape_output[1] = 1; + rank_out = 2; + } + else if( 1 == rank_out ) + { + out_shape_output[1] = 1; + rank_out = 2; + } + + *out_rank_x = (uint32_t)rank_in; + *out_rank_output = (uint32_t)rank_out; + + return ret; +} /* vsi_nn_kernel_optimize_reduce_shape() */ + +vsi_bool vsi_nn_kernel_optimize_element_shape + ( + const int32_t* shape_x, const size_t rank_x, + int32_t* out_shape_x, int32_t* out_rank_x + ) +{ + vsi_bool ret = TRUE; + uint32_t i = 0; + size_t rank_in = 0; + int32_t element_num = 1; + + for (i = 0; i < rank_x; i++) + { + element_num *= shape_x[i]; + } + + rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, element_num); + + if( 0 == rank_in ) + { + out_shape_x[0] = 1; + out_shape_x[1] = 1; + rank_in = 2; + } + else if( 1 == rank_in ) + { + out_shape_x[1] = 1; + rank_in = 2; + } + + *out_rank_x = (int32_t)rank_in; + + return ret; +} /* vsi_nn_kernel_optimize_element_shape() */ + +vsi_bool vsi_nn_kernel_optimize_softmax_shape + ( + const int32_t* shape_x, const size_t rank_x, const int32_t axis, + int32_t* out_shape_x, uint32_t* out_rank_x,int32_t* out_axis + ) +{ + vsi_bool ret = TRUE; + size_t i = 0; + size_t rank_in = 0; + size_t dims = 0; + int32_t innerSize = 1; + int32_t outerSize = 1; + int32_t axisSize = shape_x[axis]; + + for (i = 0; i < (size_t)axis; i++) + { + innerSize *= shape_x[i]; + } + + for (i = axis + 1; i < rank_x; i++) + { + outerSize *= shape_x[i]; + } + + rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, innerSize); + dims = element_fill_dim(out_shape_x, rank_in, 
GPU_TENSOR_MAX_WIDTH, axisSize); + if (dims == 0) + { + *out_axis = (int32_t)rank_in; + out_shape_x[rank_in ++] = 1; + } + else + { + *out_axis = (int32_t)rank_in; + } + + rank_in += dims; + + rank_in += element_fill_dim(out_shape_x, rank_in, GPU_TENSOR_MAX_WIDTH, outerSize); + + if( 0 == rank_in ) + { + out_shape_x[0] = 1; + out_shape_x[1] = 1; + rank_in = 2; + } + else if( 1 == rank_in ) + { + out_shape_x[1] = 1; + rank_in = 2; + } + + *out_rank_x = (uint32_t)rank_in; + + return ret; +} /* vsi_nn_kernel_optimize_softmax_shape() */ + + +typedef enum +{ + TILE_STATE_AXIS_X = 0, + TILE_STATE_AXIS_Y = 1, + TILE_STATE_AXIS_XY = 2, + TILE_STATE_NO_AXIS = 4, + TILE_STATE_EMPTY = 8, +} tile_axis_state_e; + +static size_t tile_fill_dim + ( + int32_t* shape_x, int32_t* shape_y, + int32_t* shape_output, size_t rank, + size_t max_rank, int32_t size_x, int32_t size_y, + int32_t size_output + ) +{ + size_t cost_size = 1; + VSI_ASSERT( rank <= max_rank ); + VSI_ASSERT( size_output >= (int32_t)((int64_t)(0xFFFFFFFF) - 1) ); + if( size_output < GPU_TENSOR_MAX_WIDTH ) + { + shape_x[rank] = size_x; + shape_y[rank] = size_y; + shape_output[rank] = size_output; + } + else + { + int32_t divisor = 0; + int32_t remainder = 0; + compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); + remainder = size_output / divisor; + if( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) + { + // Cannot optimize. + shape_x[rank] = size_x; + shape_y[rank] = size_y; + shape_output[rank] = size_output; + } + else + { + /* + * We've limit the max size to 2^32 -1(Almost 4G * sizeof(data type)), + * so it should be always 2. + */ + cost_size = 2; + if( size_x > 1 ) + { + shape_x[rank] = divisor; + shape_x[rank + 1] = remainder; + } + else + { + shape_x[rank] = 1; + shape_x[rank + 1] = 1; + } + if( size_y > 1 ) + { + shape_y[rank] = divisor; + shape_y[rank + 1] = remainder; + } + else + { + shape_y[rank] = 1; + shape_y[rank + 1] = 1; + } + shape_output[rank] = divisor; + shape_output[rank + 1] = remainder; + } + } + return cost_size; +} /* eltwise_fill_dim() */ + +vsi_bool vsi_nn_kernel_optimize_tile_shape + ( + const int32_t* shape_x, const size_t rank_x, + const int32_t* multiples, const size_t rank, + const int32_t* shape_output, const size_t rank_output, + int32_t* out_shape_x, int32_t* out_shape_y, + int32_t* out_shape_output, uint32_t* out_rank_output + ) +{ + vsi_bool ret = TRUE; + vsi_bool append_dim = FALSE; + size_t i = 0; + size_t dims = 0; + int32_t effective_size_x = 1; + int32_t effective_size_y = 1; + int32_t effective_size_z = 1; + int32_t sx = 0; + int32_t sy = 0; + int32_t sz = 0; + tile_axis_state_e state = TILE_STATE_EMPTY; + tile_axis_state_e next_state = TILE_STATE_EMPTY; + +#define _swap_size(a, b, tmp) \ + do { \ + tmp = a; \ + a = b; \ + b = tmp; \ + } while(0) + for( i = 0; i < rank_output; i++ ) + { + sx = shape_x[i]; + sy = multiples[i]; + sz = shape_output[i]; + /* + * Skip dim if the size is equal to 1 + * Also skip if( sx == 1 && sy == 1 ) + */ + if( shape_output[i] == 1 ) + { + continue; + } + + // Update state + state = TILE_STATE_EMPTY; + if( sx == sz ) + { + state = TILE_STATE_NO_AXIS; + } + else if( sx != sz ) + { + state = TILE_STATE_AXIS_X; + } + else + { + VSI_ASSERT( FALSE ); + } + + next_state = (i + 1) < rank_output ? + (multiples[i + 1] == 1 ? 
TILE_STATE_NO_AXIS : TILE_STATE_AXIS_X) : TILE_STATE_EMPTY; + + append_dim = FALSE; +#define _pack_state( cur_state, next_state ) (next_state << 16 | cur_state) + switch( _pack_state( state, next_state ) ) + { + case _pack_state( TILE_STATE_NO_AXIS, TILE_STATE_NO_AXIS ): + case _pack_state( TILE_STATE_NO_AXIS, TILE_STATE_EMPTY ): + effective_size_x *= sx; + effective_size_y *= sy; + effective_size_z *= sz; + break; + /* + * ...,x1,x2,... + * ...,y1,y2,... + */ + case _pack_state( TILE_STATE_AXIS_X, TILE_STATE_AXIS_X ): + case _pack_state( TILE_STATE_AXIS_X, TILE_STATE_NO_AXIS ): + case _pack_state( TILE_STATE_AXIS_X, TILE_STATE_EMPTY ): + append_dim = TRUE; + break; + /* + * ...,x1, 1,... + * ...,y1,y2,... + * + * ..., 1,x2,... + * ...,y1,y2,... + * + */ + case _pack_state( TILE_STATE_NO_AXIS, TILE_STATE_AXIS_X ): + effective_size_x *= sx; + effective_size_y *= sy; + effective_size_z *= sz; + sx = effective_size_x; + sy = effective_size_y; + sz = effective_size_z; + effective_size_x = 1; + effective_size_y = 1; + effective_size_z = 1; + append_dim = TRUE; + break; + default: + VSILOGE("Get error state (%d -> %d) while computing broadcast shape.", + state, next_state); + VSI_ASSERT( FALSE ); + break; + } +#undef _pack_state + if( append_dim ) + { + dims += tile_fill_dim( out_shape_x, out_shape_y, out_shape_output, + dims, VSI_NN_MAX_DIM_NUM, sx, sy, sz ); + } + } + if( ret ) + { + /* Append the last dim */ + if( i == rank_output ) + { + sx = effective_size_x; + sy = effective_size_y; + sz = effective_size_z; + dims += tile_fill_dim( out_shape_x, out_shape_y, out_shape_output, + dims, VSI_NN_MAX_DIM_NUM, sx, sy, sz ); + } + /* Avoid 1D shape*/ + if( 1 == dims ) + { + out_shape_x[1] = 1; + out_shape_y[1] = 1; + out_shape_output[1] = 1; + dims = 2; + } + /* For debug */ +#if DEBUG + vsi_nn_print_int_array( out_shape_x, dims ); + vsi_nn_print_int_array( out_shape_y, dims ); + vsi_nn_print_int_array( out_shape_output, dims ); +#endif + *out_rank_output = (uint32_t)dims; + } +#undef _swap_size + return ret; +} /* vsi_nn_kernel_optimize_eltwise_shape() */ diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_node.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_node.c new file mode 100644 index 0000000..d45bf2d --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_node.c @@ -0,0 +1,143 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_types.h" +#include "kernel/vsi_nn_kernel.h" + +vsi_nn_kernel_tensor_t kernel_pad_node + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_tensor_t tensor, + int32_t * pad_front, + int32_t * pad_end, + size_t pad_size, + vsi_nn_pad_mode_e mode, + int32_t pad_value, + vsi_nn_kernel_node_t * out_node + ) +{ + vsi_nn_kernel_tensor_attr_t * attr = NULL; + vsi_nn_kernel_tensor_t out_tensor = NULL; + vsi_nn_kernel_node_t node = NULL; + vx_nn_pad_params_t pad_param; + int32_t i; + + // Compute pad size + for( i = (int32_t)pad_size - 1; i >= 0; i -- ) + { + if( pad_front[i] > 0 || pad_end[i] > 0 ) + { + break; + } + } + pad_size = (size_t)i + 1; + if( pad_size > 2 ) + { + VSILOGE("Not support pad size > 2."); + return NULL; + } + else if( pad_size == 0 ) + { + VSILOGE("No need to pad."); + return NULL; + } + memset( &pad_param, 0, sizeof( pad_param ) ); + + switch( mode ) + { + case VSI_NN_PAD_MODE_CONSTANT: + pad_param.pad_mode = VX_PAD_CONSTANT; + break; + case VSI_NN_PAD_MODE_REPLICATE: + pad_param.pad_mode = VX_PAD_REPLICATE; + break; + case VSI_NN_PAD_MODE_SYMMETRIC: + pad_param.pad_mode = VX_PAD_MIRROR_SYMMETRIC; + break; + case VSI_NN_PAD_MODE_REFLECT: + pad_param.pad_mode = VX_PAD_MIRROR_REFLECT; + break; + default: + VSILOGE("Wrong pad_mode %d", mode); + break; + } + pad_param.pad_const = (vx_scalar)vsi_nn_kernel_scalar_create( graph, I32, &pad_value ); + pad_param.numViewDimensions = (vx_uint8)pad_size; + pad_param.pad_front_array = pad_front; + pad_param.pad_back_array = pad_end; + + attr = vsi_nn_kernel_tensor_attr_create( tensor ); + CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); + // Compute new size + if( pad_size > attr->shape->size ) + { + VSILOGE("Pad size %lu is greater than tensor's rank %lu", + pad_size, attr->shape->size ); + goto final; + } + for( i = 0; i < (int32_t)pad_size; i ++ ) + { + attr->shape->data[i] += pad_front[i] + pad_end[i]; + } + out_tensor = vsi_nn_kernel_tensor_create( graph->g, attr, TRUE ); + CHECK_PTR_FAIL_GOTO( out_tensor, "Create pad tensor fail.", final ); + node = (vsi_nn_kernel_node_t)vxTensorPadNode( graph->g, + (vx_tensor)tensor, (vx_tensor)out_tensor, + &pad_param, sizeof( pad_param ) ); +final: + if( NULL == node ) { + VSILOGW("Create pad node fail."); + if( out_tensor ) + { + vsi_nn_kernel_tensor_release( &out_tensor ); + } + } + else + { + if( out_node ) + { + *out_node = node; + } + else + { + vxReleaseNode( (vx_node*)&node ); + } + } + if( pad_param.pad_const ) + { + vsi_nn_kernel_scalar_release( (vsi_nn_kernel_scalar_t*)&pad_param.pad_const ); + } + if( attr ) + { + vsi_nn_kernel_tensor_attr_release( &attr ); + } + return out_tensor; +} /* kernel_pad_node() */ + diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c new file mode 100644 index 0000000..12adee6 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c @@ -0,0 +1,198 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, 
distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include +#include "vsi_nn_prv.h" +#include "vsi_nn_types.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_context.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_hashmap.h" + +typedef enum +{ + _PARAM_I32 = 1, + _PARAM_I64, + _PARAM_F32, + _PARAM_BUFFER, + _PARAM_STR, +} _param_dtype_e; + +typedef struct +{ + _param_dtype_e type; + union + { + int32_t int32; + int64_t int64; + float float32; + void* buffer; + const char* str; + } value; + size_t size; +} _param_type; + +#define CHECK_PARAM_NULL( ptr, rval, ... ) \ + do { \ + if( ptr == NULL ) { \ + VSILOGE(__VA_ARGS__); \ + VSI_ASSERT(FALSE); \ + return rval; \ + } \ + } while(0) + +#define _PARAM_ADD_TEMPLATE(TYPE_NAME, TYPE, PARAM_DTYPE) \ + vsi_bool vsi_nn_kernel_param_add_##TYPE_NAME \ + (vsi_nn_kernel_param_t* params, const char* key, TYPE value) \ + { \ + _param_type* p; \ + CHECK_PARAM_NULL( params, FALSE, "Params is null ptr." ); \ + CHECK_PARAM_NULL( key, FALSE, "Param key is null ptr." ); \ + p = malloc( sizeof(_param_type) ); \ + CHECK_PARAM_NULL( p, FALSE, "Out of memory, add param fail." ); \ + p->type = PARAM_DTYPE; \ + p->value.TYPE_NAME = value; \ + p->size = sizeof( TYPE ); \ + vsi_nn_hashmap_add( params, key, p ); \ + return TRUE; \ + } +#define _PARAM_GET_TEMPLATE(TYPE_NAME, TYPE, DEFAULT_VALUE, PARAM_DTYPE) \ + TYPE vsi_nn_kernel_param_get_##TYPE_NAME \ + ( const vsi_nn_kernel_param_t* params, const char* key) \ + { \ + _param_type* p; \ + CHECK_PARAM_NULL( params, FALSE, "Params is null ptr." ); \ + CHECK_PARAM_NULL( key, FALSE, "Param key is null ptr." ); \ + p = vsi_nn_hashmap_get( params, key ); \ + if( p->type != PARAM_DTYPE ) { \ + VSILOGW("Key %s is not \"%s\"", key, ""#TYPE_NAME ); \ + } \ + CHECK_PARAM_NULL( p, DEFAULT_VALUE, "Key %s not in params.", key ); \ + return p->value.TYPE_NAME; \ + } + +_PARAM_ADD_TEMPLATE(int32, int32_t, _PARAM_I32) +_PARAM_ADD_TEMPLATE(int64, int64_t, _PARAM_I64) +_PARAM_ADD_TEMPLATE(float32, float, _PARAM_F32) +_PARAM_GET_TEMPLATE(int32, int32_t, 0, _PARAM_I32) +_PARAM_GET_TEMPLATE(int64, int64_t, 0, _PARAM_I64) +_PARAM_GET_TEMPLATE(float32, float, 0.0f, _PARAM_F32) +_PARAM_GET_TEMPLATE(str, const char*, NULL, _PARAM_STR) + +vsi_bool vsi_nn_kernel_param_add_str + ( + vsi_nn_kernel_param_t * params, + const char * key, + const char * value + ) +{ + _param_type* p; + CHECK_PARAM_NULL( params, FALSE, "Params is null ptr." ); + CHECK_PARAM_NULL( key, FALSE, "Param key is null ptr." ); + p = malloc( sizeof(_param_type) ); + CHECK_PARAM_NULL( p, FALSE, "Out of memory, add param fail." 
); + p->type = _PARAM_STR; + p->value.str = value; + p->size = strlen( value ); + vsi_nn_hashmap_add( params, key, p ); + return TRUE; +} /* vsi_nn_kernel_param_add_str() */ + +vsi_bool vsi_nn_kernel_param_add_buffer + ( + vsi_nn_kernel_param_t * params, + const char * key, + void * value, + size_t size + ) +{ + _param_type* p; + CHECK_PARAM_NULL( params, FALSE, "Params is null ptr." ); + CHECK_PARAM_NULL( key, FALSE, "Param key is null ptr." ); + p = malloc( sizeof(_param_type) ); + CHECK_PARAM_NULL( p, FALSE, "Out of memory, add param fail." ); + p->type = _PARAM_BUFFER; + p->value.buffer = value; + p->size = size; + vsi_nn_hashmap_add( params, key, p ); + return TRUE; +} /* vsi_nn_kernel_param_add_buffer() */ + +void* vsi_nn_kernel_param_get_buffer + ( const vsi_nn_kernel_param_t * params, const char * key, size_t * size) +{ + _param_type* p; + CHECK_PARAM_NULL( params, FALSE, "Params is null ptr." ); + CHECK_PARAM_NULL( key, FALSE, "Param key is null ptr." ); + p = vsi_nn_hashmap_get( params, key ); + CHECK_PARAM_NULL( p, 0, "Key %s not in params.", key ); + if( p->type != _PARAM_BUFFER ) + { + VSILOGW("Key %s is not \"buffer\"", key ); + } + if( size != NULL ) + { + *size = p->size; + } + return p->value.buffer; +} /* vsi_nn_kernel_param_get_buffer() */ + +vsi_nn_kernel_param_t* vsi_nn_kernel_param_create() +{ + return (vsi_nn_kernel_param_t*)vsi_nn_hashmap_create(); +} /* vsi_nn_kernel_param_create() */ + +void vsi_nn_kernel_param_release( vsi_nn_kernel_param_t ** params ) +{ + if( params && *params ) + { + vsi_nn_kernel_param_clear( *params ); + vsi_nn_hashmap_release( (vsi_nn_hashmap_t**)params ); + *params = NULL; + } +} /* vsi_nn_kernel_param_release() */ + +void vsi_nn_kernel_param_clear( vsi_nn_kernel_param_t * params ) +{ + if( params ) + { + vsi_nn_hashmap_t* hashmap = (vsi_nn_hashmap_t*)(params); + vsi_nn_hashmap_item_t* p = vsi_nn_hashmap_iter( hashmap, NULL ); + vsi_nn_hashmap_item_t* next; + while( p ) + { + next = vsi_nn_hashmap_iter( hashmap, p ); + free( p->data ); + p = next; + } + vsi_nn_hashmap_clear( hashmap ); + } +} /* vsi_nn_kernel_param_clear() */ + diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c new file mode 100644 index 0000000..3b446b8 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -0,0 +1,115 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_context.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_types.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +#define KERNEL_SELECTOR( kernel_name ) \ + static vsi_status _##kernel_name##_kernel_selector( \ + vsi_nn_graph_t*, \ + vsi_nn_tensor_t **, size_t, \ + vsi_nn_tensor_t **, size_t, \ + const vsi_nn_kernel_param_t *, \ + vsi_nn_kernel_selector_t * \ + ); \ + REGISTER_KERNEL_SELECTOR( kernel_name, _##kernel_name##_kernel_selector ) \ + static vsi_status _##kernel_name##_kernel_selector + +KERNEL_SELECTOR( depthwise_conv1d ) + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_selector_t * selector + ) +{ + vsi_nn_kernel_pirority_t pirority[] = { + { VSI_NN_KERNEL_TYPE_VX, 0 }, + { VSI_NN_KERNEL_TYPE_EVIS, 3 }, + { VSI_NN_KERNEL_TYPE_CL, 2 }, + { VSI_NN_KERNEL_TYPE_CPU, 1 }, + }; + return vsi_nn_kernel_pirority_set( selector, pirority, _cnt_of_array(pirority) ); +} /* depthwise_conv1d */ + + +static vsi_status _select + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_selector_t * selector + ) +{ + vsi_nn_kernel_pirority_t pirority[] = { + { VSI_NN_KERNEL_TYPE_VX, 3 }, + { VSI_NN_KERNEL_TYPE_EVIS, 2 }, + { VSI_NN_KERNEL_TYPE_CL, 1 }, + { VSI_NN_KERNEL_TYPE_CPU, 0 }, + }; + return vsi_nn_kernel_pirority_set( selector, pirority, _cnt_of_array(pirority) ); +} /* _select */ + +#define REGISTER_VX_FIRST_KERNEL_SELECTOR(kernel_name) \ + static vsi_status _##kernel_name##_kernel_selector( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_selector_t * selector \ + ) \ + { \ + return _select(graph, inputs, input_num, outputs, output_num, \ + params, selector); \ + } \ + REGISTER_KERNEL_SELECTOR( kernel_name, _##kernel_name##_kernel_selector ) + +REGISTER_VX_FIRST_KERNEL_SELECTOR(exp) +REGISTER_VX_FIRST_KERNEL_SELECTOR(log) +REGISTER_VX_FIRST_KERNEL_SELECTOR(elu) +REGISTER_VX_FIRST_KERNEL_SELECTOR(neg) +REGISTER_VX_FIRST_KERNEL_SELECTOR(mish) +REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_sigmoid) +REGISTER_VX_FIRST_KERNEL_SELECTOR(clip) +REGISTER_VX_FIRST_KERNEL_SELECTOR(relu_keras) + +__END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c new file mode 100644 index 0000000..79b3468 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c @@ -0,0 +1,645 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to 
use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef enum +{ + MEMORY_ACCESSOR_READ_ONLY = 0, + MEMORY_ACCESSOR_WRITE_ONLY = 1, +} mem_accessor_e; + +vsi_status _copy_tensor + ( + vsi_nn_kernel_tensor_t tensor, + const vsi_nn_kernel_tensor_attr_t * attr, + mem_accessor_e accessor, + void * buffer, + size_t buffer_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * internal_attr = NULL; + size_t rank; + size_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; + size_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; + size_t stride[VSI_NN_MAX_DIM_NUM] = { 0 }; + size_t type_bytes; + size_t total_bytes; + uint32_t i; + + if( !tensor || !buffer || !buffer_size ) + { + VSILOGE("Invalid parameter"); + return status; + } + if( !attr ) + { + internal_attr = vsi_nn_kernel_tensor_attr_create( tensor ); + CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr fail.", final ); + attr = internal_attr; + } + + total_bytes = vsi_nn_kernel_tensor_attr_get_bytes( attr ); + if( total_bytes != buffer_size ) + { + VSILOGE("Read buffer size mismatch %d vs %d", total_bytes, buffer_size); + goto final; + } + + vsi_nn_shape_get_stride( attr->shape->data, attr->shape->size, stride ); + type_bytes = vsi_nn_kernel_dtype_get_bytes( attr->dtype ); + rank = attr->shape->size; + for( i = 0; i < rank; i++ ) + { + start[i] = 0; + end[i] = attr->shape->data[i]; + stride[i] = stride[i] * type_bytes; + } + switch( accessor ) + { + case MEMORY_ACCESSOR_READ_ONLY: + status = vxCopyTensorPatch( (vx_tensor)tensor, rank, + start, end, stride, buffer, VX_READ_ONLY, 0); + break; + case MEMORY_ACCESSOR_WRITE_ONLY: + status = vxCopyTensorPatch( (vx_tensor)tensor, rank, + start, end, stride, buffer, VX_WRITE_ONLY, 0); + break; + default: + VSI_ASSERT( FALSE ); + break; + } + +final: + if( internal_attr ) + { + vsi_nn_kernel_tensor_attr_release( &internal_attr ); + } + return status; +} /* _copy_tensor() */ + +void * vsi_nn_kernel_tensor_create_buffer + ( + vsi_nn_kernel_tensor_t tensor, + const vsi_nn_kernel_tensor_attr_t * attr, + vsi_bool convert_to_float + ) +{ + vsi_status status = VSI_FAILURE; + void * buffer = NULL; + void * out_buffer = NULL; + size_t bytes; + size_t float_bytes; + size_t tensor_size = 0; + vsi_nn_kernel_tensor_attr_t * internal_attr = NULL; + + if( !tensor ) + { + return NULL; + } + + if( !attr ) + { + internal_attr = vsi_nn_kernel_tensor_attr_create( tensor ); + CHECK_PTR_FAIL_GOTO( internal_attr, "Create tensor attr fail.", final ); + attr = internal_attr; + } + bytes = 
vsi_nn_kernel_tensor_attr_get_bytes( attr ); + out_buffer = malloc( bytes ); + CHECK_PTR_FAIL_GOTO( out_buffer, "Out of memory, create buffer fail.", final ); + + status = vsi_nn_kernel_tensor_read( tensor, attr, out_buffer, bytes ); + if( status != VSI_SUCCESS ) + { + VSILOGE("Read tensor fail with error \"%s\".", vsi_nn_DescribeStatus(status)); + free( out_buffer ); + out_buffer = NULL; + goto final; + } + + if( convert_to_float && F32 != attr->dtype ) + { + buffer = out_buffer; + tensor_size = vsi_nn_kernel_tensor_attr_get_size( attr ); + float_bytes = tensor_size * sizeof(float); + out_buffer = malloc( float_bytes ); + if( !out_buffer ) + { + VSILOGE("Out of memory, create float buffer fail."); + free( buffer ); + buffer = NULL; + goto final; + } + if( vsi_nn_kernel_tensor_attr_is_quantized( attr ) ) + { + switch( attr->quant ) + { + case VSI_NN_KERNEL_QUANT_DFP: + vsi_nn_dtype_convert_quantize_dfp_to_float( + buffer, tensor_size, attr->dtype, + attr->dfp.fl, out_buffer ); + break; + case VSI_NN_KERNEL_QUANT_ASYMM: + vsi_nn_dtype_convert_quantize_asymm_to_float( + buffer, tensor_size, attr->dtype, + attr->asymm.scale, attr->asymm.zero_point, + out_buffer ); + break; + case VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL: + vsi_nn_dtype_convert_quantize_symm_perchannel_to_float( + buffer, tensor_size, attr->dtype, + attr->shape->data, attr->shape->size, + attr->asymm_v.scale->data, + attr->asymm_v.scale->size, + attr->asymm_v.zero_point->data, + attr->asymm_v.zero_point->size, + attr->asymm_v.channel_dim, + out_buffer ); + break; + default: + VSILOGE("Donot support quantize type %d", attr->quant); + VSI_ASSERT( FALSE ); + break; + } + } + else + { + vsi_nn_dtype_convert_dtype_to_float( buffer, tensor_size, + attr->dtype, out_buffer ); + } + free( buffer ); + } + +final: + if( internal_attr ) + { + vsi_nn_kernel_tensor_attr_release( &internal_attr ); + } + return out_buffer; +} /* vsi_nn_kernel_tensor_create_buffer() */ + +vsi_status vsi_nn_kernel_tensor_read + ( + vsi_nn_kernel_tensor_t tensor, + const vsi_nn_kernel_tensor_attr_t * attr, + void * out_buffer, + size_t out_buffer_size + ) +{ + return _copy_tensor( tensor, attr, MEMORY_ACCESSOR_READ_ONLY, + out_buffer, out_buffer_size ); +} /* vsi_nn_kernel_tensor_read() */ + +vsi_status vsi_nn_kernel_tensor_write + ( + vsi_nn_kernel_tensor_t tensor, + const vsi_nn_kernel_tensor_attr_t * attr, + const void * buffer, + size_t size + ) +{ + // NOTE: openvx api vxCopyTensorPatch access non-const buffer pointer, + // so here we convert const to non-const ptr. 
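+    // Illustrative usage sketch (buffer/size names are hypothetical): a
+    // caller with data already in the tensor's native dtype passes a buffer
+    // whose byte size equals vsi_nn_kernel_tensor_attr_get_bytes(attr), e.g.
+    //     vsi_nn_kernel_tensor_write( tensor, attr, raw, bytes );
+    // Float data should go through vsi_nn_kernel_tensor_write_from_float(),
+    // which converts to the tensor dtype (quantizing when needed) before
+    // reaching this copy.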
+ return _copy_tensor( tensor, attr, MEMORY_ACCESSOR_WRITE_ONLY, + (void*)buffer, size ); +} /* vsi_nn_kernel_tensor_write() */ + +vsi_status vsi_nn_kernel_tensor_write_from_float + ( + vsi_nn_kernel_tensor_t tensor, + const vsi_nn_kernel_tensor_attr_t * attr, + const float * float_buffer, + size_t size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * internal_attr = NULL; + size_t bytes; + const void * buffer = NULL; + void * internal_buffer = NULL; + size_t tensor_size = 0; + if( !attr ) + { + internal_attr = vsi_nn_kernel_tensor_attr_create( tensor ); + CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr fail.", final ); + attr = internal_attr; + } + bytes = vsi_nn_kernel_tensor_attr_get_bytes( attr ); + tensor_size = vsi_nn_kernel_tensor_attr_get_size( attr ); + if( tensor_size != size ) + { + VSILOGE("Tensor and buffer size mismatch %d vs %d", tensor_size, size); + goto final; + } + + if( attr->dtype != F32 ) + { + internal_buffer = malloc( bytes ); + CHECK_PTR_FAIL_GOTO( internal_buffer, "Create buffer fail.", final ); + if( vsi_nn_kernel_tensor_attr_is_quantized( attr ) ) + { + switch( attr->quant ) + { + case VSI_NN_KERNEL_QUANT_DFP: + vsi_nn_dtype_convert_float_to_quantize_dfp( + float_buffer, size, attr->dtype, + attr->dfp.fl, internal_buffer ); + break; + case VSI_NN_KERNEL_QUANT_ASYMM: + vsi_nn_dtype_convert_float_to_quantize_asymm( + float_buffer, size, attr->dtype, + attr->asymm.scale, attr->asymm.zero_point, + internal_buffer ); + break; + case VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL: + vsi_nn_dtype_convert_float_to_quantize_symm_perchannel( + float_buffer, size, attr->dtype, + attr->shape->data, attr->shape->size, + attr->asymm_v.scale->data, + attr->asymm_v.scale->size, + attr->asymm_v.zero_point->data, + attr->asymm_v.zero_point->size, + attr->asymm_v.channel_dim, + internal_buffer ); + break; + default: + VSILOGE("Donot support quantize type %d", attr->quant); + VSI_ASSERT( FALSE ); + break; + } + } + else + { + vsi_nn_dtype_convert_float_to_dtype( float_buffer, size, + attr->dtype, internal_buffer ); + } + buffer = (const void*)internal_buffer; + } + else + { + buffer = (const void*)float_buffer; + } + status = vsi_nn_kernel_tensor_write( tensor, attr, buffer, bytes ); +final: + if( internal_attr ) + { + vsi_nn_kernel_tensor_attr_release( &internal_attr ); + } + if( internal_buffer ) + { + free( internal_buffer ); + } + return status; +} /* vsi_nn_kernel_tensor_write_from_float() */ + +vsi_status vsi_nn_kernel_scalar_get_dtype + ( + vsi_nn_kernel_scalar_t scalar, + vsi_nn_kernel_dtype_e * dtype + ) +{ + vsi_status status; + vx_enum type; + if( !dtype ) + { + VSILOGW("Pointer to dtype is NULL"); + return VSI_FAILURE; + } + status = vxQueryScalar( (vx_scalar)scalar, VX_SCALAR_TYPE, &type, sizeof(vx_enum) ); + if( status == VSI_SUCCESS ) + { + *dtype = vsi_nn_kernel_map_dtype( (vsi_nn_type_e)type ); + } + return status; +} /* vsi_nn_kernel_scalar_get_dtype() */ + +#define DEF_KERNEL_SCALAR_FUNC( READ_FUNC_NAME, WRITE_FUNC_NAME, DTYPE, DTYPE_ID ) \ + vsi_status READ_FUNC_NAME \ + ( vsi_nn_kernel_scalar_t scalar, DTYPE * ptr ) \ + { \ + vsi_status status; \ + vsi_nn_kernel_dtype_e dtype; \ + if( !ptr ) \ + { \ + VSILOGE("Pointer to store scalar is null"); \ + return VSI_FAILURE; \ + } \ + status = vsi_nn_kernel_scalar_get_dtype( scalar, &dtype ); \ + if( dtype != DTYPE_ID ) \ + { \ + VSILOGE("Try read scalar type %d as %d", dtype, DTYPE_ID); \ + return VSI_FAILURE; \ + } \ + if( status == VSI_SUCCESS ) \ + { \ + status = vxCopyScalarWithSize( 
(vx_scalar)scalar, sizeof(DTYPE), \ + ptr, VX_READ_ONLY, VX_MEMORY_TYPE_HOST ); \ + } \ + return status; \ + } \ + vsi_status WRITE_FUNC_NAME \ + ( vsi_nn_kernel_scalar_t scalar, DTYPE data ) \ + { \ + vsi_status status; \ + status = vxCopyScalarWithSize( (vx_scalar)scalar, sizeof(DTYPE), \ + &data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST ); \ + return status; \ + } + +DEF_KERNEL_SCALAR_FUNC( vsi_nn_kernel_scalar_read_int8, + vsi_nn_kernel_scalar_write_int8, + int8_t, I8 ) +DEF_KERNEL_SCALAR_FUNC( vsi_nn_kernel_scalar_read_int32, + vsi_nn_kernel_scalar_write_int32, + int32_t, I32 ) +DEF_KERNEL_SCALAR_FUNC( vsi_nn_kernel_scalar_read_uint8, + vsi_nn_kernel_scalar_write_uint8, + uint8_t, U8 ) +DEF_KERNEL_SCALAR_FUNC( vsi_nn_kernel_scalar_read_uint32, + vsi_nn_kernel_scalar_write_uint32, + uint32_t, U32 ) +DEF_KERNEL_SCALAR_FUNC( vsi_nn_kernel_scalar_read_int64, + vsi_nn_kernel_scalar_write_int64, + int64_t, I64 ) +DEF_KERNEL_SCALAR_FUNC( vsi_nn_kernel_scalar_read_float32, + vsi_nn_kernel_scalar_write_float32, + float, F32 ) +DEF_KERNEL_SCALAR_FUNC( vsi_nn_kernel_scalar_read_float64, + vsi_nn_kernel_scalar_write_float64, + double, F64 ) +#undef DEF_KERNEL_SCALAR_FUNC + +static void _convert_tensor_attr_to_vx_tensor_param + ( + vx_tensor_create_params_t* p, + const vsi_nn_kernel_tensor_attr_t* attr + ) +{ + memset( p, 0, sizeof( vx_tensor_create_params_t ) ); + + p->num_of_dims = (uint32_t)attr->shape->size; + p->sizes = (uint32_t*)attr->shape->data; +#define MAP_TYPE( var, src_type, dst_type ) \ + case src_type: \ + var = dst_type; \ + break; + + switch( attr->dtype ) + { + MAP_TYPE( p->data_format, I8, VSI_NN_TYPE_INT8 ); + MAP_TYPE( p->data_format, I16, VSI_NN_TYPE_INT16 ); + MAP_TYPE( p->data_format, I32, VSI_NN_TYPE_INT32 ); + MAP_TYPE( p->data_format, I64, VSI_NN_TYPE_INT64 ); + MAP_TYPE( p->data_format, U8, VSI_NN_TYPE_UINT8 ); + MAP_TYPE( p->data_format, U16, VSI_NN_TYPE_UINT16 ); + MAP_TYPE( p->data_format, U32, VSI_NN_TYPE_UINT32 ); + MAP_TYPE( p->data_format, U64, VSI_NN_TYPE_UINT64 ); + MAP_TYPE( p->data_format, F16, VSI_NN_TYPE_FLOAT16 ); + MAP_TYPE( p->data_format, F32, VSI_NN_TYPE_FLOAT32 ); + MAP_TYPE( p->data_format, F64, VSI_NN_TYPE_FLOAT64 ); + MAP_TYPE( p->data_format, BF16, VSI_NN_TYPE_BFLOAT16 ); + MAP_TYPE( p->data_format, BOOL8, VSI_NN_TYPE_BOOL8 ); + default: + VSI_ASSERT( FALSE ); + break; + } + switch( attr->quant ) + { + MAP_TYPE( p->quant_format, + VSI_NN_KERNEL_QUANT_DFP, + VSI_NN_QNT_TYPE_DFP ); + MAP_TYPE( p->quant_format, + VSI_NN_KERNEL_QUANT_ASYMM, + VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ); + MAP_TYPE( p->quant_format, + VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL, + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC ); + default: + VSI_ASSERT( FALSE ); + break; + } + switch( attr->quant ) + { + case VSI_NN_KERNEL_QUANT_DFP: + p->quant_data.dfp.fixed_point_pos = (uint8_t)attr->dfp.fl; + break; + case VSI_NN_KERNEL_QUANT_ASYMM: + p->quant_data.affine.scale = attr->asymm.scale; + p->quant_data.affine.zeroPoint = attr->asymm.zero_point; + break; + //case VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL: + // break; + default: + VSI_ASSERT( FALSE ); + break; + } +} /* _convert_tensor_attr_to_vx_tensor_param() */ + +vsi_nn_kernel_tensor_t vsi_nn_kernel_tensor_create + ( + vsi_nn_kernel_graph_t graph, + const vsi_nn_kernel_tensor_attr_t* attr, + vsi_bool is_virtual + ) +{ + vsi_nn_kernel_tensor_t tensor = NULL; + vx_tensor_create_params_t params; + + _convert_tensor_attr_to_vx_tensor_param( ¶ms, attr ); + if( is_virtual ) + { + tensor = (vsi_nn_kernel_tensor_t)vxCreateVirtualTensor2( + (vx_graph)graph, 
¶ms, sizeof( vx_tensor_create_params_t ) ); + } + else + { + vx_context context = NULL; + context = vxGetContext((vx_reference)graph); + tensor = (vsi_nn_kernel_tensor_t)vxCreateTensor2( + context, ¶ms, sizeof( vx_tensor_create_params_t ) ); + } + return tensor; +} /* vsi_nn_kernel_tensor_create() */ + +vsi_nn_tensor_t* vsi_nn_pad_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + int32_t * pad_front, + int32_t * pad_end, + size_t pad_size, + vsi_nn_pad_mode_e mode, + float pad_value + ) +{ + uint32_t sz = 0; + vsi_nn_tensor_attr_t attr; + float *input_data_ptr = NULL; + float *output_data_ptr = NULL; + float *src_ptr = NULL; + float *dst_ptr = NULL; + int32_t i = 0; + int32_t out_w = 0; + int32_t out_h = 0; + int32_t out_d = 0; + int32_t out_b = 0; + int32_t output_width = 1; + int32_t output_height = 1; + int32_t output_depth = 1; + int32_t output_batch = 1; + vsi_nn_dtype_t dst_type; + vsi_nn_tensor_t *output = NULL; + + input_data_ptr = vsi_nn_ConvertTensorToFloat32Data(graph, input); + CHECK_PTR_FAIL_GOTO( input_data_ptr, "Create data ptr fail.", final ); + + memcpy(&attr, &input->attr, sizeof(vsi_nn_tensor_attr_t)); + + for(i = 0; i < (int32_t)pad_size; i ++) + { + int32_t front = pad_front[i]; + int32_t back = pad_end[i]; + + attr.size[i] = front + back + attr.size[i]; + } + + output_width = attr.size[0]; + output_height = attr.dim_num > 1 ? attr.size[1] : 1; + output_depth = attr.dim_num > 2 ? attr.size[2] : 1; + output_batch = attr.dim_num > 3 ? attr.size[3] : 1; + + sz = vsi_nn_GetTensorSize( attr.size, attr.dim_num, VSI_NN_TYPE_UINT8); + output_data_ptr = (float *)malloc( sz * sizeof(float)); + CHECK_PTR_FAIL_GOTO( output_data_ptr, "Create data ptr fail.", final ); + + dst_ptr = output_data_ptr; + src_ptr = input_data_ptr; + + for (out_b = 0; out_b < output_batch; ++out_b) + { + for (out_d = 0; out_d < output_depth; ++out_d) + { + for (out_h = 0; out_h < output_height; ++out_h) + { + for (out_w = 0; out_w < output_width; ++out_w) + { + if (out_b < pad_front[3] || + out_b >= output_batch - pad_end[3] || + out_d < pad_front[2] || + out_d >= output_depth - pad_end[2] || + out_h < pad_front[1] || + out_h >= output_height - pad_end[1] || + out_w < pad_front[0] || + out_w >= output_width - pad_end[0]) + { + *dst_ptr++ = pad_value; + } + else + { + *dst_ptr++ = *src_ptr++; + } + } + } + } + } + + output = vsi_nn_CreateTensor(graph, &attr); + CHECK_PTR_FAIL_GOTO( output, "Create tensor fail.", final ); + + memcpy(&dst_type, &attr.dtype, sizeof(vsi_nn_dtype_t)); + dst_type.vx_type = VSI_NN_TYPE_FLOAT32; + vsi_nn_CopyRawDataToTensor( graph, (uint8_t *)output_data_ptr, &dst_type, output ); +final: + if (input_data_ptr) + { + free(input_data_ptr); + input_data_ptr = NULL; + } + + if (output_data_ptr) + { + free(output_data_ptr); + output_data_ptr = NULL; + } + + return output; +} + + +vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias + ) +{ + vsi_nn_tensor_t * new_bias = NULL; + vsi_nn_tensor_attr_t attr; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + if (bias == NULL) + { + memcpy(&attr, &weight->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.dim_num = 2; + attr.size[0] = weight->attr.size[1]; + attr.size[1] = 1; + if (weight->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + attr.dtype.scale = input->attr.dtype.scale * weight->attr.dtype.scale; + attr.dtype.zero_point = 0; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + } + else + { + 
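+            /* Only affine-asymmetric weights get a derived bias scale
+             * (input scale * weight scale) above; other quantization
+             * schemes are not handled yet, hence the log below. */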
VSILOGE("need to add ..."); + } + } + else + { + memcpy(&attr, &bias->attr, sizeof(vsi_nn_tensor_attr_t)); + } + + new_bias = vsi_nn_CreateTensorWithDefault(graph, &attr, 0.0); + + if (input->attr.dtype.zero_point == 0) + { + return new_bias; + } + else + { + VSILOGE("need to process bias - (input_zp * (w - w_zp)) ..."); + } + + return new_bias; +} diff --git a/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c b/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c new file mode 100644 index 0000000..32fd5b8 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c @@ -0,0 +1,111 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#define REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c ) +{ + vx_node node = NULL; + float scale; + vsi_enum overflow_policy,rounding_policy; + vx_scalar scale_s = NULL; + vsi_nn_tensor_t * a_times_b = NULL; + vsi_nn_tensor_attr_t attr; + + scale = 1.0; + overflow_policy = VX_CONVERT_POLICY_SATURATE; + rounding_policy = VX_ROUND_POLICY_TO_ZERO; + + scale_s = vxCreateScalar(graph->ctx->c, VX_TYPE_FLOAT32, &scale); + if(!scale_s) + { + VSILOGE("CreateScalar fail\n"); + goto OnError; + } + + memset(&attr, 0, sizeof(attr)); + memcpy(attr.size, outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof( uint32_t )); + attr.dim_num = outputs[0]->attr.dim_num; + attr.vtl = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + a_times_b = vsi_nn_CreateTensor(graph, &attr); + + node = vxTensorMultiplyNode( graph->g, + inputs[0]->t, inputs[1]->t, + scale_s, + overflow_policy, + rounding_policy, + a_times_b->t ); + if( NULL == node ) + { + VSILOGE("Call vxTensorMultiplyNode fail.(a_times_b_plus_c)"); + goto OnError; + } + + node = vxTensorAddNode( graph->g, a_times_b->t, inputs[2]->t, + VX_CONVERT_POLICY_SATURATE, outputs[0]->t ); + if( NULL == node ) + { + VSILOGE("Call vxTensorAddNode fail.(a_times_b_plus_c)"); + goto OnError; + } + +OnError: + if (scale_s) vxReleaseScalar(&scale_s); + if (a_times_b) vsi_nn_ReleaseTensor(&a_times_b); + + return (vsi_nn_kernel_node_t)node; +} /* a_times_b_plus_c() */ + +#undef REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL + diff --git a/src/tim/vx/internal/src/kernel/vx/clip_vx.c b/src/tim/vx/internal/src/kernel/vx/clip_vx.c new file mode 100644 index 0000000..2c74303 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/clip_vx.c @@ -0,0 +1,198 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
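The a_times_b_plus_c backend above lowers the fused op into a TensorMultiply node (scale 1.0, saturate, round-to-zero) that writes an intermediate virtual FP16 tensor, followed by a TensorAdd into the real output. A scalar C sketch of the same two-step evaluation (plain float math, saturation not modelled):

```c
#include <stdio.h>

/* Two-step evaluation mirroring the multiply-then-add node pair:
 * tmp = a * b * scale, then out = tmp + c. The real kernel stores tmp
 * in an intermediate (virtual) tensor between the two nodes. */
static void a_times_b_plus_c( const float *a, const float *b, const float *c,
                              float scale, float *out, int n )
{
    for( int i = 0; i < n; i++ )
    {
        float tmp = a[i] * b[i] * scale;  /* vxTensorMultiplyNode step */
        out[i] = tmp + c[i];              /* vxTensorAddNode step      */
    }
}

int main( void )
{
    float a[3] = { 1, 2, 3 }, b[3] = { 4, 5, 6 }, c[3] = { 0.5f, 0.5f, 0.5f };
    float out[3];
    a_times_b_plus_c( a, b, c, 1.0f, out, 3 );
    printf( "%g %g %g\n", out[0], out[1], out[2] );  /* 4.5 10.5 18.5 */
    return 0;
}
```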
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include +#include "utils/vsi_nn_dtype_util_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _sort_lut_s +{ + float index; + float val; +} sort_lut; + +static float clip_eval(float val, float min, float max) +{ + return vsi_nn_clamp(val, min, max); +} + +#ifdef VX_USER_LOOKUP_TABLE_SUPPORT +static int32_t _lut_comparator(const void *pa, const void *pb) +{ + sort_lut a = *(sort_lut *)pa; + sort_lut b = *(sort_lut *)pb; + float diff = a.index - b.index; + if ( diff > 0 ) + { + return 1; + } + else if ( diff < 0 ) + { + return -1; + } + + return 0; +} + +static void _set_table_lookup(float func(float, float, float), float *index, float *value, float min, float max) +{ +#define VSI_NN_MAX_LUT_SIZE (1024) +#define FLT16_MAX (57344) +#define FLT16_MIN (-57344) + uint32_t i = 0; + sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); + + for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) + { + int16_t val = (int16_t)(i << 6); + lut[i].index = fp16_to_fp32(val); + lut[i].val = func(lut[i].index, min, max); + } + + for (i = 0x0; i < 0x10; i++) + { + lut[i].index = 0; + lut[i].val = func(lut[i].index, min, max); + } + + for (i = 0x1F0; i < 0x200; i++) + { + lut[i].index = FLT16_MAX; + lut[i].val = func(lut[i].index, min, max); + } + + for (i = 0x3F0; i < 0x400; i++) + { + lut[i].index = FLT16_MIN; + lut[i].val = func(lut[i].index, min, max); + } + + qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); + + for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) + { + index[i] = lut[i].index; + value[i] = lut[i].val; + } + + vsi_nn_safe_free(lut); + +#undef VSI_NN_MAX_LUT_SIZE +#undef FLT16_MIN +#undef FLT16_MAX +} +#endif + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel, + float func(float, float, float) + ) +{ +#ifdef VX_USER_LOOKUP_TABLE_SUPPORT + vx_lut lut1 = NULL; + vx_lut lut2 = NULL; + vx_node node = NULL; + float min = vsi_nn_kernel_param_get_float32( params, "min_value" ); + float max = vsi_nn_kernel_param_get_float32( params, "max_value" ); + float index[1024] = {0}; + float value[1024] = {0}; + + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || + inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + { + return NULL; + } + + _set_table_lookup(func, index, value, min, max); + + lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + if( NULL == lut1 || NULL == lut2 ) + { + VSILOGE("create lut object fail."); + goto OnError; + } + + vxCopyLUT(lut1, (void*)&index, 
VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyLUT(lut2, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + + node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); + if( NULL == node ) + { + VSILOGE("Call vxTensorTableLookupLayer fail."); + goto OnError; + } + +OnError: + if (lut1) + { + vxReleaseLUT(&lut1); + lut1 = NULL; + } + if (lut2) + { + vxReleaseLUT(&lut2); + lut2 = NULL; + } + return (vsi_nn_kernel_node_t)node; +#else + return NULL; +#endif +} /* _setup() */ + +#define REGISTER_CLIP_OPENVX_KERNEL(KERNEL_NAME, UNARY_FUNC) \ + static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) \ + { \ + return _setup(graph, inputs, input_num, outputs, output_num, \ + params, kernel, UNARY_FUNC); \ + } \ + REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) + +REGISTER_CLIP_OPENVX_KERNEL( clip, clip_eval ) + +#undef REGISTER_CLIP_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/convolutional.c b/src/tim/vx/internal/src/kernel/vx/convolutional.c new file mode 100644 index 0000000..235e5ac --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/convolutional.c @@ -0,0 +1,413 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
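_set_table_lookup above evaluates the clip function at 1024 FP16 sample points (entry i corresponds to the half-float with bit pattern i << 6, with small sub-ranges patched to zero and to the FP16 min/max), then sorts the (index, value) pairs by index before copying them into the two vx_lut objects. A reduced sketch of that pair-and-sort step over an arbitrary key set (the FP16 decoding itself is left out):

```c
#include <stdio.h>
#include <stdlib.h>

typedef struct { float index; float val; } sort_lut;

static float clip_eval( float v, float lo, float hi )
{
    return v < lo ? lo : ( v > hi ? hi : v );
}

/* Sort ascending by the key, as _lut_comparator does. */
static int lut_cmp( const void *pa, const void *pb )
{
    float d = ((const sort_lut *)pa)->index - ((const sort_lut *)pb)->index;
    return ( d > 0 ) - ( d < 0 );
}

int main( void )
{
    /* Toy key set standing in for the 1024 FP16 sample points. */
    float keys[8] = { 3.f, -2.f, 0.f, 7.f, -5.f, 1.f, 4.f, -1.f };
    sort_lut lut[8];
    float index[8], value[8];

    for( int i = 0; i < 8; i++ )
    {
        lut[i].index = keys[i];
        lut[i].val = clip_eval( keys[i], -1.f, 2.f );
    }
    qsort( lut, 8, sizeof(sort_lut), lut_cmp );
    for( int i = 0; i < 8; i++ )
    {
        index[i] = lut[i].index;   /* would feed the "index" LUT */
        value[i] = lut[i].val;     /* would feed the "value" LUT */
        printf( "%5.1f -> %5.1f\n", index[i], value[i] );
    }
    return 0;
}
```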
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_node.h" +#include "vsi_nn_feature.h" + +static vsi_bool _build_vx_conv2d_param + ( + vx_nn_convolution_params_ext2_t * param, + int32_t stride_h, int32_t stride_w, + int32_t pad_h_front, int32_t pad_h_end, + int32_t pad_w_front, int32_t pad_w_end, + int32_t dilation_h, int32_t dilation_w, + int32_t multiplier, + vsi_enum overflow_policy, vsi_enum rounding_policy, + vsi_enum down_scale_size_rounding + ) +{ + vx_nn_convolution_params_ext_t * p1 = NULL; + memset( param, 0 ,sizeof(vx_nn_convolution_params_ext2_t) ); + + VSI_ASSERT( stride_h > 0 ); + VSI_ASSERT( stride_w > 0 ); + VSI_ASSERT( pad_h_front >= 0 ); + VSI_ASSERT( pad_h_end >= 0 ); + VSI_ASSERT( pad_w_front >= 0 ); + VSI_ASSERT( pad_w_end >= 0 ); + VSI_ASSERT( dilation_h >= 0 ); + VSI_ASSERT( dilation_w >= 0 ); + VSI_ASSERT( multiplier >= 0 ); + + p1 = ¶m->ext; + p1->khr.padding_x = (uint32_t)pad_w_front; + p1->khr.padding_y = (uint32_t)pad_h_front; + if( dilation_h > 0 ) + { + p1->khr.dilation_y = (uint32_t)(dilation_h - 1); + } + if( dilation_w > 0 ) + { + p1->khr.dilation_x = (uint32_t)(dilation_w - 1); + } + //VSILOGD("pad %d %d %d %d", pad_h_front, pad_h_end, pad_w_front, pad_w_end); + //VSILOGD("dilation %d %d ", p1->khr.dilation_y, p1->khr.dilation_x); + //VSILOGD("mul %d ", multiplier); + p1->khr.overflow_policy = (vx_enum)overflow_policy; + p1->khr.rounding_policy = (vx_enum)rounding_policy; + p1->khr.down_scale_size_rounding = (vx_enum)down_scale_size_rounding; + p1->padding_x_right = (uint32_t)pad_w_end; + p1->padding_y_bottom = (uint32_t)pad_h_end; + param->depth_multiplier = multiplier; + param->stride_x = (uint32_t)stride_w; + param->stride_y = (uint32_t)stride_h; + return TRUE; +} /* _build_vx_conv2d_param() */ + +static vsi_bool _build_vx_deconv2d_param + ( + vx_nn_deconvolution_params_ext2_t * param, + int32_t stride_h, int32_t stride_w, + int32_t pad_h_front, int32_t pad_h_end, + int32_t pad_w_front, int32_t pad_w_end, + uint32_t group, + vsi_enum overflow_policy, vsi_enum rounding_policy, + vsi_enum down_scale_size_rounding + ) +{ + vx_nn_deconvolution_params_ext_t * p1 = NULL; + memset( param, 0 ,sizeof(vx_nn_deconvolution_params_ext2_t) ); + + VSI_ASSERT( stride_h > 0 ); + VSI_ASSERT( stride_w > 0 ); + VSI_ASSERT( pad_h_front >= 0 ); + VSI_ASSERT( pad_h_end >= 0 ); + VSI_ASSERT( pad_w_front >= 0 ); + VSI_ASSERT( pad_w_end >= 0 ); + VSI_ASSERT( group >= 1 ); + + p1 = ¶m->ext; + p1->khr.padding_x = (uint32_t)pad_w_front; + p1->khr.padding_y = (uint32_t)pad_h_front; + p1->khr.overflow_policy = (vx_enum)overflow_policy; + p1->khr.rounding_policy = (vx_enum)rounding_policy; + p1->padding_x_right = (uint32_t)pad_w_end; + p1->padding_y_bottom = (uint32_t)pad_h_end; + p1->channel_group = (uint32_t)group; + param->stride_x = (uint32_t)stride_w; + param->stride_y = (uint32_t)stride_h; + param->down_scale_size_rounding = (vx_enum)down_scale_size_rounding; + return TRUE; +} /* _build_vx_deconv2d_param() */ + +static vx_tensor _expand_tensor_dim + ( vx_tensor tensor, int32_t * shape, size_t rank, int32_t expand_dim ) +{ + int32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t i, cnt; + if( expand_dim < 0 ) + { + expand_dim = (int32_t)rank + expand_dim; + } + if( expand_dim < 0 || (uint32_t)expand_dim > rank ) + { + 
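_build_vx_conv2d_param above passes explicit front/end padding and stores dilation as (dilation - 1) in the khr fields. For reference, with floor rounding the spatial output size of such a convolution follows the usual formula out = (in + pad_front + pad_end - (dilation*(k - 1) + 1)) / stride + 1; the helper below is purely illustrative and not part of the driver API:

```c
#include <stdio.h>

/* Standard convolution output-size formula (floor division), assuming
 * `dilation` is the sampling step with 1 meaning a dense kernel. */
static int conv_out_size( int in, int kernel, int stride,
                          int pad_front, int pad_end, int dilation )
{
    int effective_k = dilation * ( kernel - 1 ) + 1;
    return ( in + pad_front + pad_end - effective_k ) / stride + 1;
}

int main( void )
{
    /* 224 input, 3-tap kernel, stride 2, pad 1/1, no dilation -> 112 */
    printf( "%d\n", conv_out_size( 224, 3, 2, 1, 1, 1 ) );
    /* dilation 2 turns a 3-tap kernel into an effective 5-tap window */
    printf( "%d\n", conv_out_size( 224, 3, 1, 2, 2, 2 ) );  /* stays 224 */
    return 0;
}
```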
VSILOGE("Run dim to expand %d, rank is %lu", expand_dim, rank); + return NULL; + } + for( i = 0, cnt = 0; i < rank; i ++ ) + { + if( i == (uint32_t)expand_dim ) + { + new_shape[cnt] = 1; + cnt ++; + } + new_shape[cnt] = shape[i]; + cnt ++; + } + if( (uint32_t)expand_dim == rank ) + { + new_shape[cnt] = 1; + } + return vxReshapeTensor( tensor, new_shape, (uint32_t)rank + 1 ); +} /* _expand_tensor_dim() */ + + +#define REGISTER_CONV_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_CONV_OPENVX_KERNEL( conv1d ) +{ + vx_node node = NULL; + vx_nn_convolution_params_ext2_t vxparam; + vx_tensor temp_tensors[3] = { NULL }; + int i; + + _build_vx_conv2d_param( + &vxparam, + vsi_nn_kernel_param_get_int32(params, "stride"), 1, + vsi_nn_kernel_param_get_int32(params, "pad_front"), + vsi_nn_kernel_param_get_int32(params, "pad_end"), + 0,0, + vsi_nn_kernel_param_get_int32(params, "dilation"), 1, + 0, + vsi_nn_kernel_param_get_int32(params, "overflow_policy"), + vsi_nn_kernel_param_get_int32(params, "rounding_policy"), + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + ); + + temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, + (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); + CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); + + temp_tensors[1] = _expand_tensor_dim( inputs[1]->t, + (int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); + CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final ); + + temp_tensors[2] = _expand_tensor_dim( outputs[0]->t, + (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); + CHECK_PTR_FAIL_GOTO( temp_tensors[2], "Expand output dim fail.", final ); + + node = vxConvolutionLayer( graph->g, + temp_tensors[0], temp_tensors[1], inputs[2] ? 
inputs[2]->t : NULL, + (vx_nn_convolution_params_t *)&vxparam, + sizeof( vx_nn_convolution_params_ext2_t ), + temp_tensors[2] + ); + +final: + for( i = 0; i < 3; i ++ ) + { + if( temp_tensors[i] ) + { + vxReleaseTensor( &temp_tensors[i] ); + } + } + return (vsi_nn_kernel_node_t)node; +} /* conv1d*/ + +REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) +{ + vx_node node = NULL; + vx_nn_convolution_params_ext2_t vxparam; + vx_tensor temp_tensors[3] = { NULL }; + int i; + vsi_bool need_explicit_padding = FALSE; + + _build_vx_conv2d_param( + &vxparam, + vsi_nn_kernel_param_get_int32(params, "stride"), 1, + vsi_nn_kernel_param_get_int32(params, "pad_front"), + vsi_nn_kernel_param_get_int32(params, "pad_end"), + 0,0, + vsi_nn_kernel_param_get_int32(params, "dilation"), 1, + vsi_nn_kernel_param_get_int32(params, "multiplier"), + vsi_nn_kernel_param_get_int32(params, "overflow_policy"), + vsi_nn_kernel_param_get_int32(params, "rounding_policy"), + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + ); + + temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, + (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); + CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); + + temp_tensors[1] = _expand_tensor_dim( inputs[1]->t, + (int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); + CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final ); + + temp_tensors[2] = _expand_tensor_dim( outputs[0]->t, + (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); + CHECK_PTR_FAIL_GOTO( temp_tensors[2], "Expand output dim fail.", final ); + + if( need_explicit_padding ) + { + int32_t pad_front[4] = { 0 }; + int32_t pad_end[4] = { 0 }; + vx_tensor pad_tensor = NULL; + pad_front[0] = (int32_t)vxparam.ext.khr.padding_x; + pad_front[1] = (int32_t)vxparam.ext.khr.padding_y; + pad_end[0] = (int32_t)vxparam.ext.padding_x_right; + pad_end[1] = (int32_t)vxparam.ext.padding_y_bottom; + + pad_tensor = (vx_tensor)kernel_pad_node( + graph, (vsi_nn_kernel_tensor_t)temp_tensors[0], + pad_front, pad_end, 4, VSI_NN_PAD_MODE_CONSTANT, 0, NULL); + + if( NULL == pad_tensor ) + { + VSILOGW("Create pad node fail."); + goto final; + } + else + { + vxReleaseTensor( &temp_tensors[0] ); + temp_tensors[0] = pad_tensor; + } + vxparam.ext.khr.padding_x = 0; + vxparam.ext.khr.padding_y = 0; + vxparam.ext.padding_x_right = 0; + vxparam.ext.padding_y_bottom = 0; + } + + node = vxConvolutionLayer( graph->g, + temp_tensors[0], temp_tensors[1], inputs[2] ? 
inputs[2]->t : NULL, + (vx_nn_convolution_params_t *)&vxparam, + sizeof( vx_nn_convolution_params_ext2_t ), + temp_tensors[2] + ); +final: + for( i = 0; i < 3; i ++ ) + { + if( temp_tensors[i] ) + { + vxReleaseTensor( &temp_tensors[i] ); + } + } + return (vsi_nn_kernel_node_t)node; +} /* depthwise_conv1d*/ + +REGISTER_CONV_OPENVX_KERNEL( conv2d ) +{ + vx_node node = NULL; + vx_nn_convolution_params_ext2_t vxparam; + + _build_vx_conv2d_param( + &vxparam, + vsi_nn_kernel_param_get_int32(params, "stride_h"), + vsi_nn_kernel_param_get_int32(params, "stride_w"), + vsi_nn_kernel_param_get_int32(params, "pad_h_front"), + vsi_nn_kernel_param_get_int32(params, "pad_h_end"), + vsi_nn_kernel_param_get_int32(params, "pad_w_front"), + vsi_nn_kernel_param_get_int32(params, "pad_w_end"), + vsi_nn_kernel_param_get_int32(params, "dilation_h"), + vsi_nn_kernel_param_get_int32(params, "dilation_w"), + 0, + vsi_nn_kernel_param_get_int32(params, "overflow_policy"), + vsi_nn_kernel_param_get_int32(params, "rounding_policy"), + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + ); + + node = vxConvolutionLayer( graph->g, + inputs[0]->t, inputs[1]->t, inputs[2] ? inputs[2]->t : NULL, + (vx_nn_convolution_params_t *)&vxparam, + sizeof( vx_nn_convolution_params_ext2_t ), + outputs[2]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* conv2d*/ + +REGISTER_CONV_OPENVX_KERNEL( depthwise_conv2d ) +{ + vx_node node = NULL; + vx_nn_convolution_params_ext2_t vxparam; + + _build_vx_conv2d_param( + &vxparam, + vsi_nn_kernel_param_get_int32(params, "stride_h"), + vsi_nn_kernel_param_get_int32(params, "stride_w"), + vsi_nn_kernel_param_get_int32(params, "pad_h_front"), + vsi_nn_kernel_param_get_int32(params, "pad_h_end"), + vsi_nn_kernel_param_get_int32(params, "pad_w_front"), + vsi_nn_kernel_param_get_int32(params, "pad_w_end"), + vsi_nn_kernel_param_get_int32(params, "dilation_h"), + vsi_nn_kernel_param_get_int32(params, "dilation_w"), + vsi_nn_kernel_param_get_int32(params, "multiplier"), + vsi_nn_kernel_param_get_int32(params, "overflow_policy"), + vsi_nn_kernel_param_get_int32(params, "rounding_policy"), + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + ); + + node = vxConvolutionLayer( graph->g, + inputs[0]->t, inputs[1]->t, inputs[2] ? 
inputs[2]->t : NULL, + (vx_nn_convolution_params_t *)&vxparam, + sizeof( vx_nn_convolution_params_ext2_t ), + outputs[2]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* depthwise_conv2d*/ + +REGISTER_CONV_OPENVX_KERNEL( deconvolution1d ) +{ + vx_node node = NULL; + vx_nn_deconvolution_params_ext2_t vxparam; + vx_tensor temp_tensors[2] = { NULL }; + int i; + + _build_vx_deconv2d_param( + &vxparam, + 1, vsi_nn_kernel_param_get_int32(params, "stride"), + 0,0, + vsi_nn_kernel_param_get_int32(params, "pad_front"), + vsi_nn_kernel_param_get_int32(params, "pad_end"), + vsi_nn_kernel_param_get_int32(params, "group"), + vsi_nn_kernel_param_get_int32(params, "overflow_policy"), + vsi_nn_kernel_param_get_int32(params, "rounding_policy"), + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + ); + + temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, + (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 1 ); + CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); + + temp_tensors[1] = _expand_tensor_dim( outputs[0]->t, + (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 1 ); + CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand output dim fail.", final ); + + node = vxDeconvolutionLayer( graph->g, + temp_tensors[0], inputs[1]->t, inputs[2] ? inputs[2]->t : NULL, + (vx_nn_deconvolution_params_t *)&vxparam, + sizeof( vx_nn_deconvolution_params_ext2_t ), + temp_tensors[1] + ); + +final: + for( i = 0; i < 2; i ++ ) + { + if( temp_tensors[i] ) + { + vxReleaseTensor( &temp_tensors[i] ); + } + } + return (vsi_nn_kernel_node_t)node; +} /* deconvolution1d*/ + +#undef REGISTER_CONV_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c new file mode 100644 index 0000000..e259554 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -0,0 +1,238 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
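deconvolution1d above expands the tensors to 2-D and hands the ext2 parameter block to vxDeconvolutionLayer. Under one common transposed-convolution convention the spatial output size is stride*(in - 1) + kernel - pad_front - pad_end; the helper below illustrates that convention only and is an assumption, not something defined by this file:

```c
#include <stdio.h>

/* One common transposed-convolution (deconvolution) output-size convention:
 * out = stride * (in - 1) + kernel - pad_front - pad_end.
 * Other conventions add an explicit output_padding term. */
static int deconv_out_size( int in, int kernel, int stride,
                            int pad_front, int pad_end )
{
    return stride * ( in - 1 ) + kernel - pad_front - pad_end;
}

int main( void )
{
    /* 112 -> 224 with a 4-tap kernel, stride 2, pad 1/1 */
    printf( "%d\n", deconv_out_size( 112, 4, 2, 1, 1 ) );
    return 0;
}
```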
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include +#include "utils/vsi_nn_dtype_util_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _sort_lut_s +{ + float index; + float val; +} sort_lut; + +static float exp_eval(float val) +{ + return expf(val); +} + +static float log_eval(float data) +{ + return logf(data); +} + +static float elu_eval(float data) +{ + return data >=0 ? data : expf(data) - 1; +} + +static float neg_eval(float data) +{ + return data * -1.0f; +} + +static float hsigmoid_eval(float data) +{ + data = (float)(0.2 * data + 0.5); + data = vsi_nn_clamp(data, 0, 1); + + return data; +} + +static float soft_plus_eval(float data) +{ + return log_eval(exp_eval(data) + 1); +} + +static float mish_eval(float data) +{ + data = (float)(data * tanh(soft_plus_eval(data))); + + return data; +} + +#ifdef VX_USER_LOOKUP_TABLE_SUPPORT +static int32_t _lut_comparator(const void *pa, const void *pb) +{ + sort_lut a = *(sort_lut *)pa; + sort_lut b = *(sort_lut *)pb; + float diff = a.index - b.index; + if ( diff > 0 ) + { + return 1; + } + else if ( diff < 0 ) + { + return -1; + } + + return 0; +} + +static void _set_unary_table_lookup(float func(float), float *index, float *value) +{ +#define VSI_NN_MAX_LUT_SIZE (1024) +#define FLT16_MAX (57344) +#define FLT16_MIN (-57344) + uint32_t i = 0; + sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); + + for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) + { + int16_t val = (int16_t)(i << 6); + lut[i].index = fp16_to_fp32(val); + lut[i].val = func(lut[i].index); + } + + for (i = 0x0; i < 0x10; i++) + { + lut[i].index = 0; + lut[i].val = func(lut[i].index); + } + + for (i = 0x1F0; i < 0x200; i++) + { + lut[i].index = FLT16_MAX; + lut[i].val = func(lut[i].index); + } + + for (i = 0x3F0; i < 0x400; i++) + { + lut[i].index = FLT16_MIN; + lut[i].val = func(lut[i].index); + } + + qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); + + for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) + { + index[i] = lut[i].index; + value[i] = lut[i].val; + } + + vsi_nn_safe_free(lut); + +#undef VSI_NN_MAX_LUT_SIZE +#undef FLT16_MIN +#undef FLT16_MAX +} +#endif + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel, + float func(float) + ) +{ +#ifdef VX_USER_LOOKUP_TABLE_SUPPORT + vx_lut lut1 = NULL; + vx_lut lut2 = NULL; + vx_node node = NULL; + float index[1024] = {0}; + float value[1024] = {0}; + + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || + inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + { + return NULL; + } + + _set_unary_table_lookup(func, index, value); + + lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + if( NULL == lut1 || NULL == lut2 ) + { + VSILOGE("create lut object fail."); + goto OnError; + } + + vxCopyLUT(lut1, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyLUT(lut2, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + + node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); + if( NULL == 
node ) + { + VSILOGE("Call vxTensorTableLookupLayer fail."); + goto OnError; + } + +OnError: + if (lut1) + { + vxReleaseLUT(&lut1); + lut1 = NULL; + } + if (lut2) + { + vxReleaseLUT(&lut2); + lut2 = NULL; + } + + return (vsi_nn_kernel_node_t)node; +#else + return NULL; +#endif +} /* _setup() */ + +#define REGISTER_ELTWISE_UNARY_OPENVX_KERNEL(KERNEL_NAME, UNARY_FUNC) \ + static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) \ + { \ + return _setup(graph, inputs, input_num, outputs, output_num, \ + params, kernel, UNARY_FUNC); \ + } \ + REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( mish, mish_eval ) +//REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( exp, exp_eval ) +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( log, log_eval ) +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( elu, elu_eval ) +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( neg, neg_eval ) +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( hard_sigmoid, hsigmoid_eval ) + +#undef REGISTER_ELTWISE_UNARY_OPENVX_KERNEL + diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_vx.c new file mode 100644 index 0000000..3c9947d --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_vx.c @@ -0,0 +1,132 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
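The scalar evaluators registered above (mish, log, elu, neg, hard_sigmoid) exist only to fill the FP16 lookup table, so they can also be exercised directly on the host. A standalone check of a few of them, using the same formulas as the kernel source:

```c
#include <math.h>
#include <stdio.h>

static float clampf( float v, float lo, float hi )
{
    return v < lo ? lo : ( v > hi ? hi : v );
}

static float softplus_eval( float x ) { return logf( expf( x ) + 1.0f ); }
static float mish_eval( float x )     { return x * tanhf( softplus_eval( x ) ); }
static float elu_eval( float x )      { return x >= 0 ? x : expf( x ) - 1.0f; }
static float hsigmoid_eval( float x ) { return clampf( 0.2f * x + 0.5f, 0.0f, 1.0f ); }

int main( void )
{
    float xs[3] = { -2.0f, 0.0f, 2.0f };
    for( int i = 0; i < 3; i++ )
    {
        printf( "x=%5.2f  mish=%7.4f  elu=%7.4f  hsigmoid=%4.2f\n",
                xs[i], mish_eval( xs[i] ), elu_eval( xs[i] ),
                hsigmoid_eval( xs[i] ) );
    }
    return 0;
}
```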
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "kernel/vsi_nn_kernel.h" + + +#define REGISTER_ELTWISE_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_ELTWISE_OPENVX_KERNEL( add ) +{ + vx_node node = vxTensorAddNode( graph->g, inputs[0]->t, inputs[1]->t, + VX_CONVERT_POLICY_SATURATE, outputs[0]->t ); + return (vsi_nn_kernel_node_t)node; +} /* add() */ + +REGISTER_ELTWISE_OPENVX_KERNEL( sub ) +{ + vx_node node = vxTensorSubtractNode( graph->g, inputs[0]->t, inputs[1]->t, + VX_CONVERT_POLICY_SATURATE, outputs[0]->t ); + + return (vsi_nn_kernel_node_t)node; +} /* sub() */ + +REGISTER_ELTWISE_OPENVX_KERNEL( div ) +{ + float scale; + vsi_enum overflow_policy, rounding_policy; + vx_scalar scale_s = NULL; + vx_node node = NULL; + + scale = vsi_nn_kernel_param_get_float32(params, "scale"); + overflow_policy = vsi_nn_kernel_param_get_int32(params, "overflow_policy"); + rounding_policy = vsi_nn_kernel_param_get_int32(params, "rounding_policy"); + + scale_s = vxCreateScalar(graph->ctx->c, VX_TYPE_FLOAT32, &scale); + if(!scale_s) + { + VSILOGE("CreateScalar fail\n"); + return NULL; + } + + node = vxTensorDivideNode( graph->g, + inputs[0]->t, inputs[1]->t, + scale_s, + overflow_policy, + rounding_policy, + outputs[0]->t ); + + vxReleaseScalar(&scale_s); + + return (vsi_nn_kernel_node_t)node; +} /* div() */ + +REGISTER_ELTWISE_OPENVX_KERNEL( mul ) +{ + float scale; + vsi_enum overflow_policy, rounding_policy; + vx_scalar scale_s = NULL; + vx_node node = NULL; + + scale = vsi_nn_kernel_param_get_float32(params, "scale"); + overflow_policy = vsi_nn_kernel_param_get_int32(params, "overflow_policy"); + rounding_policy = vsi_nn_kernel_param_get_int32(params, "rounding_policy"); + + scale_s = vxCreateScalar(graph->ctx->c, VX_TYPE_FLOAT32, &scale); + if(!scale_s) + { + VSILOGE("CreateScalar fail\n"); + return NULL; + } + + node = vxTensorMultiplyNode( graph->g, + inputs[0]->t, inputs[1]->t, + scale_s, + overflow_policy, + rounding_policy, + outputs[0]->t ); + + vxReleaseScalar(&scale_s); + + return (vsi_nn_kernel_node_t)node; +} /* mul() */ + +#undef REGISTER_ELTWISE_OPENVX_KERNEL + diff --git a/src/tim/vx/internal/src/kernel/vx/prelu_vx.c b/src/tim/vx/internal/src/kernel/vx/prelu_vx.c new file mode 100644 index 0000000..3a8a861 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/prelu_vx.c @@ -0,0 +1,119 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* 
and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +static vsi_nn_tensor_t * _reshape_to_1d_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input + ) +{ + vsi_nn_tensor_t *tensor = NULL; + uint32_t i = 0; + uint32_t size = 0; + int32_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 }; + uint32_t one_rank = 0; + + for (i = 0; i < input->attr.dim_num; i++) + { + if (input->attr.size[i] != 1) + { + size = input->attr.size[i]; + one_rank ++; + } + } + + if (one_rank <= 1) + { + shapes[0] = size; + } + else + { + VSILOGD("Error: PRelu Driver API only support per-chanel \n"); + return NULL; + } + + tensor = vsi_nn_reshape_tensor( graph, input, (uint32_t*)shapes, 1 ); + + return tensor; +} + +#define REGISTER_PRELU_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_PRELU_OPENVX_KERNEL( prelu ) +{ + vsi_nn_tensor_t * alpha = NULL; + vx_node node = NULL; + int32_t is_per_channel_alpha = 0; + + is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha"); + + if (!is_per_channel_alpha) + { + return NULL; + } + + alpha = _reshape_to_1d_tensor(graph, inputs[1]); + + node = vxPReluLayer( graph->g, inputs[0]->t, inputs[1]->t, outputs[0]->t ); + + if (alpha) + { + vsi_nn_ReleaseTensor(&alpha); + alpha = NULL; + } + + return (vsi_nn_kernel_node_t)node; +} /* prelu() */ + +#undef REGISTER_PRELU_OPENVX_KERNEL + diff --git a/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c b/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c new file mode 100644 index 0000000..14ec73d --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c @@ -0,0 +1,202 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to 
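The PReLU path above flattens the per-channel alpha tensor to a 1-D vector of channel length before building the node. Per-channel PReLU computes y = x for x >= 0 and y = alpha[c] * x otherwise; a plain-C sketch over a channels-by-spatial float buffer (layout and names are illustrative):

```c
#include <stdio.h>

/* Per-channel PReLU over a channels x spatial float buffer:
 * y = x for x >= 0, y = alpha[c] * x otherwise. */
static void prelu_per_channel( const float *x, const float *alpha,
                               int channels, int spatial, float *y )
{
    for( int c = 0; c < channels; c++ )
    {
        for( int i = 0; i < spatial; i++ )
        {
            float v = x[c * spatial + i];
            y[c * spatial + i] = v >= 0.0f ? v : alpha[c] * v;
        }
    }
}

int main( void )
{
    float x[4] = { -1.0f, 2.0f, -3.0f, 4.0f };   /* 2 channels x 2 elements */
    float alpha[2] = { 0.1f, 0.5f };
    float y[4];
    prelu_per_channel( x, alpha, 2, 2, y );
    printf( "%g %g %g %g\n", y[0], y[1], y[2], y[3] );  /* -0.1 2 -1.5 4 */
    return 0;
}
```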
use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include +#include "utils/vsi_nn_dtype_util_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _sort_lut_s +{ + float index; + float val; +} sort_lut; + +static float relu_keras_eval(float val, float alpha, float threshold, float max) +{ + val = vsi_nn_min(val, max); + val = val < threshold ? alpha * (val - threshold) : val; + return val; +} + +#ifdef VX_USER_LOOKUP_TABLE_SUPPORT +static int32_t _lut_comparator(const void *pa, const void *pb) +{ + sort_lut a = *(sort_lut *)pa; + sort_lut b = *(sort_lut *)pb; + float diff = a.index - b.index; + if ( diff > 0 ) + { + return 1; + } + else if ( diff < 0 ) + { + return -1; + } + + return 0; +} + +static void _set_table_lookup(float func(float, float, float, float), + float *index, float *value, float alpha, float threshold, float max) +{ +#define VSI_NN_MAX_LUT_SIZE (1024) +#define FLT16_MAX (57344) +#define FLT16_MIN (-57344) + uint32_t i = 0; + sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); + + for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) + { + int16_t val = (int16_t)(i << 6); + lut[i].index = fp16_to_fp32(val); + lut[i].val = func(lut[i].index, alpha, threshold, max); + } + + for (i = 0x0; i < 0x10; i++) + { + lut[i].index = 0; + lut[i].val = func(lut[i].index, alpha, threshold, max); + } + + for (i = 0x1F0; i < 0x200; i++) + { + lut[i].index = FLT16_MAX; + lut[i].val = func(lut[i].index, alpha, threshold, max); + } + + for (i = 0x3F0; i < 0x400; i++) + { + lut[i].index = FLT16_MIN; + lut[i].val = func(lut[i].index, alpha, threshold, max); + } + + qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); + + for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) + { + index[i] = lut[i].index; + value[i] = lut[i].val; + } + + vsi_nn_safe_free(lut); + +#undef VSI_NN_MAX_LUT_SIZE +#undef FLT16_MIN +#undef FLT16_MAX +} +#endif + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel, + float func(float, float, float, float) + ) +{ +#ifdef VX_USER_LOOKUP_TABLE_SUPPORT + vx_lut lut1 = NULL; + vx_lut lut2 = NULL; + vx_node node = NULL; + float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); + float max = vsi_nn_kernel_param_get_float32( params, "max_value" ); + float threshold = vsi_nn_kernel_param_get_float32( params, "threshold" ); + float index[1024] = {0}; + float 
value[1024] = {0}; + + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || + inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + { + return NULL; + } + + _set_table_lookup(func, index, value, alpha, threshold, max); + + lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + if( NULL == lut1 || NULL == lut2 ) + { + VSILOGE("create lut object fail."); + goto OnError; + } + + vxCopyLUT(lut1, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyLUT(lut2, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + + node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); + if( NULL == node ) + { + VSILOGE("Call vxTensorTableLookupLayer fail."); + goto OnError; + } + +OnError: + if (lut1) + { + vxReleaseLUT(&lut1); + lut1 = NULL; + } + if (lut2) + { + vxReleaseLUT(&lut2); + lut2 = NULL; + } + return (vsi_nn_kernel_node_t)node; +#else + return NULL; +#endif +} /* _setup() */ + +#define REGISTER_KERAS_RELU_OPENVX_KERNEL(KERNEL_NAME, UNARY_FUNC) \ + static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) \ + { \ + return _setup(graph, inputs, input_num, outputs, output_num, \ + params, kernel, UNARY_FUNC); \ + } \ + REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) + +REGISTER_KERAS_RELU_OPENVX_KERNEL( relu_keras, relu_keras_eval ) + +#undef REGISTER_KERAS_RELU_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/swish_vx.c b/src/tim/vx/internal/src/kernel/vx/swish_vx.c new file mode 100644 index 0000000..7557d9b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/swish_vx.c @@ -0,0 +1,94 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
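relu_keras_eval above clamps the input to max_value and applies slope alpha below the threshold; the same formula can be checked on the host as-is, with no OpenVX dependencies:

```c
#include <stdio.h>

/* Keras-style ReLU: clamp to max_value, then apply slope alpha below the
 * threshold, mirroring relu_keras_eval in the kernel source. */
static float relu_keras_eval( float val, float alpha, float threshold,
                              float max_value )
{
    val = val < max_value ? val : max_value;
    return val < threshold ? alpha * ( val - threshold ) : val;
}

int main( void )
{
    /* alpha = 0.1, threshold = 0, max_value = 6 (a "leaky ReLU6" setting) */
    printf( "%g %g %g\n",
            relu_keras_eval( -2.0f, 0.1f, 0.0f, 6.0f ),    /* -0.2 */
            relu_keras_eval(  3.0f, 0.1f, 0.0f, 6.0f ),    /*  3   */
            relu_keras_eval(  9.0f, 0.1f, 0.0f, 6.0f ) );  /*  6   */
    return 0;
}
```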
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_ACTIVATION_EXT_SUPPORT) + +#define REGISTER_SWISH_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_SWISH_OPENVX_KERNEL( swish ) +{ + vx_node node = NULL; + vsi_nn_swish_type swish_type = VSI_NN_SWISH; + vx_enum function = VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SWISH; + float beta = 1.0f; + + if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) + { + swish_type = (vsi_nn_swish_type)vsi_nn_kernel_param_get_int32(params, "type"); + + if (VSI_NN_SWISH == swish_type) + { + function = VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SWISH; + } + else + { + function = VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HWISH; + } + + beta = vsi_nn_kernel_param_get_float32( params, "beta" ); + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + function, + 1, + beta, + outputs[0]->t + ); + } + return (vsi_nn_kernel_node_t)node; +} /* prelu() */ + +#undef REGISTER_SWISH_OPENVX_KERNEL + +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/add_mean_std_norm.cl b/src/tim/vx/internal/src/libnnext/ops/cl/add_mean_std_norm.cl new file mode 100644 index 0000000..d75c96a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/add_mean_std_norm.cl @@ -0,0 +1,173 @@ + + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void add_mean_std_norm_F32_F32toF32( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float rsEps, float dimRatio, + float input0Scale, float input0Tail, + float input1Scale, float input1Tail, + float outputScale, float outputZP, + int width) +{ + int lidx = get_local_id(0); + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + float4 src0, src1, result; + float pSum = 0.0f, pSqr = 0.0f; + float sum = 0.0f, sqr = 0.0f; + float input_d = 0.0f; + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(; coord.x < width; coord.x += 16) + { + src0 = read_imagef(input, coord); + src1 = read_imagef(input1, coord); + input_d = src0.x + src1.x; + pSum += input_d; + pSqr += input_d * input_d; + } + lcl_sum[lidx] = pSum; + lcl_sqr[lidx] = pSqr; + barrier(CLK_LOCAL_MEM_FENCE); + + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; + float4 one = (float4)(1, 1, 1, 1); + float4 data0; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sum = dot(data0, one); + pLocalPtr = (float4 *)&lcl_sqr[0]; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sqr = dot(data0, one); + float mean; + mean = sum * dimRatio; + float vari, stddev_inv, rMeanStd; + vari = sqr*dimRatio - mean*mean; + stddev_inv = (vari==0 ? 
rsEps : rsqrt(vari)); + rMeanStd = (-mean) * stddev_inv; + for(coord.x = gidx; coord.x < width; coord.x += 16) + { + src0 = read_imagef(input, coord); + src1 = read_imagef(input1, coord); + input_d = src0.x + src1.x; + result.x = input_d * stddev_inv + rMeanStd; + write_imagef(output, coord, result.xxxx); + } +} + + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void add_mean_std_norm_U8_U8toF32( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float rsEps, float dimRatio, + float input0Scale, float input0Tail, + float input1Scale, float input1Tail, + float outputScale, float outputZP, + int width) +{ + int lidx = get_local_id(0); + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + float4 src0, src1, result; + float pSum = 0.0f, pSqr = 0.0f; + float sum = 0.0f, sqr = 0.0f; + float input_d = 0.0f; + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(; coord.x < width; coord.x += 16) + { + src0 = convert_float4(read_imageui(input, coord)) * input0Scale - input0Tail; + src1 = convert_float4(read_imageui(input1, coord)) * input1Scale - input1Tail; + input_d = src0.x + src1.x; + pSum += input_d; + pSqr += input_d * input_d; + } + lcl_sum[lidx] = pSum; + lcl_sqr[lidx] = pSqr; + barrier(CLK_LOCAL_MEM_FENCE); + + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; + float4 one = (float4)(1, 1, 1, 1); + float4 data0; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sum = dot(data0, one); + pLocalPtr = (float4 *)&lcl_sqr[0]; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sqr = dot(data0, one); + float mean; + mean = sum * dimRatio; + float vari, stddev_inv, rMeanStd; + vari = sqr*dimRatio - mean*mean; + stddev_inv = (vari==0 ? 
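Each work-group of add_mean_std_norm accumulates the sum and the sum of squares of (input + input1) across the row, derives mean = sum * dimRatio and variance = sqr * dimRatio - mean^2 (dimRatio is presumably 1/width), and writes (x - mean) * rsqrt(variance), falling back to rsEps as the inverse stddev when the variance is zero. A single-threaded C equivalent of that reduction (names are illustrative):

```c
#include <math.h>
#include <stdio.h>

/* out[i] = ((a[i] + b[i]) - mean) / stddev over one row, matching the
 * sum / sum-of-squares reduction in the CL kernel. */
static void add_mean_std_norm( const float *a, const float *b,
                               float *out, int width, float rs_eps )
{
    float sum = 0.0f, sqr = 0.0f;
    for( int i = 0; i < width; i++ )
    {
        float v = a[i] + b[i];
        sum += v;
        sqr += v * v;
    }
    float dim_ratio = 1.0f / (float)width;
    float mean = sum * dim_ratio;
    float vari = sqr * dim_ratio - mean * mean;
    float stddev_inv = ( vari == 0.0f ) ? rs_eps : 1.0f / sqrtf( vari );
    for( int i = 0; i < width; i++ )
    {
        out[i] = ( a[i] + b[i] - mean ) * stddev_inv;
    }
}

int main( void )
{
    float a[4] = { 1, 2, 3, 4 }, b[4] = { 0, 0, 0, 0 }, out[4];
    add_mean_std_norm( a, b, out, 4, 1e6f );
    for( int i = 0; i < 4; i++ ) printf( "%g ", out[i] );
    printf( "\n" );
    return 0;
}
```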
rsEps : rsqrt(vari)); + rMeanStd = (-mean) * stddev_inv; + for(coord.x = gidx; coord.x < width; coord.x += 16) + { + src0 = convert_float4(read_imageui(input, coord)) * input0Scale - input0Tail; + src1 = convert_float4(read_imageui(input1, coord)) * input1Scale - input1Tail; + input_d = src0.x + src1.x; + result.x = input_d * stddev_inv + rMeanStd; + write_imagef(output, coord, result.xxxx); + } +} + + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void add_mean_std_norm_U8_U8toU8( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float rsEps, float dimRatio, + float input0Scale, float input0Tail, + float input1Scale, float input1Tail, + float outputScale, float outputZP, + int width) +{ + int lidx = get_local_id(0); + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + float4 src0, src1, result = 0.0f; + float pSum = 0.0f, pSqr = 0.0f; + float sum = 0.0f, sqr = 0.0f; + float input_d = 0.0f; + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(; coord.x < width; coord.x += 16) + { + src0 = convert_float4(read_imageui(input, coord)) * input0Scale - input0Tail; + src1 = convert_float4(read_imageui(input1, coord)) * input1Scale - input1Tail; + input_d = src0.x + src1.x; + pSum += input_d; + pSqr += input_d * input_d; + } + lcl_sum[lidx] = pSum; + lcl_sqr[lidx] = pSqr; + barrier(CLK_LOCAL_MEM_FENCE); + + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; + float4 one = (float4)(1, 1, 1, 1); + float4 data0; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sum = dot(data0, one); + pLocalPtr = (float4 *)&lcl_sqr[0]; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sqr = dot(data0, one); + float mean; + mean = sum * dimRatio; + float vari, stddev_inv, rMeanStd; + vari = sqr*dimRatio - mean*mean; + stddev_inv = (vari==0 ? rsEps : rsqrt(vari)); + rMeanStd = (-mean) * stddev_inv; + for(coord.x = gidx; coord.x < width; coord.x += 16) + { + src0 = convert_float4(read_imageui(input, coord)) * input0Scale - input0Tail; + src1 = convert_float4(read_imageui(input1, coord)) * input1Scale - input1Tail; + input_d = src0.x + src1.x; + result.x = input_d * stddev_inv + rMeanStd; + uint4 dst = convert_uint4(result * outputScale + outputZP); + write_imageui(output, coord, dst); + } +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/argmax_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/argmax_axis0.cl new file mode 100644 index 0000000..81e1709 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/argmax_axis0.cl @@ -0,0 +1,146 @@ +__kernel void argmax_axis0_F32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + + float4 minVal = read_imagef(input, coord); + int minIdx = 0; + coord.x ++; + + for (; coord.x < axisSize;) + { + float4 val = read_imagef(input, coord); + minIdx = val.x > minVal.x ? coord.x : minIdx; + minVal = val > minVal ? val : minVal; + coord.x ++; + } + + write_imagei(output, coord.yz, minIdx); +} + +__kernel void argmax_axis0_F32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + + float4 minVal = read_imagef(input, coord); + int minIdx = 0; + coord.x ++; + + for (; coord.x < axisSize;) + { + float4 val = read_imagef(input, coord); + minIdx = val.x > minVal.x ? coord.x : minIdx; + minVal = val > minVal ? 
val : minVal; + coord.x ++; + } + + coord.x = 0; + write_imagei(output, coord.yx, minIdx); +} + +__kernel void argmax_axis0_U8toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + + uint4 minVal = read_imageui(input, coord); + int minIdx = 0; + coord.x ++; + + for (; coord.x < axisSize;) + { + uint4 val = read_imageui(input, coord); + minIdx = val.x > minVal.x ? coord.x : minIdx; + minVal = val > minVal ? val : minVal; + coord.x ++; + } + + write_imagei(output, coord.yz, minIdx); +} + +__kernel void argmax_axis0_U8toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + + uint4 minVal = read_imageui(input, coord); + int minIdx = 0; + coord.x ++; + + for (; coord.x < axisSize;) + { + uint4 val = read_imageui(input, coord); + minIdx = val.x > minVal.x ? coord.x : minIdx; + minVal = val > minVal ? val : minVal; + coord.x ++; + } + + coord.x = 0; + write_imagei(output, coord.yx, minIdx); +} + +__kernel void argmax_axis0_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + + int4 minVal = read_imagei(input, coord); + int minIdx = 0; + coord.x ++; + + for (; coord.x < axisSize;) + { + int4 val = read_imagei(input, coord); + minIdx = val.x > minVal.x ? coord.x : minIdx; + minVal = val > minVal ? val : minVal; + coord.x ++; + } + + write_imagei(output, coord.yz, minIdx); +} + +__kernel void argmax_axis0_I32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + + int4 minVal = read_imagei(input, coord); + int minIdx = 0; + coord.x ++; + + for (; coord.x < axisSize;) + { + int4 val = read_imagei(input, coord); + minIdx = val.x > minVal.x ? coord.x : minIdx; + minVal = val > minVal ? val : minVal; + coord.x ++; + } + + coord.x = 0; + write_imagei(output, coord.yx, minIdx); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/argmax_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/argmax_axis1.cl new file mode 100644 index 0000000..ef90719 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/argmax_axis1.cl @@ -0,0 +1,146 @@ +__kernel void argmax_axis1_F32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + + float4 minVal = read_imagef(input, coord); + int minIdx = 0; + coord.y ++; + + for (; coord.y < axisSize;) + { + float4 val = read_imagef(input, coord); + minIdx = val.x > minVal.x ? coord.y : minIdx; + minVal = val > minVal ? val : minVal; + coord.y ++; + } + + write_imagei(output, coord.xz, minIdx); +} + +__kernel void argmax_axis1_F32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + + float4 minVal = read_imagef(input, coord); + int minIdx = 0; + coord.y ++; + + for (; coord.y < axisSize;) + { + float4 val = read_imagef(input, coord); + minIdx = val.x > minVal.x ? coord.y : minIdx; + minVal = val > minVal ? 
val : minVal; + coord.y ++; + } + + coord.y = 0; + write_imagei(output, coord, minIdx); +} + +__kernel void argmax_axis1_U8toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + + uint4 minVal = read_imageui(input, coord); + int minIdx = 0; + coord.y ++; + + for (; coord.y < axisSize;) + { + uint4 val = read_imageui(input, coord); + minIdx = val.x > minVal.x ? coord.y : minIdx; + minVal = val > minVal ? val : minVal; + coord.y ++; + } + + write_imagei(output, coord.xz, minIdx); +} + +__kernel void argmax_axis1_U8toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + + uint4 minVal = read_imageui(input, coord); + int minIdx = 0; + coord.y ++; + + for (; coord.y < axisSize;) + { + uint4 val = read_imageui(input, coord); + minIdx = val.x > minVal.x ? coord.y : minIdx; + minVal = val > minVal ? val : minVal; + coord.y ++; + } + + coord.y = 0; + write_imagei(output, coord, minIdx); +} + +__kernel void argmax_axis1_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + + int4 minVal = read_imagei(input, coord); + int minIdx = 0; + coord.y ++; + + for (; coord.y < axisSize;) + { + int4 val = read_imagei(input, coord); + minIdx = val.x > minVal.x ? coord.y : minIdx; + minVal = val > minVal ? val : minVal; + coord.y ++; + } + + write_imagei(output, coord.xz, minIdx); +} + +__kernel void argmax_axis1_I32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + + int4 minVal = read_imagei(input, coord); + int minIdx = 0; + coord.y ++; + + for (; coord.y < axisSize;) + { + int4 val = read_imagei(input, coord); + minIdx = val.x > minVal.x ? coord.y : minIdx; + minVal = val > minVal ? val : minVal; + coord.y ++; + } + + coord.y = 0; + write_imagei(output, coord, minIdx); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/argmax_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/argmax_axis2.cl new file mode 100644 index 0000000..13b302b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/argmax_axis2.cl @@ -0,0 +1,110 @@ +__kernel void argmax_axis2_F32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + float4 minVal = read_imagef(input, coord); + int minIdx = 0; + coord.z ++; + + for (; coord.z < axisSize;) + { + float4 val = read_imagef(input, coord); + minIdx = val.x > minVal.x ? coord.z : minIdx; + minVal = val > minVal ? val : minVal; + coord.z ++; + } + + write_imagei(output, coord.xy, minIdx); +} + +__kernel void argmax_axis2_F32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int minIdx = 0; + + write_imagei(output, coord.xy, minIdx); +} + +__kernel void argmax_axis2_U8toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + uint4 minVal = read_imageui(input, coord); + int minIdx = 0; + coord.z ++; + + for (; coord.z < axisSize;) + { + uint4 val = read_imageui(input, coord); + minIdx = val.x > minVal.x ? 
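/* note (annotation, not part of the original patch): despite the "min" names, the argmax kernels track the running maximum value and its index along the reduced axis */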
coord.z : minIdx; + minVal = val > minVal ? val : minVal; + coord.z ++; + } + + write_imagei(output, coord.xy, minIdx); +} + +__kernel void argmax_axis2_U8toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int minIdx = 0; + + write_imagei(output, coord.xy, minIdx); +} + +__kernel void argmax_axis2_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + int4 minVal = read_imagei(input, coord); + int minIdx = 0; + coord.z ++; + + for (; coord.z < axisSize;) + { + int4 val = read_imagei(input, coord); + minIdx = val.x > minVal.x ? coord.z : minIdx; + minVal = val > minVal ? val : minVal; + coord.z ++; + } + + write_imagei(output, coord.xy, minIdx); +} + +__kernel void argmax_axis2_I32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int minIdx = 0; + + write_imagei(output, coord.xy, minIdx); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/argmin_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/argmin_axis0.cl new file mode 100644 index 0000000..ad92c91 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/argmin_axis0.cl @@ -0,0 +1,147 @@ +__kernel void argmin_axis0_F32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + + float4 minVal = read_imagef(input, coord); + int minIdx = 0; + coord.x ++; + + for (; coord.x < axisSize;) + { + float4 val = read_imagef(input, coord); + minIdx = val.x < minVal.x ? coord.x : minIdx; + minVal = val < minVal ? val : minVal; + coord.x ++; + } + + write_imagei(output, coord.yz, minIdx); +} + +__kernel void argmin_axis0_F32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + + float4 minVal = read_imagef(input, coord); + int minIdx = 0; + coord.x ++; + + for (; coord.x < axisSize;) + { + float4 val = read_imagef(input, coord); + minIdx = val.x < minVal.x ? coord.x : minIdx; + minVal = val < minVal ? val : minVal; + coord.x ++; + } + + coord.x = 0; + write_imagei(output, coord.yx, minIdx); +} + +__kernel void argmin_axis0_U8toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + + uint4 minVal = read_imageui(input, coord); + int minIdx = 0; + coord.x ++; + + for (; coord.x < axisSize;) + { + uint4 val = read_imageui(input, coord); + minIdx = val.x < minVal.x ? coord.x : minIdx; + minVal = val < minVal ? val : minVal; + coord.x ++; + } + + write_imagei(output, coord.yz, minIdx); +} + +__kernel void argmin_axis0_U8toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + + uint4 minVal = read_imageui(input, coord); + int minIdx = 0; + coord.x ++; + + for (; coord.x < axisSize;) + { + uint4 val = read_imageui(input, coord); + minIdx = val.x < minVal.x ? coord.x : minIdx; + minVal = val < minVal ? 
val : minVal; + coord.x ++; + } + + coord.x = 0; + write_imagei(output, coord.yx, minIdx); +} + +__kernel void argmin_axis0_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + + int4 minVal = read_imagei(input, coord); + int minIdx = 0; + coord.x ++; + + for (; coord.x < axisSize;) + { + int4 val = read_imagei(input, coord); + minIdx = val.x < minVal.x ? coord.x : minIdx; + minVal = val < minVal ? val : minVal; + coord.x ++; + } + + write_imagei(output, coord.yz, minIdx); +} + +__kernel void argmin_axis0_I32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + + int4 minVal = read_imagei(input, coord); + int minIdx = 0; + coord.x ++; + + for (; coord.x < axisSize;) + { + int4 val = read_imagei(input, coord); + minIdx = val.x < minVal.x ? coord.x : minIdx; + minVal = val < minVal ? val : minVal; + coord.x ++; + } + + coord.x = 0; + write_imagei(output, coord.yx, minIdx); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/argmin_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/argmin_axis1.cl new file mode 100644 index 0000000..fca2474 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/argmin_axis1.cl @@ -0,0 +1,147 @@ +__kernel void argmin_axis1_F32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + + float4 minVal = read_imagef(input, coord); + int minIdx = 0; + coord.y ++; + + for (; coord.y < axisSize;) + { + float4 val = read_imagef(input, coord); + minIdx = val.x < minVal.x ? coord.y : minIdx; + minVal = val < minVal ? val : minVal; + coord.y ++; + } + + write_imagei(output, coord.xz, minIdx); +} + +__kernel void argmin_axis1_F32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + + float4 minVal = read_imagef(input, coord); + int minIdx = 0; + coord.y ++; + + for (; coord.y < axisSize;) + { + float4 val = read_imagef(input, coord); + minIdx = val.x < minVal.x ? coord.y : minIdx; + minVal = val < minVal ? val : minVal; + coord.y ++; + } + + coord.y = 0; + write_imagei(output, coord, minIdx); +} + +__kernel void argmin_axis1_U8toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + + uint4 minVal = read_imageui(input, coord); + int minIdx = 0; + coord.y ++; + + for (; coord.y < axisSize;) + { + uint4 val = read_imageui(input, coord); + minIdx = val.x < minVal.x ? coord.y : minIdx; + minVal = val < minVal ? val : minVal; + coord.y ++; + } + + write_imagei(output, coord.xz, minIdx); +} + +__kernel void argmin_axis1_U8toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + + uint4 minVal = read_imageui(input, coord); + int minIdx = 0; + coord.y ++; + + for (; coord.y < axisSize;) + { + uint4 val = read_imageui(input, coord); + minIdx = val.x < minVal.x ? coord.y : minIdx; + minVal = val < minVal ? 
val : minVal; + coord.y ++; + } + + coord.y = 0; + write_imagei(output, coord, minIdx); +} + +__kernel void argmin_axis1_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + + int4 minVal = read_imagei(input, coord); + int minIdx = 0; + coord.y ++; + + for (; coord.y < axisSize;) + { + int4 val = read_imagei(input, coord); + minIdx = val.x < minVal.x ? coord.y : minIdx; + minVal = val < minVal ? val : minVal; + coord.y ++; + } + + write_imagei(output, coord.xz, minIdx); +} + +__kernel void argmin_axis1_I32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + + int4 minVal = read_imagei(input, coord); + int minIdx = 0; + coord.y ++; + + for (; coord.y < axisSize;) + { + int4 val = read_imagei(input, coord); + minIdx = val.x < minVal.x ? coord.y : minIdx; + minVal = val < minVal ? val : minVal; + coord.y ++; + } + + coord.y = 0; + write_imagei(output, coord, minIdx); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/argmin_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/argmin_axis2.cl new file mode 100644 index 0000000..78f9e76 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/argmin_axis2.cl @@ -0,0 +1,111 @@ +__kernel void argmin_axis2_F32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + float4 minVal = read_imagef(input, coord); + int minIdx = 0; + coord.z ++; + + for (; coord.z < axisSize;) + { + float4 val = read_imagef(input, coord); + minIdx = val.x < minVal.x ? coord.z : minIdx; + minVal = val < minVal ? val : minVal; + coord.z ++; + } + + write_imagei(output, coord.xy, minIdx); +} + +__kernel void argmin_axis2_F32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int minIdx = 0; + + write_imagei(output, coord.xy, minIdx); +} + +__kernel void argmin_axis2_U8toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + uint4 minVal = read_imageui(input, coord); + int minIdx = 0; + coord.z ++; + + for (; coord.z < axisSize;) + { + uint4 val = read_imageui(input, coord); + minIdx = val.x < minVal.x ? coord.z : minIdx; + minVal = val < minVal ? val : minVal; + coord.z ++; + } + + write_imagei(output, coord.xy, minIdx); +} + +__kernel void argmin_axis2_U8toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int minIdx = 0; + + write_imagei(output, coord.xy, minIdx); +} + +__kernel void argmin_axis2_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + int4 minVal = read_imagei(input, coord); + int minIdx = 0; + coord.z ++; + + for (; coord.z < axisSize;) + { + int4 val = read_imagei(input, coord); + minIdx = val.x < minVal.x ? coord.z : minIdx; + minVal = val < minVal ? 
val : minVal; + coord.z ++; + } + + write_imagei(output, coord.xy, minIdx); +} + +__kernel void argmin_axis2_I32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axisSize + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int minIdx = 0; + + write_imagei(output, coord.xy, minIdx); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/batchnorm_single.cl b/src/tim/vx/internal/src/libnnext/ops/cl/batchnorm_single.cl new file mode 100644 index 0000000..227e659 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/batchnorm_single.cl @@ -0,0 +1,132 @@ + +__kernel void batch_norm_F32toF32 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t Mean, + __read_only image2d_array_t Variance, + __read_only image2d_array_t Gamma, + __read_only image2d_array_t Beta, + __write_only image2d_array_t output, + float eps, + float input_scale, + float input_tail, + float output_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + float4 src, mean, var, gamma, beta; + readImage2DArray(src, input, coord); + readImage2DArray(mean, Mean, coord); + readImage2DArray(var, Variance, coord); + readImage2DArray(gamma, Gamma, coord); + readImage2DArray(beta, Beta, coord); + + float4 dst; + src.x = src.x - mean.x; + float inv = rsqrt(var.x + eps); + dst.x = src.x * inv *gamma.x + beta.x; + + write_imagef(output, coord, dst); +} + +__kernel void batch_norm_F32toF32_2D + ( + __read_only image2d_t input, + __read_only image2d_t Mean, + __read_only image2d_t Variance, + __read_only image2d_t Gamma, + __read_only image2d_t Beta, + __write_only image2d_t output, + float eps, + float input_scale, + float input_tail, + float output_scale, + float output_zp + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + float4 src = read_imagef(input, coord); + float4 mean = read_imagef(Mean, coord); + float4 var = read_imagef(Variance, coord); + float4 gamma = read_imagef(Gamma, coord); + float4 beta = read_imagef(Beta, coord); + + float4 dst = 0; + src.x = src.x - mean.x; + float inv = rsqrt(var.x + eps); + dst.x = src.x * inv *gamma.x + beta.x; + + write_imagef(output, coord, dst); +} + +__kernel void batch_norm_U8toU8 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t Mean, + __read_only image2d_array_t Variance, + __read_only image2d_array_t Gamma, + __read_only image2d_array_t Beta, + __write_only image2d_array_t output, + float eps, + float input_scale, + float input_tail, + float output_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + uint4 data; + float4 src, mean, var, gamma, beta; + readImage2DArray(data, input, coord); + readImage2DArray(mean, Mean, coord); + readImage2DArray(var, Variance, coord); + readImage2DArray(gamma, Gamma, coord); + readImage2DArray(beta, Beta, coord); + + src = convert_float4(data) * input_scale - input_tail; + src.x = src.x - mean.x; + float inv = rsqrt(var.x + eps); + src.x = src.x * inv *gamma.x + beta.x; + + uint4 dst = convert_uint4(src * output_scale + output_zp); + + write_imageui(output, coord, dst); +} + +__kernel void batch_norm_U8toU8_2D + ( + __read_only image2d_t input, + __read_only image2d_t Mean, + __read_only image2d_t Variance, + __read_only image2d_t Gamma, + __read_only image2d_t Beta, + __write_only image2d_t output, + float eps, + float input_scale, + float input_tail, + float output_scale, + float output_zp + ) +{ + int2 coord = 
(int2)(get_global_id(0), get_global_id(1)); + + uint4 data = read_imageui(input, coord); + float4 mean = read_imagef(Mean, coord); + float4 var = read_imagef(Variance, coord); + float4 gamma = read_imagef(Gamma, coord); + float4 beta = read_imagef(Beta, coord); + + float4 src = convert_float4(data) * input_scale - input_tail; + src.x = src.x - mean.x; + float inv = rsqrt(var.x + eps); + src.x = src.x * inv *gamma.x + beta.x; + + uint4 dst = convert_uint4(src * output_scale + output_zp); + + write_imageui(output, coord, dst); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cast.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cast.cl new file mode 100644 index 0000000..781d3fb --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cast.cl @@ -0,0 +1,78 @@ + +#define CAST_FUN(src_name, dst_name, src_type, dst_type, conv_fun, read_fun, write_fun) \ +__kernel void cast_##src_name##to##dst_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + src_type src = read_fun(input, coord); \ + dst_type dst = 0; \ + dst = conv_fun(src); \ + write_fun(output, coord, dst); \ +} + +CAST_FUN(F32, I32, float4, int4, convert_int4_sat, read_imagef, write_imagei) +CAST_FUN(F32, U32, float4, uint4, convert_uint4_sat, read_imagef, write_imageui) +CAST_FUN(I32, I32, int4, int4, convert_int4_sat, read_imagei, write_imagei) +CAST_FUN(I32, U32, int4, uint4, convert_uint4_sat, read_imagei, write_imageui) +CAST_FUN(U32, I32, uint4, int4, convert_int4_sat, read_imageui, write_imagei) +CAST_FUN(U32, U32, uint4, uint4, convert_uint4_sat, read_imageui, write_imageui) +CAST_FUN(F32, F32, float4, float4, convert_float4, read_imagef, write_imagef) +CAST_FUN(I32, F32, int4, float4, convert_float4, read_imagei, write_imagef) +CAST_FUN(U32, F32, uint4, float4, convert_float4, read_imageui, write_imagef) + +#define CAST_FUN_2D(src_name, dst_name, src_type, dst_type, conv_fun, read_fun, write_fun) \ +__kernel void cast_##src_name##to##dst_name##_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + src_type src = read_fun(input, coord); \ + dst_type dst = 0; \ + dst = conv_fun(src); \ + write_fun(output, coord, dst); \ +} + +CAST_FUN_2D(F32, I32, float4, int4, convert_int4_sat, read_imagef, write_imagei) +CAST_FUN_2D(F32, U32, float4, uint4, convert_uint4_sat, read_imagef, write_imageui) +CAST_FUN_2D(I32, I32, int4, int4, convert_int4_sat, read_imagei, write_imagei) +CAST_FUN_2D(I32, U32, int4, uint4, convert_uint4_sat, read_imagei, write_imageui) +CAST_FUN_2D(U32, I32, uint4, int4, convert_int4_sat, read_imageui, write_imagei) +CAST_FUN_2D(U32, U32, uint4, uint4, convert_uint4_sat, read_imageui, write_imageui) +CAST_FUN_2D(F32, F32, float4, float4, convert_float4, read_imagef, write_imagef) +CAST_FUN_2D(I32, F32, int4, float4, convert_float4, read_imagei, write_imagef) +CAST_FUN_2D(U32, F32, uint4, float4, convert_float4, read_imageui, write_imagef) + +#define CAST_TO_BOOL_FUN(src_name, src_type, read_fun) \ +__kernel void cast_##src_name##toBOOL8( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + src_type src = read_fun(input, coord); \ + int4 dst = 0; \ + dst.x = (int)(src.x != 0); \ + write_imagei(output, coord, dst); \ +} + +CAST_TO_BOOL_FUN(F32, float4, read_imagef) +CAST_TO_BOOL_FUN(I32, 
int4, read_imagei) +CAST_TO_BOOL_FUN(U32, uint4, read_imageui) + + +#define CAST_TO_BOOL_FUN_2D(src_name, src_type, read_fun) \ +__kernel void cast_##src_name##toBOOL8_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + src_type src = read_fun(input, coord); \ + int4 dst = 0; \ + dst.x = (int)(src.x != 0); \ + write_imagei(output, coord, dst); \ +} + +CAST_TO_BOOL_FUN_2D(F32, float4, read_imagef) +CAST_TO_BOOL_FUN_2D(I32, int4, read_imagei) +CAST_TO_BOOL_FUN_2D(U32, uint4, read_imageui) + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/clip_F32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/clip_F32.cl new file mode 100644 index 0000000..384798e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/clip_F32.cl @@ -0,0 +1,64 @@ +__kernel void clip_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float4 src = read_imagef(input, coord); + float4 dst = src > minData ? src : minData; + dst = dst < maxData ? dst : maxData; + write_imagef(output, coord, dst); +} + +__kernel void clip_F32toF32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float minData, + float maxData) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + float4 src = read_imagef(input, coord); + float4 dst = src > minData ? src : minData; + dst = dst < maxData ? dst : maxData; + write_imagef(output, coord, dst); +} + +__kernel void clip_F32toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float4 src = read_imagef(input, coord); + float4 result = src > minData ? src : minData; + result = result < maxData ? result : maxData; + uint4 dst = convert_uint4_rte(result * outputScale + outputZP); + write_imageui(output, coord, dst); +} + +__kernel void clip_F32toU8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float minData, + float maxData, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + float4 src = read_imagef(input, coord); + float4 result = src > minData ? src : minData; + result = result < maxData ? result : maxData; + uint4 dst = convert_uint4_rte(result * outputScale + outputZP); + write_imageui(output, coord, dst); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/clip_U8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/clip_U8.cl new file mode 100644 index 0000000..eda391b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/clip_U8.cl @@ -0,0 +1,73 @@ +__kernel void clip_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail; + float4 result = src > minData ? src : minData; + result = result < maxData ? 
result : maxData; + uint4 dst = convert_uint4_rte(result * outputScale + outputZP); + write_imageui(output, coord, dst); +} + +__kernel void clip_U8toU8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float minData, + float maxData, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail; + float4 result = src > minData ? src : minData; + result = result < maxData ? result : maxData; + uint4 dst = convert_uint4_rte(result * outputScale + outputZP); + write_imageui(output, coord, dst); +} + +__kernel void clip_U8toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail; + float4 dst = src > minData ? src : minData; + dst = dst < maxData ? dst : maxData; + write_imagef(output, coord, dst); +} + +__kernel void clip_U8toF32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float minData, + float maxData, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail; + float4 dst = src > minData ? src : minData; + dst = dst < maxData ? dst : maxData; + write_imagef(output, coord, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/detect_post_box.cl b/src/tim/vx/internal/src/libnnext/ops/cl/detect_post_box.cl new file mode 100644 index 0000000..66a7fcb --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/detect_post_box.cl @@ -0,0 +1,101 @@ +float exp_(float x, float logE) +{ + x *= logE; + x = exp2(x); + return x; +} + +__kernel void detect_post_box_F32_F32toF32( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float inv_scale_y, + float inv_scale_x, + float inv_scale_h, + float inv_scale_w, + float logE) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + float4 src0; + float4 src1; + float4 dst; + float4 tmp0, tmp1; + src0.x = read_imagef(input0, coord).x; + src1.x = read_imagef(input1, coord.xy).x; + coord.x++; + src0.y = read_imagef(input0, coord).x; + src1.y = read_imagef(input1, coord.xy).x; + coord.x++; + src0.z = read_imagef(input0, coord).x; + src1.z = read_imagef(input1, coord.xy).x; + coord.x++; + src0.w = read_imagef(input0, coord).x; + src1.w = read_imagef(input1, coord.xy).x; + + tmp0.x = src1.x + src1.z * src0.x * inv_scale_y; + tmp0.y = src1.y + src1.w * src0.y * inv_scale_x; + tmp1.x = src1.z * exp_(src0.z * inv_scale_h, logE) * 0.5f; + tmp1.y = src1.w * exp_(src0.w * inv_scale_w, logE) * 0.5f; + dst.xy = tmp0.xy - tmp1.xy; + dst.zw = tmp0.xy + tmp1.xy; + coord.x = 0; + write_imagef(output, coord, dst.xxxx); + coord.x++; + write_imagef(output, coord, dst.yyyy); + coord.x++; + write_imagef(output, coord, dst.zzzz); + coord.x++; + write_imagef(output, coord, dst.wwww); +} + + +__kernel void detect_post_box_U8_U8toF32( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float inv_scale_y, + float inv_scale_x, + float inv_scale_h, + float inv_scale_w, + float 
logE, + float input0Tail, + float input1Tail, + float input0Scale, + float input1Scale) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + uint4 in0, in1; + float4 src0; + float4 src1; + float4 dst; + float4 tmp0, tmp1; + in0.x = read_imageui(input0, coord).x; + in1.x = read_imageui(input1, coord.xy).x; + coord.x++; + in0.y = read_imageui(input0, coord).x; + in1.y = read_imageui(input1, coord.xy).x; + coord.x++; + in0.z = read_imageui(input0, coord).x; + in1.z = read_imageui(input1, coord.xy).x; + coord.x++; + in0.w = read_imageui(input0, coord).x; + in1.w = read_imageui(input1, coord.xy).x; + + src0 = convert_float4(in0) * input0Scale + input0Tail; + src1 = convert_float4(in1) * input1Scale + input1Tail; + + tmp0.x = src1.x + src1.z * src0.x * inv_scale_y; + tmp0.y = src1.y + src1.w * src0.y * inv_scale_x; + tmp1.x = src1.z * exp_(src0.z * inv_scale_h, logE) * 0.5f; + tmp1.y = src1.w * exp_(src0.w * inv_scale_w, logE) * 0.5f; + dst.xy = tmp0.xy - tmp1.xy; + dst.zw = tmp0.xy + tmp1.xy; + coord.x = 0; + write_imagef(output, coord, dst.xxxx); + coord.x++; + write_imagef(output, coord, dst.yyyy); + coord.x++; + write_imagef(output, coord, dst.zzzz); + coord.x++; + write_imagef(output, coord, dst.wwww); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl new file mode 100644 index 0000000..cd255bb --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl @@ -0,0 +1,32 @@ +#pragma OPENCL EXTENSION CL_VIV_asm : enable +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#define readImage2DArray(Dest, Image, Coord) \ + do { \ + int8 desc; \ + _viv_asm(COPY, desc, Image, sizeof(desc)); \ + _viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \ + int baseAddr = (int)(Coord).w * desc.s4 + desc.s0; \ + _viv_asm(MOV, (Coord).w, baseAddr); \ + _viv_asm(IMAGE_READ_3D, Dest, Image, (Coord).xyww); \ + } while (0) + +#define writeImage2DArray(Image, Coord, Color) \ + do { \ + int8 desc; \ + _viv_asm(COPY, desc, Image, sizeof(desc)); \ + _viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \ + int baseAddr = (int)(Coord).w * desc.s4 + desc.s0; \ + _viv_asm(MOV, (Coord).w, baseAddr); \ + _viv_asm(IMAGE_WRITE_3D, Color, Image, (Coord).xyww); \ + } while (0) + +#define readImage(Dest, Image, Coord) \ + do { \ + _viv_asm(IMAGE_READ, Dest, Image, Coord); \ + } while (0) + +#define writeImage(Image, Coord, Color) \ + do { \ + _viv_asm(IMAGE_WRITE, Color, Image, Coord); \ + } while (0) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl new file mode 100644 index 0000000..c702951 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl @@ -0,0 +1,214 @@ + +float4 eltwise_unary_sin(float4 x) +{ + return native_sin(x); +} + +#define logE (1.44269502f) +#define twoLogE (logE * 2.0f) +float4 eltwise_unary_exp(float4 x) +{ + x *= logE; + x = exp2(x); + return x; +} + +#define rlogE (0.693147182f) +float4 eltwise_unary_log(float4 x) +{ + x = log2(x); + return x * rlogE; +} + +float4 eltwise_unary_elu(float4 val) +{ + float4 x = val * logE; + x = exp2(x) - 1; + + return val < 0 ? 
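/* annotation: ELU with alpha = 1 -- x holds exp(val) - 1, returned for negative inputs, identity otherwise */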
x : val; +} + +float4 eltwise_unary_neg(float4 x) +{ + return x * -1; +} + +float4 eltwise_unary_hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} + +float4 _softrelu(float4 x) +{ + x *= logE; + x = exp2(x); + x += 1; + x = log2(x); + return x * rlogE; +} + +float4 _tanh(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return (2 * x - 1); +} + +float4 eltwise_unary_mish(float4 x) +{ + float4 y = _softrelu(x); + x = x * _tanh(y); + return x; +} + +#define ELTWISE_UNARY_F32(func_name) \ +__kernel void func_name##_F32toF32 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputZP \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + float4 src = read_imagef(input, coord); \ + \ + float4 dst = eltwise_unary_##func_name(src); \ + \ + write_imagef(output, coord, dst); \ +} +ELTWISE_UNARY_F32(sin) +ELTWISE_UNARY_F32(exp) +ELTWISE_UNARY_F32(log) +ELTWISE_UNARY_F32(elu) +ELTWISE_UNARY_F32(neg) +ELTWISE_UNARY_F32(mish) +ELTWISE_UNARY_F32(hard_sigmoid) + +#define ELTWISE_UNARY_F32_2D(func_name) \ +__kernel void func_name##_F32toF32_2D \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputZP \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + float4 src = read_imagef(input, coord); \ + \ + float4 dst = eltwise_unary_##func_name(src); \ + \ + write_imagef(output, coord, dst); \ +} +ELTWISE_UNARY_F32_2D(sin) +ELTWISE_UNARY_F32_2D(exp) +ELTWISE_UNARY_F32_2D(log) +ELTWISE_UNARY_F32_2D(elu) +ELTWISE_UNARY_F32_2D(neg) +ELTWISE_UNARY_F32_2D(mish) +ELTWISE_UNARY_F32_2D(hard_sigmoid) + +#define ELTWISE_UNARY_U8(func_name) \ +__kernel void func_name##_U8toU8 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputZP \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + uint4 src = read_imageui(input, coord); \ + float4 data = convert_float4(src) * inputScale - inputTail; \ + \ + data = eltwise_unary_##func_name(data); \ + uint4 dst = convert_uint4(data * outputScale + outputZP); \ + \ + write_imageui(output, coord, dst); \ +} +ELTWISE_UNARY_U8(sin) +ELTWISE_UNARY_U8(exp) +ELTWISE_UNARY_U8(log) +ELTWISE_UNARY_U8(elu) +ELTWISE_UNARY_U8(neg) +ELTWISE_UNARY_U8(mish) +ELTWISE_UNARY_U8(hard_sigmoid) + +#define ELTWISE_UNARY_U8_2D(func_name) \ +__kernel void func_name##_U8toU8_2D \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputZP \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + uint4 src = read_imageui(input, coord); \ + float4 data = convert_float4(src) * inputScale - inputTail; \ + \ + data = eltwise_unary_##func_name(data); \ + uint4 dst = convert_uint4(data * outputScale + outputZP); \ + \ + write_imageui(output, coord, dst); \ +} +ELTWISE_UNARY_U8_2D(sin) +ELTWISE_UNARY_U8_2D(exp) +ELTWISE_UNARY_U8_2D(log) +ELTWISE_UNARY_U8_2D(elu) +ELTWISE_UNARY_U8_2D(neg) +ELTWISE_UNARY_U8_2D(mish) +ELTWISE_UNARY_U8_2D(hard_sigmoid) + + +__kernel void neg_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + 
float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 src = read_imagei(input, coord); + + int4 dst = -src; + + write_imagei(output, coord, dst); +} + +__kernel void neg_I32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 src = read_imagei(input, coord); + + int4 dst = -src; + + write_imagei(output, coord, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl new file mode 100644 index 0000000..2164ea2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl @@ -0,0 +1,96 @@ +__kernel void floordiv_F32F32toF32( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float4 src0; + float4 src1; + readImage2DArray(src0, input, coord); + readImage2DArray(src1, input1, coord); + float4 dst = floor(src0 / src1); + write_imagef(output, coord, dst); +} + +__kernel void floordiv_F32F32toF32_2D( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + float4 src0 = read_imagef(input, coord); + float4 src1 = read_imagef(input1, coord); + float4 dst = floor(src0 / src1); + write_imagef(output, coord, dst); +} + +__kernel void floordiv_I32I32toI32( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 src0; + int4 src1; + readImage2DArray(src0, input, coord); + readImage2DArray(src1, input1, coord); + int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1))); + write_imagei(output, coord, dst); +} + +__kernel void floordiv_I32I32toI32_2D( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 src0 = read_imagei(input, coord); + int4 src1 = read_imagei(input1, coord); + int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1))); + write_imagei(output, coord, dst); +} + +__kernel void floordiv_U8U8toU8( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + uint4 src0, src1; + float4 in0, in1, out; + readImage2DArray(src0, input, coord); + readImage2DArray(src1, input1, coord); + in0 = convert_float4(src0) * input0Scale + input0Tail; + in1 = convert_float4(src1) * input1Scale + input1Tail; + out = floor(in0 / in1) * outputScale + outputTail; + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} + +__kernel void floordiv_U8U8toU8_2D( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + uint4 src0 = read_imageui(input, coord); + uint4 src1 = 
read_imageui(input1, coord); + float4 in0, in1, out; + in0 = convert_float4(src0) * input0Scale + input0Tail; + in1 = convert_float4(src1) * input1Scale + input1Tail; + out = floor(in0 / in1) * outputScale + outputTail; + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl new file mode 100644 index 0000000..1c8caff --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl @@ -0,0 +1,95 @@ +__kernel void gather_U8toU8( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num, + int indices_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + uint4 data = read_imageui(input0, coord_in.zw); + + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + write_imageui(output, coord, data); +} + +__kernel void gather_F16toF16( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num, + int indices_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + float4 data = read_imagef(input0, coord_in.zw); + + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + write_imagef(output, coord, data); +} + +__kernel void gather_I32toI32( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num, + int indices_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + int4 data = read_imagei(input0, coord_in.zw); + + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + write_imagei(output, coord, data); +} + +__kernel void gather_F32toF32( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num, + int indices_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + float4 data = read_imagef(input0, coord_in.zw); + + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + write_imagef(output, coord, data); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd.cl new file mode 100644 index 0000000..5c29281 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd.cl @@ -0,0 +1,160 @@ +__kernel void gather_nd_U8toU8_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = 
get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + coord.w = indice.x; + + uint4 data = read_imageui(input0, coord.zw); + write_imageui(output, coord.zy, data); +} + +__kernel void gather_nd_F16toF16_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + coord.w = indice.x; + + float4 data = read_imagef(input0, coord.zw); + write_imagef(output, coord.zy, data); +} + +__kernel void gather_nd_I32toI32_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + coord.w = indice.x; + + int4 data = read_imagei(input0, coord.zw); + write_imagei(output, coord.zy, data); +} + +__kernel void gather_nd_F32toF32_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + coord.w = indice.x; + + float4 data = read_imagef(input0, coord.zw); + write_imagef(output, coord.zy, data); +} + +//2D +__kernel void gather_nd_U8toU8_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 1); + int4 indice = read_imagei(input1, coord.xy); + int4 indice1 = read_imagei(input1, coord.wy); + indice.x = indice.x * block_size + gidx; + indice.y = indice1.x; + + uint4 data = read_imageui(input0, indice.xy); + write_imageui(output, coord.zy, data); +} + +__kernel void gather_nd_F16toF16_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 1); + int4 indice = read_imagei(input1, coord.xy); + int4 indice1 = read_imagei(input1, coord.wy); + indice.x = indice.x * block_size + gidx; + indice.y = indice1.x; + + float4 data = read_imagef(input0, indice.xy); + write_imagef(output, coord.zy, data); +} + +__kernel void gather_nd_I32toI32_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 1); + int4 indice = read_imagei(input1, coord.xy); + int4 indice1 = read_imagei(input1, coord.wy); + indice.x = indice.x * block_size + gidx; + indice.y = indice1.x; + + int4 data = read_imagei(input0, indice.xy); + write_imagei(output, coord.zy, data); +} + +__kernel void gather_nd_F32toF32_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = 
get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 1); + int4 indice = read_imagei(input1, coord.xy); + int4 indice1 = read_imagei(input1, coord.wy); + indice.x = indice.x * block_size + gidx; + indice.y = indice1.x; + + float4 data = read_imagef(input0, indice.xy); + write_imagef(output, coord.zy, data); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_3d.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_3d.cl new file mode 100644 index 0000000..43ece7f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_3d.cl @@ -0,0 +1,90 @@ +__kernel void gather_nd_U8toU8_3D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, 1, 2); + int4 indice = read_imagei(input1, coord.xy); + int4 indice1 = read_imagei(input1, coord.zy); + int4 indice2 = read_imagei(input1, coord.wy); + indice = (int4)(indice.x * block_size + gidx, indice1.x, indice2.x, 0); + coord.z = gidx; + + uint4 data = read_imageui(input0, indice); + write_imageui(output, coord.zy, data); +} + +__kernel void gather_nd_F16toF16_3D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord = (int4)(0, gidy, 1, 2); + int4 indice = read_imagei(input1, coord.xy); + int4 indice1 = read_imagei(input1, coord.zy); + int4 indice2 = read_imagei(input1, coord.wy); + indice = (int4)(indice.x * block_size + gidx, indice1.x, indice2.x, 0); + coord.z = gidx; + + float4 data = read_imagef(input0, indice); + write_imagef(output, coord.zy, data); +} + +__kernel void gather_nd_I32toI32_3D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord = (int4)(0, gidy, 1, 2); + int4 indice = read_imagei(input1, coord.xy); + int4 indice1 = read_imagei(input1, coord.zy); + int4 indice2 = read_imagei(input1, coord.wy); + indice = (int4)(indice.x * block_size + gidx, indice1.x, indice2.x, 0); + coord.z = gidx; + + int4 data = read_imagei(input0, indice); + write_imagei(output, coord.zy, data); +} + +__kernel void gather_nd_F32toF32_3D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord = (int4)(0, gidy, 1, 2); + int4 indice = read_imagei(input1, coord.xy); + int4 indice1 = read_imagei(input1, coord.zy); + int4 indice2 = read_imagei(input1, coord.wy); + indice = (int4)(indice.x * block_size + gidx, indice1.x, indice2.x, 0); + coord.z = gidx; + + float4 data = read_imagef(input0, indice); + write_imagef(output, coord.zy, data); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation.cl b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation.cl new file mode 100644 index 0000000..3d01bc7 --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation.cl @@ -0,0 +1,6 @@ +__kernel void grucell_activation( + __read_only image2d_array_t input, + __write_only image2d_array_t output) +{ + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_sma.cl b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_sma.cl new file mode 100644 index 0000000..fa8eb8e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_sma.cl @@ -0,0 +1,6 @@ +__kernel void grucell_activation_sma( + __read_only image2d_array_t input, + __write_only image2d_array_t output) +{ + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/hswish.cl b/src/tim/vx/internal/src/libnnext/ops/cl/hswish.cl new file mode 100644 index 0000000..6cb76e7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/hswish.cl @@ -0,0 +1,102 @@ +#define HSWISH_F32_F32_PROCESS() \ + float4 src, tmp, dst; \ + src = read_imagef(input, coord); \ + tmp = src + 3; \ + tmp = tmp > 0 ? tmp : 0; \ + tmp = tmp < 6 ? tmp : 6; \ + dst = src * tmp / 6.0f; \ + write_imagef(output, coord, dst); + +__kernel void hswish_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + HSWISH_F32_F32_PROCESS() +} + +__kernel void hswish_F32toF32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + HSWISH_F32_F32_PROCESS() +} + + +#define HSWISH_U8_U8_PROCESS() \ + float4 src, tmp, data; \ + uint4 src0 = read_imageui(input, coord); \ + src = convert_float4(src0) * inputScale - inputTail; \ + tmp = src + 3; \ + tmp = tmp > 0 ? tmp : 0; \ + tmp = tmp < 6 ? tmp : 6; \ + data = src * tmp / 6.0f; \ + uint4 dst = convert_uint4(data * outputScale + outputZP); \ + write_imageui(output, coord, dst); + +__kernel void hswish_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + HSWISH_U8_U8_PROCESS() +} + +__kernel void hswish_U8toU8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + HSWISH_U8_U8_PROCESS() +} + + +#define HSWISH_I32_I32_PROCESS() \ + int4 tmp, dst, src; \ + src = read_imagei(input, coord); \ + tmp = src + 3; \ + tmp = tmp > 0 ? tmp : 0; \ + tmp = tmp < 6 ? 
tmp : 6; \ + dst = src * tmp / 6; \ + write_imagei(output, coord, dst); + +__kernel void hswish_I32toI32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + HSWISH_I32_I32_PROCESS() +} + +__kernel void hswish_I32toI32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + HSWISH_I32_I32_PROCESS() +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f16.cl b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f16.cl new file mode 100644 index 0000000..f05e01d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f16.cl @@ -0,0 +1,229 @@ +__kernel void instance_norm_meanvari_F16( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, gidz, 0); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + data = read_imagef(input, coord); + coord.y++; + sum += data.x; + sqr += data.x * data.x; + } + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void instance_norm_meanvari_F16_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + int gidy = gidz * height; + + int2 coord = (int2)(gidx, gidy); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + height; + if(gidx < width) + { + for(; coord.y < endH;) + { + data = read_imagef(input, coord); + coord.y++; + sum += data.x; + sqr += data.x * data.x; + } + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void instance_norm_F16toF16( + __read_only image2d_array_t input, + __read_only image2d_t bias, + 
__read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int output_zp, + float output_scale, + float output_fl, + int width, + int height, + float dim_ratio, + int group_num + ) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord_para = (int4)(0, gidz, 0, 0); + + float4 gamma = read_imagef(scale, coord_para.yx); + float4 beta = read_imagef(bias, coord_para.yx); + float4 mean_vari = (float4)(0); + float scale_vari, bias_val; + + for(int i = 0; i < group_num; i++) + { + mean_vari.x += read_imagef(meanVari, coord_para.xy).x; + coord_para.x++; + mean_vari.y += read_imagef(meanVari, coord_para.xy).x; + coord_para.x+=3; + } + mean_vari *= dim_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = gamma.s0 * mean_vari.s1; + bias_val = (beta.s0 - scale_vari * mean_vari.s0); + + float4 data, dst; + for(coord.y = 0; coord.y < height;coord.y++) + { + data = read_imagef(input, coord); + + dst.x = data.x * scale_vari + bias_val; + write_imagef(output, coord, dst); + } +} + +__kernel void instance_norm_F16toF16_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int output_zp, + float output_scale, + float output_fl, + int width, + int height, + float dim_ratio, + int group_num + ) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int2 coord = (int2)(get_global_id(0), gidy); + int2 coord_para = (int2)(0, gidz); + int endH = gidy + height; + + float4 gamma = read_imagef(scale, coord_para.yx); + float4 beta = read_imagef(bias, coord_para.yx); + float4 mean_vari = (float4)(0); + float scale_vari, bias_val; + + for(int i = 0; i < group_num; i++) + { + mean_vari.x += read_imagef(meanVari, coord_para.xy).x; + coord_para.x++; + mean_vari.y += read_imagef(meanVari, coord_para.xy).x; + coord_para.x+=3; + } + mean_vari *= dim_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = gamma.s0 * mean_vari.s1; + bias_val = (beta.s0 - scale_vari * mean_vari.s0); + + float4 data, dst; + for(; coord.y < endH; coord.y++) + { + data = read_imagef(input, coord); + + dst.x = data.x * scale_vari + bias_val; + write_imagef(output, coord, dst); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f32.cl new file mode 100644 index 0000000..5946570 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f32.cl @@ -0,0 +1,229 @@ +__kernel void instance_norm_meanvari_F32( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, gidz, 0); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + data = read_imagef(input, coord); + coord.y++; + sum += data.x; + sqr += data.x * data.x; + } + } + lcl_sum[lidx] = sum; + 
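/* annotation: each work-item's partial sum and sum-of-squares are staged in local memory; work-item 0 reduces them after the barrier and writes the per-slice sum and sum-of-squares used later for mean/variance */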
lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void instance_norm_meanvari_F32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + int gidy = gidz * height; + + int2 coord = (int2)(gidx, gidy); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + height; + if(gidx < width) + { + for(; coord.y < endH;) + { + data = read_imagef(input, coord); + coord.y++; + sum += data.x; + sqr += data.x * data.x; + } + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void instance_norm_F32toF32( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int output_zp, + float output_scale, + float output_fl, + int width, + int height, + float dim_ratio, + int group_num + ) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord_para = (int4)(0, gidz, 0, 0); + + float4 gamma = read_imagef(scale, coord_para.yx); + float4 beta = read_imagef(bias, coord_para.yx); + float4 mean_vari = (float4)(0); + float scale_vari, bias_val; + + for(int i = 0; i < group_num; i++) + { + mean_vari.x += read_imagef(meanVari, coord_para.xy).x; + coord_para.x++; + mean_vari.y += read_imagef(meanVari, coord_para.xy).x; + coord_para.x+=3; + } + mean_vari *= dim_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = gamma.s0 * mean_vari.s1; + bias_val = (beta.s0 - scale_vari * mean_vari.s0); + + float4 data, dst; + for(coord.y = 0; coord.y < height;coord.y++) + { + data = read_imagef(input, coord); + + dst.x = data.x * scale_vari + bias_val; + write_imagef(output, coord, dst); + } +} + +__kernel void instance_norm_F32toF32_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int output_zp, + float output_scale, + float output_fl, + int width, + int height, + float 
dim_ratio, + int group_num + ) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int2 coord = (int2)(get_global_id(0), gidy); + int2 coord_para = (int2)(0, gidz); + int endH = gidy + height; + + float4 gamma = read_imagef(scale, coord_para.yx); + float4 beta = read_imagef(bias, coord_para.yx); + float4 mean_vari = (float4)(0); + float scale_vari, bias_val; + + for(int i = 0; i < group_num; i++) + { + mean_vari.x += read_imagef(meanVari, coord_para.xy).x; + coord_para.x++; + mean_vari.y += read_imagef(meanVari, coord_para.xy).x; + coord_para.x+=3; + } + mean_vari *= dim_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = gamma.s0 * mean_vari.s1; + bias_val = beta.s0 - scale_vari * mean_vari.s0; + + float4 data, dst; + for(; coord.y < endH; coord.y++) + { + data = read_imagef(input, coord); + + dst.x = data.x * scale_vari + bias_val; + write_imagef(output, coord, dst); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_i32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_i32.cl new file mode 100644 index 0000000..3928749 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_i32.cl @@ -0,0 +1,353 @@ +__kernel void instance_norm_meanvari_I32( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, gidz, 0); + int4 data; + float sum = 0, sqr = 0; + int tmpSum = 0; + float e2InScale = input_fl * input_fl; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + data = read_imagei(input, coord); + coord.y++; + tmpSum += data.x; + sqr += (data.x * data.x * e2InScale); + } + sum = tmpSum * input_fl; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void instance_norm_meanvari_I32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + int gidy = gidz * height; + + int2 coord = (int2)(gidx, gidy); + int4 data; + float sum = 0, sqr = 0; + int tmpSum = 0; + float e2InScale = input_fl * input_fl; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + height; + if(gidx < width) + { + for(; coord.y < endH;) + { + data = read_imagei(input, coord); + coord.y++; + tmpSum += data.x; + sqr += (data.x * data.x * e2InScale); + } + sum = tmpSum * input_fl; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one 
= (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void instance_norm_I32toI32( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int output_zp, + float output_scale, + float output_fl, + int width, + int height, + float dim_ratio, + int group_num + ) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord_para = (int4)(0, gidz, 0, 0); + + float4 gamma = read_imagef(scale, coord_para.yx); + float4 beta = read_imagef(bias, coord_para.yx); + float4 mean_vari = (float4)(0); + float scale_vari, bias_val; + + for(int i = 0; i < group_num; i++) + { + mean_vari.x += read_imagef(meanVari, coord_para.xy).x; + coord_para.x++; + mean_vari.y += read_imagef(meanVari, coord_para.xy).x; + coord_para.x+=3; + } + mean_vari *= dim_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = input_fl * output_fl * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_fl; + + int4 data, dst; + for(coord.y = 0; coord.y < height;coord.y++) + { + data = read_imagei(input, coord); + + float4 norm; + norm.x = data.x * alpha + bias_val; + dst = convert_int4_rte(norm); + write_imagei(output, coord, dst); + } +} + +__kernel void instance_norm_I32toI32_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int output_zp, + float output_scale, + float output_fl, + int width, + int height, + float dim_ratio, + int group_num + ) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int2 coord = (int2)(get_global_id(0), gidy); + int2 coord_para = (int2)(0, gidz); + int endH = gidy + height; + + float4 gamma = read_imagef(scale, coord_para.yx); + float4 beta = read_imagef(bias, coord_para.yx); + float4 mean_vari = (float4)(0); + float scale_vari, bias_val; + + for(int i = 0; i < group_num; i++) + { + mean_vari.x += read_imagef(meanVari, coord_para.xy).x; + coord_para.x++; + mean_vari.y += read_imagef(meanVari, coord_para.xy).x; + coord_para.x+=3; + } + mean_vari *= dim_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = input_fl * output_fl * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_fl; + + int4 data, dst; + for(; coord.y < endH; coord.y++) + { + data = read_imagei(input, coord); + + float4 norm; + norm.x = data.x * alpha + bias_val; + dst = convert_int4_rte(norm); + write_imagei(output, coord, dst); + } +} + +__kernel void instance_norm_I32toF32( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int 
rsFlg, + int input_zp, + float input_scale, + float input_fl, + int output_zp, + float output_scale, + float output_fl, + int width, + int height, + float dim_ratio, + int group_num + ) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord_para = (int4)(0, gidz, 0, 0); + + float4 gamma = read_imagef(scale, coord_para.yx); + float4 beta = read_imagef(bias, coord_para.yx); + float4 mean_vari = (float4)(0); + float scale_vari, bias_val; + + for(int i = 0; i < group_num; i++) + { + mean_vari.x += read_imagef(meanVari, coord_para.xy).x; + coord_para.x++; + mean_vari.y += read_imagef(meanVari, coord_para.xy).x; + coord_para.x+=3; + } + mean_vari *= dim_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = input_fl * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0); + + int4 data; + for(coord.y = 0; coord.y < height;coord.y++) + { + data = read_imagei(input, coord); + + float4 norm; + norm.x = data.x * alpha + bias_val; + write_imagef(output, coord, norm); + } +} + +__kernel void instance_norm_I32toF32_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int output_zp, + float output_scale, + float output_fl, + int width, + int height, + float dim_ratio, + int group_num + ) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int2 coord = (int2)(get_global_id(0), gidy); + int2 coord_para = (int2)(0, gidz); + int endH = gidy + height; + + float4 gamma = read_imagef(scale, coord_para.yx); + float4 beta = read_imagef(bias, coord_para.yx); + float4 mean_vari = (float4)(0); + float scale_vari, bias_val; + + for(int i = 0; i < group_num; i++) + { + mean_vari.x += read_imagef(meanVari, coord_para.xy).x; + coord_para.x++; + mean_vari.y += read_imagef(meanVari, coord_para.xy).x; + coord_para.x+=3; + } + mean_vari *= dim_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = input_fl * scale_vari; + bias_val = beta.s0 - scale_vari * mean_vari.s0; + + int4 data; + for(; coord.y < endH; coord.y++) + { + data = read_imagei(input, coord); + + float4 norm; + norm.x = data.x * alpha + bias_val; + write_imagef(output, coord, norm); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_u8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_u8.cl new file mode 100644 index 0000000..8b82717 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_u8.cl @@ -0,0 +1,363 @@ +__kernel void instance_norm_meanvari_U8( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, gidz, 0); + uint4 data; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0; + float e2InScale = input_scale * input_scale; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + data = read_imageui(input, coord); + coord.y++; + tmpSum += data.x; + tmpSqr += data.x * data.x; 
+ } + sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; + sum = (tmpSum - height * input_zp) * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void instance_norm_meanvari_U8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + int gidy = gidz * height; + + int2 coord = (int2)(gidx, gidy); + uint4 data; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0; + float e2InScale = input_scale * input_scale; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + height; + if(gidx < width) + { + for(; coord.y < endH;) + { + data = read_imageui(input, coord); + coord.y++; + tmpSum += data.x; + tmpSqr += data.x * data.x; + } + sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; + sum = (tmpSum - height * input_zp) * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void instance_norm_U8toU8( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int output_zp, + float output_scale, + float output_fl, + int width, + int height, + float dim_ratio, + int group_num + ) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord_para = (int4)(0, gidz, 0, 0); + + float4 gamma = read_imagef(scale, coord_para.yx); + float4 beta = read_imagef(bias, coord_para.yx); + float4 mean_vari = (float4)(0); + float scale_vari, bias_val; + float scale_inOut = input_scale * output_scale; + + for(int i = 0; i < group_num; i++) + { + mean_vari.x += read_imagef(meanVari, coord_para.xy).x; + coord_para.x++; + mean_vari.y += read_imagef(meanVari, coord_para.xy).x; + coord_para.x+=3; + } + mean_vari *= dim_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = scale_inOut * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; + + uint4 data, dst; + for(coord.y = 0; coord.y < height;coord.y++) + { + 
data = read_imageui(input, coord); + data.x -= input_zp; + + float4 norm; + norm.x = data.x * alpha + bias_val; + dst = convert_uint4_rte(norm); + write_imageui(output, coord, dst); + } +} + +__kernel void instance_norm_U8toU8_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int output_zp, + float output_scale, + float output_fl, + int width, + int height, + float dim_ratio, + int group_num + ) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int2 coord = (int2)(get_global_id(0), gidy); + int2 coord_para = (int2)(0, gidz); + int endH = gidy + height; + + float4 gamma = read_imagef(scale, coord_para.yx); + float4 beta = read_imagef(bias, coord_para.yx); + float4 mean_vari = (float4)(0); + float scale_vari, bias_val; + float scale_inOut = input_scale * output_scale; + + for(int i = 0; i < group_num; i++) + { + mean_vari.x += read_imagef(meanVari, coord_para.xy).x; + coord_para.x++; + mean_vari.y += read_imagef(meanVari, coord_para.xy).x; + coord_para.x+=3; + } + mean_vari *= dim_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = scale_inOut * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; + + uint4 data, dst; + for(; coord.y < endH; coord.y++) + { + data = read_imageui(input, coord); + data.x -= input_zp; + + float4 norm; + norm.x = data.x * alpha + bias_val; + dst = convert_uint4_rte(norm); + write_imageui(output, coord, dst); + } +} + +__kernel void instance_norm_U8toF16( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int output_zp, + float output_scale, + float output_fl, + int width, + int height, + float dim_ratio, + int group_num + ) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord_para = (int4)(0, gidz, 0, 0); + + float4 gamma = read_imagef(scale, coord_para.yx); + float4 beta = read_imagef(bias, coord_para.yx); + float4 mean_vari = (float4)(0); + float scale_vari, bias_val; + float scale_inOut = input_scale * output_scale; + + for(int i = 0; i < group_num; i++) + { + mean_vari.x += read_imagef(meanVari, coord_para.xy).x; + coord_para.x++; + mean_vari.y += read_imagef(meanVari, coord_para.xy).x; + coord_para.x+=3; + } + mean_vari *= dim_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = scale_inOut * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; + + uint4 data; + for(coord.y = 0; coord.y < height;coord.y++) + { + data = read_imageui(input, coord); + data.x -= input_zp; + + float4 norm; + norm.x = data.x * alpha + bias_val; + write_imagef(output, coord, norm); + } +} + +__kernel void instance_norm_U8toF16_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int input_zp, + float input_scale, + float input_fl, + int output_zp, + float output_scale, + float output_fl, + 
int width, + int height, + float dim_ratio, + int group_num + ) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int2 coord = (int2)(get_global_id(0), gidy); + int2 coord_para = (int2)(0, gidz); + int endH = gidy + height; + + float4 gamma = read_imagef(scale, coord_para.yx); + float4 beta = read_imagef(bias, coord_para.yx); + float4 mean_vari = (float4)(0); + float scale_vari, bias_val; + float scale_inOut = input_scale * output_scale; + + for(int i = 0; i < group_num; i++) + { + mean_vari.x += read_imagef(meanVari, coord_para.xy).x; + coord_para.x++; + mean_vari.y += read_imagef(meanVari, coord_para.xy).x; + coord_para.x+=3; + } + mean_vari *= dim_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = scale_inOut * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; + + uint4 data; + for(; coord.y < endH; coord.y++) + { + data = read_imageui(input, coord); + data.x -= input_zp; + + float4 norm; + norm.x = data.x * alpha + bias_val; + write_imagef(output, coord, norm); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis0.cl new file mode 100644 index 0000000..36794ea --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis0.cl @@ -0,0 +1,83 @@ + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_axis0_F32_F32toF32_2D( + __read_only image2d_t input, + __read_only image2d_t scale, + __write_only image2d_t output, + int axis, + int axis_size, + float rsEps + ) +{ + int lidx = get_local_id(0); + int gidx = get_global_id(0); + float4 src, scale_value, result; + float sum = 0.0f, pSum = 0.0f, rsqrt_sum = 0.0f; + int2 coord = (int2)(gidx, get_global_id(1)); + int2 coord_scale = (int2)(gidx, 0); + __local float lcl_sum[16]; + for(; coord.x < axis_size; coord.x += 16) + { + src = read_imagef(input, coord); + pSum += (src.x * src.x); + } + lcl_sum[lidx] = pSum; + barrier(CLK_LOCAL_MEM_FENCE); + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; + float4 one = (float4)(1, 1, 1, 1); + float4 data0; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sum = dot(data0, one); + rsqrt_sum = (sum == 0 ? 
rsEps : rsqrt(sum)); + for(coord.x = gidx; coord.x < axis_size; coord.x += 16) + { + src = read_imagef(input, coord); + scale_value = read_imagef(scale, coord_scale); + result = src * rsqrt_sum * scale_value; + write_imagef(output, coord, result.xxxx); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_axis0_U8_F32toU8_2D( + __read_only image2d_t input, + __read_only image2d_t scale, + __write_only image2d_t output, + int axis, + int axis_size, + float rsEps, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int lidx = get_local_id(0); + int gidx = get_global_id(0); + float4 src, scale_value, result; + float sum = 0.0f, pSum = 0.0f, rsqrt_sum = 0.0f; + int2 coord = (int2)(gidx, get_global_id(1)); + int2 coord_scale = (int2)(gidx, 0); + __local float lcl_sum[16]; + for(; coord.x < axis_size; coord.x += 16) + { + src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail; + pSum += (src.x * src.x); + } + lcl_sum[lidx] = pSum; + barrier(CLK_LOCAL_MEM_FENCE); + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; + float4 one = (float4)(1, 1, 1, 1); + float4 data0; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sum = dot(data0, one); + rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum)); + for(coord.x = gidx; coord.x < axis_size; coord.x += 16) + { + src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail; + scale_value = read_imagef(scale, coord_scale); + result = src * rsqrt_sum * scale_value; + uint4 dst = convert_uint4_rte(result * outputScale + outputZP); + write_imageui(output, coord, dst); + } +} + + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis1.cl new file mode 100644 index 0000000..39ad98a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis1.cl @@ -0,0 +1,81 @@ + +__kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_axis1_F32_F32toF32_2D( + __read_only image2d_t input, + __read_only image2d_t scale, + __write_only image2d_t output, + int axis, + int axis_size, + float rsEps + ) +{ + int lidx = get_local_id(1); + int gidy = get_global_id(1); + float4 src, scale_value, result; + float sum = 0.0f, pSum = 0.0f, rsqrt_sum = 0.0f; + int2 coord = (int2)(get_global_id(0), gidy ); + int2 coord_scale = (int2)(gidy, 0); + __local float lcl_sum[16]; + for(; coord.y < axis_size; coord.y += 16) + { + src = read_imagef(input, coord); + pSum += (src.x * src.x); + } + lcl_sum[lidx] = pSum; + barrier(CLK_LOCAL_MEM_FENCE); + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; + float4 one = (float4)(1, 1, 1, 1); + float4 data0; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sum = dot(data0, one); + rsqrt_sum = (sum == 0 ? 
rsEps : rsqrt(sum)); + for(coord.y = gidy; coord.y < axis_size; coord.y += 16) + { + src = read_imagef(input, coord); + scale_value = read_imagef(scale, coord_scale); + result = src * rsqrt_sum * scale_value; + write_imagef(output, coord, result.xxxx); + } +} + +__kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_axis1_U8_F32toU8_2D( + __read_only image2d_t input, + __read_only image2d_t scale, + __write_only image2d_t output, + int axis, + int axis_size, + float rsEps, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int lidx = get_local_id(1); + int gidy = get_global_id(1); + float4 src, scale_value, result; + float sum = 0.0f, pSum = 0.0f, rsqrt_sum = 0.0f; + int2 coord = (int2)(get_global_id(0), gidy ); + int2 coord_scale = (int2)(gidy, 0); + __local float lcl_sum[16]; + for(; coord.y < axis_size; coord.y += 16) + { + src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail; + pSum += (src.x * src.x); + } + lcl_sum[lidx] = pSum; + barrier(CLK_LOCAL_MEM_FENCE); + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; + float4 one = (float4)(1, 1, 1, 1); + float4 data0; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sum = dot(data0, one); + rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum)); + for(coord.y = gidy; coord.y < axis_size; coord.y += 16) + { + src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail; + scale_value = read_imagef(scale, coord_scale); + result = src * rsqrt_sum * scale_value; + uint4 dst = convert_uint4_rte(result * outputScale + outputZP); + write_imageui(output, coord, dst); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis0.cl new file mode 100644 index 0000000..0bb51ba --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis0.cl @@ -0,0 +1,220 @@ +#define rlogE (0.693147182f) +float LOG(float x) +{ + x = log2(x); + return x * rlogE; +} + +__kernel void log_softmax_axis0_F32toF32 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + float beta, + float scale, + float scaleOut, + float zpOut + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int width = get_image_width(input); + int4 coord_in = (int4)(0, y, z, 0); + float4 maxValue; + float4 src, dst = {0.0}; + + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + maxValue = read_imagef(input, coord_in); + for (coord_in.x = 1; coord_in.x < width; ) + { + src = read_imagef(input, coord_in); + coord_in.x++; + + maxValue = maxValue > src ? maxValue : src; + } + + // Compute sum. + float sum = 0.f; + for (coord_in.x = 0; coord_in.x < width; ) + { + src = read_imagef(input, coord_in); + coord_in.x++; + + sum += exp2((src.x - maxValue.x) * scale); + } + + // Compute result. 
+ float logSum = LOG(sum); + for (coord_in.x = 0; coord_in.x < width; ) + { + src = read_imagef(input, coord_in); + + dst.x = (src.x - maxValue.x) * beta - logSum; + write_imagef(output, coord_in, dst); + coord_in.x++; + } +} + +__kernel void log_softmax_axis0_F32toF32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + float beta, + float scale, + float scaleOut, + float zpOut + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int width = get_image_width(input); + int2 coord_in = (int2)(0, y); + float4 maxValue; + float4 src, dst = {0.0}; + + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + maxValue = read_imagef(input, coord_in); + for (coord_in.x = 1; coord_in.x < width; ) + { + src = read_imagef(input, coord_in); + coord_in.x++; + + maxValue = maxValue > src ? maxValue : src; + } + + // Compute sum. + float sum = 0.0f; + for (coord_in.x = 0; coord_in.x < width; ) + { + src = read_imagef(input, coord_in); + coord_in.x++; + + sum += exp2((src.x - maxValue.x) * scale); + } + + // Compute result. + float logSum = LOG(sum); + for (coord_in.x = 0; coord_in.x < width; ) + { + src = read_imagef(input, coord_in); + + dst.x = (src.x - maxValue.x) * beta - logSum; + write_imagef(output, coord_in, dst); + coord_in.x++; + } +} + +__kernel void log_softmax_axis0_U8toU8 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + float beta, + float scale, + float scaleOut, + float zpOut + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int width = get_image_width(input); + int4 coord_in = (int4)(0, y, z, 0); + float4 maxValue; + float4 src; + uint4 dst = {0}; + + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + maxValue = convert_float4(read_imageui(input, coord_in)); + for (coord_in.x = 1; coord_in.x < width; ) + { + src = convert_float4(read_imageui(input, coord_in)); + coord_in.x++; + + maxValue = maxValue > src ? maxValue : src; + } + + // Compute sum. + float sum = 0.f; + for (coord_in.x = 0; coord_in.x < width; ) + { + src = convert_float4(read_imageui(input, coord_in)); + coord_in.x++; + + sum += exp2((src.x - maxValue.x) * scale); + } + + // Compute result. + float logSum = LOG(sum); + for (coord_in.x = 0; coord_in.x < width; ) + { + src = convert_float4(read_imageui(input, coord_in)); + + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut); + + write_imageui(output, coord_in, dst); + coord_in.x++; + } +} + +__kernel void log_softmax_axis0_U8toU8_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + float beta, + float scale, + float scaleOut, + float zpOut + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int width = get_image_width(input); + int2 coord_in = (int2)(0, y); + float4 maxValue; + float4 src; + uint4 dst = {0}; + + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + maxValue = convert_float4(read_imageui(input, coord_in)); + for (coord_in.x = 1; coord_in.x < width; ) + { + src = convert_float4(read_imageui(input, coord_in)); + coord_in.x++; + + maxValue = maxValue > src ? 
maxValue : src; + } + + // Compute sum. + float sum = 0.f; + for (coord_in.x = 0; coord_in.x < width; ) + { + src = convert_float4(read_imageui(input, coord_in)); + coord_in.x++; + + sum += exp2((src.x - maxValue.x)*scale); + } + + // Compute result. + float logSum = LOG(sum); + for (coord_in.x = 0; coord_in.x < width; ) + { + src = convert_float4(read_imageui(input, coord_in)); + + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut); + write_imageui(output, coord_in, dst); + coord_in.x++; + } +} +#undef rlogE diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis1.cl new file mode 100644 index 0000000..e647014 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis1.cl @@ -0,0 +1,221 @@ +#define rlogE (0.693147182f) + +float LOG(float x) +{ + x = log2(x); + return x * rlogE; +} + +__kernel void log_softmax_axis1_F32toF32 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + float beta, + float scale, + float scaleOut, + float zpOut + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int height = get_image_height(input); + int4 coord_in = (int4)(x, 0, z, 0); + float4 maxValue; + float4 src, dst = {0.0}; + + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + maxValue = read_imagef(input, coord_in); + for (coord_in.y = 1; coord_in.y < height; ) + { + src = read_imagef(input, coord_in); + coord_in.y++; + + maxValue = maxValue > src ? maxValue : src; + } + + // Compute sum. + float sum = 0.f; + for (coord_in.y = 0; coord_in.y < height; ) + { + src = read_imagef(input, coord_in); + coord_in.y++; + + sum += exp2((src.x - maxValue.x) * scale); + } + + // Compute result. + float logSum = LOG(sum); + for (coord_in.y = 0; coord_in.y < height; ) + { + src = read_imagef(input, coord_in); + + dst.x = (src.x - maxValue.x) * beta - logSum; + write_imagef(output, coord_in, dst); + coord_in.y++; + } +} + +__kernel void log_softmax_axis1_F32toF32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + float beta, + float scale, + float scaleOut, + float zpOut + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int height = get_image_height(input); + int2 coord_in = (int2)(x, 0); + float4 maxValue; + float4 src, dst = {0.0}; + + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + maxValue = read_imagef(input, coord_in); + for (coord_in.y = 1; coord_in.y < height; ) + { + src = read_imagef(input, coord_in); + coord_in.y++; + + maxValue = maxValue > src ? maxValue : src; + } + + // Compute sum. + float sum = 0.0f; + for (coord_in.y = 0; coord_in.y < height; ) + { + src = read_imagef(input, coord_in); + coord_in.y++; + + sum += exp2((src.x - maxValue.x) * scale); + } + + // Compute result. 
+ float logSum = 1.0f * LOG(sum); + for (coord_in.y = 0; coord_in.y < height; ) + { + src = read_imagef(input, coord_in); + + dst.x = (src.x - maxValue.x) * beta - logSum; + write_imagef(output, coord_in, dst); + coord_in.y++; + } +} + +__kernel void log_softmax_axis1_U8toU8 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + float beta, + float scale, + float scaleOut, + float zpOut + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int height = get_image_height(input); + int4 coord_in = (int4)(x, 0, z, 0); + float4 maxValue; + float4 src; + uint4 dst = {0}; + + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + maxValue = convert_float4(read_imageui(input, coord_in)); + for (coord_in.y = 1; coord_in.y < height; ) + { + src = convert_float4(read_imageui(input, coord_in)); + coord_in.y++; + + maxValue = maxValue > src ? maxValue : src; + } + + // Compute sum. + float sum = 0.f; + for (coord_in.y = 0; coord_in.y < height; ) + { + src = convert_float4(read_imageui(input, coord_in)); + coord_in.y++; + + sum += exp2((src.x - maxValue.x) * scale); + } + + // Compute result. + float logSum = LOG(sum); + for (coord_in.y = 0; coord_in.y < height; ) + { + src = convert_float4(read_imageui(input, coord_in)); + + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut); + + write_imageui(output, coord_in, dst); + coord_in.y++; + } +} + +__kernel void log_softmax_axis1_U8toU8_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + float beta, + float scale, + float scaleOut, + float zpOut + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int height = get_image_height(input); + int2 coord_in = (int2)(x, 0); + float4 maxValue; + float4 src; + uint4 dst = {0}; + + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + maxValue = convert_float4(read_imageui(input, coord_in)); + for (coord_in.y = 1; coord_in.y < height; ) + { + src = convert_float4(read_imageui(input, coord_in)); + coord_in.y++; + + maxValue = maxValue > src ? maxValue : src; + } + + // Compute sum. + float sum = 0.f; + for (coord_in.y = 0; coord_in.y < height; ) + { + src = convert_float4(read_imageui(input, coord_in)); + coord_in.y++; + + sum += exp2((src.x - maxValue.x)*scale); + } + + // Compute result. 
+ float logSum = LOG(sum); + for (coord_in.y = 0; coord_in.y < height; ) + { + src = convert_float4(read_imageui(input, coord_in)); + + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut); + write_imageui(output, coord_in, dst); + coord_in.y++; + } +} +#undef rlogE diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis2.cl new file mode 100644 index 0000000..d45ff39 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis2.cl @@ -0,0 +1,115 @@ +#define rlogE (0.693147182f) +float LOG(float x) +{ + x = log2(x); + return x * rlogE; +} + +__kernel void log_softmax_axis2_F32toF32 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + float beta, + float scale, + float scaleOut, + float zpOut + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int depth = get_image_array_size(input); + int4 coord_in = (int4)(x, y, 0, 0); + float4 maxValue; + float4 src, dst = {0.0}; + + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + maxValue = read_imagef(input, coord_in); + for (coord_in.z = 1; coord_in.z < depth; ) + { + src = read_imagef(input, coord_in); + coord_in.z++; + + maxValue = maxValue > src ? maxValue : src; + } + + // Compute sum. + float sum = 0.f; + for (coord_in.z = 0; coord_in.z < depth; ) + { + src = read_imagef(input, coord_in); + coord_in.z++; + + sum += exp2((src.x - maxValue.x) * scale); + } + + // Compute result. + float logSum = LOG(sum); + for (coord_in.z = 0; coord_in.z < depth; ) + { + src = read_imagef(input, coord_in); + + dst.x = (src.x - maxValue.x) * beta - logSum; + write_imagef(output, coord_in, dst); + coord_in.z++; + } +} + +__kernel void log_softmax_axis2_U8toU8 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + float beta, + float scale, + float scaleOut, + float zpOut + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int depth = get_image_array_size(input); + int4 coord_in = (int4)(x, y, 0, 0); + float4 maxValue; + float4 src; + uint4 dst = {0}; + + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + maxValue = convert_float4(read_imageui(input, coord_in)); + for (coord_in.z = 1; coord_in.z < depth; ) + { + src = convert_float4(read_imageui(input, coord_in)); + coord_in.z++; + + maxValue = maxValue > src ? maxValue : src; + } + + // Compute sum. + float sum = 0.f; + for (coord_in.z = 0; coord_in.z < depth; ) + { + src = convert_float4(read_imageui(input, coord_in)); + coord_in.z++; + + sum += exp2((src.x - maxValue.x) * scale); + } + + // Compute result. 
+ float logSum = LOG(sum); + for (coord_in.z = 0; coord_in.z < depth; ) + { + src = convert_float4(read_imageui(input, coord_in)); + + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut); + + write_imageui(output, coord_in, dst); + coord_in.z++; + } +} +#undef rlogE diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/logical_not.cl b/src/tim/vx/internal/src/libnnext/ops/cl/logical_not.cl new file mode 100644 index 0000000..2aae4cc --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/logical_not.cl @@ -0,0 +1,21 @@ +__kernel void logical_not_I8toI8( + __read_only image2d_array_t input, + __write_only image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 src = read_imagei(input, coord); + int4 dst = !src; + dst.x = dst.x & 1; + write_imagei(output, coord, dst); +} + +__kernel void logical_not_I8toI8_2D( + __read_only image2d_t input, + __write_only image2d_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 src = read_imagei(input, coord); + int4 dst = !src; + dst.x = dst.x & 1; + write_imagei(output, coord, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/logical_ops.cl b/src/tim/vx/internal/src/libnnext/ops/cl/logical_ops.cl new file mode 100644 index 0000000..0b0ebf5 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/logical_ops.cl @@ -0,0 +1,38 @@ +#define TENSORLOGICAL(name, lgc_op, lgc_op2) \ +__kernel void logical_##name##_I8toI8( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 src0; \ + int4 src1; \ + readImage2DArray(src0, input, coord); \ + readImage2DArray(src1, input1, coord); \ + int4 dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \ + dst.x = dst.x & 1; \ + write_imagei(output, coord, dst); \ +} + +TENSORLOGICAL(or, ||, ) +TENSORLOGICAL(and, &&, ) +TENSORLOGICAL(xor, ^, !!) + + +#define TENSORLOGICAL_2D(name, lgc_op, lgc_op2) \ +__kernel void logical_##name##_I8toI8_2D( \ + __read_only image2d_t input, \ + __read_only image2d_t input1, \ + __write_only image2d_t output) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + int4 src0 = read_imagei(input, coord); \ + int4 src1 = read_imagei(input1, coord); \ + int4 dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \ + dst.x = dst.x & 1; \ + write_imagei(output, coord, dst); \ +} + +TENSORLOGICAL_2D(or, ||, ) +TENSORLOGICAL_2D(and, &&, ) +TENSORLOGICAL_2D(xor, ^, !!) 
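For reference, below is a minimal sketch of what the TENSORLOGICAL_2D(xor, ^, !!) instantiation above expands to after preprocessing; it is illustrative only and not an additional file in this patch. The `!!` prefix normalizes each integer operand to an element-wise mask (0 or -1, following OpenCL's vector logical operators), the middle operator combines the masks, and the trailing `& 1` folds the -1/0 result into the 1/0 encoding used by the I8 boolean tensors.

```c
// Hypothetical preprocessor expansion of TENSORLOGICAL_2D(xor, ^, !!), for illustration.
__kernel void logical_xor_I8toI8_2D(
    __read_only  image2d_t input,
    __read_only  image2d_t input1,
    __write_only image2d_t output)
{
    int2 coord = (int2)(get_global_id(0), get_global_id(1));
    int4 src0 = read_imagei(input,  coord);
    int4 src1 = read_imagei(input1, coord);
    // On integer vectors, !x is -1 (all bits set) where x == 0 and 0 elsewhere,
    // so !!x is -1 where x != 0; XOR-ing the two masks gives an element-wise logical XOR.
    int4 dst = (!!(src0)) ^ (!!(src1));
    dst.x = dst.x & 1;  // map the -1/0 mask to the 1/0 boolean encoding before write-out
    write_imagei(output, coord, dst);
}
```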
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_BP_F32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_BP_F32.cl new file mode 100644 index 0000000..299d31f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_BP_F32.cl @@ -0,0 +1,151 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_BP_F32(act_name, act_func) \ +__kernel void lstmunit_activation_BP_F32toF32_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + src0 = read_imagef(input_i_conv, coord_in.xy); \ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_i_t = data_i_t + b0; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_BP_F32(SIGMOID, sigmoid) +LSTM_ACTIVATION_BP_F32(HARD_SIGMOID, hard_sigmoid) + + +#define LSTM_ACTIVATION_BP_F32TOU8(act_name, act_func) \ +__kernel void lstmunit_activation_BP_F32toU8_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + src0 = read_imagef(input_i_conv, coord_in.xy); \ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_i_t = data_i_t + b0; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_BP_F32TOU8(SIGMOID, sigmoid) +LSTM_ACTIVATION_BP_F32TOU8(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_BP_U8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_BP_U8.cl new file mode 100644 index 0000000..5239731 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_BP_U8.cl @@ -0,0 +1,155 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_BP_U8(act_name, act_func) \ +__kernel void lstmunit_activation_BP_U8toU8_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + 
src12; \ + data_o_t = src3 + src13; \ + data_i_t = data_i_t + b0; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_BP_U8(SIGMOID, sigmoid) +LSTM_ACTIVATION_BP_U8(HARD_SIGMOID, hard_sigmoid) + +#define LSTM_ACTIVATION_BP_U8TOF32(act_name, act_func) \ +__kernel void lstmunit_activation_BP_U8toF32_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_i_t = data_i_t + b0; \ + data_f_t = data_f_t + b1; \ + 
data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_BP_U8TOF32(SIGMOID, sigmoid) +LSTM_ACTIVATION_BP_U8TOF32(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_B_F32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_B_F32.cl new file mode 100644 index 0000000..ce6e883 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_B_F32.cl @@ -0,0 +1,155 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_B_F32(act_name, act_func) \ +__kernel void lstmunit_activation_B_F32toF32_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + src0 = read_imagef(input_i_conv, coord_in.xy); \ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_i_t = data_i_t + b0; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = 
act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ + write_imagef(h_state_out, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_B_F32(SIGMOID, sigmoid) +LSTM_ACTIVATION_B_F32(HARD_SIGMOID, hard_sigmoid) + + +#define LSTM_ACTIVATION_B_F32TOU8(act_name, act_func) \ +__kernel void lstmunit_activation_B_F32toU8_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + src0 = read_imagef(input_i_conv, coord_in.xy); \ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_i_t = data_i_t + b0; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ + write_imageui(h_state_out, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_B_F32TOU8(SIGMOID, sigmoid) +LSTM_ACTIVATION_B_F32TOU8(HARD_SIGMOID, hard_sigmoid) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_B_U8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_B_U8.cl new file mode 100644 index 0000000..d683970 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_B_U8.cl @@ -0,0 +1,159 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_B_U8(act_name, act_func) \ +__kernel void lstmunit_activation_B_U8toU8_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = 
read_imagef(bias_o, coord_in.xw); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_i_t = data_i_t + b0; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ + write_imageui(h_state_out, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_B_U8(SIGMOID, sigmoid) +LSTM_ACTIVATION_B_U8(HARD_SIGMOID, hard_sigmoid) + +#define LSTM_ACTIVATION_B_U8TOF32(act_name, act_func) \ +__kernel void lstmunit_activation_B_U8toF32_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = 
read_imagef(bias_o, coord_in.xw); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_i_t = data_i_t + b0; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ + write_imagef(h_state_out, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_B_U8TOF32(SIGMOID, sigmoid) +LSTM_ACTIVATION_B_U8TOF32(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CBP_F32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CBP_F32.cl new file mode 100644 index 0000000..71783e1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CBP_F32.cl @@ -0,0 +1,134 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_CBP_F32(act_name, act_func) \ +__kernel void lstmunit_activation_CBP_F32toF32_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b1, b2, b3; \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? 
clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_CBP_F32(SIGMOID, sigmoid) +LSTM_ACTIVATION_CBP_F32(HARD_SIGMOID, hard_sigmoid) + +#define LSTM_ACTIVATION_CBP_F32TOU8(act_name, act_func) \ +__kernel void lstmunit_activation_CBP_F32toU8_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b1, b2, b3; \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_CBP_F32TOU8(SIGMOID, sigmoid) +LSTM_ACTIVATION_CBP_F32TOU8(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CBP_U8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CBP_U8.cl new file mode 100644 index 0000000..18df414 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CBP_U8.cl @@ -0,0 +1,140 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_CBP_U8(act_name, act_func) \ +__kernel void lstmunit_activation_CBP_U8toU8_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = 
data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_CBP_U8(SIGMOID, sigmoid) +LSTM_ACTIVATION_CBP_U8(HARD_SIGMOID, hard_sigmoid) + + +#define LSTM_ACTIVATION_CBP_U8TOF32(act_name, act_func) \ +__kernel void lstmunit_activation_CBP_U8toF32_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_CBP_U8TOF32(SIGMOID, sigmoid) +LSTM_ACTIVATION_CBP_U8TOF32(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CB_F32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CB_F32.cl new file mode 100644 index 0000000..3634726 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CB_F32.cl @@ -0,0 +1,139 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_CB_F32(act_name, act_func) \ +__kernel void lstmunit_activation_CB_F32toF32_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ + write_imagef(h_state_out, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_CB_F32(SIGMOID, sigmoid) +LSTM_ACTIVATION_CB_F32(HARD_SIGMOID, hard_sigmoid) + + +#define LSTM_ACTIVATION_CB_F32TOU8(act_name, act_func) \ +__kernel void lstmunit_activation_CB_F32toU8_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ + write_imageui(h_state_out, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_CB_F32TOU8(SIGMOID, sigmoid) +LSTM_ACTIVATION_CB_F32TOU8(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CB_U8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CB_U8.cl new file mode 100644 index 0000000..a73cfb4 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CB_U8.cl @@ -0,0 +1,144 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_CB_U8(act_name, act_func) \ +__kernel void lstmunit_activation_CB_U8toU8_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = 
data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ + write_imageui(h_state_out, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_CB_U8(SIGMOID, sigmoid) +LSTM_ACTIVATION_CB_U8(HARD_SIGMOID, hard_sigmoid) + + +#define LSTM_ACTIVATION_CB_U8TOF32(act_name, act_func) \ +__kernel void lstmunit_activation_CB_U8toF32_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ + write_imagef(h_state_out, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_CB_U8TOF32(SIGMOID, sigmoid) +LSTM_ACTIVATION_CB_U8TOF32(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CLP_F32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CLP_F32.cl new file mode 100644 index 0000000..ab54bc5 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CLP_F32.cl @@ -0,0 +1,128 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_CLP_F32(act_name, act_func) \ +__kernel void lstmunit_activation_CLP_F32toF32_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b1, b2, b3; \ + float4 w1, w2, w3; \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 * w1 + b1; \ + data_g_t = src2 * w2 + b2; \ + data_o_t = src3 * w3 + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_CLP_F32(SIGMOID, sigmoid) +LSTM_ACTIVATION_CLP_F32(HARD_SIGMOID, hard_sigmoid) + +#define LSTM_ACTIVATION_CLP_F32TOU8(act_name, act_func) \ +__kernel void lstmunit_activation_CLP_F32toU8_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b1, b2, b3; \ + float4 w1, w2, w3; \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 * w1 + b1; \ + data_g_t = src2 * w2 + b2; \ + data_o_t = src3 * w3 + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_CLP_F32TOU8(SIGMOID, sigmoid) +LSTM_ACTIVATION_CLP_F32TOU8(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CLP_U8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CLP_U8.cl new file mode 100644 index 0000000..7612dc6 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CLP_U8.cl @@ -0,0 +1,134 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_CLP_U8(act_name, act_func) \ +__kernel void lstmunit_activation_CLP_U8toU8_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b1, b2, b3; \ + float4 w1, w2, w3; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 * w1 + b1; \ + data_g_t = src2 * w2 + b2; \ + data_o_t = src3 * w3 + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_CLP_U8(SIGMOID, sigmoid) +LSTM_ACTIVATION_CLP_U8(HARD_SIGMOID, hard_sigmoid) + + +#define LSTM_ACTIVATION_CLP_U8TOF32(act_name, act_func) \ +__kernel void lstmunit_activation_CLP_U8toF32_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b1, b2, b3; \ + float4 w1, w2, w3; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 * w1 + b1; \ + data_g_t = src2 * w2 + b2; \ + data_o_t = src3 * w3 + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_CLP_U8TOF32(SIGMOID, sigmoid) +LSTM_ACTIVATION_CLP_U8TOF32(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CL_F32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CL_F32.cl new file mode 100644 index 0000000..43d307d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CL_F32.cl @@ -0,0 +1,134 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_CL_F32(act_name, act_func) \ +__kernel void lstmunit_activation_CL_F32toF32_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b1, b2, b3; \ + float4 w1, w2, w3; \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 * w1 + b1; \ + data_g_t = src2 * w2 + b2; \ + data_o_t = src3 * w3 + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ + write_imagef(h_state_out, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_CL_F32(SIGMOID, sigmoid) +LSTM_ACTIVATION_CL_F32(HARD_SIGMOID, hard_sigmoid) + + + +#define LSTM_ACTIVATION_CL_F32TOU8(act_name, act_func) \ +__kernel void lstmunit_activation_CL_F32toU8_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b1, b2, b3; \ + float4 w1, w2, w3; \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 * w1 + b1; \ + data_g_t = src2 * w2 + b2; \ + data_o_t = src3 * w3 + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ + write_imageui(h_state_out, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_CL_F32TOU8(SIGMOID, sigmoid) +LSTM_ACTIVATION_CL_F32TOU8(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CL_U8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CL_U8.cl new file mode 100644 index 0000000..1924947 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CL_U8.cl @@ -0,0 +1,138 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_CL_U8(act_name, act_func) \ +__kernel void lstmunit_activation_CL_U8toU8_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b1, b2, b3; \ + float4 w1, w2, w3; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 * w1 + b1; \ + data_g_t = src2 * w2 + b2; \ + data_o_t = src3 * w3 + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ + write_imageui(h_state_out, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_CL_U8(SIGMOID, sigmoid) +LSTM_ACTIVATION_CL_U8(HARD_SIGMOID, hard_sigmoid) + + +#define LSTM_ACTIVATION_CL_U8TOF32(act_name, act_func) \ +__kernel void lstmunit_activation_CL_U8toF32_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b1, b2, b3; \ + float4 w1, w2, w3; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_f_t = src1 * w1 + b1; \ + data_g_t = src2 * w2 + b2; \ + data_o_t = src3 * w3 + b3; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ + write_imagef(h_state_out, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_CL_U8TOF32(SIGMOID, sigmoid) +LSTM_ACTIVATION_CL_U8TOF32(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CSP_F32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CSP_F32.cl new file mode 100644 index 0000000..122f552 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CSP_F32.cl @@ -0,0 +1,114 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_CSP_F32(act_name, act_func) \ +__kernel void lstmunit_activation_CSP_F32toF32_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_CSP_F32(SIGMOID, sigmoid) +LSTM_ACTIVATION_CSP_F32(HARD_SIGMOID, hard_sigmoid) + +#define LSTM_ACTIVATION_CSP_F32TOU8(act_name, act_func) \ +__kernel void lstmunit_activation_CSP_F32toU8_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_CSP_F32TOU8(SIGMOID, sigmoid) +LSTM_ACTIVATION_CSP_F32TOU8(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CSP_U8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CSP_U8.cl new file mode 100644 index 0000000..5de5922 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CSP_U8.cl @@ -0,0 +1,120 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_CSP_U8(act_name, act_func) \ +__kernel void lstmunit_activation_CSP_U8toU8_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_CSP_U8(SIGMOID, sigmoid) +LSTM_ACTIVATION_CSP_U8(HARD_SIGMOID, hard_sigmoid) + + +#define LSTM_ACTIVATION_CSP_U8TOF32(act_name, act_func) \ +__kernel void lstmunit_activation_CSP_U8toF32_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_CSP_U8TOF32(SIGMOID, sigmoid) +LSTM_ACTIVATION_CSP_U8TOF32(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CS_F32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CS_F32.cl new file mode 100644 index 0000000..51a3e2e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CS_F32.cl @@ -0,0 +1,119 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_CS_F32(act_name, act_func) \ +__kernel void lstmunit_activation_CS_F32toF32_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ + write_imagef(h_state_out, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_CS_F32(SIGMOID, sigmoid) +LSTM_ACTIVATION_CS_F32(HARD_SIGMOID, hard_sigmoid) + +#define LSTM_ACTIVATION_CS_F32TOU8(act_name, act_func) \ +__kernel void lstmunit_activation_CS_F32toU8_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ + write_imageui(h_state_out, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_CS_F32TOU8(SIGMOID, sigmoid) +LSTM_ACTIVATION_CS_F32TOU8(HARD_SIGMOID, hard_sigmoid) + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CS_U8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CS_U8.cl new file mode 100644 index 0000000..db3001f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_CS_U8.cl @@ -0,0 +1,125 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_CS_U8(act_name, act_func) \ +__kernel void lstmunit_activation_CS_U8toU8_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ + write_imageui(h_state_out, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_CS_U8(SIGMOID, sigmoid) +LSTM_ACTIVATION_CS_U8(HARD_SIGMOID, hard_sigmoid) + + +#define LSTM_ACTIVATION_CS_U8TOF32(act_name, act_func) \ +__kernel void lstmunit_activation_CS_U8toF32_F32_##act_name( \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src1, src2, src3; \ + float4 src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = 1.0f - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ + write_imagef(h_state_out, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_CS_U8TOF32(SIGMOID, sigmoid) +LSTM_ACTIVATION_CS_U8TOF32(HARD_SIGMOID, hard_sigmoid) + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_LP_F32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_LP_F32.cl new file mode 100644 index 0000000..db19b35 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_LP_F32.cl @@ -0,0 +1,78 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_LP_F32(act_name, act_func) \ +__kernel void lstmunit_activation_LP_F32toF32_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wi, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + float4 w0, w1, w2, w3; \ + src0 = read_imagef(input_i_conv, coord_in.xy); \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_i_t = src0 * w0 + b0; \ + data_f_t = src1 * w1 + b1; \ + data_g_t = src2 * w2 + b2; \ + data_o_t = src3 * w3 + b3; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_LP_F32(SIGMOID, sigmoid) +LSTM_ACTIVATION_LP_F32(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_L_F32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_L_F32.cl new file mode 100644 index 0000000..c285092 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_L_F32.cl @@ -0,0 +1,80 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_L_F32(act_name, act_func) \ +__kernel void lstmunit_activation_L_F32toF32_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wi, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + float4 w0, w1, w2, w3; \ + src0 = read_imagef(input_i_conv, coord_in.xy); \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_i_t = src0 * w0 + b0; \ + data_f_t = src1 * w1 + b1; \ + data_g_t = src2 * w2 + b2; \ + data_o_t = src3 * w3 + b3; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ + write_imagef(h_state_out, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_L_F32(SIGMOID, sigmoid) +LSTM_ACTIVATION_L_F32(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_SP_F32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_SP_F32.cl new file mode 100644 index 0000000..dd52e0d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_SP_F32.cl @@ -0,0 +1,125 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_SP_F32(act_name, act_func) \ +__kernel void lstmunit_activation_SP_F32toF32_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src0 = read_imagef(input_i_conv, coord_in.xy); \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_SP_F32(SIGMOID, sigmoid) +LSTM_ACTIVATION_SP_F32(HARD_SIGMOID, hard_sigmoid) + + +#define LSTM_ACTIVATION_SP_F32TOU8(act_name, act_func) \ +__kernel void lstmunit_activation_SP_F32toU8_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src0 = read_imagef(input_i_conv, coord_in.xy); \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_SP_F32TOU8(SIGMOID, sigmoid) +LSTM_ACTIVATION_SP_F32TOU8(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_SP_U8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_SP_U8.cl new file mode 100644 index 0000000..95e1c8d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_SP_U8.cl @@ -0,0 +1,130 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_SP_U8(act_name, act_func) \ +__kernel void lstmunit_activation_SP_U8toU8_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? 
clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_SP_U8(SIGMOID, sigmoid) +LSTM_ACTIVATION_SP_U8(HARD_SIGMOID, hard_sigmoid) + + +#define LSTM_ACTIVATION_SP_U8TOF32(act_name, act_func) \ +__kernel void lstmunit_activation_SP_U8toF32_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_SP_U8TOF32(SIGMOID, sigmoid) +LSTM_ACTIVATION_SP_U8TOF32(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_S_F32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_S_F32.cl new file mode 100644 index 0000000..bc85030 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_S_F32.cl @@ -0,0 +1,129 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_S_F32(act_name, act_func) \ +__kernel void lstmunit_activation_S_F32toF32_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src0 = read_imagef(input_i_conv, coord_in.xy); \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ + write_imagef(h_state_out, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_S_F32(SIGMOID, sigmoid) +LSTM_ACTIVATION_S_F32(HARD_SIGMOID, hard_sigmoid) + + +#define LSTM_ACTIVATION_S_F32TOU8(act_name, act_func) \ +__kernel void lstmunit_activation_S_F32toU8_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src0 = read_imagef(input_i_conv, coord_in.xy); \ + src1 = read_imagef(input_f_conv, coord_in.xy); \ + src2 = read_imagef(input_c_conv, coord_in.xy); \ + src3 = read_imagef(input_o_conv, coord_in.xy); \ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ + write_imageui(h_state_out, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_S_F32TOU8(SIGMOID, sigmoid) +LSTM_ACTIVATION_S_F32TOU8(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_S_U8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_S_U8.cl new file mode 100644 index 0000000..06ec279 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lstmunit_activation_S_U8.cl @@ -0,0 +1,134 @@ +float4 sigmoid(float4 x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x, float logE) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x, float twoLogE) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define LSTM_ACTIVATION_S_U8(act_name, act_func) \ +__kernel void lstmunit_activation_S_U8toU8_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + 
data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \ + write_imageui(output, coord_in.zy, data_o_u); \ + write_imageui(h_state_out, coord_in.zy, data_o_u); \ +} + +LSTM_ACTIVATION_S_U8(SIGMOID, sigmoid) +LSTM_ACTIVATION_S_U8(HARD_SIGMOID, hard_sigmoid) + + +#define LSTM_ACTIVATION_S_U8TOF32(act_name, act_func) \ +__kernel void lstmunit_activation_S_U8toF32_F32_##act_name( \ + __read_only image2d_t input_i_conv, \ + __read_only image2d_t input_f_conv, \ + __read_only image2d_t input_c_conv, \ + __read_only image2d_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t hstate_i_conv, \ + __read_only image2d_t hstate_f_conv, \ + __read_only image2d_t hstate_c_conv, \ + __read_only image2d_t hstate_o_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \ + float out_scale, float out_zp) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + float4 src0, src1, src2, src3; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + data_i_t = act_func(data_i_t, logE); \ + data_f_t = act_func(data_f_t + forget_bias, logE); \ + data_g_t = tangentH(data_g_t, twoLogE); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t, logE); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t, twoLogE); \ + data_o_t = data_o_t * data_c_t; \ + write_imagef(output, coord_in.zy, data_o_t); \ + write_imagef(h_state_out, coord_in.zy, data_o_t); \ +} + +LSTM_ACTIVATION_S_U8TOF32(SIGMOID, sigmoid) +LSTM_ACTIVATION_S_U8TOF32(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl new file mode 100644 index 0000000..ec757ca --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl @@ -0,0 +1,69 @@ +__kernel void gemm_F32F32toF32_2D( + __read_only image2d_t inputA, + __read_only image2d_t inputB, + __write_only image2d_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + int2 coord_a = (int2)(0, gidy); + int2 coord_b = (int2)(gidx, 0); + + float4 sum = (float4)(0); + + for(; coord_a.x < K;) + { + float4 tempA0; + float4 tempB0; + + tempA0 = read_imagef(inputA, coord_a); + tempB0 = read_imagef(inputB, coord_b); + coord_a.x++; + coord_b.y++; + + sum += tempA0 * tempB0; + } + + coord_b.y = gidy; + write_imagef(output, coord_b, sum); +} + +__kernel void gemm_F32F32toF32_3D( + __read_only image2d_array_t inputA, + __read_only image2d_array_t inputB, + __write_only image2d_array_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero + ) +{ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); + + float4 sum = (float4)(0); + + for(; coord_a.x < K;) + { + float4 tempA0; + float4 tempB0; + + tempA0 = read_imagef(inputA, coord_a); + tempB0 = read_imagef(inputB, coord_b); + coord_a.x++; + coord_b.y++; + + sum += tempA0 * tempB0; + } + + coord_b.y = get_global_id(1); + coord_b.z = get_global_id(2); + write_imagef(output, coord_b, sum); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_transA.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_transA.cl new file mode 100644 index 0000000..7c290d4 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_transA.cl @@ -0,0 +1,72 @@ +__kernel void gemm_transa_F32F32toF32_2D( + __read_only image2d_t inputA, + __read_only image2d_t inputB, + __write_only image2d_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + int2 coord_a = (int2)(gidy, 0); + int2 coord_b = (int2)(gidx, 0); + + float4 sum = (float4)(0); + + for(; coord_a.y < K;) + { + float4 tempA0; + float4 tempB0; + + tempA0 = read_imagef(inputA, coord_a); + tempB0 = read_imagef(inputB, coord_b); + coord_a.y++; + coord_b.y++; + + sum += tempA0 * tempB0; + } + + coord_b.y = gidy; + write_imagef(output, coord_b, sum); +} + +__kernel void gemm_transa_F32F32toF32_3D( + __read_only image2d_array_t inputA, + __read_only image2d_array_t inputB, + __write_only image2d_array_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(gidx, 0, (bc2zero ? 
0 : get_global_id(2)), 0); + + float4 sum = (float4)(0); + + for(; coord_a.y < K;) + { + float4 tempA0; + float4 tempB0; + + tempA0 = read_imagef(inputA, coord_a); + tempB0 = read_imagef(inputB, coord_b); + coord_a.y++; + coord_b.y++; + + sum += tempA0 * tempB0; + } + + coord_b.y = gidy; + coord_b.z = get_global_id(2); + write_imagef(output, coord_b, sum); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl new file mode 100644 index 0000000..eb59e6d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl @@ -0,0 +1,151 @@ +__kernel void maximum_FP32FP32toFP32 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + float4 src0; + float4 src1; + readImage2DArray(src0, input0, coord); + readImage2DArray(src1, input1, coord); + + float4 dst = src0 > src1 ? src0 : src1; + + write_imagef(output, coord, dst); +} + +__kernel void maximum_FP32FP32toFP32_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + float4 src0 = read_imagef(input0, coord); + float4 src1 = read_imagef(input1, coord); + + float4 dst = src0 > src1 ? src0 : src1; + + write_imagef(output, coord, dst); +} + +__kernel void maximum_U8U8toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + uint4 src0; + uint4 src1; + readImage2DArray(src0, input0, coord); + readImage2DArray(src1, input1, coord); + + float4 data0 = convert_float4(src0) * input0Scale - input0Tail; + float4 data1 = convert_float4(src1) * input1Scale - input1Tail; + float4 data = data0 > data1 ? data0 : data1; + uint4 dst = convert_uint4(data * outputScale + outputZP); + + write_imageui(output, coord, dst); +} + +__kernel void maximum_U8U8toU8_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + uint4 src0 = read_imageui(input0, coord); + uint4 src1 = read_imageui(input1, coord); + + float4 data0 = convert_float4(src0) * input0Scale - input0Tail; + float4 data1 = convert_float4(src1) * input1Scale - input1Tail; + float4 data = data0 > data1 ? 
data0 : data1; + uint4 dst = convert_uint4(data * outputScale + outputZP); + + write_imageui(output, coord, dst); +} + + +__kernel void maximum_I32I32toI32 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + int4 src0; + int4 src1; + readImage2DArray(src0, input0, coord); + readImage2DArray(src1, input1, coord); + + int4 dst = src0 > src1 ? src0 : src1; + + write_imagei(output, coord, dst); +} + +__kernel void maximum_I32I32toI32_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + int4 src0 = read_imagei(input0, coord); + int4 src1 = read_imagei(input1, coord); + + int4 dst = src0 > src1 ? src0 : src1; + + write_imagei(output, coord, dst); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl new file mode 100644 index 0000000..d04431a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl @@ -0,0 +1,151 @@ +__kernel void minimum_FP32FP32toFP32 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + float4 src0; + float4 src1; + readImage2DArray(src0, input0, coord); + readImage2DArray(src1, input1, coord); + + float4 dst = src0 < src1 ? src0 : src1; + + write_imagef(output, coord, dst); +} + +__kernel void minimum_FP32FP32toFP32_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + float4 src0 = read_imagef(input0, coord); + float4 src1 = read_imagef(input1, coord); + + float4 dst = src0 < src1 ? src0 : src1; + + write_imagef(output, coord, dst); +} + +__kernel void minimum_U8U8toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + uint4 src0; + uint4 src1; + readImage2DArray(src0, input0, coord); + readImage2DArray(src1, input1, coord); + + float4 data0 = convert_float4(src0) * input0Scale - input0Tail; + float4 data1 = convert_float4(src1) * input1Scale - input1Tail; + float4 data = data0 < data1 ? 
data0 : data1; + uint4 dst = convert_uint4(data * outputScale + outputZP); + + write_imageui(output, coord, dst); +} + +__kernel void minimum_U8U8toU8_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + uint4 src0 = read_imageui(input0, coord); + uint4 src1 = read_imageui(input1, coord); + + float4 data0 = convert_float4(src0) * input0Scale - input0Tail; + float4 data1 = convert_float4(src1) * input1Scale - input1Tail; + float4 data = data0 < data1 ? data0 : data1; + uint4 dst = convert_uint4(data * outputScale + outputZP); + + write_imageui(output, coord, dst); +} + + +__kernel void minimum_I32I32toI32 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + int4 src0; + int4 src1; + readImage2DArray(src0, input0, coord); + readImage2DArray(src1, input1, coord); + + int4 dst = src0 < src1 ? src0 : src1; + + write_imagei(output, coord, dst); +} + +__kernel void minimum_I32I32toI32_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + int4 src0 = read_imagei(input0, coord); + int4 src1 = read_imagei(input1, coord); + + int4 dst = src0 < src1 ? 
src0 : src1; + + write_imagei(output, coord, dst); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl new file mode 100644 index 0000000..6a71a4f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl @@ -0,0 +1,118 @@ +__kernel void moments_axis0_U8toF16( + __read_only image2d_array_t input, + __write_only image2d_t output_mean, + __write_only image2d_t output_vari, + int axis, + int axis_num, + int input_zp, + float input_scale, + int width, + int height, + int chn, + float dimRatio + ) +{ + int gidy = get_global_id(0); + int gidz = get_global_id(1); + + int4 coord0 = (int4)(0, gidy, gidz, 0); + uint data; + float sum = 0, sqr = 0; + uint tmpSum = 0, tmpSqr = 0; + float e2InScale = input_scale * input_scale; + + { + for(coord0.x = 0; coord0.x < width;) + { + data = read_imageui(input, coord0).x; + coord0.x++; + tmpSum += (data); + tmpSqr += (data * data); + } + sqr = convert_float(tmpSqr - 2 * input_zp * tmpSum + width * input_zp * input_zp) * e2InScale; + sum = convert_float(tmpSum - width * input_zp) * input_scale; + } + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + int2 coord_out = (int2)(gidy, gidz); + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); +} + +#define MOMENTS_AXIS0_F(src0_type_name) \ +__kernel void moments_axis0_##src0_type_name##to##src0_type_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_t output_mean, \ + __write_only image2d_t output_vari, \ + int axis, int axis_num, int input_zp, float input_scale, \ + int width, int height, int chn, float dimRatio \ + ) \ +{ \ + int gidy = get_global_id(0); \ + int gidz = get_global_id(1); \ + \ + int4 coord0 = (int4)(0, gidy, gidz, 0); \ + float data; \ + float sum = 0, sqr = 0; \ + \ + for(coord0.x = 0; coord0.x < width;) \ + { \ + data = read_imagef(input, coord0).x; \ + coord0.x++; \ + sum += (data); \ + sqr += (data * data); \ + } \ + \ + float4 mean, vari; \ + mean.x = sum * dimRatio; \ + vari.x = sqr * dimRatio; \ + vari.x = vari.x - mean.x * mean.x; \ + \ + int2 coord_out = (int2)(gidy, gidz); \ + write_imagef(output_mean, coord_out, mean); \ + write_imagef(output_vari, coord_out, vari); \ +} +MOMENTS_AXIS0_F(F16) +MOMENTS_AXIS0_F(F32) + +__kernel void moments_axis0_I32toF32( + __read_only image2d_array_t input, + __write_only image2d_t output_mean, + __write_only image2d_t output_vari, + int axis, + int axis_num, + int input_zp, + float input_scale, + int width, + int height, + int chn, + float dimRatio + ) +{ + int gidy = get_global_id(0); + int gidz = get_global_id(1); + + int4 coord0 = (int4)(0, gidy, gidz, 0); + int data; + int sum = 0, sqr = 0; + + for(coord0.x = 0; coord0.x < width;) + { + data = read_imagei(input, coord0).x; + coord0.x++; + sum += (data); + sqr += (data * data); + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + int2 coord_out = (int2)(gidy, gidz); + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl new file mode 100644 index 0000000..f7c64cf --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl @@ -0,0 +1,174 @@ +__kernel void moments_axis01_U8toF16( + image2d_array_t 
input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num, int input_zp, float input_scale, + int width, int height, int chn, float dimRatio + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, gidz, 0); + uint4 data; + float sum = 0, sqr = 0; + float e2InScale = input_scale * input_scale; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(coord.x = gidx; coord.x < width; coord.x += 16) + { + int tmpSum = 0, tmpSqr = 0; + for(coord.y = 0; coord.y < height;) + { + data = read_imageui(input, coord); + coord.y++; + tmpSum += data.x; + tmpSqr += data.x * data.x; + } + sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; + sum += (tmpSum - height * input_zp) * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(gidz, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); + } +} + +#define MOMENTS_AXIS01_F(src0_type_name) \ +__kernel void moments_axis01_##src0_type_name##to##src0_type_name( \ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \ + int axis, int axis_num, int input_zp, float input_scale, \ + int width, int height, int chn, float dimRatio \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidz = get_global_id(1); \ + int lidx = get_local_id(0); \ + \ + int4 coord = (int4)(gidx, 0, gidz, 0); \ + float4 data; \ + float sum = 0, sqr = 0; \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + \ + for(coord.x = gidx; coord.x < width; coord.x += 16) \ + { \ + for(coord.y = 0; coord.y < height;) \ + { \ + data = read_imagef(input, coord); \ + coord.y++; \ + sum += data.x; \ + sqr += data.x * data.x; \ + } \ + } \ + lcl_sum[lidx] = sum; \ + lcl_sqr[lidx] = sqr; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int2 coord_out = (int2)(gidz, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + \ + sum = 0; sqr = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + \ + float4 mean, vari; \ + mean.x = sum * dimRatio; \ + vari.x = sqr * dimRatio; \ + vari.x = vari.x - mean.x * mean.x; \ + \ + write_imagef(output_mean, coord_out, mean); \ + write_imagef(output_vari, coord_out, vari); \ + } \ +} +MOMENTS_AXIS01_F(F16) +MOMENTS_AXIS01_F(F32) + +__kernel void moments_axis01_I32toF32( + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num, int input_zp, float input_scale, + int width, int height, int chn, float dimRatio + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, gidz, 0); + int4 data; + float sum = 0, sqr = 0; + float e2InScale = input_scale * input_scale; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(coord.x = gidx; coord.x < width; coord.x += 16) + { + int tmpSum = 0, tmpSqr = 0; + 
for(coord.y = 0; coord.y < height;) + { + data = read_imagei(input, coord); + coord.y++; + tmpSum += data.x; + tmpSqr += data.x * data.x; + } + sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; + sum += (tmpSum - height * input_zp) * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(gidz, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl new file mode 100644 index 0000000..28a4fc3 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl @@ -0,0 +1,180 @@ +__kernel void moments_axis012_U8toF16( + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num, int input_zp, float input_scale, + int width, int height, int chn, float dimRatio + ) +{ + int gidx = get_global_id(0); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, 0, 0); + uint4 data; + float sum = 0, sqr = 0; + float e2InScale = input_scale * input_scale; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(coord.z = 0; coord.z < chn; coord.z++) + { + for(coord.x = gidx; coord.x < width; coord.x += 16) + { + int tmpSum = 0, tmpSqr = 0; + for(coord.y = 0; coord.y < height;) + { + data = read_imageui(input, coord); + coord.y++; + tmpSum += data.x; + tmpSqr += data.x * data.x; + } + sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; + sum += (tmpSum - height * input_zp) * input_scale; + } + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); + } +} + +#define MOMENTS_AXIS012_F(src0_type_name) \ +__kernel void moments_axis012_##src0_type_name##to##src0_type_name( \ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \ + int axis, int axis_num, int input_zp, float input_scale, \ + int width, int height, int chn, float dimRatio \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int lidx = get_local_id(0); \ + \ + int4 coord = (int4)(gidx, 0, 0, 0); \ + float4 data; \ + float sum = 0, sqr = 0; \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + \ + for(coord.z = 0; coord.z < chn; coord.z++) \ + { \ + for(coord.x = gidx; coord.x < width; coord.x += 16) \ + { \ + for(coord.y = 0; coord.y < height;) \ + { \ + data = read_imagef(input, coord); \ + coord.y++; \ + sum += data.x; \ + sqr += data.x * data.x; \ + } \ + } \ + } \ + lcl_sum[lidx] = sum; \ + lcl_sqr[lidx] = 
sqr; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int2 coord_out = (int2)(0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + \ + sum = 0; sqr = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + \ + float4 mean, vari; \ + mean.x = sum * dimRatio; \ + vari.x = sqr * dimRatio; \ + vari.x = vari.x - mean.x * mean.x; \ + \ + write_imagef(output_mean, coord_out, mean); \ + write_imagef(output_vari, coord_out, vari); \ + } \ +} +MOMENTS_AXIS012_F(F16) +MOMENTS_AXIS012_F(F32) + +__kernel void moments_axis012_I32toF32( + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num, int input_zp, float input_scale, + int width, int height, int chn, float dimRatio + ) +{ + int gidx = get_global_id(0); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, 0, 0); + int4 data; + float sum = 0, sqr = 0; + float e2InScale = input_scale * input_scale; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(coord.z = 0; coord.z < chn; coord.z++) + { + for(coord.x = gidx; coord.x < width; coord.x += 16) + { + int tmpSum = 0, tmpSqr = 0; + for(coord.y = 0; coord.y < height;) + { + data = read_imagei(input, coord); + coord.y++; + tmpSum += data.x; + tmpSqr += data.x * data.x; + } + sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; + sum += (tmpSum - height * input_zp) * input_scale; + } + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl new file mode 100644 index 0000000..9ba0dc4 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl @@ -0,0 +1,113 @@ +__kernel void moments_axis1_U8toF16( + __read_only image2d_array_t input, + __write_only image2d_t output_mean, + __write_only image2d_t output_vari, + int axis, int axis_num, int input_zp, float input_scale, + int width, int height, int chn, float dimRatio + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + + int4 coord0 = (int4)(gidx, 0, gidz, 0); + uint data; + float sum = 0, sqr = 0; + uint tmpSum = 0, tmpSqr = 0; + float e2InScale = input_scale * input_scale; + + { + for(coord0.y = 0; coord0.y < height;) + { + data = read_imageui(input, coord0).x; + coord0.y++; + tmpSum += (data); + tmpSqr += (data * data); + } + sqr = convert_float(tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; + sum = convert_float(tmpSum - height * input_zp) * input_scale; + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + int2 coord_out = (int2)(gidx, gidz); + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); +} + +#define 
MOMENTS_AXIS1_F(src0_type_name) \ +__kernel void moments_axis1_##src0_type_name##to##src0_type_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_t output_mean, \ + __write_only image2d_t output_vari, \ + int axis, int axis_num, int input_zp, float input_scale, \ + int width, int height, int chn, float dimRatio \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidz = get_global_id(1); \ + \ + int4 coord0 = (int4)(gidx, 0, gidz, 0); \ + float data; \ + float sum = 0, sqr = 0; \ + \ + for(coord0.y = 0; coord0.y < height;) \ + { \ + data = read_imagef(input, coord0).x; \ + coord0.y++; \ + sum += (data); \ + sqr += (data * data); \ + } \ + \ + float4 mean, vari; \ + mean.x = sum * dimRatio; \ + vari.x = sqr * dimRatio; \ + vari.x = vari.x - mean.x * mean.x; \ + \ + int2 coord_out = (int2)(gidx, gidz); \ + write_imagef(output_mean, coord_out, mean); \ + write_imagef(output_vari, coord_out, vari); \ +} +MOMENTS_AXIS1_F(F16) +MOMENTS_AXIS1_F(F32) + +__kernel void moments_axis1_I32toF32( + __read_only image2d_array_t input, + __write_only image2d_t output_mean, + __write_only image2d_t output_vari, + int axis, + int axis_num, + int input_zp, + float input_scale, + int width, + int height, + int chn, + float dimRatio + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + + int4 coord0 = (int4)(gidx, 0, gidz, 0); + int data; + int sum = 0, sqr = 0; + + for(coord0.y = 0; coord0.y < height;) + { + data = read_imagei(input, coord0).x; + coord0.y++; + sum += (data); + sqr += (data * data); + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + int2 coord_out = (int2)(gidx, gidz); + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl new file mode 100644 index 0000000..e15d25a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl @@ -0,0 +1,125 @@ +__kernel void moments_axis2_U8toF16( + __read_only image2d_array_t input, + __write_only image2d_t output_mean, + __write_only image2d_t output_vari, + int axis, + int axis_num, + int input_zp, + float input_scale, + int width, + int height, + int chn, + float dimRatio + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + int4 coord0 = (int4)(gidx, gidy, 0, 0); + uint data; + float sum = 0, sqr = 0; + uint tmpSum = 0, tmpSqr = 0; + float e2InScale = input_scale * input_scale; + + { + for(coord0.z = 0; coord0.z < chn;) + { + data = read_imageui(input, coord0).x; + coord0.z++; + tmpSum += (data); + tmpSqr += (data * data); + } + sqr = (tmpSqr - 2 * input_zp * tmpSum + chn * input_zp * input_zp) * e2InScale; + sum = (tmpSum - chn * input_zp) * input_scale; + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + int2 coord_out = (int2)(gidx, gidy); + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); +} + +#define MOMENTS_AXIS2_F(src0_type_name) \ +__kernel void moments_axis2_##src0_type_name##to##src0_type_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_t output_mean, \ + __write_only image2d_t output_vari, \ + int axis, \ + int axis_num, \ + int input_zp, \ + float input_scale, \ + int width, \ + int height, \ + int chn, \ + float dimRatio \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = 
get_global_id(1); \ + \ + int4 coord0 = (int4)(gidx, gidy, 0, 0); \ + float data; \ + float sum = 0, sqr = 0; \ + \ + for(coord0.z = 0; coord0.z < chn;) \ + { \ + data = read_imagef(input, coord0).x; \ + coord0.z++; \ + sum += (data); \ + sqr += (data * data); \ + } \ + \ + float4 mean, vari; \ + mean.x = sum * dimRatio; \ + vari.x = sqr * dimRatio; \ + vari.x = vari.x - mean.x * mean.x; \ + \ + int2 coord_out = (int2)(gidx, gidy); \ + write_imagef(output_mean, coord_out, mean); \ + write_imagef(output_vari, coord_out, vari); \ +} +MOMENTS_AXIS2_F(F16) +MOMENTS_AXIS2_F(F32) + +__kernel void moments_axis2_I32toF32( + __read_only image2d_array_t input, + __write_only image2d_t output_mean, + __write_only image2d_t output_vari, + int axis, + int axis_num, + int input_zp, + float input_scale, + int width, + int height, + int chn, + float dimRatio + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + int4 coord0 = (int4)(gidx, gidy, 0, 0); + int data; + int sum = 0, sqr = 0; + + for(coord0.z = 0; coord0.z < chn;) + { + data = read_imagei(input, coord0).x; + coord0.z++; + sum += (data); + sqr += (data * data); + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + int2 coord_out = (int2)(gidx, gidy); + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/poolwithargmax.cl b/src/tim/vx/internal/src/libnnext/ops/cl/poolwithargmax.cl new file mode 100644 index 0000000..09e5843 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/poolwithargmax.cl @@ -0,0 +1,251 @@ + +#define POOLWITHARGMAX_PROCESS(data_type, read_fun, write_fun0, write_fun1) \ + data_type src = 0; \ + data_type max = 0; \ + uint4 axis = 0; \ + src.x = read_fun(input, coord_in).x; \ + coord_in.x++; \ + src.y = read_fun(input, coord_in).x; \ + coord_in.y++; \ + src.w = read_fun(input, coord_in).x; \ + coord_in.x--; \ + src.z = read_fun(input, coord_in).x; \ + max.x = src.x; \ + axis.x = 0; \ + if (src.y > max.x) \ + { \ + max.x = src.y; \ + axis.x = 1; \ + } \ + if (src.z > max.x) \ + { \ + max.x = src.z; \ + axis.x = 2; \ + } \ + if (src.w > max.x) \ + { \ + max.x = src.w; \ + axis.x = 3; \ + } \ + write_fun0(output, coord_out, max); \ + write_fun1(outaxis, coord_out, axis); + + +__kernel void poolwithargmax_F32to_F32_U8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + __write_only image2d_array_t outaxis) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_in = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0); + POOLWITHARGMAX_PROCESS(float4, read_imagef, write_imagef, write_imageui) +} + +__kernel void poolwithargmax_F32to_F32_U8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t outaxis) +{ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + int2 coord_in = (int2)(get_global_id(0) << 1, get_global_id(1) << 1); + POOLWITHARGMAX_PROCESS(float4, read_imagef, write_imagef, write_imageui) +} + +__kernel void poolwithargmax_I32to_I32_U8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + __write_only image2d_array_t outaxis) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_in = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0); + POOLWITHARGMAX_PROCESS(int4, read_imagei, 
write_imagei, write_imageui) +} + +__kernel void poolwithargmax_I32to_I32_U8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t outaxis) +{ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + int2 coord_in = (int2)(get_global_id(0) << 1, get_global_id(1) << 1); + POOLWITHARGMAX_PROCESS(int4, read_imagei, write_imagei, write_imageui) +} + + +#define POOLWITHARGMAX_U8_PROCESS() \ + uint4 src = 0; \ + uint4 max = 0; \ + uint4 axis = 0; \ + float4 result = 0.0f; \ + src.x = read_imageui(input, coord_in).x; \ + coord_in.x++; \ + src.y = read_imageui(input, coord_in).x; \ + coord_in.y++; \ + src.w = read_imageui(input, coord_in).x; \ + coord_in.x--; \ + src.z = read_imageui(input, coord_in).x; \ + max.x = src.x; \ + axis.x = 0; \ + if (src.y > max.x) \ + { \ + max.x = src.y; \ + axis.x = 1; \ + } \ + if (src.z > max.x) \ + { \ + max.x = src.z; \ + axis.x = 2; \ + } \ + if (src.w > max.x) \ + { \ + max.x = src.w; \ + axis.x = 3; \ + } \ + result.x = convert_float4(max).x * scale_value + tail_value; \ + max = convert_uint4(result);\ + write_imageui(output, coord_out, max); \ + write_imageui(outaxis, coord_out, axis); + + +__kernel void poolwithargmax_U8to_U8_U8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + __write_only image2d_array_t outaxis, + float scale_value, + float tail_value) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_in = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0); + POOLWITHARGMAX_U8_PROCESS() +} + +__kernel void poolwithargmax_U8to_U8_U8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t outaxis, + float scale_value, + float tail_value) +{ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + int2 coord_in = (int2)(get_global_id(0) << 1, get_global_id(1) << 1); + POOLWITHARGMAX_U8_PROCESS() +} + + +#define POOLWITHARGMAX_U8_TO_F32_PROCESS() \ + uint4 src = 0; \ + uint4 max = 0; \ + uint4 axis = 0; \ + float4 result = 0.0f; \ + src.x = read_imageui(input, coord_in).x; \ + coord_in.x++; \ + src.y = read_imageui(input, coord_in).x; \ + coord_in.y++; \ + src.w = read_imageui(input, coord_in).x; \ + coord_in.x--; \ + src.z = read_imageui(input, coord_in).x; \ + max.x = src.x; \ + axis.x = 0; \ + if (src.y > max.x) \ + { \ + max.x = src.y; \ + axis.x = 1; \ + } \ + if (src.z > max.x) \ + { \ + max.x = src.z; \ + axis.x = 2; \ + } \ + if (src.w > max.x) \ + { \ + max.x = src.w; \ + axis.x = 3; \ + } \ + result.x = convert_float4(max).x * scale_value + tail_value; \ + write_imagef(output, coord_out, result); \ + write_imageui(outaxis, coord_out, axis); + + +__kernel void poolwithargmax_U8to_F32_U8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + __write_only image2d_array_t outaxis, + float scale_value, + float tail_value) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_in = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0); + POOLWITHARGMAX_U8_TO_F32_PROCESS() +} + +__kernel void poolwithargmax_U8to_F32_U8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t outaxis, + float scale_value, + float tail_value) +{ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + int2 coord_in = (int2)(get_global_id(0) << 1, get_global_id(1) << 1); + POOLWITHARGMAX_U8_TO_F32_PROCESS() +} + +#define 
POOLWITHARGMAX_F32_TO_U8_PROCESS() \ + float4 src = 0; \ + float4 max = 0; \ + uint4 axis = 0; \ + uint4 dst = 0; \ + float4 result = 0.0f; \ + src.x = read_imagef(input, coord_in).x; \ + coord_in.x++; \ + src.y = read_imagef(input, coord_in).x; \ + coord_in.y++; \ + src.w = read_imagef(input, coord_in).x; \ + coord_in.x--; \ + src.z = read_imagef(input, coord_in).x; \ + max.x = src.x; \ + axis.x = 0; \ + if (src.y > max.x) \ + { \ + max.x = src.y; \ + axis.x = 1; \ + } \ + if (src.z > max.x) \ + { \ + max.x = src.z; \ + axis.x = 2; \ + } \ + if (src.w > max.x) \ + { \ + max.x = src.w; \ + axis.x = 3; \ + } \ + result.x = max.x * scale_value + tail_value; \ + dst = convert_uint4(result);\ + write_imageui(output, coord_out, dst); \ + write_imageui(outaxis, coord_out, axis); + + +__kernel void poolwithargmax_F32to_U8_U8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + __write_only image2d_array_t outaxis, + float scale_value, + float tail_value) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_in = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0); + POOLWITHARGMAX_F32_TO_U8_PROCESS() +} + +__kernel void poolwithargmax_F32to_U8_U8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t outaxis, + float scale_value, + float tail_value) +{ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + int2 coord_in = (int2)(get_global_id(0) << 1, get_global_id(1) << 1); + POOLWITHARGMAX_F32_TO_U8_PROCESS() +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/pow.cl b/src/tim/vx/internal/src/libnnext/ops/cl/pow.cl new file mode 100644 index 0000000..9acbe98 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/pow.cl @@ -0,0 +1,44 @@ +__kernel void pow_FP32FP32toFP32 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + float4 src0, src1; + float4 dst; + readImage2DArray(src0, input0, coord); + readImage2DArray(src1, input1, coord); + + float4 s0 = sign(src0); + int4 t0 = convert_int4(src1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + dst.x = (src0.x == 0 && src1.x == 0) ? 1.0f : (src0.x != 0 ? (s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f); + + write_imagef(output, coord, dst); +} + +__kernel void pow_FP32FP32toFP32_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + float4 src0 = read_imagef(input0, coord); + float4 src1 = read_imagef(input1, coord); + + float4 dst = (float4)(0); + + float4 s0 = sign(src0); + int4 t0 = convert_int4(src1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + + dst.x = (src0.x == 0 && src1.x == 0) ? 1.0f : (src0.x != 0 ? 
(s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f); + + write_imagef(output, coord, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/prelu.cl b/src/tim/vx/internal/src/libnnext/ops/cl/prelu.cl new file mode 100644 index 0000000..daf2819 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/prelu.cl @@ -0,0 +1,177 @@ +__kernel void prelu_FP32FP32toFP32 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + float4 src0; + float4 src1; + readImage2DArray(src0, input0, coord); + readImage2DArray(src1, input1, coord); + + float4 maxData = src0 >= 0 ? src0 : 0; + float4 minData = src0 < 0 ? src0 : 0; + float4 dst = maxData + minData * src1; + + write_imagef(output, coord, dst); +} + +__kernel void prelu_FP32FP32toFP32_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + float4 src0 = read_imagef(input0, coord); + float4 src1 = read_imagef(input1, coord); + + float4 maxData = src0 >= 0 ? src0 : 0; + float4 minData = src0 < 0 ? src0 : 0; + float4 dst = maxData + minData * src1; + + write_imagef(output, coord, dst); +} + +__kernel void prelu_U8U8toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + uint4 src0; + uint4 src1; + readImage2DArray(src0, input0, coord); + readImage2DArray(src1, input1, coord); + + float4 data0 = convert_float4(src0) * input0Scale - input0Tail; + float4 data1 = convert_float4(src1) * input1Scale - input1Tail; + + float4 maxData = data0 >= 0 ? data0 : 0; + float4 minData = data0 < 0 ? data0 : 0; + float4 data = maxData + minData * data1; + + uint4 dst = convert_uint4(data * outputScale + outputZP); + + write_imageui(output, coord, dst); +} + +__kernel void prelu_U8U8toU8_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + uint4 src0 = read_imageui(input0, coord); + uint4 src1 = read_imageui(input1, coord); + + float4 data0 = convert_float4(src0) * input0Scale - input0Tail; + float4 data1 = convert_float4(src1) * input1Scale - input1Tail; + + float4 maxData = data0 >= 0 ? data0 : 0; + float4 minData = data0 < 0 ? 
data0 : 0; + float4 data = maxData + minData * data1; + + uint4 dst = convert_uint4(data * outputScale + outputZP); + + write_imageui(output, coord, dst); +} + + +__kernel void prelu_I32I32toI32 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + int4 src0; + int4 src1; + readImage2DArray(src0, input0, coord); + readImage2DArray(src1, input1, coord); + + float4 data0 = convert_float4(src0) * input0Scale - input0Tail; + float4 data1 = convert_float4(src1) * input1Scale - input1Tail; + + float4 maxData = data0 >= 0 ? data0 : 0; + float4 minData = data0 < 0 ? data0 : 0; + float4 data = maxData + minData * data1; + + int4 dst = convert_int4(data * outputScale + outputZP); + + write_imagei(output, coord, dst); +} + +__kernel void prelu_I32I32toI32_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + int4 src0 = read_imagei(input0, coord); + int4 src1 = read_imagei(input1, coord); + + float4 data0 = convert_float4(src0) * input0Scale - input0Tail; + float4 data1 = convert_float4(src1) * input1Scale - input1Tail; + + float4 maxData = data0 >= 0 ? data0 : 0; + float4 minData = data0 < 0 ? data0 : 0; + float4 data = maxData + minData * data1; + + int4 dst = convert_int4(data * outputScale + outputZP); + + write_imagei(output, coord, dst); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/random_multinomial.cl b/src/tim/vx/internal/src/libnnext/ops/cl/random_multinomial.cl new file mode 100644 index 0000000..2a90f35 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/random_multinomial.cl @@ -0,0 +1,180 @@ +#pragma OPENCL EXTENSION CL_VIV_asm : enable + +inline uchar* get_image2D_array_ptr(image2d_array_t input) +{ + int8 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + uchar *src_ptr = (uchar*)desc.s0; + + return src_ptr; +} + +uint4 _philox4x32bumpkey(uint4 key) +{ + uint4 mask = (uint4)((uint)0x9E3779B9, (uint)0xBB67AE85, 0, 0); + //key.x += ((uint)0x9E3779B9); + //key.y += ((uint)0xBB67AE85); + key += mask; + return key; +} + +uint mullo32(uint a, uint b) +{ + return a * b; +} + +uint mulhi32(uint a, uint b) +{ + return mul_hi(a, b); +} + +uint4 _philox4x32round(uint4 ctr, uint4 key) +{ + uint PHILOX_M4x32_0 = ((uint)0xD2511F53); + uint PHILOX_M4x32_1 = ((uint)0xCD9E8D57); + + uint lo0 = mullo32(PHILOX_M4x32_0, ctr.x); + uint hi0 = mulhi32(PHILOX_M4x32_0, ctr.x); + uint lo1 = mullo32(PHILOX_M4x32_1, ctr.z); + uint hi1 = mulhi32(PHILOX_M4x32_1, ctr.z); + + uint4 out = (uint4)(hi1^ctr.y^key.x, lo1, hi0^ctr.w^key.y, lo0); + return out; +} + +uint4 philox4x32_R_10(uint4 ctr, uint4 key) +{ + uint i; + ctr = _philox4x32round(ctr, key); + for (i = 1; i < 10; i++) + { + key = _philox4x32bumpkey(key); + ctr = _philox4x32round(ctr, key); + } + return ctr; +} + +__kernel void random_seed( + __read_only image2d_array_t seeds, + __write_only image2d_array_t output, + int iter, + float re_rand_max + ) +{ + __global uint* seeds_ptr = (__global uint*)get_image2D_array_ptr(seeds); + seeds_ptr = seeds_ptr; + uint4 key = vload4(0, seeds_ptr); + + uint4 ctr = (uint4)(0); + float4 result 
= 0; + + __global float* output_ptr = (__global float*)get_image2D_array_ptr(output); + + for(int i = 0; i < iter; i++) + { + ctr = philox4x32_R_10(ctr, key); + result = convert_float4(ctr) * re_rand_max; + vstore4(result, i, output_ptr); + } +} + +#define logE (1.44269502f) +float eltwise_unary_exp(float x) +{ + x *= logE; + x = exp2(x); + return x; +} +// N times of 8 +// x dim = 1 + +__kernel void random_multinomial_cdf_F32 + ( + __read_only image2d_t input, + __write_only image2d_t output + ) +{ + int2 coord = (int2)(0, get_global_id(1)); + int class_max_iter = get_image_width(input); + float4 src0, data; + float4 dst = 0; + + float4 maxVal = read_imagef(input, coord); + + for(coord.x = 1; coord.x < class_max_iter;) + { + src0 = read_imagef(input, coord); + coord.x ++; + + maxVal = maxVal > src0 ? maxVal : src0; + } + + for(coord.x = 0; coord.x < class_max_iter; ) + { + float4 val; + src0 = read_imagef(input, coord); + + data = src0 - maxVal; + val.x = eltwise_unary_exp(data.x); + val.x += dst.x; + dst.x = val.x; + write_imagef(output, coord.xy, val); + coord.x ++; + } +} + +uint upper_bound(float* a, int n, float x) +{ + uint l = 0; + uint h = n; + while (l < h) { + int mid = (l + h) >> 1; + if (x >= a[mid]) { + l = mid + 1; + } else { + h = mid; + } + } + return l; +} + +// one thread calculate 4 +__kernel void random_multinomial + ( + __read_only image2d_array_t randoms, + __read_only image2d_array_t cdfs, + __write_only image2d_array_t output + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord = (int4)(gidx, gidy, 0, 0); + int class_size = get_image_width(cdfs); + + int offset = gidy * class_size; + __global float* cdf_ptr = (__global float*)get_image2D_array_ptr(cdfs); + __global float* cdfPtr = cdf_ptr + offset; + + int width = get_image_width(randoms); + offset = coord.x + coord.y * width; + __global float* randoms_ptr = (__global float*)get_image2D_array_ptr(randoms); + randoms_ptr = randoms_ptr + offset; + + width = get_image_width(output); + offset = coord.x + coord.y * width; + __global uint* output_ptr = (__global uint*)get_image2D_array_ptr(output); + output_ptr = output_ptr + offset; + + float4 ran = vload4(0, randoms_ptr); + float total = cdfPtr[class_size - 1]; + float4 target = ran * total; + + uint4 out_class = (uint4)(0); + out_class.x = upper_bound(cdfPtr, class_size, target.x); + out_class.y = upper_bound(cdfPtr, class_size, target.y); + out_class.z = upper_bound(cdfPtr, class_size, target.z); + out_class.w = upper_bound(cdfPtr, class_size, target.w); + + vstore4(out_class, 0, output_ptr); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reduceall_internal_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reduceall_internal_axis0.cl new file mode 100644 index 0000000..7dd43a6 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reduceall_internal_axis0.cl @@ -0,0 +1,45 @@ +__kernel void reduceall_axis0_I8toI8 + ( + __read_only image2d_array_t input, + __write_only image2d_t output + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + int axisSize = get_image_width(input); + + int4 allVal = read_imagei(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + int4 val = read_imagei(input, coord); + allVal = val && allVal; + coord.x ++; + } + allVal.x = allVal.x & 1; + write_imagei(output, coord.yz, allVal); +} + +__kernel void reduceall_axis0_I8toI8_2D + ( + __read_only image2d_t input, + __write_only image2d_t output + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + int axisSize = 
get_image_width(input); + + int4 allVal = read_imagei(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + int4 val = read_imagei(input, coord); + allVal = val && allVal; + coord.x ++; + } + allVal.x = allVal.x & 1; + coord.x = 0; + write_imagei(output, coord.yx, allVal); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reduceall_internal_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reduceall_internal_axis1.cl new file mode 100644 index 0000000..8ecafb9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reduceall_internal_axis1.cl @@ -0,0 +1,45 @@ +__kernel void reduceall_axis1_I8toI8 + ( + __read_only image2d_array_t input, + __write_only image2d_t output + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int axisSize = get_image_height(input); + + int4 allVal = read_imagei(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + int4 val = read_imagei(input, coord); + allVal = val && allVal; + coord.y ++; + } + allVal.x = allVal.x & 1; + write_imagei(output, coord.xz, allVal); +} + +__kernel void reduceall_axis1_I8toI8_2D + ( + __read_only image2d_t input, + __write_only image2d_t output + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + int axisSize = get_image_height(input); + + int4 allVal = read_imagei(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + int4 val = read_imagei(input, coord); + allVal = val && allVal; + coord.y ++; + } + allVal.x = allVal.x & 1; + coord.y = 0; + write_imagei(output, coord, allVal); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reduceall_internal_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reduceall_internal_axis2.cl new file mode 100644 index 0000000..99ccbbd --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reduceall_internal_axis2.cl @@ -0,0 +1,24 @@ +__kernel void reduceall_axis2_I8toI8 + ( + __read_only image2d_array_t input, + __write_only image2d_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int axisSize = get_image_depth(input); + + int4 allVal = read_imagei(input, coord); + coord.z ++; + + for (; coord.z < axisSize;) + { + int4 val = read_imagei(input, coord); + allVal = val && allVal; + coord.z ++; + } + allVal.x = allVal.x & 1; + write_imagei(output, coord.xy, allVal); +} + + + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reduceany_internal_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reduceany_internal_axis0.cl new file mode 100644 index 0000000..a9fa8b8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reduceany_internal_axis0.cl @@ -0,0 +1,45 @@ +__kernel void reduceany_axis0_I8toI8 + ( + __read_only image2d_array_t input, + __write_only image2d_t output + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + int axisSize = get_image_width(input); + + int4 anyVal = read_imagei(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + int4 val = read_imagei(input, coord); + anyVal = val || anyVal; + coord.x ++; + } + anyVal.x = anyVal.x & 1; + write_imagei(output, coord.yz, anyVal); +} + +__kernel void reduceany_axis0_I8toI8_2D + ( + __read_only image2d_t input, + __write_only image2d_t output + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + int axisSize = get_image_width(input); + + int4 anyVal = read_imagei(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + int4 val = read_imagei(input, coord); + anyVal = val || anyVal; + coord.x ++; + } + anyVal.x = anyVal.x & 1; + coord.x = 0; + write_imagei(output, coord.yx, 
anyVal); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reduceany_internal_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reduceany_internal_axis1.cl new file mode 100644 index 0000000..ff3da83 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reduceany_internal_axis1.cl @@ -0,0 +1,45 @@ +__kernel void reduceany_axis1_I8toI8 + ( + __read_only image2d_array_t input, + __write_only image2d_t output + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int axisSize = get_image_height(input); + + int4 anyVal = read_imagei(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + int4 val = read_imagei(input, coord); + anyVal = val || anyVal; + coord.y ++; + } + anyVal.x = anyVal.x & 1; + write_imagei(output, coord.xz, anyVal); +} + +__kernel void reduceany_axis1_I8toI8_2D + ( + __read_only image2d_t input, + __write_only image2d_t output + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + int axisSize = get_image_height(input); + + int4 anyVal = read_imagei(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + int4 val = read_imagei(input, coord); + anyVal = val || anyVal; + coord.y ++; + } + anyVal.x = anyVal.x & 1; + coord.y = 0; + write_imagei(output, coord, anyVal); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reduceany_internal_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reduceany_internal_axis2.cl new file mode 100644 index 0000000..bc8c0bd --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reduceany_internal_axis2.cl @@ -0,0 +1,24 @@ +__kernel void reduceany_axis2_I8toI8 + ( + __read_only image2d_array_t input, + __write_only image2d_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int axisSize = get_image_depth(input); + + int4 anyVal = read_imagei(input, coord); + coord.z ++; + + for (; coord.z < axisSize;) + { + int4 val = read_imagei(input, coord); + anyVal = val || anyVal; + coord.z ++; + } + anyVal.x = anyVal.x & 1; + write_imagei(output, coord.xy, anyVal); +} + + + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reducemax_internal_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reducemax_internal_axis0.cl new file mode 100644 index 0000000..855a8d5 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reducemax_internal_axis0.cl @@ -0,0 +1,145 @@ +__kernel void reducemax_axis0_F32toF32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + int axisSize = get_image_width(input); + float4 maxVal = read_imagef(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + float4 val = read_imagef(input, coord); + maxVal = val > maxVal ? val : maxVal; + coord.x ++; + } + + write_imagef(output, coord.yz, maxVal); +} + +__kernel void reducemax_axis0_F32toF32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + int axisSize = get_image_width(input); + float4 maxVal = read_imagef(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + float4 val = read_imagef(input, coord); + maxVal = val > maxVal ? 
val : maxVal; + coord.x ++; + } + + coord.x = 0; + write_imagef(output, coord.yx, maxVal); +} + +__kernel void reducemax_axis0_U8toU8 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + int axisSize = get_image_width(input); + uint4 dst; + uint4 maxVal = read_imageui(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + uint4 val = read_imageui(input, coord); + maxVal = val > maxVal ? val : maxVal; + coord.x ++; + } + dst = convert_uint4(convert_float4(maxVal) * inputScale + inputTail); + write_imageui(output, coord.yz, dst); +} + +__kernel void reducemax_axis0_U8toU8_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + int axisSize = get_image_width(input); + uint4 dst; + uint4 maxVal = read_imageui(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + uint4 val = read_imageui(input, coord); + maxVal = val > maxVal ? val : maxVal; + coord.x ++; + } + dst = convert_uint4(convert_float4(maxVal) * inputScale + inputTail); + coord.x = 0; + write_imageui(output, coord.yx, dst); +} + +__kernel void reducemax_axis0_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + int axisSize = get_image_width(input); + + int4 maxVal = read_imagei(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + int4 val = read_imagei(input, coord); + maxVal = val > maxVal ? val : maxVal; + coord.x ++; + } + + write_imagei(output, coord.yz, maxVal); +} + +__kernel void reducemax_axis0_I32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + int axisSize = get_image_width(input); + + int4 maxVal = read_imagei(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + int4 val = read_imagei(input, coord); + maxVal = val > maxVal ? val : maxVal; + coord.x ++; + } + + coord.x = 0; + write_imagei(output, coord.yx, maxVal); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reducemax_internal_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reducemax_internal_axis1.cl new file mode 100644 index 0000000..58eef14 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reducemax_internal_axis1.cl @@ -0,0 +1,147 @@ +__kernel void reducemax_axis1_F32toF32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int axisSize = get_image_height(input); + + float4 maxVal = read_imagef(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + float4 val = read_imagef(input, coord); + maxVal = val > maxVal ? val : maxVal; + coord.y ++; + } + + write_imagef(output, coord.xz, maxVal); +} + +__kernel void reducemax_axis1_F32toF32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + int axisSize = get_image_height(input); + + float4 maxVal = read_imagef(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + float4 val = read_imagef(input, coord); + maxVal = val > maxVal ? 
val : maxVal; + coord.y ++; + } + + coord.y = 0; + write_imagef(output, coord, maxVal); +} + +__kernel void reducemax_axis1_U8toU8 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int axisSize = get_image_height(input); + uint4 dst; + uint4 maxVal = read_imageui(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + uint4 val = read_imageui(input, coord); + maxVal = val > maxVal ? val : maxVal; + coord.y ++; + } + dst = convert_uint4(convert_float4(maxVal) * inputScale + inputTail); + write_imageui(output, coord.xz, dst); +} + +__kernel void reducemax_axis1_U8toU8_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + int axisSize = get_image_height(input); + uint4 dst; + uint4 maxVal = read_imageui(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + uint4 val = read_imageui(input, coord); + maxVal = val > maxVal ? val : maxVal; + coord.y ++; + } + dst = convert_uint4(convert_float4(maxVal) * inputScale + inputTail); + coord.y = 0; + write_imageui(output, coord, dst); +} + +__kernel void reducemax_axis1_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int axisSize = get_image_height(input); + + int4 maxVal = read_imagei(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + int4 val = read_imagei(input, coord); + maxVal = val > maxVal ? val : maxVal; + coord.y ++; + } + + write_imagei(output, coord.xz, maxVal); +} + +__kernel void reducemax_axis1_I32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + int axisSize = get_image_height(input); + + int4 maxVal = read_imagei(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + int4 val = read_imagei(input, coord); + maxVal = val > maxVal ? val : maxVal; + coord.y ++; + } + + coord.y = 0; + write_imagei(output, coord, maxVal); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reducemax_internal_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reducemax_internal_axis2.cl new file mode 100644 index 0000000..1794d31 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reducemax_internal_axis2.cl @@ -0,0 +1,76 @@ +__kernel void reducemax_axis2_F32toF32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int axisSize = get_image_depth(input); + + float4 maxVal = read_imagef(input, coord); + coord.z ++; + + for (; coord.z < axisSize;) + { + float4 val = read_imagef(input, coord); + maxVal = val > maxVal ? val : maxVal; + coord.z ++; + } + + write_imagef(output, coord.xy, maxVal); +} + + +__kernel void reducemax_axis2_U8toU8 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int axisSize = get_image_depth(input); + uint4 dst; + uint4 maxVal = read_imageui(input, coord); + coord.z ++; + + for (; coord.z < axisSize;) + { + uint4 val = read_imageui(input, coord); + maxVal = val > maxVal ? 
val : maxVal; + coord.z ++; + } + dst = convert_uint4(convert_float4(maxVal) * inputScale + inputTail); + write_imageui(output, coord.xy, dst); +} + + +__kernel void reducemax_axis2_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int axisSize = get_image_depth(input); + + int4 maxVal = read_imagei(input, coord); + coord.z ++; + + for (; coord.z < axisSize;) + { + int4 val = read_imagei(input, coord); + maxVal = val > maxVal ? val : maxVal; + coord.z ++; + } + + write_imagei(output, coord.xy, maxVal); +} + + + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reducemin_internal_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reducemin_internal_axis0.cl new file mode 100644 index 0000000..e4af4d8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reducemin_internal_axis0.cl @@ -0,0 +1,145 @@ +__kernel void reducemin_axis0_F32toF32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + int axisSize = get_image_width(input); + float4 minVal = read_imagef(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + float4 val = read_imagef(input, coord); + minVal = val < minVal ? val : minVal; + coord.x ++; + } + + write_imagef(output, coord.yz, minVal); +} + +__kernel void reducemin_axis0_F32toF32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + int axisSize = get_image_width(input); + float4 minVal = read_imagef(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + float4 val = read_imagef(input, coord); + minVal = val < minVal ? val : minVal; + coord.x ++; + } + + coord.x = 0; + write_imagef(output, coord.yx, minVal); +} + +__kernel void reducemin_axis0_U8toU8 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + int axisSize = get_image_width(input); + uint4 dst; + uint4 minVal = read_imageui(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + uint4 val = read_imageui(input, coord); + minVal = val < minVal ? val : minVal; + coord.x ++; + } + dst = convert_uint4(convert_float4(minVal) * inputScale + inputTail); + write_imageui(output, coord.yz, dst); +} + +__kernel void reducemin_axis0_U8toU8_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + int axisSize = get_image_width(input); + uint4 dst; + uint4 minVal = read_imageui(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + uint4 val = read_imageui(input, coord); + minVal = val < minVal ? 
val : minVal; + coord.x ++; + } + dst = convert_uint4(convert_float4(minVal) * inputScale + inputTail); + coord.x = 0; + write_imageui(output, coord.yx, dst); +} + +__kernel void reducemin_axis0_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + int axisSize = get_image_width(input); + + int4 minVal = read_imagei(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + int4 val = read_imagei(input, coord); + minVal = val < minVal ? val : minVal; + coord.x ++; + } + + write_imagei(output, coord.yz, minVal); +} + +__kernel void reducemin_axis0_I32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + int axisSize = get_image_width(input); + + int4 minVal = read_imagei(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + int4 val = read_imagei(input, coord); + minVal = val < minVal ? val : minVal; + coord.x ++; + } + + coord.x = 0; + write_imagei(output, coord.yx, minVal); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reducemin_internal_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reducemin_internal_axis1.cl new file mode 100644 index 0000000..3348804 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reducemin_internal_axis1.cl @@ -0,0 +1,147 @@ +__kernel void reducemin_axis1_F32toF32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int axisSize = get_image_height(input); + + float4 minVal = read_imagef(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + float4 val = read_imagef(input, coord); + minVal = val < minVal ? val : minVal; + coord.y ++; + } + + write_imagef(output, coord.xz, minVal); +} + +__kernel void reducemin_axis1_F32toF32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + int axisSize = get_image_height(input); + + float4 minVal = read_imagef(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + float4 val = read_imagef(input, coord); + minVal = val < minVal ? val : minVal; + coord.y ++; + } + + coord.y = 0; + write_imagef(output, coord, minVal); +} + +__kernel void reducemin_axis1_U8toU8 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int axisSize = get_image_height(input); + uint4 dst; + uint4 minVal = read_imageui(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + uint4 val = read_imageui(input, coord); + minVal = val < minVal ? val : minVal; + coord.y ++; + } + dst = convert_uint4(convert_float4(minVal) * inputScale + inputTail); + write_imageui(output, coord.xz, dst); +} + +__kernel void reducemin_axis1_U8toU8_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + int axisSize = get_image_height(input); + uint4 dst; + uint4 minVal = read_imageui(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + uint4 val = read_imageui(input, coord); + minVal = val < minVal ? 
val : minVal; + coord.y ++; + } + dst = convert_uint4(convert_float4(minVal) * inputScale + inputTail); + coord.y = 0; + write_imageui(output, coord, dst); +} + +__kernel void reducemin_axis1_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int axisSize = get_image_height(input); + + int4 minVal = read_imagei(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + int4 val = read_imagei(input, coord); + minVal = val < minVal ? val : minVal; + coord.y ++; + } + + write_imagei(output, coord.xz, minVal); +} + +__kernel void reducemin_axis1_I32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + int axisSize = get_image_height(input); + + int4 minVal = read_imagei(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + int4 val = read_imagei(input, coord); + minVal = val < minVal ? val : minVal; + coord.y ++; + } + + coord.y = 0; + write_imagei(output, coord, minVal); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reducemin_internal_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reducemin_internal_axis2.cl new file mode 100644 index 0000000..d1c2c6a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reducemin_internal_axis2.cl @@ -0,0 +1,76 @@ +__kernel void reducemin_axis2_F32toF32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int axisSize = get_image_depth(input); + + float4 minVal = read_imagef(input, coord); + coord.z ++; + + for (; coord.z < axisSize;) + { + float4 val = read_imagef(input, coord); + minVal = val < minVal ? val : minVal; + coord.z ++; + } + + write_imagef(output, coord.xy, minVal); +} + + +__kernel void reducemin_axis2_U8toU8 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int axisSize = get_image_depth(input); + uint4 dst; + uint4 minVal = read_imageui(input, coord); + coord.z ++; + + for (; coord.z < axisSize;) + { + uint4 val = read_imageui(input, coord); + minVal = val < minVal ? val : minVal; + coord.z ++; + } + dst = convert_uint4(convert_float4(minVal) * inputScale + inputTail); + write_imageui(output, coord.xy, dst); +} + + +__kernel void reducemin_axis2_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int axisSize = get_image_depth(input); + + int4 minVal = read_imagei(input, coord); + coord.z ++; + + for (; coord.z < axisSize;) + { + int4 val = read_imagei(input, coord); + minVal = val < minVal ? 
val : minVal; + coord.z ++; + } + + write_imagei(output, coord.xy, minVal); +} + + + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reduceprod_internal_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reduceprod_internal_axis0.cl new file mode 100644 index 0000000..9ecd6d5 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reduceprod_internal_axis0.cl @@ -0,0 +1,145 @@ +__kernel void reduceprod_axis0_F32toF32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + int axisSize = get_image_width(input); + float4 prodVal = read_imagef(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + float4 val = read_imagef(input, coord); + prodVal = val * prodVal; + coord.x ++; + } + + write_imagef(output, coord.yz, prodVal); +} + +__kernel void reduceprod_axis0_F32toF32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + int axisSize = get_image_width(input); + float4 prodVal = read_imagef(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + float4 val = read_imagef(input, coord); + prodVal = val * prodVal; + coord.x ++; + } + + coord.x = 0; + write_imagef(output, coord.yx, prodVal); +} + +__kernel void reduceprod_axis0_U8toU8 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputTail + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + int axisSize = get_image_width(input); + uint4 dst; + float4 prodVal = convert_float4(read_imageui(input, coord)); + prodVal = prodVal * inputScale + inputTail; + coord.x ++; + + for (; coord.x < axisSize;) + { + float4 val = convert_float4(read_imageui(input, coord)); + val = val * inputScale + inputTail; + prodVal = val * prodVal; + coord.x ++; + } + dst = convert_uint4(prodVal * outputScale + outputTail); + write_imageui(output, coord.yz, dst); +} + +__kernel void reduceprod_axis0_U8toU8_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputTail + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + int axisSize = get_image_width(input); + uint4 dst; + float4 prodVal = convert_float4(read_imageui(input, coord)); + prodVal = prodVal * inputScale + inputTail; + coord.x ++; + + for (; coord.x < axisSize;) + { + float4 val = convert_float4(read_imageui(input, coord)); + val = val * inputScale + inputTail; + prodVal = val * prodVal; + coord.x ++; + } + dst = convert_uint4(prodVal * outputScale + outputTail); + coord.x = 0; + write_imageui(output, coord.yx, dst); +} + +__kernel void reduceprod_axis0_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + int axisSize = get_image_width(input); + + int4 prodVal = read_imagei(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + int4 val = read_imagei(input, coord); + prodVal = val * prodVal; + coord.x ++; + } + + write_imagei(output, coord.yz, prodVal); +} + +__kernel void reduceprod_axis0_I32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + int axisSize = get_image_width(input); + + int4 prodVal = read_imagei(input, coord); + coord.x ++; + + for (; coord.x < axisSize;) + { + int4 val = read_imagei(input, coord); + prodVal = val * 
prodVal; + coord.x ++; + } + + coord.x = 0; + write_imagei(output, coord.yx, prodVal); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reduceprod_internal_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reduceprod_internal_axis1.cl new file mode 100644 index 0000000..ca69547 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reduceprod_internal_axis1.cl @@ -0,0 +1,147 @@ +__kernel void reduceprod_axis1_F32toF32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int axisSize = get_image_height(input); + + float4 prodVal = read_imagef(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + float4 val = read_imagef(input, coord); + prodVal = val * prodVal; + coord.y ++; + } + + write_imagef(output, coord.xz, prodVal); +} + +__kernel void reduceprod_axis1_F32toF32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + int axisSize = get_image_height(input); + + float4 prodVal = read_imagef(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + float4 val = read_imagef(input, coord); + prodVal = val * prodVal; + coord.y ++; + } + + coord.y = 0; + write_imagef(output, coord, prodVal); +} + +__kernel void reduceprod_axis1_U8toU8 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputTail + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int axisSize = get_image_height(input); + uint4 dst; + float4 prodVal = convert_float4(read_imageui(input, coord)); + prodVal = prodVal * inputScale + inputTail; + coord.y ++; + + for (; coord.y < axisSize;) + { + float4 val = convert_float4(read_imageui(input, coord)); + val = val * inputScale + inputTail; + prodVal = val * prodVal; + coord.y ++; + } + dst = convert_uint4(prodVal * outputScale + outputTail); + write_imageui(output, coord.xz, dst); +} + +__kernel void reduceprod_axis1_U8toU8_2D + ( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputTail + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + int axisSize = get_image_height(input); + uint4 dst; + float4 prodVal = convert_float4(read_imageui(input, coord)); + prodVal = prodVal * inputScale + inputTail; + coord.y ++; + + for (; coord.y < axisSize;) + { + float4 val = convert_float4(read_imageui(input, coord)); + val = val * inputScale + inputTail; + prodVal = val * prodVal; + coord.y ++; + } + dst = convert_uint4(prodVal * outputScale + outputTail); + coord.y = 0; + write_imageui(output, coord, dst); +} + +__kernel void reduceprod_axis1_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int axisSize = get_image_height(input); + + int4 prodVal = read_imagei(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + int4 val = read_imagei(input, coord); + prodVal = val * prodVal; + coord.y ++; + } + + write_imagei(output, coord.xz, prodVal); +} + +__kernel void reduceprod_axis1_I32toI32_2D + ( + __read_only image2d_t input, + __write_only image2d_t output + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + int axisSize = get_image_height(input); + + int4 prodVal = read_imagei(input, coord); + coord.y ++; + + for (; coord.y < axisSize;) + { + int4 val = read_imagei(input, coord); + prodVal 
= val * prodVal; + coord.y ++; + } + + coord.y = 0; + write_imagei(output, coord, prodVal); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reduceprod_internal_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reduceprod_internal_axis2.cl new file mode 100644 index 0000000..f1fa1cc --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reduceprod_internal_axis2.cl @@ -0,0 +1,76 @@ +__kernel void reduceprod_axis2_F32toF32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int axisSize = get_image_depth(input); + + float4 prodVal = read_imagef(input, coord); + coord.z ++; + + for (; coord.z < axisSize;) + { + float4 val = read_imagef(input, coord); + prodVal = val * prodVal; + coord.z ++; + } + + write_imagef(output, coord.xy, prodVal); +} + + +__kernel void reduceprod_axis2_U8toU8 + ( + __read_only image2d_array_t input, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int axisSize = get_image_depth(input); + uint4 dst; + float4 prodVal = convert_float4(read_imageui(input, coord)); + prodVal = prodVal * inputScale + inputTail; + coord.z ++; + + for (; coord.z < axisSize;) + { + float4 val = convert_float4(read_imageui(input, coord)); + val = val * inputScale + inputTail; + prodVal = val * prodVal; + coord.z ++; + } + dst = convert_uint4(prodVal * outputScale + outputTail); + write_imageui(output, coord.xy, dst); +} + + +__kernel void reduceprod_axis2_I32toI32 + ( + __read_only image2d_array_t input, + __write_only image2d_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int axisSize = get_image_depth(input); + + int4 prodVal = read_imagei(input, coord); + coord.z ++; + + for (; coord.z < axisSize;) + { + int4 val = read_imagei(input, coord); + prodVal = val * prodVal; + coord.z ++; + } + + write_imagei(output, coord.xy, prodVal); +} + + + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/relational_ops.cl b/src/tim/vx/internal/src/libnnext/ops/cl/relational_ops.cl new file mode 100644 index 0000000..cd13f7e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/relational_ops.cl @@ -0,0 +1,185 @@ + +#define COMPARISONS_F32(func_name, comp_op) \ +__kernel void func_name##_F32F32toBOOL8 \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output, \ + float input0Scale, \ + float input0Tail, \ + float input1Scale, \ + float input1Tail \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + float4 src0; \ + float4 src1; \ + readImage2DArray(src0, input0, coord); \ + readImage2DArray(src1, input1, coord); \ + \ + int4 dst = (src0)comp_op(src1); \ + dst &= 1; \ + \ + write_imagei(output, coord, dst); \ +} +COMPARISONS_F32(less, <) +COMPARISONS_F32(great, >) +COMPARISONS_F32(less_equal, <=) +COMPARISONS_F32(great_equal, >=) +COMPARISONS_F32(equal, ==) +COMPARISONS_F32(not_equal, !=) + +#define COMPARISONS_F32_2D(func_name, comp_op) \ +__kernel void func_name##_F32F32toBOOL8_2D \ + ( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + float input0Scale, \ + float input0Tail, \ + float input1Scale, \ + float input1Tail \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + float4 src0 = read_imagef(input0, coord); \ + float4 
src1 = read_imagef(input1, coord); \ + \ + int4 dst = (src0)comp_op(src1); \ + dst &= 1; \ + \ + write_imagei(output, coord, dst); \ +} +COMPARISONS_F32_2D(less, <) +COMPARISONS_F32_2D(great, >) +COMPARISONS_F32_2D(less_equal, <=) +COMPARISONS_F32_2D(great_equal, >=) +COMPARISONS_F32_2D(equal, ==) +COMPARISONS_F32_2D(not_equal, !=) + +#define COMPARISONS_U32(func_name, comp_op) \ +__kernel void func_name##_U32U32toBOOL8 \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output, \ + float input0Scale, \ + float input0Tail, \ + float input1Scale, \ + float input1Tail \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + uint4 data0; \ + uint4 data1; \ + readImage2DArray(data0, input0, coord); \ + readImage2DArray(data1, input1, coord); \ + \ + float4 src0 = convert_float4(data0) * input0Scale - input0Tail; \ + float4 src1 = convert_float4(data1) * input1Scale - input1Tail; \ + int4 dst = (src0)comp_op(src1); \ + dst &= 1; \ + \ + write_imagei(output, coord, dst); \ +} +COMPARISONS_U32(less, <) +COMPARISONS_U32(great, >) +COMPARISONS_U32(less_equal, <=) +COMPARISONS_U32(great_equal, >=) +COMPARISONS_U32(equal, ==) +COMPARISONS_U32(not_equal, !=) + +#define COMPARISONS_U32_2D(func_name, comp_op) \ +__kernel void func_name##_U32U32toBOOL8_2D \ + ( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + float input0Scale, \ + float input0Tail, \ + float input1Scale, \ + float input1Tail \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + uint4 data0 = read_imageui(input0, coord); \ + uint4 data1 = read_imageui(input1, coord); \ + \ + float4 src0 = convert_float4(data0) * input0Scale - input0Tail; \ + float4 src1 = convert_float4(data1) * input1Scale - input1Tail; \ + int4 dst = (src0)comp_op(src1); \ + dst &= 1; \ + \ + write_imagei(output, coord, dst); \ +} +COMPARISONS_U32_2D(less, <) +COMPARISONS_U32_2D(great, >) +COMPARISONS_U32_2D(less_equal, <=) +COMPARISONS_U32_2D(great_equal, >=) +COMPARISONS_U32_2D(equal, ==) +COMPARISONS_U32_2D(not_equal, !=) + +#define COMPARISONS_I32(func_name, comp_op) \ +__kernel void func_name##_I32I32toBOOL8 \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output, \ + float input0Scale, \ + float input0Tail, \ + float input1Scale, \ + float input1Tail \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + int4 src0; \ + int4 src1; \ + readImage2DArray(src0, input0, coord); \ + readImage2DArray(src1, input1, coord); \ + \ + int4 dst = (src0)comp_op(src1); \ + dst &= 1; \ + \ + write_imagei(output, coord, dst); \ +} +COMPARISONS_I32(less, <) +COMPARISONS_I32(great, >) +COMPARISONS_I32(less_equal, <=) +COMPARISONS_I32(great_equal, >=) +COMPARISONS_I32(equal, ==) +COMPARISONS_I32(not_equal, !=) + +#define COMPARISONS_I32_2D(func_name, comp_op) \ +__kernel void func_name##_I32I32toBOOL8_2D \ + ( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + float input0Scale, \ + float input0Tail, \ + float input1Scale, \ + float input1Tail \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + int4 src0 = read_imagei(input0, coord); \ + int4 src1 = read_imagei(input1, coord); \ + \ + int4 dst = (src0)comp_op(src1); \ + dst &= 1; \ + \ + write_imagei(output, coord, dst); \ +} 
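+// Note (added descriptive comment, not in the original source): the
+// COMPARISONS_I32_2D macro defined above is instantiated below once per
+// relational operator; e.g. COMPARISONS_I32_2D(less, <) emits a kernel named
+// less_I32I32toBOOL8_2D that compares the two int4 input texels component-wise.
+// OpenCL vector comparisons return -1 per component where the relation holds,
+// so `dst &= 1` normalizes the result to the 0/1 values written to the BOOL8
+// output image. The same convention applies to the F32 and U32 variants above.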
+COMPARISONS_I32_2D(less, <) +COMPARISONS_I32_2D(great, >) +COMPARISONS_I32_2D(less_equal, <=) +COMPARISONS_I32_2D(great_equal, >=) +COMPARISONS_I32_2D(equal, ==) +COMPARISONS_I32_2D(not_equal, !=) + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/relu_keras.cl b/src/tim/vx/internal/src/libnnext/ops/cl/relu_keras.cl new file mode 100644 index 0000000..6ccd5b4 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/relu_keras.cl @@ -0,0 +1,156 @@ + +__kernel void relu_keras_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float alpha, + float max_value, + float threshold, + float offset + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float4 src = read_imagef(input, coord); + float4 dst = src >= max_value ? max_value : src; + dst = dst < threshold ? alpha * dst + offset : dst; + write_imagef(output, coord, dst); +} + +__kernel void relu_keras_F32toF32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float alpha, + float max_value, + float threshold, + float offset + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + float4 src = read_imagef(input, coord); + float4 dst = src >= max_value ? max_value : src; + dst = dst < threshold ? alpha * dst + offset : dst; + write_imagef(output, coord, dst); +} + +__kernel void relu_keras_F32toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float alpha, + float max_value, + float threshold, + float offset, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float4 src = read_imagef(input, coord); + float4 result = src >= max_value ? max_value : src; + result = result < threshold ? alpha * result + offset : result; + uint4 dst = convert_uint4_rte(result * outputScale + outputZP); + write_imageui(output, coord, dst); +} + +__kernel void relu_keras_F32toU8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float alpha, + float max_value, + float threshold, + float offset, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + float4 src = read_imagef(input, coord); + float4 result = src >= max_value ? max_value : src; + result = result < threshold ? alpha * result + offset : result; + uint4 dst = convert_uint4_rte(result * outputScale + outputZP); + write_imageui(output, coord, dst); +} + +__kernel void relu_keras_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float alpha, + float max_value, + float threshold, + float offset, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail; + float4 result = src >= max_value ? max_value : src; + result = result < threshold ? 
alpha * result + offset : result; + uint4 dst = convert_uint4_rte(result * outputScale + outputZP); + write_imageui(output, coord, dst); +} + +__kernel void relu_keras_U8toU8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float alpha, + float max_value, + float threshold, + float offset, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail; + float4 result = src >= max_value ? max_value : src; + result = result < threshold ? alpha * result + offset : result; + uint4 dst = convert_uint4_rte(result * outputScale + outputZP); + write_imageui(output, coord, dst); +} + +__kernel void relu_keras_U8toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float alpha, + float max_value, + float threshold, + float offset, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail; + float4 dst = src >= max_value ? max_value : src; + dst = dst < threshold ? alpha * dst + offset : dst; + write_imagef(output, coord, dst); +} + +__kernel void relu_keras_U8toF32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float alpha, + float max_value, + float threshold, + float offset, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail; + float4 dst = src >= max_value ? max_value : src; + dst = dst < threshold ? 
alpha * dst + offset : dst; + write_imagef(output, coord, dst); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/resize_bilinear.cl b/src/tim/vx/internal/src/libnnext/ops/cl/resize_bilinear.cl new file mode 100644 index 0000000..c460e30 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/resize_bilinear.cl @@ -0,0 +1,84 @@ +__kernel void resize_bilinear_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float half_pixel_value + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; + float left_x_f = floor(in_x); + float x_lerp = in_x - left_x_f; + int left_x_idx = convert_int(left_x_f); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value; + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + int4 coord_in = (int4)(left_x_idx, top_y_idx, coord_out.z, 0); + float4 top_l, top_r, bottom_l, bottom_r, top, bottom, dst; + + top_l = read_imagef(input, coord_in); + coord_in.y++; + bottom_l = read_imagef(input, coord_in); + coord_in.x++; + bottom_r = read_imagef(input, coord_in); + coord_in.y--; + top_r = read_imagef(input, coord_in); + + top_r = top_r - top_l; + top = top_l + x_lerp * top_r; + bottom_r = bottom_r - bottom_l; + bottom = bottom_l + x_lerp * bottom_r; + bottom = bottom - top; + dst = top + y_lerp * bottom; + + write_imagef(output, coord_out, dst); + +} + + +__kernel void resize_bilinear_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float half_pixel_value, + float in_scale, + float in_tail, + float out_scale, + float out_tail + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; + float left_x_f = floor(in_x); + float x_lerp = in_x - left_x_f; + int left_x_idx = convert_int(left_x_f); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value; + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + int4 coord_in = (int4)(left_x_idx, top_y_idx, coord_out.z, 0); + float4 top_l, top_r, bottom_l, bottom_r, top, bottom; + uint4 dst; + + top_l = convert_float4(read_imageui(input, coord_in)) * in_scale + in_tail; + coord_in.y++; + bottom_l = convert_float4(read_imageui(input, coord_in)) * in_scale + in_tail; + coord_in.x++; + bottom_r = convert_float4(read_imageui(input, coord_in)) * in_scale + in_tail; + coord_in.y--; + top_r = convert_float4(read_imageui(input, coord_in)) * in_scale + in_tail; + + top_r = top_r - top_l; + top = top_l + x_lerp * top_r; + bottom_r = bottom_r - bottom_l; + bottom = bottom_l + x_lerp * bottom_r; + bottom = bottom - top; + top = top + y_lerp * bottom; + + dst = convert_uint4(top * out_scale + out_tail); + + write_imageui(output, coord_out, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/resize_nearest.cl b/src/tim/vx/internal/src/libnnext/ops/cl/resize_nearest.cl new file mode 100644 index 0000000..0a231fd --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/resize_nearest.cl @@ -0,0 +1,40 @@ + +#define NEAREST_INDEX_PROCESS() \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + float in_x = 
(convert_float(coord_out.x) + half_pixel_value) * scale_x + round_value; \ + int in_x_idx = convert_int(in_x); \ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y + round_value; \ + int in_y_idx = convert_int(in_y); \ + +__kernel void resize_nearest_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float half_pixel_value, + float round_value) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, in_y_idx, coord_out.z, 0); + float4 dst; + dst = read_imagef(input, coord_in); + write_imagef(output, coord_out, dst); +} + + +__kernel void resize_nearest_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float half_pixel_value, + float round_value, + float output_scale, + float output_tail) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, in_y_idx, coord_out.z, 0); + uint4 dst; + dst = convert_uint4(convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail); + write_imageui(output, coord_out, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd.cl b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd.cl new file mode 100644 index 0000000..ab1985d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd.cl @@ -0,0 +1,239 @@ +__kernel void scatter_nd_U32toU32_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int width, + int area, + int index_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + uint4 sum = (uint4)(0, 0, 0, 0); + for(int i = 0; i < index_num; i++) + { + int4 indice = read_imagei(input0, (int2)(0, i)); + if(gidy == indice.x) + { + uint4 data = read_imageui(input1, (int2)(gidx, i)); + sum += data; + } + } + write_imageui(output, (int2)(gidx, gidy), sum); +} + +__kernel void scatter_nd_U32toU32_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int width, + int area, + int index_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + uint4 sum = (uint4)(0, 0, 0, 0); + for(int i = 0; i < index_num; i++) + { + int4 indice0 = read_imagei(input0, (int2)(0, i)); + int4 indice1 = read_imagei(input0, (int2)(1, i)); + int idx = indice0.x * width + indice1.x; + if(gidy == idx) + { + uint4 data = read_imageui(input1, (int2)(gidx, i)); + sum += data; + } + } + write_imageui(output, (int2)(gidx, gidy), sum); +} + +__kernel void scatter_nd_U32toU32_3D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int width, + int area, + int index_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + uint4 sum = (uint4)(0, 0, 0, 0); + for(int i = 0; i < index_num; i++) + { + int4 indice0 = read_imagei(input0, (int2)(0, i)); + int4 indice1 = read_imagei(input0, (int2)(1, i)); + int4 indice2 = read_imagei(input0, (int2)(2, i)); + int idx = indice0.x * area + indice1.x * width + indice2.x; + if(gidy == idx) + { + uint4 data = read_imageui(input1, (int2)(gidx, i)); + sum += data; + } + } + write_imageui(output, (int2)(gidx, gidy), sum); +} + +__kernel void scatter_nd_I32toI32_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int width, + int area, + int index_num + ) +{ + int gidx = get_global_id(0); // block_size + int 
gidy = get_global_id(1); // indices_num + + int4 sum = (int4)(0, 0, 0, 0); + for(int i = 0; i < index_num; i++) + { + int4 indice = read_imagei(input0, (int2)(0, i)); + if(gidy == indice.x) + { + int4 data = read_imagei(input1, (int2)(gidx, i)); + sum += data; + } + } + write_imagei(output, (int2)(gidx, gidy), sum); +} + +__kernel void scatter_nd_I32toI32_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int width, + int area, + int index_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 sum = (int4)(0, 0, 0, 0); + for(int i = 0; i < index_num; i++) + { + int4 indice0 = read_imagei(input0, (int2)(0, i)); + int4 indice1 = read_imagei(input0, (int2)(1, i)); + int idx = indice0.x * width + indice1.x; + if(gidy == idx) + { + int4 data = read_imagei(input1, (int2)(gidx, i)); + sum += data; + } + } + write_imagei(output, (int2)(gidx, gidy), sum); +} + +__kernel void scatter_nd_I32toI32_3D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int width, + int area, + int index_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 sum = (int4)(0, 0, 0, 0); + for(int i = 0; i < index_num; i++) + { + int4 indice0 = read_imagei(input0, (int2)(0, i)); + int4 indice1 = read_imagei(input0, (int2)(1, i)); + int4 indice2 = read_imagei(input0, (int2)(2, i)); + int idx = indice0.x * area + indice1.x * width + indice2.x; + if(gidy == idx) + { + int4 data = read_imagei(input1, (int2)(gidx, i)); + sum += data; + } + } + write_imagei(output, (int2)(gidx, gidy), sum); +} + +__kernel void scatter_nd_F32toF32_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int width, + int area, + int index_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + float4 sum = (float4)(0, 0, 0, 0); + for(int i = 0; i < index_num; i++) + { + int4 indice = read_imagei(input0, (int2)(0, i)); + if(gidy == indice.x) + { + float4 data = read_imagef(input1, (int2)(gidx, i)); + sum += data; + } + } + write_imagef(output, (int2)(gidx, gidy), sum); +} + +__kernel void scatter_nd_F32toF32_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int width, + int area, + int index_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + float4 sum = (float4)(0, 0, 0, 0); + for(int i = 0; i < index_num; i++) + { + int4 indice0 = read_imagei(input0, (int2)(0, i)); + int4 indice1 = read_imagei(input0, (int2)(1, i)); + int idx = indice0.x * width + indice1.x; + if(gidy == idx) + { + float4 data = read_imagef(input1, (int2)(gidx, i)); + sum += data; + } + } + write_imagef(output, (int2)(gidx, gidy), sum); +} + +__kernel void scatter_nd_F32toF32_3D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int width, + int area, + int index_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + float4 sum = (float4)(0, 0, 0, 0); + for(int i = 0; i < index_num; i++) + { + int4 indice0 = read_imagei(input0, (int2)(0, i)); + int4 indice1 = read_imagei(input0, (int2)(1, i)); + int4 indice2 = read_imagei(input0, (int2)(2, i)); + int idx = indice0.x * area + indice1.x * width + indice2.x; + if(gidy == idx) + { + float4 data = read_imagef(input1, (int2)(gidx, 
i)); + sum += data; + } + } + write_imagef(output, (int2)(gidx, gidy), sum); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/select.cl b/src/tim/vx/internal/src/libnnext/ops/cl/select.cl new file mode 100644 index 0000000..fcdd616 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/select.cl @@ -0,0 +1,120 @@ +__kernel void select_I8_U8_U8toU8( + __read_only image2d_array_t condition, + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 value; + uint4 src0, src1, src, dst; + float inputScale, inputTail; + readImage2DArray(value, condition, coord); + readImage2DArray(src0, input0, coord); + readImage2DArray(src1, input1, coord); + src = (value != 0 ? src0 : src1); + inputScale = (value.x != 0 ? input0Scale : input1Scale); + inputTail = (value.x != 0 ? input0Tail : input1Tail); + dst = convert_uint4(convert_float4(src) * inputScale + inputTail); + write_imageui(output, coord, dst); +} + +__kernel void select_I8_U8_U8toU8_2D( + __read_only image2d_t condition, + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 value = read_imagei(condition, coord); + uint4 src0 = read_imageui(input0, coord); + uint4 src1 = read_imageui(input1, coord); + uint4 src = (value != 0 ? src0 : src1); + float inputScale = (value.x != 0 ? input0Scale : input1Scale); + float inputTail = (value.x != 0 ? input0Tail : input1Tail); + uint4 dst = convert_uint4(convert_float4(src) * inputScale + inputTail); + write_imageui(output, coord, dst); +} + +__kernel void select_I8_I32_I32toI32( + __read_only image2d_array_t condition, + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 value; + int4 src0, src1, dst; + readImage2DArray(value, condition, coord); + readImage2DArray(src0, input0, coord); + readImage2DArray(src1, input1, coord); + dst = (value != 0 ? src0 : src1); + write_imagei(output, coord, dst); +} + +__kernel void select_I8_I32_I32toI32_2D( + __read_only image2d_t condition, + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 value = read_imagei(condition, coord); + int4 src0 = read_imagei(input0, coord); + int4 src1 = read_imagei(input1, coord); + int4 dst = (value != 0 ? 
src0 : src1); + write_imagei(output, coord, dst); +} + +__kernel void select_I8_F32_F32toF32( + __read_only image2d_array_t condition, + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 value; + float4 src0, src1, dst; + readImage2DArray(value, condition, coord); + readImage2DArray(src0, input0, coord); + readImage2DArray(src1, input1, coord); + dst = (value != 0 ? src0 : src1); + write_imagef(output, coord, dst); +} + +__kernel void select_I8_F32_F32toF32_2D( + __read_only image2d_t condition, + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 value = read_imagei(condition, coord); + float4 src0 = read_imagef(input0, coord); + float4 src1 = read_imagef(input1, coord); + float4 dst = (value != 0 ? src0 : src1); + write_imagef(output, coord, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/swish.cl b/src/tim/vx/internal/src/libnnext/ops/cl/swish.cl new file mode 100644 index 0000000..0a6035c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/swish.cl @@ -0,0 +1,117 @@ +float sigmoid_(float x, float logE) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} + +#define SWISH_F32_F32_PROCESS() \ + float4 src, tmp, dst; \ + src = read_imagef(input, coord); \ + tmp.x = sigmoid_(src.x * beta, logE); \ + dst.x = src.x * tmp.x; \ + write_imagef(output, coord, dst); + +__kernel void swish_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP, + float beta, + float logE) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + SWISH_F32_F32_PROCESS() +} + +__kernel void swish_F32toF32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP, + float beta, + float logE) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + SWISH_F32_F32_PROCESS() +} + + +#define SWISH_U8_U8_PROCESS() \ + float4 src, tmp, data; \ + uint4 src0 = read_imageui(input, coord); \ + src = convert_float4(src0) * inputScale - inputTail; \ + tmp.x = sigmoid_(src.x * beta, logE); \ + data.x = src.x * tmp.x; \ + uint4 dst = convert_uint4(data * outputScale + outputZP); \ + write_imageui(output, coord, dst); + +__kernel void swish_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP, + float beta, + float logE) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + SWISH_U8_U8_PROCESS() +} + +__kernel void swish_U8toU8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP, + float beta, + float logE) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + SWISH_U8_U8_PROCESS() +} + + +#define SWISH_I32_I32_PROCESS() \ + float4 src, tmp, data; \ + int4 src0 = read_imagei(input, coord); \ + src = convert_float4(src0); \ + tmp.x = sigmoid_(src.x * beta, logE); \ + data.x = src.x * tmp.x; \ + int4 dst = convert_int4(data); \ 
+ write_imagei(output, coord, dst); + +__kernel void swish_I32toI32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP, + float beta, + float logE) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + SWISH_I32_I32_PROCESS() +} + +__kernel void swish_I32toI32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP, + float beta, + float logE) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + SWISH_I32_I32_PROCESS() +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl b/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl new file mode 100644 index 0000000..f2c281a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl @@ -0,0 +1,92 @@ + +#define TILE_3D(name0, name1, data_type, write_image_func) \ +__kernel void tile_##name0##to##name1 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int batchIn, \ + int depthIn, \ + int depthOut, \ + int multiples_0, \ + int multiples_1, \ + int multiples_2, \ + int multiples_3 \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_out; \ + int width = get_image_width(input); \ + int height = get_image_height(input); \ + \ + data_type src; \ + readImage2DArray(src, input, coord); \ + \ + int batch_id = (short)coord.z / (short)depthIn; \ + coord.z = (short)coord.z % (short)depthIn; \ + coord_out = coord; \ + \ + for (int w = 0; w < multiples_3; w++) \ + { \ + int batch = batchIn * w + batch_id; \ + \ + for(int z = 0; z < multiples_2; z++) \ + { \ + coord_out.z = coord.z + z * depthIn + batch * depthOut; \ + \ + for (int y = 0; y < multiples_1; y++) \ + { \ + coord_out.y = coord.y + y * height; \ + \ + for (int x = 0; x < multiples_0; x++) \ + { \ + coord_out.x = coord.x + x * width; \ + write_image_func(output, coord_out.xyzw, src); \ + } \ + } \ + } \ + } \ +} +TILE_3D(I32, I32, int4, write_imagei) +TILE_3D(U32, U32, uint4, write_imageui) +TILE_3D(F32, F32, float4, write_imagef) + +#define TILE_2D(name0, name1, data_type) \ +__kernel void tile_##name0##to##name1##_2D \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int batchIn, \ + int depthIn, \ + int depthOut, \ + int multiples_0, \ + int multiples_1, \ + int multiples_2, \ + int multiples_3 \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + int width = get_image_width(input); \ + int height = get_image_height(input); \ + int output_width = get_image_width(output); \ + int output_height = get_image_height(output); \ + \ + data_type src; \ + readImage(src, input, coord); \ + \ + do \ + { \ + do \ + { \ + writeImage(output, coord, src); \ + coord.x += width; \ + } while (coord.x < output_width); \ + coord.x = get_global_id(0); \ + coord.y += height; \ + } while (coord.y < output_height); \ +} +TILE_2D(I32, I32, int4) +TILE_2D(U32, U32, uint4) +TILE_2D(F32, F32, float4) + + + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/upsample.cl b/src/tim/vx/internal/src/libnnext/ops/cl/upsample.cl new file mode 100644 index 0000000..81c373b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/upsample.cl @@ -0,0 +1,206 @@ + +#define UPSAMPLE_PROCESS(data_type, read_fun, write_fun) \ + data_type src = 0; \ + data_type dst = 0; \ + uint4 axis = 0; \ + src.x = read_fun(input, coord_in).x; \ + axis.x = 
read_imageui(inaxis, coord_in).x; \ + dst.x = axis.x == 0 ? src.x : 0; \ + write_fun(output, coord_out, dst); \ + dst.x = axis.x == 1 ? src.x : 0; \ + coord_out.x++; \ + write_fun(output, coord_out, dst); \ + dst.x = axis.x == 3 ? src.x : 0; \ + coord_out.y++; \ + write_fun(output, coord_out, dst); \ + dst.x = axis.x == 2 ? src.x : 0; \ + coord_out.x--; \ + write_fun(output, coord_out, dst); + + +__kernel void upsample_F32_U8to_F32( + __read_only image2d_array_t input, + __write_only image2d_array_t inaxis, + __write_only image2d_array_t output) +{ + int4 coord_out = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + UPSAMPLE_PROCESS(float4, read_imagef, write_imagef) +} + +__kernel void upsample_F32_U8to_F32_2D( + __read_only image2d_t input, + __write_only image2d_t inaxis, + __write_only image2d_t output) +{ + int2 coord_out = (int2)(get_global_id(0) << 1, get_global_id(1) << 1); + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); + UPSAMPLE_PROCESS(float4, read_imagef, write_imagef) +} + +__kernel void upsample_I32_U8to_I32( + __read_only image2d_array_t input, + __write_only image2d_array_t inaxis, + __write_only image2d_array_t output) +{ + int4 coord_out = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + UPSAMPLE_PROCESS(int4, read_imagei, write_imagei) +} + +__kernel void upsample_I32_U8to_I32_2D( + __read_only image2d_t input, + __write_only image2d_t inaxis, + __write_only image2d_t output) +{ + int2 coord_out = (int2)(get_global_id(0) << 1, get_global_id(1) << 1); + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); + UPSAMPLE_PROCESS(int4, read_imagei, write_imagei) +} + + +#define UPSAMPLE_U8_PROCESS() \ + uint4 src = 0; \ + uint4 dst = 0; \ + uint4 axis = 0; \ + float4 result = 0.0f; \ + uint output_zp = (uint)zp_out; \ + src.x = read_imageui(input, coord_in).x; \ + axis.x = read_imageui(inaxis, coord_in).x; \ + result.x = convert_float4(src).x * scale_value + tail_value; \ + src = convert_uint4(result);\ + dst.x = axis.x == 0 ? src.x : output_zp; \ + write_imageui(output, coord_out, dst); \ + dst.x = axis.x == 1 ? src.x : output_zp; \ + coord_out.x++; \ + write_imageui(output, coord_out, dst); \ + dst.x = axis.x == 3 ? src.x : output_zp; \ + coord_out.y++; \ + write_imageui(output, coord_out, dst); \ + dst.x = axis.x == 2 ? 
src.x : output_zp; \ + coord_out.x--; \ + write_imageui(output, coord_out, dst); + + +__kernel void upsample_U8_U8to_U8( + __read_only image2d_array_t input, + __write_only image2d_array_t inaxis, + __write_only image2d_array_t output, + float scale_value, + float tail_value, + int zp_out) +{ + int4 coord_out = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + UPSAMPLE_U8_PROCESS() +} + +__kernel void upsample_U8_U8to_U8_2D( + __read_only image2d_t input, + __write_only image2d_t inaxis, + __write_only image2d_t output, + float scale_value, + float tail_value, + int zp_out) +{ + int2 coord_out = (int2)(get_global_id(0) << 1, get_global_id(1) << 1); + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); + UPSAMPLE_U8_PROCESS() +} + +#define UPSAMPLE_U8_TO_F32PROCESS() \ + uint4 src = 0; \ + float4 dst = 0; \ + uint4 axis = 0; \ + float4 result = 0.0f; \ + src.x = read_imageui(input, coord_in).x; \ + axis.x = read_imageui(inaxis, coord_in).x; \ + result.x = convert_float4(src).x * scale_value + tail_value; \ + dst.x = axis.x == 0 ? result.x : 0.0f; \ + write_imagef(output, coord_out, dst); \ + dst.x = axis.x == 1 ? result.x : 0.0f; \ + coord_out.x++; \ + write_imagef(output, coord_out, dst); \ + dst.x = axis.x == 3 ? result.x : 0.0f; \ + coord_out.y++; \ + write_imagef(output, coord_out, dst); \ + dst.x = axis.x == 2 ? result.x : 0.0f; \ + coord_out.x--; \ + write_imagef(output, coord_out, dst); + + +__kernel void upsample_U8_U8to_F32( + __read_only image2d_array_t input, + __write_only image2d_array_t inaxis, + __write_only image2d_array_t output, + float scale_value, + float tail_value, + int zp_out) +{ + int4 coord_out = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + UPSAMPLE_U8_TO_F32PROCESS() +} + +__kernel void upsample_U8_U8to_F32_2D( + __read_only image2d_t input, + __write_only image2d_t inaxis, + __write_only image2d_t output, + float scale_value, + float tail_value, + int zp_out) +{ + int2 coord_out = (int2)(get_global_id(0) << 1, get_global_id(1) << 1); + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); + UPSAMPLE_U8_TO_F32PROCESS() +} + + +#define UPSAMPLE_F32_TO_U8_PROCESS() \ + uint4 src = 0; \ + uint4 dst = 0; \ + uint4 axis = 0; \ + float4 result = 0.0f; \ + uint output_zp = (uint)zp_out; \ + result.x = read_imagef(input, coord_in).x; \ + axis.x = read_imageui(inaxis, coord_in).x; \ + result.x = result.x * scale_value + tail_value; \ + src = convert_uint4(result);\ + dst.x = axis.x == 0 ? src.x : output_zp; \ + write_imageui(output, coord_out, dst); \ + dst.x = axis.x == 1 ? src.x : output_zp; \ + coord_out.x++; \ + write_imageui(output, coord_out, dst); \ + dst.x = axis.x == 3 ? src.x : output_zp; \ + coord_out.y++; \ + write_imageui(output, coord_out, dst); \ + dst.x = axis.x == 2 ? 
src.x : output_zp; \ + coord_out.x--; \ + write_imageui(output, coord_out, dst); + + +__kernel void upsample_F32_U8to_U8( + __read_only image2d_array_t input, + __write_only image2d_array_t inaxis, + __write_only image2d_array_t output, + float scale_value, + float tail_value, + int zp_out) +{ + int4 coord_out = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + UPSAMPLE_F32_TO_U8_PROCESS() +} + +__kernel void upsample_F32_U8to_U8_2D( + __read_only image2d_t input, + __write_only image2d_t inaxis, + __write_only image2d_t output, + float scale_value, + float tail_value, + int zp_out) +{ + int2 coord_out = (int2)(get_global_id(0) << 1, get_global_id(1) << 1); + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); + UPSAMPLE_F32_TO_U8_PROCESS() +} diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c new file mode 100644 index 0000000..2755dc8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c @@ -0,0 +1,275 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_VAR (vx_kernel_AXIS_ALIGNED_BBOX_TRANSFORM) +#define _VX_KERNEL_ID (VX_KERNEL_ENUM_AXIS_ALIGNED_BBOX_TRANSFORM) +#define _VX_KERNEL_NAME (VX_KERNEL_NAME_AXIS_ALIGNED_BBOX_TRANSFORM) +#define _VX_KERNEL_FUNC_KERNEL (vxAxis_aligned_bbox_transformKernel) + +typedef struct +{ + float x1, y1, x2, y2; +}BoxEncodingCorner; +typedef struct +{ + float w, h, x, y; +}BoxEncodingCenter; + +void toBoxEncodingCorner + ( + BoxEncodingCenter* ctr, + BoxEncodingCorner* cnr + ) +{ + cnr->x1 = ctr->x - ctr->w / 2; + cnr->y1 = ctr->y - ctr->h / 2; + cnr->x2 = ctr->x + ctr->w / 2; + cnr->y2 = ctr->y + ctr->h / 2; +} + +void toBoxEncodingCenter + ( + BoxEncodingCorner* cnr, + BoxEncodingCenter* ctr + ) +{ + ctr->w = cnr->x2 - cnr->x1; + ctr->h = cnr->y2 - cnr->y1; + ctr->x = (cnr->x1 + cnr->x2) / 2; + ctr->y = (cnr->y1 + cnr->y2) / 2; +} + +static vsi_status VX_CALLBACK vxAxis_aligned_bbox_transformKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ +#define ARG_NUM (0) +#define TENSOR_NUM_INPUT (4) +#define TENSOR_NUM_OUTPUT (1) +#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) + + vsi_status status = VSI_FAILURE; + vx_context context = NULL; + vx_tensor input[TENSOR_NUM_INPUT] = {0}; + vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; + float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; + int32_t* int32_in_buffer[TENSOR_NUM_INPUT] = {0}; + float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; + vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; + vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; + uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; + uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; + + int32_t i; + for(i = 0; i < TENSOR_NUM_INPUT; i++) + { + memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); + } + /* prepare data */ + context = vxGetContext((vx_reference)node); + + for(i = 0; i < TENSOR_NUM_INPUT; i ++) + { + input[i] = (vx_tensor)paramObj[i]; + status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]); + TEST_CHECK_STATUS(status, final); + in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); + if (i == 2) + { + int32_in_buffer[i] = (int32_t *)vsi_nn_vxCopyTensorToData(context, + input[i], &in_attr[i]); + } + else + { + f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); + status = vsi_nn_vxConvertTensorToFloat32Data( + context, input[i], &in_attr[i], f32_in_buffer[i], + in_elements[i] * sizeof(float)); + TEST_CHECK_STATUS(status, final); + } + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) + { + output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; + status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); + TEST_CHECK_STATUS(status, final); + out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); + f32_out_buffer[i]= (float *)malloc(out_elements[i] * sizeof(float)); + memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float)); + } + + /* TODO: Add CPU kernel implement */ + { + const uint32_t roiLength = 4; + const uint32_t imageLength = 2; + + uint32_t numClasses = 
in_attr[1].size[0] / roiLength; + uint32_t numRois = in_attr[0].size[1]; + uint32_t j; + uint32_t roiIndex; + for(roiIndex = 0; roiIndex < numRois; roiIndex++) + { + uint32_t batchIndex = int32_in_buffer[2][roiIndex]; + float imageHeight = f32_in_buffer[3][batchIndex * imageLength]; + float imageWidth = f32_in_buffer[3][batchIndex * imageLength + 1]; + BoxEncodingCorner roi_cnr; + BoxEncodingCenter roiBefore; + roi_cnr.x1 = f32_in_buffer[0][roiIndex * roiLength]; + roi_cnr.y1 = f32_in_buffer[0][roiIndex * roiLength + 1]; + roi_cnr.x2 = f32_in_buffer[0][roiIndex * roiLength + 2]; + roi_cnr.y2 = f32_in_buffer[0][roiIndex * roiLength + 3]; + toBoxEncodingCenter(&roi_cnr, &roiBefore); + for (j = 0; j < numClasses; j++) + { + BoxEncodingCenter roi_ctr; + BoxEncodingCorner roiAfter; + BoxEncodingCorner cliped; + uint32_t index = (roiIndex * numClasses + j) * roiLength; + roi_ctr.w = (float)(exp(f32_in_buffer[1][index + 2]) * roiBefore.w); + roi_ctr.h = (float)(exp(f32_in_buffer[1][index + 3]) * roiBefore.h); + roi_ctr.x = roiBefore.x + f32_in_buffer[1][index] * roiBefore.w; + roi_ctr.y = roiBefore.y + f32_in_buffer[1][index + 1] * roiBefore.h; + toBoxEncodingCorner(&roi_ctr, &roiAfter); + cliped.x1 = vsi_nn_min(vsi_nn_max(roiAfter.x1, 0.0f), imageWidth); + cliped.y1 = vsi_nn_min(vsi_nn_max(roiAfter.y1, 0.0f), imageHeight); + cliped.x2 = vsi_nn_min(vsi_nn_max(roiAfter.x2, 0.0f), imageWidth); + cliped.y2 = vsi_nn_min(vsi_nn_max(roiAfter.y2, 0.0f), imageHeight); + f32_out_buffer[0][index] = cliped.x1; + f32_out_buffer[0][index + 1] = cliped.y1; + f32_out_buffer[0][index + 2] = cliped.x2; + f32_out_buffer[0][index + 3] = cliped.y2; + } + } + } + + /* save data */ + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + status = vsi_nn_vxConvertFloat32DataToTensor( + context, output[i], &out_attr[i], f32_out_buffer[i], + out_elements[i] * sizeof(float)); + TEST_CHECK_STATUS(status, final); + } + +final: + for (i = 0; i < TENSOR_NUM_INPUT; i++) + { + if (f32_in_buffer[i]) free(f32_in_buffer[i]); + if (int32_in_buffer[i]) free(int32_in_buffer[i]); + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + if (f32_out_buffer[i]) free(f32_out_buffer[i]); + } + return status; +} /* _VX_KERNEL_FUNC_KERNEL() */ + +static vx_param_description_t vxAxis_aligned_bbox_transformKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; + +vx_status VX_CALLBACK vxAxis_aligned_bbox_transformInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + vx_uint32 paraNum + ) +{ + vx_status status = VX_SUCCESS; + /*TODO: Add initial code for VX program*/ + + return status; +} + + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t vxAxis_aligned_bbox_transform_CPU = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + _VX_KERNEL_FUNC_KERNEL, + vxAxis_aligned_bbox_transformKernelParam, + _cnt_of_array( vxAxis_aligned_bbox_transformKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxAxis_aligned_bbox_transform_VX = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + NULL, + vxAxis_aligned_bbox_transformKernelParam, + _cnt_of_array( vxAxis_aligned_bbox_transformKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vxAxis_aligned_bbox_transformInitializer, + vsi_nn_KernelDeinitializer +}; 
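
The CPU reference in vxAxis_aligned_bbox_transformKernel above decodes each (dx, dy, dw, dh) delta against its ROI by converting corners to center/size form, scaling the translation by the ROI size, exponentiating the size terms, converting back to corners, and clipping to the image. A minimal standalone sketch of that decode-and-clip step follows; the function and parameter names are illustrative only and are not part of this file:

```c
#include <math.h>

/* Decode one (dx, dy, dw, dh) delta against one ROI given as corners
 * (x1, y1, x2, y2), then clip the result to the image bounds.
 * Mirrors the center/size round-trip and exp() scaling used by the
 * CPU kernel above. */
static void decode_and_clip_box(const float roi[4], const float delta[4],
                                float image_w, float image_h, float out[4])
{
    /* corners -> center/size */
    float w  = roi[2] - roi[0];
    float h  = roi[3] - roi[1];
    float cx = (roi[0] + roi[2]) * 0.5f;
    float cy = (roi[1] + roi[3]) * 0.5f;

    /* apply deltas: translation is relative to the ROI size,
     * width/height are scaled exponentially */
    float new_cx = cx + delta[0] * w;
    float new_cy = cy + delta[1] * h;
    float new_w  = (float)exp(delta[2]) * w;
    float new_h  = (float)exp(delta[3]) * h;

    /* center/size -> corners, clipped to [0, image_w] x [0, image_h] */
    float x1 = new_cx - new_w * 0.5f;
    float y1 = new_cy - new_h * 0.5f;
    float x2 = new_cx + new_w * 0.5f;
    float y2 = new_cy + new_h * 0.5f;
    out[0] = fminf(fmaxf(x1, 0.0f), image_w);
    out[1] = fminf(fmaxf(y1, 0.0f), image_h);
    out[2] = fminf(fmaxf(x2, 0.0f), image_w);
    out[3] = fminf(fmaxf(y2, 0.0f), image_h);
}
```

Applying exp() to the width/height deltas keeps the decoded box size positive regardless of the regressed value, which is why the kernel stores boxes as center/size internally and only converts to corners for clipping and output.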
+ +vx_kernel_description_t * vx_kernel_AXIS_ALIGNED_BBOX_TRANSFORM_list[] = +{ + &vxAxis_aligned_bbox_transform_CPU, + &vxAxis_aligned_bbox_transform_VX, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c new file mode 100644 index 0000000..8114caf --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c @@ -0,0 +1,511 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_link_list.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_VAR (vx_kernel_BOX_WITH_NMS_LIMIT) +#define _VX_KERNEL_ID (VX_KERNEL_ENUM_BOX_WITH_NMS_LIMIT) +#define _VX_KERNEL_NAME (VX_KERNEL_NAME_BOX_WITH_NMS_LIMIT) +#define _VX_KERNEL_FUNC_KERNEL (vxBox_with_nms_limitKernel) + +static float hard_nms_kernel + ( + float iou, + float iouThreshold + ) +{ + return iou < iouThreshold ? 1.0f : 0.0f; +} + +static float linear_nms_kernel + ( + float iou, + float iouThreshold + ) +{ + return iou < iouThreshold ? 
1.0f : 1.0f - iou; +} + +static float gaussian_nms_kernel + ( + float iou, + float sigma + ) +{ + return (float)(exp(-1.0f * iou * iou / sigma)); +} + +void swap_element + ( + uint32_t* list, + uint32_t first, + uint32_t second + ) +{ + uint32_t temp = list[first]; + list[first] = list[second]; + list[second] = temp; +} + +uint32_t max_element + ( + float* data, + uint32_t* index_list, + uint32_t len + ) +{ + uint32_t i; + uint32_t max_index = 0; + float max_val = data[index_list[0]]; + for(i = 1; i < len; i++) + { + float val = data[index_list[i]]; + if (max_val < val) + { + max_val = val; + max_index = i; + } + } + return max_index; +} + +static uint32_t max_comp_func + ( + void* data, + int32_t left, + int32_t right + ) +{ + float* fdata = (float*)data; + return fdata[left] >= fdata[right]; +} + +void sort_element_by_score + ( + float* data, + uint32_t* index_list, + uint32_t len + ) +{ + vsi_nn_partition(data, 0, len - 1, max_comp_func, TRUE, index_list); +} + +typedef struct +{ + float* fdata; + uint32_t numClasses; +} class_comp_param; + +static uint32_t class_comp_func + ( + void* data, + int32_t left, + int32_t right + ) +{ + class_comp_param *p = (class_comp_param*)data; + float* fdata = p->fdata; + uint32_t numClasses = p->numClasses; + uint32_t lhsClass = left % numClasses, rhsClass = right % numClasses; + return lhsClass == rhsClass ? fdata[left] > fdata[right] + : lhsClass < rhsClass; +} + +static void sort_element_by_class + ( + float* data, + uint32_t* index_list, + uint32_t len, + uint32_t numClasses + ) +{ + class_comp_param class_comp; + class_comp.fdata = data; + class_comp.numClasses = numClasses; + vsi_nn_partition(&class_comp, 0, len - 1, class_comp_func, TRUE, index_list); +} + +// Taking two indices of bounding boxes, return the intersection-of-union. 
+float getIoUAxisAligned + ( + const float* roi1, + const float* roi2 + ) +{ + const float area1 = (roi1[2] - roi1[0]) * (roi1[3] - roi1[1]); + const float area2 = (roi2[2] - roi2[0]) * (roi2[3] - roi2[1]); + const float x1 = vsi_nn_max(roi1[0], roi2[0]); + const float x2 = vsi_nn_min(roi1[2], roi2[2]); + const float y1 = vsi_nn_max(roi1[1], roi2[1]); + const float y2 = vsi_nn_min(roi1[3], roi2[3]); + const float w = vsi_nn_max(x2 - x1, 0.0f); + const float h = vsi_nn_max(y2 - y1, 0.0f); + const float areaIntersect = w * h; + const float areaUnion = area1 + area2 - areaIntersect; + return areaIntersect / areaUnion; +} + +static vsi_status VX_CALLBACK vxBox_with_nms_limitKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ +#define ARG_NUM (5) +#define TENSOR_NUM_INPUT (3) +#define TENSOR_NUM_OUTPUT (4) +#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) + + vsi_status status = VSI_FAILURE; + vx_context context = NULL; + vx_tensor input[TENSOR_NUM_INPUT] = {0}; + vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; + float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; + int32_t* int32_in_buffer[TENSOR_NUM_INPUT] = {0}; + float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; + int32_t* int32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; + vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; + vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; + uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; + uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; + + float scoreThreshold; + int32_t maxNumDetections; + int32_t nms_kernel_method; + float iou_threshold; + float sigma; + float nms_score_threshold; + + uint32_t i = 0; + for(i = 0; i < TENSOR_NUM_INPUT; i++) + { + memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); + } + /* prepare data */ + context = vxGetContext((vx_reference)node); + + for(i = 0; i < TENSOR_NUM_INPUT; i ++) + { + input[i] = (vx_tensor)paramObj[i]; + status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]); + TEST_CHECK_STATUS(status, final); + in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); + if (i == 2) + { + int32_in_buffer[i] = (int32_t *)vsi_nn_vxCopyTensorToData(context, + input[i], &in_attr[i]); + } + else + { + f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); + status = vsi_nn_vxConvertTensorToFloat32Data( + context, input[i], &in_attr[i], f32_in_buffer[i], + in_elements[i] * sizeof(float)); + TEST_CHECK_STATUS(status, final); + } + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) + { + output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; + status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); + TEST_CHECK_STATUS(status, final); + out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); + if (i < 2) + { + f32_out_buffer[i] = (float *)malloc(out_elements[i] * sizeof(float)); + memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float)); + } + else + { + int32_out_buffer[i] = (int32_t *)malloc(out_elements[i] * sizeof(int32_t)); + memset(int32_out_buffer[i], 0, out_elements[i] * sizeof(int32_t)); + } + } + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(scoreThreshold), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(maxNumDetections), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 2], &(nms_kernel_method), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 3], &(iou_threshold), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + 
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 4], &(sigma), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 5], &(nms_score_threshold), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + /* TODO: Add CPU kernel implement */ + { + uint32_t j, n, b, c; + const uint32_t kRoiDim = 4; + uint32_t numRois = in_attr[0].size[1]; + uint32_t numClasses = in_attr[0].size[0]; + int32_t ind; + + uint32_t * batch_data = (uint32_t*)malloc(numRois * sizeof(uint32_t)); + int32_t numBatch = 0; + uint32_t * select = NULL; + uint32_t select_size = 0; + uint32_t scores_index = 0; + uint32_t roi_index = 0; + uint32_t roi_out_index = 0; + + memset(batch_data, 0, numRois * sizeof(uint32_t)); + for (i = 0, ind = -1; i < numRois; i++) + { + if (int32_in_buffer[2][i] != ind) + { + ind = int32_in_buffer[2][i]; + numBatch++; + } + batch_data[numBatch - 1]++; + } + select = (uint32_t*)malloc(numBatch * numRois + * numClasses * sizeof(uint32_t)); + memset(select, 0, numBatch * numRois * numClasses * sizeof(uint32_t)); + for (n = 0; n < (uint32_t)numBatch; n++) + { + int32_t numDetections_batch = 0; + uint32_t select_start_batch = select_size; + uint32_t select_len = 0; + // Exclude class 0 (background) + for (c = 1; c < numClasses; c++) + { + uint32_t select_start = select_size; + int32_t maxNumDetections0 = maxNumDetections; + uint32_t numDetections = 0; + for (b = 0; b < batch_data[n]; b++) + { + uint32_t index = b * numClasses + c; + float score = f32_in_buffer[0][scores_index + index]; + if (score > scoreThreshold) { + select[select_size] = index; + select_size++; + } + } + select_len = select_size - select_start; + + if (maxNumDetections0 < 0) + { + maxNumDetections0 = select_len; + } + + for (j = 0; (j < select_len && numDetections < (uint32_t)maxNumDetections0); j++) + { + // find max score and swap to the front. + int32_t max_index = max_element(&(f32_in_buffer[0][scores_index]), + &(select[select_start + j]), select_len - j) + j; + + swap_element(&(select[select_start]), max_index, j); + + // Calculate IoU of the rest, swap to the end (disgard) if needed. + for (i = j + 1; i < select_len; i++) + { + int32_t roiBase0 = roi_index + select[select_start + i] * kRoiDim; + int32_t roiBase1 = roi_index + select[select_start + j] * kRoiDim; + float iou = getIoUAxisAligned(&(f32_in_buffer[1][roiBase0]), + &(f32_in_buffer[1][roiBase1])); + float kernel_iou; + if (nms_kernel_method == 0) + { + kernel_iou = hard_nms_kernel(iou, iou_threshold); + } + else if (nms_kernel_method == 1) + { + kernel_iou = linear_nms_kernel(iou, iou_threshold); + } + else + { + kernel_iou = gaussian_nms_kernel(iou, sigma); + + } + f32_in_buffer[0][scores_index + select[select_start + i]] *= kernel_iou; + if (f32_in_buffer[0][scores_index + select[select_start + i]] < nms_score_threshold) + { + swap_element(&(select[select_start]), i, select_len - 1); + i--; + select_len--; + } + } + numDetections++; + } + select_size = select_start + select_len; + numDetections_batch += numDetections; + } + + // Take top maxNumDetections. + sort_element_by_score(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]), + numDetections_batch); + + if (numDetections_batch > maxNumDetections) + { + select_size = select_start_batch + maxNumDetections; + } + select_len = select_size - select_start_batch; + // Sort again by class. 
+ sort_element_by_class(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]), + select_len, numClasses); + + for (i = 0; i < select_len; i++) + { + int32_t in_index0 = scores_index + select[select_start_batch + i]; + int32_t in_index1 = roi_index + select[select_start_batch + i] * kRoiDim; + f32_out_buffer[0][roi_out_index] = f32_in_buffer[0][in_index0]; + memcpy(&(f32_out_buffer[1][roi_out_index * kRoiDim]), + &f32_in_buffer[1][in_index1], kRoiDim * sizeof(float)); + int32_out_buffer[2][roi_out_index] = select[select_start_batch + i] % numClasses; + int32_out_buffer[3][roi_out_index] = n; + roi_out_index++; + } + + scores_index += batch_data[n] * numClasses; + roi_index += batch_data[n] * numClasses * kRoiDim; + } + if (batch_data) free(batch_data); + if (select) free(select); + } + + /* save data */ + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + if (i < 2) + { + status = vsi_nn_vxConvertFloat32DataToTensor( + context, output[i], &out_attr[i], f32_out_buffer[i], + out_elements[i] * sizeof(float)); + TEST_CHECK_STATUS(status, final); + } + else + { + vsi_nn_vxCopyDataToTensor(context, output[i], &out_attr[i], + (uint8_t *)int32_out_buffer[i]); + } + } + +final: + for (i = 0; i < TENSOR_NUM_INPUT; i++) + { + if (f32_in_buffer[i]) free(f32_in_buffer[i]); + if (int32_in_buffer[i]) free(int32_in_buffer[i]); + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + if (f32_out_buffer[i]) free(f32_out_buffer[i]); + if (int32_out_buffer[i]) free(int32_out_buffer[i]); + } + return status; +} /* _VX_KERNEL_FUNC_KERNEL() */ + +static vx_param_description_t vxBox_with_nms_limitKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +vx_status VX_CALLBACK vxBox_with_nms_limitInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + vx_uint32 paraNum + ) +{ + vx_status status = VX_SUCCESS; + /*TODO: Add initial code for VX program*/ + + return status; +} + + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t vxBox_with_nms_limit_CPU = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + _VX_KERNEL_FUNC_KERNEL, + vxBox_with_nms_limitKernelParam, + _cnt_of_array( vxBox_with_nms_limitKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxBox_with_nms_limit_VX = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + NULL, + vxBox_with_nms_limitKernelParam, + _cnt_of_array( vxBox_with_nms_limitKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vxBox_with_nms_limitInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_BOX_WITH_NMS_LIMIT_list[] = +{ + &vxBox_with_nms_limit_CPU, + &vxBox_with_nms_limit_VX, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_crop.c 
b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_crop.c new file mode 100644 index 0000000..3ab7764 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_crop.c @@ -0,0 +1,253 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +void myTensorCropFunc + ( + int8_t *src, + int8_t *dst + ) +{ + + return; +} +vsi_status VX_CALLBACK TensorCropInternalKernel + (vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ + vsi_status status = VX_ERROR_INVALID_PARAMETERS; + + if(paramNum == 2) + { + + } + + return status; +} + +vsi_status VX_CALLBACK TensorCropInitializer + (vx_node nodObj, + const vx_reference *paramObj, + uint32_t paraNum + ) +{ + vsi_status status = VX_SUCCESS; +#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) + vx_kernel_execution_parameters_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in threads + {0, 0, 0}}; // globalWorkSize: image size in threads + + vx_tensor input = (vx_tensor)paramObj[0]; + vx_tensor output = (vx_tensor)paramObj[1]; + uint32_t output_size[4] = {1, 1, 1, 1}; + vsi_enum dataFormat, dstFormat; + int8_t input_fixPointPos = 0; + vx_uint32 i = 0; + int32_t offset[3]; + size_t size[DIM_SIZE]; + vsi_nn_tensor_attr_t attr[2]; + + memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); + memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); + + status = vsi_nn_vxGetTensorAttr(input, &attr[0]); + status |= vsi_nn_vxGetTensorAttr(output, &attr[1]); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); + return status; + } + + dataFormat = attr[0].dtype.vx_type; + input_fixPointPos = attr[0].dtype.fl; + dstFormat = attr[1].dtype.vx_type; + for (i = 0; i < attr[1].dim_num; i++) + { + output_size[i] = attr[1].size[i]; + } + + vxCopyScalar((vx_scalar)paramObj[2], &offset[0], VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[3], &offset[1], VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[4], &offset[2], VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + memset(size, 0, sizeof(size_t) * DIM_SIZE); + switch(dstFormat) + { + case VSI_NN_TYPE_INT8: + case VSI_NN_TYPE_UINT8: + size[0] = 16; + size[1] = 4; + break; + case VSI_NN_TYPE_INT16: + case VSI_NN_TYPE_UINT16: + case VSI_NN_TYPE_FLOAT16: + size[0] = 8; + size[1] = 4; + break; + } + + shaderParam.globalWorkOffset[0] = offset[0]; + shaderParam.globalWorkOffset[1] = offset[1]; + shaderParam.globalWorkOffset[2] = offset[2]; + shaderParam.globalWorkScale[0] = size[0]; + shaderParam.globalWorkScale[1] = size[1]; + shaderParam.globalWorkScale[2] = 1; + shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], 4); + shaderParam.globalWorkSize[1] = (output_size[1] + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1]; + shaderParam.globalWorkSize[2] = output_size[2]; + + if(dataFormat == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_FLOAT16) + { + vx_uint32 uniConvertInt16toFp16_2x8[16] = { + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }; + +#define cropMIN(x, y) (((x) <= (y)) ? (x) : (y)) +#define CROP_MAX_POST_SHIFT_BITS (31) +#define CROP_MAX_MULTIPLIER_NUM (65535) + + if (input_fixPointPos > 0) + { + vx_uint8 postshift = cropMIN(input_fixPointPos, CROP_MAX_POST_SHIFT_BITS); + + uniConvertInt16toFp16_2x8[7] |= (postshift & 0x1F); + } + else + { + vx_uint32 multiplier = cropMIN((int64_t)1 << (-input_fixPointPos), CROP_MAX_MULTIPLIER_NUM); + + for (i = 0; i < 8; i++) + { + uniConvertInt16toFp16_2x8[i + 8] = multiplier; + } + } +#undef cropMIN +#undef CROP_MAX_POST_SHIFT_BITS +#undef CROP_MAX_MULTIPLIER_NUM + + status |= vxSetNodeUniform(nodObj, "uniConvertInt16toFp16_2x8", 1, uniConvertInt16toFp16_2x8); + } + + vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + + if(status < 0) + { + VSILOGE("[%s : %d]Initializer failure! 
\n",__FILE__, __LINE__); + } + return status; +} + +vx_param_description_t basekernel_tensorCrop_params[] = { + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; + + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t vxTensorCropKernelInt16Info = +{ + VX_KERNEL_ENUM_TENSORCROP_INT16, + VX_KERNEL_NAME_TENSORCROP_INT16, + NULL, + basekernel_tensorCrop_params, + (sizeof(basekernel_tensorCrop_params) / sizeof(basekernel_tensorCrop_params[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + TensorCropInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxTensorCropKernelInt8Info = +{ + VX_KERNEL_ENUM_TENSORCROP_INT8, + VX_KERNEL_NAME_TENSORCROP_INT8, + NULL, + basekernel_tensorCrop_params, + (sizeof(basekernel_tensorCrop_params) / sizeof(basekernel_tensorCrop_params[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + TensorCropInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxTensorCropKernelInt16Fp16Info = +{ + VX_KERNEL_ENUM_TENSORCROP_INT16_FP16, + VX_KERNEL_NAME_TENSORCROP_INT16_FP16, + NULL, + basekernel_tensorCrop_params, + (sizeof(basekernel_tensorCrop_params) / sizeof(basekernel_tensorCrop_params[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + TensorCropInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_CROP_list[] = +{ + NULL, + &vxTensorCropKernelInt16Info, + &vxTensorCropKernelInt8Info, + &vxTensorCropKernelInt16Fp16Info, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c new file mode 100644 index 0000000..34de7f7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c @@ -0,0 +1,250 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_VAR (vx_kernel_EXTRA_ENDING) +#define _VX_KERNEL_ID (VX_KERNEL_ENUM_EXTRA_ENDING) +#define _VX_KERNEL_FUNC_KERNEL (vxExtra_endingKernel) + +static vsi_status VX_CALLBACK vxExtra_endingKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ +#define TENSOR_NUM_INPUT (2) +#define TENSOR_NUM_OUTPUT (1) + + vsi_status status = VSI_FAILURE; + vx_context context = NULL; + vx_tensor input = NULL; + vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; + uint8_t *u8_in_buffer[1] = {0}; + uint8_t *u8_out_buffer[TENSOR_NUM_OUTPUT] = {0}; + vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; + uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; + vsi_nn_tensor_attr_t in_attr; + + int32_t i = 0; + + memset(&in_attr, 0x0, sizeof(vsi_nn_tensor_attr_t)); + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); + } + /* prepare data */ + context = vxGetContext((vx_reference)node); + + input = (vx_tensor)paramObj[1]; + status = vsi_nn_vxGetTensorAttr(input, &in_attr); + TEST_CHECK_STATUS(status, final); + + for(i = 0; i < 1; i ++) + { + output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; + status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); + TEST_CHECK_STATUS(status, final); + out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); + u8_out_buffer[i]= (uint8_t *)malloc(out_elements[i] * sizeof(uint8_t)); + memset(u8_out_buffer[i], 0, out_elements[i] * sizeof(uint8_t)); + + u8_in_buffer[0] = vsi_nn_vxCopyTensorToData(context, input, &in_attr); + memcpy(u8_out_buffer[0], u8_in_buffer[0], out_elements[i] * sizeof(uint8_t)); + } + + /* save data */ + status = vsi_nn_vxCopyDataToTensor(context, output[0], &out_attr[0], u8_out_buffer[0]); + TEST_CHECK_STATUS(status, final); + +final: + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + if (u8_out_buffer[i]) free(u8_out_buffer[i]); + } + if (u8_in_buffer[0]) free(u8_in_buffer[0]); + return status; +} /* _VX_KERNEL_FUNC_KERNEL() */ + +static vx_param_description_t vxExtra_endingKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; + +vx_status VX_CALLBACK vxExtra_endingInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + vx_uint32 paraNum + ) +{ + vx_status status = VX_SUCCESS; +// Alignment with a power of two value. 
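
The gcmALIGN macro defined just below rounds n up to the next multiple of align; the trick only works when align is a power of two, because it relies on the mask ~(align - 1). A quick self-contained check of the arithmetic (helper name is illustrative, not part of this file):

```c
#include <stdio.h>

/* Same arithmetic as the gcmALIGN macro below: round n up to the next
 * multiple of align, assuming align is a power of two. */
static unsigned round_up_pow2(unsigned n, unsigned align)
{
    return (n + (align - 1)) & ~(align - 1);
}

int main(void)
{
    /* prints "16 32": 13 rounds up to 16, 32 is already aligned */
    printf("%u %u\n", round_up_pow2(13, 16), round_up_pow2(32, 16));
    return 0;
}
```

The initializers below use this to pad global work sizes up to a multiple of the local work-group size.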
+#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) + vx_kernel_execution_parameters_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vx_tensor output = (vx_tensor)paramObj[2]; + + vx_uint32 width = 0; + vx_uint32 height = 0; + vx_uint32 channel = 0; + vx_uint32 dst_size[4] = {1, 1, 1, 1}; + vsi_nn_tensor_attr_t attr; + uint32_t i; + uint32_t output_dims; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + status = vsi_nn_vxGetTensorAttr(output, &attr); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); + return status; + } + output_dims = attr.dim_num; + for (i = 0; i < output_dims; i++) + { + dst_size[i] = attr.size[i]; + } + + width = dst_size[0]; + height = dst_size[1]; + channel = dst_size[2]; + + shaderParam.globalWorkOffset[0] = 0; + shaderParam.globalWorkOffset[1] = 0; + shaderParam.globalWorkOffset[2] = 0; + shaderParam.globalWorkScale[0] = 8; + shaderParam.globalWorkScale[1] = 1; + shaderParam.globalWorkScale[2] = 1; + shaderParam.localWorkSize[0] = 16; + shaderParam.localWorkSize[1] = 1; + shaderParam.localWorkSize[2] = 1; + shaderParam.globalWorkSize[0] = gcmALIGN((width + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); + shaderParam.globalWorkSize[1] = gcmALIGN((height + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); + shaderParam.globalWorkSize[2] = channel; + + status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + + if(status < 0) + VSILOGE("error-%s,%d\n",__FILE__,__LINE__); + + return status; +} + + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t vxExtra_ending_CPU = +{ + _VX_KERNEL_ID, + VX_KERNEL_NAME_EXTRA_ENDING_I16, + _VX_KERNEL_FUNC_KERNEL, + vxExtra_endingKernelParam, + _cnt_of_array( vxExtra_endingKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxExtra_ending_i16 = +{ + _VX_KERNEL_ID, + VX_KERNEL_NAME_EXTRA_ENDING_I16, + NULL, + vxExtra_endingKernelParam, + _cnt_of_array( vxExtra_endingKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vxExtra_endingInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxExtra_ending_i8 = +{ + _VX_KERNEL_ID, + VX_KERNEL_NAME_EXTRA_ENDING_I8, + NULL, + vxExtra_endingKernelParam, + _cnt_of_array( vxExtra_endingKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vxExtra_endingInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxExtra_ending_u8 = +{ + _VX_KERNEL_ID, + VX_KERNEL_NAME_EXTRA_ENDING_U8, + NULL, + vxExtra_endingKernelParam, + _cnt_of_array( vxExtra_endingKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vxExtra_endingInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_EXTRA_ENDING_list[] = +{ + &vxExtra_ending_CPU, + &vxExtra_ending_i16, + &vxExtra_ending_i8, + &vxExtra_ending_u8, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c 
new file mode 100644 index 0000000..dacde22 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_fullconnect2.c @@ -0,0 +1,323 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_VAR (vx_kernel_FCL2) +#define _VX_KERNEL_ID (VX_KERNEL_ENUM_FULLYCONNECTED_AXIS2) +#define _VX_KERNEL_NAME ("vsi_nn_kernel_fullconnect2") +#define _VX_KERNEL_FUNC_KERNEL (vxFullconnect2Kernel) + +//static uint32_t layerNum = 0; + +static vsi_status VX_CALLBACK vxFullconnect2Kernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ + /* TODO: */ +#define ARG_NUM (2) +#define TENSOR_NUM_INPUT (3) +#define TENSOR_NUM_OUTPUT (1) +#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) + + vsi_status status = VX_SUCCESS; + uint32_t i, j, k; + vx_context context = NULL; + vsi_nn_tensor_attr_t attr[TENSOR_NUM]; + uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM]; + vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL}; + uint8_t *buffer_ptr[TENSOR_NUM] = {NULL}; + vx_tensor tensor[TENSOR_NUM]; + + //char fileName[256] = {'\0'}; + //uint32_t total_size; + int32_t axis, weights; + uint32_t num_fc = 1, num_no_fc = 1; + + + //prepare data + context = vxGetContext((vx_reference)node); + + for( i = 0; i < TENSOR_NUM_INPUT; i ++ ) + { + tensor[i] = (vx_tensor)paramObj[i]; + buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], + &(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY); + } + for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) + { + tensor[i] = (vx_tensor)paramObj[i]; + buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], + &(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY); + } + + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(axis), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(weights), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + //op calc + for(i = 0; i <= (uint32_t)axis; ++i) + { + num_fc *= attr[0].size[i]; + } + 
for(i = axis + 1; i < attr[0].dim_num; ++i) + { + num_no_fc *= attr[0].size[i]; + } + + for(k = 0; k < num_no_fc; ++k) + { + for(j = 0; j < (uint32_t)weights; ++j) + { + float sum; + vsi_nn_DtypeToFloat32(&buffer_ptr[2][stride_size[2][0] * j], &sum, &attr[2].dtype); + for(i = 0; i < num_fc; ++i) + { + float x, w; + vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * (i + num_fc * k)], + &x, &attr[0].dtype); + vsi_nn_DtypeToFloat32(&buffer_ptr[1][stride_size[1][0] * (i + num_fc * j)], + &w, &attr[1].dtype); + sum += w * x; + } + vsi_nn_Float32ToDtype(sum, &buffer_ptr[3][stride_size[3][0] * (j + weights * k)], + &attr[3].dtype); + } + } + +#if 0 + print_index = 3; + total_size = vsi_nn_ShapeProduct(size[print_index], dim_num[print_index]); + if (dim_num[print_index] == 3) + { + snprintf(fileName, VSI_NN_MAX_PATH, "%s_%d_%d_%d_%d.txt", _VX_KERNEL_NAME, layerNum, + size[print_index][0], size[print_index][1], size[print_index][2]); + } + else + { + snprintf(fileName, VSI_NN_MAX_PATH, "%s_%d_%d_%d_%d_%d.txt", _VX_KERNEL_NAME, layerNum, + size[print_index][0], size[print_index][1], size[print_index][2], size[print_index][3]); + } + vsi_nn_SaveDataToText(fileName, buffer_ptr[print_index], total_size, + data_format[print_index], NULL); + layerNum++; +#endif + //save data + for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) + { + status = vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY); + if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i])); + } + for( i = 0; i < TENSOR_NUM; i ++ ) + { + if (buffer_ptr[i]) free(buffer_ptr[i]); + } + + return status; +} /* _VX_KERNEL_FUNC_KERNEL() */ + + +static vx_param_description_t s_params[] = +{ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, +}; + +void myFullyConnected_Axis2Func + ( + int8_t *src, + int8_t *dst + ) +{ + + return; +} +vsi_status VX_CALLBACK vxFullyConnected_Axis2Kernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ + vsi_status status = VX_ERROR_INVALID_PARAMETERS; + + if(paramNum == 2) + { + + } + + return status; +} + +vsi_status VX_CALLBACK vxFullyConnected_Axis2Initializer + ( + vx_node nodObj, + const vx_reference *paramObj, + uint32_t paraNum + ) +{ + vsi_status status = VX_SUCCESS; + + // Alignment with a power of two value. 
+#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) + vx_kernel_execution_parameters_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in threads + {0, 0, 0}}; // globalWorkSize: image size in threads + + uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1}; + uint32_t output_size[DIM_SIZE] = {1, 1, 1, 1}; + + uint32_t uniMulAcc_16x1[16] = { + 0x00005555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x00000000, // ABin + 0x00005555, // BSelt + 0x76543210, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + uint32_t loopNum = 0; + vsi_nn_tensor_attr_t attr[2]; + uint32_t i; + uint32_t input_dims = 0; + uint32_t output_dims = 0; + + memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); + memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); + + status = vsi_nn_vxGetTensorAttr((vx_tensor)paramObj[1], &attr[0]); + status |= vsi_nn_vxGetTensorAttr((vx_tensor)paramObj[3], &attr[1]); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); + return status; + } + input_dims = attr[0].dim_num; + for (i = 0; i < input_dims; i++) + { + input_size[i] = attr[0].size[i]; + } + output_dims = attr[1].dim_num; + for (i = 0; i < output_dims; i++) + { + output_size[i] = attr[1].size[i]; + } + + shaderParam.globalWorkOffset[0] = 0; + shaderParam.globalWorkOffset[1] = 0; + shaderParam.globalWorkScale[0] = 1; + shaderParam.globalWorkScale[1] = 1; + shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], 4); + shaderParam.globalWorkSize[1] = (output_size[1] + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1]; + + vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + + vxSetNodeUniform(nodObj, "uniMulAcc_16x1", 1, uniMulAcc_16x1); + + loopNum = gcmALIGN(input_size[0], 32); + vxSetNodeUniform(nodObj, "loopNum", 1, &loopNum); + if(status < 0) + { + VSILOGE("[%s : %d]Initializer failure! 
\n",__FILE__, __LINE__); + } + return status; +} + +static vx_param_description_t vxFullyConnected_Axis2KernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t _VX_KERNEL_VAR = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + _VX_KERNEL_FUNC_KERNEL, + s_params, + _cnt_of_array( s_params ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxFullyConnected_Axis2KernelInfo = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + vxFullyConnected_Axis2Kernel, + vxFullyConnected_Axis2KernelParam, + (sizeof(vxFullyConnected_Axis2KernelParam) / sizeof(vxFullyConnected_Axis2KernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxFullyConnected_Axis2Initializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_FCL2_list[] = +{ + &_VX_KERNEL_VAR, + &vxFullyConnected_Axis2KernelInfo, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c new file mode 100644 index 0000000..0c2b948 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c @@ -0,0 +1,483 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_VAR (vx_kernel_GENERATE_PROPOSALS) +#define _VX_KERNEL_ID (VX_KERNEL_ENUM_GENERATE_PROPOSALS) +#define _VX_KERNEL_NAME (VX_KERNEL_NAME_GENERATE_PROPOSALS) +#define _VX_KERNEL_FUNC_KERNEL (vxGenerate_proposalsKernel) + +typedef struct +{ + float x1, y1, x2, y2; +}BoxEncodingCorner; +typedef struct +{ + float w, h, x, y; +}BoxEncodingCenter; + +// toBoxEncodingCorner is implemented in vsi_nn_kernel_box_with_nms_limit.c +void toBoxEncodingCorner + ( + BoxEncodingCenter* ctr, + BoxEncodingCorner* cnr + ); + +// toBoxEncodingCenter is implemented in vsi_nn_kernel_box_with_nms_limit.c +void toBoxEncodingCenter + ( + BoxEncodingCorner* cnr, + BoxEncodingCenter* ctr + ); + +// iota is implemented in vsi_nn_kernel_detection_postprocess.c +static void _iota + ( + int32_t * data, + uint32_t len, + int32_t value + ) +{ + uint32_t i; + for (i = 0; i < len; i++) + { + data [i] = value; + value++; + } +} + +// swap_element is implemented in vsi_nn_kernel_box_with_nms_limit.c +void swap_element + ( + uint32_t* list, + uint32_t first, + uint32_t second + ); + +// max_element is implemented in vsi_nn_kernel_box_with_nms_limit.c +uint32_t max_element + ( + float* data, + uint32_t* index_list, + uint32_t len + ); + +// getIoUAxisAligned is implemented in vsi_nn_kernel_box_with_nms_limit.c +float getIoUAxisAligned + ( + const float* roi1, + const float* roi2 + ); + +// sort_element_by_score is implemented in vsi_nn_kernel_box_with_nms_limit.c +void sort_element_by_score + ( + float* data, + uint32_t* index_list, + uint32_t len + ); + +void filterBoxes + ( + const float* roiBase, + const float* imageInfoBase, + float minSize, + uint32_t* select, + uint32_t* len + ) +{ + const uint32_t kRoiDim = 4; + uint32_t i = 0; + uint32_t j; + for(j = 0; j < *len; j++) + { + const float* roiInfo = roiBase + select[j] * kRoiDim; + float roiWidth, roiHeight, xRoiCenter, yRoiCenter; + roiWidth = roiInfo[2] - roiInfo[0]; + roiHeight = roiInfo[3] - roiInfo[1]; + xRoiCenter = roiInfo[0] + roiWidth / 2.0f; + yRoiCenter = roiInfo[1] + roiHeight / 2.0f; + if(roiWidth > minSize && roiHeight > minSize && xRoiCenter < imageInfoBase[1] + && yRoiCenter < imageInfoBase[0]) + { + select[i] = select[j]; + i++; + } + } + *len = i; +} + +static vsi_status VX_CALLBACK vxGenerate_proposalsKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ +#define ARG_NUM (6) +#define TENSOR_NUM_INPUT (4) +#define TENSOR_NUM_OUTPUT (3) +#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) + + vsi_status status = VSI_FAILURE; + vx_context context = NULL; + vx_tensor input[TENSOR_NUM_INPUT] = {0}; + vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; + float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; + float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; + int32_t* int32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; + vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; + vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; + uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; + uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; + + float heightStride; + float widthStride; + int32_t preNmsTopN; 
+ int32_t postNmsTopN; + float iouThreshold; + float minSize; + + uint32_t i = 0; + for(i = 0; i < TENSOR_NUM_INPUT; i++) + { + memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); + } + /* prepare data */ + context = vxGetContext((vx_reference)node); + + for(i = 0; i < TENSOR_NUM_INPUT; i ++) + { + input[i] = (vx_tensor)paramObj[i]; + status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]); + TEST_CHECK_STATUS(status, final); + in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); + f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); + status = vsi_nn_vxConvertTensorToFloat32Data( + context, input[i], &in_attr[i], f32_in_buffer[i], + in_elements[i] * sizeof(float)); + TEST_CHECK_STATUS(status, final); + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) + { + output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; + status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); + TEST_CHECK_STATUS(status, final); + out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); + if(i < 2) + { + f32_out_buffer[i] = (float *)malloc(out_elements[i] * sizeof(float)); + memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float)); + } + else + { + int32_out_buffer[i] = (int32_t *)malloc(out_elements[i] * sizeof(int32_t)); + memset(int32_out_buffer[i], 0, out_elements[i] * sizeof(int32_t)); + } + } + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(heightStride), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(widthStride), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 2], &(preNmsTopN), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 3], &(postNmsTopN), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 4], &(iouThreshold), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 5], &(minSize), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + /* TODO: Add CPU kernel implement */ + { + uint32_t h, w, a, b, j; + const uint32_t kRoiDim = 4; + uint32_t numBatches = in_attr[0].size[3]; + uint32_t height = in_attr[0].size[2]; + uint32_t width = in_attr[0].size[1]; + uint32_t numAnchors = in_attr[0].size[0]; + uint32_t imageInfoLength = in_attr[3].size[0]; + + uint32_t batchSize = height * width * numAnchors; + uint32_t roiBufferSize = batchSize * kRoiDim; + + float * roiBuffer = (float*)malloc(roiBufferSize * sizeof(float)); + float * roiTransformedBuffer = (float*)malloc(roiBufferSize * sizeof(float)); + uint32_t* select = (uint32_t*)malloc(batchSize * sizeof(uint32_t)); + uint32_t index = 0; + uint32_t scores_index = 0; + uint32_t bboxDeltas_index = 0; + uint32_t imageInfo_index = 0; + uint32_t scores_out_index = 0; + uint32_t roi_out_index = 0; + + // Compute the roi region for each anchor. 
+ for(h = 0; h < height; h++) + { + float hShift = h * heightStride; + for(w = 0; w < width; w++) + { + float wShift = w * widthStride; + uint32_t anchor_index = 0; + for(a = 0; a < numAnchors; a++) + { + roiBuffer[index] = f32_in_buffer[2][anchor_index] + wShift; + roiBuffer[index + 1] = f32_in_buffer[2][anchor_index + 1] + hShift; + roiBuffer[index + 2] = f32_in_buffer[2][anchor_index + 2] + wShift; + roiBuffer[index + 3] = f32_in_buffer[2][anchor_index + 3] + hShift; + + index += kRoiDim; + anchor_index += kRoiDim; + } + } + } + + for(b = 0; b < numBatches; b++) + { + const uint32_t roiLength = 4; + + uint32_t numRois = batchSize; + uint32_t roiIndex; + uint32_t select_len; + int32_t numDetections = 0; + for(roiIndex = 0; roiIndex < numRois; roiIndex++) + { + float imageHeight = f32_in_buffer[3][imageInfo_index]; + float imageWidth = f32_in_buffer[3][imageInfo_index + 1]; + BoxEncodingCorner roi_cnr; + BoxEncodingCenter roiBefore; + roi_cnr.x1 = roiBuffer[roiIndex * roiLength]; + roi_cnr.y1 = roiBuffer[roiIndex * roiLength + 1]; + roi_cnr.x2 = roiBuffer[roiIndex * roiLength + 2]; + roi_cnr.y2 = roiBuffer[roiIndex * roiLength + 3]; + toBoxEncodingCenter(&roi_cnr, &roiBefore); + { + BoxEncodingCenter roi_ctr; + BoxEncodingCorner roiAfter; + BoxEncodingCorner cliped; + uint32_t idx = bboxDeltas_index + roiIndex * roiLength; + roi_ctr.w = (float)(exp(f32_in_buffer[1][idx + 2]) * roiBefore.w); + roi_ctr.h = (float)(exp(f32_in_buffer[1][idx + 3]) * roiBefore.h); + roi_ctr.x = roiBefore.x + f32_in_buffer[1][idx] * roiBefore.w; + roi_ctr.y = roiBefore.y + f32_in_buffer[1][idx + 1] * roiBefore.h; + toBoxEncodingCorner(&roi_ctr, &roiAfter); + cliped.x1 = vsi_nn_min(vsi_nn_max(roiAfter.x1, 0.0f), imageWidth); + cliped.y1 = vsi_nn_min(vsi_nn_max(roiAfter.y1, 0.0f), imageHeight); + cliped.x2 = vsi_nn_min(vsi_nn_max(roiAfter.x2, 0.0f), imageWidth); + cliped.y2 = vsi_nn_min(vsi_nn_max(roiAfter.y2, 0.0f), imageHeight); + roiTransformedBuffer[idx] = cliped.x1; + roiTransformedBuffer[idx + 1] = cliped.y1; + roiTransformedBuffer[idx + 2] = cliped.x2; + roiTransformedBuffer[idx + 3] = cliped.y2; + } + } + + // Find the top preNmsTopN scores. + _iota((int32_t*)select, batchSize, 0); + select_len = batchSize; + if(preNmsTopN > 0 && preNmsTopN < (int32_t)batchSize) + { + sort_element_by_score(&(f32_in_buffer[0][scores_index]), + select, batchSize); + select_len = preNmsTopN; + } + + // Filter boxes, disgard regions with height or width < minSize. + filterBoxes(roiTransformedBuffer, &(f32_in_buffer[3][0]), + minSize, select, &select_len); + + // Apply hard NMS. + if(postNmsTopN < 0) + { + postNmsTopN = select_len; + } + + for(j = 0; (j < select_len && numDetections < postNmsTopN); j++) + { + // find max score and swap to the front. + int32_t max_index = max_element(&(f32_in_buffer[0][scores_index]), + &(select[j]), select_len - j) + j; + swap_element(select, max_index, j); + + // Calculate IoU of the rest, swap to the end (disgard) ifneeded. 
+ for(i = j + 1; i < select_len; i++) + { + int32_t roiBase0 = select[i] * kRoiDim; + int32_t roiBase1 = select[j] * kRoiDim; + float iou = getIoUAxisAligned(&(roiTransformedBuffer[roiBase0]), + &(roiTransformedBuffer[roiBase1])); + + if(iou >= iouThreshold) + { + swap_element(select, i, select_len - 1); + i--; + select_len--; + } + } + numDetections++; + } + + for(i = 0; i < select_len; i++) + { + memcpy(&(f32_out_buffer[1][roi_out_index]), + &(roiTransformedBuffer[select[i] * kRoiDim]), kRoiDim * sizeof(float)); + f32_out_buffer[0][scores_out_index] = + f32_in_buffer[0][scores_index + select[i]]; + int32_out_buffer[2][scores_out_index] = b; + scores_out_index++; + roi_out_index += kRoiDim; + } + + scores_index += batchSize; + bboxDeltas_index += roiBufferSize; + imageInfo_index += imageInfoLength; + } + + vsi_nn_safe_free(roiBuffer); + vsi_nn_safe_free(roiTransformedBuffer); + vsi_nn_safe_free(select); + } + + /* save data */ + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + if(i < 2) + { + status = vsi_nn_vxConvertFloat32DataToTensor( + context, output[i], &out_attr[i], f32_out_buffer[i], + out_elements[i] * sizeof(float)); + TEST_CHECK_STATUS(status, final); + } + else + { + vsi_nn_vxCopyDataToTensor(context, output[i], &out_attr[i], + (uint8_t *)int32_out_buffer[i]); + } + } + +final: + for(i = 0; i < TENSOR_NUM_INPUT; i++) + { + if(f32_in_buffer[i]) free(f32_in_buffer[i]); + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + if(f32_out_buffer[i]) free(f32_out_buffer[i]); + if(int32_out_buffer[i]) free(int32_out_buffer[i]); + } + return status; +} /* _VX_KERNEL_FUNC_KERNEL() */ + +static vx_param_description_t vxGenerate_proposalsKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +vx_status VX_CALLBACK vxGenerate_proposalsInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + vx_uint32 paraNum + ) +{ + vx_status status = VX_SUCCESS; + /*TODO: Add initial code for VX program*/ + + return status; +} + + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t vxGenerate_proposals_CPU = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + _VX_KERNEL_FUNC_KERNEL, + vxGenerate_proposalsKernelParam, + _cnt_of_array( vxGenerate_proposalsKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxGenerate_proposals_VX = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + NULL, + vxGenerate_proposalsKernelParam, + _cnt_of_array( vxGenerate_proposalsKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vxGenerate_proposalsInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_GENERATE_PROPOSALS_list[] = +{ + &vxGenerate_proposals_CPU, + &vxGenerate_proposals_VX, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git 
a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c new file mode 100644 index 0000000..fa9537a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c @@ -0,0 +1,322 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_VAR (vx_kernel_HEATMAP_MAX_KEYPOINT) +#define _VX_KERNEL_ID (VX_KERNEL_ENUM_HEATMAP_MAX_KEYPOINT) +#define _VX_KERNEL_NAME (VX_KERNEL_NAME_HEATMAP_MAX_KEYPOINT) +#define _VX_KERNEL_FUNC_KERNEL (vxHeatmap_max_keypointKernel) + +// This function uses Taylor expansion up to the quatratic term to approximate bicubic +// upscaling result. 
+// 2nd order Taylor expansion: D(x) = D - b'x + 1/2 * x'Ax +// where D = grid[1][1], Taylor expansion center, the original score, +// x = delta, the correction on max keypoint position, +// D(x) = deltaScore, the accuracy score after correction +static void solveForDelta + ( + const float grid[3][3], + float* delta, + float* deltaScore, + float fpAtol, + float fpRtol + ) +{ + // b: negative 1st order derivative at center + // A: Hessian matrix at center (2nd order derivative) + float A[2][2], b[2]; + float crossProd1, crossProd2; + float detA; + b[0] = -(grid[1][2] - grid[1][0]) / 2.0f; + b[1] = -(grid[2][1] - grid[0][1]) / 2.0f; + A[0][0] = grid[1][0] - 2.0f * grid[1][1] + grid[1][2]; + A[0][1] = (grid[2][2] - grid[2][0] - grid[0][2] + grid[0][0]) / 4.0f; + A[1][0] = A[0][1]; + A[1][1] = grid[0][1] - 2.0f * grid[1][1] + grid[2][1]; + + // solve Ax=b, where x=delta -> delta = inv(A) * b + crossProd1 = A[0][0] * A[1][1]; + crossProd2 = A[0][1] * A[1][0]; + detA = crossProd1 - crossProd2; + // check if A is invertible + if (fabs(detA) < (fpAtol + fpRtol * crossProd1)) return; + delta[0] = (A[1][1] * b[0] - A[0][1] * b[1]) / detA; + delta[1] = (A[0][0] * b[1] - A[1][0] * b[0]) / detA; + + // clip out of range delta, i.e. delta > 3/2 + if (fabs(delta[0]) > 1.5f || fabs(delta[1]) > 1.5f) + { + float scale = (float)(1.5f / vsi_nn_max(fabs(delta[0]), fabs(delta[1]))); + delta[0] *= scale; + delta[1] *= scale; + } + + *deltaScore = grid[1][1] - b[0] * delta[0] - b[1] * delta[1] + + ((A[0][0] * delta[0] + A[0][1] * delta[1]) * delta[0] + + (A[1][0] * delta[0] + A[1][1] * delta[1]) * delta[1]) / + 2.0f; +} + +static vsi_status VX_CALLBACK vxHeatmap_max_keypointKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ +#define ARG_NUM (1) +#define TENSOR_NUM_INPUT (2) +#define TENSOR_NUM_OUTPUT (2) +#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) + + vsi_status status = VSI_FAILURE; + vx_context context = NULL; + vx_tensor input[TENSOR_NUM_INPUT] = {0}; + vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; + float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; + float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; + vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; + vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; + uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; + uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; + + int32_t type; + + uint32_t i = 0; + for(i = 0; i < TENSOR_NUM_INPUT; i++) + { + memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); + } + /* prepare data */ + context = vxGetContext((vx_reference)node); + + for(i = 0; i < TENSOR_NUM_INPUT; i ++) + { + input[i] = (vx_tensor)paramObj[i]; + status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]); + TEST_CHECK_STATUS(status, final); + in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); + f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); + status = vsi_nn_vxConvertTensorToFloat32Data( + context, input[i], &in_attr[i], f32_in_buffer[i], + in_elements[i] * sizeof(float)); + TEST_CHECK_STATUS(status, final); + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) + { + output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; + status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); + TEST_CHECK_STATUS(status, final); + out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); + f32_out_buffer[i]= (float *)malloc(out_elements[i] * sizeof(float)); + memset(f32_out_buffer[i], 0, out_elements[i] * 
sizeof(float)); + } + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(type), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + /* TODO: Add CPU kernel implement */ + { + uint32_t j, k; + uint32_t numBoxes = in_attr[0].size[3]; + uint32_t heatmapSize = in_attr[0].size[2]; + uint32_t numKeypoints = in_attr[0].size[0]; + uint32_t boxInfoLength = 4; + uint32_t output_score_index = 0; + uint32_t output_keypoint_index = 0; + + for(i = 0; i < numBoxes; i++) + { + for (j = 0; j < numKeypoints; j++) + { + uint32_t maxIndex = 0; + float maxScore = -FLT_MAX; + uint32_t maxIndexWidth; + uint32_t maxIndexHeight; + float localGrid[3][3]; + int32_t dh, dw; + float delta[2] = {0.0f, 0.0f}, deltaScore; + float wRoiStart = f32_in_buffer[1][i * boxInfoLength]; + float hRoiStart = f32_in_buffer[1][i * boxInfoLength + 1]; + float wRoiEnd = f32_in_buffer[1][i * boxInfoLength + 2]; + float hRoiEnd = f32_in_buffer[1][i * boxInfoLength + 3]; + float roiWidth = wRoiEnd - wRoiStart; + float roiHeight = hRoiEnd - hRoiStart; + float wRelativePos; + float hRelativePos; + for (k = 0; k < heatmapSize * heatmapSize; k++) + { + uint32_t index = i * heatmapSize * heatmapSize * numKeypoints + + k * numKeypoints + j; + float val = f32_in_buffer[0][index]; + if (maxScore < val) + { + maxScore = val; + maxIndex = k; + } + } + maxIndexWidth = maxIndex % heatmapSize; + maxIndexHeight = maxIndex / heatmapSize; + + // get local 3x3 grid + for (dh = -1; dh <= 1; dh++) + { + for (dw = -1; dw <= 1; dw++) + { + // cast uint32_t to int32_t + int32_t h = (int32_t)(maxIndexHeight) + dh; + int32_t w = (int32_t)(maxIndexWidth) + dw; + uint32_t heatmapIndex; + + // use mirroring for out of bound indexing + // need to ensure heatmapSize >= 2 + h = h < 0 ? 1 : (h >= (int32_t)heatmapSize ? heatmapSize - 2 : h); + w = w < 0 ? 1 : (w >= (int32_t)heatmapSize ? 
heatmapSize - 2 : w); + + heatmapIndex = i * heatmapSize * heatmapSize * numKeypoints + + (uint32_t)(h) * heatmapSize * numKeypoints + + (uint32_t)(w) * numKeypoints + j; + localGrid[dh + 1][dw + 1] = f32_in_buffer[0][heatmapIndex]; + } + } + deltaScore = maxScore; + solveForDelta((const float (*)[3])localGrid, delta, &deltaScore, 1e-3f, 1e-3f); + + wRelativePos = ((float)(maxIndexWidth) + delta[0] + 0.5f) / + (float)(heatmapSize); + hRelativePos = ((float)(maxIndexHeight) + delta[1] + 0.5f) / + (float)(heatmapSize); + f32_out_buffer[0][output_score_index] = deltaScore; + f32_out_buffer[1][output_keypoint_index] = wRelativePos * roiWidth + wRoiStart; + f32_out_buffer[1][output_keypoint_index + 1] = hRelativePos * roiHeight + hRoiStart; + output_score_index++; + output_keypoint_index +=2; + } + } + } + + /* save data */ + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + status = vsi_nn_vxConvertFloat32DataToTensor( + context, output[i], &out_attr[i], f32_out_buffer[i], + out_elements[i] * sizeof(float)); + TEST_CHECK_STATUS(status, final); + } + +final: + for (i = 0; i < TENSOR_NUM_INPUT; i++) + { + if (f32_in_buffer[i]) free(f32_in_buffer[i]); + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + if (f32_out_buffer[i]) free(f32_out_buffer[i]); + } + return status; +} /* _VX_KERNEL_FUNC_KERNEL() */ + +static vx_param_description_t vxHeatmap_max_keypointKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +vx_status VX_CALLBACK vxHeatmap_max_keypointInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + vx_uint32 paraNum + ) +{ + vx_status status = VX_SUCCESS; + /*TODO: Add initial code for VX program*/ + + return status; +} + + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t vxHeatmap_max_keypoint_CPU = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + _VX_KERNEL_FUNC_KERNEL, + vxHeatmap_max_keypointKernelParam, + _cnt_of_array( vxHeatmap_max_keypointKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxHeatmap_max_keypoint_VX = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + NULL, + vxHeatmap_max_keypointKernelParam, + _cnt_of_array( vxHeatmap_max_keypointKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vxHeatmap_max_keypointInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_HEATMAP_MAX_KEYPOINT_list[] = +{ + &vxHeatmap_max_keypoint_CPU, + &vxHeatmap_max_keypoint_VX, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c new file mode 100644 index 0000000..cc99b85 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c @@ -0,0 +1,1177 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, 
and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_VAR (vx_kernel_IMAGEPROCESS) +#define _VX_KERNEL_ID (VX_KERNEL_ENUM_IMAGEPROCESS) +#define _VX_KERNEL_NAME ("vsi_nn_kernel_imageprocess") +#define _VX_KERNEL_FUNC_KERNEL (vximageprocessKernel) + +//static uint32_t layerNum = 0; + +static void resize_crop_op + ( + uint8_t *buffer_ptr0, + uint8_t *buffer_ptr1, + vsi_nn_tensor_attr_t *attr0, + vsi_nn_tensor_attr_t *attr1, + uint32_t *stride_size0, + uint32_t *stride_size1, + int32_t *resize_crop_start + ) +{ + int32_t index[4]; + for (index[3] = 0; index[3] < (int32_t)attr1->size[3]; index[3]++) + { + for (index[2] = 0; index[2] < (int32_t)attr1->size[2]; index[2]++) + { + for (index[1] = 0; index[1] < (int32_t)attr1->size[1]; index[1]++) + { + for (index[0] = 0; index[0] < (int32_t)attr1->size[0]; index[0]++) + { + int32_t index_in = (((index[3] + resize_crop_start[3]) * attr0->size[2] + + (index[2] + resize_crop_start[2])) * attr0->size[1] + + (index[1] + resize_crop_start[1])) * attr0->size[0] + + (index[0] + resize_crop_start[0]); + int32_t index_out = (((index[3]) * attr1->size[2] + + (index[2])) * attr1->size[1] + + (index[1])) * attr1->size[0] + + (index[0]); + float val; + vsi_nn_DtypeToFloat32(&buffer_ptr0[stride_size0[0] * index_in], + &val, &attr0->dtype); + vsi_nn_Float32ToDtype(val, &buffer_ptr1[stride_size1[0] * index_out], + &attr1->dtype); + } + } + } + } +} + +static void reverse_channel_op + ( + uint8_t *buffer_ptr0, + uint8_t *buffer_ptr1, + vsi_nn_tensor_attr_t *attr, + uint32_t *stride_size + ) +{ + int32_t index[4]; + for (index[3] = 0; index[3] < (int32_t)attr->size[3]; index[3]++) + { + for (index[2] = 0; index[2] < 3; index[2]++) + { + for (index[1] = 0; index[1] < (int32_t)attr->size[1]; index[1]++) + { + for (index[0] = 0; index[0] < (int32_t)attr->size[0]; index[0]++) + { + int32_t index_in = (((index[3]) * attr->size[2] + + (2 - index[2])) * attr->size[1] + + (index[1])) * attr->size[0] + + (index[0]); + int32_t index_out = (((index[3]) * attr->size[2] + + (index[2])) * attr->size[1] + + (index[1])) * attr->size[0] + + (index[0]); + float val; + vsi_nn_DtypeToFloat32(&buffer_ptr0[stride_size[0] * index_in], + &val, &attr->dtype); + vsi_nn_Float32ToDtype(val, &buffer_ptr1[stride_size[0] * index_out], + &attr->dtype); + } + } + } + } +} + +static void mean_pixel_op + ( + uint8_t *buffer_ptr0, + uint8_t *buffer_ptr1, + vsi_nn_tensor_attr_t *attr, + uint32_t *stride_size, + 
float mean_scale, + float *mean_mean_value + ) +{ + int32_t index[4]; + for (index[3] = 0; index[3] < (int32_t)attr->size[3]; index[3]++) + { + for (index[2] = 0; index[2] < (int32_t)attr->size[2]; index[2]++) + { + for (index[1] = 0; index[1] < (int32_t)attr->size[1]; index[1]++) + { + for (index[0] = 0; index[0] < (int32_t)attr->size[0]; index[0]++) + { + int32_t index_in = (((index[3]) * attr->size[2] + + (index[2])) * attr->size[1] + + (index[1])) * attr->size[0] + + (index[0]); + int32_t index_out = (((index[3]) * attr->size[2] + + (index[2])) * attr->size[1] + + (index[1])) * attr->size[0] + + (index[0]); + float val; + vsi_nn_DtypeToFloat32(&buffer_ptr0[stride_size[0] * index_in], + &val, &attr->dtype); + val = (val - mean_mean_value[0]) * mean_scale; + vsi_nn_Float32ToDtype(val, &buffer_ptr1[stride_size[0] * index_out], + &attr->dtype); + } + } + } + } +} + +static void mean_channel_op + ( + uint8_t *buffer_ptr0, + uint8_t *buffer_ptr1, + vsi_nn_tensor_attr_t *attr, + uint32_t *stride_size, + float mean_scale, + float *mean_mean_value + ) +{ + int32_t index[4]; + for (index[3] = 0; index[3] < (int32_t)attr->size[3]; index[3]++) + { + for (index[2] = 0; index[2] < (int32_t)attr->size[2]; index[2]++) + { + for (index[1] = 0; index[1] < (int32_t)attr->size[1]; index[1]++) + { + for (index[0] = 0; index[0] < (int32_t)attr->size[0]; index[0]++) + { + int32_t index_in = (((index[3]) * attr->size[2] + + (index[2])) * attr->size[1] + + (index[1])) * attr->size[0] + + (index[0]); + int32_t index_out = (((index[3]) * attr->size[2] + + (index[2])) * attr->size[1] + + (index[1])) * attr->size[0] + + (index[0]); + float val; + vsi_nn_DtypeToFloat32(&buffer_ptr0[stride_size[0] * index_in], + &val, &attr->dtype); + val = (val - mean_mean_value[index[2]]) * mean_scale; + vsi_nn_Float32ToDtype(val, &buffer_ptr1[stride_size[0] * index_out], + &attr->dtype); + } + } + } + } +} + +static vsi_status VX_CALLBACK vximageprocessKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ +#define ARG_NUM (14) +#define TENSOR_NUM_INPUT (1) +#define TENSOR_NUM_OUTPUT (1) +#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) + + vsi_status status = VX_SUCCESS; + int32_t i; + vx_context context = NULL; + vsi_nn_tensor_attr_t attr[TENSOR_NUM]; + uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM]; + vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL}; + uint8_t *buffer_ptr[TENSOR_NUM] = {NULL}; + vx_tensor tensor[TENSOR_NUM]; + + int32_t crop_enable, resize_crop_dim_num, resize_crop_start[4] = {0}; + int32_t mean_type, mean_mean_value_size; + vx_bool reverse_channel; + float mean_scale, mean_mean_value[4] = {0}; + uint8_t *temp_ptr[2] = {NULL}; + uint32_t buf_sz; + + //prepare data + context = vxGetContext((vx_reference)node); + + for( i = 0; i < TENSOR_NUM_INPUT; i ++ ) + { + tensor[i] = (vx_tensor)paramObj[i]; + buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], + &(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY); + } + for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) + { + tensor[i] = (vx_tensor)paramObj[i]; + buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], + &(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY); + } + + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(crop_enable), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(resize_crop_dim_num), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + for (i = 0; i < resize_crop_dim_num; i++) + { + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 2 
+ i], &(resize_crop_start[i]), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + } + + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 6], &(reverse_channel), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 7], &(mean_type), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 8], &(mean_scale), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 9], &(mean_mean_value_size), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + for (i = 0; i < mean_mean_value_size; i++) + { + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 10 + i], &(mean_mean_value[i]), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + } + + //op calc + buf_sz = vsi_nn_GetTensorSize(attr[1].size, attr[1].dim_num, attr[1].dtype.vx_type); + temp_ptr[0] = (uint8_t *)malloc( buf_sz ); + temp_ptr[1] = (uint8_t *)malloc( buf_sz ); + + if (crop_enable == TRUE) + { + resize_crop_op(buffer_ptr[0], temp_ptr[0], &attr[0], &attr[1], + stride_size[0], stride_size[1], resize_crop_start); + } + + if (reverse_channel) + { + reverse_channel_op(temp_ptr[0], temp_ptr[1], &attr[1], + stride_size[1]); + } + + if (mean_type == VSI_NN_IMAGEPROCESS_MEAN_PIXEL) + { + mean_pixel_op(temp_ptr[1], buffer_ptr[1], &attr[1], + stride_size[1], mean_scale, mean_mean_value); + } + else if (mean_type == VSI_NN_IMAGEPROCESS_MEAN_CHANNEL) + { + mean_channel_op(temp_ptr[1], buffer_ptr[1], &attr[1], + stride_size[1], mean_scale, mean_mean_value); + } + + //save data + for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) + { + status = vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY); + if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i])); + } + for( i = 0; i < TENSOR_NUM; i ++ ) + { + if (buffer_ptr[i]) free(buffer_ptr[i]); + } + + if (temp_ptr[0]) free(temp_ptr[0]); + if (temp_ptr[1]) free(temp_ptr[1]); + return status; +} /* _VX_KERNEL_FUNC_KERNEL() */ + +vx_status VX_CALLBACK vxScaletoTensorInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + vx_uint32 paraNum + ) +{ +// Alignment with a power of two value. +#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) + vx_kernel_execution_parameters_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vx_status status = VX_SUCCESS; + vx_image bgrImg = (vx_image)paramObj[0]; + vx_tensor output = (vx_tensor)paramObj[1]; + vx_scalar xRatio_s = (vx_scalar)paramObj[2]; + vx_scalar yRatio_s = (vx_scalar)paramObj[3]; + vx_uint32 width = 0; + vx_uint32 height = 0; + vx_int32 xRatio = 0; + vx_int32 yRatio = 0; + vx_uint32 output_size[DIM_SIZE] = {1, 1, 1, 1}; + vx_int8 dstFixedPointPos = 0; + vx_enum dstFormat; + vx_float32 outputScale = 1.0; + vx_int32 output_ZP = 0; + uint32_t output_dims = 0; + vsi_nn_tensor_attr_t attr; + uint32_t i; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vxQueryImage(bgrImg, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(bgrImg, VX_IMAGE_HEIGHT, &height, sizeof(height)); + + vxCopyScalar(xRatio_s, (void*)&xRatio, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar(yRatio_s, (void*)&yRatio, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + status = vsi_nn_vxGetTensorAttr(output, &attr); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); + return status; + } + + output_dims = attr.dim_num; + dstFormat = attr.dtype.vx_type; + for (i = 0; i < output_dims; i++) + { + output_size[i] = attr.size[i]; + } + dstFixedPointPos = attr.dtype.fl; + output_ZP = attr.dtype.zero_point; + outputScale = attr.dtype.scale; + + if (xRatio == (1 << 15) && yRatio == (1 << 15)) + { + vx_uint32 uniExtractR_2x8[16] = { + 0x00099999, // TCfg + 0x00044444, // ASelt + 0x09060300, 0x0000000c, // ABin + 0x00099999, // BSelt + 0x06060606, 0x00000006, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 uniExtractG_2x8[16] = { + 0x00099999, // TCfg + 0x00044444, // ASelt + 0x2a272421, 0x0000002d, // ABin + 0x00099999, // BSelt + 0x06060606, 0x00000006, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 uniExtractB_2x8[16] = { + 0x00099999, // TCfg + 0x00044444, // ASelt + 0x4b484542, 0x0000004e, // ABin + 0x00099999, // BSelt + 0x06060606, 0x00000006, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + shaderParam.globalWorkOffset[0] = 0; + shaderParam.globalWorkOffset[1] = 0; + if (dstFormat == VSI_NN_TYPE_FLOAT16 || dstFormat == VSI_NN_TYPE_INT16) + shaderParam.globalWorkScale[0] = 8; + else if (dstFormat == VSI_NN_TYPE_INT8 || dstFormat == VSI_NN_TYPE_UINT8) + shaderParam.globalWorkScale[0] = 10; + + shaderParam.globalWorkScale[1] = 1; + shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], 4); + shaderParam.globalWorkSize[1] = (output_size[1] + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1]; + + if (dstFormat == VSI_NN_TYPE_INT8 || dstFormat == VSI_NN_TYPE_INT16) + { + if(dstFixedPointPos > 0) + outputScale = (vx_float32) ((int64_t)1 << dstFixedPointPos); + else + { + outputScale = 1.0f; + uniExtractR_2x8[7] |= ((-dstFixedPointPos) & 0x1F); + uniExtractG_2x8[7] |= ((-dstFixedPointPos) & 0x1F); + uniExtractB_2x8[7] |= ((-dstFixedPointPos) & 0x1F); + } + } + else if (dstFormat == VSI_NN_TYPE_UINT8) + { + vx_float32 outputZP = (vx_float32)output_ZP; + + outputScale = 1.0f / outputScale; + + vxSetNodeUniform(nodObj, "outputZP", 1, &outputZP); + } + + vxSetNodeUniform(nodObj, "uniExtractR_2x8", 1, uniExtractR_2x8); + vxSetNodeUniform(nodObj, "uniExtractG_2x8", 1, uniExtractG_2x8); + vxSetNodeUniform(nodObj, "uniExtractB_2x8", 1, uniExtractB_2x8); + status |= vxSetNodeUniform(nodObj, "outputScale", 1, &outputScale); + } + else + { + vx_uint32 uniVecShift10[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000400, 0x00000000, 0x00000400, 0x00000000, + 0x00000400, 0x00000000, 0x00000400, 0x00000000 // Constant + }; + vx_uint32 uniAddRShift[16] = { + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002405, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 uniGetTempVal[16] 
= { + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00230001, 0x00670045, // ABin + 0x05050505, // BSelt + 0x00110000, 0x00330022, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 uniExtractBytes[16] = { + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002414, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 uniUnpackToR[16] = { + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x09060300, 0x09060300, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00007400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 uniUnpackToG[16] = { + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x0a070401, 0x0a070401, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00007400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 uniUnpackToB[16] = { + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x0b080502, 0x0b080502, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00007400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 uniDataMulAlpha_4x4[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x01010101, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 uniDataSubMean_4x4[16] = { + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00007100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }; + vx_uint32 uniConvertIntergetoF32_4x4[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }; + vx_uint32 uniExtactInteger_2x8[16] = { + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002300, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + + shaderParam.globalWorkOffset[0] = 0; + shaderParam.globalWorkOffset[1] = 0; + shaderParam.globalWorkScale[0] = 4; + shaderParam.globalWorkScale[1] = 1; + shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], 4); + shaderParam.globalWorkSize[1] = (output_size[1] + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1]; + + status |= vxSetNodeUniform(nodObj, 
"uniDataMulAlpha_4x4", 1, uniDataMulAlpha_4x4); + status |= vxSetNodeUniform(nodObj, "uniDataSubMean_4x4", 1, uniDataSubMean_4x4); + status |= vxSetNodeUniform(nodObj, "uniUnpackToR", 1, uniUnpackToR); + status |= vxSetNodeUniform(nodObj, "uniUnpackToG", 1, uniUnpackToG); + status |= vxSetNodeUniform(nodObj, "uniUnpackToB", 1, uniUnpackToB); + status |= vxSetNodeUniform(nodObj, "uniVecShift10", 1, uniVecShift10); + status |= vxSetNodeUniform(nodObj, "uniAddRShift", 1, uniAddRShift); + status |= vxSetNodeUniform(nodObj, "uniGetTempVal", 1, uniGetTempVal); + status |= vxSetNodeUniform(nodObj, "uniExtractBytes", 1, uniExtractBytes); + + if (dstFormat == VSI_NN_TYPE_INT8 || dstFormat == VSI_NN_TYPE_INT16) + { + if(dstFixedPointPos > 0) + outputScale = (vx_float32) ((int64_t)1 << dstFixedPointPos); + else + outputScale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixedPointPos); + + status |= vxSetNodeUniform(nodObj, "uniConvertIntergetoF32_4x4", + 1, uniConvertIntergetoF32_4x4); + status |= vxSetNodeUniform(nodObj, "outputScale", 1, &outputScale); + status |= vxSetNodeUniform(nodObj, "uniExtactInteger_2x8", 1, uniExtactInteger_2x8); + } + else if (dstFormat == VSI_NN_TYPE_UINT8) + { + vx_float32 outputZP = (vx_float32)output_ZP; + + outputScale = 1.0f / outputScale; + + status |= vxSetNodeUniform(nodObj, "uniConvertIntergetoF32_4x4", + 1, uniConvertIntergetoF32_4x4); + status |= vxSetNodeUniform(nodObj, "outputZP", 1, &outputZP); + status |= vxSetNodeUniform(nodObj, "outputScale", 1, &outputScale); + status |= vxSetNodeUniform(nodObj, "uniExtactInteger_2x8", 1, uniExtactInteger_2x8); + } + } + + status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + + return VX_SUCCESS; +} + +vx_status VX_CALLBACK vxGrayScaletoTensorInitializer(vx_node nodObj, const vx_reference *paramObj, vx_uint32 paraNum) +{ +// Alignment with a power of two value. +#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) + vx_kernel_execution_parameters_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vx_status status = VX_SUCCESS; + vx_image inputImg = (vx_image)paramObj[0]; + vx_scalar xRatio_s = (vx_scalar)paramObj[2]; + vx_scalar yRatio_s = (vx_scalar)paramObj[3]; + vx_tensor output = (vx_tensor)paramObj[1]; + vx_uint32 width = 0; + vx_uint32 height = 0; + vx_int32 xRatio = 0; + vx_int32 yRatio = 0; + vx_uint32 output_size[4] = {1, 1, 1, 1}; + vx_int8 dstFixedPointPos = 0; + vx_enum dstFormat; + vx_float32 outputScale = 1.0; + vx_int32 output_ZP = 0; + uint32_t output_dims = 0; + vsi_nn_tensor_attr_t attr; + uint32_t i; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vxQueryImage(inputImg, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(inputImg, VX_IMAGE_HEIGHT, &height, sizeof(height)); + + vxCopyScalar(xRatio_s, (void*)&xRatio, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar(yRatio_s, (void*)&yRatio, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + status = vsi_nn_vxGetTensorAttr(output, &attr); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); + return status; + } + + output_dims = attr.dim_num; + dstFormat = attr.dtype.vx_type; + for (i = 0; i < output_dims; i++) + { + output_size[i] = attr.size[i]; + } + dstFixedPointPos = attr.dtype.fl; + output_ZP = attr.dtype.zero_point; + outputScale = attr.dtype.scale; + + if (xRatio == (1 << 15) && yRatio == (1 << 15)) + { + vx_uint32 uniDataMeanStddevLo_2x8[16] = { + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }; + vx_uint32 uniDataMeanStddevHi_2x8[16] = { + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }; + + shaderParam.globalWorkOffset[0] = 0; + shaderParam.globalWorkOffset[1] = 0; + if (dstFormat == VSI_NN_TYPE_FLOAT16 || dstFormat == VSI_NN_TYPE_INT16) + shaderParam.globalWorkScale[0] = 16; + else if (dstFormat == VSI_NN_TYPE_INT8 || dstFormat == VSI_NN_TYPE_UINT8) + shaderParam.globalWorkScale[0] = 16; + + shaderParam.globalWorkScale[1] = 1; + shaderParam.localWorkSize[0] = 8; + shaderParam.localWorkSize[1] = 1; + shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); + shaderParam.globalWorkSize[1] = gcmALIGN((output_size[1] + + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); + + if (dstFormat == VSI_NN_TYPE_INT8 || dstFormat == VSI_NN_TYPE_INT16) + { + if(dstFixedPointPos > 0) + outputScale = (vx_float32) ((int64_t)1 << dstFixedPointPos); + else + { + outputScale = 1.0f; + uniDataMeanStddevLo_2x8[7] |= ((-dstFixedPointPos) & 0x1F); + uniDataMeanStddevHi_2x8[7] |= ((-dstFixedPointPos) & 0x1F); + } + } + else if (dstFormat == VSI_NN_TYPE_UINT8) + { + vx_float32 outputZP = (vx_float32)output_ZP; + + outputScale = 1.0f / outputScale; + + vxSetNodeUniform(nodObj, "outputZP", 1, &outputZP); + } + + vxSetNodeUniform(nodObj, "uniDataMeanStddevLo_2x8", 1, uniDataMeanStddevLo_2x8); + vxSetNodeUniform(nodObj, "uniDataMeanStddevHi_2x8", 1, uniDataMeanStddevHi_2x8); + status |= vxSetNodeUniform(nodObj, "outputScale", 1, &outputScale); + } + else + { + vx_uint32 uniVecShift10[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000400, 0x00000000, 0x00000400, 0x00000000, + 0x00000400, 0x00000000, 0x00000400, 0x00000000 // Constant + }; + vx_uint32 uniAddRShift[16] = { + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002405, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 uniGetTempVal[16] = { + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00230001, 0x00670045, // ABin + 0x05050505, // BSelt + 0x00110000, 0x00330022, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 uniExtractBytes[16] = { + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002414, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 uniDataMulAlpha_4x4[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x01010101, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 uniDataSubMean_4x4[16] = { + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00007100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }; + vx_uint32 uniConvertIntergetoF32_4x4[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }; + vx_uint32 uniExtactInteger_2x8[16] = { + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002300, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + + shaderParam.globalWorkOffset[0] = 0; + shaderParam.globalWorkOffset[1] = 0; + shaderParam.globalWorkScale[0] = 4; + shaderParam.globalWorkScale[1] = 1; + shaderParam.localWorkSize[0] = 2; + shaderParam.localWorkSize[1] = 4; + shaderParam.globalWorkSize[0] = gcmALIGN((output_size[0] + + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); + shaderParam.globalWorkSize[1] = gcmALIGN((output_size[1] + + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); + + if (dstFormat == VSI_NN_TYPE_FLOAT16) + { + status |= vxSetNodeUniform(nodObj, "uniDataMulAlpha_4x4", 1, uniDataMulAlpha_4x4); + status |= vxSetNodeUniform(nodObj, "uniDataSubMean_4x4", 1, uniDataSubMean_4x4); + } + + status |= vxSetNodeUniform(nodObj, "uniVecShift10", 1, uniVecShift10); + status |= vxSetNodeUniform(nodObj, "uniAddRShift", 1, uniAddRShift); + status |= vxSetNodeUniform(nodObj, "uniGetTempVal", 1, uniGetTempVal); + status |= vxSetNodeUniform(nodObj, "uniExtractBytes", 1, uniExtractBytes); + + if (dstFormat == VSI_NN_TYPE_INT8 || dstFormat == VSI_NN_TYPE_INT16) + { + if(dstFixedPointPos > 0) + outputScale *= (vx_float32) ((int64_t)1 << dstFixedPointPos); + else + outputScale *= 1.0f / (vx_float32) ((int64_t)1 << -dstFixedPointPos); + + status |= vxSetNodeUniform(nodObj, "uniConvertIntergetoF32_4x4", + 1, uniConvertIntergetoF32_4x4); + status |= vxSetNodeUniform(nodObj, "outputScale", 1, &outputScale); + status |= vxSetNodeUniform(nodObj, "uniExtactInteger_2x8", 1, + uniExtactInteger_2x8); + } + else if (dstFormat == VSI_NN_TYPE_UINT8) + { + vx_float32 outputZP = (vx_float32)output_ZP; + + outputScale = 1.0f / outputScale; + + status |= 
vxSetNodeUniform(nodObj, "uniConvertIntergetoF32_4x4", + 1, uniConvertIntergetoF32_4x4); + status |= vxSetNodeUniform(nodObj, "outputZP", 1, &outputZP); + status |= vxSetNodeUniform(nodObj, "outputScale", 1, &outputScale); + status |= vxSetNodeUniform(nodObj, "uniExtactInteger_2x8", 1, + uniExtactInteger_2x8); + } + } + + status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + + return VX_SUCCESS; +} + +static vx_param_description_t s_params[] = +{ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, +}; + +static vx_param_description_t vxScaletoTensorKernelParam[] = +{ + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +static vx_param_description_t vxGrayScaletoTensorKernelParam[] = +{ + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#ifdef __cplusplus +extern "C" { +#endif + +vx_kernel_description_t _VX_KERNEL_VAR = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + _VX_KERNEL_FUNC_KERNEL, + s_params, + _cnt_of_array( s_params ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxScaletoTensorKernelInfo_fp16 = +{ + VX_KERNEL_ENUM_SCALETOTENSOR, + VX_KERNEL_NAME_SCALETOTENSOR_FP16, + NULL, + vxScaletoTensorKernelParam, + (sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxScaletoTensorKernelInfo_int8 = +{ + VX_KERNEL_ENUM_SCALETOTENSOR, + VX_KERNEL_NAME_SCALETOTENSOR_INT8, + NULL, + vxScaletoTensorKernelParam, + 
(sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxScaletoTensorKernelInfo_fp16_copy = +{ + VX_KERNEL_ENUM_SCALETOTENSOR, + VX_KERNEL_NAME_SCALETOTENSOR_FP16_COPY, + NULL, + vxScaletoTensorKernelParam, + (sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxScaletoTensorKernelInfo_int8_copy = +{ + VX_KERNEL_ENUM_SCALETOTENSOR, + VX_KERNEL_NAME_SCALETOTENSOR_INT8_COPY, + NULL, + vxScaletoTensorKernelParam, + (sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxScaletoTensorKernelInfo_int16 = +{ + VX_KERNEL_ENUM_SCALETOTENSOR, + VX_KERNEL_NAME_SCALETOTENSOR_INT16, + NULL, + vxScaletoTensorKernelParam, + (sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxScaletoTensorKernelInfo_int16_copy = +{ + VX_KERNEL_ENUM_SCALETOTENSOR, + VX_KERNEL_NAME_SCALETOTENSOR_INT16_COPY, + NULL, + vxScaletoTensorKernelParam, + (sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxScaletoTensorKernelInfo_uint8 = +{ + VX_KERNEL_ENUM_SCALETOTENSOR, + VX_KERNEL_NAME_SCALETOTENSOR_UINT8, + NULL, + vxScaletoTensorKernelParam, + (sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxScaletoTensorKernelInfo_uint8_copy = +{ + VX_KERNEL_ENUM_SCALETOTENSOR, + VX_KERNEL_NAME_SCALETOTENSOR_UINT8_COPY, + NULL, + vxScaletoTensorKernelParam, + (sizeof(vxScaletoTensorKernelParam) / sizeof(vxScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxGrayScaletoTensorKernelInfo_fp16 = +{ + VX_KERNEL_ENUM_GRAYSCALETOTENSOR, + VX_KERNEL_NAME_GRAYSCALETOTENSOR_FP16, + NULL, + vxGrayScaletoTensorKernelParam, + (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxGrayScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxGrayScaletoTensorKernelInfo_int8 = +{ + VX_KERNEL_ENUM_GRAYSCALETOTENSOR, + VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT8, + NULL, + vxGrayScaletoTensorKernelParam, + (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxGrayScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxGrayScaletoTensorKernelInfo_fp16_copy = +{ + VX_KERNEL_ENUM_GRAYSCALETOTENSOR, + VX_KERNEL_NAME_GRAYSCALETOTENSOR_FP16_COPY, + NULL, + vxGrayScaletoTensorKernelParam, + (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxGrayScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t 
vxGrayScaletoTensorKernelInfo_int8_copy = +{ + VX_KERNEL_ENUM_GRAYSCALETOTENSOR, + VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT8_COPY, + NULL, + vxGrayScaletoTensorKernelParam, + (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxGrayScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxGrayScaletoTensorKernelInfo_int16 = +{ + VX_KERNEL_ENUM_GRAYSCALETOTENSOR, + VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT16, + NULL, + vxGrayScaletoTensorKernelParam, + (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxGrayScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxGrayScaletoTensorKernelInfo_int16_copy = +{ + VX_KERNEL_ENUM_GRAYSCALETOTENSOR, + VX_KERNEL_NAME_GRAYSCALETOTENSOR_INT16_COPY, + NULL, + vxGrayScaletoTensorKernelParam, + (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxGrayScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxGrayScaletoTensorKernelInfo_uint8 = +{ + VX_KERNEL_ENUM_GRAYSCALETOTENSOR, + VX_KERNEL_NAME_GRAYSCALETOTENSOR_UINT8, + NULL, + vxGrayScaletoTensorKernelParam, + (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxGrayScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxGrayScaletoTensorKernelInfo_uint8_copy = +{ + VX_KERNEL_ENUM_GRAYSCALETOTENSOR, + VX_KERNEL_NAME_GRAYSCALETOTENSOR_UINT8_COPY, + NULL, + vxGrayScaletoTensorKernelParam, + (sizeof(vxGrayScaletoTensorKernelParam) / sizeof(vxGrayScaletoTensorKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxGrayScaletoTensorInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_IMAGEPROCESS_list[] = +{ + &_VX_KERNEL_VAR, + &vxScaletoTensorKernelInfo_fp16, + &vxScaletoTensorKernelInfo_int8, + &vxScaletoTensorKernelInfo_int16, + &vxScaletoTensorKernelInfo_uint8, + &vxScaletoTensorKernelInfo_fp16_copy, + &vxScaletoTensorKernelInfo_int8_copy, + &vxScaletoTensorKernelInfo_int16_copy, + &vxScaletoTensorKernelInfo_uint8_copy, + &vxGrayScaletoTensorKernelInfo_fp16, + &vxGrayScaletoTensorKernelInfo_int8, + &vxGrayScaletoTensorKernelInfo_int16, + &vxGrayScaletoTensorKernelInfo_uint8, + &vxGrayScaletoTensorKernelInfo_fp16_copy, + &vxGrayScaletoTensorKernelInfo_int8_copy, + &vxGrayScaletoTensorKernelInfo_int16_copy, + &vxGrayScaletoTensorKernelInfo_uint8_copy, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c new file mode 100644 index 0000000..f259835 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_layernormalize.c @@ -0,0 +1,688 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, 
subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +void myLayerNormFunc + ( + void* src, + int16_t* scale, + float* bias, + float eps, + void* dst, + uint32_t input_dim, + uint32_t width, + uint32_t height, + uint32_t channel, + uint32_t batch + ) +{ + uint32_t ch = (input_dim <= 2) ? 1 : channel; + uint32_t bn = (input_dim <= 3) ? 1 : batch; + uint32_t b = 0, c = 0, h = 0, w = 0; + + int16_t* imgIn, *imgOut; + imgIn = (int16_t*)src; + imgOut = (int16_t*)dst; + + VSILOGI("Hello myLayerNormFunc!\n"); + for (b = 0; b < bn; b++) + { + for (c = 0; c < ch; c++) + { + for (h = 0; h < height; h++) + { + uint32_t len = (h + (c + b*ch)*height) * width; + float sum = .0f; + float sumsq = .0f; + float mean = .0f; + float vari = .0f; + + for (w = 0; w < width; w++) + { + uint32_t index = len + w; + sum += vsi_nn_Fp16toFp32(imgIn[index]); + } + mean = sum / width; + for (w = 0; w < width; w++) + { + uint32_t index = len + w; + float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean; + sumsq += data * data; + } + vari = sumsq / width; + vari = (float)(1.0 / sqrtf(vari + eps)); + for (w = 0; w < width; w++) + { + uint32_t index = len + w; + float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean; + float scaleVal = vsi_nn_Fp16toFp32(scale[w]); + float biasVal = bias[w]; + float normVal = data * vari * scaleVal + biasVal; + imgOut[index] = vsi_nn_Fp32ToFp16(normVal); + } + } + } + } + return; +} +void myLayerNormFunc_u8 + ( + void* src, + int16_t* scale, + float* bias, + float eps, + void* dst, + uint32_t input_dim, + uint32_t width, + uint32_t height, + uint32_t channel, + uint32_t batch, + int32_t inZp, + int32_t outZp, + float inScale, + float outScale + ) +{ + uint32_t ch = (input_dim <= 2) ? 1 : channel; + uint32_t bn = (input_dim <= 3) ? 
1 : batch; + uint32_t b = 0, c = 0, h = 0, w = 0; + + uint8_t* imgIn, *imgOut; + imgIn = (uint8_t*)src; + imgOut = (uint8_t*)dst; + + VSILOGI("Hello myLayerNormFunc!\n"); + for (b = 0; b < bn; b++) + { + for (c = 0; c < ch; c++) + { + for (h = 0; h < height; h++) + { + uint32_t len = (h + (c + b*ch)*height) * width; + float sum = .0f; + float sumsq = .0f; + float mean = .0f; + float vari = .0f; + + for (w = 0; w < width; w++) + { + uint32_t index = len + w; + //sum += vsi_nn_Fp16toFp32(imgIn[index]); + sum += vsi_nn_AffineToFp32(imgIn[index], inScale, inZp, VSI_NN_TYPE_UINT8); + } + mean = sum / width; + for (w = 0; w < width; w++) + { + uint32_t index = len + w; + //float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean; + float data = vsi_nn_AffineToFp32(imgIn[index], inScale, inZp, VSI_NN_TYPE_UINT8) - mean; + sumsq += data * data; + } + vari = sumsq / width; + vari = (float)(1.0 / sqrtf(vari + eps)); + for (w = 0; w < width; w++) + { + uint32_t index = len + w; + //float data = vsi_nn_Fp16toFp32(imgIn[index]) - mean; + float data = vsi_nn_AffineToFp32(imgIn[index], inScale, inZp, VSI_NN_TYPE_UINT8) - mean; + float scaleVal = vsi_nn_Fp16toFp32(scale[w]); + float biasVal = bias[w]; + float normVal = data * vari * scaleVal + biasVal; + //imgOut[index] = vsi_nn_Fp32ToFp16(normVal); + imgOut[index] = (vx_uint8)vsi_nn_Fp32ToAffine(normVal, outScale, outZp, VSI_NN_TYPE_UINT8); + } + } + } + } + return; +} +vsi_status VX_CALLBACK vxLayerNormKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ + vsi_status status = VX_ERROR_INVALID_PARAMETERS; + + if(paramNum == 5) + { + vx_context context = NULL; + // tensor + vx_tensor imgObj[4] = { NULL }; + vsi_nn_tensor_attr_t attr[4]; + int16_t *input = NULL, *output = NULL, *scale = NULL; + float *bias = NULL; + uint32_t input_size[4] = {1, 1, 1, 1}, output_size[4] = {1, 1, 1, 1}; + uint32_t scale_size[4] = {1, 1, 1, 1}, bias_size[4] = {1, 1, 1, 1}; + uint32_t input_stride_size[4] = {0}; + uint32_t output_stride_size[4] = {0}; + uint32_t scale_stride_size[4] = {0}; + uint32_t bias_stride_size[4] = {0}; + vx_tensor_addressing input_user_addr = NULL; + vx_tensor_addressing output_user_addr = NULL; + vx_tensor_addressing scale_user_addr = NULL; + vx_tensor_addressing bias_user_addr = NULL; + vsi_nn_type_e inputFormat = VSI_NN_TYPE_FLOAT16, outputFormat = VSI_NN_TYPE_FLOAT16; + vsi_nn_type_e scaleFormat = VSI_NN_TYPE_FLOAT16, biasFormat = VSI_NN_TYPE_FLOAT16; + uint32_t input_dims = 0, output_dims = 0; + uint32_t scale_dims = 0, bias_dims = 0; + uint32_t i; + int32_t in_zp, out_zp; + float in_scale, out_scale; + // scalar + vx_scalar scalar[1] = { NULL }; + float eps = .0f; + + imgObj[0] = (vx_tensor)paramObj[0]; + imgObj[1] = (vx_tensor)paramObj[1]; + imgObj[2] = (vx_tensor)paramObj[2]; + imgObj[3] = (vx_tensor)paramObj[3]; + scalar[0] = (vx_scalar)paramObj[4]; + memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); + memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); + memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t)); + memset(&attr[3], 0, sizeof(vsi_nn_tensor_attr_t)); + context = vxGetContext((vx_reference)node); + if (context == NULL) + { + VSILOGE("vxGetContext failure! at line %d\n", __LINE__); + goto OnError; + } + + status = vsi_nn_vxGetTensorAttr(imgObj[0], &attr[0]); + status |= vsi_nn_vxGetTensorAttr(imgObj[1], &attr[1]); + status |= vsi_nn_vxGetTensorAttr(imgObj[2], &attr[2]); + status |= vsi_nn_vxGetTensorAttr(imgObj[3], &attr[3]); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); + goto OnError; + } + input_dims = attr[0].dim_num; + inputFormat = attr[0].dtype.vx_type; + for (i = 0; i < input_dims; i++) + { + input_size[i] = attr[0].size[i]; + } + in_zp = attr[0].dtype.zero_point; + in_scale = attr[0].dtype.scale; + + //bias + bias_dims = attr[1].dim_num; + biasFormat = attr[1].dtype.vx_type; + for (i = 0; i < bias_dims; i++) + { + bias_size[i] = attr[1].size[i]; + } + //scale + scale_dims = attr[2].dim_num; + scaleFormat = attr[2].dtype.vx_type; + for (i = 0; i < scale_dims; i++) + { + scale_size[i] = attr[2].size[i]; + } + + //output + output_dims = attr[3].dim_num; + outputFormat = attr[3].dtype.vx_type; + for (i = 0; i < output_dims; i++) + { + output_size[i] = attr[3].size[i]; + } + out_zp = attr[3].dtype.zero_point; + out_scale = attr[3].dtype.scale; + + input_size[2] = (input_dims <= 2)?1:input_size[2]; + input_size[3] = (input_dims <= 3)?1:input_size[3]; + + input_stride_size[0] = vsi_nn_GetTypeBytes(inputFormat); + output_stride_size[0] = vsi_nn_GetTypeBytes(outputFormat); + for (i=1; i< input_dims; i++) + { + input_stride_size[i] = input_stride_size[i-1] * input_size[i-1]; + output_stride_size[i] = output_stride_size[i-1] * output_size[i-1]; + } + input = (int16_t*)malloc(input_size[0]*input_size[1]*input_size[2]*sizeof(int16_t)); + output = (int16_t*)malloc(output_size[0]*output_size[1]*output_size[2]*sizeof(int16_t)); + input_user_addr = vxCreateTensorAddressing(context, input_size, input_stride_size, (vx_uint8)input_dims); + vsi_nn_copy_tensor_patch(imgObj[0], &attr[0], input, VX_READ_ONLY); + //scale and bias + scale_stride_size[0] = vsi_nn_GetTypeBytes(scaleFormat); + bias_stride_size[0] = vsi_nn_GetTypeBytes(biasFormat); + for (i=1; i< scale_dims; i++) + { + scale_stride_size[i] = scale_stride_size[i-1] * scale_size[i-1]; + bias_stride_size[i] = bias_stride_size[i-1] * bias_size[i-1]; + } + scale = (int16_t*)malloc(scale_size[0]*sizeof(int16_t)); + bias = (float*)malloc(bias_size[0]*sizeof(float)); + bias_user_addr = vxCreateTensorAddressing(context, bias_size, bias_stride_size, (vx_uint8)bias_dims); + vsi_nn_copy_tensor_patch(imgObj[1], &attr[1], bias, VX_READ_ONLY); + scale_user_addr = vxCreateTensorAddressing(context, scale_size, scale_stride_size, (vx_uint8)scale_dims); + vsi_nn_copy_tensor_patch(imgObj[2], &attr[2], scale, VX_READ_ONLY); + + // scalar + status = vxCopyScalar(scalar[0], &eps, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if (status != VX_SUCCESS) + { + VSILOGE("vxCopyScalar failure! 
at line %d\n", __LINE__); + goto OnError; + } + // Call C Prototype + if(inputFormat == VSI_NN_TYPE_FLOAT16) + { + myLayerNormFunc(input, scale, bias, eps, output, input_dims, input_size[0], + input_size[1], input_size[2], input_size[3]); + } + else + { + myLayerNormFunc_u8(input, scale, bias, eps, output, input_dims, input_size[0], + input_size[1], input_size[2], input_size[3], in_zp, out_zp, in_scale, out_scale); + } + + //output tensor + output_user_addr = vxCreateTensorAddressing(context, output_size, + output_stride_size, (vx_uint8)output_dims); + vsi_nn_copy_tensor_patch(imgObj[3], &attr[3], output, VX_WRITE_ONLY); + +OnError: + if(input) free(input); + if(scale) free(scale); + if(bias) free(bias); + if(output) free(output); + if(input_user_addr) vxReleaseTensorAddressing(&input_user_addr); + if(scale_user_addr) vxReleaseTensorAddressing(&scale_user_addr); + if(bias_user_addr) vxReleaseTensorAddressing(&bias_user_addr); + if(output_user_addr) vxReleaseTensorAddressing(&output_user_addr); + } + + return status; +} +vsi_status VX_CALLBACK vxLayerNormInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + uint32_t paraNum + ) +{ + vsi_status status = VX_SUCCESS; + // Alignment with a power of two value. +#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) + vx_kernel_execution_parameters_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vx_tensor input = (vx_tensor)paramObj[0]; + vx_tensor scale = (vx_tensor)paramObj[2]; + vx_tensor output = (vx_tensor)paramObj[3]; + uint32_t input_size[4] = {1, 1, 1, 1}; + uint32_t input_dims = 0; + vsi_nn_type_e inputDataFormat = VSI_NN_TYPE_FLOAT16; + vsi_nn_type_e scaleDataFormat = VSI_NN_TYPE_FLOAT16; + vsi_nn_type_e outputDataFormat = VSI_NN_TYPE_FLOAT16; + vx_float32 scaleIn = 0; + vx_float32 scaleOut = 0; + vx_float32 reScaleOut_u8 = 0; + vx_float32 reOutZP = 0.f; + int32_t output_ZP = 0; + int32_t input_ZP = 0; + vx_uint32 iter = 0; + int32_t sumInZp = 0; + int32_t tmpZp1 = 0; + int32_t tmpZp2 = 0; + vx_float32 e2InScale = 0; + vsi_nn_tensor_attr_t attr[3]; + uint32_t i; + + memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); + memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); + memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t)); + status = vsi_nn_vxGetTensorAttr(input, &attr[0]); + status |= vsi_nn_vxGetTensorAttr(output, &attr[1]); + status |= vsi_nn_vxGetTensorAttr(scale, &attr[2]); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); + return status; + } + + input_dims = attr[0].dim_num; + inputDataFormat = attr[0].dtype.vx_type; + for (i = 0; i < input_dims; i++) + { + input_size[i] = attr[0].size[i]; + } + input_ZP = attr[0].dtype.zero_point; + scaleIn = attr[0].dtype.scale; + outputDataFormat = attr[1].dtype.vx_type; + output_ZP = attr[1].dtype.zero_point; + scaleOut = attr[1].dtype.scale; + scaleDataFormat = attr[2].dtype.vx_type; + + if(outputDataFormat == VSI_NN_TYPE_UINT8) + { + reScaleOut_u8 = 1.0f / scaleOut; + reOutZP = (vx_float32)output_ZP; + } + iter = ((input_size[0] + 15) / 16) * 16; + sumInZp = input_ZP * iter * (-1); + tmpZp1 = (-2) * input_ZP; + tmpZp2 = iter * input_ZP * input_ZP; + e2InScale = scaleIn * scaleIn; + + input_size[2] = (input_dims <= 2)?1:input_size[2]; + + shaderParam.globalWorkOffset[0] = 0; + shaderParam.globalWorkOffset[1] = 0; + shaderParam.globalWorkOffset[2] = 0; + shaderParam.globalWorkScale[0] = input_size[0]; + shaderParam.globalWorkScale[1] = 1; + shaderParam.globalWorkScale[2] = 1; + shaderParam.globalWorkSize[0] = 1; + shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1], 4); + shaderParam.globalWorkSize[2] = input_size[2]; + + status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + if(status < 0) + { + VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__); + } + { + vx_float32 dimRatio = 1.0f / (vx_float32)input_size[0]; + vx_uint32 uniFp16SumSqr_dp8x2[16] = { + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 UniFP16toFP32Lo4_dp4x4[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }; + vx_uint32 uniExtractHalf4_dp4x4[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }; + vx_uint32 uniConvertSecFp16Fp32_4x4[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }; + vx_uint32 uniSumU8_16x1[16] = { + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }; + vx_uint32 uniSqrSum_16x1[16] = { + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x55555555, // BSelt + 0x76543210, 0xfedcba98, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 uniConvert1stUint8SubZpToFp32_4x4[16] = { + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }; + vx_uint32 uniConvert2ndUint8SubZpToFp32_4x4[16] = { + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }; + vx_uint32 uniConvert3rdUint8SubZpToFp32_4x4[16] = { + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00090008, 0x000b000a, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }; + vx_uint32 uniConvert4thUint8SubZpToFp32_4x4[16] = { + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x000d000c, 0x000f000e, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }; + vx_uint32 uniConvertInt32toUint8_2x8[16] = { + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vx_uint32 UniPackFP16even_2x8[16] = { + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }; + if (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_FLOAT16 + && scaleDataFormat == VSI_NN_TYPE_FLOAT16) + { + status = vxSetNodeUniform(nodObj, "width", 1, &input_size[0]); + status |= vxSetNodeUniform(nodObj, "dimRatio", 1, &dimRatio); + status |= vxSetNodeUniform(nodObj, "UniFP16toFP32Lo4_dp4x4", 1, UniFP16toFP32Lo4_dp4x4); + status |= vxSetNodeUniform(nodObj, "uniConvertSecFp16Fp32_4x4", 1, uniConvertSecFp16Fp32_4x4); + status |= vxSetNodeUniform(nodObj, "uniSumU8_16x1", 1, uniSumU8_16x1); + status |= vxSetNodeUniform(nodObj, "uniSqrSum_16x1", 1, uniSqrSum_16x1); + status |= vxSetNodeUniform(nodObj, "uniConvert1stUint8SubZpToFp32_4x4", 1, uniConvert1stUint8SubZpToFp32_4x4); + status |= vxSetNodeUniform(nodObj, "uniConvert2ndUint8SubZpToFp32_4x4", 1, uniConvert2ndUint8SubZpToFp32_4x4); + status |= vxSetNodeUniform(nodObj, "uniConvert3rdUint8SubZpToFp32_4x4", 1, uniConvert3rdUint8SubZpToFp32_4x4); + status |= vxSetNodeUniform(nodObj, "uniConvert4thUint8SubZpToFp32_4x4", 1, uniConvert4thUint8SubZpToFp32_4x4); + status |= vxSetNodeUniform(nodObj, "inputZP", 1, &input_ZP); + status |= vxSetNodeUniform(nodObj, "input_scale", 1, &scaleIn); + status |= vxSetNodeUniform(nodObj, "sumInZp", 1, &sumInZp); + status |= vxSetNodeUniform(nodObj, "tmpZp1", 1, &tmpZp1); + status |= 
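/* sumInZp, tmpZp1, tmpZp2 and e2InScale fold the uint8 affine parameters (x = scale * (q - zeroPoint)) into the per-row sum and sum-of-squares that the shader accumulates on raw quantized values */ 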
vxSetNodeUniform(nodObj, "tmpZp2", 1, &tmpZp2); + status |= vxSetNodeUniform(nodObj, "e2InScale", 1, &e2InScale); + status |= vxSetNodeUniform(nodObj, "UniPackFP16even_2x8", 1, UniPackFP16even_2x8); + } + else + { + status = vxSetNodeUniform(nodObj, "uniFp16SumSqr_dp8x2", 1, uniFp16SumSqr_dp8x2); + status |= vxSetNodeUniform(nodObj, "width", 1, &input_size[0]); + status |= vxSetNodeUniform(nodObj, "dimRatio", 1, &dimRatio); + status |= vxSetNodeUniform(nodObj, "UniFP16toFP32Lo4_dp4x4", 1, UniFP16toFP32Lo4_dp4x4); + status |= vxSetNodeUniform(nodObj, "uniExtractHalf4_dp4x4", 1, uniExtractHalf4_dp4x4); + status |= vxSetNodeUniform(nodObj, "uniConvertInt32toUint8_2x8", 1, uniConvertInt32toUint8_2x8); + status |= vxSetNodeUniform(nodObj, "uniConvertSecFp16Fp32_4x4", 1, uniConvertSecFp16Fp32_4x4); + status |= vxSetNodeUniform(nodObj, "uniSumU8_16x1", 1, uniSumU8_16x1); + status |= vxSetNodeUniform(nodObj, "uniSqrSum_16x1", 1, uniSqrSum_16x1); + status |= vxSetNodeUniform(nodObj, "uniConvert1stUint8SubZpToFp32_4x4", 1, uniConvert1stUint8SubZpToFp32_4x4); + status |= vxSetNodeUniform(nodObj, "uniConvert2ndUint8SubZpToFp32_4x4", 1, uniConvert2ndUint8SubZpToFp32_4x4); + status |= vxSetNodeUniform(nodObj, "uniConvert3rdUint8SubZpToFp32_4x4", 1, uniConvert3rdUint8SubZpToFp32_4x4); + status |= vxSetNodeUniform(nodObj, "uniConvert4thUint8SubZpToFp32_4x4", 1, uniConvert4thUint8SubZpToFp32_4x4); + status |= vxSetNodeUniform(nodObj, "inputZP", 1, &input_ZP); + status |= vxSetNodeUniform(nodObj, "output_ZP", 1, &output_ZP); + status |= vxSetNodeUniform(nodObj, "input_scale", 1, &scaleIn); + status |= vxSetNodeUniform(nodObj, "outputScale", 1, &reScaleOut_u8); + status |= vxSetNodeUniform(nodObj, "outputZP", 1, &reOutZP); + status |= vxSetNodeUniform(nodObj, "sumInZp", 1, &sumInZp); + status |= vxSetNodeUniform(nodObj, "tmpZp1", 1, &tmpZp1); + status |= vxSetNodeUniform(nodObj, "tmpZp2", 1, &tmpZp2); + status |= vxSetNodeUniform(nodObj, "e2InScale", 1, &e2InScale); + } + if(status < 0) + { + VSILOGE("[%s : %d]Initializer failure! 
\n",__FILE__, __LINE__); + } + } + return status; +} +static vx_param_description_t vxLayerNormKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t vxLayerNormKernelInfo = +{ + VX_KERNEL_ENUM_LAYERNORM, + VX_KERNEL_NAME_LAYERNORM, + NULL, + vxLayerNormKernelParam, + (sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxLayerNormInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxLayerNormKernelInfo_u8 = +{ + VX_KERNEL_ENUM_LAYERNORM, + VX_KERNEL_NAME_LAYERNORM_UINT8, + NULL, + vxLayerNormKernelParam, + (sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxLayerNormInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxLayerNormKernelInfo_FP16toU8 = +{ + VX_KERNEL_ENUM_LAYERNORM_FP16TOU8, + VX_KERNEL_NAME_LAYERNORM_FP16TOU8, + NULL, + vxLayerNormKernelParam, + (sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxLayerNormInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxLayerNormKernelInfo_U8toFP16 = +{ + VX_KERNEL_ENUM_LAYERNORM, + VX_KERNEL_NAME_LAYERNORM_U8TOFP16, + NULL, + vxLayerNormKernelParam, + (sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxLayerNormInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxLayerNormKernelInfo_CPU = +{ + VX_KERNEL_ENUM_LAYERNORM, + VX_KERNEL_NAME_LAYERNORM, + vxLayerNormKernel, + vxLayerNormKernelParam, + (sizeof(vxLayerNormKernelParam) / sizeof(vxLayerNormKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_LAYERNORM_list[] = +{ + &vxLayerNormKernelInfo_CPU, + &vxLayerNormKernelInfo, + &vxLayerNormKernelInfo_u8, + &vxLayerNormKernelInfo_FP16toU8, + &vxLayerNormKernelInfo_U8toFP16, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c new file mode 100644 index 0000000..fa478d0 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_reduce.c @@ -0,0 +1,190 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_VAR (vx_kernel_REDUCE) +#define _VX_KERNEL_ID (VX_KERNEL_ENUM_REDUCE) +#define _VX_KERNEL_NAME ("vsi_nn_kernel_reduce") +#define _VX_KERNEL_FUNC_KERNEL (vxReduceKernel) + +static vx_status VX_CALLBACK vxReduceKernel + ( + vx_node node, + const vx_reference* paramObj, + vx_uint32 paramNum + ) +{ + /* TODO: */ +#define ARG_NUM (6) +#define TENSOR_NUM_INPUT (1) +#define TENSOR_NUM_OUTPUT (1) +#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) + + vx_status status = VX_SUCCESS; + vx_context context = NULL; + vsi_nn_tensor_attr_t attr[TENSOR_NUM]; + vx_uint32 stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM]; + vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL}; + vx_uint8 *buffer_ptr[TENSOR_NUM] = {NULL}; + vx_tensor tensor[TENSOR_NUM]; + + vx_float32 factor0; + vx_int32 factor; + vx_uint32 batch, c, h, w; + vx_uint32 i, j, k, b; + + //prepare data + context = vxGetContext((vx_reference)node); + + for( i = 0; i < TENSOR_NUM_INPUT; i ++ ) + { + tensor[i] = (vx_tensor)paramObj[i]; + buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], + &(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY); + } + for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) + { + tensor[i] = (vx_tensor)paramObj[i]; + buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], + &(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY); + } + + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(factor0), VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + //op calc + if (factor0 > 1) + { + factor = (vx_int32)(factor0 + 0.5); + w = attr[0].size[0]; + h = attr[0].size[1]; + c = attr[0].size[2]; + batch = 1; + for(b = 0; b < batch; ++b){ + for(k = 0; k < c; ++k){ + for(j = 0; j < h*factor; ++j){ + for(i = 0; i < w*factor; ++i){ + vx_int32 in_index = b*w*h*c + k*w*h + (j/factor)*w + i/factor; + vx_int32 out_index = b*w*h*c*factor*factor + k*w*h*factor*factor + + j*w*factor + i; + vx_float32 fval; + //out[out_index] = in[in_index]; + vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], + &fval, &attr[0].dtype); + vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index], + &attr[1].dtype); + } + } + } + } + } + else + { + factor = (vx_int32)(1 / factor0 + 0.5); + w = attr[1].size[0]; + h = attr[1].size[1]; + c = attr[1].size[2]; + batch = 1; + for(b = 0; b < batch; ++b){ + for(k = 0; k < c; ++k){ + for(j = 0; j < h; ++j){ + for(i = 0; i < w; ++i){ + vx_int32 in_index = b*w*h*c*factor*factor + + k*w*h*factor*factor + j*w*factor*factor + i*factor; + vx_int32 out_index = b*w*h*c + k*w*h + j * w + i; + vx_float32 fval; + //out[out_index] = in[in_index]; + 
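/* strided downsample: keep one sample per factor x factor block, converting through float32 so any input/output dtype pair is handled */ 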
vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], &fval, + &attr[0].dtype); + vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index], + &attr[1].dtype); + } + } + } + } + } + + //save data + for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) + { + status = vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY); + if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i])); + } + for( i = 0; i < TENSOR_NUM; i ++ ) + { + if (buffer_ptr[i]) free(buffer_ptr[i]); + } + return status; +} /* _VX_KERNEL_FUNC_KERNEL() */ + +static vx_param_description_t s_params[] = +{ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, +}; + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t _VX_KERNEL_VAR = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + _VX_KERNEL_FUNC_KERNEL, + s_params, + _cnt_of_array( s_params ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_REDUCE_list[] = +{ + &_VX_KERNEL_VAR, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_resize.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_resize.c new file mode 100644 index 0000000..ef9a073 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_resize.c @@ -0,0 +1,283 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include + +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_VAR (vx_kernel_RESIZE) +#define _VX_KERNEL_ID (VX_KERNEL_ENUM_RESIZE) +#define _VX_KERNEL_NAME ("vsi_nn_kernel_resize") +#define _VX_KERNEL_FUNC_KERNEL (vxResizeKernel) + +static vsi_status VX_CALLBACK vxResizeKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ + /* TODO: */ +#define ARG_NUM (1) +#define TENSOR_NUM_INPUT (1) +#define TENSOR_NUM_OUTPUT (1) +#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) + + vsi_status status = VX_SUCCESS; + vx_context context = NULL; + vsi_nn_tensor_attr_t attr[TENSOR_NUM]; + uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM]; + vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL}; + uint8_t *buffer_ptr[TENSOR_NUM] = {NULL}; + vx_tensor tensor[TENSOR_NUM]; + + float factor0; + int32_t factor; + uint32_t batch, c, h, w; + uint32_t i, j, k, b; + + //prepare data + context = vxGetContext((vx_reference)node); + + for( i = 0; i < TENSOR_NUM_INPUT; i ++ ) + { + tensor[i] = (vx_tensor)paramObj[i]; + buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], + &(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY); + } + for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) + { + tensor[i] = (vx_tensor)paramObj[i]; + buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], + &(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY); + } + + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(factor0), VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + //op calc + if (factor0 > 1) + { + factor = (int32_t)(factor0 + 0.5); + w = attr[0].size[0]; + h = attr[0].size[1]; + c = attr[0].size[2]; + batch = 1; + for(b = 0; b < batch; ++b){ + for(k = 0; k < c; ++k){ + for(j = 0; j < h*factor; ++j){ + for(i = 0; i < w*factor; ++i){ + int32_t in_index = b*w*h*c + k*w*h + (j/factor)*w + i/factor; + int32_t out_index = b*w*h*c*factor*factor + k*w*h*factor*factor + + j*w*factor + i; + float fval; + //out[out_index] = in[in_index]; + vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], + &fval, &attr[0].dtype); + vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index], + &attr[1].dtype); + } + } + } + } + } + else + { + factor = (int32_t)(1 / factor0 + 0.5); + w = attr[1].size[0]; + h = attr[1].size[1]; + c = attr[1].size[2]; + batch = 1; + for(b = 0; b < batch; ++b){ + for(k = 0; k < c; ++k){ + for(j = 0; j < h; ++j){ + for(i = 0; i < w; ++i){ + int32_t in_index = b*w*h*c*factor*factor + + k*w*h*factor*factor + j*w*factor*factor + i*factor; + int32_t out_index = b*w*h*c + k*w*h + j * w + i; + float fval; + //out[out_index] = in[in_index]; + vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], &fval, + &attr[0].dtype); + vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index], + &attr[1].dtype); + } + } + } + } + } + + //save data + for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) + { + status = vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY); + if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i])); + } + for( i = 0; i < TENSOR_NUM; i ++ ) + { + if (buffer_ptr[i]) free(buffer_ptr[i]); + } + return status; 
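/* CPU reference for RESIZE: integer-factor nearest-neighbour scaling (replicate each element when factor0 > 1, strided pick otherwise) */ 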
+} /* _VX_KERNEL_FUNC_KERNEL() */ + +static vx_param_description_t s_params[] = +{ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, +}; + +vsi_status VX_CALLBACK vxTensorResizeInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + uint32_t paraNum + ) +{ + // Alignment with a power of two value. +#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) + vx_kernel_execution_parameters_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + uint32_t uniPackEvenData_2x8[16] = { + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00003400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + vsi_status status = VX_SUCCESS; + + vx_tensor input = (vx_tensor)paramObj[0]; + uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1}; + vsi_nn_tensor_attr_t attr; + uint32_t i, input_dim; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + status = vsi_nn_vxGetTensorAttr(input, &attr); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); + return status; + } + input_dim = attr.dim_num; + for (i = 0; i < input_dim; i++) + { + input_size[i] = attr.size[i]; + } + + shaderParam.globalWorkOffset[0] = 0; + shaderParam.globalWorkOffset[1] = 0; + shaderParam.globalWorkScale[0] = 16; + shaderParam.globalWorkScale[1] = 2; + shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], 4); + shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1]; + + vxSetNodeUniform(nodObj, "uniPackEvenData_2x8", 1, uniPackEvenData_2x8); + + status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + + return VX_SUCCESS; +} + +static vx_param_description_t vxTensorResizeKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t _VX_KERNEL_VAR = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + _VX_KERNEL_FUNC_KERNEL, + s_params, + _cnt_of_array( s_params ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxTensorResize16BitsDownSampleQuarterKernelInfo = +{ + VX_KERNEL_ENUM_RESIZE_16BITS_DOWNSAMPLE_QUARTER, + VX_KERNEL_NAME_RESIZE_16BITS_DOWNSAMPLE_QUARTER, + NULL, + vxTensorResizeKernelParam, + (sizeof(vxTensorResizeKernelParam) / sizeof(vxTensorResizeKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxTensorResizeInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxTensorResize8BitsDownSampleQuarterKernelInfo = +{ + VX_KERNEL_ENUM_RESIZE_8BITS_DOWNSAMPLE_QUARTER, + VX_KERNEL_NAME_RESIZE_8BITS_DOWNSAMPLE_QUARTER, + NULL, + vxTensorResizeKernelParam, + (sizeof(vxTensorResizeKernelParam) / 
sizeof(vxTensorResizeKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxTensorResizeInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_RESIZE_list[] = +{ + &_VX_KERNEL_VAR, + &vxTensorResize16BitsDownSampleQuarterKernelInfo, + &vxTensorResize8BitsDownSampleQuarterKernelInfo, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c new file mode 100644 index 0000000..0287f19 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_roi_align.c @@ -0,0 +1,317 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_VAR (vx_kernel_ROI_ALIGN) +#define _VX_KERNEL_ID (VX_KERNEL_ENUM_ROI_ALIGN) +#define _VX_KERNEL_NAME (VX_KERNEL_NAME_ROI_ALIGN) +#define _VX_KERNEL_FUNC_KERNEL (vxRoi_alignKernel) + +static vsi_status VX_CALLBACK vxRoi_alignKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ +#define ARG_NUM (6) +#define TENSOR_NUM_INPUT (3) +#define TENSOR_NUM_OUTPUT (1) +#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) + + vsi_status status = VSI_FAILURE; + vx_context context = NULL; + vx_tensor input[TENSOR_NUM_INPUT] = {0}; + vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; + float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; + int32_t* int32_in_buffer[TENSOR_NUM_INPUT] = {0}; + float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; + vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; + vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; + uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; + uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; + + int32_t output_height; + int32_t output_width; + float height_ratio; + float width_ratio; + int32_t height_sample_num; + int32_t width_sample_num; + + uint32_t i = 0; + for(i = 0; i < TENSOR_NUM_INPUT; i++) + { + memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); + } + /* prepare data */ + context = vxGetContext((vx_reference)node); + + for(i = 0; i < TENSOR_NUM_INPUT; i ++) + { + input[i] = (vx_tensor)paramObj[i]; + status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]); + TEST_CHECK_STATUS(status, final); + in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); + if (i == 2) + { + int32_in_buffer[i] = (int32_t *)vsi_nn_vxCopyTensorToData(context, + input[i], &in_attr[i]); + } + else + { + f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); + status = vsi_nn_vxConvertTensorToFloat32Data( + context, input[i], &in_attr[i], f32_in_buffer[i], + in_elements[i] * sizeof(float)); + TEST_CHECK_STATUS(status, final); + } + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) + { + output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; + status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); + TEST_CHECK_STATUS(status, final); + out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); + f32_out_buffer[i]= (float *)malloc(out_elements[i] * sizeof(float)); + memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float)); + } + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(output_height), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(output_width), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 2], &(height_ratio), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 3], &(width_ratio), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 4], &(height_sample_num), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 5], &(width_sample_num), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + /* 
TODO: Add CPU kernel implement */ + { + uint32_t n, j, k; + uint32_t kRoiDim = 4; + float heightScale = 1.0f / height_ratio; + float widthScale = 1.0f / width_ratio; + uint32_t inHeight = in_attr[0].size[2]; + uint32_t inWidth = in_attr[0].size[1]; + uint32_t inDepth = in_attr[0].size[0]; + uint32_t numRois = in_attr[1].size[1]; + uint32_t outHeight = out_attr[0].size[2]; + uint32_t outWidth = out_attr[0].size[1]; + uint32_t out_index = 0; + + for(n = 0; n < numRois; n++) + { + uint32_t batchId = int32_in_buffer[2][n]; + float scale = (in_attr[1].dtype.vx_type == VSI_NN_TYPE_UINT16) ? 0.125f : 1.0f; + float wRoiStart = f32_in_buffer[1][n * kRoiDim] * widthScale * scale; + float hRoiStart = f32_in_buffer[1][n * kRoiDim + 1] * heightScale * scale; + float wRoiEnd = f32_in_buffer[1][n * kRoiDim + 2] * widthScale * scale; + float hRoiEnd = f32_in_buffer[1][n * kRoiDim + 3] * heightScale * scale; + + float roiWidth = vsi_nn_max((wRoiEnd - wRoiStart), 1.0f); + float roiHeight = vsi_nn_max((hRoiEnd - hRoiStart), 1.0f); + float wStepSize = roiWidth / outWidth; + float hStepSize = roiHeight / outHeight; + + uint32_t wSamplingRatio = width_sample_num > 0 + ? width_sample_num : (uint32_t)ceil(wStepSize); + uint32_t hSamplingRatio = height_sample_num > 0 + ? height_sample_num : (uint32_t)ceil(hStepSize); + int32_t numSamplingPoints = wSamplingRatio * hSamplingRatio; + float wBinSize = wStepSize / (float)(wSamplingRatio); + float hBinSize = hStepSize / (float)(hSamplingRatio); + + int32_t batch_base_index = batchId * inHeight * inWidth * inDepth; + + for (i = 0; i < outHeight; i++) + { + for (j = 0; j < outWidth; j++) + { + float wStart = wStepSize * j + wRoiStart; + float wEnd = wStepSize * (j + 1) + wRoiStart; + float hStart = hStepSize * i + hRoiStart; + float hEnd = hStepSize * (i + 1) + hRoiStart; + + float x,y; + for (y = hStart + hBinSize / 2; y < hEnd; y += hBinSize) + { + for (x = wStart + wBinSize / 2; x < wEnd; x += wBinSize) + { + uint32_t x1 = (uint32_t)floor(x); + uint32_t y1 = (uint32_t)floor(y); + uint32_t x2 = x1 + 1, y2 = y1 + 1; + float dx1 = x - (float)(x1); + float dy1 = y - (float)(y1); + if (x1 >= inWidth - 1) { + x1 = x2 = inWidth - 1; + dx1 = 0; + } + if (y1 >= inHeight - 1) { + y1 = y2 = inHeight - 1; + dy1 = 0; + } + { + float dx2 = 1.0f - dx1, dy2 = 1.0f - dy1; + float ws[] = {dx2 * dy2, dx1 * dy2, + dx2 * dy1, dx1 * dy1}; + uint32_t offsets[] = {y1 * inWidth * inDepth + x1 * inDepth, + y1 * inWidth * inDepth + x2 * inDepth, + y2 * inWidth * inDepth + x1 * inDepth, + y2 * inWidth * inDepth + x2 * inDepth}; + for (k = 0; k < inDepth; k++) { + float interpolation = 0; + uint32_t c; + for (c = 0; c < 4; c++) + { + interpolation += ws[c] + * f32_in_buffer[0][batch_base_index + offsets[c] + k]; + } + f32_out_buffer[0][out_index + k] += interpolation; + } + } + } + } + for (k = 0; k < inDepth; k++) + { + f32_out_buffer[0][out_index + k] /= (float)(numSamplingPoints); + } + out_index += inDepth; + } + } + } + } + + /* save data */ + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + status = vsi_nn_vxConvertFloat32DataToTensor( + context, output[i], &out_attr[i], f32_out_buffer[i], + out_elements[i] * sizeof(float)); + TEST_CHECK_STATUS(status, final); + } + +final: + for (i = 0; i < TENSOR_NUM_INPUT; i++) + { + if (f32_in_buffer[i]) free(f32_in_buffer[i]); + if (int32_in_buffer[i]) free(int32_in_buffer[i]); + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + if (f32_out_buffer[i]) free(f32_out_buffer[i]); + } + return status; +} /* _VX_KERNEL_FUNC_KERNEL() */ + +static 
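/* parameters: feature map, ROI boxes (x1, y1, x2, y2), batch indices, output tensor, then out_h, out_w, height_ratio, width_ratio, height_sample_num, width_sample_num scalars */ 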
vx_param_description_t vxRoi_alignKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +vx_status VX_CALLBACK vxRoi_alignInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + vx_uint32 paraNum + ) +{ + vx_status status = VX_SUCCESS; + /*TODO: Add initial code for VX program*/ + + return status; +} + + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t vxRoi_align_CPU = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + _VX_KERNEL_FUNC_KERNEL, + vxRoi_alignKernelParam, + _cnt_of_array( vxRoi_alignKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxRoi_align_VX = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + NULL, + vxRoi_alignKernelParam, + _cnt_of_array( vxRoi_alignKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vxRoi_alignInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_ROI_ALIGN_list[] = +{ + &vxRoi_align_CPU, + &vxRoi_align_VX, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_scale.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_scale.c new file mode 100644 index 0000000..d97517e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_scale.c @@ -0,0 +1,410 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_VAR (vx_kernel_SCALE) +#define _VX_KERNEL_ID (VX_KERNEL_ENUM_SCALE) +#define _VX_KERNEL_NAME ("vsi_nn_kernel_scale") +#define _VX_KERNEL_FUNC_KERNEL (vxScaleKernel) + +static vsi_status VX_CALLBACK vxScaleKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ + vsi_status status = VX_ERROR_INVALID_PARAMETERS; + + if( 6 == paramNum ) + { + vx_context context = NULL; + vx_tensor input_tensor = NULL; + vx_tensor scale_tensor = NULL; + vx_tensor bias_tensor = NULL; + vx_tensor output_tensor = NULL; + uint8_t * input_buffer = NULL; + uint8_t * scale_buffer = NULL; + uint8_t * bias_buffer = NULL; + uint8_t * output_buffer = NULL; + vx_scalar axis_scalar = NULL; + vx_scalar has_bias_scalar = NULL; + int axis = 1; + float has_bias = 0; + uint32_t input_dims = 0; + uint32_t scale_dims = 0; + uint32_t bias_dims = 0; + uint32_t output_dims = 0; + vsi_enum inputFormat = VSI_NN_TYPE_FLOAT16; + vsi_enum scaleFormat = VSI_NN_TYPE_FLOAT16; + vsi_enum biasFormat = VSI_NN_TYPE_FLOAT32; + vsi_enum outputFormat = VSI_NN_TYPE_FLOAT16; + uint32_t input_size[4] = {1, 1, 1, 1}; + uint32_t scale_size[4] = {1, 1, 1, 1}; + uint32_t bias_size[4] = {1, 1, 1, 1}; + uint32_t output_size[4] = {1, 1, 1, 1}; + uint32_t input_stride_size[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t output_stride_size[VSI_NN_MAX_DIM_NUM] = { 0 }; + vx_tensor_addressing input_user_addr = NULL; + vx_tensor_addressing scale_user_addr = NULL; + vx_tensor_addressing bias_user_addr = NULL; + vx_tensor_addressing output_user_addr = NULL; + vsi_nn_tensor_attr_t out_attr; + + status = VX_SUCCESS; + + memset(&out_attr, 0x0, sizeof(vsi_nn_tensor_attr_t)); + + input_tensor = (vx_tensor)paramObj[0]; + scale_tensor = (vx_tensor)paramObj[1]; + bias_tensor = (vx_tensor)paramObj[2]; + output_tensor = (vx_tensor)paramObj[3]; + axis_scalar = (vx_scalar)paramObj[4]; + has_bias_scalar = (vx_scalar)paramObj[5]; + + context = vxGetContext((vx_reference)node); + if( NULL == context) + { + VSILOGE("vxGetContext failure!\n"); + status = VX_FAILURE; + goto OnError; + } + + input_buffer = vsi_nn_ConvertRawTensorToData(context, input_tensor, + &input_dims, &inputFormat, input_size, input_stride_size, + &input_user_addr, VX_READ_ONLY); + if( NULL == input_buffer ) + { + VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n"); + status = VX_ERROR_NO_MEMORY; + goto OnError; + } + + scale_buffer = vsi_nn_ConvertRawTensorToData(context, scale_tensor, + &scale_dims, &scaleFormat, scale_size, input_stride_size, + &scale_user_addr, VX_READ_ONLY); + if( NULL == scale_buffer ) + { + VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n"); + status = VX_ERROR_NO_MEMORY; + goto OnError; + } + + bias_buffer = vsi_nn_ConvertRawTensorToData(context, bias_tensor, + &bias_dims, &biasFormat, bias_size, input_stride_size, + &bias_user_addr, VX_READ_ONLY); + if( NULL == bias_buffer ) + { + VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n"); + status = VX_ERROR_NO_MEMORY; + goto OnError; + } + + output_buffer = vsi_nn_ConvertRawTensorToData(context, output_tensor, + &output_dims, &outputFormat, output_size, 
output_stride_size, + &output_user_addr, VX_WRITE_ONLY); + if( NULL == output_buffer ) + { + VSILOGE("vsi_nn_ConvertRawTensorToData failure!\n"); + status = VX_ERROR_NO_MEMORY; + goto OnError; + } + + status = vsi_nn_vxGetTensorAttr(output_tensor, &out_attr); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); + goto OnError; + } + + status = vxCopyScalar(axis_scalar, &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if( VX_SUCCESS != status) + { + VSILOGE("vxCopyScalar axis failure! status:%d\n", status); + goto OnError; + } + status = vxCopyScalar(has_bias_scalar, &has_bias, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if( VX_SUCCESS != status ) + { + VSILOGE("vxCopyScalar axis failure! has_bias:%f\n", has_bias); + goto OnError; + } + + if( input_dims != output_dims ) + { + VSILOGE("Invalid parameters, input_dims output_dims mismatch %d:%d\n", + input_dims, output_dims); + status = VX_ERROR_INVALID_PARAMETERS; + goto OnError; + } + if( input_size[0] != scale_size[0] || input_size[0] != bias_size[0] ) + { + VSILOGE("Invalid parameters, input size mismatch %d:%d:%d\n", + input_size[0], scale_size[0], bias_size[0]); + status = VX_ERROR_INVALID_PARAMETERS; + goto OnError; + } + { + uint32_t i = 0; + uint32_t j = 0; + uint32_t fixed_num = 1; + uint32_t changed_num = 1; + + fixed_num = input_size[1] * input_size[2] * input_size[3]; + changed_num = input_size[0]; + + for( i = 0; i < fixed_num; i++ ) + { + int16_t* cur_input_row_ofst = ((int16_t *)input_buffer) + i * changed_num; + int16_t* cur_scale_row_ofst = ((int16_t *)scale_buffer); + float* cur_bias_row_ofst = ((float *)bias_buffer); + int16_t* cur_output_row_ofst = ((int16_t *)output_buffer) + i * changed_num; + + for( j = 0; j < changed_num; j++ ) + { + float cur_input_v = vsi_nn_Fp16ToFp32(*(cur_input_row_ofst + j)); + float cur_scale_v = vsi_nn_Fp16ToFp32(*(cur_scale_row_ofst + j)); + float cur_bias_v = *(cur_bias_row_ofst + j); + + float cur_result = cur_input_v * cur_scale_v + cur_bias_v; + *(cur_output_row_ofst + j) = vsi_nn_Fp32ToFp16(cur_result); + } + } + +#if defined(_SAVE_TENSOR) + { + static int count = 0; + char fname[256] = { 0 }; + sprintf(fname, "scale_output_tensor.%d.axis.%d.txt", count, axis); + vsi_nn_SaveDataToText(fname, output_buffer, + vsi_nn_ShapeProduct(output_size, output_dims), VSI_NN_TYPE_FLOAT16, NULL); + count++; + } +#endif + } + status = vsi_nn_vxCopyDataToTensor(context, output_tensor, &out_attr, output_buffer); + TEST_CHECK_STATUS(status, OnError); +OnError: + if( NULL != input_buffer ) + { + free( input_buffer ); + input_buffer = NULL; + } + if( NULL != scale_buffer ) + { + free( scale_buffer ); + scale_buffer = NULL; + } + if( NULL != bias_buffer ) + { + free( bias_buffer ); + bias_buffer = NULL; + } + if( NULL != output_buffer ) + { + free( output_buffer ); + output_buffer = NULL; + } + + if (input_user_addr) + { + vxReleaseTensorAddressing(&input_user_addr); + } + if (scale_user_addr) + { + vxReleaseTensorAddressing(&scale_user_addr); + } + if (bias_user_addr) + { + vxReleaseTensorAddressing(&bias_user_addr); + } + if (output_user_addr) + { + vxReleaseTensorAddressing(&output_user_addr); + } + + } + + return status; +} /* _VX_KERNEL_FUNC_KERNEL() */ + +static vx_param_description_t s_params[] = +{ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, 
VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, +}; + +vsi_status VX_CALLBACK vxScaleInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + uint32_t paraNum + ) +{ + // Alignment with a power of two value. +#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) + vx_kernel_execution_parameters_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + uint32_t uniExtractHalf8_2x8[16] = { + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }; + uint32_t uniFp16MulFp16ToFp32_Lo_4x4[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x01010101, // BSelt + 0x00010000, 0x00030002, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + uint32_t uniFp16MulFp16ToFp32_Hi_4x4[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x01010101, // BSelt + 0x00050004, 0x00070006, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + + vsi_status status = VX_SUCCESS; + + vx_tensor input = (vx_tensor)paramObj[0]; + uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1}; + vx_uint32 i = 0; + vsi_nn_tensor_attr_t attr; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + status = vsi_nn_vxGetTensorAttr(input, &attr); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); + return status; + } + for (i = 0; i < attr.dim_num; i++) + { + input_size[i] = attr.size[i]; + } + + shaderParam.globalWorkOffset[0] = 0; + shaderParam.globalWorkOffset[1] = 0; + shaderParam.globalWorkScale[0] = 8; + shaderParam.globalWorkScale[1] = 1; + shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], 4); + shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1]; + + vxSetNodeUniform(nodObj, "uniExtractHalf8_2x8", 1, uniExtractHalf8_2x8); + vxSetNodeUniform(nodObj, "uniFp16MulFp16ToFp32_Lo_4x4", 1, uniFp16MulFp16ToFp32_Lo_4x4); + vxSetNodeUniform(nodObj, "uniFp16MulFp16ToFp32_Hi_4x4", 1, uniFp16MulFp16ToFp32_Hi_4x4); + + status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + + return VX_SUCCESS; +} + +static vx_param_description_t vxScaleKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t _VX_KERNEL_VAR = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + _VX_KERNEL_FUNC_KERNEL, + s_params, + _cnt_of_array( s_params ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxScaleKernelInfo = +{ + VX_KERNEL_ENUM_SCALE, + VX_KERNEL_NAME_SCALE_FP16, + NULL, + vxScaleKernelParam, + (sizeof(vxScaleKernelParam) / sizeof(vxScaleKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxScaleInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_SCALE_list[] = +{ + &_VX_KERNEL_VAR, + &vxScaleKernelInfo, + NULL +}; +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c new file mode 100644 index 0000000..acdc249 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_shufflechannel.c @@ -0,0 +1,345 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +vsi_status vxShuffleChannelFunc + ( + vx_context context, + vx_tensor input, + vx_tensor output, + int32_t group_number, + int32_t axis + ) +{ + vsi_status status = VX_SUCCESS; + vsi_nn_tensor_attr_t input_attr; + vsi_nn_tensor_attr_t output_attr; + uint8_t *in_data = NULL; + uint8_t *out_data = NULL; + uint32_t stride_size[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t buf_sz = 0; + uint32_t group_row = group_number; + uint32_t chs = 0, group_col = 0; + uint32_t len = 1, num = 1, feature_map_size = 1; + uint32_t n = 0, i = 0, j = 0; + uint32_t type_bytes = 0, len_bytes = 0, fms_bytes = 0; + + status = vsi_nn_vxGetTensorAttr(input, &input_attr); + status |= vsi_nn_vxGetTensorAttr(output, &output_attr); + TEST_CHECK_STATUS(status, final); + in_data = vsi_nn_vxCopyTensorToData(context, input, &input_attr); + TEST_CHECK_PTR(in_data, final); + buf_sz = vsi_nn_GetStrideSize(&output_attr, stride_size); + out_data = (uint8_t *)malloc( buf_sz ); + TEST_CHECK_PTR(out_data, final); + + chs = input_attr.size[axis]; + group_col = chs / group_row; + type_bytes = vsi_nn_TypeGetBytes( input_attr.dtype.vx_type ); + + for ( i = 0; i < (uint32_t)axis; i++) + { + len *= input_attr.size[i]; + } + for ( i = axis + 1; i < input_attr.dim_num; i++) + { + num *= input_attr.size[i]; + } + for ( i = 0; i <= (uint32_t)axis; i++) + { + feature_map_size *= input_attr.size[i]; + } + + /* Shuffle Channel CPU Implement, the shape and dtype of output must same as input */ + len_bytes = len * type_bytes; + fms_bytes = feature_map_size * type_bytes; + for ( n = 0; n < num; n++) + { + for ( i = 0; i < group_row; i++) + { + for ( j = 0; j < group_col; j++) + { + uint8_t *in_ptr = in_data + n * fms_bytes + (i * group_col + j) * len_bytes; + uint8_t *out_ptr = out_data + n * fms_bytes + (j * group_row + i) * len_bytes; + + memcpy(out_ptr, in_ptr, len_bytes); + } + } + } + + /* Copy data to output tensor */ + status = vsi_nn_vxCopyDataToTensor(context, output, &output_attr, out_data); + TEST_CHECK_STATUS(status, final); +final: + if (in_data) free(in_data); + if (out_data) free(out_data); + return status; +} +vsi_status VX_CALLBACK vxShuffleChannelKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ + vsi_status status = VX_ERROR_INVALID_PARAMETERS; + + if(paramNum == 4) + { + vx_context context = NULL; + // tensor + vx_tensor imgObj[2] = { NULL }; + // scalar + vx_scalar scalar[2] = { NULL }; + int32_t group_number = 0; + int32_t axis = 0; + + imgObj[0] = (vx_tensor)paramObj[0]; + imgObj[1] = (vx_tensor)paramObj[1]; + scalar[0] = (vx_scalar)paramObj[2]; + scalar[1] = (vx_scalar)paramObj[3]; + + context = vxGetContext((vx_reference)node); + TEST_CHECK_PTR(context,final); + // scalar + status = vxCopyScalar(scalar[0], &group_number, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + TEST_CHECK_STATUS(status, final); + status = vxCopyScalar(scalar[1], &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + TEST_CHECK_STATUS(status, final); + + // Call C Prototype + status = vxShuffleChannelFunc(context, imgObj[0], imgObj[1], group_number, axis); + TEST_CHECK_STATUS(status, final); + } +final: + 
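/* tensor copies and buffer cleanup happen inside vxShuffleChannelFunc; only the status is propagated here */ 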
return status; +} +vsi_status VX_CALLBACK vxShuffleChannelInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + uint32_t paraNum + ) +{ + vsi_status status = VX_SUCCESS; + // Alignment with a power of two value. +#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) + vx_kernel_execution_parameters_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vx_tensor input = (vx_tensor)paramObj[0]; + vx_scalar group_numbers = (vx_scalar)paramObj[2]; + vx_scalar axis_s = (vx_scalar)paramObj[3]; + uint32_t input_size[4] = {1, 1, 1, 1}; + vsi_nn_type_e inputDataFormat = VSI_NN_TYPE_FLOAT16; + int32_t group_number = 0; + int32_t axis = 0; + int32_t group_column = 0; + float rgroup_column = 0.0f; + uint32_t chs = 0; + vx_uint32 i = 0; + vsi_nn_tensor_attr_t attr; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + status = vsi_nn_vxGetTensorAttr(input, &attr); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); + return status; + } + for (i = 0; i < attr.dim_num; i++) + { + input_size[i] = attr.size[i]; + } + inputDataFormat = attr.dtype.vx_type; + + status |= vxCopyScalar(group_numbers, &group_number, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + status |= vxCopyScalar(axis_s, &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if(VX_SUCCESS != status) + { + VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__); + return status; + } + chs = input_size[axis]; + if (chs % group_number) + { + VSILOGE("input channel can't be exact divided by group number! at line %d\n", __LINE__); + return VX_FAILURE; + } + + shaderParam.globalWorkOffset[0] = 0; + shaderParam.globalWorkOffset[1] = 0; + shaderParam.globalWorkOffset[2] = 0; + if (axis == 2) + { + if (inputDataFormat == VSI_NN_TYPE_FLOAT16 || inputDataFormat == VSI_NN_TYPE_INT16) + shaderParam.globalWorkScale[0] = 8; + else + shaderParam.globalWorkScale[0] = 16; + shaderParam.globalWorkScale[1] = 4; + shaderParam.globalWorkScale[2] = 1; + + shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], 4); + shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1]; + shaderParam.globalWorkSize[2] = input_size[2]; + } + else if (axis == 1) + { + shaderParam.globalWorkScale[0] = 32; + shaderParam.globalWorkScale[1] = 1; + shaderParam.globalWorkScale[2] = 1; + + shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], 4); + shaderParam.globalWorkSize[1] = input_size[1]; + shaderParam.globalWorkSize[2] = input_size[2]; + } + else + { + VSILOGE("[%s : %d]Initializer failure, not support axis: %d! \n",__FILE__, __LINE__, axis); + return VX_FAILURE; + } + group_column = chs / group_number; + rgroup_column = 1.0f / group_column; + + status |= vxSetNodeUniform(nodObj, "group_column", 1, &group_column); + status |= vxSetNodeUniform(nodObj, "rgroup_column", 1, &rgroup_column); + status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + + if(status < 0) + { + VSILOGE("[%s : %d]Initializer failure! 
\n",__FILE__, __LINE__); + } + return status; +} +static vx_param_description_t vxShuffleChannelKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t vxShuffleChannelKernelInfo = +{ + VX_KERNEL_ENUM_SHUFFLECHANNEL, + VX_KERNEL_NAME_SHUFFLECHANNEL, + NULL, + vxShuffleChannelKernelParam, + (sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxShuffleChannelInitializer, + vsi_nn_KernelDeinitializer +}; +vx_kernel_description_t vxShuffleChannelKernelInfo8Bits = +{ + VX_KERNEL_ENUM_SHUFFLECHANNEL, + VX_KERNEL_NAME_SHUFFLECHANNEL8BITS, + NULL, + vxShuffleChannelKernelParam, + (sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxShuffleChannelInitializer, + vsi_nn_KernelDeinitializer +}; +vx_kernel_description_t vxShuffleChannelKernelInfo_CPU = +{ + VX_KERNEL_ENUM_SHUFFLECHANNEL, + VX_KERNEL_NAME_SHUFFLECHANNEL, + vxShuffleChannelKernel, + vxShuffleChannelKernelParam, + (sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; +vx_kernel_description_t vxShuffleChannelKernelInfo_16BitsAxis1 = +{ + VX_KERNEL_ENUM_SHUFFLECHANNEL, + VX_KERNEL_NAME_SHUFFLECHANNEL16BITS_AXIS1, + NULL, + vxShuffleChannelKernelParam, + (sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxShuffleChannelInitializer, + vsi_nn_KernelDeinitializer +}; +vx_kernel_description_t vxShuffleChannelKernelInfo_8BitsAxis1 = +{ + VX_KERNEL_ENUM_SHUFFLECHANNEL, + VX_KERNEL_NAME_SHUFFLECHANNEL8BITS_AXIS1, + NULL, + vxShuffleChannelKernelParam, + (sizeof(vxShuffleChannelKernelParam) / sizeof(vxShuffleChannelKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxShuffleChannelInitializer, + vsi_nn_KernelDeinitializer +}; +vx_kernel_description_t * vx_kernel_SHUFFLECHANNEL_list[] = +{ + &vxShuffleChannelKernelInfo_CPU, + &vxShuffleChannelKernelInfo, + &vxShuffleChannelKernelInfo8Bits, + &vxShuffleChannelKernelInfo_16BitsAxis1, + &vxShuffleChannelKernelInfo_8BitsAxis1, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c new file mode 100644 index 0000000..a473b6e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c @@ -0,0 +1,806 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define INPUT_FP16 0 +#define OUTPUT_FP16 0 + +vx_status getFactor(vx_uint32 data, vx_uint32 *factor, vx_uint32 minLimit, vx_uint32 maxLimit, vx_uint32 alignData) +{ + vx_uint32 i = 0; + vx_uint32 maxFactor = alignData - 1; + vx_status status = VX_FAILURE; + + for (i = minLimit; i <= maxLimit; i ++) + { + if (data % i == 0) + { + if (status == VX_FAILURE && data % i == 0) + { + *factor = i; + maxFactor = i; + status = VX_SUCCESS; + continue; + } + else if ((i % alignData) < (maxFactor % alignData)) + { + *factor = i; + maxFactor = i; + status = VX_SUCCESS; + } + } + } + + return status; +} + +void mySignalFrameFunc + ( + void* imgIn, + void* imgOut, + uint32_t input_dim, + uint32_t width, + uint32_t height, + uint32_t channel, + uint32_t batch, + uint32_t frame_len, // window size + uint32_t step, + uint32_t pad_end, + uint32_t pad_val, + uint32_t axis, + uint32_t *dstW, + uint32_t *dstH, + uint32_t *dstC, + uint32_t *dstB + ) +{ + uint8_t* tmpIn = (uint8_t*)imgIn; + uint8_t* tmpOut = (uint8_t*)imgOut; + + uint32_t i,j,k; + uint32_t size = 0; + uint32_t iter = 0; + + if(input_dim == 1) + { + if(axis != 0) + { + VSILOGE("error.\n"); + return; + } + *dstW = frame_len; + //*dstH = (len - frame_len) / step + 1; + *dstH = pad_end ? ((width + step - 1 ) / step) : ((width - frame_len) / step + 1); + *dstC = 1; + *dstB = 1; + + size = (*dstW) * sizeof(int16_t); + iter = pad_end ? width : (width - frame_len + 1); + if(pad_end) + { + int16_t* output = (int16_t*)tmpOut; + int16_t* input = (int16_t*)tmpIn; + uint32_t m = 0; + for(i = 0, j = 0; i < iter; i += step) + { + for(m = i; m < frame_len + i; m++) + { + if(m >= width) + { + output[j] = 0; + } + else + { + output[j] = input[m]; + } + j++; + } + } + } + else + { + for(i = 0, j = 0; i < iter; i += step, j++) + { + memcpy(tmpOut + j * size, tmpIn + i * sizeof(int16_t), size); + } + } + } + else if(input_dim == 2) + { + if(axis == 0) + { + uint8_t* src = tmpIn; + uint8_t* dst = tmpOut; + + *dstH = frame_len; + *dstW = width; + *dstC = pad_end ? ((height + step - 1) / step) : ((height - frame_len) / step + 1); + + *dstB = 1; + + size = width * frame_len * sizeof(int16_t); + iter = pad_end ? 
(height) : (height - frame_len + 1); + if(pad_end) + { + uint32_t m = 0; + size = width * sizeof(int16_t); + for(i = 0, j = 0; i < iter; i += step) + { + for(m = i; m < frame_len + i; m++) + { + if(m >= height) + { + memset(dst + j * size, 0, size); + } + else + { + memcpy(dst + j * size, src + m * width * sizeof(int16_t), size); + } + j++; + } + } + } + else + { + for(i = 0, j = 0; i < iter; i += step, j++) + { + memcpy(dst + j * size, src + i * width * sizeof(int16_t), size); + } + } + } + else if(axis == 1) + { + *dstW = frame_len; + + //*dstH = (len - frame_len) / step + 1; + *dstH = pad_end ? ((width + step - 1 ) / step) : ((width - frame_len) / step + 1); + + *dstC = height; + *dstB = 1; + + size = (*dstW) * sizeof(int16_t); + iter = pad_end ? width : (width - frame_len + 1); + if(pad_end) + { + for(k = 0; k < height; k++) + { + uint8_t* src = tmpIn + k * width * sizeof(int16_t); + uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t); + + int16_t* output = (int16_t*)dst; + int16_t* input = (int16_t*)src; + uint32_t m = 0; + for(i = 0, j = 0; i < iter; i += step) + { + for(m = i; m < frame_len + i; m++) + { + if(m >= width) + { + output[j] = 0; + } + else + { + output[j] = input[m]; + } + j++; + } + } + } + } + else + { + for(k = 0; k < height; k++) + { + uint8_t* src = tmpIn + k * width * sizeof(int16_t); + uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t); + + for(i = 0, j = 0; i < iter; i += step, j++) + { + memcpy(dst + j * size, src + i * sizeof(int16_t), size); + } + } + } + } + } + else if(input_dim == 3) + { + if(axis == 0) + { + uint8_t* src = tmpIn; + uint8_t* dst = tmpOut; + size = width * height * frame_len * sizeof(int16_t); + + *dstW = width; + *dstH = height; + *dstC = frame_len; + *dstB = pad_end ? ((channel + step - 1) / step) :((channel - frame_len) / step + 1); + iter = pad_end ? channel : (channel - frame_len + 1); + if(pad_end) + { + uint32_t m = 0; + size = width * height * sizeof(int16_t); + for(i = 0, j = 0; i < iter; i += step) + { + for(m = i; m < frame_len + i; m++) + { + if(m >= channel) + { + memset(dst + j * size, 0 , size); + } + else + { + memcpy(dst + j * size, src + m * width * height * sizeof(int16_t), size); + } + j++; + } + } + } + else + { + for(i = 0, j = 0; i < iter; i += step, j++) + { + memcpy(dst + j * size, src + i * width * height * sizeof(int16_t), size); + } + } + } + else if(axis == 1) + { + *dstH = frame_len; + *dstW = width; + *dstC = pad_end ? ((height + step - 1) / step) : ((height - frame_len) / step + 1); + *dstB = channel; + + size = width * frame_len * sizeof(int16_t); + iter = pad_end ? 
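mySignalFrameFunc slices the chosen axis into frames of `frame_len` samples taken every `step` samples; with `pad_end` set it emits ceil(len/step) frames and zero-fills samples past the end, otherwise it emits (len - frame_len)/step + 1 fully populated frames. A self-contained 1-D sketch of that rule (int16 data, illustrative sizes, not the library API):

```c
#include <stdio.h>
#include <stdint.h>

/* Standalone sketch of the 1-D framing rule used by mySignalFrameFunc above:
 * same frame count, zero padding only when pad_end is set. */
static uint32_t frame_1d(const int16_t *in, int16_t *out,
                         uint32_t len, uint32_t frame_len,
                         uint32_t step, int pad_end)
{
    uint32_t frames = pad_end ? (len + step - 1) / step
                              : (len - frame_len) / step + 1;
    for (uint32_t f = 0, start = 0; f < frames; f++, start += step)
        for (uint32_t k = 0; k < frame_len; k++)
            out[f * frame_len + k] = (start + k < len) ? in[start + k] : 0;
    return frames;
}

int main(void)
{
    int16_t in[6] = {1, 2, 3, 4, 5, 6}, out[3 * 4];
    uint32_t n = frame_1d(in, out, 6, 4, 2, 1);           /* pad_end: 3 frames of 4 */
    for (uint32_t f = 0; f < n; f++) {
        for (uint32_t k = 0; k < 4; k++) printf("%d ", out[f * 4 + k]);
        printf("\n");                                     /* 1 2 3 4 / 3 4 5 6 / 5 6 0 0 */
    }
    return 0;
}
```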
(height) : (height - frame_len + 1); + if(pad_end) + { + uint32_t m = 0; + size = width * sizeof(int16_t); + for(k = 0; k < channel; k++) + { + uint8_t* src = tmpIn + k * width * height* sizeof(int16_t); + uint8_t* dst = tmpOut + k * (*dstC) * (*dstW) * (*dstH) * sizeof(int16_t); + + for(i = 0, j = 0; i < iter; i += step) + { + for(m = i; m < frame_len + i; m++) + { + if(m >= height) + { + memset(dst + j * size, 0, size); + } + else + { + memcpy(dst + j * size, src + m * width * sizeof(int16_t), size); + } + j++; + } + } + } + } + else + { + for(k = 0; k < channel; k++) + { + uint8_t* src = tmpIn + k * width * height* sizeof(int16_t); + uint8_t* dst = tmpOut + k * (*dstC) * (*dstW) * (*dstH) * sizeof(int16_t); + + for(i = 0, j = 0; i < iter; i += step, j++) + { + memcpy(dst + j * size, src + i * width * sizeof(int16_t), size); + } + } + } + } + else if(axis == 2) + { + //*dstH = (len - frame_len) / step + 1; + *dstH = pad_end ? ((width + step - 1 ) / step) : ((width - frame_len) / step + 1); + *dstW = frame_len; + *dstC = height; + *dstB = channel; + + size = (*dstW) * sizeof(int16_t); + iter = pad_end ? width : (width - frame_len + 1); + + if(pad_end) + { + for(k = 0; k < channel * height; k++) + { + uint8_t* src = tmpIn + k * width * sizeof(int16_t); + uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t); + + int16_t* output = (int16_t*)dst; + int16_t* input = (int16_t*)src; + uint32_t m = 0; + for(i = 0, j = 0; i < iter; i += step) + { + for(m = i; m < frame_len + i; m++) + { + if(m >= width) + { + output[j] = 0; + } + else + { + output[j] = input[m]; + } + j++; + } + } + } + } + else + { + for(k = 0; k < channel * height; k++) + { + uint8_t* src = tmpIn + k * width * sizeof(int16_t); + uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t); + for(i = 0, j = 0; i < iter; i += step, j++) + { + memcpy(dst + j * size, src + i * sizeof(int16_t), size); + } + } + } + } + } + + return; +} + +vsi_status VX_CALLBACK vxSignalFrameKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ + vsi_status status = VX_ERROR_INVALID_PARAMETERS; + + if(paramNum == 7) + { + vx_context context = NULL; + // tensor + vx_tensor imgObj[7] = { NULL }; +#if INPUT_FP16 + int16_t *input = NULL; +#else + uint8_t *input = NULL; +#endif +#if OUTPUT_FP16 + int16_t *output = NULL; +#else + uint8_t *output = NULL; +#endif + + uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1}, output_size[DIM_SIZE] = {1, 1, 1, 1}, dst_size[DIM_SIZE] = {1, 1, 1, 1}; + vsi_nn_tensor_attr_t in_attr, out_attr; + + vsi_nn_type_e outputFormat = VSI_NN_TYPE_FLOAT16; + uint32_t input_dims = 0, output_dims = 0, tmpDim = 0; + + vx_scalar scalar[5] = { NULL }; + uint32_t frame_length = 0, step = 0, pad_end = 0, pad = 0, axis = 0, axis0 = 0; + uint32_t i = 0; + + memset(&in_attr, 0x0, sizeof(vsi_nn_tensor_attr_t)); + memset(&out_attr, 0x0, sizeof(vsi_nn_tensor_attr_t)); + status = vsi_nn_vxGetTensorAttr(imgObj[0], &in_attr); + status |= vsi_nn_vxGetTensorAttr(imgObj[1], &out_attr); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); + goto OnError; + } + + imgObj[0] = (vx_tensor)paramObj[0]; + imgObj[1] = (vx_tensor)paramObj[1]; //output + scalar[0] = (vx_scalar)paramObj[2]; + scalar[1] = (vx_scalar)paramObj[3]; + scalar[2] = (vx_scalar)paramObj[4]; + scalar[3] = (vx_scalar)paramObj[5]; + scalar[4] = (vx_scalar)paramObj[6]; + context = vxGetContext((vx_reference)node); + if (context == NULL) + { + VSILOGE("vxGetContext failure! 
at line %d\n", __LINE__); + goto OnError; + } + //input + input_dims = in_attr.dim_num; + for (i = 0; i < input_dims; i++) + { + input_size[i] = in_attr.size[i]; + } + + //output + output_dims = out_attr.dim_num; + outputFormat = out_attr.dtype.vx_type; + for (i = 0; i < output_dims; i++) + { + output_size[i] = out_attr.size[i]; + } + + input_size[2] = (input_dims <= 2)?1:input_size[2]; + input_size[3] = (input_dims <= 3)?1:input_size[3]; + + +#if INPUT_FP16 + input = (int16_t*)malloc(input_size[0]*input_size[1]*input_size[2]*sizeof(int16_t)); +#else + //input = (uint8_t*)malloc(input_size[0]*input_size[1]*input_size[2]*vsi_nn_GetTypeBytes(inputFormat)); +#endif +#if OUTPUT_FP16 + output = (int16_t*)malloc(output_size[0]*output_size[1]*output_size[2]*sizeof(int16_t)); +#else + output = (uint8_t*)malloc(output_size[0]*output_size[1]*output_size[2]*vsi_nn_GetTypeBytes(outputFormat)); +#endif + + input = vsi_nn_vxCopyTensorToData(context, imgObj[0], &in_attr); + + // scalar + status = vxCopyScalar(scalar[0], &frame_length, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + status |= vxCopyScalar(scalar[1], &step, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + status |= vxCopyScalar(scalar[2], &pad_end, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + status |= vxCopyScalar(scalar[3], &pad, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + status |= vxCopyScalar(scalar[4], &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if (status != VX_SUCCESS) + { + VSILOGE("vxCopyScalar failure! at line %d\n", __LINE__); + goto OnError; + } + + // Call C Prototype + if(output_dims == 2) + tmpDim = 1; + else + tmpDim = input_dims; + { + axis0 = input_dims - axis - 1; + } + mySignalFrameFunc(input, output, tmpDim, input_size[0], + input_size[1], input_size[2], input_size[3], + frame_length, step, pad_end, pad, axis0, + &dst_size[0], &dst_size[1], &dst_size[2], &dst_size[3]); + + //output tensor + status = vsi_nn_vxCopyDataToTensor(context, imgObj[1], &out_attr, output); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxCopyDataToTensor failure! at line %d\n", __LINE__); + goto OnError; + } + +OnError: + if(input) free(input); + if(output) free(output); + } + + return status; +} + +vsi_status VX_CALLBACK vxSignalFrameInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + uint32_t paraNum + ) +{ + vsi_status status = VX_SUCCESS; + // Alignment with a power of two value. +#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) + vx_kernel_execution_parameters_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + vx_scalar scalar[5]; + vx_tensor input = (vx_tensor)paramObj[0]; + vx_tensor output = (vx_tensor)paramObj[1]; + + uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1}; + uint32_t input_dims = 0; + uint32_t output_dims = 0; + //vx_uint32 factor = 1; + //vx_uint32 maxWorkGroupSize = 8; + uint32_t frame_length, step, pad_end, pad, axis, axis0; + uint32_t output_channel = 0; + + vx_uint32 i = 0; + vsi_nn_tensor_attr_t attr[2]; + memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); + memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); + status = vsi_nn_vxGetTensorAttr(input, &attr[0]); + status |= vsi_nn_vxGetTensorAttr(output, &attr[1]); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); + return status; + } + + input_dims = attr[0].dim_num; + for (i = 0; i < input_dims; i++) + { + input_size[i] = attr[0].size[i]; + } + output_dims = attr[1].dim_num; + + scalar[0] = (vx_scalar)paramObj[2]; + scalar[1] = (vx_scalar)paramObj[3]; + scalar[2] = (vx_scalar)paramObj[4]; + scalar[3] = (vx_scalar)paramObj[5]; + scalar[4] = (vx_scalar)paramObj[6]; + + status = vxCopyScalar(scalar[0], &frame_length, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + status |= vxCopyScalar(scalar[1], &step, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + status |= vxCopyScalar(scalar[2], &pad_end, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + status |= vxCopyScalar(scalar[3], &pad, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + status |= vxCopyScalar(scalar[4], &axis0, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if (status != VX_SUCCESS) + { + VSILOGE("vxCopyScalar failure! at line %d\n", __LINE__); + return status; + } + + { + if(input_dims == 2 && output_dims == 2) + { + axis = input_dims - axis0 - 2; + } + else + { + axis = input_dims - axis0 - 1; + } + } + + input_size[2] = (input_dims <= 2)?1:input_size[2]; + //input_size[2] = (input_dims == 4)?(input_size[2] * input_size[3]):input_size[2]; + + shaderParam.globalWorkOffset[0] = 0; + shaderParam.globalWorkOffset[1] = 0; + shaderParam.globalWorkOffset[2] = 0; + if((output_dims == 2) + || (input_dims == 2 && output_dims == 3 && axis == 1) + || (input_dims == 3 && axis == 2)) + { + shaderParam.globalWorkScale[0] = 1; + shaderParam.globalWorkScale[1] = 1; + shaderParam.globalWorkScale[2] = 1; + shaderParam.localWorkSize[0] = 1; + shaderParam.localWorkSize[1] = 1; +#if 0 + if (input_size[1] <= maxWorkGroupSize) + shaderParam.localWorkSize[1] = input_size[1]; + else if (getFactor(input_size[1], &factor, 2, maxWorkGroupSize, 8) == VX_SUCCESS) + shaderParam.localWorkSize[1] = factor; + else + shaderParam.localWorkSize[1] = 1; +#endif + + shaderParam.localWorkSize[2] = 1; + shaderParam.globalWorkSize[0] = gcmALIGN((1 + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); + + shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); + //shaderParam.globalWorkSize[1] = input_size[1]; + shaderParam.globalWorkSize[2] = gcmALIGN((input_size[2] + shaderParam.globalWorkScale[2] - 1) + / shaderParam.globalWorkScale[2], shaderParam.localWorkSize[2]); + } + else if((input_dims == 2 && output_dims == 3 && axis == 0) + || (input_dims == 3 && axis == 1)) + { + int height = (pad_end == 0) ? (input_size[1] - frame_length + 1) : (input_size[1]); + shaderParam.globalWorkScale[0] = 8; + shaderParam.globalWorkScale[1] = step; + shaderParam.globalWorkScale[2] = 1; + shaderParam.localWorkSize[0] = 1; + shaderParam.localWorkSize[1] = 1; + shaderParam.localWorkSize[2] = 1; + shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); + shaderParam.globalWorkSize[1] = gcmALIGN((height + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); + shaderParam.globalWorkSize[2] = gcmALIGN((input_size[2] + shaderParam.globalWorkScale[2] - 1) + / shaderParam.globalWorkScale[2], shaderParam.localWorkSize[2]); + + output_channel = (pad_end == 0) ? ((input_size[1] - frame_length) / step + 1) : ((input_size[1] + step - 1) / step); + } + else if(input_dims == 3 && axis == 0) + { + int channel = (pad_end == 0) ? 
(input_size[2] - frame_length + 1) : (input_size[2]); + shaderParam.globalWorkScale[0] = 8; + shaderParam.globalWorkScale[1] = 1; + shaderParam.globalWorkScale[2] = step; + shaderParam.localWorkSize[0] = 1; + shaderParam.localWorkSize[1] = 1; + shaderParam.localWorkSize[2] = 1; + shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); + shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); + shaderParam.globalWorkSize[2] = gcmALIGN((channel + shaderParam.globalWorkScale[2] - 1) + / shaderParam.globalWorkScale[2], shaderParam.localWorkSize[2]); + } + + status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + if(status < 0) + { + VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__); + } + { + status |= vxSetNodeUniform(nodObj, "input_width", 1, &input_size[0]); + status |= vxSetNodeUniform(nodObj, "input_height", 1, &input_size[1]); + status |= vxSetNodeUniform(nodObj, "input_channel", 1, &input_size[2]); + status |= vxSetNodeUniform(nodObj, "output_channel", 1, &output_channel); + if(status < 0) + { + VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__); + } + } + return status; +} +static vx_param_description_t vxSignalFrameKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t vxSignalFrameKernelInfo = +{ + VX_KERNEL_ENUM_SIGNALFRAME, + VX_KERNEL_NAME_SIGNALFRAME_WIDTH, + NULL, + vxSignalFrameKernelParam, + (sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxSignalFrameInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxSignalFrameKernelInfo_height = +{ + VX_KERNEL_ENUM_SIGNALFRAME, + VX_KERNEL_NAME_SIGNALFRAME_HEIGHT, + NULL, + vxSignalFrameKernelParam, + (sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxSignalFrameInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxSignalFrameKernelInfo_channel = +{ + VX_KERNEL_ENUM_SIGNALFRAME, + VX_KERNEL_NAME_SIGNALFRAME_CHANNEL, + NULL, + vxSignalFrameKernelParam, + (sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxSignalFrameInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxSignalFrameKernelInfo_8bit = +{ + VX_KERNEL_ENUM_SIGNALFRAME, + VX_KERNEL_NAME_SIGNALFRAME_WIDTH_8BITS, + NULL, + vxSignalFrameKernelParam, + (sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxSignalFrameInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxSignalFrameKernelInfo_height_8bit = +{ + VX_KERNEL_ENUM_SIGNALFRAME, + VX_KERNEL_NAME_SIGNALFRAME_HEIGHT_8BITS, + NULL, + vxSignalFrameKernelParam, + (sizeof(vxSignalFrameKernelParam) / 
sizeof(vxSignalFrameKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxSignalFrameInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxSignalFrameKernelInfo_channel_8bit = +{ + VX_KERNEL_ENUM_SIGNALFRAME, + VX_KERNEL_NAME_SIGNALFRAME_CHANNEL_8BITS, + NULL, + vxSignalFrameKernelParam, + (sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxSignalFrameInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxSignalFrameKernelInfo_CPU = +{ + VX_KERNEL_ENUM_SIGNALFRAME, + VX_KERNEL_NAME_SIGNALFRAME_WIDTH, + vxSignalFrameKernel, + vxSignalFrameKernelParam, + (sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_SIGNALFRAME_list[] = +{ + &vxSignalFrameKernelInfo_CPU, + &vxSignalFrameKernelInfo, + &vxSignalFrameKernelInfo_height, + &vxSignalFrameKernelInfo_channel, + &vxSignalFrameKernelInfo_8bit, + &vxSignalFrameKernelInfo_height_8bit, + &vxSignalFrameKernelInfo_channel_8bit, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c new file mode 100644 index 0000000..67308f8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_space2depth.c @@ -0,0 +1,293 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
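The shader initializers above all size the dispatch the same way: ceil-divide the tensor extent by `globalWorkScale` (elements handled per thread), then round up with `gcmALIGN` to a multiple of the local work-group size. A small worked example of that arithmetic (values are illustrative; the macro is parenthesized defensively here):

```c
#include <stdio.h>
#include <stdint.h>

/* gcmALIGN rounds n up to a multiple of align; valid only when align is a
 * power of two, which is how the initializers above use it. */
#define gcmALIGN(n, align) (((n) + ((align) - 1)) & ~((align) - 1))

static uint32_t global_work_size(uint32_t extent, uint32_t scale, uint32_t local)
{
    /* step 1: ceil(extent / scale), step 2: align up to the local size */
    return gcmALIGN((extent + scale - 1) / scale, local);
}

int main(void)
{
    /* e.g. a width of 150 with 8 elements per thread and local size 4:
     * ceil(150 / 8) = 19 work items, aligned up to 20. */
    printf("%u\n", global_work_size(150, 8, 4));   /* prints 20 */
    return 0;
}
```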
+* +*****************************************************************************/ + +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_VAR (vx_kernel_SPACE2DEPTH) +#define _VX_KERNEL_ID (VX_KERNEL_ENUM_SPACE2DEPTH) +#define _VX_KERNEL_NAME ("vsi_nn_kernel_space2depth") +#define _VX_KERNEL_FUNC_KERNEL (vxSpace2DepthKernel) + +static vsi_status VX_CALLBACK vxSpace2DepthKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ + /* TODO: */ +#define ARG_NUM (2) +#define TENSOR_NUM_INPUT (1) +#define TENSOR_NUM_OUTPUT (1) +#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) + + vsi_status status = VX_SUCCESS; + uint32_t i = 0; + vx_context context = NULL; + vsi_nn_tensor_attr_t attr[TENSOR_NUM]; + uint32_t stride_size[TENSOR_NUM][VSI_NN_MAX_DIM_NUM]; + vx_tensor_addressing user_addr[TENSOR_NUM] = {NULL}; + uint8_t *buffer_ptr[TENSOR_NUM] = {NULL}; + vx_tensor tensor[TENSOR_NUM] = {NULL}; + + int32_t block_size_x = 0, block_size_y = 0; + int32_t output_depth = 0, output_height = 0, output_width = 0; + int32_t input_batch = 0, input_depth = 0, input_height = 0, input_width = 0; + int32_t batch = 0, dim = 0; + + for(i = 0; i < TENSOR_NUM; i++) + { + memset(&attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); + } + + //prepare data + context = vxGetContext((vx_reference)node); + + for( i = 0; i < TENSOR_NUM_INPUT; i ++ ) + { + tensor[i] = (vx_tensor)paramObj[i]; + buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], + &(attr[i]), stride_size[i], &(user_addr[i]), VX_READ_ONLY); + } + for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) + { + tensor[i] = (vx_tensor)paramObj[i]; + buffer_ptr[i] = vsi_nn_ConvertRawTensorToData2(context, tensor[i], + &(attr[i]), stride_size[i], &(user_addr[i]), VX_WRITE_ONLY); + } + + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(block_size_x), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(block_size_y), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + dim = attr[0].dim_num; + if(dim < 4) + attr[0].size[3] = 1; + //op calc + //output_batch = attr[1].size[3]; + output_depth = attr[1].size[2]; + output_height = attr[1].size[1]; + output_width = attr[1].size[0]; + + input_batch = attr[0].size[3]; + input_depth = attr[0].size[2]; + input_height = attr[0].size[1]; + input_width = attr[0].size[0]; + + for (batch = 0; batch < input_batch; ++batch) + { + vx_uint32 output_batch_index = batch * output_height * output_width * output_depth; + vx_uint32 input_batch_index = batch * input_height * input_width * input_depth; + vx_uint32 in_d; + for (in_d = 0; in_d < (vx_uint32)input_depth; in_d ++) + { + vx_uint32 in_h; + for (in_h = 0; in_h < (vx_uint32)input_height; ++ in_h) + { + vx_uint32 in_w; + for (in_w = 0; in_w < (vx_uint32)input_width; in_w ++) + { + vx_int32 out_w = in_w / block_size_x; + vx_int32 out_h = in_h / block_size_y; + //vx_int32 out_d = (in_w % block_size_x) * input_depth + (in_h % block_size_y) * block_size_x * input_depth + in_d; + vx_int32 out_d = (in_w % block_size_x) + (in_h % block_size_y) * block_size_x + in_d * block_size_x * block_size_y; + + vx_int32 in_index = in_w + in_h * input_width +in_d * input_height * input_width + input_batch_index; + vx_int32 out_index = out_w + out_h * 
output_width + out_d * output_width * output_height + output_batch_index; + + //outputBase[out_index] = inputBase[in_index]; + float fval; + vsi_nn_DtypeToFloat32(&buffer_ptr[0][stride_size[0][0] * in_index], + &fval, &attr[0].dtype); + vsi_nn_Float32ToDtype(fval, &buffer_ptr[1][stride_size[1][0] * out_index], + &attr[1].dtype); + } + } + } + } + + //save data + for( i = TENSOR_NUM_INPUT; i < TENSOR_NUM; i ++ ) + { + vsi_nn_copy_tensor_patch(tensor[i], &attr[i], buffer_ptr[i], VX_WRITE_ONLY); + } + for( i = 0; i < TENSOR_NUM; i ++ ) + { + if (user_addr[i]) vxReleaseTensorAddressing(&(user_addr[i])); + if (buffer_ptr[i]) free(buffer_ptr[i]); + } + + return status; +} /* _VX_KERNEL_FUNC_KERNEL() */ + +vsi_status VX_CALLBACK vxSpace2DepthInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + uint32_t paraNum + ) +{ + // Alignment with a power of two value. +#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) + vx_kernel_execution_parameters_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_status status = VX_SUCCESS; + + vx_tensor input = (vx_tensor)paramObj[0]; + uint32_t input_size[4] = {1, 1, 1, 1}; + vx_uint32 input_dimz = 0; + vx_uint32 input_depth = 0; + vx_uint32 i = 0; + vsi_nn_tensor_attr_t attr; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + status = vsi_nn_vxGetTensorAttr(input, &attr); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); + return status; + } + for (i = 0; i < attr.dim_num; i++) + { + input_size[i] = attr.size[i]; + } + + input_depth = input_size[2]; + if(input_size[3] > 0) + input_dimz = input_depth * input_size[3]; + + shaderParam.globalWorkOffset[0] = 0; + shaderParam.globalWorkOffset[1] = 0; + shaderParam.globalWorkOffset[2] = 0; + shaderParam.globalWorkScale[0] = 8; + shaderParam.globalWorkScale[1] = 1; + shaderParam.globalWorkScale[2] = 1; + shaderParam.localWorkSize[0] = 8; + shaderParam.localWorkSize[1] = 1; + shaderParam.localWorkSize[2] = 1; + shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); + shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); + shaderParam.globalWorkSize[2] = input_dimz; + + { + vx_uint32 uniExtractEvenFp16Stride2_4x4[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }; + vx_uint32 uniExtractOddFp16Stride2_4x4[16] = { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00030001, 0x00070005, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }; + status |= vxSetNodeUniform(nodObj, "uniExtractEvenFp16Stride2_4x4", 1, uniExtractEvenFp16Stride2_4x4); + status |= vxSetNodeUniform(nodObj, "uniExtractOddFp16Stride2_4x4", 1, 
uniExtractOddFp16Stride2_4x4); + //status |= vxSetNodeUniform(nodObj, "input_depth", 1, &input_depth); + } + + status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + + return VX_SUCCESS; +} + +static vx_param_description_t s_params[] = +{ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, +}; + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t _VX_KERNEL_VAR = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + _VX_KERNEL_FUNC_KERNEL, + s_params, + _cnt_of_array( s_params ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxSpace2DepthKernelInfo_int16_int16 = +{ + _VX_KERNEL_ID, + VX_KERNEL_NAME_SPACE2DEPTH_INT16_INT16, + NULL, + s_params, + _cnt_of_array( s_params ), + vsi_nn_KernelValidator, + NULL, + NULL, + vxSpace2DepthInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_SPACE2DEPTH_list[] = +{ + NULL, + &_VX_KERNEL_VAR, + &vxSpace2DepthKernelInfo_int16_int16, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c new file mode 100644 index 0000000..e302139 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c @@ -0,0 +1,481 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
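The Space2Depth CPU reference above maps input element (w, h, d) to output column w/bx, row h/by, and channel (w % bx) + (h % by) * bx + d * bx * by, so each bx-by spatial tile becomes bx*by extra output channels. A self-contained sketch of that mapping with plain int buffers and illustrative sizes:

```c
#include <stdio.h>

/* Sketch of the space-to-depth index mapping used by the CPU reference above. */
static void space2depth_ref(const int *in, int *out,
                            int width, int height, int depth, int bx, int by)
{
    int out_w = width / bx, out_h = height / by;
    for (int d = 0; d < depth; d++)
        for (int h = 0; h < height; h++)
            for (int w = 0; w < width; w++) {
                int od = (w % bx) + (h % by) * bx + d * bx * by;
                int out_idx = (w / bx) + (h / by) * out_w + od * out_w * out_h;
                out[out_idx] = in[w + h * width + d * width * height];
            }
}

int main(void)
{
    int in[16], out[16];
    for (int i = 0; i < 16; i++) in[i] = i;        /* one 4x4x1 plane */
    space2depth_ref(in, out, 4, 4, 1, 2, 2);       /* -> 2x2x4 */
    for (int i = 0; i < 16; i++) printf("%d ", out[i]);
    printf("\n");   /* 0 2 8 10  1 3 9 11  4 6 12 14  5 7 13 15 */
    return 0;
}
```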
+* +*****************************************************************************/ +#include +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_VAR (vx_kernel_SPATIAL_TRANSFORMER) +#define _VX_KERNEL_ID (VX_KERNEL_ENUM_SPATIAL_TRANSFORMER) +#define _VX_KERNEL_NAME (VX_KERNEL_NAME_SPATIAL_TRANSFORMER) +#define _VX_KERNEL_FUNC_KERNEL (vxSpatial_transformerKernel) + + +static vsi_status VX_CALLBACK vxSpatial_transformerKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ + /*To do cpu implementation*/ + vsi_status status = VX_SUCCESS; + + return status; +} /* _VX_KERNEL_FUNC_KERNEL() */ + +static vx_param_description_t s_params[] = +{ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, +}; + +vx_status VX_CALLBACK vxTransform_GemmInputValidator(vx_node node, vx_uint32 index) +{ + return VX_SUCCESS; +} + +vx_status VX_CALLBACK vxTransform_GemmOutputValidator(vx_node node, vx_uint32 index, vx_meta_format metaObj) +{ + return VX_SUCCESS; +} + +vx_status VX_CALLBACK vxValidator(vx_node node, const vx_reference parameters[], + vx_uint32 num, vx_meta_format metas[]) +{ + vx_status status = VX_SUCCESS; + vx_uint32 index = 0; + for(index = 0; index < num; index++) + { + if(index < 2) + { + status |= vxTransform_GemmInputValidator(node,index); + } + else + { + status |= vxTransform_GemmOutputValidator(node,index,metas[index]); + } + } + return status; +} + +static vx_param_description_t vxTransform_GemmKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; + +vx_status VX_CALLBACK vxTransform_GemmInitializer(vx_node nodObj, const vx_reference *paramObj, vx_uint32 paraNum) +{ +// Alignment with a power of two value. +#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) +#define gcmMIN(x, y) (((x) <= (y)) ? (x) : (y)) +#define gcmMAX(x, y) (((x) >= (y)) ? 
(x) : (y)) +#define MAX_MULTIPLIER_NUM (65535) +#define MAX_POST_SHIFT_BITS (31) + vx_kernel_execution_parameters_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vx_status status = VX_SUCCESS; + vx_tensor input0 = (vx_tensor)paramObj[0]; + vx_tensor input1 = (vx_tensor)paramObj[1]; + vx_tensor output = (vx_tensor)paramObj[2]; + vx_enum src0Format = VSI_NN_TYPE_FLOAT16; + vx_enum src1Format = VSI_NN_TYPE_FLOAT16; + vx_enum dstFormat = VSI_NN_TYPE_FLOAT16; + vx_uint32 coord_size[4] = {1, 1, 1, 1}; + vx_uint32 i = 0; + vsi_nn_tensor_attr_t attr[3]; + + memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); + memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); + memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t)); + + status = vsi_nn_vxGetTensorAttr(input0, &attr[0]); + status |= vsi_nn_vxGetTensorAttr(input1, &attr[1]); + status |= vsi_nn_vxGetTensorAttr(output, &attr[2]); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); + return status; + } + + src0Format = attr[0].dtype.vx_type; + src1Format = attr[1].dtype.vx_type; + for (i = 0; i < attr[1].dim_num; i++) + { + coord_size[i] = attr[1].size[i]; + } + dstFormat = attr[2].dtype.vx_type; + + if (src0Format == VSI_NN_TYPE_FLOAT16 && src1Format == VSI_NN_TYPE_FLOAT16 && dstFormat == VSI_NN_TYPE_FLOAT16) + { + shaderParam.globalWorkScale[0] = 12; + shaderParam.globalWorkScale[1] = 1; + } + + shaderParam.globalWorkSize[0] = + gcmALIGN((coord_size[0] + shaderParam.globalWorkScale[0] - 1) / shaderParam.globalWorkScale[0], 4); + shaderParam.globalWorkSize[1] = + (coord_size[1] + shaderParam.globalWorkScale[1] - 1) / shaderParam.globalWorkScale[1]; + { + vx_uint32 uniGemm3x3_4x4[16] = { + 0x15151515, // TCfg + 0x00000000, // ASelt + 0x02100210, 0x05430543, // ABin + 0x15151515, // BSelt + 0x05430210, 0x05430210, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }; + + vxSetNodeUniform(nodObj, "uniGemm3x3_4x4", 1, uniGemm3x3_4x4); + } + status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + + return VX_SUCCESS; +} + +static vx_param_description_t vxTransform_setupThresKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; + +vx_status VX_CALLBACK vxTransform_setupThresInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + vx_uint32 paraNum + ) +{ +// Alignment with a power of two value. +#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) +#define gcmMIN(x, y) (((x) <= (y)) ? (x) : (y)) +#define gcmMAX(x, y) (((x) >= (y)) ? 
(x) : (y)) +#define MAX_MULTIPLIER_NUM (65535) +#define MAX_POST_SHIFT_BITS (31) + vx_kernel_execution_parameters_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vx_status status = VX_SUCCESS; + vx_scalar thresFlag_s = (vx_scalar)paramObj[2]; + vx_enum src0Format = VSI_NN_TYPE_FLOAT16; + vx_enum src1Format = VSI_NN_TYPE_FLOAT16; + + vx_int32 thresFlag = 0; + vx_uint32 extract_packed[4] = {0}; + + vxCopyScalar(thresFlag_s, &thresFlag, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + if(status < 0) + VSILOGE("error-%s,%d\n",__FILE__,__LINE__); + + shaderParam.globalWorkScale[0] = 1; + shaderParam.globalWorkScale[1] = 1; + shaderParam.localWorkSize[0] = 1; + shaderParam.localWorkSize[1] = 1; + shaderParam.globalWorkSize[0] = 1; + shaderParam.globalWorkSize[1] = 1; + + if (src0Format == src1Format && src0Format == VSI_NN_TYPE_FLOAT16) + { + vx_uint32 i = 0; + vx_uint32 j = 0; + for (i = 0; i < 4; i++) + { + if (thresFlag & (1 << i)) + { + extract_packed[0] |= ((i << 4) << (i * 8)); + } + else + { + extract_packed[0] |= (((j << 4) + 128) << (i * 8)); + j ++; + } + } + + for (i = 4; i < 6; i++) + { + if (thresFlag & (1 << i)) + { + extract_packed[1] |= ((i << 4) << (i * 8 - 32)); + } + else + { + extract_packed[1] |= (((j << 4) + 128) << (i * 8 - 32)); + j ++; + } + } + + extract_packed[2] = extract_packed[3] = 0x10101010; + } + + vxSetNodeUniform(nodObj, "extract_packed", 1, extract_packed); + + status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + + return VX_SUCCESS; +} + + +static vx_param_description_t vxTransform_InterPKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; + + +vx_status VX_CALLBACK vxTransform_InterPInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + vx_uint32 paraNum + ) +{ +// Alignment with a power of two value. +#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) +#define gcmMIN(x, y) (((x) <= (y)) ? (x) : (y)) +#define gcmMAX(x, y) (((x) >= (y)) ? 
(x) : (y)) +#define MAX_MULTIPLIER_NUM (65535) +#define MAX_POST_SHIFT_BITS (31) + vx_kernel_execution_parameters_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vx_status status = VX_SUCCESS; + vx_tensor input0 = (vx_tensor)paramObj[0]; + vx_tensor input1 = (vx_tensor)paramObj[1]; + vx_tensor output = (vx_tensor)paramObj[2]; + vx_enum src0Format = VSI_NN_TYPE_FLOAT16; + vx_enum src1Format = VSI_NN_TYPE_FLOAT16; + vx_enum dstFormat = VSI_NN_TYPE_FLOAT16; + vx_uint32 coord_size[4] = {1, 1, 1, 1}; + vx_uint32 input_size[4] = {1, 1, 1, 1}; + vx_uint32 output_size[4] = {1, 1, 1, 1}; + vx_uint32 i = 0; + vsi_nn_tensor_attr_t attr[3]; + + memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); + memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); + memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t)); + + status = vsi_nn_vxGetTensorAttr(input0, &attr[0]); + status |= vsi_nn_vxGetTensorAttr(input1, &attr[1]); + status |= vsi_nn_vxGetTensorAttr(output, &attr[2]); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); + return status; + } + + for (i = 0; i < attr[0].dim_num; i++) + { + input_size[i] = attr[0].size[i]; + } + src0Format = attr[0].dtype.vx_type; + src1Format = attr[1].dtype.vx_type; + for (i = 0; i < attr[1].dim_num; i++) + { + coord_size[i] = attr[1].size[i]; + } + dstFormat = attr[2].dtype.vx_type; + for (i = 0; i < attr[2].dim_num; i++) + { + output_size[i] = attr[2].size[i]; + } + + if ((src0Format == VSI_NN_TYPE_FLOAT16 && src1Format == VSI_NN_TYPE_FLOAT16 && dstFormat == VSI_NN_TYPE_FLOAT16) + || (src0Format == VSI_NN_TYPE_INT16 && src1Format == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_INT16)) + { + shaderParam.globalWorkScale[0] = 2; + shaderParam.globalWorkScale[1] = 1; + } + + shaderParam.globalWorkSize[0] = + gcmALIGN((coord_size[0] + shaderParam.globalWorkScale[0] - 1) / shaderParam.globalWorkScale[0], 4); + shaderParam.globalWorkSize[1] = + (coord_size[1] + shaderParam.globalWorkScale[1] - 1) / shaderParam.globalWorkScale[1]; + { + vx_int32 packedWH2[2] = {input_size[0], input_size[1]}; + vx_int32 packedWH = (input_size[1] << 16) | (input_size[0] & 0xFFFF); + vx_uint32 uniGetDXY_4x4[16] = { + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00100001, 0x00010010, // ABin + 0x09090909, // BSelt + 0x00010000, 0x00000001, // BBin + 0x00000101, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x00000000, 0x3c000000, 0x00000000, + 0x3c000000, 0x00000000, 0x3c000000, 0x00000000 // Constant + }; + vx_uint32 uniConvertF16toF32_4x4[16] = { + 0x01010101, // TCfg + 0x01010000, // ASelt + 0x00010000, 0x00010000, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }; + + vxSetNodeUniform(nodObj, "uniGetDXY_4x4", 1, uniGetDXY_4x4); + vxSetNodeUniform(nodObj, "uniConvertF16toF32_4x4", 1, uniConvertF16toF32_4x4); + + //packedWH2[0] = input_size[0]; + //packedWH2[1] = input_size[1]; + //packedWH = (input_size[1] << 16) | (input_size[0] & 0xFFFF); + vxSetNodeUniform(nodObj, "packedWH2", 1, packedWH2); + vxSetNodeUniform(nodObj, "packedWH", 1, &packedWH); + } + if (output_size[2] > 1) + { + 
vxSetNodeUniform(nodObj, "depth", 1, &output_size[2]); + } + + status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + + return VX_SUCCESS; +} + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t vxSpatial_transformer_CPU = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + _VX_KERNEL_FUNC_KERNEL, + s_params, + _cnt_of_array( s_params ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxTransform_GemmKernelInfo_F16toF16 = +{ + VX_KERNEL_ENUM_SPATIAL_TRANSFORMER, + VX_KERNEL_NAME_SPATIAL_TRANSFORMER, + NULL, + vxTransform_GemmKernelParam, + (sizeof(vxTransform_GemmKernelParam) / sizeof(vxTransform_GemmKernelParam[0])), + vxValidator, + NULL, + NULL, + vxTransform_GemmInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxTransform_setupThresKernelInfo_F16toF16 = +{ + VX_KERNEL_ENUM_SPATIAL_TRANSFORMER, + VX_KERNEL_NAME_TRANSFORM_SETUP_THRES_F16TOF16, + NULL, + vxTransform_setupThresKernelParam, + (sizeof(vxTransform_setupThresKernelParam) / sizeof(vxTransform_setupThresKernelParam[0])), + vxValidator, + NULL, + NULL, + vxTransform_setupThresInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxTransform_InterPKernelInfo_F16toF16_2D = +{ + VX_KERNEL_ENUM_SPATIAL_TRANSFORMER, + VX_KERNEL_NAME_TRANSFORM_INTERP_F16TOF16_2D, + NULL, + vxTransform_InterPKernelParam, + (sizeof(vxTransform_InterPKernelParam) / sizeof(vxTransform_InterPKernelParam[0])), + vxValidator, + NULL, + NULL, + vxTransform_InterPInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxTransform_InterPKernelInfo_F16toF16 = +{ + VX_KERNEL_ENUM_SPATIAL_TRANSFORMER, + VX_KERNEL_NAME_TRANSFORM_INTERP_F16TOF16, + NULL, + vxTransform_InterPKernelParam, + (sizeof(vxTransform_InterPKernelParam) / sizeof(vxTransform_InterPKernelParam[0])), + vxValidator, + NULL, + NULL, + vxTransform_InterPInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_SPATIAL_TRANSFORMER_list[] = +{ + &vxSpatial_transformer_CPU, + &vxTransform_setupThresKernelInfo_F16toF16, + &vxTransform_GemmKernelInfo_F16toF16, + &vxTransform_InterPKernelInfo_F16toF16_2D, + &vxTransform_InterPKernelInfo_F16toF16, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c new file mode 100644 index 0000000..9378674 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c @@ -0,0 +1,124 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
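vxTransform_setupThresInitializer above packs one selector byte per affine parameter (six in total) into the `extract_packed` uniform: when bit i of `thresFlag` is set the byte is the fixed value (i << 4), otherwise it is ((j << 4) + 128) with j a running counter over the remaining parameters. The packing arithmetic can be reproduced standalone as below; how the downstream shader consumes these bytes is not visible in this diff, so that part is left uninterpreted:

```c
#include <stdio.h>
#include <stdint.h>

/* Standalone reproduction of the extract_packed[] packing loop above. */
static void pack_extract(int32_t thres_flag, uint32_t packed[4])
{
    uint32_t i, j = 0;
    packed[0] = packed[1] = 0;
    for (i = 0; i < 4; i++) {
        if (thres_flag & (1 << i)) packed[0] |= ((i << 4) << (i * 8));
        else                       packed[0] |= (((j++ << 4) + 128) << (i * 8));
    }
    for (i = 4; i < 6; i++) {
        if (thres_flag & (1 << i)) packed[1] |= ((i << 4) << (i * 8 - 32));
        else                       packed[1] |= (((j++ << 4) + 128) << (i * 8 - 32));
    }
    packed[2] = packed[3] = 0x10101010;
}

int main(void)
{
    uint32_t packed[4];
    pack_extract(0x05, packed);   /* bits 0 and 2 set in thres_flag */
    printf("0x%08x 0x%08x\n", packed[0], packed[1]);
    return 0;
}
```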
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include +#include +#include + +#include "vsi_nn_pub.h" +#include "utils/vsi_nn_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_VAR_CPU (vx_client_kernel_cpu_SYNC_HOST) +#define _VX_KERNEL_ID KERNEL_ENUM_SYNC_HOST +#define _VX_KERNEL_NAME ("com.vivantecorp.extension.Sync_hostVXC") +#define _VX_KERNEL_FUNC_KERNEL (vxSync_hostKernel) + +static vsi_status VX_CALLBACK vxSync_hostKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ + vsi_status status = 0; + vx_context context = NULL; + vx_tensor input = NULL; + vx_tensor output = NULL; + uint8_t * in_buffer = NULL; + uint32_t in_stride[8] = { 0 }; + vx_tensor_addressing in_addr = NULL; + vsi_nn_tensor_attr_t in_attr; + + status = VX_SUCCESS; + context = vxGetContext( (vx_reference)node ); + input = (vx_tensor)paramObj[0]; + output = (vx_tensor)paramObj[1]; + memset(&in_attr, 0x0, sizeof(vsi_nn_tensor_attr_t)); + + in_buffer = vsi_nn_ConvertRawTensorToData2( context, input, + &in_attr, in_stride, &in_addr, VX_READ_ONLY ); + + status = vsi_nn_vxCopyDataToTensor(context, output, &in_attr, in_buffer); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxCopyDataToTensor failure! 
at line %d\n", __LINE__); + goto OnError; + } + +OnError: + if( NULL != in_buffer ) + { + free( in_buffer ); + } + return status; +} /* _VX_KERNEL_FUNC_KERNEL() */ + +static vx_param_description_t s_params[] = + { + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + }; + +vx_status VX_CALLBACK vxSync_hostInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + vx_uint32 paraNum + ) +{ + vx_status status = VX_SUCCESS; + /*TODO: Add initial code for VX program*/ + + return status; +} + + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t _VX_KERNEL_VAR_CPU = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + _VX_KERNEL_FUNC_KERNEL, + s_params, + _cnt_of_array( s_params ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_SYNC_HOST_list[] = +{ + &_VX_KERNEL_VAR_CPU, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c new file mode 100644 index 0000000..5d5f1ea --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c @@ -0,0 +1,287 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +void tensorStackConcatFunc + ( + int16_t* dataIn, + int16_t* dataIO, + int32_t index, + uint32_t width, + uint32_t height, + uint32_t channel, + uint32_t batch + ) +{ + int32_t stride = width * sizeof(int16_t); + VSILOGI("Hello tensorStackConcatFunc!\n"); + memcpy(dataIO + index * width, dataIn, stride); + return; +} +vsi_status VX_CALLBACK vxTensorStackConcatKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ + vsi_status status = VX_ERROR_INVALID_PARAMETERS; + + if(paramNum == 3) + { + vx_context context = NULL; + // tensor + vx_tensor imgObj[2] = { NULL }; + vsi_nn_tensor_attr_t attr[2]; + int16_t *input = NULL, *output = NULL; + uint32_t input_size[4] = {1, 1, 1, 1}, output_size[4] = {1, 1, 1, 1}; + uint32_t input_stride_size[4] = {1, 1, 1, 1}; + uint32_t output_stride_size[4] = {1, 1, 1, 1}; + vx_tensor_addressing input_user_addr = NULL; + vx_tensor_addressing output_user_addr = NULL; + vsi_nn_type_e inputFormat = VSI_NN_TYPE_FLOAT16, outputFormat = VSI_NN_TYPE_FLOAT16; + uint32_t input_dims = 0, output_dims = 0; + uint32_t i; + // scalar + vx_scalar scalar[1] = { NULL }; + int32_t index = 0; + + status = VX_SUCCESS; + imgObj[0] = (vx_tensor)paramObj[0]; + imgObj[1] = (vx_tensor)paramObj[1]; + scalar[0] = (vx_scalar)paramObj[2]; + memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); + memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); + context = vxGetContext((vx_reference)node); + if (context == NULL) + { + VSILOGE("vxGetContext failure! at line %d\n", __LINE__); + return status; + } + + status = vsi_nn_vxGetTensorAttr(imgObj[0], &attr[0]); + status |= vsi_nn_vxGetTensorAttr(imgObj[1], &attr[1]); + status |= vsi_nn_vxGetTensorAttr(imgObj[1], &attr[1]); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); + goto final; + } + + //input + input_dims = attr[0].dim_num; + inputFormat = attr[0].dtype.vx_type; + for (i = 0; i < input_dims; i++) + { + input_size[i] = attr[0].size[i]; + } + //output + output_dims = attr[1].dim_num; + outputFormat = attr[1].dtype.vx_type; + for (i = 0; i < output_dims; i++) + { + output_size[i] = attr[1].size[i]; + } + + input_size[2] = (input_dims <= 2)?1:input_size[2]; + input_size[3] = (input_dims <= 3)?1:input_size[3]; + input_stride_size[0] = vsi_nn_GetTypeBytes(inputFormat); + for (i=1; i< input_dims; i++) + { + input_stride_size[i] = input_stride_size[i-1] * input_size[i-1]; + } + input = (int16_t*)malloc(input_size[0]*input_size[1]*input_size[2]*sizeof(int16_t)); + input_user_addr = vxCreateTensorAddressing(context, input_size, input_stride_size, (vx_uint8)input_dims); + vsi_nn_copy_tensor_patch(imgObj[0], &attr[0], input, VX_READ_ONLY); + output_stride_size[0] = vsi_nn_GetTypeBytes(outputFormat); + for (i=1; i< output_dims; i++) + { + output_stride_size[i] = output_stride_size[i-1] * output_size[i-1]; + } + output = (int16_t*)malloc(output_size[0]*output_size[1]*output_size[2]*sizeof(int16_t)); + output_user_addr = vxCreateTensorAddressing(context, output_size, + output_stride_size, (vx_uint8)output_dims); + + vsi_nn_copy_tensor_patch(imgObj[1], &attr[1], output, VX_READ_ONLY); + // scalar + status = vxCopyScalar(scalar[0], &index, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if (status != VX_SUCCESS) + { + VSILOGE("vxCopyScalar failure! at line %d\n", __LINE__); + goto final; + } + // Call C Prototype + tensorStackConcatFunc(input, output, index, input_size[0], + input_size[1], input_size[2], input_size[3]); + //output tensor + vsi_nn_copy_tensor_patch(imgObj[1], &attr[1], output, VX_WRITE_ONLY); +final: + if(input) free(input); + if(output) free(output); + if(input_user_addr) vxReleaseTensorAddressing(&input_user_addr); + if(output_user_addr) vxReleaseTensorAddressing(&output_user_addr); + } + return status; +} +vsi_status VX_CALLBACK vxTensorStackConcatInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + uint32_t paraNum + ) +{ + vsi_status status = VX_SUCCESS; + // Alignment with a power of two value. +#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) + vx_kernel_execution_parameters_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vx_tensor input = (vx_tensor)paramObj[0]; + uint32_t input_size[4] = {1, 1, 1, 1}; + uint32_t input_dims = 0; + vsi_nn_type_e inputDataFormat = VSI_NN_TYPE_FLOAT16; + vsi_nn_tensor_attr_t attr; + uint32_t i; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + status = vsi_nn_vxGetTensorAttr(input, &attr); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__); + return status; + } + + input_dims = attr.dim_num; + inputDataFormat = attr.dtype.vx_type; + for (i = 0; i < input_dims; i++) + { + input_size[i] = attr.size[i]; + } + input_size[2] = (input_dims <= 2)?1:input_size[2]; + + shaderParam.globalWorkOffset[0] = 0; + shaderParam.globalWorkOffset[1] = 0; + shaderParam.globalWorkOffset[2] = 0; + if (inputDataFormat == VSI_NN_TYPE_FLOAT16 || inputDataFormat == VSI_NN_TYPE_INT16) + shaderParam.globalWorkScale[0] = 16; + else + shaderParam.globalWorkScale[0] = 32; + shaderParam.globalWorkScale[1] = 1; + shaderParam.globalWorkScale[2] = 1; + shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1) + / shaderParam.globalWorkScale[0], 4); + shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1) + / shaderParam.globalWorkScale[1]; + shaderParam.globalWorkSize[2] = (input_size[2] + shaderParam.globalWorkScale[2] - 1) + / shaderParam.globalWorkScale[2]; + + status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, + &shaderParam, sizeof(vx_kernel_execution_parameters_t)); + if(status < 0) + { + VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__); + } + return status; +} +static vx_param_description_t vxTensorStackConcatKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} +}; +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t vxTensorStackConcatKernelInfo = +{ + VX_KERNEL_ENUM_TENSORSTACKCONCAT, + VX_KERNEL_NAME_TENSORSTACKCONCAT, + NULL, + vxTensorStackConcatKernelParam, + (sizeof(vxTensorStackConcatKernelParam) / sizeof(vxTensorStackConcatKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxTensorStackConcatInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxTensorStackConcatKernelInfo8Bits = +{ + VX_KERNEL_ENUM_TENSORSTACKCONCAT8BITS, + VX_KERNEL_NAME_TENSORSTACKCONCAT8BITS, + NULL, + vxTensorStackConcatKernelParam, + (sizeof(vxTensorStackConcatKernelParam) / sizeof(vxTensorStackConcatKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vxTensorStackConcatInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxTensorStackConcatKernelInfo_CPU = +{ + VX_KERNEL_ENUM_TENSORSTACKCONCAT, + VX_KERNEL_NAME_TENSORSTACKCONCAT, + vxTensorStackConcatKernel, + vxTensorStackConcatKernelParam, + (sizeof(vxTensorStackConcatKernelParam) / sizeof(vxTensorStackConcatKernelParam[0])), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_TENSORSTACKCONCAT_list[] = +{ + &vxTensorStackConcatKernelInfo_CPU, + &vxTensorStackConcatKernelInfo, + &vxTensorStackConcatKernelInfo8Bits, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_topk.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_topk.c new file mode 100644 index 0000000..2fdf3bd --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_topk.c @@ -0,0 +1,266 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without 
limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" + +#define _VX_KERNEL_VAR (vx_kernel_TOPK) +#define _VX_KERNEL_ID (VX_KERNEL_ENUM_TOPK) +#define _VX_KERNEL_NAME (VX_KERNEL_NAME_TOPK) +#define _VX_KERNEL_FUNC_KERNEL (vxTopkKernel) + +static uint32_t max_comp_func(void* data, int32_t left, int32_t right) +{ + float* fdata = (float*)data; + if (fdata[left] >= fdata[right]) + { + return TRUE; + } + else + { + return FALSE; + } +} + +static void find_top_k_1d +( + float* input, + uint32_t input_len, + uint32_t k, + float* value, + uint32_t* indices +) +{ + int32_t low = 0; + int32_t high = input_len - 1; + int32_t j; + + for (j = 0; j < (int32_t)input_len; j++) + { + indices[j] = j; + } + + j = vsi_nn_partition(input, low, high, max_comp_func, FALSE, indices); + + //part_sort + while (j != (int32_t)k) + { + if ((int32_t)k > j) + { + low = j + 1; + } + else + { + high = j; + } + j = vsi_nn_partition(input, low, high, max_comp_func, FALSE, indices); + } + //all_sort + vsi_nn_partition(input, 0, k - 1, max_comp_func, TRUE, indices); + + for (j = 0; j < (int32_t)k; j++) + { + value[j] = input[indices[j]]; + } +} + +static vsi_status VX_CALLBACK vxTopkKernel + ( + vx_node node, + const vx_reference* paramObj, + uint32_t paramNum + ) +{ +#define ARG_NUM (1) +#define TENSOR_NUM_INPUT (1) +#define TENSOR_NUM_OUTPUT (2) +#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) + + vsi_status status = VSI_FAILURE; + vx_context context = NULL; + vx_tensor input[TENSOR_NUM_INPUT] = {0}; + vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; + float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; + float *f32_out_buffer = NULL; + uint32_t *u32_out_buffer = NULL; + vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; + vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; + uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; + uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; + + int32_t top_k; + + uint32_t i = 0; + for(i = 0; i < TENSOR_NUM_INPUT; i++) + { + memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i++) + { + memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); + } + /* prepare data */ + context = vxGetContext((vx_reference)node); + + for(i = 0; i < TENSOR_NUM_INPUT; i ++) + { + input[i] = (vx_tensor)paramObj[i]; + status = vsi_nn_vxGetTensorAttr(input[i], 
&in_attr[i]); + TEST_CHECK_STATUS(status, final); + in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); + f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); + status = vsi_nn_vxConvertTensorToFloat32Data( + context, input[i], &in_attr[i], f32_in_buffer[i], + in_elements[i] * sizeof(float)); + TEST_CHECK_STATUS(status, final); + } + for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) + { + output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; + status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); + TEST_CHECK_STATUS(status, final); + out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); + } + f32_out_buffer = (float *)malloc(out_elements[0] * sizeof(float)); + u32_out_buffer = (uint32_t *)malloc(out_elements[1] * sizeof(uint32_t)); + vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(top_k), + VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + /* TODO: Add CPU kernel implement */ + { + uint32_t block_num = in_attr[0].size[1]; + uint32_t block_size = in_attr[0].size[0]; + uint32_t * indices = (uint32_t*)malloc(block_size * sizeof(uint32_t)); + + for(i = 0; i < block_num; i++) + { + uint32_t in_index = i * block_size; + uint32_t out_index = i * top_k; + find_top_k_1d(&(f32_in_buffer[0][in_index]), + block_size, top_k, &(f32_out_buffer[out_index]), indices); + memcpy(&(u32_out_buffer[out_index]), + indices, top_k * sizeof(uint32_t)); + } + // Handle the 1D input + if (!block_num) { + find_top_k_1d(&(f32_in_buffer[0][0]), + block_size, top_k, &(f32_out_buffer[0]), indices); + memcpy(&(u32_out_buffer[0]), + indices, top_k * sizeof(uint32_t)); + } + if (indices) free(indices); + } + + /* save data */ + status = vsi_nn_vxConvertFloat32DataToTensor( + context, output[0], &out_attr[0], f32_out_buffer, + out_elements[0] * sizeof(float)); + TEST_CHECK_STATUS(status, final); + vsi_nn_vxCopyDataToTensor(context, output[1], &out_attr[1], (uint8_t *)u32_out_buffer); + +final: + for (i = 0; i < TENSOR_NUM_INPUT; i++) + { + if (f32_in_buffer[i]) free(f32_in_buffer[i]); + } + if (f32_out_buffer) free(f32_out_buffer); + if (u32_out_buffer) free(u32_out_buffer); + return status; +} /* _VX_KERNEL_FUNC_KERNEL() */ + +static vx_param_description_t vxTopkKernelParam[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +vx_status VX_CALLBACK vxTopkInitializer + ( + vx_node nodObj, + const vx_reference *paramObj, + vx_uint32 paraNum + ) +{ + vx_status status = VX_SUCCESS; + /*TODO: Add initial code for VX program*/ + + return status; +} + + +#ifdef __cplusplus +extern "C" { +#endif +vx_kernel_description_t vxTopk_CPU = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + _VX_KERNEL_FUNC_KERNEL, + vxTopkKernelParam, + _cnt_of_array( vxTopkKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t vxTopk_VX = +{ + _VX_KERNEL_ID, + _VX_KERNEL_NAME, + NULL, + vxTopkKernelParam, + _cnt_of_array( vxTopkKernelParam ), + vsi_nn_KernelValidator, + NULL, + NULL, + vxTopkInitializer, + vsi_nn_KernelDeinitializer +}; + +vx_kernel_description_t * vx_kernel_TOPK_list[] = +{ + &vxTopk_CPU, + &vxTopk_VX, + NULL +}; +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/a_times_b_plus_c.vx b/src/tim/vx/internal/src/libnnext/ops/vx/a_times_b_plus_c.vx new file mode 100644 index 0000000..f19c623 --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/a_times_b_plus_c.vx @@ -0,0 +1,56 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniA_Times_B_2x8; +_viv_uniform VXC_512Bits uniA_Plus_B_2x8; +__kernel void a_times_b_plus_c_F16_F16_F16toF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_array_t input2, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_half8 src0, src1, src2, dst; + vxc_ushort8 vec0, vec1, vec2, result; + + VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + VXC_ReadImage2DArray(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src2, vec2, 16); + + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Times_B_2x8); + VXC_DP2x8(dst, dst, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Plus_B_2x8); + _viv_asm(COPY, result, dst, 16); + VXC_WriteImage2DArray(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void a_times_b_plus_c_F16_F16_F16toF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_array_t input2, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_half8 src0, src1, src2, dst; + vxc_ushort8 vec0, vec1, vec2, result; + + VXC_ReadImage(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + VXC_ReadImage(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src2, vec2, 16); + + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Times_B_2x8); + VXC_DP2x8(dst, dst, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Plus_B_2x8); + _viv_asm(COPY, result, dst, 16); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/add_mean_std_norm.vx b/src/tim/vx/internal/src/libnnext/ops/vx/add_mean_std_norm.vx new file mode 100644 index 0000000..99d51fa --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/add_mean_std_norm.vx @@ -0,0 +1,297 @@ +#include "cl_viv_vx_ext.h" + +/**************************Tensor add mean stddev norm float16*********************/ +_viv_uniform int width; +_viv_uniform float dimRatio; +_viv_uniform float rsEps; +_viv_uniform VXC_512Bits uniAddFp16_2x8; +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform VXC_512Bits uniAddFp16toFp32Lo_4x4; +_viv_uniform VXC_512Bits uniAddFp16toFp32Hi_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +// one group(16 threads) calculates one row +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void add_mean_std_norm_F16_F16toF16( + image2d_array_t input, + image2d_array_t input1, + image2d_array_t output, + float eps) +{ + int lidx = get_local_id(0); + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + vxc_short8 src0, src1, src2; + float pSum = 0, pSqr = 0; + float sum = 0, sqr = 0; + vxc_half8 in_h, in_h1, in_h2; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(; coord.x < 
width; coord.x += 128) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, in_h, src0, 16); + _viv_asm(COPY, in_h1, src1, 16); + VXC_DP2x8(in_h2, in_h, in_h1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAddFp16_2x8); + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, in_h2, in_h2, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + pSum += sumsqr.x; + pSqr += sumsqr.y; + } + + lcl_sum[lidx] = pSum; + lcl_sqr[lidx] = pSqr; + barrier(CLK_LOCAL_MEM_FENCE); + + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; + float4 one = (float4)(1, 1, 1, 1); + float4 data0; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sum = dot(data0, one); + pLocalPtr = (float4 *)&lcl_sqr[0]; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sqr = dot(data0, one); + + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari, stddev_inv, rMeanStd; + vari = sqr*dimRatio - mean*mean; + stddev_inv = (vari==0 ? rsEps : rsqrt(vari)); + rMeanStd = (-mean) * stddev_inv; + + for(coord.x = gidx; coord.x < width; coord.x += 128) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, in_h, src0, 16); + _viv_asm(COPY, in_h1, src1, 16); + + vxc_float4 in_f0, in_f1; + VXC_DP4x4(in_f0, in_h, in_h1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAddFp16toFp32Lo_4x4); + VXC_DP4x4(in_f1, in_h, in_h1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAddFp16toFp32Hi_4x4); + + vxc_float4 norm0, norm1; + half4 norm_h0, norm_h1; + + norm0 = in_f0 * stddev_inv + rMeanStd; + norm1 = in_f1 * stddev_inv + rMeanStd; + _viv_asm(CONV, norm_h0, norm0); + _viv_asm(CONV, norm_h1, norm1); + + VXC_DP2x8(src2, norm_h0, norm_h1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift_1_Lo_2x8; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void add_mean_std_norm_U8_U8toF16( + image2d_array_t input, + image2d_array_t input1, + image2d_array_t output, + float eps) +{ + int lidx = get_local_id(0); + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + vxc_uchar8 src0, src1; + vxc_short8 src2; + float pSum = 0, pSqr = 0; + float sum = 0, sqr = 0; + vxc_half8 in_h, in_h1, in_h2; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + vxc_ushort8 ms0, ms1; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + _viv_asm(COPY, ms1, multAndoutZP1, 16); + + for(; coord.x < width; coord.x += 128) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(in_h, src0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniU8MulAndPostShift_0_Lo_2x8); + VXC_DP2x8(in_h1, src1, ms1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniU8MulAndPostShift_1_Lo_2x8); + 
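+        /* Note: in_h / in_h1 now hold the two inputs dequantized to fp16 via the
+           per-input multiplier and output zero-point packed in multAndoutZP0/1.
+           The DP2x8 below adds them lane-wise (uniAddFp16_2x8) and the following
+           DP8x2 (uniFp16SumSqr_dp8x2) folds the 8 lanes into sumsqr = (sum,
+           sum of squares), so each of the 16 threads in the work-group keeps a
+           partial sum over its strided columns.  After the loop the partials are
+           reduced through lcl_sum / lcl_sqr and the row statistics follow the
+           usual identity (assuming dimRatio is set to 1/width by the host):
+             mean = sum * dimRatio
+             var  = sqr * dimRatio - mean * mean
+             out  = (x - mean) * rsqrt(var)   (rsEps guards the var == 0 case) */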
VXC_DP2x8(in_h2, in_h, in_h1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAddFp16_2x8); + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, in_h2, in_h2, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + pSum += sumsqr.x; + pSqr += sumsqr.y; + } + + lcl_sum[lidx] = pSum; + lcl_sqr[lidx] = pSqr; + barrier(CLK_LOCAL_MEM_FENCE); + + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; + float4 one = (float4)(1, 1, 1, 1); + float4 data0; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sum = dot(data0, one); + pLocalPtr = (float4 *)&lcl_sqr[0]; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sqr = dot(data0, one); + + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari, stddev_inv, rMeanStd; + vari = sqr*dimRatio - mean*mean; + stddev_inv = (vari==0 ? rsEps : rsqrt(vari)); + rMeanStd = (-mean) * stddev_inv; + + for(coord.x = gidx; coord.x < width; coord.x += 128) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(in_h, src0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniU8MulAndPostShift_0_Lo_2x8); + VXC_DP2x8(in_h1, src1, ms1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniU8MulAndPostShift_1_Lo_2x8); + + vxc_float4 in_f0, in_f1; + VXC_DP4x4(in_f0, in_h, in_h1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAddFp16toFp32Lo_4x4); + VXC_DP4x4(in_f1, in_h, in_h1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAddFp16toFp32Hi_4x4); + + vxc_float4 norm0, norm1; + half4 norm_h0, norm_h1; + + norm0 = in_f0 * stddev_inv + rMeanStd; + norm1 = in_f1 * stddev_inv + rMeanStd; + _viv_asm(CONV, norm_h0, norm0); + _viv_asm(CONV, norm_h1, norm1); + + VXC_DP2x8(src2, norm_h0, norm_h1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +_viv_uniform VXC_512Bits uniConvertInt16ScaleToFp32Fst_4x4; +_viv_uniform VXC_512Bits uniConvertInt16ScaleToFp32Sec_4x4; +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; +_viv_uniform float inScale_i16; +_viv_uniform float inScale1_i16; + +// one group(16 threads) calculates one row +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void add_mean_std_norm_I16_I16toF16( + image2d_array_t input, + image2d_array_t input1, + image2d_array_t output, + float eps) +{ + int lidx = get_local_id(0); + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + vxc_short8 src0, src1, src2; + float pSum = 0, pSqr = 0; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + half scale_h, scale_h1; + _viv_asm(CONV, scale_h, inScale_i16); + _viv_asm(CONV, scale_h1, inScale1_i16); + float4 tmpVal0, tmpVal1, tmpVal2, tmpVal3; + float4 one = (float4)(1, 1, 1, 1); + + for(; coord.x < width; coord.x += 128) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpVal0, src0, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16ScaleToFp32Fst_4x4); + VXC_DP4x4(tmpVal1, src0, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16ScaleToFp32Sec_4x4); + VXC_DP4x4(tmpVal2, src1, scale_h1, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16ScaleToFp32Fst_4x4); + VXC_DP4x4(tmpVal3, src1, scale_h1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16ScaleToFp32Sec_4x4); + tmpVal0 += tmpVal2; + tmpVal1 += tmpVal3; + + vxc_float4 sumsqr; + sumsqr = tmpVal0 * tmpVal0 + tmpVal1 * tmpVal1; // sqr + tmpVal2 = tmpVal0 + tmpVal1; // pre sum + + pSum += dot(tmpVal2, one); + pSqr += dot(sumsqr, one); + } + + lcl_sum[lidx] = pSum; + lcl_sqr[lidx] = pSqr; + barrier(CLK_LOCAL_MEM_FENCE); + + float4 data0; + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sum = dot(data0, one); + pLocalPtr = (float4 *)&lcl_sqr[0]; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sqr = dot(data0, one); + + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari, stddev_inv, rMeanStd; + vari = sqr*dimRatio - mean*mean; + stddev_inv = (vari==0 ? rsEps : rsqrt(vari)); + rMeanStd = (-mean) * stddev_inv; + + for(coord.x = gidx; coord.x < width; coord.x += 128) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpVal0, src0, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16ScaleToFp32Fst_4x4); + VXC_DP4x4(tmpVal1, src0, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16ScaleToFp32Sec_4x4); + VXC_DP4x4(tmpVal2, src1, scale_h1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16ScaleToFp32Fst_4x4); + VXC_DP4x4(tmpVal3, src1, scale_h1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16ScaleToFp32Sec_4x4); + tmpVal0 += tmpVal2; + tmpVal1 += tmpVal3; + + vxc_float4 norm0, norm1; + half4 norm_h0, norm_h1; + + norm0 = tmpVal0 * stddev_inv + rMeanStd; + norm1 = tmpVal1 * stddev_inv + rMeanStd; + _viv_asm(CONV, norm_h0, norm0); + _viv_asm(CONV, norm_h1, norm1); + + VXC_DP2x8(src2, norm_h0, norm_h1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis0.vx new file mode 100644 index 0000000..4b48199 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis0.vx @@ -0,0 +1,240 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int4 packedArgIdx; +_viv_uniform int inputWidth; +_viv_uniform VXC_512Bits uniPackedIdxAddSat_2x8; +_viv_uniform VXC_512Bits uniSrcT2DstT_2x8; + +#define TENSOR_ARGMAX_AXIS0_8BITS(src_type_name, dst_type_name, src_type, \ + cond_type0, cond_type1, dst_type, cond_type) \ +__kernel void argmax_axis0_##src_type_name##to##dst_type_name( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); \ + src_type maxValue, maxVec, value; \ + dst_type packIdx, currIdx; \ + \ + VXC_ReadImage2DArray(maxVec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \ + coord.x += 8; \ + for (; coord.x < inputWidth; ) \ + { \ + VXC_ReadImage2DArray(value, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + \ + currIdx = currIdx + 8; \ + dst_type 
condition; \ + cond_type0 src_condition0 = value > maxVec; \ + cond_type1 src_condition; \ + _viv_asm(COPY, src_condition, src_condition0, 8); \ + cond_type condition_tmp; \ + VXC_DP2x8(condition_tmp, src_condition, src_condition, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniSrcT2DstT_2x8); \ + _viv_asm(COPY, condition, condition_tmp, 16); \ + packIdx = condition ? currIdx : packIdx; \ + maxVec = max(maxVec, value); \ + } \ + \ + VXC_HorzMax3_Integer(maxValue, maxVec, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_HorzMax3_Integer(maxValue, maxValue.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + maxValue.s01234567 = maxValue.s00000000; \ + \ + cond_type1 _maxVal; \ + VXC_Clamp(_maxVal, maxVec, maxValue, maxValue, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _maxVal += 1; \ + \ + VXC_DP2x8(packIdx, packIdx, _maxVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniPackedIdxAddSat_2x8); \ + \ + VXC_HorzMin3_Integer(packIdx, packIdx, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_HorzMin3_Integer(packIdx, packIdx.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_WriteImage(output, coord.yz, packIdx, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMAX_AXIS0_8BITS(I8, I16, vxc_char8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8) +TENSOR_ARGMAX_AXIS0_8BITS(I8, U8, vxc_char8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +TENSOR_ARGMAX_AXIS0_8BITS(U8, I16, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8) +TENSOR_ARGMAX_AXIS0_8BITS(U8, U8, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +TENSOR_ARGMAX_AXIS0_8BITS(I16, I16, vxc_short8, vxc_short8, vxc_ushort8, vxc_short8, vxc_ushort8) +TENSOR_ARGMAX_AXIS0_8BITS(I16, U8, vxc_short8, vxc_short8, vxc_ushort8, vxc_uchar8, vxc_uchar8) + +#define TENSOR_ARGMAX_AXIS0_8BITS_2D(src_type_name, dst_type_name, src_type, \ + cond_type0, cond_type1, dst_type, cond_type) \ +__kernel void argmax_axis0_##src_type_name##to##dst_type_name##_2D( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(0, get_global_id(0), 0, 0); \ + src_type maxValue, maxVec, value; \ + dst_type packIdx, currIdx; \ + \ + VXC_ReadImage(maxVec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \ + coord.x += 8; \ + for (; coord.x < inputWidth; ) \ + { \ + VXC_ReadImage(value, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + \ + currIdx = currIdx + 8; \ + dst_type condition; \ + cond_type0 src_condition0 = value > maxVec; \ + cond_type1 src_condition; \ + _viv_asm(COPY, src_condition, src_condition0, 8); \ + cond_type condition_tmp; \ + VXC_DP2x8(condition_tmp, src_condition, src_condition, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniSrcT2DstT_2x8); \ + _viv_asm(COPY, condition, condition_tmp, 16); \ + packIdx = condition ? 
currIdx : packIdx; \ + maxVec = max(maxVec, value); \ + } \ + \ + VXC_HorzMax3_Integer(maxValue, maxVec, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_HorzMax3_Integer(maxValue, maxValue.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + maxValue.s01234567 = maxValue.s00000000; \ + \ + cond_type1 _maxVal; \ + VXC_Clamp(_maxVal, maxVec, maxValue, maxValue, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _maxVal += 1; \ + \ + VXC_DP2x8(packIdx, packIdx, _maxVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniPackedIdxAddSat_2x8); \ + \ + VXC_HorzMin3_Integer(packIdx, packIdx, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_HorzMin3_Integer(packIdx, packIdx.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_WriteImage(output, coord.yz, packIdx, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} + +TENSOR_ARGMAX_AXIS0_8BITS_2D(I8, I16, vxc_char8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8) +TENSOR_ARGMAX_AXIS0_8BITS_2D(I8, U8, vxc_char8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +TENSOR_ARGMAX_AXIS0_8BITS_2D(U8, I16, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8) +TENSOR_ARGMAX_AXIS0_8BITS_2D(U8, U8, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +TENSOR_ARGMAX_AXIS0_8BITS_2D(I16, I16, vxc_short8, vxc_short8, vxc_ushort8, vxc_short8, vxc_ushort8) +TENSOR_ARGMAX_AXIS0_8BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_ushort8, vxc_uchar8, vxc_uchar8) + +_viv_uniform VXC_512Bits uniConvertHalf2Float32_4x4; + +#define TENSOR_ARGMAX_AXIS0_F16_2D(dst_type_name, dst_type) \ +__kernel void argmax_axis0_F16to##dst_type_name##_2D( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(0, get_global_id(0), 0, 0); \ + vxc_short8 vec0, vec1; \ + vxc_half8 src; \ + uint4 packIdx, currIdx; \ + float4 maxValue, value; \ + \ + VXC_ReadImage(vec0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec0, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \ + coord.x += 4; \ + VXC_DP4x4(maxValue, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \ + for (; coord.x < inputWidth; ) \ + { \ + VXC_ReadImage(vec1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec1, 16); \ + coord.x += 4; \ + \ + currIdx = currIdx + 4; \ + VXC_DP4x4(value, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \ + \ + int4 condition; \ + condition = value > maxValue; \ + \ + uint4 iCondition; \ + _viv_asm(COPY, iCondition, condition, 16); \ + packIdx = iCondition ? currIdx : packIdx; \ + maxValue = value > maxValue ? value : maxValue; \ + } \ + \ + float4 maxVec; \ + float2 maxVal2 = maxValue.xy > maxValue.zw ? maxValue.xy : maxValue.zw; \ + maxVec.x = maxVal2.x > maxVal2.y ? maxVal2.x : maxVal2.y; \ + int4 condition; \ + condition = maxVec.xxxx == maxValue; \ + uint4 iCondition; \ + _viv_asm(COPY, iCondition, condition, 16); \ + iCondition += 1; \ + \ + packIdx = mad_sat(iCondition, 0xFFFFFFFF, packIdx); \ + \ + uint2 val2 = packIdx.xy < packIdx.zw ? packIdx.xy : packIdx.zw; \ + val2.x = val2.x < val2.y ? 
val2.x : val2.y; \ + \ + dst_type dst; \ + _viv_asm(COPY, dst, val2, 4); \ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMAX_AXIS0_F16_2D(I16, vxc_ushort8) +TENSOR_ARGMAX_AXIS0_F16_2D(U8, vxc_uchar8) + + +#define TENSOR_ARGMAX_AXIS0_F16(dst_type_name, dst_type) \ +__kernel void argmax_axis0_F16to##dst_type_name( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); \ + vxc_short8 vec0, vec1; \ + vxc_half8 src; \ + uint4 packIdx, currIdx; \ + float4 maxValue, value; \ + \ + VXC_ReadImage2DArray(vec0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec0, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \ + coord.x += 4; \ + VXC_DP4x4(maxValue, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \ + for (; coord.x < inputWidth; ) \ + { \ + VXC_ReadImage2DArray(vec1, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec1, 16); \ + coord.x += 4; \ + \ + currIdx = currIdx + 4; \ + VXC_DP4x4(value, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \ + \ + int4 condition; \ + condition = value > maxValue; \ + \ + uint4 iCondition; \ + _viv_asm(COPY, iCondition, condition, 16); \ + packIdx = iCondition ? currIdx : packIdx; \ + maxValue = value > maxValue ? value : maxValue; \ + } \ + \ + float4 maxVec; \ + float2 maxVal2 = maxValue.xy > maxValue.zw ? maxValue.xy : maxValue.zw; \ + maxVec.x = maxVal2.x > maxVal2.y ? maxVal2.x : maxVal2.y; \ + int4 condition; \ + condition = maxVec.xxxx == maxValue; \ + uint4 iCondition; \ + _viv_asm(COPY, iCondition, condition, 16); \ + iCondition += 1; \ + \ + packIdx = mad_sat(iCondition, 0xFFFFFFFF, packIdx); \ + \ + uint2 val2 = packIdx.xy < packIdx.zw ? packIdx.xy : packIdx.zw; \ + val2.x = val2.x < val2.y ? 
val2.x : val2.y; \ + \ + dst_type dst; \ + _viv_asm(COPY, dst, val2, 4); \ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMAX_AXIS0_F16(I16, vxc_ushort8) +TENSOR_ARGMAX_AXIS0_F16(U8, vxc_uchar8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis1.vx new file mode 100644 index 0000000..e6cf337 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis1.vx @@ -0,0 +1,161 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int4 packedArgIdx; +_viv_uniform int argLenSub1; +_viv_uniform VXC_512Bits uniExtractData_2x8; + +#define TENSOR_ARGMAX_AXIS1_16BITS(src_type_name, dst_type_name, src_type,\ + copy_type, axis_type, dst_type, inst_type) \ + __kernel void argmax_axis1_##src_type_name##to##dst_type_name( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), argLenSub1, get_global_id(1), 0); \ + copy_type vec; \ + src_type src; \ + src_type maxVal; \ + copy_type max; \ + VXC_ReadImage2DArray(max, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, maxVal, max, 16); \ + axis_type axis; \ + axis_type packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.y --; \ + for (;coord.y >= 0;) \ + { \ + VXC_ReadImage2DArray(vec, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + coord.y --; \ + packIdx --; \ + VXC_VertMax3_##inst_type(maxVal, maxVal, maxVal, src, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, max, maxVal, 16); \ + axis = (max == vec) ? packIdx : axis; \ + } \ + \ + dst_type dst_axis; \ + VXC_DP2x8(dst_axis, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractData_2x8); \ + VXC_WriteImage(output, coord.xz, dst_axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMAX_AXIS1_16BITS(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half) +TENSOR_ARGMAX_AXIS1_16BITS(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half) +TENSOR_ARGMAX_AXIS1_16BITS(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer) +TENSOR_ARGMAX_AXIS1_16BITS(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer) + +#define TENSOR_ARGMAX_AXIS1_16BITS_2D(src_type_name, dst_type_name, src_type,\ + copy_type, axis_type, dst_type, inst_type) \ +__kernel void argmax_axis1_##src_type_name##to##dst_type_name##_2D( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), argLenSub1, 0, 0); \ + copy_type vec; \ + src_type src; \ + src_type maxVal; \ + copy_type max; \ + VXC_ReadImage(max, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, maxVal, max, 16); \ + axis_type axis; \ + axis_type packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.y --; \ + for (;coord.y >= 0;) \ + { \ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + coord.y --; \ + packIdx --; \ + VXC_VertMax3_##inst_type(maxVal, maxVal, maxVal, src, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, max, maxVal, 16); \ + axis = (max == vec) ? 
packIdx : axis; \ + } \ + \ + dst_type dst_axis; \ + VXC_DP2x8(dst_axis, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractData_2x8); \ + VXC_WriteImage(output, coord.xz, dst_axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMAX_AXIS1_16BITS_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half) +TENSOR_ARGMAX_AXIS1_16BITS_2D(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half) +TENSOR_ARGMAX_AXIS1_16BITS_2D(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer) +TENSOR_ARGMAX_AXIS1_16BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer) + +#define TENSOR_ARGMAX_AXIS1_8BITS(src_type_name, dst_type_name, src_type, dst_type) \ +__kernel void argmax_axis1_##src_type_name##to##dst_type_name( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), argLenSub1, get_global_id(1), 0); \ + src_type src; \ + src_type maxVal; \ + VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + dst_type axis; \ + dst_type packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.y --; \ + for (;coord.y >= 0;) \ + { \ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y --; \ + packIdx --; \ + maxVal = max(maxVal, src); \ + dst_type condition; \ + VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axis = condition ? packIdx : axis; \ + } \ + \ + VXC_WriteImage(output, coord.xz, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMAX_AXIS1_8BITS(I8, I16, vxc_char16, vxc_short8) +TENSOR_ARGMAX_AXIS1_8BITS(I8, U8, vxc_char16, vxc_uchar16) +TENSOR_ARGMAX_AXIS1_8BITS(U8, I16, vxc_uchar16, vxc_short8) +TENSOR_ARGMAX_AXIS1_8BITS(U8, U8, vxc_uchar16, vxc_uchar16) + +#define TENSOR_ARGMAX_AXIS1_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \ +__kernel void argmax_axis1_##src_type_name##to##dst_type_name##_2D( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), argLenSub1, 0, 0); \ + src_type src; \ + src_type maxVal; \ + VXC_ReadImage(maxVal, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + dst_type axis; \ + dst_type packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.y --; \ + for (;coord.y >= 0;) \ + { \ + VXC_ReadImage(src, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y --; \ + packIdx --; \ + maxVal = max(maxVal, src); \ + dst_type condition; \ + VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axis = condition ? 
packIdx : axis; \ + } \ + \ + VXC_WriteImage(output, coord.xz, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMAX_AXIS1_8BITS_2D(I8, I16, vxc_char16, vxc_short8) +TENSOR_ARGMAX_AXIS1_8BITS_2D(I8, U8, vxc_char16, vxc_uchar16) +TENSOR_ARGMAX_AXIS1_8BITS_2D(U8, I16, vxc_uchar16, vxc_short8) +TENSOR_ARGMAX_AXIS1_8BITS_2D(U8, U8, vxc_uchar16, vxc_uchar16) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis2.vx new file mode 100644 index 0000000..ac867c9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis2.vx @@ -0,0 +1,117 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int4 packedArgIdx; +_viv_uniform int argLenSub1; +_viv_uniform VXC_512Bits uniExtractData_2x8; + +#define TENSOR_ARGMAX_AXIS2_16BITS(src_type_name, dst_type_name,\ + src_type, copy_type, axis_type, dst_type, inst_type) \ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \ + copy_type vec; \ + src_type src; \ + src_type maxVal; \ + copy_type max; \ + VXC_ReadImage2DArray(max, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, maxVal, max, 16); \ + axis_type axis; \ + axis_type packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.z --; \ + do \ + { \ + VXC_ReadImage2DArray(vec, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + coord.z --; \ + packIdx --; \ + VXC_VertMax3_##inst_type(maxVal, maxVal, maxVal, src, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, max, maxVal, 16); \ + axis = (max == vec) ? 
packIdx : axis; \ + } while (coord.z >= 0); \ + \ + dst_type dst_axis; \ + VXC_DP2x8(dst_axis, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractData_2x8); \ + VXC_WriteImage(output, coord.xy, dst_axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMAX_AXIS2_16BITS(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half) +TENSOR_ARGMAX_AXIS2_16BITS(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half) +TENSOR_ARGMAX_AXIS2_16BITS(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer) +TENSOR_ARGMAX_AXIS2_16BITS(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer) + +#define TENSOR_ARGMAX_AXIS2_16BITS_2D(src_type_name, dst_type_name, src_type, \ + copy_type, axis_type, dst_type, inst_type) \ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMAX_AXIS2_16BITS_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half) +TENSOR_ARGMAX_AXIS2_16BITS_2D(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half) +TENSOR_ARGMAX_AXIS2_16BITS_2D(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer) +TENSOR_ARGMAX_AXIS2_16BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer) + + +#define TENSOR_ARGMAX_AXIS2_8BITS(src_type_name, dst_type_name, src_type, dst_type) \ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \ + src_type src; \ + src_type maxVal; \ + VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + dst_type axis; \ + dst_type packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.z --; \ + do \ + { \ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z --; \ + packIdx --; \ + maxVal = max(maxVal, src); \ + dst_type condition; \ + VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axis = condition ? 
packIdx : axis; \ + } while (coord.z >= 0); \ + \ + VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMAX_AXIS2_8BITS(I8, I16, vxc_char8, vxc_short8) +TENSOR_ARGMAX_AXIS2_8BITS(I8, U8, vxc_char8, vxc_uchar8) +TENSOR_ARGMAX_AXIS2_8BITS(U8, I16, vxc_uchar8, vxc_short8) +TENSOR_ARGMAX_AXIS2_8BITS(U8, U8, vxc_uchar8, vxc_uchar8) + +#define TENSOR_ARGMAX_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8) +TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8) +TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8) +TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/argmin_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/argmin_axis0.vx new file mode 100644 index 0000000..2f3a19d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/argmin_axis0.vx @@ -0,0 +1,240 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int4 packedArgIdx; +_viv_uniform int inputWidth; +_viv_uniform VXC_512Bits uniPackedIdxAddSat_2x8; +_viv_uniform VXC_512Bits uniSrcT2DstT_2x8; + +#define TENSOR_ARGMIN_AXIS0_8BITS(src_type_name, dst_type_name, src_type, \ + cond_type0, cond_type1, dst_type, cond_type) \ +__kernel void argmin_axis0_##src_type_name##to##dst_type_name( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); \ + src_type minValue, minVec, value; \ + dst_type packIdx, currIdx; \ + \ + VXC_ReadImage2DArray(minVec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \ + coord.x += 8; \ + for (; coord.x < inputWidth; ) \ + { \ + VXC_ReadImage2DArray(value, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + \ + currIdx = currIdx + 8; \ + dst_type condition; \ + cond_type0 src_condition0 = value < minVec; \ + cond_type1 src_condition; \ + _viv_asm(COPY, src_condition, src_condition0, 8); \ + cond_type condition_tmp; \ + VXC_DP2x8(condition_tmp, src_condition, src_condition, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniSrcT2DstT_2x8); \ + _viv_asm(COPY, condition, condition_tmp, 16); \ + packIdx = condition ? 
currIdx : packIdx; \ + minVec = min(minVec, value); \ + } \ + \ + VXC_HorzMin3_Integer(minValue, minVec, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_HorzMin3_Integer(minValue, minValue.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + minValue.s01234567 = minValue.s00000000; \ + \ + cond_type1 _minVal; \ + VXC_Clamp(_minVal, minVec, minValue, minValue, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _minVal += 1; \ + \ + VXC_DP2x8(packIdx, packIdx, _minVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniPackedIdxAddSat_2x8); \ + \ + VXC_HorzMin3_Integer(packIdx, packIdx, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_HorzMin3_Integer(packIdx, packIdx.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_WriteImage(output, coord.yz, packIdx, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMIN_AXIS0_8BITS(I8, I16, vxc_char8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8) +TENSOR_ARGMIN_AXIS0_8BITS(I8, U8, vxc_char8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +TENSOR_ARGMIN_AXIS0_8BITS(U8, I16, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8) +TENSOR_ARGMIN_AXIS0_8BITS(U8, U8, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +TENSOR_ARGMIN_AXIS0_8BITS(I16, I16, vxc_short8, vxc_short8, vxc_ushort8, vxc_short8, vxc_ushort8) +TENSOR_ARGMIN_AXIS0_8BITS(I16, U8, vxc_short8, vxc_short8, vxc_ushort8, vxc_uchar8, vxc_uchar8) + +#define TENSOR_ARGMIN_AXIS0_8BITS_2D(src_type_name, dst_type_name, src_type, \ + cond_type0, cond_type1, dst_type, cond_type) \ +__kernel void argmin_axis0_##src_type_name##to##dst_type_name##_2D( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(0, get_global_id(0), 0, 0); \ + src_type minValue, minVec, value; \ + dst_type packIdx, currIdx; \ + \ + VXC_ReadImage(minVec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \ + coord.x += 8; \ + for (; coord.x < inputWidth; ) \ + { \ + VXC_ReadImage(value, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + \ + currIdx = currIdx + 8; \ + dst_type condition; \ + cond_type0 src_condition0 = value < minVec; \ + cond_type1 src_condition; \ + _viv_asm(COPY, src_condition, src_condition0, 8); \ + cond_type condition_tmp; \ + VXC_DP2x8(condition_tmp, src_condition, src_condition, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniSrcT2DstT_2x8); \ + _viv_asm(COPY, condition, condition_tmp, 16); \ + packIdx = condition ? 
currIdx : packIdx; \ + minVec = min(minVec, value); \ + } \ + \ + VXC_HorzMin3_Integer(minValue, minVec, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_HorzMin3_Integer(minValue, minValue.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + minValue.s01234567 = minValue.s00000000; \ + \ + cond_type1 _minVal; \ + VXC_Clamp(_minVal, minVec, minValue, minValue, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _minVal += 1; \ + \ + VXC_DP2x8(packIdx, packIdx, _minVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniPackedIdxAddSat_2x8); \ + \ + VXC_HorzMin3_Integer(packIdx, packIdx, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_HorzMin3_Integer(packIdx, packIdx.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_WriteImage(output, coord.yz, packIdx, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} + +TENSOR_ARGMIN_AXIS0_8BITS_2D(I8, I16, vxc_char8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8) +TENSOR_ARGMIN_AXIS0_8BITS_2D(I8, U8, vxc_char8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +TENSOR_ARGMIN_AXIS0_8BITS_2D(U8, I16, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8) +TENSOR_ARGMIN_AXIS0_8BITS_2D(U8, U8, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +TENSOR_ARGMIN_AXIS0_8BITS_2D(I16, I16, vxc_short8, vxc_short8, vxc_ushort8, vxc_short8, vxc_ushort8) +TENSOR_ARGMIN_AXIS0_8BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_ushort8, vxc_uchar8, vxc_uchar8) + +_viv_uniform VXC_512Bits uniConvertHalf2Float32_4x4; + +#define TENSOR_ARGMIN_AXIS0_F16_2D(dst_type_name, dst_type) \ +__kernel void argmin_axis0_F16to##dst_type_name##_2D( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(0, get_global_id(0), 0, 0); \ + vxc_short8 vec0, vec1; \ + vxc_half8 src; \ + uint4 packIdx, currIdx; \ + float4 minValue, value; \ + \ + VXC_ReadImage(vec0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec0, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \ + coord.x += 4; \ + VXC_DP4x4(minValue, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \ + for (; coord.x < inputWidth; ) \ + { \ + VXC_ReadImage(vec1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec1, 16); \ + coord.x += 4; \ + \ + currIdx = currIdx + 4; \ + VXC_DP4x4(value, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \ + \ + int4 condition; \ + condition = value < minValue; \ + \ + uint4 iCondition; \ + _viv_asm(COPY, iCondition, condition, 16); \ + packIdx = iCondition ? currIdx : packIdx; \ + minValue = value < minValue ? value : minValue; \ + } \ + \ + float4 minVec; \ + float2 minVal2 = minValue.xy < minValue.zw ? minValue.xy : minValue.zw; \ + minVec.x = minVal2.x < minVal2.y ? minVal2.x : minVal2.y; \ + int4 condition; \ + condition = minVec.xxxx == minValue; \ + uint4 iCondition; \ + _viv_asm(COPY, iCondition, condition, 16); \ + iCondition += 1; \ + \ + packIdx = mad_sat(iCondition, 0xFFFFFFFF, packIdx); \ + \ + uint2 val2 = packIdx.xy < packIdx.zw ? packIdx.xy : packIdx.zw; \ + val2.x = val2.x < val2.y ? 
val2.x : val2.y; \ + \ + dst_type dst; \ + _viv_asm(COPY, dst, val2, 4); \ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMIN_AXIS0_F16_2D(I16, vxc_ushort8) +TENSOR_ARGMIN_AXIS0_F16_2D(U8, vxc_uchar8) + + +#define TENSOR_ARGMIN_AXIS0_F16(dst_type_name, dst_type) \ +__kernel void argmin_axis0_F16to##dst_type_name( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); \ + vxc_short8 vec0, vec1; \ + vxc_half8 src; \ + uint4 packIdx, currIdx; \ + float4 minValue, value; \ + \ + VXC_ReadImage2DArray(vec0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec0, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \ + coord.x += 4; \ + VXC_DP4x4(minValue, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \ + for (; coord.x < inputWidth; ) \ + { \ + VXC_ReadImage2DArray(vec1, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec1, 16); \ + coord.x += 4; \ + \ + currIdx = currIdx + 4; \ + VXC_DP4x4(value, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \ + \ + int4 condition; \ + condition = value < minValue; \ + \ + uint4 iCondition; \ + _viv_asm(COPY, iCondition, condition, 16); \ + packIdx = iCondition ? currIdx : packIdx; \ + minValue = value < minValue ? value : minValue; \ + } \ + \ + float4 minVec; \ + float2 minVal2 = minValue.xy < minValue.zw ? minValue.xy : minValue.zw; \ + minVec.x = minVal2.x < minVal2.y ? minVal2.x : minVal2.y; \ + int4 condition; \ + condition = minVec.xxxx == minValue; \ + uint4 iCondition; \ + _viv_asm(COPY, iCondition, condition, 16); \ + iCondition += 1; \ + \ + packIdx = mad_sat(iCondition, 0xFFFFFFFF, packIdx); \ + \ + uint2 val2 = packIdx.xy < packIdx.zw ? packIdx.xy : packIdx.zw; \ + val2.x = val2.x < val2.y ? 
val2.x : val2.y; \ + \ + dst_type dst; \ + _viv_asm(COPY, dst, val2, 4); \ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMIN_AXIS0_F16(I16, vxc_ushort8) +TENSOR_ARGMIN_AXIS0_F16(U8, vxc_uchar8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/argmin_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/argmin_axis1.vx new file mode 100644 index 0000000..04faf97 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/argmin_axis1.vx @@ -0,0 +1,162 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int4 packedArgIdx; +_viv_uniform int argLenSub1; +_viv_uniform VXC_512Bits uniExtractData_2x8; + +#define TENSOR_ARGMIN_AXIS1_16BITS(src_type_name, dst_type_name, src_type,\ + copy_type, axis_type, dst_type, inst_type) \ + __kernel void argmin_axis1_##src_type_name##to##dst_type_name( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), argLenSub1, get_global_id(1), 0); \ + copy_type vec; \ + src_type src; \ + src_type minVal; \ + copy_type min; \ + VXC_ReadImage2DArray(min, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, minVal, min, 16); \ + axis_type axis; \ + axis_type packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.y --; \ + for (;coord.y >= 0;) \ + { \ + VXC_ReadImage2DArray(vec, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + coord.y --; \ + packIdx --; \ + VXC_VertMin3_##inst_type(minVal, minVal, minVal, src, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, min, minVal, 16); \ + axis = (min == vec) ? packIdx : axis; \ + } \ + \ + dst_type dst_axis; \ + VXC_DP2x8(dst_axis, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractData_2x8); \ + VXC_WriteImage(output, coord.xz, dst_axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMIN_AXIS1_16BITS(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half) +TENSOR_ARGMIN_AXIS1_16BITS(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half) +TENSOR_ARGMIN_AXIS1_16BITS(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer) +TENSOR_ARGMIN_AXIS1_16BITS(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer) + +#define TENSOR_ARGMIN_AXIS1_16BITS_2D(src_type_name, dst_type_name, src_type,\ + copy_type, axis_type, dst_type, inst_type) \ + __kernel void argmin_axis1_##src_type_name##to##dst_type_name##_2D( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), argLenSub1, 0, 0); \ + copy_type vec; \ + src_type src; \ + src_type minVal; \ + copy_type min; \ + VXC_ReadImage(min, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, minVal, min, 16); \ + axis_type axis; \ + axis_type packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.y --; \ + for (;coord.y >= 0;) \ + { \ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + coord.y --; \ + packIdx --; \ + VXC_VertMin3_##inst_type(minVal, minVal, minVal, src, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, min, minVal, 16); \ + axis = (min == vec) ? 
packIdx : axis; \ + } \ + \ + dst_type dst_axis; \ + VXC_DP2x8(dst_axis, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractData_2x8); \ + VXC_WriteImage(output, coord.xz, dst_axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMIN_AXIS1_16BITS_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half) +TENSOR_ARGMIN_AXIS1_16BITS_2D(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half) +TENSOR_ARGMIN_AXIS1_16BITS_2D(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer) +TENSOR_ARGMIN_AXIS1_16BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer) + +#define TENSOR_ARGMIN_AXIS1_8BITS(src_type_name, dst_type_name, src_type, dst_type) \ +__kernel void argmin_axis1_##src_type_name##to##dst_type_name( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), argLenSub1, get_global_id(1), 0); \ + src_type src; \ + src_type minVal; \ + VXC_ReadImage2DArray(minVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + dst_type axis; \ + dst_type packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.y --; \ + for (;coord.y >= 0;) \ + { \ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y --; \ + packIdx --; \ + minVal = min(minVal, src); \ + dst_type condition; \ + VXC_Clamp(condition, src, minVal, minVal, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axis = condition ? packIdx : axis; \ + } \ + \ + VXC_WriteImage(output, coord.xz, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMIN_AXIS1_8BITS(I8, I16, vxc_char16, vxc_short8) +TENSOR_ARGMIN_AXIS1_8BITS(I8, U8, vxc_char16, vxc_uchar16) +TENSOR_ARGMIN_AXIS1_8BITS(U8, I16, vxc_uchar16, vxc_short8) +TENSOR_ARGMIN_AXIS1_8BITS(U8, U8, vxc_uchar16, vxc_uchar16) + +#define TENSOR_ARGMIN_AXIS1_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \ +__kernel void argmin_axis1_##src_type_name##to##dst_type_name##_2D( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), argLenSub1, 0, 0); \ + src_type src; \ + src_type minVal; \ + VXC_ReadImage(minVal, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + dst_type axis; \ + dst_type packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.y --; \ + for (;coord.y >= 0;) \ + { \ + VXC_ReadImage(src, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y --; \ + packIdx --; \ + minVal = min(minVal, src); \ + dst_type condition; \ + VXC_Clamp(condition, src, minVal, minVal, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axis = condition ? 
packIdx : axis; \ + } \ + \ + VXC_WriteImage(output, coord.xz, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMIN_AXIS1_8BITS_2D(I8, I16, vxc_char16, vxc_short8) +TENSOR_ARGMIN_AXIS1_8BITS_2D(I8, U8, vxc_char16, vxc_uchar16) +TENSOR_ARGMIN_AXIS1_8BITS_2D(U8, I16, vxc_uchar16, vxc_short8) +TENSOR_ARGMIN_AXIS1_8BITS_2D(U8, U8, vxc_uchar16, vxc_uchar16) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/argmin_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/argmin_axis2.vx new file mode 100644 index 0000000..f242075 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/argmin_axis2.vx @@ -0,0 +1,117 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int4 packedArgIdx; +_viv_uniform int argLenSub1; +_viv_uniform VXC_512Bits uniExtractData_2x8; + +#define TENSOR_ARGMIN_AXIS2_16BITS(src_type_name, dst_type_name,\ + src_type, copy_type, axis_type, dst_type, inst_type) \ + __kernel void argmin_axis2_##src_type_name##to##dst_type_name( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \ + copy_type vec; \ + src_type src; \ + src_type minVal; \ + copy_type min; \ + VXC_ReadImage2DArray(min, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, minVal, min, 16); \ + axis_type axis; \ + axis_type packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.z --; \ + do \ + { \ + VXC_ReadImage2DArray(vec, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + coord.z --; \ + packIdx --; \ + VXC_VertMin3_##inst_type(minVal, minVal, minVal, src, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, min, minVal, 16); \ + axis = (min == vec) ? 
packIdx : axis; \ + } while (coord.z >= 0); \ + \ + dst_type dst_axis; \ + VXC_DP2x8(dst_axis, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractData_2x8); \ + VXC_WriteImage(output, coord.xy, dst_axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMIN_AXIS2_16BITS(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half) +TENSOR_ARGMIN_AXIS2_16BITS(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half) +TENSOR_ARGMIN_AXIS2_16BITS(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer) +TENSOR_ARGMIN_AXIS2_16BITS(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer) + +#define TENSOR_ARGMIN_AXIS2_16BITS_2D(src_type_name, dst_type_name, src_type, \ + copy_type, axis_type, dst_type, inst_type) \ + __kernel void argmin_axis2_##src_type_name##to##dst_type_name##_2D( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMIN_AXIS2_16BITS_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half) +TENSOR_ARGMIN_AXIS2_16BITS_2D(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half) +TENSOR_ARGMIN_AXIS2_16BITS_2D(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer) +TENSOR_ARGMIN_AXIS2_16BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer) + + +#define TENSOR_ARGMIN_AXIS2_8BITS(src_type_name, dst_type_name, src_type, dst_type) \ + __kernel void argmin_axis2_##src_type_name##to##dst_type_name( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \ + src_type src; \ + src_type minVal; \ + VXC_ReadImage2DArray(minVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + dst_type axis; \ + dst_type packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.z --; \ + do \ + { \ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z --; \ + packIdx --; \ + minVal = min(minVal, src); \ + dst_type condition; \ + VXC_Clamp(condition, src, minVal, minVal, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axis = condition ? 
packIdx : axis; \ + } while (coord.z >= 0); \ + \ + VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMIN_AXIS2_8BITS(I8, I16, vxc_char8, vxc_short8) +TENSOR_ARGMIN_AXIS2_8BITS(I8, U8, vxc_char8, vxc_uchar8) +TENSOR_ARGMIN_AXIS2_8BITS(U8, I16, vxc_uchar8, vxc_short8) +TENSOR_ARGMIN_AXIS2_8BITS(U8, U8, vxc_uchar8, vxc_uchar8) + +#define TENSOR_ARGMIN_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \ + __kernel void argmin_axis2_##src_type_name##to##dst_type_name##_2D( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMIN_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8) +TENSOR_ARGMIN_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8) +TENSOR_ARGMIN_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8) +TENSOR_ARGMIN_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single.vx b/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single.vx new file mode 100644 index 0000000..9ac4945 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single.vx @@ -0,0 +1,272 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDatatoF32_0_4x4; +_viv_uniform VXC_512Bits uniDatatoF32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform float input_scale; +_viv_uniform float input_tail; +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define BATCH_NORM_SH_IMPL(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ +__kernel void batch_norm_##name0##to##name1##_brdcst1( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t Mean, \ + __read_only image2d_array_t Variance, \ + __read_only image2d_array_t Gamma, \ + __read_only image2d_array_t Beta, \ + __write_only image2d_array_t output, \ + float eps \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + read_type vec; \ + src_type src; \ + VXC_ReadImage2DArray(vec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + vxc_ushort8 _mean, _var, _gamma; \ + vxc_half8 mean, var, gamma; \ + VXC_ReadImage2DArray(_mean, Mean, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, mean, _mean, 16); \ + VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, var, _var, 16); \ + VXC_ReadImage2DArray(_gamma, Gamma, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, gamma, _gamma, 16); \ + float4 beta = read_imagef(Beta, coord); \ + \ + float4 src0, src1, m, v, g; \ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + g = g * rsqrt(v + eps); \ + src0 = src0 * input_scale + input_tail; \ + src0 = (src0 - m) * g + beta.xxxx; \ + src0 = src0 * output_scale + output_zp; \ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(m, mean, mean, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + g = g * rsqrt(v + eps); \ + src1 = src1 * input_scale + input_tail; \ + src1 = (src1 - m) * g + beta.xxxx; \ + src1 = src1 * output_scale + output_zp; \ + \ + conv_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, src0); \ + _viv_asm(CONV_RTE, dst1, src1); \ + dst_type tmp; \ + save_type dst; \ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmp, 16); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +BATCH_NORM_SH_IMPL(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) + +#define BATCH_NORM_SH_IMPL_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ +__kernel void batch_norm_##name0##to##name1##_brdcst1_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t Mean, \ + __read_only image2d_t Variance, \ + __read_only image2d_t Gamma, \ + __read_only image2d_t Beta, \ + __write_only image2d_array_t output, \ + float eps \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + read_type vec; \ + src_type src; \ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + vxc_ushort8 _mean, _var, _gamma; \ + vxc_half8 mean, var, gamma; \ + VXC_ReadImage(_mean, Mean, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, mean, _mean, 16); \ + VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, var, _var, 16); \ + VXC_ReadImage(_gamma, Gamma, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, gamma, _gamma, 16); \ + float4 beta = read_imagef(Beta, coord.xy); \ + \ + float4 src0, src1, m, v, g; \ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + g = g * rsqrt(v + eps); \ + src0 = src0 * input_scale + input_tail; \ + src0 = (src0 - m) * g + beta.xxxx; \ + src0 = src0 * output_scale + output_zp; \ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(v, var, 
var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + g = g * rsqrt(v + eps); \ + src1 = src1 * input_scale + input_tail; \ + src1 = (src1 - m) * g + beta.xxxx; \ + src1 = src1 * output_scale + output_zp; \ + \ + conv_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, src0); \ + _viv_asm(CONV_RTE, dst1, src1); \ + dst_type tmp; \ + save_type dst; \ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmp, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +BATCH_NORM_SH_IMPL_2D(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_2D(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_2D(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_2D(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) + + +#define BATCH_NORM_SH_IMPL_AXIS1(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ +__kernel void batch_norm_##name0##to##name1##_brdcst0( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t Mean, \ + __read_only image2d_array_t Variance, \ + __read_only image2d_array_t Gamma, \ + __read_only image2d_array_t Beta, \ + __write_only image2d_array_t output, \ + float eps \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + read_type vec; \ + src_type src; \ + VXC_ReadImage2DArray(vec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + vxc_ushort8 _mean, _var, _gamma; \ + vxc_half8 mean, var, gamma; \ + VXC_ReadImage2DArray(_mean, Mean, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, mean, _mean, 16); \ + VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, var, _var, 16); \ + VXC_ReadImage2DArray(_gamma, Gamma, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, gamma, _gamma, 16); \ + float4 beta0 = read_imagef(Beta, coord); \ + coord.x += 4; \ + float4 beta1 = read_imagef(Beta, coord); \ + coord.x -= 4; \ + \ + float4 src0, src1, m, v, g; \ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + g = g * rsqrt(v + eps); \ + src0 = src0 * input_scale + input_tail; \ + src0 = (src0 - m) * g + beta0; \ + src0 = src0 * output_scale + output_zp; \ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + g = g * rsqrt(v + eps); \ + src1 = src1 * input_scale + input_tail; \ + src1 = (src1 - m) * g + beta1; \ + src1 = src1 * output_scale + output_zp; \ + \ + conv_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, src0); \ + _viv_asm(CONV_RTE, dst1, src1); \ + dst_type tmp; \ + save_type dst; \ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmp, 16); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +BATCH_NORM_SH_IMPL_AXIS1(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_AXIS1(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_AXIS1(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_AXIS1(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) + +#define BATCH_NORM_SH_IMPL_AXIS1_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ +__kernel void batch_norm_##name0##to##name1##_brdcst0_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t Mean, \ + __read_only image2d_t Variance, \ + __read_only image2d_t Gamma, \ + __read_only image2d_t Beta, \ + __write_only image2d_array_t output, \ + float eps \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + read_type vec; \ + src_type src; \ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + coord.z += 4; \ + vxc_ushort8 _mean, _var, _gamma; \ + vxc_half8 mean, var, gamma; \ + VXC_ReadImage(_mean, Mean, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, mean, _mean, 16); \ + VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, var, _var, 16); \ + VXC_ReadImage(_gamma, Gamma, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, gamma, _gamma, 16); \ + float4 beta0 = read_imagef(Beta, coord.xy); \ + float4 beta1 = read_imagef(Beta, coord.zy); \ + \ + float4 src0, src1, m, v, g; \ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + g = g * rsqrt(v + eps); \ + src0 = src0 * input_scale + input_tail; \ + src0 = (src0 - m) * g + beta0; \ + src0 = src0 * output_scale + output_zp; \ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + 
VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + g = g * rsqrt(v + eps); \ + src1 = src1 * input_scale + input_tail; \ + src1 = (src1 - m) * g + beta1; \ + src1 = src1 * output_scale + output_zp; \ + \ + conv_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, src0); \ + _viv_asm(CONV_RTE, dst1, src1); \ + dst_type tmp; \ + save_type dst; \ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmp, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_AXIS1_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_AXIS1_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cast.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cast.vx new file mode 100644 index 0000000..09e2587 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cast.vx @@ -0,0 +1,149 @@ + +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataConvert_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +#define CAST_PROCESS(read_type, src_type, dst_type, write_type, read_fun, write_fun) \ + read_type read_val; \ + src_type src_val; \ + dst_type dst_val; \ + write_type write_val; \ + read_fun(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src_val, read_val, 16); \ + VXC_DP2x8(dst_val, src_val, src_val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniDataConvert_2x8); \ + _viv_asm(COPY, write_val, dst_val, 16); \ + write_fun(output, coord, write_val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +#define CAST_FUN(src_name, dst_name, read_type, src_type, dst_type, write_type) \ +__kernel void cast_##src_name##to##dst_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + CAST_PROCESS(read_type, src_type, dst_type, write_type, VXC_ReadImage2DArray, VXC_WriteImage2DArray) \ +} + +CAST_FUN(F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +CAST_FUN(F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8) +CAST_FUN(F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8) +CAST_FUN(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8) +CAST_FUN(I16, I8, vxc_short8, vxc_short8, vxc_char8, vxc_char8) +CAST_FUN(I16, U8, vxc_short8, vxc_short8, vxc_uchar8, vxc_uchar8) +CAST_FUN(I8, F16, vxc_char8, vxc_char8, vxc_half8, vxc_short8) 
+CAST_FUN(I8, I16, vxc_char8, vxc_char8, vxc_short8, vxc_short8) +CAST_FUN(I8, U8, vxc_char8, vxc_char8, vxc_uchar8, vxc_uchar8) +CAST_FUN(U8, F16, vxc_uchar8, vxc_uchar8, vxc_half8, vxc_short8) +CAST_FUN(U8, I16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_short8) +CAST_FUN(U8, I8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_char8) + + +#define CAST_FUN_2D(src_name, dst_name, read_type, src_type, dst_type, write_type) \ +__kernel void cast_##src_name##to##dst_name##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + CAST_PROCESS(read_type, src_type, dst_type, write_type, VXC_ReadImage, VXC_WriteImage) \ +} + +CAST_FUN_2D(F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +CAST_FUN_2D(F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8) +CAST_FUN_2D(F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8) +CAST_FUN_2D(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8) +CAST_FUN_2D(I16, I8, vxc_short8, vxc_short8, vxc_char8, vxc_char8) +CAST_FUN_2D(I16, U8, vxc_short8, vxc_short8, vxc_uchar8, vxc_uchar8) +CAST_FUN_2D(I8, F16, vxc_char8, vxc_char8, vxc_half8, vxc_short8) +CAST_FUN_2D(I8, I16, vxc_char8, vxc_char8, vxc_short8, vxc_short8) +CAST_FUN_2D(I8, U8, vxc_char8, vxc_char8, vxc_uchar8, vxc_uchar8) +CAST_FUN_2D(U8, F16, vxc_uchar8, vxc_uchar8, vxc_half8, vxc_short8) +CAST_FUN_2D(U8, I16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_short8) +CAST_FUN_2D(U8, I8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_char8) + +#define CAST_TO_BOOL_PROCESS(src_type, tmp_type, read_fun, write_fun) \ + src_type src_val; \ + tmp_type tmp_val; \ + vxc_char8 dst_val; \ + read_fun(src_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + tmp_val = (src_val != 0); \ + tmp_val *= (-1); \ + VXC_DP2x8(dst_val, tmp_val, tmp_val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDataConvert_2x8); \ + write_fun(output, coord, dst_val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +#define CAST_TO_BOOL_FUN(src_name, src_type, tmp_type) \ +__kernel void cast_##src_name##toBOOL8( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + CAST_TO_BOOL_PROCESS(src_type, tmp_type, VXC_ReadImage2DArray, VXC_WriteImage2DArray) \ +} + +CAST_TO_BOOL_FUN(F16, vxc_short8, vxc_short8) +CAST_TO_BOOL_FUN(I16, vxc_short8, vxc_short8) +CAST_TO_BOOL_FUN(I8, vxc_char8, vxc_char8) +CAST_TO_BOOL_FUN(U8, vxc_uchar8, vxc_char8) + +#define CAST_TO_BOOL_FUN_2D(src_name, src_type, tmp_type) \ +__kernel void cast_##src_name##toBOOL8_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + CAST_TO_BOOL_PROCESS(src_type, tmp_type, VXC_ReadImage, VXC_WriteImage) \ +} + +CAST_TO_BOOL_FUN_2D(F16, vxc_short8, vxc_short8) +CAST_TO_BOOL_FUN_2D(I16, vxc_short8, vxc_short8) +CAST_TO_BOOL_FUN_2D(I8, vxc_char8, vxc_char8) +CAST_TO_BOOL_FUN_2D(U8, vxc_uchar8, vxc_char8) + +#define CAST_F32orI32_PROCESS(src_type, dst_type, read_fun, write_fun) \ + src_type src_val0, src_val1; \ + dst_type dst_val; \ + int4 tmpData1, tmpData2; \ + src_val0 = read_fun(input, coord); \ + coord.x += 4; \ + src_val1 = read_fun(input, coord); \ + tmpData1 = convert_int4(src_val0); \ + tmpData2 = convert_int4(src_val1); \ + VXC_DP2x8(dst_val, tmpData1, tmpData2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + 
uniConvertInt32toUint8_2x8); \ + coord.x -= 4; \ + write_fun(output, coord, dst_val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +#define CAST_F32orI32_FUN(src_name, dst_name, src_type, dst_type, read_fun) \ +__kernel void cast_##src_name##to##dst_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + CAST_F32orI32_PROCESS(src_type, dst_type, read_fun, VXC_WriteImage2DArray) \ +} + +CAST_F32orI32_FUN(F32, I16, float4, vxc_short8, read_imagef) +CAST_F32orI32_FUN(F32, I8, float4, vxc_char8, read_imagef) +CAST_F32orI32_FUN(F32, U8, float4, vxc_uchar8, read_imagef) +CAST_F32orI32_FUN(I32, I16, int4, vxc_short8, read_imagei) +CAST_F32orI32_FUN(I32, I8, int4, vxc_char8, read_imagei) +CAST_F32orI32_FUN(I32, U8, int4, vxc_uchar8, read_imagei) + + +#define CAST_F32orI32_FUN_2D(src_name, dst_name, src_type, dst_type, read_fun) \ +__kernel void cast_##src_name##to##dst_name##_2D( \ + __read_only image2d_t input, \ + __write_only image2d_array_t output) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + CAST_F32orI32_PROCESS(src_type, dst_type, read_fun, VXC_WriteImage) \ +} + +CAST_F32orI32_FUN_2D(F32, I16, float4, vxc_short8, read_imagef) +CAST_F32orI32_FUN_2D(F32, I8, float4, vxc_char8, read_imagef) +CAST_F32orI32_FUN_2D(F32, U8, float4, vxc_uchar8, read_imagef) +CAST_F32orI32_FUN_2D(I32, I16, int4, vxc_short8, read_imagei) +CAST_F32orI32_FUN_2D(I32, I8, int4, vxc_char8, read_imagei) +CAST_F32orI32_FUN_2D(I32, U8, int4, vxc_uchar8, read_imagei) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/clip_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/clip_F16.vx new file mode 100644 index 0000000..0843ea6 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/clip_F16.vx @@ -0,0 +1,128 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int4 packedMinData_FP16; +_viv_uniform int4 packedMaxData_FP16; +_viv_uniform VXC_512Bits uniConvertF16toInt_2x8; +_viv_uniform int2 multAndoutZP; +_viv_uniform VXC_512Bits uniDataMulAndPostShift_2x8; + +#define TENSORCLIP_F16TOF16_PROCESS(read_fun, write_fun) \ + vxc_short8 vec0, dst; \ + vxc_half8 src0, src1, minHf, maxHf; \ + read_fun(vec0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, vec0, 16); \ + _viv_asm(COPY, minHf, packedMinData_FP16, 16); \ + _viv_asm(COPY, maxHf, packedMaxData_FP16, 16); \ + VXC_Clamp_Half(src1, src0, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \ + _viv_asm(COPY, dst, src1, 16); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + +__kernel void clip_F16toF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + TENSORCLIP_F16TOF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void clip_F16toF16_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + TENSORCLIP_F16TOF16_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +#define TENSORCLIP_F16TOINT_PROCESS(read_fun, write_fun, dst_type) \ + vxc_short8 vec0; \ + dst_type dst; \ + vxc_half8 src0, src1, minHf, maxHf; \ + read_fun(vec0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, vec0, 16); \ + _viv_asm(COPY, 
minHf, packedMinData_FP16, 16); \ + _viv_asm(COPY, maxHf, packedMaxData_FP16, 16); \ + VXC_Clamp_Half(src1, src0, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \ + VXC_DP2x8(dst, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertF16toInt_2x8); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + +__kernel void clip_F16toI16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + TENSORCLIP_F16TOINT_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray, vxc_short8) +} + +__kernel void clip_F16toI16_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + TENSORCLIP_F16TOINT_PROCESS(VXC_ReadImage, VXC_WriteImage, vxc_short8) +} + +__kernel void clip_F16toI8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + TENSORCLIP_F16TOINT_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray, vxc_char16) +} + +__kernel void clip_F16toI8_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + TENSORCLIP_F16TOINT_PROCESS(VXC_ReadImage, VXC_WriteImage, vxc_char16) +} + +#define TENSORCLIP_F16TOU8_PROCESS(read_fun, write_fun) \ + vxc_short8 vec0; \ + vxc_uchar16 dst; \ + vxc_half8 src0, src1, minHf, maxHf; \ + read_fun(vec0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, vec0, 16); \ + _viv_asm(COPY, minHf, packedMinData_FP16, 16); \ + _viv_asm(COPY, maxHf, packedMaxData_FP16, 16); \ + VXC_Clamp_Half(src1, src0, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst, src1, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMulAndPostShift_2x8); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + +__kernel void clip_F16toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + TENSORCLIP_F16TOU8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void clip_F16toU8_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + TENSORCLIP_F16TOU8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/clip_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/clip_I16.vx new file mode 100644 index 0000000..cbbabb0 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/clip_I16.vx @@ -0,0 +1,66 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertIntegerLo_2x8; +_viv_uniform int4 packedMinData; +_viv_uniform int4 packedMaxData; + +#define TENSORCLIP_I16TOI16_PROCESS(read_fun, write_fun) \ + vxc_short8 src0, min, max; \ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertIntegerLo_2x8); \ + _viv_asm(COPY, min, 
packedMinData, 16); \ + _viv_asm(COPY, max, packedMaxData, 16); \ + VXC_Clamp(src0, src0, min, max, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \ + write_fun(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + +__kernel void clip_I16toI16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + TENSORCLIP_I16TOI16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void clip_I16toI16_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + TENSORCLIP_I16TOI16_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +#define TENSORCLIP_I16TOF16_PROCESS(read_fun, write_fun) \ + vxc_short8 src0, dst; \ + vxc_half8 src1, src2, minHf, maxHf; \ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(src1, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertIntegerLo_2x8); \ + _viv_asm(COPY, minHf, packedMinData, 16); \ + _viv_asm(COPY, maxHf, packedMaxData, 16); \ + VXC_Clamp_Half(src2, src1, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \ + _viv_asm(COPY, dst, src2, 16); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + + +__kernel void clip_I16toF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + TENSORCLIP_I16TOF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void clip_I16toF16_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + TENSORCLIP_I16TOF16_PROCESS(VXC_ReadImage, VXC_WriteImage) +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/clip_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/clip_I8.vx new file mode 100644 index 0000000..f13566d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/clip_I8.vx @@ -0,0 +1,73 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertIntegerLo_2x8; +_viv_uniform VXC_512Bits uniConvertIntegerHi_2x8; +_viv_uniform int4 packedMinData; +_viv_uniform int4 packedMaxData; + +#define TENSORCLIP_I8TOI8_PROCESS(read_fun, write_fun) \ + vxc_char16 src0, min, max; \ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertIntegerLo_2x8); \ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertIntegerHi_2x8); \ + _viv_asm(COPY, min, packedMinData, 16); \ + _viv_asm(COPY, max, packedMaxData, 16); \ + VXC_Clamp(src0, src0, min, max, VXC_MODIFIER_CLAMP(0, 15, 0, 0)); \ + write_fun(output, coord, src0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + +__kernel void clip_I8toI8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + TENSORCLIP_I8TOI8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void clip_I8toI8_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + 
TENSORCLIP_I8TOI8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +#define TENSORCLIP_I8TOF16_PROCESS(read_fun, write_fun) \ + vxc_char16 src0; \ + vxc_short8 dst0, dst1; \ + vxc_half8 src1, src2, src3, src4, minHf, maxHf; \ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(src1, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertIntegerLo_2x8); \ + VXC_DP2x8(src2, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertIntegerHi_2x8); \ + _viv_asm(COPY, minHf, packedMinData, 16); \ + _viv_asm(COPY, maxHf, packedMaxData, 16); \ + VXC_Clamp_Half(src3, src1, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \ + VXC_Clamp_Half(src4, src2, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \ + _viv_asm(COPY, dst0, src3, 16); \ + _viv_asm(COPY, dst1, src4, 16); \ + write_fun(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + write_fun(output, coord, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void clip_I8toF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + TENSORCLIP_I8TOF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void clip_I8toF16_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + TENSORCLIP_I8TOF16_PROCESS(VXC_ReadImage, VXC_WriteImage) +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/clip_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/clip_U8.vx new file mode 100644 index 0000000..ecfc465 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/clip_U8.vx @@ -0,0 +1,86 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int4 packedMinData; +_viv_uniform int4 packedMaxData; +_viv_uniform VXC_512Bits uniU8MulAndPostShift_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift_Hi_2x8; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp + +#define TENSORCLIP_U8TOU8_PROCESS(read_fun, write_fun) \ + vxc_uchar16 vec0, min, max, dst; \ + read_fun(vec0, input, coord,\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst, vec0, multiplier,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8MulAndPostShift_Lo_2x8); \ + VXC_DP2x8(dst, vec0, multiplier,\ + VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8MulAndPostShift_Hi_2x8); \ + _viv_asm(COPY, min, packedMinData, 16); \ + _viv_asm(COPY, max, packedMaxData, 16); \ + VXC_Clamp(dst, dst, min, max, VXC_MODIFIER_CLAMP(0, 15, 0, 0)); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + + +__kernel void clip_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + TENSORCLIP_U8TOU8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void clip_U8toU8_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + TENSORCLIP_U8TOU8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +#define TENSORCLIP_U8TOF16_PROCESS(read_fun, write_fun) \ + vxc_uchar16 vec0; \ + vxc_short8 dst0, dst1; \ + 
vxc_half8 src1, src2, src3, src4, minHf, maxHf; \ + read_fun(vec0, input, coord,\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(src1, vec0, multiplier,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8MulAndPostShift_Lo_2x8); \ + VXC_DP2x8(src2, vec0, multiplier,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8MulAndPostShift_Hi_2x8); \ + _viv_asm(COPY, minHf, packedMinData, 16); \ + _viv_asm(COPY, maxHf, packedMaxData, 16); \ + VXC_Clamp_Half(src3, src1, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \ + VXC_Clamp_Half(src4, src2, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \ + _viv_asm(COPY, dst0, src3, 16); \ + _viv_asm(COPY, dst1, src4, 16); \ + write_fun(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + write_fun(output, coord, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + +__kernel void clip_U8toF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + TENSORCLIP_U8TOF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void clip_U8toF16_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + TENSORCLIP_U8TOF16_PROCESS(VXC_ReadImage, VXC_WriteImage) +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx b/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx new file mode 100644 index 0000000..0d4ac70 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx @@ -0,0 +1,115 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; + +#define DEPTH2SPACE_CRD_QINT_TO_QINT(src0_type_name, src1_type_name, read_type, write_type) \ +__kernel void depth2space_crd_##src0_type_name##to##src1_type_name( \ + image2d_array_t input, \ + image2d_array_t output, \ + int block_size \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \ + int block_e2 = block_size * block_size; \ + int inx = gidx / block_size; \ + int iny = gidy / block_size; \ + int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \ + int4 coord_in = (int4)(inx, iny, inz, 0); \ + read_type src; \ + VXC_ReadImage2DArray(src,input,coord_in,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,0,0,VXC_RM_TowardZero, 0)); \ + \ + write_type dst; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(dst,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +DEPTH2SPACE_CRD_QINT_TO_QINT(U8, U8, vxc_uchar16, vxc_uchar16) +DEPTH2SPACE_CRD_QINT_TO_QINT(I8, I8, vxc_char16, vxc_char16) +DEPTH2SPACE_CRD_QINT_TO_QINT(I16, I16, vxc_short8, vxc_short8) + +__kernel void depth2space_crd_F16toF16( + image2d_array_t input, + image2d_array_t output, + int block_size + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(gidx, gidy, gidz, 0); + int block_e2 = block_size * block_size; + int inx = gidx / 
block_size; + int iny = gidy / block_size; + int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; + int4 coord_in = (int4)(inx, iny, inz, 0); + vxc_short8 data; + VXC_ReadImage2DArray(data,input,coord_in,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,0,0,VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord_out, data, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +#define DEPTH2SPACE_CRD_QINT_TO_F16(src0_type_name, read_type) \ +__kernel void depth2space_crd_##src0_type_name##toF16( \ + image2d_array_t input, \ + image2d_array_t output, \ + int block_size \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \ + int block_e2 = block_size * block_size; \ + int inx = gidx / block_size; \ + int iny = gidy / block_size; \ + int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \ + int4 coord_in = (int4)(inx, iny, inz, 0); \ + read_type src; \ + VXC_ReadImage2DArray(src,input,coord_in,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,0,0,VXC_RM_TowardZero, 0)); \ + \ + vxc_half8 tmpDst; \ + vxc_short8 dst; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(tmpDst,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst, tmpDst, 16); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +DEPTH2SPACE_CRD_QINT_TO_F16(U8, vxc_uchar16) +DEPTH2SPACE_CRD_QINT_TO_F16(I8, vxc_char16) +DEPTH2SPACE_CRD_QINT_TO_F16(I16, vxc_short8) + +#define DEPTH2SPACE_CRD_F16_TO_QINT(src1_type_name, write_type) \ +__kernel void depth2space_crd_F16to##src1_type_name( \ + image2d_array_t input, \ + image2d_array_t output, \ + int block_size \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \ + int block_e2 = block_size * block_size; \ + int inx = gidx / block_size; \ + int iny = gidy / block_size; \ + int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \ + int4 coord_in = (int4)(inx, iny, inz, 0); \ + vxc_short8 src; \ + VXC_ReadImage2DArray(src,input,coord_in,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,0,0,VXC_RM_TowardZero, 0)); \ + \ + write_type dst; \ + vxc_half8 data; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +DEPTH2SPACE_CRD_F16_TO_QINT(U8, vxc_uchar16) +DEPTH2SPACE_CRD_F16_TO_QINT(I8, vxc_char16) +DEPTH2SPACE_CRD_F16_TO_QINT(I16, vxc_short8) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/depthwise_conv1d_src0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/depthwise_conv1d_src0.vx new file mode 100644 index 0000000..70fb9fc --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/depthwise_conv1d_src0.vx @@ -0,0 +1,99 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8ConvS16_align8_step0_16x1; +_viv_uniform VXC_512Bits uniU8ConvS16_align8_step1_16x1; +_viv_uniform VXC_512Bits uniU8SubZp_lo_2x8; +_viv_uniform VXC_512Bits uniU8SubZp_hi_2x8; +_viv_uniform int weightZP; +_viv_uniform float outputZP; +_viv_uniform float scale; +_viv_uniform int kernel_size_x16; +_viv_uniform int kernel_size_x8; + +__kernel void 
vxDW_Conv1D_U8toU8_KN_D1( +__read_only image2d_array_t input, +__read_only image2d_array_t weight, +__read_only image2d_t bias, +__write_only image2d_array_t output, + int pad, + int stride, + int dilation) +{ +int2 coord_in = (int2)(get_global_id(0) * stride - pad, get_global_id(1)); +int4 coord = (int4)(get_global_id(0), 0, 0, get_global_id(1)); +vxc_uchar4 zp; +vxc_uchar16 src; +vxc_uchar16 w; +vxc_short8 coef; +int4 sum, sum0; + +_viv_asm(COPY, zp, weightZP, 4); +sum = read_imagei(bias, coord.wz); + +while(coord.y < kernel_size_x16) +{ + VXC_ReadImage(src, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(w, weight, coord.yw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(coef, w, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); + VXC_DP16x1(sum0, src, coef, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\ + uniU8ConvS16_align8_step0_16x1); + sum.x += sum0.x; + VXC_DP2x8(coef, w, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); + VXC_DP16x1(sum0, src, coef, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\ + uniU8ConvS16_align8_step1_16x1); + sum.x += sum0.x; + coord_in.x += 16; + coord.y += 16; +} + +if (kernel_size_x8) +{ + VXC_ReadImage(src, input, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(w, weight, coord.yw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(coef, w, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); + VXC_DP16x1(sum0, src, coef, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\ + uniU8ConvS16_align8_step0_16x1); + sum.x += sum0.x; +} +float4 result = convert_float4(sum.x) * scale + outputZP; +uchar4 dst = convert_uchar4_sat(result); +VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void vxDW_Conv1D_U8toU8_KN_D2( +__read_only image2d_array_t input, +__read_only image2d_array_t weight, +__read_only image2d_t bias, +__write_only image2d_array_t output, + int pad, + int stride, + int dilation) +{ +int2 coord_in = (int2)(get_global_id(0) * stride - pad, get_global_id(1)); +int4 coord = (int4)(get_global_id(0), 0, 0, get_global_id(1)); +vxc_uchar4 zp; +vxc_uchar16 src; +vxc_uchar16 w; +vxc_short8 coef; +int4 sum, sum0; + +_viv_asm(COPY, zp, weightZP, 4); +sum = read_imagei(bias, coord.wz); + +while(coord.y < kernel_size_x8) +{ + VXC_ReadImage(src, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(w, weight, coord.yw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(coef, w, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); + VXC_DP16x1(sum0, src, coef, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\ + uniU8ConvS16_align8_step0_16x1); + sum.x += sum0.x; + coord_in.x += 16; + coord.y += 8; +} + +float4 result = convert_float4(sum.x) * scale + outputZP; +uchar4 dst = convert_uchar4_sat(result); +VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/depthwise_conv1d_src1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/depthwise_conv1d_src1.vx new file mode 100644 index 0000000..117a848 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/depthwise_conv1d_src1.vx @@ -0,0 +1,194 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe0_8x2b; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe1_8x2b; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe2_8x2b; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe3_8x2b; +_viv_uniform VXC_512Bits 
uniU8SubZp_lo_2x8; +_viv_uniform VXC_512Bits uniU8SubZp_hi_2x8; +_viv_uniform VXC_512Bits uniExtractInteger_2x8; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe4_8x2b; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe5_8x2b; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe6_8x2b; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe7_8x2b; +_viv_uniform int weightZP; +_viv_uniform float outputZP; +_viv_uniform float scale; + +__kernel void vxDW_Conv1D_U8toU8_K40_D1( +__read_only image2d_array_t input, +__read_only image2d_array_t weight, +__read_only image2d_t bias, +__write_only image2d_array_t output, + int pad, + int stride, + int dilation) +{ +int4 coord_in = (int4)(get_global_id(0) * stride - pad + 16,\ +get_global_id(0) * stride - pad + 48, get_global_id(1), 0); +vxc_uchar32 src0, src1; +vxc_uchar16 s0, s1, s2; +vxc_uchar16 w0, w1, w2; +int4 sum, sumB; +sum = read_imagei(bias, coord_in.zw); +VXC_ReadImage(src0.hi, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(src0.lo, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(src1.lo, input, coord_in.yz, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(s2, input, coord_in.yz, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +int4 coord = (int4)(get_global_id(0), 16, 32, get_global_id(1)); +VXC_ReadImage(w0, weight, coord.yw, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(w1, weight, coord.yw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(w2, weight, coord.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +sum = sum.xxxx; +sumB = sum.xxxx; +int4 sum0, sum1; +vxc_uchar4 zp; +_viv_asm(COPY, zp, weightZP, 4); +vxc_short8 coef; +VXC_DP2x8(coef, w0, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +VXC_DP2x8(coef, w0, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b); +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b); +sum += sum1; +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b); +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b); +sumB += sum1; +src1.hi = src0.lo; +VXC_DP2x8(coef, w1, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, 
VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +VXC_DP2x8(coef, w1, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b); +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b); +sum += sum1; +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b); +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b); +sumB += sum1; +src0.hi = src1.lo; +src0.lo = s2; +VXC_DP2x8(coef, w2, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +float4 result0 = convert_float4(sum) * scale + outputZP; +float4 result1 = convert_float4(sumB) * scale + outputZP; +int4 dst0 = convert_int4(result0); +int4 dst1 = convert_int4(result1); +vxc_uchar16 dst; +VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtractInteger_2x8); +VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void vxDW_Conv1D_U8toU8_K56_D1( +__read_only image2d_array_t input, +__read_only image2d_array_t weight, +__read_only image2d_t bias, +__write_only image2d_array_t output, + int pad, + int stride, + int dilation) +{ +int4 coord_in = (int4)(get_global_id(0) * stride - pad + 16,\ +get_global_id(0) * stride - pad + 48, get_global_id(1), 0); +vxc_uchar32 src0, src1, src2; +vxc_uchar16 s0, s1, s2; +vxc_uchar16 w[4]; +int4 sum, sumB; +sum = read_imagei(bias, coord_in.zw); +VXC_ReadImage(src0.hi, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(src0.lo, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(src1.lo, input, coord_in.yz, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +src1.hi = src0.lo; +VXC_ReadImage(src2.lo, input, coord_in.yz, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +src2.hi = src1.lo; +int4 coord = (int4)(get_global_id(0), 16, 48, get_global_id(1)); +VXC_ReadImage(w[0], weight, coord.yw, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(w[1], weight, coord.yw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(w[2], weight, coord.zw, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(w[3], weight, coord.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +sum = sum.xxxx; +sumB = sum.xxxx; +int4 sum0, sum1; +vxc_uchar4 zp; +_viv_asm(COPY, zp, weightZP, 4); +vxc_short8 coef; +VXC_DP2x8(coef, w[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, 
VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +VXC_DP2x8(coef, w[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b); +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b); +sum += sum1; +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b); +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b); +sumB += sum1; +VXC_DP2x8(coef, w[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +VXC_DP2x8(coef, w[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b); +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b); +sum += sum1; +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b); +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b); +sumB += sum1; + +VXC_DP2x8(coef, w[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +VXC_DP2x8(coef, w[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b); +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b); +sum += sum1; +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b); +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b); +sumB += sum1; +VXC_DP2x8(coef, w[3], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src2.lo, src2.hi, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src2.lo, src2.hi, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src2.lo, 
src2.hi, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src2.lo, src2.hi, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; + +float4 result0 = convert_float4(sum) * scale + outputZP; +float4 result1 = convert_float4(sumB) * scale + outputZP; +int4 dst0 = convert_int4(result0); +int4 dst1 = convert_int4(result1); +vxc_uchar16 dst; +VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtractInteger_2x8); +VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/depthwise_conv1d_src2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/depthwise_conv1d_src2.vx new file mode 100644 index 0000000..bf34300 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/depthwise_conv1d_src2.vx @@ -0,0 +1,253 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe0_8x2b; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe1_8x2b; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe2_8x2b; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe3_8x2b; +_viv_uniform VXC_512Bits uniU8SubZp_lo_2x8; +_viv_uniform VXC_512Bits uniU8SubZp_hi_2x8; +_viv_uniform VXC_512Bits uniExtractInteger_2x8; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe4_8x2b; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe5_8x2b; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe6_8x2b; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe7_8x2b; +_viv_uniform int weightZP; +_viv_uniform float outputZP; +_viv_uniform float scale; + +__kernel void vxDW_Conv1D_U8toU8_K64_D1( +__read_only image2d_array_t input, +__read_only image2d_array_t weight, +__read_only image2d_t bias, +__write_only image2d_array_t output, +int pad, +int stride, +int dilation) +{ +int4 coord_in = (int4)(get_global_id(0) * stride - pad + 16,\ +get_global_id(0) * stride - pad + 48, get_global_id(1), 0); +vxc_uchar32 src0, src1, src2, src3; +vxc_uchar16 s0, s1, s2; +vxc_uchar16 w[4]; +int4 sum, sumB; +sum = read_imagei(bias, coord_in.zw); +VXC_ReadImage(src0.hi, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(src0.lo, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(src1.lo, input, coord_in.yz, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +src1.hi = src0.lo; +VXC_ReadImage(src2.lo, input, coord_in.yz, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +src2.hi = src1.lo; +coord_in.y += 16; +VXC_ReadImage(src3.lo, input, coord_in.yz, VXC_5BITOFFSET_XY(0, 0), \ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +src3.hi = src2.lo; +int4 coord = (int4)(get_global_id(0), 16, 48, get_global_id(1)); +VXC_ReadImage(w[0], weight, coord.yw, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(w[1], weight, coord.yw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(w[2], weight, coord.zw, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(w[3], weight, coord.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +sum = sum.xxxx; +sumB = sum.xxxx; +int4 sum0, sum1; +vxc_uchar4 zp; +_viv_asm(COPY, zp, weightZP, 4); +vxc_short8 coef; +VXC_DP2x8(coef, w[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), 
uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +VXC_DP2x8(coef, w[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b); +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b); +sum += sum1; +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b); +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b); +sumB += sum1; +VXC_DP2x8(coef, w[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +VXC_DP2x8(coef, w[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b); +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b); +sum += sum1; +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b); +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b); +sumB += sum1; + +VXC_DP2x8(coef, w[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +VXC_DP2x8(coef, w[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b); +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b); +sum += sum1; +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b); +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b); +sumB += sum1; + +VXC_DP2x8(coef, w[3], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src3.hi, src3.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src3.hi, src3.lo, coef, VXC_MODIFIER(2, 3, 
0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src3.hi, src3.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src3.hi, src3.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +VXC_DP2x8(coef, w[3], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum1, src3.hi, src3.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b); +VXC_DP8x2_b(sum1, src3.hi, src3.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b); +sum += sum1; +VXC_DP8x2_b(sum1, src3.hi, src3.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b); +VXC_DP8x2_b(sum1, src3.hi, src3.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b); +sumB += sum1; +float4 result0 = convert_float4(sum) * scale + outputZP; +float4 result1 = convert_float4(sumB) * scale + outputZP; +int4 dst0 = convert_int4(result0); +int4 dst1 = convert_int4(result1); +vxc_uchar16 dst; +VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtractInteger_2x8); +VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void vxDW_Conv1D_U8toU8_K80_D1( +__read_only image2d_array_t input, +__read_only image2d_array_t weight, +__read_only image2d_t bias, +__write_only image2d_array_t output, +int pad, +int stride, +int dilation) +{ +int4 coord_in = (int4)(get_global_id(0) * stride - pad + 16,\ +get_global_id(0) * stride - pad + 80, get_global_id(1), 0); +vxc_uchar32 src[5]; +vxc_uchar16 s0, s1, s2; +vxc_uchar16 w[5]; +int4 sum, sumB; +sum = read_imagei(bias, coord_in.zw); +VXC_ReadImage(src[0].hi, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(src[0].lo, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +coord_in.x += 32; +VXC_ReadImage(src[1].lo, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +src[1].hi = src[0].lo; +VXC_ReadImage(src[2].lo, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +src[2].hi = src[1].lo; +VXC_ReadImage(src[3].lo, input, coord_in.yz, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +src[3].hi = src[2].lo; +VXC_ReadImage(src[4].lo, input, coord_in.yz, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +src[4].hi = src[3].lo; +int4 coord = (int4)(get_global_id(0), 16, 64, get_global_id(1)); +VXC_ReadImage(w[0], weight, coord.yw, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(w[1], weight, coord.yw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +coord.y += 16; +VXC_ReadImage(w[2], weight, coord.yw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(w[3], weight, coord.zw, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(w[4], weight, coord.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +sum = sum.xxxx; +sumB = sum.xxxx; +int4 sum0, sum1; +vxc_uchar4 zp; +_viv_asm(COPY, zp, weightZP, 4); +vxc_short8 coef; +VXC_DP2x8(coef, w[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src[0].hi, src[0].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), 
uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src[0].hi, src[0].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src[0].hi, src[0].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src[0].hi, src[0].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +VXC_DP2x8(coef, w[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum1, src[0].hi, src[0].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b); +VXC_DP8x2_b(sum1, src[0].hi, src[0].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b); +sum += sum1; +VXC_DP8x2_b(sum1, src[0].hi, src[0].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b); +VXC_DP8x2_b(sum1, src[0].hi, src[0].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b); +sumB += sum1; +VXC_DP2x8(coef, w[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src[1].hi, src[1].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src[1].hi, src[1].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src[1].hi, src[1].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src[1].hi, src[1].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +VXC_DP2x8(coef, w[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum1, src[1].hi, src[1].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b); +VXC_DP8x2_b(sum1, src[1].hi, src[1].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b); +sum += sum1; +VXC_DP8x2_b(sum1, src[1].hi, src[1].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b); +VXC_DP8x2_b(sum1, src[1].hi, src[1].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b); +sumB += sum1; + +VXC_DP2x8(coef, w[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src[2].hi, src[2].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src[2].hi, src[2].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src[2].hi, src[2].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src[2].hi, src[2].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +VXC_DP2x8(coef, w[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum1, src[2].hi, src[2].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b); +VXC_DP8x2_b(sum1, src[2].hi, src[2].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b); +sum += sum1; +VXC_DP8x2_b(sum1, src[2].hi, src[2].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b); +VXC_DP8x2_b(sum1, src[2].hi, src[2].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b); +sumB += sum1; + +VXC_DP2x8(coef, w[3], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src[3].hi, src[3].lo, coef, VXC_MODIFIER(0, 1, 0, 
VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src[3].hi, src[3].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src[3].hi, src[3].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src[3].hi, src[3].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +VXC_DP2x8(coef, w[3], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum1, src[3].hi, src[3].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b); +VXC_DP8x2_b(sum1, src[3].hi, src[3].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b); +sum += sum1; +VXC_DP8x2_b(sum1, src[3].hi, src[3].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b); +VXC_DP8x2_b(sum1, src[3].hi, src[3].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b); +sumB += sum1; + +VXC_DP2x8(coef, w[4], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src[4].hi, src[4].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src[4].hi, src[4].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sum += sum0; +VXC_DP8x2_b(sum0, src[4].hi, src[4].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src[4].hi, src[4].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +VXC_DP2x8(coef, w[4], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum1, src[4].hi, src[4].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b); +VXC_DP8x2_b(sum1, src[4].hi, src[4].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b); +sum += sum1; +VXC_DP8x2_b(sum1, src[4].hi, src[4].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b); +VXC_DP8x2_b(sum1, src[4].hi, src[4].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b); +sumB += sum1; + +float4 result0 = convert_float4(sum) * scale + outputZP; +float4 result1 = convert_float4(sumB) * scale + outputZP; +int4 dst0 = convert_int4(result0); +int4 dst1 = convert_int4(result1); +vxc_uchar16 dst; +VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtractInteger_2x8); +VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/depthwise_conv1d_src3.vx b/src/tim/vx/internal/src/libnnext/ops/vx/depthwise_conv1d_src3.vx new file mode 100644 index 0000000..94f7ad3 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/depthwise_conv1d_src3.vx @@ -0,0 +1,205 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe0_8x2b; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe1_8x2b; +_viv_uniform VXC_512Bits uniU8SubZp_lo_2x8; +_viv_uniform VXC_512Bits uniU8SubZp_hi_2x8; +_viv_uniform VXC_512Bits uniExtractInteger_2x8; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe4_8x2b; +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe5_8x2b; +_viv_uniform int weightZP; +_viv_uniform float outputZP; +_viv_uniform float scale; + +__kernel void vxDW_Conv1D_U8toU8_K88_D2( +__read_only image2d_array_t input, +__read_only image2d_array_t weight, +__read_only image2d_t bias, +__write_only image2d_array_t output, +int pad, 
+int stride, +int dilation) +{ +int4 coord_in = (int4)(get_global_id(0) * stride - pad + 16,\ +get_global_id(0) * stride - pad + 48, get_global_id(1), 0); + +vxc_uchar32 src0, src1; +vxc_uchar16 inData[12]; +vxc_uchar16 wData[6]; +int4 sumA, sumB; + +sumA = read_imagei(bias, coord_in.zw); + +VXC_ReadImage(inData[0], input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(inData[1], input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(inData[2], input, coord_in.yz, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(inData[3], input, coord_in.yz, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +coord_in.xy += 64; +VXC_ReadImage(inData[4], input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(inData[5], input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(inData[6], input, coord_in.yz, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(inData[7], input, coord_in.yz, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +coord_in.xy += 64; +VXC_ReadImage(inData[8], input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(inData[9], input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(inData[10], input, coord_in.yz, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(inData[11], input, coord_in.yz, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + +int4 coord = (int4)(get_global_id(0), 16, 48, get_global_id(1)); + +VXC_ReadImage(wData[0], weight, coord.yw, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(wData[1], weight, coord.yw, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(wData[2], weight, coord.zw, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(wData[3], weight, coord.zw, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +coord.yz += 64; +VXC_ReadImage(wData[4], weight, coord.yw, VXC_5BITOFFSET_XY(-16, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +VXC_ReadImage(wData[5], weight, coord.yw, VXC_5BITOFFSET_XY(0, 0),\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + +sumA = sumA.xxxx; +sumB = sumA; + +int4 sum0, sum1; +vxc_uchar4 zp; +_viv_asm(COPY, zp, weightZP, 4); + +vxc_short8 coef; +src0.hi = inData[0]; +src0.lo = inData[1]; + +VXC_DP2x8(coef, wData[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sumA += sum0; + +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; + +src0.hi = src0.lo; +src0.lo = inData[2]; + +VXC_DP2x8(coef, wData[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, 
VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sumA += sum0; + +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +src0.hi = inData[2]; +src0.lo = inData[3]; +VXC_DP2x8(coef, wData[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sumA += sum0; +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; + +src0.hi = src0.lo; +src0.lo = inData[4]; + +VXC_DP2x8(coef, wData[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sumA += sum0; + +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +src0.hi = inData[4]; +src0.lo = inData[5]; + +VXC_DP2x8(coef, wData[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sumA += sum0; + +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; + +src0.hi = src0.lo; +src0.lo = inData[6]; + +VXC_DP2x8(coef, wData[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sumA += sum0; + +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +src0.hi = inData[6]; +src0.lo = inData[7]; +VXC_DP2x8(coef, wData[3], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sumA += sum0; +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); 
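+// Descriptive note (added): fold this chunk's dot-product results into the second
+// accumulator (sumB), then slide the 32-byte input window forward by 16 bytes before
+// applying the next 8 weights (the high half of wData[3]).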
+sumB += sum0; +src0.hi = src0.lo; +src0.lo = inData[8]; +VXC_DP2x8(coef, wData[3], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sumA += sum0; +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; + +src0.hi = inData[8]; +src0.lo = inData[9]; +VXC_DP2x8(coef, wData[4], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sumA += sum0; +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +src0.hi = src0.lo; +src0.lo = inData[10]; +VXC_DP2x8(coef, wData[4], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sumA += sum0; +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +src0.hi = inData[10]; +src0.lo = inData[11]; +VXC_DP2x8(coef, wData[5], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b); +sumA += sum0; +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b); +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b); +sumB += sum0; +float4 result0 = convert_float4(sumA) * scale + outputZP; +float4 result1 = convert_float4(sumB) * scale + outputZP; +int4 dst0 = convert_int4(result0); +int4 dst1 = convert_int4(result1); +vxc_uchar16 dst; +VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtractInteger_2x8); +VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/detect_post_box.vx b/src/tim/vx/internal/src/libnnext/ops/vx/detect_post_box.vx new file mode 100644 index 0000000..1ba51a0 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/detect_post_box.vx @@ -0,0 +1,76 @@ +#include "cl_viv_vx_ext.h" +_viv_uniform VXC_512Bits uniDataMerge_4x4; +_viv_uniform VXC_512Bits uniU8SubZptoF32Conv0_4x4; +_viv_uniform VXC_512Bits uniU8SubZptoF32Conv1_4x4; +_viv_uniform float logE; +_viv_uniform int input0_ZP; +_viv_uniform int input1_ZP; + +float exp_(float x) +{ + x *= logE; + x = exp2(x); + return x; +} + +__kernel void 
detect_post_box_F32_F32toF32( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float inv_scale_y, + float inv_scale_x, + float inv_scale_h, + float inv_scale_w) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + float4 src0; + float4 src1; + float4 dst; + float4 tmp0, tmp1, tmp2, tmp3; + uint4 tmp5, tmp6, tmp7; + src0 = read_imagef(input0, coord); + src1 = read_imagef(input1, coord.xy); + tmp0.x = src1.x + src1.z * src0.x * inv_scale_y; + tmp0.y = src1.y + src1.w * src0.y * inv_scale_x; + tmp1.x = src1.z * exp_(src0.z * inv_scale_h) * 0.5f; + tmp1.y = src1.w * exp_(src0.w * inv_scale_w) * 0.5f; + tmp2 = tmp0 - tmp1; + tmp3 = tmp0 + tmp1; + _viv_asm(COPY, tmp5, tmp2, 16); + _viv_asm(COPY, tmp6, tmp3, 16); + VXC_DP4x4(tmp7, tmp5, tmp6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDataMerge_4x4); + _viv_asm(COPY, dst, tmp7, 16); + write_imagef(output, coord, dst); +} + +__kernel void detect_post_box_U8_U8toF32( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float inv_scale_y, + float inv_scale_x, + float inv_scale_h, + float inv_scale_w) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + float4 src0; + float4 src1; + float4 dst; + float4 tmp0, tmp1, tmp2, tmp3; + vxc_uchar8 in0 = 0, in1 = 0; + vxc_short8 zp0 = (short)input0_ZP; + vxc_short8 zp1 = (short)input1_ZP; + VXC_ReadImage2DArray(in0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(src0, in0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniU8SubZptoF32Conv0_4x4); + VXC_DP4x4(src1, in1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniU8SubZptoF32Conv1_4x4); + tmp0.x = src1.x + src1.z * src0.x * inv_scale_y; + tmp0.y = src1.y + src1.w * src0.y * inv_scale_x; + tmp1.x = src1.z * exp_(src0.z * inv_scale_h) * 0.5f; + tmp1.y = src1.w * exp_(src0.w * inv_scale_w) * 0.5f; + dst.xy = tmp0.xy - tmp1.xy; + dst.zw = tmp0.xy + tmp1.xy; + write_imagef(output, coord, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx new file mode 100644 index 0000000..bc3b6c4 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx @@ -0,0 +1,233 @@ +#include "cl_viv_vx_ext.h" + +float4 eltwise_unary_sin(float4 x) +{ + return native_sin(x); +} + +#define logE (1.44269502f) +#define twoLogE (logE * 2.0f) +float4 eltwise_unary_exp(float4 x) +{ + x *= logE; + x = exp2(x); + return x; +} + +#define rlogE (0.693147182f) +float4 eltwise_unary_log(float4 x) +{ + x = log2(x); + return x * rlogE; +} + +float4 eltwise_unary_elu(float4 val) +{ + float4 x = val * logE; + x = exp2(x) - 1; + + return val < 0 ? 
x : val; +} + +float4 eltwise_unary_neg(float4 x) +{ + return x * -1; +} + +float4 eltwise_unary_hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} + +float4 _softrelu(float4 x) +{ + x *= logE; + x = exp2(x); + x += 1; + x = log2(x); + return x * rlogE; +} + +float4 _tanh(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return (2 * x - 1); +} + +float4 eltwise_unary_mish(float4 x) +{ + float4 y = _softrelu(x); + x = x * _tanh(y); + return x; +} + +_viv_uniform float inputScale; +_viv_uniform float inputTail; +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4; +_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; + +#define ELTSISE_UNARY_2D(func_name, src_type_name, dst_type_name, src_type, \ + src_copy_type, convert_type, dst_type, dst_copy_type) \ + __kernel void func_name##_##src_type_name##to##dst_type_name##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int type \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + src_type src0; \ + src_copy_type src1; \ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, src0, 16); \ + \ + float4 vecA; \ + float4 vecB; \ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \ + VXC_DP4x4(vecB, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \ + vecA = vecA * inputScale + inputTail; \ + vecB = vecB * inputScale + inputTail; \ + vecA = eltwise_unary_##func_name(vecA); \ + vecB = eltwise_unary_##func_name(vecB); \ + vecA = vecA * outputScale + outputZP; \ + vecB = vecB * outputScale + outputZP; \ + \ + convert_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, vecA); \ + _viv_asm(CONV_RTE, dst1, vecB); \ + dst_type dst2; \ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + dst_copy_type dst; \ + _viv_asm(COPY, dst, dst2, 16); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +//EXP +ELTSISE_UNARY_2D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(exp, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//SIN +ELTSISE_UNARY_2D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(sin, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(sin, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) 
+ELTSISE_UNARY_2D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//LOG +ELTSISE_UNARY_2D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(log, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//ELU +ELTSISE_UNARY_2D(elu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(elu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(elu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(elu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(elu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(elu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(elu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(elu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(elu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(elu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//NEG +ELTSISE_UNARY_2D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//MISH +ELTSISE_UNARY_2D(mish, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(mish, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(mish, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(mish, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(mish, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(mish, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(mish, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(mish, U8, F16, 
vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(mish, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(mish, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//HARD_SIGMOID +ELTSISE_UNARY_2D(hard_sigmoid, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(hard_sigmoid, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(hard_sigmoid, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(hard_sigmoid, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(hard_sigmoid, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(hard_sigmoid, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) + + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +#define ELTSISE_UNARY_BF16_2D(func_name) \ + __kernel void func_name##_BF16toBF16_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int type \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + vxc_ushort8 src0, src1, dst; \ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 vecA; \ + float4 vecB; \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecA, src1, 16); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, vecB, src1, 16); \ + vecA = eltwise_unary_##func_name(vecA); \ + vecB = eltwise_unary_##func_name(vecB); \ + \ + _viv_asm(COPY, src0, vecA, 16); \ + _viv_asm(COPY, src1, vecB, 16); \ + \ + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +//EXP +ELTSISE_UNARY_BF16_2D(exp) +//SIN +ELTSISE_UNARY_BF16_2D(sin) +//LOG +ELTSISE_UNARY_BF16_2D(log) +//ELU +ELTSISE_UNARY_BF16_2D(elu) +//NEG +ELTSISE_UNARY_BF16_2D(neg) +//MISH +ELTSISE_UNARY_BF16_2D(mish) +//HARD_SIGMOID +ELTSISE_UNARY_BF16_2D(hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx new file mode 100644 index 0000000..832c948 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx @@ -0,0 +1,231 @@ +#include "cl_viv_vx_ext.h" + +float4 eltwise_unary_sin(float4 x) +{ + return native_sin(x); +} + +#define logE (1.44269502f) +#define twoLogE (logE * 2.0f) +float4 eltwise_unary_exp(float4 x) +{ + x *= logE; + x = exp2(x); + return x; +} + +#define rlogE (0.693147182f) +float4 eltwise_unary_log(float4 x) +{ + x = log2(x); + return x * rlogE; +} + +float4 eltwise_unary_elu(float4 val) +{ + float4 x = val * logE; + x = exp2(x) - 1; + + return val < 0 ? 
x : val; +} + +float4 eltwise_unary_neg(float4 x) +{ + return x * -1; +} + +float4 eltwise_unary_hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} + +float4 _softrelu(float4 x) +{ + x *= logE; + x = exp2(x); + x += 1; + x = log2(x); + return x * rlogE; +} + +float4 _tanh(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return (2 * x - 1); +} + +float4 eltwise_unary_mish(float4 x) +{ + float4 y = _softrelu(x); + x = x * _tanh(y); + return x; +} + +_viv_uniform float inputScale; +_viv_uniform float inputTail; +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4; +_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; + +#define ELTSISE_UNARY_3D(func_name, src_type_name, dst_type_name, src_type, \ + src_copy_type, convert_type, dst_type, dst_copy_type) \ +__kernel void func_name##_##src_type_name##to##dst_type_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int type \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + src_type src0; \ + src_copy_type src1; \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, src0, 16); \ + \ + float4 vecA; \ + float4 vecB; \ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \ + VXC_DP4x4(vecB, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \ + vecA = vecA * inputScale + inputTail; \ + vecB = vecB * inputScale + inputTail; \ + vecA = eltwise_unary_##func_name(vecA); \ + vecB = eltwise_unary_##func_name(vecB); \ + vecA = vecA * outputScale + outputZP; \ + vecB = vecB * outputScale + outputZP; \ + \ + convert_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, vecA); \ + _viv_asm(CONV_RTE, dst1, vecB); \ + dst_type dst2; \ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + dst_copy_type dst; \ + _viv_asm(COPY, dst, dst2, 16); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +//EXP +ELTSISE_UNARY_3D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(exp, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//SIN +ELTSISE_UNARY_3D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(sin, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(sin, I8, F16, vxc_char8, vxc_char8, half4, 
vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//LOG +ELTSISE_UNARY_3D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(log, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//ELU +ELTSISE_UNARY_3D(elu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(elu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(elu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(elu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(elu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(elu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(elu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(elu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(elu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(elu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//NEG +ELTSISE_UNARY_3D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//MISH +ELTSISE_UNARY_3D(mish, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(mish, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(mish, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(mish, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(mish, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(mish, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(mish, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) 
+ELTSISE_UNARY_3D(mish, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(mish, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(mish, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//HARD_SIGMOID +ELTSISE_UNARY_3D(hard_sigmoid, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(hard_sigmoid, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(hard_sigmoid, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(hard_sigmoid, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(hard_sigmoid, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(hard_sigmoid, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; +#define ELTSISE_UNARY_BF16(func_name) \ + __kernel void func_name##_BF16toBF16( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int type \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + vxc_ushort8 src0, src1, dst; \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 vecA; \ + float4 vecB; \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecA, src1, 16); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, vecB, src1, 16); \ + vecA = eltwise_unary_##func_name(vecA); \ + vecB = eltwise_unary_##func_name(vecB); \ + \ + _viv_asm(COPY, src0, vecA, 16); \ + _viv_asm(COPY, src1, vecB, 16); \ + \ + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +//EXP +ELTSISE_UNARY_BF16(exp) +//SIN +ELTSISE_UNARY_BF16(sin) +//LOG +ELTSISE_UNARY_BF16(log) +//ELU +ELTSISE_UNARY_BF16(elu) +//NEG +ELTSISE_UNARY_BF16(neg) +//MISH +ELTSISE_UNARY_BF16(mish) +//HARD_SIGMOID +ELTSISE_UNARY_BF16(hard_sigmoid) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/floordiv.vx b/src/tim/vx/internal/src/libnnext/ops/vx/floordiv.vx new file mode 100644 index 0000000..21a6b90 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/floordiv.vx @@ -0,0 +1,174 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvertFstToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertSecToFp32_4x4; + +_viv_uniform float in_scale0; +_viv_uniform float in_scale1; +_viv_uniform float out_scale; +_viv_uniform float in0Tail; +_viv_uniform float in1Tail; +_viv_uniform float out_zp; + +#define FLOORDIV_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, 
OUT_OFFSET, read_fun, write_fun) \ + save_type data; \ + read_type read_data0, read_data1; \ + copy_type tmpData0, tmpData1; \ + vxc_float4 in0Val1, in0Val2, in1Val1, in1Val2; \ + vxc_float4 tmpVal1, tmpVal2; \ + dst_type tmpOut1, tmpOut2; \ + read_fun(read_data0, input0, coord, VXC_5BITOFFSET_XY(0,0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, tmpData0, read_data0, 16); \ + read_fun(read_data1, input1, coord, VXC_5BITOFFSET_XY(0,0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, tmpData1, read_data1, 16); \ + VXC_DP4x4(in0Val1, tmpData0, tmpData0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \ + VXC_DP4x4(in0Val2, tmpData0, tmpData0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \ + VXC_DP4x4(in1Val1, tmpData1, tmpData1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \ + VXC_DP4x4(in1Val2, tmpData1, tmpData1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \ + in0Val1 = in0Val1 * IN0_SCALE + IN0_TAIL; \ + in0Val2 = in0Val2 * IN0_SCALE + IN0_TAIL; \ + in1Val1 = in1Val1 * IN1_SCALE + IN1_TAIL; \ + in1Val2 = in1Val2 * IN1_SCALE + IN1_TAIL; \ + tmpVal1 = floor(in0Val1 / in1Val1) * OUT_SCALE + OUT_OFFSET; \ + tmpVal2 = floor(in0Val2 / in1Val2) * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, tmpOut1, tmpVal1); \ + _viv_asm(conv_mode, tmpOut2, tmpVal2); \ + VXC_DP2x8(data, tmpOut1, tmpOut2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + +#define TENSOR_FLOORDIV(src0_name, src1_name, dst_name, dst_type, save_type, read_type, copy_type, \ + conv_mode, IN0_SCALE, IN0_TAIL, IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET) \ +__kernel void floordiv_##src0_name##src1_name##to##dst_name \ + ( \ + image2d_array_t input0, \ + image2d_array_t input1, \ + image2d_array_t output \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + FLOORDIV_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \ +} + + +TENSOR_FLOORDIV(F16, F16, F16, half4, vxc_short8, vxc_short8,\ + vxc_half8, CONV, 1, 0, 1, 0, 1, 0) +TENSOR_FLOORDIV(F16, F16, I16, short4, vxc_short8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0) +TENSOR_FLOORDIV(F16, F16, I8, char4, vxc_char8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0) +TENSOR_FLOORDIV(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp) + +TENSOR_FLOORDIV(I16, I16, I16, short4, vxc_short8, vxc_short8,\ + vxc_short8, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0) +TENSOR_FLOORDIV(I16, I16, F16, half4, vxc_short8, vxc_short8,\ + vxc_short8, CONV, in_scale0, 0, in_scale1, 0, 1, 0) + +TENSOR_FLOORDIV(I8, I8, I8, char4, vxc_char8, vxc_char16,\ + vxc_char16, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0) +TENSOR_FLOORDIV(I8, I8, F16, half4, vxc_short8, vxc_char16,\ + vxc_char16, CONV, in_scale0, 0, in_scale1, 0, 1, 0) + +TENSOR_FLOORDIV(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\ + vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp) +TENSOR_FLOORDIV(U8, U8, F16, half4, vxc_short8, vxc_uchar16,\ + vxc_uchar16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0) + + + +#define TENSOR_FLOORDIV_2D(src0_name, src1_name, dst_name, dst_type, 
save_type, read_type, copy_type, \ + conv_mode, IN0_SCALE, IN0_TAIL, IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET) \ +__kernel void floordiv_##src0_name##src1_name##to##dst_name##_2D \ + ( \ + image2d_array_t input0, \ + image2d_array_t input1, \ + image2d_array_t output \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + FLOORDIV_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, VXC_ReadImage, VXC_WriteImage); \ +} + + +TENSOR_FLOORDIV_2D(F16, F16, F16, half4, vxc_short8, vxc_short8,\ + vxc_half8, CONV, 1, 0, 1, 0, 1, 0) +TENSOR_FLOORDIV_2D(F16, F16, I16, short4, vxc_short8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0) +TENSOR_FLOORDIV_2D(F16, F16, I8, char4, vxc_char8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0) +TENSOR_FLOORDIV_2D(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp) + +TENSOR_FLOORDIV_2D(I16, I16, I16, short4, vxc_short8, vxc_short8,\ + vxc_short8, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0) +TENSOR_FLOORDIV_2D(I16, I16, F16, half4, vxc_short8, vxc_short8,\ + vxc_short8, CONV, in_scale0, 0, in_scale1, 0, 1, 0) + +TENSOR_FLOORDIV_2D(I8, I8, I8, char4, vxc_char8, vxc_char16,\ + vxc_char16, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0) +TENSOR_FLOORDIV_2D(I8, I8, F16, half4, vxc_short8, vxc_char16,\ + vxc_char16, CONV, in_scale0, 0, in_scale1, 0, 1, 0) + +TENSOR_FLOORDIV_2D(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\ + vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp) +TENSOR_FLOORDIV_2D(U8, U8, F16, half4, vxc_short8, vxc_uchar16,\ + vxc_uchar16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0) + + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +#define FLOORDIV_BF16_PROCESS(read_fun, write_fun) \ + vxc_short8 read_data0, read_data1, vec0; \ + vxc_float4 in0Val1, in0Val2, in1Val1, in1Val2; \ + vxc_float4 tmpVal1, tmpVal2; \ + vxc_ushort8 dst0, dst1; \ + vxc_ushort8 vect; \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + read_fun(read_data0, input0, coord, VXC_5BITOFFSET_XY(0,0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(vec0, read_data0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, in0Val1, vec0, 16); \ + VXC_DP2x8(vec0, read_data0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, in0Val2, vec0, 16); \ + read_fun(read_data1, input1, coord, VXC_5BITOFFSET_XY(0,0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(vec0, read_data1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, in1Val1, vec0, 16); \ + VXC_DP2x8(vec0, read_data1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, in1Val2, vec0, 16); \ + tmpVal1 = floor(in0Val1 / in1Val1); \ + tmpVal2 = floor(in0Val2 / in1Val2); \ + _viv_asm(COPY, dst0, tmpVal1, 16); \ + _viv_asm(COPY, dst1, tmpVal2, 16); \ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + write_fun(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void floordiv_BF16BF16toBF16 + ( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output + ) +{ + 
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + FLOORDIV_BF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray); +} + +__kernel void floordiv_BF16BF16toBF16_2D + ( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + FLOORDIV_BF16_PROCESS(VXC_ReadImage, VXC_WriteImage); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx new file mode 100644 index 0000000..e36dfdb --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx @@ -0,0 +1,103 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int indices_num; + +__kernel void gather_I8toI8( + __read_only image2d_t input0, + __read_only image2d_array_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + int4 indice = read_imagei(input1, coord_in.xyyy); + coord_in.w = gidz * axis_num + indice.x; + + vxc_char16 src; + VXC_ReadImage(src, input0, coord_in.zw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_U8toU8( + __read_only image2d_t input0, + __read_only image2d_array_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + int4 indice = read_imagei(input1, coord_in.xyyy); + coord_in.w = gidz * axis_num + indice.x; + + vxc_uchar16 src; + VXC_ReadImage(src, input0, coord_in.zw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_I16toI16( + __read_only image2d_t input0, + __read_only image2d_array_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + + + int4 indice = read_imagei(input1, coord_in.xyyy); + coord_in.w = gidz * axis_num + indice.x; + + vxc_short8 src; + VXC_ReadImage(src, input0, coord_in.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_F16toF16( + __read_only image2d_t input0, + __read_only image2d_array_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + + + int4 indice = read_imagei(input1, coord_in.xyyy); + coord_in.w = gidz * axis_num + indice.x; + + vxc_short8 src; + VXC_ReadImage(src, input0, coord_in.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + int2 coord = (int2)(gidx, gidz * indices_num + 
gidy); + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx new file mode 100644 index 0000000..e3950b1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx @@ -0,0 +1,111 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int indices_num; + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; + +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8; +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp + +#define GATHER_8BITS_TO_F16(src0_type_name, read_type) \ +__kernel void gather_##src0_type_name##toF16( \ + __read_only image2d_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + \ + int4 coord_in = (int4)(gidy, 0, gidx, 0); \ + int4 indice = read_imagei(input1, coord_in.xyyy); \ + coord_in.w = gidz * axis_num + indice.x; \ + \ + read_type src; \ + VXC_ReadImage(src, input0, coord_in.zw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + int2 coord = (int2)(gidx, gidz * indices_num + gidy); \ + vxc_half8 src0, src1; \ + vxc_short8 dst0, dst1; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_DP2x8(src1,src,ms0, VXC_MODIFIER(0,7,8, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + _viv_asm(COPY, dst1, src1, 16); \ + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + VXC_WriteImage(output, coord, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +GATHER_8BITS_TO_F16(U8, vxc_uchar16) +GATHER_8BITS_TO_F16(I8, vxc_char16) + +#define GATHER_F16_TO_QINT(src1_type_name, write_type) \ +__kernel void gather_F16to##src1_type_name( \ + __read_only image2d_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord_in = (int4)(gidy, 0, gidx, 0); \ + \ + int4 indice = read_imagei(input1, coord_in.xyyy); \ + coord_in.w = gidz * axis_num + indice.x; \ + \ + vxc_short8 src; \ + VXC_ReadImage(src, input0, coord_in.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + int2 coord = (int2)(gidx, gidz * indices_num + gidy); \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +GATHER_F16_TO_QINT(U8, vxc_uchar16) +GATHER_F16_TO_QINT(I8, vxc_char16) +GATHER_F16_TO_QINT(I16, vxc_short8) + +__kernel void gather_I16toF16( + __read_only image2d_t input0, + __read_only image2d_array_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + int4 indice = read_imagei(input1, 
coord_in.xyyy); + coord_in.w = gidz * axis_num + indice.x; + + vxc_short8 src; + VXC_ReadImage(src, input0, coord_in.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + + vxc_half8 src0; + vxc_short8 dst0; + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniU8MulAndPostShift_0_Lo_2x8); + _viv_asm(COPY, dst0, src0, 16); + + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx new file mode 100644 index 0000000..a526d21 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx @@ -0,0 +1,82 @@ +#include "cl_viv_vx_ext.h" + +__kernel void gather_nd_I8toI8_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + coord.w = indice.x; + + vxc_char16 src; + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_nd_U8toU8_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + coord.w = indice.x; + + vxc_uchar16 src; + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_nd_I16toI16_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + coord.w = indice.x; + + vxc_short8 src; + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_nd_F16toF16_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + coord.w = indice.x; + + vxc_short8 src; + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx new file mode 100644 index 0000000..6b3d90a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx @@ -0,0 +1,82 @@ +#include "cl_viv_vx_ext.h" + +__kernel void gather_nd_I8toI8_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + 
__write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + indice.x = indice.x * block_size + gidx; + + vxc_char16 src; + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_nd_U8toU8_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + indice.x = indice.x * block_size + gidx; + + vxc_uchar16 src; + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_nd_I16toI16_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + indice.x = indice.x * block_size + gidx; + + vxc_short8 src; + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_nd_F16toF16_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + indice.x = indice.x * block_size + gidx; + + vxc_short8 src; + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx new file mode 100644 index 0000000..6b0be59 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx @@ -0,0 +1,76 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertInt8toFp16_2x8; + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; + +_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8; +_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8; + +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8; +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp + +_viv_uniform VXC_512Bits uinConvertFp16ToInt16_2x8; + +#define GATHER_ND_QINT_TO_F16_2D(src0_type_name, read_type) \ +__kernel void gather_nd_##src0_type_name##toF16_2D( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + int4 indice = read_imagei(input1, coord.xy); \ + indice.x = indice.x * block_size + gidx; \ + \ + 
read_type src; \ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_half8 src0; \ + vxc_short8 dst0; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + VXC_WriteImage(output, coord.zy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +GATHER_ND_QINT_TO_F16_2D(U8, vxc_uchar16) +GATHER_ND_QINT_TO_F16_2D(I8, vxc_char16) +GATHER_ND_QINT_TO_F16_2D(I16, vxc_short8) + +#define GATHER_ND_F16_TO_QINT_2D(src1_type_name, write_type) \ +__kernel void gather_nd_F16to##src1_type_name##_2D( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + int4 indice = read_imagei(input1, coord.xy); \ + indice.x = indice.x * block_size + gidx; \ + \ + vxc_short8 src; \ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1),uniConvertFp16toU8_2x8); \ + VXC_WriteImage(output, coord.zy, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +GATHER_ND_F16_TO_QINT_2D(U8, vxc_uchar16) +GATHER_ND_F16_TO_QINT_2D(I8, vxc_char16) +GATHER_ND_F16_TO_QINT_2D(I16, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx new file mode 100644 index 0000000..2aa9d4c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx @@ -0,0 +1,86 @@ +#include "cl_viv_vx_ext.h" + +__kernel void gather_nd_I8toI8_3D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + indice.x = indice.x * block_size + gidx; + indice.w = 0; + + vxc_char16 src; + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_nd_U8toU8_3D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + indice.x = indice.x * block_size + gidx; + indice.w = 0; + + vxc_uchar16 src; + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_nd_I16toI16_3D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + indice.x = 
indice.x * block_size + gidx; + indice.w = 0; + + vxc_short8 src; + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_nd_F16toF16_3D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + int4 indice = read_imagei(input1, coord.xy); + indice.x = indice.x * block_size + gidx; + indice.w = 0; + + vxc_short8 src; + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx new file mode 100644 index 0000000..3d92bef --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx @@ -0,0 +1,76 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertInt8toFp16_2x8; + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; + +_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8; + +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8; +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp + +#define GATHER_ND_QINT_TO_F16_3D(src0_type_name, read_type) \ +__kernel void gather_nd_##src0_type_name##toF16_3D( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + int4 indice = read_imagei(input1, coord.xy); \ + indice.x = indice.x * block_size + gidx; \ + indice.w = 0; \ + \ + read_type src; \ + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_half8 src0; \ + vxc_short8 dst0; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + VXC_WriteImage(output, coord.zy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +GATHER_ND_QINT_TO_F16_3D(U8, vxc_uchar16) +GATHER_ND_QINT_TO_F16_3D(I8, vxc_char16) +GATHER_ND_QINT_TO_F16_3D(I16, vxc_short8) + +#define GATHER_ND_F16_TO_QINT_3D(src1_type_name, write_type) \ +__kernel void gather_nd_F16to##src1_type_name##_3D( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + int4 indice = read_imagei(input1, coord.xy); \ + indice.x = indice.x * block_size + gidx; \ + indice.w = 0; \ + \ + vxc_short8 src; \ + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1), uniConvertFp16toU8_2x8); \ + VXC_WriteImage(output, coord.zy, dst, VXC_MODIFIER(0, 0, 0, 
VXC_RM_TowardZero, 0)); \ +} +GATHER_ND_F16_TO_QINT_3D(U8, vxc_uchar16) +GATHER_ND_F16_TO_QINT_3D(I8, vxc_char16) +GATHER_ND_F16_TO_QINT_3D(I16, vxc_short8) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx new file mode 100644 index 0000000..770498b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx @@ -0,0 +1,77 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertInt8toFp16_2x8; + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; + +_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8; +_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8; + +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8; +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp + +_viv_uniform VXC_512Bits uinConvertFp16ToInt16_2x8; + +#define GATHER_ND_QINT_TO_F16_1D(src0_type_name, read_type) \ +__kernel void gather_nd_##src0_type_name##toF16_1D( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + int4 indice = read_imagei(input1, coord.xy); \ + coord.w = indice.x; \ + \ + read_type src; \ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_half8 src0; \ + vxc_short8 dst0; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + VXC_WriteImage(output, coord.zy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +GATHER_ND_QINT_TO_F16_1D(U8, vxc_uchar16) +GATHER_ND_QINT_TO_F16_1D(I8, vxc_char16) +GATHER_ND_QINT_TO_F16_1D(I16, vxc_short8) + +#define GATHER_ND_F16_TO_QINT_1D(src1_type_name, write_type) \ +__kernel void gather_nd_F16to##src1_type_name##_1D( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + int4 indice = read_imagei(input1, coord.xy); \ + coord.w = indice.x; \ + \ + vxc_short8 src; \ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \ + VXC_WriteImage(output, coord.zy, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +GATHER_ND_F16_TO_QINT_1D(U8, vxc_uchar16) +GATHER_ND_F16_TO_QINT_1D(I8, vxc_char16) +GATHER_ND_F16_TO_QINT_1D(I16, vxc_short8) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation.vx new file mode 100644 index 0000000..7a12afa --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation.vx @@ -0,0 +1,106 @@ +#include "cl_viv_vx_ext.h" + +#define logE (1.44269502f) +#define twoLogE (2.88539004f) + +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hsigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 
tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + +_viv_uniform VXC_512Bits uniConvDatatoFp32_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform float4 tensorScale; +_viv_uniform float4 tensorZP; + +#define GRUCELL_ACTIVATION_SIGMOID_TANH(name0, name1, name2, name3, activater, \ + type00, type01, type10, type11, type20, type21, dst_type, conv_type, copy_type) \ +__kernel void grucell_activation_##name0##_##name1##_##name2##_to_##name3##_##activater \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output, \ + __write_only image2d_array_t hstate, \ + int gate_activation, \ + int candidate_activation \ + ) \ +{ \ + type00 src00; \ + type01 src01; \ + type00 src10; \ + type01 src11; \ + type00 src20; \ + type01 src21; \ + \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + VXC_ReadImage(src00, input0, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src01, src00, 8); \ + VXC_ReadImage(src10, input1, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, src10, 8); \ + VXC_ReadImage(src20, input2, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src21, src20, 8); \ + \ + float4 zt, ht, ht_1; \ + VXC_DP4x4(zt, src01, src01, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); \ + VXC_DP4x4(ht, src11, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); \ + VXC_DP4x4(ht_1, src21, src21, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); \ + \ + zt = zt * tensorScale.xxxx - tensorZP.xxxx; \ + zt = activater(zt); \ + \ + ht = ht * tensorScale.yyyy - tensorZP.yyyy; \ + ht = tangentH(ht); \ + \ + ht_1 = ht_1 * tensorScale.zzzz - tensorZP.zzzz; \ + \ + ht = ht - zt * ht; \ + ht = zt * ht_1 + ht; \ + \ + ht = ht * tensorScale.wwww + tensorZP.wwww; \ + conv_type dst0; \ + dst_type dst1; \ + copy_type dst; \ + \ + _viv_asm(CONV_RTE, dst0, ht); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst1, 8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(hstate, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} + +#define UCHAR8 vxc_uchar8 +#define SHORT8 vxc_short8 +#define HALF8 vxc_half8 + +GRUCELL_ACTIVATION_SIGMOID_TANH(U8, U8, U8, U8, sigmoid, + UCHAR8, UCHAR8, UCHAR8, UCHAR8, UCHAR8, UCHAR8, UCHAR8, int4, UCHAR8) +GRUCELL_ACTIVATION_SIGMOID_TANH(F16, F16, F16, F16, sigmoid, + SHORT8, HALF8, SHORT8, HALF8, SHORT8, HALF8, HALF8, half4, SHORT8) +GRUCELL_ACTIVATION_SIGMOID_TANH(F16, F16, F16, U8, sigmoid, + SHORT8, HALF8, SHORT8, HALF8, SHORT8, HALF8, UCHAR8, int4, UCHAR8) +GRUCELL_ACTIVATION_SIGMOID_TANH(U8, U8, U8, U8, hsigmoid, + UCHAR8, UCHAR8, UCHAR8, UCHAR8, UCHAR8, UCHAR8, UCHAR8, int4, UCHAR8) +GRUCELL_ACTIVATION_SIGMOID_TANH(F16, F16, F16, F16, hsigmoid, + SHORT8, HALF8, SHORT8, HALF8, SHORT8, HALF8, HALF8, half4, SHORT8) +GRUCELL_ACTIVATION_SIGMOID_TANH(F16, F16, F16, U8, hsigmoid, + SHORT8, HALF8, SHORT8, HALF8, SHORT8, HALF8, UCHAR8, int4, UCHAR8) + +#undef UCHAR8 +#undef SHORT8 +#undef HALF8 diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_sma.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_sma.vx new file mode 100644 index 0000000..660bc23 --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_sma.vx @@ -0,0 +1,63 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniA_Minus_B_2x8; +_viv_uniform VXC_512Bits uniA_Times_B_2x8; +_viv_uniform VXC_512Bits uniA_Plus_B_2x8; +__kernel void grucell_activation_sma_F16_F16_F16toF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_array_t input2, + __write_only image2d_array_t output, + __write_only image2d_array_t h_status + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_half8 src0, src1, src2, minus, dst; + vxc_ushort8 vec0, vec1, vec2; + + VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + VXC_ReadImage2DArray(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src2, vec2, 16); + + VXC_DP2x8(minus, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Minus_B_2x8); + VXC_DP2x8(dst, minus, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Times_B_2x8); + VXC_DP2x8(dst, dst, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Plus_B_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(h_status, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void grucell_activation_sma_F16_F16_F16toF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_array_t input2, + __write_only image2d_array_t output, + __write_only image2d_array_t h_status + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_half8 src0, src1, src2, minus, dst; + vxc_ushort8 vec0, vec1, vec2; + + VXC_ReadImage(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + VXC_ReadImage(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src2, vec2, 16); + + VXC_DP2x8(minus, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Minus_B_2x8); + VXC_DP2x8(dst, minus, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Times_B_2x8); + VXC_DP2x8(dst, dst, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Plus_B_2x8); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(h_status, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_cdnn_activation.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_cdnn_activation.vx new file mode 100644 index 0000000..b6238e6 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_cdnn_activation.vx @@ -0,0 +1,390 @@ +#include "cl_viv_vx_ext.h" + +#define logE (1.44269502f) +#define twoLogE (2.88539004f) + +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + +_viv_uniform VXC_512Bits uniConvDatatoFp32_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uiF16AddF16_4x4; + +__kernel void 
grucell_activation_cdnn_sep_F16_F16_F16_to_F16_NC + ( + __read_only image2d_array_t prev_state, + __read_only image2d_array_t input_r, + __read_only image2d_array_t input_z, + __read_only image2d_array_t input_c, + __read_only image2d_array_t recur_r, + __read_only image2d_array_t recur_z, + __read_only image2d_array_t recur_c, + __read_only image2d_t bias_r, + __read_only image2d_t bias_z, + __read_only image2d_t bias_c, + __read_only image2d_t cond_r, + __read_only image2d_t cond_z, + __read_only image2d_t cond_c, + __write_only image2d_array_t output, + __write_only image2d_array_t hstate, + int gate_activation, + int candidate_activation, + int batch_first + ) +{ + vxc_ushort8 s0, s1; + vxc_half8 r0, r1; + vxc_ushort8 s2, s3; + vxc_half8 z0, z1; + vxc_ushort8 s4, s5; + vxc_half8 c0, c1; + float4 r, r2, r3; + float4 z, z2, z3; + float4 c, c2, c3; + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, r0, s0, 8); + VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, r1, s1, 8); + r2 = read_imagef(bias_r, coord); + r3 = read_imagef(cond_r, coord); + + VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, z0, s2, 8); + VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, z1, s3, 8); + z2 = read_imagef(bias_z, coord); + z3 = read_imagef(cond_z, coord); + + VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, c0, s4, 8); + VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, c1, s5, 8); + c2 = read_imagef(bias_c, coord); + c3 = read_imagef(cond_c, coord); + + VXC_DP4x4(r, r0, r1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4); + r = r + r2 + r3; + VXC_DP4x4(z, z0, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4); + z = z + z2 + z3; + + vxc_ushort8 s7; + vxc_half8 h; + VXC_ReadImage(s7, prev_state, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, h, s7, 8); + + r = sigmoid(r); + z = sigmoid(z); + + c = c2 * r + c3; + VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + c = c2 + c3 * r + c; + c = tangentH(c); + + float4 state; + VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + + state = z * (state - c) + c; + + half4 dst0; + vxc_half4 dst1; + vxc_short4 dst; + _viv_asm(CONV_RTE, dst0, state); + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst1, 8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void grucell_activation_cdnn_sep_F16_F16_F16_to_F16_CN + ( + __read_only image2d_array_t prev_state, + __read_only image2d_array_t input_r, + __read_only image2d_array_t input_z, + __read_only image2d_array_t input_c, + __read_only image2d_array_t recur_r, + __read_only image2d_array_t recur_z, + __read_only image2d_array_t recur_c, + __read_only image2d_t bias_r, + __read_only image2d_t bias_z, + __read_only image2d_t bias_c, + __read_only image2d_t cond_r, + __read_only image2d_t cond_z, + __read_only image2d_t cond_c, + 
__write_only image2d_array_t output, + __write_only image2d_array_t hstate, + int gate_activation, + int candidate_activation, + int batch_first + ) +{ + vxc_ushort8 s0, s1; + vxc_half8 r0, r1; + vxc_ushort8 s2, s3; + vxc_half8 z0, z1; + vxc_ushort8 s4, s5; + vxc_half8 c0, c1; + float4 r, r2, r3; + float4 z, z2, z3; + float4 c, c2, c3; + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, r0, s0, 8); + VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, r1, s1, 8); + r2 = read_imagef(bias_r, coord.yx); + r3 = read_imagef(cond_r, coord.yx); + + VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, z0, s2, 8); + VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, z1, s3, 8); + z2 = read_imagef(bias_z, coord.yx); + z3 = read_imagef(cond_z, coord.yx); + + VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, c0, s4, 8); + VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, c1, s5, 8); + c2 = read_imagef(bias_c, coord.yx); + c3 = read_imagef(cond_c, coord.yx); + + VXC_DP4x4(r, r0, r1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4); + r = r + r2.xxxx + r3.xxxx; + VXC_DP4x4(z, z0, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4); + z = z + z2.xxxx + z3.xxxx; + + vxc_ushort8 s7; + vxc_half8 h; + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, h, s7, 8); + + r = sigmoid(r); + z = sigmoid(z); + + c = c2.xxxx * r + c3.xxxx; + VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + c = c2 + c3 * r + c; + c = tangentH(c); + + float4 state; + VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + + state = z * (state - c) + c; + + half4 dst0; + vxc_half4 dst1; + vxc_short4 dst; + _viv_asm(CONV_RTE, dst0, state); + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst1, 8); + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord.x ++; + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord.x ++; + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord.x ++; + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void grucell_activation_cdnn_sep_F16_F16_F16_to_F16_CN_FULL + ( + __read_only 
image2d_array_t prev_state, + __read_only image2d_array_t input_r, + __read_only image2d_array_t input_z, + __read_only image2d_array_t input_c, + __read_only image2d_array_t recur_r, + __read_only image2d_array_t recur_z, + __read_only image2d_array_t recur_c, + __read_only image2d_t bias_r, + __read_only image2d_t bias_z, + __read_only image2d_t bias_c, + __read_only image2d_t cond_r, + __read_only image2d_t cond_z, + __read_only image2d_t cond_c, + __write_only image2d_array_t output, + __write_only image2d_array_t hstate, + int gate_activation, + int candidate_activation, + int batch_first + ) +{ + vxc_ushort8 s0, s1; + vxc_half8 r0, r1; + vxc_ushort8 s2, s3; + vxc_half8 z0, z1; + vxc_ushort8 s4, s5; + vxc_half8 c0, c1; + float4 r, r2, r3; + float4 z, z2, z3; + float4 c, c2, c3; + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, r0, s0, 8); + VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, r1, s1, 8); + r2 = read_imagef(bias_r, coord.yx); + r3 = read_imagef(cond_r, coord.yx); + + VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, z0, s2, 8); + VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, z1, s3, 8); + z2 = read_imagef(bias_z, coord.yx); + z3 = read_imagef(cond_z, coord.yx); + + VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, c0, s4, 8); + VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, c1, s5, 8); + c2 = read_imagef(bias_c, coord.yx); + c3 = read_imagef(cond_c, coord.yx); + + VXC_DP4x4(r, r0, r1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4); + r = r + r2.xxxx + r3.xxxx; + VXC_DP4x4(z, z0, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4); + z = z + z2.xxxx + z3.xxxx; + + vxc_ushort8 s7; + vxc_half8 h; + VXC_ReadImage(s7, prev_state, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, h, s7, 8); + + r = sigmoid(r); + z = sigmoid(z); + + c = c2.xxxx * r + c3.xxxx; + VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + c = c2 + c3 * r + c; + c = tangentH(c); + + float4 state; + VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + + state = z * (state - c) + c; + + half4 dst0; + vxc_half4 dst1; + vxc_short4 dst; + _viv_asm(CONV_RTE, dst0, state); + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst1, 8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + + +__kernel void grucell_activation_cdnn_F16_F16_F16_to_F16 + ( + __read_only image2d_array_t prev_state, + __read_only image2d_array_t input_rzc, + __read_only image2d_array_t recur_rzc, + __read_only image2d_t bias_r, + __read_only image2d_t bias_z, + __read_only image2d_t bias_c, + __read_only image2d_t cond_r, + __read_only image2d_t cond_z, + __read_only image2d_t cond_c, + __write_only image2d_array_t output, + __write_only image2d_array_t hstate, + int gate_activation, + int candidate_activation, + int batch_first + ) +{ + 
vxc_ushort8 s0, s1; + vxc_half8 r0, r1; + vxc_ushort8 s2, s3; + vxc_half8 z0, z1; + vxc_ushort8 s4, s5; + vxc_half8 c0, c1; + float4 r, r2, r3; + float4 z, z2, z3; + float4 c, c2, c3; + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1) * 3, get_global_id(1)); + + VXC_ReadImage(s0, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, r0, s0, 8); + VXC_ReadImage(s1, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, r1, s1, 8); + r2 = read_imagef(bias_r, coord.xy); + r3 = read_imagef(cond_r, coord.xy); + + VXC_ReadImage(s2, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, z0, s2, 8); + VXC_ReadImage(s3, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, z1, s3, 8); + z2 = read_imagef(bias_z, coord.xy); + z3 = read_imagef(cond_z, coord.xy); + + VXC_ReadImage(s4, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, c0, s4, 8); + VXC_ReadImage(s5, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, c1, s5, 8); + c2 = read_imagef(bias_c, coord.xy); + c3 = read_imagef(cond_c, coord.xy); + + VXC_DP4x4(r, r0, r1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4); + r = r + r2 + r3; + VXC_DP4x4(z, z0, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4); + z = z + z2 + z3; + + vxc_ushort8 s7; + vxc_half8 h; + VXC_ReadImage(s7, prev_state, coord.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, h, s7, 8); + + r = sigmoid(r); + z = sigmoid(z); + + c = c2 * r + c3; + VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + c = c2 + c3 * r + c; + c = tangentH(c); + + float4 state; + VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + + state = z * (state - c) + c; + + half4 dst0; + vxc_half4 dst1; + vxc_short4 dst; + _viv_asm(CONV_RTE, dst0, state); + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst1, 8); + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_cdnn_activation_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_cdnn_activation_u8.vx new file mode 100644 index 0000000..7035bf3 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_cdnn_activation_u8.vx @@ -0,0 +1,396 @@ +#include "cl_viv_vx_ext.h" + +#define logE (1.44269502f) +#define twoLogE (2.88539004f) + +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + +_viv_uniform VXC_512Bits uniConvDatatoFp32_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +_viv_uniform float input_scale; +_viv_uniform float input_tail; +_viv_uniform float input_r_scale; +_viv_uniform float input_r_tail; +_viv_uniform float recur_r_scale; +_viv_uniform float recur_r_tail; +_viv_uniform float 
input_z_scale; +_viv_uniform float input_z_tail; +_viv_uniform float recur_z_scale; +_viv_uniform float recur_z_tail; +_viv_uniform float input_c_scale; +_viv_uniform float input_c_tail; +_viv_uniform float recur_c_scale; +_viv_uniform float recur_c_tail; +_viv_uniform float output_scale; +_viv_uniform float output_zp; +__kernel void grucell_activation_cdnn_sep_U8_U8_U8_to_U8_NC + ( + __read_only image2d_array_t prev_state, + __read_only image2d_array_t input_r, + __read_only image2d_array_t input_z, + __read_only image2d_array_t input_c, + __read_only image2d_array_t recur_r, + __read_only image2d_array_t recur_z, + __read_only image2d_array_t recur_c, + __read_only image2d_t bias_r, + __read_only image2d_t bias_z, + __read_only image2d_t bias_c, + __read_only image2d_t cond_r, + __read_only image2d_t cond_z, + __read_only image2d_t cond_c, + __write_only image2d_array_t output, + __write_only image2d_array_t hstate, + int gate_activation, + int candidate_activation, + int batch_first + ) +{ + vxc_uchar8 r00, r01; + vxc_uchar8 z0, z1; + vxc_uchar8 c0, c1; + float4 r, r0, r1, r2, r3; + float4 z, z2, z3; + float4 c, c2, c3; + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + VXC_ReadImage(r00, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(r01, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + r2 = read_imagef(bias_r, coord); + r3 = read_imagef(cond_r, coord); + + VXC_ReadImage(z0, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(z1, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + z2 = read_imagef(bias_z, coord); + z3 = read_imagef(cond_z, coord); + + VXC_ReadImage(c0, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(c1, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + c2 = read_imagef(bias_c, coord); + c3 = read_imagef(cond_c, coord); + + VXC_DP4x4(r0, r00, r00, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + VXC_DP4x4(r1, r01, r01, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + r0 = r0 * input_r_scale + input_r_tail; + r1 = r1 * recur_r_scale + recur_r_tail; + r = r0 + r1 + r2 + r3; + + VXC_DP4x4(r0, z0, z0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + VXC_DP4x4(r1, z1, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + r0 = r0 * input_z_scale + input_z_tail; + r1 = r1 * recur_z_scale + recur_z_tail; + z = r0 + r1 + z2 + z3; + + vxc_uchar8 h; + VXC_ReadImage(h, prev_state, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + r = sigmoid(r); + z = sigmoid(z); + + c = c2 * r + c3; + + VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + c2 = c2 * input_c_scale + input_c_tail; + c3 = c3 * recur_c_scale + recur_c_tail; + c = c2 + c3 * r + c; + c = tangentH(c); + + float4 state; + VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + state = state * input_scale + input_tail; + state = z * (state - c) + c; + + state = state * output_scale + output_zp; + + int4 dst0; + vxc_uchar4 dst; + _viv_asm(CONV_RTE, dst0, state); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord, dst, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0)); +} + +__kernel void grucell_activation_cdnn_sep_U8_U8_U8_to_U8_CN + ( + __read_only image2d_array_t prev_state, + __read_only image2d_array_t input_r, + __read_only image2d_array_t input_z, + __read_only image2d_array_t input_c, + __read_only image2d_array_t recur_r, + __read_only image2d_array_t recur_z, + __read_only image2d_array_t recur_c, + __read_only image2d_t bias_r, + __read_only image2d_t bias_z, + __read_only image2d_t bias_c, + __read_only image2d_t cond_r, + __read_only image2d_t cond_z, + __read_only image2d_t cond_c, + __write_only image2d_array_t output, + __write_only image2d_array_t hstate, + int gate_activation, + int candidate_activation, + int batch_first + ) +{ + vxc_uchar8 r00, r01; + vxc_uchar8 z0, z1; + vxc_uchar8 c0, c1; + float4 r, r2, r3, r0, r1; + float4 z, z2, z3; + float4 c, c2, c3; + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + VXC_ReadImage(r00, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(r01, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + r2 = read_imagef(bias_r, coord.yx); + r3 = read_imagef(cond_r, coord.yx); + + VXC_ReadImage(z0, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(z1, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + z2 = read_imagef(bias_z, coord.yx); + z3 = read_imagef(cond_z, coord.yx); + + VXC_ReadImage(c0, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(c1, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + c2 = read_imagef(bias_c, coord.yx); + c3 = read_imagef(cond_c, coord.yx); + + VXC_DP4x4(r0, r00, r00, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + VXC_DP4x4(r1, r01, r01, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + r0 = r0 * input_r_scale + input_r_tail; + r1 = r1 * recur_r_scale + recur_r_tail; + r = r0 + r1 + r2.xxxx + r3.xxxx; + + VXC_DP4x4(r0, z0, z0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + VXC_DP4x4(r1, z1, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + r0 = r0 * input_z_scale + input_z_tail; + r1 = r1 * recur_z_scale + recur_z_tail; + z = r0 + r1 + z2.xxxx + z3.xxxx; + + vxc_uchar8 h; + VXC_ReadImage(h, prev_state, coord.yx, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(h, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(h, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(h, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + r = sigmoid(r); + z = sigmoid(z); + + c = c2.xxxx * r + c3.xxxx; + VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + c2 = c2 * input_c_scale + input_c_tail; + c3 = c3 * recur_c_scale + recur_c_tail; + c = c2 + c3 * r + c; + c = tangentH(c); + + float4 state; + VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + state = state * input_scale + input_tail; + state = z * (state - c) + c; + + state = state * output_scale + output_zp; + + int4 dst0; + vxc_uchar4 dst; + _viv_asm(CONV_RTE, dst0, state); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(0, 
0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord.x ++; + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord.x ++; + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord.x ++; + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void grucell_activation_cdnn_sep_U8_U8_U8_to_U8_CN_FULL + ( + __read_only image2d_array_t prev_state, + __read_only image2d_array_t input_r, + __read_only image2d_array_t input_z, + __read_only image2d_array_t input_c, + __read_only image2d_array_t recur_r, + __read_only image2d_array_t recur_z, + __read_only image2d_array_t recur_c, + __read_only image2d_t bias_r, + __read_only image2d_t bias_z, + __read_only image2d_t bias_c, + __read_only image2d_t cond_r, + __read_only image2d_t cond_z, + __read_only image2d_t cond_c, + __write_only image2d_array_t output, + __write_only image2d_array_t hstate, + int gate_activation, + int candidate_activation, + int batch_first + ) +{ + vxc_uchar8 r00, r01; + vxc_uchar8 z0, z1; + vxc_uchar8 c0, c1; + float4 r, r2, r3, r0, r1; + float4 z, z2, z3; + float4 c, c2, c3; + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + VXC_ReadImage(r00, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(r01, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + r2 = read_imagef(bias_r, coord.yx); + r3 = read_imagef(cond_r, coord.yx); + + VXC_ReadImage(z0, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(z1, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + z2 = read_imagef(bias_z, coord.yx); + z3 = read_imagef(cond_z, coord.yx); + + VXC_ReadImage(c0, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(c1, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + c2 = read_imagef(bias_c, coord.yx); + c3 = read_imagef(cond_c, coord.yx); + + VXC_DP4x4(r0, r00, r00, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + VXC_DP4x4(r1, r01, r01, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + r0 = r0 * input_r_scale + input_r_tail; + r1 = r1 * recur_r_scale + recur_r_tail; + r = r0 + r1 + r2.xxxx + r3.xxxx; + + VXC_DP4x4(r0, z0, z0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + VXC_DP4x4(r1, z1, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + r0 = r0 * input_z_scale + input_z_tail; + r1 = r1 * recur_z_scale + recur_z_tail; + z = r0 + r1 + z2.xxxx + z3.xxxx; + + vxc_uchar8 h; + VXC_ReadImage(h, prev_state, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + r = sigmoid(r); + z = sigmoid(z); + + c = c2.xxxx * r + c3.xxxx; + VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + c2 = c2 * input_c_scale + input_c_tail; + c3 = c3 * recur_c_scale + recur_c_tail; + c = c2 + c3 * r + c; + c = tangentH(c); + + float4 state; + VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); + state = state * input_scale 
+ input_tail;
+ state = z * (state - c) + c;
+
+ state = state * output_scale + output_zp;
+
+ int4 dst0;
+ vxc_uchar4 dst;
+ _viv_asm(CONV_RTE, dst0, state);
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8);
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(hstate, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+}
+
+__kernel void grucell_activation_cdnn_U8_U8_U8_to_U8
+ (
+ __read_only image2d_array_t prev_state,
+ __read_only image2d_array_t input_rzc,
+ __read_only image2d_array_t recur_rzc,
+ __read_only image2d_t bias_r,
+ __read_only image2d_t bias_z,
+ __read_only image2d_t bias_c,
+ __read_only image2d_t cond_r,
+ __read_only image2d_t cond_z,
+ __read_only image2d_t cond_c,
+ __write_only image2d_array_t output,
+ __write_only image2d_array_t hstate,
+ int gate_activation,
+ int candidate_activation,
+ int batch_first
+ )
+{
+ vxc_uchar8 r00, r01;
+ vxc_uchar8 z0, z1;
+ vxc_uchar8 c0, c1;
+ float4 r, r0, r1, r2, r3;
+ float4 z, z2, z3;
+ float4 c, c2, c3;
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1) * 3, get_global_id(1));
+
+ VXC_ReadImage(r00, input_rzc, coord.xz, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(r01, recur_rzc, coord.xz, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ r2 = read_imagef(bias_r, coord.xy);
+ r3 = read_imagef(cond_r, coord.xy);
+
+ VXC_ReadImage(z0, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(z1, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ z2 = read_imagef(bias_z, coord.xy);
+ z3 = read_imagef(cond_z, coord.xy);
+
+ VXC_ReadImage(c0, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(c1, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ c2 = read_imagef(bias_c, coord.xy);
+ c3 = read_imagef(cond_c, coord.xy);
+
+ VXC_DP4x4(r0, r00, r00, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);
+ VXC_DP4x4(r1, r01, r01, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);
+ r0 = r0 * input_r_scale + input_r_tail;
+ r1 = r1 * recur_r_scale + recur_r_tail;
+ r = r0 + r1 + r2 + r3;
+ VXC_DP4x4(r0, z0, z0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);
+ VXC_DP4x4(r1, z1, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);
+ r0 = r0 * input_z_scale + input_z_tail;
+ r1 = r1 * recur_z_scale + recur_z_tail;
+ z = r0 + r1 + z2 + z3;
+
+ vxc_uchar8 h;
+ VXC_ReadImage(h, prev_state, coord.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+
+ r = sigmoid(r);
+ z = sigmoid(z);
+
+ c = c2 * r + c3;
+ VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);
+ VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);
+ c2 = c2 * input_c_scale + input_c_tail;
+ c3 = c3 * recur_c_scale + recur_c_tail;
+ c = c2 + c3 * r + c;
+ c = tangentH(c);
+
+ float4 state;
+ VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);
+ state = state * input_scale + input_tail;
+ state = z * (state - c) + c;
+
+ state = state * output_scale + output_zp;
+
+ int4 dst0;
+ vxc_uchar4 dst;
+ _viv_asm(CONV_RTE, dst0, state);
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8);
+ VXC_WriteImage(output,
coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/hswish.vx b/src/tim/vx/internal/src/libnnext/ops/vx/hswish.vx new file mode 100644 index 0000000..9939096 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/hswish.vx @@ -0,0 +1,141 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float inputScale; +_viv_uniform float inputTail; +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4; +_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; + +#define HSWISH_PROCESS(read_fun, write_fun, src_type, src_copy_type, convert_type, dst_type, dst_copy_type, \ + INSCALE, INTAIL, OUTSCALE, OUTZP) \ + src_type src0; \ + src_copy_type src1; \ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, src0, 16); \ + float4 vecA, vecB, vecC, vecD, vecE, vecDstA, vecDstB; \ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \ + VXC_DP4x4(vecB, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \ + vecA = vecA * INSCALE + INTAIL; \ + vecB = vecB * INSCALE + INTAIL; \ + vecC = vecA + 3.0f; \ + vecD = vecB + 3.0f; \ + vecE = 6.0f; \ + _viv_asm(CLAMP0MAX, vecDstA, vecC, vecE); \ + _viv_asm(CLAMP0MAX, vecDstB, vecD, vecE); \ + vecA = vecA * vecDstA; \ + vecB = vecB * vecDstB; \ + vecA = vecA / 6.0f; \ + vecB = vecB / 6.0f; \ + vecA = vecA * OUTSCALE + OUTZP; \ + vecB = vecB * OUTSCALE + OUTZP; \ + convert_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, vecA); \ + _viv_asm(CONV_RTE, dst1, vecB); \ + dst_type dst2; \ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + dst_copy_type dst; \ + _viv_asm(COPY, dst, dst2, 16); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + +#define HSWISH_FUNC(src_type_name, dst_type_name, src_type, src_copy_type, convert_type, dst_type, \ + dst_copy_type, INSCALE, INTAIL, OUTSCALE, OUTZP) \ + __kernel void hswish_##src_type_name##to##dst_type_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float beta \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + HSWISH_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray, src_type, \ + src_copy_type, convert_type, dst_type, dst_copy_type, \ + INSCALE, INTAIL, OUTSCALE, OUTZP) \ +} + +HSWISH_FUNC(F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8, 1, 0, 1, 0) +HSWISH_FUNC(F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8, 1, 0, outputScale, 0) +HSWISH_FUNC(F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8, 1, 0, outputScale, outputZP) +HSWISH_FUNC(F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8, 1, 0, outputScale, 0) +HSWISH_FUNC(I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8, inputScale, 0, outputScale, 0) +HSWISH_FUNC(I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0) +HSWISH_FUNC(U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8, \ + inputScale, inputTail, outputScale, outputZP) +HSWISH_FUNC(U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8, inputScale, inputTail, 1, 0) +HSWISH_FUNC(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8, inputScale, 0, outputScale, 0) 
+HSWISH_FUNC(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0) + + +#define HSWISH_FUNC_2D(src_type_name, dst_type_name, src_type, src_copy_type, convert_type, dst_type, \ + dst_copy_type, INSCALE, INTAIL, OUTSCALE, OUTZP) \ + __kernel void hswish_##src_type_name##to##dst_type_name##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float beta \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + HSWISH_PROCESS(VXC_ReadImage, VXC_WriteImage, src_type, src_copy_type, convert_type, dst_type, \ + dst_copy_type, INSCALE, INTAIL, OUTSCALE, OUTZP) \ +} + +HSWISH_FUNC_2D(F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8, 1, 0, 1, 0) +HSWISH_FUNC_2D(F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8, 1, 0, outputScale, 0) +HSWISH_FUNC_2D(F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8, 1, 0, outputScale, outputZP) +HSWISH_FUNC_2D(F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8, 1, 0, outputScale, 0) +HSWISH_FUNC_2D(I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8, inputScale, 0, outputScale, 0) +HSWISH_FUNC_2D(I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0) +HSWISH_FUNC_2D(U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8, inputScale, \ + inputTail, outputScale, outputZP) +HSWISH_FUNC_2D(U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8, inputScale, inputTail, 1, 0) +HSWISH_FUNC_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8, inputScale, 0, outputScale, 0) +HSWISH_FUNC_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0) + + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +#define HSWISH_BF16_PROCESS(read_fun, write_fun) \ + vxc_ushort8 src0, src1, dst; \ + float4 vecA, vecB, vecC, vecD, vecE, vecDstA, vecDstB; \ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecA, src1, 16); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, vecB, src1, 16); \ + vecC = vecA + 3.0f; \ + vecD = vecB + 3.0f; \ + vecE = 6.0f; \ + _viv_asm(CLAMP0MAX, vecDstA, vecC, vecE); \ + _viv_asm(CLAMP0MAX, vecDstB, vecD, vecE); \ + vecA = vecA * vecDstA; \ + vecB = vecB * vecDstB; \ + vecA = vecA / 6.0f; \ + vecB = vecB / 6.0f; \ + _viv_asm(COPY, src0, vecA, 16); \ + _viv_asm(COPY, src1, vecB, 16); \ + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void hswish_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float beta + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + HSWISH_BF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray); +} + +__kernel void hswish_BF16toBF16_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float beta + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + HSWISH_BF16_PROCESS(VXC_ReadImage, VXC_WriteImage); +} diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx new file mode 100644 index 0000000..c942079 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx @@ -0,0 +1,261 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; + +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16( + image2d_array_t input, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + vxc_half8 in_h; + vxc_float4 sumsqr; + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSumSqr += sumsqr; + } + } + + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0; + float sqr = 0; + for(int i = 0; i < 4; i++) + { + //sum += lcl_sum[i]; + //sqr += lcl_sqr[i]; + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16_2D( + image2d_array_t input, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int gidy = gidz * height; + + int2 coord = (int2)(gidx, gidy); + vxc_short8 src0; + vxc_half8 in_h; + vxc_float4 sumsqr; + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + height; + if(gidx < width) + { + for(; coord.y < endH;) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSumSqr += sumsqr; + } + } + + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0; + float sqr = 0; + for(int i = 0; i < 4; i++) + { + //sum += lcl_sum[i]; + //sqr += lcl_sqr[i]; + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t meanVari, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord_para = (int4)(gidz, 0, 0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + bias_f = read_imagef(bias, coord_para); + + coord_para.x = 0; + coord_para.y = gidz; + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para); + coord_para.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, in_h, src0, 16); + + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16_2D( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t meanVari, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int4 coord = (int4)(get_global_id(0), gidy, 0, 0); + int4 coord_para = (int4)(gidz, 0, 0, 0); + int endH = gidy + height; + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + bias_f = read_imagef(bias, coord_para); + + coord_para.x = 0; + coord_para.y = gidz; + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para); + coord_para.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + for(; coord.y < 
endH; coord.y++) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, in_h, src0, 16); + + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx new file mode 100644 index 0000000..cedc0a2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx @@ -0,0 +1,398 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; + +_viv_uniform float inFlScale_s2; +_viv_uniform float input_fl_scale; +_viv_uniform float inOut_fl_scale; +_viv_uniform float output_fl_scale; + +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I16( + image2d_array_t input, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + float sum = 0, sqr = 0; + vxc_float4 sumsqr = (vxc_float4)(0); + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + //tmpSumSqr += sumsqr; + tmpSumSqr.x += sumsqr.x; + sqr += (sumsqr.y * inFlScale_s2); + } + sum = tmpSumSqr.x * input_fl_scale; + //sqr = tmpSumSqr.y * inFlScale_s2; + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + //sum += lcl_sum[i]; + //sqr += lcl_sqr[i]; + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I16_2D( + image2d_array_t input, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int gidy 
= gidz * height; + + int2 coord = (int2)(gidx, gidy); + vxc_short8 src0; + float sum = 0, sqr = 0; + vxc_float4 sumsqr = (vxc_float4)(0); + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + height; + if(gidx < width) + { + for(; coord.y < endH;) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + //tmpSumSqr += sumsqr; + tmpSumSqr.x += sumsqr.x; + sqr += (sumsqr.y * inFlScale_s2); + } + sum = tmpSumSqr.x * input_fl_scale; + //sqr = tmpSumSqr.y * inFlScale_s2; + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + //sum += lcl_sum[i]; + //sqr += lcl_sqr[i]; + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toF16( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t meanVari, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord_para = (int4)(gidz, 0, 0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + + bias_f = read_imagef(bias, coord_para); + + coord_para.x = 0; + coord_para.y = gidz; + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para); + coord_para.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toF16_2D( + 
image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t meanVari, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int4 coord = (int4)(get_global_id(0), gidy, 0, 0); + int4 coord_para = (int4)(gidz, 0, 0, 0); + int endH = gidy + height; + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + + bias_f = read_imagef(bias, coord_para); + + coord_para.x = 0; + coord_para.y = gidz; + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para); + coord_para.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + for(; coord.y < endH; coord.y++) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toI16( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t meanVari, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord_para = (int4)(gidz, 0, 0, 0); + vxc_short8 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + bias_f = read_imagef(bias, coord_para); + coord_para.x = 0; + coord_para.y = gidz; + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para); + coord_para.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage2DArray(src0, input, 
coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toInt16_2x8); + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toI16_2D( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t meanVari, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int2 coord = (int2)(get_global_id(0), gidy); + int4 coord_para = (int4)(gidz, 0, 0, 0); + int endH = gidy + height; + vxc_short8 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + bias_f = read_imagef(bias, coord_para); + coord_para.x = 0; + coord_para.y = gidz; + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para); + coord_para.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + for(; coord.y < endH; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toInt16_2x8); + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx new file mode 100644 index 0000000..489da14 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx @@ -0,0 +1,449 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSumInt8_16x1; +_viv_uniform VXC_512Bits uniSqrSumInt8_16x1; 
+_viv_uniform float inFlScale_s2; +_viv_uniform float input_fl_scale; + +_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4; + +_viv_uniform float inOut_fl_scale; +_viv_uniform float output_fl_scale; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8( + image2d_array_t input, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_char16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0; + int tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1); + tmpSqr += (tmpSqr1); + } + sqr = tmpSqr * inFlScale_s2; + sum = tmpSum * input_fl_scale; + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + //sum += lcl_sum[i]; + //sqr += lcl_sqr[i]; + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8_2D( + image2d_array_t input, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int gidy = gidz * height; + + int2 coord = (int2)(gidx, gidy); + vxc_char16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0; + int tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + height; + if(gidx < width) + { + tmpSqr = 0; + for(; coord.y < endH;) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1); + tmpSqr += (tmpSqr1); + } + sqr = tmpSqr * inFlScale_s2; + sum = tmpSum * input_fl_scale; + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + //sum += lcl_sum[i]; + //sqr += lcl_sqr[i]; + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t meanVari, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord_para = (int4)(gidz, 0, 0, 0); + vxc_char16 src0; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + + bias_f = read_imagef(bias, coord_para); + + coord_para.x = 0; + coord_para.y = gidz; + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para); + coord_para.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + for(coord.y = 0; coord.y < height;) + { + coord_para = coord; + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertFthInt8Fp32_4x4); + + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16_2D( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t meanVari, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int4 coord = (int4)(get_global_id(0), gidy, 0, 0); + int4 coord_para = (int4)(gidz, 0, 0, 0); + int endH = gidy + height; + vxc_char16 src0; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, 
src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + + bias_f = read_imagef(bias, coord_para); + + coord_para.x = 0; + coord_para.y = gidz; + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para); + coord_para.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + for(; coord.y < endH;) + { + coord_para = coord; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertFthInt8Fp32_4x4); + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t meanVari, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord_para = (int4)(gidz, 0, 0, 0); + vxc_char16 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + bias_f = read_imagef(bias, coord_para); + coord_para.x = 0; + coord_para.y = gidz; + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para); + coord_para.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + for(coord.y = 0; coord.y < height; 
coord.y++) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertFthInt8Fp32_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8_2D( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t meanVari, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int2 coord = (int2)(get_global_id(0), gidy); + int4 coord_para = (int4)(gidz, 0, 0, 0); + int endH = gidy + height; + vxc_char16 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + bias_f = read_imagef(bias, coord_para); + coord_para.x = 0; + coord_para.y = gidz; + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para); + coord_para.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + for(; coord.y < endH; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertFthInt8Fp32_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 
= convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx new file mode 100644 index 0000000..68f8f8a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx @@ -0,0 +1,447 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform float e2InScale; +_viv_uniform float rowSumScale; +_viv_uniform float scale_inOut; +_viv_uniform float outputScale; +_viv_uniform int output_ZP; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8( + image2d_array_t input, + image2d_array_t output, + float eps, int rsFlg) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); + } + sqr += (tmpSqr * e2InScale + rowSumScale); + sum = (tmpSum + sumInZp) * input_scale; + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8_2D( + image2d_array_t input, + image2d_array_t output, + float eps, int rsFlg) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int gidy = gidz * height; + + int2 coord = (int2)(gidx, gidy); + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + 
height; + if(gidx < width) + { + for(; coord.y < endH;) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); + } + sqr += (tmpSqr * e2InScale + rowSumScale); + sum = (tmpSum + sumInZp) * input_scale; + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t meanVari, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord_para = (int4)(gidz, 0, 0, 0); + vxc_uchar16 src0; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + + bias_f = read_imagef(bias, coord_para); + + coord_para.x = 0; + coord_para.y = gidz; + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para); + coord_para.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + float alpha = input_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + for(coord.y = 0; coord.y < height;) + { + coord_para = coord; + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.x += 8; + norm 
= alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16_2D( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t meanVari, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int4 coord = (int4)(get_global_id(0), gidy, 0, 0); + int4 coord_para = (int4)(gidz, 0, 0, 0); + int endH = gidy + height; + vxc_uchar16 src0; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + + bias_f = read_imagef(bias, coord_para); + + coord_para.x = 0; + coord_para.y = gidz; + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para); + coord_para.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + float alpha = input_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + for(; coord.y < endH;) + { + coord_para = coord; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t meanVari, + image2d_array_t output, + float eps, + int rsFlg) +{ + int 
gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord_para = (int4)(gidz, 0, 0, 0); + vxc_uchar16 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + bias_f = read_imagef(bias, coord_para); + coord_para.x = 0; + coord_para.y = gidz; + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para); + coord_para.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + float alpha = scale_inOut * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + + for(coord.y = 0; coord.y < height;coord.y++) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8_2D( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t meanVari, + image2d_array_t output, + float eps, + int rsFlg) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int2 coord = (int2)(get_global_id(0), gidy); + int4 coord_para = (int4)(gidz, 0, 0, 0); + int endH = gidy + height; + vxc_uchar16 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + bias_f = read_imagef(bias, coord_para); + coord_para.x = 0; + coord_para.y = gidz; + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para); + coord_para.x += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 
+ eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + float alpha = scale_inOut * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + + for(; coord.y < endH; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx new file mode 100644 index 0000000..d35d79e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx @@ -0,0 +1,322 @@ +#include "cl_viv_vx_ext.h" + +#define VXC_Vstore3(Pointer, Offset, Data) \ +do \ +{ int byteOffset = ((int)sizeof((Data)))*(Offset); \ +VXC_OP3_NoDest(vstore3, Pointer, byteOffset, Data); } \ +while(0) + +inline uchar* get_image2D_array_ptr(image2d_array_t input) +{ + int8 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + uchar *src_ptr = (uchar*)desc.s0; + return src_ptr; +} + +#define L2NORMSCALE_SWITCH_PROCESS(case_value, vec_val, ZpValue) \ + switch (case_value) \ + { \ + case 1: \ + vec_val.s123 = ZpValue; \ + vec_val.s4567 = ZpValue; \ + break; \ + case 2: \ + vec_val.s23 = ZpValue; \ + vec_val.s4567 = ZpValue; \ + break; \ + case 3: \ + vec_val.s3 = ZpValue; \ + vec_val.s4567 = ZpValue; \ + break; \ + case 4: \ + vec_val.s4567 = ZpValue; \ + break; \ + case 5: \ + vec_val.s567 = ZpValue; \ + break; \ + case 6: \ + vec_val.s67 = ZpValue; \ + break; \ + case 7: \ + vec_val.s7 = ZpValue; \ + break; \ + default: \ + break; \ + } + +#define L2NORMSCALE_REM_PROCESS(ZpValue) \ + VXC_Vload8(src0, src_ptr, 0); \ + VXC_Vload8(src1, src_ptr, 1); \ + if (inputRemain <= 8) \ + { \ + L2NORMSCALE_SWITCH_PROCESS(inputRemain, src0, ZpValue) \ + src1 = 0; \ + } \ + else if (inputRemain < 16) \ + { \ + int inputRemain8 = inputRemain - 8; \ + L2NORMSCALE_SWITCH_PROCESS(inputRemain8, src1, ZpValue) \ + } + + +#define L2NORMSCALE_MUL_PROCESS(index) \ + VXC_Vload8(src0, src_ptr, index); \ + _viv_asm(COPY, val0, src0, 16); \ + VXC_Vload8(scale_s16, scale_ptr, index); \ + _viv_asm(COPY, scale_f16, scale_s16, 16); \ + _viv_asm(COPY, input_ZP, inputZP, 4); \ + VXC_DP4x4(vec0, val0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\ + 
uniDataSubZPtoFp32Part0_4x4);\ + VXC_DP4x4(vec1, val0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\ + uniDataSubZPtoFp32Part1_4x4);\ + VXC_DP4x4(scale_f32, scale_f16, scale_f16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\ + uniFp16toFp32_4x4);\ + VXC_DP4x4(scale1_f32, scale_f16, scale_f16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\ + uniFp16toFp32Hi_4x4);\ + vec0 = vec0 * rsqrt0.xxxx + output_ZP;\ + vec1 = vec1 * rsqrt0.xxxx + output_ZP;\ + vec0 *= scale_f32;\ + vec1 *= scale1_f32;\ + _viv_asm(CONV_RTE, dst0, vec0);\ + _viv_asm(CONV_RTE, dst1, vec1);\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bin_2x8);\ + _viv_asm(COPY, dst, dst2, 16); + +_viv_uniform int inputWidth; +_viv_uniform int inputWidthRemain256; +_viv_uniform int inputWidthCount; +_viv_uniform VXC_512Bits uniSumSqrt_16x1; +_viv_uniform float r_inputScale; + +_viv_uniform VXC_512Bits uniDataSubZPtoFp32Part0_4x4; +_viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4; +_viv_uniform VXC_512Bits uniExtact8Bin_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32Hi_4x4; +_viv_uniform float IntergerScale; +_viv_uniform float output_ZP; +_viv_uniform int inputWidthRemain128; +_viv_uniform float zP2x; +_viv_uniform float zpSqrt16x; +_viv_uniform VXC_512Bits uniSumAll_16x1; +_viv_uniform int inputZP; + +#define L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \ + vxc_float4 rsqrt0;\ + dst_type *dst_ptr = (dst_type *)get_image2D_array_ptr(output); \ + short *scale_ptr = (short *)get_image2D_array_ptr(scale); \ + vxc_float4 vec0, vec1;\ + convert_type dst0, dst1;\ + vxc_short8 scale_s16;\ + vxc_half8 scale_f16;\ + vxc_float4 scale_f32, scale1_f32;\ + output_type dst2;\ + copy_type dst;\ + rsqrt0 = sum.xxxx * IntergerScale;\ + src_ptr = src_ptr_base + (get_global_id(0) + get_global_id(1) * inputWidth); \ + dst_ptr += (get_global_id(0) + get_global_id(1) * inputWidth);\ + scale_ptr += get_global_id(0);\ + for(int i = 0; i < inputWidthCount; i++)\ + {\ + L2NORMSCALE_MUL_PROCESS(0) \ + VXC_Vstore8(dst_ptr, 0, dst); \ + L2NORMSCALE_MUL_PROCESS(1) \ + VXC_Vstore8(dst_ptr, 1, dst); \ + src_ptr += 256; \ + dst_ptr += 256; \ + scale_ptr += 256; \ + }\ + if (inputWidthRemain256) \ + { \ + offset = get_global_id(0) + inputWidthCount * 128; \ + inputRemain = inputWidth - offset; \ + if (inputRemain >= 8) \ + { \ + L2NORMSCALE_MUL_PROCESS(0) \ + VXC_Vstore8(dst_ptr, 0, dst); \ + src_ptr += 8; \ + dst_ptr += 8; \ + scale_ptr += 8; \ + inputRemain -= 8; \ + } \ + if (inputRemain > 0) \ + { \ + L2NORMSCALE_MUL_PROCESS(0) \ + switch (inputRemain) \ + { \ + case 1: \ + dst_ptr[0] = dst.s0; \ + break; \ + case 2: \ + VXC_Vstore2(dst_ptr, 0, dst); \ + break; \ + case 3: \ + VXC_Vstore3(dst_ptr, 0, dst); \ + break; \ + case 4: \ + VXC_Vstore4(dst_ptr, 0, dst); \ + break; \ + case 5: \ + VXC_Vstore2(dst_ptr, 0, dst); \ + dst.s012 = dst.s234; \ + dst_ptr += 2; \ + VXC_Vstore3(dst_ptr, 0, dst); \ + break; \ + case 6: \ + VXC_Vstore3(dst_ptr, 0, dst); \ + dst.s012 = dst.s345; \ + dst_ptr += 3; \ + VXC_Vstore3(dst_ptr, 0, dst); \ + break; \ + case 7: \ + VXC_Vstore4(dst_ptr, 0, dst); \ + dst.s012 = dst.s456; \ + dst_ptr += 4; \ + VXC_Vstore3(dst_ptr, 0, dst); \ + break; \ + default: \ + VXC_Vstore8(dst_ptr, 0, dst); \ + break; \ + } \ + } \ + } \ + + +#define L2NORMSCALE_AXIS0_2D(in0_name, in1_name, out_name, read_type, read_type2, src_type, INPUTSCALE, \ + dst_type, convert_type, output_type, copy_type) \ +__kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) \ + void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \ + (\ + __read_only image2d_array_t input,\ + __read_only image2d_array_t scale,\ + __write_only image2d_array_t output,\ + int axis\ + )\ +{ \ + int lidx = get_local_id(0); \ + int offset = get_global_id(0); \ + read_type *src_ptr_base = (read_type *)get_image2D_array_ptr(input); \ + read_type *src_ptr; \ + read_type2 src0, src1; \ + src_type val0, val1; \ + int inputRemain; \ + vxc_float4 sum = {0.0f}; \ + read_type2 input_ZP ;\ + __local float lcl_sum[16]; \ + src_ptr = src_ptr_base + (get_global_id(0) + get_global_id(1) * inputWidth); \ + for (int i = 0; i < inputWidthCount; i++) \ + { \ + VXC_Vload8(src0, src_ptr, 0); \ + VXC_Vload8(src1, src_ptr, 1); \ + _viv_asm(COPY, val0, src0, 16); \ + _viv_asm(COPY, val1, src1, 16); \ + VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 1),\ + uniSumSqrt_16x1); \ + sum.x += sum.y; \ + src_ptr += 256; \ + } \ + if (inputWidthRemain256) \ + { \ + offset = get_global_id(0) + inputWidthCount * 256;\ + inputRemain = inputWidth - offset; \ + if (inputRemain > 0) \ + { \ + L2NORMSCALE_REM_PROCESS(0) \ + _viv_asm(COPY, val0, src0, 16); \ + _viv_asm(COPY, val1, src1, 16); \ + VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 1),\ + uniSumSqrt_16x1); \ + sum.x += sum.y; \ + } \ + } \ + lcl_sum[lidx] = sum.x; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \ + float4 one = (float4)(1, 1, 1, 1); \ + float4 data0; \ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \ + sum.x = dot(data0, one); \ + sum.x = rsqrt(sum.x) * INPUTSCALE; \ + L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \ +} + +L2NORMSCALE_AXIS0_2D(F16, F16, F16, ushort, vxc_ushort8, vxc_half8, 1, \ + ushort, half4, vxc_half8, vxc_ushort8) +L2NORMSCALE_AXIS0_2D(I16, F16, F16, short, vxc_short8, vxc_short8, r_inputScale, \ + ushort, half4, vxc_half8, vxc_ushort8) +L2NORMSCALE_AXIS0_2D(I16, F16, I16, short, vxc_short8, vxc_short8, r_inputScale, \ + short, int4, vxc_short8, vxc_short8) +L2NORMSCALE_AXIS0_2D(I8, F16, F16, char, vxc_char8, vxc_char8, r_inputScale, \ + ushort, half4, vxc_half8, vxc_ushort8) +L2NORMSCALE_AXIS0_2D(I8, F16, I8, char, vxc_char8, vxc_char8, r_inputScale, \ + char, int4, vxc_char8, vxc_char8) + + + +#define L2NORMSCALE_AXIS0_U8_2D(in1_name, out_name,\ + dst_type, convert_type, output_type, copy_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) \ + void l2normalizescale_axis0_U8_##in1_name##to##out_name##_2D \ + (\ + __read_only image2d_array_t input,\ + __read_only image2d_array_t scale,\ + __write_only image2d_array_t output,\ + int axis\ + )\ +{ \ + int lidx = get_local_id(0); \ + int offset = get_global_id(0); \ + uchar *src_ptr_base = (uchar *)get_image2D_array_ptr(input); \ + uchar *src_ptr; \ + vxc_uchar8 src0, src1; \ + vxc_uchar8 val0, val1; \ + int inputRemain; \ + vxc_float4 sum = {0.0f}; \ + vxc_uchar8 input_ZP ; \ + __local float lcl_sum[16]; \ + src_ptr = src_ptr_base + (get_global_id(0) + get_global_id(1) * inputWidth); \ + for (int i = 0; i < inputWidthCount; i++) \ + { \ + VXC_Vload8(src0, src_ptr, 0); \ + VXC_Vload8(src1, src_ptr, 1); \ + _viv_asm(COPY, val0, src0, 16); \ + _viv_asm(COPY, val1, src1, 16); \ + VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 1),\ + uniSumSqrt_16x1); \ + VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 1),\ + uniSumAll_16x1); \ + 
sum.w = sum.y - zP2x * sum.z + zpSqrt16x; \ + sum.x += sum.w; \ + src_ptr += 256; \ + } \ + if (inputWidthRemain256) \ + { \ + offset = get_global_id(0) + inputWidthCount * 256; \ + inputRemain = inputWidth - offset; \ + if (inputRemain > 0) \ + { \ + L2NORMSCALE_REM_PROCESS((uchar)inputZP) \ + _viv_asm(COPY, val0, src0, 16); \ + _viv_asm(COPY, val1, src1, 16); \ + VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 1),\ + uniSumSqrt_16x1); \ + VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 1),\ + uniSumAll_16x1); \ + sum.w = sum.y - zP2x * sum.z + zpSqrt16x; \ + sum.x += sum.w; \ + } \ + } \ + lcl_sum[lidx] = sum.x; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \ + float4 one = (float4)(1, 1, 1, 1); \ + float4 data0; \ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \ + sum.x = dot(data0, one); \ + sum.x = rsqrt(sum.x) * r_inputScale; \ + L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \ +} + +L2NORMSCALE_AXIS0_U8_2D(F16, F16, ushort, half4, vxc_half8, vxc_ushort8) +L2NORMSCALE_AXIS0_U8_2D(F16, U8, uchar, int4, vxc_uchar8, vxc_uchar8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis1.vx new file mode 100644 index 0000000..65daaed --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis1.vx @@ -0,0 +1,250 @@ +#include "cl_viv_vx_ext.h" + +/********************************************L2NormalizeScale*****************************************/ +_viv_uniform int L2NorS_depth; +_viv_uniform VXC_512Bits UniFp16MulLo_dp4x4; +_viv_uniform VXC_512Bits UniFp16MulHi_dp4x4; + +//int8 version +_viv_uniform float r_inputScale; +_viv_uniform VXC_512Bits uniIntegerSquareLo_4x4; +_viv_uniform VXC_512Bits uniIntegerSquareHi_4x4; +_viv_uniform VXC_512Bits uniDataSquareAddU32Lo_4x4; +_viv_uniform VXC_512Bits uniDataSquareAddU32Hi_4x4; + +_viv_uniform VXC_512Bits uniUInt8SquareLo_4x4; +_viv_uniform VXC_512Bits uniUInt8SquareHi_4x4; +_viv_uniform int inputZP; +_viv_uniform VXC_512Bits uniDataSubZPtoFp32Part0_4x4; +_viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4; +_viv_uniform VXC_512Bits uniExtact8Bin_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float IntergerScale; +_viv_uniform float output_ZP; + +#define L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \ + coord.y = get_global_id(1); \ + input_type vect0, vect1;\ + incopy_type src0, src1;\ + vxc_float4 rsqrt0, rsqrt1;\ + rsqrt0 = sum_lo;\ + rsqrt1 = sum_hi;\ + rsqrt0 *= IntergerScale;\ + rsqrt1 *= IntergerScale;\ + for(int i = 0; i < L2NorS_depth; i += 2)\ + {\ + vxc_float4 vec0, vec1;\ + input_type input_ZP ;\ + convert_type dst0, dst1;\ + VXC_ReadImage(vect0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ + _viv_asm(COPY, src0, vect0, 16); \ + VXC_ReadImage(vect1, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ + _viv_asm(COPY, src1, vect1, 16); \ + vxc_short8 scale_s16;\ + vxc_half8 scale_f16;\ + vxc_float4 scale_f32;\ + VXC_ReadImage(scale_s16, scale, coord.yw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\ + _viv_asm(COPY, scale_f16, scale_s16, 16); \ + _viv_asm(COPY, input_ZP, inputZP, 4); \ + VXC_DP4x4(vec0, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\ + uniDataSubZPtoFp32Part0_4x4);\ + VXC_DP4x4(vec1, src0, 
input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\ + uniDataSubZPtoFp32Part1_4x4);\ + VXC_DP4x4(scale_f32, scale_f16, scale_f16, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardInf, 0),\ + uniFp16toFp32_4x4);\ + vec0 = vec0 * rsqrt0;\ + vec1 = vec1 * rsqrt1;\ + vec0 = vec0 * scale_f32.xxxx + output_ZP;\ + vec1 = vec1 * scale_f32.xxxx + output_ZP;\ + _viv_asm(CONV_RTE, dst0, vec0);\ + _viv_asm(CONV_RTE, dst1, vec1);\ + output_type dst2;\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bin_2x8);\ + copy_type dst;\ + _viv_asm(COPY, dst, dst2, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ + VXC_DP4x4(vec0, src1, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\ + uniDataSubZPtoFp32Part0_4x4);\ + VXC_DP4x4(vec1, src1, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\ + uniDataSubZPtoFp32Part1_4x4);\ + vec0 = vec0 * rsqrt0;\ + vec1 = vec1 * rsqrt1;\ + vec0 = vec0 * scale_f32.yyyy + output_ZP;\ + vec1 = vec1 * scale_f32.yyyy + output_ZP;\ + _viv_asm(CONV_RTE, dst0, vec0);\ + _viv_asm(CONV_RTE, dst1, vec1);\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bin_2x8);\ + coord.y++;\ + _viv_asm(COPY, dst, dst2, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ + coord.y++;\ + }\ + + +#define L2NORMSCALE_AXIS1_F16_2D(in1_name, out_name,\ + input_type, incopy_type, output_type, convert_type, copy_type) \ +__kernel void l2normalizescale_axis1_F16_##in1_name##to##out_name##_2D \ + (\ + __read_only image2d_array_t input,\ + __read_only image2d_array_t scale,\ + __write_only image2d_array_t output,\ + int axis\ + )\ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 img1_s16, img2_s16; \ + vxc_float4 squr, sum_lo = 0, sum_hi = 0; \ + vxc_half8 img1_fp16, img2_fp16; \ + for(int i = 0; i < L2NorS_depth; i += 2) \ + { \ + VXC_ReadImage(img1_s16, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(img2_s16, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y += 2; \ + _viv_asm(COPY, img1_fp16, img1_s16, 16); \ + _viv_asm(COPY, img2_fp16, img2_s16, 16); \ + VXC_DP4x4(squr, img1_fp16, img1_fp16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1),\ + UniFp16MulLo_dp4x4); \ + sum_lo += squr; \ + VXC_DP4x4(squr, img2_fp16, img2_fp16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1),\ + UniFp16MulLo_dp4x4); \ + sum_lo += squr; \ + VXC_DP4x4(squr, img1_fp16, img1_fp16, VXC_MODIFIER(0, 3, 4, VXC_RM_TowardZero, 1),\ + UniFp16MulHi_dp4x4); \ + sum_hi += squr; \ + VXC_DP4x4(squr, img2_fp16, img2_fp16, VXC_MODIFIER(0, 3, 4, VXC_RM_TowardZero, 1),\ + UniFp16MulHi_dp4x4); \ + sum_hi += squr; \ + } \ + sum_lo = rsqrt(sum_lo); \ + sum_hi = rsqrt(sum_hi); \ + L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \ +} + +L2NORMSCALE_AXIS1_F16_2D(F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8) + + +#define L2NORMSCALE_AXIS1_I8_2D(in1_name, out_name,\ + input_type, incopy_type, output_type, convert_type, copy_type) \ +__kernel void l2normalizescale_axis1_I8_##in1_name##to##out_name##_2D \ + (\ + __read_only image2d_array_t input,\ + __read_only image2d_array_t scale,\ + __write_only image2d_array_t output,\ + int axis\ + )\ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_char8 src0_I8, src1_I8; \ + vxc_uint4 dst0_I8 = 0, 
dst1_I8 = 0; \ + for(int i = 0; i < L2NorS_depth; i += 2) \ + { \ + VXC_ReadImage(src0_I8, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1_I8, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y += 2; \ + VXC_DP4x4(dst0_I8, src0_I8, dst0_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataSquareAddU32Lo_4x4); \ + VXC_DP4x4(dst1_I8, src0_I8, dst1_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataSquareAddU32Hi_4x4); \ + VXC_DP4x4(dst0_I8, src1_I8, dst0_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataSquareAddU32Lo_4x4); \ + VXC_DP4x4(dst1_I8, src1_I8, dst1_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataSquareAddU32Hi_4x4); \ + } \ + vxc_float4 sum_lo, sum_hi; \ + sum_lo = convert_float4(dst0_I8); \ + sum_hi = convert_float4(dst1_I8); \ + sum_lo = rsqrt(sum_lo) * r_inputScale; \ + sum_hi = rsqrt(sum_hi) * r_inputScale; \ + L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \ +} + +L2NORMSCALE_AXIS1_I8_2D(F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16) +L2NORMSCALE_AXIS1_I8_2D(F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8) + + +#define L2NORMSCALE_AXIS1_I16_2D(in1_name, out_name,\ + input_type, incopy_type, output_type, convert_type, copy_type) \ +__kernel void l2normalizescale_axis1_I16_##in1_name##to##out_name##_2D \ + (\ + __read_only image2d_array_t input,\ + __read_only image2d_array_t scale,\ + __write_only image2d_array_t output,\ + int axis\ + )\ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 src0_I16, src1_I16; \ + vxc_float4 squr, sum_lo = 0, sum_hi = 0; \ + for(int i = 0; i < L2NorS_depth; i += 2) \ + { \ + VXC_ReadImage(src0_I16, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1_I16, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y += 2; \ + VXC_DP4x4(squr, src0_I16, src0_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniIntegerSquareLo_4x4); \ + sum_lo = squr + sum_lo; \ + VXC_DP4x4(squr, src0_I16, src0_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniIntegerSquareHi_4x4); \ + sum_hi = squr + sum_hi; \ + VXC_DP4x4(squr, src1_I16, src1_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniIntegerSquareLo_4x4); \ + sum_lo = squr + sum_lo; \ + VXC_DP4x4(squr, src1_I16, src1_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniIntegerSquareHi_4x4); \ + sum_hi = squr + sum_hi; \ + } \ + sum_lo = rsqrt(sum_lo) * r_inputScale; \ + sum_hi = rsqrt(sum_hi) * r_inputScale; \ + L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \ +} + +L2NORMSCALE_AXIS1_I16_2D(F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8) +L2NORMSCALE_AXIS1_I16_2D(F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8) + +#define L2NORMSCALE_AXIS1_U8_2D(in1_name, out_name,\ + input_type, incopy_type, output_type, convert_type, copy_type) \ +__kernel void l2normalizescale_axis1_U8_##in1_name##to##out_name##_2D \ + (\ + __read_only image2d_array_t input,\ + __read_only image2d_array_t scale,\ + __write_only image2d_array_t output,\ + int axis\ + )\ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_uchar8 src0_U8, src1_U8; \ + vxc_float4 squr, sum_lo = 0, sum_hi = 0; \ + for(int i = 0; i < L2NorS_depth; i += 
2) \ + { \ + vxc_uchar8 zero; \ + VXC_ReadImage(src0_U8, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1_U8, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y += 2; \ + _viv_asm(COPY, zero, inputZP, 4); \ + VXC_DP4x4(squr, src0_U8, zero, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniUInt8SquareLo_4x4); \ + sum_lo = squr + sum_lo; \ + VXC_DP4x4(squr, src0_U8, zero, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniUInt8SquareHi_4x4); \ + sum_hi = squr + sum_hi; \ + VXC_DP4x4(squr, src1_U8, zero, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniUInt8SquareLo_4x4); \ + sum_lo = squr + sum_lo; \ + VXC_DP4x4(squr, src1_U8, zero, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniUInt8SquareHi_4x4); \ + sum_hi = squr + sum_hi; \ + } \ + sum_lo = rsqrt(sum_lo) * r_inputScale; \ + sum_hi = rsqrt(sum_hi) * r_inputScale; \ + L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \ +} + +L2NORMSCALE_AXIS1_U8_2D(F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8) +L2NORMSCALE_AXIS1_U8_2D(F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_axis0.vx new file mode 100644 index 0000000..c3d6653 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_axis0.vx @@ -0,0 +1,239 @@ +#include "cl_viv_vx_ext.h" +_viv_uniform float rlogE; +_viv_uniform int axisSize; +_viv_uniform float betaValue; +_viv_uniform float scaleLogE; +_viv_uniform float outputScale; +_viv_uniform float output_offset_asymmetric; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform int inputWidth; +_viv_uniform int inputWidthRemain4; +_viv_uniform VXC_512Bits uniGetSubData0to3_4x4; +_viv_uniform VXC_512Bits uniGetSubData4to7_4x4; +_viv_uniform VXC_512Bits uniPackMaxData_2x8; + +#define LOGSOFTMAX_PROCESS_AXIS0(read_fun, vert_max_fun, horz_max_fun) \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, val, val0, 16); \ + coord.x += 8; \ + do \ + { \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val1, val1, 16); \ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val2, val2, 16); \ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val3, val3, 16); \ + coord.x += 32; \ + vert_max_fun(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + vert_max_fun(val, img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + while(coord.x < (axisSize + 16)); \ + horz_max_fun(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \ + horz_max_fun(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_float4 prob; \ + float fProbSum = 0; \ + const float4 one4 = (float4)(1.0, 1.0, 1.0, 1.0); \ + int idx = 0; \ + for (coord.x = 0; coord.x < inputWidth; idx ++) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + 
_viv_asm(COPY, img_val0, val0, 16); \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \ + prob *= scaleLogE; \ + prob = exp2(prob); \ + fProbSum += dot(prob, one4); \ + coord.x += 4; \ + } \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \ + prob *= scaleLogE; \ + if(inputWidthRemain4 == 1) \ + { \ + prob.x = exp2(prob.x); \ + prob.yzw = 0; \ + fProbSum += dot(prob, one4); \ + } \ + else if(inputWidthRemain4 == 2) \ + { \ + prob.x = exp2(prob.x); \ + prob.y = exp2(prob.y); \ + prob.zw = 0; \ + fProbSum += dot(prob, one4); \ + } \ + else if(inputWidthRemain4 == 3) \ + { \ + prob.x = exp2(prob.x); \ + prob.y = exp2(prob.y); \ + prob.z = exp2(prob.z); \ + prob.w = 0; \ + fProbSum += dot(prob, one4); \ + } \ + vxc_float4 probSum_log; \ + probSum_log.x = log2(fProbSum) * rlogE; + +#define LOGSOFTMAX_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \ + for (coord.x = 0; coord.x < axisSize; ) \ + { \ + dst_type vec0, vec1; \ + save_type dst; \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \ + prob = prob * betaValue - probSum_log.xxxx; \ + prob = prob * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, vec0, prob); \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData4to7_4x4); \ + prob = prob * betaValue - probSum_log.xxxx; \ + prob = prob * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, vec1, prob); \ + VXC_DP2x8(dst, vec0, vec1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + } + +#define LOGSOFTMAX_AXIS0(src_name, dst_name, src_type, copy_type, dst_type,\ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun, horz_max_fun) \ +__kernel void log_softmax_axis0_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + LOGSOFTMAX_PROCESS_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun) \ + LOGSOFTMAX_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \ +} + +LOGSOFTMAX_AXIS0(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_AXIS0(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_AXIS0(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_AXIS0(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ + CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_AXIS0(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_AXIS0(I16, F16, vxc_short8, vxc_short8, half4, 
vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_AXIS0(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_AXIS0(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8,\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_AXIS0(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) + +#define LOGSOFTMAX_AXIS0_2D(src_name, dst_name, src_type, copy_type,\ + dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun, horz_max_fun) \ +__kernel void log_softmax_axis0_##src_name##to##dst_name##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(16, get_global_id(0)); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + LOGSOFTMAX_PROCESS_AXIS0(VXC_ReadImage, vert_max_fun, horz_max_fun) \ + LOGSOFTMAX_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage, VXC_WriteImage); \ +} + +LOGSOFTMAX_AXIS0_2D(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_AXIS0_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_AXIS0_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, \ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_AXIS0_2D(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_AXIS0_2D(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_AXIS0_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_AXIS0_2D(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_AXIS0_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8,\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_AXIS0_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) + +#define LOGSOFTMAX_PROCESS_AXIS0_TOF32_SAVE(read_fun) \ + for (coord.x = 0; coord.x < axisSize; ) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \ + prob = prob * betaValue - probSum_log.xxxx; \ + write_imagef(output, coord, prob); \ + coord.x += 4; \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData4to7_4x4); \ + prob = prob * betaValue - probSum_log.xxxx; \ + write_imagef(output, coord, prob); \ + coord.x += 4; \ + } + +#define LOGSOFTMAX_AXIS0_TOF32(src_name, src_type, copy_type, vert_max_fun, horz_max_fun) \ 
+__kernel void log_softmax_axis0_##src_name##toF32 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + LOGSOFTMAX_PROCESS_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun) \ + LOGSOFTMAX_PROCESS_AXIS0_TOF32_SAVE(VXC_ReadImage2DArray) \ +} + +LOGSOFTMAX_AXIS0_TOF32(F16, vxc_half8, vxc_short8, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_AXIS0_TOF32(I16, vxc_short8, vxc_short8, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_AXIS0_TOF32(I8, vxc_char16, vxc_char16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_AXIS0_TOF32(U8, vxc_uchar16, vxc_uchar16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) + +#define LOGSOFTMAX_AXIS0_TOF32_2D(src_name, src_type, copy_type, vert_max_fun, horz_max_fun) \ +__kernel void log_softmax_axis0_##src_name##toF32_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(16, get_global_id(0)); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + LOGSOFTMAX_PROCESS_AXIS0(VXC_ReadImage, vert_max_fun, horz_max_fun) \ + LOGSOFTMAX_PROCESS_AXIS0_TOF32_SAVE(VXC_ReadImage) \ +} + +LOGSOFTMAX_AXIS0_TOF32_2D(F16, vxc_half8, vxc_short8, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_AXIS0_TOF32_2D(I16, vxc_short8, vxc_short8, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_AXIS0_TOF32_2D(I8, vxc_char16, vxc_char16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_AXIS0_TOF32_2D(U8, vxc_uchar16, vxc_uchar16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_axis0_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_axis0_BF16.vx new file mode 100644 index 0000000..dc652a4 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_axis0_BF16.vx @@ -0,0 +1,200 @@ +#include "cl_viv_vx_ext.h" +_viv_uniform float rlogE; +_viv_uniform int axisSize; +_viv_uniform float betaValue; +_viv_uniform float scaleLogE; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; + +_viv_uniform int inputWidth; +_viv_uniform int inputWidthRemain4; +_viv_uniform VXC_512Bits uniPackMaxData_2x8; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; + + +#define LOGSOFTMAX_PROCESS_AXIS0_BF16(read_fun) \ + vxc_half8 img_val0, img_val1, img_val2, img_val3; \ + vxc_short8 val0, val1, val2, val3; \ + vxc_half8 val; \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, val, val0, 16); \ + coord.x += 8; \ + do \ + { \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val1, val1, 16); \ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val2, val2, 16); \ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val3, val3, 16); \ + coord.x += 32; \ + VXC_VertMax3_Half(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_VertMax3_Half(val, 
img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + while(coord.x < (axisSize + 16)); \ + VXC_HorzMax3_Half(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \ + VXC_HorzMax3_Half(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + vxc_ushort8 bf_val_tmp; \ + vxc_float4 vecA; \ + _viv_asm(COPY, bf_val_tmp, val, 16); \ + VXC_DP2x8(bf_val_tmp, bf_val_tmp, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecA, bf_val_tmp, 16); \ + vxc_float4 prob; \ + float fProbSum = 0; \ + const float4 one4 = (float4)(1.0, 1.0, 1.0, 1.0); \ + float max_value = vecA.x * scaleLogE; \ + float max_value_orig = vecA.x; \ + for (coord.x = 0; coord.x < inputWidth; ) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(bf_val_tmp, val0, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, prob, bf_val_tmp, 16); \ + prob = prob * scaleLogE - max_value; \ + prob = exp2(prob); \ + fProbSum += dot(prob, one4); \ + coord.x += 4; \ + } \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(bf_val_tmp, val0, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, prob, bf_val_tmp, 16); \ + prob = prob * scaleLogE - max_value; \ + if(inputWidthRemain4 == 1) \ + { \ + prob.x = exp2(prob.x); \ + prob.yzw = 0; \ + fProbSum += dot(prob, one4); \ + } \ + else if(inputWidthRemain4 == 2) \ + { \ + prob.x = exp2(prob.x); \ + prob.y = exp2(prob.y); \ + prob.zw = 0; \ + fProbSum += dot(prob, one4); \ + } \ + else if(inputWidthRemain4 == 3) \ + { \ + prob.x = exp2(prob.x); \ + prob.y = exp2(prob.y); \ + prob.z = exp2(prob.z); \ + prob.w = 0; \ + fProbSum += dot(prob, one4); \ + } \ + vxc_float4 probSum_log; \ + probSum_log.x = log2(fProbSum) * rlogE; + +#define LOGSOFTMAX_PROCESS_AXIS0_BF16TOBF16_SAVE(read_fun, write_fun) \ + for (coord.x = 0; coord.x < axisSize; ) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(bf_val_tmp, val0, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, prob, bf_val_tmp, 16); \ + prob = prob - max_value_orig; \ + prob = prob * betaValue - probSum_log.xxxx; \ + vxc_ushort8 tmp, dst; \ + _viv_asm(COPY, tmp, prob, 16); \ + dst.s0123 = tmp.s1357; \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 4; \ + } + +#define LOGSOFTMAX_PROCESS_AXIS0_BF16TOF16_SAVE(read_fun, write_fun) \ + for (coord.x = 0; coord.x < axisSize; ) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(bf_val_tmp, val0, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, prob, bf_val_tmp, 16); \ + prob = prob - max_value_orig; \ + prob = prob * betaValue - probSum_log.xxxx; \ + half4 vec; \ + vxc_half4 tmp; \ + vxc_short4 dst; \ + _viv_asm(CONV, vec, prob); \ + VXC_DP4x4(tmp, vec, vec, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, dst, tmp, 8); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 4; \ + } + +#define LOGSOFTMAX_PROCESS_AXIS0_BF16TOF32_SAVE(read_fun) \ + for (coord.x = 0; 
coord.x < axisSize; ) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(bf_val_tmp, val0, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, prob, bf_val_tmp, 16); \ + prob = prob - max_value_orig; \ + prob = prob * betaValue - probSum_log.xxxx; \ + write_imagef(output, coord, prob); \ + coord.x += 4; \ + } + +__kernel void log_softmax_axis0_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0); + LOGSOFTMAX_PROCESS_AXIS0_BF16(VXC_ReadImage2DArray) + LOGSOFTMAX_PROCESS_AXIS0_BF16TOBF16_SAVE(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} +__kernel void log_softmax_axis0_BF16toF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0); + LOGSOFTMAX_PROCESS_AXIS0_BF16(VXC_ReadImage2DArray) + LOGSOFTMAX_PROCESS_AXIS0_BF16TOF16_SAVE(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} +__kernel void log_softmax_axis0_BF16toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0); + LOGSOFTMAX_PROCESS_AXIS0_BF16(VXC_ReadImage2DArray) + LOGSOFTMAX_PROCESS_AXIS0_BF16TOF32_SAVE(VXC_ReadImage2DArray) +} +__kernel void log_softmax_axis0_BF16toBF16_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int2 coord = (int2)(16, get_global_id(0)); + LOGSOFTMAX_PROCESS_AXIS0_BF16(VXC_ReadImage) + LOGSOFTMAX_PROCESS_AXIS0_BF16TOBF16_SAVE(VXC_ReadImage, VXC_WriteImage) +} +__kernel void log_softmax_axis0_BF16toF16_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int2 coord = (int2)(16, get_global_id(0)); + LOGSOFTMAX_PROCESS_AXIS0_BF16(VXC_ReadImage) + LOGSOFTMAX_PROCESS_AXIS0_BF16TOF16_SAVE(VXC_ReadImage, VXC_WriteImage) +} +__kernel void log_softmax_axis0_BF16toF32_2D( + __read_only image2d_array_t input, + __write_only image2d_t output, + float input_Scale, + int axisVal ) +{ + int2 coord = (int2)(16, get_global_id(0)); + LOGSOFTMAX_PROCESS_AXIS0_BF16(VXC_ReadImage) + LOGSOFTMAX_PROCESS_AXIS0_BF16TOF32_SAVE(VXC_ReadImage) +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_axis1.vx new file mode 100644 index 0000000..454693a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_axis1.vx @@ -0,0 +1,232 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float rlogE; +_viv_uniform int axisSize; +_viv_uniform float betaValue; +_viv_uniform float scaleLogE; +_viv_uniform float outputScale; +_viv_uniform float output_offset_asymmetric; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniGetSubLoData_4x4; +_viv_uniform VXC_512Bits uniGetSubHiData_4x4; + +#define LOGSOFTMAX_PROCESS_AXIS1(read_fun, vert_max_fun) \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, max, in0, 16); \ + coord.y++; \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + vert_max_fun(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ 
+ } \ + while(coord.y < axisSize); \ + coord.y = 0; \ + sum0 = 0; \ + sum1 = 0; \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \ + data0 *= scaleLogE; \ + data0 = exp2(data0); \ + sum0 += data0; \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \ + data0 *= scaleLogE; \ + data0 = exp2(data0); \ + sum1 += data0; \ + coord.y++; \ + } \ + while(coord.y < axisSize); \ + sum0 = log2(sum0) * rlogE; \ + sum1 = log2(sum1) * rlogE; + +#define LOGSOFTMAX_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \ + coord.y = 0; \ + dst_type dst0, dst1; \ + save_type vect; \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \ + data0 = data0 * betaValue - sum0; \ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst0, data0); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \ + data0 = data0 * betaValue - sum1; \ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst1, data0); \ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + } \ + while(coord.y < axisSize); + +#define LOGSOFTMAX_AXIS1(src_name, dst_name, src_type, copy_type, dst_type,\ +save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun) \ +__kernel void log_softmax_axis1_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + src_type vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + vxc_float4 sum0, sum1; \ + LOGSOFTMAX_PROCESS_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \ + LOGSOFTMAX_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \ +} + + + +LOGSOFTMAX_AXIS1(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Half) +LOGSOFTMAX_AXIS1(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half) +LOGSOFTMAX_AXIS1(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half) +LOGSOFTMAX_AXIS1(F16, U8, vxc_half8, vxc_short8, uchar4,\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half) + +LOGSOFTMAX_AXIS1(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer) +LOGSOFTMAX_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer) + +LOGSOFTMAX_AXIS1(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer) +LOGSOFTMAX_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer) + +LOGSOFTMAX_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, uchar4,\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer) +LOGSOFTMAX_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8, CONV, 1, 0, 
VXC_VertMax3_Integer) + +#define LOGSOFTMAX_AXIS1_2D(src_name, dst_name, src_type,\ +copy_type, dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun) \ +__kernel void log_softmax_axis1_##src_name##to##dst_name##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), 0); \ + src_type vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + vxc_float4 sum0, sum1; \ + LOGSOFTMAX_PROCESS_AXIS1(VXC_ReadImage, vert_max_fun) \ + LOGSOFTMAX_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage, VXC_WriteImage); \ +} + +LOGSOFTMAX_AXIS1_2D(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Half) +LOGSOFTMAX_AXIS1_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half) +LOGSOFTMAX_AXIS1_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half) +LOGSOFTMAX_AXIS1_2D(F16, U8, vxc_half8, vxc_short8, uchar4,\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half) + +LOGSOFTMAX_AXIS1_2D(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer) +LOGSOFTMAX_AXIS1_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer) + +LOGSOFTMAX_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8, \ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer) +LOGSOFTMAX_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8, CONV, 1, 0, VXC_VertMax3_Integer) + +LOGSOFTMAX_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4,\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer) +LOGSOFTMAX_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8, CONV, 1, 0, VXC_VertMax3_Integer) + + +#define LOGSOFTMAX_AXIS1_TOF32(src_name, src_type, copy_type, vert_max_fun) \ +__kernel void log_softmax_axis1_##src_name##toF32 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + src_type vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + vxc_float4 sum0, sum1; \ + LOGSOFTMAX_PROCESS_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \ + coord.y = 0; \ + do \ + { \ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \ + data0 = data0 * betaValue - sum0; \ + write_imagef(output, coord, data0); \ + coord.x += 4; \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \ + data0 = data0 * betaValue - sum1; \ + write_imagef(output, coord, data0); \ + coord.x -= 4; \ + coord.y++; \ + } \ + while(coord.y < axisSize); \ +} + +LOGSOFTMAX_AXIS1_TOF32(F16, vxc_half8, vxc_short8, VXC_VertMax3_Half) +LOGSOFTMAX_AXIS1_TOF32(I16, vxc_short8, vxc_short8, VXC_VertMax3_Integer) +LOGSOFTMAX_AXIS1_TOF32(I8, vxc_char16, vxc_char16, VXC_VertMax3_Integer) +LOGSOFTMAX_AXIS1_TOF32(U8, vxc_uchar16, vxc_uchar16, VXC_VertMax3_Integer) + +#define LOGSOFTMAX_AXIS1_TOF32_2D(src_name, src_type, copy_type, vert_max_fun) \ +__kernel void log_softmax_axis1_##src_name##toF32_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_t output, \ + 
float input_Scale, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), 0); \ + src_type vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + vxc_float4 sum0, sum1; \ + LOGSOFTMAX_PROCESS_AXIS1(VXC_ReadImage, vert_max_fun) \ + coord.y = 0; \ + do \ + { \ + VXC_ReadImage(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \ + data0 = data0 * betaValue - sum0; \ + write_imagef(output, coord, data0); \ + coord.x += 4; \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \ + data0 = data0 * betaValue - sum1; \ + write_imagef(output, coord, data0); \ + coord.x -= 4; \ + coord.y++; \ + } \ + while(coord.y < axisSize); \ +} + +LOGSOFTMAX_AXIS1_TOF32_2D(F16, vxc_half8, vxc_short8, VXC_VertMax3_Half) +LOGSOFTMAX_AXIS1_TOF32_2D(I16, vxc_short8, vxc_short8, VXC_VertMax3_Integer) +LOGSOFTMAX_AXIS1_TOF32_2D(I8, vxc_char16, vxc_char16, VXC_VertMax3_Integer) +LOGSOFTMAX_AXIS1_TOF32_2D(U8, vxc_uchar16, vxc_uchar16, VXC_VertMax3_Integer) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_axis1_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_axis1_BF16.vx new file mode 100644 index 0000000..0464380 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_axis1_BF16.vx @@ -0,0 +1,286 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float rlogE; +_viv_uniform int axisSize; +_viv_uniform float betaValue; +_viv_uniform float scaleLogE; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; + +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +#define LOGSOFTMAX_PROCESS_AXIS1_BF16(read_fun) \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, max, in0, 16); \ + coord.y++; \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_VertMax3_Half(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + } \ + while(coord.y < axisSize); \ + _viv_asm(COPY, tmp0, max, 16); \ + VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, max_lo, tmp1, 16); \ + VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, max_hi, tmp1, 16); \ + coord.y = 0; \ + sum0 = 0; \ + sum1 = 0; \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data0, tmp1, 16); \ + data0 = data0 - max_lo; \ + data0 *= scaleLogE; \ + sum0 += exp2(data0); \ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, data0, tmp1, 16); \ + data0 = data0 - max_hi; \ + data0 *= scaleLogE; \ + sum1 += exp2(data0); \ + coord.y++; \ + } \ + while (coord.y < axisSize); \ + sum0 = log2(sum0) * rlogE; \ + sum1 = log2(sum1) * rlogE; + +__kernel void log_softmax_axis1_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + vxc_short8 in0; + vxc_half8 vec0, max; + vxc_float4 data0; + 
vxc_float4 sum0, sum1; + vxc_float4 max_lo, max_hi; + vxc_ushort8 tmp0, tmp1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + LOGSOFTMAX_PROCESS_AXIS1_BF16(VXC_ReadImage2DArray) + + coord.y = 0; + vxc_ushort8 dst0, dst1, dst; + do + { + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_lo; + data0 = data0 * betaValue - sum0; + _viv_asm(COPY, dst0, data0, 16); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_hi; + data0 = data0 * betaValue - sum1; + _viv_asm(COPY, dst1, data0, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + } + while(coord.y < axisSize); +} + +__kernel void log_softmax_axis1_BF16toF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + vxc_short8 in0; + vxc_half8 vec0, max; + vxc_float4 data0; + vxc_float4 sum0, sum1; + vxc_float4 max_lo, max_hi; + vxc_ushort8 tmp0, tmp1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + LOGSOFTMAX_PROCESS_AXIS1_BF16(VXC_ReadImage2DArray) + + coord.y = 0; + half4 dst0, dst1; + do + { + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_lo; + data0 = data0 * betaValue - sum0; + _viv_asm(CONV, dst0, data0); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_hi; + data0 = data0 * betaValue - sum1; + _viv_asm(CONV, dst1, data0); + VXC_DP2x8(vec0, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); + vxc_short8 vect; + _viv_asm(COPY, vect, vec0, 16); + VXC_WriteImage2DArray(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + } + while(coord.y < axisSize); +} + +__kernel void log_softmax_axis1_BF16toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + vxc_short8 in0; + vxc_half8 vec0, max; + vxc_float4 data0; + vxc_float4 sum0, sum1; + vxc_float4 max_lo, max_hi; + vxc_ushort8 tmp0, tmp1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + LOGSOFTMAX_PROCESS_AXIS1_BF16(VXC_ReadImage2DArray) + + coord.y = 0; + do + { + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_lo; + data0 = data0 * betaValue - sum0; + write_imagef(output, coord, data0); + coord.x += 4; + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_hi; + data0 = data0 * betaValue - sum1; + write_imagef(output, coord, data0); + coord.x -= 4; + coord.y++; + } + while 
(coord.y < axisSize); +} + +__kernel void log_softmax_axis1_BF16toBF16_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int2 coord = (int2)(get_global_id(0), 0); + vxc_short8 in0; + vxc_half8 vec0, max; + vxc_float4 data0; + vxc_float4 sum0, sum1; + vxc_float4 max_lo, max_hi; + vxc_ushort8 tmp0, tmp1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + LOGSOFTMAX_PROCESS_AXIS1_BF16(VXC_ReadImage) + + coord.y = 0; + vxc_ushort8 dst0, dst1, dst; + do + { + VXC_ReadImage(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_lo; + data0 = data0 * betaValue - sum0; + _viv_asm(COPY, dst0, data0, 16); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_hi; + data0 = data0 * betaValue - sum1; + _viv_asm(COPY, dst1, data0, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + } + while(coord.y < axisSize); +} + +__kernel void log_softmax_axis1_BF16toF16_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int2 coord = (int2)(get_global_id(0), 0); + vxc_short8 in0; + vxc_half8 vec0, max; + vxc_float4 data0; + vxc_float4 sum0, sum1; + vxc_float4 max_lo, max_hi; + vxc_ushort8 tmp0, tmp1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + LOGSOFTMAX_PROCESS_AXIS1_BF16(VXC_ReadImage) + + coord.y = 0; + half4 dst0, dst1; + do + { + VXC_ReadImage(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_lo; + data0 = data0 * betaValue - sum0; + _viv_asm(CONV, dst0, data0); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_hi; + data0 = data0 * betaValue - sum1; + _viv_asm(CONV, dst1, data0); + VXC_DP2x8(vec0, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); + vxc_short8 vect; + _viv_asm(COPY, vect, vec0, 16); + VXC_WriteImage(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + } + while(coord.y < axisSize); +} + +__kernel void log_softmax_axis1_BF16toF32_2D( + __read_only image2d_array_t input, + __write_only image2d_t output, + float input_Scale, + int axisVal ) +{ + int2 coord = (int2)(get_global_id(0), 0); + vxc_short8 in0; + vxc_half8 vec0, max; + vxc_float4 data0; + vxc_float4 sum0, sum1; + vxc_float4 max_lo, max_hi; + vxc_ushort8 tmp0, tmp1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + LOGSOFTMAX_PROCESS_AXIS1_BF16(VXC_ReadImage) + + coord.y = 0; + do + { + VXC_ReadImage(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_lo; + data0 = data0 * betaValue - sum0; + write_imagef(output, coord, data0); + coord.x += 4; + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), 
uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_hi; + data0 = data0 * betaValue - sum1; + write_imagef(output, coord, data0); + coord.x -= 4; + coord.y++; + } + while (coord.y < axisSize); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_axis2.vx new file mode 100644 index 0000000..0e01a4f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_axis2.vx @@ -0,0 +1,312 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float rlogE; +_viv_uniform int axisSize; +_viv_uniform float betaValue; +_viv_uniform float scaleLogE; +_viv_uniform float outputScale; +_viv_uniform float output_offset_asymmetric; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; + +_viv_uniform VXC_512Bits uniGetSubLoData_4x4; +_viv_uniform VXC_512Bits uniGetSubHiData_4x4; +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +#define LOGSOFTMAX_PROCESS_AXIS2(read_fun, vert_max_fun) \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, max, in0, 16); \ + coord.z++; \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + vert_max_fun(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z++; \ + } \ + while(coord.z < axisSize); \ + coord.z = 0; \ + sum0 = 0; \ + sum1 = 0; \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \ + data0 *= scaleLogE; \ + data0 = exp2(data0); \ + sum0 += data0; \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \ + data0 *= scaleLogE; \ + data0 = exp2(data0); \ + sum1 += data0; \ + coord.z++; \ + } \ + while(coord.z < axisSize); \ + sum0 = log2(sum0) * rlogE; \ + sum1 = log2(sum1) * rlogE; + +#define LOGSOFTMAX_PROCESS_AXIS2_SAVE(dst_type, save_type,\ +conv_mode, OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \ + coord.z = 0; \ + dst_type dst0, dst1; \ + save_type vect; \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \ + data0 = data0 * betaValue - sum0; \ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst0, data0); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \ + data0 = data0 * betaValue - sum1; \ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst1, data0); \ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z++; \ + } \ + while(coord.z < axisSize); + +#define LOGSOFTMAX_AXIS2(src_name, dst_name, src_type, copy_type,\ +dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun) \ +__kernel void log_softmax_axis2_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 
0); \ + src_type vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + vxc_float4 sum0, sum1; \ + LOGSOFTMAX_PROCESS_AXIS2(VXC_ReadImage2DArray, vert_max_fun) \ + LOGSOFTMAX_PROCESS_AXIS2_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \ +} + +LOGSOFTMAX_AXIS2(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Half) +LOGSOFTMAX_AXIS2(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half) +LOGSOFTMAX_AXIS2(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half) +LOGSOFTMAX_AXIS2(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half) + +LOGSOFTMAX_AXIS2(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer) +LOGSOFTMAX_AXIS2(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer) + +LOGSOFTMAX_AXIS2(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer) +LOGSOFTMAX_AXIS2(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer) + +LOGSOFTMAX_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8,\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer) +LOGSOFTMAX_AXIS2(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer) + + +#define LOGSOFTMAX_AXIS2_TOF32(src_name, src_type, copy_type, vert_max_fun) \ +__kernel void log_softmax_axis2_##src_name##toF32 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + src_type vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + vxc_float4 sum0, sum1; \ + LOGSOFTMAX_PROCESS_AXIS2(VXC_ReadImage2DArray, vert_max_fun) \ + coord.z = 0; \ + do \ + { \ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \ + data0 = data0 * betaValue - sum0; \ + write_imagef(output, coord, data0); \ + coord.x += 4; \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \ + data0 = data0 * betaValue - sum1; \ + write_imagef(output, coord, data0); \ + coord.x -= 4; \ + coord.z++; \ + } \ + while(coord.z < axisSize); \ +} + +LOGSOFTMAX_AXIS2_TOF32(F16, vxc_half8, vxc_short8, VXC_VertMax3_Half) +LOGSOFTMAX_AXIS2_TOF32(I16, vxc_short8, vxc_short8, VXC_VertMax3_Integer) +LOGSOFTMAX_AXIS2_TOF32(I8, vxc_char16, vxc_char16, VXC_VertMax3_Integer) +LOGSOFTMAX_AXIS2_TOF32(U8, vxc_uchar16, vxc_uchar16, VXC_VertMax3_Integer) + +#define LOGSOFTMAX_PROCESS_AXIS2_BF16(read_fun) \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, max, in0, 16); \ + coord.z++; \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_VertMax3_Half(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z++; \ + } \ + while(coord.z < axisSize); \ + _viv_asm(COPY, tmp0, max, 16); \ + VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, max_lo, tmp1, 16); \ + 
VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, max_hi, tmp1, 16); \ + coord.z = 0; \ + sum0 = 0; \ + sum1 = 0; \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data0, tmp1, 16); \ + data0 = data0 - max_lo; \ + data0 *= scaleLogE; \ + sum0 += exp2(data0); \ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, data0, tmp1, 16); \ + data0 = data0 - max_hi; \ + data0 *= scaleLogE; \ + sum1 += exp2(data0); \ + coord.z++; \ + } \ + while (coord.z < axisSize); \ + sum0 = log2(sum0) * rlogE; \ + sum1 = log2(sum1) * rlogE; + +__kernel void log_softmax_axis2_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + vxc_short8 in0; + vxc_half8 vec0, max; + vxc_float4 data0; + vxc_float4 sum0, sum1; + vxc_float4 max_lo, max_hi; + vxc_ushort8 tmp0, tmp1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + LOGSOFTMAX_PROCESS_AXIS2_BF16(VXC_ReadImage2DArray) + + coord.z = 0; + vxc_ushort8 dst0, dst1, dst; + do + { + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_lo; + data0 = data0 * betaValue - sum0; + _viv_asm(COPY, dst0, data0, 16); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_hi; + data0 = data0 * betaValue - sum1; + _viv_asm(COPY, dst1, data0, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.z++; + } + while(coord.z < axisSize); +} + +__kernel void log_softmax_axis2_BF16toF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + vxc_short8 in0; + vxc_half8 vec0, max; + vxc_float4 data0; + vxc_float4 sum0, sum1; + vxc_float4 max_lo, max_hi; + vxc_ushort8 tmp0, tmp1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + LOGSOFTMAX_PROCESS_AXIS2_BF16(VXC_ReadImage2DArray) + + coord.z = 0; + half4 dst0, dst1; + do + { + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_lo; + data0 = data0 * betaValue - sum0; + _viv_asm(CONV, dst0, data0); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_hi; + data0 = data0 * betaValue - sum1; + _viv_asm(CONV, dst1, data0); + VXC_DP2x8(vec0, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); + vxc_short8 vect; + _viv_asm(COPY, vect, vec0, 16); + VXC_WriteImage2DArray(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.z++; + } + while(coord.z < axisSize); +} + +__kernel 
void log_softmax_axis2_BF16toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + vxc_short8 in0; + vxc_half8 vec0, max; + vxc_float4 data0; + vxc_float4 sum0, sum1; + vxc_float4 max_lo, max_hi; + vxc_ushort8 tmp0, tmp1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + LOGSOFTMAX_PROCESS_AXIS2_BF16(VXC_ReadImage2DArray) + + coord.z = 0; + do + { + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_lo; + data0 = data0 * betaValue - sum0; + write_imagef(output, coord, data0); + coord.x += 4; + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_hi; + data0 = data0 * betaValue - sum1; + write_imagef(output, coord, data0); + coord.x -= 4; + coord.z++; + } + while (coord.z < axisSize); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/logical_not.vx b/src/tim/vx/internal/src/libnnext/ops/vx/logical_not.vx new file mode 100644 index 0000000..1515e76 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/logical_not.vx @@ -0,0 +1,30 @@ +#include "cl_viv_vx_ext.h" + +__kernel void logical_not_I8toI8( + __read_only image2d_array_t input, + __write_only image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_char8 src0; + vxc_char8 dst; + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + dst = !src0; + dst *= (-1); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void logical_not_I8toI8_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + vxc_char8 src0; + vxc_char8 dst; + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + dst = !src0; + dst *= (-1); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/logical_ops.vx b/src/tim/vx/internal/src/libnnext/ops/vx/logical_ops.vx new file mode 100644 index 0000000..4ba7c40 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/logical_ops.vx @@ -0,0 +1,114 @@ +#include "cl_viv_vx_ext.h" + +#define TENSORLOGICAL_PROCESS(input_type, copy_type, output_type, out_copy_type,\ +lgc_op, lgc_op2, read_fun, write_fun) \ + input_type vA;\ + copy_type src0;\ + input_type vB;\ + copy_type src1;\ + read_fun(vA,in0,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\ + _viv_asm(COPY, src0, vA, 16); \ + read_fun(vB,in1,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\ + _viv_asm(COPY, src1, vB, 16); \ + output_type dst; \ + dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \ + dst *= (-1); \ + out_copy_type data; \ + _viv_asm(COPY, data, dst, 16); \ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + +#define TENSORLOGICAL(name0, src_type_name, dst_type_name, input_type, copy_type,\ +output_type, out_copy_type, lgc_op, lgc_op2) \ + __kernel void logical_##name0##_##src_type_name##to##dst_type_name( \ + __read_only image2d_array_t in0, \ + 
__read_only image2d_array_t in1, \ + __write_only image2d_array_t output) \ +{\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\ + TENSORLOGICAL_PROCESS(input_type, copy_type, output_type, out_copy_type,\ + lgc_op, lgc_op2, VXC_ReadImage2DArray, VXC_WriteImage2DArray) \ +} + +#define TENSORLOGICAL_2D(name0, src_type_name, dst_type_name, input_type,\ +copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \ + __kernel void logical_##name0##_##src_type_name##to##dst_type_name##_2D( \ + __read_only image2d_array_t in0, \ + __read_only image2d_array_t in1, \ + __write_only image2d_array_t output) \ +{\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\ + TENSORLOGICAL_PROCESS(input_type, copy_type, output_type, out_copy_type,\ + lgc_op, lgc_op2, VXC_ReadImage, VXC_WriteImage) \ +} + +_viv_uniform VXC_512Bits uniMulShortMinus1toFp16_2x8; + +#define TENSORLOGICAL_FP_PROCESS(input_type, copy_type, output_type,\ +out_copy_type, lgc_op, lgc_op2, read_fun, write_fun) \ + input_type vA;\ + copy_type src0;\ + input_type vB;\ + copy_type src1;\ + read_fun(vA,in0,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\ + _viv_asm(COPY, src0, vA, 16); \ + read_fun(vB,in1,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\ + _viv_asm(COPY, src1, vB, 16); \ + output_type dst; \ + dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \ + vxc_half8 tmpOut; \ + VXC_DP2x8(tmpOut,dst,dst,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero, 0),uniMulShortMinus1toFp16_2x8); \ + out_copy_type data; \ + _viv_asm(COPY, data, tmpOut, 16); \ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + + +#define TENSORLOGICAL_FP(name0, src_type_name, dst_type_name, input_type,\ +copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \ + __kernel void logical_##name0##_##src_type_name##to##dst_type_name( \ + __read_only image2d_array_t in0, \ + __read_only image2d_array_t in1, \ + __write_only image2d_array_t output) \ +{\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\ + TENSORLOGICAL_FP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\ + VXC_ReadImage2DArray, VXC_WriteImage2DArray) \ +} + +#define TENSORLOGICAL_FP_2D(name0, src_type_name, dst_type_name, input_type,\ +copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \ + __kernel void logical_##name0##_##src_type_name##to##dst_type_name##_2D( \ + __read_only image2d_array_t in0, \ + __read_only image2d_array_t in1, \ + __write_only image2d_array_t output) \ +{\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\ + TENSORLOGICAL_FP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\ + VXC_ReadImage, VXC_WriteImage) \ +} + +// name0, src_name, dst_name, input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2 +TENSORLOGICAL(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, ) +//TENSORLOGICAL(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, ) +//TENSORLOGICAL(or, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, ) +//TENSORLOGICAL_FP(or, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, ) +TENSORLOGICAL(and, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, &&, ) +//TENSORLOGICAL(and, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, &&, ) +//TENSORLOGICAL(and, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, ) +//TENSORLOGICAL_FP(and, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, ) +TENSORLOGICAL(xor, I8, 
I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ^, !!) +//TENSORLOGICAL(xor, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ^, !!) +//TENSORLOGICAL(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) +//TENSORLOGICAL_FP(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) + +TENSORLOGICAL_2D(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, ) +//TENSORLOGICAL_2D(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, ) +//TENSORLOGICAL_2D(or, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, ) +//TENSORLOGICAL_FP_2D(or, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, ) +TENSORLOGICAL_2D(and, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, &&, ) +//TENSORLOGICAL_2D(and, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, &&, ) +//TENSORLOGICAL_2D(and, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, ) +//TENSORLOGICAL_FP_2D(and, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, ) +TENSORLOGICAL_2D(xor, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ^, !!) +//TENSORLOGICAL_2D(xor, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ^, !!) +//TENSORLOGICAL_2D(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) +//TENSORLOGICAL_FP_2D(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_BP_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_BP_F16.vx new file mode 100644 index 0000000..3e0362d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_BP_F16.vx @@ -0,0 +1,210 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4; + +#define LSTMUNIT_BP_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_BP_F16to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3; \ + vxc_half8 src0, src1, src2, src3; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + float4 
data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, vect0, 16); \ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src10, vect10, 16); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + data_i_t = data_i_t + b0; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + \ + convert_type dst0; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_BP_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_BP_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_BP_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_BP_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_BP_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_BP_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_BP_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_BP_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_BP_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_BP_F16to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \ + vxc_half8 src0, src1, src2, src3, src4; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, vect0, 16); \ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src10, vect10, 16); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + VXC_ReadImage(vect4, cell_state_in, 
coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect4, 16); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + data_i_t = data_i_t + b0; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + \ + convert_type dst0; \ + half4 dst_cell; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src0, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_BP_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_BP_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_BP_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_BP_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_BP_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_BP_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_BP_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_BP_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_BP_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_BP_U8.vx new file mode 100644 index 0000000..8207d26 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_BP_U8.vx @@ -0,0 +1,206 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; 
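+/* Descriptive note (not in the original source): clip_Min_F / clip_Max_F bound the
+ * cell state written to cell_state_out (presumably derived on the host from the
+ * cell_clip kernel argument), while the input0/input1 zero-point and scale vectors
+ * below carry per-gate (i, f, g, o) dequantization parameters for the U8 gate inputs. */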
+_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +_viv_uniform VXC_512Bits uniU8AddS32_4x4; +_viv_uniform int4 input0Array_ZP; +_viv_uniform int4 input1Array_ZP; +_viv_uniform float4 input0Array_Scale; +_viv_uniform float4 input1Array_Scale; + +#define LSTMUNIT_BP_U8_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_BP_U8to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + float4 vecA, vecB; \ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx + b0; \ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \ + \ + convert_type dst0; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_BP_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_BP_U8_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_BP_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_BP_U8_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_BP_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_BP_U8_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_BP_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_BP_U8_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_BP_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_BP_U8to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0; \ + vxc_half8 src4; \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + float4 vecA, vecB; \ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + 
VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect0, 16); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx + b0; \ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \ + \ + convert_type dst0; \ + half4 dst_cell; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src4, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_BP_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_BP_U8_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_BP_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_BP_U8_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_BP_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_BP_U8_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_BP_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_BP_U8_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_B_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_B_F16.vx new file mode 100644 index 0000000..07eb3ba --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_B_F16.vx @@ -0,0 +1,214 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4; + +#define LSTMUNIT_B_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_B_F16to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3; \ + vxc_half8 src0, src1, src2, src3; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + 
float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, vect0, 16); \ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src10, vect10, 16); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + data_i_t = data_i_t + b0; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + \ + convert_type dst0; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_B_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_B_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_B_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_B_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_B_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_B_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_B_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_B_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_B_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_B_F16to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \ + vxc_half8 src0, src1, src2, src3, src4; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, vect0, 16); \ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src10, vect10, 16); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect4, 16); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + data_i_t = data_i_t + b0; \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + \ + convert_type dst0; \ + half4 dst_cell; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src0, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_B_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_B_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_B_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_B_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_B_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_B_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_B_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_B_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_B_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_B_U8.vx new file mode 100644 index 0000000..69d960e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_B_U8.vx @@ -0,0 +1,203 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + 
return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +_viv_uniform VXC_512Bits uniU8AddS32_4x4; +_viv_uniform int4 input0Array_ZP; +_viv_uniform int4 input1Array_ZP; +_viv_uniform float4 input0Array_Scale; +_viv_uniform float4 input1Array_Scale; + +#define LSTMUNIT_B_U8_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_B_U8to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + float4 vecA, vecB; \ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx + b0; \ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), 
uniU8AddS32_4x4);\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \ + \ + convert_type dst0; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} + +LSTMUNIT_B_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_B_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_B_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_B_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_B_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_B_U8to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0; \ + vxc_half8 src4; \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + float4 vecA, vecB; \ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, 
input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect0, 16); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx + b0; \ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \ + \ + convert_type dst0; \ + half4 dst_cell; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src4, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_B_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_B_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_B_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_B_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CBP_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CBP_F16.vx new file mode 100644 index 0000000..e996d56 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CBP_F16.vx @@ -0,0 +1,190 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4; + +#define LSTMUNIT_CBP_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CBP_F16to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3; \ + vxc_half8 src0, src1, src2, src3; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + 
VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + \ + convert_type dst0; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CBP_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CBP_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CBP_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CBP_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CBP_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CBP_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CBP_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CBP_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_CBP_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CBP_F16to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, 
vect2, vect3, vect4; \ + vxc_half8 src0, src1, src2, src3, src4; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect4, 16); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + \ + convert_type dst0; \ + half4 dst_cell; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src0, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CBP_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CBP_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CBP_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CBP_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CBP_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CBP_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CBP_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CBP_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CBP_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CBP_U8.vx new file mode 100644 index 0000000..0bee306 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CBP_U8.vx @@ -0,0 +1,187 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +_viv_uniform VXC_512Bits uniU8AddS32_4x4; +_viv_uniform int4 input0Array_ZP; +_viv_uniform int4 input1Array_ZP; +_viv_uniform float4 input0Array_Scale; +_viv_uniform float4 input1Array_Scale; + +#define LSTMUNIT_CBP_U8_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CBP_U8to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 vecA, vecB; \ + 
float4 b0, b1, b2, b3; \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \ + \ + convert_type dst0; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CBP_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CBP_U8_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CBP_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CBP_U8_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CBP_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CBP_U8_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CBP_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CBP_U8_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_CBP_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CBP_U8to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0; \ + vxc_half8 src4; \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + float4 vecA, vecB; \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect0, 16); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \ + \ + convert_type dst0; \ + half4 dst_cell; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src4, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CBP_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CBP_U8_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CBP_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CBP_U8_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CBP_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CBP_U8_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CBP_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CBP_U8_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CB_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CB_F16.vx new file mode 100644 index 0000000..39f3289 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CB_F16.vx @@ -0,0 +1,194 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4; + +#define LSTMUNIT_CB_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CB_F16to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only 
image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3; \ + vxc_half8 src0, src1, src2, src3; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + \ + convert_type dst0; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CB_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CB_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CB_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CB_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CB_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CB_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CB_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CB_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_CB_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CB_F16to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \ + vxc_half8 src0, src1, src2, src3, src4; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect4, 16); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_f_t, src1, src11, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + data_f_t = data_f_t + b1; \ + data_g_t = data_g_t + b2; \ + data_o_t = data_o_t + b3; \ + \ + convert_type dst0; \ + half4 dst_cell; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src0, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CB_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CB_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CB_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CB_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CB_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CB_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CB_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CB_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CB_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CB_U8.vx new file mode 100644 index 0000000..7e2baee --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CB_U8.vx @@ -0,0 +1,184 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +_viv_uniform VXC_512Bits uniU8AddS32_4x4; +_viv_uniform int4 input0Array_ZP; +_viv_uniform int4 input1Array_ZP; +_viv_uniform float4 input0Array_Scale; +_viv_uniform float4 input1Array_Scale; + +#define LSTMUNIT_CB_U8_FP32(out_type_name, act_name, convert_type, dst_type, 
copy_type, act_func) \ +__kernel void lstmunit_activation_CB_U8to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + float4 vecA, vecB; \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \ + \ + convert_type dst0; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CB_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CB_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CB_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CB_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_CB_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CB_U8to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0; \ + vxc_half8 src4; \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + float4 vecA, vecB; \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect0, 16); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_g_t = vecA * 
input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \ + \ + convert_type dst0; \ + half4 dst_cell; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src4, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CB_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CB_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CB_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CB_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CLP_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CLP_F16.vx new file mode 100644 index 0000000..561885c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CLP_F16.vx @@ -0,0 +1,179 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; + +#define LSTMUNIT_CLP_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CLP_F16to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 
coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3; \ + vxc_half8 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 w0, w1, w2, b0, b1, b2; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + w0 = read_imagef(layer_norm_wf, coord_in.xw); \ + w1 = read_imagef(layer_norm_wc, coord_in.xw); \ + w2 = read_imagef(layer_norm_wo, coord_in.xw); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b0 = read_imagef(bias_f, coord_in.xw); \ + b1 = read_imagef(bias_c, coord_in.xw); \ + b2 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + data_f_t = data_f_t * w0 + b0; \ + data_g_t = data_g_t * w1 + b1; \ + data_o_t = data_o_t * w2 + b2; \ + \ + convert_type dst0; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CLP_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CLP_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CLP_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CLP_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CLP_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CLP_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CLP_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CLP_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +#define LSTMUNIT_CLP_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CLP_F16to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_array_t output, \ + __write_only 
image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \ + vxc_half8 src0, src1, src2, src3, src4; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 w0, w1, w2, b0, b1, b2; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect4, 16); \ + w0 = read_imagef(layer_norm_wf, coord_in.xw); \ + w1 = read_imagef(layer_norm_wc, coord_in.xw); \ + w2 = read_imagef(layer_norm_wo, coord_in.xw); \ + b0 = read_imagef(bias_f, coord_in.xw); \ + b1 = read_imagef(bias_c, coord_in.xw); \ + b2 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + data_f_t = data_f_t * w0 + b0; \ + data_g_t = data_g_t * w1 + b1; \ + data_o_t = data_o_t * w2 + b2; \ + \ + convert_type dst0; \ + half4 cell_data; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + _viv_asm(CONV, cell_data, data_c_t); \ + VXC_DP4x4(src0, cell_data, cell_data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src0, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_o_t = act_func(data_o_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CLP_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CLP_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CLP_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CLP_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CLP_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CLP_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CLP_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CLP_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CL_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CL_F16.vx new file mode 100644 index 0000000..fd74c4c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CL_F16.vx @@ -0,0 +1,183 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; + +#define LSTMUNIT_CL_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CL_F16to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3; \ + vxc_half8 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 w0, w1, w2, b0, b1, b2; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect2, 
input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + w0 = read_imagef(layer_norm_wf, coord_in.xw); \ + w1 = read_imagef(layer_norm_wc, coord_in.xw); \ + w2 = read_imagef(layer_norm_wo, coord_in.xw); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b0 = read_imagef(bias_f, coord_in.xw); \ + b1 = read_imagef(bias_c, coord_in.xw); \ + b2 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + data_f_t = data_f_t * w0 + b0; \ + data_g_t = data_g_t * w1 + b1; \ + data_o_t = data_o_t * w2 + b2; \ + \ + convert_type dst0; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CL_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CL_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CL_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CL_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CL_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CL_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CL_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CL_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_CL_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CL_F16to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \ + vxc_half8 src0, src1, src2, src3, src4; \ + float4 data_i_t, data_f_t, data_g_t, 
data_o_t, data_c_t; \ + float4 w0, w1, w2, b0, b1, b2; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect4, 16); \ + w0 = read_imagef(layer_norm_wf, coord_in.xw); \ + w1 = read_imagef(layer_norm_wc, coord_in.xw); \ + w2 = read_imagef(layer_norm_wo, coord_in.xw); \ + b0 = read_imagef(bias_f, coord_in.xw); \ + b1 = read_imagef(bias_c, coord_in.xw); \ + b2 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + data_f_t = data_f_t * w0 + b0; \ + data_g_t = data_g_t * w1 + b1; \ + data_o_t = data_o_t * w2 + b2; \ + \ + convert_type dst0; \ + half4 cell_data; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, cell_data, data_c_t); \ + VXC_DP4x4(src0, cell_data, cell_data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src0, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_o_t = act_func(data_o_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CL_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CL_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CL_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CL_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CL_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CL_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CL_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CL_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CSP_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CSP_F16.vx new file mode 100644 index 0000000..4b30ea7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CSP_F16.vx @@ -0,0 +1,170 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float 
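/* logE = log2(e): sigmoid() and tangentH() below evaluate exp(x) as exp2(x * log2(e)); twoLogE = 2 * log2(e) for the tanh form */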
logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4; + +#define LSTMUNIT_CSP_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CSP_F16to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3; \ + vxc_half8 src0, src1, src2, src3; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + \ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + \ + convert_type dst0; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CSP_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CSP_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CSP_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CSP_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CSP_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CSP_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CSP_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CSP_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_CSP_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CSP_F16to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \ + vxc_half8 src0, src1, src2, src3, src4; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect4, 16); \ + \ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + 
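/* LSTM gate math below: f = act_func(f_pre + forget_bias), g = tanh(g_pre), coupled input gate i = 1 - f (CIFG), new cell c = f * c_prev + i * g, clipped to [clip_Min_F, clip_Max_F]; hidden output = act_func(o_pre) * tanh(c), rescaled by outputScale and outputZP before packing */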
\ + convert_type dst0; \ + half4 dst_cell; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src0, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CSP_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CSP_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CSP_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CSP_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CSP_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CSP_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CSP_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CSP_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CSP_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CSP_U8.vx new file mode 100644 index 0000000..7b97909 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CSP_U8.vx @@ -0,0 +1,173 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +_viv_uniform VXC_512Bits uniU8AddS32_4x4; +_viv_uniform int4 input0Array_ZP; +_viv_uniform int4 input1Array_ZP; +_viv_uniform float4 input0Array_Scale; +_viv_uniform float4 input1Array_Scale; + +#define LSTMUNIT_CSP_U8_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CSP_U8to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float 
forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 vecA, vecB; \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz; \ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \ + \ + convert_type dst0; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CSP_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CSP_U8_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CSP_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CSP_U8_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CSP_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CSP_U8_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CSP_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CSP_U8_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_CSP_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CSP_U8to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0; \ + vxc_half8 src4; \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 vecA, vecB; \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect0, 16); \ + \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz; \ + VXC_DP4x4(vecA, src3, 
input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \ + \ + convert_type dst0; \ + half4 dst_cell; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src4, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CSP_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CSP_U8_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CSP_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CSP_U8_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CSP_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CSP_U8_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CSP_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CSP_U8_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CS_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CS_F16.vx new file mode 100644 index 0000000..810ecf3 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CS_F16.vx @@ -0,0 +1,174 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4; + +#define LSTMUNIT_CS_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CS_F16to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t 
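/* outputs: activated hidden state (also duplicated to h_state_out) and the updated cell state */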
output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3; \ + vxc_half8 src0, src1, src2, src3; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + \ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + \ + convert_type dst0; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CS_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CS_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CS_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CS_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CS_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CS_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CS_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CS_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_CS_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CS_F16to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \ + vxc_half8 src0, src1, src2, src3, src4; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect4, 16); \ + \ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + \ + convert_type dst0; \ + half4 dst_cell; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src0, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CS_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CS_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_CS_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CS_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_CS_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_CS_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_CS_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CS_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CS_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CS_U8.vx new file mode 100644 index 0000000..2d51794 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CS_U8.vx @@ -0,0 +1,170 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +_viv_uniform VXC_512Bits uniU8AddS32_4x4; +_viv_uniform int4 input0Array_ZP; +_viv_uniform int4 input1Array_ZP; +_viv_uniform float4 input0Array_Scale; +_viv_uniform float4 input1Array_Scale; + +#define LSTMUNIT_CS_U8_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CS_U8to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t 
hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 vecA, vecB; \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + \ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz; \ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \ + \ + convert_type dst0; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CS_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CS_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CS_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CS_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_CS_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_CS_U8to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0; \ + vxc_half8 src4; \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 vecA, vecB; \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect0, 16); \ + \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz; \ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), 
uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \ + \ + convert_type dst0; \ + half4 dst_cell; \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src4, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CS_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_CS_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_CS_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_CS_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_LP_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_LP_F16.vx new file mode 100644 index 0000000..35456ed --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_LP_F16.vx @@ -0,0 +1,197 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; + +#define LSTMUNIT_LP_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_LP_F16to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wi, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, 
vect1, vect2, vect3; \ + vxc_half8 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 w0, w1, w2, w3, b0, b1, b2, b3; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, vect0, 16); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_i_t, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + data_i_t = data_i_t * w0 + b0; \ + data_f_t = data_f_t * w1 + b1; \ + data_g_t = data_g_t * w2 + b2; \ + data_o_t = data_o_t * w3 + b3; \ + \ + convert_type dst0; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_LP_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_LP_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_LP_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_LP_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_LP_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_LP_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_LP_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_LP_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_LP_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_LP_F16to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wi, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \ + vxc_half8 src0, src1, src2, src3, src4; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 w0, w1, w2, w3, b0, b1, b2, b3; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, vect0, 16); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect4, 16); \ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_i_t, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src2, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + data_i_t = data_i_t * w0 + b0; \ + data_f_t = data_f_t * w1 + b1; \ + data_g_t = data_g_t * w2 + b2; \ + data_o_t = data_o_t * w3 + b3; \ + \ + convert_type dst0; \ + half4 cell_data; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, cell_data, data_c_t); \ + VXC_DP4x4(src0, cell_data, cell_data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src0, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_o_t = act_func(data_o_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_LP_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_LP_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_LP_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_LP_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_LP_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_LP_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_LP_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_LP_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_L_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_L_F16.vx new file mode 100644 index 0000000..c23dc05 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_L_F16.vx @@ -0,0 +1,202 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; + +#define LSTMUNIT_L_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_L_F16to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + 
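/* per-gate layer-norm parameters: each projected gate is multiplied by its layer_norm_w* vector and offset by its bias_* vector before activation */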
__read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wi, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3; \ + vxc_half8 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 w0, w1, w2, w3, b0, b1, b2, b3; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, vect0, 16); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_i_t, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + data_i_t = data_i_t * w0 + b0; \ + data_f_t = data_f_t * w1 + b1; \ + data_g_t = data_g_t * w2 + b2; \ + data_o_t = data_o_t * w3 + b3; \ + \ + convert_type dst0; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_L_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_L_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_L_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_L_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_L_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_L_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_L_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_L_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_L_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_L_F16to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wi, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \ + vxc_half8 src0, src1, src2, src3, src4; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 w0, w1, w2, w3, b0, b1, b2, b3; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, vect0, 16); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect4, 16); \ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + \ + VXC_DP4x4(data_i_t, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + 
VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + data_i_t = data_i_t * w0 + b0; \ + data_f_t = data_f_t * w1 + b1; \ + data_g_t = data_g_t * w2 + b2; \ + data_o_t = data_o_t * w3 + b3; \ + \ + convert_type dst0; \ + half4 cell_data; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, cell_data, data_c_t); \ + VXC_DP4x4(src0, cell_data, cell_data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src0, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_o_t = act_func(data_o_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_L_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_L_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_L_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_L_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_L_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_L_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_L_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_L_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_SP_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_SP_F16.vx new file mode 100644 index 0000000..1c426f3 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_SP_F16.vx @@ -0,0 +1,184 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4; + +#define LSTMUNIT_SP_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_SP_F16to##out_type_name##_F32_##act_name( \ + __read_only 
image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3; \ + vxc_half8 src0, src1, src2, src3; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, vect0, 16); \ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src10, vect10, 16); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + \ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + \ + convert_type dst0; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_SP_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_SP_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_SP_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_SP_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_SP_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_SP_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_SP_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_SP_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_SP_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_SP_F16to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \ + vxc_half8 src0, src1, src2, src3, src4; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, vect0, 16); \ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src10, vect10, 16); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect4, 16); \ + \ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + \ + convert_type dst0; \ + half4 dst_cell; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src0, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_SP_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_SP_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_SP_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_SP_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_SP_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_SP_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_SP_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_SP_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_SP_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_SP_U8.vx new file mode 100644 index 0000000..4baaf6c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_SP_U8.vx @@ -0,0 +1,188 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +_viv_uniform VXC_512Bits uniU8AddS32_4x4; +_viv_uniform int4 input0Array_ZP; +_viv_uniform int4 input1Array_ZP; +_viv_uniform float4 input0Array_Scale; +_viv_uniform float4 input1Array_Scale; + +#define LSTMUNIT_SP_U8_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_SP_U8to##out_type_name##_F32_##act_name( \ + 
__read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 vecA, vecB; \ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + \ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx; \ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz; \ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \ + \ + convert_type dst0; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_SP_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_SP_U8_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_SP_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_SP_U8_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_SP_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_SP_U8_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_SP_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_SP_U8_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_SP_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_SP_U8to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0; \ + vxc_half8 src4; \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 vecA, vecB; \ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect0, 16); \ + \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx; \ + VXC_DP4x4(vecA, src1, 
input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz; \ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \ + \ + convert_type dst0; \ + half4 dst_cell; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src4, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_SP_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_SP_U8_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_SP_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_SP_U8_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_SP_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_SP_U8_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_SP_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_SP_U8_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_S_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_S_F16.vx new file mode 100644 index 0000000..441e193 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_S_F16.vx @@ -0,0 +1,188 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; 
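+/* Editorial note (assumption): logE and twoLogE are presumed to be initialized by
+   the host to log2(e) and 2*log2(e). With those values the helpers above reduce to
+   the usual activations: sigmoid(x) = 1/(1 + 2^(-x*log2(e))) = 1/(1 + e^-x), and
+   tangentH(x) = 2/(1 + 2^(-2*x*log2(e))) - 1 = tanh(x). */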
+_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4; + +#define LSTMUNIT_S_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_S_F16to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3; \ + vxc_half8 src0, src1, src2, src3; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, vect0, 16); \ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src10, vect10, 16); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + \ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + \ + convert_type dst0; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_S_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_S_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_S_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_S_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_S_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_S_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_S_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_S_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_S_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_S_F16to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \ + vxc_half8 src0, src1, src2, src3, src4; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_half8 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, vect0, 16); \ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src10, vect10, 16); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, vect1, 16); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src11, vect11, 16); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, vect2, 16); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src12, vect12, 16); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, vect3, 16); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src13, vect13, 16); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect4, 16); \ + \ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \ + \ + convert_type dst0; \ + half4 dst_cell; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src0, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_S_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_S_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid) +LSTMUNIT_S_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_S_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid) +LSTMUNIT_S_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) +LSTMUNIT_S_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid) +LSTMUNIT_S_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_S_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_S_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_S_U8.vx new file mode 100644 index 0000000..5c528a3 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_S_U8.vx @@ -0,0 +1,184 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +_viv_uniform VXC_512Bits uniU8AddS32_4x4; +_viv_uniform int4 input0Array_ZP; +_viv_uniform int4 input1Array_ZP; +_viv_uniform float4 input0Array_Scale; +_viv_uniform 
float4 input1Array_Scale; + +#define LSTMUNIT_S_U8_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_S_U8to##out_type_name##_F32_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 vecA, vecB; \ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \ + \ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx; \ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz; \ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \ + \ + convert_type dst0; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_S_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_S_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_S_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_S_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) + +#define LSTMUNIT_S_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \ +__kernel void lstmunit_activation_S_U8to##out_type_name##_F16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0; \ + vxc_half8 src4; \ + vxc_uchar4 src0, src1, src2, src3; \ + vxc_uchar4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 vecA, vecB; \ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, vect0, 16); \ + \ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx; \ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ 
+ data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz; \ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \ + \ + convert_type dst0; \ + half4 dst_cell; \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(CONV, dst_cell, data_c_t); \ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, vect0, src4, 8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, data_o_t); \ + dst_type dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + copy_type dst; \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_S_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid) +LSTMUNIT_S_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid) +LSTMUNIT_S_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) +LSTMUNIT_S_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx new file mode 100644 index 0000000..2a6b24f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx @@ -0,0 +1,260 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Lo_4x4; +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Hi_4x4; +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Lo_4x4; +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Hi_4x4; + +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b; + +#if (VX_VERSION==2) +__kernel void gemm_F16F16toF16(image2d_array_t inputA, + image2d_array_t inputB, image2d_array_t output, + int transposeA, int transposeB, + int adjointA, int adjointB, uint M, uint K, uint N) +{ + uint gidy = get_global_id(1); + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); + + half4 valC; + vxc_short8 srcA0, srcA1, srcA2, srcA3, outC; + vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3; + vxc_short16 srcB; + vxc_half16 tmpB; + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) + { + vxc_float4 tempA0, tempA1, tempA2, tempA3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 4; + coord_b.y += 4; + _viv_asm(COPY, tmpA0, srcA0, 16); + _viv_asm(COPY, tmpA1, srcA1, 16); + _viv_asm(COPY, tmpA2, srcA2, 16); + _viv_asm(COPY, tmpA3, srcA3, 16); + _viv_asm(COPY, tmpB.hi, srcB.hi, 16); + _viv_asm(COPY, tmpB.lo, srcB.lo, 16); + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + sum0 += (tempA0); + sum1 += (tempA1); + sum2 += (tempA2); + sum3 += (tempA3); + } + coord_b.y = gidy; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr); + _viv_asm(CONV, valC, sum0); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum1); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum2); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum3); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 
0,VXC_RM_TowardZero, 0)); +} +#else +__kernel void gemm_F16F16toF16(image2d_array_t inputA, + image2d_array_t inputB, image2d_array_t output, + int transposeA, int transposeB, + int adjointA, int adjointB, uint M, uint K, uint N) +{ + uint gidy = get_global_id(1); + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); + + half4 valC; + vxc_short8 srcA0, srcB0, srcA1, srcB1, outC; + vxc_half8 tmpA0, tmpB0, tmpA1, tmpB1; + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) + { + vxc_float4 tempA0, tempA1, tempA2, tempA3; + vxc_float4 tempB0, tempB1, tempB2, tempB3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 4; + coord_b.y += 4; + _viv_asm(COPY, tmpA0, srcA0, 16); + _viv_asm(COPY, tmpB0, srcB0, 16); + _viv_asm(COPY, tmpA1, srcA1, 16); + _viv_asm(COPY, tmpB1, srcB1, 16); + + VXC_DP4x4(tempA0, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row0Lo_4x4); + VXC_DP4x4(tempB0, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row0Hi_4x4); + VXC_DP4x4(tempA1, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Lo_4x4); + VXC_DP4x4(tempB1, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Hi_4x4); + VXC_DP4x4(tempA2, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row0Lo_4x4); + VXC_DP4x4(tempB2, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row0Hi_4x4); + VXC_DP4x4(tempA3, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Lo_4x4); + VXC_DP4x4(tempB3, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Hi_4x4); + sum0 += (tempA0 + tempB0); + sum1 += (tempA1 + tempB1); + sum2 += (tempA2 + tempB2); + sum3 += (tempA3 + tempB3); + } + coord_b.y = gidy; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr); + 
_viv_asm(CONV, valC, sum0); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum1); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum2); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum3); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} +#endif + +__kernel void gemm_F32F32toF32(image2d_array_t inputA, + image2d_array_t inputB, + image2d_array_t output, + int transposeA, + int transposeB, + int adjointA, + int adjointB, + uint M, uint K, uint N) +{ + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(gidx, 0, (bc2zero ? 0 : get_global_id(2)), 0); + + vxc_float4 sum0 = (vxc_float4)(0); + vxc_float4 sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0); + vxc_float4 sum3 = (vxc_float4)(0); + + vxc_int4 tmpOut0, tmpOut1; + vxc_uchar16 outC; + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + + for(int i = 0; i < K; i+=4) + { + vxc_float4 tempA0, tempA1, tempA2, tempA3; + vxc_float4 tempB0, tempB1, tempB2, tempB3; + + coord_a.x = i; + coord_a.y = gidy; + + coord_b.x = gidx; + coord_b.y = i; + + tempA0 = read_imagef(inputA, coord_a); + coord_a.y++; + tempA1 = read_imagef(inputA, coord_a); + coord_a.y++; + tempA2 = read_imagef(inputA, coord_a); + coord_a.y++; + tempA3 = read_imagef(inputA, coord_a); + + tempB0 = read_imagef(inputB, coord_b); + coord_b.y++; + tempB1 = read_imagef(inputB, coord_b); + coord_b.y++; + tempB2 = read_imagef(inputB, coord_b); + coord_b.y++; + tempB3 = read_imagef(inputB, coord_b); + + sum0 += (tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); + sum1 += (tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); + sum2 += (tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); + sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); + } + coord_b = (int4)(gidx, gidy, get_global_id(2), 0); + write_imagef(output, coord_b, sum0); + coord_b.y++; + write_imagef(output, coord_b, sum1); + coord_b.y++; + write_imagef(output, coord_b, sum2); + coord_b.y++; + write_imagef(output, coord_b, sum3); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16f16_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16f16_u8.vx new file mode 100644 index 0000000..586b0c6 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16f16_u8.vx @@ -0,0 +1,200 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float output_ZP; +_viv_uniform float outputScale; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Lo_4x4; +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Hi_4x4; +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Lo_4x4; +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Hi_4x4; + +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b; + +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +#if 
(VX_VERSION==2) +#define GEMM_F16_TO_QINT(dst_type_name, write_type) \ +__kernel void gemm_F16F16to##dst_type_name( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \ + \ + vxc_short8 srcA0, srcA1, srcA2, srcA3; \ + vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3; \ + vxc_short16 srcB; \ + vxc_half16 tmpB; \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + _viv_asm(COPY, tmpA0, srcA0, 16); \ + _viv_asm(COPY, tmpA1, srcA1, 16); \ + _viv_asm(COPY, tmpA2, srcA2, 16); \ + _viv_asm(COPY, tmpA3, srcA3, 16); \ + _viv_asm(COPY, tmpB.hi, srcB.hi, 16); \ + _viv_asm(COPY, tmpB.lo, srcB.lo, 16); \ + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8F16toF32Lo_4x4b); \ + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8F16toF32Lo_4x4b); \ + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8F16toF32Lo_4x4b); \ + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8F16toF32Lo_4x4b); \ + sum0 += (tempA0); \ + sum1 += (tempA1); \ + sum2 += (tempA2); \ + sum3 += (tempA3); \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + write_type outC; \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ + 
VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +#else +#define GEMM_F16_TO_QINT(dst_type_name, write_type) \ +__kernel void gemm_F16F16to##dst_type_name( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \ + \ + vxc_short8 srcA0, srcB0, srcA1, srcB1; \ + vxc_half8 tmpA0, tmpB0, tmpA1, tmpB1; \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + _viv_asm(COPY, tmpA0, srcA0, 16); \ + _viv_asm(COPY, tmpB0, srcB0, 16); \ + _viv_asm(COPY, tmpA1, srcA1, 16); \ + _viv_asm(COPY, tmpB1, srcB1, 16); \ + \ + VXC_DP4x4(tempA0, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row0Lo_4x4); \ + VXC_DP4x4(tempB0, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniGemmFp16toFp32Row0Hi_4x4); \ + VXC_DP4x4(tempA1, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Lo_4x4); \ + VXC_DP4x4(tempB1, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Hi_4x4); \ + VXC_DP4x4(tempA2, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row0Lo_4x4); \ + VXC_DP4x4(tempB2, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row0Hi_4x4); \ + VXC_DP4x4(tempA3, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Lo_4x4); \ + VXC_DP4x4(tempB3, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Hi_4x4); \ + sum0 += (tempA0 + tempB0); \ + sum1 += (tempA1 + tempB1); \ + sum2 += (tempA2 + tempB2); \ + sum3 += (tempA3 + tempB3); \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + write_type outC; \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +#endif +GEMM_F16_TO_QINT(U8, vxc_uchar16) +GEMM_F16_TO_QINT(I8, vxc_char16) +GEMM_F16_TO_QINT(I16, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16i16_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16i16_i16.vx new file mode 100644 index 0000000..cc959af --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16i16_i16.vx @@ -0,0 +1,195 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float output_ZP; +_viv_uniform float outputScale; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniGemmF16I16toF32A_4x4; +_viv_uniform VXC_512Bits uniGemmF16I16toF32B_4x4; +_viv_uniform VXC_512Bits uniGemmF16I16toF32C_4x4; +_viv_uniform VXC_512Bits uniGemmF16I16toF32D_4x4; +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +_viv_uniform VXC_512Bits uniGemmF16I16toF32Lo_4x4b; +_viv_uniform VXC_512Bits uniGemmF16I16toF32Hi_4x4b; +_viv_uniform VXC_512Bits uniGemmFp16I16MulZptoFp32_4x4; +_viv_uniform float in1outScale; + +#if (VX_VERSION==2) +#define GEMM_F16_QINT16_TO_QINT16(src1_type_name, read_type) \ +__kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \ + image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 
0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \ + \ + vxc_short8 srcA0, srcA1, outC; \ + vxc_half8 tmpA0, tmpA1; \ + vxc_short16 srcB; \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + _viv_asm(COPY, tmpA0, srcA0, 16); \ + _viv_asm(COPY, tmpA1, srcA1, 16); \ + VXC_DP4x4_b(tempA0, srcB.hi, srcB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmF16I16toF32Lo_4x4b); \ + VXC_DP4x4_b(tempA1, srcB.hi, srcB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmF16I16toF32Hi_4x4b); \ + VXC_DP4x4_b(tempA2, srcB.hi, srcB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmF16I16toF32Lo_4x4b); \ + VXC_DP4x4_b(tempA3, srcB.hi, srcB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmF16I16toF32Hi_4x4b); \ + VXC_DP4x4(tmpZpScale, tmpA0, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmFp16I16MulZptoFp32_4x4); \ + sum0 += tempA0 + tmpZpScale.xxxx; \ + sum1 += tempA1 + tmpZpScale.yyyy; \ + sum2 += tempA2 + tmpZpScale.zzzz; \ + sum3 += tempA3 + tmpZpScale.wwww; \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * in1outScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * in1outScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = 
convert_int4_rte(sum2 * in1outScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * in1outScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +#else +#define GEMM_F16_QINT16_TO_QINT16(src1_type_name, read_type) \ +__kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \ + image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \ + \ + vxc_short8 srcA0, srcA1, outC; \ + vxc_half8 tmpA0, tmpA1; \ + vxc_short8 srcB0, srcB1; \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + _viv_asm(COPY, tmpA0, srcA0, 16); \ + _viv_asm(COPY, tmpA1, srcA1, 16); \ + VXC_DP4x4(tempA0, tmpA0, srcB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32A_4x4); \ + VXC_DP4x4(tempB0, tmpA0, srcB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32B_4x4); \ + VXC_DP4x4(tempA1, tmpA0, srcB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32C_4x4); \ + VXC_DP4x4(tempB1, tmpA0, srcB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32D_4x4); \ + VXC_DP4x4(tempA2, tmpA1, srcB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32A_4x4); \ + VXC_DP4x4(tempB2, tmpA1, srcB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32B_4x4); \ + 
VXC_DP4x4(tempA3, tmpA1, srcB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32C_4x4); \ + VXC_DP4x4(tempB3, tmpA1, srcB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32D_4x4); \ + VXC_DP4x4(tmpZpScale, tmpA0, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmFp16I16MulZptoFp32_4x4); \ + sum0 += tempA0 + tempB0 + tmpZpScale.xxxx; \ + sum1 += tempA1 + tempB1 + tmpZpScale.yyyy; \ + sum2 += tempA2 + tempB2 + tmpZpScale.zzzz; \ + sum3 += tempA3 + tempB3 + tmpZpScale.wwww; \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * in1outScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * in1outScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * in1outScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * in1outScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +#endif +GEMM_F16_QINT16_TO_QINT16(I16, vxc_short8) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_f16.vx new file mode 100644 index 0000000..4a5ee66 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_f16.vx @@ -0,0 +1,197 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int input1_ZP; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4; +_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4; +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +_viv_uniform VXC_512Bits uniGemmF16U8toF32_4x4; +_viv_uniform VXC_512Bits uniGemmF16U8toF32Hi_4x4; +_viv_uniform VXC_512Bits uniGemmFp16U8MulZptoFp32_4x4; +_viv_uniform float input1Scale; + +#define GEMM_F16_QINT_TO_F16(src1_type_name, read_type) \ +__kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \ + image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); \ + \ + half4 valC; \ + vxc_short8 srcA0, srcA1, outC; \ + vxc_half8 tmpA0, tmpA1; \ + read_type srcB; \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempZp; \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + _viv_asm(COPY, tmpA0, srcA0, 16); \ + _viv_asm(COPY, tmpA1, srcA1, 16); \ + VXC_DP4x4(tempA0, tmpA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32_4x4); \ + VXC_DP4x4(tempA1, tmpA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32Hi_4x4); \ + VXC_DP4x4(tempA2, tmpA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32_4x4); \ + VXC_DP4x4(tempA3, tmpA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32Hi_4x4); \ + VXC_DP4x4(tempZp, tmpA0, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmFp16U8MulZptoFp32_4x4); \ + sum0 += tempA0 + tempZp.x; \ + sum1 += tempA1 + tempZp.y; \ + sum2 += tempA2 + tempZp.z; \ + sum3 += tempA3 + tempZp.w; \ + } \ + sum0 *= input1Scale; \ + sum1 *= input1Scale; \ + sum2 *= input1Scale; \ + sum3 *= input1Scale; \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + _viv_asm(CONV, valC, sum0); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum1); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum2); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum3); \ + 
_viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +GEMM_F16_QINT_TO_F16(U8, vxc_uchar16) +GEMM_F16_QINT_TO_F16(I8, vxc_char16) + +#define GEMM_F16_QINT16_TO_F16(src1_type_name, read_type) \ +__kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \ + image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \ + \ + half4 valC; \ + vxc_short8 srcA, outC; \ + vxc_half8 tmpA; \ + read_type srcB; \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + short in1_zp; \ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, tmpA, srcA, 16); \ + VXC_DP4x4(tempA0, tmpA, tmpA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, tmpA, srcA, 16); \ + VXC_DP4x4(tempA1, tmpA, tmpA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, tmpA, srcA, 16); \ + VXC_DP4x4(tempA2, tmpA, tmpA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + _viv_asm(COPY, tmpA, srcA, 16); \ + VXC_DP4x4(tempA3, tmpA, tmpA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \ + 
VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \ + } \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + _viv_asm(CONV, valC, sum0); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum1); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum2); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum3); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +GEMM_F16_QINT16_TO_F16(I16, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_u8.vx new file mode 100644 index 0000000..3617df4 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_u8.vx @@ -0,0 +1,97 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float output_ZP; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +_viv_uniform VXC_512Bits uniGemmF16U8toF32_4x4; +_viv_uniform VXC_512Bits uniGemmF16U8toF32Hi_4x4; +_viv_uniform VXC_512Bits uniGemmFp16U8MulZptoFp32_4x4; +_viv_uniform float in1outScale; + +#define GEMM_F16_QINT_TO_QINT(src1_type_name, read_type) \ +__kernel void gemm_F16##src1_type_name##to##src1_type_name(image2d_array_t inputA, \ + image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); \ + \ + vxc_short8 srcA0, srcA1; \ + vxc_half8 tmpA0, tmpA1; \ + read_type srcB, outC; \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempZp; \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + _viv_asm(COPY, tmpA0, srcA0, 16); \ + _viv_asm(COPY, tmpA1, srcA1, 16); \ + VXC_DP4x4(tempA0, tmpA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32_4x4); \ + VXC_DP4x4(tempA1, tmpA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32Hi_4x4); \ + VXC_DP4x4(tempA2, tmpA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32_4x4); \ + VXC_DP4x4(tempA3, tmpA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32Hi_4x4); \ + VXC_DP4x4(tempZp, tmpA0, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmFp16U8MulZptoFp32_4x4); \ + sum0 += tempA0 + tempZp.x; \ + sum1 += tempA1 + tempZp.y; \ + sum2 += tempA2 + tempZp.z; \ + sum3 += tempA3 + tempZp.w; \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * in1outScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * in1outScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * in1outScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * in1outScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + 
uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +GEMM_F16_QINT_TO_QINT(U8, vxc_uchar16) +GEMM_F16_QINT_TO_QINT(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx new file mode 100644 index 0000000..b3674c1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx @@ -0,0 +1,103 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int input0_ZP; +_viv_uniform int input1_ZP; +_viv_uniform float output_ZP; +_viv_uniform float outputScale; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +#define GEMM_QINT_TO_QINT(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + read_type srcA, srcB, outC; \ + \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + short in0_zp, in1_zp; \ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +GEMM_QINT_TO_QINT(I16, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx new file mode 100644 index 0000000..4f5e558 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx @@ -0,0 +1,260 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int input0_ZP; +_viv_uniform int input1_ZP; +_viv_uniform float output_ZP; +_viv_uniform float outputScale; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4; + +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +#define GEMM_TRANSA_QINT(src0_type_name, src1_type_name, dst_type_name, read0_type, read1_type, write_type) \ +__kernel void gemm_transa_##src0_type_name##src1_type_name##to##dst_type_name( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, \ + uint M, uint K, uint N) \ +{ \ + uint gidy 
= get_global_id(1); \ + read0_type srcA; \ + read1_type srcB; \ + write_type outC; \ + \ + int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \ + \ + vxc_float4 sum0 = (vxc_float4)(0); \ + vxc_float4 sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0); \ + vxc_float4 sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + short in0_zp, in1_zp; \ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \ + \ + vxc_float4 tempA0; \ + vxc_float4 tempB0; \ + \ + for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;) \ + { \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_a.y++; \ + coord_b.y++; \ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertUint8SubZpToFp32B_4x4); \ + sum0 = (sum0 + tempA0.x * tempB0); \ + sum1 = (sum1 + tempA0.y * tempB0); \ + sum2 = (sum2 + tempA0.z * tempB0); \ + sum3 = (sum3 + tempA0.w * tempB0); \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +GEMM_TRANSA_QINT(U8, U8, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16) +GEMM_TRANSA_QINT(I8, I8, I8, vxc_char16, vxc_char16, vxc_char16) +GEMM_TRANSA_QINT(I16, I16, I16, vxc_short8, vxc_short8, vxc_short8) + +#define GEMM_TRANSA_INPUTB_F16(src0_type_name, read0_type) \ +__kernel void gemm_transa_##src0_type_name##F16to##src0_type_name( \ + image2d_array_t inputA, \ + image2d_array_t inputB, \ + image2d_array_t output, \ + int transposeA, \ + int transposeB, \ + int adjointA, \ + int adjointB, \ + uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + read0_type srcA, outC; \ 
+ vxc_short8 srcB; \ + vxc_half8 tmpB; \ + \ + int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \ + \ + vxc_float4 sum0 = (vxc_float4)(0); \ + vxc_float4 sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0); \ + vxc_float4 sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + short in0_zp; \ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \ + \ + vxc_float4 tempA0; \ + vxc_float4 tempB0; \ + \ + for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;) \ + { \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_a.y++; \ + coord_b.y++; \ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertUint8SubZpToFp32_4x4); \ + _viv_asm(COPY, tmpB, srcB, 16); \ + VXC_DP4x4(tempB0,tmpB,tmpB,VXC_MODIFIER(0,3,0,VXC_RM_TowardZero,0),uniConvert1stFp16ToFp32_4x4); \ + sum0 = (sum0 + tempA0.x * tempB0); \ + sum1 = (sum1 + tempA0.y * tempB0); \ + sum2 = (sum2 + tempA0.z * tempB0); \ + sum3 = (sum3 + tempA0.w * tempB0); \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +GEMM_TRANSA_INPUTB_F16(U8, vxc_uchar16) +GEMM_TRANSA_INPUTB_F16(I8, vxc_char16) +GEMM_TRANSA_INPUTB_F16(I16, vxc_short8) + +__kernel void gemm_transa_F16F16toF16( + image2d_array_t inputA, + image2d_array_t inputB, + image2d_array_t output, + int transposeA, + int transposeB, + int adjointA, + int adjointB, + uint M, uint K, uint N) +{ + uint gidy = get_global_id(1); + + half4 valC; + vxc_short8 srcA, srcB, outC; + vxc_half8 tmpA, tmpB; + + int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); + + vxc_float4 sum0 = (vxc_float4)(0); + vxc_float4 sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0); + vxc_float4 sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + vxc_float4 tempA0; + vxc_float4 tempB0; + + for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;) + { + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_a.y++; + coord_b.y++; + _viv_asm(COPY, tmpA, srcA, 16); + VXC_DP4x4(tempA0,tmpA,tmpA,VXC_MODIFIER(0,3,0,VXC_RM_TowardZero,0),uniConvert1stFp16ToFp32_4x4); + _viv_asm(COPY, tmpB, srcB, 16); + VXC_DP4x4(tempB0,tmpB,tmpB,VXC_MODIFIER(0,3,0,VXC_RM_TowardZero,0),uniConvert1stFp16ToFp32_4x4); + + sum0 = (sum0 + tempA0.x * tempB0); + sum1 = (sum1 + tempA0.y * tempB0); + sum2 = (sum2 + tempA0.z * tempB0); + sum3 = (sum3 + tempA0.w * tempB0); + } + coord_b.y = gidy; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr); + _viv_asm(CONV, valC, sum0); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + coord_b.y++; + _viv_asm(CONV, valC, sum1); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + coord_b.y++; + _viv_asm(CONV, valC, sum2); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + coord_b.y++; + _viv_asm(CONV, valC, sum3); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16.vx new file mode 100644 index 0000000..04af02c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16.vx @@ -0,0 +1,123 @@ +#include "cl_viv_vx_ext.h" + +/********************gemm transposeB fp16 fp16 to fp16*************************/ +_viv_uniform VXC_512Bits uniFp16MulFp16AddtoFp32_dp8x2; + +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +__kernel void gemm_transb_F16F16toF16(image2d_array_t inputA, + image2d_array_t inputB, + image2d_array_t output, + int transposeA, + int transposeB, + int adjointA, + int adjointB, + uint M, uint K, uint N) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0); + + vxc_float4 sum0 = (vxc_float4)(0); + vxc_float4 sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0); + vxc_float4 sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;) + { + vxc_short8 srcA0,srcA1,srcA2,srcA3; + vxc_short8 srcB0,srcB1,srcB2,srcB3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 8; + coord_b.x += 8; + + vxc_half8 halfB0,halfB1,halfB2,halfB3; + _viv_asm(COPY, halfB0, srcB0, 16); + _viv_asm(COPY, halfB1, srcB1, 16); + _viv_asm(COPY, halfB2, srcB2, 16); + _viv_asm(COPY, halfB3, srcB3, 16); + vxc_half8 halfA0,halfA1,halfA2,halfA3; + _viv_asm(COPY, halfA0, srcA0, 16); + _viv_asm(COPY, halfA1, srcA1, 16); + _viv_asm(COPY, halfA2, srcA2, 16); + _viv_asm(COPY, halfA3, srcA3, 16); + vxc_float4 fpVal; + VXC_DP8x2(fpVal, halfA0, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA0, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA0, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA0, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum0 += fpVal; + VXC_DP8x2(fpVal, halfA1, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA1, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA1, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA1, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum1 += fpVal; + VXC_DP8x2(fpVal, halfA2, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA2, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA2, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA2, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum2 += fpVal; + 
VXC_DP8x2(fpVal, halfA3, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA3, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA3, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA3, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum3 += fpVal; + } + half4 halfDst; + vxc_short8 valDst; + _viv_asm(CONV, halfDst, sum0); + _viv_asm(COPY, valDst, halfDst, 16); + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + _viv_asm(CONV, halfDst, sum1); + _viv_asm(COPY, valDst, halfDst, 16); + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + _viv_asm(CONV, halfDst, sum2); + _viv_asm(COPY, valDst, halfDst, 16); + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + _viv_asm(CONV, halfDst, sum3); + _viv_asm(COPY, valDst, halfDst, 16); + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16_mix.vx new file mode 100644 index 0000000..e33d532 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16_mix.vx @@ -0,0 +1,262 @@ +#include "cl_viv_vx_ext.h" + +/********************gemm transposeB fp16 uint8 to fp16*************************/ +_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZptoFp16_dp2x8; +_viv_uniform VXC_512Bits uniFp16MulFp16AddtoFp32_dp8x2; + +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +__kernel void gemm_transb_F16U8toF16(image2d_array_t inputA, + image2d_array_t inputB, + image2d_array_t output, + int transposeA, + int transposeB, + int adjointA, + int adjointB, + uint M, uint K, uint N) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0); + + vxc_float4 sum0 = (vxc_float4)(0); + vxc_float4 sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0); + vxc_float4 sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + short in1_zp; + _viv_asm(COPY, in1_zp, input1_ZP, 4); + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;) + { + vxc_short8 srcA0,srcA1,srcA2,srcA3; + vxc_uchar8 srcB0,srcB1,srcB2,srcB3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 8; + coord_b.x += 8; + + vxc_half8 halfB0,halfB1,halfB2,halfB3; + VXC_DP2x8(halfB0, srcB0, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfB1, srcB1, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfB2, srcB2, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfB3, srcB3, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8SubZptoFp16_dp2x8); + vxc_half8 halfA0,halfA1,halfA2,halfA3; + _viv_asm(COPY, halfA0, srcA0, 16); + _viv_asm(COPY, halfA1, srcA1, 16); + _viv_asm(COPY, halfA2, srcA2, 16); + _viv_asm(COPY, halfA3, srcA3, 16); + vxc_float4 fpVal; + VXC_DP8x2(fpVal, halfA0, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA0, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA0, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA0, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum0 += fpVal; + VXC_DP8x2(fpVal, halfA1, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA1, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA1, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA1, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum1 += fpVal; + VXC_DP8x2(fpVal, halfA2, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, 
halfA2, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA2, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA2, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum2 += fpVal; + VXC_DP8x2(fpVal, halfA3, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA3, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA3, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA3, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum3 += fpVal; + } + half4 halfDst; + vxc_short8 valDst; + sum0 *= input1Scale; + _viv_asm(CONV, halfDst, sum0); + _viv_asm(COPY, valDst, halfDst, 16); + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + sum1 *= input1Scale; + _viv_asm(CONV, halfDst, sum1); + _viv_asm(COPY, valDst, halfDst, 16); + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + sum2 *= input1Scale; + _viv_asm(CONV, halfDst, sum2); + _viv_asm(COPY, valDst, halfDst, 16); + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + sum3 *= input1Scale; + _viv_asm(CONV, halfDst, sum3); + _viv_asm(COPY, valDst, halfDst, 16); + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} +/***********************gemm transposeB fp16 uint8 to uint8***********************************/ +_viv_uniform float scaleIn2divOut; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform float output_ZP; + +__kernel void gemm_transb_F16U8toU8(image2d_array_t inputA, + image2d_array_t inputB, + image2d_array_t output, + int transposeA, + int transposeB, + int adjointA, + int adjointB, + uint M, uint K, uint N) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0); + + vxc_float4 sum0 = (vxc_float4)(0); + vxc_float4 sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0); + vxc_float4 sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + short in1_zp; + _viv_asm(COPY, in1_zp, input1_ZP, 4); + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;) + { + vxc_short8 srcA0,srcA1,srcA2,srcA3; + vxc_uchar8 srcB0,srcB1,srcB2,srcB3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 8; + coord_b.x += 8; + + vxc_half8 halfB0,halfB1,halfB2,halfB3; + VXC_DP2x8(halfB0, srcB0, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfB1, srcB1, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfB2, srcB2, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfB3, srcB3, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8SubZptoFp16_dp2x8); + vxc_half8 halfA0,halfA1,halfA2,halfA3; + _viv_asm(COPY, halfA0, srcA0, 16); + _viv_asm(COPY, halfA1, srcA1, 16); + _viv_asm(COPY, halfA2, srcA2, 16); + _viv_asm(COPY, halfA3, srcA3, 16); + vxc_float4 fpVal; + VXC_DP8x2(fpVal, halfA0, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA0, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA0, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA0, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum0 += fpVal; + VXC_DP8x2(fpVal, halfA1, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA1, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA1, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA1, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum1 += fpVal; + VXC_DP8x2(fpVal, halfA2, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, 
halfA2, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA2, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA2, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum2 += fpVal; + VXC_DP8x2(fpVal, halfA3, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA3, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA3, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA3, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum3 += fpVal; + } + vxc_int4 tmpOut0, tmpOut1; + vxc_uchar8 valDst; + tmpOut0 = convert_int4_rte(sum0 * scaleIn2divOut + output_ZP); + tmpOut1 = convert_int4_rte(sum1 * scaleIn2divOut + output_ZP); + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + tmpOut0 = convert_int4_rte(sum2 * scaleIn2divOut + output_ZP); + tmpOut1 = convert_int4_rte(sum3 * scaleIn2divOut + output_ZP); + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_u8_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_u8_mix.vx new file mode 100644 index 0000000..24ea0e0 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_u8_mix.vx @@ -0,0 +1,273 @@ +#include "cl_viv_vx_ext.h" + +/********************gemm transposeB uint8 uint8 to fp16*************************/ +_viv_uniform int input0_ZP; +_viv_uniform int input1_ZP; +_viv_uniform float inScaleMul; +_viv_uniform VXC_512Bits uniU8SubZptoFp16_dp2x8; +_viv_uniform VXC_512Bits uniFp16MulFp16AddtoFp32_dp8x2; + +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +__kernel void gemm_transb_U8U8toF16(image2d_array_t inputA, + image2d_array_t inputB, + image2d_array_t output, + int transposeA, + int transposeB, + int adjointA, + int adjointB, + uint M, uint K, uint N) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0); + + vxc_float4 sum0 = (vxc_float4)(0); + vxc_float4 sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0); + vxc_float4 sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + short in0_zp, in1_zp; + _viv_asm(COPY, in0_zp, input0_ZP, 4); + _viv_asm(COPY, in1_zp, input1_ZP, 4); + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;) + { + vxc_uchar8 srcA0,srcA1,srcA2,srcA3; + vxc_uchar8 srcB0,srcB1,srcB2,srcB3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 8; + coord_b.x += 8; + + vxc_half8 halfA0,halfA1,halfA2,halfA3; + VXC_DP2x8(halfA0, srcA0, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfA1, srcA1, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfA2, srcA2, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfA3, srcA3, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + vxc_half8 halfB0,halfB1,halfB2,halfB3; + VXC_DP2x8(halfB0, srcB0, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfB1, srcB1, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfB2, srcB2, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfB3, srcB3, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + vxc_float4 fpVal; + VXC_DP8x2(fpVal, halfA0, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA0, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA0, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA0, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum0 += fpVal; + VXC_DP8x2(fpVal, halfA1, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA1, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA1, halfB2, VXC_MODIFIER(2, 2, 0, 
VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA1, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum1 += fpVal; + VXC_DP8x2(fpVal, halfA2, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA2, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA2, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA2, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum2 += fpVal; + VXC_DP8x2(fpVal, halfA3, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA3, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA3, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA3, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum3 += fpVal; + } + half4 halfDst; + vxc_short8 valDst; + sum0 *= inScaleMul; + _viv_asm(CONV, halfDst, sum0); + _viv_asm(COPY, valDst, halfDst, 16); + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + sum1 *= inScaleMul; + _viv_asm(CONV, halfDst, sum1); + _viv_asm(COPY, valDst, halfDst, 16); + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + sum2 *= inScaleMul; + _viv_asm(CONV, halfDst, sum2); + _viv_asm(COPY, valDst, halfDst, 16); + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + sum3 *= inScaleMul; + _viv_asm(CONV, halfDst, sum3); + _viv_asm(COPY, valDst, halfDst, 16); + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} +/********************gemm transposeB uint8 uint8 to uint8*************************/ +_viv_uniform float inScaledivOut; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform float output_ZP; + +__kernel void gemm_transb_U8U8toU8(image2d_array_t inputA, + image2d_array_t inputB, + image2d_array_t output, + int transposeA, + int transposeB, + int adjointA, + int adjointB, + uint M, uint K, uint N) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0); + + vxc_float4 sum0 = (vxc_float4)(0); + vxc_float4 sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0); + vxc_float4 sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + short in0_zp, in1_zp; + _viv_asm(COPY, in0_zp, input0_ZP, 4); + _viv_asm(COPY, in1_zp, input1_ZP, 4); + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;) + { + vxc_uchar8 srcA0,srcA1,srcA2,srcA3; + vxc_uchar8 srcB0,srcB1,srcB2,srcB3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 8; + coord_b.x += 8; + + vxc_half8 halfA0,halfA1,halfA2,halfA3; + VXC_DP2x8(halfA0, srcA0, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfA1, srcA1, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfA2, srcA2, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfA3, srcA3, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + vxc_half8 halfB0,halfB1,halfB2,halfB3; + VXC_DP2x8(halfB0, srcB0, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfB1, srcB1, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfB2, srcB2, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + VXC_DP2x8(halfB3, srcB3, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniU8SubZptoFp16_dp2x8); + vxc_float4 fpVal; + VXC_DP8x2(fpVal, halfA0, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA0, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA0, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA0, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum0 += fpVal; + VXC_DP8x2(fpVal, halfA1, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA1, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA1, halfB2, VXC_MODIFIER(2, 2, 0, 
VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA1, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum1 += fpVal; + VXC_DP8x2(fpVal, halfA2, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA2, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA2, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA2, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum2 += fpVal; + VXC_DP8x2(fpVal, halfA3, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA3, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA3, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + VXC_DP8x2(fpVal, halfA3, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniFp16MulFp16AddtoFp32_dp8x2); + sum3 += fpVal; + } + vxc_int4 tmpOut0, tmpOut1; + vxc_uchar8 valDst; + tmpOut0 = convert_int4_rte(sum0 * inScaledivOut + output_ZP); + tmpOut1 = convert_int4_rte(sum1 * inScaledivOut + output_ZP); + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + tmpOut0 = convert_int4_rte(sum2 * inScaledivOut + output_ZP); + tmpOut1 = convert_int4_rte(sum3 * inScaledivOut + output_ZP); + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8.vx new file mode 100644 index 0000000..e5d2c76 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8.vx @@ -0,0 +1,99 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float output_ZP; +_viv_uniform float mulKIn0In1Zp; +_viv_uniform float inOutScale; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4; +_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4; + +#define GEMM_QINT_TO_QINT(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + read_type srcA0, srcA1, srcA2, srcA3, srcB, outC; \ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); \ + vxc_float4 sum0 = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp), sum1 = sum0; \ + vxc_float4 sum2 = sum0, sum3 = sum0; \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + sum0 += tempA0 + tempB0; \ + sum1 += tempA1 + tempB1; \ + sum2 += tempA2 + tempB2; \ + sum3 += tempA3 + tempB3; \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = get_global_id(1); \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 
0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +GEMM_QINT_TO_QINT(U8, vxc_uchar16) +GEMM_QINT_TO_QINT(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_f16.vx new file mode 100644 index 0000000..8f9ae12 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_f16.vx @@ -0,0 +1,220 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float input0Scale; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4; + +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b; +_viv_uniform VXC_512Bits uniGemmU8F16toF32Hi_4x4b; +_viv_uniform VXC_512Bits uniGemmFp16MulZptoFp32_4x4; + +_viv_uniform VXC_512Bits uniGemm1stU8F16toF32Lo_4x4; +_viv_uniform VXC_512Bits uniGemm2ndU8F16toF32Lo_4x4; +_viv_uniform VXC_512Bits uniGemm1stU8F16toF32Hi_4x4; +_viv_uniform VXC_512Bits uniGemm2ndU8F16toF32Hi_4x4; + +#if (VX_VERSION==2) +#define GEMM_QINT_F16_TO_F16(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##F16toF16( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + uint gidz = get_global_id(2); \ + vxc_short16 srcB; \ + vxc_half16 tmpB; \ + half4 valC; \ + read_type srcA0, srcA1; \ + vxc_short8 outC; \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : gidz), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : gidz), 0); \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + _viv_asm(COPY, tmpB.hi, srcB.hi, 16); \ + _viv_asm(COPY, tmpB.lo, srcB.lo, 16); \ + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, srcA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8F16toF32Lo_4x4b); \ + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, srcA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8F16toF32Hi_4x4b); \ + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, srcA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8F16toF32Lo_4x4b); \ + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, srcA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8F16toF32Hi_4x4b); \ + VXC_DP4x4(tmpZpScale, tmpB.hi, tmpB.lo, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmFp16MulZptoFp32_4x4); \ + sum0 += tempA0 + tmpZpScale; \ + sum1 += tempA1 + tmpZpScale; \ + sum2 += tempA2 + tmpZpScale; \ + sum3 += tempA3 + tmpZpScale; \ + } \ + sum0 *= input0Scale; \ + sum1 *= input0Scale; \ + sum2 *= input0Scale; \ + sum3 *= input0Scale; \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)gidz * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + \ + _viv_asm(CONV, valC, sum0); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum1); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum2); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, 
valC, sum3); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +GEMM_QINT_F16_TO_F16(U8, vxc_uchar16) +GEMM_QINT_F16_TO_F16(I8, vxc_char16) +GEMM_QINT_F16_TO_F16(I16, vxc_short8) +#else +#define GEMM_QINT_F16_TO_F16(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##F16toF16( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + uint gidz = get_global_id(2); \ + vxc_short16 srcB; \ + vxc_half16 tmpB; \ + half4 valC; \ + read_type srcA0, srcA1; \ + vxc_short8 outC; \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : gidz), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : gidz), 0); \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + _viv_asm(COPY, tmpB.hi, srcB.hi, 16); \ + _viv_asm(COPY, tmpB.lo, srcB.lo, 16); \ + VXC_DP4x4(tempA0, srcA0, tmpB.hi, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemm1stU8F16toF32Lo_4x4); \ + VXC_DP4x4(tempB0, srcA0, tmpB.lo, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemm2ndU8F16toF32Lo_4x4); \ + VXC_DP4x4(tempA1, srcA0, tmpB.hi, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemm1stU8F16toF32Hi_4x4); \ + VXC_DP4x4(tempB1, srcA0, tmpB.lo, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemm2ndU8F16toF32Hi_4x4); \ + VXC_DP4x4(tempA2, srcA1, tmpB.hi, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemm1stU8F16toF32Lo_4x4); \ + VXC_DP4x4(tempB2, srcA1, tmpB.lo, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemm2ndU8F16toF32Lo_4x4); \ + VXC_DP4x4(tempA3, srcA1, tmpB.hi, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemm1stU8F16toF32Hi_4x4); \ 
+ VXC_DP4x4(tempB3, srcA1, tmpB.lo, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemm2ndU8F16toF32Hi_4x4); \ + VXC_DP4x4(tmpZpScale, tmpB.hi, tmpB.lo, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmFp16MulZptoFp32_4x4); \ + sum0 += tempA0 + tempB0 + tmpZpScale; \ + sum1 += tempA1 + tempB1 + tmpZpScale; \ + sum2 += tempA2 + tempB2 + tmpZpScale; \ + sum3 += tempA3 + tempB3 + tmpZpScale; \ + } \ + sum0 *= input0Scale; \ + sum1 *= input0Scale; \ + sum2 *= input0Scale; \ + sum3 *= input0Scale; \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)gidz * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + \ + _viv_asm(CONV, valC, sum0); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum1); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum2); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum3); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +GEMM_QINT_F16_TO_F16(U8, vxc_uchar16) +GEMM_QINT_F16_TO_F16(I8, vxc_char16) +GEMM_QINT_F16_TO_F16(I16, vxc_short8) +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_u8.vx new file mode 100644 index 0000000..18c4214 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_u8.vx @@ -0,0 +1,207 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int input0_ZP; +_viv_uniform float output_ZP; +_viv_uniform float outputScale; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4; +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b; +_viv_uniform VXC_512Bits uniGemmU8F16toF32Hi_4x4b; +_viv_uniform VXC_512Bits uniGemmFp16MulZptoFp32_4x4; +_viv_uniform float in0outScale; + +#if (VX_VERSION==2) +#define GEMM_QINT_F16_TO_QINT(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##F16to##src0_type_name( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + uint gidz = get_global_id(2); \ + vxc_short16 srcB; \ + vxc_half16 tmpB; \ + read_type srcA0, srcA1, outC; \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : gidz), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : gidz), 0); \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + _viv_asm(COPY, tmpB.hi, srcB.hi, 16); \ + _viv_asm(COPY, tmpB.lo, srcB.lo, 16); \ + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, srcA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8F16toF32Lo_4x4b); \ + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, srcA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8F16toF32Hi_4x4b); \ + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, srcA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8F16toF32Lo_4x4b); \ + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, srcA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8F16toF32Hi_4x4b); \ + VXC_DP4x4(tmpZpScale, tmpB.hi, tmpB.lo, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmFp16MulZptoFp32_4x4); \ + sum0 += tempA0 + tmpZpScale; \ + sum1 += tempA1 + tmpZpScale; \ + sum2 += tempA2 + tmpZpScale; \ + sum3 += tempA3 + tmpZpScale; \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * in0outScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * in0outScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * in0outScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * in0outScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +#else +#define GEMM_QINT_F16_TO_QINT(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##F16to##src0_type_name( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + vxc_short8 srcB; \ + vxc_half8 tmpB; \ + half4 valB; \ + read_type srcA, outC; \ + \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + short in0_zp; \ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + _viv_asm(COPY, tmpB, srcB, 16); \ + VXC_DP4x4(tempB0, tmpB, tmpB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \ + \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + _viv_asm(COPY, tmpB, srcB, 16); \ + VXC_DP4x4(tempB1, tmpB, tmpB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \ + \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + _viv_asm(COPY, tmpB, srcB, 16); \ + VXC_DP4x4(tempB2, tmpB, tmpB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \ + \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + VXC_DP4x4(tempA3, srcA, in0_zp, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + _viv_asm(COPY, tmpB, srcB, 16); \ + VXC_DP4x4(tempB3, tmpB, tmpB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +#endif +GEMM_QINT_F16_TO_QINT(U8, vxc_uchar16) +GEMM_QINT_F16_TO_QINT(I8, vxc_char16) +GEMM_QINT_F16_TO_QINT(I16, vxc_short8) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8u8_f16.vx new file mode 100644 index 0000000..8b925f3 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8u8_f16.vx @@ -0,0 +1,211 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int input0_ZP; +_viv_uniform int input1_ZP; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4; +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4; +_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4; +_viv_uniform float input01Scale; + +#define GEMM_QINT_TO_F16(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##src0_type_name##toF16( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + read_type srcA0, srcA1, srcA2, srcA3, srcB; \ + half4 valC; \ + vxc_short8 outC; \ + \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); \ + vxc_float4 sum0 = (vxc_float4)(0); \ + vxc_float4 sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0); \ + vxc_float4 sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + sum0 += tempA0 + tempB0; \ + sum1 += tempA1 + tempB1; \ + sum2 += tempA2 + tempB2; \ + sum3 += tempA3 + tempB3; \ + } \ + sum0 *= input01Scale; \ + sum1 *= input01Scale; \ + sum2 *= input01Scale; \ + sum3 *= input01Scale; \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + _viv_asm(CONV, valC, sum0); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum1); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + 
_viv_asm(CONV, valC, sum2); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum3); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +GEMM_QINT_TO_F16(U8, vxc_uchar16) +GEMM_QINT_TO_F16(I8, vxc_char16) + +#define GEMM_QINT16_TO_F16(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##src0_type_name##toF16( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + read_type srcA, srcB; \ + half4 valC; \ + vxc_short8 outC; \ + \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \ + vxc_float4 sum0 = (vxc_float4)(0); \ + vxc_float4 sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0); \ + vxc_float4 sum3 = (vxc_float4)(0); \ + short in0_zp, in1_zp; \ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \ + } \ + coord_b.y = gidy; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + _viv_asm(CONV, valC, sum0); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum1); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum2); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + _viv_asm(CONV, valC, sum3); \ + _viv_asm(COPY, outC, valC, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +GEMM_QINT16_TO_F16(I16, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/maximum.vx b/src/tim/vx/internal/src/libnnext/ops/vx/maximum.vx new file mode 100644 index 0000000..c40c720 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/maximum.vx @@ -0,0 +1,269 @@ +#include "cl_viv_vx_ext.h" + +__kernel void maximum_F16F16toF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 vec0, vec1, dst; + vxc_half8 src0, src1; + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 16); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_F16F16toF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_short8 vec0, vec1, dst; + vxc_half8 src0, src1; + VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + coord.z ++; + + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 
16); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8; + +__kernel void maximum_F16F16toI8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 vec0, vec1; + vxc_char8 dst; + vxc_half8 src0, src1; + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_F16F16toI8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 vec0, vec1; + vxc_char8 dst; + vxc_half8 src0, src1; + VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8; +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8; +_viv_uniform VXC_512Bits uniConvertI8toI8_1_part0_2x8; +_viv_uniform VXC_512Bits uniConvertI8toI8_1_part1_2x8; +__kernel void maximum_I8I8toI8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_char16 src0, src1, dst; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8); + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part0_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8); + dst = max(src0, src1); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_I8I8toI8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_char16 src0, src1, dst; + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 
15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord.z ++; + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8); + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part0_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8); + dst = max(src0, src1); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Hi_2x8; +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp +__kernel void maximum_U8U8toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar16 src0, src1, dst; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Hi_2x8); + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Hi_2x8); + dst = max(src0, src1); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_U8U8toU8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_uchar16 src0, src1, dst; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Hi_2x8); + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Hi_2x8); + dst = max(src0, src1); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8; +_viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8; +__kernel void 
maximum_I16I16toI16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1, dst; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8); + dst = max(src0, src1); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_I16I16toI16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_short8 src0, src1, dst; + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + coord.z ++; + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8); + dst = max(src0, src1); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/maximum_fp16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/maximum_fp16.vx new file mode 100644 index 0000000..76269f7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/maximum_fp16.vx @@ -0,0 +1,317 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8; +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8; +_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8; +_viv_uniform VXC_512Bits uniConvertInt8toFp16_2x8; + +__kernel void maximum_I8F16toI8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_char16 src0, src2, dst; + vxc_short8 src1, src3, src4, src5; + vxc_half8 data0, data1, data2, data3; + vxc_char16 tmp0, tmp1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src4, input1, coord, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, data0, src1, 16); + _viv_asm(COPY, data1, src4, 16); + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8); + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8); + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8); + VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8); + dst = max(src0, tmp0); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + 
+__kernel void maximum_I8F16toI8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_char16 src0, src2, dst; + vxc_short8 src1, src3, src4, src5; + vxc_half8 data0, data1, data2, data3; + vxc_char16 tmp0; + + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src4, input1, coord.xy, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, data0, src1, 16); + _viv_asm(COPY, data1, src4, 16); + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8); + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8); + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8); + VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8); + dst = max(src0, tmp0); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_I8F16toF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_char8 vec0, vec2; + vxc_short8 vec1, vec3, dst; + vxc_half8 src0, src1, src2, src3; + + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8); + + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 16); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_I8F16toF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_char8 vec0, vec2; + vxc_short8 vec1, vec3, dst; + vxc_half8 src0, src1, src2, src3; + VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8); + + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 16); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; + +__kernel void maximum_U8F16toF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 
get_global_id(2), 0); + + vxc_uchar8 vec0, vec2; + vxc_short8 vec1, vec3, dst; + vxc_half8 src0, src1, src2, src3; + + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniU8MulAndPostShift_0_Lo_2x8); + + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 16); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_U8F16toF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_uchar8 vec0, vec2; + vxc_short8 vec1, vec3, dst; + vxc_half8 src0, src1, src2, src3; + + VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniU8MulAndPostShift_0_Lo_2x8); + + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 16); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8; +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8; +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp +__kernel void maximum_U8F16toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar16 src0, dst0, dst1; + vxc_ushort8 src1, src2; + vxc_half8 data1, data2; + VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + _viv_asm(COPY, data2, src2, 16); + + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Hi_2x8); + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + dst0 = max(dst0, dst1); + + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_U8F16toU8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), 
get_global_id(1)); + + vxc_uchar16 src0, dst0, dst1; + vxc_ushort8 src1, src2; + vxc_half8 data1, data2; + VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + _viv_asm(COPY, data2, src2, 16); + + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Hi_2x8); + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + dst0 = max(dst0, dst1); + + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_F16F16toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src0, src1; + vxc_half8 data0, data1; + vxc_uchar16 dst0, dst1; + VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + _viv_asm(COPY, data1, src1, 16); + + vxc_ushort8 mp1; + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, data0, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + dst0 = max(dst0, dst1); + + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_F16F16toU8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_ushort8 src0, src1; + vxc_half8 data0, data1; + vxc_uchar16 dst0, dst1; + VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + _viv_asm(COPY, data1, src1, 16); + + vxc_ushort8 mp1; + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, data0, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + dst0 = max(dst0, dst1); + + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/maximum_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/maximum_i16.vx new file mode 100644 index 0000000..15ab020 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/maximum_i16.vx @@ -0,0 +1,173 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertI16toI16_2x8; +_viv_uniform VXC_512Bits uinConvertFp16ToInt16_2x8; +_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8; +_viv_uniform float outputScale; +_viv_uniform float output_zp; +_viv_uniform 
VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndFp16ToFp32_4x4; + + +__kernel void maximum_I16F16toI16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1, tmp0, dst; + vxc_half8 data0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, data0, src1, 16); + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8); + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8); + dst = max(src0, tmp0); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_I16F16toI16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_short8 src0, src1, tmp0, dst; + vxc_half8 data0; + + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, data0, src1, 16); + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8); + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8); + dst = max(src0, tmp0); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_I16F16toF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 vec0, vec1, dst; + vxc_half8 src0, src1; + + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8); + + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 16); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_I16F16toF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_short8 vec0, vec1, dst; + vxc_half8 src0, src1; + VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8); + + 
VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 16); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_F16F16toI16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 vec0, vec1; + vxc_short8 dst; + vxc_half8 src0, src1; + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + int4 tmpDst0, tmpDst1; + float4 tmpData0, tmpData1; + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4); + tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp); + tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_F16F16toI16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 vec0, vec1; + vxc_short8 dst; + vxc_half8 src0, src1; + VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + int4 tmpDst0, tmpDst1; + float4 tmpData0, tmpData1; + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4); + tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp); + tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/minimum.vx b/src/tim/vx/internal/src/libnnext/ops/vx/minimum.vx new file mode 100644 index 0000000..4bfe529 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/minimum.vx @@ -0,0 +1,271 @@ +#include "cl_viv_vx_ext.h" + +__kernel void minimum_F16F16toF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 vec0, vec1, dst; + vxc_half8 src0, src1; + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + 
VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 16); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_F16F16toF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_short8 vec0, vec1, dst; + vxc_half8 src0, src1; + VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + coord.z ++; + + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 16); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8; + +__kernel void minimum_F16F16toI8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 vec0, vec1; + vxc_char8 dst; + vxc_half8 src0, src1; + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_F16F16toI8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 vec0, vec1; + vxc_char8 dst; + vxc_half8 src0, src1; + VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8; +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8; +_viv_uniform VXC_512Bits uniConvertI8toI8_1_part0_2x8; +_viv_uniform VXC_512Bits uniConvertI8toI8_1_part1_2x8; +__kernel void minimum_I8I8toI8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_char16 src0, src1, dst; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + 
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8); + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part0_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8); + dst = min(src0, src1); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_I8I8toI8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_char16 src0, src1, dst; + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord.z ++; + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8); + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part0_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8); + dst = min(src0, src1); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Hi_2x8; +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp +__kernel void minimum_U8U8toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar16 src0, src1, dst; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Hi_2x8); + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Hi_2x8); + dst = min(src0, src1); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_U8U8toU8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = 
(int2)(get_global_id(0), get_global_id(1)); + + vxc_uchar16 src0, src1, dst; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Hi_2x8); + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Hi_2x8); + dst = min(src0, src1); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8; +_viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8; +__kernel void minimum_I16I16toI16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1, dst; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8); + dst = min(src0, src1); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_I16I16toI16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_short8 src0, src1, dst; + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + coord.z ++; + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8); + dst = min(src0, src1); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/minimum_fp16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/minimum_fp16.vx new file mode 100644 index 0000000..f60a751 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/minimum_fp16.vx @@ -0,0 +1,317 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8; +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8; +_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8; +_viv_uniform VXC_512Bits uniConvertInt8toFp16_2x8; + +__kernel void minimum_I8F16toI8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_char16 src0, src2, dst; 
+ vxc_short8 src1, src3, src4, src5; + vxc_half8 data0, data1, data2, data3; + vxc_char16 tmp0, tmp1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src4, input1, coord, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, data0, src1, 16); + _viv_asm(COPY, data1, src4, 16); + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8); + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8); + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8); + VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8); + dst = min(src0, tmp0); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_I8F16toI8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_char16 src0, src2, dst; + vxc_short8 src1, src3, src4, src5; + vxc_half8 data0, data1, data2, data3; + vxc_char16 tmp0; + + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src4, input1, coord.xy, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, data0, src1, 16); + _viv_asm(COPY, data1, src4, 16); + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8); + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8); + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8); + VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8); + dst = min(src0, tmp0); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_I8F16toF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_char8 vec0, vec2; + vxc_short8 vec1, vec3, dst; + vxc_half8 src0, src1, src2, src3; + + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8); + + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 16); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_I8F16toF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = 
(int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_char8 vec0, vec2; + vxc_short8 vec1, vec3, dst; + vxc_half8 src0, src1, src2, src3; + VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8); + + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 16); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; + +__kernel void minimum_U8F16toF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar8 vec0, vec2; + vxc_short8 vec1, vec3, dst; + vxc_half8 src0, src1, src2, src3; + + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniU8MulAndPostShift_0_Lo_2x8); + + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 16); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_U8F16toF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_uchar8 vec0, vec2; + vxc_short8 vec1, vec3, dst; + vxc_half8 src0, src1, src2, src3; + + VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniU8MulAndPostShift_0_Lo_2x8); + + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 16); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8; +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8; +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp +__kernel void minimum_U8F16toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar16 src0, dst0, dst1; + vxc_ushort8 src1, src2; + vxc_half8 data1, data2; + VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, 0, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + _viv_asm(COPY, data2, src2, 16); + + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Hi_2x8); + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + dst0 = min(dst0, dst1); + + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_U8F16toU8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_uchar16 src0, dst0, dst1; + vxc_ushort8 src1, src2; + vxc_half8 data1, data2; + VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + _viv_asm(COPY, data2, src2, 16); + + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Hi_2x8); + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + dst0 = min(dst0, dst1); + + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_F16F16toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src0, src1; + vxc_half8 data0, data1; + vxc_uchar16 dst0, dst1; + VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + _viv_asm(COPY, data1, src1, 16); + + vxc_ushort8 mp1; + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, data0, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + dst0 = min(dst0, dst1); + + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_F16F16toU8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_ushort8 src0, src1; + vxc_half8 data0, data1; + vxc_uchar16 dst0, dst1; + VXC_ReadImage(src0, input0, coord, 0, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + _viv_asm(COPY, data1, src1, 16); + + vxc_ushort8 mp1; + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, data0, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertFp16toU8_2x8); + dst0 = min(dst0, dst1); + + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/minimum_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/minimum_i16.vx new file mode 100644 index 0000000..a314ca9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/minimum_i16.vx @@ -0,0 +1,177 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertI16toI16_2x8; +_viv_uniform VXC_512Bits uinConvertFp16ToInt16_2x8; +_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8; + +_viv_uniform float outputScale; +_viv_uniform float output_zp; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndFp16ToFp32_4x4; + +__kernel void minimum_I16F16toI16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1, tmp0, dst; + vxc_half8 data0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, data0, src1, 16); + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8); + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8); + dst = min(src0, tmp0); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_I16F16toI16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_short8 src0, src1, tmp0, dst; + vxc_half8 data0; + + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, data0, src1, 16); + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8); + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8); + dst = min(src0, tmp0); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_I16F16toF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 vec0, vec1, dst; + vxc_half8 src0, src1; + + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(vec1, input1, coord, 
VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8); + + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 16); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_I16F16toF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_short8 vec0, vec1, dst; + vxc_half8 src0, src1; + VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8); + + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + _viv_asm(COPY, dst, src0, 16); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_F16F16toI16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1, dst; + vxc_half8 data0, data1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + _viv_asm(COPY, data1, src1, 16); + + VXC_VertMin3_Half(data0, data0, data1, data1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + + int4 tmpDst0, tmpDst1; + float4 tmpData0, tmpData1; + VXC_DP4x4(tmpData0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4); + VXC_DP4x4(tmpData1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4); + tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp); + tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_F16F16toI16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_short8 src0, src1, dst; + vxc_half8 data0, data1; + + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, data0, src0, 16); + _viv_asm(COPY, data1, src1, 16); + + VXC_VertMin3_Half(data0, data0, data1, data1, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); + + int4 tmpDst0, tmpDst1; + float4 tmpData0, tmpData1; + VXC_DP4x4(tmpData0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4); + VXC_DP4x4(tmpData1, data0, data0, VXC_MODIFIER(0, 3, 0, 
VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4); + tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp); + tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis0.vx new file mode 100644 index 0000000..3d8dd53 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis0.vx @@ -0,0 +1,267 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform float dimRatio; + +_viv_uniform float zpScaleSqr_i16; +_viv_uniform float zpScale2_i16; +_viv_uniform float sumScale_i16; + +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform float e2InScale; +_viv_uniform float rowSumScale; +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; + +#define MOMENTS_AXIS0_QINT(src0_type_name, read0_type) \ +__kernel void moments_axis0_##src0_type_name##toF16( \ + image2d_array_t input, \ + image2d_t output_mean, \ + image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidy = get_global_id(0); \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(0, gidy, gidz, 0); \ + read0_type src0; \ + float sum = 0, sqr = 0; \ + int tmpSum = 0, tmpSqr = 0; \ + int4 tmpSum0, tmpSqr0; \ + \ + for(coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP16x1(tmpSum0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \ + VXC_DP16x1(tmpSqr0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \ + tmpSum += (tmpSum0.x); \ + tmpSqr += (tmpSqr0.x + tmpZp1 * tmpSum0.x); \ + } \ + sqr = (convert_float(tmpSqr) * e2InScale + rowSumScale); \ + sum = convert_float(tmpSum + sumInZp) * input_scale; \ + \ + vxc_float4 mean_vari0 = (vxc_float4)(sum, sqr, 0, 0); \ + mean_vari0 *= dimRatio; \ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; \ + \ + int2 coord_out = (int2)(gidy, gidz); \ + half4 tmpData; \ + vxc_half8 tmpVal; \ + vxc_short8 dst; \ + _viv_asm(CONV, tmpData, mean_vari0); \ + VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \ + _viv_asm(COPY, dst, tmpVal, 16); \ + \ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} + +MOMENTS_AXIS0_QINT(U8, vxc_uchar16) +MOMENTS_AXIS0_QINT(I8, vxc_char16) + +#define MOMENTS_AXIS0_QINT_2D(src0_type_name, read0_type) \ +__kernel void moments_axis0_##src0_type_name##toF16_2D( \ + image2d_t input, \ + image2d_t output_mean, \ + image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidy = get_global_id(0); \ + int2 coord = (int2)(0, gidy); \ + read0_type src0; \ + float sum = 0, sqr = 0; \ + int tmpSum = 0, tmpSqr = 0; \ + int4 tmpSum0, tmpSqr0; \ + \ + for(coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, 
VXC_RM_TowardZero, 0)); \ + VXC_DP16x1(tmpSum0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \ + VXC_DP16x1(tmpSqr0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \ + tmpSum += (tmpSum0.x); \ + tmpSqr += (tmpSqr0.x + tmpZp1 * tmpSum0.x); \ + } \ + sqr = (convert_float(tmpSqr) * e2InScale + rowSumScale); \ + sum = convert_float(tmpSum + sumInZp) * input_scale; \ + \ + vxc_float4 mean_vari0 = (vxc_float4)(sum, sqr, 0, 0); \ + mean_vari0 *= dimRatio; \ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; \ + \ + int2 coord_out = (int2)(gidy, 0); \ + half4 tmpData; \ + vxc_half8 tmpVal; \ + vxc_short8 dst; \ + _viv_asm(CONV, tmpData, mean_vari0); \ + VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \ + _viv_asm(COPY, dst, tmpVal, 16); \ + \ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +MOMENTS_AXIS0_QINT_2D(U8, vxc_uchar16) +MOMENTS_AXIS0_QINT_2D(I8, vxc_char16) + +__kernel void moments_axis0_F16toF16( + image2d_array_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, int axis_num) +{ + int gidy = get_global_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(0, gidy, gidz, 0); + vxc_short8 src0; + vxc_half8 in_h0; + vxc_float4 sumsqr0; + vxc_float4 mean_vari0 = (vxc_float4)(0); + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, in_h0, src0, 16); + VXC_DP8x2(sumsqr0, in_h0, in_h0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + mean_vari0 += sumsqr0; + } + + mean_vari0 *= dimRatio; + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; + + int2 coord_out = (int2)(gidy, gidz); + + half4 tmpData; + vxc_half8 tmpVal; + vxc_short8 dst; + _viv_asm(CONV, tmpData, mean_vari0); + VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void moments_axis0_F16toF16_2D( + image2d_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, int axis_num) +{ + int gidy = get_global_id(0); + int2 coord = (int2)(0, gidy); + vxc_short8 src0; + vxc_half8 in_h0; + vxc_float4 sumsqr0; + vxc_float4 mean_vari0 = (vxc_float4)(0); + + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, in_h0, src0, 16); + VXC_DP8x2(sumsqr0, in_h0, in_h0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + mean_vari0 += sumsqr0; + } + mean_vari0 *= dimRatio; + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; + + int2 coord_out = (int2)(gidy, 0); + + half4 tmpData; + vxc_half8 tmpVal; + vxc_short8 dst; + _viv_asm(CONV, tmpData, mean_vari0); + VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + 
VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void moments_axis0_I16toF16( + image2d_array_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, int axis_num) +{ + int gidy = get_global_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(0, gidy, gidz, 0); + vxc_short8 src0; + float4 sumsqr0; + float sum = 0, sqr = 0; + float tmpSum = 0; + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP8x2(sumsqr0, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSum += sumsqr0.x; + sqr += (sumsqr0.y * e2InScale + zpScaleSqr_i16 + zpScale2_i16 * sumsqr0.x); + } + sum = tmpSum * input_scale + sumScale_i16; + + vxc_float4 mean_vari0 = (vxc_float4)(sum, sqr, 0, 0); + mean_vari0 *= dimRatio; + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; + + int2 coord_out = (int2)(gidy, gidz); + + half4 tmpData; + vxc_half8 tmpVal; + vxc_short8 dst; + _viv_asm(CONV, tmpData, mean_vari0); + VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void moments_axis0_I16toF16_2D( + image2d_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, int axis_num) +{ + int gidy = get_global_id(0); + int2 coord = (int2)(0, gidy); + vxc_short8 src0; + float4 sumsqr0; + float sum = 0, sqr = 0; + float tmpSum = 0; + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP8x2(sumsqr0, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSum += sumsqr0.x; + sqr += (sumsqr0.y * e2InScale + zpScaleSqr_i16 + zpScale2_i16 * sumsqr0.x); + } + sum = tmpSum * input_scale + sumScale_i16; + + vxc_float4 mean_vari0 = (vxc_float4)(sum, sqr, 0, 0); + mean_vari0 *= dimRatio; + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; + + int2 coord_out = (int2)(gidy, 0); + + half4 tmpData; + vxc_half8 tmpVal; + vxc_short8 dst; + _viv_asm(CONV, tmpData, mean_vari0); + VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis01.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis01.vx new file mode 100644 index 0000000..58206a1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis01.vx @@ -0,0 +1,423 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; +_viv_uniform float dimRatio; +_viv_uniform float zpScaleSqr_i16; +_viv_uniform float zpScale2_i16; +_viv_uniform float sumScale_i16; + +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform int sumInZp; +_viv_uniform int 
tmpZp1; +_viv_uniform float e2InScale; +_viv_uniform float rowSumScale; +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; + +#define MOMENTS_AXIS01_QINT(src0_type_name, read0_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_##src0_type_name##toF16( \ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidx = get_global_id(0) << 4; \ + int lidx = get_local_id(0); \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(gidx, 0, gidz, 0); \ + read0_type src0; \ + float sum = 0, sqr = 0; \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + \ + for(coord.x = gidx; coord.x < width; coord.x += 256) \ + { \ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \ + tmpSum += (tmpSum1); \ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); \ + } \ + sqr += (tmpSqr * e2InScale + rowSumScale); \ + sum += (tmpSum + sumInZp) * input_scale; \ + } \ + lcl_sum[lidx] = sum; \ + lcl_sqr[lidx] = sqr; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + int2 coord_out = (int2)(gidz, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + \ + sum = (0); \ + sqr = (0); \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + float4 mean, vari; \ + mean.x = sum * dimRatio; \ + vari.x = sqr * dimRatio; \ + vari.x = vari.x - mean.x * mean.x; \ + \ + half4 tmpMean, tmpVari; \ + vxc_half8 tmpVal; \ + vxc_short8 dst; \ + _viv_asm(CONV, tmpMean, mean); \ + _viv_asm(CONV, tmpVari, vari); \ + VXC_DP2x8(tmpVal, tmpMean, tmpVari, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \ + _viv_asm(COPY, dst, tmpVal, 16); \ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +MOMENTS_AXIS01_QINT(U8, vxc_uchar16) +MOMENTS_AXIS01_QINT(I8, vxc_char16) + +#define MOMENTS_AXIS01_QINT_2D(src0_type_name, read0_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_##src0_type_name##toF16_2D( \ + image2d_t input, image2d_t output_mean, image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidx = get_global_id(0) << 4; \ + int lidx = get_local_id(0); \ + int2 coord = (int2)(gidx, 0); \ + read0_type src0; \ + float sum = 0, sqr = 0; \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + \ + for(coord.x = gidx; coord.x < width; coord.x += 256) \ + { \ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \ + tmpSum += (tmpSum1); \ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \ + tmpSqr += (tmpSqr1 + tmpZp1 * 
tmpSum1); \ + } \ + sqr += (tmpSqr * e2InScale + rowSumScale); \ + sum += (tmpSum + sumInZp) * input_scale; \ + } \ + lcl_sum[lidx] = sum; \ + lcl_sqr[lidx] = sqr; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + int2 coord_out = (int2)(0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + \ + sum = (0); \ + sqr = (0); \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + float4 mean, vari; \ + mean.x = sum * dimRatio; \ + vari.x = sqr * dimRatio; \ + vari.x = vari.x - mean.x * mean.x; \ + \ + half4 tmpMean, tmpVari; \ + vxc_half8 tmpVal; \ + vxc_short8 dst; \ + _viv_asm(CONV, tmpMean, mean); \ + _viv_asm(CONV, tmpVari, vari); \ + VXC_DP2x8(tmpVal, tmpMean, tmpVari, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \ + _viv_asm(COPY, dst, tmpVal, 16); \ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +MOMENTS_AXIS01_QINT_2D(U8, vxc_uchar16) +MOMENTS_AXIS01_QINT_2D(I8, vxc_char16) + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_F16toF16( + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + vxc_half8 in_h; + vxc_float4 sumsqr; + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(coord.x = gidx; coord.x < width; coord.x += 128) + { + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSumSqr += sumsqr; + } + } + + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(gidz, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0.0f; + float sqr = 0.0f; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + half4 tmpMean, tmpVari; + vxc_half8 tmpVal; + vxc_short8 dst; + _viv_asm(CONV, tmpMean, mean); + _viv_asm(CONV, tmpVari, vari); + VXC_DP2x8(tmpVal, tmpMean, tmpVari, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_F16toF16_2D( + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int2 coord = (int2)(gidx, 0); + vxc_short8 src0; + vxc_half8 in_h; + vxc_float4 sumsqr; + vxc_float4 tmpSumSqr 
= (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(coord.x = gidx; coord.x < width; coord.x += 128) + { + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSumSqr += sumsqr; + } + } + + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0.0f; + float sqr = 0.0f; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + half4 tmpMean, tmpVari; + vxc_half8 tmpVal; + vxc_short8 dst; + _viv_asm(CONV, tmpMean, mean); + _viv_asm(CONV, tmpVari, vari); + VXC_DP2x8(tmpVal, tmpMean, tmpVari, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_I16toF16( + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + float4 sumsqr; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(coord.x = gidx; coord.x < width; coord.x += 128) + { + float tmpSum = 0; + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSum += sumsqr.x; + sqr += (sumsqr.y * e2InScale + zpScaleSqr_i16 + zpScale2_i16 * sumsqr.x); + } + sum += tmpSum * input_scale + sumScale_i16; + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(gidz, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0.0f; + sqr = 0.0f; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + half4 tmpMean, tmpVari; + vxc_half8 tmpVal; + vxc_short8 dst; + _viv_asm(CONV, tmpMean, mean); + _viv_asm(CONV, tmpVari, vari); + VXC_DP2x8(tmpVal, tmpMean, tmpVari, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void 
moments_axis01_I16toF16_2D( + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int2 coord = (int2)(gidx, 0); + vxc_short8 src0; + float4 sumsqr; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(coord.x = gidx; coord.x < width; coord.x += 128) + { + float tmpSum = 0; + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSum += sumsqr.x; + sqr += (sumsqr.y * e2InScale + zpScaleSqr_i16 + zpScale2_i16 * sumsqr.x); + } + sum += tmpSum * input_scale + sumScale_i16; + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0.0f; + sqr = 0.0f; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 mean_vari; + mean_vari.x = sum * dimRatio; + mean_vari.y = sqr * dimRatio; + mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x; + + half4 tmpMean, tmpVari; + vxc_half8 tmpVal; + vxc_short8 dst; + _viv_asm(CONV, tmpMean, mean_vari); + //_viv_asm(CONV, tmpVari, vari); + VXC_DP2x8(tmpVal, tmpMean, tmpMean, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis012.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis012.vx new file mode 100644 index 0000000..6afb0a5 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis012.vx @@ -0,0 +1,239 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; +_viv_uniform float dimRatio; +_viv_uniform float zpScaleSqr_i16; +_viv_uniform float zpScale2_i16; +_viv_uniform float sumScale_i16; + +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform float e2InScale; +_viv_uniform float rowSumScale; +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; + +#define MOMENTS_AXIS012_QINT(src0_type_name, read0_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_##src0_type_name##toF16( \ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidx = get_global_id(0) << 4; \ + int lidx = get_local_id(0); \ + int4 coord = (int4)(gidx, 0, 0, 0); \ + read0_type src0; \ + float sum = 0, sqr = 0; \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + \ + for(coord.z = 0; coord.z < channel; coord.z++) \ + { \ + for(coord.x = gidx; coord.x < width; coord.x += 256) \ + { \ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ 
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \ + tmpSum += (tmpSum1); \ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); \ + } \ + sqr += (tmpSqr * e2InScale + rowSumScale); \ + sum += (tmpSum + sumInZp) * input_scale; \ + } \ + } \ + lcl_sum[lidx] = sum; \ + lcl_sqr[lidx] = sqr; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + int2 coord_out = (int2)(0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + \ + sum = (0); \ + sqr = (0); \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + float4 mean, vari; \ + mean.x = sum * dimRatio; \ + vari.x = sqr * dimRatio; \ + vari.x = vari.x - mean.x * mean.x; \ + \ + half4 tmpMean, tmpVari; \ + vxc_half8 tmpVal; \ + vxc_short8 dst; \ + _viv_asm(CONV, tmpMean, mean); \ + _viv_asm(CONV, tmpVari, vari); \ + VXC_DP2x8(tmpVal, tmpMean, tmpVari, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \ + _viv_asm(COPY, dst, tmpVal, 16); \ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + } \ +} + +MOMENTS_AXIS012_QINT(U8, vxc_uchar16) +MOMENTS_AXIS012_QINT(I8, vxc_char16) + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_F16toF16( + image2d_array_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, + int axis_num) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int4 coord = (int4)(gidx, 0, 0, 0); + vxc_short8 src0; + vxc_half8 in_h; + vxc_float4 sumsqr; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + for(coord.z = 0; coord.z < channel; coord.z++) + { + for(coord.x = gidx; coord.x < width; coord.x += 128) + { + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSumSqr += sumsqr; + } + } + } + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = (float)(0); + float sqr = (float)(0); + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + half4 tmpMean, tmpVari; + vxc_half8 tmpVal; + vxc_short8 dst; + _viv_asm(CONV, tmpMean, mean); + _viv_asm(CONV, tmpVari, vari); + VXC_DP2x8(tmpVal, tmpMean, tmpVari, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } +} + 
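+// The I16 kernel below reuses the two-stage reduction of the F16/U8 moments
+// kernels above: each of the 16 work-items (reqd_work_group_size(16, 1, 1))
+// accumulates a partial sum and sum-of-squares into __local lcl_sum/lcl_sqr,
+// then work-item 0 folds the 16 partials with four float4 dot products.
+// The outputs follow mean = dimRatio * sum and vari = dimRatio * sqr - mean * mean.
+// For the I16 path the quantization scale and zero-point correction terms are
+// pre-folded by the host into input_scale, e2InScale, zpScale2_i16,
+// zpScaleSqr_i16 and sumScale_i16 (inferred here from their usage), so the
+// inner loop can accumulate raw VXC_DP8x2 sums without dequantizing per element.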
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_I16toF16( + image2d_array_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, + int axis_num) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int4 coord = (int4)(gidx, 0, 0, 0); + vxc_short8 src0; + float4 sumsqr; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + for(coord.z = 0; coord.z < channel; coord.z++) + { + for(coord.x = gidx; coord.x < width; coord.x += 128) + { + float tmpSum = 0; + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSum += sumsqr.x; + sqr += (sumsqr.y * e2InScale + zpScaleSqr_i16 + zpScale2_i16 * sumsqr.x); + } + sum += tmpSum * input_scale + sumScale_i16; + } + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = (float)(0); + sqr = (float)(0); + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + half4 tmpMean, tmpVari; + vxc_half8 tmpVal; + vxc_short8 dst; + _viv_asm(CONV, tmpMean, mean); + _viv_asm(CONV, tmpVari, vari); + VXC_DP2x8(tmpVal, tmpMean, tmpVari, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis1.vx new file mode 100644 index 0000000..dffa293 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis1.vx @@ -0,0 +1,185 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int height; +_viv_uniform float dimRatio; + +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform float e2InScale; + +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; + +#define MOMENTS_AXIS1_QINT(src0_type_name, read0_type) \ +__kernel void moments_axis1_##src0_type_name##toF16( \ + image2d_array_t input, \ + image2d_t output_mean, \ + image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidx = get_global_id(0); \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(gidx, 0, gidz, 0); \ + read0_type src0; \ + float4 sum = 0, sqr = 0; \ + short zp = inputZP;\ + float4 tmpData0;\ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); \ + sum += (tmpData0); \ + sqr += (tmpData0 * tmpData0); \ + } \ + sum *= input_scale; \ + sqr *= e2InScale; \ + \ + float4 mean = sum * 
dimRatio; \ + float4 vari = sqr * dimRatio; \ + vari = vari - mean * mean; \ + \ + int2 coord_out = (int2)(gidx, gidz); \ + half4 tmpMean, tmpVari; \ + vxc_half8 tmpVal; \ + vxc_short8 dst; \ + _viv_asm(CONV, tmpMean, mean); \ + _viv_asm(CONV, tmpVari, vari); \ + VXC_DP2x8(tmpVal, tmpMean, tmpVari, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \ + _viv_asm(COPY, dst, tmpVal, 16); \ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} + +MOMENTS_AXIS1_QINT(U8, vxc_uchar16) +MOMENTS_AXIS1_QINT(I8, vxc_char16) +MOMENTS_AXIS1_QINT(I16, vxc_short8) + +#define MOMENTS_AXIS1_QINT_2D(src0_type_name, read0_type) \ +__kernel void moments_axis1_##src0_type_name##toF16_2D( \ + image2d_t input, \ + image2d_t output_mean, \ + image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidx = get_global_id(0); \ + int2 coord = (int2)(gidx, 0); \ + read0_type src0; \ + float4 sum = 0, sqr = 0; \ + short zp = inputZP;\ + float4 tmpData0;\ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); \ + sum += (tmpData0); \ + sqr += (tmpData0 * tmpData0); \ + } \ + sum *= input_scale; \ + sqr *= e2InScale; \ + \ + float4 mean = sum * dimRatio; \ + float4 vari = sqr * dimRatio; \ + vari = vari - mean * mean; \ + \ + int2 coord_out = (int2)(gidx, 0); \ + half4 tmpMean, tmpVari; \ + vxc_half8 tmpVal; \ + vxc_short8 dst; \ + _viv_asm(CONV, tmpMean, mean); \ + _viv_asm(CONV, tmpVari, vari); \ + VXC_DP2x8(tmpVal, tmpMean, tmpVari, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \ + _viv_asm(COPY, dst, tmpVal, 16); \ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +MOMENTS_AXIS1_QINT_2D(U8, vxc_uchar16) +MOMENTS_AXIS1_QINT_2D(I8, vxc_char16) +MOMENTS_AXIS1_QINT_2D(I16, vxc_short8) + +__kernel void moments_axis1_F16toF16( + image2d_array_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, int axis_num) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + vxc_half8 in_h0; + vxc_float4 tmpSrc0; + vxc_float4 sum = (vxc_float4)(0); + vxc_float4 sqr = (vxc_float4)(0); + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, in_h0, src0, 16); + VXC_DP4x4(tmpSrc0, in_h0, in_h0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + sum += tmpSrc0; + sqr += (tmpSrc0 * tmpSrc0); + } + + vxc_float4 mean = sum * dimRatio; + vxc_float4 vari = sqr * dimRatio; + vari = vari - mean * mean; + + int2 coord_out = (int2)(gidx, gidz); + half4 tmpMean, tmpVari; + vxc_half8 tmpVal; + vxc_short8 dst; + _viv_asm(CONV, tmpMean, mean); + _viv_asm(CONV, tmpVari, vari); + VXC_DP2x8(tmpVal, tmpMean, tmpVari, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + 
VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void moments_axis1_F16toF16_2D( + image2d_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, int axis_num) +{ + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, 0); + vxc_short8 src0; + vxc_half8 in_h0; + vxc_float4 tmpSrc0; + vxc_float4 sum = (vxc_float4)(0); + vxc_float4 sqr = (vxc_float4)(0); + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, in_h0, src0, 16); + VXC_DP4x4(tmpSrc0, in_h0, in_h0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + sum += tmpSrc0; + sqr += (tmpSrc0 * tmpSrc0); + } + + vxc_float4 mean = sum * dimRatio; + vxc_float4 vari = sqr * dimRatio; + vari = vari - mean * mean; + + int2 coord_out = (int2)(gidx, 0); + half4 tmpMean, tmpVari; + vxc_half8 tmpVal; + vxc_short8 dst; + _viv_asm(CONV, tmpMean, mean); + _viv_asm(CONV, tmpVari, vari); + VXC_DP2x8(tmpVal, tmpMean, tmpVari, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis2.vx new file mode 100644 index 0000000..c47c34f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis2.vx @@ -0,0 +1,100 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int channel; +_viv_uniform float dimRatio; + +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform float e2InScale; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; + +#define MOMENTS_AXIS2_QINT(src0_type_name, read0_type) \ +__kernel void moments_axis2_##src0_type_name##toF16( \ + image2d_array_t input, \ + image2d_t output_mean, \ + image2d_t output_vari, \ + int axis, int axis_num) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int4 coord = (int4)(gidx, gidy, 0, 0); \ + read0_type src0; \ + float4 sum = 0, sqr = 0; \ + short zp = inputZP;\ + float4 tmpData0;\ + \ + for(coord.z = 0; coord.z < channel; coord.z++) \ + { \ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); \ + sum += (tmpData0); \ + sqr += (tmpData0 * tmpData0); \ + } \ + sum *= input_scale; \ + sqr *= e2InScale; \ + \ + float4 mean = sum * dimRatio; \ + float4 vari = sqr * dimRatio; \ + vari = vari - mean * mean; \ + \ + int2 coord_out = (int2)(gidx, gidy); \ + half4 tmpMean, tmpVari; \ + vxc_half8 tmpVal; \ + vxc_short8 dst; \ + _viv_asm(CONV, tmpMean, mean); \ + _viv_asm(CONV, tmpVari, vari); \ + VXC_DP2x8(tmpVal, tmpMean, tmpVari, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \ + _viv_asm(COPY, dst, tmpVal, 16); \ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} + +MOMENTS_AXIS2_QINT(U8, vxc_uchar16) 
+MOMENTS_AXIS2_QINT(I8, vxc_char16) +MOMENTS_AXIS2_QINT(I16, vxc_short8) + +__kernel void moments_axis2_F16toF16( + image2d_array_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, + int axis_num) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord = (int4)(gidx, gidy, 0, 0); + vxc_short8 src0; + vxc_half8 in_h0; + vxc_float4 tmpSrc0; + vxc_float4 sum = (vxc_float4)(0); + vxc_float4 sqr = (vxc_float4)(0); + + for(coord.z = 0; coord.z < channel; coord.z++) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, in_h0, src0, 16); + VXC_DP4x4(tmpSrc0, in_h0, in_h0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + sum += tmpSrc0; + sqr += (tmpSrc0 * tmpSrc0); + } + + vxc_float4 mean = sum * dimRatio; + vxc_float4 vari = sqr * dimRatio; + vari = vari - mean * mean; + + int2 coord_out = (int2)(gidx, gidy); + half4 tmpMean, tmpVari; + vxc_half8 tmpVal; + vxc_short8 dst; + _viv_asm(CONV, tmpMean, mean); + _viv_asm(CONV, tmpVari, vari); + VXC_DP2x8(tmpVal, tmpMean, tmpVari, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/poolwithargmax_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/poolwithargmax_F16.vx new file mode 100644 index 0000000..e2ad91d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/poolwithargmax_F16.vx @@ -0,0 +1,117 @@ +#include "cl_viv_vx_ext.h" + +//-------------------max pooling with argmax--------------- +_viv_uniform VXC_512Bits poolingEncode; +_viv_uniform VXC_512Bits uniQuantInOutInt16Even_4x4; + +#define POOLWITHARGMAX_F16_TO_F16_U8_PROCESS(read_fun, write_fun) \ + vxc_short8 din0, din1, maxData, src0, src1; \ + vxc_half8 din0Fp16, din1Fp16; \ + vxc_half8 maxDataVer, maxDataVer1; \ + int4 bitExtractCoeff; \ + vxc_short8 din0EqualTmp, din1EqualTmp; \ + vxc_uchar8 din0Equal, din1Equal; \ + vxc_uchar4 axisEncode; \ + vxc_uchar4 axisOut; \ + read_fun(src0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(src1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, din0Fp16, src0, 16); \ + _viv_asm(COPY, din1Fp16, src1, 16); \ + VXC_VertMax3_Half(maxDataVer, din0Fp16, din1Fp16, din1Fp16, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, din0, maxDataVer, 16); \ + din1 = din0.s10325476; \ + _viv_asm(COPY, maxDataVer1, din1, 16); \ + VXC_VertMax3_Half(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, din0, maxDataVer, 16); \ + din1 = din0.s02460246; \ + _viv_asm(COPY, maxData, maxDataVer, 16); \ + vxc_short8 one = (vxc_short8)(1, 1, 1, 1, 1, 1, 1, 1); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + din0EqualTmp = src0 == maxData ? one : zero; \ + din1EqualTmp = src1 == maxData ? 
one : zero; \ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 3, 0), poolingEncode); \ + axisOut = clz(axisEncode); \ + write_fun(tensorOut, coordOut, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + +__kernel void poolwithargmax_F16to_F16_U8 + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0); + POOLWITHARGMAX_F16_TO_F16_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void poolwithargmax_F16to_F16_U8_2D + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1); + POOLWITHARGMAX_F16_TO_F16_U8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +#define POOLWITHARGMAX_F16_TO_I16_U8_PROCESS(read_fun, write_fun) \ + vxc_short8 din0, din1, maxData, src0, src1; \ + vxc_half8 din0Fp16, din1Fp16; \ + vxc_half8 maxDataVer, maxDataVer1; \ + int4 bitExtractCoeff; \ + vxc_short8 din0EqualTmp, din1EqualTmp; \ + vxc_uchar8 din0Equal, din1Equal; \ + vxc_uchar4 axisEncode; \ + vxc_uchar4 axisOut; \ + read_fun(src0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(src1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, din0Fp16, src0, 16); \ + _viv_asm(COPY, din1Fp16, src1, 16); \ + VXC_VertMax3_Half(maxDataVer, din0Fp16, din1Fp16, din1Fp16, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, din0, maxDataVer, 16); \ + din1 = din0.s10325476; \ + _viv_asm(COPY, maxDataVer1, din1, 16); \ + VXC_VertMax3_Half(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, din0, maxDataVer, 16); \ + VXC_DP4x4(din1, din0, din0, VXC_MODIFIER_BIN(0, 3, 0), uniQuantInOutInt16Even_4x4); \ + _viv_asm(COPY, maxData, maxDataVer, 16); \ + vxc_short8 one = (vxc_short8)(1, 1, 1, 1, 1, 1, 1, 1); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + din0EqualTmp = src0 == maxData ? one : zero; \ + din1EqualTmp = src1 == maxData ? 
one : zero; \ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 3, 0), poolingEncode); \ + axisOut = clz(axisEncode); \ + write_fun(tensorOut, coordOut, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + +__kernel void poolwithargmax_F16to_I16_U8 + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0); + POOLWITHARGMAX_F16_TO_I16_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void poolwithargmax_F16to_I16_U8_2D + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1); + POOLWITHARGMAX_F16_TO_I16_U8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/poolwithargmax_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/poolwithargmax_I16.vx new file mode 100644 index 0000000..909b221 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/poolwithargmax_I16.vx @@ -0,0 +1,255 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits poolingEncode2; + + +#define POOLWITHARGMAX_I16_TO_I16_U8_SAME_PROCESS(read_fun, write_fun) \ + vxc_short8 din0, din1; \ + vxc_short8 din0Fp16, din1Fp16; \ + vxc_short8 maxDataVer, maxDataVer1; \ + int4 bitExtractCoeff; \ + vxc_short8 din0EqualTmp, din1EqualTmp; \ + vxc_uchar8 din0Equal, din1Equal; \ + vxc_uchar4 axisEncode; \ + vxc_uchar4 axisOut; \ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, din0Fp16, din0, 16); \ + _viv_asm(COPY, din1Fp16, din1, 16); \ + VXC_VertMax3_Integer(maxDataVer, din0Fp16, din1Fp16, din1Fp16, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, din0, maxDataVer, 16); \ + din1 = din0.s10325476; \ + _viv_asm(COPY, maxDataVer1, din1, 16); \ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, din0, maxDataVer, 16); \ + din1 = din0.s02460246; \ + write_fun(tensorOut, coordOut, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_Clamp(din0EqualTmp, din0Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + VXC_Clamp(din1EqualTmp, din1Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + bitExtractCoeff = (int4)(0x30201000, 0x70605040, 0x01010101, 0x01010101); \ + VXC_BitExtract(din0Equal, din0EqualTmp, din0EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \ + VXC_BitExtract(din1Equal, din1EqualTmp, din1EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \ + VXC_DP4x4(axisEncode, din0Equal, din1Equal, VXC_MODIFIER_BIN(0, 3, 0), poolingEncode2); \ + axisOut = clz(axisEncode); \ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + +__kernel void poolwithargmax_I16to_I16_U8_SAME + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0); + POOLWITHARGMAX_I16_TO_I16_U8_SAME_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void 
poolwithargmax_I16to_I16_U8_SAME_2D + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1); + POOLWITHARGMAX_I16_TO_I16_U8_SAME_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +_viv_uniform VXC_512Bits uniQuantInOutInt16Even_4x4; + +#define POOLWITHARGMAX_I16_TO_I16_U8_PROCESS(read_fun, write_fun) \ + vxc_short8 din0, din1; \ + vxc_short8 din0Fp16, din1Fp16; \ + vxc_short8 maxDataVer, maxDataVer1; \ + int4 bitExtractCoeff; \ + vxc_short8 din0EqualTmp, din1EqualTmp; \ + vxc_uchar8 din0Equal, din1Equal; \ + vxc_uchar4 axisEncode; \ + vxc_uchar4 axisOut; \ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, din0Fp16, din0, 16); \ + _viv_asm(COPY, din1Fp16, din1, 16); \ + VXC_VertMax3_Integer(maxDataVer, din0Fp16, din1Fp16, din1Fp16, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, din0, maxDataVer, 16); \ + din1 = din0.s10325476; \ + _viv_asm(COPY, maxDataVer1, din1, 16); \ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, din0, maxDataVer, 16); \ + VXC_DP4x4(din1, din0, din0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniQuantInOutInt16Even_4x4); \ + write_fun(tensorOut, coordOut, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_Clamp(din0EqualTmp, din0Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + VXC_Clamp(din1EqualTmp, din1Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + bitExtractCoeff = (int4)(0x30201000, 0x70605040, 0x01010101, 0x01010101); \ + VXC_BitExtract(din0Equal, din0EqualTmp, din0EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \ + VXC_BitExtract(din1Equal, din1EqualTmp, din1EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \ + VXC_DP4x4(axisEncode, din0Equal, din1Equal, VXC_MODIFIER_BIN(0, 3, 0), poolingEncode2); \ + axisOut = clz(axisEncode); \ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + +__kernel void poolwithargmax_I16to_I16_U8 + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0); + POOLWITHARGMAX_I16_TO_I16_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void poolwithargmax_I16to_I16_U8_2D + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1); + POOLWITHARGMAX_I16_TO_I16_U8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + + +#define POOLWITHARGMAX_I16_TO_I16_I16_PROCESS(read_fun, write_fun) \ + vxc_short8 din0, din1; \ + vxc_short8 din0Fp16, din1Fp16; \ + vxc_short8 maxDataVer, maxDataVer1; \ + int4 bitExtractCoeff; \ + vxc_short8 din0EqualTmp, din1EqualTmp; \ + vxc_uchar8 din0Equal, din1Equal; \ + vxc_uchar4 axisEncode; \ + vxc_uchar4 axisOut; \ + vxc_short4 axisVal; \ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, din0Fp16, din0, 16); \ + 
_viv_asm(COPY, din1Fp16, din1, 16); \ + VXC_VertMax3_Integer(maxDataVer, din0Fp16, din1Fp16, din1Fp16, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, din0, maxDataVer, 16); \ + din1 = din0.s10325476; \ + _viv_asm(COPY, maxDataVer1, din1, 16); \ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, din0, maxDataVer, 16); \ + VXC_DP4x4(din1, din0, din0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniQuantInOutInt16Even_4x4); \ + write_fun(tensorOut, coordOut, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_Clamp(din0EqualTmp, din0Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + VXC_Clamp(din1EqualTmp, din1Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + bitExtractCoeff = (int4)(0x30201000, 0x70605040, 0x01010101, 0x01010101); \ + VXC_BitExtract(din0Equal, din0EqualTmp, din0EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \ + VXC_BitExtract(din1Equal, din1EqualTmp, din1EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \ + VXC_DP4x4(axisEncode, din0Equal, din1Equal, VXC_MODIFIER_BIN(0, 3, 0), poolingEncode2); \ + axisOut = clz(axisEncode); \ + axisVal = convert_short4(axisOut); \ + write_fun(axis, coordOut, axisVal, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + +__kernel void poolwithargmax_I16to_I16_I16 + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0); + POOLWITHARGMAX_I16_TO_I16_I16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void poolwithargmax_I16to_I16_I16_2D + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1); + POOLWITHARGMAX_I16_TO_I16_I16_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +_viv_uniform VXC_512Bits uniConvertDirInt16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; +_viv_uniform float input_fl_scale_i16; +_viv_uniform VXC_512Bits uniPackHalf8_2x8_2; + +#define POOLWITHARGMAX_I16_TO_F16_U8_PROCESS(read_fun, write_fun) \ + vxc_short8 din0, din1; \ + vxc_short8 din0Fp16, din1Fp16; \ + vxc_short8 maxDataVer, maxDataVer1; \ + int4 bitExtractCoeff; \ + vxc_short8 din0EqualTmp, din1EqualTmp; \ + vxc_uchar8 din0Equal, din1Equal; \ + vxc_uchar4 axisEncode; \ + vxc_uchar4 axisOut; \ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, din0Fp16, din0, 16); \ + _viv_asm(COPY, din1Fp16, din1, 16); \ + VXC_VertMax3_Integer(maxDataVer, din0Fp16, din1Fp16, din1Fp16, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, din0, maxDataVer, 16); \ + din1 = din0.s10325476; \ + _viv_asm(COPY, maxDataVer1, din1, 16); \ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 7, 0)); \ + _viv_asm(COPY, din0, maxDataVer, 16); \ + din1 = din0.s02460246; \ + vxc_float4 tmpVal0, tmpVal1, tmpVal2, tmpVal3; \ + half4 tmpOut0, tmpOut1; \ + vxc_half8 tmpPack; \ + VXC_DP4x4(tmpVal0, din1, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt16Fp32_4x4); \ + VXC_DP4x4(tmpVal2, din1, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt16Fp32_4x4); \ + tmpVal1 = tmpVal0 
* input_fl_scale_i16; \ + _viv_asm(CONV, tmpOut0, tmpVal1); \ + tmpVal3 = tmpVal2 * input_fl_scale_i16; \ + _viv_asm(CONV, tmpOut1, tmpVal3); \ + VXC_DP2x8(tmpPack, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackHalf8_2x8_2); \ + _viv_asm(COPY, din1, tmpPack, 16); \ + VXC_Clamp(din0EqualTmp, din0Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + VXC_Clamp(din1EqualTmp, din1Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + bitExtractCoeff = (int4)(0x30201000, 0x70605040, 0x01010101, 0x01010101); \ + VXC_BitExtract(din0Equal, din0EqualTmp, din0EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \ + VXC_BitExtract(din1Equal, din1EqualTmp, din1EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \ + VXC_DP4x4(axisEncode, din0Equal, din1Equal, VXC_MODIFIER_BIN(0, 3, 0), poolingEncode2); \ + axisOut = clz(axisEncode); \ + write_fun(tensorOut, coordOut, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + +__kernel void poolwithargmax_I16to_F16_U8 + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0); + POOLWITHARGMAX_I16_TO_F16_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void poolwithargmax_I16to_F16_U8_2D + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1); + POOLWITHARGMAX_I16_TO_F16_U8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/poolwithargmax_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/poolwithargmax_I8.vx new file mode 100644 index 0000000..dbe902f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/poolwithargmax_I8.vx @@ -0,0 +1,185 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int input_ZP; +_viv_uniform float inputScale; +_viv_uniform VXC_512Bits uniPackHalf8_2x8; +_viv_uniform VXC_512Bits uniU8EvenBinSubZP_MulM_2x8; +_viv_uniform VXC_512Bits uniS16AddOutZP_2x8; +_viv_uniform vxc_uint4 packed_outputZP; +_viv_uniform VXC_512Bits poolingEncodeInt8_0; +_viv_uniform VXC_512Bits poolingEncodeInt8_1; + +#define POOLWITHARGMAX_I8_TO_I8_U8_PROCESS(read_fun, write_fun) \ + vxc_char16 din0, din1; \ + vxc_char16 maxDataVer, maxDataVer1; \ + int4 bitExtractCoeff; \ + vxc_char16 din0EqualTmp, din1EqualTmp; \ + vxc_uchar8 axisEncode; \ + vxc_uchar8 axisOut; \ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_VertMax3_Integer(maxDataVer, din0, din1, din1, VXC_MODIFIER_BIN(0, 15, 0)); \ + maxDataVer1 = maxDataVer.s1032547698badcfe; \ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 15, 0)); \ + vxc_short8 tmp; \ + short zp = input_ZP; \ + VXC_DP2x8(tmp, maxDataVer, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8EvenBinSubZP_MulM_2x8); \ + vxc_char16 packed_outZP; \ + _viv_asm(COPY, packed_outZP, packed_outputZP, 16); \ + VXC_DP2x8(maxDataVer1, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniS16AddOutZP_2x8); \ + write_fun(tensorOut, coordOut, maxDataVer1,\ + VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); \ + VXC_Clamp(din0EqualTmp, din0, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + VXC_Clamp(din1EqualTmp, din1, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + din0EqualTmp &= (vxc_char16)(1); \ + din1EqualTmp &= (vxc_char16)(1); \ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 3, 0), poolingEncodeInt8_0); \ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(4, 7, 0), poolingEncodeInt8_1); \ + axisOut = clz(axisEncode); \ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void poolwithargmax_I8to_I8_U8 + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0); + POOLWITHARGMAX_I8_TO_I8_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void poolwithargmax_I8to_I8_U8_2D + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1); + POOLWITHARGMAX_I8_TO_I8_U8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +#define POOLWITHARGMAX_I8_TO_I8_U8_SAME_PROCESS(read_fun, write_fun) \ + vxc_char16 din0, din1; \ + vxc_char16 maxDataVer, maxDataVer1; \ + int4 bitExtractCoeff; \ + vxc_char16 din0EqualTmp, din1EqualTmp; \ + vxc_uchar8 axisEncode; \ + vxc_uchar8 axisOut; \ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_VertMax3_Integer(maxDataVer, din0, din1, din1, VXC_MODIFIER_BIN(0, 15, 0)); \ + maxDataVer1 = maxDataVer.s1032547698badcfe; \ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 15, 0)); \ + maxDataVer1 = maxDataVer.s02468ace02468ace; \ + write_fun(tensorOut, coordOut, maxDataVer1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_Clamp(din0EqualTmp, din0, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + VXC_Clamp(din1EqualTmp, din1, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + din0EqualTmp &= (vxc_char16)(1); \ + din1EqualTmp &= (vxc_char16)(1); \ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 3, 0), poolingEncodeInt8_0); \ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(4, 7, 0), poolingEncodeInt8_1); \ + axisOut = clz(axisEncode); \ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + +__kernel void poolwithargmax_I8to_I8_U8_SAME + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0); + POOLWITHARGMAX_I8_TO_I8_U8_SAME_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void poolwithargmax_I8to_I8_U8_SAME_2D + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1); + POOLWITHARGMAX_I8_TO_I8_U8_SAME_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +_viv_uniform VXC_512Bits uniConvertEvenU8ToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertEvenU8SubZpToFp32_4x4; + 
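+/*
+ * Reviewer note (added comment, not part of the generated kernel source):
+ * a sketch of how the macro below appears to work, based on reading the code;
+ * the VXC_* intrinsics and uniform tables are Vivante-specific and their exact
+ * semantics are assumed here. The macro implements 2x2 max pooling with argmax
+ * for an int8 input, writing an fp16 value plane and a uint8 index plane:
+ *   1. Two adjacent input rows are read; VXC_VertMax3_Integer takes the
+ *      vertical max, and a second pass against the pair-swapped copy
+ *      (.s1032547698badcfe) takes the horizontal max, leaving the 2x2 window
+ *      maximum in every lane.
+ *   2. The window maxima are dequantized to float through the
+ *      uniConvertEvenU8*_4x4 dot-product tables using input_ZP and inputScale,
+ *      then packed into half precision for the output tensor.
+ *   3. VXC_Clamp with min == max == the pooled maximum flags the positions
+ *      that equal the max; the poolingEncodeInt8_* tables pack those match
+ *      bits per window, and clz() converts the packed bits into the argmax
+ *      index written to the axis tensor.
+ */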
+#define POOLWITHARGMAX_I8_TO_F16_U8_PROCESS(read_fun, write_fun) \ + vxc_char16 din0, din1; \ + vxc_char16 maxDataVer, maxDataVer1; \ + vxc_char16 din0EqualTmp, din1EqualTmp; \ + vxc_uchar8 axisEncode; \ + vxc_uchar8 axisOut; \ + vxc_short8 result; \ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_VertMax3_Integer(maxDataVer, din0, din1, din1, VXC_MODIFIER_BIN(0, 15, 0)); \ + maxDataVer1 = maxDataVer.s1032547698badcfe; \ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 15, 0)); \ + vxc_float4 tmpVal0, tmpVal1, tmpVal2, tmpVal3; \ + half4 tmpOut0, tmpOut1; \ + vxc_half8 tmpPack; \ + short zp = input_ZP; \ + VXC_DP4x4(tmpVal0, maxDataVer, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEvenU8ToFp32_4x4); \ + VXC_DP4x4(tmpVal2, maxDataVer, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEvenU8SubZpToFp32_4x4); \ + tmpVal1 = tmpVal0 * inputScale; \ + _viv_asm(CONV, tmpOut0, tmpVal1); \ + tmpVal3 = tmpVal2 * inputScale; \ + _viv_asm(CONV, tmpOut1, tmpVal3); \ + VXC_DP2x8(tmpPack, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniPackHalf8_2x8); \ + _viv_asm(COPY, result, tmpPack, 16); \ + write_fun(tensorOut, coordOut, result,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_Clamp(din0EqualTmp, din0, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + VXC_Clamp(din1EqualTmp, din1, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + din0EqualTmp &= (vxc_char16)(1); \ + din1EqualTmp &= (vxc_char16)(1); \ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 3, 0), poolingEncodeInt8_0); \ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(4, 7, 0), poolingEncodeInt8_1); \ + axisOut = clz(axisEncode); \ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + +__kernel void poolwithargmax_I8to_F16_U8 + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0); + POOLWITHARGMAX_I8_TO_F16_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void poolwithargmax_I8to_F16_U8_2D + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1); + POOLWITHARGMAX_I8_TO_F16_U8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/poolwithargmax_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/poolwithargmax_U8.vx new file mode 100644 index 0000000..6cfbaeb --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/poolwithargmax_U8.vx @@ -0,0 +1,205 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int input_ZP; +_viv_uniform VXC_512Bits uniU8EvenBinSubZP_MulM_2x8; +_viv_uniform VXC_512Bits uniEncodeUint8_4x8; +_viv_uniform VXC_512Bits uniS16AddOutZP_2x8; +_viv_uniform vxc_uint4 packed_outputZP; + + +#define POOLWITHARGMAX_U8_TO_U8_U8_PROCESS(read_fun, write_fun) \ + vxc_uchar16 din0, din1; \ + vxc_uchar16 maxDataVer, maxDataVer1; \ + vxc_uchar16 din0EqualTmp, din1EqualTmp; \ + vxc_uchar8 axisEncode; \ + vxc_uchar8 axisOut; \ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\ 
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + maxDataVer = max(din0, din1); \ + maxDataVer1 = maxDataVer.s1032547698badcfe; \ + maxDataVer = max(maxDataVer1, maxDataVer); \ + vxc_short8 tmp; \ + uchar zp = input_ZP; \ + VXC_DP2x8(tmp, maxDataVer, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8EvenBinSubZP_MulM_2x8); \ + vxc_uchar16 packed_outZP; \ + _viv_asm(COPY, packed_outZP, packed_outputZP, 16); \ + VXC_DP2x8(maxDataVer1, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniS16AddOutZP_2x8); \ + write_fun(tensorOut, coordOut, maxDataVer1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_Clamp(din0EqualTmp, din0, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + VXC_Clamp(din1EqualTmp, din1, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + din0EqualTmp &= (vxc_uchar16)(1); \ + din1EqualTmp &= (vxc_uchar16)(1); \ + VXC_DP4x8(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 7, 0), uniEncodeUint8_4x8); \ + axisOut = clz(axisEncode); \ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void poolwithargmax_U8to_U8_U8 + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0); + POOLWITHARGMAX_U8_TO_U8_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void poolwithargmax_U8to_U8_U8_2D + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1); + POOLWITHARGMAX_U8_TO_U8_U8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +_viv_uniform float inputScale; +_viv_uniform VXC_512Bits uniConvertUint8ToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertSubZpUint8Fp32_4x4; +_viv_uniform VXC_512Bits uniPackHalf2Short_2x8; +_viv_uniform VXC_512Bits uniExtractHalf2Short_2x8; +_viv_uniform VXC_512Bits uniPackHalf8_2x8; +_viv_uniform VXC_512Bits uniConvertEvenU8ToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertEvenU8SubZpToFp32_4x4; + +#define POOLWITHARGMAX_U8_TO_F16_U8_PROCESS(read_fun, write_fun) \ + vxc_uchar16 din0, din1; \ + vxc_uchar16 maxDataVer, maxDataVer1; \ + int4 bitExtractCoeff; \ + vxc_uchar16 din0EqualTmp, din1EqualTmp; \ + vxc_uchar8 axisEncode; \ + vxc_uchar8 axisOut; \ + vxc_short8 result; \ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_VertMax3_Integer(maxDataVer, din0, din1, din1, VXC_MODIFIER_BIN(0, 15, 0)); \ + maxDataVer1 = maxDataVer.s1032547698badcfe; \ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer,\ + maxDataVer, VXC_MODIFIER_BIN(0, 15, 0)); \ + vxc_float4 tmpVal0, tmpVal1, tmpVal2, tmpVal3; \ + half4 tmpOut0, tmpOut1; \ + vxc_half8 tmpPack; \ + vxc_short4 tmpOut2, tmpOut3; \ + uchar zp = input_ZP; \ + VXC_DP4x4(tmpVal0, maxDataVer, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEvenU8ToFp32_4x4); \ + VXC_DP4x4(tmpVal2, maxDataVer, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEvenU8SubZpToFp32_4x4); \ + tmpVal1 = tmpVal0 * inputScale; \ + _viv_asm(CONV, tmpOut0, tmpVal1); \ + tmpVal3 = tmpVal2 * 
inputScale; \ + _viv_asm(CONV, tmpOut1, tmpVal3); \ + VXC_DP2x8(tmpPack, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniPackHalf8_2x8); \ + _viv_asm(COPY, result, tmpPack, 16); \ + write_fun(tensorOut, coordOut, result, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_Clamp(din0EqualTmp, din0, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + VXC_Clamp(din1EqualTmp, din1, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + din0EqualTmp &= (vxc_uchar16)(1); \ + din1EqualTmp &= (vxc_uchar16)(1); \ + VXC_DP4x8(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 7, 0), uniEncodeUint8_4x8); \ + axisOut = clz(axisEncode); \ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void poolwithargmax_U8to_F16_U8 + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0); + POOLWITHARGMAX_U8_TO_F16_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void poolwithargmax_U8to_F16_U8_2D + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1); + POOLWITHARGMAX_U8_TO_F16_U8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + + +#define POOLWITHARGMAX_U8_TO_F16_I16_PROCESS(read_fun, write_fun) \ + vxc_uchar16 din0, din1; \ + vxc_uchar16 maxDataVer, maxDataVer1; \ + int4 bitExtractCoeff; \ + vxc_uchar16 din0EqualTmp, din1EqualTmp; \ + vxc_uchar8 axisEncode; \ + vxc_uchar8 axisOut; \ + vxc_short8 result, axisResult; \ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_VertMax3_Integer(maxDataVer, din0, din1, din1, VXC_MODIFIER_BIN(0, 15, 0)); \ + maxDataVer1 = maxDataVer.s1032547698badcfe; \ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 15, 0)); \ + maxDataVer1 = maxDataVer.s02468ace02468ace; \ + vxc_float4 tmpVal0, tmpVal1, tmpVal2, tmpVal3; \ + half4 tmpOut0, tmpOut1; \ + vxc_half8 tmpPack; \ + vxc_short4 tmpOut2, tmpOut3; \ + uchar zp = input_ZP; \ + VXC_DP4x4(tmpVal0, maxDataVer1, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertUint8ToFp32_4x4); \ + VXC_DP4x4(tmpVal2, maxDataVer1, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSubZpUint8Fp32_4x4); \ + tmpVal1 = tmpVal0 * inputScale; \ + _viv_asm(CONV, tmpOut0, tmpVal1); \ + tmpVal3 = tmpVal2 * inputScale; \ + _viv_asm(CONV, tmpOut1, tmpVal3); \ + VXC_DP2x8(tmpPack, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniPackHalf8_2x8); \ + _viv_asm(COPY, result, tmpPack, 16); \ + VXC_Clamp(din0EqualTmp, din0, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + VXC_Clamp(din1EqualTmp, din1, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + din0EqualTmp &= (vxc_uchar16)(1); \ + din1EqualTmp &= (vxc_uchar16)(1); \ + VXC_DP4x8(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 7, 0), uniEncodeUint8_4x8); \ + axisOut = clz(axisEncode); \ + _viv_asm(CONV, axisResult, axisOut); \ + write_fun(tensorOut, coordOut, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + write_fun(axis, coordOut, axisResult, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); + +__kernel void poolwithargmax_U8to_F16_I16 + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0); + POOLWITHARGMAX_U8_TO_F16_I16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void poolwithargmax_U8to_F16_I16_2D + ( + image2d_array_t tensorIn, + image2d_array_t tensorOut, + image2d_array_t axis + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1); + POOLWITHARGMAX_U8_TO_F16_I16_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16.vx new file mode 100644 index 0000000..8180085 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16.vx @@ -0,0 +1,338 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4_2; +_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4_2; + +_viv_uniform int input_ZP1; + +_viv_uniform float output_ZP; +_viv_uniform float outputScale; + +__kernel void pow_F16F16toF16( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1; + vxc_short8 dst; + vxc_half8 data0, data1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16F16toF16_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src0, src1; + vxc_short8 dst; + vxc_half8 data0, data1; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16F16toU8( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1; + vxc_uchar8 dst; + vxc_half8 data0, data1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16F16toU8_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src0, src1; + vxc_uchar8 dst; + vxc_half8 data0, data1; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16U8toF16( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0; + vxc_uchar8 src1; + vxc_short8 dst; + vxc_half8 data0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + short in1_zp; + _viv_asm(COPY, in1_zp, input_ZP1, 4); + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(data0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, dst, data0, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16U8toF16_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src0; + vxc_uchar8 src1; + vxc_short8 dst; + vxc_half8 data0; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + short in1_zp; + _viv_asm(COPY, in1_zp, input_ZP1, 4); + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(data0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, dst, data0, 16); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16U8toU8( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0; + vxc_uchar8 src1, dst; + vxc_half8 data0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + short in1_zp; + _viv_asm(COPY, in1_zp, input_ZP1, 4); + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16U8toU8_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src0; + vxc_uchar8 src1, dst; + vxc_half8 data0; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + short in1_zp; + _viv_asm(COPY, in1_zp, input_ZP1, 4); + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i16.vx new file mode 100644 index 0000000..f877637 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i16.vx @@ -0,0 +1,322 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2; +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2; + +_viv_uniform float outScale_fl; + +__kernel void pow_F16F16toI16( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1, dst; + vxc_half8 data0, data1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, 
data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16F16toI16_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src0, src1, dst; + vxc_half8 data0, data1; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16I16toF16( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1, dst; + vxc_half8 data0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16I16toF16_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src0, src1, dst; + vxc_half8 data0; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16I16toI16( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1, dst; + vxc_half8 data0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16I16toI16_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src0, src1, dst; + vxc_half8 data0; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +__kernel void pow_BF16BF16toBF16( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src0, src1, dst, tmpData; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, x0, tmpData, 16); + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, x1, tmpData, 16); + + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, y0, tmpData, 16); + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, y1, tmpData, 16); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + _viv_asm(COPY, src0, tmpDst0, 16); + _viv_asm(COPY, src1, tmpDst1, 16); + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_BF16BF16toBF16_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_ushort8 src0, src1, dst, tmpData; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, x0, tmpData, 16); + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, x1, tmpData, 16); + + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, y0, tmpData, 16); + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, y1, tmpData, 16); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + _viv_asm(COPY, src0, tmpDst0, 16); + _viv_asm(COPY, src1, tmpDst1, 16); + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i8.vx new file mode 100644 index 0000000..4b1e7fc --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i8.vx @@ -0,0 +1,239 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2; +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform float outScale_fl; + +__kernel void pow_F16F16toI8( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1; + vxc_char8 dst; + vxc_half8 data0, data1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16F16toI8_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src0, src1; + vxc_char8 dst; + vxc_half8 data0, data1; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16I8toF16( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, dst; + vxc_char8 src1; + vxc_half8 data0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16I8toF16_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src0, dst; + vxc_char8 src1; + vxc_half8 data0; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16I8toI8( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0; + vxc_char8 src1, dst; + vxc_half8 data0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_F16I8toI8_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src0; + vxc_char8 src1, dst; + vxc_half8 data0; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, src0, 16); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_i16.vx new file mode 100644 index 0000000..f336106 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pow_i16.vx @@ -0,0 +1,227 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2; +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform float outScale_fl; + +__kernel void pow_I16F16toF16( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1, dst; + vxc_half8 data1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
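    // pow(x, y) is evaluated as s * exp2(y * log2(|x|)). The s0/s1 selections here
    // pick s = -1 only when the base is negative and the truncated integer part of
    // the exponent is odd; every other negative-base case is forced positive,
    // matching the sign rule of an integral power.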
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_I16F16toF16_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src0, src1, dst; + vxc_half8 data1; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_I16F16toI16( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1, dst; + vxc_half8 data1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_I16F16toI16_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src0, src1, dst; + vxc_half8 data1; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_I16I16toI16( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1, dst; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_I16I16toI16_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src0, src1, dst; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_i8.vx new file mode 100644 index 0000000..89ecade --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pow_i8.vx @@ -0,0 +1,231 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2; +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform float outScale_fl; + +__kernel void pow_I8F16toF16( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_char8 src0; + vxc_short8 src1, dst; + vxc_half8 data1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 
0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_I8F16toF16_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_char8 src0; + vxc_short8 src1, dst; + vxc_half8 data1; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_I8F16toI8( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_char8 src0, dst; + vxc_short8 src1; + vxc_half8 data1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_I8F16toI8_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_char8 src0, dst; + vxc_short8 src1; + vxc_half8 data1; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_I8I8toI8( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_char8 src0, src1, dst; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_I8I8toI8_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_char8 src0, src1, dst; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_u8.vx new file mode 100644 index 0000000..44e7ca3 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pow_u8.vx @@ -0,0 +1,349 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2; +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4_2; +_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4_2; + +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; + +_viv_uniform int input_ZP0; +_viv_uniform int input_ZP1; +_viv_uniform float output_ZP; +_viv_uniform float outputScale; + +__kernel void pow_U8F16toF16( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar8 src0; + vxc_short8 src1; + vxc_short8 dst; + vxc_half8 data1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + short in0_zp; + _viv_asm(COPY, in0_zp, input_ZP0, 4); + VXC_DP4x4(x0, 
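    // The VXC_DP4x4 conversions below presumably widen the U8 base lanes to float
    // relative to the input zero point in0_zp (input_ZP0) via the
    // uniConvertUint8SubZpToFp32 uniforms, while the F16 exponent lanes are
    // converted as-is; any input scale would live in the host-programmed uniforms.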
src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_U8F16toF16_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_uchar8 src0; + vxc_short8 src1; + vxc_short8 dst; + vxc_half8 data1; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + short in0_zp; + _viv_asm(COPY, in0_zp, input_ZP0, 4); + VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_U8F16toU8( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar8 src0; + vxc_short8 src1; + vxc_uchar8 dst; + vxc_half8 data1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + short in0_zp; + _viv_asm(COPY, in0_zp, input_ZP0, 4); + VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_U8F16toU8_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_uchar8 src0; + vxc_short8 src1; + vxc_uchar8 dst; + vxc_half8 data1; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, src1, 16); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + short in0_zp; + _viv_asm(COPY, in0_zp, input_ZP0, 4); + VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_U8U8toU8( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar8 src0, src1, dst; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + short in0_zp, in1_zp; + _viv_asm(COPY, in0_zp, input_ZP0, 4); + _viv_asm(COPY, in1_zp, input_ZP1, 4); + VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_U8U8toU8_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_uchar8 src0, src1, dst; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + short in0_zp, in1_zp; + _viv_asm(COPY, in0_zp, input_ZP0, 4); + _viv_asm(COPY, in1_zp, input_ZP1, 4); + VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
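    // Requantization to the U8 output follows: scale the float result by
    // outputScale, add the output zero point output_ZP, round with
    // convert_int4_rte(), then pack eight lanes back to unsigned bytes.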
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_U8U8toF16( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar8 src0; + vxc_uchar8 src1; + vxc_short8 dst; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + short in0_zp, in1_zp; + _viv_asm(COPY, in0_zp, input_ZP0, 4); + _viv_asm(COPY, in1_zp, input_ZP1, 4); + VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + vxc_half8 tmpVal; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(tmpVal, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_U8U8toF16_2D( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_uchar8 src0; + vxc_uchar8 src1; + vxc_short8 dst; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + short in0_zp, in1_zp; + _viv_asm(COPY, in0_zp, input_ZP0, 4); + _viv_asm(COPY, in1_zp, input_ZP1, 4); + VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + half4 tmpVal0, tmpVal1; + vxc_half8 tmpVal; + _viv_asm(CONV, tmpVal0, tmpDst0); + _viv_asm(CONV, tmpVal1, tmpDst1); + VXC_DP2x8(tmpVal, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra.vx new file mode 100644 index 0000000..28f3f0c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra.vx @@ -0,0 +1,187 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniBilinearTmp1Bgra_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp2Bgra_4x4; +_viv_uniform VXC_512Bits uniDescaleU8_4x4; +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; +_viv_uniform VXC_512Bits uniExtractInt32BgraToU8_2x8; +_viv_uniform VXC_512Bits uniExchangeBgra_2x8; +_viv_uniform VXC_512Bits uniExchangeBgra2_2x8; + +_viv_uniform VXC_512Bits uniBilinearTmp1BgraShort_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp2BgraShort_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp3BgraShort_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp4BgraShort_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp5BgraShort_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp6BgraShort_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp7BgraShort_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp8BgraShort_4x4; + +_viv_uniform VXC_512Bits uniExtractBfromBgra_4x4; +_viv_uniform VXC_512Bits uniExtractGfromBgra_4x4; +_viv_uniform VXC_512Bits uniExtractRfromBgra_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; +_viv_uniform int zp; +_viv_uniform float outputScale; + +__kernel void pre_process_bgra_scale_U8toU8( + __read_only image2d_array_t input, __write_only image2d_array_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + int4 gidx = get_global_id(0); + int gidy = get_global_id(1); + gidx += (int4)(0, 1, 2, 3); + + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); + int4 sx = fx & 0xffff8000; // Floor + int fy, sy; + fx -= sx; + sx = sx >> 15; + fx = (fx +(1 << 4)) >> 5; + + // for y + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); + sy = fy & 0xffff8000; // Floor + fy -= sy; + sy = sy >> 15; + + sy = sy < 0 ? 0 : sy; + fy = fy < 0 ? 
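    // Coordinates are in Q15 fixed point (1 << 15 per source pixel): the low 15
    // bits of fx/fy are the bilinear weights, and shifting right by 15 gives the
    // integer pixel index. Negative fractions are clamped to zero, the weights are
    // reduced to Q10, and sx is scaled by 4 below since each BGRA pixel is 4 bytes.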
0 : fy; + + fy = (fy + (1<< 4)) >> 5; + sx = (sx + (*xOffset)) * 4 ; + sy += (*yOffset); + int4 srcPos = (int4)(sx.x, sy, sy + 1, sx.y); + vxc_uchar16 lineBGRA0, lineBGRA1, lineBGRA2, lineBGRA3; + vxc_uchar16 dataB, dataG, dataR; + + VXC_ReadImage(lineBGRA0, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(lineBGRA0, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(lineBGRA1, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(lineBGRA1, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.z; + srcPos.w = sx.w; + + VXC_ReadImage(lineBGRA2, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(lineBGRA2, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(lineBGRA3, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(lineBGRA3, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar4 val_u8; + int4 tmp1, tmp2, result1, result2; + float4 tmpDst, tmp0; + float4 mean = (float4)(bMean, gMean, rMean, 0); + //tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x); + int tmpV = 1 << 19; + vxc_short8 tmpFx; + VXC_DP2x8(tmpFx, fx, fx, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + //tmpFx = fx.xxxx; + VXC_DP4x4(tmp1, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1BgraShort_4x4); + VXC_DP4x4(tmp2, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2BgraShort_4x4); + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniConvertIntergetoF32_4x4); + tmpDst = (tmp0 - mean) * var; + result1 = convert_int4_rte(tmpDst * outputScale + zp); + + //tmpFx = fx.yyyy; + VXC_DP4x4(tmp1, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3BgraShort_4x4); + VXC_DP4x4(tmp2, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4BgraShort_4x4); + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniConvertIntergetoF32_4x4); + tmpDst = (tmp0 - mean) * var; + result2 = convert_int4_rte(tmpDst * outputScale + zp); + + vxc_uchar16 dst, data; + VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractInt32BgraToU8_2x8); + + //tmpFx = fx.zzzz; + VXC_DP4x4(tmp1, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp5BgraShort_4x4); + VXC_DP4x4(tmp2, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp6BgraShort_4x4); + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniConvertIntergetoF32_4x4); + tmpDst = (tmp0 - mean) * var; + result1 = convert_int4_rte(tmpDst * outputScale + zp); + + //tmpFx = fx.wwww; + VXC_DP4x4(tmp1, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniBilinearTmp7BgraShort_4x4); + VXC_DP4x4(tmp2, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp8BgraShort_4x4); + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniConvertIntergetoF32_4x4); + tmpDst = (tmp0 - mean) * var; + result2 = convert_int4_rte(tmpDst * outputScale + zp); + + VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtractInt32BgraToU8_2x8); + + VXC_DP2x8(data, dst, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExchangeBgra_2x8); + VXC_DP2x8(data, dst, dst, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniExchangeBgra2_2x8); + + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); + dstPos.z = bOrder; + VXC_WriteImage2DArray(output, dstPos, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + dstPos.z = 1; + VXC_WriteImage2DArray(output, dstPos, data.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + dstPos.z = rOrder; + VXC_WriteImage2DArray(output, dstPos, data.s89ab, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pre_process_bgra_copy_U8toU8( + __read_only image2d_array_t input, __write_only image2d_array_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + int2 pos = (int2)((get_global_id(0) + (*xOffset)) << 2, get_global_id(1) + (*yOffset)); + + vxc_uchar16 lineBGRA0; + float4 tmpB, tmpG, tmpR; + float4 tmpDst; + int4 result1, result2; + vxc_uchar16 dst; + + VXC_ReadImage(lineBGRA0, input, pos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpB, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBfromBgra_4x4); + VXC_DP4x4(tmpG, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGfromBgra_4x4); + VXC_DP4x4(tmpR, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRfromBgra_4x4); + + tmpDst = (tmpB - bMean) * var; + result1 = convert_int4_rte(tmpDst * outputScale + zp); + + tmpDst = (tmpG - gMean) * var; + result2 = convert_int4_rte(tmpDst * outputScale + zp); + VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + + int4 dstPos = (int4)(get_global_id(0), get_global_id(1), 0, 0); + dstPos.z = bOrder; + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + dstPos.z = 1; + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + + tmpDst = (tmpR - rMean) * var; + result1 = convert_int4_rte(tmpDst * outputScale + zp); + VXC_DP2x8(dst, result1, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + + dstPos.z = rOrder; + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra_trans.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra_trans.vx new file mode 100644 index 0000000..0ce3d53 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra_trans.vx @@ -0,0 +1,136 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDescaleU8_4x4; +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; + +_viv_uniform VXC_512Bits 
uniBilinearTmp1BgraShort_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp2BgraShort_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp3BgraShort_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp4BgraShort_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp5BgraShort_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp6BgraShort_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp7BgraShort_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp8BgraShort_4x4; + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniExtractInt32BgraToU8Bgr_2x8; + +_viv_uniform int zp; +_viv_uniform float outputScale; + +__kernel void pre_process_bgra_scale_nhwc_U8toU8( + __read_only image2d_array_t input, __write_only image2d_array_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + int4 gidx = get_global_id(0); + int gidy = get_global_id(1); + gidx += (int4)(0, 1, 2, 3); + + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); + int4 sx = fx & 0xffff8000; // Floor + int fy, sy; + fx -= sx; + sx = sx >> 15; + fx = (fx +(1 << 4)) >> 5; + + // for y + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); + sy = fy & 0xffff8000; // Floor + fy -= sy; + sy = sy >> 15; + + sy = sy < 0 ? 0 : sy; + fy = fy < 0 ? 0 : fy; + + fy = (fy + (1<< 4)) >> 5; + sx = (sx + (*xOffset)) * 4 ; + sy += (*yOffset); + int4 srcPos = (int4)(sx.x, sy, sy + 1, sx.y); + vxc_uchar16 lineBGRA0, lineBGRA1, lineBGRA2, lineBGRA3; + vxc_uchar16 dataB, dataG, dataR; + + VXC_ReadImage(lineBGRA0, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(lineBGRA0, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(lineBGRA1, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(lineBGRA1, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.z; + srcPos.w = sx.w; + + VXC_ReadImage(lineBGRA2, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(lineBGRA2, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(lineBGRA3, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(lineBGRA3, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar4 val_u8; + int4 tmp1, tmp2, result1, result2; + float4 tmpDst, tmp0; + float4 mean = (float4)(bMean, gMean, rMean, 0); + //tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x); + int tmpV = 1 << 19; + vxc_short8 tmpFx; + VXC_DP2x8(tmpFx, fx, fx, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uniConvertInt32toUint8_2x8); + //tmpFx = fx.xxxx; + VXC_DP4x4(tmp1, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniBilinearTmp1BgraShort_4x4); + VXC_DP4x4(tmp2, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniBilinearTmp2BgraShort_4x4); + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniConvertIntergetoF32_4x4); + tmpDst = (tmp0 - mean) * var; + result1 = convert_int4_rte(tmpDst * outputScale + zp); + + //tmpFx = fx.yyyy; + VXC_DP4x4(tmp1, lineBGRA1, tmpFx, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3BgraShort_4x4); + VXC_DP4x4(tmp2, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4BgraShort_4x4); + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniConvertIntergetoF32_4x4); + tmpDst = (tmp0 - mean) * var; + result2 = convert_int4_rte(tmpDst * outputScale + zp); + + vxc_uchar16 dst; + VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), + uniExtractInt32BgraToU8Bgr_2x8); + + //tmpFx = fx.zzzz; + VXC_DP4x4(tmp1, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp5BgraShort_4x4); + VXC_DP4x4(tmp2, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp6BgraShort_4x4); + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniConvertIntergetoF32_4x4); + tmpDst = (tmp0 - mean) * var; + result1 = convert_int4_rte(tmpDst * outputScale + zp); + + //tmpFx = fx.wwww; + VXC_DP4x4(tmp1, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp7BgraShort_4x4); + VXC_DP4x4(tmp2, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp8BgraShort_4x4); + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10); + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniConvertIntergetoF32_4x4); + tmpDst = (tmp0 - mean) * var; + result2 = convert_int4_rte(tmpDst * outputScale + zp); + + VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(6, 11, 0, VXC_RM_ToNearestEven, 1), + uniExtractInt32BgraToU8Bgr_2x8); + + int4 dstPos = (int4)(get_global_id(0) * 3, gidy, 0, 0); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray.vx new file mode 100644 index 0000000..f980474 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray.vx @@ -0,0 +1,318 @@ +/* + ============================================================================ + Name : GrayScale.vx + Author : Sam + Version : + Copyright : Your copyright notice + Description : + ============================================================================ + */ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniVecShift10; +_viv_uniform VXC_512Bits uniAddRShift; +_viv_uniform VXC_512Bits uniGetTempVal; +_viv_uniform VXC_512Bits uniExtractBytes; + +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; +_viv_uniform VXC_512Bits uniExtactInteger_2x8; + +_viv_uniform VXC_512Bits uniDataMulAlpha_4x4; +_viv_uniform VXC_512Bits uniDataSubMean_4x4; + +_viv_uniform float outputScale; +_viv_uniform float outputZP; + +__kernel void pre_process_gray_scale_U8toF16 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int2 ratioXY = (int2)(*xRatio, *yRatio); + + int4 xPos = get_global_id(0); + int yPos = get_global_id(1); + + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); 
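    // Same Q15 scheme as the other pre-process kernels: xRatio/yRatio carry the
    // scale factor in units of 1 << 15, ratioSufXY centres the sample at
    // ratio / 2 - 0.5, and the splits below separate each coordinate into an
    // integer source position (>> 15) and a fractional bilinear weight
    // (later reduced to Q10).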
+ xPos += (int4)(0, 1, 2, 3); + + //x + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; + int4 sx = fx0 & 0xffff8000; + fx0 -= sx; + sx = sx >> 15; + + vxc_short4 fx; + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); + //y + int fy = yPos * ratioXY.y + ratioSufXY.y; + int sy = fy & 0xffff8000; // Floor + + fy -= sy; + sy = sy >> 15; + + fy = (fy + (1<< 4)) >> 5; + + //R + vxc_uchar16 line0Y; + vxc_uchar16 line1Y; + int4 coord; + sx = sx + *xOffset; + coord.xyz = sx.xyz; + coord.w = sy + *yOffset; + int2 coord1 = (int2)(sx.w, coord.w); + VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + float grayMean = mean; + + int4 test01, temp1; + int4 test02, temp2; + int4 tt; + vxc_uchar4 val; + int2 coord_out = (int2)(xPos.x, yPos); + + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniVecShift10); + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniVecShift10); + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), + uniExtractBytes); + + //convert U8 to FP16 + half f16mean; + half f16alpha; + vxc_half4 dst; + vxc_short4 tmp_dst; + _viv_asm(CONV, f16mean, grayMean); + _viv_asm(CONV, f16alpha, f32Var); + VXC_DP4x4(dst, val, f16mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniDataSubMean_4x4); + VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniDataMulAlpha_4x4); + _viv_asm(COPY, tmp_dst, dst, 8); + VXC_WriteImage(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pre_process_gray_scale_U8toI16 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int2 ratioXY = (int2)(*xRatio, *yRatio); + + int4 xPos = get_global_id(0); + int yPos = get_global_id(1); + + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); + xPos += (int4)(0, 1, 2, 3); + + //x + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; + int4 sx = fx0 & 0xffff8000; + fx0 -= sx; + sx = sx >> 15; + + vxc_short4 fx; + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniAddRShift); + //y + int fy = yPos * ratioXY.y + ratioSufXY.y; + int sy = fy & 0xffff8000; // Floor + + fy -= sy; + sy = sy >> 15; + + fy = (fy + (1<< 4)) >> 5; + + vxc_uchar16 line0Y; + vxc_uchar16 line1Y; + int4 coord; + sx = sx + *xOffset; + coord.xyz = sx.xyz; + coord.w = sy 
+ *yOffset; + int2 coord1 = (int2)(sx.w, coord.w); + VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + float grayMean = mean * f32Var; + + int4 test01, temp1; + int4 test02, temp2; + int4 tt; + vxc_uchar4 val; + int2 coord_out = (int2)(xPos.x, yPos); + + vxc_uchar8 line1, line2; + + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniVecShift10); + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniVecShift10); + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + vxc_float4 tmp_dst; + vxc_uchar4 u8_dst; + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), + uniExtractBytes); + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), + uniConvertIntergetoF32_4x4); + + //convert U8 to dfp8 + int4 dst0; + vxc_short4 dst; + tmp_dst = tmp_dst * f32Var - grayMean; + tmp_dst = tmp_dst * outputScale + outputZP; + dst0 = convert_int4_rte(tmp_dst); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), + uniExtactInteger_2x8); + + VXC_WriteImage(output, coord_out, dst, + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +#define PRE_PROCESS_GRAY_SCALE_8BITS(dst_type_name, write_type) \ +__kernel void pre_process_gray_scale_U8to##dst_type_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float mean, \ + float f32Var \ + ) \ +{ \ + int2 ratioXY = (int2)(*xRatio, *yRatio); \ + int4 xPos = get_global_id(0); \ + int yPos = get_global_id(1); \ + \ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \ + xPos += (int4)(0, 1, 2, 3); \ + \ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ + int4 sx = fx0 & 0xffff8000; \ + fx0 -= sx; \ + sx = sx >> 15; \ + \ + vxc_short4 fx; \ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \ + \ + int fy = yPos * ratioXY.y + ratioSufXY.y; \ + int sy = fy & 0xffff8000; \ + \ + fy -= sy; \ + sy = sy >> 15; \ + fy = (fy + (1<< 4)) >> 5; \ + \ + vxc_uchar16 line0Y; \ + vxc_uchar16 line1Y; \ + int4 coord; \ + sx = sx + *xOffset; \ + coord.xyz = sx.xyz; \ + coord.w = sy + *yOffset; \ + int2 coord1 = (int2)(sx.w, coord.w); \ + VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 
0)); \ + VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float grayMean = mean * f32Var; \ + \ + int4 test01, temp1; \ + int4 test02, temp2; \ + int2 coord_out = (int2)(xPos.x, yPos); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + vxc_float4 tmp_dst; \ + vxc_uchar4 u8_dst; \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + int4 dst0; \ + write_type dst; \ + tmp_dst = tmp_dst * f32Var - grayMean; \ + tmp_dst = tmp_dst * outputScale + outputZP; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtactInteger_2x8); \ + \ + VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} + +PRE_PROCESS_GRAY_SCALE_8BITS(U8, vxc_uchar16) +PRE_PROCESS_GRAY_SCALE_8BITS(I8, vxc_char16) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_copy.vx new file mode 100644 index 0000000..ce2724b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_copy.vx @@ -0,0 +1,129 @@ +/* + ============================================================================ + Name : GrayScale.vx + Author : Sam + Version : + Copyright : Your copyright notice + Description : + ============================================================================ + */ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8; +_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8; + +_viv_uniform float outputScale; +_viv_uniform float outputZP; + +__kernel void pre_process_gray_copy_U8toF16 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + coord.xy += (int2) (*xOffset, *yOffset); + vxc_uchar16 src0; + vxc_half8 dst0, dst1; + + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord.x = coord.z + 8; + float4 paramData = (float4)(mean * f32Var, mean * f32Var, mean * f32Var, f32Var); + //convert U8 to FP16 + half4 paramData_f16; + vxc_short8 tmp_dst; + _viv_asm(CONV, paramData_f16, paramData); + + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 
7, 0, VXC_RM_TowardZero, 0), + uniDataMeanStddevLo_2x8); + VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniDataMeanStddevHi_2x8); + _viv_asm(COPY, tmp_dst, dst0, 16); + VXC_WriteImage(output, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, tmp_dst, dst1, 16); + VXC_WriteImage(output, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pre_process_gray_copy_U8toI16 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + coord.xy += (int2) (*xOffset, *yOffset); + vxc_uchar16 src0; + vxc_short8 dst0, dst1; + + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord.x = coord.z + 8; + + f32Var *= outputScale; + float4 paramData = (float4)(mean * f32Var - outputZP, mean * f32Var - outputZP, + mean * f32Var - outputZP, f32Var); + //convert U8 to FP16 + half4 paramData_f16; + _viv_asm(CONV, paramData_f16, paramData); + + + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uniDataMeanStddevLo_2x8); + VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uniDataMeanStddevHi_2x8); + VXC_WriteImage(output, coord.zw, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.xw, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +#define PRE_PROCESS_GRAY_COPY_8BITS(dst_type_name, write_type) \ +__kernel void pre_process_gray_copy_U8to##dst_type_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float mean, \ + float f32Var \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + coord.xy += (int2) (*xOffset, *yOffset); \ + vxc_uchar16 src0; \ + write_type dst; \ + \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + f32Var *= outputScale; \ + float4 paramData = (float4)(mean * f32Var - outputZP, mean * f32Var - outputZP, \ + mean * f32Var - outputZP, f32Var); \ + \ + half4 paramData_f16; \ + _viv_asm(CONV, paramData_f16, paramData); \ + \ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevHi_2x8); \ + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} + +PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16) +PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx new file mode 100644 index 0000000..4c6f935 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx @@ -0,0 +1,158 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +_viv_uniform float outputScaleVar; +_viv_uniform float bMeanScaleVarZp; +_viv_uniform float gMeanScaleVarZp; +_viv_uniform float rMeanScaleVarZp; + +_viv_uniform uint xrIntFloat_16; +_viv_uniform uint yrIntFloat_16; + +_viv_uniform VXC_512Bits 
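/* Editorial sketch, not part of the commit: the gray copy kernels do no
 * resizing; each source byte is only normalized and re-quantized.  For the
 * quantized output paths the constants are folded once per work-item into
 * paramData; ref_gray_copy_px() is an illustrative name for the per-pixel
 * math carried out by the uniDataMeanStddev* dot products. */
static int ref_gray_copy_px(unsigned char p, float mean, float var,
                            float outputScale, float outputZP)
{
    float varScaled = var * outputScale;           /* kernel: f32Var *= outputScale     */
    float bias      = mean * varScaled - outputZP; /* kernel: mean * f32Var - outputZP  */
    float v = (float)p * varScaled - bias;         /* == (p - mean) * var * scale + zp  */
    return (int)(v + (v >= 0.0f ? 0.5f : -0.5f));  /* nearest (kernel: ToNearestEven)   */
}
/* The U8toF16 path skips outputScale/outputZP and simply writes (p - mean) * var. */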
uniConvertNV12toB_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4; + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; +_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8; + +__kernel void pre_process_nv12_scale_U8toI16( + __read_only image2d_t y_img, __read_only image2d_t uv_img, + __write_only image2d_array_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + uint4 gidx = get_global_id(0); + uint gidy = get_global_id(1); + gidx += (uint4)(0, 1, 2, 3); + + uint dy = (gidy * yrIntFloat_16) >> 16; + uint4 dx = (gidx * xrIntFloat_16) >> 16; + int sy = convert_int(dy) + (*yOffset); + int4 sx = convert_int4(dx) + (*xOffset); + int4 uvX = sx & 0xfffffffe; + int uvY = sy >> 1; + + vxc_uchar16 Y, UV; + int2 coord = (int2)(sx.x, sy); + int2 coord_uv = (int2)(uvX.x, uvY); + + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord.x = sx.y; + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord.x = sx.z; + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord.x = sx.w; + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_uv.x = uvX.y; + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_uv.x = uvX.z; + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_uv.x = uvX.w; + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_char16 tmpUV; + short tmpVal = 128; + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); + + float4 tmpDstB, tmpDstG, tmpDstR; + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); + + int4 result; + vxc_short8 dst; + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); + result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); + dstPos.z = bOrder; + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); + dstPos.z = 1; + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); + dstPos.z = rOrder; + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pre_process_nv12_scale_U8toF16( + __read_only image2d_t y_img, __read_only image2d_t 
uv_img, + __write_only image2d_array_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + uint4 gidx = get_global_id(0); + uint gidy = get_global_id(1); + gidx += (uint4)(0, 1, 2, 3); + + uint dy = (gidy * yrIntFloat_16) >> 16; + uint4 dx = (gidx * xrIntFloat_16) >> 16; + int sy = convert_int(dy) + (*yOffset); + int4 sx = convert_int4(dx) + (*xOffset); + int4 uvX = sx & 0xfffffffe; + int uvY = sy >> 1; + + vxc_uchar16 Y, UV; + int2 coord = (int2)(sx.x, sy); + int2 coord_uv = (int2)(uvX.x, uvY); + + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord.x = sx.y; + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord.x = sx.z; + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord.x = sx.w; + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_uv.x = uvX.y; + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_uv.x = uvX.z; + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_uv.x = uvX.w; + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_char16 tmpUV; + short tmpVal = 128; + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); + + float4 tmpDstB, tmpDstG, tmpDstR; + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); + + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; + + half4 result; + vxc_half8 tmpdst; + vxc_short8 dst; + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); + _viv_asm(CONV, result, tmpDstB); + dstPos.z = bOrder; + VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpdst, 16); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(CONV, result, tmpDstG); + dstPos.z = 1; + VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpdst, 16); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(CONV, result, tmpDstR); + dstPos.z = rOrder; + VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpdst, 16); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_8bits.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_8bits.vx new file mode 100644 index 0000000..c274c3c --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_8bits.vx @@ -0,0 +1,197 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +_viv_uniform float outputScaleVar; +_viv_uniform float bMeanScaleVarZp; +_viv_uniform float gMeanScaleVarZp; +_viv_uniform float rMeanScaleVarZp; + +_viv_uniform uint xrIntFloat_16; +_viv_uniform uint yrIntFloat_16; + +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4; + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8; +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8; + +__kernel void pre_process_nv12_scale_U8toU8( + __read_only image2d_t y_img, __read_only image2d_t uv_img, + __write_only image2d_array_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + uint4 gidx = get_global_id(0); + uint gidy = get_global_id(1); + gidx += (uint4)(0, 1, 2, 3); + + uint dy = (gidy * yrIntFloat_16) >> 16; + uint4 dx = (gidx * xrIntFloat_16) >> 16; + int sy = convert_int(dy) + (*yOffset); + int4 sx = convert_int4(dx) + (*xOffset); + int4 uvX = sx & 0xfffffffe; + int uvY = sy >> 1; + + vxc_uchar16 Y, UV; + int2 coord = (int2)(sx.x, sy); + int2 coord_uv = (int2)(uvX.x, uvY); + + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord.x = sx.y; + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord.x = sx.z; + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord.x = sx.w; + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_uv.x = uvX.y; + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_uv.x = uvX.z; + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_uv.x = uvX.w; + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_char16 tmpUV; + short tmpVal = 128; + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); + + float4 tmpDstB, tmpDstG, tmpDstR; + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); + + int4 result; + vxc_uchar8 dst; + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); + result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); + dstPos.z = bOrder; + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); + dstPos.z = 1; + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, 
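/* Editorial sketch, not part of the commit: the NV12 scale kernels resample
 * with nearest-neighbour arithmetic in Q16 fixed point (xrIntFloat_16 and
 * yrIntFloat_16 presumably hold (srcDim << 16) / dstDim), read the
 * half-resolution interleaved UV plane, convert to RGB and apply the folded
 * normalization rgb * outputScaleVar + {b,g,r}MeanScaleVarZp, which appears
 * to pre-combine variance, output scale, mean and zero-point.
 * ref_nv12_source_coords() is an illustrative name. */
static void ref_nv12_source_coords(int dstX, int dstY,
                                   unsigned int xrQ16, unsigned int yrQ16,
                                   int xOffset, int yOffset,
                                   int *sx, int *sy, int *uvX, int *uvY)
{
    *sx  = (int)(((unsigned int)dstX * xrQ16) >> 16) + xOffset; /* nearest source column */
    *sy  = (int)(((unsigned int)dstY * yrQ16) >> 16) + yOffset; /* nearest source row    */
    *uvX = *sx & ~1;   /* NV12: one interleaved (U, V) pair per 2x2 luma block */
    *uvY = *sy >> 1;
}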
VXC_RM_TowardZero, 0)); + + result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); + dstPos.z = rOrder; + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pre_process_nv12_copy_U8toU8( + __read_only image2d_t y_img, __read_only image2d_t uv_img, + __write_only image2d_array_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + int sy = gidy + (*yOffset); + int sx = gidx + (*xOffset); + int uvX = sx & 0xfffffffe; + int uvY = sy >> 1; + + vxc_uchar16 Y, UV; + + VXC_ReadImage(Y, y_img, (int2)(sx,sy), VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + vxc_char16 tmpUV; + short tmpVal = 128; + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); + + float4 tmpDstB, tmpDstG, tmpDstR; + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); + + int4 result; + vxc_uchar8 dst; + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); + result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); + dstPos.z = bOrder; + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); + dstPos.z = 1; + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); + dstPos.z = rOrder; + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pre_process_nv12_scale_U8toI8( + __read_only image2d_t y_img, __read_only image2d_t uv_img, + __write_only image2d_array_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + uint4 gidx = get_global_id(0); + uint gidy = get_global_id(1); + gidx += (uint4)(0, 1, 2, 3); + + uint dy = (gidy * yrIntFloat_16) >> 16; + uint4 dx = (gidx * xrIntFloat_16) >> 16; + int sy = convert_int(dy) + (*yOffset); + int4 sx = convert_int4(dx) + (*xOffset); + int4 uvX = sx & 0xfffffffe; + int uvY = sy >> 1; + + vxc_uchar16 Y, UV; + int2 coord = (int2)(sx.x, sy); + int2 coord_uv = (int2)(uvX.x, uvY); + + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord.x = sx.y; + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord.x = sx.z; + VXC_ReadImage(Y, y_img, coord, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord.x = sx.w; + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_uv.x = uvX.y; + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_uv.x = uvX.z; + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_uv.x = uvX.w; + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_char16 tmpUV; + short tmpVal = 128; + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); + + float4 tmpDstB, tmpDstG, tmpDstR; + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); + + int4 result; + vxc_char8 dst; + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); + result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); + dstPos.z = bOrder; + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); + dstPos.z = 1; + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); + dstPos.z = rOrder; + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_mix.vx new file mode 100644 index 0000000..0a4551f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_mix.vx @@ -0,0 +1,162 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +_viv_uniform float outputScaleVar; +_viv_uniform float bMeanScaleVarZp; +_viv_uniform float gMeanScaleVarZp; +_viv_uniform float rMeanScaleVarZp; + +_viv_uniform uint xrIntFloat_16; +_viv_uniform uint yrIntFloat_16; + +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4; + +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8; + +_viv_uniform VXC_512Bits uniCalculateYShift_2x8; +_viv_uniform VXC_512Bits uniCalculateUVShift_2x8; + +__kernel void pre_process_nv12_scale_U8toU8_gq( + __read_only image2d_t y_img, __read_only image2d_t uv_img, + __write_only image2d_array_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + uint4 gidx = 
get_global_id(0); + uint gidy = get_global_id(1); + gidx += (uint4)(0, 1, 2, 3); + + uint dy = (gidy * yrIntFloat_16) >> 16; + uint4 dx = (gidx * xrIntFloat_16) >> 16; + int sy = convert_int(dy) + (*yOffset); + int4 sx = convert_int4(dx) + (*xOffset); + int4 uvX = sx & 0xfffffffe; + int uvY = sy >> 1; + + vxc_uchar16 Y, UV; + int2 coord = (int2)(sx.x, sy); + int2 coord_uv = (int2)(uvX.x, uvY); + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + int4 offsetUV = uvX - uvX.x; + + vxc_ushort8 diffY, diffUV; + _viv_asm(COPY, diffY, sx, 16); + _viv_asm(COPY, diffUV, offsetUV, 16); + + vxc_ushort8 constData = 8; + VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniCalculateYShift_2x8); + VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniCalculateUVShift_2x8); + VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_char16 tmpUV; + short tmpVal = 128; + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); + + float4 tmpDstB, tmpDstG, tmpDstR; + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); + + int4 result; + vxc_uchar8 dst; + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); + result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); + dstPos.z = bOrder; + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); + dstPos.z = 1; + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); + dstPos.z = rOrder; + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pre_process_nv12_scale_U8toF16_gq( + __read_only image2d_t y_img, __read_only image2d_t uv_img, + __write_only image2d_array_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + uint4 gidx = get_global_id(0); + uint gidy = get_global_id(1); + gidx += (uint4)(0, 1, 2, 3); + + uint dy = (gidy * yrIntFloat_16) >> 16; + uint4 dx = (gidx * xrIntFloat_16) >> 16; + int sy = convert_int(dy) + (*yOffset); + int4 sx = convert_int4(dx) + (*xOffset); + int4 uvX = sx & 0xfffffffe; + int uvY = sy >> 1; + + vxc_uchar16 Y, UV; + int2 coord = (int2)(sx.x, sy); + int2 coord_uv = (int2)(uvX.x, uvY); + VXC_ReadImage(Y, y_img, coord, 
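/* Editorial sketch, not part of the commit: the *_gq variants replace the four
 * per-lane VXC_ReadImage calls of the generic kernels with a single 16-byte
 * read followed by VXC_BitExtract.  The uniCalculateYShift/UVShift uniforms
 * appear to turn per-lane byte offsets into bit offsets (offset * 8); the
 * scalar picture below is an assumption and only holds while all four source
 * pixels fall inside the 16-byte window. */
static void ref_gather_from_window(const unsigned char win[16],
                                   const int sx[4], unsigned char out[4])
{
    for (int i = 0; i < 4; ++i) {
        int byteOffset = sx[i] - sx[0];  /* diffY / offsetUV in the kernel        */
        out[i] = win[byteOffset];        /* BitExtract at bit position offset * 8 */
    }
}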
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + int4 offsetUV = uvX - uvX.x; + + vxc_ushort8 diffY, diffUV; + _viv_asm(COPY, diffY, sx, 16); + _viv_asm(COPY, diffUV, offsetUV, 16); + + vxc_ushort8 constData = 8; + VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniCalculateYShift_2x8); + VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniCalculateUVShift_2x8); + VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_char16 tmpUV; + short tmpVal = 128; + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); + + float4 tmpDstB, tmpDstG, tmpDstR; + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); + + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; + + half4 result; + vxc_half8 tmpdst; + vxc_short8 dst; + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); + _viv_asm(CONV, result, tmpDstB); + dstPos.z = bOrder; + VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpdst, 16); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(CONV, result, tmpDstG); + dstPos.z = 1; + VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpdst, 16); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(CONV, result, tmpDstR); + dstPos.z = rOrder; + VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpdst, 16); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_trans_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_trans_u8.vx new file mode 100644 index 0000000..e235c7f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_trans_u8.vx @@ -0,0 +1,89 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +_viv_uniform float outputScaleVar; +_viv_uniform float bMeanScaleVarZp; +_viv_uniform float gMeanScaleVarZp; +_viv_uniform float rMeanScaleVarZp; + +_viv_uniform uint xrIntFloat_16; +_viv_uniform uint yrIntFloat_16; + +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4; + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8; + +_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8; +_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8; + +__kernel void 
pre_process_nv12_trans_U8toU8( + __read_only image2d_t y_img, __read_only image2d_t uv_img, + __write_only image2d_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + uint4 gidx = get_global_id(0); + uint gidy = get_global_id(1); + gidx += (uint4)(0, 1, 2, 3); + + uint dy = (gidy * yrIntFloat_16) >> 16; + uint4 dx = (gidx * xrIntFloat_16) >> 16; + int sy = convert_int(dy) + (*yOffset); + int4 sx = convert_int4(dx) + (*xOffset); + int4 uvX = sx & 0xfffffffe; + int uvY = sy >> 1; + + vxc_uchar16 Y, UV; + int2 coord = (int2)(sx.x, sy); + int2 coord_uv = (int2)(uvX.x, uvY); + + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord.x = sx.y; + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord.x = sx.z; + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord.x = sx.w; + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_uv.x = uvX.y; + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_uv.x = uvX.z; + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_uv.x = uvX.w; + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_char16 tmpUV; + short tmpVal = 128; + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); + + float4 tmpDstB, tmpDstG, tmpDstR; + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); + + int4 result, dstR, dstG, dstB; + vxc_uchar16 dst, tmpPack; + dstB = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); + dstG = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); + dstR = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); + + if(bOrder == 2) + { + int4 exchangeData = dstB; + dstB = dstR; + dstR = exchangeData; + } + + VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8); + VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8); + + int2 dstPos = (int2)(get_global_id(0) * 3, gidy); + VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb.vx new file mode 100644 index 0000000..536c18d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb.vx @@ -0,0 +1,178 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniVecShift10; +_viv_uniform VXC_512Bits uniAddRShift; +_viv_uniform VXC_512Bits uniGetTempVal; +_viv_uniform VXC_512Bits uniExtractBytes; +_viv_uniform VXC_512Bits uniUnpackToR; +_viv_uniform VXC_512Bits uniUnpackToG; 
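/* Editorial sketch, not part of the commit: unlike the planar kernels, the
 * *_trans variant above writes interleaved (NHWC) bytes - four pixels become
 * twelve bytes starting at x * 3.  Whether the first byte is B or R depends
 * on reverse_channel via bOrder; the ordering below is an assumption inferred
 * from the uniTransPackBgr* names. */
static void ref_pack_interleaved(const int b[4], const int g[4], const int r[4],
                                 int reverse_channel, unsigned char dst[12])
{
    for (int i = 0; i < 4; ++i) {
        dst[3 * i + 0] = (unsigned char)(reverse_channel ? r[i] : b[i]);
        dst[3 * i + 1] = (unsigned char)g[i];
        dst[3 * i + 2] = (unsigned char)(reverse_channel ? b[i] : r[i]);
    }
}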
+_viv_uniform VXC_512Bits uniUnpackToB; + +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; +_viv_uniform float outputScale; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform float outputZP; +_viv_uniform int r_order; +_viv_uniform int b_order; + +#define DESCALE(x) (((x) + (1<<19)) >> 20) + +#define IMAGE_PRE_PROCESS(dst_name, conv_type, dst_type, copy_type) \ +__kernel void pre_process_rgb_scale_U8to##dst_name \ + ( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float f32Var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int2 ratioXY = (int2)(*xRatio, *yRatio); \ + int4 xPos = get_global_id(0); \ + int yPos = get_global_id(1); \ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \ + xPos += (int4)(0, 1, 2, 3); \ + \ + /*x*/ \ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ + int4 sx = fx0 & 0xffff8000; \ + fx0 -= sx; \ + sx = sx >> 15; \ + \ + vxc_short4 fx; \ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \ + /*y*/ \ + int fy = yPos * ratioXY.y + ratioSufXY.y; \ + int sy = fy & 0xffff8000; \ + \ + fy -= sy; \ + sy = sy >> 15; \ + \ + fy = (fy + (1<< 4)) >> 5; \ + \ + vxc_uchar16 line0RGB1, line0RGB2; \ + vxc_uchar16 line1RGB3, line1RGB4; \ + int4 coord; \ + sx = (sx + (*xOffset)) * 3; \ + coord.xyz = sx.xyz; \ + coord.w = sy + *yOffset; \ + int2 coord1 = (int2)(sx.w, coord.w); \ + VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \ + \ + bgrMean *= f32Var; \ + \ + int4 test01, temp1; \ + int4 test02, temp2; \ + int4 tt; \ + vxc_uchar4 val; \ + int4 coord_out = (int4)(xPos.x, yPos, r_order, 0); \ + \ + vxc_uchar8 line1, line2; \ + \ + /*R*/ \ + VXC_DP2x8(line1, line0RGB1, line0RGB2, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \ + VXC_DP2x8(line2, line1RGB3, line1RGB4, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \ + \ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + vxc_float4 tmp_dst; \ + vxc_uchar4 u8_dst; \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \ + 
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + /*convert U8 to dst*/ \ + dst_type dst; \ + tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \ + tmp_dst = tmp_dst * outputScale + outputZP; \ + conv_type dst0; \ + _viv_asm(CONV_RTE, dst0, tmp_dst); \ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + copy_type result; \ + _viv_asm(COPY, result, dst, 16); \ + VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + /*G*/ \ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \ + \ + coord_out.z = 1; \ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + tmp_dst = tmp_dst * f32Var - bgrMean.y; \ + tmp_dst = tmp_dst * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, tmp_dst); \ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, result, dst, 16); \ + VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + /*B*/ \ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \ + \ + coord_out.z = b_order; \ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + tmp_dst = tmp_dst * f32Var - bgrMean.x; \ + tmp_dst = tmp_dst * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, tmp_dst); \ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, result, dst, 16); \ + VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +IMAGE_PRE_PROCESS(U8, uint4, vxc_uchar16, vxc_uchar16) +IMAGE_PRE_PROCESS(I8, int4, vxc_char16, vxc_char16) +IMAGE_PRE_PROCESS(I16, int4, vxc_short8, vxc_short8) +IMAGE_PRE_PROCESS(F16, half4, vxc_half8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx 
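/* Editorial note, not part of the commit: the RGB scale kernels keep planar
 * (NCHW) output - each channel is written to its own array slice through
 * coord_out.z, with r_order/b_order presumably 0/2 and swapped when
 * reverse_channel asks for BGR.  The rounding used throughout is the plain
 * Q20 round-to-nearest spelled out by the DESCALE macro: */
static int ref_descale_q20(int x)
{
    /* mirrors #define DESCALE(x) (((x) + (1<<19)) >> 20);
       e.g. ref_descale_q20((77 << 20) + (1 << 19)) == 78 (exact half rounds up) */
    return (x + (1 << 19)) >> 20;
}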
b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx new file mode 100644 index 0000000..95a43ed --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx @@ -0,0 +1,151 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform int r_order; +_viv_uniform int b_order; +_viv_uniform VXC_512Bits uniExtractRtoF32_part0_4x4; +_viv_uniform VXC_512Bits uniExtractRtoF32_part1_4x4; +_viv_uniform VXC_512Bits uniExtractGtoF32_part0_4x4; +_viv_uniform VXC_512Bits uniExtractGtoF32_part1_4x4; +_viv_uniform VXC_512Bits uniExtractBtoF32_part0_4x4; +_viv_uniform VXC_512Bits uniExtractBtoF32_part1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +#define IMAGE_PRE_PROCESS_COPY_16BITS(dst_name, dst_type, copy_type, convert_type) \ +__kernel void pre_process_rgb_copy_U8to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float f32Var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \ + \ + coord.xy += (int2) (*xOffset, *yOffset); \ + vxc_uchar16 src0, src1; \ + dst_type dst0; \ + copy_type dst; \ + \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + f32Var *= outputScale; \ + float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \ + bMean * f32Var - outputZP, f32Var); \ + \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \ + float4 tmp0, tmp1; \ + convert_type result0, result1; \ + \ + VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \ + VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \ + tmp0 = tmp0 * paramData.w - paramData.x; \ + tmp1 = tmp1 * paramData.w - paramData.x; \ + _viv_asm(CONV_RTE, result0, tmp0); \ + _viv_asm(CONV_RTE, result1, tmp1); \ + VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, 16); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + coord_out.z = 1; \ + VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \ + VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \ + tmp0 = tmp0 * paramData.w - paramData.y; \ + tmp1 = tmp1 * paramData.w - paramData.y; \ + _viv_asm(CONV_RTE, result0, tmp0); \ + _viv_asm(CONV_RTE, result1, tmp1); \ + VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, 16); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + coord_out.z = b_order; \ + VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \ + VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \ + tmp0 = tmp0 * paramData.w - paramData.z; \ + tmp1 = tmp1 * paramData.w - paramData.z; \ + _viv_asm(CONV_RTE, result0, tmp0); \ + _viv_asm(CONV_RTE, result1, tmp1); \ + 
VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, 16); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +IMAGE_PRE_PROCESS_COPY_16BITS(I16, vxc_short8, vxc_short8, int4) +IMAGE_PRE_PROCESS_COPY_16BITS(F16, vxc_half8, vxc_short8, half4) + +#define IMAGE_PRE_PROCESS_COPY_8BITS(dst_name, dst_type) \ +__kernel void pre_process_rgb_copy_U8to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float f32Var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \ + coord.xy += (int2) (*xOffset, *yOffset); \ + vxc_uchar16 src0, src1; \ + dst_type dst; \ + \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + f32Var *= outputScale; \ + float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \ + bMean * f32Var - outputZP, f32Var); \ + \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \ + float4 tmp0, tmp1; \ + int4 result0, result1; \ + \ + VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \ + VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \ + tmp0 = tmp0 * paramData.w - paramData.x; \ + tmp1 = tmp1 * paramData.w - paramData.x; \ + result0 = convert_int4_rte(tmp0); \ + result1 = convert_int4_rte(tmp1); \ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + \ + coord_out.z = 1; \ + VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \ + VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \ + tmp0 = tmp0 * paramData.w - paramData.y; \ + tmp1 = tmp1 * paramData.w - paramData.y; \ + result0 = convert_int4_rte(tmp0); \ + result1 = convert_int4_rte(tmp1); \ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + coord_out.z = b_order; \ + VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \ + VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \ + tmp0 = tmp0 * paramData.w - paramData.z; \ + tmp1 = tmp1 * paramData.w - paramData.z; \ + result0 = convert_int4_rte(tmp0); \ + result1 = convert_int4_rte(tmp1); \ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +IMAGE_PRE_PROCESS_COPY_8BITS(U8, vxc_uchar16) +IMAGE_PRE_PROCESS_COPY_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy_trans.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy_trans.vx new file mode 100644 index 0000000..da337ab --- 
/dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy_trans.vx @@ -0,0 +1,94 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniNormilizationLo_2x8; +_viv_uniform VXC_512Bits uniNormilizationHi_2x8; +#define IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(dst_name, dst_type, copy_type) \ +__kernel void pre_process_rgb_copy_nhwc_U8to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float f32Var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + coord.xy += (int2) (*xOffset, *yOffset); \ + vxc_uchar16 src0, src1; \ + dst_type dst0, dst1; \ + copy_type dst; \ + \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + f32Var *= outputScale; \ + float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \ + bMean * f32Var - outputZP, f32Var); \ + half4 paramData_f16; \ + _viv_asm(CONV, paramData_f16, paramData); \ + \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(0)); \ + coord_out.z = coord_out.x + 8; \ + \ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniNormilizationLo_2x8); \ + VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), \ + uniNormilizationHi_2x8); \ + _viv_asm(COPY, dst, dst0, 16); \ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, dst, dst1, 16); \ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 6, 0, VXC_RM_TowardZero, 0)); \ +} +IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(I16, vxc_short8, vxc_short8) +IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(F16, vxc_half8, vxc_short8) + +#define IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(dst_name, dst_type) \ +__kernel void pre_process_rgb_copy_nhwc_U8to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float f32Var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + coord.xy += (int2) (*xOffset, *yOffset); \ + vxc_uchar16 src0, src1; \ + dst_type dst; \ + \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + f32Var *= outputScale; \ + float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \ + bMean * f32Var - outputZP, f32Var); \ + \ + half4 paramData_f16; \ + _viv_asm(CONV, paramData_f16, paramData); \ + \ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \ + \ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniNormilizationLo_2x8); \ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), \ + uniNormilizationHi_2x8); \ + VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); \ +} +IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(U8, vxc_uchar16) +IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_trans.vx 
b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_trans.vx new file mode 100644 index 0000000..0820a03 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_trans.vx @@ -0,0 +1,172 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniVecShift10; +_viv_uniform VXC_512Bits uniAddRShift; +_viv_uniform VXC_512Bits uniGetTempVal; +_viv_uniform VXC_512Bits uniExtractBytes; +_viv_uniform VXC_512Bits uniUnpackToR; +_viv_uniform VXC_512Bits uniUnpackToG; +_viv_uniform VXC_512Bits uniUnpackToB; + +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; +_viv_uniform float outputScale; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform float outputZP; + +_viv_uniform VXC_512Bits uniRePackRGBLo_2x8; +_viv_uniform VXC_512Bits uniRePackRGBHi_2x8; +#define IMAGE_PRE_PROCESS_NHWC(dst_name, conv_type, dst_type, copy_type) \ +__kernel void pre_process_rgb_scale_nhwc_U8to##dst_name \ + ( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float f32Var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int2 ratioXY = (int2)(*xRatio, *yRatio); \ + int4 xPos = get_global_id(0); \ + int yPos = get_global_id(1); \ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \ + xPos += (int4)(0, 1, 2, 3); \ + \ + /*x*/ \ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ + int4 sx = fx0 & 0xffff8000; \ + fx0 -= sx; \ + sx = sx >> 15; \ + \ + vxc_short4 fx; \ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \ + /*y*/ \ + int fy = yPos * ratioXY.y + ratioSufXY.y; \ + int sy = fy & 0xffff8000; \ + \ + fy -= sy; \ + sy = sy >> 15; \ + \ + fy = (fy + (1<< 4)) >> 5; \ + \ + vxc_uchar16 line0RGB1, line0RGB2; \ + vxc_uchar16 line1RGB3, line1RGB4; \ + int4 coord; \ + sx = sx * 3 + *xOffset; \ + coord.xyz = sx.xyz; \ + coord.w = sy + *yOffset; \ + int2 coord1 = (int2)(sx.w, coord.w); \ + VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \ + \ + bgrMean *= f32Var; \ + \ + int4 test01, temp1; \ + int4 test02, temp2; \ + int4 tt; \ + vxc_uchar4 val; \ + int4 coord_out = (int4)(xPos.x * 3, yPos, xPos.x * 3 + 6, 0); \ + \ + vxc_uchar8 line1, line2; \ + \ + /*R*/ \ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \ + \ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + vxc_float4 tmp_dst; \ + vxc_uchar4 u8_dst; \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + /*convert U8 to dst*/ \ + dst_type dstRG, dstB, dst; \ + tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \ + tmp_dst = tmp_dst * outputScale + outputZP; \ + conv_type dst0; \ + _viv_asm(CONV_RTE, dst0, tmp_dst); \ + VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + \ + /*G*/ \ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \ + \ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + tmp_dst = tmp_dst * f32Var - bgrMean.y; \ + tmp_dst = tmp_dst * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, tmp_dst); \ + VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + \ + /*B*/ \ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \ + \ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + tmp_dst = tmp_dst * f32Var - bgrMean.x; \ + tmp_dst = tmp_dst * outputScale + outputZP; \ + _viv_asm(CONV_RTE, dst0, tmp_dst); \ + VXC_DP2x8(dstB, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_DP2x8(dst, dstRG, dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBLo_2x8); \ + copy_type result; \ + _viv_asm(COPY, result, dst, 16); \ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(dst, dstRG, 
dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBHi_2x8); \ + _viv_asm(COPY, result, dst, 16); \ + VXC_WriteImage(output, coord_out.zy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ +} +IMAGE_PRE_PROCESS_NHWC(U8, uint4, vxc_uchar16, vxc_uchar16) +IMAGE_PRE_PROCESS_NHWC(I8, int4, vxc_char16, vxc_char16) +IMAGE_PRE_PROCESS_NHWC(I16, int4, vxc_short8, vxc_short8) +IMAGE_PRE_PROCESS_NHWC(F16, half4, vxc_half8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx new file mode 100644 index 0000000..4600537 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx @@ -0,0 +1,281 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpR2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpR3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpR4th_4x4; +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpG1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpG2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpG3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpG4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; + +_viv_uniform VXC_512Bits uniCalculateG1st_4x4; +_viv_uniform VXC_512Bits uniCalculateG2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateG3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateG4th_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpB1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpB2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4; +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; + +_viv_uniform VXC_512Bits uniPackBG0_2x8; +_viv_uniform VXC_512Bits uniPackTmpAndR_2x8; +_viv_uniform VXC_512Bits uniPackRB0_2x8; +_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8; +_viv_uniform VXC_512Bits uniPackGR1_2x8; +_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8; +_viv_uniform VXC_512Bits uniPackBG1_2x8; +_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8; +_viv_uniform VXC_512Bits uniPackRB2_2x8; +_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8; +_viv_uniform VXC_512Bits uniPackGR2_2x8; +_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8; + +_viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8HiG_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8LoR_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8HiR_2x8; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; +_viv_uniform int zp; +_viv_uniform float outputScale; + +__kernel void pre_process_yuv420_copy_U8toU8( + __read_only image2d_t y_img, + __read_only image2d_t u_img, + __read_only image2d_t v_img, + __write_only image2d_array_t output, + global int * xRatio, + global int * yRatio, + global int * xOffset, + global int * yOffset, + float rMean, + float gMean, + float bMean, + float var, + int reverse_channel, + int trans + ) +{ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); + int4 pos1 = (int4)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1, 0, 0); + vxc_uchar16 Y; + vxc_uchar8 U, V; + vxc_int4 C0, C1, C2, C3; + vxc_uchar16 R, G, B; + vxc_uchar16 dst0, dst1, dst2; + + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 
7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + //C = Y - 16; + //D = U - 128; + //E = V - 128; + // calculate R + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] + int tmpV = -56992; + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); + + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + + // calculate G + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] + // 298Y - 208V + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); + // 34784 - 100U + ushort tmpG = 34784; + vxc_ushort8 tmpDstG; + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); + VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); + VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); + + // calculate B + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); + tmpV = -70688; + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + + var *= outputScale; + float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\ + rMean * var - zp, var); + half4 paramData_f16; + _viv_asm(CONV, paramData_f16, paramData); + + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); + + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, 
VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); + + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); + + pos = (int4)(get_global_id(0), get_global_id(1), 0, 0); + pos.z = bOrder; + VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + pos.z = 1; + VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + pos.z = rOrder; + VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} + +// store bgrbgrbgr +__kernel void pre_process_yuv420_copy_trans_U8( + __read_only image2d_t y_img, + __read_only image2d_t u_img, + __read_only image2d_t v_img, + __write_only image2d_array_t output, + global int * xRatio, + global int * yRatio, + global int * xOffset, + global int * yOffset, + float rMean, + float gMean, + float bMean, + float var, + int reverse_channel, + int trans + ) +{ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); + int4 pos1 = (int4)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1, 0, 0); + vxc_uchar16 Y; + vxc_uchar8 U, V; + vxc_int4 C0, C1, C2, C3; + vxc_uchar16 R, G, B; + vxc_uchar16 dst; + + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + var *= outputScale; + float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\ + rMean * var - zp, var); + half4 paramData_f16; + _viv_asm(CONV, paramData_f16, paramData); + + //C = Y - 16; + //D = U - 128; + //E = V - 128; + // calculate R + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] + int tmpV = -56992; + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); + + VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + + VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); + VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); + + // calculate G + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] + // 298Y - 208V + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); + // 34784 - 100U + ushort tmpG = 34784; + vxc_ushort8 
tmpDstG; + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); + VXC_DP4x4(dst, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); + VXC_DP4x4(dst, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); + VXC_DP4x4(dst, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); + VXC_DP4x4(dst, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); + + VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); + VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); + + // calculate B + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); + tmpV = -70688; + VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + + VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); + VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); + + // reorder to bgr + vxc_uchar8 tmpdst0, tmpdst1; + vxc_uchar16 dst0, dst1, dst2; + + if(bOrder == 2) + { + vxc_uchar16 exchangeData = B; + B = R; + R = exchangeData; + } + + // BGR BGR BG + VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8); + VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8); + + // RBG RBG RB + VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8); + VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8); + + pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0); + + VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + pos.x += 16; + + // GRB GRB GR + VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR1_2x8); + VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndB_2x8); + + // BGR BGR BG + VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8); + VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8); + + VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + pos.x += 16; + + // RBG RBG RB + VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8); + VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8); + + // GRB GRB GR + VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8); + VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8); + + VXC_WriteImage2DArray(output, pos, dst2, 
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_fp16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_fp16.vx new file mode 100644 index 0000000..9d4e331 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_fp16.vx @@ -0,0 +1,232 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; + +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; +_viv_uniform VXC_512Bits uniDescaleU8_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; + +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; + +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +__kernel void pre_process_yuv420_scale_U8toF16( + __read_only image2d_array_t y_img, __read_only image2d_array_t u_img, + __read_only image2d_array_t v_img, __write_only image2d_array_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + int4 gidx = get_global_id(0); + int gidy = get_global_id(1); + gidx += (int4)(0, 1, 2, 3); + + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); + int4 sx = fx & 0xffff8000; // Floor + int fy, sy; + fx -= sx; + sx = sx >> 15; + fx = (fx +(1 << 4)) >> 5; + + // for y + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); + sy = fy & 0xffff8000; // Floor + fy -= sy; + sy = sy >> 15; + + sy = sy < 0 ? 0 : sy; + fy = fy < 0 ? 
0 : fy; + + fy = (fy + (1<< 4)) >> 5; + sx += (*xOffset); + sy += (*yOffset); + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); + + vxc_uchar16 Y, U, V; + vxc_int4 C0, C1, C2, C3; + vxc_uchar16 R, G, B; + + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.x + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.x + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.y; + srcPos1.x = sx.y >> 1; + srcPos2.x = sx.y >> 1; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.y + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.y + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.z; + srcPos1.x = sx.z >> 1; + srcPos2.x = sx.z >> 1; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.z + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.z + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.w; + srcPos1.x = sx.w >> 1; + srcPos2.x = sx.w >> 1; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.w + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.w + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); + + //C = Y - 16; D = U - 128; E = V - 128; + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] + int tmpV = -56992; + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] + // 298Y - 208V + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); + // 34784 - 100U + ushort tmpG = 34784; + vxc_ushort8 tmpDstG, tmpDstG1; + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); + VXC_DP4x4(G, C3, tmpDstG1, 
VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); + + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); + tmpV = -70688; + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + + int4 result, temp1, temp2; + int4 tmpData0, tmpData1; + + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + // temp2 - temp1 + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + + vxc_half8 tmpVal; + half4 hDst; + tmpV = 1 << 19; + vxc_short8 dst; + float4 tmpDst; + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = (tmpDst - bMean) * var; + dstPos.z = bOrder; + _viv_asm(CONV, hDst, tmpDst); + VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = (tmpDst - gMean) * var; + dstPos.z = 1; + _viv_asm(CONV, hDst, tmpDst); + VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, 
VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = (tmpDst - rMean) * var; + dstPos.z = rOrder; + _viv_asm(CONV, hDst, tmpDst); + VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i16.vx new file mode 100644 index 0000000..8bc4c0b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i16.vx @@ -0,0 +1,227 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; + +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniDescaleU8_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; + +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; +_viv_uniform float outputScale; + +__kernel void pre_process_yuv420_scale_U8toI16( + __read_only image2d_array_t y_img, __read_only image2d_array_t u_img, + __read_only image2d_array_t v_img, __write_only image2d_array_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + int4 gidx = get_global_id(0); + int gidy = get_global_id(1); + gidx += (int4)(0, 1, 2, 3); + + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); + int4 sx = fx & 0xffff8000; // Floor + int fy, sy; + fx -= sx; + sx = sx >> 15; + fx = (fx +(1 << 4)) >> 5; + + // for y + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); + sy = fy & 0xffff8000; // Floor + fy -= sy; + sy = sy >> 15; + + sy = sy < 0 ? 0 : sy; + fy = fy < 0 ? 
0 : fy; + + fy = (fy + (1<< 4)) >> 5; + sx += (*xOffset); + sy += (*yOffset); + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); + + vxc_uchar16 Y, U, V; + vxc_int4 C0, C1, C2, C3; + vxc_uchar16 R, G, B; + + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.x + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.x + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.y; + srcPos1.x = sx.y >> 1; + srcPos2.x = sx.y >> 1; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.y + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.y + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.z; + srcPos1.x = sx.z >> 1; + srcPos2.x = sx.z >> 1; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.z + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.z + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.w; + srcPos1.x = sx.w >> 1; + srcPos2.x = sx.w >> 1; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.w + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.w + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); + + //C = Y - 16; D = U - 128; E = V - 128; + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] + int tmpV = -56992; + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] + // 298Y - 208V + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); + // 34784 - 100U + ushort tmpG = 34784; + vxc_ushort8 tmpDstG, tmpDstG1; + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); + VXC_DP4x4(G, C3, tmpDstG1, 
VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); + + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); + tmpV = -70688; + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + + int4 result, temp1, temp2; + int4 tmpData0, tmpData1; + + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + // temp2 - temp1 + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + + tmpV = 1 << 19; + vxc_short8 dst; + float4 tmpDst; + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = (tmpDst - bMean) * var; + dstPos.z = bOrder; + result = convert_int4_rte(tmpDst * outputScale); + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = (tmpDst - gMean) * var; + dstPos.z = 1; + result = convert_int4_rte(tmpDst * outputScale); + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = 
(tmpDst - rMean) * var; + dstPos.z = rOrder; + result = convert_int4_rte(tmpDst * outputScale); + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i8.vx new file mode 100644 index 0000000..d3150b0 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i8.vx @@ -0,0 +1,227 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; + +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniDescaleU8_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; + +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; +_viv_uniform float outputScale; + +__kernel void pre_process_yuv420_scale_U8toI8( + __read_only image2d_array_t y_img, __read_only image2d_array_t u_img, + __read_only image2d_array_t v_img, __write_only image2d_array_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + int4 gidx = get_global_id(0); + int gidy = get_global_id(1); + gidx += (int4)(0, 1, 2, 3); + + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); + int4 sx = fx & 0xffff8000; // Floor + int fy, sy; + fx -= sx; + sx = sx >> 15; + fx = (fx +(1 << 4)) >> 5; + + // for y + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); + sy = fy & 0xffff8000; // Floor + fy -= sy; + sy = sy >> 15; + + sy = sy < 0 ? 0 : sy; + fy = fy < 0 ? 
0 : fy; + + fy = (fy + (1<< 4)) >> 5; + sx += (*xOffset); + sy += (*yOffset); + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); + + vxc_uchar16 Y, U, V; + vxc_int4 C0, C1, C2, C3; + vxc_uchar16 R, G, B; + + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.x + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.x + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.y; + srcPos1.x = sx.y >> 1; + srcPos2.x = sx.y >> 1; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.y + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.y + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.z; + srcPos1.x = sx.z >> 1; + srcPos2.x = sx.z >> 1; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.z + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.z + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.w; + srcPos1.x = sx.w >> 1; + srcPos2.x = sx.w >> 1; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.w + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.w + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); + + //C = Y - 16; D = U - 128; E = V - 128; + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] + int tmpV = -56992; + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] + // 298Y - 208V + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); + // 34784 - 100U + ushort tmpG = 34784; + vxc_ushort8 tmpDstG, tmpDstG1; + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); + VXC_DP4x4(G, C3, tmpDstG1, 
VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); + + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); + tmpV = -70688; + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + + int4 result, temp1, temp2; + int4 tmpData0, tmpData1; + + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + // temp2 - temp1 + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + + tmpV = 1 << 19; + vxc_char8 dst; + float4 tmpDst; + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = (tmpDst - bMean) * var; + dstPos.z = bOrder; + result = convert_int4_rte(tmpDst * outputScale); + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = (tmpDst - gMean) * var; + dstPos.z = 1; + result = convert_int4_rte(tmpDst * outputScale); + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = 
(tmpDst - rMean) * var; + dstPos.z = rOrder; + result = convert_int4_rte(tmpDst * outputScale); + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_u8.vx new file mode 100644 index 0000000..6a0340b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_u8.vx @@ -0,0 +1,228 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; + +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniDescaleU8_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; + +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; +_viv_uniform int zp; +_viv_uniform float outputScale; + +__kernel void pre_process_yuv420_scale_U8toU8( + __read_only image2d_array_t y_img, __read_only image2d_array_t u_img, + __read_only image2d_array_t v_img, __write_only image2d_array_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + int4 gidx = get_global_id(0); + int gidy = get_global_id(1); + gidx += (int4)(0, 1, 2, 3); + + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); + int4 sx = fx & 0xffff8000; // Floor + int fy, sy; + fx -= sx; + sx = sx >> 15; + fx = (fx +(1 << 4)) >> 5; + + // for y + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); + sy = fy & 0xffff8000; // Floor + fy -= sy; + sy = sy >> 15; + + sy = sy < 0 ? 0 : sy; + fy = fy < 0 ? 
0 : fy; + + fy = (fy + (1<< 4)) >> 5; + sx += (*xOffset); + sy += (*yOffset); + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); + + vxc_uchar16 Y, U, V; + vxc_int4 C0, C1, C2, C3; + vxc_uchar16 R, G, B; + + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.x + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.x + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.y; + srcPos1.x = sx.y >> 1; + srcPos2.x = sx.y >> 1; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.y + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.y + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.z; + srcPos1.x = sx.z >> 1; + srcPos2.x = sx.z >> 1; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.z + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.z + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.w; + srcPos1.x = sx.w >> 1; + srcPos2.x = sx.w >> 1; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.w + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.w + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); + + //C = Y - 16; D = U - 128; E = V - 128; + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] + int tmpV = -56992; + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] + // 298Y - 208V + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); + // 34784 - 100U + ushort tmpG = 34784; + vxc_ushort8 tmpDstG, tmpDstG1; + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); + VXC_DP4x4(G, C3, tmpDstG1, 
VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); + + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); + tmpV = -70688; + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + + int4 result, temp1, temp2; + int4 tmpData0, tmpData1; + + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + // temp2 - temp1 + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + + tmpV = 1 << 19; + vxc_uchar8 dst; + float4 tmpDst; + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = (tmpDst - bMean) * var; + dstPos.z = bOrder; + result = convert_int4_rte(tmpDst * outputScale + zp); + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = (tmpDst - gMean) * var; + dstPos.z = 1; + result = convert_int4_rte(tmpDst * outputScale + zp); + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + 
tmpDst = (tmpDst - rMean) * var; + dstPos.z = rOrder; + result = convert_int4_rte(tmpDst * outputScale + zp); + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_trans_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_trans_u8.vx new file mode 100644 index 0000000..afb6bef --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_trans_u8.vx @@ -0,0 +1,235 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; + +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniDescaleU8_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; + +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; + +_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8; +_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; +_viv_uniform int zp; +_viv_uniform float outputScale; + +__kernel void pre_process_yuv420_trans_U8toU8( + __read_only image2d_array_t y_img, __read_only image2d_array_t u_img, + __read_only image2d_array_t v_img, __write_only image2d_array_t output, + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + int4 gidx = get_global_id(0); + int gidy = get_global_id(1); + gidx += (int4)(0, 1, 2, 3); + + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); + int4 sx = fx & 0xffff8000; // Floor + int fy, sy; + fx -= sx; + sx = sx >> 15; + fx = (fx +(1 << 4)) >> 5; + + // for y + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); + sy = fy & 0xffff8000; // Floor + fy -= sy; + sy = sy >> 15; + + sy = sy < 0 ? 0 : sy; + fy = fy < 0 ? 
0 : fy; + + fy = (fy + (1<< 4)) >> 5; + sx += (*xOffset); + sy += (*yOffset); + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); + + vxc_uchar16 Y, U, V; + vxc_int4 C0, C1, C2, C3; + vxc_uchar16 R, G, B; + + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.x + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.x + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.y; + srcPos1.x = sx.y >> 1; + srcPos2.x = sx.y >> 1; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.y + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.y + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.z; + srcPos1.x = sx.z >> 1; + srcPos2.x = sx.z >> 1; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.z + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.z + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.w; + srcPos1.x = sx.w >> 1; + srcPos2.x = sx.w >> 1; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); + srcPos1.x = (sx.w + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); + srcPos2.x = (sx.w + 1) >> 1; + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); + + //C = Y - 16; D = U - 128; E = V - 128; + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] + int tmpV = -56992; + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] + // 298Y - 208V + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); + // 34784 - 100U + ushort tmpG = 34784; + vxc_ushort8 tmpDstG, tmpDstG1; + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); + VXC_DP4x4(G, C3, tmpDstG1, 
VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); + + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); + tmpV = -70688; + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + + int4 result, temp1, temp2, dstR, dstG, dstB; + int4 tmpData0, tmpData1; + + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + // temp2 - temp1 + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + + tmpV = 1 << 19; + vxc_uchar8 dst, tmpPack; + float4 tmpDst; + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = (tmpDst - bMean) * var; + dstB = convert_int4_rte(tmpDst * outputScale + zp); + + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = (tmpDst - gMean) * var; + dstG = convert_int4_rte(tmpDst * outputScale + zp); + + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = (tmpDst - rMean) * var; + dstR = convert_int4_rte(tmpDst * outputScale + zp); + + if(bOrder == 2) + { + int4 exchangeData = dstB; + dstB = dstR; + dstR = exchangeData; + } + + VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8); + VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, 
VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8); + + int2 dstPos = (int2)(get_global_id(0) * 3, gidy); + VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx new file mode 100644 index 0000000..ca99597 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx @@ -0,0 +1,279 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpR2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpR3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpR4th_4x4; +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpG1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpG2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpG3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpG4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2_2x8; + +_viv_uniform VXC_512Bits uniCalculateG1st_4x4; +_viv_uniform VXC_512Bits uniCalculateG2nd_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpB1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpB2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4; +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; + +_viv_uniform VXC_512Bits uniPackBG0_2x8; +_viv_uniform VXC_512Bits uniPackTmpAndR_2x8; +_viv_uniform VXC_512Bits uniPackRB0_2x8; +_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8; +_viv_uniform VXC_512Bits uniPackGR1_2x8; +_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8; +_viv_uniform VXC_512Bits uniPackBG1_2x8; +_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8; +_viv_uniform VXC_512Bits uniPackRB2_2x8; +_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8; +_viv_uniform VXC_512Bits uniPackGR2_2x8; +_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8; + +_viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8HiG_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8LoR_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8HiR_2x8; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; +_viv_uniform int zp; +_viv_uniform float outputScale; + +__kernel void pre_process_yuv444_copy_U8toU8( + __read_only image2d_t y_img, + __read_only image2d_t u_img, + __read_only image2d_t v_img, + __write_only image2d_array_t output, + global int * xRatio, + global int * yRatio, + global int * xOffset, + global int * yOffset, + float rMean, + float gMean, + float bMean, + float var, + int reverse_channel, + int trans + ) +{ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); + vxc_uchar16 Y, U, V; + vxc_int4 C0, C1, C2, C3; + vxc_uchar16 R, G, B; + vxc_uchar16 dst0, dst1, dst2; + + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + //C = Y - 16; + //D = U - 128; + //E = V - 128; + // calculate R + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] + int tmpV = -56992; + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); 
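+    // Bias derivation: with C = Y - 16 and E = V - 128,
+    //   298*C + 409*E + 128 = 298*Y + 409*V - (298*16 + 409*128 - 128) = 298*Y + 409*V - 56992,
+    // which is the combined constant tmpV = -56992 consumed by the uniCalculateR1st_4x4 stage below.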
+ VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); + + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + + // calculate G + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] + // 298Y - 208V + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); + // 34784 - 100U + ushort tmpG = 34784; + vxc_ushort8 tmpDstG0, tmpDstG1; + VXC_DP2x8(tmpDstG0, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2_2x8); + + VXC_DP4x4(G, C0, tmpDstG0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); + VXC_DP4x4(G, C1, tmpDstG0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); + + // calculate B + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); + tmpV = -70688; + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + + var *= outputScale; + float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\ + rMean * var - zp, var); + half4 paramData_f16; + _viv_asm(CONV, paramData_f16, paramData); + + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); + + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); + + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); 
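+    // Normalization and quantization are folded into the dot products above: with
+    // var' = var * outputScale, (pixel - mean) * var * outputScale + zp rewrites as
+    // pixel * var' - (mean * var' - zp), so paramData presumably carries the per-channel
+    // constants bMean*var' - zp, gMean*var' - zp, rMean*var' - zp together with the scale var'.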
+ + pos = (int4)(get_global_id(0), get_global_id(1), 0, 0); + pos.z = bOrder; + VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + pos.z = 1; + VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + pos.z = rOrder; + VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} + +// store bgrbgrbgr +__kernel void pre_process_yuv444_copy_trans_U8( + __read_only image2d_t y_img, + __read_only image2d_t u_img, + __read_only image2d_t v_img, + __write_only image2d_array_t output, + global int * xRatio, + global int * yRatio, + global int * xOffset, + global int * yOffset, + float rMean, + float gMean, + float bMean, + float var, + int reverse_channel, + int trans + ) +{ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); + vxc_uchar16 Y, U, V; + vxc_int4 C0, C1, C2, C3; + vxc_uchar16 R, G, B; + vxc_uchar16 dst; + + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + var *= outputScale; + float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\ + rMean * var - zp, var); + half4 paramData_f16; + _viv_asm(CONV, paramData_f16, paramData); + + //C = Y - 16; + //D = U - 128; + //E = V - 128; + // calculate R + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] + int tmpV = -56992; + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); + + VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + + VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); + VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); + + // calculate G + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] + // 298Y - 208V + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); + // 34784 - 100U + ushort tmpG = 34784; + vxc_ushort8 tmpDstG0, tmpDstG1; + VXC_DP2x8(tmpDstG0, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2_2x8); + VXC_DP4x4(dst, C0, tmpDstG0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); + VXC_DP4x4(dst, C1, tmpDstG0, VXC_MODIFIER(4, 7, 0, 
VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); + VXC_DP4x4(dst, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); + VXC_DP4x4(dst, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); + + VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); + VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); + + // calculate B + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); + tmpV = -70688; + VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + + VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); + VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); + + // reorder to bgr + vxc_uchar8 tmpdst0, tmpdst1; + vxc_uchar16 dst0, dst1, dst2; + + if(bOrder == 2) + { + vxc_uchar16 exchangeData = B; + B = R; + R = exchangeData; + } + + // BGR BGR BG + VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8); + VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8); + + // RBG RBG RB + VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8); + VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8); + + pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0); + + VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + pos.x += 16; + + // GRB GRB GR + VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR1_2x8); + VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndB_2x8); + + // BGR BGR BG + VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8); + VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8); + + VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + pos.x += 16; + + // RBG RBG RB + VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8); + VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8); + + // GRB GRB GR + VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8); + VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8); + + VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale.vx new file mode 100644 index 
0000000..a195750 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale.vx @@ -0,0 +1,190 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; + +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniDescaleU8_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; + +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; +_viv_uniform int zp; +_viv_uniform float outputScale; + +#define IMAGE_PRE_PROCESS_YUV444_QINT(dst_name, dst_type) \ +__kernel void pre_process_yuv444_scale_U8to##dst_name( \ + __read_only image2d_t y_img, __read_only image2d_t u_img, \ + __read_only image2d_t v_img, __write_only image2d_array_t output, \ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \ +{ \ + int4 gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + gidx += (int4)(0, 1, 2, 3); \ + \ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \ + int4 sx = fx & 0xffff8000; \ + int fy, sy; \ + fx -= sx; \ + sx = sx >> 15; \ + fx = (fx +(1 << 4)) >> 5; \ + \ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \ + sy = fy & 0xffff8000; \ + fy -= sy; \ + sy = sy >> 15; \ + \ + sy = sy < 0 ? 0 : sy; \ + fy = fy < 0 ? 
0 : fy; \ + \ + fy = (fy + (1<< 4)) >> 5; \ + sx += (*xOffset); \ + sy += (*yOffset); \ + int2 srcPos = (int2)(sx.x, sy); \ + \ + vxc_uchar16 Y, U, V; \ + vxc_int4 C0, C1, C2, C3; \ + vxc_uchar16 R, G, B; \ + \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.y; \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.z; \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.w; \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + int tmpV = -56992; \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, 
VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \ + \ + ushort tmpG = 34784; \ + vxc_ushort8 tmpDstG, tmpDstG1; \ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ + \ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \ + tmpV = -70688; \ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + \ + int4 result, temp1, temp2; \ + int4 tmpData0, tmpData1; \ + \ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + \ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + \ + tmpV = 1 << 19; \ + dst_type dst; \ + float4 tmpDst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - bMean) * var; \ + dstPos.z = bOrder; \ + result = convert_int4_rte(tmpDst * outputScale + zp); \ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, G, G, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - gMean) * var; \ + dstPos.z = 1; \ + result = convert_int4_rte(tmpDst * outputScale + zp); \ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - rMean) * var; \ + dstPos.z = rOrder; \ + result = convert_int4_rte(tmpDst * outputScale + zp); \ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +IMAGE_PRE_PROCESS_YUV444_QINT(U8, vxc_uchar8) +IMAGE_PRE_PROCESS_YUV444_QINT(I8, vxc_char8) +IMAGE_PRE_PROCESS_YUV444_QINT(I16, vxc_short8) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale_fp16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale_fp16.vx new file mode 100644 index 0000000..c5e706d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale_fp16.vx @@ -0,0 +1,196 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; + +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; +_viv_uniform VXC_512Bits uniDescaleU8_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; + +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; + +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +__kernel void pre_process_yuv444_scale_U8toF16( + __read_only image2d_t y_img, __read_only image2d_t u_img, + __read_only image2d_t v_img, __write_only image2d_array_t output, + global int *xRatio, global int * 
yRatio, global int * xOffset, global int * yOffset, + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) +{ + int4 gidx = get_global_id(0); + int gidy = get_global_id(1); + gidx += (int4)(0, 1, 2, 3); + + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); + int4 sx = fx & 0xffff8000; // Floor + int fy, sy; + fx -= sx; + sx = sx >> 15; + fx = (fx +(1 << 4)) >> 5; + + // for y + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); + sy = fy & 0xffff8000; // Floor + fy -= sy; + sy = sy >> 15; + + sy = sy < 0 ? 0 : sy; + fy = fy < 0 ? 0 : fy; + + fy = (fy + (1<< 4)) >> 5; + sx += (*xOffset); + sy += (*yOffset); + int2 srcPos = (int2)(sx.x, sy); + + vxc_uchar16 Y, U, V; + vxc_int4 C0, C1, C2, C3; + vxc_uchar16 R, G, B; + + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.y; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.z; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); + + srcPos.x = sx.w; + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); + + //C = Y - 16; D = U - 128; E = V - 128; + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] + int tmpV = -56992; + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] + // 298Y - 208V + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); + // 34784 - 100U + ushort tmpG = 34784; + vxc_ushort8 tmpDstG, tmpDstG1; + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); + + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); + tmpV = -70688; + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + + int4 result, temp1, temp2; + int4 tmpData0, tmpData1; + + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + // temp2 - temp1 + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + + vxc_half8 tmpVal; + half4 hDst; + tmpV = 1 << 19; + vxc_short8 dst; + float4 tmpDst; + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, 
VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = (tmpDst - bMean) * var; + dstPos.z = bOrder; + _viv_asm(CONV, hDst, tmpDst); + VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = (tmpDst - gMean) * var; + dstPos.z = 1; + _viv_asm(CONV, hDst, tmpDst); + VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); + temp1 = fx * tmpData0 + tmpData1; + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); + temp2 = fx * tmpData0 + tmpData1; + result = fy * temp2 + (temp1 << 10); + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); + tmpDst = (tmpDst - rMean) * var; + dstPos.z = rOrder; + _viv_asm(CONV, hDst, tmpDst); + VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); + _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_trans_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_trans_u8.vx new file mode 100644 index 0000000..8217d2f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_trans_u8.vx @@ -0,0 +1,196 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; + +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniDescaleU8_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; +_viv_uniform VXC_512Bits 
uniCalculateTmpBWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; + +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; + +_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8; +_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; +_viv_uniform int zp; +_viv_uniform float outputScale; + +#define IMAGE_PRE_PROCESS_YUV444_TRANS(dst_name, dst_type) \ +__kernel void pre_process_yuv444_trans_U8to##dst_name( \ + __read_only image2d_t y_img, __read_only image2d_t u_img, \ + __read_only image2d_t v_img, __write_only image2d_t output, \ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \ +{ \ + int4 gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + gidx += (int4)(0, 1, 2, 3); \ + \ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \ + int4 sx = fx & 0xffff8000; \ + int fy, sy; \ + fx -= sx; \ + sx = sx >> 15; \ + fx = (fx +(1 << 4)) >> 5; \ + \ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \ + sy = fy & 0xffff8000; \ + fy -= sy; \ + sy = sy >> 15; \ + \ + sy = sy < 0 ? 0 : sy; \ + fy = fy < 0 ? 0 : fy; \ + \ + fy = (fy + (1<< 4)) >> 5; \ + sx += (*xOffset); \ + sy += (*yOffset); \ + int2 srcPos = (int2)(sx.x, sy); \ + \ + vxc_uchar16 Y, U, V; \ + vxc_int4 C0, C1, C2, C3; \ + vxc_uchar16 R, G, B; \ + \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.y; \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.z; \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 
0)); \ + \ + srcPos.x = sx.w; \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + int tmpV = -56992; \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \ + \ + ushort tmpG = 34784; \ + vxc_ushort8 tmpDstG, tmpDstG1; \ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ + \ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \ + tmpV = -70688; \ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + \ + int4 result, temp1, temp2, dstR, dstG, dstB; \ + int4 tmpData0, tmpData1; \ + \ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + \ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + \ + tmpV = 1 << 19; \ + dst_type dst, tmpPack; \ + float4 tmpDst; \ + \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - bMean) * var; \ + dstB = convert_int4_rte(tmpDst * outputScale + zp); \ + \ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - gMean) * var; \ + dstG = convert_int4_rte(tmpDst * outputScale + zp); \ + \ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - rMean) * var; \ + dstR = convert_int4_rte(tmpDst * outputScale + zp); \ + \ + if(bOrder == 2) \ + { \ + int4 exchangeData = dstB; \ + dstB = dstR; \ + dstR = exchangeData; \ + } \ + \ + VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \ + VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8); \ + VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8); \ + \ + int2 dstPos = (int2)(get_global_id(0) * 3, gidy); \ + VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \ +} +IMAGE_PRE_PROCESS_YUV444_TRANS(U8, vxc_uchar16) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/prelu.vx b/src/tim/vx/internal/src/libnnext/ops/vx/prelu.vx new file mode 100644 index 0000000..695601d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/prelu.vx @@ -0,0 +1,274 @@ + +#include "cl_viv_vx_ext.h" + +#if (VX_VERSION==2) +_viv_uniform VXC_512Bits uniPreluDFPLo_2x8b; +_viv_uniform VXC_512Bits uniPreluDFPHi_2x8b; +__kernel void prelu_I8F16toI8_2D_OPT +( + image2d_array_t input, + image2d_array_t param, + image2d_array_t output +) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_char16 in, dst; + vxc_char32 src; + vxc_short8 a0, a1; + vxc_half8 c0, c1; + VXC_ReadImage(in, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, 
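+    // PReLU: f(x) = x for x >= 0 and alpha * x for x < 0, with the per-element slope alpha
+    // read from the F16 'param' image. The input is split into a positive part max(in, 0)
+    // and a negative part min(in, 0) so the uniPreluDFP*_2x8b dot products can produce
+    // pos + alpha * neg in a single pass (DFP here presumably refers to the dynamic
+    // fixed-point I8/I16 formats).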
VXC_RM_TowardZero, 0)); + VXC_ReadImage(a0, param, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(a1, param, coord, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, c0, a0, 4); + _viv_asm(COPY, c1, a1, 4); + src.hi = max(in, 0); + src.lo = min(in, 0); + + VXC_DP2x8_b(dst, src.hi, src.lo, c0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniPreluDFPLo_2x8b); + VXC_DP2x8_b(dst, src.hi, src.lo, c1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniPreluDFPHi_2x8b); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void prelu_I16F16toI16_2D_OPT + ( + image2d_array_t input, + image2d_array_t param, + image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 in, dst; + vxc_short16 src; + vxc_short8 a0; + vxc_half8 c0; + VXC_ReadImage(in, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(a0, param, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, c0, a0, 4); + src.hi = max(in, 0); + src.lo = min(in, 0); + VXC_DP2x8_b(dst, src.hi, src.lo, c0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniPreluDFPLo_2x8b); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} +#else +_viv_uniform VXC_512Bits uniPreluInt8_2x8; +_viv_uniform VXC_512Bits uniPreluInt16_part0_4x4; +_viv_uniform VXC_512Bits uniPreluInt16_part1_4x4; +__kernel void prelu_I8F16toI8_2D_OPT +( + image2d_array_t input, + image2d_array_t param, + image2d_array_t output +) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + vxc_char16 in, dst; + vxc_char16 src0, src1, src; + vxc_short8 a0, a1; + vxc_half8 c0, c1; + VXC_ReadImage(in, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(a0, param, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(a1, param, coord, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, c0, a0, 4); + _viv_asm(COPY, c1, a1, 4); + src0 = max(in, 0); + src1 = min(in, 0); + _viv_asm(COPY, src, src0, 16); + src.s89abcdef = src1.s01234567; + VXC_DP2x8(dst, src, c0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniPreluInt8_2x8); + _viv_asm(COPY, src, src1, 16); + src.s01234567 = src0.s89abcdef; + VXC_DP2x8(dst, src, c1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniPreluInt8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void prelu_I16F16toI16_2D_OPT + ( + image2d_array_t input, + image2d_array_t param, + image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + vxc_short8 in, dst; + vxc_short8 src0, src1, src; + vxc_short8 a0; + vxc_half8 c0; + VXC_ReadImage(in, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(a0, param, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, c0, a0, 4); + src0 = max(in, 0); + src1 = min(in, 0); + _viv_asm(COPY, src, src0, 16); + src.s4567 = src1.s0123; + VXC_DP4x4(dst, src, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniPreluInt16_part0_4x4); + _viv_asm(COPY, src, src1, 16); + src.s0123 = src0.s4567; + VXC_DP4x4(dst, src, c0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniPreluInt16_part1_4x4); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} +#endif + +_viv_uniform VXC_512Bits 
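+// The PRELU_F16_* kernels below handle mixed quantized/float types generically: the first
+// operand is dequantized as (q - inputZP0) * input_scale0, the F16 second operand supplies
+// the slope, PReLU is evaluated in float as max(x, 0) + alpha * min(x, 0), offset by
+// outputZP, and the result is packed back to the destination type via uniExtact8Bin_2x8.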
uniDataSubZPtoFp32Part0_4x4; +_viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4; +_viv_uniform VXC_512Bits uniConvF16toF32_part0_4x4; +_viv_uniform VXC_512Bits uniConvF16toF32_part1_4x4; +_viv_uniform VXC_512Bits uniExtact8Bin_2x8; +_viv_uniform int inputZP0; +_viv_uniform int inputZP1; +_viv_uniform float input_scale0; +_viv_uniform float input_scale1; +_viv_uniform float outputZP; +#define PRELU_F16_3D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \ + __kernel void prelu_##name0##to##name1( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output) \ +{\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\ + vxc_float4 vecA, vecB, vecC, vecD;\ + input_type0 srcA;\ + copy_type0 src0;\ + vxc_short8 srcB;\ + vxc_half8 src1;\ + input_type0 input_ZP;\ + VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ + _viv_asm(COPY, src0, srcA, 16); \ + VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ + _viv_asm(COPY, src1, srcB, 16); \ + \ + _viv_asm(COPY, input_ZP, inputZP0, 4);\ + VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \ + uniDataSubZPtoFp32Part0_4x4); \ + VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \ + uniDataSubZPtoFp32Part1_4x4);\ + VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\ + VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\ + \ + vecA = vecA * input_scale0;\ + vecB = vecB * input_scale0;\ + vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \ + vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \ + vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \ + vxc_float4 minData1 = vecB < 0 ? 
vecB : 0.0; \ + vecA = maxData0 + vecC * minData0 + outputZP;\ + vecB = maxData1 + vecD * minData1 + outputZP;\ + convert_type dst0, dst1;\ + _viv_asm(CONV_RTE, dst0, vecA);\ + _viv_asm(CONV_RTE, dst1, vecB);\ + output_type dst2;\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bin_2x8);\ + copy_type dst;\ + _viv_asm(COPY, dst, dst2, 16); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ +} +// name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type +PRELU_F16_3D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16) +PRELU_F16_3D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8) +PRELU_F16_3D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8) +PRELU_F16_3D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8) +PRELU_F16_3D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16) +PRELU_F16_3D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8) +PRELU_F16_3D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8) +PRELU_F16_3D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16) +PRELU_F16_3D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8) +PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16) + +#define PRELU_F16_2D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \ + __kernel void prelu_##name0##to##name1##_2D( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output) \ +{\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\ + vxc_float4 vecA, vecB, vecC, vecD;\ + input_type0 srcA;\ + copy_type0 src0;\ + vxc_short8 srcB;\ + vxc_half8 src1;\ + input_type0 input_ZP;\ + VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ + _viv_asm(COPY, src0, srcA, 16); \ + VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ + _viv_asm(COPY, src1, srcB, 16); \ + \ + _viv_asm(COPY, input_ZP, inputZP0, 4);\ + VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\ + VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\ + VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\ + VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\ + \ + vecA = vecA * input_scale0;\ + vecB = vecB * input_scale0;\ + vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \ + vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \ + vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \ + vxc_float4 minData1 = vecB < 0 ? 
vecB : 0.0; \ + vecA = maxData0 + vecC * minData0 + outputZP;\ + vecB = maxData1 + vecD * minData1 + outputZP;\ + convert_type dst0, dst1;\ + _viv_asm(CONV_RTE, dst0, vecA);\ + _viv_asm(CONV_RTE, dst1, vecB);\ + output_type dst2;\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bin_2x8);\ + copy_type dst;\ + _viv_asm(COPY, dst, dst2, 16); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ +} +PRELU_F16_2D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8) +PRELU_F16_2D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16) +PRELU_F16_2D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8) +PRELU_F16_2D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16) +PRELU_F16_2D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8) +PRELU_F16_2D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8) +PRELU_F16_2D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16) +PRELU_F16_2D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8) +PRELU_F16_2D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8) +PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16) + +#define PRELU_U8_2D(name, output_type, convert_type, copy_type) \ + __kernel void prelu_U8U8to##name##_2D( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output) \ +{\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\ + vxc_float4 vecA, vecB, vecC, vecD;\ + vxc_uchar16 src0;\ + vxc_uchar16 src1;\ + vxc_uchar16 input_ZP0;\ + vxc_uchar16 input_ZP1;\ + VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ + VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ + \ + _viv_asm(COPY, input_ZP0, inputZP0, 4);\ + VXC_DP4x4(vecA, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\ + VXC_DP4x4(vecB, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\ + _viv_asm(COPY, input_ZP1, inputZP1, 4);\ + VXC_DP4x4(vecC, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\ + VXC_DP4x4(vecD, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\ + \ + vecA = vecA * input_scale0;\ + vecB = vecB * input_scale0;\ + vecC = vecC * input_scale1;\ + vecD = vecD * input_scale1;\ + vxc_float4 maxData0 = vecA >= 0 ? vecA : 0.0; \ + vxc_float4 maxData1 = vecB >= 0 ? vecB : 0.0; \ + vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \ + vxc_float4 minData1 = vecB < 0 ? 
vecB : 0.0; \ + vecA = maxData0 + vecC * minData0 + outputZP;\ + vecB = maxData1 + vecD * minData1 + outputZP;\ + convert_type dst0, dst1;\ + _viv_asm(CONV_RTE, dst0, vecA);\ + _viv_asm(CONV_RTE, dst1, vecB);\ + output_type dst2;\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bin_2x8);\ + copy_type dst;\ + _viv_asm(COPY, dst, dst2, 16); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ +} +PRELU_U8_2D(U8, vxc_uchar16, int4, vxc_uchar16) +PRELU_U8_2D(F16, vxc_half8, half4, vxc_short8) + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/prelu_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/prelu_BF16.vx new file mode 100644 index 0000000..5d92d7e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/prelu_BF16.vx @@ -0,0 +1,100 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniConvF16toF32_Part0_4x4; +_viv_uniform VXC_512Bits uniConvF16toF32_Part1_4x4; +_viv_uniform VXC_512Bits uniPackedBF16_2x8; + +#define PRELU_BF16F16TOBF16_PROCESS(read_fun, write_fun) \ + vxc_short8 src0, para_s16; \ + vxc_half8 para_f16; \ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(para_s16, param, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, para_f16, para_s16, 16); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + vxc_ushort8 src1, src2; \ + float4 srcA, srcB; \ + float4 para0_f32, para1_f32; \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, srcA, src1, 16); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, srcB, src1, 16); \ + VXC_DP4x4(para0_f32, para_f16, para_f16, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvF16toF32_Part0_4x4);\ + VXC_DP4x4(para1_f32, para_f16, para_f16, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvF16toF32_Part1_4x4);\ + srcA = srcA >= 0 ? srcA : srcA * para0_f32; \ + srcB = srcB >= 0 ? 
srcB : srcB * para1_f32; \ + _viv_asm(COPY, src1, srcA, 16); \ + _viv_asm(COPY, src2, srcB, 16); \ + VXC_DP2x8(src1, src1, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackedBF16_2x8); \ + write_fun(output, coord, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void prelu_BF16F16toBF16_2D + ( + __read_only image2d_array_t input, + __read_only image2d_array_t param, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + PRELU_BF16F16TOBF16_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +__kernel void prelu_BF16F16toBF16 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t param, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + PRELU_BF16F16TOBF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +#define PRELU_BF16BF16TOBF16_PROCESS(read_fun, write_fun) \ + vxc_short8 src0, para_s16; \ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(para_s16, param, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + vxc_ushort8 src1, src2; \ + float4 srcA, srcB; \ + float4 para0_f32, para1_f32; \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, srcA, src1, 16); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, srcB, src1, 16); \ + VXC_DP2x8(src1, para_s16, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, para0_f32, src1, 16); \ + VXC_DP2x8(src1, para_s16, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, para1_f32, src1, 16); \ + srcA = srcA >= 0 ? srcA : srcA * para0_f32; \ + srcB = srcB >= 0 ? 
srcB : srcB * para1_f32; \ + _viv_asm(COPY, src1, srcA, 16); \ + _viv_asm(COPY, src2, srcB, 16); \ + VXC_DP2x8(src1, src1, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackedBF16_2x8); \ + write_fun(output, coord, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void prelu_BF16BF16toBF16_2D + ( + __read_only image2d_array_t input, + __read_only image2d_array_t param, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + PRELU_BF16BF16TOBF16_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +__kernel void prelu_BF16BF16toBF16 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t param, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + PRELU_BF16BF16TOBF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/random_multinomial.vx b/src/tim/vx/internal/src/libnnext/ops/vx/random_multinomial.vx new file mode 100644 index 0000000..b588303 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/random_multinomial.vx @@ -0,0 +1,275 @@ +#include "cl_viv_vx_ext.h" +_viv_uniform int class_max_iter; + +_viv_uniform VXC_512Bits uniPackMaxData_2x8; +_viv_uniform VXC_512Bits uniGetSubData0to3_4x4; +_viv_uniform VXC_512Bits uniGetSubData4to7_4x4; +_viv_uniform int iter; +_viv_uniform int stride; +_viv_uniform float re_rand_max; + +inline uchar* get_image2D_array_ptr(image2d_array_t input) +{ + int8 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + uchar *src_ptr = (uchar*)desc.s0; + + return src_ptr; +} + +uint4 _philox4x32bumpkey(uint4 key) +{ + uint4 mask = (uint4)((uint)0x9E3779B9, (uint)0xBB67AE85, 0, 0); + //key.x += ((uint)0x9E3779B9); + //key.y += ((uint)0xBB67AE85); + key += mask; + return key; +} + +uint mulhilo32(uint a, uint b, uint* hip) +{ + uint product = (uint)(a * b); + *hip = mul_hi(a, b); + return product; +} + +uint mullo32(uint a, uint b) +{ + return a * b; +} + +uint mulhi32(uint a, uint b) +{ + return mul_hi(a, b); +} + +uint4 _philox4x32round(uint4 ctr, uint4 key) +{ + //uint hi0; + //uint hi1; + uint PHILOX_M4x32_0 = ((uint)0xD2511F53); + uint PHILOX_M4x32_1 = ((uint)0xCD9E8D57); + uint lo0 = mullo32(PHILOX_M4x32_0, ctr.x); + uint hi0 = mulhi32(PHILOX_M4x32_0, ctr.x); + uint lo1 = mullo32(PHILOX_M4x32_1, ctr.z); + uint hi1 = mulhi32(PHILOX_M4x32_1, ctr.z); + //uint lo0 = mulhilo32(PHILOX_M4x32_0, ctr.x, &hi0); + //uint lo1 = mulhilo32(PHILOX_M4x32_1, ctr.z, &hi1); + uint4 out = (uint4)(hi1^ctr.y^key.x, lo1, hi0^ctr.w^key.y, lo0); + return out; +} + +uint4 philox4x32_R_10(uint4 ctr, uint4 key) +{ + uint i; + ctr = _philox4x32round(ctr, key); + for (i = 1; i < 10; i++) + { + key = _philox4x32bumpkey(key); + ctr = _philox4x32round(ctr, key); + } + return ctr; +} + +__kernel void random_seed( + __read_only image2d_array_t seeds, + __write_only image2d_array_t output) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord = (int4)(gidx << 1, gidy, 0, 0); + + int width = get_image_width(seeds); + __global uint* seeds_ptr = (__global uint*)get_image2D_array_ptr(seeds); + seeds_ptr = seeds_ptr + coord.x + coord.y * width; + uint4 key = vload4(0, seeds_ptr); + + uint4 ctr = (uint4)(0); + float4 result = 0; + + width = get_image_width(output); + coord.x = gidx * stride + width * coord.y; + __global float* output_ptr = (__global float*)get_image2D_array_ptr(output); + output_ptr += coord.x; + + for(int i = 0; i < iter; i++) + { + ctr = 
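+        // Counter-based RNG: philox4x32_R_10 runs ten Philox-4x32 rounds, bumping the key
+        // with the Weyl constants 0x9E3779B9 / 0xBB67AE85 between rounds. Each call maps the
+        // (counter, key) pair to four pseudo-random 32-bit words, which are scaled by
+        // re_rand_max (presumably 1 / 2^32) to produce uniform floats for the sampler, and
+        // the output is fed back as the next counter.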
philox4x32_R_10(ctr, key); + result = convert_float4(ctr) * re_rand_max; + vstore4(result, i, output_ptr); + } +} + +#define logE (1.44269502f) +float4 eltwise_unary_exp(float4 x) +{ + x *= logE; + x = exp2(x); + return x; +} +// N times of 8 +// x dim = 1 +__kernel void random_multinomial_cdf_F16 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord = (int4)(gidx, gidy, 0, 0); + + vxc_half8 maxData, data; + vxc_short8 src0; + float4 dst0 = 0, dst1 = 0; + float4 one = (float4)(1, 1, 1, 1); + float tmp = 0; + + int class_max_stride = get_image_width(input); + int offset = gidy * class_max_stride; + __global float* output_ptr = (__global float*)get_image2D_array_ptr(output); + __global float* cdfPtr = output_ptr + offset; + + VXC_ReadImage(maxData, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + for(int i = 1; i < class_max_iter; i++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + _viv_asm(COPY, data, src0, 16); + + VXC_VertMax3_Half(maxData, maxData, maxData, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + VXC_HorzMax3_Half(maxData, maxData, VXC_MODIFIER(0, 5, 0,VXC_RM_TowardZero, 0)); + VXC_DP2x8(maxData, maxData, maxData, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); + VXC_HorzMax3_Half(maxData, maxData, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + + coord.x = 0; + for(int i = 0; i < class_max_iter; i++) + { + float4 val0, val1; + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + _viv_asm(COPY, data, src0, 16); + VXC_DP4x4(val0, data, maxData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); + VXC_DP4x4(val1, data, maxData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData4to7_4x4); + val0 = eltwise_unary_exp(val0); + val1 = eltwise_unary_exp(val1); + val0.x += dst1.w; + dst0 = (float4)(val0.x, (val0.x + val0.y), dot(val0, (float4)(1, 1, 1, 0)), dot(val0, one)); + val1.x += dst0.w; + dst0 = (float4)(val1.x, (val1.x + val1.y), dot(val1, (float4)(1, 1, 1, 0)), dot(val1, one)); + vstore4(dst0, 0, cdfPtr); + vstore4(dst1, 1, cdfPtr); + cdfPtr += 8; + } +} + +__kernel void random_multinomial_cdf_F32 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord = (int4)(gidx, gidy, 0, 0); + + vxc_float4 src0, data; + float maxData0 = FLT_MIN, maxData1 = FLT_MIN; + uint4 ctr = (uint4)(0); + float4 dst = 0; + float4 one = (float4)(1, 1, 1, 1); + + int class_max_stride = get_image_width(input); + float tmp = 0; + int offset = gidy * class_max_stride; + __global float* output_ptr = (__global float*)get_image2D_array_ptr(output); + __global float* cdfPtr = output_ptr + offset; + + int width = get_image_width(input); + __global float* input_ptr = (__global float*)get_image2D_array_ptr(input); + input_ptr = input_ptr + coord.x + coord.y * width; + + float4 maxVal = vload4(0, input_ptr); + for(int i = 1; i < class_max_iter; i++) + { + src0 = vload4(i, input_ptr); + + maxVal = maxVal > src0 ? maxVal : src0; + } + maxVal.xy = maxVal.xy > maxVal.zw ? maxVal.xy : maxVal.zw; + maxData0 = maxVal.x > maxVal.y ? 
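+    // The CDF kernels use the usual softmax trick: subtract the per-row maximum before exp()
+    // for numerical stability, then accumulate a running (unnormalized) cumulative sum into
+    // cdfPtr. random_multinomial below scales a uniform random number by the row total
+    // cdf[class_size - 1] and selects the class with an upper_bound binary search on that CDF.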
maxVal.x : maxVal.y; + + float4 maxData = (float4)(maxData0, maxData0, maxData0, maxData0); + for(int i = 0; i < class_max_iter; i++) + { + float4 val; + src0 = vload4(i, input_ptr); + data = src0 - maxData; + val = eltwise_unary_exp(data); + val.x += dst.w; + dst = (float4)(val.x, (val.x + val.y), dot(val, (float4)(1, 1, 1, 0)), dot(val, one)); + vstore4(dst, i, cdfPtr); + } +} + +uint upper_bound(float* a, int n, float x) +{ + uint l = 0; + uint h = n; + while (l < h) { + int mid = (l + h) >> 1; + if (x >= a[mid]) { + l = mid + 1; + } else { + h = mid; + } + } + return l; +} + +// one thread calculate 4 +__kernel void random_multinomial + ( + __read_only image2d_array_t randoms, + __read_only image2d_array_t cdfs, + __write_only image2d_array_t output, + int class_size + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord = (int4)(gidx, gidy, 0, 0); + + int class_max_stride = get_image_width(cdfs); + int offset = gidy * class_max_stride; + __global float* cdf_ptr = (__global float*)get_image2D_array_ptr(cdfs); + __global float* cdfPtr = cdf_ptr + offset; + + int width = get_image_width(randoms); + offset = coord.x + coord.y * width; + __global float* randoms_ptr = (__global float*)get_image2D_array_ptr(randoms); + randoms_ptr = randoms_ptr + offset; + + width = get_image_width(output); + offset = coord.x + coord.y * width; + __global uint* output_ptr = (__global uint*)get_image2D_array_ptr(output); + output_ptr = output_ptr + offset; + + float4 ran = vload4(0, randoms_ptr); + float total = cdfPtr[class_size - 1]; + float4 target = ran * total; + + uint4 out_class = (uint4)(0); + out_class.x = upper_bound(cdfPtr, class_size, target.x); + out_class.y = upper_bound(cdfPtr, class_size, target.y); + out_class.z = upper_bound(cdfPtr, class_size, target.z); + out_class.w = upper_bound(cdfPtr, class_size, target.w); + + vstore4(out_class, 0, output_ptr); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/reduceall_internal_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/reduceall_internal_axis0.vx new file mode 100644 index 0000000..def5e32 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/reduceall_internal_axis0.vx @@ -0,0 +1,55 @@ +#include "cl_viv_vx_ext.h" +_viv_uniform int axisSize; + +_viv_uniform VXC_512Bits uniS8AddAll_16x1; + +#define REDUCEALL_AXIS0_PROCESS(read_fun, write_fun) \ + vxc_char16 ones = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; \ + vxc_char16 zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; \ + int4 sum_val = 0; \ + result = ones; \ + do \ + { \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + val = val0 != zeros ? 
ones : zeros; \ + VXC_DP16x1(sum_val, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniS8AddAll_16x1); \ + if (sum_val.x != 16) \ + { \ + result = zeros; \ + break; \ + } \ + coord.x += 16; \ + } \ + while(coord.x < axisSize); \ + write_fun(output, coord_out, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + +__kernel void reduceall_axis0_I8toI8 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axisVal + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + vxc_char16 val0; + vxc_char16 val, result; + REDUCEALL_AXIS0_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage) +} + +__kernel void reduceall_axis0_I8toI8_2D + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axisVal + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + int2 coord_out = (int2)(get_global_id(0), 0); + vxc_char16 val0; + vxc_char16 val, result; + REDUCEALL_AXIS0_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/reduceall_internal_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/reduceall_internal_axis1.vx new file mode 100644 index 0000000..5400aa0 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/reduceall_internal_axis1.vx @@ -0,0 +1,48 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int axisSize; + +#define REDUCEALL_AXIS1_PROCESS(read_fun, write_fun) \ + vxc_char16 ones = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; \ + vxc_char16 zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + result = val0 != zeros ? ones : zeros; \ + coord.y++; \ + while(coord.y < axisSize) \ + { \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + val = val0 != zeros ? ones : zeros; \ + result = result & val; \ + coord.y++; \ + } \ + write_fun(output, coord_out, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + +__kernel void reduceall_axis1_I8toI8 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axisVal + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + vxc_char16 val0; + vxc_char16 val, result; + REDUCEALL_AXIS1_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage) +} + +__kernel void reduceall_axis1_I8toI8_2D + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axisVal + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + int2 coord_out = (int2)(get_global_id(0), 0); + vxc_char16 val0; + vxc_char16 val, result; + REDUCEALL_AXIS1_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/reduceall_internal_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/reduceall_internal_axis2.vx new file mode 100644 index 0000000..91b3df4 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/reduceall_internal_axis2.vx @@ -0,0 +1,34 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int axisSize; + +#define REDUCEALL_AXIS2_PROCESS(read_fun, write_fun) \ + vxc_char16 ones = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; \ + vxc_char16 zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + result = val0 != zeros ? 
ones : zeros; \ + coord.z++; \ + while(coord.z < axisSize) \ + { \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + val = val0 != zeros ? ones : zeros; \ + result = result & val; \ + coord.z++; \ + } \ + write_fun(output, coord.xy, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + +__kernel void reduceall_axis2_I8toI8 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axisVal + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + vxc_char16 val0; + vxc_char16 val, result; + REDUCEALL_AXIS2_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage) +} + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/reduceany_internal_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/reduceany_internal_axis0.vx new file mode 100644 index 0000000..301d933 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/reduceany_internal_axis0.vx @@ -0,0 +1,55 @@ +#include "cl_viv_vx_ext.h" +_viv_uniform int axisSize; + +_viv_uniform VXC_512Bits uniS8AddAll_16x1; + +#define REDUCEANY_AXIS0_PROCESS(read_fun, write_fun) \ + vxc_char16 ones = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; \ + vxc_char16 zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; \ + int4 sum_val = 0; \ + result = zeros; \ + do \ + { \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + val = val0 != zeros ? ones : zeros; \ + VXC_DP16x1(sum_val, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniS8AddAll_16x1); \ + if (sum_val.x != 0) \ + { \ + result = ones; \ + break; \ + } \ + coord.x += 16; \ + } \ + while(coord.x < axisSize); \ + write_fun(output, coord_out, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + +__kernel void reduceany_axis0_I8toI8 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axisVal + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + vxc_char16 val0; + vxc_char16 val, result; + REDUCEANY_AXIS0_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage) +} + +__kernel void reduceany_axis0_I8toI8_2D + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axisVal + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + int2 coord_out = (int2)(get_global_id(0), 0); + vxc_char16 val0; + vxc_char16 val, result; + REDUCEANY_AXIS0_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/reduceany_internal_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/reduceany_internal_axis1.vx new file mode 100644 index 0000000..46fd42d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/reduceany_internal_axis1.vx @@ -0,0 +1,48 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int axisSize; + +#define REDUCEANY_AXIS1_PROCESS(read_fun, write_fun) \ + vxc_char16 ones = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; \ + vxc_char16 zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + result = val0 != zeros ? ones : zeros; \ + coord.y++; \ + while(coord.y < axisSize) \ + { \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + val = val0 != zeros ? 
ones : zeros; \ + result = result | val; \ + coord.y++; \ + } \ + write_fun(output, coord_out, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + +__kernel void reduceany_axis1_I8toI8 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axisVal + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + vxc_char16 val0; + vxc_char16 val, result; + REDUCEANY_AXIS1_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage) +} + +__kernel void reduceany_axis1_I8toI8_2D + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axisVal + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + int2 coord_out = (int2)(get_global_id(0), 0); + vxc_char16 val0; + vxc_char16 val, result; + REDUCEANY_AXIS1_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/reduceany_internal_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/reduceany_internal_axis2.vx new file mode 100644 index 0000000..cdfa3cb --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/reduceany_internal_axis2.vx @@ -0,0 +1,34 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int axisSize; + +#define REDUCEANY_AXIS2_PROCESS(read_fun, write_fun) \ + vxc_char16 ones = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; \ + vxc_char16 zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + result = val0 != zeros ? ones : zeros; \ + coord.z++; \ + while(coord.z < axisSize) \ + { \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + val = val0 != zeros ? ones : zeros; \ + result = result | val; \ + coord.z++; \ + } \ + write_fun(output, coord.xy, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + +__kernel void reduceany_axis2_I8toI8 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axisVal + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + vxc_char16 val0; + vxc_char16 val, result; + REDUCEANY_AXIS2_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage) +} + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/reducemax_internal_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/reducemax_internal_axis0.vx new file mode 100644 index 0000000..29541a2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/reducemax_internal_axis0.vx @@ -0,0 +1,172 @@ +#include "cl_viv_vx_ext.h" +_viv_uniform int axisSize; +_viv_uniform float outputScale; +_viv_uniform float output_offset_asymmetric; +_viv_uniform float inputScale; +_viv_uniform float input_offset_asymmetric; +_viv_uniform VXC_512Bits uniGetLoData_4x4; + +_viv_uniform VXC_512Bits uniPackMaxData_2x8; + +#define REDUCEMAX_PROCESS_AXIS0(read_fun, vert_max_fun, horz_max_fun) \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, val, val0, 16); \ + coord.x += 8; \ + do \ + { \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val1, val1, 16); \ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val2, val2, 16); \ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 
0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val3, val3, 16); \ + coord.x += 32; \ + vert_max_fun(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + vert_max_fun(val, img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + horz_max_fun(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \ + horz_max_fun(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + } \ + while(coord.x < (axisSize + 16)); + +#define REDUCEMAX_PROCESS_AXIS0_SAVE_SAME(save_type, write_fun) \ + save_type dst; \ + _viv_asm(COPY, dst, val, 16); \ + write_fun(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + +#define REDUCEMAX_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, write_fun) \ + vxc_float4 prob; \ + dst_type vec1; \ + save_type dst; \ + VXC_DP4x4(prob, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \ + prob = ((prob - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, vec1, prob); \ + _viv_asm(COPY, dst, vec1, 16); \ + write_fun(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + +#define REDUCEMAX_AXIS0_SAME(src_name, dst_name, src_type, copy_type, save_type, vert_max_fun, horz_max_fun) \ +__kernel void reducemax_axis0_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0); \ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + REDUCEMAX_PROCESS_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun); \ + REDUCEMAX_PROCESS_AXIS0_SAVE_SAME(save_type, VXC_WriteImage); \ +} + + +#define REDUCEMAX_AXIS0(src_name, dst_name, src_type, copy_type, dst_type,\ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_max_fun, horz_max_fun) \ +__kernel void reducemax_axis0_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0); \ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + REDUCEMAX_PROCESS_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun); \ + REDUCEMAX_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \ +} + +REDUCEMAX_AXIS0_SAME(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMax3_Half, VXC_HorzMax3_Half) + +REDUCEMAX_AXIS0(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +REDUCEMAX_AXIS0(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, \ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +REDUCEMAX_AXIS0(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ + CONV_SAT_RTE, outputScale, output_offset_asymmetric,\ + 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +REDUCEMAX_AXIS0(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +REDUCEMAX_AXIS0(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ 
+CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +REDUCEMAX_AXIS0(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8, +CONV, 1, 0, inputScale, input_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) + +REDUCEMAX_AXIS0(I16, I16, vxc_short8, vxc_short8, short4, +vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +REDUCEMAX_AXIS0(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8, +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +REDUCEMAX_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\ +outputScale, output_offset_asymmetric, inputScale,\ +input_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) + +#define REDUCEMAX_AXIS0_SAME_2D(src_name, dst_name, src_type, copy_type, save_type, vert_max_fun, horz_max_fun) \ +__kernel void reducemax_axis0_##src_name##to##dst_name##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(16, get_global_id(0)); \ + int2 coord_out = (int2)(get_global_id(0), 0); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + REDUCEMAX_PROCESS_AXIS0(VXC_ReadImage, vert_max_fun, horz_max_fun); \ + REDUCEMAX_PROCESS_AXIS0_SAVE_SAME(save_type, VXC_WriteImage); \ +} + +#define REDUCEMAX_AXIS0_2D(src_name, dst_name, src_type, copy_type,\ + dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_max_fun, horz_max_fun) \ +__kernel void reducemax_axis0_##src_name##to##dst_name##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(16, get_global_id(0)); \ + int2 coord_out = (int2)(get_global_id(0), 0); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + REDUCEMAX_PROCESS_AXIS0(VXC_ReadImage, vert_max_fun, horz_max_fun); \ + REDUCEMAX_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \ +} + +REDUCEMAX_AXIS0_SAME_2D(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMax3_Half, VXC_HorzMax3_Half) + +REDUCEMAX_AXIS0_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +REDUCEMAX_AXIS0_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +REDUCEMAX_AXIS0_2D(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ + CONV_SAT_RTE, outputScale, output_offset_asymmetric,\ + 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +REDUCEMAX_AXIS0_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +REDUCEMAX_AXIS0_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +REDUCEMAX_AXIS0_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, + vxc_short8, CONV, 1, 0, inputScale,\ + input_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +REDUCEMAX_AXIS0_2D(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8, + CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +REDUCEMAX_AXIS0_2D(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8, + CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) 
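+// Each mixed-type instantiation goes through REDUCEMAX_PROCESS_AXIS0_SAVE, which requantizes
+// the reduced maximum; a scalar reading of that macro is roughly:
+//   float prob = (max_val - IN_OFFSET) * IN_SCALE;        // dequantize the input maximum
+//   dst        = convert(prob * OUT_SCALE + OUT_OFFSET);  // requantize to the output type
+// Same-type variants (e.g. F16 to F16) use *_SAVE_SAME and copy the raw maximum through.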
+REDUCEMAX_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\ +outputScale, output_offset_asymmetric, inputScale,\ +input_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/reducemax_internal_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/reducemax_internal_axis1.vx new file mode 100644 index 0000000..08ca873 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/reducemax_internal_axis1.vx @@ -0,0 +1,160 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int axisSize; +_viv_uniform float outputScale; +_viv_uniform float output_offset_asymmetric; +_viv_uniform float inputScale; +_viv_uniform float input_offset_asymmetric; +_viv_uniform VXC_512Bits uniGetLoData_4x4; + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniGetHiData_4x4; + +#define REDUCEMAX_PROCESS_AXIS1(read_fun, vert_max_fun) \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, max, in0, 16); \ + coord.y++; \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + vert_max_fun(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + } \ + while(coord.y < axisSize); + +#define REDUCEMAX_PROCESS_AXIS1_SAVE_SAME(save_type, write_fun) \ + save_type vect; \ + _viv_asm(COPY, vect, max, 16); \ + write_fun(output, coord_out, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +#define REDUCEMAX_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\ +OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, write_fun) \ + dst_type dst0, dst1; \ + save_type vect; \ + VXC_DP4x4(data0, max, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst0, data0); \ + VXC_DP4x4(data0, max, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst1, data0); \ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord_out, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +#define REDUCEMAX_AXIS1_SAME(src_name, dst_name, src_type, copy_type, save_type, vert_max_fun) \ +__kernel void reducemax_axis1_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \ + src_type vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + REDUCEMAX_PROCESS_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \ + REDUCEMAX_PROCESS_AXIS1_SAVE_SAME(save_type, VXC_WriteImage); \ +} + +#define REDUCEMAX_AXIS1(src_name, dst_name, src_type, copy_type, dst_type, save_type,\ +conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_max_fun) \ +__kernel void reducemax_axis1_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \ + src_type vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + REDUCEMAX_PROCESS_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \ + REDUCEMAX_PROCESS_AXIS1_SAVE(dst_type, 
save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \ +} + + + +REDUCEMAX_AXIS1_SAME(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMax3_Half) + + +REDUCEMAX_AXIS1(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half) +REDUCEMAX_AXIS1(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, \ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half) +REDUCEMAX_AXIS1(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8, +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMax3_Half) +REDUCEMAX_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer) +REDUCEMAX_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer) +REDUCEMAX_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8, +CONV, 1, 0, inputScale, input_offset_asymmetric, VXC_VertMax3_Integer) +REDUCEMAX_AXIS1(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8, +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer) +REDUCEMAX_AXIS1(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8, +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer) +REDUCEMAX_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric, VXC_VertMax3_Integer) + + +#define REDUCEMAX_AXIS1_SAME_2D(src_name, dst_name, src_type, copy_type, save_type, vert_max_fun) \ +__kernel void reducemax_axis1_##src_name##to##dst_name##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), 0); \ + int2 coord_out = (int2)(get_global_id(0), 0); \ + src_type vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + REDUCEMAX_PROCESS_AXIS1(VXC_ReadImage, vert_max_fun) \ + REDUCEMAX_PROCESS_AXIS1_SAVE_SAME(save_type, VXC_WriteImage); \ +} + +#define REDUCEMAX_AXIS1_2D(src_name, dst_name, src_type, copy_type, dst_type,\ +save_type, conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_max_fun) \ +__kernel void reducemax_axis1_##src_name##to##dst_name##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), 0); \ + int2 coord_out = (int2)(get_global_id(0), 0); \ + src_type vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + REDUCEMAX_PROCESS_AXIS1(VXC_ReadImage, vert_max_fun) \ + REDUCEMAX_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \ +} + +REDUCEMAX_AXIS1_SAME_2D(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMax3_Half) + +REDUCEMAX_AXIS1_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half) +REDUCEMAX_AXIS1_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, \ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half) +REDUCEMAX_AXIS1_2D(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8, +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMax3_Half) +REDUCEMAX_AXIS1_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer) +REDUCEMAX_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer) +REDUCEMAX_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, +vxc_short8, CONV, 1, 0, inputScale, 
input_offset_asymmetric, VXC_VertMax3_Integer) +REDUCEMAX_AXIS1_2D(I16, I16, vxc_short8, vxc_short8,\ +short4, vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer) +REDUCEMAX_AXIS1_2D(I8, I8, vxc_char16, vxc_char16,\ +char4, vxc_char8, CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer) +REDUCEMAX_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric, VXC_VertMax3_Integer) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/reducemax_internal_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/reducemax_internal_axis2.vx new file mode 100644 index 0000000..635136c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/reducemax_internal_axis2.vx @@ -0,0 +1,102 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int axisSize; +_viv_uniform float outputScale; +_viv_uniform float output_offset_asymmetric; +_viv_uniform float inputScale; +_viv_uniform float input_offset_asymmetric; +_viv_uniform VXC_512Bits uniGetLoData_4x4; + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniGetHiData_4x4; + +#define REDUCEMAX_PROCESS_AXIS2(read_fun, vert_max_fun) \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, max, in0, 16); \ + coord.z++; \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + vert_max_fun(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z++; \ + } \ + while(coord.z < axisSize); + +#define REDUCEMAX_PROCESS_AXIS2_SAVE_SAME(save_type, write_fun) \ + save_type vect; \ + _viv_asm(COPY, vect, max, 16); \ + write_fun(output, coord.xy, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +#define REDUCEMAX_PROCESS_AXIS2_SAVE(dst_type, save_type, conv_mode,\ +OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, write_fun) \ + dst_type dst0, dst1; \ + save_type vect; \ + VXC_DP4x4(data0, max, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst0, data0); \ + VXC_DP4x4(data0, max, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst1, data0); \ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord.xy, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +#define REDUCEMAX_AXIS2_SAME(src_name, dst_name, src_type, copy_type, save_type, vert_max_fun) \ +__kernel void reducemax_axis2_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + src_type vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + REDUCEMAX_PROCESS_AXIS2(VXC_ReadImage2DArray, vert_max_fun) \ + REDUCEMAX_PROCESS_AXIS2_SAVE_SAME(save_type, VXC_WriteImage); \ +} + +#define REDUCEMAX_AXIS2(src_name, dst_name, src_type, copy_type, dst_type,\ +save_type, conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_max_fun) \ +__kernel void reducemax_axis2_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + src_type 
vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + REDUCEMAX_PROCESS_AXIS2(VXC_ReadImage2DArray, vert_max_fun) \ + REDUCEMAX_PROCESS_AXIS2_SAVE(dst_type, save_type, conv_mode, OUT_SCALE,\ + OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \ +} + + + +REDUCEMAX_AXIS2_SAME(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMax3_Half) + + +REDUCEMAX_AXIS2(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half) +REDUCEMAX_AXIS2(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, \ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half) +REDUCEMAX_AXIS2(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMax3_Half) +REDUCEMAX_AXIS2(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer) +REDUCEMAX_AXIS2(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer) +REDUCEMAX_AXIS2(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, input_offset_asymmetric, VXC_VertMax3_Integer) + +REDUCEMAX_AXIS2(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer) +REDUCEMAX_AXIS2(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer) +REDUCEMAX_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric, VXC_VertMax3_Integer) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/reducemin_internal_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/reducemin_internal_axis0.vx new file mode 100644 index 0000000..595badf --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/reducemin_internal_axis0.vx @@ -0,0 +1,172 @@ +#include "cl_viv_vx_ext.h" +_viv_uniform int axisSize; +_viv_uniform float outputScale; +_viv_uniform float output_offset_asymmetric; +_viv_uniform float inputScale; +_viv_uniform float input_offset_asymmetric; +_viv_uniform VXC_512Bits uniGetLoData_4x4; + +_viv_uniform VXC_512Bits uniPackMaxData_2x8; + +#define REDUCEMIN_PROCESS_AXIS0(read_fun, vert_min_fun, horz_min_fun) \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, val, val0, 16); \ + coord.x += 8; \ + do \ + { \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val1, val1, 16); \ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val2, val2, 16); \ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val3, val3, 16); \ + coord.x += 32; \ + vert_min_fun(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + vert_min_fun(val, img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + while(coord.x < (axisSize + 16)); \ + horz_min_fun(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \ + horz_min_fun(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + +#define REDUCEMIN_PROCESS_AXIS0_SAVE_SAME(save_type, 
write_fun) \ + save_type dst; \ + _viv_asm(COPY, dst, val, 16); \ + write_fun(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + +#define REDUCEMIN_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\ +OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, write_fun) \ + vxc_float4 prob; \ + dst_type vec1; \ + save_type dst; \ + VXC_DP4x4(prob, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \ + prob = ((prob - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, vec1, prob); \ + _viv_asm(COPY, dst, vec1, 16); \ + write_fun(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + +#define REDUCEMIN_AXIS0_SAME(src_name, dst_name, src_type, copy_type, save_type, vert_min_fun, horz_min_fun) \ +__kernel void reducemin_axis0_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0); \ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + REDUCEMIN_PROCESS_AXIS0(VXC_ReadImage2DArray, vert_min_fun, horz_min_fun); \ + REDUCEMIN_PROCESS_AXIS0_SAVE_SAME(save_type, VXC_WriteImage); \ +} + + +#define REDUCEMIN_AXIS0(src_name, dst_name, src_type, copy_type, dst_type,\ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET,\ + IN_SCALE, IN_OFFSET, vert_min_fun, horz_min_fun) \ +__kernel void reducemin_axis0_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0); \ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + REDUCEMIN_PROCESS_AXIS0(VXC_ReadImage2DArray, vert_min_fun, horz_min_fun); \ + REDUCEMIN_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \ +} + +REDUCEMIN_AXIS0_SAME(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMin3_Half, VXC_HorzMin3_Half) + +REDUCEMIN_AXIS0(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8, CONV_SAT_RTE, outputScale,\ +0, 1, 0, VXC_VertMin3_Half, VXC_HorzMin3_Half) +REDUCEMIN_AXIS0(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, CONV_SAT_RTE, outputScale,\ +0, 1, 0, VXC_VertMin3_Half, VXC_HorzMin3_Half) +REDUCEMIN_AXIS0(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ + CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMin3_Half, VXC_HorzMin3_Half) +REDUCEMIN_AXIS0(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8, CONV, 1, 0,\ +inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer) +REDUCEMIN_AXIS0(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8, CONV, 1, 0,\ +inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer) +REDUCEMIN_AXIS0(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8, CONV,\ +1, 0, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer, VXC_HorzMin3_Integer) + +REDUCEMIN_AXIS0(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer) +REDUCEMIN_AXIS0(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer) +REDUCEMIN_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, 
CONV_SAT_RTE,\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric,\ +VXC_VertMin3_Integer, VXC_HorzMin3_Integer) + +#define REDUCEMIN_AXIS0_SAME_2D(src_name, dst_name, src_type, copy_type,\ +save_type, vert_min_fun, horz_min_fun) \ +__kernel void reducemin_axis0_##src_name##to##dst_name##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(16, get_global_id(0)); \ + int2 coord_out = (int2)(get_global_id(0), 0); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + REDUCEMIN_PROCESS_AXIS0(VXC_ReadImage, vert_min_fun, horz_min_fun); \ + REDUCEMIN_PROCESS_AXIS0_SAVE_SAME(save_type, VXC_WriteImage); \ +} + +#define REDUCEMIN_AXIS0_2D(src_name, dst_name, src_type, copy_type,\ + dst_type, save_type, conv_mode, OUT_SCALE,\ + OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_min_fun, horz_min_fun) \ +__kernel void reducemin_axis0_##src_name##to##dst_name##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(16, get_global_id(0)); \ + int2 coord_out = (int2)(get_global_id(0), 0); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + REDUCEMIN_PROCESS_AXIS0(VXC_ReadImage, vert_min_fun, horz_min_fun); \ + REDUCEMIN_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \ +} + +REDUCEMIN_AXIS0_SAME_2D(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMin3_Half, VXC_HorzMin3_Half) + +REDUCEMIN_AXIS0_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half, VXC_HorzMin3_Half) +REDUCEMIN_AXIS0_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half, VXC_HorzMin3_Half) +REDUCEMIN_AXIS0_2D(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMin3_Half, VXC_HorzMin3_Half) + +REDUCEMIN_AXIS0_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer) +REDUCEMIN_AXIS0_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer) +REDUCEMIN_AXIS0_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer, VXC_HorzMin3_Integer) +REDUCEMIN_AXIS0_2D(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer) +REDUCEMIN_AXIS0_2D(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer) +REDUCEMIN_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric,\ +VXC_VertMin3_Integer, VXC_HorzMin3_Integer) + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/reducemin_internal_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/reducemin_internal_axis1.vx new file mode 100644 index 0000000..6822dd2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/reducemin_internal_axis1.vx @@ -0,0 +1,160 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int axisSize; +_viv_uniform float outputScale; +_viv_uniform float 
output_offset_asymmetric; +_viv_uniform float inputScale; +_viv_uniform float input_offset_asymmetric; +_viv_uniform VXC_512Bits uniGetLoData_4x4; + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniGetHiData_4x4; + +#define REDUCEMIN_PROCESS_AXIS1(read_fun, vert_min_fun) \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, min, in0, 16); \ + coord.y++; \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + vert_min_fun(min, min, min, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + } \ + while(coord.y < axisSize); + +#define REDUCEMIN_PROCESS_AXIS1_SAVE_SAME(save_type, write_fun) \ + save_type vect; \ + _viv_asm(COPY, vect, min, 16); \ + write_fun(output, coord_out, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +#define REDUCEMIN_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\ +OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, write_fun) \ + dst_type dst0, dst1; \ + save_type vect; \ + VXC_DP4x4(data0, min, min, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst0, data0); \ + VXC_DP4x4(data0, min, min, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst1, data0); \ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord_out, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +#define REDUCEMIN_AXIS1_SAME(src_name, dst_name, src_type, copy_type, save_type, vert_min_fun) \ +__kernel void reducemin_axis1_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \ + src_type vec0, min; \ + copy_type in0; \ + vxc_float4 data0; \ + REDUCEMIN_PROCESS_AXIS1(VXC_ReadImage2DArray, vert_min_fun) \ + REDUCEMIN_PROCESS_AXIS1_SAVE_SAME(save_type, VXC_WriteImage); \ +} + +#define REDUCEMIN_AXIS1(src_name, dst_name, src_type, copy_type, dst_type, save_type,\ +conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_min_fun) \ +__kernel void reducemin_axis1_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \ + src_type vec0, min; \ + copy_type in0; \ + vxc_float4 data0; \ + REDUCEMIN_PROCESS_AXIS1(VXC_ReadImage2DArray, vert_min_fun) \ + REDUCEMIN_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \ +} + + + +REDUCEMIN_AXIS1_SAME(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMin3_Half) + + +REDUCEMIN_AXIS1(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half) +REDUCEMIN_AXIS1(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half) +REDUCEMIN_AXIS1(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMin3_Half) +REDUCEMIN_AXIS1(I16, F16, vxc_short8, vxc_short8, 
half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer) +REDUCEMIN_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer) +REDUCEMIN_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer) +REDUCEMIN_AXIS1(I16, I16, vxc_short8, vxc_short8, short4,\ +vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer) +REDUCEMIN_AXIS1(I8, I8, vxc_char16, vxc_char16, char4,\ +vxc_char8, CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer) +REDUCEMIN_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer) + + +#define REDUCEMIN_AXIS1_SAME_2D(src_name, dst_name, src_type, copy_type, save_type, vert_min_fun) \ +__kernel void reducemin_axis1_##src_name##to##dst_name##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), 0); \ + int2 coord_out = (int2)(get_global_id(0), 0); \ + src_type vec0, min; \ + copy_type in0; \ + vxc_float4 data0; \ + REDUCEMIN_PROCESS_AXIS1(VXC_ReadImage, vert_min_fun) \ + REDUCEMIN_PROCESS_AXIS1_SAVE_SAME(save_type, VXC_WriteImage); \ +} + +#define REDUCEMIN_AXIS1_2D(src_name, dst_name, src_type, copy_type, dst_type, save_type,\ +conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_min_fun) \ +__kernel void reducemin_axis1_##src_name##to##dst_name##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), 0); \ + int2 coord_out = (int2)(get_global_id(0), 0); \ + src_type vec0, min; \ + copy_type in0; \ + vxc_float4 data0; \ + REDUCEMIN_PROCESS_AXIS1(VXC_ReadImage, vert_min_fun) \ + REDUCEMIN_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \ +} + +REDUCEMIN_AXIS1_SAME_2D(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMin3_Half) + +REDUCEMIN_AXIS1_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half) +REDUCEMIN_AXIS1_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half) +REDUCEMIN_AXIS1_2D(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMin3_Half) +REDUCEMIN_AXIS1_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer) +REDUCEMIN_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer) +REDUCEMIN_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer) +REDUCEMIN_AXIS1_2D(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer) +REDUCEMIN_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer) +REDUCEMIN_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/reducemin_internal_axis2.vx 
b/src/tim/vx/internal/src/libnnext/ops/vx/reducemin_internal_axis2.vx new file mode 100644 index 0000000..5e73ff8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/reducemin_internal_axis2.vx @@ -0,0 +1,102 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int axisSize; +_viv_uniform float outputScale; +_viv_uniform float output_offset_asymmetric; +_viv_uniform float inputScale; +_viv_uniform float input_offset_asymmetric; +_viv_uniform VXC_512Bits uniGetLoData_4x4; + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniGetHiData_4x4; + +#define REDUCEMIN_PROCESS_AXIS2(read_fun, vert_min_fun) \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, min, in0, 16); \ + coord.z++; \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + vert_min_fun(min, min, min, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z++; \ + } \ + while(coord.z < axisSize); + +#define REDUCEMIN_PROCESS_AXIS2_SAVE_SAME(save_type, write_fun) \ + save_type vect; \ + _viv_asm(COPY, vect, min, 16); \ + write_fun(output, coord.xy, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +#define REDUCEMIN_PROCESS_AXIS2_SAVE(dst_type, save_type, conv_mode,\ +OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, write_fun) \ + dst_type dst0, dst1; \ + save_type vect; \ + VXC_DP4x4(data0, min, min, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst0, data0); \ + VXC_DP4x4(data0, min, min, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst1, data0); \ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord.xy, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +#define REDUCEMIN_AXIS2_SAME(src_name, dst_name, src_type, copy_type, save_type, vert_min_fun) \ +__kernel void reducemin_axis2_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + src_type vec0, min; \ + copy_type in0; \ + vxc_float4 data0; \ + REDUCEMIN_PROCESS_AXIS2(VXC_ReadImage2DArray, vert_min_fun) \ + REDUCEMIN_PROCESS_AXIS2_SAVE_SAME(save_type, VXC_WriteImage); \ +} + +#define REDUCEMIN_AXIS2(src_name, dst_name, src_type, copy_type, dst_type,\ +save_type, conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_min_fun) \ +__kernel void reducemin_axis2_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + src_type vec0, min; \ + copy_type in0; \ + vxc_float4 data0; \ + REDUCEMIN_PROCESS_AXIS2(VXC_ReadImage2DArray, vert_min_fun) \ + REDUCEMIN_PROCESS_AXIS2_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \ +} + + + +REDUCEMIN_AXIS2_SAME(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMin3_Half) + + +REDUCEMIN_AXIS2(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half) +REDUCEMIN_AXIS2(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half) 
+REDUCEMIN_AXIS2(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMin3_Half) +REDUCEMIN_AXIS2(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer) +REDUCEMIN_AXIS2(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer) +REDUCEMIN_AXIS2(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer) + +REDUCEMIN_AXIS2(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer) +REDUCEMIN_AXIS2(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer) +REDUCEMIN_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/reduceprod_internal_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/reduceprod_internal_axis0.vx new file mode 100644 index 0000000..e811a33 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/reduceprod_internal_axis0.vx @@ -0,0 +1,207 @@ +#include "cl_viv_vx_ext.h" +_viv_uniform float outputScale; +_viv_uniform float output_offset_asymmetric; +_viv_uniform float inputScale; +_viv_uniform float input_offset_asymmetric; +_viv_uniform VXC_512Bits uniGetLoData_4x4; +_viv_uniform VXC_512Bits uniGetHiData_4x4; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; + +_viv_uniform int inputWidth; +_viv_uniform VXC_512Bits uniGetEndLoData_2x8; +_viv_uniform VXC_512Bits uniGetEndHiData_2x8; + +#define REDUCEPROD_PROCESS_AXIS0(read_fun, IN_SCALE, IN_OFFSET) \ + while(coord.x < inputWidth) \ + { \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val, val0, 16); \ + VXC_DP4x4(tmpProdLo, img_val, img_val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \ + VXC_DP4x4(tmpProdHi, img_val, img_val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \ + tmpProdLo = (tmpProdLo - IN_OFFSET) * IN_SCALE; \ + tmpProdHi = (tmpProdHi - IN_OFFSET) * IN_SCALE; \ + tmpProd = tmpProdLo * tmpProdHi; \ + prodValue = prodValue * tmpProd; \ + coord.x += 8; \ + } \ + vxc_ushort8 tmpProdInt0, tmpProdInt1; \ + vxc_ushort8 tmpOnesInt = {0, 16256, 0, 16256, 0, 16256, 0, 16256}; \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val, val0, 16); \ + VXC_DP4x4(tmpProdLo, img_val, img_val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \ + VXC_DP4x4(tmpProdHi, img_val, img_val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \ + tmpProdLo = (tmpProdLo - IN_OFFSET) * IN_SCALE; \ + tmpProdHi = (tmpProdHi - IN_OFFSET) * IN_SCALE; \ + _viv_asm(COPY, tmpProdInt0, tmpProdLo, 16); \ + _viv_asm(COPY, tmpProdInt1, tmpProdHi, 16); \ + VXC_DP2x8(tmpProdInt0, tmpProdInt0, tmpOnesInt,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetEndLoData_2x8); \ + VXC_DP2x8(tmpProdInt1, tmpProdInt1, tmpOnesInt,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetEndHiData_2x8); \ + _viv_asm(COPY, tmpProdLo, tmpProdInt0, 16); \ + _viv_asm(COPY, tmpProdHi, tmpProdInt1, 16); \ + tmpProd = tmpProdLo * tmpProdHi; \ + prodValue = prodValue * 
tmpProd; \ + tmpProd.xy = prodValue.xy * prodValue.zw; \ + prodValue.x = tmpProd.x * tmpProd.y; + +#define REDUCEPROD_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, write_fun) \ + dst_type vec1; \ + save_type dst; \ + prodValue = prodValue * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, vec1, prodValue); \ + _viv_asm(COPY, dst, vec1, 16); \ + write_fun(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + +#define REDUCEPROD_AXIS0(src_name, dst_name, src_type, copy_type, dst_type,\ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET) \ +__kernel void reduceprod_axis0_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); \ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \ + vxc_float4 prodValue = {1.0f, 1.0f, 1.0f, 1.0f}; \ + vxc_float4 tmpProdLo, tmpProdHi, tmpProd; \ + src_type img_val; \ + copy_type val0; \ + REDUCEPROD_PROCESS_AXIS0(VXC_ReadImage2DArray, IN_SCALE, IN_OFFSET); \ + REDUCEPROD_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, VXC_WriteImage); \ +} + +REDUCEPROD_AXIS0(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8, CONV, 1, 0, 1, 0) +REDUCEPROD_AXIS0(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8, CONV_SAT_RTE, outputScale, 0, 1, 0) +REDUCEPROD_AXIS0(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, CONV_SAT_RTE, outputScale, 0, 1, 0) +REDUCEPROD_AXIS0(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ + CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0) +REDUCEPROD_AXIS0(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8, CONV, 1, 0, inputScale, 0) +REDUCEPROD_AXIS0(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8, CONV, 1, 0, inputScale, 0) +REDUCEPROD_AXIS0(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, input_offset_asymmetric) + +REDUCEPROD_AXIS0(I16, I16, vxc_short8, vxc_short8, short4,\ +vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0) +REDUCEPROD_AXIS0(I8, I8, vxc_char16, vxc_char16, char4,\ +vxc_char8, CONV_SAT_RTE, outputScale, 0, inputScale, 0) +REDUCEPROD_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric) + +#define REDUCEPROD_AXIS0_2D(src_name, dst_name, src_type, copy_type,\ + dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET) \ +__kernel void reduceprod_axis0_##src_name##to##dst_name##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(0, get_global_id(0)); \ + int2 coord_out = (int2)(get_global_id(0), 0); \ + vxc_float4 prodValue = {1.0f, 1.0f, 1.0f, 1.0f}; \ + vxc_float4 tmpProdLo, tmpProdHi, tmpProd; \ + src_type img_val; \ + copy_type val0; \ + REDUCEPROD_PROCESS_AXIS0(VXC_ReadImage, IN_SCALE, IN_OFFSET); \ + REDUCEPROD_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, VXC_WriteImage); \ +} + +REDUCEPROD_AXIS0_2D(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8, CONV, 1, 0, 1, 0) +REDUCEPROD_AXIS0_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8, CONV_SAT_RTE, outputScale, 0, 1, 0) +REDUCEPROD_AXIS0_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, CONV_SAT_RTE, outputScale, 0, 1, 0) +REDUCEPROD_AXIS0_2D(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ + CONV_SAT_RTE, outputScale, 
output_offset_asymmetric, 1, 0) +REDUCEPROD_AXIS0_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8, CONV, 1, 0, inputScale, 0) +REDUCEPROD_AXIS0_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8, CONV, 1, 0, inputScale, 0) +REDUCEPROD_AXIS0_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, \ +vxc_short8, CONV, 1, 0, inputScale, input_offset_asymmetric) +REDUCEPROD_AXIS0_2D(I16, I16, vxc_short8, vxc_short8, short4,\ +vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0) +REDUCEPROD_AXIS0_2D(I8, I8, vxc_char16, vxc_char16, char4,\ +vxc_char8, CONV_SAT_RTE, outputScale, 0, inputScale, 0) +REDUCEPROD_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric) + + +#define REDUCEPROD_PROCESS_AXIS0_BF16(read_fun) \ + while(coord.x < inputWidth) \ + { \ + read_fun(img_val, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(val0, img_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, tmpProdLo, val0, 16); \ + VXC_DP2x8(val0, img_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, tmpProdHi, val0, 16); \ + tmpProd = tmpProdLo * tmpProdHi; \ + prodValue = prodValue * tmpProd; \ + coord.x += 8; \ + } \ + vxc_ushort8 tmpProdInt0, tmpProdInt1; \ + vxc_ushort8 tmpOnesInt = {0, 16256, 0, 16256, 0, 16256, 0, 16256}; \ + read_fun(img_val, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(val0, img_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, tmpProdLo, val0, 16); \ + VXC_DP2x8(val0, img_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, tmpProdHi, val0, 16); \ + _viv_asm(COPY, tmpProdInt0, tmpProdLo, 16); \ + _viv_asm(COPY, tmpProdInt1, tmpProdHi, 16); \ + VXC_DP2x8(tmpProdInt0, tmpProdInt0, tmpOnesInt,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetEndLoData_2x8); \ + VXC_DP2x8(tmpProdInt1, tmpProdInt1, tmpOnesInt,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetEndHiData_2x8); \ + _viv_asm(COPY, tmpProdLo, tmpProdInt0, 16); \ + _viv_asm(COPY, tmpProdHi, tmpProdInt1, 16); \ + tmpProd = tmpProdLo * tmpProdHi; \ + prodValue = prodValue * tmpProd; \ + tmpProd.xy = prodValue.xy * prodValue.zw; \ + prodValue.x = tmpProd.x * tmpProd.y; + +#define REDUCEPROD_PROCESS_AXIS0_BF16_SAVE(write_fun) \ + vxc_ushort8 dst; \ + _viv_asm(COPY, dst, prodValue, 16); \ + dst.s0 = dst.s1; \ + write_fun(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + +__kernel void reduceprod_axis0_BF16toBF16 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axisVal + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + vxc_float4 prodValue = {1.0f, 1.0f, 1.0f, 1.0f}; + vxc_float4 tmpProdLo, tmpProdHi, tmpProd; + vxc_short8 img_val; + vxc_short8 val0; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + REDUCEPROD_PROCESS_AXIS0_BF16(VXC_ReadImage2DArray); + REDUCEPROD_PROCESS_AXIS0_BF16_SAVE(VXC_WriteImage); +} + +__kernel void reduceprod_axis0_BF16toBF16_2D + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axisVal + ) +{ + int2 coord = (int2)(0, get_global_id(0)); + int2 coord_out = (int2)(get_global_id(0), 0); + vxc_float4 
prodValue = {1.0f, 1.0f, 1.0f, 1.0f}; + vxc_float4 tmpProdLo, tmpProdHi, tmpProd; + vxc_short8 img_val; + vxc_short8 val0; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + REDUCEPROD_PROCESS_AXIS0_BF16(VXC_ReadImage); + REDUCEPROD_PROCESS_AXIS0_BF16_SAVE(VXC_WriteImage); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/reduceprod_internal_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/reduceprod_internal_axis1.vx new file mode 100644 index 0000000..5b647f2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/reduceprod_internal_axis1.vx @@ -0,0 +1,177 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float outputScale; +_viv_uniform float output_offset_asymmetric; +_viv_uniform float inputScale; +_viv_uniform float input_offset_asymmetric; +_viv_uniform VXC_512Bits uniGetLoData_4x4; +_viv_uniform VXC_512Bits uniGetHiData_4x4; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; + +_viv_uniform int axisSize; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +#define REDUCEPROD_PROCESS_AXIS1(read_fun, IN_SCALE, IN_OFFSET) \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_DP4x4(tmpProdLo, vec0, vec0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \ + VXC_DP4x4(tmpProdHi, vec0, vec0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \ + tmpProdLo = (tmpProdLo - IN_OFFSET) * IN_SCALE; \ + tmpProdHi = (tmpProdHi - IN_OFFSET) * IN_SCALE; \ + prodValueLo = prodValueLo * tmpProdLo; \ + prodValueHi = prodValueHi * tmpProdHi; \ + coord.y++; \ + } \ + while(coord.y < axisSize); + +#define REDUCEPROD_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, write_fun) \ + dst_type dst0, dst1; \ + save_type vect; \ + prodValueLo = prodValueLo * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst0, prodValueLo); \ + prodValueHi = prodValueHi * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst1, prodValueHi); \ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord_out, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + +#define REDUCEPROD_AXIS1(src_name, dst_name, src_type, copy_type, dst_type, save_type,\ +conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET) \ +__kernel void reduceprod_axis1_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \ + vxc_float4 prodValueLo = {1.0f, 1.0f, 1.0f, 1.0f}; \ + vxc_float4 prodValueHi = {1.0f, 1.0f, 1.0f, 1.0f};\ + vxc_float4 tmpProdLo, tmpProdHi; \ + src_type vec0; \ + copy_type in0; \ + REDUCEPROD_PROCESS_AXIS1(VXC_ReadImage2DArray, IN_SCALE, IN_OFFSET) \ + REDUCEPROD_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, VXC_WriteImage); \ +} + + + +REDUCEPROD_AXIS1(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8, CONV, 1, 0, 1, 0) +REDUCEPROD_AXIS1(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8, CONV_SAT_RTE, outputScale, 0, 1, 0) +REDUCEPROD_AXIS1(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, CONV_SAT_RTE, outputScale, 0, 1, 0) +REDUCEPROD_AXIS1(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0) 
+REDUCEPROD_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8, CONV, 1, 0, inputScale, 0) +REDUCEPROD_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8, CONV, 1, 0, inputScale, 0) +REDUCEPROD_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, \ +vxc_short8, CONV, 1, 0, inputScale, input_offset_asymmetric) +REDUCEPROD_AXIS1(I16, I16, vxc_short8, vxc_short8, short4,\ +vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0) +REDUCEPROD_AXIS1(I8, I8, vxc_char16, vxc_char16, char4,\ +vxc_char8, CONV_SAT_RTE, outputScale, 0, inputScale, 0) +REDUCEPROD_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric) + + +#define REDUCEPROD_AXIS1_2D(src_name, dst_name, src_type, copy_type, dst_type, save_type,\ +conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET) \ +__kernel void reduceprod_axis1_##src_name##to##dst_name##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), 0); \ + int2 coord_out = (int2)(get_global_id(0), 0); \ + vxc_float4 prodValueLo = {1.0f, 1.0f, 1.0f, 1.0f}; \ + vxc_float4 prodValueHi = {1.0f, 1.0f, 1.0f, 1.0f};\ + vxc_float4 tmpProdLo, tmpProdHi; \ + src_type vec0; \ + copy_type in0; \ + REDUCEPROD_PROCESS_AXIS1(VXC_ReadImage, IN_SCALE, IN_OFFSET) \ + REDUCEPROD_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, VXC_WriteImage); \ +} + +REDUCEPROD_AXIS1_2D(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8, CONV, 1, 0, 1, 0) +REDUCEPROD_AXIS1_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8, CONV_SAT_RTE, outputScale, 0, 1, 0) +REDUCEPROD_AXIS1_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, CONV_SAT_RTE, outputScale, 0, 1, 0) +REDUCEPROD_AXIS1_2D(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0) +REDUCEPROD_AXIS1_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8, CONV, 1, 0, inputScale, 0) +REDUCEPROD_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8, CONV, 1, 0, inputScale, 0) +REDUCEPROD_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, input_offset_asymmetric) +REDUCEPROD_AXIS1_2D(I16, I16, vxc_short8, vxc_short8, short4,\ +vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0) +REDUCEPROD_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, char4,\ +vxc_char8, CONV_SAT_RTE, outputScale, 0, inputScale, 0) +REDUCEPROD_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric) + +#define REDUCEPROD_PROCESS_AXIS1_BF16(read_fun) \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(vec0, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, tmpProdLo, vec0, 16); \ + VXC_DP2x8(vec0, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, tmpProdHi, vec0, 16); \ + prodValueLo = prodValueLo * tmpProdLo; \ + prodValueHi = prodValueHi * tmpProdHi; \ + coord.y++; \ + } \ + while(coord.y < axisSize); + + +#define REDUCEPROD_PROCESS_AXIS1_SAVE_BF16(write_fun) \ + vxc_ushort8 dst0, dst1; \ + vxc_ushort8 vect; \ + _viv_asm(COPY, dst0, prodValueLo, 16); \ + _viv_asm(COPY, dst1, prodValueHi, 16); \ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + 
write_fun(output, coord_out, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void reduceprod_axis1_BF16toBF16 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axisVal + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + vxc_float4 prodValueLo = {1.0f, 1.0f, 1.0f, 1.0f}; + vxc_float4 prodValueHi = {1.0f, 1.0f, 1.0f, 1.0f}; + vxc_float4 tmpProdLo, tmpProdHi; + vxc_short8 vec0; + vxc_short8 in0; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + REDUCEPROD_PROCESS_AXIS1_BF16(VXC_ReadImage2DArray) + REDUCEPROD_PROCESS_AXIS1_SAVE_BF16(VXC_WriteImage); +} + +__kernel void reduceprod_axis1_BF16toBF16_2D + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axisVal + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + int2 coord_out = (int2)(get_global_id(0), 0); + vxc_float4 prodValueLo = {1.0f, 1.0f, 1.0f, 1.0f}; + vxc_float4 prodValueHi = {1.0f, 1.0f, 1.0f, 1.0f}; + vxc_float4 tmpProdLo, tmpProdHi; + vxc_short8 vec0; + vxc_short8 in0; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + REDUCEPROD_PROCESS_AXIS1_BF16(VXC_ReadImage) + REDUCEPROD_PROCESS_AXIS1_SAVE_BF16(VXC_WriteImage); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/reduceprod_internal_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/reduceprod_internal_axis2.vx new file mode 100644 index 0000000..376caad --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/reduceprod_internal_axis2.vx @@ -0,0 +1,121 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float outputScale; +_viv_uniform float output_offset_asymmetric; +_viv_uniform float inputScale; +_viv_uniform float input_offset_asymmetric; +_viv_uniform VXC_512Bits uniGetLoData_4x4; +_viv_uniform VXC_512Bits uniGetHiData_4x4; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; + +_viv_uniform int axisSize; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +#define REDUCEPROD_PROCESS_AXIS2(read_fun, IN_SCALE, IN_OFFSET) \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_DP4x4(tmpProdLo, vec0, vec0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \ + VXC_DP4x4(tmpProdHi, vec0, vec0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \ + tmpProdLo = (tmpProdLo - IN_OFFSET) * IN_SCALE; \ + tmpProdHi = (tmpProdHi - IN_OFFSET) * IN_SCALE; \ + prodValueLo = prodValueLo * tmpProdLo; \ + prodValueHi = prodValueHi * tmpProdHi; \ + coord.z++; \ + } \ + while(coord.z < axisSize); + + +#define REDUCEPROD_PROCESS_AXIS2_SAVE(dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, write_fun) \ + dst_type dst0, dst1; \ + save_type vect; \ + prodValueLo = prodValueLo * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst0, prodValueLo); \ + prodValueHi = prodValueHi * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst1, prodValueHi); \ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord.xy, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + +#define REDUCEPROD_AXIS2(src_name, dst_name, src_type, copy_type, dst_type, save_type,\ +conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET) \ +__kernel void reduceprod_axis2_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + 
__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + vxc_float4 prodValueLo = {1.0f, 1.0f, 1.0f, 1.0f}; \ + vxc_float4 prodValueHi = {1.0f, 1.0f, 1.0f, 1.0f};\ + vxc_float4 tmpProdLo, tmpProdHi; \ + src_type vec0; \ + copy_type in0; \ + REDUCEPROD_PROCESS_AXIS2(VXC_ReadImage2DArray, IN_SCALE, IN_OFFSET) \ + REDUCEPROD_PROCESS_AXIS2_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, VXC_WriteImage); \ +} + + + +REDUCEPROD_AXIS2(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8, CONV, 1, 0, 1, 0) +REDUCEPROD_AXIS2(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8, CONV_SAT_RTE, outputScale, 0, 1, 0) +REDUCEPROD_AXIS2(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, CONV_SAT_RTE, outputScale, 0, 1, 0) +REDUCEPROD_AXIS2(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0) +REDUCEPROD_AXIS2(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8, CONV, 1, 0, inputScale, 0) +REDUCEPROD_AXIS2(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8, CONV, 1, 0, inputScale, 0) +REDUCEPROD_AXIS2(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\ +CONV, 1, 0, inputScale, input_offset_asymmetric) + +REDUCEPROD_AXIS2(I16, I16, vxc_short8, vxc_short8, short4,\ +vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0) +REDUCEPROD_AXIS2(I8, I8, vxc_char16, vxc_char16, char4,\ +vxc_char8, CONV_SAT_RTE, outputScale, 0, inputScale, 0) +REDUCEPROD_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric) + +#define REDUCEPROD_PROCESS_AXIS2_BF16(read_fun) \ + do \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(vec0, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, tmpProdLo, vec0, 16); \ + VXC_DP2x8(vec0, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, tmpProdHi, vec0, 16); \ + prodValueLo = prodValueLo * tmpProdLo; \ + prodValueHi = prodValueHi * tmpProdHi; \ + coord.z++; \ + } \ + while(coord.z < axisSize); + + +#define REDUCEPROD_PROCESS_AXIS2_SAVE_BF16(write_fun) \ + vxc_ushort8 dst0, dst1; \ + vxc_ushort8 vect; \ + _viv_asm(COPY, dst0, prodValueLo, 16); \ + _viv_asm(COPY, dst1, prodValueHi, 16); \ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + write_fun(output, coord.xy, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void reduceprod_axis2_BF16toBF16 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axisVal + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + vxc_float4 prodValueLo = {1.0f, 1.0f, 1.0f, 1.0f}; + vxc_float4 prodValueHi = {1.0f, 1.0f, 1.0f, 1.0f}; + vxc_float4 tmpProdLo, tmpProdHi; + vxc_short8 vec0; + vxc_short8 in0; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + REDUCEPROD_PROCESS_AXIS2_BF16(VXC_ReadImage2DArray) + REDUCEPROD_PROCESS_AXIS2_SAVE_BF16(VXC_WriteImage); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_2d.vx new file mode 100644 index 0000000..64018e7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_2d.vx @@ -0,0 +1,114 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float input0Scale; +_viv_uniform float input0Tail; 
+_viv_uniform float input1Scale; +_viv_uniform float input1Tail; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4; +_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; + +#define COMPARISONS_2D(func_name, src0_type_name, src1_type_name, \ + src0_type, src0_copy_type, src1_type, src1_copy_type, cmp_op) \ +__kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_2D( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + src0_type src0; \ + src0_copy_type srcA; \ + src0_type src1; \ + src0_copy_type srcB; \ + VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, srcA, src0, 16); \ + VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, srcB, src1, 16); \ + \ + float4 vecA0, vecA1; \ + float4 vecB0, vecB1; \ + VXC_DP4x4(vecA0, srcA, srcA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \ + VXC_DP4x4(vecA1, srcA, srcA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \ + vecA0 = vecA0 * input0Scale + input0Tail; \ + vecA1 = vecA1 * input0Scale + input0Tail; \ + VXC_DP4x4(vecB0, srcB, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \ + VXC_DP4x4(vecB1, srcB, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \ + vecB0 = vecB0 * input1Scale + input1Tail; \ + vecB1 = vecB1 * input1Scale + input1Tail; \ + int4 dst0, dst1; \ + dst0 = (vecA0)cmp_op(vecB0); \ + dst1 = (vecA1)cmp_op(vecB1); \ + \ + vxc_char16 dst; \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + dst &= 1; \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +//LESS +COMPARISONS_2D(less, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, <) +COMPARISONS_2D(less, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, <) +COMPARISONS_2D(less, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, <) +COMPARISONS_2D(less, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, <) +COMPARISONS_2D(less, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, <) +COMPARISONS_2D(less, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, <) +COMPARISONS_2D(less, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, <) +COMPARISONS_2D(less, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, <) +COMPARISONS_2D(less, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, <) +COMPARISONS_2D(less, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, <) +//GREAT +COMPARISONS_2D(great, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, >) +COMPARISONS_2D(great, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, >) +COMPARISONS_2D(great, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, >) +COMPARISONS_2D(great, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, >) +COMPARISONS_2D(great, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, >) +COMPARISONS_2D(great, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, >) +COMPARISONS_2D(great, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, >) +COMPARISONS_2D(great, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, >) +COMPARISONS_2D(great, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, >) +COMPARISONS_2D(great, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, >) 
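+// The COMPARISONS_2D kernels above all follow the same per-element pattern:
+// each operand is converted to float with its per-input affine map
+// (value * inputScale + inputTail), the scalar comparison operator is applied,
+// and the integer results are packed into 0/1 BOOL8 bytes via
+// uniExtract8Data_2x8 followed by `dst &= 1`.
+// A minimal scalar sketch of that per-element logic, for reference only
+// (the helper name and parameters are illustrative and not part of this
+// kernel library):
+//
+//   uchar cmp_less_u8(uchar a_q, float in0_scale, float in0_tail,
+//                     uchar b_q, float in1_scale, float in1_tail)
+//   {
+//       float a = a_q * in0_scale + in0_tail;   /* map operand A to float */
+//       float b = b_q * in1_scale + in1_tail;   /* map operand B to float */
+//       return (a < b) ? 1 : 0;                 /* BOOL8 result is 0 or 1 */
+//   }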
+//LESS_EQUAL +COMPARISONS_2D(less_equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, <=) +COMPARISONS_2D(less_equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, <=) +COMPARISONS_2D(less_equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, <=) +COMPARISONS_2D(less_equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, <=) +COMPARISONS_2D(less_equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, <=) +COMPARISONS_2D(less_equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, <=) +COMPARISONS_2D(less_equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, <=) +COMPARISONS_2D(less_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, <=) +COMPARISONS_2D(less_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, <=) +COMPARISONS_2D(less_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, <=) +//GREAT_EQUAL +COMPARISONS_2D(great_equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, >=) +COMPARISONS_2D(great_equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, >=) +COMPARISONS_2D(great_equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, >=) +COMPARISONS_2D(great_equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, >=) +COMPARISONS_2D(great_equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, >=) +COMPARISONS_2D(great_equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, >=) +COMPARISONS_2D(great_equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, >=) +COMPARISONS_2D(great_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, >=) +COMPARISONS_2D(great_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, >=) +COMPARISONS_2D(great_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, >=) +//EQUAL +COMPARISONS_2D(equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, ==) +COMPARISONS_2D(equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, ==) +COMPARISONS_2D(equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, ==) +COMPARISONS_2D(equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, ==) +COMPARISONS_2D(equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ==) +COMPARISONS_2D(equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, ==) +COMPARISONS_2D(equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, ==) +COMPARISONS_2D(equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, ==) +COMPARISONS_2D(equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ==) +COMPARISONS_2D(equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, ==) +//NOT_EQUAL +COMPARISONS_2D(not_equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, !=) +COMPARISONS_2D(not_equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, !=) +COMPARISONS_2D(not_equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, !=) +COMPARISONS_2D(not_equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, !=) +COMPARISONS_2D(not_equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, !=) +COMPARISONS_2D(not_equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, !=) +COMPARISONS_2D(not_equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, !=) +COMPARISONS_2D(not_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, !=) +COMPARISONS_2D(not_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, !=) +COMPARISONS_2D(not_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, !=) + diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_3d.vx new file mode 100644 index 0000000..0fcc274 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_3d.vx @@ -0,0 +1,114 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float input0Scale; +_viv_uniform float input0Tail; +_viv_uniform float input1Scale; +_viv_uniform float input1Tail; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4; +_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; + +#define COMPARISONS_3D(func_name, src0_type_name, src1_type_name, \ + src0_type, src0_copy_type, src1_type, src1_copy_type, cmp_op) \ +__kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + src0_type src0; \ + src0_copy_type srcA; \ + src1_type src1; \ + src1_copy_type srcB; \ + VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, srcA, src0, 16); \ + VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, srcB, src1, 16); \ + \ + float4 vecA0, vecA1, vecA; \ + float4 vecB0, vecB1, vecB; \ + VXC_DP4x4(vecA0, srcA, srcA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \ + VXC_DP4x4(vecA1, srcA, srcA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \ + vecA0 = vecA0 * input0Scale + input0Tail; \ + vecA1 = vecA1 * input0Scale + input0Tail; \ + VXC_DP4x4(vecB0, srcB, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \ + VXC_DP4x4(vecB1, srcB, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \ + vecB0 = vecB0 * input1Scale + input1Tail; \ + vecB1 = vecB1 * input1Scale + input1Tail; \ + int4 dst0, dst1; \ + dst0 = (vecA0)cmp_op(vecB0); \ + dst1 = (vecA1)cmp_op(vecB1); \ + \ + vxc_char16 dst; \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + dst &= 1; \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +//LESS +COMPARISONS_3D(less, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, <) +COMPARISONS_3D(less, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, <) +COMPARISONS_3D(less, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, <) +COMPARISONS_3D(less, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, <) +COMPARISONS_3D(less, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, <) +COMPARISONS_3D(less, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, <) +COMPARISONS_3D(less, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, <) +COMPARISONS_3D(less, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, <) +COMPARISONS_3D(less, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, <) +COMPARISONS_3D(less, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, <) +//GREAT +COMPARISONS_3D(great, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, >) +COMPARISONS_3D(great, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, >) +COMPARISONS_3D(great, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, >) +COMPARISONS_3D(great, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, >) +COMPARISONS_3D(great, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, 
>) +COMPARISONS_3D(great, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, >) +COMPARISONS_3D(great, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, >) +COMPARISONS_3D(great, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, >) +COMPARISONS_3D(great, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, >) +COMPARISONS_3D(great, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, >) +//LESS_EQUAL +COMPARISONS_3D(less_equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, <=) +COMPARISONS_3D(less_equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, <=) +COMPARISONS_3D(less_equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, <=) +COMPARISONS_3D(less_equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, <=) +COMPARISONS_3D(less_equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, <=) +COMPARISONS_3D(less_equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, <=) +COMPARISONS_3D(less_equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, <=) +COMPARISONS_3D(less_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, <=) +COMPARISONS_3D(less_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, <=) +COMPARISONS_3D(less_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, <=) +//GREAT_EQUAL +COMPARISONS_3D(great_equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, >=) +COMPARISONS_3D(great_equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, >=) +COMPARISONS_3D(great_equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, >=) +COMPARISONS_3D(great_equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, >=) +COMPARISONS_3D(great_equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, >=) +COMPARISONS_3D(great_equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, >=) +COMPARISONS_3D(great_equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, >=) +COMPARISONS_3D(great_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, >=) +COMPARISONS_3D(great_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, >=) +COMPARISONS_3D(great_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, >=) +//EQUAL +COMPARISONS_3D(equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, ==) +COMPARISONS_3D(equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, ==) +COMPARISONS_3D(equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, ==) +COMPARISONS_3D(equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, ==) +COMPARISONS_3D(equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ==) +COMPARISONS_3D(equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, ==) +COMPARISONS_3D(equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, ==) +COMPARISONS_3D(equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, ==) +COMPARISONS_3D(equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ==) +COMPARISONS_3D(equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, ==) +//NOT_EQUAL +COMPARISONS_3D(not_equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, !=) +COMPARISONS_3D(not_equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, !=) +COMPARISONS_3D(not_equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, !=) +COMPARISONS_3D(not_equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, !=) +COMPARISONS_3D(not_equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, !=) +COMPARISONS_3D(not_equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, 
vxc_half8, !=) +COMPARISONS_3D(not_equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, !=) +COMPARISONS_3D(not_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, !=) +COMPARISONS_3D(not_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, !=) +COMPARISONS_3D(not_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, !=) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/relu_keras.vx b/src/tim/vx/internal/src/libnnext/ops/vx/relu_keras.vx new file mode 100644 index 0000000..02e7ce5 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/relu_keras.vx @@ -0,0 +1,235 @@ + +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvFP16toFP32_Lo_4x4; +_viv_uniform VXC_512Bits uniConvFP16toFP32_Hi_4x4; +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; +_viv_uniform VXC_512Bits uniExtractInteger_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniPackedBF16_2x8; +_viv_uniform VXC_512Bits uniConvIntegertoFP32_Lo_4x4; +_viv_uniform VXC_512Bits uniConvIntegertoFP32_Hi_4x4; +_viv_uniform float offset; +_viv_uniform float input_scale; +_viv_uniform float inputTail; +_viv_uniform float output_scale; +_viv_uniform float outputZP; + +float4 I8toF32_Lo(vxc_char8 src) +{ + float4 dst; + + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvIntegertoFP32_Lo_4x4); + dst *= input_scale; + return dst; +} + +float4 I8toF32_Hi(vxc_char8 src) +{ + float4 dst; + + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvIntegertoFP32_Hi_4x4); + dst *= input_scale; + return dst; +} + +float4 U8toF32_Lo(vxc_uchar8 src) +{ + float4 dst; + + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvIntegertoFP32_Lo_4x4); + dst = dst * input_scale + inputTail; + return dst; +} + +float4 U8toF32_Hi(vxc_uchar8 src) +{ + float4 dst; + + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvIntegertoFP32_Hi_4x4); + dst = dst * input_scale + inputTail; + return dst; +} + +float4 I16toF32_Lo(vxc_short8 src) +{ + float4 dst; + + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvIntegertoFP32_Lo_4x4); + dst *= input_scale; + return dst; +} + +float4 I16toF32_Hi(vxc_short8 src) +{ + float4 dst; + + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvIntegertoFP32_Hi_4x4); + dst *= input_scale; + return dst; +} + +float4 F16toF32_Lo(vxc_ushort8 src) +{ + vxc_half8 srcHalf; + float4 dst; + + _viv_asm(COPY, srcHalf, src, 16); + VXC_DP4x4(dst, srcHalf, srcHalf, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvFP16toFP32_Lo_4x4); + return dst; +} + +float4 F16toF32_Hi(vxc_ushort8 src) +{ + vxc_half8 srcHalf; + float4 dst; + + _viv_asm(COPY, srcHalf, src, 16); + VXC_DP4x4(dst, srcHalf, srcHalf, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvFP16toFP32_Hi_4x4); + return dst; +} + +float4 BF16toF32_Lo(vxc_ushort8 src) +{ + vxc_ushort8 srcA; + float4 dst; + + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_DP2x8(srcA, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, dst, srcA, 16); + + return dst; +} + +float4 BF16toF32_Hi(vxc_ushort8 src) +{ + vxc_ushort8 srcA; + float4 dst; + + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_DP2x8(srcA, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, dst, srcA, 16); + + return dst; 
+} + +vxc_ushort8 F32toF16(float4 src0, float4 src1) +{ + half4 srcHalf0, srcHalf1; + vxc_half8 dst0; + vxc_ushort8 dst; + + _viv_asm(CONV, srcHalf0, src0); + _viv_asm(CONV, srcHalf1, src1); + + VXC_DP2x8(dst0, srcHalf0, srcHalf1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); + _viv_asm(COPY, dst, dst0, 16); + return dst; +} + +vxc_ushort8 F32toBF16(float4 src0, float4 src1) +{ + vxc_ushort8 srcA, srcB; + vxc_ushort8 dst; + + _viv_asm(COPY, srcA, src0, 16); + _viv_asm(COPY, srcB, src1, 16); + VXC_DP2x8(dst, srcA, srcB, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackedBF16_2x8); + return dst; +} + +vxc_char8 F32toI8(float4 src0, float4 src1) +{ + int4 srcInt0, srcInt1; + vxc_char8 dst; + + src0 *= output_scale; + src1 *= output_scale; + _viv_asm(CONV_RTE, srcInt0, src0); + _viv_asm(CONV_RTE, srcInt1, src1); + + VXC_DP2x8(dst, srcInt0, srcInt1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractInteger_2x8); + return dst; +} + +vxc_short8 F32toI16(float4 src0, float4 src1) +{ + int4 srcInt0, srcInt1; + vxc_short8 dst; + + src0 *= output_scale; + src1 *= output_scale; + _viv_asm(CONV_RTE, srcInt0, src0); + _viv_asm(CONV_RTE, srcInt1, src1); + + VXC_DP2x8(dst, srcInt0, srcInt1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractInteger_2x8); + return dst; +} + +vxc_uchar8 F32toU8(float4 src0, float4 src1) +{ + int4 srcInt0, srcInt1; + vxc_uchar8 dst; + + src0 = src0 * output_scale + outputZP; + src1 = src1 * output_scale + outputZP; + _viv_asm(CONV_RTE, srcInt0, src0); + _viv_asm(CONV_RTE, srcInt1, src1); + + VXC_DP2x8(dst, srcInt0, srcInt1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractInteger_2x8); + return dst; +} + + +#define TENSOR_KERAS_RELU(src_type_name, dst_type_name, tensor_dims, image_type, \ + convert2FP32_Func, convert2DstType_Func, src_type, dst_type) \ +__kernel void relu_keras_##src_type_name##to##dst_type_name##tensor_dims( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + float alpha, \ + float max_value, \ + float threshold \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + src_type src; \ + VXC_Read##image_type(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + float4 dataA = convert2FP32_Func##_Lo(src); \ + float4 dataB = convert2FP32_Func##_Hi(src); \ + float4 dataA0 = dataA < threshold ? threshold : dataA; \ + dataA0 = dataA0 > max_value ? max_value : dataA0; \ + float4 dataB0 = dataB < threshold ? threshold : dataB; \ + dataB0 = dataB0 > max_value ? max_value : dataB0; \ + float4 dataA1 = dataA * alpha - offset; \ + float4 dataB1 = dataB * alpha - offset; \ + float4 dst0 = dataA < threshold ? dataA1 : dataA0; \ + float4 dst1 = dataB < threshold ? 
dataB1 : dataB0; \ + dst_type result = convert2DstType_Func(dst0, dst1); \ + VXC_Write##image_type(output, coord, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} + +TENSOR_KERAS_RELU(F16, F16, _3D, Image2DArray, F16toF32, F32toF16, vxc_ushort8, vxc_ushort8) +TENSOR_KERAS_RELU(F16, I16, _3D, Image2DArray, F16toF32, F32toI16, vxc_ushort8, vxc_short8) +TENSOR_KERAS_RELU(F16, I8, _3D, Image2DArray, F16toF32, F32toI8, vxc_ushort8, vxc_char8) +TENSOR_KERAS_RELU(F16, U8, _3D, Image2DArray, F16toF32, F32toU8, vxc_ushort8, vxc_uchar8) +TENSOR_KERAS_RELU(BF16, BF16, _3D, Image2DArray, BF16toF32, F32toBF16, vxc_ushort8, vxc_ushort8) + +TENSOR_KERAS_RELU(I16, I16, _3D, Image2DArray, I16toF32, F32toI16, vxc_short8, vxc_short8) +TENSOR_KERAS_RELU(I16, F16, _3D, Image2DArray, I16toF32, F32toF16, vxc_short8, vxc_ushort8) +TENSOR_KERAS_RELU(I8, I8, _3D, Image2DArray, I8toF32, F32toI8, vxc_char8, vxc_char8) +TENSOR_KERAS_RELU(I8, F16, _3D, Image2DArray, I8toF32, F32toF16, vxc_char8, vxc_ushort8) +TENSOR_KERAS_RELU(U8, U8, _3D, Image2DArray, U8toF32, F32toU8, vxc_uchar8, vxc_uchar8) +TENSOR_KERAS_RELU(U8, F16, _3D, Image2DArray, U8toF32, F32toF16, vxc_uchar8, vxc_ushort8) + +TENSOR_KERAS_RELU(F16, F16, _2D, Image, F16toF32, F32toF16, vxc_ushort8, vxc_ushort8) +TENSOR_KERAS_RELU(F16, I16, _2D, Image, F16toF32, F32toI16, vxc_ushort8, vxc_short8) +TENSOR_KERAS_RELU(F16, I8, _2D, Image, F16toF32, F32toI8, vxc_ushort8, vxc_char8) +TENSOR_KERAS_RELU(F16, U8, _2D, Image, F16toF32, F32toU8, vxc_ushort8, vxc_uchar8) +TENSOR_KERAS_RELU(BF16, BF16, _2D, Image, BF16toF32, F32toBF16, vxc_ushort8, vxc_ushort8) +TENSOR_KERAS_RELU(I16, I16, _2D, Image, I16toF32, F32toI16, vxc_short8, vxc_short8) +TENSOR_KERAS_RELU(I16, F16, _2D, Image, I16toF32, F32toF16, vxc_short8, vxc_ushort8) +TENSOR_KERAS_RELU(I8, I8, _2D, Image, I8toF32, F32toI8, vxc_char8, vxc_char8) +TENSOR_KERAS_RELU(I8, F16, _2D, Image, I8toF32, F32toF16, vxc_char8, vxc_ushort8) +TENSOR_KERAS_RELU(U8, U8, _2D, Image, U8toF32, F32toU8, vxc_uchar8, vxc_uchar8) +TENSOR_KERAS_RELU(U8, F16, _2D, Image, U8toF32, F32toF16, vxc_uchar8, vxc_ushort8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_BF16.vx new file mode 100644 index 0000000..cd56af5 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_BF16.vx @@ -0,0 +1,182 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 scale_xy; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8; +_viv_uniform VXC_512Bits uniGetMaskShift_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_odd_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_even_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform float half_pixel_value; + +__kernel void resize_bilinear_BF16toBF16_DOWN + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = 
convert_int(top_y_f); + int bottom_y_idx = top_y_idx + 1; + vxc_short8 top; + vxc_short8 bottom; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(top, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_ReadImage2DArray(top, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_ReadImage2DArray(top, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_ReadImage2DArray(top, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + coord_in.y = bottom_y_idx; + coord_in.x = left_x_idx.x; + VXC_ReadImage2DArray(bottom, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_ReadImage2DArray(bottom, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_ReadImage2DArray(bottom, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_ReadImage2DArray(bottom, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 src; + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + float4 dst4; + + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); + _viv_asm(COPY, right4, src, 16); + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); + _viv_asm(COPY, left4, src, 16); + right4 -= left4; + top4 = right4 * x_lerp + left4; + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); + _viv_asm(COPY, right4, src, 16); + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); + _viv_asm(COPY, left4, src, 16); + right4 -= left4; + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + dst4 = bottom4 * y_lerp + top4; + vxc_ushort8 tmp, dst; + _viv_asm(COPY, tmp, dst4, 16); + dst.s0123 = tmp.s1357; + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_bilinear_BF16toBF16_UP + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + float bottom_y_f = ceil(in_y); + int bottom_y_idx= convert_int(bottom_y_f); + vxc_ushort8 src0, src1, src2, src3, dst0, dst1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(src0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, \ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + coord_in.y = bottom_y_idx; + VXC_ReadImage2DArray(src2, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord_in, \ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 bitextract_p0; + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + vxc_ushort8 constData = 16; + VXC_DP2x8(maskShift, bitextract_p0, constData, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + + do + { + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + coord_in.z ++; + coord_in.y = top_y_idx; + VXC_ReadImage2DArray(src0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, \ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + coord_in.y = bottom_y_idx; + VXC_ReadImage2DArray(src2, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord_in, \ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 dst_tmp; + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, left4, dst_tmp, 16); + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, right4, dst_tmp, 16); + right4 -= left4; + top4 = right4 * x_lerp + left4; + + VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, left4, dst_tmp, 16); + VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, right4, dst_tmp, 16); + right4 -= left4; + bottom4 = right4 * x_lerp + left4; + + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + + vxc_ushort8 tmp, dst; + _viv_asm(COPY, tmp, dst4, 16); + dst.s0123 = tmp.s1357; + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.z ++; + } while (coord_in.z < depth); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx new file mode 100644 index 0000000..f910d21 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx @@ -0,0 +1,305 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform VXC_512Bits uniFp16toFp32_4x4; +_viv_uniform VXC_512Bits uniRightSubLeft_4x4; +_viv_uniform VXC_512Bits uniExtactHalf8_2x8; +_viv_uniform float2 scale_xy; +_viv_uniform int depth; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8; +_viv_uniform VXC_512Bits uniGetMaskShift_2x8; +_viv_uniform float half_pixel_value; + +__kernel void resize_bilinear_F16toF16_DOWN + ( + __read_only image2d_array_t input, + 
__write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + float bottom_y_f = ceil(in_y); + int bottom_y_idx= convert_int(bottom_y_f); + vxc_short8 top_left0, top_right0; + vxc_short8 bottom_left0, bottom_right0; + vxc_half8 top_left, top_right; + vxc_half8 bottom_left, bottom_right; + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(top_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_ReadImage2DArray(top_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_ReadImage2DArray(top_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_ReadImage2DArray(top_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top_left, top_left0, 16); + + coord_in.x = right_x_idx.x; + VXC_ReadImage2DArray(top_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.y; + VXC_ReadImage2DArray(top_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.z; + VXC_ReadImage2DArray(top_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.w; + VXC_ReadImage2DArray(top_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top_right, top_right0, 16); + + coord_in.y = bottom_y_idx; + coord_in.x = left_x_idx.x; + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, bottom_left, bottom_left0, 16); + + coord_in.x = right_x_idx.x; + VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.y; + VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.z; + VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.w; + 
VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, bottom_right, bottom_right0, 16); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); + VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); + VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + half4 tmp; + _viv_asm(CONV, tmp, dst4); + VXC_DP2x8(top_left, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, top_left0, top_left, 16); + VXC_WriteImage2DArray(output, coord_out, top_left0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_bilinear_F16toU8_DOWN + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + float bottom_y_f = ceil(in_y); + int bottom_y_idx= convert_int(bottom_y_f); + vxc_short8 top_left0, top_right0; + vxc_short8 bottom_left0, bottom_right0; + vxc_half8 top_left, top_right; + vxc_half8 bottom_left, bottom_right; + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(top_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_ReadImage2DArray(top_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_ReadImage2DArray(top_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_ReadImage2DArray(top_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top_left, top_left0, 16); + + coord_in.x = right_x_idx.x; + VXC_ReadImage2DArray(top_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.y; + VXC_ReadImage2DArray(top_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.z; + VXC_ReadImage2DArray(top_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.w; + VXC_ReadImage2DArray(top_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top_right, top_right0, 16); + + coord_in.y = bottom_y_idx; + 
coord_in.x = left_x_idx.x; + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, bottom_left, bottom_left0, 16); + + coord_in.x = right_x_idx.x; + VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.y; + VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.z; + VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.w; + VXC_ReadImage2DArray(bottom_right0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, bottom_right, bottom_right0, 16); + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); + VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); + VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + dst4 = dst4 * uint8Scale + output_ZP; + int4 dst = convert_int4_rte(dst4); + vxc_uchar8 dst_uchar; + VXC_DP2x8(dst_uchar, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_WriteImage2DArray(output, coord_out, dst_uchar, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_bilinear_F16toF16_UP + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + float bottom_y_f = ceil(in_y); + int bottom_y_idx= convert_int(bottom_y_f); + + vxc_ushort8 src0, src1, src2, src3, dst0, dst1; + vxc_half8 top; + vxc_half8 bottom; + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(src0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, \ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); + + coord_in.y = bottom_y_idx; + VXC_ReadImage2DArray(src2, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord_in, \ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 bitextract_p0; + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + vxc_ushort8 constData = 16; + VXC_DP2x8(maskShift, bitextract_p0, constData, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + + do + { + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, dst0, 16); + _viv_asm(COPY, bottom, dst1, 16); + coord_in.z ++; + coord_in.y = top_y_idx; + VXC_ReadImage2DArray(src0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, \ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y = bottom_y_idx; + VXC_ReadImage2DArray(src2, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord_in, \ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + top4 = right4 * x_lerp + left4; + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + half4 tmp; + _viv_asm(CONV, tmp, dst4); + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst0, top, 16); + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.z ++; + } while (coord_in.z < depth); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx new file mode 100644 index 0000000..7f5b21f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx @@ -0,0 +1,227 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform float2 scale_xy; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8; +_viv_uniform VXC_512Bits uniGetMaskShift_2x8; +_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4; +_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4; +_viv_uniform float dfpScale; +_viv_uniform float half_pixel_value; + +__kernel void resize_bilinear_I16toI16_UP + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + + float4 left_x_f = floor(in_x); + float4 
x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + float bottom_y_f = ceil(in_y); + int bottom_y_idx= convert_int(bottom_y_f); + + vxc_ushort8 src0, src1, src2, src3, dst0, dst1; + + vxc_short8 top; + vxc_short8 bottom; + + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(src0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, \ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + coord_in.y = bottom_y_idx; + VXC_ReadImage2DArray(src2, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord_in, \ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 bitextract_p0; + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + vxc_ushort8 constData = 16; + VXC_DP2x8(maskShift, bitextract_p0, constData, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + + do + { + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, dst0, 16); + _viv_asm(COPY, bottom, dst1, 16); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + coord_in.z ++; + coord_in.y = top_y_idx; + VXC_ReadImage2DArray(src0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, \ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + coord_in.y = bottom_y_idx; + VXC_ReadImage2DArray(src2, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord_in, \ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4); + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + dst4 = dst4 * dfpScale; + int4 dst = convert_int4_rte(dst4); + + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + coord_out.z ++; + } while (coord_in.z < depth); +} + +__kernel void resize_bilinear_I16toI16_DOWN + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + int4 coord_x = coord_out.xxxx + 
(int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + float bottom_y_f = ceil(in_y); + int bottom_y_idx= convert_int(bottom_y_f); + + vxc_short8 top_left, top_right; + vxc_short8 bottom_left, bottom_right; + + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + coord_in.x = right_x_idx.x; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.y; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.z; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.w; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + coord_in.y = bottom_y_idx; + coord_in.x = left_x_idx.x; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + coord_in.x = right_x_idx.x; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.y; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.z; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.w; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + VXC_DP4x4(left4, top_left, top_left, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_DP4x4(right4, top_right, top_right, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + + 
right4 -= left4; + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom_left, bottom_left, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_DP4x4(right4, bottom_right, bottom_right, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + + right4 -= left4; + bottom4 = right4 * x_lerp + left4; + + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + + dst4 = dst4 * dfpScale; + + int4 dst = convert_int4_rte(dst4); + + VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx new file mode 100644 index 0000000..aebf873 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx @@ -0,0 +1,218 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform float2 scale_xy; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8; +_viv_uniform VXC_512Bits uniGetMaskShift_2x8; +_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4; +_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4; +_viv_uniform float dfpScale; +_viv_uniform float half_pixel_value; + +__kernel void resize_bilinear_I8toI8_UP + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + float bottom_y_f = ceil(in_y); + int bottom_y_idx= convert_int(bottom_y_f); + + vxc_uchar16 src0, src1, dst0, dst1; + + vxc_char16 top; + vxc_char16 bottom; + + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(src0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_in.y = bottom_y_idx; + VXC_ReadImage2DArray(src1, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 bitextract_p0; + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + vxc_ushort8 constData = 8; + VXC_DP2x8(maskShift, bitextract_p0, constData, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + + do + { + VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, top, dst0, 16); + _viv_asm(COPY, bottom, dst1, 16); + + coord_in.z ++; + coord_in.y = top_y_idx; + VXC_ReadImage2DArray(src0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.y = bottom_y_idx; + VXC_ReadImage2DArray(src1, input, coord_in, \ + 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + VXC_DP4x4(left4, top, top, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_DP4x4(right4, top, top, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4); + + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4); + + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + dst4 = dst4 * dfpScale; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + coord_out.z ++; + } while (coord_in.z < depth); +} + +__kernel void resize_bilinear_I8toI8_DOWN + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + float bottom_y_f = ceil(in_y); + int bottom_y_idx= convert_int(bottom_y_f); + + vxc_char16 top_left, top_right; + vxc_char16 bottom_left, bottom_right; + + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + coord_in.x = right_x_idx.x; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.y; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.z; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.w; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + coord_in.y = bottom_y_idx; + coord_in.x = left_x_idx.x; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), 
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + coord_in.x = right_x_idx.x; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.y; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.z; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.w; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_DP4x4(right4, top_right, top_right, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + + right4 -= left4; + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom_left, bottom_left, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + VXC_DP4x4(right4, bottom_right, bottom_right, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4); + + right4 -= left4; + bottom4 = right4 * x_lerp + left4; + + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + + dst4 = dst4 * dfpScale; + + int4 dst = convert_int4_rte(dst4); + + VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx new file mode 100644 index 0000000..4c21bd7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx @@ -0,0 +1,323 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform float2 scale_xy; +_viv_uniform int depth; +_viv_uniform int input_ZP; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8; +_viv_uniform VXC_512Bits uniGetMaskShift_2x8; +_viv_uniform float half_pixel_value; + +__kernel void resize_bilinear_U8toF16_DOWN + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + float bottom_y_f = 
ceil(in_y); + int bottom_y_idx= convert_int(bottom_y_f); + vxc_uchar16 top_left, top_right; + vxc_uchar16 bottom_left, bottom_right; + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + coord_in.x = right_x_idx.x; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.y; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.z; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.w; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + coord_in.y = bottom_y_idx; + coord_in.x = left_x_idx.x; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + coord_in.x = right_x_idx.x; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.y; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.z; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.w; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + unsigned char inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); + VXC_DP4x4(right4, top_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); + + right4 -= left4; + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); + VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); + + right4 -= left4; + bottom4 = right4 * x_lerp + left4; + + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + + dst4 *= uint8Scale; + + half4 dst; + _viv_asm(CONV, dst, dst4); + + vxc_short8 dst_short; + 
_viv_asm(COPY, dst_short, dst, 16); + + VXC_WriteImage2DArray(output, coord_out, dst_short.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_bilinear_U8toU8_UP + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + float bottom_y_f = ceil(in_y); + int bottom_y_idx= convert_int(bottom_y_f); + + vxc_uchar16 src0, src1; + + vxc_uchar16 top; + vxc_uchar16 bottom; + + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(src0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_in.y = bottom_y_idx; + VXC_ReadImage2DArray(src1, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 bitextract_p0; + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + vxc_ushort8 constData = 8; + VXC_DP2x8(maskShift, bitextract_p0, constData, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + + do + { + VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + coord_in.z ++; + coord_in.y = top_y_idx; + VXC_ReadImage2DArray(src0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.y = bottom_y_idx; + VXC_ReadImage2DArray(src1, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + unsigned char inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + dst4 = dst4 * uint8Scale + output_ZP; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + coord_out.z ++; + } while (coord_in.z < depth); +} + +__kernel void resize_bilinear_U8toU8_DOWN + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = 
(int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + float bottom_y_f = ceil(in_y); + int bottom_y_idx= convert_int(bottom_y_f); + vxc_uchar16 top_left, top_right; + vxc_uchar16 bottom_left, bottom_right; + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_ReadImage2DArray(top_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + coord_in.x = right_x_idx.x; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.y; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.z; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.w; + VXC_ReadImage2DArray(top_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + coord_in.y = bottom_y_idx; + coord_in.x = left_x_idx.x; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_ReadImage2DArray(bottom_left, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + coord_in.x = right_x_idx.x; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.y; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.z; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = right_x_idx.w; + VXC_ReadImage2DArray(bottom_right, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + float4 left4; + float4 right4; + float4 top4; + float4 bottom4; + + unsigned char inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 
0), uniU8SubZPtoFp32_4x4); + VXC_DP4x4(right4, top_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); + + right4 -= left4; + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); + VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4); + + right4 -= left4; + bottom4 = right4 * x_lerp + left4; + + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + + dst4 = dst4 * uint8Scale + output_ZP; + + int4 dst = convert_int4_rte(dst4); + + VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_UP_2X.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_UP_2X.vx new file mode 100644 index 0000000..25f9350 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_UP_2X.vx @@ -0,0 +1,65 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniResize2xUp_4x8; +_viv_uniform VXC_512Bits uniResize2xUpRound_2x8; +_viv_uniform int out_height; + +__kernel void resize_bilinear_U8toU8_UP_2X_half + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0); + coord_in.x = (coord_out.x * 2 - 1) >> 2; + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x; + + vxc_uchar16 in0, in1, tmp, result; + vxc_ushort8 result_s, round_s = 8; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + while (coord_out.y < out_height) + { + VXC_DP4x8(result_s, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8); + VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x8(result_s, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8); + VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(result_s, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8); + VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, 
in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(result_s, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8); + VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord_in.y += 2; + coord_out.y++; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_opt.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_opt.vx new file mode 100644 index 0000000..640560e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_opt.vx @@ -0,0 +1,95 @@ +#include "cl_viv_vx_ext.h" + +#if (VX_VERSION==2) + +_viv_uniform float2 scale_xy; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8; +_viv_uniform VXC_512Bits uniGetMaskShift_2x8; +_viv_uniform VXC_512Bits uniBilinear_4x4_b; +_viv_uniform float half_pixel_value; + +__kernel void resize_bilinear_U8toU8_UP_opt + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers, + __read_only image2d_array_t scale + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + + float4 left_x_f = floor(in_x); + int4 left_x_idx = convert_int4(left_x_f); + int4 right_x_idx = left_x_idx + 1; + + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + + float top_y_f = floor(in_y); + int top_y_idx = convert_int(top_y_f); + + vxc_uchar16 src0, src1; + + vxc_uchar16 top_bottom; + + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 bitextract_p0; + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + vxc_ushort8 constData = 8; + VXC_DP2x8(maskShift, bitextract_p0, constData, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + + vxc_ushort16 lerp_0; + vxc_half16 lerp; + + int2 coord = (int2)(coord_out.x * 4, coord_out.y); + VXC_ReadImage(lerp_0.hi, scale, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(lerp_0.lo, scale, coord, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, lerp.hi, lerp_0.hi, 16); + _viv_asm(COPY, lerp.lo, lerp_0.lo, 16); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + do + { + VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); + + coord_in.w 
+= input_desc.s4; + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 dst; + VXC_DP4x4_b(dst, lerp.hi, lerp.lo, top_bottom, + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4_b); + + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.w += output_desc.s4; + + coord_out.z ++; + } while (coord_out.z < depth); +} + +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_nearest.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_nearest.vx new file mode 100644 index 0000000..9d2838c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_nearest.vx @@ -0,0 +1,241 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8; +_viv_uniform float2 scale_xy; +_viv_uniform float half_pixel_value; +_viv_uniform float round_value; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp + +#define NEAREST_INDEX_PROCESS() \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); \ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx + round_value; \ + int4 in_x_idx = convert_int4(in_x); \ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y + round_value; \ + int in_y_idx = convert_int(in_y); \ + + +__kernel void resize_nearest_F16toF16 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_short8 src; + int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.y; + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.z; + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.w; + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniGetExtractData_2x8; +__kernel void resize_nearest_F16toF16_op + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_ushort8 src0, src1, dst; + int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(src0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, \ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + //in_x_idx = in_x_idx - in_x_idx.xxxx; + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16); + vxc_ushort8 input_idx; + _viv_asm(COPY, input_idx, in_x_idx, 16); + VXC_DP2x8(mask, input_idx, input_idx, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8); + VXC_BitExtract(dst, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); 
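+    // dst now holds the four nearest-neighbour F16 values: the mask computed above encodes per-pixel extract offsets, letting VXC_BitExtract gather them from the preloaded src0/src1 span instead of issuing four separate image reads.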
+ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniConvertI8toI8_2x8; +__kernel void resize_nearest_I8toI8 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_char16 src; + int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.y; + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.z; + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.w; + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); + VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_nearest_I8toI8_op + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_uchar16 src0, dst0; + vxc_char16 dst; + int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(src0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8); + vxc_ushort8 input_idx; + _viv_asm(COPY, input_idx, in_x_idx, 16); + VXC_DP2x8(mask, input_idx, input_idx, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8); + VXC_BitExtract(dst0, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, dst, dst0, 8); + VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_nearest_U8toU8 + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_uchar16 src; + int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.y; + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.z; + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.w; + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + vxc_ushort8 multiplier; + _viv_asm(COPY, multiplier, multAndoutZP, 16); + VXC_DP2x8(src, src, multiplier, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); + VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_nearest_U8toU8_op + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_uchar16 src0, dst; + int4 coord_in = 
(int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(src0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8); + vxc_ushort8 input_idx; + _viv_asm(COPY, input_idx, in_x_idx, 16); + VXC_DP2x8(mask, input_idx, input_idx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8); + VXC_BitExtract(dst, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + vxc_ushort8 multiplier; + _viv_asm(COPY, multiplier, multAndoutZP, 16); + VXC_DP2x8(dst, dst, multiplier, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_nearest_I16toI16 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_short8 src; + int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); + + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.y; + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.z; + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = in_x_idx.w; + VXC_ReadImage2DArray(src, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); + VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_nearest_I16toI16_op + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + NEAREST_INDEX_PROCESS() + + vxc_ushort8 src0, src1, dst0; + vxc_short8 dst; + int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0); + VXC_ReadImage2DArray(src0, input, coord_in, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, \ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + //in_x_idx = in_x_idx - in_x_idx.xxxx; + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16); + vxc_ushort8 input_idx; + _viv_asm(COPY, input_idx, in_x_idx, 16); + VXC_DP2x8(mask, input_idx, input_idx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8); + VXC_BitExtract(dst0, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, dst, dst0, 8); + VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd.vx new file mode 100644 index 0000000..e02967d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd.vx @@ -0,0 +1,80 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccumulateSum_2x8; +_viv_uniform int index_num; +_viv_uniform int zeropoint; +_viv_uniform int offsetX; +_viv_uniform int offsetY; +_viv_uniform int offsetZ; + +__kernel void scatter_nd_F16toF16( + __read_only 
image2d_t input0, + __read_only image2d_t input1, + image2d_array_t output, + int width, + int area, + int coord_dim + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_half8 sum; + _viv_asm(COPY, sum, tmpVal, 16); + for(int i = 0; i < index_num; i++) + { + int4 indice = read_imagei(input0, (int2)(0, i)); + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ; + if(gidy == idx) + { + vxc_half8 src; + VXC_ReadImage(tmpVal, input1, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src, tmpVal, 16); + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); + } + } + _viv_asm(COPY, tmpVal, sum, 16); + VXC_WriteImage(output, (int2)(gidx, gidy), tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +#define SCATTER_ND_QINT(src0_type_name, data_type) \ +__kernel void scatter_nd_##src0_type_name##to##src0_type_name##( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + image2d_array_t output, \ + int width, \ + int area, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int firstFlg = 1; \ + \ + data_type sum = (data_type)(0, 0, 0, 0, 0, 0, 0, 0); \ + for(int i = 0; i < index_num; i++) \ + { \ + int4 indice = read_imagei(input0, (int2)(0, i)); \ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ; \ + if(gidy == idx) \ + { \ + data_type src; \ + VXC_ReadImage(src, input1, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); \ + if(firstFlg) \ + { \ + firstFlg = 0; \ + } \ + } \ + } \ + if(firstFlg) \ + { \ + sum = (data_type)(zeropoint, zeropoint, zeropoint, zeropoint, \ + zeropoint, zeropoint, zeropoint, zeropoint); \ + } \ + VXC_WriteImage(output, (int2)(gidx, gidy), sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +SCATTER_ND_QINT(U8, vxc_uchar8) +SCATTER_ND_QINT(I8, vxc_char8) +SCATTER_ND_QINT(I16, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_big.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_big.vx new file mode 100644 index 0000000..0a08f33 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_big.vx @@ -0,0 +1,105 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccumulateSum_2x8; +_viv_uniform int index_num; +_viv_uniform int update_width; +_viv_uniform int output_width; +_viv_uniform int zeropoint; + +_viv_uniform int offsetX; +_viv_uniform int offsetY; +_viv_uniform int offsetZ; + +inline uchar* get_image2D_array_ptr(image2d_t input) +{ + int8 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + uchar *src_ptr = (uchar*)desc.s0; + + return src_ptr; +} + +__kernel void scatter_nd_F16toF16_big( + __read_only image2d_t input0, + __read_only image2d_t input1, + image2d_t output, + int width, + int area, + int coord_dim + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_half8 sum; + _viv_asm(COPY, sum, tmpVal, 16); + __global int* index_ptr = (__global int*)get_image2D_array_ptr(input0); + __global short* update_ptr = (__global short*)get_image2D_array_ptr(input1); + __global short* output_ptr = (__global short*)get_image2D_array_ptr(output); + for(int i = 0; i < index_num; i++) + { + int4 indice = vload4(0, index_ptr + i * coord_dim); + + int idx = indice.x * 
offsetX + indice.y * offsetY + indice.z * offsetZ; + if(gidy == idx) + { + vxc_half8 src; + short tmpData = update_ptr[i * update_width + gidx]; + _viv_asm(COPY, src, tmpData, 4); + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); + } + } + short dst; + _viv_asm(COPY, dst, sum, 4); + output_ptr[gidy * output_width+ gidx] = dst; +} + +#define SCATTER_ND_QINT_BIG(src0_type_name, data_type, ptr_type) \ +__kernel void scatter_nd_##src0_type_name##to##src0_type_name##_big( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + image2d_t output, \ + int width, \ + int area, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int firstFlg = 1; \ + \ + data_type sum = (data_type)(0, 0, 0, 0, 0, 0, 0, 0); \ + __global int* index_ptr = (__global int*)get_image2D_array_ptr(input0); \ + __global ptr_type* update_ptr = (__global ptr_type*)get_image2D_array_ptr(input1); \ + __global ptr_type* output_ptr = (__global ptr_type*)get_image2D_array_ptr(output); \ + for(int i = 0; i < index_num; i++) \ + { \ + int4 indice = vload4(0, index_ptr + i * coord_dim); \ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ; \ + if(gidy == idx) \ + { \ + data_type src; \ + ptr_type tmpData = update_ptr[i * update_width + gidx]; \ + _viv_asm(COPY, src, tmpData, 4); \ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); \ + if(firstFlg) \ + { \ + firstFlg = 0; \ + } \ + } \ + } \ + ptr_type dst; \ + if(firstFlg) \ + { \ + _viv_asm(COPY, dst, zeropoint, 4); \ + } \ + else \ + { \ + _viv_asm(COPY, dst, sum, 4); \ + } \ + output_ptr[gidy * output_width+ gidx] = dst; \ +} +SCATTER_ND_QINT_BIG(U8, vxc_uchar8, uchar) +SCATTER_ND_QINT_BIG(I8, vxc_char8, char) +SCATTER_ND_QINT_BIG(I16, vxc_short8, short) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/select.vx b/src/tim/vx/internal/src/libnnext/ops/vx/select.vx new file mode 100644 index 0000000..8553903 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/select.vx @@ -0,0 +1,132 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvConditiontoDst_2x8; +_viv_uniform VXC_512Bits uniConvIntIn0toDst_2x8; +_viv_uniform VXC_512Bits uniConvIntIn1toDst_2x8; +_viv_uniform VXC_512Bits uniU8SubZP_MulM_PStoF16In0_2x8; +_viv_uniform VXC_512Bits uniU8SubZP_MulM_PStoF16In1_2x8; +_viv_uniform int input0Zp; +_viv_uniform int input1Zp; +_viv_uniform int outputZP; +_viv_uniform VXC_512Bits uniU8AddZP_2x8; + +#define SELECT_INT(type_name, read_fun, write_fun) \ + type_name tmp, src0, src1, dst, value; \ + vxc_char8 value_tmp; \ + read_fun(tmp, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(src0, tmp, tmp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvIntIn0toDst_2x8); \ + read_fun(tmp, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(src1, tmp, tmp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvIntIn1toDst_2x8); \ + read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(value, value_tmp, value_tmp,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \ + dst = (value != 0 ? 
src0 : src1); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +#define SELECT_INT_FUN(cond_name, src_name, dst_name, type_name) \ +__kernel void select_##cond_name##_##src_name##_##src_name##to##dst_name( \ + __read_only image2d_array_t condition, \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + SELECT_INT(type_name, VXC_ReadImage2DArray, VXC_WriteImage2DArray) \ +} + +SELECT_INT_FUN(I8, I8, I8, vxc_char8) +SELECT_INT_FUN(I8, I16, I16, vxc_short8) + +#define SELECT_INT_FUN_2D(cond_name, src_name, dst_name, type_name) \ +__kernel void select_##cond_name##_##src_name##_##src_name##to##dst_name##_2D( \ + __read_only image2d_array_t condition, \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + SELECT_INT(type_name, VXC_ReadImage, VXC_WriteImage) \ +} + +SELECT_INT_FUN_2D(I8, I8, I8, vxc_char8) +SELECT_INT_FUN_2D(I8, I16, I16, vxc_short8) + +#define SELECT_HALF(read_fun, write_fun) \ + vxc_short8 src0, src1, dst, value; \ + vxc_char8 value_tmp; \ + read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(value, value_tmp, value_tmp,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \ + dst = (value != 0 ? src0 : src1); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void select_I8_F16_F16toF16( + __read_only image2d_array_t condition, + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + SELECT_HALF(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void select_I8_F16_F16toF16_2D( + __read_only image2d_array_t condition, + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + SELECT_HALF(VXC_ReadImage, VXC_WriteImage) +} + +#define SELECT_U8(read_fun, write_fun) \ + vxc_uchar8 tmp, src0, src1, dst; \ + vxc_char8 value; \ + vxc_half8 tmp1; \ + vxc_uchar16 input0_ZP, input1_ZP, output_ZP; \ + _viv_asm(COPY, input0_ZP, input0Zp, 4); \ + _viv_asm(COPY, input1_ZP, input1Zp, 4); \ + _viv_asm(COPY, output_ZP, outputZP, 4); \ + read_fun(tmp, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(tmp1, tmp, input0_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8SubZP_MulM_PStoF16In0_2x8); \ + VXC_DP2x8(src0, tmp1, output_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8AddZP_2x8); \ + read_fun(tmp, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(tmp1, tmp, input1_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8SubZP_MulM_PStoF16In1_2x8); \ + VXC_DP2x8(src1, tmp1, output_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8AddZP_2x8); \ + read_fun(value, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 
7, 0, VXC_RM_TowardZero, 0)); \ + dst = (value != 0 ? src0 : src1); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void select_I8_U8_U8toU8( + __read_only image2d_array_t condition, + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + SELECT_U8(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void select_I8_U8_U8toU8_2D( + __read_only image2d_array_t condition, + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + SELECT_U8(VXC_ReadImage, VXC_WriteImage) +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/swish.vx b/src/tim/vx/internal/src/libnnext/ops/vx/swish.vx new file mode 100644 index 0000000..8a21717 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/swish.vx @@ -0,0 +1,143 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float logE; + +float4 sigmoid_(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} + +_viv_uniform float inputScale; +_viv_uniform float inputTail; +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4; +_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; + +#define SWISH_PROCESS(read_fun, write_fun, src_type, src_copy_type, convert_type, dst_type, dst_copy_type, \ + INSCALE, INTAIL, OUTSCALE, OUTZP) \ + src_type src0; \ + src_copy_type src1; \ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, src0, 16); \ + float4 vecA, vecB, vecC, vecD; \ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \ + VXC_DP4x4(vecB, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \ + vecA = vecA * INSCALE + INTAIL; \ + vecB = vecB * INSCALE + INTAIL; \ + vecC = beta * vecA; \ + vecD = beta * vecB; \ + vecC = sigmoid_(vecC); \ + vecD = sigmoid_(vecD); \ + vecA = vecA * vecC; \ + vecB = vecB * vecD; \ + vecA = vecA * OUTSCALE + OUTZP; \ + vecB = vecB * OUTSCALE + OUTZP; \ + convert_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, vecA); \ + _viv_asm(CONV_RTE, dst1, vecB); \ + dst_type dst2; \ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + dst_copy_type dst; \ + _viv_asm(COPY, dst, dst2, 16); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + +#define SWISH_FUNC(src_type_name, dst_type_name, src_type, src_copy_type, convert_type, dst_type, dst_copy_type,\ + INSCALE, INTAIL, OUTSCALE, OUTZP) \ + __kernel void swish_##src_type_name##to##dst_type_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float beta \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + SWISH_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray, src_type, src_copy_type, convert_type, \ + dst_type, dst_copy_type, INSCALE, INTAIL, OUTSCALE, OUTZP) \ +} + +SWISH_FUNC(F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8, 1, 0, 1, 0) +SWISH_FUNC(F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8, 1, 0, outputScale, 0) +SWISH_FUNC(F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8, 1, 0, outputScale, outputZP) +SWISH_FUNC(F16, I16, vxc_short8, vxc_half8, 
int4, vxc_short8, vxc_short8, 1, 0, outputScale, 0) +SWISH_FUNC(I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8, inputScale, 0, outputScale, 0) +SWISH_FUNC(I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0) +SWISH_FUNC(U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8, \ +inputScale, inputTail, outputScale, outputZP) +SWISH_FUNC(U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8, inputScale, inputTail, 1, 0) +SWISH_FUNC(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8, inputScale, 0, outputScale, 0) +SWISH_FUNC(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0) + + +#define SWISH_FUNC_2D(src_type_name, dst_type_name, src_type, src_copy_type, convert_type, dst_type, \ + dst_copy_type, INSCALE, INTAIL, OUTSCALE, OUTZP) \ + __kernel void swish_##src_type_name##to##dst_type_name##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float beta \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + SWISH_PROCESS(VXC_ReadImage, VXC_WriteImage, src_type, src_copy_type, convert_type, dst_type, \ + dst_copy_type, INSCALE, INTAIL, OUTSCALE, OUTZP) \ +} + +SWISH_FUNC_2D(F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8, 1, 0, 1, 0) +SWISH_FUNC_2D(F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8, 1, 0, outputScale, 0) +SWISH_FUNC_2D(F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8, 1, 0, outputScale, outputZP) +SWISH_FUNC_2D(F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8, 1, 0, outputScale, 0) +SWISH_FUNC_2D(I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8, inputScale, 0, outputScale, 0) +SWISH_FUNC_2D(I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0) +SWISH_FUNC_2D(U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8, \ +inputScale, inputTail, outputScale, outputZP) +SWISH_FUNC_2D(U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8, inputScale, inputTail, 1, 0) +SWISH_FUNC_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8, inputScale, 0, outputScale, 0) +SWISH_FUNC_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0) + + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +#define SWISH_BF16_PROCESS(read_fun, write_fun) \ + vxc_ushort8 src0, src1, dst; \ + float4 vecA, vecB, vecC, vecD; \ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecA, src1, 16); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, vecB, src1, 16); \ + vecC = beta * vecA; \ + vecD = beta * vecB; \ + vecC = sigmoid_(vecC); \ + vecD = sigmoid_(vecD); \ + vecA = vecA * vecC; \ + vecB = vecB * vecD; \ + _viv_asm(COPY, src0, vecA, 16); \ + _viv_asm(COPY, src1, vecB, 16); \ + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void swish_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float beta + ) +{ + int4 coord = (int4)(get_global_id(0), 
get_global_id(1), get_global_id(2), 0); + SWISH_BF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray); +} + +__kernel void swish_BF16toBF16_2D( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float beta + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + SWISH_BF16_PROCESS(VXC_ReadImage, VXC_WriteImage); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx b/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx new file mode 100644 index 0000000..54fb828 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx @@ -0,0 +1,133 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int lastWorkItem; + +#define TILE_3D(name0, name1, name2, remainder, type) \ +__kernel void tile_remain##name2##_##name0##to##name1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int batchIn, \ + int depthIn, \ + int depthOut, \ + int multiples_0, \ + int multiples_1, \ + int multiples_2, \ + int multiples_3 \ +) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_out; \ + int width = get_image_width(input); \ + int height = get_image_height(input); \ + int output_width = get_image_width(output); \ + type src; \ + VXC_ReadImage2DArray(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int isLastItem = coord.x == lastWorkItem; \ + \ + int batch_id = (short)coord.z / (short)depthIn; \ + coord.z = (short)coord.z % (short)depthIn; \ + coord_out = coord; \ + \ + for (int w = 0; w < multiples_3; w++) \ + { \ + int batch = batchIn * w + batch_id; \ + \ + for(int z = 0; z < multiples_2; z++) \ + { \ + coord_out.z = coord.z + z * depthIn + batch * depthOut; \ + \ + for (int y = 0; y < multiples_1; y++) \ + { \ + coord_out.y = coord.y + y * height; \ + \ + for (int x = 0; x < multiples_0; x++) \ + { \ + coord_out.x = coord.x + x * width; \ + if (isLastItem) \ + VXC_WriteImage2DArray(output, coord_out, src, \ + VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \ + else \ + VXC_WriteImage2DArray(output, coord_out, src, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + } \ + } \ +} +TILE_3D(U8, U8, 1, 0, vxc_uchar8) +TILE_3D(U8, U8, 2, 1, vxc_uchar8) +TILE_3D(U8, U8, 3, 2, vxc_uchar8) +TILE_3D(U8, U8, 4, 3, vxc_uchar8) +TILE_3D(U8, U8, 5, 4, vxc_uchar8) +TILE_3D(U8, U8, 6, 5, vxc_uchar8) +TILE_3D(U8, U8, 7, 6, vxc_uchar8) +TILE_3D(U8, U8, 0, 7, vxc_uchar8) + +TILE_3D(I16, I16, 1, 0, vxc_short8) +TILE_3D(I16, I16, 2, 1, vxc_short8) +TILE_3D(I16, I16, 3, 2, vxc_short8) +TILE_3D(I16, I16, 4, 3, vxc_short8) +TILE_3D(I16, I16, 5, 4, vxc_short8) +TILE_3D(I16, I16, 6, 5, vxc_short8) +TILE_3D(I16, I16, 7, 6, vxc_short8) +TILE_3D(I16, I16, 0, 7, vxc_short8) + + +#define TILE_2D(name0, name1, name2, remainder, type) \ +__kernel void tile_remain##name2##_##name0##to##name1##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int batchIn, \ + int depthIn, \ + int depthOut, \ + int multiples_0, \ + int multiples_1, \ + int multiples_2, \ + int multiples_3 \ +) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + int width = get_image_width(input); \ + int height = get_image_height(input); \ + int output_width = get_image_width(output); \ + int output_height = get_image_height(output); \ + type src; \ + VXC_ReadImage(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int isLastItem = coord.x == lastWorkItem; \ + do \ + { \ + 
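/* Replicate the 8-element block across the whole output plane: the inner loop steps x by the input width, the outer loop steps y by the input height; the last work-item writes only (remainder + 1) elements so the output row is not overrun. */ \ +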
do \ + { \ + if (isLastItem) \ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \ + else \ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += width; \ + } while (coord.x < output_width); \ + coord.x = get_global_id(0); \ + coord.y += height; \ + } while (coord.y < output_height); \ +} +TILE_2D(U8, U8, 1, 0, vxc_uchar8) +TILE_2D(U8, U8, 2, 1, vxc_uchar8) +TILE_2D(U8, U8, 3, 2, vxc_uchar8) +TILE_2D(U8, U8, 4, 3, vxc_uchar8) +TILE_2D(U8, U8, 5, 4, vxc_uchar8) +TILE_2D(U8, U8, 6, 5, vxc_uchar8) +TILE_2D(U8, U8, 7, 6, vxc_uchar8) +TILE_2D(U8, U8, 0, 7, vxc_uchar8) + +TILE_2D(I16, I16, 1, 0, vxc_short8) +TILE_2D(I16, I16, 2, 1, vxc_short8) +TILE_2D(I16, I16, 3, 2, vxc_short8) +TILE_2D(I16, I16, 4, 3, vxc_short8) +TILE_2D(I16, I16, 5, 4, vxc_short8) +TILE_2D(I16, I16, 6, 5, vxc_short8) +TILE_2D(I16, I16, 7, 6, vxc_short8) +TILE_2D(I16, I16, 0, 7, vxc_short8) + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/tile_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/tile_mix.vx new file mode 100644 index 0000000..b23c1cd --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/tile_mix.vx @@ -0,0 +1,133 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8MulAndPostShift_Lo_2x8; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp + +_viv_uniform int lastWorkItem; + +#define TILE_3D_MIX(name0, name1, name2, remainder, type, out_type) \ +__kernel void tile_remain##name2##_##name0##to##name1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int batchIn, \ + int depthIn, \ + int depthOut, \ + int multiples_0, \ + int multiples_1, \ + int multiples_2, \ + int multiples_3 \ +) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_out; \ + int width = get_image_width(input); \ + int height = get_image_height(input); \ + int output_width = get_image_width(output); \ + type src; \ + vxc_half8 src1; \ + out_type dst; \ + VXC_ReadImage2DArray(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int isLastItem = coord.x == lastWorkItem; \ + \ + int batch_id = (short)coord.z / (short)depthIn; \ + coord.z = (short)coord.z % (short)depthIn; \ + coord_out = coord; \ + \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(src1, src, multiplier,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8MulAndPostShift_Lo_2x8); \ + _viv_asm(COPY, dst, src1, 16); \ + \ + for (int w = 0; w < multiples_3; w++) \ + { \ + int batch = batchIn * w + batch_id; \ + \ + for(int z = 0; z < multiples_2; z++) \ + { \ + coord_out.z = coord.z + z * depthIn + batch * depthOut; \ + \ + for (int y = 0; y < multiples_1; y++) \ + { \ + coord_out.y = coord.y + y * height; \ + \ + for (int x = 0; x < multiples_0; x++) \ + { \ + coord_out.x = coord.x + x * width; \ + if (isLastItem) \ + VXC_WriteImage2DArray(output, coord_out, dst, \ + VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \ + else \ + VXC_WriteImage2DArray(output, coord_out, dst, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + } \ + } \ +} +TILE_3D_MIX(U8, F16, 1, 0, vxc_uchar8, vxc_short8) +TILE_3D_MIX(U8, F16, 2, 1, vxc_uchar8, vxc_short8) +TILE_3D_MIX(U8, F16, 3, 2, vxc_uchar8, vxc_short8) +TILE_3D_MIX(U8, F16, 4, 3, vxc_uchar8, vxc_short8) +TILE_3D_MIX(U8, F16, 5, 4, vxc_uchar8, vxc_short8) +TILE_3D_MIX(U8, F16, 6, 5, vxc_uchar8, vxc_short8) +TILE_3D_MIX(U8, F16, 
7, 6, vxc_uchar8, vxc_short8) +TILE_3D_MIX(U8, F16, 0, 7, vxc_uchar8, vxc_short8) + + +#define TILE_2D_MIX(name0, name1, name2, remainder, type, out_type) \ +__kernel void tile_remain##name2##_##name0##to##name1##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int batchIn, \ + int depthIn, \ + int depthOut, \ + int multiples_0, \ + int multiples_1, \ + int multiples_2, \ + int multiples_3 \ +) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + int width = get_image_width(input); \ + int height = get_image_height(input); \ + int output_width = get_image_width(output); \ + int output_height = get_image_height(output); \ + type src; \ + vxc_half8 src1; \ + out_type dst; \ + VXC_ReadImage(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int isLastItem = coord.x == lastWorkItem; \ + \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(src1, src, multiplier,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8MulAndPostShift_Lo_2x8); \ + _viv_asm(COPY, dst, src1, 16); \ + \ + do \ + { \ + do \ + { \ + if (isLastItem) \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \ + else \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += width; \ + } while (coord.x < output_width); \ + coord.x = get_global_id(0); \ + coord.y += height; \ + } while (coord.y < output_height); \ +} +TILE_2D_MIX(U8, F16, 1, 0, vxc_uchar8, vxc_short8) +TILE_2D_MIX(U8, F16, 2, 1, vxc_uchar8, vxc_short8) +TILE_2D_MIX(U8, F16, 3, 2, vxc_uchar8, vxc_short8) +TILE_2D_MIX(U8, F16, 4, 3, vxc_uchar8, vxc_short8) +TILE_2D_MIX(U8, F16, 5, 4, vxc_uchar8, vxc_short8) +TILE_2D_MIX(U8, F16, 6, 5, vxc_uchar8, vxc_short8) +TILE_2D_MIX(U8, F16, 7, 6, vxc_uchar8, vxc_short8) +TILE_2D_MIX(U8, F16, 0, 7, vxc_uchar8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/upsample_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/upsample_F16.vx new file mode 100644 index 0000000..96af041 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/upsample_F16.vx @@ -0,0 +1,314 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniF16MulMultipiler_PostShft_2x8; +_viv_uniform VXC_512Bits uniS16AddOutZP_2x8; +_viv_uniform vxc_uint4 packed_outputZP; + +#define UPSAMPLE_F16_U8TO_U8_PROCESS(read_fun, write_fun) \ + vxc_short8 din0; \ + vxc_uchar8 din; \ + vxc_uchar8 axisIn; \ + vxc_half8 src; \ + vxc_uchar16 dinExpand; \ + vxc_uchar16 axisInExpand; \ + vxc_uchar16 constAxis; \ + vxc_uchar16 axisData; \ + vxc_uchar16 axisData1; \ + vxc_uchar16 dout; \ + read_fun(din0, dataIn, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + vxc_short8 tmp; \ + uchar zp = 0; \ + _viv_asm(COPY, src, din0, 16); \ + VXC_DP2x8(tmp, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniF16MulMultipiler_PostShft_2x8); \ + vxc_uchar16 packed_outZP; \ + _viv_asm(COPY, packed_outZP, packed_outputZP, 16); \ + VXC_DP2x8(din, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniS16AddOutZP_2x8); \ + constAxis = (vxc_uchar16)(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); \ + dinExpand = din.s0011223344556677; \ + axisInExpand = axisIn.s0011223344556677; \ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + 
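/* Appears to implement max-unpooling: the clamp compares each stored pooling index against this output position (0/1 for the top output row, 2/3 for the bottom row written later); it is reduced to a 0/1 mask below and multiplied into the data so only the recorded argmax position of each 2x2 window receives the value. */ \ +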
axisData &= (vxc_uchar16)(1); \ + _viv_asm(COPY, axisData1, axisData, 16); \ + dout = axisData1 * dinExpand; \ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + constAxis = (vxc_uchar16)(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3); \ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + axisData &= (vxc_uchar16)(1); \ + _viv_asm(COPY, axisData1, axisData, 16); \ + dout = axisData1 * dinExpand; \ + coordOut.y += 1; \ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + +__kernel void upsample_F16_U8to_U8 + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0); + UPSAMPLE_F16_U8TO_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void upsample_F16_U8to_U8_2D + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x << 1, coord.y << 1); + UPSAMPLE_F16_U8TO_U8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +_viv_uniform VXC_512Bits shortMulShort_8x8; +_viv_uniform VXC_512Bits uniConvertFstFp16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform int upOutput_ZP; +_viv_uniform float upOutput_Scale; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + + +#define UPSAMPLE_F16_I16TO_U8_PROCESS(read_fun, write_fun) \ + vxc_short4 din; \ + vxc_short4 axisIn; \ + vxc_short8 dinExp, axisInExp, constAxis,axisData,tmpout; \ + vxc_half8 dout; \ + vxc_float4 tmpVal1, tmpVal2, convZp; \ + vxc_int4 tmpData1, tmpData2, tmpData3; \ + vxc_uchar8 result; \ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + dinExp = din.s00112233; \ + axisInExp = axisIn.s00112233; \ + constAxis = (vxc_short8)(0, 1, 0, 1, 0, 1, 0, 1); \ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axisData &= (vxc_short8)(1); \ + VXC_DP2x8(tmpout, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), shortMulShort_8x8); \ + _viv_asm(COPY, dout, tmpout, 16); \ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstFp16Fp32_4x4); \ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecFp16Fp32_4x4); \ + tmpVal1 /= upOutput_Scale; \ + tmpVal2 /= upOutput_Scale; \ + tmpData3 = isnotequal(tmpVal1, 0); \ + tmpData3 *= (-upOutput_ZP); \ + convZp = convert_float4_rtp(tmpData3); \ + tmpVal1 += convZp; \ + tmpData3 = isnotequal(tmpVal2, 0); \ + tmpData3 *= (-upOutput_ZP); \ + convZp = convert_float4_rtp(tmpData3); \ + tmpVal2 += convZp; \ + tmpData1 = convert_int4_rte(tmpVal1); \ + tmpData2 = convert_int4_rte(tmpVal2); \ + VXC_DP2x8(result, tmpData1, tmpData2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniConvertInt32toUint8_2x8); \ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + constAxis = (vxc_short8)(2, 3, 2, 3, 2, 3, 2, 3); \ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axisData &= (vxc_short8)(1); \ + VXC_DP2x8(tmpout, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), shortMulShort_8x8); \ + 
_viv_asm(COPY, dout, tmpout, 16); \ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstFp16Fp32_4x4); \ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecFp16Fp32_4x4); \ + tmpVal1 /= upOutput_Scale; \ + tmpVal2 /= upOutput_Scale; \ + tmpData3 = isnotequal(tmpVal1, 0); \ + tmpData3 *= (-upOutput_ZP); \ + convZp = convert_float4_rtp(tmpData3); \ + tmpVal1 += convZp; \ + tmpData3 = isnotequal(tmpVal2, 0); \ + tmpData3 *= (-upOutput_ZP); \ + convZp = convert_float4_rtp(tmpData3); \ + tmpVal2 += convZp; \ + tmpData1 = convert_int4_rte(tmpVal1); \ + tmpData2 = convert_int4_rte(tmpVal2); \ + VXC_DP2x8(result, tmpData1, tmpData2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniConvertInt32toUint8_2x8); \ + coordOut.y += 1; \ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + +__kernel void upsample_F16_I16to_U8 + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0); + UPSAMPLE_F16_I16TO_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void upsample_F16_I16to_U8_2D + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x << 1, coord.y << 1); + UPSAMPLE_F16_I16TO_U8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +_viv_uniform float scaleOut; +_viv_uniform float outputZp; +_viv_uniform VXC_512Bits ucharMulShort_8x8_2; + +#define UPSAMPLE_F16_U8TO_I8_PROCESS(read_fun, write_fun) \ + vxc_short4 din; \ + vxc_uchar4 axisIn; \ + vxc_short8 dinExp, tmpOut; \ + vxc_uchar8 axisInExp; \ + vxc_uchar8 constAxis; \ + vxc_uchar8 axisData; \ + vxc_half8 dout; \ + vxc_float4 tmpVal0, tmpVal1; \ + vxc_char8 result; \ + int4 tmpData1, tmpData2; \ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + dinExp = din.s00112233; \ + axisInExp = axisIn.s00112233; \ + constAxis = (vxc_uchar8)(0, 1, 0, 1, 0, 1, 0, 1); \ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axisData &= (vxc_uchar8)(1); \ + VXC_DP2x8(tmpOut, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_8x8_2); \ + _viv_asm(COPY, dout, tmpOut, 16); \ + VXC_DP4x4(tmpVal0, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertFstFp16Fp32_4x4); \ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertSecFp16Fp32_4x4); \ + tmpVal0 = tmpVal0 * scaleOut + outputZp; \ + tmpVal1 = tmpVal1 * scaleOut + outputZp; \ + tmpData1 = convert_int4_rte(tmpVal0); \ + tmpData2 = convert_int4_rte(tmpVal1); \ + VXC_DP2x8(result, tmpData1, tmpData2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvertInt32toUint8_2x8); \ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + constAxis = (vxc_uchar8)(2, 3, 2, 3, 2, 3, 2, 3); \ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axisData &= (vxc_uchar8)(1); \ + VXC_DP2x8(tmpOut, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_8x8_2); \ + coordOut.y += 1; \ + _viv_asm(COPY, dout, tmpOut, 16); \ + 
VXC_DP4x4(tmpVal0, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertFstFp16Fp32_4x4); \ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertSecFp16Fp32_4x4); \ + tmpVal0 = tmpVal0 * scaleOut + outputZp; \ + tmpVal1 = tmpVal1 * scaleOut + outputZp; \ + tmpData1 = convert_int4_rte(tmpVal0); \ + tmpData2 = convert_int4_rte(tmpVal1); \ + VXC_DP2x8(result, tmpData1, tmpData2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvertInt32toUint8_2x8); \ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + +__kernel void upsample_F16_U8to_I8 + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0); + UPSAMPLE_F16_U8TO_I8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void upsample_F16_U8to_I8_2D + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x << 1, coord.y << 1); + UPSAMPLE_F16_U8TO_I8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +_viv_uniform float up_outFlScale_i16; + +#define UPSAMPLE_F16_U8TO_I16_PROCESS(read_fun, write_fun) \ + vxc_short4 din; \ + vxc_uchar4 axisIn; \ + vxc_short8 dinExp, tmpOut; \ + vxc_uchar8 axisInExp; \ + vxc_uchar8 constAxis; \ + vxc_uchar8 axisData; \ + half8 dout; \ + float4 tmpVal1, tmpVal2; \ + int4 tmpData1, tmpData2; \ + vxc_short8 result; \ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + dinExp = din.s00112233; \ + axisInExp = axisIn.s00112233; \ + constAxis = (vxc_uchar8)(0, 1, 0, 1, 0, 1, 0, 1); \ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axisData &= (vxc_uchar8)(1); \ + VXC_DP2x8(tmpOut, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_8x8_2); \ + _viv_asm(COPY, dout, tmpOut, 16); \ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstFp16Fp32_4x4); \ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecFp16Fp32_4x4); \ + tmpVal1 *= up_outFlScale_i16; \ + tmpVal2 *= up_outFlScale_i16; \ + tmpData1 = convert_int4_rte(tmpVal1); \ + tmpData2 = convert_int4_rte(tmpVal2); \ + VXC_DP2x8(result, tmpData1, tmpData2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniConvertInt32toUint8_2x8); \ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + constAxis = (vxc_uchar8)(2, 3, 2, 3, 2, 3, 2, 3); \ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axisData &= (vxc_uchar8)(1); \ + VXC_DP2x8(tmpOut, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_8x8_2); \ + coordOut.y += 1; \ + _viv_asm(COPY, dout, tmpOut, 16); \ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstFp16Fp32_4x4); \ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecFp16Fp32_4x4); \ + tmpVal1 *= up_outFlScale_i16; \ + tmpVal2 *= up_outFlScale_i16; \ + tmpData1 = convert_int4_rte(tmpVal1); \ + tmpData2 = convert_int4_rte(tmpVal2); \ + VXC_DP2x8(result, tmpData1, tmpData2, VXC_MODIFIER(0, 7, 
0, VXC_RM_ToNearestEven, 1), \ + uniConvertInt32toUint8_2x8); \ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + +__kernel void upsample_F16_U8to_I16 + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0); + UPSAMPLE_F16_U8TO_I16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void upsample_F16_U8to_I16_2D + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x << 1, coord.y << 1); + UPSAMPLE_F16_U8TO_I16_PROCESS(VXC_ReadImage, VXC_WriteImage) +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/upsample_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/upsample_I16.vx new file mode 100644 index 0000000..76db9d2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/upsample_I16.vx @@ -0,0 +1,227 @@ +#include "cl_viv_vx_ext.h" + +//--------------------------unpooling------------------------- +_viv_uniform VXC_512Bits uniQuantInOutInt16_2x8; +_viv_uniform VXC_512Bits ucharMulShort_2x8; + +#define UPSAMPLE_I16_U8TO_I16_SAME_PROCESS(read_fun, write_fun) \ + vxc_short4 din; \ + vxc_uchar4 axisIn; \ + vxc_short8 dinExp; \ + vxc_uchar8 axisInExp; \ + vxc_uchar8 constAxis; \ + vxc_uchar8 axisData; \ + vxc_short8 axisData_short; \ + vxc_short8 dout; \ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + dinExp = din.s00112233; \ + axisInExp = axisIn.s00112233; \ + constAxis = (vxc_uchar8)(0, 1, 0, 1, 0, 1, 0, 1); \ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axisData &= (vxc_uchar8)(1); \ + VXC_DP2x8(dout, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_2x8); \ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + constAxis = (vxc_uchar8)(2, 3, 2, 3, 2, 3, 2, 3); \ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axisData &= (vxc_uchar8)(1); \ + VXC_DP2x8(dout, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_2x8); \ + coordOut.y += 1; \ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void upsample_I16_U8to_I16_SAME + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0); + UPSAMPLE_I16_U8TO_I16_SAME_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void upsample_I16_U8to_I16_SAME_2D + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x << 1, coord.y << 1); + UPSAMPLE_I16_U8TO_I16_SAME_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +#define UPSAMPLE_I16_TO_I16_PROCESS(axis_type, axis_in_type, read_fun, write_fun) \ + vxc_short4 din; \ + axis_in_type axisIn; \ + vxc_short8 dinExp; \ + axis_type axisInExp; \ + axis_type constAxis; \ + axis_type axisData; \ + vxc_short8 axisData_short; \ + vxc_short8 dout; \ + read_fun(din, dataIn, coord, 
VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + dinExp = din.s00112233; \ + axisInExp = axisIn.s00112233; \ + constAxis = (axis_type)(0, 1, 0, 1, 0, 1, 0, 1); \ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axisData &= (axis_type)(1); \ + VXC_DP2x8(dout, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_2x8); \ + VXC_DP2x8(dout, dout, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantInOutInt16_2x8); \ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + constAxis = (axis_type)(2, 3, 2, 3, 2, 3, 2, 3); \ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axisData &= (axis_type)(1); \ + VXC_DP2x8(dout, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_2x8); \ + VXC_DP2x8(dout, dout, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantInOutInt16_2x8); \ + coordOut.y += 1; \ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void upsample_I16_U8to_I16 + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0); + UPSAMPLE_I16_TO_I16_PROCESS(vxc_uchar8, vxc_uchar4, VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void upsample_I16_U8to_I16_2D + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x << 1, coord.y << 1); + UPSAMPLE_I16_TO_I16_PROCESS(vxc_uchar8, vxc_uchar4, VXC_ReadImage, VXC_WriteImage) +} + + +__kernel void upsample_I16_I16to_I16 + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0); + UPSAMPLE_I16_TO_I16_PROCESS(vxc_short8, vxc_short4, VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void upsample_I16_I16to_I16_2D + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x << 1, coord.y << 1); + UPSAMPLE_I16_TO_I16_PROCESS(vxc_short8, vxc_short4, VXC_ReadImage, VXC_WriteImage) +} + + +_viv_uniform VXC_512Bits uniConvertDirInt16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertU8toI16_2x8; +_viv_uniform float inScaleInt16; + +#define UPSAMPLE_I16_TO_F16_PROCESS(axis_type, axis_in_type, read_fun, write_fun) \ + vxc_short8 din; \ + axis_in_type axisIn; \ + vxc_short8 dinExp; \ + axis_type axisInExp; \ + axis_type constAxis; \ + axis_type axisData; \ + vxc_short8 axisData_short; \ + vxc_short8 dout; \ + vxc_float4 tmpVal0, tmpVal1; \ + half4 tmpOut0; \ + vxc_short8 tmpOut1; \ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmpVal0, din, din, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertDirInt16Fp32_4x4); \ + tmpVal1 = tmpVal0 * inScaleInt16; \ + _viv_asm(CONV, tmpOut0, tmpVal1); \ + _viv_asm(COPY, tmpOut1, tmpOut0, 16); \ 
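+ /* the swizzle below repeats lanes so each dequantized input column feeds two adjacent output columns (2x upsample along X) */ \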
+ dinExp = tmpOut1.s00224466; \ + axisInExp = axisIn.s00112233; \ + constAxis = (axis_type)(0, 1, 0, 1, 0, 1, 0, 1); \ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axisData &= (axis_type)(1); \ + VXC_DP2x8(axisData_short, axisData, axisData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertU8toI16_2x8); \ + dout = axisData_short == 1 ? dinExp : 0; \ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + constAxis = (axis_type)(2, 3, 2, 3, 2, 3, 2, 3); \ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + axisData &= (axis_type)(1); \ + VXC_DP2x8(axisData_short, axisData, axisData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertU8toI16_2x8); \ + dout = axisData_short == 1 ? dinExp : 0; \ + coordOut.y += 1; \ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void upsample_I16_I16to_F16 + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0); + UPSAMPLE_I16_TO_F16_PROCESS(vxc_short8, vxc_short4, VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void upsample_I16_I16to_F16_2D + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x << 1, coord.y << 1); + UPSAMPLE_I16_TO_F16_PROCESS(vxc_short8, vxc_short4, VXC_ReadImage, VXC_WriteImage) +} + + +__kernel void upsample_I16_U8to_F16 + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0); + UPSAMPLE_I16_TO_F16_PROCESS(vxc_uchar8, vxc_uchar4, VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void upsample_I16_U8to_F16_2D + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x << 1, coord.y << 1); + UPSAMPLE_I16_TO_F16_PROCESS(vxc_uchar8, vxc_uchar4, VXC_ReadImage, VXC_WriteImage) +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/upsample_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/upsample_I8.vx new file mode 100644 index 0000000..fa0408f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/upsample_I8.vx @@ -0,0 +1,246 @@ + +#include "cl_viv_vx_ext.h" + +_viv_uniform int input_ZP; + +#define UPSAMPLE_I8_U8TO_I8_SAME_PROCESS(read_fun, write_fun) \ + vxc_char8 din; \ + vxc_uchar8 axisIn; \ + vxc_char16 dinExpand; \ + vxc_uchar16 axisInExpand; \ + vxc_uchar16 constAxis; \ + vxc_uchar16 axisData; \ + vxc_char16 zpValue; \ + vxc_char16 dout; \ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + dinExpand = din.s0011223344556677; \ + axisInExpand = axisIn.s0011223344556677; \ + zpValue = (char)input_ZP; \ + constAxis = (vxc_uchar16)(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); \ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + dout = axisData ? 
dinExpand : zpValue; \ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + constAxis = (vxc_uchar16)(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3); \ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + dout = axisData ? dinExpand : zpValue; \ + coordOut.y += 1; \ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + +__kernel void upsample_I8_U8to_I8_SAME + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0); + UPSAMPLE_I8_U8TO_I8_SAME_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void upsample_I8_U8to_I8_SAME_2D + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x << 1, coord.y << 1); + UPSAMPLE_I8_U8TO_I8_SAME_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + +_viv_uniform VXC_512Bits uniU8SubZP_MulM_2x8; +_viv_uniform VXC_512Bits uniU8SubZP_MulM_Hi_2x8; +_viv_uniform VXC_512Bits uniS16AddOutZP_2x8; +_viv_uniform VXC_512Bits uniS16MoveValue_2x8; +_viv_uniform vxc_uint4 packed_outputZP; + +#define UPSAMPLE_I8_U8TO_I8_PROCESS(read_fun, write_fun) \ + vxc_char8 din; \ + vxc_uchar8 axisIn; \ + vxc_char16 dinExpand; \ + vxc_uchar16 axisInExpand; \ + vxc_uchar16 constAxis; \ + vxc_uchar16 axisData; \ + vxc_char16 zpValue; \ + vxc_char16 dout; \ + vxc_char16 result, result_tmp; \ + zpValue = (char)input_ZP; \ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + dinExpand = din.s0011223344556677; \ + axisInExpand = axisIn.s0011223344556677; \ + constAxis = (vxc_uchar16)(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); \ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + dout = axisData ? dinExpand : zpValue; \ + vxc_short8 tmp; \ + short zp = input_ZP; \ + vxc_short8 packed_outZP; \ + _viv_asm(COPY, packed_outZP, packed_outputZP, 16); \ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8SubZP_MulM_2x8); \ + VXC_DP2x8(result, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniS16AddOutZP_2x8); \ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8SubZP_MulM_Hi_2x8); \ + VXC_DP2x8(result_tmp, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniS16AddOutZP_2x8); \ + VXC_DP2x8(result, result_tmp, result_tmp, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniS16MoveValue_2x8); \ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + constAxis = (vxc_uchar16)(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3); \ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + dout = axisData ? 
dinExpand : zpValue; \ + coordOut.y += 1; \ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8SubZP_MulM_2x8); \ + VXC_DP2x8(result, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniS16AddOutZP_2x8); \ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8SubZP_MulM_Hi_2x8); \ + VXC_DP2x8(result_tmp, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniS16AddOutZP_2x8); \ + VXC_DP2x8(result, result_tmp, result_tmp, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniS16MoveValue_2x8); \ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + +__kernel void upsample_I8_U8to_I8 + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0); + UPSAMPLE_I8_U8TO_I8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void upsample_I8_U8to_I8_2D + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x << 1, coord.y << 1); + UPSAMPLE_I8_U8TO_I8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + + +_viv_uniform VXC_512Bits uniConvertDirUint8Fp32_4x4_2; +_viv_uniform VXC_512Bits uniConvertEndUint8Fp32_4x4_2; +_viv_uniform VXC_512Bits uniConvertTrdUint8Fp32_4x4_2; +_viv_uniform VXC_512Bits uniConvertFthUint8Fp32_4x4_2; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8_2; +_viv_uniform float scaleIn; +_viv_uniform float inputTail; + +#define UPSAMPLE_I8_U8TO_F16_PROCESS(read_fun, write_fun) \ + vxc_char8 din; \ + vxc_uchar8 axisIn; \ + vxc_char16 dinExpand; \ + vxc_uchar16 axisInExpand; \ + vxc_uchar16 constAxis; \ + vxc_uchar16 axisData; \ + vxc_char16 zpValue; \ + vxc_char16 dout; \ + zpValue = (char)input_ZP; \ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coordOut1.x += 8; \ + dinExpand = din.s0011223344556677; \ + axisInExpand = axisIn.s0011223344556677; \ + constAxis = (vxc_uchar16)(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); \ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + dout = axisData ? 
dinExpand : zpValue; \ + vxc_float4 tmpVal0, tmpVal1, tmpVal2, tmpVal3; \ + half4 tmpOut0, tmpOut1; \ + vxc_short8 rout0, rout1; \ + VXC_DP4x4(tmpVal0, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertDirUint8Fp32_4x4_2); \ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertEndUint8Fp32_4x4_2); \ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertTrdUint8Fp32_4x4_2); \ + VXC_DP4x4(tmpVal3, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertFthUint8Fp32_4x4_2); \ + tmpVal0 = tmpVal0 * scaleIn + inputTail; \ + tmpVal1 = tmpVal1 * scaleIn + inputTail; \ + tmpVal2 = tmpVal2 * scaleIn + inputTail; \ + tmpVal3 = tmpVal3 * scaleIn + inputTail; \ + _viv_asm(CONV, tmpOut0, tmpVal0); \ + _viv_asm(CONV, tmpOut1, tmpVal1); \ + VXC_DP2x8(rout0, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvertInt32toUint8_2x8_2); \ + _viv_asm(CONV, tmpOut0, tmpVal2); \ + _viv_asm(CONV, tmpOut1, tmpVal3); \ + VXC_DP2x8(rout1, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvertInt32toUint8_2x8_2); \ + write_fun(dataOut, coordOut, rout0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + write_fun(dataOut, coordOut1, rout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + constAxis = (vxc_uchar16)(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3); \ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + dout = axisData ? dinExpand : zpValue; \ + VXC_DP4x4(tmpVal0, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertDirUint8Fp32_4x4_2); \ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertEndUint8Fp32_4x4_2); \ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertTrdUint8Fp32_4x4_2); \ + VXC_DP4x4(tmpVal3, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertFthUint8Fp32_4x4_2); \ + tmpVal0 = tmpVal0 * scaleIn + inputTail; \ + tmpVal1 = tmpVal1 * scaleIn + inputTail; \ + tmpVal2 = tmpVal2 * scaleIn + inputTail; \ + tmpVal3 = tmpVal3 * scaleIn + inputTail; \ + _viv_asm(CONV, tmpOut0, tmpVal0); \ + _viv_asm(CONV, tmpOut1, tmpVal1); \ + VXC_DP2x8(rout0, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvertInt32toUint8_2x8_2); \ + _viv_asm(CONV, tmpOut0, tmpVal2); \ + _viv_asm(CONV, tmpOut1, tmpVal3); \ + VXC_DP2x8(rout1, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvertInt32toUint8_2x8_2); \ + coordOut.y += 1; \ + coordOut1.y += 1; \ + write_fun(dataOut, coordOut, rout0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + write_fun(dataOut, coordOut1, rout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + +__kernel void upsample_I8_U8to_F16 + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0); + int4 coordOut1 = coordOut; + UPSAMPLE_I8_U8TO_F16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void upsample_I8_U8to_F16_2D + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x << 1, coord.y << 1); + int2 coordOut1 = coordOut; + UPSAMPLE_I8_U8TO_F16_PROCESS(VXC_ReadImage, VXC_WriteImage) +} diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/upsample_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/upsample_U8.vx new file mode 100644 index 0000000..e1b2766 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/upsample_U8.vx @@ -0,0 +1,281 @@ + +#include "cl_viv_vx_ext.h" + +_viv_uniform int input_ZP; + +#define UPSAMPLE_U8_U8TO_U8_SAME_PROCESS(read_fun, write_fun) \ + vxc_uchar8 din; \ + vxc_uchar8 axisIn; \ + vxc_uchar16 dinExpand; \ + vxc_uchar16 axisInExpand; \ + vxc_uchar16 constAxis; \ + vxc_uchar16 axisData; \ + vxc_uchar16 zpValue; \ + vxc_uchar16 dout; \ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + dinExpand = din.s0011223344556677; \ + axisInExpand = axisIn.s0011223344556677; \ + zpValue = (uchar)input_ZP; \ + constAxis = (vxc_uchar16)(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); \ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + dout = axisData ? dinExpand : zpValue; \ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + constAxis = (vxc_uchar16)(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3); \ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + dout = axisData ? dinExpand : zpValue; \ + coordOut.y += 1; \ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + +__kernel void upsample_U8_U8to_U8_SAME + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0); + UPSAMPLE_U8_U8TO_U8_SAME_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void upsample_U8_U8to_U8_SAME_2D + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x << 1, coord.y << 1); + UPSAMPLE_U8_U8TO_U8_SAME_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + + +_viv_uniform VXC_512Bits uniU8SubZP_MulM_2x8; +_viv_uniform VXC_512Bits uniU8SubZP_MulM_Hi_2x8; +_viv_uniform VXC_512Bits uniS16AddOutZP_2x8; +_viv_uniform VXC_512Bits uniS16MoveValue_2x8; +_viv_uniform vxc_uint4 packed_outputZP; + +#define UPSAMPLE_U8_U8TO_U8_PROCESS(read_fun, write_fun) \ + vxc_uchar8 din; \ + vxc_uchar8 axisIn; \ + vxc_uchar16 dinExpand; \ + vxc_uchar16 axisInExpand; \ + vxc_uchar16 constAxis; \ + vxc_uchar16 axisData; \ + vxc_uchar16 zpValue; \ + vxc_uchar16 dout; \ + vxc_uchar16 result, result_tmp; \ + zpValue = (uchar)input_ZP; \ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + dinExpand = din.s0011223344556677; \ + axisInExpand = axisIn.s0011223344556677; \ + constAxis = (vxc_uchar16)(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); \ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + dout = axisData ? 
dinExpand : zpValue; \ + vxc_short8 tmp; \ + short zp = input_ZP; \ + vxc_short8 packed_outZP; \ + _viv_asm(COPY, packed_outZP, packed_outputZP, 16); \ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8SubZP_MulM_2x8); \ + VXC_DP2x8(result, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniS16AddOutZP_2x8); \ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8SubZP_MulM_Hi_2x8); \ + VXC_DP2x8(result_tmp, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniS16AddOutZP_2x8); \ + VXC_DP2x8(result, result_tmp, result_tmp, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniS16MoveValue_2x8); \ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + constAxis = (vxc_uchar16)(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3); \ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + dout = axisData ? dinExpand : zpValue; \ + coordOut.y += 1; \ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8SubZP_MulM_2x8); \ + VXC_DP2x8(result, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniS16AddOutZP_2x8); \ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8SubZP_MulM_Hi_2x8); \ + VXC_DP2x8(result_tmp, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniS16AddOutZP_2x8); \ + VXC_DP2x8(result, result_tmp, result_tmp, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\ + uniS16MoveValue_2x8); \ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + +__kernel void upsample_U8_U8to_U8 + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0); + UPSAMPLE_U8_U8TO_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void upsample_U8_U8to_U8_2D + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x << 1, coord.y << 1); + UPSAMPLE_U8_U8TO_U8_PROCESS(VXC_ReadImage, VXC_WriteImage) +} + + +_viv_uniform VXC_512Bits uniMulMinusZpUint8_4x4; +_viv_uniform VXC_512Bits uniMulMinusZp2Uint8_4x4; +_viv_uniform VXC_512Bits uniMulMinusZp3Uint8_4x4; +_viv_uniform VXC_512Bits uniMulMinusZp4Uint8_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; +_viv_uniform VXC_512Bits uniConvertDirUint8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertEndUint8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertTrdUint8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertFthUint8Fp32_4x4; +_viv_uniform float scaleU8Fp16; +_viv_uniform int zpU8Fp16; + +#define UPSAMPLE_U8_U8TO_F16_PROCESS(read_fun, write_fun) \ + vxc_uchar8 din; \ + vxc_uchar8 axisIn; \ + vxc_uchar16 dinExpand; \ + vxc_uchar16 axisInExpand; \ + vxc_uchar16 constAxis; \ + vxc_uchar16 axisData; \ + vxc_uchar16 axisData1; \ + vxc_uchar16 dout; \ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coordOut1.x += 8; \ + dinExpand = din.s0011223344556677; \ + axisInExpand = axisIn.s0011223344556677; \ + constAxis = (vxc_uchar16)(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); \ + 
VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + axisData &= (vxc_uchar16)(1); \ + _viv_asm(COPY, axisData1, axisData, 16); \ + dout = axisData1 * dinExpand; \ + vxc_float4 tmpVal0, tmpVal1, tmpVal2, tmpVal3, convZp; \ + half4 tmpOut0, tmpOut1; \ + vxc_short8 rout0, rout1; \ + vxc_int4 tmpV0, tmpV1, tmpV2, tmpV3; \ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + short tmpZp = (short)(-zpU8Fp16); \ + VXC_DP4x4(tmpVal0, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertDirUint8Fp32_4x4); \ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndUint8Fp32_4x4); \ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertTrdUint8Fp32_4x4); \ + VXC_DP4x4(tmpVal3, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertFthUint8Fp32_4x4); \ + VXC_DP4x4(tmpV0, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniMulMinusZpUint8_4x4); \ + VXC_DP4x4(tmpV1, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniMulMinusZp2Uint8_4x4); \ + VXC_DP4x4(tmpV2, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniMulMinusZp3Uint8_4x4); \ + VXC_DP4x4(tmpV3, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniMulMinusZp4Uint8_4x4); \ + _viv_asm(CONV, tmpData0, tmpV0); \ + _viv_asm(CONV, tmpData1, tmpV1); \ + _viv_asm(CONV, tmpData2, tmpV2); \ + _viv_asm(CONV, tmpData3, tmpV3); \ + tmpVal0 = (tmpVal0 + tmpData0) * scaleU8Fp16; \ + tmpVal1 = (tmpVal1 + tmpData1) * scaleU8Fp16; \ + tmpVal2 = (tmpVal2 + tmpData2) * scaleU8Fp16; \ + tmpVal3 = (tmpVal3 + tmpData3) * scaleU8Fp16; \ + _viv_asm(CONV, tmpOut0, tmpVal0); \ + _viv_asm(CONV, tmpOut1, tmpVal1); \ + VXC_DP2x8(rout0, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt32toInt16_2x8); \ + _viv_asm(CONV, tmpOut0, tmpVal2); \ + _viv_asm(CONV, tmpOut1, tmpVal3); \ + VXC_DP2x8(rout1, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt32toInt16_2x8); \ + write_fun(dataOut, coordOut, rout0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + write_fun(dataOut, coordOut1, rout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + constAxis = (vxc_uchar16)(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3); \ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + axisData &= (vxc_uchar16)(1); \ + _viv_asm(COPY, axisData1, axisData, 16); \ + dout = axisData1 * dinExpand; \ + VXC_DP4x4(tmpVal0, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertDirUint8Fp32_4x4); \ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndUint8Fp32_4x4); \ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertTrdUint8Fp32_4x4); \ + VXC_DP4x4(tmpVal3, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertFthUint8Fp32_4x4); \ + VXC_DP4x4(tmpV0, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniMulMinusZpUint8_4x4); \ + VXC_DP4x4(tmpV1, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniMulMinusZp2Uint8_4x4); \ + VXC_DP4x4(tmpV2, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniMulMinusZp3Uint8_4x4); \ + VXC_DP4x4(tmpV3, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniMulMinusZp4Uint8_4x4); \ + _viv_asm(CONV, tmpData0, tmpV0); \ + _viv_asm(CONV, tmpData1, tmpV1); \ + _viv_asm(CONV, tmpData2, tmpV2); \ 
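+ /* below: out = (masked q + mask * -zp) * scaleU8Fp16, i.e. (q - zp) * scale where the axis mask is 1 and 0 elsewhere */ \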
+ _viv_asm(CONV, tmpData3, tmpV3); \ + tmpVal0 = (tmpVal0 + tmpData0) * scaleU8Fp16; \ + tmpVal1 = (tmpVal1 + tmpData1) * scaleU8Fp16; \ + tmpVal2 = (tmpVal2 + tmpData2) * scaleU8Fp16; \ + tmpVal3 = (tmpVal3 + tmpData3) * scaleU8Fp16; \ + _viv_asm(CONV, tmpOut0, tmpVal0); \ + _viv_asm(CONV, tmpOut1, tmpVal1); \ + VXC_DP2x8(rout0, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt32toInt16_2x8); \ + _viv_asm(CONV, tmpOut0, tmpVal2); \ + _viv_asm(CONV, tmpOut1, tmpVal3); \ + VXC_DP2x8(rout1, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt32toInt16_2x8); \ + coordOut.y += 1; \ + coordOut1.y += 1; \ + write_fun(dataOut, coordOut, rout0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + write_fun(dataOut, coordOut1, rout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + +__kernel void upsample_U8_U8to_F16 + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0); + int4 coordOut1 = coordOut; + UPSAMPLE_U8_U8TO_F16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} + +__kernel void upsample_U8_U8to_F16_2D + ( + image2d_array_t dataIn, + image2d_array_t axis, + image2d_array_t dataOut + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coordOut = (int2)(coord.x << 1, coord.y << 1); + int2 coordOut1 = coordOut; + UPSAMPLE_U8_U8TO_F16_PROCESS(VXC_ReadImage, VXC_WriteImage) +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_axis_aligned_bbox_transform.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_axis_aligned_bbox_transform.vx new file mode 100644 index 0000000..b0def7f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_axis_aligned_bbox_transform.vx @@ -0,0 +1,8 @@ +#include "cl_viv_vx_ext.h" + +__kernel void vxcAxis_aligned_bbox_transform( + __read_only image2d_array_t input, + __write_only image2d_array_t output) +{ + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_box_with_nms_limit.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_box_with_nms_limit.vx new file mode 100644 index 0000000..c351f66 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_box_with_nms_limit.vx @@ -0,0 +1,8 @@ +#include "cl_viv_vx_ext.h" + +__kernel void vxcBox_with_nms_limit( + __read_only image2d_array_t input, + __write_only image2d_array_t output) +{ + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_crop.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_crop.vx new file mode 100644 index 0000000..bc5e1d0 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_crop.vx @@ -0,0 +1,111 @@ +#include "cl_viv_vx_ext.h" + +//-----------------------------------------------tensor crop------------------------------- +__kernel void vxcTensorCrop_Int16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int offset0, + int offset1, + int offset2) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_ushort8 src0, src1, src2, src3; + + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\ + 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\ + - offset1, get_global_id(2) - offset2, 0); + + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord_out.y ++; + VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord_out.y ++; + VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord_out.y ++; + VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void vxcTensorCrop_Int8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int offset0, + int offset1, + int offset2) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar16 src0, src1, src2, src3; + + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1) - offset1,\ + get_global_id(2) - offset2, 0); + + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y ++; + VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y ++; + VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y ++; + VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8; + +__kernel void vxcTensorCrop_Int16_Fp16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int offset0, + int offset1, + int offset2) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_short8 src0, src1, src2, src3; + + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\ + - offset1, get_global_id(2) - offset2, 0); + + vxc_half8 dst0, dst1, dst2, dst3; + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt16toFp16_2x8); + VXC_DP2x8(dst1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt16toFp16_2x8); + VXC_DP2x8(dst2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt16toFp16_2x8); + VXC_DP2x8(dst3, src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt16toFp16_2x8); + + vxc_short8 out0, out1, out2, out3; + _viv_asm(COPY, out0, dst0, 16); + _viv_asm(COPY, out1, dst1, 16); 
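+ // the bit-copies reinterpret the FP16 lanes as 16-bit words so VXC_WriteImage2DArray can store them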
+ _viv_asm(COPY, out2, dst2, 16); + _viv_asm(COPY, out3, dst3, 16); + + VXC_WriteImage2DArray(output, coord_out, out0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord_out.y ++; + VXC_WriteImage2DArray(output, coord_out, out1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord_out.y ++; + VXC_WriteImage2DArray(output, coord_out, out2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord_out.y ++; + VXC_WriteImage2DArray(output, coord_out, out3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_detection_postprocess.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_detection_postprocess.vx new file mode 100644 index 0000000..763daa0 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_detection_postprocess.vx @@ -0,0 +1,8 @@ +#include "cl_viv_vx_ext.h" + +__kernel void vxcDetection_postprocess( + __read_only image2d_array_t input, + __write_only image2d_array_t output) +{ + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_extra_ending.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_extra_ending.vx new file mode 100644 index 0000000..7ebb20a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_extra_ending.vx @@ -0,0 +1,43 @@ +#include "cl_viv_vx_ext.h" + +__kernel void vxcExtra_ending_i16( + __read_only image2d_array_t input0, + __read_only image2d_array_t input, + __write_only image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 data; + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void vxcExtra_ending_i8( + __read_only image2d_array_t input0, + __read_only image2d_array_t input, + __write_only image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_char8 data; + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void vxcExtra_ending_u8( + __read_only image2d_array_t input0, + __read_only image2d_array_t input, + __write_only image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar8 data; + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_fullconnect2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_fullconnect2.vx new file mode 100644 index 0000000..a052f8c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_fullconnect2.vx @@ -0,0 +1,63 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int loopNum; +_viv_uniform VXC_512Bits uniMulAcc_16x1; +__kernel void vsi_nn_kernel_fullconnect2( + __read_only image2d_array_t input, + __read_only image2d_array_t weight, + __read_only image2d_array_t bias, + __write_only image2d_array_t output) +{ + int4 coord_in = (int4)(16, get_global_id(0), get_global_id(1), 0); + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 v0, v1, v2, v3, v4, v5, v6, v7; + vxc_half8 i0, i1, i2, i3; + vxc_half8 w0, 
w1, w2, w3; + float4 sum = 0; + float dst = 0; + dst = read_imagef(bias, coord_in.ywww).x; + do + { + VXC_ReadImage(v0, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, i0, v0, 16); + VXC_ReadImage(v1, weight, coord_in.xy, VXC_5BITOFFSET_XY(-16, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, w0, v1, 16); + VXC_ReadImage(v2, input, coord_in.xz, VXC_5BITOFFSET_XY(-8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, i1, v2, 16); + VXC_ReadImage(v3, weight, coord_in.xy, VXC_5BITOFFSET_XY(-8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, w1, v3, 16); + VXC_ReadImage(v4, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, i2, v4, 16); + VXC_ReadImage(v5, weight, coord_in.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, w2, v5, 16); + VXC_ReadImage(v6, input, coord_in.xz, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, i3, v6, 16); + VXC_ReadImage(v7, weight, coord_in.xy, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, w3, v7, 16); + + coord_in.x += 32; + + VXC_DP16x1(sum, i0, w0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1); + VXC_DP16x1(sum, i1, w1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1); + VXC_DP16x1(sum, i2, w2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1); + VXC_DP16x1(sum, i3, w3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1); + + float4 tmp = {1, 1, 1, 1}; + dst = dst + dot(sum, tmp); + + } while (coord_in.x < loopNum); + + vxc_half v; + _viv_asm(CONV, v, dst); + _viv_asm(COPY, v0, v, 16); + VXC_WriteImage(output, coord_out.xy, v0, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_generate_proposals.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_generate_proposals.vx new file mode 100644 index 0000000..9b2e37d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_generate_proposals.vx @@ -0,0 +1,8 @@ +#include "cl_viv_vx_ext.h" + +__kernel void vxcGenerate_proposals( + __read_only image2d_array_t input, + __write_only image2d_array_t output) +{ + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx new file mode 100644 index 0000000..b0f9565 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx @@ -0,0 +1,198 @@ +/* + ============================================================================ + Name : libNNExt.vx + Author : VSI + Version : + Copyright : Your copyright notice + Description : + ============================================================================ + */ +#include "cl_viv_vx_ext.h" + +#if (VX_VERSION==1) +#define VXC_DP2x8_b_(dst, src0, src1, src2, info, uniform)\ +do\ +{\ + _viv_asm(COPY, dst, src0, 16); \ +} while (0) + +#define VXC_VertMin3_Integer(dst, src0, src1, src2, info)\ +do\ +{\ + typeof (dst) tmp; \ + tmp = min(src0, src1);\ + dst = min(src2, tmp);\ +} while (0) + +#define VXC_VertMin3_Half(dst, src0, src1, src2, info)\ +do\ +{\ + vxc_short8 val0_ver1, val1_ver1, val2_ver1, minVal_ver1, maxVal_ver1;\ + _viv_asm(COPY, val0_ver1, src0, 16);\ + _viv_asm(COPY, val1_ver1, src1, 16);\ + _viv_asm(COPY, val2_ver1, src2, 16);\ + maxVal_ver1 = max(val0_ver1, 
val1_ver1);\ + minVal_ver1 = min(val0_ver1, val1_ver1);\ + minVal_ver1 = maxVal_ver1 < 0 ? maxVal_ver1 : minVal_ver1; \ + maxVal_ver1 = max(val2_ver1, minVal_ver1);\ + minVal_ver1 = min(val2_ver1, minVal_ver1);\ + minVal_ver1 = maxVal_ver1 < 0 ? maxVal_ver1 : minVal_ver1; \ + _viv_asm(COPY, dst, minVal_ver1, 16); \ +} while (0) + +#define VXC_VertMax3_Integer(dst, src0, src1, src2, info)\ +do\ +{\ + int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ + int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ + int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ + int mod1 = VXC_MODIFIER_CLAMP(startBin, endBin, sourceBin, 0);\ + typeof (dst) tmp;\ + tmp = max(src0, src1);\ + tmp = max(src2, tmp);\ + VXC_Clamp(dst, tmp, tmp, tmp, mod1);\ +} while (0) + +#define VXC_VertMax3_Half(dst, src0, src1, src2, info)\ + do\ + {\ + vxc_short8 val0_ver1, val1_ver1, val2_ver1, minVal_ver1, maxVal_ver1;\ + _viv_asm(COPY, val0_ver1, src0, 16);\ + _viv_asm(COPY, val1_ver1, src1, 16);\ + _viv_asm(COPY, val2_ver1, src2, 16);\ + maxVal_ver1 = max(val0_ver1, val1_ver1);\ + maxVal_ver1 = max(val2_ver1, maxVal_ver1);\ + minVal_ver1 = min(val0_ver1, val1_ver1);\ + minVal_ver1 = min(val2_ver1, minVal_ver1);\ + maxVal_ver1 = maxVal_ver1 >= 0 ? maxVal_ver1 : minVal_ver1;\ + _viv_asm(COPY, dst, maxVal_ver1, 16); \ + } while (0) + +#define VXC_HorzMax3_Integer(dst, src0, info)\ +do\ +{\ + int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ + int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ + int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ + int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ + int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\ + VXC_OP4(filter, dst, src0, src0, src0, mod1);\ +} while (0) + +#define VXC_HorzMax3_Half(dst, src0, info)\ +do\ +{\ + int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ + int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ + int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ + int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ + int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\ + int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\ + vxc_short8 val0, minVal, maxVal;\ + _viv_asm(COPY, val0, src0, 16);\ + VXC_OP4(filter, maxVal, val0, val0, val0, mod1);\ + VXC_OP4(filter, minVal, val0, val0, val0, mod2);\ + maxVal = maxVal >= 0 ? 
maxVal : minVal;\ + _viv_asm(COPY, dst, maxVal, 16);\ +} while (0) + +#define VXC_HorzMin3_Integer(dst, src0, info)\ +do\ +{\ + int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ + int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ + int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ + int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ + int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\ + VXC_OP4(filter, dst, src0, src0, src0, mod1);\ +} while (0) + +#define VXC_HorzMin3_Half(dst, src0, info)\ +do\ +{\ + int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ + int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ + int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ + int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ + int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\ + int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\ + int mod3 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Median, clamp);\ + vxc_short8 val0, minVal, maxVal, midVal;\ + _viv_asm(COPY, val0, src0, 16);\ + VXC_OP4(filter, maxVal, val0, val0, val0, mod1);\ + VXC_OP4(filter, minVal, val0, val0, val0, mod2);\ + VXC_OP4(filter, midVal, val0, val0, val0, mod3);\ + minVal = midVal < 0 ? midVal : minVal;\ + minVal = maxVal < 0 ? maxVal : minVal;\ + _viv_asm(COPY, dst, minVal, 16);\ +} while (0) + +#define VXC_Clamp_Half(dst, src0, src1, src2, info)\ +do\ +{\ + typeof (dst) tmp;\ + VXC_VertMax3_Half(tmp, src0, src0, src1, info);\ + VXC_VertMin3_Half(dst, tmp, tmp, src2, info);\ +} while (0) + +#else +#define VXC_DP2x8_b_(dst, src0, src1, src2, info, uniform)\ +do\ +{\ + VXC_DP2x8_b(dst, src0, src1, src2, info, uniform); \ +} while (0) + +#define VXC_VertMin3_Integer(dst, src0, src1, src2, info)\ + do\ + {\ + VXC_VertMin3(dst, src0, src1, src2, info);\ + } while (0) + +#define VXC_VertMin3_Half(dst, src0, src1, src2, info)\ + do\ + {\ + VXC_VertMin3(dst, src0, src1, src2, info);\ + } while (0) + +#define VXC_VertMax3_Integer(dst, src0, src1, src2, info)\ +do\ +{\ + VXC_VertMax3(dst, src0, src1, src2, info);\ +} while (0) + +#define VXC_VertMax3_Half(dst, src0, src1, src2, info)\ +do\ +{\ + VXC_VertMax3(dst, src0, src1, src2, info);\ +} while (0) + +#define VXC_HorzMax3_Integer(dst, src0, info)\ +do\ +{\ + VXC_HorzMax3(dst, src0, info);\ +} while (0) + +#define VXC_HorzMax3_Half(dst, src0, info)\ +do\ +{\ + VXC_HorzMax3(dst, src0, info);\ +} while (0) + +#define VXC_HorzMin3_Integer(dst, src0, info)\ +do\ +{\ + VXC_HorzMin3(dst, src0, info);\ +} while (0) + +#define VXC_HorzMin3_Half(dst, src0, info)\ +do\ +{\ + VXC_HorzMin3(dst, src0, info);\ +} while (0) + +#define VXC_Clamp_Half(dst, src0, src1, src2, info)\ +do\ +{\ + VXC_Clamp(dst, src0, src1, src2, info);\ +} while (0) +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_heatmap_max_keypoint.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_heatmap_max_keypoint.vx new file mode 100644 index 0000000..4d7a7a7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_heatmap_max_keypoint.vx @@ -0,0 +1,8 @@ +#include "cl_viv_vx_ext.h" + +__kernel void vxcHeatmap_max_keypoint( + __read_only image2d_array_t input, + __write_only image2d_array_t output) +{ + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess.vx new file mode 100644 index 0000000..93ad2cd --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess.vx @@ -0,0 +1,321 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniVecShift10; +_viv_uniform VXC_512Bits uniAddRShift; +_viv_uniform VXC_512Bits uniGetTempVal; +_viv_uniform VXC_512Bits uniExtractBytes; +_viv_uniform VXC_512Bits uniUnpackToR; +_viv_uniform VXC_512Bits uniUnpackToG; +_viv_uniform VXC_512Bits uniUnpackToB; +_viv_uniform VXC_512Bits uniDataMulAlpha_4x4; +_viv_uniform VXC_512Bits uniDataSubMean_4x4; + +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; +_viv_uniform float outputScale; +_viv_uniform VXC_512Bits uniExtactInteger_2x8; + +#define DESCALE(x) (((x) + (1<<19)) >> 20) +__kernel void ScaletoTensor_Int8 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float f32Var + ) +{ + int2 ratioXY = (int2)(*xRatio, *yRatio); + + int4 xPos = get_global_id(0); + int yPos = get_global_id(1); + + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); + xPos += (int4)(0, 1, 2, 3); + + //x + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; + int4 sx = fx0 & 0xffff8000; + fx0 -= sx; + sx = sx >> 15; + + vxc_short4 fx; + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); + //y + int fy = yPos * ratioXY.y + ratioSufXY.y; + int sy = fy & 0xffff8000; // Floor + + fy -= sy; + sy = sy >> 15; + + fy = (fy + (1<< 4)) >> 5; + + //R + vxc_uchar16 line0RGB1, line0RGB2; + vxc_uchar16 line1RGB3, line1RGB4; + int4 coord; + sx = sx * 3 + *xOffset; + coord.xyz = sx.xyz; + coord.w = sy + *yOffset; + int2 coord1 = (int2)(sx.w, coord.w); + VXC_ReadImage(line0RGB1, input, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0RGB1, input, coord.yw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0RGB2, input, coord.zw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0RGB2, input, coord1, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + + float4 bgrMean = (float4)(bMean, gMean, rMean, 0); + + bgrMean *= f32Var; + + int4 test01, temp1; + int4 test02, temp2; + int4 tt; + vxc_uchar4 val; + int4 coord_out = (int4)(xPos.x, yPos, 2, 0); + + vxc_uchar8 line1, line2; + + //R + VXC_DP2x8(line1, line0RGB1, line0RGB2,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); + VXC_DP2x8(line2, line1RGB3, line1RGB4,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); + + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - 
temp1) + (temp1 << 10); + + vxc_float4 tmp_dst; + vxc_uchar4 u8_dst; + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); + + //convert U8 to dfp8 + int4 dst0; + vxc_char4 dst; + tmp_dst = tmp_dst * f32Var - bgrMean.z; + tmp_dst *= outputScale; + dst0 = convert_int4_rte(tmp_dst); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); + + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + //G + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); + + coord_out.z = 1; + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); + + tmp_dst = tmp_dst * f32Var - bgrMean.y; + tmp_dst *= outputScale; + dst0 = convert_int4_rte(tmp_dst); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + //B + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); + + coord_out.z = 0; + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + VXC_DP4x4(u8_dst, temp2, 1 << 19,\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); + + tmp_dst = tmp_dst * f32Var - bgrMean.x; + tmp_dst *= outputScale; + dst0 = convert_int4_rte(tmp_dst); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void ScaletoTensor_Fp16 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float f32Var + ) +{ + int2 ratioXY = (int2)(*xRatio, *yRatio); + + int4 xPos = get_global_id(0); + int yPos = get_global_id(1); + + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); + xPos += (int4)(0, 1, 2, 3); + + //x + int4 
fx0 = xPos * ratioXY.x + ratioSufXY.x; + int4 sx = fx0 & 0xffff8000; + fx0 -= sx; + sx = sx >> 15; + + vxc_short4 fx; + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); + //y + int fy = yPos * ratioXY.y + ratioSufXY.y; + int sy = fy & 0xffff8000; // Floor + + fy -= sy; + sy = sy >> 15; + + fy = (fy + (1<< 4)) >> 5; + + //R + vxc_uchar16 line0RGB1, line0RGB2; + vxc_uchar16 line1RGB3, line1RGB4; + int4 coord; + sx = sx * 3 + *xOffset; + coord.xyz = sx.xyz; + coord.w = sy + *yOffset; + int2 coord1 = (int2)(sx.w, coord.w); + VXC_ReadImage(line0RGB1, input, coord.xw,\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0RGB1, input, coord.yw,\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0RGB2, input, coord.zw,\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0RGB2, input, coord1,\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(line1RGB3, input, coord.xw,\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1RGB3, input, coord.yw,\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1RGB4, input, coord.zw,\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1RGB4, input, coord1,\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + + float4 bgrMean = (float4)(bMean, gMean, rMean, 0); + + int4 test01, temp1; + int4 test02, temp2; + int4 tt; + vxc_uchar4 val; + int4 coord_out = (int4)(xPos.x, yPos, 2, 0); + + vxc_uchar8 line1, line2; + + //R + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); + + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + + //convert U8 to FP16 + half4 f16mean; + half f16alpha; + vxc_half4 dst; + vxc_short4 tmp_dst; + _viv_asm(CONV, f16mean, bgrMean); + _viv_asm(CONV, f16alpha, f32Var); + VXC_DP4x4(dst, val, f16mean.z, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataSubMean_4x4); + VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4); + _viv_asm(COPY, tmp_dst, dst, 8); + VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + //G + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); + + coord_out.z = 1; + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + 
VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + + VXC_DP4x4(dst, val, f16mean.y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataSubMean_4x4); + VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4); + _viv_asm(COPY, tmp_dst, dst, 8); + VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + //B + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); + + coord_out.z = 0; + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + + VXC_DP4x4(dst, val, f16mean.x, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataSubMean_4x4); + VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4); + _viv_asm(COPY, tmp_dst, dst, 8); + VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_2.vx new file mode 100644 index 0000000..88efed8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_2.vx @@ -0,0 +1,327 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniVecShift10; +_viv_uniform VXC_512Bits uniAddRShift; +_viv_uniform VXC_512Bits uniGetTempVal; +_viv_uniform VXC_512Bits uniExtractBytes; +_viv_uniform VXC_512Bits uniUnpackToR; +_viv_uniform VXC_512Bits uniUnpackToG; +_viv_uniform VXC_512Bits uniUnpackToB; +_viv_uniform VXC_512Bits uniDataMulAlpha_4x4; +_viv_uniform VXC_512Bits uniDataSubMean_4x4; + +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; +_viv_uniform float outputScale; +_viv_uniform VXC_512Bits uniExtactInteger_2x8; + +#define DESCALE(x) (((x) + (1<<19)) >> 20) +__kernel void ScaletoTensor_Int16 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float f32Var + ) +{ + int2 ratioXY = (int2)(*xRatio, *yRatio); + + int4 xPos = get_global_id(0); + int yPos = get_global_id(1); + + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); + xPos += (int4)(0, 1, 2, 3); + + //x + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; + int4 sx = fx0 & 0xffff8000; + fx0 -= sx; + sx = sx >> 15; + + vxc_short4 fx; + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); + //y + int fy = yPos * ratioXY.y + ratioSufXY.y; + int sy = fy & 0xffff8000; // Floor + + fy -= sy; + sy = sy >> 15; + + fy = (fy + (1<< 4)) >> 5; + + //R + vxc_uchar16 line0RGB1, line0RGB2; + vxc_uchar16 line1RGB3, line1RGB4; + int4 
coord; + sx = sx * 3 + *xOffset; + coord.xyz = sx.xyz; + coord.w = sy + *yOffset; + int2 coord1 = (int2)(sx.w, coord.w); + VXC_ReadImage(line0RGB1, input, coord.xw,\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0RGB1, input, coord.yw,\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0RGB2, input, coord.zw,\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0RGB2, input, coord1,\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(line1RGB3, input, coord.xw,\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1RGB3, input, coord.yw,\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1RGB4, input, coord.zw,\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1RGB4, input, coord1,\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + + float4 bgrMean = (float4)(bMean, gMean, rMean, 0); + + bgrMean *= f32Var; + + int4 test01, temp1; + int4 test02, temp2; + int4 tt; + vxc_uchar4 val; + int4 coord_out = (int4)(xPos.x, yPos, 2, 0); + + vxc_uchar8 line1, line2; + + //R + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); + + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + vxc_float4 tmp_dst; + vxc_uchar4 u8_dst; + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); + + //convert U8 to dfp8 + int4 dst0; + vxc_short4 dst; + tmp_dst = tmp_dst * f32Var - bgrMean.z; + tmp_dst *= outputScale; + dst0 = convert_int4_rte(tmp_dst); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); + + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + //G + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); + + coord_out.z = 1; + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); + + tmp_dst = tmp_dst * f32Var - 
bgrMean.y; + tmp_dst *= outputScale; + dst0 = convert_int4_rte(tmp_dst); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + //B + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); + + coord_out.z = 0; + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); + + tmp_dst = tmp_dst * f32Var - bgrMean.x; + tmp_dst *= outputScale; + dst0 = convert_int4_rte(tmp_dst); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +_viv_uniform float outputZP; +__kernel void ScaletoTensor_UInt8 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float f32Var + ) +{ + int2 ratioXY = (int2)(*xRatio, *yRatio); + + int4 xPos = get_global_id(0); + int yPos = get_global_id(1); + + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); + xPos += (int4)(0, 1, 2, 3); + + //x + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; + int4 sx = fx0 & 0xffff8000; + fx0 -= sx; + sx = sx >> 15; + + vxc_short4 fx; + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); + //y + int fy = yPos * ratioXY.y + ratioSufXY.y; + int sy = fy & 0xffff8000; // Floor + + fy -= sy; + sy = sy >> 15; + + fy = (fy + (1<< 4)) >> 5; + + //R + vxc_uchar16 line0RGB1, line0RGB2; + vxc_uchar16 line1RGB3, line1RGB4; + int4 coord; + sx = sx * 3 + *xOffset; + coord.xyz = sx.xyz; + coord.w = sy + *yOffset; + int2 coord1 = (int2)(sx.w, coord.w); + VXC_ReadImage(line0RGB1, input, coord.xw,\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0RGB1, input, coord.yw,\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0RGB2, input, coord.zw,\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0RGB2, input, coord1,\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(line1RGB3, input, coord.xw,\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1RGB3, input, coord.yw,\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1RGB4, input, coord.zw,\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1RGB4, input, coord1,\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); + + float4 bgrMean = (float4)(bMean, gMean, rMean, 0); 
+ + bgrMean *= f32Var; + + int4 test01, temp1; + int4 test02, temp2; + int4 tt; + vxc_uchar4 val; + int4 coord_out = (int4)(xPos.x, yPos, 2, 0); + + vxc_uchar8 line1, line2; + + //R + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); + + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + vxc_float4 tmp_dst; + vxc_uchar4 u8_dst; + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); + + //convert U8 to dfp8 + int4 dst0; + vxc_uchar4 dst; + tmp_dst = tmp_dst * f32Var - bgrMean.z; + tmp_dst = tmp_dst * outputScale + outputZP; + dst0 = convert_int4_rte(tmp_dst); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); + + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + //G + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); + + coord_out.z = 1; + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); + + tmp_dst = tmp_dst * f32Var - bgrMean.y; + tmp_dst = tmp_dst * outputScale + outputZP; + dst0 = convert_int4_rte(tmp_dst); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + //B + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); + + coord_out.z = 0; + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + 
VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); + + tmp_dst = tmp_dst * f32Var - bgrMean.x; + tmp_dst = tmp_dst * outputScale + outputZP; + dst0 = convert_int4_rte(tmp_dst); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_3.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_3.vx new file mode 100644 index 0000000..742459e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_3.vx @@ -0,0 +1,214 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniExtractR_2x8; +_viv_uniform VXC_512Bits uniExtractG_2x8; +_viv_uniform VXC_512Bits uniExtractB_2x8; +_viv_uniform float outputScale; +__kernel void ScaletoTensor_Fp16_copy + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float f32Var + ) +{ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); + + coord.xy += (int2) (*xOffset, *yOffset); + vxc_uchar16 src0, src1; + vxc_half8 dst; + + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + float4 paramData = (float4)(rMean * f32Var, gMean * f32Var, bMean * f32Var, f32Var); + //convert U8 to FP16 + half4 paramData_f16; + vxc_short8 tmp_dst; + _viv_asm(CONV, paramData_f16, paramData); + + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0); + //R + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0), uniExtractR_2x8); + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_TowardZero, 0), uniExtractR_2x8); + _viv_asm(COPY, tmp_dst, dst, 16); + VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + + //G + coord_out.z = 1; + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0), uniExtractG_2x8); + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_TowardZero, 0), uniExtractG_2x8); + _viv_asm(COPY, tmp_dst, dst, 16); + VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + //B + coord_out.z = 0; + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0), uniExtractB_2x8); + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_TowardZero, 0), uniExtractB_2x8); + _viv_asm(COPY, tmp_dst, dst, 16); + VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void ScaletoTensor_Int8_copy + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float f32Var + ) +{ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); + + coord.xy += (int2) (*xOffset, *yOffset); + vxc_uchar16 src0, src1; + vxc_char16 dst; + + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\ + 
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + f32Var *= outputScale; + float4 paramData = (float4)(rMean * f32Var, gMean * f32Var, bMean * f32Var, f32Var); + //convert U8 to FP16 + half4 paramData_f16; + _viv_asm(CONV, paramData_f16, paramData); + + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0); + //R + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8); + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0)); + + + //G + coord_out.z = 1; + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8); + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0)); + + //B + coord_out.z = 0; + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8); + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void ScaletoTensor_Int16_copy + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float f32Var + ) +{ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); + + coord.xy += (int2) (*xOffset, *yOffset); + vxc_uchar16 src0, src1; + vxc_short8 dst; + + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + f32Var *= outputScale; + float4 paramData = (float4)(rMean * f32Var, gMean * f32Var, bMean * f32Var, f32Var); + //convert U8 to FP16 + half4 paramData_f16; + _viv_asm(CONV, paramData_f16, paramData); + + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0); + //R + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8); + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + + //G + coord_out.z = 1; + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8); + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + //B + coord_out.z = 0; + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8); + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +_viv_uniform float outputZP; +__kernel void ScaletoTensor_UInt8_copy + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float f32Var + ) +{ + int2 coord = (int2)(get_global_id(0) * 3, 
get_global_id(1)); + + coord.xy += (int2) (*xOffset, *yOffset); + vxc_uchar16 src0, src1; + vxc_uchar16 dst; + + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + f32Var *= outputScale; + float4 paramData = (float4)(rMean * f32Var - outputZP,\ + gMean * f32Var - outputZP, bMean * f32Var - outputZP, f32Var); + //convert U8 to FP16 + half4 paramData_f16; + _viv_asm(CONV, paramData_f16, paramData); + + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0); + //R + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8); + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0)); + + + //G + coord_out.z = 1; + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8); + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0)); + + //B + coord_out.z = 0; + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8); + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_4.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_4.vx new file mode 100644 index 0000000..e7c9049 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_4.vx @@ -0,0 +1,207 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniVecShift10; +_viv_uniform VXC_512Bits uniAddRShift; +_viv_uniform VXC_512Bits uniGetTempVal; +_viv_uniform VXC_512Bits uniExtractBytes; + +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; +_viv_uniform float outputScale; +_viv_uniform VXC_512Bits uniExtactInteger_2x8; + +#define DESCALE(x) (((x) + (1<<19)) >> 20) +__kernel void GrayScaletoTensor_Int8 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int2 ratioXY = (int2)(*xRatio, *yRatio); + + int4 xPos = get_global_id(0); + int yPos = get_global_id(1); + + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); + xPos += (int4)(0, 1, 2, 3); + + //x + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; + int4 sx = fx0 & 0xffff8000; + fx0 -= sx; + sx = sx >> 15; + + vxc_short4 fx; + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); + //y + int fy = yPos * ratioXY.y + ratioSufXY.y; + int sy = fy & 0xffff8000; // Floor + + fy -= sy; + sy = sy >> 15; + + fy = (fy + (1<< 4)) >> 5; + + //R + vxc_uchar16 line0Y; + vxc_uchar16 line1Y; + int4 coord; + sx = sx + *xOffset; + coord.xyz = sx.xyz; + coord.w = sy + *yOffset; + int2 coord1 = (int2)(sx.w, coord.w); + VXC_ReadImage(line0Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord.zw, 
VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + float grayMean = mean * f32Var; + + int4 test01, temp1; + int4 test02, temp2; + int4 tt; + vxc_uchar4 val; + int2 coord_out = (int2)(xPos.x, yPos); + + vxc_uchar8 line1, line2; + + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + vxc_float4 tmp_dst; + vxc_uchar4 u8_dst; + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, + 1), uniConvertIntergetoF32_4x4); + + //convert U8 to dfp8 + int4 dst0; + vxc_char4 dst; + tmp_dst = tmp_dst * f32Var - grayMean; + tmp_dst *= outputScale; + dst0 = convert_int4_rte(tmp_dst); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); + + VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniDataMulAlpha_4x4; +_viv_uniform VXC_512Bits uniDataSubMean_4x4; +__kernel void GrayScaletoTensor_Fp16 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int2 ratioXY = (int2)(*xRatio, *yRatio); + + int4 xPos = get_global_id(0); + int yPos = get_global_id(1); + + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); + xPos += (int4)(0, 1, 2, 3); + + //x + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; + int4 sx = fx0 & 0xffff8000; + fx0 -= sx; + sx = sx >> 15; + + vxc_short4 fx; + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); + //y + int fy = yPos * ratioXY.y + ratioSufXY.y; + int sy = fy & 0xffff8000; // Floor + + fy -= sy; + sy = sy >> 15; + + fy = (fy + (1<< 4)) >> 5; + + //R + vxc_uchar16 line0Y; + vxc_uchar16 line1Y; + int4 coord; + sx = sx + *xOffset; + coord.xyz = sx.xyz; + coord.w = sy + *yOffset; + int2 coord1 = (int2)(sx.w, coord.w); + VXC_ReadImage(line0Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + 
VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + float grayMean = mean; + + int4 test01, temp1; + int4 test02, temp2; + int4 tt; + vxc_uchar4 val; + int2 coord_out = (int2)(xPos.x, yPos); + + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + + //convert U8 to FP16 + half f16mean; + half f16alpha; + vxc_half4 dst; + vxc_short4 tmp_dst; + _viv_asm(CONV, f16mean, grayMean); + _viv_asm(CONV, f16alpha, f32Var); + VXC_DP4x4(dst, val, f16mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataSubMean_4x4); + VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4); + _viv_asm(COPY, tmp_dst, dst, 8); + VXC_WriteImage(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_5.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_5.vx new file mode 100644 index 0000000..15bfb2e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_imageprocess_5.vx @@ -0,0 +1,355 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniVecShift10; +_viv_uniform VXC_512Bits uniAddRShift; +_viv_uniform VXC_512Bits uniGetTempVal; +_viv_uniform VXC_512Bits uniExtractBytes; + +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; +_viv_uniform float outputScale; +_viv_uniform VXC_512Bits uniExtactInteger_2x8; + +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8; +_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8; +__kernel void GrayScaletoTensor_Fp16_copy + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + coord.xy += (int2) (*xOffset, *yOffset); + vxc_uchar16 src0; + vxc_half8 dst0, dst1; + + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord.x = coord.z + 8; + float4 paramData = (float4)(mean * f32Var, mean * f32Var, mean * f32Var, f32Var); + //convert U8 to FP16 + half4 paramData_f16; + vxc_short8 tmp_dst; + _viv_asm(CONV, paramData_f16, paramData); + + VXC_DP2x8(dst0, src0, paramData_f16, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDataMeanStddevLo_2x8); + VXC_DP2x8(dst1, src0, paramData_f16, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDataMeanStddevHi_2x8); + _viv_asm(COPY, tmp_dst, dst0, 16); + VXC_WriteImage(output, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, tmp_dst, dst1, 16); + VXC_WriteImage(output, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + 
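+/*
+ * Note on the sampling math used by the non-copy ScaletoTensor / GrayScaletoTensor
+ * kernels (a hedged reading of the existing code above and below, not new behaviour):
+ *   - *xRatio / *yRatio appear to carry srcSize/dstSize in Q15 fixed point, so
+ *     fx0 = xPos * ratio + (ratio/2 - (1 << 14)) is the source x in Q15 and
+ *     sx = fx0 & 0xffff8000 is its integer part (fx0 - sx is the Q15 fraction,
+ *     sx >> 15 the integer source coordinate).
+ *   - uniAddRShift and (fy + (1 << 4)) >> 5 appear to reduce the Q15 fractions to Q10.
+ *   - The paired dot-product passes then do a horizontal followed by a vertical lerp,
+ *     leaving temp2 = fy * (temp2 - temp1) + (temp1 << 10) in Q20; uniExtractBytes
+ *     applies DESCALE-style rounding, ((x + (1 << 19)) >> 20), back to 8-bit samples
+ *     before the per-format mean/scale conversion and write-out.
+ */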
+__kernel void GrayScaletoTensor_Int8_copy + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + coord.xy += (int2) (*xOffset, *yOffset); + vxc_uchar16 src0; + vxc_char16 dst; + + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + f32Var *= outputScale; + float4 paramData = (float4)(mean * f32Var, mean * f32Var, mean * f32Var, f32Var); + //convert U8 to FP16 + half4 paramData_f16; + _viv_asm(CONV, paramData_f16, paramData); + + VXC_DP2x8(dst, src0, paramData_f16, + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevLo_2x8); + VXC_DP2x8(dst, src0, paramData_f16, + VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevHi_2x8); + VXC_WriteImage(output, coord.zw, dst, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + +} + +__kernel void GrayScaletoTensor_Int16 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int2 ratioXY = (int2)(*xRatio, *yRatio); + + int4 xPos = get_global_id(0); + int yPos = get_global_id(1); + + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); + xPos += (int4)(0, 1, 2, 3); + + //x + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; + int4 sx = fx0 & 0xffff8000; + fx0 -= sx; + sx = sx >> 15; + + vxc_short4 fx; + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); + //y + int fy = yPos * ratioXY.y + ratioSufXY.y; + int sy = fy & 0xffff8000; // Floor + + fy -= sy; + sy = sy >> 15; + + fy = (fy + (1<< 4)) >> 5; + + vxc_uchar16 line0Y; + vxc_uchar16 line1Y; + int4 coord; + sx = sx + *xOffset; + coord.xyz = sx.xyz; + coord.w = sy + *yOffset; + int2 coord1 = (int2)(sx.w, coord.w); + VXC_ReadImage(line0Y, input, coord.xw, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord.yw, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord.zw, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + float grayMean = mean * f32Var; + + int4 test01, temp1; + int4 test02, temp2; + int4 tt; + vxc_uchar4 val; + int2 coord_out = (int2)(xPos.x, yPos); + + vxc_uchar8 line1, line2; + + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + 
test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + vxc_float4 tmp_dst; + vxc_uchar4 u8_dst; + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); + + //convert U8 to dfp8 + int4 dst0; + vxc_short4 dst; + tmp_dst = tmp_dst * f32Var - grayMean; + tmp_dst *= outputScale; + dst0 = convert_int4_rte(tmp_dst); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); + + VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void GrayScaletoTensor_Int16_copy + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + coord.xy += (int2) (*xOffset, *yOffset); + vxc_uchar16 src0; + vxc_short8 dst0, dst1; + + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord.x = coord.z + 8; + + f32Var *= outputScale; + float4 paramData = (float4)(mean * f32Var, mean * f32Var, mean * f32Var, f32Var); + //convert U8 to FP16 + half4 paramData_f16; + _viv_asm(CONV, paramData_f16, paramData); + + + VXC_DP2x8(dst0, src0, paramData_f16, + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevLo_2x8); + VXC_DP2x8(dst1, src0, paramData_f16, + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevHi_2x8); + VXC_WriteImage(output, coord.zw, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.xw, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +_viv_uniform float outputZP; +__kernel void GrayScaletoTensor_UInt8_copy + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + coord.xy += (int2) (*xOffset, *yOffset); + vxc_uchar16 src0; + vxc_uchar16 dst; + + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + f32Var *= outputScale; + float4 paramData = (float4)(mean * f32Var - outputZP, mean * f32Var - outputZP, + mean * f32Var - outputZP, f32Var); + //convert U8 to FP16 + half4 paramData_f16; + _viv_asm(CONV, paramData_f16, paramData); + + VXC_DP2x8(dst, src0, paramData_f16, + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevLo_2x8); + VXC_DP2x8(dst, src0, paramData_f16, + VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevHi_2x8); + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void GrayScaletoTensor_UInt8 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int2 ratioXY = (int2)(*xRatio, *yRatio); + + int4 xPos = get_global_id(0); + int yPos = get_global_id(1); + + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); + xPos += (int4)(0, 1, 2, 3); + + //x + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; + int4 sx = fx0 & 0xffff8000; + fx0 -= sx; + sx = sx >> 15; + + vxc_short4 fx; + VXC_DP4x4(fx, 
fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); + //y + int fy = yPos * ratioXY.y + ratioSufXY.y; + int sy = fy & 0xffff8000; // Floor + + fy -= sy; + sy = sy >> 15; + + fy = (fy + (1<< 4)) >> 5; + + //R + vxc_uchar16 line0Y; + vxc_uchar16 line1Y; + int4 coord; + sx = sx + *xOffset; + coord.xyz = sx.xyz; + coord.w = sy + *yOffset; + int2 coord1 = (int2)(sx.w, coord.w); + VXC_ReadImage(line0Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + float grayMean = mean * f32Var; + + int4 test01, temp1; + int4 test02, temp2; + int4 tt; + vxc_uchar4 val; + int2 coord_out = (int2)(xPos.x, yPos); + + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp1 = temp1 + test01; + + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); + temp2 = temp2 + test02; + temp2 = fy * (temp2 - temp1) + (temp1 << 10); + + vxc_float4 tmp_dst; + vxc_uchar4 u8_dst; + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4); + + //convert U8 to dfp8 + int4 dst0; + vxc_uchar4 dst; + tmp_dst = tmp_dst * f32Var - grayMean; + tmp_dst = tmp_dst * outputScale + outputZP; + dst0 = convert_int4_rte(tmp_dst); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8); + + VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize.vx new file mode 100644 index 0000000..db424ad --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize.vx @@ -0,0 +1,248 @@ +#include "cl_viv_vx_ext.h" + +/**************************layernorm float16***********************************/ +_viv_uniform int width; +_viv_uniform float dimRatio; +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; + +__kernel void vxcLayerNorm( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t output, + float eps) +{ + int4 coord = (int4)(0, get_global_id(1), 0, 0); + vxc_short8 src0, src1; + vxc_float sum = 0, sqr = 0; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 
0)); + for(coord.x = 8; coord.x < (width+8); coord.x += 8) + { + vxc_half8 val0_h; + _viv_asm(COPY, val0_h, src0, 16); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + sum += sumsqr.x; + sqr += sumsqr.y; + } + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 bias_f; + for(coord.x = 0; coord.x < width; coord.x += 4) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord.xwww); + vxc_half8 in_h, scale_h; + _viv_asm(COPY, in_h, src0, 16); + _viv_asm(COPY, scale_h, src1, 16); + vxc_float4 in_f, scale_f; + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + vxc_float4 sub, norm; + sub = in_f - mean; + norm = scale_f * vari * sub + bias_f; + half4 norm_h; + _viv_asm(CONV, norm_h, norm); + vxc_half8 dst; + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniExtractHalf4_dp4x4); + vxc_short8 dstval; + _viv_asm(COPY, dstval, dst, 16); + VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + } +} +/*****************************layernorm uint8 to uint8****************************/ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform float outputScale; +_viv_uniform int output_ZP; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform int tmpZp2; +_viv_uniform float e2InScale; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +__kernel void vxcLayerNorm_u8( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t output, + float eps) +{ + int4 coord = (int4)(0, get_global_id(1), 0, 0); + vxc_uchar16 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float sum = 0, sqr = 0; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + int tmpSum = 0, tmpSqr = 0; + vxc_int4 tmpSum1; + vxc_int4 tmpSqr1; + short zp = inputZP; + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1.x); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); + } + sum = (tmpSum + sumInZp) * input_scale; + sqr = (tmpSqr + tmpZp2) * e2InScale; + + float mean, vari; + mean = sum * dimRatio; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + int4 coord_bias = 
(int4)(0, 0, 0, 0); + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + coord_bias.x = coord.x; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + tmpData0 *= input_scale; + tmpData1 *= input_scale; + tmpData2 *= input_scale; + tmpData3 *= input_scale; + + vxc_float4 norm; + tmpData0 -= mean; + norm = scale_f0 * vari * tmpData0 + bias_f0; + bias_f0 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + coord_bias.x += 4; + tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP); + + tmpData1 -= mean; + norm = scale_f1 * vari * tmpData1 + bias_f1; + bias_f1 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + + tmpData2 -= mean; + norm = scale_f0 * vari * tmpData2 + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP); + + tmpData3 -= mean; + norm = scale_f1 * vari * tmpData3 + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + } +} +/***************************layernorm float16 to uint8**************************/ +_viv_uniform float outputZP; +__kernel void vxcLayerNormFP16toU8( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t output, + float eps) +{ + int4 coord = (int4)(0, get_global_id(1), 0, 0); + vxc_short8 src0, src1; + vxc_float sum = 0, sqr = 0; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + for(coord.x = 8; coord.x < (width+8); coord.x += 8) + { + vxc_half8 val0_h; + _viv_asm(COPY, val0_h, src0, 16); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + sum += sumsqr.x; + sqr += sumsqr.y; + } + vxc_float mean; + mean 
= sum * dimRatio; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 bias_f; + for(coord.x = 0; coord.x < width; coord.x += 4) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord.xwww); + vxc_half8 in_h, scale_h; + _viv_asm(COPY, in_h, src0, 16); + _viv_asm(COPY, scale_h, src1, 16); + vxc_float4 in_f, scale_f; + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + vxc_float4 sub, norm; + sub = in_f - mean; + norm = scale_f * vari * sub + bias_f; + norm = norm * outputScale + outputZP; + int4 output_int4; + output_int4 = convert_int4_rte(norm); + vxc_uchar8 dst; + VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize_U8.vx new file mode 100644 index 0000000..118764e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_layernormalize_U8.vx @@ -0,0 +1,129 @@ +#include "cl_viv_vx_ext.h" + +/*****************************layernorm uint8 to fp16****************************/ +_viv_uniform int width; +_viv_uniform float dimRatio; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform int tmpZp2; +_viv_uniform float e2InScale; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits UniPackFP16even_2x8; + +__kernel void vxcLayerNormU8toFp16( + image2d_array_t input, + image2d_array_t bias, + image2d_array_t scale, + image2d_array_t output, + float eps) +{ + int4 coord = (int4)(0, get_global_id(1), 0, 0); + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0; + vxc_int4 tmpSum1; + vxc_int4 tmpSqr1; + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1.x); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); + } + sum = (tmpSum + sumInZp) * input_scale; + sqr = (tmpSqr + tmpZp2) * e2InScale; + + float mean, vari; + mean = sum * dimRatio; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + int4 coord_bias = (int4)(0, 0, 0, 0); + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_short8 src1, outval; + short zp = inputZP; + 
half4 tmpVal0, tmpVal1; + vxc_half8 dst; + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + coord_bias.x = coord.x; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + tmpData0 *= input_scale; + tmpData1 *= input_scale; + tmpData2 *= input_scale; + tmpData3 *= input_scale; + + vxc_float4 norm; + tmpData0 -= mean; + norm = scale_f0 * vari * tmpData0 + bias_f0; + bias_f0 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + coord_bias.x += 4; + _viv_asm(CONV, tmpVal0, norm); + + tmpData1 -= mean; + norm = scale_f1 * vari * tmpData1 + bias_f1; + bias_f1 = read_imagef(bias, coord_bias); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + UniPackFP16even_2x8); + _viv_asm(COPY, outval, dst, 16); + int2 coord_out = (int2)(coord.x, coord.y); + VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + tmpData2 -= mean; + norm = scale_f0 * vari * tmpData2 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + + tmpData3 -= mean; + norm = scale_f1 * vari * tmpData3 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + UniPackFP16even_2x8); + _viv_asm(COPY, outval, dst, 16); + coord_out.x += 8; + VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_resize.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_resize.vx new file mode 100644 index 0000000..8175ced --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_resize.vx @@ -0,0 +1,38 @@ +#include "cl_viv_vx_ext.h" + +//--------------------------resize------------------------- +_viv_uniform VXC_512Bits uniPackEvenData_2x8; +__kernel void resize_16bits_downsample_quarter + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + vxc_short8 src0, src1; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, 
coord.xy, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + coord = coord >> 1; + VXC_DP2x8(src0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardInf, 0), uniPackEvenData_2x8); + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_8bits_downsample_quarter + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + vxc_char16 src0; + vxc_char8 dst; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord = coord >> 1; + dst = src0.s02468ace; + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_roi_align.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_roi_align.vx new file mode 100644 index 0000000..90804a3 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_roi_align.vx @@ -0,0 +1,8 @@ +#include "cl_viv_vx_ext.h" + +__kernel void vxcRoi_align( + __read_only image2d_array_t input, + __write_only image2d_array_t output) +{ + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_scale.vx new file mode 100644 index 0000000..3c9551d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_scale.vx @@ -0,0 +1,49 @@ +#include "cl_viv_vx_ext.h" + +//--------------------------scale------------------------- +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; +_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Lo_4x4; +_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Hi_4x4; +__kernel void scale_fp16 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t weights, + __read_only image2d_array_t biases, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + vxc_short8 vec0, vec1; + vxc_half8 src0; + vxc_half8 w0; + vxc_float4 b0, b1; + vxc_float4 dst0, dst1; + VXC_ReadImage(vec0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage(vec1, weights, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, w0, vec1, 16); + + coord.z = coord.x + 4; + + b0 = read_imagef(biases, coord.xwww); + b1 = read_imagef(biases, coord.zwww); + + VXC_DP4x4(dst0, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniFp16MulFp16ToFp32_Lo_4x4); + VXC_DP4x4(dst1, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniFp16MulFp16ToFp32_Hi_4x4); + dst0 += b0; + dst1 += b1; + + half4 t0, t1; + + _viv_asm(CONV, t0, dst0); + _viv_asm(CONV, t1, dst1); + + VXC_DP2x8(w0, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); + _viv_asm(COPY, vec0, w0, 16); + + VXC_WriteImage(output, coord.xy, vec0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel.vx new file mode 100644 index 0000000..9800aa8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel.vx @@ -0,0 +1,67 @@ +#include "cl_viv_vx_ext.h" + +/******************shuffle channel float16/int16********************/ +_viv_uniform int group_column; +_viv_uniform float rgroup_column; + +__kernel void shuffleChannelVXC( + 
image2d_array_t input, + image2d_array_t output, + int group_number, + int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_short8 src0, src1, src2, src3; + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + int coordz = coord.z; + int index_col = coordz * rgroup_column; + int index_row = coordz - index_col * group_column; + coord.z = index_row * group_number + index_col; + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y ++; + VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y ++; + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y ++; + VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +/*****************shuffle channel int8/uint8****************************/ + +__kernel void shuffleChannel8BitsVXC( + image2d_array_t input, + image2d_array_t output, + int group_number, + int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_char16 src0, src1, src2, src3; + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int coordz = coord.z; + int index_col = coordz * rgroup_column; + int index_row = coordz - index_col * group_column; + coord.z = index_row * group_number + index_col; + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord.y ++; + VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord.y ++; + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord.y ++; + VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel_axis1.vx new file mode 100644 index 0000000..a4e0fff --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_shufflechannel_axis1.vx @@ -0,0 +1,65 @@ +#include "cl_viv_vx_ext.h" + +/******************shuffle channel float16/int16********************/ +_viv_uniform int group_column; +_viv_uniform float rgroup_column; + +__kernel void shuffleChannel16Bits_Axis1( + image2d_array_t input, + image2d_array_t output, + int group_number, + int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + vxc_short8 src0, src1, src2, src3; + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + VXC_ReadImage2DArray(src1, 
input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + int coordy = coord.y; + int index_col = coordy * rgroup_column; + int index_row = coordy - index_col * group_column; + coord_out.y = index_row * group_number + index_col; + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord_out.x += 8; + VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord_out.x += 8; + VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord_out.x += 8; + VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +/*****************shuffle channel int8/uint8****************************/ + +__kernel void shuffleChannel8Bits_Axis1( + image2d_array_t input, + image2d_array_t output, + int group_number, + int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + vxc_char16 src0, src1; + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.x += 16; + VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int coordy = coord.y; + int index_col = coordy * rgroup_column; + int index_row = coordy - index_col * group_column; + coord_out.y = index_row * group_number + index_col; + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.x += 16; + VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_signalframe.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_signalframe.vx new file mode 100644 index 0000000..f055ad7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_signalframe.vx @@ -0,0 +1,278 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int input_width; +_viv_uniform int input_height; +_viv_uniform int input_channel; +_viv_uniform int output_channel; + + +__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_width( + image2d_array_t input, + image2d_array_t output, + int frame_length, + int step, + int pad_end, + int pad, + int axis) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int outChn = gidz * input_height + gidy; + int4 coord = (int4)(0, gidy, gidz, 0); + int4 coord_out = (int4)(0, 0, outChn, 0); + + int endcoord = (pad_end == 0) ? 
(input_width - frame_length + 1) : (input_width); + int iter = frame_length / 8; + int res = frame_length % 8; + vxc_short8 src0; + + for(int i = 0; i < endcoord; i += step) + { + coord.x = i; + for(int j = 0; j < iter; j++) + { + coord_out.x = j << 3; + coord.x = i + (j << 3); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + coord.x = i + (iter << 3); + coord_out.x = (iter << 3); + for(int j = 0; j < res; j++) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_out.x++; + coord.x++; + } + + coord_out.y++; + } +} + +__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_height( + image2d_array_t input, + image2d_array_t output, + int frame_length, + int step, + int pad_end, + int pad, + int axis) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int outChn = gidz * output_channel + (gidy / step); + int4 coord = (int4)(gidx, gidy, gidz, 0); + int4 coord_out = (int4)(gidx, 0, outChn, 0); + vxc_short8 src0; + + for(int i = 0; i < frame_length; i++) + { + coord.y = gidy + i; + coord_out.y = i; + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_channel( + image2d_array_t input, + image2d_array_t output, + int frame_length, + int step, + int pad_end, + int pad, + int axis) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int outChn = (gidz / step) * frame_length; + int4 coord = (int4)(gidx, gidy, gidz, 0); + int4 coord_out = (int4)(gidx, gidy, outChn, 0); + vxc_short8 src0; + + for(int i = 0; i < frame_length; i++) + { + coord.z = gidz + i; + coord_out.z = outChn + i; + if(coord.z < input_channel) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + else + { + src0 = (vxc_short8)(0); + } + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_width_8bit( + image2d_array_t input, + image2d_array_t output, + int frame_length, + int step, + int pad_end, + int pad, + int axis) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int outChn = gidz * input_height + gidy; + int4 coord = (int4)(0, gidy, gidz, 0); + int4 coord_out = (int4)(0, 0, outChn, 0); + + int endcoord = (pad_end == 0) ? 
(input_width - frame_length + 1) : (input_width); + int iter = frame_length / 8; + int res = frame_length % 8; + vxc_char8 src0; + + for(int i = 0; i < endcoord; i += step) + { + coord.x = i; + for(int j = 0; j < iter; j++) + { + coord_out.x = j << 3; + coord.x = i + (j << 3); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + coord.x = i + (iter << 3); + coord_out.x = (iter << 3); + for(int j = 0; j < res; j++) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_out.x++; + coord.x++; + } + + coord_out.y++; + } +} + +__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_height_8bit( + image2d_array_t input, + image2d_array_t output, + int frame_length, + int step, + int pad_end, + int pad, + int axis) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int outChn = gidz * output_channel + (gidy / step); + int4 coord = (int4)(gidx, gidy, gidz, 0); + int4 coord_out = (int4)(gidx, 0, outChn, 0); + vxc_char8 src0; + + for(int i = 0; i < frame_length; i++) + { + coord.y = gidy + i; + coord_out.y = i; + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_channel_8bit( + image2d_array_t input, + image2d_array_t output, + int frame_length, + int step, + int pad_end, + int pad, + int axis) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int outChn = (gidz / step) * frame_length; + int4 coord = (int4)(gidx, gidy, gidz, 0); + int4 coord_out = (int4)(gidx, gidy, outChn, 0); + vxc_char8 src0; + + for(int i = 0; i < frame_length; i++) + { + coord.z = gidz + i; + coord_out.z = outChn + i; + if(coord.z < input_channel) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + else + { + src0 = (vxc_char8)(0); + } + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +#if 0 +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void vxcSignalFrame_tensor( + image2d_array_t input, + image2d_array_t output, + image2d_array_t frame_length, + image2d_array_t steps, + image2d_array_t pad_end, + image2d_array_t pad, + image2d_array_t axis) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int outChn = gidz * input_height + gidy; + int4 coord = (int4)(0, gidy, gidz, 0); + int4 coord_out = (int4)(0, 0, outChn, 0); + int4 coord_para = (int4)(0, 0, 0, 0); + + int4 size = read_imagei(frame_length, coord_para); + int4 step = read_imagei(steps, coord_para); + int4 pe = read_imagei(pad_end, coord_para); + int4 pd = read_imagei(pad, coord_para); + int len = input_width + (pe.x ? 
pd : 0); + int endcoord = len - size.x + 1; + int iter = size.x / 8; + int res = size.x % 8; + vxc_short8 src0; + + for(int i = 0; i < endcoord; i += step.x) + { + coord.x = i; + for(int j = 0; j < iter; j++) + { + coord_out.x = j << 3; + coord.x += (j << 3); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + coord.x = i + (iter << 3); + coord_out.x = (iter << 3); + for(int j = 0; j < res; j++) + { + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_out.x++; + coord.x++; + } + + coord_out.y++; + } +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_space2depth.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_space2depth.vx new file mode 100644 index 0000000..01957b0 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_space2depth.vx @@ -0,0 +1,41 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniExtractEvenFp16Stride2_4x4; +_viv_uniform VXC_512Bits uniExtractOddFp16Stride2_4x4; +_viv_uniform int input_depth; + +__kernel void vxcReorg2_fp16_fp16_sx2_sy1 + ( + image2d_array_t input, + image2d_array_t output, + int stridex, + int stridey + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + + int4 coord = (int4)(gidx, gidy, gidz, 0); + int4 coord_out = (int4)(gidx >> 1, gidy, 0, 0); + int out_d0, out_d1; + vxc_short8 imageData; + vxc_short8 imgVal0, imgVal1; + //int tmpw = gidz / input_depth; \n\ + //int tmpz = gidz % input_depth; \n\ + + VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(imgVal0, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniExtractEvenFp16Stride2_4x4); + VXC_DP4x4(imgVal1, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniExtractOddFp16Stride2_4x4); + + out_d0 = gidz * 2 * 1; + out_d1 = out_d0 + 1; + + coord_out.z = out_d0; + VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.z = out_d1; + VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_tensorstackconcat.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_tensorstackconcat.vx new file mode 100644 index 0000000..6d1eb8f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_tensorstackconcat.vx @@ -0,0 +1,42 @@ +#include "cl_viv_vx_ext.h" + +/*******************tensorstackconcat 16BITs********************/ +__kernel void vxcTensorStackConcat( + image2d_array_t input, + image2d_t index, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_short8 src0, src1; + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(8, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.w = 0; + coord.y = read_imagei(index, coord.ww).x; + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + VXC_WriteImage2DArray(output, coord, src1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +/**************tensorstackconcat 8BITs***************************/ +__kernel void vxcTensorStackConcat8Bits( + image2d_array_t input, + image2d_t index, + image2d_array_t output) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int idx = coord.x; + vxc_char16 src0, src1; + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.x += 16; + VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.x = idx; + coord.w = 0; + coord.y = read_imagei(index, coord.ww).x; + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.x += 16; + VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_topk.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_topk.vx new file mode 100644 index 0000000..fdacd41 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_topk.vx @@ -0,0 +1,8 @@ +#include "cl_viv_vx_ext.h" + +__kernel void vxcTopk( + __read_only image2d_array_t input, + __write_only image2d_array_t output) +{ + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_gemm.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_gemm.vx new file mode 100644 index 0000000..7a146d1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_gemm.vx @@ -0,0 +1,39 @@ +/* + ============================================================================ + Name : gemm.vx + Author : Sam + Version : + Copyright : Your copyright notice + Description : + ============================================================================ + */ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniGemm3x3_4x4; +__kernel void vxcTransform_Gemm_F16toF16 + ( + __read_only image2d_array_t thetaTensor, + __read_only image2d_array_t gridTensor, + __write_only image2d_array_t coordinates + ) +{ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); + + vxc_short8 vec0, vec1, vec2; + vxc_half8 src0, src1, src2, dst; + + VXC_ReadImage(vec0,thetaTensor,coord.xx,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,5, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 16); + VXC_ReadImage(vec1,gridTensor,coord.yz,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,5,0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 16); + VXC_ReadImage(vec2,gridTensor,coord.yz,VXC_5BITOFFSET_XY(6,0),VXC_MODIFIER(0,5,0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src2, vec2, 16); + + coord.y = (int)((short)coord.y / (short)3) * 2; + + VXC_DP4x4(dst, src1, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemm3x3_4x4); + VXC_DP4x4(dst, src2, src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniGemm3x3_4x4); + + _viv_asm(COPY, vec0, dst, 16); + VXC_WriteImage(coordinates, coord.yz, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_interp.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_interp.vx new file mode 100644 index 0000000..c149e6f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_interp.vx @@ -0,0 +1,125 @@ +/* + ============================================================================ + Name : minimum.vx + Author : Sam + Version : + 
Copyright : Your copyright notice + Description : + ============================================================================ + */ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniGetDXY_4x4; +_viv_uniform VXC_512Bits uniConvertF16toF32_4x4; +_viv_uniform int2 packedWH2; +_viv_uniform int packedWH; +__kernel void vxcTransform_InterP_F16toF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 vec0; + vxc_half8 pxy; + vxc_float4 dxy4; + vxc_int4 pos4; + short dst = 0; + + VXC_ReadImage(vec0, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, pxy, vec0, 4); + + coord.x >>= 1; + vxc_short2 packedWH_16B; + _viv_asm(COPY, packedWH_16B, packedWH, 4); + VXC_DP4x4(dxy4, pxy, packedWH_16B, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniGetDXY_4x4); + dxy4.zw = floor(dxy4.xy); + pos4.xy = convert_int2(dxy4.zw); + pos4.zw = convert_int2(ceil(dxy4.xy)); + + vxc_short8 vec1; + vxc_half8 src0, src1; + VXC_ReadImage(vec0, input0, pos4.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 8); + VXC_ReadImage(vec1, input0, pos4.xw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 8); + + float2 xyLerp = dxy4.xy - dxy4.zw; + float2 oneSub_xyLerp = 1.0f - xyLerp; + float4 coef = (float4)(oneSub_xyLerp.x * oneSub_xyLerp.y, xyLerp.x * oneSub_xyLerp.y, + oneSub_xyLerp.x * xyLerp.y, xyLerp.x * xyLerp.y); + float4 data; + + VXC_DP4x4(data, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16toF32_4x4); + + data.x = dot(data, coef); + + half tmp; + _viv_asm(CONV, tmp, data); + _viv_asm(COPY, dst, tmp, 4); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform int depth; +__kernel void vxcTransform_InterP_F16toF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + vxc_short8 vec0; + vxc_half8 pxy; + vxc_float4 dxy4; + vxc_int4 pos4; + short dst = 0; + + VXC_ReadImage(vec0, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, pxy, vec0, 4); + + coord.x >>= 1; + vxc_short2 packedWH_16B; + _viv_asm(COPY, packedWH_16B, packedWH, 4); + VXC_DP4x4(dxy4, pxy, packedWH_16B, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniGetDXY_4x4); + dxy4.zw = floor(dxy4.xy); + pos4.xy = convert_int2(dxy4.zw); + pos4.zw = convert_int2(ceil(dxy4.xy)); + + + float2 xyLerp = dxy4.xy - dxy4.zw; + float2 oneSub_xyLerp = 1.0f - xyLerp; + float4 coef = (float4)(oneSub_xyLerp.x * oneSub_xyLerp.y, xyLerp.x * oneSub_xyLerp.y, + oneSub_xyLerp.x * xyLerp.y, xyLerp.x * xyLerp.y); + + int4 coord_ = (int4)(pos4.x, pos4.y, 0, 0); + do + { + vxc_short8 vec1; + vxc_half8 src0, src1; + VXC_ReadImage2DArray(vec0,input0,coord_,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,1,0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src0, vec0, 8); + VXC_ReadImage2DArray(vec1,input0,coord_,VXC_5BITOFFSET_XY(0,1),VXC_MODIFIER(0,1,0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, src1, vec1, 8); + + coord_.z ++; + float4 data; + VXC_DP4x4(data, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16toF32_4x4); + + data.x = dot(data, coef); + + half tmp; + _viv_asm(CONV, tmp, data); + _viv_asm(COPY, dst, 
tmp, 4); + + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + coord.z ++; + + } while (coord.z < depth); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_setupThres.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_setupThres.vx new file mode 100644 index 0000000..31b1cec --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_transform_setupThres.vx @@ -0,0 +1,32 @@ +/* + ============================================================================ + Name : gemm.vx + Author : Sam + Version : + Copyright : Your copyright notice + Description : + ============================================================================ + */ +#include "cl_viv_vx_ext.h" + +_viv_uniform int4 extract_packed; +__kernel void vxcTransform_setupThres_F16toF16 + ( + __read_only image2d_array_t initTensor, + __read_only image2d_array_t inputFC, + global int* thresFlag, + __write_only image2d_array_t thres + ) +{ + int2 coord = (int2)(0, 0); + + vxc_ushort8 src0, src1, dst; + + int flag = *thresFlag; + VXC_ReadImage(src0, initTensor, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, inputFC, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + + VXC_BitExtract(dst, src0, src1, extract_packed, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage(thres, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c new file mode 100644 index 0000000..1d65b5e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -0,0 +1,46623 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +/* WARNING! 
AUTO-GENERATED, DO NOT MODIFY MANUALLY */ + +#include +#include +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vsi_nn_libnnext_resource.h" + + +static const char a_times_b_plus_c_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniA_Times_B_2x8;\n\ +_viv_uniform VXC_512Bits uniA_Plus_B_2x8;\n\ +__kernel void a_times_b_plus_c_F16_F16_F16toF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __read_only image2d_array_t input2,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_half8 src0, src1, src2, dst;\n\ + vxc_ushort8 vec0, vec1, vec2, result;\n\ +\n\ + VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ + VXC_ReadImage2DArray(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src2, vec2, 16);\n\ +\n\ + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Times_B_2x8);\n\ + VXC_DP2x8(dst, dst, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Plus_B_2x8);\n\ + _viv_asm(COPY, result, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void a_times_b_plus_c_F16_F16_F16toF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __read_only image2d_array_t input2,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_half8 src0, src1, src2, dst;\n\ + vxc_ushort8 vec0, vec1, vec2, result;\n\ +\n\ + VXC_ReadImage(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ + VXC_ReadImage(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src2, vec2, 16);\n\ +\n\ + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Times_B_2x8);\n\ + VXC_DP2x8(dst, dst, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Plus_B_2x8);\n\ + _viv_asm(COPY, result, dst, 16);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +"; /* end of a_times_b_plus_c_vx*/ + +static const char add_mean_std_norm_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/**************************Tensor add mean stddev norm float16*********************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform float rsEps;\n\ +_viv_uniform VXC_512Bits uniAddFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits uniAddFp16toFp32Lo_4x4;\n\ +_viv_uniform VXC_512Bits uniAddFp16toFp32Hi_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +// one group(16 threads) calculates one row\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void add_mean_std_norm_F16_F16toF16(\n\ + image2d_array_t input,\n\ + image2d_array_t input1,\n\ + image2d_array_t output,\n\ + float eps)\n\ +{\n\ + int lidx = get_local_id(0);\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, 
get_global_id(1));\n\ + vxc_short8 src0, src1, src2;\n\ + float pSum = 0, pSqr = 0;\n\ + float sum = 0, sqr = 0;\n\ + vxc_half8 in_h, in_h1, in_h2;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(; coord.x < width; coord.x += 128)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + _viv_asm(COPY, in_h1, src1, 16);\n\ + VXC_DP2x8(in_h2, in_h, in_h1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAddFp16_2x8);\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, in_h2, in_h2, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + pSum += sumsqr.x;\n\ + pSqr += sumsqr.y;\n\ + }\n\ +\n\ + lcl_sum[lidx] = pSum;\n\ + lcl_sqr[lidx] = pSqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0];\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + float4 data0;\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sum = dot(data0, one);\n\ + pLocalPtr = (float4 *)&lcl_sqr[0];\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sqr = dot(data0, one);\n\ +\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari, stddev_inv, rMeanStd;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + stddev_inv = (vari==0 ? rsEps : rsqrt(vari));\n\ + rMeanStd = (-mean) * stddev_inv;\n\ +\n\ + for(coord.x = gidx; coord.x < width; coord.x += 128)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + _viv_asm(COPY, in_h1, src1, 16);\n\ +\n\ + vxc_float4 in_f0, in_f1;\n\ + VXC_DP4x4(in_f0, in_h, in_h1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAddFp16toFp32Lo_4x4);\n\ + VXC_DP4x4(in_f1, in_h, in_h1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAddFp16toFp32Hi_4x4);\n\ +\n\ + vxc_float4 norm0, norm1;\n\ + half4 norm_h0, norm_h1;\n\ +\n\ + norm0 = in_f0 * stddev_inv + rMeanStd;\n\ + norm1 = in_f1 * stddev_inv + rMeanStd;\n\ + _viv_asm(CONV, norm_h0, norm0);\n\ + _viv_asm(CONV, norm_h1, norm1);\n\ +\n\ + VXC_DP2x8(src2, norm_h0, norm_h1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_1_Lo_2x8;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void add_mean_std_norm_U8_U8toF16(\n\ + image2d_array_t input,\n\ + image2d_array_t input1,\n\ + image2d_array_t output,\n\ + float eps)\n\ +{\n\ + int lidx = get_local_id(0);\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + vxc_uchar8 src0, src1;\n\ + vxc_short8 src2;\n\ + float pSum = 0, pSqr = 0;\n\ + float sum = 0, sqr = 0;\n\ + vxc_half8 in_h, in_h1, in_h2;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + vxc_ushort8 ms0, ms1;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, ms1, 
multAndoutZP1, 16);\n\ +\n\ + for(; coord.x < width; coord.x += 128)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(in_h, src0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ + VXC_DP2x8(in_h1, src1, ms1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniU8MulAndPostShift_1_Lo_2x8);\n\ + VXC_DP2x8(in_h2, in_h, in_h1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAddFp16_2x8);\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, in_h2, in_h2, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + pSum += sumsqr.x;\n\ + pSqr += sumsqr.y;\n\ + }\n\ +\n\ + lcl_sum[lidx] = pSum;\n\ + lcl_sqr[lidx] = pSqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0];\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + float4 data0;\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sum = dot(data0, one);\n\ + pLocalPtr = (float4 *)&lcl_sqr[0];\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sqr = dot(data0, one);\n\ +\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari, stddev_inv, rMeanStd;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + stddev_inv = (vari==0 ? rsEps : rsqrt(vari));\n\ + rMeanStd = (-mean) * stddev_inv;\n\ +\n\ + for(coord.x = gidx; coord.x < width; coord.x += 128)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(in_h, src0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ + VXC_DP2x8(in_h1, src1, ms1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniU8MulAndPostShift_1_Lo_2x8);\n\ +\n\ + vxc_float4 in_f0, in_f1;\n\ + VXC_DP4x4(in_f0, in_h, in_h1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAddFp16toFp32Lo_4x4);\n\ + VXC_DP4x4(in_f1, in_h, in_h1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAddFp16toFp32Hi_4x4);\n\ +\n\ + vxc_float4 norm0, norm1;\n\ + half4 norm_h0, norm_h1;\n\ +\n\ + norm0 = in_f0 * stddev_inv + rMeanStd;\n\ + norm1 = in_f1 * stddev_inv + rMeanStd;\n\ + _viv_asm(CONV, norm_h0, norm0);\n\ + _viv_asm(CONV, norm_h1, norm1);\n\ +\n\ + VXC_DP2x8(src2, norm_h0, norm_h1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt16ScaleToFp32Fst_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt16ScaleToFp32Sec_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ +_viv_uniform float inScale_i16;\n\ +_viv_uniform float inScale1_i16;\n\ +\n\ +// one group(16 threads) calculates one row\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void add_mean_std_norm_I16_I16toF16(\n\ + image2d_array_t input,\n\ + image2d_array_t input1,\n\ + image2d_array_t output,\n\ + float eps)\n\ +{\n\ + int lidx = get_local_id(0);\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + vxc_short8 src0, src1, src2;\n\ + float pSum = 0, pSqr = 0;\n\ + float sum = 0, sqr = 0;\n\ 
+\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + half scale_h, scale_h1;\n\ + _viv_asm(CONV, scale_h, inScale_i16);\n\ + _viv_asm(CONV, scale_h1, inScale1_i16);\n\ + float4 tmpVal0, tmpVal1, tmpVal2, tmpVal3;\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ +\n\ + for(; coord.x < width; coord.x += 128)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpVal0, src0, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16ScaleToFp32Fst_4x4);\n\ + VXC_DP4x4(tmpVal1, src0, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16ScaleToFp32Sec_4x4);\n\ + VXC_DP4x4(tmpVal2, src1, scale_h1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16ScaleToFp32Fst_4x4);\n\ + VXC_DP4x4(tmpVal3, src1, scale_h1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16ScaleToFp32Sec_4x4);\n\ + tmpVal0 += tmpVal2;\n\ + tmpVal1 += tmpVal3;\n\ +\n\ + vxc_float4 sumsqr;\n\ + sumsqr = tmpVal0 * tmpVal0 + tmpVal1 * tmpVal1; // sqr\n\ + tmpVal2 = tmpVal0 + tmpVal1; // pre sum\n\ +\n\ + pSum += dot(tmpVal2, one);\n\ + pSqr += dot(sumsqr, one);\n\ + }\n\ +\n\ + lcl_sum[lidx] = pSum;\n\ + lcl_sqr[lidx] = pSqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + float4 data0;\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0];\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sum = dot(data0, one);\n\ + pLocalPtr = (float4 *)&lcl_sqr[0];\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sqr = dot(data0, one);\n\ +\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari, stddev_inv, rMeanStd;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + stddev_inv = (vari==0 ? 
rsEps : rsqrt(vari));\n\ + rMeanStd = (-mean) * stddev_inv;\n\ +\n\ + for(coord.x = gidx; coord.x < width; coord.x += 128)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpVal0, src0, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16ScaleToFp32Fst_4x4);\n\ + VXC_DP4x4(tmpVal1, src0, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16ScaleToFp32Sec_4x4);\n\ + VXC_DP4x4(tmpVal2, src1, scale_h1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16ScaleToFp32Fst_4x4);\n\ + VXC_DP4x4(tmpVal3, src1, scale_h1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16ScaleToFp32Sec_4x4);\n\ + tmpVal0 += tmpVal2;\n\ + tmpVal1 += tmpVal3;\n\ +\n\ + vxc_float4 norm0, norm1;\n\ + half4 norm_h0, norm_h1;\n\ +\n\ + norm0 = tmpVal0 * stddev_inv + rMeanStd;\n\ + norm1 = tmpVal1 * stddev_inv + rMeanStd;\n\ + _viv_asm(CONV, norm_h0, norm0);\n\ + _viv_asm(CONV, norm_h1, norm1);\n\ +\n\ + VXC_DP2x8(src2, norm_h0, norm_h1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of add_mean_std_norm_vx*/ + +static const char argmax_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int4 packedArgIdx;\n\ +_viv_uniform int inputWidth;\n\ +_viv_uniform VXC_512Bits uniPackedIdxAddSat_2x8;\n\ +_viv_uniform VXC_512Bits uniSrcT2DstT_2x8;\n\ +\n\ +#define TENSOR_ARGMAX_AXIS0_8BITS(src_type_name, dst_type_name, src_type, \\\n\ + cond_type0, cond_type1, dst_type, cond_type) \\\n\ +__kernel void argmax_axis0_##src_type_name##to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); \\\n\ + src_type maxValue, maxVec, value; \\\n\ + dst_type packIdx, currIdx; \\\n\ + \\\n\ + VXC_ReadImage2DArray(maxVec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \\\n\ + coord.x += 8; \\\n\ + for (; coord.x < inputWidth; ) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(value, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + \\\n\ + currIdx = currIdx + 8; \\\n\ + dst_type condition; \\\n\ + cond_type0 src_condition0 = value > maxVec; \\\n\ + cond_type1 src_condition; \\\n\ + _viv_asm(COPY, src_condition, src_condition0, 8); \\\n\ + cond_type condition_tmp; \\\n\ + VXC_DP2x8(condition_tmp, src_condition, src_condition, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniSrcT2DstT_2x8); \\\n\ + _viv_asm(COPY, condition, condition_tmp, 16); \\\n\ + packIdx = condition ? 
currIdx : packIdx; \\\n\ + maxVec = max(maxVec, value); \\\n\ + } \\\n\ + \\\n\ + VXC_HorzMax3_Integer(maxValue, maxVec, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_HorzMax3_Integer(maxValue, maxValue.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + maxValue.s01234567 = maxValue.s00000000; \\\n\ + \\\n\ + cond_type1 _maxVal; \\\n\ + VXC_Clamp(_maxVal, maxVec, maxValue, maxValue, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _maxVal += 1; \\\n\ + \\\n\ + VXC_DP2x8(packIdx, packIdx, _maxVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniPackedIdxAddSat_2x8); \\\n\ + \\\n\ + VXC_HorzMin3_Integer(packIdx, packIdx, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_HorzMin3_Integer(packIdx, packIdx.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.yz, packIdx, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMAX_AXIS0_8BITS(I8, I16, vxc_char8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8)\n\ +TENSOR_ARGMAX_AXIS0_8BITS(I8, U8, vxc_char8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +TENSOR_ARGMAX_AXIS0_8BITS(U8, I16, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8)\n\ +TENSOR_ARGMAX_AXIS0_8BITS(U8, U8, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +TENSOR_ARGMAX_AXIS0_8BITS(I16, I16, vxc_short8, vxc_short8, vxc_ushort8, vxc_short8, vxc_ushort8)\n\ +TENSOR_ARGMAX_AXIS0_8BITS(I16, U8, vxc_short8, vxc_short8, vxc_ushort8, vxc_uchar8, vxc_uchar8)\n\ +\n\ +#define TENSOR_ARGMAX_AXIS0_8BITS_2D(src_type_name, dst_type_name, src_type, \\\n\ + cond_type0, cond_type1, dst_type, cond_type) \\\n\ +__kernel void argmax_axis0_##src_type_name##to##dst_type_name##_2D( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(0), 0, 0); \\\n\ + src_type maxValue, maxVec, value; \\\n\ + dst_type packIdx, currIdx; \\\n\ + \\\n\ + VXC_ReadImage(maxVec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \\\n\ + coord.x += 8; \\\n\ + for (; coord.x < inputWidth; ) \\\n\ + { \\\n\ + VXC_ReadImage(value, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + \\\n\ + currIdx = currIdx + 8; \\\n\ + dst_type condition; \\\n\ + cond_type0 src_condition0 = value > maxVec; \\\n\ + cond_type1 src_condition; \\\n\ + _viv_asm(COPY, src_condition, src_condition0, 8); \\\n\ + cond_type condition_tmp; \\\n\ + VXC_DP2x8(condition_tmp, src_condition, src_condition, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniSrcT2DstT_2x8); \\\n\ + _viv_asm(COPY, condition, condition_tmp, 16); \\\n\ + packIdx = condition ? 
currIdx : packIdx; \\\n\ + maxVec = max(maxVec, value); \\\n\ + } \\\n\ + \\\n\ + VXC_HorzMax3_Integer(maxValue, maxVec, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_HorzMax3_Integer(maxValue, maxValue.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + maxValue.s01234567 = maxValue.s00000000; \\\n\ + \\\n\ + cond_type1 _maxVal; \\\n\ + VXC_Clamp(_maxVal, maxVec, maxValue, maxValue, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _maxVal += 1; \\\n\ + \\\n\ + VXC_DP2x8(packIdx, packIdx, _maxVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniPackedIdxAddSat_2x8); \\\n\ + \\\n\ + VXC_HorzMin3_Integer(packIdx, packIdx, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_HorzMin3_Integer(packIdx, packIdx.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.yz, packIdx, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +TENSOR_ARGMAX_AXIS0_8BITS_2D(I8, I16, vxc_char8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8)\n\ +TENSOR_ARGMAX_AXIS0_8BITS_2D(I8, U8, vxc_char8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +TENSOR_ARGMAX_AXIS0_8BITS_2D(U8, I16, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8)\n\ +TENSOR_ARGMAX_AXIS0_8BITS_2D(U8, U8, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +TENSOR_ARGMAX_AXIS0_8BITS_2D(I16, I16, vxc_short8, vxc_short8, vxc_ushort8, vxc_short8, vxc_ushort8)\n\ +TENSOR_ARGMAX_AXIS0_8BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_ushort8, vxc_uchar8, vxc_uchar8)\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertHalf2Float32_4x4;\n\ +\n\ +#define TENSOR_ARGMAX_AXIS0_F16_2D(dst_type_name, dst_type) \\\n\ +__kernel void argmax_axis0_F16to##dst_type_name##_2D( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(0), 0, 0); \\\n\ + vxc_short8 vec0, vec1; \\\n\ + vxc_half8 src; \\\n\ + uint4 packIdx, currIdx; \\\n\ + float4 maxValue, value; \\\n\ + \\\n\ + VXC_ReadImage(vec0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec0, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \\\n\ + coord.x += 4; \\\n\ + VXC_DP4x4(maxValue, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \\\n\ + for (; coord.x < inputWidth; ) \\\n\ + { \\\n\ + VXC_ReadImage(vec1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec1, 16); \\\n\ + coord.x += 4; \\\n\ + \\\n\ + currIdx = currIdx + 4; \\\n\ + VXC_DP4x4(value, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \\\n\ + \\\n\ + int4 condition; \\\n\ + condition = value > maxValue; \\\n\ + \\\n\ + uint4 iCondition; \\\n\ + _viv_asm(COPY, iCondition, condition, 16); \\\n\ + packIdx = iCondition ? currIdx : packIdx; \\\n\ + maxValue = value > maxValue ? value : maxValue; \\\n\ + } \\\n\ + \\\n\ + float4 maxVec; \\\n\ + float2 maxVal2 = maxValue.xy > maxValue.zw ? maxValue.xy : maxValue.zw; \\\n\ + maxVec.x = maxVal2.x > maxVal2.y ? maxVal2.x : maxVal2.y; \\\n\ + int4 condition; \\\n\ + condition = maxVec.xxxx == maxValue; \\\n\ + uint4 iCondition; \\\n\ + _viv_asm(COPY, iCondition, condition, 16); \\\n\ + iCondition += 1; \\\n\ + \\\n\ + packIdx = mad_sat(iCondition, 0xFFFFFFFF, packIdx); \\\n\ + \\\n\ + uint2 val2 = packIdx.xy < packIdx.zw ? packIdx.xy : packIdx.zw; \\\n\ + val2.x = val2.x < val2.y ? 
val2.x : val2.y; \\\n\ + \\\n\ + dst_type dst; \\\n\ + _viv_asm(COPY, dst, val2, 4); \\\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMAX_AXIS0_F16_2D(I16, vxc_ushort8)\n\ +TENSOR_ARGMAX_AXIS0_F16_2D(U8, vxc_uchar8)\n\ +\n\ +\n\ +#define TENSOR_ARGMAX_AXIS0_F16(dst_type_name, dst_type) \\\n\ +__kernel void argmax_axis0_F16to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); \\\n\ + vxc_short8 vec0, vec1; \\\n\ + vxc_half8 src; \\\n\ + uint4 packIdx, currIdx; \\\n\ + float4 maxValue, value; \\\n\ + \\\n\ + VXC_ReadImage2DArray(vec0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec0, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \\\n\ + coord.x += 4; \\\n\ + VXC_DP4x4(maxValue, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \\\n\ + for (; coord.x < inputWidth; ) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(vec1, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec1, 16); \\\n\ + coord.x += 4; \\\n\ + \\\n\ + currIdx = currIdx + 4; \\\n\ + VXC_DP4x4(value, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \\\n\ + \\\n\ + int4 condition; \\\n\ + condition = value > maxValue; \\\n\ + \\\n\ + uint4 iCondition; \\\n\ + _viv_asm(COPY, iCondition, condition, 16); \\\n\ + packIdx = iCondition ? currIdx : packIdx; \\\n\ + maxValue = value > maxValue ? value : maxValue; \\\n\ + } \\\n\ + \\\n\ + float4 maxVec; \\\n\ + float2 maxVal2 = maxValue.xy > maxValue.zw ? maxValue.xy : maxValue.zw; \\\n\ + maxVec.x = maxVal2.x > maxVal2.y ? maxVal2.x : maxVal2.y; \\\n\ + int4 condition; \\\n\ + condition = maxVec.xxxx == maxValue; \\\n\ + uint4 iCondition; \\\n\ + _viv_asm(COPY, iCondition, condition, 16); \\\n\ + iCondition += 1; \\\n\ + \\\n\ + packIdx = mad_sat(iCondition, 0xFFFFFFFF, packIdx); \\\n\ + \\\n\ + uint2 val2 = packIdx.xy < packIdx.zw ? packIdx.xy : packIdx.zw; \\\n\ + val2.x = val2.x < val2.y ? 
val2.x : val2.y; \\\n\ + \\\n\ + dst_type dst; \\\n\ + _viv_asm(COPY, dst, val2, 4); \\\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMAX_AXIS0_F16(I16, vxc_ushort8)\n\ +TENSOR_ARGMAX_AXIS0_F16(U8, vxc_uchar8)\n\ +"; /* end of argmax_axis0_vx*/ + +static const char argmax_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int4 packedArgIdx;\n\ +_viv_uniform int argLenSub1;\n\ +_viv_uniform VXC_512Bits uniExtractData_2x8;\n\ +\n\ +#define TENSOR_ARGMAX_AXIS1_16BITS(src_type_name, dst_type_name, src_type,\\\n\ + copy_type, axis_type, dst_type, inst_type) \\\n\ + __kernel void argmax_axis1_##src_type_name##to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), argLenSub1, get_global_id(1), 0); \\\n\ + copy_type vec; \\\n\ + src_type src; \\\n\ + src_type maxVal; \\\n\ + copy_type max; \\\n\ + VXC_ReadImage2DArray(max, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, maxVal, max, 16); \\\n\ + axis_type axis; \\\n\ + axis_type packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.y --; \\\n\ + for (;coord.y >= 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(vec, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + coord.y --; \\\n\ + packIdx --; \\\n\ + VXC_VertMax3_##inst_type(maxVal, maxVal, maxVal, src, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, max, maxVal, 16); \\\n\ + axis = (max == vec) ? packIdx : axis; \\\n\ + } \\\n\ + \\\n\ + dst_type dst_axis; \\\n\ + VXC_DP2x8(dst_axis, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractData_2x8); \\\n\ + VXC_WriteImage(output, coord.xz, dst_axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMAX_AXIS1_16BITS(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half)\n\ +TENSOR_ARGMAX_AXIS1_16BITS(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half)\n\ +TENSOR_ARGMAX_AXIS1_16BITS(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer)\n\ +TENSOR_ARGMAX_AXIS1_16BITS(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer)\n\ +\n\ +#define TENSOR_ARGMAX_AXIS1_16BITS_2D(src_type_name, dst_type_name, src_type,\\\n\ + copy_type, axis_type, dst_type, inst_type) \\\n\ +__kernel void argmax_axis1_##src_type_name##to##dst_type_name##_2D( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), argLenSub1, 0, 0); \\\n\ + copy_type vec; \\\n\ + src_type src; \\\n\ + src_type maxVal; \\\n\ + copy_type max; \\\n\ + VXC_ReadImage(max, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, maxVal, max, 16); \\\n\ + axis_type axis; \\\n\ + axis_type packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.y --; \\\n\ + for (;coord.y >= 0;) \\\n\ + { \\\n\ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + coord.y --; \\\n\ + packIdx --; \\\n\ + VXC_VertMax3_##inst_type(maxVal, maxVal, maxVal, src, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, max, maxVal, 16); \\\n\ + axis = (max == 
vec) ? packIdx : axis; \\\n\ + } \\\n\ + \\\n\ + dst_type dst_axis; \\\n\ + VXC_DP2x8(dst_axis, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractData_2x8); \\\n\ + VXC_WriteImage(output, coord.xz, dst_axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMAX_AXIS1_16BITS_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half)\n\ +TENSOR_ARGMAX_AXIS1_16BITS_2D(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half)\n\ +TENSOR_ARGMAX_AXIS1_16BITS_2D(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer)\n\ +TENSOR_ARGMAX_AXIS1_16BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer)\n\ +\n\ +#define TENSOR_ARGMAX_AXIS1_8BITS(src_type_name, dst_type_name, src_type, dst_type) \\\n\ +__kernel void argmax_axis1_##src_type_name##to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), argLenSub1, get_global_id(1), 0); \\\n\ + src_type src; \\\n\ + src_type maxVal; \\\n\ + VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + dst_type axis; \\\n\ + dst_type packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.y --; \\\n\ + for (;coord.y >= 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y --; \\\n\ + packIdx --; \\\n\ + maxVal = max(maxVal, src); \\\n\ + dst_type condition; \\\n\ + VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axis = condition ? packIdx : axis; \\\n\ + } \\\n\ + \\\n\ + VXC_WriteImage(output, coord.xz, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMAX_AXIS1_8BITS(I8, I16, vxc_char16, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS1_8BITS(I8, U8, vxc_char16, vxc_uchar16)\n\ +TENSOR_ARGMAX_AXIS1_8BITS(U8, I16, vxc_uchar16, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS1_8BITS(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +\n\ +#define TENSOR_ARGMAX_AXIS1_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \\\n\ +__kernel void argmax_axis1_##src_type_name##to##dst_type_name##_2D( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), argLenSub1, 0, 0); \\\n\ + src_type src; \\\n\ + src_type maxVal; \\\n\ + VXC_ReadImage(maxVal, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + dst_type axis; \\\n\ + dst_type packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.y --; \\\n\ + for (;coord.y >= 0;) \\\n\ + { \\\n\ + VXC_ReadImage(src, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y --; \\\n\ + packIdx --; \\\n\ + maxVal = max(maxVal, src); \\\n\ + dst_type condition; \\\n\ + VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axis = condition ? 
packIdx : axis; \\\n\ + } \\\n\ + \\\n\ + VXC_WriteImage(output, coord.xz, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMAX_AXIS1_8BITS_2D(I8, I16, vxc_char16, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS1_8BITS_2D(I8, U8, vxc_char16, vxc_uchar16)\n\ +TENSOR_ARGMAX_AXIS1_8BITS_2D(U8, I16, vxc_uchar16, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS1_8BITS_2D(U8, U8, vxc_uchar16, vxc_uchar16)"; /* end of argmax_axis1_vx*/ + +static const char argmax_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int4 packedArgIdx;\n\ +_viv_uniform int argLenSub1;\n\ +_viv_uniform VXC_512Bits uniExtractData_2x8;\n\ +\n\ +#define TENSOR_ARGMAX_AXIS2_16BITS(src_type_name, dst_type_name,\\\n\ + src_type, copy_type, axis_type, dst_type, inst_type) \\\n\ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \\\n\ + copy_type vec; \\\n\ + src_type src; \\\n\ + src_type maxVal; \\\n\ + copy_type max; \\\n\ + VXC_ReadImage2DArray(max, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, maxVal, max, 16); \\\n\ + axis_type axis; \\\n\ + axis_type packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.z --; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(vec, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + coord.z --; \\\n\ + packIdx --; \\\n\ + VXC_VertMax3_##inst_type(maxVal, maxVal, maxVal, src, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, max, maxVal, 16); \\\n\ + axis = (max == vec) ? 
packIdx : axis; \\\n\ + } while (coord.z >= 0); \\\n\ + \\\n\ + dst_type dst_axis; \\\n\ + VXC_DP2x8(dst_axis, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractData_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst_axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMAX_AXIS2_16BITS(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half)\n\ +TENSOR_ARGMAX_AXIS2_16BITS(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half)\n\ +TENSOR_ARGMAX_AXIS2_16BITS(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer)\n\ +TENSOR_ARGMAX_AXIS2_16BITS(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer)\n\ +\n\ +#define TENSOR_ARGMAX_AXIS2_16BITS_2D(src_type_name, dst_type_name, src_type, \\\n\ + copy_type, axis_type, dst_type, inst_type) \\\n\ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMAX_AXIS2_16BITS_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half)\n\ +TENSOR_ARGMAX_AXIS2_16BITS_2D(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half)\n\ +TENSOR_ARGMAX_AXIS2_16BITS_2D(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer)\n\ +TENSOR_ARGMAX_AXIS2_16BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer)\n\ +\n\ +\n\ +#define TENSOR_ARGMAX_AXIS2_8BITS(src_type_name, dst_type_name, src_type, dst_type) \\\n\ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \\\n\ + src_type src; \\\n\ + src_type maxVal; \\\n\ + VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + dst_type axis; \\\n\ + dst_type packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.z --; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z --; \\\n\ + packIdx --; \\\n\ + maxVal = max(maxVal, src); \\\n\ + dst_type condition; \\\n\ + VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axis = condition ? 
packIdx : axis; \\\n\ + } while (coord.z >= 0); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMAX_AXIS2_8BITS(I8, I16, vxc_char8, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS2_8BITS(I8, U8, vxc_char8, vxc_uchar8)\n\ +TENSOR_ARGMAX_AXIS2_8BITS(U8, I16, vxc_uchar8, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS2_8BITS(U8, U8, vxc_uchar8, vxc_uchar8)\n\ +\n\ +#define TENSOR_ARGMAX_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \\\n\ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8)\n\ +TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8)\n\ +"; /* end of argmax_axis2_vx*/ + +static const char argmin_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int4 packedArgIdx;\n\ +_viv_uniform int inputWidth;\n\ +_viv_uniform VXC_512Bits uniPackedIdxAddSat_2x8;\n\ +_viv_uniform VXC_512Bits uniSrcT2DstT_2x8;\n\ +\n\ +#define TENSOR_ARGMIN_AXIS0_8BITS(src_type_name, dst_type_name, src_type, \\\n\ + cond_type0, cond_type1, dst_type, cond_type) \\\n\ +__kernel void argmin_axis0_##src_type_name##to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); \\\n\ + src_type minValue, minVec, value; \\\n\ + dst_type packIdx, currIdx; \\\n\ + \\\n\ + VXC_ReadImage2DArray(minVec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \\\n\ + coord.x += 8; \\\n\ + for (; coord.x < inputWidth; ) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(value, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + \\\n\ + currIdx = currIdx + 8; \\\n\ + dst_type condition; \\\n\ + cond_type0 src_condition0 = value < minVec; \\\n\ + cond_type1 src_condition; \\\n\ + _viv_asm(COPY, src_condition, src_condition0, 8); \\\n\ + cond_type condition_tmp; \\\n\ + VXC_DP2x8(condition_tmp, src_condition, src_condition, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniSrcT2DstT_2x8); \\\n\ + _viv_asm(COPY, condition, condition_tmp, 16); \\\n\ + packIdx = condition ? 
currIdx : packIdx; \\\n\ + minVec = min(minVec, value); \\\n\ + } \\\n\ + \\\n\ + VXC_HorzMin3_Integer(minValue, minVec, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_HorzMin3_Integer(minValue, minValue.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + minValue.s01234567 = minValue.s00000000; \\\n\ + \\\n\ + cond_type1 _minVal; \\\n\ + VXC_Clamp(_minVal, minVec, minValue, minValue, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _minVal += 1; \\\n\ + \\\n\ + VXC_DP2x8(packIdx, packIdx, _minVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniPackedIdxAddSat_2x8); \\\n\ + \\\n\ + VXC_HorzMin3_Integer(packIdx, packIdx, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_HorzMin3_Integer(packIdx, packIdx.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.yz, packIdx, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMIN_AXIS0_8BITS(I8, I16, vxc_char8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8)\n\ +TENSOR_ARGMIN_AXIS0_8BITS(I8, U8, vxc_char8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +TENSOR_ARGMIN_AXIS0_8BITS(U8, I16, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8)\n\ +TENSOR_ARGMIN_AXIS0_8BITS(U8, U8, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +TENSOR_ARGMIN_AXIS0_8BITS(I16, I16, vxc_short8, vxc_short8, vxc_ushort8, vxc_short8, vxc_ushort8)\n\ +TENSOR_ARGMIN_AXIS0_8BITS(I16, U8, vxc_short8, vxc_short8, vxc_ushort8, vxc_uchar8, vxc_uchar8)\n\ +\n\ +#define TENSOR_ARGMIN_AXIS0_8BITS_2D(src_type_name, dst_type_name, src_type, \\\n\ + cond_type0, cond_type1, dst_type, cond_type) \\\n\ +__kernel void argmin_axis0_##src_type_name##to##dst_type_name##_2D( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(0), 0, 0); \\\n\ + src_type minValue, minVec, value; \\\n\ + dst_type packIdx, currIdx; \\\n\ + \\\n\ + VXC_ReadImage(minVec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \\\n\ + coord.x += 8; \\\n\ + for (; coord.x < inputWidth; ) \\\n\ + { \\\n\ + VXC_ReadImage(value, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + \\\n\ + currIdx = currIdx + 8; \\\n\ + dst_type condition; \\\n\ + cond_type0 src_condition0 = value < minVec; \\\n\ + cond_type1 src_condition; \\\n\ + _viv_asm(COPY, src_condition, src_condition0, 8); \\\n\ + cond_type condition_tmp; \\\n\ + VXC_DP2x8(condition_tmp, src_condition, src_condition, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniSrcT2DstT_2x8); \\\n\ + _viv_asm(COPY, condition, condition_tmp, 16); \\\n\ + packIdx = condition ? 
currIdx : packIdx; \\\n\ + minVec = min(minVec, value); \\\n\ + } \\\n\ + \\\n\ + VXC_HorzMin3_Integer(minValue, minVec, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_HorzMin3_Integer(minValue, minValue.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + minValue.s01234567 = minValue.s00000000; \\\n\ + \\\n\ + cond_type1 _minVal; \\\n\ + VXC_Clamp(_minVal, minVec, minValue, minValue, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _minVal += 1; \\\n\ + \\\n\ + VXC_DP2x8(packIdx, packIdx, _minVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniPackedIdxAddSat_2x8); \\\n\ + \\\n\ + VXC_HorzMin3_Integer(packIdx, packIdx, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_HorzMin3_Integer(packIdx, packIdx.s035, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.yz, packIdx, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +TENSOR_ARGMIN_AXIS0_8BITS_2D(I8, I16, vxc_char8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8)\n\ +TENSOR_ARGMIN_AXIS0_8BITS_2D(I8, U8, vxc_char8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +TENSOR_ARGMIN_AXIS0_8BITS_2D(U8, I16, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_short8, vxc_ushort8)\n\ +TENSOR_ARGMIN_AXIS0_8BITS_2D(U8, U8, vxc_uchar8, vxc_char8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +TENSOR_ARGMIN_AXIS0_8BITS_2D(I16, I16, vxc_short8, vxc_short8, vxc_ushort8, vxc_short8, vxc_ushort8)\n\ +TENSOR_ARGMIN_AXIS0_8BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_ushort8, vxc_uchar8, vxc_uchar8)\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertHalf2Float32_4x4;\n\ +\n\ +#define TENSOR_ARGMIN_AXIS0_F16_2D(dst_type_name, dst_type) \\\n\ +__kernel void argmin_axis0_F16to##dst_type_name##_2D( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(0), 0, 0); \\\n\ + vxc_short8 vec0, vec1; \\\n\ + vxc_half8 src; \\\n\ + uint4 packIdx, currIdx; \\\n\ + float4 minValue, value; \\\n\ + \\\n\ + VXC_ReadImage(vec0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec0, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \\\n\ + coord.x += 4; \\\n\ + VXC_DP4x4(minValue, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \\\n\ + for (; coord.x < inputWidth; ) \\\n\ + { \\\n\ + VXC_ReadImage(vec1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec1, 16); \\\n\ + coord.x += 4; \\\n\ + \\\n\ + currIdx = currIdx + 4; \\\n\ + VXC_DP4x4(value, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \\\n\ + \\\n\ + int4 condition; \\\n\ + condition = value < minValue; \\\n\ + \\\n\ + uint4 iCondition; \\\n\ + _viv_asm(COPY, iCondition, condition, 16); \\\n\ + packIdx = iCondition ? currIdx : packIdx; \\\n\ + minValue = value < minValue ? value : minValue; \\\n\ + } \\\n\ + \\\n\ + float4 minVec; \\\n\ + float2 minVal2 = minValue.xy < minValue.zw ? minValue.xy : minValue.zw; \\\n\ + minVec.x = minVal2.x < minVal2.y ? minVal2.x : minVal2.y; \\\n\ + int4 condition; \\\n\ + condition = minVec.xxxx == minValue; \\\n\ + uint4 iCondition; \\\n\ + _viv_asm(COPY, iCondition, condition, 16); \\\n\ + iCondition += 1; \\\n\ + \\\n\ + packIdx = mad_sat(iCondition, 0xFFFFFFFF, packIdx); \\\n\ + \\\n\ + uint2 val2 = packIdx.xy < packIdx.zw ? packIdx.xy : packIdx.zw; \\\n\ + val2.x = val2.x < val2.y ? 
val2.x : val2.y; \\\n\ + \\\n\ + dst_type dst; \\\n\ + _viv_asm(COPY, dst, val2, 4); \\\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMIN_AXIS0_F16_2D(I16, vxc_ushort8)\n\ +TENSOR_ARGMIN_AXIS0_F16_2D(U8, vxc_uchar8)\n\ +\n\ +\n\ +#define TENSOR_ARGMIN_AXIS0_F16(dst_type_name, dst_type) \\\n\ +__kernel void argmin_axis0_F16to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); \\\n\ + vxc_short8 vec0, vec1; \\\n\ + vxc_half8 src; \\\n\ + uint4 packIdx, currIdx; \\\n\ + float4 minValue, value; \\\n\ + \\\n\ + VXC_ReadImage2DArray(vec0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec0, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, currIdx, packedArgIdx, 16); \\\n\ + coord.x += 4; \\\n\ + VXC_DP4x4(minValue, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \\\n\ + for (; coord.x < inputWidth; ) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(vec1, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec1, 16); \\\n\ + coord.x += 4; \\\n\ + \\\n\ + currIdx = currIdx + 4; \\\n\ + VXC_DP4x4(value, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertHalf2Float32_4x4); \\\n\ + \\\n\ + int4 condition; \\\n\ + condition = value < minValue; \\\n\ + \\\n\ + uint4 iCondition; \\\n\ + _viv_asm(COPY, iCondition, condition, 16); \\\n\ + packIdx = iCondition ? currIdx : packIdx; \\\n\ + minValue = value < minValue ? value : minValue; \\\n\ + } \\\n\ + \\\n\ + float4 minVec; \\\n\ + float2 minVal2 = minValue.xy < minValue.zw ? minValue.xy : minValue.zw; \\\n\ + minVec.x = minVal2.x < minVal2.y ? minVal2.x : minVal2.y; \\\n\ + int4 condition; \\\n\ + condition = minVec.xxxx == minValue; \\\n\ + uint4 iCondition; \\\n\ + _viv_asm(COPY, iCondition, condition, 16); \\\n\ + iCondition += 1; \\\n\ + \\\n\ + packIdx = mad_sat(iCondition, 0xFFFFFFFF, packIdx); \\\n\ + \\\n\ + uint2 val2 = packIdx.xy < packIdx.zw ? packIdx.xy : packIdx.zw; \\\n\ + val2.x = val2.x < val2.y ? 
val2.x : val2.y; \\\n\ + \\\n\ + dst_type dst; \\\n\ + _viv_asm(COPY, dst, val2, 4); \\\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMIN_AXIS0_F16(I16, vxc_ushort8)\n\ +TENSOR_ARGMIN_AXIS0_F16(U8, vxc_uchar8)\n\ +"; /* end of argmin_axis0_vx*/ + +static const char argmin_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int4 packedArgIdx;\n\ +_viv_uniform int argLenSub1;\n\ +_viv_uniform VXC_512Bits uniExtractData_2x8;\n\ +\n\ +#define TENSOR_ARGMIN_AXIS1_16BITS(src_type_name, dst_type_name, src_type,\\\n\ + copy_type, axis_type, dst_type, inst_type) \\\n\ + __kernel void argmin_axis1_##src_type_name##to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), argLenSub1, get_global_id(1), 0); \\\n\ + copy_type vec; \\\n\ + src_type src; \\\n\ + src_type minVal; \\\n\ + copy_type min; \\\n\ + VXC_ReadImage2DArray(min, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, minVal, min, 16); \\\n\ + axis_type axis; \\\n\ + axis_type packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.y --; \\\n\ + for (;coord.y >= 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(vec, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + coord.y --; \\\n\ + packIdx --; \\\n\ + VXC_VertMin3_##inst_type(minVal, minVal, minVal, src, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, min, minVal, 16); \\\n\ + axis = (min == vec) ? packIdx : axis; \\\n\ + } \\\n\ + \\\n\ + dst_type dst_axis; \\\n\ + VXC_DP2x8(dst_axis, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractData_2x8); \\\n\ + VXC_WriteImage(output, coord.xz, dst_axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMIN_AXIS1_16BITS(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half)\n\ +TENSOR_ARGMIN_AXIS1_16BITS(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half)\n\ +TENSOR_ARGMIN_AXIS1_16BITS(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer)\n\ +TENSOR_ARGMIN_AXIS1_16BITS(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer)\n\ +\n\ +#define TENSOR_ARGMIN_AXIS1_16BITS_2D(src_type_name, dst_type_name, src_type,\\\n\ + copy_type, axis_type, dst_type, inst_type) \\\n\ + __kernel void argmin_axis1_##src_type_name##to##dst_type_name##_2D( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), argLenSub1, 0, 0); \\\n\ + copy_type vec; \\\n\ + src_type src; \\\n\ + src_type minVal; \\\n\ + copy_type min; \\\n\ + VXC_ReadImage(min, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, minVal, min, 16); \\\n\ + axis_type axis; \\\n\ + axis_type packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.y --; \\\n\ + for (;coord.y >= 0;) \\\n\ + { \\\n\ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + coord.y --; \\\n\ + packIdx --; \\\n\ + VXC_VertMin3_##inst_type(minVal, minVal, minVal, src, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, min, minVal, 16); \\\n\ + axis = (min == 
vec) ? packIdx : axis; \\\n\ + } \\\n\ + \\\n\ + dst_type dst_axis; \\\n\ + VXC_DP2x8(dst_axis, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractData_2x8); \\\n\ + VXC_WriteImage(output, coord.xz, dst_axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMIN_AXIS1_16BITS_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half)\n\ +TENSOR_ARGMIN_AXIS1_16BITS_2D(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half)\n\ +TENSOR_ARGMIN_AXIS1_16BITS_2D(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer)\n\ +TENSOR_ARGMIN_AXIS1_16BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer)\n\ +\n\ +#define TENSOR_ARGMIN_AXIS1_8BITS(src_type_name, dst_type_name, src_type, dst_type) \\\n\ +__kernel void argmin_axis1_##src_type_name##to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), argLenSub1, get_global_id(1), 0); \\\n\ + src_type src; \\\n\ + src_type minVal; \\\n\ + VXC_ReadImage2DArray(minVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + dst_type axis; \\\n\ + dst_type packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.y --; \\\n\ + for (;coord.y >= 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y --; \\\n\ + packIdx --; \\\n\ + minVal = min(minVal, src); \\\n\ + dst_type condition; \\\n\ + VXC_Clamp(condition, src, minVal, minVal, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axis = condition ? packIdx : axis; \\\n\ + } \\\n\ + \\\n\ + VXC_WriteImage(output, coord.xz, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMIN_AXIS1_8BITS(I8, I16, vxc_char16, vxc_short8)\n\ +TENSOR_ARGMIN_AXIS1_8BITS(I8, U8, vxc_char16, vxc_uchar16)\n\ +TENSOR_ARGMIN_AXIS1_8BITS(U8, I16, vxc_uchar16, vxc_short8)\n\ +TENSOR_ARGMIN_AXIS1_8BITS(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +\n\ +#define TENSOR_ARGMIN_AXIS1_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \\\n\ +__kernel void argmin_axis1_##src_type_name##to##dst_type_name##_2D( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), argLenSub1, 0, 0); \\\n\ + src_type src; \\\n\ + src_type minVal; \\\n\ + VXC_ReadImage(minVal, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + dst_type axis; \\\n\ + dst_type packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.y --; \\\n\ + for (;coord.y >= 0;) \\\n\ + { \\\n\ + VXC_ReadImage(src, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y --; \\\n\ + packIdx --; \\\n\ + minVal = min(minVal, src); \\\n\ + dst_type condition; \\\n\ + VXC_Clamp(condition, src, minVal, minVal, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axis = condition ? 
packIdx : axis; \\\n\ + } \\\n\ + \\\n\ + VXC_WriteImage(output, coord.xz, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMIN_AXIS1_8BITS_2D(I8, I16, vxc_char16, vxc_short8)\n\ +TENSOR_ARGMIN_AXIS1_8BITS_2D(I8, U8, vxc_char16, vxc_uchar16)\n\ +TENSOR_ARGMIN_AXIS1_8BITS_2D(U8, I16, vxc_uchar16, vxc_short8)\n\ +TENSOR_ARGMIN_AXIS1_8BITS_2D(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +\n\ +"; /* end of argmin_axis1_vx*/ + +static const char argmin_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int4 packedArgIdx;\n\ +_viv_uniform int argLenSub1;\n\ +_viv_uniform VXC_512Bits uniExtractData_2x8;\n\ +\n\ +#define TENSOR_ARGMIN_AXIS2_16BITS(src_type_name, dst_type_name,\\\n\ + src_type, copy_type, axis_type, dst_type, inst_type) \\\n\ + __kernel void argmin_axis2_##src_type_name##to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \\\n\ + copy_type vec; \\\n\ + src_type src; \\\n\ + src_type minVal; \\\n\ + copy_type min; \\\n\ + VXC_ReadImage2DArray(min, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, minVal, min, 16); \\\n\ + axis_type axis; \\\n\ + axis_type packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.z --; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(vec, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + coord.z --; \\\n\ + packIdx --; \\\n\ + VXC_VertMin3_##inst_type(minVal, minVal, minVal, src, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, min, minVal, 16); \\\n\ + axis = (min == vec) ? 
packIdx : axis; \\\n\ + } while (coord.z >= 0); \\\n\ + \\\n\ + dst_type dst_axis; \\\n\ + VXC_DP2x8(dst_axis, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractData_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst_axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMIN_AXIS2_16BITS(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half)\n\ +TENSOR_ARGMIN_AXIS2_16BITS(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half)\n\ +TENSOR_ARGMIN_AXIS2_16BITS(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer)\n\ +TENSOR_ARGMIN_AXIS2_16BITS(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer)\n\ +\n\ +#define TENSOR_ARGMIN_AXIS2_16BITS_2D(src_type_name, dst_type_name, src_type, \\\n\ + copy_type, axis_type, dst_type, inst_type) \\\n\ + __kernel void argmin_axis2_##src_type_name##to##dst_type_name##_2D( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMIN_AXIS2_16BITS_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8, Half)\n\ +TENSOR_ARGMIN_AXIS2_16BITS_2D(F16, U8, vxc_half8, vxc_short8, vxc_short8, vxc_uchar8, Half)\n\ +TENSOR_ARGMIN_AXIS2_16BITS_2D(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, Integer)\n\ +TENSOR_ARGMIN_AXIS2_16BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8, Integer)\n\ +\n\ +\n\ +#define TENSOR_ARGMIN_AXIS2_8BITS(src_type_name, dst_type_name, src_type, dst_type) \\\n\ + __kernel void argmin_axis2_##src_type_name##to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \\\n\ + src_type src; \\\n\ + src_type minVal; \\\n\ + VXC_ReadImage2DArray(minVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + dst_type axis; \\\n\ + dst_type packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.z --; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z --; \\\n\ + packIdx --; \\\n\ + minVal = min(minVal, src); \\\n\ + dst_type condition; \\\n\ + VXC_Clamp(condition, src, minVal, minVal, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axis = condition ? 
packIdx : axis; \\\n\ + } while (coord.z >= 0); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMIN_AXIS2_8BITS(I8, I16, vxc_char8, vxc_short8)\n\ +TENSOR_ARGMIN_AXIS2_8BITS(I8, U8, vxc_char8, vxc_uchar8)\n\ +TENSOR_ARGMIN_AXIS2_8BITS(U8, I16, vxc_uchar8, vxc_short8)\n\ +TENSOR_ARGMIN_AXIS2_8BITS(U8, U8, vxc_uchar8, vxc_uchar8)\n\ +\n\ +#define TENSOR_ARGMIN_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \\\n\ + __kernel void argmin_axis2_##src_type_name##to##dst_type_name##_2D( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMIN_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8)\n\ +TENSOR_ARGMIN_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8)\n\ +TENSOR_ARGMIN_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8)\n\ +TENSOR_ARGMIN_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8)\n\ +"; /* end of argmin_axis2_vx*/ + +static const char batchnorm_single_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDatatoF32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDatatoF32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_tail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +#define BATCH_NORM_SH_IMPL(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ +__kernel void batch_norm_##name0##to##name1##_brdcst1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t Mean, \\\n\ + __read_only image2d_array_t Variance, \\\n\ + __read_only image2d_array_t Gamma, \\\n\ + __read_only image2d_array_t Beta, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + read_type vec; \\\n\ + src_type src; \\\n\ + VXC_ReadImage2DArray(vec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + vxc_ushort8 _mean, _var, _gamma; \\\n\ + vxc_half8 mean, var, gamma; \\\n\ + VXC_ReadImage2DArray(_mean, Mean, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, mean, _mean, 16); \\\n\ + VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, var, _var, 16); \\\n\ + VXC_ReadImage2DArray(_gamma, Gamma, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, gamma, _gamma, 16); \\\n\ + float4 beta = read_imagef(Beta, coord); \\\n\ + \\\n\ + float4 src0, src1, m, v, g; \\\n\ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + g = g * rsqrt(v + eps); \\\n\ + src0 = src0 * input_scale + input_tail; \\\n\ + src0 = (src0 - m) * g + beta.xxxx; \\\n\ + src0 = src0 * output_scale + output_zp; \\\n\ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 
1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + g = g * rsqrt(v + eps); \\\n\ + src1 = src1 * input_scale + input_tail; \\\n\ + src1 = (src1 - m) * g + beta.xxxx; \\\n\ + src1 = src1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, src0); \\\n\ + _viv_asm(CONV_RTE, dst1, src1); \\\n\ + dst_type tmp; \\\n\ + save_type dst; \\\n\ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmp, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +BATCH_NORM_SH_IMPL(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ +\n\ +#define BATCH_NORM_SH_IMPL_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ +__kernel void batch_norm_##name0##to##name1##_brdcst1_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t Mean, \\\n\ + __read_only image2d_t Variance, \\\n\ + __read_only image2d_t Gamma, \\\n\ + __read_only image2d_t Beta, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + read_type vec; \\\n\ + src_type src; \\\n\ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + vxc_ushort8 _mean, _var, _gamma; \\\n\ + vxc_half8 mean, var, gamma; \\\n\ + VXC_ReadImage(_mean, Mean, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, mean, _mean, 16); \\\n\ + VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, var, _var, 16); \\\n\ + VXC_ReadImage(_gamma, Gamma, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, gamma, _gamma, 16); \\\n\ + float4 beta = read_imagef(Beta, coord.xy); \\\n\ + \\\n\ + float4 src0, src1, m, v, g; \\\n\ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + g = g * rsqrt(v + eps); \\\n\ + src0 = src0 * input_scale + input_tail; \\\n\ + 
src0 = (src0 - m) * g + beta.xxxx; \\\n\ + src0 = src0 * output_scale + output_zp; \\\n\ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + g = g * rsqrt(v + eps); \\\n\ + src1 = src1 * input_scale + input_tail; \\\n\ + src1 = (src1 - m) * g + beta.xxxx; \\\n\ + src1 = src1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, src0); \\\n\ + _viv_asm(CONV_RTE, dst1, src1); \\\n\ + dst_type tmp; \\\n\ + save_type dst; \\\n\ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmp, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +BATCH_NORM_SH_IMPL_2D(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_2D(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_2D(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_2D(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ +\n\ +\n\ +#define BATCH_NORM_SH_IMPL_AXIS1(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ +__kernel void batch_norm_##name0##to##name1##_brdcst0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t Mean, \\\n\ + __read_only image2d_array_t Variance, \\\n\ + __read_only image2d_array_t Gamma, \\\n\ + __read_only image2d_array_t Beta, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + read_type vec; \\\n\ + src_type src; \\\n\ + VXC_ReadImage2DArray(vec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + vxc_ushort8 _mean, _var, _gamma; \\\n\ + vxc_half8 mean, var, gamma; \\\n\ + VXC_ReadImage2DArray(_mean, Mean, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, mean, _mean, 16); \\\n\ + VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, var, _var, 16); \\\n\ + VXC_ReadImage2DArray(_gamma, Gamma, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, gamma, _gamma, 16); \\\n\ + float4 beta0 = read_imagef(Beta, coord); \\\n\ + coord.x += 4; \\\n\ + float4 beta1 = read_imagef(Beta, coord); \\\n\ + coord.x -= 4; \\\n\ + \\\n\ + float4 src0, src1, m, v, g; \\\n\ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + g = g * rsqrt(v + eps); \\\n\ + src0 = src0 * input_scale + input_tail; \\\n\ + src0 = (src0 - m) * g + beta0; \\\n\ + src0 = src0 * output_scale + output_zp; \\\n\ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + g = g * rsqrt(v + eps); \\\n\ + src1 = src1 * input_scale + input_tail; \\\n\ + src1 = (src1 - m) * g + beta1; \\\n\ + src1 = src1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, src0); \\\n\ + _viv_asm(CONV_RTE, dst1, src1); \\\n\ + dst_type tmp; \\\n\ + save_type dst; \\\n\ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmp, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +BATCH_NORM_SH_IMPL_AXIS1(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_AXIS1(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_AXIS1(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_AXIS1(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ +\n\ +#define BATCH_NORM_SH_IMPL_AXIS1_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ +__kernel void batch_norm_##name0##to##name1##_brdcst0_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t Mean, \\\n\ + __read_only image2d_t Variance, \\\n\ + __read_only image2d_t Gamma, \\\n\ + __read_only image2d_t Beta, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + read_type vec; \\\n\ + src_type src; \\\n\ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + coord.z += 4; \\\n\ + vxc_ushort8 _mean, _var, _gamma; \\\n\ + vxc_half8 mean, var, gamma; \\\n\ + VXC_ReadImage(_mean, Mean, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, mean, _mean, 16); \\\n\ + VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, var, _var, 16); \\\n\ + VXC_ReadImage(_gamma, Gamma, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, gamma, _gamma, 16); \\\n\ + float4 
beta0 = read_imagef(Beta, coord.xy); \\\n\ + float4 beta1 = read_imagef(Beta, coord.zy); \\\n\ + \\\n\ + float4 src0, src1, m, v, g; \\\n\ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + g = g * rsqrt(v + eps); \\\n\ + src0 = src0 * input_scale + input_tail; \\\n\ + src0 = (src0 - m) * g + beta0; \\\n\ + src0 = src0 * output_scale + output_zp; \\\n\ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(g, gamma, gamma, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + g = g * rsqrt(v + eps); \\\n\ + src1 = src1 * input_scale + input_tail; \\\n\ + src1 = (src1 - m) * g + beta1; \\\n\ + src1 = src1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, src0); \\\n\ + _viv_asm(CONV_RTE, dst1, src1); \\\n\ + dst_type tmp; \\\n\ + save_type dst; \\\n\ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmp, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ +\n\ +"; /* end of batchnorm_single_vx*/ + +static const char cast_vx[] = "\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataConvert_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +#define CAST_PROCESS(read_type, src_type, dst_type, write_type, read_fun, write_fun) \\\n\ + read_type read_val; \\\n\ + src_type src_val; \\\n\ + dst_type dst_val; \\\n\ + write_type write_val; \\\n\ + read_fun(read_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src_val, read_val, 16); \\\n\ + VXC_DP2x8(dst_val, src_val, src_val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniDataConvert_2x8); \\\n\ + _viv_asm(COPY, write_val, dst_val, 16); \\\n\ + write_fun(output, coord, write_val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define CAST_FUN(src_name, dst_name, read_type, src_type, dst_type, 
write_type) \\\n\ +__kernel void cast_##src_name##to##dst_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + CAST_PROCESS(read_type, src_type, dst_type, write_type, VXC_ReadImage2DArray, VXC_WriteImage2DArray) \\\n\ +}\n\ +\n\ +CAST_FUN(F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +CAST_FUN(F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8)\n\ +CAST_FUN(F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8)\n\ +CAST_FUN(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ +CAST_FUN(I16, I8, vxc_short8, vxc_short8, vxc_char8, vxc_char8)\n\ +CAST_FUN(I16, U8, vxc_short8, vxc_short8, vxc_uchar8, vxc_uchar8)\n\ +CAST_FUN(I8, F16, vxc_char8, vxc_char8, vxc_half8, vxc_short8)\n\ +CAST_FUN(I8, I16, vxc_char8, vxc_char8, vxc_short8, vxc_short8)\n\ +CAST_FUN(I8, U8, vxc_char8, vxc_char8, vxc_uchar8, vxc_uchar8)\n\ +CAST_FUN(U8, F16, vxc_uchar8, vxc_uchar8, vxc_half8, vxc_short8)\n\ +CAST_FUN(U8, I16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_short8)\n\ +CAST_FUN(U8, I8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_char8)\n\ +\n\ +\n\ +#define CAST_FUN_2D(src_name, dst_name, read_type, src_type, dst_type, write_type) \\\n\ +__kernel void cast_##src_name##to##dst_name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + CAST_PROCESS(read_type, src_type, dst_type, write_type, VXC_ReadImage, VXC_WriteImage) \\\n\ +}\n\ +\n\ +CAST_FUN_2D(F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +CAST_FUN_2D(F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8)\n\ +CAST_FUN_2D(F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8)\n\ +CAST_FUN_2D(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ +CAST_FUN_2D(I16, I8, vxc_short8, vxc_short8, vxc_char8, vxc_char8)\n\ +CAST_FUN_2D(I16, U8, vxc_short8, vxc_short8, vxc_uchar8, vxc_uchar8)\n\ +CAST_FUN_2D(I8, F16, vxc_char8, vxc_char8, vxc_half8, vxc_short8)\n\ +CAST_FUN_2D(I8, I16, vxc_char8, vxc_char8, vxc_short8, vxc_short8)\n\ +CAST_FUN_2D(I8, U8, vxc_char8, vxc_char8, vxc_uchar8, vxc_uchar8)\n\ +CAST_FUN_2D(U8, F16, vxc_uchar8, vxc_uchar8, vxc_half8, vxc_short8)\n\ +CAST_FUN_2D(U8, I16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_short8)\n\ +CAST_FUN_2D(U8, I8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_char8)\n\ +\n\ +#define CAST_TO_BOOL_PROCESS(src_type, tmp_type, read_fun, write_fun) \\\n\ + src_type src_val; \\\n\ + tmp_type tmp_val; \\\n\ + vxc_char8 dst_val; \\\n\ + read_fun(src_val, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + tmp_val = (src_val != 0); \\\n\ + tmp_val *= (-1); \\\n\ + VXC_DP2x8(dst_val, tmp_val, tmp_val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDataConvert_2x8); \\\n\ + write_fun(output, coord, dst_val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define CAST_TO_BOOL_FUN(src_name, src_type, tmp_type) \\\n\ +__kernel void cast_##src_name##toBOOL8( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + CAST_TO_BOOL_PROCESS(src_type, tmp_type, VXC_ReadImage2DArray, VXC_WriteImage2DArray) \\\n\ +}\n\ +\n\ +CAST_TO_BOOL_FUN(F16, vxc_short8, vxc_short8)\n\ +CAST_TO_BOOL_FUN(I16, vxc_short8, vxc_short8)\n\ +CAST_TO_BOOL_FUN(I8, vxc_char8, 
vxc_char8)\n\ +CAST_TO_BOOL_FUN(U8, vxc_uchar8, vxc_char8)\n\ +\n\ +#define CAST_TO_BOOL_FUN_2D(src_name, src_type, tmp_type) \\\n\ +__kernel void cast_##src_name##toBOOL8_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + CAST_TO_BOOL_PROCESS(src_type, tmp_type, VXC_ReadImage, VXC_WriteImage) \\\n\ +}\n\ +\n\ +CAST_TO_BOOL_FUN_2D(F16, vxc_short8, vxc_short8)\n\ +CAST_TO_BOOL_FUN_2D(I16, vxc_short8, vxc_short8)\n\ +CAST_TO_BOOL_FUN_2D(I8, vxc_char8, vxc_char8)\n\ +CAST_TO_BOOL_FUN_2D(U8, vxc_uchar8, vxc_char8)\n\ +\n\ +#define CAST_F32orI32_PROCESS(src_type, dst_type, read_fun, write_fun) \\\n\ + src_type src_val0, src_val1; \\\n\ + dst_type dst_val; \\\n\ + int4 tmpData1, tmpData2; \\\n\ + src_val0 = read_fun(input, coord); \\\n\ + coord.x += 4; \\\n\ + src_val1 = read_fun(input, coord); \\\n\ + tmpData1 = convert_int4(src_val0); \\\n\ + tmpData2 = convert_int4(src_val1); \\\n\ + VXC_DP2x8(dst_val, tmpData1, tmpData2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + coord.x -= 4; \\\n\ + write_fun(output, coord, dst_val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define CAST_F32orI32_FUN(src_name, dst_name, src_type, dst_type, read_fun) \\\n\ +__kernel void cast_##src_name##to##dst_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + CAST_F32orI32_PROCESS(src_type, dst_type, read_fun, VXC_WriteImage2DArray) \\\n\ +}\n\ +\n\ +CAST_F32orI32_FUN(F32, I16, float4, vxc_short8, read_imagef)\n\ +CAST_F32orI32_FUN(F32, I8, float4, vxc_char8, read_imagef)\n\ +CAST_F32orI32_FUN(F32, U8, float4, vxc_uchar8, read_imagef)\n\ +CAST_F32orI32_FUN(I32, I16, int4, vxc_short8, read_imagei)\n\ +CAST_F32orI32_FUN(I32, I8, int4, vxc_char8, read_imagei)\n\ +CAST_F32orI32_FUN(I32, U8, int4, vxc_uchar8, read_imagei)\n\ +\n\ +\n\ +#define CAST_F32orI32_FUN_2D(src_name, dst_name, src_type, dst_type, read_fun) \\\n\ +__kernel void cast_##src_name##to##dst_name##_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_array_t output) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + CAST_F32orI32_PROCESS(src_type, dst_type, read_fun, VXC_WriteImage) \\\n\ +}\n\ +\n\ +CAST_F32orI32_FUN_2D(F32, I16, float4, vxc_short8, read_imagef)\n\ +CAST_F32orI32_FUN_2D(F32, I8, float4, vxc_char8, read_imagef)\n\ +CAST_F32orI32_FUN_2D(F32, U8, float4, vxc_uchar8, read_imagef)\n\ +CAST_F32orI32_FUN_2D(I32, I16, int4, vxc_short8, read_imagei)\n\ +CAST_F32orI32_FUN_2D(I32, I8, int4, vxc_char8, read_imagei)\n\ +CAST_F32orI32_FUN_2D(I32, U8, int4, vxc_uchar8, read_imagei)\n\ +\n\ +"; /* end of cast_vx*/ + +static const char clip_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int4 packedMinData_FP16;\n\ +_viv_uniform int4 packedMaxData_FP16;\n\ +_viv_uniform VXC_512Bits uniConvertF16toInt_2x8;\n\ +_viv_uniform int2 multAndoutZP;\n\ +_viv_uniform VXC_512Bits uniDataMulAndPostShift_2x8;\n\ +\n\ +#define TENSORCLIP_F16TOF16_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 vec0, dst; \\\n\ + vxc_half8 src0, src1, minHf, maxHf; \\\n\ + read_fun(vec0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, vec0, 16); \\\n\ + _viv_asm(COPY, minHf, packedMinData_FP16, 16); \\\n\ + _viv_asm(COPY, maxHf, 
packedMaxData_FP16, 16); \\\n\ + VXC_Clamp_Half(src1, src0, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\ + _viv_asm(COPY, dst, src1, 16); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void clip_F16toF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + TENSORCLIP_F16TOF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void clip_F16toF16_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + TENSORCLIP_F16TOF16_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +#define TENSORCLIP_F16TOINT_PROCESS(read_fun, write_fun, dst_type) \\\n\ + vxc_short8 vec0; \\\n\ + dst_type dst; \\\n\ + vxc_half8 src0, src1, minHf, maxHf; \\\n\ + read_fun(vec0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, vec0, 16); \\\n\ + _viv_asm(COPY, minHf, packedMinData_FP16, 16); \\\n\ + _viv_asm(COPY, maxHf, packedMaxData_FP16, 16); \\\n\ + VXC_Clamp_Half(src1, src0, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\ + VXC_DP2x8(dst, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertF16toInt_2x8); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void clip_F16toI16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + TENSORCLIP_F16TOINT_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray, vxc_short8)\n\ +}\n\ +\n\ +__kernel void clip_F16toI16_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + TENSORCLIP_F16TOINT_PROCESS(VXC_ReadImage, VXC_WriteImage, vxc_short8)\n\ +}\n\ +\n\ +__kernel void clip_F16toI8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + TENSORCLIP_F16TOINT_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray, vxc_char16)\n\ +}\n\ +\n\ +__kernel void clip_F16toI8_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + TENSORCLIP_F16TOINT_PROCESS(VXC_ReadImage, VXC_WriteImage, vxc_char16)\n\ +}\n\ +\n\ +#define TENSORCLIP_F16TOU8_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 vec0; \\\n\ + vxc_uchar16 dst; \\\n\ + vxc_half8 src0, src1, minHf, maxHf; \\\n\ + read_fun(vec0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, vec0, 16); \\\n\ + _viv_asm(COPY, minHf, packedMinData_FP16, 16); \\\n\ + _viv_asm(COPY, maxHf, packedMaxData_FP16, 16); \\\n\ + VXC_Clamp_Half(src1, src0, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst, src1, multiplier, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1), uniDataMulAndPostShift_2x8); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void clip_F16toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + TENSORCLIP_F16TOU8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void clip_F16toU8_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + TENSORCLIP_F16TOU8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +"; /* end of clip_F16_vx*/ + +static const char clip_I16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntegerLo_2x8;\n\ +_viv_uniform int4 packedMinData;\n\ +_viv_uniform int4 packedMaxData;\n\ +\n\ +#define TENSORCLIP_I16TOI16_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 src0, min, max; \\\n\ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertIntegerLo_2x8); \\\n\ + _viv_asm(COPY, min, packedMinData, 16); \\\n\ + _viv_asm(COPY, max, packedMaxData, 16); \\\n\ + VXC_Clamp(src0, src0, min, max, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\ + write_fun(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void clip_I16toI16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + TENSORCLIP_I16TOI16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void clip_I16toI16_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + TENSORCLIP_I16TOI16_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +#define TENSORCLIP_I16TOF16_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 src0, dst; \\\n\ + vxc_half8 src1, src2, minHf, maxHf; \\\n\ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(src1, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertIntegerLo_2x8); \\\n\ + _viv_asm(COPY, minHf, packedMinData, 16); \\\n\ + _viv_asm(COPY, maxHf, packedMaxData, 16); \\\n\ + VXC_Clamp_Half(src2, src1, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\ + _viv_asm(COPY, dst, src2, 16); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void clip_I16toF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + TENSORCLIP_I16TOF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void clip_I16toF16_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + TENSORCLIP_I16TOF16_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +"; /* end of clip_I16_vx*/ + +static const char clip_I8_vx[] = "#include 
\"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntegerLo_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertIntegerHi_2x8;\n\ +_viv_uniform int4 packedMinData;\n\ +_viv_uniform int4 packedMaxData;\n\ +\n\ +#define TENSORCLIP_I8TOI8_PROCESS(read_fun, write_fun) \\\n\ + vxc_char16 src0, min, max; \\\n\ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertIntegerLo_2x8); \\\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertIntegerHi_2x8); \\\n\ + _viv_asm(COPY, min, packedMinData, 16); \\\n\ + _viv_asm(COPY, max, packedMaxData, 16); \\\n\ + VXC_Clamp(src0, src0, min, max, VXC_MODIFIER_CLAMP(0, 15, 0, 0)); \\\n\ + write_fun(output, coord, src0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void clip_I8toI8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + TENSORCLIP_I8TOI8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void clip_I8toI8_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + TENSORCLIP_I8TOI8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +#define TENSORCLIP_I8TOF16_PROCESS(read_fun, write_fun) \\\n\ + vxc_char16 src0; \\\n\ + vxc_short8 dst0, dst1; \\\n\ + vxc_half8 src1, src2, src3, src4, minHf, maxHf; \\\n\ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(src1, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertIntegerLo_2x8); \\\n\ + VXC_DP2x8(src2, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertIntegerHi_2x8); \\\n\ + _viv_asm(COPY, minHf, packedMinData, 16); \\\n\ + _viv_asm(COPY, maxHf, packedMaxData, 16); \\\n\ + VXC_Clamp_Half(src3, src1, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\ + VXC_Clamp_Half(src4, src2, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\ + _viv_asm(COPY, dst0, src3, 16); \\\n\ + _viv_asm(COPY, dst1, src4, 16); \\\n\ + write_fun(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + write_fun(output, coord, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void clip_I8toF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + TENSORCLIP_I8TOF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void clip_I8toF16_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + TENSORCLIP_I8TOF16_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +"; /* end of clip_I8_vx*/ + +static const char clip_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int4 packedMinData;\n\ +_viv_uniform int4 packedMaxData;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_Hi_2x8;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +\n\ +#define 
TENSORCLIP_U8TOU8_PROCESS(read_fun, write_fun) \\\n\ + vxc_uchar16 vec0, min, max, dst; \\\n\ + read_fun(vec0, input, coord,\\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst, vec0, multiplier,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8MulAndPostShift_Lo_2x8); \\\n\ + VXC_DP2x8(dst, vec0, multiplier,\\\n\ + VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8MulAndPostShift_Hi_2x8); \\\n\ + _viv_asm(COPY, min, packedMinData, 16); \\\n\ + _viv_asm(COPY, max, packedMaxData, 16); \\\n\ + VXC_Clamp(dst, dst, min, max, VXC_MODIFIER_CLAMP(0, 15, 0, 0)); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void clip_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + TENSORCLIP_U8TOU8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void clip_U8toU8_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + TENSORCLIP_U8TOU8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +#define TENSORCLIP_U8TOF16_PROCESS(read_fun, write_fun) \\\n\ + vxc_uchar16 vec0; \\\n\ + vxc_short8 dst0, dst1; \\\n\ + vxc_half8 src1, src2, src3, src4, minHf, maxHf; \\\n\ + read_fun(vec0, input, coord,\\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(src1, vec0, multiplier,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8MulAndPostShift_Lo_2x8); \\\n\ + VXC_DP2x8(src2, vec0, multiplier,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8MulAndPostShift_Hi_2x8); \\\n\ + _viv_asm(COPY, minHf, packedMinData, 16); \\\n\ + _viv_asm(COPY, maxHf, packedMaxData, 16); \\\n\ + VXC_Clamp_Half(src3, src1, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\ + VXC_Clamp_Half(src4, src2, minHf, maxHf, VXC_MODIFIER_CLAMP(0, 7, 0, 0)); \\\n\ + _viv_asm(COPY, dst0, src3, 16); \\\n\ + _viv_asm(COPY, dst1, src4, 16); \\\n\ + write_fun(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + write_fun(output, coord, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void clip_U8toF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + TENSORCLIP_U8TOF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void clip_U8toF16_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + TENSORCLIP_U8TOF16_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +"; /* end of clip_U8_vx*/ + +static const char depth2space_crd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +\n\ +#define DEPTH2SPACE_CRD_QINT_TO_QINT(src0_type_name, src1_type_name, 
read_type, write_type) \\\n\ +__kernel void depth2space_crd_##src0_type_name##to##src1_type_name( \\\n\ + image2d_array_t input, \\\n\ + image2d_array_t output, \\\n\ + int block_size \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \\\n\ + int block_e2 = block_size * block_size; \\\n\ + int inx = gidx / block_size; \\\n\ + int iny = gidy / block_size; \\\n\ + int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \\\n\ + int4 coord_in = (int4)(inx, iny, inz, 0); \\\n\ + read_type src; \\\n\ + VXC_ReadImage2DArray(src,input,coord_in,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,0,0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + write_type dst; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(dst,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +DEPTH2SPACE_CRD_QINT_TO_QINT(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +DEPTH2SPACE_CRD_QINT_TO_QINT(I8, I8, vxc_char16, vxc_char16)\n\ +DEPTH2SPACE_CRD_QINT_TO_QINT(I16, I16, vxc_short8, vxc_short8)\n\ +\n\ +__kernel void depth2space_crd_F16toF16(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int block_size\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0);\n\ + int block_e2 = block_size * block_size;\n\ + int inx = gidx / block_size;\n\ + int iny = gidy / block_size;\n\ + int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2;\n\ + int4 coord_in = (int4)(inx, iny, inz, 0);\n\ + vxc_short8 data;\n\ + VXC_ReadImage2DArray(data,input,coord_in,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,0,0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord_out, data, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define DEPTH2SPACE_CRD_QINT_TO_F16(src0_type_name, read_type) \\\n\ +__kernel void depth2space_crd_##src0_type_name##toF16( \\\n\ + image2d_array_t input, \\\n\ + image2d_array_t output, \\\n\ + int block_size \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \\\n\ + int block_e2 = block_size * block_size; \\\n\ + int inx = gidx / block_size; \\\n\ + int iny = gidy / block_size; \\\n\ + int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \\\n\ + int4 coord_in = (int4)(inx, iny, inz, 0); \\\n\ + read_type src; \\\n\ + VXC_ReadImage2DArray(src,input,coord_in,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,0,0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_half8 tmpDst; \\\n\ + vxc_short8 dst; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(tmpDst,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst, tmpDst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +DEPTH2SPACE_CRD_QINT_TO_F16(U8, vxc_uchar16)\n\ +DEPTH2SPACE_CRD_QINT_TO_F16(I8, vxc_char16)\n\ +DEPTH2SPACE_CRD_QINT_TO_F16(I16, vxc_short8)\n\ +\n\ +#define DEPTH2SPACE_CRD_F16_TO_QINT(src1_type_name, write_type) \\\n\ +__kernel void depth2space_crd_F16to##src1_type_name( \\\n\ + 
image2d_array_t input, \\\n\ + image2d_array_t output, \\\n\ + int block_size \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \\\n\ + int block_e2 = block_size * block_size; \\\n\ + int inx = gidx / block_size; \\\n\ + int iny = gidy / block_size; \\\n\ + int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \\\n\ + int4 coord_in = (int4)(inx, iny, inz, 0); \\\n\ + vxc_short8 src; \\\n\ + VXC_ReadImage2DArray(src,input,coord_in,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,0,0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + write_type dst; \\\n\ + vxc_half8 data; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +DEPTH2SPACE_CRD_F16_TO_QINT(U8, vxc_uchar16)\n\ +DEPTH2SPACE_CRD_F16_TO_QINT(I8, vxc_char16)\n\ +DEPTH2SPACE_CRD_F16_TO_QINT(I16, vxc_short8)"; /* end of depth2space_crd_vx*/ + +static const char depthwise_conv1d_src0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_align8_step0_16x1;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_align8_step1_16x1;\n\ +_viv_uniform VXC_512Bits uniU8SubZp_lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8SubZp_hi_2x8;\n\ +_viv_uniform int weightZP;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float scale;\n\ +_viv_uniform int kernel_size_x16;\n\ +_viv_uniform int kernel_size_x8;\n\ +\n\ +__kernel void vxDW_Conv1D_U8toU8_KN_D1(\n\ +__read_only image2d_array_t input,\n\ +__read_only image2d_array_t weight,\n\ +__read_only image2d_t bias,\n\ +__write_only image2d_array_t output,\n\ + int pad,\n\ + int stride,\n\ + int dilation)\n\ +{\n\ +int2 coord_in = (int2)(get_global_id(0) * stride - pad, get_global_id(1));\n\ +int4 coord = (int4)(get_global_id(0), 0, 0, get_global_id(1));\n\ +vxc_uchar4 zp;\n\ +vxc_uchar16 src;\n\ +vxc_uchar16 w;\n\ +vxc_short8 coef;\n\ +int4 sum, sum0;\n\ +\n\ +_viv_asm(COPY, zp, weightZP, 4);\n\ +sum = read_imagei(bias, coord.wz);\n\ +\n\ +while(coord.y < kernel_size_x16)\n\ +{\n\ + VXC_ReadImage(src, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(w, weight, coord.yw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(coef, w, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ + VXC_DP16x1(sum0, src, coef, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8ConvS16_align8_step0_16x1);\n\ + sum.x += sum0.x;\n\ + VXC_DP2x8(coef, w, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ + VXC_DP16x1(sum0, src, coef, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8ConvS16_align8_step1_16x1);\n\ + sum.x += sum0.x;\n\ + coord_in.x += 16;\n\ + coord.y += 16;\n\ +}\n\ +\n\ +if (kernel_size_x8)\n\ +{\n\ + VXC_ReadImage(src, input, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(w, weight, coord.yw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(coef, w, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ + VXC_DP16x1(sum0, src, coef, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8ConvS16_align8_step0_16x1);\n\ + sum.x += sum0.x;\n\ +}\n\ +float4 result = convert_float4(sum.x) * scale + outputZP;\n\ +uchar4 dst = 
convert_uchar4_sat(result);\n\ +VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void vxDW_Conv1D_U8toU8_KN_D2(\n\ +__read_only image2d_array_t input,\n\ +__read_only image2d_array_t weight,\n\ +__read_only image2d_t bias,\n\ +__write_only image2d_array_t output,\n\ + int pad,\n\ + int stride,\n\ + int dilation)\n\ +{\n\ +int2 coord_in = (int2)(get_global_id(0) * stride - pad, get_global_id(1));\n\ +int4 coord = (int4)(get_global_id(0), 0, 0, get_global_id(1));\n\ +vxc_uchar4 zp;\n\ +vxc_uchar16 src;\n\ +vxc_uchar16 w;\n\ +vxc_short8 coef;\n\ +int4 sum, sum0;\n\ +\n\ +_viv_asm(COPY, zp, weightZP, 4);\n\ +sum = read_imagei(bias, coord.wz);\n\ +\n\ +while(coord.y < kernel_size_x8)\n\ +{\n\ + VXC_ReadImage(src, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(w, weight, coord.yw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(coef, w, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ + VXC_DP16x1(sum0, src, coef, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8ConvS16_align8_step0_16x1);\n\ + sum.x += sum0.x;\n\ + coord_in.x += 16;\n\ + coord.y += 8;\n\ +}\n\ +\n\ +float4 result = convert_float4(sum.x) * scale + outputZP;\n\ +uchar4 dst = convert_uchar4_sat(result);\n\ +VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +"; /* end of depthwise_conv1d_src0_vx*/ + +static const char depthwise_conv1d_src1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe0_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe1_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe2_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe3_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8SubZp_lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8SubZp_hi_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractInteger_2x8;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe4_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe5_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe6_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe7_8x2b;\n\ +_viv_uniform int weightZP;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float scale;\n\ +\n\ +__kernel void vxDW_Conv1D_U8toU8_K40_D1(\n\ +__read_only image2d_array_t input,\n\ +__read_only image2d_array_t weight,\n\ +__read_only image2d_t bias,\n\ +__write_only image2d_array_t output,\n\ + int pad,\n\ + int stride,\n\ + int dilation)\n\ +{\n\ +int4 coord_in = (int4)(get_global_id(0) * stride - pad + 16,\\\n\ +get_global_id(0) * stride - pad + 48, get_global_id(1), 0);\n\ +vxc_uchar32 src0, src1;\n\ +vxc_uchar16 s0, s1, s2;\n\ +vxc_uchar16 w0, w1, w2;\n\ +int4 sum, sumB;\n\ +sum = read_imagei(bias, coord_in.zw);\n\ +VXC_ReadImage(src0.hi, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(src0.lo, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(src1.lo, input, coord_in.yz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(s2, input, coord_in.yz, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +int4 coord = (int4)(get_global_id(0), 16, 32, get_global_id(1));\n\ +VXC_ReadImage(w0, weight, coord.yw, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(w1, weight, coord.yw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 
0));\n\ +VXC_ReadImage(w2, weight, coord.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +sum = sum.xxxx;\n\ +sumB = sum.xxxx;\n\ +int4 sum0, sum1;\n\ +vxc_uchar4 zp;\n\ +_viv_asm(COPY, zp, weightZP, 4);\n\ +vxc_short8 coef;\n\ +VXC_DP2x8(coef, w0, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +VXC_DP2x8(coef, w0, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b);\n\ +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b);\n\ +sum += sum1;\n\ +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b);\n\ +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b);\n\ +sumB += sum1;\n\ +src1.hi = src0.lo;\n\ +VXC_DP2x8(coef, w1, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +VXC_DP2x8(coef, w1, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b);\n\ +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b);\n\ +sum += sum1;\n\ +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b);\n\ +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b);\n\ +sumB += sum1;\n\ +src0.hi = src1.lo;\n\ +src0.lo = s2;\n\ +VXC_DP2x8(coef, w2, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +float4 result0 = convert_float4(sum) * scale + outputZP;\n\ +float4 result1 = convert_float4(sumB) * scale + outputZP;\n\ +int4 dst0 = convert_int4(result0);\n\ +int4 dst1 = convert_int4(result1);\n\ +vxc_uchar16 dst;\n\ +VXC_DP2x8(dst, dst0, dst1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtractInteger_2x8);\n\ +VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void vxDW_Conv1D_U8toU8_K56_D1(\n\ +__read_only image2d_array_t input,\n\ +__read_only image2d_array_t weight,\n\ +__read_only image2d_t bias,\n\ +__write_only image2d_array_t output,\n\ + int pad,\n\ + int stride,\n\ + int dilation)\n\ +{\n\ +int4 coord_in = (int4)(get_global_id(0) * stride - pad + 16,\\\n\ +get_global_id(0) * stride - pad + 48, get_global_id(1), 0);\n\ +vxc_uchar32 src0, src1, src2;\n\ +vxc_uchar16 s0, s1, s2;\n\ +vxc_uchar16 w[4];\n\ +int4 sum, sumB;\n\ +sum = read_imagei(bias, coord_in.zw);\n\ +VXC_ReadImage(src0.hi, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(src0.lo, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(src1.lo, input, coord_in.yz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +src1.hi = src0.lo;\n\ +VXC_ReadImage(src2.lo, input, coord_in.yz, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +src2.hi = src1.lo;\n\ +int4 coord = (int4)(get_global_id(0), 16, 48, get_global_id(1));\n\ +VXC_ReadImage(w[0], weight, coord.yw, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(w[1], weight, coord.yw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(w[2], weight, coord.zw, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(w[3], weight, coord.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +sum = sum.xxxx;\n\ +sumB = sum.xxxx;\n\ +int4 sum0, sum1;\n\ +vxc_uchar4 zp;\n\ +_viv_asm(COPY, zp, weightZP, 4);\n\ +vxc_short8 coef;\n\ +VXC_DP2x8(coef, w[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +VXC_DP2x8(coef, w[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b);\n\ +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b);\n\ +sum += sum1;\n\ +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b);\n\ +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b);\n\ +sumB += sum1;\n\ +VXC_DP2x8(coef, w[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), 
uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +VXC_DP2x8(coef, w[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b);\n\ +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b);\n\ +sum += sum1;\n\ +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b);\n\ +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b);\n\ +sumB += sum1;\n\ +\n\ +VXC_DP2x8(coef, w[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +VXC_DP2x8(coef, w[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b);\n\ +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b);\n\ +sum += sum1;\n\ +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b);\n\ +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b);\n\ +sumB += sum1;\n\ +VXC_DP2x8(coef, w[3], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src2.lo, src2.hi, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src2.lo, src2.hi, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src2.lo, src2.hi, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src2.lo, src2.hi, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +\n\ +float4 result0 = convert_float4(sum) * scale + outputZP;\n\ +float4 result1 = convert_float4(sumB) * scale + outputZP;\n\ +int4 dst0 = convert_int4(result0);\n\ +int4 dst1 = convert_int4(result1);\n\ +vxc_uchar16 dst;\n\ +VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtractInteger_2x8);\n\ +VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of depthwise_conv1d_src1_vx*/ + +static const char depthwise_conv1d_src2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe0_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe1_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe2_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe3_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8SubZp_lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8SubZp_hi_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractInteger_2x8;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe4_8x2b;\n\ +_viv_uniform VXC_512Bits 
uniU8ConvS16_Stpe5_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe6_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe7_8x2b;\n\ +_viv_uniform int weightZP;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float scale;\n\ +\n\ +__kernel void vxDW_Conv1D_U8toU8_K64_D1(\n\ +__read_only image2d_array_t input,\n\ +__read_only image2d_array_t weight,\n\ +__read_only image2d_t bias,\n\ +__write_only image2d_array_t output,\n\ +int pad,\n\ +int stride,\n\ +int dilation)\n\ +{\n\ +int4 coord_in = (int4)(get_global_id(0) * stride - pad + 16,\\\n\ +get_global_id(0) * stride - pad + 48, get_global_id(1), 0);\n\ +vxc_uchar32 src0, src1, src2, src3;\n\ +vxc_uchar16 s0, s1, s2;\n\ +vxc_uchar16 w[4];\n\ +int4 sum, sumB;\n\ +sum = read_imagei(bias, coord_in.zw);\n\ +VXC_ReadImage(src0.hi, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(src0.lo, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(src1.lo, input, coord_in.yz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +src1.hi = src0.lo;\n\ +VXC_ReadImage(src2.lo, input, coord_in.yz, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +src2.hi = src1.lo;\n\ +coord_in.y += 16;\n\ +VXC_ReadImage(src3.lo, input, coord_in.yz, VXC_5BITOFFSET_XY(0, 0), \\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +src3.hi = src2.lo;\n\ +int4 coord = (int4)(get_global_id(0), 16, 48, get_global_id(1));\n\ +VXC_ReadImage(w[0], weight, coord.yw, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(w[1], weight, coord.yw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(w[2], weight, coord.zw, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(w[3], weight, coord.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +sum = sum.xxxx;\n\ +sumB = sum.xxxx;\n\ +int4 sum0, sum1;\n\ +vxc_uchar4 zp;\n\ +_viv_asm(COPY, zp, weightZP, 4);\n\ +vxc_short8 coef;\n\ +VXC_DP2x8(coef, w[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +VXC_DP2x8(coef, w[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b);\n\ +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b);\n\ +sum += sum1;\n\ +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b);\n\ +VXC_DP8x2_b(sum1, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b);\n\ +sumB += sum1;\n\ +VXC_DP2x8(coef, w[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), 
uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +VXC_DP2x8(coef, w[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b);\n\ +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b);\n\ +sum += sum1;\n\ +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b);\n\ +VXC_DP8x2_b(sum1, src1.hi, src1.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b);\n\ +sumB += sum1;\n\ +\n\ +VXC_DP2x8(coef, w[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +VXC_DP2x8(coef, w[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b);\n\ +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b);\n\ +sum += sum1;\n\ +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b);\n\ +VXC_DP8x2_b(sum1, src2.hi, src2.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b);\n\ +sumB += sum1;\n\ +\n\ +VXC_DP2x8(coef, w[3], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src3.hi, src3.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src3.hi, src3.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src3.hi, src3.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src3.hi, src3.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +VXC_DP2x8(coef, w[3], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum1, src3.hi, src3.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b);\n\ +VXC_DP8x2_b(sum1, src3.hi, src3.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b);\n\ +sum += sum1;\n\ +VXC_DP8x2_b(sum1, src3.hi, src3.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b);\n\ +VXC_DP8x2_b(sum1, src3.hi, src3.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b);\n\ +sumB += sum1;\n\ +float4 result0 = convert_float4(sum) * scale + outputZP;\n\ +float4 result1 = convert_float4(sumB) * scale + outputZP;\n\ +int4 dst0 = 
convert_int4(result0);\n\ +int4 dst1 = convert_int4(result1);\n\ +vxc_uchar16 dst;\n\ +VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtractInteger_2x8);\n\ +VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void vxDW_Conv1D_U8toU8_K80_D1(\n\ +__read_only image2d_array_t input,\n\ +__read_only image2d_array_t weight,\n\ +__read_only image2d_t bias,\n\ +__write_only image2d_array_t output,\n\ +int pad,\n\ +int stride,\n\ +int dilation)\n\ +{\n\ +int4 coord_in = (int4)(get_global_id(0) * stride - pad + 16,\\\n\ +get_global_id(0) * stride - pad + 80, get_global_id(1), 0);\n\ +vxc_uchar32 src[5];\n\ +vxc_uchar16 s0, s1, s2;\n\ +vxc_uchar16 w[5];\n\ +int4 sum, sumB;\n\ +sum = read_imagei(bias, coord_in.zw);\n\ +VXC_ReadImage(src[0].hi, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(src[0].lo, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +coord_in.x += 32;\n\ +VXC_ReadImage(src[1].lo, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +src[1].hi = src[0].lo;\n\ +VXC_ReadImage(src[2].lo, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +src[2].hi = src[1].lo;\n\ +VXC_ReadImage(src[3].lo, input, coord_in.yz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +src[3].hi = src[2].lo;\n\ +VXC_ReadImage(src[4].lo, input, coord_in.yz, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +src[4].hi = src[3].lo;\n\ +int4 coord = (int4)(get_global_id(0), 16, 64, get_global_id(1));\n\ +VXC_ReadImage(w[0], weight, coord.yw, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(w[1], weight, coord.yw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +coord.y += 16;\n\ +VXC_ReadImage(w[2], weight, coord.yw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(w[3], weight, coord.zw, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(w[4], weight, coord.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +sum = sum.xxxx;\n\ +sumB = sum.xxxx;\n\ +int4 sum0, sum1;\n\ +vxc_uchar4 zp;\n\ +_viv_asm(COPY, zp, weightZP, 4);\n\ +vxc_short8 coef;\n\ +VXC_DP2x8(coef, w[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src[0].hi, src[0].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src[0].hi, src[0].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src[0].hi, src[0].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src[0].hi, src[0].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +VXC_DP2x8(coef, w[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum1, src[0].hi, src[0].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b);\n\ +VXC_DP8x2_b(sum1, src[0].hi, src[0].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b);\n\ +sum += sum1;\n\ +VXC_DP8x2_b(sum1, src[0].hi, src[0].lo, coef, VXC_MODIFIER(0, 1, 0, 
VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b);\n\ +VXC_DP8x2_b(sum1, src[0].hi, src[0].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b);\n\ +sumB += sum1;\n\ +VXC_DP2x8(coef, w[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src[1].hi, src[1].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src[1].hi, src[1].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src[1].hi, src[1].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src[1].hi, src[1].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +VXC_DP2x8(coef, w[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum1, src[1].hi, src[1].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b);\n\ +VXC_DP8x2_b(sum1, src[1].hi, src[1].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b);\n\ +sum += sum1;\n\ +VXC_DP8x2_b(sum1, src[1].hi, src[1].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b);\n\ +VXC_DP8x2_b(sum1, src[1].hi, src[1].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b);\n\ +sumB += sum1;\n\ +\n\ +VXC_DP2x8(coef, w[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src[2].hi, src[2].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src[2].hi, src[2].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src[2].hi, src[2].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src[2].hi, src[2].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +VXC_DP2x8(coef, w[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum1, src[2].hi, src[2].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b);\n\ +VXC_DP8x2_b(sum1, src[2].hi, src[2].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b);\n\ +sum += sum1;\n\ +VXC_DP8x2_b(sum1, src[2].hi, src[2].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b);\n\ +VXC_DP8x2_b(sum1, src[2].hi, src[2].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b);\n\ +sumB += sum1;\n\ +\n\ +VXC_DP2x8(coef, w[3], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src[3].hi, src[3].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src[3].hi, src[3].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src[3].hi, src[3].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src[3].hi, src[3].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +VXC_DP2x8(coef, w[3], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum1, src[3].hi, src[3].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b);\n\ +VXC_DP8x2_b(sum1, src[3].hi, src[3].lo, coef, VXC_MODIFIER(2, 3, 
0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b);\n\ +sum += sum1;\n\ +VXC_DP8x2_b(sum1, src[3].hi, src[3].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b);\n\ +VXC_DP8x2_b(sum1, src[3].hi, src[3].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b);\n\ +sumB += sum1;\n\ +\n\ +VXC_DP2x8(coef, w[4], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src[4].hi, src[4].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src[4].hi, src[4].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sum += sum0;\n\ +VXC_DP8x2_b(sum0, src[4].hi, src[4].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src[4].hi, src[4].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +VXC_DP2x8(coef, w[4], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum1, src[4].hi, src[4].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe2_8x2b);\n\ +VXC_DP8x2_b(sum1, src[4].hi, src[4].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe3_8x2b);\n\ +sum += sum1;\n\ +VXC_DP8x2_b(sum1, src[4].hi, src[4].lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe6_8x2b);\n\ +VXC_DP8x2_b(sum1, src[4].hi, src[4].lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe7_8x2b);\n\ +sumB += sum1;\n\ +\n\ +float4 result0 = convert_float4(sum) * scale + outputZP;\n\ +float4 result1 = convert_float4(sumB) * scale + outputZP;\n\ +int4 dst0 = convert_int4(result0);\n\ +int4 dst1 = convert_int4(result1);\n\ +vxc_uchar16 dst;\n\ +VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtractInteger_2x8);\n\ +VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of depthwise_conv1d_src2_vx*/ + +static const char depthwise_conv1d_src3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe0_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe1_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8SubZp_lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8SubZp_hi_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractInteger_2x8;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe4_8x2b;\n\ +_viv_uniform VXC_512Bits uniU8ConvS16_Stpe5_8x2b;\n\ +_viv_uniform int weightZP;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float scale;\n\ +\n\ +__kernel void vxDW_Conv1D_U8toU8_K88_D2(\n\ +__read_only image2d_array_t input,\n\ +__read_only image2d_array_t weight,\n\ +__read_only image2d_t bias,\n\ +__write_only image2d_array_t output,\n\ +int pad,\n\ +int stride,\n\ +int dilation)\n\ +{\n\ +int4 coord_in = (int4)(get_global_id(0) * stride - pad + 16,\\\n\ +get_global_id(0) * stride - pad + 48, get_global_id(1), 0);\n\ +\n\ +vxc_uchar32 src0, src1;\n\ +vxc_uchar16 inData[12];\n\ +vxc_uchar16 wData[6];\n\ +int4 sumA, sumB;\n\ +\n\ +sumA = read_imagei(bias, coord_in.zw);\n\ +\n\ +VXC_ReadImage(inData[0], input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(inData[1], input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(inData[2], input, coord_in.yz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(inData[3], input, coord_in.yz, 
VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +coord_in.xy += 64;\n\ +VXC_ReadImage(inData[4], input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(inData[5], input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(inData[6], input, coord_in.yz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(inData[7], input, coord_in.yz, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +coord_in.xy += 64;\n\ +VXC_ReadImage(inData[8], input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(inData[9], input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(inData[10], input, coord_in.yz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(inData[11], input, coord_in.yz, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +int4 coord = (int4)(get_global_id(0), 16, 48, get_global_id(1));\n\ +\n\ +VXC_ReadImage(wData[0], weight, coord.yw, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(wData[1], weight, coord.yw, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(wData[2], weight, coord.zw, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(wData[3], weight, coord.zw, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +coord.yz += 64;\n\ +VXC_ReadImage(wData[4], weight, coord.yw, VXC_5BITOFFSET_XY(-16, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +VXC_ReadImage(wData[5], weight, coord.yw, VXC_5BITOFFSET_XY(0, 0),\\\n\ +VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +sumA = sumA.xxxx;\n\ +sumB = sumA;\n\ +\n\ +int4 sum0, sum1;\n\ +vxc_uchar4 zp;\n\ +_viv_asm(COPY, zp, weightZP, 4);\n\ +\n\ +vxc_short8 coef;\n\ +src0.hi = inData[0];\n\ +src0.lo = inData[1];\n\ +\n\ +VXC_DP2x8(coef, wData[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sumA += sum0;\n\ +\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +\n\ +src0.hi = src0.lo;\n\ +src0.lo = inData[2];\n\ +\n\ +VXC_DP2x8(coef, wData[0], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sumA += sum0;\n\ +\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +src0.hi = inData[2];\n\ +src0.lo = inData[3];\n\ +VXC_DP2x8(coef, 
wData[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sumA += sum0;\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +\n\ +src0.hi = src0.lo;\n\ +src0.lo = inData[4];\n\ +\n\ +VXC_DP2x8(coef, wData[1], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sumA += sum0;\n\ +\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +src0.hi = inData[4];\n\ +src0.lo = inData[5];\n\ +\n\ +VXC_DP2x8(coef, wData[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sumA += sum0;\n\ +\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +\n\ +src0.hi = src0.lo;\n\ +src0.lo = inData[6];\n\ +\n\ +VXC_DP2x8(coef, wData[2], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sumA += sum0;\n\ +\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +src0.hi = inData[6];\n\ +src0.lo = inData[7];\n\ +VXC_DP2x8(coef, wData[3], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sumA += sum0;\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +src0.hi = src0.lo;\n\ +src0.lo = inData[8];\n\ +VXC_DP2x8(coef, wData[3], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, 
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sumA += sum0;\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +\n\ +src0.hi = inData[8];\n\ +src0.lo = inData[9];\n\ +VXC_DP2x8(coef, wData[4], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sumA += sum0;\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +src0.hi = src0.lo;\n\ +src0.lo = inData[10];\n\ +VXC_DP2x8(coef, wData[4], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sumA += sum0;\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +src0.hi = inData[10];\n\ +src0.lo = inData[11];\n\ +VXC_DP2x8(coef, wData[5], zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe0_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe1_8x2b);\n\ +sumA += sum0;\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe4_8x2b);\n\ +VXC_DP8x2_b(sum0, src0.hi, src0.lo, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8ConvS16_Stpe5_8x2b);\n\ +sumB += sum0;\n\ +float4 result0 = convert_float4(sumA) * scale + outputZP;\n\ +float4 result1 = convert_float4(sumB) * scale + outputZP;\n\ +int4 dst0 = convert_int4(result0);\n\ +int4 dst1 = convert_int4(result1);\n\ +vxc_uchar16 dst;\n\ +VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtractInteger_2x8);\n\ +VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +"; /* end of depthwise_conv1d_src3_vx*/ + +static const char detect_post_box_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +_viv_uniform VXC_512Bits uniDataMerge_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZptoF32Conv0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZptoF32Conv1_4x4;\n\ +_viv_uniform float logE;\n\ +_viv_uniform int input0_ZP;\n\ +_viv_uniform int input1_ZP;\n\ +\n\ +float exp_(float x)\n\ +{\n\ + x *= logE;\n\ + x = exp2(x);\n\ + return x;\n\ +}\n\ +\n\ +__kernel void detect_post_box_F32_F32toF32(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float inv_scale_y,\n\ + float inv_scale_x,\n\ + float inv_scale_h,\n\ + float inv_scale_w)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + float4 src0;\n\ + 
float4 src1;\n\ + float4 dst;\n\ + float4 tmp0, tmp1, tmp2, tmp3;\n\ + uint4 tmp5, tmp6, tmp7;\n\ + src0 = read_imagef(input0, coord);\n\ + src1 = read_imagef(input1, coord.xy);\n\ + tmp0.x = src1.x + src1.z * src0.x * inv_scale_y;\n\ + tmp0.y = src1.y + src1.w * src0.y * inv_scale_x;\n\ + tmp1.x = src1.z * exp_(src0.z * inv_scale_h) * 0.5f;\n\ + tmp1.y = src1.w * exp_(src0.w * inv_scale_w) * 0.5f;\n\ + tmp2 = tmp0 - tmp1;\n\ + tmp3 = tmp0 + tmp1;\n\ + _viv_asm(COPY, tmp5, tmp2, 16);\n\ + _viv_asm(COPY, tmp6, tmp3, 16);\n\ + VXC_DP4x4(tmp7, tmp5, tmp6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDataMerge_4x4);\n\ + _viv_asm(COPY, dst, tmp7, 16);\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void detect_post_box_U8_U8toF32(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float inv_scale_y,\n\ + float inv_scale_x,\n\ + float inv_scale_h,\n\ + float inv_scale_w)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + float4 src0;\n\ + float4 src1;\n\ + float4 dst;\n\ + float4 tmp0, tmp1, tmp2, tmp3;\n\ + vxc_uchar8 in0 = 0, in1 = 0;\n\ + vxc_short8 zp0 = (short)input0_ZP;\n\ + vxc_short8 zp1 = (short)input1_ZP;\n\ + VXC_ReadImage2DArray(in0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(src0, in0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniU8SubZptoF32Conv0_4x4);\n\ + VXC_DP4x4(src1, in1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniU8SubZptoF32Conv1_4x4);\n\ + tmp0.x = src1.x + src1.z * src0.x * inv_scale_y;\n\ + tmp0.y = src1.y + src1.w * src0.y * inv_scale_x;\n\ + tmp1.x = src1.z * exp_(src0.z * inv_scale_h) * 0.5f;\n\ + tmp1.y = src1.w * exp_(src0.w * inv_scale_w) * 0.5f;\n\ + dst.xy = tmp0.xy - tmp1.xy;\n\ + dst.zw = tmp0.xy + tmp1.xy;\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +"; /* end of detect_post_box_vx*/ + +static const char eltwise_unary_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +float4 eltwise_unary_sin(float4 x)\n\ +{\n\ + return native_sin(x);\n\ +}\n\ +\n\ +#define logE (1.44269502f)\n\ +#define twoLogE (logE * 2.0f)\n\ +float4 eltwise_unary_exp(float4 x)\n\ +{\n\ + x *= logE;\n\ + x = exp2(x);\n\ + return x;\n\ +}\n\ +\n\ +#define rlogE (0.693147182f)\n\ +float4 eltwise_unary_log(float4 x)\n\ +{\n\ + x = log2(x);\n\ + return x * rlogE;\n\ +}\n\ +\n\ +float4 eltwise_unary_elu(float4 val)\n\ +{\n\ + float4 x = val * logE;\n\ + x = exp2(x) - 1;\n\ +\n\ + return val < 0 ? 
x : val;\n\ +}\n\ +\n\ +float4 eltwise_unary_neg(float4 x)\n\ +{\n\ + return x * -1;\n\ +}\n\ +\n\ +float4 eltwise_unary_hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +\n\ +float4 _softrelu(float4 x)\n\ +{\n\ + x *= logE;\n\ + x = exp2(x);\n\ + x += 1;\n\ + x = log2(x);\n\ + return x * rlogE;\n\ +}\n\ +\n\ +float4 _tanh(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return (2 * x - 1);\n\ +}\n\ +\n\ +float4 eltwise_unary_mish(float4 x)\n\ +{\n\ + float4 y = _softrelu(x);\n\ + x = x * _tanh(y);\n\ + return x;\n\ +}\n\ +\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform float inputTail;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ +\n\ +#define ELTSISE_UNARY_2D(func_name, src_type_name, dst_type_name, src_type, \\\n\ + src_copy_type, convert_type, dst_type, dst_copy_type) \\\n\ + __kernel void func_name##_##src_type_name##to##dst_type_name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int type \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type src0; \\\n\ + src_copy_type src1; \\\n\ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, src0, 16); \\\n\ + \\\n\ + float4 vecA; \\\n\ + float4 vecB; \\\n\ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\ + VXC_DP4x4(vecB, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \\\n\ + vecA = vecA * inputScale + inputTail; \\\n\ + vecB = vecB * inputScale + inputTail; \\\n\ + vecA = eltwise_unary_##func_name(vecA); \\\n\ + vecB = eltwise_unary_##func_name(vecB); \\\n\ + vecA = vecA * outputScale + outputZP; \\\n\ + vecB = vecB * outputScale + outputZP; \\\n\ + \\\n\ + convert_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, vecA); \\\n\ + _viv_asm(CONV_RTE, dst1, vecB); \\\n\ + dst_type dst2; \\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + dst_copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst2, 16); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +//EXP\n\ +ELTSISE_UNARY_2D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(exp, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//SIN\n\ +ELTSISE_UNARY_2D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(sin, 
F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(sin, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//LOG\n\ +ELTSISE_UNARY_2D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(log, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//ELU\n\ +ELTSISE_UNARY_2D(elu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(elu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(elu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(elu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(elu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(elu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(elu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(elu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(elu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(elu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//NEG\n\ +ELTSISE_UNARY_2D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//MISH\n\ +ELTSISE_UNARY_2D(mish, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(mish, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ 
+ELTSISE_UNARY_2D(mish, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(mish, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(mish, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(mish, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(mish, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(mish, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(mish, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(mish, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//HARD_SIGMOID\n\ +ELTSISE_UNARY_2D(hard_sigmoid, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(hard_sigmoid, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(hard_sigmoid, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(hard_sigmoid, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(hard_sigmoid, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(hard_sigmoid, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +#define ELTSISE_UNARY_BF16_2D(func_name) \\\n\ + __kernel void func_name##_BF16toBF16_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int type \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_ushort8 src0, src1, dst; \\\n\ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 vecA; \\\n\ + float4 vecB; \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecA, src1, 16); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, vecB, src1, 16); \\\n\ + vecA = eltwise_unary_##func_name(vecA); \\\n\ + vecB = eltwise_unary_##func_name(vecB); \\\n\ + \\\n\ + _viv_asm(COPY, src0, vecA, 16); \\\n\ + _viv_asm(COPY, src1, vecB, 16); \\\n\ + \\\n\ + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +//EXP\n\ +ELTSISE_UNARY_BF16_2D(exp)\n\ +//SIN\n\ +ELTSISE_UNARY_BF16_2D(sin)\n\ +//LOG\n\ +ELTSISE_UNARY_BF16_2D(log)\n\ +//ELU\n\ +ELTSISE_UNARY_BF16_2D(elu)\n\ +//NEG\n\ +ELTSISE_UNARY_BF16_2D(neg)\n\ +//MISH\n\ +ELTSISE_UNARY_BF16_2D(mish)\n\ +//HARD_SIGMOID\n\ +ELTSISE_UNARY_BF16_2D(hard_sigmoid)\n\ +"; /* end of eltwise_unary_2d_vx*/ + +static const char eltwise_unary_3d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +float4 
eltwise_unary_sin(float4 x)\n\ +{\n\ + return native_sin(x);\n\ +}\n\ +\n\ +#define logE (1.44269502f)\n\ +#define twoLogE (logE * 2.0f)\n\ +float4 eltwise_unary_exp(float4 x)\n\ +{\n\ + x *= logE;\n\ + x = exp2(x);\n\ + return x;\n\ +}\n\ +\n\ +#define rlogE (0.693147182f)\n\ +float4 eltwise_unary_log(float4 x)\n\ +{\n\ + x = log2(x);\n\ + return x * rlogE;\n\ +}\n\ +\n\ +float4 eltwise_unary_elu(float4 val)\n\ +{\n\ + float4 x = val * logE;\n\ + x = exp2(x) - 1;\n\ +\n\ + return val < 0 ? x : val;\n\ +}\n\ +\n\ +float4 eltwise_unary_neg(float4 x)\n\ +{\n\ + return x * -1;\n\ +}\n\ +\n\ +float4 eltwise_unary_hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +\n\ +float4 _softrelu(float4 x)\n\ +{\n\ + x *= logE;\n\ + x = exp2(x);\n\ + x += 1;\n\ + x = log2(x);\n\ + return x * rlogE;\n\ +}\n\ +\n\ +float4 _tanh(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return (2 * x - 1);\n\ +}\n\ +\n\ +float4 eltwise_unary_mish(float4 x)\n\ +{\n\ + float4 y = _softrelu(x);\n\ + x = x * _tanh(y);\n\ + return x;\n\ +}\n\ +\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform float inputTail;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ +\n\ +#define ELTSISE_UNARY_3D(func_name, src_type_name, dst_type_name, src_type, \\\n\ + src_copy_type, convert_type, dst_type, dst_copy_type) \\\n\ +__kernel void func_name##_##src_type_name##to##dst_type_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int type \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + src_type src0; \\\n\ + src_copy_type src1; \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, src0, 16); \\\n\ + \\\n\ + float4 vecA; \\\n\ + float4 vecB; \\\n\ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\ + VXC_DP4x4(vecB, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \\\n\ + vecA = vecA * inputScale + inputTail; \\\n\ + vecB = vecB * inputScale + inputTail; \\\n\ + vecA = eltwise_unary_##func_name(vecA); \\\n\ + vecB = eltwise_unary_##func_name(vecB); \\\n\ + vecA = vecA * outputScale + outputZP; \\\n\ + vecB = vecB * outputScale + outputZP; \\\n\ + \\\n\ + convert_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, vecA); \\\n\ + _viv_asm(CONV_RTE, dst1, vecB); \\\n\ + dst_type dst2; \\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + dst_copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst2, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +//EXP\n\ +ELTSISE_UNARY_3D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(exp, U8, U8, 
vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//SIN\n\ +ELTSISE_UNARY_3D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(sin, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(sin, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//LOG\n\ +ELTSISE_UNARY_3D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(log, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//ELU\n\ +ELTSISE_UNARY_3D(elu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(elu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(elu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(elu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(elu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(elu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(elu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(elu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(elu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(elu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//NEG\n\ +ELTSISE_UNARY_3D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(neg, 
U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//MISH\n\ +ELTSISE_UNARY_3D(mish, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(mish, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(mish, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(mish, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(mish, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(mish, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(mish, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(mish, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(mish, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(mish, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//HARD_SIGMOID\n\ +ELTSISE_UNARY_3D(hard_sigmoid, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(hard_sigmoid, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(hard_sigmoid, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(hard_sigmoid, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(hard_sigmoid, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(hard_sigmoid, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +#define ELTSISE_UNARY_BF16(func_name) \\\n\ + __kernel void func_name##_BF16toBF16( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int type \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + vxc_ushort8 src0, src1, dst; \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 vecA; \\\n\ + float4 vecB; \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecA, src1, 16); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, vecB, src1, 16); \\\n\ + vecA = eltwise_unary_##func_name(vecA); \\\n\ + vecB = eltwise_unary_##func_name(vecB); \\\n\ + \\\n\ + _viv_asm(COPY, src0, vecA, 16); \\\n\ + _viv_asm(COPY, src1, vecB, 16); \\\n\ + \\\n\ + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + 
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +//EXP\n\ +ELTSISE_UNARY_BF16(exp)\n\ +//SIN\n\ +ELTSISE_UNARY_BF16(sin)\n\ +//LOG\n\ +ELTSISE_UNARY_BF16(log)\n\ +//ELU\n\ +ELTSISE_UNARY_BF16(elu)\n\ +//NEG\n\ +ELTSISE_UNARY_BF16(neg)\n\ +//MISH\n\ +ELTSISE_UNARY_BF16(mish)\n\ +//HARD_SIGMOID\n\ +ELTSISE_UNARY_BF16(hard_sigmoid)"; /* end of eltwise_unary_3d_vx*/ + +static const char floordiv_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertFstToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecToFp32_4x4;\n\ +\n\ +_viv_uniform float in_scale0;\n\ +_viv_uniform float in_scale1;\n\ +_viv_uniform float out_scale;\n\ +_viv_uniform float in0Tail;\n\ +_viv_uniform float in1Tail;\n\ +_viv_uniform float out_zp;\n\ +\n\ +#define FLOORDIV_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\\\n\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \\\n\ + save_type data; \\\n\ + read_type read_data0, read_data1; \\\n\ + copy_type tmpData0, tmpData1; \\\n\ + vxc_float4 in0Val1, in0Val2, in1Val1, in1Val2; \\\n\ + vxc_float4 tmpVal1, tmpVal2; \\\n\ + dst_type tmpOut1, tmpOut2; \\\n\ + read_fun(read_data0, input0, coord, VXC_5BITOFFSET_XY(0,0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmpData0, read_data0, 16); \\\n\ + read_fun(read_data1, input1, coord, VXC_5BITOFFSET_XY(0,0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmpData1, read_data1, 16); \\\n\ + VXC_DP4x4(in0Val1, tmpData0, tmpData0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \\\n\ + VXC_DP4x4(in0Val2, tmpData0, tmpData0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \\\n\ + VXC_DP4x4(in1Val1, tmpData1, tmpData1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \\\n\ + VXC_DP4x4(in1Val2, tmpData1, tmpData1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \\\n\ + in0Val1 = in0Val1 * IN0_SCALE + IN0_TAIL; \\\n\ + in0Val2 = in0Val2 * IN0_SCALE + IN0_TAIL; \\\n\ + in1Val1 = in1Val1 * IN1_SCALE + IN1_TAIL; \\\n\ + in1Val2 = in1Val2 * IN1_SCALE + IN1_TAIL; \\\n\ + tmpVal1 = floor(in0Val1 / in1Val1) * OUT_SCALE + OUT_OFFSET; \\\n\ + tmpVal2 = floor(in0Val2 / in1Val2) * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, tmpOut1, tmpVal1); \\\n\ + _viv_asm(conv_mode, tmpOut2, tmpVal2); \\\n\ + VXC_DP2x8(data, tmpOut1, tmpOut2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +\n\ +#define TENSOR_FLOORDIV(src0_name, src1_name, dst_name, dst_type, save_type, read_type, copy_type, \\\n\ + conv_mode, IN0_SCALE, IN0_TAIL, IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET) \\\n\ +__kernel void floordiv_##src0_name##src1_name##to##dst_name \\\n\ + ( \\\n\ + image2d_array_t input0, \\\n\ + image2d_array_t input1, \\\n\ + image2d_array_t output \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + FLOORDIV_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\\\n\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \\\n\ +}\n\ +\n\ +\n\ +TENSOR_FLOORDIV(F16, F16, F16, half4, vxc_short8, vxc_short8,\\\n\ + vxc_half8, CONV, 1, 0, 1, 0, 1, 0)\n\ +TENSOR_FLOORDIV(F16, F16, I16, 
short4, vxc_short8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0)\n\ +TENSOR_FLOORDIV(F16, F16, I8, char4, vxc_char8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0)\n\ +TENSOR_FLOORDIV(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\ +\n\ +TENSOR_FLOORDIV(I16, I16, I16, short4, vxc_short8, vxc_short8,\\\n\ + vxc_short8, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0)\n\ +TENSOR_FLOORDIV(I16, I16, F16, half4, vxc_short8, vxc_short8,\\\n\ + vxc_short8, CONV, in_scale0, 0, in_scale1, 0, 1, 0)\n\ +\n\ +TENSOR_FLOORDIV(I8, I8, I8, char4, vxc_char8, vxc_char16,\\\n\ + vxc_char16, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0)\n\ +TENSOR_FLOORDIV(I8, I8, F16, half4, vxc_short8, vxc_char16,\\\n\ + vxc_char16, CONV, in_scale0, 0, in_scale1, 0, 1, 0)\n\ +\n\ +TENSOR_FLOORDIV(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\\\n\ + vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\ +TENSOR_FLOORDIV(U8, U8, F16, half4, vxc_short8, vxc_uchar16,\\\n\ + vxc_uchar16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\ +\n\ +\n\ +\n\ +#define TENSOR_FLOORDIV_2D(src0_name, src1_name, dst_name, dst_type, save_type, read_type, copy_type, \\\n\ + conv_mode, IN0_SCALE, IN0_TAIL, IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET) \\\n\ +__kernel void floordiv_##src0_name##src1_name##to##dst_name##_2D \\\n\ + ( \\\n\ + image2d_array_t input0, \\\n\ + image2d_array_t input1, \\\n\ + image2d_array_t output \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + FLOORDIV_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\\\n\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, VXC_ReadImage, VXC_WriteImage); \\\n\ +}\n\ +\n\ +\n\ +TENSOR_FLOORDIV_2D(F16, F16, F16, half4, vxc_short8, vxc_short8,\\\n\ + vxc_half8, CONV, 1, 0, 1, 0, 1, 0)\n\ +TENSOR_FLOORDIV_2D(F16, F16, I16, short4, vxc_short8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0)\n\ +TENSOR_FLOORDIV_2D(F16, F16, I8, char4, vxc_char8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, 0)\n\ +TENSOR_FLOORDIV_2D(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\ +\n\ +TENSOR_FLOORDIV_2D(I16, I16, I16, short4, vxc_short8, vxc_short8,\\\n\ + vxc_short8, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0)\n\ +TENSOR_FLOORDIV_2D(I16, I16, F16, half4, vxc_short8, vxc_short8,\\\n\ + vxc_short8, CONV, in_scale0, 0, in_scale1, 0, 1, 0)\n\ +\n\ +TENSOR_FLOORDIV_2D(I8, I8, I8, char4, vxc_char8, vxc_char16,\\\n\ + vxc_char16, CONV_SAT_RTE, in_scale0, 0, in_scale1, 0, out_scale, 0)\n\ +TENSOR_FLOORDIV_2D(I8, I8, F16, half4, vxc_short8, vxc_char16,\\\n\ + vxc_char16, CONV, in_scale0, 0, in_scale1, 0, 1, 0)\n\ +\n\ +TENSOR_FLOORDIV_2D(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\\\n\ + vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\ +TENSOR_FLOORDIV_2D(U8, U8, F16, half4, vxc_short8, vxc_uchar16,\\\n\ + vxc_uchar16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +#define FLOORDIV_BF16_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 read_data0, read_data1, vec0; \\\n\ + vxc_float4 in0Val1, in0Val2, in1Val1, in1Val2; \\\n\ + 
vxc_float4 tmpVal1, tmpVal2; \\\n\ + vxc_ushort8 dst0, dst1; \\\n\ + vxc_ushort8 vect; \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + read_fun(read_data0, input0, coord, VXC_5BITOFFSET_XY(0,0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(vec0, read_data0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, in0Val1, vec0, 16); \\\n\ + VXC_DP2x8(vec0, read_data0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, in0Val2, vec0, 16); \\\n\ + read_fun(read_data1, input1, coord, VXC_5BITOFFSET_XY(0,0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(vec0, read_data1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, in1Val1, vec0, 16); \\\n\ + VXC_DP2x8(vec0, read_data1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, in1Val2, vec0, 16); \\\n\ + tmpVal1 = floor(in0Val1 / in1Val1); \\\n\ + tmpVal2 = floor(in0Val2 / in1Val2); \\\n\ + _viv_asm(COPY, dst0, tmpVal1, 16); \\\n\ + _viv_asm(COPY, dst1, tmpVal2, 16); \\\n\ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + write_fun(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void floordiv_BF16BF16toBF16\n\ + (\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + FLOORDIV_BF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray);\n\ +}\n\ +\n\ +__kernel void floordiv_BF16BF16toBF16_2D\n\ + (\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + FLOORDIV_BF16_PROCESS(VXC_ReadImage, VXC_WriteImage);\n\ +}"; /* end of floordiv_vx*/ + +static const char gather_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int indices_num;\n\ +\n\ +__kernel void gather_I8toI8(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord_in.xyyy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + vxc_char16 src;\n\ + VXC_ReadImage(src, input0, coord_in.zw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_U8toU8(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord_in.xyyy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + vxc_uchar16 src;\n\ + VXC_ReadImage(src, input0, coord_in.zw, 0, VXC_MODIFIER(0, 15, 0, 
VXC_RM_TowardZero, 0));\n\ +\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_I16toI16(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ +\n\ +\n\ + int4 indice = read_imagei(input1, coord_in.xyyy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage(src, input0, coord_in.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_F16toF16(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ +\n\ +\n\ + int4 indice = read_imagei(input1, coord_in.xyyy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage(src, input0, coord_in.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of gather_vx*/ + +static const char gather_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int indices_num;\n\ +\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8;\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +\n\ +#define GATHER_8BITS_TO_F16(src0_type_name, read_type) \\\n\ +__kernel void gather_##src0_type_name##toF16( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + \\\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0); \\\n\ + int4 indice = read_imagei(input1, coord_in.xyyy); \\\n\ + coord_in.w = gidz * axis_num + indice.x; \\\n\ + \\\n\ + read_type src; \\\n\ + VXC_ReadImage(src, input0, coord_in.zw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy); \\\n\ + vxc_half8 src0, src1; \\\n\ + vxc_short8 dst0, dst1; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_DP2x8(src1,src,ms0, VXC_MODIFIER(0,7,8, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + _viv_asm(COPY, dst1, src1, 16); \\\n\ + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + 
VXC_WriteImage(output, coord, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_8BITS_TO_F16(U8, vxc_uchar16)\n\ +GATHER_8BITS_TO_F16(I8, vxc_char16)\n\ +\n\ +#define GATHER_F16_TO_QINT(src1_type_name, write_type) \\\n\ +__kernel void gather_F16to##src1_type_name( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0); \\\n\ + \\\n\ + int4 indice = read_imagei(input1, coord_in.xyyy); \\\n\ + coord_in.w = gidz * axis_num + indice.x; \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + VXC_ReadImage(src, input0, coord_in.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy); \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_F16_TO_QINT(U8, vxc_uchar16)\n\ +GATHER_F16_TO_QINT(I8, vxc_char16)\n\ +GATHER_F16_TO_QINT(I16, vxc_short8)\n\ +\n\ +__kernel void gather_I16toF16(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord_in.xyyy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage(src, input0, coord_in.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ +\n\ + vxc_half8 src0;\n\ + vxc_short8 dst0;\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ + _viv_asm(COPY, dst0, src0, 16);\n\ +\n\ + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of gather_mix_vx*/ + +static const char gather_nd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void gather_nd_I8toI8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + coord.w = indice.x;\n\ +\n\ + vxc_char16 src;\n\ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_U8toU8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 
0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + coord.w = indice.x;\n\ +\n\ + vxc_uchar16 src;\n\ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_I16toI16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + coord.w = indice.x;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_F16toF16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + coord.w = indice.x;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of gather_nd_vx*/ + +static const char gather_nd_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void gather_nd_I8toI8_2D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + indice.x = indice.x * block_size + gidx;\n\ +\n\ + vxc_char16 src;\n\ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_U8toU8_2D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + indice.x = indice.x * block_size + gidx;\n\ +\n\ + vxc_uchar16 src;\n\ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_I16toI16_2D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + indice.x = indice.x * block_size + gidx;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 
0));\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_F16toF16_2D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + indice.x = indice.x * block_size + gidx;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of gather_nd_2d_vx*/ + +static const char gather_nd_2d_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt8toFp16_2x8;\n\ +\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8;\n\ +_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8;\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +\n\ +_viv_uniform VXC_512Bits uinConvertFp16ToInt16_2x8;\n\ +\n\ +#define GATHER_ND_QINT_TO_F16_2D(src0_type_name, read_type) \\\n\ +__kernel void gather_nd_##src0_type_name##toF16_2D( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + int4 indice = read_imagei(input1, coord.xy); \\\n\ + indice.x = indice.x * block_size + gidx; \\\n\ + \\\n\ + read_type src; \\\n\ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_half8 src0; \\\n\ + vxc_short8 dst0; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + VXC_WriteImage(output, coord.zy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_ND_QINT_TO_F16_2D(U8, vxc_uchar16)\n\ +GATHER_ND_QINT_TO_F16_2D(I8, vxc_char16)\n\ +GATHER_ND_QINT_TO_F16_2D(I16, vxc_short8)\n\ +\n\ +#define GATHER_ND_F16_TO_QINT_2D(src1_type_name, write_type) \\\n\ +__kernel void gather_nd_F16to##src1_type_name##_2D( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + int4 indice = read_imagei(input1, coord.xy); \\\n\ + indice.x = indice.x * block_size + gidx; \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1),uniConvertFp16toU8_2x8); \\\n\ + VXC_WriteImage(output, coord.zy, dst, VXC_MODIFIER(0, 
0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_ND_F16_TO_QINT_2D(U8, vxc_uchar16)\n\ +GATHER_ND_F16_TO_QINT_2D(I8, vxc_char16)\n\ +GATHER_ND_F16_TO_QINT_2D(I16, vxc_short8)\n\ +"; /* end of gather_nd_2d_mix_vx*/ + +static const char gather_nd_3d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void gather_nd_I8toI8_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.w = 0;\n\ +\n\ + vxc_char16 src;\n\ + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_U8toU8_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.w = 0;\n\ +\n\ + vxc_uchar16 src;\n\ + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_I16toI16_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.w = 0;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_F16toF16_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.w = 0;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of gather_nd_3d_vx*/ + +static const char gather_nd_3d_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt8toFp16_2x8;\n\ +\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8;\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +\n\ 
+#define GATHER_ND_QINT_TO_F16_3D(src0_type_name, read_type) \\\n\ +__kernel void gather_nd_##src0_type_name##toF16_3D( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + int4 indice = read_imagei(input1, coord.xy); \\\n\ + indice.x = indice.x * block_size + gidx; \\\n\ + indice.w = 0; \\\n\ + \\\n\ + read_type src; \\\n\ + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_half8 src0; \\\n\ + vxc_short8 dst0; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + VXC_WriteImage(output, coord.zy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_ND_QINT_TO_F16_3D(U8, vxc_uchar16)\n\ +GATHER_ND_QINT_TO_F16_3D(I8, vxc_char16)\n\ +GATHER_ND_QINT_TO_F16_3D(I16, vxc_short8)\n\ +\n\ +#define GATHER_ND_F16_TO_QINT_3D(src1_type_name, write_type) \\\n\ +__kernel void gather_nd_F16to##src1_type_name##_3D( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + int4 indice = read_imagei(input1, coord.xy); \\\n\ + indice.x = indice.x * block_size + gidx; \\\n\ + indice.w = 0; \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1), uniConvertFp16toU8_2x8); \\\n\ + VXC_WriteImage(output, coord.zy, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_ND_F16_TO_QINT_3D(U8, vxc_uchar16)\n\ +GATHER_ND_F16_TO_QINT_3D(I8, vxc_char16)\n\ +GATHER_ND_F16_TO_QINT_3D(I16, vxc_short8)\n\ +\n\ +"; /* end of gather_nd_3d_mix_vx*/ + +static const char gather_nd_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt8toFp16_2x8;\n\ +\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8;\n\ +_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8;\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +\n\ +_viv_uniform VXC_512Bits uinConvertFp16ToInt16_2x8;\n\ +\n\ +#define GATHER_ND_QINT_TO_F16_1D(src0_type_name, read_type) \\\n\ +__kernel void gather_nd_##src0_type_name##toF16_1D( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + int4 indice = read_imagei(input1, coord.xy); \\\n\ + coord.w = indice.x; 
\\\n\ + \\\n\ + read_type src; \\\n\ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_half8 src0; \\\n\ + vxc_short8 dst0; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + VXC_WriteImage(output, coord.zy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_ND_QINT_TO_F16_1D(U8, vxc_uchar16)\n\ +GATHER_ND_QINT_TO_F16_1D(I8, vxc_char16)\n\ +GATHER_ND_QINT_TO_F16_1D(I16, vxc_short8)\n\ +\n\ +#define GATHER_ND_F16_TO_QINT_1D(src1_type_name, write_type) \\\n\ +__kernel void gather_nd_F16to##src1_type_name##_1D( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + int4 indice = read_imagei(input1, coord.xy); \\\n\ + coord.w = indice.x; \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \\\n\ + VXC_WriteImage(output, coord.zy, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_ND_F16_TO_QINT_1D(U8, vxc_uchar16)\n\ +GATHER_ND_F16_TO_QINT_1D(I8, vxc_char16)\n\ +GATHER_ND_F16_TO_QINT_1D(I16, vxc_short8)\n\ +\n\ +"; /* end of gather_nd_mix_vx*/ + +static const char grucell_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define logE (1.44269502f)\n\ +#define twoLogE (2.88539004f)\n\ +\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hsigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniConvDatatoFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform float4 tensorScale;\n\ +_viv_uniform float4 tensorZP;\n\ +\n\ +#define GRUCELL_ACTIVATION_SIGMOID_TANH(name0, name1, name2, name3, activater, \\\n\ + type00, type01, type10, type11, type20, type21, dst_type, conv_type, copy_type) \\\n\ +__kernel void grucell_activation_##name0##_##name1##_##name2##_to_##name3##_##activater \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __read_only image2d_array_t input2, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_array_t hstate, \\\n\ + int gate_activation, \\\n\ + int candidate_activation \\\n\ + ) \\\n\ +{ \\\n\ + type00 src00; \\\n\ + type01 src01; \\\n\ + type00 src10; \\\n\ + type01 src11; \\\n\ + type00 src20; \\\n\ + type01 src21; \\\n\ + \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + VXC_ReadImage(src00, input0, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src01, src00, 8); \\\n\ + VXC_ReadImage(src10, input1, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, src10, 8); \\\n\ + 
VXC_ReadImage(src20, input2, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src21, src20, 8); \\\n\ + \\\n\ + float4 zt, ht, ht_1; \\\n\ + VXC_DP4x4(zt, src01, src01, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); \\\n\ + VXC_DP4x4(ht, src11, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); \\\n\ + VXC_DP4x4(ht_1, src21, src21, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4); \\\n\ + \\\n\ + zt = zt * tensorScale.xxxx - tensorZP.xxxx; \\\n\ + zt = activater(zt); \\\n\ + \\\n\ + ht = ht * tensorScale.yyyy - tensorZP.yyyy; \\\n\ + ht = tangentH(ht); \\\n\ + \\\n\ + ht_1 = ht_1 * tensorScale.zzzz - tensorZP.zzzz; \\\n\ + \\\n\ + ht = ht - zt * ht; \\\n\ + ht = zt * ht_1 + ht; \\\n\ + \\\n\ + ht = ht * tensorScale.wwww + tensorZP.wwww; \\\n\ + conv_type dst0; \\\n\ + dst_type dst1; \\\n\ + copy_type dst; \\\n\ + \\\n\ + _viv_asm(CONV_RTE, dst0, ht); \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(hstate, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +#define UCHAR8 vxc_uchar8\n\ +#define SHORT8 vxc_short8\n\ +#define HALF8 vxc_half8\n\ +\n\ +GRUCELL_ACTIVATION_SIGMOID_TANH(U8, U8, U8, U8, sigmoid,\n\ + UCHAR8, UCHAR8, UCHAR8, UCHAR8, UCHAR8, UCHAR8, UCHAR8, int4, UCHAR8)\n\ +GRUCELL_ACTIVATION_SIGMOID_TANH(F16, F16, F16, F16, sigmoid,\n\ + SHORT8, HALF8, SHORT8, HALF8, SHORT8, HALF8, HALF8, half4, SHORT8)\n\ +GRUCELL_ACTIVATION_SIGMOID_TANH(F16, F16, F16, U8, sigmoid,\n\ + SHORT8, HALF8, SHORT8, HALF8, SHORT8, HALF8, UCHAR8, int4, UCHAR8)\n\ +GRUCELL_ACTIVATION_SIGMOID_TANH(U8, U8, U8, U8, hsigmoid,\n\ + UCHAR8, UCHAR8, UCHAR8, UCHAR8, UCHAR8, UCHAR8, UCHAR8, int4, UCHAR8)\n\ +GRUCELL_ACTIVATION_SIGMOID_TANH(F16, F16, F16, F16, hsigmoid,\n\ + SHORT8, HALF8, SHORT8, HALF8, SHORT8, HALF8, HALF8, half4, SHORT8)\n\ +GRUCELL_ACTIVATION_SIGMOID_TANH(F16, F16, F16, U8, hsigmoid,\n\ + SHORT8, HALF8, SHORT8, HALF8, SHORT8, HALF8, UCHAR8, int4, UCHAR8)\n\ +\n\ +#undef UCHAR8\n\ +#undef SHORT8\n\ +#undef HALF8\n\ +"; /* end of grucell_activation_vx*/ + +static const char grucell_activation_sma_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniA_Minus_B_2x8;\n\ +_viv_uniform VXC_512Bits uniA_Times_B_2x8;\n\ +_viv_uniform VXC_512Bits uniA_Plus_B_2x8;\n\ +__kernel void grucell_activation_sma_F16_F16_F16toF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __read_only image2d_array_t input2,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t h_status\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_half8 src0, src1, src2, minus, dst;\n\ + vxc_ushort8 vec0, vec1, vec2;\n\ +\n\ + VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ + VXC_ReadImage2DArray(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src2, vec2, 16);\n\ +\n\ + VXC_DP2x8(minus, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Minus_B_2x8);\n\ + VXC_DP2x8(dst, minus, src2, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 1), uniA_Times_B_2x8);\n\ + VXC_DP2x8(dst, dst, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Plus_B_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(h_status, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void grucell_activation_sma_F16_F16_F16toF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __read_only image2d_array_t input2,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t h_status\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_half8 src0, src1, src2, minus, dst;\n\ + vxc_ushort8 vec0, vec1, vec2;\n\ +\n\ + VXC_ReadImage(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ + VXC_ReadImage(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src2, vec2, 16);\n\ +\n\ + VXC_DP2x8(minus, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Minus_B_2x8);\n\ + VXC_DP2x8(dst, minus, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Times_B_2x8);\n\ + VXC_DP2x8(dst, dst, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniA_Plus_B_2x8);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(h_status, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +"; /* end of grucell_activation_sma_vx*/ + +static const char grucell_cdnn_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define logE (1.44269502f)\n\ +#define twoLogE (2.88539004f)\n\ +\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniConvDatatoFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uiF16AddF16_4x4;\n\ +\n\ +__kernel void grucell_activation_cdnn_sep_F16_F16_F16_to_F16_NC\n\ + (\n\ + __read_only image2d_array_t prev_state,\n\ + __read_only image2d_array_t input_r,\n\ + __read_only image2d_array_t input_z,\n\ + __read_only image2d_array_t input_c,\n\ + __read_only image2d_array_t recur_r,\n\ + __read_only image2d_array_t recur_z,\n\ + __read_only image2d_array_t recur_c,\n\ + __read_only image2d_t bias_r,\n\ + __read_only image2d_t bias_z,\n\ + __read_only image2d_t bias_c,\n\ + __read_only image2d_t cond_r,\n\ + __read_only image2d_t cond_z,\n\ + __read_only image2d_t cond_c,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t hstate,\n\ + int gate_activation,\n\ + int candidate_activation,\n\ + int batch_first\n\ + )\n\ +{\n\ + vxc_ushort8 s0, s1;\n\ + vxc_half8 r0, r1;\n\ + vxc_ushort8 s2, s3;\n\ + vxc_half8 z0, z1;\n\ + vxc_ushort8 s4, s5;\n\ + vxc_half8 c0, c1;\n\ + float4 r, r2, r3;\n\ + float4 z, z2, z3;\n\ + float4 c, c2, c3;\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, r0, s0, 8);\n\ + VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, r1, s1, 8);\n\ + r2 = read_imagef(bias_r, coord);\n\ + r3 = read_imagef(cond_r, coord);\n\ +\n\ + VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, z0, s2, 8);\n\ + VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, z1, s3, 8);\n\ + z2 = read_imagef(bias_z, coord);\n\ + z3 = read_imagef(cond_z, coord);\n\ +\n\ + VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, c0, s4, 8);\n\ + VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, c1, s5, 8);\n\ + c2 = read_imagef(bias_c, coord);\n\ + c3 = read_imagef(cond_c, coord);\n\ +\n\ + VXC_DP4x4(r, r0, r1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4);\n\ + r = r + r2 + r3;\n\ + VXC_DP4x4(z, z0, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4);\n\ + z = z + z2 + z3;\n\ +\n\ + vxc_ushort8 s7;\n\ + vxc_half8 h;\n\ + VXC_ReadImage(s7, prev_state, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, h, s7, 8);\n\ +\n\ + r = sigmoid(r);\n\ + z = sigmoid(z);\n\ +\n\ + c = c2 * r + c3;\n\ + VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + c = c2 + c3 * r + c;\n\ + c = tangentH(c);\n\ +\n\ + float4 state;\n\ + VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ +\n\ + state = z * (state - c) + c;\n\ +\n\ + half4 dst0;\n\ + vxc_half4 dst1;\n\ + vxc_short4 dst;\n\ + _viv_asm(CONV_RTE, dst0, state);\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst1, 8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void grucell_activation_cdnn_sep_F16_F16_F16_to_F16_CN\n\ + (\n\ + __read_only image2d_array_t prev_state,\n\ + __read_only image2d_array_t input_r,\n\ + __read_only image2d_array_t input_z,\n\ + __read_only image2d_array_t input_c,\n\ + __read_only image2d_array_t recur_r,\n\ + __read_only image2d_array_t recur_z,\n\ + __read_only image2d_array_t recur_c,\n\ + __read_only image2d_t bias_r,\n\ + __read_only image2d_t bias_z,\n\ + __read_only image2d_t bias_c,\n\ + __read_only image2d_t cond_r,\n\ + __read_only image2d_t cond_z,\n\ + __read_only image2d_t cond_c,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t hstate,\n\ + int gate_activation,\n\ + int candidate_activation,\n\ + int batch_first\n\ + )\n\ +{\n\ + vxc_ushort8 s0, s1;\n\ + vxc_half8 r0, r1;\n\ + vxc_ushort8 s2, s3;\n\ + vxc_half8 z0, z1;\n\ + vxc_ushort8 s4, s5;\n\ + vxc_half8 c0, c1;\n\ + float4 r, r2, r3;\n\ + float4 z, z2, z3;\n\ + float4 c, c2, c3;\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, r0, s0, 8);\n\ + VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, r1, s1, 8);\n\ + r2 = read_imagef(bias_r, coord.yx);\n\ + r3 = read_imagef(cond_r, coord.yx);\n\ +\n\ + VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, z0, s2, 8);\n\ + 
VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, z1, s3, 8);\n\ + z2 = read_imagef(bias_z, coord.yx);\n\ + z3 = read_imagef(cond_z, coord.yx);\n\ +\n\ + VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, c0, s4, 8);\n\ + VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, c1, s5, 8);\n\ + c2 = read_imagef(bias_c, coord.yx);\n\ + c3 = read_imagef(cond_c, coord.yx);\n\ +\n\ + VXC_DP4x4(r, r0, r1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4);\n\ + r = r + r2.xxxx + r3.xxxx;\n\ + VXC_DP4x4(z, z0, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4);\n\ + z = z + z2.xxxx + z3.xxxx;\n\ +\n\ + vxc_ushort8 s7;\n\ + vxc_half8 h;\n\ + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, h, s7, 8);\n\ +\n\ + r = sigmoid(r);\n\ + z = sigmoid(z);\n\ +\n\ + c = c2.xxxx * r + c3.xxxx;\n\ + VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + c = c2 + c3 * r + c;\n\ + c = tangentH(c);\n\ +\n\ + float4 state;\n\ + VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ +\n\ + state = z * (state - c) + c;\n\ +\n\ + half4 dst0;\n\ + vxc_half4 dst1;\n\ + vxc_short4 dst;\n\ + _viv_asm(CONV_RTE, dst0, state);\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst1, 8);\n\ + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord.x ++;\n\ + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord.x ++;\n\ + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord.x ++;\n\ + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void grucell_activation_cdnn_sep_F16_F16_F16_to_F16_CN_FULL\n\ + (\n\ + __read_only image2d_array_t prev_state,\n\ + __read_only image2d_array_t input_r,\n\ + __read_only image2d_array_t input_z,\n\ + __read_only image2d_array_t input_c,\n\ + __read_only image2d_array_t recur_r,\n\ + __read_only image2d_array_t recur_z,\n\ + __read_only image2d_array_t recur_c,\n\ + __read_only image2d_t bias_r,\n\ + __read_only image2d_t bias_z,\n\ + __read_only image2d_t bias_c,\n\ + __read_only image2d_t cond_r,\n\ + __read_only image2d_t cond_z,\n\ + __read_only image2d_t cond_c,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t hstate,\n\ + int gate_activation,\n\ + int candidate_activation,\n\ + int 
batch_first\n\ + )\n\ +{\n\ + vxc_ushort8 s0, s1;\n\ + vxc_half8 r0, r1;\n\ + vxc_ushort8 s2, s3;\n\ + vxc_half8 z0, z1;\n\ + vxc_ushort8 s4, s5;\n\ + vxc_half8 c0, c1;\n\ + float4 r, r2, r3;\n\ + float4 z, z2, z3;\n\ + float4 c, c2, c3;\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, r0, s0, 8);\n\ + VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, r1, s1, 8);\n\ + r2 = read_imagef(bias_r, coord.yx);\n\ + r3 = read_imagef(cond_r, coord.yx);\n\ +\n\ + VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, z0, s2, 8);\n\ + VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, z1, s3, 8);\n\ + z2 = read_imagef(bias_z, coord.yx);\n\ + z3 = read_imagef(cond_z, coord.yx);\n\ +\n\ + VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, c0, s4, 8);\n\ + VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, c1, s5, 8);\n\ + c2 = read_imagef(bias_c, coord.yx);\n\ + c3 = read_imagef(cond_c, coord.yx);\n\ +\n\ + VXC_DP4x4(r, r0, r1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4);\n\ + r = r + r2.xxxx + r3.xxxx;\n\ + VXC_DP4x4(z, z0, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4);\n\ + z = z + z2.xxxx + z3.xxxx;\n\ +\n\ + vxc_ushort8 s7;\n\ + vxc_half8 h;\n\ + VXC_ReadImage(s7, prev_state, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, h, s7, 8);\n\ +\n\ + r = sigmoid(r);\n\ + z = sigmoid(z);\n\ +\n\ + c = c2.xxxx * r + c3.xxxx;\n\ + VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + c = c2 + c3 * r + c;\n\ + c = tangentH(c);\n\ +\n\ + float4 state;\n\ + VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ +\n\ + state = z * (state - c) + c;\n\ +\n\ + half4 dst0;\n\ + vxc_half4 dst1;\n\ + vxc_short4 dst;\n\ + _viv_asm(CONV_RTE, dst0, state);\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst1, 8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +\n\ +__kernel void grucell_activation_cdnn_F16_F16_F16_to_F16\n\ + (\n\ + __read_only image2d_array_t prev_state,\n\ + __read_only image2d_array_t input_rzc,\n\ + __read_only image2d_array_t recur_rzc,\n\ + __read_only image2d_t bias_r,\n\ + __read_only image2d_t bias_z,\n\ + __read_only image2d_t bias_c,\n\ + __read_only image2d_t cond_r,\n\ + __read_only image2d_t cond_z,\n\ + __read_only image2d_t cond_c,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t hstate,\n\ + int gate_activation,\n\ + int candidate_activation,\n\ + int batch_first\n\ + )\n\ +{\n\ + vxc_ushort8 s0, s1;\n\ + vxc_half8 r0, r1;\n\ + vxc_ushort8 s2, s3;\n\ + vxc_half8 z0, z1;\n\ + vxc_ushort8 s4, s5;\n\ + vxc_half8 c0, c1;\n\ + float4 r, r2, r3;\n\ + float4 z, z2, z3;\n\ + float4 c, c2, c3;\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1) * 3, get_global_id(1));\n\ +\n\ + 
VXC_ReadImage(s0, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, r0, s0, 8);\n\ + VXC_ReadImage(s1, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, r1, s1, 8);\n\ + r2 = read_imagef(bias_r, coord.xy);\n\ + r3 = read_imagef(cond_r, coord.xy);\n\ +\n\ + VXC_ReadImage(s2, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, z0, s2, 8);\n\ + VXC_ReadImage(s3, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, z1, s3, 8);\n\ + z2 = read_imagef(bias_z, coord.xy);\n\ + z3 = read_imagef(cond_z, coord.xy);\n\ +\n\ + VXC_ReadImage(s4, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, c0, s4, 8);\n\ + VXC_ReadImage(s5, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, c1, s5, 8);\n\ + c2 = read_imagef(bias_c, coord.xy);\n\ + c3 = read_imagef(cond_c, coord.xy);\n\ +\n\ + VXC_DP4x4(r, r0, r1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4);\n\ + r = r + r2 + r3;\n\ + VXC_DP4x4(z, z0, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uiF16AddF16_4x4);\n\ + z = z + z2 + z3;\n\ +\n\ + vxc_ushort8 s7;\n\ + vxc_half8 h;\n\ + VXC_ReadImage(s7, prev_state, coord.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, h, s7, 8);\n\ +\n\ + r = sigmoid(r);\n\ + z = sigmoid(z);\n\ +\n\ + c = c2 * r + c3;\n\ + VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + c = c2 + c3 * r + c;\n\ + c = tangentH(c);\n\ +\n\ + float4 state;\n\ + VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ +\n\ + state = z * (state - c) + c;\n\ +\n\ + half4 dst0;\n\ + vxc_half4 dst1;\n\ + vxc_short4 dst;\n\ + _viv_asm(CONV_RTE, dst0, state);\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst1, 8);\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +"; /* end of grucell_cdnn_activation_vx*/ + +static const char grucell_cdnn_activation_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define logE (1.44269502f)\n\ +#define twoLogE (2.88539004f)\n\ +\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniConvDatatoFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_tail;\n\ +_viv_uniform float input_r_scale;\n\ +_viv_uniform float input_r_tail;\n\ +_viv_uniform float recur_r_scale;\n\ +_viv_uniform float recur_r_tail;\n\ +_viv_uniform float input_z_scale;\n\ +_viv_uniform float input_z_tail;\n\ +_viv_uniform float recur_z_scale;\n\ +_viv_uniform float recur_z_tail;\n\ +_viv_uniform float input_c_scale;\n\ +_viv_uniform float input_c_tail;\n\ +_viv_uniform float 
recur_c_scale;\n\ +_viv_uniform float recur_c_tail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +__kernel void grucell_activation_cdnn_sep_U8_U8_U8_to_U8_NC\n\ + (\n\ + __read_only image2d_array_t prev_state,\n\ + __read_only image2d_array_t input_r,\n\ + __read_only image2d_array_t input_z,\n\ + __read_only image2d_array_t input_c,\n\ + __read_only image2d_array_t recur_r,\n\ + __read_only image2d_array_t recur_z,\n\ + __read_only image2d_array_t recur_c,\n\ + __read_only image2d_t bias_r,\n\ + __read_only image2d_t bias_z,\n\ + __read_only image2d_t bias_c,\n\ + __read_only image2d_t cond_r,\n\ + __read_only image2d_t cond_z,\n\ + __read_only image2d_t cond_c,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t hstate,\n\ + int gate_activation,\n\ + int candidate_activation,\n\ + int batch_first\n\ + )\n\ +{\n\ + vxc_uchar8 r00, r01;\n\ + vxc_uchar8 z0, z1;\n\ + vxc_uchar8 c0, c1;\n\ + float4 r, r0, r1, r2, r3;\n\ + float4 z, z2, z3;\n\ + float4 c, c2, c3;\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + VXC_ReadImage(r00, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(r01, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + r2 = read_imagef(bias_r, coord);\n\ + r3 = read_imagef(cond_r, coord);\n\ +\n\ + VXC_ReadImage(z0, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(z1, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + z2 = read_imagef(bias_z, coord);\n\ + z3 = read_imagef(cond_z, coord);\n\ +\n\ + VXC_ReadImage(c0, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(c1, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + c2 = read_imagef(bias_c, coord);\n\ + c3 = read_imagef(cond_c, coord);\n\ +\n\ + VXC_DP4x4(r0, r00, r00, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(r1, r01, r01, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + r0 = r0 * input_r_scale + input_r_tail;\n\ + r1 = r1 * recur_r_scale + recur_r_tail;\n\ + r = r0 + r1 + r2 + r3;\n\ +\n\ + VXC_DP4x4(r0, z0, z0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(r1, z1, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + r0 = r0 * input_z_scale + input_z_tail;\n\ + r1 = r1 * recur_z_scale + recur_z_tail;\n\ + z = r0 + r1 + z2 + z3;\n\ +\n\ + vxc_uchar8 h;\n\ + VXC_ReadImage(h, prev_state, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + r = sigmoid(r);\n\ + z = sigmoid(z);\n\ +\n\ + c = c2 * r + c3;\n\ +\n\ + VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + c2 = c2 * input_c_scale + input_c_tail;\n\ + c3 = c3 * recur_c_scale + recur_c_tail;\n\ + c = c2 + c3 * r + c;\n\ + c = tangentH(c);\n\ +\n\ + float4 state;\n\ + VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + state = state * input_scale + input_tail;\n\ + state = z * (state - c) + c;\n\ +\n\ + state = state * output_scale + output_zp;\n\ +\n\ + int4 dst0;\n\ + vxc_uchar4 dst;\n\ + _viv_asm(CONV_RTE, dst0, state);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + 
VXC_WriteImage(hstate, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void grucell_activation_cdnn_sep_U8_U8_U8_to_U8_CN\n\ + (\n\ + __read_only image2d_array_t prev_state,\n\ + __read_only image2d_array_t input_r,\n\ + __read_only image2d_array_t input_z,\n\ + __read_only image2d_array_t input_c,\n\ + __read_only image2d_array_t recur_r,\n\ + __read_only image2d_array_t recur_z,\n\ + __read_only image2d_array_t recur_c,\n\ + __read_only image2d_t bias_r,\n\ + __read_only image2d_t bias_z,\n\ + __read_only image2d_t bias_c,\n\ + __read_only image2d_t cond_r,\n\ + __read_only image2d_t cond_z,\n\ + __read_only image2d_t cond_c,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t hstate,\n\ + int gate_activation,\n\ + int candidate_activation,\n\ + int batch_first\n\ + )\n\ +{\n\ + vxc_uchar8 r00, r01;\n\ + vxc_uchar8 z0, z1;\n\ + vxc_uchar8 c0, c1;\n\ + float4 r, r2, r3, r0, r1;\n\ + float4 z, z2, z3;\n\ + float4 c, c2, c3;\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + VXC_ReadImage(r00, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(r01, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + r2 = read_imagef(bias_r, coord.yx);\n\ + r3 = read_imagef(cond_r, coord.yx);\n\ +\n\ + VXC_ReadImage(z0, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(z1, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + z2 = read_imagef(bias_z, coord.yx);\n\ + z3 = read_imagef(cond_z, coord.yx);\n\ +\n\ + VXC_ReadImage(c0, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(c1, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + c2 = read_imagef(bias_c, coord.yx);\n\ + c3 = read_imagef(cond_c, coord.yx);\n\ +\n\ + VXC_DP4x4(r0, r00, r00, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(r1, r01, r01, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + r0 = r0 * input_r_scale + input_r_tail;\n\ + r1 = r1 * recur_r_scale + recur_r_tail;\n\ + r = r0 + r1 + r2.xxxx + r3.xxxx;\n\ +\n\ + VXC_DP4x4(r0, z0, z0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(r1, z1, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + r0 = r0 * input_z_scale + input_z_tail;\n\ + r1 = r1 * recur_z_scale + recur_z_tail;\n\ + z = r0 + r1 + z2.xxxx + z3.xxxx;\n\ +\n\ + vxc_uchar8 h;\n\ + VXC_ReadImage(h, prev_state, coord.yx, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(h, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(h, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(h, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + r = sigmoid(r);\n\ + z = sigmoid(z);\n\ +\n\ + c = c2.xxxx * r + c3.xxxx;\n\ + VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + c2 = c2 * input_c_scale + input_c_tail;\n\ + c3 = c3 * recur_c_scale + recur_c_tail;\n\ + c = c2 + c3 * r + c;\n\ + c = tangentH(c);\n\ +\n\ + float4 state;\n\ + VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + state = state * input_scale + input_tail;\n\ + 
state = z * (state - c) + c;\n\ +\n\ + state = state * output_scale + output_zp;\n\ +\n\ + int4 dst0;\n\ + vxc_uchar4 dst;\n\ + _viv_asm(CONV_RTE, dst0, state);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord.x ++;\n\ + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord.x ++;\n\ + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord.x ++;\n\ + VXC_WriteImage(output, coord.yx, dst, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord.yx, dst, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void grucell_activation_cdnn_sep_U8_U8_U8_to_U8_CN_FULL\n\ + (\n\ + __read_only image2d_array_t prev_state,\n\ + __read_only image2d_array_t input_r,\n\ + __read_only image2d_array_t input_z,\n\ + __read_only image2d_array_t input_c,\n\ + __read_only image2d_array_t recur_r,\n\ + __read_only image2d_array_t recur_z,\n\ + __read_only image2d_array_t recur_c,\n\ + __read_only image2d_t bias_r,\n\ + __read_only image2d_t bias_z,\n\ + __read_only image2d_t bias_c,\n\ + __read_only image2d_t cond_r,\n\ + __read_only image2d_t cond_z,\n\ + __read_only image2d_t cond_c,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t hstate,\n\ + int gate_activation,\n\ + int candidate_activation,\n\ + int batch_first\n\ + )\n\ +{\n\ + vxc_uchar8 r00, r01;\n\ + vxc_uchar8 z0, z1;\n\ + vxc_uchar8 c0, c1;\n\ + float4 r, r2, r3, r0, r1;\n\ + float4 z, z2, z3;\n\ + float4 c, c2, c3;\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + VXC_ReadImage(r00, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(r01, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + r2 = read_imagef(bias_r, coord.yx);\n\ + r3 = read_imagef(cond_r, coord.yx);\n\ +\n\ + VXC_ReadImage(z0, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(z1, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + z2 = read_imagef(bias_z, coord.yx);\n\ + z3 = read_imagef(cond_z, coord.yx);\n\ +\n\ + VXC_ReadImage(c0, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(c1, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + c2 = read_imagef(bias_c, coord.yx);\n\ + c3 = read_imagef(cond_c, coord.yx);\n\ +\n\ + VXC_DP4x4(r0, r00, r00, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(r1, r01, r01, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + r0 = r0 * input_r_scale + input_r_tail;\n\ + r1 = r1 * recur_r_scale + recur_r_tail;\n\ + r = r0 + r1 + r2.xxxx + r3.xxxx;\n\ +\n\ + VXC_DP4x4(r0, z0, z0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(r1, z1, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + r0 = r0 * input_z_scale + input_z_tail;\n\ + r1 = r1 * recur_z_scale + recur_z_tail;\n\ + z = r0 + r1 + z2.xxxx + z3.xxxx;\n\ +\n\ + vxc_uchar8 h;\n\ + VXC_ReadImage(h, prev_state, coord, 0, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0));\n\ +\n\ + r = sigmoid(r);\n\ + z = sigmoid(z);\n\ +\n\ + c = c2.xxxx * r + c3.xxxx;\n\ + VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + c2 = c2 * input_c_scale + input_c_tail;\n\ + c3 = c3 * recur_c_scale + recur_c_tail;\n\ + c = c2 + c3 * r + c;\n\ + c = tangentH(c);\n\ +\n\ + float4 state;\n\ + VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + state = state * input_scale + input_tail;\n\ + state = z * (state - c) + c;\n\ +\n\ + state = state * output_scale + output_zp;\n\ +\n\ + int4 dst0;\n\ + vxc_uchar4 dst;\n\ + _viv_asm(CONV_RTE, dst0, state);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void grucell_activation_cdnn_U8_U8_U8_to_U8\n\ + (\n\ + __read_only image2d_array_t prev_state,\n\ + __read_only image2d_array_t input_rzc,\n\ + __read_only image2d_array_t recur_rzc,\n\ + __read_only image2d_t bias_r,\n\ + __read_only image2d_t bias_z,\n\ + __read_only image2d_t bias_c,\n\ + __read_only image2d_t cond_r,\n\ + __read_only image2d_t cond_z,\n\ + __read_only image2d_t cond_c,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t hstate,\n\ + int gate_activation,\n\ + int candidate_activation,\n\ + int batch_first\n\ + )\n\ +{\n\ + vxc_uchar8 r00, r01;\n\ + vxc_uchar8 z0, z1;\n\ + vxc_uchar8 c0, c1;\n\ + float4 r, r0, r1, r2, r3;\n\ + float4 z, z2, z3;\n\ + float4 c, c2, c3;\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1) * 3, get_global_id(1));\n\ +\n\ + VXC_ReadImage(r00, input_rzc, coord.xz, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(r01, recur_rzc, coord.xz, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + r2 = read_imagef(bias_r, coord.xy);\n\ + r3 = read_imagef(cond_r, coord.xy);\n\ +\n\ + VXC_ReadImage(z0, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(z1, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + z2 = read_imagef(bias_z, coord.xy);\n\ + z3 = read_imagef(cond_z, coord.xy);\n\ +\n\ + VXC_ReadImage(c0, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(c1, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + c2 = read_imagef(bias_c, coord.xy);\n\ + c3 = read_imagef(cond_c, coord.xy);\n\ +\n\ + VXC_DP4x4(r0, r00, r00, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(r1, r01, r01, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + r0 = r0 * input_r_scale + input_r_tail;\n\ + r1 = r1 * recur_r_scale + recur_r_tail;\n\ + r = r0 + r1 + r2 + r3;\n\ + VXC_DP4x4(r0, z0, z0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(r1, z1, z1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + r0 = r0 * input_r_scale + input_r_tail;\n\ + r1 = r1 * input_r_scale + recur_r_tail;\n\ + z = r0 + r1 + z2 + z3;\n\ +\n\ + vxc_uchar8 h;\n\ + VXC_ReadImage(h, prev_state, coord.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ 
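+ /* GRU update: pass the r and z gates through sigmoid and the candidate\n\
+ through tanh, blend state = z * h_prev + (1 - z) * c, then requantize\n\
+ the result to U8 with output_scale / output_zp. */\n\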
+ r = sigmoid(r);\n\ + z = sigmoid(z);\n\ +\n\ + c = c2 * r + c3;\n\ + VXC_DP4x4(c2, c0, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + VXC_DP4x4(c3, c1, c1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + c2 = c2 * input_c_scale + input_c_tail;\n\ + c3 = c3 * recur_c_scale + recur_c_tail;\n\ + c = c2 + c3 * r + c;\n\ + c = tangentH(c);\n\ +\n\ + float4 state;\n\ + VXC_DP4x4(state, h, h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvDatatoFp32_4x4);\n\ + state = state * input_scale + input_tail;\n\ + state = z * (state - c) + c;\n\ +\n\ + state = state * output_scale + output_zp;\n\ +\n\ + int4 dst0;\n\ + vxc_uchar4 dst;\n\ + _viv_asm(CONV_RTE, dst0, state);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +"; /* end of grucell_cdnn_activation_u8_vx*/ + +static const char hswish_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform float inputTail;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ +\n\ +#define HSWISH_PROCESS(read_fun, write_fun, src_type, src_copy_type, convert_type, dst_type, dst_copy_type, \\\n\ + INSCALE, INTAIL, OUTSCALE, OUTZP) \\\n\ + src_type src0; \\\n\ + src_copy_type src1; \\\n\ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, src0, 16); \\\n\ + float4 vecA, vecB, vecC, vecD, vecE, vecDstA, vecDstB; \\\n\ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\ + VXC_DP4x4(vecB, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \\\n\ + vecA = vecA * INSCALE + INTAIL; \\\n\ + vecB = vecB * INSCALE + INTAIL; \\\n\ + vecC = vecA + 3.0f; \\\n\ + vecD = vecB + 3.0f; \\\n\ + vecE = 6.0f; \\\n\ + _viv_asm(CLAMP0MAX, vecDstA, vecC, vecE); \\\n\ + _viv_asm(CLAMP0MAX, vecDstB, vecD, vecE); \\\n\ + vecA = vecA * vecDstA; \\\n\ + vecB = vecB * vecDstB; \\\n\ + vecA = vecA / 6.0f; \\\n\ + vecB = vecB / 6.0f; \\\n\ + vecA = vecA * OUTSCALE + OUTZP; \\\n\ + vecB = vecB * OUTSCALE + OUTZP; \\\n\ + convert_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, vecA); \\\n\ + _viv_asm(CONV_RTE, dst1, vecB); \\\n\ + dst_type dst2; \\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + dst_copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst2, 16); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +#define HSWISH_FUNC(src_type_name, dst_type_name, src_type, src_copy_type, convert_type, dst_type, \\\n\ + dst_copy_type, INSCALE, INTAIL, OUTSCALE, OUTZP) \\\n\ + __kernel void hswish_##src_type_name##to##dst_type_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float beta \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + HSWISH_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray, src_type, \\\n\ + src_copy_type, convert_type, dst_type, dst_copy_type, \\\n\ + INSCALE, INTAIL, OUTSCALE, OUTZP) \\\n\ +}\n\ +\n\ +HSWISH_FUNC(F16, 
F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8, 1, 0, 1, 0)\n\ +HSWISH_FUNC(F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8, 1, 0, outputScale, 0)\n\ +HSWISH_FUNC(F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8, 1, 0, outputScale, outputZP)\n\ +HSWISH_FUNC(F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8, 1, 0, outputScale, 0)\n\ +HSWISH_FUNC(I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8, inputScale, 0, outputScale, 0)\n\ +HSWISH_FUNC(I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0)\n\ +HSWISH_FUNC(U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8, \\\n\ + inputScale, inputTail, outputScale, outputZP)\n\ +HSWISH_FUNC(U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8, inputScale, inputTail, 1, 0)\n\ +HSWISH_FUNC(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8, inputScale, 0, outputScale, 0)\n\ +HSWISH_FUNC(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0)\n\ +\n\ +\n\ +#define HSWISH_FUNC_2D(src_type_name, dst_type_name, src_type, src_copy_type, convert_type, dst_type, \\\n\ + dst_copy_type, INSCALE, INTAIL, OUTSCALE, OUTZP) \\\n\ + __kernel void hswish_##src_type_name##to##dst_type_name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float beta \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + HSWISH_PROCESS(VXC_ReadImage, VXC_WriteImage, src_type, src_copy_type, convert_type, dst_type, \\\n\ + dst_copy_type, INSCALE, INTAIL, OUTSCALE, OUTZP) \\\n\ +}\n\ +\n\ +HSWISH_FUNC_2D(F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8, 1, 0, 1, 0)\n\ +HSWISH_FUNC_2D(F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8, 1, 0, outputScale, 0)\n\ +HSWISH_FUNC_2D(F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8, 1, 0, outputScale, outputZP)\n\ +HSWISH_FUNC_2D(F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8, 1, 0, outputScale, 0)\n\ +HSWISH_FUNC_2D(I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8, inputScale, 0, outputScale, 0)\n\ +HSWISH_FUNC_2D(I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0)\n\ +HSWISH_FUNC_2D(U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8, inputScale, \\\n\ + inputTail, outputScale, outputZP)\n\ +HSWISH_FUNC_2D(U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8, inputScale, inputTail, 1, 0)\n\ +HSWISH_FUNC_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8, inputScale, 0, outputScale, 0)\n\ +HSWISH_FUNC_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0)\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +#define HSWISH_BF16_PROCESS(read_fun, write_fun) \\\n\ + vxc_ushort8 src0, src1, dst; \\\n\ + float4 vecA, vecB, vecC, vecD, vecE, vecDstA, vecDstB; \\\n\ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecA, src1, 16); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, vecB, src1, 16); \\\n\ + vecC = vecA + 3.0f; \\\n\ + vecD = vecB + 
3.0f; \\\n\ + vecE = 6.0f; \\\n\ + _viv_asm(CLAMP0MAX, vecDstA, vecC, vecE); \\\n\ + _viv_asm(CLAMP0MAX, vecDstB, vecD, vecE); \\\n\ + vecA = vecA * vecDstA; \\\n\ + vecB = vecB * vecDstB; \\\n\ + vecA = vecA / 6.0f; \\\n\ + vecB = vecB / 6.0f; \\\n\ + _viv_asm(COPY, src0, vecA, 16); \\\n\ + _viv_asm(COPY, src1, vecB, 16); \\\n\ + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void hswish_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float beta\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + HSWISH_BF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray);\n\ +}\n\ +\n\ +__kernel void hswish_BF16toBF16_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float beta\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + HSWISH_BF16_PROCESS(VXC_ReadImage, VXC_WriteImage);\n\ +}\n\ +"; /* end of hswish_vx*/ + +static const char instance_normalization_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + vxc_float4 sumsqr;\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0;\n\ + float sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + //sum += lcl_sum[i];\n\ + //sqr += lcl_sqr[i];\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16_2D(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ 
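+ /* Each work-item accumulates a partial sum and sum of squares for its\n\
+ 8-wide column strip across the height; lane 0 reduces the 16 partials\n\
+ below and writes (sum, sqr) for this instance. */\n\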
+ vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + vxc_float4 sumsqr;\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + for(; coord.y < endH;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0;\n\ + float sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + //sum += lcl_sum[i];\n\ + //sqr += lcl_sqr[i];\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ + bias_f = read_imagef(bias, coord_para);\n\ +\n\ + coord_para.x = 0;\n\ + coord_para.y = gidz;\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para);\n\ + coord_para.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ +\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + 
VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16_2D(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + int endH = gidy + height;\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ + bias_f = read_imagef(bias, coord_para);\n\ +\n\ + coord_para.x = 0;\n\ + coord_para.y = gidz;\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para);\n\ + coord_para.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ +\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of instance_normalization_f16_vx*/ + +static const char instance_normalization_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +\n\ +_viv_uniform float inFlScale_s2;\n\ +_viv_uniform float input_fl_scale;\n\ +_viv_uniform float inOut_fl_scale;\n\ +_viv_uniform float output_fl_scale;\n\ +\n\ +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I16(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = 
get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + float sum = 0, sqr = 0;\n\ + vxc_float4 sumsqr = (vxc_float4)(0);\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + //tmpSumSqr += sumsqr;\n\ + tmpSumSqr.x += sumsqr.x;\n\ + sqr += (sumsqr.y * inFlScale_s2);\n\ + }\n\ + sum = tmpSumSqr.x * input_fl_scale;\n\ + //sqr = tmpSumSqr.y * inFlScale_s2;\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + //sum += lcl_sum[i];\n\ + //sqr += lcl_sqr[i];\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I16_2D(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ + vxc_short8 src0;\n\ + float sum = 0, sqr = 0;\n\ + vxc_float4 sumsqr = (vxc_float4)(0);\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + for(; coord.y < endH;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + //tmpSumSqr += sumsqr;\n\ + tmpSumSqr.x += sumsqr.x;\n\ + sqr += (sumsqr.y * inFlScale_s2);\n\ + }\n\ + sum = tmpSumSqr.x * input_fl_scale;\n\ + //sqr = tmpSumSqr.y * inFlScale_s2;\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + //sum += lcl_sum[i];\n\ + //sqr += lcl_sqr[i];\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toF16(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, 
gidz, 0);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + bias_f = read_imagef(bias, coord_para);\n\ +\n\ + coord_para.x = 0;\n\ + coord_para.y = gidz;\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para);\n\ + coord_para.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toF16_2D(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + int endH = gidy + height;\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + bias_f = read_imagef(bias, coord_para);\n\ +\n\ + coord_para.x = 0;\n\ + coord_para.y = gidz;\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para);\n\ + coord_para.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * 
mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toI16(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + vxc_short8 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + bias_f = read_imagef(bias, coord_para);\n\ + coord_para.x = 0;\n\ + coord_para.y = gidz;\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para);\n\ + coord_para.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toInt16_2x8);\n\ + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toI16_2D(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int2 coord = 
(int2)(get_global_id(0), gidy);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + int endH = gidy + height;\n\ + vxc_short8 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + bias_f = read_imagef(bias, coord_para);\n\ + coord_para.x = 0;\n\ + coord_para.y = gidz;\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para);\n\ + coord_para.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toInt16_2x8);\n\ + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of instance_normalization_i16_vx*/ + +static const char instance_normalization_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumInt8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSumInt8_16x1;\n\ +_viv_uniform float inFlScale_s2;\n\ +_viv_uniform float input_fl_scale;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4;\n\ +\n\ +_viv_uniform float inOut_fl_scale;\n\ +_viv_uniform float output_fl_scale;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_char16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + int tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, 
VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1);\n\ + tmpSqr += (tmpSqr1);\n\ + }\n\ + sqr = tmpSqr * inFlScale_s2;\n\ + sum = tmpSum * input_fl_scale;\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + //sum += lcl_sum[i];\n\ + //sqr += lcl_sqr[i];\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8_2D(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ + vxc_char16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + int tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + tmpSqr = 0;\n\ + for(; coord.y < endH;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1);\n\ + tmpSqr += (tmpSqr1);\n\ + }\n\ + sqr = tmpSqr * inFlScale_s2;\n\ + sum = tmpSum * input_fl_scale;\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + //sum += lcl_sum[i];\n\ + //sqr += lcl_sqr[i];\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + vxc_char16 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, 
VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + bias_f = read_imagef(bias, coord_para);\n\ +\n\ + coord_para.x = 0;\n\ + coord_para.y = gidz;\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para);\n\ + coord_para.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + coord_para = coord;\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertFthInt8Fp32_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16_2D(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + int endH = gidy + height;\n\ + vxc_char16 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + bias_f = read_imagef(bias, coord_para);\n\ +\n\ + coord_para.x = 0;\n\ + coord_para.y = gidz;\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para);\n\ + 
coord_para.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + for(; coord.y < endH;)\n\ + {\n\ + coord_para = coord;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertFthInt8Fp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + vxc_char16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + bias_f = read_imagef(bias, coord_para);\n\ + coord_para.x = 0;\n\ + coord_para.y = gidz;\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para);\n\ + coord_para.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + 
VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertFthInt8Fp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8_2D(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int2 coord = (int2)(get_global_id(0), gidy);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + int endH = gidy + height;\n\ + vxc_char16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + bias_f = read_imagef(bias, coord_para);\n\ + coord_para.x = 0;\n\ + coord_para.y = gidz;\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para);\n\ + coord_para.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertFthInt8Fp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = 
convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of instance_normalization_i8_vx*/ + +static const char instance_normalization_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform float rowSumScale;\n\ +_viv_uniform float scale_inOut;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform int output_ZP;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + float eps, int rsFlg)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ + }\n\ + sqr += (tmpSqr * e2InScale + rowSumScale);\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8_2D(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + float eps, int 
rsFlg)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + for(; coord.y < endH;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ + }\n\ + sqr += (tmpSqr * e2InScale + rowSumScale);\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + bias_f = read_imagef(bias, coord_para);\n\ +\n\ + coord_para.x = 0;\n\ + coord_para.y = gidz;\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para);\n\ + coord_para.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + coord_para = coord;\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + 
uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16_2D(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + int endH = gidy + height;\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + bias_f = read_imagef(bias, coord_para);\n\ +\n\ + coord_para.x = 0;\n\ + coord_para.y = gidz;\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para);\n\ + coord_para.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + for(; coord.y < endH;)\n\ + {\n\ + coord_para = coord;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, 
tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + vxc_uchar16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + bias_f = read_imagef(bias, coord_para);\n\ + coord_para.x = 0;\n\ + coord_para.y = gidz;\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para);\n\ + coord_para.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + for(coord.y = 0; coord.y < height;coord.y++)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ + 
uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8_2D(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int2 coord = (int2)(get_global_id(0), gidy);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + int endH = gidy + height;\n\ + vxc_uchar16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + bias_f = read_imagef(bias, coord_para);\n\ + coord_para.x = 0;\n\ + coord_para.y = gidz;\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para);\n\ + coord_para.x += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of instance_normalization_u8_vx*/ + +static const char l2normalizescale_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define VXC_Vstore3(Pointer, Offset, Data) \\\n\ +do \\\n\ +{ int byteOffset = ((int)sizeof((Data)))*(Offset); \\\n\ +VXC_OP3_NoDest(vstore3, Pointer, byteOffset, Data); } \\\n\ +while(0)\n\ +\n\ +inline uchar* get_image2D_array_ptr(image2d_array_t input)\n\ +{\n\ + int8 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ + uchar *src_ptr = 
(uchar*)desc.s0;\n\ + return src_ptr;\n\ +}\n\ +\n\ +#define L2NORMSCALE_SWITCH_PROCESS(case_value, vec_val, ZpValue) \\\n\ + switch (case_value) \\\n\ + { \\\n\ + case 1: \\\n\ + vec_val.s123 = ZpValue; \\\n\ + vec_val.s4567 = ZpValue; \\\n\ + break; \\\n\ + case 2: \\\n\ + vec_val.s23 = ZpValue; \\\n\ + vec_val.s4567 = ZpValue; \\\n\ + break; \\\n\ + case 3: \\\n\ + vec_val.s3 = ZpValue; \\\n\ + vec_val.s4567 = ZpValue; \\\n\ + break; \\\n\ + case 4: \\\n\ + vec_val.s4567 = ZpValue; \\\n\ + break; \\\n\ + case 5: \\\n\ + vec_val.s567 = ZpValue; \\\n\ + break; \\\n\ + case 6: \\\n\ + vec_val.s67 = ZpValue; \\\n\ + break; \\\n\ + case 7: \\\n\ + vec_val.s7 = ZpValue; \\\n\ + break; \\\n\ + default: \\\n\ + break; \\\n\ + }\n\ +\n\ +#define L2NORMSCALE_REM_PROCESS(ZpValue) \\\n\ + VXC_Vload8(src0, src_ptr, 0); \\\n\ + VXC_Vload8(src1, src_ptr, 1); \\\n\ + if (inputRemain <= 8) \\\n\ + { \\\n\ + L2NORMSCALE_SWITCH_PROCESS(inputRemain, src0, ZpValue) \\\n\ + src1 = 0; \\\n\ + } \\\n\ + else if (inputRemain < 16) \\\n\ + { \\\n\ + int inputRemain8 = inputRemain - 8; \\\n\ + L2NORMSCALE_SWITCH_PROCESS(inputRemain8, src1, ZpValue) \\\n\ + }\n\ +\n\ +\n\ +#define L2NORMSCALE_MUL_PROCESS(index) \\\n\ + VXC_Vload8(src0, src_ptr, index); \\\n\ + _viv_asm(COPY, val0, src0, 16); \\\n\ + VXC_Vload8(scale_s16, scale_ptr, index); \\\n\ + _viv_asm(COPY, scale_f16, scale_s16, 16); \\\n\ + _viv_asm(COPY, input_ZP, inputZP, 4); \\\n\ + VXC_DP4x4(vec0, val0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\\\n\ + uniDataSubZPtoFp32Part0_4x4);\\\n\ + VXC_DP4x4(vec1, val0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\\\n\ + uniDataSubZPtoFp32Part1_4x4);\\\n\ + VXC_DP4x4(scale_f32, scale_f16, scale_f16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\\\n\ + uniFp16toFp32_4x4);\\\n\ + VXC_DP4x4(scale1_f32, scale_f16, scale_f16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\\\n\ + uniFp16toFp32Hi_4x4);\\\n\ + vec0 = vec0 * rsqrt0.xxxx + output_ZP;\\\n\ + vec1 = vec1 * rsqrt0.xxxx + output_ZP;\\\n\ + vec0 *= scale_f32;\\\n\ + vec1 *= scale1_f32;\\\n\ + _viv_asm(CONV_RTE, dst0, vec0);\\\n\ + _viv_asm(CONV_RTE, dst1, vec1);\\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bin_2x8);\\\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ +\n\ +_viv_uniform int inputWidth;\n\ +_viv_uniform int inputWidthRemain256;\n\ +_viv_uniform int inputWidthCount;\n\ +_viv_uniform VXC_512Bits uniSumSqrt_16x1;\n\ +_viv_uniform float r_inputScale;\n\ +\n\ +_viv_uniform VXC_512Bits uniDataSubZPtoFp32Part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bin_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32Hi_4x4;\n\ +_viv_uniform float IntergerScale;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform int inputWidthRemain128;\n\ +_viv_uniform float zP2x;\n\ +_viv_uniform float zpSqrt16x;\n\ +_viv_uniform VXC_512Bits uniSumAll_16x1;\n\ +_viv_uniform int inputZP;\n\ +\n\ +#define L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \\\n\ + vxc_float4 rsqrt0;\\\n\ + dst_type *dst_ptr = (dst_type *)get_image2D_array_ptr(output); \\\n\ + short *scale_ptr = (short *)get_image2D_array_ptr(scale); \\\n\ + vxc_float4 vec0, vec1;\\\n\ + convert_type dst0, dst1;\\\n\ + vxc_short8 scale_s16;\\\n\ + vxc_half8 scale_f16;\\\n\ + vxc_float4 scale_f32, scale1_f32;\\\n\ + output_type dst2;\\\n\ + copy_type dst;\\\n\ + rsqrt0 = sum.xxxx * IntergerScale;\\\n\ + src_ptr = src_ptr_base + 
(get_global_id(0) + get_global_id(1) * inputWidth); \\\n\ + dst_ptr += (get_global_id(0) + get_global_id(1) * inputWidth);\\\n\ + scale_ptr += get_global_id(0);\\\n\ + for(int i = 0; i < inputWidthCount; i++)\\\n\ + {\\\n\ + L2NORMSCALE_MUL_PROCESS(0) \\\n\ + VXC_Vstore8(dst_ptr, 0, dst); \\\n\ + L2NORMSCALE_MUL_PROCESS(1) \\\n\ + VXC_Vstore8(dst_ptr, 1, dst); \\\n\ + src_ptr += 256; \\\n\ + dst_ptr += 256; \\\n\ + scale_ptr += 256; \\\n\ + }\\\n\ + if (inputWidthRemain256) \\\n\ + { \\\n\ + offset = get_global_id(0) + inputWidthCount * 128; \\\n\ + inputRemain = inputWidth - offset; \\\n\ + if (inputRemain >= 8) \\\n\ + { \\\n\ + L2NORMSCALE_MUL_PROCESS(0) \\\n\ + VXC_Vstore8(dst_ptr, 0, dst); \\\n\ + src_ptr += 8; \\\n\ + dst_ptr += 8; \\\n\ + scale_ptr += 8; \\\n\ + inputRemain -= 8; \\\n\ + } \\\n\ + if (inputRemain > 0) \\\n\ + { \\\n\ + L2NORMSCALE_MUL_PROCESS(0) \\\n\ + switch (inputRemain) \\\n\ + { \\\n\ + case 1: \\\n\ + dst_ptr[0] = dst.s0; \\\n\ + break; \\\n\ + case 2: \\\n\ + VXC_Vstore2(dst_ptr, 0, dst); \\\n\ + break; \\\n\ + case 3: \\\n\ + VXC_Vstore3(dst_ptr, 0, dst); \\\n\ + break; \\\n\ + case 4: \\\n\ + VXC_Vstore4(dst_ptr, 0, dst); \\\n\ + break; \\\n\ + case 5: \\\n\ + VXC_Vstore2(dst_ptr, 0, dst); \\\n\ + dst.s012 = dst.s234; \\\n\ + dst_ptr += 2; \\\n\ + VXC_Vstore3(dst_ptr, 0, dst); \\\n\ + break; \\\n\ + case 6: \\\n\ + VXC_Vstore3(dst_ptr, 0, dst); \\\n\ + dst.s012 = dst.s345; \\\n\ + dst_ptr += 3; \\\n\ + VXC_Vstore3(dst_ptr, 0, dst); \\\n\ + break; \\\n\ + case 7: \\\n\ + VXC_Vstore4(dst_ptr, 0, dst); \\\n\ + dst.s012 = dst.s456; \\\n\ + dst_ptr += 4; \\\n\ + VXC_Vstore3(dst_ptr, 0, dst); \\\n\ + break; \\\n\ + default: \\\n\ + VXC_Vstore8(dst_ptr, 0, dst); \\\n\ + break; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ +\n\ +\n\ +#define L2NORMSCALE_AXIS0_2D(in0_name, in1_name, out_name, read_type, read_type2, src_type, INPUTSCALE, \\\n\ + dst_type, convert_type, output_type, copy_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ + void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \\\n\ + (\\\n\ + __read_only image2d_array_t input,\\\n\ + __read_only image2d_array_t scale,\\\n\ + __write_only image2d_array_t output,\\\n\ + int axis\\\n\ + )\\\n\ +{ \\\n\ + int lidx = get_local_id(0); \\\n\ + int offset = get_global_id(0); \\\n\ + read_type *src_ptr_base = (read_type *)get_image2D_array_ptr(input); \\\n\ + read_type *src_ptr; \\\n\ + read_type2 src0, src1; \\\n\ + src_type val0, val1; \\\n\ + int inputRemain; \\\n\ + vxc_float4 sum = {0.0f}; \\\n\ + read_type2 input_ZP ;\\\n\ + __local float lcl_sum[16]; \\\n\ + src_ptr = src_ptr_base + (get_global_id(0) + get_global_id(1) * inputWidth); \\\n\ + for (int i = 0; i < inputWidthCount; i++) \\\n\ + { \\\n\ + VXC_Vload8(src0, src_ptr, 0); \\\n\ + VXC_Vload8(src1, src_ptr, 1); \\\n\ + _viv_asm(COPY, val0, src0, 16); \\\n\ + _viv_asm(COPY, val1, src1, 16); \\\n\ + VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 1),\\\n\ + uniSumSqrt_16x1); \\\n\ + sum.x += sum.y; \\\n\ + src_ptr += 256; \\\n\ + } \\\n\ + if (inputWidthRemain256) \\\n\ + { \\\n\ + offset = get_global_id(0) + inputWidthCount * 256;\\\n\ + inputRemain = inputWidth - offset; \\\n\ + if (inputRemain > 0) \\\n\ + { \\\n\ + L2NORMSCALE_REM_PROCESS(0) \\\n\ + _viv_asm(COPY, val0, src0, 16); \\\n\ + _viv_asm(COPY, val1, src1, 16); \\\n\ + VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 1),\\\n\ + uniSumSqrt_16x1); \\\n\ + sum.x += sum.y; \\\n\ + } \\\n\ + } \\\n\ + lcl_sum[lidx] = 
sum.x; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + float4 data0; \\\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \\\n\ + sum.x = dot(data0, one); \\\n\ + sum.x = rsqrt(sum.x) * INPUTSCALE; \\\n\ + L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \\\n\ +}\n\ +\n\ +L2NORMSCALE_AXIS0_2D(F16, F16, F16, ushort, vxc_ushort8, vxc_half8, 1, \\\n\ + ushort, half4, vxc_half8, vxc_ushort8)\n\ +L2NORMSCALE_AXIS0_2D(I16, F16, F16, short, vxc_short8, vxc_short8, r_inputScale, \\\n\ + ushort, half4, vxc_half8, vxc_ushort8)\n\ +L2NORMSCALE_AXIS0_2D(I16, F16, I16, short, vxc_short8, vxc_short8, r_inputScale, \\\n\ + short, int4, vxc_short8, vxc_short8)\n\ +L2NORMSCALE_AXIS0_2D(I8, F16, F16, char, vxc_char8, vxc_char8, r_inputScale, \\\n\ + ushort, half4, vxc_half8, vxc_ushort8)\n\ +L2NORMSCALE_AXIS0_2D(I8, F16, I8, char, vxc_char8, vxc_char8, r_inputScale, \\\n\ + char, int4, vxc_char8, vxc_char8)\n\ +\n\ +\n\ +\n\ +#define L2NORMSCALE_AXIS0_U8_2D(in1_name, out_name,\\\n\ + dst_type, convert_type, output_type, copy_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ + void l2normalizescale_axis0_U8_##in1_name##to##out_name##_2D \\\n\ + (\\\n\ + __read_only image2d_array_t input,\\\n\ + __read_only image2d_array_t scale,\\\n\ + __write_only image2d_array_t output,\\\n\ + int axis\\\n\ + )\\\n\ +{ \\\n\ + int lidx = get_local_id(0); \\\n\ + int offset = get_global_id(0); \\\n\ + uchar *src_ptr_base = (uchar *)get_image2D_array_ptr(input); \\\n\ + uchar *src_ptr; \\\n\ + vxc_uchar8 src0, src1; \\\n\ + vxc_uchar8 val0, val1; \\\n\ + int inputRemain; \\\n\ + vxc_float4 sum = {0.0f}; \\\n\ + vxc_uchar8 input_ZP ; \\\n\ + __local float lcl_sum[16]; \\\n\ + src_ptr = src_ptr_base + (get_global_id(0) + get_global_id(1) * inputWidth); \\\n\ + for (int i = 0; i < inputWidthCount; i++) \\\n\ + { \\\n\ + VXC_Vload8(src0, src_ptr, 0); \\\n\ + VXC_Vload8(src1, src_ptr, 1); \\\n\ + _viv_asm(COPY, val0, src0, 16); \\\n\ + _viv_asm(COPY, val1, src1, 16); \\\n\ + VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 1),\\\n\ + uniSumSqrt_16x1); \\\n\ + VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 1),\\\n\ + uniSumAll_16x1); \\\n\ + sum.w = sum.y - zP2x * sum.z + zpSqrt16x; \\\n\ + sum.x += sum.w; \\\n\ + src_ptr += 256; \\\n\ + } \\\n\ + if (inputWidthRemain256) \\\n\ + { \\\n\ + offset = get_global_id(0) + inputWidthCount * 256; \\\n\ + inputRemain = inputWidth - offset; \\\n\ + if (inputRemain > 0) \\\n\ + { \\\n\ + L2NORMSCALE_REM_PROCESS((uchar)inputZP) \\\n\ + _viv_asm(COPY, val0, src0, 16); \\\n\ + _viv_asm(COPY, val1, src1, 16); \\\n\ + VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 1),\\\n\ + uniSumSqrt_16x1); \\\n\ + VXC_DP16x1(sum, val0, val1, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 1),\\\n\ + uniSumAll_16x1); \\\n\ + sum.w = sum.y - zP2x * sum.z + zpSqrt16x; \\\n\ + sum.x += sum.w; \\\n\ + } \\\n\ + } \\\n\ + lcl_sum[lidx] = sum.x; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + float4 data0; \\\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \\\n\ + sum.x = dot(data0, one); \\\n\ + sum.x = rsqrt(sum.x) * r_inputScale; \\\n\ + L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \\\n\ +}\n\ +\n\ +L2NORMSCALE_AXIS0_U8_2D(F16, F16, ushort, 
half4, vxc_half8, vxc_ushort8)\n\ +L2NORMSCALE_AXIS0_U8_2D(F16, U8, uchar, int4, vxc_uchar8, vxc_uchar8)\n\ +"; /* end of l2normalizescale_axis0_vx*/ + +static const char l2normalizescale_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/********************************************L2NormalizeScale*****************************************/\n\ +_viv_uniform int L2NorS_depth;\n\ +_viv_uniform VXC_512Bits UniFp16MulLo_dp4x4;\n\ +_viv_uniform VXC_512Bits UniFp16MulHi_dp4x4;\n\ +\n\ +//int8 version\n\ +_viv_uniform float r_inputScale;\n\ +_viv_uniform VXC_512Bits uniIntegerSquareLo_4x4;\n\ +_viv_uniform VXC_512Bits uniIntegerSquareHi_4x4;\n\ +_viv_uniform VXC_512Bits uniDataSquareAddU32Lo_4x4;\n\ +_viv_uniform VXC_512Bits uniDataSquareAddU32Hi_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniUInt8SquareLo_4x4;\n\ +_viv_uniform VXC_512Bits uniUInt8SquareHi_4x4;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform VXC_512Bits uniDataSubZPtoFp32Part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bin_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float IntergerScale;\n\ +_viv_uniform float output_ZP;\n\ +\n\ +#define L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \\\n\ + coord.y = get_global_id(1); \\\n\ + input_type vect0, vect1;\\\n\ + incopy_type src0, src1;\\\n\ + vxc_float4 rsqrt0, rsqrt1;\\\n\ + rsqrt0 = sum_lo;\\\n\ + rsqrt1 = sum_hi;\\\n\ + rsqrt0 *= IntergerScale;\\\n\ + rsqrt1 *= IntergerScale;\\\n\ + for(int i = 0; i < L2NorS_depth; i += 2)\\\n\ + {\\\n\ + vxc_float4 vec0, vec1;\\\n\ + input_type input_ZP ;\\\n\ + convert_type dst0, dst1;\\\n\ + VXC_ReadImage(vect0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ + _viv_asm(COPY, src0, vect0, 16); \\\n\ + VXC_ReadImage(vect1, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + vxc_short8 scale_s16;\\\n\ + vxc_half8 scale_f16;\\\n\ + vxc_float4 scale_f32;\\\n\ + VXC_ReadImage(scale_s16, scale, coord.yw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\\\n\ + _viv_asm(COPY, scale_f16, scale_s16, 16); \\\n\ + _viv_asm(COPY, input_ZP, inputZP, 4); \\\n\ + VXC_DP4x4(vec0, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\\\n\ + uniDataSubZPtoFp32Part0_4x4);\\\n\ + VXC_DP4x4(vec1, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\\\n\ + uniDataSubZPtoFp32Part1_4x4);\\\n\ + VXC_DP4x4(scale_f32, scale_f16, scale_f16, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardInf, 0),\\\n\ + uniFp16toFp32_4x4);\\\n\ + vec0 = vec0 * rsqrt0;\\\n\ + vec1 = vec1 * rsqrt1;\\\n\ + vec0 = vec0 * scale_f32.xxxx + output_ZP;\\\n\ + vec1 = vec1 * scale_f32.xxxx + output_ZP;\\\n\ + _viv_asm(CONV_RTE, dst0, vec0);\\\n\ + _viv_asm(CONV_RTE, dst1, vec1);\\\n\ + output_type dst2;\\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bin_2x8);\\\n\ + copy_type dst;\\\n\ + _viv_asm(COPY, dst, dst2, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ + VXC_DP4x4(vec0, src1, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\\\n\ + uniDataSubZPtoFp32Part0_4x4);\\\n\ + VXC_DP4x4(vec1, src1, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0),\\\n\ + uniDataSubZPtoFp32Part1_4x4);\\\n\ + vec0 = vec0 * rsqrt0;\\\n\ + vec1 = vec1 * rsqrt1;\\\n\ + vec0 = vec0 * scale_f32.yyyy + output_ZP;\\\n\ + vec1 = 
vec1 * scale_f32.yyyy + output_ZP;\\\n\ + _viv_asm(CONV_RTE, dst0, vec0);\\\n\ + _viv_asm(CONV_RTE, dst1, vec1);\\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bin_2x8);\\\n\ + coord.y++;\\\n\ + _viv_asm(COPY, dst, dst2, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ + coord.y++;\\\n\ + }\\\n\ +\n\ +\n\ +#define L2NORMSCALE_AXIS1_F16_2D(in1_name, out_name,\\\n\ + input_type, incopy_type, output_type, convert_type, copy_type) \\\n\ +__kernel void l2normalizescale_axis1_F16_##in1_name##to##out_name##_2D \\\n\ + (\\\n\ + __read_only image2d_array_t input,\\\n\ + __read_only image2d_array_t scale,\\\n\ + __write_only image2d_array_t output,\\\n\ + int axis\\\n\ + )\\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 img1_s16, img2_s16; \\\n\ + vxc_float4 squr, sum_lo = 0, sum_hi = 0; \\\n\ + vxc_half8 img1_fp16, img2_fp16; \\\n\ + for(int i = 0; i < L2NorS_depth; i += 2) \\\n\ + { \\\n\ + VXC_ReadImage(img1_s16, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(img2_s16, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y += 2; \\\n\ + _viv_asm(COPY, img1_fp16, img1_s16, 16); \\\n\ + _viv_asm(COPY, img2_fp16, img2_s16, 16); \\\n\ + VXC_DP4x4(squr, img1_fp16, img1_fp16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1),\\\n\ + UniFp16MulLo_dp4x4); \\\n\ + sum_lo += squr; \\\n\ + VXC_DP4x4(squr, img2_fp16, img2_fp16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1),\\\n\ + UniFp16MulLo_dp4x4); \\\n\ + sum_lo += squr; \\\n\ + VXC_DP4x4(squr, img1_fp16, img1_fp16, VXC_MODIFIER(0, 3, 4, VXC_RM_TowardZero, 1),\\\n\ + UniFp16MulHi_dp4x4); \\\n\ + sum_hi += squr; \\\n\ + VXC_DP4x4(squr, img2_fp16, img2_fp16, VXC_MODIFIER(0, 3, 4, VXC_RM_TowardZero, 1),\\\n\ + UniFp16MulHi_dp4x4); \\\n\ + sum_hi += squr; \\\n\ + } \\\n\ + sum_lo = rsqrt(sum_lo); \\\n\ + sum_hi = rsqrt(sum_hi); \\\n\ + L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \\\n\ +}\n\ +\n\ +L2NORMSCALE_AXIS1_F16_2D(F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)\n\ +\n\ +\n\ +#define L2NORMSCALE_AXIS1_I8_2D(in1_name, out_name,\\\n\ + input_type, incopy_type, output_type, convert_type, copy_type) \\\n\ +__kernel void l2normalizescale_axis1_I8_##in1_name##to##out_name##_2D \\\n\ + (\\\n\ + __read_only image2d_array_t input,\\\n\ + __read_only image2d_array_t scale,\\\n\ + __write_only image2d_array_t output,\\\n\ + int axis\\\n\ + )\\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_char8 src0_I8, src1_I8; \\\n\ + vxc_uint4 dst0_I8 = 0, dst1_I8 = 0; \\\n\ + for(int i = 0; i < L2NorS_depth; i += 2) \\\n\ + { \\\n\ + VXC_ReadImage(src0_I8, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1_I8, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y += 2; \\\n\ + VXC_DP4x4(dst0_I8, src0_I8, dst0_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataSquareAddU32Lo_4x4); \\\n\ + VXC_DP4x4(dst1_I8, src0_I8, dst1_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataSquareAddU32Hi_4x4); \\\n\ + VXC_DP4x4(dst0_I8, src1_I8, dst0_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataSquareAddU32Lo_4x4); \\\n\ + 
VXC_DP4x4(dst1_I8, src1_I8, dst1_I8, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataSquareAddU32Hi_4x4); \\\n\ + } \\\n\ + vxc_float4 sum_lo, sum_hi; \\\n\ + sum_lo = convert_float4(dst0_I8); \\\n\ + sum_hi = convert_float4(dst1_I8); \\\n\ + sum_lo = rsqrt(sum_lo) * r_inputScale; \\\n\ + sum_hi = rsqrt(sum_hi) * r_inputScale; \\\n\ + L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \\\n\ +}\n\ +\n\ +L2NORMSCALE_AXIS1_I8_2D(F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)\n\ +L2NORMSCALE_AXIS1_I8_2D(F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)\n\ +\n\ +\n\ +#define L2NORMSCALE_AXIS1_I16_2D(in1_name, out_name,\\\n\ + input_type, incopy_type, output_type, convert_type, copy_type) \\\n\ +__kernel void l2normalizescale_axis1_I16_##in1_name##to##out_name##_2D \\\n\ + (\\\n\ + __read_only image2d_array_t input,\\\n\ + __read_only image2d_array_t scale,\\\n\ + __write_only image2d_array_t output,\\\n\ + int axis\\\n\ + )\\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 src0_I16, src1_I16; \\\n\ + vxc_float4 squr, sum_lo = 0, sum_hi = 0; \\\n\ + for(int i = 0; i < L2NorS_depth; i += 2) \\\n\ + { \\\n\ + VXC_ReadImage(src0_I16, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1_I16, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y += 2; \\\n\ + VXC_DP4x4(squr, src0_I16, src0_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniIntegerSquareLo_4x4); \\\n\ + sum_lo = squr + sum_lo; \\\n\ + VXC_DP4x4(squr, src0_I16, src0_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniIntegerSquareHi_4x4); \\\n\ + sum_hi = squr + sum_hi; \\\n\ + VXC_DP4x4(squr, src1_I16, src1_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniIntegerSquareLo_4x4); \\\n\ + sum_lo = squr + sum_lo; \\\n\ + VXC_DP4x4(squr, src1_I16, src1_I16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniIntegerSquareHi_4x4); \\\n\ + sum_hi = squr + sum_hi; \\\n\ + } \\\n\ + sum_lo = rsqrt(sum_lo) * r_inputScale; \\\n\ + sum_hi = rsqrt(sum_hi) * r_inputScale; \\\n\ + L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \\\n\ +}\n\ +\n\ +L2NORMSCALE_AXIS1_I16_2D(F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)\n\ +L2NORMSCALE_AXIS1_I16_2D(F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)\n\ +\n\ +#define L2NORMSCALE_AXIS1_U8_2D(in1_name, out_name,\\\n\ + input_type, incopy_type, output_type, convert_type, copy_type) \\\n\ +__kernel void l2normalizescale_axis1_U8_##in1_name##to##out_name##_2D \\\n\ + (\\\n\ + __read_only image2d_array_t input,\\\n\ + __read_only image2d_array_t scale,\\\n\ + __write_only image2d_array_t output,\\\n\ + int axis\\\n\ + )\\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_uchar8 src0_U8, src1_U8; \\\n\ + vxc_float4 squr, sum_lo = 0, sum_hi = 0; \\\n\ + for(int i = 0; i < L2NorS_depth; i += 2) \\\n\ + { \\\n\ + vxc_uchar8 zero; \\\n\ + VXC_ReadImage(src0_U8, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1_U8, input, coord.xy, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y += 2; \\\n\ + _viv_asm(COPY, zero, inputZP, 4); \\\n\ + VXC_DP4x4(squr, src0_U8, zero, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniUInt8SquareLo_4x4); \\\n\ + sum_lo = squr + sum_lo; \\\n\ + VXC_DP4x4(squr, src0_U8, zero, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniUInt8SquareHi_4x4); \\\n\ + sum_hi = squr + sum_hi; \\\n\ + VXC_DP4x4(squr, src1_U8, zero, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniUInt8SquareLo_4x4); \\\n\ + sum_lo = squr + sum_lo; \\\n\ + VXC_DP4x4(squr, src1_U8, zero, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniUInt8SquareHi_4x4); \\\n\ + sum_hi = squr + sum_hi; \\\n\ + } \\\n\ + sum_lo = rsqrt(sum_lo) * r_inputScale; \\\n\ + sum_hi = rsqrt(sum_hi) * r_inputScale; \\\n\ + L2NORMSCALE_MUL_AXIS1_PROCESS(input_type, incopy_type, output_type, convert_type, copy_type) \\\n\ +}\n\ +\n\ +L2NORMSCALE_AXIS1_U8_2D(F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)\n\ +L2NORMSCALE_AXIS1_U8_2D(F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\ +"; /* end of l2normalizescale_axis1_vx*/ + +static const char log_softmax_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +_viv_uniform float rlogE;\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float betaValue;\n\ +_viv_uniform float scaleLogE;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform int inputWidth;\n\ +_viv_uniform int inputWidthRemain4;\n\ +_viv_uniform VXC_512Bits uniGetSubData0to3_4x4;\n\ +_viv_uniform VXC_512Bits uniGetSubData4to7_4x4;\n\ +_viv_uniform VXC_512Bits uniPackMaxData_2x8;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_AXIS0(read_fun, vert_max_fun, horz_max_fun) \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, val, val0, 16); \\\n\ + coord.x += 8; \\\n\ + do \\\n\ + { \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val1, val1, 16); \\\n\ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val2, val2, 16); \\\n\ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val3, val3, 16); \\\n\ + coord.x += 32; \\\n\ + vert_max_fun(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + vert_max_fun(val, img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + while(coord.x < (axisSize + 16)); \\\n\ + horz_max_fun(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \\\n\ + horz_max_fun(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_float4 prob; \\\n\ + float fProbSum = 0; \\\n\ + const float4 one4 = (float4)(1.0, 1.0, 1.0, 1.0); \\\n\ + int idx = 0; \\\n\ + for (coord.x = 0; coord.x < inputWidth; idx ++) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \\\n\ + prob *= scaleLogE; \\\n\ + prob = exp2(prob); \\\n\ + fProbSum += dot(prob, one4); \\\n\ + coord.x += 4; \\\n\ + } \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 
3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \\\n\ + prob *= scaleLogE; \\\n\ + if(inputWidthRemain4 == 1) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.yzw = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + else if(inputWidthRemain4 == 2) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.y = exp2(prob.y); \\\n\ + prob.zw = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + else if(inputWidthRemain4 == 3) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.y = exp2(prob.y); \\\n\ + prob.z = exp2(prob.z); \\\n\ + prob.w = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + vxc_float4 probSum_log; \\\n\ + probSum_log.x = log2(fProbSum) * rlogE;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \\\n\ + for (coord.x = 0; coord.x < axisSize; ) \\\n\ + { \\\n\ + dst_type vec0, vec1; \\\n\ + save_type dst; \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + prob = prob * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, vec0, prob); \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData4to7_4x4); \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + prob = prob * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, vec1, prob); \\\n\ + VXC_DP2x8(dst, vec0, vec1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + }\n\ +\n\ +#define LOGSOFTMAX_AXIS0(src_name, dst_name, src_type, copy_type, dst_type,\\\n\ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun, horz_max_fun) \\\n\ +__kernel void log_softmax_axis0_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + LOGSOFTMAX_PROCESS_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun) \\\n\ + LOGSOFTMAX_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \\\n\ +}\n\ +\n\ +LOGSOFTMAX_AXIS0(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_AXIS0(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_AXIS0(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_AXIS0(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ + CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_AXIS0(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_AXIS0(I16, F16, vxc_short8, vxc_short8, half4, 
vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_AXIS0(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_AXIS0(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8,\\\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_AXIS0(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +\n\ +#define LOGSOFTMAX_AXIS0_2D(src_name, dst_name, src_type, copy_type,\\\n\ + dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun, horz_max_fun) \\\n\ +__kernel void log_softmax_axis0_##src_name##to##dst_name##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(16, get_global_id(0)); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + LOGSOFTMAX_PROCESS_AXIS0(VXC_ReadImage, vert_max_fun, horz_max_fun) \\\n\ + LOGSOFTMAX_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage, VXC_WriteImage); \\\n\ +}\n\ +\n\ +LOGSOFTMAX_AXIS0_2D(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_AXIS0_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_AXIS0_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, \\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_AXIS0_2D(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_AXIS0_2D(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_AXIS0_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_AXIS0_2D(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_AXIS0_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8,\\\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_AXIS0_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +\n\ +#define LOGSOFTMAX_PROCESS_AXIS0_TOF32_SAVE(read_fun) \\\n\ + for (coord.x = 0; coord.x < axisSize; ) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + write_imagef(output, coord, prob); \\\n\ + coord.x += 4; \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniGetSubData4to7_4x4); \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + write_imagef(output, coord, prob); \\\n\ + coord.x += 4; \\\n\ + }\n\ +\n\ +#define LOGSOFTMAX_AXIS0_TOF32(src_name, src_type, copy_type, vert_max_fun, horz_max_fun) \\\n\ +__kernel void log_softmax_axis0_##src_name##toF32 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + LOGSOFTMAX_PROCESS_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun) \\\n\ + LOGSOFTMAX_PROCESS_AXIS0_TOF32_SAVE(VXC_ReadImage2DArray) \\\n\ +}\n\ +\n\ +LOGSOFTMAX_AXIS0_TOF32(F16, vxc_half8, vxc_short8, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_AXIS0_TOF32(I16, vxc_short8, vxc_short8, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_AXIS0_TOF32(I8, vxc_char16, vxc_char16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_AXIS0_TOF32(U8, vxc_uchar16, vxc_uchar16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +\n\ +#define LOGSOFTMAX_AXIS0_TOF32_2D(src_name, src_type, copy_type, vert_max_fun, horz_max_fun) \\\n\ +__kernel void log_softmax_axis0_##src_name##toF32_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(16, get_global_id(0)); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + LOGSOFTMAX_PROCESS_AXIS0(VXC_ReadImage, vert_max_fun, horz_max_fun) \\\n\ + LOGSOFTMAX_PROCESS_AXIS0_TOF32_SAVE(VXC_ReadImage) \\\n\ +}\n\ +\n\ +LOGSOFTMAX_AXIS0_TOF32_2D(F16, vxc_half8, vxc_short8, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_AXIS0_TOF32_2D(I16, vxc_short8, vxc_short8, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_AXIS0_TOF32_2D(I8, vxc_char16, vxc_char16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_AXIS0_TOF32_2D(U8, vxc_uchar16, vxc_uchar16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +\n\ +"; /* end of log_softmax_axis0_vx*/ + +static const char log_softmax_axis0_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +_viv_uniform float rlogE;\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float betaValue;\n\ +_viv_uniform float scaleLogE;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +\n\ +_viv_uniform int inputWidth;\n\ +_viv_uniform int inputWidthRemain4;\n\ +_viv_uniform VXC_512Bits uniPackMaxData_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +\n\ +\n\ +#define LOGSOFTMAX_PROCESS_AXIS0_BF16(read_fun) \\\n\ + vxc_half8 img_val0, img_val1, img_val2, img_val3; \\\n\ + vxc_short8 val0, val1, val2, val3; \\\n\ + vxc_half8 val; \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, val, val0, 16); \\\n\ + coord.x += 8; \\\n\ + do \\\n\ + { \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val1, val1, 16); \\\n\ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, 
img_val2, val2, 16); \\\n\ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val3, val3, 16); \\\n\ + coord.x += 32; \\\n\ + VXC_VertMax3_Half(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_VertMax3_Half(val, img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + while(coord.x < (axisSize + 16)); \\\n\ + VXC_HorzMax3_Half(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \\\n\ + VXC_HorzMax3_Half(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + vxc_ushort8 bf_val_tmp; \\\n\ + vxc_float4 vecA; \\\n\ + _viv_asm(COPY, bf_val_tmp, val, 16); \\\n\ + VXC_DP2x8(bf_val_tmp, bf_val_tmp, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecA, bf_val_tmp, 16); \\\n\ + vxc_float4 prob; \\\n\ + float fProbSum = 0; \\\n\ + const float4 one4 = (float4)(1.0, 1.0, 1.0, 1.0); \\\n\ + float max_value = vecA.x * scaleLogE; \\\n\ + float max_value_orig = vecA.x; \\\n\ + for (coord.x = 0; coord.x < inputWidth; ) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(bf_val_tmp, val0, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, prob, bf_val_tmp, 16); \\\n\ + prob = prob * scaleLogE - max_value; \\\n\ + prob = exp2(prob); \\\n\ + fProbSum += dot(prob, one4); \\\n\ + coord.x += 4; \\\n\ + } \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(bf_val_tmp, val0, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, prob, bf_val_tmp, 16); \\\n\ + prob = prob * scaleLogE - max_value; \\\n\ + if(inputWidthRemain4 == 1) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.yzw = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + else if(inputWidthRemain4 == 2) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.y = exp2(prob.y); \\\n\ + prob.zw = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + else if(inputWidthRemain4 == 3) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.y = exp2(prob.y); \\\n\ + prob.z = exp2(prob.z); \\\n\ + prob.w = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + vxc_float4 probSum_log; \\\n\ + probSum_log.x = log2(fProbSum) * rlogE;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_AXIS0_BF16TOBF16_SAVE(read_fun, write_fun) \\\n\ + for (coord.x = 0; coord.x < axisSize; ) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(bf_val_tmp, val0, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, prob, bf_val_tmp, 16); \\\n\ + prob = prob - max_value_orig; \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + vxc_ushort8 tmp, dst; \\\n\ + _viv_asm(COPY, tmp, prob, 16); \\\n\ + dst.s0123 = tmp.s1357; \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 4; \\\n\ + }\n\ +\n\ +#define LOGSOFTMAX_PROCESS_AXIS0_BF16TOF16_SAVE(read_fun, write_fun) \\\n\ + for (coord.x = 0; coord.x < axisSize; ) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); 
\\\n\ + VXC_DP2x8(bf_val_tmp, val0, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, prob, bf_val_tmp, 16); \\\n\ + prob = prob - max_value_orig; \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + half4 vec; \\\n\ + vxc_half4 tmp; \\\n\ + vxc_short4 dst; \\\n\ + _viv_asm(CONV, vec, prob); \\\n\ + VXC_DP4x4(tmp, vec, vec, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, dst, tmp, 8); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 4; \\\n\ + }\n\ +\n\ +#define LOGSOFTMAX_PROCESS_AXIS0_BF16TOF32_SAVE(read_fun) \\\n\ + for (coord.x = 0; coord.x < axisSize; ) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(bf_val_tmp, val0, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, prob, bf_val_tmp, 16); \\\n\ + prob = prob - max_value_orig; \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + write_imagef(output, coord, prob); \\\n\ + coord.x += 4; \\\n\ + }\n\ +\n\ +__kernel void log_softmax_axis0_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0);\n\ + LOGSOFTMAX_PROCESS_AXIS0_BF16(VXC_ReadImage2DArray)\n\ + LOGSOFTMAX_PROCESS_AXIS0_BF16TOBF16_SAVE(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +__kernel void log_softmax_axis0_BF16toF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0);\n\ + LOGSOFTMAX_PROCESS_AXIS0_BF16(VXC_ReadImage2DArray)\n\ + LOGSOFTMAX_PROCESS_AXIS0_BF16TOF16_SAVE(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +__kernel void log_softmax_axis0_BF16toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0);\n\ + LOGSOFTMAX_PROCESS_AXIS0_BF16(VXC_ReadImage2DArray)\n\ + LOGSOFTMAX_PROCESS_AXIS0_BF16TOF32_SAVE(VXC_ReadImage2DArray)\n\ +}\n\ +__kernel void log_softmax_axis0_BF16toBF16_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int2 coord = (int2)(16, get_global_id(0));\n\ + LOGSOFTMAX_PROCESS_AXIS0_BF16(VXC_ReadImage)\n\ + LOGSOFTMAX_PROCESS_AXIS0_BF16TOBF16_SAVE(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +__kernel void log_softmax_axis0_BF16toF16_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int2 coord = (int2)(16, get_global_id(0));\n\ + LOGSOFTMAX_PROCESS_AXIS0_BF16(VXC_ReadImage)\n\ + LOGSOFTMAX_PROCESS_AXIS0_BF16TOF16_SAVE(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +__kernel void log_softmax_axis0_BF16toF32_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int2 coord = (int2)(16, get_global_id(0));\n\ + LOGSOFTMAX_PROCESS_AXIS0_BF16(VXC_ReadImage)\n\ + LOGSOFTMAX_PROCESS_AXIS0_BF16TOF32_SAVE(VXC_ReadImage)\n\ +}\n\ +\n\ +"; /* end of log_softmax_axis0_BF16_vx*/ + +static const char log_softmax_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform 
float rlogE;\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float betaValue;\n\ +_viv_uniform float scaleLogE;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniGetSubLoData_4x4;\n\ +_viv_uniform VXC_512Bits uniGetSubHiData_4x4;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_AXIS1(read_fun, vert_max_fun) \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, max, in0, 16); \\\n\ + coord.y++; \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + vert_max_fun(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + } \\\n\ + while(coord.y < axisSize); \\\n\ + coord.y = 0; \\\n\ + sum0 = 0; \\\n\ + sum1 = 0; \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \\\n\ + data0 *= scaleLogE; \\\n\ + data0 = exp2(data0); \\\n\ + sum0 += data0; \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \\\n\ + data0 *= scaleLogE; \\\n\ + data0 = exp2(data0); \\\n\ + sum1 += data0; \\\n\ + coord.y++; \\\n\ + } \\\n\ + while(coord.y < axisSize); \\\n\ + sum0 = log2(sum0) * rlogE; \\\n\ + sum1 = log2(sum1) * rlogE;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \\\n\ + coord.y = 0; \\\n\ + dst_type dst0, dst1; \\\n\ + save_type vect; \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \\\n\ + data0 = data0 * betaValue - sum0; \\\n\ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst0, data0); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \\\n\ + data0 = data0 * betaValue - sum1; \\\n\ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst1, data0); \\\n\ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + } \\\n\ + while(coord.y < axisSize);\n\ +\n\ +#define LOGSOFTMAX_AXIS1(src_name, dst_name, src_type, copy_type, dst_type,\\\n\ +save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun) \\\n\ +__kernel void log_softmax_axis1_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + vxc_float4 sum0, sum1; \\\n\ + LOGSOFTMAX_PROCESS_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \\\n\ + LOGSOFTMAX_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \\\n\ +}\n\ +\n\ +\n\ +\n\ +LOGSOFTMAX_AXIS1(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Half)\n\ 
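+// Note: the (conv_mode, OUT_SCALE, OUT_OFFSET) arguments select the output quantization:\n\
+// F16 outputs use CONV with scale 1 / offset 0, while I8/I16/U8 outputs use CONV_SAT_RTE\n\
+// with outputScale (plus output_offset_asymmetric for U8).\n\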
+LOGSOFTMAX_AXIS1(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_AXIS1(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_AXIS1(F16, U8, vxc_half8, vxc_short8, uchar4,\\\n\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half)\n\ +\n\ +LOGSOFTMAX_AXIS1(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer)\n\ +\n\ +LOGSOFTMAX_AXIS1(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer)\n\ +\n\ +LOGSOFTMAX_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, uchar4,\\\n\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8, CONV, 1, 0, VXC_VertMax3_Integer)\n\ +\n\ +#define LOGSOFTMAX_AXIS1_2D(src_name, dst_name, src_type,\\\n\ +copy_type, dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun) \\\n\ +__kernel void log_softmax_axis1_##src_name##to##dst_name##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), 0); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + vxc_float4 sum0, sum1; \\\n\ + LOGSOFTMAX_PROCESS_AXIS1(VXC_ReadImage, vert_max_fun) \\\n\ + LOGSOFTMAX_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage, VXC_WriteImage); \\\n\ +}\n\ +\n\ +LOGSOFTMAX_AXIS1_2D(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_AXIS1_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_AXIS1_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_AXIS1_2D(F16, U8, vxc_half8, vxc_short8, uchar4,\\\n\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half)\n\ +\n\ +LOGSOFTMAX_AXIS1_2D(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_AXIS1_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer)\n\ +\n\ +LOGSOFTMAX_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8, \\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8, CONV, 1, 0, VXC_VertMax3_Integer)\n\ +\n\ +LOGSOFTMAX_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4,\\\n\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8, CONV, 1, 0, VXC_VertMax3_Integer)\n\ +\n\ +\n\ +#define LOGSOFTMAX_AXIS1_TOF32(src_name, src_type, copy_type, vert_max_fun) \\\n\ +__kernel void log_softmax_axis1_##src_name##toF32 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + 
int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + vxc_float4 sum0, sum1; \\\n\ + LOGSOFTMAX_PROCESS_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \\\n\ + coord.y = 0; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \\\n\ + data0 = data0 * betaValue - sum0; \\\n\ + write_imagef(output, coord, data0); \\\n\ + coord.x += 4; \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \\\n\ + data0 = data0 * betaValue - sum1; \\\n\ + write_imagef(output, coord, data0); \\\n\ + coord.x -= 4; \\\n\ + coord.y++; \\\n\ + } \\\n\ + while(coord.y < axisSize); \\\n\ +}\n\ +\n\ +LOGSOFTMAX_AXIS1_TOF32(F16, vxc_half8, vxc_short8, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_AXIS1_TOF32(I16, vxc_short8, vxc_short8, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_AXIS1_TOF32(I8, vxc_char16, vxc_char16, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_AXIS1_TOF32(U8, vxc_uchar16, vxc_uchar16, VXC_VertMax3_Integer)\n\ +\n\ +#define LOGSOFTMAX_AXIS1_TOF32_2D(src_name, src_type, copy_type, vert_max_fun) \\\n\ +__kernel void log_softmax_axis1_##src_name##toF32_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), 0); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + vxc_float4 sum0, sum1; \\\n\ + LOGSOFTMAX_PROCESS_AXIS1(VXC_ReadImage, vert_max_fun) \\\n\ + coord.y = 0; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \\\n\ + data0 = data0 * betaValue - sum0; \\\n\ + write_imagef(output, coord, data0); \\\n\ + coord.x += 4; \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \\\n\ + data0 = data0 * betaValue - sum1; \\\n\ + write_imagef(output, coord, data0); \\\n\ + coord.x -= 4; \\\n\ + coord.y++; \\\n\ + } \\\n\ + while(coord.y < axisSize); \\\n\ +}\n\ +\n\ +LOGSOFTMAX_AXIS1_TOF32_2D(F16, vxc_half8, vxc_short8, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_AXIS1_TOF32_2D(I16, vxc_short8, vxc_short8, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_AXIS1_TOF32_2D(I8, vxc_char16, vxc_char16, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_AXIS1_TOF32_2D(U8, vxc_uchar16, vxc_uchar16, VXC_VertMax3_Integer)\n\ +"; /* end of log_softmax_axis1_vx*/ + +static const char log_softmax_axis1_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float rlogE;\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float betaValue;\n\ +_viv_uniform float scaleLogE;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_AXIS1_BF16(read_fun) \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, max, in0, 16); \\\n\ + coord.y++; \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, 
in0, 16); \\\n\ + VXC_VertMax3_Half(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + } \\\n\ + while(coord.y < axisSize); \\\n\ + _viv_asm(COPY, tmp0, max, 16); \\\n\ + VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, max_lo, tmp1, 16); \\\n\ + VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, max_hi, tmp1, 16); \\\n\ + coord.y = 0; \\\n\ + sum0 = 0; \\\n\ + sum1 = 0; \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data0, tmp1, 16); \\\n\ + data0 = data0 - max_lo; \\\n\ + data0 *= scaleLogE; \\\n\ + sum0 += exp2(data0); \\\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, data0, tmp1, 16); \\\n\ + data0 = data0 - max_hi; \\\n\ + data0 *= scaleLogE; \\\n\ + sum1 += exp2(data0); \\\n\ + coord.y++; \\\n\ + } \\\n\ + while (coord.y < axisSize); \\\n\ + sum0 = log2(sum0) * rlogE; \\\n\ + sum1 = log2(sum1) * rlogE;\n\ +\n\ +__kernel void log_softmax_axis1_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + vxc_short8 in0;\n\ + vxc_half8 vec0, max;\n\ + vxc_float4 data0;\n\ + vxc_float4 sum0, sum1;\n\ + vxc_float4 max_lo, max_hi;\n\ + vxc_ushort8 tmp0, tmp1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + LOGSOFTMAX_PROCESS_AXIS1_BF16(VXC_ReadImage2DArray)\n\ +\n\ + coord.y = 0;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + do\n\ + {\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_lo;\n\ + data0 = data0 * betaValue - sum0;\n\ + _viv_asm(COPY, dst0, data0, 16);\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_hi;\n\ + data0 = data0 * betaValue - sum1;\n\ + _viv_asm(COPY, dst1, data0, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + }\n\ + while(coord.y < axisSize);\n\ +}\n\ +\n\ +__kernel void log_softmax_axis1_BF16toF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + vxc_short8 in0;\n\ + vxc_half8 vec0, max;\n\ + vxc_float4 data0;\n\ + vxc_float4 sum0, sum1;\n\ + vxc_float4 max_lo, max_hi;\n\ + vxc_ushort8 tmp0, tmp1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + LOGSOFTMAX_PROCESS_AXIS1_BF16(VXC_ReadImage2DArray)\n\ +\n\ + coord.y = 0;\n\ + half4 dst0, dst1;\n\ + do\n\ + {\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, 
data0, tmp1, 16);\n\ + data0 = data0 - max_lo;\n\ + data0 = data0 * betaValue - sum0;\n\ + _viv_asm(CONV, dst0, data0);\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_hi;\n\ + data0 = data0 * betaValue - sum1;\n\ + _viv_asm(CONV, dst1, data0);\n\ + VXC_DP2x8(vec0, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);\n\ + vxc_short8 vect;\n\ + _viv_asm(COPY, vect, vec0, 16);\n\ + VXC_WriteImage2DArray(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + }\n\ + while(coord.y < axisSize);\n\ +}\n\ +\n\ +__kernel void log_softmax_axis1_BF16toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + vxc_short8 in0;\n\ + vxc_half8 vec0, max;\n\ + vxc_float4 data0;\n\ + vxc_float4 sum0, sum1;\n\ + vxc_float4 max_lo, max_hi;\n\ + vxc_ushort8 tmp0, tmp1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + LOGSOFTMAX_PROCESS_AXIS1_BF16(VXC_ReadImage2DArray)\n\ +\n\ + coord.y = 0;\n\ + do\n\ + {\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_lo;\n\ + data0 = data0 * betaValue - sum0;\n\ + write_imagef(output, coord, data0);\n\ + coord.x += 4;\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_hi;\n\ + data0 = data0 * betaValue - sum1;\n\ + write_imagef(output, coord, data0);\n\ + coord.x -= 4;\n\ + coord.y++;\n\ + }\n\ + while (coord.y < axisSize);\n\ +}\n\ +\n\ +__kernel void log_softmax_axis1_BF16toBF16_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + vxc_short8 in0;\n\ + vxc_half8 vec0, max;\n\ + vxc_float4 data0;\n\ + vxc_float4 sum0, sum1;\n\ + vxc_float4 max_lo, max_hi;\n\ + vxc_ushort8 tmp0, tmp1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + LOGSOFTMAX_PROCESS_AXIS1_BF16(VXC_ReadImage)\n\ +\n\ + coord.y = 0;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + do\n\ + {\n\ + VXC_ReadImage(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_lo;\n\ + data0 = data0 * betaValue - sum0;\n\ + _viv_asm(COPY, dst0, data0, 16);\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_hi;\n\ + data0 = data0 * betaValue - sum1;\n\ + _viv_asm(COPY, dst1, data0, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + }\n\ + while(coord.y < axisSize);\n\ +}\n\ +\n\ +__kernel void log_softmax_axis1_BF16toF16_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ 
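+// 2D variant: BF16 inputs are widened to F32 with uniConvBF16toF32_Part0/1_2x8, and the\n\
+// result is converted to F16 and packed via uniExtractHalf8_2x8 before being written out.\n\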
+{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + vxc_short8 in0;\n\ + vxc_half8 vec0, max;\n\ + vxc_float4 data0;\n\ + vxc_float4 sum0, sum1;\n\ + vxc_float4 max_lo, max_hi;\n\ + vxc_ushort8 tmp0, tmp1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + LOGSOFTMAX_PROCESS_AXIS1_BF16(VXC_ReadImage)\n\ +\n\ + coord.y = 0;\n\ + half4 dst0, dst1;\n\ + do\n\ + {\n\ + VXC_ReadImage(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_lo;\n\ + data0 = data0 * betaValue - sum0;\n\ + _viv_asm(CONV, dst0, data0);\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_hi;\n\ + data0 = data0 * betaValue - sum1;\n\ + _viv_asm(CONV, dst1, data0);\n\ + VXC_DP2x8(vec0, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);\n\ + vxc_short8 vect;\n\ + _viv_asm(COPY, vect, vec0, 16);\n\ + VXC_WriteImage(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + }\n\ + while(coord.y < axisSize);\n\ +}\n\ +\n\ +__kernel void log_softmax_axis1_BF16toF32_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + vxc_short8 in0;\n\ + vxc_half8 vec0, max;\n\ + vxc_float4 data0;\n\ + vxc_float4 sum0, sum1;\n\ + vxc_float4 max_lo, max_hi;\n\ + vxc_ushort8 tmp0, tmp1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + LOGSOFTMAX_PROCESS_AXIS1_BF16(VXC_ReadImage)\n\ +\n\ + coord.y = 0;\n\ + do\n\ + {\n\ + VXC_ReadImage(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_lo;\n\ + data0 = data0 * betaValue - sum0;\n\ + write_imagef(output, coord, data0);\n\ + coord.x += 4;\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_hi;\n\ + data0 = data0 * betaValue - sum1;\n\ + write_imagef(output, coord, data0);\n\ + coord.x -= 4;\n\ + coord.y++;\n\ + }\n\ + while (coord.y < axisSize);\n\ +}\n\ +"; /* end of log_softmax_axis1_BF16_vx*/ + +static const char log_softmax_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float rlogE;\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float betaValue;\n\ +_viv_uniform float scaleLogE;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniGetSubLoData_4x4;\n\ +_viv_uniform VXC_512Bits uniGetSubHiData_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_AXIS2(read_fun, vert_max_fun) \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, max, in0, 16); \\\n\ + coord.z++; \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + 
_viv_asm(COPY, vec0, in0, 16); \\\n\ + vert_max_fun(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + } \\\n\ + while(coord.z < axisSize); \\\n\ + coord.z = 0; \\\n\ + sum0 = 0; \\\n\ + sum1 = 0; \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \\\n\ + data0 *= scaleLogE; \\\n\ + data0 = exp2(data0); \\\n\ + sum0 += data0; \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \\\n\ + data0 *= scaleLogE; \\\n\ + data0 = exp2(data0); \\\n\ + sum1 += data0; \\\n\ + coord.z++; \\\n\ + } \\\n\ + while(coord.z < axisSize); \\\n\ + sum0 = log2(sum0) * rlogE; \\\n\ + sum1 = log2(sum1) * rlogE;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_AXIS2_SAVE(dst_type, save_type,\\\n\ +conv_mode, OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \\\n\ + coord.z = 0; \\\n\ + dst_type dst0, dst1; \\\n\ + save_type vect; \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \\\n\ + data0 = data0 * betaValue - sum0; \\\n\ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst0, data0); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \\\n\ + data0 = data0 * betaValue - sum1; \\\n\ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst1, data0); \\\n\ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + } \\\n\ + while(coord.z < axisSize);\n\ +\n\ +#define LOGSOFTMAX_AXIS2(src_name, dst_name, src_type, copy_type,\\\n\ +dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun) \\\n\ +__kernel void log_softmax_axis2_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + vxc_float4 sum0, sum1; \\\n\ + LOGSOFTMAX_PROCESS_AXIS2(VXC_ReadImage2DArray, vert_max_fun) \\\n\ + LOGSOFTMAX_PROCESS_AXIS2_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \\\n\ +}\n\ +\n\ +LOGSOFTMAX_AXIS2(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_AXIS2(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_AXIS2(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_AXIS2(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half)\n\ +\n\ +LOGSOFTMAX_AXIS2(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_AXIS2(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer)\n\ +\n\ +LOGSOFTMAX_AXIS2(I8, I8, 
vxc_char16, vxc_char16, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_AXIS2(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer)\n\ +\n\ +LOGSOFTMAX_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8,\\\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_AXIS2(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer)\n\ +\n\ +\n\ +#define LOGSOFTMAX_AXIS2_TOF32(src_name, src_type, copy_type, vert_max_fun) \\\n\ +__kernel void log_softmax_axis2_##src_name##toF32 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + vxc_float4 sum0, sum1; \\\n\ + LOGSOFTMAX_PROCESS_AXIS2(VXC_ReadImage2DArray, vert_max_fun) \\\n\ + coord.z = 0; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \\\n\ + data0 = data0 * betaValue - sum0; \\\n\ + write_imagef(output, coord, data0); \\\n\ + coord.x += 4; \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \\\n\ + data0 = data0 * betaValue - sum1; \\\n\ + write_imagef(output, coord, data0); \\\n\ + coord.x -= 4; \\\n\ + coord.z++; \\\n\ + } \\\n\ + while(coord.z < axisSize); \\\n\ +}\n\ +\n\ +LOGSOFTMAX_AXIS2_TOF32(F16, vxc_half8, vxc_short8, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_AXIS2_TOF32(I16, vxc_short8, vxc_short8, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_AXIS2_TOF32(I8, vxc_char16, vxc_char16, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_AXIS2_TOF32(U8, vxc_uchar16, vxc_uchar16, VXC_VertMax3_Integer)\n\ +\n\ +#define LOGSOFTMAX_PROCESS_AXIS2_BF16(read_fun) \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, max, in0, 16); \\\n\ + coord.z++; \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_VertMax3_Half(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + } \\\n\ + while(coord.z < axisSize); \\\n\ + _viv_asm(COPY, tmp0, max, 16); \\\n\ + VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, max_lo, tmp1, 16); \\\n\ + VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, max_hi, tmp1, 16); \\\n\ + coord.z = 0; \\\n\ + sum0 = 0; \\\n\ + sum1 = 0; \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data0, tmp1, 16); \\\n\ + data0 = data0 - max_lo; \\\n\ + data0 *= scaleLogE; \\\n\ + sum0 += exp2(data0); \\\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, data0, tmp1, 16); \\\n\ + data0 = data0 - max_hi; \\\n\ + data0 *= scaleLogE; \\\n\ + sum1 += exp2(data0); \\\n\ + coord.z++; \\\n\ + } \\\n\ 
+ while (coord.z < axisSize); \\\n\ + sum0 = log2(sum0) * rlogE; \\\n\ + sum1 = log2(sum1) * rlogE;\n\ +\n\ +__kernel void log_softmax_axis2_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + vxc_short8 in0;\n\ + vxc_half8 vec0, max;\n\ + vxc_float4 data0;\n\ + vxc_float4 sum0, sum1;\n\ + vxc_float4 max_lo, max_hi;\n\ + vxc_ushort8 tmp0, tmp1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + LOGSOFTMAX_PROCESS_AXIS2_BF16(VXC_ReadImage2DArray)\n\ +\n\ + coord.z = 0;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + do\n\ + {\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_lo;\n\ + data0 = data0 * betaValue - sum0;\n\ + _viv_asm(COPY, dst0, data0, 16);\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_hi;\n\ + data0 = data0 * betaValue - sum1;\n\ + _viv_asm(COPY, dst1, data0, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + }\n\ + while(coord.z < axisSize);\n\ +}\n\ +\n\ +__kernel void log_softmax_axis2_BF16toF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + vxc_short8 in0;\n\ + vxc_half8 vec0, max;\n\ + vxc_float4 data0;\n\ + vxc_float4 sum0, sum1;\n\ + vxc_float4 max_lo, max_hi;\n\ + vxc_ushort8 tmp0, tmp1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + LOGSOFTMAX_PROCESS_AXIS2_BF16(VXC_ReadImage2DArray)\n\ +\n\ + coord.z = 0;\n\ + half4 dst0, dst1;\n\ + do\n\ + {\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_lo;\n\ + data0 = data0 * betaValue - sum0;\n\ + _viv_asm(CONV, dst0, data0);\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_hi;\n\ + data0 = data0 * betaValue - sum1;\n\ + _viv_asm(CONV, dst1, data0);\n\ + VXC_DP2x8(vec0, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);\n\ + vxc_short8 vect;\n\ + _viv_asm(COPY, vect, vec0, 16);\n\ + VXC_WriteImage2DArray(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + }\n\ + while(coord.z < axisSize);\n\ +}\n\ +\n\ +__kernel void log_softmax_axis2_BF16toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + vxc_short8 in0;\n\ + vxc_half8 vec0, max;\n\ + vxc_float4 data0;\n\ + vxc_float4 sum0, sum1;\n\ + vxc_float4 max_lo, max_hi;\n\ + vxc_ushort8 tmp0, tmp1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + 
LOGSOFTMAX_PROCESS_AXIS2_BF16(VXC_ReadImage2DArray)\n\ +\n\ + coord.z = 0;\n\ + do\n\ + {\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_lo;\n\ + data0 = data0 * betaValue - sum0;\n\ + write_imagef(output, coord, data0);\n\ + coord.x += 4;\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_hi;\n\ + data0 = data0 * betaValue - sum1;\n\ + write_imagef(output, coord, data0);\n\ + coord.x -= 4;\n\ + coord.z++;\n\ + }\n\ + while (coord.z < axisSize);\n\ +}\n\ +"; /* end of log_softmax_axis2_vx*/ + +static const char logical_not_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void logical_not_I8toI8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_char8 src0;\n\ + vxc_char8 dst;\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + dst = !src0;\n\ + dst *= (-1);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void logical_not_I8toI8_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + vxc_char8 src0;\n\ + vxc_char8 dst;\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + dst = !src0;\n\ + dst *= (-1);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +"; /* end of logical_not_vx*/ + +static const char logical_ops_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define TENSORLOGICAL_PROCESS(input_type, copy_type, output_type, out_copy_type,\\\n\ +lgc_op, lgc_op2, read_fun, write_fun) \\\n\ + input_type vA;\\\n\ + copy_type src0;\\\n\ + input_type vB;\\\n\ + copy_type src1;\\\n\ + read_fun(vA,in0,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\\\n\ + _viv_asm(COPY, src0, vA, 16); \\\n\ + read_fun(vB,in1,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\\\n\ + _viv_asm(COPY, src1, vB, 16); \\\n\ + output_type dst; \\\n\ + dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \\\n\ + dst *= (-1); \\\n\ + out_copy_type data; \\\n\ + _viv_asm(COPY, data, dst, 16); \\\n\ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +\n\ +#define TENSORLOGICAL(name0, src_type_name, dst_type_name, input_type, copy_type,\\\n\ +output_type, out_copy_type, lgc_op, lgc_op2) \\\n\ + __kernel void logical_##name0##_##src_type_name##to##dst_type_name( \\\n\ + __read_only image2d_array_t in0, \\\n\ + __read_only image2d_array_t in1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{\\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\\\n\ + TENSORLOGICAL_PROCESS(input_type, copy_type, output_type, out_copy_type,\\\n\ + lgc_op, lgc_op2, VXC_ReadImage2DArray, VXC_WriteImage2DArray) \\\n\ +}\n\ +\n\ +#define TENSORLOGICAL_2D(name0, src_type_name, dst_type_name, input_type,\\\n\ +copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \\\n\ + __kernel void 
logical_##name0##_##src_type_name##to##dst_type_name##_2D( \\\n\ + __read_only image2d_array_t in0, \\\n\ + __read_only image2d_array_t in1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{\\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\\\n\ + TENSORLOGICAL_PROCESS(input_type, copy_type, output_type, out_copy_type,\\\n\ + lgc_op, lgc_op2, VXC_ReadImage, VXC_WriteImage) \\\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniMulShortMinus1toFp16_2x8;\n\ +\n\ +#define TENSORLOGICAL_FP_PROCESS(input_type, copy_type, output_type,\\\n\ +out_copy_type, lgc_op, lgc_op2, read_fun, write_fun) \\\n\ + input_type vA;\\\n\ + copy_type src0;\\\n\ + input_type vB;\\\n\ + copy_type src1;\\\n\ + read_fun(vA,in0,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\\\n\ + _viv_asm(COPY, src0, vA, 16); \\\n\ + read_fun(vB,in1,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\\\n\ + _viv_asm(COPY, src1, vB, 16); \\\n\ + output_type dst; \\\n\ + dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \\\n\ + vxc_half8 tmpOut; \\\n\ + VXC_DP2x8(tmpOut,dst,dst,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero, 0),uniMulShortMinus1toFp16_2x8); \\\n\ + out_copy_type data; \\\n\ + _viv_asm(COPY, data, tmpOut, 16); \\\n\ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +\n\ +\n\ +#define TENSORLOGICAL_FP(name0, src_type_name, dst_type_name, input_type,\\\n\ +copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \\\n\ + __kernel void logical_##name0##_##src_type_name##to##dst_type_name( \\\n\ + __read_only image2d_array_t in0, \\\n\ + __read_only image2d_array_t in1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{\\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\\\n\ + TENSORLOGICAL_FP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\\\n\ + VXC_ReadImage2DArray, VXC_WriteImage2DArray) \\\n\ +}\n\ +\n\ +#define TENSORLOGICAL_FP_2D(name0, src_type_name, dst_type_name, input_type,\\\n\ +copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \\\n\ + __kernel void logical_##name0##_##src_type_name##to##dst_type_name##_2D( \\\n\ + __read_only image2d_array_t in0, \\\n\ + __read_only image2d_array_t in1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{\\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\\\n\ + TENSORLOGICAL_FP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\\\n\ + VXC_ReadImage, VXC_WriteImage) \\\n\ +}\n\ +\n\ +// name0, src_name, dst_name, input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2\n\ +TENSORLOGICAL(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, )\n\ +//TENSORLOGICAL(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, )\n\ +//TENSORLOGICAL(or, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )\n\ +//TENSORLOGICAL_FP(or, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )\n\ +TENSORLOGICAL(and, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, &&, )\n\ +//TENSORLOGICAL(and, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, &&, )\n\ +//TENSORLOGICAL(and, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, )\n\ +//TENSORLOGICAL_FP(and, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, )\n\ +TENSORLOGICAL(xor, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ^, !!)\n\ +//TENSORLOGICAL(xor, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ^, !!)\n\ +//TENSORLOGICAL(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, 
vxc_short8, ^, !!)\n\ +//TENSORLOGICAL_FP(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ +\n\ +TENSORLOGICAL_2D(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, )\n\ +//TENSORLOGICAL_2D(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, )\n\ +//TENSORLOGICAL_2D(or, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )\n\ +//TENSORLOGICAL_FP_2D(or, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )\n\ +TENSORLOGICAL_2D(and, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, &&, )\n\ +//TENSORLOGICAL_2D(and, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, &&, )\n\ +//TENSORLOGICAL_2D(and, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, )\n\ +//TENSORLOGICAL_FP_2D(and, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, )\n\ +TENSORLOGICAL_2D(xor, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ^, !!)\n\ +//TENSORLOGICAL_2D(xor, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ^, !!)\n\ +//TENSORLOGICAL_2D(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ +//TENSORLOGICAL_FP_2D(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ +"; /* end of logical_ops_vx*/ + +static const char lstmunit_activation_BP_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4;\n\ +\n\ +#define LSTMUNIT_BP_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_BP_F16to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3; \\\n\ + vxc_half8 src0, src1, src2, src3; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 
0)); \\\n\ + _viv_asm(COPY, src0, vect0, 16); \\\n\ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src10, vect10, 16); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, vect13, 16); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + data_i_t = data_i_t + b0; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_BP_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_BP_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_BP_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_BP_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_BP_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_BP_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_BP_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_BP_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_BP_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_BP_F16to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \\\n\ + vxc_half8 src0, src1, src2, src3, src4; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, vect0, 16); \\\n\ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src10, vect10, 16); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); 
\\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, vect13, 16); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect4, 16); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + data_i_t = data_i_t + b0; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src0, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_BP_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_BP_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_BP_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_BP_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_BP_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_BP_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_BP_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_BP_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_BP_F16_vx*/ + +static const char lstmunit_activation_BP_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 
x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +_viv_uniform VXC_512Bits uniU8AddS32_4x4;\n\ +_viv_uniform int4 input0Array_ZP;\n\ +_viv_uniform int4 input1Array_ZP;\n\ +_viv_uniform float4 input0Array_Scale;\n\ +_viv_uniform float4 input1Array_Scale;\n\ +\n\ +#define LSTMUNIT_BP_U8_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_BP_U8to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + float4 vecA, vecB; \\\n\ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx + b0; \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * 
input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_BP_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_BP_U8_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_BP_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_BP_U8_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_BP_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_BP_U8_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_BP_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_BP_U8_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_BP_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_BP_U8to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0; \\\n\ + vxc_half8 src4; \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + 
float4 b0, b1, b2, b3; \\\n\ + float4 vecA, vecB; \\\n\ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect0, 16); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx + b0; \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src4, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_BP_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_BP_U8_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_BP_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_BP_U8_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_BP_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_BP_U8_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_BP_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_BP_U8_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_BP_U8_vx*/ + +static const char lstmunit_activation_B_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4;\n\ +\n\ +#define LSTMUNIT_B_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_B_F16to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3; \\\n\ + vxc_half8 src0, src1, src2, src3; \\\n\ + vxc_short8 
vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, vect0, 16); \\\n\ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src10, vect10, 16); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, vect13, 16); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + data_i_t = data_i_t + b0; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_B_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_B_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_B_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_B_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_B_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_B_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_B_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_B_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_B_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_B_F16to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \\\n\ + vxc_half8 src0, src1, src2, src3, src4; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, vect0, 16); \\\n\ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src10, vect10, 16); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + 
_viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, vect13, 16); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect4, 16); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + data_i_t = data_i_t + b0; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src0, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_B_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_B_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_B_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_B_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_B_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_B_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_B_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_B_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_B_F16_vx*/ + +static const char lstmunit_activation_B_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +_viv_uniform VXC_512Bits uniU8AddS32_4x4;\n\ +_viv_uniform int4 input0Array_ZP;\n\ +_viv_uniform int4 input1Array_ZP;\n\ +_viv_uniform float4 input0Array_Scale;\n\ +_viv_uniform float4 input1Array_Scale;\n\ +\n\ +#define LSTMUNIT_B_U8_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_B_U8to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, 
float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + float4 vecA, vecB; \\\n\ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx + b0; \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +LSTMUNIT_B_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_B_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_B_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_B_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_B_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_B_U8to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0; \\\n\ + vxc_half8 src4; \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + float4 vecA, vecB; \\\n\ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect0, 16); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_c_t, src4, src4, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx + b0; \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src4, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_B_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_B_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_B_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_B_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_B_U8_vx*/ + +static const char lstmunit_activation_CBP_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float 
outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4;\n\ +\n\ +#define LSTMUNIT_CBP_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CBP_F16to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3; \\\n\ + vxc_half8 src0, src1, src2, src3; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, vect13, 16); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CBP_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CBP_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CBP_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CBP_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CBP_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CBP_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CBP_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CBP_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_CBP_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CBP_F16to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \\\n\ + vxc_half8 src0, src1, src2, src3, src4; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, vect13, 16); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect4, 16); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, 
coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src0, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CBP_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CBP_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CBP_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CBP_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CBP_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CBP_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CBP_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CBP_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CBP_F16_vx*/ + +static const char lstmunit_activation_CBP_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +_viv_uniform VXC_512Bits uniU8AddS32_4x4;\n\ +_viv_uniform int4 input0Array_ZP;\n\ +_viv_uniform int4 input1Array_ZP;\n\ +_viv_uniform float4 input0Array_Scale;\n\ +_viv_uniform float4 input1Array_Scale;\n\ +\n\ 
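+/* Note on the U8 kernels that follow: each gate pre-activation is dequantized on the\n\
+ * fly. uniU8AddS32_4x4 applies the per-gate zero-point offset (input0Array_ZP /\n\
+ * input1Array_ZP) to the 8-bit input and hidden-state contributions, which are then\n\
+ * scaled by input0Array_Scale / input1Array_Scale and summed with the float bias rows. */\n\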
+#define LSTMUNIT_CBP_U8_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CBP_U8to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 vecA, vecB; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CBP_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CBP_U8_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CBP_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CBP_U8_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CBP_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CBP_U8_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CBP_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CBP_U8_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_CBP_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CBP_U8to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0; \\\n\ + vxc_half8 src4; \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + float4 vecA, vecB; \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect0, 16); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, 
input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src4, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CBP_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CBP_U8_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CBP_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CBP_U8_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CBP_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CBP_U8_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CBP_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CBP_U8_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CBP_U8_vx*/ + +static const char lstmunit_activation_CB_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4;\n\ 
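+/* Each lstmunit_activation kernel below evaluates one coupled-gate LSTM step per\n\
+ * 4-wide lane: f = act(pre_f + forget_bias), g = tanh(pre_c), o = act(pre_o),\n\
+ * c_out = c_in * f + (1 - f) * g clipped to [clip_Min_F, clip_Max_F], and\n\
+ * out = o * tanh(c_out) * outputScale + outputZP, where act is sigmoid or\n\
+ * hard_sigmoid depending on the macro instantiation. */\n\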
+\n\ +#define LSTMUNIT_CB_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CB_F16to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3; \\\n\ + vxc_half8 src0, src1, src2, src3; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, vect13, 16); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CB_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CB_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CB_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CB_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CB_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CB_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CB_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CB_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_CB_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CB_F16to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \\\n\ + vxc_half8 src0, src1, src2, src3, src4; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, vect13, 16); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 
0)); \\\n\ + _viv_asm(COPY, src4, vect4, 16); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src0, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CB_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CB_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CB_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CB_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CB_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CB_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CB_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CB_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CB_F16_vx*/ + +static const char lstmunit_activation_CB_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ 
+_viv_uniform VXC_512Bits uniU8AddS32_4x4;\n\ +_viv_uniform int4 input0Array_ZP;\n\ +_viv_uniform int4 input1Array_ZP;\n\ +_viv_uniform float4 input0Array_Scale;\n\ +_viv_uniform float4 input1Array_Scale;\n\ +\n\ +#define LSTMUNIT_CB_U8_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CB_U8to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + float4 vecA, vecB; \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? 
clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CB_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CB_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CB_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CB_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_CB_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CB_U8to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0; \\\n\ + vxc_half8 src4; \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + float4 vecA, vecB; \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect0, 16); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * 
input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy + b1; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz + b2; \\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src4, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CB_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CB_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CB_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CB_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CB_U8_vx*/ + +static const char lstmunit_activation_CLP_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +\n\ +#define LSTMUNIT_CLP_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CLP_F16to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only 
image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3; \\\n\ + vxc_half8 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 w0, w1, w2, b0, b1, b2; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + w0 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w1 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b0 = read_imagef(bias_f, coord_in.xw); \\\n\ + b1 = read_imagef(bias_c, coord_in.xw); \\\n\ + b2 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + data_f_t = data_f_t * w0 + b0; \\\n\ + data_g_t = data_g_t * w1 + b1; \\\n\ + data_o_t = data_o_t * w2 + b2; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CLP_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CLP_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CLP_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CLP_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CLP_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CLP_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CLP_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CLP_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +#define LSTMUNIT_CLP_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CLP_F16to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \\\n\ + vxc_half8 src0, src1, src2, src3, src4; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 w0, w1, w2, b0, b1, b2; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect4, 16); \\\n\ + w0 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w1 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + b0 = read_imagef(bias_f, coord_in.xw); \\\n\ + b1 = read_imagef(bias_c, coord_in.xw); \\\n\ + b2 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + 
VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + data_f_t = data_f_t * w0 + b0; \\\n\ + data_g_t = data_g_t * w1 + b1; \\\n\ + data_o_t = data_o_t * w2 + b2; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 cell_data; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, cell_data, data_c_t); \\\n\ + VXC_DP4x4(src0, cell_data, cell_data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src0, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CLP_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CLP_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CLP_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CLP_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CLP_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CLP_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CLP_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CLP_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CLP_F16_vx*/ + +static const char lstmunit_activation_CL_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +\n\ +#define LSTMUNIT_CL_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CL_F16to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only 
image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3; \\\n\ + vxc_half8 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 w0, w1, w2, b0, b1, b2; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + w0 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w1 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b0 = read_imagef(bias_f, coord_in.xw); \\\n\ + b1 = read_imagef(bias_c, coord_in.xw); \\\n\ + b2 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + data_f_t = data_f_t * w0 + b0; \\\n\ + data_g_t = data_g_t * w1 + b1; \\\n\ + data_o_t = data_o_t * w2 + b2; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CL_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CL_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CL_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CL_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CL_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CL_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CL_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CL_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_CL_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CL_F16to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \\\n\ + vxc_half8 src0, src1, src2, src3, src4; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 w0, w1, w2, b0, b1, b2; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect4, 16); \\\n\ + w0 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w1 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + b0 = read_imagef(bias_f, coord_in.xw); \\\n\ + b1 = read_imagef(bias_c, coord_in.xw); \\\n\ + b2 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, 
src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + data_f_t = data_f_t * w0 + b0; \\\n\ + data_g_t = data_g_t * w1 + b1; \\\n\ + data_o_t = data_o_t * w2 + b2; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 cell_data; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, cell_data, data_c_t); \\\n\ + VXC_DP4x4(src0, cell_data, cell_data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src0, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CL_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CL_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CL_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CL_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CL_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CL_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CL_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CL_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CL_F16_vx*/ + +static const char lstmunit_activation_CSP_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4;\n\ +\n\ +#define LSTMUNIT_CSP_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CSP_F16to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t 
hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3; \\\n\ + vxc_half8 src0, src1, src2, src3; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, vect13, 16); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CSP_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CSP_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CSP_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CSP_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CSP_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CSP_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CSP_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CSP_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_CSP_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CSP_F16to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \\\n\ + vxc_half8 src0, src1, src2, src3, src4; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, vect13, 16); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect4, 16); \\\n\ + \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); 
\\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src0, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CSP_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CSP_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CSP_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CSP_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CSP_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CSP_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CSP_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CSP_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CSP_F16_vx*/ + +static const char lstmunit_activation_CSP_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +_viv_uniform VXC_512Bits uniU8AddS32_4x4;\n\ +_viv_uniform int4 input0Array_ZP;\n\ +_viv_uniform int4 input1Array_ZP;\n\ +_viv_uniform float4 input0Array_Scale;\n\ +_viv_uniform float4 input1Array_Scale;\n\ +\n\ +#define LSTMUNIT_CSP_U8_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CSP_U8to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only 
image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 vecA, vecB; \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz; \\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CSP_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CSP_U8_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CSP_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CSP_U8_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CSP_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CSP_U8_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CSP_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CSP_U8_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_CSP_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CSP_U8to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0; \\\n\ + vxc_half8 src4; \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 vecA, vecB; \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect0, 16); \\\n\ + \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + 
VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz; \\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src4, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CSP_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CSP_U8_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CSP_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CSP_U8_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CSP_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CSP_U8_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CSP_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CSP_U8_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CSP_U8_vx*/ + +static const char lstmunit_activation_CS_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4;\n\ +\n\ +#define LSTMUNIT_CS_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CS_F16to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + 
__read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3; \\\n\ + vxc_half8 src0, src1, src2, src3; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, vect13, 16); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CS_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CS_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CS_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CS_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CS_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CS_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CS_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CS_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_CS_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CS_F16to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \\\n\ + vxc_half8 src0, src1, src2, src3, src4; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, vect13, 16); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect4, 16); \\\n\ + \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src0, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CS_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CS_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_CS_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CS_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CS_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_CS_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_CS_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CS_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CS_F16_vx*/ + +static const char lstmunit_activation_CS_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +_viv_uniform VXC_512Bits uniU8AddS32_4x4;\n\ +_viv_uniform int4 input0Array_ZP;\n\ +_viv_uniform int4 input1Array_ZP;\n\ +_viv_uniform float4 input0Array_Scale;\n\ +_viv_uniform float4 input1Array_Scale;\n\ +\n\ +#define LSTMUNIT_CS_U8_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void 
lstmunit_activation_CS_U8to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 vecA, vecB; \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz; \\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CS_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CS_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CS_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CS_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_CS_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_CS_U8to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0; \\\n\ + vxc_half8 src4; \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 vecA, vecB; \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect0, 16); \\\n\ + \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz; 
\\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src4, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CS_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_CS_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_CS_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_CS_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CS_U8_vx*/ + +static const char lstmunit_activation_LP_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +\n\ +#define LSTMUNIT_LP_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_LP_F16to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wi, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + 
__write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3; \\\n\ + vxc_half8 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 w0, w1, w2, w3, b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, vect0, 16); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_i_t, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + data_i_t = data_i_t * w0 + b0; \\\n\ + data_f_t = data_f_t * w1 + b1; \\\n\ + data_g_t = data_g_t * w2 + b2; \\\n\ + data_o_t = data_o_t * w3 + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_LP_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_LP_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_LP_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_LP_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_LP_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_LP_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_LP_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_LP_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_LP_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_LP_F16to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wi, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \\\n\ + vxc_half8 src0, src1, src2, src3, src4; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 w0, w1, w2, w3, b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, vect0, 16); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect4, 16); \\\n\ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + 
\\\n\ + VXC_DP4x4(data_i_t, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + data_i_t = data_i_t * w0 + b0; \\\n\ + data_f_t = data_f_t * w1 + b1; \\\n\ + data_g_t = data_g_t * w2 + b2; \\\n\ + data_o_t = data_o_t * w3 + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 cell_data; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, cell_data, data_c_t); \\\n\ + VXC_DP4x4(src0, cell_data, cell_data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src0, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_LP_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_LP_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_LP_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_LP_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_LP_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_LP_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_LP_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_LP_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_LP_F16_vx*/ + +static const char lstmunit_activation_L_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +\n\ +#define LSTMUNIT_L_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_L_F16to##out_type_name##_F32_##act_name( \\\n\ + 
__read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wi, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3; \\\n\ + vxc_half8 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 w0, w1, w2, w3, b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, vect0, 16); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_i_t, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + data_i_t = data_i_t * w0 + b0; \\\n\ + data_f_t = data_f_t * w1 + b1; \\\n\ + data_g_t = data_g_t * w2 + b2; \\\n\ + data_o_t = data_o_t * w3 + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_L_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_L_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_L_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_L_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_L_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_L_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_L_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_L_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_L_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_L_F16to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wi, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \\\n\ + vxc_half8 src0, src1, src2, src3, src4; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 w0, w1, w2, w3, b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, vect0, 16); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect4, 16); \\\n\ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = 
read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + \\\n\ + VXC_DP4x4(data_i_t, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_f_t, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + data_i_t = data_i_t * w0 + b0; \\\n\ + data_f_t = data_f_t * w1 + b1; \\\n\ + data_g_t = data_g_t * w2 + b2; \\\n\ + data_o_t = data_o_t * w3 + b3; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 cell_data; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, cell_data, data_c_t); \\\n\ + VXC_DP4x4(src0, cell_data, cell_data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src0, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_L_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_L_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_L_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_L_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_L_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_L_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_L_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_L_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +"; /* end of lstmunit_activation_L_F16_vx*/ + +static const char lstmunit_activation_SP_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ 
+_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4;\n\ +\n\ +#define LSTMUNIT_SP_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_SP_F16to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3; \\\n\ + vxc_half8 src0, src1, src2, src3; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, vect0, 16); \\\n\ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src10, vect10, 16); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, vect13, 16); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + \\\n\ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_SP_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_SP_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_SP_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_SP_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_SP_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_SP_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_SP_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_SP_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_SP_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_SP_F16to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \\\n\ + vxc_half8 src0, src1, src2, src3, src4; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, vect0, 16); \\\n\ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src10, vect10, 16); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, 
vect13, 16); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect4, 16); \\\n\ + \\\n\ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src0, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_SP_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_SP_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_SP_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_SP_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_SP_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_SP_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_SP_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_SP_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_SP_F16_vx*/ + +static const char lstmunit_activation_SP_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +_viv_uniform VXC_512Bits uniU8AddS32_4x4;\n\ +_viv_uniform int4 input0Array_ZP;\n\ +_viv_uniform int4 
input1Array_ZP;\n\ +_viv_uniform float4 input0Array_Scale;\n\ +_viv_uniform float4 input1Array_Scale;\n\ +\n\ +#define LSTMUNIT_SP_U8_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_SP_U8to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 vecA, vecB; \\\n\ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + \\\n\ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx; \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz; \\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ 
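+ /* LSTM cell-state update: c = sigmoid(f + forget_bias) * c_prev + sigmoid(i) * tanh(g), then clipped to [clip_Min_F, clip_Max_F] */ \\\n\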
+ data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_SP_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_SP_U8_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_SP_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_SP_U8_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_SP_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_SP_U8_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_SP_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_SP_U8_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_SP_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_SP_U8to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0; \\\n\ + vxc_half8 src4; \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 vecA, vecB; \\\n\ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect0, 16); \\\n\ + \\\n\ + VXC_DP4x4(data_c_t, src4, src4, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx; \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz; \\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src4, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_SP_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_SP_U8_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_SP_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_SP_U8_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_SP_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_SP_U8_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_SP_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_SP_U8_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_SP_U8_vx*/ + +static const char lstmunit_activation_S_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 
1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16AddFp16toFp32_4x4;\n\ +\n\ +#define LSTMUNIT_S_FP16_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_S_F16to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3; \\\n\ + vxc_half8 src0, src1, src2, src3; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, vect0, 16); \\\n\ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src10, vect10, 16); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, vect13, 16); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + \\\n\ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + 
data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_S_FP16_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_S_FP16_FP32(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_S_FP16_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_S_FP16_FP32(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_S_FP16_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_S_FP16_FP32(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_S_FP16_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_S_FP16_FP32(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_S_FP16_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_S_F16to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4; \\\n\ + vxc_half8 src0, src1, src2, src3, src4; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_half8 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, vect0, 16); \\\n\ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src10, vect10, 16); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, vect1, 16); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src11, vect11, 16); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, vect2, 16); \\\n\ + VXC_ReadImage(vect12, 
hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src12, vect12, 16); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, vect3, 16); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src13, vect13, 16); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect4, 16); \\\n\ + \\\n\ + VXC_DP4x4(data_i_t, src0, src10, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_f_t, src1, src11, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_g_t, src2, src12, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(data_o_t, src3, src13, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16AddFp16toFp32_4x4); \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src0, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src0, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_S_FP16_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_S_FP16_FP16(I8, SIGMOID, int4, vxc_char4, vxc_char4, sigmoid)\n\ +LSTMUNIT_S_FP16_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_S_FP16_FP16(I16, SIGMOID, int4, vxc_short4, vxc_short4, sigmoid)\n\ +LSTMUNIT_S_FP16_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +LSTMUNIT_S_FP16_FP16(I8, HARD_SIGMOID, int4, vxc_char4, vxc_char4, hard_sigmoid)\n\ +LSTMUNIT_S_FP16_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_S_FP16_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_S_F16_vx*/ + +static const char lstmunit_activation_S_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ 
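+/* tanh via exp2: assuming the host sets twoLogE = 2*log2(e), this computes tanh(x) = 2/(1 + exp2(-x*twoLogE)) - 1 */\n\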
+{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +_viv_uniform VXC_512Bits uniU8AddS32_4x4;\n\ +_viv_uniform int4 input0Array_ZP;\n\ +_viv_uniform int4 input1Array_ZP;\n\ +_viv_uniform float4 input0Array_Scale;\n\ +_viv_uniform float4 input1Array_Scale;\n\ +\n\ +#define LSTMUNIT_S_U8_FP32(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_S_U8to##out_type_name##_F32_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 vecA, vecB; \\\n\ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.zy); \\\n\ + \\\n\ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx; \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * 
input1Array_Scale.zzzz; \\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_S_U8_FP32(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_S_U8_FP32(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_S_U8_FP32(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_S_U8_FP32(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +\n\ +#define LSTMUNIT_S_U8_FP16(out_type_name, act_name, convert_type, dst_type, copy_type, act_func) \\\n\ +__kernel void lstmunit_activation_S_U8to##out_type_name##_F16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0; \\\n\ + vxc_half8 src4; \\\n\ + vxc_uchar4 src0, src1, src2, src3; \\\n\ + vxc_uchar4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 vecA, vecB; \\\n\ + VXC_ReadImage(src0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src3, input_o_conv, 
coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect0, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, vect0, 16); \\\n\ + \\\n\ + VXC_DP4x4(data_c_t, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4); \\\n\ + VXC_DP4x4(vecA, src0, input0Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src10, input1Array_ZP.xxxx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_i_t = vecA * input0Array_Scale.xxxx + vecB * input1Array_Scale.xxxx; \\\n\ + VXC_DP4x4(vecA, src1, input0Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src11, input1Array_ZP.yyyy, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_f_t = vecA * input0Array_Scale.yyyy + vecB * input1Array_Scale.yyyy; \\\n\ + VXC_DP4x4(vecA, src2, input0Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src12, input1Array_ZP.zzzz, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_g_t = vecA * input0Array_Scale.zzzz + vecB * input1Array_Scale.zzzz; \\\n\ + VXC_DP4x4(vecA, src3, input0Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + VXC_DP4x4(vecB, src13, input1Array_ZP.wwww, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniU8AddS32_4x4);\\\n\ + data_o_t = vecA * input0Array_Scale.wwww + vecB * input1Array_Scale.wwww; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + half4 dst_cell; \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + _viv_asm(CONV, dst_cell, data_c_t); \\\n\ + VXC_DP4x4(src4, dst_cell, dst_cell, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, vect0, src4, 8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, vect0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, data_o_t); \\\n\ + dst_type dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_S_U8_FP16(U8, SIGMOID, int4, vxc_uchar4, vxc_uchar4, sigmoid)\n\ +LSTMUNIT_S_U8_FP16(F16, SIGMOID, half4, vxc_half4, vxc_short4, sigmoid)\n\ +LSTMUNIT_S_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid)\n\ +LSTMUNIT_S_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_S_U8_vx*/ + +static const char matrixmul_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Lo_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Hi_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Lo_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Hi_4x4;\n\ +\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;\n\ +\n\ +#if (VX_VERSION==2)\n\ +__kernel void gemm_F16F16toF16(image2d_array_t inputA,\n\ + image2d_array_t inputB, image2d_array_t output,\n\ + int transposeA, int transposeB,\n\ + int adjointA, int adjointB, uint M, uint K, uint N)\n\ +{\n\ + uint gidy = get_global_id(1);\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0);\n\ +\n\ + half4 valC;\n\ + vxc_short8 srcA0, srcA1, srcA2, srcA3, outC;\n\ + vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3;\n\ + vxc_short16 srcB;\n\ + vxc_half16 tmpB;\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)\n\ + {\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 4;\n\ + coord_b.y += 4;\n\ + _viv_asm(COPY, tmpA0, srcA0, 16);\n\ + _viv_asm(COPY, tmpA1, srcA1, 16);\n\ + _viv_asm(COPY, tmpA2, srcA2, 16);\n\ + _viv_asm(COPY, tmpA3, srcA3, 16);\n\ + _viv_asm(COPY, tmpB.hi, srcB.hi, 16);\n\ + _viv_asm(COPY, tmpB.lo, srcB.lo, 16);\n\ + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + sum0 += (tempA0);\n\ + sum1 += (tempA1);\n\ + sum2 += (tempA2);\n\ + sum3 += (tempA3);\n\ + }\n\ + coord_b.y = gidy;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr);\n\ + _viv_asm(CONV, valC, sum0);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum1);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum2);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + 
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum3);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +#else\n\ +__kernel void gemm_F16F16toF16(image2d_array_t inputA,\n\ + image2d_array_t inputB, image2d_array_t output,\n\ + int transposeA, int transposeB,\n\ + int adjointA, int adjointB, uint M, uint K, uint N)\n\ +{\n\ + uint gidy = get_global_id(1);\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0);\n\ +\n\ + half4 valC;\n\ + vxc_short8 srcA0, srcB0, srcA1, srcB1, outC;\n\ + vxc_half8 tmpA0, tmpB0, tmpA1, tmpB1;\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)\n\ + {\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 4;\n\ + coord_b.y += 4;\n\ + _viv_asm(COPY, tmpA0, srcA0, 16);\n\ + _viv_asm(COPY, tmpB0, srcB0, 16);\n\ + _viv_asm(COPY, tmpA1, srcA1, 16);\n\ + _viv_asm(COPY, tmpB1, srcB1, 16);\n\ +\n\ + VXC_DP4x4(tempA0, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row0Lo_4x4);\n\ + VXC_DP4x4(tempB0, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row0Hi_4x4);\n\ + VXC_DP4x4(tempA1, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Lo_4x4);\n\ + VXC_DP4x4(tempB1, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Hi_4x4);\n\ + VXC_DP4x4(tempA2, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row0Lo_4x4);\n\ + VXC_DP4x4(tempB2, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row0Hi_4x4);\n\ + VXC_DP4x4(tempA3, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Lo_4x4);\n\ + VXC_DP4x4(tempB3, 
tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Hi_4x4);\n\ + sum0 += (tempA0 + tempB0);\n\ + sum1 += (tempA1 + tempB1);\n\ + sum2 += (tempA2 + tempB2);\n\ + sum3 += (tempA3 + tempB3);\n\ + }\n\ + coord_b.y = gidy;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr);\n\ + _viv_asm(CONV, valC, sum0);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum1);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum2);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum3);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +#endif\n\ +\n\ +__kernel void gemm_F32F32toF32(image2d_array_t inputA,\n\ + image2d_array_t inputB,\n\ + image2d_array_t output,\n\ + int transposeA,\n\ + int transposeB,\n\ + int adjointA,\n\ + int adjointB,\n\ + uint M, uint K, uint N)\n\ +{\n\ + uint gidx = get_global_id(0);\n\ + uint gidy = get_global_id(1);\n\ +\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(gidx, 0, (bc2zero ? 0 : get_global_id(2)), 0);\n\ +\n\ + vxc_float4 sum0 = (vxc_float4)(0);\n\ + vxc_float4 sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0);\n\ + vxc_float4 sum3 = (vxc_float4)(0);\n\ +\n\ + vxc_int4 tmpOut0, tmpOut1;\n\ + vxc_uchar16 outC;\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + for(int i = 0; i < K; i+=4)\n\ + {\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3;\n\ +\n\ + coord_a.x = i;\n\ + coord_a.y = gidy;\n\ +\n\ + coord_b.x = gidx;\n\ + coord_b.y = i;\n\ +\n\ + tempA0 = read_imagef(inputA, coord_a);\n\ + coord_a.y++;\n\ + tempA1 = read_imagef(inputA, coord_a);\n\ + coord_a.y++;\n\ + tempA2 = read_imagef(inputA, coord_a);\n\ + coord_a.y++;\n\ + tempA3 = read_imagef(inputA, coord_a);\n\ +\n\ + tempB0 = read_imagef(inputB, coord_b);\n\ + coord_b.y++;\n\ + tempB1 = read_imagef(inputB, coord_b);\n\ + coord_b.y++;\n\ + tempB2 = read_imagef(inputB, coord_b);\n\ + coord_b.y++;\n\ + tempB3 = read_imagef(inputB, coord_b);\n\ +\n\ + sum0 += (tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3);\n\ + sum1 += (tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3);\n\ + sum2 += (tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3);\n\ + sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3);\n\ + }\n\ + coord_b = (int4)(gidx, gidy, get_global_id(2), 0);\n\ + write_imagef(output, coord_b, sum0);\n\ + coord_b.y++;\n\ + write_imagef(output, coord_b, sum1);\n\ + coord_b.y++;\n\ + write_imagef(output, coord_b, sum2);\n\ + coord_b.y++;\n\ + write_imagef(output, coord_b, sum3);\n\ +}"; /* end of matrixmul_f16_vx*/ + +static const char matrixmul_f16f16_u8_vx[] = "#include 
\"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Lo_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Hi_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Lo_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Hi_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;\n\ +\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +#if (VX_VERSION==2)\n\ +#define GEMM_F16_TO_QINT(dst_type_name, write_type) \\\n\ +__kernel void gemm_F16F16to##dst_type_name( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \\\n\ + \\\n\ + vxc_short8 srcA0, srcA1, srcA2, srcA3; \\\n\ + vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3; \\\n\ + vxc_short16 srcB; \\\n\ + vxc_half16 tmpB; \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + _viv_asm(COPY, tmpA0, srcA0, 16); \\\n\ + _viv_asm(COPY, tmpA1, srcA1, 16); \\\n\ + _viv_asm(COPY, tmpA2, srcA2, 16); \\\n\ + _viv_asm(COPY, tmpA3, srcA3, 16); \\\n\ + _viv_asm(COPY, tmpB.hi, srcB.hi, 16); \\\n\ + _viv_asm(COPY, tmpB.lo, srcB.lo, 16); \\\n\ + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8F16toF32Lo_4x4b); \\\n\ + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + 
uniGemmU8F16toF32Lo_4x4b); \\\n\ + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8F16toF32Lo_4x4b); \\\n\ + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8F16toF32Lo_4x4b); \\\n\ + sum0 += (tempA0); \\\n\ + sum1 += (tempA1); \\\n\ + sum2 += (tempA2); \\\n\ + sum3 += (tempA3); \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + write_type outC; \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +#else\n\ +#define GEMM_F16_TO_QINT(dst_type_name, write_type) \\\n\ +__kernel void gemm_F16F16to##dst_type_name( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); \\\n\ + \\\n\ + vxc_short8 srcA0, srcB0, srcA1, srcB1; \\\n\ + vxc_half8 tmpA0, tmpB0, tmpA1, tmpB1; \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + _viv_asm(COPY, tmpA0, srcA0, 16); \\\n\ + _viv_asm(COPY, tmpB0, srcB0, 16); \\\n\ + _viv_asm(COPY, tmpA1, srcA1, 16); \\\n\ + _viv_asm(COPY, tmpB1, srcB1, 16); \\\n\ + \\\n\ + VXC_DP4x4(tempA0, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row0Lo_4x4); \\\n\ + VXC_DP4x4(tempB0, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row0Hi_4x4); \\\n\ + VXC_DP4x4(tempA1, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Lo_4x4); \\\n\ + VXC_DP4x4(tempB1, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Hi_4x4); \\\n\ + VXC_DP4x4(tempA2, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row0Lo_4x4); \\\n\ + VXC_DP4x4(tempB2, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row0Hi_4x4); \\\n\ + VXC_DP4x4(tempA3, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Lo_4x4); \\\n\ + VXC_DP4x4(tempB3, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmFp16toFp32Row1Hi_4x4); \\\n\ + sum0 += (tempA0 + tempB0); \\\n\ + sum1 += (tempA1 + tempB1); \\\n\ + sum2 += (tempA2 + tempB2); \\\n\ + sum3 += (tempA3 + tempB3); \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + write_type outC; \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + 
tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +#endif\n\ +GEMM_F16_TO_QINT(U8, vxc_uchar16)\n\ +GEMM_F16_TO_QINT(I8, vxc_char16)\n\ +GEMM_F16_TO_QINT(I16, vxc_short8)\n\ +"; /* end of matrixmul_f16f16_u8_vx*/ + +static const char matrixmul_f16i16_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniGemmF16I16toF32A_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmF16I16toF32B_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmF16I16toF32C_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmF16I16toF32D_4x4;\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmF16I16toF32Lo_4x4b;\n\ +_viv_uniform VXC_512Bits uniGemmF16I16toF32Hi_4x4b;\n\ +_viv_uniform VXC_512Bits uniGemmFp16I16MulZptoFp32_4x4;\n\ +_viv_uniform float in1outScale;\n\ +\n\ +#if (VX_VERSION==2)\n\ +#define GEMM_F16_QINT16_TO_QINT16(src1_type_name, read_type) \\\n\ +__kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \\\n\ + image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); \\\n\ + \\\n\ + vxc_short8 srcA0, srcA1, outC; \\\n\ + vxc_half8 tmpA0, tmpA1; \\\n\ + vxc_short16 srcB; \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + _viv_asm(COPY, tmpA0, srcA0, 16); \\\n\ + _viv_asm(COPY, tmpA1, srcA1, 16); \\\n\ + VXC_DP4x4_b(tempA0, srcB.hi, srcB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmF16I16toF32Lo_4x4b); \\\n\ + VXC_DP4x4_b(tempA1, srcB.hi, srcB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmF16I16toF32Hi_4x4b); \\\n\ + VXC_DP4x4_b(tempA2, srcB.hi, srcB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmF16I16toF32Lo_4x4b); \\\n\ + VXC_DP4x4_b(tempA3, srcB.hi, srcB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmF16I16toF32Hi_4x4b); \\\n\ + VXC_DP4x4(tmpZpScale, tmpA0, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmFp16I16MulZptoFp32_4x4); \\\n\ + sum0 += tempA0 + tmpZpScale.xxxx; \\\n\ + sum1 += tempA1 + tmpZpScale.yyyy; \\\n\ + sum2 += tempA2 + tmpZpScale.zzzz; \\\n\ + sum3 += tempA3 + tmpZpScale.wwww; \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * in1outScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * in1outScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); 
\\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * in1outScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * in1outScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +#else\n\ +#define GEMM_F16_QINT16_TO_QINT16(src1_type_name, read_type) \\\n\ +__kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \\\n\ + image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \\\n\ + \\\n\ + vxc_short8 srcA0, srcA1, outC; \\\n\ + vxc_half8 tmpA0, tmpA1; \\\n\ + vxc_short8 srcB0, srcB1; \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + _viv_asm(COPY, tmpA0, srcA0, 16); \\\n\ + _viv_asm(COPY, tmpA1, srcA1, 16); \\\n\ + VXC_DP4x4(tempA0, tmpA0, srcB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32A_4x4); \\\n\ + VXC_DP4x4(tempB0, tmpA0, srcB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32B_4x4); \\\n\ + 
VXC_DP4x4(tempA1, tmpA0, srcB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32C_4x4); \\\n\ + VXC_DP4x4(tempB1, tmpA0, srcB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32D_4x4); \\\n\ + VXC_DP4x4(tempA2, tmpA1, srcB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32A_4x4); \\\n\ + VXC_DP4x4(tempB2, tmpA1, srcB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32B_4x4); \\\n\ + VXC_DP4x4(tempA3, tmpA1, srcB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32C_4x4); \\\n\ + VXC_DP4x4(tempB3, tmpA1, srcB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16I16toF32D_4x4); \\\n\ + VXC_DP4x4(tmpZpScale, tmpA0, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmFp16I16MulZptoFp32_4x4); \\\n\ + sum0 += tempA0 + tempB0 + tmpZpScale.xxxx; \\\n\ + sum1 += tempA1 + tempB1 + tmpZpScale.yyyy; \\\n\ + sum2 += tempA2 + tempB2 + tmpZpScale.zzzz; \\\n\ + sum3 += tempA3 + tempB3 + tmpZpScale.wwww; \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * in1outScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * in1outScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * in1outScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * in1outScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +#endif\n\ +GEMM_F16_QINT16_TO_QINT16(I16, vxc_short8)\n\ +\n\ +"; /* end of matrixmul_f16i16_i16_vx*/ + +static const char matrixmul_f16u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4;\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmF16U8toF32_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmF16U8toF32Hi_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmFp16U8MulZptoFp32_4x4;\n\ +_viv_uniform float input1Scale;\n\ +\n\ +#define GEMM_F16_QINT_TO_F16(src1_type_name, read_type) \\\n\ +__kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \\\n\ + image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); \\\n\ + \\\n\ + half4 valC; \\\n\ + vxc_short8 srcA0, srcA1, outC; \\\n\ + vxc_half8 tmpA0, tmpA1; \\\n\ + read_type srcB; \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempZp; \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + _viv_asm(COPY, tmpA0, srcA0, 16); \\\n\ + _viv_asm(COPY, tmpA1, srcA1, 16); \\\n\ + VXC_DP4x4(tempA0, tmpA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32_4x4); \\\n\ + VXC_DP4x4(tempA1, tmpA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32Hi_4x4); \\\n\ + VXC_DP4x4(tempA2, tmpA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32_4x4); \\\n\ + VXC_DP4x4(tempA3, tmpA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32Hi_4x4); \\\n\ + VXC_DP4x4(tempZp, tmpA0, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmFp16U8MulZptoFp32_4x4); \\\n\ + sum0 += tempA0 + tempZp.x; \\\n\ + sum1 += tempA1 + tempZp.y; \\\n\ + sum2 += tempA2 + tempZp.z; \\\n\ + sum3 += tempA3 + tempZp.w; \\\n\ + } \\\n\ + sum0 *= input1Scale; \\\n\ + sum1 *= input1Scale; \\\n\ + sum2 *= input1Scale; \\\n\ + sum3 *= input1Scale; \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + _viv_asm(CONV, valC, sum0); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum1); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 
0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum2); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum3); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GEMM_F16_QINT_TO_F16(U8, vxc_uchar16)\n\ +GEMM_F16_QINT_TO_F16(I8, vxc_char16)\n\ +\n\ +#define GEMM_F16_QINT16_TO_F16(src1_type_name, read_type) \\\n\ +__kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \\\n\ + image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \\\n\ + \\\n\ + half4 valC; \\\n\ + vxc_short8 srcA, outC; \\\n\ + vxc_half8 tmpA; \\\n\ + read_type srcB; \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + short in1_zp; \\\n\ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmpA, srcA, 16); \\\n\ + VXC_DP4x4(tempA0, tmpA, tmpA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmpA, srcA, 16); \\\n\ + VXC_DP4x4(tempA1, tmpA, tmpA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmpA, srcA, 16); \\\n\ + VXC_DP4x4(tempA2, tmpA, tmpA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(tempB2, srcB, in1_zp, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + _viv_asm(COPY, tmpA, srcA, 16); \\\n\ + VXC_DP4x4(tempA3, tmpA, tmpA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \\\n\ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \\\n\ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \\\n\ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \\\n\ + } \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + _viv_asm(CONV, valC, sum0); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum1); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum2); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum3); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GEMM_F16_QINT16_TO_F16(I16, vxc_short8)\n\ +"; /* end of matrixmul_f16u8_f16_vx*/ + +static const char matrixmul_f16u8_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmF16U8toF32_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmF16U8toF32Hi_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmFp16U8MulZptoFp32_4x4;\n\ +_viv_uniform float in1outScale;\n\ +\n\ +#define GEMM_F16_QINT_TO_QINT(src1_type_name, read_type) \\\n\ +__kernel void gemm_F16##src1_type_name##to##src1_type_name(image2d_array_t inputA, \\\n\ + image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); \\\n\ + \\\n\ + vxc_short8 srcA0, srcA1; \\\n\ + vxc_half8 tmpA0, tmpA1; \\\n\ + read_type srcB, outC; \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempZp; \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + _viv_asm(COPY, tmpA0, srcA0, 16); \\\n\ + _viv_asm(COPY, tmpA1, srcA1, 16); \\\n\ + VXC_DP4x4(tempA0, tmpA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32_4x4); \\\n\ + VXC_DP4x4(tempA1, tmpA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32Hi_4x4); \\\n\ + VXC_DP4x4(tempA2, tmpA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32_4x4); \\\n\ + VXC_DP4x4(tempA3, tmpA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemmF16U8toF32Hi_4x4); \\\n\ + VXC_DP4x4(tempZp, tmpA0, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmFp16U8MulZptoFp32_4x4); \\\n\ + sum0 += tempA0 + tempZp.x; \\\n\ + sum1 += tempA1 + tempZp.y; \\\n\ + sum2 += tempA2 + tempZp.z; \\\n\ + sum3 += tempA3 + tempZp.w; \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * in1outScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * in1outScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 
0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * in1outScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * in1outScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GEMM_F16_QINT_TO_QINT(U8, vxc_uchar16)\n\ +GEMM_F16_QINT_TO_QINT(I8, vxc_char16)\n\ +"; /* end of matrixmul_f16u8_u8_vx*/ + +static const char matrixmul_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int input0_ZP;\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +#define GEMM_QINT_TO_QINT(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + read_type srcA, srcB, outC; \\\n\ + \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + short in0_zp, in1_zp; \\\n\ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \\\n\ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB1, srcB, 
in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \\\n\ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \\\n\ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \\\n\ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GEMM_QINT_TO_QINT(I16, vxc_short8)\n\ +"; /* end of matrixmul_i16_vx*/ + +static const char matrixmul_transA_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int input0_ZP;\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4;\n\ +\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +#define 
GEMM_TRANSA_QINT(src0_type_name, src1_type_name, dst_type_name, read0_type, read1_type, write_type) \\\n\ +__kernel void gemm_transa_##src0_type_name##src1_type_name##to##dst_type_name( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, \\\n\ + uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + read0_type srcA; \\\n\ + read1_type srcB; \\\n\ + write_type outC; \\\n\ + \\\n\ + int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \\\n\ + \\\n\ + vxc_float4 sum0 = (vxc_float4)(0); \\\n\ + vxc_float4 sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0); \\\n\ + vxc_float4 sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + short in0_zp, in1_zp; \\\n\ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \\\n\ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \\\n\ + \\\n\ + vxc_float4 tempA0; \\\n\ + vxc_float4 tempB0; \\\n\ + \\\n\ + for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.y++; \\\n\ + coord_b.y++; \\\n\ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + sum0 = (sum0 + tempA0.x * tempB0); \\\n\ + sum1 = (sum1 + tempA0.y * tempB0); \\\n\ + sum2 = (sum2 + tempA0.z * tempB0); \\\n\ + sum3 = (sum3 + tempA0.w * tempB0); \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, 
coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GEMM_TRANSA_QINT(U8, U8, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16)\n\ +GEMM_TRANSA_QINT(I8, I8, I8, vxc_char16, vxc_char16, vxc_char16)\n\ +GEMM_TRANSA_QINT(I16, I16, I16, vxc_short8, vxc_short8, vxc_short8)\n\ +\n\ +#define GEMM_TRANSA_INPUTB_F16(src0_type_name, read0_type) \\\n\ +__kernel void gemm_transa_##src0_type_name##F16to##src0_type_name( \\\n\ + image2d_array_t inputA, \\\n\ + image2d_array_t inputB, \\\n\ + image2d_array_t output, \\\n\ + int transposeA, \\\n\ + int transposeB, \\\n\ + int adjointA, \\\n\ + int adjointB, \\\n\ + uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + read0_type srcA, outC; \\\n\ + vxc_short8 srcB; \\\n\ + vxc_half8 tmpB; \\\n\ + \\\n\ + int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \\\n\ + \\\n\ + vxc_float4 sum0 = (vxc_float4)(0); \\\n\ + vxc_float4 sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0); \\\n\ + vxc_float4 sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + short in0_zp; \\\n\ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \\\n\ + \\\n\ + vxc_float4 tempA0; \\\n\ + vxc_float4 tempB0; \\\n\ + \\\n\ + for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.y++; \\\n\ + coord_b.y++; \\\n\ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + _viv_asm(COPY, tmpB, srcB, 16); \\\n\ + VXC_DP4x4(tempB0,tmpB,tmpB,VXC_MODIFIER(0,3,0,VXC_RM_TowardZero,0),uniConvert1stFp16ToFp32_4x4); \\\n\ + sum0 = (sum0 + tempA0.x * tempB0); \\\n\ + sum1 = (sum1 + tempA0.y * tempB0); \\\n\ + sum2 = (sum2 + tempA0.z * tempB0); \\\n\ + sum3 = (sum3 + tempA0.w * tempB0); \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GEMM_TRANSA_INPUTB_F16(U8, vxc_uchar16)\n\ +GEMM_TRANSA_INPUTB_F16(I8, vxc_char16)\n\ +GEMM_TRANSA_INPUTB_F16(I16, vxc_short8)\n\ +\n\ +__kernel void gemm_transa_F16F16toF16(\n\ + image2d_array_t inputA,\n\ + image2d_array_t inputB,\n\ + image2d_array_t output,\n\ + int transposeA,\n\ + int transposeB,\n\ + int adjointA,\n\ + int adjointB,\n\ + uint M, uint K, uint N)\n\ +{\n\ + uint gidy = get_global_id(1);\n\ +\n\ + half4 valC;\n\ + vxc_short8 srcA, srcB, outC;\n\ + vxc_half8 tmpA, tmpB;\n\ +\n\ + int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0);\n\ +\n\ + vxc_float4 sum0 = (vxc_float4)(0);\n\ + vxc_float4 sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0);\n\ + vxc_float4 sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + vxc_float4 tempA0;\n\ + vxc_float4 tempB0;\n\ +\n\ + for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;)\n\ + {\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.y++;\n\ + coord_b.y++;\n\ + _viv_asm(COPY, tmpA, srcA, 16);\n\ + VXC_DP4x4(tempA0,tmpA,tmpA,VXC_MODIFIER(0,3,0,VXC_RM_TowardZero,0),uniConvert1stFp16ToFp32_4x4);\n\ + _viv_asm(COPY, tmpB, srcB, 16);\n\ + VXC_DP4x4(tempB0,tmpB,tmpB,VXC_MODIFIER(0,3,0,VXC_RM_TowardZero,0),uniConvert1stFp16ToFp32_4x4);\n\ +\n\ + sum0 = (sum0 + tempA0.x * tempB0);\n\ + sum1 = (sum1 + tempA0.y * tempB0);\n\ + sum2 = (sum2 + tempA0.z * tempB0);\n\ + sum3 = (sum3 + tempA0.w * tempB0);\n\ + }\n\ + coord_b.y = gidy;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr);\n\ + _viv_asm(CONV, valC, sum0);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum1);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum2);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum3);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}"; /* end of matrixmul_transA_vx*/ + +static 
const char matrixmul_transB_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/********************gemm transposeB fp16 fp16 to fp16*************************/\n\ +_viv_uniform VXC_512Bits uniFp16MulFp16AddtoFp32_dp8x2;\n\ +\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +__kernel void gemm_transb_F16F16toF16(image2d_array_t inputA,\n\ + image2d_array_t inputB,\n\ + image2d_array_t output,\n\ + int transposeA,\n\ + int transposeB,\n\ + int adjointA,\n\ + int adjointB,\n\ + uint M, uint K, uint N)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0);\n\ +\n\ + vxc_float4 sum0 = (vxc_float4)(0);\n\ + vxc_float4 sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0);\n\ + vxc_float4 sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;)\n\ + {\n\ + vxc_short8 srcA0,srcA1,srcA2,srcA3;\n\ + vxc_short8 srcB0,srcB1,srcB2,srcB3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 8;\n\ + coord_b.x += 8;\n\ +\n\ + vxc_half8 halfB0,halfB1,halfB2,halfB3;\n\ + _viv_asm(COPY, halfB0, srcB0, 16);\n\ + _viv_asm(COPY, halfB1, srcB1, 16);\n\ + _viv_asm(COPY, halfB2, srcB2, 16);\n\ + _viv_asm(COPY, halfB3, srcB3, 16);\n\ + vxc_half8 halfA0,halfA1,halfA2,halfA3;\n\ + _viv_asm(COPY, halfA0, srcA0, 16);\n\ + _viv_asm(COPY, halfA1, srcA1, 16);\n\ + _viv_asm(COPY, halfA2, srcA2, 16);\n\ + _viv_asm(COPY, halfA3, srcA3, 16);\n\ + vxc_float4 fpVal;\n\ + VXC_DP8x2(fpVal, halfA0, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA0, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA0, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA0, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum0 += fpVal;\n\ + VXC_DP8x2(fpVal, 
halfA1, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA1, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA1, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA1, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum1 += fpVal;\n\ + VXC_DP8x2(fpVal, halfA2, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA2, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA2, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA2, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum2 += fpVal;\n\ + VXC_DP8x2(fpVal, halfA3, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA3, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA3, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA3, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum3 += fpVal;\n\ + }\n\ + half4 halfDst;\n\ + vxc_short8 valDst;\n\ + _viv_asm(CONV, halfDst, sum0);\n\ + _viv_asm(COPY, valDst, halfDst, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + _viv_asm(CONV, halfDst, sum1);\n\ + _viv_asm(COPY, valDst, halfDst, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + _viv_asm(CONV, halfDst, sum2);\n\ + _viv_asm(COPY, valDst, halfDst, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + _viv_asm(CONV, halfDst, sum3);\n\ + _viv_asm(COPY, valDst, halfDst, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of matrixmul_transB_f16_vx*/ + +static const char matrixmul_transB_f16_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/********************gemm transposeB fp16 uint8 to fp16*************************/\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZptoFp16_dp2x8;\n\ +_viv_uniform VXC_512Bits uniFp16MulFp16AddtoFp32_dp8x2;\n\ +\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +__kernel void gemm_transb_F16U8toF16(image2d_array_t inputA,\n\ + image2d_array_t inputB,\n\ + image2d_array_t output,\n\ + int transposeA,\n\ + int transposeB,\n\ + int adjointA,\n\ + int adjointB,\n\ + uint M, uint K, uint N)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0);\n\ +\n\ + vxc_float4 sum0 = (vxc_float4)(0);\n\ + vxc_float4 sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0);\n\ + vxc_float4 sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + short in1_zp;\n\ + _viv_asm(COPY, in1_zp, input1_ZP, 4);\n\ + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;)\n\ + {\n\ + vxc_short8 srcA0,srcA1,srcA2,srcA3;\n\ + vxc_uchar8 srcB0,srcB1,srcB2,srcB3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 8;\n\ + coord_b.x += 8;\n\ +\n\ + vxc_half8 halfB0,halfB1,halfB2,halfB3;\n\ + VXC_DP2x8(halfB0, srcB0, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfB1, srcB1, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfB2, srcB2, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfB3, srcB3, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8SubZptoFp16_dp2x8);\n\ + vxc_half8 halfA0,halfA1,halfA2,halfA3;\n\ + _viv_asm(COPY, halfA0, srcA0, 16);\n\ + _viv_asm(COPY, halfA1, srcA1, 16);\n\ + _viv_asm(COPY, halfA2, srcA2, 16);\n\ + _viv_asm(COPY, halfA3, srcA3, 16);\n\ + vxc_float4 fpVal;\n\ + VXC_DP8x2(fpVal, halfA0, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA0, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA0, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA0, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum0 += fpVal;\n\ + VXC_DP8x2(fpVal, halfA1, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA1, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA1, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA1, halfB3, VXC_MODIFIER(3, 
3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum1 += fpVal;\n\ + VXC_DP8x2(fpVal, halfA2, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA2, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA2, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA2, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum2 += fpVal;\n\ + VXC_DP8x2(fpVal, halfA3, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA3, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA3, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA3, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum3 += fpVal;\n\ + }\n\ + half4 halfDst;\n\ + vxc_short8 valDst;\n\ + sum0 *= input1Scale;\n\ + _viv_asm(CONV, halfDst, sum0);\n\ + _viv_asm(COPY, valDst, halfDst, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + sum1 *= input1Scale;\n\ + _viv_asm(CONV, halfDst, sum1);\n\ + _viv_asm(COPY, valDst, halfDst, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + sum2 *= input1Scale;\n\ + _viv_asm(CONV, halfDst, sum2);\n\ + _viv_asm(COPY, valDst, halfDst, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + sum3 *= input1Scale;\n\ + _viv_asm(CONV, halfDst, sum3);\n\ + _viv_asm(COPY, valDst, halfDst, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +/***********************gemm transposeB fp16 uint8 to uint8***********************************/\n\ +_viv_uniform float scaleIn2divOut;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform float output_ZP;\n\ +\n\ +__kernel void gemm_transb_F16U8toU8(image2d_array_t inputA,\n\ + image2d_array_t inputB,\n\ + image2d_array_t output,\n\ + int transposeA,\n\ + int transposeB,\n\ + int adjointA,\n\ + int adjointB,\n\ + uint M, uint K, uint N)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0);\n\ +\n\ + vxc_float4 sum0 = (vxc_float4)(0);\n\ + vxc_float4 sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0);\n\ + vxc_float4 sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + short in1_zp;\n\ + _viv_asm(COPY, in1_zp, input1_ZP, 4);\n\ + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;)\n\ + {\n\ + vxc_short8 srcA0,srcA1,srcA2,srcA3;\n\ + vxc_uchar8 srcB0,srcB1,srcB2,srcB3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 8;\n\ + coord_b.x += 8;\n\ +\n\ + vxc_half8 halfB0,halfB1,halfB2,halfB3;\n\ + VXC_DP2x8(halfB0, srcB0, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfB1, srcB1, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfB2, srcB2, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfB3, srcB3, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8SubZptoFp16_dp2x8);\n\ + vxc_half8 halfA0,halfA1,halfA2,halfA3;\n\ + _viv_asm(COPY, halfA0, srcA0, 16);\n\ + _viv_asm(COPY, halfA1, srcA1, 16);\n\ + _viv_asm(COPY, halfA2, srcA2, 16);\n\ + _viv_asm(COPY, halfA3, srcA3, 16);\n\ + vxc_float4 fpVal;\n\ + VXC_DP8x2(fpVal, halfA0, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA0, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA0, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA0, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum0 += fpVal;\n\ + VXC_DP8x2(fpVal, halfA1, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA1, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA1, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA1, halfB3, VXC_MODIFIER(3, 
3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum1 += fpVal;\n\ + VXC_DP8x2(fpVal, halfA2, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA2, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA2, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA2, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum2 += fpVal;\n\ + VXC_DP8x2(fpVal, halfA3, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA3, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA3, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA3, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum3 += fpVal;\n\ + }\n\ + vxc_int4 tmpOut0, tmpOut1;\n\ + vxc_uchar8 valDst;\n\ + tmpOut0 = convert_int4_rte(sum0 * scaleIn2divOut + output_ZP);\n\ + tmpOut1 = convert_int4_rte(sum1 * scaleIn2divOut + output_ZP);\n\ + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + tmpOut0 = convert_int4_rte(sum2 * scaleIn2divOut + output_ZP);\n\ + tmpOut1 = convert_int4_rte(sum3 * scaleIn2divOut + output_ZP);\n\ + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of matrixmul_transB_f16_mix_vx*/ + +static const char matrixmul_transB_u8_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/********************gemm transposeB uint8 uint8 to fp16*************************/\n\ +_viv_uniform int input0_ZP;\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float inScaleMul;\n\ +_viv_uniform VXC_512Bits uniU8SubZptoFp16_dp2x8;\n\ +_viv_uniform VXC_512Bits uniFp16MulFp16AddtoFp32_dp8x2;\n\ +\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +__kernel void gemm_transb_U8U8toF16(image2d_array_t inputA,\n\ + image2d_array_t inputB,\n\ + image2d_array_t output,\n\ + int transposeA,\n\ + int transposeB,\n\ + int adjointA,\n\ + int adjointB,\n\ + uint M, uint K, uint N)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0);\n\ +\n\ + vxc_float4 sum0 = (vxc_float4)(0);\n\ + vxc_float4 sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0);\n\ + vxc_float4 sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + short in0_zp, in1_zp;\n\ + _viv_asm(COPY, in0_zp, input0_ZP, 4);\n\ + _viv_asm(COPY, in1_zp, input1_ZP, 4);\n\ + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;)\n\ + {\n\ + vxc_uchar8 srcA0,srcA1,srcA2,srcA3;\n\ + vxc_uchar8 srcB0,srcB1,srcB2,srcB3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 8;\n\ + coord_b.x += 8;\n\ +\n\ + vxc_half8 halfA0,halfA1,halfA2,halfA3;\n\ + VXC_DP2x8(halfA0, srcA0, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfA1, srcA1, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfA2, srcA2, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfA3, srcA3, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + vxc_half8 halfB0,halfB1,halfB2,halfB3;\n\ + VXC_DP2x8(halfB0, srcB0, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfB1, srcB1, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfB2, srcB2, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfB3, srcB3, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + vxc_float4 fpVal;\n\ + VXC_DP8x2(fpVal, halfA0, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA0, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA0, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA0, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum0 += fpVal;\n\ + VXC_DP8x2(fpVal, halfA1, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 
0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA1, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA1, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA1, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum1 += fpVal;\n\ + VXC_DP8x2(fpVal, halfA2, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA2, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA2, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA2, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum2 += fpVal;\n\ + VXC_DP8x2(fpVal, halfA3, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA3, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA3, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA3, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum3 += fpVal;\n\ + }\n\ + half4 halfDst;\n\ + vxc_short8 valDst;\n\ + sum0 *= inScaleMul;\n\ + _viv_asm(CONV, halfDst, sum0);\n\ + _viv_asm(COPY, valDst, halfDst, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + sum1 *= inScaleMul;\n\ + _viv_asm(CONV, halfDst, sum1);\n\ + _viv_asm(COPY, valDst, halfDst, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + sum2 *= inScaleMul;\n\ + _viv_asm(CONV, halfDst, sum2);\n\ + _viv_asm(COPY, valDst, halfDst, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + sum3 *= inScaleMul;\n\ + _viv_asm(CONV, halfDst, sum3);\n\ + _viv_asm(COPY, valDst, halfDst, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +/********************gemm transposeB uint8 uint8 to uint8*************************/\n\ +_viv_uniform float inScaledivOut;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform float output_ZP;\n\ +\n\ +__kernel void gemm_transb_U8U8toU8(image2d_array_t inputA,\n\ + image2d_array_t inputB,\n\ + image2d_array_t output,\n\ + int transposeA,\n\ + int transposeB,\n\ + int adjointA,\n\ + int adjointB,\n\ + uint M, uint K, uint N)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0);\n\ +\n\ + vxc_float4 sum0 = (vxc_float4)(0);\n\ + vxc_float4 sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0);\n\ + vxc_float4 sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + short in0_zp, in1_zp;\n\ + _viv_asm(COPY, in0_zp, input0_ZP, 4);\n\ + _viv_asm(COPY, in1_zp, input1_ZP, 4);\n\ + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;)\n\ + {\n\ + vxc_uchar8 srcA0,srcA1,srcA2,srcA3;\n\ + vxc_uchar8 srcB0,srcB1,srcB2,srcB3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 8;\n\ + coord_b.x += 8;\n\ +\n\ + vxc_half8 halfA0,halfA1,halfA2,halfA3;\n\ + VXC_DP2x8(halfA0, srcA0, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfA1, srcA1, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfA2, srcA2, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfA3, srcA3, in0_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + vxc_half8 halfB0,halfB1,halfB2,halfB3;\n\ + VXC_DP2x8(halfB0, srcB0, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfB1, srcB1, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfB2, srcB2, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + VXC_DP2x8(halfB3, srcB3, in1_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniU8SubZptoFp16_dp2x8);\n\ + vxc_float4 fpVal;\n\ + VXC_DP8x2(fpVal, halfA0, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA0, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA0, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA0, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum0 += fpVal;\n\ + VXC_DP8x2(fpVal, halfA1, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 
0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA1, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA1, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA1, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum1 += fpVal;\n\ + VXC_DP8x2(fpVal, halfA2, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA2, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA2, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA2, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum2 += fpVal;\n\ + VXC_DP8x2(fpVal, halfA3, halfB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA3, halfB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA3, halfB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + VXC_DP8x2(fpVal, halfA3, halfB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniFp16MulFp16AddtoFp32_dp8x2);\n\ + sum3 += fpVal;\n\ + }\n\ + vxc_int4 tmpOut0, tmpOut1;\n\ + vxc_uchar8 valDst;\n\ + tmpOut0 = convert_int4_rte(sum0 * inScaledivOut + output_ZP);\n\ + tmpOut1 = convert_int4_rte(sum1 * inScaledivOut + output_ZP);\n\ + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + tmpOut0 = convert_int4_rte(sum2 * inScaledivOut + output_ZP);\n\ + tmpOut1 = convert_int4_rte(sum3 * inScaledivOut + output_ZP);\n\ + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of matrixmul_transB_u8_mix_vx*/ + +static const char matrixmul_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float mulKIn0In1Zp;\n\ +_viv_uniform float inOutScale;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4;\n\ +\n\ +#define GEMM_QINT_TO_QINT(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + read_type srcA0, srcA1, srcA2, srcA3, srcB, outC; \\\n\ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); \\\n\ + vxc_float4 sum0 = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp), sum1 = sum0; \\\n\ + vxc_float4 sum2 = sum0, sum3 = sum0; \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + sum0 += tempA0 + tempB0; \\\n\ + sum1 += tempA1 + tempB1; \\\n\ + sum2 += tempA2 + tempB2; \\\n\ + sum3 += tempA3 + tempB3; \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = get_global_id(1); \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), 
uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GEMM_QINT_TO_QINT(U8, vxc_uchar16)\n\ +GEMM_QINT_TO_QINT(I8, vxc_char16)\n\ +"; /* end of matrixmul_u8_vx*/ + +static const char matrixmul_u8f16_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float input0Scale;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4;\n\ +\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;\n\ +_viv_uniform VXC_512Bits uniGemmU8F16toF32Hi_4x4b;\n\ +_viv_uniform VXC_512Bits uniGemmFp16MulZptoFp32_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemm1stU8F16toF32Lo_4x4;\n\ +_viv_uniform VXC_512Bits uniGemm2ndU8F16toF32Lo_4x4;\n\ +_viv_uniform VXC_512Bits uniGemm1stU8F16toF32Hi_4x4;\n\ +_viv_uniform VXC_512Bits uniGemm2ndU8F16toF32Hi_4x4;\n\ +\n\ +#if (VX_VERSION==2)\n\ +#define GEMM_QINT_F16_TO_F16(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##F16toF16( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + uint gidz = get_global_id(2); \\\n\ + vxc_short16 srcB; \\\n\ + vxc_half16 tmpB; \\\n\ + half4 valC; \\\n\ + read_type srcA0, srcA1; \\\n\ + vxc_short8 outC; \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : gidz), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : gidz), 0); \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + _viv_asm(COPY, tmpB.hi, srcB.hi, 16); \\\n\ + _viv_asm(COPY, tmpB.lo, srcB.lo, 16); \\\n\ + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, srcA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8F16toF32Lo_4x4b); \\\n\ + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, srcA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8F16toF32Hi_4x4b); \\\n\ + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, srcA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8F16toF32Lo_4x4b); \\\n\ + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, srcA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8F16toF32Hi_4x4b); \\\n\ + VXC_DP4x4(tmpZpScale, tmpB.hi, tmpB.lo, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmFp16MulZptoFp32_4x4); \\\n\ + sum0 += tempA0 + tmpZpScale; \\\n\ + sum1 += tempA1 + tmpZpScale; \\\n\ + sum2 += tempA2 + tmpZpScale; \\\n\ + sum3 += tempA3 + tmpZpScale; \\\n\ + } \\\n\ + sum0 *= input0Scale; \\\n\ + sum1 *= input0Scale; \\\n\ + sum2 *= input0Scale; \\\n\ + sum3 *= input0Scale; \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)gidz * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + \\\n\ + _viv_asm(CONV, valC, sum0); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum1); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + 
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum2); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum3); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GEMM_QINT_F16_TO_F16(U8, vxc_uchar16)\n\ +GEMM_QINT_F16_TO_F16(I8, vxc_char16)\n\ +GEMM_QINT_F16_TO_F16(I16, vxc_short8)\n\ +#else\n\ +#define GEMM_QINT_F16_TO_F16(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##F16toF16( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + uint gidz = get_global_id(2); \\\n\ + vxc_short16 srcB; \\\n\ + vxc_half16 tmpB; \\\n\ + half4 valC; \\\n\ + read_type srcA0, srcA1; \\\n\ + vxc_short8 outC; \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : gidz), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : gidz), 0); \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + _viv_asm(COPY, tmpB.hi, srcB.hi, 16); \\\n\ + _viv_asm(COPY, tmpB.lo, srcB.lo, 16); \\\n\ + VXC_DP4x4(tempA0, srcA0, tmpB.hi, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemm1stU8F16toF32Lo_4x4); \\\n\ + VXC_DP4x4(tempB0, srcA0, tmpB.lo, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemm2ndU8F16toF32Lo_4x4); \\\n\ + VXC_DP4x4(tempA1, srcA0, 
tmpB.hi, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemm1stU8F16toF32Hi_4x4); \\\n\ + VXC_DP4x4(tempB1, srcA0, tmpB.lo, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemm2ndU8F16toF32Hi_4x4); \\\n\ + VXC_DP4x4(tempA2, srcA1, tmpB.hi, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemm1stU8F16toF32Lo_4x4); \\\n\ + VXC_DP4x4(tempB2, srcA1, tmpB.lo, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemm2ndU8F16toF32Lo_4x4); \\\n\ + VXC_DP4x4(tempA3, srcA1, tmpB.hi, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemm1stU8F16toF32Hi_4x4); \\\n\ + VXC_DP4x4(tempB3, srcA1, tmpB.lo, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemm2ndU8F16toF32Hi_4x4); \\\n\ + VXC_DP4x4(tmpZpScale, tmpB.hi, tmpB.lo, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmFp16MulZptoFp32_4x4); \\\n\ + sum0 += tempA0 + tempB0 + tmpZpScale; \\\n\ + sum1 += tempA1 + tempB1 + tmpZpScale; \\\n\ + sum2 += tempA2 + tempB2 + tmpZpScale; \\\n\ + sum3 += tempA3 + tempB3 + tmpZpScale; \\\n\ + } \\\n\ + sum0 *= input0Scale; \\\n\ + sum1 *= input0Scale; \\\n\ + sum2 *= input0Scale; \\\n\ + sum3 *= input0Scale; \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)gidz * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + \\\n\ + _viv_asm(CONV, valC, sum0); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum1); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum2); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum3); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GEMM_QINT_F16_TO_F16(U8, vxc_uchar16)\n\ +GEMM_QINT_F16_TO_F16(I8, vxc_char16)\n\ +GEMM_QINT_F16_TO_F16(I16, vxc_short8)\n\ +#endif\n\ +"; /* end of matrixmul_u8f16_f16_vx*/ + +static const char matrixmul_u8f16_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int input0_ZP;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4;\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;\n\ +_viv_uniform VXC_512Bits uniGemmU8F16toF32Hi_4x4b;\n\ +_viv_uniform VXC_512Bits uniGemmFp16MulZptoFp32_4x4;\n\ +_viv_uniform float in0outScale;\n\ +\n\ +#if (VX_VERSION==2)\n\ +#define GEMM_QINT_F16_TO_QINT(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##F16to##src0_type_name( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + uint gidz = get_global_id(2); \\\n\ + vxc_short16 srcB; \\\n\ + vxc_half16 tmpB; \\\n\ + read_type srcA0, srcA1, outC; \\\n\ + 
int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : gidz), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : gidz), 0); \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3, tmpZpScale; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + _viv_asm(COPY, tmpB.hi, srcB.hi, 16); \\\n\ + _viv_asm(COPY, tmpB.lo, srcB.lo, 16); \\\n\ + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, srcA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8F16toF32Lo_4x4b); \\\n\ + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, srcA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8F16toF32Hi_4x4b); \\\n\ + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, srcA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8F16toF32Lo_4x4b); \\\n\ + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, srcA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8F16toF32Hi_4x4b); \\\n\ + VXC_DP4x4(tmpZpScale, tmpB.hi, tmpB.lo, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmFp16MulZptoFp32_4x4); \\\n\ + sum0 += tempA0 + tmpZpScale; \\\n\ + sum1 += tempA1 + tmpZpScale; \\\n\ + sum2 += tempA2 + tmpZpScale; \\\n\ + sum3 += tempA3 + tmpZpScale; \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * in0outScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * in0outScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + 
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * in0outScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * in0outScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +#else\n\ +#define GEMM_QINT_F16_TO_QINT(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##F16to##src0_type_name( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + vxc_short8 srcB; \\\n\ + vxc_half8 tmpB; \\\n\ + half4 valB; \\\n\ + read_type srcA, outC; \\\n\ + \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + short in0_zp; \\\n\ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + _viv_asm(COPY, tmpB, srcB, 16); \\\n\ + VXC_DP4x4(tempB0, tmpB, tmpB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \\\n\ + \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + _viv_asm(COPY, tmpB, srcB, 16); \\\n\ + VXC_DP4x4(tempB1, tmpB, tmpB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \\\n\ + \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 
0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + _viv_asm(COPY, tmpB, srcB, 16); \\\n\ + VXC_DP4x4(tempB2, tmpB, tmpB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \\\n\ + \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + _viv_asm(COPY, tmpB, srcB, 16); \\\n\ + VXC_DP4x4(tempB3, tmpB, tmpB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stFp16ToFp32_4x4); \\\n\ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \\\n\ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \\\n\ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \\\n\ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +#endif\n\ +GEMM_QINT_F16_TO_QINT(U8, vxc_uchar16)\n\ +GEMM_QINT_F16_TO_QINT(I8, vxc_char16)\n\ +GEMM_QINT_F16_TO_QINT(I16, vxc_short8)\n\ +\n\ +"; /* end of matrixmul_u8f16_u8_vx*/ + +static const char matrixmul_u8u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int input0_ZP;\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4;\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4;\n\ +_viv_uniform float input01Scale;\n\ +\n\ +#define GEMM_QINT_TO_F16(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##src0_type_name##toF16( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, 
\\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + read_type srcA0, srcA1, srcA2, srcA3, srcB; \\\n\ + half4 valC; \\\n\ + vxc_short8 outC; \\\n\ + \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \\\n\ + vxc_float4 sum0 = (vxc_float4)(0); \\\n\ + vxc_float4 sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0); \\\n\ + vxc_float4 sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + sum0 += tempA0 + tempB0; \\\n\ + sum1 += tempA1 + tempB1; \\\n\ + sum2 += tempA2 + tempB2; \\\n\ + sum3 += tempA3 + tempB3; \\\n\ + } \\\n\ + sum0 *= input01Scale; \\\n\ + sum1 *= input01Scale; \\\n\ + sum2 *= input01Scale; \\\n\ + sum3 
*= input01Scale; \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + _viv_asm(CONV, valC, sum0); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum1); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum2); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum3); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GEMM_QINT_TO_F16(U8, vxc_uchar16)\n\ +GEMM_QINT_TO_F16(I8, vxc_char16)\n\ +\n\ +#define GEMM_QINT16_TO_F16(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##src0_type_name##toF16( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + read_type srcA, srcB; \\\n\ + half4 valC; \\\n\ + vxc_short8 outC; \\\n\ + \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); \\\n\ + vxc_float4 sum0 = (vxc_float4)(0); \\\n\ + vxc_float4 sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0); \\\n\ + vxc_float4 sum3 = (vxc_float4)(0); \\\n\ + short in0_zp, in1_zp; \\\n\ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \\\n\ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xyww, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \\\n\ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \\\n\ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \\\n\ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \\\n\ + } \\\n\ + coord_b.y = gidy; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + _viv_asm(CONV, valC, sum0); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum1); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum2); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + _viv_asm(CONV, valC, sum3); \\\n\ + _viv_asm(COPY, outC, valC, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xyww, outC.s0246, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GEMM_QINT16_TO_F16(I16, vxc_short8)\n\ +"; /* end of matrixmul_u8u8_f16_vx*/ + +static const char maximum_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void maximum_F16F16toF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 vec0, vec1, dst;\n\ + vxc_half8 src0, src1;\n\ + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 
7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_F16F16toF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_short8 vec0, vec1, dst;\n\ + vxc_half8 src0, src1;\n\ + VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + coord.z ++;\n\ +\n\ + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8;\n\ +\n\ +__kernel void maximum_F16F16toI8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 vec0, vec1;\n\ + vxc_char8 dst;\n\ + vxc_half8 src0, src1;\n\ + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_F16F16toI8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 vec0, vec1;\n\ + vxc_char8 dst;\n\ + vxc_half8 src0, src1;\n\ + VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_1_part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_1_part1_2x8;\n\ +__kernel void maximum_I8I8toI8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ 
+ )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_char16 src0, src1, dst;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part0_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8);\n\ + dst = max(src0, src1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_I8I8toI8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_char16 src0, src1, dst;\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord.z ++;\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part0_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8);\n\ + dst = max(src0, src1);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Hi_2x8;\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +__kernel void maximum_U8U8toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Hi_2x8);\n\ + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + 
uniU8MulAndPostShift1_Lo_2x8);\n\ + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Hi_2x8);\n\ + dst = max(src0, src1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_U8U8toU8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Hi_2x8);\n\ + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Hi_2x8);\n\ + dst = max(src0, src1);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8;\n\ +__kernel void maximum_I16I16toI16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8);\n\ + dst = max(src0, src1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_I16I16toI16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord.z ++;\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8);\n\ + dst = max(src0, src1);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of maximum_vx*/ + +static const char maximum_fp16_vx[] = "#include 
\"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8;\n\ +_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt8toFp16_2x8;\n\ +\n\ +__kernel void maximum_I8F16toI8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_char16 src0, src2, dst;\n\ + vxc_short8 src1, src3, src4, src5;\n\ + vxc_half8 data0, data1, data2, data3;\n\ + vxc_char16 tmp0, tmp1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src4, input1, coord, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, data0, src1, 16);\n\ + _viv_asm(COPY, data1, src4, 16);\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\ + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\ + VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\ + dst = max(src0, tmp0);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_I8F16toI8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_char16 src0, src2, dst;\n\ + vxc_short8 src1, src3, src4, src5;\n\ + vxc_half8 data0, data1, data2, data3;\n\ + vxc_char16 tmp0;\n\ +\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src4, input1, coord.xy, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, data0, src1, 16);\n\ + _viv_asm(COPY, data1, src4, 16);\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\ + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\ + VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\ + dst = max(src0, tmp0);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_I8F16toF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_char8 vec0, vec2;\n\ + vxc_short8 vec1, vec3, dst;\n\ + vxc_half8 src0, src1, src2, src3;\n\ +\n\ + 
VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8);\n\ +\n\ + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_I8F16toF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_char8 vec0, vec2;\n\ + vxc_short8 vec1, vec3, dst;\n\ + vxc_half8 src0, src1, src2, src3;\n\ + VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8);\n\ +\n\ + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +\n\ +__kernel void maximum_U8F16toF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar8 vec0, vec2;\n\ + vxc_short8 vec1, vec3, dst;\n\ + vxc_half8 src0, src1, src2, src3;\n\ +\n\ + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ +\n\ + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_U8F16toF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_uchar8 vec0, vec2;\n\ + vxc_short8 vec1, vec3, dst;\n\ + vxc_half8 src0, src1, src2, src3;\n\ +\n\ + VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, 
ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ +\n\ + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8;\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +__kernel void maximum_U8F16toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 src0, dst0, dst1;\n\ + vxc_ushort8 src1, src2;\n\ + vxc_half8 data1, data2;\n\ + VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + _viv_asm(COPY, data2, src2, 16);\n\ +\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Hi_2x8);\n\ + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + dst0 = max(dst0, dst1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_U8F16toU8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_uchar16 src0, dst0, dst1;\n\ + vxc_ushort8 src1, src2;\n\ + vxc_half8 data1, data2;\n\ + VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + _viv_asm(COPY, data2, src2, 16);\n\ +\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Hi_2x8);\n\ + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + dst0 = max(dst0, dst1);\n\ +\n\ + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void 
maximum_F16F16toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_ushort8 src0, src1;\n\ + vxc_half8 data0, data1;\n\ + vxc_uchar16 dst0, dst1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + _viv_asm(COPY, data1, src1, 16);\n\ +\n\ + vxc_ushort8 mp1;\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, data0, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + dst0 = max(dst0, dst1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_F16F16toU8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_ushort8 src0, src1;\n\ + vxc_half8 data0, data1;\n\ + vxc_uchar16 dst0, dst1;\n\ + VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + _viv_asm(COPY, data1, src1, 16);\n\ +\n\ + vxc_ushort8 mp1;\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, data0, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + dst0 = max(dst0, dst1);\n\ +\n\ + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of maximum_fp16_vx*/ + +static const char maximum_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uinConvertFp16ToInt16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndFp16ToFp32_4x4;\n\ +\n\ +\n\ +__kernel void maximum_I16F16toI16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1, tmp0, dst;\n\ + vxc_half8 data0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, data0, src1, 16);\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8);\n\ + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8);\n\ + dst = max(src0, tmp0);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 
0));\n\ +}\n\ +\n\ +__kernel void maximum_I16F16toI16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1, tmp0, dst;\n\ + vxc_half8 data0;\n\ +\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, data0, src1, 16);\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8);\n\ + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8);\n\ + dst = max(src0, tmp0);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_I16F16toF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 vec0, vec1, dst;\n\ + vxc_half8 src0, src1;\n\ +\n\ + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8);\n\ +\n\ + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_I16F16toF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_short8 vec0, vec1, dst;\n\ + vxc_half8 src0, src1;\n\ + VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8);\n\ +\n\ + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_F16F16toI16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 vec0, vec1;\n\ + vxc_short8 dst;\n\ + vxc_half8 src0, src1;\n\ + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + int4 tmpDst0, tmpDst1;\n\ + float4 tmpData0, tmpData1;\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4);\n\ + tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp);\n\ + tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_F16F16toI16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 vec0, vec1;\n\ + vxc_short8 dst;\n\ + vxc_half8 src0, src1;\n\ + VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_VertMax3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + int4 tmpDst0, tmpDst1;\n\ + float4 tmpData0, tmpData1;\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4);\n\ + tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp);\n\ + tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}"; /* end of maximum_i16_vx*/ + +static const char minimum_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void minimum_F16F16toF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 vec0, vec1, dst;\n\ + vxc_half8 src0, src1;\n\ + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_F16F16toF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_short8 vec0, vec1, dst;\n\ + vxc_half8 src0, src1;\n\ + VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + coord.z ++;\n\ +\n\ + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8;\n\ +\n\ +__kernel void minimum_F16F16toI8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 vec0, vec1;\n\ + vxc_char8 dst;\n\ + vxc_half8 src0, src1;\n\ + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_F16F16toI8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 vec0, vec1;\n\ + vxc_char8 dst;\n\ + vxc_half8 src0, src1;\n\ + VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_1_part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_1_part1_2x8;\n\ +__kernel void minimum_I8I8toI8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_char16 src0, src1, dst;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), 
uniConvertI8toI8_1_part0_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8);\n\ + dst = min(src0, src1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_I8I8toI8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_char16 src0, src1, dst;\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord.z ++;\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part0_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_1_part1_2x8);\n\ + dst = min(src0, src1);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Hi_2x8;\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +__kernel void minimum_U8U8toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Hi_2x8);\n\ + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Hi_2x8);\n\ + dst = min(src0, src1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_U8U8toU8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, 
input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Hi_2x8);\n\ + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Hi_2x8);\n\ + dst = min(src0, src1);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8;\n\ +__kernel void minimum_I16I16toI16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8);\n\ + dst = min(src0, src1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_I16I16toI16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord.z ++;\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_0_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_1_2x8);\n\ + dst = min(src0, src1);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of minimum_vx*/ + +static const char minimum_fp16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_0_part1_2x8;\n\ +_viv_uniform VXC_512Bits uinConvertFp16ToInt8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt8toFp16_2x8;\n\ +\n\ +__kernel void minimum_I8F16toI8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_char16 src0, src2, dst;\n\ + vxc_short8 src1, src3, src4, src5;\n\ + vxc_half8 data0, data1, data2, data3;\n\ + vxc_char16 tmp0, tmp1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, 
VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src4, input1, coord, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, data0, src1, 16);\n\ + _viv_asm(COPY, data1, src4, 16);\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\ + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\ + VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\ + dst = min(src0, tmp0);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_I8F16toI8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_char16 src0, src2, dst;\n\ + vxc_short8 src1, src3, src4, src5;\n\ + vxc_half8 data0, data1, data2, data3;\n\ + vxc_char16 tmp0;\n\ +\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src4, input1, coord.xy, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, data0, src1, 16);\n\ + _viv_asm(COPY, data1, src4, 16);\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part0_2x8);\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_0_part1_2x8);\n\ + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\ + VXC_DP2x8(tmp0, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt8_2x8);\n\ + dst = min(src0, tmp0);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_I8F16toF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_char8 vec0, vec2;\n\ + vxc_short8 vec1, vec3, dst;\n\ + vxc_half8 src0, src1, src2, src3;\n\ +\n\ + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8);\n\ +\n\ + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_I8F16toF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + 
__read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_char8 vec0, vec2;\n\ + vxc_short8 vec1, vec3, dst;\n\ + vxc_half8 src0, src1, src2, src3;\n\ + VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt8toFp16_2x8);\n\ +\n\ + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +\n\ +__kernel void minimum_U8F16toF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar8 vec0, vec2;\n\ + vxc_short8 vec1, vec3, dst;\n\ + vxc_half8 src0, src1, src2, src3;\n\ +\n\ + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ +\n\ + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_U8F16toF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_uchar8 vec0, vec2;\n\ + vxc_short8 vec1, vec3, dst;\n\ + vxc_half8 src0, src1, src2, src3;\n\ +\n\ + VXC_ReadImage(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(src0, vec0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ +\n\ + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8;\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +__kernel void minimum_U8F16toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only 
image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 src0, dst0, dst1;\n\ + vxc_ushort8 src1, src2;\n\ + vxc_half8 data1, data2;\n\ + VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + _viv_asm(COPY, data2, src2, 16);\n\ +\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Hi_2x8);\n\ + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + dst0 = min(dst0, dst1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_U8F16toU8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_uchar16 src0, dst0, dst1;\n\ + vxc_ushort8 src1, src2;\n\ + vxc_half8 data1, data2;\n\ + VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src2, input1, coord, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + _viv_asm(COPY, data2, src2, 16);\n\ +\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Hi_2x8);\n\ + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + VXC_DP2x8(dst1, data2, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + dst0 = min(dst0, dst1);\n\ +\n\ + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_F16F16toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_ushort8 src0, src1;\n\ + vxc_half8 data0, data1;\n\ + vxc_uchar16 dst0, dst1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + _viv_asm(COPY, data1, src1, 16);\n\ +\n\ + vxc_ushort8 mp1;\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, data0, mp1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + dst0 = min(dst0, dst1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_F16F16toU8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_ushort8 src0, src1;\n\ + vxc_half8 data0, data1;\n\ + vxc_uchar16 dst0, dst1;\n\ + VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + _viv_asm(COPY, data1, src1, 16);\n\ +\n\ + vxc_ushort8 mp1;\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, data0, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + VXC_DP2x8(dst1, data1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertFp16toU8_2x8);\n\ + dst0 = min(dst0, dst1);\n\ +\n\ + VXC_WriteImage(output, coord, dst0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of minimum_fp16_vx*/ + +static const char minimum_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uinConvertFp16ToInt16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8;\n\ +\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvert1stFp16ToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndFp16ToFp32_4x4;\n\ +\n\ +__kernel void minimum_I16F16toI16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1, tmp0, dst;\n\ + vxc_half8 data0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, data0, src1, 16);\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8);\n\ + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8);\n\ + dst = min(src0, tmp0);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_I16F16toI16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1, tmp0, dst;\n\ + vxc_half8 data0;\n\ +\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, data0, src1, 16);\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1), uniConvertI16toI16_2x8);\n\ + VXC_DP2x8(tmp0, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uinConvertFp16ToInt16_2x8);\n\ + dst = min(src0, tmp0);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_I16F16toF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 vec0, vec1, dst;\n\ + vxc_half8 src0, src1;\n\ +\n\ + VXC_ReadImage2DArray(vec0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(vec1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8);\n\ +\n\ + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_I16F16toF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_short8 vec0, vec1, dst;\n\ + vxc_half8 src0, src1;\n\ + VXC_ReadImage(vec0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(vec1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ +\n\ + VXC_DP2x8(src0, vec0, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt16toFp16_2x8);\n\ +\n\ + VXC_VertMin3_Half(src0, src0, src1, src1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ + _viv_asm(COPY, dst, src0, 16);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_F16F16toI16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_half8 data0, data1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + _viv_asm(COPY, data1, src1, 16);\n\ +\n\ + VXC_VertMin3_Half(data0, data0, data1, data1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ +\n\ + int4 tmpDst0, tmpDst1;\n\ + float4 tmpData0, tmpData1;\n\ + VXC_DP4x4(tmpData0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4);\n\ + tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp);\n\ + tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ +\n\ + 
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_F16F16toI16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_half8 data0, data1;\n\ +\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + _viv_asm(COPY, data1, src1, 16);\n\ +\n\ + VXC_VertMin3_Half(data0, data0, data1, data1, VXC_MODIFIER_CLAMP(0, 7, 0, 0));\n\ +\n\ + int4 tmpDst0, tmpDst1;\n\ + float4 tmpData0, tmpData1;\n\ + VXC_DP4x4(tmpData0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert1stFp16ToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniConvert2ndFp16ToFp32_4x4);\n\ + tmpDst0 = convert_int4_rte(tmpData0 * outputScale + output_zp);\n\ + tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}"; /* end of minimum_i16_vx*/ + +static const char moments_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform float dimRatio;\n\ +\n\ +_viv_uniform float zpScaleSqr_i16;\n\ +_viv_uniform float zpScale2_i16;\n\ +_viv_uniform float sumScale_i16;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform float rowSumScale;\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +\n\ +#define MOMENTS_AXIS0_QINT(src0_type_name, read0_type) \\\n\ +__kernel void moments_axis0_##src0_type_name##toF16( \\\n\ + image2d_array_t input, \\\n\ + image2d_t output_mean, \\\n\ + image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidy = get_global_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(0, gidy, gidz, 0); \\\n\ + read0_type src0; \\\n\ + float sum = 0, sqr = 0; \\\n\ + int tmpSum = 0, tmpSqr = 0; \\\n\ + int4 tmpSum0, tmpSqr0; \\\n\ + \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP16x1(tmpSum0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \\\n\ + VXC_DP16x1(tmpSqr0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \\\n\ + tmpSum += (tmpSum0.x); \\\n\ + tmpSqr += (tmpSqr0.x + tmpZp1 * tmpSum0.x); \\\n\ + } \\\n\ + sqr = (convert_float(tmpSqr) * e2InScale + rowSumScale); \\\n\ + sum = convert_float(tmpSum + sumInZp) * input_scale; \\\n\ + \\\n\ + vxc_float4 mean_vari0 = (vxc_float4)(sum, sqr, 0, 0); \\\n\ + mean_vari0 *= dimRatio; \\\n\ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; \\\n\ + \\\n\ + int2 coord_out = (int2)(gidy, gidz); \\\n\ 
+ half4 tmpData; \\\n\ + vxc_half8 tmpVal; \\\n\ + vxc_short8 dst; \\\n\ + _viv_asm(CONV, tmpData, mean_vari0); \\\n\ + VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \\\n\ + _viv_asm(COPY, dst, tmpVal, 16); \\\n\ + \\\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +MOMENTS_AXIS0_QINT(U8, vxc_uchar16)\n\ +MOMENTS_AXIS0_QINT(I8, vxc_char16)\n\ +\n\ +#define MOMENTS_AXIS0_QINT_2D(src0_type_name, read0_type) \\\n\ +__kernel void moments_axis0_##src0_type_name##toF16_2D( \\\n\ + image2d_t input, \\\n\ + image2d_t output_mean, \\\n\ + image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidy = get_global_id(0); \\\n\ + int2 coord = (int2)(0, gidy); \\\n\ + read0_type src0; \\\n\ + float sum = 0, sqr = 0; \\\n\ + int tmpSum = 0, tmpSqr = 0; \\\n\ + int4 tmpSum0, tmpSqr0; \\\n\ + \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP16x1(tmpSum0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \\\n\ + VXC_DP16x1(tmpSqr0, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \\\n\ + tmpSum += (tmpSum0.x); \\\n\ + tmpSqr += (tmpSqr0.x + tmpZp1 * tmpSum0.x); \\\n\ + } \\\n\ + sqr = (convert_float(tmpSqr) * e2InScale + rowSumScale); \\\n\ + sum = convert_float(tmpSum + sumInZp) * input_scale; \\\n\ + \\\n\ + vxc_float4 mean_vari0 = (vxc_float4)(sum, sqr, 0, 0); \\\n\ + mean_vari0 *= dimRatio; \\\n\ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; \\\n\ + \\\n\ + int2 coord_out = (int2)(gidy, 0); \\\n\ + half4 tmpData; \\\n\ + vxc_half8 tmpVal; \\\n\ + vxc_short8 dst; \\\n\ + _viv_asm(CONV, tmpData, mean_vari0); \\\n\ + VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \\\n\ + _viv_asm(COPY, dst, tmpVal, 16); \\\n\ + \\\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +MOMENTS_AXIS0_QINT_2D(U8, vxc_uchar16)\n\ +MOMENTS_AXIS0_QINT_2D(I8, vxc_char16)\n\ +\n\ +__kernel void moments_axis0_F16toF16(\n\ + image2d_array_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidy = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(0, gidy, gidz, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h0;\n\ + vxc_float4 sumsqr0;\n\ + vxc_float4 mean_vari0 = (vxc_float4)(0);\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, in_h0, src0, 16);\n\ + VXC_DP8x2(sumsqr0, in_h0, in_h0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + mean_vari0 += sumsqr0;\n\ + }\n\ +\n\ + mean_vari0 *= dimRatio;\n\ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0;\n\ +\n\ + int2 coord_out = (int2)(gidy, gidz);\n\ +\n\ + half4 tmpData;\n\ + vxc_half8 tmpVal;\n\ + vxc_short8 dst;\n\ + _viv_asm(CONV, tmpData, mean_vari0);\n\ + VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void moments_axis0_F16toF16_2D(\n\ + image2d_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidy = get_global_id(0);\n\ + int2 coord = (int2)(0, gidy);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h0;\n\ + vxc_float4 sumsqr0;\n\ + vxc_float4 mean_vari0 = (vxc_float4)(0);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, in_h0, src0, 16);\n\ + VXC_DP8x2(sumsqr0, in_h0, in_h0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + mean_vari0 += sumsqr0;\n\ + }\n\ + mean_vari0 *= dimRatio;\n\ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0;\n\ +\n\ + int2 coord_out = (int2)(gidy, 0);\n\ +\n\ + half4 tmpData;\n\ + vxc_half8 tmpVal;\n\ + vxc_short8 dst;\n\ + _viv_asm(CONV, tmpData, mean_vari0);\n\ + VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void moments_axis0_I16toF16(\n\ + image2d_array_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidy = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(0, gidy, gidz, 0);\n\ + vxc_short8 src0;\n\ + float4 sumsqr0;\n\ + float sum = 0, sqr = 0;\n\ + float tmpSum = 0;\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP8x2(sumsqr0, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSum += sumsqr0.x;\n\ + sqr += (sumsqr0.y * e2InScale + zpScaleSqr_i16 + zpScale2_i16 * sumsqr0.x);\n\ + }\n\ + sum = tmpSum * input_scale + sumScale_i16;\n\ +\n\ + vxc_float4 mean_vari0 = (vxc_float4)(sum, sqr, 0, 0);\n\ + mean_vari0 *= dimRatio;\n\ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0;\n\ +\n\ + int2 coord_out = (int2)(gidy, gidz);\n\ +\n\ + half4 tmpData;\n\ + vxc_half8 tmpVal;\n\ + vxc_short8 dst;\n\ + _viv_asm(CONV, tmpData, mean_vari0);\n\ + VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void moments_axis0_I16toF16_2D(\n\ + image2d_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidy = get_global_id(0);\n\ + int2 coord = (int2)(0, gidy);\n\ + vxc_short8 src0;\n\ + float4 sumsqr0;\n\ + float sum = 0, sqr = 0;\n\ + float tmpSum = 0;\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage(src0, 
input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP8x2(sumsqr0, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSum += sumsqr0.x;\n\ + sqr += (sumsqr0.y * e2InScale + zpScaleSqr_i16 + zpScale2_i16 * sumsqr0.x);\n\ + }\n\ + sum = tmpSum * input_scale + sumScale_i16;\n\ +\n\ + vxc_float4 mean_vari0 = (vxc_float4)(sum, sqr, 0, 0);\n\ + mean_vari0 *= dimRatio;\n\ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0;\n\ +\n\ + int2 coord_out = (int2)(gidy, 0);\n\ +\n\ + half4 tmpData;\n\ + vxc_half8 tmpVal;\n\ + vxc_short8 dst;\n\ + _viv_asm(CONV, tmpData, mean_vari0);\n\ + VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of moments_axis0_vx*/ + +static const char moments_axis01_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform float zpScaleSqr_i16;\n\ +_viv_uniform float zpScale2_i16;\n\ +_viv_uniform float sumScale_i16;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform float rowSumScale;\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +\n\ +#define MOMENTS_AXIS01_QINT(src0_type_name, read0_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_##src0_type_name##toF16( \\\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 4; \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(gidx, 0, gidz, 0); \\\n\ + read0_type src0; \\\n\ + float sum = 0, sqr = 0; \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + \\\n\ + for(coord.x = gidx; coord.x < width; coord.x += 256) \\\n\ + { \\\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \\\n\ + tmpSum += (tmpSum1); \\\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \\\n\ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); \\\n\ + } \\\n\ + sqr += (tmpSqr * e2InScale + rowSumScale); \\\n\ + sum += (tmpSum + sumInZp) * input_scale; \\\n\ + } \\\n\ + lcl_sum[lidx] = sum; \\\n\ + lcl_sqr[lidx] = sqr; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + int2 coord_out = (int2)(gidz, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + \\\n\ + sum = (0); \\\n\ + sqr = (0); \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += 
dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + float4 mean, vari; \\\n\ + mean.x = sum * dimRatio; \\\n\ + vari.x = sqr * dimRatio; \\\n\ + vari.x = vari.x - mean.x * mean.x; \\\n\ + \\\n\ + half4 tmpMean, tmpVari; \\\n\ + vxc_half8 tmpVal; \\\n\ + vxc_short8 dst; \\\n\ + _viv_asm(CONV, tmpMean, mean); \\\n\ + _viv_asm(CONV, tmpVari, vari); \\\n\ + VXC_DP2x8(tmpVal, tmpMean, tmpVari, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \\\n\ + _viv_asm(COPY, dst, tmpVal, 16); \\\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +MOMENTS_AXIS01_QINT(U8, vxc_uchar16)\n\ +MOMENTS_AXIS01_QINT(I8, vxc_char16)\n\ +\n\ +#define MOMENTS_AXIS01_QINT_2D(src0_type_name, read0_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_##src0_type_name##toF16_2D( \\\n\ + image2d_t input, image2d_t output_mean, image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 4; \\\n\ + int lidx = get_local_id(0); \\\n\ + int2 coord = (int2)(gidx, 0); \\\n\ + read0_type src0; \\\n\ + float sum = 0, sqr = 0; \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + \\\n\ + for(coord.x = gidx; coord.x < width; coord.x += 256) \\\n\ + { \\\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \\\n\ + tmpSum += (tmpSum1); \\\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \\\n\ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); \\\n\ + } \\\n\ + sqr += (tmpSqr * e2InScale + rowSumScale); \\\n\ + sum += (tmpSum + sumInZp) * input_scale; \\\n\ + } \\\n\ + lcl_sum[lidx] = sum; \\\n\ + lcl_sqr[lidx] = sqr; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + int2 coord_out = (int2)(0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + \\\n\ + sum = (0); \\\n\ + sqr = (0); \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + float4 mean, vari; \\\n\ + mean.x = sum * dimRatio; \\\n\ + vari.x = sqr * dimRatio; \\\n\ + vari.x = vari.x - mean.x * mean.x; \\\n\ + \\\n\ + half4 tmpMean, tmpVari; \\\n\ + vxc_half8 tmpVal; \\\n\ + vxc_short8 dst; \\\n\ + _viv_asm(CONV, tmpMean, mean); \\\n\ + _viv_asm(CONV, tmpVari, vari); \\\n\ + VXC_DP2x8(tmpVal, tmpMean, tmpVari, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \\\n\ + _viv_asm(COPY, dst, tmpVal, 16); \\\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +MOMENTS_AXIS01_QINT_2D(U8, vxc_uchar16)\n\ +MOMENTS_AXIS01_QINT_2D(I8, vxc_char16)\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_F16toF16(\n\ + image2d_array_t input, image2d_t output_mean, 
image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + vxc_float4 sumsqr;\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(coord.x = gidx; coord.x < width; coord.x += 128)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(gidz, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0.0f;\n\ + float sqr = 0.0f;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + half4 tmpMean, tmpVari;\n\ + vxc_half8 tmpVal;\n\ + vxc_short8 dst;\n\ + _viv_asm(CONV, tmpMean, mean);\n\ + _viv_asm(CONV, tmpVari, vari);\n\ + VXC_DP2x8(tmpVal, tmpMean, tmpVari,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_F16toF16_2D(\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int2 coord = (int2)(gidx, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + vxc_float4 sumsqr;\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(coord.x = gidx; coord.x < width; coord.x += 128)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0.0f;\n\ + float sqr = 0.0f;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + half4 tmpMean, 
tmpVari;\n\ + vxc_half8 tmpVal;\n\ + vxc_short8 dst;\n\ + _viv_asm(CONV, tmpMean, mean);\n\ + _viv_asm(CONV, tmpVari, vari);\n\ + VXC_DP2x8(tmpVal, tmpMean, tmpVari,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_I16toF16(\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + float4 sumsqr;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(coord.x = gidx; coord.x < width; coord.x += 128)\n\ + {\n\ + float tmpSum = 0;\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSum += sumsqr.x;\n\ + sqr += (sumsqr.y * e2InScale + zpScaleSqr_i16 + zpScale2_i16 * sumsqr.x);\n\ + }\n\ + sum += tmpSum * input_scale + sumScale_i16;\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(gidz, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0.0f;\n\ + sqr = 0.0f;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + half4 tmpMean, tmpVari;\n\ + vxc_half8 tmpVal;\n\ + vxc_short8 dst;\n\ + _viv_asm(CONV, tmpMean, mean);\n\ + _viv_asm(CONV, tmpVari, vari);\n\ + VXC_DP2x8(tmpVal, tmpMean, tmpVari,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_I16toF16_2D(\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int2 coord = (int2)(gidx, 0);\n\ + vxc_short8 src0;\n\ + float4 sumsqr;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(coord.x = gidx; coord.x < width; coord.x += 128)\n\ + {\n\ + float tmpSum = 0;\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSum += sumsqr.x;\n\ + sqr += (sumsqr.y * 
e2InScale + zpScaleSqr_i16 + zpScale2_i16 * sumsqr.x);\n\ + }\n\ + sum += tmpSum * input_scale + sumScale_i16;\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0.0f;\n\ + sqr = 0.0f;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 mean_vari;\n\ + mean_vari.x = sum * dimRatio;\n\ + mean_vari.y = sqr * dimRatio;\n\ + mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x;\n\ +\n\ + half4 tmpMean, tmpVari;\n\ + vxc_half8 tmpVal;\n\ + vxc_short8 dst;\n\ + _viv_asm(CONV, tmpMean, mean_vari);\n\ + //_viv_asm(CONV, tmpVari, vari);\n\ + VXC_DP2x8(tmpVal, tmpMean, tmpMean,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of moments_axis01_vx*/ + +static const char moments_axis012_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform float zpScaleSqr_i16;\n\ +_viv_uniform float zpScale2_i16;\n\ +_viv_uniform float sumScale_i16;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform float rowSumScale;\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +\n\ +#define MOMENTS_AXIS012_QINT(src0_type_name, read0_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_##src0_type_name##toF16( \\\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 4; \\\n\ + int lidx = get_local_id(0); \\\n\ + int4 coord = (int4)(gidx, 0, 0, 0); \\\n\ + read0_type src0; \\\n\ + float sum = 0, sqr = 0; \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + for(coord.x = gidx; coord.x < width; coord.x += 256) \\\n\ + { \\\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); \\\n\ + tmpSum += (tmpSum1); \\\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); \\\n\ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); \\\n\ + } \\\n\ + sqr += (tmpSqr * e2InScale + rowSumScale); \\\n\ + sum += (tmpSum + sumInZp) * input_scale; \\\n\ + } \\\n\ + } \\\n\ + lcl_sum[lidx] = sum; \\\n\ + lcl_sqr[lidx] = sqr; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + int2 coord_out = (int2)(0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + 
float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + \\\n\ + sum = (0); \\\n\ + sqr = (0); \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + float4 mean, vari; \\\n\ + mean.x = sum * dimRatio; \\\n\ + vari.x = sqr * dimRatio; \\\n\ + vari.x = vari.x - mean.x * mean.x; \\\n\ + \\\n\ + half4 tmpMean, tmpVari; \\\n\ + vxc_half8 tmpVal; \\\n\ + vxc_short8 dst; \\\n\ + _viv_asm(CONV, tmpMean, mean); \\\n\ + _viv_asm(CONV, tmpVari, vari); \\\n\ + VXC_DP2x8(tmpVal, tmpMean, tmpVari, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \\\n\ + _viv_asm(COPY, dst, tmpVal, 16); \\\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +\n\ +MOMENTS_AXIS012_QINT(U8, vxc_uchar16)\n\ +MOMENTS_AXIS012_QINT(I8, vxc_char16)\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_F16toF16(\n\ + image2d_array_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis,\n\ + int axis_num)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int4 coord = (int4)(gidx, 0, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + vxc_float4 sumsqr;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + for(coord.x = gidx; coord.x < width; coord.x += 128)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + }\n\ + }\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ +\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = (float)(0);\n\ + float sqr = (float)(0);\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + half4 tmpMean, tmpVari;\n\ + vxc_half8 tmpVal;\n\ + vxc_short8 dst;\n\ + _viv_asm(CONV, tmpMean, mean);\n\ + _viv_asm(CONV, tmpVari, vari);\n\ + VXC_DP2x8(tmpVal, tmpMean, tmpVari,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_I16toF16(\n\ + image2d_array_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis,\n\ + int axis_num)\n\ +{\n\ + int gidx = get_global_id(0) << 
3;\n\ + int lidx = get_local_id(0);\n\ + int4 coord = (int4)(gidx, 0, 0, 0);\n\ + vxc_short8 src0;\n\ + float4 sumsqr;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + for(coord.x = gidx; coord.x < width; coord.x += 128)\n\ + {\n\ + float tmpSum = 0;\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSum += sumsqr.x;\n\ + sqr += (sumsqr.y * e2InScale + zpScaleSqr_i16 + zpScale2_i16 * sumsqr.x);\n\ + }\n\ + sum += tmpSum * input_scale + sumScale_i16;\n\ + }\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ +\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = (float)(0);\n\ + sqr = (float)(0);\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + half4 tmpMean, tmpVari;\n\ + vxc_half8 tmpVal;\n\ + vxc_short8 dst;\n\ + _viv_asm(CONV, tmpMean, mean);\n\ + _viv_asm(CONV, tmpVari, vari);\n\ + VXC_DP2x8(tmpVal, tmpMean, tmpVari,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of moments_axis012_vx*/ + +static const char moments_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int height;\n\ +_viv_uniform float dimRatio;\n\ +\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform float e2InScale;\n\ +\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +\n\ +#define MOMENTS_AXIS1_QINT(src0_type_name, read0_type) \\\n\ +__kernel void moments_axis1_##src0_type_name##toF16( \\\n\ + image2d_array_t input, \\\n\ + image2d_t output_mean, \\\n\ + image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(gidx, 0, gidz, 0); \\\n\ + read0_type src0; \\\n\ + float4 sum = 0, sqr = 0; \\\n\ + short zp = inputZP;\\\n\ + float4 tmpData0;\\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + sum += (tmpData0); \\\n\ + sqr += (tmpData0 * tmpData0); \\\n\ + } \\\n\ + sum *= input_scale; \\\n\ + sqr *= e2InScale; \\\n\ + \\\n\ + float4 mean = sum * dimRatio; \\\n\ + float4 vari = sqr * dimRatio; \\\n\ + vari = vari - mean * mean; \\\n\ + \\\n\ + int2 
coord_out = (int2)(gidx, gidz); \\\n\ + half4 tmpMean, tmpVari; \\\n\ + vxc_half8 tmpVal; \\\n\ + vxc_short8 dst; \\\n\ + _viv_asm(CONV, tmpMean, mean); \\\n\ + _viv_asm(CONV, tmpVari, vari); \\\n\ + VXC_DP2x8(tmpVal, tmpMean, tmpVari, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \\\n\ + _viv_asm(COPY, dst, tmpVal, 16); \\\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +MOMENTS_AXIS1_QINT(U8, vxc_uchar16)\n\ +MOMENTS_AXIS1_QINT(I8, vxc_char16)\n\ +MOMENTS_AXIS1_QINT(I16, vxc_short8)\n\ +\n\ +#define MOMENTS_AXIS1_QINT_2D(src0_type_name, read0_type) \\\n\ +__kernel void moments_axis1_##src0_type_name##toF16_2D( \\\n\ + image2d_t input, \\\n\ + image2d_t output_mean, \\\n\ + image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int2 coord = (int2)(gidx, 0); \\\n\ + read0_type src0; \\\n\ + float4 sum = 0, sqr = 0; \\\n\ + short zp = inputZP;\\\n\ + float4 tmpData0;\\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + sum += (tmpData0); \\\n\ + sqr += (tmpData0 * tmpData0); \\\n\ + } \\\n\ + sum *= input_scale; \\\n\ + sqr *= e2InScale; \\\n\ + \\\n\ + float4 mean = sum * dimRatio; \\\n\ + float4 vari = sqr * dimRatio; \\\n\ + vari = vari - mean * mean; \\\n\ + \\\n\ + int2 coord_out = (int2)(gidx, 0); \\\n\ + half4 tmpMean, tmpVari; \\\n\ + vxc_half8 tmpVal; \\\n\ + vxc_short8 dst; \\\n\ + _viv_asm(CONV, tmpMean, mean); \\\n\ + _viv_asm(CONV, tmpVari, vari); \\\n\ + VXC_DP2x8(tmpVal, tmpMean, tmpVari, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \\\n\ + _viv_asm(COPY, dst, tmpVal, 16); \\\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +MOMENTS_AXIS1_QINT_2D(U8, vxc_uchar16)\n\ +MOMENTS_AXIS1_QINT_2D(I8, vxc_char16)\n\ +MOMENTS_AXIS1_QINT_2D(I16, vxc_short8)\n\ +\n\ +__kernel void moments_axis1_F16toF16(\n\ + image2d_array_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h0;\n\ + vxc_float4 tmpSrc0;\n\ + vxc_float4 sum = (vxc_float4)(0);\n\ + vxc_float4 sqr = (vxc_float4)(0);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, in_h0, src0, 16);\n\ + VXC_DP4x4(tmpSrc0, in_h0, in_h0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ + sum += tmpSrc0;\n\ + sqr += (tmpSrc0 * tmpSrc0);\n\ + }\n\ +\n\ + vxc_float4 mean = sum * dimRatio;\n\ + vxc_float4 vari = sqr * dimRatio;\n\ + vari = vari - mean * mean;\n\ +\n\ + int2 coord_out = (int2)(gidx, gidz);\n\ + half4 tmpMean, tmpVari;\n\ + vxc_half8 tmpVal;\n\ + vxc_short8 dst;\n\ + _viv_asm(CONV, tmpMean, mean);\n\ + _viv_asm(CONV, tmpVari, vari);\n\ + 
VXC_DP2x8(tmpVal, tmpMean, tmpVari, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void moments_axis1_F16toF16_2D(\n\ + image2d_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h0;\n\ + vxc_float4 tmpSrc0;\n\ + vxc_float4 sum = (vxc_float4)(0);\n\ + vxc_float4 sqr = (vxc_float4)(0);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, in_h0, src0, 16);\n\ + VXC_DP4x4(tmpSrc0, in_h0, in_h0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ + sum += tmpSrc0;\n\ + sqr += (tmpSrc0 * tmpSrc0);\n\ + }\n\ +\n\ + vxc_float4 mean = sum * dimRatio;\n\ + vxc_float4 vari = sqr * dimRatio;\n\ + vari = vari - mean * mean;\n\ +\n\ + int2 coord_out = (int2)(gidx, 0);\n\ + half4 tmpMean, tmpVari;\n\ + vxc_half8 tmpVal;\n\ + vxc_short8 dst;\n\ + _viv_asm(CONV, tmpMean, mean);\n\ + _viv_asm(CONV, tmpVari, vari);\n\ + VXC_DP2x8(tmpVal, tmpMean, tmpVari, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of moments_axis1_vx*/ + +static const char moments_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int channel;\n\ +_viv_uniform float dimRatio;\n\ +\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +\n\ +#define MOMENTS_AXIS2_QINT(src0_type_name, read0_type) \\\n\ +__kernel void moments_axis2_##src0_type_name##toF16( \\\n\ + image2d_array_t input, \\\n\ + image2d_t output_mean, \\\n\ + image2d_t output_vari, \\\n\ + int axis, int axis_num) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int4 coord = (int4)(gidx, gidy, 0, 0); \\\n\ + read0_type src0; \\\n\ + float4 sum = 0, sqr = 0; \\\n\ + short zp = inputZP;\\\n\ + float4 tmpData0;\\\n\ + \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + sum += (tmpData0); \\\n\ + sqr += (tmpData0 * tmpData0); \\\n\ + } \\\n\ + sum *= input_scale; \\\n\ + sqr *= e2InScale; \\\n\ + \\\n\ + float4 mean = sum * dimRatio; \\\n\ + float4 vari = sqr * dimRatio; \\\n\ + vari = vari - mean * mean; \\\n\ + \\\n\ + int2 coord_out = (int2)(gidx, gidy); \\\n\ + half4 tmpMean, tmpVari; \\\n\ + vxc_half8 tmpVal; \\\n\ + vxc_short8 dst; \\\n\ + _viv_asm(CONV, tmpMean, mean); \\\n\ + _viv_asm(CONV, tmpVari, vari); \\\n\ + VXC_DP2x8(tmpVal, tmpMean, tmpVari, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); \\\n\ + _viv_asm(COPY, dst, tmpVal, 16); \\\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +MOMENTS_AXIS2_QINT(U8, vxc_uchar16)\n\ +MOMENTS_AXIS2_QINT(I8, vxc_char16)\n\ +MOMENTS_AXIS2_QINT(I16, vxc_short8)\n\ +\n\ +__kernel void moments_axis2_F16toF16(\n\ + image2d_array_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis,\n\ + int axis_num)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord = (int4)(gidx, gidy, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h0;\n\ + vxc_float4 tmpSrc0;\n\ + vxc_float4 sum = (vxc_float4)(0);\n\ + vxc_float4 sqr = (vxc_float4)(0);\n\ +\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, in_h0, src0, 16);\n\ + VXC_DP4x4(tmpSrc0, in_h0, in_h0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ + sum += tmpSrc0;\n\ + sqr += (tmpSrc0 * tmpSrc0);\n\ + }\n\ +\n\ + vxc_float4 mean = sum * dimRatio;\n\ + vxc_float4 vari = sqr * dimRatio;\n\ + vari = vari - mean * mean;\n\ +\n\ + int2 coord_out = (int2)(gidx, gidy);\n\ + half4 tmpMean, tmpVari;\n\ + vxc_half8 tmpVal;\n\ + vxc_short8 dst;\n\ + _viv_asm(CONV, tmpMean, mean);\n\ + _viv_asm(CONV, tmpVari, vari);\n\ + VXC_DP2x8(tmpVal, tmpMean, tmpVari, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of moments_axis2_vx*/ + +static const char poolwithargmax_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +//-------------------max pooling with argmax---------------\n\ +_viv_uniform VXC_512Bits poolingEncode;\n\ +_viv_uniform VXC_512Bits uniQuantInOutInt16Even_4x4;\n\ +\n\ +#define POOLWITHARGMAX_F16_TO_F16_U8_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 din0, din1, maxData, src0, src1; \\\n\ + vxc_half8 din0Fp16, din1Fp16; \\\n\ + vxc_half8 maxDataVer, maxDataVer1; \\\n\ + int4 bitExtractCoeff; \\\n\ + vxc_short8 din0EqualTmp, din1EqualTmp; \\\n\ + vxc_uchar8 din0Equal, din1Equal; \\\n\ + vxc_uchar4 axisEncode; \\\n\ + vxc_uchar4 axisOut; \\\n\ + read_fun(src0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(src1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, din0Fp16, src0, 16); \\\n\ + _viv_asm(COPY, din1Fp16, src1, 16); \\\n\ + VXC_VertMax3_Half(maxDataVer, din0Fp16, din1Fp16, din1Fp16, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, din0, maxDataVer, 16); \\\n\ + din1 = din0.s10325476; \\\n\ + _viv_asm(COPY, maxDataVer1, din1, 16); \\\n\ + VXC_VertMax3_Half(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, din0, maxDataVer, 16); \\\n\ + din1 = din0.s02460246; \\\n\ + _viv_asm(COPY, maxData, maxDataVer, 16); \\\n\ + vxc_short8 one = (vxc_short8)(1, 1, 1, 1, 1, 1, 1, 1); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + din0EqualTmp = src0 == maxData ? 
one : zero; \\\n\ + din1EqualTmp = src1 == maxData ? one : zero; \\\n\ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 3, 0), poolingEncode); \\\n\ + axisOut = clz(axisEncode); \\\n\ + write_fun(tensorOut, coordOut, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void poolwithargmax_F16to_F16_U8\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0);\n\ + POOLWITHARGMAX_F16_TO_F16_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void poolwithargmax_F16to_F16_U8_2D\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1);\n\ + POOLWITHARGMAX_F16_TO_F16_U8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +#define POOLWITHARGMAX_F16_TO_I16_U8_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 din0, din1, maxData, src0, src1; \\\n\ + vxc_half8 din0Fp16, din1Fp16; \\\n\ + vxc_half8 maxDataVer, maxDataVer1; \\\n\ + int4 bitExtractCoeff; \\\n\ + vxc_short8 din0EqualTmp, din1EqualTmp; \\\n\ + vxc_uchar8 din0Equal, din1Equal; \\\n\ + vxc_uchar4 axisEncode; \\\n\ + vxc_uchar4 axisOut; \\\n\ + read_fun(src0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(src1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, din0Fp16, src0, 16); \\\n\ + _viv_asm(COPY, din1Fp16, src1, 16); \\\n\ + VXC_VertMax3_Half(maxDataVer, din0Fp16, din1Fp16, din1Fp16, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, din0, maxDataVer, 16); \\\n\ + din1 = din0.s10325476; \\\n\ + _viv_asm(COPY, maxDataVer1, din1, 16); \\\n\ + VXC_VertMax3_Half(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, din0, maxDataVer, 16); \\\n\ + VXC_DP4x4(din1, din0, din0, VXC_MODIFIER_BIN(0, 3, 0), uniQuantInOutInt16Even_4x4); \\\n\ + _viv_asm(COPY, maxData, maxDataVer, 16); \\\n\ + vxc_short8 one = (vxc_short8)(1, 1, 1, 1, 1, 1, 1, 1); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + din0EqualTmp = src0 == maxData ? one : zero; \\\n\ + din1EqualTmp = src1 == maxData ? 
one : zero; \\\n\ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 3, 0), poolingEncode); \\\n\ + axisOut = clz(axisEncode); \\\n\ + write_fun(tensorOut, coordOut, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void poolwithargmax_F16to_I16_U8\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0);\n\ + POOLWITHARGMAX_F16_TO_I16_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void poolwithargmax_F16to_I16_U8_2D\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1);\n\ + POOLWITHARGMAX_F16_TO_I16_U8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +"; /* end of poolwithargmax_F16_vx*/ + +static const char poolwithargmax_I16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits poolingEncode2;\n\ +\n\ +\n\ +#define POOLWITHARGMAX_I16_TO_I16_U8_SAME_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 din0, din1; \\\n\ + vxc_short8 din0Fp16, din1Fp16; \\\n\ + vxc_short8 maxDataVer, maxDataVer1; \\\n\ + int4 bitExtractCoeff; \\\n\ + vxc_short8 din0EqualTmp, din1EqualTmp; \\\n\ + vxc_uchar8 din0Equal, din1Equal; \\\n\ + vxc_uchar4 axisEncode; \\\n\ + vxc_uchar4 axisOut; \\\n\ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, din0Fp16, din0, 16); \\\n\ + _viv_asm(COPY, din1Fp16, din1, 16); \\\n\ + VXC_VertMax3_Integer(maxDataVer, din0Fp16, din1Fp16, din1Fp16, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, din0, maxDataVer, 16); \\\n\ + din1 = din0.s10325476; \\\n\ + _viv_asm(COPY, maxDataVer1, din1, 16); \\\n\ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, din0, maxDataVer, 16); \\\n\ + din1 = din0.s02460246; \\\n\ + write_fun(tensorOut, coordOut, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_Clamp(din0EqualTmp, din0Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + VXC_Clamp(din1EqualTmp, din1Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + bitExtractCoeff = (int4)(0x30201000, 0x70605040, 0x01010101, 0x01010101); \\\n\ + VXC_BitExtract(din0Equal, din0EqualTmp, din0EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + VXC_BitExtract(din1Equal, din1EqualTmp, din1EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + VXC_DP4x4(axisEncode, din0Equal, din1Equal, VXC_MODIFIER_BIN(0, 3, 0), poolingEncode2); \\\n\ + axisOut = clz(axisEncode); \\\n\ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void poolwithargmax_I16to_I16_U8_SAME\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0);\n\ + 
POOLWITHARGMAX_I16_TO_I16_U8_SAME_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void poolwithargmax_I16to_I16_U8_SAME_2D\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1);\n\ + POOLWITHARGMAX_I16_TO_I16_U8_SAME_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniQuantInOutInt16Even_4x4;\n\ +\n\ +#define POOLWITHARGMAX_I16_TO_I16_U8_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 din0, din1; \\\n\ + vxc_short8 din0Fp16, din1Fp16; \\\n\ + vxc_short8 maxDataVer, maxDataVer1; \\\n\ + int4 bitExtractCoeff; \\\n\ + vxc_short8 din0EqualTmp, din1EqualTmp; \\\n\ + vxc_uchar8 din0Equal, din1Equal; \\\n\ + vxc_uchar4 axisEncode; \\\n\ + vxc_uchar4 axisOut; \\\n\ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, din0Fp16, din0, 16); \\\n\ + _viv_asm(COPY, din1Fp16, din1, 16); \\\n\ + VXC_VertMax3_Integer(maxDataVer, din0Fp16, din1Fp16, din1Fp16, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, din0, maxDataVer, 16); \\\n\ + din1 = din0.s10325476; \\\n\ + _viv_asm(COPY, maxDataVer1, din1, 16); \\\n\ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, din0, maxDataVer, 16); \\\n\ + VXC_DP4x4(din1, din0, din0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniQuantInOutInt16Even_4x4); \\\n\ + write_fun(tensorOut, coordOut, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_Clamp(din0EqualTmp, din0Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + VXC_Clamp(din1EqualTmp, din1Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + bitExtractCoeff = (int4)(0x30201000, 0x70605040, 0x01010101, 0x01010101); \\\n\ + VXC_BitExtract(din0Equal, din0EqualTmp, din0EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + VXC_BitExtract(din1Equal, din1EqualTmp, din1EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + VXC_DP4x4(axisEncode, din0Equal, din1Equal, VXC_MODIFIER_BIN(0, 3, 0), poolingEncode2); \\\n\ + axisOut = clz(axisEncode); \\\n\ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void poolwithargmax_I16to_I16_U8\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0);\n\ + POOLWITHARGMAX_I16_TO_I16_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void poolwithargmax_I16to_I16_U8_2D\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1);\n\ + POOLWITHARGMAX_I16_TO_I16_U8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +\n\ +#define POOLWITHARGMAX_I16_TO_I16_I16_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 din0, din1; \\\n\ + vxc_short8 din0Fp16, din1Fp16; \\\n\ + vxc_short8 maxDataVer, maxDataVer1; \\\n\ + int4 bitExtractCoeff; \\\n\ + vxc_short8 din0EqualTmp, 
din1EqualTmp; \\\n\ + vxc_uchar8 din0Equal, din1Equal; \\\n\ + vxc_uchar4 axisEncode; \\\n\ + vxc_uchar4 axisOut; \\\n\ + vxc_short4 axisVal; \\\n\ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, din0Fp16, din0, 16); \\\n\ + _viv_asm(COPY, din1Fp16, din1, 16); \\\n\ + VXC_VertMax3_Integer(maxDataVer, din0Fp16, din1Fp16, din1Fp16, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, din0, maxDataVer, 16); \\\n\ + din1 = din0.s10325476; \\\n\ + _viv_asm(COPY, maxDataVer1, din1, 16); \\\n\ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, din0, maxDataVer, 16); \\\n\ + VXC_DP4x4(din1, din0, din0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniQuantInOutInt16Even_4x4); \\\n\ + write_fun(tensorOut, coordOut, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_Clamp(din0EqualTmp, din0Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + VXC_Clamp(din1EqualTmp, din1Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + bitExtractCoeff = (int4)(0x30201000, 0x70605040, 0x01010101, 0x01010101); \\\n\ + VXC_BitExtract(din0Equal, din0EqualTmp, din0EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + VXC_BitExtract(din1Equal, din1EqualTmp, din1EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + VXC_DP4x4(axisEncode, din0Equal, din1Equal, VXC_MODIFIER_BIN(0, 3, 0), poolingEncode2); \\\n\ + axisOut = clz(axisEncode); \\\n\ + axisVal = convert_short4(axisOut); \\\n\ + write_fun(axis, coordOut, axisVal, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void poolwithargmax_I16to_I16_I16\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0);\n\ + POOLWITHARGMAX_I16_TO_I16_I16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void poolwithargmax_I16to_I16_I16_2D\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1);\n\ + POOLWITHARGMAX_I16_TO_I16_I16_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertDirInt16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ +_viv_uniform float input_fl_scale_i16;\n\ +_viv_uniform VXC_512Bits uniPackHalf8_2x8_2;\n\ +\n\ +#define POOLWITHARGMAX_I16_TO_F16_U8_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 din0, din1; \\\n\ + vxc_short8 din0Fp16, din1Fp16; \\\n\ + vxc_short8 maxDataVer, maxDataVer1; \\\n\ + int4 bitExtractCoeff; \\\n\ + vxc_short8 din0EqualTmp, din1EqualTmp; \\\n\ + vxc_uchar8 din0Equal, din1Equal; \\\n\ + vxc_uchar4 axisEncode; \\\n\ + vxc_uchar4 axisOut; \\\n\ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, din0Fp16, din0, 16); \\\n\ + _viv_asm(COPY, din1Fp16, din1, 16); \\\n\ + VXC_VertMax3_Integer(maxDataVer, din0Fp16, din1Fp16, 
din1Fp16, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, din0, maxDataVer, 16); \\\n\ + din1 = din0.s10325476; \\\n\ + _viv_asm(COPY, maxDataVer1, din1, 16); \\\n\ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + _viv_asm(COPY, din0, maxDataVer, 16); \\\n\ + din1 = din0.s02460246; \\\n\ + vxc_float4 tmpVal0, tmpVal1, tmpVal2, tmpVal3; \\\n\ + half4 tmpOut0, tmpOut1; \\\n\ + vxc_half8 tmpPack; \\\n\ + VXC_DP4x4(tmpVal0, din1, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt16Fp32_4x4); \\\n\ + VXC_DP4x4(tmpVal2, din1, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt16Fp32_4x4); \\\n\ + tmpVal1 = tmpVal0 * input_fl_scale_i16; \\\n\ + _viv_asm(CONV, tmpOut0, tmpVal1); \\\n\ + tmpVal3 = tmpVal2 * input_fl_scale_i16; \\\n\ + _viv_asm(CONV, tmpOut1, tmpVal3); \\\n\ + VXC_DP2x8(tmpPack, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackHalf8_2x8_2); \\\n\ + _viv_asm(COPY, din1, tmpPack, 16); \\\n\ + VXC_Clamp(din0EqualTmp, din0Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + VXC_Clamp(din1EqualTmp, din1Fp16, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + bitExtractCoeff = (int4)(0x30201000, 0x70605040, 0x01010101, 0x01010101); \\\n\ + VXC_BitExtract(din0Equal, din0EqualTmp, din0EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + VXC_BitExtract(din1Equal, din1EqualTmp, din1EqualTmp, bitExtractCoeff, VXC_MODIFIER_BIN(0, 7, 0)); \\\n\ + VXC_DP4x4(axisEncode, din0Equal, din1Equal, VXC_MODIFIER_BIN(0, 3, 0), poolingEncode2); \\\n\ + axisOut = clz(axisEncode); \\\n\ + write_fun(tensorOut, coordOut, din1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void poolwithargmax_I16to_F16_U8\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0);\n\ + POOLWITHARGMAX_I16_TO_F16_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void poolwithargmax_I16to_F16_U8_2D\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1);\n\ + POOLWITHARGMAX_I16_TO_F16_U8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +"; /* end of poolwithargmax_I16_vx*/ + +static const char poolwithargmax_I8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int input_ZP;\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform VXC_512Bits uniPackHalf8_2x8;\n\ +_viv_uniform VXC_512Bits uniU8EvenBinSubZP_MulM_2x8;\n\ +_viv_uniform VXC_512Bits uniS16AddOutZP_2x8;\n\ +_viv_uniform vxc_uint4 packed_outputZP;\n\ +_viv_uniform VXC_512Bits poolingEncodeInt8_0;\n\ +_viv_uniform VXC_512Bits poolingEncodeInt8_1;\n\ +\n\ +#define POOLWITHARGMAX_I8_TO_I8_U8_PROCESS(read_fun, write_fun) \\\n\ + vxc_char16 din0, din1; \\\n\ + vxc_char16 maxDataVer, maxDataVer1; \\\n\ + int4 bitExtractCoeff; \\\n\ + vxc_char16 din0EqualTmp, din1EqualTmp; \\\n\ + vxc_uchar8 axisEncode; \\\n\ + vxc_uchar8 axisOut; \\\n\ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ + 
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_VertMax3_Integer(maxDataVer, din0, din1, din1, VXC_MODIFIER_BIN(0, 15, 0)); \\\n\ + maxDataVer1 = maxDataVer.s1032547698badcfe; \\\n\ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 15, 0)); \\\n\ + vxc_short8 tmp; \\\n\ + short zp = input_ZP; \\\n\ + VXC_DP2x8(tmp, maxDataVer, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8EvenBinSubZP_MulM_2x8); \\\n\ + vxc_char16 packed_outZP; \\\n\ + _viv_asm(COPY, packed_outZP, packed_outputZP, 16); \\\n\ + VXC_DP2x8(maxDataVer1, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniS16AddOutZP_2x8); \\\n\ + write_fun(tensorOut, coordOut, maxDataVer1,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_Clamp(din0EqualTmp, din0, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + VXC_Clamp(din1EqualTmp, din1, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + din0EqualTmp &= (vxc_char16)(1); \\\n\ + din1EqualTmp &= (vxc_char16)(1); \\\n\ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 3, 0), poolingEncodeInt8_0); \\\n\ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(4, 7, 0), poolingEncodeInt8_1); \\\n\ + axisOut = clz(axisEncode); \\\n\ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void poolwithargmax_I8to_I8_U8\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0);\n\ + POOLWITHARGMAX_I8_TO_I8_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void poolwithargmax_I8to_I8_U8_2D\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1);\n\ + POOLWITHARGMAX_I8_TO_I8_U8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +#define POOLWITHARGMAX_I8_TO_I8_U8_SAME_PROCESS(read_fun, write_fun) \\\n\ + vxc_char16 din0, din1; \\\n\ + vxc_char16 maxDataVer, maxDataVer1; \\\n\ + int4 bitExtractCoeff; \\\n\ + vxc_char16 din0EqualTmp, din1EqualTmp; \\\n\ + vxc_uchar8 axisEncode; \\\n\ + vxc_uchar8 axisOut; \\\n\ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_VertMax3_Integer(maxDataVer, din0, din1, din1, VXC_MODIFIER_BIN(0, 15, 0)); \\\n\ + maxDataVer1 = maxDataVer.s1032547698badcfe; \\\n\ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 15, 0)); \\\n\ + maxDataVer1 = maxDataVer.s02468ace02468ace; \\\n\ + write_fun(tensorOut, coordOut, maxDataVer1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_Clamp(din0EqualTmp, din0, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + VXC_Clamp(din1EqualTmp, din1, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + din0EqualTmp &= (vxc_char16)(1); \\\n\ + din1EqualTmp &= (vxc_char16)(1); \\\n\ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 3, 0), poolingEncodeInt8_0); \\\n\ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(4, 7, 0), 
poolingEncodeInt8_1); \\\n\ + axisOut = clz(axisEncode); \\\n\ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void poolwithargmax_I8to_I8_U8_SAME\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0);\n\ + POOLWITHARGMAX_I8_TO_I8_U8_SAME_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void poolwithargmax_I8to_I8_U8_SAME_2D\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1);\n\ + POOLWITHARGMAX_I8_TO_I8_U8_SAME_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertEvenU8ToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertEvenU8SubZpToFp32_4x4;\n\ +\n\ +#define POOLWITHARGMAX_I8_TO_F16_U8_PROCESS(read_fun, write_fun) \\\n\ + vxc_char16 din0, din1; \\\n\ + vxc_char16 maxDataVer, maxDataVer1; \\\n\ + vxc_char16 din0EqualTmp, din1EqualTmp; \\\n\ + vxc_uchar8 axisEncode; \\\n\ + vxc_uchar8 axisOut; \\\n\ + vxc_short8 result; \\\n\ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_VertMax3_Integer(maxDataVer, din0, din1, din1, VXC_MODIFIER_BIN(0, 15, 0)); \\\n\ + maxDataVer1 = maxDataVer.s1032547698badcfe; \\\n\ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 15, 0)); \\\n\ + vxc_float4 tmpVal0, tmpVal1, tmpVal2, tmpVal3; \\\n\ + half4 tmpOut0, tmpOut1; \\\n\ + vxc_half8 tmpPack; \\\n\ + short zp = input_ZP; \\\n\ + VXC_DP4x4(tmpVal0, maxDataVer, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEvenU8ToFp32_4x4); \\\n\ + VXC_DP4x4(tmpVal2, maxDataVer, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEvenU8SubZpToFp32_4x4); \\\n\ + tmpVal1 = tmpVal0 * inputScale; \\\n\ + _viv_asm(CONV, tmpOut0, tmpVal1); \\\n\ + tmpVal3 = tmpVal2 * inputScale; \\\n\ + _viv_asm(CONV, tmpOut1, tmpVal3); \\\n\ + VXC_DP2x8(tmpPack, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniPackHalf8_2x8); \\\n\ + _viv_asm(COPY, result, tmpPack, 16); \\\n\ + write_fun(tensorOut, coordOut, result,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_Clamp(din0EqualTmp, din0, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + VXC_Clamp(din1EqualTmp, din1, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + din0EqualTmp &= (vxc_char16)(1); \\\n\ + din1EqualTmp &= (vxc_char16)(1); \\\n\ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 3, 0), poolingEncodeInt8_0); \\\n\ + VXC_DP4x4(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(4, 7, 0), poolingEncodeInt8_1); \\\n\ + axisOut = clz(axisEncode); \\\n\ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void poolwithargmax_I8to_F16_U8\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 
1, coord.z, 0);\n\ + POOLWITHARGMAX_I8_TO_F16_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void poolwithargmax_I8to_F16_U8_2D\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1);\n\ + POOLWITHARGMAX_I8_TO_F16_U8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +"; /* end of poolwithargmax_I8_vx*/ + +static const char poolwithargmax_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int input_ZP;\n\ +_viv_uniform VXC_512Bits uniU8EvenBinSubZP_MulM_2x8;\n\ +_viv_uniform VXC_512Bits uniEncodeUint8_4x8;\n\ +_viv_uniform VXC_512Bits uniS16AddOutZP_2x8;\n\ +_viv_uniform vxc_uint4 packed_outputZP;\n\ +\n\ +\n\ +#define POOLWITHARGMAX_U8_TO_U8_U8_PROCESS(read_fun, write_fun) \\\n\ + vxc_uchar16 din0, din1; \\\n\ + vxc_uchar16 maxDataVer, maxDataVer1; \\\n\ + vxc_uchar16 din0EqualTmp, din1EqualTmp; \\\n\ + vxc_uchar8 axisEncode; \\\n\ + vxc_uchar8 axisOut; \\\n\ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + maxDataVer = max(din0, din1); \\\n\ + maxDataVer1 = maxDataVer.s1032547698badcfe; \\\n\ + maxDataVer = max(maxDataVer1, maxDataVer); \\\n\ + vxc_short8 tmp; \\\n\ + uchar zp = input_ZP; \\\n\ + VXC_DP2x8(tmp, maxDataVer, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8EvenBinSubZP_MulM_2x8); \\\n\ + vxc_uchar16 packed_outZP; \\\n\ + _viv_asm(COPY, packed_outZP, packed_outputZP, 16); \\\n\ + VXC_DP2x8(maxDataVer1, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniS16AddOutZP_2x8); \\\n\ + write_fun(tensorOut, coordOut, maxDataVer1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_Clamp(din0EqualTmp, din0, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + VXC_Clamp(din1EqualTmp, din1, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + din0EqualTmp &= (vxc_uchar16)(1); \\\n\ + din1EqualTmp &= (vxc_uchar16)(1); \\\n\ + VXC_DP4x8(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 7, 0), uniEncodeUint8_4x8); \\\n\ + axisOut = clz(axisEncode); \\\n\ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void poolwithargmax_U8to_U8_U8\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0);\n\ + POOLWITHARGMAX_U8_TO_U8_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void poolwithargmax_U8to_U8_U8_2D\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1);\n\ + POOLWITHARGMAX_U8_TO_U8_U8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform VXC_512Bits uniConvertUint8ToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSubZpUint8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniPackHalf2Short_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractHalf2Short_2x8;\n\ +_viv_uniform VXC_512Bits uniPackHalf8_2x8;\n\ +_viv_uniform VXC_512Bits 
uniConvertEvenU8ToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertEvenU8SubZpToFp32_4x4;\n\ +\n\ +#define POOLWITHARGMAX_U8_TO_F16_U8_PROCESS(read_fun, write_fun) \\\n\ + vxc_uchar16 din0, din1; \\\n\ + vxc_uchar16 maxDataVer, maxDataVer1; \\\n\ + int4 bitExtractCoeff; \\\n\ + vxc_uchar16 din0EqualTmp, din1EqualTmp; \\\n\ + vxc_uchar8 axisEncode; \\\n\ + vxc_uchar8 axisOut; \\\n\ + vxc_short8 result; \\\n\ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_VertMax3_Integer(maxDataVer, din0, din1, din1, VXC_MODIFIER_BIN(0, 15, 0)); \\\n\ + maxDataVer1 = maxDataVer.s1032547698badcfe; \\\n\ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer,\\\n\ + maxDataVer, VXC_MODIFIER_BIN(0, 15, 0)); \\\n\ + vxc_float4 tmpVal0, tmpVal1, tmpVal2, tmpVal3; \\\n\ + half4 tmpOut0, tmpOut1; \\\n\ + vxc_half8 tmpPack; \\\n\ + vxc_short4 tmpOut2, tmpOut3; \\\n\ + uchar zp = input_ZP; \\\n\ + VXC_DP4x4(tmpVal0, maxDataVer, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEvenU8ToFp32_4x4); \\\n\ + VXC_DP4x4(tmpVal2, maxDataVer, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEvenU8SubZpToFp32_4x4); \\\n\ + tmpVal1 = tmpVal0 * inputScale; \\\n\ + _viv_asm(CONV, tmpOut0, tmpVal1); \\\n\ + tmpVal3 = tmpVal2 * inputScale; \\\n\ + _viv_asm(CONV, tmpOut1, tmpVal3); \\\n\ + VXC_DP2x8(tmpPack, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniPackHalf8_2x8); \\\n\ + _viv_asm(COPY, result, tmpPack, 16); \\\n\ + write_fun(tensorOut, coordOut, result, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_Clamp(din0EqualTmp, din0, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + VXC_Clamp(din1EqualTmp, din1, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + din0EqualTmp &= (vxc_uchar16)(1); \\\n\ + din1EqualTmp &= (vxc_uchar16)(1); \\\n\ + VXC_DP4x8(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 7, 0), uniEncodeUint8_4x8); \\\n\ + axisOut = clz(axisEncode); \\\n\ + write_fun(axis, coordOut, axisOut, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void poolwithargmax_U8to_F16_U8\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0);\n\ + POOLWITHARGMAX_U8_TO_F16_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void poolwithargmax_U8to_F16_U8_2D\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1);\n\ + POOLWITHARGMAX_U8_TO_F16_U8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +\n\ +#define POOLWITHARGMAX_U8_TO_F16_I16_PROCESS(read_fun, write_fun) \\\n\ + vxc_uchar16 din0, din1; \\\n\ + vxc_uchar16 maxDataVer, maxDataVer1; \\\n\ + int4 bitExtractCoeff; \\\n\ + vxc_uchar16 din0EqualTmp, din1EqualTmp; \\\n\ + vxc_uchar8 axisEncode; \\\n\ + vxc_uchar8 axisOut; \\\n\ + vxc_short8 result, axisResult; \\\n\ + read_fun(din0, tensorIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(din1, tensorIn, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ + 
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_VertMax3_Integer(maxDataVer, din0, din1, din1, VXC_MODIFIER_BIN(0, 15, 0)); \\\n\ + maxDataVer1 = maxDataVer.s1032547698badcfe; \\\n\ + VXC_VertMax3_Integer(maxDataVer, maxDataVer1, maxDataVer, maxDataVer, VXC_MODIFIER_BIN(0, 15, 0)); \\\n\ + maxDataVer1 = maxDataVer.s02468ace02468ace; \\\n\ + vxc_float4 tmpVal0, tmpVal1, tmpVal2, tmpVal3; \\\n\ + half4 tmpOut0, tmpOut1; \\\n\ + vxc_half8 tmpPack; \\\n\ + vxc_short4 tmpOut2, tmpOut3; \\\n\ + uchar zp = input_ZP; \\\n\ + VXC_DP4x4(tmpVal0, maxDataVer1, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertUint8ToFp32_4x4); \\\n\ + VXC_DP4x4(tmpVal2, maxDataVer1, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSubZpUint8Fp32_4x4); \\\n\ + tmpVal1 = tmpVal0 * inputScale; \\\n\ + _viv_asm(CONV, tmpOut0, tmpVal1); \\\n\ + tmpVal3 = tmpVal2 * inputScale; \\\n\ + _viv_asm(CONV, tmpOut1, tmpVal3); \\\n\ + VXC_DP2x8(tmpPack, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniPackHalf8_2x8); \\\n\ + _viv_asm(COPY, result, tmpPack, 16); \\\n\ + VXC_Clamp(din0EqualTmp, din0, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + VXC_Clamp(din1EqualTmp, din1, maxDataVer, maxDataVer, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + din0EqualTmp &= (vxc_uchar16)(1); \\\n\ + din1EqualTmp &= (vxc_uchar16)(1); \\\n\ + VXC_DP4x8(axisEncode, din0EqualTmp, din1EqualTmp, VXC_MODIFIER_BIN(0, 7, 0), uniEncodeUint8_4x8); \\\n\ + axisOut = clz(axisEncode); \\\n\ + _viv_asm(CONV, axisResult, axisOut); \\\n\ + write_fun(tensorOut, coordOut, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + write_fun(axis, coordOut, axisResult, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void poolwithargmax_U8to_F16_I16\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x >> 1, coord.y >> 1, coord.z, 0);\n\ + POOLWITHARGMAX_U8_TO_F16_I16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void poolwithargmax_U8to_F16_I16_2D\n\ + (\n\ + image2d_array_t tensorIn,\n\ + image2d_array_t tensorOut,\n\ + image2d_array_t axis\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x >> 1, coord.y >> 1);\n\ + POOLWITHARGMAX_U8_TO_F16_I16_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +"; /* end of poolwithargmax_U8_vx*/ + +static const char pow_fp16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4_2;\n\ +_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4_2;\n\ +\n\ +_viv_uniform int input_ZP1;\n\ +\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float outputScale;\n\ +\n\ +__kernel void pow_F16F16toF16(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1;\n\ + vxc_short8 dst;\n\ + vxc_half8 data0, data1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, 
src0, 16);\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16F16toF16_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1;\n\ + vxc_short8 dst;\n\ + vxc_half8 data0, data1;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16F16toU8(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1;\n\ + vxc_uchar8 dst;\n\ + vxc_half8 data0, data1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16F16toU8_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1;\n\ + vxc_uchar8 dst;\n\ + vxc_half8 data0, data1;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? 
-1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16U8toF16(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0;\n\ + vxc_uchar8 src1;\n\ + vxc_short8 dst;\n\ + vxc_half8 data0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + short in1_zp;\n\ + _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ +\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(data0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, dst, data0, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16U8toF16_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src0;\n\ + vxc_uchar8 src1;\n\ + vxc_short8 dst;\n\ + vxc_half8 data0;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + short in1_zp;\n\ + _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ +\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(data0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, dst, data0, 16);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16U8toU8(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0;\n\ + vxc_uchar8 src1, dst;\n\ + vxc_half8 data0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + short in1_zp;\n\ + _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ +\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16U8toU8_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src0;\n\ + vxc_uchar8 src1, dst;\n\ + vxc_half8 data0;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + short in1_zp;\n\ + _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ +\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pow_fp16_vx*/ + +static const char pow_fp16_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2;\n\ +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2;\n\ +\n\ +_viv_uniform float outScale_fl;\n\ +\n\ +__kernel void pow_F16F16toI16(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_half8 data0, data1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16F16toI16_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_half8 data0, data1;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16I16toF16(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_half8 data0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16I16toF16_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_half8 data0;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16I16toI16(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_half8 data0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16I16toI16_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_half8 data0;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +__kernel void pow_BF16BF16toBF16(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_ushort8 src0, src1, dst, tmpData;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ +\n\ + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, x0, tmpData, 16);\n\ + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, x1, tmpData, 16);\n\ +\n\ + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, y0, tmpData, 16);\n\ + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), 
uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, y1, tmpData, 16);\n\ +\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + _viv_asm(COPY, src0, tmpDst0, 16);\n\ + _viv_asm(COPY, src1, tmpDst1, 16);\n\ + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_BF16BF16toBF16_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_ushort8 src0, src1, dst, tmpData;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ +\n\ + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, x0, tmpData, 16);\n\ + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, x1, tmpData, 16);\n\ +\n\ + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, y0, tmpData, 16);\n\ + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, y1, tmpData, 16);\n\ +\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + _viv_asm(COPY, src0, tmpDst0, 16);\n\ + _viv_asm(COPY, src1, tmpDst1, 16);\n\ + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pow_fp16_i16_vx*/ + +static const char pow_fp16_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2;\n\ +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform float outScale_fl;\n\ +\n\ +__kernel void pow_F16F16toI8(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1;\n\ + vxc_char8 dst;\n\ + vxc_half8 data0, data1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16F16toI8_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1;\n\ + vxc_char8 dst;\n\ + vxc_half8 data0, data1;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16I8toF16(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, dst;\n\ + vxc_char8 src1;\n\ + vxc_half8 data0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? 
convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16I8toF16_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src0, dst;\n\ + vxc_char8 src1;\n\ + vxc_half8 data0;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16I8toI8(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0;\n\ + vxc_char8 src1, dst;\n\ + vxc_half8 data0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_F16I8toI8_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src0;\n\ + vxc_char8 src1, dst;\n\ + vxc_half8 data0;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, src0, 16);\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pow_fp16_i8_vx*/ + +static const char pow_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2;\n\ +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform float outScale_fl;\n\ +\n\ +__kernel void pow_I16F16toF16(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_half8 data1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_I16F16toF16_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_half8 data1;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_I16F16toI16(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_half8 data1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_I16F16toI16_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_half8 data1;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_I16I16toI16(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_I16I16toI16_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pow_i16_vx*/ + +static const char pow_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2;\n\ +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform float outScale_fl;\n\ +\n\ +__kernel void pow_I8F16toF16(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_char8 src0;\n\ + vxc_short8 src1, dst;\n\ + vxc_half8 data1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_I8F16toF16_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_char8 src0;\n\ + vxc_short8 src1, dst;\n\ + vxc_half8 data1;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_I8F16toI8(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_char8 src0, dst;\n\ + vxc_short8 src1;\n\ + vxc_half8 data1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_I8F16toI8_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_char8 src0, dst;\n\ + vxc_short8 src1;\n\ + vxc_half8 data1;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_I8I8toI8(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_char8 src0, src1, dst;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_I8I8toI8_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_char8 src0, src1, dst;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pow_i8_vx*/ + +static const char pow_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2;\n\ +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4_2;\n\ +_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4_2;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +\n\ +_viv_uniform int input_ZP0;\n\ +_viv_uniform int input_ZP1;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float outputScale;\n\ +\n\ +__kernel void pow_U8F16toF16(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar8 src0;\n\ + vxc_short8 src1;\n\ + vxc_short8 dst;\n\ + vxc_half8 data1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + short in0_zp;\n\ + _viv_asm(COPY, in0_zp, 
input_ZP0, 4);\n\ + VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ +\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_U8F16toF16_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_uchar8 src0;\n\ + vxc_short8 src1;\n\ + vxc_short8 dst;\n\ + vxc_half8 data1;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + short in0_zp;\n\ + _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ + VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ +\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_U8F16toU8(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar8 src0;\n\ + vxc_short8 src1;\n\ + vxc_uchar8 dst;\n\ + vxc_half8 data1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + short in0_zp;\n\ + _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ + VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ +\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_U8F16toU8_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_uchar8 src0;\n\ + vxc_short8 src1;\n\ + vxc_uchar8 dst;\n\ + vxc_half8 data1;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, src1, 16);\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + short in0_zp;\n\ + _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ + VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ +\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_U8U8toU8(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar8 src0, src1, dst;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + short in0_zp, in1_zp;\n\ + _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ + _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ + VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ +\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_U8U8toU8_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_uchar8 src0, src1, dst;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + short in0_zp, in1_zp;\n\ + _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ + _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ + VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ +\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ + int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_U8U8toF16(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar8 src0;\n\ + vxc_uchar8 src1;\n\ + vxc_short8 dst;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + short in0_zp, in1_zp;\n\ + _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ + _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ + VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ +\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + vxc_half8 tmpVal;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(tmpVal, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pow_U8U8toF16_2D(\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_uchar8 src0;\n\ + vxc_uchar8 src1;\n\ + vxc_short8 dst;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 x0, x1;\n\ + float4 y0, y1;\n\ + float4 tmpDst0, tmpDst1;\n\ + short in0_zp, in1_zp;\n\ + _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ + _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ + VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ +\n\ + float4 s0 = sign(x0);\n\ + float4 s1 = sign(x1);\n\ + int4 t0 = convert_int4(y0) & 1;\n\ + int4 t1 = convert_int4(y1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + vxc_half8 tmpVal;\n\ + _viv_asm(CONV, tmpVal0, tmpDst0);\n\ + _viv_asm(CONV, tmpVal1, tmpDst1);\n\ + VXC_DP2x8(tmpVal, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pow_u8_vx*/ + +static const char pre_process_bgra_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinearTmp1Bgra_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp2Bgra_4x4;\n\ +_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractInt32BgraToU8_2x8;\n\ +_viv_uniform VXC_512Bits uniExchangeBgra_2x8;\n\ +_viv_uniform VXC_512Bits uniExchangeBgra2_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinearTmp1BgraShort_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp2BgraShort_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp3BgraShort_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp4BgraShort_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp5BgraShort_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp6BgraShort_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp7BgraShort_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp8BgraShort_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtractBfromBgra_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractGfromBgra_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractRfromBgra_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +_viv_uniform int zp;\n\ +_viv_uniform float outputScale;\n\ +\n\ +__kernel void pre_process_bgra_scale_U8toU8(\n\ + __read_only image2d_array_t input, __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + int4 gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + gidx += (int4)(0, 1, 2, 3);\n\ +\n\ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ + int4 sx = fx & 0xffff8000; // Floor\n\ + int fy, sy;\n\ + fx -= sx;\n\ + sx = sx >> 15;\n\ + fx = (fx +(1 << 4)) >> 5;\n\ +\n\ + // for y\n\ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ + sy = fy & 0xffff8000; // Floor\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + sy = sy < 0 ? 0 : sy;\n\ + fy = fy < 0 ? 
0 : fy;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ + sx = (sx + (*xOffset)) * 4 ;\n\ + sy += (*yOffset);\n\ + int4 srcPos = (int4)(sx.x, sy, sy + 1, sx.y);\n\ + vxc_uchar16 lineBGRA0, lineBGRA1, lineBGRA2, lineBGRA3;\n\ + vxc_uchar16 dataB, dataG, dataR;\n\ +\n\ + VXC_ReadImage(lineBGRA0, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(lineBGRA0, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(lineBGRA1, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(lineBGRA1, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.z;\n\ + srcPos.w = sx.w;\n\ +\n\ + VXC_ReadImage(lineBGRA2, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(lineBGRA2, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(lineBGRA3, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(lineBGRA3, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar4 val_u8;\n\ + int4 tmp1, tmp2, result1, result2;\n\ + float4 tmpDst, tmp0;\n\ + float4 mean = (float4)(bMean, gMean, rMean, 0);\n\ + //tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x);\n\ + int tmpV = 1 << 19;\n\ + vxc_short8 tmpFx;\n\ + VXC_DP2x8(tmpFx, fx, fx, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + //tmpFx = fx.xxxx;\n\ + VXC_DP4x4(tmp1, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1BgraShort_4x4);\n\ + VXC_DP4x4(tmp2, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2BgraShort_4x4);\n\ + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniConvertIntergetoF32_4x4);\n\ + tmpDst = (tmp0 - mean) * var;\n\ + result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\ +\n\ + //tmpFx = fx.yyyy;\n\ + VXC_DP4x4(tmp1, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3BgraShort_4x4);\n\ + VXC_DP4x4(tmp2, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4BgraShort_4x4);\n\ + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniConvertIntergetoF32_4x4);\n\ + tmpDst = (tmp0 - mean) * var;\n\ + result2 = convert_int4_rte(tmpDst * outputScale + zp);\n\ +\n\ + vxc_uchar16 dst, data;\n\ + VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractInt32BgraToU8_2x8);\n\ +\n\ + //tmpFx = fx.zzzz;\n\ + VXC_DP4x4(tmp1, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp5BgraShort_4x4);\n\ + VXC_DP4x4(tmp2, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp6BgraShort_4x4);\n\ + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), 
\\\n\ + uniConvertIntergetoF32_4x4);\n\ + tmpDst = (tmp0 - mean) * var;\n\ + result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\ +\n\ + //tmpFx = fx.wwww;\n\ + VXC_DP4x4(tmp1, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp7BgraShort_4x4);\n\ + VXC_DP4x4(tmp2, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp8BgraShort_4x4);\n\ + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniConvertIntergetoF32_4x4);\n\ + tmpDst = (tmp0 - mean) * var;\n\ + result2 = convert_int4_rte(tmpDst * outputScale + zp);\n\ +\n\ + VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtractInt32BgraToU8_2x8);\n\ +\n\ + VXC_DP2x8(data, dst, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExchangeBgra_2x8);\n\ + VXC_DP2x8(data, dst, dst, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniExchangeBgra2_2x8);\n\ +\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ + dstPos.z = bOrder;\n\ + VXC_WriteImage2DArray(output, dstPos, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + dstPos.z = 1;\n\ + VXC_WriteImage2DArray(output, dstPos, data.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + dstPos.z = rOrder;\n\ + VXC_WriteImage2DArray(output, dstPos, data.s89ab, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pre_process_bgra_copy_U8toU8(\n\ + __read_only image2d_array_t input, __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + int2 pos = (int2)((get_global_id(0) + (*xOffset)) << 2, get_global_id(1) + (*yOffset));\n\ +\n\ + vxc_uchar16 lineBGRA0;\n\ + float4 tmpB, tmpG, tmpR;\n\ + float4 tmpDst;\n\ + int4 result1, result2;\n\ + vxc_uchar16 dst;\n\ +\n\ + VXC_ReadImage(lineBGRA0, input, pos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpB, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBfromBgra_4x4);\n\ + VXC_DP4x4(tmpG, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGfromBgra_4x4);\n\ + VXC_DP4x4(tmpR, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRfromBgra_4x4);\n\ +\n\ + tmpDst = (tmpB - bMean) * var;\n\ + result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\ +\n\ + tmpDst = (tmpG - gMean) * var;\n\ + result2 = convert_int4_rte(tmpDst * outputScale + zp);\n\ + VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ +\n\ + int4 dstPos = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + dstPos.z = bOrder;\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + dstPos.z = 1;\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + tmpDst = (tmpR - rMean) * var;\n\ + result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\ + VXC_DP2x8(dst, result1, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ +\n\ + dstPos.z = rOrder;\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of pre_process_bgra_vx*/ + +static const char 
pre_process_bgra_trans_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinearTmp1BgraShort_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp2BgraShort_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp3BgraShort_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp4BgraShort_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp5BgraShort_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp6BgraShort_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp7BgraShort_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp8BgraShort_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractInt32BgraToU8Bgr_2x8;\n\ +\n\ +_viv_uniform int zp;\n\ +_viv_uniform float outputScale;\n\ +\n\ +__kernel void pre_process_bgra_scale_nhwc_U8toU8(\n\ + __read_only image2d_array_t input, __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + int4 gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + gidx += (int4)(0, 1, 2, 3);\n\ +\n\ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ + int4 sx = fx & 0xffff8000; // Floor\n\ + int fy, sy;\n\ + fx -= sx;\n\ + sx = sx >> 15;\n\ + fx = (fx +(1 << 4)) >> 5;\n\ +\n\ + // for y\n\ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ + sy = fy & 0xffff8000; // Floor\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + sy = sy < 0 ? 0 : sy;\n\ + fy = fy < 0 ? 0 : fy;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ + sx = (sx + (*xOffset)) * 4 ;\n\ + sy += (*yOffset);\n\ + int4 srcPos = (int4)(sx.x, sy, sy + 1, sx.y);\n\ + vxc_uchar16 lineBGRA0, lineBGRA1, lineBGRA2, lineBGRA3;\n\ + vxc_uchar16 dataB, dataG, dataR;\n\ +\n\ + VXC_ReadImage(lineBGRA0, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(lineBGRA0, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(lineBGRA1, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(lineBGRA1, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.z;\n\ + srcPos.w = sx.w;\n\ +\n\ + VXC_ReadImage(lineBGRA2, input, srcPos.xy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(lineBGRA2, input, srcPos.xz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(lineBGRA3, input, srcPos.wy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(lineBGRA3, input, srcPos.wz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar4 val_u8;\n\ + int4 tmp1, tmp2, result1, result2;\n\ + float4 tmpDst, tmp0;\n\ + float4 mean = (float4)(bMean, gMean, rMean, 0);\n\ + //tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x);\n\ + int tmpV = 1 << 19;\n\ + vxc_short8 tmpFx;\n\ + VXC_DP2x8(tmpFx, fx, fx, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ + //tmpFx = fx.xxxx;\n\ + VXC_DP4x4(tmp1, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniBilinearTmp1BgraShort_4x4);\n\ + VXC_DP4x4(tmp2, lineBGRA0, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ 
+ uniBilinearTmp2BgraShort_4x4);\n\ + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniConvertIntergetoF32_4x4);\n\ + tmpDst = (tmp0 - mean) * var;\n\ + result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\ +\n\ + //tmpFx = fx.yyyy;\n\ + VXC_DP4x4(tmp1, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3BgraShort_4x4);\n\ + VXC_DP4x4(tmp2, lineBGRA1, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4BgraShort_4x4);\n\ + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniConvertIntergetoF32_4x4);\n\ + tmpDst = (tmp0 - mean) * var;\n\ + result2 = convert_int4_rte(tmpDst * outputScale + zp);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1),\n\ + uniExtractInt32BgraToU8Bgr_2x8);\n\ +\n\ + //tmpFx = fx.zzzz;\n\ + VXC_DP4x4(tmp1, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp5BgraShort_4x4);\n\ + VXC_DP4x4(tmp2, lineBGRA2, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp6BgraShort_4x4);\n\ + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniConvertIntergetoF32_4x4);\n\ + tmpDst = (tmp0 - mean) * var;\n\ + result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\ +\n\ + //tmpFx = fx.wwww;\n\ + VXC_DP4x4(tmp1, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp7BgraShort_4x4);\n\ + VXC_DP4x4(tmp2, lineBGRA3, tmpFx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp8BgraShort_4x4);\n\ + tmp1 = fy * (tmp2 - tmp1) + (tmp1 << 10);\n\ + VXC_DP4x4(val_u8, tmp1, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + VXC_DP4x4(tmp0, val_u8, val_u8, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniConvertIntergetoF32_4x4);\n\ + tmpDst = (tmp0 - mean) * var;\n\ + result2 = convert_int4_rte(tmpDst * outputScale + zp);\n\ +\n\ + VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(6, 11, 0, VXC_RM_ToNearestEven, 1),\n\ + uniExtractInt32BgraToU8Bgr_2x8);\n\ +\n\ + int4 dstPos = (int4)(get_global_id(0) * 3, gidy, 0, 0);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of pre_process_bgra_trans_vx*/ + +static const char pre_process_gray_vx[] = "/*\n\ + ============================================================================\n\ + Name : GrayScale.vx\n\ + Author : Sam\n\ + Version :\n\ + Copyright : Your copyright notice\n\ + Description :\n\ + ============================================================================\n\ + */\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniVecShift10;\n\ +_viv_uniform VXC_512Bits uniAddRShift;\n\ +_viv_uniform VXC_512Bits uniGetTempVal;\n\ +_viv_uniform VXC_512Bits uniExtractBytes;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtactInteger_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniDataMulAlpha_4x4;\n\ +_viv_uniform VXC_512Bits uniDataSubMean_4x4;\n\ +\n\ 
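+// The pre_process_gray_scale_* kernels below resample the input with\n\
+// fixed-point bilinear interpolation, normalize each pixel as\n\
+// (value - mean) * f32Var, and re-quantize integer outputs with\n\
+// outputScale / outputZP.\n\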
+_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +\n\ +__kernel void pre_process_gray_scale_U8toF16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ +\n\ + int4 xPos = get_global_id(0);\n\ + int yPos = get_global_id(1);\n\ +\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ + xPos += (int4)(0, 1, 2, 3);\n\ +\n\ + //x\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ + int4 sx = fx0 & 0xffff8000;\n\ + fx0 -= sx;\n\ + sx = sx >> 15;\n\ +\n\ + vxc_short4 fx;\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ + //y\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ + int sy = fy & 0xffff8000; // Floor\n\ +\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ +\n\ + //R\n\ + vxc_uchar16 line0Y;\n\ + vxc_uchar16 line1Y;\n\ + int4 coord;\n\ + sx = sx + *xOffset;\n\ + coord.xyz = sx.xyz;\n\ + coord.w = sy + *yOffset;\n\ + int2 coord1 = (int2)(sx.w, coord.w);\n\ + VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float grayMean = mean;\n\ +\n\ + int4 test01, temp1;\n\ + int4 test02, temp2;\n\ + int4 tt;\n\ + vxc_uchar4 val;\n\ + int2 coord_out = (int2)(xPos.x, yPos);\n\ +\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniVecShift10);\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniVecShift10);\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1),\n\ + uniExtractBytes);\n\ +\n\ + //convert U8 to FP16\n\ + half f16mean;\n\ + half f16alpha;\n\ + vxc_half4 dst;\n\ + vxc_short4 tmp_dst;\n\ + _viv_asm(CONV, f16mean, grayMean);\n\ + _viv_asm(CONV, f16alpha, f32Var);\n\ + VXC_DP4x4(dst, val, f16mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniDataSubMean_4x4);\n\ + VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniDataMulAlpha_4x4);\n\ + _viv_asm(COPY, tmp_dst, dst, 8);\n\ + VXC_WriteImage(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pre_process_gray_scale_U8toI16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int 
*xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ +\n\ + int4 xPos = get_global_id(0);\n\ + int yPos = get_global_id(1);\n\ +\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ + xPos += (int4)(0, 1, 2, 3);\n\ +\n\ + //x\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ + int4 sx = fx0 & 0xffff8000;\n\ + fx0 -= sx;\n\ + sx = sx >> 15;\n\ +\n\ + vxc_short4 fx;\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniAddRShift);\n\ + //y\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ + int sy = fy & 0xffff8000; // Floor\n\ +\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ +\n\ + vxc_uchar16 line0Y;\n\ + vxc_uchar16 line1Y;\n\ + int4 coord;\n\ + sx = sx + *xOffset;\n\ + coord.xyz = sx.xyz;\n\ + coord.w = sy + *yOffset;\n\ + int2 coord1 = (int2)(sx.w, coord.w);\n\ + VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float grayMean = mean * f32Var;\n\ +\n\ + int4 test01, temp1;\n\ + int4 test02, temp2;\n\ + int4 tt;\n\ + vxc_uchar4 val;\n\ + int2 coord_out = (int2)(xPos.x, yPos);\n\ +\n\ + vxc_uchar8 line1, line2;\n\ +\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniVecShift10);\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniVecShift10);\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + vxc_float4 tmp_dst;\n\ + vxc_uchar4 u8_dst;\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1),\n\ + uniExtractBytes);\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertIntergetoF32_4x4);\n\ +\n\ + //convert U8 to dfp8\n\ + int4 dst0;\n\ + vxc_short4 dst;\n\ + tmp_dst = tmp_dst * f32Var - grayMean;\n\ + tmp_dst = tmp_dst * outputScale + outputZP;\n\ + dst0 = convert_int4_rte(tmp_dst);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\n\ + uniExtactInteger_2x8);\n\ +\n\ + VXC_WriteImage(output, coord_out, dst,\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define PRE_PROCESS_GRAY_SCALE_8BITS(dst_type_name, write_type) \\\n\ +__kernel void pre_process_gray_scale_U8to##dst_type_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int 
*xOffset, \\\n\ + global int *yOffset, \\\n\ + float mean, \\\n\ + float f32Var \\\n\ + ) \\\n\ +{ \\\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + \\\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ + xPos += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ + int4 sx = fx0 & 0xffff8000; \\\n\ + fx0 -= sx; \\\n\ + sx = sx >> 15; \\\n\ + \\\n\ + vxc_short4 fx; \\\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\ + \\\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ + int sy = fy & 0xffff8000; \\\n\ + \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + \\\n\ + vxc_uchar16 line0Y; \\\n\ + vxc_uchar16 line1Y; \\\n\ + int4 coord; \\\n\ + sx = sx + *xOffset; \\\n\ + coord.xyz = sx.xyz; \\\n\ + coord.w = sy + *yOffset; \\\n\ + int2 coord1 = (int2)(sx.w, coord.w); \\\n\ + VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float grayMean = mean * f32Var; \\\n\ + \\\n\ + int4 test01, temp1; \\\n\ + int4 test02, temp2; \\\n\ + int2 coord_out = (int2)(xPos.x, yPos); \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + vxc_float4 tmp_dst; \\\n\ + vxc_uchar4 u8_dst; \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + int4 dst0; \\\n\ + write_type dst; \\\n\ + tmp_dst = tmp_dst * f32Var - grayMean; \\\n\ + tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtactInteger_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +PRE_PROCESS_GRAY_SCALE_8BITS(U8, vxc_uchar16)\n\ +PRE_PROCESS_GRAY_SCALE_8BITS(I8, vxc_char16)"; /* end of pre_process_gray_vx*/ + +static const char pre_process_gray_copy_vx[] = "/*\n\ + 
============================================================================\n\ + Name : GrayScale.vx\n\ + Author : Sam\n\ + Version :\n\ + Copyright : Your copyright notice\n\ + Description :\n\ + ============================================================================\n\ + */\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ +\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +\n\ +__kernel void pre_process_gray_copy_U8toF16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + coord.xy += (int2) (*xOffset, *yOffset);\n\ + vxc_uchar16 src0;\n\ + vxc_half8 dst0, dst1;\n\ +\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord.x = coord.z + 8;\n\ + float4 paramData = (float4)(mean * f32Var, mean * f32Var, mean * f32Var, f32Var);\n\ + //convert U8 to FP16\n\ + half4 paramData_f16;\n\ + vxc_short8 tmp_dst;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniDataMeanStddevLo_2x8);\n\ + VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniDataMeanStddevHi_2x8);\n\ + _viv_asm(COPY, tmp_dst, dst0, 16);\n\ + VXC_WriteImage(output, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, tmp_dst, dst1, 16);\n\ + VXC_WriteImage(output, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pre_process_gray_copy_U8toI16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + coord.xy += (int2) (*xOffset, *yOffset);\n\ + vxc_uchar16 src0;\n\ + vxc_short8 dst0, dst1;\n\ +\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord.x = coord.z + 8;\n\ +\n\ + f32Var *= outputScale;\n\ + float4 paramData = (float4)(mean * f32Var - outputZP, mean * f32Var - outputZP,\n\ + mean * f32Var - outputZP, f32Var);\n\ + //convert U8 to FP16\n\ + half4 paramData_f16;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ +\n\ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uniDataMeanStddevLo_2x8);\n\ + VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uniDataMeanStddevHi_2x8);\n\ + VXC_WriteImage(output, coord.zw, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.xw, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define PRE_PROCESS_GRAY_COPY_8BITS(dst_type_name, write_type) \\\n\ +__kernel void pre_process_gray_copy_U8to##dst_type_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float mean, \\\n\ + float f32Var 
\\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + coord.xy += (int2) (*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0; \\\n\ + write_type dst; \\\n\ + \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + f32Var *= outputScale; \\\n\ + float4 paramData = (float4)(mean * f32Var - outputZP, mean * f32Var - outputZP, \\\n\ + mean * f32Var - outputZP, f32Var); \\\n\ + \\\n\ + half4 paramData_f16; \\\n\ + _viv_asm(CONV, paramData_f16, paramData); \\\n\ + \\\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\ +PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\ +"; /* end of pre_process_gray_copy_vx*/ + +static const char pre_process_nv12_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +_viv_uniform float outputScaleVar;\n\ +_viv_uniform float bMeanScaleVarZp;\n\ +_viv_uniform float gMeanScaleVarZp;\n\ +_viv_uniform float rMeanScaleVarZp;\n\ +\n\ +_viv_uniform uint xrIntFloat_16;\n\ +_viv_uniform uint yrIntFloat_16;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;\n\ +\n\ +__kernel void pre_process_nv12_scale_U8toI16(\n\ + __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + uint4 gidx = get_global_id(0);\n\ + uint gidy = get_global_id(1);\n\ + gidx += (uint4)(0, 1, 2, 3);\n\ +\n\ + uint dy = (gidy * yrIntFloat_16) >> 16;\n\ + uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ + int sy = convert_int(dy) + (*yOffset);\n\ + int4 sx = convert_int4(dx) + (*xOffset);\n\ + int4 uvX = sx & 0xfffffffe;\n\ + int uvY = sy >> 1;\n\ +\n\ + vxc_uchar16 Y, UV;\n\ + int2 coord = (int2)(sx.x, sy);\n\ + int2 coord_uv = (int2)(uvX.x, uvY);\n\ +\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord.x = sx.y;\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord.x = sx.z;\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord.x = sx.w;\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_uv.x = uvX.y;\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_uv.x = uvX.z;\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_uv.x = uvX.w;\n\ + VXC_ReadImage(UV, 
uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_char16 tmpUV;\n\ + short tmpVal = 128;\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ +\n\ + float4 tmpDstB, tmpDstG, tmpDstR;\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ +\n\ + int4 result;\n\ + vxc_short8 dst;\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ + result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ + dstPos.z = bOrder;\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ + dstPos.z = 1;\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ + dstPos.z = rOrder;\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pre_process_nv12_scale_U8toF16(\n\ + __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + uint4 gidx = get_global_id(0);\n\ + uint gidy = get_global_id(1);\n\ + gidx += (uint4)(0, 1, 2, 3);\n\ +\n\ + uint dy = (gidy * yrIntFloat_16) >> 16;\n\ + uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ + int sy = convert_int(dy) + (*yOffset);\n\ + int4 sx = convert_int4(dx) + (*xOffset);\n\ + int4 uvX = sx & 0xfffffffe;\n\ + int uvY = sy >> 1;\n\ +\n\ + vxc_uchar16 Y, UV;\n\ + int2 coord = (int2)(sx.x, sy);\n\ + int2 coord_uv = (int2)(uvX.x, uvY);\n\ +\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord.x = sx.y;\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord.x = sx.z;\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord.x = sx.w;\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_uv.x = uvX.y;\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_uv.x = uvX.z;\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_uv.x = uvX.w;\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_char16 tmpUV;\n\ + short tmpVal = 128;\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ +\n\ + float4 tmpDstB, tmpDstG, tmpDstR;\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ +\n\ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp;\n\ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp;\n\ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp;\n\ +\n\ + half4 result;\n\ + vxc_half8 tmpdst;\n\ + vxc_short8 dst;\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ + _viv_asm(CONV, result, tmpDstB);\n\ + dstPos.z = bOrder;\n\ + VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpdst, 16);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(CONV, result, tmpDstG);\n\ + dstPos.z = 1;\n\ + VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpdst, 16);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(CONV, result, tmpDstR);\n\ + dstPos.z = rOrder;\n\ + VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpdst, 16);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pre_process_nv12_scale_vx*/ + +static const char pre_process_nv12_scale_8bits_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +_viv_uniform float outputScaleVar;\n\ +_viv_uniform float bMeanScaleVarZp;\n\ +_viv_uniform float gMeanScaleVarZp;\n\ +_viv_uniform float rMeanScaleVarZp;\n\ +\n\ +_viv_uniform uint xrIntFloat_16;\n\ +_viv_uniform uint yrIntFloat_16;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;\n\ +\n\ +__kernel void pre_process_nv12_scale_U8toU8(\n\ + __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + uint4 gidx = get_global_id(0);\n\ + uint gidy = get_global_id(1);\n\ + gidx += (uint4)(0, 1, 2, 3);\n\ +\n\ + uint dy = (gidy * yrIntFloat_16) >> 16;\n\ + uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ + int sy = convert_int(dy) + (*yOffset);\n\ + int4 sx = convert_int4(dx) + (*xOffset);\n\ + int4 uvX = sx & 0xfffffffe;\n\ + int uvY = sy >> 1;\n\ +\n\ + vxc_uchar16 Y, UV;\n\ + int2 coord = (int2)(sx.x, sy);\n\ + int2 coord_uv = (int2)(uvX.x, uvY);\n\ +\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord.x = sx.y;\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord.x = sx.z;\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, 
VXC_RM_TowardZero, 0));\n\ + coord.x = sx.w;\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_uv.x = uvX.y;\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_uv.x = uvX.z;\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_uv.x = uvX.w;\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_char16 tmpUV;\n\ + short tmpVal = 128;\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ +\n\ + float4 tmpDstB, tmpDstG, tmpDstR;\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ +\n\ + int4 result;\n\ + vxc_uchar8 dst;\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ + result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ + dstPos.z = bOrder;\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ + dstPos.z = 1;\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ + dstPos.z = rOrder;\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pre_process_nv12_copy_U8toU8(\n\ + __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ +\n\ + int sy = gidy + (*yOffset);\n\ + int sx = gidx + (*xOffset);\n\ + int uvX = sx & 0xfffffffe;\n\ + int uvY = sy >> 1;\n\ +\n\ + vxc_uchar16 Y, UV;\n\ +\n\ + VXC_ReadImage(Y, y_img, (int2)(sx,sy), VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_char16 tmpUV;\n\ + short tmpVal = 128;\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8);\n\ +\n\ + float4 tmpDstB, tmpDstG, tmpDstR;\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ +\n\ + int4 result;\n\ + 
vxc_uchar8 dst;\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ + result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ + dstPos.z = bOrder;\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ + dstPos.z = 1;\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ + dstPos.z = rOrder;\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pre_process_nv12_scale_U8toI8(\n\ + __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + uint4 gidx = get_global_id(0);\n\ + uint gidy = get_global_id(1);\n\ + gidx += (uint4)(0, 1, 2, 3);\n\ +\n\ + uint dy = (gidy * yrIntFloat_16) >> 16;\n\ + uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ + int sy = convert_int(dy) + (*yOffset);\n\ + int4 sx = convert_int4(dx) + (*xOffset);\n\ + int4 uvX = sx & 0xfffffffe;\n\ + int uvY = sy >> 1;\n\ +\n\ + vxc_uchar16 Y, UV;\n\ + int2 coord = (int2)(sx.x, sy);\n\ + int2 coord_uv = (int2)(uvX.x, uvY);\n\ +\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord.x = sx.y;\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord.x = sx.z;\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord.x = sx.w;\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_uv.x = uvX.y;\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_uv.x = uvX.z;\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_uv.x = uvX.w;\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_char16 tmpUV;\n\ + short tmpVal = 128;\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ +\n\ + float4 tmpDstB, tmpDstG, tmpDstR;\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ +\n\ + int4 result;\n\ + vxc_char8 dst;\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ + result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ + dstPos.z = bOrder;\n\ + VXC_DP2x8(dst, 
result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ + dstPos.z = 1;\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ + dstPos.z = rOrder;\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pre_process_nv12_scale_8bits_vx*/ + +static const char pre_process_nv12_scale_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +_viv_uniform float outputScaleVar;\n\ +_viv_uniform float bMeanScaleVarZp;\n\ +_viv_uniform float gMeanScaleVarZp;\n\ +_viv_uniform float rMeanScaleVarZp;\n\ +\n\ +_viv_uniform uint xrIntFloat_16;\n\ +_viv_uniform uint yrIntFloat_16;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateYShift_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateUVShift_2x8;\n\ +\n\ +__kernel void pre_process_nv12_scale_U8toU8_gq(\n\ + __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + uint4 gidx = get_global_id(0);\n\ + uint gidy = get_global_id(1);\n\ + gidx += (uint4)(0, 1, 2, 3);\n\ +\n\ + uint dy = (gidy * yrIntFloat_16) >> 16;\n\ + uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ + int sy = convert_int(dy) + (*yOffset);\n\ + int4 sx = convert_int4(dx) + (*xOffset);\n\ + int4 uvX = sx & 0xfffffffe;\n\ + int uvY = sy >> 1;\n\ +\n\ + vxc_uchar16 Y, UV;\n\ + int2 coord = (int2)(sx.x, sy);\n\ + int2 coord_uv = (int2)(uvX.x, uvY);\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ + vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ + int4 offsetUV = uvX - uvX.x;\n\ +\n\ + vxc_ushort8 diffY, diffUV;\n\ + _viv_asm(COPY, diffY, sx, 16);\n\ + _viv_asm(COPY, diffUV, offsetUV, 16);\n\ +\n\ + vxc_ushort8 constData = 8;\n\ + VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniCalculateYShift_2x8);\n\ + VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniCalculateUVShift_2x8);\n\ + VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_char16 tmpUV;\n\ + short tmpVal = 128;\n\ + 
VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ +\n\ + float4 tmpDstB, tmpDstG, tmpDstR;\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ +\n\ + int4 result;\n\ + vxc_uchar8 dst;\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ + result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ + dstPos.z = bOrder;\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ + dstPos.z = 1;\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ + dstPos.z = rOrder;\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pre_process_nv12_scale_U8toF16_gq(\n\ + __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + uint4 gidx = get_global_id(0);\n\ + uint gidy = get_global_id(1);\n\ + gidx += (uint4)(0, 1, 2, 3);\n\ +\n\ + uint dy = (gidy * yrIntFloat_16) >> 16;\n\ + uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ + int sy = convert_int(dy) + (*yOffset);\n\ + int4 sx = convert_int4(dx) + (*xOffset);\n\ + int4 uvX = sx & 0xfffffffe;\n\ + int uvY = sy >> 1;\n\ +\n\ + vxc_uchar16 Y, UV;\n\ + int2 coord = (int2)(sx.x, sy);\n\ + int2 coord_uv = (int2)(uvX.x, uvY);\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ + vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ + int4 offsetUV = uvX - uvX.x;\n\ +\n\ + vxc_ushort8 diffY, diffUV;\n\ + _viv_asm(COPY, diffY, sx, 16);\n\ + _viv_asm(COPY, diffUV, offsetUV, 16);\n\ +\n\ + vxc_ushort8 constData = 8;\n\ + VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniCalculateYShift_2x8);\n\ + VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniCalculateUVShift_2x8);\n\ + VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_char16 tmpUV;\n\ + short tmpVal = 128;\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ +\n\ + float4 tmpDstB, tmpDstG, tmpDstR;\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvertNV12toB_4x4);\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ +\n\ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp;\n\ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp;\n\ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp;\n\ +\n\ + half4 result;\n\ + vxc_half8 tmpdst;\n\ + vxc_short8 dst;\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ + _viv_asm(CONV, result, tmpDstB);\n\ + dstPos.z = bOrder;\n\ + VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpdst, 16);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(CONV, result, tmpDstG);\n\ + dstPos.z = 1;\n\ + VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpdst, 16);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(CONV, result, tmpDstR);\n\ + dstPos.z = rOrder;\n\ + VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpdst, 16);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pre_process_nv12_scale_mix_vx*/ + +static const char pre_process_nv12_trans_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +_viv_uniform float outputScaleVar;\n\ +_viv_uniform float bMeanScaleVarZp;\n\ +_viv_uniform float gMeanScaleVarZp;\n\ +_viv_uniform float rMeanScaleVarZp;\n\ +\n\ +_viv_uniform uint xrIntFloat_16;\n\ +_viv_uniform uint yrIntFloat_16;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8;\n\ +_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8;\n\ +\n\ +__kernel void pre_process_nv12_trans_U8toU8(\n\ + __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ + __write_only image2d_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + uint4 gidx = get_global_id(0);\n\ + uint gidy = get_global_id(1);\n\ + gidx += (uint4)(0, 1, 2, 3);\n\ +\n\ + uint dy = (gidy * yrIntFloat_16) >> 16;\n\ + uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ + int sy = convert_int(dy) + (*yOffset);\n\ + int4 sx = convert_int4(dx) + (*xOffset);\n\ + int4 uvX = sx & 0xfffffffe;\n\ + int uvY = sy >> 1;\n\ +\n\ + vxc_uchar16 Y, UV;\n\ + int2 coord = (int2)(sx.x, sy);\n\ + int2 coord_uv = (int2)(uvX.x, uvY);\n\ +\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord.x = sx.y;\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord.x = sx.z;\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord.x = sx.w;\n\ + VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, 
VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_uv.x = uvX.y;\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_uv.x = uvX.z;\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_uv.x = uvX.w;\n\ + VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_char16 tmpUV;\n\ + short tmpVal = 128;\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ +\n\ + float4 tmpDstB, tmpDstG, tmpDstR;\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ +\n\ + int4 result, dstR, dstG, dstB;\n\ + vxc_uchar16 dst, tmpPack;\n\ + dstB = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ + dstG = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ + dstR = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ +\n\ + if(bOrder == 2)\n\ + {\n\ + int4 exchangeData = dstB;\n\ + dstB = dstR;\n\ + dstR = exchangeData;\n\ + }\n\ +\n\ + VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8);\n\ + VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8);\n\ +\n\ + int2 dstPos = (int2)(get_global_id(0) * 3, gidy);\n\ + VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of pre_process_nv12_trans_u8_vx*/ + +static const char pre_process_rgb_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniVecShift10;\n\ +_viv_uniform VXC_512Bits uniAddRShift;\n\ +_viv_uniform VXC_512Bits uniGetTempVal;\n\ +_viv_uniform VXC_512Bits uniExtractBytes;\n\ +_viv_uniform VXC_512Bits uniUnpackToR;\n\ +_viv_uniform VXC_512Bits uniUnpackToG;\n\ +_viv_uniform VXC_512Bits uniUnpackToB;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform int r_order;\n\ +_viv_uniform int b_order;\n\ +\n\ +#define DESCALE(x) (((x) + (1<<19)) >> 20)\n\ +\n\ +#define IMAGE_PRE_PROCESS(dst_name, conv_type, dst_type, copy_type) \\\n\ +__kernel void pre_process_rgb_scale_U8to##dst_name \\\n\ + ( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float f32Var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ + xPos += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + /*x*/ \\\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ + int4 sx = fx0 & 0xffff8000; \\\n\ + fx0 -= sx; \\\n\ + sx = sx >> 15; \\\n\ + 
\\\n\ + vxc_short4 fx; \\\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\ + /*y*/ \\\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ + int sy = fy & 0xffff8000; \\\n\ + \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + \\\n\ + vxc_uchar16 line0RGB1, line0RGB2; \\\n\ + vxc_uchar16 line1RGB3, line1RGB4; \\\n\ + int4 coord; \\\n\ + sx = (sx + (*xOffset)) * 3; \\\n\ + coord.xyz = sx.xyz; \\\n\ + coord.w = sy + *yOffset; \\\n\ + int2 coord1 = (int2)(sx.w, coord.w); \\\n\ + VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \\\n\ + \\\n\ + bgrMean *= f32Var; \\\n\ + \\\n\ + int4 test01, temp1; \\\n\ + int4 test02, temp2; \\\n\ + int4 tt; \\\n\ + vxc_uchar4 val; \\\n\ + int4 coord_out = (int4)(xPos.x, yPos, r_order, 0); \\\n\ + \\\n\ + vxc_uchar8 line1, line2; \\\n\ + \\\n\ + /*R*/ \\\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ + \\\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + vxc_float4 tmp_dst; \\\n\ + vxc_uchar4 u8_dst; \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + /*convert U8 to dst*/ \\\n\ + dst_type dst; \\\n\ + tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \\\n\ + tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ + conv_type dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + copy_type result; \\\n\ + _viv_asm(COPY, result, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + /*G*/ \\\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ + \\\n\ + coord_out.z = 1; \\\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + tmp_dst = tmp_dst * f32Var - bgrMean.y; \\\n\ + tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, result, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + /*B*/ \\\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ + \\\n\ + coord_out.z = b_order; \\\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + tmp_dst = tmp_dst * f32Var - bgrMean.x; \\\n\ + tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, result, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +IMAGE_PRE_PROCESS(U8, uint4, vxc_uchar16, vxc_uchar16)\n\ +IMAGE_PRE_PROCESS(I8, int4, vxc_char16, vxc_char16)\n\ +IMAGE_PRE_PROCESS(I16, int4, vxc_short8, vxc_short8)\n\ +IMAGE_PRE_PROCESS(F16, half4, vxc_half8, vxc_short8)\n\ +"; /* end of pre_process_rgb_vx*/ + +static const char pre_process_rgb_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform int r_order;\n\ +_viv_uniform int b_order;\n\ +_viv_uniform VXC_512Bits uniExtractRtoF32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractRtoF32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractGtoF32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractGtoF32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractBtoF32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractBtoF32_part1_4x4;\n\ +_viv_uniform VXC_512Bits 
uniExtract8Data_2x8;\n\ +\n\ +#define IMAGE_PRE_PROCESS_COPY_16BITS(dst_name, dst_type, copy_type, convert_type) \\\n\ +__kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float f32Var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \\\n\ + \\\n\ + coord.xy += (int2) (*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1; \\\n\ + dst_type dst0; \\\n\ + copy_type dst; \\\n\ + \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + f32Var *= outputScale; \\\n\ + float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \\\n\ + bMean * f32Var - outputZP, f32Var); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \\\n\ + float4 tmp0, tmp1; \\\n\ + convert_type result0, result1; \\\n\ + \\\n\ + VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \\\n\ + tmp0 = tmp0 * paramData.w - paramData.x; \\\n\ + tmp1 = tmp1 * paramData.w - paramData.x; \\\n\ + _viv_asm(CONV_RTE, result0, tmp0); \\\n\ + _viv_asm(CONV_RTE, result1, tmp1); \\\n\ + VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, 16); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord_out.z = 1; \\\n\ + VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \\\n\ + tmp0 = tmp0 * paramData.w - paramData.y; \\\n\ + tmp1 = tmp1 * paramData.w - paramData.y; \\\n\ + _viv_asm(CONV_RTE, result0, tmp0); \\\n\ + _viv_asm(CONV_RTE, result1, tmp1); \\\n\ + VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, 16); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord_out.z = b_order; \\\n\ + VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \\\n\ + tmp0 = tmp0 * paramData.w - paramData.z; \\\n\ + tmp1 = tmp1 * paramData.w - paramData.z; \\\n\ + _viv_asm(CONV_RTE, result0, tmp0); \\\n\ + _viv_asm(CONV_RTE, result1, tmp1); \\\n\ + VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, 16); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +IMAGE_PRE_PROCESS_COPY_16BITS(I16, vxc_short8, vxc_short8, int4)\n\ +IMAGE_PRE_PROCESS_COPY_16BITS(F16, vxc_half8, vxc_short8, half4)\n\ +\n\ +#define IMAGE_PRE_PROCESS_COPY_8BITS(dst_name, dst_type) 
\\\n\ +__kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float f32Var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \\\n\ + coord.xy += (int2) (*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1; \\\n\ + dst_type dst; \\\n\ + \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + f32Var *= outputScale; \\\n\ + float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \\\n\ + bMean * f32Var - outputZP, f32Var); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \\\n\ + float4 tmp0, tmp1; \\\n\ + int4 result0, result1; \\\n\ + \\\n\ + VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \\\n\ + tmp0 = tmp0 * paramData.w - paramData.x; \\\n\ + tmp1 = tmp1 * paramData.w - paramData.x; \\\n\ + result0 = convert_int4_rte(tmp0); \\\n\ + result1 = convert_int4_rte(tmp1); \\\n\ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + \\\n\ + coord_out.z = 1; \\\n\ + VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \\\n\ + tmp0 = tmp0 * paramData.w - paramData.y; \\\n\ + tmp1 = tmp1 * paramData.w - paramData.y; \\\n\ + result0 = convert_int4_rte(tmp0); \\\n\ + result1 = convert_int4_rte(tmp1); \\\n\ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord_out.z = b_order; \\\n\ + VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \\\n\ + tmp0 = tmp0 * paramData.w - paramData.z; \\\n\ + tmp1 = tmp1 * paramData.w - paramData.z; \\\n\ + result0 = convert_int4_rte(tmp0); \\\n\ + result1 = convert_int4_rte(tmp1); \\\n\ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +IMAGE_PRE_PROCESS_COPY_8BITS(U8, vxc_uchar16)\n\ +IMAGE_PRE_PROCESS_COPY_8BITS(I8, vxc_char16)\n\ +"; /* end of pre_process_rgb_copy_vx*/ + +static const char pre_process_rgb_copy_trans_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniNormilizationLo_2x8;\n\ +_viv_uniform VXC_512Bits uniNormilizationHi_2x8;\n\ +#define IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(dst_name, dst_type, 
copy_type) \\\n\ +__kernel void pre_process_rgb_copy_nhwc_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float f32Var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + coord.xy += (int2) (*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1; \\\n\ + dst_type dst0, dst1; \\\n\ + copy_type dst; \\\n\ + \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + f32Var *= outputScale; \\\n\ + float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \\\n\ + bMean * f32Var - outputZP, f32Var); \\\n\ + half4 paramData_f16; \\\n\ + _viv_asm(CONV, paramData_f16, paramData); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(0)); \\\n\ + coord_out.z = coord_out.x + 8; \\\n\ + \\\n\ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniNormilizationLo_2x8); \\\n\ + VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniNormilizationHi_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, 16); \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(I16, vxc_short8, vxc_short8)\n\ +IMAGE_PRE_PROCESS_COPY_16BITS_NHWC(F16, vxc_half8, vxc_short8)\n\ +\n\ +#define IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(dst_name, dst_type) \\\n\ +__kernel void pre_process_rgb_copy_nhwc_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float f32Var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + coord.xy += (int2) (*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1; \\\n\ + dst_type dst; \\\n\ + \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + f32Var *= outputScale; \\\n\ + float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \\\n\ + bMean * f32Var - outputZP, f32Var); \\\n\ + \\\n\ + half4 paramData_f16; \\\n\ + _viv_asm(CONV, paramData_f16, paramData); \\\n\ + \\\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniNormilizationLo_2x8); \\\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniNormilizationHi_2x8); \\\n\ + VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(U8, vxc_uchar16)\n\ +IMAGE_PRE_PROCESS_COPY_8BITS_NHWC(I8, vxc_char16)\n\ +"; /* end of pre_process_rgb_copy_trans_vx*/ + +static const char pre_process_rgb_trans_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform 
VXC_512Bits uniVecShift10;\n\ +_viv_uniform VXC_512Bits uniAddRShift;\n\ +_viv_uniform VXC_512Bits uniGetTempVal;\n\ +_viv_uniform VXC_512Bits uniExtractBytes;\n\ +_viv_uniform VXC_512Bits uniUnpackToR;\n\ +_viv_uniform VXC_512Bits uniUnpackToG;\n\ +_viv_uniform VXC_512Bits uniUnpackToB;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform float outputZP;\n\ +\n\ +_viv_uniform VXC_512Bits uniRePackRGBLo_2x8;\n\ +_viv_uniform VXC_512Bits uniRePackRGBHi_2x8;\n\ +#define IMAGE_PRE_PROCESS_NHWC(dst_name, conv_type, dst_type, copy_type) \\\n\ +__kernel void pre_process_rgb_scale_nhwc_U8to##dst_name \\\n\ + ( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float f32Var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ + xPos += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + /*x*/ \\\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ + int4 sx = fx0 & 0xffff8000; \\\n\ + fx0 -= sx; \\\n\ + sx = sx >> 15; \\\n\ + \\\n\ + vxc_short4 fx; \\\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\ + /*y*/ \\\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ + int sy = fy & 0xffff8000; \\\n\ + \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + \\\n\ + vxc_uchar16 line0RGB1, line0RGB2; \\\n\ + vxc_uchar16 line1RGB3, line1RGB4; \\\n\ + int4 coord; \\\n\ + sx = sx * 3 + *xOffset; \\\n\ + coord.xyz = sx.xyz; \\\n\ + coord.w = sy + *yOffset; \\\n\ + int2 coord1 = (int2)(sx.w, coord.w); \\\n\ + VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \\\n\ + \\\n\ + bgrMean *= f32Var; \\\n\ + \\\n\ + int4 test01, temp1; \\\n\ + int4 test02, temp2; \\\n\ + int4 tt; \\\n\ + vxc_uchar4 val; \\\n\ + int4 coord_out = (int4)(xPos.x * 3, yPos, xPos.x * 3 + 6, 0); \\\n\ + \\\n\ + vxc_uchar8 line1, line2; \\\n\ + \\\n\ + /*R*/ \\\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ + \\\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + vxc_float4 tmp_dst; \\\n\ + vxc_uchar4 u8_dst; \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + /*convert U8 to dst*/ \\\n\ + dst_type dstRG, dstB, dst; \\\n\ + tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \\\n\ + tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ + conv_type dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + \\\n\ + /*G*/ \\\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ + \\\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + tmp_dst = tmp_dst * f32Var - bgrMean.y; \\\n\ + tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dstRG, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + \\\n\ + /*B*/ \\\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ + \\\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + tmp_dst = tmp_dst * f32Var - bgrMean.x; \\\n\ + tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dstB, dst0, 
dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_DP2x8(dst, dstRG, dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBLo_2x8); \\\n\ + copy_type result; \\\n\ + _viv_asm(COPY, result, dst, 16); \\\n\ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst, dstRG, dstB, VXC_MODIFIER(0, 5, 0, VXC_RM_ToNearestEven, 1), uniRePackRGBHi_2x8); \\\n\ + _viv_asm(COPY, result, dst, 16); \\\n\ + VXC_WriteImage(output, coord_out.zy, result, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +IMAGE_PRE_PROCESS_NHWC(U8, uint4, vxc_uchar16, vxc_uchar16)\n\ +IMAGE_PRE_PROCESS_NHWC(I8, int4, vxc_char16, vxc_char16)\n\ +IMAGE_PRE_PROCESS_NHWC(I16, int4, vxc_short8, vxc_short8)\n\ +IMAGE_PRE_PROCESS_NHWC(F16, half4, vxc_half8, vxc_short8)\n\ +"; /* end of pre_process_rgb_trans_vx*/ + +static const char pre_process_yuv420_copy_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpR2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpR3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpR4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpG1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpG2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpG3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpG4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateG1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateG2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateG3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateG4th_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpB1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpB2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniPackBG0_2x8;\n\ +_viv_uniform VXC_512Bits uniPackTmpAndR_2x8;\n\ +_viv_uniform VXC_512Bits uniPackRB0_2x8;\n\ +_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8;\n\ +_viv_uniform VXC_512Bits uniPackGR1_2x8;\n\ +_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8;\n\ +_viv_uniform VXC_512Bits uniPackBG1_2x8;\n\ +_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8;\n\ +_viv_uniform VXC_512Bits uniPackRB2_2x8;\n\ +_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8;\n\ +_viv_uniform VXC_512Bits uniPackGR2_2x8;\n\ +_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8;\n\ +_viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8;\n\ +_viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8;\n\ +_viv_uniform VXC_512Bits uniQuantU8toU8HiG_2x8;\n\ +_viv_uniform VXC_512Bits uniQuantU8toU8LoR_2x8;\n\ +_viv_uniform VXC_512Bits uniQuantU8toU8HiR_2x8;\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +_viv_uniform int zp;\n\ +_viv_uniform float outputScale;\n\ +\n\ +__kernel void pre_process_yuv420_copy_U8toU8(\n\ + __read_only image2d_t y_img,\n\ + __read_only image2d_t u_img,\n\ + __read_only image2d_t v_img,\n\ + __write_only image2d_array_t output,\n\ + global int * xRatio,\n\ + global int * yRatio,\n\ + global int * xOffset,\n\ + global int * yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float var,\n\ + int reverse_channel,\n\ + int trans\n\ + )\n\ +{\n\ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ + int4 
pos1 = (int4)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1, 0, 0);\n\ + vxc_uchar16 Y;\n\ + vxc_uchar8 U, V;\n\ + vxc_int4 C0, C1, C2, C3;\n\ + vxc_uchar16 R, G, B;\n\ + vxc_uchar16 dst0, dst1, dst2;\n\ +\n\ + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //C = Y - 16;\n\ + //D = U - 128;\n\ + //E = V - 128;\n\ + // calculate R\n\ + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ + int tmpV = -56992;\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ +\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ +\n\ + // calculate G\n\ + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ + // 298Y - 208V\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ + // 34784 - 100U\n\ + ushort tmpG = 34784;\n\ + vxc_ushort8 tmpDstG;\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ + VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4);\n\ + VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4);\n\ +\n\ + // calculate B\n\ + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ + tmpV = -70688;\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +\n\ + var 
*= outputScale;\n\ + float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\\\n\ + rMean * var - zp, var);\n\ + half4 paramData_f16;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ +\n\ + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ +\n\ + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ +\n\ + pos = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + pos.z = bOrder;\n\ + VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + pos.z = 1;\n\ + VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + pos.z = rOrder;\n\ + VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +// store bgrbgrbgr\n\ +__kernel void pre_process_yuv420_copy_trans_U8(\n\ + __read_only image2d_t y_img,\n\ + __read_only image2d_t u_img,\n\ + __read_only image2d_t v_img,\n\ + __write_only image2d_array_t output,\n\ + global int * xRatio,\n\ + global int * yRatio,\n\ + global int * xOffset,\n\ + global int * yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float var,\n\ + int reverse_channel,\n\ + int trans\n\ + )\n\ +{\n\ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ + int4 pos1 = (int4)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1, 0, 0);\n\ + vxc_uchar16 Y;\n\ + vxc_uchar8 U, V;\n\ + vxc_int4 C0, C1, C2, C3;\n\ + vxc_uchar16 R, G, B;\n\ + vxc_uchar16 dst;\n\ +\n\ + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + var *= outputScale;\n\ + float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\\\n\ + rMean * var - zp, var);\n\ + half4 paramData_f16;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + //C = Y - 16;\n\ + //D = U - 128;\n\ + //E = V - 128;\n\ + // calculate R\n\ + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ + int tmpV = -56992;\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ +\n\ + VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), 
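+    // descriptive note (added comment): 16 pixels x 3 channels = 48 interleaved bytes per row segment,\n\
+    // emitted as three 16-byte stores (dst0/dst1/dst2) with pos.x advancing by 16 between stores\n\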
uniCalculateR1st_4x4);\n\ +\n\ + VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ + VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ +\n\ + // calculate G\n\ + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ + // 298Y - 208V\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ + // 34784 - 100U\n\ + ushort tmpG = 34784;\n\ + vxc_ushort8 tmpDstG;\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ + VXC_DP4x4(dst, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ + VXC_DP4x4(dst, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ + VXC_DP4x4(dst, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4);\n\ + VXC_DP4x4(dst, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4);\n\ +\n\ + VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ + VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ +\n\ + // calculate B\n\ + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ + tmpV = -70688;\n\ + VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +\n\ + VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ + VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ +\n\ + // reorder to bgr\n\ + vxc_uchar8 tmpdst0, tmpdst1;\n\ + vxc_uchar16 dst0, dst1, dst2;\n\ +\n\ + if(bOrder == 2)\n\ + {\n\ + vxc_uchar16 exchangeData = B;\n\ + B = R;\n\ + R = exchangeData;\n\ + }\n\ +\n\ + // BGR BGR BG\n\ + VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8);\n\ + VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8);\n\ +\n\ + // RBG RBG RB\n\ + VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8);\n\ + VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8);\n\ +\n\ + pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0);\n\ +\n\ + VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + pos.x += 16;\n\ 
+\n\ + // GRB GRB GR\n\ + VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR1_2x8);\n\ + VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndB_2x8);\n\ +\n\ + // BGR BGR BG\n\ + VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8);\n\ + VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + pos.x += 16;\n\ +\n\ + // RBG RBG RB\n\ + VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8);\n\ + VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8);\n\ +\n\ + // GRB GRB GR\n\ + VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8);\n\ + VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of pre_process_yuv420_copy_u8_vx*/ + +static const char pre_process_yuv420_scale_fp16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ +_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +__kernel void pre_process_yuv420_scale_U8toF16(\n\ + __read_only image2d_array_t y_img, __read_only image2d_array_t u_img,\n\ + __read_only image2d_array_t v_img, __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + int4 gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + gidx += (int4)(0, 1, 2, 3);\n\ +\n\ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ + int4 sx = fx & 0xffff8000; // Floor\n\ + int fy, sy;\n\ + fx -= sx;\n\ + sx = sx >> 15;\n\ + fx = (fx +(1 << 4)) >> 5;\n\ +\n\ + // for y\n\ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ + sy = fy & 0xffff8000; // Floor\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + sy = sy < 0 ? 0 : sy;\n\ + fy = fy < 0 ? 
0 : fy;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ + sx += (*xOffset);\n\ + sy += (*yOffset);\n\ + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);\n\ + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);\n\ + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 Y, U, V;\n\ + vxc_int4 C0, C1, C2, C3;\n\ + vxc_uchar16 R, G, B;\n\ +\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.x + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.x + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.y;\n\ + srcPos1.x = sx.y >> 1;\n\ + srcPos2.x = sx.y >> 1;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.y + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.y + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.z;\n\ + srcPos1.x = sx.z >> 1;\n\ + srcPos2.x = sx.z >> 1;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.z + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 
0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.z + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.w;\n\ + srcPos1.x = sx.w >> 1;\n\ + srcPos2.x = sx.w >> 1;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.w + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.w + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //C = Y - 16; D = U - 128; E = V - 128;\n\ + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ + int tmpV = -56992;\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ +\n\ + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ + // 298Y - 208V\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);\n\ + // 34784 - 100U\n\ + ushort tmpG = 34784;\n\ + vxc_ushort8 tmpDstG, tmpDstG1;\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 
0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ +\n\ + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ + tmpV = -70688;\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +\n\ + int4 result, temp1, temp2;\n\ + int4 tmpData0, tmpData1;\n\ +\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + // temp2 - temp1\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ +\n\ + vxc_half8 tmpVal;\n\ + half4 hDst;\n\ + tmpV = 1 << 19;\n\ + vxc_short8 dst;\n\ + float4 tmpDst;\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - bMean) * var;\n\ + dstPos.z = bOrder;\n\ + _viv_asm(CONV, hDst, tmpDst);\n\ + VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - gMean) * var;\n\ + dstPos.z = 1;\n\ + _viv_asm(CONV, hDst, tmpDst);\n\ + VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ 
+ VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - rMean) * var;\n\ + dstPos.z = rOrder;\n\ + _viv_asm(CONV, hDst, tmpDst);\n\ + VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pre_process_yuv420_scale_fp16_vx*/ + +static const char pre_process_yuv420_scale_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +_viv_uniform float outputScale;\n\ +\n\ +__kernel void pre_process_yuv420_scale_U8toI16(\n\ + __read_only image2d_array_t y_img, __read_only image2d_array_t u_img,\n\ + __read_only image2d_array_t v_img, __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + int4 gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + gidx += (int4)(0, 1, 2, 3);\n\ +\n\ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ + int4 sx = fx & 0xffff8000; // Floor\n\ + int fy, sy;\n\ + fx -= sx;\n\ + sx = sx >> 15;\n\ + fx = (fx +(1 << 4)) >> 5;\n\ +\n\ + // for y\n\ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ + sy = fy & 0xffff8000; // Floor\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + sy = sy < 0 ? 0 : sy;\n\ + fy = fy < 0 ? 
0 : fy;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ + sx += (*xOffset);\n\ + sy += (*yOffset);\n\ + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);\n\ + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);\n\ + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 Y, U, V;\n\ + vxc_int4 C0, C1, C2, C3;\n\ + vxc_uchar16 R, G, B;\n\ +\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.x + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.x + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.y;\n\ + srcPos1.x = sx.y >> 1;\n\ + srcPos2.x = sx.y >> 1;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.y + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.y + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.z;\n\ + srcPos1.x = sx.z >> 1;\n\ + srcPos2.x = sx.z >> 1;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.z + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 
0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.z + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.w;\n\ + srcPos1.x = sx.w >> 1;\n\ + srcPos2.x = sx.w >> 1;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.w + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.w + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //C = Y - 16; D = U - 128; E = V - 128;\n\ + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ + int tmpV = -56992;\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ +\n\ + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ + // 298Y - 208V\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);\n\ + // 34784 - 100U\n\ + ushort tmpG = 34784;\n\ + vxc_ushort8 tmpDstG, tmpDstG1;\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 
0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ +\n\ + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ + tmpV = -70688;\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +\n\ + int4 result, temp1, temp2;\n\ + int4 tmpData0, tmpData1;\n\ +\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + // temp2 - temp1\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ +\n\ + tmpV = 1 << 19;\n\ + vxc_short8 dst;\n\ + float4 tmpDst;\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - bMean) * var;\n\ + dstPos.z = bOrder;\n\ + result = convert_int4_rte(tmpDst * outputScale);\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - gMean) * var;\n\ + dstPos.z = 1;\n\ + result = convert_int4_rte(tmpDst * outputScale);\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - rMean) * var;\n\ + dstPos.z = rOrder;\n\ + result = convert_int4_rte(tmpDst * outputScale);\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pre_process_yuv420_scale_i16_vx*/ + +static const char pre_process_yuv420_scale_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +_viv_uniform float outputScale;\n\ +\n\ +__kernel void pre_process_yuv420_scale_U8toI8(\n\ + __read_only image2d_array_t y_img, __read_only image2d_array_t u_img,\n\ + __read_only image2d_array_t v_img, __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + int4 gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + gidx += (int4)(0, 1, 2, 3);\n\ +\n\ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ + int4 sx = fx & 0xffff8000; // Floor\n\ + int fy, sy;\n\ + fx -= sx;\n\ + sx = sx >> 15;\n\ + fx = (fx +(1 << 4)) >> 5;\n\ +\n\ + // for y\n\ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ + sy = fy & 0xffff8000; // Floor\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + sy = sy < 0 ? 0 : sy;\n\ + fy = fy < 0 ? 
0 : fy;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ + sx += (*xOffset);\n\ + sy += (*yOffset);\n\ + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);\n\ + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);\n\ + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 Y, U, V;\n\ + vxc_int4 C0, C1, C2, C3;\n\ + vxc_uchar16 R, G, B;\n\ +\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.x + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.x + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.y;\n\ + srcPos1.x = sx.y >> 1;\n\ + srcPos2.x = sx.y >> 1;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.y + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.y + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.z;\n\ + srcPos1.x = sx.z >> 1;\n\ + srcPos2.x = sx.z >> 1;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.z + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 
0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.z + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.w;\n\ + srcPos1.x = sx.w >> 1;\n\ + srcPos2.x = sx.w >> 1;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.w + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.w + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //C = Y - 16; D = U - 128; E = V - 128;\n\ + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ + int tmpV = -56992;\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ +\n\ + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ + // 298Y - 208V\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);\n\ + // 34784 - 100U\n\ + ushort tmpG = 34784;\n\ + vxc_ushort8 tmpDstG, tmpDstG1;\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 
0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ +\n\ + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ + tmpV = -70688;\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +\n\ + int4 result, temp1, temp2;\n\ + int4 tmpData0, tmpData1;\n\ +\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + // temp2 - temp1\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ +\n\ + tmpV = 1 << 19;\n\ + vxc_char8 dst;\n\ + float4 tmpDst;\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - bMean) * var;\n\ + dstPos.z = bOrder;\n\ + result = convert_int4_rte(tmpDst * outputScale);\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - gMean) * var;\n\ + dstPos.z = 1;\n\ + result = convert_int4_rte(tmpDst * outputScale);\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - rMean) * var;\n\ + dstPos.z = rOrder;\n\ + result = convert_int4_rte(tmpDst * outputScale);\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pre_process_yuv420_scale_i8_vx*/ + +static const char pre_process_yuv420_scale_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +_viv_uniform int zp;\n\ +_viv_uniform float outputScale;\n\ +\n\ +__kernel void pre_process_yuv420_scale_U8toU8(\n\ + __read_only image2d_array_t y_img, __read_only image2d_array_t u_img,\n\ + __read_only image2d_array_t v_img, __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + int4 gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + gidx += (int4)(0, 1, 2, 3);\n\ +\n\ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ + int4 sx = fx & 0xffff8000; // Floor\n\ + int fy, sy;\n\ + fx -= sx;\n\ + sx = sx >> 15;\n\ + fx = (fx +(1 << 4)) >> 5;\n\ +\n\ + // for y\n\ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ + sy = fy & 0xffff8000; // Floor\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + sy = sy < 0 ? 0 : sy;\n\ + fy = fy < 0 ? 
0 : fy;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ + sx += (*xOffset);\n\ + sy += (*yOffset);\n\ + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);\n\ + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);\n\ + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 Y, U, V;\n\ + vxc_int4 C0, C1, C2, C3;\n\ + vxc_uchar16 R, G, B;\n\ +\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.x + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.x + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.y;\n\ + srcPos1.x = sx.y >> 1;\n\ + srcPos2.x = sx.y >> 1;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.y + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.y + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.z;\n\ + srcPos1.x = sx.z >> 1;\n\ + srcPos2.x = sx.z >> 1;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.z + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 
0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.z + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.w;\n\ + srcPos1.x = sx.w >> 1;\n\ + srcPos2.x = sx.w >> 1;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.w + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.w + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //C = Y - 16; D = U - 128; E = V - 128;\n\ + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ + int tmpV = -56992;\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ +\n\ + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ + // 298Y - 208V\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);\n\ + // 34784 - 100U\n\ + ushort tmpG = 34784;\n\ + vxc_ushort8 tmpDstG, tmpDstG1;\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 
0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ +\n\ + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ + tmpV = -70688;\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +\n\ + int4 result, temp1, temp2;\n\ + int4 tmpData0, tmpData1;\n\ +\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + // temp2 - temp1\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ +\n\ + tmpV = 1 << 19;\n\ + vxc_uchar8 dst;\n\ + float4 tmpDst;\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - bMean) * var;\n\ + dstPos.z = bOrder;\n\ + result = convert_int4_rte(tmpDst * outputScale + zp);\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - gMean) * var;\n\ + dstPos.z = 1;\n\ + result = convert_int4_rte(tmpDst * outputScale + zp);\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - rMean) * var;\n\ + dstPos.z = rOrder;\n\ + result = convert_int4_rte(tmpDst * outputScale + zp);\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pre_process_yuv420_scale_u8_vx*/ + +static const char pre_process_yuv420_trans_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8;\n\ +_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8;\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +_viv_uniform int zp;\n\ +_viv_uniform float outputScale;\n\ +\n\ +__kernel void pre_process_yuv420_trans_U8toU8(\n\ + __read_only image2d_array_t y_img, __read_only image2d_array_t u_img,\n\ + __read_only image2d_array_t v_img, __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + int4 gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + gidx += (int4)(0, 1, 2, 3);\n\ +\n\ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ + int4 sx = fx & 0xffff8000; // Floor\n\ + int fy, sy;\n\ + fx -= sx;\n\ + sx = sx >> 15;\n\ + fx = (fx +(1 << 4)) >> 5;\n\ +\n\ + // for y\n\ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ + sy = fy & 0xffff8000; // Floor\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + sy = sy < 0 ? 0 : sy;\n\ + fy = fy < 0 ? 
0 : fy;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ + sx += (*xOffset);\n\ + sy += (*yOffset);\n\ + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);\n\ + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);\n\ + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 Y, U, V;\n\ + vxc_int4 C0, C1, C2, C3;\n\ + vxc_uchar16 R, G, B;\n\ +\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.x + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.x + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.y;\n\ + srcPos1.x = sx.y >> 1;\n\ + srcPos2.x = sx.y >> 1;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.y + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.y + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.z;\n\ + srcPos1.x = sx.z >> 1;\n\ + srcPos2.x = sx.z >> 1;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.z + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 
0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.z + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.w;\n\ + srcPos1.x = sx.w >> 1;\n\ + srcPos2.x = sx.w >> 1;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ + srcPos1.x = (sx.w + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ + srcPos2.x = (sx.w + 1) >> 1;\n\ + VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //C = Y - 16; D = U - 128; E = V - 128;\n\ + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ + int tmpV = -56992;\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ +\n\ + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ + // 298Y - 208V\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);\n\ + // 34784 - 100U\n\ + ushort tmpG = 34784;\n\ + vxc_ushort8 tmpDstG, tmpDstG1;\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 
0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ +\n\ + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ + tmpV = -70688;\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +\n\ + int4 result, temp1, temp2, dstR, dstG, dstB;\n\ + int4 tmpData0, tmpData1;\n\ +\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + // temp2 - temp1\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ +\n\ + tmpV = 1 << 19;\n\ + vxc_uchar8 dst, tmpPack;\n\ + float4 tmpDst;\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - bMean) * var;\n\ + dstB = convert_int4_rte(tmpDst * outputScale + zp);\n\ +\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - gMean) * var;\n\ + dstG = convert_int4_rte(tmpDst * outputScale + zp);\n\ +\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - 
rMean) * var;\n\ + dstR = convert_int4_rte(tmpDst * outputScale + zp);\n\ +\n\ + if(bOrder == 2)\n\ + {\n\ + int4 exchangeData = dstB;\n\ + dstB = dstR;\n\ + dstR = exchangeData;\n\ + }\n\ +\n\ + VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8);\n\ + VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8);\n\ +\n\ + int2 dstPos = (int2)(get_global_id(0) * 3, gidy);\n\ + VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pre_process_yuv420_trans_u8_vx*/ + +static const char pre_process_yuv444_copy_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpR2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpR3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpR4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpG1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpG2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpG3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpG4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateG1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateG2nd_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpB1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpB2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniPackBG0_2x8;\n\ +_viv_uniform VXC_512Bits uniPackTmpAndR_2x8;\n\ +_viv_uniform VXC_512Bits uniPackRB0_2x8;\n\ +_viv_uniform VXC_512Bits uniPackTmp0AndG_2x8;\n\ +_viv_uniform VXC_512Bits uniPackGR1_2x8;\n\ +_viv_uniform VXC_512Bits uniPackTmp1AndB_2x8;\n\ +_viv_uniform VXC_512Bits uniPackBG1_2x8;\n\ +_viv_uniform VXC_512Bits uniPackTmp1AndR_2x8;\n\ +_viv_uniform VXC_512Bits uniPackRB2_2x8;\n\ +_viv_uniform VXC_512Bits uniPackTmp2AndG_2x8;\n\ +_viv_uniform VXC_512Bits uniPackGR2_2x8;\n\ +_viv_uniform VXC_512Bits uniPackTmp2AndB_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8;\n\ +_viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8;\n\ +_viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8;\n\ +_viv_uniform VXC_512Bits uniQuantU8toU8HiG_2x8;\n\ +_viv_uniform VXC_512Bits uniQuantU8toU8LoR_2x8;\n\ +_viv_uniform VXC_512Bits uniQuantU8toU8HiR_2x8;\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +_viv_uniform int zp;\n\ +_viv_uniform float outputScale;\n\ +\n\ +__kernel void pre_process_yuv444_copy_U8toU8(\n\ + __read_only image2d_t y_img,\n\ + __read_only image2d_t u_img,\n\ + __read_only image2d_t v_img,\n\ + __write_only image2d_array_t output,\n\ + global int * xRatio,\n\ + global int * yRatio,\n\ + global int * xOffset,\n\ + global int * yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float var,\n\ + int reverse_channel,\n\ + int trans\n\ + )\n\ +{\n\ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ + vxc_uchar16 Y, U, V;\n\ + vxc_int4 C0, C1, C2, C3;\n\ + vxc_uchar16 R, G, B;\n\ + vxc_uchar16 dst0, dst1, dst2;\n\ +\n\ + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 
0));\n\ + VXC_ReadImage(U, u_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //C = Y - 16;\n\ + //D = U - 128;\n\ + //E = V - 128;\n\ + // calculate R\n\ + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ + int tmpV = -56992;\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ +\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ +\n\ + // calculate G\n\ + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ + // 298Y - 208V\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ + // 34784 - 100U\n\ + ushort tmpG = 34784;\n\ + vxc_ushort8 tmpDstG0, tmpDstG1;\n\ + VXC_DP2x8(tmpDstG0, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2_2x8);\n\ +\n\ + VXC_DP4x4(G, C0, tmpDstG0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ + VXC_DP4x4(G, C1, tmpDstG0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ +\n\ + // calculate B\n\ + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ + tmpV = -70688;\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +\n\ + var *= outputScale;\n\ + float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\\\n\ + rMean * var - zp, var);\n\ + half4 paramData_f16;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + 
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ +\n\ + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ +\n\ + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ +\n\ + pos = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + pos.z = bOrder;\n\ + VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + pos.z = 1;\n\ + VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + pos.z = rOrder;\n\ + VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +// store bgrbgrbgr\n\ +__kernel void pre_process_yuv444_copy_trans_U8(\n\ + __read_only image2d_t y_img,\n\ + __read_only image2d_t u_img,\n\ + __read_only image2d_t v_img,\n\ + __write_only image2d_array_t output,\n\ + global int * xRatio,\n\ + global int * yRatio,\n\ + global int * xOffset,\n\ + global int * yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float var,\n\ + int reverse_channel,\n\ + int trans\n\ + )\n\ +{\n\ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ + vxc_uchar16 Y, U, V;\n\ + vxc_int4 C0, C1, C2, C3;\n\ + vxc_uchar16 R, G, B;\n\ + vxc_uchar16 dst;\n\ +\n\ + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + var *= outputScale;\n\ + float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\\\n\ + rMean * var - zp, var);\n\ + half4 paramData_f16;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + //C = Y - 16;\n\ + //D = U - 128;\n\ + //E = V - 128;\n\ + // calculate R\n\ + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ + int tmpV = -56992;\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ +\n\ + VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ +\n\ + VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ + VXC_DP2x8(R, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ +\n\ + // calculate G\n\ + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> 
[(298Y - 100U - 208V + 34784) >> 8]\n\ + // 298Y - 208V\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ + // 34784 - 100U\n\ + ushort tmpG = 34784;\n\ + vxc_ushort8 tmpDstG0, tmpDstG1;\n\ + VXC_DP2x8(tmpDstG0, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2_2x8);\n\ + VXC_DP4x4(dst, C0, tmpDstG0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ + VXC_DP4x4(dst, C1, tmpDstG0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ + VXC_DP4x4(dst, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ + VXC_DP4x4(dst, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ +\n\ + VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ + VXC_DP2x8(G, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ +\n\ + // calculate B\n\ + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ + tmpV = -70688;\n\ + VXC_DP4x4(dst, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(dst, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(dst, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(dst, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +\n\ + VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ + VXC_DP2x8(B, dst, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ +\n\ + // reorder to bgr\n\ + vxc_uchar8 tmpdst0, tmpdst1;\n\ + vxc_uchar16 dst0, dst1, dst2;\n\ +\n\ + if(bOrder == 2)\n\ + {\n\ + vxc_uchar16 exchangeData = B;\n\ + B = R;\n\ + R = exchangeData;\n\ + }\n\ +\n\ + // BGR BGR BG\n\ + VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG0_2x8);\n\ + VXC_DP2x8(dst0, tmpdst0, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmpAndR_2x8);\n\ +\n\ + // RBG RBG RB\n\ + VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB0_2x8);\n\ + VXC_DP2x8(dst0, tmpdst0, G, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp0AndG_2x8);\n\ +\n\ + pos = (int4)(get_global_id(0) * 3, get_global_id(1), 0, 0);\n\ +\n\ + VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + pos.x += 16;\n\ +\n\ + // GRB GRB GR\n\ + VXC_DP2x8(tmpdst0, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR1_2x8);\n\ + VXC_DP2x8(dst1, tmpdst0, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), 
uniPackTmp1AndB_2x8);\n\ +\n\ + // BGR BGR BG\n\ + VXC_DP2x8(tmpdst0, B, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackBG1_2x8);\n\ + VXC_DP2x8(dst1, tmpdst0, R, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp1AndR_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + pos.x += 16;\n\ +\n\ + // RBG RBG RB\n\ + VXC_DP2x8(tmpdst0, R, B, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackRB2_2x8);\n\ + VXC_DP2x8(dst2, tmpdst0, G, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndG_2x8);\n\ +\n\ + // GRB GRB GR\n\ + VXC_DP2x8(tmpdst1, G, R, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackGR2_2x8);\n\ + VXC_DP2x8(dst2, tmpdst1, B, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0), uniPackTmp2AndB_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pre_process_yuv444_copy_u8_vx*/ + +static const char pre_process_yuv444_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +_viv_uniform int zp;\n\ +_viv_uniform float outputScale;\n\ +\n\ +#define IMAGE_PRE_PROCESS_YUV444_QINT(dst_name, dst_type) \\\n\ +__kernel void pre_process_yuv444_scale_U8to##dst_name( \\\n\ + __read_only image2d_t y_img, __read_only image2d_t u_img, \\\n\ + __read_only image2d_t v_img, __write_only image2d_array_t output, \\\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \\\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \\\n\ +{ \\\n\ + int4 gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + gidx += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \\\n\ + int4 sx = fx & 0xffff8000; \\\n\ + int fy, sy; \\\n\ + fx -= sx; \\\n\ + sx = sx >> 15; \\\n\ + fx = (fx +(1 << 4)) >> 5; \\\n\ + \\\n\ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \\\n\ + sy = fy & 0xffff8000; \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + sy = sy < 0 ? 0 : sy; \\\n\ + fy = fy < 0 ? 
0 : fy; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + sx += (*xOffset); \\\n\ + sy += (*yOffset); \\\n\ + int2 srcPos = (int2)(sx.x, sy); \\\n\ + \\\n\ + vxc_uchar16 Y, U, V; \\\n\ + vxc_int4 C0, C1, C2, C3; \\\n\ + vxc_uchar16 R, G, B; \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.y; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.z; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.w; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int tmpV = -56992; \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \\\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), 
uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \\\n\ + \\\n\ + ushort tmpG = 34784; \\\n\ + vxc_ushort8 tmpDstG, tmpDstG1; \\\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \\\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \\\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \\\n\ + tmpV = -70688; \\\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + \\\n\ + int4 result, temp1, temp2; \\\n\ + int4 tmpData0, tmpData1; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + \\\n\ + tmpV = 1 << 19; \\\n\ + dst_type dst; \\\n\ + float4 tmpDst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - bMean) * var; \\\n\ + dstPos.z = bOrder; \\\n\ + result = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ 
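+    /* blue channel written out; the same bilinear blend, descale and requantize sequence repeats for G and R below */ \\\n\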
+ \\\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - gMean) * var; \\\n\ + dstPos.z = 1; \\\n\ + result = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - rMean) * var; \\\n\ + dstPos.z = rOrder; \\\n\ + result = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +IMAGE_PRE_PROCESS_YUV444_QINT(U8, vxc_uchar8)\n\ +IMAGE_PRE_PROCESS_YUV444_QINT(I8, vxc_char8)\n\ +IMAGE_PRE_PROCESS_YUV444_QINT(I16, vxc_short8)"; /* end of pre_process_yuv444_scale_vx*/ + +static const char pre_process_yuv444_scale_fp16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ +_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ +_viv_uniform VXC_512Bits 
uniBilinearTmp4th_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +__kernel void pre_process_yuv444_scale_U8toF16(\n\ + __read_only image2d_t y_img, __read_only image2d_t u_img,\n\ + __read_only image2d_t v_img, __write_only image2d_array_t output,\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ +{\n\ + int4 gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + gidx += (int4)(0, 1, 2, 3);\n\ +\n\ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ + int4 sx = fx & 0xffff8000; // Floor\n\ + int fy, sy;\n\ + fx -= sx;\n\ + sx = sx >> 15;\n\ + fx = (fx +(1 << 4)) >> 5;\n\ +\n\ + // for y\n\ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ + sy = fy & 0xffff8000; // Floor\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + sy = sy < 0 ? 0 : sy;\n\ + fy = fy < 0 ? 0 : fy;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ + sx += (*xOffset);\n\ + sy += (*yOffset);\n\ + int2 srcPos = (int2)(sx.x, sy);\n\ +\n\ + vxc_uchar16 Y, U, V;\n\ + vxc_int4 C0, C1, C2, C3;\n\ + vxc_uchar16 R, G, B;\n\ +\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.y;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.z;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + srcPos.x = sx.w;\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), 
VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //C = Y - 16; D = U - 128; E = V - 128;\n\ + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ + int tmpV = -56992;\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ +\n\ + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ + // 298Y - 208V\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);\n\ + // 34784 - 100U\n\ + ushort tmpG = 34784;\n\ + vxc_ushort8 tmpDstG, tmpDstG1;\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ +\n\ + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ + tmpV = -70688;\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +\n\ + int4 result, temp1, temp2;\n\ + int4 tmpData0, tmpData1;\n\ +\n\ + VXC_DP4x4(tmpData0, B, B, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + // temp2 - temp1\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ +\n\ + vxc_half8 tmpVal;\n\ + half4 hDst;\n\ + tmpV = 1 << 19;\n\ + vxc_short8 dst;\n\ + float4 tmpDst;\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - bMean) * var;\n\ + dstPos.z = bOrder;\n\ + _viv_asm(CONV, hDst, tmpDst);\n\ + VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - gMean) * var;\n\ + dstPos.z = 1;\n\ + _viv_asm(CONV, hDst, tmpDst);\n\ + VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ + temp1 = fx * tmpData0 + tmpData1;\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ + temp2 = fx * tmpData0 + tmpData1;\n\ + result = fy * temp2 + (temp1 << 10);\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ + tmpDst = (tmpDst - rMean) * var;\n\ + dstPos.z = rOrder;\n\ + _viv_asm(CONV, hDst, tmpDst);\n\ + VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of pre_process_yuv444_scale_fp16_vx*/ + +static const char pre_process_yuv444_trans_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits 
uniCalculateTmpRWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniTransPackBgr1st_2x8;\n\ +_viv_uniform VXC_512Bits uniTransPackBgr2nd_2x8;\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +_viv_uniform int zp;\n\ +_viv_uniform float outputScale;\n\ +\n\ +#define IMAGE_PRE_PROCESS_YUV444_TRANS(dst_name, dst_type) \\\n\ +__kernel void pre_process_yuv444_trans_U8to##dst_name( \\\n\ + __read_only image2d_t y_img, __read_only image2d_t u_img, \\\n\ + __read_only image2d_t v_img, __write_only image2d_t output, \\\n\ + global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \\\n\ + float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \\\n\ +{ \\\n\ + int4 gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + gidx += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \\\n\ + int4 sx = fx & 0xffff8000; \\\n\ + int fy, sy; \\\n\ + fx -= sx; \\\n\ + sx = sx >> 15; \\\n\ + fx = (fx +(1 << 4)) >> 5; \\\n\ + \\\n\ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \\\n\ + sy = fy & 0xffff8000; \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + sy = sy < 0 ? 0 : sy; \\\n\ + fy = fy < 0 ? 
0 : fy; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + sx += (*xOffset); \\\n\ + sy += (*yOffset); \\\n\ + int2 srcPos = (int2)(sx.x, sy); \\\n\ + \\\n\ + vxc_uchar16 Y, U, V; \\\n\ + vxc_int4 C0, C1, C2, C3; \\\n\ + vxc_uchar16 R, G, B; \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.y; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.z; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.w; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int tmpV = -56992; \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \\\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), 
uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \\\n\ + \\\n\ + ushort tmpG = 34784; \\\n\ + vxc_ushort8 tmpDstG, tmpDstG1; \\\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \\\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \\\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \\\n\ + tmpV = -70688; \\\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + \\\n\ + int4 result, temp1, temp2, dstR, dstG, dstB; \\\n\ + int4 tmpData0, tmpData1; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + \\\n\ + tmpV = 1 << 19; \\\n\ + dst_type dst, tmpPack; \\\n\ + float4 tmpDst; \\\n\ + \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - bMean) * var; \\\n\ + dstB = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + 
VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - gMean) * var; \\\n\ + dstG = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - rMean) * var; \\\n\ + dstR = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ + \\\n\ + if(bOrder == 2) \\\n\ + { \\\n\ + int4 exchangeData = dstB; \\\n\ + dstB = dstR; \\\n\ + dstR = exchangeData; \\\n\ + } \\\n\ + \\\n\ + VXC_DP2x8(tmpPack, dstB, dstG, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr1st_2x8); \\\n\ + VXC_DP2x8(dst, tmpPack, dstR, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniTransPackBgr2nd_2x8); \\\n\ + \\\n\ + int2 dstPos = (int2)(get_global_id(0) * 3, gidy); \\\n\ + VXC_WriteImage(output, dstPos, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +IMAGE_PRE_PROCESS_YUV444_TRANS(U8, vxc_uchar16)"; /* end of pre_process_yuv444_trans_u8_vx*/ + +static const char prelu_vx[] = "\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#if (VX_VERSION==2)\n\ +_viv_uniform VXC_512Bits uniPreluDFPLo_2x8b;\n\ +_viv_uniform VXC_512Bits uniPreluDFPHi_2x8b;\n\ +__kernel void prelu_I8F16toI8_2D_OPT\n\ +(\n\ + image2d_array_t input,\n\ + image2d_array_t param,\n\ + image2d_array_t output\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_char16 in, dst;\n\ + vxc_char32 src;\n\ + vxc_short8 a0, a1;\n\ + vxc_half8 c0, c1;\n\ + VXC_ReadImage(in, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(a0, param, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(a1, param, coord, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, c0, a0, 4);\n\ + _viv_asm(COPY, c1, a1, 4);\n\ + src.hi = max(in, 0);\n\ + src.lo = min(in, 0);\n\ +\n\ + VXC_DP2x8_b(dst, src.hi, src.lo, c0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniPreluDFPLo_2x8b);\n\ + VXC_DP2x8_b(dst, src.hi, src.lo, c1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniPreluDFPHi_2x8b);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void prelu_I16F16toI16_2D_OPT\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t param,\n\ + image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 in, dst;\n\ + vxc_short16 src;\n\ + vxc_short8 a0;\n\ + vxc_half8 c0;\n\ + 
VXC_ReadImage(in, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(a0, param, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, c0, a0, 4);\n\ + src.hi = max(in, 0);\n\ + src.lo = min(in, 0);\n\ + VXC_DP2x8_b(dst, src.hi, src.lo, c0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniPreluDFPLo_2x8b);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +#else\n\ +_viv_uniform VXC_512Bits uniPreluInt8_2x8;\n\ +_viv_uniform VXC_512Bits uniPreluInt16_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniPreluInt16_part1_4x4;\n\ +__kernel void prelu_I8F16toI8_2D_OPT\n\ +(\n\ + image2d_array_t input,\n\ + image2d_array_t param,\n\ + image2d_array_t output\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + vxc_char16 in, dst;\n\ + vxc_char16 src0, src1, src;\n\ + vxc_short8 a0, a1;\n\ + vxc_half8 c0, c1;\n\ + VXC_ReadImage(in, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(a0, param, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(a1, param, coord, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, c0, a0, 4);\n\ + _viv_asm(COPY, c1, a1, 4);\n\ + src0 = max(in, 0);\n\ + src1 = min(in, 0);\n\ + _viv_asm(COPY, src, src0, 16);\n\ + src.s89abcdef = src1.s01234567;\n\ + VXC_DP2x8(dst, src, c0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniPreluInt8_2x8);\n\ + _viv_asm(COPY, src, src1, 16);\n\ + src.s01234567 = src0.s89abcdef;\n\ + VXC_DP2x8(dst, src, c1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniPreluInt8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void prelu_I16F16toI16_2D_OPT\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t param,\n\ + image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + vxc_short8 in, dst;\n\ + vxc_short8 src0, src1, src;\n\ + vxc_short8 a0;\n\ + vxc_half8 c0;\n\ + VXC_ReadImage(in, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(a0, param, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, c0, a0, 4);\n\ + src0 = max(in, 0);\n\ + src1 = min(in, 0);\n\ + _viv_asm(COPY, src, src0, 16);\n\ + src.s4567 = src1.s0123;\n\ + VXC_DP4x4(dst, src, c0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniPreluInt16_part0_4x4);\n\ + _viv_asm(COPY, src, src1, 16);\n\ + src.s0123 = src0.s4567;\n\ + VXC_DP4x4(dst, src, c0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniPreluInt16_part1_4x4);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +#endif\n\ +\n\ +_viv_uniform VXC_512Bits uniDataSubZPtoFp32Part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvF16toF32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniConvF16toF32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bin_2x8;\n\ +_viv_uniform int inputZP0;\n\ +_viv_uniform int inputZP1;\n\ +_viv_uniform float input_scale0;\n\ +_viv_uniform float input_scale1;\n\ +_viv_uniform float outputZP;\n\ +#define PRELU_F16_3D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \\\n\ + __kernel void prelu_##name0##to##name1( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output) \\\n\ 
+{\\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\\\n\ + vxc_float4 vecA, vecB, vecC, vecD;\\\n\ + input_type0 srcA;\\\n\ + copy_type0 src0;\\\n\ + vxc_short8 srcB;\\\n\ + vxc_half8 src1;\\\n\ + input_type0 input_ZP;\\\n\ + VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ + _viv_asm(COPY, src0, srcA, 16); \\\n\ + VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ + _viv_asm(COPY, src1, srcB, 16); \\\n\ + \\\n\ + _viv_asm(COPY, input_ZP, inputZP0, 4);\\\n\ + VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \\\n\ + uniDataSubZPtoFp32Part0_4x4); \\\n\ + VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \\\n\ + uniDataSubZPtoFp32Part1_4x4);\\\n\ + VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\\\n\ + VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\\\n\ + \\\n\ + vecA = vecA * input_scale0;\\\n\ + vecB = vecB * input_scale0;\\\n\ + vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \\\n\ + vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \\\n\ + vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \\\n\ + vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \\\n\ + vecA = maxData0 + vecC * minData0 + outputZP;\\\n\ + vecB = maxData1 + vecD * minData1 + outputZP;\\\n\ + convert_type dst0, dst1;\\\n\ + _viv_asm(CONV_RTE, dst0, vecA);\\\n\ + _viv_asm(CONV_RTE, dst1, vecB);\\\n\ + output_type dst2;\\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bin_2x8);\\\n\ + copy_type dst;\\\n\ + _viv_asm(COPY, dst, dst2, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ +}\n\ +// name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type\n\ +PRELU_F16_3D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)\n\ +PRELU_F16_3D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_3D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)\n\ +PRELU_F16_3D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_3D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\ +PRELU_F16_3D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_3D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_3D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)\n\ +PRELU_F16_3D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)\n\ +PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)\n\ +\n\ +#define PRELU_F16_2D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \\\n\ + __kernel void prelu_##name0##to##name1##_2D( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{\\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\\\n\ + vxc_float4 vecA, vecB, vecC, vecD;\\\n\ + input_type0 srcA;\\\n\ + copy_type0 src0;\\\n\ + vxc_short8 srcB;\\\n\ + vxc_half8 src1;\\\n\ + input_type0 input_ZP;\\\n\ + VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ + _viv_asm(COPY, src0, srcA, 16); \\\n\ + VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ + _viv_asm(COPY, src1, srcB, 16); 
\\\n\ + \\\n\ + _viv_asm(COPY, input_ZP, inputZP0, 4);\\\n\ + VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\\\n\ + VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\\\n\ + VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\\\n\ + VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\\\n\ + \\\n\ + vecA = vecA * input_scale0;\\\n\ + vecB = vecB * input_scale0;\\\n\ + vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \\\n\ + vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \\\n\ + vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \\\n\ + vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \\\n\ + vecA = maxData0 + vecC * minData0 + outputZP;\\\n\ + vecB = maxData1 + vecD * minData1 + outputZP;\\\n\ + convert_type dst0, dst1;\\\n\ + _viv_asm(CONV_RTE, dst0, vecA);\\\n\ + _viv_asm(CONV_RTE, dst1, vecB);\\\n\ + output_type dst2;\\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bin_2x8);\\\n\ + copy_type dst;\\\n\ + _viv_asm(COPY, dst, dst2, 16); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ +}\n\ +PRELU_F16_2D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_2D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)\n\ +PRELU_F16_2D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_2D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\ +PRELU_F16_2D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_2D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_2D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)\n\ +PRELU_F16_2D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)\n\ +PRELU_F16_2D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)\n\ +PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)\n\ +\n\ +#define PRELU_U8_2D(name, output_type, convert_type, copy_type) \\\n\ + __kernel void prelu_U8U8to##name##_2D( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{\\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\\\n\ + vxc_float4 vecA, vecB, vecC, vecD;\\\n\ + vxc_uchar16 src0;\\\n\ + vxc_uchar16 src1;\\\n\ + vxc_uchar16 input_ZP0;\\\n\ + vxc_uchar16 input_ZP1;\\\n\ + VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ + VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ + \\\n\ + _viv_asm(COPY, input_ZP0, inputZP0, 4);\\\n\ + VXC_DP4x4(vecA, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\\\n\ + VXC_DP4x4(vecB, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\\\n\ + _viv_asm(COPY, input_ZP1, inputZP1, 4);\\\n\ + VXC_DP4x4(vecC, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\\\n\ + VXC_DP4x4(vecD, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\\\n\ + \\\n\ + vecA = vecA * input_scale0;\\\n\ + vecB = vecB * input_scale0;\\\n\ + vecC = vecC * input_scale1;\\\n\ + vecD = vecD * input_scale1;\\\n\ + vxc_float4 maxData0 = vecA >= 0 ? 
vecA : 0.0; \\\n\ + vxc_float4 maxData1 = vecB >= 0 ? vecB : 0.0; \\\n\ + vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \\\n\ + vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \\\n\ + vecA = maxData0 + vecC * minData0 + outputZP;\\\n\ + vecB = maxData1 + vecD * minData1 + outputZP;\\\n\ + convert_type dst0, dst1;\\\n\ + _viv_asm(CONV_RTE, dst0, vecA);\\\n\ + _viv_asm(CONV_RTE, dst1, vecB);\\\n\ + output_type dst2;\\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bin_2x8);\\\n\ + copy_type dst;\\\n\ + _viv_asm(COPY, dst, dst2, 16); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ +}\n\ +PRELU_U8_2D(U8, vxc_uchar16, int4, vxc_uchar16)\n\ +PRELU_U8_2D(F16, vxc_half8, half4, vxc_short8)\n\ +\n\ +\n\ +"; /* end of prelu_vx*/ + +static const char prelu_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniConvF16toF32_Part0_4x4;\n\ +_viv_uniform VXC_512Bits uniConvF16toF32_Part1_4x4;\n\ +_viv_uniform VXC_512Bits uniPackedBF16_2x8;\n\ +\n\ +#define PRELU_BF16F16TOBF16_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 src0, para_s16; \\\n\ + vxc_half8 para_f16; \\\n\ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(para_s16, param, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, para_f16, para_s16, 16); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + vxc_ushort8 src1, src2; \\\n\ + float4 srcA, srcB; \\\n\ + float4 para0_f32, para1_f32; \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, srcA, src1, 16); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, srcB, src1, 16); \\\n\ + VXC_DP4x4(para0_f32, para_f16, para_f16, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvF16toF32_Part0_4x4);\\\n\ + VXC_DP4x4(para1_f32, para_f16, para_f16, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvF16toF32_Part1_4x4);\\\n\ + srcA = srcA >= 0 ? srcA : srcA * para0_f32; \\\n\ + srcB = srcB >= 0 ? 
srcB : srcB * para1_f32; \\\n\ + _viv_asm(COPY, src1, srcA, 16); \\\n\ + _viv_asm(COPY, src2, srcB, 16); \\\n\ + VXC_DP2x8(src1, src1, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackedBF16_2x8); \\\n\ + write_fun(output, coord, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void prelu_BF16F16toBF16_2D\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t param,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + PRELU_BF16F16TOBF16_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +__kernel void prelu_BF16F16toBF16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t param,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + PRELU_BF16F16TOBF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +#define PRELU_BF16BF16TOBF16_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 src0, para_s16; \\\n\ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(para_s16, param, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + vxc_ushort8 src1, src2; \\\n\ + float4 srcA, srcB; \\\n\ + float4 para0_f32, para1_f32; \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, srcA, src1, 16); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, srcB, src1, 16); \\\n\ + VXC_DP2x8(src1, para_s16, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, para0_f32, src1, 16); \\\n\ + VXC_DP2x8(src1, para_s16, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, para1_f32, src1, 16); \\\n\ + srcA = srcA >= 0 ? srcA : srcA * para0_f32; \\\n\ + srcB = srcB >= 0 ? 
srcB : srcB * para1_f32; \\\n\ + _viv_asm(COPY, src1, srcA, 16); \\\n\ + _viv_asm(COPY, src2, srcB, 16); \\\n\ + VXC_DP2x8(src1, src1, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackedBF16_2x8); \\\n\ + write_fun(output, coord, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void prelu_BF16BF16toBF16_2D\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t param,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + PRELU_BF16BF16TOBF16_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +__kernel void prelu_BF16BF16toBF16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t param,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + PRELU_BF16BF16TOBF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +"; /* end of prelu_BF16_vx*/ + +static const char random_multinomial_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +_viv_uniform int class_max_iter;\n\ +\n\ +_viv_uniform VXC_512Bits uniPackMaxData_2x8;\n\ +_viv_uniform VXC_512Bits uniGetSubData0to3_4x4;\n\ +_viv_uniform VXC_512Bits uniGetSubData4to7_4x4;\n\ +_viv_uniform int iter;\n\ +_viv_uniform int stride;\n\ +_viv_uniform float re_rand_max;\n\ +\n\ +inline uchar* get_image2D_array_ptr(image2d_array_t input)\n\ +{\n\ + int8 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ + uchar *src_ptr = (uchar*)desc.s0;\n\ +\n\ + return src_ptr;\n\ +}\n\ +\n\ +uint4 _philox4x32bumpkey(uint4 key)\n\ +{\n\ + uint4 mask = (uint4)((uint)0x9E3779B9, (uint)0xBB67AE85, 0, 0);\n\ + //key.x += ((uint)0x9E3779B9);\n\ + //key.y += ((uint)0xBB67AE85);\n\ + key += mask;\n\ + return key;\n\ +}\n\ +\n\ +uint mulhilo32(uint a, uint b, uint* hip)\n\ +{\n\ + uint product = (uint)(a * b);\n\ + *hip = mul_hi(a, b);\n\ + return product;\n\ +}\n\ +\n\ +uint mullo32(uint a, uint b)\n\ +{\n\ + return a * b;\n\ +}\n\ +\n\ +uint mulhi32(uint a, uint b)\n\ +{\n\ + return mul_hi(a, b);\n\ +}\n\ +\n\ +uint4 _philox4x32round(uint4 ctr, uint4 key)\n\ +{\n\ + //uint hi0;\n\ + //uint hi1;\n\ + uint PHILOX_M4x32_0 = ((uint)0xD2511F53);\n\ + uint PHILOX_M4x32_1 = ((uint)0xCD9E8D57);\n\ + uint lo0 = mullo32(PHILOX_M4x32_0, ctr.x);\n\ + uint hi0 = mulhi32(PHILOX_M4x32_0, ctr.x);\n\ + uint lo1 = mullo32(PHILOX_M4x32_1, ctr.z);\n\ + uint hi1 = mulhi32(PHILOX_M4x32_1, ctr.z);\n\ + //uint lo0 = mulhilo32(PHILOX_M4x32_0, ctr.x, &hi0);\n\ + //uint lo1 = mulhilo32(PHILOX_M4x32_1, ctr.z, &hi1);\n\ + uint4 out = (uint4)(hi1^ctr.y^key.x, lo1, hi0^ctr.w^key.y, lo0);\n\ + return out;\n\ +}\n\ +\n\ +uint4 philox4x32_R_10(uint4 ctr, uint4 key)\n\ +{\n\ + uint i;\n\ + ctr = _philox4x32round(ctr, key);\n\ + for (i = 1; i < 10; i++)\n\ + {\n\ + key = _philox4x32bumpkey(key);\n\ + ctr = _philox4x32round(ctr, key);\n\ + }\n\ + return ctr;\n\ +}\n\ +\n\ +__kernel void random_seed(\n\ + __read_only image2d_array_t seeds,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord = (int4)(gidx << 1, gidy, 0, 0);\n\ +\n\ + int width = get_image_width(seeds);\n\ + __global uint* seeds_ptr = (__global uint*)get_image2D_array_ptr(seeds);\n\ + seeds_ptr = seeds_ptr + coord.x + coord.y * width;\n\ + uint4 key = vload4(0, seeds_ptr);\n\ +\n\ + uint4 ctr = (uint4)(0);\n\ + float4 result = 0;\n\ +\n\ + width = get_image_width(output);\n\ + coord.x = gidx * stride + width * coord.y;\n\ + 
__global float* output_ptr = (__global float*)get_image2D_array_ptr(output);\n\ + output_ptr += coord.x;\n\ +\n\ + for(int i = 0; i < iter; i++)\n\ + {\n\ + ctr = philox4x32_R_10(ctr, key);\n\ + result = convert_float4(ctr) * re_rand_max;\n\ + vstore4(result, i, output_ptr);\n\ + }\n\ +}\n\ +\n\ +#define logE (1.44269502f)\n\ +float4 eltwise_unary_exp(float4 x)\n\ +{\n\ + x *= logE;\n\ + x = exp2(x);\n\ + return x;\n\ +}\n\ +// N times of 8\n\ +// x dim = 1\n\ +__kernel void random_multinomial_cdf_F16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord = (int4)(gidx, gidy, 0, 0);\n\ +\n\ + vxc_half8 maxData, data;\n\ + vxc_short8 src0;\n\ + float4 dst0 = 0, dst1 = 0;\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + float tmp = 0;\n\ +\n\ + int class_max_stride = get_image_width(input);\n\ + int offset = gidy * class_max_stride;\n\ + __global float* output_ptr = (__global float*)get_image2D_array_ptr(output);\n\ + __global float* cdfPtr = output_ptr + offset;\n\ +\n\ + VXC_ReadImage(maxData, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + for(int i = 1; i < class_max_iter; i++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + _viv_asm(COPY, data, src0, 16);\n\ +\n\ + VXC_VertMax3_Half(maxData, maxData, maxData, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + VXC_HorzMax3_Half(maxData, maxData, VXC_MODIFIER(0, 5, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(maxData, maxData, maxData, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8);\n\ + VXC_HorzMax3_Half(maxData, maxData, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord.x = 0;\n\ + for(int i = 0; i < class_max_iter; i++)\n\ + {\n\ + float4 val0, val1;\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + _viv_asm(COPY, data, src0, 16);\n\ + VXC_DP4x4(val0, data, maxData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4);\n\ + VXC_DP4x4(val1, data, maxData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData4to7_4x4);\n\ + val0 = eltwise_unary_exp(val0);\n\ + val1 = eltwise_unary_exp(val1);\n\ + val0.x += dst1.w;\n\ + dst0 = (float4)(val0.x, (val0.x + val0.y), dot(val0, (float4)(1, 1, 1, 0)), dot(val0, one));\n\ + val1.x += dst0.w;\n\ + dst0 = (float4)(val1.x, (val1.x + val1.y), dot(val1, (float4)(1, 1, 1, 0)), dot(val1, one));\n\ + vstore4(dst0, 0, cdfPtr);\n\ + vstore4(dst1, 1, cdfPtr);\n\ + cdfPtr += 8;\n\ + }\n\ +}\n\ +\n\ +__kernel void random_multinomial_cdf_F32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord = (int4)(gidx, gidy, 0, 0);\n\ +\n\ + vxc_float4 src0, data;\n\ + float maxData0 = FLT_MIN, maxData1 = FLT_MIN;\n\ + uint4 ctr = (uint4)(0);\n\ + float4 dst = 0;\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ +\n\ + int class_max_stride = get_image_width(input);\n\ + float tmp = 0;\n\ + int offset = gidy * class_max_stride;\n\ + __global float* output_ptr = (__global float*)get_image2D_array_ptr(output);\n\ + __global float* cdfPtr = output_ptr + offset;\n\ +\n\ + int width = get_image_width(input);\n\ + __global float* input_ptr = (__global 
float*)get_image2D_array_ptr(input);\n\ + input_ptr = input_ptr + coord.x + coord.y * width;\n\ +\n\ + float4 maxVal = vload4(0, input_ptr);\n\ + for(int i = 1; i < class_max_iter; i++)\n\ + {\n\ + src0 = vload4(i, input_ptr);\n\ +\n\ + maxVal = maxVal > src0 ? maxVal : src0;\n\ + }\n\ + maxVal.xy = maxVal.xy > maxVal.zw ? maxVal.xy : maxVal.zw;\n\ + maxData0 = maxVal.x > maxVal.y ? maxVal.x : maxVal.y;\n\ +\n\ + float4 maxData = (float4)(maxData0, maxData0, maxData0, maxData0);\n\ + for(int i = 0; i < class_max_iter; i++)\n\ + {\n\ + float4 val;\n\ + src0 = vload4(i, input_ptr);\n\ + data = src0 - maxData;\n\ + val = eltwise_unary_exp(data);\n\ + val.x += dst.w;\n\ + dst = (float4)(val.x, (val.x + val.y), dot(val, (float4)(1, 1, 1, 0)), dot(val, one));\n\ + vstore4(dst, i, cdfPtr);\n\ + }\n\ +}\n\ +\n\ +uint upper_bound(float* a, int n, float x)\n\ +{\n\ + uint l = 0;\n\ + uint h = n;\n\ + while (l < h) {\n\ + int mid = (l + h) >> 1;\n\ + if (x >= a[mid]) {\n\ + l = mid + 1;\n\ + } else {\n\ + h = mid;\n\ + }\n\ + }\n\ + return l;\n\ +}\n\ +\n\ +// one thread calculate 4\n\ +__kernel void random_multinomial\n\ + (\n\ + __read_only image2d_array_t randoms,\n\ + __read_only image2d_array_t cdfs,\n\ + __write_only image2d_array_t output,\n\ + int class_size\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord = (int4)(gidx, gidy, 0, 0);\n\ +\n\ + int class_max_stride = get_image_width(cdfs);\n\ + int offset = gidy * class_max_stride;\n\ + __global float* cdf_ptr = (__global float*)get_image2D_array_ptr(cdfs);\n\ + __global float* cdfPtr = cdf_ptr + offset;\n\ +\n\ + int width = get_image_width(randoms);\n\ + offset = coord.x + coord.y * width;\n\ + __global float* randoms_ptr = (__global float*)get_image2D_array_ptr(randoms);\n\ + randoms_ptr = randoms_ptr + offset;\n\ +\n\ + width = get_image_width(output);\n\ + offset = coord.x + coord.y * width;\n\ + __global uint* output_ptr = (__global uint*)get_image2D_array_ptr(output);\n\ + output_ptr = output_ptr + offset;\n\ +\n\ + float4 ran = vload4(0, randoms_ptr);\n\ + float total = cdfPtr[class_size - 1];\n\ + float4 target = ran * total;\n\ +\n\ + uint4 out_class = (uint4)(0);\n\ + out_class.x = upper_bound(cdfPtr, class_size, target.x);\n\ + out_class.y = upper_bound(cdfPtr, class_size, target.y);\n\ + out_class.z = upper_bound(cdfPtr, class_size, target.z);\n\ + out_class.w = upper_bound(cdfPtr, class_size, target.w);\n\ +\n\ + vstore4(out_class, 0, output_ptr);\n\ +}\n\ +\n\ +"; /* end of random_multinomial_vx*/ + +static const char reduceall_internal_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +_viv_uniform int axisSize;\n\ +\n\ +_viv_uniform VXC_512Bits uniS8AddAll_16x1;\n\ +\n\ +#define REDUCEALL_AXIS0_PROCESS(read_fun, write_fun) \\\n\ + vxc_char16 ones = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; \\\n\ + vxc_char16 zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; \\\n\ + int4 sum_val = 0; \\\n\ + result = ones; \\\n\ + do \\\n\ + { \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + val = val0 != zeros ? 
ones : zeros; \\\n\ + VXC_DP16x1(sum_val, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniS8AddAll_16x1); \\\n\ + if (sum_val.x != 16) \\\n\ + { \\\n\ + result = zeros; \\\n\ + break; \\\n\ + } \\\n\ + coord.x += 16; \\\n\ + } \\\n\ + while(coord.x < axisSize); \\\n\ + write_fun(output, coord_out, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void reduceall_axis0_I8toI8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axisVal\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ + vxc_char16 val0;\n\ + vxc_char16 val, result;\n\ + REDUCEALL_AXIS0_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage)\n\ +}\n\ +\n\ +__kernel void reduceall_axis0_I8toI8_2D\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axisVal\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ + int2 coord_out = (int2)(get_global_id(0), 0);\n\ + vxc_char16 val0;\n\ + vxc_char16 val, result;\n\ + REDUCEALL_AXIS0_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +\n\ +"; /* end of reduceall_internal_axis0_vx*/ + +static const char reduceall_internal_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int axisSize;\n\ +\n\ +#define REDUCEALL_AXIS1_PROCESS(read_fun, write_fun) \\\n\ + vxc_char16 ones = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; \\\n\ + vxc_char16 zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + result = val0 != zeros ? ones : zeros; \\\n\ + coord.y++; \\\n\ + while(coord.y < axisSize) \\\n\ + { \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + val = val0 != zeros ? ones : zeros; \\\n\ + result = result & val; \\\n\ + coord.y++; \\\n\ + } \\\n\ + write_fun(output, coord_out, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void reduceall_axis1_I8toI8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axisVal\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ + vxc_char16 val0;\n\ + vxc_char16 val, result;\n\ + REDUCEALL_AXIS1_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage)\n\ +}\n\ +\n\ +__kernel void reduceall_axis1_I8toI8_2D\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axisVal\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int2 coord_out = (int2)(get_global_id(0), 0);\n\ + vxc_char16 val0;\n\ + vxc_char16 val, result;\n\ + REDUCEALL_AXIS1_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +"; /* end of reduceall_internal_axis1_vx*/ + +static const char reduceall_internal_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int axisSize;\n\ +\n\ +#define REDUCEALL_AXIS2_PROCESS(read_fun, write_fun) \\\n\ + vxc_char16 ones = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; \\\n\ + vxc_char16 zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + result = val0 != zeros ? 
ones : zeros; \\\n\ + coord.z++; \\\n\ + while(coord.z < axisSize) \\\n\ + { \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + val = val0 != zeros ? ones : zeros; \\\n\ + result = result & val; \\\n\ + coord.z++; \\\n\ + } \\\n\ + write_fun(output, coord.xy, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void reduceall_axis2_I8toI8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axisVal\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + vxc_char16 val0;\n\ + vxc_char16 val, result;\n\ + REDUCEALL_AXIS2_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage)\n\ +}\n\ +\n\ +\n\ +"; /* end of reduceall_internal_axis2_vx*/ + +static const char reduceany_internal_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +_viv_uniform int axisSize;\n\ +\n\ +_viv_uniform VXC_512Bits uniS8AddAll_16x1;\n\ +\n\ +#define REDUCEANY_AXIS0_PROCESS(read_fun, write_fun) \\\n\ + vxc_char16 ones = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; \\\n\ + vxc_char16 zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; \\\n\ + int4 sum_val = 0; \\\n\ + result = zeros; \\\n\ + do \\\n\ + { \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + val = val0 != zeros ? ones : zeros; \\\n\ + VXC_DP16x1(sum_val, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniS8AddAll_16x1); \\\n\ + if (sum_val.x != 0) \\\n\ + { \\\n\ + result = ones; \\\n\ + break; \\\n\ + } \\\n\ + coord.x += 16; \\\n\ + } \\\n\ + while(coord.x < axisSize); \\\n\ + write_fun(output, coord_out, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void reduceany_axis0_I8toI8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axisVal\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ + vxc_char16 val0;\n\ + vxc_char16 val, result;\n\ + REDUCEANY_AXIS0_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage)\n\ +}\n\ +\n\ +__kernel void reduceany_axis0_I8toI8_2D\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axisVal\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ + int2 coord_out = (int2)(get_global_id(0), 0);\n\ + vxc_char16 val0;\n\ + vxc_char16 val, result;\n\ + REDUCEANY_AXIS0_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +\n\ +"; /* end of reduceany_internal_axis0_vx*/ + +static const char reduceany_internal_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int axisSize;\n\ +\n\ +#define REDUCEANY_AXIS1_PROCESS(read_fun, write_fun) \\\n\ + vxc_char16 ones = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; \\\n\ + vxc_char16 zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + result = val0 != zeros ? ones : zeros; \\\n\ + coord.y++; \\\n\ + while(coord.y < axisSize) \\\n\ + { \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + val = val0 != zeros ? 
ones : zeros; \\\n\ + result = result | val; \\\n\ + coord.y++; \\\n\ + } \\\n\ + write_fun(output, coord_out, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void reduceany_axis1_I8toI8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axisVal\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ + vxc_char16 val0;\n\ + vxc_char16 val, result;\n\ + REDUCEANY_AXIS1_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage)\n\ +}\n\ +\n\ +__kernel void reduceany_axis1_I8toI8_2D\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axisVal\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int2 coord_out = (int2)(get_global_id(0), 0);\n\ + vxc_char16 val0;\n\ + vxc_char16 val, result;\n\ + REDUCEANY_AXIS1_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +"; /* end of reduceany_internal_axis1_vx*/ + +static const char reduceany_internal_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int axisSize;\n\ +\n\ +#define REDUCEANY_AXIS2_PROCESS(read_fun, write_fun) \\\n\ + vxc_char16 ones = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; \\\n\ + vxc_char16 zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + result = val0 != zeros ? ones : zeros; \\\n\ + coord.z++; \\\n\ + while(coord.z < axisSize) \\\n\ + { \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + val = val0 != zeros ? ones : zeros; \\\n\ + result = result | val; \\\n\ + coord.z++; \\\n\ + } \\\n\ + write_fun(output, coord.xy, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void reduceany_axis2_I8toI8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axisVal\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + vxc_char16 val0;\n\ + vxc_char16 val, result;\n\ + REDUCEANY_AXIS2_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage)\n\ +}\n\ +\n\ +\n\ +"; /* end of reduceany_internal_axis2_vx*/ + +static const char reducemax_internal_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform float input_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniGetLoData_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniPackMaxData_2x8;\n\ +\n\ +#define REDUCEMAX_PROCESS_AXIS0(read_fun, vert_max_fun, horz_max_fun) \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, val, val0, 16); \\\n\ + coord.x += 8; \\\n\ + do \\\n\ + { \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val1, val1, 16); \\\n\ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val2, val2, 16); \\\n\ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val3, val3, 16); \\\n\ 
+ coord.x += 32; \\\n\ + vert_max_fun(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + vert_max_fun(val, img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + horz_max_fun(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \\\n\ + horz_max_fun(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + while(coord.x < (axisSize + 16));\n\ +\n\ +#define REDUCEMAX_PROCESS_AXIS0_SAVE_SAME(save_type, write_fun) \\\n\ + save_type dst; \\\n\ + _viv_asm(COPY, dst, val, 16); \\\n\ + write_fun(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define REDUCEMAX_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, write_fun) \\\n\ + vxc_float4 prob; \\\n\ + dst_type vec1; \\\n\ + save_type dst; \\\n\ + VXC_DP4x4(prob, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \\\n\ + prob = ((prob - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, vec1, prob); \\\n\ + _viv_asm(COPY, dst, vec1, 16); \\\n\ + write_fun(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define REDUCEMAX_AXIS0_SAME(src_name, dst_name, src_type, copy_type, save_type, vert_max_fun, horz_max_fun) \\\n\ +__kernel void reducemax_axis0_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0); \\\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + REDUCEMAX_PROCESS_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun); \\\n\ + REDUCEMAX_PROCESS_AXIS0_SAVE_SAME(save_type, VXC_WriteImage); \\\n\ +}\n\ +\n\ +\n\ +#define REDUCEMAX_AXIS0(src_name, dst_name, src_type, copy_type, dst_type,\\\n\ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_max_fun, horz_max_fun) \\\n\ +__kernel void reducemax_axis0_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0); \\\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + REDUCEMAX_PROCESS_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun); \\\n\ + REDUCEMAX_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \\\n\ +}\n\ +\n\ +REDUCEMAX_AXIS0_SAME(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +\n\ +REDUCEMAX_AXIS0(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +REDUCEMAX_AXIS0(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, \\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +REDUCEMAX_AXIS0(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ + CONV_SAT_RTE, outputScale, output_offset_asymmetric,\\\n\ + 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +REDUCEMAX_AXIS0(I16, F16, 
vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +REDUCEMAX_AXIS0(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +REDUCEMAX_AXIS0(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\n\ +CONV, 1, 0, inputScale, input_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +\n\ +REDUCEMAX_AXIS0(I16, I16, vxc_short8, vxc_short8, short4,\n\ +vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +REDUCEMAX_AXIS0(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\n\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +REDUCEMAX_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\\\n\ +outputScale, output_offset_asymmetric, inputScale,\\\n\ +input_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +\n\ +#define REDUCEMAX_AXIS0_SAME_2D(src_name, dst_name, src_type, copy_type, save_type, vert_max_fun, horz_max_fun) \\\n\ +__kernel void reducemax_axis0_##src_name##to##dst_name##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(16, get_global_id(0)); \\\n\ + int2 coord_out = (int2)(get_global_id(0), 0); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + REDUCEMAX_PROCESS_AXIS0(VXC_ReadImage, vert_max_fun, horz_max_fun); \\\n\ + REDUCEMAX_PROCESS_AXIS0_SAVE_SAME(save_type, VXC_WriteImage); \\\n\ +}\n\ +\n\ +#define REDUCEMAX_AXIS0_2D(src_name, dst_name, src_type, copy_type,\\\n\ + dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_max_fun, horz_max_fun) \\\n\ +__kernel void reducemax_axis0_##src_name##to##dst_name##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(16, get_global_id(0)); \\\n\ + int2 coord_out = (int2)(get_global_id(0), 0); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + REDUCEMAX_PROCESS_AXIS0(VXC_ReadImage, vert_max_fun, horz_max_fun); \\\n\ + REDUCEMAX_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \\\n\ +}\n\ +\n\ +REDUCEMAX_AXIS0_SAME_2D(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +\n\ +REDUCEMAX_AXIS0_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +REDUCEMAX_AXIS0_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +REDUCEMAX_AXIS0_2D(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ + CONV_SAT_RTE, outputScale, output_offset_asymmetric,\\\n\ + 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +REDUCEMAX_AXIS0_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +REDUCEMAX_AXIS0_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +REDUCEMAX_AXIS0_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4,\n\ + vxc_short8, 
CONV, 1, 0, inputScale,\\\n\ + input_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +REDUCEMAX_AXIS0_2D(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\n\ + CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +REDUCEMAX_AXIS0_2D(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\n\ + CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +REDUCEMAX_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\\\n\ +outputScale, output_offset_asymmetric, inputScale,\\\n\ +input_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +\n\ +\n\ +"; /* end of reducemax_internal_axis0_vx*/ + +static const char reducemax_internal_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform float input_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniGetLoData_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniGetHiData_4x4;\n\ +\n\ +#define REDUCEMAX_PROCESS_AXIS1(read_fun, vert_max_fun) \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, max, in0, 16); \\\n\ + coord.y++; \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + vert_max_fun(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + } \\\n\ + while(coord.y < axisSize);\n\ +\n\ +#define REDUCEMAX_PROCESS_AXIS1_SAVE_SAME(save_type, write_fun) \\\n\ + save_type vect; \\\n\ + _viv_asm(COPY, vect, max, 16); \\\n\ + write_fun(output, coord_out, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define REDUCEMAX_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\\\n\ +OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, write_fun) \\\n\ + dst_type dst0, dst1; \\\n\ + save_type vect; \\\n\ + VXC_DP4x4(data0, max, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \\\n\ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst0, data0); \\\n\ + VXC_DP4x4(data0, max, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \\\n\ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst1, data0); \\\n\ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord_out, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define REDUCEMAX_AXIS1_SAME(src_name, dst_name, src_type, copy_type, save_type, vert_max_fun) \\\n\ +__kernel void reducemax_axis1_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + REDUCEMAX_PROCESS_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \\\n\ + REDUCEMAX_PROCESS_AXIS1_SAVE_SAME(save_type, VXC_WriteImage); \\\n\ +}\n\ +\n\ +#define REDUCEMAX_AXIS1(src_name, dst_name, src_type, copy_type, dst_type, save_type,\\\n\ +conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_max_fun) 
\\\n\ +__kernel void reducemax_axis1_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + REDUCEMAX_PROCESS_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \\\n\ + REDUCEMAX_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \\\n\ +}\n\ +\n\ +\n\ +\n\ +REDUCEMAX_AXIS1_SAME(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMax3_Half)\n\ +\n\ +\n\ +REDUCEMAX_AXIS1(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half)\n\ +REDUCEMAX_AXIS1(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, \\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half)\n\ +REDUCEMAX_AXIS1(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMax3_Half)\n\ +REDUCEMAX_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer)\n\ +REDUCEMAX_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer)\n\ +REDUCEMAX_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\n\ +CONV, 1, 0, inputScale, input_offset_asymmetric, VXC_VertMax3_Integer)\n\ +REDUCEMAX_AXIS1(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\n\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer)\n\ +REDUCEMAX_AXIS1(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\n\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer)\n\ +REDUCEMAX_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\\\n\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric, VXC_VertMax3_Integer)\n\ +\n\ +\n\ +#define REDUCEMAX_AXIS1_SAME_2D(src_name, dst_name, src_type, copy_type, save_type, vert_max_fun) \\\n\ +__kernel void reducemax_axis1_##src_name##to##dst_name##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), 0); \\\n\ + int2 coord_out = (int2)(get_global_id(0), 0); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + REDUCEMAX_PROCESS_AXIS1(VXC_ReadImage, vert_max_fun) \\\n\ + REDUCEMAX_PROCESS_AXIS1_SAVE_SAME(save_type, VXC_WriteImage); \\\n\ +}\n\ +\n\ +#define REDUCEMAX_AXIS1_2D(src_name, dst_name, src_type, copy_type, dst_type,\\\n\ +save_type, conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_max_fun) \\\n\ +__kernel void reducemax_axis1_##src_name##to##dst_name##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), 0); \\\n\ + int2 coord_out = (int2)(get_global_id(0), 0); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + REDUCEMAX_PROCESS_AXIS1(VXC_ReadImage, vert_max_fun) \\\n\ + REDUCEMAX_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \\\n\ +}\n\ +\n\ +REDUCEMAX_AXIS1_SAME_2D(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMax3_Half)\n\ +\n\ 
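+// The mixed-type 2D variants below rescale each value as ((x - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET\n\
+// (see REDUCEMAX_PROCESS_AXIS1_SAVE above) before writing, so quantized inputs/outputs use the\n\
+// inputScale/input_offset_asymmetric and outputScale/output_offset_asymmetric uniforms.\n\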
+REDUCEMAX_AXIS1_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half)\n\ +REDUCEMAX_AXIS1_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, \\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half)\n\ +REDUCEMAX_AXIS1_2D(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMax3_Half)\n\ +REDUCEMAX_AXIS1_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer)\n\ +REDUCEMAX_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer)\n\ +REDUCEMAX_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4,\n\ +vxc_short8, CONV, 1, 0, inputScale, input_offset_asymmetric, VXC_VertMax3_Integer)\n\ +REDUCEMAX_AXIS1_2D(I16, I16, vxc_short8, vxc_short8,\\\n\ +short4, vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer)\n\ +REDUCEMAX_AXIS1_2D(I8, I8, vxc_char16, vxc_char16,\\\n\ +char4, vxc_char8, CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer)\n\ +REDUCEMAX_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\\\n\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric, VXC_VertMax3_Integer)\n\ +"; /* end of reducemax_internal_axis1_vx*/ + +static const char reducemax_internal_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform float input_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniGetLoData_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniGetHiData_4x4;\n\ +\n\ +#define REDUCEMAX_PROCESS_AXIS2(read_fun, vert_max_fun) \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, max, in0, 16); \\\n\ + coord.z++; \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + vert_max_fun(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + } \\\n\ + while(coord.z < axisSize);\n\ +\n\ +#define REDUCEMAX_PROCESS_AXIS2_SAVE_SAME(save_type, write_fun) \\\n\ + save_type vect; \\\n\ + _viv_asm(COPY, vect, max, 16); \\\n\ + write_fun(output, coord.xy, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define REDUCEMAX_PROCESS_AXIS2_SAVE(dst_type, save_type, conv_mode,\\\n\ +OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, write_fun) \\\n\ + dst_type dst0, dst1; \\\n\ + save_type vect; \\\n\ + VXC_DP4x4(data0, max, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \\\n\ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst0, data0); \\\n\ + VXC_DP4x4(data0, max, max, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \\\n\ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst1, data0); \\\n\ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord.xy, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define REDUCEMAX_AXIS2_SAME(src_name, dst_name, src_type, copy_type, save_type, vert_max_fun) \\\n\ +__kernel void reducemax_axis2_##src_name##to##dst_name \\\n\ + ( \\\n\ + 
__read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + REDUCEMAX_PROCESS_AXIS2(VXC_ReadImage2DArray, vert_max_fun) \\\n\ + REDUCEMAX_PROCESS_AXIS2_SAVE_SAME(save_type, VXC_WriteImage); \\\n\ +}\n\ +\n\ +#define REDUCEMAX_AXIS2(src_name, dst_name, src_type, copy_type, dst_type,\\\n\ +save_type, conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_max_fun) \\\n\ +__kernel void reducemax_axis2_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + REDUCEMAX_PROCESS_AXIS2(VXC_ReadImage2DArray, vert_max_fun) \\\n\ + REDUCEMAX_PROCESS_AXIS2_SAVE(dst_type, save_type, conv_mode, OUT_SCALE,\\\n\ + OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \\\n\ +}\n\ +\n\ +\n\ +\n\ +REDUCEMAX_AXIS2_SAME(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMax3_Half)\n\ +\n\ +\n\ +REDUCEMAX_AXIS2(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half)\n\ +REDUCEMAX_AXIS2(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, \\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMax3_Half)\n\ +REDUCEMAX_AXIS2(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMax3_Half)\n\ +REDUCEMAX_AXIS2(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer)\n\ +REDUCEMAX_AXIS2(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMax3_Integer)\n\ +REDUCEMAX_AXIS2(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, input_offset_asymmetric, VXC_VertMax3_Integer)\n\ +\n\ +REDUCEMAX_AXIS2(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer)\n\ +REDUCEMAX_AXIS2(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMax3_Integer)\n\ +REDUCEMAX_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\\\n\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric, VXC_VertMax3_Integer)\n\ +\n\ +"; /* end of reducemax_internal_axis2_vx*/ + +static const char reducemin_internal_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform float input_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniGetLoData_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniPackMaxData_2x8;\n\ +\n\ +#define REDUCEMIN_PROCESS_AXIS0(read_fun, vert_min_fun, horz_min_fun) \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, val, val0, 16); \\\n\ + coord.x += 8; \\\n\ + do \\\n\ + { \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + 
_viv_asm(COPY, img_val1, val1, 16); \\\n\ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val2, val2, 16); \\\n\ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val3, val3, 16); \\\n\ + coord.x += 32; \\\n\ + vert_min_fun(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + vert_min_fun(val, img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + while(coord.x < (axisSize + 16)); \\\n\ + horz_min_fun(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \\\n\ + horz_min_fun(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define REDUCEMIN_PROCESS_AXIS0_SAVE_SAME(save_type, write_fun) \\\n\ + save_type dst; \\\n\ + _viv_asm(COPY, dst, val, 16); \\\n\ + write_fun(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define REDUCEMIN_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\\\n\ +OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, write_fun) \\\n\ + vxc_float4 prob; \\\n\ + dst_type vec1; \\\n\ + save_type dst; \\\n\ + VXC_DP4x4(prob, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \\\n\ + prob = ((prob - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, vec1, prob); \\\n\ + _viv_asm(COPY, dst, vec1, 16); \\\n\ + write_fun(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define REDUCEMIN_AXIS0_SAME(src_name, dst_name, src_type, copy_type, save_type, vert_min_fun, horz_min_fun) \\\n\ +__kernel void reducemin_axis0_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0); \\\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + REDUCEMIN_PROCESS_AXIS0(VXC_ReadImage2DArray, vert_min_fun, horz_min_fun); \\\n\ + REDUCEMIN_PROCESS_AXIS0_SAVE_SAME(save_type, VXC_WriteImage); \\\n\ +}\n\ +\n\ +\n\ +#define REDUCEMIN_AXIS0(src_name, dst_name, src_type, copy_type, dst_type,\\\n\ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET,\\\n\ + IN_SCALE, IN_OFFSET, vert_min_fun, horz_min_fun) \\\n\ +__kernel void reducemin_axis0_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(16, get_global_id(0), get_global_id(1), 0); \\\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + REDUCEMIN_PROCESS_AXIS0(VXC_ReadImage2DArray, vert_min_fun, horz_min_fun); \\\n\ + REDUCEMIN_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \\\n\ +}\n\ +\n\ +REDUCEMIN_AXIS0_SAME(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMin3_Half, VXC_HorzMin3_Half)\n\ +\n\ +REDUCEMIN_AXIS0(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8, CONV_SAT_RTE, outputScale,\\\n\ +0, 1, 0, VXC_VertMin3_Half, VXC_HorzMin3_Half)\n\ +REDUCEMIN_AXIS0(F16, I8, vxc_half8, 
vxc_short8, char4, vxc_char8, CONV_SAT_RTE, outputScale,\\\n\ +0, 1, 0, VXC_VertMin3_Half, VXC_HorzMin3_Half)\n\ +REDUCEMIN_AXIS0(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ + CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMin3_Half, VXC_HorzMin3_Half)\n\ +REDUCEMIN_AXIS0(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8, CONV, 1, 0,\\\n\ +inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer)\n\ +REDUCEMIN_AXIS0(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8, CONV, 1, 0,\\\n\ +inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer)\n\ +REDUCEMIN_AXIS0(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8, CONV,\\\n\ +1, 0, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer, VXC_HorzMin3_Integer)\n\ +\n\ +REDUCEMIN_AXIS0(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer)\n\ +REDUCEMIN_AXIS0(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer)\n\ +REDUCEMIN_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\\\n\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric,\\\n\ +VXC_VertMin3_Integer, VXC_HorzMin3_Integer)\n\ +\n\ +#define REDUCEMIN_AXIS0_SAME_2D(src_name, dst_name, src_type, copy_type,\\\n\ +save_type, vert_min_fun, horz_min_fun) \\\n\ +__kernel void reducemin_axis0_##src_name##to##dst_name##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(16, get_global_id(0)); \\\n\ + int2 coord_out = (int2)(get_global_id(0), 0); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + REDUCEMIN_PROCESS_AXIS0(VXC_ReadImage, vert_min_fun, horz_min_fun); \\\n\ + REDUCEMIN_PROCESS_AXIS0_SAVE_SAME(save_type, VXC_WriteImage); \\\n\ +}\n\ +\n\ +#define REDUCEMIN_AXIS0_2D(src_name, dst_name, src_type, copy_type,\\\n\ + dst_type, save_type, conv_mode, OUT_SCALE,\\\n\ + OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_min_fun, horz_min_fun) \\\n\ +__kernel void reducemin_axis0_##src_name##to##dst_name##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(16, get_global_id(0)); \\\n\ + int2 coord_out = (int2)(get_global_id(0), 0); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + REDUCEMIN_PROCESS_AXIS0(VXC_ReadImage, vert_min_fun, horz_min_fun); \\\n\ + REDUCEMIN_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \\\n\ +}\n\ +\n\ +REDUCEMIN_AXIS0_SAME_2D(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMin3_Half, VXC_HorzMin3_Half)\n\ +\n\ +REDUCEMIN_AXIS0_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half, VXC_HorzMin3_Half)\n\ +REDUCEMIN_AXIS0_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half, VXC_HorzMin3_Half)\n\ +REDUCEMIN_AXIS0_2D(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMin3_Half, VXC_HorzMin3_Half)\n\ +\n\ +REDUCEMIN_AXIS0_2D(I16, F16, vxc_short8, vxc_short8, half4, 
vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer)\n\ +REDUCEMIN_AXIS0_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer)\n\ +REDUCEMIN_AXIS0_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer, VXC_HorzMin3_Integer)\n\ +REDUCEMIN_AXIS0_2D(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer)\n\ +REDUCEMIN_AXIS0_2D(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer, VXC_HorzMin3_Integer)\n\ +REDUCEMIN_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\\\n\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric,\\\n\ +VXC_VertMin3_Integer, VXC_HorzMin3_Integer)\n\ +\n\ +\n\ +"; /* end of reducemin_internal_axis0_vx*/ + +static const char reducemin_internal_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform float input_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniGetLoData_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniGetHiData_4x4;\n\ +\n\ +#define REDUCEMIN_PROCESS_AXIS1(read_fun, vert_min_fun) \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, min, in0, 16); \\\n\ + coord.y++; \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + vert_min_fun(min, min, min, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + } \\\n\ + while(coord.y < axisSize);\n\ +\n\ +#define REDUCEMIN_PROCESS_AXIS1_SAVE_SAME(save_type, write_fun) \\\n\ + save_type vect; \\\n\ + _viv_asm(COPY, vect, min, 16); \\\n\ + write_fun(output, coord_out, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define REDUCEMIN_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\\\n\ +OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, write_fun) \\\n\ + dst_type dst0, dst1; \\\n\ + save_type vect; \\\n\ + VXC_DP4x4(data0, min, min, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \\\n\ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst0, data0); \\\n\ + VXC_DP4x4(data0, min, min, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \\\n\ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst1, data0); \\\n\ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord_out, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define REDUCEMIN_AXIS1_SAME(src_name, dst_name, src_type, copy_type, save_type, vert_min_fun) \\\n\ +__kernel void reducemin_axis1_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type vec0, min; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; 
\\\n\ + REDUCEMIN_PROCESS_AXIS1(VXC_ReadImage2DArray, vert_min_fun) \\\n\ + REDUCEMIN_PROCESS_AXIS1_SAVE_SAME(save_type, VXC_WriteImage); \\\n\ +}\n\ +\n\ +#define REDUCEMIN_AXIS1(src_name, dst_name, src_type, copy_type, dst_type, save_type,\\\n\ +conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_min_fun) \\\n\ +__kernel void reducemin_axis1_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type vec0, min; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + REDUCEMIN_PROCESS_AXIS1(VXC_ReadImage2DArray, vert_min_fun) \\\n\ + REDUCEMIN_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \\\n\ +}\n\ +\n\ +\n\ +\n\ +REDUCEMIN_AXIS1_SAME(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMin3_Half)\n\ +\n\ +\n\ +REDUCEMIN_AXIS1(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half)\n\ +REDUCEMIN_AXIS1(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half)\n\ +REDUCEMIN_AXIS1(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMin3_Half)\n\ +REDUCEMIN_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer)\n\ +REDUCEMIN_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer)\n\ +REDUCEMIN_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer)\n\ +REDUCEMIN_AXIS1(I16, I16, vxc_short8, vxc_short8, short4,\\\n\ +vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer)\n\ +REDUCEMIN_AXIS1(I8, I8, vxc_char16, vxc_char16, char4,\\\n\ +vxc_char8, CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer)\n\ +REDUCEMIN_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\\\n\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer)\n\ +\n\ +\n\ +#define REDUCEMIN_AXIS1_SAME_2D(src_name, dst_name, src_type, copy_type, save_type, vert_min_fun) \\\n\ +__kernel void reducemin_axis1_##src_name##to##dst_name##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), 0); \\\n\ + int2 coord_out = (int2)(get_global_id(0), 0); \\\n\ + src_type vec0, min; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + REDUCEMIN_PROCESS_AXIS1(VXC_ReadImage, vert_min_fun) \\\n\ + REDUCEMIN_PROCESS_AXIS1_SAVE_SAME(save_type, VXC_WriteImage); \\\n\ +}\n\ +\n\ +#define REDUCEMIN_AXIS1_2D(src_name, dst_name, src_type, copy_type, dst_type, save_type,\\\n\ +conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_min_fun) \\\n\ +__kernel void reducemin_axis1_##src_name##to##dst_name##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), 0); \\\n\ + int2 coord_out = (int2)(get_global_id(0), 0); \\\n\ + src_type vec0, min; \\\n\ + copy_type in0; \\\n\ + 
vxc_float4 data0; \\\n\ + REDUCEMIN_PROCESS_AXIS1(VXC_ReadImage, vert_min_fun) \\\n\ + REDUCEMIN_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \\\n\ +}\n\ +\n\ +REDUCEMIN_AXIS1_SAME_2D(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMin3_Half)\n\ +\n\ +REDUCEMIN_AXIS1_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half)\n\ +REDUCEMIN_AXIS1_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half)\n\ +REDUCEMIN_AXIS1_2D(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMin3_Half)\n\ +REDUCEMIN_AXIS1_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer)\n\ +REDUCEMIN_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer)\n\ +REDUCEMIN_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer)\n\ +REDUCEMIN_AXIS1_2D(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer)\n\ +REDUCEMIN_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer)\n\ +REDUCEMIN_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\\\n\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer)\n\ +"; /* end of reducemin_internal_axis1_vx*/ + +static const char reducemin_internal_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform float input_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniGetLoData_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniGetHiData_4x4;\n\ +\n\ +#define REDUCEMIN_PROCESS_AXIS2(read_fun, vert_min_fun) \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, min, in0, 16); \\\n\ + coord.z++; \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + vert_min_fun(min, min, min, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + } \\\n\ + while(coord.z < axisSize);\n\ +\n\ +#define REDUCEMIN_PROCESS_AXIS2_SAVE_SAME(save_type, write_fun) \\\n\ + save_type vect; \\\n\ + _viv_asm(COPY, vect, min, 16); \\\n\ + write_fun(output, coord.xy, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define REDUCEMIN_PROCESS_AXIS2_SAVE(dst_type, save_type, conv_mode,\\\n\ +OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, write_fun) \\\n\ + dst_type dst0, dst1; \\\n\ + save_type vect; \\\n\ + VXC_DP4x4(data0, min, min, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \\\n\ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst0, data0); \\\n\ + VXC_DP4x4(data0, min, min, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \\\n\ + data0 = ((data0 - IN_OFFSET) * IN_SCALE) * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst1, data0); \\\n\ + VXC_DP2x8(vect, dst0, dst1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord.xy, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define REDUCEMIN_AXIS2_SAME(src_name, dst_name, src_type, copy_type, save_type, vert_min_fun) \\\n\ +__kernel void reducemin_axis2_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + src_type vec0, min; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + REDUCEMIN_PROCESS_AXIS2(VXC_ReadImage2DArray, vert_min_fun) \\\n\ + REDUCEMIN_PROCESS_AXIS2_SAVE_SAME(save_type, VXC_WriteImage); \\\n\ +}\n\ +\n\ +#define REDUCEMIN_AXIS2(src_name, dst_name, src_type, copy_type, dst_type,\\\n\ +save_type, conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, vert_min_fun) \\\n\ +__kernel void reducemin_axis2_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + src_type vec0, min; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + REDUCEMIN_PROCESS_AXIS2(VXC_ReadImage2DArray, vert_min_fun) \\\n\ + REDUCEMIN_PROCESS_AXIS2_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET, VXC_WriteImage); \\\n\ +}\n\ +\n\ +\n\ +\n\ +REDUCEMIN_AXIS2_SAME(F16, F16, vxc_half8, vxc_short8, vxc_short8, VXC_VertMin3_Half)\n\ +\n\ +\n\ +REDUCEMIN_AXIS2(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half)\n\ +REDUCEMIN_AXIS2(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, 1, 0, VXC_VertMin3_Half)\n\ +REDUCEMIN_AXIS2(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0, VXC_VertMin3_Half)\n\ +REDUCEMIN_AXIS2(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer)\n\ +REDUCEMIN_AXIS2(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, 0, VXC_VertMin3_Integer)\n\ +REDUCEMIN_AXIS2(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer)\n\ +\n\ +REDUCEMIN_AXIS2(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer)\n\ +REDUCEMIN_AXIS2(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, inputScale, 0, VXC_VertMin3_Integer)\n\ +REDUCEMIN_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\\\n\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric, VXC_VertMin3_Integer)\n\ +\n\ +"; /* end of reducemin_internal_axis2_vx*/ + +static const char reduceprod_internal_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform float input_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniGetLoData_4x4;\n\ +_viv_uniform VXC_512Bits uniGetHiData_4x4;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +\n\ +_viv_uniform int inputWidth;\n\ +_viv_uniform VXC_512Bits uniGetEndLoData_2x8;\n\ +_viv_uniform VXC_512Bits 
uniGetEndHiData_2x8;\n\ +\n\ +#define REDUCEPROD_PROCESS_AXIS0(read_fun, IN_SCALE, IN_OFFSET) \\\n\ + while(coord.x < inputWidth) \\\n\ + { \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val, val0, 16); \\\n\ + VXC_DP4x4(tmpProdLo, img_val, img_val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \\\n\ + VXC_DP4x4(tmpProdHi, img_val, img_val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \\\n\ + tmpProdLo = (tmpProdLo - IN_OFFSET) * IN_SCALE; \\\n\ + tmpProdHi = (tmpProdHi - IN_OFFSET) * IN_SCALE; \\\n\ + tmpProd = tmpProdLo * tmpProdHi; \\\n\ + prodValue = prodValue * tmpProd; \\\n\ + coord.x += 8; \\\n\ + } \\\n\ + vxc_ushort8 tmpProdInt0, tmpProdInt1; \\\n\ + vxc_ushort8 tmpOnesInt = {0, 16256, 0, 16256, 0, 16256, 0, 16256}; \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val, val0, 16); \\\n\ + VXC_DP4x4(tmpProdLo, img_val, img_val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \\\n\ + VXC_DP4x4(tmpProdHi, img_val, img_val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \\\n\ + tmpProdLo = (tmpProdLo - IN_OFFSET) * IN_SCALE; \\\n\ + tmpProdHi = (tmpProdHi - IN_OFFSET) * IN_SCALE; \\\n\ + _viv_asm(COPY, tmpProdInt0, tmpProdLo, 16); \\\n\ + _viv_asm(COPY, tmpProdInt1, tmpProdHi, 16); \\\n\ + VXC_DP2x8(tmpProdInt0, tmpProdInt0, tmpOnesInt,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetEndLoData_2x8); \\\n\ + VXC_DP2x8(tmpProdInt1, tmpProdInt1, tmpOnesInt,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetEndHiData_2x8); \\\n\ + _viv_asm(COPY, tmpProdLo, tmpProdInt0, 16); \\\n\ + _viv_asm(COPY, tmpProdHi, tmpProdInt1, 16); \\\n\ + tmpProd = tmpProdLo * tmpProdHi; \\\n\ + prodValue = prodValue * tmpProd; \\\n\ + tmpProd.xy = prodValue.xy * prodValue.zw; \\\n\ + prodValue.x = tmpProd.x * tmpProd.y;\n\ +\n\ +#define REDUCEPROD_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, write_fun) \\\n\ + dst_type vec1; \\\n\ + save_type dst; \\\n\ + prodValue = prodValue * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, vec1, prodValue); \\\n\ + _viv_asm(COPY, dst, vec1, 16); \\\n\ + write_fun(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define REDUCEPROD_AXIS0(src_name, dst_name, src_type, copy_type, dst_type,\\\n\ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET) \\\n\ +__kernel void reduceprod_axis0_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); \\\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_float4 prodValue = {1.0f, 1.0f, 1.0f, 1.0f}; \\\n\ + vxc_float4 tmpProdLo, tmpProdHi, tmpProd; \\\n\ + src_type img_val; \\\n\ + copy_type val0; \\\n\ + REDUCEPROD_PROCESS_AXIS0(VXC_ReadImage2DArray, IN_SCALE, IN_OFFSET); \\\n\ + REDUCEPROD_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, VXC_WriteImage); \\\n\ +}\n\ +\n\ +REDUCEPROD_AXIS0(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8, CONV, 1, 0, 1, 0)\n\ +REDUCEPROD_AXIS0(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8, CONV_SAT_RTE, outputScale, 0, 1, 0)\n\ +REDUCEPROD_AXIS0(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, CONV_SAT_RTE, outputScale, 0, 1, 0)\n\ 
+REDUCEPROD_AXIS0(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ + CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0)\n\ +REDUCEPROD_AXIS0(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8, CONV, 1, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS0(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8, CONV, 1, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS0(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, input_offset_asymmetric)\n\ +\n\ +REDUCEPROD_AXIS0(I16, I16, vxc_short8, vxc_short8, short4,\\\n\ +vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS0(I8, I8, vxc_char16, vxc_char16, char4,\\\n\ +vxc_char8, CONV_SAT_RTE, outputScale, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\\\n\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric)\n\ +\n\ +#define REDUCEPROD_AXIS0_2D(src_name, dst_name, src_type, copy_type,\\\n\ + dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET) \\\n\ +__kernel void reduceprod_axis0_##src_name##to##dst_name##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(0, get_global_id(0)); \\\n\ + int2 coord_out = (int2)(get_global_id(0), 0); \\\n\ + vxc_float4 prodValue = {1.0f, 1.0f, 1.0f, 1.0f}; \\\n\ + vxc_float4 tmpProdLo, tmpProdHi, tmpProd; \\\n\ + src_type img_val; \\\n\ + copy_type val0; \\\n\ + REDUCEPROD_PROCESS_AXIS0(VXC_ReadImage, IN_SCALE, IN_OFFSET); \\\n\ + REDUCEPROD_PROCESS_AXIS0_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, VXC_WriteImage); \\\n\ +}\n\ +\n\ +REDUCEPROD_AXIS0_2D(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8, CONV, 1, 0, 1, 0)\n\ +REDUCEPROD_AXIS0_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8, CONV_SAT_RTE, outputScale, 0, 1, 0)\n\ +REDUCEPROD_AXIS0_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, CONV_SAT_RTE, outputScale, 0, 1, 0)\n\ +REDUCEPROD_AXIS0_2D(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ + CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0)\n\ +REDUCEPROD_AXIS0_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8, CONV, 1, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS0_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8, CONV, 1, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS0_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, \\\n\ +vxc_short8, CONV, 1, 0, inputScale, input_offset_asymmetric)\n\ +REDUCEPROD_AXIS0_2D(I16, I16, vxc_short8, vxc_short8, short4,\\\n\ +vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS0_2D(I8, I8, vxc_char16, vxc_char16, char4,\\\n\ +vxc_char8, CONV_SAT_RTE, outputScale, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\\\n\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric)\n\ +\n\ +\n\ +#define REDUCEPROD_PROCESS_AXIS0_BF16(read_fun) \\\n\ + while(coord.x < inputWidth) \\\n\ + { \\\n\ + read_fun(img_val, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(val0, img_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, tmpProdLo, val0, 16); \\\n\ + VXC_DP2x8(val0, img_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, tmpProdHi, val0, 16); \\\n\ + tmpProd = tmpProdLo * tmpProdHi; \\\n\ + 
prodValue = prodValue * tmpProd; \\\n\ + coord.x += 8; \\\n\ + } \\\n\ + vxc_ushort8 tmpProdInt0, tmpProdInt1; \\\n\ + vxc_ushort8 tmpOnesInt = {0, 16256, 0, 16256, 0, 16256, 0, 16256}; \\\n\ + read_fun(img_val, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(val0, img_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, tmpProdLo, val0, 16); \\\n\ + VXC_DP2x8(val0, img_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, tmpProdHi, val0, 16); \\\n\ + _viv_asm(COPY, tmpProdInt0, tmpProdLo, 16); \\\n\ + _viv_asm(COPY, tmpProdInt1, tmpProdHi, 16); \\\n\ + VXC_DP2x8(tmpProdInt0, tmpProdInt0, tmpOnesInt,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetEndLoData_2x8); \\\n\ + VXC_DP2x8(tmpProdInt1, tmpProdInt1, tmpOnesInt,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetEndHiData_2x8); \\\n\ + _viv_asm(COPY, tmpProdLo, tmpProdInt0, 16); \\\n\ + _viv_asm(COPY, tmpProdHi, tmpProdInt1, 16); \\\n\ + tmpProd = tmpProdLo * tmpProdHi; \\\n\ + prodValue = prodValue * tmpProd; \\\n\ + tmpProd.xy = prodValue.xy * prodValue.zw; \\\n\ + prodValue.x = tmpProd.x * tmpProd.y;\n\ +\n\ +#define REDUCEPROD_PROCESS_AXIS0_BF16_SAVE(write_fun) \\\n\ + vxc_ushort8 dst; \\\n\ + _viv_asm(COPY, dst, prodValue, 16); \\\n\ + dst.s0 = dst.s1; \\\n\ + write_fun(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void reduceprod_axis0_BF16toBF16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axisVal\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ + vxc_float4 prodValue = {1.0f, 1.0f, 1.0f, 1.0f};\n\ + vxc_float4 tmpProdLo, tmpProdHi, tmpProd;\n\ + vxc_short8 img_val;\n\ + vxc_short8 val0;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + REDUCEPROD_PROCESS_AXIS0_BF16(VXC_ReadImage2DArray);\n\ + REDUCEPROD_PROCESS_AXIS0_BF16_SAVE(VXC_WriteImage);\n\ +}\n\ +\n\ +__kernel void reduceprod_axis0_BF16toBF16_2D\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axisVal\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ + int2 coord_out = (int2)(get_global_id(0), 0);\n\ + vxc_float4 prodValue = {1.0f, 1.0f, 1.0f, 1.0f};\n\ + vxc_float4 tmpProdLo, tmpProdHi, tmpProd;\n\ + vxc_short8 img_val;\n\ + vxc_short8 val0;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + REDUCEPROD_PROCESS_AXIS0_BF16(VXC_ReadImage);\n\ + REDUCEPROD_PROCESS_AXIS0_BF16_SAVE(VXC_WriteImage);\n\ +}\n\ +\n\ +"; /* end of reduceprod_internal_axis0_vx*/ + +static const char reduceprod_internal_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform float input_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniGetLoData_4x4;\n\ +_viv_uniform VXC_512Bits uniGetHiData_4x4;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +#define REDUCEPROD_PROCESS_AXIS1(read_fun, IN_SCALE, IN_OFFSET) \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_DP4x4(tmpProdLo, vec0, vec0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \\\n\ + VXC_DP4x4(tmpProdHi, vec0, vec0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \\\n\ + tmpProdLo = (tmpProdLo - IN_OFFSET) * IN_SCALE; \\\n\ + tmpProdHi = (tmpProdHi - IN_OFFSET) * IN_SCALE; \\\n\ + prodValueLo = prodValueLo * tmpProdLo; \\\n\ + prodValueHi = prodValueHi * tmpProdHi; \\\n\ + coord.y++; \\\n\ + } \\\n\ + while(coord.y < axisSize);\n\ +\n\ +#define REDUCEPROD_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, write_fun) \\\n\ + dst_type dst0, dst1; \\\n\ + save_type vect; \\\n\ + prodValueLo = prodValueLo * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst0, prodValueLo); \\\n\ + prodValueHi = prodValueHi * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst1, prodValueHi); \\\n\ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord_out, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +#define REDUCEPROD_AXIS1(src_name, dst_name, src_type, copy_type, dst_type, save_type,\\\n\ +conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET) \\\n\ +__kernel void reduceprod_axis1_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_float4 prodValueLo = {1.0f, 1.0f, 1.0f, 1.0f}; \\\n\ + vxc_float4 prodValueHi = {1.0f, 1.0f, 1.0f, 1.0f};\\\n\ + vxc_float4 tmpProdLo, tmpProdHi; \\\n\ + src_type vec0; \\\n\ + copy_type in0; \\\n\ + REDUCEPROD_PROCESS_AXIS1(VXC_ReadImage2DArray, IN_SCALE, IN_OFFSET) \\\n\ + REDUCEPROD_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, VXC_WriteImage); \\\n\ +}\n\ +\n\ +\n\ +\n\ +REDUCEPROD_AXIS1(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8, CONV, 1, 0, 1, 0)\n\ +REDUCEPROD_AXIS1(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8, CONV_SAT_RTE, outputScale, 0, 1, 0)\n\ +REDUCEPROD_AXIS1(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, CONV_SAT_RTE, outputScale, 0, 1, 0)\n\ +REDUCEPROD_AXIS1(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0)\n\ +REDUCEPROD_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8, CONV, 1, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8, CONV, 1, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, \\\n\ +vxc_short8, CONV, 1, 0, inputScale, input_offset_asymmetric)\n\ +REDUCEPROD_AXIS1(I16, I16, vxc_short8, vxc_short8, short4,\\\n\ +vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS1(I8, I8, vxc_char16, vxc_char16, char4,\\\n\ +vxc_char8, CONV_SAT_RTE, outputScale, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\\\n\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric)\n\ +\n\ +\n\ +#define REDUCEPROD_AXIS1_2D(src_name, dst_name, src_type, copy_type, dst_type, save_type,\\\n\ +conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET) \\\n\ +__kernel void reduceprod_axis1_##src_name##to##dst_name##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only 
image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), 0); \\\n\ + int2 coord_out = (int2)(get_global_id(0), 0); \\\n\ + vxc_float4 prodValueLo = {1.0f, 1.0f, 1.0f, 1.0f}; \\\n\ + vxc_float4 prodValueHi = {1.0f, 1.0f, 1.0f, 1.0f};\\\n\ + vxc_float4 tmpProdLo, tmpProdHi; \\\n\ + src_type vec0; \\\n\ + copy_type in0; \\\n\ + REDUCEPROD_PROCESS_AXIS1(VXC_ReadImage, IN_SCALE, IN_OFFSET) \\\n\ + REDUCEPROD_PROCESS_AXIS1_SAVE(dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, VXC_WriteImage); \\\n\ +}\n\ +\n\ +REDUCEPROD_AXIS1_2D(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8, CONV, 1, 0, 1, 0)\n\ +REDUCEPROD_AXIS1_2D(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8, CONV_SAT_RTE, outputScale, 0, 1, 0)\n\ +REDUCEPROD_AXIS1_2D(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, CONV_SAT_RTE, outputScale, 0, 1, 0)\n\ +REDUCEPROD_AXIS1_2D(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0)\n\ +REDUCEPROD_AXIS1_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8, CONV, 1, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8, CONV, 1, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, input_offset_asymmetric)\n\ +REDUCEPROD_AXIS1_2D(I16, I16, vxc_short8, vxc_short8, short4,\\\n\ +vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, char4,\\\n\ +vxc_char8, CONV_SAT_RTE, outputScale, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\\\n\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric)\n\ +\n\ +#define REDUCEPROD_PROCESS_AXIS1_BF16(read_fun) \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(vec0, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, tmpProdLo, vec0, 16); \\\n\ + VXC_DP2x8(vec0, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, tmpProdHi, vec0, 16); \\\n\ + prodValueLo = prodValueLo * tmpProdLo; \\\n\ + prodValueHi = prodValueHi * tmpProdHi; \\\n\ + coord.y++; \\\n\ + } \\\n\ + while(coord.y < axisSize);\n\ +\n\ +\n\ +#define REDUCEPROD_PROCESS_AXIS1_SAVE_BF16(write_fun) \\\n\ + vxc_ushort8 dst0, dst1; \\\n\ + vxc_ushort8 vect; \\\n\ + _viv_asm(COPY, dst0, prodValueLo, 16); \\\n\ + _viv_asm(COPY, dst1, prodValueHi, 16); \\\n\ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + write_fun(output, coord_out, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void reduceprod_axis1_BF16toBF16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axisVal\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ + vxc_float4 prodValueLo = {1.0f, 1.0f, 1.0f, 1.0f};\n\ + vxc_float4 prodValueHi = {1.0f, 1.0f, 1.0f, 1.0f};\n\ + vxc_float4 tmpProdLo, tmpProdHi;\n\ + vxc_short8 vec0;\n\ + vxc_short8 in0;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + REDUCEPROD_PROCESS_AXIS1_BF16(VXC_ReadImage2DArray)\n\ + REDUCEPROD_PROCESS_AXIS1_SAVE_BF16(VXC_WriteImage);\n\ +}\n\ +\n\ +__kernel void 
reduceprod_axis1_BF16toBF16_2D\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axisVal\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int2 coord_out = (int2)(get_global_id(0), 0);\n\ + vxc_float4 prodValueLo = {1.0f, 1.0f, 1.0f, 1.0f};\n\ + vxc_float4 prodValueHi = {1.0f, 1.0f, 1.0f, 1.0f};\n\ + vxc_float4 tmpProdLo, tmpProdHi;\n\ + vxc_short8 vec0;\n\ + vxc_short8 in0;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + REDUCEPROD_PROCESS_AXIS1_BF16(VXC_ReadImage)\n\ + REDUCEPROD_PROCESS_AXIS1_SAVE_BF16(VXC_WriteImage);\n\ +}\n\ +"; /* end of reduceprod_internal_axis1_vx*/ + +static const char reduceprod_internal_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform float input_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniGetLoData_4x4;\n\ +_viv_uniform VXC_512Bits uniGetHiData_4x4;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +#define REDUCEPROD_PROCESS_AXIS2(read_fun, IN_SCALE, IN_OFFSET) \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_DP4x4(tmpProdLo, vec0, vec0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetLoData_4x4); \\\n\ + VXC_DP4x4(tmpProdHi, vec0, vec0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetHiData_4x4); \\\n\ + tmpProdLo = (tmpProdLo - IN_OFFSET) * IN_SCALE; \\\n\ + tmpProdHi = (tmpProdHi - IN_OFFSET) * IN_SCALE; \\\n\ + prodValueLo = prodValueLo * tmpProdLo; \\\n\ + prodValueHi = prodValueHi * tmpProdHi; \\\n\ + coord.z++; \\\n\ + } \\\n\ + while(coord.z < axisSize);\n\ +\n\ +\n\ +#define REDUCEPROD_PROCESS_AXIS2_SAVE(dst_type, save_type, conv_mode, OUT_SCALE, OUT_OFFSET, write_fun) \\\n\ + dst_type dst0, dst1; \\\n\ + save_type vect; \\\n\ + prodValueLo = prodValueLo * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst0, prodValueLo); \\\n\ + prodValueHi = prodValueHi * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst1, prodValueHi); \\\n\ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord.xy, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +#define REDUCEPROD_AXIS2(src_name, dst_name, src_type, copy_type, dst_type, save_type,\\\n\ +conv_mode, OUT_SCALE, OUT_OFFSET, IN_SCALE, IN_OFFSET) \\\n\ +__kernel void reduceprod_axis2_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + vxc_float4 prodValueLo = {1.0f, 1.0f, 1.0f, 1.0f}; \\\n\ + vxc_float4 prodValueHi = {1.0f, 1.0f, 1.0f, 1.0f};\\\n\ + vxc_float4 tmpProdLo, tmpProdHi; \\\n\ + src_type vec0; \\\n\ + copy_type in0; \\\n\ + REDUCEPROD_PROCESS_AXIS2(VXC_ReadImage2DArray, IN_SCALE, IN_OFFSET) \\\n\ + REDUCEPROD_PROCESS_AXIS2_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, VXC_WriteImage); \\\n\ +}\n\ +\n\ +\n\ +\n\ +REDUCEPROD_AXIS2(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8, CONV, 1, 0, 1, 0)\n\ +REDUCEPROD_AXIS2(F16, I16, vxc_half8, vxc_short8, 
short4, vxc_short8, CONV_SAT_RTE, outputScale, 0, 1, 0)\n\ +REDUCEPROD_AXIS2(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8, CONV_SAT_RTE, outputScale, 0, 1, 0)\n\ +REDUCEPROD_AXIS2(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, 1, 0)\n\ +REDUCEPROD_AXIS2(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8, CONV, 1, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS2(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8, CONV, 1, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS2(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\\\n\ +CONV, 1, 0, inputScale, input_offset_asymmetric)\n\ +\n\ +REDUCEPROD_AXIS2(I16, I16, vxc_short8, vxc_short8, short4,\\\n\ +vxc_short8, CONV_SAT_RTE, outputScale, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS2(I8, I8, vxc_char16, vxc_char16, char4,\\\n\ +vxc_char8, CONV_SAT_RTE, outputScale, 0, inputScale, 0)\n\ +REDUCEPROD_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8, CONV_SAT_RTE,\\\n\ +outputScale, output_offset_asymmetric, inputScale, input_offset_asymmetric)\n\ +\n\ +#define REDUCEPROD_PROCESS_AXIS2_BF16(read_fun) \\\n\ + do \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(vec0, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, tmpProdLo, vec0, 16); \\\n\ + VXC_DP2x8(vec0, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, tmpProdHi, vec0, 16); \\\n\ + prodValueLo = prodValueLo * tmpProdLo; \\\n\ + prodValueHi = prodValueHi * tmpProdHi; \\\n\ + coord.z++; \\\n\ + } \\\n\ + while(coord.z < axisSize);\n\ +\n\ +\n\ +#define REDUCEPROD_PROCESS_AXIS2_SAVE_BF16(write_fun) \\\n\ + vxc_ushort8 dst0, dst1; \\\n\ + vxc_ushort8 vect; \\\n\ + _viv_asm(COPY, dst0, prodValueLo, 16); \\\n\ + _viv_asm(COPY, dst1, prodValueHi, 16); \\\n\ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + write_fun(output, coord.xy, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void reduceprod_axis2_BF16toBF16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axisVal\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + vxc_float4 prodValueLo = {1.0f, 1.0f, 1.0f, 1.0f};\n\ + vxc_float4 prodValueHi = {1.0f, 1.0f, 1.0f, 1.0f};\n\ + vxc_float4 tmpProdLo, tmpProdHi;\n\ + vxc_short8 vec0;\n\ + vxc_short8 in0;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + REDUCEPROD_PROCESS_AXIS2_BF16(VXC_ReadImage2DArray)\n\ + REDUCEPROD_PROCESS_AXIS2_SAVE_BF16(VXC_WriteImage);\n\ +}\n\ +"; /* end of reduceprod_internal_axis2_vx*/ + +static const char relational_ops_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float input0Scale;\n\ +_viv_uniform float input0Tail;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform float input1Tail;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ +\n\ +#define COMPARISONS_2D(func_name, src0_type_name, src1_type_name, \\\n\ + src0_type, src0_copy_type, src1_type, src1_copy_type, cmp_op) \\\n\ +__kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_2D( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = 
(int2)(get_global_id(0), get_global_id(1)); \\\n\ + src0_type src0; \\\n\ + src0_copy_type srcA; \\\n\ + src0_type src1; \\\n\ + src0_copy_type srcB; \\\n\ + VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, srcA, src0, 16); \\\n\ + VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, srcB, src1, 16); \\\n\ + \\\n\ + float4 vecA0, vecA1; \\\n\ + float4 vecB0, vecB1; \\\n\ + VXC_DP4x4(vecA0, srcA, srcA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\ + VXC_DP4x4(vecA1, srcA, srcA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \\\n\ + vecA0 = vecA0 * input0Scale + input0Tail; \\\n\ + vecA1 = vecA1 * input0Scale + input0Tail; \\\n\ + VXC_DP4x4(vecB0, srcB, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\ + VXC_DP4x4(vecB1, srcB, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \\\n\ + vecB0 = vecB0 * input1Scale + input1Tail; \\\n\ + vecB1 = vecB1 * input1Scale + input1Tail; \\\n\ + int4 dst0, dst1; \\\n\ + dst0 = (vecA0)cmp_op(vecB0); \\\n\ + dst1 = (vecA1)cmp_op(vecB1); \\\n\ + \\\n\ + vxc_char16 dst; \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + dst &= 1; \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +//LESS\n\ +COMPARISONS_2D(less, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, <)\n\ +COMPARISONS_2D(less, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, <)\n\ +COMPARISONS_2D(less, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, <)\n\ +COMPARISONS_2D(less, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, <)\n\ +COMPARISONS_2D(less, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, <)\n\ +COMPARISONS_2D(less, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, <)\n\ +COMPARISONS_2D(less, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, <)\n\ +COMPARISONS_2D(less, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, <)\n\ +COMPARISONS_2D(less, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, <)\n\ +COMPARISONS_2D(less, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, <)\n\ +//GREAT\n\ +COMPARISONS_2D(great, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, >)\n\ +COMPARISONS_2D(great, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, >)\n\ +COMPARISONS_2D(great, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, >)\n\ +COMPARISONS_2D(great, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, >)\n\ +COMPARISONS_2D(great, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, >)\n\ +COMPARISONS_2D(great, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, >)\n\ +COMPARISONS_2D(great, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, >)\n\ +COMPARISONS_2D(great, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, >)\n\ +COMPARISONS_2D(great, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, >)\n\ +COMPARISONS_2D(great, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, >)\n\ +//LESS_EQUAL\n\ +COMPARISONS_2D(less_equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, <=)\n\ +COMPARISONS_2D(less_equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, <=)\n\ +COMPARISONS_2D(less_equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, <=)\n\ +COMPARISONS_2D(less_equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, <=)\n\ 
+COMPARISONS_2D(less_equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, <=)\n\ +COMPARISONS_2D(less_equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, <=)\n\ +COMPARISONS_2D(less_equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, <=)\n\ +COMPARISONS_2D(less_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, <=)\n\ +COMPARISONS_2D(less_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, <=)\n\ +COMPARISONS_2D(less_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, <=)\n\ +//GREAT_EQUAL\n\ +COMPARISONS_2D(great_equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, >=)\n\ +COMPARISONS_2D(great_equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, >=)\n\ +COMPARISONS_2D(great_equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, >=)\n\ +COMPARISONS_2D(great_equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, >=)\n\ +COMPARISONS_2D(great_equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, >=)\n\ +COMPARISONS_2D(great_equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, >=)\n\ +COMPARISONS_2D(great_equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, >=)\n\ +COMPARISONS_2D(great_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, >=)\n\ +COMPARISONS_2D(great_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, >=)\n\ +COMPARISONS_2D(great_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, >=)\n\ +//EQUAL\n\ +COMPARISONS_2D(equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, ==)\n\ +COMPARISONS_2D(equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, ==)\n\ +COMPARISONS_2D(equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, ==)\n\ +COMPARISONS_2D(equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, ==)\n\ +COMPARISONS_2D(equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ==)\n\ +COMPARISONS_2D(equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, ==)\n\ +COMPARISONS_2D(equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, ==)\n\ +COMPARISONS_2D(equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, ==)\n\ +COMPARISONS_2D(equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ==)\n\ +COMPARISONS_2D(equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, ==)\n\ +//NOT_EQUAL\n\ +COMPARISONS_2D(not_equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, !=)\n\ +COMPARISONS_2D(not_equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, !=)\n\ +COMPARISONS_2D(not_equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, !=)\n\ +COMPARISONS_2D(not_equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, !=)\n\ +COMPARISONS_2D(not_equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, !=)\n\ +COMPARISONS_2D(not_equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, !=)\n\ +COMPARISONS_2D(not_equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, !=)\n\ +COMPARISONS_2D(not_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, !=)\n\ +COMPARISONS_2D(not_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, !=)\n\ +COMPARISONS_2D(not_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, !=)\n\ +\n\ +"; /* end of relational_ops_2d_vx*/ + +static const char relational_ops_3d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float input0Scale;\n\ +_viv_uniform float input0Tail;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform float input1Tail;\n\ +_viv_uniform VXC_512Bits 
uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ +\n\ +#define COMPARISONS_3D(func_name, src0_type_name, src1_type_name, \\\n\ + src0_type, src0_copy_type, src1_type, src1_copy_type, cmp_op) \\\n\ +__kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + src0_type src0; \\\n\ + src0_copy_type srcA; \\\n\ + src0_type src1; \\\n\ + src0_copy_type srcB; \\\n\ + VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, srcA, src0, 16); \\\n\ + VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, srcB, src1, 16); \\\n\ + \\\n\ + float4 vecA0, vecA1, vecA; \\\n\ + float4 vecB0, vecB1, vecB; \\\n\ + VXC_DP4x4(vecA0, srcA, srcA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\ + VXC_DP4x4(vecA1, srcA, srcA, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \\\n\ + vecA0 = vecA0 * input0Scale + input0Tail; \\\n\ + vecA1 = vecA1 * input0Scale + input0Tail; \\\n\ + VXC_DP4x4(vecB0, srcB, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\ + VXC_DP4x4(vecB1, srcB, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \\\n\ + vecB0 = vecB0 * input1Scale + input1Tail; \\\n\ + vecB1 = vecB1 * input1Scale + input1Tail; \\\n\ + int4 dst0, dst1; \\\n\ + dst0 = (vecA0)cmp_op(vecB0); \\\n\ + dst1 = (vecA1)cmp_op(vecB1); \\\n\ + \\\n\ + vxc_char16 dst; \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + dst &= 1; \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +//LESS\n\ +COMPARISONS_3D(less, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, <)\n\ +COMPARISONS_3D(less, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, <)\n\ +COMPARISONS_3D(less, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, <)\n\ +COMPARISONS_3D(less, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, <)\n\ +COMPARISONS_3D(less, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, <)\n\ +COMPARISONS_3D(less, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, <)\n\ +COMPARISONS_3D(less, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, <)\n\ +COMPARISONS_3D(less, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, <)\n\ +COMPARISONS_3D(less, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, <)\n\ +COMPARISONS_3D(less, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, <)\n\ +//GREAT\n\ +COMPARISONS_3D(great, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, >)\n\ +COMPARISONS_3D(great, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, >)\n\ +COMPARISONS_3D(great, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, >)\n\ +COMPARISONS_3D(great, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, >)\n\ +COMPARISONS_3D(great, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, >)\n\ +COMPARISONS_3D(great, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, >)\n\ +COMPARISONS_3D(great, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, >)\n\ +COMPARISONS_3D(great, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, 
vxc_half8, >)\n\ +COMPARISONS_3D(great, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, >)\n\ +COMPARISONS_3D(great, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, >)\n\ +//LESS_EQUAL\n\ +COMPARISONS_3D(less_equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, <=)\n\ +COMPARISONS_3D(less_equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, <=)\n\ +COMPARISONS_3D(less_equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, <=)\n\ +COMPARISONS_3D(less_equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, <=)\n\ +COMPARISONS_3D(less_equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, <=)\n\ +COMPARISONS_3D(less_equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, <=)\n\ +COMPARISONS_3D(less_equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, <=)\n\ +COMPARISONS_3D(less_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, <=)\n\ +COMPARISONS_3D(less_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, <=)\n\ +COMPARISONS_3D(less_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, <=)\n\ +//GREAT_EQUAL\n\ +COMPARISONS_3D(great_equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, >=)\n\ +COMPARISONS_3D(great_equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, >=)\n\ +COMPARISONS_3D(great_equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, >=)\n\ +COMPARISONS_3D(great_equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, >=)\n\ +COMPARISONS_3D(great_equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, >=)\n\ +COMPARISONS_3D(great_equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, >=)\n\ +COMPARISONS_3D(great_equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, >=)\n\ +COMPARISONS_3D(great_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, >=)\n\ +COMPARISONS_3D(great_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, >=)\n\ +COMPARISONS_3D(great_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, >=)\n\ +//EQUAL\n\ +COMPARISONS_3D(equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, ==)\n\ +COMPARISONS_3D(equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, ==)\n\ +COMPARISONS_3D(equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, ==)\n\ +COMPARISONS_3D(equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, ==)\n\ +COMPARISONS_3D(equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ==)\n\ +COMPARISONS_3D(equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, ==)\n\ +COMPARISONS_3D(equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, ==)\n\ +COMPARISONS_3D(equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, ==)\n\ +COMPARISONS_3D(equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ==)\n\ +COMPARISONS_3D(equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, ==)\n\ +//NOT_EQUAL\n\ +COMPARISONS_3D(not_equal, F16, F16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, !=)\n\ +COMPARISONS_3D(not_equal, F16, I8, vxc_short8, vxc_half8, vxc_char8, vxc_char8, !=)\n\ +COMPARISONS_3D(not_equal, F16, U8, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8, !=)\n\ +COMPARISONS_3D(not_equal, F16, I16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, !=)\n\ +COMPARISONS_3D(not_equal, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, !=)\n\ +COMPARISONS_3D(not_equal, I8, F16, vxc_char8, vxc_char8, vxc_short8, vxc_half8, !=)\n\ +COMPARISONS_3D(not_equal, U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, !=)\n\ 
+COMPARISONS_3D(not_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half8, !=)\n\ +COMPARISONS_3D(not_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, !=)\n\ +COMPARISONS_3D(not_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, !=)\n\ +\n\ +"; /* end of relational_ops_3d_vx*/ + +static const char relu_keras_vx[] = "\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvFP16toFP32_Lo_4x4;\n\ +_viv_uniform VXC_512Bits uniConvFP16toFP32_Hi_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractInteger_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniPackedBF16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvIntegertoFP32_Lo_4x4;\n\ +_viv_uniform VXC_512Bits uniConvIntegertoFP32_Hi_4x4;\n\ +_viv_uniform float offset;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float inputTail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float outputZP;\n\ +\n\ +float4 I8toF32_Lo(vxc_char8 src)\n\ +{\n\ + float4 dst;\n\ +\n\ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvIntegertoFP32_Lo_4x4);\n\ + dst *= input_scale;\n\ + return dst;\n\ +}\n\ +\n\ +float4 I8toF32_Hi(vxc_char8 src)\n\ +{\n\ + float4 dst;\n\ +\n\ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvIntegertoFP32_Hi_4x4);\n\ + dst *= input_scale;\n\ + return dst;\n\ +}\n\ +\n\ +float4 U8toF32_Lo(vxc_uchar8 src)\n\ +{\n\ + float4 dst;\n\ +\n\ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvIntegertoFP32_Lo_4x4);\n\ + dst = dst * input_scale + inputTail;\n\ + return dst;\n\ +}\n\ +\n\ +float4 U8toF32_Hi(vxc_uchar8 src)\n\ +{\n\ + float4 dst;\n\ +\n\ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvIntegertoFP32_Hi_4x4);\n\ + dst = dst * input_scale + inputTail;\n\ + return dst;\n\ +}\n\ +\n\ +float4 I16toF32_Lo(vxc_short8 src)\n\ +{\n\ + float4 dst;\n\ +\n\ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvIntegertoFP32_Lo_4x4);\n\ + dst *= input_scale;\n\ + return dst;\n\ +}\n\ +\n\ +float4 I16toF32_Hi(vxc_short8 src)\n\ +{\n\ + float4 dst;\n\ +\n\ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvIntegertoFP32_Hi_4x4);\n\ + dst *= input_scale;\n\ + return dst;\n\ +}\n\ +\n\ +float4 F16toF32_Lo(vxc_ushort8 src)\n\ +{\n\ + vxc_half8 srcHalf;\n\ + float4 dst;\n\ +\n\ + _viv_asm(COPY, srcHalf, src, 16);\n\ + VXC_DP4x4(dst, srcHalf, srcHalf, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvFP16toFP32_Lo_4x4);\n\ + return dst;\n\ +}\n\ +\n\ +float4 F16toF32_Hi(vxc_ushort8 src)\n\ +{\n\ + vxc_half8 srcHalf;\n\ + float4 dst;\n\ +\n\ + _viv_asm(COPY, srcHalf, src, 16);\n\ + VXC_DP4x4(dst, srcHalf, srcHalf, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvFP16toFP32_Hi_4x4);\n\ + return dst;\n\ +}\n\ +\n\ +float4 BF16toF32_Lo(vxc_ushort8 src)\n\ +{\n\ + vxc_ushort8 srcA;\n\ + float4 dst;\n\ +\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + VXC_DP2x8(srcA, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, dst, srcA, 16);\n\ +\n\ + return dst;\n\ +}\n\ +\n\ +float4 BF16toF32_Hi(vxc_ushort8 src)\n\ +{\n\ + vxc_ushort8 srcA;\n\ + float4 dst;\n\ +\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + VXC_DP2x8(srcA, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), 
uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, dst, srcA, 16);\n\ +\n\ + return dst;\n\ +}\n\ +\n\ +vxc_ushort8 F32toF16(float4 src0, float4 src1)\n\ +{\n\ + half4 srcHalf0, srcHalf1;\n\ + vxc_half8 dst0;\n\ + vxc_ushort8 dst;\n\ +\n\ + _viv_asm(CONV, srcHalf0, src0);\n\ + _viv_asm(CONV, srcHalf1, src1);\n\ +\n\ + VXC_DP2x8(dst0, srcHalf0, srcHalf1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);\n\ + _viv_asm(COPY, dst, dst0, 16);\n\ + return dst;\n\ +}\n\ +\n\ +vxc_ushort8 F32toBF16(float4 src0, float4 src1)\n\ +{\n\ + vxc_ushort8 srcA, srcB;\n\ + vxc_ushort8 dst;\n\ +\n\ + _viv_asm(COPY, srcA, src0, 16);\n\ + _viv_asm(COPY, srcB, src1, 16);\n\ + VXC_DP2x8(dst, srcA, srcB, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniPackedBF16_2x8);\n\ + return dst;\n\ +}\n\ +\n\ +vxc_char8 F32toI8(float4 src0, float4 src1)\n\ +{\n\ + int4 srcInt0, srcInt1;\n\ + vxc_char8 dst;\n\ +\n\ + src0 *= output_scale;\n\ + src1 *= output_scale;\n\ + _viv_asm(CONV_RTE, srcInt0, src0);\n\ + _viv_asm(CONV_RTE, srcInt1, src1);\n\ +\n\ + VXC_DP2x8(dst, srcInt0, srcInt1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractInteger_2x8);\n\ + return dst;\n\ +}\n\ +\n\ +vxc_short8 F32toI16(float4 src0, float4 src1)\n\ +{\n\ + int4 srcInt0, srcInt1;\n\ + vxc_short8 dst;\n\ +\n\ + src0 *= output_scale;\n\ + src1 *= output_scale;\n\ + _viv_asm(CONV_RTE, srcInt0, src0);\n\ + _viv_asm(CONV_RTE, srcInt1, src1);\n\ +\n\ + VXC_DP2x8(dst, srcInt0, srcInt1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractInteger_2x8);\n\ + return dst;\n\ +}\n\ +\n\ +vxc_uchar8 F32toU8(float4 src0, float4 src1)\n\ +{\n\ + int4 srcInt0, srcInt1;\n\ + vxc_uchar8 dst;\n\ +\n\ + src0 = src0 * output_scale + outputZP;\n\ + src1 = src1 * output_scale + outputZP;\n\ + _viv_asm(CONV_RTE, srcInt0, src0);\n\ + _viv_asm(CONV_RTE, srcInt1, src1);\n\ +\n\ + VXC_DP2x8(dst, srcInt0, srcInt1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractInteger_2x8);\n\ + return dst;\n\ +}\n\ +\n\ +\n\ +#define TENSOR_KERAS_RELU(src_type_name, dst_type_name, tensor_dims, image_type, \\\n\ + convert2FP32_Func, convert2DstType_Func, src_type, dst_type) \\\n\ +__kernel void relu_keras_##src_type_name##to##dst_type_name##tensor_dims( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + float alpha, \\\n\ + float max_value, \\\n\ + float threshold \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + src_type src; \\\n\ + VXC_Read##image_type(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + float4 dataA = convert2FP32_Func##_Lo(src); \\\n\ + float4 dataB = convert2FP32_Func##_Hi(src); \\\n\ + float4 dataA0 = dataA < threshold ? threshold : dataA; \\\n\ + dataA0 = dataA0 > max_value ? max_value : dataA0; \\\n\ + float4 dataB0 = dataB < threshold ? threshold : dataB; \\\n\ + dataB0 = dataB0 > max_value ? max_value : dataB0; \\\n\ + float4 dataA1 = dataA * alpha - offset; \\\n\ + float4 dataB1 = dataB * alpha - offset; \\\n\ + float4 dst0 = dataA < threshold ? dataA1 : dataA0; \\\n\ + float4 dst1 = dataB < threshold ? 
dataB1 : dataB0; \\\n\ + dst_type result = convert2DstType_Func(dst0, dst1); \\\n\ + VXC_Write##image_type(output, coord, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +TENSOR_KERAS_RELU(F16, F16, _3D, Image2DArray, F16toF32, F32toF16, vxc_ushort8, vxc_ushort8)\n\ +TENSOR_KERAS_RELU(F16, I16, _3D, Image2DArray, F16toF32, F32toI16, vxc_ushort8, vxc_short8)\n\ +TENSOR_KERAS_RELU(F16, I8, _3D, Image2DArray, F16toF32, F32toI8, vxc_ushort8, vxc_char8)\n\ +TENSOR_KERAS_RELU(F16, U8, _3D, Image2DArray, F16toF32, F32toU8, vxc_ushort8, vxc_uchar8)\n\ +TENSOR_KERAS_RELU(BF16, BF16, _3D, Image2DArray, BF16toF32, F32toBF16, vxc_ushort8, vxc_ushort8)\n\ +\n\ +TENSOR_KERAS_RELU(I16, I16, _3D, Image2DArray, I16toF32, F32toI16, vxc_short8, vxc_short8)\n\ +TENSOR_KERAS_RELU(I16, F16, _3D, Image2DArray, I16toF32, F32toF16, vxc_short8, vxc_ushort8)\n\ +TENSOR_KERAS_RELU(I8, I8, _3D, Image2DArray, I8toF32, F32toI8, vxc_char8, vxc_char8)\n\ +TENSOR_KERAS_RELU(I8, F16, _3D, Image2DArray, I8toF32, F32toF16, vxc_char8, vxc_ushort8)\n\ +TENSOR_KERAS_RELU(U8, U8, _3D, Image2DArray, U8toF32, F32toU8, vxc_uchar8, vxc_uchar8)\n\ +TENSOR_KERAS_RELU(U8, F16, _3D, Image2DArray, U8toF32, F32toF16, vxc_uchar8, vxc_ushort8)\n\ +\n\ +TENSOR_KERAS_RELU(F16, F16, _2D, Image, F16toF32, F32toF16, vxc_ushort8, vxc_ushort8)\n\ +TENSOR_KERAS_RELU(F16, I16, _2D, Image, F16toF32, F32toI16, vxc_ushort8, vxc_short8)\n\ +TENSOR_KERAS_RELU(F16, I8, _2D, Image, F16toF32, F32toI8, vxc_ushort8, vxc_char8)\n\ +TENSOR_KERAS_RELU(F16, U8, _2D, Image, F16toF32, F32toU8, vxc_ushort8, vxc_uchar8)\n\ +TENSOR_KERAS_RELU(BF16, BF16, _2D, Image, BF16toF32, F32toBF16, vxc_ushort8, vxc_ushort8)\n\ +TENSOR_KERAS_RELU(I16, I16, _2D, Image, I16toF32, F32toI16, vxc_short8, vxc_short8)\n\ +TENSOR_KERAS_RELU(I16, F16, _2D, Image, I16toF32, F32toF16, vxc_short8, vxc_ushort8)\n\ +TENSOR_KERAS_RELU(I8, I8, _2D, Image, I8toF32, F32toI8, vxc_char8, vxc_char8)\n\ +TENSOR_KERAS_RELU(I8, F16, _2D, Image, I8toF32, F32toF16, vxc_char8, vxc_ushort8)\n\ +TENSOR_KERAS_RELU(U8, U8, _2D, Image, U8toF32, F32toU8, vxc_uchar8, vxc_uchar8)\n\ +TENSOR_KERAS_RELU(U8, F16, _2D, Image, U8toF32, F32toF16, vxc_uchar8, vxc_ushort8)\n\ +"; /* end of relu_keras_vx*/ + +static const char resize_bilinear_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 scale_xy;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_odd_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_even_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform float half_pixel_value;\n\ +\n\ +__kernel void resize_bilinear_BF16toBF16_DOWN\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + int 
bottom_y_idx = top_y_idx + 1;\n\ + vxc_short8 top;\n\ + vxc_short8 bottom;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(top, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_ReadImage2DArray(top, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_ReadImage2DArray(top, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_ReadImage2DArray(top, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.y = bottom_y_idx;\n\ + coord_in.x = left_x_idx.x;\n\ + VXC_ReadImage2DArray(bottom, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_ReadImage2DArray(bottom, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_ReadImage2DArray(bottom, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_ReadImage2DArray(bottom, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 src;\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ + float4 dst4;\n\ +\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8);\n\ + _viv_asm(COPY, right4, src, 16);\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8);\n\ + _viv_asm(COPY, left4, src, 16);\n\ + right4 -= left4;\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8);\n\ + _viv_asm(COPY, right4, src, 16);\n\ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8);\n\ + _viv_asm(COPY, left4, src, 16);\n\ + right4 -= left4;\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + dst4 = bottom4 * y_lerp + top4;\n\ + vxc_ushort8 tmp, dst;\n\ + _viv_asm(COPY, tmp, dst4, 16);\n\ + dst.s0123 = tmp.s1357;\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_bilinear_BF16toBF16_UP\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + float bottom_y_f = ceil(in_y);\n\ + int bottom_y_idx= convert_int(bottom_y_f);\n\ + vxc_ushort8 src0, src1, src2, src3, dst0, dst1;\n\ 
+ vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.y = bottom_y_idx;\n\ + VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 bitextract_p0;\n\ + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + vxc_ushort8 constData = 16;\n\ + VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ +\n\ + do\n\ + {\n\ + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.z ++;\n\ + coord_in.y = top_y_idx;\n\ + VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.y = bottom_y_idx;\n\ + VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 dst_tmp;\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, left4, dst_tmp, 16);\n\ + VXC_DP2x8(dst_tmp, dst0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, right4, dst_tmp, 16);\n\ + right4 -= left4;\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, left4, dst_tmp, 16);\n\ + VXC_DP2x8(dst_tmp, dst1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, right4, dst_tmp, 16);\n\ + right4 -= left4;\n\ + bottom4 = right4 * x_lerp + left4;\n\ +\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ +\n\ + vxc_ushort8 tmp, dst;\n\ + _viv_asm(COPY, tmp, dst4, 16);\n\ + dst.s0123 = tmp.s1357;\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.z ++;\n\ + } while (coord_in.z < depth);\n\ +}\n\ +"; /* end of resize_bilinear_BF16_vx*/ + +static const char resize_bilinear_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ +_viv_uniform VXC_512Bits uniExtactHalf8_2x8;\n\ +_viv_uniform float2 scale_xy;\n\ +_viv_uniform int depth;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ 
+_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ +_viv_uniform float half_pixel_value;\n\ +\n\ +__kernel void resize_bilinear_F16toF16_DOWN\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + float bottom_y_f = ceil(in_y);\n\ + int bottom_y_idx= convert_int(bottom_y_f);\n\ + vxc_short8 top_left0, top_right0;\n\ + vxc_short8 bottom_left0, bottom_right0;\n\ + vxc_half8 top_left, top_right;\n\ + vxc_half8 bottom_left, bottom_right;\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top_left, top_left0, 16);\n\ +\n\ + coord_in.x = right_x_idx.x;\n\ + VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.y;\n\ + VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.z;\n\ + VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.w;\n\ + VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top_right, top_right0, 16);\n\ +\n\ + coord_in.y = bottom_y_idx;\n\ + coord_in.x = left_x_idx.x;\n\ + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, bottom_left, bottom_left0, 16);\n\ +\n\ + 
coord_in.x = right_x_idx.x;\n\ + VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.y;\n\ + VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.z;\n\ + VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.w;\n\ + VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, bottom_right, bottom_right0, 16);\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ + VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ + VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + half4 tmp;\n\ + _viv_asm(CONV, tmp, dst4);\n\ + VXC_DP2x8(top_left, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, top_left0, top_left, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, top_left0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_bilinear_F16toU8_DOWN\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + float bottom_y_f = ceil(in_y);\n\ + int bottom_y_idx= convert_int(bottom_y_f);\n\ + vxc_short8 top_left0, top_right0;\n\ + vxc_short8 bottom_left0, bottom_right0;\n\ + vxc_half8 top_left, top_right;\n\ + vxc_half8 bottom_left, bottom_right;\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_ReadImage2DArray(top_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, 
top_left, top_left0, 16);\n\ +\n\ + coord_in.x = right_x_idx.x;\n\ + VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.y;\n\ + VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.z;\n\ + VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.w;\n\ + VXC_ReadImage2DArray(top_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top_right, top_right0, 16);\n\ +\n\ + coord_in.y = bottom_y_idx;\n\ + coord_in.x = left_x_idx.x;\n\ + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_ReadImage2DArray(bottom_left0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, bottom_left, bottom_left0, 16);\n\ +\n\ + coord_in.x = right_x_idx.x;\n\ + VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.y;\n\ + VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.z;\n\ + VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.w;\n\ + VXC_ReadImage2DArray(bottom_right0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, bottom_right, bottom_right0, 16);\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ + VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ + VXC_DP4x4(right4, top_right, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(left4, bottom_left, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ + VXC_DP4x4(right4, bottom_right, bottom_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + vxc_uchar8 dst_uchar;\n\ + VXC_DP2x8(dst_uchar, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst_uchar, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_bilinear_F16toF16_UP\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 
2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + float bottom_y_f = ceil(in_y);\n\ + int bottom_y_idx= convert_int(bottom_y_f);\n\ +\n\ + vxc_ushort8 src0, src1, src2, src3, dst0, dst1;\n\ + vxc_half8 top;\n\ + vxc_half8 bottom;\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.y = bottom_y_idx;\n\ + VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 bitextract_p0;\n\ + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + vxc_ushort8 constData = 16;\n\ + VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ +\n\ + do\n\ + {\n\ + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, dst0, 16);\n\ + _viv_asm(COPY, bottom, dst1, 16);\n\ + coord_in.z ++;\n\ + coord_in.y = top_y_idx;\n\ + VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y = bottom_y_idx;\n\ + VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + half4 tmp;\n\ + _viv_asm(CONV, tmp, dst4);\n\ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst0, top, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + 
coord_out.z ++;\n\ + } while (coord_in.z < depth);\n\ +}\n\ +"; /* end of resize_bilinear_F16_vx*/ + +static const char resize_bilinear_I16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform float2 scale_xy;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4;\n\ +_viv_uniform float dfpScale;\n\ +_viv_uniform float half_pixel_value;\n\ +\n\ +__kernel void resize_bilinear_I16toI16_UP\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ +\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ +\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ +\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + float bottom_y_f = ceil(in_y);\n\ + int bottom_y_idx= convert_int(bottom_y_f);\n\ +\n\ + vxc_ushort8 src0, src1, src2, src3, dst0, dst1;\n\ +\n\ + vxc_short8 top;\n\ + vxc_short8 bottom;\n\ +\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.y = bottom_y_idx;\n\ + VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 bitextract_p0;\n\ + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + vxc_ushort8 constData = 16;\n\ + VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ +\n\ + do\n\ + {\n\ + VXC_BitExtract(dst0, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, dst0, 16);\n\ + _viv_asm(COPY, bottom, dst1, 16);\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + coord_in.z ++;\n\ + coord_in.y = top_y_idx;\n\ + VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.y = bottom_y_idx;\n\ + VXC_ReadImage2DArray(src2, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + 
VXC_ReadImage2DArray(src3, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + dst4 = dst4 * dfpScale;\n\ + int4 dst = convert_int4_rte(dst4);\n\ +\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.z ++;\n\ + } while (coord_in.z < depth);\n\ +}\n\ +\n\ +__kernel void resize_bilinear_I16toI16_DOWN\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ +\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ +\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ +\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + float bottom_y_f = ceil(in_y);\n\ + int bottom_y_idx= convert_int(bottom_y_f);\n\ +\n\ + vxc_short8 top_left, top_right;\n\ + vxc_short8 bottom_left, bottom_right;\n\ +\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x = right_x_idx.x;\n\ + VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.y;\n\ + VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.z;\n\ + VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.w;\n\ + VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.y = bottom_y_idx;\n\ + coord_in.x = left_x_idx.x;\n\ + 
VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x = right_x_idx.x;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.y;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.z;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.w;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + VXC_DP4x4(left4, top_left, top_left, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_DP4x4(right4, top_right, top_right, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ +\n\ + right4 -= left4;\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom_left, bottom_left, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_DP4x4(right4, bottom_right, bottom_right, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ +\n\ + right4 -= left4;\n\ + bottom4 = right4 * x_lerp + left4;\n\ +\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ +\n\ + dst4 = dst4 * dfpScale;\n\ +\n\ + int4 dst = convert_int4_rte(dst4);\n\ +\n\ + VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +"; /* end of resize_bilinear_I16_vx*/ + +static const char resize_bilinear_I8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform float2 scale_xy;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertDFP2FP32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertDFP2FP32_4x4;\n\ +_viv_uniform float dfpScale;\n\ +_viv_uniform float half_pixel_value;\n\ +\n\ +__kernel void resize_bilinear_I8toI8_UP\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ +\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = 
convert_int4(right_x_f);\n\ +\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ +\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + float bottom_y_f = ceil(in_y);\n\ + int bottom_y_idx= convert_int(bottom_y_f);\n\ +\n\ + vxc_uchar16 src0, src1, dst0, dst1;\n\ +\n\ + vxc_char16 top;\n\ + vxc_char16 bottom;\n\ +\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.y = bottom_y_idx;\n\ + VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 bitextract_p0;\n\ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + vxc_ushort8 constData = 8;\n\ + VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ +\n\ + do\n\ + {\n\ + VXC_BitExtract(dst0, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, top, dst0, 16);\n\ + _viv_asm(COPY, bottom, dst1, 16);\n\ +\n\ + coord_in.z ++;\n\ + coord_in.y = top_y_idx;\n\ + VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y = bottom_y_idx;\n\ + VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + VXC_DP4x4(left4, top, top, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_DP4x4(right4, top, top, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);\n\ +\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_part1_4x4);\n\ +\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + dst4 = dst4 * dfpScale;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.z ++;\n\ + } while (coord_in.z < depth);\n\ +}\n\ +\n\ +__kernel void resize_bilinear_I8toI8_DOWN\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ +\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ +\n\ + float in_y = 
(convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ +\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + float bottom_y_f = ceil(in_y);\n\ + int bottom_y_idx= convert_int(bottom_y_f);\n\ +\n\ + vxc_char16 top_left, top_right;\n\ + vxc_char16 bottom_left, bottom_right;\n\ +\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x = right_x_idx.x;\n\ + VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.y;\n\ + VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.z;\n\ + VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.w;\n\ + VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.y = bottom_y_idx;\n\ + coord_in.x = left_x_idx.x;\n\ + VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x = right_x_idx.x;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.y;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.z;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.w;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + VXC_DP4x4(left4, top_left, top_left, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_DP4x4(right4, top_right, top_right, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ +\n\ + right4 -= left4;\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom_left, 
bottom_left, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ + VXC_DP4x4(right4, bottom_right, bottom_right, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_4x4);\n\ +\n\ + right4 -= left4;\n\ + bottom4 = right4 * x_lerp + left4;\n\ +\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ +\n\ + dst4 = dst4 * dfpScale;\n\ +\n\ + int4 dst = convert_int4_rte(dst4);\n\ +\n\ + VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of resize_bilinear_I8_vx*/ + +static const char resize_bilinear_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform float2 scale_xy;\n\ +_viv_uniform int depth;\n\ +_viv_uniform int input_ZP;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ +_viv_uniform float half_pixel_value;\n\ +\n\ +__kernel void resize_bilinear_U8toF16_DOWN\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + float bottom_y_f = ceil(in_y);\n\ + int bottom_y_idx= convert_int(bottom_y_f);\n\ + vxc_uchar16 top_left, top_right;\n\ + vxc_uchar16 bottom_left, bottom_right;\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x = right_x_idx.x;\n\ + VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.y;\n\ + VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.z;\n\ + VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.w;\n\ + VXC_ReadImage2DArray(top_right, input, 
coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.y = bottom_y_idx;\n\ + coord_in.x = left_x_idx.x;\n\ + VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x = right_x_idx.x;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.y;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.z;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.w;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + unsigned char inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ + VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ + VXC_DP4x4(right4, top_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ +\n\ + right4 -= left4;\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ + VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ +\n\ + right4 -= left4;\n\ + bottom4 = right4 * x_lerp + left4;\n\ +\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ +\n\ + dst4 *= uint8Scale;\n\ +\n\ + half4 dst;\n\ + _viv_asm(CONV, dst, dst4);\n\ +\n\ + vxc_short8 dst_short;\n\ + _viv_asm(COPY, dst_short, dst, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord_out, dst_short.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_bilinear_U8toU8_UP\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ +\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ +\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ +\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + float bottom_y_f = ceil(in_y);\n\ + int bottom_y_idx= convert_int(bottom_y_f);\n\ +\n\ + vxc_uchar16 src0, src1;\n\ +\n\ + 
vxc_uchar16 top;\n\ + vxc_uchar16 bottom;\n\ +\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.y = bottom_y_idx;\n\ + VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 bitextract_p0;\n\ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + vxc_ushort8 constData = 8;\n\ + VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ +\n\ + do\n\ + {\n\ + VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.z ++;\n\ + coord_in.y = top_y_idx;\n\ + VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y = bottom_y_idx;\n\ + VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + unsigned char inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ +\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ +\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.z ++;\n\ + } while (coord_in.z < depth);\n\ +}\n\ +\n\ +__kernel void resize_bilinear_U8toU8_DOWN\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + float bottom_y_f = ceil(in_y);\n\ + int bottom_y_idx= convert_int(bottom_y_f);\n\ + vxc_uchar16 top_left, top_right;\n\ + vxc_uchar16 bottom_left, bottom_right;\n\ + int4 coord_in = 
(int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_ReadImage2DArray(top_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x = right_x_idx.x;\n\ + VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.y;\n\ + VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.z;\n\ + VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.w;\n\ + VXC_ReadImage2DArray(top_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.y = bottom_y_idx;\n\ + coord_in.x = left_x_idx.x;\n\ + VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_ReadImage2DArray(bottom_left, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x = right_x_idx.x;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.y;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.z;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = right_x_idx.w;\n\ + VXC_ReadImage2DArray(bottom_right, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4;\n\ + float4 right4;\n\ + float4 top4;\n\ + float4 bottom4;\n\ +\n\ + unsigned char inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ + VXC_DP4x4(left4, top_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ + VXC_DP4x4(right4, top_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ +\n\ + right4 -= left4;\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom_left, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ + VXC_DP4x4(right4, bottom_right, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_4x4);\n\ +\n\ + right4 -= left4;\n\ + bottom4 = right4 * x_lerp + left4;\n\ +\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ +\n\ + 
dst4 = dst4 * uint8Scale + output_ZP;\n\ +\n\ + int4 dst = convert_int4_rte(dst4);\n\ +\n\ + VXC_DP2x8(top_left, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, top_left, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of resize_bilinear_U8_vx*/ + +static const char resize_bilinear_U8_UP_2X_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniResize2xUp_4x8;\n\ +_viv_uniform VXC_512Bits uniResize2xUpRound_2x8;\n\ +_viv_uniform int out_height;\n\ +\n\ +__kernel void resize_bilinear_U8toU8_UP_2X_half\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);\n\ + coord_in.x = (coord_out.x * 2 - 1) >> 2;\n\ + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;\n\ +\n\ + vxc_uchar16 in0, in1, tmp, result;\n\ + vxc_ushort8 result_s, round_s = 8;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + while (coord_out.y < out_height)\n\ + {\n\ + VXC_DP4x8(result_s, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8);\n\ + VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x8(result_s, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8);\n\ + VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(result_s, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8);\n\ + VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(result_s, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8);\n\ + VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord_in.y += 2;\n\ + coord_out.y++;\n\ + }\n\ +}\n\ +"; /* end of resize_bilinear_U8_UP_2X_vx*/ + +static 
const char resize_bilinear_U8_opt_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#if (VX_VERSION==2)\n\ +\n\ +_viv_uniform float2 scale_xy;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_4x4_b;\n\ +_viv_uniform float half_pixel_value;\n\ +\n\ +__kernel void resize_bilinear_U8toU8_UP_opt\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers,\n\ + __read_only image2d_array_t scale\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ +\n\ + float4 left_x_f = floor(in_x);\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + int4 right_x_idx = left_x_idx + 1;\n\ +\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ +\n\ + float top_y_f = floor(in_y);\n\ + int top_y_idx = convert_int(top_y_f);\n\ +\n\ + vxc_uchar16 src0, src1;\n\ +\n\ + vxc_uchar16 top_bottom;\n\ +\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 bitextract_p0;\n\ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + vxc_ushort8 constData = 8;\n\ + VXC_DP2x8(maskShift, bitextract_p0, constData,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ +\n\ + vxc_ushort16 lerp_0;\n\ + vxc_half16 lerp;\n\ +\n\ + int2 coord = (int2)(coord_out.x * 4, coord_out.y);\n\ + VXC_ReadImage(lerp_0.hi, scale, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(lerp_0.lo, scale, coord, VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, lerp.hi, lerp_0.hi, 16);\n\ + _viv_asm(COPY, lerp.lo, lerp_0.lo, 16);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + do\n\ + {\n\ + VXC_BitExtract(top_bottom, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(top_bottom, src1, src1, maskShift, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.w += input_desc.s4;\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xyww,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xyww,\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_DP4x4_b(dst, lerp.hi, lerp.lo, top_bottom,\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4x4_b);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, 
dst,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.w += output_desc.s4;\n\ +\n\ + coord_out.z ++;\n\ + } while (coord_out.z < depth);\n\ +}\n\ +\n\ +#endif"; /* end of resize_bilinear_U8_opt_vx*/ + +static const char resize_nearest_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8;\n\ +_viv_uniform float2 scale_xy;\n\ +_viv_uniform float half_pixel_value;\n\ +_viv_uniform float round_value;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +\n\ +#define NEAREST_INDEX_PROCESS() \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); \\\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx + round_value; \\\n\ + int4 in_x_idx = convert_int4(in_x); \\\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y + round_value; \\\n\ + int in_y_idx = convert_int(in_y); \\\n\ +\n\ +\n\ +__kernel void resize_nearest_F16toF16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_short8 src;\n\ + int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.y;\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.z;\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.w;\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniGetExtractData_2x8;\n\ +__kernel void resize_nearest_F16toF16_op\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_ushort8 src0, src1, dst;\n\ + int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + //in_x_idx = in_x_idx - in_x_idx.xxxx;\n\ + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);\n\ + vxc_ushort8 input_idx;\n\ + _viv_asm(COPY, input_idx, in_x_idx, 16);\n\ + VXC_DP2x8(mask, input_idx, input_idx, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);\n\ + VXC_BitExtract(dst, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_2x8;\n\ +__kernel void resize_nearest_I8toI8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_char16 src;\n\ + int4 coord_in = 
(int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.y;\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.z;\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.w;\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_nearest_I8toI8_op\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_uchar16 src0, dst0;\n\ + vxc_char16 dst;\n\ + int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);\n\ + vxc_ushort8 input_idx;\n\ + _viv_asm(COPY, input_idx, in_x_idx, 16);\n\ + VXC_DP2x8(mask, input_idx, input_idx, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);\n\ + VXC_BitExtract(dst0, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, dst, dst0, 8);\n\ + VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_nearest_U8toU8\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_uchar16 src;\n\ + int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.y;\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.z;\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.w;\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + vxc_ushort8 multiplier;\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ + VXC_DP2x8(src, src, multiplier, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_nearest_U8toU8_op\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_uchar16 src0, dst;\n\ + int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ + 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8);\n\ + vxc_ushort8 input_idx;\n\ + _viv_asm(COPY, input_idx, in_x_idx, 16);\n\ + VXC_DP2x8(mask, input_idx, input_idx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);\n\ + VXC_BitExtract(dst, src0, src0, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + vxc_ushort8 multiplier;\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ + VXC_DP2x8(dst, dst, multiplier, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_nearest_I16toI16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_short8 src;\n\ + int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ +\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.y;\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.z;\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = in_x_idx.w;\n\ + VXC_ReadImage2DArray(src, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(src, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_nearest_I16toI16_op\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ +\n\ + vxc_ushort8 src0, src1, dst0;\n\ + vxc_short8 dst;\n\ + int4 coord_in = (int4)(in_x_idx.x, in_y_idx, coord_out.z, 0);\n\ + VXC_ReadImage2DArray(src0, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, \\\n\ + VXC_5BITOFFSET_XY(8, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //in_x_idx = in_x_idx - in_x_idx.xxxx;\n\ + vxc_uchar16 mask = (vxc_uchar16)(8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16);\n\ + vxc_ushort8 input_idx;\n\ + _viv_asm(COPY, input_idx, in_x_idx, 16);\n\ + VXC_DP2x8(mask, input_idx, input_idx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetExtractData_2x8);\n\ + VXC_BitExtract(dst0, src0, src1, mask, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, dst, dst0, 8);\n\ + VXC_DP2x8(dst, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of resize_nearest_vx*/ + +static const char scatter_nd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccumulateSum_2x8;\n\ +_viv_uniform int index_num;\n\ +_viv_uniform int zeropoint;\n\ +_viv_uniform int offsetX;\n\ +_viv_uniform int offsetY;\n\ +_viv_uniform int offsetZ;\n\ +\n\ +__kernel void scatter_nd_F16toF16(\n\ + __read_only image2d_t input0,\n\ + __read_only 
image2d_t input1,\n\ + image2d_array_t output,\n\ + int width,\n\ + int area,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ +\n\ + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_half8 sum;\n\ + _viv_asm(COPY, sum, tmpVal, 16);\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice = read_imagei(input0, (int2)(0, i));\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ;\n\ + if(gidy == idx)\n\ + {\n\ + vxc_half8 src;\n\ + VXC_ReadImage(tmpVal, input1, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src, tmpVal, 16);\n\ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8);\n\ + }\n\ + }\n\ + _viv_asm(COPY, tmpVal, sum, 16);\n\ + VXC_WriteImage(output, (int2)(gidx, gidy), tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define SCATTER_ND_QINT(src0_type_name, data_type) \\\n\ +__kernel void scatter_nd_##src0_type_name##to##src0_type_name##( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + image2d_array_t output, \\\n\ + int width, \\\n\ + int area, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int firstFlg = 1; \\\n\ + \\\n\ + data_type sum = (data_type)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + for(int i = 0; i < index_num; i++) \\\n\ + { \\\n\ + int4 indice = read_imagei(input0, (int2)(0, i)); \\\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ; \\\n\ + if(gidy == idx) \\\n\ + { \\\n\ + data_type src; \\\n\ + VXC_ReadImage(src, input1, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); \\\n\ + if(firstFlg) \\\n\ + { \\\n\ + firstFlg = 0; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + if(firstFlg) \\\n\ + { \\\n\ + sum = (data_type)(zeropoint, zeropoint, zeropoint, zeropoint, \\\n\ + zeropoint, zeropoint, zeropoint, zeropoint); \\\n\ + } \\\n\ + VXC_WriteImage(output, (int2)(gidx, gidy), sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SCATTER_ND_QINT(U8, vxc_uchar8)\n\ +SCATTER_ND_QINT(I8, vxc_char8)\n\ +SCATTER_ND_QINT(I16, vxc_short8)\n\ +"; /* end of scatter_nd_vx*/ + +static const char scatter_nd_big_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccumulateSum_2x8;\n\ +_viv_uniform int index_num;\n\ +_viv_uniform int update_width;\n\ +_viv_uniform int output_width;\n\ +_viv_uniform int zeropoint;\n\ +\n\ +_viv_uniform int offsetX;\n\ +_viv_uniform int offsetY;\n\ +_viv_uniform int offsetZ;\n\ +\n\ +inline uchar* get_image2D_array_ptr(image2d_t input)\n\ +{\n\ + int8 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ + uchar *src_ptr = (uchar*)desc.s0;\n\ +\n\ + return src_ptr;\n\ +}\n\ +\n\ +__kernel void scatter_nd_F16toF16_big(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + image2d_t output,\n\ + int width,\n\ + int area,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ +\n\ + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_half8 sum;\n\ + _viv_asm(COPY, sum, tmpVal, 16);\n\ + __global int* index_ptr = (__global int*)get_image2D_array_ptr(input0);\n\ + __global short* update_ptr = (__global short*)get_image2D_array_ptr(input1);\n\ + __global short* output_ptr = (__global 
short*)get_image2D_array_ptr(output);\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice = vload4(0, index_ptr + i * coord_dim);\n\ +\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ;\n\ + if(gidy == idx)\n\ + {\n\ + vxc_half8 src;\n\ + short tmpData = update_ptr[i * update_width + gidx];\n\ + _viv_asm(COPY, src, tmpData, 4);\n\ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8);\n\ + }\n\ + }\n\ + short dst;\n\ + _viv_asm(COPY, dst, sum, 4);\n\ + output_ptr[gidy * output_width+ gidx] = dst;\n\ +}\n\ +\n\ +#define SCATTER_ND_QINT_BIG(src0_type_name, data_type, ptr_type) \\\n\ +__kernel void scatter_nd_##src0_type_name##to##src0_type_name##_big( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + image2d_t output, \\\n\ + int width, \\\n\ + int area, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int firstFlg = 1; \\\n\ + \\\n\ + data_type sum = (data_type)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + __global int* index_ptr = (__global int*)get_image2D_array_ptr(input0); \\\n\ + __global ptr_type* update_ptr = (__global ptr_type*)get_image2D_array_ptr(input1); \\\n\ + __global ptr_type* output_ptr = (__global ptr_type*)get_image2D_array_ptr(output); \\\n\ + for(int i = 0; i < index_num; i++) \\\n\ + { \\\n\ + int4 indice = vload4(0, index_ptr + i * coord_dim); \\\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ; \\\n\ + if(gidy == idx) \\\n\ + { \\\n\ + data_type src; \\\n\ + ptr_type tmpData = update_ptr[i * update_width + gidx]; \\\n\ + _viv_asm(COPY, src, tmpData, 4); \\\n\ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); \\\n\ + if(firstFlg) \\\n\ + { \\\n\ + firstFlg = 0; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + ptr_type dst; \\\n\ + if(firstFlg) \\\n\ + { \\\n\ + _viv_asm(COPY, dst, zeropoint, 4); \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + _viv_asm(COPY, dst, sum, 4); \\\n\ + } \\\n\ + output_ptr[gidy * output_width+ gidx] = dst; \\\n\ +}\n\ +SCATTER_ND_QINT_BIG(U8, vxc_uchar8, uchar)\n\ +SCATTER_ND_QINT_BIG(I8, vxc_char8, char)\n\ +SCATTER_ND_QINT_BIG(I16, vxc_short8, short)\n\ +"; /* end of scatter_nd_big_vx*/ + +static const char select_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvConditiontoDst_2x8;\n\ +_viv_uniform VXC_512Bits uniConvIntIn0toDst_2x8;\n\ +_viv_uniform VXC_512Bits uniConvIntIn1toDst_2x8;\n\ +_viv_uniform VXC_512Bits uniU8SubZP_MulM_PStoF16In0_2x8;\n\ +_viv_uniform VXC_512Bits uniU8SubZP_MulM_PStoF16In1_2x8;\n\ +_viv_uniform int input0Zp;\n\ +_viv_uniform int input1Zp;\n\ +_viv_uniform int outputZP;\n\ +_viv_uniform VXC_512Bits uniU8AddZP_2x8;\n\ +\n\ +#define SELECT_INT(type_name, read_fun, write_fun) \\\n\ + type_name tmp, src0, src1, dst, value; \\\n\ + vxc_char8 value_tmp; \\\n\ + read_fun(tmp, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(src0, tmp, tmp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvIntIn0toDst_2x8); \\\n\ + read_fun(tmp, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(src1, tmp, tmp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvIntIn1toDst_2x8); \\\n\ + read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ + 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \\\n\ + dst = (value != 0 ? src0 : src1); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define SELECT_INT_FUN(cond_name, src_name, dst_name, type_name) \\\n\ +__kernel void select_##cond_name##_##src_name##_##src_name##to##dst_name( \\\n\ + __read_only image2d_array_t condition, \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + SELECT_INT(type_name, VXC_ReadImage2DArray, VXC_WriteImage2DArray) \\\n\ +}\n\ +\n\ +SELECT_INT_FUN(I8, I8, I8, vxc_char8)\n\ +SELECT_INT_FUN(I8, I16, I16, vxc_short8)\n\ +\n\ +#define SELECT_INT_FUN_2D(cond_name, src_name, dst_name, type_name) \\\n\ +__kernel void select_##cond_name##_##src_name##_##src_name##to##dst_name##_2D( \\\n\ + __read_only image2d_array_t condition, \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + SELECT_INT(type_name, VXC_ReadImage, VXC_WriteImage) \\\n\ +}\n\ +\n\ +SELECT_INT_FUN_2D(I8, I8, I8, vxc_char8)\n\ +SELECT_INT_FUN_2D(I8, I16, I16, vxc_short8)\n\ +\n\ +#define SELECT_HALF(read_fun, write_fun) \\\n\ + vxc_short8 src0, src1, dst, value; \\\n\ + vxc_char8 value_tmp; \\\n\ + read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \\\n\ + dst = (value != 0 ? 
src0 : src1); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void select_I8_F16_F16toF16(\n\ + __read_only image2d_array_t condition,\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + SELECT_HALF(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void select_I8_F16_F16toF16_2D(\n\ + __read_only image2d_array_t condition,\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + SELECT_HALF(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +#define SELECT_U8(read_fun, write_fun) \\\n\ + vxc_uchar8 tmp, src0, src1, dst; \\\n\ + vxc_char8 value; \\\n\ + vxc_half8 tmp1; \\\n\ + vxc_uchar16 input0_ZP, input1_ZP, output_ZP; \\\n\ + _viv_asm(COPY, input0_ZP, input0Zp, 4); \\\n\ + _viv_asm(COPY, input1_ZP, input1Zp, 4); \\\n\ + _viv_asm(COPY, output_ZP, outputZP, 4); \\\n\ + read_fun(tmp, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(tmp1, tmp, input0_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8SubZP_MulM_PStoF16In0_2x8); \\\n\ + VXC_DP2x8(src0, tmp1, output_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8AddZP_2x8); \\\n\ + read_fun(tmp, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(tmp1, tmp, input1_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8SubZP_MulM_PStoF16In1_2x8); \\\n\ + VXC_DP2x8(src1, tmp1, output_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8AddZP_2x8); \\\n\ + read_fun(value, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + dst = (value != 0 ? 
src0 : src1); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void select_I8_U8_U8toU8(\n\ + __read_only image2d_array_t condition,\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + SELECT_U8(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void select_I8_U8_U8toU8_2D(\n\ + __read_only image2d_array_t condition,\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + SELECT_U8(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +"; /* end of select_vx*/ + +static const char swish_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float logE;\n\ +\n\ +float4 sigmoid_(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform float inputTail;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ +\n\ +#define SWISH_PROCESS(read_fun, write_fun, src_type, src_copy_type, convert_type, dst_type, dst_copy_type, \\\n\ + INSCALE, INTAIL, OUTSCALE, OUTZP) \\\n\ + src_type src0; \\\n\ + src_copy_type src1; \\\n\ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, src0, 16); \\\n\ + float4 vecA, vecB, vecC, vecD; \\\n\ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\ + VXC_DP4x4(vecB, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part1_4x4); \\\n\ + vecA = vecA * INSCALE + INTAIL; \\\n\ + vecB = vecB * INSCALE + INTAIL; \\\n\ + vecC = beta * vecA; \\\n\ + vecD = beta * vecB; \\\n\ + vecC = sigmoid_(vecC); \\\n\ + vecD = sigmoid_(vecD); \\\n\ + vecA = vecA * vecC; \\\n\ + vecB = vecB * vecD; \\\n\ + vecA = vecA * OUTSCALE + OUTZP; \\\n\ + vecB = vecB * OUTSCALE + OUTZP; \\\n\ + convert_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, vecA); \\\n\ + _viv_asm(CONV_RTE, dst1, vecB); \\\n\ + dst_type dst2; \\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + dst_copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst2, 16); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +#define SWISH_FUNC(src_type_name, dst_type_name, src_type, src_copy_type, convert_type, dst_type, dst_copy_type,\\\n\ + INSCALE, INTAIL, OUTSCALE, OUTZP) \\\n\ + __kernel void swish_##src_type_name##to##dst_type_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float beta \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + SWISH_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray, src_type, src_copy_type, convert_type, \\\n\ + dst_type, dst_copy_type, INSCALE, INTAIL, OUTSCALE, OUTZP) \\\n\ +}\n\ +\n\ +SWISH_FUNC(F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8, 1, 0, 1, 0)\n\ +SWISH_FUNC(F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8, 1, 0, outputScale, 0)\n\ +SWISH_FUNC(F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8, 1, 0, outputScale, 
outputZP)\n\ +SWISH_FUNC(F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8, 1, 0, outputScale, 0)\n\ +SWISH_FUNC(I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8, inputScale, 0, outputScale, 0)\n\ +SWISH_FUNC(I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0)\n\ +SWISH_FUNC(U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8, \\\n\ +inputScale, inputTail, outputScale, outputZP)\n\ +SWISH_FUNC(U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8, inputScale, inputTail, 1, 0)\n\ +SWISH_FUNC(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8, inputScale, 0, outputScale, 0)\n\ +SWISH_FUNC(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0)\n\ +\n\ +\n\ +#define SWISH_FUNC_2D(src_type_name, dst_type_name, src_type, src_copy_type, convert_type, dst_type, \\\n\ + dst_copy_type, INSCALE, INTAIL, OUTSCALE, OUTZP) \\\n\ + __kernel void swish_##src_type_name##to##dst_type_name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float beta \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + SWISH_PROCESS(VXC_ReadImage, VXC_WriteImage, src_type, src_copy_type, convert_type, dst_type, \\\n\ + dst_copy_type, INSCALE, INTAIL, OUTSCALE, OUTZP) \\\n\ +}\n\ +\n\ +SWISH_FUNC_2D(F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8, 1, 0, 1, 0)\n\ +SWISH_FUNC_2D(F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8, 1, 0, outputScale, 0)\n\ +SWISH_FUNC_2D(F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8, 1, 0, outputScale, outputZP)\n\ +SWISH_FUNC_2D(F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8, 1, 0, outputScale, 0)\n\ +SWISH_FUNC_2D(I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8, inputScale, 0, outputScale, 0)\n\ +SWISH_FUNC_2D(I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0)\n\ +SWISH_FUNC_2D(U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8, \\\n\ +inputScale, inputTail, outputScale, outputZP)\n\ +SWISH_FUNC_2D(U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8, inputScale, inputTail, 1, 0)\n\ +SWISH_FUNC_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8, inputScale, 0, outputScale, 0)\n\ +SWISH_FUNC_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8, inputScale, 0, 1, 0)\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +#define SWISH_BF16_PROCESS(read_fun, write_fun) \\\n\ + vxc_ushort8 src0, src1, dst; \\\n\ + float4 vecA, vecB, vecC, vecD; \\\n\ + read_fun(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecA, src1, 16); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, vecB, src1, 16); \\\n\ + vecC = beta * vecA; \\\n\ + vecD = beta * vecB; \\\n\ + vecC = sigmoid_(vecC); \\\n\ + vecD = sigmoid_(vecD); \\\n\ + vecA = vecA * vecC; \\\n\ + vecB = vecB * vecD; \\\n\ + _viv_asm(COPY, src0, vecA, 16); \\\n\ + _viv_asm(COPY, src1, vecB, 16); \\\n\ + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); 
\\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void swish_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float beta\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + SWISH_BF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray);\n\ +}\n\ +\n\ +__kernel void swish_BF16toBF16_2D(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float beta\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + SWISH_BF16_PROCESS(VXC_ReadImage, VXC_WriteImage);\n\ +}\n\ +"; /* end of swish_vx*/ + +static const char tile_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int lastWorkItem;\n\ +\n\ +#define TILE_3D(name0, name1, name2, remainder, type) \\\n\ +__kernel void tile_remain##name2##_##name0##to##name1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int batchIn, \\\n\ + int depthIn, \\\n\ + int depthOut, \\\n\ + int multiples_0, \\\n\ + int multiples_1, \\\n\ + int multiples_2, \\\n\ + int multiples_3 \\\n\ +) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out; \\\n\ + int width = get_image_width(input); \\\n\ + int height = get_image_height(input); \\\n\ + int output_width = get_image_width(output); \\\n\ + type src; \\\n\ + VXC_ReadImage2DArray(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int isLastItem = coord.x == lastWorkItem; \\\n\ + \\\n\ + int batch_id = (short)coord.z / (short)depthIn; \\\n\ + coord.z = (short)coord.z % (short)depthIn; \\\n\ + coord_out = coord; \\\n\ + \\\n\ + for (int w = 0; w < multiples_3; w++) \\\n\ + { \\\n\ + int batch = batchIn * w + batch_id; \\\n\ + \\\n\ + for(int z = 0; z < multiples_2; z++) \\\n\ + { \\\n\ + coord_out.z = coord.z + z * depthIn + batch * depthOut; \\\n\ + \\\n\ + for (int y = 0; y < multiples_1; y++) \\\n\ + { \\\n\ + coord_out.y = coord.y + y * height; \\\n\ + \\\n\ + for (int x = 0; x < multiples_0; x++) \\\n\ + { \\\n\ + coord_out.x = coord.x + x * width; \\\n\ + if (isLastItem) \\\n\ + VXC_WriteImage2DArray(output, coord_out, src, \\\n\ + VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \\\n\ + else \\\n\ + VXC_WriteImage2DArray(output, coord_out, src, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ +}\n\ +TILE_3D(U8, U8, 1, 0, vxc_uchar8)\n\ +TILE_3D(U8, U8, 2, 1, vxc_uchar8)\n\ +TILE_3D(U8, U8, 3, 2, vxc_uchar8)\n\ +TILE_3D(U8, U8, 4, 3, vxc_uchar8)\n\ +TILE_3D(U8, U8, 5, 4, vxc_uchar8)\n\ +TILE_3D(U8, U8, 6, 5, vxc_uchar8)\n\ +TILE_3D(U8, U8, 7, 6, vxc_uchar8)\n\ +TILE_3D(U8, U8, 0, 7, vxc_uchar8)\n\ +\n\ +TILE_3D(I16, I16, 1, 0, vxc_short8)\n\ +TILE_3D(I16, I16, 2, 1, vxc_short8)\n\ +TILE_3D(I16, I16, 3, 2, vxc_short8)\n\ +TILE_3D(I16, I16, 4, 3, vxc_short8)\n\ +TILE_3D(I16, I16, 5, 4, vxc_short8)\n\ +TILE_3D(I16, I16, 6, 5, vxc_short8)\n\ +TILE_3D(I16, I16, 7, 6, vxc_short8)\n\ +TILE_3D(I16, I16, 0, 7, vxc_short8)\n\ +\n\ +\n\ +#define TILE_2D(name0, name1, name2, remainder, type) \\\n\ +__kernel void tile_remain##name2##_##name0##to##name1##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int batchIn, \\\n\ + int depthIn, \\\n\ + int depthOut, \\\n\ + int multiples_0, \\\n\ + int multiples_1, \\\n\ + int multiples_2, \\\n\ + 
int multiples_3 \\\n\ +) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + int width = get_image_width(input); \\\n\ + int height = get_image_height(input); \\\n\ + int output_width = get_image_width(output); \\\n\ + int output_height = get_image_height(output); \\\n\ + type src; \\\n\ + VXC_ReadImage(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int isLastItem = coord.x == lastWorkItem; \\\n\ + do \\\n\ + { \\\n\ + do \\\n\ + { \\\n\ + if (isLastItem) \\\n\ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \\\n\ + else \\\n\ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += width; \\\n\ + } while (coord.x < output_width); \\\n\ + coord.x = get_global_id(0); \\\n\ + coord.y += height; \\\n\ + } while (coord.y < output_height); \\\n\ +}\n\ +TILE_2D(U8, U8, 1, 0, vxc_uchar8)\n\ +TILE_2D(U8, U8, 2, 1, vxc_uchar8)\n\ +TILE_2D(U8, U8, 3, 2, vxc_uchar8)\n\ +TILE_2D(U8, U8, 4, 3, vxc_uchar8)\n\ +TILE_2D(U8, U8, 5, 4, vxc_uchar8)\n\ +TILE_2D(U8, U8, 6, 5, vxc_uchar8)\n\ +TILE_2D(U8, U8, 7, 6, vxc_uchar8)\n\ +TILE_2D(U8, U8, 0, 7, vxc_uchar8)\n\ +\n\ +TILE_2D(I16, I16, 1, 0, vxc_short8)\n\ +TILE_2D(I16, I16, 2, 1, vxc_short8)\n\ +TILE_2D(I16, I16, 3, 2, vxc_short8)\n\ +TILE_2D(I16, I16, 4, 3, vxc_short8)\n\ +TILE_2D(I16, I16, 5, 4, vxc_short8)\n\ +TILE_2D(I16, I16, 6, 5, vxc_short8)\n\ +TILE_2D(I16, I16, 7, 6, vxc_short8)\n\ +TILE_2D(I16, I16, 0, 7, vxc_short8)\n\ +\n\ +\n\ +"; /* end of tile_vx*/ + +static const char tile_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_Lo_2x8;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +\n\ +_viv_uniform int lastWorkItem;\n\ +\n\ +#define TILE_3D_MIX(name0, name1, name2, remainder, type, out_type) \\\n\ +__kernel void tile_remain##name2##_##name0##to##name1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int batchIn, \\\n\ + int depthIn, \\\n\ + int depthOut, \\\n\ + int multiples_0, \\\n\ + int multiples_1, \\\n\ + int multiples_2, \\\n\ + int multiples_3 \\\n\ +) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out; \\\n\ + int width = get_image_width(input); \\\n\ + int height = get_image_height(input); \\\n\ + int output_width = get_image_width(output); \\\n\ + type src; \\\n\ + vxc_half8 src1; \\\n\ + out_type dst; \\\n\ + VXC_ReadImage2DArray(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int isLastItem = coord.x == lastWorkItem; \\\n\ + \\\n\ + int batch_id = (short)coord.z / (short)depthIn; \\\n\ + coord.z = (short)coord.z % (short)depthIn; \\\n\ + coord_out = coord; \\\n\ + \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(src1, src, multiplier,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8MulAndPostShift_Lo_2x8); \\\n\ + _viv_asm(COPY, dst, src1, 16); \\\n\ + \\\n\ + for (int w = 0; w < multiples_3; w++) \\\n\ + { \\\n\ + int batch = batchIn * w + batch_id; \\\n\ + \\\n\ + for(int z = 0; z < multiples_2; z++) \\\n\ + { \\\n\ + coord_out.z = coord.z + z * depthIn + batch * depthOut; \\\n\ + \\\n\ + for (int y = 0; y < multiples_1; y++) \\\n\ + { \\\n\ + coord_out.y = coord.y + y * height; \\\n\ + \\\n\ + for (int x = 0; x < multiples_0; 
x++) \\\n\ + { \\\n\ + coord_out.x = coord.x + x * width; \\\n\ + if (isLastItem) \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, \\\n\ + VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \\\n\ + else \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ +}\n\ +TILE_3D_MIX(U8, F16, 1, 0, vxc_uchar8, vxc_short8)\n\ +TILE_3D_MIX(U8, F16, 2, 1, vxc_uchar8, vxc_short8)\n\ +TILE_3D_MIX(U8, F16, 3, 2, vxc_uchar8, vxc_short8)\n\ +TILE_3D_MIX(U8, F16, 4, 3, vxc_uchar8, vxc_short8)\n\ +TILE_3D_MIX(U8, F16, 5, 4, vxc_uchar8, vxc_short8)\n\ +TILE_3D_MIX(U8, F16, 6, 5, vxc_uchar8, vxc_short8)\n\ +TILE_3D_MIX(U8, F16, 7, 6, vxc_uchar8, vxc_short8)\n\ +TILE_3D_MIX(U8, F16, 0, 7, vxc_uchar8, vxc_short8)\n\ +\n\ +\n\ +#define TILE_2D_MIX(name0, name1, name2, remainder, type, out_type) \\\n\ +__kernel void tile_remain##name2##_##name0##to##name1##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int batchIn, \\\n\ + int depthIn, \\\n\ + int depthOut, \\\n\ + int multiples_0, \\\n\ + int multiples_1, \\\n\ + int multiples_2, \\\n\ + int multiples_3 \\\n\ +) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + int width = get_image_width(input); \\\n\ + int height = get_image_height(input); \\\n\ + int output_width = get_image_width(output); \\\n\ + int output_height = get_image_height(output); \\\n\ + type src; \\\n\ + vxc_half8 src1; \\\n\ + out_type dst; \\\n\ + VXC_ReadImage(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int isLastItem = coord.x == lastWorkItem; \\\n\ + \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(src1, src, multiplier,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8MulAndPostShift_Lo_2x8); \\\n\ + _viv_asm(COPY, dst, src1, 16); \\\n\ + \\\n\ + do \\\n\ + { \\\n\ + do \\\n\ + { \\\n\ + if (isLastItem) \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \\\n\ + else \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += width; \\\n\ + } while (coord.x < output_width); \\\n\ + coord.x = get_global_id(0); \\\n\ + coord.y += height; \\\n\ + } while (coord.y < output_height); \\\n\ +}\n\ +TILE_2D_MIX(U8, F16, 1, 0, vxc_uchar8, vxc_short8)\n\ +TILE_2D_MIX(U8, F16, 2, 1, vxc_uchar8, vxc_short8)\n\ +TILE_2D_MIX(U8, F16, 3, 2, vxc_uchar8, vxc_short8)\n\ +TILE_2D_MIX(U8, F16, 4, 3, vxc_uchar8, vxc_short8)\n\ +TILE_2D_MIX(U8, F16, 5, 4, vxc_uchar8, vxc_short8)\n\ +TILE_2D_MIX(U8, F16, 6, 5, vxc_uchar8, vxc_short8)\n\ +TILE_2D_MIX(U8, F16, 7, 6, vxc_uchar8, vxc_short8)\n\ +TILE_2D_MIX(U8, F16, 0, 7, vxc_uchar8, vxc_short8)\n\ +"; /* end of tile_mix_vx*/ + +static const char upsample_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniF16MulMultipiler_PostShft_2x8;\n\ +_viv_uniform VXC_512Bits uniS16AddOutZP_2x8;\n\ +_viv_uniform vxc_uint4 packed_outputZP;\n\ +\n\ +#define UPSAMPLE_F16_U8TO_U8_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 din0; \\\n\ + vxc_uchar8 din; \\\n\ + vxc_uchar8 axisIn; \\\n\ + vxc_half8 src; \\\n\ + vxc_uchar16 dinExpand; \\\n\ + vxc_uchar16 axisInExpand; \\\n\ + vxc_uchar16 constAxis; \\\n\ + vxc_uchar16 axisData; \\\n\ + vxc_uchar16 axisData1; \\\n\ + vxc_uchar16 dout; \\\n\ + read_fun(din0, dataIn, coord, VXC_5BITOFFSET_XY(0, 
0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_short8 tmp; \\\n\ + uchar zp = 0; \\\n\ + _viv_asm(COPY, src, din0, 16); \\\n\ + VXC_DP2x8(tmp, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniF16MulMultipiler_PostShft_2x8); \\\n\ + vxc_uchar16 packed_outZP; \\\n\ + _viv_asm(COPY, packed_outZP, packed_outputZP, 16); \\\n\ + VXC_DP2x8(din, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniS16AddOutZP_2x8); \\\n\ + constAxis = (vxc_uchar16)(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); \\\n\ + dinExpand = din.s0011223344556677; \\\n\ + axisInExpand = axisIn.s0011223344556677; \\\n\ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + axisData &= (vxc_uchar16)(1); \\\n\ + _viv_asm(COPY, axisData1, axisData, 16); \\\n\ + dout = axisData1 * dinExpand; \\\n\ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + constAxis = (vxc_uchar16)(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3); \\\n\ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + axisData &= (vxc_uchar16)(1); \\\n\ + _viv_asm(COPY, axisData1, axisData, 16); \\\n\ + dout = axisData1 * dinExpand; \\\n\ + coordOut.y += 1; \\\n\ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void upsample_F16_U8to_U8\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0);\n\ + UPSAMPLE_F16_U8TO_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void upsample_F16_U8to_U8_2D\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x << 1, coord.y << 1);\n\ + UPSAMPLE_F16_U8TO_U8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits shortMulShort_8x8;\n\ +_viv_uniform VXC_512Bits uniConvertFstFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform int upOutput_ZP;\n\ +_viv_uniform float upOutput_Scale;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +\n\ +#define UPSAMPLE_F16_I16TO_U8_PROCESS(read_fun, write_fun) \\\n\ + vxc_short4 din; \\\n\ + vxc_short4 axisIn; \\\n\ + vxc_short8 dinExp, axisInExp, constAxis,axisData,tmpout; \\\n\ + vxc_half8 dout; \\\n\ + vxc_float4 tmpVal1, tmpVal2, convZp; \\\n\ + vxc_int4 tmpData1, tmpData2, tmpData3; \\\n\ + vxc_uchar8 result; \\\n\ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + dinExp = din.s00112233; \\\n\ + axisInExp = axisIn.s00112233; \\\n\ + constAxis = (vxc_short8)(0, 1, 0, 1, 0, 1, 0, 1); \\\n\ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axisData &= (vxc_short8)(1); \\\n\ + VXC_DP2x8(tmpout, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), shortMulShort_8x8); \\\n\ + _viv_asm(COPY, dout, tmpout, 16); \\\n\ + VXC_DP4x4(tmpVal1, dout, dout, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstFp16Fp32_4x4); \\\n\ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecFp16Fp32_4x4); \\\n\ + tmpVal1 /= upOutput_Scale; \\\n\ + tmpVal2 /= upOutput_Scale; \\\n\ + tmpData3 = isnotequal(tmpVal1, 0); \\\n\ + tmpData3 *= (-upOutput_ZP); \\\n\ + convZp = convert_float4_rtp(tmpData3); \\\n\ + tmpVal1 += convZp; \\\n\ + tmpData3 = isnotequal(tmpVal2, 0); \\\n\ + tmpData3 *= (-upOutput_ZP); \\\n\ + convZp = convert_float4_rtp(tmpData3); \\\n\ + tmpVal2 += convZp; \\\n\ + tmpData1 = convert_int4_rte(tmpVal1); \\\n\ + tmpData2 = convert_int4_rte(tmpVal2); \\\n\ + VXC_DP2x8(result, tmpData1, tmpData2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + constAxis = (vxc_short8)(2, 3, 2, 3, 2, 3, 2, 3); \\\n\ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axisData &= (vxc_short8)(1); \\\n\ + VXC_DP2x8(tmpout, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), shortMulShort_8x8); \\\n\ + _viv_asm(COPY, dout, tmpout, 16); \\\n\ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstFp16Fp32_4x4); \\\n\ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecFp16Fp32_4x4); \\\n\ + tmpVal1 /= upOutput_Scale; \\\n\ + tmpVal2 /= upOutput_Scale; \\\n\ + tmpData3 = isnotequal(tmpVal1, 0); \\\n\ + tmpData3 *= (-upOutput_ZP); \\\n\ + convZp = convert_float4_rtp(tmpData3); \\\n\ + tmpVal1 += convZp; \\\n\ + tmpData3 = isnotequal(tmpVal2, 0); \\\n\ + tmpData3 *= (-upOutput_ZP); \\\n\ + convZp = convert_float4_rtp(tmpData3); \\\n\ + tmpVal2 += convZp; \\\n\ + tmpData1 = convert_int4_rte(tmpVal1); \\\n\ + tmpData2 = convert_int4_rte(tmpVal2); \\\n\ + VXC_DP2x8(result, tmpData1, tmpData2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + coordOut.y += 1; \\\n\ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void upsample_F16_I16to_U8\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0);\n\ + UPSAMPLE_F16_I16TO_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void upsample_F16_I16to_U8_2D\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x << 1, coord.y << 1);\n\ + UPSAMPLE_F16_I16TO_U8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +_viv_uniform float scaleOut;\n\ +_viv_uniform float outputZp;\n\ +_viv_uniform VXC_512Bits ucharMulShort_8x8_2;\n\ +\n\ +#define UPSAMPLE_F16_U8TO_I8_PROCESS(read_fun, write_fun) \\\n\ + vxc_short4 din; \\\n\ + vxc_uchar4 axisIn; \\\n\ + vxc_short8 dinExp, tmpOut; \\\n\ + vxc_uchar8 axisInExp; \\\n\ + vxc_uchar8 constAxis; \\\n\ + vxc_uchar8 axisData; \\\n\ + vxc_half8 dout; \\\n\ + vxc_float4 tmpVal0, tmpVal1; \\\n\ + vxc_char8 result; \\\n\ + int4 tmpData1, tmpData2; \\\n\ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0), 
\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + dinExp = din.s00112233; \\\n\ + axisInExp = axisIn.s00112233; \\\n\ + constAxis = (vxc_uchar8)(0, 1, 0, 1, 0, 1, 0, 1); \\\n\ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axisData &= (vxc_uchar8)(1); \\\n\ + VXC_DP2x8(tmpOut, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_8x8_2); \\\n\ + _viv_asm(COPY, dout, tmpOut, 16); \\\n\ + VXC_DP4x4(tmpVal0, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertFstFp16Fp32_4x4); \\\n\ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertSecFp16Fp32_4x4); \\\n\ + tmpVal0 = tmpVal0 * scaleOut + outputZp; \\\n\ + tmpVal1 = tmpVal1 * scaleOut + outputZp; \\\n\ + tmpData1 = convert_int4_rte(tmpVal0); \\\n\ + tmpData2 = convert_int4_rte(tmpVal1); \\\n\ + VXC_DP2x8(result, tmpData1, tmpData2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + constAxis = (vxc_uchar8)(2, 3, 2, 3, 2, 3, 2, 3); \\\n\ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axisData &= (vxc_uchar8)(1); \\\n\ + VXC_DP2x8(tmpOut, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_8x8_2); \\\n\ + coordOut.y += 1; \\\n\ + _viv_asm(COPY, dout, tmpOut, 16); \\\n\ + VXC_DP4x4(tmpVal0, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertFstFp16Fp32_4x4); \\\n\ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertSecFp16Fp32_4x4); \\\n\ + tmpVal0 = tmpVal0 * scaleOut + outputZp; \\\n\ + tmpVal1 = tmpVal1 * scaleOut + outputZp; \\\n\ + tmpData1 = convert_int4_rte(tmpVal0); \\\n\ + tmpData2 = convert_int4_rte(tmpVal1); \\\n\ + VXC_DP2x8(result, tmpData1, tmpData2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void upsample_F16_U8to_I8\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0);\n\ + UPSAMPLE_F16_U8TO_I8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void upsample_F16_U8to_I8_2D\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x << 1, coord.y << 1);\n\ + UPSAMPLE_F16_U8TO_I8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +_viv_uniform float up_outFlScale_i16;\n\ +\n\ +#define UPSAMPLE_F16_U8TO_I16_PROCESS(read_fun, write_fun) \\\n\ + vxc_short4 din; \\\n\ + vxc_uchar4 axisIn; \\\n\ + vxc_short8 dinExp, tmpOut; \\\n\ + vxc_uchar8 axisInExp; \\\n\ + vxc_uchar8 constAxis; \\\n\ + vxc_uchar8 axisData; \\\n\ + half8 dout; \\\n\ + float4 tmpVal1, tmpVal2; \\\n\ + int4 tmpData1, tmpData2; \\\n\ + vxc_short8 result; \\\n\ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + dinExp = din.s00112233; \\\n\ + axisInExp = axisIn.s00112233; 
\\\n\ + constAxis = (vxc_uchar8)(0, 1, 0, 1, 0, 1, 0, 1); \\\n\ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axisData &= (vxc_uchar8)(1); \\\n\ + VXC_DP2x8(tmpOut, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_8x8_2); \\\n\ + _viv_asm(COPY, dout, tmpOut, 16); \\\n\ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstFp16Fp32_4x4); \\\n\ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecFp16Fp32_4x4); \\\n\ + tmpVal1 *= up_outFlScale_i16; \\\n\ + tmpVal2 *= up_outFlScale_i16; \\\n\ + tmpData1 = convert_int4_rte(tmpVal1); \\\n\ + tmpData2 = convert_int4_rte(tmpVal2); \\\n\ + VXC_DP2x8(result, tmpData1, tmpData2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + constAxis = (vxc_uchar8)(2, 3, 2, 3, 2, 3, 2, 3); \\\n\ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axisData &= (vxc_uchar8)(1); \\\n\ + VXC_DP2x8(tmpOut, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_8x8_2); \\\n\ + coordOut.y += 1; \\\n\ + _viv_asm(COPY, dout, tmpOut, 16); \\\n\ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstFp16Fp32_4x4); \\\n\ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecFp16Fp32_4x4); \\\n\ + tmpVal1 *= up_outFlScale_i16; \\\n\ + tmpVal2 *= up_outFlScale_i16; \\\n\ + tmpData1 = convert_int4_rte(tmpVal1); \\\n\ + tmpData2 = convert_int4_rte(tmpVal2); \\\n\ + VXC_DP2x8(result, tmpData1, tmpData2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void upsample_F16_U8to_I16\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0);\n\ + UPSAMPLE_F16_U8TO_I16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void upsample_F16_U8to_I16_2D\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x << 1, coord.y << 1);\n\ + UPSAMPLE_F16_U8TO_I16_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +"; /* end of upsample_F16_vx*/ + +static const char upsample_I16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +//--------------------------unpooling-------------------------\n\ +_viv_uniform VXC_512Bits uniQuantInOutInt16_2x8;\n\ +_viv_uniform VXC_512Bits ucharMulShort_2x8;\n\ +\n\ +#define UPSAMPLE_I16_U8TO_I16_SAME_PROCESS(read_fun, write_fun) \\\n\ + vxc_short4 din; \\\n\ + vxc_uchar4 axisIn; \\\n\ + vxc_short8 dinExp; \\\n\ + vxc_uchar8 axisInExp; \\\n\ + vxc_uchar8 constAxis; \\\n\ + vxc_uchar8 axisData; \\\n\ + vxc_short8 axisData_short; \\\n\ + vxc_short8 dout; \\\n\ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + dinExp = din.s00112233; \\\n\ + axisInExp = axisIn.s00112233; 
\\\n\ + constAxis = (vxc_uchar8)(0, 1, 0, 1, 0, 1, 0, 1); \\\n\ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axisData &= (vxc_uchar8)(1); \\\n\ + VXC_DP2x8(dout, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_2x8); \\\n\ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + constAxis = (vxc_uchar8)(2, 3, 2, 3, 2, 3, 2, 3); \\\n\ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axisData &= (vxc_uchar8)(1); \\\n\ + VXC_DP2x8(dout, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_2x8); \\\n\ + coordOut.y += 1; \\\n\ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void upsample_I16_U8to_I16_SAME\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0);\n\ + UPSAMPLE_I16_U8TO_I16_SAME_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void upsample_I16_U8to_I16_SAME_2D\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x << 1, coord.y << 1);\n\ + UPSAMPLE_I16_U8TO_I16_SAME_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +#define UPSAMPLE_I16_TO_I16_PROCESS(axis_type, axis_in_type, read_fun, write_fun) \\\n\ + vxc_short4 din; \\\n\ + axis_in_type axisIn; \\\n\ + vxc_short8 dinExp; \\\n\ + axis_type axisInExp; \\\n\ + axis_type constAxis; \\\n\ + axis_type axisData; \\\n\ + vxc_short8 axisData_short; \\\n\ + vxc_short8 dout; \\\n\ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + dinExp = din.s00112233; \\\n\ + axisInExp = axisIn.s00112233; \\\n\ + constAxis = (axis_type)(0, 1, 0, 1, 0, 1, 0, 1); \\\n\ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axisData &= (axis_type)(1); \\\n\ + VXC_DP2x8(dout, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_2x8); \\\n\ + VXC_DP2x8(dout, dout, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantInOutInt16_2x8); \\\n\ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + constAxis = (axis_type)(2, 3, 2, 3, 2, 3, 2, 3); \\\n\ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axisData &= (axis_type)(1); \\\n\ + VXC_DP2x8(dout, axisData, dinExp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), ucharMulShort_2x8); \\\n\ + VXC_DP2x8(dout, dout, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantInOutInt16_2x8); \\\n\ + coordOut.y += 1; \\\n\ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void upsample_I16_U8to_I16\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0);\n\ + UPSAMPLE_I16_TO_I16_PROCESS(vxc_uchar8, vxc_uchar4, VXC_ReadImage2DArray, 
VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void upsample_I16_U8to_I16_2D\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x << 1, coord.y << 1);\n\ + UPSAMPLE_I16_TO_I16_PROCESS(vxc_uchar8, vxc_uchar4, VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +\n\ +__kernel void upsample_I16_I16to_I16\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0);\n\ + UPSAMPLE_I16_TO_I16_PROCESS(vxc_short8, vxc_short4, VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void upsample_I16_I16to_I16_2D\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x << 1, coord.y << 1);\n\ + UPSAMPLE_I16_TO_I16_PROCESS(vxc_short8, vxc_short4, VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertDirInt16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertU8toI16_2x8;\n\ +_viv_uniform float inScaleInt16;\n\ +\n\ +#define UPSAMPLE_I16_TO_F16_PROCESS(axis_type, axis_in_type, read_fun, write_fun) \\\n\ + vxc_short8 din; \\\n\ + axis_in_type axisIn; \\\n\ + vxc_short8 dinExp; \\\n\ + axis_type axisInExp; \\\n\ + axis_type constAxis; \\\n\ + axis_type axisData; \\\n\ + vxc_short8 axisData_short; \\\n\ + vxc_short8 dout; \\\n\ + vxc_float4 tmpVal0, tmpVal1; \\\n\ + half4 tmpOut0; \\\n\ + vxc_short8 tmpOut1; \\\n\ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmpVal0, din, din, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertDirInt16Fp32_4x4); \\\n\ + tmpVal1 = tmpVal0 * inScaleInt16; \\\n\ + _viv_asm(CONV, tmpOut0, tmpVal1); \\\n\ + _viv_asm(COPY, tmpOut1, tmpOut0, 16); \\\n\ + dinExp = tmpOut1.s00224466; \\\n\ + axisInExp = axisIn.s00112233; \\\n\ + constAxis = (axis_type)(0, 1, 0, 1, 0, 1, 0, 1); \\\n\ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axisData &= (axis_type)(1); \\\n\ + VXC_DP2x8(axisData_short, axisData, axisData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertU8toI16_2x8); \\\n\ + dout = axisData_short == 1 ? dinExp : 0; \\\n\ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + constAxis = (axis_type)(2, 3, 2, 3, 2, 3, 2, 3); \\\n\ + VXC_Clamp(axisData, axisInExp, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + axisData &= (axis_type)(1); \\\n\ + VXC_DP2x8(axisData_short, axisData, axisData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertU8toI16_2x8); \\\n\ + dout = axisData_short == 1 ? 
dinExp : 0; \\\n\ + coordOut.y += 1; \\\n\ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void upsample_I16_I16to_F16\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0);\n\ + UPSAMPLE_I16_TO_F16_PROCESS(vxc_short8, vxc_short4, VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void upsample_I16_I16to_F16_2D\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x << 1, coord.y << 1);\n\ + UPSAMPLE_I16_TO_F16_PROCESS(vxc_short8, vxc_short4, VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +\n\ +__kernel void upsample_I16_U8to_F16\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0);\n\ + UPSAMPLE_I16_TO_F16_PROCESS(vxc_uchar8, vxc_uchar4, VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void upsample_I16_U8to_F16_2D\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x << 1, coord.y << 1);\n\ + UPSAMPLE_I16_TO_F16_PROCESS(vxc_uchar8, vxc_uchar4, VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +"; /* end of upsample_I16_vx*/ + +static const char upsample_I8_vx[] = "\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int input_ZP;\n\ +\n\ +#define UPSAMPLE_I8_U8TO_I8_SAME_PROCESS(read_fun, write_fun) \\\n\ + vxc_char8 din; \\\n\ + vxc_uchar8 axisIn; \\\n\ + vxc_char16 dinExpand; \\\n\ + vxc_uchar16 axisInExpand; \\\n\ + vxc_uchar16 constAxis; \\\n\ + vxc_uchar16 axisData; \\\n\ + vxc_char16 zpValue; \\\n\ + vxc_char16 dout; \\\n\ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + dinExpand = din.s0011223344556677; \\\n\ + axisInExpand = axisIn.s0011223344556677; \\\n\ + zpValue = (char)input_ZP; \\\n\ + constAxis = (vxc_uchar16)(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); \\\n\ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + dout = axisData ? dinExpand : zpValue; \\\n\ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + constAxis = (vxc_uchar16)(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3); \\\n\ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + dout = axisData ? 
dinExpand : zpValue; \\\n\ + coordOut.y += 1; \\\n\ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void upsample_I8_U8to_I8_SAME\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0);\n\ + UPSAMPLE_I8_U8TO_I8_SAME_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void upsample_I8_U8to_I8_SAME_2D\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x << 1, coord.y << 1);\n\ + UPSAMPLE_I8_U8TO_I8_SAME_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniU8SubZP_MulM_2x8;\n\ +_viv_uniform VXC_512Bits uniU8SubZP_MulM_Hi_2x8;\n\ +_viv_uniform VXC_512Bits uniS16AddOutZP_2x8;\n\ +_viv_uniform VXC_512Bits uniS16MoveValue_2x8;\n\ +_viv_uniform vxc_uint4 packed_outputZP;\n\ +\n\ +#define UPSAMPLE_I8_U8TO_I8_PROCESS(read_fun, write_fun) \\\n\ + vxc_char8 din; \\\n\ + vxc_uchar8 axisIn; \\\n\ + vxc_char16 dinExpand; \\\n\ + vxc_uchar16 axisInExpand; \\\n\ + vxc_uchar16 constAxis; \\\n\ + vxc_uchar16 axisData; \\\n\ + vxc_char16 zpValue; \\\n\ + vxc_char16 dout; \\\n\ + vxc_char16 result, result_tmp; \\\n\ + zpValue = (char)input_ZP; \\\n\ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + dinExpand = din.s0011223344556677; \\\n\ + axisInExpand = axisIn.s0011223344556677; \\\n\ + constAxis = (vxc_uchar16)(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); \\\n\ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + dout = axisData ? dinExpand : zpValue; \\\n\ + vxc_short8 tmp; \\\n\ + short zp = input_ZP; \\\n\ + vxc_short8 packed_outZP; \\\n\ + _viv_asm(COPY, packed_outZP, packed_outputZP, 16); \\\n\ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8SubZP_MulM_2x8); \\\n\ + VXC_DP2x8(result, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniS16AddOutZP_2x8); \\\n\ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8SubZP_MulM_Hi_2x8); \\\n\ + VXC_DP2x8(result_tmp, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniS16AddOutZP_2x8); \\\n\ + VXC_DP2x8(result, result_tmp, result_tmp, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniS16MoveValue_2x8); \\\n\ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + constAxis = (vxc_uchar16)(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3); \\\n\ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + dout = axisData ? 
dinExpand : zpValue; \\\n\ + coordOut.y += 1; \\\n\ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8SubZP_MulM_2x8); \\\n\ + VXC_DP2x8(result, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniS16AddOutZP_2x8); \\\n\ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8SubZP_MulM_Hi_2x8); \\\n\ + VXC_DP2x8(result_tmp, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniS16AddOutZP_2x8); \\\n\ + VXC_DP2x8(result, result_tmp, result_tmp, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniS16MoveValue_2x8); \\\n\ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void upsample_I8_U8to_I8\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0);\n\ + UPSAMPLE_I8_U8TO_I8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void upsample_I8_U8to_I8_2D\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x << 1, coord.y << 1);\n\ + UPSAMPLE_I8_U8TO_I8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertDirUint8Fp32_4x4_2;\n\ +_viv_uniform VXC_512Bits uniConvertEndUint8Fp32_4x4_2;\n\ +_viv_uniform VXC_512Bits uniConvertTrdUint8Fp32_4x4_2;\n\ +_viv_uniform VXC_512Bits uniConvertFthUint8Fp32_4x4_2;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8_2;\n\ +_viv_uniform float scaleIn;\n\ +_viv_uniform float inputTail;\n\ +\n\ +#define UPSAMPLE_I8_U8TO_F16_PROCESS(read_fun, write_fun) \\\n\ + vxc_char8 din; \\\n\ + vxc_uchar8 axisIn; \\\n\ + vxc_char16 dinExpand; \\\n\ + vxc_uchar16 axisInExpand; \\\n\ + vxc_uchar16 constAxis; \\\n\ + vxc_uchar16 axisData; \\\n\ + vxc_char16 zpValue; \\\n\ + vxc_char16 dout; \\\n\ + zpValue = (char)input_ZP; \\\n\ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coordOut1.x += 8; \\\n\ + dinExpand = din.s0011223344556677; \\\n\ + axisInExpand = axisIn.s0011223344556677; \\\n\ + constAxis = (vxc_uchar16)(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); \\\n\ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + dout = axisData ? 
dinExpand : zpValue; \\\n\ + vxc_float4 tmpVal0, tmpVal1, tmpVal2, tmpVal3; \\\n\ + half4 tmpOut0, tmpOut1; \\\n\ + vxc_short8 rout0, rout1; \\\n\ + VXC_DP4x4(tmpVal0, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertDirUint8Fp32_4x4_2); \\\n\ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertEndUint8Fp32_4x4_2); \\\n\ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertTrdUint8Fp32_4x4_2); \\\n\ + VXC_DP4x4(tmpVal3, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertFthUint8Fp32_4x4_2); \\\n\ + tmpVal0 = tmpVal0 * scaleIn + inputTail; \\\n\ + tmpVal1 = tmpVal1 * scaleIn + inputTail; \\\n\ + tmpVal2 = tmpVal2 * scaleIn + inputTail; \\\n\ + tmpVal3 = tmpVal3 * scaleIn + inputTail; \\\n\ + _viv_asm(CONV, tmpOut0, tmpVal0); \\\n\ + _viv_asm(CONV, tmpOut1, tmpVal1); \\\n\ + VXC_DP2x8(rout0, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertInt32toUint8_2x8_2); \\\n\ + _viv_asm(CONV, tmpOut0, tmpVal2); \\\n\ + _viv_asm(CONV, tmpOut1, tmpVal3); \\\n\ + VXC_DP2x8(rout1, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertInt32toUint8_2x8_2); \\\n\ + write_fun(dataOut, coordOut, rout0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + write_fun(dataOut, coordOut1, rout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + constAxis = (vxc_uchar16)(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3); \\\n\ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + dout = axisData ? dinExpand : zpValue; \\\n\ + VXC_DP4x4(tmpVal0, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertDirUint8Fp32_4x4_2); \\\n\ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertEndUint8Fp32_4x4_2); \\\n\ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertTrdUint8Fp32_4x4_2); \\\n\ + VXC_DP4x4(tmpVal3, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertFthUint8Fp32_4x4_2); \\\n\ + tmpVal0 = tmpVal0 * scaleIn + inputTail; \\\n\ + tmpVal1 = tmpVal1 * scaleIn + inputTail; \\\n\ + tmpVal2 = tmpVal2 * scaleIn + inputTail; \\\n\ + tmpVal3 = tmpVal3 * scaleIn + inputTail; \\\n\ + _viv_asm(CONV, tmpOut0, tmpVal0); \\\n\ + _viv_asm(CONV, tmpOut1, tmpVal1); \\\n\ + VXC_DP2x8(rout0, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertInt32toUint8_2x8_2); \\\n\ + _viv_asm(CONV, tmpOut0, tmpVal2); \\\n\ + _viv_asm(CONV, tmpOut1, tmpVal3); \\\n\ + VXC_DP2x8(rout1, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertInt32toUint8_2x8_2); \\\n\ + coordOut.y += 1; \\\n\ + coordOut1.y += 1; \\\n\ + write_fun(dataOut, coordOut, rout0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + write_fun(dataOut, coordOut1, rout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void upsample_I8_U8to_F16\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0);\n\ + int4 coordOut1 = coordOut;\n\ + UPSAMPLE_I8_U8TO_F16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void upsample_I8_U8to_F16_2D\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + 
image2d_array_t dataOut\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x << 1, coord.y << 1);\n\ + int2 coordOut1 = coordOut;\n\ + UPSAMPLE_I8_U8TO_F16_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +"; /* end of upsample_I8_vx*/ + +static const char upsample_U8_vx[] = "\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int input_ZP;\n\ +\n\ +#define UPSAMPLE_U8_U8TO_U8_SAME_PROCESS(read_fun, write_fun) \\\n\ + vxc_uchar8 din; \\\n\ + vxc_uchar8 axisIn; \\\n\ + vxc_uchar16 dinExpand; \\\n\ + vxc_uchar16 axisInExpand; \\\n\ + vxc_uchar16 constAxis; \\\n\ + vxc_uchar16 axisData; \\\n\ + vxc_uchar16 zpValue; \\\n\ + vxc_uchar16 dout; \\\n\ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + dinExpand = din.s0011223344556677; \\\n\ + axisInExpand = axisIn.s0011223344556677; \\\n\ + zpValue = (uchar)input_ZP; \\\n\ + constAxis = (vxc_uchar16)(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); \\\n\ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + dout = axisData ? dinExpand : zpValue; \\\n\ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + constAxis = (vxc_uchar16)(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3); \\\n\ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + dout = axisData ? dinExpand : zpValue; \\\n\ + coordOut.y += 1; \\\n\ + write_fun(dataOut, coordOut, dout, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void upsample_U8_U8to_U8_SAME\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0);\n\ + UPSAMPLE_U8_U8TO_U8_SAME_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void upsample_U8_U8to_U8_SAME_2D\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x << 1, coord.y << 1);\n\ + UPSAMPLE_U8_U8TO_U8_SAME_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniU8SubZP_MulM_2x8;\n\ +_viv_uniform VXC_512Bits uniU8SubZP_MulM_Hi_2x8;\n\ +_viv_uniform VXC_512Bits uniS16AddOutZP_2x8;\n\ +_viv_uniform VXC_512Bits uniS16MoveValue_2x8;\n\ +_viv_uniform vxc_uint4 packed_outputZP;\n\ +\n\ +#define UPSAMPLE_U8_U8TO_U8_PROCESS(read_fun, write_fun) \\\n\ + vxc_uchar8 din; \\\n\ + vxc_uchar8 axisIn; \\\n\ + vxc_uchar16 dinExpand; \\\n\ + vxc_uchar16 axisInExpand; \\\n\ + vxc_uchar16 constAxis; \\\n\ + vxc_uchar16 axisData; \\\n\ + vxc_uchar16 zpValue; \\\n\ + vxc_uchar16 dout; \\\n\ + vxc_uchar16 result, result_tmp; \\\n\ + zpValue = (uchar)input_ZP; \\\n\ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + dinExpand = din.s0011223344556677; \\\n\ + axisInExpand = axisIn.s0011223344556677; \\\n\ + constAxis = (vxc_uchar16)(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); \\\n\ + VXC_Clamp(axisData, axisInExpand, 
constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + dout = axisData ? dinExpand : zpValue; \\\n\ + vxc_short8 tmp; \\\n\ + short zp = input_ZP; \\\n\ + vxc_short8 packed_outZP; \\\n\ + _viv_asm(COPY, packed_outZP, packed_outputZP, 16); \\\n\ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8SubZP_MulM_2x8); \\\n\ + VXC_DP2x8(result, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniS16AddOutZP_2x8); \\\n\ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8SubZP_MulM_Hi_2x8); \\\n\ + VXC_DP2x8(result_tmp, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniS16AddOutZP_2x8); \\\n\ + VXC_DP2x8(result, result_tmp, result_tmp, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniS16MoveValue_2x8); \\\n\ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + constAxis = (vxc_uchar16)(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3); \\\n\ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + dout = axisData ? dinExpand : zpValue; \\\n\ + coordOut.y += 1; \\\n\ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8SubZP_MulM_2x8); \\\n\ + VXC_DP2x8(result, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniS16AddOutZP_2x8); \\\n\ + VXC_DP2x8(tmp, dout, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8SubZP_MulM_Hi_2x8); \\\n\ + VXC_DP2x8(result_tmp, tmp, packed_outZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniS16AddOutZP_2x8); \\\n\ + VXC_DP2x8(result, result_tmp, result_tmp, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniS16MoveValue_2x8); \\\n\ + write_fun(dataOut, coordOut, result, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void upsample_U8_U8to_U8\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0);\n\ + UPSAMPLE_U8_U8TO_U8_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void upsample_U8_U8to_U8_2D\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x << 1, coord.y << 1);\n\ + UPSAMPLE_U8_U8TO_U8_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniMulMinusZpUint8_4x4;\n\ +_viv_uniform VXC_512Bits uniMulMinusZp2Uint8_4x4;\n\ +_viv_uniform VXC_512Bits uniMulMinusZp3Uint8_4x4;\n\ +_viv_uniform VXC_512Bits uniMulMinusZp4Uint8_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertDirUint8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertEndUint8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertTrdUint8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertFthUint8Fp32_4x4;\n\ +_viv_uniform float scaleU8Fp16;\n\ +_viv_uniform int zpU8Fp16;\n\ +\n\ +#define UPSAMPLE_U8_U8TO_F16_PROCESS(read_fun, write_fun) \\\n\ + vxc_uchar8 din; \\\n\ + vxc_uchar8 axisIn; \\\n\ + vxc_uchar16 dinExpand; \\\n\ + vxc_uchar16 axisInExpand; \\\n\ + vxc_uchar16 constAxis; \\\n\ + vxc_uchar16 axisData; \\\n\ + vxc_uchar16 axisData1; \\\n\ + vxc_uchar16 dout; \\\n\ + read_fun(din, dataIn, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(axisIn, axis, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coordOut1.x += 8; \\\n\ + dinExpand = din.s0011223344556677; \\\n\ + axisInExpand = axisIn.s0011223344556677; \\\n\ + constAxis = (vxc_uchar16)(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); \\\n\ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + axisData &= (vxc_uchar16)(1); \\\n\ + _viv_asm(COPY, axisData1, axisData, 16); \\\n\ + dout = axisData1 * dinExpand; \\\n\ + vxc_float4 tmpVal0, tmpVal1, tmpVal2, tmpVal3, convZp; \\\n\ + half4 tmpOut0, tmpOut1; \\\n\ + vxc_short8 rout0, rout1; \\\n\ + vxc_int4 tmpV0, tmpV1, tmpV2, tmpV3; \\\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + short tmpZp = (short)(-zpU8Fp16); \\\n\ + VXC_DP4x4(tmpVal0, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertDirUint8Fp32_4x4); \\\n\ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndUint8Fp32_4x4); \\\n\ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertTrdUint8Fp32_4x4); \\\n\ + VXC_DP4x4(tmpVal3, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertFthUint8Fp32_4x4); \\\n\ + VXC_DP4x4(tmpV0, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniMulMinusZpUint8_4x4); \\\n\ + VXC_DP4x4(tmpV1, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniMulMinusZp2Uint8_4x4); \\\n\ + VXC_DP4x4(tmpV2, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniMulMinusZp3Uint8_4x4); \\\n\ + VXC_DP4x4(tmpV3, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniMulMinusZp4Uint8_4x4); \\\n\ + _viv_asm(CONV, tmpData0, tmpV0); \\\n\ + _viv_asm(CONV, tmpData1, tmpV1); \\\n\ + _viv_asm(CONV, tmpData2, tmpV2); \\\n\ + _viv_asm(CONV, tmpData3, tmpV3); \\\n\ + tmpVal0 = (tmpVal0 + tmpData0) * scaleU8Fp16; \\\n\ + tmpVal1 = (tmpVal1 + tmpData1) * scaleU8Fp16; \\\n\ + tmpVal2 = (tmpVal2 + tmpData2) * scaleU8Fp16; \\\n\ + tmpVal3 = (tmpVal3 + tmpData3) * scaleU8Fp16; \\\n\ + _viv_asm(CONV, tmpOut0, tmpVal0); \\\n\ + _viv_asm(CONV, tmpOut1, tmpVal1); \\\n\ + VXC_DP2x8(rout0, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt32toInt16_2x8); \\\n\ + _viv_asm(CONV, tmpOut0, tmpVal2); \\\n\ + _viv_asm(CONV, tmpOut1, tmpVal3); \\\n\ + VXC_DP2x8(rout1, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt32toInt16_2x8); \\\n\ + write_fun(dataOut, coordOut, rout0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + write_fun(dataOut, coordOut1, rout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + constAxis = (vxc_uchar16)(2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3); \\\n\ + VXC_Clamp(axisData, axisInExpand, constAxis, constAxis, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + axisData &= (vxc_uchar16)(1); \\\n\ + _viv_asm(COPY, axisData1, axisData, 16); \\\n\ + dout = axisData1 * dinExpand; \\\n\ + VXC_DP4x4(tmpVal0, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertDirUint8Fp32_4x4); \\\n\ + VXC_DP4x4(tmpVal1, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndUint8Fp32_4x4); \\\n\ + VXC_DP4x4(tmpVal2, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertTrdUint8Fp32_4x4); \\\n\ + VXC_DP4x4(tmpVal3, dout, dout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ 
+ uniConvertFthUint8Fp32_4x4); \\\n\ + VXC_DP4x4(tmpV0, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniMulMinusZpUint8_4x4); \\\n\ + VXC_DP4x4(tmpV1, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniMulMinusZp2Uint8_4x4); \\\n\ + VXC_DP4x4(tmpV2, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniMulMinusZp3Uint8_4x4); \\\n\ + VXC_DP4x4(tmpV3, axisData1, tmpZp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniMulMinusZp4Uint8_4x4); \\\n\ + _viv_asm(CONV, tmpData0, tmpV0); \\\n\ + _viv_asm(CONV, tmpData1, tmpV1); \\\n\ + _viv_asm(CONV, tmpData2, tmpV2); \\\n\ + _viv_asm(CONV, tmpData3, tmpV3); \\\n\ + tmpVal0 = (tmpVal0 + tmpData0) * scaleU8Fp16; \\\n\ + tmpVal1 = (tmpVal1 + tmpData1) * scaleU8Fp16; \\\n\ + tmpVal2 = (tmpVal2 + tmpData2) * scaleU8Fp16; \\\n\ + tmpVal3 = (tmpVal3 + tmpData3) * scaleU8Fp16; \\\n\ + _viv_asm(CONV, tmpOut0, tmpVal0); \\\n\ + _viv_asm(CONV, tmpOut1, tmpVal1); \\\n\ + VXC_DP2x8(rout0, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt32toInt16_2x8); \\\n\ + _viv_asm(CONV, tmpOut0, tmpVal2); \\\n\ + _viv_asm(CONV, tmpOut1, tmpVal3); \\\n\ + VXC_DP2x8(rout1, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt32toInt16_2x8); \\\n\ + coordOut.y += 1; \\\n\ + coordOut1.y += 1; \\\n\ + write_fun(dataOut, coordOut, rout0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + write_fun(dataOut, coordOut1, rout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void upsample_U8_U8to_F16\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coordOut = (int4)(coord.x << 1, coord.y << 1, coord.z, 0);\n\ + int4 coordOut1 = coordOut;\n\ + UPSAMPLE_U8_U8TO_F16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +\n\ +__kernel void upsample_U8_U8to_F16_2D\n\ + (\n\ + image2d_array_t dataIn,\n\ + image2d_array_t axis,\n\ + image2d_array_t dataOut\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coordOut = (int2)(coord.x << 1, coord.y << 1);\n\ + int2 coordOut1 = coordOut;\n\ + UPSAMPLE_U8_U8TO_F16_PROCESS(VXC_ReadImage, VXC_WriteImage)\n\ +}\n\ +"; /* end of upsample_U8_vx*/ + +static const char vsi_nn_kernel_axis_aligned_bbox_transform_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void vxcAxis_aligned_bbox_transform(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output)\n\ +{\n\ +\n\ +}\n\ +"; /* end of vsi_nn_kernel_axis_aligned_bbox_transform_vx*/ + +static const char vsi_nn_kernel_box_with_nms_limit_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void vxcBox_with_nms_limit(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output)\n\ +{\n\ +\n\ +}\n\ +"; /* end of vsi_nn_kernel_box_with_nms_limit_vx*/ + +static const char vsi_nn_kernel_crop_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +//-----------------------------------------------tensor crop-------------------------------\n\ +__kernel void vxcTensorCrop_Int16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int offset0,\n\ + int offset1,\n\ + int offset2)\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_ushort8 src0, src1, src2, src3;\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\\\n\ + 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\\\n\ + - offset1, get_global_id(2) - offset2, 0);\n\ +\n\ + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y ++;\n\ + VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y ++;\n\ + VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y ++;\n\ + VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void vxcTensorCrop_Int8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int offset0,\n\ + int offset1,\n\ + int offset2)\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 src0, src1, src2, src3;\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1) - offset1,\\\n\ + get_global_id(2) - offset2, 0);\n\ +\n\ + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y ++;\n\ + VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y ++;\n\ + VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y ++;\n\ + VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt16toFp16_2x8;\n\ +\n\ +__kernel void vxcTensorCrop_Int16_Fp16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int offset0,\n\ + int offset1,\n\ + int offset2)\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_short8 src0, src1, src2, src3;\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + int4 coord_out = (int4)(get_global_id(0) - offset0, get_global_id(1)\\\n\ + - offset1, get_global_id(2) - offset2, 0);\n\ +\n\ + vxc_half8 dst0, dst1, dst2, dst3;\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt16toFp16_2x8);\n\ + VXC_DP2x8(dst1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt16toFp16_2x8);\n\ + VXC_DP2x8(dst2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt16toFp16_2x8);\n\ + VXC_DP2x8(dst3, src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt16toFp16_2x8);\n\ +\n\ + vxc_short8 out0, out1, out2, out3;\n\ + _viv_asm(COPY, out0, dst0, 16);\n\ + _viv_asm(COPY, out1, dst1, 16);\n\ + _viv_asm(COPY, out2, dst2, 16);\n\ + _viv_asm(COPY, out3, dst3, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord_out, out0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y ++;\n\ + VXC_WriteImage2DArray(output, coord_out, out1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y ++;\n\ + VXC_WriteImage2DArray(output, coord_out, out2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y ++;\n\ + VXC_WriteImage2DArray(output, coord_out, out3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of vsi_nn_kernel_crop_vx*/ + +static const char vsi_nn_kernel_detection_postprocess_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void vxcDetection_postprocess(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output)\n\ +{\n\ +\n\ +}\n\ +"; /* end of vsi_nn_kernel_detection_postprocess_vx*/ + +static const char vsi_nn_kernel_extra_ending_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void vxcExtra_ending_i16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 data;\n\ + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void vxcExtra_ending_i8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_char8 data;\n\ + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void vxcExtra_ending_u8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar8 data;\n\ + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of vsi_nn_kernel_extra_ending_vx*/ + +static const char vsi_nn_kernel_fullconnect2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int loopNum;\n\ +_viv_uniform VXC_512Bits uniMulAcc_16x1;\n\ +__kernel void vsi_nn_kernel_fullconnect2(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t weight,\n\ + __read_only image2d_array_t bias,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int4 coord_in = (int4)(16, get_global_id(0), get_global_id(1), 0);\n\ + int2 coord_out = 
(int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 v0, v1, v2, v3, v4, v5, v6, v7;\n\ + vxc_half8 i0, i1, i2, i3;\n\ + vxc_half8 w0, w1, w2, w3;\n\ + float4 sum = 0;\n\ + float dst = 0;\n\ + dst = read_imagef(bias, coord_in.ywww).x;\n\ + do\n\ + {\n\ + VXC_ReadImage(v0, input, coord_in.xz, VXC_5BITOFFSET_XY(-16, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, i0, v0, 16);\n\ + VXC_ReadImage(v1, weight, coord_in.xy, VXC_5BITOFFSET_XY(-16, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, w0, v1, 16);\n\ + VXC_ReadImage(v2, input, coord_in.xz, VXC_5BITOFFSET_XY(-8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, i1, v2, 16);\n\ + VXC_ReadImage(v3, weight, coord_in.xy, VXC_5BITOFFSET_XY(-8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, w1, v3, 16);\n\ + VXC_ReadImage(v4, input, coord_in.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, i2, v4, 16);\n\ + VXC_ReadImage(v5, weight, coord_in.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, w2, v5, 16);\n\ + VXC_ReadImage(v6, input, coord_in.xz, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, i3, v6, 16);\n\ + VXC_ReadImage(v7, weight, coord_in.xy, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, w3, v7, 16);\n\ +\n\ + coord_in.x += 32;\n\ +\n\ + VXC_DP16x1(sum, i0, w0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);\n\ + VXC_DP16x1(sum, i1, w1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);\n\ + VXC_DP16x1(sum, i2, w2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);\n\ + VXC_DP16x1(sum, i3, w3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), uniMulAcc_16x1);\n\ +\n\ + float4 tmp = {1, 1, 1, 1};\n\ + dst = dst + dot(sum, tmp);\n\ +\n\ + } while (coord_in.x < loopNum);\n\ +\n\ + vxc_half v;\n\ + _viv_asm(CONV, v, dst);\n\ + _viv_asm(COPY, v0, v, 16);\n\ + VXC_WriteImage(output, coord_out.xy, v0, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of vsi_nn_kernel_fullconnect2_vx*/ + +static const char vsi_nn_kernel_generate_proposals_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void vxcGenerate_proposals(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output)\n\ +{\n\ +\n\ +}\n\ +"; /* end of vsi_nn_kernel_generate_proposals_vx*/ + +static const char vsi_nn_kernel_header_vx[] = "/*\n\ + ============================================================================\n\ + Name : libNNExt.vx\n\ + Author : VSI\n\ + Version :\n\ + Copyright : Your copyright notice\n\ + Description :\n\ + ============================================================================\n\ + */\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#if (VX_VERSION==1)\n\ +#define VXC_DP2x8_b_(dst, src0, src1, src2, info, uniform)\\\n\ +do\\\n\ +{\\\n\ + _viv_asm(COPY, dst, src0, 16); \\\n\ +} while (0)\n\ +\n\ +#define VXC_VertMin3_Integer(dst, src0, src1, src2, info)\\\n\ +do\\\n\ +{\\\n\ + typeof (dst) tmp; \\\n\ + tmp = min(src0, src1);\\\n\ + dst = min(src2, tmp);\\\n\ +} while (0)\n\ +\n\ +#define VXC_VertMin3_Half(dst, src0, src1, src2, info)\\\n\ +do\\\n\ +{\\\n\ + vxc_short8 val0_ver1, val1_ver1, val2_ver1, minVal_ver1, maxVal_ver1;\\\n\ + _viv_asm(COPY, val0_ver1, src0, 16);\\\n\ + _viv_asm(COPY, val1_ver1, src1, 16);\\\n\ + _viv_asm(COPY, val2_ver1, src2, 16);\\\n\ + 
maxVal_ver1 = max(val0_ver1, val1_ver1);\\\n\ + minVal_ver1 = min(val0_ver1, val1_ver1);\\\n\ + minVal_ver1 = maxVal_ver1 < 0 ? maxVal_ver1 : minVal_ver1; \\\n\ + maxVal_ver1 = max(val2_ver1, minVal_ver1);\\\n\ + minVal_ver1 = min(val2_ver1, minVal_ver1);\\\n\ + minVal_ver1 = maxVal_ver1 < 0 ? maxVal_ver1 : minVal_ver1; \\\n\ + _viv_asm(COPY, dst, minVal_ver1, 16); \\\n\ +} while (0)\n\ +\n\ +#define VXC_VertMax3_Integer(dst, src0, src1, src2, info)\\\n\ +do\\\n\ +{\\\n\ + int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ + int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ + int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ + int mod1 = VXC_MODIFIER_CLAMP(startBin, endBin, sourceBin, 0);\\\n\ + typeof (dst) tmp;\\\n\ + tmp = max(src0, src1);\\\n\ + tmp = max(src2, tmp);\\\n\ + VXC_Clamp(dst, tmp, tmp, tmp, mod1);\\\n\ +} while (0)\n\ +\n\ +#define VXC_VertMax3_Half(dst, src0, src1, src2, info)\\\n\ + do\\\n\ + {\\\n\ + vxc_short8 val0_ver1, val1_ver1, val2_ver1, minVal_ver1, maxVal_ver1;\\\n\ + _viv_asm(COPY, val0_ver1, src0, 16);\\\n\ + _viv_asm(COPY, val1_ver1, src1, 16);\\\n\ + _viv_asm(COPY, val2_ver1, src2, 16);\\\n\ + maxVal_ver1 = max(val0_ver1, val1_ver1);\\\n\ + maxVal_ver1 = max(val2_ver1, maxVal_ver1);\\\n\ + minVal_ver1 = min(val0_ver1, val1_ver1);\\\n\ + minVal_ver1 = min(val2_ver1, minVal_ver1);\\\n\ + maxVal_ver1 = maxVal_ver1 >= 0 ? maxVal_ver1 : minVal_ver1;\\\n\ + _viv_asm(COPY, dst, maxVal_ver1, 16); \\\n\ + } while (0)\n\ +\n\ +#define VXC_HorzMax3_Integer(dst, src0, info)\\\n\ +do\\\n\ +{\\\n\ + int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ + int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ + int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ + int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ + int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\\\n\ + VXC_OP4(filter, dst, src0, src0, src0, mod1);\\\n\ +} while (0)\n\ +\n\ +#define VXC_HorzMax3_Half(dst, src0, info)\\\n\ +do\\\n\ +{\\\n\ + int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ + int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ + int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ + int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ + int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\\\n\ + int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\\\n\ + vxc_short8 val0, minVal, maxVal;\\\n\ + _viv_asm(COPY, val0, src0, 16);\\\n\ + VXC_OP4(filter, maxVal, val0, val0, val0, mod1);\\\n\ + VXC_OP4(filter, minVal, val0, val0, val0, mod2);\\\n\ + maxVal = maxVal >= 0 ? 
maxVal : minVal;\\\n\ + _viv_asm(COPY, dst, maxVal, 16);\\\n\ +} while (0)\n\ +\n\ +#define VXC_HorzMin3_Integer(dst, src0, info)\\\n\ +do\\\n\ +{\\\n\ + int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ + int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ + int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ + int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ + int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\\\n\ + VXC_OP4(filter, dst, src0, src0, src0, mod1);\\\n\ +} while (0)\n\ +\n\ +#define VXC_HorzMin3_Half(dst, src0, info)\\\n\ +do\\\n\ +{\\\n\ + int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ + int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ + int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ + int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ + int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\\\n\ + int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\\\n\ + int mod3 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Median, clamp);\\\n\ + vxc_short8 val0, minVal, maxVal, midVal;\\\n\ + _viv_asm(COPY, val0, src0, 16);\\\n\ + VXC_OP4(filter, maxVal, val0, val0, val0, mod1);\\\n\ + VXC_OP4(filter, minVal, val0, val0, val0, mod2);\\\n\ + VXC_OP4(filter, midVal, val0, val0, val0, mod3);\\\n\ + minVal = midVal < 0 ? midVal : minVal;\\\n\ + minVal = maxVal < 0 ? maxVal : minVal;\\\n\ + _viv_asm(COPY, dst, minVal, 16);\\\n\ +} while (0)\n\ +\n\ +#define VXC_Clamp_Half(dst, src0, src1, src2, info)\\\n\ +do\\\n\ +{\\\n\ + typeof (dst) tmp;\\\n\ + VXC_VertMax3_Half(tmp, src0, src0, src1, info);\\\n\ + VXC_VertMin3_Half(dst, tmp, tmp, src2, info);\\\n\ +} while (0)\n\ +\n\ +#else\n\ +#define VXC_DP2x8_b_(dst, src0, src1, src2, info, uniform)\\\n\ +do\\\n\ +{\\\n\ + VXC_DP2x8_b(dst, src0, src1, src2, info, uniform); \\\n\ +} while (0)\n\ +\n\ +#define VXC_VertMin3_Integer(dst, src0, src1, src2, info)\\\n\ + do\\\n\ + {\\\n\ + VXC_VertMin3(dst, src0, src1, src2, info);\\\n\ + } while (0)\n\ +\n\ +#define VXC_VertMin3_Half(dst, src0, src1, src2, info)\\\n\ + do\\\n\ + {\\\n\ + VXC_VertMin3(dst, src0, src1, src2, info);\\\n\ + } while (0)\n\ +\n\ +#define VXC_VertMax3_Integer(dst, src0, src1, src2, info)\\\n\ +do\\\n\ +{\\\n\ + VXC_VertMax3(dst, src0, src1, src2, info);\\\n\ +} while (0)\n\ +\n\ +#define VXC_VertMax3_Half(dst, src0, src1, src2, info)\\\n\ +do\\\n\ +{\\\n\ + VXC_VertMax3(dst, src0, src1, src2, info);\\\n\ +} while (0)\n\ +\n\ +#define VXC_HorzMax3_Integer(dst, src0, info)\\\n\ +do\\\n\ +{\\\n\ + VXC_HorzMax3(dst, src0, info);\\\n\ +} while (0)\n\ +\n\ +#define VXC_HorzMax3_Half(dst, src0, info)\\\n\ +do\\\n\ +{\\\n\ + VXC_HorzMax3(dst, src0, info);\\\n\ +} while (0)\n\ +\n\ +#define VXC_HorzMin3_Integer(dst, src0, info)\\\n\ +do\\\n\ +{\\\n\ + VXC_HorzMin3(dst, src0, info);\\\n\ +} while (0)\n\ +\n\ +#define VXC_HorzMin3_Half(dst, src0, info)\\\n\ +do\\\n\ +{\\\n\ + VXC_HorzMin3(dst, src0, info);\\\n\ +} while (0)\n\ +\n\ +#define VXC_Clamp_Half(dst, src0, src1, src2, info)\\\n\ +do\\\n\ +{\\\n\ + VXC_Clamp(dst, src0, src1, src2, info);\\\n\ +} while (0)\n\ +#endif\n\ +"; /* end of vsi_nn_kernel_header_vx*/ + +static const char vsi_nn_kernel_heatmap_max_keypoint_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void vxcHeatmap_max_keypoint(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output)\n\ +{\n\ +\n\ +}\n\ +"; /* end of vsi_nn_kernel_heatmap_max_keypoint_vx*/ + +static const char 
vsi_nn_kernel_imageprocess_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniVecShift10;\n\ +_viv_uniform VXC_512Bits uniAddRShift;\n\ +_viv_uniform VXC_512Bits uniGetTempVal;\n\ +_viv_uniform VXC_512Bits uniExtractBytes;\n\ +_viv_uniform VXC_512Bits uniUnpackToR;\n\ +_viv_uniform VXC_512Bits uniUnpackToG;\n\ +_viv_uniform VXC_512Bits uniUnpackToB;\n\ +_viv_uniform VXC_512Bits uniDataMulAlpha_4x4;\n\ +_viv_uniform VXC_512Bits uniDataSubMean_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniExtactInteger_2x8;\n\ +\n\ +#define DESCALE(x) (((x) + (1<<19)) >> 20)\n\ +__kernel void ScaletoTensor_Int8\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ +\n\ + int4 xPos = get_global_id(0);\n\ + int yPos = get_global_id(1);\n\ +\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ + xPos += (int4)(0, 1, 2, 3);\n\ +\n\ + //x\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ + int4 sx = fx0 & 0xffff8000;\n\ + fx0 -= sx;\n\ + sx = sx >> 15;\n\ +\n\ + vxc_short4 fx;\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ + //y\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ + int sy = fy & 0xffff8000; // Floor\n\ +\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ +\n\ + //R\n\ + vxc_uchar16 line0RGB1, line0RGB2;\n\ + vxc_uchar16 line1RGB3, line1RGB4;\n\ + int4 coord;\n\ + sx = sx * 3 + *xOffset;\n\ + coord.xyz = sx.xyz;\n\ + coord.w = sy + *yOffset;\n\ + int2 coord1 = (int2)(sx.w, coord.w);\n\ + VXC_ReadImage(line0RGB1, input, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0RGB1, input, coord.yw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0RGB2, input, coord.zw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0RGB2, input, coord1, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 bgrMean = (float4)(bMean, gMean, rMean, 0);\n\ +\n\ + bgrMean *= f32Var;\n\ +\n\ + int4 test01, temp1;\n\ + int4 test02, temp2;\n\ + int4 tt;\n\ + vxc_uchar4 val;\n\ + int4 coord_out = (int4)(xPos.x, yPos, 2, 0);\n\ +\n\ + vxc_uchar8 line1, line2;\n\ +\n\ + //R\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ +\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + 
temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + vxc_float4 tmp_dst;\n\ + vxc_uchar4 u8_dst;\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ +\n\ + //convert U8 to dfp8\n\ + int4 dst0;\n\ + vxc_char4 dst;\n\ + tmp_dst = tmp_dst * f32Var - bgrMean.z;\n\ + tmp_dst *= outputScale;\n\ + dst0 = convert_int4_rte(tmp_dst);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //G\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ +\n\ + coord_out.z = 1;\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ +\n\ + tmp_dst = tmp_dst * f32Var - bgrMean.y;\n\ + tmp_dst *= outputScale;\n\ + dst0 = convert_int4_rte(tmp_dst);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //B\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ +\n\ + coord_out.z = 0;\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19,\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ +\n\ + tmp_dst = tmp_dst * f32Var - bgrMean.x;\n\ + tmp_dst *= outputScale;\n\ + dst0 = convert_int4_rte(tmp_dst);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 
0));\n\ +}\n\ +\n\ +__kernel void ScaletoTensor_Fp16\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ +\n\ + int4 xPos = get_global_id(0);\n\ + int yPos = get_global_id(1);\n\ +\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ + xPos += (int4)(0, 1, 2, 3);\n\ +\n\ + //x\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ + int4 sx = fx0 & 0xffff8000;\n\ + fx0 -= sx;\n\ + sx = sx >> 15;\n\ +\n\ + vxc_short4 fx;\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ + //y\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ + int sy = fy & 0xffff8000; // Floor\n\ +\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ +\n\ + //R\n\ + vxc_uchar16 line0RGB1, line0RGB2;\n\ + vxc_uchar16 line1RGB3, line1RGB4;\n\ + int4 coord;\n\ + sx = sx * 3 + *xOffset;\n\ + coord.xyz = sx.xyz;\n\ + coord.w = sy + *yOffset;\n\ + int2 coord1 = (int2)(sx.w, coord.w);\n\ + VXC_ReadImage(line0RGB1, input, coord.xw,\\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0RGB1, input, coord.yw,\\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0RGB2, input, coord.zw,\\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0RGB2, input, coord1,\\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(line1RGB3, input, coord.xw,\\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1RGB3, input, coord.yw,\\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1RGB4, input, coord.zw,\\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1RGB4, input, coord1,\\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 bgrMean = (float4)(bMean, gMean, rMean, 0);\n\ +\n\ + int4 test01, temp1;\n\ + int4 test02, temp2;\n\ + int4 tt;\n\ + vxc_uchar4 val;\n\ + int4 coord_out = (int4)(xPos.x, yPos, 2, 0);\n\ +\n\ + vxc_uchar8 line1, line2;\n\ +\n\ + //R\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ +\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ +\n\ + //convert U8 to FP16\n\ + half4 f16mean;\n\ + half f16alpha;\n\ + vxc_half4 dst;\n\ + vxc_short4 tmp_dst;\n\ + _viv_asm(CONV, f16mean, bgrMean);\n\ + _viv_asm(CONV, f16alpha, f32Var);\n\ + VXC_DP4x4(dst, val, f16mean.z, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniDataSubMean_4x4);\n\ + VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4);\n\ + _viv_asm(COPY, tmp_dst, dst, 8);\n\ + VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //G\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ +\n\ + coord_out.z = 1;\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ +\n\ + VXC_DP4x4(dst, val, f16mean.y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataSubMean_4x4);\n\ + VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4);\n\ + _viv_asm(COPY, tmp_dst, dst, 8);\n\ + VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //B\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ +\n\ + coord_out.z = 0;\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ +\n\ + VXC_DP4x4(dst, val, f16mean.x, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataSubMean_4x4);\n\ + VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4);\n\ + _viv_asm(COPY, tmp_dst, dst, 8);\n\ + VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +}\n\ +\n\ +"; /* end of vsi_nn_kernel_imageprocess_vx*/ + +static const char vsi_nn_kernel_imageprocess_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniVecShift10;\n\ +_viv_uniform VXC_512Bits uniAddRShift;\n\ +_viv_uniform VXC_512Bits uniGetTempVal;\n\ +_viv_uniform VXC_512Bits uniExtractBytes;\n\ +_viv_uniform VXC_512Bits uniUnpackToR;\n\ +_viv_uniform VXC_512Bits uniUnpackToG;\n\ +_viv_uniform VXC_512Bits uniUnpackToB;\n\ +_viv_uniform VXC_512Bits uniDataMulAlpha_4x4;\n\ +_viv_uniform VXC_512Bits uniDataSubMean_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniExtactInteger_2x8;\n\ +\n\ +#define DESCALE(x) (((x) + (1<<19)) >> 20)\n\ +__kernel void ScaletoTensor_Int16\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + 
global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ +\n\ + int4 xPos = get_global_id(0);\n\ + int yPos = get_global_id(1);\n\ +\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ + xPos += (int4)(0, 1, 2, 3);\n\ +\n\ + //x\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ + int4 sx = fx0 & 0xffff8000;\n\ + fx0 -= sx;\n\ + sx = sx >> 15;\n\ +\n\ + vxc_short4 fx;\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ + //y\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ + int sy = fy & 0xffff8000; // Floor\n\ +\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ +\n\ + //R\n\ + vxc_uchar16 line0RGB1, line0RGB2;\n\ + vxc_uchar16 line1RGB3, line1RGB4;\n\ + int4 coord;\n\ + sx = sx * 3 + *xOffset;\n\ + coord.xyz = sx.xyz;\n\ + coord.w = sy + *yOffset;\n\ + int2 coord1 = (int2)(sx.w, coord.w);\n\ + VXC_ReadImage(line0RGB1, input, coord.xw,\\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0RGB1, input, coord.yw,\\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0RGB2, input, coord.zw,\\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0RGB2, input, coord1,\\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(line1RGB3, input, coord.xw,\\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1RGB3, input, coord.yw,\\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1RGB4, input, coord.zw,\\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1RGB4, input, coord1,\\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 bgrMean = (float4)(bMean, gMean, rMean, 0);\n\ +\n\ + bgrMean *= f32Var;\n\ +\n\ + int4 test01, temp1;\n\ + int4 test02, temp2;\n\ + int4 tt;\n\ + vxc_uchar4 val;\n\ + int4 coord_out = (int4)(xPos.x, yPos, 2, 0);\n\ +\n\ + vxc_uchar8 line1, line2;\n\ +\n\ + //R\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ +\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + vxc_float4 tmp_dst;\n\ + vxc_uchar4 u8_dst;\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ +\n\ + //convert U8 to dfp8\n\ + int4 dst0;\n\ + vxc_short4 dst;\n\ + tmp_dst = tmp_dst * f32Var - bgrMean.z;\n\ + tmp_dst *= outputScale;\n\ + dst0 = convert_int4_rte(tmp_dst);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, 
VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //G\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ +\n\ + coord_out.z = 1;\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ +\n\ + tmp_dst = tmp_dst * f32Var - bgrMean.y;\n\ + tmp_dst *= outputScale;\n\ + dst0 = convert_int4_rte(tmp_dst);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //B\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ +\n\ + coord_out.z = 0;\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ +\n\ + tmp_dst = tmp_dst * f32Var - bgrMean.x;\n\ + tmp_dst *= outputScale;\n\ + dst0 = convert_int4_rte(tmp_dst);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform float outputZP;\n\ +__kernel void ScaletoTensor_UInt8\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ +\n\ + int4 xPos = get_global_id(0);\n\ + int yPos = get_global_id(1);\n\ +\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ + xPos += (int4)(0, 1, 2, 3);\n\ +\n\ + //x\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ + int4 sx = fx0 & 0xffff8000;\n\ + fx0 -= sx;\n\ + sx = sx >> 15;\n\ +\n\ + vxc_short4 fx;\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ + 
//y\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ + int sy = fy & 0xffff8000; // Floor\n\ +\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ +\n\ + //R\n\ + vxc_uchar16 line0RGB1, line0RGB2;\n\ + vxc_uchar16 line1RGB3, line1RGB4;\n\ + int4 coord;\n\ + sx = sx * 3 + *xOffset;\n\ + coord.xyz = sx.xyz;\n\ + coord.w = sy + *yOffset;\n\ + int2 coord1 = (int2)(sx.w, coord.w);\n\ + VXC_ReadImage(line0RGB1, input, coord.xw,\\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0RGB1, input, coord.yw,\\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0RGB2, input, coord.zw,\\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0RGB2, input, coord1,\\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(line1RGB3, input, coord.xw,\\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1RGB3, input, coord.yw,\\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1RGB4, input, coord.zw,\\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1RGB4, input, coord1,\\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 bgrMean = (float4)(bMean, gMean, rMean, 0);\n\ +\n\ + bgrMean *= f32Var;\n\ +\n\ + int4 test01, temp1;\n\ + int4 test02, temp2;\n\ + int4 tt;\n\ + vxc_uchar4 val;\n\ + int4 coord_out = (int4)(xPos.x, yPos, 2, 0);\n\ +\n\ + vxc_uchar8 line1, line2;\n\ +\n\ + //R\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR);\n\ +\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + vxc_float4 tmp_dst;\n\ + vxc_uchar4 u8_dst;\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ +\n\ + //convert U8 to dfp8\n\ + int4 dst0;\n\ + vxc_uchar4 dst;\n\ + tmp_dst = tmp_dst * f32Var - bgrMean.z;\n\ + tmp_dst = tmp_dst * outputScale + outputZP;\n\ + dst0 = convert_int4_rte(tmp_dst);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //G\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG);\n\ +\n\ + coord_out.z = 1;\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ 
+\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ +\n\ + tmp_dst = tmp_dst * f32Var - bgrMean.y;\n\ + tmp_dst = tmp_dst * outputScale + outputZP;\n\ + dst0 = convert_int4_rte(tmp_dst);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //B\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB);\n\ +\n\ + coord_out.z = 0;\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ +\n\ + tmp_dst = tmp_dst * f32Var - bgrMean.x;\n\ + tmp_dst = tmp_dst * outputScale + outputZP;\n\ + dst0 = convert_int4_rte(tmp_dst);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of vsi_nn_kernel_imageprocess_2_vx*/ + +static const char vsi_nn_kernel_imageprocess_3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniExtractR_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractG_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractB_2x8;\n\ +_viv_uniform float outputScale;\n\ +__kernel void ScaletoTensor_Fp16_copy\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\ +\n\ + coord.xy += (int2) (*xOffset, *yOffset);\n\ + vxc_uchar16 src0, src1;\n\ + vxc_half8 dst;\n\ +\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 paramData = (float4)(rMean * f32Var, gMean * f32Var, bMean * f32Var, f32Var);\n\ + //convert U8 to FP16\n\ + half4 paramData_f16;\n\ + vxc_short8 tmp_dst;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0);\n\ + //R\n\ + VXC_DP2x8(dst, src0, paramData_f16, 
VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0), uniExtractR_2x8);\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_TowardZero, 0), uniExtractR_2x8);\n\ + _viv_asm(COPY, tmp_dst, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + //G\n\ + coord_out.z = 1;\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0), uniExtractG_2x8);\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_TowardZero, 0), uniExtractG_2x8);\n\ + _viv_asm(COPY, tmp_dst, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //B\n\ + coord_out.z = 0;\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0), uniExtractB_2x8);\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_TowardZero, 0), uniExtractB_2x8);\n\ + _viv_asm(COPY, tmp_dst, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void ScaletoTensor_Int8_copy\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\ +\n\ + coord.xy += (int2) (*xOffset, *yOffset);\n\ + vxc_uchar16 src0, src1;\n\ + vxc_char16 dst;\n\ +\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + f32Var *= outputScale;\n\ + float4 paramData = (float4)(rMean * f32Var, gMean * f32Var, bMean * f32Var, f32Var);\n\ + //convert U8 to FP16\n\ + half4 paramData_f16;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0);\n\ + //R\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8);\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + //G\n\ + coord_out.z = 1;\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8);\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //B\n\ + coord_out.z = 0;\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8);\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void ScaletoTensor_Int16_copy\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\ +\n\ + coord.xy += (int2) 
(*xOffset, *yOffset);\n\ + vxc_uchar16 src0, src1;\n\ + vxc_short8 dst;\n\ +\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + f32Var *= outputScale;\n\ + float4 paramData = (float4)(rMean * f32Var, gMean * f32Var, bMean * f32Var, f32Var);\n\ + //convert U8 to FP16\n\ + half4 paramData_f16;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0);\n\ + //R\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8);\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + //G\n\ + coord_out.z = 1;\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8);\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //B\n\ + coord_out.z = 0;\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8);\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform float outputZP;\n\ +__kernel void ScaletoTensor_UInt8_copy\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\ +\n\ + coord.xy += (int2) (*xOffset, *yOffset);\n\ + vxc_uchar16 src0, src1;\n\ + vxc_uchar16 dst;\n\ +\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + f32Var *= outputScale;\n\ + float4 paramData = (float4)(rMean * f32Var - outputZP,\\\n\ + gMean * f32Var - outputZP, bMean * f32Var - outputZP, f32Var);\n\ + //convert U8 to FP16\n\ + half4 paramData_f16;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 2, 0);\n\ + //R\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8);\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractR_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + //G\n\ + coord_out.z = 1;\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8);\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractG_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //B\n\ + coord_out.z = 0;\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 4, 0, VXC_RM_ToNearestEven, 1), 
uniExtractB_2x8);\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(5, 9, 0, VXC_RM_ToNearestEven, 1), uniExtractB_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 9, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of vsi_nn_kernel_imageprocess_3_vx*/ + +static const char vsi_nn_kernel_imageprocess_4_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniVecShift10;\n\ +_viv_uniform VXC_512Bits uniAddRShift;\n\ +_viv_uniform VXC_512Bits uniGetTempVal;\n\ +_viv_uniform VXC_512Bits uniExtractBytes;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniExtactInteger_2x8;\n\ +\n\ +#define DESCALE(x) (((x) + (1<<19)) >> 20)\n\ +__kernel void GrayScaletoTensor_Int8\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ +\n\ + int4 xPos = get_global_id(0);\n\ + int yPos = get_global_id(1);\n\ +\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ + xPos += (int4)(0, 1, 2, 3);\n\ +\n\ + //x\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ + int4 sx = fx0 & 0xffff8000;\n\ + fx0 -= sx;\n\ + sx = sx >> 15;\n\ +\n\ + vxc_short4 fx;\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ + //y\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ + int sy = fy & 0xffff8000; // Floor\n\ +\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ +\n\ + //R\n\ + vxc_uchar16 line0Y;\n\ + vxc_uchar16 line1Y;\n\ + int4 coord;\n\ + sx = sx + *xOffset;\n\ + coord.xyz = sx.xyz;\n\ + coord.w = sy + *yOffset;\n\ + int2 coord1 = (int2)(sx.w, coord.w);\n\ + VXC_ReadImage(line0Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float grayMean = mean * f32Var;\n\ +\n\ + int4 test01, temp1;\n\ + int4 test02, temp2;\n\ + int4 tt;\n\ + vxc_uchar4 val;\n\ + int2 coord_out = (int2)(xPos.x, yPos);\n\ +\n\ + vxc_uchar8 line1, line2;\n\ +\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + 
vxc_float4 tmp_dst;\n\ + vxc_uchar4 u8_dst;\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero,\n\ + 1), uniConvertIntergetoF32_4x4);\n\ +\n\ + //convert U8 to dfp8\n\ + int4 dst0;\n\ + vxc_char4 dst;\n\ + tmp_dst = tmp_dst * f32Var - grayMean;\n\ + tmp_dst *= outputScale;\n\ + dst0 = convert_int4_rte(tmp_dst);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ +\n\ + VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniDataMulAlpha_4x4;\n\ +_viv_uniform VXC_512Bits uniDataSubMean_4x4;\n\ +__kernel void GrayScaletoTensor_Fp16\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ +\n\ + int4 xPos = get_global_id(0);\n\ + int yPos = get_global_id(1);\n\ +\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ + xPos += (int4)(0, 1, 2, 3);\n\ +\n\ + //x\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ + int4 sx = fx0 & 0xffff8000;\n\ + fx0 -= sx;\n\ + sx = sx >> 15;\n\ +\n\ + vxc_short4 fx;\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ + //y\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ + int sy = fy & 0xffff8000; // Floor\n\ +\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ +\n\ + //R\n\ + vxc_uchar16 line0Y;\n\ + vxc_uchar16 line1Y;\n\ + int4 coord;\n\ + sx = sx + *xOffset;\n\ + coord.xyz = sx.xyz;\n\ + coord.w = sy + *yOffset;\n\ + int2 coord1 = (int2)(sx.w, coord.w);\n\ + VXC_ReadImage(line0Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float grayMean = mean;\n\ +\n\ + int4 test01, temp1;\n\ + int4 test02, temp2;\n\ + int4 tt;\n\ + vxc_uchar4 val;\n\ + int2 coord_out = (int2)(xPos.x, yPos);\n\ +\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + VXC_DP4x4(val, temp2, 1 << 19, VXC_MODIFIER(0, 
3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ +\n\ + //convert U8 to FP16\n\ + half f16mean;\n\ + half f16alpha;\n\ + vxc_half4 dst;\n\ + vxc_short4 tmp_dst;\n\ + _viv_asm(CONV, f16mean, grayMean);\n\ + _viv_asm(CONV, f16alpha, f32Var);\n\ + VXC_DP4x4(dst, val, f16mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataSubMean_4x4);\n\ + VXC_DP4x4(dst, dst, f16alpha, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataMulAlpha_4x4);\n\ + _viv_asm(COPY, tmp_dst, dst, 8);\n\ + VXC_WriteImage(output, coord_out, tmp_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of vsi_nn_kernel_imageprocess_4_vx*/ + +static const char vsi_nn_kernel_imageprocess_5_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniVecShift10;\n\ +_viv_uniform VXC_512Bits uniAddRShift;\n\ +_viv_uniform VXC_512Bits uniGetTempVal;\n\ +_viv_uniform VXC_512Bits uniExtractBytes;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniExtactInteger_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ +__kernel void GrayScaletoTensor_Fp16_copy\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + coord.xy += (int2) (*xOffset, *yOffset);\n\ + vxc_uchar16 src0;\n\ + vxc_half8 dst0, dst1;\n\ +\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord.x = coord.z + 8;\n\ + float4 paramData = (float4)(mean * f32Var, mean * f32Var, mean * f32Var, f32Var);\n\ + //convert U8 to FP16\n\ + half4 paramData_f16;\n\ + vxc_short8 tmp_dst;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + VXC_DP2x8(dst0, src0, paramData_f16,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDataMeanStddevLo_2x8);\n\ + VXC_DP2x8(dst1, src0, paramData_f16,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDataMeanStddevHi_2x8);\n\ + _viv_asm(COPY, tmp_dst, dst0, 16);\n\ + VXC_WriteImage(output, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, tmp_dst, dst1, 16);\n\ + VXC_WriteImage(output, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void GrayScaletoTensor_Int8_copy\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + coord.xy += (int2) (*xOffset, *yOffset);\n\ + vxc_uchar16 src0;\n\ + vxc_char16 dst;\n\ +\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + f32Var *= outputScale;\n\ + float4 paramData = (float4)(mean * f32Var, mean * f32Var, mean * f32Var, f32Var);\n\ + //convert U8 to FP16\n\ + half4 paramData_f16;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + VXC_DP2x8(dst, src0, paramData_f16,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevLo_2x8);\n\ + VXC_DP2x8(dst, src0, paramData_f16,\n\ + VXC_MODIFIER(8, 15, 0, 
VXC_RM_ToNearestEven, 1), uniDataMeanStddevHi_2x8);\n\ + VXC_WriteImage(output, coord.zw, dst,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +}\n\ +\n\ +__kernel void GrayScaletoTensor_Int16\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ +\n\ + int4 xPos = get_global_id(0);\n\ + int yPos = get_global_id(1);\n\ +\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ + xPos += (int4)(0, 1, 2, 3);\n\ +\n\ + //x\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ + int4 sx = fx0 & 0xffff8000;\n\ + fx0 -= sx;\n\ + sx = sx >> 15;\n\ +\n\ + vxc_short4 fx;\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ + //y\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ + int sy = fy & 0xffff8000; // Floor\n\ +\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ +\n\ + vxc_uchar16 line0Y;\n\ + vxc_uchar16 line1Y;\n\ + int4 coord;\n\ + sx = sx + *xOffset;\n\ + coord.xyz = sx.xyz;\n\ + coord.w = sy + *yOffset;\n\ + int2 coord1 = (int2)(sx.w, coord.w);\n\ + VXC_ReadImage(line0Y, input, coord.xw,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord.yw,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord.zw,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float grayMean = mean * f32Var;\n\ +\n\ + int4 test01, temp1;\n\ + int4 test02, temp2;\n\ + int4 tt;\n\ + vxc_uchar4 val;\n\ + int2 coord_out = (int2)(xPos.x, yPos);\n\ +\n\ + vxc_uchar8 line1, line2;\n\ +\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + vxc_float4 tmp_dst;\n\ + vxc_uchar4 u8_dst;\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ +\n\ + //convert U8 to dfp8\n\ + int4 dst0;\n\ + vxc_short4 dst;\n\ + tmp_dst = tmp_dst * f32Var - grayMean;\n\ + tmp_dst *= outputScale;\n\ + dst0 = convert_int4_rte(tmp_dst);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ +\n\ + VXC_WriteImage(output, coord_out, dst, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void GrayScaletoTensor_Int16_copy\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + coord.xy += (int2) (*xOffset, *yOffset);\n\ + vxc_uchar16 src0;\n\ + vxc_short8 dst0, dst1;\n\ +\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord.x = coord.z + 8;\n\ +\n\ + f32Var *= outputScale;\n\ + float4 paramData = (float4)(mean * f32Var, mean * f32Var, mean * f32Var, f32Var);\n\ + //convert U8 to FP16\n\ + half4 paramData_f16;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ +\n\ + VXC_DP2x8(dst0, src0, paramData_f16,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevLo_2x8);\n\ + VXC_DP2x8(dst1, src0, paramData_f16,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevHi_2x8);\n\ + VXC_WriteImage(output, coord.zw, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.xw, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform float outputZP;\n\ +__kernel void GrayScaletoTensor_UInt8_copy\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + coord.xy += (int2) (*xOffset, *yOffset);\n\ + vxc_uchar16 src0;\n\ + vxc_uchar16 dst;\n\ +\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + f32Var *= outputScale;\n\ + float4 paramData = (float4)(mean * f32Var - outputZP, mean * f32Var - outputZP,\n\ + mean * f32Var - outputZP, f32Var);\n\ + //convert U8 to FP16\n\ + half4 paramData_f16;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + VXC_DP2x8(dst, src0, paramData_f16,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevLo_2x8);\n\ + VXC_DP2x8(dst, src0, paramData_f16,\n\ + VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniDataMeanStddevHi_2x8);\n\ + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void GrayScaletoTensor_UInt8\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio);\n\ +\n\ + int4 xPos = get_global_id(0);\n\ + int yPos = get_global_id(1);\n\ +\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14);\n\ + xPos += (int4)(0, 1, 2, 3);\n\ +\n\ + //x\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x;\n\ + int4 sx = fx0 & 0xffff8000;\n\ + fx0 -= sx;\n\ + sx = sx >> 15;\n\ +\n\ + vxc_short4 fx;\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift);\n\ + //y\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y;\n\ + int sy = fy & 0xffff8000; // Floor\n\ +\n\ + fy -= sy;\n\ + sy = sy >> 15;\n\ +\n\ + fy = (fy + (1<< 4)) >> 5;\n\ +\n\ + //R\n\ + vxc_uchar16 line0Y;\n\ + vxc_uchar16 
line1Y;\n\ + int4 coord;\n\ + sx = sx + *xOffset;\n\ + coord.xyz = sx.xyz;\n\ + coord.w = sy + *yOffset;\n\ + int2 coord1 = (int2)(sx.w, coord.w);\n\ + VXC_ReadImage(line0Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line0Y, input, coord1, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float grayMean = mean * f32Var;\n\ +\n\ + int4 test01, temp1;\n\ + int4 test02, temp2;\n\ + int4 tt;\n\ + vxc_uchar4 val;\n\ + int2 coord_out = (int2)(xPos.x, yPos);\n\ +\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp1 = temp1 + test01;\n\ +\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10);\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal);\n\ + temp2 = temp2 + test02;\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10);\n\ +\n\ + vxc_float4 tmp_dst;\n\ + vxc_uchar4 u8_dst;\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes);\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst,\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniConvertIntergetoF32_4x4);\n\ +\n\ + //convert U8 to dfp8\n\ + int4 dst0;\n\ + vxc_uchar4 dst;\n\ + tmp_dst = tmp_dst * f32Var - grayMean;\n\ + tmp_dst = tmp_dst * outputScale + outputZP;\n\ + dst0 = convert_int4_rte(tmp_dst);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtactInteger_2x8);\n\ +\n\ + VXC_WriteImage(output, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of vsi_nn_kernel_imageprocess_5_vx*/ + +static const char vsi_nn_kernel_layernormalize_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/**************************layernorm float16***********************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ +\n\ +__kernel void vxcLayerNorm(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t output,\n\ + float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_short8 src0, src1;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + vxc_half8 val0_h;\n\ + _viv_asm(COPY, val0_h, src0, 16);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr += sumsqr.y;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f;\n\ + for(coord.x = 0; coord.x < width; coord.x += 4)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord.xwww);\n\ + vxc_half8 in_h, scale_h;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + vxc_float4 in_f, scale_f;\n\ + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + vxc_float4 sub, norm;\n\ + sub = in_f - mean;\n\ + norm = scale_f * vari * sub + bias_f;\n\ + half4 norm_h;\n\ + _viv_asm(CONV, norm_h, norm);\n\ + vxc_half8 dst;\n\ + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtractHalf4_dp4x4);\n\ + vxc_short8 dstval;\n\ + _viv_asm(COPY, dstval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +/*****************************layernorm uint8 to uint8****************************/\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform int output_ZP;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform int tmpZp2;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +__kernel void vxcLayerNorm_u8(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t output,\n\ + float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_uchar16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float sum = 0, sqr = 0;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + vxc_int4 tmpSum1;\n\ + vxc_int4 tmpSqr1;\n\ + short zp = inputZP;\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1.x);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ + }\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ +\n\ + float mean, vari;\n\ + mean = sum * dimRatio;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_int4 tmpVal0, 
tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + int4 coord_bias = (int4)(0, 0, 0, 0);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + coord_bias.x = coord.x;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ +\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + tmpData0 *= input_scale;\n\ + tmpData1 *= input_scale;\n\ + tmpData2 *= input_scale;\n\ + tmpData3 *= input_scale;\n\ +\n\ + vxc_float4 norm;\n\ + tmpData0 -= mean;\n\ + norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + coord_bias.x += 4;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP);\n\ +\n\ + tmpData1 -= mean;\n\ + norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + tmpData2 -= mean;\n\ + norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_ZP);\n\ +\n\ + tmpData3 -= mean;\n\ + norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_ZP);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +/***************************layernorm float16 to uint8**************************/\n\ +_viv_uniform float outputZP;\n\ +__kernel void vxcLayerNormFP16toU8(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t output,\n\ + float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_short8 src0, src1;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + vxc_half8 
val0_h;\n\ + _viv_asm(COPY, val0_h, src0, 16);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr += sumsqr.y;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f;\n\ + for(coord.x = 0; coord.x < width; coord.x += 4)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord.xwww);\n\ + vxc_half8 in_h, scale_h;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + vxc_float4 in_f, scale_f;\n\ + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + vxc_float4 sub, norm;\n\ + sub = in_f - mean;\n\ + norm = scale_f * vari * sub + bias_f;\n\ + norm = norm * outputScale + outputZP;\n\ + int4 output_int4;\n\ + output_int4 = convert_int4_rte(norm);\n\ + vxc_uchar8 dst;\n\ + VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of vsi_nn_kernel_layernormalize_vx*/ + +static const char vsi_nn_kernel_layernormalize_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/*****************************layernorm uint8 to fp16****************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform int tmpZp2;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits UniPackFP16even_2x8;\n\ +\n\ +__kernel void vxcLayerNormU8toFp16(\n\ + image2d_array_t input,\n\ + image2d_array_t bias,\n\ + image2d_array_t scale,\n\ + image2d_array_t output,\n\ + float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + vxc_int4 tmpSum1;\n\ + vxc_int4 tmpSqr1;\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1.x);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ + }\n\ + sum = (tmpSum + sumInZp) * 
input_scale;\n\ + sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ +\n\ + float mean, vari;\n\ + mean = sum * dimRatio;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + int4 coord_bias = (int4)(0, 0, 0, 0);\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_short8 src1, outval;\n\ + short zp = inputZP;\n\ + half4 tmpVal0, tmpVal1;\n\ + vxc_half8 dst;\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + coord_bias.x = coord.x;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ +\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + tmpData0 *= input_scale;\n\ + tmpData1 *= input_scale;\n\ + tmpData2 *= input_scale;\n\ + tmpData3 *= input_scale;\n\ +\n\ + vxc_float4 norm;\n\ + tmpData0 -= mean;\n\ + norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + coord_bias.x += 4;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ +\n\ + tmpData1 -= mean;\n\ + norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + UniPackFP16even_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + int2 coord_out = (int2)(coord.x, coord.y);\n\ + VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + tmpData2 -= mean;\n\ + norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ +\n\ + tmpData3 -= mean;\n\ + norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + UniPackFP16even_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + coord_out.x += 8;\n\ + VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of vsi_nn_kernel_layernormalize_U8_vx*/ + +static const char vsi_nn_kernel_resize_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ 
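+// Downsample-to-quarter kernels: each work-item reads at (x, y) and writes at\n\
+// (x >> 1, y >> 1). The 8-bit variant keeps the even byte lanes\n\
+// (src0.s02468ace); the 16-bit variant packs lanes through the host-configured\n\
+// uniPackEvenData_2x8 uniform which, going by its name, selects the even\n\
+// elements - i.e. point sampling rather than an averaging filter (assuming the\n\
+// uniform only permutes lanes).\n\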
+\n\ +//--------------------------resize-------------------------\n\ +_viv_uniform VXC_512Bits uniPackEvenData_2x8;\n\ +__kernel void resize_16bits_downsample_quarter\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + vxc_short8 src0, src1;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord = coord >> 1;\n\ + VXC_DP2x8(src0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardInf, 0), uniPackEvenData_2x8);\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_8bits_downsample_quarter\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + vxc_char16 src0;\n\ + vxc_char8 dst;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord = coord >> 1;\n\ + dst = src0.s02468ace;\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of vsi_nn_kernel_resize_vx*/ + +static const char vsi_nn_kernel_roi_align_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void vxcRoi_align(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output)\n\ +{\n\ +\n\ +}\n\ +"; /* end of vsi_nn_kernel_roi_align_vx*/ + +static const char vsi_nn_kernel_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +//--------------------------scale-------------------------\n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Lo_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16MulFp16ToFp32_Hi_4x4;\n\ +__kernel void scale_fp16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t weights,\n\ + __read_only image2d_array_t biases,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + vxc_short8 vec0, vec1;\n\ + vxc_half8 src0;\n\ + vxc_half8 w0;\n\ + vxc_float4 b0, b1;\n\ + vxc_float4 dst0, dst1;\n\ + VXC_ReadImage(vec0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage(vec1, weights, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, w0, vec1, 16);\n\ +\n\ + coord.z = coord.x + 4;\n\ +\n\ + b0 = read_imagef(biases, coord.xwww);\n\ + b1 = read_imagef(biases, coord.zwww);\n\ +\n\ + VXC_DP4x4(dst0, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16MulFp16ToFp32_Lo_4x4);\n\ + VXC_DP4x4(dst1, src0, w0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16MulFp16ToFp32_Hi_4x4);\n\ + dst0 += b0;\n\ + dst1 += b1;\n\ +\n\ + half4 t0, t1;\n\ +\n\ + _viv_asm(CONV, t0, dst0);\n\ + _viv_asm(CONV, t1, dst1);\n\ +\n\ + VXC_DP2x8(w0, t0, t1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);\n\ + _viv_asm(COPY, vec0, w0, 16);\n\ +\n\ + VXC_WriteImage(output, coord.xy, vec0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of vsi_nn_kernel_scale_vx*/ + +static const char vsi_nn_kernel_shufflechannel_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ 
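+// Channel shuffle: the channel dimension is treated as a (group_number,\n\
+// group_column) matrix and transposed. For an input channel c the kernels\n\
+// compute index_col = c / group_column (via rgroup_column, presumably the\n\
+// precomputed reciprocal 1.0f / group_column) and index_row = c % group_column,\n\
+// then write the data read at channel c to output channel\n\
+// index_row * group_number + index_col.\n\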
+/******************shuffle channel float16/int16********************/\n\ +_viv_uniform int group_column;\n\ +_viv_uniform float rgroup_column;\n\ +\n\ +__kernel void shuffleChannelVXC(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int group_number,\n\ + int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_short8 src0, src1, src2, src3;\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int coordz = coord.z;\n\ + int index_col = coordz * rgroup_column;\n\ + int index_row = coordz - index_col * group_column;\n\ + coord.z = index_row * group_number + index_col;\n\ + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ + VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ + VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +/*****************shuffle channel int8/uint8****************************/\n\ +\n\ +__kernel void shuffleChannel8BitsVXC(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int group_number,\n\ + int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_char16 src0, src1, src2, src3;\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 1),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 2),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 3),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int coordz = coord.z;\n\ + int index_col = coordz * rgroup_column;\n\ + int index_row = coordz - index_col * group_column;\n\ + coord.z = index_row * group_number + index_col;\n\ + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ + VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ + VXC_WriteImage2DArray(output, coord, src3, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of vsi_nn_kernel_shufflechannel_vx*/ + +static const char vsi_nn_kernel_shufflechannel_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/******************shuffle channel float16/int16********************/\n\ +_viv_uniform int group_column;\n\ +_viv_uniform float rgroup_column;\n\ +\n\ +__kernel void shuffleChannel16Bits_Axis1(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int group_number,\n\ + int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = 
coord;\n\ + vxc_short8 src0, src1, src2, src3;\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + VXC_ReadImage2DArray(src2, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + VXC_ReadImage2DArray(src3, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int coordy = coord.y;\n\ + int index_col = coordy * rgroup_column;\n\ + int index_row = coordy - index_col * group_column;\n\ + coord_out.y = index_row * group_number + index_col;\n\ + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.x += 8;\n\ + VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.x += 8;\n\ + VXC_WriteImage2DArray(output, coord_out, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.x += 8;\n\ + VXC_WriteImage2DArray(output, coord_out, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +/*****************shuffle channel int8/uint8****************************/\n\ +\n\ +__kernel void shuffleChannel8Bits_Axis1(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int group_number,\n\ + int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ + vxc_char16 src0, src1;\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 16;\n\ + VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int coordy = coord.y;\n\ + int index_col = coordy * rgroup_column;\n\ + int index_row = coordy - index_col * group_column;\n\ + coord_out.y = index_row * group_number + index_col;\n\ + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.x += 16;\n\ + VXC_WriteImage2DArray(output, coord_out, src1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of vsi_nn_kernel_shufflechannel_axis1_vx*/ + +static const char vsi_nn_kernel_signalframe_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int input_width;\n\ +_viv_uniform int input_height;\n\ +_viv_uniform int input_channel;\n\ +_viv_uniform int output_channel;\n\ +\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_width(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int frame_length,\n\ + int step,\n\ + int pad_end,\n\ + int pad,\n\ + int axis)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int outChn = gidz * input_height + gidy;\n\ + int4 coord = (int4)(0, gidy, gidz, 0);\n\ + int4 coord_out = (int4)(0, 0, outChn, 0);\n\ +\n\ + int endcoord = (pad_end == 0) ? 
(input_width - frame_length + 1) : (input_width);\n\ + int iter = frame_length / 8;\n\ + int res = frame_length % 8;\n\ + vxc_short8 src0;\n\ +\n\ + for(int i = 0; i < endcoord; i += step)\n\ + {\n\ + coord.x = i;\n\ + for(int j = 0; j < iter; j++)\n\ + {\n\ + coord_out.x = j << 3;\n\ + coord.x = i + (j << 3);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + coord.x = i + (iter << 3);\n\ + coord_out.x = (iter << 3);\n\ + for(int j = 0; j < res; j++)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x++;\n\ + coord.x++;\n\ + }\n\ +\n\ + coord_out.y++;\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_height(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int frame_length,\n\ + int step,\n\ + int pad_end,\n\ + int pad,\n\ + int axis)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int outChn = gidz * output_channel + (gidy / step);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + int4 coord_out = (int4)(gidx, 0, outChn, 0);\n\ + vxc_short8 src0;\n\ +\n\ + for(int i = 0; i < frame_length; i++)\n\ + {\n\ + coord.y = gidy + i;\n\ + coord_out.y = i;\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_channel(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int frame_length,\n\ + int step,\n\ + int pad_end,\n\ + int pad,\n\ + int axis)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int outChn = (gidz / step) * frame_length;\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + int4 coord_out = (int4)(gidx, gidy, outChn, 0);\n\ + vxc_short8 src0;\n\ +\n\ + for(int i = 0; i < frame_length; i++)\n\ + {\n\ + coord.z = gidz + i;\n\ + coord_out.z = outChn + i;\n\ + if(coord.z < input_channel)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + else\n\ + {\n\ + src0 = (vxc_short8)(0);\n\ + }\n\ + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_width_8bit(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int frame_length,\n\ + int step,\n\ + int pad_end,\n\ + int pad,\n\ + int axis)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int outChn = gidz * input_height + gidy;\n\ + int4 coord = (int4)(0, gidy, gidz, 0);\n\ + int4 coord_out = (int4)(0, 0, outChn, 0);\n\ +\n\ + int endcoord = (pad_end == 0) ? 
(input_width - frame_length + 1) : (input_width);\n\ + int iter = frame_length / 8;\n\ + int res = frame_length % 8;\n\ + vxc_char8 src0;\n\ +\n\ + for(int i = 0; i < endcoord; i += step)\n\ + {\n\ + coord.x = i;\n\ + for(int j = 0; j < iter; j++)\n\ + {\n\ + coord_out.x = j << 3;\n\ + coord.x = i + (j << 3);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + coord.x = i + (iter << 3);\n\ + coord_out.x = (iter << 3);\n\ + for(int j = 0; j < res; j++)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x++;\n\ + coord.x++;\n\ + }\n\ +\n\ + coord_out.y++;\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_height_8bit(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int frame_length,\n\ + int step,\n\ + int pad_end,\n\ + int pad,\n\ + int axis)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int outChn = gidz * output_channel + (gidy / step);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + int4 coord_out = (int4)(gidx, 0, outChn, 0);\n\ + vxc_char8 src0;\n\ +\n\ + for(int i = 0; i < frame_length; i++)\n\ + {\n\ + coord.y = gidy + i;\n\ + coord_out.y = i;\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(8, 1, 1))) void vxcSignalFrame_channel_8bit(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int frame_length,\n\ + int step,\n\ + int pad_end,\n\ + int pad,\n\ + int axis)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int outChn = (gidz / step) * frame_length;\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + int4 coord_out = (int4)(gidx, gidy, outChn, 0);\n\ + vxc_char8 src0;\n\ +\n\ + for(int i = 0; i < frame_length; i++)\n\ + {\n\ + coord.z = gidz + i;\n\ + coord_out.z = outChn + i;\n\ + if(coord.z < input_channel)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + else\n\ + {\n\ + src0 = (vxc_char8)(0);\n\ + }\n\ + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +#if 0\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void vxcSignalFrame_tensor(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + image2d_array_t frame_length,\n\ + image2d_array_t steps,\n\ + image2d_array_t pad_end,\n\ + image2d_array_t pad,\n\ + image2d_array_t axis)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int outChn = gidz * input_height + gidy;\n\ + int4 coord = (int4)(0, gidy, gidz, 0);\n\ + int4 coord_out = (int4)(0, 0, outChn, 0);\n\ + int4 coord_para = (int4)(0, 0, 0, 0);\n\ +\n\ + int4 size = read_imagei(frame_length, coord_para);\n\ + int4 step = read_imagei(steps, coord_para);\n\ + int4 pe = read_imagei(pad_end, coord_para);\n\ + int4 pd = read_imagei(pad, 
coord_para);\n\ + int len = input_width + (pe.x ? pd : 0);\n\ + int endcoord = len - size.x + 1;\n\ + int iter = size.x / 8;\n\ + int res = size.x % 8;\n\ + vxc_short8 src0;\n\ +\n\ + for(int i = 0; i < endcoord; i += step.x)\n\ + {\n\ + coord.x = i;\n\ + for(int j = 0; j < iter; j++)\n\ + {\n\ + coord_out.x = j << 3;\n\ + coord.x += (j << 3);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + coord.x = i + (iter << 3);\n\ + coord_out.x = (iter << 3);\n\ + for(int j = 0; j < res; j++)\n\ + {\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord_out, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x++;\n\ + coord.x++;\n\ + }\n\ +\n\ + coord_out.y++;\n\ + }\n\ +}\n\ +#endif\n\ +"; /* end of vsi_nn_kernel_signalframe_vx*/ + +static const char vsi_nn_kernel_space2depth_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniExtractEvenFp16Stride2_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractOddFp16Stride2_4x4;\n\ +_viv_uniform int input_depth;\n\ +\n\ +__kernel void vxcReorg2_fp16_fp16_sx2_sy1\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int stridex,\n\ + int stridey\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + int4 coord_out = (int4)(gidx >> 1, gidy, 0, 0);\n\ + int out_d0, out_d1;\n\ + vxc_short8 imageData;\n\ + vxc_short8 imgVal0, imgVal1;\n\ + //int tmpw = gidz / input_depth; \\n\\\n\ + //int tmpz = gidz % input_depth; \\n\\\n\ +\n\ + VXC_ReadImage2DArray(imageData, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(imgVal0, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniExtractEvenFp16Stride2_4x4);\n\ + VXC_DP4x4(imgVal1, imageData, imageData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniExtractOddFp16Stride2_4x4);\n\ +\n\ + out_d0 = gidz * 2 * 1;\n\ + out_d1 = out_d0 + 1;\n\ +\n\ + coord_out.z = out_d0;\n\ + VXC_WriteImage2DArray(output, coord_out, imgVal0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.z = out_d1;\n\ + VXC_WriteImage2DArray(output, coord_out, imgVal1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of vsi_nn_kernel_space2depth_vx*/ + +static const char vsi_nn_kernel_tensorstackconcat_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/*******************tensorstackconcat 16BITs********************/\n\ +__kernel void vxcTensorStackConcat(\n\ + image2d_array_t input,\n\ + image2d_t index,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_short8 src0, src1;\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(8, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.w = 0;\n\ + coord.y = read_imagei(index, coord.ww).x;\n\ + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +/**************tensorstackconcat 
8BITs***************************/\n\ +__kernel void vxcTensorStackConcat8Bits(\n\ + image2d_array_t input,\n\ + image2d_t index,\n\ + image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int idx = coord.x;\n\ + vxc_char16 src0, src1;\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 16;\n\ + VXC_ReadImage2DArray(src1, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.x = idx;\n\ + coord.w = 0;\n\ + coord.y = read_imagei(index, coord.ww).x;\n\ + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 16;\n\ + VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of vsi_nn_kernel_tensorstackconcat_vx*/ + +static const char vsi_nn_kernel_topk_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void vxcTopk(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output)\n\ +{\n\ +\n\ +}\n\ +"; /* end of vsi_nn_kernel_topk_vx*/ + +static const char vsi_nn_kernel_transform_gemm_vx[] = "/*\n\ + ============================================================================\n\ + Name : gemm.vx\n\ + Author : Sam\n\ + Version :\n\ + Copyright : Your copyright notice\n\ + Description :\n\ + ============================================================================\n\ + */\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniGemm3x3_4x4;\n\ +__kernel void vxcTransform_Gemm_F16toF16\n\ + (\n\ + __read_only image2d_array_t thetaTensor,\n\ + __read_only image2d_array_t gridTensor,\n\ + __write_only image2d_array_t coordinates\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ +\n\ + vxc_short8 vec0, vec1, vec2;\n\ + vxc_half8 src0, src1, src2, dst;\n\ +\n\ + VXC_ReadImage(vec0,thetaTensor,coord.xx,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,5, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 16);\n\ + VXC_ReadImage(vec1,gridTensor,coord.yz,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,5,0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 16);\n\ + VXC_ReadImage(vec2,gridTensor,coord.yz,VXC_5BITOFFSET_XY(6,0),VXC_MODIFIER(0,5,0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src2, vec2, 16);\n\ +\n\ + coord.y = (int)((short)coord.y / (short)3) * 2;\n\ +\n\ + VXC_DP4x4(dst, src1, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGemm3x3_4x4);\n\ + VXC_DP4x4(dst, src2, src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniGemm3x3_4x4);\n\ +\n\ + _viv_asm(COPY, vec0, dst, 16);\n\ + VXC_WriteImage(coordinates, coord.yz, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of vsi_nn_kernel_transform_gemm_vx*/ + +static const char vsi_nn_kernel_transform_interp_vx[] = "/*\n\ + ============================================================================\n\ + Name : minimum.vx\n\ + Author : Sam\n\ + Version :\n\ + Copyright : Your copyright notice\n\ + Description :\n\ + ============================================================================\n\ + */\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniGetDXY_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertF16toF32_4x4;\n\ +_viv_uniform int2 packedWH2;\n\ +_viv_uniform int packedWH;\n\ +__kernel void vxcTransform_InterP_F16toF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only 
image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 vec0;\n\ + vxc_half8 pxy;\n\ + vxc_float4 dxy4;\n\ + vxc_int4 pos4;\n\ + short dst = 0;\n\ +\n\ + VXC_ReadImage(vec0, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, pxy, vec0, 4);\n\ +\n\ + coord.x >>= 1;\n\ + vxc_short2 packedWH_16B;\n\ + _viv_asm(COPY, packedWH_16B, packedWH, 4);\n\ + VXC_DP4x4(dxy4, pxy, packedWH_16B, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniGetDXY_4x4);\n\ + dxy4.zw = floor(dxy4.xy);\n\ + pos4.xy = convert_int2(dxy4.zw);\n\ + pos4.zw = convert_int2(ceil(dxy4.xy));\n\ +\n\ + vxc_short8 vec1;\n\ + vxc_half8 src0, src1;\n\ + VXC_ReadImage(vec0, input0, pos4.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 8);\n\ + VXC_ReadImage(vec1, input0, pos4.xw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 8);\n\ +\n\ + float2 xyLerp = dxy4.xy - dxy4.zw;\n\ + float2 oneSub_xyLerp = 1.0f - xyLerp;\n\ + float4 coef = (float4)(oneSub_xyLerp.x * oneSub_xyLerp.y, xyLerp.x * oneSub_xyLerp.y,\n\ + oneSub_xyLerp.x * xyLerp.y, xyLerp.x * xyLerp.y);\n\ + float4 data;\n\ +\n\ + VXC_DP4x4(data, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16toF32_4x4);\n\ +\n\ + data.x = dot(data, coef);\n\ +\n\ + half tmp;\n\ + _viv_asm(CONV, tmp, data);\n\ + _viv_asm(COPY, dst, tmp, 4);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform int depth;\n\ +__kernel void vxcTransform_InterP_F16toF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_short8 vec0;\n\ + vxc_half8 pxy;\n\ + vxc_float4 dxy4;\n\ + vxc_int4 pos4;\n\ + short dst = 0;\n\ +\n\ + VXC_ReadImage(vec0, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, pxy, vec0, 4);\n\ +\n\ + coord.x >>= 1;\n\ + vxc_short2 packedWH_16B;\n\ + _viv_asm(COPY, packedWH_16B, packedWH, 4);\n\ + VXC_DP4x4(dxy4, pxy, packedWH_16B, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniGetDXY_4x4);\n\ + dxy4.zw = floor(dxy4.xy);\n\ + pos4.xy = convert_int2(dxy4.zw);\n\ + pos4.zw = convert_int2(ceil(dxy4.xy));\n\ +\n\ +\n\ + float2 xyLerp = dxy4.xy - dxy4.zw;\n\ + float2 oneSub_xyLerp = 1.0f - xyLerp;\n\ + float4 coef = (float4)(oneSub_xyLerp.x * oneSub_xyLerp.y, xyLerp.x * oneSub_xyLerp.y,\n\ + oneSub_xyLerp.x * xyLerp.y, xyLerp.x * xyLerp.y);\n\ +\n\ + int4 coord_ = (int4)(pos4.x, pos4.y, 0, 0);\n\ + do\n\ + {\n\ + vxc_short8 vec1;\n\ + vxc_half8 src0, src1;\n\ + VXC_ReadImage2DArray(vec0,input0,coord_,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,1,0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src0, vec0, 8);\n\ + VXC_ReadImage2DArray(vec1,input0,coord_,VXC_5BITOFFSET_XY(0,1),VXC_MODIFIER(0,1,0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, src1, vec1, 8);\n\ +\n\ + coord_.z ++;\n\ + float4 data;\n\ + VXC_DP4x4(data, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16toF32_4x4);\n\ +\n\ + data.x = dot(data, coef);\n\ +\n\ + half tmp;\n\ + _viv_asm(CONV, tmp, data);\n\ + _viv_asm(COPY, dst, tmp, 4);\n\ +\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ + coord.z ++;\n\ +\n\ + } while (coord.z < 
depth);\n\ +}\n\ +\n\ +"; /* end of vsi_nn_kernel_transform_interp_vx*/ + +static const char vsi_nn_kernel_transform_setupThres_vx[] = "/*\n\ + ============================================================================\n\ + Name : gemm.vx\n\ + Author : Sam\n\ + Version :\n\ + Copyright : Your copyright notice\n\ + Description :\n\ + ============================================================================\n\ + */\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int4 extract_packed;\n\ +__kernel void vxcTransform_setupThres_F16toF16\n\ + (\n\ + __read_only image2d_array_t initTensor,\n\ + __read_only image2d_array_t inputFC,\n\ + global int* thresFlag,\n\ + __write_only image2d_array_t thres\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, 0);\n\ +\n\ + vxc_ushort8 src0, src1, dst;\n\ +\n\ + int flag = *thresFlag;\n\ + VXC_ReadImage(src0, initTensor, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, inputFC, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_BitExtract(dst, src0, src1, extract_packed, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage(thres, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of vsi_nn_kernel_transform_setupThres_vx*/ + + + +static const char add_mean_std_norm_cl[] = "\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void add_mean_std_norm_F32_F32toF32(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float rsEps, float dimRatio,\n\ + float input0Scale, float input0Tail,\n\ + float input1Scale, float input1Tail,\n\ + float outputScale, float outputZP,\n\ + int width)\n\ +{\n\ + int lidx = get_local_id(0);\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + float4 src0, src1, result;\n\ + float pSum = 0.0f, pSqr = 0.0f;\n\ + float sum = 0.0f, sqr = 0.0f;\n\ + float input_d = 0.0f;\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(; coord.x < width; coord.x += 16)\n\ + {\n\ + src0 = read_imagef(input, coord);\n\ + src1 = read_imagef(input1, coord);\n\ + input_d = src0.x + src1.x;\n\ + pSum += input_d;\n\ + pSqr += input_d * input_d;\n\ + }\n\ + lcl_sum[lidx] = pSum;\n\ + lcl_sqr[lidx] = pSqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0];\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + float4 data0;\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sum = dot(data0, one);\n\ + pLocalPtr = (float4 *)&lcl_sqr[0];\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sqr = dot(data0, one);\n\ + float mean;\n\ + mean = sum * dimRatio;\n\ + float vari, stddev_inv, rMeanStd;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + stddev_inv = (vari==0 ? 
rsEps : rsqrt(vari));\n\ + rMeanStd = (-mean) * stddev_inv;\n\ + for(coord.x = gidx; coord.x < width; coord.x += 16)\n\ + {\n\ + src0 = read_imagef(input, coord);\n\ + src1 = read_imagef(input1, coord);\n\ + input_d = src0.x + src1.x;\n\ + result.x = input_d * stddev_inv + rMeanStd;\n\ + write_imagef(output, coord, result.xxxx);\n\ + }\n\ +}\n\ +\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void add_mean_std_norm_U8_U8toF32(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float rsEps, float dimRatio,\n\ + float input0Scale, float input0Tail,\n\ + float input1Scale, float input1Tail,\n\ + float outputScale, float outputZP,\n\ + int width)\n\ +{\n\ + int lidx = get_local_id(0);\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + float4 src0, src1, result;\n\ + float pSum = 0.0f, pSqr = 0.0f;\n\ + float sum = 0.0f, sqr = 0.0f;\n\ + float input_d = 0.0f;\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(; coord.x < width; coord.x += 16)\n\ + {\n\ + src0 = convert_float4(read_imageui(input, coord)) * input0Scale - input0Tail;\n\ + src1 = convert_float4(read_imageui(input1, coord)) * input1Scale - input1Tail;\n\ + input_d = src0.x + src1.x;\n\ + pSum += input_d;\n\ + pSqr += input_d * input_d;\n\ + }\n\ + lcl_sum[lidx] = pSum;\n\ + lcl_sqr[lidx] = pSqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0];\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + float4 data0;\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sum = dot(data0, one);\n\ + pLocalPtr = (float4 *)&lcl_sqr[0];\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sqr = dot(data0, one);\n\ + float mean;\n\ + mean = sum * dimRatio;\n\ + float vari, stddev_inv, rMeanStd;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + stddev_inv = (vari==0 ? 
rsEps : rsqrt(vari));\n\ + rMeanStd = (-mean) * stddev_inv;\n\ + for(coord.x = gidx; coord.x < width; coord.x += 16)\n\ + {\n\ + src0 = convert_float4(read_imageui(input, coord)) * input0Scale - input0Tail;\n\ + src1 = convert_float4(read_imageui(input1, coord)) * input1Scale - input1Tail;\n\ + input_d = src0.x + src1.x;\n\ + result.x = input_d * stddev_inv + rMeanStd;\n\ + write_imagef(output, coord, result.xxxx);\n\ + }\n\ +}\n\ +\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void add_mean_std_norm_U8_U8toU8(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float rsEps, float dimRatio,\n\ + float input0Scale, float input0Tail,\n\ + float input1Scale, float input1Tail,\n\ + float outputScale, float outputZP,\n\ + int width)\n\ +{\n\ + int lidx = get_local_id(0);\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + float4 src0, src1, result = 0.0f;\n\ + float pSum = 0.0f, pSqr = 0.0f;\n\ + float sum = 0.0f, sqr = 0.0f;\n\ + float input_d = 0.0f;\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(; coord.x < width; coord.x += 16)\n\ + {\n\ + src0 = convert_float4(read_imageui(input, coord)) * input0Scale - input0Tail;\n\ + src1 = convert_float4(read_imageui(input1, coord)) * input1Scale - input1Tail;\n\ + input_d = src0.x + src1.x;\n\ + pSum += input_d;\n\ + pSqr += input_d * input_d;\n\ + }\n\ + lcl_sum[lidx] = pSum;\n\ + lcl_sqr[lidx] = pSqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0];\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + float4 data0;\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sum = dot(data0, one);\n\ + pLocalPtr = (float4 *)&lcl_sqr[0];\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sqr = dot(data0, one);\n\ + float mean;\n\ + mean = sum * dimRatio;\n\ + float vari, stddev_inv, rMeanStd;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + stddev_inv = (vari==0 ? rsEps : rsqrt(vari));\n\ + rMeanStd = (-mean) * stddev_inv;\n\ + for(coord.x = gidx; coord.x < width; coord.x += 16)\n\ + {\n\ + src0 = convert_float4(read_imageui(input, coord)) * input0Scale - input0Tail;\n\ + src1 = convert_float4(read_imageui(input1, coord)) * input1Scale - input1Tail;\n\ + input_d = src0.x + src1.x;\n\ + result.x = input_d * stddev_inv + rMeanStd;\n\ + uint4 dst = convert_uint4(result * outputScale + outputZP);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ +}\n\ +\n\ +"; /* end of add_mean_std_norm_cl*/ + +static const char argmax_axis0_cl[] = "__kernel void argmax_axis0_F32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ +\n\ + float4 minVal = read_imagef(input, coord);\n\ + int minIdx = 0;\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + minIdx = val.x > minVal.x ? coord.x : minIdx;\n\ + minVal = val > minVal ? 
val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.yz, minIdx);\n\ +}\n\ +\n\ +__kernel void argmax_axis0_F32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ +\n\ + float4 minVal = read_imagef(input, coord);\n\ + int minIdx = 0;\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + minIdx = val.x > minVal.x ? coord.x : minIdx;\n\ + minVal = val > minVal ? val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + coord.x = 0;\n\ + write_imagei(output, coord.yx, minIdx);\n\ +}\n\ +\n\ +__kernel void argmax_axis0_U8toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ +\n\ + uint4 minVal = read_imageui(input, coord);\n\ + int minIdx = 0;\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + minIdx = val.x > minVal.x ? coord.x : minIdx;\n\ + minVal = val > minVal ? val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.yz, minIdx);\n\ +}\n\ +\n\ +__kernel void argmax_axis0_U8toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ +\n\ + uint4 minVal = read_imageui(input, coord);\n\ + int minIdx = 0;\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + minIdx = val.x > minVal.x ? coord.x : minIdx;\n\ + minVal = val > minVal ? val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + coord.x = 0;\n\ + write_imagei(output, coord.yx, minIdx);\n\ +}\n\ +\n\ +__kernel void argmax_axis0_I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ +\n\ + int4 minVal = read_imagei(input, coord);\n\ + int minIdx = 0;\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + minIdx = val.x > minVal.x ? coord.x : minIdx;\n\ + minVal = val > minVal ? val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.yz, minIdx);\n\ +}\n\ +\n\ +__kernel void argmax_axis0_I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ +\n\ + int4 minVal = read_imagei(input, coord);\n\ + int minIdx = 0;\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + minIdx = val.x > minVal.x ? coord.x : minIdx;\n\ + minVal = val > minVal ? val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + coord.x = 0;\n\ + write_imagei(output, coord.yx, minIdx);\n\ +}\n\ +"; /* end of argmax_axis0_cl*/ + +static const char argmax_axis1_cl[] = "__kernel void argmax_axis1_F32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ +\n\ + float4 minVal = read_imagef(input, coord);\n\ + int minIdx = 0;\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + minIdx = val.x > minVal.x ? coord.y : minIdx;\n\ + minVal = val > minVal ? 
val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xz, minIdx);\n\ +}\n\ +\n\ +__kernel void argmax_axis1_F32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ +\n\ + float4 minVal = read_imagef(input, coord);\n\ + int minIdx = 0;\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + minIdx = val.x > minVal.x ? coord.y : minIdx;\n\ + minVal = val > minVal ? val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + coord.y = 0;\n\ + write_imagei(output, coord, minIdx);\n\ +}\n\ +\n\ +__kernel void argmax_axis1_U8toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ +\n\ + uint4 minVal = read_imageui(input, coord);\n\ + int minIdx = 0;\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + minIdx = val.x > minVal.x ? coord.y : minIdx;\n\ + minVal = val > minVal ? val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xz, minIdx);\n\ +}\n\ +\n\ +__kernel void argmax_axis1_U8toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ +\n\ + uint4 minVal = read_imageui(input, coord);\n\ + int minIdx = 0;\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + minIdx = val.x > minVal.x ? coord.y : minIdx;\n\ + minVal = val > minVal ? val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + coord.y = 0;\n\ + write_imagei(output, coord, minIdx);\n\ +}\n\ +\n\ +__kernel void argmax_axis1_I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ +\n\ + int4 minVal = read_imagei(input, coord);\n\ + int minIdx = 0;\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + minIdx = val.x > minVal.x ? coord.y : minIdx;\n\ + minVal = val > minVal ? val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xz, minIdx);\n\ +}\n\ +\n\ +__kernel void argmax_axis1_I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ +\n\ + int4 minVal = read_imagei(input, coord);\n\ + int minIdx = 0;\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + minIdx = val.x > minVal.x ? coord.y : minIdx;\n\ + minVal = val > minVal ? val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + coord.y = 0;\n\ + write_imagei(output, coord, minIdx);\n\ +}\n\ +"; /* end of argmax_axis1_cl*/ + +static const char argmax_axis2_cl[] = "__kernel void argmax_axis2_F32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + float4 minVal = read_imagef(input, coord);\n\ + int minIdx = 0;\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + minIdx = val.x > minVal.x ? coord.z : minIdx;\n\ + minVal = val > minVal ? 
val : minVal;\n\ + coord.z ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xy, minIdx);\n\ +}\n\ +\n\ +__kernel void argmax_axis2_F32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int minIdx = 0;\n\ +\n\ + write_imagei(output, coord.xy, minIdx);\n\ +}\n\ +\n\ +__kernel void argmax_axis2_U8toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + uint4 minVal = read_imageui(input, coord);\n\ + int minIdx = 0;\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + minIdx = val.x > minVal.x ? coord.z : minIdx;\n\ + minVal = val > minVal ? val : minVal;\n\ + coord.z ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xy, minIdx);\n\ +}\n\ +\n\ +__kernel void argmax_axis2_U8toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int minIdx = 0;\n\ +\n\ + write_imagei(output, coord.xy, minIdx);\n\ +}\n\ +\n\ +__kernel void argmax_axis2_I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + int4 minVal = read_imagei(input, coord);\n\ + int minIdx = 0;\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + minIdx = val.x > minVal.x ? coord.z : minIdx;\n\ + minVal = val > minVal ? val : minVal;\n\ + coord.z ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xy, minIdx);\n\ +}\n\ +\n\ +__kernel void argmax_axis2_I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int minIdx = 0;\n\ +\n\ + write_imagei(output, coord.xy, minIdx);\n\ +}\n\ +"; /* end of argmax_axis2_cl*/ + +static const char argmin_axis0_cl[] = "__kernel void argmin_axis0_F32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ +\n\ + float4 minVal = read_imagef(input, coord);\n\ + int minIdx = 0;\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + minIdx = val.x < minVal.x ? coord.x : minIdx;\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.yz, minIdx);\n\ +}\n\ +\n\ +__kernel void argmin_axis0_F32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ +\n\ + float4 minVal = read_imagef(input, coord);\n\ + int minIdx = 0;\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + minIdx = val.x < minVal.x ? coord.x : minIdx;\n\ + minVal = val < minVal ? 
val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + coord.x = 0;\n\ + write_imagei(output, coord.yx, minIdx);\n\ +}\n\ +\n\ +__kernel void argmin_axis0_U8toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ +\n\ + uint4 minVal = read_imageui(input, coord);\n\ + int minIdx = 0;\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + minIdx = val.x < minVal.x ? coord.x : minIdx;\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.yz, minIdx);\n\ +}\n\ +\n\ +__kernel void argmin_axis0_U8toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ +\n\ + uint4 minVal = read_imageui(input, coord);\n\ + int minIdx = 0;\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + minIdx = val.x < minVal.x ? coord.x : minIdx;\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + coord.x = 0;\n\ + write_imagei(output, coord.yx, minIdx);\n\ +}\n\ +\n\ +__kernel void argmin_axis0_I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ +\n\ + int4 minVal = read_imagei(input, coord);\n\ + int minIdx = 0;\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + minIdx = val.x < minVal.x ? coord.x : minIdx;\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.yz, minIdx);\n\ +}\n\ +\n\ +__kernel void argmin_axis0_I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ +\n\ + int4 minVal = read_imagei(input, coord);\n\ + int minIdx = 0;\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + minIdx = val.x < minVal.x ? coord.x : minIdx;\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + coord.x = 0;\n\ + write_imagei(output, coord.yx, minIdx);\n\ +}\n\ +\n\ +"; /* end of argmin_axis0_cl*/ + +static const char argmin_axis1_cl[] = "__kernel void argmin_axis1_F32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ +\n\ + float4 minVal = read_imagef(input, coord);\n\ + int minIdx = 0;\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + minIdx = val.x < minVal.x ? coord.y : minIdx;\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xz, minIdx);\n\ +}\n\ +\n\ +__kernel void argmin_axis1_F32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ +\n\ + float4 minVal = read_imagef(input, coord);\n\ + int minIdx = 0;\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + minIdx = val.x < minVal.x ? coord.y : minIdx;\n\ + minVal = val < minVal ? 
val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + coord.y = 0;\n\ + write_imagei(output, coord, minIdx);\n\ +}\n\ +\n\ +__kernel void argmin_axis1_U8toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ +\n\ + uint4 minVal = read_imageui(input, coord);\n\ + int minIdx = 0;\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + minIdx = val.x < minVal.x ? coord.y : minIdx;\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xz, minIdx);\n\ +}\n\ +\n\ +__kernel void argmin_axis1_U8toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ +\n\ + uint4 minVal = read_imageui(input, coord);\n\ + int minIdx = 0;\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + minIdx = val.x < minVal.x ? coord.y : minIdx;\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + coord.y = 0;\n\ + write_imagei(output, coord, minIdx);\n\ +}\n\ +\n\ +__kernel void argmin_axis1_I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ +\n\ + int4 minVal = read_imagei(input, coord);\n\ + int minIdx = 0;\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + minIdx = val.x < minVal.x ? coord.y : minIdx;\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xz, minIdx);\n\ +}\n\ +\n\ +__kernel void argmin_axis1_I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ +\n\ + int4 minVal = read_imagei(input, coord);\n\ + int minIdx = 0;\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + minIdx = val.x < minVal.x ? coord.y : minIdx;\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + coord.y = 0;\n\ + write_imagei(output, coord, minIdx);\n\ +}\n\ +\n\ +"; /* end of argmin_axis1_cl*/ + +static const char argmin_axis2_cl[] = "__kernel void argmin_axis2_F32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + float4 minVal = read_imagef(input, coord);\n\ + int minIdx = 0;\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + minIdx = val.x < minVal.x ? coord.z : minIdx;\n\ + minVal = val < minVal ? 
val : minVal;\n\ + coord.z ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xy, minIdx);\n\ +}\n\ +\n\ +__kernel void argmin_axis2_F32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int minIdx = 0;\n\ +\n\ + write_imagei(output, coord.xy, minIdx);\n\ +}\n\ +\n\ +__kernel void argmin_axis2_U8toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + uint4 minVal = read_imageui(input, coord);\n\ + int minIdx = 0;\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + minIdx = val.x < minVal.x ? coord.z : minIdx;\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.z ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xy, minIdx);\n\ +}\n\ +\n\ +__kernel void argmin_axis2_U8toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int minIdx = 0;\n\ +\n\ + write_imagei(output, coord.xy, minIdx);\n\ +}\n\ +\n\ +__kernel void argmin_axis2_I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + int4 minVal = read_imagei(input, coord);\n\ + int minIdx = 0;\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + minIdx = val.x < minVal.x ? coord.z : minIdx;\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.z ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xy, minIdx);\n\ +}\n\ +\n\ +__kernel void argmin_axis2_I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axisSize\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int minIdx = 0;\n\ +\n\ + write_imagei(output, coord.xy, minIdx);\n\ +}\n\ +\n\ +"; /* end of argmin_axis2_cl*/ + +static const char batchnorm_single_cl[] = "\n\ +__kernel void batch_norm_F32toF32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t Mean,\n\ + __read_only image2d_array_t Variance,\n\ + __read_only image2d_array_t Gamma,\n\ + __read_only image2d_array_t Beta,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + float4 src, mean, var, gamma, beta;\n\ + readImage2DArray(src, input, coord);\n\ + readImage2DArray(mean, Mean, coord);\n\ + readImage2DArray(var, Variance, coord);\n\ + readImage2DArray(gamma, Gamma, coord);\n\ + readImage2DArray(beta, Beta, coord);\n\ +\n\ + float4 dst;\n\ + src.x = src.x - mean.x;\n\ + float inv = rsqrt(var.x + eps);\n\ + dst.x = src.x * inv *gamma.x + beta.x;\n\ +\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void batch_norm_F32toF32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t Mean,\n\ + __read_only image2d_t Variance,\n\ + __read_only image2d_t Gamma,\n\ + __read_only image2d_t Beta,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int2 
coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + float4 src = read_imagef(input, coord);\n\ + float4 mean = read_imagef(Mean, coord);\n\ + float4 var = read_imagef(Variance, coord);\n\ + float4 gamma = read_imagef(Gamma, coord);\n\ + float4 beta = read_imagef(Beta, coord);\n\ +\n\ + float4 dst = 0;\n\ + src.x = src.x - mean.x;\n\ + float inv = rsqrt(var.x + eps);\n\ + dst.x = src.x * inv *gamma.x + beta.x;\n\ +\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void batch_norm_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t Mean,\n\ + __read_only image2d_array_t Variance,\n\ + __read_only image2d_array_t Gamma,\n\ + __read_only image2d_array_t Beta,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + uint4 data;\n\ + float4 src, mean, var, gamma, beta;\n\ + readImage2DArray(data, input, coord);\n\ + readImage2DArray(mean, Mean, coord);\n\ + readImage2DArray(var, Variance, coord);\n\ + readImage2DArray(gamma, Gamma, coord);\n\ + readImage2DArray(beta, Beta, coord);\n\ +\n\ + src = convert_float4(data) * input_scale - input_tail;\n\ + src.x = src.x - mean.x;\n\ + float inv = rsqrt(var.x + eps);\n\ + src.x = src.x * inv *gamma.x + beta.x;\n\ +\n\ + uint4 dst = convert_uint4(src * output_scale + output_zp);\n\ +\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void batch_norm_U8toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t Mean,\n\ + __read_only image2d_t Variance,\n\ + __read_only image2d_t Gamma,\n\ + __read_only image2d_t Beta,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + uint4 data = read_imageui(input, coord);\n\ + float4 mean = read_imagef(Mean, coord);\n\ + float4 var = read_imagef(Variance, coord);\n\ + float4 gamma = read_imagef(Gamma, coord);\n\ + float4 beta = read_imagef(Beta, coord);\n\ +\n\ + float4 src = convert_float4(data) * input_scale - input_tail;\n\ + src.x = src.x - mean.x;\n\ + float inv = rsqrt(var.x + eps);\n\ + src.x = src.x * inv *gamma.x + beta.x;\n\ +\n\ + uint4 dst = convert_uint4(src * output_scale + output_zp);\n\ +\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +"; /* end of batchnorm_single_cl*/ + +static const char cast_cl[] = "\n\ +#define CAST_FUN(src_name, dst_name, src_type, dst_type, conv_fun, read_fun, write_fun) \\\n\ +__kernel void cast_##src_name##to##dst_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + src_type src = read_fun(input, coord); \\\n\ + dst_type dst = 0; \\\n\ + dst = conv_fun(src); \\\n\ + write_fun(output, coord, dst); \\\n\ +}\n\ +\n\ +CAST_FUN(F32, I32, float4, int4, convert_int4_sat, read_imagef, write_imagei)\n\ +CAST_FUN(F32, U32, float4, uint4, convert_uint4_sat, read_imagef, write_imageui)\n\ +CAST_FUN(I32, I32, int4, int4, convert_int4_sat, read_imagei, write_imagei)\n\ +CAST_FUN(I32, U32, int4, uint4, convert_uint4_sat, read_imagei, write_imageui)\n\ +CAST_FUN(U32, I32, uint4, int4, convert_int4_sat, read_imageui, write_imagei)\n\ +CAST_FUN(U32, U32, uint4, uint4, convert_uint4_sat, 
read_imageui, write_imageui)\n\ +CAST_FUN(F32, F32, float4, float4, convert_float4, read_imagef, write_imagef)\n\ +CAST_FUN(I32, F32, int4, float4, convert_float4, read_imagei, write_imagef)\n\ +CAST_FUN(U32, F32, uint4, float4, convert_float4, read_imageui, write_imagef)\n\ +\n\ +#define CAST_FUN_2D(src_name, dst_name, src_type, dst_type, conv_fun, read_fun, write_fun) \\\n\ +__kernel void cast_##src_name##to##dst_name##_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type src = read_fun(input, coord); \\\n\ + dst_type dst = 0; \\\n\ + dst = conv_fun(src); \\\n\ + write_fun(output, coord, dst); \\\n\ +}\n\ +\n\ +CAST_FUN_2D(F32, I32, float4, int4, convert_int4_sat, read_imagef, write_imagei)\n\ +CAST_FUN_2D(F32, U32, float4, uint4, convert_uint4_sat, read_imagef, write_imageui)\n\ +CAST_FUN_2D(I32, I32, int4, int4, convert_int4_sat, read_imagei, write_imagei)\n\ +CAST_FUN_2D(I32, U32, int4, uint4, convert_uint4_sat, read_imagei, write_imageui)\n\ +CAST_FUN_2D(U32, I32, uint4, int4, convert_int4_sat, read_imageui, write_imagei)\n\ +CAST_FUN_2D(U32, U32, uint4, uint4, convert_uint4_sat, read_imageui, write_imageui)\n\ +CAST_FUN_2D(F32, F32, float4, float4, convert_float4, read_imagef, write_imagef)\n\ +CAST_FUN_2D(I32, F32, int4, float4, convert_float4, read_imagei, write_imagef)\n\ +CAST_FUN_2D(U32, F32, uint4, float4, convert_float4, read_imageui, write_imagef)\n\ +\n\ +#define CAST_TO_BOOL_FUN(src_name, src_type, read_fun) \\\n\ +__kernel void cast_##src_name##toBOOL8( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + src_type src = read_fun(input, coord); \\\n\ + int4 dst = 0; \\\n\ + dst.x = (int)(src.x != 0); \\\n\ + write_imagei(output, coord, dst); \\\n\ +}\n\ +\n\ +CAST_TO_BOOL_FUN(F32, float4, read_imagef)\n\ +CAST_TO_BOOL_FUN(I32, int4, read_imagei)\n\ +CAST_TO_BOOL_FUN(U32, uint4, read_imageui)\n\ +\n\ +\n\ +#define CAST_TO_BOOL_FUN_2D(src_name, src_type, read_fun) \\\n\ +__kernel void cast_##src_name##toBOOL8_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type src = read_fun(input, coord); \\\n\ + int4 dst = 0; \\\n\ + dst.x = (int)(src.x != 0); \\\n\ + write_imagei(output, coord, dst); \\\n\ +}\n\ +\n\ +CAST_TO_BOOL_FUN_2D(F32, float4, read_imagef)\n\ +CAST_TO_BOOL_FUN_2D(I32, int4, read_imagei)\n\ +CAST_TO_BOOL_FUN_2D(U32, uint4, read_imageui)\n\ +\n\ +"; /* end of cast_cl*/ + +static const char clip_F32_cl[] = "__kernel void clip_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float4 src = read_imagef(input, coord);\n\ + float4 dst = src > minData ? src : minData;\n\ + dst = dst < maxData ? dst : maxData;\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void clip_F32toF32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + float4 src = read_imagef(input, coord);\n\ + float4 dst = src > minData ? src : minData;\n\ + dst = dst < maxData ? 
dst : maxData;\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void clip_F32toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float4 src = read_imagef(input, coord);\n\ + float4 result = src > minData ? src : minData;\n\ + result = result < maxData ? result : maxData;\n\ + uint4 dst = convert_uint4_rte(result * outputScale + outputZP);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void clip_F32toU8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float minData,\n\ + float maxData,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + float4 src = read_imagef(input, coord);\n\ + float4 result = src > minData ? src : minData;\n\ + result = result < maxData ? result : maxData;\n\ + uint4 dst = convert_uint4_rte(result * outputScale + outputZP);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +"; /* end of clip_F32_cl*/ + +static const char clip_U8_cl[] = "__kernel void clip_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\ + float4 result = src > minData ? src : minData;\n\ + result = result < maxData ? result : maxData;\n\ + uint4 dst = convert_uint4_rte(result * outputScale + outputZP);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void clip_U8toU8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float minData,\n\ + float maxData,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\ + float4 result = src > minData ? src : minData;\n\ + result = result < maxData ? result : maxData;\n\ + uint4 dst = convert_uint4_rte(result * outputScale + outputZP);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void clip_U8toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\ + float4 dst = src > minData ? src : minData;\n\ + dst = dst < maxData ? 
dst : maxData;\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void clip_U8toF32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float minData,\n\ + float maxData,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\ + float4 dst = src > minData ? src : minData;\n\ + dst = dst < maxData ? dst : maxData;\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +"; /* end of clip_U8_cl*/ + +static const char detect_post_box_cl[] = "float exp_(float x, float logE)\n\ +{\n\ + x *= logE;\n\ + x = exp2(x);\n\ + return x;\n\ +}\n\ +\n\ +__kernel void detect_post_box_F32_F32toF32(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float inv_scale_y,\n\ + float inv_scale_x,\n\ + float inv_scale_h,\n\ + float inv_scale_w,\n\ + float logE)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + float4 src0;\n\ + float4 src1;\n\ + float4 dst;\n\ + float4 tmp0, tmp1;\n\ + src0.x = read_imagef(input0, coord).x;\n\ + src1.x = read_imagef(input1, coord.xy).x;\n\ + coord.x++;\n\ + src0.y = read_imagef(input0, coord).x;\n\ + src1.y = read_imagef(input1, coord.xy).x;\n\ + coord.x++;\n\ + src0.z = read_imagef(input0, coord).x;\n\ + src1.z = read_imagef(input1, coord.xy).x;\n\ + coord.x++;\n\ + src0.w = read_imagef(input0, coord).x;\n\ + src1.w = read_imagef(input1, coord.xy).x;\n\ +\n\ + tmp0.x = src1.x + src1.z * src0.x * inv_scale_y;\n\ + tmp0.y = src1.y + src1.w * src0.y * inv_scale_x;\n\ + tmp1.x = src1.z * exp_(src0.z * inv_scale_h, logE) * 0.5f;\n\ + tmp1.y = src1.w * exp_(src0.w * inv_scale_w, logE) * 0.5f;\n\ + dst.xy = tmp0.xy - tmp1.xy;\n\ + dst.zw = tmp0.xy + tmp1.xy;\n\ + coord.x = 0;\n\ + write_imagef(output, coord, dst.xxxx);\n\ + coord.x++;\n\ + write_imagef(output, coord, dst.yyyy);\n\ + coord.x++;\n\ + write_imagef(output, coord, dst.zzzz);\n\ + coord.x++;\n\ + write_imagef(output, coord, dst.wwww);\n\ +}\n\ +\n\ +\n\ +__kernel void detect_post_box_U8_U8toF32(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float inv_scale_y,\n\ + float inv_scale_x,\n\ + float inv_scale_h,\n\ + float inv_scale_w,\n\ + float logE,\n\ + float input0Tail,\n\ + float input1Tail,\n\ + float input0Scale,\n\ + float input1Scale)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + uint4 in0, in1;\n\ + float4 src0;\n\ + float4 src1;\n\ + float4 dst;\n\ + float4 tmp0, tmp1;\n\ + in0.x = read_imageui(input0, coord).x;\n\ + in1.x = read_imageui(input1, coord.xy).x;\n\ + coord.x++;\n\ + in0.y = read_imageui(input0, coord).x;\n\ + in1.y = read_imageui(input1, coord.xy).x;\n\ + coord.x++;\n\ + in0.z = read_imageui(input0, coord).x;\n\ + in1.z = read_imageui(input1, coord.xy).x;\n\ + coord.x++;\n\ + in0.w = read_imageui(input0, coord).x;\n\ + in1.w = read_imageui(input1, coord.xy).x;\n\ +\n\ + src0 = convert_float4(in0) * input0Scale + input0Tail;\n\ + src1 = convert_float4(in1) * input1Scale + input1Tail;\n\ +\n\ + tmp0.x = src1.x + src1.z * src0.x * inv_scale_y;\n\ + tmp0.y = src1.y + src1.w * src0.y * inv_scale_x;\n\ + tmp1.x = src1.z * exp_(src0.z * inv_scale_h, logE) * 0.5f;\n\ + tmp1.y = src1.w * exp_(src0.w * inv_scale_w, logE) * 0.5f;\n\ + dst.xy = tmp0.xy - tmp1.xy;\n\ + dst.zw = tmp0.xy + 
tmp1.xy;\n\ + coord.x = 0;\n\ + write_imagef(output, coord, dst.xxxx);\n\ + coord.x++;\n\ + write_imagef(output, coord, dst.yyyy);\n\ + coord.x++;\n\ + write_imagef(output, coord, dst.zzzz);\n\ + coord.x++;\n\ + write_imagef(output, coord, dst.wwww);\n\ +}"; /* end of detect_post_box_cl*/ + +static const char eltwise_ops_helper_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#define readImage2DArray(Dest, Image, Coord) \\\n\ + do { \\\n\ + int8 desc; \\\n\ + _viv_asm(COPY, desc, Image, sizeof(desc)); \\\n\ + _viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \\\n\ + int baseAddr = (int)(Coord).w * desc.s4 + desc.s0; \\\n\ + _viv_asm(MOV, (Coord).w, baseAddr); \\\n\ + _viv_asm(IMAGE_READ_3D, Dest, Image, (Coord).xyww); \\\n\ + } while (0)\n\ +\n\ +#define writeImage2DArray(Image, Coord, Color) \\\n\ + do { \\\n\ + int8 desc; \\\n\ + _viv_asm(COPY, desc, Image, sizeof(desc)); \\\n\ + _viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \\\n\ + int baseAddr = (int)(Coord).w * desc.s4 + desc.s0; \\\n\ + _viv_asm(MOV, (Coord).w, baseAddr); \\\n\ + _viv_asm(IMAGE_WRITE_3D, Color, Image, (Coord).xyww); \\\n\ + } while (0)\n\ +\n\ +#define readImage(Dest, Image, Coord) \\\n\ + do { \\\n\ + _viv_asm(IMAGE_READ, Dest, Image, Coord); \\\n\ + } while (0)\n\ +\n\ +#define writeImage(Image, Coord, Color) \\\n\ + do { \\\n\ + _viv_asm(IMAGE_WRITE, Color, Image, Coord); \\\n\ + } while (0)\n\ +"; /* end of eltwise_ops_helper_cl*/ + +static const char eltwise_unary_cl[] = "\n\ +float4 eltwise_unary_sin(float4 x)\n\ +{\n\ + return native_sin(x);\n\ +}\n\ +\n\ +#define logE (1.44269502f)\n\ +#define twoLogE (logE * 2.0f)\n\ +float4 eltwise_unary_exp(float4 x)\n\ +{\n\ + x *= logE;\n\ + x = exp2(x);\n\ + return x;\n\ +}\n\ +\n\ +#define rlogE (0.693147182f)\n\ +float4 eltwise_unary_log(float4 x)\n\ +{\n\ + x = log2(x);\n\ + return x * rlogE;\n\ +}\n\ +\n\ +float4 eltwise_unary_elu(float4 val)\n\ +{\n\ + float4 x = val * logE;\n\ + x = exp2(x) - 1;\n\ +\n\ + return val < 0 ? 
x : val;\n\ +}\n\ +\n\ +float4 eltwise_unary_neg(float4 x)\n\ +{\n\ + return x * -1;\n\ +}\n\ +\n\ +float4 eltwise_unary_hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +\n\ +float4 _softrelu(float4 x)\n\ +{\n\ + x *= logE;\n\ + x = exp2(x);\n\ + x += 1;\n\ + x = log2(x);\n\ + return x * rlogE;\n\ +}\n\ +\n\ +float4 _tanh(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return (2 * x - 1);\n\ +}\n\ +\n\ +float4 eltwise_unary_mish(float4 x)\n\ +{\n\ + float4 y = _softrelu(x);\n\ + x = x * _tanh(y);\n\ + return x;\n\ +}\n\ +\n\ +#define ELTWISE_UNARY_F32(func_name) \\\n\ +__kernel void func_name##_F32toF32 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputZP \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + float4 src = read_imagef(input, coord); \\\n\ + \\\n\ + float4 dst = eltwise_unary_##func_name(src); \\\n\ + \\\n\ + write_imagef(output, coord, dst); \\\n\ +}\n\ +ELTWISE_UNARY_F32(sin)\n\ +ELTWISE_UNARY_F32(exp)\n\ +ELTWISE_UNARY_F32(log)\n\ +ELTWISE_UNARY_F32(elu)\n\ +ELTWISE_UNARY_F32(neg)\n\ +ELTWISE_UNARY_F32(mish)\n\ +ELTWISE_UNARY_F32(hard_sigmoid)\n\ +\n\ +#define ELTWISE_UNARY_F32_2D(func_name) \\\n\ +__kernel void func_name##_F32toF32_2D \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputZP \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + float4 src = read_imagef(input, coord); \\\n\ + \\\n\ + float4 dst = eltwise_unary_##func_name(src); \\\n\ + \\\n\ + write_imagef(output, coord, dst); \\\n\ +}\n\ +ELTWISE_UNARY_F32_2D(sin)\n\ +ELTWISE_UNARY_F32_2D(exp)\n\ +ELTWISE_UNARY_F32_2D(log)\n\ +ELTWISE_UNARY_F32_2D(elu)\n\ +ELTWISE_UNARY_F32_2D(neg)\n\ +ELTWISE_UNARY_F32_2D(mish)\n\ +ELTWISE_UNARY_F32_2D(hard_sigmoid)\n\ +\n\ +#define ELTWISE_UNARY_U8(func_name) \\\n\ +__kernel void func_name##_U8toU8 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputZP \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + uint4 src = read_imageui(input, coord); \\\n\ + float4 data = convert_float4(src) * inputScale - inputTail; \\\n\ + \\\n\ + data = eltwise_unary_##func_name(data); \\\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ + \\\n\ + write_imageui(output, coord, dst); \\\n\ +}\n\ +ELTWISE_UNARY_U8(sin)\n\ +ELTWISE_UNARY_U8(exp)\n\ +ELTWISE_UNARY_U8(log)\n\ +ELTWISE_UNARY_U8(elu)\n\ +ELTWISE_UNARY_U8(neg)\n\ +ELTWISE_UNARY_U8(mish)\n\ +ELTWISE_UNARY_U8(hard_sigmoid)\n\ +\n\ +#define ELTWISE_UNARY_U8_2D(func_name) \\\n\ +__kernel void func_name##_U8toU8_2D \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputZP \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + uint4 src = read_imageui(input, coord); \\\n\ + float4 data = convert_float4(src) * inputScale - inputTail; \\\n\ + \\\n\ + data = 
eltwise_unary_##func_name(data); \\\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ + \\\n\ + write_imageui(output, coord, dst); \\\n\ +}\n\ +ELTWISE_UNARY_U8_2D(sin)\n\ +ELTWISE_UNARY_U8_2D(exp)\n\ +ELTWISE_UNARY_U8_2D(log)\n\ +ELTWISE_UNARY_U8_2D(elu)\n\ +ELTWISE_UNARY_U8_2D(neg)\n\ +ELTWISE_UNARY_U8_2D(mish)\n\ +ELTWISE_UNARY_U8_2D(hard_sigmoid)\n\ +\n\ +\n\ +__kernel void neg_I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 src = read_imagei(input, coord);\n\ +\n\ + int4 dst = -src;\n\ +\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void neg_I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 src = read_imagei(input, coord);\n\ +\n\ + int4 dst = -src;\n\ +\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +"; /* end of eltwise_unary_cl*/ + +static const char floordiv_cl[] = "__kernel void floordiv_F32F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float4 src0;\n\ + float4 src1;\n\ + readImage2DArray(src0, input, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ + float4 dst = floor(src0 / src1);\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void floordiv_F32F32toF32_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + float4 src0 = read_imagef(input, coord);\n\ + float4 src1 = read_imagef(input1, coord);\n\ + float4 dst = floor(src0 / src1);\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void floordiv_I32I32toI32(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 src0;\n\ + int4 src1;\n\ + readImage2DArray(src0, input, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ + int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1)));\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void floordiv_I32I32toI32_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 src0 = read_imagei(input, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ + int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1)));\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void floordiv_U8U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + uint4 src0, src1;\n\ + float4 in0, in1, out;\n\ + readImage2DArray(src0, input, 
coord);\n\ + readImage2DArray(src1, input1, coord);\n\ + in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + out = floor(in0 / in1) * outputScale + outputTail;\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void floordiv_U8U8toU8_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + uint4 src0 = read_imageui(input, coord);\n\ + uint4 src1 = read_imageui(input1, coord);\n\ + float4 in0, in1, out;\n\ + in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + out = floor(in0 / in1) * outputScale + outputTail;\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +"; /* end of floordiv_cl*/ + +static const char gather_cl[] = "__kernel void gather_U8toU8(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + uint4 data = read_imageui(input0, coord_in.zw);\n\ +\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + write_imageui(output, coord, data);\n\ +}\n\ +\n\ +__kernel void gather_F16toF16(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + float4 data = read_imagef(input0, coord_in.zw);\n\ +\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + write_imagef(output, coord, data);\n\ +}\n\ +\n\ +__kernel void gather_I32toI32(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + int4 data = read_imagei(input0, coord_in.zw);\n\ +\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + write_imagei(output, coord, data);\n\ +}\n\ +\n\ +__kernel void gather_F32toF32(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ 
+ int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + float4 data = read_imagef(input0, coord_in.zw);\n\ +\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + write_imagef(output, coord, data);\n\ +}\n\ +"; /* end of gather_cl*/ + +static const char gather_nd_cl[] = "__kernel void gather_nd_U8toU8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + coord.w = indice.x;\n\ +\n\ + uint4 data = read_imageui(input0, coord.zw);\n\ + write_imageui(output, coord.zy, data);\n\ +}\n\ +\n\ +__kernel void gather_nd_F16toF16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + coord.w = indice.x;\n\ +\n\ + float4 data = read_imagef(input0, coord.zw);\n\ + write_imagef(output, coord.zy, data);\n\ +}\n\ +\n\ +__kernel void gather_nd_I32toI32_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + coord.w = indice.x;\n\ +\n\ + int4 data = read_imagei(input0, coord.zw);\n\ + write_imagei(output, coord.zy, data);\n\ +}\n\ +\n\ +__kernel void gather_nd_F32toF32_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + coord.w = indice.x;\n\ +\n\ + float4 data = read_imagef(input0, coord.zw);\n\ + write_imagef(output, coord.zy, data);\n\ +}\n\ +\n\ +//2D\n\ +__kernel void gather_nd_U8toU8_2D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 1);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + int4 indice1 = read_imagei(input1, coord.wy);\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.y = indice1.x;\n\ +\n\ + uint4 data = read_imageui(input0, indice.xy);\n\ + write_imageui(output, coord.zy, data);\n\ +}\n\ +\n\ +__kernel void gather_nd_F16toF16_2D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 1);\n\ + int4 indice = 
read_imagei(input1, coord.xy);\n\ + int4 indice1 = read_imagei(input1, coord.wy);\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.y = indice1.x;\n\ +\n\ + float4 data = read_imagef(input0, indice.xy);\n\ + write_imagef(output, coord.zy, data);\n\ +}\n\ +\n\ +__kernel void gather_nd_I32toI32_2D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 1);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + int4 indice1 = read_imagei(input1, coord.wy);\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.y = indice1.x;\n\ +\n\ + int4 data = read_imagei(input0, indice.xy);\n\ + write_imagei(output, coord.zy, data);\n\ +}\n\ +\n\ +__kernel void gather_nd_F32toF32_2D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 1);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + int4 indice1 = read_imagei(input1, coord.wy);\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.y = indice1.x;\n\ +\n\ + float4 data = read_imagef(input0, indice.xy);\n\ + write_imagef(output, coord.zy, data);\n\ +}\n\ +"; /* end of gather_nd_cl*/ + +static const char gather_nd_3d_cl[] = "__kernel void gather_nd_U8toU8_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, 1, 2);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + int4 indice1 = read_imagei(input1, coord.zy);\n\ + int4 indice2 = read_imagei(input1, coord.wy);\n\ + indice = (int4)(indice.x * block_size + gidx, indice1.x, indice2.x, 0);\n\ + coord.z = gidx;\n\ +\n\ + uint4 data = read_imageui(input0, indice);\n\ + write_imageui(output, coord.zy, data);\n\ +}\n\ +\n\ +__kernel void gather_nd_F16toF16_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord = (int4)(0, gidy, 1, 2);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + int4 indice1 = read_imagei(input1, coord.zy);\n\ + int4 indice2 = read_imagei(input1, coord.wy);\n\ + indice = (int4)(indice.x * block_size + gidx, indice1.x, indice2.x, 0);\n\ + coord.z = gidx;\n\ +\n\ + float4 data = read_imagef(input0, indice);\n\ + write_imagef(output, coord.zy, data);\n\ +}\n\ +\n\ +__kernel void gather_nd_I32toI32_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord = (int4)(0, gidy, 1, 2);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + int4 indice1 = read_imagei(input1, coord.zy);\n\ + int4 
indice2 = read_imagei(input1, coord.wy);\n\ + indice = (int4)(indice.x * block_size + gidx, indice1.x, indice2.x, 0);\n\ + coord.z = gidx;\n\ +\n\ + int4 data = read_imagei(input0, indice);\n\ + write_imagei(output, coord.zy, data);\n\ +}\n\ +\n\ +__kernel void gather_nd_F32toF32_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord = (int4)(0, gidy, 1, 2);\n\ + int4 indice = read_imagei(input1, coord.xy);\n\ + int4 indice1 = read_imagei(input1, coord.zy);\n\ + int4 indice2 = read_imagei(input1, coord.wy);\n\ + indice = (int4)(indice.x * block_size + gidx, indice1.x, indice2.x, 0);\n\ + coord.z = gidx;\n\ +\n\ + float4 data = read_imagef(input0, indice);\n\ + write_imagef(output, coord.zy, data);\n\ +}\n\ +"; /* end of gather_nd_3d_cl*/ + +static const char grucell_activation_cl[] = "__kernel void grucell_activation(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output)\n\ +{\n\ +\n\ +}\n\ +"; /* end of grucell_activation_cl*/ + +static const char grucell_activation_sma_cl[] = "__kernel void grucell_activation_sma(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output)\n\ +{\n\ +\n\ +}\n\ +"; /* end of grucell_activation_sma_cl*/ + +static const char hswish_cl[] = "#define HSWISH_F32_F32_PROCESS() \\\n\ + float4 src, tmp, dst; \\\n\ + src = read_imagef(input, coord); \\\n\ + tmp = src + 3; \\\n\ + tmp = tmp > 0 ? tmp : 0; \\\n\ + tmp = tmp < 6 ? tmp : 6; \\\n\ + dst = src * tmp / 6.0f; \\\n\ + write_imagef(output, coord, dst);\n\ +\n\ +__kernel void hswish_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + HSWISH_F32_F32_PROCESS()\n\ +}\n\ +\n\ +__kernel void hswish_F32toF32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + HSWISH_F32_F32_PROCESS()\n\ +}\n\ +\n\ +\n\ +#define HSWISH_U8_U8_PROCESS() \\\n\ + float4 src, tmp, data; \\\n\ + uint4 src0 = read_imageui(input, coord); \\\n\ + src = convert_float4(src0) * inputScale - inputTail; \\\n\ + tmp = src + 3; \\\n\ + tmp = tmp > 0 ? tmp : 0; \\\n\ + tmp = tmp < 6 ? 
tmp : 6; \\\n\ + data = src * tmp / 6.0f; \\\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ + write_imageui(output, coord, dst);\n\ +\n\ +__kernel void hswish_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + HSWISH_U8_U8_PROCESS()\n\ +}\n\ +\n\ +__kernel void hswish_U8toU8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + HSWISH_U8_U8_PROCESS()\n\ +}\n\ +\n\ +\n\ +#define HSWISH_I32_I32_PROCESS() \\\n\ + int4 tmp, dst, src; \\\n\ + src = read_imagei(input, coord); \\\n\ + tmp = src + 3; \\\n\ + tmp = tmp > 0 ? tmp : 0; \\\n\ + tmp = tmp < 6 ? tmp : 6; \\\n\ + dst = src * tmp / 6; \\\n\ + write_imagei(output, coord, dst);\n\ +\n\ +__kernel void hswish_I32toI32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + HSWISH_I32_I32_PROCESS()\n\ +}\n\ +\n\ +__kernel void hswish_I32toI32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + HSWISH_I32_I32_PROCESS()\n\ +}\n\ +"; /* end of hswish_cl*/ + +static const char instance_normalization_f16_cl[] = "__kernel void instance_norm_meanvari_F16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + data = read_imagef(input, coord);\n\ + coord.y++;\n\ + sum += data.x;\n\ + sqr += data.x * data.x;\n\ + }\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void instance_norm_meanvari_F16_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = 
get_local_id(0);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + for(; coord.y < endH;)\n\ + {\n\ + data = read_imagef(input, coord);\n\ + coord.y++;\n\ + sum += data.x;\n\ + sqr += data.x * data.x;\n\ + }\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void instance_norm_F16toF16(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int output_zp,\n\ + float output_scale,\n\ + float output_fl,\n\ + int width,\n\ + int height,\n\ + float dim_ratio,\n\ + int group_num\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord_para = (int4)(0, gidz, 0, 0);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.yx);\n\ + float4 beta = read_imagef(bias, coord_para.yx);\n\ + float4 mean_vari = (float4)(0);\n\ + float scale_vari, bias_val;\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x++;\n\ + mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x+=3;\n\ + }\n\ + mean_vari *= dim_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + float4 data, dst;\n\ + for(coord.y = 0; coord.y < height;coord.y++)\n\ + {\n\ + data = read_imagef(input, coord);\n\ +\n\ + dst.x = data.x * scale_vari + bias_val;\n\ + write_imagef(output, coord, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void instance_norm_F16toF16_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int output_zp,\n\ + float output_scale,\n\ + float output_fl,\n\ + int width,\n\ + int height,\n\ + float dim_ratio,\n\ + int group_num\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int2 coord = (int2)(get_global_id(0), gidy);\n\ + int2 coord_para = (int2)(0, gidz);\n\ + int endH = gidy + height;\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.yx);\n\ + float4 beta = read_imagef(bias, coord_para.yx);\n\ + float4 mean_vari = (float4)(0);\n\ + float scale_vari, bias_val;\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ + 
coord_para.x++;\n\ + mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x+=3;\n\ + }\n\ + mean_vari *= dim_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + float4 data, dst;\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + data = read_imagef(input, coord);\n\ +\n\ + dst.x = data.x * scale_vari + bias_val;\n\ + write_imagef(output, coord, dst);\n\ + }\n\ +}\n\ +"; /* end of instance_normalization_f16_cl*/ + +static const char instance_normalization_f32_cl[] = "__kernel void instance_norm_meanvari_F32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + data = read_imagef(input, coord);\n\ + coord.y++;\n\ + sum += data.x;\n\ + sqr += data.x * data.x;\n\ + }\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void instance_norm_meanvari_F32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + for(; coord.y < endH;)\n\ + {\n\ + data = read_imagef(input, coord);\n\ + coord.y++;\n\ + sum += data.x;\n\ + sqr += data.x * data.x;\n\ + }\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void 
instance_norm_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int output_zp,\n\ + float output_scale,\n\ + float output_fl,\n\ + int width,\n\ + int height,\n\ + float dim_ratio,\n\ + int group_num\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord_para = (int4)(0, gidz, 0, 0);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.yx);\n\ + float4 beta = read_imagef(bias, coord_para.yx);\n\ + float4 mean_vari = (float4)(0);\n\ + float scale_vari, bias_val;\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x++;\n\ + mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x+=3;\n\ + }\n\ + mean_vari *= dim_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + float4 data, dst;\n\ + for(coord.y = 0; coord.y < height;coord.y++)\n\ + {\n\ + data = read_imagef(input, coord);\n\ +\n\ + dst.x = data.x * scale_vari + bias_val;\n\ + write_imagef(output, coord, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void instance_norm_F32toF32_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int output_zp,\n\ + float output_scale,\n\ + float output_fl,\n\ + int width,\n\ + int height,\n\ + float dim_ratio,\n\ + int group_num\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int2 coord = (int2)(get_global_id(0), gidy);\n\ + int2 coord_para = (int2)(0, gidz);\n\ + int endH = gidy + height;\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.yx);\n\ + float4 beta = read_imagef(bias, coord_para.yx);\n\ + float4 mean_vari = (float4)(0);\n\ + float scale_vari, bias_val;\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x++;\n\ + mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x+=3;\n\ + }\n\ + mean_vari *= dim_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + bias_val = beta.s0 - scale_vari * mean_vari.s0;\n\ +\n\ + float4 data, dst;\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + data = read_imagef(input, coord);\n\ +\n\ + dst.x = data.x * scale_vari + bias_val;\n\ + write_imagef(output, coord, dst);\n\ + }\n\ +}\n\ +"; /* end of instance_normalization_f32_cl*/ + +static const char instance_normalization_i32_cl[] = "__kernel void instance_norm_meanvari_I32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + int4 data;\n\ + float sum = 0, sqr = 0;\n\ + 
int tmpSum = 0;\n\ + float e2InScale = input_fl * input_fl;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + data = read_imagei(input, coord);\n\ + coord.y++;\n\ + tmpSum += data.x;\n\ + sqr += (data.x * data.x * e2InScale);\n\ + }\n\ + sum = tmpSum * input_fl;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void instance_norm_meanvari_I32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ + int4 data;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0;\n\ + float e2InScale = input_fl * input_fl;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + for(; coord.y < endH;)\n\ + {\n\ + data = read_imagei(input, coord);\n\ + coord.y++;\n\ + tmpSum += data.x;\n\ + sqr += (data.x * data.x * e2InScale);\n\ + }\n\ + sum = tmpSum * input_fl;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void instance_norm_I32toI32(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int output_zp,\n\ + float output_scale,\n\ + float output_fl,\n\ + int width,\n\ + int height,\n\ + float dim_ratio,\n\ + int group_num\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord_para = (int4)(0, gidz, 0, 0);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.yx);\n\ + float4 beta = read_imagef(bias, coord_para.yx);\n\ + float4 mean_vari = (float4)(0);\n\ + float scale_vari, bias_val;\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x++;\n\ + 
mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x+=3;\n\ + }\n\ + mean_vari *= dim_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = input_fl * output_fl * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_fl;\n\ +\n\ + int4 data, dst;\n\ + for(coord.y = 0; coord.y < height;coord.y++)\n\ + {\n\ + data = read_imagei(input, coord);\n\ +\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + dst = convert_int4_rte(norm);\n\ + write_imagei(output, coord, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void instance_norm_I32toI32_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int output_zp,\n\ + float output_scale,\n\ + float output_fl,\n\ + int width,\n\ + int height,\n\ + float dim_ratio,\n\ + int group_num\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int2 coord = (int2)(get_global_id(0), gidy);\n\ + int2 coord_para = (int2)(0, gidz);\n\ + int endH = gidy + height;\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.yx);\n\ + float4 beta = read_imagef(bias, coord_para.yx);\n\ + float4 mean_vari = (float4)(0);\n\ + float scale_vari, bias_val;\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x++;\n\ + mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x+=3;\n\ + }\n\ + mean_vari *= dim_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = input_fl * output_fl * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_fl;\n\ +\n\ + int4 data, dst;\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + data = read_imagei(input, coord);\n\ +\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + dst = convert_int4_rte(norm);\n\ + write_imagei(output, coord, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void instance_norm_I32toF32(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int output_zp,\n\ + float output_scale,\n\ + float output_fl,\n\ + int width,\n\ + int height,\n\ + float dim_ratio,\n\ + int group_num\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord_para = (int4)(0, gidz, 0, 0);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.yx);\n\ + float4 beta = read_imagef(bias, coord_para.yx);\n\ + float4 mean_vari = (float4)(0);\n\ + float scale_vari, bias_val;\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x++;\n\ + mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x+=3;\n\ + }\n\ + mean_vari *= dim_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = input_fl * 
scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + int4 data;\n\ + for(coord.y = 0; coord.y < height;coord.y++)\n\ + {\n\ + data = read_imagei(input, coord);\n\ +\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + write_imagef(output, coord, norm);\n\ + }\n\ +}\n\ +\n\ +__kernel void instance_norm_I32toF32_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int output_zp,\n\ + float output_scale,\n\ + float output_fl,\n\ + int width,\n\ + int height,\n\ + float dim_ratio,\n\ + int group_num\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int2 coord = (int2)(get_global_id(0), gidy);\n\ + int2 coord_para = (int2)(0, gidz);\n\ + int endH = gidy + height;\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.yx);\n\ + float4 beta = read_imagef(bias, coord_para.yx);\n\ + float4 mean_vari = (float4)(0);\n\ + float scale_vari, bias_val;\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x++;\n\ + mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x+=3;\n\ + }\n\ + mean_vari *= dim_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = input_fl * scale_vari;\n\ + bias_val = beta.s0 - scale_vari * mean_vari.s0;\n\ +\n\ + int4 data;\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + data = read_imagei(input, coord);\n\ +\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + write_imagef(output, coord, norm);\n\ + }\n\ +}\n\ +"; /* end of instance_normalization_i32_cl*/ + +static const char instance_normalization_u8_cl[] = "__kernel void instance_norm_meanvari_U8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + uint4 data;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + float e2InScale = input_scale * input_scale;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + data = read_imageui(input, coord);\n\ + coord.y++;\n\ + tmpSum += data.x;\n\ + tmpSqr += data.x * data.x;\n\ + }\n\ + sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ + sum = (tmpSum - height * input_zp) * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + 
write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void instance_norm_meanvari_U8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ + uint4 data;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + float e2InScale = input_scale * input_scale;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + for(; coord.y < endH;)\n\ + {\n\ + data = read_imageui(input, coord);\n\ + coord.y++;\n\ + tmpSum += data.x;\n\ + tmpSqr += data.x * data.x;\n\ + }\n\ + sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ + sum = (tmpSum - height * input_zp) * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void instance_norm_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int output_zp,\n\ + float output_scale,\n\ + float output_fl,\n\ + int width,\n\ + int height,\n\ + float dim_ratio,\n\ + int group_num\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord_para = (int4)(0, gidz, 0, 0);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.yx);\n\ + float4 beta = read_imagef(bias, coord_para.yx);\n\ + float4 mean_vari = (float4)(0);\n\ + float scale_vari, bias_val;\n\ + float scale_inOut = input_scale * output_scale;\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x++;\n\ + mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x+=3;\n\ + }\n\ + mean_vari *= dim_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ +\n\ + uint4 data, dst;\n\ + for(coord.y = 0; coord.y < height;coord.y++)\n\ + {\n\ + data = read_imageui(input, coord);\n\ + data.x -= input_zp;\n\ +\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + dst = convert_uint4_rte(norm);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void instance_norm_U8toU8_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + 
__read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int output_zp,\n\ + float output_scale,\n\ + float output_fl,\n\ + int width,\n\ + int height,\n\ + float dim_ratio,\n\ + int group_num\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int2 coord = (int2)(get_global_id(0), gidy);\n\ + int2 coord_para = (int2)(0, gidz);\n\ + int endH = gidy + height;\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.yx);\n\ + float4 beta = read_imagef(bias, coord_para.yx);\n\ + float4 mean_vari = (float4)(0);\n\ + float scale_vari, bias_val;\n\ + float scale_inOut = input_scale * output_scale;\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x++;\n\ + mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x+=3;\n\ + }\n\ + mean_vari *= dim_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ +\n\ + uint4 data, dst;\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + data = read_imageui(input, coord);\n\ + data.x -= input_zp;\n\ +\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + dst = convert_uint4_rte(norm);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void instance_norm_U8toF16(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ + float input_fl,\n\ + int output_zp,\n\ + float output_scale,\n\ + float output_fl,\n\ + int width,\n\ + int height,\n\ + float dim_ratio,\n\ + int group_num\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord_para = (int4)(0, gidz, 0, 0);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.yx);\n\ + float4 beta = read_imagef(bias, coord_para.yx);\n\ + float4 mean_vari = (float4)(0);\n\ + float scale_vari, bias_val;\n\ + float scale_inOut = input_scale * output_scale;\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x++;\n\ + mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x+=3;\n\ + }\n\ + mean_vari *= dim_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ +\n\ + uint4 data;\n\ + for(coord.y = 0; coord.y < height;coord.y++)\n\ + {\n\ + data = read_imageui(input, coord);\n\ + data.x -= input_zp;\n\ +\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + write_imagef(output, coord, norm);\n\ + }\n\ +}\n\ +\n\ +__kernel void instance_norm_U8toF16_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int input_zp,\n\ + float input_scale,\n\ 
+ float input_fl,\n\ + int output_zp,\n\ + float output_scale,\n\ + float output_fl,\n\ + int width,\n\ + int height,\n\ + float dim_ratio,\n\ + int group_num\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int2 coord = (int2)(get_global_id(0), gidy);\n\ + int2 coord_para = (int2)(0, gidz);\n\ + int endH = gidy + height;\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.yx);\n\ + float4 beta = read_imagef(bias, coord_para.yx);\n\ + float4 mean_vari = (float4)(0);\n\ + float scale_vari, bias_val;\n\ + float scale_inOut = input_scale * output_scale;\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x++;\n\ + mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ + coord_para.x+=3;\n\ + }\n\ + mean_vari *= dim_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ +\n\ + uint4 data;\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + data = read_imageui(input, coord);\n\ + data.x -= input_zp;\n\ +\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + write_imagef(output, coord, norm);\n\ + }\n\ +}\n\ +"; /* end of instance_normalization_u8_cl*/ + +static const char l2normalizescale_axis0_cl[] = "\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_axis0_F32_F32toF32_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t scale,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int axis_size,\n\ + float rsEps\n\ + )\n\ +{\n\ + int lidx = get_local_id(0);\n\ + int gidx = get_global_id(0);\n\ + float4 src, scale_value, result;\n\ + float sum = 0.0f, pSum = 0.0f, rsqrt_sum = 0.0f;\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + int2 coord_scale = (int2)(gidx, 0);\n\ + __local float lcl_sum[16];\n\ + for(; coord.x < axis_size; coord.x += 16)\n\ + {\n\ + src = read_imagef(input, coord);\n\ + pSum += (src.x * src.x);\n\ + }\n\ + lcl_sum[lidx] = pSum;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0];\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + float4 data0;\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sum = dot(data0, one);\n\ + rsqrt_sum = (sum == 0 ? 
rsEps : rsqrt(sum));\n\ + for(coord.x = gidx; coord.x < axis_size; coord.x += 16)\n\ + {\n\ + src = read_imagef(input, coord);\n\ + scale_value = read_imagef(scale, coord_scale);\n\ + result = src * rsqrt_sum * scale_value;\n\ + write_imagef(output, coord, result.xxxx);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_axis0_U8_F32toU8_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t scale,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int axis_size,\n\ + float rsEps,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int lidx = get_local_id(0);\n\ + int gidx = get_global_id(0);\n\ + float4 src, scale_value, result;\n\ + float sum = 0.0f, pSum = 0.0f, rsqrt_sum = 0.0f;\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + int2 coord_scale = (int2)(gidx, 0);\n\ + __local float lcl_sum[16];\n\ + for(; coord.x < axis_size; coord.x += 16)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\ + pSum += (src.x * src.x);\n\ + }\n\ + lcl_sum[lidx] = pSum;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0];\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + float4 data0;\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sum = dot(data0, one);\n\ + rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));\n\ + for(coord.x = gidx; coord.x < axis_size; coord.x += 16)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\ + scale_value = read_imagef(scale, coord_scale);\n\ + result = src * rsqrt_sum * scale_value;\n\ + uint4 dst = convert_uint4_rte(result * outputScale + outputZP);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ +}\n\ +\n\ +\n\ +"; /* end of l2normalizescale_axis0_cl*/ + +static const char l2normalizescale_axis1_cl[] = "\n\ +__kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_axis1_F32_F32toF32_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t scale,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int axis_size,\n\ + float rsEps\n\ + )\n\ +{\n\ + int lidx = get_local_id(1);\n\ + int gidy = get_global_id(1);\n\ + float4 src, scale_value, result;\n\ + float sum = 0.0f, pSum = 0.0f, rsqrt_sum = 0.0f;\n\ + int2 coord = (int2)(get_global_id(0), gidy );\n\ + int2 coord_scale = (int2)(gidy, 0);\n\ + __local float lcl_sum[16];\n\ + for(; coord.y < axis_size; coord.y += 16)\n\ + {\n\ + src = read_imagef(input, coord);\n\ + pSum += (src.x * src.x);\n\ + }\n\ + lcl_sum[lidx] = pSum;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0];\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + float4 data0;\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sum = dot(data0, one);\n\ + rsqrt_sum = (sum == 0 ? 
rsEps : rsqrt(sum));\n\ + for(coord.y = gidy; coord.y < axis_size; coord.y += 16)\n\ + {\n\ + src = read_imagef(input, coord);\n\ + scale_value = read_imagef(scale, coord_scale);\n\ + result = src * rsqrt_sum * scale_value;\n\ + write_imagef(output, coord, result.xxxx);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_axis1_U8_F32toU8_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t scale,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int axis_size,\n\ + float rsEps,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int lidx = get_local_id(1);\n\ + int gidy = get_global_id(1);\n\ + float4 src, scale_value, result;\n\ + float sum = 0.0f, pSum = 0.0f, rsqrt_sum = 0.0f;\n\ + int2 coord = (int2)(get_global_id(0), gidy );\n\ + int2 coord_scale = (int2)(gidy, 0);\n\ + __local float lcl_sum[16];\n\ + for(; coord.y < axis_size; coord.y += 16)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\ + pSum += (src.x * src.x);\n\ + }\n\ + lcl_sum[lidx] = pSum;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0];\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + float4 data0;\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sum = dot(data0, one);\n\ + rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));\n\ + for(coord.y = gidy; coord.y < axis_size; coord.y += 16)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\ + scale_value = read_imagef(scale, coord_scale);\n\ + result = src * rsqrt_sum * scale_value;\n\ + uint4 dst = convert_uint4_rte(result * outputScale + outputZP);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ +}\n\ +"; /* end of l2normalizescale_axis1_cl*/ + +static const char log_softmax_axis0_cl[] = "#define rlogE (0.693147182f)\n\ +float LOG(float x)\n\ +{\n\ + x = log2(x);\n\ + return x * rlogE;\n\ +}\n\ +\n\ +__kernel void log_softmax_axis0_F32toF32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float beta,\n\ + float scale,\n\ + float scaleOut,\n\ + float zpOut\n\ + )\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int width = get_image_width(input);\n\ + int4 coord_in = (int4)(0, y, z, 0);\n\ + float4 maxValue;\n\ + float4 src, dst = {0.0};\n\ +\n\ + // Find max element value which we'll use to ensure numerical stability\n\ + // taking advantage of the following equality:\n\ + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))\n\ + maxValue = read_imagef(input, coord_in);\n\ + for (coord_in.x = 1; coord_in.x < width; )\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + coord_in.x++;\n\ +\n\ + maxValue = maxValue > src ? 
maxValue : src;\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.x = 0; coord_in.x < width; )\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + coord_in.x++;\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.x = 0; coord_in.x < width; )\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ + write_imagef(output, coord_in, dst);\n\ + coord_in.x++;\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_axis0_F32toF32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + float beta,\n\ + float scale,\n\ + float scaleOut,\n\ + float zpOut\n\ + )\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int width = get_image_width(input);\n\ + int2 coord_in = (int2)(0, y);\n\ + float4 maxValue;\n\ + float4 src, dst = {0.0};\n\ +\n\ + // Find max element value which we'll use to ensure numerical stability\n\ + // taking advantage of the following equality:\n\ + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))\n\ + maxValue = read_imagef(input, coord_in);\n\ + for (coord_in.x = 1; coord_in.x < width; )\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + coord_in.x++;\n\ +\n\ + maxValue = maxValue > src ? maxValue : src;\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.0f;\n\ + for (coord_in.x = 0; coord_in.x < width; )\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + coord_in.x++;\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.x = 0; coord_in.x < width; )\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ + write_imagef(output, coord_in, dst);\n\ + coord_in.x++;\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_axis0_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float beta,\n\ + float scale,\n\ + float scaleOut,\n\ + float zpOut\n\ + )\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int width = get_image_width(input);\n\ + int4 coord_in = (int4)(0, y, z, 0);\n\ + float4 maxValue;\n\ + float4 src;\n\ + uint4 dst = {0};\n\ +\n\ + // Find max element value which we'll use to ensure numerical stability\n\ + // taking advantage of the following equality:\n\ + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))\n\ + maxValue = convert_float4(read_imageui(input, coord_in));\n\ + for (coord_in.x = 1; coord_in.x < width; )\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ + coord_in.x++;\n\ +\n\ + maxValue = maxValue > src ? 
maxValue : src;\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.x = 0; coord_in.x < width; )\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ + coord_in.x++;\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.x = 0; coord_in.x < width; )\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ +\n\ + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut);\n\ +\n\ + write_imageui(output, coord_in, dst);\n\ + coord_in.x++;\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_axis0_U8toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + float beta,\n\ + float scale,\n\ + float scaleOut,\n\ + float zpOut\n\ + )\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int width = get_image_width(input);\n\ + int2 coord_in = (int2)(0, y);\n\ + float4 maxValue;\n\ + float4 src;\n\ + uint4 dst = {0};\n\ +\n\ + // Find max element value which we'll use to ensure numerical stability\n\ + // taking advantage of the following equality:\n\ + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))\n\ + maxValue = convert_float4(read_imageui(input, coord_in));\n\ + for (coord_in.x = 1; coord_in.x < width; )\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ + coord_in.x++;\n\ +\n\ + maxValue = maxValue > src ? maxValue : src;\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.x = 0; coord_in.x < width; )\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ + coord_in.x++;\n\ +\n\ + sum += exp2((src.x - maxValue.x)*scale);\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.x = 0; coord_in.x < width; )\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ +\n\ + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut);\n\ + write_imageui(output, coord_in, dst);\n\ + coord_in.x++;\n\ + }\n\ +}\n\ +#undef rlogE\n\ +"; /* end of log_softmax_axis0_cl*/ + +static const char log_softmax_axis1_cl[] = "#define rlogE (0.693147182f)\n\ +\n\ +float LOG(float x)\n\ +{\n\ + x = log2(x);\n\ + return x * rlogE;\n\ +}\n\ +\n\ +__kernel void log_softmax_axis1_F32toF32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float beta,\n\ + float scale,\n\ + float scaleOut,\n\ + float zpOut\n\ + )\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int height = get_image_height(input);\n\ + int4 coord_in = (int4)(x, 0, z, 0);\n\ + float4 maxValue;\n\ + float4 src, dst = {0.0};\n\ +\n\ + // Find max element value which we'll use to ensure numerical stability\n\ + // taking advantage of the following equality:\n\ + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))\n\ + maxValue = read_imagef(input, coord_in);\n\ + for (coord_in.y = 1; coord_in.y < height; )\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + coord_in.y++;\n\ +\n\ + maxValue = maxValue > src ? 
maxValue : src;\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.y = 0; coord_in.y < height; )\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + coord_in.y++;\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.y = 0; coord_in.y < height; )\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ + write_imagef(output, coord_in, dst);\n\ + coord_in.y++;\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_axis1_F32toF32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + float beta,\n\ + float scale,\n\ + float scaleOut,\n\ + float zpOut\n\ + )\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int height = get_image_height(input);\n\ + int2 coord_in = (int2)(x, 0);\n\ + float4 maxValue;\n\ + float4 src, dst = {0.0};\n\ +\n\ + // Find max element value which we'll use to ensure numerical stability\n\ + // taking advantage of the following equality:\n\ + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))\n\ + maxValue = read_imagef(input, coord_in);\n\ + for (coord_in.y = 1; coord_in.y < height; )\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + coord_in.y++;\n\ +\n\ + maxValue = maxValue > src ? maxValue : src;\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.0f;\n\ + for (coord_in.y = 0; coord_in.y < height; )\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + coord_in.y++;\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = 1.0f * LOG(sum);\n\ + for (coord_in.y = 0; coord_in.y < height; )\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ + write_imagef(output, coord_in, dst);\n\ + coord_in.y++;\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_axis1_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float beta,\n\ + float scale,\n\ + float scaleOut,\n\ + float zpOut\n\ + )\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int height = get_image_height(input);\n\ + int4 coord_in = (int4)(x, 0, z, 0);\n\ + float4 maxValue;\n\ + float4 src;\n\ + uint4 dst = {0};\n\ +\n\ + // Find max element value which we'll use to ensure numerical stability\n\ + // taking advantage of the following equality:\n\ + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))\n\ + maxValue = convert_float4(read_imageui(input, coord_in));\n\ + for (coord_in.y = 1; coord_in.y < height; )\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ + coord_in.y++;\n\ +\n\ + maxValue = maxValue > src ? 
maxValue : src;\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.y = 0; coord_in.y < height; )\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ + coord_in.y++;\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.y = 0; coord_in.y < height; )\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ +\n\ + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut);\n\ +\n\ + write_imageui(output, coord_in, dst);\n\ + coord_in.y++;\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_axis1_U8toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + float beta,\n\ + float scale,\n\ + float scaleOut,\n\ + float zpOut\n\ + )\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int height = get_image_height(input);\n\ + int2 coord_in = (int2)(x, 0);\n\ + float4 maxValue;\n\ + float4 src;\n\ + uint4 dst = {0};\n\ +\n\ + // Find max element value which we'll use to ensure numerical stability\n\ + // taking advantage of the following equality:\n\ + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))\n\ + maxValue = convert_float4(read_imageui(input, coord_in));\n\ + for (coord_in.y = 1; coord_in.y < height; )\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ + coord_in.y++;\n\ +\n\ + maxValue = maxValue > src ? maxValue : src;\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.y = 0; coord_in.y < height; )\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ + coord_in.y++;\n\ +\n\ + sum += exp2((src.x - maxValue.x)*scale);\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.y = 0; coord_in.y < height; )\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ +\n\ + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut);\n\ + write_imageui(output, coord_in, dst);\n\ + coord_in.y++;\n\ + }\n\ +}\n\ +#undef rlogE\n\ +"; /* end of log_softmax_axis1_cl*/ + +static const char log_softmax_axis2_cl[] = "#define rlogE (0.693147182f)\n\ +float LOG(float x)\n\ +{\n\ + x = log2(x);\n\ + return x * rlogE;\n\ +}\n\ +\n\ +__kernel void log_softmax_axis2_F32toF32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float beta,\n\ + float scale,\n\ + float scaleOut,\n\ + float zpOut\n\ + )\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int depth = get_image_array_size(input);\n\ + int4 coord_in = (int4)(x, y, 0, 0);\n\ + float4 maxValue;\n\ + float4 src, dst = {0.0};\n\ +\n\ + // Find max element value which we'll use to ensure numerical stability\n\ + // taking advantage of the following equality:\n\ + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))\n\ + maxValue = read_imagef(input, coord_in);\n\ + for (coord_in.z = 1; coord_in.z < depth; )\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + coord_in.z++;\n\ +\n\ + maxValue = maxValue > src ? 
maxValue : src;\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.z = 0; coord_in.z < depth; )\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + coord_in.z++;\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.z = 0; coord_in.z < depth; )\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ + write_imagef(output, coord_in, dst);\n\ + coord_in.z++;\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_axis2_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float beta,\n\ + float scale,\n\ + float scaleOut,\n\ + float zpOut\n\ + )\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int depth = get_image_array_size(input);\n\ + int4 coord_in = (int4)(x, y, 0, 0);\n\ + float4 maxValue;\n\ + float4 src;\n\ + uint4 dst = {0};\n\ +\n\ + // Find max element value which we'll use to ensure numerical stability\n\ + // taking advantage of the following equality:\n\ + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))\n\ + maxValue = convert_float4(read_imageui(input, coord_in));\n\ + for (coord_in.z = 1; coord_in.z < depth; )\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ + coord_in.z++;\n\ +\n\ + maxValue = maxValue > src ? maxValue : src;\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.z = 0; coord_in.z < depth; )\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ + coord_in.z++;\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.z = 0; coord_in.z < depth; )\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ +\n\ + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut);\n\ +\n\ + write_imageui(output, coord_in, dst);\n\ + coord_in.z++;\n\ + }\n\ +}\n\ +#undef rlogE\n\ +"; /* end of log_softmax_axis2_cl*/ + +static const char logical_not_cl[] = "__kernel void logical_not_I8toI8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 src = read_imagei(input, coord);\n\ + int4 dst = !src;\n\ + dst.x = dst.x & 1;\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void logical_not_I8toI8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 src = read_imagei(input, coord);\n\ + int4 dst = !src;\n\ + dst.x = dst.x & 1;\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +"; /* end of logical_not_cl*/ + +static const char logical_ops_cl[] = "#define TENSORLOGICAL(name, lgc_op, lgc_op2) \\\n\ +__kernel void logical_##name##_I8toI8( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 src0; \\\n\ + int4 src1; \\\n\ + readImage2DArray(src0, input, coord); \\\n\ + readImage2DArray(src1, input1, coord); \\\n\ + int4 dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \\\n\ + dst.x = dst.x & 1; \\\n\ + write_imagei(output, coord, dst); \\\n\ +}\n\ +\n\ +TENSORLOGICAL(or, ||, )\n\ +TENSORLOGICAL(and, &&, )\n\ +TENSORLOGICAL(xor, ^, 
!!)\n\ +\n\ +\n\ +#define TENSORLOGICAL_2D(name, lgc_op, lgc_op2) \\\n\ +__kernel void logical_##name##_I8toI8_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + int4 src0 = read_imagei(input, coord); \\\n\ + int4 src1 = read_imagei(input1, coord); \\\n\ + int4 dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \\\n\ + dst.x = dst.x & 1; \\\n\ + write_imagei(output, coord, dst); \\\n\ +}\n\ +\n\ +TENSORLOGICAL_2D(or, ||, )\n\ +TENSORLOGICAL_2D(and, &&, )\n\ +TENSORLOGICAL_2D(xor, ^, !!)\n\ +"; /* end of logical_ops_cl*/ + +static const char lstmunit_activation_BP_F32_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_BP_F32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_BP_F32toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + src0 = read_imagef(input_i_conv, coord_in.xy); \\\n\ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = data_i_t + b0; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? 
clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_BP_F32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_BP_F32(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_BP_F32TOU8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_BP_F32toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + src0 = read_imagef(input_i_conv, coord_in.xy); \\\n\ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = data_i_t + b0; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_BP_F32TOU8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_BP_F32TOU8(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_BP_F32_cl*/ + +static const char lstmunit_activation_BP_U8_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_BP_U8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_BP_U8toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \\\n\ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_i_t = src0 + 
src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = data_i_t + b0; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_BP_U8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_BP_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define LSTM_ACTIVATION_BP_U8TOF32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_BP_U8toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \\\n\ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + 
b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = data_i_t + b0; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_BP_U8TOF32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_BP_U8TOF32(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_BP_U8_cl*/ + +static const char lstmunit_activation_B_F32_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_B_F32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_B_F32toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + src0 = read_imagef(input_i_conv, coord_in.xy); \\\n\ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + 
src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = data_i_t + b0; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ + write_imagef(h_state_out, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_B_F32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_B_F32(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_B_F32TOU8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_B_F32toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + src0 = read_imagef(input_i_conv, coord_in.xy); \\\n\ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = data_i_t + b0; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + 
b3; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ + write_imageui(h_state_out, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_B_F32TOU8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_B_F32TOU8(HARD_SIGMOID, hard_sigmoid)"; /* end of lstmunit_activation_B_F32_cl*/ + +static const char lstmunit_activation_B_U8_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_B_U8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_B_U8toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \\\n\ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + 
hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = data_i_t + b0; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ + write_imageui(h_state_out, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_B_U8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_B_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define LSTM_ACTIVATION_B_U8TOF32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_B_U8toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \\\n\ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = 
convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = data_i_t + b0; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ + write_imagef(h_state_out, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_B_U8TOF32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_B_U8TOF32(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_B_U8_cl*/ + +static const char lstmunit_activation_CBP_F32_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CBP_F32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CBP_F32toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b1, b2, b3; \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src3 = 
read_imagef(input_o_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CBP_F32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CBP_F32(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define LSTM_ACTIVATION_CBP_F32TOU8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CBP_F32toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b1, b2, b3; \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; 
\\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CBP_F32TOU8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CBP_F32TOU8(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CBP_F32_cl*/ + +static const char lstmunit_activation_CBP_U8_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CBP_U8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CBP_U8toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + 
data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CBP_U8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CBP_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CBP_U8TOF32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CBP_U8toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > 
clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CBP_U8TOF32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CBP_U8TOF32(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CBP_U8_cl*/ + +static const char lstmunit_activation_CB_F32_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CB_F32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CB_F32toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ + write_imagef(h_state_out, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CB_F32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CB_F32(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CB_F32TOU8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CB_F32toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ + write_imageui(h_state_out, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CB_F32TOU8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CB_F32TOU8(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CB_F32_cl*/ + +static const char lstmunit_activation_CB_U8_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CB_U8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CB_U8toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + 
data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ + write_imageui(h_state_out, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CB_U8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CB_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CB_U8TOF32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CB_U8toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = data_f_t + b1; \\\n\ + data_g_t = data_g_t + b2; \\\n\ + data_o_t = data_o_t + b3; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F 
? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ + write_imagef(h_state_out, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CB_U8TOF32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CB_U8TOF32(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CB_U8_cl*/ + +static const char lstmunit_activation_CLP_F32_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CLP_F32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CLP_F32toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b1, b2, b3; \\\n\ + float4 w1, w2, w3; \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 * w1 + b1; \\\n\ + data_g_t = src2 * w2 + b2; \\\n\ + data_o_t = src3 * w3 + b3; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CLP_F32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CLP_F32(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define LSTM_ACTIVATION_CLP_F32TOU8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CLP_F32toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b1, b2, b3; \\\n\ + float4 w1, w2, w3; \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 * w1 + b1; \\\n\ + data_g_t = src2 * w2 + b2; \\\n\ + data_o_t = src3 * w3 + b3; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CLP_F32TOU8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CLP_F32TOU8(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CLP_F32_cl*/ + +static const char lstmunit_activation_CLP_U8_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CLP_U8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CLP_U8toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b1, b2, b3; \\\n\ + float4 w1, w2, w3; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 * w1 + b1; \\\n\ + data_g_t = src2 * w2 + b2; \\\n\ + data_o_t = src3 * w3 + b3; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CLP_U8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CLP_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CLP_U8TOF32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CLP_U8toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b1, b2, b3; \\\n\ + float4 w1, w2, w3; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 * w1 + b1; \\\n\ + data_g_t = src2 * w2 + b2; \\\n\ + data_o_t = src3 * w3 + b3; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CLP_U8TOF32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CLP_U8TOF32(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CLP_U8_cl*/ + +static const char lstmunit_activation_CL_F32_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CL_F32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CL_F32toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b1, b2, b3; \\\n\ + float4 w1, w2, w3; \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 * w1 + b1; \\\n\ + data_g_t = src2 * w2 + b2; \\\n\ + data_o_t = src3 * w3 + b3; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ + write_imagef(h_state_out, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CL_F32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CL_F32(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CL_F32TOU8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CL_F32toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b1, b2, b3; \\\n\ + float4 w1, w2, w3; \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 * w1 + b1; \\\n\ + data_g_t = src2 * w2 + b2; \\\n\ + data_o_t = src3 * w3 + b3; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ + write_imageui(h_state_out, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CL_F32TOU8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CL_F32TOU8(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CL_F32_cl*/ + +static const char lstmunit_activation_CL_U8_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CL_U8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CL_U8toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b1, b2, b3; \\\n\ + float4 w1, w2, w3; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 * w1 + b1; \\\n\ + data_g_t = src2 * w2 + b2; \\\n\ + data_o_t = src3 * w3 + b3; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? 
clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ + write_imageui(h_state_out, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CL_U8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CL_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CL_U8TOF32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CL_U8toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b1, b2, b3; \\\n\ + float4 w1, w2, w3; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_f_t = src1 * w1 + b1; \\\n\ + data_g_t = src2 * w2 + b2; \\\n\ + data_o_t = src3 * w3 + b3; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ + write_imagef(h_state_out, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CL_U8TOF32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CL_U8TOF32(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CL_U8_cl*/ + +static const char lstmunit_activation_CSP_F32_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CSP_F32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CSP_F32toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CSP_F32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CSP_F32(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define LSTM_ACTIVATION_CSP_F32TOU8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CSP_F32toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CSP_F32TOU8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CSP_F32TOU8(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CSP_F32_cl*/ + +static const char lstmunit_activation_CSP_U8_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CSP_U8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CSP_U8toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CSP_U8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CSP_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CSP_U8TOF32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CSP_U8toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CSP_U8TOF32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CSP_U8TOF32(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CSP_U8_cl*/ + +static const char lstmunit_activation_CS_F32_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CS_F32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CS_F32toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ + write_imagef(h_state_out, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CS_F32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CS_F32(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define LSTM_ACTIVATION_CS_F32TOU8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CS_F32toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ + write_imageui(h_state_out, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CS_F32TOU8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CS_F32TOU8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +"; /* end of lstmunit_activation_CS_F32_cl*/ + +static const char lstmunit_activation_CS_U8_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CS_U8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CS_U8toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ + write_imageui(h_state_out, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CS_U8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CS_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_CS_U8TOF32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CS_U8toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src1, src2, src3; \\\n\ + float4 src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = 1.0f - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ + write_imagef(h_state_out, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_CS_U8TOF32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_CS_U8TOF32(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +"; /* end of lstmunit_activation_CS_U8_cl*/ + +static const char lstmunit_activation_LP_F32_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_LP_F32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_LP_F32toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wi, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + float4 w0, w1, w2, w3; \\\n\ + src0 = read_imagef(input_i_conv, coord_in.xy); \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_i_t = src0 * w0 + b0; \\\n\ + data_f_t = src1 * w1 + b1; \\\n\ + data_g_t = src2 * w2 + b2; \\\n\ + data_o_t = src3 * w3 + b3; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_LP_F32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_LP_F32(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_LP_F32_cl*/ + +static const char lstmunit_activation_L_F32_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_L_F32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_L_F32toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wi, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + float4 w0, w1, w2, w3; \\\n\ + src0 = read_imagef(input_i_conv, coord_in.xy); \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_i_t = src0 * w0 + b0; \\\n\ + data_f_t = src1 * w1 + b1; \\\n\ + data_g_t = src2 * w2 + b2; \\\n\ + data_o_t = src3 * w3 + b3; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ + write_imagef(h_state_out, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_L_F32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_L_F32(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_L_F32_cl*/ + +static const char lstmunit_activation_SP_F32_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_SP_F32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_SP_F32toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src0 = read_imagef(input_i_conv, coord_in.xy); \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_SP_F32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_SP_F32(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_SP_F32TOU8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_SP_F32toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src0 = read_imagef(input_i_conv, coord_in.xy); \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_SP_F32TOU8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_SP_F32TOU8(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_SP_F32_cl*/ + +static const char lstmunit_activation_SP_U8_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_SP_U8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_SP_U8toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \\\n\ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = 
act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_SP_U8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_SP_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_SP_U8TOF32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_SP_U8toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \\\n\ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_SP_U8TOF32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_SP_U8TOF32(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_SP_U8_cl*/ + +static const char lstmunit_activation_S_F32_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_S_F32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_S_F32toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src0 = read_imagef(input_i_conv, coord_in.xy); \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ + write_imagef(h_state_out, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_S_F32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_S_F32(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_S_F32TOU8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_S_F32toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src0 = read_imagef(input_i_conv, coord_in.xy); \\\n\ + src1 = read_imagef(input_f_conv, coord_in.xy); \\\n\ + src2 = read_imagef(input_c_conv, coord_in.xy); \\\n\ + src3 = read_imagef(input_o_conv, coord_in.xy); \\\n\ + src10 = read_imagef(hstate_i_conv, coord_in.xy); \\\n\ + src11 = read_imagef(hstate_f_conv, coord_in.xy); \\\n\ + src12 = read_imagef(hstate_c_conv, coord_in.xy); \\\n\ + src13 = read_imagef(hstate_o_conv, coord_in.xy); \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ + write_imageui(h_state_out, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_S_F32TOU8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_S_F32TOU8(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_S_F32_cl*/ + +static const char lstmunit_activation_S_U8_cl[] = "float4 sigmoid(float4 x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x, float logE)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x, float twoLogE)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_S_U8(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_S_U8toU8_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \\\n\ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + 
data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t * out_scale + out_zp; \\\n\ + uint4 data_o_u = convert_uint4_sat_rte(data_o_t); \\\n\ + write_imageui(output, coord_in.zy, data_o_u); \\\n\ + write_imageui(h_state_out, coord_in.zy, data_o_u); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_S_U8(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_S_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +\n\ +#define LSTM_ACTIVATION_S_U8TOF32(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_S_U8toF32_F32_##act_name( \\\n\ + __read_only image2d_t input_i_conv, \\\n\ + __read_only image2d_t input_f_conv, \\\n\ + __read_only image2d_t input_c_conv, \\\n\ + __read_only image2d_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t hstate_i_conv, \\\n\ + __read_only image2d_t hstate_f_conv, \\\n\ + __read_only image2d_t hstate_c_conv, \\\n\ + __read_only image2d_t hstate_o_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + float logE, float twoLogE, float forget_bias, float clip_Max_F, float clip_Min_F, \\\n\ + float in_fc_i_scale, float in_fc_i_tail, float in_fc_f_scale, float in_fc_f_tail, \\\n\ + float in_fc_c_scale, float in_fc_c_tail, float in_fc_o_scale, float in_fc_o_tail, \\\n\ + float hstate_i_scale, float hstate_i_tail, float hstate_f_scale, float hstate_f_tail, \\\n\ + float hstate_c_scale, float hstate_c_tail, float hstate_o_scale, float hstate_o_tail, \\\n\ + float out_scale, float out_zp) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + src0 = convert_float4(read_imageui(input_i_conv, coord_in.xy)) * in_fc_i_scale + in_fc_i_tail; \\\n\ + src10 = convert_float4(read_imageui(hstate_i_conv, coord_in.xy)) * hstate_i_scale + hstate_i_tail; \\\n\ + src1 = convert_float4(read_imageui(input_f_conv, coord_in.xy)) * in_fc_f_scale + in_fc_f_tail; \\\n\ + src11 = convert_float4(read_imageui(hstate_f_conv, coord_in.xy)) * hstate_f_scale + hstate_f_tail; \\\n\ + src2 = convert_float4(read_imageui(input_c_conv, coord_in.xy)) * in_fc_c_scale + in_fc_c_tail; \\\n\ + src12 = convert_float4(read_imageui(hstate_c_conv, coord_in.xy)) * hstate_c_scale + hstate_c_tail; \\\n\ + src3 = convert_float4(read_imageui(input_o_conv, coord_in.xy)) * in_fc_o_scale + in_fc_o_tail; \\\n\ + src13 = convert_float4(read_imageui(hstate_o_conv, coord_in.xy)) * hstate_o_scale + hstate_o_tail; \\\n\ + data_c_t = read_imagef(cell_state_in, coord_in.xy); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + data_i_t = act_func(data_i_t, logE); \\\n\ + data_f_t = act_func(data_f_t + forget_bias, logE); \\\n\ + data_g_t = tangentH(data_g_t, twoLogE); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t, logE); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + write_imagef(cell_state_out, coord_in.zy, data_c_t); \\\n\ + data_c_t = tangentH(data_c_t, twoLogE); \\\n\ + data_o_t = data_o_t * data_c_t; \\\n\ + write_imagef(output, coord_in.zy, data_o_t); \\\n\ + write_imagef(h_state_out, coord_in.zy, data_o_t); \\\n\ +}\n\ +\n\ +LSTM_ACTIVATION_S_U8TOF32(SIGMOID, sigmoid)\n\ +LSTM_ACTIVATION_S_U8TOF32(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_S_U8_cl*/ + +static const char matrixmul_cl[] = "__kernel void gemm_F32F32toF32_2D(\n\ + __read_only image2d_t inputA,\n\ + __read_only image2d_t inputB,\n\ + __write_only image2d_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ +\n\ + int2 coord_a = (int2)(0, gidy);\n\ + int2 coord_b = (int2)(gidx, 0);\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + for(; coord_a.x < K;)\n\ + {\n\ + float4 tempA0;\n\ + float4 tempB0;\n\ +\n\ + tempA0 = read_imagef(inputA, coord_a);\n\ + tempB0 = read_imagef(inputB, coord_b);\n\ + coord_a.x++;\n\ + coord_b.y++;\n\ +\n\ + sum += tempA0 * tempB0;\n\ + }\n\ +\n\ + coord_b.y = gidy;\n\ + write_imagef(output, coord_b, sum);\n\ +}\n\ +\n\ +__kernel void gemm_F32F32toF32_3D(\n\ + __read_only image2d_array_t inputA,\n\ + __read_only image2d_array_t inputB,\n\ + __write_only image2d_array_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero\n\ + )\n\ +{\n\ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0);\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + for(; coord_a.x < K;)\n\ + {\n\ + float4 tempA0;\n\ + float4 tempB0;\n\ +\n\ + tempA0 = read_imagef(inputA, coord_a);\n\ + tempB0 = read_imagef(inputB, coord_b);\n\ + coord_a.x++;\n\ + coord_b.y++;\n\ +\n\ + sum += tempA0 * tempB0;\n\ + }\n\ +\n\ + coord_b.y = get_global_id(1);\n\ + coord_b.z = get_global_id(2);\n\ + write_imagef(output, coord_b, sum);\n\ +}\n\ +"; /* end of matrixmul_cl*/ + +static const char matrixmul_transA_cl[] = "__kernel void gemm_transa_F32F32toF32_2D(\n\ + __read_only image2d_t inputA,\n\ + __read_only image2d_t inputB,\n\ + __write_only image2d_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ +\n\ + int2 coord_a = (int2)(gidy, 0);\n\ + int2 coord_b = (int2)(gidx, 0);\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + for(; coord_a.y < K;)\n\ + {\n\ + float4 tempA0;\n\ + float4 tempB0;\n\ +\n\ + tempA0 = read_imagef(inputA, coord_a);\n\ + tempB0 = read_imagef(inputB, coord_b);\n\ + coord_a.y++;\n\ + coord_b.y++;\n\ +\n\ + sum += tempA0 * tempB0;\n\ + }\n\ +\n\ + coord_b.y = gidy;\n\ + write_imagef(output, coord_b, sum);\n\ +}\n\ +\n\ +__kernel void gemm_transa_F32F32toF32_3D(\n\ + __read_only image2d_array_t inputA,\n\ + __read_only image2d_array_t inputB,\n\ + __write_only image2d_array_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ +\n\ + int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(gidx, 0, (bc2zero ? 
0 : get_global_id(2)), 0);\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + for(; coord_a.y < K;)\n\ + {\n\ + float4 tempA0;\n\ + float4 tempB0;\n\ +\n\ + tempA0 = read_imagef(inputA, coord_a);\n\ + tempB0 = read_imagef(inputB, coord_b);\n\ + coord_a.y++;\n\ + coord_b.y++;\n\ +\n\ + sum += tempA0 * tempB0;\n\ + }\n\ +\n\ + coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2);\n\ + write_imagef(output, coord_b, sum);\n\ +}\n\ +"; /* end of matrixmul_transA_cl*/ + +static const char maximum_cl[] = "__kernel void maximum_FP32FP32toFP32\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + float4 src0;\n\ + float4 src1;\n\ + readImage2DArray(src0, input0, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ +\n\ + float4 dst = src0 > src1 ? src0 : src1;\n\ +\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void maximum_FP32FP32toFP32_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + float4 src0 = read_imagef(input0, coord);\n\ + float4 src1 = read_imagef(input1, coord);\n\ +\n\ + float4 dst = src0 > src1 ? src0 : src1;\n\ +\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void maximum_U8U8toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + uint4 src0;\n\ + uint4 src1;\n\ + readImage2DArray(src0, input0, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ +\n\ + float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ + float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ + float4 data = data0 > data1 ? data0 : data1;\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP);\n\ +\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void maximum_U8U8toU8_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + uint4 src0 = read_imageui(input0, coord);\n\ + uint4 src1 = read_imageui(input1, coord);\n\ +\n\ + float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ + float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ + float4 data = data0 > data1 ? 
data0 : data1;\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP);\n\ +\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +\n\ +__kernel void maximum_I32I32toI32\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int4 src0;\n\ + int4 src1;\n\ + readImage2DArray(src0, input0, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ +\n\ + int4 dst = src0 > src1 ? src0 : src1;\n\ +\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void maximum_I32I32toI32_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + int4 src0 = read_imagei(input0, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ +\n\ + int4 dst = src0 > src1 ? src0 : src1;\n\ +\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +"; /* end of maximum_cl*/ + +static const char minimum_cl[] = "__kernel void minimum_FP32FP32toFP32\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + float4 src0;\n\ + float4 src1;\n\ + readImage2DArray(src0, input0, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ +\n\ + float4 dst = src0 < src1 ? src0 : src1;\n\ +\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void minimum_FP32FP32toFP32_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + float4 src0 = read_imagef(input0, coord);\n\ + float4 src1 = read_imagef(input1, coord);\n\ +\n\ + float4 dst = src0 < src1 ? src0 : src1;\n\ +\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void minimum_U8U8toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + uint4 src0;\n\ + uint4 src1;\n\ + readImage2DArray(src0, input0, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ +\n\ + float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ + float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ + float4 data = data0 < data1 ? 
data0 : data1;\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP);\n\ +\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void minimum_U8U8toU8_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + uint4 src0 = read_imageui(input0, coord);\n\ + uint4 src1 = read_imageui(input1, coord);\n\ +\n\ + float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ + float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ + float4 data = data0 < data1 ? data0 : data1;\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP);\n\ +\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +\n\ +__kernel void minimum_I32I32toI32\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int4 src0;\n\ + int4 src1;\n\ + readImage2DArray(src0, input0, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ +\n\ + int4 dst = src0 < src1 ? src0 : src1;\n\ +\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void minimum_I32I32toI32_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + int4 src0 = read_imagei(input0, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ +\n\ + int4 dst = src0 < src1 ? 
src0 : src1;\n\ +\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +"; /* end of minimum_cl*/ + +static const char moments_axis0_cl[] = "__kernel void moments_axis0_U8toF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output_mean,\n\ + __write_only image2d_t output_vari,\n\ + int axis,\n\ + int axis_num,\n\ + int input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + float dimRatio\n\ + )\n\ +{\n\ + int gidy = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ +\n\ + int4 coord0 = (int4)(0, gidy, gidz, 0);\n\ + uint data;\n\ + float sum = 0, sqr = 0;\n\ + uint tmpSum = 0, tmpSqr = 0;\n\ + float e2InScale = input_scale * input_scale;\n\ +\n\ + {\n\ + for(coord0.x = 0; coord0.x < width;)\n\ + {\n\ + data = read_imageui(input, coord0).x;\n\ + coord0.x++;\n\ + tmpSum += (data);\n\ + tmpSqr += (data * data);\n\ + }\n\ + sqr = convert_float(tmpSqr - 2 * input_zp * tmpSum + width * input_zp * input_zp) * e2InScale;\n\ + sum = convert_float(tmpSum - width * input_zp) * input_scale;\n\ + }\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + int2 coord_out = (int2)(gidy, gidz);\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ +}\n\ +\n\ +#define MOMENTS_AXIS0_F(src0_type_name) \\\n\ +__kernel void moments_axis0_##src0_type_name##to##src0_type_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_t output_mean, \\\n\ + __write_only image2d_t output_vari, \\\n\ + int axis, int axis_num, int input_zp, float input_scale, \\\n\ + int width, int height, int chn, float dimRatio \\\n\ + ) \\\n\ +{ \\\n\ + int gidy = get_global_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + \\\n\ + int4 coord0 = (int4)(0, gidy, gidz, 0); \\\n\ + float data; \\\n\ + float sum = 0, sqr = 0; \\\n\ + \\\n\ + for(coord0.x = 0; coord0.x < width;) \\\n\ + { \\\n\ + data = read_imagef(input, coord0).x; \\\n\ + coord0.x++; \\\n\ + sum += (data); \\\n\ + sqr += (data * data); \\\n\ + } \\\n\ + \\\n\ + float4 mean, vari; \\\n\ + mean.x = sum * dimRatio; \\\n\ + vari.x = sqr * dimRatio; \\\n\ + vari.x = vari.x - mean.x * mean.x; \\\n\ + \\\n\ + int2 coord_out = (int2)(gidy, gidz); \\\n\ + write_imagef(output_mean, coord_out, mean); \\\n\ + write_imagef(output_vari, coord_out, vari); \\\n\ +}\n\ +MOMENTS_AXIS0_F(F16)\n\ +MOMENTS_AXIS0_F(F32)\n\ +\n\ +__kernel void moments_axis0_I32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output_mean,\n\ + __write_only image2d_t output_vari,\n\ + int axis,\n\ + int axis_num,\n\ + int input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + float dimRatio\n\ + )\n\ +{\n\ + int gidy = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ +\n\ + int4 coord0 = (int4)(0, gidy, gidz, 0);\n\ + int data;\n\ + int sum = 0, sqr = 0;\n\ +\n\ + for(coord0.x = 0; coord0.x < width;)\n\ + {\n\ + data = read_imagei(input, coord0).x;\n\ + coord0.x++;\n\ + sum += (data);\n\ + sqr += (data * data);\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + int2 coord_out = (int2)(gidy, gidz);\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ +}"; /* end of moments_axis0_cl*/ + +static const char moments_axis01_cl[] = "__kernel void moments_axis01_U8toF16(\n\ + image2d_array_t input, image2d_t 
output_mean, image2d_t output_vari,\n\ + int axis, int axis_num, int input_zp, float input_scale,\n\ + int width, int height, int chn, float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + uint4 data;\n\ + float sum = 0, sqr = 0;\n\ + float e2InScale = input_scale * input_scale;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(coord.x = gidx; coord.x < width; coord.x += 16)\n\ + {\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + data = read_imageui(input, coord);\n\ + coord.y++;\n\ + tmpSum += data.x;\n\ + tmpSqr += data.x * data.x;\n\ + }\n\ + sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ + sum += (tmpSum - height * input_zp) * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(gidz, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ + }\n\ +}\n\ +\n\ +#define MOMENTS_AXIS01_F(src0_type_name) \\\n\ +__kernel void moments_axis01_##src0_type_name##to##src0_type_name( \\\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \\\n\ + int axis, int axis_num, int input_zp, float input_scale, \\\n\ + int width, int height, int chn, float dimRatio \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int lidx = get_local_id(0); \\\n\ + \\\n\ + int4 coord = (int4)(gidx, 0, gidz, 0); \\\n\ + float4 data; \\\n\ + float sum = 0, sqr = 0; \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + \\\n\ + for(coord.x = gidx; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + data = read_imagef(input, coord); \\\n\ + coord.y++; \\\n\ + sum += data.x; \\\n\ + sqr += data.x * data.x; \\\n\ + } \\\n\ + } \\\n\ + lcl_sum[lidx] = sum; \\\n\ + lcl_sqr[lidx] = sqr; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int2 coord_out = (int2)(gidz, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + \\\n\ + sum = 0; sqr = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + \\\n\ + float4 mean, vari; \\\n\ + mean.x = sum * dimRatio; \\\n\ + vari.x = sqr * dimRatio; \\\n\ + vari.x = vari.x - mean.x * mean.x; \\\n\ + \\\n\ + write_imagef(output_mean, coord_out, mean); \\\n\ + write_imagef(output_vari, coord_out, vari); \\\n\ + } \\\n\ +}\n\ +MOMENTS_AXIS01_F(F16)\n\ +MOMENTS_AXIS01_F(F32)\n\ +\n\ +__kernel void moments_axis01_I32toF32(\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ + int axis, int axis_num, int input_zp, float input_scale,\n\ + int width, int height, int chn, 
float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + int4 data;\n\ + float sum = 0, sqr = 0;\n\ + float e2InScale = input_scale * input_scale;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(coord.x = gidx; coord.x < width; coord.x += 16)\n\ + {\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + data = read_imagei(input, coord);\n\ + coord.y++;\n\ + tmpSum += data.x;\n\ + tmpSqr += data.x * data.x;\n\ + }\n\ + sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ + sum += (tmpSum - height * input_zp) * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(gidz, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ + }\n\ +}\n\ +"; /* end of moments_axis01_cl*/ + +static const char moments_axis012_cl[] = "__kernel void moments_axis012_U8toF16(\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ + int axis, int axis_num, int input_zp, float input_scale,\n\ + int width, int height, int chn, float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, 0, 0);\n\ + uint4 data;\n\ + float sum = 0, sqr = 0;\n\ + float e2InScale = input_scale * input_scale;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(coord.z = 0; coord.z < chn; coord.z++)\n\ + {\n\ + for(coord.x = gidx; coord.x < width; coord.x += 16)\n\ + {\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + data = read_imageui(input, coord);\n\ + coord.y++;\n\ + tmpSum += data.x;\n\ + tmpSqr += data.x * data.x;\n\ + }\n\ + sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ + sum += (tmpSum - height * input_zp) * input_scale;\n\ + }\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ + }\n\ +}\n\ +\n\ +#define MOMENTS_AXIS012_F(src0_type_name) \\\n\ +__kernel void moments_axis012_##src0_type_name##to##src0_type_name( \\\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, \\\n\ + int axis, int axis_num, int input_zp, float input_scale, \\\n\ + int width, int height, int chn, float dimRatio 
\\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int lidx = get_local_id(0); \\\n\ + \\\n\ + int4 coord = (int4)(gidx, 0, 0, 0); \\\n\ + float4 data; \\\n\ + float sum = 0, sqr = 0; \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + \\\n\ + for(coord.z = 0; coord.z < chn; coord.z++) \\\n\ + { \\\n\ + for(coord.x = gidx; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + data = read_imagef(input, coord); \\\n\ + coord.y++; \\\n\ + sum += data.x; \\\n\ + sqr += data.x * data.x; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + lcl_sum[lidx] = sum; \\\n\ + lcl_sqr[lidx] = sqr; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int2 coord_out = (int2)(0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + \\\n\ + sum = 0; sqr = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + \\\n\ + float4 mean, vari; \\\n\ + mean.x = sum * dimRatio; \\\n\ + vari.x = sqr * dimRatio; \\\n\ + vari.x = vari.x - mean.x * mean.x; \\\n\ + \\\n\ + write_imagef(output_mean, coord_out, mean); \\\n\ + write_imagef(output_vari, coord_out, vari); \\\n\ + } \\\n\ +}\n\ +MOMENTS_AXIS012_F(F16)\n\ +MOMENTS_AXIS012_F(F32)\n\ +\n\ +__kernel void moments_axis012_I32toF32(\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ + int axis, int axis_num, int input_zp, float input_scale,\n\ + int width, int height, int chn, float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, 0, 0);\n\ + int4 data;\n\ + float sum = 0, sqr = 0;\n\ + float e2InScale = input_scale * input_scale;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(coord.z = 0; coord.z < chn; coord.z++)\n\ + {\n\ + for(coord.x = gidx; coord.x < width; coord.x += 16)\n\ + {\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + data = read_imagei(input, coord);\n\ + coord.y++;\n\ + tmpSum += data.x;\n\ + tmpSqr += data.x * data.x;\n\ + }\n\ + sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ + sum += (tmpSum - height * input_zp) * input_scale;\n\ + }\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ + }\n\ +}\n\ +"; /* end of moments_axis012_cl*/ + +static const char moments_axis1_cl[] = "__kernel void moments_axis1_U8toF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output_mean,\n\ + __write_only image2d_t output_vari,\n\ + int axis, int axis_num, int input_zp, float input_scale,\n\ + int width, int height, int chn, float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int 
gidz = get_global_id(1);\n\ +\n\ + int4 coord0 = (int4)(gidx, 0, gidz, 0);\n\ + uint data;\n\ + float sum = 0, sqr = 0;\n\ + uint tmpSum = 0, tmpSqr = 0;\n\ + float e2InScale = input_scale * input_scale;\n\ +\n\ + {\n\ + for(coord0.y = 0; coord0.y < height;)\n\ + {\n\ + data = read_imageui(input, coord0).x;\n\ + coord0.y++;\n\ + tmpSum += (data);\n\ + tmpSqr += (data * data);\n\ + }\n\ + sqr = convert_float(tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ + sum = convert_float(tmpSum - height * input_zp) * input_scale;\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + int2 coord_out = (int2)(gidx, gidz);\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ +}\n\ +\n\ +#define MOMENTS_AXIS1_F(src0_type_name) \\\n\ +__kernel void moments_axis1_##src0_type_name##to##src0_type_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_t output_mean, \\\n\ + __write_only image2d_t output_vari, \\\n\ + int axis, int axis_num, int input_zp, float input_scale, \\\n\ + int width, int height, int chn, float dimRatio \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + \\\n\ + int4 coord0 = (int4)(gidx, 0, gidz, 0); \\\n\ + float data; \\\n\ + float sum = 0, sqr = 0; \\\n\ + \\\n\ + for(coord0.y = 0; coord0.y < height;) \\\n\ + { \\\n\ + data = read_imagef(input, coord0).x; \\\n\ + coord0.y++; \\\n\ + sum += (data); \\\n\ + sqr += (data * data); \\\n\ + } \\\n\ + \\\n\ + float4 mean, vari; \\\n\ + mean.x = sum * dimRatio; \\\n\ + vari.x = sqr * dimRatio; \\\n\ + vari.x = vari.x - mean.x * mean.x; \\\n\ + \\\n\ + int2 coord_out = (int2)(gidx, gidz); \\\n\ + write_imagef(output_mean, coord_out, mean); \\\n\ + write_imagef(output_vari, coord_out, vari); \\\n\ +}\n\ +MOMENTS_AXIS1_F(F16)\n\ +MOMENTS_AXIS1_F(F32)\n\ +\n\ +__kernel void moments_axis1_I32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output_mean,\n\ + __write_only image2d_t output_vari,\n\ + int axis,\n\ + int axis_num,\n\ + int input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ +\n\ + int4 coord0 = (int4)(gidx, 0, gidz, 0);\n\ + int data;\n\ + int sum = 0, sqr = 0;\n\ +\n\ + for(coord0.y = 0; coord0.y < height;)\n\ + {\n\ + data = read_imagei(input, coord0).x;\n\ + coord0.y++;\n\ + sum += (data);\n\ + sqr += (data * data);\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + int2 coord_out = (int2)(gidx, gidz);\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ +}"; /* end of moments_axis1_cl*/ + +static const char moments_axis2_cl[] = "__kernel void moments_axis2_U8toF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output_mean,\n\ + __write_only image2d_t output_vari,\n\ + int axis,\n\ + int axis_num,\n\ + int input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ +\n\ + int4 coord0 = (int4)(gidx, gidy, 0, 0);\n\ + uint data;\n\ + float sum = 0, sqr = 0;\n\ + uint tmpSum = 0, tmpSqr = 0;\n\ + float e2InScale = input_scale * input_scale;\n\ +\n\ + {\n\ 
+ for(coord0.z = 0; coord0.z < chn;)\n\ + {\n\ + data = read_imageui(input, coord0).x;\n\ + coord0.z++;\n\ + tmpSum += (data);\n\ + tmpSqr += (data * data);\n\ + }\n\ + sqr = (tmpSqr - 2 * input_zp * tmpSum + chn * input_zp * input_zp) * e2InScale;\n\ + sum = (tmpSum - chn * input_zp) * input_scale;\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + int2 coord_out = (int2)(gidx, gidy);\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ +}\n\ +\n\ +#define MOMENTS_AXIS2_F(src0_type_name) \\\n\ +__kernel void moments_axis2_##src0_type_name##to##src0_type_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_t output_mean, \\\n\ + __write_only image2d_t output_vari, \\\n\ + int axis, \\\n\ + int axis_num, \\\n\ + int input_zp, \\\n\ + float input_scale, \\\n\ + int width, \\\n\ + int height, \\\n\ + int chn, \\\n\ + float dimRatio \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord0 = (int4)(gidx, gidy, 0, 0); \\\n\ + float data; \\\n\ + float sum = 0, sqr = 0; \\\n\ + \\\n\ + for(coord0.z = 0; coord0.z < chn;) \\\n\ + { \\\n\ + data = read_imagef(input, coord0).x; \\\n\ + coord0.z++; \\\n\ + sum += (data); \\\n\ + sqr += (data * data); \\\n\ + } \\\n\ + \\\n\ + float4 mean, vari; \\\n\ + mean.x = sum * dimRatio; \\\n\ + vari.x = sqr * dimRatio; \\\n\ + vari.x = vari.x - mean.x * mean.x; \\\n\ + \\\n\ + int2 coord_out = (int2)(gidx, gidy); \\\n\ + write_imagef(output_mean, coord_out, mean); \\\n\ + write_imagef(output_vari, coord_out, vari); \\\n\ +}\n\ +MOMENTS_AXIS2_F(F16)\n\ +MOMENTS_AXIS2_F(F32)\n\ +\n\ +__kernel void moments_axis2_I32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output_mean,\n\ + __write_only image2d_t output_vari,\n\ + int axis,\n\ + int axis_num,\n\ + int input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ +\n\ + int4 coord0 = (int4)(gidx, gidy, 0, 0);\n\ + int data;\n\ + int sum = 0, sqr = 0;\n\ +\n\ + for(coord0.z = 0; coord0.z < chn;)\n\ + {\n\ + data = read_imagei(input, coord0).x;\n\ + coord0.z++;\n\ + sum += (data);\n\ + sqr += (data * data);\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + int2 coord_out = (int2)(gidx, gidy);\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ +}"; /* end of moments_axis2_cl*/ + +static const char poolwithargmax_cl[] = "\n\ +#define POOLWITHARGMAX_PROCESS(data_type, read_fun, write_fun0, write_fun1) \\\n\ + data_type src = 0; \\\n\ + data_type max = 0; \\\n\ + uint4 axis = 0; \\\n\ + src.x = read_fun(input, coord_in).x; \\\n\ + coord_in.x++; \\\n\ + src.y = read_fun(input, coord_in).x; \\\n\ + coord_in.y++; \\\n\ + src.w = read_fun(input, coord_in).x; \\\n\ + coord_in.x--; \\\n\ + src.z = read_fun(input, coord_in).x; \\\n\ + max.x = src.x; \\\n\ + axis.x = 0; \\\n\ + if (src.y > max.x) \\\n\ + { \\\n\ + max.x = src.y; \\\n\ + axis.x = 1; \\\n\ + } \\\n\ + if (src.z > max.x) \\\n\ + { \\\n\ + max.x = src.z; \\\n\ + axis.x = 2; \\\n\ + } \\\n\ + if (src.w > max.x) \\\n\ + { \\\n\ + max.x = src.w; \\\n\ + axis.x = 3; \\\n\ + } \\\n\ + write_fun0(output, coord_out, max); \\\n\ + write_fun1(outaxis, 
coord_out, axis);\n\ +\n\ +\n\ +__kernel void poolwithargmax_F32to_F32_U8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t outaxis)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_in = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0);\n\ + POOLWITHARGMAX_PROCESS(float4, read_imagef, write_imagef, write_imageui)\n\ +}\n\ +\n\ +__kernel void poolwithargmax_F32to_F32_U8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t outaxis)\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord_in = (int2)(get_global_id(0) << 1, get_global_id(1) << 1);\n\ + POOLWITHARGMAX_PROCESS(float4, read_imagef, write_imagef, write_imageui)\n\ +}\n\ +\n\ +__kernel void poolwithargmax_I32to_I32_U8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t outaxis)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_in = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0);\n\ + POOLWITHARGMAX_PROCESS(int4, read_imagei, write_imagei, write_imageui)\n\ +}\n\ +\n\ +__kernel void poolwithargmax_I32to_I32_U8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t outaxis)\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord_in = (int2)(get_global_id(0) << 1, get_global_id(1) << 1);\n\ + POOLWITHARGMAX_PROCESS(int4, read_imagei, write_imagei, write_imageui)\n\ +}\n\ +\n\ +\n\ +#define POOLWITHARGMAX_U8_PROCESS() \\\n\ + uint4 src = 0; \\\n\ + uint4 max = 0; \\\n\ + uint4 axis = 0; \\\n\ + float4 result = 0.0f; \\\n\ + src.x = read_imageui(input, coord_in).x; \\\n\ + coord_in.x++; \\\n\ + src.y = read_imageui(input, coord_in).x; \\\n\ + coord_in.y++; \\\n\ + src.w = read_imageui(input, coord_in).x; \\\n\ + coord_in.x--; \\\n\ + src.z = read_imageui(input, coord_in).x; \\\n\ + max.x = src.x; \\\n\ + axis.x = 0; \\\n\ + if (src.y > max.x) \\\n\ + { \\\n\ + max.x = src.y; \\\n\ + axis.x = 1; \\\n\ + } \\\n\ + if (src.z > max.x) \\\n\ + { \\\n\ + max.x = src.z; \\\n\ + axis.x = 2; \\\n\ + } \\\n\ + if (src.w > max.x) \\\n\ + { \\\n\ + max.x = src.w; \\\n\ + axis.x = 3; \\\n\ + } \\\n\ + result.x = convert_float4(max).x * scale_value + tail_value; \\\n\ + max = convert_uint4(result);\\\n\ + write_imageui(output, coord_out, max); \\\n\ + write_imageui(outaxis, coord_out, axis);\n\ +\n\ +\n\ +__kernel void poolwithargmax_U8to_U8_U8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t outaxis,\n\ + float scale_value,\n\ + float tail_value)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_in = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0);\n\ + POOLWITHARGMAX_U8_PROCESS()\n\ +}\n\ +\n\ +__kernel void poolwithargmax_U8to_U8_U8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t outaxis,\n\ + float scale_value,\n\ + float tail_value)\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord_in = (int2)(get_global_id(0) << 1, get_global_id(1) << 1);\n\ + POOLWITHARGMAX_U8_PROCESS()\n\ +}\n\ +\n\ +\n\ +#define POOLWITHARGMAX_U8_TO_F32_PROCESS() \\\n\ + uint4 src 
= 0; \\\n\ + uint4 max = 0; \\\n\ + uint4 axis = 0; \\\n\ + float4 result = 0.0f; \\\n\ + src.x = read_imageui(input, coord_in).x; \\\n\ + coord_in.x++; \\\n\ + src.y = read_imageui(input, coord_in).x; \\\n\ + coord_in.y++; \\\n\ + src.w = read_imageui(input, coord_in).x; \\\n\ + coord_in.x--; \\\n\ + src.z = read_imageui(input, coord_in).x; \\\n\ + max.x = src.x; \\\n\ + axis.x = 0; \\\n\ + if (src.y > max.x) \\\n\ + { \\\n\ + max.x = src.y; \\\n\ + axis.x = 1; \\\n\ + } \\\n\ + if (src.z > max.x) \\\n\ + { \\\n\ + max.x = src.z; \\\n\ + axis.x = 2; \\\n\ + } \\\n\ + if (src.w > max.x) \\\n\ + { \\\n\ + max.x = src.w; \\\n\ + axis.x = 3; \\\n\ + } \\\n\ + result.x = convert_float4(max).x * scale_value + tail_value; \\\n\ + write_imagef(output, coord_out, result); \\\n\ + write_imageui(outaxis, coord_out, axis);\n\ +\n\ +\n\ +__kernel void poolwithargmax_U8to_F32_U8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t outaxis,\n\ + float scale_value,\n\ + float tail_value)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_in = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0);\n\ + POOLWITHARGMAX_U8_TO_F32_PROCESS()\n\ +}\n\ +\n\ +__kernel void poolwithargmax_U8to_F32_U8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t outaxis,\n\ + float scale_value,\n\ + float tail_value)\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord_in = (int2)(get_global_id(0) << 1, get_global_id(1) << 1);\n\ + POOLWITHARGMAX_U8_TO_F32_PROCESS()\n\ +}\n\ +\n\ +#define POOLWITHARGMAX_F32_TO_U8_PROCESS() \\\n\ + float4 src = 0; \\\n\ + float4 max = 0; \\\n\ + uint4 axis = 0; \\\n\ + uint4 dst = 0; \\\n\ + float4 result = 0.0f; \\\n\ + src.x = read_imagef(input, coord_in).x; \\\n\ + coord_in.x++; \\\n\ + src.y = read_imagef(input, coord_in).x; \\\n\ + coord_in.y++; \\\n\ + src.w = read_imagef(input, coord_in).x; \\\n\ + coord_in.x--; \\\n\ + src.z = read_imagef(input, coord_in).x; \\\n\ + max.x = src.x; \\\n\ + axis.x = 0; \\\n\ + if (src.y > max.x) \\\n\ + { \\\n\ + max.x = src.y; \\\n\ + axis.x = 1; \\\n\ + } \\\n\ + if (src.z > max.x) \\\n\ + { \\\n\ + max.x = src.z; \\\n\ + axis.x = 2; \\\n\ + } \\\n\ + if (src.w > max.x) \\\n\ + { \\\n\ + max.x = src.w; \\\n\ + axis.x = 3; \\\n\ + } \\\n\ + result.x = max.x * scale_value + tail_value; \\\n\ + dst = convert_uint4(result);\\\n\ + write_imageui(output, coord_out, dst); \\\n\ + write_imageui(outaxis, coord_out, axis);\n\ +\n\ +\n\ +__kernel void poolwithargmax_F32to_U8_U8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t outaxis,\n\ + float scale_value,\n\ + float tail_value)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_in = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0);\n\ + POOLWITHARGMAX_F32_TO_U8_PROCESS()\n\ +}\n\ +\n\ +__kernel void poolwithargmax_F32to_U8_U8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t outaxis,\n\ + float scale_value,\n\ + float tail_value)\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord_in = (int2)(get_global_id(0) << 1, get_global_id(1) << 1);\n\ + POOLWITHARGMAX_F32_TO_U8_PROCESS()\n\ +}\n\ +"; /* end of poolwithargmax_cl*/ + +static const char pow_cl[] = 
"__kernel void pow_FP32FP32toFP32\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + float4 src0, src1;\n\ + float4 dst;\n\ + readImage2DArray(src0, input0, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ +\n\ + float4 s0 = sign(src0);\n\ + int4 t0 = convert_int4(src1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + dst.x = (src0.x == 0 && src1.x == 0) ? 1.0f : (src0.x != 0 ? (s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f);\n\ +\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void pow_FP32FP32toFP32_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + float4 src0 = read_imagef(input0, coord);\n\ + float4 src1 = read_imagef(input1, coord);\n\ +\n\ + float4 dst = (float4)(0);\n\ +\n\ + float4 s0 = sign(src0);\n\ + int4 t0 = convert_int4(src1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ +\n\ + dst.x = (src0.x == 0 && src1.x == 0) ? 1.0f : (src0.x != 0 ? (s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f);\n\ +\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +"; /* end of pow_cl*/ + +static const char prelu_cl[] = "__kernel void prelu_FP32FP32toFP32\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + float4 src0;\n\ + float4 src1;\n\ + readImage2DArray(src0, input0, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ +\n\ + float4 maxData = src0 >= 0 ? src0 : 0;\n\ + float4 minData = src0 < 0 ? src0 : 0;\n\ + float4 dst = maxData + minData * src1;\n\ +\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void prelu_FP32FP32toFP32_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + float4 src0 = read_imagef(input0, coord);\n\ + float4 src1 = read_imagef(input1, coord);\n\ +\n\ + float4 maxData = src0 >= 0 ? src0 : 0;\n\ + float4 minData = src0 < 0 ? src0 : 0;\n\ + float4 dst = maxData + minData * src1;\n\ +\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void prelu_U8U8toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + uint4 src0;\n\ + uint4 src1;\n\ + readImage2DArray(src0, input0, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ +\n\ + float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ + float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ +\n\ + float4 maxData = data0 >= 0 ? 
data0 : 0;\n\ + float4 minData = data0 < 0 ? data0 : 0;\n\ + float4 data = maxData + minData * data1;\n\ +\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP);\n\ +\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void prelu_U8U8toU8_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + uint4 src0 = read_imageui(input0, coord);\n\ + uint4 src1 = read_imageui(input1, coord);\n\ +\n\ + float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ + float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ +\n\ + float4 maxData = data0 >= 0 ? data0 : 0;\n\ + float4 minData = data0 < 0 ? data0 : 0;\n\ + float4 data = maxData + minData * data1;\n\ +\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP);\n\ +\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +\n\ +__kernel void prelu_I32I32toI32\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int4 src0;\n\ + int4 src1;\n\ + readImage2DArray(src0, input0, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ +\n\ + float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ + float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ +\n\ + float4 maxData = data0 >= 0 ? data0 : 0;\n\ + float4 minData = data0 < 0 ? data0 : 0;\n\ + float4 data = maxData + minData * data1;\n\ +\n\ + int4 dst = convert_int4(data * outputScale + outputZP);\n\ +\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void prelu_I32I32toI32_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + int4 src0 = read_imagei(input0, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ +\n\ + float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ + float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ +\n\ + float4 maxData = data0 >= 0 ? data0 : 0;\n\ + float4 minData = data0 < 0 ? 
data0 : 0;\n\ + float4 data = maxData + minData * data1;\n\ +\n\ + int4 dst = convert_int4(data * outputScale + outputZP);\n\ +\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +"; /* end of prelu_cl*/ + +static const char random_multinomial_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ +\n\ +inline uchar* get_image2D_array_ptr(image2d_array_t input)\n\ +{\n\ + int8 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ + uchar *src_ptr = (uchar*)desc.s0;\n\ +\n\ + return src_ptr;\n\ +}\n\ +\n\ +uint4 _philox4x32bumpkey(uint4 key)\n\ +{\n\ + uint4 mask = (uint4)((uint)0x9E3779B9, (uint)0xBB67AE85, 0, 0);\n\ + //key.x += ((uint)0x9E3779B9);\n\ + //key.y += ((uint)0xBB67AE85);\n\ + key += mask;\n\ + return key;\n\ +}\n\ +\n\ +uint mullo32(uint a, uint b)\n\ +{\n\ + return a * b;\n\ +}\n\ +\n\ +uint mulhi32(uint a, uint b)\n\ +{\n\ + return mul_hi(a, b);\n\ +}\n\ +\n\ +uint4 _philox4x32round(uint4 ctr, uint4 key)\n\ +{\n\ + uint PHILOX_M4x32_0 = ((uint)0xD2511F53);\n\ + uint PHILOX_M4x32_1 = ((uint)0xCD9E8D57);\n\ +\n\ + uint lo0 = mullo32(PHILOX_M4x32_0, ctr.x);\n\ + uint hi0 = mulhi32(PHILOX_M4x32_0, ctr.x);\n\ + uint lo1 = mullo32(PHILOX_M4x32_1, ctr.z);\n\ + uint hi1 = mulhi32(PHILOX_M4x32_1, ctr.z);\n\ +\n\ + uint4 out = (uint4)(hi1^ctr.y^key.x, lo1, hi0^ctr.w^key.y, lo0);\n\ + return out;\n\ +}\n\ +\n\ +uint4 philox4x32_R_10(uint4 ctr, uint4 key)\n\ +{\n\ + uint i;\n\ + ctr = _philox4x32round(ctr, key);\n\ + for (i = 1; i < 10; i++)\n\ + {\n\ + key = _philox4x32bumpkey(key);\n\ + ctr = _philox4x32round(ctr, key);\n\ + }\n\ + return ctr;\n\ +}\n\ +\n\ +__kernel void random_seed(\n\ + __read_only image2d_array_t seeds,\n\ + __write_only image2d_array_t output,\n\ + int iter,\n\ + float re_rand_max\n\ + )\n\ +{\n\ + __global uint* seeds_ptr = (__global uint*)get_image2D_array_ptr(seeds);\n\ + seeds_ptr = seeds_ptr;\n\ + uint4 key = vload4(0, seeds_ptr);\n\ +\n\ + uint4 ctr = (uint4)(0);\n\ + float4 result = 0;\n\ +\n\ + __global float* output_ptr = (__global float*)get_image2D_array_ptr(output);\n\ +\n\ + for(int i = 0; i < iter; i++)\n\ + {\n\ + ctr = philox4x32_R_10(ctr, key);\n\ + result = convert_float4(ctr) * re_rand_max;\n\ + vstore4(result, i, output_ptr);\n\ + }\n\ +}\n\ +\n\ +#define logE (1.44269502f)\n\ +float eltwise_unary_exp(float x)\n\ +{\n\ + x *= logE;\n\ + x = exp2(x);\n\ + return x;\n\ +}\n\ +// N times of 8\n\ +// x dim = 1\n\ +\n\ +__kernel void random_multinomial_cdf_F32\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(1));\n\ + int class_max_iter = get_image_width(input);\n\ + float4 src0, data;\n\ + float4 dst = 0;\n\ +\n\ + float4 maxVal = read_imagef(input, coord);\n\ +\n\ + for(coord.x = 1; coord.x < class_max_iter;)\n\ + {\n\ + src0 = read_imagef(input, coord);\n\ + coord.x ++;\n\ +\n\ + maxVal = maxVal > src0 ? 
maxVal : src0;\n\ + }\n\ +\n\ + for(coord.x = 0; coord.x < class_max_iter; )\n\ + {\n\ + float4 val;\n\ + src0 = read_imagef(input, coord);\n\ +\n\ + data = src0 - maxVal;\n\ + val.x = eltwise_unary_exp(data.x);\n\ + val.x += dst.x;\n\ + dst.x = val.x;\n\ + write_imagef(output, coord.xy, val);\n\ + coord.x ++;\n\ + }\n\ +}\n\ +\n\ +uint upper_bound(float* a, int n, float x)\n\ +{\n\ + uint l = 0;\n\ + uint h = n;\n\ + while (l < h) {\n\ + int mid = (l + h) >> 1;\n\ + if (x >= a[mid]) {\n\ + l = mid + 1;\n\ + } else {\n\ + h = mid;\n\ + }\n\ + }\n\ + return l;\n\ +}\n\ +\n\ +// one thread calculate 4\n\ +__kernel void random_multinomial\n\ + (\n\ + __read_only image2d_array_t randoms,\n\ + __read_only image2d_array_t cdfs,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord = (int4)(gidx, gidy, 0, 0);\n\ + int class_size = get_image_width(cdfs);\n\ +\n\ + int offset = gidy * class_size;\n\ + __global float* cdf_ptr = (__global float*)get_image2D_array_ptr(cdfs);\n\ + __global float* cdfPtr = cdf_ptr + offset;\n\ +\n\ + int width = get_image_width(randoms);\n\ + offset = coord.x + coord.y * width;\n\ + __global float* randoms_ptr = (__global float*)get_image2D_array_ptr(randoms);\n\ + randoms_ptr = randoms_ptr + offset;\n\ +\n\ + width = get_image_width(output);\n\ + offset = coord.x + coord.y * width;\n\ + __global uint* output_ptr = (__global uint*)get_image2D_array_ptr(output);\n\ + output_ptr = output_ptr + offset;\n\ +\n\ + float4 ran = vload4(0, randoms_ptr);\n\ + float total = cdfPtr[class_size - 1];\n\ + float4 target = ran * total;\n\ +\n\ + uint4 out_class = (uint4)(0);\n\ + out_class.x = upper_bound(cdfPtr, class_size, target.x);\n\ + out_class.y = upper_bound(cdfPtr, class_size, target.y);\n\ + out_class.z = upper_bound(cdfPtr, class_size, target.z);\n\ + out_class.w = upper_bound(cdfPtr, class_size, target.w);\n\ +\n\ + vstore4(out_class, 0, output_ptr);\n\ +}\n\ +\n\ +"; /* end of random_multinomial_cl*/ + +static const char reduceall_internal_axis0_cl[] = "__kernel void reduceall_axis0_I8toI8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + int axisSize = get_image_width(input);\n\ +\n\ + int4 allVal = read_imagei(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + allVal = val && allVal;\n\ + coord.x ++;\n\ + }\n\ + allVal.x = allVal.x & 1;\n\ + write_imagei(output, coord.yz, allVal);\n\ +}\n\ +\n\ +__kernel void reduceall_axis0_I8toI8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ + int axisSize = get_image_width(input);\n\ +\n\ + int4 allVal = read_imagei(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + allVal = val && allVal;\n\ + coord.x ++;\n\ + }\n\ + allVal.x = allVal.x & 1;\n\ + coord.x = 0;\n\ + write_imagei(output, coord.yx, allVal);\n\ +}\n\ +\n\ +"; /* end of reduceall_internal_axis0_cl*/ + +static const char reduceall_internal_axis1_cl[] = "__kernel void reduceall_axis1_I8toI8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + int4 allVal = read_imagei(input, 
coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + allVal = val && allVal;\n\ + coord.y ++;\n\ + }\n\ + allVal.x = allVal.x & 1;\n\ + write_imagei(output, coord.xz, allVal);\n\ +}\n\ +\n\ +__kernel void reduceall_axis1_I8toI8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + int4 allVal = read_imagei(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + allVal = val && allVal;\n\ + coord.y ++;\n\ + }\n\ + allVal.x = allVal.x & 1;\n\ + coord.y = 0;\n\ + write_imagei(output, coord, allVal);\n\ +}\n\ +\n\ +"; /* end of reduceall_internal_axis1_cl*/ + +static const char reduceall_internal_axis2_cl[] = "__kernel void reduceall_axis2_I8toI8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int axisSize = get_image_depth(input);\n\ +\n\ + int4 allVal = read_imagei(input, coord);\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + allVal = val && allVal;\n\ + coord.z ++;\n\ + }\n\ + allVal.x = allVal.x & 1;\n\ + write_imagei(output, coord.xy, allVal);\n\ +}\n\ +\n\ +\n\ +\n\ +"; /* end of reduceall_internal_axis2_cl*/ + +static const char reduceany_internal_axis0_cl[] = "__kernel void reduceany_axis0_I8toI8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + int axisSize = get_image_width(input);\n\ +\n\ + int4 anyVal = read_imagei(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + anyVal = val || anyVal;\n\ + coord.x ++;\n\ + }\n\ + anyVal.x = anyVal.x & 1;\n\ + write_imagei(output, coord.yz, anyVal);\n\ +}\n\ +\n\ +__kernel void reduceany_axis0_I8toI8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ + int axisSize = get_image_width(input);\n\ +\n\ + int4 anyVal = read_imagei(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + anyVal = val || anyVal;\n\ + coord.x ++;\n\ + }\n\ + anyVal.x = anyVal.x & 1;\n\ + coord.x = 0;\n\ + write_imagei(output, coord.yx, anyVal);\n\ +}\n\ +\n\ +"; /* end of reduceany_internal_axis0_cl*/ + +static const char reduceany_internal_axis1_cl[] = "__kernel void reduceany_axis1_I8toI8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + int4 anyVal = read_imagei(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + anyVal = val || anyVal;\n\ + coord.y ++;\n\ + }\n\ + anyVal.x = anyVal.x & 1;\n\ + write_imagei(output, coord.xz, anyVal);\n\ +}\n\ +\n\ +__kernel void reduceany_axis1_I8toI8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + int4 anyVal = read_imagei(input, coord);\n\ + coord.y ++;\n\ 
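+    // OR together the remaining rows along the height axis; the result is masked to 0/1 below\n\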
+\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + anyVal = val || anyVal;\n\ + coord.y ++;\n\ + }\n\ + anyVal.x = anyVal.x & 1;\n\ + coord.y = 0;\n\ + write_imagei(output, coord, anyVal);\n\ +}\n\ +\n\ +"; /* end of reduceany_internal_axis1_cl*/ + +static const char reduceany_internal_axis2_cl[] = "__kernel void reduceany_axis2_I8toI8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int axisSize = get_image_depth(input);\n\ +\n\ + int4 anyVal = read_imagei(input, coord);\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + anyVal = val || anyVal;\n\ + coord.z ++;\n\ + }\n\ + anyVal.x = anyVal.x & 1;\n\ + write_imagei(output, coord.xy, anyVal);\n\ +}\n\ +\n\ +\n\ +\n\ +"; /* end of reduceany_internal_axis2_cl*/ + +static const char reducemax_internal_axis0_cl[] = "__kernel void reducemax_axis0_F32toF32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + int axisSize = get_image_width(input);\n\ + float4 maxVal = read_imagef(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + maxVal = val > maxVal ? val : maxVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + write_imagef(output, coord.yz, maxVal);\n\ +}\n\ +\n\ +__kernel void reducemax_axis0_F32toF32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ + int axisSize = get_image_width(input);\n\ + float4 maxVal = read_imagef(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + maxVal = val > maxVal ? val : maxVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + coord.x = 0;\n\ + write_imagef(output, coord.yx, maxVal);\n\ +}\n\ +\n\ +__kernel void reducemax_axis0_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + int axisSize = get_image_width(input);\n\ + uint4 dst;\n\ + uint4 maxVal = read_imageui(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + maxVal = val > maxVal ? val : maxVal;\n\ + coord.x ++;\n\ + }\n\ + dst = convert_uint4(convert_float4(maxVal) * inputScale + inputTail);\n\ + write_imageui(output, coord.yz, dst);\n\ +}\n\ +\n\ +__kernel void reducemax_axis0_U8toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ + int axisSize = get_image_width(input);\n\ + uint4 dst;\n\ + uint4 maxVal = read_imageui(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + maxVal = val > maxVal ? 
val : maxVal;\n\ + coord.x ++;\n\ + }\n\ + dst = convert_uint4(convert_float4(maxVal) * inputScale + inputTail);\n\ + coord.x = 0;\n\ + write_imageui(output, coord.yx, dst);\n\ +}\n\ +\n\ +__kernel void reducemax_axis0_I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + int axisSize = get_image_width(input);\n\ +\n\ + int4 maxVal = read_imagei(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + maxVal = val > maxVal ? val : maxVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.yz, maxVal);\n\ +}\n\ +\n\ +__kernel void reducemax_axis0_I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ + int axisSize = get_image_width(input);\n\ +\n\ + int4 maxVal = read_imagei(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + maxVal = val > maxVal ? val : maxVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + coord.x = 0;\n\ + write_imagei(output, coord.yx, maxVal);\n\ +}\n\ +\n\ +"; /* end of reducemax_internal_axis0_cl*/ + +static const char reducemax_internal_axis1_cl[] = "__kernel void reducemax_axis1_F32toF32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + float4 maxVal = read_imagef(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + maxVal = val > maxVal ? val : maxVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + write_imagef(output, coord.xz, maxVal);\n\ +}\n\ +\n\ +__kernel void reducemax_axis1_F32toF32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + float4 maxVal = read_imagef(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + maxVal = val > maxVal ? val : maxVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + coord.y = 0;\n\ + write_imagef(output, coord, maxVal);\n\ +}\n\ +\n\ +__kernel void reducemax_axis1_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int axisSize = get_image_height(input);\n\ + uint4 dst;\n\ + uint4 maxVal = read_imageui(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + maxVal = val > maxVal ? 
val : maxVal;\n\ + coord.y ++;\n\ + }\n\ + dst = convert_uint4(convert_float4(maxVal) * inputScale + inputTail);\n\ + write_imageui(output, coord.xz, dst);\n\ +}\n\ +\n\ +__kernel void reducemax_axis1_U8toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int axisSize = get_image_height(input);\n\ + uint4 dst;\n\ + uint4 maxVal = read_imageui(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + maxVal = val > maxVal ? val : maxVal;\n\ + coord.y ++;\n\ + }\n\ + dst = convert_uint4(convert_float4(maxVal) * inputScale + inputTail);\n\ + coord.y = 0;\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void reducemax_axis1_I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + int4 maxVal = read_imagei(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + maxVal = val > maxVal ? val : maxVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xz, maxVal);\n\ +}\n\ +\n\ +__kernel void reducemax_axis1_I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + int4 maxVal = read_imagei(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + maxVal = val > maxVal ? val : maxVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + coord.y = 0;\n\ + write_imagei(output, coord, maxVal);\n\ +}\n\ +\n\ +"; /* end of reducemax_internal_axis1_cl*/ + +static const char reducemax_internal_axis2_cl[] = "__kernel void reducemax_axis2_F32toF32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int axisSize = get_image_depth(input);\n\ +\n\ + float4 maxVal = read_imagef(input, coord);\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + maxVal = val > maxVal ? val : maxVal;\n\ + coord.z ++;\n\ + }\n\ +\n\ + write_imagef(output, coord.xy, maxVal);\n\ +}\n\ +\n\ +\n\ +__kernel void reducemax_axis2_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int axisSize = get_image_depth(input);\n\ + uint4 dst;\n\ + uint4 maxVal = read_imageui(input, coord);\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + maxVal = val > maxVal ? 
val : maxVal;\n\ + coord.z ++;\n\ + }\n\ + dst = convert_uint4(convert_float4(maxVal) * inputScale + inputTail);\n\ + write_imageui(output, coord.xy, dst);\n\ +}\n\ +\n\ +\n\ +__kernel void reducemax_axis2_I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int axisSize = get_image_depth(input);\n\ +\n\ + int4 maxVal = read_imagei(input, coord);\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + maxVal = val > maxVal ? val : maxVal;\n\ + coord.z ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xy, maxVal);\n\ +}\n\ +\n\ +\n\ +\n\ +"; /* end of reducemax_internal_axis2_cl*/ + +static const char reducemin_internal_axis0_cl[] = "__kernel void reducemin_axis0_F32toF32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + int axisSize = get_image_width(input);\n\ + float4 minVal = read_imagef(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + write_imagef(output, coord.yz, minVal);\n\ +}\n\ +\n\ +__kernel void reducemin_axis0_F32toF32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ + int axisSize = get_image_width(input);\n\ + float4 minVal = read_imagef(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + coord.x = 0;\n\ + write_imagef(output, coord.yx, minVal);\n\ +}\n\ +\n\ +__kernel void reducemin_axis0_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + int axisSize = get_image_width(input);\n\ + uint4 dst;\n\ + uint4 minVal = read_imageui(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.x ++;\n\ + }\n\ + dst = convert_uint4(convert_float4(minVal) * inputScale + inputTail);\n\ + write_imageui(output, coord.yz, dst);\n\ +}\n\ +\n\ +__kernel void reducemin_axis0_U8toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ + int axisSize = get_image_width(input);\n\ + uint4 dst;\n\ + uint4 minVal = read_imageui(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + minVal = val < minVal ? 
val : minVal;\n\ + coord.x ++;\n\ + }\n\ + dst = convert_uint4(convert_float4(minVal) * inputScale + inputTail);\n\ + coord.x = 0;\n\ + write_imageui(output, coord.yx, dst);\n\ +}\n\ +\n\ +__kernel void reducemin_axis0_I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + int axisSize = get_image_width(input);\n\ +\n\ + int4 minVal = read_imagei(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.yz, minVal);\n\ +}\n\ +\n\ +__kernel void reducemin_axis0_I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ + int axisSize = get_image_width(input);\n\ +\n\ + int4 minVal = read_imagei(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + coord.x = 0;\n\ + write_imagei(output, coord.yx, minVal);\n\ +}\n\ +\n\ +"; /* end of reducemin_internal_axis0_cl*/ + +static const char reducemin_internal_axis1_cl[] = "__kernel void reducemin_axis1_F32toF32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + float4 minVal = read_imagef(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + write_imagef(output, coord.xz, minVal);\n\ +}\n\ +\n\ +__kernel void reducemin_axis1_F32toF32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + float4 minVal = read_imagef(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + coord.y = 0;\n\ + write_imagef(output, coord, minVal);\n\ +}\n\ +\n\ +__kernel void reducemin_axis1_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int axisSize = get_image_height(input);\n\ + uint4 dst;\n\ + uint4 minVal = read_imageui(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + minVal = val < minVal ? 
val : minVal;\n\ + coord.y ++;\n\ + }\n\ + dst = convert_uint4(convert_float4(minVal) * inputScale + inputTail);\n\ + write_imageui(output, coord.xz, dst);\n\ +}\n\ +\n\ +__kernel void reducemin_axis1_U8toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int axisSize = get_image_height(input);\n\ + uint4 dst;\n\ + uint4 minVal = read_imageui(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.y ++;\n\ + }\n\ + dst = convert_uint4(convert_float4(minVal) * inputScale + inputTail);\n\ + coord.y = 0;\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void reducemin_axis1_I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + int4 minVal = read_imagei(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xz, minVal);\n\ +}\n\ +\n\ +__kernel void reducemin_axis1_I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + int4 minVal = read_imagei(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + coord.y = 0;\n\ + write_imagei(output, coord, minVal);\n\ +}\n\ +\n\ +"; /* end of reducemin_internal_axis1_cl*/ + +static const char reducemin_internal_axis2_cl[] = "__kernel void reducemin_axis2_F32toF32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int axisSize = get_image_depth(input);\n\ +\n\ + float4 minVal = read_imagef(input, coord);\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.z ++;\n\ + }\n\ +\n\ + write_imagef(output, coord.xy, minVal);\n\ +}\n\ +\n\ +\n\ +__kernel void reducemin_axis2_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int axisSize = get_image_depth(input);\n\ + uint4 dst;\n\ + uint4 minVal = read_imageui(input, coord);\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + uint4 val = read_imageui(input, coord);\n\ + minVal = val < minVal ? 
val : minVal;\n\ + coord.z ++;\n\ + }\n\ + dst = convert_uint4(convert_float4(minVal) * inputScale + inputTail);\n\ + write_imageui(output, coord.xy, dst);\n\ +}\n\ +\n\ +\n\ +__kernel void reducemin_axis2_I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int axisSize = get_image_depth(input);\n\ +\n\ + int4 minVal = read_imagei(input, coord);\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + minVal = val < minVal ? val : minVal;\n\ + coord.z ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xy, minVal);\n\ +}\n\ +\n\ +\n\ +\n\ +"; /* end of reducemin_internal_axis2_cl*/ + +static const char reduceprod_internal_axis0_cl[] = "__kernel void reduceprod_axis0_F32toF32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + int axisSize = get_image_width(input);\n\ + float4 prodVal = read_imagef(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + prodVal = val * prodVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + write_imagef(output, coord.yz, prodVal);\n\ +}\n\ +\n\ +__kernel void reduceprod_axis0_F32toF32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ + int axisSize = get_image_width(input);\n\ + float4 prodVal = read_imagef(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + prodVal = val * prodVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + coord.x = 0;\n\ + write_imagef(output, coord.yx, prodVal);\n\ +}\n\ +\n\ +__kernel void reduceprod_axis0_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + int axisSize = get_image_width(input);\n\ + uint4 dst;\n\ + float4 prodVal = convert_float4(read_imageui(input, coord));\n\ + prodVal = prodVal * inputScale + inputTail;\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + float4 val = convert_float4(read_imageui(input, coord));\n\ + val = val * inputScale + inputTail;\n\ + prodVal = val * prodVal;\n\ + coord.x ++;\n\ + }\n\ + dst = convert_uint4(prodVal * outputScale + outputTail);\n\ + write_imageui(output, coord.yz, dst);\n\ +}\n\ +\n\ +__kernel void reduceprod_axis0_U8toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ + int axisSize = get_image_width(input);\n\ + uint4 dst;\n\ + float4 prodVal = convert_float4(read_imageui(input, coord));\n\ + prodVal = prodVal * inputScale + inputTail;\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + float4 val = convert_float4(read_imageui(input, coord));\n\ + val = val * inputScale + inputTail;\n\ + prodVal = val * prodVal;\n\ + coord.x ++;\n\ + }\n\ + dst = convert_uint4(prodVal * outputScale + outputTail);\n\ + coord.x = 0;\n\ + write_imageui(output, coord.yx, dst);\n\ +}\n\ +\n\ +__kernel void reduceprod_axis0_I32toI32\n\ 
+ (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0);\n\ + int axisSize = get_image_width(input);\n\ +\n\ + int4 prodVal = read_imagei(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + prodVal = val * prodVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.yz, prodVal);\n\ +}\n\ +\n\ +__kernel void reduceprod_axis0_I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(0, get_global_id(0));\n\ + int axisSize = get_image_width(input);\n\ +\n\ + int4 prodVal = read_imagei(input, coord);\n\ + coord.x ++;\n\ +\n\ + for (; coord.x < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + prodVal = val * prodVal;\n\ + coord.x ++;\n\ + }\n\ +\n\ + coord.x = 0;\n\ + write_imagei(output, coord.yx, prodVal);\n\ +}\n\ +\n\ +"; /* end of reduceprod_internal_axis0_cl*/ + +static const char reduceprod_internal_axis1_cl[] = "__kernel void reduceprod_axis1_F32toF32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + float4 prodVal = read_imagef(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + prodVal = val * prodVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + write_imagef(output, coord.xz, prodVal);\n\ +}\n\ +\n\ +__kernel void reduceprod_axis1_F32toF32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + float4 prodVal = read_imagef(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + prodVal = val * prodVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + coord.y = 0;\n\ + write_imagef(output, coord, prodVal);\n\ +}\n\ +\n\ +__kernel void reduceprod_axis1_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int axisSize = get_image_height(input);\n\ + uint4 dst;\n\ + float4 prodVal = convert_float4(read_imageui(input, coord));\n\ + prodVal = prodVal * inputScale + inputTail;\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + float4 val = convert_float4(read_imageui(input, coord));\n\ + val = val * inputScale + inputTail;\n\ + prodVal = val * prodVal;\n\ + coord.y ++;\n\ + }\n\ + dst = convert_uint4(prodVal * outputScale + outputTail);\n\ + write_imageui(output, coord.xz, dst);\n\ +}\n\ +\n\ +__kernel void reduceprod_axis1_U8toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int axisSize = get_image_height(input);\n\ + uint4 dst;\n\ + float4 prodVal = convert_float4(read_imageui(input, coord));\n\ + prodVal = prodVal * inputScale + inputTail;\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + float4 val = convert_float4(read_imageui(input, coord));\n\ + val = val * inputScale + 
inputTail;\n\ + prodVal = val * prodVal;\n\ + coord.y ++;\n\ + }\n\ + dst = convert_uint4(prodVal * outputScale + outputTail);\n\ + coord.y = 0;\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void reduceprod_axis1_I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + int4 prodVal = read_imagei(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + prodVal = val * prodVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xz, prodVal);\n\ +}\n\ +\n\ +__kernel void reduceprod_axis1_I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ + int axisSize = get_image_height(input);\n\ +\n\ + int4 prodVal = read_imagei(input, coord);\n\ + coord.y ++;\n\ +\n\ + for (; coord.y < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + prodVal = val * prodVal;\n\ + coord.y ++;\n\ + }\n\ +\n\ + coord.y = 0;\n\ + write_imagei(output, coord, prodVal);\n\ +}\n\ +\n\ +"; /* end of reduceprod_internal_axis1_cl*/ + +static const char reduceprod_internal_axis2_cl[] = "__kernel void reduceprod_axis2_F32toF32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int axisSize = get_image_depth(input);\n\ +\n\ + float4 prodVal = read_imagef(input, coord);\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + float4 val = read_imagef(input, coord);\n\ + prodVal = val * prodVal;\n\ + coord.z ++;\n\ + }\n\ +\n\ + write_imagef(output, coord.xy, prodVal);\n\ +}\n\ +\n\ +\n\ +__kernel void reduceprod_axis2_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int axisSize = get_image_depth(input);\n\ + uint4 dst;\n\ + float4 prodVal = convert_float4(read_imageui(input, coord));\n\ + prodVal = prodVal * inputScale + inputTail;\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + float4 val = convert_float4(read_imageui(input, coord));\n\ + val = val * inputScale + inputTail;\n\ + prodVal = val * prodVal;\n\ + coord.z ++;\n\ + }\n\ + dst = convert_uint4(prodVal * outputScale + outputTail);\n\ + write_imageui(output, coord.xy, dst);\n\ +}\n\ +\n\ +\n\ +__kernel void reduceprod_axis2_I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int axisSize = get_image_depth(input);\n\ +\n\ + int4 prodVal = read_imagei(input, coord);\n\ + coord.z ++;\n\ +\n\ + for (; coord.z < axisSize;)\n\ + {\n\ + int4 val = read_imagei(input, coord);\n\ + prodVal = val * prodVal;\n\ + coord.z ++;\n\ + }\n\ +\n\ + write_imagei(output, coord.xy, prodVal);\n\ +}\n\ +\n\ +\n\ +\n\ +"; /* end of reduceprod_internal_axis2_cl*/ + +static const char relational_ops_cl[] = "\n\ +#define COMPARISONS_F32(func_name, comp_op) \\\n\ +__kernel void func_name##_F32F32toBOOL8 \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + 
float input0Scale, \\\n\ + float input0Tail, \\\n\ + float input1Scale, \\\n\ + float input1Tail \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + float4 src0; \\\n\ + float4 src1; \\\n\ + readImage2DArray(src0, input0, coord); \\\n\ + readImage2DArray(src1, input1, coord); \\\n\ + \\\n\ + int4 dst = (src0)comp_op(src1); \\\n\ + dst &= 1; \\\n\ + \\\n\ + write_imagei(output, coord, dst); \\\n\ +}\n\ +COMPARISONS_F32(less, <)\n\ +COMPARISONS_F32(great, >)\n\ +COMPARISONS_F32(less_equal, <=)\n\ +COMPARISONS_F32(great_equal, >=)\n\ +COMPARISONS_F32(equal, ==)\n\ +COMPARISONS_F32(not_equal, !=)\n\ +\n\ +#define COMPARISONS_F32_2D(func_name, comp_op) \\\n\ +__kernel void func_name##_F32F32toBOOL8_2D \\\n\ + ( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + float input0Scale, \\\n\ + float input0Tail, \\\n\ + float input1Scale, \\\n\ + float input1Tail \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + float4 src0 = read_imagef(input0, coord); \\\n\ + float4 src1 = read_imagef(input1, coord); \\\n\ + \\\n\ + int4 dst = (src0)comp_op(src1); \\\n\ + dst &= 1; \\\n\ + \\\n\ + write_imagei(output, coord, dst); \\\n\ +}\n\ +COMPARISONS_F32_2D(less, <)\n\ +COMPARISONS_F32_2D(great, >)\n\ +COMPARISONS_F32_2D(less_equal, <=)\n\ +COMPARISONS_F32_2D(great_equal, >=)\n\ +COMPARISONS_F32_2D(equal, ==)\n\ +COMPARISONS_F32_2D(not_equal, !=)\n\ +\n\ +#define COMPARISONS_U32(func_name, comp_op) \\\n\ +__kernel void func_name##_U32U32toBOOL8 \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input0Scale, \\\n\ + float input0Tail, \\\n\ + float input1Scale, \\\n\ + float input1Tail \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + uint4 data0; \\\n\ + uint4 data1; \\\n\ + readImage2DArray(data0, input0, coord); \\\n\ + readImage2DArray(data1, input1, coord); \\\n\ + \\\n\ + float4 src0 = convert_float4(data0) * input0Scale - input0Tail; \\\n\ + float4 src1 = convert_float4(data1) * input1Scale - input1Tail; \\\n\ + int4 dst = (src0)comp_op(src1); \\\n\ + dst &= 1; \\\n\ + \\\n\ + write_imagei(output, coord, dst); \\\n\ +}\n\ +COMPARISONS_U32(less, <)\n\ +COMPARISONS_U32(great, >)\n\ +COMPARISONS_U32(less_equal, <=)\n\ +COMPARISONS_U32(great_equal, >=)\n\ +COMPARISONS_U32(equal, ==)\n\ +COMPARISONS_U32(not_equal, !=)\n\ +\n\ +#define COMPARISONS_U32_2D(func_name, comp_op) \\\n\ +__kernel void func_name##_U32U32toBOOL8_2D \\\n\ + ( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + float input0Scale, \\\n\ + float input0Tail, \\\n\ + float input1Scale, \\\n\ + float input1Tail \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + uint4 data0 = read_imageui(input0, coord); \\\n\ + uint4 data1 = read_imageui(input1, coord); \\\n\ + \\\n\ + float4 src0 = convert_float4(data0) * input0Scale - input0Tail; \\\n\ + float4 src1 = convert_float4(data1) * input1Scale - input1Tail; \\\n\ + int4 dst = (src0)comp_op(src1); \\\n\ + dst &= 1; \\\n\ + \\\n\ + write_imagei(output, coord, dst); \\\n\ +}\n\ +COMPARISONS_U32_2D(less, <)\n\ +COMPARISONS_U32_2D(great, >)\n\ +COMPARISONS_U32_2D(less_equal, <=)\n\ +COMPARISONS_U32_2D(great_equal, >=)\n\ 
+COMPARISONS_U32_2D(equal, ==)\n\ +COMPARISONS_U32_2D(not_equal, !=)\n\ +\n\ +#define COMPARISONS_I32(func_name, comp_op) \\\n\ +__kernel void func_name##_I32I32toBOOL8 \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input0Scale, \\\n\ + float input0Tail, \\\n\ + float input1Scale, \\\n\ + float input1Tail \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + int4 src0; \\\n\ + int4 src1; \\\n\ + readImage2DArray(src0, input0, coord); \\\n\ + readImage2DArray(src1, input1, coord); \\\n\ + \\\n\ + int4 dst = (src0)comp_op(src1); \\\n\ + dst &= 1; \\\n\ + \\\n\ + write_imagei(output, coord, dst); \\\n\ +}\n\ +COMPARISONS_I32(less, <)\n\ +COMPARISONS_I32(great, >)\n\ +COMPARISONS_I32(less_equal, <=)\n\ +COMPARISONS_I32(great_equal, >=)\n\ +COMPARISONS_I32(equal, ==)\n\ +COMPARISONS_I32(not_equal, !=)\n\ +\n\ +#define COMPARISONS_I32_2D(func_name, comp_op) \\\n\ +__kernel void func_name##_I32I32toBOOL8_2D \\\n\ + ( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + float input0Scale, \\\n\ + float input0Tail, \\\n\ + float input1Scale, \\\n\ + float input1Tail \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + int4 src0 = read_imagei(input0, coord); \\\n\ + int4 src1 = read_imagei(input1, coord); \\\n\ + \\\n\ + int4 dst = (src0)comp_op(src1); \\\n\ + dst &= 1; \\\n\ + \\\n\ + write_imagei(output, coord, dst); \\\n\ +}\n\ +COMPARISONS_I32_2D(less, <)\n\ +COMPARISONS_I32_2D(great, >)\n\ +COMPARISONS_I32_2D(less_equal, <=)\n\ +COMPARISONS_I32_2D(great_equal, >=)\n\ +COMPARISONS_I32_2D(equal, ==)\n\ +COMPARISONS_I32_2D(not_equal, !=)\n\ +\n\ +"; /* end of relational_ops_cl*/ + +static const char relu_keras_cl[] = "\n\ +__kernel void relu_keras_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float alpha,\n\ + float max_value,\n\ + float threshold,\n\ + float offset\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float4 src = read_imagef(input, coord);\n\ + float4 dst = src >= max_value ? max_value : src;\n\ + dst = dst < threshold ? alpha * dst + offset : dst;\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void relu_keras_F32toF32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float alpha,\n\ + float max_value,\n\ + float threshold,\n\ + float offset\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + float4 src = read_imagef(input, coord);\n\ + float4 dst = src >= max_value ? max_value : src;\n\ + dst = dst < threshold ? alpha * dst + offset : dst;\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void relu_keras_F32toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float alpha,\n\ + float max_value,\n\ + float threshold,\n\ + float offset,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float4 src = read_imagef(input, coord);\n\ + float4 result = src >= max_value ? max_value : src;\n\ + result = result < threshold ? 
alpha * result + offset : result;\n\ + uint4 dst = convert_uint4_rte(result * outputScale + outputZP);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void relu_keras_F32toU8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float alpha,\n\ + float max_value,\n\ + float threshold,\n\ + float offset,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + float4 src = read_imagef(input, coord);\n\ + float4 result = src >= max_value ? max_value : src;\n\ + result = result < threshold ? alpha * result + offset : result;\n\ + uint4 dst = convert_uint4_rte(result * outputScale + outputZP);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void relu_keras_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float alpha,\n\ + float max_value,\n\ + float threshold,\n\ + float offset,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\ + float4 result = src >= max_value ? max_value : src;\n\ + result = result < threshold ? alpha * result + offset : result;\n\ + uint4 dst = convert_uint4_rte(result * outputScale + outputZP);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void relu_keras_U8toU8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float alpha,\n\ + float max_value,\n\ + float threshold,\n\ + float offset,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\ + float4 result = src >= max_value ? max_value : src;\n\ + result = result < threshold ? alpha * result + offset : result;\n\ + uint4 dst = convert_uint4_rte(result * outputScale + outputZP);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void relu_keras_U8toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float alpha,\n\ + float max_value,\n\ + float threshold,\n\ + float offset,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\ + float4 dst = src >= max_value ? max_value : src;\n\ + dst = dst < threshold ? alpha * dst + offset : dst;\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void relu_keras_U8toF32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float alpha,\n\ + float max_value,\n\ + float threshold,\n\ + float offset,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + float4 src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\ + float4 dst = src >= max_value ? max_value : src;\n\ + dst = dst < threshold ? 
alpha * dst + offset : dst;\n\ + write_imagef(output, coord, dst);\n\ +}"; /* end of relu_keras_cl*/ + +static const char resize_bilinear_cl[] = "__kernel void resize_bilinear_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float half_pixel_value\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float left_x_f = floor(in_x);\n\ + float x_lerp = in_x - left_x_f;\n\ + int left_x_idx = convert_int(left_x_f);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + int4 coord_in = (int4)(left_x_idx, top_y_idx, coord_out.z, 0);\n\ + float4 top_l, top_r, bottom_l, bottom_r, top, bottom, dst;\n\ +\n\ + top_l = read_imagef(input, coord_in);\n\ + coord_in.y++;\n\ + bottom_l = read_imagef(input, coord_in);\n\ + coord_in.x++;\n\ + bottom_r = read_imagef(input, coord_in);\n\ + coord_in.y--;\n\ + top_r = read_imagef(input, coord_in);\n\ +\n\ + top_r = top_r - top_l;\n\ + top = top_l + x_lerp * top_r;\n\ + bottom_r = bottom_r - bottom_l;\n\ + bottom = bottom_l + x_lerp * bottom_r;\n\ + bottom = bottom - top;\n\ + dst = top + y_lerp * bottom;\n\ +\n\ + write_imagef(output, coord_out, dst);\n\ +\n\ +}\n\ +\n\ +\n\ +__kernel void resize_bilinear_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float half_pixel_value,\n\ + float in_scale,\n\ + float in_tail,\n\ + float out_scale,\n\ + float out_tail\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float left_x_f = floor(in_x);\n\ + float x_lerp = in_x - left_x_f;\n\ + int left_x_idx = convert_int(left_x_f);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + int4 coord_in = (int4)(left_x_idx, top_y_idx, coord_out.z, 0);\n\ + float4 top_l, top_r, bottom_l, bottom_r, top, bottom;\n\ + uint4 dst;\n\ +\n\ + top_l = convert_float4(read_imageui(input, coord_in)) * in_scale + in_tail;\n\ + coord_in.y++;\n\ + bottom_l = convert_float4(read_imageui(input, coord_in)) * in_scale + in_tail;\n\ + coord_in.x++;\n\ + bottom_r = convert_float4(read_imageui(input, coord_in)) * in_scale + in_tail;\n\ + coord_in.y--;\n\ + top_r = convert_float4(read_imageui(input, coord_in)) * in_scale + in_tail;\n\ +\n\ + top_r = top_r - top_l;\n\ + top = top_l + x_lerp * top_r;\n\ + bottom_r = bottom_r - bottom_l;\n\ + bottom = bottom_l + x_lerp * bottom_r;\n\ + bottom = bottom - top;\n\ + top = top + y_lerp * bottom;\n\ +\n\ + dst = convert_uint4(top * out_scale + out_tail);\n\ +\n\ + write_imageui(output, coord_out, dst);\n\ +}\n\ +"; /* end of resize_bilinear_cl*/ + +static const char resize_nearest_cl[] = "\n\ +#define NEAREST_INDEX_PROCESS() \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x + round_value; \\\n\ + int in_x_idx = convert_int(in_x); \\\n\ + float in_y = 
(convert_float(coord_out.y) + half_pixel_value) * scale_y + round_value; \\\n\ + int in_y_idx = convert_int(in_y); \\\n\ +\n\ +__kernel void resize_nearest_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float half_pixel_value,\n\ + float round_value)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, in_y_idx, coord_out.z, 0);\n\ + float4 dst;\n\ + dst = read_imagef(input, coord_in);\n\ + write_imagef(output, coord_out, dst);\n\ +}\n\ +\n\ +\n\ +__kernel void resize_nearest_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float half_pixel_value,\n\ + float round_value,\n\ + float output_scale,\n\ + float output_tail)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, in_y_idx, coord_out.z, 0);\n\ + uint4 dst;\n\ + dst = convert_uint4(convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail);\n\ + write_imageui(output, coord_out, dst);\n\ +}\n\ +"; /* end of resize_nearest_cl*/ + +static const char scatter_nd_cl[] = "__kernel void scatter_nd_U32toU32_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int width,\n\ + int area,\n\ + int index_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + uint4 sum = (uint4)(0, 0, 0, 0);\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice = read_imagei(input0, (int2)(0, i));\n\ + if(gidy == indice.x)\n\ + {\n\ + uint4 data = read_imageui(input1, (int2)(gidx, i));\n\ + sum += data;\n\ + }\n\ + }\n\ + write_imageui(output, (int2)(gidx, gidy), sum);\n\ +}\n\ +\n\ +__kernel void scatter_nd_U32toU32_2D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int width,\n\ + int area,\n\ + int index_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + uint4 sum = (uint4)(0, 0, 0, 0);\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice0 = read_imagei(input0, (int2)(0, i));\n\ + int4 indice1 = read_imagei(input0, (int2)(1, i));\n\ + int idx = indice0.x * width + indice1.x;\n\ + if(gidy == idx)\n\ + {\n\ + uint4 data = read_imageui(input1, (int2)(gidx, i));\n\ + sum += data;\n\ + }\n\ + }\n\ + write_imageui(output, (int2)(gidx, gidy), sum);\n\ +}\n\ +\n\ +__kernel void scatter_nd_U32toU32_3D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int width,\n\ + int area,\n\ + int index_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + uint4 sum = (uint4)(0, 0, 0, 0);\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice0 = read_imagei(input0, (int2)(0, i));\n\ + int4 indice1 = read_imagei(input0, (int2)(1, i));\n\ + int4 indice2 = read_imagei(input0, (int2)(2, i));\n\ + int idx = indice0.x * area + indice1.x * width + indice2.x;\n\ + if(gidy == idx)\n\ + {\n\ + uint4 data = read_imageui(input1, (int2)(gidx, i));\n\ + sum += data;\n\ + }\n\ + }\n\ + write_imageui(output, (int2)(gidx, gidy), sum);\n\ +}\n\ +\n\ +__kernel void scatter_nd_I32toI32_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int width,\n\ + int area,\n\ + int index_num\n\ + )\n\ 
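+// each work-item (gidx, gidy) sums every update row whose index equals its output row gidy\n\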
+{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 sum = (int4)(0, 0, 0, 0);\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice = read_imagei(input0, (int2)(0, i));\n\ + if(gidy == indice.x)\n\ + {\n\ + int4 data = read_imagei(input1, (int2)(gidx, i));\n\ + sum += data;\n\ + }\n\ + }\n\ + write_imagei(output, (int2)(gidx, gidy), sum);\n\ +}\n\ +\n\ +__kernel void scatter_nd_I32toI32_2D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int width,\n\ + int area,\n\ + int index_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 sum = (int4)(0, 0, 0, 0);\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice0 = read_imagei(input0, (int2)(0, i));\n\ + int4 indice1 = read_imagei(input0, (int2)(1, i));\n\ + int idx = indice0.x * width + indice1.x;\n\ + if(gidy == idx)\n\ + {\n\ + int4 data = read_imagei(input1, (int2)(gidx, i));\n\ + sum += data;\n\ + }\n\ + }\n\ + write_imagei(output, (int2)(gidx, gidy), sum);\n\ +}\n\ +\n\ +__kernel void scatter_nd_I32toI32_3D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int width,\n\ + int area,\n\ + int index_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 sum = (int4)(0, 0, 0, 0);\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice0 = read_imagei(input0, (int2)(0, i));\n\ + int4 indice1 = read_imagei(input0, (int2)(1, i));\n\ + int4 indice2 = read_imagei(input0, (int2)(2, i));\n\ + int idx = indice0.x * area + indice1.x * width + indice2.x;\n\ + if(gidy == idx)\n\ + {\n\ + int4 data = read_imagei(input1, (int2)(gidx, i));\n\ + sum += data;\n\ + }\n\ + }\n\ + write_imagei(output, (int2)(gidx, gidy), sum);\n\ +}\n\ +\n\ +__kernel void scatter_nd_F32toF32_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int width,\n\ + int area,\n\ + int index_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + float4 sum = (float4)(0, 0, 0, 0);\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice = read_imagei(input0, (int2)(0, i));\n\ + if(gidy == indice.x)\n\ + {\n\ + float4 data = read_imagef(input1, (int2)(gidx, i));\n\ + sum += data;\n\ + }\n\ + }\n\ + write_imagef(output, (int2)(gidx, gidy), sum);\n\ +}\n\ +\n\ +__kernel void scatter_nd_F32toF32_2D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int width,\n\ + int area,\n\ + int index_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + float4 sum = (float4)(0, 0, 0, 0);\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice0 = read_imagei(input0, (int2)(0, i));\n\ + int4 indice1 = read_imagei(input0, (int2)(1, i));\n\ + int idx = indice0.x * width + indice1.x;\n\ + if(gidy == idx)\n\ + {\n\ + float4 data = read_imagef(input1, (int2)(gidx, i));\n\ + sum += data;\n\ + }\n\ + }\n\ + write_imagef(output, (int2)(gidx, gidy), sum);\n\ +}\n\ +\n\ +__kernel void scatter_nd_F32toF32_3D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int width,\n\ + int area,\n\ + int index_num\n\ + )\n\ +{\n\ + int 
gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + float4 sum = (float4)(0, 0, 0, 0);\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice0 = read_imagei(input0, (int2)(0, i));\n\ + int4 indice1 = read_imagei(input0, (int2)(1, i));\n\ + int4 indice2 = read_imagei(input0, (int2)(2, i));\n\ + int idx = indice0.x * area + indice1.x * width + indice2.x;\n\ + if(gidy == idx)\n\ + {\n\ + float4 data = read_imagef(input1, (int2)(gidx, i));\n\ + sum += data;\n\ + }\n\ + }\n\ + write_imagef(output, (int2)(gidx, gidy), sum);\n\ +}"; /* end of scatter_nd_cl*/ + +static const char select_cl[] = "__kernel void select_I8_U8_U8toU8(\n\ + __read_only image2d_array_t condition,\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 value;\n\ + uint4 src0, src1, src, dst;\n\ + float inputScale, inputTail;\n\ + readImage2DArray(value, condition, coord);\n\ + readImage2DArray(src0, input0, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ + src = (value != 0 ? src0 : src1);\n\ + inputScale = (value.x != 0 ? input0Scale : input1Scale);\n\ + inputTail = (value.x != 0 ? input0Tail : input1Tail);\n\ + dst = convert_uint4(convert_float4(src) * inputScale + inputTail);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void select_I8_U8_U8toU8_2D(\n\ + __read_only image2d_t condition,\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 value = read_imagei(condition, coord);\n\ + uint4 src0 = read_imageui(input0, coord);\n\ + uint4 src1 = read_imageui(input1, coord);\n\ + uint4 src = (value != 0 ? src0 : src1);\n\ + float inputScale = (value.x != 0 ? input0Scale : input1Scale);\n\ + float inputTail = (value.x != 0 ? input0Tail : input1Tail);\n\ + uint4 dst = convert_uint4(convert_float4(src) * inputScale + inputTail);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void select_I8_I32_I32toI32(\n\ + __read_only image2d_array_t condition,\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 value;\n\ + int4 src0, src1, dst;\n\ + readImage2DArray(value, condition, coord);\n\ + readImage2DArray(src0, input0, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ + dst = (value != 0 ? src0 : src1);\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void select_I8_I32_I32toI32_2D(\n\ + __read_only image2d_t condition,\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 value = read_imagei(condition, coord);\n\ + int4 src0 = read_imagei(input0, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ + int4 dst = (value != 0 ? 
src0 : src1);\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void select_I8_F32_F32toF32(\n\ + __read_only image2d_array_t condition,\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 value;\n\ + float4 src0, src1, dst;\n\ + readImage2DArray(value, condition, coord);\n\ + readImage2DArray(src0, input0, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ + dst = (value != 0 ? src0 : src1);\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void select_I8_F32_F32toF32_2D(\n\ + __read_only image2d_t condition,\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 value = read_imagei(condition, coord);\n\ + float4 src0 = read_imagef(input0, coord);\n\ + float4 src1 = read_imagef(input1, coord);\n\ + float4 dst = (value != 0 ? src0 : src1);\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +"; /* end of select_cl*/ + +static const char swish_cl[] = "float sigmoid_(float x, float logE)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +\n\ +#define SWISH_F32_F32_PROCESS() \\\n\ + float4 src, tmp, dst; \\\n\ + src = read_imagef(input, coord); \\\n\ + tmp.x = sigmoid_(src.x * beta, logE); \\\n\ + dst.x = src.x * tmp.x; \\\n\ + write_imagef(output, coord, dst);\n\ +\n\ +__kernel void swish_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP,\n\ + float beta,\n\ + float logE)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + SWISH_F32_F32_PROCESS()\n\ +}\n\ +\n\ +__kernel void swish_F32toF32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP,\n\ + float beta,\n\ + float logE)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + SWISH_F32_F32_PROCESS()\n\ +}\n\ +\n\ +\n\ +#define SWISH_U8_U8_PROCESS() \\\n\ + float4 src, tmp, data; \\\n\ + uint4 src0 = read_imageui(input, coord); \\\n\ + src = convert_float4(src0) * inputScale - inputTail; \\\n\ + tmp.x = sigmoid_(src.x * beta, logE); \\\n\ + data.x = src.x * tmp.x; \\\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ + write_imageui(output, coord, dst);\n\ +\n\ +__kernel void swish_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP,\n\ + float beta,\n\ + float logE)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + SWISH_U8_U8_PROCESS()\n\ +}\n\ +\n\ +__kernel void swish_U8toU8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP,\n\ + float beta,\n\ + float logE)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + SWISH_U8_U8_PROCESS()\n\ +}\n\ +\n\ +\n\ +#define SWISH_I32_I32_PROCESS() \\\n\ + float4 
src, tmp, data; \\\n\ + int4 src0 = read_imagei(input, coord); \\\n\ + src = convert_float4(src0); \\\n\ + tmp.x = sigmoid_(src.x * beta, logE); \\\n\ + data.x = src.x * tmp.x; \\\n\ + int4 dst = convert_int4(data); \\\n\ + write_imagei(output, coord, dst);\n\ +\n\ +__kernel void swish_I32toI32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP,\n\ + float beta,\n\ + float logE)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + SWISH_I32_I32_PROCESS()\n\ +}\n\ +\n\ +__kernel void swish_I32toI32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP,\n\ + float beta,\n\ + float logE)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + SWISH_I32_I32_PROCESS()\n\ +}\n\ +"; /* end of swish_cl*/ + +static const char tile_cl[] = "\n\ +#define TILE_3D(name0, name1, data_type, write_image_func) \\\n\ +__kernel void tile_##name0##to##name1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int batchIn, \\\n\ + int depthIn, \\\n\ + int depthOut, \\\n\ + int multiples_0, \\\n\ + int multiples_1, \\\n\ + int multiples_2, \\\n\ + int multiples_3 \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out; \\\n\ + int width = get_image_width(input); \\\n\ + int height = get_image_height(input); \\\n\ + \\\n\ + data_type src; \\\n\ + readImage2DArray(src, input, coord); \\\n\ + \\\n\ + int batch_id = (short)coord.z / (short)depthIn; \\\n\ + coord.z = (short)coord.z % (short)depthIn; \\\n\ + coord_out = coord; \\\n\ + \\\n\ + for (int w = 0; w < multiples_3; w++) \\\n\ + { \\\n\ + int batch = batchIn * w + batch_id; \\\n\ + \\\n\ + for(int z = 0; z < multiples_2; z++) \\\n\ + { \\\n\ + coord_out.z = coord.z + z * depthIn + batch * depthOut; \\\n\ + \\\n\ + for (int y = 0; y < multiples_1; y++) \\\n\ + { \\\n\ + coord_out.y = coord.y + y * height; \\\n\ + \\\n\ + for (int x = 0; x < multiples_0; x++) \\\n\ + { \\\n\ + coord_out.x = coord.x + x * width; \\\n\ + write_image_func(output, coord_out.xyzw, src); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ +}\n\ +TILE_3D(I32, I32, int4, write_imagei)\n\ +TILE_3D(U32, U32, uint4, write_imageui)\n\ +TILE_3D(F32, F32, float4, write_imagef)\n\ +\n\ +#define TILE_2D(name0, name1, data_type) \\\n\ +__kernel void tile_##name0##to##name1##_2D \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int batchIn, \\\n\ + int depthIn, \\\n\ + int depthOut, \\\n\ + int multiples_0, \\\n\ + int multiples_1, \\\n\ + int multiples_2, \\\n\ + int multiples_3 \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + int width = get_image_width(input); \\\n\ + int height = get_image_height(input); \\\n\ + int output_width = get_image_width(output); \\\n\ + int output_height = get_image_height(output); \\\n\ + \\\n\ + data_type src; \\\n\ + readImage(src, input, coord); \\\n\ + \\\n\ + do \\\n\ + { \\\n\ + do \\\n\ + { \\\n\ + writeImage(output, coord, src); \\\n\ + coord.x += width; \\\n\ + } while (coord.x < output_width); \\\n\ + coord.x = get_global_id(0); \\\n\ + coord.y += height; \\\n\ + } while (coord.y < output_height); \\\n\ +}\n\ +TILE_2D(I32, I32, int4)\n\ +TILE_2D(U32, U32, uint4)\n\ 
+TILE_2D(F32, F32, float4)\n\ +\n\ +\n\ +\n\ +"; /* end of tile_cl*/ + +static const char upsample_cl[] = "\n\ +#define UPSAMPLE_PROCESS(data_type, read_fun, write_fun) \\\n\ + data_type src = 0; \\\n\ + data_type dst = 0; \\\n\ + uint4 axis = 0; \\\n\ + src.x = read_fun(input, coord_in).x; \\\n\ + axis.x = read_imageui(inaxis, coord_in).x; \\\n\ + dst.x = axis.x == 0 ? src.x : 0; \\\n\ + write_fun(output, coord_out, dst); \\\n\ + dst.x = axis.x == 1 ? src.x : 0; \\\n\ + coord_out.x++; \\\n\ + write_fun(output, coord_out, dst); \\\n\ + dst.x = axis.x == 3 ? src.x : 0; \\\n\ + coord_out.y++; \\\n\ + write_fun(output, coord_out, dst); \\\n\ + dst.x = axis.x == 2 ? src.x : 0; \\\n\ + coord_out.x--; \\\n\ + write_fun(output, coord_out, dst);\n\ +\n\ +\n\ +__kernel void upsample_F32_U8to_F32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t inaxis,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + UPSAMPLE_PROCESS(float4, read_imagef, write_imagef)\n\ +}\n\ +\n\ +__kernel void upsample_F32_U8to_F32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t inaxis,\n\ + __write_only image2d_t output)\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0) << 1, get_global_id(1) << 1);\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1));\n\ + UPSAMPLE_PROCESS(float4, read_imagef, write_imagef)\n\ +}\n\ +\n\ +__kernel void upsample_I32_U8to_I32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t inaxis,\n\ + __write_only image2d_array_t output)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + UPSAMPLE_PROCESS(int4, read_imagei, write_imagei)\n\ +}\n\ +\n\ +__kernel void upsample_I32_U8to_I32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t inaxis,\n\ + __write_only image2d_t output)\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0) << 1, get_global_id(1) << 1);\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1));\n\ + UPSAMPLE_PROCESS(int4, read_imagei, write_imagei)\n\ +}\n\ +\n\ +\n\ +#define UPSAMPLE_U8_PROCESS() \\\n\ + uint4 src = 0; \\\n\ + uint4 dst = 0; \\\n\ + uint4 axis = 0; \\\n\ + float4 result = 0.0f; \\\n\ + uint output_zp = (uint)zp_out; \\\n\ + src.x = read_imageui(input, coord_in).x; \\\n\ + axis.x = read_imageui(inaxis, coord_in).x; \\\n\ + result.x = convert_float4(src).x * scale_value + tail_value; \\\n\ + src = convert_uint4(result);\\\n\ + dst.x = axis.x == 0 ? src.x : output_zp; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + dst.x = axis.x == 1 ? src.x : output_zp; \\\n\ + coord_out.x++; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + dst.x = axis.x == 3 ? src.x : output_zp; \\\n\ + coord_out.y++; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + dst.x = axis.x == 2 ? 
src.x : output_zp; \\\n\ + coord_out.x--; \\\n\ + write_imageui(output, coord_out, dst);\n\ +\n\ +\n\ +__kernel void upsample_U8_U8to_U8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t inaxis,\n\ + __write_only image2d_array_t output,\n\ + float scale_value,\n\ + float tail_value,\n\ + int zp_out)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + UPSAMPLE_U8_PROCESS()\n\ +}\n\ +\n\ +__kernel void upsample_U8_U8to_U8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t inaxis,\n\ + __write_only image2d_t output,\n\ + float scale_value,\n\ + float tail_value,\n\ + int zp_out)\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0) << 1, get_global_id(1) << 1);\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1));\n\ + UPSAMPLE_U8_PROCESS()\n\ +}\n\ +\n\ +#define UPSAMPLE_U8_TO_F32PROCESS() \\\n\ + uint4 src = 0; \\\n\ + float4 dst = 0; \\\n\ + uint4 axis = 0; \\\n\ + float4 result = 0.0f; \\\n\ + src.x = read_imageui(input, coord_in).x; \\\n\ + axis.x = read_imageui(inaxis, coord_in).x; \\\n\ + result.x = convert_float4(src).x * scale_value + tail_value; \\\n\ + dst.x = axis.x == 0 ? result.x : 0.0f; \\\n\ + write_imagef(output, coord_out, dst); \\\n\ + dst.x = axis.x == 1 ? result.x : 0.0f; \\\n\ + coord_out.x++; \\\n\ + write_imagef(output, coord_out, dst); \\\n\ + dst.x = axis.x == 3 ? result.x : 0.0f; \\\n\ + coord_out.y++; \\\n\ + write_imagef(output, coord_out, dst); \\\n\ + dst.x = axis.x == 2 ? result.x : 0.0f; \\\n\ + coord_out.x--; \\\n\ + write_imagef(output, coord_out, dst);\n\ +\n\ +\n\ +__kernel void upsample_U8_U8to_F32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t inaxis,\n\ + __write_only image2d_array_t output,\n\ + float scale_value,\n\ + float tail_value,\n\ + int zp_out)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + UPSAMPLE_U8_TO_F32PROCESS()\n\ +}\n\ +\n\ +__kernel void upsample_U8_U8to_F32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t inaxis,\n\ + __write_only image2d_t output,\n\ + float scale_value,\n\ + float tail_value,\n\ + int zp_out)\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0) << 1, get_global_id(1) << 1);\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1));\n\ + UPSAMPLE_U8_TO_F32PROCESS()\n\ +}\n\ +\n\ +\n\ +#define UPSAMPLE_F32_TO_U8_PROCESS() \\\n\ + uint4 src = 0; \\\n\ + uint4 dst = 0; \\\n\ + uint4 axis = 0; \\\n\ + float4 result = 0.0f; \\\n\ + uint output_zp = (uint)zp_out; \\\n\ + result.x = read_imagef(input, coord_in).x; \\\n\ + axis.x = read_imageui(inaxis, coord_in).x; \\\n\ + result.x = result.x * scale_value + tail_value; \\\n\ + src = convert_uint4(result);\\\n\ + dst.x = axis.x == 0 ? src.x : output_zp; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + dst.x = axis.x == 1 ? src.x : output_zp; \\\n\ + coord_out.x++; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + dst.x = axis.x == 3 ? src.x : output_zp; \\\n\ + coord_out.y++; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + dst.x = axis.x == 2 ? 
src.x : output_zp; \\\n\ + coord_out.x--; \\\n\ + write_imageui(output, coord_out, dst);\n\ +\n\ +\n\ +__kernel void upsample_F32_U8to_U8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t inaxis,\n\ + __write_only image2d_array_t output,\n\ + float scale_value,\n\ + float tail_value,\n\ + int zp_out)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0) << 1, get_global_id(1) << 1, get_global_id(2), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + UPSAMPLE_F32_TO_U8_PROCESS()\n\ +}\n\ +\n\ +__kernel void upsample_F32_U8to_U8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t inaxis,\n\ + __write_only image2d_t output,\n\ + float scale_value,\n\ + float tail_value,\n\ + int zp_out)\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0) << 1, get_global_id(1) << 1);\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1));\n\ + UPSAMPLE_F32_TO_U8_PROCESS()\n\ +}\n\ +"; /* end of upsample_cl*/ + + +typedef struct { + char const* name; + char const* data; +} source_map_t; + +static const source_map_t evis_resource[] = +{ + {"a_times_b_plus_c_vx", a_times_b_plus_c_vx}, + {"add_mean_std_norm_vx", add_mean_std_norm_vx}, + {"argmax_axis0_vx", argmax_axis0_vx}, + {"argmax_axis1_vx", argmax_axis1_vx}, + {"argmax_axis2_vx", argmax_axis2_vx}, + {"argmin_axis0_vx", argmin_axis0_vx}, + {"argmin_axis1_vx", argmin_axis1_vx}, + {"argmin_axis2_vx", argmin_axis2_vx}, + {"batchnorm_single_vx", batchnorm_single_vx}, + {"cast_vx", cast_vx}, + {"clip_F16_vx", clip_F16_vx}, + {"clip_I16_vx", clip_I16_vx}, + {"clip_I8_vx", clip_I8_vx}, + {"clip_U8_vx", clip_U8_vx}, + {"depth2space_crd_vx", depth2space_crd_vx}, + {"depthwise_conv1d_src0_vx", depthwise_conv1d_src0_vx}, + {"depthwise_conv1d_src1_vx", depthwise_conv1d_src1_vx}, + {"depthwise_conv1d_src2_vx", depthwise_conv1d_src2_vx}, + {"depthwise_conv1d_src3_vx", depthwise_conv1d_src3_vx}, + {"detect_post_box_vx", detect_post_box_vx}, + {"eltwise_unary_2d_vx", eltwise_unary_2d_vx}, + {"eltwise_unary_3d_vx", eltwise_unary_3d_vx}, + {"floordiv_vx", floordiv_vx}, + {"gather_vx", gather_vx}, + {"gather_mix_vx", gather_mix_vx}, + {"gather_nd_vx", gather_nd_vx}, + {"gather_nd_2d_vx", gather_nd_2d_vx}, + {"gather_nd_2d_mix_vx", gather_nd_2d_mix_vx}, + {"gather_nd_3d_vx", gather_nd_3d_vx}, + {"gather_nd_3d_mix_vx", gather_nd_3d_mix_vx}, + {"gather_nd_mix_vx", gather_nd_mix_vx}, + {"grucell_activation_vx", grucell_activation_vx}, + {"grucell_activation_sma_vx", grucell_activation_sma_vx}, + {"grucell_cdnn_activation_vx", grucell_cdnn_activation_vx}, + {"grucell_cdnn_activation_u8_vx", grucell_cdnn_activation_u8_vx}, + {"hswish_vx", hswish_vx}, + {"instance_normalization_f16_vx", instance_normalization_f16_vx}, + {"instance_normalization_i16_vx", instance_normalization_i16_vx}, + {"instance_normalization_i8_vx", instance_normalization_i8_vx}, + {"instance_normalization_u8_vx", instance_normalization_u8_vx}, + {"l2normalizescale_axis0_vx", l2normalizescale_axis0_vx}, + {"l2normalizescale_axis1_vx", l2normalizescale_axis1_vx}, + {"log_softmax_axis0_vx", log_softmax_axis0_vx}, + {"log_softmax_axis0_BF16_vx", log_softmax_axis0_BF16_vx}, + {"log_softmax_axis1_vx", log_softmax_axis1_vx}, + {"log_softmax_axis1_BF16_vx", log_softmax_axis1_BF16_vx}, + {"log_softmax_axis2_vx", log_softmax_axis2_vx}, + {"logical_not_vx", logical_not_vx}, + {"logical_ops_vx", logical_ops_vx}, + {"lstmunit_activation_BP_F16_vx", lstmunit_activation_BP_F16_vx}, + {"lstmunit_activation_BP_U8_vx", 
lstmunit_activation_BP_U8_vx}, + {"lstmunit_activation_B_F16_vx", lstmunit_activation_B_F16_vx}, + {"lstmunit_activation_B_U8_vx", lstmunit_activation_B_U8_vx}, + {"lstmunit_activation_CBP_F16_vx", lstmunit_activation_CBP_F16_vx}, + {"lstmunit_activation_CBP_U8_vx", lstmunit_activation_CBP_U8_vx}, + {"lstmunit_activation_CB_F16_vx", lstmunit_activation_CB_F16_vx}, + {"lstmunit_activation_CB_U8_vx", lstmunit_activation_CB_U8_vx}, + {"lstmunit_activation_CLP_F16_vx", lstmunit_activation_CLP_F16_vx}, + {"lstmunit_activation_CL_F16_vx", lstmunit_activation_CL_F16_vx}, + {"lstmunit_activation_CSP_F16_vx", lstmunit_activation_CSP_F16_vx}, + {"lstmunit_activation_CSP_U8_vx", lstmunit_activation_CSP_U8_vx}, + {"lstmunit_activation_CS_F16_vx", lstmunit_activation_CS_F16_vx}, + {"lstmunit_activation_CS_U8_vx", lstmunit_activation_CS_U8_vx}, + {"lstmunit_activation_LP_F16_vx", lstmunit_activation_LP_F16_vx}, + {"lstmunit_activation_L_F16_vx", lstmunit_activation_L_F16_vx}, + {"lstmunit_activation_SP_F16_vx", lstmunit_activation_SP_F16_vx}, + {"lstmunit_activation_SP_U8_vx", lstmunit_activation_SP_U8_vx}, + {"lstmunit_activation_S_F16_vx", lstmunit_activation_S_F16_vx}, + {"lstmunit_activation_S_U8_vx", lstmunit_activation_S_U8_vx}, + {"matrixmul_f16_vx", matrixmul_f16_vx}, + {"matrixmul_f16f16_u8_vx", matrixmul_f16f16_u8_vx}, + {"matrixmul_f16i16_i16_vx", matrixmul_f16i16_i16_vx}, + {"matrixmul_f16u8_f16_vx", matrixmul_f16u8_f16_vx}, + {"matrixmul_f16u8_u8_vx", matrixmul_f16u8_u8_vx}, + {"matrixmul_i16_vx", matrixmul_i16_vx}, + {"matrixmul_transA_vx", matrixmul_transA_vx}, + {"matrixmul_transB_f16_vx", matrixmul_transB_f16_vx}, + {"matrixmul_transB_f16_mix_vx", matrixmul_transB_f16_mix_vx}, + {"matrixmul_transB_u8_mix_vx", matrixmul_transB_u8_mix_vx}, + {"matrixmul_u8_vx", matrixmul_u8_vx}, + {"matrixmul_u8f16_f16_vx", matrixmul_u8f16_f16_vx}, + {"matrixmul_u8f16_u8_vx", matrixmul_u8f16_u8_vx}, + {"matrixmul_u8u8_f16_vx", matrixmul_u8u8_f16_vx}, + {"maximum_vx", maximum_vx}, + {"maximum_fp16_vx", maximum_fp16_vx}, + {"maximum_i16_vx", maximum_i16_vx}, + {"minimum_vx", minimum_vx}, + {"minimum_fp16_vx", minimum_fp16_vx}, + {"minimum_i16_vx", minimum_i16_vx}, + {"moments_axis0_vx", moments_axis0_vx}, + {"moments_axis01_vx", moments_axis01_vx}, + {"moments_axis012_vx", moments_axis012_vx}, + {"moments_axis1_vx", moments_axis1_vx}, + {"moments_axis2_vx", moments_axis2_vx}, + {"poolwithargmax_F16_vx", poolwithargmax_F16_vx}, + {"poolwithargmax_I16_vx", poolwithargmax_I16_vx}, + {"poolwithargmax_I8_vx", poolwithargmax_I8_vx}, + {"poolwithargmax_U8_vx", poolwithargmax_U8_vx}, + {"pow_fp16_vx", pow_fp16_vx}, + {"pow_fp16_i16_vx", pow_fp16_i16_vx}, + {"pow_fp16_i8_vx", pow_fp16_i8_vx}, + {"pow_i16_vx", pow_i16_vx}, + {"pow_i8_vx", pow_i8_vx}, + {"pow_u8_vx", pow_u8_vx}, + {"pre_process_bgra_vx", pre_process_bgra_vx}, + {"pre_process_bgra_trans_vx", pre_process_bgra_trans_vx}, + {"pre_process_gray_vx", pre_process_gray_vx}, + {"pre_process_gray_copy_vx", pre_process_gray_copy_vx}, + {"pre_process_nv12_scale_vx", pre_process_nv12_scale_vx}, + {"pre_process_nv12_scale_8bits_vx", pre_process_nv12_scale_8bits_vx}, + {"pre_process_nv12_scale_mix_vx", pre_process_nv12_scale_mix_vx}, + {"pre_process_nv12_trans_u8_vx", pre_process_nv12_trans_u8_vx}, + {"pre_process_rgb_vx", pre_process_rgb_vx}, + {"pre_process_rgb_copy_vx", pre_process_rgb_copy_vx}, + {"pre_process_rgb_copy_trans_vx", pre_process_rgb_copy_trans_vx}, + {"pre_process_rgb_trans_vx", pre_process_rgb_trans_vx}, + {"pre_process_yuv420_copy_u8_vx", 
pre_process_yuv420_copy_u8_vx}, + {"pre_process_yuv420_scale_fp16_vx", pre_process_yuv420_scale_fp16_vx}, + {"pre_process_yuv420_scale_i16_vx", pre_process_yuv420_scale_i16_vx}, + {"pre_process_yuv420_scale_i8_vx", pre_process_yuv420_scale_i8_vx}, + {"pre_process_yuv420_scale_u8_vx", pre_process_yuv420_scale_u8_vx}, + {"pre_process_yuv420_trans_u8_vx", pre_process_yuv420_trans_u8_vx}, + {"pre_process_yuv444_copy_u8_vx", pre_process_yuv444_copy_u8_vx}, + {"pre_process_yuv444_scale_vx", pre_process_yuv444_scale_vx}, + {"pre_process_yuv444_scale_fp16_vx", pre_process_yuv444_scale_fp16_vx}, + {"pre_process_yuv444_trans_u8_vx", pre_process_yuv444_trans_u8_vx}, + {"prelu_vx", prelu_vx}, + {"prelu_BF16_vx", prelu_BF16_vx}, + {"random_multinomial_vx", random_multinomial_vx}, + {"reduceall_internal_axis0_vx", reduceall_internal_axis0_vx}, + {"reduceall_internal_axis1_vx", reduceall_internal_axis1_vx}, + {"reduceall_internal_axis2_vx", reduceall_internal_axis2_vx}, + {"reduceany_internal_axis0_vx", reduceany_internal_axis0_vx}, + {"reduceany_internal_axis1_vx", reduceany_internal_axis1_vx}, + {"reduceany_internal_axis2_vx", reduceany_internal_axis2_vx}, + {"reducemax_internal_axis0_vx", reducemax_internal_axis0_vx}, + {"reducemax_internal_axis1_vx", reducemax_internal_axis1_vx}, + {"reducemax_internal_axis2_vx", reducemax_internal_axis2_vx}, + {"reducemin_internal_axis0_vx", reducemin_internal_axis0_vx}, + {"reducemin_internal_axis1_vx", reducemin_internal_axis1_vx}, + {"reducemin_internal_axis2_vx", reducemin_internal_axis2_vx}, + {"reduceprod_internal_axis0_vx", reduceprod_internal_axis0_vx}, + {"reduceprod_internal_axis1_vx", reduceprod_internal_axis1_vx}, + {"reduceprod_internal_axis2_vx", reduceprod_internal_axis2_vx}, + {"relational_ops_2d_vx", relational_ops_2d_vx}, + {"relational_ops_3d_vx", relational_ops_3d_vx}, + {"relu_keras_vx", relu_keras_vx}, + {"resize_bilinear_BF16_vx", resize_bilinear_BF16_vx}, + {"resize_bilinear_F16_vx", resize_bilinear_F16_vx}, + {"resize_bilinear_I16_vx", resize_bilinear_I16_vx}, + {"resize_bilinear_I8_vx", resize_bilinear_I8_vx}, + {"resize_bilinear_U8_vx", resize_bilinear_U8_vx}, + {"resize_bilinear_U8_UP_2X_vx", resize_bilinear_U8_UP_2X_vx}, + {"resize_bilinear_U8_opt_vx", resize_bilinear_U8_opt_vx}, + {"resize_nearest_vx", resize_nearest_vx}, + {"scatter_nd_vx", scatter_nd_vx}, + {"scatter_nd_big_vx", scatter_nd_big_vx}, + {"select_vx", select_vx}, + {"swish_vx", swish_vx}, + {"tile_vx", tile_vx}, + {"tile_mix_vx", tile_mix_vx}, + {"upsample_F16_vx", upsample_F16_vx}, + {"upsample_I16_vx", upsample_I16_vx}, + {"upsample_I8_vx", upsample_I8_vx}, + {"upsample_U8_vx", upsample_U8_vx}, + {"vsi_nn_kernel_axis_aligned_bbox_transform_vx", vsi_nn_kernel_axis_aligned_bbox_transform_vx}, + {"vsi_nn_kernel_box_with_nms_limit_vx", vsi_nn_kernel_box_with_nms_limit_vx}, + {"vsi_nn_kernel_crop_vx", vsi_nn_kernel_crop_vx}, + {"vsi_nn_kernel_detection_postprocess_vx", vsi_nn_kernel_detection_postprocess_vx}, + {"vsi_nn_kernel_extra_ending_vx", vsi_nn_kernel_extra_ending_vx}, + {"vsi_nn_kernel_fullconnect2_vx", vsi_nn_kernel_fullconnect2_vx}, + {"vsi_nn_kernel_generate_proposals_vx", vsi_nn_kernel_generate_proposals_vx}, + {"vsi_nn_kernel_header_vx", vsi_nn_kernel_header_vx}, + {"vsi_nn_kernel_heatmap_max_keypoint_vx", vsi_nn_kernel_heatmap_max_keypoint_vx}, + {"vsi_nn_kernel_imageprocess_vx", vsi_nn_kernel_imageprocess_vx}, + {"vsi_nn_kernel_imageprocess_2_vx", vsi_nn_kernel_imageprocess_2_vx}, + {"vsi_nn_kernel_imageprocess_3_vx", vsi_nn_kernel_imageprocess_3_vx}, + 
{"vsi_nn_kernel_imageprocess_4_vx", vsi_nn_kernel_imageprocess_4_vx}, + {"vsi_nn_kernel_imageprocess_5_vx", vsi_nn_kernel_imageprocess_5_vx}, + {"vsi_nn_kernel_layernormalize_vx", vsi_nn_kernel_layernormalize_vx}, + {"vsi_nn_kernel_layernormalize_U8_vx", vsi_nn_kernel_layernormalize_U8_vx}, + {"vsi_nn_kernel_resize_vx", vsi_nn_kernel_resize_vx}, + {"vsi_nn_kernel_roi_align_vx", vsi_nn_kernel_roi_align_vx}, + {"vsi_nn_kernel_scale_vx", vsi_nn_kernel_scale_vx}, + {"vsi_nn_kernel_shufflechannel_vx", vsi_nn_kernel_shufflechannel_vx}, + {"vsi_nn_kernel_shufflechannel_axis1_vx", vsi_nn_kernel_shufflechannel_axis1_vx}, + {"vsi_nn_kernel_signalframe_vx", vsi_nn_kernel_signalframe_vx}, + {"vsi_nn_kernel_space2depth_vx", vsi_nn_kernel_space2depth_vx}, + {"vsi_nn_kernel_tensorstackconcat_vx", vsi_nn_kernel_tensorstackconcat_vx}, + {"vsi_nn_kernel_topk_vx", vsi_nn_kernel_topk_vx}, + {"vsi_nn_kernel_transform_gemm_vx", vsi_nn_kernel_transform_gemm_vx}, + {"vsi_nn_kernel_transform_interp_vx", vsi_nn_kernel_transform_interp_vx}, + {"vsi_nn_kernel_transform_setupThres_vx", vsi_nn_kernel_transform_setupThres_vx}, +}; + +static const source_map_t cl_resource[] = +{ + {"add_mean_std_norm_cl", add_mean_std_norm_cl}, + {"argmax_axis0_cl", argmax_axis0_cl}, + {"argmax_axis1_cl", argmax_axis1_cl}, + {"argmax_axis2_cl", argmax_axis2_cl}, + {"argmin_axis0_cl", argmin_axis0_cl}, + {"argmin_axis1_cl", argmin_axis1_cl}, + {"argmin_axis2_cl", argmin_axis2_cl}, + {"batchnorm_single_cl", batchnorm_single_cl}, + {"cast_cl", cast_cl}, + {"clip_F32_cl", clip_F32_cl}, + {"clip_U8_cl", clip_U8_cl}, + {"detect_post_box_cl", detect_post_box_cl}, + {"eltwise_ops_helper_cl", eltwise_ops_helper_cl}, + {"eltwise_unary_cl", eltwise_unary_cl}, + {"floordiv_cl", floordiv_cl}, + {"gather_cl", gather_cl}, + {"gather_nd_cl", gather_nd_cl}, + {"gather_nd_3d_cl", gather_nd_3d_cl}, + {"grucell_activation_cl", grucell_activation_cl}, + {"grucell_activation_sma_cl", grucell_activation_sma_cl}, + {"hswish_cl", hswish_cl}, + {"instance_normalization_f16_cl", instance_normalization_f16_cl}, + {"instance_normalization_f32_cl", instance_normalization_f32_cl}, + {"instance_normalization_i32_cl", instance_normalization_i32_cl}, + {"instance_normalization_u8_cl", instance_normalization_u8_cl}, + {"l2normalizescale_axis0_cl", l2normalizescale_axis0_cl}, + {"l2normalizescale_axis1_cl", l2normalizescale_axis1_cl}, + {"log_softmax_axis0_cl", log_softmax_axis0_cl}, + {"log_softmax_axis1_cl", log_softmax_axis1_cl}, + {"log_softmax_axis2_cl", log_softmax_axis2_cl}, + {"logical_not_cl", logical_not_cl}, + {"logical_ops_cl", logical_ops_cl}, + {"lstmunit_activation_BP_F32_cl", lstmunit_activation_BP_F32_cl}, + {"lstmunit_activation_BP_U8_cl", lstmunit_activation_BP_U8_cl}, + {"lstmunit_activation_B_F32_cl", lstmunit_activation_B_F32_cl}, + {"lstmunit_activation_B_U8_cl", lstmunit_activation_B_U8_cl}, + {"lstmunit_activation_CBP_F32_cl", lstmunit_activation_CBP_F32_cl}, + {"lstmunit_activation_CBP_U8_cl", lstmunit_activation_CBP_U8_cl}, + {"lstmunit_activation_CB_F32_cl", lstmunit_activation_CB_F32_cl}, + {"lstmunit_activation_CB_U8_cl", lstmunit_activation_CB_U8_cl}, + {"lstmunit_activation_CLP_F32_cl", lstmunit_activation_CLP_F32_cl}, + {"lstmunit_activation_CLP_U8_cl", lstmunit_activation_CLP_U8_cl}, + {"lstmunit_activation_CL_F32_cl", lstmunit_activation_CL_F32_cl}, + {"lstmunit_activation_CL_U8_cl", lstmunit_activation_CL_U8_cl}, + {"lstmunit_activation_CSP_F32_cl", lstmunit_activation_CSP_F32_cl}, + {"lstmunit_activation_CSP_U8_cl", 
lstmunit_activation_CSP_U8_cl}, + {"lstmunit_activation_CS_F32_cl", lstmunit_activation_CS_F32_cl}, + {"lstmunit_activation_CS_U8_cl", lstmunit_activation_CS_U8_cl}, + {"lstmunit_activation_LP_F32_cl", lstmunit_activation_LP_F32_cl}, + {"lstmunit_activation_L_F32_cl", lstmunit_activation_L_F32_cl}, + {"lstmunit_activation_SP_F32_cl", lstmunit_activation_SP_F32_cl}, + {"lstmunit_activation_SP_U8_cl", lstmunit_activation_SP_U8_cl}, + {"lstmunit_activation_S_F32_cl", lstmunit_activation_S_F32_cl}, + {"lstmunit_activation_S_U8_cl", lstmunit_activation_S_U8_cl}, + {"matrixmul_cl", matrixmul_cl}, + {"matrixmul_transA_cl", matrixmul_transA_cl}, + {"maximum_cl", maximum_cl}, + {"minimum_cl", minimum_cl}, + {"moments_axis0_cl", moments_axis0_cl}, + {"moments_axis01_cl", moments_axis01_cl}, + {"moments_axis012_cl", moments_axis012_cl}, + {"moments_axis1_cl", moments_axis1_cl}, + {"moments_axis2_cl", moments_axis2_cl}, + {"poolwithargmax_cl", poolwithargmax_cl}, + {"pow_cl", pow_cl}, + {"prelu_cl", prelu_cl}, + {"random_multinomial_cl", random_multinomial_cl}, + {"reduceall_internal_axis0_cl", reduceall_internal_axis0_cl}, + {"reduceall_internal_axis1_cl", reduceall_internal_axis1_cl}, + {"reduceall_internal_axis2_cl", reduceall_internal_axis2_cl}, + {"reduceany_internal_axis0_cl", reduceany_internal_axis0_cl}, + {"reduceany_internal_axis1_cl", reduceany_internal_axis1_cl}, + {"reduceany_internal_axis2_cl", reduceany_internal_axis2_cl}, + {"reducemax_internal_axis0_cl", reducemax_internal_axis0_cl}, + {"reducemax_internal_axis1_cl", reducemax_internal_axis1_cl}, + {"reducemax_internal_axis2_cl", reducemax_internal_axis2_cl}, + {"reducemin_internal_axis0_cl", reducemin_internal_axis0_cl}, + {"reducemin_internal_axis1_cl", reducemin_internal_axis1_cl}, + {"reducemin_internal_axis2_cl", reducemin_internal_axis2_cl}, + {"reduceprod_internal_axis0_cl", reduceprod_internal_axis0_cl}, + {"reduceprod_internal_axis1_cl", reduceprod_internal_axis1_cl}, + {"reduceprod_internal_axis2_cl", reduceprod_internal_axis2_cl}, + {"relational_ops_cl", relational_ops_cl}, + {"relu_keras_cl", relu_keras_cl}, + {"resize_bilinear_cl", resize_bilinear_cl}, + {"resize_nearest_cl", resize_nearest_cl}, + {"scatter_nd_cl", scatter_nd_cl}, + {"select_cl", select_cl}, + {"swish_cl", swish_cl}, + {"tile_cl", tile_cl}, + {"upsample_cl", upsample_cl}, +}; + +static const char* _load_code + ( + const char* source_name, + size_t* size, + const source_map_t* source_map, + size_t source_map_size, + const char* tail + ) +{ + const char* source; + char source_path[VSI_NN_MAX_PATH]; + size_t n; + int i; + source = NULL; + n = snprintf( source_path, VSI_NN_MAX_PATH, "%s%s", source_name, tail ); + if( n == VSI_NN_MAX_PATH ) + { + VSILOGE("Kernel source path overflow %d/%d", n, VSI_NN_MAX_PATH); + *size = 0; + return NULL; + } + for( i = 0; i < (int)source_map_size; i++ ) + { + if( strncmp( source_map[i].name, source_path, VSI_NN_MAX_PATH ) == 0 ) + { + source = source_map[i].data; + *size = strlen( source ); + break; + } + } + if( !source ) + { + *size = 0; + } + return source; +} /* _load_code() */ + +const char* vsi_nn_resource_load_source_code + ( + const char* source_name, + size_t* size, + vsi_nn_kernel_type_e type + ) +{ + const char* s = NULL; + switch( type ) + { + case VSI_NN_KERNEL_TYPE_EVIS: + s = _load_code( source_name, size, + evis_resource, _cnt_of_array(evis_resource), "_vx" ); + break; + case VSI_NN_KERNEL_TYPE_CL: + s = _load_code( source_name, size, + cl_resource, _cnt_of_array(cl_resource), "_cl" ); + break; + default: + 
break; + } + return s; +} /* vsi_nn_resource_load_source_code() */ diff --git a/src/tim/vx/internal/src/makefile.linux b/src/tim/vx/internal/src/makefile.linux new file mode 100644 index 0000000..86691ef --- /dev/null +++ b/src/tim/vx/internal/src/makefile.linux @@ -0,0 +1,129 @@ +include $(AQROOT)/makefile.linux.def + +INCLUDE += -I$(VIVANTE_SDK_INC) -I$(VIVANTE_SDK_INC)/HAL -I$(AQROOT)/sdk/inc +INCLUDE += -I../include/ops -I../include/utils -I../include/inference +INCLUDE += -I../include/client -I../include -I../include/libnnext + +CFLAGS += $(INCLUDE) +CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Werror +CFLAGS += -fvisibility=hidden -D'OVXLIB_API=__attribute__((visibility("default")))' + +################################################################################ +# Supply necessary libraries. + +LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC +LIBS += -lm -ldl + +############################################################################# +# Macros. +ifeq ($(gcdSTATIC_LINK), 1) +STATIC=1 +TARGET_NAME = libovxlib.a +else +CFLAGS += -fPIC +DYNAMIC := 1 +TARGET_NAME = libovxlib.so +endif + +ifneq ("$(OVXLIB_CONFIG)", "") + CFLAGS += -D$(OVXLIB_CONFIG) +endif + +ifneq ($(gcdSTATIC_LINK), 1) + ifeq ($(VSI_GPERF_DEBUG), 1) + TCMALLOC_DIR = $(OVXLIB_DIR)/third-party/gperftools + CFLAGS += -I$(TCMALLOC_DIR)/src + CFLAGS += -I$(TCMALLOC_DIR)/src/gperftools + CFLAGS += -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free + CFLAGS += -g + LIBS += -L$(TCMALLOC_DIR)/.libs -ltcmalloc + endif +endif +############################################################################# +# Objects. +OBJECTS = $(OBJ_DIR)/vsi_nn_context.o \ + $(OBJ_DIR)/vsi_nn_client_op.o \ + $(OBJ_DIR)/vsi_nn_graph.o \ + $(OBJ_DIR)/vsi_nn_node_attr_template.o \ + $(OBJ_DIR)/vsi_nn_node.o \ + $(OBJ_DIR)/vsi_nn_ops.o \ + $(OBJ_DIR)/vsi_nn_daemon.o \ + $(OBJ_DIR)/vsi_nn_tensor.o \ + $(OBJ_DIR)/vsi_nn_version.o \ + $(OBJ_DIR)/vsi_nn_rnn.o \ + $(OBJ_DIR)/vsi_nn_rnn_helper.o \ + $(OBJ_DIR)/vsi_nn_internal_node.o \ + $(OBJ_DIR)/vsi_nn_log.o \ + $(OBJ_DIR)/vsi_nn_graph_optimization.o \ + $(OBJ_DIR)/vsi_nn_pre_post_process.o + +vpath %.c client +OBJECTS += $(OBJ_DIR)/vsi_nn_vxkernel.o + +vpath %.c utils +OBJECTS += $(OBJ_DIR)/vsi_nn_code_generator.o \ + $(OBJ_DIR)/vsi_nn_binary_tree.o \ + $(OBJ_DIR)/vsi_nn_map.o \ + $(OBJ_DIR)/vsi_nn_link_list.o \ + $(OBJ_DIR)/vsi_nn_math.o \ + $(OBJ_DIR)/vsi_nn_dtype_util.o \ + $(OBJ_DIR)/vsi_nn_shape_util.o \ + $(OBJ_DIR)/vsi_nn_dtype.o \ + $(OBJ_DIR)/vsi_nn_limits.o \ + $(OBJ_DIR)/vsi_nn_vdata.o \ + $(OBJ_DIR)/vsi_nn_util.o \ + $(OBJ_DIR)/vsi_nn_constraint_check.o \ + $(OBJ_DIR)/vsi_nn_hashmap.o \ + $(OBJ_DIR)/vsi_nn_tensor_op.o + +vpath %.c quantization +OBJECTS += $(OBJ_DIR)/vsi_nn_dynamic_fixed_point.o \ + $(OBJ_DIR)/vsi_nn_asymmetric_affine.o \ + $(OBJ_DIR)/vsi_nn_perchannel_symmetric_affine.o + +vpath %.c pycc +OBJECTS += $(OBJ_DIR)/vsi_pycc_interface.o + +vpath %.c post +OBJECTS += $(OBJ_DIR)/vsi_nn_post_fasterrcnn.o \ + $(OBJ_DIR)/vsi_nn_post_cmupose.o + +vpath %.c libnnext +OBJECTS += $(OBJ_DIR)/vsi_nn_libnnext_resource.o + +vpath %.c libnnext/ops/kernel +SRCS += ${notdir ${wildcard libnnext/ops/kernel/*.c}} + +vpath %.c ops +SRCS += ${notdir ${wildcard ops/*.c}} + +vpath %.c kernel +SRCS += ${notdir ${wildcard kernel/*.c}} + +vpath %.c kernel/cl +SRCS += ${notdir ${wildcard kernel/cl/*.c}} + +vpath %.c kernel/cpu +SRCS += ${notdir ${wildcard kernel/cpu/*.c}} + +vpath %.c kernel/evis +SRCS += ${notdir ${wildcard 
kernel/evis/*.c}} + +vpath %.c kernel/vx +SRCS += ${notdir ${wildcard kernel/vx/*.c}} + +vpath %.c custom/ops +SRCS += ${notdir ${wildcard custom/ops/*.c}} + +vpath %.c custom/ops/kernel +SRCS += ${notdir ${wildcard custom/ops/kernel/*.c}} + +OBJECTS += ${patsubst %.c, $(OBJ_DIR)/%.o, $(SRCS)} + +# installation directory +INSTALL_DIR := $(VIVANTE_SDK_LIB) + +################################################################################ +# Include the common makefile. + +include $(AQROOT)/common.target diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c b/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c new file mode 100644 index 0000000..96e084f --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c @@ -0,0 +1,113 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "a_times_b_plus_c", + inputs, 3, + outputs, 1, NULL ); + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_tensor_t * a_times_b[2] = {NULL}; + vsi_nn_tensor_attr_t attr; + vsi_bool ret = FALSE; + + memset(&attr, 0, sizeof(attr)); + memcpy(attr.size, outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof( uint32_t )); + attr.dim_num = outputs[0]->attr.dim_num; + attr.vtl = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + a_times_b[0] = vsi_nn_CreateTensor(self->graph, &attr); + ret = vsi_nn_OpCheck(VSI_NN_OP_MULTIPLY, self, inputs, a_times_b); + if (!ret) + { + goto final; + } + + a_times_b[1] = inputs[2]; + ret = vsi_nn_OpCheck(VSI_NN_OP_ADD, self, a_times_b, outputs); +final: + if (a_times_b[0]) vsi_nn_ReleaseTensor(&a_times_b[0]); + return ret; + +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ A_TIMES_B_PLUS_C, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_eltwise_setup, + /* optimize */ NULL, + /* input_num */ 3, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c new file mode 100644 index 0000000..220e778 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c @@ -0,0 +1,150 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + int32_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t dims = 0; + vx_tensor input = NULL, input0 = NULL; + vx_tensor output = NULL, output0 = NULL; + status = VSI_FAILURE; + + if (inputs[0]->attr.dim_num > 4) + { + input_size[0] = vsi_nn_GetElementNum(inputs[0]) / + inputs[0]->attr.size[inputs[0]->attr.dim_num - 1]; + input_size[1] = inputs[0]->attr.size[inputs[0]->attr.dim_num - 1]; + dims= 2; + input = vxReshapeTensor(inputs[0]->t, input_size, dims); + output = vxReshapeTensor(outputs[0]->t, input_size, dims); + input0 = input; + output0 = output; + } + else + { + input0 = inputs[0]->t; + output0 = outputs[0]->t; + } + + self->n = vxLeakyReluLayer( + self->graph->g, + input0, + -1, + output0 + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + if (input) vxReleaseTensor(&input); + if (output) vxReleaseTensor(&output); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(ABS, 1, 1) + /* IO_TYPE(INPUT, OUTPUT) */ + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_BF16) + + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(ABS) + if(!VALIDATE_OP_IO_TYPES(ABS, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ ABS, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c new file mode 100644 index 0000000..073d063 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c @@ -0,0 +1,210 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, 
publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_constraint_check.h" + +static int32_t _get_input_num + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs + ) +{ + int32_t num; + num = (int32_t)(self->input.num - 1); + while( num >= 0 && NULL == inputs[num] ) + { + num --; + } + if( 0 > num ) + { + return -1; + } + + num++; + return num; +} + +vsi_bool _is_float32_data_format + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t input_num = 0; + uint32_t i = 0; + + input_num = _get_input_num(self, inputs); + + if (outputs[0]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32) + { + return FALSE; + } + + for ( i = 0; i < input_num; i++) + { + if (inputs[i]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32) + { + return FALSE; + } + } + + return TRUE; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + uint32_t i; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* temp_output_tensor = NULL; + uint32_t input_num = 0; + + vsi_nn_internal_init_node_wksp( self ); + + input_num = _get_input_num(self, inputs); + for(i = 0; i < input_num -1; i++) + { + /* loop call add for input_num -1 times */ + + /* setup input for each add */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + if(i == 0) + { + curr->inputs[0] = inputs[i]; + } + else + { + curr->inputs[0] = temp_output_tensor->t; + } + curr->inputs[1] = inputs[i+1]; + + /* setup output for each add */ + if(i < input_num - 2) + { + memset(&attr, 0, sizeof(attr)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + if (_is_float32_data_format(self, inputs, outputs)) + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + } + else + { + 
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } + + temp_output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr->outputs[0] = temp_output_tensor->t; + } + else + { + curr->outputs[0] = outputs[0]; + } + + vsi_nn_internal_setup_node( self, curr ); + } + return ret; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_deinit_node_wksp( self ); + + return status; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ ADDN, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 2, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c new file mode 100644 index 0000000..8f115d9 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c @@ -0,0 +1,288 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status _argmaxmin_op_compute + ( + const char * kernel_name, + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; + uint32_t rank_out = 0; + int32_t axis = 0; + int32_t new_axis = 0; + uint32_t axis_size = 0; + vsi_bool ret; + vsi_nn_kernel_param_t * param = NULL; + + if( NULL == self ) + { + return VSI_FAILURE; + } + status = VSI_FAILURE; + + param =vsi_nn_kernel_param_create(); + if (strcmp(kernel_name, "argmax") == 0) + { + vsi_nn_argmax_param * p = &(self->nn_param.argmax); + axis = p->axis; + } + else + { + vsi_nn_argmin_param * p = &(self->nn_param.argmin); + axis = p->axis; + } + + // TODO: This optimzie is a hack for gpu path, + // it should be moved to gpu kernel setup. + ret = vsi_nn_kernel_optimize_reduce_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + &axis, 1, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], &rank_in, shapes[1], &rank_out, + &new_axis, &axis_size); + + // Add params + vsi_nn_kernel_param_add_int32( param, "axis", new_axis ); + + if( ret ) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shapes[1], rank_out ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, + &reshape_tensors[0], 1, + &reshape_tensors[1], 1, param ); + + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + } + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* _argmaxmin_op_compute() */ + +static vsi_bool _argmaxmin_op_setup + ( + const char * kernel_name, + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + int32_t axis = 0; + vsi_bool ret = TRUE; + + if (strcmp(kernel_name, "argmax") == 0) + { + vsi_nn_argmax_param * p = &(self->nn_param.argmax); + axis = p->axis; + + if (axis < 0) + { + axis = axis + inputs[0]->attr.dim_num; + p->axis = axis; + } + } + else + { + vsi_nn_argmin_param * p = &(self->nn_param.argmin); + axis = p->axis; + + if (axis < 0) + { + axis = axis + inputs[0]->attr.dim_num; + p->axis = axis; + } + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + uint32_t i = 0; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num - 1; + + for (i = 0; i < (uint32_t)axis; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + + for (i = axis; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i + 1]; + } + + if (inputs[0]->attr.dim_num == 1) + { + outputs[0]->attr.dim_num = 1; + outputs[0]->attr.size[0] = 1; + } + } + + return ret; +} /* _argmaxmin_op_setup() */ + + +static vsi_status _argmaxmin_op_init + ( + const char * kernel_name, + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + 
if (vsi_nn_compareVersion(self->graph, 1, 1, 11) == -1) + { + if (strcmp(kernel_name, "argmax") == 0) + { + vsi_nn_argmax_param * p = &(self->nn_param.argmax); + p->axis = 2; + } + else + { + vsi_nn_argmin_param * p = &(self->nn_param.argmin); + p->axis = 2; + } + } + + return status; +} /* _argmaxmin_op_init() */ + + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(ARGMIN, 1, 1) + IO_TYPE(D_F16, D_U8) + IO_TYPE(D_F16, D_I16) + IO_TYPE(D_BF16, D_U8) + IO_TYPE(D_BF16, D_I16) + IO_TYPE(D_I8|Q_DFP, D_U8) + IO_TYPE(D_I8|Q_DFP, D_I16) + IO_TYPE(D_U8|Q_ASYM, D_U8) + IO_TYPE(D_U8|Q_ASYM, D_I16) + IO_TYPE(D_I16|Q_DFP, D_U8) + IO_TYPE(D_I16|Q_DFP, D_I16) + IO_TYPE(D_F32, D_I32) + IO_TYPE(D_F16, D_I32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_I32) + END_IO_TYPE_DECL(ARGMIN) + if(!VALIDATE_OP_IO_TYPES(ARGMIN, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define DEF_ARG_MAX_MIN_OP(name, kernel_name) \ + static vsi_status op_compute_##kernel_name \ + ( \ + vsi_nn_node_t * self, \ + vsi_nn_tensor_t ** inputs, \ + vsi_nn_tensor_t ** outputs \ + ) \ + { \ + return _argmaxmin_op_compute( ""#kernel_name, self, inputs, outputs ); \ + } \ + static vsi_bool op_setup_##kernel_name \ + ( \ + vsi_nn_node_t * self, \ + vsi_nn_tensor_t ** inputs, \ + vsi_nn_tensor_t ** outputs \ + ) \ + { \ + return _argmaxmin_op_setup( ""#kernel_name, self, inputs, outputs ); \ + } \ + static vsi_status op_init_##kernel_name \ + ( \ + vsi_nn_node_t * self \ + ) \ + { \ + return _argmaxmin_op_init( ""#kernel_name, self ); \ + } \ +DEF_OP_REG \ + ( \ + /* op_name */ name, \ + /* init */ op_init_##kernel_name, \ + /* compute */ op_compute_##kernel_name, \ + /* deinit */ vsi_nn_op_common_deinit, \ + /* check */ op_check, \ + /* setup */ op_setup_##kernel_name, \ + /* optimize */ NULL, \ + /* input_num */ 1, \ + /* output_num */ 1 \ + ) +/* DEF_OP_REG(name, op_init_##kernel_name, op_compute_##kernel_name, \ + NULL, NULL, op_setup_##kernel_name, NULL, 1, 1)*/ + +DEF_ARG_MAX_MIN_OP( ARGMAX, argmax ); +DEF_ARG_MAX_MIN_OP( ARGMIN, argmin ); + + +#undef DEF_ARG_MAX_MIN_OP + +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c new file mode 100644 index 0000000..83b3664 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c @@ -0,0 +1,286 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
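For orientation: the file introduced just above, vsi_nn_op_axis_aligned_bbox_transform.c, does not map onto a built-in OpenVX layer. It registers a client (CPU) kernel and then binds its four input tensors and one output tensor as node parameters. The flow is sketched in simplified form below; names mirror the real code, but the resource-name setup and error paths are trimmed, so treat it as an illustration rather than a drop-in replacement.

```c
/* Simplified client-kernel dispatch, mirroring op_compute() in the file below
 * (resource-name setup and most error handling trimmed). */
static vsi_status cpu_dispatch_sketch(vsi_nn_node_t *self,
                                      vsi_nn_tensor_t **inputs,
                                      vsi_nn_tensor_t **outputs)
{
    vsi_nn_kernel_info_t kernel_info;
    vx_reference params[_IO_NUM];                    /* 4 inputs + 1 output */

    memset(&kernel_info, 0, sizeof(kernel_info));
    kernel_info.type         = VX_KERNEL_TYPE_CPU;   /* CPU reference path */
    kernel_info.kernel       = vx_kernel_AXIS_ALIGNED_BBOX_TRANSFORM_list;
    kernel_info.kernel_index = 0;                    /* CPU entry of the list */
    kernel_info.init_index   = 0;

    self->n = vsi_nn_RegisterClientKernelAndNewNode(self->graph, &kernel_info);
    if (self->n == NULL) return VSI_FAILURE;

    _set_inputs_outputs(params, inputs, outputs);    /* inputs first, then outputs */
    return vsi_nn_ClientNodePassParameters(self->n, params, _IO_NUM);
}
```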
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" + +#define _ARG_NUM (0) +#define _INPUT_NUM (4) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +extern vx_kernel_description_t * vx_kernel_AXIS_ALIGNED_BBOX_TRANSFORM_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status = VSI_SUCCESS; +#if 0 + vx_context ctx; + vsi_nn_axis_aligned_bbox_transform_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.axis_aligned_bbox_transform); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ + #define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_INT32, type ); + #undef _SET_PARAM +set_param_error: +#endif + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + /*TODO: Add code if need to change your parameter*/ + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute, + NULL +}; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_kernel_info_t kernel_info; + char *path = NULL; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + status = VSI_FAILURE; + kernel_info.type = VX_KERNEL_TYPE_CPU; + kernel_info.kernel = vx_kernel_AXIS_ALIGNED_BBOX_TRANSFORM_list; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_axis_aligned_bbox_transform"; + path = getenv("USER_VX_SOURCE_PATH"); + if(path) + vsi_nn_VxResourceSetPath(path); + + if( kernel_info.type == VX_KERNEL_TYPE_VX) + { + kernel_info.kernel_index = 1; + kernel_info.init_index = 1; + } + else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ + { + kernel_info.kernel_index = 0; + kernel_info.init_index = 0; + } + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) + { + free(kernel_info.resource_name); + } + if( NULL == self->n ) + { + return VSI_FAILURE; + } + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[0] = inputs[1]->attr.size[0]; + outputs[0]->attr.size[1] = inputs[1]->attr.size[1]; + outputs[0]->attr.dim_num = 2; + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ AXIS_ALIGNED_BBOX_TRANSFORM, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch2space.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch2space.c new file mode 100644 index 0000000..0865f49 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch2space.c @@ -0,0 +1,199 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
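The batch2space implementation that follows packs block_size and crop into small constant INT32 tensors and dispatches to vxReorgLayer2 with type VX_REORG_BATCH_TO_SPACE_ND; only 4D inputs are accepted. Its output-shape rule (from op_setup() further down) can be written as a small helper, shown here as a sketch that assumes the usual ovxlib W, H, C, N dimension order.

```c
#include <stdint.h>

/* Output-shape rule mirroring op_setup() of BATCH2SPACE below. */
static void batch2space_out_shape(const uint32_t in[4], const int32_t block[2],
                                  const int32_t crop[4], uint32_t out[4])
{
    out[0] = in[0] * block[0] - crop[0] - crop[1];  /* width grows by block  */
    out[1] = in[1] * block[1] - crop[2] - crop[3];  /* height grows by block */
    out[2] = in[2];                                 /* channels unchanged    */
    out[3] = in[3] / (block[0] * block[1]);         /* batch shrinks         */
}
```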
+* +*****************************************************************************/ +#include +#include +#include +#include +#include + +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_test.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vx_nn_reorg_params_ext_t param; + vsi_nn_tensor_t *block_size_tensor = NULL; + vsi_nn_tensor_t *pad_tensor = NULL; + vsi_nn_tensor_attr_t attr; + memset(¶m, 0, sizeof(vx_nn_reorg_params_ext_t)); + + memset(&attr, 0, sizeof(attr)); + attr.size[0] = 2; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + block_size_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)self->nn_param.batch2space.block_size, + &attr); + TEST_CHECK_PTR(block_size_tensor, final); + + memset(&attr, 0, sizeof(attr)); + attr.size[0] = 4; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + pad_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)self->nn_param.batch2space.crop, + &attr); + TEST_CHECK_PTR(pad_tensor, final); + + param.base.block_size = REQUIRED_IO(block_size_tensor); + param.pad = OPTIONAL_IO(pad_tensor); + param.base.type = VX_REORG_BATCH_TO_SPACE_ND; + self->n = vxReorgLayer2( self->graph->g, + inputs[0]->t, + (vx_nn_reorg_params_t *)¶m, + sizeof(vx_nn_reorg_params_ext_t), + outputs[0]->t); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + +final: + if (block_size_tensor) vsi_nn_ReleaseTensor(&block_size_tensor); + if (pad_tensor) vsi_nn_ReleaseTensor(&pad_tensor); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(BATCH2SPACE, 1, 1) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + END_IO_TYPE_DECL(BATCH2SPACE) + if (!VALIDATE_OP_IO_TYPES(BATCH2SPACE, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + if (inputs[0]->attr.dim_num != 4) + { + VSILOGE("batch2space only support 4D"); + return FALSE; + } + + if (self->nn_param.batch2space.block_size[0] < 0 + || self->nn_param.batch2space.block_size[1] < 0) + { + VSILOGE("Block size can't be less than zero in batch to space"); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_batch2space_param * p; + p = (vsi_nn_batch2space_param *)&(self->nn_param.batch2space); + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[3] = + inputs[0]->attr.size[3] / p->block_size[0] / p->block_size[1]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[1] = + inputs[0]->attr.size[1] * 
p->block_size[1] - p->crop[2] - p->crop[3]; + outputs[0]->attr.size[0] = + inputs[0]->attr.size[0] * p->block_size[0] - p->crop[0] - p->crop[1]; + outputs[0]->attr.dim_num = 4; + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.batch2space.local.block_size_tensor != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.batch2space.local.block_size_tensor)); + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ BATCH2SPACE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c new file mode 100644 index 0000000..c760898 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c @@ -0,0 +1,317 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
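The batch_norm op that follows wraps vxBatchNormalizationLayer. Before creating the node it marks any float32 scale/offset/mean/variance inputs as high-precision tensors, and, for graphs at ovxlib 1.1.12 or newer, reshapes a 3D input of shape (x, c, n) to 4D (x, 1, c, n) during op_optimize so the 4D layer can be reused. For reference, the per-element computation such a layer is expected to perform is the standard inference-time formula, sketched below as an assumption about the underlying layer rather than code taken from this file.

```c
#include <math.h>

/* Standard inference-time batch normalization for a single element. */
static float batch_norm_ref(float x, float mean, float variance,
                            float gamma, float beta, float eps)
{
    return gamma * (x - mean) / sqrtf(variance + eps) + beta;
}
```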
+* +*****************************************************************************/ +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status _try_set_high_presision_tensor + ( + vsi_nn_tensor_t **inputs + ) +{ + vsi_status status; + vsi_nn_vxtensor_attr_t attr; + + status = VSI_SUCCESS; + attr = VSI_NN_TENSOR_ATTR_HIGH_PRECISION; + + if(VSI_NN_TYPE_FLOAT32 == inputs[1]->attr.dtype.vx_type) + { + status = vsi_nn_SetTensorAttr(inputs[1], attr); + if(VSI_SUCCESS != status) + { + return status; + } + } + if(VSI_NN_TYPE_FLOAT32 == inputs[2]->attr.dtype.vx_type) + { + status = vsi_nn_SetTensorAttr(inputs[2], attr); + if(VSI_SUCCESS != status) + { + return status; + } + } + if(VSI_NN_TYPE_FLOAT32 == inputs[3]->attr.dtype.vx_type) + { + status = vsi_nn_SetTensorAttr(inputs[3], attr); + if(VSI_SUCCESS != status) + { + return status; + } + } + if(VSI_NN_TYPE_FLOAT32 == inputs[4]->attr.dtype.vx_type) + { + status = vsi_nn_SetTensorAttr(inputs[4], attr); + if(VSI_SUCCESS != status) + { + return status; + } + } + + return status; +} + +static vsi_bool _is_3d_batchnorm + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs + ) +{ + /* + We support 3d batchnorm at version 1.1.12 + */ + if (vsi_nn_compareVersion(self->graph, 1, 1, 12) == -1) + { + return FALSE; + } + else + { + if ( 3 == inputs[0]->attr.dim_num ) + { + return TRUE; + } + else + { + return FALSE; + } + } +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vx_tensor vx_input,vx_output; + status = VSI_FAILURE; + + status = _try_set_high_presision_tensor(inputs); + if(VSI_SUCCESS != status) + { + VSILOGE("Set tensor attr of high presision fail"); + return status; + } + if(_is_3d_batchnorm(self, inputs)) + { + vx_input = self->nn_param.batch_norm.local->reshaped_input->t; + vx_output = self->nn_param.batch_norm.local->reshaped_output->t; + } + else + { + vx_input = inputs[0]->t; + vx_output = outputs[0]->t; + } + + self->n = vxBatchNormalizationLayer( + self->graph->g, + self->nn_param.batch_norm.eps, + inputs[1]->t, + inputs[2]->t, + inputs[3]->t, + inputs[4]->t, + vx_input, + vx_output + ); + if( NULL == self->n ) + { + status = VSI_FAILURE; + } + return status; +} /* op_compute() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + uint32_t dim = 0; + vsi_nn_batcnnorm_lcl_data *local = NULL; + uint32_t shape[VSI_NN_MAX_DIM_NUM]; + char tensor_name[128]; + + dim = inputs[0]->attr.dim_num; + if(_is_3d_batchnorm(self, inputs) == FALSE) + { + return VSI_SUCCESS; + } + + VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + /* + reshape 3d input (xcn) --> 4d input (whcn) + reshape 3d output(xcn) --> 4d output(whcn) + */ + shape[0] = inputs[0]->attr.size[0]; + shape[1] = 1; + shape[2] = inputs[0]->attr.size[1]; + shape[3] = inputs[0]->attr.size[2]; + dim = 4; + local = self->nn_param.batch_norm.local; + if (VSI_NN_OPTIMIZE_BACKWARD == direction) + { + local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim); + } + else + { + local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim); + 
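        /* Commentary (not part of the committed source): the block below tags
         * the reshaped virtual output with a uid-derived reference name so it
         * can be located in graph dumps and logs; failing to set the name is
         * treated as a hard error for this optimize pass. */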
if(local->reshaped_output && local->reshaped_output->t) + { + memset(tensor_name, 0, sizeof(tensor_name)); + snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid); + if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u batchnorm reshaped output name fail", self->uid); + return VSI_FAILURE; + } + } + } + + return VSI_SUCCESS; +} /* op_optimize() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(BATCH_NORM, 5, 1) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_F32, D_BF16) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16) + END_IO_TYPE_DECL(BATCH_NORM) + if (!VALIDATE_OP_IO_TYPES(BATCH_NORM, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_batcnnorm_lcl_data *local = NULL; + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + memcpy( outputs[0]->attr.size, inputs[0]->attr.size, + VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + } + + if(_is_3d_batchnorm(self, inputs)) + { + local = (vsi_nn_batcnnorm_lcl_data *)malloc(sizeof(vsi_nn_batcnnorm_lcl_data)); + if(NULL == local) + { + return VSI_FAILURE; + } + memset(local, 0, sizeof(vsi_nn_batcnnorm_lcl_data)); + self->nn_param.batch_norm.local = local; + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_batch_norm_param *p = &(self->nn_param.batch_norm); + if(p->local) + { + if (p->local->reshaped_input) + { + vsi_nn_ReleaseTensor(&(p->local->reshaped_input)); + p->local->reshaped_input = NULL; + } + if (p->local->reshaped_output) + { + vsi_nn_ReleaseTensor(&(p->local->reshaped_output)); + p->local->reshaped_output = NULL; + } + vsi_nn_safe_free(p->local); + } + vsi_nn_op_common_deinit(self); + return VSI_SUCCESS; +} + + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ BATCH_NORM, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 5, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c new file mode 100644 index 0000000..a945e61 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c @@ -0,0 +1,227 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "utils/vsi_nn_constraint_check.h" + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (5) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + int32_t shapes[4][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + int32_t* shapes_ptr[4] = {NULL}; + int32_t *shapes_in[3] = {NULL}; + size_t rank_in[3] = {0}; + uint32_t new_rank = 0; + vsi_nn_tensor_t* reshape_tensors[6] = { NULL }; + vsi_bool ret = TRUE; + uint32_t i = 0; + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_float32( param, "eps", self->nn_param.batch_norm.eps ); + + rank_in[0] = (size_t)inputs[0]->attr.dim_num; + rank_in[1] = (size_t)inputs[1]->attr.dim_num; + rank_in[2] = (size_t)inputs[3]->attr.dim_num; + shapes_in[0] = (int32_t *)inputs[0]->attr.size; + shapes_in[1] = (int32_t *)inputs[1]->attr.size; + shapes_in[2] = (int32_t *)inputs[3]->attr.size; + for (i = 0; i < 4; i++) + { + shapes_ptr[i] = shapes[i]; + } + + ret = vsi_nn_kernel_optimize_broadcast_shape( + (const int32_t**)shapes_in, (const size_t*)rank_in, 3, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes_ptr, shapes[3], &new_rank); + + if( ret ) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], (uint32_t*)shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + inputs[2], (uint32_t*)shapes[1], new_rank ); + reshape_tensors[3] = vsi_nn_reshape_tensor( self->graph, + inputs[3], (uint32_t*)shapes[2], new_rank ); + reshape_tensors[4] = vsi_nn_reshape_tensor( self->graph, + inputs[4], (uint32_t*)shapes[2], new_rank ); + + reshape_tensors[5] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shapes[3], new_rank ); + } + else + { + reshape_tensors[0] = inputs[0]; + reshape_tensors[1] = inputs[1]; + reshape_tensors[2] = inputs[2]; + reshape_tensors[3] = inputs[3]; + reshape_tensors[4] = inputs[4]; + + reshape_tensors[5] = outputs[0]; + } + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "batchnorm_single", + reshape_tensors, 5, + &reshape_tensors[5], 1, param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + if (ret) + { + for ( i = 0; i < 6; i++) + { + if (reshape_tensors[i]) + { + vsi_nn_ReleaseTensor( &reshape_tensors[i] ); + } + } + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + uint32_t i = 0; + uint32_t j = 0; + uint32_t rank = inputs[0]->attr.dim_num; + + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(BATCHNORM_SINGLE, 5, 1) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + END_IO_TYPE_DECL(BATCHNORM_SINGLE) + if(!VALIDATE_OP_IO_TYPES(BATCHNORM_SINGLE, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + for(i = 0; i < rank; i++) + { + vx_int32 shape0 = inputs[0]->attr.size[i]; + + for ( j = 1; j < self->input.num; j++) + { + uint32_t rank1 = inputs[j]->attr.dim_num; + vx_int32 shape1 = rank1 > i ? inputs[j]->attr.size[i] : 1; + + if(shape0 != shape1 && shape1 != 1) + { + VSILOGE("Invalid broadcast for inputs[%d] size[%u]", j, shape1); + return FALSE; + } + } + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. */ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + memcpy( outputs[0]->attr.size, inputs[0]->attr.size, + inputs[0]->attr.dim_num * sizeof( uint32_t ) ); + } + + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ BATCHNORM_SINGLE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c new file mode 100644 index 0000000..b490e0e --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c @@ -0,0 +1,637 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
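The bidirectional-sequence LSTM that follows is not lowered to a single kernel. Its op_setup() unrolls the recurrence into a sub-graph of internal nodes: the input (and optional auxiliary input) is transposed to time-major if necessary, split into per-time-step slices, pushed through a forward chain and a reversed backward chain of LSTMUNIT_OVXLIB cells that thread the hidden and cell state, and the per-step outputs are concatenated back along the time axis, optionally merging the forward and backward results. A condensed sketch of the forward chain, with a hypothetical lstm_step() standing in for the internal-node plumbing:

```c
/* Condensed sketch of the forward unrolling in op_setup() below.
 * lstm_step() is a hypothetical stand-in for building one LSTMUNIT_OVXLIB
 * internal node; tensor and attribute bookkeeping is omitted. */
h = h_state_init;   /* BI_LSTM_FW_INPUT_H_STATE, or an implicit zero tensor */
c = c_state_init;   /* BI_LSTM_FW_INPUT_C_STATE */
for (t = 0; t < time_step; t++)
{
    lstm_step(x[t], h, c, fw_weights, fw_biases, &out_fw[t], &h, &c);
}
/* out_fw[0..time_step-1] are concatenated along the time axis; the backward
 * chain runs the same loop over x[time_step - 1 - t]. */
```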
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_rnn_helper.h" + +static vsi_bool setup_op_shapes + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_bidirectional_sequence_lstm_param* curr_param = + &self->nn_param.bidirectional_sequence_lstm; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + uint32_t num_units = 0; + uint32_t output_size = 0; + uint32_t batch_size = 0; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + if( curr_param->time_major ) + { + batch_size = inputs[BI_LSTM_INPUT_INPUT]->attr.size[1]; + } + else + { + batch_size = inputs[BI_LSTM_INPUT_INPUT]->attr.size[2]; + } + + num_units = inputs[BI_LSTM_FW_INPUT_WEIGHT_I2F]->attr.size[1]; + output_size = num_units; + + /* create h_state if app doesn't provide them */ + if( !inputs[BI_LSTM_FW_INPUT_H_STATE] ) + { + attr.dim_num = 2; + attr.size[1] = batch_size; + attr.size[0] = output_size; + memcpy(&attr.dtype, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype )); + attr.vtl = FALSE; + attr.is_const = TRUE; + + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + inputs[BI_LSTM_FW_INPUT_H_STATE] = output_tensor->t; + } + + if( !inputs[BI_LSTM_BW_INPUT_H_STATE] ) + { + attr.dim_num = 2; + attr.size[1] = batch_size; + attr.size[0] = output_size; + memcpy(&attr.dtype, &outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype )); + attr.vtl = FALSE; + attr.is_const = TRUE; + + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + inputs[BI_LSTM_BW_INPUT_H_STATE] = output_tensor->t; + } + + /* output */ + if( VSI_NN_DIM_AUTO == outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dim_num ) + { + if( curr_param->merge_outputs ) + { + outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.size[0] = output_size * 2; + outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.size[1] = inputs[BI_LSTM_INPUT_INPUT]->attr.size[1]; + outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.size[2] = inputs[BI_LSTM_INPUT_INPUT]->attr.size[2]; + outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dim_num = 3; + } + else + { + outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.size[0] = output_size; + outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.size[1] = inputs[BI_LSTM_INPUT_INPUT]->attr.size[1]; + outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.size[2] = inputs[BI_LSTM_INPUT_INPUT]->attr.size[2]; + outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dim_num = 3; + + outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.size[0] = output_size; + outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.size[1] = inputs[BI_LSTM_INPUT_INPUT]->attr.size[1]; + outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.size[2] = 
inputs[BI_LSTM_INPUT_INPUT]->attr.size[2]; + outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.dim_num = 3; + } + } + + return TRUE; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_bidirectional_sequence_lstm_param* curr_param = + &self->nn_param.bidirectional_sequence_lstm; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_nn_tensor_t** split_output_tensors = NULL; + vsi_nn_tensor_t** lstmcell_reshape_output_tensors_fw =NULL; + vsi_nn_tensor_t** lstmcell_reshape_output_tensors_bw =NULL; + vsi_nn_tensor_t* last_step_h_state_fw = NULL; + vsi_nn_tensor_t* last_step_h_state_bw = NULL; + vsi_nn_tensor_t* last_step_c_state_fw = NULL; + vsi_nn_tensor_t* last_step_c_state_bw = NULL; + vsi_nn_tensor_t* tensor = NULL; + vsi_nn_tensor_t* input_tensor = NULL; + vsi_nn_tensor_t* aux_input_tensor = NULL; + vsi_nn_tensor_t** aux_split_output_tensors = NULL; + vsi_nn_tensor_t** reshape_output_tensors = NULL; + vsi_nn_tensor_t** aux_reshape_output_tensors = NULL; + vsi_bool has_aux_input = (inputs[BI_LSTM_AUX_INPUT] != NULL); + vsi_bool use_virtual_tensor = TRUE; + uint32_t batch_size = 0; + uint32_t time_step = 0; + uint32_t i = 0; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_node_wksp( self ); + + if( curr_param->time_major ) + { + batch_size = inputs[BI_LSTM_INPUT_INPUT]->attr.size[1]; + time_step = inputs[BI_LSTM_INPUT_INPUT]->attr.size[2]; + } + else + { + batch_size = inputs[BI_LSTM_INPUT_INPUT]->attr.size[2]; + time_step = inputs[BI_LSTM_INPUT_INPUT]->attr.size[1]; + } + + setup_op_shapes( self, inputs, outputs); + + /* default to input */ + input_tensor = inputs[BI_LSTM_INPUT_INPUT]; + if( !curr_param->time_major ) + { + /* transpose to time_major */ + output_tensor = vsi_nn_rnn_transpose_time_major(self, + inputs[BI_LSTM_INPUT_INPUT], NULL, use_virtual_tensor); + input_tensor = output_tensor->t; + } + + /* default to aux input */ + if(has_aux_input) + { + aux_input_tensor = inputs[BI_LSTM_AUX_INPUT]; + if( !curr_param->time_major ) + { + /* transpose to time_major */ + output_tensor = vsi_nn_rnn_transpose_time_major(self, + inputs[BI_LSTM_AUX_INPUT], NULL, use_virtual_tensor); + aux_input_tensor = output_tensor->t; + } + } + + /* split input tensor */ + split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + memset( reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + + vsi_nn_rnn_split_input_tensor(self, input_tensor, + split_output_tensors, time_step, use_virtual_tensor); + + vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + + /* split 
aux input tensor */ + if(has_aux_input) + { + aux_split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + memset( aux_split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + aux_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + memset( aux_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + + vsi_nn_rnn_split_input_tensor(self, aux_input_tensor, + aux_split_output_tensors, time_step, use_virtual_tensor); + + vsi_nn_rnn_data_check_aligned(self, aux_split_output_tensors, time_step, use_virtual_tensor); + } + + /* prepare output tensor */ + lstmcell_reshape_output_tensors_fw = (vsi_nn_tensor_t **)malloc(time_step * + sizeof(vsi_nn_tensor_t **)); + memset( lstmcell_reshape_output_tensors_fw, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + lstmcell_reshape_output_tensors_bw = (vsi_nn_tensor_t **)malloc(time_step * + sizeof(vsi_nn_tensor_t **)); + memset( lstmcell_reshape_output_tensors_bw, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + + for( i = 0; i < time_step; i++ ) + { + /* reshape for split output */ + output_tensor = vsi_nn_rnn_reshape_split_output(self, + split_output_tensors[i], batch_size, use_virtual_tensor); + reshape_output_tensors[i] = output_tensor->t; + + if (has_aux_input) + { + /* reshape for aux split output */ + output_tensor = vsi_nn_rnn_reshape_split_output(self, + aux_split_output_tensors[i], batch_size, use_virtual_tensor); + aux_reshape_output_tensors[i] = output_tensor->t; + } + } + + /* forward lstm op */ + last_step_h_state_fw = inputs[BI_LSTM_FW_INPUT_H_STATE]; + last_step_c_state_fw = inputs[BI_LSTM_FW_INPUT_C_STATE]; + for( i = 0; i < time_step; i++ ) + { + vsi_nn_tensor_t* lstmcell_out0 = NULL; + vsi_nn_tensor_t* lstmcell_out1 = NULL; + vsi_nn_tensor_t* lstmcell_out2 = NULL; + + /* lstmcell output */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + lstmcell_out0 = output_tensor->t; + + /* lstmcell output h_state */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + lstmcell_out1 = output_tensor->t; + + /* lstmcell output c_state */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + lstmcell_out2 = output_tensor->t; + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_OVXLIB, 0, 0 ); + curr->node->nn_param.lstmunit_ovxlib.activation = curr_param->activation; + curr->node->nn_param.lstmunit_ovxlib.cell_clip = curr_param->cell_clip; + curr->node->nn_param.lstmunit_ovxlib.forget_bias = curr_param->forget_bias; + curr->node->nn_param.lstmunit_ovxlib.proj_clip = curr_param->proj_clip; + curr->node->nn_param.lstmunit_ovxlib.recurrent_activation = curr_param->recurrent_activation; + memcpy( curr->node->nn_param.lstm_ovxlib.internal_dtype, + curr_param->internal_dtype, + sizeof(vsi_nn_dtype_t) * LSTMUNIT_QUANTIZE_PARAM_COUNT); + curr->inputs[LSTMUNIT_INPUT_INPUT] = reshape_output_tensors[i]; + curr->inputs[LSTMUNIT_INPUT_H_STATE] = last_step_h_state_fw; + curr->inputs[LSTMUNIT_INPUT_C_STATE] = last_step_c_state_fw; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_I2I] = inputs[BI_LSTM_FW_INPUT_WEIGHT_I2I]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_I2F] = 
inputs[BI_LSTM_FW_INPUT_WEIGHT_I2F]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_I2C] = inputs[BI_LSTM_FW_INPUT_WEIGHT_I2C]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_I2O] = inputs[BI_LSTM_FW_INPUT_WEIGHT_I2O]; + + curr->inputs[LSTMUNIT_INPUT_WEIGHT_R2I] = inputs[BI_LSTM_FW_INPUT_WEIGHT_R2I]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_R2F] = inputs[BI_LSTM_FW_INPUT_WEIGHT_R2F]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_R2C] = inputs[BI_LSTM_FW_INPUT_WEIGHT_R2C]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_R2O] = inputs[BI_LSTM_FW_INPUT_WEIGHT_R2O]; + + curr->inputs[LSTMUNIT_INPUT_WEIGHT_C2I] = inputs[BI_LSTM_FW_INPUT_WEIGHT_C2I]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_C2F] = inputs[BI_LSTM_FW_INPUT_WEIGHT_C2F]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_C2O] = inputs[BI_LSTM_FW_INPUT_WEIGHT_C2O]; + + curr->inputs[LSTMUNIT_INPUT_BIAS_I] = inputs[BI_LSTM_FW_INPUT_BIAS_I]; + curr->inputs[LSTMUNIT_INPUT_BIAS_F] = inputs[BI_LSTM_FW_INPUT_BIAS_F]; + curr->inputs[LSTMUNIT_INPUT_BIAS_C] = inputs[BI_LSTM_FW_INPUT_BIAS_C]; + curr->inputs[LSTMUNIT_INPUT_BIAS_O] = inputs[BI_LSTM_FW_INPUT_BIAS_O]; + + curr->inputs[LSTMUNIT_INPUT_WEIGHT_PROJ] = inputs[BI_LSTM_FW_INPUT_WEIGHT_PROJ]; + curr->inputs[LSTMUNIT_INPUT_BIAS_PROJ] = inputs[BI_LSTM_FW_INPUT_BIAS_PROJ]; + + curr->inputs[LSTMUNIT_INPUT_LAYERNORM_I] = inputs[BI_LSTM_FW_INPUT_LAYERNORM_I]; + curr->inputs[LSTMUNIT_INPUT_LAYERNORM_F] = inputs[BI_LSTM_FW_INPUT_LAYERNORM_F]; + curr->inputs[LSTMUNIT_INPUT_LAYERNORM_C] = inputs[BI_LSTM_FW_INPUT_LAYERNORM_C]; + curr->inputs[LSTMUNIT_INPUT_LAYERNORM_O] = inputs[BI_LSTM_FW_INPUT_LAYERNORM_O]; + + if (has_aux_input) + { + curr->inputs[LSTM_INPUT_AUX_INPUT] = aux_reshape_output_tensors[i]; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2I] = inputs[BI_LSTM_FW_AUX_INPUT_WEIGHT_I2I]; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2F] = inputs[BI_LSTM_FW_AUX_INPUT_WEIGHT_I2F]; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2C] = inputs[BI_LSTM_FW_AUX_INPUT_WEIGHT_I2C]; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2O] = inputs[BI_LSTM_FW_AUX_INPUT_WEIGHT_I2O]; + } + else + { + curr->inputs[LSTM_INPUT_AUX_INPUT] = NULL; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2I] = NULL; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2F] = NULL; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2C] = NULL; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2O] = NULL; + } + + curr->outputs[LSTMUNIT_OUTPUT_OUTPUT] = lstmcell_out0; + curr->outputs[LSTMUNIT_OUTPUT_H_STATE] = lstmcell_out1; + curr->outputs[LSTMUNIT_OUTPUT_C_STATE] = lstmcell_out2; + + vsi_nn_internal_setup_node( self, curr ); + + last_step_h_state_fw = lstmcell_out1; + last_step_c_state_fw = lstmcell_out2; + + /* reshape output to 3-dims */ + output_tensor = vsi_nn_rnn_reshape_cell_output(self, + lstmcell_out0, batch_size, use_virtual_tensor); + lstmcell_reshape_output_tensors_fw[i] = output_tensor->t; + } + + /* backward lstm op */ + last_step_h_state_bw = inputs[BI_LSTM_BW_INPUT_H_STATE]; + last_step_c_state_bw = inputs[BI_LSTM_BW_INPUT_C_STATE]; + for( i = 0; i < time_step; i++ ) + { + vsi_nn_tensor_t* lstmcell_out0 = NULL; + vsi_nn_tensor_t* lstmcell_out1 = NULL; + vsi_nn_tensor_t* lstmcell_out2 = NULL; + + /* lstmcell output */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + lstmcell_out0 = output_tensor->t; + + /* lstmcell output h_state */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + lstmcell_out1 = 
output_tensor->t; + + /* lstmcell output c_state */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + lstmcell_out2 = output_tensor->t; + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_OVXLIB, 0, 0 ); + curr->node->nn_param.lstmunit_ovxlib.activation = curr_param->activation; + curr->node->nn_param.lstmunit_ovxlib.cell_clip = curr_param->cell_clip; + curr->node->nn_param.lstmunit_ovxlib.forget_bias = curr_param->forget_bias; + curr->node->nn_param.lstmunit_ovxlib.proj_clip = curr_param->proj_clip; + curr->node->nn_param.lstmunit_ovxlib.recurrent_activation = curr_param->recurrent_activation; + memcpy( curr->node->nn_param.lstm_ovxlib.internal_dtype, + &(curr_param->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_COUNT]), + sizeof(vsi_nn_dtype_t) * LSTMUNIT_QUANTIZE_PARAM_COUNT); + curr->inputs[LSTMUNIT_INPUT_INPUT] = reshape_output_tensors[time_step - 1 - i]; + curr->inputs[LSTMUNIT_INPUT_H_STATE] = last_step_h_state_bw; + curr->inputs[LSTMUNIT_INPUT_C_STATE] = last_step_c_state_bw; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_I2I] = inputs[BI_LSTM_BW_INPUT_WEIGHT_I2I]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_I2F] = inputs[BI_LSTM_BW_INPUT_WEIGHT_I2F]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_I2C] = inputs[BI_LSTM_BW_INPUT_WEIGHT_I2C]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_I2O] = inputs[BI_LSTM_BW_INPUT_WEIGHT_I2O]; + + curr->inputs[LSTMUNIT_INPUT_WEIGHT_R2I] = inputs[BI_LSTM_BW_INPUT_WEIGHT_R2I]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_R2F] = inputs[BI_LSTM_BW_INPUT_WEIGHT_R2F]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_R2C] = inputs[BI_LSTM_BW_INPUT_WEIGHT_R2C]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_R2O] = inputs[BI_LSTM_BW_INPUT_WEIGHT_R2O]; + + curr->inputs[LSTMUNIT_INPUT_WEIGHT_C2I] = inputs[BI_LSTM_BW_INPUT_WEIGHT_C2I]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_C2F] = inputs[BI_LSTM_BW_INPUT_WEIGHT_C2F]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_C2O] = inputs[BI_LSTM_BW_INPUT_WEIGHT_C2O]; + + curr->inputs[LSTMUNIT_INPUT_BIAS_I] = inputs[BI_LSTM_BW_INPUT_BIAS_I]; + curr->inputs[LSTMUNIT_INPUT_BIAS_F] = inputs[BI_LSTM_BW_INPUT_BIAS_F]; + curr->inputs[LSTMUNIT_INPUT_BIAS_C] = inputs[BI_LSTM_BW_INPUT_BIAS_C]; + curr->inputs[LSTMUNIT_INPUT_BIAS_O] = inputs[BI_LSTM_BW_INPUT_BIAS_O]; + + curr->inputs[LSTMUNIT_INPUT_WEIGHT_PROJ] = inputs[BI_LSTM_BW_INPUT_WEIGHT_PROJ]; + curr->inputs[LSTMUNIT_INPUT_BIAS_PROJ] = inputs[BI_LSTM_BW_INPUT_BIAS_PROJ]; + + curr->inputs[LSTMUNIT_INPUT_LAYERNORM_I] = inputs[BI_LSTM_BW_INPUT_LAYERNORM_I]; + curr->inputs[LSTMUNIT_INPUT_LAYERNORM_F] = inputs[BI_LSTM_BW_INPUT_LAYERNORM_F]; + curr->inputs[LSTMUNIT_INPUT_LAYERNORM_C] = inputs[BI_LSTM_BW_INPUT_LAYERNORM_C]; + curr->inputs[LSTMUNIT_INPUT_LAYERNORM_O] = inputs[BI_LSTM_BW_INPUT_LAYERNORM_O]; + + if (has_aux_input) + { + curr->inputs[LSTM_INPUT_AUX_INPUT] = aux_reshape_output_tensors[time_step - 1 - i]; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2I] = inputs[BI_LSTM_BW_AUX_INPUT_WEIGHT_I2I]; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2F] = inputs[BI_LSTM_BW_AUX_INPUT_WEIGHT_I2F]; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2C] = inputs[BI_LSTM_BW_AUX_INPUT_WEIGHT_I2C]; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2O] = inputs[BI_LSTM_BW_AUX_INPUT_WEIGHT_I2O]; + } + else + { + curr->inputs[LSTM_INPUT_AUX_INPUT] = NULL; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2I] = NULL; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2F] = NULL; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2C] = NULL; + curr->inputs[LSTM_INPUT_AUX_WEIGHT_I2O] = NULL; + } + + 
curr->outputs[LSTMUNIT_OUTPUT_OUTPUT] = lstmcell_out0; + curr->outputs[LSTMUNIT_OUTPUT_H_STATE] = lstmcell_out1; + curr->outputs[LSTMUNIT_OUTPUT_C_STATE] = lstmcell_out2; + + vsi_nn_internal_setup_node( self, curr ); + + last_step_h_state_bw = lstmcell_out1; + last_step_c_state_bw = lstmcell_out2; + + /* reshape output to 3-dims */ + output_tensor = vsi_nn_rnn_reshape_cell_output(self, + lstmcell_out0, batch_size, use_virtual_tensor); + lstmcell_reshape_output_tensors_bw[i] = output_tensor->t; + } + + if(curr_param->merge_outputs) + { + vsi_nn_tensor_t** merge_tensors = NULL; + merge_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + memset( merge_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + + tensor = outputs[BI_LSTM_FW_OUTPUT_OUTPUT]; + if( !curr_param->time_major ) + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + tensor = output_tensor->t; + } + + /* concat fw & bw output, the lstm's output is 3-dims */ + for( i = 0; i < time_step; i++ ) + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 2, 1 ); + curr->node->nn_param.concat.axis = 0; + curr->inputs[0] = lstmcell_reshape_output_tensors_fw[i]; + curr->inputs[1] = lstmcell_reshape_output_tensors_bw[i]; + curr->outputs[0] = output_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + merge_tensors[i] = output_tensor->t; + } + + + /* concat lstmcell output, the lstm's output is 3-dims */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + curr->node->nn_param.concat.axis = 2; + for( i = 0; i < time_step; i++ ) + { + curr->inputs[i] = merge_tensors[i]; + } + curr->outputs[0] = tensor; + vsi_nn_internal_setup_node( self, curr ); + + if( !curr_param->time_major ) + { + /* transpose time_major to batch_major*/ + vsi_nn_rnn_transpose_time_major(self, + tensor, outputs[BI_LSTM_FW_OUTPUT_OUTPUT], use_virtual_tensor); + } + vsi_nn_safe_free( merge_tensors ); + } + else + { + /* forward output*/ + tensor = outputs[BI_LSTM_FW_OUTPUT_OUTPUT]; + if( !curr_param->time_major ) + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + tensor = output_tensor->t; + } + + /* concat lstmcell output, the lstm's output is 3-dims */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + curr->node->nn_param.concat.axis = 2; + for( i = 0; i < time_step; i++ ) + { + curr->inputs[i] = lstmcell_reshape_output_tensors_fw[i]; + } + curr->outputs[0] = tensor; + vsi_nn_internal_setup_node( self, curr ); + + if( !curr_param->time_major ) + { + /* transpose time_major to batch_major*/ + vsi_nn_rnn_transpose_time_major(self, + tensor, outputs[BI_LSTM_FW_OUTPUT_OUTPUT], use_virtual_tensor); + } + + /* backward output*/ + tensor = outputs[BI_LSTM_BW_OUTPUT_OUTPUT]; + if( !curr_param->time_major ) + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + tensor = output_tensor->t; + } + + /* concat lstmcell output, the lstm's output is 3-dims */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 
time_step, 1 ); + curr->node->nn_param.concat.axis = 2; + for( i = 0; i < time_step; i++ ) + { + curr->inputs[i] = lstmcell_reshape_output_tensors_bw[i]; + } + curr->outputs[0] = tensor; + vsi_nn_internal_setup_node( self, curr ); + + if( !curr_param->time_major ) + { + /* transpose time_major to batch_major*/ + vsi_nn_rnn_transpose_time_major(self, + tensor, outputs[BI_LSTM_BW_OUTPUT_OUTPUT], use_virtual_tensor); + } + } + + vsi_nn_safe_free( split_output_tensors ); + vsi_nn_safe_free( aux_split_output_tensors ) + vsi_nn_safe_free( reshape_output_tensors ); + vsi_nn_safe_free( aux_reshape_output_tensors ); + vsi_nn_safe_free( lstmcell_reshape_output_tensors_fw ); + vsi_nn_safe_free( lstmcell_reshape_output_tensors_bw ); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_safe_free(self->nn_param.bidirectional_sequence_lstm.internal_dtype); + vsi_nn_internal_deinit_node_wksp( self ); + + return status; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.bidirectional_sequence_lstm.internal_dtype = (vsi_nn_dtype_t *) + malloc(sizeof(vsi_nn_dtype_t) * LSTMUNIT_QUANTIZE_PARAM_COUNT * 2); + memset(self->nn_param.bidirectional_sequence_lstm.internal_dtype, 0, + sizeof(vsi_nn_dtype_t) * LSTMUNIT_QUANTIZE_PARAM_COUNT * 2); + + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ BIDIRECTIONAL_SEQUENCE_LSTM, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ BI_LSTM_INPUT_CNT, + /* output_num */ BI_LSTM_OUTPUT_CNT + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c new file mode 100644 index 0000000..710acf9 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c @@ -0,0 +1,552 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
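The bidirectional-sequence RNN that follows mirrors the LSTM lowering above, only with plain RNN cells in place of LSTM units. Its setup_op_shapes() also creates the initial hidden-state tensors when the application passes NULL, as in this fragment that mirrors the code below (attribute plumbing abbreviated):

```c
/* Implicit h-state creation, mirroring setup_op_shapes() of the op below. */
memset(&attr, 0, sizeof(attr));
attr.dim_num  = 2;
attr.size[0]  = output_size;   /* num_units taken from the input weight tensor */
attr.size[1]  = batch_size;
memcpy(&attr.dtype, &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, sizeof(attr.dtype));
attr.vtl      = FALSE;
attr.is_const = TRUE;
inputs[BI_RNN_FW_INPUT_H_STATE] =
    vsi_nn_internal_new_tensor(self, &attr, 0.0f)->t;  /* zero-filled constant */
```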
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_rnn_helper.h" + +static vsi_bool setup_op_shapes + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_bidirectional_sequence_rnn_param* curr_param = + &self->nn_param.bidirectional_sequence_rnn; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + uint32_t num_units = 0; + uint32_t output_size = 0; + uint32_t batch_size = 0; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + if( curr_param->time_major ) + { + batch_size = inputs[BI_RNN_INPUT_INPUT]->attr.size[1]; + } + else + { + batch_size = inputs[BI_RNN_INPUT_INPUT]->attr.size[2]; + } + + num_units = inputs[BI_RNN_FW_INPUT_WEIGHT_I]->attr.size[1]; + output_size = num_units; + + /* create h_state if app doesn't provide them */ + if( !inputs[BI_RNN_FW_INPUT_H_STATE] ) + { + attr.dim_num = 2; + attr.size[1] = batch_size; + attr.size[0] = output_size; + memcpy(&attr.dtype, &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype )); + attr.vtl = FALSE; + attr.is_const = TRUE; + + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + inputs[BI_RNN_FW_INPUT_H_STATE] = output_tensor->t; + } + + if( !inputs[BI_RNN_BW_INPUT_H_STATE] ) + { + attr.dim_num = 2; + attr.size[1] = batch_size; + attr.size[0] = output_size; + memcpy(&attr.dtype, &outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype )); + attr.vtl = FALSE; + attr.is_const = TRUE; + + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + inputs[BI_RNN_BW_INPUT_H_STATE] = output_tensor->t; + } + + /* output */ + if( VSI_NN_DIM_AUTO == outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dim_num ) + { + if( curr_param->merge_outputs ) + { + outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.size[0] = output_size * 2; + outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.size[1] = inputs[BI_RNN_INPUT_INPUT]->attr.size[1]; + outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.size[2] = inputs[BI_RNN_INPUT_INPUT]->attr.size[2]; + outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dim_num = 3; + } + else + { + outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.size[0] = output_size; + outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.size[1] = inputs[BI_RNN_INPUT_INPUT]->attr.size[1]; + outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.size[2] = inputs[BI_RNN_INPUT_INPUT]->attr.size[2]; + outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dim_num = 3; + + outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.size[0] = output_size; + outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.size[1] = inputs[BI_RNN_INPUT_INPUT]->attr.size[1]; + outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.size[2] = inputs[BI_RNN_INPUT_INPUT]->attr.size[2]; + outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.dim_num = 3; + } + } + + return TRUE; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
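+              For example, a fuller check could mirror setup_op_shapes() above and
+              verify that the 3-D input, the recurrent weights (whose size[1] gives
+              num_units) and any caller-provided h_state tensors agree on batch_size
+              and output_size before op_setup builds the per-timestep RNNCELL nodes.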
*/ + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_bidirectional_sequence_rnn_param* curr_param = + &self->nn_param.bidirectional_sequence_rnn; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_nn_tensor_t** split_output_tensors = NULL; + vsi_nn_tensor_t** rnncell_reshape_output_tensors_fw =NULL; + vsi_nn_tensor_t** rnncell_reshape_output_tensors_bw =NULL; + vsi_nn_tensor_t* last_step_h_state_fw = NULL; + vsi_nn_tensor_t* last_step_h_state_bw = NULL; + vsi_nn_tensor_t* tensor = NULL; + vsi_nn_tensor_t* input_tensor = NULL; + vsi_nn_tensor_t* aux_input_tensor = NULL; + vsi_nn_tensor_t** aux_split_output_tensors = NULL; + vsi_nn_tensor_t** reshape_output_tensors = NULL; + vsi_nn_tensor_t** aux_reshape_output_tensors = NULL; + vsi_bool has_aux_input = (inputs[BI_RNN_AUX_INPUT] != NULL); + vsi_bool use_virtual_tensor = TRUE; + uint32_t batch_size = 0; + uint32_t time_step = 0; + uint32_t i = 0; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_node_wksp( self ); + + if( curr_param->time_major ) + { + batch_size = inputs[BI_RNN_INPUT_INPUT]->attr.size[1]; + time_step = inputs[BI_RNN_INPUT_INPUT]->attr.size[2]; + } + else + { + batch_size = inputs[BI_RNN_INPUT_INPUT]->attr.size[2]; + time_step = inputs[BI_RNN_INPUT_INPUT]->attr.size[1]; + } + + setup_op_shapes( self, inputs, outputs); + + /* default to input */ + input_tensor = inputs[BI_RNN_INPUT_INPUT]; + if( !curr_param->time_major ) + { + /* transpose to time_major */ + output_tensor = vsi_nn_rnn_transpose_time_major(self, + inputs[BI_RNN_INPUT_INPUT], NULL, use_virtual_tensor); + input_tensor = output_tensor->t; + } + + /* default to aux input */ + if(has_aux_input) + { + aux_input_tensor = inputs[BI_RNN_AUX_INPUT]; + if( !curr_param->time_major ) + { + /* transpose to time_major */ + output_tensor = vsi_nn_rnn_transpose_time_major(self, + inputs[BI_RNN_AUX_INPUT], NULL, use_virtual_tensor); + aux_input_tensor = output_tensor->t; + } + } + + /* split input tensor */ + split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + memset( reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + + vsi_nn_rnn_split_input_tensor(self, input_tensor, + split_output_tensors, time_step, use_virtual_tensor); + + vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + + /* split aux input tensor */ + if(has_aux_input) + { + aux_split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + memset( aux_split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + aux_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + memset( aux_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + + vsi_nn_rnn_split_input_tensor(self, aux_input_tensor, + aux_split_output_tensors, time_step, use_virtual_tensor); + + vsi_nn_rnn_data_check_aligned(self, 
aux_split_output_tensors, time_step, use_virtual_tensor); + } + + /* prepare output tensor */ + rnncell_reshape_output_tensors_fw = (vsi_nn_tensor_t **)malloc(time_step * + sizeof(vsi_nn_tensor_t **)); + memset( rnncell_reshape_output_tensors_fw, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + rnncell_reshape_output_tensors_bw = (vsi_nn_tensor_t **)malloc(time_step * + sizeof(vsi_nn_tensor_t **)); + memset( rnncell_reshape_output_tensors_bw, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + + for( i = 0; i < time_step; i++ ) + { + /* reshape for split output */ + output_tensor = vsi_nn_rnn_reshape_split_output(self, + split_output_tensors[i], batch_size, use_virtual_tensor); + reshape_output_tensors[i] = output_tensor->t; + + if (has_aux_input) + { + /* reshape for aux split output */ + output_tensor = vsi_nn_rnn_reshape_split_output(self, + aux_split_output_tensors[i], batch_size, use_virtual_tensor); + aux_reshape_output_tensors[i] = output_tensor->t; + } + } + + /* forward rnn op */ + last_step_h_state_fw = inputs[BI_RNN_FW_INPUT_H_STATE]; + for( i = 0; i < time_step; i++ ) + { + vsi_nn_tensor_t* rnncell_out0 = NULL; + vsi_nn_tensor_t* rnncell_out1 = NULL; + + /* rnncell output */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + rnncell_out0 = output_tensor->t; + + /* rnncell output h_state */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + rnncell_out1 = output_tensor->t; + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 ); + curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation; + memcpy( curr->node->nn_param.rnncell_ovxlib.internal_dtype, + curr_param->internal_dtype, + sizeof(vsi_nn_dtype_t) * RNNCELL_QUANTIZE_PARAM_COUNT); + curr->inputs[RNNCELL_INPUT_INPUT] = reshape_output_tensors[i]; + curr->inputs[RNNCELL_INPUT_H_STATE] = last_step_h_state_fw; + + curr->inputs[RNNCELL_INPUT_WEIGHT_I] = inputs[BI_RNN_FW_INPUT_WEIGHT_I]; + curr->inputs[RNNCELL_INPUT_WEIGHT_H] = inputs[BI_RNN_FW_INPUT_WEIGHT_H]; + + curr->inputs[RNNCELL_INPUT_BIAS] = inputs[BI_RNN_FW_INPUT_BIAS]; + + if (has_aux_input) + { + curr->inputs[RNNCELL_INPUT_AUX_INPUT] = aux_reshape_output_tensors[i]; + curr->inputs[RNNCELL_INPUT_AUX_WEIGHT] = inputs[BI_RNN_FW_AUX_INPUT_WEIGHT]; + } + else + { + curr->inputs[RNNCELL_INPUT_AUX_INPUT] = NULL; + curr->inputs[RNNCELL_INPUT_AUX_WEIGHT] = NULL; + } + + curr->outputs[RNNCELL_OUTPUT_OUTPUT] = rnncell_out0; + curr->outputs[RNNCELL_OUTPUT_H_STATE] = rnncell_out1; + + vsi_nn_internal_setup_node( self, curr ); + + last_step_h_state_fw = rnncell_out1; + + /* reshape output to 3-dims */ + output_tensor = vsi_nn_rnn_reshape_cell_output(self, + rnncell_out0, batch_size, use_virtual_tensor); + rnncell_reshape_output_tensors_fw[i] = output_tensor->t; + } + + /* backward rnn op */ + last_step_h_state_bw = inputs[BI_RNN_BW_INPUT_H_STATE]; + for( i = 0; i < time_step; i++ ) + { + vsi_nn_tensor_t* rnncell_out0 = NULL; + vsi_nn_tensor_t* rnncell_out1 = NULL; + + /* rnncell output */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + rnncell_out0 = output_tensor->t; + + /* rnncell output h_state */ + vsi_nn_internal_init_tensor_attr(&attr, + 
&outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + rnncell_out1 = output_tensor->t; + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 ); + curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation; + memcpy( curr->node->nn_param.rnncell_ovxlib.internal_dtype, + &(curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_COUNT]), + sizeof(vsi_nn_dtype_t) * RNNCELL_QUANTIZE_PARAM_COUNT); + curr->inputs[RNNCELL_INPUT_INPUT] = reshape_output_tensors[time_step - 1 - i]; + curr->inputs[RNNCELL_INPUT_H_STATE] = last_step_h_state_bw; + + curr->inputs[RNNCELL_INPUT_WEIGHT_I] = inputs[BI_RNN_BW_INPUT_WEIGHT_I]; + curr->inputs[RNNCELL_INPUT_WEIGHT_H] = inputs[BI_RNN_BW_INPUT_WEIGHT_H]; + + curr->inputs[RNNCELL_INPUT_BIAS] = inputs[BI_RNN_BW_INPUT_BIAS]; + + if(has_aux_input) + { + curr->inputs[RNNCELL_INPUT_AUX_INPUT] = aux_reshape_output_tensors[time_step - 1 - i]; + curr->inputs[RNNCELL_INPUT_AUX_WEIGHT] = inputs[BI_RNN_BW_AUX_INPUT_WEIGHT]; + } + else + { + curr->inputs[RNNCELL_INPUT_AUX_INPUT] = NULL; + curr->inputs[RNNCELL_INPUT_AUX_WEIGHT] = NULL; + } + + curr->outputs[RNNCELL_OUTPUT_OUTPUT] = rnncell_out0; + curr->outputs[RNNCELL_OUTPUT_H_STATE] = rnncell_out1; + + vsi_nn_internal_setup_node( self, curr ); + + last_step_h_state_bw = rnncell_out1; + + /* reshape output to 3-dims */ + output_tensor = vsi_nn_rnn_reshape_cell_output(self, + rnncell_out0, batch_size, use_virtual_tensor); + rnncell_reshape_output_tensors_bw[i] = output_tensor->t; + } + + if(curr_param->merge_outputs) + { + vsi_nn_tensor_t** merge_tensors = NULL; + merge_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + memset( merge_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + + tensor = outputs[BI_RNN_FW_OUTPUT_OUTPUT]; + if( !curr_param->time_major ) + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + tensor = output_tensor->t; + } + + /* concat fw & bw output, the rnn's output is 3-dims */ + for( i = 0; i < time_step; i++ ) + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 2, 1 ); + curr->node->nn_param.concat.axis = 0; + curr->inputs[0] = rnncell_reshape_output_tensors_fw[i]; + curr->inputs[1] = rnncell_reshape_output_tensors_bw[i]; + curr->outputs[0] = output_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + merge_tensors[i] = output_tensor->t; + } + + + /* concat rnncell output, the rnn's output is 3-dims */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + curr->node->nn_param.concat.axis = 2; + for( i = 0; i < time_step; i++ ) + { + curr->inputs[i] = merge_tensors[i]; + } + curr->outputs[0] = tensor; + vsi_nn_internal_setup_node( self, curr ); + + if( !curr_param->time_major ) + { + /* transpose time_major to batch_major*/ + vsi_nn_rnn_transpose_time_major(self, + tensor, outputs[BI_RNN_FW_OUTPUT_OUTPUT], use_virtual_tensor); + } + vsi_nn_safe_free( merge_tensors ); + } + else + { + /* forward output*/ + tensor = outputs[BI_RNN_FW_OUTPUT_OUTPUT]; + if( !curr_param->time_major ) + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = 
vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + tensor = output_tensor->t; + } + + /* concat rnncell output, the rnn's output is 3-dims */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + curr->node->nn_param.concat.axis = 2; + for( i = 0; i < time_step; i++ ) + { + curr->inputs[i] = rnncell_reshape_output_tensors_fw[i]; + } + curr->outputs[0] = tensor; + vsi_nn_internal_setup_node( self, curr ); + + if( !curr_param->time_major ) + { + /* transpose time_major to batch_major*/ + vsi_nn_rnn_transpose_time_major(self, + tensor, outputs[BI_RNN_FW_OUTPUT_OUTPUT], use_virtual_tensor); + } + + /* backward output*/ + tensor = outputs[BI_RNN_BW_OUTPUT_OUTPUT]; + if( !curr_param->time_major ) + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + tensor = output_tensor->t; + } + + /* concat rnncell output, the rnn's output is 3-dims */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + curr->node->nn_param.concat.axis = 2; + for( i = 0; i < time_step; i++ ) + { + curr->inputs[i] = rnncell_reshape_output_tensors_bw[i]; + } + curr->outputs[0] = tensor; + vsi_nn_internal_setup_node( self, curr ); + + if( !curr_param->time_major ) + { + /* transpose time_major to batch_major*/ + vsi_nn_rnn_transpose_time_major(self, + tensor, outputs[BI_RNN_BW_OUTPUT_OUTPUT], use_virtual_tensor); + } + } + + vsi_nn_safe_free( split_output_tensors ); + vsi_nn_safe_free( aux_split_output_tensors ) + vsi_nn_safe_free( reshape_output_tensors ); + vsi_nn_safe_free( aux_reshape_output_tensors ); + vsi_nn_safe_free( rnncell_reshape_output_tensors_fw ); + vsi_nn_safe_free( rnncell_reshape_output_tensors_bw ); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_safe_free(self->nn_param.bidirectional_sequence_rnn.internal_dtype); + vsi_nn_internal_deinit_node_wksp( self ); + + return status; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.bidirectional_sequence_rnn.internal_dtype = (vsi_nn_dtype_t *) + malloc(sizeof(vsi_nn_dtype_t) * RNNCELL_QUANTIZE_PARAM_COUNT * 2); + memset(self->nn_param.bidirectional_sequence_rnn.internal_dtype, 0, + sizeof(vsi_nn_dtype_t) * RNNCELL_QUANTIZE_PARAM_COUNT * 2); + + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ BIDIRECTIONAL_SEQUENCE_RNN, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ BI_RNN_INPUT_CNT, + /* output_num */ BI_RNN_OUTPUT_CNT + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c b/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c new file mode 100644 index 0000000..fb6b0e1 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c @@ -0,0 +1,299 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, 
modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" + +#define _ARG_NUM (6) +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (4) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +extern vx_kernel_description_t * vx_kernel_BOX_WITH_NMS_LIMIT_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_box_with_nms_limit_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.box_with_nms_limit); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ + #define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_FLOAT32, score_threshold ); + _SET_PARAM( 1, VX_TYPE_INT32, max_num_bbox ); + _SET_PARAM( 2, VX_TYPE_INT32, nms_kernel_method ); + _SET_PARAM( 3, VX_TYPE_FLOAT32, iou_threshold ); + _SET_PARAM( 4, VX_TYPE_FLOAT32, sigma ); + _SET_PARAM( 5, VX_TYPE_FLOAT32, nms_score_threshold ); + #undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, 
outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + /*TODO: Add code if need to change your parameter*/ + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute, + NULL +}; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_kernel_info_t kernel_info; + char *path = NULL; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + status = VSI_FAILURE; + kernel_info.type = VX_KERNEL_TYPE_CPU; + kernel_info.kernel = vx_kernel_BOX_WITH_NMS_LIMIT_list; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_box_with_nms_limit"; + path = getenv("USER_VX_SOURCE_PATH"); + if(path) + vsi_nn_VxResourceSetPath(path); + + if( kernel_info.type == VX_KERNEL_TYPE_VX) + { + kernel_info.kernel_index = 1; + kernel_info.init_index = 1; + } + else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ + { + kernel_info.kernel_index = 0; + kernel_info.init_index = 0; + } + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) + { + free(kernel_info.resource_name); + } + if( NULL == self->n ) + { + return VSI_FAILURE; + } + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
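+              A fuller check could, for instance, confirm that the three inputs agree
+              on the number of rois and that the second output really holds 4 box
+              coordinates per roi, matching the shapes derived in op_setup below.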
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = 1; + outputs[0]->attr.size[0] = inputs[0]->attr.size[1]; + + outputs[1]->attr.dim_num = 2; + outputs[1]->attr.size[0] = 4; + outputs[1]->attr.size[1] = inputs[0]->attr.size[1]; + + outputs[2]->attr.dim_num = 1; + outputs[2]->attr.size[0] = inputs[0]->attr.size[1]; + + outputs[3]->attr.dim_num = 1; + outputs[3]->attr.size[0] = inputs[0]->attr.size[1]; + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ BOX_WITH_NMS_LIMIT, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c new file mode 100644 index 0000000..86e6ae2 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c @@ -0,0 +1,284 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +/* + Declare number of input and output. 
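+    (CAST is a single-input, single-output op: when input and output share the
+    same dtype/quantization it is lowered to an internal DATACONVERT node in
+    op_setup, otherwise op_compute flattens the element shape and selects the
+    "cast" kernel.)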
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_bool _is_same_quant + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_dtype_t *dtype,*_dtype; + + dtype = &inputs[0]->attr.dtype; + _dtype = &outputs[0]->attr.dtype; + + if (vsi_nn_DtypeCompare(dtype, _dtype) == FALSE) + { + return FALSE; + } + + return TRUE; +} /* _is_same_quant */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + + if ( _is_same_quant(self, inputs, outputs)) + { + + vsi_nn_internal_compute_node( self ); + status = VSI_SUCCESS; + } + else + { + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_rank = 0; + vsi_bool ret; + + if ( NULL == self ) + { + return status; + } + + ret = vsi_nn_kernel_optimize_element_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); + if ( ret ) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shape, new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shape, new_rank ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "cast", + &reshape_tensors[0], 1, + &reshape_tensors[1], 1, NULL ); + + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + } + + if ( self->n ) + { + status = VSI_SUCCESS; + } + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(CAST, 1, 1) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_I32) + IO_TYPE(D_F32, D_U32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_BOOL8) + IO_TYPE(D_I32, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_I32, D_U32) + IO_TYPE(D_I32, D_BOOL8) + IO_TYPE(D_U32, D_F32) + IO_TYPE(D_U32, D_I32) + IO_TYPE(D_U32, D_U32) + IO_TYPE(D_U32, D_BOOL8) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_BOOL8) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16, D_BOOL8) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_BOOL8) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8, D_BOOL8) + IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_I32) + IO_TYPE(D_F16, D_U8) + IO_TYPE(D_F16, D_I8) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BOOL8, D_F16) + IO_TYPE(D_BOOL8, D_BOOL8) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_BF16, D_BF16) + END_IO_TYPE_DECL(CAST) + if(!VALIDATE_OP_IO_TYPES(CAST, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, 
+ vsi_nn_opt_direction_e direction + ) +{ + vsi_status status; + + status = VSI_SUCCESS; + + if ( _is_same_quant(self, inputs, outputs)) + { + vsi_nn_internal_optimize_node( self, direction ); + } + + return status; +} /* op_optimize() */ + + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + + if ( NULL == self ) + { + return FALSE; + } + ret = vsi_nn_op_common_setup(self, inputs, outputs); + + if ( _is_same_quant(self, inputs, outputs) ) + { + vsi_nn_internal_node_t* curr = NULL; + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); + if (NULL == curr) + { + return FALSE; + } + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + + vsi_nn_internal_setup_node(self, curr); + } + + return ret; +} + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_internal_deinit_node_wksp(self); + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_init_node_wksp(self); + return status; +} /* op_init() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CAST, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c new file mode 100644 index 0000000..06898c1 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c @@ -0,0 +1,248 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_internal_node.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + float min_value = self->nn_param.clip.min; + float max_value = self->nn_param.clip.max; + + if ( (min_value == -1.0f && max_value == 1.0f) + || (min_value == 0.0f && max_value == 6.0f) ) + { + status = VSI_SUCCESS; + vsi_nn_internal_compute_node( self ); + } + else + { + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_rank = 0; + vsi_bool ret; + vsi_nn_kernel_param_t * param = NULL; + + param =vsi_nn_kernel_param_create(); + + ret = vsi_nn_kernel_optimize_element_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); + + vsi_nn_kernel_param_add_float32( param, "min_value", min_value ); + vsi_nn_kernel_param_add_float32( param, "max_value", max_value ); + + if( ret ) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shape, new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shape, new_rank ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "clip", + &reshape_tensors[0], 1, + &reshape_tensors[1], 1, param ); + + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + } + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(CLIP, 1, 1) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(CLIP) + if(!VALIDATE_OP_IO_TYPES(CLIP, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + uint32_t i; + float min = self->nn_param.clip.min; + float max = self->nn_param.clip.max; + + for (i = 0; i < _VSI_NN_CLIP_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.clip.local.local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.clip.local.local_tensor[i])); + self->nn_param.clip.local.local_tensor[i] = NULL; + } + } + + if (self->nn_param.clip.local2 != NULL) + { + 
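+        /* local2 is the scratch buffer allocated in op_init(); release it here */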
free(self->nn_param.clip.local2); + self->nn_param.clip.local2 = NULL; + } + + if ( (min == -1.0f && max == 1.0f) + || (min == 0.0f && max == 6.0f) ) + { + vsi_nn_internal_deinit_node_wksp( self ); + } + + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + self->nn_param.clip.local2 = + (vsi_nn_clip_lcl2_data *)malloc(sizeof(vsi_nn_clip_lcl2_data)); + if (NULL == self->nn_param.reduce.local2) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.clip.local2, 0, sizeof(vsi_nn_clip_lcl2_data)); + return status; +} /* op_init() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + vsi_nn_internal_node_t* curr = NULL; + float min = self->nn_param.clip.min; + float max = self->nn_param.clip.max; + + if ( (min == -1.0f && max == 1.0f) + || (min == 0.0f && max == 6.0f) ) + { + vsi_nn_internal_init_node_wksp(self); + if (min == -1.0f && max == 1.0f) + { + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU1, 0, 0); + } + else + { + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0); + } + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + + vsi_nn_internal_setup_node(self, curr); + } + else + { + ret = vsi_nn_op_common_setup(self, inputs, outputs); + } + return ret; +} /* op_init() */ + + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CLIP, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_common.c b/src/tim/vx/internal/src/ops/vsi_nn_op_common.c new file mode 100644 index 0000000..648b2e1 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_common.c @@ -0,0 +1,83 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include + +#include "vsi_nn_platform.h" +#include "vsi_nn_types.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" + +vsi_status vsi_nn_op_common_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + //TODO: assert_always() + return VSI_FAILURE; +} /* op_common_init() */ + +vsi_status vsi_nn_op_common_deinit + ( + vsi_nn_node_t * self + ) +{ + // This node resource will be released in graph process. + // Just release the vx node. + if( NULL != self && NULL != self->n ) + { + vxReleaseNode( &self->n ); + self->n = NULL; + } + return VSI_SUCCESS; +} /* op_common_deinit() */ + +vsi_bool vsi_nn_op_common_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + memcpy( outputs[0]->attr.size, inputs[0]->attr.size, + VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + } + return TRUE; +} /* op_common_setup() */ + +vsi_status vsi_nn_op_common_optimize + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return VSI_SUCCESS; +} /* op_common_optimize() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c new file mode 100644 index 0000000..6ee4172 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c @@ -0,0 +1,525 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_link_list.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static int32_t _get_input_num + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs + ) +{ + int32_t num; + num = (int32_t)(self->input.num - 1); + while( num >= 0 && NULL == inputs[num] ) + { + num --; + } + if( 0 > num ) + { + return -1; + } + + num++; + return num; +} + +static vsi_bool _is_same_quant + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i,num; + vsi_nn_dtype_t *dtype,*_dtype; + + dtype = NULL; + /* check inputs dtype */ + num = _get_input_num(self, inputs); + for(i = 0; i < num; i++) + { + if(NULL == dtype) + { + dtype = &inputs[i]->attr.dtype; + continue; + } + + _dtype = &inputs[i]->attr.dtype; + if(vsi_nn_DtypeCompare(dtype, _dtype) == FALSE) + { + return FALSE; + } + + dtype = _dtype; + } + + /* check outputs dtype */ + _dtype = &outputs[0]->attr.dtype; + if(vsi_nn_DtypeCompare(dtype, _dtype) == FALSE) + { + return FALSE; + } + + return TRUE; +} /* _is_same_quant */ + +static vsi_bool _is_highest_dimension + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + uint32_t axis = self->nn_param.concat.axis; + uint32_t dim = outputs[0]->attr.dim_num; + + /* + If the concat op need to be optimized to tensor view, the memory must be continues. + 1. axis is in the highest dimension + 2. the highest dimension is 1, and axis is in the second highest dimension + */ + if(axis == dim - 1) + { + ret = TRUE; + } + if((outputs[0]->attr.size[dim - 1] == 1) && (axis == dim - 2)) + { + ret = TRUE; + } + return ret; +} /* _is_highest_dimension() */ + +static vsi_status copy_tensor_to_view + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * src_in, + uint32_t axis, + vx_tensor dst_tensor + ) +{ + vsi_status ret; + vsi_nn_concat_lcl_data * data; + + ret = VSI_SUCCESS; + /* Malloc ptr */ + data = (vsi_nn_concat_lcl_data *)malloc( sizeof(vsi_nn_concat_lcl_data) ); + if( NULL == data ) + { + VSILOGE( "Create concat local data fail." 
); + return VSI_FAILURE; + } + memset( data, 0, sizeof(vsi_nn_concat_lcl_data) ); + data->src_tensor = src_in->t; + data->dst_tensor = dst_tensor; + + /* Store node, ptr */ + vsi_nn_LinkListPushStart( + (vsi_nn_link_list_t **)&self->nn_param.concat.lcl_data, + (vsi_nn_link_list_t *)data ); + + return ret; +} /* copy_tensor_to_view() */ + +static vx_node _create_vx_concat + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vx_nn_concat_params_t param; + vx_node node; + int32_t num,i; + vsi_nn_concat_lcl_data *data = NULL; + vx_tensor *tensors = NULL; + vsi_status status = VSI_FAILURE; + vx_enum rank = VX_TENSOR_RANK_WHCN; + + num = _get_input_num(self, inputs); + if(num < 0) + { + return NULL; + } + + tensors = (vx_tensor *)malloc(sizeof(vx_tensor) * num); + if(NULL == tensors) + { + return NULL; + } + + node = NULL; + for(i = 0; i < num; i++) + { + tensors[i] = inputs[i]->t; + status = vxSetTensorAttribute(tensors[i], VX_TENSOR_RANK, &rank, sizeof(vx_enum)); + if(VSI_SUCCESS != status) + { + goto final; + } + } + status = vxSetTensorAttribute(outputs[0]->t, VX_TENSOR_RANK, &rank, sizeof(vx_enum)); + if(VSI_SUCCESS != status) + { + goto final; + } + + data = (vsi_nn_concat_lcl_data *)malloc(sizeof(vsi_nn_concat_lcl_data)); + if(NULL == data) + { + goto final; + } + + memset(data, 0, sizeof(vsi_nn_concat_lcl_data)); + data->array = vxCreateTensorObjectArray(self->graph->ctx->c, + num, + &tensors[0]); + if(NULL == data->array) + { + free(data); + data = NULL; + goto final; + } + param.axis = self->nn_param.concat.axis; + self->nn_param.concat.lcl_data = data; + + node = vxConcatIndefiniteLayer(self->graph->g, + data->array, + ¶m, + sizeof(param), + outputs[0]->t); + +final: + if(tensors) + { + free(tensors); + } + return node; +} /* _create_vx_concat() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_concat_lcl_data * iter; + + status = VSI_SUCCESS; + self->n = NULL; + if(_is_highest_dimension(self, outputs) && _is_same_quant(self, inputs, outputs)) + { + iter = self->nn_param.concat.lcl_data; + while( NULL != iter ) + { + iter->cp_node = vxTensorCopyNode(self->graph->g, + iter->src_tensor, iter->dst_tensor ); + if( NULL == iter->cp_node ) + { + VSILOGE( "Create vxTensorCopyNode fail." 
); + status = VSI_FAILURE; + break; + } + iter = (vsi_nn_concat_lcl_data *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)iter ); + } + } + else + { + self->n = _create_vx_concat(self, inputs, outputs); + if(NULL == self->n) + { + status = VSI_FAILURE; + } + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret; + uint32_t axis,j; + int32_t num,i; + + ret = TRUE; + axis = self->nn_param.concat.axis; + num = _get_input_num(self, inputs); + if(num < 0) + { + return FALSE; + } + for( i = 1; i < num; i ++ ) + { + if( inputs[i]->attr.dim_num != inputs[i - 1]->attr.dim_num ) + { + VSILOGE( "Concat input dims num(%d vs %d)", + inputs[i]->attr.dim_num, + inputs[i - 1]->attr.dim_num + ); + ret = FALSE; + break; + } + if( outputs[0]->attr.dim_num != VSI_NN_DIM_AUTO && + outputs[0]->attr.dim_num != inputs[i]->attr.dim_num ) + { + VSILOGE( "Concat output dims num(%d vs %d)", + outputs[0]->attr.dim_num, + inputs[i]->attr.dim_num + ); + ret = FALSE; + break; + } + for( j = 0; j < inputs[i]->attr.dim_num; j ++ ) + { + if( axis == j ) + { + continue; + } + if( inputs[i]->attr.size[j] != inputs[i - 1]->attr.size[j] ) + { + VSILOGE( "Concat input dims size(%d vs %d)", + inputs[i]->attr.size[j], + inputs[i - 1]->attr.size[j] + ); + ret = FALSE; + break; + } + if( outputs[0]->attr.dim_num != VSI_NN_DIM_AUTO && + outputs[0]->attr.size[j] != inputs[i]->attr.size[j]) + { + VSILOGE( "Concat output dims size(%d vs %d)", + outputs[0]->attr.size[j], + inputs[i]->attr.size[j] + ); + ret = FALSE; + break; + } + } + if( FALSE == ret ) + { + break; + } + } + + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + int32_t num,i; + vsi_bool ret; + uint32_t axis; + + self->nn_param.concat.lcl_data = NULL; + ret = TRUE; + if( VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num ) + { + return ret; + } + + num = _get_input_num(self, inputs); + if(num < 0) + { + return FALSE; + } + axis = self->nn_param.concat.axis; + memcpy( outputs[0]->attr.size, inputs[0]->attr.size, + sizeof( uint32_t ) * VSI_NN_MAX_DIM_NUM ); + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for( i = 1; i < num; i ++ ) + { + outputs[0]->attr.size[axis] += inputs[i]->attr.size[axis]; + } + return ret; +} /* op_setup() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_status status; + int32_t num,i; + uint32_t axis; + vx_tensor in_view_tensor; + uint32_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; + + status = VSI_SUCCESS; + /* we don't create tensor view if the axis is not the highest dimension */ + if (_is_highest_dimension(self, outputs) == FALSE || + _is_same_quant(self, inputs, outputs) == FALSE) + { + return status; + } + /* Only backward run concat's optimize */ + if( direction == VSI_NN_OPTIMIZE_FORWARD ) + { + return status; + } + + VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + num = _get_input_num(self, inputs); + if(num < 0) + { + return VSI_FAILURE; + } + axis = self->nn_param.concat.axis; + + if( NULL == outputs[0]->t ) + { + vsi_nn_TensorReinit( self->graph, outputs[0] ); + } + + /* Create tensor from view */ + memset( start, 0, sizeof( uint32_t ) * VSI_NN_MAX_DIM_NUM ); + memset( end, 0, sizeof( uint32_t ) * VSI_NN_MAX_DIM_NUM ); + end[0] = 
inputs[0]->attr.size[0]; + end[1] = inputs[0]->attr.size[1]; + end[2] = inputs[0]->attr.size[2]; + end[3] = inputs[0]->attr.size[3]; + end[axis] = 0; + for( i = 0; i < num; i++ ) + { + start[axis] = end[axis]; + end[axis] += inputs[i]->attr.size[axis]; + in_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, outputs[0]); + if( NULL == in_view_tensor ) + { + VSILOGE( "Create a tensor view fail."); + status = VSI_FAILURE; + break; + } + + if( NULL != inputs[i]->t ) + { + VSILOGI( "Concat copy %d tensor.", i ); + // Copy old tensor values to the new address. + status = copy_tensor_to_view( self, inputs[i], axis, in_view_tensor ); + if( VSI_FAILURE == status ) + { + break; + } + } + else + { + inputs[i]->t = in_view_tensor; + } + } + + return status; +} /* op_optimize() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_concat_lcl_data * data; + vsi_nn_concat_lcl_data * tmp; + + if(NULL == self) + { + return VSI_FAILURE; + } + + data = self->nn_param.concat.lcl_data; + if(self->n) + { + if( NULL != self && NULL != self->n ) + { + if(data && data->array) + { + vxReleaseObjectArray(&data->array); + free(data); + data = NULL; + } + vxReleaseNode( &self->n ); + self->n = NULL; + } + } + else + { + while( NULL != data ) + { + tmp = (vsi_nn_concat_lcl_data *)vsi_nn_LinkListPopStart( + (vsi_nn_link_list_t **)&data ); + vxReleaseNode( &tmp->cp_node ); + vxReleaseTensor( &tmp->dst_tensor ); + free( tmp ); + } + } + + return VSI_SUCCESS; +} + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +// TODO: Fix the concat input num. +DEF_OP_REG + ( + /* op_name */ CONCAT, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 16, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_concatshift.c b/src/tim/vx/internal/src/ops/vsi_nn_op_concatshift.c new file mode 100644 index 0000000..880d2d2 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concatshift.c @@ -0,0 +1,225 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vx_node cp_node = NULL; + + /* call CONCAT's op_compute */ + status = vsi_nn_OpCompute( VSI_NN_OP_CONCAT, self, inputs, outputs ); + + if( VSI_SUCCESS == status ) + { + cp_node = vxTensorCopyNode(self->graph->g, + self->nn_param.concatshift.lcl_data->src_tensor, + outputs[1]->t ); + + if( NULL != cp_node ) + { + self->nn_param.concatshift.lcl_data->cp_node = cp_node; + } + else + { + VSILOGE( "Create vxTensorCopyNode fail." ); + status = VSI_FAILURE; + } + } + + return status; +} /* op_compute() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_FAILURE; + + /* call CONCAT's op_deinit */ + status = vsi_nn_OpDeinit( VSI_NN_OP_CONCAT, self ); + + if( NULL != self->nn_param.concatshift.lcl_data ) + { + vxReleaseNode( &self->nn_param.concatshift.lcl_data->cp_node ); + vxReleaseTensor( &self->nn_param.concatshift.lcl_data->src_tensor ); + free( self->nn_param.concatshift.lcl_data ); + self->nn_param.concatshift.lcl_data = NULL; + } + + return status; +} /* op_deinit() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + /* call CONCAT's op_check */ + ret = vsi_nn_OpCheck( VSI_NN_OP_CONCAT, self, inputs, outputs ); + + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + vsi_nn_concatshift_lcl_data * data = NULL; + + /* call CONCAT's op_setup */ + ret = vsi_nn_OpSetup( VSI_NN_OP_CONCAT, self, inputs, outputs ); + + if( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) + { + outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; + memcpy( &outputs[1]->attr.size, &outputs[0]->attr.size, sizeof(outputs[0]->attr.size) ); + + outputs[1]->attr.size[self->nn_param.concatshift.axis] = self->nn_param.concatshift.keep_size; + } + + data = ( vsi_nn_concatshift_lcl_data *)malloc(sizeof(vsi_nn_concatshift_lcl_data) ); + if( NULL != data ) + { + memset( data, 0x00, sizeof(vsi_nn_concatshift_lcl_data) ); + self->nn_param.concatshift.lcl_data = data; + } + else + { + ret = VSI_FAILURE; + } + + return ret; +} /* op_setup() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_status status = VSI_SUCCESS; + uint32_t axis; + vx_tensor out_view_tensor; + uint32_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t i = 0; + uint32_t keep_size = 0; + + VSILOGD("Optimize %s", vsi_nn_OpGetName(self->op)); + vsi_nn_OpOptimize(VSI_NN_OP_CONCAT, self, inputs, outputs, direction); + + if(direction == VSI_NN_OPTIMIZE_BACKWARD) + { + return VSI_SUCCESS; + } + + if( NULL == outputs[0]->t ) + { + vsi_nn_TensorReinit( self->graph, outputs[0] ); + } + if( NULL == outputs[1]->t ) + { + vsi_nn_TensorReinit( 
self->graph, outputs[1] ); + } + + axis = self->nn_param.concatshift.axis; + keep_size = self->nn_param.concatshift.keep_size; + for( i = 0; i < outputs[0]->attr.dim_num; i++ ) + { + if( i == axis ) + { + start[i] = outputs[0]->attr.size[i] - keep_size; + } + else + { + start[i] = 0; + } + + end[i] = outputs[0]->attr.size[i]; + } + + out_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, outputs[0]); + if( out_view_tensor != NULL ) + { + self->nn_param.concatshift.lcl_data->src_tensor = out_view_tensor; + } + else + { + VSILOGE( "Create tensor %d from view fail.", i ); + status = VSI_FAILURE; + } + + return status; +} /* op_optimize() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CONCATSHIFT, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 16, + /* output_num */ 2 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c new file mode 100644 index 0000000..2d6f510 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c @@ -0,0 +1,184 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "stride", self->nn_param.conv1d.stride ); + vsi_nn_kernel_param_add_int32( param, "pad_front", self->nn_param.conv1d.pad[0] ); + vsi_nn_kernel_param_add_int32( param, "pad_end", self->nn_param.conv1d.pad[1] ); + vsi_nn_kernel_param_add_int32( param, "dilation", self->nn_param.conv1d.dilation); + vsi_nn_kernel_param_add_int32( param, "overflow_policy", self->vx_param.overflow_policy ); + vsi_nn_kernel_param_add_int32( param, "rounding_policy", self->vx_param.rounding_policy ); + vsi_nn_kernel_param_add_int32( param, + "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "conv1d", + inputs, 3, outputs, 1, param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + vsi_nn_kernel_param_release( ¶m ); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + BEGIN_IO_TYPE_DECL(CONV1D, 3, 1) + IO_TYPE(D_F16, D_F16, D_NONE, D_F16) + IO_TYPE(D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_BF16) + IO_TYPE(D_F32, D_F32, D_NONE, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) + IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) + END_IO_TYPE_DECL(CONV1D) + if (!VALIDATE_OP_IO_TYPES(CONV1D, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; 
+ } + + /* Check fl and scale*/ + ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); + + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_conv1d_param* p = &self->nn_param.conv1d; + +#ifdef VX_CONVERT_POLICY_WRAP_ENABLE + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } +#endif + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + inputs[1]->attr.size[0], + p->pad, + p->stride, + p->dilation, + VSI_NN_ROUND_FLOOR + ); + + outputs[0]->attr.size[1] = inputs[1]->attr.size[2]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.dim_num = 3; + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + return vsi_nn_op_common_deinit(self); +} + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CONV1D, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 3, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c new file mode 100644 index 0000000..fe9c4a3 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c @@ -0,0 +1,288 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vx_tensor bias; + vsi_status status; + vx_nn_convolution_params_ext_t *p_ext = NULL; + vx_nn_convolution_params_ext2_t *p_ext2 = NULL; + vx_nn_convolution_params_ext2_t param_ext2; + memset( ¶m_ext2, 0, sizeof( vx_nn_convolution_params_ext2_t ) ); + p_ext2 = ¶m_ext2; + p_ext = &p_ext2->ext; + + status = VSI_FAILURE; + + //set ext relative parameters + p_ext->khr.padding_x = self->nn_param.conv2d.pad[0]; + p_ext->khr.padding_y = self->nn_param.conv2d.pad[2]; + if (self->nn_param.conv2d.dilation[0] > 0) + { + p_ext->khr.dilation_x = self->nn_param.conv2d.dilation[0] - 1; + } + if (self->nn_param.conv2d.dilation[1] > 0) + { + p_ext->khr.dilation_y = self->nn_param.conv2d.dilation[1] - 1; + } + p_ext->khr.overflow_policy = self->vx_param.overflow_policy; + p_ext->khr.rounding_policy = self->vx_param.rounding_policy; + p_ext->khr.down_scale_size_rounding = self->vx_param.down_scale_size_rounding; + + p_ext->padding_x_right = self->nn_param.conv2d.pad[1]; + p_ext->padding_y_bottom = self->nn_param.conv2d.pad[3]; + + //set ext2 relative parameters + p_ext2->depth_multiplier = self->nn_param.conv2d.multiplier; + p_ext2->stride_x = self->nn_param.conv2d.stride[0]; + p_ext2->stride_y = self->nn_param.conv2d.stride[1]; + + if( inputs[2] == NULL ) + { + bias = NULL; + } + else + { + bias = inputs[2]->t; + } + + self->n = vxConvolutionLayer( + self->graph->g, + inputs[0]->t, + inputs[1]->t, + bias, + (vx_nn_convolution_params_t *)p_ext2, + sizeof( vx_nn_convolution_params_ext2_t ), + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + /* Check fl and scale*/ + ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); + + if(ret) { + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(CONV2D, 3, 1) + /* IO_TYPE(INPUT, WEIGHT, BIAS, OUTPUT) */ + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + + IO_TYPE(D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_F32, D_F16) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F16) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_ASYM, D_U8|Q_ASYM) + + IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) + IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I8|Q_SYM) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + + /* IO_TYPE(INPUT, WEIGHT, NULL, OUTPUT) */ + IO_TYPE(D_F32, D_F32, D_NONE, D_F32) + + 
IO_TYPE(D_F16, D_F16, D_NONE, D_F16) + IO_TYPE(D_F16, D_F16, D_NONE, D_F16) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F16) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F16) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + + IO_TYPE(D_BF16, D_BF16, D_NONE, D_F32) + IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_I8|Q_SYM) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + + /* HW 9.0 */ + IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) + IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) + END_IO_TYPE_DECL(CONV2D) + ret = VALIDATE_OP_IO_TYPES(CONV2D, self, inputs, self->input.num, outputs, self->output.num); + if(!ret) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + /* check parameters */ + if(inputs[1]->attr.size[0] * inputs[1]->attr.size[1] > 6400) { + VSILOGE("Kernel size should <= 6400."); + return FALSE; + } + } + + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_conv2d_param *nn_param; + uint32_t perm[] = { 3, 2, 0, 1 }; + + /* TODO: Driver should handle this, + * Check transpose + * */ + if( VSI_NN_DIM_FMT_NHWC == inputs[1]->attr.dtype.fmt && + VSI_NN_TYPE_VDATA != inputs[1]->attr.dtype.vx_type ) + { + vsi_nn_TransposeTensor( self->graph, inputs[1], perm, 4, NULL ); + inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW; + } + +#ifdef VX_CONVERT_POLICY_WRAP_ENABLE + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } +#endif + + nn_param = &self->nn_param.conv2d; + vsi_nn_compute_padding( + inputs[0]->attr.size, + inputs[1]->attr.size, + self->nn_param.conv2d.stride, + self->nn_param.conv2d.dilation, + self->nn_param.conv2d.pad_type, + self->nn_param.conv2d.pad + ); + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + inputs[1]->attr.size[0], + &nn_param->pad[0], + nn_param->stride[0], + nn_param->dilation[0], + VSI_NN_ROUND_FLOOR + ); + outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[1], + inputs[1]->attr.size[1], + &nn_param->pad[2], + nn_param->stride[1], + nn_param->dilation[1], + VSI_NN_ROUND_FLOOR + ); + if(self->nn_param.conv2d.weights > 0) + { + outputs[0]->attr.size[2] = self->nn_param.conv2d.weights; + } + else if(self->nn_param.conv2d.multiplier > 0) + { + outputs[0]->attr.size[2] = inputs[0]->attr.size[2] * self->nn_param.conv2d.multiplier; + } + else + { + outputs[0]->attr.size[2] = inputs[1]->attr.size[3]; + } + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CONV2D, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 3, + /* output_num */ 1 + ); +#ifdef __cplusplus 
+} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c new file mode 100644 index 0000000..680af0a --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c @@ -0,0 +1,312 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "ops/vsi_nn_op_conv_relu.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vx_nn_convolution_relu_pooling_params_ext2_t p; + status = VSI_FAILURE; + + if(vsi_nn_InitConvReluPoolParameter(self, &p, FALSE) != VSI_SUCCESS) + { + VSILOGE("SetConvReluParameter fail\n"); + return VSI_FAILURE; + } + + self->n = vxConvolutionReluPoolingLayer2( + self->graph->g, + inputs[0]->t, + inputs[1]->wb, + (vx_nn_convolution_relu_pooling_params_t *)&p, + sizeof(p), + outputs[0]->t + ); + + vsi_nn_DeinitConvReluPoolParameter( &p ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + BEGIN_IO_TYPE_DECL(CONV_RELU, 3, 1) + IO_TYPE(D_F16, D_F16, D_NONE, D_F16) + IO_TYPE(D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, 
D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) + IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_BF16) + IO_TYPE(D_F32, D_F32, D_NONE, D_F32) + END_IO_TYPE_DECL(CONV_RELU) + if (!VALIDATE_OP_IO_TYPES(CONV_RELU, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + /* Check fl and scale*/ + ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); + + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + +#ifdef VX_CONVERT_POLICY_WRAP_ENABLE + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } +#endif + + ret = vsi_nn_OpSetup( VSI_NN_OP_CONV2D, self, inputs, outputs ); + + return ret; +} /* op_setup() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_status status; + vx_nn_convolution_relu_pooling_params_ext2_t p; + vx_weights_biases_parameter_optimizations_t opt; + vx_weights_biases_parameter_optimizations_t * p_opt; + + status = VSI_SUCCESS; + + if(direction == VSI_NN_OPTIMIZE_BACKWARD) + { + return VSI_SUCCESS; + } + + VSILOGD("Optimize %s", vsi_nn_OpGetName(self->op)); + /* Prepare weight_bias */ + if(inputs[1]->wb == NULL) + { + if(vsi_nn_InitConvReluPoolParameter(self, &p, FALSE) != VSI_SUCCESS) + { + VSILOGE("SetConvReluParameter fail\n"); + return VSI_FAILURE; + } + + p_opt = NULL; + if( outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC + || inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + memset( &opt, 0, sizeof( opt ) ); + opt.inputZeroPoint = inputs[0]->attr.dtype.zero_point; + opt.zrl = -1; + opt.outputFormat = outputs[0]->attr.dtype.vx_type; + p_opt = &opt; + } + + inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( + VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, + 4, + inputs[0]->attr.size, + outputs[0]->attr.size, + outputs[0]->attr.size, + outputs[0]->attr.dtype.vx_type, + (vx_nn_convolution_relu_pooling_params_t *)&p, + sizeof(p), + p_opt, + inputs[1]->t, inputs[2]->t + ); + vsi_nn_DeinitConvReluPoolParameter( &p ); + } + + if( NULL == inputs[1]->wb ) + { + VSILOGE( "Create weight bias fail." 
); + status = VSI_FAILURE; + } + + return status; +} /* op_optimize() */ + +vsi_status vsi_nn_InitConvReluPoolParameter + ( + vsi_nn_node_t * node, + vx_nn_convolution_relu_pooling_params_ext2_t * param_ext2, + vsi_bool has_pool + ) +{ + int32_t pad_const_val; + vx_scalar pad_const; + vx_nn_convolution_relu_pooling_params_t *param; + vx_nn_convolution_relu_pooling_params_ext_t *param_ext; + + pad_const_val = 0; + pad_const = NULL; + param = NULL; + + if( NULL == node || NULL == param_ext2 ) + { + VSILOGE("Set param fail\n"); + return VSI_FAILURE; + } + memset( param_ext2, 0, sizeof( vx_nn_convolution_relu_pooling_params_ext2_t ) ); + param_ext = ¶m_ext2->ext; + param = ¶m_ext->base; + + pad_const = vxCreateScalar( node->graph->ctx->c, VX_TYPE_INT32, &pad_const_val ); + if( NULL == pad_const ) + { + VSILOGE("Create scalar fail\n"); + return VSI_FAILURE; + } + + if( node->nn_param.conv2d.dilation[0] > 0 ) + { + param->dilation_x = node->nn_param.conv2d.dilation[0] - 1; + } + if( node->nn_param.conv2d.dilation[1] > 0 ) + { + param->dilation_y = node->nn_param.conv2d.dilation[1] - 1; + } + param->pad_x_left = node->nn_param.conv2d.pad[0]; + param->pad_x_right = node->nn_param.conv2d.pad[1]; + param->pad_y_top = node->nn_param.conv2d.pad[2]; + param->pad_y_bottom = node->nn_param.conv2d.pad[3]; + param->accumulator_bits = (vx_uint8)node->vx_param.accumulator_bits; + param->overflow_policy = node->vx_param.overflow_policy; + param->rounding_policy = node->vx_param.rounding_policy; + param->down_scale_size_rounding = node->vx_param.down_scale_size_rounding; + param->enable_relu = (vx_bool)node->vx_param.has_relu; + param->pad_mode = VX_PAD_CONSTANT; + param->pad_const = pad_const; + if( TRUE == has_pool ) + { + param->pool_type = node->nn_param.pool.type; + param->pool_size_x = node->nn_param.pool.ksize[0]; + param->pool_size_y = node->nn_param.pool.ksize[1]; + } + param_ext->stride_x = node->nn_param.conv2d.stride[0]; + param_ext->stride_y = node->nn_param.conv2d.stride[1]; + + param_ext2->depth_multiplier = node->nn_param.conv2d.multiplier; + + return VSI_SUCCESS; +} /* vsi_nn_InitConvReluPoolParameter() */ + +void vsi_nn_DeinitConvReluPoolParameter + ( + vx_nn_convolution_relu_pooling_params_ext2_t * param + ) +{ + if( NULL != param ) + { + if( NULL != param->ext.base.pad_const ) + { + vxReleaseScalar( ¶m->ext.base.pad_const ); + } + } +} /* vsi_nn_DeinitConvReluPoolParameter() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CONV_RELU, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 3, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c new file mode 100644 index 0000000..2c56d23 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c @@ -0,0 +1,265 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is 
furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "ops/vsi_nn_op_conv_relu.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vx_nn_convolution_relu_pooling_params_ext2_t p; + status = VSI_FAILURE; + + if(vsi_nn_InitConvReluPoolParameter(self, &p, TRUE) != VSI_SUCCESS) + { + VSILOGE("SetConvReluPoolParameter fail\n"); + return VSI_FAILURE; + } + + self->n = vxConvolutionReluPoolingLayer2( + self->graph->g, + inputs[0]->t, + inputs[1]->wb, + (vx_nn_convolution_relu_pooling_params_t *)&p, + sizeof(p), + outputs[0]->t + ); + + vsi_nn_DeinitConvReluPoolParameter( &p ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + BEGIN_IO_TYPE_DECL(CONV_RELU_POOL, 3, 1) + IO_TYPE(D_F16, D_F16, D_NONE, D_F16) + IO_TYPE(D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) + IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_BF16) + IO_TYPE(D_F32, D_F32, D_NONE, D_F32) + 
END_IO_TYPE_DECL(CONV_RELU_POOL) + if (!VALIDATE_OP_IO_TYPES(CONV_RELU_POOL, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + /* Check fl and scale*/ + ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); + + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret; + +#ifdef VX_CONVERT_POLICY_WRAP_ENABLE + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } +#endif + + ret = TRUE; + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + ret = vsi_nn_OpSetup( VSI_NN_OP_CONV2D, self, inputs, outputs ); + if(ret == FALSE) + { + VSILOGE("OpSetup [VSI_NN_OP_CONV2D] fail\n"); + return FALSE; + } + + ret = vsi_nn_OpSetup( VSI_NN_OP_POOL, self, outputs, outputs ); + if(ret == FALSE) + { + VSILOGE("OpSetup [VSI_NN_OP_POOL] fail\n"); + return FALSE; + } + } + + return ret; +} /* op_setup() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_status status; + vsi_bool ret; + vsi_nn_tensor_t conv_out, *pconv_out; + vx_nn_convolution_relu_pooling_params_ext2_t p; + vx_weights_biases_parameter_optimizations_t opt; + vx_weights_biases_parameter_optimizations_t * p_opt; + + ret = FALSE; + status = VSI_FAILURE; + + if(direction == VSI_NN_OPTIMIZE_BACKWARD) + { + return VSI_SUCCESS; + } + + VSILOGD("Optimize %s", vsi_nn_OpGetName(self->op)); + memset(&conv_out, 0, sizeof(vsi_nn_tensor_t)); + pconv_out = &conv_out; + + ret = vsi_nn_OpSetup( VSI_NN_OP_CONV2D, self, inputs, &pconv_out ); + if(ret == FALSE) + { + VSILOGE("OpSetup [VSI_NN_OP_CONV2D] fail\n"); + goto final; + } + + /* Prepare weight_bias */ + if(inputs[1]->wb == NULL) + { + if(vsi_nn_InitConvReluPoolParameter(self, &p, TRUE) != VSI_SUCCESS) + { + VSILOGE("SetConvReluPoolParameter fail\n"); + goto final; + } + + p_opt = NULL; + if( outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC + || inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + memset( &opt, 0, sizeof( opt ) ); + opt.inputZeroPoint = inputs[0]->attr.dtype.zero_point; + opt.zrl = -1; + opt.outputFormat = outputs[0]->attr.dtype.vx_type; + p_opt = &opt; + } + + inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( + VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, + 4, + inputs[0]->attr.size, + pconv_out->attr.size, + outputs[0]->attr.size, + outputs[0]->attr.dtype.vx_type, + (vx_nn_convolution_relu_pooling_params_t *)&p, + sizeof(p), + p_opt, + inputs[1]->t, inputs[2]->t + ); + vsi_nn_DeinitConvReluPoolParameter( &p ); + } + + if( NULL == inputs[1]->wb ) + { + VSILOGE( "Create weight bias fail." 
); + } + else + { + status = VSI_SUCCESS; + } + +final: + return status; +} /* op_optimize() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CONV_RELU_POOL, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 3, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c new file mode 100644 index 0000000..04757b5 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c @@ -0,0 +1,467 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (3) +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +#define USE_OVX_API TRUE + +#if (USE_OVX_API == FALSE) +extern vx_kernel_description_t * vx_kernel_CROP_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_crop_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.crop); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ +#define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_INT32, offset[0] ); + _SET_PARAM( 1, VX_TYPE_INT32, offset[1] ); + _SET_PARAM( 2, VX_TYPE_INT32, offset[2] ); + +#undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status vx_op_pre_init + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_kernel_info_t * kernel_info + ) +{ + vsi_nn_type_e dataFormat = inputs[0]->attr.dtype.vx_type; + vsi_nn_type_e dstFormat = outputs[0]->attr.dtype.vx_type; + + if (dataFormat == VSI_NN_TYPE_FLOAT16 + || (dataFormat == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_INT16)) + { + kernel_info->kernel_index = 1; + } + else if(dataFormat == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_FLOAT16) + { + kernel_info->kernel_index = 3; + } + else + { + kernel_info->kernel_index = 2; + } + + return VSI_SUCCESS; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_border_t border; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + border.mode = VX_BORDER_REPLICATE; + border.constant_value.U32 = 0; + status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute, + NULL +}; +#endif + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; +#if (USE_OVX_API == TRUE) + vx_nn_stride_slice_params_t param; + vsi_nn_tensor_t *begin_dims_tensor = NULL; + vsi_nn_tensor_t *end_dims_tensor = NULL; + vsi_nn_tensor_t *stride_dims_tensor = NULL; + vsi_nn_tensor_attr_t attr; + int32_t start[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t end[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t stride[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t i; + + memset(¶m, 0, sizeof(vx_nn_stride_slice_params_t)); + + for (i = 0; i < self->nn_param.crop.dims; i++) + { + start[i] = self->nn_param.crop.offset[i]; + end[i] = self->nn_param.crop.offset[i] + outputs[0]->attr.size[i]; + stride[i] = 1; + } + + for (i = self->nn_param.crop.dims; i < inputs[0]->attr.dim_num; i++) + { + start[i] = 0; + end[i] = outputs[0]->attr.size[i]; + stride[i] = 1; + } + + memset(&attr, 0, sizeof(attr)); + attr.size[0] = inputs[0]->attr.dim_num; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + begin_dims_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)start, + &attr); + if( NULL == begin_dims_tensor ) + { + VSILOGE("Create begin_dims_tensor fail.(crop)"); + return VSI_FAILURE; + } + + end_dims_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)end, + &attr); + if( NULL == end_dims_tensor ) + { + VSILOGE("Create end_dims_tensor fail.(crop)"); + status = VSI_FAILURE; + goto OnError; + } + + stride_dims_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)stride, + &attr); + if( NULL == stride_dims_tensor ) + { + VSILOGE("Create stride_dims_tensor fail.(crop)"); + status = VSI_FAILURE; + goto OnError; + } + + param.begin_dims = REQUIRED_IO(begin_dims_tensor); + 
param.end_dims = REQUIRED_IO(end_dims_tensor); + param.stride_dims = REQUIRED_IO(stride_dims_tensor); + + self->n = vxTensorStrideSliceNode( + self->graph->g, + inputs[0]->t, + ¶m, + sizeof(vx_nn_stride_slice_params_t), + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } +#else + vsi_nn_kernel_info_t kernel_info; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + status = VSI_FAILURE; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_crop"; + kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); + kernel_info.kernel = vx_kernel_CROP_list; + kernel_info.init_index = 1; + + if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) + { + vx_op_pre_init(self, inputs, outputs, &kernel_info); + } + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) free(kernel_info.resource_name); + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } +#endif +OnError: + if (begin_dims_tensor) vsi_nn_ReleaseTensor(&begin_dims_tensor); + if (end_dims_tensor) vsi_nn_ReleaseTensor(&end_dims_tensor); + if (stride_dims_tensor) vsi_nn_ReleaseTensor(&stride_dims_tensor); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(CROP, 1, 1) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + END_IO_TYPE_DECL(CROP) + if (!VALIDATE_OP_IO_TYPES(CROP, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_crop_param * p; + int32_t i; + p = (vsi_nn_crop_param *)&(self->nn_param.crop); + if (p->axis >= (int32_t)inputs[0]->attr.dim_num) + { + VSILOGE("Invalid parameter: axis!\n"); + return FALSE; + } + + if( VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num ) + { + return TRUE; + } + + if (p->dims + p->axis == inputs[0]->attr.dim_num) + { + for(i = 0; i < p->axis; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + for(i = p->axis; i < (int32_t)inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; + } + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + else + { + if (p->dims == 1) + { + for(i = 0; i <= p->axis; i++) + { + outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; + p->offset[i] = p->offset[0]; + } + for(i = p->axis + 1; i < (int32_t)inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + else + { + VSILOGE("Invalid parameter: offset dims!\n"); + return FALSE; + } + } + + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { 
+#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CROP, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 2, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c new file mode 100644 index 0000000..8d8de45 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -0,0 +1,255 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + + if (self->nn_param.dataconvert.lcl_data->use_reshape == FALSE + && inputs[0]->t != NULL && outputs[0]->t != NULL) + { + self->n = vxTensorCopyNode( + self->graph->g, + inputs[0]->t, + outputs[0]->t + ); + + if(NULL == self->n) + { + VSILOGE( "Create vxTensorCopyNode fail." 
); + return VSI_FAILURE; + } + } + + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool _is_same_quant + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_dtype_t *dtype,*_dtype; + + dtype = &inputs[0]->attr.dtype; + _dtype = &outputs[0]->attr.dtype; + + if(vsi_nn_DtypeCompare(dtype, _dtype) == FALSE) + { + return FALSE; + } + + return TRUE; +} /* _is_same_quant */ + + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_status status; + + status = VSI_SUCCESS; + + if ( _is_same_quant(self, inputs, outputs) == FALSE || + (inputs[0]->t != NULL && outputs[0]->t != NULL)) + { + return status; + } + + VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + if( direction == VSI_NN_OPTIMIZE_FORWARD ) + { + if(NULL == inputs[0]->t && NULL != outputs[0]->t) + { + inputs[0]->t = vxReshapeTensor(outputs[0]->t, + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num); + if( inputs[0]->t == NULL ) + { + VSILOGE("Call vxReshapeTensor fail"); + return VSI_FAILURE; + } + self->nn_param.dataconvert.lcl_data->use_reshape = TRUE; + } + } + else + { + if(NULL == outputs[0]->t && NULL != inputs[0]->t) + { + outputs[0]->t = vxReshapeTensor(inputs[0]->t, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num); + if( outputs[0]->t == NULL ) + { + VSILOGE("Call vxReshapeTensor fail"); + return VSI_FAILURE; + } + self->nn_param.dataconvert.lcl_data->use_reshape = TRUE; + } + } + + return status; +} /* op_optimize() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.dataconvert.lcl_data = + (vsi_nn_dataconvert_lcl_data *)malloc(sizeof(vsi_nn_dataconvert_lcl_data)); + if (NULL == self->nn_param.dataconvert.lcl_data) + { + return VX_ERROR_NO_MEMORY; + } + + memset( self->nn_param.dataconvert.lcl_data, 0, sizeof(vsi_nn_dataconvert_lcl_data) ); + + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if(self->nn_param.dataconvert.lcl_data) + { + + free(self->nn_param.dataconvert.lcl_data); + self->nn_param.dataconvert.lcl_data = NULL; + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(DATACONVERT, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_I32) + IO_TYPE(D_F16, D_BF16) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_I32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_F32) + 
IO_TYPE(D_BOOL8, D_BOOL8) + IO_TYPE(D_BOOL8, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_I16|Q_DFP) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_I32, D_I16|Q_DFP) + END_IO_TYPE_DECL(DATACONVERT) + if (!VALIDATE_OP_IO_TYPES(DATACONVERT, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ DATACONVERT, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ op_optimize, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c new file mode 100644 index 0000000..d86b715 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c @@ -0,0 +1,338 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_constraint_check.h" + +#define COMPUTE_DECONV_SZ( in, ksize, pad_1, pad_2, stride, output_padding )\ + (( in - 1 ) * stride + ksize - pad_1 - pad_2 + output_padding) +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vx_nn_deconvolution_params_ext2_t param; + vsi_nn_tensor_t *permute_tensor = NULL; +#ifdef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS + vsi_nn_tensor_t *reverse_tensor = NULL; +#endif + vsi_nn_tensor_t *weight_tensor = NULL; + + status = VSI_FAILURE; +#ifdef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS + if (FALSE == inputs[1]->attr.is_const) + { + vsi_nn_tensor_t *tmp_in_tensor = NULL; + vx_nn_tensor_reverse_params_t para; + vx_int32 axis_reverse[4] = {0, 1, 0, 0}; + vsi_nn_tensor_attr_t attr_reverse; + + if (vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1) + { + uint32_t perm[] = { 0, 1, 3, 2 }; + vsi_nn_tensor_attr_t attr; + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + memcpy( &attr, &inputs[1]->attr, sizeof(vsi_nn_tensor_attr_t) ); + attr.size[2] = inputs[1]->attr.size[3]; + attr.size[3] = inputs[1]->attr.size[2]; + permute_tensor = vsi_nn_CreateTensor(self->graph, &attr); + self->n = vxTensorPermuteNode( self->graph->g, inputs[1]->t, + permute_tensor->t, perm, 4); + if( NULL == self->n ) + { + status = VSI_FAILURE; + goto final; + } + tmp_in_tensor = permute_tensor; + } + else + { + tmp_in_tensor = inputs[1]; + } + + memset(&attr_reverse, 0, sizeof(vsi_nn_tensor_attr_t)); + memcpy(&attr_reverse, &tmp_in_tensor->attr, sizeof(vsi_nn_tensor_attr_t) ); + reverse_tensor = vsi_nn_CreateTensor(self->graph, &attr_reverse); + para.axis = axis_reverse; + para.numberOfAxis = 2; + + self->n = vxTensorReverse( self->graph->g, tmp_in_tensor->t, ¶, + sizeof(vx_nn_tensor_reverse_params_t), reverse_tensor->t ); + if( NULL == self->n ) + { + status = VSI_FAILURE; + goto final; + } + + weight_tensor = reverse_tensor; + } + else + { + weight_tensor = inputs[1]; + } + +#else + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 && FALSE == inputs[1]->attr.is_const) + { + uint32_t perm[] = { 0, 1, 3, 2 }; + vsi_nn_tensor_attr_t attr; + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + memcpy( &attr, &inputs[1]->attr, sizeof(vsi_nn_tensor_attr_t) ); + attr.size[2] = inputs[1]->attr.size[3]; + attr.size[3] = inputs[1]->attr.size[2]; + permute_tensor = vsi_nn_CreateTensor(self->graph, &attr); + self->n = vxTensorPermuteNode( self->graph->g, inputs[1]->t, + permute_tensor->t, perm, 4); + if( NULL == self->n ) + { + status = VSI_FAILURE; + goto final; + } + weight_tensor = permute_tensor; + } + else + { + weight_tensor = inputs[1]; + } +#endif + // param.a_x = self->nn_param.deconv.dilation; + // param.a_y = self->nn_param.deconv.dilation; + param.ext.khr.a_x = 1; + param.ext.khr.a_y = 1; + param.ext.khr.padding_x = (size_t)self->nn_param.deconv.pad[0]; + param.ext.khr.padding_y = (size_t)self->nn_param.deconv.pad[2]; + param.ext.khr.overflow_policy = self->vx_param.overflow_policy; + param.ext.khr.rounding_policy = self->vx_param.rounding_policy; + param.ext.padding_x_right = 
(size_t)self->nn_param.deconv.pad[1]; + param.ext.padding_y_bottom = (size_t)self->nn_param.deconv.pad[3]; + param.ext.channel_group = self->nn_param.deconv.group; + param.stride_x = self->nn_param.deconv.stride[0]; + param.stride_y = self->nn_param.deconv.stride[1]; + //param.border_mode; + //param.border_const; + + self->n = vxDeconvolutionLayer( + self->graph->g, + inputs[0]->t, + weight_tensor->t, + (NULL == inputs[2]) ? NULL : inputs[2]->t, + (vx_nn_deconvolution_params_t *)¶m, + sizeof( vx_nn_deconvolution_params_ext2_t ), + outputs[0]->t + ); + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + +final: + if (permute_tensor) + { + vsi_nn_ReleaseTensor(&permute_tensor); + } +#ifdef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS + if (reverse_tensor) + { + vsi_nn_ReleaseTensor(&reverse_tensor); + } +#endif + return status; + +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(DECONVOLUTION, 3, 1) + IO_TYPE(D_F16, D_F16, D_NONE, D_F16) + IO_TYPE(D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_BF16) + IO_TYPE(D_F32, D_F32, D_NONE, D_F32) + + /* HW 9.0 */ + IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) + IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) + END_IO_TYPE_DECL(DECONVOLUTION) + if (!VALIDATE_OP_IO_TYPES(DECONVOLUTION, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_deconv_param *nn_param; + uint32_t perm[] = { 3, 2, 0, 1 }; + uint32_t perm1[] = { 0, 1, 3, 2 }; + + /* TODO: Driver should handle this, + * Check transpose + * TODO: remove this + * */ + if( VSI_NN_DIM_FMT_NHWC == inputs[1]->attr.dtype.fmt ) + { + vsi_nn_TransposeTensor( self->graph, inputs[1], perm, 4, NULL ); + inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW; + } + +#ifdef 
VX_CONVERT_POLICY_WRAP_ENABLE + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } +#endif + +#ifdef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 && TRUE == inputs[1]->attr.is_const) + { + /* whnc->whcn */ + vsi_nn_PermuteTensor( self->graph, inputs[1], perm1, 4 ); + } + /* Rotate 180 degrees for weights data */ + if (TRUE == inputs[1]->attr.is_const) + { + vsi_nn_reshuffle_weight_data(self->graph, inputs[1]); + } +#else + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) >= 0 && TRUE == inputs[1]->attr.is_const) + { + /* whcn->whnc */ + vsi_nn_PermuteTensor( self->graph, inputs[1], perm1, 4 ); + } +#endif + + nn_param = &self->nn_param.deconv; + + nn_param->group = ( 0 == nn_param->group ) ? 1 : nn_param->group; + + nn_param->ksize[0] = inputs[1]->attr.size[0]; + nn_param->ksize[1] = inputs[1]->attr.size[1]; + + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[0] = COMPUTE_DECONV_SZ( + inputs[0]->attr.size[0], + nn_param->ksize[0], + nn_param->pad[0], + nn_param->pad[1], + nn_param->stride[0], + nn_param->output_padding[0] + ); + + outputs[0]->attr.size[1] = COMPUTE_DECONV_SZ( + inputs[0]->attr.size[1], + nn_param->ksize[1], + nn_param->pad[2], + nn_param->pad[3], + nn_param->stride[1], + nn_param->output_padding[1] + ); + + if(self->nn_param.deconv.weights > 0) + { + outputs[0]->attr.size[2] = self->nn_param.deconv.weights; + } + else + { + outputs[0]->attr.size[2] = inputs[1]->attr.size[3]; + } + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ DECONVOLUTION, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 3, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c new file mode 100644 index 0000000..cf8b2a7 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c @@ -0,0 +1,197 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_log.h" +#include "kernel/vsi_nn_kernel.h" + +#define COMPUTE_DECONV_SZ( in, ksize, pad_1, pad_2, stride, output_padding )\ + (( in - 1 ) * stride + ksize - pad_1 - pad_2 + output_padding) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t perm[] = { 0, 1, 3, 2 }; + vsi_nn_tensor_attr_t weight_attr; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_tensor_t* weight_tensor = NULL; + vsi_nn_tensor_t* new_inputs[3] = {NULL}; + + memcpy(&weight_attr, &(inputs[1]->attr), sizeof(vsi_nn_tensor_attr_t)); + weight_attr.size[3] = weight_attr.size[2]; + weight_attr.size[2] = weight_attr.size[1]; + weight_attr.size[1] = 1; + weight_attr.dim_num = 4; + weight_tensor = vsi_nn_CreateTensor( self->graph, &weight_attr ); + vsi_nn_ReshapeTensor( self->graph, inputs[1], weight_tensor, weight_attr.size, 4 ); + +#ifdef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 && TRUE == weight_tensor->attr.is_const ) + { + /* whnc->whcn */ + vsi_nn_PermuteTensor( self->graph, weight_tensor, perm, 4 ); + } + + /* Rotate 180 degrees for weights data */ + if ( TRUE == weight_tensor->attr.is_const ) + { + vsi_nn_reshuffle_weight_data( self->graph, weight_tensor ); + } +#else + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) >= 0 && TRUE == weight_tensor->attr.is_const) + { + /* whcn->whnc */ + vsi_nn_PermuteTensor( self->graph, weight_tensor, perm, 4 ); + } +#endif + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "stride", self->nn_param.deconvolution1d.stride ); + vsi_nn_kernel_param_add_int32( param, "pad_front", self->nn_param.deconvolution1d.pad[0] ); + vsi_nn_kernel_param_add_int32( param, "pad_end", self->nn_param.deconvolution1d.pad[1] ); + vsi_nn_kernel_param_add_int32( param, "group", self->nn_param.deconvolution1d.group ); + vsi_nn_kernel_param_add_int32( param, "overflow_policy", self->vx_param.overflow_policy ); + vsi_nn_kernel_param_add_int32( param, "rounding_policy", self->vx_param.rounding_policy ); + vsi_nn_kernel_param_add_int32( param, + "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + + new_inputs[0] = inputs[0]; + new_inputs[1] = weight_tensor; + new_inputs[2] = inputs[2]; + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "deconvolution1d", + new_inputs, 3, outputs, 1, param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + vsi_nn_kernel_param_release( ¶m ); + + if ( weight_tensor ) + { + vsi_nn_ReleaseTensor( &weight_tensor ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + //TODO: Check tensor shapes. 
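+    /* Sketch only (assumed layout, not verified against the driver): a real
+     * check here could confirm input is 3-D [W, C, N], weight is 3-D
+     * [kW, Cin, Cout] and bias, when present, is 1-D, and reject other
+     * datatype combinations the way DECONVOLUTION's IO table does. */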
+ return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_deconvolution1d_param *nn_param = &self->nn_param.deconvolution1d; + +#ifdef VX_CONVERT_POLICY_WRAP_ENABLE + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } +#endif + + nn_param->group = ( 0 == nn_param->group ) ? 1 : nn_param->group; + nn_param->ksize = inputs[1]->attr.size[0]; + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[0] = COMPUTE_DECONV_SZ( + inputs[0]->attr.size[0], + nn_param->ksize, + nn_param->pad[0], + nn_param->pad[1], + nn_param->stride, + nn_param->output_padding + ); + + if( nn_param->weights > 0 ) + { + outputs[0]->attr.size[1] = nn_param->weights; + } + else + { + outputs[0]->attr.size[1] = inputs[1]->attr.size[3]; + } + outputs[0]->attr.size[1] = nn_param->weights; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_op_common_deinit( self ); + return VSI_SUCCESS; +} + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ DECONVOLUTION1D, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 3, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c new file mode 100644 index 0000000..53c41ec --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c @@ -0,0 +1,255 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_math.h" + +static vsi_status vsi_nn_depth2space_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_tensor_t *block_size_tensor = NULL; + vx_nn_reorg_params_t param; + + status = VSI_FAILURE; + memset(¶m, 0, sizeof(vx_nn_reorg_params_t)); + + block_size_tensor = vsi_nn_VariableToTensor(self, + (uint8_t *)&self->nn_param.depth2space.block_size, + VSI_NN_TYPE_INT32); + if( NULL == block_size_tensor ) + { + VSILOGE("Create block_size_tensor fail.(depth2space)"); + return VSI_FAILURE; + } + self->nn_param.depth2space.local.block_size_tensor = block_size_tensor; + param.block_size = REQUIRED_IO(block_size_tensor); + param.type = VX_REORG_DEPTH_TO_SPACE; + + self->n = vxReorgLayer2( self->graph->g, + inputs[0]->t, + ¶m, + sizeof(vx_nn_reorg_params_t), + outputs[0]->t); + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* vsi_nn_depth2space_compute() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + + if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_DCR) + { + status = vsi_nn_depth2space_compute(self, inputs, outputs); + } + else if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD) + { + status = vsi_nn_internal_compute_node( self ); + } + else + { + VSILOGE("Unknown depth2space mode.(depth2space)"); + return status; + } + + + return status; +} /* op_compute() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD) + { + return vsi_nn_internal_optimize_node(self, direction ); + } + else + { + return VSI_SUCCESS; + } +} /* op_optimize() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if(self->nn_param.depth2space.block_size < 0) + { + VSILOGE("Block size can't be less than zero in depth to space"); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static void op_set_depth2space_param_value(vsi_nn_nn_param_t *nn_param, + vsi_nn_op_t type_name, + vsi_nn_depth2space_mode_e mode, + vx_uint32 block_size + ) +{ + if (type_name == VSI_NN_OP_DEPTH2SPACE_INTERNAL) + { + nn_param->depth2space_internal.block_size = block_size; + nn_param->depth2space_internal.mode = mode; + } +} + +static vsi_bool op_set_depth2space_internal + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_op_t type_name + ) +{ + vsi_bool retn = TRUE; + vsi_nn_internal_node_t* curr = NULL; + + vsi_nn_internal_init_node_wksp( self ); + + curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + op_set_depth2space_param_value(&(curr->node->nn_param), type_name, + self->nn_param.depth2space.mode, self->nn_param.depth2space.block_size); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + retn = vsi_nn_internal_setup_node(self, curr); + + return retn; +} + 
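+/* Shape relationship implemented by op_setup below: for block size b the
+ * output is [W*b, H*b, C/(b*b), N]. For example, a 4x4x16 input with b = 2
+ * becomes an 8x8x4 output. DCR and CRD produce the same output shape and
+ * differ only in how channels are ordered inside each spatial block. */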
+static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + if (vsi_nn_compareVersion(self->graph, 1, 1, 22) == -1) + { + self->nn_param.depth2space.mode = VSI_NN_DEPTH2SPACE_DCR; + } + + return status; +} /* op_init() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + uint32_t size = node->nn_param.depth2space.block_size; + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = inputs[0]->attr.size[0] * size; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1] * size; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2] / (size * size); + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + + if (node->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD) + { + ret = op_set_depth2space_internal(node, inputs, outputs, VSI_NN_OP_DEPTH2SPACE_INTERNAL); + } + return ret; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.depth2space.local.block_size_tensor != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.depth2space.local.block_size_tensor)); + } + + if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD) + { + vsi_nn_internal_deinit_node_wksp(self); + } + else + { + vsi_nn_op_common_deinit(self); + } + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ DEPTH2SPACE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c new file mode 100644 index 0000000..d9de8b9 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c @@ -0,0 +1,157 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + //int32_t mode = self->nn_param.depth2space_internal.mode; + int32_t block_size = self->nn_param.depth2space_internal.block_size; + + if( NULL == self ) + { + return VSI_FAILURE; + } + + param =vsi_nn_kernel_param_create(); + + // Add params + vsi_nn_kernel_param_add_int32( param, "block_size", block_size ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "depth2space_internal", inputs, 1, outputs, 1, param ); + + if( self->n != NULL ) + { + status = VSI_SUCCESS; + } + + if(param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + return status; +} /* op_compute() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t size = self->nn_param.depth2space_internal.block_size; + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = inputs[0]->attr.size[0] * size; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1] * size; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2] / (size * size); + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + + return TRUE; +} /* op_setup() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(DEPTH2SPACE_INTERNAL, 1, 1) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(DEPTH2SPACE_INTERNAL) + if(!VALIDATE_OP_IO_TYPES(DEPTH2SPACE_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + + +#ifdef __cplusplus +extern "C" { +#endif +DEF_OP_REG + ( + /* op_name */ DEPTH2SPACE_INTERNAL, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c new file mode 100644 index 0000000..5cbb9d6 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c @@ -0,0 +1,146 @@ 
+/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +/* + Declare number of input and output. + */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "stride", + self->nn_param.depthwise_conv1d.stride ); + vsi_nn_kernel_param_add_int32( param, "pad_front", + self->nn_param.depthwise_conv1d.pad[0] ); + vsi_nn_kernel_param_add_int32( param, "pad_end", + self->nn_param.depthwise_conv1d.pad[1] ); + vsi_nn_kernel_param_add_int32( param, "dilation", + self->nn_param.depthwise_conv1d.dilation ); + vsi_nn_kernel_param_add_int32( param, "multiplier", + self->nn_param.depthwise_conv1d.multiplier ); + vsi_nn_kernel_param_add_int32( param, "overflow_policy", + self->vx_param.overflow_policy ); + vsi_nn_kernel_param_add_int32( param, "rounding_policy", + self->vx_param.rounding_policy ); + vsi_nn_kernel_param_add_int32( param, "down_scale_size_rounding", + self->vx_param.down_scale_size_rounding ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "depthwise_conv1d", + inputs, 3, outputs, 1, param ); + if( self->n ) + { + status = VSI_SUCCESS; + } + vsi_nn_kernel_param_release( ¶m ); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_CONV2D, self, inputs, outputs); + + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_depthwise_conv1d_param* p = &self->nn_param.depthwise_conv1d; + +#ifdef VX_CONVERT_POLICY_WRAP_ENABLE + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } +#endif + + if( VSI_NN_DIM_AUTO == 
outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + inputs[1]->attr.size[0], + p->pad, + p->stride, + p->dilation, + VSI_NN_ROUND_FLOOR + ); + + outputs[0]->attr.size[1] = inputs[1]->attr.size[2]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ DEPTHWISE_CONV1D, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_detection_postprocess.c b/src/tim/vx/internal/src/ops/vsi_nn_op_detection_postprocess.c new file mode 100644 index 0000000..726c672 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_detection_postprocess.c @@ -0,0 +1,193 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (4) +#define _BOX_INPUT_NUM (2) +#define _BOX_OUTPUT_NUM (1) +#define _NMS_INPUT_NUM (2) +#define _NMS_OUTPUT_NUM (4) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param0 = NULL; + vsi_nn_kernel_param_t * param1 = NULL; + vsi_nn_tensor_t* box_tensors[3] = { NULL }; + vsi_nn_tensor_t* nms_tensors[6] = { NULL }; + vsi_nn_tensor_t* bbox_tensor = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_detection_postprocess_param * p = &(self->nn_param.detection_postprocess); + float inv_scale_y, inv_scale_x, inv_scale_h, inv_scale_w; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + attr.size[0] = 4; + attr.size[1] = inputs[0]->attr.size[1]; + attr.size[2] = inputs[0]->attr.size[2]; + attr.dim_num = 3; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + bbox_tensor = vsi_nn_CreateTensor( self->graph, &attr ); + + inv_scale_y = 1.0f / p->dy; + inv_scale_x = 1.0f / p->dx; + inv_scale_h = 1.0f / p->dh; + inv_scale_w = 1.0f / p->dw; + + if (bbox_tensor) + { + param0 = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_float32( param0, "inv_scale_y", inv_scale_y); + vsi_nn_kernel_param_add_float32( param0, "inv_scale_x", inv_scale_x); + vsi_nn_kernel_param_add_float32( param0, "inv_scale_h", inv_scale_h); + vsi_nn_kernel_param_add_float32( param0, "inv_scale_w", inv_scale_w); + box_tensors[0] = inputs[1]; + box_tensors[1] = inputs[2]; + box_tensors[2] = bbox_tensor; + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "detect_post_box", + &box_tensors[0], _BOX_INPUT_NUM, + &box_tensors[2], _BOX_OUTPUT_NUM, param0 ); + + param1 =vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param1, "nms_type", p->nms_type); + vsi_nn_kernel_param_add_int32( param1, "max_num_detections", p->max_num_detections); + vsi_nn_kernel_param_add_int32( param1, "maximum_class_per_detection", p->maximum_class_per_detection); + vsi_nn_kernel_param_add_int32( param1, "maximum_detection_per_class", p->maximum_detection_per_class); + vsi_nn_kernel_param_add_float32( param1, "score_threshold", p->score_threshold); + vsi_nn_kernel_param_add_float32( param1, "iou_threshold", p->iou_threshold); + vsi_nn_kernel_param_add_int32( param1, "is_bg_in_label", p->is_bg_in_label); + nms_tensors[0] = inputs[0]; + nms_tensors[1] = bbox_tensor; + nms_tensors[2] = outputs[0]; + nms_tensors[3] = outputs[1]; + nms_tensors[4] = outputs[2]; + nms_tensors[5] = outputs[3]; + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "detect_post_nms", + &nms_tensors[0], _NMS_INPUT_NUM, + &nms_tensors[2], _NMS_OUTPUT_NUM, param1 ); + vsi_nn_ReleaseTensor( &bbox_tensor ); + vsi_nn_kernel_param_release( ¶m0 ); + vsi_nn_kernel_param_release( ¶m1 ); + } + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) 
+{ + BEGIN_IO_TYPE_DECL(DETECTION_POSTPROCESS, 3, 1) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + END_IO_TYPE_DECL(DETECTION_POSTPROCESS) + if (!VALIDATE_OP_IO_TYPES(DETECTION_POSTPROCESS, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + vsi_nn_detection_postprocess_param * p; + p = &(self->nn_param.detection_postprocess); + + outputs[0]->attr.dim_num = 2; + outputs[0]->attr.size[0] = p->max_num_detections; + outputs[0]->attr.size[1] = inputs[0]->attr.size[2]; + + outputs[1]->attr.dim_num = 3; + outputs[1]->attr.size[0] = 4; + outputs[1]->attr.size[1] = p->max_num_detections; + outputs[1]->attr.size[2] = inputs[0]->attr.size[2]; + + outputs[2]->attr.dim_num = 2; + outputs[2]->attr.size[0] = p->max_num_detections; + outputs[2]->attr.size[1] = inputs[0]->attr.size[2]; + + outputs[3]->attr.dim_num = 1; + outputs[3]->attr.size[0] = inputs[0]->attr.size[2]; + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ DETECTION_POSTPROCESS, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c new file mode 100644 index 0000000..ea76c4b --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c @@ -0,0 +1,143 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_dtype_util.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_internal_deinit_node_wksp( self ); + + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(DROPOUT, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + END_IO_TYPE_DECL(DROPOUT) + if (!VALIDATE_OP_IO_TYPES(DROPOUT, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + vsi_nn_internal_node_t* curr = NULL; + + vsi_nn_internal_init_node_wksp(self); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LINEAR, 0, 0); + curr->node->nn_param.linear.a = self->nn_param.dropout.ratio; + curr->node->nn_param.linear.b = 0; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + + vsi_nn_internal_setup_node(self, curr); + + return ret; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ DROPOUT, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c new file mode 100644 index 0000000..ea69316 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -0,0 +1,583 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to 
the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status _eltwise_op_compute + ( + const char * kernel_name, + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t new_rank = 0; + vsi_bool ret = TRUE; + vx_bool doShapeOptimized = TRUE; + vsi_nn_kernel_param_t * param = NULL; + + if( NULL == self ) + { + return VSI_FAILURE; + } + status = VSI_FAILURE; + + if ( strcmp(kernel_name, "sub") == 0 + || strcmp(kernel_name, "add") == 0 + || strcmp(kernel_name, "mul") == 0 ) + { + doShapeOptimized = FALSE; + + reshape_tensors[0] = inputs[0]; + reshape_tensors[1] = inputs[1]; + reshape_tensors[2] = outputs[0]; + } + + // TODO: This optimzie is a hack for gpu path, + // it should be moved to gpu kernel setup. 
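+    // Rough sketch of the intent (not verified against the kernel code):
+    // vsi_nn_kernel_optimize_eltwise_shape is expected to collapse adjacent
+    // dimensions that broadcast identically so the GPU kernel sees the lowest
+    // possible rank, e.g. two [2, 3, 4] inputs could be flattened to a single
+    // [24] dimension. The collapsed sizes land in shapes[0..2] and the
+    // collapsed rank in new_rank; add/sub/mul skip this path and pass the
+    // original tensors through unchanged.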
+ if (doShapeOptimized) + { + ret = vsi_nn_kernel_optimize_eltwise_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + (int32_t *)inputs[1]->attr.size, inputs[1]->attr.dim_num, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + } + + if( ret ) + { + // Add params + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_float32( param, "scale", self->nn_param.multiply.scale ); + vsi_nn_kernel_param_add_int32( param, "overflow_policy", self->vx_param.overflow_policy ); + vsi_nn_kernel_param_add_int32( param, "rounding_policy", self->vx_param.rounding_policy ); + + if (doShapeOptimized) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], (uint32_t*)shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shapes[2], new_rank ); + } + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, + &reshape_tensors[0], 2, + &reshape_tensors[2], 1, param ); + + if (doShapeOptimized) + { + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + vsi_nn_ReleaseTensor( &reshape_tensors[2] ); + } + + vsi_nn_kernel_param_release( ¶m ); + } + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* _eltwise_op_compute() */ + +vsi_bool vsi_nn_op_eltwise_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i, j, out_rank, in2_rank; + uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_bool ret = TRUE; + + out_rank = inputs[0]->attr.dim_num; + for ( i = 1; i < self->input.num; i++) + { + in2_rank = inputs[i]->attr.dim_num; + out_rank = vsi_nn_max( out_rank, in2_rank ); + } + + for(i = 0; i < out_rank; i++) + { + uint32_t sz0, sz1; + + sz0 = i < inputs[0]->attr.dim_num ? inputs[0]->attr.size[i] : 1; + for ( j = 1; j < self->input.num; j++) + { + sz1 = i < inputs[j]->attr.dim_num ? inputs[j]->attr.size[i] : 1; + sz0 = vsi_nn_max( sz0, sz1 ); + if (sz0 != sz1 && sz0 != 1 && sz1 != 1) + { + /* Two dimensions are compatible when: + 1. they are equal, or + 2. 
one of them is 1*/ + VSILOGE("Input size mismatch."); + return FALSE; + } + } + shape[i] = sz0; + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = out_rank; + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(uint32_t) ); + } + else + { + uint32_t total_size_got; + uint32_t total_size_expected; + total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); + total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, + outputs[0]->attr.dim_num ); + if( total_size_expected != total_size_got ) + { + VSILOGW("Output size mismatch, expect %d, but got %d", + total_size_expected, total_size_got); + ret = FALSE; + } + } + + return ret; +} /* vsi_nn_op_eltwise_setup() */ + + +static vsi_bool op_check_minimum + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(MINIMUM, 2, 1) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + END_IO_TYPE_DECL(MINIMUM) + if(!VALIDATE_OP_IO_TYPES(MINIMUM, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_check_maximum + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(MAXIMUM, 2, 1) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + END_IO_TYPE_DECL(MAXIMUM) + if(!VALIDATE_OP_IO_TYPES(MAXIMUM, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_check_pow + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(POW, 2, 1) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, 
D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F32, D_F32, D_F32) + END_IO_TYPE_DECL(POW) + if(!VALIDATE_OP_IO_TYPES(POW, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + + +static vsi_bool op_check_add + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(ADD, 2, 1) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_F32) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_I8, D_I8) + IO_TYPE(D_I8, D_I8, D_U8) + IO_TYPE(D_I8, D_I8, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I32, D_I8|Q_DFP) + END_IO_TYPE_DECL(ADD) + if(!VALIDATE_OP_IO_TYPES(ADD, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_check_sub + ( + vsi_nn_node_t * self, + 
vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(SUBTRACT, 2, 1) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_F32) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + END_IO_TYPE_DECL(SUBTRACT) + if(!VALIDATE_OP_IO_TYPES(SUBTRACT, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + + + + +static vsi_bool op_check_div + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(DIVIDE, 2, 1) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + END_IO_TYPE_DECL(DIVIDE) + if(!VALIDATE_OP_IO_TYPES(DIVIDE, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, 
self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + + +static vsi_bool op_check_mul + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(MULTIPLY, 2, 1) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_F32) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + END_IO_TYPE_DECL(MULTIPLY) + if(!VALIDATE_OP_IO_TYPES(MULTIPLY, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define DEF_ELEMENT_WISE_OP(name, kernel_name) \ + static vsi_status op_compute_##kernel_name \ + ( \ + vsi_nn_node_t * self, \ + vsi_nn_tensor_t ** inputs, \ + vsi_nn_tensor_t ** outputs \ + ) \ + { \ + return _eltwise_op_compute( ""#kernel_name, self, inputs, outputs ); \ + } \ +DEF_OP_REG(name, NULL, op_compute_##kernel_name, vsi_nn_op_common_deinit, \ + op_check_##kernel_name, vsi_nn_op_eltwise_setup, NULL, 2, 1) + +DEF_ELEMENT_WISE_OP( MINIMUM, minimum ); +DEF_ELEMENT_WISE_OP( MAXIMUM, maximum ); +DEF_ELEMENT_WISE_OP( ADD, add ); +DEF_ELEMENT_WISE_OP( SUBTRACT, sub ); +DEF_ELEMENT_WISE_OP( DIVIDE, div ); +DEF_ELEMENT_WISE_OP( MULTIPLY, mul ); +DEF_ELEMENT_WISE_OP( POW, pow ); + + +#undef DEF_ELEMENT_WISE_OP + +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c new file mode 100644 index 0000000..c4192d3 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -0,0 +1,183 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is 
hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status _eltwise_unary_op_compute + ( + const char * kernel_name, + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + + if( NULL == self ) + { + return status; + } + + // TODO: This optimzie is a hack for gpu path, + // it should be moved to gpu kernel setup. 
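+    // Unlike the binary eltwise path, no shape collapsing or extra kernel
+    // parameters are prepared here: the single input/output pair is handed
+    // straight to the kernel selector, and the kernel name alone ("sin",
+    // "exp", "log", "elu", "neg", "hard_sigmoid", "mish") selects the kernel.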
+ self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, inputs, 1, outputs, 1, NULL ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* _eltwise_op_compute() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i, out_rank; + uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_bool ret = TRUE; + + out_rank = inputs[0]->attr.dim_num; + + for(i = 0; i < out_rank; i++) + { + shape[i] = inputs[0]->attr.size[i]; + } + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = out_rank; + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(uint32_t) ); + } + else + { + uint32_t total_size_got; + uint32_t total_size_expected; + total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); + total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, + outputs[0]->attr.dim_num ); + if( total_size_expected != total_size_got ) + { + VSILOGW("Output size mismatch, expect %d, but got %d", + total_size_expected, total_size_got); + ret = FALSE; + } + } + + return ret; +} /* op_setup() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(ELTWISE_UNARY, 1, 1) + /* IO_TYPE(INPUT, OUTPUT) */ + IO_TYPE(D_I32, D_I32) + + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_BF16) + + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(ELTWISE_UNARY) + if(!VALIDATE_OP_IO_TYPES(ELTWISE_UNARY, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define DEF_ELEMENT_WISE_UNARY_OP(name, kernel_name) \ + static vsi_status op_compute_##kernel_name \ + ( \ + vsi_nn_node_t * self, \ + vsi_nn_tensor_t ** inputs, \ + vsi_nn_tensor_t ** outputs \ + ) \ + { \ + return _eltwise_unary_op_compute( ""#kernel_name, self, inputs, outputs ); \ + } \ +DEF_OP_REG(name, NULL, op_compute_##kernel_name, vsi_nn_op_common_deinit, op_check, op_setup, NULL, 2, 1) + +DEF_ELEMENT_WISE_UNARY_OP( SIN, sin ); +DEF_ELEMENT_WISE_UNARY_OP( EXP, exp ); +DEF_ELEMENT_WISE_UNARY_OP( LOG, log ); +DEF_ELEMENT_WISE_UNARY_OP( ELU, elu ); +DEF_ELEMENT_WISE_UNARY_OP( NEG, neg ); +DEF_ELEMENT_WISE_UNARY_OP( HARD_SIGMOID, hard_sigmoid ); +DEF_ELEMENT_WISE_UNARY_OP( MISH, mish ); + +#undef DEF_ELEMENT_UNARY_WISE_OP + +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c new file mode 100644 index 0000000..39dbfe9 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c @@ -0,0 +1,166 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is 
hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static void _reshape_tensor + ( + vsi_nn_tensor_t * input, + vx_tensor * output + ) +{ + vsi_nn_tensor_attr_t attr; + memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); + *output = input->t; + + if (input->attr.dim_num == 2) + { + attr.size[0] = input->attr.size[0]; + attr.size[1] = 1; + attr.size[2] = input->attr.size[1]; + attr.dim_num = 3; + } + + *output = vxReshapeTensor( input->t, (int32_t *)attr.size, attr.dim_num ); +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_embedding_lookup_param* p = &self->nn_param.embedding_lookup; + + _reshape_tensor(inputs[1], &(p->local.lut_tensor)); + _reshape_tensor(outputs[0], &(p->local.output_tensor)); + + self->n = vxTensorTableLookupNode2( self->graph->g, + inputs[0]->t, p->local.lut_tensor, p->local.output_tensor); + if( !self->n ) + { + status = VSI_FAILURE; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(EMBEDDING_LOOKUP, 2, 1) + IO_TYPE(D_I32, D_F16, D_F16) + IO_TYPE(D_I32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I32, D_U8|Q_ASYM, D_U8|Q_ASYM) + END_IO_TYPE_DECL(EMBEDDING_LOOKUP) + + if (!VALIDATE_OP_IO_TYPES(EMBEDDING_LOOKUP, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[1]->attr.dim_num; + memcpy( outputs[0]->attr.size, inputs[1]->attr.size, + sizeof(int) * inputs[1]->attr.dim_num ); + outputs[0]->attr.size[outputs[0]->attr.dim_num - 1] = inputs[0]->attr.size[0]; + 
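        /* Shape inference for EMBEDDING_LOOKUP (ovxlib layout, size[0] is the
         * innermost dimension): the output takes the lookup table's shape, with the
         * outermost dimension replaced by the number of indices. For example
         * (illustrative sizes), a table of shape {256, 1000} gathered with indices
         * of shape {5} yields an output of shape {256, 5}. */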
}
+    return TRUE;
+} /* op_setup() */
+
+static vsi_status op_deinit
+    (
+    vsi_nn_node_t * self
+    )
+{
+    if (self->nn_param.embedding_lookup.local.input_tensor != NULL)
+    {
+        vxReleaseTensor(&self->nn_param.embedding_lookup.local.input_tensor);
+        self->nn_param.embedding_lookup.local.input_tensor = NULL;
+    }
+    if (self->nn_param.embedding_lookup.local.lut_tensor != NULL)
+    {
+        vxReleaseTensor(&self->nn_param.embedding_lookup.local.lut_tensor);
+        self->nn_param.embedding_lookup.local.lut_tensor = NULL;
+    }
+    if (self->nn_param.embedding_lookup.local.output_tensor != NULL)
+    {
+        vxReleaseTensor(&self->nn_param.embedding_lookup.local.output_tensor);
+        self->nn_param.embedding_lookup.local.output_tensor = NULL;
+    }
+    vsi_nn_op_common_deinit(self);
+    return VSI_SUCCESS;
+} /* op_deinit() */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Registrar */
+DEF_OP_REG
+    (
+    /* op_name    */ EMBEDDING_LOOKUP,
+    /* init       */ NULL,
+    /* compute    */ op_compute,
+    /* deinit     */ op_deinit,
+    /* check      */ op_check,
+    /* setup      */ op_setup,
+    /* optimize   */ NULL,
+    /* input_num  */ 2,
+    /* output_num */ 1
+    );
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c
new file mode 100644
index 0000000..4ba10e9
--- /dev/null
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c
@@ -0,0 +1,156 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+* +*****************************************************************************/ +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_dtype_util_prv.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self );; +} + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(EXPAND_BROADCAST, 1, 1) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_I32|Q_DFP, D_I32|Q_DFP) + IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM) + END_IO_TYPE_DECL(EXPAND_BROADCAST) + if (!VALIDATE_OP_IO_TYPES(EXPAND_BROADCAST, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t *input_1; + vsi_nn_internal_node_t* mul_node = NULL; + vsi_nn_expand_broadcast_param *p = &self->nn_param.expand_broadcast; + + vsi_nn_internal_init_node_wksp(self); + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + attr.dim_num = p->dim_num; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.is_const = TRUE; + for(i = 0; i < p->dim_num; i++) + { + attr.size[i] = p->shape[i]; + } + input_1 = vsi_nn_internal_new_tensor( self, &attr, 1.0f ); + + mul_node = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0 ); + mul_node->inputs[0] = inputs[0]; + mul_node->inputs[1] = input_1->t; + mul_node->node->nn_param.multiply.scale = 1.0f; + mul_node->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + mul_node->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; + mul_node->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, mul_node); + + return TRUE; +} + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_deinit_node_wksp( self ); + return status; +} + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ EXPAND_BROADCAST, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c new file mode 100644 
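EXPAND_BROADCAST above does not copy data itself: its op_setup() builds a constant FLOAT16 tensor of ones (vsi_nn_internal_new_tensor(..., 1.0f)) with the requested target shape and routes the input through an internal MULTIPLY node, so broadcasting is delegated to the element-wise multiply. A minimal plain-C sketch of the same idea, with illustrative shapes and not ovxlib code:

```c
#include <stdio.h>

/* Broadcast a 1-D row of length 4 to a 3 x 4 result by multiplying it with a
 * constant tensor of ones that already has the target shape. */
int main(void)
{
    enum { ROWS = 3, COLS = 4 };
    float input[COLS] = { 1.0f, 2.0f, 3.0f, 4.0f };
    float ones[ROWS][COLS];
    float out[ROWS][COLS];
    int r, c;

    for (r = 0; r < ROWS; ++r)
        for (c = 0; c < COLS; ++c)
            ones[r][c] = 1.0f;

    /* out = input * ones; the smaller operand repeats along the new axis. */
    for (r = 0; r < ROWS; ++r)
        for (c = 0; c < COLS; ++c)
            out[r][c] = input[c] * ones[r][c];

    for (r = 0; r < ROWS; ++r)
    {
        for (c = 0; c < COLS; ++c)
            printf("%5.1f", out[r][c]);
        printf("\n");
    }
    return 0;
}
```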
index 0000000..a3b7fb7 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c @@ -0,0 +1,347 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +extern vx_kernel_description_t * vx_kernel_EXTRA_ENDING_list[]; + +static void check_tensor_shape + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vx_reference * params, + uint32_t index, + vx_bool rsFlg + ) +{ + vsi_nn_tensor_attr_t attr; + + if (index == 0) + { + if( input->attr.dim_num == 1) + { + memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + self->nn_param.extra_ending.local.local_tensor[index] = + vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.extra_ending.local.local_tensor[index]; + } + else + params[index] = (vx_reference)input->t; + } + else if (index == 1) + { + params[index] = (vx_reference)input->t; + } + else if (index == 2) + { + if( input->attr.dim_num == 1) + { + memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + self->nn_param.extra_ending.local.local_tensor[index] = + vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.extra_ending.local.local_tensor[index]; + } + else + params[index] = (vx_reference)input->t; + } + else + { + VSILOGE("No more local tensor!(pow) at [%s : %d]\n", __FILE__, __LINE__); + } +} + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_tensor_t* extraInput + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[3]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + check_tensor_shape(self, inputs[0], params, 0, 0); + check_tensor_shape(self, extraInput, params, 1, 0); + check_tensor_shape(self, 
outputs[0], params, 2, 0); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, 3 ); + + return status; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_tensor_t* extraInput + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[3]; + vx_border_t border; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + check_tensor_shape(self, inputs[0], params, 0, 0); + check_tensor_shape(self, extraInput, params, 1, 0); + check_tensor_shape(self, outputs[0], params, 2, 0); + /*TODO: Add code if need to change your parameter*/ + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, 3 ); + + border.mode = VX_BORDER_REPLICATE; + border.constant_value.U32 = 0; + status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); + + return status; +} + +static vsi_status vx_op_pre_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_kernel_info_t * kernel_info + ) +{ + vsi_nn_type_e outputDataFormat = outputs[0]->attr.dtype.vx_type; + + if (outputDataFormat == VSI_NN_TYPE_INT16 || outputDataFormat == VSI_NN_TYPE_FLOAT16) + { + kernel_info->kernel_index = 1; + } + if (outputDataFormat == VSI_NN_TYPE_INT8) + { + kernel_info->kernel_index = 2; + } + if (outputDataFormat == VSI_NN_TYPE_UINT8) + { + kernel_info->kernel_index = 3; + } + else + { + VSILOGE("Not support input or output data format!(extra ending) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + + return VSI_SUCCESS; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_kernel_info_t kernel_info; + vsi_nn_tensor_t* tmpRealInput = NULL; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + status = VSI_FAILURE; + kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); + kernel_info.kernel = vx_kernel_EXTRA_ENDING_list; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_extra_ending"; + + { + vsi_nn_tensor_attr_t attr; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + attr.size[0] = self->nn_param.extra_ending.length; + attr.size[1] = 1; + attr.size[2] = 1; + attr.size[3] = 1; + attr.dim_num = 2; + attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + attr.vtl = FALSE; + tmpRealInput = vsi_nn_CreateTensorFromData(self->graph, + (uint8_t*)&self->nn_param.extra_ending.value, &attr); + } + + if( kernel_info.type == VX_KERNEL_TYPE_VX) + { + kernel_info.kernel_index = 1; + kernel_info.init_index = 1; + vx_op_pre_compute(self, inputs, outputs, &kernel_info); + } + else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ + { + kernel_info.kernel_index = 0; + kernel_info.init_index = 0; + kernel_info.type = VX_KERNEL_TYPE_CPU; + } + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) + { + free(kernel_info.resource_name); + } + if( NULL == self->n ) + { + status = VSI_FAILURE; + goto final; + } + + if(kernel_info.type == VX_KERNEL_TYPE_VX) + { + status = vx_op_compute(self, inputs, outputs, tmpRealInput); + } + else + { + status = cpu_op_compute(self, inputs, outputs, tmpRealInput); + } + +final: + if(tmpRealInput) vsi_nn_ReleaseTensor(&tmpRealInput); + return status; +} /* op_compute() */ + 
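/* Compute flow for EXTRA_ENDING, for reference: op_compute() first wraps the node's
 * extra_ending.length / .value parameters in a temporary UINT8 tensor (tmpRealInput),
 * registers the client kernel, then binds parameters through either vx_op_compute()
 * (shader path, with vx_op_pre_compute() picking a kernel index per output data type)
 * or cpu_op_compute() (CPU reference path), and finally releases the temporary tensor
 * at the end of op_compute(). */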
+static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(EXTRA_ENDING, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + END_IO_TYPE_DECL(EXTRA_ENDING) + if (!VALIDATE_OP_IO_TYPES(EXTRA_ENDING, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. */ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + VSILOGE("output size cannot be zero!(EXTRA_ENDING)\n"); + return FALSE; + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + uint32_t i; + for (i = 0; i < _VSI_NN_EXTRA_ENDING_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.extra_ending.local.local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.extra_ending.local.local_tensor[i])); + self->nn_param.extra_ending.local.local_tensor[i] = NULL; + } + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ EXTRA_ENDING, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_floor.c b/src/tim/vx/internal/src/ops/vsi_nn_op_floor.c new file mode 100644 index 0000000..1e9d5a7 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_floor.c @@ -0,0 +1,108 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(FLOOR, 1, 1) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + END_IO_TYPE_DECL(FLOOR) + if (!VALIDATE_OP_IO_TYPES(FLOOR, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_nn_rounding_params_t p; + memset(&p, 0, sizeof(p)); + p.mode = VX_NN_DS_SIZE_ROUNDING_FLOOR; + self->n = vxTensorRoundingNode(self->graph->g, inputs[0]->t, &p, sizeof(p), outputs[0]->t); + if( !self->n ) + { + status = VSI_FAILURE; + } + return status; +} /* op_compute() */ + +#ifdef __cpluplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ FLOOR, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c new file mode 100644 index 0000000..9cd0bd2 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c @@ -0,0 +1,192 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + uint32_t new_rank = 0; + vsi_bool ret; + + if( NULL == self ) + { + return VSI_FAILURE; + } + + ret = vsi_nn_kernel_optimize_eltwise_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + (int32_t *)inputs[1]->attr.size, inputs[1]->attr.dim_num, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if( ret ) + { + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], (uint32_t*)shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shapes[2], new_rank ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "floordiv", + &reshape_tensors[0], _INPUT_NUM, + &reshape_tensors[2], _OUTPUT_NUM, NULL ); + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + vsi_nn_ReleaseTensor( &reshape_tensors[2] ); + } + + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(FLOORDIV, 2, 1) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(FLOORDIV) + if(!VALIDATE_OP_IO_TYPES(FLOORDIV, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i, out_rank, in1_rank, in2_rank; + uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_bool ret = TRUE; + + in1_rank = inputs[0]->attr.dim_num; + in2_rank = inputs[1]->attr.dim_num; + out_rank = 
vsi_nn_max( in1_rank, in2_rank ); + + for(i = 0; i < out_rank; i++) + { + uint32_t sz0, sz1; + sz0 = i < in1_rank ? inputs[0]->attr.size[i] : 1; + sz1 = i < in2_rank ? inputs[1]->attr.size[i] : 1; + shape[i] = vsi_nn_max( sz0, sz1 ); + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = out_rank; + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(uint32_t) ); + } + else + { + uint32_t total_size_got; + uint32_t total_size_expected; + total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); + total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, + outputs[0]->attr.dim_num ); + if( total_size_expected != total_size_got ) + { + VSILOGW("Output size mismatch, expect %d, but got %d", + total_size_expected, total_size_got); + ret = FALSE; + } + } + + return ret; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ FLOORDIV, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c new file mode 100644 index 0000000..b2023b6 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c @@ -0,0 +1,283 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + uint32_t axis; + uint32_t i = 0; + uint32_t num_fc = 1, num_no_fc = 1; + uint32_t num_of_intput_dims = 0; + int32_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t dims = 0; + vx_tensor input = NULL; + vx_tensor output = NULL; + vx_tensor weight = NULL; + vx_tensor bias = NULL; + + status = VSI_FAILURE; + + memcpy(input_size, inputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + num_of_intput_dims = inputs[0]->attr.dim_num; + axis = inputs[0]->attr.dim_num - 2; + + for(i = 0; i <= (uint32_t)axis; ++i) + { + num_fc *= input_size[i]; + } + for(i = axis + 1; i < num_of_intput_dims; ++i) + { + num_no_fc *= input_size[i]; + } + + input_size[0] = num_fc; + input_size[1] = num_no_fc; + dims= 2; + input = vxReshapeTensor(inputs[0]->t, input_size, dims); + + weight = inputs[1]->t; + + if( inputs[2] != NULL ) + { + bias = inputs[2]->t; + } + + output = outputs[0]->t; + + self->n = vxFullyConnectedLayer( + self->graph->g, + input, + weight, + bias, + self->vx_param.overflow_policy, + self->vx_param.rounding_policy, + output + ); + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + + if (input) vxReleaseTensor(&input); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + /* Check fl and scale*/ + ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); + + ret = ret && vsi_nn_OpCheck(VSI_NN_OP_FCL_RELU, self, inputs, outputs); + + if(!ret) { + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(FCL, 3, 1) + /* IO_TYPE(INPUT, WEIGHT, BIAS, OUTPUT) */ + IO_TYPE(D_F16, D_F16, D_NONE, D_F16) + IO_TYPE(D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F32, D_U8|Q_ASYM) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F16) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F16) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + END_IO_TYPE_DECL(FCL) + ret = VALIDATE_OP_IO_TYPES(FCL, self, inputs, self->input.num, outputs, self->output.num); + if(!ret) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + 
VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + } + + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t dim_num; + uint32_t perm[4] = { 0 }; + uint32_t as_shape[4] = { 0 }; + +#ifdef VX_CONVERT_POLICY_WRAP_ENABLE + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } +#endif + + /* TODO: Driver should handle this, + * Check transpose + * */ + if( VSI_NN_DIM_FMT_NHWC == inputs[1]->attr.dtype.fmt && + VSI_NN_TYPE_VDATA != inputs[1]->attr.dtype.vx_type ) + { + /* TODO: This is used to handle the first fcl. */ + if( 1 != inputs[0]->attr.size[0] || 1 != inputs[0]->attr.size[1] ) + { + dim_num = 4; + perm[0] = 3; + perm[1] = 2; + perm[2] = 0; + perm[3] = 1; + as_shape[0] = inputs[0]->attr.size[0]; + as_shape[1] = inputs[0]->attr.size[1]; + as_shape[2] = inputs[0]->attr.size[2]; + as_shape[3] = inputs[1]->attr.size[3]; + } + else + { + dim_num = 2; + perm[0] = 1; + perm[1] = 0; + as_shape[0] = vsi_nn_ShapeProduct( inputs[0]->attr.size, + inputs[0]->attr.dim_num ); + as_shape[1] = inputs[1]->attr.size[3]; + } + vsi_nn_TransposeTensor( self->graph, inputs[1], perm, dim_num, as_shape ); + inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW; + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + uint32_t input_dim = inputs[0]->attr.dim_num; + + if ( vsi_nn_compareVersion(self->graph, 1, 1, 0) >= 0) + { + switch (input_dim) + { + // CAUTION: FC input shape need contain batch size. + // and graph version no smaller than 5.0.0 + case 2: + case 3: + case 4: + outputs[0]->attr.dim_num = 2; + outputs[0]->attr.size[0] = inputs[1]->attr.size[1]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[input_dim-1]; + break; + default: + VSILOGE("input dim[%u] error\n", inputs[0]->attr.dim_num); + return FALSE; + } + } + else + { + switch (input_dim) + { + // CAUTION: FC input shape with/without batch size. + // and graph version smaller than 5.0.0 + case 1: + case 3: + // add a workaround to handle fc layer input without batch size + // But nput with 3 dimensions and with batch size will go into this path. 
+ // FIX ME + outputs[0]->attr.dim_num = 1; + outputs[0]->attr.size[0] = inputs[1]->attr.size[1]; + break; + case 2: + case 4: + outputs[0]->attr.dim_num = 2; + outputs[0]->attr.size[0] = inputs[1]->attr.size[1]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[input_dim-1]; + break; + default: + VSILOGE("input dim[%u] error\n", inputs[0]->attr.dim_num); + return FALSE; + } + } + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ FCL, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 3, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c new file mode 100644 index 0000000..7fca31a --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c @@ -0,0 +1,450 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_util.h" + +#define _ARG_NUM (2) +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +#define USE_OVX_API TRUE + +#if (USE_OVX_API == FALSE) +extern vx_kernel_description_t * vx_kernel_FCL2_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_fcl_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.fcl); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ +#define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_INT32, axis ); + //_SET_PARAM( 1, VX_TYPE_FLOAT32, bias ); + //_SET_PARAM( 2, VX_TYPE_TENSOR, data_bias ); + //_SET_PARAM( 3, VX_TYPE_TENSOR, data_weight ); + //_SET_PARAM( 4, VX_TYPE_FLOAT32, regularize ); + _SET_PARAM( 1, VX_TYPE_INT32, weights ); +#undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + uint32_t axis; + vsi_nn_fcl_param * p; + uint32_t i = 0; + uint32_t num_fc = 1, num_no_fc = 1; + uint32_t num_of_dims[3] = {0}; + uint32_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t output_size[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t weights_size[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t size[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t ofm = 0; + uint32_t dims = 0; + vx_tensor input = NULL; + vx_tensor output = NULL; + vx_tensor weight = NULL; + vx_tensor bias = NULL; + int32_t index = 0; + vx_border_t border; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + p = (vsi_nn_fcl_param *)&(self->nn_param.fcl); + axis = p->axis; + + memcpy(input_size, inputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + num_of_dims[0] = inputs[0]->attr.dim_num; + memcpy(output_size, outputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + num_of_dims[1] = outputs[0]->attr.dim_num; + memcpy(weights_size, inputs[1]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + num_of_dims[2] = inputs[1]->attr.dim_num; + + ofm = weights_size[num_of_dims[2] - 1]; + + for(i = 0; i <= (uint32_t)axis; ++i) + { + num_fc *= input_size[i]; + } + for(i = axis + 1; i < num_of_dims[0]; ++i) + { + num_no_fc *= input_size[i]; + } + + size[0] = num_fc; + size[1] = num_no_fc; + dims= 2; + input = vxReshapeTensor(inputs[0]->t, size, dims); + + size[0] = num_fc; + size[1] = ofm; + dims= 2; + weight = vxReshapeTensor(inputs[1]->t, size, dims); + + size[0] = ofm; + size[1] = 1; + dims= 2; + bias = vxReshapeTensor(inputs[2]->t, size, dims); + + size[0] = ofm; + size[1] = num_no_fc; + dims= 2; + output = vxReshapeTensor(outputs[0]->t, size, dims); + + status |= vxSetParameterByIndex(self->n, index++, (vx_reference)input); + status |= vxSetParameterByIndex(self->n, index++, (vx_reference)weight); + status |= vxSetParameterByIndex(self->n, index++, (vx_reference)bias); + status |= vxSetParameterByIndex(self->n, index++, (vx_reference)output); + + border.mode = VX_BORDER_CONSTANT; + border.constant_value.S16 = 0; + status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); + + if (input) vxReleaseTensor(&input); + if (weight) vxReleaseTensor(&weight); + if (bias) vxReleaseTensor(&bias); + if (output) vxReleaseTensor(&output); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute, + NULL +}; +#endif + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; +#if (USE_OVX_API == TRUE) + uint32_t axis; + vsi_nn_fcl_param * p; + uint32_t i = 0; + uint32_t num_fc = 1, num_no_fc = 1; + uint32_t num_of_dims[4] = {0}; + int32_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t output_size[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t weights_size[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t bias_size[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t ofm = 0; + uint32_t dims = 0; + vx_tensor input = NULL; + vx_tensor output = NULL; + vx_tensor weight = NULL; + vx_tensor bias = NULL; + + p = (vsi_nn_fcl_param *)&(self->nn_param.fcl); + axis = p->axis; + + memcpy(input_size, inputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + num_of_dims[0] = inputs[0]->attr.dim_num; + 
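    /* The loops below flatten the input for the fully-connected layer: every
     * dimension up to and including 'axis' is folded into num_fc (features per
     * sample) and the remaining dimensions into num_no_fc (effective batch), so
     * vxFullyConnectedLayer always sees 2-D tensors. For example (illustrative
     * sizes), an input of shape {32, 10, 4} with axis == 1 gives
     * num_fc = 320 and num_no_fc = 4. */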
memcpy(output_size, outputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + num_of_dims[1] = outputs[0]->attr.dim_num; + memcpy(weights_size, inputs[1]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + num_of_dims[2] = inputs[1]->attr.dim_num; + if( inputs[2] != NULL ) + { + memcpy(bias_size, inputs[2]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + num_of_dims[3] = inputs[2]->attr.dim_num; + } + + ofm = weights_size[num_of_dims[2] - 1]; + + for(i = 0; i <= (uint32_t)axis; ++i) + { + num_fc *= input_size[i]; + } + for(i = axis + 1; i < num_of_dims[0]; ++i) + { + num_no_fc *= input_size[i]; + } + + input_size[0] = num_fc; + input_size[1] = num_no_fc; + dims= 2; + input = vxReshapeTensor(inputs[0]->t, input_size, dims); + + weights_size[0] = num_fc; + weights_size[1] = ofm; + dims= 2; + weight = vxReshapeTensor(inputs[1]->t, weights_size, dims); + + if( inputs[2] != NULL ) + { + bias_size[0] = ofm; + bias_size[1] = 1; + dims= 2; + bias = vxReshapeTensor(inputs[2]->t, bias_size, dims); + } + + output_size[0] = ofm; + output_size[1] = num_no_fc; + dims= 2; + output = vxReshapeTensor(outputs[0]->t, output_size, dims); + + self->n = vxFullyConnectedLayer( + self->graph->g, + input, + weight, + bias, + self->vx_param.overflow_policy, + self->vx_param.rounding_policy, + output + ); + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + + if (input) vxReleaseTensor(&input); + if (weight) vxReleaseTensor(&weight); + if (bias) vxReleaseTensor(&bias); + if (output) vxReleaseTensor(&output); +#else + vsi_nn_kernel_info_t kernel_info; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + status = VSI_FAILURE; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_fullconnect2"; + kernel_info.type = VX_KERNEL_TYPE_VX; + kernel_info.kernel = vx_kernel_FCL2_list; + kernel_info.kernel_index = 1; + kernel_info.init_index = 1; + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) free(kernel_info.resource_name); + if( NULL == self->n ) + { + return VSI_FAILURE; + } + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } +#endif + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_FCL, self, inputs, outputs); + + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_fcl_param * p; + uint32_t i, j; + uint32_t num_in_fmp = 1; + +#ifdef VX_CONVERT_POLICY_WRAP_ENABLE + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } +#endif + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + p = (vsi_nn_fcl_param *)&(self->nn_param.fcl); + if (inputs[1]->attr.is_const == TRUE) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num - p->axis; + for(i = p->axis + 1, j = 1; i < inputs[0]->attr.dim_num && j < outputs[0]->attr.dim_num; ++i, ++j) + { + outputs[0]->attr.size[j] = inputs[0]->attr.size[i]; + } + } + else + { + /* For fullconnect_op, weight not const tensor */ + outputs[0]->attr.dim_num = 2; + for (i = p->axis + 1; i < inputs[0]->attr.dim_num; i++) + { + num_in_fmp *= 
inputs[0]->attr.size[i]; + } + outputs[0]->attr.size[1] = num_in_fmp; + } + outputs[0]->attr.size[0] = p->weights; + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ FCL2, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c new file mode 100644 index 0000000..9cbaebd --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c @@ -0,0 +1,316 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status _set_fc_relu_parameter + ( + vsi_nn_node_t * self, + vx_nn_convolution_relu_pooling_params_t * param + ); + +static vsi_status _set_fc_relu_parameter + ( + vsi_nn_node_t * self, + vx_nn_convolution_relu_pooling_params_t * param + ) +{ + vx_scalar pad_const; + int32_t pad_const_val; + + pad_const_val = 0; + memset( param, 0, sizeof(vx_nn_convolution_relu_pooling_params_t) ); + pad_const = vxCreateScalar(self->graph->ctx->c, VX_TYPE_INT32, &pad_const_val); + if( !pad_const ) + { + VSILOGE("Create scalar fail\n"); + return VSI_FAILURE; + } + + param->pad_x_left = 0; + param->pad_x_right = 0; + param->pad_y_top = 0; + param->pad_y_bottom = 0; + param->dilation_x = 0; + param->dilation_y = 0; + param->accumulator_bits = (vx_uint8)self->vx_param.accumulator_bits; + param->overflow_policy = self->vx_param.overflow_policy; + param->rounding_policy = self->vx_param.rounding_policy; + param->down_scale_size_rounding = self->vx_param.down_scale_size_rounding; + param->enable_relu = self->vx_param.has_relu; + param->pool_type = 0; + param->pool_size_x = 0; + param->pool_size_y = 0; + param->pad_mode = VX_PAD_CONSTANT; + param->pad_const = pad_const; + + return VSI_SUCCESS; +} /* _set_fc_relu_parameter() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + status = VSI_FAILURE; + + self->n = vxFullyConnectedReluLayer( + self->graph->g, + inputs[0]->t, + inputs[1]->wb, + 0, + 0, + self->vx_param.overflow_policy, + self->vx_param.rounding_policy, + self->vx_param.down_scale_size_rounding, + self->vx_param.has_relu, + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + /* Check fl and scale*/ + ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); + + if(ret) { + /* check inputs outputs data type */ + /* NN Support */ + BEGIN_IO_TYPE_DECL(FCL_RELU, 3, 1) + /* IO_TYPE(INPUT, WEIGHT, BIAS, OUTPUT) */ + /* NN Support - I8 */ + IO_TYPE(D_I8|Q_SYM_PC, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_SYM_PC) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) + + /* NN Support - U8 */ + IO_TYPE(D_U8|Q_SYM_PC, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_SYM_PC) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + + /* NN Support - I16 */ + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + 
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + + /* NN Support - F16 */ + IO_TYPE(D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_U8|Q_ASYM) + + /* NN Support - BF16 */ + IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) + + /* NN Support - F32 */ + IO_TYPE(D_F32, D_BF16, D_F32, D_F32) + IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) + END_IO_TYPE_DECL(FCL_RELU) + ret = VALIDATE_OP_IO_TYPES(FCL_RELU, self, inputs, self->input.num, outputs, self->output.num); + + /* TP Support */ + if (!ret ) { + uint32_t valid_dtypes[] = { + D_F16, D_BF16, D_F32, D_I16|Q_DFP, D_I8|Q_DFP, D_I8|Q_ASYM, D_U8|Q_DFP, D_U8|Q_ASYM + }; + + uint32_t weight_type = inputs[1]->attr.dtype.vx_type | inputs[1]->attr.dtype.qnt_type << Q_SHIFT; + uint32_t inputs_types[3] = { 0 }; + vsi_bool supported[3] = { FALSE, FALSE, FALSE }; + int i = 0; + + inputs_types[0] = inputs[0]->attr.dtype.vx_type | inputs[0]->attr.dtype.qnt_type << Q_SHIFT; + inputs_types[2] = outputs[0]->attr.dtype.vx_type | outputs[0]->attr.dtype.qnt_type << Q_SHIFT; + if (inputs[2]) { + switch(inputs[1]->attr.dtype.vx_type) { + case D_F16: + case D_BF16: + case D_F32: + if(inputs[2]->attr.dtype.vx_type == (vsi_nn_type_e)D_F32) { + inputs_types[1] = weight_type; + } + break; + case D_I16: + case D_I8: + case D_U8: + if (inputs[2]->attr.dtype.vx_type == (vsi_nn_type_e)D_I32 || + inputs[2]->attr.dtype.vx_type == (vsi_nn_type_e)D_I64) { + inputs_types[1] = weight_type; + } + break; + default: + break; + } + } else { + inputs_types[1] = weight_type; + } + + for (i = 0; i < 3; i++) { + supported[i] = is_item_in_array(&inputs_types[i], valid_dtypes, + sizeof(uint32_t), _cnt_of_array(valid_dtypes)); + } + + ret = supported[0] && supported[1] && supported[2]; + } + + if(!ret) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + vsi_nn_safe_free(desc); + return FALSE; + } + } + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret; + vx_nn_convolution_relu_pooling_params_t p; + vx_weights_biases_parameter_optimizations_ext_t opt; + vx_weights_biases_parameter_optimizations_ext_t * p_opt; + +#ifdef VX_CONVERT_POLICY_WRAP_ENABLE + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } +#endif + + ret = vsi_nn_OpSetup( VSI_NN_OP_FCL, self, inputs, outputs ); + + /* Prepare weight_bias */ + if(inputs[1]->wb == NULL) + { + if( _set_fc_relu_parameter( self, &p ) != VSI_SUCCESS ) + { + VSILOGE("set fc_relu weightbias parameter fail\n"); + return FALSE; + } + + p_opt = NULL; + memset( &opt, 0, sizeof( opt ) ); + if( outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC + || inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + opt.inputZeroPoint = inputs[0]->attr.dtype.zero_point; + } + opt.zrl = -1; + opt.outputFormat = outputs[0]->attr.dtype.vx_type; + opt.num_of_input_dims = inputs[0]->attr.dim_num; + opt.num_of_output_dims = outputs[0]->attr.dim_num; + p_opt = &opt; + + inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors3( + VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER, + inputs[0]->attr.size, + outputs[0]->attr.size, + outputs[0]->attr.size, + &p, + sizeof(p), + 
(vx_weights_biases_parameter_optimizations_t *)p_opt, + sizeof(opt), + inputs[1]->t, inputs[2]->t + ); + if( p.pad_const ) + { + vxReleaseScalar( &p.pad_const ); + } + } + + + if( NULL == inputs[1]->wb ) + { + VSILOGE( "Create weight bias fail." ); + ret = FALSE; + } + + return ret; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ FCL_RELU, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 3, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c new file mode 100644 index 0000000..4cc922e --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -0,0 +1,197 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+*
+*****************************************************************************/
+#include <string.h>
+#include <stdlib.h>
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_ops.h"
+#include "vsi_nn_tensor.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "utils/vsi_nn_constraint_check.h"
+
+#define _ARG_NUM (1)
+#define _INPUT_NUM (2)
+#define _OUTPUT_NUM (1)
+
+static vsi_status op_compute
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_param_t * param = NULL;
+    vsi_nn_kernel_node_t n = NULL;
+    uint32_t i = 0;
+    uint32_t block_size = 1, block_num = 1, axis_num = 0, indices_num = 1;
+    int32_t axis = self->nn_param.gather.axis;
+    uint32_t *input_size = inputs[0]->attr.size;
+    uint32_t dims_num = inputs[0]->attr.dim_num;
+
+    param = vsi_nn_kernel_param_create();
+
+    for(i = 0; i < (uint32_t)axis; ++i)
+    {
+        block_size *= input_size[i];
+    }
+
+    axis_num = input_size[axis];
+    for(i = axis + 1; i < dims_num; ++i)
+    {
+        block_num *= input_size[i];
+    }
+    for(i = 0; i < (uint32_t)inputs[1]->attr.dim_num; ++i)
+    {
+        indices_num *= inputs[1]->attr.size[i];
+    }
+
+    vsi_nn_kernel_param_add_int32( param, "block_size", block_size );
+    vsi_nn_kernel_param_add_int32( param, "block_num", block_num );
+    vsi_nn_kernel_param_add_int32( param, "axis_num", axis_num );
+    vsi_nn_kernel_param_add_int32( param, "indices_num", indices_num );
+    n = vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, outputs, 1, param );
+    if( n != NULL )
+    {
+        self->n = (vx_node)n;
+        status = VSI_SUCCESS;
+    }
+
+    if(param != NULL)
+    {
+        vsi_nn_kernel_param_release( &param );
+    }
+
+    return status;
+} /* op_compute() */
+
+static vsi_bool op_check
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    BEGIN_IO_TYPE_DECL(GATHER, 2, 1)
+        IO_TYPE(D_I32, D_I32, D_I32)
+        IO_TYPE(D_F32, D_I32, D_F32)
+        IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM)
+        IO_TYPE(D_F16, D_I32, D_I16|Q_DFP)
+        IO_TYPE(D_F16, D_I32, D_I8|Q_DFP)
+        IO_TYPE(D_F16, D_I32, D_F16)
+        IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM)
+        IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16)
+        IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP)
+        IO_TYPE(D_I8|Q_DFP, D_I32, D_F16)
+        IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP)
+        IO_TYPE(D_I16|Q_DFP, D_I32, D_F16)
+    END_IO_TYPE_DECL(GATHER)
+    if(!VALIDATE_OP_IO_TYPES(GATHER, self, inputs, self->input.num, outputs, self->output.num)) {
+        char* desc = generate_op_io_types_desc(inputs,
+                self->input.num, outputs, self->output.num);
+        VSILOGE("Inputs/Outputs data type not support: %s", desc);
+        destroy_op_io_types_desc(desc);
+        return FALSE;
+    }
+
+    return TRUE;
+} /* op_check() */
+
+static vsi_bool op_setup
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    /* TODO: Add code to compute outputs' shape.
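+     * The rule implemented just below: the output rank is
+     * in_rank + indices_rank - 1, and the output sizes are the input sizes
+     * before `axis`, then every indices size, then the input sizes after
+     * `axis`.  Illustrative example: input sizes {10, 8, 6}, indices
+     * sizes {4}, axis = 1  ->  output sizes {10, 4, 6}.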
*/ + uint32_t i = 0; + vsi_nn_gather_param * p = NULL; + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + uint32_t j = 0; + p = &(self->nn_param.gather); + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num + inputs[1]->attr.dim_num - 1; + for (i = 0; i < (uint32_t)p->axis; i++) + { + outputs[0]->attr.size[j] = inputs[0]->attr.size[i]; + j++; + } + for (i = 0; i < inputs[1]->attr.dim_num; i++) + { + outputs[0]->attr.size[j] = inputs[1]->attr.size[i]; + j++; + } + for (i = (uint32_t)p->axis + 1; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[j] = inputs[0]->attr.size[i]; + j++; + } + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + uint32_t i; + for (i = 0; i < _VSI_NN_GATHER_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.gather.local.local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.gather.local.local_tensor[i])); + self->nn_param.gather.local.local_tensor[i] = NULL; + } + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GATHER, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c new file mode 100644 index 0000000..9d5341a --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c @@ -0,0 +1,186 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (2) +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + uint32_t i = 0; + uint32_t block_size = 1, coord_dim = 1; + uint32_t *input_size = inputs[0]->attr.size; + uint32_t dims_num = inputs[0]->attr.dim_num; + + if(inputs[1]->attr.dim_num > 1) + { + coord_dim = inputs[1]->attr.size[0]; + } + if( coord_dim > 3 ) + { + CHECK_STATUS(status); + return status; + } + + param =vsi_nn_kernel_param_create(); + + for(i = 0; i < dims_num - coord_dim; ++i) + { + block_size *= input_size[i]; + } + + vsi_nn_kernel_param_add_int32( param, "block_size", block_size ); + vsi_nn_kernel_param_add_int32( param, "coord_dim", coord_dim ); + n = vsi_nn_kernel_selector( self->graph, "gather_nd", inputs, 2, outputs, 1, param ); + if( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if(param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(GATHER_ND, 2, 1) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_F32, D_I32, D_F32) + IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I32, D_F16) + END_IO_TYPE_DECL(GATHER_ND) + if(!VALIDATE_OP_IO_TYPES(GATHER_ND, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
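+     * Shape rule implemented just below: coord_dim comes from indices
+     * size[0] (when the indices tensor has more than one dimension); the
+     * output keeps the first (input_rank - coord_dim) input sizes followed
+     * by the remaining indices sizes.  Illustrative example: input sizes
+     * {5, 4, 3}, indices sizes {2, 6}  ->  coord_dim = 2, output sizes {5, 6}.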
*/ + uint32_t i = 0; + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + uint32_t j = 0, coord_dim = 1; + if(inputs[1]->attr.dim_num > 1) + { + coord_dim = inputs[1]->attr.size[0]; + } + + for (i = 0; i < (uint32_t)inputs[0]->attr.dim_num - coord_dim; i++) + { + outputs[0]->attr.size[j++] = inputs[0]->attr.size[i]; + } + for (i = 1; i < inputs[1]->attr.dim_num; i++) + { + outputs[0]->attr.size[j++] = inputs[1]->attr.size[i]; + } + if(inputs[1]->attr.dim_num == 1) + { + outputs[0]->attr.size[j++] = inputs[1]->attr.size[0]; + } + outputs[0]->attr.dim_num = j; + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GATHER_ND, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c new file mode 100644 index 0000000..927123b --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c @@ -0,0 +1,308 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" + +#define _ARG_NUM (6) +#define _INPUT_NUM (4) +#define _OUTPUT_NUM (3) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +extern vx_kernel_description_t * vx_kernel_GENERATE_PROPOSALS_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_generate_proposals_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.generate_proposals); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ + #define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_FLOAT32, height_stride ); + _SET_PARAM( 1, VX_TYPE_FLOAT32, width_stride ); + _SET_PARAM( 2, VX_TYPE_INT32, pre_nms_top_n ); + _SET_PARAM( 3, VX_TYPE_INT32, post_nms_top_n ); + _SET_PARAM( 4, VX_TYPE_FLOAT32, iou_threshold ); + _SET_PARAM( 5, VX_TYPE_FLOAT32, min_size ); + #undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + /*TODO: Add code if need to change your parameter*/ + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. 
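+     * The params array is laid out as the _IO_NUM tensor references
+     * (4 inputs followed by 3 outputs) and then the _ARG_NUM scalars
+     * created by _create_params(): height_stride, width_stride,
+     * pre_nms_top_n, post_nms_top_n, iou_threshold and min_size, in that
+     * order.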
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute, + NULL +}; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_kernel_info_t kernel_info; + char *path = NULL; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + status = VSI_FAILURE; + kernel_info.type = VX_KERNEL_TYPE_CPU; + kernel_info.kernel = vx_kernel_GENERATE_PROPOSALS_list; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_generate_proposals"; + path = getenv("USER_VX_SOURCE_PATH"); + if(path) + vsi_nn_VxResourceSetPath(path); + + if( kernel_info.type == VX_KERNEL_TYPE_VX) + { + kernel_info.kernel_index = 1; + kernel_info.init_index = 1; + } + else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ + { + kernel_info.kernel_index = 0; + kernel_info.init_index = 0; + } + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) + { + free(kernel_info.resource_name); + } + if( NULL == self->n ) + { + return VSI_FAILURE; + } + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + vsi_nn_generate_proposals_param * p; + int32_t num_output_rois; + p = &(self->nn_param.generate_proposals); + num_output_rois = vsi_nn_GetElementNum(inputs[0]); + if(p->pre_nms_top_n > 0) + { + num_output_rois = p->pre_nms_top_n; + } + if(p->post_nms_top_n > 0) + { + num_output_rois = p->post_nms_top_n; + } + + outputs[0]->attr.dim_num = 1; + outputs[0]->attr.size[0] = num_output_rois; + + outputs[1]->attr.dim_num = 2; + outputs[1]->attr.size[0] = 4; + outputs[1]->attr.size[1] = num_output_rois; + + outputs[2]->attr.dim_num = 1; + outputs[2]->attr.size[0] = num_output_rois; + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GENERATE_PROPOSALS, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c new file mode 100644 index 0000000..0797dba --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c @@ -0,0 +1,393 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* 
and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (1) +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +#define LOCAL() ((vsi_nn_grouped_conv2d_param_local_data *)nn_param->local) + +typedef struct _vsi_nn_grouped_conv2d_param_local_data { + vsi_nn_tensor_t ** input_tensor_group; + vsi_nn_tensor_t ** weight_tensor_group; + vsi_nn_tensor_t ** bias_tensor_group; + vsi_nn_tensor_t ** output_tensor_group; +} vsi_nn_grouped_conv2d_param_local_data; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool res; + uint32_t i; + vsi_nn_grouped_conv2d_param *nn_param = &self->nn_param.grouped_conv2d; + nn_param->local = (vsi_nn_grouped_conv2d_param_local_data*)malloc( + sizeof(vsi_nn_grouped_conv2d_param_local_data)); + memset(nn_param->local, 0, sizeof(vsi_nn_grouped_conv2d_param_local_data)); + /* TODO */ + /* example code : add op */ + /* + self->n = vxTensorAddNode( self->graph->g, inputs[0]->t, inputs[1]->t, + VX_CONVERT_POLICY_SATURATE, outputs[0]->t ); + */ + LOCAL()->input_tensor_group = (vsi_nn_tensor_t **)malloc( + nn_param->group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->input_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(LOCAL()->input_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *)); + res = vsi_nn_CreateTensorGroup(self->graph, inputs[0], 2, + LOCAL()->input_tensor_group, nn_param->group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + + LOCAL()->weight_tensor_group = (vsi_nn_tensor_t **)malloc( + nn_param->group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->weight_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(LOCAL()->weight_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *)); + res = vsi_nn_CreateTensorGroup(self->graph, inputs[1], 3, + LOCAL()->weight_tensor_group, nn_param->group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV2D) at [%s : %d]\n", 
__FILE__, __LINE__); + return VSI_FAILURE; + } + + LOCAL()->bias_tensor_group = (vsi_nn_tensor_t **)malloc( + nn_param->group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->bias_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(LOCAL()->bias_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *)); + if (inputs[2] != NULL) + { + res = vsi_nn_CreateTensorGroup(self->graph, inputs[2], 0, + LOCAL()->bias_tensor_group, nn_param->group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + } + + LOCAL()->output_tensor_group = (vsi_nn_tensor_t **)malloc( + nn_param->group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->output_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(LOCAL()->output_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *)); + res = vsi_nn_CreateTensorGroup(self->graph, outputs[0], 2, + LOCAL()->output_tensor_group, nn_param->group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + + for (i = 0; i < nn_param->group; i++) + { + vx_tensor bias; + vx_nn_convolution_params_ext_t *p_ext = NULL; + vx_nn_convolution_params_ext2_t *p_ext2 = NULL; + vx_nn_convolution_params_ext2_t param_ext2; + memset( ¶m_ext2, 0, sizeof( vx_nn_convolution_params_ext2_t ) ); + p_ext2 = ¶m_ext2; + p_ext = &p_ext2->ext; + + //set ext relative parameters + p_ext->khr.padding_x = self->nn_param.conv2d.pad[0]; + p_ext->khr.padding_y = self->nn_param.conv2d.pad[2]; + if (self->nn_param.conv2d.dilation[0] > 0) + { + p_ext->khr.dilation_x = self->nn_param.conv2d.dilation[0] - 1; + } + if (self->nn_param.conv2d.dilation[1] > 0) + { + p_ext->khr.dilation_y = self->nn_param.conv2d.dilation[1] - 1; + } + p_ext->khr.overflow_policy = self->vx_param.overflow_policy; + p_ext->khr.rounding_policy = self->vx_param.rounding_policy; + p_ext->khr.down_scale_size_rounding = self->vx_param.down_scale_size_rounding; + + p_ext->padding_x_right = self->nn_param.conv2d.pad[1]; + p_ext->padding_y_bottom = self->nn_param.conv2d.pad[3]; + + //set ext2 relative parameters + p_ext2->depth_multiplier = self->nn_param.conv2d.multiplier; + p_ext2->stride_x = self->nn_param.conv2d.stride[0]; + p_ext2->stride_y = self->nn_param.conv2d.stride[1]; + + if( inputs[2] == NULL ) + { + bias = NULL; + } + else + { + bias = LOCAL()->bias_tensor_group[i]->t; + } + + self->n = vxConvolutionLayer( + self->graph->g, + LOCAL()->input_tensor_group[i]->t, + LOCAL()->weight_tensor_group[i]->t, + bias, + (vx_nn_convolution_params_t *)p_ext2, + sizeof(vx_nn_convolution_params_ext2_t), + LOCAL()->output_tensor_group[i]->t + ); + if( NULL == self->n ) + { + VSILOGE("Add vxConvolutionLayer fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + } + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(GROUPED_CONV2D, 3, 1) + IO_TYPE(D_F16, D_F16, D_NONE, D_F16) + IO_TYPE(D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_NONE, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + 
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) + IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) + END_IO_TYPE_DECL(GROUPED_CONV2D) + if (!VALIDATE_OP_IO_TYPES(GROUPED_CONV2D, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. */ + vsi_nn_grouped_conv2d_param *nn_param; + uint32_t perm[] = { 3, 2, 0, 1 }; + + /* TODO: Driver should handle this, + * Check transpose + * */ +#ifdef VX_CONVERT_POLICY_WRAP_ENABLE + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } +#endif + + if( VSI_NN_DIM_FMT_NHWC == inputs[1]->attr.dtype.fmt && + VSI_NN_TYPE_VDATA != inputs[1]->attr.dtype.vx_type ) + { + vsi_nn_TransposeTensor( self->graph, inputs[1], perm, 4, NULL ); + inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW; + } + + nn_param = &self->nn_param.grouped_conv2d; + vsi_nn_compute_padding( + inputs[0]->attr.size, + inputs[1]->attr.size, + nn_param->stride, + nn_param->dilation, + nn_param->pad_type, + nn_param->pad + ); + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + inputs[1]->attr.size[0], + &nn_param->pad[0], + nn_param->stride[0], + nn_param->dilation[0], + VSI_NN_ROUND_FLOOR + ); + outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[1], + inputs[1]->attr.size[1], + &nn_param->pad[2], + nn_param->stride[1], + nn_param->dilation[1], + VSI_NN_ROUND_FLOOR + ); + if(self->nn_param.conv2d.weights > 0) + { + outputs[0]->attr.size[2] = self->nn_param.conv2d.weights; + } + else if(self->nn_param.conv2d.multiplier > 0) + { + outputs[0]->attr.size[2] = inputs[0]->attr.size[2] * self->nn_param.conv2d.multiplier; + } + else + { + outputs[0]->attr.size[2] = inputs[1]->attr.size[3]; + } + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_grouped_conv2d_param *nn_param = &(self->nn_param.grouped_conv2d); + uint32_t i; + if (LOCAL()) + { + if (LOCAL()->input_tensor_group) + { 
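+            /* Release the per-group tensor views that op_compute() created
+             * with vsi_nn_CreateTensorGroup(); the containing arrays are
+             * freed right after each loop. */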
+ for (i = 0; i < nn_param->group; i++) + { + vsi_nn_ReleaseTensor(&(LOCAL()->input_tensor_group[i])); + } + free(LOCAL()->input_tensor_group); + } + if (LOCAL()->weight_tensor_group) + { + for (i = 0; i < nn_param->group; i++) + { + vsi_nn_ReleaseTensor(&(LOCAL()->weight_tensor_group[i])); + } + free(LOCAL()->weight_tensor_group); + } + if (LOCAL()->bias_tensor_group != NULL) + { + for (i = 0; i < nn_param->group; i++) + { + vsi_nn_ReleaseTensor(&(LOCAL()->bias_tensor_group[i])); + } + free(LOCAL()->bias_tensor_group); + } + if (LOCAL()->output_tensor_group != NULL) + { + for (i = 0; i < nn_param->group; i++) + { + vsi_nn_ReleaseTensor(&(LOCAL()->output_tensor_group[i])); + } + free(LOCAL()->output_tensor_group); + } + + free(LOCAL()); + } + vsi_nn_op_common_deinit(self); + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GROUPED_CONV2D, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c new file mode 100644 index 0000000..82ad745 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c @@ -0,0 +1,721 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_rnn_helper.h" + +typedef struct _gru_ovxlib_local_data_t { + vsi_nn_tensor_t* weights_input; + vsi_nn_tensor_t* weights_recurrent; + vsi_nn_tensor_t* cond_zeros; + vsi_nn_tensor_t* bias_z; + vsi_nn_tensor_t* bias_r; + vsi_nn_tensor_t* bias_z_r; + vsi_nn_tensor_t* bias_c; +} gru_ovxlib_local_data_t; + +static vsi_bool setup_op_shapes + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_gru_ovxlib_param* curr_param = &self->nn_param.gru_ovxlib; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + uint32_t num_units = 0; + uint32_t output_size = 0; + uint32_t batch_size = 0; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + if( curr_param->time_major ) + { + batch_size = inputs[GRU_INPUT_INPUT]->attr.size[1]; + } + else + { + batch_size = inputs[GRU_INPUT_INPUT]->attr.size[2]; + } + + num_units = inputs[GRU_INPUT_WEIGHT_I2R]->attr.size[1]; + if ( num_units != curr_param->num_units ) + { + VSILOGE("The num_units not matched(GRU).\n"); + return FALSE; + } + output_size = num_units; + + /* create h_state input/output if app doesn't provide them */ + if( !inputs[GRU_INPUT_H_STATE] ) + { + attr.dim_num = 2; + attr.size[1] = batch_size; + attr.size[0] = output_size; + memcpy( &attr.dtype, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = FALSE; + attr.is_const = TRUE; + + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + inputs[GRU_INPUT_H_STATE] = output_tensor->t; + } + + if( !outputs[GRU_OUTPUT_H_STATE] ) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + memcpy( &attr.dtype, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = TRUE; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + outputs[GRU_OUTPUT_H_STATE] = output_tensor->t; + } + + /* output */ + if( VSI_NN_DIM_AUTO == outputs[GRU_OUTPUT_OUTPUT]->attr.dim_num ) + { + outputs[GRU_OUTPUT_OUTPUT]->attr.size[0] = output_size; + if ( curr_param->return_sequences ) + { + outputs[GRU_OUTPUT_OUTPUT]->attr.size[1] = inputs[GRU_INPUT_INPUT]->attr.size[1]; + outputs[GRU_OUTPUT_OUTPUT]->attr.size[2] = inputs[GRU_INPUT_INPUT]->attr.size[2]; + outputs[GRU_OUTPUT_OUTPUT]->attr.dim_num = 3; + } + else + { + outputs[GRU_OUTPUT_OUTPUT]->attr.size[1] = batch_size; + outputs[GRU_OUTPUT_OUTPUT]->attr.dim_num = 2; + } + } + + /* output_state_out */ + if( VSI_NN_DIM_AUTO == outputs[GRU_OUTPUT_H_STATE]->attr.dim_num ) + { + outputs[GRU_OUTPUT_H_STATE]->attr.size[0] = output_size; + outputs[GRU_OUTPUT_H_STATE]->attr.size[1] = batch_size; + outputs[GRU_OUTPUT_H_STATE]->attr.dim_num = 2; + } + + return TRUE; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
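+     * A basic consistency check is done later in setup_op_shapes(), which
+     * rejects the node when num_units does not match size[1] of
+     * GRU_INPUT_WEIGHT_I2R.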
*/ + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_bool op_setup_default + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_gru_ovxlib_param* curr_param = &self->nn_param.gru_ovxlib; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_nn_tensor_t** split_output_tensors = NULL; + vsi_nn_tensor_t** grucell_reshape_output_tensors =NULL; + vsi_nn_tensor_t* last_step_h_state = NULL; + vsi_nn_tensor_t* tensor = NULL; + vsi_nn_tensor_t* input_tensor = NULL; + vsi_bool use_virtual_tensor = TRUE; + uint32_t batch_size = 0; + uint32_t time_step = 0; + uint32_t i = 0; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_node_wksp( self ); + + if( curr_param->time_major ) + { + batch_size = inputs[GRU_INPUT_INPUT]->attr.size[1]; + time_step = inputs[GRU_INPUT_INPUT]->attr.size[2]; + } + else + { + batch_size = inputs[GRU_INPUT_INPUT]->attr.size[2]; + time_step = inputs[GRU_INPUT_INPUT]->attr.size[1]; + } + + setup_op_shapes( self, inputs, outputs); + + /* default to input */ + input_tensor = inputs[GRU_INPUT_INPUT]; + if( !curr_param->time_major ) + { + /* transpose to time_major */ + output_tensor = vsi_nn_rnn_transpose_time_major(self, + inputs[GRU_INPUT_INPUT], NULL, use_virtual_tensor); + input_tensor = output_tensor->t; + } + + /* split input tensor */ + split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + grucell_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + memset( grucell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + + vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, time_step, use_virtual_tensor); + + vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + + last_step_h_state = inputs[GRU_INPUT_H_STATE]; + for( i = 0; i < time_step; i++ ) + { + vsi_nn_tensor_t* reshape_output = NULL; + vsi_nn_tensor_t* grucell_out0 = NULL; + vsi_nn_tensor_t* grucell_out1 = NULL; + + /* reshape for split output */ + output_tensor = vsi_nn_rnn_reshape_split_output(self, + split_output_tensors[i], batch_size, use_virtual_tensor); + reshape_output = output_tensor->t; + + /* grucell output */ + if ( (i == time_step - 1) && !curr_param->return_sequences ) + { + grucell_out0 = outputs[GRU_OUTPUT_OUTPUT]; + } + else + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + grucell_out0 = output_tensor->t; + } + + if( i != time_step - 1 ) + { + /* grucell output h_state */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[GRU_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + grucell_out1 = output_tensor->t; + } + else + { + grucell_out1 = outputs[GRU_OUTPUT_H_STATE]; + } + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_OVXLIB, 0, 0 ); + curr->node->nn_param.grucell_ovxlib.num_units = curr_param->num_units; + curr->node->nn_param.grucell_ovxlib.activation = curr_param->activation; + 
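+        /* The GRU layer is lowered into one GRUCELL_OVXLIB internal node per
+         * time step: the cell hyper-parameters are copied from the layer
+         * parameters, and last_step_h_state chains each cell's H_STATE output
+         * into the next iteration's H_STATE input. */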
curr->node->nn_param.grucell_ovxlib.recurrent_activation = curr_param->recurrent_activation; + curr->node->nn_param.grucell_ovxlib.linear_before_reset = curr_param->linear_before_reset; + memcpy( curr->node->nn_param.grucell_ovxlib.internal_dtype, + curr_param->internal_dtype, sizeof( curr_param->internal_dtype ) ); + curr->node->nn_param.grucell_ovxlib.use_cudnn_implementation = curr_param->use_cudnn_implementation; + curr->node->nn_param.grucell_ovxlib.cudnn_implementation_version = curr_param->cudnn_implementation_version; + curr->inputs[GRUCELL_INPUT_INPUT] = reshape_output; + curr->inputs[GRUCELL_INPUT_H_STATE] = last_step_h_state; + + curr->inputs[GRUCELL_INPUT_WEIGHT_I2R] = inputs[GRU_INPUT_WEIGHT_I2R]; + curr->inputs[GRUCELL_INPUT_WEIGHT_I2Z] = inputs[GRU_INPUT_WEIGHT_I2Z]; + curr->inputs[GRUCELL_INPUT_WEIGHT_H2R] = inputs[GRU_INPUT_WEIGHT_H2R]; + curr->inputs[GRUCELL_INPUT_WEIGHT_H2Z] = inputs[GRU_INPUT_WEIGHT_H2Z]; + + curr->inputs[GRUCELL_INPUT_BIAS_I2R] = inputs[GRU_INPUT_BIAS_I2R]; + curr->inputs[GRUCELL_INPUT_BIAS_I2Z] = inputs[GRU_INPUT_BIAS_I2Z]; + + curr->inputs[GRUCELL_INPUT_BIAS_H2R] = inputs[GRU_INPUT_BIAS_H2R]; + curr->inputs[GRUCELL_INPUT_BIAS_H2Z] = inputs[GRU_INPUT_BIAS_H2Z]; + + curr->inputs[GRUCELL_INPUT_WEIGHT_I2C] = inputs[GRU_INPUT_WEIGHT_I2C]; + curr->inputs[GRUCELL_INPUT_WEIGHT_H2C] = inputs[GRU_INPUT_WEIGHT_H2C]; + + curr->inputs[GRUCELL_INPUT_BIAS_I2C] = inputs[GRU_INPUT_BIAS_I2C]; + curr->inputs[GRUCELL_INPUT_BIAS_H2C] = inputs[GRU_INPUT_BIAS_H2C]; + + curr->outputs[GRUCELL_OUTPUT_OUTPUT] = grucell_out0; + curr->outputs[GRUCELL_OUTPUT_H_STATE] = grucell_out1; + + vsi_nn_internal_setup_node( self, curr ); + + last_step_h_state = grucell_out1; + + if ( curr_param->return_sequences ) + { + /* reshape output to 3-dims */ + output_tensor = vsi_nn_rnn_reshape_cell_output(self, + grucell_out0, batch_size, use_virtual_tensor); + grucell_reshape_output_tensors[i] = output_tensor->t; + } + } + + if ( curr_param->return_sequences ) + { + tensor = outputs[GRU_OUTPUT_OUTPUT]; + if( !curr_param->time_major ) + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + tensor = output_tensor->t; + } + + /* concat grucell output, the gru's output is 3-dims */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + curr->node->nn_param.concat.axis = 2; + for( i = 0; i < time_step; i++ ) + { + curr->inputs[i] = grucell_reshape_output_tensors[i]; + } + curr->outputs[0] = tensor; + vsi_nn_internal_setup_node( self, curr ); + + if( !curr_param->time_major ) + { + /* transpose time_major to batch_major*/ + vsi_nn_rnn_transpose_time_major(self, + tensor, outputs[GRU_OUTPUT_OUTPUT], use_virtual_tensor); + } + } + + vsi_nn_safe_free( split_output_tensors ); + vsi_nn_safe_free( grucell_reshape_output_tensors ); + + return TRUE; +} /* op_setup_default() */ + +static vsi_bool op_setup_optimized + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_gru_ovxlib_param* p = &self->nn_param.gru_ovxlib; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_nn_internal_tensor_t* tmp_tensor = NULL; + vsi_nn_tensor_t** split_output_tensors = NULL; + vsi_nn_tensor_t** grucell_reshape_output_tensors =NULL; + vsi_nn_tensor_t* last_step_h_state = NULL; + vsi_nn_tensor_t* input_tensor = NULL; + vsi_bool use_virtual_tensor = TRUE; + uint32_t 
batch_size = 0; + uint32_t time_step = 0; + uint32_t unit_nums = 0; + uint32_t i = 0; + grucell_activation_input_layout_e grucell_activation_input_layout = GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_CN; + vsi_nn_internal_tensor_t* recurrent_weight_for_nn = NULL; + vsi_nn_internal_tensor_t* input_weight_for_nn = NULL; + uint32_t permute_in_perm[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t reshape_size[VSI_NN_MAX_DIM_NUM] = { 0 }; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_node_wksp( self ); + + if( p->time_major ) + { + batch_size = inputs[GRU_INPUT_INPUT]->attr.size[1]; + time_step = inputs[GRU_INPUT_INPUT]->attr.size[2]; + } + else + { + batch_size = inputs[GRU_INPUT_INPUT]->attr.size[2]; + time_step = inputs[GRU_INPUT_INPUT]->attr.size[1]; + } + + setup_op_shapes( self, inputs, outputs); + + unit_nums = inputs[GRU_INPUT_WEIGHT_H2R]->attr.size[1]; + + /* default to input */ + input_tensor = inputs[GRU_INPUT_INPUT]; + if( !p->time_major ) + { + /* transpose to time_major */ + output_tensor = vsi_nn_rnn_transpose_time_major(self, + inputs[GRU_INPUT_INPUT], NULL, use_virtual_tensor); + input_tensor = output_tensor->t; + } + + /* input FC */ + p->local->weights_input = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRU_INPUT_WEIGHT_I2R], + inputs[GRU_INPUT_WEIGHT_I2Z], inputs[GRU_INPUT_WEIGHT_I2C]); + p->local->weights_input->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->weights_input, VSI_NN_TENSOR_ATTR_CONST); + + p->local->weights_recurrent = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRU_INPUT_WEIGHT_H2R], + inputs[GRU_INPUT_WEIGHT_H2Z], inputs[GRU_INPUT_WEIGHT_H2C]); + p->local->weights_recurrent->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->weights_recurrent, VSI_NN_TENSOR_ATTR_CONST); + + p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2R]->attr, + inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]); + p->local->bias_r->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->bias_r, VSI_NN_TENSOR_ATTR_CONST); + + p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr, + inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]); + p->local->bias_z->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->bias_z, VSI_NN_TENSOR_ATTR_CONST); + + p->local->bias_c = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2C]->attr, + inputs[GRUCELL_INPUT_BIAS_I2C], inputs[GRUCELL_INPUT_BIAS_H2C]); + p->local->bias_c->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->bias_c, VSI_NN_TENSOR_ATTR_CONST); + + /* prepare weight and bias for recurrent fc */ + recurrent_weight_for_nn = vsi_nn_rnn_prepare_weight_for_nn_fc(self, p->local->weights_recurrent, 1, 1); + + /* transpose input from [T,B,D] to [D,T,B] */ + permute_in_perm[0] = 1; + permute_in_perm[1] = 2; + permute_in_perm[2] = 0; + tmp_tensor = vsi_nn_rnn_create_permute(self, input_tensor, NULL, permute_in_perm, 3, use_virtual_tensor); + + reshape_size[0] = tmp_tensor->t->attr.size[0]; + reshape_size[1] = tmp_tensor->t->attr.size[1]; + reshape_size[2] = tmp_tensor->t->attr.size[2]; + reshape_size[3] = 1; /* new batch dim */ + tmp_tensor = vsi_nn_rnn_create_reshape(self, tmp_tensor->t, NULL, reshape_size, 4, use_virtual_tensor); + + input_weight_for_nn = vsi_nn_rnn_prepare_weight_for_nn_fc(self, p->local->weights_input, 1, 1); + + vsi_nn_internal_init_tensor_attr(&attr, &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_INPUT], + use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + 
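+    /* Input-side FC for all gates at once: the I2R/I2Z/I2C weights were
+     * concatenated above, and the whole sequence (transposed to [D,T,B] and
+     * reshaped to 4-D) is fed through a single 1x1 convolution so every time
+     * step shares one fully-connected pass. */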
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + curr->node->nn_param.conv2d.ksize[0] = 1; + curr->node->nn_param.conv2d.ksize[1] = 1; + curr->node->nn_param.conv2d.stride[0] = 1; + curr->node->nn_param.conv2d.stride[1] = 1; + curr->node->nn_param.conv2d.pad[0] = 0; + curr->node->nn_param.conv2d.pad[1] = 0; + curr->node->nn_param.conv2d.pad[2] = 0; + curr->node->nn_param.conv2d.pad[3] = 0; + curr->node->nn_param.conv2d.group = 1; + curr->node->nn_param.conv2d.dilation[0] = 1; + curr->node->nn_param.conv2d.dilation[1] = 1; + curr->node->nn_param.conv2d.weights = input_weight_for_nn->t->attr.size[3]; + + curr->inputs[0] = tmp_tensor->t; + curr->inputs[1] = input_weight_for_nn->t; + curr->inputs[2] = NULL; + curr->outputs[0] = output_tensor->t; + vsi_nn_internal_setup_node(self, curr); + + reshape_size[0] = output_tensor->t->attr.size[0]; + reshape_size[1] = output_tensor->t->attr.size[1]; + reshape_size[2] = output_tensor->t->attr.size[2]; + output_tensor = vsi_nn_rnn_create_reshape(self, output_tensor->t, NULL, reshape_size, 3, use_virtual_tensor); + + permute_in_perm[0] = 0; + permute_in_perm[1] = 2; + permute_in_perm[2] = 1; + tmp_tensor = vsi_nn_rnn_create_permute(self, output_tensor->t, NULL, permute_in_perm, 3, use_virtual_tensor); + + /* split input tensor */ + split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + grucell_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + memset( grucell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + + vsi_nn_rnn_split_input_tensor(self, tmp_tensor->t, split_output_tensors, time_step, use_virtual_tensor); + + vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + + memcpy(&attr, &p->local->bias_r->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + p->local->cond_zeros = vsi_nn_CreateTensorWithDefault(self->graph, &attr, 0.0); + + last_step_h_state = inputs[GRU_INPUT_H_STATE]; + permute_in_perm[0] = 1; + permute_in_perm[1] = 0; + tmp_tensor = vsi_nn_rnn_create_permute(self, last_step_h_state, NULL, permute_in_perm, 2, use_virtual_tensor); + last_step_h_state = tmp_tensor->t; + + for( i = 0; i < time_step; i++ ) + { + vsi_nn_tensor_t* input_fc_output = NULL; + vsi_nn_tensor_t* recurrent_fc_output = NULL; + vsi_nn_tensor_t* grucell_out0 = NULL; + vsi_nn_tensor_t* grucell_out1 = NULL; + vsi_nn_internal_tensor_t* tmp = NULL; + vsi_nn_internal_tensor_t** splited_input_fc_output_tensors = NULL; + vsi_nn_internal_tensor_t** splited_recurrent_fc_output_tensors = NULL; + + /* reshape for split output */ + output_tensor = vsi_nn_rnn_reshape_split_output(self, + split_output_tensors[i], unit_nums * 3, use_virtual_tensor); + input_fc_output = output_tensor->t; + + /* last_step_h_state is not batch first, no need to permute */ + reshape_size[3] = 1; + reshape_size[2] = last_step_h_state->attr.size[1] / (1/*kernel_h*/ * 1/*kernel_w*/); + reshape_size[1] = 1/*kernel_h*/; + reshape_size[0] = last_step_h_state->attr.size[0]; + tmp = vsi_nn_rnn_create_reshape(self, last_step_h_state, NULL, reshape_size, 4, use_virtual_tensor); + + vsi_nn_internal_init_tensor_attr(&attr, + &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_HIDDEN], + use_virtual_tensor); + tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + 
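+        /* Recurrent-side FC: h(t-1) is reshaped to a 1x1 "image" and run
+         * through a 1x1 convolution with the concatenated H2R/H2Z/H2C
+         * weights, then reshaped back to 2-D before the fused activation. */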
curr->node->nn_param.conv2d.ksize[0] = 1; + curr->node->nn_param.conv2d.ksize[1] = 1; + curr->node->nn_param.conv2d.stride[0] = 1; + curr->node->nn_param.conv2d.stride[1] = 1; + curr->node->nn_param.conv2d.pad[0] = 0; + curr->node->nn_param.conv2d.pad[1] = 0; + curr->node->nn_param.conv2d.pad[2] = 0; + curr->node->nn_param.conv2d.pad[3] = 0; + curr->node->nn_param.conv2d.group = 1; + curr->node->nn_param.conv2d.dilation[0] = 1; + curr->node->nn_param.conv2d.dilation[1] = 1; + curr->node->nn_param.conv2d.weights = recurrent_weight_for_nn->t->attr.size[3]; + + curr->inputs[0] = tmp->t; + curr->inputs[1] = recurrent_weight_for_nn->t; + curr->inputs[2] = NULL; + curr->outputs[0] = tmp_tensor->t; + vsi_nn_internal_setup_node(self, curr); + + reshape_size[1] = recurrent_weight_for_nn->t->attr.size[3]; + reshape_size[0] = batch_size; + tmp_tensor = vsi_nn_rnn_create_reshape(self, tmp_tensor->t, NULL, reshape_size, 2, use_virtual_tensor); + recurrent_fc_output = tmp_tensor->t; + + /* grucell output */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + grucell_out0 = output_tensor->t; + + /* grucell output h_state */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[GRU_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + grucell_out1 = output_tensor->t; + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL, 0, 0 ); + curr->inputs[GRUCELL_ACTIVATION_INPUT_H_STATE] = last_step_h_state; + if(0) + { + curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = input_fc_output; + curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = NULL; + curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = NULL; + curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_R] = recurrent_fc_output; + curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_Z] = NULL; + curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_C] = NULL; + } + else + { + splited_input_fc_output_tensors = vsi_nn_create_split(self, + input_fc_output, 1, 3, NULL, use_virtual_tensor); + splited_recurrent_fc_output_tensors = vsi_nn_create_split(self, + recurrent_fc_output, 1, 3, NULL, use_virtual_tensor); + curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = splited_input_fc_output_tensors[0]->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = splited_input_fc_output_tensors[1]->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = splited_input_fc_output_tensors[2]->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_R] = splited_recurrent_fc_output_tensors[0]->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_Z] = splited_recurrent_fc_output_tensors[1]->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_C] = splited_recurrent_fc_output_tensors[2]->t; + } + curr->inputs[GRUCELL_ACTIVATION_INPUT_BIAS_R] = p->local->bias_r; + curr->inputs[GRUCELL_ACTIVATION_INPUT_BIAS_Z] = p->local->bias_z; + curr->inputs[GRUCELL_ACTIVATION_INPUT_BIAS_C] = p->local->bias_c; + curr->inputs[GRUCELL_ACTIVATION_INPUT_COND_R] = p->local->cond_zeros; + curr->inputs[GRUCELL_ACTIVATION_INPUT_COND_Z] = p->local->cond_zeros; + curr->inputs[GRUCELL_ACTIVATION_INPUT_COND_C] = p->local->cond_zeros; + curr->outputs[0] = grucell_out0; + curr->outputs[1] = grucell_out1; + curr->node->nn_param.grucell_activation_internal.input_category = GRUCELL_INPUT_CATEGORY_CUDNN; + curr->node->nn_param.grucell_activation_internal.use_cudnn_implementation = TRUE; + 
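+        /* The fused GRUCELL_ACTIVATION_INTERNAL node consumes the R/Z/C
+         * slices of the input-side and recurrent-side FC results plus the
+         * combined biases and applies the GRU update (see the comment in
+         * vsi_nn_op_grucell_activation_internal.c):
+         *   z(t) = gate_activation(...),  h~(t) = candidate_activation(...),
+         *   h(t) = z(t) * (h(t-1) - h~(t)) + h~(t) */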
curr->node->nn_param.grucell_activation_internal.input_layout = grucell_activation_input_layout; + vsi_nn_internal_setup_node(self, curr); + + last_step_h_state = grucell_out0; + + /* reshape output to 3-dims */ + grucell_reshape_output_tensors[i] = grucell_out0; + } + + /* concat grucell output, the gru's output is 3-dims */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_CONCAT, time_step, 1); + curr->node->nn_param.concat.axis = 1; + for( i = 0; i < time_step; i++ ) + { + curr->inputs[i] = grucell_reshape_output_tensors[i]; + } + curr->outputs[0] = tmp_tensor->t; + vsi_nn_internal_setup_node(self, curr); + + reshape_size[0] = batch_size; + reshape_size[1] = -1; + reshape_size[2] = time_step; + tmp_tensor = vsi_nn_rnn_create_reshape(self, tmp_tensor->t, NULL, reshape_size, 3, use_virtual_tensor); + + if(p->time_major) + { + permute_in_perm[0] = 1; + permute_in_perm[1] = 0; + permute_in_perm[2] = 2; + } + else + { + permute_in_perm[0] = 1; + permute_in_perm[1] = 2; + permute_in_perm[2] = 0; + } + vsi_nn_rnn_create_permute(self, tmp_tensor->t, outputs[GRU_OUTPUT_OUTPUT], permute_in_perm, 3, use_virtual_tensor); + + permute_in_perm[0] = 1; + permute_in_perm[1] = 0; + vsi_nn_rnn_create_permute(self, last_step_h_state, outputs[GRU_OUTPUT_H_STATE], + permute_in_perm, 2, use_virtual_tensor); + + vsi_nn_safe_free( split_output_tensors ); + vsi_nn_safe_free( grucell_reshape_output_tensors ); + + return TRUE; +} /* op_setup_optimized() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if(self->nn_param.gru_ovxlib.use_cudnn_implementation) + { + return op_setup_optimized(self, inputs, outputs); + } + else + { + return op_setup_default(self, inputs, outputs); + } +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_deinit_node_wksp( self ); + + vsi_safe_release_tensor(self->nn_param.gru_ovxlib.local->weights_input); + vsi_safe_release_tensor(self->nn_param.gru_ovxlib.local->weights_recurrent); + vsi_safe_release_tensor(self->nn_param.gru_ovxlib.local->cond_zeros); + + vsi_nn_safe_free(self->nn_param.gru_ovxlib.local); + + return status; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.gru_ovxlib.local = (gru_ovxlib_local_data_t *)malloc(sizeof(gru_ovxlib_local_data_t)); + memset(self->nn_param.gru_ovxlib.local, 0x00, sizeof(gru_ovxlib_local_data_t)); + self->nn_param.gru_ovxlib.time_major = TRUE; + self->nn_param.gru_ovxlib.activation = VSI_NN_ACT_TANH; + self->nn_param.gru_ovxlib.recurrent_activation = VSI_NN_ACT_SIGMOID; + self->nn_param.gru_ovxlib.return_sequences = TRUE; + self->nn_param.gru_ovxlib.linear_before_reset = 0; + self->nn_param.gru_ovxlib.cudnn_implementation_version = 0; + self->nn_param.gru_ovxlib.use_cudnn_implementation = FALSE; + + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GRU_OVXLIB, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ GRU_INPUT_CNT, + /* output_num */ GRU_OUTPUT_CNT + ); +#ifdef __cplusplus +} +#endif diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c new file mode 100644 index 0000000..92a72c2 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c @@ -0,0 +1,203 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_grucell_activation_internal_param* p = &self->nn_param.grucell_activation_internal; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param; + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32(param, "gate_activation", p->gate_activation); + vsi_nn_kernel_param_add_int32(param, "candidate_activation", p->candidate_activation); + vsi_nn_kernel_param_add_int32(param, "input_category", p->input_category); + vsi_nn_kernel_param_add_int32(param, "use_cudnn_implementation", p->use_cudnn_implementation); + vsi_nn_kernel_param_add_int32(param, "input_layout", p->input_layout); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "grucell_activation", + inputs, GRUCELL_ACTIVATION_INPUT_COUNT, + outputs, GRUCELL_ACTIVATION_OUTPUT_COUNT, + param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /** + * input 0, input 1, input2 + * z{t_}, h{t__}, h{t-1} + * z{t} = gate_activation(z{t_}) + * h{t_} = candidate_activation(h{t__}) + * h{t} = z{t} * (h{t-1} - h{t_}) + h{t_} + */ + vsi_nn_grucell_activation_internal_param* p = &self->nn_param.grucell_activation_internal; + + if(VSI_NN_DIM_AUTO == outputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.dim_num) + { + if(p->input_category == 
GRUCELL_INPUT_CATEGORY_DEFAULT)
+        {
+            outputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.dim_num = \
+                inputs[GRUCELL_ACTIVATION_INPUT_ZT_]->attr.dim_num;
+            memcpy( outputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.size,
+                inputs[GRUCELL_ACTIVATION_INPUT_ZT_]->attr.size,
+                inputs[GRUCELL_ACTIVATION_INPUT_ZT_]->attr.dim_num * sizeof( uint32_t ) );
+        }
+        else
+        {
+            outputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.dim_num = \
+                inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R]->attr.dim_num;
+
+            if(GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_NC == p->input_layout)
+            {
+                outputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.size[1] = \
+                    inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R]->attr.size[1];
+                if(inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z])
+                {
+                    outputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.size[0] = \
+                        inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z]->attr.size[0];
+                }
+                else
+                {
+                    /* for batch first, the inputs of Z/R/C are always concatenated in axis-0 */
+                    outputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.size[0] = \
+                        inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R]->attr.size[0] / 3;
+                }
+            }
+            else
+            {
+                outputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.size[0] = \
+                    inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R]->attr.size[0];
+
+                if(inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z])
+                {
+                    outputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.size[1] = \
+                        inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z]->attr.size[1];
+                }
+                /* if `Z` is not provided, the inputs of Z/R/C are concatenated in axis-1 */
+                else
+                {
+                    outputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.size[1] = \
+                        inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R]->attr.size[1] / 3;
+                }
+            }
+        }
+    }
+    if(VSI_NN_DIM_AUTO == outputs[GRUCELL_ACTIVATION_OUTPUT_H_STATE]->attr.dim_num)
+    {
+        /* h_state shares the shape of the main output */
+        outputs[GRUCELL_ACTIVATION_OUTPUT_H_STATE]->attr.dim_num = \
+            outputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.dim_num;
+        memcpy( outputs[GRUCELL_ACTIVATION_OUTPUT_H_STATE]->attr.size,
+            outputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.size,
+            outputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.dim_num * sizeof( uint32_t ) );
+    }
+    return TRUE;
+} /* op_setup() */
+
+static vsi_status op_init
+    (
+    vsi_nn_node_t * self
+    )
+{
+    vsi_status status = VSI_SUCCESS;
+
+    self->nn_param.grucell_activation_internal.local = \
+        (vsi_nn_grucell_activation_internal_local *)malloc(sizeof(vsi_nn_grucell_activation_internal_local));
+    memset(self->nn_param.grucell_activation_internal.local, 0x00,
+        sizeof(vsi_nn_grucell_activation_internal_local));
+    self->nn_param.grucell_activation_internal.gate_activation = VSI_NN_ACT_SIGMOID;
+    self->nn_param.grucell_activation_internal.candidate_activation = VSI_NN_ACT_TANH;
+    self->nn_param.grucell_activation_internal.input_category = GRUCELL_INPUT_CATEGORY_DEFAULT;
+    self->nn_param.grucell_activation_internal.use_cudnn_implementation = FALSE;
+    self->nn_param.grucell_activation_internal.input_layout = GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_NC;
+
+    return status;
+} /* op_init() */
+
+static vsi_status op_deinit
+    (
+    vsi_nn_node_t * self
+    )
+{
+    vsi_status status = VSI_SUCCESS;
+
+    vsi_nn_safe_free(self->nn_param.grucell_activation_internal.local);
+    status = vsi_nn_op_common_deinit(self);
+
+    return status;
+} /* op_deinit() */
+
+__BEGIN_DECLS
+
+/* Registrar */
+DEF_OP_REG
+    (
+    /* op_name */ GRUCELL_ACTIVATION_INTERNAL,
+    /* init */ op_init,
+    /* compute */ op_compute,
+    /* deinit */ op_deinit,
+    /* check */ op_check,
+    /* setup */ op_setup,
+    /* optimize */ NULL,
+    /* input_num */ GRUCELL_ACTIVATION_INPUT_COUNT,
+    /* output_num */ GRUCELL_ACTIVATION_OUTPUT_COUNT
+    );
+
+__END_DECLS
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c
b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c new file mode 100644 index 0000000..8feedf7 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c @@ -0,0 +1,137 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_t n; + vsi_nn_kernel_param_t* param; + + param = vsi_nn_kernel_param_create(); + n = vsi_nn_kernel_selector( self->graph, "grucell_activation_sma", + inputs, GRUCELL_ACTIVATION_SMA_INPUT_COUNT, + outputs, GRUCELL_ACTIVATION_SMA_OUTPUT_COUNT, + param ); + + self->n = (vx_node)n; + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if(VSI_NN_DIM_AUTO == outputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.dim_num) + { + outputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.dim_num = \ + inputs[GRUCELL_ACTIVATION_SMA_INPUT_H_STATE]->attr.dim_num; + memcpy( outputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.size, + inputs[GRUCELL_ACTIVATION_SMA_INPUT_H_STATE]->attr.size, + inputs[GRUCELL_ACTIVATION_SMA_INPUT_H_STATE]->attr.dim_num * sizeof( uint32_t ) ); + } + + if(VSI_NN_DIM_AUTO == outputs[GRUCELL_ACTIVATION_SMA_OUTPUT_H_STATE]->attr.dim_num) + { + outputs[GRUCELL_ACTIVATION_SMA_OUTPUT_H_STATE]->attr.dim_num = \ + inputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.dim_num; + memcpy( outputs[GRUCELL_ACTIVATION_SMA_OUTPUT_H_STATE]->attr.size, + inputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.size, + inputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.dim_num * sizeof( uint32_t ) ); + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + return vsi_nn_op_common_deinit(self); +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GRUCELL_ACTIVATION_INTERNAL_SMA, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ GRUCELL_ACTIVATION_SMA_INPUT_COUNT, + /* output_num */ GRUCELL_ACTIVATION_SMA_OUTPUT_COUNT + ); +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c new file mode 100644 index 0000000..d5797d9 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c @@ -0,0 +1,1271 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
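Illustrative aside, not part of the commit: the GRUCELL_ACTIVATION_INTERNAL_SMA op registered above fuses only the final elementwise blend of a GRU step; the sigmoid/tanh activations are applied beforehand on TP (see op_setup_float_cudnn_v2 in vsi_nn_op_grucell_ovxlib.c below, which feeds it z_t, the candidate h~_t and the previous state). A minimal per-element sketch of that blend, under this assumption and with illustrative names:

```c
/* per-element form of the SMA blend; function and parameter names are illustrative */
static float gru_sma_blend(float z_t, float h_cand, float h_prev)
{
    /* z_t * (h_prev - h_cand) + h_cand  ==  z_t * h_prev + (1 - z_t) * h_cand */
    return z_t * (h_prev - h_cand) + h_cand;
}
```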
+* +*****************************************************************************/ +#include +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "ops/vsi_nn_op_grucell_ovxlib.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_rnn_helper.h" +#include "utils/vsi_nn_tensor_op.h" +#include "utils/vsi_nn_util.h" + +#define USE_GRUCELL_ACTIVATION + +typedef struct _grucell_ovxlib_local_data_t +{ + vsi_bool multi_batch; + vsi_bool force_input_recurrent_on_NN; + vsi_nn_activation_e gate_activation; + vsi_nn_activation_e candidate_activation; + vsi_nn_tensor_t* weights_update; + vsi_nn_tensor_t* weights_reset; + vsi_nn_tensor_t* weights_z_r; + vsi_nn_tensor_t* weights_c; + vsi_nn_tensor_t* weights_input; + vsi_nn_tensor_t* weights_recurrent; + vsi_nn_tensor_t* bias_z; + vsi_nn_tensor_t* bias_r; + vsi_nn_tensor_t* bias_z_r; + vsi_nn_tensor_t* bias_c; +} grucell_ovxlib_local_data_t; + +static vsi_nn_internal_tensor_t* create_multiply + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input1, + vsi_nn_tensor_t * input2, + const vsi_nn_dtype_t* output_dtype, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* tensor1 = NULL; + vsi_nn_internal_node_t* tmp_inode = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); + tensor1 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0 ); + + tmp_inode->inputs[0] = input1; + tmp_inode->inputs[1] = input2; + tmp_inode->node->nn_param.multiply.scale = 1.0f; + tmp_inode->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + tmp_inode->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; + tmp_inode->outputs[0] = tensor1->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + return tensor1; +} + +static vsi_bool setup_op_shapes + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_grucell_ovxlib_param* curr_param = &self->nn_param.grucell_ovxlib; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + uint32_t output_size = 0; + uint32_t batch_size = 0; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + batch_size = inputs[GRUCELL_INPUT_INPUT]->attr.size[1]; + output_size = inputs[GRUCELL_INPUT_WEIGHT_I2R]->attr.size[1]; + if ( output_size != curr_param->num_units ) + { + VSILOGE("The num_units not matched(GRUCELL).\n"); + return FALSE; + } + + /* create h_state input/output if app doesn't provide them */ + if( !inputs[GRUCELL_INPUT_H_STATE] ) + { + attr.dim_num = 2; + attr.size[1] = batch_size; + attr.size[0] = output_size; + memcpy( &attr.dtype, &outputs[GRUCELL_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = FALSE; + attr.is_const = FALSE; + + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + inputs[GRUCELL_INPUT_H_STATE] = output_tensor->t; + } + + if( !outputs[GRUCELL_OUTPUT_H_STATE] ) + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[GRUCELL_OUTPUT_OUTPUT]->attr.dtype, TRUE); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + outputs[GRUCELL_OUTPUT_H_STATE] = output_tensor->t; + } + + /* setup grucell output tensors' shape 
*/ + /* output */ + if(VSI_NN_DIM_AUTO == outputs[GRUCELL_OUTPUT_OUTPUT]->attr.dim_num) + { + /* num_units */ + outputs[GRUCELL_OUTPUT_OUTPUT]->attr.size[0] = output_size; + /* batch_size */ + outputs[GRUCELL_OUTPUT_OUTPUT]->attr.size[1] = inputs[GRUCELL_INPUT_INPUT]->attr.size[1]; + outputs[GRUCELL_OUTPUT_OUTPUT]->attr.dim_num = inputs[GRUCELL_INPUT_INPUT]->attr.dim_num; + } + + /* output_state_out */ + if(VSI_NN_DIM_AUTO == outputs[GRUCELL_OUTPUT_H_STATE]->attr.dim_num) + { + outputs[GRUCELL_OUTPUT_H_STATE]->attr.dim_num = outputs[GRUCELL_OUTPUT_OUTPUT]->attr.dim_num; + memcpy( outputs[GRUCELL_OUTPUT_H_STATE]->attr.size, outputs[GRUCELL_OUTPUT_OUTPUT]->attr.size, + VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + } + + return TRUE; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_bool op_setup_float + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_grucell_ovxlib_param* p = &self->nn_param.grucell_ovxlib; + vsi_nn_dtype_t dtype; + vsi_bool use_virtual_tensor = TRUE; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* tmp_tensor = NULL; + vsi_nn_internal_tensor_t* tensor_rt = NULL; + vsi_nn_internal_tensor_t* input_hstate = NULL; + vsi_nn_internal_tensor_t** splited_tensors = NULL; + + p->local->weights_update = vsi_nn_ConcatTensor(self->graph, 0, + inputs[GRUCELL_INPUT_WEIGHT_I2Z], inputs[GRUCELL_INPUT_WEIGHT_H2Z]); + p->local->weights_reset = vsi_nn_ConcatTensor(self->graph, 0, + inputs[GRUCELL_INPUT_WEIGHT_I2R], inputs[GRUCELL_INPUT_WEIGHT_H2R]); + p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr, + inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]); + p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2R]->attr, + inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]); + p->local->bias_z_r = vsi_nn_ConcatTensor(self->graph, 0, p->local->bias_z, p->local->bias_r); + p->local->weights_z_r = vsi_nn_ConcatTensor(self->graph, 1, p->local->weights_update, p->local->weights_reset); + p->local->weights_c = vsi_nn_ConcatTensor(self->graph, 0, + inputs[GRUCELL_INPUT_WEIGHT_I2C], inputs[GRUCELL_INPUT_WEIGHT_H2C]); + p->local->bias_c = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2C]->attr, + inputs[GRUCELL_INPUT_BIAS_I2C], inputs[GRUCELL_INPUT_BIAS_H2C]); + + vsi_safe_release_tensor(p->local->bias_z); + vsi_safe_release_tensor(p->local->bias_r); + p->local->bias_z_r->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->bias_z_r, VSI_NN_TENSOR_ATTR_CONST); + p->local->weights_z_r->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->weights_z_r, VSI_NN_TENSOR_ATTR_CONST); + p->local->weights_c->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->weights_c, VSI_NN_TENSOR_ATTR_CONST); + p->local->bias_c->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->bias_c, VSI_NN_TENSOR_ATTR_CONST); + + input_hstate = 
vsi_nn_rnn_create_concat(self, 0, + use_virtual_tensor, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_H_STATE]); + + dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + tmp_tensor = vsi_nn_rnn_create_tp_fc(self, input_hstate->t, + p->local->weights_z_r, p->local->bias_z_r, &dtype, use_virtual_tensor); + + splited_tensors = vsi_nn_create_split(self, tmp_tensor->t, 0, 2, NULL, use_virtual_tensor); + + /* reset Gate activations */ + tensor_rt = vsi_nn_rnn_create_activation(self, + splited_tensors[1]->t, + p->local->gate_activation, + &splited_tensors[1]->t->attr.dtype, + use_virtual_tensor); + + /* if linear_before_reset=0: ht=g(input*w_ic + (r.hstate)*w_hc + b_ic + b_hc)*/ + if ( p->linear_before_reset == 0 ) + { + /* r{t} * h{t-1}*/ + tensor_rt = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_MULTIPLY, + tensor_rt->t, inputs[GRUCELL_INPUT_H_STATE], &tensor_rt->t->attr.dtype, use_virtual_tensor); + + /* [x{t}, r{t}] */ + tmp_tensor = vsi_nn_rnn_create_concat(self, 0, use_virtual_tensor, + inputs[GRUCELL_INPUT_INPUT], tensor_rt->t); + + dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + /* W{c} x [x{t}, r{t}] */ + tmp_tensor = vsi_nn_rnn_create_tp_fc(self, tmp_tensor->t, p->local->weights_c, p->local->bias_c, + &dtype, use_virtual_tensor); + } + /* if linear_before_reset!=0: ht=g(input*w_ic + (r.(hstate*w_hc + b_hc)) + b_ic)*/ + else + { + dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + /* r.(hstate*w_hc + b_hc) */ + tmp_tensor = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE], inputs[GRUCELL_INPUT_WEIGHT_H2C], + inputs[GRUCELL_INPUT_BIAS_H2C], &dtype, use_virtual_tensor); + tensor_rt = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_MULTIPLY, + tensor_rt->t, tmp_tensor->t, &tensor_rt->t->attr.dtype, use_virtual_tensor); + /* input*w_ic + b_ic */ + tmp_tensor = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_WEIGHT_I2C], + inputs[GRUCELL_INPUT_BIAS_I2C], &dtype, use_virtual_tensor); + + tmp_tensor = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_ADD, + tensor_rt->t, tmp_tensor->t, &tensor_rt->t->attr.dtype, use_virtual_tensor); + } + +#define USE_GRUCELL_ACTIVATION +#ifdef USE_GRUCELL_ACTIVATION + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL, 0, 0 ); + curr->inputs[0] = splited_tensors[0]->t; + curr->inputs[1] = tmp_tensor->t; + curr->inputs[2] = inputs[GRUCELL_INPUT_H_STATE]; + curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; + curr->outputs[1] = outputs[GRUCELL_OUTPUT_H_STATE]; + curr->node->nn_param.grucell_activation_internal.gate_activation = p->local->gate_activation; + curr->node->nn_param.grucell_activation_internal.candidate_activation = p->local->candidate_activation; + curr->node->nn_param.grucell_activation_internal.use_cudnn_implementation = p->use_cudnn_implementation; + vsi_nn_internal_setup_node(self, curr); +#else + { + vsi_nn_internal_tensor_t* tensor_zt = NULL; + vsi_nn_internal_tensor_t* tensor_ht_ = NULL; + /* z{t} */ + tensor_zt = vsi_nn_rnn_create_activation(self, + splited_tensors[0]->t, + p->local->gate_activation, + &splited_tensors[0]->t->attr.dtype, + use_virtual_tensor); + /* h{t_} */ + tensor_ht_ = vsi_nn_rnn_create_activation(self, + tmp_tensor->t, + p->local->candidate_activation, + &tmp_tensor->t->attr.dtype, + use_virtual_tensor); + /* z{t} * h{t-1} + (1 - z{t}) * h{t_} ==> z{t} * (h{t-1} - h{t_}) + h{t_} */ + tmp_tensor = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_SUBTRACT, + 
inputs[GRUCELL_INPUT_H_STATE], tensor_ht_->t, &tmp_tensor->t->attr.dtype, use_virtual_tensor); + tmp_tensor = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_MULTIPLY, + tensor_zt->t, tmp_tensor->t, &tensor_ht_->t->attr.dtype, use_virtual_tensor); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + curr->inputs[0] = tmp_tensor->t; + curr->inputs[1] = tensor_ht_->t; + curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; + vsi_nn_internal_setup_node(self, curr); + } + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; + curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); +#endif + + return TRUE; +} + +static vsi_bool op_setup_float_cudnn + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_grucell_ovxlib_param* p = &self->nn_param.grucell_ovxlib; + vsi_bool use_virtual_tensor = TRUE; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* input_fc_output = NULL; + vsi_nn_internal_tensor_t* recurrent_fc_output = NULL; + vsi_nn_internal_tensor_t** splited_input_fc_output_tensors = NULL; + vsi_nn_internal_tensor_t** splited_recurrent_fc_output_tensors = NULL; + uint32_t kernel_h = 1, kernel_w = 1; + grucell_activation_input_layout_e grucell_activation_input_layout = GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_NC; + uint32_t reshaped_size[2] = { 0 }; + + p->local->multi_batch = inputs[GRUCELL_INPUT_INPUT]->attr.size[1] > 1; + + p->local->weights_input = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRUCELL_INPUT_WEIGHT_I2R], + inputs[GRUCELL_INPUT_WEIGHT_I2Z], inputs[GRUCELL_INPUT_WEIGHT_I2C]); + p->local->weights_input->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->weights_input, VSI_NN_TENSOR_ATTR_CONST); + + p->local->weights_recurrent = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRUCELL_INPUT_WEIGHT_H2R], + inputs[GRUCELL_INPUT_WEIGHT_H2Z], inputs[GRUCELL_INPUT_WEIGHT_H2C]); + p->local->weights_recurrent->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->weights_recurrent, VSI_NN_TENSOR_ATTR_CONST); + + p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2R]->attr, + inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]); + p->local->bias_r->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->bias_r, VSI_NN_TENSOR_ATTR_CONST); + p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr, + inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]); + p->local->bias_z->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->bias_z, VSI_NN_TENSOR_ATTR_CONST); + p->local->bias_c = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2C]->attr, + inputs[GRUCELL_INPUT_BIAS_I2C], inputs[GRUCELL_INPUT_BIAS_H2C]); + p->local->bias_c->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->bias_c, VSI_NN_TENSOR_ATTR_CONST); + + if(p->local->multi_batch && p->local->force_input_recurrent_on_NN) + { + vsi_nn_internal_tensor_t* input_tensor = NULL; + vsi_nn_internal_tensor_t* tmp = NULL; + + /* + vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, + inputs[GRUCELL_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); + */ + /* reshape and transpose input */ + input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_INPUT], + p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + + tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, p->local->weights_input, + NULL, kernel_h, kernel_w, + 
&p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_INPUT], + use_virtual_tensor); + /* transpose and reshape output */ + reshaped_size[0] = inputs[GRUCELL_INPUT_INPUT]->attr.size[1]; + reshaped_size[1] = p->local->weights_input->attr.size[1]; + input_fc_output = vsi_nn_rnn_create_reshape(self, tmp->t, NULL, + reshaped_size, 2, use_virtual_tensor); + + grucell_activation_input_layout = GRUCELL_ACTIVATION_INPUT_LAYOUT_INPUT_NC_FC_CN; + } + else + { + input_fc_output = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_INPUT], + p->local->weights_input, NULL, + &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_INPUT], use_virtual_tensor); + grucell_activation_input_layout = GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_NC; + } + + if(p->local->multi_batch && p->local->force_input_recurrent_on_NN) + { + vsi_nn_internal_tensor_t* input_tensor = NULL; + vsi_nn_internal_tensor_t* tmp = NULL; + /* + vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, + inputs[GRUCELL_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w); + */ + /* reshape and transpose input */ + input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_H_STATE], + p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + + tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, p->local->weights_recurrent, + NULL, kernel_h, kernel_w, + &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_HIDDEN], use_virtual_tensor); + /* transpose and reshape output */ + reshaped_size[0] = inputs[GRUCELL_INPUT_H_STATE]->attr.size[1]; + reshaped_size[1] = p->local->weights_recurrent->attr.size[1]; + recurrent_fc_output = vsi_nn_rnn_create_reshape(self, tmp->t, NULL, + reshaped_size, 2, use_virtual_tensor); + } + else + { + recurrent_fc_output = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE], + p->local->weights_recurrent, NULL, + &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_HIDDEN], use_virtual_tensor); + } + +#ifdef USE_GRUCELL_ACTIVATION + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL, 0, 0 ); + curr->inputs[GRUCELL_ACTIVATION_INPUT_H_STATE] = inputs[GRUCELL_INPUT_H_STATE]; + + if(p->local->multi_batch) + { + if(GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_NC == grucell_activation_input_layout) + { + curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = input_fc_output->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = NULL; + curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = NULL; + curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_R] = recurrent_fc_output->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_Z] = NULL; + curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_C] = NULL; + } + else + { + splited_input_fc_output_tensors = vsi_nn_create_split(self, + input_fc_output->t, 1, 3, NULL, use_virtual_tensor); + splited_recurrent_fc_output_tensors = vsi_nn_create_split(self, + recurrent_fc_output->t, 1, 3, NULL, use_virtual_tensor); + curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = splited_input_fc_output_tensors[0]->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = splited_input_fc_output_tensors[1]->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = splited_input_fc_output_tensors[2]->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_R] = splited_recurrent_fc_output_tensors[0]->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_Z] = splited_recurrent_fc_output_tensors[1]->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_C] = splited_recurrent_fc_output_tensors[2]->t; + } + } + else + { + splited_input_fc_output_tensors = vsi_nn_create_split(self, + 
input_fc_output->t, 0, 3, NULL, use_virtual_tensor); + splited_recurrent_fc_output_tensors = vsi_nn_create_split(self, + recurrent_fc_output->t, 0, 3, NULL, use_virtual_tensor); + curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = splited_input_fc_output_tensors[0]->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = splited_input_fc_output_tensors[1]->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = splited_input_fc_output_tensors[2]->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_R] = splited_recurrent_fc_output_tensors[0]->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_Z] = splited_recurrent_fc_output_tensors[1]->t; + curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_C] = splited_recurrent_fc_output_tensors[2]->t; + } + curr->inputs[GRUCELL_ACTIVATION_INPUT_BIAS_R] = p->local->bias_r; + curr->inputs[GRUCELL_ACTIVATION_INPUT_BIAS_Z] = p->local->bias_z; + curr->inputs[GRUCELL_ACTIVATION_INPUT_BIAS_C] = p->local->bias_c; + curr->inputs[GRUCELL_ACTIVATION_INPUT_COND_R] = inputs[GRUCELL_INPUT_COND_RESET]; + curr->inputs[GRUCELL_ACTIVATION_INPUT_COND_Z] = inputs[GRUCELL_INPUT_COND_UPDATE]; + curr->inputs[GRUCELL_ACTIVATION_INPUT_COND_C] = inputs[GRUCELL_INPUT_COND_CANDIDATE]; + curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; + curr->outputs[1] = outputs[GRUCELL_OUTPUT_H_STATE]; + curr->node->nn_param.grucell_activation_internal.gate_activation = p->local->gate_activation; + curr->node->nn_param.grucell_activation_internal.candidate_activation = p->local->candidate_activation; + curr->node->nn_param.grucell_activation_internal.input_category = GRUCELL_INPUT_CATEGORY_CUDNN; + curr->node->nn_param.grucell_activation_internal.use_cudnn_implementation = TRUE; + curr->node->nn_param.grucell_activation_internal.input_layout = grucell_activation_input_layout; + vsi_nn_internal_setup_node(self, curr); +#else + { + vsi_nn_internal_tensor_t* tmp_tensor = NULL; + vsi_nn_internal_tensor_t* tensor_r = NULL; + vsi_nn_internal_tensor_t* tensor_u = NULL; + vsi_nn_internal_tensor_t* tensor_c = NULL; + vsi_bool is_cond_available = FALSE; + + if(inputs[GRUCELL_INPUT_COND_RESET] && inputs[GRUCELL_INPUT_COND_UPDATE] + && inputs[GRUCELL_INPUT_COND_CANDIDATE]) + { + is_cond_available = TRUE; + } + p->local->bias_z_r = vsi_nn_ConcatTensor(self->graph, 0, p->local->bias_r, + p->local->bias_z, p->local->bias_c); + + if(is_cond_available) + { + tmp_tensor = vsi_nn_rnn_create_concat(self, 0, use_virtual_tensor, + inputs[GRUCELL_INPUT_COND_RESET], inputs[GRUCELL_INPUT_COND_UPDATE], + inputs[GRUCELL_INPUT_COND_CANDIDATE]); + + input_fc_output = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_ADD, + input_fc_output->t,tmp_tensor->t, + &input_fc_output->t->attr.dtype, use_virtual_tensor); + } + recurrent_fc_output = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_ADD, + recurrent_fc_output->t, p->local->bias_z_r, + &recurrent_fc_output->t->attr.dtype, use_virtual_tensor); + + splited_input_fc_output_tensors = vsi_nn_create_split(self, input_fc_output->t, 0, 3, NULL, TRUE); + splited_recurrent_fc_output_tensors = vsi_nn_create_split(self, recurrent_fc_output->t, 0, 3, NULL, TRUE); + + tensor_r = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_ADD, + splited_input_fc_output_tensors[0]->t, splited_recurrent_fc_output_tensors[0]->t, + &splited_input_fc_output_tensors[0]->t->attr.dtype, use_virtual_tensor); + + tensor_u = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_ADD, + splited_input_fc_output_tensors[1]->t, splited_recurrent_fc_output_tensors[1]->t, + 
&splited_input_fc_output_tensors[1]->t->attr.dtype, use_virtual_tensor); + + /* reset Gate activations */ + tensor_r = vsi_nn_rnn_create_activation(self, + tensor_r->t, + p->local->gate_activation, + &tensor_r->t->attr.dtype, + use_virtual_tensor); + + tensor_u = vsi_nn_rnn_create_activation(self, + tensor_u->t, + p->local->gate_activation, + &tensor_u->t->attr.dtype, + use_virtual_tensor); + + /* r{t} * h{t-1}*/ + tmp_tensor = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_MULTIPLY, + tensor_r->t, splited_recurrent_fc_output_tensors[2]->t, &tensor_r->t->attr.dtype, use_virtual_tensor); + + tmp_tensor = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_ADD, + tmp_tensor->t, splited_input_fc_output_tensors[2]->t, &tmp_tensor->t->attr.dtype, use_virtual_tensor); + + tmp_tensor = vsi_nn_rnn_create_activation(self, + tmp_tensor->t, + p->local->candidate_activation, + &tmp_tensor->t->attr.dtype, + use_virtual_tensor); + tensor_c = tmp_tensor; + + /* z{t} * h{t-1} + (1 - z{t}) * h{t_} ==> z{t} * (h{t-1} - h{t_}) + h{t_} */ + tmp_tensor = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_SUBTRACT, + inputs[GRUCELL_INPUT_H_STATE], tensor_c->t, &tensor_c->t->attr.dtype, use_virtual_tensor); + + tmp_tensor = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_MULTIPLY, + tensor_u->t, tmp_tensor->t, &tmp_tensor->t->attr.dtype, use_virtual_tensor); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + curr->inputs[0] = tmp_tensor->t; + curr->inputs[1] = tensor_c->t; + curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; + vsi_nn_internal_setup_node(self, curr); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; + curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); + } +#endif + + return TRUE; +} + +/* +use TP for sigmoid and tanh, split grucell_activation to 3 parts +*/ +static vsi_bool op_setup_float_cudnn_v2 + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_grucell_ovxlib_param* p = &self->nn_param.grucell_ovxlib; + vsi_nn_dtype_t dtype; + vsi_bool use_virtual_tensor = TRUE; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* input2cand_output = NULL; + vsi_nn_internal_tensor_t* recurrent2cand_output = NULL; + vsi_nn_internal_tensor_t** splited_input_fc_output_tensors = NULL; + vsi_nn_internal_tensor_t* tmp_tensor = NULL; + vsi_nn_internal_tensor_t* tensor_r = NULL; + vsi_nn_internal_tensor_t* concated_input = NULL; + vsi_nn_tensor_attr_t attr; + + /* input to r,z */ + p->local->weights_update = vsi_nn_ConcatTensor(self->graph, 1/* axis */, + inputs[GRUCELL_INPUT_WEIGHT_I2R], inputs[GRUCELL_INPUT_WEIGHT_I2Z]); + /* recurrent to r,z */ + p->local->weights_reset = vsi_nn_ConcatTensor(self->graph, 1/* axis */, + inputs[GRUCELL_INPUT_WEIGHT_H2R], inputs[GRUCELL_INPUT_WEIGHT_H2Z]); + /* [input, recurrent] to r,z */ + p->local->weights_input = vsi_nn_ConcatTensor(self->graph, 0/* axis */, + p->local->weights_update, p->local->weights_reset); + p->local->weights_input->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->weights_input, VSI_NN_TENSOR_ATTR_CONST); + vsi_safe_release_tensor(p->local->weights_update); + vsi_safe_release_tensor(p->local->weights_reset); + + p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr, + inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]); + p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, 
inputs[GRUCELL_INPUT_BIAS_I2R]->attr, + inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]); + p->local->bias_z_r = vsi_nn_ConcatTensor(self->graph, 0/* axis */, + p->local->bias_r, p->local->bias_z); + p->local->bias_z_r->attr.is_const = TRUE; + vsi_nn_SetTensorAttr(p->local->bias_z_r, VSI_NN_TENSOR_ATTR_CONST); + vsi_safe_release_tensor(p->local->bias_z); + vsi_safe_release_tensor(p->local->bias_r); + + concated_input = vsi_nn_rnn_create_concat(self, 0/* axis */, + use_virtual_tensor, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_H_STATE]); + + dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + tmp_tensor = vsi_nn_rnn_create_tp_fc(self, concated_input->t, p->local->weights_input, + p->local->bias_z_r, &dtype, use_virtual_tensor); + + dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + { + uint32_t _slices[] = { inputs[GRUCELL_INPUT_INPUT]->attr.size[0], + inputs[GRUCELL_INPUT_H_STATE]->attr.size[0] }; + splited_input_fc_output_tensors = vsi_nn_create_split(self, concated_input->t, + 0, 2, _slices, use_virtual_tensor); + } + input2cand_output = vsi_nn_rnn_create_tp_fc(self, splited_input_fc_output_tensors[0]->t, + inputs[GRUCELL_INPUT_WEIGHT_I2C], inputs[GRUCELL_INPUT_BIAS_I2C], &dtype, use_virtual_tensor); + + dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + recurrent2cand_output = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE], + inputs[GRUCELL_INPUT_WEIGHT_H2C], inputs[GRUCELL_INPUT_BIAS_H2C], &dtype, use_virtual_tensor); + + tmp_tensor = vsi_nn_rnn_create_activation(self, tmp_tensor->t, p->local->gate_activation, + &tmp_tensor->t->attr.dtype, use_virtual_tensor); + + /* split for combined FC outputs, r_t, z_t */ + splited_input_fc_output_tensors = vsi_nn_create_split(self, tmp_tensor->t, + 0/* axis */, + 2/* dim num */, NULL, use_virtual_tensor); + + memset( &attr, 0x00, sizeof(attr) ); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_A_TIMES_B_PLUS_C, 0, 0 ); + curr->inputs[0] = splited_input_fc_output_tensors[0]->t; + curr->inputs[1] = recurrent2cand_output->t; + curr->inputs[2] = input2cand_output->t; + curr->outputs[0] = tmp_tensor->t; + vsi_nn_internal_setup_node(self, curr); + + tensor_r = vsi_nn_rnn_create_activation(self, tmp_tensor->t, + p->local->candidate_activation, &tmp_tensor->t->attr.dtype, use_virtual_tensor); + +#define USE_GRUCELL_ACTIVATION_SMA +#ifdef USE_GRUCELL_ACTIVATION_SMA + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL_SMA, 0, 0 ); + curr->inputs[GRUCELL_ACTIVATION_SMA_INPUT_H_STATE] = inputs[GRUCELL_INPUT_H_STATE]; + curr->inputs[GRUCELL_ACTIVATION_SMA_INPUT_H_T_] = tensor_r->t; + curr->inputs[GRUCELL_ACTIVATION_SMA_INPUT_Z_T] = splited_input_fc_output_tensors[1]->t; + curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; + curr->outputs[1] = outputs[GRUCELL_OUTPUT_H_STATE]; + curr->node->nn_param.grucell_activation_internal.gate_activation = p->local->gate_activation; + curr->node->nn_param.grucell_activation_internal.candidate_activation = p->local->candidate_activation; + curr->node->nn_param.grucell_activation_internal.use_cudnn_implementation = p->use_cudnn_implementation; + vsi_nn_internal_setup_node(self, curr); +#else + tmp_tensor = 
vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_SUBTRACT, + inputs[GRUCELL_INPUT_H_STATE], + tensor_r->t, &tensor_r->t->attr.dtype, use_virtual_tensor); + tmp_tensor = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_MULTIPLY, + splited_input_fc_output_tensors[1]->t, + tmp_tensor->t, &tmp_tensor->t->attr.dtype, use_virtual_tensor); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + curr->inputs[0] = tmp_tensor->t; + curr->inputs[1] = tensor_r->t; + curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; + vsi_nn_internal_setup_node(self, curr); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; + curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); +#endif + + return TRUE; +} + +static vsi_bool op_setup_default + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_grucell_ovxlib_param* p = &self->nn_param.grucell_ovxlib; + vsi_nn_tensor_attr_t attr; + vsi_bool is_input_fc_on_tp = FALSE; + vsi_bool is_hstate_fc_on_tp = FALSE; + vsi_bool is_input_cand_fc_op_tp = FALSE; + vsi_bool is_hstate_cand_fc_op_tp = FALSE; + vsi_nn_internal_tensor_t* input_tensor = NULL; + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_nn_internal_tensor_t* tmp_tensor = NULL; + vsi_nn_internal_tensor_t* hstate_input_tensor = NULL; + vsi_nn_internal_tensor_t* input_gate_fc_outputs[GRUCELL_RZ_GATE_COUNT] = { NULL }; + vsi_nn_internal_tensor_t* hstate_gate_fc_outputs[GRUCELL_RZ_GATE_COUNT] = { NULL }; + vsi_nn_internal_tensor_t* gate_fc_outputs[GRUCELL_RZ_GATE_COUNT] = { NULL }; + vsi_nn_internal_tensor_t* gate_act_outputs[GRUCELL_RZ_GATE_COUNT] = { NULL }; + vsi_nn_internal_tensor_t* rh_mul_outputs = NULL; + vsi_nn_internal_tensor_t* input_cand_fc_output = NULL; + vsi_nn_internal_tensor_t* rh_cand_fc_output = NULL; + vsi_nn_internal_tensor_t* r_mul_hcand_fc_output = NULL; + vsi_nn_internal_tensor_t* cand_fc_output = NULL; + vsi_nn_internal_tensor_t* cand_act_output = NULL; + vsi_nn_internal_node_t* curr = NULL; + vsi_bool use_virtual_tensor = FALSE; + uint32_t kernel_h = 1; + uint32_t kernel_w = 1; + int32_t i = 0; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + memset( &attr, 0x00, sizeof( attr ) ); + + if( inputs[GRUCELL_INPUT_INPUT]->attr.dtype.qnt_type + != inputs[GRUCELL_INPUT_WEIGHT_I2R]->attr.dtype.qnt_type) + { + /* input and input weights have different qtype, only TP can do this operation */ + is_input_fc_on_tp = TRUE; + } + else if( inputs[GRUCELL_INPUT_INPUT]->attr.size[0] % 64 != 0 ) + { + /* NN performs bad if input's shape is not aligned to 64-byte */ + is_input_fc_on_tp = TRUE; + } + + if( inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.qnt_type + != inputs[GRUCELL_INPUT_WEIGHT_H2R]->attr.dtype.qnt_type) + { + /* recurrent and recurrent weights have different qtype, only TP can do this operation */ + is_hstate_fc_on_tp = TRUE; + } + else if( inputs[GRUCELL_INPUT_H_STATE]->attr.size[0] % 64 != 0 ) + { + /* NN performs bad if inputs' shape is not aligned to 64-byte */ + is_hstate_fc_on_tp = TRUE; + } + + /* if both input fc and recurrent fc could be executed on NN, offloads one to TP*/ + if( !is_input_fc_on_tp && !is_hstate_fc_on_tp ) + { + is_input_fc_on_tp = TRUE; + } + /* TODO: now, all fc on tp because can't fetch the HW feature */ + is_input_fc_on_tp = TRUE; + is_hstate_fc_on_tp = TRUE; + + /* Input FC */ + if( is_input_fc_on_tp ) + { + /* tp */ + for( i = 0; i < GRUCELL_RZ_GATE_COUNT; i++) + { + 
input_gate_fc_outputs[i] = vsi_nn_rnn_create_tp_fc(self, + inputs[GRUCELL_INPUT_INPUT], + inputs[GRUCELL_INPUT_WEIGHT_I2R + i], + inputs[GRUCELL_INPUT_BIAS_I2R + i], + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i], + use_virtual_tensor); + } + } + else + { + /* reshape and transpose input */ + vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, + inputs[GRUCELL_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); + input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_INPUT], + p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + + for( i = 0; i < GRUCELL_RZ_GATE_COUNT; i++) + { + vsi_nn_internal_tensor_t* tmp = vsi_nn_rnn_create_nn_fc(self, + input_tensor->t, + inputs[GRUCELL_INPUT_WEIGHT_I2R + i], + inputs[GRUCELL_INPUT_BIAS_I2R + i], + kernel_h, kernel_w, + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i], + use_virtual_tensor); + /* transpose and reshape output */ + input_gate_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self, + tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + } + } + + /* Hstate FC */ + if( is_hstate_fc_on_tp ) + { + for( i = 0; i < GRUCELL_RZ_GATE_COUNT; i++) + { + hstate_gate_fc_outputs[i] = vsi_nn_rnn_create_tp_fc(self, + inputs[GRUCELL_INPUT_H_STATE], + inputs[GRUCELL_INPUT_WEIGHT_H2R + i], + inputs[GRUCELL_INPUT_BIAS_H2R + i], + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2R + i], + use_virtual_tensor); + } + } + else + { + /* reshape and transpose input */ + vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, + inputs[GRUCELL_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w); + hstate_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, + inputs[GRUCELL_INPUT_H_STATE], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + + for( i = 0; i < GRUCELL_RZ_GATE_COUNT; i++) + { + vsi_nn_internal_tensor_t* tmp = vsi_nn_rnn_create_nn_fc(self, + hstate_input_tensor->t, + inputs[GRUCELL_INPUT_WEIGHT_H2R + i], + inputs[GRUCELL_INPUT_BIAS_H2R + i], + kernel_h, kernel_w, + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2R + i], + use_virtual_tensor); + /* transpose and reshape output */ + hstate_gate_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self, + tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + } + } + + /* Gate Input FC add Hstate FC */ + for ( i = 0; i < GRUCELL_RZ_GATE_COUNT; i++) + { + gate_fc_outputs[i] = vsi_nn_rnn_create_tensor_add(self, + input_gate_fc_outputs[i]->t, + hstate_gate_fc_outputs[i]->t, + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i], + use_virtual_tensor); + } + + /* Gate activations */ + for ( i = 0; i < GRUCELL_RZ_GATE_COUNT; i++) + { + gate_act_outputs[i] = vsi_nn_rnn_create_activation(self, + gate_fc_outputs[i]->t, + p->local->gate_activation, + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i], + use_virtual_tensor); + } + + /* Candidate FC */ + /* if linear_before_reset=0: ht=g(input*w_ic + (r.hstate)*w_hc + b_ic + b_hc)*/ + /* if linear_before_reset!=0: ht=g(input*w_ic + (r.(hstate*w_hc + b_hc)) + b_ic)*/ + if ( p->linear_before_reset == 0 ) + { + rh_mul_outputs = create_multiply(self, + gate_act_outputs[GRUCELL_GATE_R]->t, + inputs[GRUCELL_INPUT_H_STATE], + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2R], + use_virtual_tensor); + } + else + { + rh_mul_outputs = vsi_nn_rnn_create_reshape(self, + inputs[GRUCELL_INPUT_H_STATE], + NULL, + inputs[GRUCELL_INPUT_H_STATE]->attr.size, + inputs[GRUCELL_INPUT_H_STATE]->attr.dim_num, + use_virtual_tensor); + } + + if( inputs[GRUCELL_INPUT_INPUT]->attr.dtype.qnt_type + != 
inputs[GRUCELL_INPUT_WEIGHT_I2C]->attr.dtype.qnt_type) + { + /* input and input weights have different qtype, only TP can do this operation */ + is_input_cand_fc_op_tp = TRUE; + } + else if( inputs[GRUCELL_INPUT_INPUT]->attr.size[0] % 64 != 0 ) + { + /* NN performs bad if input's shape is not aligned to 64-byte */ + is_input_cand_fc_op_tp = TRUE; + } + + if( rh_mul_outputs->t->attr.dtype.qnt_type + != inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr.dtype.qnt_type) + { + /* recurrent and recurrent weights have different qtype, only TP can do this operation */ + is_hstate_cand_fc_op_tp = TRUE; + } + else if( rh_mul_outputs->t->attr.size[0] % 64 != 0 ) + { + /* NN performs bad if inputs' shape is not aligned to 64-byte */ + is_hstate_cand_fc_op_tp = TRUE; + } + /* if both input fc and recurrent fc could be executed on NN, offloads one to TP*/ + if( !is_input_cand_fc_op_tp && !is_hstate_cand_fc_op_tp ) + { + is_input_cand_fc_op_tp = TRUE; + } + /* TODO: now, all fc on tp because can't fetch the HW feature */ + is_input_cand_fc_op_tp = TRUE; + is_hstate_cand_fc_op_tp = TRUE; + + if ( is_input_cand_fc_op_tp ) + { + input_cand_fc_output = vsi_nn_rnn_create_tp_fc(self, + inputs[GRUCELL_INPUT_INPUT], + inputs[GRUCELL_INPUT_WEIGHT_I2C], + inputs[GRUCELL_INPUT_BIAS_I2C], + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C], + use_virtual_tensor); + } + else + { + vsi_nn_internal_tensor_t* tmp = NULL; + /* reshape and transpose input */ + vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, + inputs[GRUCELL_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); + input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_INPUT], + p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + tmp = vsi_nn_rnn_create_nn_fc(self, + input_tensor->t, + inputs[GRUCELL_INPUT_WEIGHT_I2C], + inputs[GRUCELL_INPUT_BIAS_I2C], + kernel_h, kernel_w, + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C], + use_virtual_tensor); + /* transpose and reshape output */ + input_cand_fc_output = vsi_nn_rnn_process_output_for_nn_fc(self, + tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + } + if ( is_hstate_cand_fc_op_tp ) + { + /* if the tp support in:fp16,weight:u8,bias:fp32 batch>1, remove this. 
*/ + if ((rh_mul_outputs->t->attr.dtype.vx_type) != (inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr.dtype.vx_type) + && (p->local->multi_batch)) + { + vsi_nn_tensor_t* wei_r2c_tensor = NULL; + + memcpy(&attr, &(inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr), sizeof(attr)); + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + + wei_r2c_tensor = vsi_nn_ConvertTensorDtype(self->graph, inputs[GRUCELL_INPUT_WEIGHT_H2C], &(attr.dtype)); + rh_cand_fc_output = vsi_nn_rnn_create_tp_fc(self, + rh_mul_outputs->t, + wei_r2c_tensor, + inputs[GRUCELL_INPUT_BIAS_H2C], + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C], + use_virtual_tensor); + } + else + { + rh_cand_fc_output = vsi_nn_rnn_create_tp_fc(self, + rh_mul_outputs->t, + inputs[GRUCELL_INPUT_WEIGHT_H2C], + inputs[GRUCELL_INPUT_BIAS_H2C], + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C], + use_virtual_tensor); + } + } + else + { + vsi_nn_internal_tensor_t* tmp = NULL; + /* reshape and transpose input */ + vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, + rh_mul_outputs->t->attr.size[0], &kernel_h, &kernel_w); + hstate_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, rh_mul_outputs->t, + p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + tmp = vsi_nn_rnn_create_nn_fc(self, + hstate_input_tensor->t, + inputs[GRUCELL_INPUT_WEIGHT_H2C], + inputs[GRUCELL_INPUT_BIAS_H2C], + kernel_h, kernel_w, + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C], + use_virtual_tensor); + /* transpose and reshape output */ + rh_cand_fc_output = vsi_nn_rnn_process_output_for_nn_fc(self, + tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + } + + if ( p->linear_before_reset == 0 ) + { + r_mul_hcand_fc_output = rh_cand_fc_output; + } + else + { + r_mul_hcand_fc_output = create_multiply(self, + gate_act_outputs[GRUCELL_GATE_R]->t, + rh_cand_fc_output->t, + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C], + use_virtual_tensor); + } + /* Candidate input FC add r*h FC */ + cand_fc_output = vsi_nn_rnn_create_tensor_add(self, + input_cand_fc_output->t, + r_mul_hcand_fc_output->t, + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C], + use_virtual_tensor); + + /* Candidate activation */ + cand_act_output = vsi_nn_rnn_create_activation(self, + cand_fc_output->t, + p->local->candidate_activation, + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C], + use_virtual_tensor); + + /* GRU cell output */ + memcpy( &attr.dtype, &gate_act_outputs[GRUCELL_GATE_Z]->t->attr.dtype, sizeof( attr.dtype ) ); + memcpy( &attr.size, &gate_act_outputs[GRUCELL_GATE_Z]->t->attr.size, sizeof( attr.size ) ); + attr.dim_num = gate_act_outputs[GRUCELL_GATE_Z]->t->attr.dim_num; + attr.vtl = use_virtual_tensor; + attr.is_const = TRUE; + input_tensor = vsi_nn_internal_new_tensor(self, &attr, 1.0f); + + memset( &attr, 0x00, sizeof(attr) ); + //memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + /* create internal tensor sub node (1-zt)*c */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SUBTRACT, 0, 0 ); + curr->inputs[0] = input_tensor->t; + curr->inputs[1] = gate_act_outputs[GRUCELL_GATE_Z]->t; + curr->outputs[0] = tmp_tensor->t; + + vsi_nn_internal_setup_node(self, curr); + + /* create internal multiply node (1-zt)*c */ + output_tensor = create_multiply(self, + tmp_tensor->t, + 
cand_act_output->t, + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C], + use_virtual_tensor); + + /* create internal multiply node zt*hstate */ + tmp_tensor = create_multiply(self, + gate_act_outputs[GRUCELL_GATE_Z]->t, + inputs[GRUCELL_INPUT_H_STATE], + &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2Z], + use_virtual_tensor); + + /* create internal tensor add node (1-zt)*c + zt*hstate */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + curr->inputs[0] = output_tensor->t; + curr->inputs[1] = tmp_tensor->t; + curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + + /* copy output to h_state */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; + curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); + + return TRUE; +} /* op_setup() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_grucell_ovxlib_param* p = &self->nn_param.grucell_ovxlib; + vsi_bool is_all_inputs_fp16 = FALSE; + vsi_bool is_all_inputs_u8 = FALSE; + + p->local->multi_batch = (inputs[GRUCELL_INPUT_INPUT]->attr.size[1] > 1); + p->local->gate_activation = p->recurrent_activation; + p->local->candidate_activation = p->activation; + + is_all_inputs_fp16 = inputs[GRUCELL_INPUT_INPUT]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 + && inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 + && inputs[GRUCELL_INPUT_WEIGHT_I2R]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 + && inputs[GRUCELL_INPUT_WEIGHT_I2Z]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 + && inputs[GRUCELL_INPUT_WEIGHT_H2R]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 + && inputs[GRUCELL_INPUT_WEIGHT_H2Z]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 + && inputs[GRUCELL_INPUT_WEIGHT_I2C]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 + && inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16; + + is_all_inputs_u8 = inputs[GRUCELL_INPUT_INPUT]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 + && inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 + && inputs[GRUCELL_INPUT_WEIGHT_I2R]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 + && inputs[GRUCELL_INPUT_WEIGHT_I2Z]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 + && inputs[GRUCELL_INPUT_WEIGHT_H2R]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 + && inputs[GRUCELL_INPUT_WEIGHT_H2Z]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 + && inputs[GRUCELL_INPUT_WEIGHT_I2C]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 + && inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8; + + if(is_all_inputs_u8) + { + vsi_nn_qnt_type_e qnt_type = inputs[GRUCELL_INPUT_WEIGHT_I2R]->attr.dtype.qnt_type; + float scale = inputs[GRUCELL_INPUT_WEIGHT_I2R]->attr.dtype.scale; + int32_t zero_point = inputs[GRUCELL_INPUT_WEIGHT_I2R]->attr.dtype.zero_point; + + is_all_inputs_u8 &= inputs[GRUCELL_INPUT_WEIGHT_I2R]->attr.dtype.qnt_type == qnt_type + && inputs[GRUCELL_INPUT_WEIGHT_I2Z]->attr.dtype.qnt_type == qnt_type + && inputs[GRUCELL_INPUT_WEIGHT_H2R]->attr.dtype.qnt_type == qnt_type + && inputs[GRUCELL_INPUT_WEIGHT_H2Z]->attr.dtype.qnt_type == qnt_type + && inputs[GRUCELL_INPUT_WEIGHT_I2C]->attr.dtype.qnt_type == qnt_type + && inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr.dtype.qnt_type == qnt_type; + + is_all_inputs_u8 &= inputs[GRUCELL_INPUT_WEIGHT_I2R]->attr.dtype.scale == scale + && inputs[GRUCELL_INPUT_WEIGHT_I2Z]->attr.dtype.scale == scale + && 
inputs[GRUCELL_INPUT_WEIGHT_H2R]->attr.dtype.scale == scale + && inputs[GRUCELL_INPUT_WEIGHT_H2Z]->attr.dtype.scale == scale + && inputs[GRUCELL_INPUT_WEIGHT_I2C]->attr.dtype.scale == scale + && inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr.dtype.scale == scale; + + is_all_inputs_u8 &= inputs[GRUCELL_INPUT_WEIGHT_I2R]->attr.dtype.zero_point == zero_point + && inputs[GRUCELL_INPUT_WEIGHT_I2Z]->attr.dtype.zero_point == zero_point + && inputs[GRUCELL_INPUT_WEIGHT_H2R]->attr.dtype.zero_point == zero_point + && inputs[GRUCELL_INPUT_WEIGHT_H2Z]->attr.dtype.zero_point == zero_point + && inputs[GRUCELL_INPUT_WEIGHT_I2C]->attr.dtype.zero_point == zero_point + && inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr.dtype.zero_point == zero_point; + } + + setup_op_shapes(self, inputs, outputs); + + if(is_all_inputs_fp16 || is_all_inputs_u8 ) + { + if(p->use_cudnn_implementation && p->linear_before_reset == 0) + { + switch(p->cudnn_implementation_version) + { + default: + case 0: + case 1: + return op_setup_float_cudnn(self, inputs, outputs); + /* break; */ + case 2: + return op_setup_float_cudnn_v2(self, inputs, outputs); + /* break; */ + case 3: + p->local->force_input_recurrent_on_NN = TRUE; + return op_setup_float_cudnn(self, inputs, outputs); + /* break; */ + } + } + else + { + return op_setup_float(self, inputs, outputs); + } + } + else + { + return op_setup_default(self, inputs, outputs); + } +} + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + int i = 0; + + vsi_nn_internal_init_node_wksp( self ); + + self->nn_param.grucell_ovxlib.local = \ + (grucell_ovxlib_local_data_t*)malloc(sizeof(grucell_ovxlib_local_data_t)); + if(self->nn_param.grucell_ovxlib.local) + { + memset(self->nn_param.grucell_ovxlib.local, 0x00, + sizeof(grucell_ovxlib_local_data_t)); + self->nn_param.grucell_ovxlib.local->candidate_activation = VSI_NN_ACT_TANH; + self->nn_param.grucell_ovxlib.local->gate_activation = VSI_NN_ACT_SIGMOID; + self->nn_param.grucell_ovxlib.local->force_input_recurrent_on_NN = FALSE; + } + else + { + status = VSI_FAILURE; + } + + for(i = 0; i < GRUCELL_QUANTIZE_PARAM_COUNT; i++) + { + memset(&self->nn_param.grucell_ovxlib.internal_dtype[i], 0x00, + sizeof(self->nn_param.grucell_ovxlib.internal_dtype[i])); + self->nn_param.grucell_ovxlib.internal_dtype[i].qnt_type = VSI_NN_QNT_TYPE_NONE; + self->nn_param.grucell_ovxlib.internal_dtype[i].vx_type = VSI_NN_TYPE_FLOAT16; + } + + self->nn_param.grucell_ovxlib.activation = VSI_NN_ACT_TANH; + self->nn_param.grucell_ovxlib.recurrent_activation = VSI_NN_ACT_SIGMOID; + self->nn_param.grucell_ovxlib.use_cudnn_implementation = FALSE; + self->nn_param.grucell_ovxlib.cudnn_implementation_version = 0; + + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_grucell_ovxlib_param* p = &self->nn_param.grucell_ovxlib; + + vsi_safe_release_tensor(p->local->weights_update); + vsi_safe_release_tensor(p->local->weights_reset); + vsi_safe_release_tensor(p->local->weights_z_r); + vsi_safe_release_tensor(p->local->weights_c); + vsi_safe_release_tensor(p->local->weights_input); + vsi_safe_release_tensor(p->local->weights_recurrent); + vsi_safe_release_tensor(p->local->bias_z); + vsi_safe_release_tensor(p->local->bias_r); + vsi_safe_release_tensor(p->local->bias_z_r); + vsi_safe_release_tensor(p->local->bias_c); + vsi_nn_internal_deinit_node_wksp( self ); + vsi_nn_safe_free(self->nn_param.grucell_ovxlib.local); + + return status; +} /* op_deinit() */ + +#ifdef 
__cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GRUCELL_OVXLIB, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ GRUCELL_INPUT_CNT, + /* output_num */ GRUCELL_OUTPUT_CNT + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c b/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c new file mode 100644 index 0000000..a0d0395 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c @@ -0,0 +1,129 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
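Note on the GRUCELL_OVXLIB setup above: the final output is assembled from three elementwise internal nodes, a SUBTRACT against a constant-one tensor, two MULTIPLY nodes, and an ADD, which together realize h_new = (1 - z) * c + z * h_prev (z: update gate activation, c: candidate activation, h_prev: incoming hidden state). A minimal reference sketch of that arithmetic on plain float buffers follows; it is illustrative only and does not use the ovxlib internal-node API.

```c
#include <stddef.h>

/* Sketch of the recombination the SUBTRACT/MULTIPLY/ADD internal nodes
 * perform element-wise: h_new = (1 - z) * c + z * h_prev.
 * z: update gate (sigmoid output), c: candidate (tanh output),
 * h_prev: previous hidden state. Plain float buffers here; the real graph
 * applies the same steps to tensors via vsi_nn_internal_new_node(). */
static void grucell_combine(const float *z, const float *c,
                            const float *h_prev, float *h_new, size_t n)
{
    for (size_t i = 0; i < n; ++i)
    {
        h_new[i] = (1.0f - z[i]) * c[i] + z[i] * h_prev[i];
    }
}
```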
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_nn_hashlut_params_t p; + memset( &p, 0, sizeof(p) ); + p.keys = inputs[1]->t; + p.values = inputs[2]->t; + self->n = vxHashTableLookupLayer( self->graph->g, inputs[0]->t, + &p, sizeof(p), outputs[1]->t, outputs[0]->t); + if( !self->n ) + { + status = VSI_FAILURE; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(HASHTABLE_LOOKUP, 3, 2) + IO_TYPE(D_I32, D_I32, D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I32, D_I32, D_I32, D_U8|Q_ASYM, D_I32) + IO_TYPE(D_I32, D_I32, D_F32, D_U8|Q_ASYM, D_F32) + IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I32, D_F16, D_F16, D_F16) + IO_TYPE(D_I32, D_I32, D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I32, D_I32, D_F16, D_I32) + IO_TYPE(D_I32, D_I32, D_F32, D_F16, D_F32) + IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + END_IO_TYPE_DECL(HASHTABLE_LOOKUP) + if (!VALIDATE_OP_IO_TYPES(HASHTABLE_LOOKUP, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if( outputs[0]->attr.dim_num == VSI_NN_DIM_AUTO ) + { + outputs[0]->attr.dim_num = inputs[2]->attr.dim_num; + memcpy( outputs[0]->attr.size, inputs[2]->attr.size, + sizeof(int) * inputs[2]->attr.dim_num ); + outputs[0]->attr.size[outputs[0]->attr.dim_num - 1] = inputs[0]->attr.size[0]; + } + if( outputs[1]->attr.dim_num == VSI_NN_DIM_AUTO ) + { + outputs[1]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[1]->attr.size[0] = inputs[0]->attr.size[0]; + } + return TRUE; +} /* op_setup() */ + +#ifdef __cpluplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ HASHTABLE_LOOKUP, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 3, + /* output_num */ 2 + ); +#ifdef __cpluplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c new file mode 100644 index 0000000..d588b12 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c @@ -0,0 +1,289 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, 
sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" + +#define _ARG_NUM (1) +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (2) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +extern vx_kernel_description_t * vx_kernel_HEATMAP_MAX_KEYPOINT_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_heatmap_max_keypoint_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.heatmap_max_keypoint); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ + #define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_INT32, type ); + #undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + /*TODO: Add code if need to change your parameter*/ + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute, + NULL +}; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_kernel_info_t kernel_info; + char *path = NULL; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + status = VSI_FAILURE; + kernel_info.type = VX_KERNEL_TYPE_CPU; + kernel_info.kernel = vx_kernel_HEATMAP_MAX_KEYPOINT_list; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_heatmap_max_keypoint"; + path = getenv("USER_VX_SOURCE_PATH"); + if(path) + vsi_nn_VxResourceSetPath(path); + + if( kernel_info.type == VX_KERNEL_TYPE_VX) + { + kernel_info.kernel_index = 1; + kernel_info.init_index = 1; + } + else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ + { + kernel_info.kernel_index = 0; + kernel_info.init_index = 0; + } + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) + { + free(kernel_info.resource_name); + } + if( NULL == self->n ) + { + return VSI_FAILURE; + } + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = 2; + outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[3]; + outputs[1]->attr.dim_num = 3; + outputs[1]->attr.size[0] = 2; + outputs[1]->attr.size[1] = inputs[0]->attr.size[0]; + outputs[1]->attr.size[2] = inputs[0]->attr.size[3]; + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ HEATMAP_MAX_KEYPOINT, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c new file mode 100644 index 0000000..cc8103c --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c @@ -0,0 +1,1158 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
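Note on the heatmap_max_keypoint compute path above: its single attribute is forwarded to the client kernel through the `_SET_PARAM` macro, which wraps each host value in an OpenVX scalar placed after the tensor references. A hedged sketch of one such wrapping step without the macro (the helper name is illustrative):

```c
#include <VX/vx.h>
#include <stdint.h>

/* Illustrative expansion of one _SET_PARAM step: wrap a host-side int32 in
 * a vx_scalar so it can be passed to the client kernel together with the
 * input/output tensor references already stored in params[]. vxCreateScalar
 * copies the value, so a local variable is safe to pass. */
static vx_status pack_int32_arg(vx_context ctx, vx_reference *params,
                                uint32_t index, int32_t value)
{
    params[index] = (vx_reference)vxCreateScalar(ctx, VX_TYPE_INT32, &value);
    return vxGetStatus(params[index]);
}
```

The matching cleanup is what `_release_params()` does: cast each argument reference back to vx_scalar and release it with vxReleaseScalar().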
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (14) +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +extern vx_kernel_description_t * vx_kernel_IMAGEPROCESS_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_imageprocess_param * p; + int32_t i; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.imageprocess); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ +#define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_INT32, crop.enable ); + _SET_PARAM( 1, VX_TYPE_INT32, crop.dim_num ); + for (i = 0; i < p->crop.dim_num; i++) + { + _SET_PARAM( 2 + i, VX_TYPE_INT32, crop.start[i] ); + } + _SET_PARAM( 6, VX_TYPE_BOOL, reverse_channel ); + _SET_PARAM( 7, VX_TYPE_INT32, mean.type ); + _SET_PARAM( 8, VX_TYPE_FLOAT32, mean.scale ); + _SET_PARAM( 9, VX_TYPE_INT32, mean.mean_value_size ); + for (i = 0; i < p->mean.mean_value_size; i++) + { + _SET_PARAM( 10 + i, VX_TYPE_FLOAT32, mean.mean_value[i] ); + } +#undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +struct _scaletotensor_kernel_params +{ + int32_t ratio[2]; + int32_t offset[2]; + float mean[3]; + float scale; +}; + +typedef struct _scaletotensor_kernel_params scaletotensor_kernel_params_t; + +static vsi_status prepare_params_scaletotensor + ( + vsi_nn_imageprocess_param *p, + scaletotensor_kernel_params_t *params, + vsi_nn_tensor_attr_t *attr_in, + vsi_nn_tensor_attr_t *attr_out + ) +{ + int32_t i; + if (p->crop.enable == TRUE) + { + params->offset[0] = p->crop.start[0]; + params->offset[1] = p->crop.start[1]; + } + else + { + params->offset[0] = 0; + params->offset[1] = 0; + } + + if (p->crop.enable == TRUE) + { + params->ratio[0] = (p->crop.length[0] << 15) / attr_out->size[0]; + params->ratio[1] = (p->crop.length[1] << 15) / attr_out->size[1]; + } + else + { + params->ratio[0] = (attr_in->size[0] << 15) / attr_out->size[0]; + params->ratio[1] = (attr_in->size[1] << 15) / attr_out->size[1]; + } + + if (p->mean.type == VSI_NN_IMAGEPROCESS_MEAN_NONE) + { + for (i = 0; i < 3; i++) + { + params->mean[i] = 0; + } + } + else if (p->mean.type == VSI_NN_IMAGEPROCESS_MEAN_CHANNEL) + { + for (i = 0; i < 3; i++) + { + params->mean[i] = p->mean.mean_value[i]; + } + } + else 
if (p->mean.type == VSI_NN_IMAGEPROCESS_MEAN_PIXEL) + { + for (i = 0; i < 3; i++) + { + params->mean[i] = p->mean.mean_value[0]; + } + } + params->scale = p->mean.scale; + return VSI_SUCCESS; +} + +static vsi_status prepare_params_grayscaletotensor + ( + vsi_nn_imageprocess_param *p, + scaletotensor_kernel_params_t *params, + vsi_nn_tensor_attr_t *attr_in, + vsi_nn_tensor_attr_t *attr_out + ) +{ + if (p->crop.enable == TRUE) + { + params->offset[0] = p->crop.start[0]; + params->offset[1] = p->crop.start[1]; + } + else + { + params->offset[0] = 0; + params->offset[1] = 0; + } + + if (p->crop.enable == TRUE) + { + params->ratio[0] = (p->crop.length[0] << 15) / attr_out->size[0]; + params->ratio[1] = (p->crop.length[1] << 15) / attr_out->size[1]; + } + else + { + params->ratio[0] = (attr_in->size[0] << 15) / attr_out->size[0]; + params->ratio[1] = (attr_in->size[1] << 15) / attr_out->size[1]; + } + + if (p->mean.type == VSI_NN_IMAGEPROCESS_MEAN_NONE) + { + params->mean[0] = 0; + } + else + { + params->mean[0] = p->mean.mean_value[0]; + } + params->scale = p->mean.scale; + return VSI_SUCCESS; +} + +static vsi_status _create_params_vx_scaletotensor + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num, + vsi_nn_tensor_attr_t *attr_in, + vsi_nn_tensor_attr_t *attr_out + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_imageprocess_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.imageprocess); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ +#define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + { + scaletotensor_kernel_params_t scaletotensor_kernel_params; + prepare_params_scaletotensor(p, &scaletotensor_kernel_params, attr_in, attr_out); + _SET_PARAM( 0, VX_TYPE_INT32, scaletotensor_kernel_params.ratio[0]); + _SET_PARAM( 1, VX_TYPE_INT32, scaletotensor_kernel_params.ratio[1]); + _SET_PARAM( 2, VX_TYPE_INT32, scaletotensor_kernel_params.offset[0]); + _SET_PARAM( 3, VX_TYPE_INT32, scaletotensor_kernel_params.offset[1]); + _SET_PARAM( 4, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[0]); + _SET_PARAM( 5, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[1]); + _SET_PARAM( 6, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[2]); + _SET_PARAM( 7, VX_TYPE_FLOAT32, scaletotensor_kernel_params.scale); + } +#undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params_vx_scaletotensor */ + +static vsi_status _create_params_vx_grayscaletotensor + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num, + vsi_nn_tensor_attr_t *attr_in, + vsi_nn_tensor_attr_t *attr_out + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_imageprocess_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.imageprocess); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ +#define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + { + scaletotensor_kernel_params_t scaletotensor_kernel_params; + prepare_params_grayscaletotensor(p, &scaletotensor_kernel_params, attr_in, attr_out); + _SET_PARAM( 0, VX_TYPE_INT32, 
scaletotensor_kernel_params.ratio[0]); + _SET_PARAM( 1, VX_TYPE_INT32, scaletotensor_kernel_params.ratio[1]); + _SET_PARAM( 2, VX_TYPE_INT32, scaletotensor_kernel_params.offset[0]); + _SET_PARAM( 3, VX_TYPE_INT32, scaletotensor_kernel_params.offset[1]); + _SET_PARAM( 4, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[0]); + _SET_PARAM( 5, VX_TYPE_FLOAT32, scaletotensor_kernel_params.scale); + } +#undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params_vx_scaletotensor */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status select_kernel_index + ( + vsi_nn_kernel_info_t * kernel_info, + vsi_nn_type_e outDataType, + vx_bool is_copy + ) +{ + if (!is_copy) + { + if (outDataType == VSI_NN_TYPE_FLOAT16) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess"; + kernel_info->kernel_index = 1; + } + else if (outDataType == VSI_NN_TYPE_INT8) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess"; + kernel_info->kernel_index = 2; + } + else if (outDataType == VSI_NN_TYPE_INT16) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_2"; + kernel_info->kernel_index = 3; + } + else if (outDataType == VSI_NN_TYPE_UINT8) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_2"; + kernel_info->kernel_index = 4; + } + else + { + VSILOGE("Unsupported data type(imageprocess).\n"); + return VSI_FAILURE; + } + } + else + { + if (outDataType == VSI_NN_TYPE_FLOAT16) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_3"; + kernel_info->kernel_index = 5; + } + else if (outDataType == VSI_NN_TYPE_INT8) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_3"; + kernel_info->kernel_index = 6; + } + else if (outDataType == VSI_NN_TYPE_INT16) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_3"; + kernel_info->kernel_index = 7; + } + else if (outDataType == VSI_NN_TYPE_UINT8) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_3"; + kernel_info->kernel_index = 8; + } + else + { + VSILOGE("Unsupported data type(imageprocess).\n"); + return VSI_FAILURE; + } + } + + return VSI_SUCCESS; +} + +static vsi_status select_kernel_index_gray + ( + vsi_nn_kernel_info_t * kernel_info, + vsi_nn_type_e outDataType, + vx_bool is_copy + ) +{ + if (!is_copy) + { + if (outDataType == VSI_NN_TYPE_FLOAT16) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_4"; + kernel_info->kernel_index = 9; + } + else if (outDataType == VSI_NN_TYPE_INT8) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_4"; + kernel_info->kernel_index = 10; + } + else if (outDataType == VSI_NN_TYPE_INT16) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_5"; + 
kernel_info->kernel_index = 11; + } + else if (outDataType == VSI_NN_TYPE_UINT8) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_5"; + kernel_info->kernel_index = 12; + } + else + { + VSILOGE("Unsupported data type(imageprocess).\n"); + return VSI_FAILURE; + } + } + else + { + if (outDataType == VSI_NN_TYPE_FLOAT16) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_5"; + kernel_info->kernel_index = 13; + } + else if (outDataType == VSI_NN_TYPE_INT8) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_5"; + kernel_info->kernel_index = 14; + } + else if (outDataType == VSI_NN_TYPE_INT16) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_5"; + kernel_info->kernel_index = 15; + } + else if (outDataType == VSI_NN_TYPE_UINT8) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_imageprocess_5"; + kernel_info->kernel_index = 16; + } + else + { + VSILOGE("Unsupported data type(imageprocess).\n"); + return VSI_FAILURE; + } + } + + return VSI_SUCCESS; +} + +static vsi_status vx_op_pre_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_kernel_info_t * kernel_info + ) +{ + vsi_nn_type_e outDataType = outputs[0]->attr.dtype.vx_type; + vx_bool is_copy = (vx_bool)((inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) + && (inputs[0]->attr.size[1] == outputs[0]->attr.size[1])); + + if (inputs[0]->attr.size[2] == 1) + { + kernel_info->init_index = 2; + return select_kernel_index_gray(kernel_info, outDataType, is_copy); + } + else + { + kernel_info->init_index = 1; + return select_kernel_index(kernel_info, outDataType, is_copy); + } +} + +#define _ARG_NUM_SCALETOTENSOR (8) +#define _PARAM_NUM_SCALETOTENSOR (_ARG_NUM_SCALETOTENSOR + _IO_NUM) + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM_SCALETOTENSOR]; + vx_border_t border; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params_vx_scaletotensor( self, args, _ARG_NUM_SCALETOTENSOR, + &(inputs[0]->attr), &(outputs[0]->attr)); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM_SCALETOTENSOR ); + + border.mode = VX_BORDER_REPLICATE; + border.constant_value.U32 = 0; + status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); + + _release_params( args, _ARG_NUM_SCALETOTENSOR ); + + return status; +} /* vx_op_compute() */ + +#define _ARG_NUM_GRAYSCALETOTENSOR (6) +#define _PARAM_NUM_GRAYSCALETOTENSOR (_ARG_NUM_GRAYSCALETOTENSOR + _IO_NUM) + +static vsi_status vx_gray_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM_GRAYSCALETOTENSOR]; + vx_border_t border; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params_vx_grayscaletotensor( self, args, _ARG_NUM_GRAYSCALETOTENSOR, + &(inputs[0]->attr), &(outputs[0]->attr)); + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM_GRAYSCALETOTENSOR ); + + border.mode = VX_BORDER_REPLICATE; + border.constant_value.U32 = 0; + status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); + + _release_params( args, _ARG_NUM_GRAYSCALETOTENSOR ); + + return status; +} /* vx_gray_op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(IMAGEPROCESS, 1, 1) + IO_TYPE(D_U8, D_F16) + IO_TYPE(D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_I8|Q_DFP) + END_IO_TYPE_DECL(IMAGEPROCESS) + if (!VALIDATE_OP_IO_TYPES(IMAGEPROCESS, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute, + vx_gray_op_compute, + NULL +}; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_kernel_info_t kernel_info; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + status = VSI_FAILURE; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_imageprocess"; + kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); + kernel_info.kernel = vx_kernel_IMAGEPROCESS_list; + + if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) + { + vx_op_pre_compute(self, inputs, outputs, &kernel_info); + } + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) free(kernel_info.resource_name); + if( NULL == self->n ) + { + return VSI_FAILURE; + } + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } + return status; +} /* op_compute() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_imageprocess_param * p; + uint32_t i; + p = (vsi_nn_imageprocess_param *)&(self->nn_param.imageprocess); + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + /* TODO */ + if (inputs[0]->attr.dim_num != 4) + { + VSILOGE("Only support 4D tensor for image process!(IMAGEPROCESS)\n"); + return FALSE; + } + if (p->reverse_channel == TRUE && inputs[0]->attr.size[2] != 3) + { + VSILOGE("Only support 3 channels for reverse channel!(IMAGEPROCESS)\n"); + return FALSE; + } + + if (p->resize.type != VSI_NN_IMAGEPROCESS_RESIZE_NONE) + { + outputs[0]->attr.dim_num = p->resize.dim_num; + for(i = 0; i < (uint32_t)p->resize.dim_num; ++i) + { + outputs[0]->attr.size[i] = p->resize.length[i]; + } + } + else if (p->crop.enable == TRUE) + { + outputs[0]->attr.dim_num = p->crop.dim_num; + for(i = 0; i < (uint32_t)p->crop.dim_num; ++i) + { + outputs[0]->attr.size[i] = p->crop.length[i]; + } + } + else + { + // CWHN -> WHCN + outputs[0]->attr.size[0] = inputs[0]->attr.size[1]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[0]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + } + return TRUE; +} /* op_setup() */ + +typedef struct _vsi_nn_image_data_t +{ + int32_t 
id; + vx_image handle; +}vsi_nn_image_data_t; + +typedef struct _vsi_nn_image_list_t +{ + vsi_nn_link_list_t link_list; + vsi_nn_image_data_t image; +} vsi_nn_image_list_t; + +static void _init_image_list(vsi_nn_link_list_t *node) +{ + vsi_nn_image_list_t *image_list = (vsi_nn_image_list_t *)node; + image_list->link_list.next = NULL; + image_list->link_list.prev = NULL; + memset(&image_list->image, 0, sizeof(vsi_nn_image_data_t)); +} + +static vsi_nn_image_list_t* get_image_by_id +( + vsi_nn_image_list_t* head, + int32_t id +) +{ + vsi_nn_image_list_t *iter; + iter = head; + while(iter) + { + if (iter->image.id == id) + { + return iter; + } + iter = (vsi_nn_image_list_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)iter); + } + iter = (vsi_nn_image_list_t *)vsi_nn_LinkListNewNode( + sizeof(vsi_nn_image_list_t), _init_image_list); + iter->image.id = id; + return iter; +} + +vsi_nn_image_list_t* images_head = NULL; +// pipeline: +// 1.crop +// 2.resize +// 3.(val-mean)*scale +// 4.RGBRGBRGB ---> BBBGGGRRR +// 5.revert channel: BBBGGGRRR ---> RRRGGGBBB +vsi_status vsi_nn_InsertImageprocessSingleNode + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_attr_t *attr, + vsi_nn_imageprocess_param *p, + uint8_t *data, + vsi_nn_tensor_t *tensor_out, + int32_t id + ) +{ + vsi_nn_image_list_t* p_image; + vx_image image_global; + if(images_head == NULL) + { + images_head = (vsi_nn_image_list_t *)vsi_nn_LinkListNewNode( + sizeof(vsi_nn_image_list_t), _init_image_list); + } + p_image = get_image_by_id(images_head, id); + image_global = p_image->image.handle; + if(image_global == NULL) + { + vsi_status status; + vsi_nn_kernel_info_t kernel_info; + vx_node node = NULL; + vx_reference params[_PARAM_NUM_SCALETOTENSOR]; + vx_border_t border; + vx_reference * args; + vx_image image = NULL; + vx_context ctx = vxGetContext( (vx_reference)graph->g ); + vx_imagepatch_addressing_t imgInfo; + vx_bool is_copy = (vx_bool)((attr->size[0] == tensor_out->attr.size[0]) + && (attr->size[1] == tensor_out->attr.size[1])); + vsi_nn_tensor_t *tensor_temp = NULL; + vsi_nn_tensor_t *output_scaletotensor = NULL; + vsi_nn_tensor_t *output_reversetensor = NULL; + vx_nn_tensor_reverse_params_t para; + int32_t reverse1_axis[4] = {2}; + uint32_t perm[] = {2, 0, 1, 3}; + vsi_nn_tensor_t out0; + uint32_t arg_num; + vx_bool is_gray = FALSE; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + memset(&out0, 0, sizeof(vsi_nn_tensor_t)); + para.axis = reverse1_axis; + para.numberOfAxis = 1; + + if (p->platform_type == VSI_NN_PLATFORM_TENSORFLOW) + { + vsi_nn_tensor_attr_t attr0; + memcpy(&attr0, &tensor_out->attr, sizeof(vsi_nn_tensor_attr_t)); + attr0.size[0] = tensor_out->attr.size[1]; + attr0.size[1] = tensor_out->attr.size[2]; + attr0.size[2] = tensor_out->attr.size[0]; + + if (attr0.size[2] == 1) + { + is_gray= TRUE; + p->reverse_channel = FALSE; + } + is_copy = (vx_bool)((attr->size[0] == attr0.size[0]) + && (attr->size[1] == attr0.size[1])); + if (!is_gray) + { + output_scaletotensor = vsi_nn_CreateTensor(graph, &attr0); + if (p->reverse_channel == TRUE) + { + output_reversetensor = vsi_nn_CreateTensor(graph, &attr0); + } + tensor_temp = output_scaletotensor; + } + else + { + out0.t = vxReshapeTensor(tensor_out->t, (int32_t *)attr0.size, attr0.dim_num); + memcpy(&out0.attr, &attr0, sizeof(vsi_nn_tensor_attr_t)); + tensor_temp = &out0; + } + } + else /* VSI_NN_PLATFORM_CAFFE */ + { + if (tensor_out->attr.size[2] == 1) + { + is_gray= TRUE; + p->reverse_channel = FALSE; + } + + if (p->reverse_channel == TRUE) + { + 
output_scaletotensor = vsi_nn_CreateTensor(graph, &(tensor_out->attr)); + tensor_temp = output_scaletotensor; + } + else + { + tensor_temp = tensor_out; + } + } + + args = ¶ms[_IO_NUM]; + + status = VSI_FAILURE; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); + kernel_info.kernel = vx_kernel_IMAGEPROCESS_list; + if (!is_gray) + { + kernel_info.init_index = 1; + status = select_kernel_index(&kernel_info, tensor_out->attr.dtype.vx_type, is_copy); + } + else + { + kernel_info.init_index = 2; + status = select_kernel_index_gray(&kernel_info, tensor_out->attr.dtype.vx_type, is_copy); + } + + node = vsi_nn_RegisterClientKernelAndNewNode( + graph, &kernel_info); + if (kernel_info.resource_name) free(kernel_info.resource_name); + if( NULL == node ) + { + VSILOGE("Create scaletotensor node fails"); + status = VSI_FAILURE; + goto OnError; + } + //imgInfo = {width * num_of_channels, height, 1, width * num_of_channels, VX_SCALE_UNITY, VX_SCALE_UNITY, 1, 1}; + imgInfo.dim_x = attr->size[0] * attr->size[2]; + imgInfo.dim_y = attr->size[1]; + imgInfo.stride_x = 1; + imgInfo.stride_y = imgInfo.dim_x; + imgInfo.scale_x = VX_SCALE_UNITY; + imgInfo.scale_y = VX_SCALE_UNITY; + imgInfo.step_x = 1; + imgInfo.step_y = 1; + +#if defined(__linux__) + image = vxCreateImageFromHandle(ctx, VX_DF_IMAGE_U8, &imgInfo, (void **)&data, VX_MEMORY_TYPE_HOST); +#else + image = vxCreateImage(ctx, imgInfo.dim_x, imgInfo.dim_y, VX_DF_IMAGE_U8); + { + vx_rectangle_t rect = {0, 0, 0, 0}; + vx_map_id map_id = 0; + void* imgBaseAddr = NULL; + + rect.end_x = imgInfo.dim_x; + rect.end_y = imgInfo.dim_y; + vxMapImagePatch(image, &rect, 0,&map_id, &imgInfo, &imgBaseAddr, VX_WRITE_ONLY,VX_MEMORY_TYPE_HOST, 0);// get data pointer of image in GPU side + memcpy((vx_uint8*)imgBaseAddr, data, imgInfo.dim_x * imgInfo.dim_y); + vxUnmapImagePatch(image, map_id); + imgBaseAddr = NULL; + } +#endif + image_global = image; + p_image->image.handle = image; + + /* Set inputs and outputs */ + params[0] = (vx_reference)image; + params[1] = (vx_reference)tensor_temp->t; + + /* Init parameters. 
*/ +#define _SET_PARAM( i, type, arg ) do{ \ + params[i + _IO_NUM] = (vx_reference)vxCreateScalar( ctx, type, &arg ); \ + status = vxGetStatus( params[i + _IO_NUM] ); \ + if( VSI_SUCCESS != status ) { \ + status = VSI_FAILURE;\ + goto OnError;\ + } \ + } while(0) + if (!is_gray) + { + { + scaletotensor_kernel_params_t scaletotensor_kernel_params; + prepare_params_scaletotensor(p, &scaletotensor_kernel_params, attr, &(tensor_temp->attr)); + _SET_PARAM( 0, VX_TYPE_INT32, scaletotensor_kernel_params.ratio[0]); + _SET_PARAM( 1, VX_TYPE_INT32, scaletotensor_kernel_params.ratio[1]); + _SET_PARAM( 2, VX_TYPE_INT32, scaletotensor_kernel_params.offset[0]); + _SET_PARAM( 3, VX_TYPE_INT32, scaletotensor_kernel_params.offset[1]); + if (p->reverse_channel == TRUE) + { + _SET_PARAM( 4, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[0]); + _SET_PARAM( 5, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[1]); + _SET_PARAM( 6, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[2]); + } + else + { + _SET_PARAM( 4, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[2]); + _SET_PARAM( 5, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[1]); + _SET_PARAM( 6, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[0]); + } + _SET_PARAM( 7, VX_TYPE_FLOAT32, scaletotensor_kernel_params.scale); + } + arg_num = _ARG_NUM_SCALETOTENSOR; + } + else + { + { + scaletotensor_kernel_params_t scaletotensor_kernel_params; + prepare_params_scaletotensor(p, &scaletotensor_kernel_params, attr, &(tensor_temp->attr)); + _SET_PARAM( 0, VX_TYPE_INT32, scaletotensor_kernel_params.ratio[0]); + _SET_PARAM( 1, VX_TYPE_INT32, scaletotensor_kernel_params.ratio[1]); + _SET_PARAM( 2, VX_TYPE_INT32, scaletotensor_kernel_params.offset[0]); + _SET_PARAM( 3, VX_TYPE_INT32, scaletotensor_kernel_params.offset[1]); + _SET_PARAM( 4, VX_TYPE_FLOAT32, scaletotensor_kernel_params.mean[0]); + _SET_PARAM( 5, VX_TYPE_FLOAT32, scaletotensor_kernel_params.scale); + } + arg_num = _ARG_NUM_GRAYSCALETOTENSOR; + } +#undef _SET_PARAM + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters(node, params, _IO_NUM + arg_num); + + border.mode = VX_BORDER_REPLICATE; + border.constant_value.U32 = 0; + status |= vxSetNodeAttribute(node, VX_NODE_BORDER, &border, sizeof(border)); + + _release_params( args, arg_num); + + if (p->platform_type == VSI_NN_PLATFORM_TENSORFLOW) + { + if (p->reverse_channel == TRUE) + { + node = vxTensorReverse( graph->g, output_scaletotensor->t, ¶, + sizeof(vx_nn_tensor_reverse_params_t), output_reversetensor->t ); + if( NULL == node ) + { + VSILOGE("Create vxTensorReverse node fails"); + status = VSI_FAILURE; + goto OnError; + } + + node = vxTensorPermuteNode( graph->g, output_reversetensor->t, + tensor_out->t, perm, 4); + if( NULL == node ) + { + VSILOGE("Create vxTensorPermuteNode node fails"); + status = VSI_FAILURE; + goto OnError; + } + } + else + { + if (!is_gray) + { + node = vxTensorPermuteNode( graph->g, output_scaletotensor->t, + tensor_out->t, perm, 4); + if( NULL == node ) + { + VSILOGE("Create vxTensorPermuteNode node fails"); + status = VSI_FAILURE; + goto OnError; + } + } + else + { + if (out0.t) vxReleaseTensor(&out0.t); + } + } + } + else /* VSI_NN_PLATFORM_CAFFE */ + { + if (p->reverse_channel == TRUE) + { + node = vxTensorReverse( graph->g, output_scaletotensor->t, ¶, + sizeof(vx_nn_tensor_reverse_params_t), tensor_out->t ); + if( NULL == node ) + { + VSILOGE("Create vxTensorReverse node fails"); + status = VSI_FAILURE; + goto OnError; + } + } + } + + //set graph inputs outputs again, because pre_process changed graph inputs + { + uint32_t num_of_graph_inputs; + vx_reference *graph_inputs = NULL; + uint32_t num_of_graph_outputs; + vx_reference *graph_outputs = NULL; + uint32_t i = 0; + + /* Explicitly set graph inputs and outputs */ + num_of_graph_inputs = 1; + graph_inputs = (vx_reference *)malloc( num_of_graph_inputs * sizeof( vx_reference ) ); + + graph_inputs[0] = (vx_reference)image; + + num_of_graph_outputs = graph->output.num; + graph_outputs = (vx_reference *)malloc( num_of_graph_outputs * sizeof( vx_reference ) ); + for( i = 0; i < num_of_graph_outputs; i++ ) + { + graph_outputs[i] = (vx_reference)( ( vsi_nn_GetTensor( graph, graph->output.tensors[i] ) )->t ); + } + status = vxIdentifyGraphInputsAndOutputs( graph->g, + num_of_graph_inputs, + graph_inputs, + num_of_graph_outputs, + graph_outputs ); + + if ( NULL != graph_inputs) + { + free( graph_inputs ); + } + if ( NULL != graph_outputs) + { + free( graph_outputs ); + } + } +OnError: + //if(tensor_temp) vsi_nn_ReleaseTensor(&tensor_temp); + if(output_scaletotensor) vsi_nn_ReleaseTensor(&output_scaletotensor); + if(output_reversetensor) vsi_nn_ReleaseTensor(&output_reversetensor); + return status; + } + else + { +#if !defined(__linux__) + vx_imagepatch_addressing_t imgInfo; + vx_rectangle_t rect = {0, 0, 0, 0}; + vx_map_id map_id = 0; + void* imgBaseAddr = NULL; + + //imgInfo = {width * num_of_channels, height, 1, width * num_of_channels, VX_SCALE_UNITY, VX_SCALE_UNITY, 1, 1}; + imgInfo.dim_x = attr->size[0] * attr->size[2]; + imgInfo.dim_y = attr->size[1]; + imgInfo.stride_x = 1; + imgInfo.stride_y = imgInfo.dim_x; + imgInfo.scale_x = VX_SCALE_UNITY; + imgInfo.scale_y = VX_SCALE_UNITY; + imgInfo.step_x = 1; + imgInfo.step_y = 1; + + rect.end_x = imgInfo.dim_x; + rect.end_y = imgInfo.dim_y; + vxMapImagePatch(image_global, &rect, 0,&map_id, &imgInfo, &imgBaseAddr, VX_WRITE_ONLY,VX_MEMORY_TYPE_HOST, 0);// get data pointer of image in GPU side + memcpy((vx_uint8*)imgBaseAddr, data, imgInfo.dim_x * imgInfo.dim_y); + 
vxUnmapImagePatch(image_global, map_id); + imgBaseAddr = NULL; +#endif + return VSI_SUCCESS; + } +} + +vsi_status vsi_nn_op_imageprocess_single_node + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_attr_t *attr, + vsi_nn_imageprocess_param *p, + uint8_t *data, + vsi_nn_tensor_t *tensor_out + ) +{ + return vsi_nn_InsertImageprocessSingleNode( + graph, attr, p, data, tensor_out, 0); +} + +static void _release_image_list(vsi_nn_link_list_t *node) +{ + vsi_nn_image_list_t *image_list = (vsi_nn_image_list_t *)node; + vxReleaseImage(&(image_list->image.handle)); +} + +vsi_status vsi_nn_ReleaseImageprocessSingleNode() +{ + vsi_nn_LinkListDeinit((vsi_nn_link_list_t *)images_head, _release_image_list); + return VSI_SUCCESS; +} + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ IMAGEPROCESS, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c new file mode 100644 index 0000000..46a23ce --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -0,0 +1,183 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
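Note on the imageprocess code above: prepare_params_scaletotensor() encodes the resize step as a Q17.15 fixed-point ratio, (source extent << 15) / destination extent, and uses the crop start as the sampling offset. A small self-contained sketch of that ratio computation; the 640x480 to 224x224 example is hypothetical.

```c
#include <stdint.h>
#include <stdio.h>

/* Sketch of the Q17.15 fixed-point ratio used by the scale-to-tensor
 * kernels: ratio = (source_extent << 15) / destination_extent. A ratio of
 * 1 << 15 (32768) means no scaling; 2 << 15 means a 2x downscale. */
static int32_t resize_ratio_q15(uint32_t src_extent, uint32_t dst_extent)
{
    return (int32_t)((src_extent << 15) / dst_extent);
}

int main(void)
{
    /* Hypothetical example: scale a 640x480 frame down to 224x224. */
    printf("ratio_x=%d ratio_y=%d\n",
           resize_ratio_q15(640, 224), resize_ratio_q15(480, 224));
    return 0;
}
```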
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + float eps = self->nn_param.instancenorm.eps; + uint32_t *input_size = inputs[0]->attr.size; + uint32_t dims_num = inputs[0]->attr.dim_num; + int32_t rs_flg = 0; + + param =vsi_nn_kernel_param_create(); + + if((input_size[1] * input_size[2] < 65536) + && dims_num > 2) + { + rs_flg = 1; + } + + vsi_nn_kernel_param_add_float32( param, "eps", eps ); + vsi_nn_kernel_param_add_int32( param, "reshape_flg", rs_flg ); + n = vsi_nn_kernel_selector( self->graph, "instance_norm", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + if( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if(param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(INSTANCE_NORM, 3, 1) + IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_F32, D_F16, D_I32) + IO_TYPE(D_I32, D_F32, D_F16, D_F32) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) + END_IO_TYPE_DECL(INSTANCE_NORM) + if (!VALIDATE_OP_IO_TYPES(INSTANCE_NORM, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.instancenorm.lcl2_data = + (vsi_nn_instancenorm_lcl_data2 *)malloc(sizeof(vsi_nn_instancenorm_lcl_data2)); + if (NULL == self->nn_param.instancenorm.lcl2_data) + { + return VX_ERROR_NO_MEMORY; + } + + memset( self->nn_param.instancenorm.lcl2_data, 0, sizeof(vsi_nn_instancenorm_lcl_data2) ); + + self->nn_param.instancenorm.lcl2_data->reshapeFlg = 0; + self->nn_param.instancenorm.lcl2_data->execute_on_sw = 0; + self->nn_param.instancenorm.lcl2_data->hash_idx = 0; + + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + uint32_t i; + for (i = 0; i < _VSI_NN_INSTANCENORM_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.instancenorm.local.local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.instancenorm.local.local_tensor[i])); + self->nn_param.instancenorm.local.local_tensor[i] = NULL; + } + } + if(self->nn_param.instancenorm.lcl2_data) + { + 
free(self->nn_param.instancenorm.lcl2_data); + self->nn_param.instancenorm.lcl2_data = NULL; + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ INSTANCE_NORM, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c new file mode 100644 index 0000000..055dbd9 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c @@ -0,0 +1,192 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
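Note on the instancenormalize op above: op_compute hands the kernel an eps value plus a reshape flag chosen when the product of two of the input dimensions stays below 65536, and the kernel computes instance normalization, i.e. each channel slice is normalized by its own mean and variance before the per-channel scale and bias the node takes as its remaining inputs are applied. A reference sketch for one channel slice follows; the buffer layout is illustrative.

```c
#include <math.h>
#include <stddef.h>

/* Reference sketch of instance normalization for a single channel slice of
 * `spatial` elements: y = gamma * (x - mean) / sqrt(var + eps) + beta,
 * with mean/var computed over that slice only. */
static void instance_norm_channel(const float *x, float *y, size_t spatial,
                                  float gamma, float beta, float eps)
{
    float mean = 0.0f, var = 0.0f;
    for (size_t i = 0; i < spatial; ++i) mean += x[i];
    mean /= (float)spatial;
    for (size_t i = 0; i < spatial; ++i)
    {
        float d = x[i] - mean;
        var += d * d;
    }
    var /= (float)spatial;
    for (size_t i = 0; i < spatial; ++i)
    {
        y[i] = gamma * (x[i] - mean) / sqrtf(var + eps) + beta;
    }
}
```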
+* +*****************************************************************************/ +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VX_FAILURE; +#ifdef VX_L2NORM_AXIS_PARAMETER_SUPPORT + vx_nn_l2norm_params_t param; + + param.axis = self->nn_param.l2_normalize.axis; + + self->n = vxL2NormalizeLayer2( + self->graph->g, + inputs[0]->t, + ¶m, + sizeof(vx_nn_l2norm_params_t), + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } +#else + vsi_nn_l2_normalize_param * p; + int32_t axis = -1; + uint32_t i = 0; + uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t innerSize = 1; + uint32_t outerSize = 1; + uint32_t axisSize = 1; + vx_tensor vx_input = NULL; + vx_tensor vx_output = NULL; + vx_tensor input = inputs[0]->t; + vx_tensor output = outputs[0]->t; + + status = VSI_FAILURE; + + p = &(self->nn_param.l2_normalize); + axis = p->axis; + + if (axis != 2) + { + axisSize = inputs[0]->attr.size[axis]; + + for (i = 0; i < (uint32_t)axis; i++) + { + innerSize *= inputs[0]->attr.size[i]; + } + + for (i = (uint32_t)(axis + 1); i < inputs[0]->attr.dim_num; i++) + { + outerSize *= inputs[0]->attr.size[i]; + } + + sizes[0] = innerSize; + sizes[1] = 1; + sizes[2] = axisSize; + sizes[3] = outerSize; + + vx_input = vxReshapeTensor(inputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); + vx_output = vxReshapeTensor(outputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); + + input = vx_input; + output = vx_output; + } + + self->n = vxL2NormalizeLayer( + self->graph->g, + input, + output + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + + if (vx_input) vxReleaseTensor(&vx_input); + if (vx_output) vxReleaseTensor(&vx_output); +#endif + return status; +} /* op_compute() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + if (vsi_nn_compareVersion(self->graph, 1, 1, 15) == -1) + { + self->nn_param.l2_normalize.axis = 2; + } + + return status; +} /* op_init() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(L2_NORMALIZE, 1, 1) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + END_IO_TYPE_DECL(L2_NORMALIZE) + if (!VALIDATE_OP_IO_TYPES(L2_NORMALIZE, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ L2_NORMALIZE, + /* init */ op_init, + /* compute */ 
op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c new file mode 100644 index 0000000..52a54bb --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c @@ -0,0 +1,316 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +#define VSI_NN_L2NORMALIZESCALE_DEFAULT_AXIS 2 + + +static vsi_nn_tensor_t* _expand_scale_tensor + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_t *scale, + int32_t scale_size_in, + int32_t scale_size_out + ) +{ + vsi_status status = VX_SUCCESS; + float* f32_in_buffer = NULL; + float* f32_out_buffer = NULL; + int32_t i = 0; + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* scale_tensor = NULL; + vsi_nn_dtype_t out_dtype; + + f32_out_buffer= (float *)malloc(scale_size_out * sizeof(float)); + memset(f32_out_buffer, 0, scale_size_out * sizeof(float)); + f32_in_buffer = vsi_nn_ConvertTensorToFloat32Data(graph, scale); + if (NULL == f32_in_buffer) + { + scale_tensor = NULL; + goto final; + } + + for (i = 0; i < scale_size_in; i++) + { + f32_out_buffer[i] = f32_in_buffer[i]; + } + for (i = scale_size_in; i < scale_size_out; i++) + { + f32_out_buffer[i] = f32_in_buffer[scale_size_in - 1]; + } + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + attr.size[0] = scale_size_out; + attr.size[1] = 1; + attr.dim_num = 2; + attr.dtype.vx_type = scale->attr.dtype.vx_type; + attr.vtl = FALSE; + scale_tensor = vsi_nn_CreateTensor(graph, &attr); + out_dtype = scale->attr.dtype; + out_dtype.vx_type = VSI_NN_TYPE_FLOAT32; + 
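/* The f32_out_buffer built above holds the original scale values followed by
+ * copies of the last value, padding the scale up to scale_size_out elements.
+ * It is copied into the new constant tensor as raw float32, so the copy dtype
+ * has its quantization disabled below. */
+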
out_dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + status = vsi_nn_CopyRawDataToTensor (graph, + (uint8_t*)f32_out_buffer, &out_dtype, scale_tensor); + if (VSI_SUCCESS != status) + { + scale_tensor = NULL; + goto final; + } + +final: + if (f32_in_buffer) + { + free(f32_in_buffer); + f32_in_buffer = NULL; + } + + if (f32_out_buffer) + { + free(f32_out_buffer); + f32_out_buffer = NULL; + } + + return scale_tensor; +} + + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + int32_t axis = 0; + int32_t new_axis = 0; + uint32_t axis_size = 0; + uint32_t rank_in = 0; + uint32_t rank_out = 0; + uint32_t size = 1; + uint32_t i = 0; + uint32_t scale_size = 1; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_nn_l2normalizescale_param * p = NULL; + vsi_bool ret = FALSE; + vsi_nn_kernel_param_t * param = NULL; + vsi_bool is_expand_scale = vx_false_e; + + p = &(self->nn_param.l2normalizescale); + axis = p->axis; + + param =vsi_nn_kernel_param_create(); + + ret = vsi_nn_kernel_optimize_reduce_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + &axis, 1, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], &rank_in, shapes[2], &rank_out, + &new_axis, &axis_size); + size = inputs[1]->attr.size[0]; + for (i = 1; i < inputs[1]->attr.dim_num; i ++) + { + size *= inputs[1]->attr.size[i]; + } + shapes[1][0] = (int32_t)size; + shapes[1][1] = 1; + shapes[1][2] = 1; + shapes[1][3] = 1; + scale_size = shapes[0][new_axis]; + is_expand_scale = (vx_bool)((size < scale_size) && (TRUE == inputs[1]->attr.is_const)); + vsi_nn_kernel_param_add_int32( param, "axis", new_axis ); + + if( ret ) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shapes[0], rank_in ); + if (is_expand_scale) + { + reshape_tensors[1] = _expand_scale_tensor(self->graph, inputs[1], size, scale_size); + } + else + { + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], (uint32_t*)shapes[1], 2 ); + } + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shapes[0], rank_in ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "l2normalizescale", + &reshape_tensors[0], _INPUT_NUM, + &reshape_tensors[2], _OUTPUT_NUM, param ); + + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + vsi_nn_ReleaseTensor( &reshape_tensors[2] ); + } + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(L2NORMALIZESCALE, _INPUT_NUM, _OUTPUT_NUM) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_U8|Q_ASYM) + END_IO_TYPE_DECL(L2NORMALIZESCALE) + if(!VALIDATE_OP_IO_TYPES(L2NORMALIZESCALE, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + 
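/* The IO_TYPE table above enumerates every supported (input, scale, output)
+ * dtype combination; reaching this point means the node's tensors matched
+ * one of those entries. */
+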
return TRUE; +} /* op_check() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + uint32_t i; + for (i = 0; i < _VSI_NN_L2NORMALIZESCALE_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.l2normalizescale.local.local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.l2normalizescale.local.local_tensor[i])); + self->nn_param.l2normalizescale.local.local_tensor[i] = NULL; + } + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + if( NULL == self ) + { + return FALSE; + } + + if (self->nn_param.l2normalizescale.axis < 0) + { + self->nn_param.l2normalizescale.axis += (int32_t)inputs[0]->attr.dim_num; + } + + if (self->nn_param.l2normalizescale.axis < 0) + { + VSILOGD("l2normalizescale Invalid Axis: %d", self->nn_param.l2normalizescale.axis); + return FALSE; + } + + ret = vsi_nn_op_common_setup(self, inputs, outputs); + + return ret; +} + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + uint32_t i; + + if (vsi_nn_compareVersion(self->graph, 1, 1, 13) == -1) + { + self->nn_param.l2normalizescale.axis = VSI_NN_L2NORMALIZESCALE_DEFAULT_AXIS; + } + for (i = 0; i < _VSI_NN_L2NORMALIZESCALE_LOCAL_TENSOR_NUM; i++) + { + self->nn_param.l2normalizescale.local.local_tensor[i] = NULL; + } + + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ L2NORMALIZESCALE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 2, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c new file mode 100644 index 0000000..87f2b54 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -0,0 +1,447 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (1) +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +extern vx_kernel_description_t * vx_kernel_LAYERNORM_list[]; + +static void check_tensor_shape + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vx_reference * params, + uint32_t index, + vx_bool rsFlg + ) +{ + vsi_nn_tensor_attr_t attr; + + if (index == 0 ) + { + if(input->attr.dim_num == 1) + { + memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + self->nn_param.layernorm.local.local_tensor[index] = + vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; + } + else if ((input->attr.dim_num == 3 && input->attr.size[2] == 1) + ||(input->attr.dim_num == 4 && input->attr.size[2] == 1 && input->attr.size[3] == 1)) + { + memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.dim_num = 2; + self->nn_param.layernorm.local.local_tensor[index] = + vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; + } + else + params[index] = (vx_reference)input->t; + } + else if(index == 1 ) + { + if(input->attr.dim_num == 1) + { + memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + self->nn_param.layernorm.local.local_tensor[index] = + vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; + } + else + params[index] = (vx_reference)input->t; + + } + else if(index == 2) + { + if(input->attr.dim_num == 1) + { + memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + self->nn_param.layernorm.local.local_tensor[index] = + vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; + } + else + params[index] = (vx_reference)input->t; + } + else if(index == 3) + { + if(input->attr.dim_num == 1) + { + memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + self->nn_param.layernorm.local.local_tensor[index] = + vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; + } + else if ((input->attr.dim_num == 3 && input->attr.size[2] == 1) + ||(input->attr.dim_num == 4 && input->attr.size[2] == 1 && input->attr.size[3] == 1)) + { + memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.dim_num = 2; + self->nn_param.layernorm.local.local_tensor[index] = + vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.layernorm.local.local_tensor[index]; + } + else + params[index] = (vx_reference)input->t; + } + else + { + VSILOGE("No more local tensor!(LAYERNORM) at [%s : %d]\n", 
__FILE__, __LINE__); + } +} + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_layernormalize_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.layernorm); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ +#define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_FLOAT32, eps ); +#undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status vx_op_pre_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_kernel_info_t * kernel_info + ) +{ + vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type; + vsi_nn_type_e outputDataFormat = outputs[0]->attr.dtype.vx_type; + vsi_nn_type_e scaleDataFormat = inputs[2]->attr.dtype.vx_type; + if (inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_FLOAT16 + && scaleDataFormat == VSI_NN_TYPE_FLOAT16) + { + kernel_info->kernel_index = 1; + } + else if (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_UINT8 + && scaleDataFormat == VSI_NN_TYPE_FLOAT16) + { + kernel_info->kernel_index = 2; + } + else if (inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_UINT8 + && scaleDataFormat == VSI_NN_TYPE_FLOAT16) + { + kernel_info->kernel_index = 3; + } + else if (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_FLOAT16 + && scaleDataFormat == VSI_NN_TYPE_FLOAT16) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_layernormalize_U8"; + kernel_info->kernel_index = 4; + } + else + { + VSILOGE("Not support input or output data format!(LAYERNORM) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + return VSI_SUCCESS; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_border_t border; + vx_reference * args; + vx_bool rsFlg = FALSE; + int32_t in_zp; + vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type; + vsi_nn_tensor_attr_t attr; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + //_set_inputs_outputs( params, inputs, outputs ); + check_tensor_shape(self, inputs[0], params, 0, rsFlg); + check_tensor_shape(self, inputs[1], params, 1, rsFlg); + check_tensor_shape(self, inputs[2], params, 2, rsFlg); + check_tensor_shape(self, outputs[0], params, 3, rsFlg); + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + status = vsi_nn_vxGetTensorAttr(inputs[0]->t, &attr); + in_zp = attr.dtype.zero_point; + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = 0; + border.constant_value.S16 = 0; + border.constant_value.U8 = 0; + if(inputDataFormat == VSI_NN_TYPE_UINT8) + { + border.constant_value.U32 = (vx_uint32)in_zp; + border.constant_value.S16 = (vx_int16)in_zp; + border.constant_value.U8 = (vx_uint8)in_zp; + } + status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute, + NULL +}; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_kernel_info_t kernel_info; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + status = VSI_FAILURE; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_layernormalize"; + kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); + kernel_info.kernel = vx_kernel_LAYERNORM_list; + kernel_info.init_index = 1; + + if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) + { + vx_op_pre_compute(self, inputs, outputs, &kernel_info); + } + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) free(kernel_info.resource_name); + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(LAYER_NORM, 3, 1) + IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) + END_IO_TYPE_DECL(LAYER_NORM) + if (!VALIDATE_OP_IO_TYPES(LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + uint32_t i; + for (i = 0; i < _VSI_NN_LAYERNORM_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.layernorm.local.local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.layernorm.local.local_tensor[i])); + self->nn_param.layernorm.local.local_tensor[i] = NULL; + } + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LAYER_NORM, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 3, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_leaky_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_leaky_relu.c new file mode 100644 index 0000000..5ac26a6 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_leaky_relu.c @@ -0,0 +1,117 @@ +/**************************************************************************** +* 
+* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + status = VSI_FAILURE; + + self->n = vxLeakyReluLayer( + self->graph->g, + inputs[0]->t, + self->nn_param.activation.leaky_ratio, + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(LEAKY_RELU, 1, 1) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + END_IO_TYPE_DECL(LEAKY_RELU) + if (!VALIDATE_OP_IO_TYPES(LEAKY_RELU, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LEAKY_RELU, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c b/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c new file mode 100644 index 0000000..50bdef0 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c @@ -0,0 +1,124 @@ 
+/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + status = VSI_FAILURE; + + self->n = vxActivationLayer( + self->graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LINEAR, + self->nn_param.linear.a, + self->nn_param.linear.b, + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(LINEAR, 1, 1) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + END_IO_TYPE_DECL(LINEAR) + if (!VALIDATE_OP_IO_TYPES(LINEAR, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LINEAR, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c new file mode 100644 index 0000000..f3f6e38 --- 
/dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c @@ -0,0 +1,214 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status _log_softmax_op_compute + ( + const char * kernel_name, + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; + int32_t axis = 0; + int32_t new_axis = 0; + float betaValue = 0; + + vsi_bool ret; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_log_softmax_param * p = NULL; + + if( NULL == self ) + { + return VSI_FAILURE; + } + status = VSI_FAILURE; + + p = &(self->nn_param.log_softmax); + axis = p->axis; + betaValue = p->betaValue; + + // TODO: This optimzie is a hack for gpu path, + // it should be moved to gpu kernel setup. 
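+ // vsi_nn_kernel_optimize_softmax_shape computes a merged shape
+ // (shapes[0]/rank_in) and a remapped axis (new_axis) for the GPU path; the
+ // same merged shape is applied to both the input and the output tensors
+ // below, and new_axis is passed to the selected kernel as "axis".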
+ ret = vsi_nn_kernel_optimize_softmax_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rank_in, &new_axis); + + if( ret ) + { + // Add params + param =vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "axis", new_axis ); + vsi_nn_kernel_param_add_float32( param, "beta", betaValue ); + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shapes[0], rank_in ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, + &reshape_tensors[0], 1, + &reshape_tensors[1], 1, param ); + + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + + vsi_nn_kernel_param_release( ¶m ); + } + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* _log_softmax_op_compute() */ + +static vsi_bool _log_softmax_op_setup + ( + const char * kernel_name, + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. */ + if( NULL == self ) + { + return FALSE; + } + + if (self->nn_param.log_softmax.axis < 0) + self->nn_param.log_softmax.axis += (int32_t)inputs[0]->attr.dim_num; + + if (self->nn_param.log_softmax.axis < 0) + { + VSILOGD("LogSoftMax Invalid Axis: %d", self->nn_param.log_softmax.axis); + return FALSE; + } + + vsi_nn_op_common_setup(self, inputs, outputs); + + return TRUE; +} /* _log_softmax_op_setup() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(LOG_SOFTMAX, 1, 1) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_BF16, D_F16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(LOG_SOFTMAX) + if(!VALIDATE_OP_IO_TYPES(LOG_SOFTMAX, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define DEF_LOG_SOFTMAX_OP(name, kernel_name) \ + static vsi_status op_compute_##kernel_name \ + ( \ + vsi_nn_node_t * self, \ + vsi_nn_tensor_t ** inputs, \ + vsi_nn_tensor_t ** outputs \ + ) \ + { \ + return _log_softmax_op_compute( ""#kernel_name, self, inputs, outputs ); \ + } \ + static vsi_bool op_setup_##kernel_name \ + ( \ + vsi_nn_node_t * self, \ + vsi_nn_tensor_t ** inputs, \ + vsi_nn_tensor_t ** outputs \ + ) \ + { \ + return _log_softmax_op_setup( ""#kernel_name, self, inputs, outputs ); \ + } \ +DEF_OP_REG \ + ( \ + /* op_name */ name, \ + /* init */ NULL, \ + /* compute */ op_compute_##kernel_name, \ + /* deinit */ vsi_nn_op_common_deinit, \ + /* check */ op_check, \ + /* setup */ op_setup_##kernel_name, \ + /* optimize */ NULL, \ + /* input_num */ 1, \ + /* output_num */ 1 \ + ) +/* DEF_OP_REG(name, op_init_##kernel_name, op_compute_##kernel_name, \ + NULL, NULL, op_setup_##kernel_name, NULL, 1, 1)*/ + +DEF_LOG_SOFTMAX_OP( LOG_SOFTMAX, log_softmax ); + + +#undef 
DEF_LOG_SOFTMAX_OP + +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c new file mode 100644 index 0000000..aec87a9 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c @@ -0,0 +1,169 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_rank = 0; + vsi_bool ret; + + if( NULL == self ) + { + return status; + } + + // TODO: This optimzie is a hack for gpu path, + // it should be moved to gpu kernel setup. 
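+ // vsi_nn_kernel_optimize_element_shape flattens the element-wise shape into
+ // shape/new_rank; both the input and the output are reshaped to that same
+ // shape below before the "logical_not" kernel is selected.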
+ ret = vsi_nn_kernel_optimize_element_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); + if( ret ) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shape, new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shape, new_rank ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "logical_not", + &reshape_tensors[0], _INPUT_NUM, + &reshape_tensors[1], _OUTPUT_NUM, NULL ); + + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + } + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(LOGICAL_NOT, 1, 1) + IO_TYPE(D_I8, D_I8) + IO_TYPE(D_BOOL8, D_BOOL8) + END_IO_TYPE_DECL(LOGICAL_NOT) + if(!VALIDATE_OP_IO_TYPES(LOGICAL_NOT, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i, out_rank; + uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_bool ret = TRUE; + + out_rank = inputs[0]->attr.dim_num; + + for(i = 0; i < out_rank; i++) + { + shape[i] = inputs[0]->attr.size[i]; + } + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = out_rank; + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(uint32_t) ); + } + else + { + uint32_t total_size_got; + uint32_t total_size_expected; + total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); + total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, + outputs[0]->attr.dim_num ); + if( total_size_expected != total_size_got ) + { + VSILOGW("Output size mismatch, expect %d, but got %d", + total_size_expected, total_size_got); + ret = FALSE; + } + } + + return ret; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LOGICAL_NOT, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c new file mode 100644 index 0000000..4ea6537 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c @@ -0,0 +1,193 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + uint32_t new_rank = 0; + vsi_bool ret; + + if( NULL == self ) + { + return VSI_FAILURE; + } + + ret = vsi_nn_kernel_optimize_eltwise_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + (int32_t *)inputs[1]->attr.size, inputs[1]->attr.dim_num, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if( ret ) + { + param =vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "ops_type", self->nn_param.logical_ops.op ); + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], (uint32_t*)shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shapes[2], new_rank ); + + if (shapes[1][3] > shapes[0][3] && new_rank == 4) + { + vsi_nn_tensor_t* reshape_tmp; + reshape_tmp = reshape_tensors[0]; + reshape_tensors[0] = reshape_tensors[1]; + reshape_tensors[1] = reshape_tmp; + } + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "logical_ops", + &reshape_tensors[0], _INPUT_NUM, + &reshape_tensors[2], _OUTPUT_NUM, param ); + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + vsi_nn_ReleaseTensor( &reshape_tensors[2] ); + + vsi_nn_kernel_param_release( ¶m ); + } + + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(LOGICAL_OPS, 2, 1) + IO_TYPE(D_I8, D_I8, D_I8) + IO_TYPE(D_BOOL8, D_BOOL8, D_BOOL8) + END_IO_TYPE_DECL(LOGICAL_OPS) + if(!VALIDATE_OP_IO_TYPES(LOGICAL_OPS, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i, out_rank, in1_rank, 
in2_rank; + uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_bool ret = TRUE; + + in1_rank = inputs[0]->attr.dim_num; + in2_rank = inputs[1]->attr.dim_num; + out_rank = vsi_nn_max( in1_rank, in2_rank ); + + for(i = 0; i < out_rank; i++) + { + uint32_t sz0, sz1; + sz0 = i < in1_rank ? inputs[0]->attr.size[i] : 1; + sz1 = i < in2_rank ? inputs[1]->attr.size[i] : 1; + shape[i] = vsi_nn_max( sz0, sz1 ); + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = out_rank; + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(uint32_t) ); + } + else + { + uint32_t total_size_got; + uint32_t total_size_expected; + total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); + total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, + outputs[0]->attr.dim_num ); + if( total_size_expected != total_size_got ) + { + VSILOGW("Output size mismatch, expect %d, but got %d", + total_size_expected, total_size_got); + ret = FALSE; + } + } + + return ret; +} /* op_setup() */ + + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LOGICAL_OPS, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lrn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lrn.c new file mode 100644 index 0000000..120dd7f --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lrn.c @@ -0,0 +1,116 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + status = VSI_FAILURE; + + self->n = vxNormalizationLayer( + self->graph->g, + inputs[0]->t, + self->nn_param.lrn.type, + self->nn_param.lrn.size, + self->nn_param.lrn.alpha, + self->nn_param.lrn.beta, + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(LRN, 1, 1) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + END_IO_TYPE_DECL(LRN) + if (!VALIDATE_OP_IO_TYPES(LRN, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LRN, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lrn2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lrn2.c new file mode 100644 index 0000000..9c4c9e8 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lrn2.c @@ -0,0 +1,208 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +#define MAX_BATCH_COUNT 1024 + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + int32_t axis = -1; + uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t innerSize = 1; + uint32_t outerSize = 1; + uint32_t axisSize = 1; + vx_tensor vx_input = NULL; + vx_tensor vx_output = NULL; + vx_tensor input = inputs[0]->t; + vx_tensor output = outputs[0]->t; + uint32_t i = 0; + +#ifdef VX_NORMALIZATION_AXIS_PARAMETER_SUPPORT + vx_nn_normalization_params_ext_t param; + + memset(¶m, 0, sizeof(vx_nn_normalization_params_ext_t)); + axis = self->nn_param.lrn.axis; + param.base.alpha = self->nn_param.lrn.alpha; + param.base.beta = self->nn_param.lrn.beta; + param.base.bias = self->nn_param.lrn.bias; + param.base.norm_size = self->nn_param.lrn.size; + param.base.type = self->nn_param.lrn.type; + param.axis = axis; + + if (param.base.type == VX_NN_NORMALIZATION_ACROSS_MAPS && axis != 2) + { + axisSize = inputs[0]->attr.size[axis]; + + for (i = 0; i < (uint32_t)axis; i++) + { + innerSize *= inputs[0]->attr.size[i]; + } + + for (i = (uint32_t)(axis + 1); i < inputs[0]->attr.dim_num; i++) + { + outerSize *= inputs[0]->attr.size[i]; + } + + sizes[0] = innerSize; + sizes[1] = 1; + sizes[2] = axisSize; + sizes[3] = outerSize; + + if(outerSize < MAX_BATCH_COUNT) + { + vx_input = vxReshapeTensor(inputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); + vx_output = vxReshapeTensor(outputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); + + input = vx_input; + output = vx_output; + + param.axis = 2; + } + } + + self->n = vxNormalizationLayer2( self->graph->g, + input, + (vx_nn_normalization_params_t*)¶m, + sizeof(vx_nn_normalization_params_ext_t), + output); +#else + vx_nn_normalization_params_t param; + + memset(¶m, 0, sizeof(vx_nn_normalization_params_t)); + axis = self->nn_param.lrn.axis; + param.alpha = self->nn_param.lrn.alpha; + param.beta = self->nn_param.lrn.beta; + param.bias = self->nn_param.lrn.bias; + param.norm_size = self->nn_param.lrn.size; + param.type = self->nn_param.lrn.type; + + if (param.type == VX_NN_NORMALIZATION_ACROSS_MAPS && axis != 2) + { + axisSize = inputs[0]->attr.size[axis]; + + for (i = 0; i < (uint32_t)axis; i++) + { + innerSize *= inputs[0]->attr.size[i]; + } + + for (i = (uint32_t)(axis + 1); i < inputs[0]->attr.dim_num; i++) + { + outerSize *= inputs[0]->attr.size[i]; + } + + sizes[0] = innerSize; + sizes[1] = 1; + sizes[2] = axisSize; + sizes[3] = outerSize; + + vx_input = vxReshapeTensor(inputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); + vx_output = vxReshapeTensor(outputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); + + input = vx_input; + output = vx_output; + } + + self->n = vxNormalizationLayer2( self->graph->g, + input, + ¶m, + sizeof(vx_nn_normalization_params_t), + output); +#endif + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + + if (vx_input) vxReleaseTensor(&vx_input); + if (vx_output) vxReleaseTensor(&vx_output); + + return status; +} /* op_compute() */ + +static vsi_status op_init + ( + 
vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + if (vsi_nn_compareVersion(self->graph, 1, 1, 15) == -1) + { + self->nn_param.lrn.axis = 2; + } + + return status; +} /* op_init() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + //TODO: Check tensor shapes. + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_LRN, self, inputs, outputs); + + return ret; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LRN2, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c new file mode 100644 index 0000000..96086dc --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c @@ -0,0 +1,153 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_dtype_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_tensor_t * type_tensor = NULL; + vx_nn_lshproj_params_t p; + vx_bool valued = TRUE; + vsi_nn_tensor_t * weight_tensor = NULL; + + type_tensor = vsi_nn_VariableToTensor(self, + (uint8_t *)&self->nn_param.lsh_projection.type, + VSI_NN_TYPE_INT32); + + memset(&p, 0, sizeof(p)); + p.hash_func = REQUIRED_IO(inputs[0]); + p.weights = OPTIONAL_IO(inputs[2]); + //p.weights = inputs[2]->t; + p.type = type_tensor->t; + //This is a hack + // Need driver fix this + if (p.weights == NULL) + { + vsi_nn_tensor_attr_t attr; + float const_one = 1.0; + int32_t i; + int32_t count = inputs[1]->attr.size[1]; + float* const_data = malloc(count * sizeof(float)); + + for (i = 0; i < count; i++) + { + const_data[i] = const_one; + } + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + attr.size[0] = count; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + weight_tensor = vsi_nn_CreateTensorFromData(self->graph, + (uint8_t *)const_data, &attr); + p.weights = weight_tensor->t; + free(const_data); + //valued = FALSE; + } + vxSetTensorAttribute(p.weights, VX_TENSOR_VALUE, &valued, sizeof(vx_bool)); + + self->n = vxLSHProjectionLayer( self->graph->g, + inputs[1]->t, &p, sizeof(p), outputs[0]->t); + if( !self->n ) + { + status = VSI_FAILURE; + } + vsi_nn_ReleaseTensor( &type_tensor ); + if (weight_tensor != NULL) vsi_nn_ReleaseTensor(&weight_tensor); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = 1; + if( VSI_NN_LSH_PROJECTION_SPARSE == node->nn_param.lsh_projection.type ) + { + outputs[0]->attr.size[0] = inputs[0]->attr.size[1]; + } + else if( VSI_NN_LSH_PROJECTION_DENSE == node->nn_param.lsh_projection.type ) + { + outputs[0]->attr.size[0] = vsi_nn_GetElementNum( inputs[0] ); + } + else + { + VSILOGE("Unknown lsh projection hash type."); + } + } + return TRUE; +} /* op_setup() */ + +#ifdef __cpluplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LSH_PROJECTION, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 3, + /* output_num */ 1 + ); +#ifdef __cpluplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm.c new file mode 100644 index 0000000..900e50b --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm.c @@ -0,0 +1,278 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" + +static vsi_status _create_local_tensor + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_tensor_t *act_tensor = NULL; + vsi_nn_tensor_t *forget_bias_tensor = NULL; + vsi_nn_tensor_t *cell_clip_tensor = NULL; + vsi_nn_tensor_t *proj_clip_tensor = NULL; + + if(NULL == self) + { + return VSI_FAILURE; + } + + act_tensor = vsi_nn_VariableToTensor(self, + (uint8_t *)&self->nn_param.lstm.activation, + VSI_NN_TYPE_INT32); + if(NULL == act_tensor) + { + goto error; + } + + if (self->nn_param.lstm.forget_bias != 0.0 ) + { + forget_bias_tensor = vsi_nn_VariableToTensor(self, + (uint8_t *)&self->nn_param.lstm.forget_bias, + VSI_NN_TYPE_FLOAT32); + if(NULL == forget_bias_tensor) + { + goto error; + } + } + + cell_clip_tensor = vsi_nn_VariableToTensor(self, + (uint8_t *)&self->nn_param.lstm.cell_clip, + VSI_NN_TYPE_FLOAT32); + if(NULL == cell_clip_tensor) + { + goto error; + } + + proj_clip_tensor = vsi_nn_VariableToTensor(self, + (uint8_t *)&self->nn_param.lstm.proj_clip, + VSI_NN_TYPE_FLOAT32); + if(NULL == proj_clip_tensor) + { + goto error; + } + + self->nn_param.lstm.local.activation_tensor = act_tensor; + self->nn_param.lstm.local.forget_bias_tensor = forget_bias_tensor; + self->nn_param.lstm.local.cell_clip_tensor = cell_clip_tensor; + self->nn_param.lstm.local.proj_clip_tensor = proj_clip_tensor; + return VSI_SUCCESS; +error: + if(act_tensor)vsi_nn_ReleaseTensor(&act_tensor); + if(forget_bias_tensor)vsi_nn_ReleaseTensor(&forget_bias_tensor); + if(cell_clip_tensor)vsi_nn_ReleaseTensor(&cell_clip_tensor); + if(proj_clip_tensor)vsi_nn_ReleaseTensor(&proj_clip_tensor); + return VSI_FAILURE; +} /* _create_local_tensor() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; +#if 1 + vx_nn_lstm_layer_params_ext_t p; + memset( &p, 0, sizeof( vx_nn_lstm_layer_params_ext_t )); + + status = VSI_FAILURE; + + status = _create_local_tensor(self); + if(status != VSI_SUCCESS) + { + return status; + } + + p.lstm_param.base.input2input_weight = REQUIRED_IO(inputs[3]); + p.lstm_param.base.input2forget_weight = REQUIRED_IO(inputs[4]); + p.lstm_param.base.input2cell_weight = REQUIRED_IO(inputs[5]); + p.lstm_param.base.input2output_weight = REQUIRED_IO(inputs[6]); + p.lstm_param.base.recurrent2input_weight = REQUIRED_IO(inputs[7]); + p.lstm_param.base.recurrent2forget_weight = REQUIRED_IO(inputs[8]); + p.lstm_param.base.recurrent2cell_weight = REQUIRED_IO(inputs[9]); + p.lstm_param.base.recurrent2output_weight = REQUIRED_IO(inputs[10]); + p.lstm_param.base.input_gate_bias = REQUIRED_IO(inputs[14]); + p.lstm_param.base.forget_gate_bias = REQUIRED_IO(inputs[15]); + p.lstm_param.base.cell_bias = REQUIRED_IO(inputs[16]); + p.lstm_param.base.output_gate_bias = OPTIONAL_IO(inputs[17]); + p.lstm_param.base.projection_weight = OPTIONAL_IO(inputs[18]); + p.lstm_param.base.projection_bias = OPTIONAL_IO(inputs[19]); + + p.lstm_param.base.activation = OPTIONAL_IO(self->nn_param.lstm.local.activation_tensor); + p.lstm_param.forget_bias = OPTIONAL_IO(self->nn_param.lstm.local.forget_bias_tensor); + p.lstm_param.base.cell_clip = 
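+    /*
+     * The enabled (#if 1) branch fills the extended parameter struct
+     * (vx_nn_lstm_layer_params_ext_t): besides the weight/bias tensors taken
+     * from inputs[3..10] and inputs[14..19], the scalar node parameters
+     * (activation, forget_bias, cell_clip, proj_clip) are wrapped into small
+     * tensors by _create_local_tensor() and passed by handle to vxLstmLayer().
+     * The #else branch keeps the legacy vx_nn_lstm_layer_params_t layout,
+     * which has no forget_bias field.  The peephole weights (inputs[11..13])
+     * are not wired in either branch.
+     */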
REQUIRED_IO(self->nn_param.lstm.local.cell_clip_tensor); + p.lstm_param.base.proj_clip = REQUIRED_IO(self->nn_param.lstm.local.proj_clip_tensor); + + self->n = vxLstmLayer( + self->graph->g, + REQUIRED_IO(inputs[0]), + NULL, + NULL, + ( vx_nn_lstm_layer_params_t *)&p, + sizeof( vx_nn_lstm_layer_params_ext_t ), + REQUIRED_IO(outputs[0]) + ); +#else + vx_nn_lstm_layer_params_t p; + memset( &p, 0, sizeof( vx_nn_lstm_layer_params_t )); + + status = VSI_FAILURE; + + status = _create_local_tensor(self); + if(status != VSI_SUCCESS) + { + return status; + } + + p.lstm_param.input2input_weight = REQUIRED_IO(inputs[3]); + p.lstm_param.input2forget_weight = REQUIRED_IO(inputs[4]); + p.lstm_param.input2cell_weight = REQUIRED_IO(inputs[5]); + p.lstm_param.input2output_weight = REQUIRED_IO(inputs[6]); + p.lstm_param.recurrent2input_weight = REQUIRED_IO(inputs[7]); + p.lstm_param.recurrent2forget_weight = REQUIRED_IO(inputs[8]); + p.lstm_param.recurrent2cell_weight = REQUIRED_IO(inputs[9]); + p.lstm_param.recurrent2output_weight = REQUIRED_IO(inputs[10]); + p.lstm_param.input_gate_bias = REQUIRED_IO(inputs[14]); + p.lstm_param.forget_gate_bias = REQUIRED_IO(inputs[15]); + p.lstm_param.cell_bias = REQUIRED_IO(inputs[16]); + p.lstm_param.output_gate_bias = OPTIONAL_IO(inputs[17]); + p.lstm_param.projection_weight = OPTIONAL_IO(inputs[18]); + p.lstm_param.projection_bias = OPTIONAL_IO(inputs[19]); + + p.lstm_param.activation = OPTIONAL_IO(self->nn_param.lstm.local.activation_tensor); + p.lstm_param.cell_clip = REQUIRED_IO(self->nn_param.lstm.local.cell_clip_tensor); + p.lstm_param.proj_clip = REQUIRED_IO(self->nn_param.lstm.local.proj_clip_tensor); + + self->n = vxLstmLayer( + self->graph->g, + REQUIRED_IO(inputs[0]), + NULL, + NULL, + ( vx_nn_lstm_layer_params_t *)&p, + sizeof( vx_nn_lstm_layer_params_t ), + REQUIRED_IO(outputs[0]) + ); +#endif + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + //TODO: Check tensor shapes. + if( inputs[0]->attr.dim_num < 3) + { + VSILOGE( "Wrong shape parameters." 
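+    /*
+     * Only the input rank is validated here: the layer expects at least a
+     * 3-D input.  When the output dims are left as VSI_NN_DIM_AUTO, op_setup()
+     * below infers the output shape as { nn_param.lstm.weights,
+     * input.size[dim_num - 2], input.size[dim_num - 1] }.
+     */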
); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[0] = self->nn_param.lstm.weights; + outputs[0]->attr.size[1] = inputs[0]->attr.size[inputs[0]->attr.dim_num - 2]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[inputs[0]->attr.dim_num - 1]; + outputs[0]->attr.dim_num = 3; + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_tensor_t *activation_tensor, *forget_bias_tensor; + vsi_nn_tensor_t *cell_clip_tensor, *proj_clip_tensor; + if(NULL == self) + { + return VSI_FAILURE; + } + + activation_tensor = self->nn_param.lstm.local.activation_tensor; + forget_bias_tensor = self->nn_param.lstm.local.forget_bias_tensor; + cell_clip_tensor = self->nn_param.lstm.local.cell_clip_tensor; + proj_clip_tensor = self->nn_param.lstm.local.proj_clip_tensor; + if(NULL != self->n) + { + if(activation_tensor)vsi_nn_ReleaseTensor(&activation_tensor); + if(forget_bias_tensor)vsi_nn_ReleaseTensor(&forget_bias_tensor); + if(cell_clip_tensor)vsi_nn_ReleaseTensor(&cell_clip_tensor); + if(proj_clip_tensor)vsi_nn_ReleaseTensor(&proj_clip_tensor); + vxReleaseNode( &self->n ); + self->n = NULL; + } + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LSTM, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 20, + /* output_num */ 3 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c new file mode 100644 index 0000000..899711c --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c @@ -0,0 +1,435 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_rnn_helper.h" + +static vsi_bool setup_op_shapes + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_lstm_ovxlib_param* curr_param = &self->nn_param.lstm_ovxlib; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + uint32_t num_units = 0; + uint32_t output_size = 0; + uint32_t batch_size = 0; + vsi_bool use_virtual_tensor = TRUE; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + if( curr_param->time_major ) + { + batch_size = inputs[LSTM_INPUT_INPUT]->attr.size[1]; + } + else + { + batch_size = inputs[LSTM_INPUT_INPUT]->attr.size[2]; + } + + num_units = inputs[LSTM_INPUT_WEIGHT_I2F]->attr.size[1]; + output_size = num_units; + if( inputs[LSTM_INPUT_WEIGHT_PROJ] ) + { + output_size = inputs[LSTM_INPUT_WEIGHT_PROJ]->attr.size[1]; + } + + /* create h_state and c_state input/output if app doesn't provide them */ + if( !inputs[LSTM_INPUT_H_STATE] ) + { + attr.dim_num = 2; + attr.size[1] = batch_size; + attr.size[0] = output_size; + memcpy( &attr.dtype, &outputs[LSTM_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = FALSE; + attr.is_const = TRUE; + + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + inputs[LSTM_INPUT_H_STATE] = output_tensor->t; + } + + if( !inputs[LSTM_INPUT_C_STATE] ) + { + attr.dim_num = 2; + attr.size[1] = batch_size; + attr.size[0] = num_units; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.vtl = FALSE; + attr.is_const = TRUE; + + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + inputs[LSTM_INPUT_C_STATE] = output_tensor->t; + } + + if( !outputs[LSTM_OUTPUT_H_STATE] ) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + memcpy( &attr.dtype, &outputs[LSTM_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = use_virtual_tensor; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + outputs[LSTM_OUTPUT_H_STATE] = output_tensor->t; + } + + if( !outputs[LSTM_OUTPUT_C_STATE] ) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + memcpy( &attr.dtype, &inputs[LSTM_INPUT_C_STATE]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = use_virtual_tensor; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + outputs[LSTM_OUTPUT_C_STATE] = output_tensor->t; + } + + /* output */ + if( VSI_NN_DIM_AUTO == outputs[LSTM_OUTPUT_OUTPUT]->attr.dim_num ) + { + outputs[LSTM_OUTPUT_OUTPUT]->attr.size[0] = output_size; + if( curr_param->return_sequences ) + { + outputs[LSTM_OUTPUT_OUTPUT]->attr.size[1] = inputs[LSTM_INPUT_INPUT]->attr.size[1]; + outputs[LSTM_OUTPUT_OUTPUT]->attr.size[2] = inputs[LSTM_INPUT_INPUT]->attr.size[2]; + outputs[LSTM_OUTPUT_OUTPUT]->attr.dim_num = 3; + } + else + { + outputs[LSTM_OUTPUT_OUTPUT]->attr.size[1] = batch_size; + outputs[LSTM_OUTPUT_OUTPUT]->attr.dim_num = 2; + } + } + + /* output_state_out */ + if( VSI_NN_DIM_AUTO == 
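+    /*
+     * setup_op_shapes() reads batch_size from size[1] (time_major) or size[2]
+     * (batch-major) of the input, num_units from the I2F weight, and
+     * output_size from the projection weight when one is given.  Missing
+     * h_state / c_state inputs are replaced by constant zero tensors, and any
+     * output left as VSI_NN_DIM_AUTO gets its shape filled in here.
+     */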
outputs[LSTM_OUTPUT_H_STATE]->attr.dim_num ) + { + outputs[LSTM_OUTPUT_H_STATE]->attr.size[0] = output_size; + outputs[LSTM_OUTPUT_H_STATE]->attr.size[1] = batch_size; + outputs[LSTM_OUTPUT_H_STATE]->attr.dim_num = 2; + } + + /* cell_state_out */ + if(VSI_NN_DIM_AUTO == outputs[LSTM_OUTPUT_C_STATE]->attr.dim_num) + { + outputs[LSTM_OUTPUT_C_STATE]->attr.size[0] = num_units; + outputs[LSTM_OUTPUT_C_STATE]->attr.size[1] = batch_size; + outputs[LSTM_OUTPUT_C_STATE]->attr.dim_num = 2; + } + + return TRUE; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_lstm_ovxlib_param* curr_param = &self->nn_param.lstm_ovxlib; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_nn_tensor_t** split_output_tensors = NULL; + vsi_nn_tensor_t** lstmunit_reshape_output_tensors =NULL; + vsi_nn_tensor_t* last_step_h_state = NULL; + vsi_nn_tensor_t* last_step_c_state = NULL; + vsi_nn_tensor_t* tensor = NULL; + vsi_nn_tensor_t* input_tensor = NULL; + vsi_bool use_virtual_tensor = TRUE; + uint32_t batch_size = 0; + uint32_t time_step = 0; + uint32_t i = 0; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_node_wksp( self ); + + if( curr_param->time_major ) + { + batch_size = inputs[LSTM_INPUT_INPUT]->attr.size[1]; + time_step = inputs[LSTM_INPUT_INPUT]->attr.size[2]; + } + else + { + batch_size = inputs[LSTM_INPUT_INPUT]->attr.size[2]; + time_step = inputs[LSTM_INPUT_INPUT]->attr.size[1]; + } + + setup_op_shapes( self, inputs, outputs); + + /* default to input */ + input_tensor = inputs[LSTM_INPUT_INPUT]; + if( !curr_param->time_major ) + { + /* transpose to time_major */ + output_tensor = vsi_nn_rnn_transpose_time_major(self, + inputs[LSTM_INPUT_INPUT], NULL, use_virtual_tensor); + input_tensor = output_tensor->t; + } + + /* split input tensor */ + split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * \ + sizeof(vsi_nn_tensor_t **)); + memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + lstmunit_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * \ + sizeof(vsi_nn_tensor_t **)); + memset( lstmunit_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + + vsi_nn_rnn_split_input_tensor(self, input_tensor, + split_output_tensors, time_step, use_virtual_tensor); + + vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + + last_step_h_state = inputs[LSTM_INPUT_H_STATE]; + last_step_c_state = inputs[LSTM_INPUT_C_STATE]; + for( i = 0; i < time_step; i++ ) + { + vsi_nn_tensor_t* reshape_output = NULL; + vsi_nn_tensor_t* lstmunit_out0 = NULL; + vsi_nn_tensor_t* lstmunit_out1 = NULL; + vsi_nn_tensor_t* lstmunit_out2 = NULL; + + /* reshape for split output */ + output_tensor = vsi_nn_rnn_reshape_split_output(self, + 
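+        /*
+         * The sequence is unrolled: the (time-major) input is split into
+         * time_step slices, each slice is reshaped to 2-D and fed to an
+         * LSTMUNIT_OVXLIB node, and each step's h/c state outputs are chained
+         * into the next step.  The last step writes the layer's h_state and
+         * c_state outputs directly, and also its main output when
+         * return_sequences is disabled.
+         */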
split_output_tensors[i], batch_size, use_virtual_tensor); + reshape_output = output_tensor->t; + + /* lstmunit output */ + if( (i == time_step - 1) && !curr_param->return_sequences ) + { + lstmunit_out0 = outputs[LSTM_OUTPUT_OUTPUT]; + } + else + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[LSTM_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + lstmunit_out0 = output_tensor->t; + } + + if( i != time_step - 1 ) + { + /* lstmunit output h_state */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[LSTM_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + lstmunit_out1 = output_tensor->t; + + /* lstmunit output c_state */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[LSTM_OUTPUT_C_STATE]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + lstmunit_out2 = output_tensor->t; + } + else + { + lstmunit_out1 = outputs[LSTM_OUTPUT_H_STATE]; + lstmunit_out2 = outputs[LSTM_OUTPUT_C_STATE]; + } + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_OVXLIB, 0, 0 ); + curr->node->nn_param.lstmunit_ovxlib.activation = curr_param->activation; + curr->node->nn_param.lstmunit_ovxlib.cell_clip = curr_param->cell_clip; + curr->node->nn_param.lstmunit_ovxlib.forget_bias = curr_param->forget_bias; + curr->node->nn_param.lstmunit_ovxlib.proj_clip = curr_param->proj_clip; + curr->node->nn_param.lstmunit_ovxlib.recurrent_activation = curr_param->recurrent_activation; + memcpy( curr->node->nn_param.lstmunit_ovxlib.internal_dtype, + curr_param->internal_dtype, sizeof( curr_param->internal_dtype ) ); + curr->inputs[LSTMUNIT_INPUT_INPUT] = reshape_output; + curr->inputs[LSTMUNIT_INPUT_H_STATE] = last_step_h_state; + curr->inputs[LSTMUNIT_INPUT_C_STATE] = last_step_c_state; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_I2I] = inputs[LSTM_INPUT_WEIGHT_I2I]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_I2F] = inputs[LSTM_INPUT_WEIGHT_I2F]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_I2C] = inputs[LSTM_INPUT_WEIGHT_I2C]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_I2O] = inputs[LSTM_INPUT_WEIGHT_I2O]; + + curr->inputs[LSTMUNIT_INPUT_WEIGHT_R2I] = inputs[LSTM_INPUT_WEIGHT_R2I]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_R2F] = inputs[LSTM_INPUT_WEIGHT_R2F]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_R2C] = inputs[LSTM_INPUT_WEIGHT_R2C]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_R2O] = inputs[LSTM_INPUT_WEIGHT_R2O]; + + curr->inputs[LSTMUNIT_INPUT_WEIGHT_C2I] = inputs[LSTM_INPUT_WEIGHT_C2I]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_C2F] = inputs[LSTM_INPUT_WEIGHT_C2F]; + curr->inputs[LSTMUNIT_INPUT_WEIGHT_C2O] = inputs[LSTM_INPUT_WEIGHT_C2O]; + + curr->inputs[LSTMUNIT_INPUT_BIAS_I] = inputs[LSTM_INPUT_BIAS_I]; + curr->inputs[LSTMUNIT_INPUT_BIAS_F] = inputs[LSTM_INPUT_BIAS_F]; + curr->inputs[LSTMUNIT_INPUT_BIAS_C] = inputs[LSTM_INPUT_BIAS_C]; + curr->inputs[LSTMUNIT_INPUT_BIAS_O] = inputs[LSTM_INPUT_BIAS_O]; + + curr->inputs[LSTMUNIT_INPUT_WEIGHT_PROJ] = inputs[LSTM_INPUT_WEIGHT_PROJ]; + curr->inputs[LSTMUNIT_INPUT_BIAS_PROJ] = inputs[LSTM_INPUT_BIAS_PROJ]; + + curr->inputs[LSTMUNIT_INPUT_LAYERNORM_I] = inputs[LSTM_INPUT_LAYERNORM_I]; + curr->inputs[LSTMUNIT_INPUT_LAYERNORM_F] = inputs[LSTM_INPUT_LAYERNORM_F]; + curr->inputs[LSTMUNIT_INPUT_LAYERNORM_C] = inputs[LSTM_INPUT_LAYERNORM_C]; + curr->inputs[LSTMUNIT_INPUT_LAYERNORM_O] = inputs[LSTM_INPUT_LAYERNORM_O]; + + curr->outputs[LSTMUNIT_OUTPUT_OUTPUT] = lstmunit_out0; + curr->outputs[LSTMUNIT_OUTPUT_H_STATE] = 
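+            /*
+             * With return_sequences enabled, every step's output is reshaped
+             * back to 3-D and the slices are concatenated along axis 2; if the
+             * layer is not time_major, the concatenated tensor is transposed
+             * back to batch-major as the final step.
+             */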
lstmunit_out1; + curr->outputs[LSTMUNIT_OUTPUT_C_STATE] = lstmunit_out2; + + vsi_nn_internal_setup_node( self, curr ); + + last_step_h_state = lstmunit_out1; + last_step_c_state = lstmunit_out2; + + if( curr_param->return_sequences ) + { + /* reshape output to 3-dims */ + output_tensor = vsi_nn_rnn_reshape_cell_output(self, + lstmunit_out0, batch_size, use_virtual_tensor); + lstmunit_reshape_output_tensors[i] = output_tensor->t; + } + } + + if( curr_param->return_sequences ) + { + tensor = outputs[LSTM_OUTPUT_OUTPUT]; + if( !curr_param->time_major ) + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[LSTM_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + tensor = output_tensor->t; + } + + /* concat */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + curr->node->nn_param.concat.axis = 2; + for( i = 0; i < time_step; i++ ) + { + curr->inputs[i] = lstmunit_reshape_output_tensors[i]; + } + curr->outputs[0] = tensor; + vsi_nn_internal_setup_node( self, curr ); + + if( !curr_param->time_major ) + { + /* transpose time_major to batch_major*/ + vsi_nn_rnn_transpose_time_major(self, + tensor, outputs[LSTM_OUTPUT_OUTPUT], use_virtual_tensor); + } + } + + vsi_nn_safe_free( split_output_tensors ); + vsi_nn_safe_free( lstmunit_reshape_output_tensors ); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_deinit_node_wksp( self ); + + return status; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.lstm_ovxlib.activation = VSI_NN_ACT_TANH; + self->nn_param.lstm_ovxlib.recurrent_activation = VSI_NN_ACT_SIGMOID; + self->nn_param.lstm_ovxlib.return_sequences = TRUE; + self->nn_param.lstm_ovxlib.time_major = TRUE; + + return status; +} /* op_init() */ + +#ifdef __cpluplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LSTM_OVXLIB, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ LSTM_INPUT_CNT, + /* output_num */ LSTM_OUTPUT_CNT + ); +#ifdef __cpluplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c new file mode 100644 index 0000000..232bebf --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c @@ -0,0 +1,360 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" + +static vsi_status _create_local_tensor + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_tensor_t *act_tensor = NULL; + vsi_nn_tensor_t *cell_clip_tensor = NULL; + vsi_nn_tensor_t *proj_clip_tensor = NULL; + vsi_nn_tensor_t *scratch_tensor = NULL; + vsi_nn_tensor_t *forget_bias_tensor = NULL; + + if(NULL == self) + { + return VSI_FAILURE; + } + + act_tensor = vsi_nn_VariableToTensor(self, + (uint8_t *)&self->nn_param.lstmunit.activation, + VSI_NN_TYPE_INT32); + if(NULL == act_tensor) + { + goto error; + } + + cell_clip_tensor = vsi_nn_VariableToTensor(self, + (uint8_t *)&self->nn_param.lstmunit.cell_clip, + VSI_NN_TYPE_FLOAT32); + if(NULL == cell_clip_tensor) + { + goto error; + } + + proj_clip_tensor = vsi_nn_VariableToTensor(self, + (uint8_t *)&self->nn_param.lstmunit.proj_clip, + VSI_NN_TYPE_FLOAT32); + if(NULL == proj_clip_tensor) + { + goto error; + } + + scratch_tensor = vsi_nn_CreateTensor( self->graph, &self->nn_param.lstmunit.local.scratch_attr ); + if(NULL == scratch_tensor) + { + goto error; + } + + forget_bias_tensor = vsi_nn_VariableToTensor(self, + (uint8_t *)&self->nn_param.lstmunit.forget_bias, + VSI_NN_TYPE_FLOAT32); + if(NULL == forget_bias_tensor) + { + goto error; + } + + self->nn_param.lstmunit.local.activation_tensor = act_tensor; + self->nn_param.lstmunit.local.cell_clip_tensor = cell_clip_tensor; + self->nn_param.lstmunit.local.proj_clip_tensor = proj_clip_tensor; + self->nn_param.lstmunit.local.scratch_tensor = scratch_tensor; + self->nn_param.lstmunit.local.forget_bias_tensor = forget_bias_tensor; + return VSI_SUCCESS; +error: + if(act_tensor)vsi_nn_ReleaseTensor(&act_tensor); + if(cell_clip_tensor)vsi_nn_ReleaseTensor(&cell_clip_tensor); + if(proj_clip_tensor)vsi_nn_ReleaseTensor(&proj_clip_tensor); + if(scratch_tensor)vsi_nn_ReleaseTensor(&scratch_tensor); + if(forget_bias_tensor)vsi_nn_ReleaseTensor(&forget_bias_tensor); + return VSI_FAILURE; +} /* _create_local_tensor() */ + +static vsi_status _init_lstmunit_param + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vx_nn_lstm_params_ext_t *param + ) +{ + param->base.input2input_weight = OPTIONAL_IO(inputs[3]); + param->base.input2forget_weight = REQUIRED_IO(inputs[4]); + param->base.input2cell_weight = REQUIRED_IO(inputs[5]); + param->base.input2output_weight = REQUIRED_IO(inputs[6]); + + param->base.recurrent2input_weight = OPTIONAL_IO(inputs[7]); + param->base.recurrent2forget_weight = REQUIRED_IO(inputs[8]); + param->base.recurrent2cell_weight = REQUIRED_IO(inputs[9]); + param->base.recurrent2output_weight = REQUIRED_IO(inputs[10]); + + param->base.cell2input_weight = OPTIONAL_IO(inputs[11]); + param->base.cell2forget_weight = OPTIONAL_IO(inputs[12]); + param->base.cell2output_weight = OPTIONAL_IO(inputs[13]); + + param->base.input_gate_bias = OPTIONAL_IO(inputs[14]); + param->base.forget_gate_bias = REQUIRED_IO(inputs[15]); + param->base.cell_bias = 
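+    /*
+     * Input mapping for the 24-input LSTMUNIT: inputs[3..10] are the
+     * input-to-gate and recurrent-to-gate weights, inputs[11..13] the optional
+     * peephole weights, inputs[14..17] the gate biases, inputs[18..19] the
+     * optional projection weight/bias, and inputs[20..23] the optional
+     * layer-norm weights.  The scalar attributes (activation, clips,
+     * forget_bias) travel as the tensors built in _create_local_tensor();
+     * norm_gain/norm_shift are fixed at 1.0 and 0.0.
+     */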
REQUIRED_IO(inputs[16]); + param->base.output_gate_bias = REQUIRED_IO(inputs[17]); + + param->base.projection_weight = OPTIONAL_IO(inputs[18]); + param->base.projection_bias = OPTIONAL_IO(inputs[19]); + + param->layernorm2input_weight = OPTIONAL_IO(inputs[20]); + param->layernorm2forget_weight = OPTIONAL_IO(inputs[21]); + param->layernorm2cell_weight = OPTIONAL_IO(inputs[22]); + param->layernorm2output_weight = OPTIONAL_IO(inputs[23]); + + param->base.activation = OPTIONAL_IO(self->nn_param.lstmunit.local.activation_tensor); + param->base.cell_clip = OPTIONAL_IO(self->nn_param.lstmunit.local.cell_clip_tensor); + param->base.proj_clip = OPTIONAL_IO(self->nn_param.lstmunit.local.proj_clip_tensor); + + param->forget_bias = REQUIRED_IO(self->nn_param.lstmunit.local.forget_bias_tensor); + param->norm_gain = 1.0f; + param->norm_shift = 0.0f; + + return VSI_SUCCESS; +} /* _init_lstmunit_param() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vx_nn_lstm_params_ext_t param; + + status = VSI_FAILURE; + memset(¶m, 0, sizeof(param)); + + status = _create_local_tensor(self); + if(status != VSI_SUCCESS) + { + return status; + } + + status = _init_lstmunit_param(self, inputs, ¶m); + if(status != VSI_SUCCESS) + { + return status; + } + + /* Support high precision for cell state input */ + if( inputs[2] != NULL && VSI_NN_TYPE_FLOAT32 == inputs[2]->attr.dtype.vx_type ) + { + status = vsi_nn_SetTensorAttr(inputs[2], VSI_NN_TENSOR_ATTR_HIGH_PRECISION); + if(VSI_SUCCESS != status) + { + VSILOGE("Set tensor attr of cell state input to high presision fail"); + return status; + } + } + + /* Support high precision for cell state output */ + if( outputs[2] != NULL && VSI_NN_TYPE_FLOAT32 == outputs[2]->attr.dtype.vx_type ) + { + status = vsi_nn_SetTensorAttr(outputs[2], VSI_NN_TENSOR_ATTR_HIGH_PRECISION); + if(VSI_SUCCESS != status) + { + VSILOGE("Set tensor attr of cell state output to high presision fail"); + return status; + } + } + + self->n = vxLstmUnitLayer( + self->graph->g, + REQUIRED_IO(inputs[0]), + REQUIRED_IO(inputs[1]), + REQUIRED_IO(inputs[2]), + (vx_nn_lstm_params_t *)¶m, + sizeof(param), + REQUIRED_IO(self->nn_param.lstmunit.local.scratch_tensor), + REQUIRED_IO(outputs[1]), + REQUIRED_IO(outputs[2]), + REQUIRED_IO(outputs[0]) + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + //TODO: Check tensor shapes. 
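+    /*
+     * Note: op_compute() above requests VSI_NN_TENSOR_ATTR_HIGH_PRECISION on
+     * float32 cell-state tensors (inputs[2] / outputs[2]) before building the
+     * vxLstmUnitLayer node, so the cell-state accumulation stays in full
+     * precision.
+     */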
+ return TRUE; +} /* op_check() */ + +/* + inputs[0]: input + inputs[1]: output_state_in + inputs[2]: cell_state_in + inputs[3] ~ inputs[23]: weights & bias + outputs[0]: scratch + outputs[1]: output_state_out + outputs[2]: cell_state_out + outputs[3]: output +*/ +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* scratch */ + self->nn_param.lstmunit.local.scratch_attr.vtl = TRUE; + self->nn_param.lstmunit.local.scratch_attr.is_const = FALSE; + self->nn_param.lstmunit.local.scratch_attr.dtype.vx_type = outputs[0]->attr.dtype.vx_type; + self->nn_param.lstmunit.local.scratch_attr.dim_num = inputs[0]->attr.dim_num; + self->nn_param.lstmunit.local.scratch_attr.size[0] = inputs[4]->attr.size[1] * 4; /* num_units * 4 */ + self->nn_param.lstmunit.local.scratch_attr.size[1] = inputs[0]->attr.size[1]; /* batch_size */ + + /* output */ + if(VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) + { + if(inputs[18]) /* enable projection_weight */ + { + outputs[0]->attr.size[0] = inputs[18]->attr.size[1]; /* output_size */ + } + else /* disable projection_weight */ + { + outputs[0]->attr.size[0] = inputs[4]->attr.size[1]; /* num_units */ + } + outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; /* batch_size */ + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + + /* output_state_out */ + if(VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num) + { + outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; + memcpy( outputs[1]->attr.size, outputs[0]->attr.size, + VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + } + + /* cell_state_out */ + if(VSI_NN_DIM_AUTO == outputs[2]->attr.dim_num) + { + outputs[2]->attr.dim_num = outputs[1]->attr.dim_num; + outputs[2]->attr.size[0] = inputs[4]->attr.size[1]; /* num_units */ + outputs[2]->attr.size[1] = inputs[0]->attr.size[1]; /* batch_size */ + } + + if ((NULL != outputs[3]) && (NULL != inputs[4])) + { + uint32_t cifg_factor = /*input2input_weight*/inputs[3] == NULL ? 
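+        /*
+         * The optional outputs[3] buffer is sized num_units * 4 (one block per
+         * i/f/c/o gate) in the normal case, or num_units * 3 when CIFG is in
+         * use and there is no separate input2input weight (inputs[3] == NULL).
+         */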
3/*use_cifg*/ : 4; + outputs[3]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[3]->attr.size[0] = inputs[4]->attr.size[1] * cifg_factor; /* num_units * 4 */ + outputs[3]->attr.size[1] = inputs[0]->attr.size[1]; /* batch_size */ + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_tensor_t *activation_tensor, *cell_clip_tensor, *proj_clip_tensor; + vsi_nn_tensor_t *scratch_tensor; + vsi_nn_tensor_t *forget_bias_tensor; + + if(NULL == self) + { + return VSI_FAILURE; + } + + activation_tensor = self->nn_param.lstmunit.local.activation_tensor; + cell_clip_tensor = self->nn_param.lstmunit.local.cell_clip_tensor; + proj_clip_tensor = self->nn_param.lstmunit.local.proj_clip_tensor; + scratch_tensor = self->nn_param.lstmunit.local.scratch_tensor; + forget_bias_tensor = self->nn_param.lstmunit.local.forget_bias_tensor; + if(NULL != self->n) + { + if(activation_tensor)vsi_nn_ReleaseTensor(&activation_tensor); + if(cell_clip_tensor)vsi_nn_ReleaseTensor(&cell_clip_tensor); + if(proj_clip_tensor)vsi_nn_ReleaseTensor(&proj_clip_tensor); + if(scratch_tensor)vsi_nn_ReleaseTensor(&scratch_tensor); + if(forget_bias_tensor)vsi_nn_ReleaseTensor(&forget_bias_tensor); + vxReleaseNode( &self->n ); + self->n = NULL; + } + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.lstmunit.activation = VSI_NN_ACT_TANH; + + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LSTMUNIT, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 24, + /* output_num */ 4 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c new file mode 100644 index 0000000..d95f48d --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c @@ -0,0 +1,289 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_tensor_op.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_util.h" + +#define _INPUT_NUM (LSTMUNIT_ACT_INPUTS_COUNT) +#define _OUTPUT_NUM (LSTMUNIT_ACT_OUTUTS_COUNT) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + int32_t _is_ln= 0; + int32_t _is_cifg= 0; + int32_t _is_proj= 0; + int32_t _is_hybrid= 0; + int32_t _is_peephole= 0; + int32_t recurrent_activation; + float cell_clip; + float proj_clip; + float forget_bias; + vsi_nn_lstmunit_activation_param * p = NULL; + + p = &(self->nn_param.lstmunit_activation); + _is_ln = p->is_layer_norm ? 1 : 0; + _is_cifg = p->is_cifg ? 1 : 0; + _is_proj = p->is_projection ? 1 : 0; + _is_hybrid = p->is_hybrid ? 1 : 0; + _is_peephole = p->is_peephole ? 1 : 0; + recurrent_activation = (int32_t)(p->recurrent_activation); + cell_clip = p->cell_clip; + proj_clip = p->proj_clip; + forget_bias = p->forget_bias; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "_is_ln", _is_ln ); + vsi_nn_kernel_param_add_int32( param, "_is_cifg", _is_cifg ); + vsi_nn_kernel_param_add_int32( param, "_is_proj", _is_proj ); + vsi_nn_kernel_param_add_int32( param, "_is_hybrid", _is_hybrid ); + vsi_nn_kernel_param_add_int32( param, "_is_peephole", _is_peephole ); + vsi_nn_kernel_param_add_int32( param, "recurrent_activation", recurrent_activation ); + vsi_nn_kernel_param_add_float32( param, "cell_clip" , cell_clip ); + vsi_nn_kernel_param_add_float32( param, "proj_clip" , proj_clip ); + vsi_nn_kernel_param_add_float32( param, "forget_bias", forget_bias ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "lstmunit_activation", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, param ); + + vsi_nn_kernel_param_release( ¶m ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; + +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_lstmunit_activation_param * p; + vsi_nn_dtype_t dst_dtype; + int32_t ifco_start_index = 0; + vsi_nn_tensor_attr_t attr; + int32_t i = 0; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + if( NULL == self ) + { + return FALSE; + } + + p = &(self->nn_param.lstmunit_activation); + + p->is_cifg = inputs[LSTMUNIT_ACT_INPUT_FC_I] == NULL; + p->is_projection = outputs[LSTMUNIT_ACT_HSTATE_OUT] == NULL; + p->is_layer_norm = inputs[LSTMUNIT_ACT_LN_WF] != NULL; + p->is_hybrid = p->is_layer_norm ? 0 : inputs[LSTMUNIT_ACT_DATA_BF] != NULL; + p->recurrent_activation = p->recurrent_activation == VSI_NN_ACT_NONE ? 
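+    /*
+     * The kernel variant is derived from which optional IO are wired:
+     * is_cifg when the input-gate FC input is absent, is_projection when no
+     * h-state output is requested, is_layer_norm when layer-norm weights are
+     * present, and is_hybrid when float bias data is supplied without layer
+     * norm.  recurrent_activation falls back to sigmoid when left as
+     * VSI_NN_ACT_NONE.
+     */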
+ VSI_NN_ACT_SIGMOID : p->recurrent_activation; + + for( i = ifco_start_index; i < 4; i++ ) + { + vsi_nn_tensor_t* t0 = NULL; + vsi_nn_tensor_t* t1 = NULL; + dst_dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + dst_dtype.vx_type = VSI_NN_TYPE_FLOAT32; + + if (inputs[LSTMUNIT_ACT_DATA_BI + i] && inputs[LSTMUNIT_ACT_DATA_BI + i]->attr.dim_num == 1) + { + memcpy(&attr, &(inputs[LSTMUNIT_ACT_DATA_BI + i]->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + t0 = vsi_nn_CreateTensor( self->graph, &attr ); + vsi_nn_ReshapeTensor(self->graph, inputs[LSTMUNIT_ACT_DATA_BI + i], t0, attr.size, attr.dim_num); + + if( dst_dtype.vx_type != t0->attr.dtype.vx_type + && dst_dtype.qnt_type != t0->attr.dtype.qnt_type ) + { + p->local.tensors[LSTMUNIT_ACT_TENSOR_BI + i] = + vsi_nn_ConvertTensorDtype( self->graph, t0, &dst_dtype ); + vsi_nn_ReleaseTensor( &t0 ); + } + else + { + p->local.tensors[LSTMUNIT_ACT_TENSOR_BI + i] = t0; + } + + inputs[LSTMUNIT_ACT_DATA_BI + i] = p->local.tensors[LSTMUNIT_ACT_TENSOR_BI + i]; + } + + if (inputs[LSTMUNIT_ACT_LN_WI + i] && inputs[LSTMUNIT_ACT_LN_WI + i]->attr.dim_num == 1) + { + memcpy(&attr, &(inputs[LSTMUNIT_ACT_LN_WI + i]->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + t1 = vsi_nn_CreateTensor( self->graph, &attr ); + vsi_nn_ReshapeTensor(self->graph, inputs[LSTMUNIT_ACT_LN_WI + i], t1, attr.size, attr.dim_num); + + if( dst_dtype.vx_type != t1->attr.dtype.vx_type + && dst_dtype.qnt_type != t1->attr.dtype.qnt_type ) + { + p->local.tensors[LSTMUNIT_ACT_TENSOR_LN_WI + i] = + vsi_nn_ConvertTensorDtype( self->graph, t1, &dst_dtype ); + vsi_nn_ReleaseTensor( &t1 ); + } + else + { + p->local.tensors[LSTMUNIT_ACT_TENSOR_LN_WI + i] = t1; + } + + inputs[LSTMUNIT_ACT_LN_WI + i] = p->local.tensors[LSTMUNIT_ACT_TENSOR_LN_WI + i]; + } + } + + if( VSI_NN_DIM_AUTO == outputs[LSTMUNIT_ACT_OUTPUT]->attr.dim_num ) + { + outputs[LSTMUNIT_ACT_OUTPUT]->attr.dim_num = inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.dim_num; + outputs[LSTMUNIT_ACT_OUTPUT]->attr.size[0] = inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.size[0]; + outputs[LSTMUNIT_ACT_OUTPUT]->attr.size[1] = inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.size[1]; + outputs[LSTMUNIT_ACT_OUTPUT]->attr.size[2] = inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.size[2]; + outputs[LSTMUNIT_ACT_OUTPUT]->attr.size[3] = inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.size[3]; + } + + if( VSI_NN_DIM_AUTO == outputs[LSTMUNIT_ACT_CSTATE_OUT]->attr.dim_num ) + { + outputs[LSTMUNIT_ACT_CSTATE_OUT]->attr.dim_num = inputs[LSTMUNIT_ACT_CSTATE_IN]->attr.dim_num; + outputs[LSTMUNIT_ACT_CSTATE_OUT]->attr.size[0] = inputs[LSTMUNIT_ACT_CSTATE_IN]->attr.size[0]; + outputs[LSTMUNIT_ACT_CSTATE_OUT]->attr.size[1] = inputs[LSTMUNIT_ACT_CSTATE_IN]->attr.size[1]; + outputs[LSTMUNIT_ACT_CSTATE_OUT]->attr.size[2] = inputs[LSTMUNIT_ACT_CSTATE_IN]->attr.size[2]; + outputs[LSTMUNIT_ACT_CSTATE_OUT]->attr.size[3] = inputs[LSTMUNIT_ACT_CSTATE_IN]->attr.size[3]; + } + + if (outputs[LSTMUNIT_ACT_HSTATE_OUT] && VSI_NN_DIM_AUTO == outputs[LSTMUNIT_ACT_HSTATE_OUT]->attr.dim_num ) + { + outputs[LSTMUNIT_ACT_HSTATE_OUT]->attr.dim_num = outputs[LSTMUNIT_ACT_OUTPUT]->attr.dim_num; + outputs[LSTMUNIT_ACT_HSTATE_OUT]->attr.size[0] = outputs[LSTMUNIT_ACT_OUTPUT]->attr.size[0]; + outputs[LSTMUNIT_ACT_HSTATE_OUT]->attr.size[1] = outputs[LSTMUNIT_ACT_OUTPUT]->attr.size[1]; + outputs[LSTMUNIT_ACT_HSTATE_OUT]->attr.size[2] = outputs[LSTMUNIT_ACT_OUTPUT]->attr.size[2]; + outputs[LSTMUNIT_ACT_HSTATE_OUT]->attr.size[3] = outputs[LSTMUNIT_ACT_OUTPUT]->attr.size[3]; + } + + return 
TRUE; + +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + + vsi_status status = VSI_SUCCESS; + int32_t i = 0; + + for (i = 0; i < LSTMUNIT_ACT_TENSOR_CNT; i++) + { + if (self->nn_param.lstmunit_activation.local.tensors[i] != NULL) + { + vsi_nn_ReleaseTensor(&self->nn_param.lstmunit_activation.local.tensors[i]); + self->nn_param.lstmunit_activation.local.tensors[i] = NULL; + } + } + + if(self->nn_param.lstmunit_activation.local.lstmunit_param != NULL) + { + vsi_nn_ReleaseTensor(&self->nn_param.lstmunit_activation.local.lstmunit_param); + self->nn_param.lstmunit_activation.local.lstmunit_param = NULL; + } + + return status; + +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + + vsi_status status = VSI_SUCCESS; + + self->nn_param.lstmunit_activation.recurrent_activation = VSI_NN_ACT_SIGMOID; + + return status; + +} /* op_init() */ + +#ifdef __cpluplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LSTMUNIT_ACTIVATION, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cpluplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c new file mode 100644 index 0000000..07b6ca2 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c @@ -0,0 +1,713 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "ops/vsi_nn_op_lstmunit_ovxlib.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_rnn_helper.h" + +static vsi_nn_internal_tensor_t* create_tp_fc + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias, + const vsi_nn_dtype_t* output_dtype, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_lstmunit_ovxlib_param* p = &self->nn_param.lstmunit_ovxlib; + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* tensor = NULL; + vsi_nn_internal_tensor_t* tensor1 = NULL; + vsi_nn_internal_tensor_t* tensor2 = NULL; + vsi_nn_internal_node_t* tmp_inode = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + tensor = bias; + if( !bias || p->local->use_layer_norm || p->local->use_hybrid ) + { + /* create zero bias for NN/TP */ + tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr); + tensor = tensor1->t; + } + + vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); + tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_FCL, 0, 0 ); + tmp_inode->node->nn_param.fcl.axis = 0; + tmp_inode->node->nn_param.fcl.weights = weight->attr.size[1]; + + tmp_inode->inputs[0] = input; + tmp_inode->inputs[1] = weight; + tmp_inode->inputs[2] = tensor; + tmp_inode->outputs[0] = tensor2->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + return tensor2; +} + +static vsi_nn_internal_tensor_t* create_nn_fc + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias, + uint32_t kernel_h, + uint32_t kernel_w, + const vsi_nn_dtype_t* output_dtype, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_lstmunit_ovxlib_param* p = &self->nn_param.lstmunit_ovxlib; + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* tensor = NULL; + vsi_nn_internal_tensor_t* tensor1 = NULL; + vsi_nn_internal_tensor_t* tensor2 = NULL; + vsi_nn_internal_tensor_t* reshaped_weight_tensor = NULL; + uint32_t reshaped_weight_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_nn_internal_node_t* tmp_inode = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + tensor = bias; + if( !bias || p->local->use_layer_norm || p->local->use_hybrid ) + { + /* create zero bias for NN/TP */ + tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr); + tensor = tensor1->t; + } + + vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); + tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + reshaped_weight_shape[3] = weight->attr.size[1]; + reshaped_weight_shape[2] = weight->attr.size[0] / ( kernel_h * kernel_w ); + reshaped_weight_shape[1] = kernel_h; + reshaped_weight_shape[0] = kernel_w; + + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = weight->attr.vtl; + attr.is_const = FALSE; //weight->attr.is_const; + memcpy( &attr.dtype, &weight->attr.dtype, sizeof(attr.dtype) ); + memcpy( &attr.size, &reshaped_weight_shape, sizeof(attr.size)); + reshaped_weight_tensor = vsi_nn_internal_new_tensor( self, &attr, 
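+    /*
+     * create_nn_fc() expresses a gate FC as a CONV2D so it can run on the NN
+     * engine: the 2-D weight [input_size, num_units] is reshaped to a 4-D
+     * kernel [kernel_w, kernel_h, input_size / (kernel_h * kernel_w),
+     * num_units] for the kernel size picked by the caller, and a zero bias is
+     * substituted whenever layer-norm or hybrid mode defers the real bias to
+     * the activation kernel.  create_tp_fc() does the same with a plain FCL
+     * node on the TP path.
+     */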
0.0f ); + + vsi_nn_ReshapeTensor( self->graph, weight, reshaped_weight_tensor->t, reshaped_weight_shape, 4 ); + + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + tmp_inode->node->nn_param.conv2d.ksize[0] = kernel_w; + tmp_inode->node->nn_param.conv2d.ksize[1] = kernel_h; + tmp_inode->node->nn_param.conv2d.stride[0] = 1; + tmp_inode->node->nn_param.conv2d.stride[1] = 1; + tmp_inode->node->nn_param.conv2d.pad[0] = 0; + tmp_inode->node->nn_param.conv2d.pad[1] = 0; + tmp_inode->node->nn_param.conv2d.pad[2] = 0; + tmp_inode->node->nn_param.conv2d.pad[3] = 0; + tmp_inode->node->nn_param.conv2d.group = 1; + tmp_inode->node->nn_param.conv2d.dilation[0] = 1; + tmp_inode->node->nn_param.conv2d.dilation[1] = 1; + tmp_inode->node->nn_param.conv2d.weights = weight->attr.size[1]; + + tmp_inode->inputs[0] = input; + tmp_inode->inputs[1] = reshaped_weight_tensor->t; + tmp_inode->inputs[2] = tensor; + tmp_inode->outputs[0] = tensor2->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + return tensor2; +} + +static void create_peephole + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * weight, + vsi_nn_internal_tensor_t ** input_fc, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* input_tensor0 = NULL; + vsi_nn_internal_tensor_t* input_tensor1 = NULL; + vsi_nn_internal_node_t* curr = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + memcpy(&(attr.dtype), &((*input_fc)->t->attr.dtype), sizeof(vsi_nn_dtype_t)); + input_tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + /* create internal nodes */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_MULTIPLY, 0, 0 ); + curr->node->nn_param.multiply.scale = 1.0f; + curr->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + curr->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; + curr->inputs[0] = input; + curr->inputs[1] = weight; + curr->outputs[0] = input_tensor0->t; + vsi_nn_internal_setup_node(self, curr); + input_tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + /* create internal nodes */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + curr->inputs[0] = (*input_fc)->t; + curr->inputs[1] = input_tensor0->t; + curr->outputs[0] = input_tensor1->t; + vsi_nn_internal_setup_node(self, curr); + *input_fc = input_tensor1; +} + +static vsi_bool setup_op_shapes + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_lstmunit_ovxlib_param* p = &self->nn_param.lstmunit_ovxlib; + + /* setup lstmunit output tensors' shape */ + /* output */ + if(VSI_NN_DIM_AUTO == outputs[LSTMUNIT_OUTPUT_OUTPUT]->attr.dim_num) + { + if(p->local->use_projection) /* enable projection_weight */ + { + /* output_size */ + outputs[LSTMUNIT_OUTPUT_OUTPUT]->attr.size[0] = inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr.size[1]; + } + else /* disable projection_weight */ + { + /* num_units */ + outputs[LSTMUNIT_OUTPUT_OUTPUT]->attr.size[0] = inputs[LSTMUNIT_INPUT_WEIGHT_I2F]->attr.size[1]; + } + /* batch_size */ + outputs[LSTMUNIT_OUTPUT_OUTPUT]->attr.size[1] = inputs[LSTMUNIT_INPUT_INPUT]->attr.size[1]; + outputs[LSTMUNIT_OUTPUT_OUTPUT]->attr.dim_num = inputs[LSTMUNIT_INPUT_INPUT]->attr.dim_num; + } + + /* output_state_out */ + if(VSI_NN_DIM_AUTO == outputs[LSTMUNIT_OUTPUT_H_STATE]->attr.dim_num) + { + outputs[LSTMUNIT_OUTPUT_H_STATE]->attr.dim_num = outputs[LSTMUNIT_OUTPUT_OUTPUT]->attr.dim_num; + memcpy( 
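+    /*
+     * create_peephole() above splices the peephole term into a gate: the cell
+     * state is multiplied element-wise by the peephole weight (saturating,
+     * round-to-nearest-even) and the product is added onto that gate's FC
+     * output.  Here in setup_op_shapes() the output width comes from the
+     * projection weight when projection is enabled, otherwise from num_units,
+     * with h_state mirroring the output shape and c_state sized
+     * [num_units, batch].
+     */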
outputs[LSTMUNIT_OUTPUT_H_STATE]->attr.size, outputs[LSTMUNIT_OUTPUT_OUTPUT]->attr.size, + VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + } + + /* cell_state_out */ + if(VSI_NN_DIM_AUTO == outputs[LSTMUNIT_OUTPUT_C_STATE]->attr.dim_num) + { + outputs[LSTMUNIT_OUTPUT_C_STATE]->attr.dim_num = outputs[LSTMUNIT_OUTPUT_OUTPUT]->attr.dim_num; + outputs[LSTMUNIT_OUTPUT_C_STATE]->attr.size[0] = inputs[LSTMUNIT_INPUT_WEIGHT_I2F]->attr.size[1]; + outputs[LSTMUNIT_OUTPUT_C_STATE]->attr.size[1] = inputs[LSTMUNIT_INPUT_INPUT]->attr.size[1]; + } + + return TRUE; +} +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_lstmunit_ovxlib_param* p = &self->nn_param.lstmunit_ovxlib; + vsi_nn_tensor_attr_t attr; + vsi_bool is_input_fc_on_tp = FALSE; + vsi_bool is_recurrent_fc_on_tp = FALSE; + vsi_nn_internal_tensor_t* input_tensor = NULL; + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_nn_internal_tensor_t* tmp_tensor = NULL; + vsi_nn_internal_tensor_t* recurrent_input_tensor = NULL; + vsi_nn_internal_tensor_t* input_fc_outputs[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; + vsi_nn_internal_tensor_t* aux_input_fc_outputs[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; + vsi_nn_internal_tensor_t* input_add_aux_input_fc_outputs[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; + vsi_nn_internal_tensor_t* recurrent_fc_outputs[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; + vsi_nn_internal_tensor_t* layernorm_outputs[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; + vsi_nn_tensor_t* bias_tensors[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; + vsi_nn_tensor_t* zero_bias_tensor = NULL; + vsi_nn_internal_node_t* curr = NULL; + int32_t ifco_start_index = 0; + uint32_t kernel_h = 1; + uint32_t kernel_w = 1; + int32_t i = 0; + vsi_bool use_virtual_tensor = TRUE; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_node_wksp( self ); + + memset( p->local, 0x00, sizeof(vsi_nn_lstmunit_ovxlib_lcl_data_t) ); + memset( &attr, 0x00, sizeof( attr ) ); + p->local->use_cifg = ( NULL == inputs[LSTMUNIT_INPUT_WEIGHT_I2I] ); + p->local->use_layer_norm = ( NULL != inputs[LSTMUNIT_INPUT_LAYERNORM_F] ); + p->local->use_projection = ( NULL != inputs[LSTMUNIT_INPUT_WEIGHT_PROJ] ); + p->local->use_projection_bias = FALSE;//NULL != inputs[19]; + p->local->multi_batch = ( inputs[LSTMUNIT_INPUT_INPUT]->attr.size[1] > 1 ); + p->local->use_peephole = ( NULL != inputs[LSTMUNIT_INPUT_WEIGHT_C2O] ); + ifco_start_index = p->local->use_cifg ? 
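+    /*
+     * Feature flags are inferred from which inputs are present: use_cifg when
+     * the I2I weight is missing, use_layer_norm / use_projection /
+     * use_peephole from the corresponding optional weights, multi_batch when
+     * the input batch dimension exceeds 1, and use_hybrid when the I2F weight
+     * and its bias use different quantization types.  ifco_start_index skips
+     * the input gate (index 0) whenever CIFG is active.
+     */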
1 : 0; + if( inputs[LSTMUNIT_INPUT_WEIGHT_I2F]->attr.dtype.qnt_type + != inputs[LSTMUNIT_INPUT_BIAS_F]->attr.dtype.qnt_type ) + { + p->local->use_hybrid = TRUE; + } + + if( inputs[LSTMUNIT_INPUT_INPUT]->attr.dtype.qnt_type + != inputs[LSTMUNIT_INPUT_WEIGHT_I2F]->attr.dtype.qnt_type) + { + /* input and input weights have different qtype, only TP can do this operation */ + is_input_fc_on_tp = TRUE; + } + else if( inputs[LSTMUNIT_INPUT_INPUT]->attr.size[0] % 64 != 0 ) + { + /* NN performs bad if input's shape is not aligned to 64-byte */ + is_input_fc_on_tp = TRUE; + } + + if( inputs[LSTMUNIT_INPUT_H_STATE]->attr.dtype.qnt_type + != inputs[LSTMUNIT_INPUT_WEIGHT_R2F]->attr.dtype.qnt_type) + { + /* recurrent and recurrent weights have different qtype, only TP can do this operation */ + is_recurrent_fc_on_tp = TRUE; + } + else if( inputs[LSTMUNIT_INPUT_H_STATE]->attr.size[0] % 64 != 0 ) + { + /* NN performs bad if inputs' shape is not aligned to 64-byte */ + is_recurrent_fc_on_tp = TRUE; + } + + /* if both input fc and recurrent fc could be executed on NN, offloads one to TP*/ + if( !is_input_fc_on_tp && !is_recurrent_fc_on_tp ) + { + is_input_fc_on_tp = TRUE; + } + + setup_op_shapes(self, inputs, outputs); + + for( i = 0; i < LSTMUNIT_IFCO_GATE_COUNT; i++) + { + if( p->local->use_layer_norm || p->local->use_hybrid ) + { + bias_tensors[i] = NULL; + } + else + { + bias_tensors[i] = inputs[LSTMUNIT_INPUT_BIAS_I + i]; + } + } + + /* Input FC */ + if( is_input_fc_on_tp ) + { + /* tp */ + for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++) + { + input_fc_outputs[i] = create_tp_fc(self, + inputs[LSTMUNIT_INPUT_INPUT], + inputs[LSTMUNIT_INPUT_WEIGHT_I2I + i], + bias_tensors[i], + &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i], + use_virtual_tensor); + } + if (inputs[LSTMUNIT_INPUT_AUX_INPUT] != NULL) + { + for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++) + { + aux_input_fc_outputs[i] = create_tp_fc(self, + inputs[LSTMUNIT_INPUT_AUX_INPUT], + inputs[LSTMUNIT_INPUT_AUX_WEIGHT_I2I + i], + NULL, + &p->internal_dtype_aux[LSTMUNIT_QUANTIZE_PARAM_AUX_I2I + i], + use_virtual_tensor); + } + } + } + else + { + /* reshape and transpose input */ + vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, + inputs[LSTMUNIT_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); + input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[LSTMUNIT_INPUT_INPUT], + p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + + for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++) + { + vsi_nn_internal_tensor_t* tmp = create_nn_fc(self, + input_tensor->t, + inputs[LSTMUNIT_INPUT_WEIGHT_I2I + i], + bias_tensors[i], + kernel_h, kernel_w, + &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i], + use_virtual_tensor); + /* transpose and reshape output */ + input_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self, + tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + } + if (inputs[LSTMUNIT_INPUT_AUX_INPUT] != NULL) + { + /* reshape and transpose input */ + vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, + inputs[LSTMUNIT_INPUT_AUX_INPUT]->attr.size[0], &kernel_h, &kernel_w); + input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[LSTMUNIT_INPUT_AUX_INPUT], + p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + + for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++) + { + vsi_nn_internal_tensor_t* tmp = create_nn_fc(self, + input_tensor->t, + inputs[LSTMUNIT_INPUT_AUX_WEIGHT_I2I + i], + NULL, + kernel_h, kernel_w, + 
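+            /*
+             * FC placement heuristic: a gate FC is routed to the TP path when
+             * its input and weight quantization types differ, or when the
+             * input's innermost dimension is not 64-aligned (the NN engine
+             * performs poorly on such shapes).  If both the input FC and the
+             * recurrent FC could run on NN, the input FC is offloaded to TP.
+             */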
&p->internal_dtype_aux[LSTMUNIT_QUANTIZE_PARAM_AUX_I2I + i], + use_virtual_tensor); + /* transpose and reshape output */ + aux_input_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self, + tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + } + } + } + + if (inputs[LSTMUNIT_INPUT_AUX_INPUT] != NULL) + { + for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++) + { + input_add_aux_input_fc_outputs[i] = vsi_nn_rnn_create_tensor_add(self, + input_fc_outputs[i]->t, + aux_input_fc_outputs[i]->t, + &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I], + use_virtual_tensor); + input_fc_outputs[i] = input_add_aux_input_fc_outputs[i]; + } + } + + /* Recurrent FC */ + if( is_recurrent_fc_on_tp ) + { + for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++) + { + recurrent_fc_outputs[i] = create_tp_fc(self, + inputs[LSTMUNIT_INPUT_H_STATE], + inputs[LSTMUNIT_INPUT_WEIGHT_R2I + i], + NULL, + &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_R2I + i], + use_virtual_tensor); + } + } + else + { + /* reshape and transpose input */ + vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, + inputs[LSTMUNIT_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w); + recurrent_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, + inputs[LSTMUNIT_INPUT_H_STATE], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + + for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++) + { + vsi_nn_internal_tensor_t* tmp = create_nn_fc(self, + recurrent_input_tensor->t, + inputs[LSTMUNIT_INPUT_WEIGHT_R2I + i], + NULL, + kernel_h, kernel_w, + &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_R2I + i], + use_virtual_tensor); + /* transpose and reshape output */ + recurrent_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self, + tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + } + } + + if (p->local->use_peephole) + { + /* update input gate */ + if (!p->local->use_cifg) + { + create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE], + inputs[LSTMUNIT_INPUT_WEIGHT_C2I], &(input_fc_outputs[0]), + use_virtual_tensor); + } + + /* update forget gate */ + create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE], + inputs[LSTMUNIT_INPUT_WEIGHT_C2F], &(input_fc_outputs[1]), + use_virtual_tensor); + + /* update output gate */ + create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE], + inputs[LSTMUNIT_INPUT_WEIGHT_C2O], &(input_fc_outputs[3]), + use_virtual_tensor); + } + + /* layernorm */ + if( p->local->use_layer_norm ) + { + for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ ) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + /* create internal nodes */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_TENSOR_ADD_MEAN_STDDEV_NORM, 0, 0 ); + curr->node->nn_param.tensor_add_mean_stddev_norm.eps = (float)1e-8; + curr->inputs[0] = input_fc_outputs[i]->t; + curr->inputs[1] = recurrent_fc_outputs[i]->t; + curr->outputs[0] = input_tensor->t; + vsi_nn_internal_setup_node(self, curr); + + layernorm_outputs[i] = input_tensor; + } + } + + /* activations */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_ACTIVATION, 0, 0 ); + curr->node->nn_param.lstmunit_activation.cell_clip = p->cell_clip; + curr->node->nn_param.lstmunit_activation.proj_clip = p->proj_clip; + 
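+    /*
+     * The per-gate results prepared above (input FC, recurrent FC and, when
+     * enabled, the layer-norm tensors) are consumed by this single fused
+     * LSTMUNIT_ACTIVATION kernel. The parameters set here configure the usual
+     * LSTM cell update, which this file assumes rather than spells out; as a
+     * sketch, with x_*/h_* standing for the per-gate input/recurrent FC results:
+     *
+     *   f  = recurrent_activation(x_f + h_f + forget_bias)
+     *   i  = recurrent_activation(x_i + h_i)      (1 - f when CIFG is used)
+     *   g  = tanh(x_c + h_c)
+     *   c' = f * c + i * g                        (clipped when cell_clip > 0)
+     *   o  = recurrent_activation(x_o + h_o)
+     *   h' = o * tanh(c')                         (projection and proj_clip, if
+     *                                              enabled, are applied afterwards)
+     */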
curr->node->nn_param.lstmunit_activation.forget_bias = p->forget_bias; + curr->node->nn_param.lstmunit_activation.is_cifg = (uint8_t)p->local->use_cifg; + curr->node->nn_param.lstmunit_activation.is_projection = (uint8_t)p->local->use_projection; + curr->node->nn_param.lstmunit_activation.is_layer_norm = (uint8_t)p->local->use_layer_norm; + curr->node->nn_param.lstmunit_activation.is_peephole = FALSE; + curr->node->nn_param.lstmunit_activation.is_hybrid = (uint8_t)p->local->use_hybrid; + curr->node->nn_param.lstmunit_activation.recurrent_activation = p->recurrent_activation; + + curr->inputs[LSTMUNIT_ACT_CSTATE_IN] = inputs[LSTMUNIT_INPUT_C_STATE]; + for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ ) + { + if( p->local->use_layer_norm || p->local->use_hybrid ) + { + curr->inputs[LSTMUNIT_ACT_DATA_BI + i] = inputs[LSTMUNIT_INPUT_BIAS_I + i]; + } + + if( p->local->use_layer_norm ) + { + /* Pass layernorm weights to VSI_NN_OP_LSTMUNIT_ACTIVATION */ + curr->inputs[LSTMUNIT_ACT_LN_WI + i] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i]; + curr->inputs[LSTMUNIT_ACT_INPUT_FC_I + i] = layernorm_outputs[i]->t; + curr->inputs[LSTMUNIT_ACT_HSTATE_FC_I + i] = NULL; + } + else + { + curr->inputs[LSTMUNIT_ACT_LN_WI + i] = NULL; + curr->inputs[LSTMUNIT_ACT_INPUT_FC_I + i] = input_fc_outputs[i]->t; + curr->inputs[LSTMUNIT_ACT_HSTATE_FC_I + i] = recurrent_fc_outputs[i]->t; + } + } + + if( p->local->use_projection ) + { + /* create virtual tensor for activations' output0 */ + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + + if( p->local->multi_batch && + inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + /* projection FC on NN requires quantized input */ + attr.dtype.scale = (float)0.007866097716834601; + attr.dtype.zero_point = 128; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; + attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + } + else + { + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } + output_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + curr->outputs[LSTMUNIT_ACT_OUTPUT] = output_tensor->t; + curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = outputs[LSTMUNIT_OUTPUT_C_STATE]; + curr->outputs[LSTMUNIT_ACT_HSTATE_OUT] = NULL; + } + else + { + /* kernel VSI_NN_OP_LSTMUNIT_ACTIVATION has 3 outputs if no projection layer behind */ + curr->outputs[LSTMUNIT_ACT_OUTPUT] = outputs[LSTMUNIT_OUTPUT_OUTPUT]; + curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = outputs[LSTMUNIT_OUTPUT_C_STATE]; + curr->outputs[LSTMUNIT_ACT_HSTATE_OUT] = outputs[LSTMUNIT_OUTPUT_H_STATE]; + } + vsi_nn_internal_setup_node(self, curr); /* setup for VSI_NN_OP_LSTMUNIT_ACTIVATION */ + + if( p->local->use_projection ) + { + if( p->local->use_hybrid || !p->local->use_projection_bias ) + { + input_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &output_tensor->t->attr, + &inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr); + zero_bias_tensor = input_tensor->t; + } + else + { + zero_bias_tensor = inputs[LSTMUNIT_INPUT_BIAS_PROJ]; + } + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_FCL, 0, 0 ); + curr->node->nn_param.fcl.axis = 0; + curr->node->nn_param.fcl.weights = inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr.size[1]; + + curr->inputs[0] = output_tensor->t; + curr->inputs[1] = inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]; + curr->inputs[2] = zero_bias_tensor; + + tmp_tensor = output_tensor; + + /* Save output to h_state first and copy to output */ + if( 
p->local->use_hybrid && p->local->use_projection_bias ) + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[LSTMUNIT_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + curr->outputs[0] = output_tensor->t; + } + else + { + curr->outputs[0] = outputs[LSTMUNIT_OUTPUT_H_STATE]; + } + + vsi_nn_internal_setup_node(self, curr); + + if( p->local->use_hybrid && p->local->use_projection_bias ) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + curr->inputs[0] = tmp_tensor->t; + curr->inputs[1] = inputs[LSTMUNIT_INPUT_BIAS_PROJ]; + curr->outputs[0] = outputs[LSTMUNIT_OUTPUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); + } + + /* copy h_state to output */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = outputs[LSTMUNIT_OUTPUT_H_STATE]; + curr->outputs[0] = outputs[LSTMUNIT_OUTPUT_OUTPUT]; + vsi_nn_internal_setup_node(self, curr); + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_safe_free(self->nn_param.lstmunit_ovxlib.local); + vsi_nn_safe_free(self->nn_param.lstmunit_ovxlib.internal_dtype_aux); + vsi_nn_internal_deinit_node_wksp( self ); + + return status; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.lstmunit_ovxlib.activation = VSI_NN_ACT_TANH; + self->nn_param.lstmunit_ovxlib.recurrent_activation = VSI_NN_ACT_SIGMOID; + self->nn_param.lstmunit_ovxlib.local = (vsi_nn_lstmunit_ovxlib_lcl_data_t *) + malloc(sizeof(vsi_nn_lstmunit_ovxlib_lcl_data_t)); + memset(self->nn_param.lstmunit_ovxlib.local, 0, + sizeof(vsi_nn_lstmunit_ovxlib_lcl_data_t)); + self->nn_param.lstmunit_ovxlib.internal_dtype_aux = (vsi_nn_dtype_t *) + malloc(sizeof(vsi_nn_dtype_t) * LSTMUNIT_QUANTIZE_PARAM_AUX_COUNT); + memset(self->nn_param.lstmunit_ovxlib.internal_dtype_aux, 0, + sizeof(vsi_nn_dtype_t) * LSTMUNIT_QUANTIZE_PARAM_AUX_COUNT); + + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LSTMUNIT_OVXLIB, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ LSTMUNIT_INPUT_CNT, + /* output_num */ LSTMUNIT_OUTPUT_CNT + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c new file mode 100644 index 0000000..e61783d --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c @@ -0,0 +1,241 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (7) +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + + int32_t transposeA = self->nn_param.matrixmul.transpose[0]; + int32_t transposeB = self->nn_param.matrixmul.transpose[1]; + int32_t adjointA = self->nn_param.matrixmul.adjoint[0]; + int32_t adjointB = self->nn_param.matrixmul.adjoint[1]; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "transposeA", transposeA ); + vsi_nn_kernel_param_add_int32( param, "transposeB", transposeB ); + vsi_nn_kernel_param_add_int32( param, "adjointA", adjointA ); + vsi_nn_kernel_param_add_int32( param, "adjointB", adjointB ); + + n = vsi_nn_kernel_selector( self->graph, "matrixmul", inputs, 2, outputs, 1, param ); + if( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if(param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vx_bool status = TRUE; + + BEGIN_IO_TYPE_DECL(MATRIXMUL, 2, 1) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + END_IO_TYPE_DECL(MATRIXMUL) + if(!VALIDATE_OP_IO_TYPES(MATRIXMUL, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + if (self->nn_param.matrixmul.transpose[0] == FALSE + && self->nn_param.matrixmul.transpose[1] 
== FALSE + && inputs[0]->attr.size[0] != inputs[1]->attr.size[1]) + { + VSILOGE("1st input tensor's size[0] is not equal to 2nd input tensor's size[1]"); + return FALSE; + } + else if (self->nn_param.matrixmul.transpose[0] == TRUE + && self->nn_param.matrixmul.transpose[1] == FALSE + && inputs[0]->attr.size[1] != inputs[1]->attr.size[1]) + { + VSILOGE("1st input tensor's size[1] is not equal to 2nd input tensor's size[1]"); + return FALSE; + } + else if (self->nn_param.matrixmul.transpose[0] == FALSE + && self->nn_param.matrixmul.transpose[1] == TRUE + && inputs[0]->attr.size[0] != inputs[1]->attr.size[0]) + { + VSILOGE("1st input tensor's size[0] is not equal to 2nd input tensor's size[0]"); + return FALSE; + } + + if(inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2 + && inputs[0]->attr.size[2] != 1 && inputs[1]->attr.size[2] != 1 + && inputs[0]->attr.size[2] != inputs[1]->attr.size[2]) + { + VSILOGE("illegal inputs shape"); + return FALSE; + } + + return status; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i = 0; + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = vsi_nn_max(inputs[0]->attr.dim_num, inputs[1]->attr.dim_num); + + if (node->nn_param.matrixmul.transpose[0] == FALSE + && node->nn_param.matrixmul.transpose[1] == FALSE) + { + outputs[0]->attr.size[0] = inputs[1]->attr.size[0]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; + } + else if (node->nn_param.matrixmul.transpose[0] == TRUE + && node->nn_param.matrixmul.transpose[1] == FALSE) + { + outputs[0]->attr.size[0] = inputs[1]->attr.size[0]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[0]; + } + else if (node->nn_param.matrixmul.transpose[0] == FALSE + && node->nn_param.matrixmul.transpose[1] == TRUE) + { + outputs[0]->attr.size[0] = inputs[1]->attr.size[1]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; + } + else + { + VSILOGE("Not support transpose A and B both TRUE!(MATRIXMUL) at [%s : %d]\n", __FILE__, __LINE__); + return FALSE; + } + + if(inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) + { + for (i = 2; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + else if(inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) + { + for (i = 2; i < inputs[1]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; + } + } + else if(inputs[0]->attr.size[2] >= inputs[1]->attr.size[2]) + { + for (i = 2; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + else + { + for (i = 2; i < inputs[1]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; + } + } + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ MATRIXMUL, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 2, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c new file mode 100644 index 0000000..985f2da --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c @@ -0,0 +1,209 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of 
charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM 1 +#define _OUTPUT_NUM 2 + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + int32_t* axis = self->nn_param.moments.axis; + int32_t axis_num = self->nn_param.moments.axis_num; + int32_t keep_dim = self->nn_param.moments.keep_dim ? 1 : 0; + + param =vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_buffer( param, "axis", axis, axis_num); + vsi_nn_kernel_param_add_int32( param, "keep_dim", keep_dim); + n = vsi_nn_kernel_selector( self->graph, "moments", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + if( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if(param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(MOMENTS, 1, 2) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_F32, D_F32) + END_IO_TYPE_DECL(MOMENTS) + if(!VALIDATE_OP_IO_TYPES(MOMENTS, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
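(The rule implemented below: with keep_dim the reduced axes collapse to 1, otherwise they are removed, and only axes 0..2 are accepted. For example, an input of shape (32, 16, 8) with axis = {0, 1} yields mean/variance shapes (1, 1, 8) when keep_dim is set and (8) when it is not.)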
*/ + int32_t i = 0, j = 0; + vsi_nn_moments_param * p = NULL; + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + int32_t* axis = NULL; + int32_t axis_num = 0; + p = &(self->nn_param.moments); + axis = p->axis; + axis_num = p->axis_num; + + for(i = 0; i < axis_num; i++) + { + if(axis[i] > 2) + { + return FALSE; + } + } + + if(p->keep_dim) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[1]->attr.dim_num = inputs[0]->attr.dim_num; + + for (i = 0; i < (int32_t)inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + outputs[1]->attr.size[i] = inputs[0]->attr.size[i]; + } + switch(axis_num) + { + case 1: + outputs[0]->attr.size[axis[0]] = 1; + outputs[1]->attr.size[axis[0]] = 1; + break; + case 2: + outputs[0]->attr.size[axis[0]] = 1; + outputs[0]->attr.size[axis[1]] = 1; + outputs[1]->attr.size[axis[0]] = 1; + outputs[1]->attr.size[axis[1]] = 1; + break; + case 3: + outputs[0]->attr.size[axis[0]] = 1; + outputs[0]->attr.size[axis[1]] = 1; + outputs[0]->attr.size[axis[2]] = 1; + outputs[1]->attr.size[axis[0]] = 1; + outputs[1]->attr.size[axis[1]] = 1; + outputs[1]->attr.size[axis[2]] = 1; + break; + default: + return FALSE; + } + } + else + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num - axis_num; + outputs[1]->attr.dim_num = inputs[0]->attr.dim_num - axis_num; + + for (i = 0; i < axis[0]; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + outputs[1]->attr.size[i] = inputs[0]->attr.size[i]; + } + + for (j = axis[0] + axis_num; j < (int32_t)inputs[0]->attr.dim_num; j++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[j]; + outputs[1]->attr.size[i++] = inputs[0]->attr.size[j]; + } + } + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ MOMENTS, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c new file mode 100644 index 0000000..28602c7 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c @@ -0,0 +1,176 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_log.h" + +static const char *_get_vx_nbg_type + ( + vsi_nn_nbg_type_e type + ) +{ + switch (type) + { + case VSI_NN_NBG_FILE: + return VX_VIVANTE_IMPORT_KERNEL_FROM_FILE; + case VSI_NN_NBG_FOLDER: + return VX_VIVANTE_IMPORT_KERNEL_FROM_FOLDER; + case VSI_NN_NBG_LABEL: + return VX_VIVANTE_IMPORT_KERNEL_FROM_LABEL; + case VSI_NN_NBG_POINTER: + return VX_VIVANTE_IMPORT_KERNEL_FROM_POINTER; + default: + VSILOGE("error nbg type %d", type); + return NULL; + } +} + +static void _set_io_index + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t idx,i; + + idx = 0; + for(i = 0; i < self->input.num; i++) + { + vxSetParameterByIndex(self->n, idx++, (vx_reference)inputs[i]->t); + } + for(i = 0; i < self->output.num; i++) + { + vxSetParameterByIndex(self->n, idx++, (vx_reference)outputs[i]->t); + } +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vx_node node; + vx_kernel kernel; + + status = VSI_FAILURE; + kernel = NULL; + kernel = vxImportKernelFromURL( + self->graph->ctx->c, + _get_vx_nbg_type(self->nn_param.nbg.type), + self->nn_param.nbg.url + ); + if(NULL == kernel) + { + return status; + } + self->nn_param.nbg.local.kernel = kernel; + + node = NULL; + node = vxCreateGenericNode( + self->graph->g, + self->nn_param.nbg.local.kernel + ); + if(NULL == node) + { + vxReleaseKernel(&kernel); + return status; + } + + self->nn_param.nbg.local.kernel = kernel; + self->n = node; + _set_io_index(self, inputs, outputs); + status = VSI_SUCCESS; + + return status; +} + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* + * Network Binary Graph node do not need to calculate output shape + */ + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vx_kernel kernel; + + kernel = self->nn_param.nbg.local.kernel; + if(kernel) + { + vxReleaseKernel(&kernel); + kernel = self->nn_param.nbg.local.kernel = NULL; + } + vsi_nn_op_common_deinit(self); + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ NBG, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 10, + /* output_num */ 10 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c new file mode 100644 index 0000000..cbd71ed --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c @@ -0,0 +1,142 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* 
Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + int i; + for( i = 0; i < 10; i ++ ) + { + if( NULL == outputs[i] ) + { + break; + } + if( NULL != outputs[i]->t ) + { + continue; + } + outputs[i]->t = inputs[0]->t; + } + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + int i; + for( i = 0; i < 10; i ++ ) + { + if( NULL == outputs[i] ) + { + break; + } + if( outputs[i]->attr.vtl != inputs[0]->attr.vtl ) + { + VSILOGW( "The tensor virtual attr changed in %#x op.", node->op ); + } + if( outputs[i]->attr.is_const != inputs[0]->attr.is_const ) + { + VSILOGW( "The tensor const attr changed in %#x op.", node->op ); + } + if( VSI_NN_DIM_AUTO == outputs[i]->attr.dim_num ) + { + if( NULL != outputs[i]->t ) + { + if( NULL == inputs[0]->t ) + { + memcpy( inputs[0], outputs[i], sizeof( vsi_nn_tensor_t ) ); + } + else + { + VSILOGE( "Invalid NOOP tensors." 
); + vxReleaseTensor( &outputs[i]->t ); + memcpy( outputs[i], inputs[0], sizeof( vsi_nn_tensor_t ) ); + } + } + } + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + self->n = NULL; + vsi_nn_InitTensorsId( self->input.tensors, self->input.num ); + vsi_nn_InitTensorsId( self->output.tensors, self->output.num ); + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ NOOP, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 10 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c new file mode 100644 index 0000000..2ae7605 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c @@ -0,0 +1,248 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_constraint_check.h" + +vsi_status vsi_nn_InitPadParameter + ( + vsi_nn_node_t * node, + vx_nn_pad_params_t * param + ) +{ + int32_t pad_const_val; + uint8_t i; + if(NULL == node || NULL == param) + { + VSILOGE("Set param fail\n"); + return VSI_FAILURE; + } + + memset(param, 0, sizeof(vx_nn_pad_params_t)); + pad_const_val = node->nn_param.pad.const_val; + param->pad_mode = node->nn_param.pad.mode; + param->pad_const = vxCreateScalar( node->graph->ctx->c, VX_TYPE_INT32, &pad_const_val ); + if( NULL == param->pad_const ) + { + VSILOGE("Create scalar fail\n"); + return VSI_FAILURE; + } + switch (param->pad_mode) + { + case VSI_NN_PAD_MODE_CONSTANT: + param->pad_mode = VX_PAD_CONSTANT; + break; + case VSI_NN_PAD_MODE_REPLICATE: + param->pad_mode = VX_PAD_REPLICATE; + break; + case VSI_NN_PAD_MODE_SYMMETRIC: + param->pad_mode = VX_PAD_MIRROR_SYMMETRIC; + break; + case VSI_NN_PAD_MODE_REFLECT: + param->pad_mode = VX_PAD_MIRROR_REFLECT; + break; + default: + VSILOGE("Wrong pad_mode value"); + break; + } + + /* + * work around(TODO): + * driver only support pad 2 dimensions + */ + param->numViewDimensions = vsi_nn_max(node->nn_param.pad.dim_num, 2); + param->pad_front_array = (int32_t *)malloc(sizeof(int32_t) * param->numViewDimensions); + param->pad_back_array = (int32_t *)malloc(sizeof(int32_t) * param->numViewDimensions); + memset(param->pad_front_array, 0, sizeof(int32_t) * param->numViewDimensions); + memset(param->pad_back_array, 0, sizeof(int32_t) * param->numViewDimensions); + for(i=0; i < vsi_nn_min(param->numViewDimensions, node->nn_param.pad.dim_num); i++) + { + param->pad_front_array[i] = (int32_t)node->nn_param.pad.front_size[i]; + param->pad_back_array[i] = (int32_t)node->nn_param.pad.back_size[i]; + } + + return VSI_SUCCESS; +} /* vsi_nn_InitPadParameter() */ + +void vsi_nn_DeinitPadParameter + ( + vx_nn_pad_params_t * param + ) +{ + if( NULL != param ) + { + if( NULL != param->pad_const ) + { + vxReleaseScalar( ¶m->pad_const ); + } + if( NULL != param->pad_front_array ) + { + free( param->pad_front_array ); + } + if( NULL != param->pad_back_array ) + { + free( param->pad_back_array ); + } + } +} /* vsi_nn_DeinitPadParameter() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vx_nn_pad_params_t p; + + status = VSI_FAILURE; + if(VSI_SUCCESS != vsi_nn_InitPadParameter(self, &p)) + { + VSILOGE("Set Pad Layer Parameter fail\n"); + return VSI_FAILURE; + } + + self->n = vxTensorPadNode( + self->graph->g, + inputs[0]->t, + outputs[0]->t, + &p, + sizeof(p) + ); + + vsi_nn_DeinitPadParameter(&p); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(PAD, 1, 1) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + 
END_IO_TYPE_DECL(PAD) + if (!VALIDATE_OP_IO_TYPES(PAD, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + if(self->nn_param.pad.dim_num != inputs[0]->attr.dim_num + && self->nn_param.pad.dim_num != 0 ) + { + VSILOGE("Error:input tensor dim should be equal with pad's."); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + if(self->nn_param.pad.dim_num == 0) + { + self->nn_param.pad.dim_num = (uint8_t)inputs[0]->attr.dim_num; + } + if(VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) + { + for(i=0; inn_param.pad.dim_num; i++) + { + uint32_t front = self->nn_param.pad.front_size[i]; + uint32_t back = self->nn_param.pad.back_size[i]; + outputs[0]->attr.size[i] = inputs[0]->attr.size[i] + front + back; + } + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + else + { + for(i=0; inn_param.pad.dim_num; i++) + { + uint32_t front = self->nn_param.pad.front_size[i]; + uint32_t back = self->nn_param.pad.back_size[i]; + + if (front + back + inputs[0]->attr.size[i] != outputs[0]->attr.size[i]) + { + VSILOGE("Error:output shape[%u] not equal front padding[%u] + input shape[%u] + back padding[%u]", + outputs[0]->attr.size[i], front, back); + return FALSE; + } + } + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif + /* Registrar */ + DEF_OP_REG + ( + /* op_name */ PAD, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c new file mode 100644 index 0000000..cd0a9db --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c @@ -0,0 +1,302 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_bool _is_same_memory_shape + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t input_dims[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t perm_dims[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t i = 0; + uint32_t idx = 0; + uint32_t dim_num0 = inputs[0]->attr.dim_num; + uint32_t dim_num1 = self->nn_param.permute.dim_num; + + if (dim_num0 != dim_num1) + return FALSE; + + /********squeeze tensor shape*******/ + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + if (inputs[0]->attr.size[i] == 1) + { + dim_num0 --; + } + else + { + input_dims[idx++] = i; + } + } + + for (i = 0, idx = 0; i < self->nn_param.permute.dim_num; i++) + { + uint32_t d = self->nn_param.permute.perm[i]; + + if (inputs[0]->attr.size[d] == 1) + { + dim_num1 --; + } + else + { + perm_dims[idx++] = d; + } + } + + if (dim_num0 != dim_num1) + return FALSE; + + for (i = 0; i < dim_num0; i++) + { + if (input_dims[i] != perm_dims[i]) + return FALSE; + } + + return TRUE; +} /* _is_same_memory_shape */ + +static vsi_bool _is_same_quant + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_dtype_t *dtype,*_dtype; + + dtype = &inputs[0]->attr.dtype; + _dtype = &outputs[0]->attr.dtype; + + if(vsi_nn_DtypeCompare(dtype, _dtype) == FALSE) + { + return FALSE; + } + + return TRUE; +} /* _is_same_quant */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + uint32_t perm[VSI_NN_MAX_DIM_NUM] = {0}; + status = VSI_SUCCESS; + + if (self->nn_param.permute.local.initialized == FALSE) + { + memcpy(perm, self->nn_param.permute.perm, + sizeof(uint32_t) * self->nn_param.permute.dim_num); + self->n = vxTensorPermuteNode( + self->graph->g, + inputs[0]->t, + outputs[0]->t, + perm, + self->nn_param.permute.dim_num + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(PERMUTE, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_BOOL8, D_BOOL8) + IO_TYPE(D_BOOL8, D_I8|Q_DFP) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_BF16) + END_IO_TYPE_DECL(PERMUTE) + if (!VALIDATE_OP_IO_TYPES(PERMUTE, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret; + uint32_t i; + uint32_t axis; + + if( 
self->nn_param.permute.dim_num != inputs[0]->attr.dim_num ) + { + VSILOGE( "Error permute dims '%u' vs '%u' ", + self->nn_param.permute.dim_num, inputs[0]->attr.dim_num ); + return FALSE; + } + + ret = TRUE; + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for( i = 0; i < self->nn_param.permute.dim_num; i ++ ) + { + axis = self->nn_param.permute.perm[i]; + if( axis >= inputs[0]->attr.dim_num ) + { + VSILOGE( "Error permute axis '%u', the dim is '%u' ", + axis, inputs[0]->attr.dim_num ); + ret = FALSE; + break; + } + outputs[0]->attr.size[i] = inputs[0]->attr.size[axis]; + } + } + + return ret; +} /* op_setup() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_status status; + uint32_t shape[VSI_NN_MAX_DIM_NUM]; + uint32_t i = 0; + + status = VSI_SUCCESS; + + if (_is_same_memory_shape(self, inputs, outputs) == FALSE || + _is_same_quant(self, inputs, outputs) == FALSE || + (inputs[0]->t != NULL && outputs[0]->t != NULL)) + { + return status; + } + + VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + + for (i = 0; i < self->nn_param.permute.dim_num; i++) + { + shape[i] = inputs[0]->attr.size[self->nn_param.permute.perm[i]]; + } + + if( direction == VSI_NN_OPTIMIZE_BACKWARD ) + { + if(NULL == inputs[0]->t && NULL != outputs[0]->t) + { + inputs[0]->t = vxReshapeTensor( outputs[0]->t, + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num ); + if( inputs[0]->t == NULL ) + { + status = VSI_FAILURE; + } + self->nn_param.permute.local.initialized = TRUE; + } + } + else + { + if(NULL == outputs[0]->t) + { + vsi_bool ret; + ret = vsi_nn_ReshapeTensor( self->graph, inputs[0], outputs[0], + shape, self->nn_param.permute.dim_num ); + if( ret == FALSE ) + { + status = VSI_FAILURE; + } + self->nn_param.permute.local.initialized = TRUE; + } + } + + //vsi_nn_ReshapeTensor(self->graph, inputs[0], outputs[0], shape, self->nn_param.permute.dim_num); + + return status; +} /* op_optimize() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ PERMUTE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c new file mode 100644 index 0000000..5ec4a6c --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c @@ -0,0 +1,193 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vx_nn_pooling_params_ext_t params; + status = VSI_FAILURE; + + memset( ¶ms, 0, sizeof( params ) ); + params.base.pool_type = self->nn_param.pool.type; + params.base.pool_size_x = self->nn_param.pool.ksize[0]; + params.base.pool_size_y = self->nn_param.pool.ksize[1]; + params.base.pool_pad_x_left = self->nn_param.pool.pad[0]; + params.base.pool_pad_x_right = self->nn_param.pool.pad[1]; + params.base.pool_pad_y_top = self->nn_param.pool.pad[2]; + params.base.pool_pad_y_bottom = self->nn_param.pool.pad[3]; + params.base.rounding = self->vx_param.down_scale_size_rounding; + params.stride_x = self->nn_param.pool.stride[0]; + params.stride_y = self->nn_param.pool.stride[1]; + + self->n = vxPoolingLayer2( + self->graph->g, + inputs[0]->t, + (vx_nn_pooling_params_t *)¶ms, + sizeof( params ), + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(POOL, 1, 1) + /* IO_TYPE(INPUT, OUTPUT) */ + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_BF16) + + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(POOL) + if(!VALIDATE_OP_IO_TYPES(POOL, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret; + + ret = TRUE; + vsi_nn_compute_padding( + inputs[0]->attr.size, + self->nn_param.pool.ksize, + self->nn_param.pool.stride, + NULL, + self->nn_param.pool.pad_type, + self->nn_param.pool.pad + ); + + /* Pooling */ + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + 
self->nn_param.pool.ksize[0], + &self->nn_param.pool.pad[0], + self->nn_param.pool.stride[0], + 0, + self->nn_param.pool.round_type + ); + outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[1], + self->nn_param.pool.ksize[1], + &self->nn_param.pool.pad[2], + self->nn_param.pool.stride[1], + 0, + self->nn_param.pool.round_type + ); + + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + if( NULL != outputs[1] ) + { + outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; + memcpy( outputs[1]->attr.size, outputs[0]->attr.size, + VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + } + + return ret; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ POOL, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c new file mode 100644 index 0000000..2f88825 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c @@ -0,0 +1,296 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (2) + +static vsi_bool vsi_nn_poolwithargmax_optimize_shape + ( + vsi_nn_node_t * self, + const int32_t* shape_in, const int32_t* shape_out0, + const int32_t* shape_out1, const size_t rank_in, + int32_t* out_shape_input, int32_t* out_shape_output0, + int32_t* out_shape_output1, uint32_t* out_rank_output + ) +{ + vsi_bool enable_image_2d = FALSE; + int32_t hwLitimLen = 65536; + + if ((2 == self->nn_param.pool.ksize[1]) + && (2 == self->nn_param.pool.stride[1]) + && ((shape_in[1] % 2 == 0) || (shape_in[2] == 1))) + { + if (rank_in < 3) + { + enable_image_2d = TRUE; + } + else + { + enable_image_2d = (vsi_bool)(shape_in[1] * shape_in[2] < hwLitimLen); + } + } + + if( rank_in == 1 ) + { + out_shape_input[0] = shape_in[0]; + out_shape_input[1] = 1; + out_shape_input[2] = 1; + out_shape_output0[0] = shape_out0[0]; + out_shape_output0[1] = 1; + out_shape_output0[2] = 1; + out_shape_output1[0] = shape_out1[0]; + out_shape_output1[1] = 1; + out_shape_output1[2] = 1; + *out_rank_output = 2; + } + else if(rank_in == 3 && enable_image_2d) + { + out_shape_input[0] = shape_in[0]; + out_shape_input[1] = shape_in[1] * shape_in[2]; + out_shape_input[2] = 1; + out_shape_output0[0] = shape_out0[0]; + out_shape_output0[1] = shape_out0[1] * shape_out0[2]; + out_shape_output0[2] = 1; + out_shape_output1[0] = shape_out1[0]; + out_shape_output1[1] = shape_out1[1] * shape_out1[2]; + out_shape_output1[2] = 1; + *out_rank_output = 2; + } + else if(rank_in == 4 && enable_image_2d) + { + out_shape_input[0] = shape_in[0]; + out_shape_input[1] = shape_in[1] * shape_in[2]; + out_shape_input[2] = 1; + out_shape_input[3] = shape_in[3]; + out_shape_output0[0] = shape_out0[0]; + out_shape_output0[1] = shape_out0[1] * shape_out0[2]; + out_shape_output0[2] = 1; + out_shape_output0[3] = shape_out0[3]; + out_shape_output1[0] = shape_out1[0]; + out_shape_output1[1] = shape_out1[1] * shape_out1[2]; + out_shape_output1[2] = 1; + out_shape_output1[3] = shape_out1[3]; + *out_rank_output = 4; + } + else + { + uint32_t i; + for (i = 0; i < rank_in; i++) + { + out_shape_input[i] = shape_in[i]; + out_shape_output0[i] = shape_out0[i]; + out_shape_output1[i] = shape_out1[i]; + } + *out_rank_output = (uint32_t)rank_in; + } + + return TRUE; +} + + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + uint32_t new_rank = 0; + vsi_bool ret; + vsi_nn_kernel_param_t * param = NULL; + int32_t ksize_x = (int32_t)self->nn_param.pool.ksize[0]; + int32_t ksize_y = (int32_t)self->nn_param.pool.ksize[1]; + int32_t stride_x = (int32_t)self->nn_param.pool.stride[0]; + int32_t stride_y = (int32_t)self->nn_param.pool.stride[1]; + int32_t pad_x = (int32_t)self->nn_param.pool.pad[0]; + int32_t pad_y = (int32_t)self->nn_param.pool.pad[2]; + + if( NULL == self ) + { + return VSI_FAILURE; + } + + param 
=vsi_nn_kernel_param_create(); + + ret = vsi_nn_poolwithargmax_optimize_shape(self, + (int32_t *)inputs[0]->attr.size, (int32_t *)outputs[0]->attr.size, + (int32_t *)outputs[1]->attr.size, inputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + vsi_nn_kernel_param_add_int32( param, "ksize_x", ksize_x ); + vsi_nn_kernel_param_add_int32( param, "ksize_y", ksize_y ); + vsi_nn_kernel_param_add_int32( param, "stride_x", stride_x ); + vsi_nn_kernel_param_add_int32( param, "stride_y", stride_y ); + vsi_nn_kernel_param_add_int32( param, "pad_x", pad_x ); + vsi_nn_kernel_param_add_int32( param, "pad_y", pad_y ); + + if( ret ) + { + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[1], (uint32_t*)shapes[2], new_rank ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "poolwithargmax", + &reshape_tensors[0], _INPUT_NUM, + &reshape_tensors[1], _OUTPUT_NUM, param ); + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + vsi_nn_ReleaseTensor( &reshape_tensors[2] ); + } + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; + +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(POOLWITHARGMAX, 1, 2) + IO_TYPE(D_F16, D_F16, D_U8) + IO_TYPE(D_F16, D_I16|Q_DFP, D_U8) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_I16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8) + IO_TYPE(D_I8|Q_DFP, D_F16, D_U8) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_U8) + IO_TYPE(D_I16|Q_DFP, D_F16, D_U8) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16) + IO_TYPE(D_F32, D_F32, D_U8) + IO_TYPE(D_F32, D_U8|Q_ASYM, D_U8) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_U8) + IO_TYPE(D_I32, D_I32, D_U8) + + /* HW 9.0 */ + IO_TYPE(D_BF16, D_BF16, D_U8) + END_IO_TYPE_DECL(POOLWITHARGMAX) + if(!VALIDATE_OP_IO_TYPES(POOLWITHARGMAX, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + if (VX_CONVOLUTIONAL_NETWORK_POOLING_MAX != self->nn_param.pool.type) + { + VSILOGE("Unsupported pool type.\n"); + return FALSE; + } + + return TRUE; + +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + ret = vsi_nn_OpSetup( VSI_NN_OP_POOL, self, inputs, outputs ); + } + + return ret; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + uint32_t i; + + for (i = 0; i < _VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.pool.local.local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.pool.local.local_tensor[i])); + self->nn_param.pool.local.local_tensor[i] = NULL; + } + } + + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ POOLWITHARGMAX, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ 
op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c new file mode 100644 index 0000000..a50c1b3 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c @@ -0,0 +1,239 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "vsi_nn_internal_node.h" + +extern vx_kernel_description_t * vx_kernel_POST_PROCESS_list[]; + + +static vsi_bool _is_same_type + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if(vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) + { + return FALSE; + } + + return TRUE; +} /* _is_same_quant */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret; + uint32_t i; + uint32_t axis; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_bool use_virtual_tensor = TRUE; + + vsi_nn_internal_init_node_wksp( self ); + + if( self->nn_param.post_process.dim_num != inputs[0]->attr.dim_num ) + { + VSILOGE( "Error permute dims '%u' vs '%u' ", + self->nn_param.permute.dim_num, inputs[0]->attr.dim_num ); + return FALSE; + } + + ret = TRUE; + /* output */ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for( i = 0; i < self->nn_param.post_process.dim_num; i ++ ) + { + axis = self->nn_param.post_process.perm[i]; + if( axis >= inputs[0]->attr.dim_num ) + { + VSILOGE( "Error permute axis '%u', the dim is '%u' ", + axis, inputs[0]->attr.dim_num ); + ret = FALSE; + break; + } + outputs[0]->attr.size[i] = inputs[0]->attr.size[axis]; + } + } + + for (i = 0; i < self->nn_param.post_process.dim_num; i++) + { + axis = self->nn_param.post_process.perm[i]; + if (axis != i) + break; + } + + if (i == self->nn_param.post_process.dim_num) + self->nn_param.post_process.local.enable_perm = FALSE; + else + self->nn_param.post_process.local.enable_perm = TRUE; + + if (_is_same_type(self, inputs, outputs)) + self->nn_param.post_process.local.enable_data_conv = FALSE; + else + self->nn_param.post_process.local.enable_data_conv = TRUE; + + if (self->nn_param.post_process.local.enable_data_conv == FALSE && + self->nn_param.post_process.local.enable_perm == FALSE) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr->node->nn_param.reshape.size = outputs[0]->attr.size; + curr->node->nn_param.reshape.dim_num = outputs[0]->attr.dim_num; + curr->inputs[0] = inputs[POST_PROCESS_INPUT]; + curr->outputs[0] = outputs[POST_PROCESS_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + else if (self->nn_param.post_process.local.enable_data_conv == TRUE && + self->nn_param.post_process.local.enable_perm == FALSE) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = inputs[POST_PROCESS_INPUT]; + curr->outputs[0] = outputs[POST_PROCESS_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + else if (self->nn_param.post_process.local.enable_data_conv == FALSE && + self->nn_param.post_process.local.enable_perm == TRUE) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + curr->node->nn_param.permute.perm = self->nn_param.post_process.perm; + curr->node->nn_param.permute.dim_num = self->nn_param.post_process.dim_num; + curr->inputs[0] = inputs[POST_PROCESS_INPUT]; + curr->outputs[0] = outputs[POST_PROCESS_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + else + { + /* transpose to time_major */ + memcpy( &attr, &inputs[POST_PROCESS_INPUT]->attr, sizeof( attr ) ); + memcpy( &attr.size, &outputs[POST_PROCESS_OUTPUT]->attr.size, sizeof( attr.size ) ); + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + 
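+        /* Both a layout permute and a dtype conversion are needed on this path:
+         * the PERMUTE node writes into an intermediate virtual tensor (output
+         * shape, input dtype), and the DATACONVERT node created below then
+         * casts that intermediate tensor into the real output. */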
curr->node->nn_param.permute.perm = self->nn_param.post_process.perm; + curr->node->nn_param.permute.dim_num = self->nn_param.post_process.dim_num; + curr->inputs[0] = inputs[POST_PROCESS_INPUT]; + curr->outputs[0] = output_tensor->t; + + vsi_nn_internal_setup_node( self, curr ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = output_tensor->t; + curr->outputs[0] = outputs[POST_PROCESS_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + + return ret; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_deinit_node_wksp( self ); + + return status; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ POST_PROCESS, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ POST_PROCESS_INPUT_CNT, + /* output_num */ POST_PROCESS_OUTPUT_CNT + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c new file mode 100644 index 0000000..c1877de --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -0,0 +1,445 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_util.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = vsi_nn_internal_compute_node( self ); + self->n = vsi_nn_internal_get_node_by_uid(self, 1)->node->n; + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. */ + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_pre_process_param * p; + vsi_bool ret = TRUE; + + p = (vsi_nn_pre_process_param *)&(self->nn_param.pre_process); + + vsi_nn_internal_init_node_wksp( self ); + + if (p->type == VSI_NN_SOURCE_FORMAT_TENSOR) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_TENSOR, 0, 0 ); + + curr->node->nn_param.pre_process_tensor.perm = p->perm; + curr->node->nn_param.pre_process_tensor.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_GRAY, 0, 0 ); + + curr->node->nn_param.pre_process_gray.mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_gray.scale = p->norm.scale; + curr->node->nn_param.pre_process_gray.rect.left = p->rect.left; + curr->node->nn_param.pre_process_gray.rect.top = p->rect.top; + curr->node->nn_param.pre_process_gray.rect.width = p->rect.width; + curr->node->nn_param.pre_process_gray.rect.height = p->rect.height; + curr->node->nn_param.pre_process_gray.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_gray.output_attr.dim_num = p->output_attr.dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_RGB, 0, 0 ); + + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[2]; + } + + curr->node->nn_param.pre_process_rgb.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_rgb.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_rgb.rect.left = p->rect.left; + curr->node->nn_param.pre_process_rgb.rect.top = p->rect.top; + curr->node->nn_param.pre_process_rgb.rect.width = p->rect.width; + curr->node->nn_param.pre_process_rgb.rect.height = p->rect.height; + curr->node->nn_param.pre_process_rgb.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_rgb.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_rgb.perm = p->perm; + curr->node->nn_param.pre_process_rgb.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV420, 0, 0 ); + + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_yuv420.r_mean = 
p->norm.mean[2]; + curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[2]; + } + + curr->node->nn_param.pre_process_yuv420.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv420.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_yuv420.rect.left = p->rect.left; + curr->node->nn_param.pre_process_yuv420.rect.top = p->rect.top; + curr->node->nn_param.pre_process_yuv420.rect.width = p->rect.width; + curr->node->nn_param.pre_process_yuv420.rect.height = p->rect.height; + curr->node->nn_param.pre_process_yuv420.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_yuv420.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_yuv420.perm = p->perm; + curr->node->nn_param.pre_process_yuv420.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->inputs[1] = inputs[PRE_PROCESS_INPUT1]; + curr->inputs[2] = inputs[PRE_PROCESS_INPUT2]; + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_BGRA, 0, 0 ); + + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[2]; + } + + curr->node->nn_param.pre_process_bgra.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_bgra.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_bgra.rect.left = p->rect.left; + curr->node->nn_param.pre_process_bgra.rect.top = p->rect.top; + curr->node->nn_param.pre_process_bgra.rect.width = p->rect.width; + curr->node->nn_param.pre_process_bgra.rect.height = p->rect.height; + curr->node->nn_param.pre_process_bgra.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_bgra.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_bgra.perm = p->perm; + curr->node->nn_param.pre_process_bgra.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR) + { + uint32_t i = 0; + uint32_t axis = 2; + uint32_t group = 3; + vsi_nn_tensor_t ** input_tensor_group = &p->local->local_tensor[0]; + vsi_nn_internal_tensor_t * output_tensor_group[3] = {NULL}; + vsi_nn_internal_tensor_t* tmp_outputs[3] = { NULL }; + vsi_nn_tensor_attr_t attr; + float mean[3] = {0}; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + ret = vsi_nn_CreateTensorGroup(self->graph, inputs[0], axis, + input_tensor_group, group); + if (ret == FALSE) + { + goto final; + } + + memcpy(&attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t)); + memcpy(&attr.size, p->output_attr.size, p->output_attr.dim_num * sizeof(uint32_t)); + attr.size[axis] = 1; + attr.vtl = TRUE; + attr.is_const = FALSE; + 
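+            /* Planar RGB888 is handled by splitting the input channel-wise with
+             * vsi_nn_CreateTensorGroup(), running one PRE_PROCESS_GRAY node per
+             * plane with that plane's mean, and concatenating the three results
+             * back along the channel axis. reverse_channel only reorders the
+             * plane outputs (and means) before the concat. */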
output_tensor_group[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + output_tensor_group[1] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + output_tensor_group[2] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + if (p->reverse_channel) + { + int32_t order[3] = {2, 1, 0}; + + mean[0] = p->norm.mean[2]; + mean[1] = p->norm.mean[1]; + mean[2] = p->norm.mean[0]; + + vsi_nn_reorder_tensor( (vsi_nn_tensor_t **)output_tensor_group, order, + 3, (vsi_nn_tensor_t **)tmp_outputs ); + } + else + { + mean[0] = p->norm.mean[0]; + mean[1] = p->norm.mean[1]; + mean[2] = p->norm.mean[2]; + + memmove( tmp_outputs, output_tensor_group, sizeof(vsi_nn_tensor_t*) * 3 ); + } + + for (i = 0; i < 3; i++) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_GRAY, 0, 0 ); + + curr->node->nn_param.pre_process_gray.mean = mean[i]; + curr->node->nn_param.pre_process_gray.scale = p->norm.scale; + curr->node->nn_param.pre_process_gray.rect.left = p->rect.left; + curr->node->nn_param.pre_process_gray.rect.top = p->rect.top; + curr->node->nn_param.pre_process_gray.rect.width = p->rect.width; + curr->node->nn_param.pre_process_gray.rect.height = p->rect.height; + curr->node->nn_param.pre_process_gray.output_attr.size = attr.size; + curr->node->nn_param.pre_process_gray.output_attr.dim_num = p->output_attr.dim_num; + + curr->inputs[0] = input_tensor_group[i]; + curr->outputs[0] = output_tensor_group[i]->t; + + vsi_nn_internal_setup_node(self, curr); + } + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 3, 1 ); + + curr->node->nn_param.concat.axis = axis; + curr->inputs[0] = tmp_outputs[0]->t; + curr->inputs[1] = tmp_outputs[1]->t; + curr->inputs[2] = tmp_outputs[2]->t; + curr->outputs[0] = outputs[0]; + + vsi_nn_internal_setup_node(self, curr); + } + else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV444, 0, 0 ); + + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[2]; + } + + curr->node->nn_param.pre_process_yuv444.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv444.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_yuv444.rect.left = p->rect.left; + curr->node->nn_param.pre_process_yuv444.rect.top = p->rect.top; + curr->node->nn_param.pre_process_yuv444.rect.width = p->rect.width; + curr->node->nn_param.pre_process_yuv444.rect.height = p->rect.height; + curr->node->nn_param.pre_process_yuv444.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_yuv444.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_yuv444.perm = p->perm; + curr->node->nn_param.pre_process_yuv444.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->inputs[1] = inputs[PRE_PROCESS_INPUT1]; + curr->inputs[2] = inputs[PRE_PROCESS_INPUT2]; + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_NV12, 0, 0 ); + + if (p->reverse_channel) + { + 
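+                /* With reverse_channel set, the R and B means are swapped so the
+                 * kernel applies mean[0..2] in reversed channel order. */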
curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[2]; + } + + curr->node->nn_param.pre_process_nv12.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_nv12.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_nv12.rect.left = p->rect.left; + curr->node->nn_param.pre_process_nv12.rect.top = p->rect.top; + curr->node->nn_param.pre_process_nv12.rect.width = p->rect.width; + curr->node->nn_param.pre_process_nv12.rect.height = p->rect.height; + curr->node->nn_param.pre_process_nv12.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_nv12.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_nv12.perm = p->perm; + curr->node->nn_param.pre_process_nv12.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + curr->inputs[1] = inputs[PRE_PROCESS_INPUT1]; + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + else + { + VSILOGE( "Not support this type!(PRE_PROCESS)\n"); + return FALSE; + } + +final: + + return ret; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_deinit_node_wksp( self ); + + if (self->nn_param.pre_process.local != NULL) + { + uint32_t i = 0; + + for ( i = 0; i < _VSI_NN_PRE_PROCESS_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.pre_process.local->local_tensor[i] != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.pre_process.local->local_tensor[i])); + } + } + + free(self->nn_param.pre_process.local); + self->nn_param.pre_process.local = NULL; + } + + return status; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.pre_process.local = + (vsi_nn_pre_process_lcl_data *)malloc(sizeof(vsi_nn_pre_process_lcl_data)); + + if (NULL == self->nn_param.pre_process.local) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.pre_process.local, 0, sizeof(vsi_nn_pre_process_lcl_data)); + + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ PRE_PROCESS, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ PRE_PROCESS_INPUT_CNT, + /* output_num */ PRE_PROCESS_OUTPUT_CNT + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c new file mode 100644 index 0000000..c0889c6 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c @@ -0,0 +1,219 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, 
and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + param =vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_bgra.local.scale_x ); + vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_bgra.local.scale_y ); + vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_bgra.rect.left ); + vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_bgra.rect.top ); + vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_bgra.r_mean ); + vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_bgra.g_mean ); + vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_bgra.b_mean ); + vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_bgra.rgb_scale ); + vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_bgra.reverse_channel ); + vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_bgra.local.enable_perm ); + vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_bgra.local.enable_copy ); + n = vsi_nn_kernel_selector( self->graph, "pre_process_bgra", inputs, 1, outputs, 1, param ); + if( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if(param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(PRE_PROCESS_BGRA, 1, 1) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + END_IO_TYPE_DECL(PRE_PROCESS_BGRA) + if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_BGRA, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * 
self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. */ + vsi_nn_pre_process_bgra_param * p = NULL; + uint32_t axis = 0; + uint32_t i = 0; + p = (vsi_nn_pre_process_bgra_param *)&(self->nn_param.pre_process_bgra); + + if (p->rect.width == 0 || p->rect.height == 0) + { + VSILOGE("Image size cannot be zero !(PRE_PROCESS_BGRA)\n"); + return FALSE; + } + else + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output size cannot be zero!(PRE_PROCESS_BGRA)\n"); + return FALSE; + } + } + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + if (p->output_attr.dim_num > 0) + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output size cannot be zero!(PRE_PROCESS_BGRA)\n"); + return FALSE; + } + else + { + outputs[0]->attr.dim_num = p->output_attr.dim_num; + outputs[0]->attr.size[i] = p->output_attr.size[i]; + } + } + } + else + { + VSILOGE("output dim num cannot be zero!(PRE_PROCESS_BGRA)\n"); + return FALSE; + } + } + + for (i = 0; i < self->nn_param.pre_process_bgra.dim_num; i++) + { + axis = self->nn_param.pre_process_bgra.perm[i]; + if (axis != i) + break; + } + + if (i == self->nn_param.pre_process_bgra.dim_num) + self->nn_param.pre_process_bgra.local.enable_perm = FALSE; + else + self->nn_param.pre_process_bgra.local.enable_perm = TRUE; + + if (self->nn_param.pre_process_bgra.local.enable_perm == FALSE) + { + p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; + p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; + } + else + { + p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[1]; + p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[2]; + } + + p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15))); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.pre_process_bgra.local.local_tensor != NULL) + { + vxReleaseTensor(&self->nn_param.pre_process_bgra.local.local_tensor); + self->nn_param.pre_process_bgra.local.local_tensor = NULL; + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ PRE_PROCESS_BGRA, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c new file mode 100644 index 0000000..80797a2 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c @@ -0,0 +1,181 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission 
notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + param =vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_gray.local.scale_x ); + vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_gray.local.scale_y ); + vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_gray.rect.left ); + vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_gray.rect.top ); + vsi_nn_kernel_param_add_float32( param, "mean", self->nn_param.pre_process_gray.mean ); + vsi_nn_kernel_param_add_float32( param, "scale", self->nn_param.pre_process_gray.scale ); + vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_gray.local.enable_copy ); + n = vsi_nn_kernel_selector( self->graph, "pre_process_gray", inputs, 1, outputs, 1, param ); + if( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if(param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(PRE_PROCESS_GRAY, 1, 1) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16) + END_IO_TYPE_DECL(PRE_PROCESS_GRAY) + if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_GRAY, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_pre_process_gray_param * p = NULL; + uint32_t i = 0; + p = (vsi_nn_pre_process_gray_param *)&(self->nn_param.pre_process_gray); + + if (p->rect.width == 0 || p->rect.height == 0) + { + VSILOGE("Image size cannot be zero !(PRE_PROCESS_GRAY)\n"); + return FALSE; + } + else + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output 
size cannot be zero!(PRE_PROCESS_GRAY)\n"); + return FALSE; + } + } + } + + if( VSI_NN_DIM_AUTO == outputs[PRE_PROCESS_GRAY_OUTPUT]->attr.dim_num ) + { + /* TODO */ + if (p->output_attr.dim_num > 0) + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output size cannot be zero!(PRE_PROCESS_GRAY)\n"); + return FALSE; + } + else + { + outputs[PRE_PROCESS_GRAY_OUTPUT]->attr.size[i] = p->output_attr.size[i]; + } + } + outputs[PRE_PROCESS_RGB_OUTPUT]->attr.dim_num = p->output_attr.dim_num; + } + else + { + VSILOGE("output dim num cannot be zero!(PRE_PROCESS_GRAY)\n"); + return FALSE; + } + } + + p->local.scale_x = (p->rect.width << 15) / outputs[PRE_PROCESS_GRAY_OUTPUT]->attr.size[0]; + p->local.scale_y = (p->rect.height << 15) / outputs[PRE_PROCESS_GRAY_OUTPUT]->attr.size[1]; + p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15))); + + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ PRE_PROCESS_GRAY, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c new file mode 100644 index 0000000..d0f1454 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c @@ -0,0 +1,245 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + param =vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_nv12.local->scale_x ); + vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_nv12.local->scale_y ); + vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_nv12.rect.left ); + vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_nv12.rect.top ); + vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_nv12.r_mean ); + vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_nv12.g_mean ); + vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_nv12.b_mean ); + vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_nv12.rgb_scale ); + vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_nv12.reverse_channel ); + vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_nv12.local->enable_perm ); + vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_nv12.local->enable_copy ); + n = vsi_nn_kernel_selector( self->graph, "pre_process_nv12", inputs, 2, outputs, 1, param ); + if( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if(param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(PRE_PROCESS_NV12, 2, 1) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + END_IO_TYPE_DECL(PRE_PROCESS_NV12) + if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_NV12, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
*/ + vsi_nn_pre_process_nv12_param * p = NULL; + uint32_t axis = 0; + uint32_t i = 0; + p = (vsi_nn_pre_process_nv12_param *)&(self->nn_param.pre_process_nv12); + + if (p->rect.width == 0 || p->rect.height == 0) + { + VSILOGE("Image size cannot be zero !(PRE_PROCESS_NV12)\n"); + return FALSE; + } + else + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output size cannot be zero!(PRE_PROCESS_NV12)\n"); + return FALSE; + } + } + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + if (p->output_attr.dim_num > 0) + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output size cannot be zero!(PRE_PROCESS_NV12)\n"); + return FALSE; + } + else + { + outputs[0]->attr.dim_num = p->output_attr.dim_num; + outputs[0]->attr.size[i] = p->output_attr.size[i]; + } + } + } + else + { + VSILOGE("output dim num cannot be zero!(PRE_PROCESS_NV12)\n"); + return FALSE; + } + } + + for (i = 0; i < self->nn_param.pre_process_nv12.dim_num; i++) + { + axis = self->nn_param.pre_process_nv12.perm[i]; + if (axis != i) + break; + } + + if (i == self->nn_param.pre_process_nv12.dim_num) + self->nn_param.pre_process_nv12.local->enable_perm = FALSE; + else + self->nn_param.pre_process_nv12.local->enable_perm = TRUE; + + if (self->nn_param.pre_process_nv12.local->enable_perm == FALSE) + { + p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; + p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; + } + else + { + p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[1]; + p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[2]; + } + + p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15))); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.pre_process_nv12.local != NULL) + { + uint32_t i = 0; + for (i = 0; i < _VSI_NN_PRE_PROCESS_NV12_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.pre_process_nv12.local->local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.pre_process_nv12.local->local_tensor[i])); + self->nn_param.pre_process_nv12.local->local_tensor[i] = NULL; + } + } + free(self->nn_param.pre_process_nv12.local); + self->nn_param.pre_process_nv12.local = NULL; + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.pre_process_nv12.local = + (vsi_nn_pre_process_nv12_lcl_data *)malloc(sizeof(vsi_nn_pre_process_nv12_lcl_data)); + + if (NULL == self->nn_param.pre_process_nv12.local) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.pre_process_nv12.local, 0, sizeof(vsi_nn_pre_process_nv12_lcl_data)); + + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ PRE_PROCESS_NV12, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c new file mode 100644 index 0000000..bd9e5c8 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c @@ -0,0 +1,224 @@ 
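The image pre-process ops (BGRA, GRAY, NV12, RGB, YUV) all derive their resize factors the same way in op_setup(): the crop rectangle is divided by the output size in Q15 fixed point, and enable_copy is raised only when that ratio is exactly 1.0, meaning the crop already matches the output and no resampling is needed. A minimal, self-contained sketch of that arithmetic, using hypothetical 640x480 crop and 224x224 output values:

```c
/* Sketch of the Q15 scale factors computed by the pre_process_* op_setup()
 * functions; the crop and output sizes below are illustrative only. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t crop_w = 640, crop_h = 480;   /* p->rect.width / p->rect.height */
    uint32_t out_w  = 224, out_h  = 224;   /* output tensor width / height   */

    /* Same arithmetic as op_setup(): width/height ratios in Q15 fixed point. */
    uint32_t scale_x = (crop_w << 15) / out_w;
    uint32_t scale_y = (crop_h << 15) / out_h;

    /* enable_copy only when both ratios are exactly 1.0 (1 << 15). */
    int enable_copy = (scale_x == scale_y) && (scale_x == (1u << 15));

    printf("scale_x=%u scale_y=%u enable_copy=%d\n", scale_x, scale_y, enable_copy);
    return 0;
}
```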
+/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + param =vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_rgb.local.scale_x ); + vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_rgb.local.scale_y ); + vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_rgb.rect.left ); + vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_rgb.rect.top ); + vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb.r_mean ); + vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb.g_mean ); + vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb.b_mean ); + vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_rgb.rgb_scale ); + vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_rgb.reverse_channel ); + vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_rgb.local.enable_perm ); + vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb.local.enable_copy ); + n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", inputs, 1, outputs, 1, param ); + if( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if(param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB, 1, 1) + 
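+      /* Each IO_TYPE row is one accepted (input, output) dtype pairing: the
+       * packed RGB input must be asymmetric-quantized U8, and the output may
+       * be asymmetric U8, dynamic-fixed-point I8/I16, or F16. */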
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16) + END_IO_TYPE_DECL(PRE_PROCESS_RGB) + if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. */ + vsi_nn_pre_process_rgb_param * p = NULL; + uint32_t axis = 0; + uint32_t i = 0; + p = (vsi_nn_pre_process_rgb_param *)&(self->nn_param.pre_process_rgb); + + if (p->rect.width == 0 || p->rect.height == 0) + { + VSILOGE("Image size cannot be zero !(PRE_PROCESS_RGB)\n"); + return FALSE; + } + else + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output size cannot be zero!(PRE_PROCESS_RGB)\n"); + return FALSE; + } + } + } + + if( VSI_NN_DIM_AUTO == outputs[PRE_PROCESS_RGB_OUTPUT]->attr.dim_num ) + { + if (p->output_attr.dim_num > 0) + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output size cannot be zero!(PRE_PROCESS_RGB)\n"); + return FALSE; + } + else + { + outputs[PRE_PROCESS_RGB_OUTPUT]->attr.dim_num = p->output_attr.dim_num; + outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[i] = p->output_attr.size[i]; + } + } + } + else + { + VSILOGE("output dim num cannot be zero!(PRE_PROCESS_RGB)\n"); + return FALSE; + } + } + + for (i = 0; i < self->nn_param.pre_process_rgb.dim_num; i++) + { + axis = self->nn_param.pre_process_rgb.perm[i]; + if (axis != i) + break; + } + + if (i == self->nn_param.pre_process_rgb.dim_num) + self->nn_param.pre_process_rgb.local.enable_perm = FALSE; + else + self->nn_param.pre_process_rgb.local.enable_perm = TRUE; + + if (self->nn_param.pre_process_rgb.local.enable_perm == FALSE) + { + p->local.scale_x = (p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]; + p->local.scale_y = (p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]; + } + else + { + p->local.scale_x = (p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]; + p->local.scale_y = (p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]; + } + + p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15))); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.pre_process_rgb.local.local_tensor != NULL) + { + vxReleaseTensor(&self->nn_param.pre_process_rgb.local.local_tensor); + self->nn_param.pre_process_rgb.local.local_tensor = NULL; + } + + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ PRE_PROCESS_RGB, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c new file mode 100644 index 0000000..c1536be --- /dev/null 
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c @@ -0,0 +1,239 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "vsi_nn_internal_node.h" + +extern vx_kernel_description_t * vx_kernel_PRE_PROCESS_TENSOR_list[]; + + +static vsi_bool _is_same_type + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if(vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) + { + return FALSE; + } + + return TRUE; +} /* _is_same_quant */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret; + uint32_t i; + uint32_t axis; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_bool use_virtual_tensor = TRUE; + + vsi_nn_internal_init_node_wksp( self ); + + if( self->nn_param.pre_process_tensor.dim_num != inputs[0]->attr.dim_num ) + { + VSILOGE( "Error permute dims '%u' vs '%u' ", + self->nn_param.permute.dim_num, inputs[0]->attr.dim_num ); + return FALSE; + } + + ret = TRUE; + /* output */ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for( i = 0; i < self->nn_param.pre_process_tensor.dim_num; i ++ ) + { + axis = self->nn_param.pre_process_tensor.perm[i]; + if( axis >= inputs[0]->attr.dim_num ) + { + VSILOGE( "Error permute axis '%u', the dim is '%u' ", + axis, inputs[0]->attr.dim_num ); + ret = FALSE; + break; + } + outputs[0]->attr.size[i] = inputs[0]->attr.size[axis]; + } + } + + for (i = 0; i < self->nn_param.pre_process_tensor.dim_num; i++) + { + axis = self->nn_param.pre_process_tensor.perm[i]; + if (axis != i) + break; + } + + if (i == self->nn_param.pre_process_tensor.dim_num) + self->nn_param.pre_process_tensor.local.enable_perm = FALSE; + else + self->nn_param.pre_process_tensor.local.enable_perm = TRUE; + + if (_is_same_type(self, inputs, outputs)) + self->nn_param.pre_process_tensor.local.enable_data_conv = FALSE; + else + self->nn_param.pre_process_tensor.local.enable_data_conv = TRUE; + + if (self->nn_param.pre_process_tensor.local.enable_data_conv == FALSE && + self->nn_param.pre_process_tensor.local.enable_perm == FALSE) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr->node->nn_param.reshape.size = outputs[0]->attr.size; + curr->node->nn_param.reshape.dim_num = outputs[0]->attr.dim_num; + curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT]; + curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + else if (self->nn_param.pre_process_tensor.local.enable_data_conv == TRUE && + self->nn_param.pre_process_tensor.local.enable_perm == FALSE) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT]; + curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + else if (self->nn_param.pre_process_tensor.local.enable_data_conv == FALSE && + self->nn_param.pre_process_tensor.local.enable_perm == TRUE) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + curr->node->nn_param.permute.perm = self->nn_param.pre_process_tensor.perm; + curr->node->nn_param.permute.dim_num = self->nn_param.pre_process_tensor.dim_num; + curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT]; + curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + else + { + /* transpose to time_major */ + memcpy( &attr, &outputs[PRE_PROCESS_TENSOR_OUTPUT]->attr, sizeof( attr ) ); + memcpy( &attr.size, &inputs[PRE_PROCESS_TENSOR_INPUT]->attr.size, sizeof( attr.size ) ); + attr.vtl = use_virtual_tensor; + 
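+        /* The intermediate virtual tensor keeps the input shape but takes the
+         * output dtype, so this chain is DATACONVERT first, then PERMUTE into
+         * the final output (the reverse order of POST_PROCESS, which permutes
+         * before converting). */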
attr.is_const = FALSE; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT]; + curr->outputs[0] = output_tensor->t; + + vsi_nn_internal_setup_node( self, curr ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + curr->node->nn_param.permute.perm = self->nn_param.pre_process_tensor.perm; + curr->node->nn_param.permute.dim_num = self->nn_param.pre_process_tensor.dim_num; + curr->inputs[0] = output_tensor->t; + curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT]; + + vsi_nn_internal_setup_node(self, curr); + } + + return ret; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_deinit_node_wksp( self ); + + return status; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ PRE_PROCESS_TENSOR, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ PRE_PROCESS_TENSOR_INPUT_CNT, + /* output_num */ PRE_PROCESS_TENSOR_OUTPUT_CNT + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c new file mode 100644 index 0000000..3fe0c49 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c @@ -0,0 +1,222 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + param =vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_yuv420.local.scale_x ); + vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_yuv420.local.scale_y ); + vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_yuv420.rect.left ); + vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_yuv420.rect.top ); + vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_yuv420.r_mean ); + vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_yuv420.g_mean ); + vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_yuv420.b_mean ); + vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_yuv420.rgb_scale ); + vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_yuv420.reverse_channel ); + vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_yuv420.local.enable_perm ); + vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_yuv420.local.enable_copy ); + n = vsi_nn_kernel_selector( self->graph, "pre_process_yuv420", inputs, 3, outputs, 1, param ); + if( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if(param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(PRE_PROCESS_YUV420, 3, 1) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + END_IO_TYPE_DECL(PRE_PROCESS_YUV420) + if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_YUV420, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
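/*
 * op_check() above validates the (y, u, v, output) dtype combination
 * against the IO_TYPE whitelist before the node is built.  A conceptual,
 * table-driven sketch of the same idea in plain C (the enum values are
 * placeholders, not the library's actual type encodings):
 */
#include <stddef.h>

typedef enum { DT_U8_ASYM, DT_I8_DFP, DT_I16_DFP, DT_F16 } dtype_t;
typedef struct { dtype_t in0, in1, in2, out0; } io_sig_t;

static const io_sig_t k_allowed[] = {
    { DT_U8_ASYM, DT_U8_ASYM, DT_U8_ASYM, DT_U8_ASYM },
    { DT_U8_ASYM, DT_U8_ASYM, DT_U8_ASYM, DT_I8_DFP  },
    { DT_U8_ASYM, DT_U8_ASYM, DT_U8_ASYM, DT_I16_DFP },
    { DT_U8_ASYM, DT_U8_ASYM, DT_U8_ASYM, DT_F16     },
};

static int io_types_supported(io_sig_t sig)
{
    size_t i;
    for (i = 0; i < sizeof(k_allowed) / sizeof(k_allowed[0]); i++)
    {
        if (k_allowed[i].in0 == sig.in0 && k_allowed[i].in1 == sig.in1 &&
            k_allowed[i].in2 == sig.in2 && k_allowed[i].out0 == sig.out0)
        {
            return 1;
        }
    }
    return 0;
}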
*/ + vsi_nn_pre_process_yuv420_param * p = NULL; + uint32_t axis = 0; + uint32_t i = 0; + p = (vsi_nn_pre_process_yuv420_param *)&(self->nn_param.pre_process_yuv420); + + if (p->rect.width == 0 || p->rect.height == 0) + { + VSILOGE("Image size cannot be zero !(PRE_PROCESS_YUV420)\n"); + return FALSE; + } + else + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output size cannot be zero!(PRE_PROCESS_YUV420)\n"); + return FALSE; + } + } + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + if (p->output_attr.dim_num > 0) + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output size cannot be zero!(PRE_PROCESS_YUV420)\n"); + return FALSE; + } + else + { + outputs[0]->attr.dim_num = p->output_attr.dim_num; + outputs[0]->attr.size[i] = p->output_attr.size[i]; + } + } + } + else + { + VSILOGE("output dim num cannot be zero!(PRE_PROCESS_YUV420)\n"); + return FALSE; + } + } + + for (i = 0; i < self->nn_param.pre_process_yuv420.dim_num; i++) + { + axis = self->nn_param.pre_process_yuv420.perm[i]; + if (axis != i) + break; + } + + if (i == self->nn_param.pre_process_yuv420.dim_num) + self->nn_param.pre_process_yuv420.local.enable_perm = FALSE; + else + self->nn_param.pre_process_yuv420.local.enable_perm = TRUE; + + if (self->nn_param.pre_process_yuv420.local.enable_perm == FALSE) + { + p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; + p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; + } + else + { + p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[1]; + p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[2]; + } + + p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15))); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + uint32_t i; + for (i = 0; i < _VSI_NN_PRE_PROCESS_YUV420_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.pre_process_yuv420.local.local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.pre_process_yuv420.local.local_tensor[i])); + self->nn_param.pre_process_yuv420.local.local_tensor[i] = NULL; + } + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ PRE_PROCESS_YUV420, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c new file mode 100644 index 0000000..0d7d370 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c @@ -0,0 +1,246 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright 
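/*
 * op_setup() above stores the crop-to-output resize ratio as a Q15
 * fixed-point value, scale = (src_extent << 15) / dst_extent, so a result
 * of exactly 1 << 15 means "no resize"; enable_copy tests precisely that.
 * Self-contained sketch with a worked example:
 */
#include <assert.h>
#include <stdint.h>

static uint32_t q15_scale(uint32_t src, uint32_t dst)
{
    return (src << 15) / dst;
}

int main(void)
{
    assert(q15_scale(224, 224) == (1u << 15)); /* same size: copy path  */
    assert(q15_scale(448, 224) == (2u << 15)); /* 2x downscale: no copy */
    return 0;
}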
notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + param =vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_yuv444.local->scale_x ); + vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_yuv444.local->scale_y ); + vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_yuv444.rect.left ); + vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_yuv444.rect.top ); + vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_yuv444.r_mean ); + vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_yuv444.g_mean ); + vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_yuv444.b_mean ); + vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_yuv444.rgb_scale ); + vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_yuv444.reverse_channel ); + vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_yuv444.local->enable_perm ); + vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_yuv444.local->enable_copy ); + n = vsi_nn_kernel_selector( self->graph, "pre_process_yuv444", inputs, 3, outputs, 1, param ); + if( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if(param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(PRE_PROCESS_YUV444, 3, 1) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + END_IO_TYPE_DECL(PRE_PROCESS_YUV444) + if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_YUV444, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + 
vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. */ + vsi_nn_pre_process_yuv444_param * p = NULL; + uint32_t axis = 0; + uint32_t i = 0; + p = (vsi_nn_pre_process_yuv444_param *)&(self->nn_param.pre_process_yuv444); + + if (p->rect.width == 0 || p->rect.height == 0) + { + VSILOGE("Image size cannot be zero !(PRE_PROCESS_YUV444)\n"); + return FALSE; + } + else + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output size cannot be zero!(PRE_PROCESS_YUV444)\n"); + return FALSE; + } + } + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + if (p->output_attr.dim_num > 0) + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output size cannot be zero!(PRE_PROCESS_YUV444)\n"); + return FALSE; + } + else + { + outputs[0]->attr.dim_num = p->output_attr.dim_num; + outputs[0]->attr.size[i] = p->output_attr.size[i]; + } + } + } + else + { + VSILOGE("output dim num cannot be zero!(PRE_PROCESS_YUV444)\n"); + return FALSE; + } + } + + for (i = 0; i < self->nn_param.pre_process_yuv444.dim_num; i++) + { + axis = self->nn_param.pre_process_yuv444.perm[i]; + if (axis != i) + break; + } + + if (i == self->nn_param.pre_process_yuv444.dim_num) + self->nn_param.pre_process_yuv444.local->enable_perm = FALSE; + else + self->nn_param.pre_process_yuv444.local->enable_perm = TRUE; + + if (self->nn_param.pre_process_yuv444.local->enable_perm == FALSE) + { + p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; + p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; + } + else + { + p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[1]; + p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[2]; + } + + p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15))); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.pre_process_yuv444.local != NULL) + { + uint32_t i = 0; + for (i = 0; i < _VSI_NN_PRE_PROCESS_YUV444_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.pre_process_yuv444.local->local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.pre_process_yuv444.local->local_tensor[i])); + self->nn_param.pre_process_yuv444.local->local_tensor[i] = NULL; + } + } + free(self->nn_param.pre_process_yuv444.local); + self->nn_param.pre_process_yuv444.local = NULL; + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.pre_process_yuv444.local = + (vsi_nn_pre_process_yuv444_lcl_data *)malloc(sizeof(vsi_nn_pre_process_yuv444_lcl_data)); + + if (NULL == self->nn_param.pre_process_yuv444.local) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.pre_process_yuv444.local, 0, sizeof(vsi_nn_pre_process_yuv444_lcl_data)); + + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ PRE_PROCESS_YUV444, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c new file mode 
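/*
 * Unlike PRE_PROCESS_YUV420, whose scratch data sits inline in nn_param,
 * PRE_PROCESS_YUV444 heap-allocates its local block in op_init() and
 * releases it (plus any cached vx_tensor handles) in op_deinit().  A
 * minimal sketch of that allocate/zero/free pairing with placeholder
 * types:
 */
#include <stdlib.h>
#include <string.h>

typedef struct { int scale_x, scale_y, enable_perm, enable_copy; } local_data_t;

static local_data_t *local_init(void)
{
    local_data_t *p = (local_data_t *)malloc(sizeof(*p));
    if (p == NULL)
    {
        return NULL; /* caller maps this to an out-of-memory status */
    }
    memset(p, 0, sizeof(*p));
    return p;
}

static void local_deinit(local_data_t **pp)
{
    if (pp != NULL && *pp != NULL)
    {
        free(*pp);
        *pp = NULL;
    }
}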
100644 index 0000000..f51e4b2 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c @@ -0,0 +1,289 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_error.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define VSI_NN_PRELU_DEFAULT_AXIS 2 + +static vsi_bool _is_one_rank_tensor + ( + vsi_nn_tensor_t * input, + uint32_t *shape + ) +{ + uint32_t i = 0; + uint32_t one_rank = 0; + + *shape = 1; + + for (i = 0; i < input->attr.dim_num; i++) + { + if (input->attr.size[i] != 1) + { + *shape = input->attr.size[i]; + one_rank ++; + } + } + + if (one_rank <= 1) + { + return TRUE; + } + + return FALSE; +} + +static vsi_status _prelu_op_compute + ( + const char * kernel_name, + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_prelu_param *prelu = &self->nn_param.prelu; + int32_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 }; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_bool one_rank = FALSE; + vsi_bool is_per_channel_alpha = 0; + uint32_t alpha_shape = 1; + uint32_t i = 0; + vsi_nn_kernel_param_t * param = NULL; + uint32_t dims = outputs[0]->attr.dim_num; + + reshape_tensors[0] = inputs[0]; + one_rank = _is_one_rank_tensor(inputs[1], &alpha_shape); + + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + shapes[i] = 1; + } + + if (vsi_nn_compareVersion(self->graph, 1, 1, 20) == -1) + { + int32_t axis = prelu->axis; + + if (one_rank) + { + is_per_channel_alpha = (alpha_shape == 1) || axis == 2; + + if (is_per_channel_alpha) + { + shapes[0] = alpha_shape; + dims = 2; + } + else + { + memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(int32_t)); + dims = inputs[1]->attr.dim_num; + } + + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], (uint32_t*)shapes, dims ); + } + else + { + memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(int32_t)); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, 
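/*
 * _is_one_rank_tensor() above reports whether the alpha tensor has at
 * most one non-unit dimension (e.g. {1, 1, 64, 1}), in which case it can
 * be flattened to a per-channel vector of that length.  Equivalent
 * sketch using plain C types:
 */
#include <stdint.h>

static int is_one_rank(const uint32_t *size, uint32_t dim_num, uint32_t *extent)
{
    uint32_t i, non_unit = 0;
    *extent = 1;
    for (i = 0; i < dim_num; i++)
    {
        if (size[i] != 1)
        {
            *extent = size[i];
            non_unit++;
        }
    }
    return non_unit <= 1; /* {1,1,64,1} -> extent 64, result 1 */
}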
+ inputs[1], (uint32_t*)shapes, inputs[1]->attr.dim_num ); + } + } + else + { + dims = inputs[1]->attr.dim_num; + + memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(int32_t)); + + if (one_rank) + { + is_per_channel_alpha = (inputs[1]->attr.dim_num > 2 && alpha_shape == inputs[0]->attr.size[2]); + } + + if (is_per_channel_alpha) + { + shapes[0] = alpha_shape; + shapes[1] = 1; + dims = 2; + } + + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], (uint32_t*)shapes, dims ); + } + + // Add params + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "is_per_channel_alpha", is_per_channel_alpha ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, + &reshape_tensors[0], 2, + outputs, 1, param ); + + vsi_nn_kernel_param_release( ¶m ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* _prelu_op_compute() */ + +vsi_bool vsi_nn_op_prelu_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + vsi_nn_prelu_param *prelu = &self->nn_param.prelu; + + if( NULL == self ) + { + return FALSE; + } + + if (prelu->axis < 0) + { + prelu->axis += (int32_t)inputs[0]->attr.dim_num; + } + + if (prelu->axis < 0) + { + VSILOGD("PRelu Invalid Axis: %d \n", prelu->axis); + return FALSE; + } + + if (vsi_nn_compareVersion(self->graph, 1, 1, 20) == -1) + { + ret = vsi_nn_op_common_setup(self, inputs, outputs); + } + else + { + ret = vsi_nn_OpSetup( VSI_NN_OP_MULTIPLY, self, inputs, outputs ); + } + + return ret; +} /* vsi_nn_op_prelu_setup() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(PRELU, 2, 1) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_BF16, D_F16, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + + /* HW 9.0 */ + IO_TYPE(D_F32, D_BF16, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_F32) + END_IO_TYPE_DECL(PRELU) + if(!VALIDATE_OP_IO_TYPES(PRELU, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + if ( vsi_nn_compareVersion(self->graph, 1, 1, 20) >= 0 ) + { + vsi_nn_OpCheck( VSI_NN_OP_MULTIPLY, self, inputs, outputs ); + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + if (vsi_nn_compareVersion(self->graph, 1, 1, 17) == -1) + { + self->nn_param.prelu.axis = VSI_NN_PRELU_DEFAULT_AXIS; + } + + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define DEF_ELEMENT_WISE_OP(name, kernel_name) \ + static vsi_status op_compute_##kernel_name \ + ( \ + vsi_nn_node_t * self, \ + vsi_nn_tensor_t ** inputs, \ + vsi_nn_tensor_t ** outputs \ + ) \ + { \ + return _prelu_op_compute( 
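/*
 * vsi_nn_op_prelu_setup() above accepts negative axes counted from the
 * last dimension, NumPy-style, and rejects anything still negative after
 * wrapping.  Sketch of that normalization:
 */
#include <stdint.h>

static int32_t normalize_axis(int32_t axis, uint32_t dim_num)
{
    if (axis < 0)
    {
        axis += (int32_t)dim_num;
    }
    return axis; /* still negative => out of range, caller returns FALSE */
}

/* Example: normalize_axis(-1, 4) == 3, normalize_axis(2, 4) == 2. */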
""#kernel_name, self, inputs, outputs ); \ + } \ +DEF_OP_REG(name, op_init, op_compute_##kernel_name, vsi_nn_op_common_deinit, \ + op_check, vsi_nn_op_prelu_setup, NULL, 2, 1) + +DEF_ELEMENT_WISE_OP( PRELU, prelu ); + +#undef DEF_ELEMENT_WISE_OP + +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c new file mode 100644 index 0000000..17ac2bb --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c @@ -0,0 +1,367 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" + +/* +* inputs[0] - scores +* inputs[1] - bboxs +* inputs[2] - im_info +* inputs[3] - anchors +* outputs[0] - rois +* outputs[1] - scores +*/ + +#define ROUND(x) ((int)(x + 0.5f)) + +static vsi_nn_tensor_t * create_im_info_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_proposal_im_info * im_info + ) +{ + vsi_nn_tensor_t * tensor; + vsi_nn_tensor_attr_t attr; + memset( &attr, 0, sizeof( vsi_nn_tensor_attr_t ) ); + attr.size[0] = 1; + attr.size[1] = 1; + attr.size[2] = 4; + attr.size[3] = 1; + attr.dim_num = 4; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + tensor = vsi_nn_CreateTensorFromData( graph, + (uint8_t *)im_info, &attr ); + if( NULL == tensor ) + { + VSILOGE( "Create im info tensor fail." ); + } + return tensor; +} /* create_im_info_tensor() */ + + +static vsi_nn_tensor_t * create_anchor_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_proposal_anchor * anchor + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t * tensor; + float * p_anchor; + float * data_anchor; + float base_area; + float center; + uint32_t anchor_sz; + float ratio_h; + float ratio_w; + float scale_h; + float scale_w; + int i; + int j; + + if( NULL == anchor->ratio || NULL == anchor->scale + || 0 >= anchor->ratio_num || 0 >= anchor->scale_num + || 0 >= anchor->base_size ) + { + VSILOGE( "Create anchor tensor fail." 
); + return NULL; + } + + memset( &attr, 0, sizeof( vsi_nn_tensor_attr_t ) ); + anchor_sz = anchor->ratio_num * anchor->scale_num * 4; + + data_anchor = (float *)malloc( anchor_sz * sizeof( float ) ); + if( NULL == data_anchor ) + { + VSILOGE( "Create anchor tensor fail." ); + return NULL; + } + + /* Generate anchor data */ + p_anchor = data_anchor; + base_area = (float)(anchor->base_size * anchor->base_size); + center = (float)(0.5f * (anchor->base_size - 1.0f)); + + for( i = 0; i < anchor->ratio_num; i ++ ) + { + ratio_w = (float)ROUND( sqrt( base_area / anchor->ratio[i] ) ); + ratio_h = (float)ROUND( ratio_w * anchor->ratio[i] ); + for( j = 0; j < anchor->scale_num; j ++ ) + { + scale_w = (float)( 0.5f * (ratio_w * anchor->scale[j] - 1.0f ) ); + scale_h = (float)( 0.5f * (ratio_h * anchor->scale[j] - 1.0f ) ); + p_anchor[0] = center - scale_w; + p_anchor[1] = center - scale_h; + p_anchor[2] = center + scale_w; + p_anchor[3] = center + scale_h; + p_anchor += 4; + } + } + + /* Create tensor */ + attr.size[0] = 1; + attr.size[1] = 1; + attr.size[2] = 4; + attr.size[3] = anchor->ratio_num * anchor->scale_num; + attr.dim_num = 4; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + tensor = vsi_nn_CreateTensorFromData( graph, + (uint8_t *)data_anchor, &attr ); + + free( data_anchor ); + + if( NULL == tensor ) + { + VSILOGE( "Create anchor tensor fail." ); + } + return tensor; +} /* create_im_info_tensor() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vx_nn_rpn_params_t p; + vx_tensor rois_tmp, score_tmp; + + status = VSI_FAILURE; + if(self->nn_param.proposal.local.rois) + { + rois_tmp = self->nn_param.proposal.local.rois; + } + else + { + rois_tmp = outputs[0]->t; + } + if(self->nn_param.proposal.local.score) + { + score_tmp = self->nn_param.proposal.local.score; + } + else + { + score_tmp = (NULL != outputs[1])?outputs[1]->t : NULL; + } + + p.feature_stride = self->nn_param.proposal.feat_stride; + p.min_size = self->nn_param.proposal.min_size; + p.pre_nms_topn = self->nn_param.proposal.pre_nms_topn; + p.post_nms_topn = self->nn_param.proposal.post_nms_topn; + p.nms_thresh = self->nn_param.proposal.nms_thresh; + + self->n = vxRPNLayer( + self->graph->g, + inputs[0]->t, + inputs[1]->t, + inputs[3]->t, + inputs[2]->t, + &p, + sizeof( vx_nn_rpn_params_t ), + rois_tmp, + score_tmp + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_tensor_t * im_info; + vsi_nn_tensor_t * anchors; + + im_info = inputs[2]; + anchors = inputs[3]; + /* Check and generate im_info */ + if( NULL == im_info ) + { + im_info = create_im_info_tensor( node->graph, + &node->nn_param.proposal.im_info ); + inputs[2] = im_info; + node->input.tensors[2] = vsi_nn_AttachTensorToGraph( + node->graph, VSI_NN_TENSOR_ID_AUTO, im_info ); + } + + /* Check and generate anchors */ + if( NULL == anchors ) + { + anchors = create_anchor_tensor( node->graph, + &node->nn_param.proposal.anchor ); + inputs[3] = anchors; + node->input.tensors[3] = vsi_nn_AttachTensorToGraph( + node->graph, VSI_NN_TENSOR_ID_AUTO, anchors ); + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + 
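/*
 * create_anchor_tensor() above emits one (x1, y1, x2, y2) box per
 * (ratio, scale) pair, centred on the base box.  Self-contained sketch
 * of the per-anchor arithmetic, with the classic Faster R-CNN case as a
 * check:
 */
#include <math.h>
#include <stdio.h>

#define SKETCH_ROUND(x) ((int)((x) + 0.5f))

static void make_anchor(int base_size, float ratio, float scale, float box[4])
{
    float base_area = (float)(base_size * base_size);
    float center    = 0.5f * (base_size - 1.0f);
    float w         = (float)SKETCH_ROUND(sqrt(base_area / ratio));
    float h         = (float)SKETCH_ROUND(w * ratio);
    float half_w    = 0.5f * (w * scale - 1.0f);
    float half_h    = 0.5f * (h * scale - 1.0f);
    box[0] = center - half_w;
    box[1] = center - half_h;
    box[2] = center + half_w;
    box[3] = center + half_h;
}

int main(void)
{
    float box[4];
    make_anchor(16, 1.0f, 8.0f, box);
    printf("%.1f %.1f %.1f %.1f\n", box[0], box[1], box[2], box[3]);
    /* prints -56.0 -56.0 71.0 71.0 */
    return 0;
}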
outputs[0]->attr.size[0] = 5; + outputs[0]->attr.size[1] = node->nn_param.proposal.post_nms_topn; + outputs[0]->attr.dim_num = 2; + } + + if( NULL != outputs[1] && VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) + { + outputs[1]->attr.size[0] = 1; + outputs[1]->attr.size[1] = node->nn_param.proposal.post_nms_topn; + outputs[1]->attr.dim_num = 2; + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + int32_t size[VSI_NN_MAX_DIM_NUM]; + uint32_t dim; + vx_tensor rois_tmp, score_tmp; + + rois_tmp = NULL, score_tmp = NULL; + if( direction == VSI_NN_OPTIMIZE_BACKWARD ) + { + VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + + dim = 4; + size[0] = 1; + size[1] = 1; + self->nn_param.proposal.local.rois = NULL; + self->nn_param.proposal.local.score = NULL; + /* reshape rois tensor, [5,roi_num] --> [1,1,5,roi_num] */ + if(2 == outputs[0]->attr.dim_num) + { + size[2] = outputs[0]->attr.size[0]; + size[3] = outputs[0]->attr.size[1]; + rois_tmp = vxReshapeTensor(outputs[0]->t, size, dim); + if(NULL == rois_tmp) + { + goto error; + } + self->nn_param.proposal.local.rois = rois_tmp; + } + + /* reshape score tensor, [1,roi_num] --> [1,1,1,roi_num] */ + if(outputs[1] != NULL && 2 == outputs[1]->attr.dim_num) + { + size[2] = outputs[1]->attr.size[0]; + size[3] = outputs[1]->attr.size[1]; + score_tmp = vxReshapeTensor(outputs[1]->t, size, dim); + if(NULL == score_tmp) + { + goto error; + } + self->nn_param.proposal.local.score = score_tmp; + } + } + + return VSI_SUCCESS; +error: + if(rois_tmp)vxReleaseTensor(&rois_tmp); + if(score_tmp)vxReleaseTensor(&score_tmp); + return VSI_FAILURE; +} + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vx_tensor rois = self->nn_param.proposal.local.rois; + vx_tensor score = self->nn_param.proposal.local.score; + if( NULL != self && NULL != self->n ) + { + if(rois) + { + vxReleaseTensor(&rois); + rois = NULL; + } + if(score) + { + vxReleaseTensor(&score); + score = NULL; + } + vxReleaseNode( &self->n ); + self->n = NULL; + } + return VSI_SUCCESS; +} + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ PROPOSAL, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 4, + /* output_num */ 2 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c new file mode 100644 index 0000000..c5bf9d2 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c @@ -0,0 +1,105 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
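/*
 * op_optimize() above widens the 2-D outputs before handing them to the
 * RPN layer, which expects 4-D tensors:
 *   rois  [5, roi_num] -> [1, 1, 5, roi_num]
 *   score [1, roi_num] -> [1, 1, 1, roi_num]
 * Sketch of the size-array rewrite it performs:
 */
#include <stdint.h>

static void widen_to_4d(const int32_t in_size[2], int32_t out_size[4])
{
    out_size[0] = 1;
    out_size[1] = 1;
    out_size[2] = in_size[0]; /* 5 for rois, 1 for score */
    out_size[3] = in_size[1]; /* roi_num */
}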
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + status = VSI_FAILURE; + + /* TODO */ + /* example code : add op */ + /* + self->n = vxTensorAddNode( self->graph->g, inputs[0]->t, inputs[1]->t, + VX_CONVERT_POLICY_SATURATE, outputs[0]->t ); + */ + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. */ + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ QUANTIZED_16BIT_LSTM, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ Q16_LSTM_INPUT_CNT, + /* output_num */ Q16_LSTM_OUTPUT_CNT + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_random_multinomial.c b/src/tim/vx/internal/src/ops/vsi_nn_op_random_multinomial.c new file mode 100644 index 0000000..e7b7615 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_random_multinomial.c @@ -0,0 +1,124 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_t node; + + node = vsi_nn_kernel_selector( self->graph, "random_multinomial", + inputs, 2, outputs, 1, NULL ); + if( node ) + { + status = VSI_SUCCESS; + self->n = (vx_node)node; + } + + return status; + +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(RANDOM_MULTINOMIAL, 2, 1) + IO_TYPE(D_F16, D_I32, D_I32) + IO_TYPE(D_F32, D_I32, D_I32) + END_IO_TYPE_DECL(RANDOM_MULTINOMIAL) + if(!VALIDATE_OP_IO_TYPES(RANDOM_MULTINOMIAL, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_random_multinomial_param * p; + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + p = &self->nn_param.random_multinomial; + outputs[0]->attr.size[0] = p->sample_num; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; + outputs[0]->attr.dim_num = 2; + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RANDOM_MULTINOMIAL, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c new file mode 100644 index 0000000..323ee4a --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -0,0 +1,1339 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in 
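/*
 * op_setup() above infers the RANDOM_MULTINOMIAL output shape from the
 * logits input: the batch dimension is kept and the class dimension is
 * replaced by the number of samples drawn.  Sketch:
 */
#include <stdint.h>

static void multinomial_out_shape(const uint32_t logits_shape[2],
                                  uint32_t sample_num,
                                  uint32_t out_shape[2])
{
    out_shape[0] = sample_num;      /* samples per batch row */
    out_shape[1] = logits_shape[1]; /* batch size            */
}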
+* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include "vsi_nn_platform.h" + +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +#define _ARG_NUM (6) +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +#define USE_OVX_API TRUE + +typedef struct _vsi_nn_reduce_lcl2_data_t +{ + vsi_nn_tensor_t *reshaped_input; + vsi_nn_tensor_t *reshaped_output; + vsi_nn_tensor_t *reshaped_input1; + vsi_nn_tensor_t *reshaped_output1; + vsi_nn_tensor_t *reshaped_tmp; + vsi_nn_tensor_t *axis_tensor2; + int32_t axes[VSI_NN_MAX_DIM_NUM]; + int32_t axes_num; +} vsi_nn_reduce_lcl2_data_t; + +#if (USE_OVX_API == FALSE) +extern vx_kernel_description_t * vx_kernel_REDUCE_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vx_uint32 i; + vx_uint32 cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vx_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + vx_uint32 num + ) +{ + vx_status status; + vx_context ctx; + vsi_nn_reduce_param * p = NULL; + if( 0 == num ) + { + return VX_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &node->nn_param.reduce; + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ +#define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VX_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_INT32, axis_num ); + _SET_PARAM( 1, VX_TYPE_INT32, keep_dim ); + _SET_PARAM( 2, VX_TYPE_INT32, axis[0] ); + _SET_PARAM( 3, VX_TYPE_INT32, axis[1] ); + _SET_PARAM( 4, VX_TYPE_INT32, axis[2] ); + _SET_PARAM( 5, VX_TYPE_INT32, axis[3] ); + +#undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + vx_uint32 num + ) +{ + vx_uint32 i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vx_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vx_status status = VX_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args = NULL; + + args = ¶ms[_IO_NUM]; + + if( NULL 
== self->n ) + { + return VX_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + NULL +}; +#endif + +static vsi_status op_comput_reduce_mean(vsi_nn_node_t * self, + vsi_nn_tensor_t *axis_tensor, + vx_bool keep_dim, + vx_tensor input_t, + vx_tensor output_t) +{ + vsi_status status = VSI_FAILURE; + vx_nn_mean_params_t para; + + para.axis = REQUIRED_IO(axis_tensor); + para.keep_dims = keep_dim; + self->n = vxTensorMeanNode( self->graph->g, input_t, ¶, + sizeof(vx_nn_mean_params_t), output_t ); + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} + +static vsi_bool caculate_reshape_size(uint32_t* dim_value, + uint32_t* re_sizes, uint32_t* re_sizes2, + vx_int32 *resolved_dim, vx_int32 resolved_dim_count) +{ +#define VSI_NN_MAX_IMAGE_WIDTH (65536) + vsi_bool enable_reshape = TRUE; + uint32_t size_count = 1; + uint32_t i = 0; + uint32_t dim_num = *dim_value; + if (dim_num > 4) + { + for (i = 4; i < dim_num; i++) + { + size_count *= re_sizes[i]; + } + } + + if (re_sizes[0] * re_sizes[1] * re_sizes[2] < VSI_NN_MAX_IMAGE_WIDTH) + { + re_sizes2[0] = re_sizes[0] * re_sizes[1] * re_sizes[2]; + re_sizes2[1] = re_sizes[3]; + if (size_count != 1) + { + re_sizes2[2] = size_count; + dim_num = 3; + } + else + { + dim_num = 2; + } + resolved_dim[resolved_dim_count - 1] = 1; + } + else if (re_sizes[0] * re_sizes[1] < VSI_NN_MAX_IMAGE_WIDTH) + { + re_sizes2[0] = re_sizes[0] * re_sizes[1]; + re_sizes2[1] = re_sizes[2]; + re_sizes2[2] = re_sizes[3]; + if (size_count != 1) + { + re_sizes2[3] = size_count; + dim_num = 4; + } + else + { + dim_num = 3; + } + resolved_dim[resolved_dim_count - 1] = 2; + } + else if (re_sizes[1] * re_sizes[2] < VSI_NN_MAX_IMAGE_WIDTH) + { + re_sizes2[0] = re_sizes[0]; + re_sizes2[1] = re_sizes[1] * re_sizes[2]; + re_sizes2[2] = re_sizes[3]; + if (size_count != 1) + { + re_sizes2[3] = size_count; + dim_num = 4; + } + else + { + dim_num = 3; + } + resolved_dim[resolved_dim_count - 1] = 2; + } + else + { + enable_reshape = FALSE; + } + *dim_value = dim_num; +#undef VSI_NN_MAX_IMAGE_WIDTH + return enable_reshape; +} + + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; +#if (USE_OVX_API == TRUE) + + if (self->nn_param.reduce.type == VSI_NN_REDUCE_MEAN) + { + vx_tensor input_t, output_t; + vsi_nn_tensor_t *axis_tensor = NULL; + vsi_nn_tensor_t *axis_tensor2 = NULL; + vsi_nn_tensor_attr_t attr, attr2; + vx_int32 resolved_dim[4] = {-1, -1, -1, -1}; + vx_int32 resolved_dim_count = 0; + uint32_t i = 0; + uint32_t re_sizes[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t re_sizes2[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t dim_num; + vsi_nn_tensor_t *mean_tmp_tensor = NULL; + vsi_nn_tensor_t *reshaped_input1 = self->nn_param.reduce.local2->reshaped_input1; + vsi_nn_tensor_t *reshaped_output1 = self->nn_param.reduce.local2->reshaped_output1; + + resolved_dim_count = self->nn_param.reduce.local2->axes_num; + + for (i = 0; i < (uint32_t)resolved_dim_count; i++) + { + resolved_dim[i] = self->nn_param.reduce.local2->axes[i]; + } + + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + re_sizes[i] = 1; + re_sizes2[i] = 1; + } + memset(&attr2, 0, 
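/*
 * caculate_reshape_size() above folds leading dimensions together as long
 * as their product stays below the 65536 image-width limit, then remaps
 * the remaining reduction axis.  Sketch of the first (most common) fold,
 * with a worked example: shape {8, 16, 32, 100}, reducing axis 3,
 * 8*16*32 = 4096 < 65536, so the tensor becomes {4096, 100} and the
 * reduction axis becomes 1.
 */
#include <stdint.h>

static int fold_leading_dims(const uint32_t in[4], uint32_t out[2])
{
    const uint32_t k_max_image_width = 65536;
    if (in[0] * in[1] * in[2] < k_max_image_width)
    {
        out[0] = in[0] * in[1] * in[2];
        out[1] = in[3];
        return 1;  /* new index of the reduced (last) axis */
    }
    return -1;     /* fold not applicable; other branches handle it */
}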
sizeof(attr)); + memcpy( &attr2, &reshaped_input1->attr, sizeof(vsi_nn_tensor_attr_t) ); + dim_num = reshaped_input1->attr.dim_num; + for (i = 0; i < dim_num; i++) + { + attr2.size[i] = reshaped_input1->attr.size[i]; + re_sizes[i] = reshaped_input1->attr.size[i]; + } + if ((VSI_NN_TYPE_FLOAT32 == inputs[POST_PROCESS_INPUT]->attr.dtype.vx_type) + || (VSI_NN_TYPE_INT32 == inputs[POST_PROCESS_INPUT]->attr.dtype.vx_type) + || (VSI_NN_TYPE_UINT32 == inputs[POST_PROCESS_INPUT]->attr.dtype.vx_type) + || (VSI_NN_TYPE_UINT64 == inputs[POST_PROCESS_INPUT]->attr.dtype.vx_type) + ) + { + attr2.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + } + else if (VSI_NN_TYPE_FLOAT64 == inputs[POST_PROCESS_INPUT]->attr.dtype.vx_type) + { + attr2.dtype.vx_type = VSI_NN_TYPE_FLOAT64; + } + else + { + attr2.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } + + attr2.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + + if ((2 == resolved_dim_count && resolved_dim[0] < 3 && resolved_dim[1] < 3) + || (1 == resolved_dim_count && resolved_dim[0] < 3) + || (resolved_dim[resolved_dim_count - 1] > 3) + || resolved_dim_count > 3) + { + memset(&attr, 0, sizeof(attr)); + attr.size[0] = resolved_dim_count; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_UINT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + axis_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)resolved_dim, + &attr); + if( NULL == axis_tensor ) + { + VSILOGE("Create axis_tensor fail.(reduce)"); + return VSI_FAILURE; + } + + self->nn_param.reduce.local.axis_tensor = axis_tensor; + input_t = reshaped_input1->t; + output_t = reshaped_output1->t; + status = op_comput_reduce_mean(self, + axis_tensor, + self->nn_param.reduce.keep_dim, + input_t, + output_t); + } + else if (3 == resolved_dim[resolved_dim_count - 1] && resolved_dim_count < 3) + { + if (1 == resolved_dim_count) + { + memset(&attr, 0, sizeof(attr)); + attr.size[0] = resolved_dim_count; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_UINT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + axis_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)resolved_dim, + &attr); + if( NULL == axis_tensor ) + { + VSILOGE("Create axis_tensor fail.(reduce)"); + return VSI_FAILURE; + } + + self->nn_param.reduce.local.axis_tensor = axis_tensor; + input_t = reshaped_input1->t; + output_t = reshaped_output1->t; + status = op_comput_reduce_mean(self, + axis_tensor, + self->nn_param.reduce.keep_dim, + input_t, + output_t); + } + else if (2 == resolved_dim_count) + { + vsi_bool enable_reshape = TRUE; + + attr2.size[resolved_dim[0]] = 1; + attr2.vtl = FALSE; + mean_tmp_tensor = vsi_nn_CreateTensor(self->graph, &attr2); + self->nn_param.reduce.local2->reshaped_tmp = mean_tmp_tensor; + re_sizes[resolved_dim[0]] = 1; + memset(&attr, 0, sizeof(attr)); + attr.size[0] = 1; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_UINT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + axis_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)&resolved_dim[0], + &attr); + if( NULL == axis_tensor ) + { + VSILOGE("Create axis_tensor fail.(reduce)"); + return VSI_FAILURE; + } + self->nn_param.reduce.local.axis_tensor = axis_tensor; + status = op_comput_reduce_mean(self, + axis_tensor, + self->nn_param.reduce.keep_dim, + reshaped_input1->t, + mean_tmp_tensor->t); + + enable_reshape = caculate_reshape_size(&dim_num, re_sizes, re_sizes2, + resolved_dim, resolved_dim_count); + + if (enable_reshape) + { + 
self->nn_param.reduce.local2->reshaped_input = + vsi_nn_reshape_tensor(self->graph, mean_tmp_tensor, re_sizes2, dim_num); + re_sizes2[resolved_dim[resolved_dim_count - 1]] = 1; + self->nn_param.reduce.local2->reshaped_output = + vsi_nn_reshape_tensor(self->graph, reshaped_output1, re_sizes2, dim_num); + } + + memset(&attr, 0, sizeof(attr)); + attr.size[0] = 1; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_UINT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + axis_tensor2 = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)&resolved_dim[1], + &attr); + if( NULL == axis_tensor2 ) + { + VSILOGE("Create axis_tensor fail.(reduce)"); + return VSI_FAILURE; + } + + self->nn_param.reduce.local2->axis_tensor2 = axis_tensor2; + if (self->nn_param.reduce.local2->reshaped_input) + { + input_t = self->nn_param.reduce.local2->reshaped_input->t; + } + else + { + input_t = mean_tmp_tensor->t; + } + if (self->nn_param.reduce.local2->reshaped_output) + { + output_t = self->nn_param.reduce.local2->reshaped_output->t; + } + else + { + output_t = reshaped_output1->t; + } + status = op_comput_reduce_mean(self, + axis_tensor2, + self->nn_param.reduce.keep_dim, + input_t, + output_t); + } + } + else if (3 == resolved_dim_count) + { + vsi_bool enable_reshape = TRUE; + + attr2.size[resolved_dim[0]] = 1; + attr2.size[resolved_dim[1]] = 1; + attr2.vtl = FALSE; + mean_tmp_tensor = vsi_nn_CreateTensor(self->graph, &attr2); + self->nn_param.reduce.local2->reshaped_tmp = mean_tmp_tensor; + re_sizes[resolved_dim[0]] = 1; + re_sizes[resolved_dim[1]] = 1; + memset(&attr, 0, sizeof(attr)); + attr.size[0] = 2; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_UINT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + axis_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)&resolved_dim[0], + &attr); + if( NULL == axis_tensor ) + { + VSILOGE("Create axis_tensor fail.(reduce)"); + return VSI_FAILURE; + } + self->nn_param.reduce.local.axis_tensor = axis_tensor; + status = op_comput_reduce_mean(self, + axis_tensor, + self->nn_param.reduce.keep_dim, + reshaped_input1->t, + mean_tmp_tensor->t); + if (3 == resolved_dim[resolved_dim_count - 1]) + { + enable_reshape = caculate_reshape_size(&dim_num, re_sizes, re_sizes2, + resolved_dim, resolved_dim_count); + if (enable_reshape) + { + self->nn_param.reduce.local2->reshaped_input = + vsi_nn_reshape_tensor(self->graph, mean_tmp_tensor, re_sizes2, dim_num); + re_sizes2[resolved_dim[resolved_dim_count - 1]] = 1; + self->nn_param.reduce.local2->reshaped_output = + vsi_nn_reshape_tensor(self->graph, reshaped_output1, re_sizes2, dim_num); + } + } + + memset(&attr, 0, sizeof(attr)); + attr.size[0] = 1; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_UINT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + axis_tensor2 = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)&resolved_dim[2], + &attr); + if( NULL == axis_tensor2 ) + { + VSILOGE("Create axis_tensor fail.(reduce)"); + return VSI_FAILURE; + } + + self->nn_param.reduce.local2->axis_tensor2 = axis_tensor2; + if (self->nn_param.reduce.local2->reshaped_input) + { + input_t = self->nn_param.reduce.local2->reshaped_input->t; + } + else + { + input_t = mean_tmp_tensor->t; + } + if (self->nn_param.reduce.local2->reshaped_output) + { + output_t = self->nn_param.reduce.local2->reshaped_output->t; + } + else + { + output_t = reshaped_output1->t; + } + status = op_comput_reduce_mean(self, + axis_tensor2, + 
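/*
 * The multi-axis REDUCE_MEAN above is lowered into two vxTensorMeanNode
 * calls through a temporary tensor.  That is numerically safe because
 * averaging over one axis and then the other gives the same result as
 * averaging over both at once; small self-contained check:
 */
#include <stdio.h>

int main(void)
{
    const float m[2][3] = { { 1.0f, 2.0f, 3.0f }, { 5.0f, 6.0f, 7.0f } };
    float two_pass = 0.0f, one_pass = 0.0f;
    int i, j;

    for (i = 0; i < 2; i++)
    {
        float row_mean = 0.0f;
        for (j = 0; j < 3; j++)
        {
            row_mean += m[i][j];
            one_pass += m[i][j];
        }
        two_pass += row_mean / 3.0f;
    }
    two_pass /= 2.0f;
    one_pass /= 6.0f;
    printf("%f %f\n", two_pass, one_pass); /* both print 4.000000 */
    return 0;
}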
self->nn_param.reduce.keep_dim, + input_t, + output_t); + } + + } + else if (self->nn_param.reduce.type == VSI_NN_REDUCE_SUM || + self->nn_param.reduce.type == VSI_NN_REDUCE_MAX || + self->nn_param.reduce.type == VSI_NN_REDUCE_MIN || + self->nn_param.reduce.type == VSI_NN_REDUCE_ALL || + self->nn_param.reduce.type == VSI_NN_REDUCE_ANY || + self->nn_param.reduce.type == VSI_NN_REDUCE_PROD) + { + status = vsi_nn_internal_compute_node( self ); + } + +#else + vsi_nn_kernel_info_t kernel_info; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_reduce"; + kernel_info.type = VX_KERNEL_TYPE_CPU; + kernel_info.kernel = vx_kernel_REDUCE_list; + kernel_info.kernel_index = 0; + kernel_info.init_index = 0; + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) free(kernel_info.resource_name); + if( NULL == self->n ) + { + return VSI_FAILURE; + } + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } +#endif + return status; +} /* op_compute() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + if (self->nn_param.reduce.type == VSI_NN_REDUCE_SUM || + self->nn_param.reduce.type == VSI_NN_REDUCE_MAX || + self->nn_param.reduce.type == VSI_NN_REDUCE_MIN || + self->nn_param.reduce.type == VSI_NN_REDUCE_ALL || + self->nn_param.reduce.type == VSI_NN_REDUCE_ANY || + self->nn_param.reduce.type == VSI_NN_REDUCE_PROD) + { + return vsi_nn_internal_optimize_node(self, direction ); + } + else + { + return VSI_SUCCESS; + } +} /* op_optimize() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static void op_set_reduce_param_value(vsi_nn_nn_param_t *nn_param, + vsi_nn_op_t type_name, + vx_int32 *axis, + vx_uint32 axis_num, + vx_bool keep_dim + ) +{ + if (type_name == VSI_NN_OP_REDUCESUM_INTERNAL) + { + nn_param->reducesum_internal.axis = axis; + nn_param->reducesum_internal.axis_num = axis_num; + nn_param->reducesum_internal.keep_dim = keep_dim; + } + else if (type_name == VSI_NN_OP_REDUCEMAX_INTERNAL) + { + nn_param->reducemax_internal.axis = axis; + nn_param->reducemax_internal.axis_num = axis_num; + nn_param->reducemax_internal.keep_dim = keep_dim; + } + else if (type_name == VSI_NN_OP_REDUCEMIN_INTERNAL) + { + nn_param->reducemin_internal.axis = axis; + nn_param->reducemin_internal.axis_num = axis_num; + nn_param->reducemin_internal.keep_dim = keep_dim; + } + else if (type_name == VSI_NN_OP_REDUCEPROD_INTERNAL) + { + nn_param->reduceprod_internal.axis = axis; + nn_param->reduceprod_internal.axis_num = axis_num; + nn_param->reduceprod_internal.keep_dim = keep_dim; + } + else if (type_name == VSI_NN_OP_REDUCEALL_INTERNAL) + { + nn_param->reduceall_internal.axis = axis; + nn_param->reduceall_internal.axis_num = axis_num; + nn_param->reduceall_internal.keep_dim = keep_dim; + } + else if (type_name == VSI_NN_OP_REDUCEANY_INTERNAL) + { + nn_param->reduceany_internal.axis = axis; + nn_param->reduceany_internal.axis_num = axis_num; + nn_param->reduceany_internal.keep_dim = keep_dim; + } +} + +static vsi_bool optimzation_input_size( + const int32_t* shape_x, const size_t rank_x, + int32_t* out_shape_x, int32_t* out_rank_x, + const int32_t* resolved_dim, const int32_t resolved_dim_count, + int32_t* resolved_dim_out, int32_t* resolved_dim_out_count + ) +{ + int32_t i, j, k, out_i; + vx_bool is_change = vx_false_e; + int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t shape_out[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t rank_out; + int32_t dim_out; + + out_i = 0; + for (i = 0; i < resolved_dim[0]; i++) + { + out_shape_x[out_i++] = shape_x[i]; + } + + j = 0; + dim_out = 0; + for (i = 0; i < (resolved_dim_count - 1); i++) + { + if ((resolved_dim[i] + 1) == resolved_dim[i + 1]) + { + if (is_change) + { + shape[j++] = shape_x[resolved_dim[i + 1]]; + } + else + { + shape[j++] = shape_x[resolved_dim[i]]; + shape[j++] = shape_x[resolved_dim[i + 1]]; + is_change = vx_true_e; + } + } + else + { + if (is_change) + { + vsi_nn_kernel_optimize_element_shape( + shape, (size_t)j, + shape_out, &rank_out ); + if (2 == rank_out && 1 == shape_out[1]) + { + rank_out--; + } + for (k = 0; k < rank_out; k++) + { + resolved_dim_out[dim_out++] = out_i; + out_shape_x[out_i++] = shape_out[k]; + } + j = 0; + is_change = vx_false_e; + } + else + { + resolved_dim_out[dim_out++] = out_i; + out_shape_x[out_i++] = shape_x[resolved_dim[i]]; + } + + for ( k = resolved_dim[i] + 1; k < resolved_dim[i + 1]; k++ ) + { + out_shape_x[out_i++] = shape_x[k]; + } + } + } + + if (is_change) + { + vsi_nn_kernel_optimize_element_shape( + shape, (size_t)j, + shape_out, &rank_out ); + if (2 == rank_out && 1 == shape_out[1]) + { + rank_out--; + } + for (k = 0; k < rank_out; k++) + { + resolved_dim_out[dim_out++] = out_i; + out_shape_x[out_i++] = shape_out[k]; + } + } + else + { + resolved_dim_out[dim_out++] = out_i; + out_shape_x[out_i++] = shape_x[resolved_dim[resolved_dim_count - 1]]; + } + + for (i = resolved_dim[resolved_dim_count - 1] + 1; i < (int32_t)rank_x; i++) + { + out_shape_x[out_i++] = shape_x[i]; + } + + if (1 == out_i) + { + out_shape_x[out_i++] = 1; + } + + *out_rank_x = out_i; + 
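+
+    /*
+     * Illustrative example (hypothetical shapes, and assuming
+     * vsi_nn_kernel_optimize_element_shape() folds the contiguous span
+     * {3, 5} into a single element dimension of 15): for
+     * shape_x = {4, 3, 5, 2} with resolved_dim = {1, 2}, the two adjacent
+     * reduction axes are merged, giving out_shape_x = {4, 15, 2},
+     * *out_rank_x = 3, resolved_dim_out = {1} and
+     * *resolved_dim_out_count = 1.
+     */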
*resolved_dim_out_count = dim_out; + + return TRUE; +} + +static vsi_bool op_set_reduce_axis( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + int32_t* out_shape_x, int32_t* out_rank_x + ) +{ + uint32_t i = 0, j = 0; + int32_t resolved_dim[4] = {-1, -1, -1, -1}; + int32_t resolved_dim2[4] = {-1, -1, -1, -1}; + int32_t resolved_dim_count = 0; + int32_t resolved_dim_count2 = 0; + vsi_bool is_loop = TRUE; + + for (i = 0; i < self->nn_param.reduce.axis_num; i++) + { + vx_int32 current_axis = self->nn_param.reduce.axis[i] < 0 ? \ + inputs[0]->attr.dim_num + self->nn_param.reduce.axis[i] : self->nn_param.reduce.axis[i]; + + if (current_axis < 0 || current_axis >= (vx_int32)inputs[0]->attr.dim_num) + { + VSILOGE("error: the axis value must be in the range [0, %d)\n", inputs[0]->attr.dim_num); + return FALSE; + } + + for (j = 0; j < 4; j++) + { + if (resolved_dim[j] == current_axis) + break; + } + + if (j == 4) + resolved_dim[resolved_dim_count++] = current_axis; + } + + for (i = resolved_dim_count; is_loop && (i > 0); i--) + { + is_loop = FALSE; + for (j = 1; j < i; j++) + { + if (resolved_dim[j] < resolved_dim[j - 1]) + { + vx_int32 temp = 0; + temp = resolved_dim[j]; + resolved_dim[j] = resolved_dim[j - 1]; + resolved_dim[j - 1] = temp; + is_loop = TRUE; + } + } + } + + if (resolved_dim_count > 1) + { + j = 0; + for (i = 0; i < (uint32_t)resolved_dim_count; i++) + { + if (inputs[POST_PROCESS_OUTPUT]->attr.size[resolved_dim[i]] > 1) + { + resolved_dim[j] = resolved_dim[i]; + j++; + } + } + if (j == 0) + { + j = 1; + } + resolved_dim_count = j; + } + + if (( 1 == resolved_dim_count )) + { + resolved_dim2[0] = resolved_dim[0]; + resolved_dim_count2 = resolved_dim_count; + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + out_shape_x[i] = (int32_t)(inputs[0]->attr.size[i]); + } + *out_rank_x = inputs[0]->attr.dim_num; + } + else + { + optimzation_input_size( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + out_shape_x, out_rank_x, resolved_dim, resolved_dim_count, + resolved_dim2, &resolved_dim_count2 ); + } + + + for (i = 0; i < (uint32_t)resolved_dim_count2; i++) + { + self->nn_param.reduce.local2->axes[i] = resolved_dim2[i]; + } + self->nn_param.reduce.local2->axes_num = resolved_dim_count2; + + return TRUE; +} + + +static vsi_bool op_set_reduce_internal + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_op_t type_name + ) +{ + uint32_t i = 0; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* tmp_output_tensor[2] = {NULL, NULL}; + vsi_bool use_virtual_tensor = TRUE; + uint32_t re_sizes[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t re_sizes2[VSI_NN_MAX_DIM_NUM] = {1}; + vsi_nn_tensor_t* new_output = NULL; + uint32_t dim_num; + vx_int32 resolved_dim_count = 0; + int32_t * axes = self->nn_param.reduce.local2->axes; + vx_bool is_use_float = vx_false_e; + resolved_dim_count = self->nn_param.reduce.local2->axes_num; + + if ((VSI_NN_OP_REDUCESUM_INTERNAL == type_name) || (VSI_NN_OP_REDUCEPROD_INTERNAL == type_name)) + { + is_use_float = vx_true_e; + } + + vsi_nn_internal_init_node_wksp( self ); + + memcpy( &attr, &inputs[POST_PROCESS_INPUT]->attr, sizeof(vsi_nn_tensor_attr_t) ); + dim_num = inputs[POST_PROCESS_INPUT]->attr.dim_num; + + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + re_sizes[i] = 1; + re_sizes2[i] = 1; + } + + for (i = 0; i < dim_num; i++) + { + attr.size[i] = inputs[POST_PROCESS_OUTPUT]->attr.size[i]; + re_sizes[i] = inputs[POST_PROCESS_OUTPUT]->attr.size[i]; + } + + if 
(is_use_float) + { + if ( (VSI_NN_TYPE_FLOAT32 == inputs[POST_PROCESS_INPUT]->attr.dtype.vx_type) + || (VSI_NN_TYPE_INT32 == inputs[POST_PROCESS_INPUT]->attr.dtype.vx_type) + || (VSI_NN_TYPE_UINT32 == inputs[POST_PROCESS_INPUT]->attr.dtype.vx_type) + || (VSI_NN_TYPE_UINT64 == inputs[POST_PROCESS_INPUT]->attr.dtype.vx_type) + ) + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + } + else if (VSI_NN_TYPE_FLOAT64 == inputs[POST_PROCESS_INPUT]->attr.dtype.vx_type) + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT64; + } + else + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + } + + if (1 == resolved_dim_count) + { + if (3 == axes[resolved_dim_count - 1]) + { + vsi_bool enable_reshape = TRUE; + enable_reshape = caculate_reshape_size(&dim_num, re_sizes, re_sizes2, + axes, resolved_dim_count); + if (enable_reshape) + { + self->nn_param.reduce.local2->reshaped_input = + vsi_nn_reshape_tensor(self->graph, inputs[0], re_sizes2, dim_num); + re_sizes2[axes[resolved_dim_count - 1]] = 1; + self->nn_param.reduce.local2->reshaped_output = + vsi_nn_reshape_tensor(self->graph, outputs[0], re_sizes2, dim_num); + } + } + + curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + op_set_reduce_param_value(&(curr->node->nn_param), type_name, + axes, 1, self->nn_param.reduce.keep_dim); + if (self->nn_param.reduce.local2->reshaped_input) + { + curr->inputs[0] = self->nn_param.reduce.local2->reshaped_input; + } + else + { + curr->inputs[0] = inputs[0]; + } + if (self->nn_param.reduce.local2->reshaped_output) + { + curr->outputs[0] = self->nn_param.reduce.local2->reshaped_output; + } + else + { + curr->outputs[0] = outputs[0]; + } + vsi_nn_internal_setup_node(self, curr); + } + else if (2 == resolved_dim_count) + { + attr.size[axes[0]] = 1; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + tmp_output_tensor[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + re_sizes[axes[0]] = 1; + + curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + op_set_reduce_param_value(&(curr->node->nn_param), type_name, + &(axes[0]), 1, vx_true_e); + curr->inputs[0] = inputs[POST_PROCESS_INPUT]; + curr->outputs[0] = tmp_output_tensor[0]->t; + vsi_nn_internal_setup_node( self, curr ); + + if (3 == axes[resolved_dim_count - 1]) + { + vsi_bool enable_reshape = TRUE; + enable_reshape = caculate_reshape_size(&dim_num, re_sizes, re_sizes2, + axes, resolved_dim_count); + + if (enable_reshape) + { + self->nn_param.reduce.local2->reshaped_input = + vsi_nn_reshape_tensor(self->graph, tmp_output_tensor[0]->t, re_sizes2, dim_num); + re_sizes2[axes[resolved_dim_count - 1]] = 1; + new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], re_sizes2, dim_num); + } + else + { + re_sizes[axes[1]] = 1; + new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], re_sizes, dim_num); + } + } + else + { + re_sizes[axes[1]] = 1; + new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], re_sizes, dim_num); + } + + curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + op_set_reduce_param_value(&(curr->node->nn_param), type_name, + &(axes[1]), 1, vx_true_e); + if (self->nn_param.reduce.local2->reshaped_input) + { + curr->inputs[0] = self->nn_param.reduce.local2->reshaped_input; + } + else + { + curr->inputs[0] = tmp_output_tensor[0]->t; + } + curr->outputs[0] = new_output; + self->nn_param.reduce.local2->reshaped_output = new_output; + vsi_nn_internal_setup_node(self, curr); + } + else if (3 == resolved_dim_count) + { + attr.size[axes[0]] = 1; + attr.vtl = use_virtual_tensor; + 
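+        /*
+         * Three remaining axes are handled as a chain of three single-axis
+         * internal nodes: input -> tmp_output_tensor[0] (axes[0]) ->
+         * tmp_output_tensor[1] (axes[1]) -> reshaped output (axes[2]).
+         * The two temporaries are virtual tensors carrying the progressively
+         * reduced shape, and keep_dim is forced to true at each step; the
+         * caller's keep_dim is realized by reshaping the final output tensor.
+         */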
attr.is_const = FALSE; + tmp_output_tensor[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + attr.size[axes[1]] = 1; + tmp_output_tensor[1] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + re_sizes[axes[0]] = 1; + re_sizes[axes[1]] = 1; + + curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + op_set_reduce_param_value(&(curr->node->nn_param), type_name, + &(axes[0]), 1, vx_true_e); + curr->inputs[0] = inputs[POST_PROCESS_INPUT]; + curr->outputs[0] = tmp_output_tensor[0]->t; + vsi_nn_internal_setup_node( self, curr ); + + curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + op_set_reduce_param_value(&(curr->node->nn_param), type_name, + &(axes[1]), 1, vx_true_e); + curr->inputs[0] = tmp_output_tensor[0]->t; + curr->outputs[0] = tmp_output_tensor[1]->t; + vsi_nn_internal_setup_node( self, curr ); + + + if (3 == axes[resolved_dim_count - 1]) + { + vsi_bool enable_reshape = TRUE; + enable_reshape = caculate_reshape_size(&dim_num, re_sizes, re_sizes2, + axes, resolved_dim_count); + if (enable_reshape) + { + self->nn_param.reduce.local2->reshaped_input = + vsi_nn_reshape_tensor(self->graph, tmp_output_tensor[1]->t, re_sizes2, dim_num); + re_sizes2[axes[resolved_dim_count - 1]] = 1; + new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], re_sizes2, dim_num); + } + else + { + re_sizes[axes[2]] = 1; + new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], re_sizes, dim_num); + } + } + else + { + re_sizes[axes[2]] = 1; + new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], re_sizes, dim_num); + } + + curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + op_set_reduce_param_value(&(curr->node->nn_param), type_name, + &(axes[2]), 1, vx_true_e); + if (self->nn_param.reduce.local2->reshaped_input) + { + curr->inputs[0] = self->nn_param.reduce.local2->reshaped_input; + } + else + { + curr->inputs[0] = tmp_output_tensor[1]->t; + } + curr->outputs[0] = new_output; + self->nn_param.reduce.local2->reshaped_output = new_output; + vsi_nn_internal_setup_node(self, curr); + } + else + { + VSILOGE("error: resolved_dim_count is %d\n", resolved_dim_count); + return FALSE; + } + return TRUE; +} + + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + vsi_nn_tensor_t* reshape_in_t[1] = { NULL }; + vsi_nn_tensor_t* reshape_out_t[1] = { NULL }; + int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_rank = 0; + int32_t j; + if (self->nn_param.reduce.type != VSI_NN_REDUCE_MEAN && + self->nn_param.reduce.type != VSI_NN_REDUCE_SUM && + self->nn_param.reduce.type != VSI_NN_REDUCE_MAX && + self->nn_param.reduce.type != VSI_NN_REDUCE_MIN && + self->nn_param.reduce.type != VSI_NN_REDUCE_ALL && + self->nn_param.reduce.type != VSI_NN_REDUCE_ANY && + self->nn_param.reduce.type != VSI_NN_REDUCE_PROD) + { + VSILOGE("The type of reduce is not supported now.(reduce)"); + return FALSE; + } + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + int valid_dim_num = inputs[0]->attr.dim_num; + uint32_t i; + char dim_map[VSI_NN_MAX_DIM_NUM] = {0}; + + for (i = 0; i < self->nn_param.reduce.axis_num; i++) + { + int index = self->nn_param.reduce.axis[i]; + if (dim_map[index] == 0) { + dim_map[index] = 1; + valid_dim_num --; + } + } + + if (self->nn_param.reduce.keep_dim) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + if (dim_map[i] == 0) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + else + { + outputs[0]->attr.size[i] 
= 1; + } + } + } + else + { + int index = 0; + if (valid_dim_num == 0) + { + outputs[0]->attr.dim_num = 2; + outputs[0]->attr.size[0] = 1; + outputs[0]->attr.size[1] = 1; + } + else + { + outputs[0]->attr.dim_num = valid_dim_num; + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + if (dim_map[i] == 0) + { + outputs[0]->attr.size[index] = inputs[0]->attr.size[i]; + index++; + } + } + if (1 == outputs[0]->attr.dim_num) + { + outputs[0]->attr.dim_num = 2; + outputs[0]->attr.size[1] = 1; + } + } + } + } + + if (FALSE == op_set_reduce_axis(self, inputs, shape, &new_rank)) + { + VSILOGE("op_set_reduce_axis error"); + return FALSE; + } + reshape_in_t[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shape, new_rank ); + + self->nn_param.reduce.local2->reshaped_input1 = reshape_in_t[0]; + for (j = 0; j < self->nn_param.reduce.local2->axes_num; j++) + { + shape[self->nn_param.reduce.local2->axes[j]] = 1; + } + + reshape_out_t[0] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shape, new_rank ); + self->nn_param.reduce.local2->reshaped_output1 = reshape_out_t[0]; + if (self->nn_param.reduce.type == VSI_NN_REDUCE_SUM) + { + ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCESUM_INTERNAL); + } + else if (self->nn_param.reduce.type == VSI_NN_REDUCE_MAX) + { + ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCEMAX_INTERNAL); + } + else if (self->nn_param.reduce.type == VSI_NN_REDUCE_MIN) + { + ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCEMIN_INTERNAL); + } + else if (self->nn_param.reduce.type == VSI_NN_REDUCE_PROD) + { + ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCEPROD_INTERNAL); + } + else if (self->nn_param.reduce.type == VSI_NN_REDUCE_ALL) + { + ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCEALL_INTERNAL); + } + else if (self->nn_param.reduce.type == VSI_NN_REDUCE_ANY) + { + ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCEANY_INTERNAL); + } + + + return ret; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.reduce.local.axis_tensor != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.reduce.local.axis_tensor)); + } + + if (self->nn_param.reduce.local2 != NULL) + { + if (self->nn_param.reduce.local2->axis_tensor2 != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.reduce.local2->axis_tensor2)); + } + if (self->nn_param.reduce.local2->reshaped_tmp != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.reduce.local2->reshaped_tmp)); + } + if (self->nn_param.reduce.local2->reshaped_output != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.reduce.local2->reshaped_output)); + } + if (self->nn_param.reduce.local2->reshaped_input != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.reduce.local2->reshaped_input)); + } + if (self->nn_param.reduce.local2->reshaped_output1 != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.reduce.local2->reshaped_output1)); + } + if (self->nn_param.reduce.local2->reshaped_input1 != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.reduce.local2->reshaped_input1)); + } + free(self->nn_param.reduce.local2); + self->nn_param.reduce.local2 = NULL; + } + + if (self->nn_param.reduce.type == VSI_NN_REDUCE_SUM || + self->nn_param.reduce.type == VSI_NN_REDUCE_MAX || + self->nn_param.reduce.type == VSI_NN_REDUCE_MIN || + self->nn_param.reduce.type == VSI_NN_REDUCE_ALL || + self->nn_param.reduce.type == 
VSI_NN_REDUCE_ANY || + self->nn_param.reduce.type == VSI_NN_REDUCE_PROD) + { + vsi_nn_internal_deinit_node_wksp(self); + } + else + { + vsi_nn_op_common_deinit(self); + } + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + self->nn_param.reduce.local2 = + (vsi_nn_reduce_lcl2_data_t *)malloc(sizeof(vsi_nn_reduce_lcl2_data_t)); + if (NULL == self->nn_param.reduce.local2) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.reduce.local2, 0, sizeof(vsi_nn_reduce_lcl2_data_t)); + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ REDUCE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c new file mode 100644 index 0000000..41cc43b --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c @@ -0,0 +1,454 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + + +static vsi_status _reduce_internal_op_compute + ( + const char * kernel_name, + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; + uint32_t rank_out = 0; + int32_t axis = 0; + int32_t new_axis = 0; + uint32_t axis_size = 0; + vsi_bool ret; + vsi_nn_kernel_param_t * param = NULL; + + if( NULL == self ) + { + return VSI_FAILURE; + } + status = VSI_FAILURE; + + + + param =vsi_nn_kernel_param_create(); + + if (strcmp(kernel_name, "reducemax_internal") == 0) + { + vsi_nn_reducemax_internal_param * p = &(self->nn_param.reducemax_internal); + axis = p->axis[0]; + } + else if (strcmp(kernel_name, "reducemin_internal") == 0) + { + vsi_nn_reducemin_internal_param * p = &(self->nn_param.reducemin_internal); + axis = p->axis[0]; + } + else if (strcmp(kernel_name, "reduceprod_internal") == 0) + { + vsi_nn_reduceprod_internal_param * p = &(self->nn_param.reduceprod_internal); + axis = p->axis[0]; + } + else if (strcmp(kernel_name, "reduceall_internal") == 0) + { + vsi_nn_reduceall_internal_param * p = &(self->nn_param.reduceall_internal); + axis = p->axis[0]; + } + else if (strcmp(kernel_name, "reduceany_internal") == 0) + { + vsi_nn_reduceany_internal_param * p = &(self->nn_param.reduceany_internal); + axis = p->axis[0]; + } + else + { + vsi_nn_kernel_param_release( ¶m ); + return VSI_FAILURE; + + } + + ret = vsi_nn_kernel_optimize_reduce_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + &axis, 1, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], &rank_in, shapes[1], &rank_out, + &new_axis, &axis_size); + + // Add params + vsi_nn_kernel_param_add_int32( param, "axis", new_axis ); + + if( ret ) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shapes[1], rank_out ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, + &reshape_tensors[0], 1, + &reshape_tensors[1], 1, param ); + + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + } + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + return status; +} /* op_compute() */ + +static vsi_bool _reduce_internal_op_setup + ( + const char * kernel_name, + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + int32_t axis = 0; + + if (strcmp(kernel_name, "reducemax_internal") == 0) + { + vsi_nn_reducemax_internal_param * p = &(self->nn_param.reducemax_internal); + + axis = p->axis[0]; + if (axis < 0) + { + axis = axis + inputs[0]->attr.dim_num; + if (axis < 0) + { + VSILOGW("error input axis value %d input dim num is %d", + p->axis[0], inputs[0]->attr.dim_num); + return FALSE; + } + p->axis[0] = axis; + } + } + else if 
(strcmp(kernel_name, "reducemin_internal") == 0) + { + vsi_nn_reducemin_internal_param * p = &(self->nn_param.reducemin_internal); + + axis = p->axis[0]; + if (axis < 0) + { + axis = axis + inputs[0]->attr.dim_num; + if (axis < 0) + { + VSILOGW("error input axis value %d input dim num is %d", + p->axis[0], inputs[0]->attr.dim_num); + return FALSE; + } + p->axis[0] = axis; + } + } + else if (strcmp(kernel_name, "reduceprod_internal") == 0) + { + vsi_nn_reduceprod_internal_param * p = &(self->nn_param.reduceprod_internal); + + axis = p->axis[0]; + if (axis < 0) + { + axis = axis + inputs[0]->attr.dim_num; + if (axis < 0) + { + VSILOGW("error input axis value %d input dim num is %d", + p->axis[0], inputs[0]->attr.dim_num); + return FALSE; + } + p->axis[0] = axis; + } + } + else if (strcmp(kernel_name, "reduceall_internal") == 0) + { + vsi_nn_reduceall_internal_param * p = &(self->nn_param.reduceall_internal); + + axis = p->axis[0]; + if (axis < 0) + { + axis = axis + inputs[0]->attr.dim_num; + if (axis < 0) + { + VSILOGW("error input axis value %d input dim num is %d", + p->axis[0], inputs[0]->attr.dim_num); + return FALSE; + } + p->axis[0] = axis; + } + } + else if (strcmp(kernel_name, "reduceany_internal") == 0) + { + vsi_nn_reduceany_internal_param * p = &(self->nn_param.reduceany_internal); + + axis = p->axis[0]; + if (axis < 0) + { + axis = axis + inputs[0]->attr.dim_num; + if (axis < 0) + { + VSILOGW("error input axis value %d input dim num is %d", + p->axis[0], inputs[0]->attr.dim_num); + return FALSE; + } + p->axis[0] = axis; + } + } + else + { + return FALSE; + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + uint32_t i = 0; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num - 1; + + for (i = 0; i < (uint32_t)axis; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + + for (i = axis; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i + 1]; + } + + if (inputs[0]->attr.dim_num == 1) + { + outputs[0]->attr.dim_num = 1; + outputs[0]->attr.size[0] = 1; + } + } + + return TRUE; +} /* op_setup() */ + + +static vsi_bool op_check_reduceall_internal + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(REDUCEALL_INTERNAL, 1, 1) + IO_TYPE(D_I8, D_I8) + IO_TYPE(D_BOOL8, D_BOOL8) + END_IO_TYPE_DECL(REDUCEALL_INTERNAL) + if(!VALIDATE_OP_IO_TYPES(REDUCEALL_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + + +static vsi_bool op_check_reduceany_internal + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(REDUCEANY_INTERNAL, 1, 1) + IO_TYPE(D_I8, D_I8) + IO_TYPE(D_BOOL8, D_BOOL8) + END_IO_TYPE_DECL(REDUCEANY_INTERNAL) + if(!VALIDATE_OP_IO_TYPES(REDUCEANY_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + + +static vsi_bool op_check_reducemax_internal + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(REDUCEMAX_INTERNAL, 
1, 1) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(REDUCEMAX_INTERNAL) + if(!VALIDATE_OP_IO_TYPES(REDUCEMAX_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_check_reducemin_internal + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(REDUCEMIN_INTERNAL, 1, 1) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(REDUCEMIN_INTERNAL) + if(!VALIDATE_OP_IO_TYPES(REDUCEMIN_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_check_reduceprod_internal + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(REDUCEPROD_INTERNAL, 1, 1) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(REDUCEPROD_INTERNAL) + if(!VALIDATE_OP_IO_TYPES(REDUCEPROD_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define DEF_REDUCE_INTERNAL_OP(name, kernel_name) \ + static vsi_status op_compute_##kernel_name \ + ( \ + vsi_nn_node_t * self, \ + vsi_nn_tensor_t ** inputs, \ + vsi_nn_tensor_t ** outputs \ + ) \ + { \ + return _reduce_internal_op_compute( ""#kernel_name, self, inputs, outputs ); \ + } \ + static vsi_bool op_setup_##kernel_name \ + ( \ + vsi_nn_node_t * self, \ + vsi_nn_tensor_t ** inputs, \ + vsi_nn_tensor_t ** outputs \ + ) \ + { \ + return _reduce_internal_op_setup( ""#kernel_name, self, inputs, outputs ); \ + } \ +DEF_OP_REG \ + ( \ + /* op_name */ name, \ + /* init */ NULL, \ + /* compute */ op_compute_##kernel_name, \ + /* deinit */ vsi_nn_op_common_deinit, \ + /* check */ op_check_##kernel_name, \ + /* setup */ op_setup_##kernel_name, \ + /* optimize */ NULL, \ + /* input_num */ 1, \ + /* output_num */ 1 \ + ) + + 
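+/*
+ * Note (illustrative, not part of the original source): each invocation below
+ * stamps out thin op_compute_<kernel_name> / op_setup_<kernel_name> wrappers
+ * that forward to the shared _reduce_internal_op_compute() /
+ * _reduce_internal_op_setup() helpers with the kernel name passed as a string,
+ * and then registers the op. For instance,
+ * DEF_REDUCE_INTERNAL_OP( REDUCEMAX_INTERNAL, reducemax_internal )
+ * expands roughly to:
+ *
+ *   static vsi_status op_compute_reducemax_internal( vsi_nn_node_t * self,
+ *       vsi_nn_tensor_t ** inputs, vsi_nn_tensor_t ** outputs )
+ *   {
+ *       return _reduce_internal_op_compute( "reducemax_internal",
+ *                                           self, inputs, outputs );
+ *   }
+ *   // ...plus the matching op_setup_reducemax_internal()...
+ *   DEF_OP_REG( REDUCEMAX_INTERNAL, NULL, op_compute_reducemax_internal,
+ *               vsi_nn_op_common_deinit, op_check_reducemax_internal,
+ *               op_setup_reducemax_internal, NULL, 1, 1 );
+ */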
+DEF_REDUCE_INTERNAL_OP( REDUCEMAX_INTERNAL, reducemax_internal ); +DEF_REDUCE_INTERNAL_OP( REDUCEMIN_INTERNAL, reducemin_internal ); +DEF_REDUCE_INTERNAL_OP( REDUCEPROD_INTERNAL, reduceprod_internal ); +DEF_REDUCE_INTERNAL_OP( REDUCEALL_INTERNAL, reduceall_internal ); +DEF_REDUCE_INTERNAL_OP( REDUCEANY_INTERNAL, reduceany_internal ); + +#undef DEF_REDUCE_INTERNAL_OP +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c new file mode 100644 index 0000000..3749f8a --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c @@ -0,0 +1,170 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; + uint32_t rank_out = 0; + int32_t new_axis[VSI_NN_MAX_DIM_NUM]; + uint32_t axis_size = 0; + vsi_bool ret; + + ret = vsi_nn_kernel_optimize_reduce_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + (int32_t *)(self->nn_param.reducesum_internal.axis), + self->nn_param.reducesum_internal.axis_num, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], &rank_in, shapes[1], &rank_out, + new_axis, &axis_size); + + if( ret ) + { + self->nn_param.reducesum_internal.local->reshaped_input = + vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shapes[0], rank_in ); + self->nn_param.reducesum_internal.local->reshaped_output = + vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shapes[1], rank_out ); + + self->n = vxTensorReduceSumNode( self->graph->g, + self->nn_param.reducesum_internal.local->reshaped_input->t, + self->nn_param.reducesum_internal.local->reshaped_output->t, + (uint32_t *)new_axis, axis_size, FALSE); + + } + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
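+       In the current flow this is not strictly required: the parent REDUCE op
+       (vsi_nn_op_reduce.c) computes the output shape and wires this node up
+       with already-reshaped input/output tensors, so outputs[0]->attr is
+       expected to be populated before op_setup() runs. A sketch of explicit
+       inference would mirror _reduce_internal_op_setup() in
+       vsi_nn_op_reduce_internal.c: copy inputs[0]->attr.size and drop (or set
+       to 1, when keep_dim is requested) the entries named in
+       reducesum_internal.axis.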
*/ + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.reducesum_internal.local = + (vsi_nn_reducesum_lcl_data_t *)malloc(sizeof(vsi_nn_reducesum_lcl_data_t)); + + if (NULL == self->nn_param.reducesum_internal.local) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.reducesum_internal.local, 0, sizeof(vsi_nn_reducesum_lcl_data_t)); + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.reducesum_internal.local != NULL) + { + if (self->nn_param.reducesum_internal.local->reshaped_input != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.reducesum_internal.local->reshaped_input)); + } + if (self->nn_param.reducesum_internal.local->reshaped_output != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.reducesum_internal.local->reshaped_output)); + } + + free(self->nn_param.reducesum_internal.local); + self->nn_param.reducesum_internal.local = NULL; + } + + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ REDUCESUM_INTERNAL, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c new file mode 100644 index 0000000..21e9bf3 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c @@ -0,0 +1,229 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status _comparisons_op_compute + ( + const char * kernel_name, + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t new_rank = 0; + vsi_bool ret; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_relational_ops_type_t op_type = self->nn_param.relational_ops.op; + + if( NULL == self ) + { + return VSI_FAILURE; + } + status = VSI_FAILURE; + + // TODO: This optimzie is a hack for gpu path, + // it should be moved to gpu kernel setup. + ret = vsi_nn_kernel_optimize_eltwise_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + (int32_t *)inputs[1]->attr.size, inputs[1]->attr.dim_num, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + if( ret ) + { + // Add params + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], (uint32_t*)shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shapes[2], new_rank ); + + if (shapes[1][3] > shapes[0][3] && new_rank == 4) + { + vsi_nn_tensor_t* reshape_tmp; + reshape_tmp = reshape_tensors[0]; + reshape_tensors[0] = reshape_tensors[1]; + reshape_tensors[1] = reshape_tmp; + if (VSI_NN_RELATIONAL_OPS_GREAT == op_type) + { + op_type = VSI_NN_RELATIONAL_OPS_LESS; + } + else if (VSI_NN_RELATIONAL_OPS_LESS == op_type) + { + op_type = VSI_NN_RELATIONAL_OPS_GREAT; + } + else if (VSI_NN_RELATIONAL_OPS_GREAT_EQUAL == op_type) + { + op_type = VSI_NN_RELATIONAL_OPS_LESS_EQUAL; + } + else if (VSI_NN_RELATIONAL_OPS_LESS_EQUAL == op_type) + { + op_type = VSI_NN_RELATIONAL_OPS_GREAT_EQUAL; + } + } + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "operation", op_type ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, + &reshape_tensors[0], 2, + &reshape_tensors[2], 1, param ); + + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + vsi_nn_ReleaseTensor( &reshape_tensors[2] ); + + vsi_nn_kernel_param_release( ¶m ); + } + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* _eltwise_op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(RELATIONAL_OPS, 2, 1) + IO_TYPE(D_F16, D_F16, D_BOOL8) + IO_TYPE(D_F16, D_I16|Q_DFP, D_BOOL8) + IO_TYPE(D_F16, D_I8|Q_DFP, D_BOOL8) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_BOOL8) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_BOOL8) + IO_TYPE(D_I16|Q_DFP, D_F16, D_BOOL8) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_BOOL8) + IO_TYPE(D_I8|Q_DFP, D_F16, D_BOOL8) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_BOOL8) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_BOOL8) + IO_TYPE(D_BF16, D_BF16, D_BOOL8) + IO_TYPE(D_BOOL8, D_BOOL8, D_BOOL8) + IO_TYPE(D_F32, 
D_F32, D_BOOL8) + IO_TYPE(D_I32, D_I32, D_BOOL8) + END_IO_TYPE_DECL(RELATIONAL_OPS) + if(!VALIDATE_OP_IO_TYPES(RELATIONAL_OPS, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i, out_rank, in1_rank, in2_rank; + uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_bool ret = TRUE; + + in1_rank = inputs[0]->attr.dim_num; + in2_rank = inputs[1]->attr.dim_num; + out_rank = vsi_nn_max( in1_rank, in2_rank ); + + for(i = 0; i < out_rank; i++) + { + uint32_t sz0, sz1; + sz0 = i < in1_rank ? inputs[0]->attr.size[i] : 1; + sz1 = i < in2_rank ? inputs[1]->attr.size[i] : 1; + shape[i] = vsi_nn_max( sz0, sz1 ); + } + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = out_rank; + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(uint32_t) ); + } + else + { + uint32_t total_size_got; + uint32_t total_size_expected; + total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); + total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, + outputs[0]->attr.dim_num ); + if( total_size_expected != total_size_got ) + { + VSILOGW("Output size mismatch, expect %d, but got %d", + total_size_expected, total_size_got); + ret = FALSE; + } + } + + return ret; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define DEF_COMPARISONS_OP(name, kernel_name) \ + static vsi_status op_compute_##kernel_name \ + ( \ + vsi_nn_node_t * self, \ + vsi_nn_tensor_t ** inputs, \ + vsi_nn_tensor_t ** outputs \ + ) \ + { \ + return _comparisons_op_compute( ""#kernel_name, self, inputs, outputs ); \ + } \ +DEF_OP_REG(name, NULL, op_compute_##kernel_name, vsi_nn_op_common_deinit, op_check, op_setup, NULL, 2, 1) + +DEF_COMPARISONS_OP( RELATIONAL_OPS, relational_ops ); + + +#undef DEF_COMPARISONS_OP + +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu.c new file mode 100644 index 0000000..7045e61 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu.c @@ -0,0 +1,129 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + status = VSI_FAILURE; + + self->n = vxActivationLayer( + self->graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU, + 0, + 0, + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(RELU, 1, 1) + /* IO_TYPE(INPUT, OUTPUT) */ + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_BF16) + + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(RELU) + if(!VALIDATE_OP_IO_TYPES(RELU, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RELU, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu1.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu1.c new file mode 100644 index 0000000..a1ba17c --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu1.c @@ -0,0 +1,94 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + status = VSI_FAILURE; + + self->n = vxActivationLayer( + self->graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU1, + 0, + 0, + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_RELU, self, inputs, outputs); + + return ret; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RELU1, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu6.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu6.c new file mode 100644 index 0000000..9020e7d --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu6.c @@ -0,0 +1,95 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" + + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + status = VSI_FAILURE; + + self->n = vxActivationLayer( + self->graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU6, + 0, + 0, + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_RELU, self, inputs, outputs); + + return ret; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RELU6, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c new file mode 100644 index 0000000..5b312cb --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c @@ -0,0 +1,158 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_util.h" + + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_internal_deinit_node_wksp( self ); + return status; +} /* op_deinit() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_relu_keras_param * p; + vsi_nn_internal_node_t* curr = NULL; + float alpha = 0; + float max_value = 0; + float threshold = 0; + uint32_t max_raw = 0; + if( NULL == self ) + { + return FALSE; + } + + p = &(self->nn_param.relu_keras); + alpha = p->alpha; + max_value = p->max_value; + threshold = p->threshold; + + max_raw = *(uint32_t*)&max_value; + + vsi_nn_internal_init_node_wksp(self); + + if (alpha == 0 && max_raw == VSI_NN_FLOAT32_INF && threshold == 0) + { + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU, 0, 0); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + } + else if (alpha == 1.0f && max_value == 1.0f && threshold == -1.0f) + { + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU1, 0, 0); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + } + else if (alpha == 0 && max_value == 6.0f && threshold == 0) + { + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + } + else if (alpha == 0.1 && max_value == VSI_NN_FLOAT32_INF && threshold == 0) + { + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LEAKY_RELU, 0, 0); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + } + else + { + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU_KERAS_INTERNAL, 0, 0); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + curr->node->nn_param.relu_keras_internal.max_value = max_value; + curr->node->nn_param.relu_keras_internal.alpha = alpha; + curr->node->nn_param.relu_keras_internal.threshold = threshold; + } + + vsi_nn_internal_setup_node(self, curr); + + return TRUE; +} + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RELU_KERAS, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras_internal.c new file mode 100644 index 0000000..8c2c914 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras_internal.c @@ -0,0 +1,181 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* 
+* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_rank = 0; + vsi_bool ret; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_relu_keras_internal_param * p = NULL; + float alpha = 0.0f; + float max_value = 0.0f; + float threshold = 0.0f; + + if( NULL == self ) + { + return status; + } + + p = &(self->nn_param.relu_keras_internal); + alpha = p->alpha; + max_value = p->max_value; + threshold = p->threshold; + param = vsi_nn_kernel_param_create(); + + ret = vsi_nn_kernel_optimize_element_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); + + vsi_nn_kernel_param_add_float32( param, "alpha", alpha ); + vsi_nn_kernel_param_add_float32( param, "max_value", max_value ); + vsi_nn_kernel_param_add_float32( param, "threshold", threshold ); + + if( ret ) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shape, new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shape, new_rank ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "relu_keras", + &reshape_tensors[0], 1, + &reshape_tensors[1], 1, param ); + + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + } + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; + +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(RELU_KERAS_INTERNAL, 1, 1) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + 
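
The internal kernel selected above ("relu_keras") receives alpha, max_value and threshold through the kernel parameter list. For reference, the element-wise function those parameters describe, following the documented behaviour of Keras' relu(x, alpha, max_value, threshold), can be sketched in scalar form; this is a reference loop, not the GPU shader.

```c
#include <stdio.h>

/* Scalar reference for Keras-style ReLU: values at or above max_value saturate,
 * values below threshold are scaled by alpha relative to the threshold,
 * everything in between passes through unchanged. */
static float relu_keras_ref(float x, float alpha, float max_value, float threshold)
{
    if (x >= max_value) return max_value;
    if (x < threshold)  return alpha * (x - threshold);
    return x;
}

int main(void)
{
    /* alpha=0, max_value=6, threshold=0 reproduces ReLU6 */
    printf("%f %f %f\n",
           relu_keras_ref(-1.0f, 0.0f, 6.0f, 0.0f),  /* 0 */
           relu_keras_ref( 3.0f, 0.0f, 6.0f, 0.0f),  /* 3 */
           relu_keras_ref( 9.0f, 0.0f, 6.0f, 0.0f)); /* 6 */
    return 0;
}
```
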
IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(RELU_KERAS_INTERNAL) + if(!VALIDATE_OP_IO_TYPES(RELU_KERAS_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + uint32_t i; + for (i = 0; i < _VSI_NN_RELU_KERAS_INTERNAL_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.relu_keras_internal.local.local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.relu_keras_internal.local.local_tensor[i])); + self->nn_param.relu_keras_internal.local.local_tensor[i] = NULL; + } + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RELU_KERAS_INTERNAL, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relun.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relun.c new file mode 100644 index 0000000..1cbf229 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relun.c @@ -0,0 +1,112 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" + + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + float top = self->nn_param.relun.relu_clamp_top; + float bottom = self->nn_param.relun.relu_clamp_bottom; + vsi_enum func = -1; + + if(top == 1 && bottom == -1) + { + func = VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU1; + } + else if(top == 6) + { + func = VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU6; + } + else + { + VSILOGE("Do not support this feature"); + return VSI_FAILURE; + } + + self->n = vxActivationLayer( + self->graph->g, + inputs[0]->t, + func, + 0, + 0, + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_RELU, self, inputs, outputs); + + return ret; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RELUN, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reorg.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reorg.c new file mode 100644 index 0000000..10731cd --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reorg.c @@ -0,0 +1,138 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
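
The `op_compute()` of vsi_nn_op_relun.c above only lowers two fixed clamps, and the RELU6 branch keys off the top clamp alone. As a reference, RELU1 clamps to [-1, 1] and RELU6 clamps to [0, 6]; a scalar sketch:

```c
#include <stdio.h>

/* Reference behaviour of the two activations op_compute() can select. */
static float relu1_ref(float x) { return x < -1.0f ? -1.0f : (x > 1.0f ? 1.0f : x); }
static float relu6_ref(float x) { return x <  0.0f ?  0.0f : (x > 6.0f ? 6.0f : x); }

int main(void)
{
    printf("%f %f\n", relu1_ref(-2.5f), relu6_ref(7.0f)); /* -1 6 */
    return 0;
}
```
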
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_link_list.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + + status = VSI_FAILURE; + self->n = vxReorgLayer( self->graph->g, + inputs[0]->t, + self->nn_param.reorg.stride, + outputs[0]->t); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(REORG, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + + /* HW 9.0 */ + IO_TYPE(D_BF16, D_BF16) + END_IO_TYPE_DECL(REORG) + if(!VALIDATE_OP_IO_TYPES(REORG, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t stride = node->nn_param.reorg.stride; + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + memcpy( outputs[0]->attr.size, inputs[0]->attr.size, + VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + + outputs[0]->attr.size[0] = inputs[0]->attr.size[0] / stride; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1] / stride; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2] * stride * stride; + } + + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ REORG, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c new file mode 100644 index 0000000..91fcc24 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -0,0 +1,164 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission 
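
The reorg `op_setup()` above only derives the output shape: spatial dimensions shrink by `stride` while channels grow by `stride * stride`, so the element count is preserved. A small sketch of that rule, with the sanity checks `op_check()` could also perform; indexing follows the code (size[0], size[1] spatial, size[2] channels).

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Shape rule used by op_setup() of the REORG op; sketch only. */
static void reorg_output_shape(const uint32_t in[3], uint32_t stride, uint32_t out[3])
{
    assert(stride != 0 && in[0] % stride == 0 && in[1] % stride == 0);
    out[0] = in[0] / stride;
    out[1] = in[1] / stride;
    out[2] = in[2] * stride * stride;
    /* element count is unchanged by the rearrangement */
    assert((uint64_t)out[0] * out[1] * out[2] == (uint64_t)in[0] * in[1] * in[2]);
}

int main(void)
{
    uint32_t in[3] = {26, 26, 64}, out[3];
    reorg_output_shape(in, 2, out); /* 26x26x64 -> 13x13x256 */
    printf("%u %u %u\n", out[0], out[1], out[2]);
    return 0;
}
```
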
notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* + *If reshape is un-initialized, we need add a tensorcopy + * when input and output are initialized. + */ + if(inputs[0]->t != NULL && outputs[0]->t != NULL && + self->nn_param.reshape.local.initialized == FALSE) + { + self->n = vxTensorCopyNode(self->graph->g, + inputs[0]->t, outputs[0]->t); + if(NULL == self->n) + { + VSILOGE( "Create vxTensorCopyNode fail." ); + return VSI_FAILURE; + } + VSILOGD("Create a copy node for reshape"); + } + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + //TODO: Check tensor shapes. + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + uint32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + memcpy(shape, self->nn_param.reshape.size, + sizeof(uint32_t) * self->nn_param.reshape.dim_num); + ret = vsi_nn_CalcReshapeTensor(inputs[0], + outputs[0], + shape, + self->nn_param.reshape.dim_num); + } + + return ret; +} /* op_setup() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_status status; + vsi_bool ret; + + status = VSI_SUCCESS; + ret = TRUE; + if(self->nn_param.reshape.local.initialized == FALSE) + { + VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + if( direction == VSI_NN_OPTIMIZE_BACKWARD ) + { + if(NULL == inputs[0]->t && NULL != outputs[0]->t) + { + inputs[0]->t = vxReshapeTensor( outputs[0]->t, + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num ); + if( inputs[0]->t == NULL ) + { + status = VSI_FAILURE; + } + self->nn_param.reshape.local.initialized = TRUE; + } + } + else + { + if(NULL == outputs[0]->t) + { + ret = vsi_nn_ReshapeTensor( self->graph, inputs[0], outputs[0], + self->nn_param.reshape.size, self->nn_param.reshape.dim_num ); + if( ret == FALSE ) + { + status = VSI_FAILURE; + } + self->nn_param.reshape.local.initialized = TRUE; + } + } + } + + return status; +} /* op_optimize() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RESHAPE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ 
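
The reshape `op_check()` above is still a TODO; a minimal validation it could grow into is that the target shape preserves the element count, with at most one inferred dimension. The sketch below uses 0xFFFFFFFF as an illustrative "infer this dimension" wildcard, a common convention that is assumed here and not taken from this file.

```c
#include <stdint.h>
#include <stdio.h>

/* Sketch: resolve a reshape target with at most one wildcard dimension and
 * verify that the reshape preserves the number of elements. */
static int resolve_reshape(uint64_t in_elems, uint32_t *shape, uint32_t dims)
{
    uint64_t known = 1;
    int wildcard = -1;
    for (uint32_t i = 0; i < dims; i++) {
        if (shape[i] == 0xFFFFFFFFu) {
            if (wildcard >= 0) return 0;      /* at most one inferred dim */
            wildcard = (int)i;
        } else {
            known *= shape[i];
        }
    }
    if (wildcard >= 0) {
        if (known == 0 || in_elems % known) return 0;
        shape[wildcard] = (uint32_t)(in_elems / known);
        known *= shape[wildcard];
    }
    return known == in_elems;                 /* reshape must preserve element count */
}

int main(void)
{
    uint32_t shape[3] = {13, 0xFFFFFFFFu, 4};
    printf("%d %u\n", resolve_reshape(13 * 2 * 4, shape, 3), shape[1]); /* 1 2 */
    return 0;
}
```
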
op_optimize, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c new file mode 100644 index 0000000..9fed06c --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -0,0 +1,509 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +/**************************************************************************** +* This operation originally come from: +* https://github.com/pjreddie/darknet/tree/master/src/upsample_layer.c +* which is used by YOLOv3 +*****************************************************************************/ + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" + +#define _ARG_NUM (1) +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +#define USE_OVX_API TRUE + +#if (USE_OVX_API == FALSE) +extern vx_kernel_description_t * vx_kernel_RESIZE_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_resize_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &node->nn_param.resize; + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ +#define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( 
VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_INT32, factor ); +#undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status op_pre_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_kernel_info_t * kernel_info + ) +{ + vsi_nn_type_e inputFormat = inputs[0]->attr.dtype.vx_type; + vsi_nn_type_e outputFormat = outputs[0]->attr.dtype.vx_type; + vsi_nn_type_e enableFormat; + float scale_factor = self->nn_param.resize.factor; + + enableFormat = ((inputFormat == VSI_NN_TYPE_FLOAT16 && outputFormat == VSI_NN_TYPE_FLOAT16) || + (inputFormat == VSI_NN_TYPE_INT16 && outputFormat == VSI_NN_TYPE_INT16) || + (inputFormat == VSI_NN_TYPE_INT8 && outputFormat == VSI_NN_TYPE_INT8) || + (inputFormat == VSI_NN_TYPE_UINT8 && outputFormat == VSI_NN_TYPE_UINT8)); + + if(scale_factor == 0.5f && enableFormat && inputs[0]->attr.size[1] % 2 == 0 + && inputs[0]->attr.size[1] * inputs[0]->attr.size[2] < 65536) + { + kernel_info->type = VX_KERNEL_TYPE_VX; + kernel_info->init_index = 1; + if (inputFormat == VX_TYPE_FLOAT16 || inputFormat == VX_TYPE_INT16 ) + { + kernel_info->kernel_index = 1; + } + else + { + kernel_info->kernel_index = 2; + } + } + else + { + kernel_info->type = VX_KERNEL_TYPE_CPU; + kernel_info->kernel_index = 0; + kernel_info->init_index = 0; + } + + return VSI_SUCCESS; +} + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_IO_NUM]; + vx_border_t border; + int32_t sizes[4] = {0}; + uint32_t dims = 2; + uint32_t input_size[4] = {1, 1, 1, 1}; + uint32_t output_size[4] = {1, 1, 1, 1}; + uint32_t i; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + for(i = 0; i < inputs[0]->attr.dim_num; ++i) + { + input_size[i] = inputs[0]->attr.size[i]; + } + for(i = 0; i < outputs[0]->attr.dim_num; ++i) + { + output_size[i] = outputs[0]->attr.size[i]; + } + + + sizes[0] = input_size[0]; + sizes[1] = input_size[1] * input_size[2] * input_size[3]; + self->nn_param.resize.local.local_tensor[0] = vxReshapeTensor(inputs[0]->t, sizes, dims); + + sizes[0] = output_size[0]; + sizes[1] = output_size[1] * output_size[2] * output_size[3]; + self->nn_param.resize.local.local_tensor[1] = vxReshapeTensor(outputs[0]->t, sizes, dims); + + params[0] = (vx_reference)self->nn_param.resize.local.local_tensor[0]; + params[1] = (vx_reference)self->nn_param.resize.local.local_tensor[1]; + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _IO_NUM ); + + border.mode = VX_BORDER_REPLICATE; + border.constant_value.U32 = 0; + status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute, + NULL +}; +#endif + +static vsi_bool _is_same_shape + ( + vsi_nn_tensor_t * inputs, + uint32_t *sizes, + uint32_t dims + ) +{ + uint32_t i = 0; + + if (inputs->attr.dim_num != dims) + return FALSE; + + for (i = 0; i < dims; i++) + { + if (sizes[i] != inputs->attr.size[i]) + return FALSE; + } + + return TRUE; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; +#if (USE_OVX_API == TRUE) + if ( ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) + && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type + || VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type)) + || _is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num) ) + { + status = vsi_nn_internal_compute_node( self ); + } + else + { + vx_nn_scale_params_t para; + switch (self->nn_param.resize.type) + { + case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR: + para.type = VX_INTERPOLATION_NEAREST_NEIGHBOR; break; + case VSI_NN_INTERPOLATION_BILINEAR: + para.type = VX_INTERPOLATION_BILINEAR; break; + case VSI_NN_INTERPOLATION_AREA: + para.type = VX_INTERPOLATION_AREA; break; + default: + para.type = VX_INTERPOLATION_NEAREST_NEIGHBOR; + } + self->n = vxTensorScaleNode( self->graph->g, inputs[0]->t, ¶, + sizeof(vx_nn_scale_params_t), outputs[0]->t ); + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + } +#else + + vsi_nn_kernel_info_t kernel_info; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name = "vsi_nn_kernel_resize"; + kernel_info.kernel = vx_kernel_RESIZE_list; + + op_pre_compute(self, inputs, outputs, &kernel_info); + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) free(kernel_info.resource_name); + if( NULL == self->n ) + { + return VSI_FAILURE; + } + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } +#endif + return status; +} /* op_compute() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + if ( ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) + && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type + || VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type) ) + || _is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num) ) + { + return vsi_nn_internal_optimize_node(self, direction ); + } + else + { + return VSI_SUCCESS; + } +} /* op_optimize() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
*/ + float factor = self->nn_param.resize.factor; + vsi_nn_internal_node_t* curr = NULL; + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + if (factor != 0) + { + outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); + outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + } + else + { + outputs[0]->attr.size[0] = self->nn_param.resize.size[0]; + outputs[0]->attr.size[1] = self->nn_param.resize.size[1]; + } + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + + if ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) + && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type)) + { + vsi_nn_internal_init_node_wksp( self ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_INTERNAL, 0, 0 ); + curr->node->nn_param.resize_internal.align_corners = self->nn_param.resize.align_corners; + curr->node->nn_param.resize_internal.factor = self->nn_param.resize.factor; + curr->node->nn_param.resize_internal.half_pixel_centers = self->nn_param.resize.half_pixel_centers; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + } + else if ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) + && (VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type)) + { + vsi_nn_internal_init_node_wksp( self ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_NEAREST_INTERNAL, 0, 0 ); + curr->node->nn_param.resize_nearest_internal.align_corners = self->nn_param.resize.align_corners; + curr->node->nn_param.resize_nearest_internal.factor = self->nn_param.resize.factor; + curr->node->nn_param.resize_nearest_internal.half_pixel_centers = self->nn_param.resize.half_pixel_centers; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + } + else if (_is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num)) + { + vsi_nn_internal_init_node_wksp( self ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ +#if (USE_OVX_API == FALSE) + uint32_t i; + for (i = 0; i < _VSI_NN_RESIZE_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.resize.local.local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.resize.local.local_tensor[i])); + self->nn_param.resize.local.local_tensor[i] = NULL; + } + } +#endif + if ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) + && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type + || VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type)) + { + vsi_nn_internal_deinit_node_wksp(self); + } + else + { + vsi_nn_op_common_deinit(self); + } + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + if (vsi_nn_compareVersion(self->graph, 1, 1, 14) == -1) + { + self->nn_param.resize.align_corners = FALSE; + self->nn_param.resize.half_pixel_centers = FALSE; + } + + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RESIZE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ 
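
The align_corners and half_pixel_centers flags forwarded to the internal resize ops above select different source-coordinate mappings. For reference, a sketch of the conventional TensorFlow-style definitions these flags usually refer to; the actual behaviour lives in the GPU kernels elsewhere in this tree, not in this file.

```c
#include <stdio.h>

/* Conventional source coordinate for output pixel `dst` when resizing
 * in_size -> out_size along one axis. Sketch only. */
static float resize_src_coord(int dst, int in_size, int out_size,
                              int align_corners, int half_pixel_centers)
{
    if (align_corners && out_size > 1) {
        return dst * (float)(in_size - 1) / (float)(out_size - 1);
    }
    float scale = (float)in_size / (float)out_size;
    if (half_pixel_centers) {
        return ((float)dst + 0.5f) * scale - 0.5f;
    }
    return (float)dst * scale;
}

int main(void)
{
    /* Upscaling 4 -> 8: where output pixel 3 samples from under each convention. */
    printf("%f %f %f\n",
           resize_src_coord(3, 4, 8, 0, 0),  /* 1.5   (default)            */
           resize_src_coord(3, 4, 8, 1, 0),  /* ~1.29 (align_corners)      */
           resize_src_coord(3, 4, 8, 0, 1)); /* 1.25  (half_pixel_centers) */
    return 0;
}
```
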
op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c new file mode 100644 index 0000000..0a7f893 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c @@ -0,0 +1,188 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + + vsi_status status = VSI_FAILURE; + int32_t align_corners = self->nn_param.resize_internal.align_corners; + int32_t half_pixel_centers = self->nn_param.resize_internal.half_pixel_centers; + vsi_nn_kernel_param_t * param = NULL; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners ); + vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "resize_bilinear", + &inputs[0], 1, + &outputs[0], 1, param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; + +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(RESIZE_INTERNAL, 1, 1) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + END_IO_TYPE_DECL(RESIZE_INTERNAL) + if(!VALIDATE_OP_IO_TYPES(RESIZE_INTERNAL, self, inputs, 
self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. */ + float factor = self->nn_param.resize_internal.factor; + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + if (factor != 0) + { + outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); + outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + } + else + { + outputs[0]->attr.size[0] = self->nn_param.resize.size[0]; + outputs[0]->attr.size[1] = self->nn_param.resize.size[1]; + } + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.resize_internal.lcl_data_ptr) + { + free(self->nn_param.resize_internal.lcl_data_ptr); + self->nn_param.resize_internal.lcl_data_ptr = NULL; + } + + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.resize_internal.lcl_data_ptr = \ + (vsi_nn_resize_in_lcl_data *)malloc(sizeof(vsi_nn_resize_in_lcl_data)); + if (NULL == self->nn_param.resize_internal.lcl_data_ptr) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.resize_internal.lcl_data_ptr, 0, sizeof(vsi_nn_resize_in_lcl_data)); + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RESIZE_INTERNAL, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_nearest_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_nearest_internal.c new file mode 100644 index 0000000..a0e0d48 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_nearest_internal.c @@ -0,0 +1,187 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + int32_t align_corners = self->nn_param.resize_nearest_internal.align_corners; + int32_t half_pixel_centers = self->nn_param.resize_nearest_internal.half_pixel_centers; + vsi_nn_kernel_param_t * param = NULL; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners ); + vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "resize_nearest", + &inputs[0], 1, + &outputs[0], 1, param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; + +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(RESIZE_NEAREST_INTERNAL, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + END_IO_TYPE_DECL(RESIZE_NEAREST_INTERNAL) + if(!VALIDATE_OP_IO_TYPES(RESIZE_NEAREST_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + float factor = self->nn_param.resize_nearest_internal.factor; + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + if (factor != 0) + { + outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); + outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + } + else + { + outputs[0]->attr.size[0] = self->nn_param.resize.size[0]; + outputs[0]->attr.size[1] = self->nn_param.resize.size[1]; + } + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.resize_nearest_internal.lcl_data_ptr) + { + free(self->nn_param.resize_nearest_internal.lcl_data_ptr); + self->nn_param.resize_nearest_internal.lcl_data_ptr = NULL; + } + + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + + +static vsi_status 
op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.resize_nearest_internal.lcl_data_ptr = + (vsi_nn_resize_nearest_in_lcl_data *)malloc(sizeof(vsi_nn_resize_nearest_in_lcl_data)); + if (NULL == self->nn_param.resize_nearest_internal.lcl_data_ptr) + { + return VX_ERROR_NO_MEMORY; + } + + memset(self->nn_param.resize_nearest_internal.lcl_data_ptr, 0, sizeof(vsi_nn_resize_nearest_in_lcl_data)); + + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RESIZE_NEAREST_INTERNAL, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c new file mode 100644 index 0000000..86b9ad3 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c @@ -0,0 +1,384 @@ + +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (1) +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +#define USE_OVX_API TRUE + +#if (USE_OVX_API == FALSE) +extern vx_kernel_description_t * vx_kernel_REVERSE_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_reverse_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &node->nn_param.reverse; + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ +#define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_INT32, axis[0] ); +#undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status op_pre_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_kernel_info_t * kernel_info + ) +{ + vsi_nn_type_e in_dataType = inputs[0]->attr.dtype.vx_type; + vsi_nn_type_e out_dataType = outputs[0]->attr.dtype.vx_type; + uint32_t i; + uint32_t changed_num = 1; + + for( i = self->nn_param.reverse.axis[0] + 1; i < inputs[0]->attr.dim_num; i++ ) + { + changed_num *= inputs[0]->attr.size[inputs[0]->attr.dim_num - 1 - i]; + } + + if ((in_dataType != VSI_NN_TYPE_INT16 || out_dataType != VSI_NN_TYPE_INT16) + && self->nn_param.reverse.axis[0] != 0) + { + VSILOGE("tensorReverse shader unsupport format or axis:%d!\n", + self->nn_param.reverse.axis[0]); + return VSI_FAILURE; + } + else if (changed_num >= 65536) + { + VSILOGE("tensorReverse unsupport change num:%d!\n", changed_num); + return VSI_FAILURE; + } + + kernel_info->kernel_index = 1; + + return VSI_SUCCESS; +} + +static void reshape_tensor_shape + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vx_reference * params, + uint32_t index + ) +{ + uint32_t i; + int32_t size[4] = {0}; + int32_t size0[4] = {1, 1, 1, 1}; + uint32_t dims = 2; + + for( i = 0; i < input->attr.dim_num; i++ ) + { + size0[i] = input->attr.size[i]; + } + + size[0] = size0[0] * size0[1] * size0[2]; + size[1] = size0[3]; + + self->nn_param.reverse.local.local_tensor[index] = + vxReshapeTensor(input->t, size, dims); + params[index] = (vx_reference)self->nn_param.reverse.local.local_tensor[index]; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_border_t border; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + reshape_tensor_shape(self, inputs[0], params, 0); + reshape_tensor_shape(self, outputs[0], params, 1); + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + border.mode = VX_BORDER_REPLICATE; + border.constant_value.U32 = 0; + status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute, + NULL +}; +#endif +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; +#if (USE_OVX_API == TRUE) + vx_nn_tensor_reverse_params_t para; + vsi_nn_reverse_param * p; + int32_t axes[VSI_NN_MAX_DIM_NUM] = {0}; + p = &self->nn_param.reverse; + memcpy(axes, p->axis, sizeof(int32_t) * p->axis_num); + para.axis = axes; + para.numberOfAxis = p->axis_num; + self->n = vxTensorReverse( self->graph->g, inputs[0]->t, ¶, + sizeof(vx_nn_tensor_reverse_params_t), outputs[0]->t ); + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } +#else + vsi_nn_kernel_info_t kernel_info; + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_reverse"; + kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); + kernel_info.kernel = vx_kernel_REVERSE_list; + kernel_info.init_index = 1; + + op_pre_compute(self, inputs, outputs, &kernel_info); + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) free(kernel_info.resource_name); + if( NULL == self->n ) + { + return VSI_FAILURE; + } + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } +#endif + return status; +} /* op_compute() */ + + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(REVERSE, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_U8|Q_DFP, D_U8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I32|Q_DFP, D_I32|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM) + IO_TYPE(D_U8|Q_SYM_PC, D_U8|Q_SYM_PC) + IO_TYPE(D_I8|Q_SYM_PC, D_I8|Q_SYM_PC) + IO_TYPE(D_I16|Q_SYM_PC, D_I16|Q_SYM_PC) + IO_TYPE(D_I32|Q_SYM_PC, D_I32|Q_SYM_PC) + IO_TYPE(D_U8, D_U8) + IO_TYPE(D_I8, D_I8) + IO_TYPE(D_I16, D_I16) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + + /* HW 9.0 */ + IO_TYPE(D_BF16, D_BF16) + END_IO_TYPE_DECL(REVERSE) + if(!VALIDATE_OP_IO_TYPES(REVERSE, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ +#if (USE_OVX_API == FALSE) + uint32_t i; + for (i = 0; i < _VSI_NN_REVERSE_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.reverse.local.local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.reverse.local.local_tensor[i])); + self->nn_param.reverse.local.local_tensor[i] = NULL; + } + } +#endif + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar 
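
The REVERSE `op_compute()` above hands the axis list to vxTensorReverse. For reference, reversing along one axis only flips the index on that axis; a flat-buffer sketch for a 3-D tensor, independent of the OpenVX node and using the same size[0]-fastest layout as the surrounding code.

```c
#include <stdint.h>
#include <stdio.h>

/* Sketch: reverse a W*H*C float tensor along a single axis (0=W, 1=H, 2=C). */
static void reverse_axis_ref(const float *src, float *dst,
                             uint32_t w, uint32_t h, uint32_t c, int axis)
{
    for (uint32_t z = 0; z < c; z++)
      for (uint32_t y = 0; y < h; y++)
        for (uint32_t x = 0; x < w; x++) {
            uint32_t sx = (axis == 0) ? (w - 1 - x) : x;
            uint32_t sy = (axis == 1) ? (h - 1 - y) : y;
            uint32_t sz = (axis == 2) ? (c - 1 - z) : z;
            dst[(z * h + y) * w + x] = src[(sz * h + sy) * w + sx];
        }
}

int main(void)
{
    float in[6] = {0, 1, 2, 3, 4, 5}, out[6]; /* a 3 x 2 x 1 tensor */
    reverse_axis_ref(in, out, 3, 2, 1, 0);
    for (int i = 0; i < 6; i++) printf("%g ", out[i]); /* 2 1 0 5 4 3 */
    printf("\n");
    return 0;
}
```
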
*/ +DEF_OP_REG + ( + /* op_name */ REVERSE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rnn.c new file mode 100644 index 0000000..38df152 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rnn.c @@ -0,0 +1,204 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_constraint_check.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_tensor_t* act_tensor = NULL; + vx_nn_rnn_params_t param; + + memset(¶m, 0, sizeof(vx_nn_rnn_params_t)); + + act_tensor = vsi_nn_VariableToTensor(self, + (uint8_t*)&self->nn_param.rnn.activation, + VSI_NN_TYPE_INT32); + + if (!act_tensor) + { + VSILOGE("RNN->Create Activation Tensor failed"); + status = VSI_FAILURE; + } + else + { + param.weights = REQUIRED_IO(inputs[1]); + param.recurrent_weights = REQUIRED_IO(inputs[2]); + param.bias = REQUIRED_IO(inputs[3]); + param.state_in = REQUIRED_IO(inputs[4]); + param.activation = REQUIRED_IO(act_tensor); + self->n = vxRNNLayer( + self->graph->g, + REQUIRED_IO(inputs[0]), + ¶m, + sizeof(param), + /*state output*/REQUIRED_IO(outputs[0]), + /*output*/REQUIRED_IO(outputs[1])); + + vsi_nn_ReleaseTensor(&act_tensor); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t input_idx = 0; + do { + vsi_bool break_early = FALSE; + + // input_idx = 0 : inputs[0].shape = shape(batch_size, input_size) + if (input_idx >= self->input.num) break; + break_early = (inputs[input_idx]->attr.dim_num != 2); + if (break_early) break; + input_idx ++; + + // input_idx = 1 
: inputs[1].shape = shape(num_units, input_size) + if (input_idx >= self->input.num) break; + break_early = (inputs[input_idx]->attr.dim_num != 2); + if (break_early) break; + input_idx ++; + + // input_idx = 2 : inputs[2].shape = shape(num_units, num_units) + if (input_idx >= self->input.num) break; + break_early = (inputs[input_idx]->attr.dim_num != 2); + if (break_early) break; + input_idx ++; + + // input_idx = 3 : inputs[3].shape = shape(num_units) + if (input_idx >= self->input.num) break; + break_early = (inputs[input_idx]->attr.dim_num != 1); + if (break_early) break; + input_idx ++; + + // input_idx = 4 : inputs[4].shape = shape(batch_size, num_units) + if (input_idx >= self->input.num) break; + break_early = (inputs[input_idx]->attr.dim_num != 2); + if (break_early) break; + input_idx ++; + + return TRUE; + } while(0); + + { + BEGIN_IO_TYPE_DECL(RNN, 5, 1) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + END_IO_TYPE_DECL(RNN) + if(!VALIDATE_OP_IO_TYPES(RNN, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + } + + VSILOGE("RNN check shape faild at Input[%d]", input_idx); + + + + return FALSE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { + outputs[0]->attr.size[0] = inputs[4]->attr.size[0]; + outputs[0]->attr.size[1] = inputs[4]->attr.size[1]; + outputs[1]->attr.size[0] = inputs[4]->attr.size[0]; + outputs[1]->attr.size[1] = inputs[4]->attr.size[1]; + + outputs[0]->attr.dim_num = outputs[1]->attr.dim_num = inputs[4]->attr.dim_num; + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (NULL == self) + { + return VSI_FAILURE; + } + + if (NULL != self->n) + { + vxReleaseNode(&self->n); + self->n = NULL; + } + + return VSI_SUCCESS; +} /* op_deinit() */ + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RNN, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 5, + /* output_num */ 2 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c new file mode 100644 index 0000000..a6fa7b8 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c @@ -0,0 +1,382 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the 
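
For reference, the computation the RNN op above wires into vxRNNLayer is the basic recurrent step state_out = act(W x + R h + b), with outputs[0] as the state output and outputs[1] as the output, as noted in op_compute(). A reference loop follows; shapes match the comments in op_check(), and tanhf() stands in for whichever activation nn_param.rnn.activation selects.

```c
#include <math.h>
#include <stdio.h>

/* Basic RNN step, reference only:
 * input [batch, input_size], weights [num_units, input_size],
 * recurrent_weights [num_units, num_units], bias [num_units],
 * state_in / state_out [batch, num_units]. */
static void rnn_step_ref(const float *x, const float *w, const float *r,
                         const float *b, const float *h_in, float *h_out,
                         int batch, int input_size, int num_units)
{
    for (int n = 0; n < batch; n++)
        for (int u = 0; u < num_units; u++) {
            float acc = b[u];
            for (int i = 0; i < input_size; i++) acc += x[n * input_size + i] * w[u * input_size + i];
            for (int j = 0; j < num_units;  j++) acc += h_in[n * num_units + j] * r[u * num_units + j];
            h_out[n * num_units + u] = tanhf(acc);
        }
}

int main(void)
{
    float x[2] = {1, 2}, w[2] = {0.5f, 0.25f}, r[1] = {0.1f}, b[1] = {0.0f};
    float h_in[1] = {0.0f}, h_out[1];
    rnn_step_ref(x, w, r, b, h_in, h_out, 1, 2, 1);
    printf("%f\n", h_out[0]); /* tanh(1.0) */
    return 0;
}
```
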
Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_rnn_helper.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_bool setup_op_shapes + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + uint32_t output_size = 0; + uint32_t batch_size = 0; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + batch_size = inputs[RNNCELL_INPUT_INPUT]->attr.size[1]; + output_size = inputs[RNNCELL_INPUT_WEIGHT_I]->attr.size[1]; + + /* create h_state input/output if app doesn't provide them */ + if( !inputs[RNNCELL_INPUT_H_STATE] ) + { + attr.dim_num = 2; + attr.size[1] = batch_size; + attr.size[0] = output_size; + memcpy( &attr.dtype, &outputs[RNNCELL_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = FALSE; + attr.is_const = TRUE; + + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + inputs[RNNCELL_INPUT_H_STATE] = output_tensor->t; + } + + if( !outputs[RNNCELL_OUTPUT_H_STATE] ) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + memcpy( &attr.dtype, &outputs[RNNCELL_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = TRUE; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + outputs[RNNCELL_OUTPUT_H_STATE] = output_tensor->t; + } + + /* setup rnncell output tensors' shape */ + /* output */ + if(VSI_NN_DIM_AUTO == outputs[RNNCELL_OUTPUT_OUTPUT]->attr.dim_num) + { + /* num_units */ + outputs[RNNCELL_OUTPUT_OUTPUT]->attr.size[0] = inputs[RNNCELL_INPUT_WEIGHT_I]->attr.size[1]; + /* batch_size */ + outputs[RNNCELL_OUTPUT_OUTPUT]->attr.size[1] = inputs[RNNCELL_INPUT_INPUT]->attr.size[1]; + outputs[RNNCELL_OUTPUT_OUTPUT]->attr.dim_num = inputs[RNNCELL_INPUT_INPUT]->attr.dim_num; + } + + /* output_state_out */ + if(VSI_NN_DIM_AUTO == outputs[RNNCELL_OUTPUT_H_STATE]->attr.dim_num) + { + outputs[RNNCELL_OUTPUT_H_STATE]->attr.dim_num = + 
outputs[RNNCELL_OUTPUT_OUTPUT]->attr.dim_num; + memcpy( outputs[RNNCELL_OUTPUT_H_STATE]->attr.size, + outputs[RNNCELL_OUTPUT_OUTPUT]->attr.size, + VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + } + return TRUE; +} + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_rnncell_ovxlib_param* p = &self->nn_param.rnncell_ovxlib; + vsi_nn_tensor_attr_t attr; + vsi_bool is_input_fc_on_tp = FALSE; + vsi_bool is_hstate_fc_on_tp = FALSE; + vsi_nn_internal_tensor_t* input_tensor = NULL; + vsi_nn_internal_tensor_t* input_gate_fc_outputs = NULL; + vsi_nn_internal_tensor_t* hstate_gate_fc_outputs = NULL; + vsi_nn_internal_tensor_t* aux_input_gate_fc_outputs = NULL; + vsi_nn_internal_tensor_t* input_add_hstate_outputs = NULL; + vsi_nn_internal_tensor_t* gate_fc_outputs = NULL; + vsi_nn_internal_tensor_t* hstate_input_tensor = NULL; + vsi_nn_internal_tensor_t* tmp = NULL; + vsi_nn_internal_node_t* curr = NULL; + vsi_bool use_virtual_tensor = TRUE; + uint32_t kernel_h = 1; + uint32_t kernel_w = 1; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_node_wksp( self ); + p->local = (vsi_nn_rnncell_ovxlib_lcl_data_t*) + malloc(sizeof(vsi_nn_rnncell_ovxlib_lcl_data_t)); + + memset(p->local, 0x00, sizeof(vsi_nn_rnncell_ovxlib_lcl_data_t)); + memset(&attr, 0x00, sizeof(attr)); + p->local->multi_batch = (inputs[RNNCELL_INPUT_INPUT]->attr.size[1]); + + if( inputs[RNNCELL_INPUT_INPUT]->attr.dtype.qnt_type + != inputs[RNNCELL_INPUT_WEIGHT_I]->attr.dtype.qnt_type) + { + /* input and input weights have different qtype, only TP can do this operation */ + is_input_fc_on_tp = TRUE; + } + else if( inputs[RNNCELL_INPUT_INPUT]->attr.size[0] % 64 != 0 ) + { + /* NN performs bad if input's shape is not aligned to 64-byte */ + is_input_fc_on_tp = TRUE; + } + + if( inputs[RNNCELL_INPUT_H_STATE]->attr.dtype.qnt_type + != inputs[RNNCELL_INPUT_WEIGHT_H]->attr.dtype.qnt_type) + { + /* recurrent and recurrent weights have different qtype, only TP can do this operation */ + is_hstate_fc_on_tp = TRUE; + } + else if( inputs[RNNCELL_INPUT_H_STATE]->attr.size[0] % 64 != 0 ) + { + /* NN performs bad if inputs' shape is not aligned to 64-byte */ + is_hstate_fc_on_tp = TRUE; + } + + /* if both input fc and recurrent fc could be executed on NN, offloads one to TP*/ + if( !is_input_fc_on_tp && !is_hstate_fc_on_tp ) + { + is_input_fc_on_tp = TRUE; + } + /* TODO: now, all fc on tp because can't fetch the HW feature */ + is_input_fc_on_tp = TRUE; + is_hstate_fc_on_tp = TRUE; + + setup_op_shapes(self, inputs, outputs); + + /* Input FC */ + if( is_input_fc_on_tp ) + { + /* tp */ + input_gate_fc_outputs = vsi_nn_rnn_create_tp_fc(self, + inputs[RNNCELL_INPUT_INPUT], + inputs[RNNCELL_INPUT_WEIGHT_I], + inputs[RNNCELL_INPUT_BIAS], + &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], + use_virtual_tensor); + if (inputs[RNNCELL_INPUT_AUX_INPUT] != NULL) + { + aux_input_gate_fc_outputs = vsi_nn_rnn_create_tp_fc(self, + inputs[RNNCELL_INPUT_AUX_INPUT], + inputs[RNNCELL_INPUT_AUX_WEIGHT], + NULL, + &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX], + use_virtual_tensor); + } + } + else + { + /* reshape and transpose input */ + vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, + inputs[RNNCELL_INPUT_INPUT]->attr.size[0], + &kernel_h, &kernel_w); + input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[RNNCELL_INPUT_INPUT], + p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + + tmp = vsi_nn_rnn_create_nn_fc(self, + input_tensor->t, + 
inputs[RNNCELL_INPUT_WEIGHT_I], + inputs[RNNCELL_INPUT_BIAS], + kernel_h, kernel_w, + &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], + use_virtual_tensor); + /* transpose and reshape output */ + input_gate_fc_outputs = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, + kernel_w, use_virtual_tensor); + if (inputs[RNNCELL_INPUT_AUX_INPUT] != NULL) + { + /* reshape and transpose input */ + vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, + inputs[RNNCELL_INPUT_AUX_INPUT]->attr.size[0], + &kernel_h, &kernel_w); + input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, + inputs[RNNCELL_INPUT_AUX_INPUT], + p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + tmp = vsi_nn_rnn_create_nn_fc(self, + input_tensor->t, + inputs[RNNCELL_INPUT_AUX_INPUT], + NULL, + kernel_h, kernel_w, + &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX], + use_virtual_tensor); + /* transpose and reshape output */ + aux_input_gate_fc_outputs = vsi_nn_rnn_process_output_for_nn_fc(self, + tmp->t, p->local->multi_batch, kernel_h, + kernel_w, use_virtual_tensor); + } + } + + /* Hstate FC */ + if( is_hstate_fc_on_tp ) + { + hstate_gate_fc_outputs = vsi_nn_rnn_create_tp_fc(self, + inputs[RNNCELL_INPUT_H_STATE], + inputs[RNNCELL_INPUT_WEIGHT_H], + NULL, + &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_H], + use_virtual_tensor); + } + else + { + /* reshape and transpose input */ + vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, + inputs[RNNCELL_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w); + hstate_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, + inputs[RNNCELL_INPUT_H_STATE], + p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + + tmp = vsi_nn_rnn_create_nn_fc(self, + hstate_input_tensor->t, + inputs[RNNCELL_INPUT_WEIGHT_H], + NULL, + kernel_h, kernel_w, + &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_H], + use_virtual_tensor); + /* transpose and reshape output */ + hstate_gate_fc_outputs = vsi_nn_rnn_process_output_for_nn_fc(self, + tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + } + + input_add_hstate_outputs = vsi_nn_rnn_create_tensor_add(self, + input_gate_fc_outputs->t, + hstate_gate_fc_outputs->t, + &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], + use_virtual_tensor); + + if (inputs[RNNCELL_INPUT_AUX_INPUT] != NULL) + { + gate_fc_outputs = vsi_nn_rnn_create_tensor_add(self, + input_add_hstate_outputs->t, + aux_input_gate_fc_outputs->t, + &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], + use_virtual_tensor); + } + else + { + gate_fc_outputs = input_add_hstate_outputs; + } + + /* activation */ + curr = vsi_nn_internal_new_node( self, vsi_nn_rnn_get_act_op_type(p->activation), 0, 0 ); + curr->inputs[0] = gate_fc_outputs->t; + curr->outputs[0] = outputs[RNNCELL_OUTPUT_OUTPUT]; + vsi_nn_internal_setup_node(self, curr); + + if (outputs[RNNCELL_OUTPUT_H_STATE] != NULL) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = outputs[RNNCELL_OUTPUT_OUTPUT]; + curr->outputs[0] = outputs[RNNCELL_OUTPUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_rnncell_ovxlib_param* p = &self->nn_param.rnncell_ovxlib; + vsi_nn_safe_free(p->local); + vsi_nn_safe_free(p->internal_dtype); + vsi_nn_internal_deinit_node_wksp( self ); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + self->nn_param.rnncell_ovxlib.local = 
(vsi_nn_rnncell_ovxlib_lcl_data_t *) + malloc(sizeof(vsi_nn_rnncell_ovxlib_lcl_data_t)); + memset(self->nn_param.rnncell_ovxlib.local, 0, + sizeof(vsi_nn_rnncell_ovxlib_lcl_data_t)); + self->nn_param.rnncell_ovxlib.internal_dtype = (vsi_nn_dtype_t *) + malloc(sizeof(vsi_nn_dtype_t) * RNNCELL_QUANTIZE_PARAM_COUNT); + memset(self->nn_param.rnncell_ovxlib.internal_dtype, 0, + sizeof(vsi_nn_dtype_t) * RNNCELL_QUANTIZE_PARAM_COUNT); + + return VSI_SUCCESS; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RNNCELL_OVXLIB, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ RNNCELL_INPUT_CNT, + /* output_num */ RNNCELL_OUTPUT_CNT + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c new file mode 100644 index 0000000..53f5bd4 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c @@ -0,0 +1,302 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
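/*
 * Editor's note (not part of the original commit): a minimal, hedged sketch of
 * the shape rule implemented by op_setup() further down in this ROI_ALIGN file.
 * The struct and helper names below (RoiAlignShape, roi_align_output_shape) are
 * hypothetical illustration only; the dimension assignments mirror the code.
 */
#include <stdint.h>

typedef struct { uint32_t size[4]; uint32_t dim_num; } RoiAlignShape;

/* inputs[0]: feature map, size[0] = channels; inputs[1]: rois, size[1] = num_rois */
static RoiAlignShape roi_align_output_shape(uint32_t feat_size0,
                                            uint32_t num_rois,
                                            int32_t output_width,
                                            int32_t output_height)
{
    RoiAlignShape out;
    out.dim_num = 4;
    out.size[0] = feat_size0;              /* copied from inputs[0]->attr.size[0] */
    out.size[1] = (uint32_t)output_width;  /* p->output_width  */
    out.size[2] = (uint32_t)output_height; /* p->output_height */
    out.size[3] = num_rois;                /* inputs[1]->attr.size[1] */
    return out;
}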
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" + +#define _ARG_NUM (6) +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +extern vx_kernel_description_t * vx_kernel_ROI_ALIGN_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_roi_align_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.roi_align); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ + #define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_INT32, output_height ); + _SET_PARAM( 1, VX_TYPE_INT32, output_width ); + _SET_PARAM( 2, VX_TYPE_FLOAT32, height_ratio ); + _SET_PARAM( 3, VX_TYPE_FLOAT32, width_ratio ); + _SET_PARAM( 4, VX_TYPE_INT32, height_sample_num ); + _SET_PARAM( 5, VX_TYPE_INT32, width_sample_num ); + #undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + /*TODO: Add code if need to change your parameter*/ + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute, + NULL +}; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_kernel_info_t kernel_info; + char *path = NULL; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + status = VSI_FAILURE; + kernel_info.type = VX_KERNEL_TYPE_CPU; + kernel_info.kernel = vx_kernel_ROI_ALIGN_list; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_roi_align"; + path = getenv("USER_VX_SOURCE_PATH"); + if(path) + vsi_nn_VxResourceSetPath(path); + + if( kernel_info.type == VX_KERNEL_TYPE_VX) + { + kernel_info.kernel_index = 1; + kernel_info.init_index = 1; + } + else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ + { + kernel_info.kernel_index = 0; + kernel_info.init_index = 0; + } + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) + { + free(kernel_info.resource_name); + } + if( NULL == self->n ) + { + return VSI_FAILURE; + } + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + //If input0 is uint8, then input1 MUST be uint16, + //with zero point of 0 and scale of 0.125 + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 && + inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_UINT16) + { + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to compute outputs' shape. 
*/ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + vsi_nn_roi_align_param *p; + p = &(self->nn_param.roi_align); + outputs[0]->attr.dim_num = 4; + outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; + outputs[0]->attr.size[1] = p->output_width; + outputs[0]->attr.size[2] = p->output_height; + outputs[0]->attr.size[3] = inputs[1]->attr.size[1]; + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ ROI_ALIGN, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c new file mode 100644 index 0000000..9470ffc --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c @@ -0,0 +1,222 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
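/*
 * Editor's note (not part of the original commit): op_optimize() in this
 * ROI_POOL file reshapes a 2-D proposal ROI tensor of shape [5, roi_num] into
 * the 4-D layout [1, 1, 5, roi_num] before it is handed to vxROIPoolingLayer.
 * A minimal sketch of that size computation; the helper name is hypothetical.
 */
#include <stdint.h>

/* in_size = {5, roi_num}; out_size receives {1, 1, 5, roi_num}; returns new rank */
static uint32_t rois_reshape_dims(const int32_t in_size[2], int32_t out_size[4])
{
    out_size[0] = 1;
    out_size[1] = 1;
    out_size[2] = in_size[0];   /* 5 values per ROI */
    out_size[3] = in_size[1];   /* number of ROIs   */
    return 4;                   /* dim count passed to vxReshapeTensor */
}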
+* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vx_nn_roi_pool_params_ext_t params; + vx_tensor rois_input; + + status = VSI_FAILURE; + params.khr.pool_type = self->nn_param.roi_pool.type; + params.spatial_scale = self->nn_param.roi_pool.scale; + params.pooled_width = self->nn_param.roi_pool.size[0]; + params.pooled_height = self->nn_param.roi_pool.size[1]; + + if(self->nn_param.roi_pool.local.rois) + { + rois_input = self->nn_param.roi_pool.local.rois; + } + else + { + rois_input = inputs[1]->t; + } + + self->n = vxROIPoolingLayer( + self->graph->g, + inputs[0]->t, + rois_input, + (vx_nn_roi_pool_params_t *)¶ms, + sizeof( vx_nn_roi_pool_params_ext_t ), + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(ROI_POOL, 2, 1) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_F32) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F32) + IO_TYPE(D_F32, D_F16, D_F16) + IO_TYPE(D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_BF16, D_BF16, D_F32) + IO_TYPE(D_F32, D_F32, D_BF16) + + /* HW 9.0 */ + IO_TYPE(D_BF16, D_F16, D_BF16) + END_IO_TYPE_DECL(ROI_POOL) + if(!VALIDATE_OP_IO_TYPES(ROI_POOL, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_add_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = 4; + outputs[0]->attr.size[0] = node->nn_param.roi_pool.size[0]; + outputs[0]->attr.size[1] = node->nn_param.roi_pool.size[1]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + + //FIXME: old proposal outputs dimension is 4 + if(4 == inputs[1]->attr.dim_num) + { + outputs[0]->attr.size[3] = inputs[1]->attr.size[3]; + } + else + { + outputs[0]->attr.size[3] = inputs[1]->attr.size[1]; + } + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + int32_t size[VSI_NN_MAX_DIM_NUM]; + uint32_t dim; + vx_tensor rois_tmp; + + rois_tmp = NULL; + if( direction == VSI_NN_OPTIMIZE_FORWARD && inputs[1]->attr.dim_num == 2 ) + { + /* reshape proposal rois tensor, [5,roi_num] --> [1,1,5,roi_num] */ + VSILOGD("Optimize %s, uid 
%u", vsi_nn_OpGetName(self->op), self->uid); + + dim = 4; + size[0] = 1; + size[1] = 1; + self->nn_param.roi_pool.local.rois = NULL; + /* reshape rois tensor, [5,roi_num] --> [1,1,5,roi_num] */ + if(2 == inputs[1]->attr.dim_num) + { + size[2] = inputs[1]->attr.size[0]; + size[3] = inputs[1]->attr.size[1]; + rois_tmp = vxReshapeTensor(inputs[1]->t, size, dim); + if(NULL == rois_tmp) + { + return VSI_FAILURE; + } + self->nn_param.proposal.local.rois = rois_tmp; + } + } + + return VSI_SUCCESS; +} + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vx_tensor rois = self->nn_param.roi_pool.local.rois; + if( NULL != self && NULL != self->n ) + { + if(rois) + { + vxReleaseTensor(&rois); + rois = NULL; + } + vxReleaseNode( &self->n ); + self->n = NULL; + } + return VSI_SUCCESS; +} + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ ROI_POOL, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 2, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c new file mode 100644 index 0000000..0a90083 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c @@ -0,0 +1,118 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + status = VSI_FAILURE; + + self->n = vxActivationLayer( + self->graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RSQRT, + 0, + 0, + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(RSQRT, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32) + END_IO_TYPE_DECL(RSQRT) + if(!VALIDATE_OP_IO_TYPES(RSQRT, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RSQRT, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c new file mode 100644 index 0000000..61b9d13 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c @@ -0,0 +1,317 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
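/*
 * Editor's note (not part of the original commit): reshape_tensor_shape() in
 * this SCALE file flattens every input/output tensor to 2-D before dispatching
 * to the scale shader: dimension 0 is kept and all higher dimensions are folded
 * into dimension 1. A minimal sketch of that folding; the helper name is
 * hypothetical.
 */
#include <stdint.h>

/* in_size holds up to 4 dims, unused dims padded with 1; out_size gets the 2-D shape */
static void scale_flatten_to_2d(const int32_t in_size[4], int32_t out_size[2])
{
    out_size[0] = in_size[0];
    out_size[1] = in_size[1] * in_size[2] * in_size[3];
}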
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (2) +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +extern vx_kernel_description_t * vx_kernel_SCALE_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_scale_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = (vsi_nn_scale_param *)node->nn_param.client_param; + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ +#define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_INT32, axis ); + _SET_PARAM( 1, VX_TYPE_FLOAT32, bias ); +#undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static void reshape_tensor_shape + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vx_reference * params, + uint32_t index + ) +{ + uint32_t i; + int32_t size[4] = {0}; + int32_t size0[4] = {1, 1, 1, 1}; + uint32_t dims = 2; + + for( i = 0; i < input->attr.dim_num; i++ ) + { + size0[i] = input->attr.size[i]; + } + + size[0] = size0[0]; + size[1] = size0[1] * size0[2] * size0[3]; + + self->nn_param.scale.local.local_tensor[index] = + vxReshapeTensor(input->t, size, dims); + params[index] = (vx_reference)self->nn_param.scale.local.local_tensor[index]; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_IO_NUM]; + vx_border_t border; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + if (inputs[0]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 || + inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 || + inputs[2]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32 || + outputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16) + { + VSILOGE("scale shader unsuport format!\n"); + return VSI_FAILURE; + } + + reshape_tensor_shape(self, inputs[0], params, 0); + reshape_tensor_shape(self, inputs[1], params, 1); + reshape_tensor_shape(self, inputs[2], params, 2); + reshape_tensor_shape(self, outputs[0], params, 3); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _IO_NUM ); + + border.mode = VX_BORDER_REPLICATE; + border.constant_value.U32 = 0; + status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); + + return status; +} + +static vsi_nn_op_compute_t op_init_list[] = +{ + cpu_op_compute, + vx_op_compute, + NULL +}; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_kernel_info_t kernel_info; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + status = VSI_FAILURE; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_scale"; + kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); + kernel_info.kernel = vx_kernel_SCALE_list; + kernel_info.kernel_index = 1; + kernel_info.init_index = 1; + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) free(kernel_info.resource_name); + if( NULL == self->n ) + { + return status; + } + if (NULL != op_init_list[kernel_info.init_index]) + { + status = op_init_list[kernel_info.init_index](self, inputs, outputs); + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(SCALE, 3, 1) + IO_TYPE(D_F16, D_F16, D_F32, D_F16) + END_IO_TYPE_DECL(SCALE) + if(!VALIDATE_OP_IO_TYPES(SCALE, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + uint32_t i; + for (i = 0; i < _VSI_NN_SCALE_LOCAL_TENSOR_NUM; i++) 
+ { + if (self->nn_param.scale.local.local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.scale.local.local_tensor[i])); + self->nn_param.scale.local.local_tensor[i] = NULL; + } + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SCALE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c new file mode 100644 index 0000000..3c28b02 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c @@ -0,0 +1,181 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
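/*
 * Editor's note (not part of the original commit): a minimal sketch of how
 * op_compute() in this SCATTER_ND file derives the kernel parameters from the
 * indices tensor (inputs[0]) and the updates tensor (inputs[1]). The struct
 * and function names here are hypothetical.
 */
#include <stdint.h>

typedef struct { uint32_t block_size, coord_dim, idx_num; } ScatterNdParams;

static ScatterNdParams scatter_nd_params(const uint32_t *idx_size, uint32_t idx_rank,
                                         const uint32_t *upd_size, uint32_t upd_rank)
{
    ScatterNdParams p = { 1, 1, 1 };
    uint32_t i, idx_elems = 1, upd_elems = 1;

    if (idx_rank > 1)
    {
        p.coord_dim = idx_size[0];           /* coordinates per index entry */
    }
    for (i = 0; i < idx_rank; i++) idx_elems *= idx_size[i];
    for (i = 0; i < upd_rank; i++) upd_elems *= upd_size[i];

    p.idx_num    = idx_elems / p.coord_dim;  /* number of scattered slices   */
    p.block_size = upd_elems / p.idx_num;    /* elements per scattered slice */
    return p;
}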
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + uint32_t i = 0; + uint32_t block_size = 1, coord_dim = 1; + uint32_t idx_num = 1; + uint32_t *input_size = inputs[1]->attr.size; + uint32_t dims_num = inputs[1]->attr.dim_num; + + if(inputs[0]->attr.dim_num > 1) + { + coord_dim = inputs[0]->attr.size[0]; + } + if( coord_dim > 3 ) + { + CHECK_STATUS(status); + return status; + } + for(i = 0; i < inputs[0]->attr.dim_num; i++) + { + idx_num *= inputs[0]->attr.size[i]; + } + idx_num /= coord_dim; + + param =vsi_nn_kernel_param_create(); + + for(i = 0; i < dims_num; ++i) + { + block_size *= input_size[i]; + } + block_size /= idx_num; + + vsi_nn_kernel_param_add_int32( param, "block_size", block_size ); + vsi_nn_kernel_param_add_int32( param, "coord_dim", coord_dim ); + vsi_nn_kernel_param_add_int32( param, "idx_num", idx_num ); + n = vsi_nn_kernel_selector( self->graph, "scatter_nd", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + if( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if(param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(SCATTER_ND, 2, 1) + IO_TYPE(D_I32, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I32, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I32, D_F16, D_F16) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I32, D_U32, D_U32) + IO_TYPE(D_I32, D_F32, D_F32) + END_IO_TYPE_DECL(SCATTER_ND) + if(!VALIDATE_OP_IO_TYPES(SCATTER_ND, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
*/ + uint32_t i = 0; + vsi_nn_scatter_nd_param * p = &(self->nn_param.scatter_nd); + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + if(p->shape == NULL) + { + return FALSE; + } + + outputs[0]->attr.dim_num = p->dim_num; + for (i = 0; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = p->shape[i]; + } + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SCATTER_ND, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c new file mode 100644 index 0000000..a01b758 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c @@ -0,0 +1,215 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
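/*
 * Editor's note (not part of the original commit): a minimal sketch of the
 * broadcast rule used by op_setup() in this SELECT file to derive the output
 * shape from the condition/then/else inputs; the helper name is hypothetical.
 */
#include <stdint.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

static uint32_t select_broadcast_shape(const uint32_t *s0, uint32_t r0,
                                       const uint32_t *s1, uint32_t r1,
                                       const uint32_t *s2, uint32_t r2,
                                       uint32_t *out_shape)
{
    uint32_t i, out_rank = MAX2(r0, MAX2(r1, r2));
    for (i = 0; i < out_rank; i++)
    {
        uint32_t d0 = i < r0 ? s0[i] : 1;   /* missing dims broadcast as 1 */
        uint32_t d1 = i < r1 ? s1[i] : 1;
        uint32_t d2 = i < r2 ? s2[i] : 1;
        out_shape[i] = MAX2(MAX2(d0, d1), d2);
    }
    return out_rank;
}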
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + int32_t* shapes_ptr[_IO_NUM]; + int32_t* shapes_in[_INPUT_NUM]; + size_t rank_in[_INPUT_NUM]; + uint32_t new_rank = 0; + int32_t i = 0; + vsi_bool ret; + + if( NULL == self ) + { + return VSI_FAILURE; + } + + for (i = 0; i < _IO_NUM; i++) + { + shapes_ptr[i] = shapes[i]; + } + + for (i = 0; i < _INPUT_NUM; i++) + { + shapes_in[i] = (int32_t *)inputs[i]->attr.size; + rank_in[i] = (size_t)inputs[i]->attr.dim_num; + } + + ret = vsi_nn_kernel_optimize_broadcast_shape( + (const int32_t**)shapes_in, (const size_t*)rank_in, _INPUT_NUM, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes_ptr, shapes[_INPUT_NUM], &new_rank); + + if( ret ) + { + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = vsi_nn_reshape_tensor( self->graph, + inputs[i], (uint32_t*)shapes[i], new_rank ); + } + + for (i = 0; i < _OUTPUT_NUM; i++) + { + reshape_tensors[i + _INPUT_NUM] = vsi_nn_reshape_tensor( self->graph, + outputs[i], (uint32_t*)shapes[i + _INPUT_NUM], new_rank ); + } + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "select", + &reshape_tensors[0], _INPUT_NUM, + &reshape_tensors[_INPUT_NUM], _OUTPUT_NUM, NULL ); + + for (i = 0; i < _IO_NUM; i++) + { + vsi_nn_ReleaseTensor( &reshape_tensors[i] ); + } + } + + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(SELECT, 3, 1) + IO_TYPE(D_I8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8, D_F16, D_F16, D_F16) + IO_TYPE(D_I8, D_I32, D_I32, D_I32) + IO_TYPE(D_I8, D_F32, D_F32, D_F32) + IO_TYPE(D_BOOL8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_BOOL8, D_F16, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_I32, D_I32, D_I32) + IO_TYPE(D_BOOL8, D_F32, D_F32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + END_IO_TYPE_DECL(SELECT) + if(!VALIDATE_OP_IO_TYPES(SELECT, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t 
i, out_rank, in0_rank, in1_rank, in2_rank; + uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_bool ret = TRUE; + + in0_rank = inputs[0]->attr.dim_num; + in1_rank = inputs[1]->attr.dim_num; + in2_rank = inputs[2]->attr.dim_num; + out_rank = vsi_nn_max(in0_rank, vsi_nn_max( in1_rank, in2_rank )); + + for(i = 0; i < out_rank; i++) + { + uint32_t sz0, sz1, sz2; + sz0 = i < in0_rank ? inputs[0]->attr.size[i] : 1; + sz1 = i < in1_rank ? inputs[1]->attr.size[i] : 1; + sz2 = i < in2_rank ? inputs[2]->attr.size[i] : 1; + shape[i] = vsi_nn_max(vsi_nn_max(sz0, sz1), sz2); + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = out_rank; + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(uint32_t) ); + } + else + { + uint32_t total_size_got; + uint32_t total_size_expected; + total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); + total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, + outputs[0]->attr.dim_num ); + if( total_size_expected != total_size_got ) + { + VSILOGW("Output size mismatch, expect %d, but got %d", + total_size_expected, total_size_got); + ret = FALSE; + } + } + + return ret; +} /* op_setup() */ + + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SELECT, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c new file mode 100644 index 0000000..5c02808 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c @@ -0,0 +1,549 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
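/*
 * Editor's note (not part of the original commit): SHUFFLECHANNEL is lowered to
 * vxReorgLayer2 with VX_REORG_SHUFFLE_CHANNEL in the op_compute() below. A
 * minimal reference of the conventional ShuffleNet-style channel permutation it
 * is assumed to implement along the shuffle axis; the helper name and the exact
 * permutation direction are assumptions, not taken from this file.
 */
#include <stddef.h>
#include <string.h>

/* channels = groups * k; each channel is plane_size contiguous floats */
static void shuffle_channel_reference(const float *in, float *out,
                                      size_t groups, size_t k, size_t plane_size)
{
    size_t g, i;
    for (g = 0; g < groups; g++)
    {
        for (i = 0; i < k; i++)
        {
            /* input channel g*k + i is written to output channel i*groups + g */
            memcpy(out + (i * groups + g) * plane_size,
                   in  + (g * k + i) * plane_size,
                   plane_size * sizeof(float));
        }
    }
}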
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define USE_OVXLIB (0) + +#define _ARG_NUM (2) +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +#if (USE_OVXLIB) + +extern vx_kernel_description_t * vx_kernel_SHUFFLECHANNEL_list[]; + +static vsi_bool _reshape_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i = 0; + uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t axis = 0; + vsi_nn_shufflechannel_param * p = NULL; + uint32_t before_size = 1; + uint32_t after_size = 1; + uint32_t * input_sizes = inputs[0]->attr.size; + uint32_t dims = inputs[0]->attr.dim_num; + + p = &(self->nn_param.shufflechannel); + axis = p->axis; + + for ( i = 0; i < (uint32_t)axis; i++) + { + before_size *= input_sizes[i]; + } + for ( i = axis + 1; i < dims; i++) + { + after_size *= input_sizes[i]; + } + + if (axis == 2 && after_size == 1) + { + sizes[0] = input_sizes[0]; + sizes[1] = input_sizes[1]; + sizes[2] = input_sizes[2]; + } + else + { + sizes[0] = before_size; + sizes[1] = input_sizes[axis]; + sizes[2] = after_size; + p->axis = 1; + } + dims = 3; + + p->local->input_tensor = vxReshapeTensor(inputs[0]->t, (int32_t *)sizes, dims); + p->local->output_tensor = vxReshapeTensor(outputs[0]->t, (int32_t *)sizes, dims); + + return TRUE; +} + +static void _set_inputs_outputs + ( + vsi_nn_node_t * self, + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_shufflechannel_param * p = NULL; + + p = &(self->nn_param.shufflechannel); + + params[0] = (vx_reference)p->local->input_tensor; + params[1] = (vx_reference)p->local->output_tensor; +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status = VSI_SUCCESS; + vx_context ctx; + vsi_nn_shufflechannel_param * p = NULL; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.shufflechannel); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ +#define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_INT32, group_number ); + _SET_PARAM( 1, VX_TYPE_INT32, axis ); +#undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i = 0; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args = NULL; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + 
{ + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( self, params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status vx_op_pre_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_kernel_info_t * kernel_info + ) +{ + vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type; + vsi_nn_type_e outputDataFormat = outputs[0]->attr.dtype.vx_type; + int8_t inputFixedPointPos = inputs[0]->attr.dtype.fl; + int8_t outputFixedPointPos = outputs[0]->attr.dtype.fl; + int32_t inputZeroPoint = inputs[0]->attr.dtype.zero_point; + int32_t outputZeroPoint = outputs[0]->attr.dtype.zero_point; + vx_float32 inputScale = inputs[0]->attr.dtype.scale; + vx_float32 outputScale = outputs[0]->attr.dtype.scale; + int32_t axis = self->nn_param.shufflechannel.axis; + uint32_t *sizes = inputs[0]->attr.size; + vsi_bool is16Bits = FALSE; + vsi_bool is8Bits = FALSE; + + is16Bits = ((inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_FLOAT16) + || (inputDataFormat == VSI_NN_TYPE_INT16 && outputDataFormat == VSI_NN_TYPE_INT16 + && inputFixedPointPos == outputFixedPointPos)) ? TRUE : FALSE; + is8Bits = ((inputDataFormat == VSI_NN_TYPE_INT8 && outputDataFormat == VSI_NN_TYPE_INT8 + && inputFixedPointPos == outputFixedPointPos) + || (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_UINT8 + && inputZeroPoint == outputZeroPoint && inputScale == outputScale)) ? TRUE : FALSE; +#define VSI_NN_TENSOR_WIDTH_MAX (65536) + kernel_info->kernel_index = 0; + if (sizes[0] < VSI_NN_TENSOR_WIDTH_MAX && sizes[1] < VSI_NN_TENSOR_WIDTH_MAX) + { + if ( is16Bits && axis == 2 ) + { + kernel_info->kernel_index = 1; + } + else if ( is8Bits && axis == 2) + { + kernel_info->kernel_index = 2; + } + else if ( is16Bits && axis == 1) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_shufflechannel_axis1"; + kernel_info->kernel_index = 3; + } + else if ( is8Bits && axis == 1) + { + kernel_info->resource_name[0] = "vsi_nn_kernel_shufflechannel_axis1"; + kernel_info->kernel_index = 4; + } + } +#undef VSI_NN_TENSOR_WIDTH_MAX + + return VSI_SUCCESS; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_border_t border; + vx_reference * args = NULL; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( self, params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + border.mode = VX_BORDER_REPLICATE; + border.constant_value.U32 = 0; + status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute, + NULL +}; + +#endif + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ +#if(USE_OVXLIB) + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_info_t kernel_info; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + + /* setup input/output shape */ + _reshape_tensor( self, inputs, outputs); + + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_shufflechannel"; + kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); + kernel_info.kernel = vx_kernel_SHUFFLECHANNEL_list; + kernel_info.init_index = 1; + + if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) + { + vx_op_pre_compute(self, inputs, outputs, &kernel_info); + } + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) free(kernel_info.resource_name); + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } + return status; +#else + vsi_status status = VSI_FAILURE; + vx_nn_reorg_params_ext2_t param; + vsi_nn_tensor_t *block_size_tensor = NULL; + vsi_nn_tensor_attr_t attr; + uint8_t data = 1; + + memset(¶m, 0, sizeof(vx_nn_reorg_params_ext2_t)); + memset(&attr, 0, sizeof(attr)); + attr.size[0] = 2; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + block_size_tensor = vsi_nn_CreateTensorFromData( + self->graph, + &data, + &attr); + if( NULL == block_size_tensor ) + { + VSILOGE("Create block_size_tensor fail.(shufflechannel)"); + return VSI_FAILURE; + } + + self->nn_param.shufflechannel.local->block_size_tensor = block_size_tensor; + param.base.block_size = REQUIRED_IO(block_size_tensor); + + param.base.type = VX_REORG_SHUFFLE_CHANNEL; + param.axis = &self->nn_param.shufflechannel.axis; + param.num_group = &self->nn_param.shufflechannel.group_number; + + self->n = vxReorgLayer2( self->graph->g, + inputs[0]->t, + (vx_nn_reorg_params_t *)¶m, + sizeof(vx_nn_reorg_params_ext2_t), + outputs[0]->t); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + + return status; +#endif +} /* op_compute() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + vsi_nn_shufflechannel_param *p = NULL; + int32_t axis = 0; + + if( NULL == self ) + { + return ret; + } + + p = &(self->nn_param.shufflechannel); + axis = p->axis; + + if (axis < 0) + { + axis = axis + inputs[0]->attr.dim_num; + p->axis = axis; + } + + if (p->axis < 0) + { + VSILOGD("shufflechannel Invalid Axis: %d", p->axis); + return FALSE; + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + memcpy( outputs[0]->attr.size, inputs[0]->attr.size, + sizeof(uint32_t) * inputs[0]->attr.dim_num ); + } + + return TRUE; +} /* op_setup() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + 
vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_shufflechannel_param *p = NULL; + int32_t axis = 0; + int32_t dims = (int32_t)inputs[0]->attr.dim_num; + int32_t num_group = 0; + uint32_t *shape = inputs[0]->attr.size; + + p = &(self->nn_param.shufflechannel); + axis = p->axis; + num_group = p->group_number; + + if (axis > (dims - 1)) + { + VSILOGE("Invalid Axis: %d, (SHUFFLECHANNEL) at [%s : %d]\n", axis, __FILE__, __LINE__); + return FALSE; + } + if (shape[axis] % num_group) + { + VSILOGE("Invalid group_number: %d, (SHUFFLECHANNEL) at [%s : %d]\n", num_group, __FILE__, __LINE__); + return FALSE; + } + + { + BEGIN_IO_TYPE_DECL(SHUFFLECHANNEL, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_F32, D_BF16) + END_IO_TYPE_DECL(SHUFFLECHANNEL) + if(!VALIDATE_OP_IO_TYPES(SHUFFLECHANNEL, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_shufflechannel_lcl_data_t *local = NULL; + vsi_nn_shufflechannel_param *p = NULL; + + p = &(self->nn_param.shufflechannel); + self->nn_param.shufflechannel.axis = 2; + local = (vsi_nn_shufflechannel_lcl_data_t *)malloc(sizeof(vsi_nn_shufflechannel_lcl_data_t)); + if (NULL == local) + { + VSILOGE("Malloc fail, (SHUFFLECHANNEL) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(local, 0, sizeof(vsi_nn_shufflechannel_lcl_data_t)); + p->local = local; + + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_shufflechannel_param *p = &(self->nn_param.shufflechannel); + if (p->local) + { + if (p->local->input_tensor) + { + vxReleaseTensor(&(p->local->input_tensor)); + p->local->input_tensor = NULL; + } + if (p->local->output_tensor) + { + vxReleaseTensor(&(p->local->output_tensor)); + p->local->output_tensor = NULL; + } + if (p->local->block_size_tensor != NULL) + { + vsi_nn_ReleaseTensor(&(p->local->block_size_tensor)); + } + + vsi_nn_safe_free(p->local); + } + vsi_nn_op_common_deinit(self); + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SHUFFLECHANNEL, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sigmoid.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sigmoid.c new file mode 100644 index 0000000..ff9d84e --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sigmoid.c @@ -0,0 +1,96 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies 
of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" + + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + status = VSI_FAILURE; + + self->n = vxActivationLayer( + self->graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LOGISTIC, + 0, + 0, + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_RSQRT, self, inputs, outputs); + + return ret; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SIGMOID, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_signalframe.c b/src/tim/vx/internal/src/ops/vsi_nn_op_signalframe.c new file mode 100644 index 0000000..d51f294 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_signalframe.c @@ -0,0 +1,677 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (5) +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +#define ENABLE_CPU 0 +#define TENSOR_ALL 0 + +extern vx_kernel_description_t * vx_kernel_SIGNALFRAME_list[]; + +static vsi_status _create_local_tensor + ( + vsi_nn_node_t * self + ) +{ + //vsi_nn_tensor_t *signal_tensor = NULL; + //vsi_nn_tensor_t *frame_tensor = NULL; + vsi_nn_tensor_t *window_length_tensor = NULL; + vsi_nn_tensor_t *step_tensor = NULL; + vsi_nn_tensor_t *pad_end_tensor = NULL; + vsi_nn_tensor_t *pad_tensor = NULL; + vsi_nn_tensor_t *axis_tensor = NULL; + + if(NULL == self) + { + return VSI_FAILURE; + } + + window_length_tensor = vsi_nn_VariableToTensor(self, + (uint8_t *)&self->nn_param.signalframe.window_length, + VSI_NN_TYPE_UINT32); + if(NULL == window_length_tensor) + { + goto error; + } + + step_tensor = vsi_nn_VariableToTensor(self, + (uint8_t *)&self->nn_param.signalframe.step, + VSI_NN_TYPE_UINT32); + if(NULL == step_tensor) + { + goto error; + } + + pad_end_tensor = vsi_nn_VariableToTensor(self, + (uint8_t *)&self->nn_param.signalframe.pad_end, + VSI_NN_TYPE_UINT32); + if(NULL == pad_end_tensor) + { + goto error; + } + + pad_tensor = vsi_nn_VariableToTensor(self, + (uint8_t *)&self->nn_param.signalframe.pad, + VSI_NN_TYPE_UINT32); + if(NULL == pad_tensor) + { + goto error; + } + + axis_tensor = vsi_nn_VariableToTensor(self, + (uint8_t *)&self->nn_param.signalframe.axis, + VSI_NN_TYPE_UINT32); + if(NULL == axis_tensor) + { + goto error; + } + + self->nn_param.signalframe.local.window_length_tensor = window_length_tensor; + self->nn_param.signalframe.local.step_tensor = step_tensor; + self->nn_param.signalframe.local.pad_end_tensor = pad_end_tensor; + self->nn_param.signalframe.local.pad_tensor = pad_tensor; + self->nn_param.signalframe.local.axis_tensor = axis_tensor; + + return VSI_SUCCESS; +error: + if(window_length_tensor)vsi_nn_ReleaseTensor(&window_length_tensor); + if(step_tensor)vsi_nn_ReleaseTensor(&step_tensor); + if(pad_end_tensor)vsi_nn_ReleaseTensor(&pad_end_tensor); + if(pad_tensor)vsi_nn_ReleaseTensor(&pad_tensor); + if(axis_tensor)vsi_nn_ReleaseTensor(&axis_tensor); + return VSI_FAILURE; +} /* _create_local_tensor() */ + +static void check_tensor_shape + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vx_reference * params, + uint32_t index, + vx_bool rsFlg + ) +{ + vsi_nn_tensor_attr_t attr; + + if (index == 0 ) + { + if( input->attr.dim_num == 1 ) + { + memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + self->nn_param.signalframe.local.local_tensor[index] = + vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; + } 
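+            /* For the shader path a 1-D tensor is viewed as [N, 1] above so the kernel always receives at least 2-D I/O; otherwise the original tensor handle is passed through unchanged. */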
+ else + params[index] = (vx_reference)input->t; + } + else if(index == 1 ) + { + if(input->attr.dim_num == 1) + { + memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + self->nn_param.signalframe.local.local_tensor[index] = + vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; + } + else if(input->attr.dim_num == 4) + { + memcpy(&attr, &(input->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[2] *= attr.size[3]; + attr.size[3] = 1; + attr.dim_num = 3; + self->nn_param.signalframe.local.local_tensor[index] = + vxReshapeTensor(input->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; + } + else + params[index] = (vx_reference)input->t; + + } + else + { + VSILOGE("No more local tensor!(signalframe) at [%s : %d]\n", __FILE__, __LINE__); + } +} + +static void check_local_tensor_shape + ( + vsi_nn_node_t * self, + vx_reference * params, + uint32_t index, + vx_bool rsFlg + ) +{ + vsi_nn_tensor_attr_t attr; + + if( self->nn_param.signalframe.local.window_length_tensor->attr.dim_num == 1 ) + { + memcpy(&attr, &(self->nn_param.signalframe.local.window_length_tensor->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + self->nn_param.signalframe.local.local_tensor[index] = + vxReshapeTensor(self->nn_param.signalframe.local.window_length_tensor->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; + } + else + params[index] = (vx_reference)self->nn_param.signalframe.local.window_length_tensor->t; + index++; + + if( self->nn_param.signalframe.local.step_tensor->attr.dim_num == 1 ) + { + memcpy(&attr, &(self->nn_param.signalframe.local.step_tensor->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + self->nn_param.signalframe.local.local_tensor[index] = + vxReshapeTensor(self->nn_param.signalframe.local.step_tensor->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; + } + else + params[index] = (vx_reference)self->nn_param.signalframe.local.step_tensor->t; + index++; + + if( self->nn_param.signalframe.local.pad_end_tensor->attr.dim_num == 1 ) + { + memcpy(&attr, &(self->nn_param.signalframe.local.pad_end_tensor->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + self->nn_param.signalframe.local.local_tensor[index] = + vxReshapeTensor(self->nn_param.signalframe.local.pad_end_tensor->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; + } + else + params[index] = (vx_reference)self->nn_param.signalframe.local.pad_end_tensor->t; + index++; + + if( self->nn_param.signalframe.local.pad_tensor->attr.dim_num == 1 ) + { + memcpy(&attr, &(self->nn_param.signalframe.local.pad_tensor->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + self->nn_param.signalframe.local.local_tensor[index] = + vxReshapeTensor(self->nn_param.signalframe.local.pad_tensor->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; + } + else + params[index] = (vx_reference)self->nn_param.signalframe.local.pad_tensor->t; + index++; + + if( self->nn_param.signalframe.local.axis_tensor->attr.dim_num == 1 ) + { + 
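+        /* Same [N] -> [N, 1] view as in check_tensor_shape(), here applied to the axis parameter tensor. */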
memcpy(&attr, &(self->nn_param.signalframe.local.axis_tensor->attr), sizeof(vsi_nn_tensor_attr_t)); + attr.size[1] = 1; + attr.dim_num = 2; + self->nn_param.signalframe.local.local_tensor[index] = + vxReshapeTensor(self->nn_param.signalframe.local.axis_tensor->t, (int32_t*)(attr.size), attr.dim_num); + params[index] = (vx_reference)self->nn_param.signalframe.local.local_tensor[index]; + } + else + params[index] = (vx_reference)self->nn_param.signalframe.local.axis_tensor->t; + +} + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_signalframe_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.signalframe); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ +#define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_UINT32, window_length ); + _SET_PARAM( 1, VX_TYPE_UINT32, step ); + _SET_PARAM( 2, VX_TYPE_UINT32, pad_end ); + _SET_PARAM( 3, VX_TYPE_UINT32, pad ); + _SET_PARAM( 4, VX_TYPE_UINT32, axis ); +#undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +#if ENABLE_CPU +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } + + /*for( i = 0; i < _ARG_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i + 1]->t; + }*/ +} /* _set_inputs_outputs() */ +#endif + +#if ENABLE_CPU +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_bool rsFlg = FALSE; + vx_reference * args; + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + //_set_inputs_outputs( params, inputs, outputs ); + check_tensor_shape(self, inputs[0], params, 0, rsFlg); + check_tensor_shape(self, outputs[0], params, 1, rsFlg); + if(TENSOR_ALL) + check_local_tensor_shape(self, params, 2, rsFlg); + else + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} +#endif + +static vsi_status vx_op_pre_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_kernel_info_t * kernel_info + ) +{ + vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type; + vsi_nn_type_e outputDataFormat = outputs[0]->attr.dtype.vx_type; + uint32_t axis0 = self->nn_param.signalframe.axis; + uint32_t axis = axis0; + uint32_t dim = inputs[0]->attr.dim_num; + vx_bool dataTypeFlg = FALSE; + vx_bool etFlg = FALSE; + + if((inputDataFormat == VSI_NN_TYPE_INT8 && outputDataFormat == VSI_NN_TYPE_INT8) || + (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_UINT8)) + etFlg = TRUE; + + if ((inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_FLOAT16) || + (inputDataFormat == VSI_NN_TYPE_INT16 && outputDataFormat == VSI_NN_TYPE_INT16) || + etFlg) + dataTypeFlg = TRUE; + + axis = dim - axis0 - 1; + + if (dataTypeFlg + && ((dim == 1 && axis==0) || (dim == 2 && axis==1) || (dim == 3 && axis==2))) + { + kernel_info->kernel_index = 1; + if(etFlg) + { + kernel_info->kernel_index = 4; + } + } + else if(dataTypeFlg + && ((dim == 2 && axis==0) || (dim == 3 && axis==1))) + { + kernel_info->kernel_index = 2; + if(etFlg) + { + kernel_info->kernel_index = 5; + } + } + else if(dataTypeFlg + && (dim == 3 && axis==0)) + { + kernel_info->kernel_index = 3; + if(etFlg) + { + kernel_info->kernel_index = 6; + } + } + else + { + VSILOGE("Not support input or output data format!(SIGNALFRAME) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + return VSI_SUCCESS; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_border_t border; + vx_bool rsFlg = FALSE; + vx_reference * args; + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + //_set_inputs_outputs( params, inputs, outputs ); + check_tensor_shape(self, inputs[0], params, 0, rsFlg); + check_tensor_shape(self, outputs[0], params, 1, rsFlg); + if(TENSOR_ALL) + check_local_tensor_shape(self, params, 2, rsFlg); + else + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. 
*/ + status |= vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = 0; + border.constant_value.S16 = 0; + border.constant_value.U8 = 0; + status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); + + return status; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_kernel_info_t kernel_info; + status = VSI_SUCCESS; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + if(0) + { + status = _create_local_tensor(self); + if(status != VSI_SUCCESS) + { + return status; + } + } + +#if ENABLE_CPU //cpu + { + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_signalframe"; + kernel_info.type = VX_KERNEL_TYPE_CPU; + kernel_info.kernel = vx_kernel_SIGNALFRAME_list; + kernel_info.init_index = 0; + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) free(kernel_info.resource_name); + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + status = cpu_op_compute(self, inputs, outputs); + + return status; + } +#endif + + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_signalframe"; + kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); + kernel_info.kernel = vx_kernel_SIGNALFRAME_list; + kernel_info.init_index = 1; + + if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) + { + vx_op_pre_compute(self, inputs, outputs, &kernel_info); + } + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) free(kernel_info.resource_name); + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + status |= vx_op_compute(self, inputs, outputs); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(SIGNAL_FRAME, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32) + END_IO_TYPE_DECL(SIGNAL_FRAME) + if(!VALIDATE_OP_IO_TYPES(SIGNAL_FRAME, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + uint32_t i; + for (i = 0; i < _VSI_NN_SIGNALFRAME_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.signalframe.local.local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.signalframe.local.local_tensor[i])); + self->nn_param.signalframe.local.local_tensor[i] = NULL; + } + } + + if(self->nn_param.signalframe.local.window_length_tensor) + vsi_nn_ReleaseTensor(&self->nn_param.signalframe.local.window_length_tensor); + if(self->nn_param.signalframe.local.step_tensor) + vsi_nn_ReleaseTensor(&self->nn_param.signalframe.local.step_tensor); + if(self->nn_param.signalframe.local.pad_end_tensor) + vsi_nn_ReleaseTensor(&self->nn_param.signalframe.local.pad_end_tensor); + 
if(self->nn_param.signalframe.local.pad_tensor) + vsi_nn_ReleaseTensor(&self->nn_param.signalframe.local.pad_tensor); + if(self->nn_param.signalframe.local.axis_tensor) + vsi_nn_ReleaseTensor(&self->nn_param.signalframe.local.axis_tensor); + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + vsi_bool ret; + uint32_t axis; + + ret = TRUE; + if( VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num ) + { + return ret; + } + + axis = self->nn_param.signalframe.axis; + if(axis >= inputs[0]->attr.dim_num) + { + return FALSE; + } + + /* signal frame will increase dim num */ + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num + 1; + for(i = 0; i < axis; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + if(self->nn_param.signalframe.pad_end) + { + outputs[0]->attr.size[axis] = inputs[0]->attr.size[axis]; + } + else + { + if(inputs[0]->attr.size[axis] >= self->nn_param.signalframe.window_length) + { + outputs[0]->attr.size[axis] = (inputs[0]->attr.size[axis] - self->nn_param.signalframe.window_length) \ + / self->nn_param.signalframe.step + 1; + } + else + { + outputs[0]->attr.size[axis] = 0; + return FALSE; + } + } + for(i = axis; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i + 1] = inputs[0]->attr.size[i]; + } + + return ret; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SIGNAL_FRAME, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c new file mode 100644 index 0000000..ff7ea13 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c @@ -0,0 +1,196 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (3) +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_STRIDED_SLICE, self, inputs, outputs); + + return ret; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_slice_param * p; + vsi_nn_internal_node_t* curr = NULL; + uint32_t i; + if(self->nn_param.slice.dims == 0) + { + self->nn_param.slice.dims = inputs[0]->attr.dim_num; + } + + p = (vsi_nn_slice_param *)&(self->nn_param.slice); + vsi_nn_internal_init_node_wksp( self ); + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + for(i = 0; i < p->dims; i++) + { + outputs[0]->attr.size[i] = p->length[i]; + } + outputs[0]->attr.dim_num = p->dims; + } + + for (i = 0; i < self->nn_param.slice.dims; i++) + { + p->lcl_data->begin_dims[i] = self->nn_param.slice.start[i]; + p->lcl_data->end_dims[i] = self->nn_param.slice.start[i] + self->nn_param.slice.length[i]; + p->lcl_data->stride_dims[i] = 1; + } + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 ); + curr->node->nn_param.strided_slice.begin_dims = p->lcl_data->begin_dims; + curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; + curr->node->nn_param.strided_slice.end_dims = p->lcl_data->end_dims; + curr->node->nn_param.strided_slice.end_dims_num = inputs[0]->attr.dim_num; + curr->node->nn_param.strided_slice.stride_dims = p->lcl_data->stride_dims; + curr->node->nn_param.strided_slice.stride_dims_num = inputs[0]->attr.dim_num; + curr->node->nn_param.strided_slice.begin_mask = 0; + curr->node->nn_param.strided_slice.end_mask = 0; + curr->node->nn_param.strided_slice.shrink_axis_mask = 0; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node( self, curr ); + + return TRUE; +} /* op_setup() */ + + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_slice_param * p = NULL; + + p = &(self->nn_param.slice); + + p->lcl_data = + (vsi_nn_slice_lcl_data *)malloc(sizeof(vsi_nn_slice_lcl_data)); + if (NULL == p->lcl_data) + { + return VX_ERROR_NO_MEMORY; + } + memset(p->lcl_data, 0, sizeof(vsi_nn_split_lcl_data)); + + return status; +} + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_slice_param * p = NULL; + + p = &(self->nn_param.slice); + + if (p->lcl_data) + { + 
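+        /* lcl_data was allocated in op_init() and holds the begin/end/stride dims that op_setup() handed to the internal STRIDED_SLICE node. */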
free(p->lcl_data); + p->lcl_data = NULL; + } + + vsi_nn_internal_deinit_node_wksp( self ); + + return VSI_SUCCESS; +} /* op_deinit() */ + + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SLICE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c new file mode 100644 index 0000000..b71c583 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c @@ -0,0 +1,303 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_bool _is_same_shape + ( + vsi_nn_tensor_t * inputs, + uint32_t *sizes, + uint32_t dims + ) +{ + uint32_t i = 0; + + if (inputs->attr.dim_num != dims) + return FALSE; + + for (i = 0; i < dims; i++) + { + if (sizes[i] != inputs->attr.size[i]) + return FALSE; + } + + return TRUE; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(SOFTMAX, 1, 1) + /* IO_TYPE(INPUT, OUTPUT) */ + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_F16) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32) + + IO_TYPE(D_I16|Q_DFP, D_F32) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + END_IO_TYPE_DECL(SOFTMAX) + if(!VALIDATE_OP_IO_TYPES(SOFTMAX, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +#define VSI_NN_SOFTMAX_DEFAULT_AXIS (10000) +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_softmax_param * p; + uint32_t dim_num; + uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t i = 0; + int32_t axis = -1; + vsi_nn_tensor_t* new_input = NULL; + vsi_nn_tensor_t* new_output = NULL; + + if (VSI_NN_OPTIMIZE_BACKWARD == direction) + { + return VSI_SUCCESS; + } + + p = &(self->nn_param.softmax); + axis = p->axis; + if (axis != VSI_NN_SOFTMAX_DEFAULT_AXIS) + { + uint32_t innerSize = 1; + uint32_t outerSize = 1; + for (i = 0; i < (uint32_t)axis; i++) + { + sizes[i] = inputs[0]->attr.size[i]; + innerSize *= inputs[0]->attr.size[i]; + } + + for (i = (uint32_t)(axis + 1); i < inputs[0]->attr.dim_num; i++) + { + outerSize *= inputs[0]->attr.size[i]; + } + + if (axis == 1) + { + if (sizes[0] == 1) + { + sizes[0] = inputs[0]->attr.size[axis]; + sizes[1] = outerSize; + + dim_num = 2; + } + else + { + sizes[axis] = 1; + sizes[axis + 1] = inputs[0]->attr.size[axis]; + sizes[axis + 2] = outerSize; + + dim_num = 4; + } + } + else if (axis >= 3) + { + sizes[0] = innerSize; + sizes[1] = 1; + sizes[2] = inputs[0]->attr.size[axis]; + sizes[3] = 
outerSize; + + dim_num = vsi_nn_min(4, inputs[0]->attr.dim_num); + } + else + { + sizes[axis] = inputs[0]->attr.size[axis]; + sizes[axis + 1] = outerSize; + + dim_num = vsi_nn_min((uint32_t)(axis + 2), inputs[0]->attr.dim_num); + } + } + + if (axis != VSI_NN_SOFTMAX_DEFAULT_AXIS && _is_same_shape(inputs[0], sizes, dim_num) == FALSE) + { + new_input = vsi_nn_reshape_tensor(self->graph, inputs[0], sizes, dim_num); + new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], sizes, dim_num); + curr = ((vsi_nn_internal_node_wksp_t *)((self)->internal_node_wksp))->nodes; + curr->inputs[0] = new_input; + curr->outputs[0] = new_output; + p->local.reshaped_input = new_input; + p->local.reshaped_output = new_output; + } + + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_softmax_param * p = &(self->nn_param.softmax); + if (p->local.reshaped_input) + { + vsi_nn_ReleaseTensor(&(p->local.reshaped_input)); + } + if (p->local.reshaped_output) + { + vsi_nn_ReleaseTensor(&(p->local.reshaped_output)); + } + + vsi_nn_internal_deinit_node_wksp( self ); + return status; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + if (vsi_nn_compareVersion(self->graph, 1, 1, 7) == -1) + { + self->nn_param.softmax.axis = VSI_NN_SOFTMAX_DEFAULT_AXIS; + } + if (self->nn_param.softmax.beta == 0.f) + { + self->nn_param.softmax.beta = 1.f; + } + + return status; +} /* op_init() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_internal_node_t* curr = NULL; + if( NULL == self ) + { + return FALSE; + } + + if (self->nn_param.softmax.axis < 0) + self->nn_param.softmax.axis += (int32_t)inputs[0]->attr.dim_num; + + if (self->nn_param.softmax.axis < 0) + { + VSILOGD("SoftMax Invalid Axis: %d", self->nn_param.softmax.axis); + return FALSE; + } + + vsi_nn_internal_init_node_wksp(self); + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_SOFTMAX_INTERNAL, 0, 0); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + curr->node->nn_param.softmax_internal.beta = self->nn_param.softmax.beta; + vsi_nn_internal_setup_node(self, curr); + + return TRUE; +} + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SOFTMAX, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c new file mode 100644 index 0000000..1d7b1b2 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c @@ -0,0 +1,306 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above 
copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_link_list.h" + +#define MAX_SOFTMAX_BATCH 65535 + +static vsi_bool _need_split_softmax + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs + ) +{ + vsi_bool ret = FALSE; + if(inputs[0]->attr.dim_num == 2 && inputs[0]->attr.size[1] > MAX_SOFTMAX_BATCH) + { + ret = TRUE; + } + + return ret; +} /* _need_split_softmax() */ + +static vsi_status _create_split_softmax + ( + vsi_nn_node_t * self, + vx_tensor src, + vx_tensor dst + ) +{ + vsi_nn_softmax_internal_lcl_data * data; + + data = (vsi_nn_softmax_internal_lcl_data *)malloc( sizeof(vsi_nn_softmax_internal_lcl_data) ); + if( NULL == data ) + { + VSILOGE( "Create softmax local data fail." ); + return VSI_FAILURE; + } + memset( data, 0, sizeof(vsi_nn_softmax_internal_lcl_data) ); + data->src_tensor = src; + data->dst_tensor = dst; + data->node = NULL; + + /* Store input & output */ + vsi_nn_LinkListPushStart( + (vsi_nn_link_list_t **)&self->nn_param.softmax_internal.data, + (vsi_nn_link_list_t *)data ); + + return VSI_SUCCESS; +} /* _create_split_softmax() */ + +static vsi_status vsi_nn_softmax_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_softmax_internal_lcl_data * iter; + size_t size = sizeof(vx_nn_softmax_params_t); +#ifdef VX_SOFTMAX_AXIS_PARAMETER_SUPPORT + vx_nn_softmax_params_ext_t paramExt; + vx_nn_softmax_params_t *param = (vx_nn_softmax_params_t *)¶mExt; + + memset(¶mExt, 0, sizeof(vx_nn_softmax_params_ext_t)); + paramExt.base.beta = self->nn_param.softmax_internal.beta; + paramExt.axis = 0; + + size = sizeof(vx_nn_softmax_params_ext_t); +#else + vx_nn_softmax_params_t base; + vx_nn_softmax_params_t *param = &base; + + memset(&base, 0, sizeof(vx_nn_softmax_params_t)); + base.beta = self->nn_param.softmax_internal.beta; +#endif + + status = VSI_FAILURE; + + + status = VSI_FAILURE; + if(param->beta == 0.f) + { + VSILOGW("Softmax's beta is 0. Set beta to 1"); + /* FIXME: Compatible with old case generated by Acuity */ + /* FIXME: for NNAPI case with beta as 0, need refine logical */ + param->beta = 1.f; + } + iter = self->nn_param.softmax_internal.data; + self->n = NULL; + if(NULL != iter) + { + while (iter) + { + iter->node = vxSoftmaxLayer2(self->graph->g, + iter->src_tensor, + param, + size, + iter->dst_tensor); + if(iter->node == NULL) + { + VSILOGE( "Create vxSoftmaxLayer fail." 
); + status = VSI_FAILURE; + break; + } + iter = (vsi_nn_softmax_internal_lcl_data *)vsi_nn_LinkListNext((vsi_nn_link_list_t *)iter); + status = VSI_SUCCESS; + } + } + else + { +#ifdef VX_SOFTMAX_AXIS_PARAMETER_SUPPORT + if ( inputs[0]->attr.dim_num > 2 ) + { + paramExt.axis = 2; + } +#endif + + self->n = vxSoftmaxLayer2( self->graph->g, + inputs[0]->t, + param, + size, + outputs[0]->t); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + } + + return status; +} /* vsi_nn_softmax_compute() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_status status; + vx_tensor in_view_tensor,out_view_tensor; + uint32_t start[VSI_NN_MAX_DIM_NUM],end[VSI_NN_MAX_DIM_NUM]; + uint32_t axis, batch_size; + + in_view_tensor = NULL; + out_view_tensor = NULL; + status = VSI_SUCCESS; + if(direction == VSI_NN_OPTIMIZE_BACKWARD) + { + return status; + } + if(_need_split_softmax(self, inputs) == FALSE) + { + return status; + } + + VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + if( NULL == inputs[0]->t ) + { + vsi_nn_TensorReinit( self->graph, inputs[0] ); + } + if( NULL == outputs[0]->t ) + { + vsi_nn_TensorReinit( self->graph, outputs[0] ); + } + + axis = 1; /* we only split 2D softmax, so the axis = batch dim */ + batch_size = inputs[0]->attr.size[1]; + memset( start, 0, sizeof( uint32_t ) * VSI_NN_MAX_DIM_NUM ); + memset( end, 0, sizeof( uint32_t ) * VSI_NN_MAX_DIM_NUM ); + end[0] = inputs[0]->attr.size[0]; + end[1] = inputs[0]->attr.size[1]; + end[2] = inputs[0]->attr.size[2]; + end[3] = inputs[0]->attr.size[3]; + end[axis] = 0; + while(end[axis] < batch_size) + { + start[axis] = end[axis]; + end[axis] += MAX_SOFTMAX_BATCH; + if(end[axis] > inputs[0]->attr.size[axis]) + { + end[axis] = inputs[0]->attr.size[axis]; + } + + in_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, inputs[0]); + if(NULL == in_view_tensor) + { + VSILOGE( "Create inputs view tensor fail."); + break; + } + out_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, outputs[0]); + if(NULL == out_view_tensor) + { + VSILOGE( "Create outputs view tensor fail."); + break; + } + + status = _create_split_softmax(self, in_view_tensor, out_view_tensor); + if(VSI_SUCCESS != status) + { + VSILOGE( "Create split softmax data struct fail."); + break; + } + } + + return status; +} + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + //TODO: Check tensor shapes. 
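+    /* A minimal check for the TODO above might verify that softmax preserves the
+     * input shape once the output dims are resolved, e.g. (sketch only):
+     *   inputs[0]->attr.dim_num == outputs[0]->attr.dim_num and matching
+     *   attr.size[] entries for every dimension.
+     */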
+ return TRUE; +} /* op_check() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status; + vsi_nn_softmax_internal_lcl_data * data; + vsi_nn_softmax_internal_lcl_data * tmp; + + if(NULL == self) + { + return VSI_FAILURE; + } + data = self->nn_param.softmax_internal.data; + + status = VSI_SUCCESS; + if(data) + { + while( NULL != data ) + { + tmp = (vsi_nn_softmax_internal_lcl_data *)vsi_nn_LinkListPopStart( + (vsi_nn_link_list_t **)&data ); + vxReleaseNode( &tmp->node ); + vxReleaseTensor( &tmp->src_tensor ); + vxReleaseTensor( &tmp->dst_tensor ); + free( tmp ); + tmp = NULL; + } + } + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SOFTMAX_INTERNAL, + /* init */ NULL, + /* compute */ vsi_nn_softmax_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ op_optimize, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softrelu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softrelu.c new file mode 100644 index 0000000..bf7566c --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softrelu.c @@ -0,0 +1,96 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" + + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + status = VSI_FAILURE; + + self->n = vxActivationLayer( + self->graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SOFTRELU, + 0, + 0, + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_RSQRT, self, inputs, outputs); + + return ret; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SOFTRELU, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2batch.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2batch.c new file mode 100644 index 0000000..72be798 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2batch.c @@ -0,0 +1,214 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include +#include +#include + +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vx_nn_reorg_params_ext_t param; + vsi_nn_tensor_t *block_size_tensor = NULL; + vsi_nn_tensor_t *pad_tensor = NULL; + vsi_nn_tensor_attr_t attr; + + memset(¶m, 0, sizeof(vx_nn_reorg_params_t)); + memset(&attr, 0, sizeof(attr)); + attr.size[0] = 2; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + block_size_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)self->nn_param.space2batch.block_size, + &attr); + if( NULL == block_size_tensor ) + { + VSILOGE("Create block_size_tensor fail.(space2batch)"); + return VSI_FAILURE; + } + + memset(&attr, 0, sizeof(attr)); + attr.size[0] = 4; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + pad_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)self->nn_param.space2batch.pad, + &attr); + if( NULL == pad_tensor ) + { + VSILOGE("Create pad_tensor fail.(space2batch)"); + vsi_nn_ReleaseTensor(&block_size_tensor); + block_size_tensor = NULL; + return VSI_FAILURE; + } + + self->nn_param.space2batch.local.block_size_tensor = block_size_tensor; + self->nn_param.space2batch.local.pad_tensor = pad_tensor; + param.base.block_size = REQUIRED_IO(block_size_tensor); + param.pad = OPTIONAL_IO(pad_tensor); + param.base.type = VX_REORG_SPACE_TO_BATCH_ND; + + self->n = vxReorgLayer2( self->graph->g, + inputs[0]->t, + (vx_nn_reorg_params_t *)¶m, + sizeof(vx_nn_reorg_params_ext_t), + outputs[0]->t); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if (inputs[0]->attr.dim_num != 4) + { + VSILOGE("The input tensor shape must be 4-D!(space2batch)"); + return FALSE; + } + + if(self->nn_param.space2batch.block_size[0] < 0 + || self->nn_param.space2batch.block_size[1] < 0 + || self->nn_param.space2batch.pad[0] < 0 + || self->nn_param.space2batch.pad[1] < 0 + || self->nn_param.space2batch.pad[2] < 0 + || self->nn_param.space2batch.pad[3] < 0) + { + VSILOGE("Block size or pad can't be less than zero in space to batch"); + return FALSE; + } + + { + BEGIN_IO_TYPE_DECL(SPACE2DEPTH, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + END_IO_TYPE_DECL(SPACE2DEPTH) + if (!VALIDATE_OP_IO_TYPES(SPACE2DEPTH, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + } + return TRUE; +} /* op_add_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** 
outputs + ) +{ + vsi_nn_space2batch_param * p; + p = (vsi_nn_space2batch_param *)&(self->nn_param.space2batch); + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[3] = + inputs[0]->attr.size[3] * p->block_size[0] * p->block_size[1]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[1] = + (p->pad[2] + p->pad[3] + inputs[0]->attr.size[1]) / p->block_size[1]; + outputs[0]->attr.size[0] = + (p->pad[0] + p->pad[1] + inputs[0]->attr.size[0]) / p->block_size[0]; + outputs[0]->attr.dim_num = 4; + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.space2batch.local.block_size_tensor != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.space2batch.local.block_size_tensor)); + } + if (self->nn_param.space2batch.local.pad_tensor != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.space2batch.local.pad_tensor)); + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SPACE2BATCH, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c new file mode 100644 index 0000000..47a5ac7 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c @@ -0,0 +1,410 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include +#include +#include + +#include "vsi_nn_platform.h" + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_math.h" +#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (2) +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +extern vx_kernel_description_t * vx_kernel_SPACE2DEPTH_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_space2depth_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.space2depth); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ +#define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_INT32, block_size[0] ); + _SET_PARAM( 1, VX_TYPE_INT32, block_size[1] ); +#undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status vx_op_pre_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_kernel_info_t * kernel_info + ) +{ + vsi_status status = VX_SUCCESS; + vsi_nn_type_e dataFormat = inputs[0]->attr.dtype.vx_type; + int8_t input_fixPointPos = 0; + int8_t output_fixPointPos = 0; + vx_bool dataTypeFlg = FALSE; + vsi_nn_tensor_attr_t attr[2]; + + memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t)); + memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t)); + + status = vsi_nn_vxGetTensorAttr(inputs[0]->t, &attr[0]); + status |= vsi_nn_vxGetTensorAttr(outputs[0]->t, &attr[1]); + if (status != VX_SUCCESS) + { + VSILOGE("vsi_nn_vxGetTensorAttr failure! 
at line %d\n", __LINE__);
+        return status;
+    }
+
+    input_fixPointPos = attr[0].dtype.fl;
+    output_fixPointPos = attr[1].dtype.fl;
+
+    if(input_fixPointPos == output_fixPointPos)
+        dataTypeFlg = TRUE;
+
+    if ((dataFormat == VSI_NN_TYPE_INT16 && dataTypeFlg) || dataFormat == VSI_NN_TYPE_FLOAT16)
+    {
+        kernel_info->kernel_index = 2;
+    }
+    else
+    {
+        VSILOGE("Unsupported input or output data format! (SPACE2DEPTH)\n");
+        return VSI_FAILURE;
+    }
+
+    return VSI_SUCCESS;
+}
+
+static vsi_status vx_op_compute
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_status status = VSI_SUCCESS;
+    vx_reference params[_PARAM_NUM];
+    vx_border_t border;
+    vx_reference * args;
+
+    args = &params[_IO_NUM];
+
+    if( NULL == self->n )
+    {
+        return VSI_FAILURE;
+    }
+
+    /* Set inputs and outputs */
+    _set_inputs_outputs( params, inputs, outputs );
+
+    /* Init parameters. */
+    _create_params( self, args, _ARG_NUM );
+
+    /* Pass parameters to node. */
+    status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM );
+
+    _release_params( args, _ARG_NUM );
+
+    border.mode = VX_BORDER_REPLICATE;
+    border.constant_value.U32 = 0;
+    status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border));
+
+    return status;
+}
+
+static vsi_nn_op_compute_t op_compute_list[] =
+{
+    cpu_op_compute,
+    vx_op_compute,
+    NULL
+};
+
+static vsi_status op_compute
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    int32_t size_x = self->nn_param.space2depth.block_size[0];
+    int32_t size_y = self->nn_param.space2depth.block_size[1];
+    if (size_x == size_y)
+    {
+        vx_nn_reorg_params_t param;
+        vsi_nn_tensor_t *block_size_tensor = NULL;
+        memset(&param, 0, sizeof(vx_nn_reorg_params_t));
+
+        block_size_tensor = vsi_nn_VariableToTensor(self,
+            (uint8_t *)&self->nn_param.space2depth.block_size[0],
+            VSI_NN_TYPE_INT32);
+        if( NULL == block_size_tensor )
+        {
+            VSILOGE("Create block_size_tensor fail.(space2depth)");
+            return VSI_FAILURE;
+        }
+        self->nn_param.space2depth.local.block_size_tensor = block_size_tensor;
+        param.block_size = REQUIRED_IO(block_size_tensor);
+        param.type = VX_REORG_SPACE_TO_DEPTH;
+
+        self->n = vxReorgLayer2( self->graph->g,
+            inputs[0]->t,
+            &param,
+            sizeof(vx_nn_reorg_params_t),
+            outputs[0]->t);
+
+        if( NULL != self->n )
+        {
+            status = VSI_SUCCESS;
+        }
+    }
+    else
+    {
+        vsi_nn_kernel_info_t kernel_info;
+        memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t));
+        kernel_info.resource_num = 1;
+        kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *));
+        kernel_info.resource_name[0] = "vsi_nn_kernel_space2depth";
+        //kernel_info.type = VX_KERNEL_TYPE_CPU;
+        kernel_info.type = vsi_nn_GetVXKernelTypeForShader();
+        kernel_info.kernel = vx_kernel_SPACE2DEPTH_list;
+        kernel_info.kernel_index = 1;
+        //kernel_info.init_index = 0;
+        kernel_info.init_index = 1;
+
+        if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type))
+        {
+            vx_op_pre_compute(self, inputs, outputs, &kernel_info);
+        }
+
+        self->n = vsi_nn_RegisterClientKernelAndNewNode(
+            self->graph, &kernel_info);
+        if (kernel_info.resource_name) free(kernel_info.resource_name);
+        if( NULL == self->n )
+        {
+            return VSI_FAILURE;
+        }
+        if (NULL != op_compute_list[kernel_info.init_index])
+        {
+            status = op_compute_list[kernel_info.init_index](self, inputs, outputs);
+        }
+    }
+
+    return status;
+} /* op_compute() */
+
+static vsi_bool op_check
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
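+    /* op_check: reject negative block_size values, then verify that the
+     * input/output dtype pair is one of the supported IO_TYPE combinations
+     * declared in the table below. */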
+{ + if(self->nn_param.space2depth.block_size[0] < 0 + || self->nn_param.space2depth.block_size[1] < 0) + { + VSILOGE("Block size can't be less than zero in space to depth"); + return FALSE; + } + + { + BEGIN_IO_TYPE_DECL(SPACE2DEPTH, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + + /* HW 9.0 */ + IO_TYPE(D_BF16, D_BF16) + END_IO_TYPE_DECL(SPACE2DEPTH) + if(!VALIDATE_OP_IO_TYPES(SPACE2DEPTH, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + } + + return TRUE; +} /* op_check() */ + + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t size_x = node->nn_param.space2depth.block_size[0]; + uint32_t size_y = node->nn_param.space2depth.block_size[1]; + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = inputs[0]->attr.size[0] / size_x; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1] / size_y; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2] * size_x * size_y; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.space2depth.local.block_size_tensor != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.space2depth.local.block_size_tensor)); + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SPACE2DEPTH, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_spatial_transformer.c b/src/tim/vx/internal/src/ops/vsi_nn_op_spatial_transformer.c new file mode 100644 index 0000000..86e3e4f --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_spatial_transformer.c @@ -0,0 +1,738 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_dtype_util.h" + +#define _ARG_NUM (2) +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) +#define _VSI_PARAM (vsi_nn_spatial_transformer_param) + +extern vx_kernel_description_t * vx_kernel_SPATIAL_TRANSFORMER_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for ( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for ( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status = VSI_SUCCESS; + vx_context ctx; + vsi_nn_spatial_transformer_param * p; + int32_t flag = 0; + vsi_nn_tensor_t * thre_tensor; + vsi_nn_tensor_attr_t attr; + + uint16_t value_buf[6] = {0}; + + if ( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = (vsi_nn_spatial_transformer_param *)node->nn_param.client_param; + ctx = vxGetContext( (vx_reference)node->graph->g ); + + flag = ((p->has_theta_1_1 == 1) + | ((p->has_theta_1_2 == 1) << 1) + | ((p->has_theta_1_3 == 1) << 2) + | ((p->has_theta_2_1 == 1) << 3) + | ((p->has_theta_2_2 == 1) << 4) + | ((p->has_theta_2_3 == 1) << 5)); + + params[0] = (vx_reference)vxCreateScalar( ctx, VSI_NN_TYPE_INT32, &flag ); + + memset( &attr, 0, sizeof( vsi_nn_tensor_attr_t ) ); + attr.size[0] = 6; + attr.size[1] = 1; + attr.size[2] = 1; + attr.size[3] = 1; + attr.dim_num = 4; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + + vsi_nn_Float32ToDtype(p->theta_1_1, (uint8_t*)(&value_buf[0]), &attr.dtype); + vsi_nn_Float32ToDtype(p->theta_1_2, (uint8_t*)(&value_buf[1]), &attr.dtype); + vsi_nn_Float32ToDtype(p->theta_1_3, (uint8_t*)(&value_buf[2]), &attr.dtype); + vsi_nn_Float32ToDtype(p->theta_2_1, (uint8_t*)(&value_buf[3]), &attr.dtype); + vsi_nn_Float32ToDtype(p->theta_2_2, (uint8_t*)(&value_buf[4]), &attr.dtype); + vsi_nn_Float32ToDtype(p->theta_2_3, (uint8_t*)(&value_buf[5]), &attr.dtype); + + thre_tensor = vsi_nn_CreateTensorFromData( node->graph,(uint8_t *)&value_buf, &attr ); + + params[1] = (vx_reference)thre_tensor->t; + p->lcl.local_tensor = thre_tensor; + p->lcl.scl = (vx_scalar)params[0]; +#if 0 + /* Init parameters */ + #define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VSI_NN_TYPE_FLOAT32, has_theta_1_3 ); + _SET_PARAM( 1, 
VSI_NN_TYPE_FLOAT32, has_theta_2_1 ); + _SET_PARAM( 2, VSI_NN_TYPE_FLOAT32, has_theta_1_2 ); + _SET_PARAM( 3, VSI_NN_TYPE_FLOAT32, theta_2_1 ); + _SET_PARAM( 4, VSI_NN_TYPE_FLOAT32, has_output_W ); + _SET_PARAM( 5, VSI_NN_TYPE_INT32, output_W ); + _SET_PARAM( 6, VSI_NN_TYPE_FLOAT32, theta_1_3 ); + _SET_PARAM( 7, VSI_NN_TYPE_FLOAT32, theta_2_2 ); + _SET_PARAM( 8, VSI_NN_TYPE_FLOAT32, theta_1_2 ); + _SET_PARAM( 9, VSI_NN_TYPE_INT32, output_H ); + _SET_PARAM( 10, VSI_NN_TYPE_FLOAT32, has_theta_2_3 ); + _SET_PARAM( 11, VSI_NN_TYPE_FLOAT32, theta_2_3 ); + _SET_PARAM( 12, VSI_NN_TYPE_FLOAT32, has_theta_2_2 ); + _SET_PARAM( 13, VSI_NN_TYPE_FLOAT32, has_output_H ); + _SET_PARAM( 14, VSI_NN_TYPE_FLOAT32, has_theta_1_1 ); + _SET_PARAM( 15, VSI_NN_TYPE_FLOAT32, theta_1_1 ); + #undef _SET_PARAM +#endif +//set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_nn_spatial_transformer_param * p = NULL; + + p = (vsi_nn_spatial_transformer_param *)node->nn_param.client_param; + + if (p->lcl.local_tensor) vsi_nn_ReleaseTensor(&p->lcl.local_tensor); + if (p->lcl.scl) vxReleaseScalar(&p->lcl.scl); +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( self, args, _ARG_NUM ); + + return status; +} + +vsi_status setUPGridData(uint32_t output_W_, uint32_t output_H_, float scale, int32_t zeropoint, + vsi_nn_dtype_t data_type, vsi_nn_qnt_type_e qnt_type, uint8_t fp, int16_t *tensorData) +{ + vsi_status status = VSI_SUCCESS; + uint32_t x = 0; + uint32_t y = 0; + uint32_t idx = 0; + float *tmp_buf = NULL; + uint32_t i = 0; + vsi_nn_dtype_t dtype; + + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + dtype.fl = 0; + dtype.scale = 1; + dtype.zero_point = 0; + + tmp_buf = (float*) malloc(output_W_ * output_H_ * 3 * sizeof(float)); + if ( tmp_buf == NULL ) + { + return VX_FAILURE; + } + for (y = 0; y < output_H_; y++ ) + { + for (x = 0; x < output_W_; x++) + { + float data0 = y * (float)1.0 / (float)output_H_ * 2 - 1; + float data1 = x * (float)1.0 / (float)output_W_ * 2 - 1; + float data2 = 1; + + tmp_buf[idx++] = data0; + tmp_buf[idx++] = data1; + tmp_buf[idx++] = data2; + } + } + + for (i = 0; i < output_H_ * output_W_ * 3; i++) + { + vsi_nn_Float32ToDtype( tmp_buf[i], (uint8_t*)&tensorData[i], &dtype ); + } + + if (tmp_buf) + { + free(tmp_buf); + tmp_buf = NULL; + } + return status; +} + +static vsi_status vx_op_compute_setupThre + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[4] = {NULL}; + //vx_reference * args; + vsi_nn_spatial_transformer_param * p = NULL; + int flag = 0; + vsi_nn_tensor_t * thre_tensor = NULL; + vsi_nn_tensor_attr_t attr; + vx_context ctx = NULL; + vx_scalar flag_s = NULL; + vx_tensor tmp_t = NULL, tmp_t1 = NULL; + + //float flag_buf[6]; + vx_uint16 value_buf[6]; + + memset( params, 0, sizeof( 
vx_reference * ) * 4 ); + p = (vsi_nn_spatial_transformer_param *)self->nn_param.client_param; + ctx = vxGetContext( (vx_reference)self->graph->g ); + + flag = ((p->has_theta_1_1 == 1) + | ((p->has_theta_1_2 == 1) << 1) + | ((p->has_theta_1_3 == 1) << 2) + | ((p->has_theta_2_1 == 1) << 3) + | ((p->has_theta_2_2 == 1) << 4) + | ((p->has_theta_2_3 == 1) << 5)); + + memset( &attr, 0, sizeof( vsi_nn_tensor_attr_t ) ); + attr.size[0] = 6; + attr.size[1] = 1; + attr.size[2] = 1; + attr.size[3] = 1; + attr.dim_num = 4; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.fl = 0; + attr.dtype.scale = 1; + attr.dtype.zero_point = 0; + attr.vtl = FALSE; + vsi_nn_Float32ToDtype( p->theta_1_1, (uint8_t*)(&value_buf[0]), &attr.dtype ); + vsi_nn_Float32ToDtype( p->theta_1_2, (uint8_t*)(&value_buf[1]), &attr.dtype ); + vsi_nn_Float32ToDtype( p->theta_1_3, (uint8_t*)(&value_buf[2]), &attr.dtype ); + vsi_nn_Float32ToDtype( p->theta_2_1, (uint8_t*)(&value_buf[3]), &attr.dtype ); + vsi_nn_Float32ToDtype( p->theta_2_2, (uint8_t*)(&value_buf[4]), &attr.dtype ); + vsi_nn_Float32ToDtype( p->theta_2_3, (uint8_t*)(&value_buf[5]), &attr.dtype ); + + thre_tensor = vsi_nn_CreateTensorFromData( self->graph,(uint8_t *)&value_buf, &attr ); + + if ( NULL == self->n ) + { + status = VSI_FAILURE; + if (thre_tensor) + { + vsi_nn_ReleaseTensor( &thre_tensor); + thre_tensor = NULL; + } + return status; + } + + flag_s = vxCreateScalar( ctx, VSI_NN_TYPE_INT32, &flag ); + + params[0] = (vx_reference)thre_tensor->t; + + attr.size[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1]; + attr.size[1] = 1; + attr.size[2] = inputs[0]->attr.size[2]; + attr.size[3] = inputs[0]->attr.size[3]; + attr.dim_num = inputs[0]->attr.dim_num; + + tmp_t = vxReshapeTensor( inputs[0]->t, (vx_int32*)attr.size, attr.dim_num ); + + params[1] = (vx_reference)tmp_t; + params[2] = (vx_reference)flag_s; + + attr.size[0] = outputs[0]->attr.size[0] * outputs[0]->attr.size[1]; + attr.size[1] = 1; + attr.size[2] = outputs[0]->attr.size[2]; + attr.size[3] = outputs[0]->attr.size[3]; + attr.dim_num = outputs[0]->attr.dim_num; + + tmp_t1 = vxReshapeTensor( outputs[0]->t, (vx_int32*)attr.size, attr.dim_num ); + + params[3] = (vx_reference)tmp_t1; + + /* Pass parameters to node. 
*/
+    status = vsi_nn_ClientNodePassParameters( self->n, params, 4 );
+
+    //_release_params( args, 4 );
+    if (thre_tensor)
+    {
+        vsi_nn_ReleaseTensor( &thre_tensor);
+        thre_tensor = NULL;
+    }
+    if (tmp_t)
+    {
+        vxReleaseTensor( &tmp_t );
+        tmp_t = NULL;
+    }
+    if (tmp_t1)
+    {
+        vxReleaseTensor( &tmp_t1 );
+        tmp_t1 = NULL;
+    }
+    if (flag_s)
+    {
+        vxReleaseScalar( &flag_s );
+        flag_s = NULL;
+    }
+
+    return status;
+}
+
+static vsi_status vx_op_compute_gemm
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_status status = VSI_SUCCESS;
+    vx_reference params[3] = {NULL};
+    vx_tensor paraTensor0 = NULL, paraTensor1 = NULL, paraTensor2 = NULL;
+    int32_t size[4] = {1};
+    vsi_nn_tensor_attr_t out_attr;
+    int16_t *out_buffer = NULL;
+    uint32_t output_H = 0, output_W = 0;
+    float *buf = NULL;
+
+    memcpy( &out_attr, &(outputs[0]->attr), sizeof(vsi_nn_tensor_attr_t) );
+    output_W = out_attr.size[0];
+    output_H = out_attr.size[1];
+    out_buffer = (int16_t*)malloc( output_W * output_H * 3 * sizeof(int16_t) );
+    status = setUPGridData( output_W, output_H, out_attr.dtype.scale, out_attr.dtype.zero_point,
+        out_attr.dtype, out_attr.dtype.qnt_type, out_attr.dtype.fl, out_buffer );
+    if (status == VSI_FAILURE)
+    {
+        goto OnError;
+    }
+    status = vsi_nn_copy_tensor_patch( inputs[1]->t, &inputs[1]->attr, out_buffer, VX_WRITE_ONLY );
+    if (status == VSI_FAILURE)
+    {
+        goto OnError;
+    }
+    /* Copy tensor to buffer, and convert buffer to float32 format */
+    buf = vsi_nn_ConvertTensorToFloat32Data(self->graph, inputs[1]);
+    if (buf == NULL)
+    {
+        goto OnError;
+    }
+    memset( params, 0, sizeof( vx_reference * ) * 3 );
+
+    size[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
+    size[1] = 1;
+    paraTensor0 = vxReshapeTensor( inputs[0]->t, size, 2 );
+
+    size[0] = inputs[1]->attr.size[0] * output_W;
+    size[1] = output_H;
+    paraTensor1 = vxReshapeTensor( inputs[1]->t, size, 2 );
+
+    size[0] = inputs[0]->attr.size[1] * output_W;
+    size[1] = output_H;
+    paraTensor2 = vxReshapeTensor( inputs[2]->t, size, 2 );
+
+    if ( NULL == self->n )
+    {
+        status = VSI_FAILURE;
+        goto OnError;
+    }
+
+    params[0] = (vx_reference)paraTensor0;
+    params[1] = (vx_reference)paraTensor1;
+    params[2] = (vx_reference)paraTensor2;
+    /* Pass parameters to node. */
+    status = vsi_nn_ClientNodePassParameters( self->n, params, 3 );
+
+OnError:
+    if (paraTensor0)
+    {
+        vxReleaseTensor( &paraTensor0 );
+        paraTensor0 = NULL;
+    }
+    if (paraTensor1)
+    {
+        vxReleaseTensor( &paraTensor1 );
+        paraTensor1 = NULL;
+    }
+    if (paraTensor2)
+    {
+        vxReleaseTensor( &paraTensor2 );
+        paraTensor2 = NULL;
+    }
+    if (out_buffer)
+    {
+        free(out_buffer);
+        out_buffer = NULL;
+    }
+    return status;
+}
+
+
+static vsi_status vx_op_compute_interp
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_status status = VSI_SUCCESS;
+    vx_reference params[3];
+    vx_border_t border;
+
+    memset( params, 0, sizeof( vx_reference * ) * 3 );
+
+    params[0] = (vx_reference)inputs[3]->t;
+    params[1] = (vx_reference)inputs[2]->t;
+    params[2] = (vx_reference)outputs[0]->t;
+
+    if( NULL == self->n )
+    {
+        return VSI_FAILURE;
+    }
+
+    /* Pass parameters to node.
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, 3 ); + + border.mode = VX_BORDER_CONSTANT; + border.constant_value.S16 = 0; + + status |= vxSetNodeAttribute( self->n, VX_NODE_BORDER, + &border, sizeof(border) ); + // _release_params( args, 3 ); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute_setupThre, + vx_op_compute_gemm, + vx_op_compute_interp, + NULL +}; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VX_SUCCESS; + vsi_nn_kernel_info_t kernel_info; + char *path = NULL; + vsi_nn_tensor_attr_t attr,outattr; + vsi_nn_tensor_t *tmp_output_tensor[5] = {0}; + vsi_nn_tensor_t *input_t,*fc_t,*output_t; + vx_graph graph = self->graph->g; + + memset( &kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t) ); + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + + memcpy( &attr, &(inputs[0]->attr), sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.fl = 0; + attr.dtype.scale = 1; + attr.dtype.zero_point = 0; + attr.vtl = FALSE; + + input_t = vsi_nn_CreateTensor( self->graph, &attr ); + + memcpy( &attr, &(inputs[1]->attr), sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.fl = 0; + attr.dtype.scale = 1; + attr.dtype.zero_point = 0; + attr.vtl = FALSE; + fc_t= vsi_nn_CreateTensor( self->graph, &attr ); + + memcpy( &attr, &(outputs[0]->attr), sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.fl = 0; + attr.dtype.scale = 1; + attr.dtype.zero_point = 0; + attr.vtl = FALSE; + output_t= vsi_nn_CreateTensor( self->graph, &attr ); + + vxTensorCopyNode( graph, inputs[0]->t, input_t->t ); + vxTensorCopyNode( graph, inputs[1]->t, fc_t->t ); + vxTensorCopyNode( graph, output_t->t, outputs[0]->t ); + + memcpy( &outattr, &(outputs[0]->attr), sizeof(vsi_nn_tensor_attr_t) ); + // Tensor for thre_output + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t)); + attr.size[0] = 3; + attr.size[1] = 2; + attr.size[2] = 1; + attr.size[3] = 1; + attr.dim_num = 2; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.vtl = FALSE; + tmp_output_tensor[0] = vsi_nn_CreateTensor( self->graph, &attr ); + + // Tensor for grid + attr.size[0] = 3; + attr.size[1] = outattr.size[0] * outattr.size[1];//p->output_H * p->output_W; + attr.size[2] = 1; + attr.size[3] = 1; + attr.dim_num = 2; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.vtl = FALSE; + tmp_output_tensor[1] = vsi_nn_CreateTensor( self->graph, &attr ); + + // Tensor for grid_out + attr.size[0] = 2 * outattr.size[0];//2 * p->output_W; + attr.size[1] = outattr.size[1];//p->output_H ; + attr.size[2] = 1; + attr.size[3] = 1; + attr.dim_num = 2; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.vtl = FALSE; + tmp_output_tensor[2] = vsi_nn_CreateTensor( self->graph, &attr ); + status = VSI_FAILURE; + + + kernel_info.type = VX_KERNEL_TYPE_VX; + kernel_info.kernel = vx_kernel_SPATIAL_TRANSFORMER_list; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc( kernel_info.resource_num * sizeof(char *) ); + kernel_info.resource_name[0] = "vsi_nn_kernel_transform_setupThres"; + + path = getenv("USER_VX_SOURCE_PATH"); + if(path) + vsi_nn_VxResourceSetPath( path ); + + kernel_info.kernel_index = 1; + kernel_info.init_index = 1; + + // add setupThre + 
self->n = vsi_nn_RegisterClientKernelAndNewNode( self->graph, &kernel_info); + + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index]( self, &fc_t, tmp_output_tensor ); + } + + if ( NULL == self->n ) + { + status = VSI_FAILURE; + goto final; + } + + // add gemm + kernel_info.kernel_index = 2; + kernel_info.init_index = 2; + kernel_info.resource_name[0] = "vsi_nn_kernel_transform_gemm"; + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, tmp_output_tensor, outputs); + } + + // add interp + if (input_t->attr.dim_num == 2 && input_t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 + && output_t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) + { + kernel_info.kernel_index = 3; + kernel_info.init_index = 3; + } + else if (input_t->attr.dim_num == 4 && input_t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 + && output_t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) + { + kernel_info.kernel_index = 4; + kernel_info.init_index = 3; + } + kernel_info.resource_name[0] = "vsi_nn_kernel_transform_interp"; + self->n = vsi_nn_RegisterClientKernelAndNewNode( self->graph, &kernel_info); + tmp_output_tensor[3] = input_t; + + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index]( self, tmp_output_tensor, &output_t ); + } + if (tmp_output_tensor[0]) + { + vsi_nn_ReleaseTensor( &tmp_output_tensor[0] ); + tmp_output_tensor[0] = NULL; + } + if (tmp_output_tensor[1]) + { + vsi_nn_ReleaseTensor( &tmp_output_tensor[1] ); + tmp_output_tensor[1] = NULL; + } + if (tmp_output_tensor[2]) + { + vsi_nn_ReleaseTensor( &tmp_output_tensor[2] ); + tmp_output_tensor[2] = NULL; + } + if (input_t) + { + vsi_nn_ReleaseTensor( &input_t ); + input_t = NULL; + } + if (fc_t) + { + vsi_nn_ReleaseTensor( &fc_t ); + fc_t = NULL; + } + if (output_t) + { + vsi_nn_ReleaseTensor( &output_t ); + output_t = NULL; + } + +final: + if (kernel_info.resource_name) + { + free(kernel_info.resource_name); + kernel_info.resource_name = NULL; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
*/ + //vsi_nn_spatial_transformer_param * p; + //p = (vsi_nn_spatial_transformer_param *)&node->nn_param.client_param; + + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = inputs[0]->attr.size[0];//p->output_W; // W + outputs[0]->attr.size[1] = inputs[0]->attr.size[1];//p->output_H; // H + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; // C + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; // N + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SPATIAL_TRANSFORMER, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c new file mode 100644 index 0000000..7fa6eee --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c @@ -0,0 +1,349 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_link_list.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret; + uint32_t num,i,j; + uint32_t slices_num = self->nn_param.split.slices_num; + uint32_t axis = self->nn_param.split.axis; + + /* compute the output tensor number */ + num = (uint32_t)(self->output.num - 1); + while( NULL == outputs[num] ) + { + num --; + } + num++; + + ret = TRUE; + /* 1. 
check the input tensor number */ + if(self->input.num != 1) + { + VSILOGE("The split layer input num must be 1, here is %u\n", self->input.num); + return FALSE; + } + + /* 2. check output tensor number */ + if(slices_num == 0) + { + uint32_t remaind = inputs[0]->attr.size[axis] % num; + if(remaind != 0) + { + VSILOGE("Can not average the input tensor %u shape\n", axis); + return FALSE; + } + } + else if(slices_num != num) + { + VSILOGE( "slices num %u != output tensor num %u\n", slices_num, num); + return FALSE; + } + + /* 3. check output tensor shape and dimensions */ + for( i = 0; i < num; i ++ ) + { + /* the virtual tensor shape has not been calculated yet */ + if(outputs[i]->attr.vtl == TRUE + || outputs[i]->attr.dim_num == VSI_NN_DIM_AUTO) + continue; + + if( outputs[i]->attr.dim_num != inputs[0]->attr.dim_num ) + { + VSILOGE( "Split dims num(%d vs %d)", + outputs[i]->attr.dim_num, + inputs[0]->attr.dim_num); + ret = FALSE; + break; + } + + for( j = 0; j < outputs[i]->attr.dim_num; j ++ ) + { + if( axis == j ) + { + continue; + } + + if( outputs[i]->attr.size[j] != inputs[0]->attr.size[j] ) + { + VSILOGE( "Split dims size(%d vs %d)", + outputs[i]->attr.size[j], + inputs[0]->attr.size[j]); + ret = FALSE; + break; + } + } + + if( FALSE == ret ) + { + break; + } + } + for(i = 0; i < num; i++) + { + BEGIN_IO_TYPE_DECL(SPLIT, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_I32, D_I32) + END_IO_TYPE_DECL(SPLIT) + if(!VALIDATE_OP_IO_TYPES(SPLIT, self, inputs, 1, &outputs[i], 1)) { + char* desc = generate_op_io_types_desc(inputs, 1, &outputs[i], 1); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + } + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret; + uint32_t i,num,average; + uint32_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t axis = self->nn_param.split.axis; + const uint32_t *slices = self->nn_param.split.slices; + uint32_t slices_num = self->nn_param.split.slices_num; + vsi_nn_split_param * p = NULL; + vsi_nn_internal_node_t* curr = NULL; + + ret = TRUE; + average = 1; + /* compute the output tensor number */ + num = (uint32_t)(self->output.num - 1); + while( NULL == outputs[num] ) + { + num --; + } + num++; + + p = &(self->nn_param.split); + vsi_nn_internal_init_node_wksp( self ); + + if(slices_num == 0) + { + average = inputs[0]->attr.size[axis] / num; + } + + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + p->lcl_data->stride_dims[i] = 1; + } + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + end[i] = inputs[0]->attr.size[i]; + } + end[axis] = 0; + for(i = 0; i < num; i++) + { + int j; + start[axis] = end[axis]; + if(slices_num == 0) + end[axis] += average; + else + end[axis] += slices[i]; + + memcpy(&outputs[i]->attr.dtype, &inputs[0]->attr.dtype, sizeof(vsi_nn_dtype_t)); + outputs[i]->attr.dim_num = inputs[0]->attr.dim_num; + for(j = 0; j < VSI_NN_MAX_DIM_NUM; j++) + { + outputs[i]->attr.size[j] = inputs[0]->attr.size[j]; + } + outputs[i]->attr.size[axis] = end[axis] - start[axis]; + memcpy(p->lcl_data->begin_dims, start, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + memcpy(p->lcl_data->end_dims, end, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + curr = vsi_nn_internal_new_node( self, 
VSI_NN_OP_STRIDED_SLICE, 0, 0 ); + curr->node->nn_param.strided_slice.begin_dims = p->lcl_data->begin_dims; + curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; + curr->node->nn_param.strided_slice.end_dims = p->lcl_data->end_dims; + curr->node->nn_param.strided_slice.end_dims_num = inputs[0]->attr.dim_num; + curr->node->nn_param.strided_slice.stride_dims = p->lcl_data->stride_dims; + curr->node->nn_param.strided_slice.stride_dims_num = inputs[0]->attr.dim_num; + curr->node->nn_param.strided_slice.begin_mask = 0; + curr->node->nn_param.strided_slice.end_mask = 0; + curr->node->nn_param.strided_slice.shrink_axis_mask = 0; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[i]; + vsi_nn_internal_setup_node( self, curr ); + } + + return ret; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_split_param * p = NULL; + + p = &(self->nn_param.split); + + p->lcl_data = + (vsi_nn_split_lcl_data *)malloc(sizeof(vsi_nn_split_lcl_data)); + if (NULL == p->lcl_data) + { + return VX_ERROR_NO_MEMORY; + } + memset(p->lcl_data, 0, sizeof(vsi_nn_split_lcl_data)); + + p->lcl_data->begin_dims = + (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + if (NULL == p->lcl_data->begin_dims) + { + return VX_ERROR_NO_MEMORY; + } + memset(p->lcl_data->begin_dims, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + + p->lcl_data->end_dims = + (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + if (NULL == p->lcl_data->end_dims) + { + return VX_ERROR_NO_MEMORY; + } + memset(p->lcl_data->end_dims, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + + p->lcl_data->stride_dims = + (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + if (NULL == p->lcl_data->stride_dims) + { + return VX_ERROR_NO_MEMORY; + } + memset(p->lcl_data->stride_dims, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + + return status; +} + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_split_param * p = NULL; + + p = &(self->nn_param.split); + + if (p->lcl_data->begin_dims) + { + free(p->lcl_data->begin_dims); + p->lcl_data->begin_dims = NULL; + } + + if (p->lcl_data->end_dims) + { + free(p->lcl_data->end_dims); + p->lcl_data->end_dims = NULL; + } + + if (p->lcl_data->stride_dims) + { + free(p->lcl_data->stride_dims); + p->lcl_data->stride_dims = NULL; + } + + if (p->lcl_data) + { + free(p->lcl_data); + p->lcl_data = NULL; + } + + vsi_nn_internal_deinit_node_wksp( self ); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SPLIT, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 1, + /* output_num */ 16 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sqrt.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sqrt.c new file mode 100644 index 0000000..e711b48 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sqrt.c @@ -0,0 +1,96 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and 
associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" + + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + status = VSI_FAILURE; + + self->n = vxActivationLayer( + self->graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SQRT, + 0, + 0, + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_RSQRT, self, inputs, outputs); + + return ret; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SQRT, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_square.c b/src/tim/vx/internal/src/ops/vsi_nn_op_square.c new file mode 100644 index 0000000..1124ef7 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_square.c @@ -0,0 +1,96 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" + + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + status = VSI_FAILURE; + + self->n = vxActivationLayer( + self->graph->g, + inputs[0]->t, + VX_NN_ACTIVATION_SQUARE, + 0, + 0, + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_RSQRT, self, inputs, outputs); + + return ret; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SQUARE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c new file mode 100644 index 0000000..fb4bcf7 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c @@ -0,0 +1,194 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_internal_node.h" + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + uint32_t i = 0; + + if ( self->nn_param.squeeze.axis_num == 0 ) + { + for ( i = 0; i < inputs[0]->attr.dim_num; i++) + { + if (inputs[0]->attr.size[i] != 1) + { + VSILOGE("the size of rank %d must be reported if squeezing a dimension that is not 1", + i); + ret = FALSE; + } + } + } + else + { + for ( i = 0; i < self->nn_param.squeeze.axis_num; i++) + { + int32_t rank = self->nn_param.squeeze.axis[i]; + if (inputs[0]->attr.size[rank] != 1) + { + VSILOGE("the size of rank %d must be reported if squeezing a dimension that is not 1", + rank); + ret = FALSE; + } + } + } + + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. */ + uint32_t i = 0; + uint32_t outIdx = 0; + vsi_bool shouldSqueeze[VSI_NN_MAX_DIM_NUM] = {FALSE}; + uint32_t numDimsSqueezed = 0; + vsi_nn_internal_node_t* curr = NULL; + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + memset(shouldSqueeze, 0, sizeof(vsi_bool) * VSI_NN_MAX_DIM_NUM); + + if ( self->nn_param.squeeze.axis_num == 0 ) + { + outputs[0]->attr.size[0] = 1; + outputs[0]->attr.dim_num = 1; + } + else + { + for ( i = 0; i < self->nn_param.squeeze.axis_num; i++) + { + int32_t rank = self->nn_param.squeeze.axis[i]; + + rank = rank < 0 ? rank + inputs[0]->attr.dim_num : rank; + + if ( !shouldSqueeze[rank] ) + { + ++numDimsSqueezed; + } + shouldSqueeze[rank] = TRUE; + } + + for ( i = 0; i < inputs[0]->attr.dim_num; i++) + { + if (!shouldSqueeze[i]) + { + outputs[0]->attr.size[outIdx ++] = inputs[0]->attr.size[i]; + } + } + + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num - numDimsSqueezed; + } + } + + vsi_nn_internal_init_node_wksp( self ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr->node->nn_param.reshape.size = outputs[0]->attr.size; + curr->node->nn_param.reshape.dim_num = outputs[0]->attr.dim_num; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node( self, curr ); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_internal_deinit_node_wksp( self ); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SQUEEZE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c new file mode 100644 index 0000000..84e3481 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c @@ -0,0 +1,244 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is 
hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_link_list.h" +#include "utils/vsi_nn_dtype_util.h" +#include "client/vsi_nn_vxkernel.h" + +#define _ARG_NUM (1) +#define _INPUT_NUM VSI_NN_STACK_MAX_INPUTS +#define _OUTPUT_NUM (1) +#define _IO_NUM (2) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_stack_param * p; + uint32_t i, j; + uint32_t block_size = 1; + uint32_t block_num = 1; + uint32_t axis; + uint32_t input_shape[2] = {1, 1}; + uint32_t output_shape[2] = {1, 1}; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_t *output_rs = NULL; + vsi_nn_stack_lcl_data * data; + vsi_bool ret = TRUE; + + vsi_nn_internal_init_node_wksp( node ); + + p = (vsi_nn_stack_param *)&(node->nn_param.stack); + axis = p->axis; + + for (i = 0; i < axis; i++) + { + block_size *= inputs[0]->attr.size[i]; + } + + for (i = axis; i < inputs[0]->attr.dim_num; i++) + { + block_num *= inputs[0]->attr.size[i]; + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num + 1; + + for(i = 0, j = 0; j < outputs[0]->attr.dim_num; j++) + { + if (j == p->axis) + { + outputs[0]->attr.size[j] = node->input.num; + } + else + { + outputs[0]->attr.size[j] = inputs[0]->attr.size[i ++]; + } + } + } + + input_shape[0] = block_size; + input_shape[1] = block_num; + + curr = vsi_nn_internal_new_node( node, VSI_NN_OP_CONCAT, node->input.num, node->output.num ); + for (i = 0; i < node->input.num; i++) + { + vsi_nn_tensor_t *input_rs = NULL; + /* Malloc ptr */ + data = (vsi_nn_stack_lcl_data *)malloc( sizeof(vsi_nn_stack_lcl_data) ); + if( NULL == data ) + { + VSILOGE( "Create stack local data fail." ); + ret = FALSE; + goto final; + } + memset( data, 0, sizeof(vsi_nn_stack_lcl_data) ); + + input_rs = vsi_nn_reshape_tensor(node->graph, inputs[i], input_shape, 2); + data->src_in = input_rs; + /* Store node, ptr */ + vsi_nn_LinkListPushStart( + (vsi_nn_link_list_t **)&node->nn_param.stack.lcl_data, + (vsi_nn_link_list_t *)data ); + + curr->inputs[i] = input_rs; + } + + if (block_num == 1) + { + output_shape[0] = block_size; + output_shape[1] = node->input.num; + axis = 1; + } + else + { + output_shape[0] = block_size * node->input.num; + output_shape[1] = block_num; + axis = 0; + } + + /* Malloc ptr */ + data = (vsi_nn_stack_lcl_data *)malloc( sizeof(vsi_nn_stack_lcl_data) ); + if( NULL == data ) + { + VSILOGE( "Create stack local data fail." 
); + ret = FALSE; + goto final; + } + memset( data, 0, sizeof(vsi_nn_stack_lcl_data) ); + + output_rs = vsi_nn_reshape_tensor(node->graph, outputs[0], output_shape, 2); + data->src_in = output_rs; + /* Store node, ptr */ + vsi_nn_LinkListPushStart( + (vsi_nn_link_list_t **)&node->nn_param.stack.lcl_data, + (vsi_nn_link_list_t *)data ); + + curr->outputs[0] = output_rs; + curr->node->nn_param.concat.axis = axis; + +final: + vsi_nn_internal_setup_node(node, curr); + + return ret; +} /* op_setup() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_stack_lcl_data * data; + vsi_nn_stack_lcl_data * tmp; + + if(NULL == self) + { + return VSI_FAILURE; + } + + data = self->nn_param.stack.lcl_data; + while( NULL != data ) + { + tmp = (vsi_nn_stack_lcl_data *)vsi_nn_LinkListPopStart( + (vsi_nn_link_list_t **)&data ); + vsi_nn_ReleaseTensor(&tmp->src_in); + free( tmp ); + } + + vsi_nn_internal_deinit_node_wksp( self ); + + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ STACK, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c new file mode 100644 index 0000000..fa32a0c --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -0,0 +1,736 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vx_int32 get_slice_axis_value(vx_int32 value, vx_uint32 dimension_size) +{ + vx_int32 axis_vlaue = 0; + if (value < 0) + axis_vlaue = value + dimension_size; + else + axis_vlaue = value; + return axis_vlaue; +} + +static vx_int32 get_slice_mask_start_value(vx_int32 stride, vx_uint32 dimension_size) +{ + vx_int32 start_vlaue = 0; + if (stride > 0) + start_vlaue = 0; + else + start_vlaue = dimension_size - 1; + return start_vlaue; +} + +static vx_int32 get_slice_mask_stop_value(vx_int32 stride, vx_uint32 dimension_size) +{ + vx_int32 stop_vlaue = 0; + if (stride > 0) + stop_vlaue = dimension_size; + else + stop_vlaue = -1; + return stop_vlaue; +} + +static vx_int32 get_slice_clamp_stop(vx_int32 stride, vx_int32 stop, vx_uint32 dimension_size) +{ + vx_int32 stop_vlaue = 0; + if (stride > 0) + { + stop_vlaue = vsi_nn_clamp(stop, 0, (vx_int32)dimension_size); + } + else + { + stop_vlaue = vsi_nn_clamp(stop, -1, (vx_int32)dimension_size - 1); + } + return stop_vlaue; +} + +static vsi_bool _check_neg_start_end_dims + ( + int32_t *start, + int32_t *stop, + uint32_t dims + ) +{ + uint32_t i = 0; + + for (i = 0; i < dims; i++) + { + if (start[i] < 0 || stop[i] < 0) + return TRUE; + } + + return FALSE; +} /* _is_same_quant */ + +static vsi_bool _get_stride_slice_start_stop_stride + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vx_uint32 i = 0; + vx_int32 int32_value = 0; + vsi_nn_strided_slice_param *p = &(self->nn_param.strided_slice); + int32_t *start = p->lcl2_data->begin_dims; + int32_t *stop = p->lcl2_data->end_dims; + int32_t *stride = p->lcl2_data->stride_dims; + + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i ++) + { + start[i] = 0; + stop[i] = 1; + stride[i] = 1; + } + + for (i = 0; i < p->stride_dims_num; ++i) + { + stride[i] = p->stride_dims[i]; + } + + for (i = 0; i < p->begin_dims_num; ++i) + { + int32_value = p->begin_dims[i]; + + start[i] = get_slice_axis_value(int32_value, inputs[0]->attr.size[i]); + } + + for (i = 0; i < p->end_dims_num; ++i) + { + int32_value = p->end_dims[i]; + + stop[i] = get_slice_axis_value(int32_value, inputs[0]->attr.size[i]); + } + + /*if the ith bit of mask is set, the start or stop will be the fullest possible range in that dimension.*/ + for (i = 0; i < inputs[0]->attr.dim_num; i ++) + { + if (p->begin_mask & (1 << i)) + { + start[i] = get_slice_mask_start_value(stride[i], inputs[0]->attr.size[i]); + } + + start[i] = vsi_nn_clamp(start[i], 0, (vx_int32)(inputs[0]->attr.size[i] - 1)); + + if (p->shrink_axis_mask & (1 << i)) + { + stop[i] = start[i] + 1; + } + + if (p->end_mask & (1 << i)) + { + stop[i] = get_slice_mask_stop_value(stride[i], inputs[0]->attr.size[i]); + } + + stop[i] = get_slice_clamp_stop(stride[i], stop[i], inputs[0]->attr.size[i]); + } + + /* reset start stop and stride when output size is 1*/ + for (i = 0; i < outputs[0]->attr.dim_num; i ++) + { + if (outputs[0]->attr.size[i] == 1 && stride[i] < 0) + { + stride[i] = 1; + stop[i] = start[i] + 1; + } + } + + if (_check_neg_start_end_dims(start, stop, inputs[0]->attr.dim_num)) + { + 
memcpy(start, p->begin_dims, sizeof(int32_t) * p->begin_dims_num); + memcpy(stop, p->end_dims, sizeof(int32_t) * p->end_dims_num); + memcpy(stride, p->stride_dims, sizeof(int32_t) * p->stride_dims_num); + p->lcl2_data->begin_mask = p->begin_mask; + p->lcl2_data->end_mask = p->end_mask; + p->lcl2_data->shrink_axis_mask = p->shrink_axis_mask; + } + + return TRUE; +} + +static vsi_bool _check_is_same_shape( + vsi_nn_tensor_t ** inputs, + int32_t *start, + int32_t *stop, + int32_t *stride + ) +{ + int32_t i = 0; + int32_t dims = (int32_t)inputs[0]->attr.dim_num; + + for (i = dims - 1; i >= 0; i --) + { + if (inputs[0]->attr.size[i] == 1) + { + dims --; + continue; + } + else + break; + } + + for (i = 0; i < dims - 1; i++) + { + if (stride[i] != 1 || start[i] != 0 || stop[i] != (int32_t)inputs[0]->attr.size[i]) + return FALSE; + } + + if (stride[i] != 1) + return FALSE; + + return TRUE; +} + +static vsi_bool _is_same_quant + ( + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_dtype_t *src_dtype = NULL,*dst_dtype = NULL; + + src_dtype = &inputs[0]->attr.dtype; + dst_dtype = &outputs[0]->attr.dtype; + + if(vsi_nn_DtypeCompare(src_dtype, dst_dtype) == FALSE) + { + return FALSE; + } + + return TRUE; +} /* _is_same_quant */ + +static vsi_status copy_tensor_to_view + ( + vsi_nn_node_t * self, + vx_tensor src_tensor, + vsi_nn_tensor_t * dst_in + ) +{ + vsi_status ret; + vsi_nn_strided_slice_lcl_data2 * data = NULL; + + ret = VSI_SUCCESS; + /* Malloc ptr */ + data = self->nn_param.strided_slice.lcl2_data; + data->src_tensor = src_tensor; + if (dst_in->t) + data->dst_tensor = vxReshapeTensor(dst_in->t, (int32_t*)dst_in->attr.size, dst_in->attr.dim_num); + data->is_dataconvert_op = TRUE; + + return ret; +} /* copy_tensor_to_view() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_nn_stride_slice_params_t param; + vsi_nn_tensor_t *begin_dims_tensor = NULL; + vsi_nn_tensor_t *end_dims_tensor = NULL; + vsi_nn_tensor_t *stride_dims_tensor = NULL; + vsi_nn_tensor_t *output_tensor = NULL; + vsi_nn_tensor_attr_t attr; + int32_t *start_dims = NULL; + int32_t *stop_dims = NULL; + int32_t *stride_dims = NULL; + vsi_nn_strided_slice_lcl_data2 * p = self->nn_param.strided_slice.lcl2_data; + + start_dims = p->begin_dims; + stop_dims = p->end_dims; + stride_dims = p->stride_dims; + + if (TRUE == p->is_optimized) + { + vx_tensor dst_tensor = NULL; + + if (p->is_dataconvert_op) + { + dst_tensor = p->dst_tensor ? p->dst_tensor : outputs[0]->t; + p->cp_node = vxTensorCopyNode(self->graph->g, + p->src_tensor, dst_tensor ); + if( NULL == p->cp_node ) + { + VSILOGE( "Create vxTensorCopyNode fail." 
); + status = VSI_FAILURE; + } + } + } + else + { + uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t dims = inputs[0]->attr.dim_num; + int32_t shrink_axis_mask = self->nn_param.strided_slice.shrink_axis_mask; + + memset(¶m, 0, sizeof(vx_nn_stride_slice_params_t)); + + memset(&attr, 0, sizeof(attr)); + attr.size[0] = self->nn_param.strided_slice.begin_dims_num; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + begin_dims_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)start_dims, + &attr); + if( NULL == begin_dims_tensor ) + { + VSILOGE("Create begin_dims_tensor fail.(strided_slice)"); + return VSI_FAILURE; + } + + self->nn_param.strided_slice.local.begin_dims_tensor = begin_dims_tensor; + param.begin_dims = REQUIRED_IO(begin_dims_tensor); + + memset(&attr, 0, sizeof(attr)); + attr.size[0] = self->nn_param.strided_slice.end_dims_num; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + end_dims_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)stop_dims, + &attr); + if( NULL == end_dims_tensor ) + { + VSILOGE("Create end_dims_tensor fail.(strided_slice)"); + return VSI_FAILURE; + } + + self->nn_param.strided_slice.local.end_dims_tensor = end_dims_tensor; + param.end_dims = REQUIRED_IO(end_dims_tensor); + + memset(&attr, 0, sizeof(attr)); + attr.size[0] = self->nn_param.strided_slice.stride_dims_num; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + stride_dims_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)stride_dims, + &attr); + if( NULL == stride_dims_tensor ) + { + VSILOGE("Create stride_dims_tensor fail.(strided_slice)"); + return VSI_FAILURE; + } + + self->nn_param.strided_slice.local.stride_dims_tensor = stride_dims_tensor; + param.stride_dims = REQUIRED_IO(stride_dims_tensor); + + param.begin_mask = p->begin_mask; + param.end_mask = p->end_mask; + param.shrink_axis_mask = p->shrink_axis_mask; + + /* reshpae output tensor to keep output rank is the same as input's */ + memset(&sizes, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + memcpy(&sizes, &outputs[0]->attr.size, sizeof(int32_t) * outputs[0]->attr.dim_num); + + if (shrink_axis_mask && p->shrink_axis_mask == 0) + { + uint32_t i = 0; + uint32_t j = 0; + + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + if (shrink_axis_mask & (1 << i)) + { + sizes[i] = 1; + } + else + { + sizes[i] = outputs[0]->attr.size[j ++]; + } + } + } + + output_tensor = vsi_nn_reshape_tensor(self->graph, outputs[0], sizes, dims); + if( NULL == output_tensor ) + { + VSILOGE("Create output_tensor fail.(strided_slice)"); + return VSI_FAILURE; + } + + self->n = vxTensorStrideSliceNode( + self->graph->g, + inputs[0]->t, + ¶m, + sizeof(vx_nn_stride_slice_params_t), + output_tensor->t + ); + + if (output_tensor) + { + vsi_nn_ReleaseTensor(&output_tensor); + } + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(STRIDED_SLICE, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + 
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8) + IO_TYPE(D_F16, D_I16) + IO_TYPE(D_F16, D_U8) + IO_TYPE(D_I8, D_F16) + IO_TYPE(D_I16, D_F16) + IO_TYPE(D_U8, D_F16) + IO_TYPE(D_I8, D_I8) + IO_TYPE(D_I16, D_I16) + IO_TYPE(D_U8, D_U8) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_I32, D_I32) + + /* HW 9.0 */ + IO_TYPE(D_BF16, D_BF16) + END_IO_TYPE_DECL(STRIDED_SLICE) + if(!VALIDATE_OP_IO_TYPES(STRIDED_SLICE, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if(self->nn_param.strided_slice.begin_dims_num == 0) + { + self->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; + self->nn_param.strided_slice.end_dims_num = inputs[0]->attr.dim_num; + self->nn_param.strided_slice.stride_dims_num = inputs[0]->attr.dim_num; + } + /* TODO: Add code to comput outputs' shape. */ + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + vsi_nn_strided_slice_param *p = &(self->nn_param.strided_slice); + vx_uint32 i; + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + vx_int32 begin = 0, end = 1, stride = 1; + vx_int32 input_size = inputs[0]->attr.size[i]; + vx_int32 output_size = 0; + vx_int32 j; + + begin = get_slice_axis_value(p->begin_dims[i], input_size); + end = get_slice_axis_value(p->end_dims[i], input_size); + stride = p->stride_dims[i]; + if (p->begin_mask & (1 << i)) + { + begin = get_slice_mask_start_value(stride, input_size); + } + begin = vsi_nn_clamp(begin, 0, (vx_int32)(input_size - 1)); + if (p->shrink_axis_mask & (1 << i)) + { + end = begin + 1; + } + + if (p->end_mask & (1 << i)) + { + end = get_slice_mask_stop_value(stride, input_size); + } + end = get_slice_clamp_stop(stride, end, input_size); + for (j = begin; !((stride > 0) ? 
(j >= end) : (j <= end)); j += stride) + { + output_size++; + } + outputs[0]->attr.size[i] = output_size; + } + outputs[0]->attr.dim_num = 0; + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + if (p->shrink_axis_mask & (1 << i)) continue; + outputs[0]->attr.size[outputs[0]-> + attr.dim_num] = outputs[0]->attr.size[i]; + outputs[0]->attr.dim_num++; + } + } + + _get_stride_slice_start_stop_stride(self, inputs, outputs); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_status status = VSI_SUCCESS; + int32_t i = 0; + vx_tensor in_view_tensor = NULL; + vsi_nn_strided_slice_param *p = &(self->nn_param.strided_slice); + uint32_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t *start_dims = p->lcl2_data->begin_dims; + int32_t *stop_dims = p->lcl2_data->end_dims; + int32_t *stride_dims = p->lcl2_data->stride_dims; + vsi_bool is_same_quant_type = FALSE; + + /* Only forward run stride_slice's optimize */ + if( direction == VSI_NN_OPTIMIZE_BACKWARD ) + { + return status; + } + + if (_check_is_same_shape(inputs, start_dims, stop_dims, stride_dims) == FALSE) + return status; + + VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + + if( NULL == inputs[0]->t ) + { + vsi_nn_TensorReinit( self->graph, inputs[0] ); + } + + /* Create tensor from view */ + memcpy( start, (uint32_t*)start_dims, sizeof( uint32_t ) * VSI_NN_MAX_DIM_NUM ); + memcpy( end, (uint32_t*)stop_dims, sizeof( uint32_t ) * VSI_NN_MAX_DIM_NUM ); + in_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, inputs[0]); + if( NULL == in_view_tensor ) + { + VSILOGE( "Create tensor %d from view fail.", i ); + status = VSI_FAILURE; + goto OnError; + } + + self->nn_param.strided_slice.lcl2_data->is_optimized = TRUE; + + is_same_quant_type = _is_same_quant(inputs, outputs); + if( NULL != outputs[0]->t || is_same_quant_type == FALSE) + { + VSILOGW( "stride slice copy tensor."); + // Copy old tensor values to the new address. 
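+            // This branch is taken when the output already has a backing vx tensor or the
+            // input/output quantization differs: copy_tensor_to_view() records the view as
+            // the source of a tensor-copy (data-convert) node instead of aliasing
+            // outputs[0]->t to the view as the else branch does.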
+ status = copy_tensor_to_view( self, in_view_tensor, outputs[0]); + if( VSI_FAILURE == status ) + { + goto OnError; + } + } + else + { + outputs[0]->t = in_view_tensor; + } + +OnError: + return status; +} /* op_optimize() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_strided_slice_lcl_data2 * lcl2_data; + + if(NULL == self) + { + return VSI_FAILURE; + } + + lcl2_data = self->nn_param.strided_slice.lcl2_data; + if(self->n) + { + if( NULL != self && NULL != self->n ) + { + vxReleaseNode( &self->n ); + self->n = NULL; + } + } + + if (lcl2_data->cp_node) + { + vxReleaseNode( &lcl2_data->cp_node ); + } + + if (lcl2_data->src_tensor) + { + vxReleaseTensor( &lcl2_data->src_tensor ); + } + + if (lcl2_data->dst_tensor) + { + vxReleaseTensor( &lcl2_data->dst_tensor ); + } + + if (lcl2_data->begin_dims) + { + free(lcl2_data->begin_dims); + } + + if (lcl2_data->end_dims) + { + free(lcl2_data->end_dims); + } + + if (lcl2_data->stride_dims) + { + free(lcl2_data->stride_dims); + } + + if (lcl2_data) + { + free( lcl2_data ); + } + + if (self->nn_param.strided_slice.local.begin_dims_tensor != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.strided_slice.local.begin_dims_tensor)); + } + if (self->nn_param.strided_slice.local.end_dims_tensor != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.strided_slice.local.end_dims_tensor)); + } + if (self->nn_param.strided_slice.local.stride_dims_tensor != NULL) + { + vsi_nn_ReleaseTensor(&(self->nn_param.strided_slice.local.stride_dims_tensor)); + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.strided_slice.lcl2_data = + (vsi_nn_strided_slice_lcl_data2 *)malloc(sizeof(vsi_nn_strided_slice_lcl_data2)); + if (NULL == self->nn_param.strided_slice.lcl2_data) + { + return VX_ERROR_NO_MEMORY; + } + + memset( self->nn_param.strided_slice.lcl2_data, 0, sizeof(vsi_nn_strided_slice_lcl_data2) ); + + self->nn_param.strided_slice.lcl2_data->begin_dims = + (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + if (NULL == self->nn_param.strided_slice.lcl2_data->begin_dims) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.strided_slice.lcl2_data->begin_dims, 0, + sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + + self->nn_param.strided_slice.lcl2_data->end_dims = + (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + if (NULL == self->nn_param.strided_slice.lcl2_data->end_dims) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.strided_slice.lcl2_data->end_dims, 0, + sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + + self->nn_param.strided_slice.lcl2_data->stride_dims = + (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + if (NULL == self->nn_param.strided_slice.lcl2_data->stride_dims) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.strided_slice.lcl2_data->stride_dims, 0, + sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ STRIDED_SLICE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c new file mode 100644 index 0000000..e55d1f8 --- /dev/null +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c
@@ -0,0 +1,263 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#include "vsi_nn_types.h"
+#include "vsi_nn_platform.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_ops.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_tensor_util.h"
+#include "vsi_nn_prv.h"
+#include "utils/vsi_nn_constraint_check.h"
+
+static vsi_status _create_local_tensor
+    (
+    vsi_nn_node_t * self
+    )
+{
+    vsi_nn_tensor_t *rank_tensor = NULL;
+    vsi_nn_tensor_t *act_tensor = NULL;
+
+    /* activation must set to 0, so the sdk will call VX_NN_ACTIVATION_NONE */
+    int32_t activation = 0;
+
+    if(NULL == self)
+    {
+        goto error;
+    }
+
+    act_tensor = vsi_nn_VariableToTensor(self,
+        (uint8_t *)&activation,
+        VSI_NN_TYPE_INT32);
+    if(NULL == act_tensor)
+    {
+        goto error;
+    }
+
+    rank_tensor = vsi_nn_VariableToTensor(self,
+        (uint8_t *)&self->nn_param.svdf.rank,
+        VSI_NN_TYPE_INT32);
+    if(NULL == rank_tensor)
+    {
+        goto error;
+    }
+
+    self->nn_param.svdf.local.act_tensor = act_tensor;
+    self->nn_param.svdf.local.rank_tensor = rank_tensor;
+    return VSI_SUCCESS;
+error:
+    if(rank_tensor)vsi_nn_ReleaseTensor(&rank_tensor);
+    if(act_tensor)vsi_nn_ReleaseTensor(&act_tensor);
+    return VSI_FAILURE;
+} /* _create_local_tensor() */
+
+static vsi_status _init_svdf_param
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vx_nn_svdf_params_t *param
+    )
+{
+    vsi_nn_svdf_param *p = &self->nn_param.svdf;
+
+    param->state_in = REQUIRED_IO(inputs[1]);
+    param->weights_feature = REQUIRED_IO(inputs[2]);
+    param->recurrent_time = REQUIRED_IO(inputs[3]);
+    param->bias = OPTIONAL_IO(inputs[4]);
+    param->activation = REQUIRED_IO(p->local.act_tensor);
+    param->rank = REQUIRED_IO(p->local.rank_tensor);
+
+    return VSI_SUCCESS;
+} /* _init_svdf_param() */
+
+static vsi_status op_compute
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_status status;
+    vx_nn_svdf_params_t param;
+    vsi_nn_tensor_t * bias_tensor = NULL;
+
+    status = VSI_FAILURE;
+    memset(&param, 0, sizeof(param));
+
+    status = _create_local_tensor(self);
+    if(VSI_SUCCESS != status)
+    {
+        return status;
+    }
+
+    status = _init_svdf_param(self, inputs, &param);
+    if(VSI_SUCCESS != status)
+    {
+        return status;
+    }
+
+    if (param.bias ==
NULL) + { + vsi_nn_tensor_attr_t attr; + int32_t count = inputs[2]->attr.size[1]; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + attr.size[0] = count; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + bias_tensor = vsi_nn_CreateTensor(self->graph, &attr); + param.bias = bias_tensor->t; + } + + self->n = vxSVDFLayer( + self->graph->g, + REQUIRED_IO(inputs[0]), + ¶m, + sizeof(param), + REQUIRED_IO(outputs[1]), /* state out */ + REQUIRED_IO(outputs[0]) + ); + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + + if (bias_tensor != NULL) vsi_nn_ReleaseTensor(&bias_tensor); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + if(2 != inputs[0]->attr.dim_num) + { + VSILOGE("SVDF input dimension should be 2"); + ret = FALSE; + } + + { + BEGIN_IO_TYPE_DECL(SVDF, 5, 2) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F32, D_F16, D_F16, D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) + END_IO_TYPE_DECL(SVDF) + if(!VALIDATE_OP_IO_TYPES(SVDF, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + } + + return ret; +} /* op_check() */ + +/* + * input[0]: input + * input[1]: state_in (variable) + * input[2]: weights_feature + * input[3]: weights_time + * input[4]: bias + * output[0]: output + * output[1]: state_out + */ +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_svdf_param *p; + + p = (vsi_nn_svdf_param *)&self->nn_param.svdf; + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = p->num_units; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; + } + if(VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num) + { + outputs[1]->attr.dim_num = inputs[1]->attr.dim_num; + outputs[1]->attr.size[0] = inputs[1]->attr.size[0]; + outputs[1]->attr.size[1] = inputs[1]->attr.size[1]; + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_tensor_t *act_tensor, *rank_tensor; + + if(NULL == self) + { + return VSI_FAILURE; + } + + act_tensor = self->nn_param.svdf.local.act_tensor; + rank_tensor = self->nn_param.svdf.local.rank_tensor; + if(NULL != self->n) + { + if(act_tensor)vsi_nn_ReleaseTensor(&act_tensor); + if(rank_tensor)vsi_nn_ReleaseTensor(&rank_tensor); + vxReleaseNode( &self->n ); + self->n = NULL; + } + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SVDF, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 5, + /* output_num */ 2 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_swish.c b/src/tim/vx/internal/src/ops/vsi_nn_op_swish.c new file mode 100644 index 0000000..7900d88 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_swish.c @@ -0,0 +1,156 @@ +/**************************************************************************** +* +* 
Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + // TODO: + // Create kernel param + vsi_nn_kernel_param_t * param; + vsi_nn_kernel_node_t n; + param =vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "type", self->nn_param.swish.type ); + vsi_nn_kernel_param_add_float32( param, "beta", self->nn_param.swish.beta ); + n = vsi_nn_kernel_selector( self->graph, "swish", inputs, 1, outputs, 1, param ); + if( n == NULL ) + { + vsi_nn_kernel_param_release( ¶m ); + status = VSI_FAILURE; + } + self->n = (vx_node)n; + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(SWISH, 1, 1) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(SWISH) + if(!VALIDATE_OP_IO_TYPES(SWISH, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + + if( NULL == self ) + { + return FALSE; + } + + ret = vsi_nn_op_common_setup(self, inputs, outputs); + + return ret; +} /* op_setup() */ + 
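For reference, the computation that the "swish" kernel selected in op_compute() is expected to perform can be sketched in plain C as below. This is an illustrative sketch only: the helper names are invented here, and it assumes the conventional definitions of swish and hard-swish (with nn_param.swish.type presumably selecting between them); it is not the kernel source.

    #include <math.h>

    /* swish(x) = x * sigmoid(beta * x); beta comes from nn_param.swish.beta (1.0f by default). */
    static float swish_ref(float x, float beta)
    {
        return x / (1.0f + expf(-beta * x));
    }

    /* hard-swish(x) = x * relu6(x + 3) / 6, the piecewise approximation commonly used
     * when a "hard" swish type is requested. */
    static float hswish_ref(float x)
    {
        float r = x + 3.0f;
        r = r < 0.0f ? 0.0f : (r > 6.0f ? 6.0f : r);
        return x * r / 6.0f;
    }
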
+static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.swish.beta = 1.0f; + + return status; +} /* op_init() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SWISH, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c new file mode 100644 index 0000000..cc46ab0 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c @@ -0,0 +1,173 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_pub.h" +#include "client/vsi_nn_vxkernel.h" + +#define _ARG_NUM (0) +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _VX_KERNEL_VAR (vx_client_kernel_SYNC_HOST) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) +#define _VSI_PARAM (vsi_nn_client_sync_host_param) + +extern vx_kernel_description_t * vx_kernel_SYNC_HOST_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Pass parameters to node. 
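       SYNC_HOST has no scalar kernel arguments (_ARG_NUM is 0), so the list handed over
       here is just the input tensor reference followed by the output tensor reference.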
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + NULL +}; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_kernel_info_t kernel_info; + char *path = NULL; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + status = VSI_FAILURE; + kernel_info.type = VX_KERNEL_TYPE_CPU; + kernel_info.kernel = vx_kernel_SYNC_HOST_list; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "sync_host"; + path = getenv("USER_VX_SOURCE_PATH"); + if(path) + vsi_nn_VxResourceSetPath(path); + + if( kernel_info.type == VX_KERNEL_TYPE_VX) + { + kernel_info.kernel_index = 1; + kernel_info.init_index = 1; + } + else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ + { + kernel_info.kernel_index = 0; + kernel_info.init_index = 0; + } + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) + { + free(kernel_info.resource_name); + } + if( NULL == self->n ) + { + return VSI_FAILURE; + } + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SYNC_HOST, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tanh.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tanh.c new file mode 100644 index 0000000..850a2d1 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tanh.c @@ -0,0 +1,96 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" + + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + status = VSI_FAILURE; + + self->n = vxActivationLayer( + self->graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HYPERBOLIC_TAN, + self->nn_param.tanh.scale_a, + self->nn_param.tanh.scale_b, + outputs[0]->t + ); + + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_RSQRT, self, inputs, outputs); + + return ret; +} /* op_check() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ TANH, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c new file mode 100644 index 0000000..0906eab --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c @@ -0,0 +1,146 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_tensor_add_mean_stddev_norm_param * p = NULL; + float eps; + + p = &(self->nn_param.tensor_add_mean_stddev_norm); + param = vsi_nn_kernel_param_create(); + eps = p->eps; + vsi_nn_kernel_param_add_float32( param, "eps", eps ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "add_mean_std_norm", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, param ); + + vsi_nn_kernel_param_release( ¶m ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; + +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(TENSOR_ADD_MEAN_STDDEV_NORM, 2, 1) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F32) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F32) + END_IO_TYPE_DECL(TENSOR_ADD_MEAN_STDDEV_NORM) + if(!VALIDATE_OP_IO_TYPES(TENSOR_ADD_MEAN_STDDEV_NORM, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; + +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
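       The normalized result keeps the input's shape, so when the output rank is still
       VSI_NN_DIM_AUTO the code below copies dim_num and the sizes straight from inputs[0].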
*/ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ TENSOR_ADD_MEAN_STDDEV_NORM, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c new file mode 100644 index 0000000..4283e40 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c @@ -0,0 +1,421 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (0) +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +extern vx_kernel_description_t * vx_kernel_TENSORSTACKCONCAT_list[]; + +static vsi_bool _reshape_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i = 0; + uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t axis = 0; + vsi_nn_tensorstackconcat_param * p = NULL; + uint32_t before_size = 1; + uint32_t after_size = 1; + uint32_t * input_sizes = inputs[0]->attr.size; + uint32_t dims = inputs[0]->attr.dim_num; + uint32_t * output_sizes = outputs[0]->attr.size; + uint32_t new_dims = 0; + + p = &(self->nn_param.tensorstackconcat); + axis = p->axis; + + for ( i = 0; i < (uint32_t)axis; i++) + { + before_size *= input_sizes[i]; + } + for ( i = axis + 1; i < dims; i++) + { + after_size *= input_sizes[i]; + } + sizes[0] = before_size; + sizes[1] = input_sizes[axis]; + sizes[2] = after_size; + new_dims = 3; + p->local->local_tensor[0] = vxReshapeTensor(inputs[0]->t, (int32_t *)sizes, new_dims); + + sizes[0] = 1; + sizes[1] = 1; + new_dims = 2; + p->local->local_tensor[1] = vxReshapeTensor(inputs[1]->t, (int32_t *)sizes, new_dims); + + before_size = 1; + after_size = 1; + for ( i = 0; i < (uint32_t)axis; i++) + { + before_size *= output_sizes[i]; + } + for ( i = axis + 1; i < dims; i++) + { + after_size *= output_sizes[i]; + } + sizes[0] = before_size; + sizes[1] = output_sizes[axis]; + sizes[2] = after_size; + new_dims = 3; + p->local->local_tensor[2] = vxReshapeTensor(outputs[0]->t, (int32_t *)sizes, new_dims); + + p->axis = 1; + return TRUE; +} + +static void _set_inputs_outputs + ( + vsi_nn_node_t * self, + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_tensorstackconcat_param *p = NULL; + uint32_t i = 0; + + p = &(self->nn_param.tensorstackconcat); + + for (i = 0; i < _IO_NUM; i++) + { + params[i] = (vx_reference)(p->local->local_tensor[i]); + } +} /* _set_inputs_outputs() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( self, params, inputs, outputs ); + + /* Pass parameters to node. 
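       Note that the references bound here are the local_tensor[] views created in
       _reshape_tensor() (the 3-D reshapes of inputs[0] and outputs[0] plus the 2-D
       reshape of inputs[1]), not the original tensors.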
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + //_release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status vx_op_pre_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_kernel_info_t * kernel_info + ) +{ + vsi_nn_type_e inputDataFormat = inputs[0]->attr.dtype.vx_type; + vsi_nn_type_e outputDataFormat = outputs[0]->attr.dtype.vx_type; + int8_t inputFixedPointPos = inputs[0]->attr.dtype.fl; + int8_t outputFixedPointPos = outputs[0]->attr.dtype.fl; + int32_t inputZeroPoint = inputs[0]->attr.dtype.zero_point; + int32_t outputZeroPoint = outputs[0]->attr.dtype.zero_point; + vx_float32 inputScale = inputs[0]->attr.dtype.scale; + vx_float32 outputScale = outputs[0]->attr.dtype.scale; + vsi_bool is16Bits = FALSE; + vsi_bool is8Bits = FALSE; + + is16Bits = ((inputDataFormat == VSI_NN_TYPE_FLOAT16 && outputDataFormat == VSI_NN_TYPE_FLOAT16) + || (inputDataFormat == VSI_NN_TYPE_INT16 && outputDataFormat == VSI_NN_TYPE_INT16 + && inputFixedPointPos == outputFixedPointPos)) ? TRUE : FALSE; + is8Bits = ((inputDataFormat == VSI_NN_TYPE_INT8 && outputDataFormat == VSI_NN_TYPE_INT8 + && inputFixedPointPos == outputFixedPointPos) + || (inputDataFormat == VSI_NN_TYPE_UINT8 && outputDataFormat == VSI_NN_TYPE_UINT8 + && inputZeroPoint == outputZeroPoint && inputScale == outputScale)) ? TRUE : FALSE; + + if (is16Bits) + { + kernel_info->kernel_index = 1; + } + else if (is8Bits) + { + kernel_info->kernel_index = 2; + } + else + { + VSILOGE("Not support input or output data format!(TENSORSTACKCONCAT) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + + return VSI_SUCCESS; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_border_t border; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( self, params, inputs, outputs ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + border.mode = VX_BORDER_REPLICATE; + border.constant_value.U32 = 0; + status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute, + NULL +}; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_info_t kernel_info; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + + /* reshape input/output */ + _reshape_tensor( self, inputs, outputs); + + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_tensorstackconcat"; + kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); + kernel_info.kernel = vx_kernel_TENSORSTACKCONCAT_list; + kernel_info.init_index = 1; + + if (vsi_nn_is_do_vx_op_pre_init(kernel_info.type)) + { + vx_op_pre_compute(self, inputs, outputs, &kernel_info); + } + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) free(kernel_info.resource_name); + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } + return status; +} /* op_compute() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + vsi_nn_tensorstackconcat_param *p = NULL; + vsi_nn_stackconcat_lcl_data *local = NULL; + int32_t axis = 0; + + if( NULL == self ) + { + return ret; + } + + p = &(self->nn_param.tensorstackconcat); + axis = p->axis; + local = (vsi_nn_stackconcat_lcl_data *)malloc(sizeof(vsi_nn_stackconcat_lcl_data)); + if (NULL == local) + { + return ret; + } + memset(local, 0, sizeof(vsi_nn_stackconcat_lcl_data)); + p->local = local; + + if (axis < 0) + { + axis = axis + inputs[0]->attr.dim_num; + p->axis = axis; + } + + return TRUE; +} /* op_setup() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_tensorstackconcat_param *p = NULL; + int32_t axis = 0; + int32_t dims = (int32_t)inputs[0]->attr.dim_num; + int32_t out_dims = (int32_t)outputs[0]->attr.dim_num; + + p = &(self->nn_param.tensorstackconcat); + axis = p->axis; + + if (axis < 0) + { + axis = axis + dims; + } + + if (axis > (dims - 1)) + { + VSILOGE("Invalid Axis: %d, (TENSORSTACKCONCAT) at [%s : %d]\n", axis, __FILE__, __LINE__); + return FALSE; + } + if( VSI_NN_DIM_AUTO == out_dims ) + { + VSILOGE("Invalid output, (TENSORSTACKCONCAT) at [%s : %d]\n", __FILE__, __LINE__); + return FALSE; + } + if( dims != out_dims ) + { + VSILOGE("Input and output's dims not matched, (TENSORSTACKCONCAT) at [%s : %d]\n", __FILE__, __LINE__); + return FALSE; + } + + { + BEGIN_IO_TYPE_DECL(TENSORSTACKCONCAT, 2, 1) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + END_IO_TYPE_DECL(TENSORSTACKCONCAT) + if(!VALIDATE_OP_IO_TYPES(TENSORSTACKCONCAT, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs 
data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.tensorstackconcat.axis = 1; + + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_tensorstackconcat_param *p = &(self->nn_param.tensorstackconcat); + uint32_t i = 0; + if (p->local) + { + for (i = 0; i < _VSI_NN_STACKCONCAT_LOCAL_TENSOR_NUM; i++) + { + if (p->local->local_tensor[i]) + { + vxReleaseTensor(&(p->local->local_tensor[i])); + p->local->local_tensor[i] = NULL; + } + } + vsi_nn_safe_free(p->local); + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ TENSORSTACKCONCAT, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 2, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c new file mode 100644 index 0000000..b2c13dd --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c @@ -0,0 +1,163 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" + +/* + Declare number of input and output. 
+ */ + +static vsi_status _tile_op_compute + ( + const char * kernel_name, + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, + &inputs[0], 1, + &outputs[0], 1, NULL ); + + + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* _tile_op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + vsi_nn_tile_param * p; + + BEGIN_IO_TYPE_DECL(TILE, 1, 1) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_U32, D_U32) + IO_TYPE(D_F32, D_F32) + END_IO_TYPE_DECL(TILE) + if(!VALIDATE_OP_IO_TYPES(TILE, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + p = &(self->nn_param.tile); + + if (inputs[0]->attr.dim_num != p->multiples_num) + { + VSILOGE("multiples_num MUST match the dims of input tensor!"); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. */ + uint32_t i; + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + vsi_nn_tile_param * p; + + p = &(self->nn_param.tile); + if (inputs[0]->attr.dim_num != p->multiples_num) + { + VSILOGE("multiples_num MUST match the dims of input tensor!"); + return FALSE; + } + + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i] * p->multiples[i]; + } + } + + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define DEF_TILE_OP(name, kernel_name) \ + static vsi_status op_compute_##kernel_name \ + ( \ + vsi_nn_node_t * self, \ + vsi_nn_tensor_t ** inputs, \ + vsi_nn_tensor_t ** outputs \ + ) \ + { \ + return _tile_op_compute( ""#kernel_name, self, inputs, outputs ); \ + } \ +DEF_OP_REG(name, NULL, op_compute_##kernel_name, vsi_nn_op_common_deinit, op_check, op_setup, NULL, 1, 1) + +DEF_TILE_OP( TILE, tile ); + +#undef DEF_TILE_OP + +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c new file mode 100644 index 0000000..56d056d --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c @@ -0,0 +1,300 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" + +#define _ARG_NUM (1) +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (2) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +extern vx_kernel_description_t * vx_kernel_TOPK_list[]; + +static void _set_inputs_outputs + ( + vx_reference * params, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + uint32_t cnt; + + /* Set inputs */ + cnt = 0; + for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)inputs[i]->t; + } + + /* Set outputs */ + for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) + { + params[cnt] = (vx_reference)outputs[i]->t; + } +} /* _set_inputs_outputs() */ + +static vsi_status _create_params + ( + vsi_nn_node_t * node, + vx_reference * params, + uint32_t num + ) +{ + vsi_status status; + vx_context ctx; + vsi_nn_topk_param * p; + if( 0 == num ) + { + return VSI_SUCCESS; + } + memset( params, 0, sizeof( vx_reference * ) * num ); + p = &(node->nn_param.topk); + ctx = vxGetContext( (vx_reference)node->graph->g ); + /* Init parameters */ + #define _SET_PARAM( i, type, arg ) do{ \ + params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ + status = vxGetStatus( params[i] ); \ + if( VSI_SUCCESS != status ) { \ + goto set_param_error; \ + } \ + } while(0) + _SET_PARAM( 0, VX_TYPE_UINT32, k ); + #undef _SET_PARAM +set_param_error: + + return status; +} /* _create_params */ + +static void _release_params + ( + vx_reference * params, + uint32_t num + ) +{ + uint32_t i; + vx_scalar scalar; + for( i = 0; i < num; i ++ ) + { + scalar = (vx_scalar)params[i]; + vxReleaseScalar( &scalar ); + } +} /* _release_params() */ + +static vsi_status cpu_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. 
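       The scalar created by _create_params() (the top-k count k) is placed at
       params[_IO_NUM], after the input tensor and the two output tensors, giving
       _PARAM_NUM = 4 references in total.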
*/ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_status vx_op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_reference params[_PARAM_NUM]; + vx_reference * args; + + args = ¶ms[_IO_NUM]; + + if( NULL == self->n ) + { + return VSI_FAILURE; + } + + /* Set inputs and outputs */ + _set_inputs_outputs( params, inputs, outputs ); + /*TODO: Add code if need to change your parameter*/ + + /* Init parameters. */ + _create_params( self, args, _ARG_NUM ); + + /* Pass parameters to node. */ + status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); + + _release_params( args, _ARG_NUM ); + + return status; +} + +static vsi_nn_op_compute_t op_compute_list[] = +{ + cpu_op_compute, + vx_op_compute, + NULL +}; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + vsi_nn_kernel_info_t kernel_info; + char *path = NULL; + + memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); + status = VSI_FAILURE; + kernel_info.type = VX_KERNEL_TYPE_CPU; + kernel_info.kernel = vx_kernel_TOPK_list; + kernel_info.resource_num = 1; + kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); + kernel_info.resource_name[0] = "vsi_nn_kernel_topk"; + path = getenv("USER_VX_SOURCE_PATH"); + if(path) + vsi_nn_VxResourceSetPath(path); + + if( kernel_info.type == VX_KERNEL_TYPE_VX) + { + kernel_info.kernel_index = 1; + kernel_info.init_index = 1; + } + else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ + { + kernel_info.kernel_index = 0; + kernel_info.init_index = 0; + } + + self->n = vsi_nn_RegisterClientKernelAndNewNode( + self->graph, &kernel_info); + if (kernel_info.resource_name) + { + free(kernel_info.resource_name); + } + if( NULL == self->n ) + { + return VSI_FAILURE; + } + if (NULL != op_compute_list[kernel_info.init_index]) + { + status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
*/ + uint32_t i; + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + vsi_nn_topk_param * p; + + p = &(self->nn_param.topk); + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = p->k; + for (i = 1; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + outputs[1]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[1]->attr.size[0] = p->k; + for (i = 1; i < inputs[0]->attr.dim_num; i++) + { + outputs[1]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ TOPK, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c new file mode 100644 index 0000000..febe9e3 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c @@ -0,0 +1,302 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
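For the TOPK `op_setup` above, both outputs (values and indices) keep the input rank, the innermost dimension becomes `k`, and the remaining dimensions are copied from the input. A small standalone sketch of that rule (names are illustrative only):

```c
#include <stdio.h>
#include <stdint.h>

#define MAX_DIMS 6u /* arbitrary bound for this sketch */

/* TOPK shape rule: size[0] = k, size[i] = in_size[i] for i > 0,
 * applied identically to the values output and the indices output. */
static void topk_infer_shape(const uint32_t *in_size, uint32_t dim_num,
                             uint32_t k, uint32_t *out_size)
{
    uint32_t i;
    out_size[0] = k;
    for (i = 1; i < dim_num; i++)
    {
        out_size[i] = in_size[i];
    }
}

int main(void)
{
    uint32_t in[3] = {1000, 8, 2};
    uint32_t values[MAX_DIMS];
    uint32_t indices[MAX_DIMS];
    topk_infer_shape(in, 3, 5, values);
    topk_infer_shape(in, 3, 5, indices);
    printf("values:  %u x %u x %u\n", values[0], values[1], values[2]);
    printf("indices: %u x %u x %u\n", indices[0], indices[1], indices[2]);
    return 0;
}
```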
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "client/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_rnn_helper.h" + +static vsi_bool setup_op_shapes + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_unidirectional_sequence_rnn_param* curr_param = + &self->nn_param.unidirectional_sequence_rnn; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + uint32_t num_units = 0; + uint32_t output_size = 0; + uint32_t batch_size = 0; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + if( curr_param->time_major ) + { + batch_size = inputs[RNN_INPUT_INPUT]->attr.size[1]; + } + else + { + batch_size = inputs[RNN_INPUT_INPUT]->attr.size[2]; + } + + num_units = inputs[RNN_INPUT_WEIGHT_I]->attr.size[1]; + output_size = num_units; + + /* create h_state if app doesn't provide them */ + if( !inputs[RNN_INPUT_H_STATE] ) + { + attr.dim_num = 2; + attr.size[1] = batch_size; + attr.size[0] = output_size; + memcpy(&attr.dtype, &outputs[RNN_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype )); + attr.vtl = FALSE; + attr.is_const = TRUE; + + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + inputs[RNN_INPUT_H_STATE] = output_tensor->t; + } + + /* output */ + if( VSI_NN_DIM_AUTO == outputs[RNN_OUTPUT_OUTPUT]->attr.dim_num ) + { + outputs[RNN_OUTPUT_OUTPUT]->attr.size[0] = output_size; + outputs[RNN_OUTPUT_OUTPUT]->attr.size[1] = inputs[RNN_INPUT_INPUT]->attr.size[1]; + outputs[RNN_OUTPUT_OUTPUT]->attr.size[2] = inputs[RNN_INPUT_INPUT]->attr.size[2]; + outputs[RNN_OUTPUT_OUTPUT]->attr.dim_num = 3; + } + + return TRUE; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_unidirectional_sequence_rnn_param* curr_param = + &self->nn_param.unidirectional_sequence_rnn; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_nn_tensor_t** split_output_tensors = NULL; + vsi_nn_tensor_t** rnncell_reshape_output_tensors =NULL; + vsi_nn_tensor_t* last_step_h_state = NULL; + vsi_nn_tensor_t* tensor = NULL; + vsi_nn_tensor_t* input_tensor = NULL; + vsi_bool use_virtual_tensor = TRUE; + uint32_t batch_size = 0; + uint32_t time_step = 0; + uint32_t i = 0; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + vsi_nn_internal_init_node_wksp( self ); + + if( curr_param->time_major ) + { + batch_size = inputs[RNN_INPUT_INPUT]->attr.size[1]; + time_step = inputs[RNN_INPUT_INPUT]->attr.size[2]; + } + else + { + batch_size = inputs[RNN_INPUT_INPUT]->attr.size[2]; + time_step = inputs[RNN_INPUT_INPUT]->attr.size[1]; + } + + setup_op_shapes( self, inputs, outputs); + + /* default to input */ + input_tensor = inputs[RNN_INPUT_INPUT]; + if( !curr_param->time_major ) + { + /* transpose to time_major */ + output_tensor = vsi_nn_rnn_transpose_time_major(self, + inputs[RNN_INPUT_INPUT], NULL, use_virtual_tensor); + input_tensor = output_tensor->t; + } + + /* split input tensor */ + split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + rnncell_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * + sizeof(vsi_nn_tensor_t **)); + memset( rnncell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + + vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, time_step, use_virtual_tensor); + + vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + + last_step_h_state = inputs[RNN_INPUT_H_STATE]; + for( i = 0; i < time_step; i++ ) + { + vsi_nn_tensor_t* reshape_output = NULL; + vsi_nn_tensor_t* rnncell_out0 = NULL; + vsi_nn_tensor_t* rnncell_out1 = NULL; + + /* reshape for split output */ + output_tensor = vsi_nn_rnn_reshape_split_output(self, + split_output_tensors[i], batch_size, use_virtual_tensor); + reshape_output = output_tensor->t; + + /* rnncell output */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[RNN_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + rnncell_out0 = output_tensor->t; + + /* rnncell output h_state */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[RNNCELL_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + rnncell_out1 = output_tensor->t; + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 ); + curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation; + memcpy( curr->node->nn_param.rnncell_ovxlib.internal_dtype, + curr_param->internal_dtype, sizeof( curr_param->internal_dtype ) ); + curr->inputs[RNNCELL_INPUT_INPUT] = reshape_output; + curr->inputs[RNNCELL_INPUT_H_STATE] = last_step_h_state; + + 
curr->inputs[RNNCELL_INPUT_WEIGHT_I] = inputs[RNN_INPUT_WEIGHT_I]; + curr->inputs[RNNCELL_INPUT_WEIGHT_H] = inputs[RNN_INPUT_WEIGHT_H]; + + curr->inputs[RNNCELL_INPUT_BIAS] = inputs[RNN_INPUT_BIAS]; + + curr->outputs[RNNCELL_OUTPUT_OUTPUT] = rnncell_out0; + curr->outputs[RNNCELL_OUTPUT_H_STATE] = rnncell_out1; + + vsi_nn_internal_setup_node( self, curr ); + + last_step_h_state = rnncell_out1; + + /* reshape output to 3-dims */ + output_tensor = vsi_nn_rnn_reshape_cell_output(self, + rnncell_out0, batch_size, use_virtual_tensor); + rnncell_reshape_output_tensors[i] = output_tensor->t; + } + + tensor = outputs[RNN_OUTPUT_OUTPUT]; + if( !curr_param->time_major ) + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[RNN_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + tensor = output_tensor->t; + } + + /* concat rnncell output, the rnn's output is 3-dims */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + curr->node->nn_param.concat.axis = 2; + for( i = 0; i < time_step; i++ ) + { + curr->inputs[i] = rnncell_reshape_output_tensors[i]; + } + curr->outputs[0] = tensor; + vsi_nn_internal_setup_node( self, curr ); + + if( !curr_param->time_major ) + { + /* transpose time_major to batch_major*/ + vsi_nn_rnn_transpose_time_major(self, + tensor, outputs[RNN_OUTPUT_OUTPUT], use_virtual_tensor); + } + + vsi_nn_safe_free( split_output_tensors ); + vsi_nn_safe_free( rnncell_reshape_output_tensors ); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_deinit_node_wksp( self ); + + return status; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ UNIDIRECTIONAL_SEQUENCE_RNN, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ RNN_INPUT_CNT, + /* output_num */ RNN_OUTPUT_CNT + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c new file mode 100644 index 0000000..906fb7c --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c @@ -0,0 +1,254 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
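The UNIDIRECTIONAL_SEQUENCE_RNN `op_setup` above unrolls the sequence: the input is transposed to time-major when needed, split into `time_step` slices, one RNNCELL node is created per step with the hidden state of step t feeding step t+1 (weights and bias shared by every step), and the per-step outputs are concatenated back into a 3-D tensor. A toy, self-contained illustration of the same unrolling idea with a scalar cell (all weights and sizes here are made up for the demo and are not the library's formulation):

```c
#include <stdio.h>
#include <math.h>

/* One scalar "RNN cell": h = tanh(w_i * x + w_h * h_prev + b). */
static float rnn_cell(float x, float h_prev, float w_i, float w_h, float b)
{
    return tanhf(w_i * x + w_h * h_prev + b);
}

int main(void)
{
    const float input[4] = {0.5f, -1.0f, 0.25f, 2.0f}; /* time_step = 4 */
    float h = 0.0f; /* zero initial state, like the internally created h_state */
    float outputs[4];
    int t;

    /* Unroll: same weights every step, state flows from step to step. */
    for (t = 0; t < 4; t++)
    {
        h = rnn_cell(input[t], h, 0.8f, 0.1f, 0.05f);
        outputs[t] = h; /* per-step output, later concatenated */
    }
    for (t = 0; t < 4; t++)
    {
        printf("t=%d out=%f\n", t, outputs[t]);
    }
    return 0;
}
```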
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "client/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" + +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (VSI_NN_UNSTACK_MAX_OUTPUTS) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_unstack_param * p; + vsi_nn_tensor_attr_t attr; + int32_t use_virtual_tensor = 1; + uint32_t tensor_num = self->output.num; + vsi_nn_internal_tensor_t* input_tensor = NULL; + vsi_nn_internal_tensor_t** output_tensors = NULL; + vsi_nn_internal_node_t* curr = NULL; + uint32_t* reshape_input_size = NULL; + uint32_t *slices = NULL; + uint32_t block_size = 1; + uint32_t block_num = 1; + uint32_t axis = 0; + uint32_t i, j; + + vsi_nn_internal_init_node_wksp( self ); + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + p = (vsi_nn_unstack_param *)&(self->nn_param.unstack); + if(p->axis == 0) + { + for(i = 0; i < inputs[0]->attr.dim_num - 1; i++) + { + for(j = 0; j < self->output.num; j++) + { + outputs[j]->attr.size[i] = inputs[0]->attr.size[i + 1]; + } + } + + for(j = 0; j < self->output.num; j++) + { + outputs[j]->attr.dim_num = inputs[0]->attr.dim_num - 1; + } + } + else if(p->axis == 1) + { + for(j = 0; j < self->output.num; j++) + { + outputs[j]->attr.size[0] = inputs[0]->attr.size[0]; + + for(i = 1; i < inputs[0]->attr.dim_num-1; i++) + { + outputs[j]->attr.size[i] = inputs[0]->attr.size[i + 1]; + } + outputs[j]->attr.dim_num = inputs[0]->attr.dim_num - 1; + } + } + else if(p->axis == 2) + { + for(j = 0; j < self->output.num; j++) + { + outputs[j]->attr.size[0] = inputs[0]->attr.size[0]; + outputs[j]->attr.size[1] = inputs[0]->attr.size[1]; + + for(i = 2; i < inputs[0]->attr.dim_num - 1; i++) + { + outputs[j]->attr.size[i] = inputs[0]->attr.size[i + 1]; + } + outputs[j]->attr.dim_num = inputs[0]->attr.dim_num - 1; + } + } + else if(p->axis == 3) + { + for(j = 0; j < self->output.num; j++) + { + outputs[j]->attr.size[0] = inputs[0]->attr.size[0]; + outputs[j]->attr.size[1] = inputs[0]->attr.size[1]; + outputs[j]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[j]->attr.dim_num = inputs[0]->attr.dim_num - 1; + } + } + } + + axis = self->nn_param.unstack.axis; + + for (i = 0; i < axis; i++) + { + block_size *= inputs[0]->attr.size[i]; + } + + for (i = axis + 1; i < inputs[0]->attr.dim_num; i++) + { + block_num *= inputs[0]->attr.size[i]; + } + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); + input_tensor = 
vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + reshape_input_size = (uint32_t *)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + reshape_input_size[0] = block_size; + reshape_input_size[1] = tensor_num; + reshape_input_size[2] = block_num; + + curr->node->nn_param.reshape.size = reshape_input_size; + curr->node->nn_param.reshape.dim_num = 3; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = input_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + + slices = (uint32_t *)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, tensor_num ); + curr->node->nn_param.split.axis = 1; + curr->node->nn_param.split.slices = slices; + curr->node->nn_param.split.slices_num = tensor_num; + curr->inputs[0] = input_tensor->t; + output_tensors = (vsi_nn_internal_tensor_t**)malloc(tensor_num * sizeof(vsi_nn_internal_tensor_t*)); + for (i = 0; i < tensor_num; i++) + { + slices[i] = 1; + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &outputs[i]->attr.dtype, use_virtual_tensor); + output_tensors[i] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + curr->outputs[i] = output_tensors[i]->t; + } + vsi_nn_internal_setup_node( self, curr ); + + for (i = 0; i < tensor_num; i++) + { + uint32_t* output_size = NULL; + + output_size = (uint32_t *)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + + memcpy(output_size, outputs[i]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr->node->nn_param.reshape.size = output_size; + curr->node->nn_param.reshape.dim_num = outputs[i]->attr.dim_num; + curr->inputs[0] = output_tensors[i]->t; + curr->outputs[0] = outputs[i]; + vsi_nn_internal_setup_node( self, curr ); + } + + vsi_nn_safe_free(output_tensors); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_internal_deinit_node_wksp( self ); + + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ UNSTACK, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c new file mode 100644 index 0000000..6dd771f --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c @@ -0,0 +1,314 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
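The UNSTACK `op_setup` above lowers the op to reshape + split + reshape: the input is viewed as `[block_size, output_count, block_num]`, where `block_size` is the product of the dimensions below the unstack axis and `block_num` the product of the dimensions above it, and the middle dimension is split into slices of size 1. A standalone sketch of that bookkeeping (in the graph code the output count comes from `self->output.num`; here it is read from the unstacked axis, which should be the same value):

```c
#include <stdio.h>
#include <stdint.h>

/* Compute the reshape used by the UNSTACK lowering:
 * input -> [block_size, num_outputs, block_num], split along dim 1. */
static void unstack_blocks(const uint32_t *in_size, uint32_t dim_num,
                           uint32_t axis, uint32_t *block_size,
                           uint32_t *num_outputs, uint32_t *block_num)
{
    uint32_t i;
    *block_size = 1;
    *block_num  = 1;
    for (i = 0; i < axis; i++)
    {
        *block_size *= in_size[i];
    }
    *num_outputs = in_size[axis];
    for (i = axis + 1; i < dim_num; i++)
    {
        *block_num *= in_size[i];
    }
}

int main(void)
{
    uint32_t in[4] = {8, 4, 6, 2};
    uint32_t bs, n, bn;
    unstack_blocks(in, 4, 2, &bs, &n, &bn);
    printf("reshape to [%u, %u, %u], split into %u outputs\n", bs, n, bn, n);
    return 0;
}
```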
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "ops/vsi_nn_op_upsample.h" +#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + + +static vsi_bool vsi_nn_upsample_optimize_shape + ( + vsi_nn_node_t * self, + const int32_t* shape_in0, const int32_t* shape_in1, + const int32_t* shape_out, const size_t rank_in, + int32_t* out_shape_input0, int32_t* out_shape_input1, + int32_t* out_shape_output, uint32_t* out_rank_output + ) +{ + vsi_bool enable_image_2d = FALSE; + int32_t hwLitimLen = 65536; + + if ((2 == self->nn_param.upsample.scale[0]) + && (2 == self->nn_param.upsample.scale[1])) + { + if (rank_in < 3) + { + enable_image_2d = TRUE; + } + else + { + enable_image_2d = (vsi_bool)((shape_out[1] * shape_out[2] < hwLitimLen) + && ( (shape_out[1] % 2) == 0 )); + } + } + + if( rank_in == 1 ) + { + out_shape_input0[0] = shape_in0[0]; + out_shape_input0[1] = 1; + out_shape_input0[2] = 1; + out_shape_input1[0] = shape_in1[0]; + out_shape_input1[1] = 1; + out_shape_input1[2] = 1; + out_shape_output[0] = shape_out[0]; + out_shape_output[1] = 1; + out_shape_output[2] = 1; + *out_rank_output = 2; + } + else if(rank_in == 3 && enable_image_2d) + { + out_shape_input0[0] = shape_in0[0]; + out_shape_input0[1] = shape_in0[1] * shape_in0[2]; + out_shape_input0[2] = 1; + out_shape_input1[0] = shape_in1[0]; + out_shape_input1[1] = shape_in1[1] * shape_in1[2]; + out_shape_input1[2] = 1; + out_shape_output[0] = shape_out[0]; + out_shape_output[1] = shape_out[1] * shape_out[2]; + out_shape_output[2] = 1; + *out_rank_output = 2; + } + else if(rank_in == 4 && enable_image_2d) + { + out_shape_input0[0] = shape_in0[0]; + out_shape_input0[1] = shape_in0[1] * shape_in0[2]; + out_shape_input0[2] = 1; + out_shape_input0[3] = shape_in0[3]; + out_shape_input1[0] = shape_in1[0]; + out_shape_input1[1] = shape_in1[1] * shape_in1[2]; + out_shape_input1[2] = 1; + out_shape_input1[3] = shape_in1[3]; + out_shape_output[0] = shape_out[0]; + out_shape_output[1] = shape_out[1] * shape_out[2]; + out_shape_output[2] = 1; + out_shape_output[3] = shape_out[3]; + if (1 == shape_in0[3]) + { + *out_rank_output = 2; + } + else + { + *out_rank_output = 4; + } + } + else + { + uint32_t i; + for (i = 0; i < rank_in; i++) + { + out_shape_input0[i] = shape_in0[i]; + out_shape_input1[i] = shape_in1[i]; + out_shape_output[i] = shape_out[i]; + } + *out_rank_output = (uint32_t)rank_in; + } + + return TRUE; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; 
+ int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + uint32_t new_rank = 0; + vsi_bool ret; + vsi_nn_kernel_param_t * param = NULL; + int32_t scale_x = (int32_t)self->nn_param.upsample.scale[0]; + int32_t scale_y = (int32_t)self->nn_param.upsample.scale[1]; + + if( NULL == self ) + { + return VSI_FAILURE; + } + + param =vsi_nn_kernel_param_create(); + + ret = vsi_nn_upsample_optimize_shape(self, + (int32_t *)inputs[0]->attr.size, (int32_t *)inputs[1]->attr.size, + (int32_t *)outputs[0]->attr.size, inputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + vsi_nn_kernel_param_add_int32( param, "scale_x", scale_x ); + vsi_nn_kernel_param_add_int32( param, "scale_y", scale_y ); + + if( ret ) + { + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], (uint32_t*)shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shapes[2], new_rank ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "upsample", + &reshape_tensors[0], _INPUT_NUM, + &reshape_tensors[2], _OUTPUT_NUM, param ); + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + vsi_nn_ReleaseTensor( &reshape_tensors[2] ); + } + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(UPSAMPLE, 2, 1) + IO_TYPE(D_I16|Q_DFP, D_U8, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_U8, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8, D_F16) + IO_TYPE(D_F16, D_U8, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_U8, D_I16|Q_DFP) + IO_TYPE(D_F16, D_U8, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8, D_F16) + IO_TYPE(D_F32, D_U8, D_F32) + IO_TYPE(D_F32, D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8, D_F32) + IO_TYPE(D_I32, D_U8, D_I32) + + /* HW 9.0 */ + IO_TYPE(D_BF16, D_U8, D_BF16) + END_IO_TYPE_DECL(UPSAMPLE) + if(!VALIDATE_OP_IO_TYPES(UPSAMPLE, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t h; + uint32_t w; + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + w = self->nn_param.upsample.size[0]; + h = self->nn_param.upsample.size[1]; + if (0 == self->nn_param.upsample.scale[0]) + { + self->nn_param.upsample.scale[0] = self->nn_param.upsample.size[0] / + inputs[0]->attr.size[0]; + } + if (0 == self->nn_param.upsample.scale[1]) + { + self->nn_param.upsample.scale[1] = self->nn_param.upsample.size[1] / + inputs[0]->attr.size[1]; + } + if ( 0 == self->nn_param.upsample.size[0] ) + { + w = inputs[0]->attr.size[0] * self->nn_param.upsample.scale[0]; + } + if ( 0 == self->nn_param.upsample.size[1] ) + { + h = inputs[0]->attr.size[1] * self->nn_param.upsample.scale[1]; + } + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + 
outputs[0]->attr.size[0] = w; + outputs[0]->attr.size[1] = h; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[1]->attr.size[3]; + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + uint32_t i; + for (i = 0; i < _VSI_NN_UPSAMPLE_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.upsample.local.local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.upsample.local.local_tensor[i])); + self->nn_param.upsample.local.local_tensor[i] = NULL; + } + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ UPSAMPLE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c new file mode 100644 index 0000000..5cbd3cb --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c @@ -0,0 +1,154 @@ + +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_util.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* + Need copy input data to output if don't reshape input to output + */ + if(inputs[0]->t != NULL && outputs[0]->t != NULL && + self->nn_param.variable.local->initialized == FALSE) + { + self->n = vxTensorCopyNode(self->graph->g, + inputs[0]->t, outputs[0]->t); + if(NULL == self->n) + { + VSILOGE( "Create vxTensorCopyNode fail." 
); + return VSI_FAILURE; + } + VSILOGD("Create a copy node for variable"); + } + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_DATACONVERT, self, inputs, outputs); + + return ret; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_nn_variable_lcl_data *local = NULL; + if( direction == VSI_NN_OPTIMIZE_BACKWARD ) + { + return VSI_SUCCESS; + } + local = (vsi_nn_variable_lcl_data *)malloc(sizeof(vsi_nn_variable_lcl_data)); + if( NULL == local ) + { + VSILOGE("malloc memory fail"); + return VSI_FAILURE; + } + memset(local, 0, sizeof(vsi_nn_variable_lcl_data)); + if( NULL != inputs[0]->t && NULL == outputs[0]->t ) + { + VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + outputs[0]->t = vxReshapeTensor(inputs[0]->t, (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num); + if( NULL == outputs[0]->t ) + { + VSILOGE("Call vxReshapeTensor fail"); + free(local); + local = NULL; + return VSI_FAILURE; + } + local->initialized = TRUE; + } + else + { + local->initialized = FALSE; + } + self->nn_param.variable.local = local; + return VSI_SUCCESS; +} /* op_optimize() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_variable_lcl_data *local = self->nn_param.variable.local; + if(local) + { + free(local); + local = NULL; + } + vsi_nn_op_common_deinit(self); + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ VARIABLE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ op_optimize, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/quantization/vsi_nn_asymmetric_affine.c b/src/tim/vx/internal/src/quantization/vsi_nn_asymmetric_affine.c new file mode 100644 index 0000000..1025604 --- /dev/null +++ b/src/tim/vx/internal/src/quantization/vsi_nn_asymmetric_affine.c @@ -0,0 +1,109 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
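The VARIABLE op above has two paths: when the output handle has not been created yet, `op_optimize` builds it as a reshaped view of the input and marks the node initialized, so no runtime work is needed; otherwise `op_compute` inserts a tensor-copy node to move the data at execution time. A tiny illustrative sketch of that decision (plain C, not library code):

```c
#include <stdbool.h>
#include <stdio.h>

/* Decision mirrored from the VARIABLE op: alias (reshape) when the output
 * tensor does not exist yet, otherwise copy the data at execution time. */
static const char *variable_plan(bool input_exists, bool output_exists)
{
    if (input_exists && !output_exists)
    {
        return "alias: reshape input into output, no copy node";
    }
    return "copy: insert a tensor-copy node at compute time";
}

int main(void)
{
    printf("%s\n", variable_plan(true, false));
    printf("%s\n", variable_plan(true, true));
    return 0;
}
```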
+* +*****************************************************************************/ +#include "vsi_nn_log.h" +#include "quantization/vsi_nn_asymmetric_affine.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_dtype_util_prv.h" + +vsi_status vsi_nn_QuantAffineCalParam + ( + vsi_nn_type_e type, + float max_data, + float min_data, + float * scale, + int32_t * zero_point + ) +{ + double max_range, min_range; + int32_t tmp; + max_range = 0.0; + min_range = 0.0; + + switch( type ) + { + case VSI_NN_TYPE_UINT8: + case VSI_NN_TYPE_UINT16: + case VSI_NN_TYPE_UINT32: + break; + default: + VSILOGW("Not support type %#x", type); + return VSI_FAILURE; + } + type_get_range( type, &max_range, &min_range ); + *scale = ( max_data - min_data ) / (float)( max_range - min_range ); + tmp = (int32_t)vsi_rint( (float)min_range - min_data / *scale ); + *zero_point = (int32_t)vsi_nn_min( (int32_t)max_range, + vsi_nn_max( (int32_t)min_range, tmp ) ); + return VSI_SUCCESS; +} /* vsi_nn_QuantAffineCalParam() */ + +vsi_bool vsi_nn_QuantAffineCheck + ( + vsi_nn_tensor_t *input, + vsi_nn_tensor_t *weight, + vsi_nn_tensor_t *bias + ) +{ + vsi_bool ret; + vsi_nn_type_e dtype; + const double diff_scale = (double)1e-5; + + ret = FALSE; + dtype = input->attr.dtype.vx_type; + + switch (dtype) + { + case VSI_NN_TYPE_UINT8: + case VSI_NN_TYPE_UINT16: + case VSI_NN_TYPE_UINT32: + case VSI_NN_TYPE_INT8: + { + double product_scale = (double)input->attr.dtype.scale * (double)weight->attr.dtype.scale; + const double acuity_round_decimals = 1e-8; + if(bias && bias->attr.dtype.scale) + { + double tmp0,tmp1; + double bias_scale = bias->attr.dtype.scale; + tmp0 = vsi_nn_abs(product_scale - bias_scale); + tmp1 = vsi_nn_min(product_scale, bias_scale) * diff_scale; + tmp1 = vsi_nn_max(tmp1, acuity_round_decimals); + if(tmp0 <= tmp1) + { + ret = TRUE; + } + } + else + { + ret = TRUE; + } + } + break; + default: + VSILOGW("input dtype error %#x", dtype); + break; + } + + return ret; +} /* vsi_nn_QuantAffineCheck() */ diff --git a/src/tim/vx/internal/src/quantization/vsi_nn_dynamic_fixed_point.c b/src/tim/vx/internal/src/quantization/vsi_nn_dynamic_fixed_point.c new file mode 100644 index 0000000..a2c8641 --- /dev/null +++ b/src/tim/vx/internal/src/quantization/vsi_nn_dynamic_fixed_point.c @@ -0,0 +1,100 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
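`vsi_nn_QuantAffineCalParam` above computes `scale = (max - min) / (qmax - qmin)` and `zero_point = clamp(round(qmin - min / scale), qmin, qmax)`. A standalone sketch of the same arithmetic for the unsigned 8-bit range [0, 255] (helper name and sample values are illustrative):

```c
#include <math.h>
#include <stdio.h>
#include <stdint.h>

/* Asymmetric-affine parameters for uint8: scale maps the float range onto
 * [0, 255]; the zero point is the rounded, clamped quantized value of 0. */
static void affine_params_u8(float min_data, float max_data,
                             float *scale, int32_t *zero_point)
{
    const float qmin = 0.0f;
    const float qmax = 255.0f;
    int32_t zp;

    *scale = (max_data - min_data) / (qmax - qmin);
    zp = (int32_t)lrintf(qmin - min_data / *scale);
    if (zp < (int32_t)qmin) zp = (int32_t)qmin;
    if (zp > (int32_t)qmax) zp = (int32_t)qmax;
    *zero_point = zp;
}

int main(void)
{
    float scale;
    int32_t zp;
    affine_params_u8(-1.0f, 3.0f, &scale, &zp);
    printf("scale=%f zero_point=%d\n", scale, zp); /* ~0.015686, 64 */
    return 0;
}
```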
+* +*****************************************************************************/ +#include +#include "vsi_nn_log.h" +#include "quantization/vsi_nn_dynamic_fixed_point.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" + +vsi_status vsi_nn_QuantDFPCalParam + ( + vsi_nn_type_e dtype, + float max_data, + float min_data, + int8_t * fl + ) +{ + int32_t tmp; + int32_t bits; + + switch( dtype ) + { + case VSI_NN_TYPE_INT8: + case VSI_NN_TYPE_INT16: + case VSI_NN_TYPE_INT32: + break; + default: + VSILOGW("Not support dtype %#x", dtype); + return VSI_FAILURE; + } + max_data = vsi_nn_max( vsi_nn_abs( max_data ), vsi_nn_abs( min_data ) ); + bits = vsi_nn_GetTypeBytes( dtype ) * 8; + tmp = (int32_t)ceil( log( max_data ) / log( 2 ) ); + *fl = (int8_t)(bits - 1 - tmp); + return VSI_SUCCESS; +} /* vsi_nn_QuantDFPCalParam() */ + +vsi_bool vsi_nn_QuantDFPCheck +( + vsi_nn_tensor_t *input, + vsi_nn_tensor_t *weight, + vsi_nn_tensor_t *bias +) +{ + vsi_bool ret; + vsi_nn_type_e dtype; + + ret = FALSE; + dtype = input->attr.dtype.vx_type; + + switch (dtype) + { + case VSI_NN_TYPE_INT8: + case VSI_NN_TYPE_INT16: + case VSI_NN_TYPE_INT32: + { + if(bias) + { + int8_t input_fl = input->attr.dtype.fl; + int8_t weight_fl = weight->attr.dtype.fl; + int8_t bias_fl = bias->attr.dtype.fl; + if(bias_fl == (input_fl + weight_fl)) + { + ret = TRUE; + } + } + else + { + ret = TRUE; + } + } + break; + default: + VSILOGW("input dtype error %#x", dtype); + break; + } + + return ret; +} /* vsi_nn_QuantDFPCheck() */ diff --git a/src/tim/vx/internal/src/quantization/vsi_nn_perchannel_symmetric_affine.c b/src/tim/vx/internal/src/quantization/vsi_nn_perchannel_symmetric_affine.c new file mode 100644 index 0000000..2a24c47 --- /dev/null +++ b/src/tim/vx/internal/src/quantization/vsi_nn_perchannel_symmetric_affine.c @@ -0,0 +1,123 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
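`vsi_nn_QuantDFPCalParam` above derives the fraction length as `fl = bits - 1 - ceil(log2(max_abs))`, and `vsi_nn_QuantDFPCheck` accepts an integer bias only when `bias_fl == input_fl + weight_fl`. A standalone sketch of that arithmetic (the sample ranges are made up):

```c
#include <math.h>
#include <stdio.h>
#include <stdint.h>

/* Dynamic fixed point: fraction length for a signed `bits`-bit type holding
 * values up to max_abs. Larger max_abs -> more integer bits -> smaller fl. */
static int8_t dfp_fraction_length(float max_abs, int bits)
{
    int32_t int_bits = (int32_t)ceil(log(max_abs) / log(2.0));
    return (int8_t)(bits - 1 - int_bits);
}

int main(void)
{
    int8_t input_fl  = dfp_fraction_length(6.2f, 8);   /* e.g. int8 activations */
    int8_t weight_fl = dfp_fraction_length(0.9f, 8);   /* e.g. int8 weights */
    int8_t bias_fl   = (int8_t)(input_fl + weight_fl); /* required for the bias */

    printf("input_fl=%d weight_fl=%d bias_fl=%d\n", input_fl, weight_fl, bias_fl);
    return 0;
}
```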
+* +*****************************************************************************/ +#include "vsi_nn_log.h" +#include "quantization/vsi_nn_perchannel_symmetric_affine.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_limits.h" + +vsi_status vsi_nn_QuantAffinePerchannelCalParam + ( + vsi_nn_type_e type, + float max_data, + float min_data, + float * scales + //int32_t * zero_point + ) +{ + double max_range, min_range; + //int32_t tmp; + max_range = 0.0; + min_range = 0.0; + + switch( type ) + { + case VSI_NN_TYPE_INT8: + case VSI_NN_TYPE_UINT8: + case VSI_NN_TYPE_UINT16: + case VSI_NN_TYPE_UINT32: + break; + default: + VSILOGW("Not support type %#x", type); + return VSI_FAILURE; + } + vsi_nn_TypeGetRange( type, &max_range, &min_range ); + *scales = ( max_data - min_data ) / (float)( max_range - min_range ); + //tmp = (int32_t)vsi_nn_Rint( (float)min_range - min_data / *scales ); + //*zero_point = (int32_t)vsi_nn_min( (int32_t)max_range, + // vsi_nn_max( (int32_t)min_range, tmp ) ); + return VSI_SUCCESS; +} /* vsi_nn_QuantAffinePerchannelCalParam() */ + +vsi_bool vsi_nn_QuantAffinePerchannelCheck + ( + vsi_nn_tensor_t *input, + vsi_nn_tensor_t *weight, + vsi_nn_tensor_t *bias + ) +{ +#ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT + vsi_bool ret; + vsi_nn_type_e dtype; + const float diff_scale = (float)1e-5; + ret = FALSE; + dtype = input->attr.dtype.vx_type; + + switch (dtype) + { + case VSI_NN_TYPE_INT8: + case VSI_NN_TYPE_UINT8: + case VSI_NN_TYPE_UINT16: + case VSI_NN_TYPE_UINT32: + { + float input_scale = input->attr.dtype.scale; + const float *w = NULL; + const float *b = NULL; + int i = 0; + w = weight->attr.dtype.scales; + if(bias && bias->attr.dtype.scales) + { + b = bias->attr.dtype.scales; + for (i=0; i < weight->attr.dtype.scale_dim; i++) + { + float weight_scale = *(w+i); + float bias_scale = *(b+i); + float iw_scale = input_scale * weight_scale; + float diff = vsi_nn_abs(bias_scale - iw_scale); + if(diff <= diff_scale) + { + ret = TRUE; + } + else + { + break; + } + } + } + else + { + ret = TRUE; + } + } + break; + default: + VSILOGW("input dtype error %#x", dtype); + break; + } +#else + vsi_bool ret; + ret = FALSE; +#endif + return ret; +} /* vsi_nn_QuantAffinePerchannelCheck() */ diff --git a/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c b/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c new file mode 100644 index 0000000..85d862d --- /dev/null +++ b/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c @@ -0,0 +1,279 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
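`vsi_nn_QuantAffinePerchannelCheck` above is built around the per-channel rule that each bias scale must match `input_scale * weight_scale[channel]` to within a small absolute tolerance (1e-5 in the code). A standalone sketch of that rule (names and numbers are illustrative, and this sketch rejects on the first mismatching channel):

```c
#include <math.h>
#include <stdbool.h>
#include <stdio.h>

/* Per-channel bias compatibility: |bias_scale[c] - input_scale * weight_scale[c]|
 * must stay within a small tolerance for every channel. */
static bool perchannel_bias_ok(float input_scale, const float *weight_scales,
                               const float *bias_scales, int channels)
{
    const float tol = 1e-5f;
    int c;
    for (c = 0; c < channels; c++)
    {
        float expected = input_scale * weight_scales[c];
        if (fabsf(bias_scales[c] - expected) > tol)
        {
            return false;
        }
    }
    return true;
}

int main(void)
{
    float w[2] = {0.02f, 0.04f};
    float b[2] = {0.001f, 0.002f};
    printf("%s\n", perchannel_bias_ok(0.05f, w, b, 2) ? "compatible" : "mismatch");
    return 0;
}
```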
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include "utils/vsi_nn_binary_tree.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" + +static vsi_nn_binary_tree_t * _new_node + ( void ) +{ + vsi_nn_binary_tree_t * node; + + node = (vsi_nn_binary_tree_t *)malloc( + sizeof( vsi_nn_binary_tree_t ) ); + + memset( node, 0, sizeof( vsi_nn_binary_tree_t ) ); + return node; +} /* _new_node() */ + +static vsi_nn_binary_tree_t * _min_node + ( + vsi_nn_binary_tree_t * node + ) +{ + vsi_nn_binary_tree_t * cur = node; + + while( NULL != cur->left ) + { + cur = cur->left; + } + + return cur; +} /* _min_node() */ + +static vsi_nn_binary_tree_t * _del_node_by_key + ( + vsi_nn_binary_tree_t * root, + vsi_nn_binary_tree_key_t key + ) +{ + if( NULL == root ) + { + return root; + } + if( key < root->key ) + { + root->left = _del_node_by_key( root->left, key ); + } + else if( key > root->key ) + { + root->right = _del_node_by_key( root->right, key ); + } + else + { + if( NULL == root->left ) + { + vsi_nn_binary_tree_t * node = root->right; + free( root ); + return node; + } + else if( NULL == root->right ) + { + vsi_nn_binary_tree_t * node = root->left; + free( root ); + return node; + } + else + { + vsi_nn_binary_tree_t * node = _min_node( root->right ); + root->key = node->key; + + /* copy data */ + root->data_ptr = node->data_ptr; + + root->right = _del_node_by_key(root->right, node->key ); + } + } + + return root; +} /* _del_node_by_key() */ + +static vsi_nn_binary_tree_t * _move_left + ( + vsi_nn_binary_tree_t * node + ) +{ + vsi_nn_binary_tree_t * left = NULL; + if( NULL != node ) + { + left = node->left; + } + return left; +} /* _move_left() */ + +static vsi_nn_binary_tree_t * _move_right + ( + vsi_nn_binary_tree_t * node + ) +{ + vsi_nn_binary_tree_t * right = NULL; + if( NULL != node ) + { + right = node->right; + } + return right; +} /* _move_right() */ + +static vsi_nn_binary_tree_t * _find_loc + ( + vsi_nn_binary_tree_t * node, + vsi_nn_binary_tree_key_t key, + int * val + ) +{ + int tmp; + vsi_nn_binary_tree_t * next; + vsi_nn_binary_tree_t * loc; + loc = NULL; + tmp = 0; + while( NULL != node ) + { + if( node->key > key ) + { + next = _move_left( node ); + tmp = -1; + } + else if( node->key < key ) + { + next = _move_right( node ); + tmp = 1; + } + else + { + loc = node; + tmp = 0; + break; + } + if( NULL != next ) + { + node = next; + } + else + { + loc = node; + break; + } + } + if( NULL != val ) + { + *val = tmp; + } + return loc; +} /* _find_loc_to_insert() */ + + +void vsi_nn_BinaryTreeRemoveNode + ( + vsi_nn_binary_tree_t ** root, + vsi_nn_binary_tree_key_t key + ) +{ + if( NULL == root && NULL != *root ) + { + return; + } + + *root = _del_node_by_key( *root, key ); +} /* vsi_nn_BinaryTreeRemoveNode() */ + +void vsi_nn_BinaryTreeNewNode + ( + vsi_nn_binary_tree_t ** root, + vsi_nn_binary_tree_key_t key, + void * data + ) +{ + int val; + vsi_nn_binary_tree_t * iter; + vsi_nn_binary_tree_t * node; + + if( NULL == root ) + { + return; + } + + val = 0; + iter = *root; + iter = _find_loc( iter, key, &val ); + if( NULL != iter && key == iter->key ) + { + //VSILOGE( "Key %#x has been registered.", (unsigned int)key ); + // Update node + iter->data_ptr = data; + return; + } 
+ + /* New node */ + node = _new_node(); + if( NULL != node ) + { + node->key = key; + node->data_ptr = data; + } + else + { + VSILOGW( "Malloc binary tree node fail." ); + } + + /* Insert node */ + if( NULL == iter ) + { + *root = node; + } + else + { + if( val > 0 ) + { + iter->right = node; + } + else if( val < 0 ) + { + iter->left = node; + } + else + { + VSILOGE( "Hash collision!" ); + if( node ) + { + free( node ); + node = NULL; + } + } + } +} /* vsi_nn_BinaryTreeNewNode() */ + +void * vsi_nn_BinaryTreeGetNode + ( + vsi_nn_binary_tree_t ** root, + vsi_nn_binary_tree_key_t key + ) +{ + void * data; + vsi_nn_binary_tree_t * iter; + + if( NULL == root ) + { + return NULL; + } + data = NULL; + iter = *root; + iter = _find_loc( iter, key, NULL ); + if( NULL != iter && key == iter->key ) + { + data = iter->data_ptr; + } + return data; +} /* vsi_nn_BinaryTreeGetNode() */ diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c new file mode 100644 index 0000000..4a898bc --- /dev/null +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -0,0 +1,540 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include +#include "vsi_nn_prv.h" +#include "vsi_nn_assert.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_code_generator.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" + +/* The static data file handle. */ +static FILE * s_dfile_hndl = NULL; +static FILE * s_net_file_hndl = NULL; + +static void _try_open_file + ( + const char * file_path, + FILE ** fp, + const char * mode + ) +{ + if( NULL == file_path ) + { + return; + } + if( NULL != *fp ) + { + VSILOGW( "File handle is not NULL." 
); + fclose( *fp ); + } + *fp = fopen( file_path, mode ); + if( NULL == *fp ) + { + VSILOGE( "Open file %s fail.", file_path ); + return; + } +} /* _try_open_file() */ + +static void _try_close_file + ( + FILE ** fp + ) +{ + if( NULL != *fp ) + { + fflush( *fp ); + fclose( *fp ); + *fp = NULL; + } +} /* _try_close_file() */ + +static void _try_pack_tensor_data + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + uint64_t * p_ofst, + uint64_t * p_sz + ) +{ + long ofst; + size_t cnt; + uint32_t bytes; + uint8_t * data; + + if( NULL == s_dfile_hndl || NULL == tensor + || NULL == p_ofst || NULL == p_sz ) + { + return; + } + *p_ofst = 0; + *p_sz = 0; + ofst = ftell( s_dfile_hndl ); + if( 0 > ofst ) + { + VSILOGE( "Get offset error %ld.", ofst ); + } + else + { + *p_ofst = (uint64_t)ofst; + data = vsi_nn_ConvertTensorToData( graph, tensor ); + bytes = vsi_nn_GetTensorSize( tensor->attr.size, + tensor->attr.dim_num, tensor->attr.dtype.vx_type ); + if( NULL != data ) + { + cnt = fwrite( data, (size_t)bytes, 1, s_dfile_hndl ); + if( cnt != 1 ) + { + VSILOGW( "Write tensor bytes(%zu/%d)", cnt, 1 ); + } + if( cnt > 0 ) + { + *p_sz = (uint64_t)bytes; + } + free( data ); + data = NULL; + } + } +} /* _pack_tensor_data() */ + +#define _write_code(str, ...) _write_code_ex(str"\n", ##__VA_ARGS__) +static void _write_code_ex + ( + const char * fmt, + ... + ) +{ +#define _MAX_LINE_SIZE (256 - 1) + + char line[_MAX_LINE_SIZE] = { 0 }; + int bytes; + va_list args; + + va_start( args, fmt ); + bytes = vsnprintf( line, _MAX_LINE_SIZE, fmt, args ); + va_end( args ); + + if( NULL != s_net_file_hndl ) + { + fwrite( line, bytes, 1, s_net_file_hndl ); + } + else + { + vprintf( fmt, args ); + } +} /* _write_code() */ + +static void _vx_param + ( + vsi_nn_node_id_t node_id, + vsi_nn_node_t * node + ) +{ + _write_code("node[%u]->vx_param.has_relu = %d;", + node_id, (int)node->vx_param.has_relu); + _write_code("node[%u]->vx_param.overflow_policy = %#x;", + node_id, (int)node->vx_param.overflow_policy); + _write_code("node[%u]->vx_param.rounding_policy = %#x;", + node_id, (int)node->vx_param.rounding_policy); + _write_code("node[%u]->vx_param.down_scale_size_rounding = %#x;", + node_id, (int)node->vx_param.down_scale_size_rounding); +} /* _vx_param() */ + +static void _conv_param + ( + vsi_nn_node_id_t node_id, + vsi_nn_node_t * node + ) +{ + _write_code("node[%u]->nn_param.conv2d.ksize[0] = %u;", + node_id, node->nn_param.conv2d.ksize[0]); + _write_code("node[%u]->nn_param.conv2d.ksize[1] = %u;", + node_id, node->nn_param.conv2d.ksize[1]); + _write_code("node[%u]->nn_param.conv2d.pad[0] = %u;", + node_id, node->nn_param.conv2d.pad[0]); + _write_code("node[%u]->nn_param.conv2d.pad[1] = %u;", + node_id, node->nn_param.conv2d.pad[1]); + _write_code("node[%u]->nn_param.conv2d.pad[2] = %u;", + node_id, node->nn_param.conv2d.pad[2]); + _write_code("node[%u]->nn_param.conv2d.pad[3] = %u;", + node_id, node->nn_param.conv2d.pad[3]); + _write_code("node[%u]->nn_param.conv2d.pad_type = %#x;", + node_id, node->nn_param.conv2d.pad_type); + _write_code("node[%u]->nn_param.conv2d.stride[0] = %u;", + node_id, node->nn_param.conv2d.stride[0]); + _write_code("node[%u]->nn_param.conv2d.stride[1] = %u;", + node_id, node->nn_param.conv2d.stride[1]); +} /* _conv_param() */ + +static void _pool_param + ( + vsi_nn_node_id_t node_id, + vsi_nn_node_t * node + ) +{ + _write_code("node[%u]->nn_param.pool.ksize[0] = %u;", + node_id, node->nn_param.pool.ksize[0]); + _write_code("node[%u]->nn_param.pool.ksize[1] = %u;", + node_id, 
node->nn_param.pool.ksize[1]); + _write_code("node[%u]->nn_param.pool.pad[0] = %u;", + node_id, node->nn_param.pool.pad[0]); + _write_code("node[%u]->nn_param.pool.pad[1] = %u;", + node_id, node->nn_param.pool.pad[1]); + _write_code("node[%u]->nn_param.pool.pad[2] = %u;", + node_id, node->nn_param.pool.pad[2]); + _write_code("node[%u]->nn_param.pool.pad[3] = %u;", + node_id, node->nn_param.pool.pad[3]); + _write_code("node[%u]->nn_param.pool.pad_type = %#x;", + node_id, node->nn_param.pool.pad_type); + _write_code("node[%u]->nn_param.pool.stride[0] = %u;", + node_id, node->nn_param.pool.stride[0]); + _write_code("node[%u]->nn_param.pool.stride[1] = %u;", + node_id, node->nn_param.pool.stride[1]); + _write_code("node[%u]->nn_param.pool.type = %#x;", + node_id, node->nn_param.pool.type); +} /* _pool_param() */ + +static void _lrn_param + ( + vsi_nn_node_id_t node_id, + vsi_nn_node_t * node + ) +{ + _write_code("node[%u]->nn_param.lrn.type = %#x;", + node_id, node->nn_param.lrn.type); + _write_code("node[%u]->nn_param.lrn.size = %d;", + node_id, node->nn_param.lrn.size); + _write_code("node[%u]->nn_param.lrn.alpha = %ff;", + node_id, node->nn_param.lrn.alpha); + _write_code("node[%u]->nn_param.lrn.beta = %ff;", + node_id, node->nn_param.lrn.beta); +} /* _lrn_param() */ + +static void _fcl_param + ( + vsi_nn_node_id_t node_id, + vsi_nn_node_t * node + ) +{ + _write_code("node[%u]->nn_param.fcl.weights = %d;", + node_id, node->nn_param.fcl.weights); +} /* _fcl_param() */ + +static void _concat_param + ( + vsi_nn_node_id_t node_id, + vsi_nn_node_t * node + ) +{ + _write_code("node[%u]->nn_param.concat.axis = %u;", + node_id, node->nn_param.concat.axis); +} /* _concat_param() */ + +static void _conv_relu_pool_param + ( + vsi_nn_node_id_t node_id, + vsi_nn_node_t * node + ) +{ + _conv_param( node_id, node ); + _pool_param( node_id, node ); +} /* _conv_relu_pool_param() */ + +typedef void (* _op_param_gen_t) + ( + vsi_nn_node_id_t node_id, + vsi_nn_node_t * node + ); + +static _op_param_gen_t s_op_gen[] = +{ + /* ADD */ NULL, + /* MULTIPLY */ NULL, + /* CONV2D */ _conv_param, + /* CONV_RELU */ _conv_param, + /* CONV_RELU_POOL */ _conv_relu_pool_param, + /* FCL */ _fcl_param, + /* FCL_RELU */ _fcl_param, + /* SOFTMAX */ NULL, + /* POOL */ _pool_param, + /* LEAKY_RELU */ NULL, + /* LRN */ _lrn_param, + /* CONCAT */ _concat_param, + /* SPLIT */ NULL, + /* NOOP */ NULL, + /* ROI_POOL */ NULL, + /* BATCH_NORM */ NULL, + /* PROPOSAL */ NULL, + /* DECONVOLUTION */ NULL, + /* RESHAPE */ NULL, + /* PERMUTE */ NULL, + /* PRELU */ NULL, + /* UPSAMPLE */ NULL, + /* RELU */ NULL, + /* RELUN */ NULL, + /* LSTM */ NULL, + /* REORG */ NULL, + /* VARIABLE */ NULL, + /* L2_NORMALIZE */ NULL, + /* FCL2 */ NULL, + /* POOLWITHARGMAX */ NULL, + /* ARGMAX */ NULL, + /* MAXIMUM */ NULL, + /* L2NORMALIZESCALE */ NULL, + /* CROP */ NULL, + /* SUBTRACT */ NULL, + /* RELU6 */ NULL, + /* SIGMOID */ NULL, + /* TANH */ NULL, + /* SQRT */ NULL, + /* RSQRT */ NULL, + /* SOFTRELU */ NULL, + /* DIVIDE */ NULL, + /* DROPOUT */ NULL, + /* SHUFFLECHANNEL */ NULL, + /* RESIZE */ NULL, + /* REVERSE */ NULL, + /* DEPTH2SPACE */ NULL, + /* SPACE2DEPTH */ NULL, + /* DATACONVERT */ NULL, + /* SCALE */ NULL, + /* SLICE */ NULL, + /* ELU */ NULL, + /* BATCH2SPACE */ NULL, + /* SPACE2BATCH */ NULL, + /* PAD */ NULL, + /* IMAGEPROCESS */ NULL, + /* MATRIXMUL */ NULL, + /* LSTMUNIT */ NULL, + /* LAYERNORM */ NULL, + /* REDUCE */ NULL, + /* INSTANCENORM */ NULL, + /* TENSORSTACKCONCAT */ NULL, + /* STRIDED_SLICE */ NULL, + /* SIGNALFRAME */ NULL, 
+ /* A_TIMES_B_PLUS_C */ NULL, + /* SVDF */ NULL, + /* ABS */ NULL, + /* CONV1D */ NULL, + /* NBG */ NULL, + /* CONCATSHIFT */ NULL, + /* LRN2 */ _lrn_param, + /* RELATIONALOPS */ NULL, + /* SYNC_HOST */ NULL, + /* POW */ NULL, + /* FLOORDIV */ NULL, + /* MINIMUM */ NULL, + /* SPATIAL_TRANSFORMER */ NULL, + /* LOGICAL_OPS */ NULL, + /* SELECT */ NULL, + /* LSTMUNIT_ACTIVATION */ NULL, + /* LSTMUNIT_OVXLIB */ NULL, + /* TENSOR_ADD_MEAN_STDDEV_NORM */ NULL, + /* RELU1 */ NULL, + /* STACK */ NULL, + /* FLOOR */ NULL, + /* SQUARE */ NULL, + /* NEG */ NULL, + /* EXP */ NULL, + /* LSTM_OVXLIB */ NULL, + /* PRE_PROCESS_TENSOR */ NULL, + /* HASHTABLE_LOOKUP */ NULL, + /* EMBEDDING_LOOKUP */ NULL, + /* LSH_PROJECTION */ NULL, + /* RNN*/ NULL, + /* CLIP */ NULL, + /* POST_PROCESS */ NULL, + /* PRE_PROCESS_GRAY */ NULL, + /* UNSTACK */ NULL, + /* PRE_PROCESS_RGB */ NULL, + /* PRE_PROCESS */ NULL, + /* ADDN */ NULL, + /* PRE_PROCESS_YUV420 */ NULL, + /* EXTRA_ENDING */ NULL, + /* GATHER */ NULL, + /* TILE */ NULL, + /* GROUPED_CONV2D */ NULL, + /* TOPK */ NULL, + /* PRE_PROCESS_BGRA */ NULL, + /* LOGICAL_NOT */ NULL, + /* SIN */ NULL, + /* LOG */ NULL, + /* ARGMIN */ NULL, + /* ROI_ALIGN */ NULL, + /* HEATMAP_MAX_KEYPOINT */ NULL, + /* AXIS_ALIGNED_BBOX_TRANSFORM */ NULL, + /* BOX_WITH_NMS_LIMIT */ NULL, + /* GENERATE_PROPOSALS */ NULL, + /* DETECTION_POSTPROCESS */ NULL, + /* RANDOM_MULTINOMIAL */ NULL, + /* LOG_SOFTMAX */ NULL, + /* RELU_KERAS */ NULL, + /* GRU_OVXLIB */ NULL, + /* GRUCELL_OVXLIB */ NULL, + /* UNIDIRECTIONAL_SEQUENCE_RNN */ NULL, + /* QUANTIZED_16BIT_LSTM */ NULL, + /* BIDIRECTIONAL_SEQUENCE_RNN */ NULL, + /* BIDIRECTIONAL_SEQUENCE_LSTM */ NULL, + /* RNNCELL_OVXLIB */ NULL, + /* SWISH */ NULL, + /* DEPTHWISE_CONV1D */ NULL, + /* GATHER_ND */ NULL, + /* CAST */ NULL, + /* LINEAR */ NULL, + /* BATCHNORM_SINGLE */ NULL, + /* MOMENTS */ NULL, + /* SQUEEZE */ NULL, + /* HARD_SIGMOID */ NULL, + /* MISH */ NULL, + /* EXPAND_BROADCAST */ NULL, + /* PRE_PROCESS_YUV444 */ NULL, + /* PRE_PROCESS_NV12 */ NULL, + /* SCATTER_ND */ NULL, + /* DECONVOLUTION1D */ NULL, +}; +_compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); + +void vsi_nn_GenGraphCCode + ( + vsi_nn_graph_t * graph, + const char * net_path, + const char * data_path + ) +{ + uint32_t i; + uint32_t j; + uint64_t sz; + uint64_t ofst; + vsi_nn_node_t * node; + vsi_nn_node_id_t node_id ; + vsi_nn_node_id_t * sorted_nodes; + vsi_nn_tensor_t * tensor; + vsi_nn_tensor_id_t tensor_id; + + if( NULL == graph ) + { + return; + } + _try_open_file( net_path, &s_net_file_hndl, "w" ); + _try_open_file( data_path, &s_dfile_hndl, "wb" ); + VSILOGI( "Write graph ..." 
); + _write_code( "\n#define load_data_to_tensor( tensor, ofst, size ) (0)\n" ); + _write_code( "vsi_nn_context_t ctx;" ); + _write_code( "vsi_nn_graph_t * graph;" ); + _write_code( "vsi_nn_node_t * node[%u];", graph->node_num ); + _write_code( "vsi_nn_tensor_id_t tensor[%u];", graph->tensor_num ); + _write_code( "vsi_nn_tensor_attr_t attr;"); + _write_code( "memset( &attr, 0, sizeof( attr ) );"); + _write_code( "ctx = vsi_nn_CreateContext();"); + _write_code( "graph = vsi_nn_CreateGraph( ctx, %u, %u );", + graph->tensor_num, graph->node_num ); + /* Write tensors */ + for( i = 0; i < graph->tensor_num; i++ ) + { + tensor_id = i; + tensor = vsi_nn_GetTensor( graph, tensor_id ); + if( NULL == tensor ) + { + continue; + } + _write_code( "attr.dim_num = %u;", tensor->attr.dim_num ); + _write_code( "attr.size[0] = %u;", tensor->attr.size[0] ); + _write_code( "attr.size[1] = %u;", tensor->attr.size[1] ); + _write_code( "attr.size[2] = %u;", tensor->attr.size[2] ); + _write_code( "attr.size[3] = %u;", tensor->attr.size[3] ); + _write_code( "attr.is_const = %d;", (int)tensor->attr.is_const ); + _write_code( "attr.vtl = %d;", (int)tensor->attr.vtl ); + _write_code( "attr.dtype.vx_type = %#x;", tensor->attr.dtype.vx_type ); + + ofst = 0; + sz = 0; + if( TRUE == tensor->attr.is_const ) + { + _try_pack_tensor_data( graph, tensor, &ofst, &sz ); + } + _write_code( "tensor[%u] = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL);", + tensor_id ); + if( sz > 0 ) + { + _write_code( "load_data_to_tensor( tensor[%u], %llu, %llu );", + tensor_id, ofst, sz ); + } + } + _write_code( "\n" ); + /* Write nodes */ + sorted_nodes = vsi_nn_SortGraphNode( graph ); + for( i = 0; i < graph->node_num; i++ ) + { + if( NULL != sorted_nodes ) + { + node_id = sorted_nodes[i]; + } + else + { + node_id = i; + } + node = vsi_nn_GetNode( graph, node_id ); + _write_code( "node[%u] = vsi_nn_AppendNode( graph, %#x, NULL );", + i, node->op ); + for( j = 0; j < node->input.num; j ++ ) + { + if( VSI_NN_TENSOR_ID_NA != node->input.tensors[j] ) + { + _write_code( "node[%u]->input.tensors[%d] = tensor[%u];", + i, j, node->input.tensors[j] ); + } + } + for( j = 0; j < node->output.num; j ++ ) + { + if( VSI_NN_TENSOR_ID_NA != node->output.tensors[j] ) + { + _write_code( "node[%u]->output.tensors[%d] = tensor[%u];", + i, j, node->output.tensors[j] ); + } + } + // write node params + if( node->op < _cnt_of_array( s_op_gen ) ) + { + if( NULL != s_op_gen[node->op] ) + { + s_op_gen[node->op]( i, node ); + } + } + _vx_param( i, node ); + _write_code( "\n" ); + } + + if( NULL != sorted_nodes ) + { + free( sorted_nodes ); + } + _try_close_file( &s_dfile_hndl ); + _try_close_file( &s_net_file_hndl ); +} /* vsi_nn_GenGraphCCode() */ + diff --git a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c new file mode 100644 index 0000000..845a790 --- /dev/null +++ b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c @@ -0,0 +1,281 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the 
following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include "utils/vsi_nn_constraint_check.h" + +#include +#include +#include + +#include "vsi_nn_node.h" +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" + +typedef struct _node_io_signature_t { + int count; + vsi_nn_type_e types[1]; +} node_io_signature_t; + +static const char* _get_dtype_name(vsi_nn_type_e type) +{ + switch(type) + { + case D_NONE: return "Optional"; + case D_I8: return "INT8"; + case D_I16: return "INT16"; + case D_I32: return "INT32"; + case D_I64: return "INT64"; + case D_U8: return "UINT8"; + case D_U16: return "UINT16"; + case D_U32: return "UINT32"; + case D_U64: return "UINT64"; + case D_F16: return "FLOAT16"; + case D_F32: return "FLOAT32"; + case D_F64: return "FLOAT64"; + case D_BF16: return "BFLOAT16"; + case D_BOOL8: return "BOOL8"; + default: + VSILOGE("Unknown data type: %d\n", type); + break; + } + + return NULL; +} + +static const char* _get_qtype_name(vsi_nn_qnt_type_e type) +{ + switch(type) + { + case VSI_NN_QNT_TYPE_NONE: return ""; + case VSI_NN_QNT_TYPE_DFP: return "DFP"; + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: return "ASYM"; + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: return "SYMM PC"; + default: + VSILOGE("Unknown quant type: %d\n", type); + break; + } + + return NULL; +} + +static node_io_signature_t* _get_op_signature + ( + vsi_nn_tensor_t** inputs, + int inputs_num, + vsi_nn_tensor_t** outputs, + int outputs_num, + const op_constraint_reg_type* op_constraint_reg + ) +{ + int i = 0; + int reg_io_count = op_constraint_reg->reg_input_num + + op_constraint_reg->reg_output_num; + node_io_signature_t* item = NULL; + + if((inputs_num + outputs_num) > reg_io_count) { + VSILOGW("Inputs/outputs count greater than registered inputs/outputs count: %d > %d", + (inputs_num + outputs_num), reg_io_count); + } + + item = malloc(sizeof(node_io_signature_t) + \ + (reg_io_count - 1) * sizeof(vsi_nn_type_e)); + item->count = inputs_num + outputs_num; + memset(&item->types[0], 0x00, reg_io_count * sizeof(vsi_nn_type_e)); + + inputs_num = vsi_nn_min(inputs_num, (int)op_constraint_reg->reg_input_num); + for(i = 0; i < inputs_num; i++) { + if(!inputs[i]) { + item->types[i] = VSI_NN_TYPE_NONE \ + | VSI_NN_QNT_TYPE_NONE << Q_SHIFT; + continue; + } + item->types[i] = inputs[i]->attr.dtype.vx_type \ + | inputs[i]->attr.dtype.qnt_type << Q_SHIFT; + } + + outputs_num = vsi_nn_min(outputs_num, (int)op_constraint_reg->reg_output_num); + for(i = 0; i < outputs_num; i++) { + if(!outputs[i]) { + item->types[op_constraint_reg->reg_input_num + i] = \ + VSI_NN_TYPE_NONE | VSI_NN_QNT_TYPE_NONE << Q_SHIFT; + continue; + } + item->types[op_constraint_reg->reg_input_num + i] = \ + outputs[i]->attr.dtype.vx_type | + outputs[i]->attr.dtype.qnt_type << Q_SHIFT; + } + + return 
item; +} + +vsi_bool is_const_tensor + ( + const vsi_nn_tensor_t* tensor + ) +{ + if(!tensor) { + return FALSE; + } + + return tensor->attr.is_const; +} + +vsi_bool validate_op_io_types + ( + vsi_nn_node_t* self, + vsi_nn_tensor_t** inputs, + int inputs_num, + vsi_nn_tensor_t** outputs, + int outputs_num, + const op_constraint_reg_type* op_constraint_reg, + const char* name + ) +{ + vsi_bool matched = FALSE; + + if(self && self->attr.enable_op_constraint_check) { + uint32_t i = 0; + + node_io_signature_t* sig = _get_op_signature(inputs, inputs_num, + outputs, outputs_num, op_constraint_reg); + + VSILOGD("Validate [%s]", name); + if(sig && op_constraint_reg && op_constraint_reg->types) { + for(i = 0; i < op_constraint_reg->io_types_item_count; i++) { + const uint8_t* curr = ((const uint8_t*)op_constraint_reg->types) \ + + op_constraint_reg->io_types_item_size * i; + if(!memcmp(curr, sig->types, op_constraint_reg->io_types_item_size)) { + matched = TRUE; + break; + } + } + } + + vsi_nn_safe_free(sig); + } else { + matched = TRUE; + } + + return matched; +} + +char* generate_op_io_types_desc + ( + vsi_nn_tensor_t** inputs, + int inputs_num, + vsi_nn_tensor_t** outputs, + int outputs_num + ) +{ + int i = 0; + int total_sz = 0; + int used_sz = 0; + char* desc = NULL; + + for(i = 0; i < inputs_num; i++) { + if(inputs[i]) { + total_sz += snprintf(NULL, 0, "%s %s, ", + _get_qtype_name(inputs[i]->attr.dtype.qnt_type), + _get_dtype_name(inputs[i]->attr.dtype.vx_type)); + } + } + for(i = 0; i < outputs_num; i++) { + if(outputs[i]) { + total_sz += snprintf(NULL, 0, "%s %s, ", + _get_qtype_name(outputs[i]->attr.dtype.qnt_type), + _get_dtype_name(outputs[i]->attr.dtype.vx_type)); + } + } + + total_sz += 1; /* terminator */ + desc = (char*)malloc(sizeof(char) * total_sz); + memset(desc, 0x00, sizeof(char) * total_sz); + + for(i = 0; i < inputs_num; i++) { + if(inputs[i]) { + used_sz += snprintf(desc + used_sz, total_sz - used_sz, "%s %s, ", + _get_qtype_name(inputs[i]->attr.dtype.qnt_type), + _get_dtype_name(inputs[i]->attr.dtype.vx_type)); + } + } + for(i = 0; i < outputs_num; i++) { + if(outputs[i]) { + used_sz += snprintf(desc + used_sz, total_sz - used_sz, "%s %s, ", + _get_qtype_name(outputs[i]->attr.dtype.qnt_type), + _get_dtype_name(outputs[i]->attr.dtype.vx_type)); + } + } + + if(used_sz >= 2) { + desc[used_sz - 2] = '\0'; + } + + return desc; +} + +void destroy_op_io_types_desc + ( + char* desc + ) +{ + if(desc) { + free(desc); + } +} + +void print_op_io_types + ( + const char* name, + const op_constraint_reg_type* op_constraint_reg + ) +{ + /* print supported types for statistics use */ + VSILOGI("Operation: %s", name); + (void)op_constraint_reg; + (void)_get_dtype_name; + (void)_get_qtype_name; +} + +vsi_bool is_item_in_array + ( + const void* item, + const void* items, + int item_size, + int item_count + ) +{ + int i = 0; + + if (item && items) { + for (;i < item_count; i++) { + if(0 == memcmp(item, (uint8_t*)items + i * item_size, item_size)) { + return TRUE; + } + } + } + + return FALSE; +} diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c new file mode 100644 index 0000000..75f686c --- /dev/null +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -0,0 +1,481 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the 
"Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_error.h" +#include "utils/vsi_nn_dtype_util_prv.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" + +#define DEF_DTYPE_CONVERT_NORMAL(SRC_NAME, SRC_DTYPE, DST_NAME, DST_DTYPE) \ +static inline void _convert_##SRC_NAME##_to_##DST_NAME \ + ( \ + const SRC_DTYPE * buffer, \ + size_t size, \ + DST_DTYPE * out_buffer \ + ) \ + { \ + uint32_t i; \ + for( i = 0; i < size; i ++ ) \ + { \ + out_buffer[i] = (DST_DTYPE)buffer[i]; \ + } \ + } +//DEF_DTYPE_CONVERT_NORMAL( bool8, int8_t, float, float ) +DEF_DTYPE_CONVERT_NORMAL( int8, int8_t, float, float ) +DEF_DTYPE_CONVERT_NORMAL( int16, int16_t, float, float ) +DEF_DTYPE_CONVERT_NORMAL( int32, int32_t, float, float ) +DEF_DTYPE_CONVERT_NORMAL( uint8, uint8_t, float, float ) +DEF_DTYPE_CONVERT_NORMAL( uint32, uint32_t, float, float ) +DEF_DTYPE_CONVERT_NORMAL( uint16, uint16_t, float, float ) +//DEF_DTYPE_CONVERT_NORMAL( float, float, bool8, int8_t ) +DEF_DTYPE_CONVERT_NORMAL( float, float, int8, int8_t ) +DEF_DTYPE_CONVERT_NORMAL( float, float, int16, int16_t ) +DEF_DTYPE_CONVERT_NORMAL( float, float, int32, int32_t ) +DEF_DTYPE_CONVERT_NORMAL( float, float, uint8, uint8_t ) +DEF_DTYPE_CONVERT_NORMAL( float, float, uint32, uint32_t ) +DEF_DTYPE_CONVERT_NORMAL( float, float, uint16, uint16_t ) +#undef DEF_DTYPE_CONVERT_NORMAL + +static inline void _convert_float16_to_float + ( + const vsi_float16 * buffer, + size_t size, + float * out_buffer + ) +{ + uint32_t i; + for( i = 0; i < size; i ++ ) + { + out_buffer[i] = fp16_to_fp32( (int16_t)buffer[i] ); + } +} /* _convert_float16_to_float */ + +static inline void _convert_float_to_float16 + ( + const float * buffer, + size_t size, + vsi_float16 * out_buffer + ) +{ + uint32_t i; + for( i = 0; i < size; i ++ ) + { + out_buffer[i] = (vsi_float16)fp32_to_fp16( buffer[i] ); + } +} /* _convert_float_to_float16 */ + +static inline void _convert_bfloat16_to_float + ( + const vsi_bfloat16 * buffer, + size_t size, + float * out_buffer + ) +{ + uint32_t i; + for( i = 0; i < size; i ++ ) + { + out_buffer[i] = bfp16_to_fp32( (int16_t)buffer[i] ); + } +} /* _convert_bfloat16_to_float */ + +static inline void _convert_float_to_bfloat16 + ( + const float * buffer, + size_t size, + vsi_bfloat16 * out_buffer + ) +{ + uint32_t i; + for( i = 0; i < size; i ++ ) + { + out_buffer[i] = (vsi_bfloat16)fp32_to_bfp16( buffer[i] ); + } +} /* _convert_float_to_bfloat16 */ + +#define DEF_DTYPE_CONVERT_QUANTIZE( SRC_NAME, SRC_DTYPE, ROUND, MIN, MAX ) \ + vsi_bool 
vsi_nn_dtype_convert_quantize_##SRC_NAME##_to_float \ + ( \ + const SRC_DTYPE * buffer, size_t size, \ + float scale, int32_t zero_point, \ + float * out_buffer \ + ) \ + { \ + uint32_t i; \ + if( !buffer || !out_buffer ) \ + { \ + return FALSE; \ + } \ + for( i = 0; i < size; i ++ ) \ + { \ + out_buffer[i] = (float)(((double)buffer[i] - (double)zero_point) * scale); \ + } \ + return TRUE; \ + } \ + vsi_bool vsi_nn_dtype_convert_float_to_quantize_##SRC_NAME \ + ( \ + const float * buffer, size_t size, \ + float scale, int32_t zero_point, \ + SRC_DTYPE * out_buffer \ + ) \ + { \ + uint32_t i; \ + if( !buffer || !out_buffer ) \ + { \ + return FALSE; \ + } \ + for( i = 0; i < size; i ++ ) \ + { \ + out_buffer[i] = (SRC_DTYPE)vsi_clamp(\ + ROUND( buffer[i] / scale ) + zero_point, \ + (double)MIN, (double)MAX ); \ + } \ + return TRUE; \ + } + +DEF_DTYPE_CONVERT_QUANTIZE( symm8, int8_t, vsi_rtne, SCHAR_MIN, SCHAR_MAX ) +DEF_DTYPE_CONVERT_QUANTIZE( symm16, int16_t, vsi_rtne, SHRT_MIN, SHRT_MAX ) +DEF_DTYPE_CONVERT_QUANTIZE( symm32, int32_t, vsi_rtne, INT_MIN, INT_MAX ) +DEF_DTYPE_CONVERT_QUANTIZE( symm64, int64_t, vsi_rtne, LLONG_MIN, LLONG_MAX ) +DEF_DTYPE_CONVERT_QUANTIZE( asymm8, uint8_t, vsi_rtne, 0, UCHAR_MAX ) +//DEF_DTYPE_CONVERT_QUANTIZE( asymm16, uint16_t, vsi_rtne, 0, USHRT_MAX ) +//DEF_DTYPE_CONVERT_QUANTIZE( asymm32, uint32_t, vsi_rtne, 0, UINT_MAX ) +#undef DEF_DTYPE_CONVERT_QUANTIZE + +vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm8_perchannel + ( + const float * buffer, size_t size, + const int32_t * shape, size_t rank, + const float * scale, size_t scale_size, + const int32_t * zero_point, size_t zero_point_size, + int32_t channel_dim, + int8_t * out_buffer + ) +{ + if( !buffer || !out_buffer ) + { + return FALSE; + } + VSI_ASSERT( FALSE ); + return TRUE; +} /* vsi_nn_dtype_convert_float_to_quantize_symm8_perchannel() */ + +vsi_bool vsi_nn_dtype_convert_quantize_symm8_perchannel_to_float + ( + const int8_t * buffer, size_t size, + const int32_t * shape, size_t rank, + const float * scale, size_t scale_size, + const int32_t * zero_point, size_t zero_point_size, + int32_t channel_dim, + float * out_buffer + ) +{ + if( !buffer || !out_buffer ) + { + return FALSE; + } + VSI_ASSERT( FALSE ); + return TRUE; +} /* vsi_nn_dtype_convert_quantize_symm8_perchannel_to_float() */ + +vsi_bool vsi_nn_dtype_convert_float_to_dtype + ( + const float * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + void * out_buffer + ) +{ + if( !buffer || !out_buffer ) + { + return FALSE; + } + switch( dtype ) + { + case I8: + case BOOL8: + _convert_float_to_int8( buffer, size, (int8_t*)out_buffer ); + break; + case I16: + _convert_float_to_int16( buffer, size, (int16_t*)out_buffer ); + break; + case I32: + _convert_float_to_int32( buffer, size, (int32_t*)out_buffer ); + break; + case U8: + _convert_float_to_uint8( buffer, size, (uint8_t*)out_buffer ); + break; + case U16: + _convert_float_to_uint16( buffer, size, (uint16_t*)out_buffer ); + break; + case U32: + _convert_float_to_uint32( buffer, size, (uint32_t*)out_buffer ); + break; + case F16: + _convert_float_to_float16( buffer, size, (vsi_float16*)out_buffer ); + break; + case BF16: + _convert_float_to_bfloat16( buffer, size, (vsi_bfloat16*)out_buffer ); + break; + default: + VSILOGE("Don't support convert float to dtype %d.", dtype); + return FALSE; + } + return TRUE; +} /* vsi_nn_dtype_convert_float_to_dtype() */ + +vsi_bool vsi_nn_dtype_convert_float_to_quantize_asymm + ( + const float * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + 
float scale, int32_t zero_point, + void * out_buffer + ) +{ + switch( dtype ) + { + case U8: + return vsi_nn_dtype_convert_float_to_quantize_asymm8( + buffer, size, scale, zero_point, (uint8_t*)out_buffer ); + default: + VSILOGE("Don't support convert float to asymm quant %d.", dtype); + break; + } + return FALSE; +} /* vsi_nn_dtype_convert_float_to_quantize_aysmm() */ + +vsi_bool vsi_nn_dtype_convert_float_to_quantize_dfp + ( + const float * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + int32_t fl, + void * out_buffer + ) +{ + float scale; + if( !buffer || !out_buffer ) + { + return FALSE; + } + scale = powf( 2.0f, (float)(-fl) ); + return vsi_nn_dtype_convert_float_to_quantize_symm( + buffer, size, dtype, scale, 0, out_buffer ); +} /* vsi_nn_dtype_convert_float_to_quantize_dfp() */ + +vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm + ( + const float * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + float scale, int32_t zero_point, + void * out_buffer + ) +{ + switch( dtype ) + { + case I8: + return vsi_nn_dtype_convert_float_to_quantize_symm8( + buffer, size, scale, zero_point, (int8_t*)out_buffer ); + case I16: + return vsi_nn_dtype_convert_float_to_quantize_symm16( + buffer, size, scale, zero_point, (int16_t*)out_buffer ); + case I32: + return vsi_nn_dtype_convert_float_to_quantize_symm32( + buffer, size, scale, zero_point, (int32_t*)out_buffer ); + case I64: + return vsi_nn_dtype_convert_float_to_quantize_symm64( + buffer, size, scale, zero_point, (int64_t*)out_buffer ); + default: + VSILOGE("Don't support convert float to symm quant %d.", dtype); + break; + } + return FALSE; +} /* vsi_nn_dtype_convert_float_to_quantize_symm() */ + +vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm_perchannel + ( + const float * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + const int32_t * shape, size_t rank, + const float * scale, size_t scale_size, + const int32_t * zero_point, size_t zero_point_size, + int32_t channel_dim, + void * out_buffer + ) +{ + switch( dtype ) + { + case I8: + vsi_nn_dtype_convert_float_to_quantize_symm8_perchannel( + buffer, size, shape, rank, + scale, scale_size, zero_point, zero_point_size, + channel_dim, (int8_t*)out_buffer ); + break; + default: + VSILOGE("Don't support convert float to symm perchannel quant %d.", + dtype); + return FALSE; + } + return TRUE; +} /* vsi_nn_dtype_convert_float_to_quantize_symm_perchannel() */ + +vsi_bool vsi_nn_dtype_convert_dtype_to_float + ( + const void * buffer, + size_t size, + vsi_nn_kernel_dtype_e dtype, + float * out_buffer + ) +{ + if( !buffer || !out_buffer ) + { + return FALSE; + } + switch( dtype ) + { + case I8: + case BOOL8: + _convert_int8_to_float( (const int8_t*)buffer, size, out_buffer ); + break; + case I16: + _convert_int16_to_float( (const int16_t*)buffer, size, out_buffer ); + break; + case I32: + _convert_int32_to_float( (const int32_t*)buffer, size, out_buffer ); + break; + case U8: + _convert_uint8_to_float( (const uint8_t*)buffer, size, out_buffer ); + break; + case U16: + _convert_uint16_to_float( (const uint16_t*)buffer, size, out_buffer ); + break; + case U32: + _convert_uint32_to_float( (const uint32_t*)buffer, size, out_buffer ); + break; + case F16: + _convert_float16_to_float( (const vsi_float16*)buffer, size, out_buffer ); + break; + case BF16: + _convert_bfloat16_to_float( (const vsi_bfloat16*)buffer, size, out_buffer ); + break; + default: + VSILOGE("Don't support convert dtype %d to float.", dtype); + return FALSE; + } + return TRUE; +} /* 
vsi_nn_dtype_convert_dtype_to_float() */ + +vsi_bool vsi_nn_dtype_convert_quantize_asymm_to_float + ( + const void * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + float scale, int32_t zero_point, + float * out_buffer + ) +{ + switch( dtype ) + { + case U8: + return vsi_nn_dtype_convert_quantize_asymm8_to_float( + (const uint8_t *)buffer, size, scale, zero_point, out_buffer ); + default: + VSILOGE("Don't support convert asymm quant %d to float.", dtype); + break; + } + return FALSE; +} /* vsi_nn_dtype_convert_quantize_aysmm_to_float() */ + +vsi_bool vsi_nn_dtype_convert_quantize_dfp_to_float + ( + const void * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + int32_t fl, + float * out_buffer + ) +{ + float scale; + if( !buffer || !out_buffer ) + { + return FALSE; + } + scale = powf( 2.0f, (float)(-fl) ); + return vsi_nn_dtype_convert_quantize_symm_to_float( + buffer, size, dtype, scale, 0, out_buffer ); +} /* vsi_nn_dtype_convert_quantize_dfp_to_float() */ + +vsi_bool vsi_nn_dtype_convert_quantize_symm_to_float + ( + const void * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + float scale, int32_t zero_point, + float * out_buffer + ) +{ + switch( dtype ) + { + case I8: + return vsi_nn_dtype_convert_quantize_symm8_to_float( + (const int8_t *)buffer, size, scale, zero_point, out_buffer ); + case I16: + return vsi_nn_dtype_convert_quantize_symm16_to_float( + (const int16_t *)buffer, size, scale, zero_point, out_buffer ); + case I32: + return vsi_nn_dtype_convert_quantize_symm32_to_float( + (const int32_t *)buffer, size, scale, zero_point, out_buffer ); + case I64: + return vsi_nn_dtype_convert_quantize_symm64_to_float( + (const int64_t *)buffer, size, scale, zero_point, out_buffer ); + default: + VSILOGE("Don't support convert symm quant %d to float.", dtype); + break; + } + return FALSE; +} /* vsi_nn_dtype_convert_quantize_symm_to_float() */ + +vsi_bool vsi_nn_dtype_convert_quantize_symm_perchannel_to_float + ( + const void * buffer, size_t size, + vsi_nn_kernel_dtype_e dtype, + const int32_t * shape, size_t rank, + const float * scale, size_t scale_size, + const int32_t * zero_point, size_t zero_point_size, + int32_t channel_dim, + float * out_buffer + ) +{ + switch( dtype ) + { + case I8: + vsi_nn_dtype_convert_quantize_symm8_perchannel_to_float( + (const int8_t*)buffer, size, shape, rank, + scale, scale_size, zero_point, zero_point_size, + channel_dim, out_buffer ); + break; + default: + VSILOGE("Don't support convert symm perchannel quant %d to float.", dtype); + return FALSE; + } + return TRUE; +} /* vsi_nn_dtype_convert_quantize_symm_perchannel_to_float() */ + diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c new file mode 100644 index 0000000..4b827d4 --- /dev/null +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c @@ -0,0 +1,571 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or 
substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_test.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" +#include "quantization/vsi_nn_asymmetric_affine.h" +#include "quantization/vsi_nn_dynamic_fixed_point.h" +#include "quantization/vsi_nn_perchannel_symmetric_affine.h" + +vsi_bool vsi_nn_TypeIsInteger + ( + const vsi_nn_type_e type + ) +{ + return type_is_integer(type); +} /* vsi_nn_TypeIsInteger() */ + +vsi_bool vsi_nn_TypeIsSigned + ( + const vsi_nn_type_e type + ) +{ + return type_is_signed(type); +} /* vsi_nn_TypeIsSigned() */ + +uint32_t vsi_nn_TypeGetBitWidth + ( + const vsi_nn_type_e type + ) +{ + uint32_t bw; + bw = 8 * vsi_nn_TypeGetBytes( type ); + if( type_is_signed( type ) ) + { + bw --; + } + return bw; +} /* vsi_nn_TypeGetBitWidth() */ + +int32_t vsi_nn_Fp32ToDFP + ( + const float in, + const int8_t fl, + const vsi_nn_type_e type + ) +{ + return fp32_to_dfp(in, fl, type); +} /* vsi_nn_Fp32ToDPF() */ + +float vsi_nn_DFPToFp32 + ( + const int32_t val, + const int8_t fl, + const vsi_nn_type_e type + ) +{ + return dfp_to_fp32(val, fl, type); +} /* vsi_nn_DFPToFp32() */ + +int32_t vsi_nn_Fp32ToAffine + ( + const float in, + const float scale, + const int32_t zero_point, + const vsi_nn_type_e type + ) +{ + return fp32_to_affine(in, scale, zero_point, type); +} /* vsi_nn_Fp32ToAffine() */ + +float vsi_nn_AffineToFp32 + ( + const int32_t val, + const float scale, + const int32_t zero_point, + const vsi_nn_type_e type + ) +{ + return affine_to_fp32(val, scale, zero_point, type); +} /* vsi_nn_AffineToFp32() */ + +uint16_t vsi_nn_Fp32ToFp16 + ( + float in + ) +{ + return fp32_to_fp16(in); +} /* vsi_nn_Fp32ToFp16() */ + +float vsi_nn_Fp16ToFp32 + ( + int16_t in + ) +{ + return fp16_to_fp32(in); +} /* vsi_nn_Fp16ToFp32() */ + +float vsi_nn_BFp16ToFp32 + ( + int16_t in + ) +{ + return bfp16_to_fp32(in); +} /* vsi_nn_Fp16ToFp32() */ + +uint16_t vsi_nn_Fp32ToBFp16 + ( + float in + ) +{ + return fp32_to_bfp16(in); +} /* vsi_nn_Fp32ToFp16() */ + + +vsi_status vsi_nn_IntegerConvert + ( + const void * src, + vsi_nn_type_e src_type, + void * dest, + vsi_nn_type_e dest_type + ) +{ + return integer_convert(src, src_type, dest, dest_type); +} /* vsi_nn_IntegerConvert() */ + +vsi_status vsi_nn_DtypeConvert + ( + uint8_t * src, + const vsi_nn_dtype_t * src_dtype, + uint8_t * dst, + const vsi_nn_dtype_t * dst_dtype + ) +{ + vsi_status status; + float data; + + data = 0.0f; + status = dtype_to_float32(src, &data, src_dtype); + if(status != VSI_SUCCESS) + { + VSILOGE("dtype data convert to float32 fail"); + return status; + } + status = float32_to_dtype(data, dst, dst_dtype); + if(status != VSI_SUCCESS) + { + VSILOGE("float32 data convert to dtype fail"); + return status; + } + return status; +} /* vsi_nn_DtypeConvert */ + +/* +* Deprated: Use vsi_nn_DtypeToFloat32() instead +*/ 
+vsi_status vsi_nn_DtypeToFp32 + ( + void * src, + float * dst, + uint32_t index, /* index to src buffer */ + const vsi_nn_dtype_t * src_dtype + ) +{ + uint8_t * ptr; + ptr = (uint8_t *)src; + + //VSILOGW("Deprecated API, use vsi_nn_DtypeToFloat32 instead."); + ptr += vsi_nn_TypeGetBytes( src_dtype->vx_type ) * index; + + return vsi_nn_DtypeToFloat32( ptr, dst, src_dtype ); +} /* vsi_nn_DtypeToFp32() */ + +/* +* Deprated: Use vsi_nn_Float32ToDtype() instead +*/ +vsi_status vsi_nn_Fp32toDtype + ( + float src, + void * dst, + uint32_t index, /* index to dst buffer */ + const vsi_nn_dtype_t * dst_dtype + ) +{ + uint8_t * ptr; + ptr = (uint8_t *)dst; + + //VSILOGW("Deprecated API, use vsi_nn_Float32ToDtype instead."); + ptr += vsi_nn_TypeGetBytes( dst_dtype->vx_type ) * index; + + return vsi_nn_Float32ToDtype( src, ptr, dst_dtype ); +} /* vsi_nn_Fp32toDtype */ + +vsi_status vsi_nn_DtypeToFloat32 + ( + uint8_t * src, + float * dst, + const vsi_nn_dtype_t * src_dtype + ) +{ + return dtype_to_float32(src, dst, src_dtype); +} /* vsi_nn_DtypeToFloat32() */ + +vsi_status vsi_nn_Float32ToDtype + ( + float src, + uint8_t * dst, + const vsi_nn_dtype_t * dst_dtype + ) +{ + return float32_to_dtype(src, dst, dst_dtype); +} /* vsi_nn_Float32ToDtype() */ + +int32_t vsi_nn_DtypeConvertRawData + ( + uint8_t * src, + int32_t src_bytes, + const vsi_nn_dtype_t * src_dtype, + uint8_t * dst, + int32_t dst_bytes, + const vsi_nn_dtype_t * dst_dtype + ) +{ + uint8_t * src_iter; + uint8_t * dst_iter; + int32_t count; + int32_t elements; + int32_t src_type_bytes; + int32_t dst_type_bytes; + int32_t target_bytes; + int32_t i; + vsi_status status; + count = 0; + if( NULL == src || NULL == dst || NULL == src_dtype ) + { + return count; + } + + src_type_bytes = vsi_nn_TypeGetBytes( src_dtype->vx_type ); + dst_type_bytes = vsi_nn_TypeGetBytes( dst_dtype->vx_type ); + elements = (int32_t)( src_bytes / src_type_bytes ); + target_bytes = dst_type_bytes * elements; + if( dst_bytes < target_bytes ) + { + VSILOGW("Wrong dest buffer size: %d, require: %d", dst_bytes, target_bytes); + return count; + } + src_iter = src; + dst_iter = dst; + for( i = 0; i < elements; i ++ ) + { + status = vsi_nn_DtypeConvert( src_iter, src_dtype, dst_iter, dst_dtype ); + if( VSI_FAILURE == status ) + { + break; + } + src_iter += src_type_bytes; + dst_iter += dst_type_bytes; + } + count = i; + return count; +} /* vsi_nn_DtypeConvertRawData() */ + +int32_t vsi_nn_DtypeConvertRawDataToFloat32 + ( + uint8_t * src, + int32_t src_bytes, + const vsi_nn_dtype_t * src_dtype, + float * dst, + int32_t dst_size + ) +{ + vsi_nn_dtype_t dst_dtype; + memset( &dst_dtype, 0, sizeof( vsi_nn_dtype_t ) ); + dst_dtype.vx_type = VSI_NN_TYPE_FLOAT32; + return vsi_nn_DtypeConvertRawData( + src, src_bytes, src_dtype, + (uint8_t *)dst, dst_size * sizeof( float ), &dst_dtype ); +} /*vsi_nn_DtypeConvertRawDataToFloat32()*/ + +int32_t vsi_nn_DtypeConvertFloat32ToRawData + ( + float * src, + int32_t src_size, + uint8_t * dst, + int32_t dst_bytes, + const vsi_nn_dtype_t * dst_dtype + ) +{ + vsi_nn_dtype_t src_dtype; + memset( &src_dtype, 0, sizeof( vsi_nn_dtype_t ) ); + src_dtype.vx_type = VSI_NN_TYPE_FLOAT32; + return vsi_nn_DtypeConvertRawData( + (uint8_t *)src, src_size * sizeof( float ), &src_dtype, + dst, dst_bytes, dst_dtype ); +} /*vsi_nn_DtypeConvertFloat32ToRawData()*/ + +uint32_t vsi_nn_TypeGetBytes + ( + const vsi_nn_type_e type + ) +{ + return type_get_bytes( type ); +} /* vsi_nn_TypeGetBytes() */ + +/* +* Deprecated: use vsi_nn_TypeGetBytes() insteatd. 
+*/ +uint32_t vsi_nn_GetTypeBytes + ( + const vsi_nn_type_e type + ) +{ + return type_get_bytes( type ); +} /* vsi_nn_GetTypeBytes() */ + +vsi_bool vsi_nn_QuantCheck + ( + vsi_nn_tensor_t *input, + vsi_nn_tensor_t *weight, + vsi_nn_tensor_t *bias + ) +{ + vsi_bool ret = TRUE; + vsi_nn_qnt_type_e input_qnt_type, weight_qnt_type; + vsi_nn_type_e input_dtype, weight_dtype; + vsi_nn_qnt_type_e qnt_type; + + input_qnt_type = input->attr.dtype.qnt_type; + input_dtype = input->attr.dtype.vx_type; + weight_qnt_type = weight->attr.dtype.qnt_type; + weight_dtype = weight->attr.dtype.vx_type; + + //do not check quant parammeters if types of input/weight is hybrid combinaton + if(input_dtype != weight_dtype || input_qnt_type != weight_qnt_type) + { + return ret; + } + + if(VSI_NN_TYPE_VDATA == weight->attr.dtype.vx_type) + { + return ret; + } + if(type_is_integer(input_dtype) == FALSE) + { + return ret; + } + + qnt_type = input->attr.dtype.qnt_type; + switch(qnt_type) + { + case VSI_NN_QNT_TYPE_DFP: + ret = vsi_nn_QuantDFPCheck(input, weight, bias); + if(ret == FALSE) + { + VSILOGE("input_fl[%d] + weight_fl[%d] != bias_fl[%d]", + input->attr.dtype.fl, + weight->attr.dtype.fl, + bias->attr.dtype.fl); + } + break; + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + if (weight->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + { + ret = vsi_nn_QuantAffinePerchannelCheck(input, weight, bias); + if(ret == FALSE) + { + VSILOGE("abs(input_scale * weight_scale - bias_scale) > 1e-5"); + } + } + else + { + ret = vsi_nn_QuantAffineCheck(input, weight, bias); + if(ret == FALSE) + { + VSILOGE("input_scale[%.12lf] * weight_scale[%.12lf] != bias_scale[%.12lf]", + input->attr.dtype.scale, + weight->attr.dtype.scale, + bias->attr.dtype.scale); + } + } + break; + default: + ret = FALSE; + break; + } + + return ret; +} /* vsi_nn_QuantCheck() */ + +vsi_bool vsi_nn_DtypeCompare + ( + vsi_nn_dtype_t *dtype0, + vsi_nn_dtype_t *dtype1 + ) +{ + if(NULL == dtype0 || NULL == dtype1) + { + return FALSE; + } + + if(dtype0->vx_type != dtype1->vx_type || dtype0->qnt_type != dtype1->qnt_type) + { + return FALSE; + } + if(dtype0->qnt_type == VSI_NN_QNT_TYPE_DFP) + { + if(dtype0->fl != dtype1->fl) + { + return FALSE; + } + } + else if(dtype0->qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + const float diff = (float)1e-5; + if(dtype0->zero_point != dtype1->zero_point) + { + return FALSE; + } + if(vsi_nn_float_compare(dtype0->scale, dtype1->scale, diff) == FALSE) + { + return FALSE; + } + } + + return TRUE; +} /* vsi_nn_DtypeCompare */ + +vsi_status vsi_nn_vxConvertTensorToFloat32Data + ( + vx_context context, + vx_tensor tensor, + vsi_nn_tensor_attr_t *attr, + float *f32_data, + uint32_t f32_data_sz + ) +{ + vsi_status status; + uint8_t *data; + uint32_t elements,stride; + vsi_nn_tensor_attr_t tensor_attr, *_attr; + + data = NULL; + if(NULL == context || NULL == tensor || NULL == f32_data) + { + return VSI_FAILURE; + } + if(NULL == attr) + { + memset(&tensor_attr, 0, sizeof(tensor_attr)); + status = vsi_nn_vxGetTensorAttr(tensor, &tensor_attr); + TEST_CHECK_STATUS(status, final); + _attr = &tensor_attr; + } + else + { + _attr = attr; + } + + status = VSI_FAILURE; + elements = vsi_nn_vxGetTensorElementNum(_attr); + stride = vsi_nn_TypeGetBytes(_attr->dtype.vx_type); + if(f32_data_sz != elements * sizeof(float)) + { + VSILOGE("buffer sz %u != required sz %u", f32_data_sz, elements * sizeof(float)); + return status; + } + data = vsi_nn_vxCopyTensorToData(context, tensor, _attr); + TEST_CHECK_PTR(data, final); + + 
vsi_nn_DtypeConvertRawDataToFloat32(data, + elements * stride, + (const vsi_nn_dtype_t *)&_attr->dtype, + f32_data, + elements); + + status = VSI_SUCCESS; +final: + if(data) + { + free(data); + data = NULL; + } + return status; +} /* vsi_nn_vxConvertTensorToFloat32Data() */ + +vsi_status vsi_nn_vxConvertFloat32DataToTensor + ( + vx_context context, + vx_tensor tensor, + vsi_nn_tensor_attr_t *attr, + float *f32_data, + uint32_t f32_data_sz + ) +{ + vsi_status status; + uint8_t *data; + uint32_t elements,stride; + vsi_nn_tensor_attr_t tensor_attr, *_attr; + + data = NULL; + if(NULL == context || NULL == tensor || NULL == f32_data) + { + return VSI_FAILURE; + } + if(NULL == attr) + { + memset(&tensor_attr, 0, sizeof(tensor_attr)); + status = vsi_nn_vxGetTensorAttr(tensor, &tensor_attr); + TEST_CHECK_STATUS(status, final); + _attr = &tensor_attr; + } + else + { + _attr = attr; + } + + status = VSI_FAILURE; + elements = vsi_nn_vxGetTensorElementNum(_attr); + stride = vsi_nn_GetTypeBytes(_attr->dtype.vx_type); + if(f32_data_sz != elements * sizeof(float)) + { + VSILOGE("buffer sz %u != required sz %u", f32_data_sz, elements * sizeof(float)); + return status; + } + + data = (uint8_t *)malloc(elements * stride); + TEST_CHECK_PTR(data, final); + memset(data, 0, sizeof(uint8_t) * elements * stride); + vsi_nn_DtypeConvertFloat32ToRawData(f32_data, + elements, + data, + elements * vsi_nn_TypeGetBytes(_attr->dtype.vx_type), + (const vsi_nn_dtype_t *)&_attr->dtype); + + status = vsi_nn_vxCopyDataToTensor(context, tensor, _attr, data); +final: + if(data) + { + free(data); + data = NULL; + } + return status; +} /* vsi_nn_vxConvertFloat32DataToTensor() */ + diff --git a/src/tim/vx/internal/src/utils/vsi_nn_hashmap.c b/src/tim/vx/internal/src/utils/vsi_nn_hashmap.c new file mode 100644 index 0000000..b576fc1 --- /dev/null +++ b/src/tim/vx/internal/src/utils/vsi_nn_hashmap.c @@ -0,0 +1,485 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include "utils/vsi_nn_link_list.h" +#include "utils/vsi_nn_hashmap.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_types.h" + +typedef struct _tree +{ + struct _tree * left; + struct _tree * right; + const char * hash_key; + void * data_ptr; +} _binary_tree_t; + +static _binary_tree_t * _new_node + ( void ) +{ + _binary_tree_t * node; + + node = (_binary_tree_t *)malloc( + sizeof( _binary_tree_t ) ); + + memset( node, 0, sizeof( _binary_tree_t ) ); + return node; +} /* _new_node() */ + +static _binary_tree_t * _min_node + ( + _binary_tree_t * node + ) +{ + _binary_tree_t * cur = node; + + while( NULL != cur->left ) + { + cur = cur->left; + } + + return cur; +} /* _min_node() */ + +static _binary_tree_t * _del_node_by_key + ( + _binary_tree_t * root, + const char * hash_key + ) +{ + if( NULL == root ) + { + return root; + } + if( strcmp( hash_key, root->hash_key) < 0 ) + { + root->left = _del_node_by_key( root->left, hash_key ); + } + else if( strcmp( hash_key, root->hash_key) > 0 ) + { + root->right = _del_node_by_key( root->right, hash_key ); + } + else + { + if( NULL == root->left ) + { + _binary_tree_t * node = root->right; + free( root ); + return node; + } + else if( NULL == root->right ) + { + _binary_tree_t * node = root->left; + free( root ); + return node; + } + else + { + _binary_tree_t * node = _min_node( root->right ); + root->hash_key = node->hash_key; + + /* copy data */ + root->data_ptr = node->data_ptr; + + root->right = _del_node_by_key(root->right, node->hash_key ); + } + } + + return root; +} /* _del_node_by_key() */ + +static _binary_tree_t * _move_left + ( + _binary_tree_t * node + ) +{ + _binary_tree_t * left = NULL; + if( NULL != node ) + { + left = node->left; + } + return left; +} /* _move_left() */ + +static _binary_tree_t * _move_right + ( + _binary_tree_t * node + ) +{ + _binary_tree_t * right = NULL; + if( NULL != node ) + { + right = node->right; + } + return right; +} /* _move_right() */ + +static _binary_tree_t * _find_loc + ( + _binary_tree_t * node, + const char * hash_key, + int * val + ) +{ + int tmp; + _binary_tree_t * next; + _binary_tree_t * loc; + loc = NULL; + tmp = 0; + while( NULL != node ) + { + if( strcmp( node->hash_key, hash_key ) > 0 ) + { + next = _move_left( node ); + tmp = -1; + } + else if( strcmp( node->hash_key, hash_key ) < 0 ) + { + next = _move_right( node ); + tmp = 1; + } + else + { + loc = node; + tmp = 0; + break; + } + if( NULL != next ) + { + node = next; + } + else + { + loc = node; + break; + } + } + if( NULL != val ) + { + *val = tmp; + } + return loc; +} /* _find_loc_to_insert() */ + + +void _binary_tree_remove_node + ( + _binary_tree_t ** root, + const char * hash_key + ) +{ + if( !root || NULL == *root ) + { + return; + } + + *root = _del_node_by_key( *root, hash_key ); +} /* _binary_tree_remove_node() */ + +void _binary_tree_new_node + ( + _binary_tree_t ** root, + const char * hash_key, + void * data + ) +{ + int val; + _binary_tree_t * iter; + _binary_tree_t * node; + + if( NULL == root ) + { + return; + } + + val = 0; + iter = *root; + iter = _find_loc( iter, hash_key, &val ); + if( NULL != iter && strcmp( hash_key, iter->hash_key) == 0 ) + { + VSILOGD( "Key %s has been registered, update value.", hash_key ); + // Update node + iter->data_ptr = data; + return; + } + + /* New node */ + node = _new_node(); + if( NULL != node ) + { + node->hash_key = hash_key; + 
node->data_ptr = data; + } + else + { + VSILOGW( "Malloc binary tree node fail." ); + } + + /* Insert node */ + if( NULL == iter ) + { + if( *root == NULL) + { + *root = node; + } + else + { + // Root must be NULL. + VSI_ASSERT( FALSE ); + free( node ); + } + } + else + { + if( val > 0 ) + { + iter->right = node; + } + else if( val < 0 ) + { + iter->left = node; + } + else + { + VSILOGE( "Hash collision!" ); + if( node ) + { + free( node ); + node = NULL; + } + } + } +} /* _binary_tree_new_node() */ + +void * _binary_tree_get_node + ( + _binary_tree_t ** root, + const char * hash_key + ) +{ + void * data; + _binary_tree_t * iter; + + if( NULL == root ) + { + return NULL; + } + data = NULL; + iter = *root; + iter = _find_loc( iter, hash_key, NULL ); + if( NULL != iter && strcmp( hash_key, iter->hash_key ) == 0 ) + { + data = iter->data_ptr; + } + return data; +} /* _binary_tree_get_node() */ + +static void _free_item( vsi_nn_hashmap_item_t * item ) +{ + free( item->hash_key ); + free( item ); +} /* _free_item() */ + +vsi_nn_hashmap_t * vsi_nn_hashmap_create() +{ + vsi_nn_hashmap_t * map; + map = (vsi_nn_hashmap_t*)malloc( sizeof(vsi_nn_hashmap_t) ); + if( NULL == map ) + { + VSILOGE("Out of memory, create hashmap fail."); + return NULL; + } + memset( map, 0, sizeof( vsi_nn_hashmap_t ) ); + return map; +} /* vsi_nn_hashmap_create() */ + +void vsi_nn_hashmap_release + ( vsi_nn_hashmap_t ** map_ptr ) +{ + if( map_ptr && *map_ptr ) + { + vsi_nn_hashmap_clear( *map_ptr ); + free( *map_ptr ); + *map_ptr = NULL; + } +} /* vsi_nn_hashmap_release() */ + +void vsi_nn_hashmap_clear( vsi_nn_hashmap_t * map ) +{ + if( map ) + { + vsi_nn_hashmap_item_t * iter = map->items; + vsi_nn_hashmap_item_t * next = NULL; + + while( NULL != iter ) + { + next = (vsi_nn_hashmap_item_t *)vsi_nn_LinkListNext( + (vsi_nn_link_list_t *)iter ); + _binary_tree_remove_node( (_binary_tree_t**)&(map->values), iter->hash_key ); + vsi_nn_LinkListRemoveNode( (vsi_nn_link_list_t **)&map->items, + (vsi_nn_link_list_t *)iter ); + _free_item( (vsi_nn_hashmap_item_t*)iter ); + iter = next; + } + } +} + +void* vsi_nn_hashmap_get + ( + const vsi_nn_hashmap_t * map, + const char * key + ) +{ + const char * hash_key = key; + if( NULL == map ) + { + return NULL; + } + return _binary_tree_get_node( (_binary_tree_t**)&map->values, hash_key ); +} /* vsi_nn_hashmap_get() */ + +void vsi_nn_hashmap_add + ( + vsi_nn_hashmap_t * map, + const char * key, + void * value + ) +{ + vsi_nn_hashmap_item_t * iter; + size_t key_size = 0; + const char * hash_key = key; + if( NULL == map ) + { + return; + } + if( NULL == key ) + { + return; + } + iter = map->items; + while( NULL != iter ) + { + if( strcmp( iter->hash_key, hash_key ) == 0 ) + { + break; + } + iter = (vsi_nn_hashmap_item_t *)vsi_nn_LinkListNext( + (vsi_nn_link_list_t *)iter ); + } + if( NULL == iter ) + { + iter = (vsi_nn_hashmap_item_t *)vsi_nn_LinkListNewNode( + sizeof( vsi_nn_hashmap_item_t ), NULL ); + key_size = strlen( hash_key ) + 1; + iter->hash_key = (char*)malloc( sizeof(char) * key_size ); + VSI_ASSERT( iter->hash_key ); + memcpy( iter->hash_key, hash_key, key_size ); + vsi_nn_LinkListPushStart( (vsi_nn_link_list_t **)&map->items, + (vsi_nn_link_list_t *)iter ); + map->size += 1; + } + iter->data = value; + _binary_tree_new_node( (_binary_tree_t**)&map->values, iter->hash_key, value ); +} /* vsi_nn_hashmap_add() */ + +void vsi_nn_hashmap_remove + ( + vsi_nn_hashmap_t * map, + const char * key + ) +{ + vsi_nn_hashmap_item_t * iter; + const char * hash_key = key; + if( NULL == map ) + 
{ + return; + } + _binary_tree_remove_node( (_binary_tree_t**)&(map->values), hash_key ); + iter = map->items; + while( NULL != iter ) + { + if( strcmp( iter->hash_key, hash_key ) == 0 ) + { + break; + } + iter = (vsi_nn_hashmap_item_t *)vsi_nn_LinkListNext( + (vsi_nn_link_list_t *)iter ); + } + if( NULL != iter ) + { + vsi_nn_LinkListRemoveNode( (vsi_nn_link_list_t **)&map->items, + (vsi_nn_link_list_t *)iter ); + _free_item( (vsi_nn_hashmap_item_t*)iter ); + map->size -= 1; + } +} /* vsi_nn_hashmap_remove() */ + +vsi_bool vsi_nn_hashmap_has + ( + vsi_nn_hashmap_t * map, + const char * key + ) +{ + const char * hash_key = key; + if( NULL == map ) + { + return FALSE; + } + if( NULL == _binary_tree_get_node( (_binary_tree_t**)&map->values, hash_key ) ) + { + return FALSE; + } + else + { + return TRUE; + } +} /* vsi_nn_hashmap_has() */ + +size_t vsi_nn_hashmap_get_size( const vsi_nn_hashmap_t * map ) +{ + if( !map ) + { + return 0; + } + return map->size; +} /* vsi_nn_hashmap_get_size() */ + +vsi_nn_hashmap_item_t* vsi_nn_hashmap_iter + ( vsi_nn_hashmap_t* map, vsi_nn_hashmap_item_t* item ) +{ + if( !map ) + { + return NULL; + } + if( !item ) + { + return map->items; + } + return (vsi_nn_hashmap_item_t *)vsi_nn_LinkListNext((vsi_nn_link_list_t *)item ); +} /* vsi_nn_hashmap_iter() */ + diff --git a/src/tim/vx/internal/src/utils/vsi_nn_limits.c b/src/tim/vx/internal/src/utils/vsi_nn_limits.c new file mode 100644 index 0000000..ad796a2 --- /dev/null +++ b/src/tim/vx/internal/src/utils/vsi_nn_limits.c @@ -0,0 +1,35 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include "vsi_nn_types.h" +#include "utils/vsi_nn_dtype_util_prv.h" + +void vsi_nn_TypeGetRange + ( + vsi_nn_type_e type, + double * max_range, + double * min_range + ) +{ + type_get_range(type, max_range, min_range); +} /* vsi_nn_TypeGetRange() */ diff --git a/src/tim/vx/internal/src/utils/vsi_nn_link_list.c b/src/tim/vx/internal/src/utils/vsi_nn_link_list.c new file mode 100644 index 0000000..053e6e9 --- /dev/null +++ b/src/tim/vx/internal/src/utils/vsi_nn_link_list.c @@ -0,0 +1,417 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_link_list.h" +#include "vsi_nn_types.h" + +static vsi_nn_link_list_t * _walk_to_start + ( + vsi_nn_link_list_t * root + ); + +static vsi_nn_link_list_t * _walk_to_end + ( + vsi_nn_link_list_t * root + ); + +static vsi_nn_link_list_t * _move_next + ( + vsi_nn_link_list_t * root + ); + +static vsi_nn_link_list_t * _move_prev + ( + vsi_nn_link_list_t * root + ); + +static vsi_nn_link_list_t * _walk_to_start + ( + vsi_nn_link_list_t * root + ) +{ + if( NULL != root ) + { + while( NULL != root->prev ) + { + root = root->prev; + } + } + return root; +} /* _walk_to_start() */ + +static vsi_nn_link_list_t * _walk_to_end + ( + vsi_nn_link_list_t * root + ) +{ + if( NULL != root ) + { + while( NULL != root->next ) + { + root = root->next; + } + } + return root; +} /* _walk_to_end() */ + +static vsi_nn_link_list_t * _move_next + ( + vsi_nn_link_list_t * root + ) +{ + if( NULL != root ) + { + root = root->next; + } + return root; +} /* _move_next() */ + +static vsi_nn_link_list_t * _move_prev + ( + vsi_nn_link_list_t * root + ) +{ + if( NULL != root ) + { + root = root->prev; + } + return root; +} /* _move_prev() */ + +vsi_nn_link_list_t * vsi_nn_LinkListPopStart + ( + vsi_nn_link_list_t ** root + ) +{ + vsi_nn_link_list_t * node; + vsi_nn_link_list_t * self; + + if( NULL == root || NULL == *root ) + { + return NULL; + } + + node = NULL; + self = *root; + + self = _walk_to_start( self ); + node = self; + + self = _move_next( self ); + + if( NULL != self ) + { + self->prev = NULL; + } + if( NULL != node ) + { + node->next = NULL; + } + + *root = self; + return node; +} /* vsi_nn_LinkListPopStart() */ + 
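
The hunks above and below add an intrusive doubly linked list: vsi_nn_LinkListNewNode allocates and zero-fills a node, vsi_nn_LinkListPushStart/vsi_nn_LinkListPushEnd splice a chain onto either end, and vsi_nn_LinkListPopStart/vsi_nn_LinkListPopEnd detach a single node for the caller. For orientation only, here is a minimal usage sketch that is not part of the diff; my_item_t and list_demo() are hypothetical names, and the sketch assumes the vsi_nn_link_list_t header is the first member of the payload struct, the same convention the hashmap code in this commit relies on when it casts vsi_nn_hashmap_item_t to vsi_nn_link_list_t.

```c
/* Hypothetical usage sketch, not part of the commit. my_item_t and
 * list_demo() are made-up names; the only assumption beyond the code in
 * this file is that the vsi_nn_link_list_t header sits at the start of
 * the payload struct so the pointer casts below are valid. */
#include <stdio.h>
#include <stdlib.h>
#include "utils/vsi_nn_link_list.h"

typedef struct
{
    vsi_nn_link_list_t link; /* must stay the first member */
    int payload;
} my_item_t;

static void list_demo( void )
{
    vsi_nn_link_list_t * list = NULL;
    my_item_t * item = NULL;
    int i;

    /* Allocate three zero-initialized nodes and append them to the tail. */
    for( i = 0; i < 3; i++ )
    {
        item = (my_item_t *)vsi_nn_LinkListNewNode( sizeof( my_item_t ), NULL );
        item->payload = i;
        vsi_nn_LinkListPushEnd( &list, (vsi_nn_link_list_t *)item );
    }
    printf( "nodes in list: %u\n", vsi_nn_LinkListGetNodeNumber( list ) );

    /* Detach the head node; ownership moves to the caller. */
    item = (my_item_t *)vsi_nn_LinkListPopStart( &list );
    printf( "popped payload: %d\n", item->payload );
    free( item );

    /* Free the remaining nodes; no per-node deinit callback is needed here. */
    vsi_nn_LinkListDeinit( list, NULL );
}
```

Note that the push and pop helpers first walk to the relevant end of the chain with _walk_to_start/_walk_to_end, so each operation is O(n) in the chain length rather than O(1); the list is a simple ownership record, not a performance-critical container.
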
+vsi_nn_link_list_t * vsi_nn_LinkListPopEnd + ( + vsi_nn_link_list_t ** root + ) +{ + vsi_nn_link_list_t * node; + vsi_nn_link_list_t * self; + + if( NULL == root || NULL == *root ) + { + return NULL; + } + + node = NULL; + self = *root; + + self = _walk_to_end( self ); + node = self; + + self = _move_prev( self ); + + if( NULL != self ) + { + self->next = NULL; + } + if( NULL != node ) + { + node->prev = NULL; + } + + *root = self; + return node; +} /* vsi_nn_LinkListPopEnd() */ + +void vsi_nn_LinkListPushStart + ( + vsi_nn_link_list_t ** root, + vsi_nn_link_list_t * nodes + ) +{ + vsi_nn_link_list_t * self; + if( NULL == root || NULL == nodes ) + { + return; + } + if( NULL == *root ) + { + *root = nodes; + } + else if( NULL != nodes ) + { + self = *root; + self = _walk_to_start( self ); + nodes = _walk_to_end( nodes ); + nodes->next = self; + self->prev = nodes; + self = _walk_to_start( self ); + *root = self; + } +} /* vsi_nn_LinkListPushStart() */ + +void vsi_nn_LinkListPushEnd + ( + vsi_nn_link_list_t ** root, + vsi_nn_link_list_t * nodes + ) +{ + vsi_nn_link_list_t * self; + if( NULL == root || NULL == nodes ) + { + return; + } + if( NULL == *root ) + { + *root = nodes; + } + else if( NULL != nodes ) + { + self = *root; + self = _walk_to_end( self ); + nodes = _walk_to_start( nodes ); + nodes->prev = self; + self->next = nodes; + self = _walk_to_start( self ); + *root = self; + } +} /* vsi_nn_LinkListPushEnd() */ + +vsi_nn_link_list_t * vsi_nn_LinkListNext + ( + vsi_nn_link_list_t * iter + ) +{ + return _move_next( iter ); +} /* vsi_nn_LinkListNext() */ + +vsi_nn_link_list_t * vsi_nn_LinkListNewNode + ( + size_t sz, + vsi_nn_link_list_init_t init + ) +{ + vsi_nn_link_list_t *node = (vsi_nn_link_list_t *)malloc(sz); + memset(node, 0, sz); + + if(init) + { + init(node); + } + + return node; +} /* vsi_nn_LinkListNewNode() */ + +void vsi_nn_LinkListRemoveNode + ( + vsi_nn_link_list_t ** root, + vsi_nn_link_list_t * node + ) +{ + vsi_nn_link_list_t * iter; + iter = *root; + iter = _walk_to_start( iter ); + while( NULL != iter ) + { + if( iter == node ) + { + break; + } + iter = _move_next( iter ); + } + if( NULL != iter ) + { + if( iter == *root ) + { + if( NULL != iter->prev ) + { + *root = iter->prev; + } + else if( NULL != iter->next ) + { + *root = iter->next; + } + else + { + *root = NULL; + } + } + if( NULL != iter->prev ) + { + iter->prev->next = iter->next; + } + if( NULL != iter->next ) + { + iter->next->prev = iter->prev; + } + } +} /* vsi_nn_LinkListRemoveNode() */ + +void vsi_nn_LinkListDeinit + ( + vsi_nn_link_list_t * root, + vsi_nn_link_list_deinit_t deinit + ) +{ + vsi_nn_link_list_t *tmp = NULL; + + while(root) + { + tmp = (vsi_nn_link_list_t *)vsi_nn_LinkListPopStart( &root ); + if(tmp) + { + if(deinit) + { + deinit(tmp); + } + free(tmp); + tmp = NULL; + } + } +} /* vsi_nn_LinkListDeinit() */ + +vsi_nn_link_list_t *vsi_nn_LinkListGetIndexNode + ( + vsi_nn_link_list_t * root, + uint32_t index + ) +{ + uint32_t n; + vsi_nn_link_list_t *iter; + if(NULL == root) + { + return NULL; + } + + n = 0; + iter = _walk_to_start(root); + while (iter) + { + if(n == index) + { + return iter; + } + n++; + iter = _move_next(iter); + } + return NULL; +} /* vsi_nn_LinkListGetIndexNode() */ + +void vsi_nn_LinkListDelIndexNode + ( + vsi_nn_link_list_t ** root, + uint32_t index + ) +{ + uint32_t n; + vsi_nn_link_list_t *iter; + if(NULL == root || NULL == *root) + { + return ; + } + + n = 0; + iter = _walk_to_start(*root); + while (iter) + { + if(n == index) + { + vsi_nn_link_list_t *del = 
iter; + if(iter->prev == NULL && iter->next == NULL) /* Only one node */ + { + *root = NULL; + } + else if(iter->prev == NULL && iter->next != NULL) /* head */ + { + iter = _move_next(iter); + iter->prev = NULL; + *root = iter; + } + else if(iter->prev != NULL && iter->next == NULL) /* tail */ + { + iter = _move_prev(iter); + iter->next = NULL; + } + else + { + iter->prev->next = iter->next; + iter->next->prev = iter->prev; + iter = _move_next(iter); + } + free(del); + return ; + } + n++; + iter = _move_next(iter); + } +} + +uint32_t vsi_nn_LinkListGetNodeNumber + ( + vsi_nn_link_list_t * root + ) +{ + uint32_t n; + vsi_nn_link_list_t *iter; + if(NULL == root) + { + return 0; + } + + n = 0; + iter = _walk_to_start(root); + while (iter) + { + iter = _move_next(iter); + n++; + } + + return n; +} diff --git a/src/tim/vx/internal/src/utils/vsi_nn_map.c b/src/tim/vx/internal/src/utils/vsi_nn_map.c new file mode 100644 index 0000000..b046f5e --- /dev/null +++ b/src/tim/vx/internal/src/utils/vsi_nn_map.c @@ -0,0 +1,143 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include "utils/vsi_nn_link_list.h" +#include "utils/vsi_nn_binary_tree.h" +#include "utils/vsi_nn_map.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_types.h" + +void vsi_nn_MapInit + ( + vsi_nn_map_t * map + ) +{ + if( NULL == map ) + { + return; + } + memset( map, 0, sizeof( vsi_nn_map_t ) ); +} /* vsi_nn_MapInit() */ + +void * vsi_nn_MapGet + ( + vsi_nn_map_t * map, + vsi_nn_map_key_t key + ) +{ + if( NULL == map ) + { + return NULL; + } + return vsi_nn_BinaryTreeGetNode( &map->values, key ); +} /* vsi_nn_MapGet() */ + +void vsi_nn_MapAdd + ( + vsi_nn_map_t * map, + vsi_nn_map_key_t key, + void * value + ) +{ + vsi_nn_map_key_list_t * key_iter; + if( NULL == map ) + { + return; + } + vsi_nn_BinaryTreeNewNode( &map->values, key, value ); + key_iter = map->keys; + while( NULL != key_iter ) + { + if( key_iter->val == key ) + { + break; + } + key_iter = (vsi_nn_map_key_list_t *)vsi_nn_LinkListNext( + (vsi_nn_link_list_t *)key_iter ); + } + if( NULL == key_iter ) + { + key_iter = (vsi_nn_map_key_list_t *)vsi_nn_LinkListNewNode( + sizeof( vsi_nn_map_key_list_t ), NULL ); + key_iter->val = key; + vsi_nn_LinkListPushStart( (vsi_nn_link_list_t **)&map->keys, + (vsi_nn_link_list_t *)key_iter ); + map->size += 1; + } +} /* vsi_nn_MapAdd() */ + +void vsi_nn_MapRemove + ( + vsi_nn_map_t * map, + vsi_nn_map_key_t key + ) +{ + vsi_nn_map_key_list_t * key_iter; + if( NULL == map ) + { + return; + } + vsi_nn_BinaryTreeRemoveNode( &map->values, key ); + key_iter = map->keys; + while( NULL != key_iter ) + { + if( key_iter->val == key ) + { + break; + } + key_iter = (vsi_nn_map_key_list_t *)vsi_nn_LinkListNext( + (vsi_nn_link_list_t *)key_iter ); + } + if( NULL != key_iter ) + { + vsi_nn_LinkListRemoveNode( (vsi_nn_link_list_t **)&map->keys, + (vsi_nn_link_list_t *)key_iter ); + free( key_iter ); + map->size -= 1; + } +} /* vsi_nn_MapRemove() */ + +vsi_bool vsi_nn_MapHasKey + ( + vsi_nn_map_t * map, + vsi_nn_map_key_t key + ) +{ + if( NULL == map ) + { + return FALSE; + } + if( NULL == vsi_nn_BinaryTreeGetNode( &map->values, key ) ) + { + return FALSE; + } + else + { + return TRUE; + } +} /* vsi_nn_MapHasKey() */ + diff --git a/src/tim/vx/internal/src/utils/vsi_nn_math.c b/src/tim/vx/internal/src/utils/vsi_nn_math.c new file mode 100644 index 0000000..19350a8 --- /dev/null +++ b/src/tim/vx/internal/src/utils/vsi_nn_math.c @@ -0,0 +1,431 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include "vsi_nn_tensor.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_map.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" + +static void _compute_stride + ( + uint32_t * shape, + uint32_t dim_num, + uint32_t * stride + ); + +static void _compute_stride + ( + uint32_t * shape, + uint32_t dim_num, + uint32_t * stride + ) +{ + int i; + uint32_t s; + s = 1; + for( i = dim_num - 1; i >= 0; i -- ) + { + stride[i] = s; + s *= shape[i]; + } +} /* _compute_stride() */ + +void vsi_nn_Transpose + ( + uint8_t * dst, + uint8_t * data, + uint32_t * shape, + uint32_t dim_num, + uint32_t * perm, + vsi_nn_type_e type + ) +{ + uint32_t i; + uint32_t i_dst; + uint32_t i_org; + uint32_t i_t; + uint32_t size; + uint32_t unit_bytes; + uint32_t org_stride[VSI_NN_MAX_DIM_NUM]; + uint32_t dst_stride[VSI_NN_MAX_DIM_NUM]; + uint32_t dst_shape[VSI_NN_MAX_DIM_NUM]; + + if( NULL == data || NULL == dst || NULL == shape || NULL == perm + || 0 == dim_num || dim_num > VSI_NN_MAX_DIM_NUM ) + { + return; + } + if( 1 == dim_num ) + { + VSILOGW( "Transpose error, incorrect dim %d", dim_num ); + return; + } + for( i = 0; i < dim_num; i ++ ) + { + if( perm[i] >= dim_num ) + { + VSILOGW( "Incorrect perm %d", perm[i] ); + return; + } + dst_shape[i] = shape[perm[i]]; + } + unit_bytes = vsi_nn_GetTypeBytes( type ); + _compute_stride( shape, dim_num, org_stride ); + _compute_stride( dst_shape, dim_num, dst_stride ); + size = vsi_nn_ShapeProduct( shape, dim_num ); + for( i_dst = 0; i_dst < size; i_dst ++ ) + { + i_org = 0; + i_t = i_dst; + for( i = 0; i < dim_num; i ++ ) + { + i_org += ( i_t / dst_stride[i] ) * org_stride[perm[i]]; + i_t %= dst_stride[i]; + } + memcpy( &dst[i_dst * unit_bytes], &data[i_org * unit_bytes], unit_bytes ); + //dst[i_dst] = data[i_org]; + } +} /* vsi_nn_Transpose() */ + +void vsi_nn_Permute + ( + uint8_t * dst, + uint8_t * data, + uint32_t * shape, + uint32_t dim_num, + uint32_t * perm, + vsi_nn_type_e type + ) +{ + uint32_t unit_bytes, i; + uint32_t org_stride[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t dst_stride[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t dst_shape[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t dim_stack[VSI_NN_MAX_DIM_NUM] = {0}; + uint8_t * in_addr_stack[VSI_NN_MAX_DIM_NUM] = {NULL}; + uint8_t * out_addr_stack[VSI_NN_MAX_DIM_NUM] = {NULL}; + uint8_t * in_addr_tmp = NULL; + uint8_t * out_addr_tmp = NULL; + uint32_t current = 0; + vsi_bool back = FALSE; + uint32_t layer = dim_num - 1; + + if( NULL == data || NULL == dst || NULL == shape || NULL == perm + || 0 == dim_num || dim_num > VSI_NN_MAX_DIM_NUM ) + { + return; + } + if( 1 == dim_num ) + { + VSILOGW( "Permute error, incorrect dim %d", dim_num ); + return; + } + + for( i = 0; i < dim_num; i ++ ) + { + if( perm[i] >= dim_num ) + { + VSILOGW( "Incorrect perm %d", perm[i] ); + return; + } + dst_shape[i] = shape[perm[i]]; + } + unit_bytes = vsi_nn_GetTypeBytes( type ); + vsi_nn_GetStrideSizeBySize( shape, dim_num, type, org_stride ); + vsi_nn_GetStrideSizeBySize( dst_shape, dim_num, type, dst_stride ); + + in_addr_tmp = data; + out_addr_tmp = dst; + + for (;;) + { + in_addr_stack[current] = in_addr_tmp; 
+ out_addr_stack[current] = out_addr_tmp; + + if (layer == 1) + { + uint32_t x, y; + uint8_t* new_out_addr = out_addr_tmp; + for (y = 0; y < shape[perm[1]]; y++) + { + for (x = 0; x < shape[perm[0]]; x++) + { + uint8_t* new_in_addr = in_addr_tmp + (y * org_stride[perm[1]] + x * org_stride[perm[0]]); + memcpy(new_out_addr, new_in_addr, unit_bytes); + new_out_addr += unit_bytes; + } + } + + if (!current) break; + current--; + layer++; + back = TRUE; + } + else if (!back) + { + current++; + layer--; + } + else + { + dim_stack[current]++; + if (dim_stack[current] < shape[perm[layer]]) + { + in_addr_tmp += org_stride[perm[layer]]; + out_addr_tmp += dst_stride[layer]; + back = FALSE; + } + else + { + dim_stack[current] = 0; + if (!current) break; + current--; + layer++; + in_addr_tmp = in_addr_stack[current]; + out_addr_tmp = out_addr_stack[current]; + } + } + } +} /* vsi_nn_Permute() */ + +void vsi_nn_SqueezeShape + ( + uint32_t * shape, + uint32_t * dim_num + ) +{ + int i; + int origin_count; + int count; + int start; + count = *dim_num; + origin_count = count; + if( 1 == count ) + { + return; + } + start = 0; + for( i = 0; i < count; i ++ ) + { + if( 1 == shape[i] ) + { + continue; + } + else if( i > start ) + { + memmove( &shape[start], &shape[i], (count - i) * sizeof( uint32_t ) ); + count -= i - start; + start += i - start; + } + else + { + start = i + 1; + } + } + *dim_num = count; + memset( &shape[count], 0, sizeof( uint32_t ) * ( origin_count - count ) ); +} /* vsi_nn_SqueezeShape() */ + +uint32_t vsi_nn_ShapeProduct + ( + uint32_t * shape, + uint32_t dim_num + ) +{ + uint32_t i; + uint32_t res; + res = 1; + for ( i = 0; i < dim_num; i++ ) + { + res *= shape[i]; + } + return res; +} /* vsi_nn_ShapeProduct() */ + +void vsi_nn_InvertShape + ( + uint32_t * in, + uint32_t dim_num, + uint32_t * out + ) +{ + uint32_t i; + for ( i = 0; i < dim_num; i++ ) + { + out[i] = in[dim_num - 1 - i]; + } +} /* vsi_nn_InvertShape() */ + +void vsi_nn_InvertPermuteShape + ( + uint32_t * in, + uint32_t dim_num, + uint32_t * out + ) +{ + uint32_t i; + for ( i = 0; i < dim_num; i++ ) + { + out[in[i]] = i; + } +} /* vsi_nn_InvertPermuteShape() */ + +double vsi_nn_Rint + ( + double x + ) +{ + return vsi_rint(x); +} /* vsi_nn_Rint() */ + +// Implement the Philox algorithm to generate random numbers in parallel. +// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3. +// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf + +// This source code only implement philox_4x32_10 algorithm. 
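Being counter-based, the generator is seeded once with two 32-bit key words and then emits four 32-bit outputs per ten-round pass, which the wrapper functions below expose as a simple buffer-filling API. A usage sketch under those assumptions (the seed values and buffer length of 16 are arbitrary):

```c
#include <stdint.h>

/* Illustrative driver for the Philox wrappers defined below: seed the key,
 * draw raw 32-bit words, then rescale them to uniform floats in [0, 1). */
static void philox_usage_sketch( void )
{
    uint32_t raw[16];
    float    uniform[16];

    /* low / high halves of the 64-bit seed */
    vsi_nn_random_init_for_philox_4x32_10( 0x12345678u, 0x9abcdef0u );
    vsi_nn_random_generate_by_philox_4x32_10( raw, 16 );
    vsi_nn_random_uniform_transform( raw, uniform, 16 );
}
```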
+// ---------------------philox_4x32_10 algorithm beginning------------- +#ifndef PHILOX_W32_0 +#define PHILOX_W32_0 ((uint32_t)0x9E3779B9) +#endif +#ifndef PHILOX_W32_1 +#define PHILOX_W32_1 ((uint32_t)0xBB67AE85) +#endif + +#ifndef PHILOX_M4x32_0 +#define PHILOX_M4x32_0 ((uint32_t)0xD2511F53) +#endif +#ifndef PHILOX_M4x32_1 +#define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57) +#endif + +struct r123array4x32{ + uint32_t v[4]; +}; + +struct r123array2x32 { + uint32_t v[2]; +}; + +typedef struct r123array4x32 philox4x32_ctr_t; +typedef struct r123array2x32 philox4x32_key_t; +typedef struct r123array2x32 philox4x32_ukey_t; + +uint32_t mulhilo32(uint32_t a, uint32_t b, uint32_t* hip) +{ + uint64_t product = ((uint64_t)a)*((uint64_t)b); + *hip = product>>32; + return (uint32_t)product; +} + +philox4x32_key_t philox4x32keyinit(philox4x32_ukey_t uk) +{ + return uk; +} + +struct r123array2x32 _philox4x32bumpkey(struct r123array2x32 key) +{ + key.v[0] += PHILOX_W32_0; + key.v[1] += PHILOX_W32_1; + return key; +} + +struct r123array4x32 _philox4x32round(struct r123array4x32 ctr, struct r123array2x32 key) +{ + uint32_t hi0; + uint32_t hi1; + uint32_t lo0 = mulhilo32(PHILOX_M4x32_0, ctr.v[0], &hi0); + uint32_t lo1 = mulhilo32(PHILOX_M4x32_1, ctr.v[2], &hi1); + struct r123array4x32 out = {{hi1^ctr.v[1]^key.v[0], lo1, + hi0^ctr.v[3]^key.v[1], lo0}}; + return out; +} + +philox4x32_ctr_t philox4x32_R(uint32_t R, philox4x32_ctr_t ctr, philox4x32_key_t key) +{ + uint32_t i; + for (i = 0; i < R; i++) + { + if (i != 0) + { + key = _philox4x32bumpkey(key); + } + ctr = _philox4x32round(ctr, key); + } + return ctr; +} + +philox4x32_ctr_t g_ctr; +philox4x32_key_t g_key; + +void vsi_nn_random_init_for_philox_4x32_10 + ( + uint32_t low, + uint32_t high + ) +{ + philox4x32_ukey_t uk; + uk.v[0] = low; + uk.v[1] = high; + g_key = philox4x32keyinit(uk); +} + +void vsi_nn_random_generate_by_philox_4x32_10 + ( + uint32_t *random_buf, + uint32_t len + ) +{ + uint32_t i; + for (i = 0; i < len / 4; i++) + { + g_ctr = philox4x32_R(10, g_ctr, g_key); + memcpy(&(random_buf[i * 4]), &g_ctr, 4 * sizeof(uint32_t)); + } + i = len % 4; + if (i) + { + g_ctr = philox4x32_R(10, g_ctr, g_key); + memcpy(&(random_buf[(len / 4) * 4]), &g_ctr, i * sizeof(uint32_t)); + } +} +// ---------------------philox_4x32_10 algorithm end------------------- + +void vsi_nn_random_uniform_transform + ( + uint32_t *random_buf, + float *uniform_buf, + uint32_t len + ) +{ + float rand_max = (float)(pow(2.0,32)); + uint32_t i; + for (i = 0; i < len; i++) + { + uniform_buf[i] = random_buf[i] / rand_max; + } +} diff --git a/src/tim/vx/internal/src/utils/vsi_nn_shape_util.c b/src/tim/vx/internal/src/utils/vsi_nn_shape_util.c new file mode 100644 index 0000000..6a4be8d --- /dev/null +++ b/src/tim/vx/internal/src/utils/vsi_nn_shape_util.c @@ -0,0 +1,77 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include "vsi_nn_log.h" +#include "utils/vsi_nn_shape_util.h" + +void vsi_nn_shape_get_stride + ( + const int32_t * shape, + size_t rank, + size_t * out_stride + ) +{ + uint32_t i; + if( !shape || !out_stride ) + { + return; + } + + out_stride[0] = 1; + for( i = 1; i < rank; i ++ ) + { + out_stride[i] = shape[i - 1] * out_stride[i - 1]; + } +} /* vsi_nn_shape_get_stride() */ + +size_t vsi_nn_shape_get_size + ( + const int32_t * shape, + size_t rank + ) +{ + size_t size = 0; + uint32_t i; + if( !shape ) + { + return size; + } + size = 1; + for( i = 0; i < rank; i ++ ) + { + if( shape[i] > 0 ) + { + size *= shape[i]; + } + else + { + VSILOGE("Got invalid dim: %d at %d.", shape[i], i); + size = 0; + break; + } + } + return size; +} /* vsi_nn_shape_get_size() */ + diff --git a/src/tim/vx/internal/src/utils/vsi_nn_tensor_op.c b/src/tim/vx/internal/src/utils/vsi_nn_tensor_op.c new file mode 100644 index 0000000..b40e755 --- /dev/null +++ b/src/tim/vx/internal/src/utils/vsi_nn_tensor_op.c @@ -0,0 +1,347 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include + +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" + +static void _compute_stride + ( + uint32_t * shape, + uint32_t dim_num, + uint32_t * stride + ) +{ + uint32_t i; + uint32_t s; + s = 1; + for( i = 0; i < dim_num; i ++ ) + { + stride[i] = s; + s *= shape[i]; + } +} /* _compute_stride() */ + +vsi_nn_tensor_t* vsi_nn_Concat + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t** tensors, + uint32_t tensor_num, + uint32_t axis + ) +{ + uint32_t i, j; + int32_t k; + uint8_t* buffer = NULL; + uint8_t* tmp = NULL; + size_t total_bytes = 0; + size_t tensor_size = 0; + size_t offset = 0, src = 0, dst = 0; + uint32_t* strides = NULL; + uint32_t* dst_strides = NULL; + uint32_t type_bytes = 0; + vsi_nn_tensor_attr_t output_attr; + vsi_nn_tensor_t* tensor_out = NULL; + // Validate inputs + if( tensor_num < 2 || !graph ) + { + return NULL; + } + for( i = 0; i < tensor_num; i ++ ) + { + if( !tensors[i] ) + { + VSILOGW("Concat tensor %u is null.", i); + return NULL; + } + } + memset( &output_attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + memcpy( &output_attr.dtype, &tensors[0]->attr.dtype, sizeof(vsi_nn_dtype_t) ); + memcpy( output_attr.size, tensors[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM ); + output_attr.dim_num = tensors[0]->attr.dim_num; + + for( i = 1; i < tensor_num; i ++ ) + { + if( tensors[0]->attr.dim_num != tensors[i]->attr.dim_num ) + { + VSILOGW("Concat tensor dim number mismatch."); + return NULL; + } + for( j = 0; j < tensors[0]->attr.dim_num; j ++) + { + if( j == axis ) + { + continue; + } + if( tensors[0]->attr.size[j] != tensors[i]->attr.size[j] ) + { + vsi_nn_PrintTensor(tensors[0], 0); + vsi_nn_PrintTensor(tensors[i], i); + VSILOGW("Concat tensor shapes mismatch."); + return NULL; + } + } + output_attr.size[axis] += tensors[i]->attr.size[axis]; + } + total_bytes = vsi_nn_GetTensorSize( output_attr.size, output_attr.dim_num, + output_attr.dtype.vx_type ); + buffer = (uint8_t*)malloc( total_bytes ); + strides = (uint32_t*)malloc( sizeof(uint32_t) * tensors[0]->attr.dim_num ); + dst_strides = (uint32_t*)malloc( sizeof(uint32_t) * tensors[0]->attr.dim_num ); + if (!buffer || !strides || !dst_strides) + { + VSILOGW("Out of memroy."); + goto concat_error; + } + type_bytes = vsi_nn_GetTypeBytes( output_attr.dtype.vx_type ); + _compute_stride(output_attr.size, output_attr.dim_num, dst_strides); + offset = 0; + for( i = 0; i < tensor_num; i ++ ) + { + tmp = (uint8_t*)vsi_nn_ConvertTensorToData( graph, tensors[i] ); + tensor_size = vsi_nn_GetElementNum( tensors[i] ); + if( !tmp ) + { + VSILOGW("Read tensor %u fail.", i); + goto concat_error; + } + _compute_stride(tensors[i]->attr.size, tensors[i]->attr.dim_num, strides); + for( j = 0; j < tensor_size; j ++ ) + { + src = j; + dst = 0; + for( k = tensors[0]->attr.dim_num - 1; k >= 0; k -- ) + { + dst += ( src / strides[k] ) * dst_strides[k]; + src %= strides[k]; + } + dst += offset; + src = j; + memcpy( &buffer[dst * type_bytes], &tmp[src * type_bytes], type_bytes ); + } + free(tmp); + offset += strides[axis] * tensors[i]->attr.size[axis]; + } + tensor_out = vsi_nn_CreateTensorFromData( graph, buffer, &output_attr ); + +concat_error: + if( buffer ) + { + free(buffer); + } + if( strides ) + { + free(strides); + } + if( 
dst_strides ) + { + free(dst_strides); + } + return tensor_out; +} + +vsi_nn_tensor_t* vsi_nn_ConvertTensorDtype + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor, + const vsi_nn_dtype_t* dst_dtype + ) +{ + vsi_status status = VSI_SUCCESS; + uint32_t i = 0, src_stride = 0, dst_stride = 0; + uint32_t sz = 0; + uint8_t* src_buf = NULL; + uint32_t dst_buf_sz = 0; + uint8_t* dst_buf = NULL; + vsi_nn_tensor_attr_t dst_attr; + vsi_nn_tensor_t* dst_tensor = NULL; + + if( NULL == graph || NULL == tensor || NULL == dst_dtype ) + { + return NULL; + } + + sz = vsi_nn_GetElementNum( tensor ); + src_stride = vsi_nn_TypeGetBytes( tensor->attr.dtype.vx_type ); + dst_stride = vsi_nn_TypeGetBytes( dst_dtype->vx_type ); + dst_buf_sz = sz * dst_stride; + + dst_buf = (uint8_t *)malloc( dst_buf_sz ); + if( NULL != dst_buf ) + { + src_buf = vsi_nn_ConvertTensorToData( graph, tensor ); + if( NULL != src_buf ) + { + for( i = 0; i < sz; i++ ) + { + status = vsi_nn_DtypeConvert( &src_buf[src_stride * i], + &tensor->attr.dtype, &dst_buf[dst_stride * i], dst_dtype ); + if(status != VSI_SUCCESS) + { + break; + } + } + + if( VSI_SUCCESS == status ) + { + memcpy( &dst_attr, &tensor->attr, sizeof( dst_attr ) ); + memcpy( &dst_attr.dtype, dst_dtype, sizeof( dst_attr.dtype ) ); + dst_tensor = vsi_nn_CreateTensorFromData( graph, dst_buf, &dst_attr ); + } + } + } + + if( src_buf ) + { + free( src_buf ); + src_buf = NULL; + } + if( dst_buf ) + { + free( dst_buf ); + dst_buf = NULL; + } + + return dst_tensor; +} /* vsi_nn_ConvertTensorDtype() */ + +vsi_nn_tensor_t* vsi_nn_TensorAdd + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t** tensors, + uint32_t tensor_num, + vsi_nn_tensor_attr_t output_attr + ) +{ +#define MAX_TENSOR_NUM 16 + vsi_status status = VSI_SUCCESS; + uint32_t i, j; + uint8_t* buffer[MAX_TENSOR_NUM] = {NULL}; + uint8_t* tmp = NULL; + size_t total_bytes = 0; + size_t elements = 0; + uint32_t type_bytes = 0; + uint32_t out_bytes = 0; + float data = 0.0f, sum = 0.0f; + vsi_nn_tensor_t* tensor_out = NULL; + + // Validate inputs + if( tensor_num < 2 || !graph || tensor_num > MAX_TENSOR_NUM) + { + return NULL; + } + for( i = 0; i < tensor_num; i ++ ) + { + if( !tensors[i] ) + { + VSILOGE("Tensor %u is null(TensorAdd).", i); + return NULL; + } + } + for( i = 1; i < tensor_num; i ++ ) + { + if( tensors[0]->attr.dim_num != tensors[i]->attr.dim_num ) + { + VSILOGE("Tensor dim number mismatch(TensorAdd)."); + return NULL; + } + for( j = 0; j < tensors[0]->attr.dim_num; j ++) + { + if( tensors[0]->attr.size[j] != tensors[i]->attr.size[j] ) + { + vsi_nn_PrintTensor(tensors[0], 0); + vsi_nn_PrintTensor(tensors[i], i); + VSILOGE("Tensor shapes mismatch(TensorAdd)."); + return NULL; + } + } + } + for ( i = 0; i < tensor_num; i++ ) + { + buffer[i] = vsi_nn_ConvertTensorToData(graph, tensors[i]); + if ( !buffer[i] ) + { + VSILOGE("Convert tensor to data failed."); + goto error; + } + } + + elements = vsi_nn_GetElementNum( tensors[0] ); + out_bytes = vsi_nn_TypeGetBytes( output_attr.dtype.vx_type ); + total_bytes = vsi_nn_GetTensorSize( output_attr.size, output_attr.dim_num, + output_attr.dtype.vx_type ); + tmp = (uint8_t*)malloc( total_bytes ); + if ( !tmp ) + { + VSILOGE("Out of memroy."); + goto error; + } + for ( i = 0; i < elements; i++ ) + { + sum = 0.0f; + for ( j = 0; j < tensor_num; j++ ) + { + type_bytes = vsi_nn_TypeGetBytes( tensors[j]->attr.dtype.vx_type ); + status = vsi_nn_DtypeToFloat32(&(buffer[j][type_bytes * i]), &data, &tensors[j]->attr.dtype); + if ( status != VSI_SUCCESS ) + { + VSILOGE("Convert data 
failed."); + goto error; + } + sum += data; + } + status = vsi_nn_Float32ToDtype(sum, &tmp[out_bytes * i], &output_attr.dtype); + if ( status != VSI_SUCCESS ) + { + VSILOGE("Convert data failed."); + goto error; + } + } + tensor_out = vsi_nn_CreateTensorFromData( graph, tmp, &output_attr ); + +#undef MAX_TENSOR_NUM +error: + for ( i = 0; i < tensor_num; i++ ) + { + if ( buffer[i] ) + { + free(buffer[i]); + } + } + if( tmp ) + { + free(tmp); + } + + return tensor_out; +} /* vsi_nn_ConstTensorAdd() */ \ No newline at end of file diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c new file mode 100644 index 0000000..1482fc1 --- /dev/null +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -0,0 +1,1313 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#include +#else +#include +#include +#include +#endif + +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" + +typedef struct _vx_status_desc_t +{ + vx_status status; + const char* desc; +} vx_status_desc_t; + +static vx_status_desc_t const vx_status_desc[] = +{ + { VX_STATUS_MIN /* (-25) */, "The lower bound of status codes in VX. Used for bounds checks only." }, + { VX_ERROR_REFERENCE_NONZERO /* (-24) */, "An operation did not complete due to a" + " reference count being non-zero." }, + { VX_ERROR_MULTIPLE_WRITERS /* (-23) */, "The graph has more than one node outputting" + " to the same data object. This is an invalid graph structure." }, + { VX_ERROR_GRAPH_ABANDONED /* (-22) */, "The graph is stopped due to an error or a callback that abandoned" + " execution." }, + { VX_ERROR_GRAPH_SCHEDULED /* (-21) */, "The supplied graph already has been scheduled and may be currently" + " executing." }, + { VX_ERROR_INVALID_SCOPE /* (-20) */, "The supplied parameter is from another scope and cannot be used" + " in the current scope." }, + { VX_ERROR_INVALID_NODE /* (-19) */, "The supplied node could not be created." 
}, + { VX_ERROR_INVALID_GRAPH /* (-18) */, "The supplied graph has invalid connections (cycles)." }, + { VX_ERROR_INVALID_TYPE /* (-17) */, "The supplied type parameter is incorrect." }, + { VX_ERROR_INVALID_VALUE /* (-16) */, "The supplied parameter has an incorrect value." }, + { VX_ERROR_INVALID_DIMENSION /* (-15) */, "The supplied parameter is too big or too small in dimension." }, + { VX_ERROR_INVALID_FORMAT /* (-14) */, "The supplied parameter is in an invalid format." }, + { VX_ERROR_INVALID_LINK /* (-13) */, "The link is not possible as specified. The parameters are" + " incompatible." }, + { VX_ERROR_INVALID_REFERENCE /* (-12) */, "The reference provided is not valid." }, + { VX_ERROR_INVALID_MODULE /* (-11) */, "The module does not contain the entry point." }, + { VX_ERROR_INVALID_PARAMETERS /* (-10) */, "The supplied parameter information does not match the" + " kernel contract." }, + { VX_ERROR_OPTIMIZED_AWAY /* (-9) */, "The object refered to has been optimized out of existence." }, + { VX_ERROR_NO_MEMORY /* (-8) */, "An internal or implicit allocation failed. Typically catastrophic." + " After detection, deconstruct the context." }, + { VX_ERROR_NO_RESOURCES /* (-7) */, "An internal or implicit resource can not be acquired (not memory)." + " This is typically catastrophic. After detection, deconstruct" + " the context." }, + { VX_ERROR_NOT_COMPATIBLE /* (-6) */, "The attempt to link two parameters together failed due" + " to type incompatibilty." }, + { VX_ERROR_NOT_ALLOCATED /* (-5) */, "The parameter must be allocated by the system. " }, + { VX_ERROR_NOT_SUFFICIENT /* (-4) */, "The given graph has failed verification due to an insufficient" + " number of required parameters, which cannot be automatically" + " created. Typically this indicates required atomic parameters." }, + { VX_ERROR_NOT_SUPPORTED /* (-3) */, "The requested set of parameters produce a configuration that cannot" + " be supported. " }, + { VX_ERROR_NOT_IMPLEMENTED /* (-2) */, "The requested kernel is missing. " }, + { VX_FAILURE /* (-1) */, "A generic error code, used when no other describes the error." 
}, + { VX_SUCCESS /* (0) */, "Success" }, +}; +/* Check whether enum value changed */ +_compiler_assert(VX_ERROR_NOT_IMPLEMENTED == -2, VX_STATUS_VALUE_CHANGED); +_compiler_assert(VX_ERROR_INVALID_PARAMETERS == -10, VX_STATUS_VALUE_CHANGED); +_compiler_assert(VX_ERROR_INVALID_GRAPH == -18, VX_STATUS_VALUE_CHANGED); +_compiler_assert(VX_STATUS_MIN == -25, VX_STATUS_VALUE_CHANGED); + +static const int16_t vx_status_desc_cnt = _cnt_of_array( vx_status_desc ); + +static uint32_t _compute_stride_rounding + ( + uint32_t out, + uint32_t stride, + vsi_nn_round_type_e rounding + ) +{ + if( VSI_NN_ROUND_CEIL == rounding ) + { + out = ( out + stride - 1 ) / stride; + } + else + { + out = out / stride; + } + return out; +} + +static uint32_t _compute_padding + ( + uint32_t in_size, + uint32_t ksize, + uint32_t stride, + uint32_t dilation_rate, + uint32_t out_size + ) +{ + uint32_t effective_ksize; + int32_t padding; + effective_ksize = (ksize - 1) * dilation_rate + 1; + padding = (out_size - 1) * stride + effective_ksize - in_size; + return vsi_nn_max(padding, 0); +} /* _compute_padding() */ + +uint8_t * vsi_nn_LoadBinaryData + ( + const char * filename, + uint32_t * sz + ) +{ + uint8_t * data; + uint32_t fsize; + size_t cnt; + FILE * fp; + + fp = fopen( filename, "rb" ); + if( NULL == fp ) + { + return NULL; + } + fseek( fp, 0L, SEEK_END ); + fsize = (uint32_t)ftell( fp ); + fseek( fp, 0L, SEEK_SET ); + data = (uint8_t *)malloc( fsize ); + cnt = 0; + if( NULL == data ) + { + VSILOGE( "Malloc %d memory fail.", fsize ); + } + else + { + while( (uint32_t)cnt < fsize ) + { + cnt += fread( &data[cnt], 1, fsize, fp ); + if( cnt == 0 ) + { + break; + } + } + VSILOGW( "Read %d bytes from file %s.", (uint32_t)cnt, filename ); + } + fclose( fp ); + if( NULL != sz ) + { + *sz = (uint32_t)cnt; + } + return data; +} /* vsi_nn_LoadBinaryData() */ + +uint32_t vsi_nn_GetStrideSize + ( + vsi_nn_tensor_attr_t * attr, + uint32_t * stride + ) +{ + + if( NULL == attr || NULL == stride ) + { + return 0; + } + + return vsi_nn_GetStrideSizeBySize(attr->size, attr->dim_num, attr->dtype.vx_type, stride); +} /* vsi_nn_GetStrideSize() */ + +uint32_t vsi_nn_GetStrideSizeBySize + ( + uint32_t * size, + uint32_t dim_num, + vsi_nn_type_e type, + uint32_t * stride + ) +{ + uint32_t total_bytes; + uint32_t i; + + if( NULL == size || NULL == stride ) + { + return 0; + } + + stride[0] = vsi_nn_GetTypeBytes( type ); + total_bytes = stride[0]; + for( i = 1; i < dim_num; i ++ ) + { + stride[i] = size[i - 1] * stride[i - 1]; + total_bytes *= size[i]; + } + total_bytes *= size[0]; + for( i = dim_num; i < VSI_NN_MAX_DIM_NUM; i ++ ) + { + stride[i] = total_bytes; + } + return total_bytes; +} /* vsi_nn_GetStrideSizeBySize() */ + +uint32_t vsi_nn_GetTotalBytesBySize + ( + uint32_t * size, + uint32_t dim_num, + vsi_nn_type_e type + ) +{ + return vsi_nn_ShapeProduct( size, dim_num ) * vsi_nn_GetTypeBytes( type ); +} /* vsi_nn_GetTotalBytesBySize() */ + +float vsi_nn_DataAsFloat32 + ( + uint8_t * data, + vsi_nn_type_e type + ) +{ + float val; + uint32_t *p = (uint32_t*)(&val); + int16_t fp16; + + *p = 0xFFFFFFFF; + switch( type ) + { + case VSI_NN_TYPE_BOOL8: + val = (float)((int8_t*)data)[0]; + break; + case VSI_NN_TYPE_INT8: + val = (float)((int8_t*)data)[0]; + break; + case VSI_NN_TYPE_UINT8: + val = (float)data[0]; + break; + case VSI_NN_TYPE_INT16: + val = (float)( (int16_t *)data )[0]; + break; + case VSI_NN_TYPE_UINT16: + val = (float)( (uint16_t *)data )[0]; + break; + case VSI_NN_TYPE_FLOAT16: + fp16 = ( (int16_t *)data )[0]; + val = 
vsi_nn_Fp16ToFp32( fp16 ); + break; + case VSI_NN_TYPE_BFLOAT16: + fp16 = ( (int16_t *)data )[0]; + val = vsi_nn_BFp16ToFp32( fp16 ); + break; + case VSI_NN_TYPE_INT32: + val = (float)( (int32_t *)data )[0]; + break; + case VSI_NN_TYPE_UINT32: + val = (float)( (uint32_t *)data )[0]; + break; + case VSI_NN_TYPE_FLOAT32: + val = ( (float *)data )[0]; + break; + case VSI_NN_TYPE_INT64: + case VSI_NN_TYPE_UINT64: + case VSI_NN_TYPE_FLOAT64: + default: + VSILOGW( "Unsupport type %d", type ); + break; + } + return val; +} /* vsi_nn_DataAsFloat32() */ + +void vsi_nn_UpdateTensorDims + ( + vsi_nn_tensor_attr_t * attr + ) +{ + uint32_t i; + uint32_t num; + if( NULL == attr ) + { + return; + } + + num = 0; + for( i = 0; i < attr->dim_num; i ++ ) + { + if( 0 == attr->size[i] ) + { + break; + } + num ++; + } + + if( attr->dim_num > VSI_NN_MAX_DIM_NUM ) + { + VSILOGW( "Error dim number: %d", attr->dim_num ); + attr->dim_num = num; + } + else if( attr->dim_num != num ) + { + VSILOGW( "Dim number and size mismatch: %d vs calculated = %d ", attr->dim_num, num ); + attr->dim_num = VSI_NN_DIM_AUTO; + } +} /* vsi_nn_UpdateTensorDims() */ + + +uint32_t vsi_nn_ComputeFilterSize + ( + uint32_t i_size, + uint32_t ksize, + uint32_t * pad, + uint32_t stride, + uint32_t dilation, + vsi_nn_round_type_e rounding + ) +{ + uint32_t out; + if( 0 == stride ) + { + if (i_size == ksize) { + stride = 1; + } else { + VSILOGE( "Error stride value: 0." ); + return 0; + } + } + if (dilation > 1) + { + ksize = dilation * (ksize - 1) + 1; + } + out = i_size + pad[0] + pad[1] - ksize; + out = _compute_stride_rounding( out, stride, rounding ); + out ++; + return out; +} /* vsi_nn_ComputeFilterSize() */ + +uint32_t vsi_nn_compute_filter_shape + ( + vsi_nn_pad_e padding_type, + uint32_t image_size, + uint32_t ksize, + uint32_t stride, + uint32_t dilation_rate + ) +{ + uint32_t effective_ksize; + effective_ksize = (ksize - 1) * dilation_rate + 1; + switch (padding_type) + { + case VSI_NN_PAD_SAME: + return (image_size + stride - 1) / stride; + case VSI_NN_PAD_VALID: + return (image_size + stride - effective_ksize) / stride; + default: + return 0; + } +} /* vsi_nn_compute_filter_shape() */ + +void vsi_nn_compute_padding + ( + uint32_t * in_shape, + uint32_t * ksize, + uint32_t * stride, + uint32_t * dilation, + vsi_nn_pad_e pad_type, + uint32_t * out_pad + ) +{ + uint32_t out_w, out_h; + uint32_t pad_w, pad_h; + uint32_t dilation_w, dilation_h; + if (NULL == in_shape || NULL == ksize + || NULL == stride || NULL == out_pad) + { + return; + } + if (pad_type == VSI_NN_PAD_AUTO) + { + return; + } + if (NULL == dilation || (dilation[0] == 0 && dilation[1] == 0)) + { + dilation_w = 1; + dilation_h = 1; + } + else + { + dilation_w = dilation[0]; + dilation_h = dilation[1]; + } + + out_w = vsi_nn_compute_filter_shape(pad_type, in_shape[0], ksize[0], stride[0], dilation_w); + out_h = vsi_nn_compute_filter_shape(pad_type, in_shape[1], ksize[1], stride[1], dilation_h); + pad_w = _compute_padding(in_shape[0], ksize[0], stride[0], dilation_w, out_w); + pad_h = _compute_padding(in_shape[1], ksize[1], stride[1], dilation_h, out_h); + out_pad[0] = pad_w / 2; + out_pad[1] = pad_w - out_pad[0]; + out_pad[2] = pad_h / 2; + out_pad[3] = pad_h - out_pad[2]; +} /* vsi_nn_compute_padding() */ + +void vsi_nn_ComputePadWithPadType + ( + uint32_t * in_shape, + uint32_t in_dim_num, + uint32_t * ksize, + uint32_t * stride, + vsi_nn_pad_e pad_type, + vsi_nn_round_type_e rounding, + uint32_t * out_pad + ) +{ + vsi_nn_compute_padding(in_shape, ksize, stride, NULL, 
pad_type, out_pad); +} /* vsi_nn_ComputePadWithPadType() */ + +void vsi_nn_compute_padding_conv1d +( + uint32_t * in_shape, + uint32_t * ksize, + uint32_t * stride, + uint32_t * dilation, + vsi_nn_pad_e pad_type, + uint32_t * out_pad +) +{ + uint32_t out_h; + uint32_t pad_h; + uint32_t dilation_h; + if (NULL == in_shape || NULL == ksize + || NULL == stride || NULL == out_pad) + { + return; + } + if (pad_type == VSI_NN_PAD_AUTO) + { + return; + } + if (NULL == dilation || dilation[0] == 0) + { + dilation_h = 1; + } + else + { + dilation_h = dilation[0]; + } + + out_h = vsi_nn_compute_filter_shape(pad_type, in_shape[0], ksize[0], stride[0], dilation_h); + pad_h = _compute_padding(in_shape[0], ksize[0], stride[0], dilation_h, out_h); + out_pad[0] = pad_h / 2; + out_pad[1] = pad_h - out_pad[0]; +} /* vsi_nn_compute_padding_conv1d() */ + +void vsi_nn_ComputePadWithPadTypeForConv1D + ( + uint32_t * in_shape, + uint32_t in_dim_num, + uint32_t * ksize, + uint32_t * stride, + vsi_nn_pad_e pad_type, + vsi_nn_round_type_e rounding, + uint32_t * out_pad + ) +{ + vsi_nn_compute_padding_conv1d(in_shape, ksize, stride, NULL, pad_type, out_pad); +} /* vsi_nn_ComputePadWithPadTypeForConv1D() */ + +void vsi_nn_InitTensorsId + ( + vsi_nn_tensor_id_t * ids, + int num + ) +{ + num --; + while( num >=0 ) + { + ids[num] = VSI_NN_TENSOR_ID_NA; + num --; + } +} /* vsi_nn_InitTensorsId() */ + +void vsi_nn_GetPadForOvx + ( + uint32_t * in_pad, + uint32_t * out_pad + ) +{ + if( NULL == in_pad || NULL == out_pad ) + { + return; + } + + /* Workaround for ovx api. */ + out_pad[0] = in_pad[0]; + out_pad[1] = in_pad[2]; + if( out_pad[0] != in_pad[1] ) + { + out_pad[0] = (uint32_t)( 0 - (int32_t)out_pad[0] ); + } + if( out_pad[1] != in_pad[3] ) + { + out_pad[1] = (uint32_t)( 0 - (int32_t)out_pad[1] ); + } +} /* vsi_nn_PadForDriver() */ + +vsi_bool vsi_nn_CreateTensorGroup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * in_tensor, + uint32_t axis, + vsi_nn_tensor_t ** out_tensors, + uint32_t group_number + ) +{ + vsi_bool ret; + uint32_t sz; + uint32_t i; + uint32_t start[VSI_NN_MAX_DIM_NUM]; + uint32_t end[VSI_NN_MAX_DIM_NUM]; + vsi_nn_tensor_attr_t attr; + + if( NULL == graph || NULL == in_tensor + || NULL == out_tensors || 0 == group_number + || 0 == in_tensor->attr.size[axis] ) + { + VSILOGW( "Create tensor group fail." ); + return FALSE; + } + + if( 0 != ( in_tensor->attr.size[axis] % group_number ) ) + { + VSILOGW( "Create tensor group fail." 
); + return FALSE; + } + + ret = TRUE; + sz = in_tensor->attr.size[axis] / group_number; + + memcpy( &attr, &in_tensor->attr, sizeof( attr ) ); + attr.size[axis] = sz; + memset( start, 0, sizeof( uint32_t ) * VSI_NN_MAX_DIM_NUM ); + end[0] = in_tensor->attr.size[0]; + end[1] = in_tensor->attr.size[1]; + end[2] = in_tensor->attr.size[2]; + end[3] = in_tensor->attr.size[3]; + end[axis] = 0; + + for( i = 0; i < group_number; i ++ ) + { + start[axis] = end[axis]; + end[axis] += sz; +#ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT + if ( attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC ) + { + attr.dtype.scales = in_tensor->attr.dtype.scales + sz * i; + attr.dtype.scale_dim = sz; + attr.dtype.zero_points = in_tensor->attr.dtype.zero_points + sz * i; + attr.dtype.zero_points_dim = sz; + } +#endif + out_tensors[i] = vsi_nn_CreateTensor( graph, &attr ); + if( NULL == out_tensors[i] ) + { + VSILOGE( "Create tensor %d fail.", i ); + ret = FALSE; + break; + } + if (out_tensors[i]->t) + { + vxReleaseTensor(&out_tensors[i]->t); + } + out_tensors[i]->t = vsi_nn_CreateViewTensor(graph, start, end, in_tensor); + if( NULL == out_tensors[i]->t ) + { + VSILOGE( "Create tensor %d from view fail.", i ); + ret = FALSE; + break; + } + } + return ret; +} /* vsi_nn_CreateTensorGroup() */ + +uint32_t vsi_nn_ShapeToString + ( + uint32_t * shape, + uint32_t dim_num, + char * buf, + uint32_t buf_sz, + vsi_bool for_print + ) +{ +#define _PRINT_FMT (0) +#define _NOT_PRINT_FMT (1) + uint32_t s; + uint32_t count; + const char * all_fmt[] = {" %d,", "%d_" }; + const char * fmt; + if( NULL == shape || NULL == buf + || dim_num == 0 || buf_sz == 0 ) + { + return 0; + } + if( FALSE == for_print ) + { + fmt = all_fmt[_NOT_PRINT_FMT]; + } + else + { + fmt = all_fmt[_PRINT_FMT]; + } + count = 0; + for( s = 0; s < dim_num; s++ ) + { + if( count >= buf_sz ) + { + break; + } + count += snprintf( &buf[count], buf_sz - count, + fmt, shape[s] ); + } + buf[count - 1] = 0; + return count; +} /* vsi_nn_ShapeToString() */ + +int32_t vsi_nn_Access + ( + const char *path, + int32_t mode + ) +{ + if(NULL == path) + { + return -1; + } + +#ifdef _WIN32 + return _access(path, mode); +#else + return access(path, mode); +#endif +} /* vsi_nn_Access() */ + +int32_t vsi_nn_Mkdir + ( + const char *path, + int32_t mode + ) +{ + if(NULL == path) + { + return -1; + } + +#ifdef _WIN32 + return _mkdir(path); +#else + return mkdir(path, mode); +#endif +} /* vsi_nn_Mkdir() */ + +vsi_bool vsi_nn_CheckFilePath + ( + const char *path + ) +{ + if(NULL == path) + { + VSILOGE("Please set file path"); + return FALSE; + } + + if(vsi_nn_Access(path, 0) == 0) + { + return TRUE; + } + + if(vsi_nn_Mkdir(path, 0775) == 0) + { + VSILOGI("Create directory %s", path); + return TRUE; + } + else + { + VSILOGE("Create directory %s fail", path); + } + + return FALSE; +} /* vsi_nn_CheckFilePath() */ + +void vsi_nn_GetFP32MultiAndPostShift + ( + vx_float32 mult, + vx_uint16 *M0, + vx_int8 *N + ) +{ + vx_uint32 uintMult = *((vx_uint32*)(&mult)); + vx_uint32 tmpMultiply = 0; + vx_int32 exp = 0; + vx_uint32 postShiftBit6to5 = 0; + vx_uint32 postShift = 0; + vx_int8 tmpPostShift = 0; + + tmpMultiply = (uintMult & 0x7FFFFF) >> 8; + *M0 = (vx_uint16)((1U << 15) + tmpMultiply); + + exp = (uintMult & 0x7F800000) >> 23; /* postShift is Scale's exp*/ + tmpPostShift = 15 - ((vx_int8)exp - 127); + postShift = tmpPostShift & 0x1F; + tmpPostShift = tmpPostShift >> 5; + postShiftBit6to5 = tmpPostShift & 3; + + *N = (vx_int8)(((postShiftBit6to5 << 5) | (postShift & 0x1F))); + *N = 
(((vx_int32)*N << 25) >> 25); +}/* vsi_nn_GetFP32MultiAndPostShift() */ + +typedef struct +{ + uint8_t* raw_addr; +} aligned_header; + +uint8_t * vsi_nn_MallocAlignedBuffer + ( + uint32_t mem_size, + uint32_t align_start_size, + uint32_t align_block_size + ) +{ + uint32_t sz; + uintptr_t temp; + uint8_t* raw_addr; + uint8_t* p; + uint8_t* align_addr; + aligned_header* header; + + sz = sizeof(aligned_header) + mem_size + align_start_size + align_block_size; + raw_addr = (uint8_t *)malloc( sz * sizeof( uint8_t ) ); + memset(raw_addr, 0, sizeof( uint8_t ) * sz); + p = raw_addr + sizeof(aligned_header); + + temp = (uintptr_t)(((uintptr_t)p) % align_start_size); + if (temp == 0) + { + align_addr = p; + } + else + { + align_addr = p + align_start_size - temp; + } + header = (aligned_header*)(align_addr - sizeof(aligned_header)); + header->raw_addr = raw_addr; + return align_addr; +}/* vsi_nn_MallocAlignedBuffer() */ + +void vsi_nn_FreeAlignedBuffer + ( + uint8_t* handle + ) +{ + aligned_header* header; + header = (aligned_header*)(handle - sizeof(aligned_header)); + free(header->raw_addr); +} + +vsi_bool vsi_nn_IsBufferAligned + ( + uint8_t * buf, + uint32_t align_start_size + ) +{ + uintptr_t temp; + + temp = (uintptr_t)(((uintptr_t)buf) % align_start_size); + if (temp == 0) + { + return TRUE; + } + return FALSE; +}/* vsi_nn_IsBufferAligned() */ + +void vsi_nn_FormatToString + ( + vsi_nn_tensor_t *tensor, + char *buf, + uint32_t buf_sz + ) +{ + switch(tensor->attr.dtype.vx_type) + { + case VSI_NN_TYPE_INT8:strncpy(buf, "i8 ", buf_sz);break; + case VSI_NN_TYPE_INT16:strncpy(buf, "i16", buf_sz);break; + case VSI_NN_TYPE_INT32:strncpy(buf, "i32", buf_sz);break; + case VSI_NN_TYPE_INT64:strncpy(buf, "i64", buf_sz);break; + case VSI_NN_TYPE_UINT8:strncpy(buf, "u8 ", buf_sz);break; + case VSI_NN_TYPE_UINT16:strncpy(buf, "u16", buf_sz);break; + case VSI_NN_TYPE_UINT32:strncpy(buf, "u32", buf_sz);break; + case VSI_NN_TYPE_UINT64:strncpy(buf, "u64", buf_sz);break; + case VSI_NN_TYPE_FLOAT16:strncpy(buf, "f16", buf_sz);break; + case VSI_NN_TYPE_FLOAT32:strncpy(buf, "f32", buf_sz);break; + case VSI_NN_TYPE_FLOAT64:strncpy(buf, "f64", buf_sz);break; + case VSI_NN_TYPE_BFLOAT16:strncpy(buf, "bf16", buf_sz);break; + case VSI_NN_TYPE_BOOL8:strncpy(buf, "bool8", buf_sz);break; + default: + break; + } +} /* vsi_nn_FormatToString() */ + +const char* vsi_nn_DescribeStatus + ( + vsi_status status + ) +{ + static const char* unknown = "unknown"; + int16_t i = 0; + + for( i = 0; i < vx_status_desc_cnt; i++ ) + { + if(vx_status_desc[i].status == status ) + { + return vx_status_desc[i].desc; + } + } + return unknown; +} /* vsi_nn_DescribeStatus() */ + +int32_t vsi_nn_partition +( + void* data, + int32_t left, + int32_t right, + comp_func func, + vsi_bool is_recursion, + uint32_t* indices +) +{ + int32_t key_index; + int32_t low = left; + int32_t high = right; + if (left < right) + { + key_index = indices[left]; + while (low < high) + { + while (low < high && func(data, key_index, indices[high])) + { + high--; + } + indices[low] = indices[high]; + while (low < high && func(data, indices[low], key_index)) + { + low++; + } + indices[high] = indices[low]; + } + indices[low] = key_index; + if (is_recursion) + { + vsi_nn_partition(data, left, low - 1, func, TRUE, indices); + vsi_nn_partition(data, low + 1, right, func, TRUE, indices); + } + } + return low; +} + + +/* Greatest Common Divisor*/ +static vsi_bool vsi_nn_GetDataDivisors + ( + vx_uint32 input_value, + vx_uint32 *divisors, + vx_uint32 gcd + ) +{ + vx_uint32 i = 0; 
+#define VSI_NN_MAX_IMAGE_WIDTH (65536) + for (i = vsi_nn_min(input_value, VSI_NN_MAX_IMAGE_WIDTH - 1); i > 0; i --) + { + if ((i % gcd == 0) && (input_value % i == 0)) + { + *divisors = i; + + return TRUE; + } + } +#undef VSI_NN_MAX_IMAGE_WIDTH + return FALSE; +} + +void vsi_nn_OptimizedEltOPShape + ( + vsi_nn_tensor_t * input, + uint32_t sizes[VSI_NN_MAX_DIM_NUM], + uint32_t * num_of_dims + ) +{ + uint32_t element_count = 0; + uint32_t i = 0; +#define VSI_NN_MAX_IMAGE_WIDTH (65536) + element_count = vsi_nn_GetElementNum(input); + + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + sizes[i] = 1; + } + + if (element_count < VSI_NN_MAX_IMAGE_WIDTH) + { + sizes[0] = element_count; + *num_of_dims = 2; + } + else + { + vx_uint32 divisors = 1; + for (i = 0; i < 2; i++) + { + divisors = 1; + vsi_nn_GetDataDivisors(element_count, &divisors, 1); + if (1 == divisors) + { + divisors = element_count; + } + sizes[i] = divisors; + element_count = element_count / divisors; + } + + sizes[2] = element_count; + if (1 == sizes[2]) + { + *num_of_dims = 2; + } + else + { + *num_of_dims = 3; + } + } +#undef VSI_NN_MAX_IMAGE_WIDTH +} + +vsi_bool vsi_nn_OptimizedEltWiseOPShape + ( + vsi_nn_tensor_t * input0, + vsi_nn_tensor_t * input1, + vsi_nn_tensor_t * output, + uint32_t sizes0[VSI_NN_MAX_DIM_NUM], + uint32_t sizes1[VSI_NN_MAX_DIM_NUM], + uint32_t sizes2[VSI_NN_MAX_DIM_NUM], + uint32_t * dim_num + ) +{ + vsi_bool status = TRUE; + uint32_t i = 0; + uint32_t cnt = 0; + uint32_t dims = 0; + uint32_t element_count0 = 0; + uint32_t element_count1 = 0; + vsi_bool enable_broadcast = FALSE; + vsi_bool enable_broadcast1 = FALSE; + uint32_t broadcast_Bits = 0; + + element_count0 = vsi_nn_GetElementNum(input0); + element_count1 = vsi_nn_GetElementNum(input1); + + if (element_count0 == 1 || element_count1 == 1) + { + enable_broadcast1 = TRUE; + } + + /*************step 1:init tensor shape*****************/ + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + sizes0[i] = 1; + sizes1[i] = 1; + sizes2[i] = 1; + } + + /*************step 2:squeeze tensor shape*****************/ + for (i = 0; i < output->attr.dim_num; i++) + { + uint32_t sz0 = input0->attr.dim_num > i ? input0->attr.size[i] : 1; + uint32_t sz1 = input1->attr.dim_num > i ? input1->attr.size[i] : 1; + uint32_t sz2 = output->attr.dim_num > i ? 
output->attr.size[i] : 1; + + if (sz0 == sz1 && sz0 == 1) + { + continue; + } + else + { + sizes0[cnt] = sz0; + sizes1[cnt] = sz1; + sizes2[cnt] = sz2; + + cnt ++; + dims ++; + } + } + + for (i = 0; i < dims; i++) + { + uint32_t sz0 = sizes0[i]; + uint32_t sz1 = sizes1[i]; + + if (sz0 != sz1) + { + enable_broadcast = TRUE; + broadcast_Bits |= (1 << i); + } + } + + /*************step 3:reshape tensor shape*****************/ + if (enable_broadcast == FALSE || enable_broadcast1) + { + vsi_nn_OptimizedEltOPShape(input0, sizes0, &dims); + vsi_nn_OptimizedEltOPShape(input1, sizes1, &dims); + vsi_nn_OptimizedEltOPShape(output, sizes2, &dims); + } + else + { +#define VSI_NN_MAX_IMAGE_WIDTH (65536) + switch (broadcast_Bits) + { + case VSI_NN_BROAD_CAST_BITS_0: + { + vx_uint32 element_count = 1; + vx_uint32 divisors = 1; + + for (i = 1; i < dims; i++) + { + element_count *= sizes0[i]; + } + + divisors = 1; + vsi_nn_GetDataDivisors(element_count, &divisors, 1); + + sizes0[1] = divisors; + sizes1[1] = divisors; + sizes2[1] = divisors; + sizes0[2] = element_count / divisors; + sizes1[2] = element_count / divisors; + sizes2[2] = element_count / divisors; + dims = 3; + + break; + } + case VSI_NN_BROAD_CAST_BITS_0 | VSI_NN_BROAD_CAST_BITS_1: + case VSI_NN_BROAD_CAST_BITS_0 | VSI_NN_BROAD_CAST_BITS_1 | VSI_NN_BROAD_CAST_BITS_2: + { + vx_uint32 w0 = sizes0[0] * sizes0[1]; + vx_uint32 w1 = sizes1[0] * sizes1[1]; + vx_uint32 w = sizes2[0] * sizes2[1]; + vx_uint32 h = sizes0[2]; + + if (h < VSI_NN_MAX_IMAGE_WIDTH && (w0 == 1 || w1 == 1) + && w < VSI_NN_MAX_IMAGE_WIDTH) + { + sizes0[0] = w0; + sizes1[0] = w1; + sizes2[0] = w; + sizes0[1] = sizes0[2]; + sizes1[1] = sizes1[2]; + sizes2[1] = sizes2[2]; + sizes0[2] = 1; + sizes1[2] = 1; + sizes2[2] = 1; + } + + break; + } + case VSI_NN_BROAD_CAST_BITS_2: + { + vx_uint32 w = sizes0[0] * sizes0[1]; + + if (w < VSI_NN_MAX_IMAGE_WIDTH) + { + sizes0[0] = w; + sizes1[0] = w; + sizes2[0] = w; + sizes0[1] = sizes0[2]; + sizes1[1] = sizes1[2]; + sizes2[1] = sizes2[2]; + sizes0[2] = 1; + sizes1[2] = 1; + sizes2[2] = 1; + } + + break; + } + default: + if (dims == output->attr.dim_num) + status = FALSE; + break; + } + } + +#undef VSI_NN_MAX_IMAGE_WIDTH + + if (status == TRUE) + *dim_num = vsi_nn_max(dims, 2); + + if (dims > 2 && sizes2[2] != 1) + { + status = FALSE; + } + + return status; +} + +void vsi_nn_print_int_array( int32_t* array, size_t size ) +{ + size_t i; + size_t n; +#define _MSG_SIZE (256) + char buf[256]; + n = 0; + for( i = 0; i < size; i ++ ) + { + n += snprintf( &buf[n], _MSG_SIZE - n, "%d, ", array[i] ); + if( n >= _MSG_SIZE ) + { + break; + } + } + VSILOGD( "%s", buf ); +#undef _MSG_SIZE +} /* vsi_nn_print_int_array() */ + +vsi_bool vsi_nn_IsEVISFeatureAvaiable + ( + vsi_nn_context_t context + ) +{ + if ( context->config.evis.ver == VSI_NN_HW_EVIS_1 + || context->config.evis.ver == VSI_NN_HW_EVIS_2 + ) + { + return TRUE; + } + + return FALSE; +} + +/* compare verision, return 1 greater, 0 equal, -1 less*/ +int32_t vsi_nn_compareVersion + ( + vsi_nn_graph_t * graph, + uint32_t version_major, + uint32_t version_minor, + uint32_t version_patch + ) +{ + uint32_t graph_version_major = 0; + uint32_t graph_version_minor = 0; + uint32_t graph_version_patch = 0; + + vsi_nn_GetGraphVersion( graph, &graph_version_major, + &graph_version_minor, &graph_version_patch ); + + if (graph_version_major > version_major) + { + return 1; + } + else if (graph_version_major < version_major) + { + return -1; + } + + if (graph_version_minor > version_minor) + { + return 1; + } + 
else if (graph_version_minor < version_minor) + { + return -1; + } + + if (graph_version_patch > version_patch) + { + return 1; + } + else if (graph_version_patch < version_patch) + { + return -1; + } + + return 0; +} + +float vsi_nn_activation + ( + float value, + vsi_nn_activation_e activation + ) +{ + switch(activation) + { + case VSI_NN_ACT_NONE: + return value; + case VSI_NN_ACT_RELU: + return value < 0.f ? 0.f : value; + case VSI_NN_ACT_RELU6: + return vsi_nn_max(0.f, vsi_nn_min(value, 6.f)); + case VSI_NN_ACT_TANH: + return (float)tanh(value); + case VSI_NN_ACT_SIGMOID: + return (float)(1.0f / (1.0f + exp(-value))); + case VSI_NN_ACT_HARD_SIGMOID: + value = value * 0.2f + 0.5f; + return vsi_nn_max(0.f, vsi_nn_min(value, 1.f)); + default: + VSILOGE("Unsupported activation: %d\n", activation); + exit(1); + } +} + +vsi_bool vsi_nn_is_same_data_type( + vsi_nn_tensor_t * src, + vsi_nn_tensor_t * dst + ) +{ + return (src->attr.dtype.vx_type == dst->attr.dtype.vx_type); +} + +vsi_bool vsi_nn_is_same_quant_type( + vsi_nn_tensor_t * src, + vsi_nn_tensor_t * dst + ) +{ + vx_bool result = FALSE; + + if (src->attr.dtype.vx_type == dst->attr.dtype.vx_type) + { + switch (src->attr.dtype.qnt_type) + { + case VSI_NN_QNT_TYPE_NONE: + result = TRUE; + break; + + case VSI_NN_QNT_TYPE_DFP: + if (src->attr.dtype.fl == dst->attr.dtype.fl) + { + result = TRUE; + } + break; + + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + if (src->attr.dtype.scale == dst->attr.dtype.scale && + src->attr.dtype.zero_point == dst->attr.dtype.zero_point) + { + result = TRUE; + } + break; + + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: + { + int32_t i = 0; + int32_t scale_cnt0 = src->attr.dtype.scale_dim; + int32_t scale_cnt1 = dst->attr.dtype.scale_dim; + + if (scale_cnt0 == scale_cnt1) + { + const float *src_scale_ptr = src->attr.dtype.scales; + const float *dst_scale_ptr = dst->attr.dtype.scales; + for (i = 0; i < scale_cnt0; i++) + { + if (src_scale_ptr[i] != dst_scale_ptr[i]) + break; + } + + if (i == scale_cnt0) + result = TRUE; + } + } + break; + + default: + break; + } + } + + return result; +} + +vsi_bool vsi_nn_is_same_type + ( + vsi_nn_tensor_t * src, + vsi_nn_tensor_t * dst + ) +{ + return (vsi_nn_is_same_data_type(src, dst) && vsi_nn_is_same_quant_type(src, dst)); +} diff --git a/src/tim/vx/internal/src/utils/vsi_nn_vdata.c b/src/tim/vx/internal/src/utils/vsi_nn_vdata.c new file mode 100644 index 0000000..c3171b6 --- /dev/null +++ b/src/tim/vx/internal/src/utils/vsi_nn_vdata.c @@ -0,0 +1,54 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include + +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_util.h" + +uint8_t * vsi_nn_VdataCreate + ( + vsi_nn_graph_t * graph, + vsi_nn_node_t * node, + uint32_t * p_stream_size + ) +{ + return NULL; +} /* vsi_nn_VdataCreate() */ + +vsi_nn_tensor_t * vsi_nn_CreateVDataTensor + ( + vsi_nn_graph_t * graph, + uint8_t * stream, + vsi_nn_tensor_attr_t * attr + ) +{ + return NULL; +} /* vsi_nn_CreateVDataTensor() */ + diff --git a/src/tim/vx/internal/src/vsi_nn_client_op.c b/src/tim/vx/internal/src/vsi_nn_client_op.c new file mode 100644 index 0000000..fcfa365 --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_client_op.c @@ -0,0 +1,141 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_client_op.h" +#include "utils/vsi_nn_binary_tree.h" + + +typedef struct _client_node +{ + vsi_nn_op_t op; + vsi_nn_op_proc_t proc; +} _client_node_t; + +static vsi_nn_binary_tree_t * s_root = NULL; + +static _client_node_t * _create_client_node + ( + vsi_nn_op_t op, + vsi_nn_op_proc_t * proc + ) +{ + _client_node_t * node; + node = (_client_node_t *)malloc( sizeof( _client_node_t ) ); + if( NULL != node ) + { + node->op = op; + memcpy( &node->proc, proc, sizeof( vsi_nn_op_proc_t ) ); + } + return node; +} /* _create_client_node() */ + +static void _release_client_node + ( + _client_node_t ** node + ) +{ + if( NULL != node && NULL != *node ) + { + free( *node ); + *node = NULL; + } +} /* _release_client_node() */ + +vsi_bool vsi_nn_OpIsRegistered + ( + vsi_nn_op_t op + ) +{ + return ( NULL != vsi_nn_OpGetClient( op ) ); +} /* vsi_nn_OpIsRegistered() */ + +vsi_bool vsi_nn_OpRegisterClient + ( + vsi_nn_op_t op, + vsi_nn_op_proc_t * proc + ) +{ + vsi_bool ret; + _client_node_t * node; + + ret = FALSE; + if( TRUE == vsi_nn_OpIsRegistered( op ) ) + { + VSILOGE( "OP %#x has been registered.", op ); + return ret; + } + + node = _create_client_node( op, proc ); + if( NULL != node ) + { + vsi_nn_BinaryTreeNewNode( + &s_root, + (vsi_nn_binary_tree_key_t)op, + (void *)node + ); + ret = TRUE; + } + return ret; +} /* vsi_nn_OpRegisterClient() */ + +vsi_nn_op_proc_t * vsi_nn_OpGetClient + ( + vsi_nn_op_t op + ) +{ + vsi_nn_op_proc_t * proc; + _client_node_t * node; + + proc = NULL; + node = (_client_node_t *)vsi_nn_BinaryTreeGetNode( + &s_root, + (vsi_nn_binary_tree_key_t)op ); + if( NULL != node ) + { + proc = &node->proc; + } + return proc; +} /* vsi_nn_OpGetClient() */ + +void vsi_nn_OpRemoveClient + ( + vsi_nn_op_t op + ) +{ + _client_node_t * node; + + node = (_client_node_t *)vsi_nn_BinaryTreeGetNode( + &s_root, + (vsi_nn_binary_tree_key_t)op ); + if( NULL != node ) + { + _release_client_node( &node ); + vsi_nn_BinaryTreeRemoveNode( &s_root, op ); + } +} /* vsi_nn_OpRemoveClient() */ + diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c new file mode 100644 index 0000000..0ea0e3e --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -0,0 +1,116 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include "vsi_nn_types.h" +#include "vsi_nn_test.h" +#include "vsi_nn_context.h" +#include "vsi_nn_platform.h" + +static vsi_status query_hardware_caps + ( + vsi_nn_context_t context + ) +{ + vsi_status status = VSI_FAILURE; + vx_hardware_caps_params_t param; + +#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT + vx_hardware_caps_params_ext_t paramExt; + + memset(¶mExt, 0, sizeof(vx_hardware_caps_params_ext_t)); + status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(¶mExt), + sizeof(vx_hardware_caps_params_ext_t)); + param.evis1 = paramExt.base.evis1; + param.evis2 = paramExt.base.evis2; +#else + memset(¶m, 0, sizeof(vx_hardware_caps_params_t)); + status = vxQueryHardwareCaps(context->c, ¶m, sizeof(vx_hardware_caps_params_t)); +#endif + TEST_CHECK_STATUS(status, final); + +#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT + context->config.subGroupSize = paramExt.subGroupSize; +#endif + if(param.evis1 == TRUE && param.evis2 == FALSE) + { + context->config.evis.ver = VSI_NN_HW_EVIS_1; + } + else if(param.evis1 == FALSE && param.evis2 == TRUE) + { + context->config.evis.ver = VSI_NN_HW_EVIS_2; + } + else + { + context->config.evis.ver = VSI_NN_HW_EVIS_NONE; + VSILOGW("Unsupported evis version"); + } + +final: + return status; +} + +vsi_nn_context_t vsi_nn_CreateContext + ( void ) +{ + vsi_nn_context_t context = NULL; + vx_context c = NULL; + + context = (vsi_nn_context_t)malloc(sizeof(struct _vsi_nn_context_t)); + if(NULL == context) + { + return NULL; + } + c = vxCreateContext(); + if(NULL == c) + { + free(context); + return NULL; + } + + memset(context, 0, sizeof(struct _vsi_nn_context_t)); + context->c = c; + if(query_hardware_caps(context) != VSI_SUCCESS) + { + vsi_nn_ReleaseContext(&context); + return NULL; + } + + return context; +} /* vsi_nn_CreateContext() */ + +void vsi_nn_ReleaseContext + ( vsi_nn_context_t * ctx ) +{ + if( NULL != ctx && NULL != *ctx ) + { + vsi_nn_context_t context = *ctx; + if(context->c) + { + vxReleaseContext( &context->c); + } + free(context); + *ctx = NULL; + } +} /* vsi_nn_ReleaseContext() */ + diff --git a/src/tim/vx/internal/src/vsi_nn_daemon.c b/src/tim/vx/internal/src/vsi_nn_daemon.c new file mode 100644 index 0000000..a5b2797 --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_daemon.c @@ -0,0 +1,38 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_daemon.h" +#include "vsi_nn_log.h" +#include "kernel/vsi_nn_kernel.h" + +_INITIALIZER( daemon_start ) +{ + vsi_nn_kernel_backend_init(); +} /* _daemon_start() */ + +_DEINITIALIZER( daemon_shutdown ) +{ + vsi_nn_kernel_backend_deinit(); +} /* vsi_nn_daemen_shutdown() */ + diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c new file mode 100644 index 0000000..392370a --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -0,0 +1,1927 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_types.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_rnn.h" +#include "vsi_nn_test.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_version.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_vdata.h" +#include "utils/vsi_nn_map.h" +#include "vsi_nn_graph_optimization.h" + +static vsi_status _set_reference_name + ( + vsi_nn_graph_t *graph, + vsi_nn_node_t *node + ) +{ +#define _NODE_ID_LEN 64 + vsi_status status; + vsi_nn_tensor_t *tensor; + uint32_t i; + char name[_NODE_ID_LEN]; + + if(NULL == node || NULL == graph) + { + return VSI_FAILURE; + } + + status = VSI_SUCCESS; + memset(name, 0, sizeof(char) * _NODE_ID_LEN); + snprintf(name, sizeof(char) * _NODE_ID_LEN, "uid_%u", node->uid); + if(node && node->n) + { + status = vxSetReferenceName((vx_reference)node->n, name); + } + TEST_CHECK_STATUS(status, final); + for(i = 0; i < node->output.num; i++) + { + memset(name, 0, sizeof(char) * _NODE_ID_LEN); + snprintf(name, sizeof(char) * _NODE_ID_LEN, "uid_%u_out_%u", node->uid, i); + tensor = vsi_nn_GetTensor(graph, node->output.tensors[i]); + if(tensor && tensor->t) + { + status = vxSetReferenceName((vx_reference)tensor->t, name); + TEST_CHECK_STATUS(status, final); + } + } + +final: + return status; +} /* _set_reference_name() */ + +static vsi_status _check_swapped_tensors + ( + const vsi_nn_graph_t* graph + ) +{ + uint32_t i = 0; + vsi_status status = VSI_SUCCESS; + + VSILOGD("Check swapped tensors"); + for( i = 0; i < graph->node_num; i++ ) + { + vsi_nn_node_t* node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)i ); + + /* For NBG node, all inputs/outputs need to be set if tensors are swapped */ + if( node && VSI_NN_OP_NBG == node->op ) + { + uint32_t idx, j; + vsi_nn_tensor_t* tensor = NULL; + + idx = 0; + for( j = 0; j < node->input.num; j++ ) + { + tensor = vsi_nn_GetTensor( graph, node->input.tensors[j] ); + if( tensor && tensor->is_swapped ) + { + status = vxSetParameterByIndex( node->n, idx, (vx_reference)tensor->t ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Set input parameter %d for node[%08x] fail!", idx, node->n ); + goto final; + } + tensor->is_swapped = FALSE; + } + idx++; + } + + for( j = 0; j < node->output.num; j++ ) + { + tensor = vsi_nn_GetTensor( graph, node->output.tensors[j] ); + if( tensor && tensor->is_swapped ) + { + status = vxSetParameterByIndex( node->n, idx, (vx_reference)tensor->t ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Set output parameter %d for node[%08x] fail!", idx, node->n ); + goto final; + } + tensor->is_swapped = FALSE; + } + idx++; + } + } + } + +final: + return status; +} /* _check_swapped_tensors() */ + +static void free_io_buffer + ( + vsi_nn_tensor_t **buffer + ) +{ + if(buffer) + { + free(buffer); + buffer = NULL; + } +} /* free_io_buffer() */ + +static vsi_nn_tensor_t **allocate_io_buffer + ( + vsi_nn_graph_t * graph + ) +{ + vsi_nn_tensor_t **buffer; + + buffer = (vsi_nn_tensor_t **)malloc(sizeof(vsi_nn_tensor_t *) * graph->max_node_io); + if(NULL == buffer) + { + return NULL; + } + + return buffer; +} /* allocate_io_buffer() */ + +static vsi_status update_max_node_io + ( + vsi_nn_graph_t * graph, + vsi_nn_node_id_t *node_list + ) +{ + uint32_t i,max_io; + vsi_status status; + vsi_nn_node_id_t node_id; + vsi_nn_node_t *node; + + status = 
VSI_SUCCESS; + max_io = VSI_NN_MAX_IO_NUM; /* default max node io */ + for(i = 0; i < graph->node_num; i++) + { + node_id = node_list[i]; + node = vsi_nn_GetNode( graph, node_id ); + if(node->input.num > max_io) + { + max_io = node->input.num; + } + if(node->output.num > max_io) + { + max_io = node->output.num; + } + } + + graph->max_node_io = max_io; + return status; +} /* update_max_node_io() */ + +static vsi_status optimize_node_backward + ( + vsi_nn_graph_t * graph, + vsi_nn_node_id_t *node_list + ) +{ + int32_t i; + vsi_status status; + vsi_nn_tensor_t **inputs; + vsi_nn_tensor_t **outputs; + vsi_nn_node_id_t node_id; + vsi_nn_node_t *node; + + status = VSI_SUCCESS; + inputs = allocate_io_buffer(graph); + outputs = allocate_io_buffer(graph); + if(NULL == inputs || NULL == outputs) + { + VSILOGE("allocate io buffer fail"); + status = VSI_FAILURE; + goto final; + } + + for( i = graph->node_num - 1; i >= 0; i-- ) + { + node_id = node_list[i]; + memset( inputs, 0, graph->max_node_io * sizeof( vsi_nn_tensor_t * ) ); + memset( outputs, 0, graph->max_node_io * sizeof( vsi_nn_tensor_t * ) ); + + /* Get inputs, outputs. */ + node = vsi_nn_GetNode( graph, node_id ); + vsi_nn_GetTensors( graph, node->input.tensors, + node->input.num, inputs ); + vsi_nn_GetTensors( graph, node->output.tensors, + node->output.num, outputs ); + + status = vsi_nn_OpOptimize(node->op, node, inputs, outputs, VSI_NN_OPTIMIZE_BACKWARD); + if( status != VSI_SUCCESS ) + { + VSILOGE( "Backward optimize node[%u] %s fail", + node_id, vsi_nn_OpGetName(node->op)); + break; + } + } + +final: + free_io_buffer(inputs); + free_io_buffer(outputs); + return status; +} /* optimize_node_backward() */ + +static vsi_status optimize_node_forward + ( + vsi_nn_graph_t * graph, + vsi_nn_node_id_t *node_list + ) +{ + uint32_t i; + vsi_status status; + vsi_nn_tensor_t **inputs; + vsi_nn_tensor_t **outputs; + vsi_nn_node_id_t node_id; + vsi_nn_node_t *node; + + status = VSI_SUCCESS; + inputs = allocate_io_buffer(graph); + outputs = allocate_io_buffer(graph); + if(NULL == inputs || NULL == outputs) + { + VSILOGE("allocate io buffer fail"); + status = VSI_FAILURE; + goto final; + } + + for( i = 0; i < graph->node_num; i++ ) + { + node_id = node_list[i]; + memset( inputs, 0, graph->max_node_io * sizeof( vsi_nn_tensor_t * ) ); + memset( outputs, 0, graph->max_node_io * sizeof( vsi_nn_tensor_t * ) ); + + /* Get inputs, outputs. 
*/ + node = vsi_nn_GetNode( graph, node_id ); + vsi_nn_GetTensors( graph, node->input.tensors, + node->input.num, inputs ); + vsi_nn_GetTensors( graph, node->output.tensors, + node->output.num, outputs ); + + status = vsi_nn_OpOptimize(node->op, node, inputs, outputs, VSI_NN_OPTIMIZE_FORWARD); + if( status != VSI_SUCCESS ) + { + VSILOGE( "Forward optimize node[%u] %s fail", + node_id, vsi_nn_OpGetName(node->op)); + break; + } + } + +final: + free_io_buffer(inputs); + free_io_buffer(outputs); + return status; +} /* optimize_node_forward() */ + +static vsi_status compute_node + ( + vsi_nn_graph_t * graph, + vsi_nn_node_id_t *node_list + ) +{ + uint32_t i,j; + vsi_status status; + vsi_nn_tensor_t **inputs; + vsi_nn_tensor_t **outputs; + vsi_nn_node_id_t node_id; + vsi_nn_node_t *node; + + status = VSI_SUCCESS; + inputs = allocate_io_buffer(graph); + outputs = allocate_io_buffer(graph); + if(NULL == inputs || NULL == outputs) + { + VSILOGE("allocate io buffer fail"); + status = VSI_FAILURE; + goto final; + } + + VSILOGI("Create vx node"); + for( i = 0; i < graph->node_num; i++ ) + { + node_id = node_list[i]; + memset( inputs, 0, graph->max_node_io * sizeof( vsi_nn_tensor_t * ) ); + memset( outputs, 0, graph->max_node_io * sizeof( vsi_nn_tensor_t * ) ); + + /* Get inputs, outputs. */ + node = vsi_nn_GetNode( graph, node_id ); + vsi_nn_GetTensors( graph, node->input.tensors, + node->input.num, inputs ); + vsi_nn_GetTensors( graph, node->output.tensors, + node->output.num, outputs ); + + /* Create vx output tensor */ + for ( j = 0; j < node->output.num; j++ ) + { + if( NULL == outputs[j] || NULL != outputs[j]->t ) + continue; + vsi_nn_TensorReinit( graph, outputs[j] ); + } + + /* Create vx node */ + VSILOGD("Instance node[%d] \"%s\" ...", node_id, vsi_nn_OpGetName(node->op)); + status = vsi_nn_OpCompute( node->op, node, inputs, outputs ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Create node[%d] %s fail", node_id, vsi_nn_OpGetName(node->op)); + break; + } + status = _set_reference_name(graph, node); + if( VSI_SUCCESS != status ) + { + VSILOGW("Set reference name fail"); + } + + status = vsi_nn_update_node_attr(node); + if( VSI_SUCCESS != status ) + { + VSILOGW("Update node attribute fail"); + } + } + +final: + free_io_buffer(inputs); + free_io_buffer(outputs); + return status; +} /* compute_node */ + +static vsi_status optimize_node + ( + vsi_nn_graph_t * graph, + vsi_nn_node_id_t *node_list + ) +{ + vsi_status status; + + status = VSI_FAILURE; + VSILOGD("Backward optimize neural network"); + status = optimize_node_backward(graph, node_list); + if(status != VSI_SUCCESS) + { + return VSI_FAILURE; + } + + VSILOGD("Forward optimize neural network"); + status = optimize_node_forward(graph, node_list); + if(status != VSI_SUCCESS) + { + return VSI_FAILURE; + } + + return status; +} /* optimize_node() */ + +static vsi_status setup_node + ( + vsi_nn_graph_t * graph, + vsi_nn_node_id_t *node_list + ) +{ + uint32_t i; + vsi_status status; + vsi_bool ret; + vsi_nn_tensor_t **inputs; + vsi_nn_tensor_t **outputs; + vsi_nn_node_id_t node_id; + vsi_nn_node_t *node; + + status = VSI_SUCCESS; + ret = TRUE; + inputs = allocate_io_buffer(graph); + outputs = allocate_io_buffer(graph); + if(NULL == inputs || NULL == outputs) + { + VSILOGE("allocate io buffer fail"); + status = VSI_FAILURE; + goto final; + } + + for( i = 0; i < graph->node_num; i++ ) + { + node_id = node_list[i]; + memset( inputs, 0, graph->max_node_io * sizeof( vsi_nn_tensor_t * ) ); + memset( outputs, 0, graph->max_node_io * sizeof( 
vsi_nn_tensor_t * ) ); + + /* Get inputs, outputs. */ + node = vsi_nn_GetNode( graph, node_id ); + vsi_nn_GetTensors( graph, node->input.tensors, + node->input.num, inputs ); + vsi_nn_GetTensors( graph, node->output.tensors, + node->output.num, outputs ); + + VSILOGD("Setup node id[%u] uid[%u] op[%s]", + node_id, node->uid, vsi_nn_OpGetName(node->op)); + if( vsi_nn_OpCheck( node->op, node, inputs, outputs ) ) + { + vsi_nn_print_node_io(graph, node, 0x01); + ret = vsi_nn_OpGenerateTensor( node, inputs, outputs ); + if(ret != TRUE) + { + VSILOGE( "Setup node[%u] %s fail", node_id, vsi_nn_OpGetName(node->op)); + status = VSI_FAILURE; + break; + } + vsi_nn_print_node_io(graph, node, 0x02); + } + else + { + VSILOGE( "Check node[%u] %s fail", node_id, vsi_nn_OpGetName(node->op)); + status = VSI_FAILURE; + break; + } + } + +final: + free_io_buffer(inputs); + free_io_buffer(outputs); + return status; +} /* setup_node() */ + +vsi_nn_graph_t * vsi_nn_CreateGraph + ( + vsi_nn_context_t ctx, + uint32_t max_tensor_num, + uint32_t max_node_num + ) +{ + vsi_nn_graph_t * graph; + graph = NULL; + + VSILOGI( "%s", vsi_nn_GetVersion() ); + + if( NULL == ctx ) + { + return graph; + } + + graph = (vsi_nn_graph_t *)malloc( sizeof( vsi_nn_graph_t ) ); + if( NULL != graph ) + { + memset( graph, 0, sizeof( vsi_nn_graph_t ) ); + graph->g = vxCreateGraph( ctx->c ); + if( NULL != graph->g ) + { + /* Configure driver mem aligned size, + * driver requests address and tensor size are aligend to 64 bytes. */ + const uint32_t ADDRESS_ALIGN_BYTES = 64; + graph->handle_manager.align_start_size = ADDRESS_ALIGN_BYTES; + #ifdef VX_WRAP_USER_MEMORY_SIZE_ALIGNMENT + graph->handle_manager.align_block_size = (VX_WRAP_USER_MEMORY_SIZE_ALIGNMENT); + #else + { + const uint32_t MEMORY_BLOCK_ALIGN_BYTES = 4096; + graph->handle_manager.align_block_size = MEMORY_BLOCK_ALIGN_BYTES; + } + #endif + graph->tensor_num = 0; + graph->node_num = 0; + graph->ctx = ctx; + graph->rnn_wksp = NULL; + graph->node_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) ); + graph->tensor_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) ); + vsi_nn_MapInit( graph->node_table ); + vsi_nn_MapInit( graph->tensor_table ); + } + else + { + VSILOGE( "Create vx graph fail." ); + free( graph ); + graph = NULL; + } + } + + return graph; +} /* vsi_nn_CreateGraph() */ + +void vsi_nn_ReleaseGraph + ( + vsi_nn_graph_t ** graph + ) +{ + uint32_t i; + vsi_nn_graph_t * ptr; + + ptr = *graph; + if( NULL != graph && NULL != * graph ) + { + if( NULL != ptr->tensors ) + { + for( i = 0; i < ptr->tensor_num; i++ ) + { + vsi_nn_RemoveTensor( *graph, (vsi_nn_tensor_id_t)i ); + } + free( (*graph)->tensor_table ); + } + if( ptr->complete_signal.exists + && NULL != ptr->complete_signal.tensor ) + { + vsi_nn_ReleaseTensor( &ptr->complete_signal.tensor ); + } + if( NULL != ptr->nodes ) + { + for( i = 0; i < ptr->node_num; i++ ) + { + vsi_nn_RemoveNode( *graph, (vsi_nn_node_id_t)i ); + } + free( (*graph)->node_table ); + } + if( NULL != ptr->input.tensors ) + { + free( ptr->input.tensors ); + } + if( NULL != ptr->output.tensors ) + { + free( ptr->output.tensors ); + } + if( NULL != ptr->rnn_wksp ) + { + vsi_nn_rnn_DeinitWksp( ptr ); + } + if( NULL != ptr->g ) + { + vxReleaseGraph( &ptr->g ); + } + free( ptr ); + *graph = NULL; + } + +} /* vsi_nn_ReleaseGraph() */ + +/* +* Create vx tensor and nodes. 
+* */ +vsi_status vsi_nn_SetupGraph + ( + vsi_nn_graph_t * graph, + vsi_bool sort + ) +{ + uint32_t i; + vsi_status status; + vsi_nn_node_id_t *sorted_nodes; + vsi_nn_node_id_t *nodes_list; + uint32_t num_of_graph_inputs; + vx_reference *graph_inputs = NULL; + uint32_t num_of_graph_outputs; + vx_reference *graph_outputs = NULL; + vsi_nn_tensor_t *tensor; + vsi_bool dirty = FALSE; + + status = VSI_FAILURE; + sorted_nodes = NULL; + nodes_list = NULL; + if( NULL == graph ) + { + return status; + } + + /* Optimize graph */ + status = vsi_nn_OptimizeGraph(graph, &dirty); + if(VSI_SUCCESS != status) + { + goto final; + } + + /* Prepare node list */ + nodes_list = (vsi_nn_node_id_t *)malloc( + graph->node_num * sizeof( vsi_nn_node_id_t ) ); + if( !nodes_list ) + { + goto final; + } + if( TRUE == sort || dirty) + { + VSILOGD( "Sort graph nodes."); + sorted_nodes = vsi_nn_SortGraphNode( graph ); + if (NULL == sorted_nodes) + { + VSILOGW("Sort graph nodes failure."); + free(nodes_list); + nodes_list = NULL; + return status; + } + memcpy(nodes_list, sorted_nodes, + graph->node_num * sizeof( vsi_nn_node_id_t )); + } + else + { + for ( i = 0; i < graph->node_num; i++ ) + { + nodes_list[i] = i; + } + } + + status = update_max_node_io( graph, nodes_list ); + if(VSI_SUCCESS != status) + { + goto final; + } + + /* Preprocess node and tensor */ + status = setup_node( graph, nodes_list ); + if(VSI_SUCCESS != status) + { + goto final; + } + + /* Optimize graph */ + status = optimize_node( graph, nodes_list ); + if(VSI_SUCCESS != status) + { + goto final; + } + + /* Create vx node and vx virtual tensor */ + status = compute_node( graph, nodes_list ); + if(VSI_SUCCESS != status) + { + goto final; + } + + /* Try setup graph complete signal node. */ + status = vsi_nn_TrySetupCompleteSignalNode( graph ); + TEST_CHECK_STATUS( status, final ); + + /* Explicitly set graph inputs and outputs */ + num_of_graph_inputs = graph->input.num; + graph_inputs = (vx_reference *)malloc( num_of_graph_inputs * sizeof( vx_reference ) ); + for( i = 0; i < num_of_graph_inputs; i++ ) + { + tensor = vsi_nn_GetTensor( graph, graph->input.tensors[i] ); + if (tensor) + { + graph_inputs[i] = (vx_reference)( tensor->t ); + } + else + { + graph_inputs[i] = NULL; + } + } + num_of_graph_outputs = graph->output.num; + if( graph->complete_signal.exists ) + { + num_of_graph_outputs += 1; + } + graph_outputs = (vx_reference *)malloc( num_of_graph_outputs * sizeof( vx_reference ) ); + for( i = 0; i < num_of_graph_outputs; i++ ) + { + tensor = vsi_nn_GetTensor( graph, graph->output.tensors[i] ); + if (tensor) + { + graph_outputs[i] = (vx_reference)( tensor->t ); + } + else + { + graph_outputs[i] = NULL; + } + } + if( graph->complete_signal.exists ) + { + graph_outputs[num_of_graph_outputs - 1] = \ + (vx_reference)graph->complete_signal.tensor->t; + } + status = vxIdentifyGraphInputsAndOutputs( graph->g, + num_of_graph_inputs, + graph_inputs, + num_of_graph_outputs, + graph_outputs ); + + if( VSI_SUCCESS != status ) + { + goto final; + } + +final: + if( NULL != sorted_nodes ) + { + free( sorted_nodes ); + } + if ( NULL != nodes_list ) + { + free( nodes_list ); + } + if ( NULL != graph_inputs) + { + free( graph_inputs ); + } + if ( NULL != graph_outputs) + { + free( graph_outputs ); + } + return status; +} /* vsi_nn_SetupGraph() */ + +/* +* Call vx verify graph. 
+* */ +vsi_status vsi_nn_VerifyGraph + ( + vsi_nn_graph_t * graph + ) +{ + vsi_status status; + status = VSI_FAILURE; + if( NULL != graph->g ) + { + status = vxVerifyGraph( graph->g ); + } + return status; +} /* vsi_nn_VerifyGraph() */ + +vsi_status vsi_nn_RunGraph + ( + const vsi_nn_graph_t * graph + ) +{ + vsi_status status; + status = VSI_FAILURE; + if( NULL != graph->g ) + { + if( vsi_nn_HasRNN( graph ) ) + { + status = vsi_nn_rnn_feed_internal_state( graph ); + } + else + { + status = VSI_SUCCESS; + } + + if( VSI_SUCCESS == status ) + { + status = _check_swapped_tensors( graph ); + } + + if( VSI_SUCCESS == status ) + { + status = vxProcessGraph( graph->g ); + } + + if( VSI_SUCCESS == status && vsi_nn_HasRNN( graph ) ) + { + status = vsi_nn_rnn_save_internal_state( graph ); + } + } + return status; +} /* vsi_nn_RunGraph() */ + +vsi_status vsi_nn_GenerateNBG( + vsi_nn_graph_t * graph, + void * nbg_buffer, + size_t * size + ) +{ + return (VX_SUCCESS == vxGenerateNBG( graph->g, nbg_buffer, size ))? VSI_SUCCESS : VSI_FAILURE; +} /* vsi_nn_GenerateNBG() */ + +vsi_status vsi_nn_AsyncRunGraph + ( + vsi_nn_graph_t * graph + ) +{ + vsi_status status; + status = VSI_FAILURE; + if( NULL != graph->g ) + { + if( vsi_nn_HasRNN( graph ) ) + { + status = vsi_nn_rnn_feed_internal_state( graph ); + } + else + { + status = VSI_SUCCESS; + } + + if( VSI_SUCCESS == status ) + { + status = _check_swapped_tensors( graph ); + } + + if( VSI_SUCCESS == status ) + { + status = vxScheduleGraph(graph->g); + } + } + return status; +} /* vsi_nn_AsynRunGraph() */ + + +vsi_status vsi_nn_AsyncRunWait + ( + vsi_nn_graph_t * graph + ) +{ + vsi_status status; + status = VSI_FAILURE; + if( NULL != graph->g ) + { + status = vxWaitGraph(graph->g); + if( VSI_SUCCESS == status && vsi_nn_HasRNN( graph ) ) + { + status = vsi_nn_rnn_save_internal_state( graph ); + } + } + return status; +} + + +vsi_status vsi_nn_SetGraphVersion + ( + vsi_nn_graph_t * graph, + uint32_t major, + uint32_t minor, + uint32_t patch + ) +{ + graph->version.major = major; + graph->version.minor = minor; + graph->version.patch = patch; + return VSI_SUCCESS; +} /* vsi_nn_SetGraphVersion() */ + +vsi_status vsi_nn_GetGraphVersion + ( + vsi_nn_graph_t * graph, + uint32_t * major, + uint32_t * minor, + uint32_t * patch + ) +{ + *major = graph->version.major; + *minor = graph->version.minor; + *patch = graph->version.patch; + return VSI_SUCCESS; +} /* vsi_nn_GetGraphVersion() */ + +static vsi_nn_tensor_id_t _add_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t id, + vsi_nn_tensor_attr_t * attr, + uint8_t * data + ) +{ + vsi_nn_tensor_t * tensor; + tensor = NULL; + if( NULL == graph || NULL == attr ) + { + return VSI_NN_TENSOR_ID_NA; + } + if( VSI_NN_TENSOR_ID_AUTO == id ) + { + id = graph->cur_tid; + graph->tensor_num = graph->cur_tid; + } + + if (TRUE == attr->is_created_from_handle) + { + tensor = vsi_nn_CreateTensorFromHandle( graph, data, attr ); + } + else if( VSI_NN_TYPE_VDATA == attr->dtype.vx_type ) + { + if( NULL == data ) + { + id = VSI_NN_TENSOR_ID_NA; + } + else + { + tensor = vsi_nn_CreateVDataTensor( graph, data, attr ); + } + } + else if( NULL != data ) + { + tensor = vsi_nn_CreateTensorFromData( graph, data, attr ); + } + else + { + tensor = vsi_nn_CreateTensor( graph, attr ); + } + + if( NULL != tensor ) + { + vsi_nn_MapAdd( graph->tensor_table, (vsi_nn_map_key_t)id, (void *)tensor ); + graph->cur_tid ++; + } + else + { + id = VSI_NN_TENSOR_ID_NA; + } + return id; +} + +vsi_nn_tensor_id_t vsi_nn_AddTensor + ( + vsi_nn_graph_t * graph, + 
vsi_nn_tensor_id_t id, + vsi_nn_tensor_attr_t * attr, + uint8_t * data + ) +{ + attr->is_created_from_handle = FALSE; + return _add_tensor(graph, id, attr, data); +} /* vsi_nn_AddTensor() */ + +vsi_nn_tensor_id_t vsi_nn_AddTensorFromHandle + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t id, + vsi_nn_tensor_attr_t * attr, + uint8_t * data + ) +{ + attr->is_created_from_handle = TRUE; + return _add_tensor(graph, id, attr, data); +} + +vsi_nn_tensor_id_t vsi_nn_AttachTensorToGraph + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t id, + vsi_nn_tensor_t * tensor + ) +{ + if( NULL == graph || NULL == tensor ) + { + return VSI_NN_TENSOR_ID_NA; + } + if( VSI_NN_TENSOR_ID_AUTO == id ) + { + id = graph->cur_tid; + graph->tensor_num = graph->cur_tid; + } + graph->cur_tid ++; + vsi_nn_MapAdd( graph->tensor_table, (vsi_nn_map_key_t)id, (void *)tensor ); + return id; +} /* vsi_nn_AttachTensorToGraph() */ + +/* + * Deprecated, Use vsi_nn_RemoveTensor() instead + */ +void vsi_nn_DeleteTensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t id + ) +{ + vsi_nn_RemoveTensor( graph, id ); +} /* vsi_nn_DeleteTensor() */ + +void vsi_nn_RemoveTensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t id + ) +{ + vsi_nn_tensor_t * tensor; + if( NULL != graph ) + { + tensor = vsi_nn_GetTensor( graph, id ); + if( NULL != tensor ) + { + vsi_nn_ReleaseTensor( &tensor ); + vsi_nn_MapRemove( graph->tensor_table, + (vsi_nn_map_key_t)id ); + } + } +} /* vsi_nn_RemoveTensor() */ + +vsi_nn_tensor_t * vsi_nn_GetTensor + ( + const vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t id + ) +{ + vsi_nn_tensor_t * tensor; + tensor = NULL; + if( NULL != graph ) + { + tensor = vsi_nn_MapGet( graph->tensor_table, (vsi_nn_map_key_t)id ); + } + return tensor; +} /* vsi_nn_GetTensor() */ + +vsi_nn_node_t * vsi_nn_GetNode + ( + const vsi_nn_graph_t * graph, + vsi_nn_node_id_t id + ) +{ + vsi_nn_node_t * node; + node = NULL; + if( NULL != graph ) + { + node = vsi_nn_MapGet( graph->node_table, (vsi_nn_map_key_t)id ); + } + return node; +} /* vsi_nn_GetTensor() */ + +void vsi_nn_GetTensors + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t * tensors_id, + uint32_t num, + vsi_nn_tensor_t ** tensors + ) +{ + uint32_t i; + + if( NULL == graph || NULL == graph->tensors + || NULL == tensors_id || NULL == tensors) + { + return; + } + memset( &tensors[0], 0, sizeof( vsi_nn_tensor_t * ) * num ); + if( num > graph->max_node_io ) + { + VSILOGW( "Tensor num(%d) is greater than the MAX(%d), \ + set to max num.", num, graph->max_node_io ); + num = graph->max_node_io; + } + for( i = 0; i < num; i++ ) + { + if( VSI_NN_TENSOR_ID_NA == tensors_id[i] ) + { + continue; + } + if( tensors_id[i] >= graph->tensor_num ) + { + VSILOGE( "Tensor id %d/%d", tensors_id[i], graph->tensor_num ); + continue; + } + tensors[i] = vsi_nn_GetTensor( graph, tensors_id[i] ); + } +} /* vsi_nn_GetTensors() */ + +vsi_nn_node_t * vsi_nn_AddNode + ( + vsi_nn_graph_t * graph, + vsi_nn_op_t op, + uint32_t input_num, + uint32_t output_num, + vsi_nn_node_id_t * node_id + ) +{ + vsi_nn_node_t * node; + vsi_nn_node_id_t id; + + if( NULL == graph ) + { + return NULL; + } + + id = graph->cur_nid; + node = vsi_nn_NewNode(graph, op, input_num, output_num); + if( NULL != node ) + { + vsi_nn_MapAdd( graph->node_table, (vsi_nn_map_key_t)id, (void *)node ); + graph->cur_nid ++; + graph->node_num = graph->cur_nid; + } + else + { + id = VSI_NN_NODE_ID_NA; + } + + if( NULL != node_id ) + { + *node_id = id; + } + return node; +} /* vsi_nn_AddNode() */ + +/* + * Deprecated, Use vsi_nn_AddNode instead 
+ */ +vsi_nn_node_t * vsi_nn_AppendNode + ( + vsi_nn_graph_t * graph, + vsi_nn_op_t op, + vsi_nn_node_id_t * node_id + ) +{ + return vsi_nn_AddNode( graph, op, 0, 0, node_id ); +} /* vsi_nn_AppendNode() */ + +void vsi_nn_RemoveNode + ( + vsi_nn_graph_t * graph, + vsi_nn_node_id_t id + ) +{ + vsi_nn_node_t * node; + if( NULL != graph ) + { + node = vsi_nn_GetNode( graph, id ); + if( NULL != node ) + { + vsi_nn_ReleaseNode( &node ); + vsi_nn_MapRemove( graph->node_table, + (vsi_nn_map_key_t)id ); + } + } +} /* vsi_nn_RemoveNode() */ + +vsi_bool vsi_nn_SetGraphInputs + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t * tensors_id, + uint32_t tensor_num + ) +{ + vsi_bool ret; + ret = FALSE; + + if( NULL == graph || tensor_num == 0 ) + { + return ret; + } + + graph->input.tensors = (vsi_nn_tensor_id_t *)malloc( + tensor_num * sizeof( vsi_nn_tensor_id_t ) ); + + if( NULL != graph->input.tensors ) + { + graph->input.num = tensor_num; + ret = TRUE; + if( NULL != tensors_id ) + { + memcpy( graph->input.tensors, tensors_id, + tensor_num * sizeof( vsi_nn_tensor_id_t ) ); + } + } + + return ret; +} /* vsi_nn_SetGreaphInputs() */ + +vsi_bool vsi_nn_SetGraphOutputs + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t * tensors_id, + uint32_t tensor_num + ) +{ + vsi_bool ret; + ret = FALSE; + + if( NULL == graph || tensor_num == 0 ) + { + return ret; + } + + graph->output.tensors = (vsi_nn_tensor_id_t *)malloc( + tensor_num * sizeof( vsi_nn_tensor_id_t ) ); + if( NULL != graph->output.tensors ) + { + graph->output.num = tensor_num; + ret = TRUE; + if( NULL != tensors_id ) + { + memcpy( graph->output.tensors, tensors_id, + tensor_num * sizeof( vsi_nn_tensor_id_t ) ); + } + } + + return ret; + +} /* vsi_nn_SetGraphOutputs() */ + +vsi_nn_node_id_t * vsi_nn_SortGraphNode + ( + vsi_nn_graph_t * graph + ) +{ + uint32_t i,j; + uint32_t count; + vsi_bool dirty; + vsi_bool all_tensor_processed; + vsi_bool * tensors; + vsi_nn_node_id_t * nodes; + vsi_nn_node_id_t * sorted_nodes; + vsi_nn_node_t * node; + vsi_nn_node_id_t node_id; + vsi_nn_tensor_id_t tensor_id; + vsi_nn_tensor_t * tensor; + + if( NULL == graph || NULL == graph->nodes + || NULL == graph->tensors ) + { + return NULL; + } + + tensors = NULL; + sorted_nodes = NULL; + nodes = NULL; + node = NULL; + node_id = VSI_NN_NODE_ID_NA; + + /* Init variables. 
*/ + tensors = (vsi_bool *)malloc( + graph->tensor_num * sizeof( vsi_bool ) ); + + if( NULL == tensors ) + { + goto _SortGraphNodeFinally; + } + + sorted_nodes = (vsi_nn_node_id_t *)malloc( + graph->node_num * sizeof( vsi_nn_node_id_t ) ); + nodes = (vsi_nn_node_id_t *)malloc( + graph->node_num * sizeof( vsi_nn_node_id_t ) ); + + if( NULL == sorted_nodes || NULL == nodes) + { + goto _SortGraphNodeFinally; + } + + for( i = 0; i < graph->tensor_num; i++ ) + { + tensor = vsi_nn_GetTensor( graph, (vsi_nn_tensor_id_t)i ); + if( NULL == tensor + || TRUE == tensor->attr.is_const ) + { + tensors[i] = TRUE; + } + else + { + tensors[i] = FALSE; + } + } + + for( i = 0; i < graph->input.num; i++ ) + { + tensor_id = graph->input.tensors[i]; + if( tensor_id != VSI_NN_TENSOR_ID_NA ) + { + tensors[tensor_id] = TRUE; + } + } + + for( i = 0; i < graph->node_num; i++ ) + { + nodes[i] = i; + } + count = graph->node_num; + do + { + dirty = FALSE; + all_tensor_processed = FALSE; + for( i = 0; i < count; i ++ ) + { + node_id = nodes[i]; + node = vsi_nn_GetNode( graph, node_id ); + all_tensor_processed = TRUE; + for( j = 0; j < node->input.num; j ++ ) + { + tensor_id = node->input.tensors[j]; + if( VSI_NN_TENSOR_ID_NA == tensor_id ) + { + continue; + } + if( FALSE == tensors[tensor_id] ) + { + all_tensor_processed = FALSE; + break; + } + } + if( TRUE == all_tensor_processed ) + { + sorted_nodes[graph->node_num - count] = nodes[i]; + nodes[i] = nodes[count - 1]; + count --; + i --; + dirty = TRUE; + for( j = 0; j < node->output.num; j ++ ) + { + tensor_id = node->output.tensors[j]; + if( VSI_NN_TENSOR_ID_NA == tensor_id ) + { + continue; + } + tensors[tensor_id] = TRUE; + } + } + } + if( FALSE == dirty ) + { + if( FALSE == all_tensor_processed ) + { + // TODO: Log all unprocessed tensors + VSILOGW("Unprocessed node %u", node_id); + } + break; + } + } while( count > 0 ); + + if( count != 0 ) + { + free( sorted_nodes ); + sorted_nodes = NULL; + } + +_SortGraphNodeFinally: + + /* Release memory. 
*/ + free( tensors ); + free( nodes ); + return sorted_nodes; +} /* vsi_nn_SortGraphNode() */ + +uint32_t vsi_nn_GetNodesByUids + ( + vsi_nn_graph_t * graph, + uint32_t * node_uids, + uint32_t node_uids_size, + vsi_nn_node_id_t * nodes, + uint32_t nodes_num + ) +{ + uint32_t sz; + uint32_t i; + uint32_t j; + vsi_nn_node_t * node; + + sz = 0; + if( NULL == nodes || 0 >= nodes_num ) + { + return sz; + } + if( NULL != node_uids ) + { + for( i = 0; i < node_uids_size; i++ ) + { + for( j = 0; j < graph->node_num; j++ ) + { + node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)j ); + if( node_uids[i] == node->uid ) + { + nodes[sz] = (vsi_nn_node_id_t)j; + sz ++; + break; + } + } + } + } + else + { + for( j = 0; j < graph->node_num; j++ ) + { + nodes[j] = (vsi_nn_node_id_t)j; + } + sz = graph->node_num; + } + return sz; +} /* vsi_nn_GetNodesByUids() */ + +void vsi_nn_DumpGraphNodeOutputs + ( + vsi_nn_graph_t * graph, + const char * path, + uint32_t * node_uids, + uint32_t node_uids_size, + vsi_bool force_fp32, + vsi_nn_dim_fmt_e data_fmt + ) +{ + vsi_nn_DumpGraphNodeOutputsEx(graph, path, NULL, node_uids, node_uids_size, force_fp32, data_fmt ); +} /* vsi_nn_DumpGraphNodeOutputs() */ + +void vsi_nn_DumpGraphNodeOutputsEx + ( + vsi_nn_graph_t * graph, + const char * path, + const char * prefix, + uint32_t * node_uids, + uint32_t node_uids_size, + vsi_bool force_fp32, + vsi_nn_dim_fmt_e data_fmt + ) +{ +#define _MAX_TENSOR_NAME_SZ (1024) +#define _SHAPE_BUF_SZ (64) + char shape[_SHAPE_BUF_SZ] = { 0 }; + char filename[_MAX_TENSOR_NAME_SZ] = { 0 }; + char filename_prefix[_SHAPE_BUF_SZ] = { 0 }; + const char * op_name; + uint32_t i; + uint32_t o; + uint32_t node_num; + vsi_nn_node_id_t * nodes; + vsi_nn_node_t * node; + vsi_nn_tensor_t * tensor; + + if(vsi_nn_CheckFilePath(path) == FALSE) + { + return ; + } + + if( NULL == node_uids ) + { + node_num = graph->node_num; + } + else + { + if( node_uids_size <= 0 ) + { + VSILOGE("Error node_uids_size: %d.", node_uids_size); + return; + } + node_num = node_uids_size; + } + nodes = (vsi_nn_node_id_t *)malloc( node_num * sizeof( vsi_nn_node_id_t ) ); + if( NULL == nodes ) + { + VSILOGE("Malloc nodes memory fail."); + return; + } + node_num = vsi_nn_GetNodesByUids( graph, node_uids, node_uids_size, + nodes, node_num ); + + if( NULL != prefix ) + { + strncpy(filename_prefix, prefix, _SHAPE_BUF_SZ); + filename_prefix[_SHAPE_BUF_SZ - 1] = '\0'; + + strncat(filename_prefix, "_", _SHAPE_BUF_SZ - 1); + filename_prefix[_SHAPE_BUF_SZ - 1] = '\0'; + } + + VSILOGD("Dump %u nodes.", node_num); + for( i = 0; i < node_num; i++ ) + { + node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)i ); + + if( node->internal_node_wksp ) /* dump internal nodes if any */ + { + vsi_nn_internal_dump_node_output(graph, path, filename_prefix, + force_fp32, node); + } + + for( o = 0; o < node->output.num; o++ ) + { + tensor = vsi_nn_GetTensor( graph, node->output.tensors[o] ); + if( NULL != tensor ) + { + if( TRUE == tensor->attr.vtl ) + { + VSILOGW("Uid %u node's tensor %d is virtual", + node->uid, o); + continue; + } + // TODO: Support different tensor format + vsi_nn_ShapeToString( tensor->attr.size, tensor->attr.dim_num, + shape, _SHAPE_BUF_SZ, FALSE ); + op_name = vsi_nn_OpGetName( node->op ); + snprintf( filename, _MAX_TENSOR_NAME_SZ, + "%s/%s%s_uid_%u_t_%u_s_%s.txt", path, filename_prefix, op_name, node->uid, o, shape); + if( FALSE == force_fp32 ) + { + vsi_nn_SaveTensorToText( graph, tensor, filename, NULL ); + } + else + { + vsi_nn_SaveTensorToTextByFp32( graph, tensor, filename, NULL ); + } 
+ } + } + } + free( nodes ); +} /* vsi_nn_DumpGraphNodeOutputsEx */ + +void vsi_nn_PrintGraph + ( + vsi_nn_graph_t * graph + ) +{ + vsi_nn_tensor_t * tensor; + vsi_nn_node_t * node; + uint32_t i; + + if( NULL == graph ) + { + return; + } + + VSILOGI( "Graph:" ); + VSILOGI( "***************** Tensors ******************" ); + for( i = 0; i < graph->tensor_num; i ++ ) + { + tensor = vsi_nn_GetTensor( graph, (vsi_nn_tensor_id_t)i ); + if( NULL != tensor ) + { + vsi_nn_PrintTensor( tensor, (vsi_nn_tensor_id_t)i ); + } + } + VSILOGI( "***************** Nodes ******************" ); + for( i = 0; i < graph->node_num; i ++ ) + { + node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)i ); + if( NULL != node ) + { + vsi_nn_PrintNode( node, (vsi_nn_node_id_t)i ); + } + } + VSILOGI("******************************************" ); +} /* vsi_nn_PrintGraph() */ + +void vsi_nn_DumpGraphToJson + ( + vsi_nn_graph_t *graph + ) +{ +#define _SHAPE_BUF_SIZE 64 + uint32_t i,j; + FILE *fp; + vsi_nn_tensor_rel_t *tensor_ref, *tio; + vsi_nn_tensor_rel_table_t *table; + vsi_nn_node_t *node,*in_node; + vsi_nn_tensor_t *tensor; + char shape[_SHAPE_BUF_SIZE] = { 0 }; + + if(NULL == graph) + { + return ; + } + + fp = fopen("graph.json", "w+"); + if(NULL == fp) + { + VSILOGE("Create dump file fail"); + return ; + } + + tensor_ref = vsi_nn_CreateTensorRelevance(graph); + if(NULL == tensor_ref) + { + VSILOGE("build tensor io fail"); + fclose(fp); + return ; + } + + fprintf(fp, "{\n"); + fprintf(fp, "\t\"Layers\":{\n"); + for(i = 0; i < graph->node_num; i++) + { + node = vsi_nn_GetNode(graph, i); + if(node) + { + fprintf(fp, "\t\t\"uid_%u\":{\n\t\t\t\"op\": \"%s\",\n", + node->uid, vsi_nn_OpGetName(node->op)); + + /* dump inputs */ + fprintf(fp, "\t\t\t\"inputs\": [ "); + for(j = 0; j < node->input.num; j++) + { + tio = &tensor_ref[node->input.tensors[j]]; + if(tio->input.num > 0) + { + table = tio->input.table; + + /* tensor only 1 input node */ + in_node = vsi_nn_GetNode(graph, table[0].node); + if(j == node->input.num - 1) + { + fprintf(fp, "\"@uid_%u:out%u\" ", in_node->uid, table[0].index); + } + else + { + fprintf(fp, "\"@uid_%u:out%u\", ", in_node->uid, table[0].index); + } + } + else + { + if(j == node->input.num - 1) + { + fprintf(fp, "\"datainput_%u:out0\" ", j); + } + else + { + fprintf(fp, "\"datainput_%u:out0\", ", j); + } + } + + } + + /* dump input shape */ + fprintf(fp, "],\n\t\t\t\"inut_shape\": [ "); + for(j = 0; j < node->input.num; j++) + { + tensor = vsi_nn_GetTensor(graph, node->input.tensors[j]); + if(vsi_nn_ShapeToString( tensor->attr.size, tensor->attr.dim_num, + shape, _SHAPE_BUF_SIZE, TRUE ) > 0) + { + fprintf(fp, "[%s ]", shape); + } + else + { + fprintf(fp, "[ - ]"); + } + if(j < node->input.num - 1) + { + fprintf(fp, ","); + } + } + + /* dump output */ + fprintf(fp, " ],\n\t\t\t\"outputs\": [ "); + for(j = 0; j < node->output.num; j++) + { + if(j == node->output.num - 1) + { + fprintf(fp, "\"out%u\" ", j); + } + else + { + fprintf(fp, "\"out%u\", ", j); + } + } + + //output shape + fprintf(fp, "],\n\t\t\t\"output_shape\": [ "); + for(j = 0; j < node->output.num; j++) + { + tensor = vsi_nn_GetTensor(graph, node->output.tensors[j]); + if(vsi_nn_ShapeToString( tensor->attr.size, tensor->attr.dim_num, + shape, _SHAPE_BUF_SIZE, TRUE ) > 0) + { + fprintf(fp, "[%s ]", shape); + } + else + { + fprintf(fp, "[ - ]"); + } + if(j < node->output.num - 1) + { + fprintf(fp, ","); + } + } + fprintf(fp, " ]\n\t\t}"); + + if(i != graph->node_num - 1) + { + fprintf(fp, ","); + } + fprintf(fp, "\n"); + } + } + 
fprintf(fp, "\t}\n}\n"); + + vsi_nn_ReleaseTensorRelevance(graph, tensor_ref); + fclose(fp); +} /* vsi_nn_DumpGraphToJson() */ + +/* + * Documented in vsi_nn_graph.h + */ +vsi_status vsi_nn_TrySetupCompleteSignalNode + ( + vsi_nn_graph_t* graph + ) +{ + vsi_nn_tensor_t* tensor = NULL; + vsi_nn_tensor_t* signal_tensor = NULL; + vsi_nn_node_t* signal_node = NULL; + vsi_nn_tensor_attr_t signal_tensor_attr; + vsi_status status = VSI_FAILURE; + if( graph->complete_signal.exists ) + { + if( !graph->complete_signal.write_address ) + { + VSILOGW("COMPLETE signal is set with null write addres."); + return VSI_FAILURE; + } + VSILOGD("Setup COMPLETE signal, value \"%d\", write address \"%p\"", + graph->complete_signal.value, graph->complete_signal.write_address); + /* Setup signal tensor attr */ + memset( &signal_tensor_attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + signal_tensor_attr.size[0] = 8; + signal_tensor_attr.size[1] = 1; + signal_tensor_attr.dim_num = 2; + signal_tensor_attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + signal_tensor_attr.vtl = FALSE; + /* Setup signal node */ + signal_node = vsi_nn_CreateNode( graph, VSI_NN_OP_EXTRA_ENDING ); + TEST_CHECK_PTR( signal_node, final ); + + signal_node->nn_param.extra_ending.length = sizeof(int64_t); + memcpy( &signal_node->nn_param.extra_ending.value, + &graph->complete_signal.value, sizeof(int64_t)); + + if( graph->output.num > 1 ) + { + VSILOGE("Not support COMPLETE signal with multi graph outputs."); + } + else + { + tensor = vsi_nn_GetTensor( graph, graph->output.tensors[0] ); + signal_tensor = vsi_nn_CreateTensorFromHandle( graph, + (uint8_t*)graph->complete_signal.write_address, + &signal_tensor_attr); + status = vsi_nn_OpCompute( signal_node->op, signal_node, + &tensor, &signal_tensor ); + TEST_CHECK_STATUS( status, final ); + } + graph->complete_signal.tensor = signal_tensor; + status = VSI_SUCCESS; + } + else + { + status = VSI_SUCCESS; + } +final: + if( signal_node ) + { + vsi_nn_ReleaseNode( &signal_node ); + } + return status; +} /* vsi_nn_TrySetupCompleteSignalNode() */ + +vsi_status vsi_nn_SetupRNNConnections + ( + vsi_nn_graph_t* graph, + const vsi_nn_rnn_external_connection_t* connections, + uint32_t connections_count + ) +{ + return vsi_nn_rnn_InitWksp( graph, connections, connections_count, NULL ); +} /* vsi_nn_SetupRNNConnections() */ + +vsi_status vsi_nn_ResetRNNBuffers + ( + vsi_nn_graph_t* graph + ) +{ + return vsi_nn_rnn_ResetBuffers( graph ); +} /* vsi_nn_ResetRNNBuffers() */ + +vsi_bool vsi_nn_HasRNN + ( + const vsi_nn_graph_t* graph + ) +{ + return NULL != graph && NULL != graph->rnn_wksp; +} /* vsi_nn_HasRNN() */ + +void vsi_nn_get_tensor_consumers + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_id_t tensor_id, + vsi_nn_node_t** nodes, + uint32_t* count + ) +{ + vsi_nn_node_t* node = NULL; + uint32_t i, j = 0; + uint32_t nodes_count = 0; + for(i = 0; i < graph->node_num; i++) + { + node = vsi_nn_GetNode(graph, i); + for(j = 0; j < node->input.num; j++) + { + if(node->input.tensors[j] == tensor_id) + { + if(nodes != NULL) + { + nodes[nodes_count] = node; + } + nodes_count += 1; + break; + } + } + } + if(count != NULL) + { + *count = nodes_count; + } +} /* vsi_nn_get_tensor_consumers() */ + +void vsi_nn_get_tensor_provider + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_id_t tensor_id, + vsi_nn_node_t** node + ) +{ + vsi_nn_node_t* cur_node = NULL; + uint32_t i, j = 0; + for(i = 0; i < graph->node_num; i++) + { + cur_node = vsi_nn_GetNode(graph, i); + for(j = 0; j < cur_node->output.num; j++) + { + if(cur_node->output.tensors[j] == 
tensor_id) + { + *node = cur_node; + return; + } + } + } +} /* vsi_nn_get_tensor_provider() */ + +vsi_status vsi_nn_SetGraphPreloadSize + ( + vsi_nn_graph_t* graph, + vsi_nn_graph_attr_preload_type_e attr, + uint32_t size + ) +{ + vsi_status status; + status = VSI_FAILURE; + +#if(defined(VX_PRELOAD_CONST_TENSOR_SUPPORT) && VX_PRELOAD_CONST_TENSOR_SUPPORT) + if(graph && graph->g) + { + switch(attr) + { + case VSI_NN_GRAPH_PRELOAD_VIPSRAM: + { + status = vxSetGraphAttribute(graph->g, VX_GRAPH_VIP_SRAM_PRE_LOAD, &size, sizeof(size)); + break; + } + + case VSI_NN_GRAPH_PRELOAD_AXISRAM: + { + status = vxSetGraphAttribute(graph->g, VX_GRAPH_AXI_SRAM_PRE_LOAD, &size, sizeof(size)); + break; + } + + default: + { + VSILOGE("Unsupported graph attribute: %d", attr); + break; + } + } + } +#else + status = VSI_SUCCESS; +#endif + + return status; +} + +vsi_nn_tensor_id_t vsi_nn_get_tensor_id + ( + vsi_nn_graph_t* graph, + const vsi_nn_tensor_t * tensor + ) +{ + uint32_t i; + vsi_nn_tensor_t * iter; + if( !graph || !tensor ) + { + return VSI_NN_TENSOR_ID_NA; + } + for(i = 0; i < graph->tensor_num; i++) + { + iter = vsi_nn_GetTensor( graph, i ); + if(iter && iter == tensor) + { + return i; + } + } + return VSI_NN_TENSOR_ID_NA; +} /* vsi_nn_get_tensor_id() */ + +vsi_status vsi_nn_SetGraphPriority + ( + vsi_nn_graph_t* graph, + uint32_t priority + ) +{ + vsi_status status = VSI_FAILURE; +#ifdef VX_GRAPH_PREEMPTION_SUPPORT + if(graph && graph->g) + { + status = vxSetGraphAttribute(graph->g, VX_GRAPH_PRIORITY_VALUE_VIV, &priority, sizeof(priority)); + } +#else + VSILOGE("Current driver not support graph priority."); +#endif + return status; +} diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c new file mode 100644 index 0000000..9732a50 --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -0,0 +1,805 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include + +#include "vsi_nn_graph_optimization.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" + + +static vsi_bool _is_asymm_int8_norm_tensor + ( + vsi_nn_tensor_t * tensor + ) +{ + vsi_bool ret = FALSE; + + ret = ( tensor != NULL + && tensor->attr.vtl == FALSE && tensor->attr.is_const == FALSE + && tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 + && tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC); + + return ret; +}/* _is_asymm_int8_norm_tensor() */ + +static vsi_bool _is_asymm_int8_const_tensor + ( + vsi_nn_tensor_t * tensor + ) +{ + vsi_bool ret = FALSE; + + ret = ( tensor != NULL + && tensor->attr.is_const == TRUE + && tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 + && tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC); + + return ret; +}/* _is_asymm_int8_const_tensor() */ + +static vsi_bool _is_asymm_int8_virtual_tensor + ( + vsi_nn_tensor_t * tensor + ) +{ + vsi_bool ret = FALSE; + + ret = ( tensor != NULL + && tensor->attr.vtl == TRUE + && tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 + && tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC); + + return ret; +}/* _is_asymm_int8_virtual_tensor() */ + +static vsi_status _add_forward_node + ( + vsi_nn_graph_t* graph, + vsi_nn_node_t** first_node, + uint32_t nodes_count, + vsi_nn_node_t* node, + vsi_nn_tensor_id_t input, + vsi_nn_tensor_id_t output + ) +{ + uint32_t i = 0; + uint32_t j = 0; + + /* Reconnect node tensors */ + for(i = 0; i < nodes_count; i++) + { + for(j = 0; j < first_node[i]->input.num; j++) + { + if(first_node[i]->input.tensors[j] == input) + { + first_node[i]->input.tensors[j] = output; + } + } + } + + node->input.tensors[0] = input; + node->output.tensors[0] = output; + + return VSI_SUCCESS; +}/* _add_forward_node() */ + +static vsi_status _add_backward_node + ( + vsi_nn_graph_t* graph, + vsi_nn_node_t* last_node, + vsi_nn_node_t* node, + vsi_nn_tensor_id_t input, + vsi_nn_tensor_id_t output + ) +{ + uint32_t i = 0; + + /* Reconnect node output tensors */ + for(i = 0; i < (int32_t)last_node->output.num; i++) + { + if(last_node->output.tensors[i] == output) + { + last_node->output.tensors[i] = input; + break; + } + } + + node->input.tensors[0] = input; + node->output.tensors[0] = output; + + return VSI_SUCCESS; +}/* _add_backward_node() */ + +static vsi_status _add_dataconvert_node + ( + vsi_nn_graph_t* graph, + uint32_t idx, + vsi_nn_opt_direction_e direction, + vsi_nn_node_t** nodes, + uint32_t nodes_count, + vsi_nn_tensor_id_t input, + vsi_nn_tensor_id_t output + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_node_t* node = NULL; + + /* Add dataconvert node */ + node = vsi_nn_AddNode(graph, VSI_NN_OP_DATACONVERT, 1, 1, NULL); + node->uid = (uint32_t)(VSI_NN_DATACONVERT_NODE_UID_BASE) + idx; + if( NULL == node ) { + status = VSI_FAILURE; + goto final; + } + + if( direction == VSI_NN_OPTIMIZE_FORWARD ) + { + /* Reconnect node input tensors */ + VSILOGD("add a dataconvert op to input norm tensor[%d] ", input); + status = _add_forward_node(graph, nodes, nodes_count, node, input, output); + } + else + { + /* Reconnect node output tensors */ + VSILOGD("add a dataconvert op to output norm tensor[%d] ", output); + status = _add_backward_node(graph, nodes[0], node, input, output); + } + +final: + return status; +} /* _add_dataconvert_node() */ + +static void _get_graph_input_asymm_int8_norm_tensor + ( + vsi_nn_graph_t* graph, 
+ uint32_t* count, + vsi_nn_tensor_id_t *tensor_ids, + uint32_t* valid_count + ) +{ + vsi_nn_node_t* node = NULL; + uint32_t i = 0, j = 0, k = 0; + uint32_t tensor_count = 0; + uint32_t id_count = 0; + + for(i = 0; i < graph->node_num; i++) + { + node = vsi_nn_GetNode(graph, i); + for(j = 0; j < node->input.num; j++) + { + vsi_nn_tensor_id_t id = node->input.tensors[j]; + vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); + if (_is_asymm_int8_norm_tensor(tensor)) + { + if(tensor_ids != NULL) + { + for ( k = 0; k < id_count; k++) + { + if (tensor_ids[k] == id) + break; + } + if (k == id_count) + { + tensor_ids[id_count ++] = id; + } + + } + tensor_count += 1; + } + } + } + + if(count != NULL) + { + *count = tensor_count; + } + + if(valid_count != NULL) + { + *valid_count = id_count; + } +} /* _get_graph_input_asymm_int8_norm_tensor() */ + +static void _get_graph_output_asymm_int8_norm_tensor + ( + vsi_nn_graph_t* graph, + uint32_t* count, + vsi_nn_tensor_id_t *tensor_ids + ) +{ + vsi_nn_node_t* node = NULL; + uint32_t i = 0, j = 0; + uint32_t tensor_count = 0; + + for(i = 0; i < graph->node_num; i++) + { + node = vsi_nn_GetNode(graph, i); + for(j = 0; j < node->output.num; j++) + { + vsi_nn_tensor_id_t id = node->output.tensors[j]; + vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); + if (_is_asymm_int8_norm_tensor(tensor)) + { + if(tensor_ids != NULL) + { + tensor_ids[tensor_count] = id; + } + tensor_count += 1; + } + } + } + + if(count != NULL) + { + *count = tensor_count; + } +} /* _get_graph_output_asymm_int8_norm_tensor() */ + +static vsi_status _add_graph_dataconvert_for_int8 + ( + vsi_nn_graph_t* graph, + vsi_bool *dirty + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_tensor_attr_t attr; + uint32_t input_count; + uint32_t input_valid_count = 0; + vsi_nn_tensor_id_t *input_ids = NULL; + vsi_nn_node_t*** input_nodes = NULL; + uint32_t i = 0; + uint32_t output_count; + vsi_nn_tensor_id_t *output_ids = NULL; + vsi_nn_node_t** output_nodes = NULL; + uint32_t dataconvert_idx = 0; + + _get_graph_input_asymm_int8_norm_tensor(graph, &input_count, NULL, NULL); + + if(input_count != 0) + { + input_ids = (vsi_nn_tensor_id_t *)malloc(sizeof(vsi_nn_tensor_id_t) * input_count); + _get_graph_input_asymm_int8_norm_tensor(graph, NULL, input_ids, &input_valid_count); + + if ( input_valid_count > 0 ) + { + input_nodes = (vsi_nn_node_t***)malloc(sizeof(vsi_nn_node_t**) * input_valid_count); + } + + for ( i = 0; i < input_valid_count; i++) + { + uint32_t nodes_count = 0; + vsi_nn_get_tensor_consumers(graph, input_ids[i], NULL, &nodes_count); + + if(nodes_count > 0) + { + input_nodes[i] = (vsi_nn_node_t**)malloc(sizeof(vsi_nn_node_t*)*nodes_count); + vsi_nn_get_tensor_consumers(graph, input_ids[i], input_nodes[i], NULL); + + *dirty = TRUE; + } + } + } + + _get_graph_output_asymm_int8_norm_tensor(graph, &output_count, NULL); + + if(output_count > 0) + { + output_ids = (vsi_nn_tensor_id_t*)malloc(sizeof(vsi_nn_tensor_id_t) * output_count); + _get_graph_output_asymm_int8_norm_tensor(graph, NULL, output_ids); + + output_nodes = (vsi_nn_node_t**)malloc(sizeof(vsi_nn_node_t*) * output_count); + + for ( i = 0; i < output_count; i++) + { + vsi_nn_get_tensor_provider(graph, output_ids[i], &output_nodes[i]); + *dirty = TRUE; + } + } + + if ( input_valid_count > 0 ) + { + for ( i = 0; i < input_valid_count; i++) + { + uint32_t nodes_count = 0; + vsi_nn_get_tensor_consumers(graph, input_ids[i], NULL, &nodes_count); + + if(nodes_count != 0) + { + vsi_nn_tensor_id_t id = input_ids[i]; + vsi_nn_tensor_t * 
tensor = vsi_nn_GetTensor(graph, id); + vsi_nn_tensor_id_t output; + + memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + attr.dtype.zero_point += 128; + attr.vtl = TRUE; + output = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL ); + + _add_dataconvert_node(graph, dataconvert_idx ++, VSI_NN_OPTIMIZE_FORWARD, + input_nodes[i], nodes_count, id, output); + } + } + + if(input_nodes) + { + free(input_nodes); + input_nodes = NULL; + } + } + + if ( output_count > 0 ) + { + for ( i = 0; i < output_count; i++) + { + vsi_nn_tensor_id_t id = output_ids[i]; + vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); + vsi_nn_tensor_id_t input; + + memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + attr.dtype.zero_point += 128; + attr.vtl = TRUE; + input = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL ); + + _add_dataconvert_node(graph, dataconvert_idx ++, VSI_NN_OPTIMIZE_BACKWARD, + &output_nodes[i], 1, input, id); + } + + if(output_nodes) + { + free(output_nodes); + output_nodes = NULL; + } + } + + if (input_ids) + { + free(input_ids); + input_ids = NULL; + } + if (output_ids) + { + free(output_ids); + output_ids = NULL; + } + + return status; +} /* _add_graph_dataconvert_for_int8() */ + +static vsi_status _add_graph_data_convert + ( + vsi_nn_graph_t* graph, + vsi_bool *dirty + ) +{ + vsi_status status = VSI_FAILURE; + + status = _add_graph_dataconvert_for_int8(graph, dirty); + TEST_CHECK_STATUS(status, final); + +final: + return status; +}/* _add_graph_data_convert() */ + +static vsi_status _set_raw_tensor_attr + ( + vx_tensor tensor, + vsi_nn_tensor_attr_t attr, + const vsi_nn_vxtensor_attr_t vx_attr + ) +{ + vsi_status status; + + status = VSI_SUCCESS; + if( NULL == tensor ) + { + return VSI_FAILURE; + } + + if( VSI_SUCCESS == status && vsi_nn_hasattr( vx_attr, VSI_NN_TENSOR_ATTR_CONST ) ) + { + vx_enum data_lifetime; + if(attr.is_const == TRUE) + { + data_lifetime = VX_TENSOR_LIFE_TIME_STATIC; + } + else + { + data_lifetime = VX_TENSOR_LIFE_TIME_DYNAMIC; + } + status = vxSetTensorAttribute(tensor, + VX_TENSOR_LIFETIME, + &data_lifetime, + sizeof(vx_enum)); + } + if( VSI_SUCCESS == status && vsi_nn_hasattr( vx_attr, VSI_NN_TENSOR_ATTR_HIGH_PRECISION ) ) + { + vx_enum precision = VX_TENSOR_PRECISION_HIGH; + status = vxSetTensorAttribute(tensor, + VX_TENSOR_PRECISION, + &precision, + sizeof(vx_enum)); + } + + return status; +}/* _set_raw_tensor_attr() */ + +static vsi_bool _try_set_const_raw_tensor + ( + vx_tensor tensor, + vsi_nn_tensor_attr_t attr + ) +{ + vsi_status status; + vsi_bool ret; + vsi_nn_vxtensor_attr_t vx_attr; + + ret = TRUE; + status = VSI_SUCCESS; + if( TRUE == attr.is_const ) + { + vx_attr = VSI_NN_TENSOR_ATTR_CONST; + status = _set_raw_tensor_attr(tensor, attr, vx_attr); + } + if( VSI_FAILURE == status ) + { + ret = FALSE; + } + + return ret; +} /* _try_set_const_raw_tensor() */ + +vsi_status vsi_nn_CopyDataToRawTensor + ( + vsi_nn_graph_t * graph, + vx_tensor tensor, + uint8_t * data, + vsi_nn_tensor_attr_t attr + ) +{ + vsi_status status = VSI_FAILURE; + if( NULL == graph || NULL == data || NULL == tensor ) + { + return status; + } + + if( attr.is_created_from_handle ) + { + uint8_t* ptr = NULL; + vxSwapTensorHandle( tensor, NULL, (void **)&ptr); + if ( ptr == NULL ) + { + VSILOGE("vxSwapTensorHandle fail."); + return VSI_FAILURE; + } + memcpy( ptr, data, vsi_nn_GetTensorSize(attr.size, attr.dim_num, + attr.dtype.vx_type)); + status = 
vxSwapTensorHandle( tensor, ptr, NULL ); + status |= vxFlushHandle( (vx_reference)tensor ); + } + else + { + status = vsi_nn_copy_tensor_patch(tensor, &attr, data, VX_WRITE_ONLY); + } + + _try_set_const_raw_tensor(tensor, attr); + + return status; +} /* vsi_nn_CopyDataToRawTensor() */ + +static vx_tensor _create_const_raw_tensor + ( + vsi_nn_graph_t * graph, + uint8_t * data, + vsi_nn_tensor_attr_t attr + ) +{ + vx_tensor tensor = NULL; + vx_tensor_create_params_t params; + float * scales = NULL; + + memset( ¶ms, 0, sizeof( vx_tensor_create_params_t ) ); + params.num_of_dims = attr.dim_num; + params.sizes = attr.size; + params.data_format = (vsi_enum)attr.dtype.vx_type; + params.quant_format = (vsi_enum)attr.dtype.qnt_type; + switch( attr.dtype.qnt_type ) + { + case VSI_NN_QNT_TYPE_DFP: + params.quant_data.dfp.fixed_point_pos = (uint8_t)attr.dtype.fl; + break; + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + params.quant_data.affine.scale = attr.dtype.scale; + params.quant_data.affine.zeroPoint = (int32_t)attr.dtype.zero_point; + break; + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: +#ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT + // This is a hack that driver doesn't support const scale + scales = (float *)malloc(sizeof(float) * attr.dtype.scale_dim); + memcpy(scales, attr.dtype.scales, attr.dtype.scale_dim * sizeof(float)); + params.quant_data.affinePerChannel.channelDim = attr.dtype.channel_dim; + params.quant_data.affinePerChannel.scaleCount = attr.dtype.scale_dim; + params.quant_data.affinePerChannel.scales = scales; + params.quant_data.affinePerChannel.zeroPoint = NULL; + params.quant_data.affinePerChannel.zeroPointCount = 0; + break; +#else + VSILOGE( "can't support qnt_type VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC." ); +#endif + default: + break; + } + + if( TRUE == attr.is_created_from_handle ) + { + vx_tensor_addressing addr; + uint32_t stride_size[VSI_NN_MAX_DIM_NUM]; + uint32_t buf_sz; + + buf_sz = vsi_nn_GetStrideSize( &attr, stride_size ); + if( buf_sz > 0 ) + { + uint32_t align_start_size = graph->handle_manager.align_start_size; + uint32_t align_block_size = graph->handle_manager.align_block_size; + if (data == NULL) + { + data = vsi_nn_MallocAlignedBuffer(buf_sz, align_start_size, + align_block_size); + attr.is_handle_malloc_by_ovxlib = TRUE; + } + else + { + attr.is_handle_malloc_by_ovxlib = FALSE; + if (!vsi_nn_IsBufferAligned(data, align_start_size)) + { + VSILOGE( "vsi_nn_IsBufferAligned is FALSE." ); + if( scales ) + { + free( scales ); + } + return NULL; + } + } + if( data ) + { + addr = vxCreateTensorAddressing(graph->ctx->c, + attr.size, stride_size, (vx_uint8)attr.dim_num); +#ifdef VX_13_NN_COMPATIBLITY + tensor = vxCreateTensorFromHandle2(graph->ctx->c, + ¶ms, sizeof(vx_tensor_create_params_t), + addr, data, VX_MEMORY_TYPE_HOST); +#else + tensor = vxCreateTensorFromHandle(graph->ctx->c, + ¶ms, sizeof(vx_tensor_create_params_t), + addr, data, VX_MEMORY_TYPE_HOST); +#endif + //memset(data, 0x5A, buf_sz); + vxReleaseTensorAddressing( &addr ); + vxFlushHandle( (vx_reference)tensor ); + } + } + } + else if( FALSE == attr.vtl ) + { + tensor = vxCreateTensor2( graph->ctx->c, + ¶ms, sizeof( vx_tensor_create_params_t ) ); + } + else + { + tensor = vxCreateVirtualTensor2( graph->g, + ¶ms, sizeof( vx_tensor_create_params_t ) ); + } + if( NULL == tensor ) + { + VSILOGE( "Create vx tensor fail." 
); + } + if( scales ) + { + free( scales ); + } + + return tensor; +} /* _create_const_raw_tensor() */ + +vx_tensor vsi_nn_CreateRawTensorFromData + ( + vsi_nn_graph_t * graph, + uint8_t * data, + vsi_nn_tensor_attr_t * attr + ) +{ + vsi_status status; + vx_tensor tensor; + + status = VSI_FAILURE; + tensor = NULL; + + if( NULL == graph || NULL == data || NULL == attr ) + { + return NULL; + } + + tensor = _create_const_raw_tensor( graph, data, *attr ); + + status = vsi_nn_CopyDataToRawTensor( graph, tensor, data, *attr ); + + if( VSI_SUCCESS != status ) + { + VSILOGE("Create tensor from data fail."); + if( NULL != tensor ) + { + vxReleaseTensor( &tensor ); + tensor = NULL; + } + } + return tensor; +}/* vsi_nn_CreateRawTensorFromData() */ + +static void _convert_const_I8toU8 + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_id_t id + ) +{ + uint8_t * data = NULL; + vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); + vsi_nn_tensor_attr_t *attr = &tensor->attr; + uint32_t sz = 0; + uint32_t i = 0; + + sz = vsi_nn_GetElementNum( tensor ); + + data = vsi_nn_ConvertTensorToData( graph, tensor ); + if( NULL == data ) + { + VSILOGE( "Convert data fail." ); + return ; + } + + for( i = 0; i < sz; i++ ) + { + data[i] = data[i] ^ 0x80; + } + + attr->dtype.vx_type = VSI_NN_TYPE_UINT8; + attr->dtype.zero_point += 128; + + if ( tensor->t ) vxReleaseTensor(&tensor->t); + tensor->t = vsi_nn_CreateRawTensorFromData(graph, data, attr); +}/* _convert_const_I8toU8() */ + +static vsi_status _convert_graph_const_tensor + ( + vsi_nn_graph_t* graph + ) +{ + vsi_status status = VSI_SUCCESS; + uint32_t node_num = graph->node_num; + vsi_nn_node_t* node = NULL; + uint32_t i = 0; + uint32_t j = 0; + + for(i = 0; i < node_num; i++) + { + node = vsi_nn_GetNode(graph, i); + for(j = 0; j < node->input.num; j++) + { + vsi_nn_tensor_id_t id = node->input.tensors[j]; + vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); + + if (_is_asymm_int8_const_tensor(tensor)) + { + _convert_const_I8toU8(graph, id); + } + } + } + + return status; +} /* _convert_graph_const_tensor() */ + +static vsi_status _convert_virtual_tensor_attr + ( + vsi_nn_tensor_t * tensor + ) +{ + if (_is_asymm_int8_virtual_tensor(tensor)) + { + tensor->attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + tensor->attr.dtype.zero_point += 128; + } + + return VSI_SUCCESS; +}/* _convert_virtual_tensor_attr() */ + +static vsi_status _convert_graph_virtual_tensor + ( + vsi_nn_graph_t* graph + ) +{ + vsi_status status = VSI_FAILURE; + uint32_t node_num = graph->node_num; + vsi_nn_node_t* node = NULL; + uint32_t i = 0; + uint32_t j = 0; + + for(i = 0; i < node_num; i++) + { + node = vsi_nn_GetNode(graph, i); + for(j = 0; j < node->input.num; j++) + { + vsi_nn_tensor_id_t id = node->input.tensors[j]; + vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); + + status = _convert_virtual_tensor_attr(tensor); + } + + for(j = 0; j < node->output.num; j++) + { + vsi_nn_tensor_id_t id = node->output.tensors[j]; + vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); + + status = _convert_virtual_tensor_attr(tensor); + } + } + + return status; +} /* _convert_graph_virtual_tensor() */ + +static vsi_status _graph_optimization_convert_int8_to_uint8 +( + vsi_nn_graph_t* graph, + vsi_bool *dirty +) +{ + vsi_status status = VSI_FAILURE; + status = _convert_graph_virtual_tensor(graph); + TEST_CHECK_STATUS(status, final); + + status = _convert_graph_const_tensor(graph); + TEST_CHECK_STATUS(status, final); + + status = _add_graph_data_convert(graph, dirty); + TEST_CHECK_STATUS(status, 
final); + +final: + return status; +}/* _graph_optimization_convert_int8_to_uint8() */ + +vsi_status vsi_nn_OptimizeGraph + ( + vsi_nn_graph_t* graph, + vsi_bool *dirty + ) +{ + vsi_status status = VSI_FAILURE; + + status = _graph_optimization_convert_int8_to_uint8(graph, dirty); + TEST_CHECK_STATUS(status, final); + +final: + return status; +} /* vsi_nn_OptimizeGraph() */ + diff --git a/src/tim/vx/internal/src/vsi_nn_internal_node.c b/src/tim/vx/internal/src/vsi_nn_internal_node.c new file mode 100644 index 0000000..2ad5bc0 --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_internal_node.c @@ -0,0 +1,660 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
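
The graph-optimization file added above rewrites asymmetric INT8 tensors as UINT8: virtual and const tensors are retyped in place (const data is remapped with `data[i] ^ 0x80` and `zero_point += 128` in `_convert_const_I8toU8()`), while graph inputs and outputs get a DATACONVERT node spliced in by `_add_dataconvert_node()` so external buffers can keep their original INT8 layout. The standalone sketch below shows why this rewrite is value-preserving; it is not part of the commit, and the scale/zero-point values are invented for illustration.

```c
/*
 * Standalone sketch, not part of this commit: the INT8 -> UINT8 rewrite keeps
 * scale * (q - zero_point) unchanged, because XOR-ing a byte with 0x80 equals
 * adding 128 in two's complement while the zero point also moves up by 128.
 * Scale and zero-point values below are illustrative only.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static float dequant(int32_t q, float scale, int32_t zero_point)
{
    return scale * (float)(q - zero_point);
}

int main(void)
{
    const float   scale = 0.05f;
    const int32_t zp_i8 = -3;            /* illustrative INT8 zero point        */
    const int32_t zp_u8 = zp_i8 + 128;   /* what the optimizer writes back      */
    const int8_t  q_i8  = -42;

    uint8_t q_u8 = (uint8_t)q_i8 ^ 0x80; /* same trick as _convert_const_I8toU8() */

    assert(q_u8 == (uint8_t)(q_i8 + 128));
    printf("int8  : %f\n", dequant(q_i8, scale, zp_i8));
    printf("uint8 : %f\n", dequant(q_u8, scale, zp_u8));
    return 0;
}
```

Both prints give the same real value, which is why only the graph boundary tensors need an explicit DATACONVERT while everything internal can simply be retyped.
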
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_types.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_rnn.h" +#include "vsi_nn_test.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_vdata.h" +#include "utils/vsi_nn_map.h" + +/********************************************************** +* MACROS +**********************************************************/ +#define LINKLIST_APPEND( _HEAD, _ITEM ) do { \ + vsi_nn_LinkListPushEnd((vsi_nn_link_list_t **)&(_HEAD), \ + (vsi_nn_link_list_t *)(_ITEM) ); } while( 0 ) + +#define WKSP(_NODE_PTR) ((vsi_nn_internal_node_wksp_t *) \ + ((_NODE_PTR)->internal_node_wksp)) + +/********************************************************** +* LOCAL FUNCTIONS +**********************************************************/ +static vsi_nn_internal_node_t* vsi_nn_internal_create_node + ( + vsi_nn_graph_t* graph, + vsi_nn_op_t op, + uint32_t input_num, + uint32_t output_num + ) +{ + vsi_nn_internal_node_t* node = NULL; + vsi_nn_node_t* n = NULL; + vsi_nn_tensor_t** inputs = NULL; + vsi_nn_tensor_t** outputs = NULL; + + node = (vsi_nn_internal_node_t *)malloc( sizeof(vsi_nn_internal_node_t) ); + if( node ) + { + memset(node, 0x00, sizeof(vsi_nn_internal_node_t) ); + + n = vsi_nn_NewNode( graph, op, input_num, output_num ); + if( n ) + { + inputs = (vsi_nn_tensor_t **)malloc( n->input.num * sizeof(vsi_nn_tensor_t*)); + if( inputs ) + { + memset( inputs, 0x00, ( n->input.num * sizeof(vsi_nn_tensor_t*)) ); + } + outputs = (vsi_nn_tensor_t **)malloc( n->output.num * sizeof(vsi_nn_tensor_t*)); + if( outputs ) + { + memset( outputs, 0x00, ( n->output.num * sizeof(vsi_nn_tensor_t*)) ); + } + } + } + + if( node && n && inputs && outputs ) + { + node->node = n; + node->inputs = inputs; + node->outputs = outputs; + + return node; + } + else + { + if(n) + { + vsi_nn_ReleaseNode(&n); + n = NULL; + } + if(inputs) + { + free(inputs); + inputs = NULL; + } + if(outputs) + { + free(outputs); + outputs = NULL; + } + vsi_nn_internal_release_node( &node ); + return NULL; + } +} /* vsi_nn_internal_create_node() */ + +static vsi_nn_internal_tensor_t* vsi_nn_internal_create_tensor + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_attr_t* attr, + float default_value + ) +{ + vsi_nn_internal_tensor_t* tensor = NULL; + + if( !graph || !attr ) + { + return tensor; + } + + tensor = (vsi_nn_internal_tensor_t *)malloc( sizeof(vsi_nn_internal_tensor_t) ); + if( tensor ) + { + memset( tensor, 0x00, sizeof(vsi_nn_internal_tensor_t) ); + if( attr->is_const ) + { + tensor->t = vsi_nn_CreateTensorWithDefault( graph, attr, default_value ); + } + else + { + tensor->t = vsi_nn_CreateTensor( graph, attr ); + } + + if( !tensor->t ) + { + vsi_nn_internal_release_tensor( &tensor ); + } + } + + return tensor; +} /* vsi_nn_internal_create_tensor() */ + +/********************************************************** +* PUBLIC FUNCTIONS +**********************************************************/ +vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor + ( + vsi_nn_node_t* node, + vsi_nn_tensor_attr_t* input_attr, + vsi_nn_tensor_attr_t* weight_attr + ) +{ + vsi_nn_tensor_attr_t attr; + float scale = 1.0f; + int8_t fl = 0; + + memset(&attr, 0x0, sizeof(vsi_nn_tensor_attr_t)); + + /* create zero bias for NN/TP */ + attr.size[0] = 
weight_attr->size[1]; + attr.dim_num = 1; + attr.vtl = FALSE; + attr.is_const = TRUE; + + if(input_attr->dtype.qnt_type != VSI_NN_QNT_TYPE_NONE && + input_attr->dtype.qnt_type != weight_attr->dtype.qnt_type) + { + VSILOGE("input qnt_type[%d] != weight qnt_type[%d]", + input_attr->dtype.qnt_type, weight_attr->dtype.vx_type); + return NULL; + } + + if (weight_attr->dtype.qnt_type == VSI_NN_QNT_TYPE_NONE) + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + } + else + { + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + } + + switch(input_attr->dtype.qnt_type) + { + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + scale = input_attr->dtype.scale; + break; + + case VSI_NN_QNT_TYPE_DFP: + fl = input_attr->dtype.fl; + break; + + case VSI_NN_QNT_TYPE_NONE: + scale = 1.0f; + fl = 0; + break; + + default: + VSILOGE("Unsupported quantization type: %d", input_attr->dtype.qnt_type); + break; + } + + switch(weight_attr->dtype.qnt_type) + { + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + attr.dtype.scale = weight_attr->dtype.scale * scale; + attr.dtype.zero_point = 0; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; + break; + + case VSI_NN_QNT_TYPE_DFP: + attr.dtype.fl = weight_attr->dtype.fl + fl; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_DFP; + break; + + case VSI_NN_QNT_TYPE_NONE: + break; + + default: + VSILOGE("Unsupported quantization type: %d", weight_attr->dtype.qnt_type); + break; + } + + return vsi_nn_internal_new_tensor(node, &attr, 0.0f); +} /* vsi_nn_internal_create_zero_bias_tensor() */ + +vsi_status vsi_nn_internal_deinit_node + ( + vsi_nn_node_t* node + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_internal_node_t* curr = NULL; + + curr = WKSP(node)->nodes; + while( NULL != curr ) + { + VSILOGD("Optimize node uid[%u] sub_uid[%u] op[%s]", + node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op)); + + status = vsi_nn_OpDeinit( curr->node->op, curr->node ); + if( VSI_SUCCESS != status ) + { + VSILOGE("op_optimize fail %d", curr->node->op); + break; + } + + curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)curr ); + } + + return status; +} /* vsi_nn_internal_deinit_node() */ + +vsi_status vsi_nn_internal_deinit_node_wksp + ( + vsi_nn_node_t* node + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_internal_node_t* head = NULL; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* tensor_head = NULL; + vsi_nn_internal_tensor_t* tensor_curr = NULL; + + if( node && node->internal_node_wksp ) + { + head = WKSP(node)->nodes; + while( NULL != head ) + { + curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListPopStart( + (vsi_nn_link_list_t **)&head ); + vsi_nn_internal_release_node( &curr ); + } + + tensor_head = WKSP(node)->tensors; + while( NULL != tensor_head ) + { + tensor_curr = (vsi_nn_internal_tensor_t *)vsi_nn_LinkListPopStart( + (vsi_nn_link_list_t **)&tensor_head ); + vsi_nn_internal_release_tensor( &tensor_curr ); + } + + free( node->internal_node_wksp ); + node->internal_node_wksp = NULL; + } + + return status; +} /* vsi_nn_internal_deinit_node_wksp() */ + +void vsi_nn_internal_dump_node_output + ( + vsi_nn_graph_t* graph, + const char* path, + const char* filename_prefix, + vsi_bool force_fp32, + vsi_nn_node_t* node + ) +{ +#define _MAX_TENSOR_NAME_SZ (1024) +#define _SHAPE_BUF_SZ (64) + char shape[_SHAPE_BUF_SZ] = { 0 }; + char filename[_MAX_TENSOR_NAME_SZ] = { 0 }; + const char* op_name; + uint32_t o; + vsi_nn_internal_node_t* head = ((vsi_nn_internal_node_wksp_t *)node->internal_node_wksp)->nodes; + while( NULL != head ) + { + vsi_nn_internal_node_t* 
curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListPopStart( + (vsi_nn_link_list_t **)&head ); + if( curr ) + { + if (curr->node->internal_node_wksp) + { + vsi_nn_internal_dump_node_output(graph, path, filename_prefix, + force_fp32, curr->node); + } + else + { + for( o = 0; o < curr->node->output.num; o++ ) + { + vsi_nn_tensor_t* tensor = curr->outputs[o]; + if( tensor ) + { + if( TRUE == tensor->attr.vtl ) + { + VSILOGW("Uid %u node's tensor %d is virtual", + curr->node->uid, o); + continue; + } + // TODO: Support different tensor format + vsi_nn_ShapeToString( tensor->attr.size, tensor->attr.dim_num, + shape, _SHAPE_BUF_SZ, FALSE ); + op_name = vsi_nn_OpGetName( curr->node->op ); + snprintf( filename, _MAX_TENSOR_NAME_SZ, + "%s/%s%s_uid_%u_sub_%u_t_%u_s_%s.txt", path, filename_prefix, + op_name, node->uid, curr->node->uid, o, shape); + if( FALSE == force_fp32 ) + { + vsi_nn_SaveTensorToText( graph, tensor, filename, NULL ); + } + else + { + vsi_nn_SaveTensorToTextByFp32( graph, tensor, filename, NULL ); + } + } + } + } + } + } +} /* vsi_nn_internal_dump_node_output() */ + +vsi_nn_internal_node_t* vsi_nn_internal_get_node_by_uid + ( + vsi_nn_node_t* node, + int uid + ) +{ + vsi_nn_internal_node_t* head = NULL; + vsi_nn_internal_node_t* curr = NULL; + + if( node && node->internal_node_wksp ) + { + head = WKSP(node)->nodes; + while( NULL != head ) + { + curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListPopStart( + (vsi_nn_link_list_t **)&head ); + if( curr->node->uid == (uint32_t)uid ) + { + return curr; + } + } + } + + return NULL; +} /* vsi_nn_internal_get_node_by_uid() */ + +vsi_status vsi_nn_internal_init_node_wksp + ( + vsi_nn_node_t* node + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_internal_node_wksp_t* wksp = NULL; + + if( node->internal_node_wksp ) + { + vsi_nn_internal_deinit_node_wksp( node ); + } + + wksp = (vsi_nn_internal_node_wksp_t *)malloc( sizeof( vsi_nn_internal_node_wksp_t ) ); + if( wksp ) + { + memset( wksp, 0x00, sizeof( vsi_nn_internal_node_wksp_t ) ); + wksp->curr_node_uid = 1; + + node->internal_node_wksp = wksp; + + status = VSI_SUCCESS; + } + + return status; +} /* vsi_nn_internal_init_node_wksp() */ + +void vsi_nn_internal_init_tensor_attr + ( + vsi_nn_tensor_attr_t* attr, + const vsi_nn_dtype_t* dtype, + vsi_bool use_virtual_tensor + ) +{ + memset(attr, 0x00, sizeof(vsi_nn_tensor_attr_t)); + + //memset(attr->size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + attr->dim_num = VSI_NN_DIM_AUTO; + attr->vtl = use_virtual_tensor; + attr->is_const = FALSE; + + if( dtype->qnt_type == VSI_NN_QNT_TYPE_NONE && + ( dtype->vx_type != VSI_NN_TYPE_FLOAT16 && + dtype->vx_type != VSI_NN_TYPE_FLOAT32 ) ) + { + attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr->dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } + else + { + memcpy(&attr->dtype, dtype, sizeof(vsi_nn_dtype_t)); + } +} /* vsi_nn_internal_init_tensor_attr() */ + +vsi_nn_internal_node_t* vsi_nn_internal_new_node + ( + vsi_nn_node_t* node, + vsi_nn_op_t op, + uint32_t input_num, + uint32_t output_num + ) +{ + vsi_nn_internal_node_t* inode = NULL; + + inode = vsi_nn_internal_create_node( node->graph, + op, input_num, output_num ); + inode->node->attr.const_tensor_preload_type = node->attr.const_tensor_preload_type; + return inode; +} /* vsi_nn_internal_new_node() */ + +void* vsi_nn_internal_new_node_param + ( + vsi_nn_internal_node_t* inode, + size_t size /* in bytes */ + ) +{ + vsi_nn_internal_node_param_t* param = NULL; + size_t buf_sz = sizeof(vsi_nn_internal_node_param_t) + size; + void* ptr = NULL; + if( !inode ) + { + 
return ptr; + } + + param = (vsi_nn_internal_node_param_t *)malloc(buf_sz); + if( param ) + { + memset( param, 0x00, buf_sz ); + ptr = (void *)(¶m->param[0]); + LINKLIST_APPEND(inode->param, param); + } + + return ptr; +} /* vsi_nn_internal_new_node_param() */ + +vsi_nn_internal_tensor_t* vsi_nn_internal_new_tensor + ( + vsi_nn_node_t* node, + vsi_nn_tensor_attr_t* attr, + float default_value + ) +{ + vsi_nn_internal_tensor_t* tensor = NULL; + + tensor = vsi_nn_internal_create_tensor( node->graph, + attr, default_value ); + if( tensor ) + { + LINKLIST_APPEND( WKSP(node)->tensors, tensor ); + } + + return tensor; +} /* vsi_nn_internal_new_tensor() */ + +vsi_status vsi_nn_internal_release_node + ( + vsi_nn_internal_node_t** node + ) +{ + if( node && *node ) + { + vsi_nn_internal_node_t* ptr = *node; + + if( ptr->inputs && ptr->node->input.num ) + { + free( ptr->inputs ); + ptr->inputs = NULL; + } + if( ptr->outputs && ptr->node->output.num ) + { + free( ptr->outputs ); + ptr->outputs = NULL; + } + if( ptr->param ) + { + vsi_nn_LinkListDeinit((vsi_nn_link_list_t *)(ptr->param), NULL); + } + if( ptr->node ) + { + vsi_nn_ReleaseNode( &ptr->node ); + } + + free( ptr ); + *node = NULL; + } + + return VSI_SUCCESS; +} /* vsi_nn_internal_release_node() */ + +vsi_status vsi_nn_internal_release_tensor + ( + vsi_nn_internal_tensor_t** tensor + ) +{ + if( tensor && *tensor ) + { + vsi_nn_internal_tensor_t* ptr = *tensor; + + if( ptr->t ) + { + vsi_nn_ReleaseTensor( &ptr->t ); + } + free( ptr ); + *tensor = NULL; + } + + return VSI_SUCCESS; +} /* vsi_nn_internal_release_tensor() */ + +vsi_bool vsi_nn_internal_check_node + ( + vsi_nn_internal_node_t* inode + ) +{ + vsi_bool retn = TRUE; + + retn = vsi_nn_OpCheck( inode->node->op, inode->node, inode->inputs, inode->outputs ); + + return retn; +} /* vsi_nn_internal_setup_node() */ + +vsi_bool vsi_nn_internal_setup_node + ( + vsi_nn_node_t* node, + vsi_nn_internal_node_t* inode + ) +{ + vsi_bool retn = TRUE; + + retn = vsi_nn_OpSetup( inode->node->op, inode->node, inode->inputs, inode->outputs ); + if( retn ) + { + inode->node->uid = WKSP(node)->curr_node_uid; + LINKLIST_APPEND( WKSP(node)->nodes, inode ); + WKSP(node)->curr_node_uid++; + + retn = vsi_nn_internal_check_node(inode); + } + + return retn; +} /* vsi_nn_internal_setup_node() */ + +vsi_status vsi_nn_internal_compute_node + ( + vsi_nn_node_t* node + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_internal_node_t* curr = NULL; + uint32_t j = 0; + + curr = WKSP(node)->nodes; + while( NULL != curr ) + { + for ( j = 0; j < curr->node->output.num; j++ ) + { + if( NULL == curr->outputs[j] || NULL != curr->outputs[j]->t ) + continue; + vsi_nn_TensorReinit( node->graph, curr->outputs[j] ); + } + + VSILOGD("Compute node uid[%u] sub_uid[%u] op[%s]", + node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op)); + status = vsi_nn_OpCompute( curr->node->op, curr->node, curr->inputs, curr->outputs ); + if( VSI_SUCCESS != status ) + { + VSILOGE("op_compute fail %d", curr->node->op); + break; + } + + status = vsi_nn_update_node_attr(curr->node); + if( VSI_SUCCESS != status ) + { + VSILOGW("Update node attribute fail"); + } + + curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)curr ); + } + + return status; +} /* vsi_nn_internal_compute_node() */ + +vsi_status vsi_nn_internal_optimize_node + ( + vsi_nn_node_t* node, + vsi_nn_opt_direction_e direction + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_internal_node_t* curr = NULL; + + curr = WKSP(node)->nodes; + while( NULL != curr ) + 
{ + VSILOGD("Optimize node uid[%u] sub_uid[%u] op[%s]", + node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op)); + + status = vsi_nn_OpOptimize( curr->node->op, curr->node, + curr->inputs, curr->outputs, direction ); + if( VSI_SUCCESS != status ) + { + VSILOGE("op_optimize fail %d", curr->node->op); + break; + } + + curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)curr ); + } + + return status; +} /* vsi_nn_internal_optimize_node() */ + +/* EOF */ diff --git a/src/tim/vx/internal/src/vsi_nn_log.c b/src/tim/vx/internal/src/vsi_nn_log.c new file mode 100644 index 0000000..c818463 --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_log.c @@ -0,0 +1,81 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include +#include + +#include "vsi_nn_log.h" +#include "vsi_nn_types.h" + +static vsi_bool _check_log_level + ( + vsi_nn_log_level_e level + ) +{ + char *env_level_s; + static vsi_nn_log_level_e env_level = VSI_NN_LOG_UNINIT; + + if(env_level == VSI_NN_LOG_UNINIT) + { + env_level_s = getenv("VSI_NN_LOG_LEVEL"); + if(env_level_s) + { + env_level = (vsi_nn_log_level_e)atoi(env_level_s); + } + else + { + env_level = VSI_NN_LOG_WARN; + } + } + + if(env_level >= level) + { + return TRUE; + } + + return FALSE; +} + +void vsi_nn_LogMsg + ( + vsi_nn_log_level_e level, + const char *fmt, + ... 
+ ) +{ + char arg_buffer[VSI_NN_MAX_DEBUG_BUFFER_LEN] = {0}; + va_list arg; + + if(_check_log_level(level) == FALSE) + { + return ; + } + + va_start(arg, fmt); + vsnprintf(arg_buffer, VSI_NN_MAX_DEBUG_BUFFER_LEN, fmt, arg); + va_end(arg); + + fprintf(stderr, "%s\n", arg_buffer); +} /* vsi_nn_LogMsg() */ diff --git a/src/tim/vx/internal/src/vsi_nn_node.c b/src/tim/vx/internal/src/vsi_nn_node.c new file mode 100644 index 0000000..0c05bb2 --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_node.c @@ -0,0 +1,245 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_types.h" +#include "utils/vsi_nn_util.h" + +vsi_nn_node_t * vsi_nn_NewNode + ( + vsi_nn_graph_t * graph, + vsi_nn_op_t op, + uint32_t input_num, + uint32_t output_num + ) +{ + vsi_nn_node_t * node; + + node = NULL; + if(NULL == graph || FALSE == vsi_nn_OpIsValid(op)) + { + VSILOGE("Create node %s. 
fail", vsi_nn_OpGetName(op)); + return NULL; + } + + node = (vsi_nn_node_t *)malloc( sizeof( vsi_nn_node_t ) ); + if( NULL != node ) + { + memset( node, 0, sizeof( vsi_nn_node_t ) ); + node->graph = graph; + node->op = op; + node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO; + node->vx_param.down_scale_size_rounding = VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR; + + /* init op */ + vsi_nn_OpInit( node->op, node ); + + if( 0 == input_num && 0 == output_num ) + { + vsi_nn_OpGetIoNum( op, node, &input_num, &output_num ); + } + + /* init output struct */ + node->output.num = output_num; + node->output.tensors = (vsi_nn_tensor_id_t *) malloc( + output_num * sizeof( vsi_nn_tensor_id_t ) ); + vsi_nn_InitTensorsId( node->output.tensors, output_num ); + + /* init input struct */ + node->input.num = input_num; + node->input.tensors = (vsi_nn_tensor_id_t *) malloc( + input_num * sizeof( vsi_nn_tensor_id_t ) ); + vsi_nn_InitTensorsId( node->input.tensors, input_num ); + node->attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE; + node->attr.enable_op_constraint_check = TRUE; + } + + node->uid = VSI_NN_NODE_UID_NA; + return node; +} /* vsi_nn_NewNode() */ + +/* +* Deprecated: Use vsi_nn_NewNode() instead +*/ +vsi_nn_node_t * vsi_nn_CreateNode + ( + vsi_nn_graph_t * graph, + vsi_nn_op_t op + ) +{ + return vsi_nn_NewNode( graph, op, 0, 0 ); +} /* vsi_nn_CreateNode() */ + +void vsi_nn_ReleaseNode + ( + vsi_nn_node_t ** node + ) +{ + vsi_nn_node_t * ptr; + ptr = *node; + if( NULL != node && NULL != *node ) + { + vsi_nn_OpDeinit( ptr->op, ptr ); + if( NULL != ptr->input.tensors ) + { + free( ptr->input.tensors ); + } + if( NULL != ptr->output.tensors ) + { + free( ptr->output.tensors ); + } + free( ptr ); + *node = NULL; + } +} /* vsi_nn_ReleaseNode() */ + +vsi_status vsi_nn_SetNodeInputsAndOutputs + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t * const inputs[], + int input_num, + vsi_nn_tensor_t * const outputs[], + int output_num + ) +{ + vsi_status status = VSI_SUCCESS; + int i; + vsi_nn_tensor_id_t id; + if( !node ) + { + return VSI_FAILURE; + } + if( inputs && input_num > 0 ) + { + assert(input_num <= (int)node->input.num); + for(i = 0; i < input_num; i ++) + { + id = vsi_nn_get_tensor_id(node->graph, inputs[i]); + node->input.tensors[i] = id; + } + } + if( outputs && output_num > 0 ) + { + assert(output_num <= (int)node->output.num); + for(i = 0; i < output_num; i ++) + { + id = vsi_nn_get_tensor_id(node->graph, outputs[i]); + node->output.tensors[i] = id; + } + } + return status; +} /* vsi_nn_SetNodeInputsAndOutputs() */ + +void vsi_nn_PrintNode + ( + vsi_nn_node_t * node, + vsi_nn_node_id_t id + ) +{ +#define _MAX_PRINT_BUF_SZ (256) + uint32_t i; + int count; + char buf[_MAX_PRINT_BUF_SZ]; + + if( NULL == node ) + { + return; + } + count = snprintf( &buf[0], _MAX_PRINT_BUF_SZ, "%s", "[in:" ); + for( i = 0; i < node->input.num; i ++ ) + { + if( count >= _MAX_PRINT_BUF_SZ ) + { + break; + } + count += snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count, + " %d,", node->input.tensors[i] ); + } + count --; + count += snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count, + "%s", " ], [out:" ); + for( i = 0; i < node->output.num; i ++ ) + { + if( count >= _MAX_PRINT_BUF_SZ ) + { + break; + } + count += snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count, + " %d,", node->output.tensors[i] ); + } + count --; + count += snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count, + "%s", " ]" ); + VSILOGI( "(%16s)node[%u] %s [%08x]", 
vsi_nn_OpGetName(node->op), id, buf, node->n ); +} /* vsi_nn_PrintNode() */ + +vsi_status vsi_nn_update_node_attr + ( + vsi_nn_node_t *node + ) +{ + vsi_status status = VSI_FAILURE; + +#if(defined(VX_PRELOAD_CONST_TENSOR_SUPPORT) && VX_PRELOAD_CONST_TENSOR_SUPPORT) + if(node) + { + /* some node don't have a `n`, skip it */ + status = VSI_SUCCESS; + if(node->n) + { + vx_enum preload_type = VX_PRELOAD_NULL; + switch(node->attr.const_tensor_preload_type) + { + default: + case VSI_NN_NODE_PRELOAD_NONE: + preload_type = VX_PRELOAD_NULL; + break; + + case VSI_NN_NODE_PRELOAD_VIPSRAM: + preload_type = VX_PRELOAD_CONST_TENSOR_VIPSRAM; + break; + + case VSI_NN_NODE_PRELOAD_AXISRAM: + preload_type = VX_PRELOAD_CONST_TENSOR_AXISRAM; + break; + } + status = vxSetNodeAttribute(node->n, VX_NODE_ATTRIBUTE_CONST_TENSOR_CACHE, + &preload_type, sizeof(preload_type)); + } + } +#else + status = VSI_SUCCESS; +#endif + + return status; +} /* vsi_nn_update_node_attr() */ diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c new file mode 100644 index 0000000..5c5ffd6 --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c @@ -0,0 +1,277 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
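
For orientation, the sketch below shows how the node API introduced in vsi_nn_node.c is typically driven: create a node, then wire its tensor ids. This is a hypothetical usage fragment, not code from this commit; the umbrella header name, the `VSI_NN_OP_RELU` enumerator, and the pre-existing graph and tensor objects are assumed to come from the rest of ovxlib.

```c
/*
 * Hypothetical usage sketch only. The header name, the op enumerator and the
 * surrounding graph/tensor setup are assumptions, not taken from this diff.
 */
#include "vsi_nn_pub.h"

static vsi_status add_relu_node(vsi_nn_graph_t *graph,
                                vsi_nn_tensor_t *t_in,
                                vsi_nn_tensor_t *t_out)
{
    /* Passing 0/0 lets vsi_nn_NewNode() query the op's default port counts
       through vsi_nn_OpGetIoNum(), as shown in vsi_nn_node.c above. */
    vsi_nn_node_t *node = vsi_nn_NewNode(graph, VSI_NN_OP_RELU, 0, 0);

    if (node == NULL)
    {
        return VSI_FAILURE;
    }

    /* Record which tensor ids feed and leave the node. */
    return vsi_nn_SetNodeInputsAndOutputs(node, &t_in, 1, &t_out, 1);
}
```

The 0/0 convention keeps call sites from hard-coding per-operator port counts, which is why most ops can be added with a single helper like this.
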
+* +*****************************************************************************/ +#include + +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_types.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_assert.h" +#include "utils/vsi_nn_util.h" + +typedef void ( *_node_template )( vsi_nn_node_t * ); + +static void _template_pool( vsi_nn_node_t* node ); + +static void _template_conv2d( vsi_nn_node_t* node ); + +static void _template_conv_relu( vsi_nn_node_t* node ); + +static void _template_conv_relu_pool( vsi_nn_node_t* node ); + +static void _template_fcl( vsi_nn_node_t* node ); + +static void _template_fcl_relu( vsi_nn_node_t* node ); + +static void _template_lrn( vsi_nn_node_t* node ); + +static _node_template s_template[] = +{ + /* ADD */ NULL, + /* MULTIPLY */ NULL, + /* CONV2D */ _template_conv2d, + /* CONV_RELU */ _template_conv_relu, + /* CONV_RELU_POOL */ _template_conv_relu_pool, + /* FCL */ _template_fcl, + /* FCL_RELU */ _template_fcl_relu, + /* SOFTMAX */ NULL, + /* POOL */ _template_pool, + /* LEAKY_RELU */ NULL, + /* LRN */ _template_lrn, + /* CONCAT */ NULL, + /* SPLIT */ NULL, + /* NOOP */ NULL, + /* ROI_POOL */ NULL, + /* BATCH_NORM */ NULL, + /* PROPOSAL */ NULL, + /* DECONVOLUTION */ NULL, + /* RESHAPE */ NULL, + /* PERMUTE */ NULL, + /* PRELU */ NULL, + /* UPSAMPLE */ NULL, + /* RELU */ NULL, + /* RELUN */ NULL, + /* LSTM */ NULL, + /* REORG */ NULL, + /* VARIABLE */ NULL, + /* L2_NORMALIZE */ NULL, + /* FCL2 */ NULL, + /* POOLWITHARGMAX */ NULL, + /* ARGMAX */ NULL, + /* MAXIMUM */ NULL, + /* L2NORMALIZESCALE */ NULL, + /* CROP */ NULL, + /* SUBTRACT */ NULL, + /* RELU6 */ NULL, + /* SIGMOID */ NULL, + /* TANH */ NULL, + /* SQRT */ NULL, + /* RSQRT */ NULL, + /* SOFTRELU */ NULL, + /* DIVIDE */ NULL, + /* DROPOUT */ NULL, + /* SHUFFLECHANNEL */ NULL, + /* RESIZE */ NULL, + /* REVERSE */ NULL, + /* DEPTH2SPACE */ NULL, + /* SPACE2DEPTH */ NULL, + /* DATACONVERT */ NULL, + /* SCALE */ NULL, + /* SLICE */ NULL, + /* ELU */ NULL, + /* BATCH2SPACE */ NULL, + /* SPACE2BATCH */ NULL, + /* PAD */ NULL, + /* IMAGEPROCESS */ NULL, + /* MATRIXMUL */ NULL, + /* LSTMUNIT */ NULL, + /* LAYERNORM */ NULL, + /* REDUCE */ NULL, + /* INSTANCENORM */ NULL, + /* TENSORSTACKCONCAT */ NULL, + /* STRIDED_SLICE */ NULL, + /* SIGNALFRAME */ NULL, + /* A_TIMES_B_PLUS_C */ NULL, + /* SVDF */ NULL, + /* ABS */ NULL, + /* CONV1D */ NULL, + /* NBG */ NULL, + /* CONCATSHIFT */ NULL, + /* LRN2 */ _template_lrn, + /* RELATIONALOPS */ NULL, + /* SYNC_HOST */ NULL, + /* POW */ NULL, + /* FOORDIV */ NULL, + /* MINIMUM */ NULL, + /* SPATIAL_TRANSFORMER */ NULL, + /* LOGICAL_OPS */ NULL, + /* SELECT */ NULL, + /* LSTMUNIT_ACTIVATION */ NULL, + /* LSTMUNIT_OVXLIB */ NULL, + /* TENSOR_ADD_MEAN_STDDEV_NORM */ NULL, + /* RELU1 */ NULL, + /* STACK */ NULL, + /* FLOOR */ NULL, + /* SQUARE */ NULL, + /* NEG */ NULL, + /* EXP */ NULL, + /* LSTM_OVXLIB */ NULL, + /* PRE_PROCESS_TENSOR */ NULL, + /* HASHTABLE_LOOKUP */ NULL, + /* EMBEDDING_LOOKUP */ NULL, + /* LSH_PROJECTION */ NULL, + /* RNN */ NULL, + /* CLIP */ NULL, + /* POST_PROCESS */ NULL, + /* PRE_PROCESS_GRAY */ NULL, + /* UNSTACK */ NULL, + /* PRE_PROCESS_RGB */ NULL, + /* PRE_PROCESS */ NULL, + /* ADDN */ NULL, + /* PRE_PROCESS_YUV420 */ NULL, + /* EXTRA_ENDING */ NULL, + /* GATHER */ NULL, + /* TILE */ NULL, + /* GROUPED_CONV2D */ NULL, + /* TOPK */ NULL, + /* PRE_PROCESS_BGRA */ NULL, + /* LOGICAL_NOT */ NULL, + /* SIN */ NULL, + /* LOG */ NULL, + /* ARGMIN */ NULL, + /* ROI_ALIGN */ NULL, + /* HEATMAP_MAX_KEYPOINT 
*/ NULL, + /* AXIS_ALIGNED_BBOX_TRANSFORM */ NULL, + /* BOX_WITH_NMS_LIMIT */ NULL, + /* GENERATE_PROPOSALS */ NULL, + /* DETECTION_POSTPROCESS */ NULL, + /* RANDOM_MULTINOMIAL */ NULL, + /* LOG_SOFTMAX */ NULL, + /* RELU_KERAS */ NULL, + /* GRU_OVXLIB */ NULL, + /* GRUCELL_OVXLIB */ NULL, + /* UNIDIRECTIONAL_SEQUENCE_RNN */ NULL, + /* QUANTIZED_16BIT_LSTM */ NULL, + /* BIDIRECTIONAL_SEQUENCE_RNN */ NULL, + /* BIDIRECTIONAL_SEQUENCE_LSTM */ NULL, + /* RNNCELL_OVXLIB */ NULL, + /* SWISH */ NULL, + /* GATHER_ND */ NULL, + /* CAST */ NULL, + /* LINEAR */ NULL, + /* MOMENTS */ NULL, + /* PRE_PROCESS_YUV444 */ NULL, + /* PRE_PROCESS_NV12 */ NULL, + /* SCATTER_ND */ NULL, + /* DECONVOLUTION1D */ NULL, +}; +//_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c ); + +void vsi_nn_apply_node_attr_template + ( vsi_nn_node_t * node ) +{ + if( node->op >= _cnt_of_array( s_template ) ) + { + VSILOGW( "Unsupport operation id %d.", node->op ); + return; + } + if( NULL != s_template[node->op] ) + { + s_template[node->op]( node ); + } +} /* ovx_apply_node_attr_template() */ + +static void _template_lrn + ( vsi_nn_node_t* node ) +{ + node->nn_param.lrn.bias = 1.0f; + node->nn_param.lrn.type = VX_CONVOLUTIONAL_NETWORK_NORM_ACROSS_MAPS; + node->nn_param.lrn.size = 5; + node->nn_param.lrn.alpha = 0.0001f; + node->nn_param.lrn.beta = 0.75f; +} /* _template_lrn() */ + +static void _template_pool + ( vsi_nn_node_t * node ) +{ + node->nn_param.pool.ksize[0] = 1; + node->nn_param.pool.ksize[1] = 1; + node->nn_param.pool.stride[0] = 1; + node->nn_param.pool.stride[1] = 1; + node->nn_param.pool.pad[0] = 0; + node->nn_param.pool.pad[1] = 0; + node->nn_param.pool.pad[2] = 0; + node->nn_param.pool.pad[3] = 0; + node->nn_param.pool.pad_type = VSI_NN_PAD_AUTO; + node->nn_param.pool.type = VX_CONVOLUTIONAL_NETWORK_POOLING_MAX; + node->nn_param.pool.round_type = VSI_NN_ROUND_CEIL; + node->vx_param.down_scale_size_rounding = VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_CEILING; +} /* _template_pool() */ + +static void _template_conv2d + ( vsi_nn_node_t * node ) +{ + node->nn_param.conv2d.ksize[0] = 1; + node->nn_param.conv2d.ksize[1] = 1; + node->nn_param.conv2d.weights = 1; + node->nn_param.conv2d.stride[0] = 1; + node->nn_param.conv2d.stride[1] = 1; + node->nn_param.conv2d.pad[0] = 0; + node->nn_param.conv2d.pad[1] = 0; + node->nn_param.conv2d.pad[2] = 0; + node->nn_param.conv2d.pad[3] = 0; + node->nn_param.conv2d.pad_type = VSI_NN_PAD_AUTO; + node->nn_param.conv2d.group = 1; + node->vx_param.has_relu = FALSE; +} /* _template_conv2d() */ + +static void _template_conv_relu + ( vsi_nn_node_t * node ) +{ + _template_conv2d( node ); + node->vx_param.has_relu = TRUE; +} /* _template_conv_relu() */ + +static void _template_conv_relu_pool + ( vsi_nn_node_t * node ) +{ + _template_conv_relu( node ); + _template_pool( node ); +} /* _template_conv_relu_pool() */ + +static void _template_fcl + ( vsi_nn_node_t * node ) +{ + node->nn_param.fcl.weights = 1; + node->vx_param.has_relu = FALSE; +} /* _template_fcl() */ + +static void _template_fcl_relu + ( vsi_nn_node_t* node ) +{ + _template_fcl( node ); + node->vx_param.has_relu = TRUE; +} /* _template_fcl_relu() */ + diff --git a/src/tim/vx/internal/src/vsi_nn_ops.c b/src/tim/vx/internal/src/vsi_nn_ops.c new file mode 100644 index 0000000..aba3f2c --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_ops.c @@ -0,0 +1,383 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* 
Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include "vsi_nn_ops.h" +#include "vsi_nn_client_op.h" +#include "vsi_nn_node.h" +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" + +#define DEF_OP(NAME, ...) extern vsi_nn_op_proc_t vsi_nn_op_##NAME; +#include "interface/ops.def" +#undef DEF_OP +#define DEF_OP(NAME, ...) &vsi_nn_op_##NAME, +static const vsi_nn_op_proc_t * vsi_nn_ops_tab[VSI_NN_OP_NUM] = +{ +#include "interface/ops.def" +}; +#undef DEF_OP + +/* Custom ops */ +#define DEF_OP(NAME, ...) extern vsi_nn_op_proc_t vsi_nn_op_##NAME; +#include "custom/custom_ops.def" +#undef DEF_OP +#define DEF_OP(NAME, ...) &vsi_nn_op_##NAME, +static const vsi_nn_op_proc_t * vsi_nn_custom_ops_tab[VSI_NN_OP_CUSTOM_NUM] = +{ +#include "custom/custom_ops.def" +}; +#undef DEF_OP + +/* Internal ops */ +#define DEF_OP(NAME, ...) extern vsi_nn_op_proc_t vsi_nn_op_##NAME; +#include "internal/internal_ops.def" +#undef DEF_OP +#define DEF_OP(NAME, ...) &vsi_nn_op_##NAME, +static const vsi_nn_op_proc_t * vsi_nn_internal_ops_tab[VSI_NN_OP_INTERNAL_NUM] = +{ +#include "internal/internal_ops.def" +}; +#undef DEF_OP + +// TODO: Add name item to op structure +#define DEF_OP(NAME, ...) ""#NAME, +static const char * vsi_nn_ops_name[] = +{ +#include "interface/ops.def" + "UNKNOWN" +}; +#undef DEF_OP + +#define DEF_OP(NAME, ...) ""#NAME, +static const char * vsi_nn_custom_ops_name[] = +{ +#include "custom/custom_ops.def" + "UNKNOWN" +}; +#undef DEF_OP + +// TODO: Add name item to internal op structure +#define DEF_OP(NAME, ...) 
""#NAME, +static const char * vsi_nn_internal_ops_name[] = +{ +#include "internal/internal_ops.def" + "UNKNOWN" +}; +#undef DEF_OP + +vsi_bool _is_custom_ops + ( + vsi_nn_op_t op + ) +{ + vsi_bool ret = FALSE; + if( op > VSI_NN_OP_CUSTOM_START && op < VSI_NN_OP_CUSTOM_END ) + { + ret = TRUE; + } + return ret; +} + +vsi_bool _is_internal_ops + ( + vsi_nn_op_t op + ) +{ + vsi_bool ret = FALSE; + if( op > VSI_NN_OP_INTERNAL_START && op < VSI_NN_OP_INTERNAL_END ) + { + ret = TRUE; + } + return ret; +} + +vsi_bool vsi_nn_OpIsValid + ( + vsi_nn_op_t op + ) +{ + vsi_bool valid; + valid = TRUE; + + if( (op < VSI_NN_OP_NUM ) || _is_internal_ops(op) ) + { + valid = TRUE; + } + else + { + if( _is_custom_ops(op) == FALSE ) + { + valid = vsi_nn_OpIsRegistered( op ); + } + } + return valid; +} /* vsi_nn_OpIsValid() */ + +const vsi_nn_op_proc_t * vsi_nn_OpGetProc + ( + vsi_nn_op_t op + ) +{ + const vsi_nn_op_proc_t * proc; + + /* Use client first */ + proc = vsi_nn_OpGetClient( op ); + if( NULL == proc && op < VSI_NN_OP_NUM ) + { + proc = vsi_nn_ops_tab[op]; + } + if( NULL == proc && _is_custom_ops(op) ) + { + proc = vsi_nn_custom_ops_tab[op - VSI_NN_OP_CUSTOM_START - 1]; + } + if( NULL == proc && _is_internal_ops(op) ) + { + proc = vsi_nn_internal_ops_tab[op - VSI_NN_OP_INTERNAL_START - 1]; + } + return proc; +} /* vsi_nn_OpGetProc() */ + +vsi_status vsi_nn_OpInit + ( + vsi_nn_op_t op, + vsi_nn_node_t * node + ) +{ + const vsi_nn_op_proc_t * proc; + vsi_status status; + + status = VSI_FAILURE; + proc = vsi_nn_OpGetProc( op ); + if( NULL != proc ) + { + status = VSI_SUCCESS; + if( NULL != proc->init ) + { + status = proc->init( node ); + } + } + return status; +} /* vsi_nn_OpInit() */ + +vsi_status vsi_nn_OpCompute + ( + vsi_nn_op_t op, + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + const vsi_nn_op_proc_t * proc; + vsi_status status; + + status = VSI_FAILURE; + proc = vsi_nn_OpGetProc( op ); + if( NULL != proc ) + { + if( NULL != proc->compute ) + { + status = proc->compute( node, inputs, outputs ); + } + else + { + VSILOGE("Do not support this platform"); + status = VSI_FAILURE; + } + } + return status; +} /* vsi_nn_op_compute() */ + +vsi_status vsi_nn_OpDeinit + ( + vsi_nn_op_t op, + vsi_nn_node_t * node + ) +{ + const vsi_nn_op_proc_t * proc; + vsi_status status; + + status = VSI_FAILURE; + proc = vsi_nn_OpGetProc( op ); + if( NULL != proc ) + { + status = VSI_SUCCESS; + if( proc->deinit ) + { + status = proc->deinit( node ); + } + } + return status; +} /* vsi_nn_op_deinit() */ + +vsi_status vsi_nn_OpOptimize + ( + vsi_nn_op_t op, + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + const vsi_nn_op_proc_t * proc; + vsi_status status; + + status = VSI_SUCCESS; + proc = vsi_nn_OpGetProc( op ); + if( NULL != proc && proc->optimize != NULL ) + { + status = proc->optimize( node, inputs, outputs, direction ); + if(status != VSI_SUCCESS) + { + VSILOGE("Optimize node %s fail", vsi_nn_OpGetName(node->op)); + } + } + return status; +} /* vsi_nn_OpOptimize() */ + +vsi_bool vsi_nn_OpCheck + ( + vsi_nn_op_t op, + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + const vsi_nn_op_proc_t * proc; + vsi_bool ret; + + ret = FALSE; + proc = vsi_nn_OpGetProc( op ); + if( NULL != proc ) + { + ret = TRUE; + if( proc->check ) + { + ret = proc->check( node, inputs, outputs ); + } + } + return ret; +} /* vsi_nn_op_check() */ + +void vsi_nn_OpGetIoNum + ( + vsi_nn_op_t op, 
+ vsi_nn_node_t * node, + uint32_t * input_num, + uint32_t * output_num + ) +{ + const vsi_nn_op_proc_t * proc; + proc = vsi_nn_OpGetProc( op ); + if( NULL != proc ) + { + if( NULL != input_num ) + { + *input_num = proc->input_num; + } + if( NULL != output_num ) + { + *output_num = proc->output_num; + } + } +} /* vsi_nn_OpGetIoNum() */ + +vsi_bool vsi_nn_OpGenerateTensor + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret; + ret = vsi_nn_OpSetup( node->op, node, inputs, outputs ); + return ret; +} /* vsi_nn_OpGenerateTensor() */ + +vsi_bool vsi_nn_OpSetup + ( + vsi_nn_op_t op, + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + const vsi_nn_op_proc_t * proc; + vsi_bool ret; + + ret = FALSE; + proc = vsi_nn_OpGetProc( op ); + if( NULL != proc ) + { + ret = proc->setup( node, inputs, outputs ); + } + return ret; +} /* vsi_nn_OpSetup() */ + +vsi_bool vsi_nn_OpRegisterOvxInit + ( + vsi_nn_op_t op, + vsi_nn_op_compute_t compute + ) +{ + const vsi_nn_op_proc_t * proc; + vsi_nn_op_proc_t tmp; + vsi_bool ret; + + ret = FALSE; + proc = vsi_nn_OpGetProc( op ); + + if( NULL != proc ) + { + memcpy( &tmp, proc, sizeof( vsi_nn_op_proc_t ) ); + tmp.compute = compute; + vsi_nn_OpRegisterClient( op, &tmp ); + } + return ret; +} /* vsi_nn_OpRegisterClientCompute() */ + +const char * vsi_nn_OpGetName + ( + vsi_nn_op_t op + ) +{ + const char * name; + if( op < VSI_NN_OP_NUM ) + { + name = vsi_nn_ops_name[op]; + } + else if(_is_custom_ops(op)) + { + name = vsi_nn_custom_ops_name[op - VSI_NN_OP_CUSTOM_START - 1]; + } + else if(_is_internal_ops(op)) + { + name = vsi_nn_internal_ops_name[op - VSI_NN_OP_INTERNAL_START - 1]; + } + else + { + name = vsi_nn_ops_name[VSI_NN_OP_NUM]; + } + return name; +} /* vsi_nn_GetOpName() */ + diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c new file mode 100644 index 0000000..51083b1 --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -0,0 +1,605 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
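
The dispatch tables in vsi_nn_ops.c above are built with an X-macro: `interface/ops.def`, `custom/custom_ops.def` and `internal/internal_ops.def` are each included several times with a different `DEF_OP` definition, once to extern-declare the `vsi_nn_op_proc_t` symbols, once to build the proc table, and once to build the parallel name table that `vsi_nn_OpGetName()` indexes. The toy program below reproduces the pattern in a self-contained form; the operator names and handlers are invented, and an inline macro list stands in for the real `.def` includes.

```c
/*
 * Toy illustration of the DEF_OP X-macro pattern used by vsi_nn_ops.c.
 * Not part of this commit: the op list and handlers are invented, and an
 * inline list replaces the real #include "interface/ops.def" files.
 */
#include <stdio.h>

#define TOY_OPS_DEF  \
    DEF_OP(ADD)      \
    DEF_OP(MULTIPLY) \
    DEF_OP(RELU)

typedef void (*toy_op_fn)(void);

static void toy_ADD(void)      { puts("run ADD"); }
static void toy_MULTIPLY(void) { puts("run MULTIPLY"); }
static void toy_RELU(void)     { puts("run RELU"); }

/* Expansion 1: handler table, analogous to vsi_nn_ops_tab[]. */
#define DEF_OP(NAME) toy_##NAME,
static const toy_op_fn toy_ops_tab[] = { TOY_OPS_DEF };
#undef DEF_OP

/* Expansion 2: printable names in the same order, analogous to
   vsi_nn_ops_name[], with a trailing "UNKNOWN" entry for bad ids. */
#define DEF_OP(NAME) #NAME,
static const char *toy_ops_name[] = { TOY_OPS_DEF "UNKNOWN" };
#undef DEF_OP

int main(void)
{
    size_t i;
    for (i = 0; i < sizeof(toy_ops_tab) / sizeof(toy_ops_tab[0]); i++)
    {
        printf("%s -> ", toy_ops_name[i]);
        toy_ops_tab[i]();
    }
    return 0;
}
```

Generating every table from the same `.def` list is what keeps `vsi_nn_OpGetProc()` and `vsi_nn_OpGetName()` index-aligned as new operators are added.
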
+* +*****************************************************************************/ +#include + +#include "vsi_nn_pre_post_process.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" + +static void _create_yuv_norm_tensors + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_attr_t* input_attr, + vsi_nn_preprocess_source_layout_e* source_layout, + vsi_nn_preprocess_source_format_e* source_format, + vsi_nn_tensor_id_t* yuv_tensors + ) +{ + int w = 0; + int h = 0; + vsi_nn_tensor_attr_t y_input_attr; + vsi_nn_tensor_attr_t uv_input_attr; + + if (*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) + { + w = input_attr->size[1]; + h = input_attr->size[2]; + } + else + { + w = input_attr->size[0]; + h = input_attr->size[1]; + } + /* Create y norm tensor */ + y_input_attr = *input_attr; + y_input_attr.size[0]= w; + y_input_attr.size[1]= h; + y_input_attr.size[2] = 1; + y_input_attr.size[3] = 1; + yuv_tensors[0] = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &y_input_attr, NULL ); + + /* Create uv norm tensor */ + if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420) + { + uv_input_attr = *input_attr; + uv_input_attr.size[0]= w/2; + uv_input_attr.size[1]= h/2; + uv_input_attr.size[2] = 1; + uv_input_attr.size[3] = 1; + + yuv_tensors[1] = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &uv_input_attr, NULL ); + yuv_tensors[2] = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &uv_input_attr, NULL ); + } + + else if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12) + { + uv_input_attr = *input_attr; + uv_input_attr.size[0]= w; + uv_input_attr.size[1]= h/2; + uv_input_attr.size[2] = 1; + uv_input_attr.size[3] = 1; + + yuv_tensors[1] = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &uv_input_attr, NULL ); + } + + else if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444) + { + uv_input_attr = *input_attr; + uv_input_attr.size[0]= w; + uv_input_attr.size[1]= h; + uv_input_attr.size[2] = 1; + uv_input_attr.size[3] = 1; + yuv_tensors[1] = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &uv_input_attr, NULL ); + yuv_tensors[2] = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &uv_input_attr, NULL ); + + } +} /* _create_yuv_norm_tensors() */ + +static vsi_status _set_preproc_node_type + ( + vsi_nn_node_t* node, + vsi_nn_preprocess_source_format_e* source_format + ) +{ + vsi_status status = VSI_SUCCESS; + if(source_format != NULL) + node->nn_param.pre_process.type = *source_format; + else + { + VSILOGE("Preprocess source format need to be set!"); + status = VSI_FAILURE; + } + return status; +} /* _set_preproc_node_type() */ + +static void _set_preproc_node_rect_params + ( + vsi_nn_node_t* node, + vsi_nn_preprocess_crop_t* crop, + vsi_nn_tensor_attr_t attr, + vsi_nn_preprocess_source_layout_e* source_layout + ) +{ + if(crop != NULL) + { + node->nn_param.pre_process.rect.left = crop->begin[0]; + node->nn_param.pre_process.rect.top = crop->begin[1]; + node->nn_param.pre_process.rect.width = crop->size[0]; + node->nn_param.pre_process.rect.height = crop->size[1]; + } + else + { + node->nn_param.pre_process.rect.left = 0; + node->nn_param.pre_process.rect.top = 0; + node->nn_param.pre_process.rect.width = attr.size[0]; + node->nn_param.pre_process.rect.height = attr.size[1]; + if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) + { + node->nn_param.pre_process.rect.width = attr.size[1]; + node->nn_param.pre_process.rect.height = attr.size[2]; + } + } +} /* _set_preproc_node_rect_params() */ + +static void _set_preproc_node_norm_params + ( + vsi_nn_node_t* node, + 
vsi_nn_preprocess_mean_and_scale_t* mean_and_scale, + vsi_nn_tensor_attr_t attr + ) +{ + int32_t i = 0; + if(mean_and_scale != NULL) + { + for(i = 0; i < mean_and_scale->channel_len; i++) + { + node->nn_param.pre_process.norm.mean[i] = mean_and_scale->channel_mean[i]; + } + node->nn_param.pre_process.norm.scale = mean_and_scale->scale; + } + else + { + for(i = 0; i < (int32_t)attr.dim_num - 1; i++) + { + node->nn_param.pre_process.norm.mean[i] = 0; + } + node->nn_param.pre_process.norm.scale = 1.0f; + } +} /* _set_preproc_node_norm_params() */ + +static void _set_preproc_node_out_attr + ( + vsi_nn_node_t* node, + vsi_nn_preprocess_image_resize_t* image_resize, + vsi_nn_tensor_t* org_norm_tensor, + vsi_nn_preprocess_source_layout_e* source_layout + ) +{ + node->nn_param.pre_process.dim_num = org_norm_tensor->attr.dim_num; + node->nn_param.pre_process.output_attr.dim_num = org_norm_tensor->attr.dim_num; + node->nn_param.pre_process.output_attr.size = org_norm_tensor->attr.size; + if(image_resize != NULL) + { + node->nn_param.pre_process.output_attr.size[0] = image_resize->w; + node->nn_param.pre_process.output_attr.size[1] = image_resize->h; + node->nn_param.pre_process.output_attr.size[2] = image_resize->c; + if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) + { + node->nn_param.pre_process.output_attr.size[0] = image_resize->c; + node->nn_param.pre_process.output_attr.size[1] = image_resize->w; + node->nn_param.pre_process.output_attr.size[2] = image_resize->h; + } + } +} /* _set_preproc_node_out_attr() */ + +static void _set_preproc_node_input_attr + ( + vsi_nn_tensor_attr_t* input_attr, + vsi_nn_tensor_t* org_norm_tensor, + vsi_nn_preprocess_image_size_t* input_size, + vsi_nn_preprocess_source_format_e* source_format, + vsi_nn_preprocess_source_layout_e* source_layout + ) +{ + *input_attr = org_norm_tensor->attr; + input_attr->dim_num = org_norm_tensor->attr.dim_num; + if(input_size != NULL) + { + input_attr->size[0] = input_size->w; + input_attr->size[1] = input_size->h; + input_attr->size[2] = input_size->c; + if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) + { + input_attr->size[0] = input_size->c; + input_attr->size[1] = input_size->w; + input_attr->size[2] = input_size->h; + } + } + if(*source_format == VSI_NN_SOURCE_FORMAT_TENSOR) + { + input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + input_attr->dtype.vx_type = VSI_NN_TYPE_FLOAT32; + } + else + { + input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + input_attr->dtype.vx_type = VSI_NN_TYPE_UINT8; + } + if(*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB) + { + if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) + { + input_attr->size[0] = input_attr->size[1]*input_attr->size[0]; + input_attr->size[1] = input_attr->size[2]; + input_attr->size[2] = 1; + } + else + { + input_attr->size[0] = input_attr->size[2]*input_attr->size[0]; + input_attr->size[2] = 1; + } + } +} /*_set_preproc_node_input_attr() */ + +static void _set_preproc_node_output_attr + ( + vsi_nn_tensor_attr_t* output_attr, + vsi_nn_tensor_t* org_norm_tensor, + vsi_nn_preprocess_dtype_convert_t* data_convert + ) +{ + *output_attr = org_norm_tensor->attr; + if(data_convert != NULL) + { + output_attr->dtype = data_convert->dtype; + } + output_attr->dtype.fmt = VSI_NN_DIM_FMT_NCHW; + output_attr->dim_num = VSI_NN_DIM_AUTO; + output_attr->is_const = FALSE; + output_attr->vtl = TRUE; +} /* _set_preproc_node_output_attr() */ + +static void _set_postproc_node_output_attr + ( + vsi_nn_tensor_attr_t* output_attr, + vsi_nn_tensor_t* org_norm_tensor, + vsi_nn_postprocess_permute_t* 
permute, + vsi_nn_postprocess_dtype_convert_t* dtype_convert + ) +{ + int32_t i = 0; + output_attr->dim_num = org_norm_tensor->attr.dim_num; + output_attr->is_const = FALSE; + output_attr->vtl = FALSE; + if(dtype_convert != NULL) + { + output_attr->dtype = dtype_convert->dtype; + } + else + { + output_attr->dtype = org_norm_tensor->attr.dtype; + } + if(permute != NULL) + { + for(i = 0; i < permute->dim; i++) + { + output_attr->size[i] = org_norm_tensor->attr.size[permute->perm[i]]; + } + } + else + { + for(i = 0; i < (int32_t)org_norm_tensor->attr.dim_num; i++) + { + output_attr->size[i] = org_norm_tensor->attr.size[i]; + } + } +} /* _set_postproc_node_output_attr() */ + +vsi_status vsi_nn_add_single_preproc_node + ( + vsi_nn_graph_t* graph, + uint32_t input_idx, + vsi_nn_node_t** first_node, + uint32_t nodes_count, + vsi_nn_preprocess_base_t* preprocess, + uint32_t proc_count + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_preprocess_source_format_e* source_format = NULL; + vsi_nn_preprocess_source_layout_e* source_layout = NULL; + vsi_nn_node_t* node = NULL; + vsi_nn_preprocess_image_size_t* input_size = NULL; + vsi_nn_preprocess_crop_t* crop = NULL; + vsi_nn_preprocess_mean_and_scale_t* mean_and_scale = NULL; + vsi_nn_preprocess_permute_t* permute = NULL; + vsi_nn_preprocess_image_resize_t* image_resize = NULL; + vsi_nn_preprocess_dtype_convert_t* data_convert = NULL; + vsi_nn_tensor_attr_t input_attr; + vsi_nn_tensor_attr_t output_attr; + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_id_t preproc_input; + vsi_nn_tensor_id_t preproc_output; + vsi_nn_tensor_id_t yuv_inputs[3]; + vsi_nn_tensor_t* org_norm_tensor = NULL; + uint32_t node_input_num = 1; + int32_t reverse_channel = 0; + uint32_t i = 0; + uint32_t j = 0; + uint32_t idx =0; + + org_norm_tensor = vsi_nn_GetTensor(graph,graph->input.tensors[input_idx]); + attr = org_norm_tensor->attr; + + /* Get preprocess configurations*/ + for(idx = 0; idx < proc_count; idx++) + { + if(preprocess[idx].type == VSI_NN_PREPROCESS_SOURCE_LAYOUT) + source_layout = (vsi_nn_preprocess_source_layout_e*)preprocess[idx].param; + + else if(preprocess[idx].type == VSI_NN_PREPROCESS_SET_SOURCE_FORMAT) + source_format = (vsi_nn_preprocess_source_format_e*)preprocess[idx].param; + + else if(preprocess[idx].type == VSI_NN_PREPROCESS_CROP) + crop = (vsi_nn_preprocess_crop_t*)preprocess[idx].param; + + else if(preprocess[idx].type == VSI_NN_PREPROCESS_MEAN_AND_SCALE) + mean_and_scale = (vsi_nn_process_mean_and_scale_t*)preprocess[idx].param; + + else if(preprocess[idx].type == VSI_NN_PREPROCESS_PERMUTE) + permute = (vsi_nn_process_permute_t*)preprocess[idx].param; + + else if(preprocess[idx].type == VSI_NN_PREPROCESS_IMAGE_RESIZE_BILINEAR|| + preprocess[idx].type == VSI_NN_PREPROCESS_IMAGE_RESIZE_NEAREST) + image_resize = (vsi_nn_preprocess_image_resize_t*)preprocess[idx].param; + + else if(preprocess[idx].type == VSI_NN_PREPROCESS_REVERSE_CHANNEL) + reverse_channel = *(uint8_t*)preprocess[idx].param; + + else if(preprocess[idx].type == VSI_NN_PREPROCESS_DTYPE_CONVERT) + data_convert = (vsi_nn_preprocess_dtype_convert_t*)preprocess[idx].param; + + else if(preprocess[idx].type == VSI_NN_PREPROCESS_IMAGE_SIZE) + input_size = (vsi_nn_preprocess_image_size_t*)preprocess[idx].param; + else + { + VSILOGE("preprocess[%d] type is not support, please have a check!", idx); + status = VSI_FAILURE; + TEST_CHECK_STATUS(status, final); + } + } + + if(source_layout == NULL) + { + VSILOGE("Preprocess source layout need to be set!"); + status = VSI_FAILURE; + 
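+        /* TEST_CHECK_STATUS() (declared in vsi_nn_test.h, included above) is
+         * assumed to jump to the "final" label whenever status != VSI_SUCCESS. */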
TEST_CHECK_STATUS(status, final); + } + + /* Add preprocess node */ + if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420) + { + node_input_num = 3; + } + else if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12) + { + node_input_num = 2; + } + else if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444) + { + node_input_num = 3; + } + + node = vsi_nn_AddNode(graph, VSI_NN_OP_PRE_PROCESS, node_input_num, 1, NULL); + node->uid = (uint32_t)(VSI_NN_PREPROC_NODE_UID_BASE) + input_idx; + + /* Set preprocess node parameters */ + status = _set_preproc_node_type(node, source_format); + TEST_CHECK_STATUS(status, final); + + _set_preproc_node_rect_params(node, crop, attr, source_layout); + _set_preproc_node_norm_params(node, mean_and_scale, attr); + + if(permute != NULL) + { + if((uint32_t)permute->dim != attr.dim_num) + { + VSILOGE("Preprocess permute dim dosen't match input dim"); + status = VSI_FAILURE; + TEST_CHECK_STATUS(status, final); + } + node->nn_param.pre_process.perm = (uint32_t*)permute->perm; + } + + if(reverse_channel) + node->nn_param.pre_process.reverse_channel = TRUE; + else + node->nn_param.pre_process.reverse_channel = FALSE; + + _set_preproc_node_out_attr(node, image_resize, org_norm_tensor, source_layout); + + /* Set input tensor attr */ + _set_preproc_node_input_attr(&input_attr, org_norm_tensor, input_size, source_format, source_layout); + + /* Set output tensor attr */ + _set_preproc_node_output_attr(&output_attr, org_norm_tensor, data_convert); + + /* Create new norm and virtual tensors */ + if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444) + { + _create_yuv_norm_tensors(graph, &input_attr, source_layout, source_format, yuv_inputs); + } + + preproc_input = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &input_attr, NULL); + preproc_output = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &output_attr, NULL); + + /* Reconnect node tensors */ + for(i = 0; i < nodes_count; i++) + { + for(j = 0; j < first_node[i]->input.num; j++) + { + if(first_node[i]->input.tensors[j] == graph->input.tensors[input_idx]) + { + first_node[i]->input.tensors[j] = preproc_output; + break; + } + } + } + + if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444) + { + for (i = 0; i < node_input_num; i++) + { + node->input.tensors[i] = yuv_inputs[i]; + graph->input.tensors[input_idx*node_input_num+i] = yuv_inputs[i]; + } + } + else + { + node->input.tensors[0] = preproc_input; + graph->input.tensors[input_idx] = preproc_input; + } + node->output.tensors[0] = preproc_output; + + status = VSI_SUCCESS; + +final: + return status; +} /* vsi_nn_add_single_preproc_node() */ + +vsi_status vsi_nn_add_single_postproc_node + ( + vsi_nn_graph_t* graph, + uint32_t output_idx, + vsi_nn_node_t* last_node, + vsi_nn_postprocess_base_t* postprocess, + uint32_t proc_count + ) +{ + vsi_nn_node_t* node; + vsi_nn_process_permute_t* permute = NULL; + vsi_nn_tensor_t* org_norm_tensor = NULL; + vsi_nn_tensor_attr_t input_attr; + vsi_nn_tensor_attr_t output_attr; + vsi_nn_tensor_id_t postproc_input; + vsi_nn_tensor_id_t postproc_output; + vsi_nn_postprocess_dtype_convert_t* dtype_convert = NULL; + int32_t i = 0; + int32_t idx = 0; + vsi_status status = VSI_SUCCESS; + + org_norm_tensor = vsi_nn_GetTensor(graph, graph->output.tensors[output_idx]); + + /*Create postprocess node*/ + node = 
vsi_nn_AddNode(graph, VSI_NN_OP_POST_PROCESS, 1, 1, NULL); + node->uid = (uint32_t)(VSI_NN_POSTPROC_NODE_UID_BASE) + output_idx; + + /* Get postprocess condigurations */ + for(idx = 0; idx < (int32_t)proc_count; idx++) + { + if(postprocess[idx].type == VSI_NN_POSTPROCESS_PERMUTE) + permute = (vsi_nn_process_permute_t*)postprocess[idx].param; + + if(postprocess[idx].type == VSI_NN_POSTPROCESS_DTYPE_CONVERT) + dtype_convert = (vsi_nn_postprocess_dtype_convert_t*)postprocess[idx].param; + } + + /* Set Postprocess node parameters */ + if(permute != NULL) + { + if((uint32_t)permute->dim != org_norm_tensor->attr.dim_num) + { + VSILOGE("Postprocess permute dim doesn't match output dim!"); + status = VSI_FAILURE; + TEST_CHECK_STATUS(status, final); + } + node->nn_param.post_process.perm = (uint32_t*)permute->perm; + } + node->nn_param.post_process.dim_num = org_norm_tensor->attr.dim_num; + + /* Set input tensor attr */ + input_attr = org_norm_tensor->attr; + input_attr.dim_num = VSI_NN_DIM_AUTO; + input_attr.is_const = FALSE; + input_attr.vtl = TRUE; + + /* Set output tensor attr */ + _set_postproc_node_output_attr(&output_attr, org_norm_tensor, permute, dtype_convert); + + /* Create new norm and virtual tensor */ + postproc_input = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &input_attr, NULL); + postproc_output = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &output_attr, NULL); + + /* Reconnect node tensors */ + node->input.tensors[0] = postproc_input; + node->output.tensors[0] = postproc_output; + for(i = 0; i < (int32_t)last_node->output.num; i++) + { + if(last_node->output.tensors[i] == graph->output.tensors[output_idx]) + { + last_node->output.tensors[i] = postproc_input; + break; + } + } + graph->output.tensors[output_idx] = postproc_output; + +final: + return status; +} /* vsi_nn_add_single_postproc_node() */ + +vsi_status vsi_nn_AddGraphPreProcess + ( + vsi_nn_graph_t* graph, + uint32_t input_idx, + vsi_nn_preprocess_base_t* preprocess, + uint32_t count + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_id_t input; + uint32_t nodes_count = 0; + vsi_nn_node_t** nodes = NULL; + + input = graph->input.tensors[input_idx]; + vsi_nn_get_tensor_consumers(graph, input, NULL, &nodes_count); + if(nodes_count != 0) + { + nodes = (vsi_nn_node_t**)malloc(sizeof(vsi_nn_node_t*)*nodes_count); + vsi_nn_get_tensor_consumers(graph, input, nodes, NULL); + status = vsi_nn_add_single_preproc_node(graph, input_idx, nodes, nodes_count, preprocess, count); + } + if(nodes) + { + free(nodes); + nodes = NULL; + } + return status; +} /* vsi_nn_AddGraphPreProcess() */ + +vsi_status vsi_nn_AddGraphPostProcess + ( + vsi_nn_graph_t* graph, + uint32_t output_idx, + vsi_nn_postprocess_base_t* postprocess, + uint32_t count + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_id_t output; + vsi_nn_node_t * node = NULL; + + output = graph->output.tensors[output_idx]; + vsi_nn_get_tensor_provider(graph, output, &node); + if(node != NULL) + { + status = vsi_nn_add_single_postproc_node(graph, output_idx, node, postprocess, count); + } + + return status; +} /* vsi_nn_AddGraphPostProcess() */ diff --git a/src/tim/vx/internal/src/vsi_nn_rnn.c b/src/tim/vx/internal/src/vsi_nn_rnn.c new file mode 100644 index 0000000..991817d --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_rnn.c @@ -0,0 +1,486 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this 
software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_rnn_prv.h" +#include "vsi_nn_internal_node.h" + +/********************************************************** +* MACROS +**********************************************************/ +#define RNN_WKSP(_GRAPH) ( (vsi_nn_rnn_wksp_t *)((_GRAPH)->rnn_wksp) ) + +/********************************************************** +* LOCAL FUNCTIONS +**********************************************************/ +static vsi_status internal_buffer_init + ( + vsi_nn_rnn_internal_buffer_t* buffer, + vsi_nn_tensor_t* tensor, + float default_value + ) +{ + vsi_status status = VSI_FAILURE; + uint32_t element_num = 0; + uint32_t i = 0; + uint32_t stride = 0; + uint32_t data_size = 0; + uint8_t* data = NULL; + + if( TRUE == tensor->attr.vtl ) + { + VSILOGE("Internal tensors cannot be dumpped."); + return status; + } + + if( NULL == buffer ) + { + VSILOGE("Internal buffer is NULL."); + return status; + } + + memcpy(&buffer->attr, &tensor->attr, sizeof(tensor->attr)); + data_size = vsi_nn_GetTensorSize( buffer->attr.size, buffer->attr.dim_num, buffer->attr.dtype.vx_type ); + element_num = vsi_nn_GetElementNum(tensor); + stride = vsi_nn_TypeGetBytes( tensor->attr.dtype.vx_type ); + + data = (uint8_t *)malloc(data_size); + if( NULL == buffer ) + { + VSILOGE("Out of memoery."); + goto error; + } + + /* init data with zero */ + for( i = 0; i < element_num; i++ ) + { + status = vsi_nn_Float32ToDtype(default_value, data + i * stride, &buffer->attr.dtype); + if( VSI_SUCCESS != status ) + { + VSILOGE("Convert default value to dtype fail"); + goto error; + } + } + + buffer->data = data; + buffer->data_size = data_size; + +error: + if( VSI_SUCCESS != status ) + { + vsi_nn_safe_free(data); + } + return status; +} /* internal_buffer_init() */ + +static vsi_status internal_buffer_deinit + ( + vsi_nn_rnn_internal_buffer_t* buffer + ) +{ + vsi_status status = VSI_FAILURE; + + if( NULL == buffer ) + { + VSILOGE("Internal buffer is NULL."); + return status; + } + + vsi_nn_safe_free( buffer->data ); + + return VSI_SUCCESS; +} /* internal_buffer_deinit() */ + +static vsi_status internal_buffer_copy_to_tensor + ( + const vsi_nn_graph_t* graph, + vsi_nn_rnn_internal_buffer_t* buffer, + vsi_nn_tensor_id_t tensorid + ) +{ + vsi_status status = VSI_FAILURE; + uint32_t request_data_size = 0; + 
vsi_nn_tensor_t* tensor = NULL; + + if( NULL == buffer ) + { + VSILOGE("Internal buffer is NULL.\n"); + return status; + } + + tensor = vsi_nn_GetTensor( graph, tensorid ); + request_data_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, tensor->attr.dtype.vx_type ); + if( request_data_size != buffer->data_size ) + { + VSILOGE("Internal buffer size error.\n"); + return status; + } + + status = vsi_nn_CopyDataToTensor( graph, tensor, buffer->data ); + + return status; +} /* internal_buffer_copy_to_tensor() */ + +static vsi_status internal_buffer_copy_from_tensor + ( + const vsi_nn_graph_t* graph, + vsi_nn_rnn_internal_buffer_t* buffer, + vsi_nn_tensor_id_t tensorid + ) +{ + vsi_status status = VSI_FAILURE; + uint32_t request_data_size = 0; + uint8_t* data = NULL; + vsi_nn_tensor_t* tensor = NULL; + + if( NULL == buffer ) + { + VSILOGE("Internal buffer is NULL.\n"); + return status; + } + + tensor = vsi_nn_GetTensor( graph, tensorid ); + request_data_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, tensor->attr.dtype.vx_type ); + if( request_data_size != buffer->data_size ) + { + VSILOGE("Internal buffer size error.\n"); + return status; + } + + data = vsi_nn_ConvertTensorToData( graph, tensor ); + if( buffer->data && data ) + { + memcpy( buffer->data, data, request_data_size ); + status = VSI_SUCCESS; + } + + vsi_nn_safe_free( data ); + + return status; +} /* internal_buffer_copy_from_tensor() */ + +static vsi_status _swap_rnn_tensor_handle + ( + const vsi_nn_graph_t* graph, + vsi_nn_tensor_id_t output_id, + vsi_nn_tensor_id_t input_id + ) +{ + vsi_nn_tensor_t* tensor_out = NULL; + vsi_nn_tensor_t* tensor_in = NULL; + + tensor_out = vsi_nn_GetTensor( graph, output_id ); + tensor_in = vsi_nn_GetTensor( graph, input_id ); + + return vsi_nn_SwapTensorHandle( tensor_out, tensor_in ); +} /* _swap_rnn_tensor_handle() */ + +/********************************************************** +* PUBLIC FUNCTIONS +**********************************************************/ +vsi_status vsi_nn_rnn_feed_internal_state + ( + const vsi_nn_graph_t* graph + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_rnn_connection_t * cur_conn = NULL; + uint32_t i = 0; + + /* copy previous data from internal buffer to related input tensors */ + if( NULL != graph && NULL != graph->rnn_wksp ) + { + /* don't copy/swap for first inference */ + if( RNN_WKSP(graph)->is_first_run ) + { + RNN_WKSP(graph)->is_first_run = FALSE; + } + else + { + cur_conn = RNN_WKSP(graph)->external_connection_list; + while( NULL != cur_conn && VSI_SUCCESS == status ) + { + if( cur_conn->tensor_swappable ) + { + status = _swap_rnn_tensor_handle( graph, cur_conn->connection.output, + cur_conn->connection.inputs[0] ); + if( VSI_SUCCESS != status ) + { + VSILOGE("Swap handle of RNN input/output fail."); + break; + } + } + else + { + for( i = 0; i < cur_conn->connection_inputs_count; i++ ) + { + vsi_nn_tensor_id_t input = cur_conn->connection.inputs[i]; + + status = internal_buffer_copy_to_tensor( graph, &cur_conn->buffer, input ); + if( VSI_SUCCESS != status ) + { + break; + } + } + } + cur_conn = (vsi_nn_rnn_connection_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)cur_conn ); + } + } + } + + return status; +} /* vsi_nn_rnn_feed_internal_state() */ + +vsi_status vsi_nn_rnn_save_internal_state + ( + const vsi_nn_graph_t* graph + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_rnn_connection_t * cur_conn = NULL; + + if( VSI_SUCCESS == status ) + { + /* copy tensors' data to internal buffer */ + if( NULL != 
graph->rnn_wksp ) + { + cur_conn = RNN_WKSP(graph)->external_connection_list; + while( NULL != cur_conn && VSI_SUCCESS == status ) + { + if( !cur_conn->tensor_swappable ) + { + status = internal_buffer_copy_from_tensor( graph, + &cur_conn->buffer, cur_conn->connection.output ); + } + cur_conn = (vsi_nn_rnn_connection_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)cur_conn ); + } + } + } + + return status; +} /* vsi_nn_rnn_save_internal_state() */ + +vsi_status vsi_nn_rnn_DeinitWksp + ( + vsi_nn_graph_t* graph + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_rnn_connection_t * cur_conn = NULL; + + if( NULL == graph ) + { + status = VSI_FAILURE; + return status; + } + + if( NULL == graph->rnn_wksp ) + { + return status; + } + + while( NULL != RNN_WKSP(graph)->external_connection_list ) + { + cur_conn = (vsi_nn_rnn_connection_t *)vsi_nn_LinkListPopStart( + (vsi_nn_link_list_t **)&RNN_WKSP(graph)->external_connection_list ); + internal_buffer_deinit( &cur_conn->buffer ); + vsi_nn_safe_free( cur_conn ); + } + + vsi_nn_safe_free( graph->rnn_wksp ); + + return status; +} /* vsi_nn_rnn_DeinitWksp() */ + +vsi_status vsi_nn_rnn_InitWksp + ( + vsi_nn_graph_t* graph, + const vsi_nn_rnn_external_connection_t* connections, + uint32_t connections_count, + void* user_data + ) +{ + vsi_status status = VSI_SUCCESS; + uint32_t i = 0; + uint32_t j = 0; + vsi_nn_rnn_connection_t* cur_conn = NULL; + vsi_nn_tensor_t* output_tensor = NULL; + vsi_nn_tensor_t* input_tensor = NULL; + + if( NULL == graph ) + { + status = VSI_FAILURE; + return status; + } + + vsi_nn_rnn_DeinitWksp( graph ); + + graph->rnn_wksp = malloc( sizeof( vsi_nn_rnn_wksp_t ) ); + if( NULL == graph->rnn_wksp ) + { + VSILOGE("Malloc memory for rnn_wksp fail, Out of memory."); + status = VSI_FAILURE; + return status; + } + + memset( graph->rnn_wksp, 0x00, sizeof( vsi_nn_rnn_wksp_t ) ); + RNN_WKSP(graph)->user_data = user_data; + RNN_WKSP(graph)->is_first_run = TRUE; + for( i = 0; i < connections_count; i++ ) + { + cur_conn = (vsi_nn_rnn_connection_t *)malloc( sizeof( vsi_nn_rnn_connection_t ) ); + if( NULL == cur_conn ) + { + VSILOGE("Malloc memory for connection fail, Out of memory."); + status = VSI_FAILURE; + break; + } + memset( cur_conn, 0x00, sizeof( vsi_nn_rnn_connection_t ) ); + memcpy( &cur_conn->connection, &connections[i], sizeof( connections[i] ) ); + + output_tensor = vsi_nn_GetTensor( graph, cur_conn->connection.output ); + for( j = 0; j < VSI_NN_MAX_RNN_CONNECTION_INPUTS; j++ ) + { + if( VSI_NN_TENSOR_ID_NA == cur_conn->connection.inputs[j] ) + { + break; + } + /* make sure input tensors have the same size and dtype with output tensor */ + input_tensor = vsi_nn_GetTensor( graph, cur_conn->connection.inputs[j] ); + if( output_tensor->attr.dim_num != input_tensor->attr.dim_num + || output_tensor->attr.dtype.vx_type != input_tensor->attr.dtype.vx_type + || 0 != memcmp(output_tensor->attr.size, input_tensor->attr.size, + output_tensor->attr.dim_num * sizeof(output_tensor->attr.size[0])) ) + { + VSILOGE("The tensors in connections must have the same size and dtype."); + status = VSI_FAILURE; + goto OnError; + } + } + + if( j == VSI_NN_MAX_RNN_CONNECTION_INPUTS ) + { + VSILOGE("The count of inputs is greater than maximum value: %d.", VSI_NN_MAX_RNN_CONNECTION_INPUTS); + status = VSI_FAILURE; + goto OnError; + } + else + { + cur_conn->connection_inputs_count = j; + } + + if( cur_conn->connection_inputs_count == 1 ) + { + input_tensor = vsi_nn_GetTensor( graph, cur_conn->connection.inputs[0] ); + if( output_tensor && 
output_tensor->attr.is_created_from_handle + && input_tensor && input_tensor->attr.is_created_from_handle ) + { + cur_conn->tensor_swappable = TRUE; + } + } + + if( !cur_conn->tensor_swappable ) + { + internal_buffer_init( &cur_conn->buffer, + vsi_nn_GetTensor( graph, cur_conn->connection.output ), 0.0f ); + } + + vsi_nn_LinkListPushEnd( + (vsi_nn_link_list_t **)&RNN_WKSP(graph)->external_connection_list, + (vsi_nn_link_list_t *)cur_conn ); + } + + return status; + +OnError: + vsi_nn_safe_free( cur_conn ); + return status; +} /* vsi_nn_rnn_InitWksp() */ + +vsi_status vsi_nn_rnn_ResetBuffers + ( + vsi_nn_graph_t* graph + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_rnn_connection_t * cur_conn = NULL; + + if( NULL == graph ) + { + status = VSI_FAILURE; + return status; + } + + if( NULL == graph->rnn_wksp ) + { + return status; + } + + if( NULL != graph->rnn_wksp ) + { + RNN_WKSP(graph)->is_first_run = TRUE; + cur_conn = RNN_WKSP(graph)->external_connection_list; + while( NULL != cur_conn && VSI_SUCCESS == status ) + { + if( !cur_conn->tensor_swappable ) + { + status = internal_buffer_deinit( &cur_conn->buffer ); + status = internal_buffer_init( &cur_conn->buffer, + vsi_nn_GetTensor( graph, cur_conn->connection.output ), 0.0f ); + } + + cur_conn = (vsi_nn_rnn_connection_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)cur_conn ); + } + } + + return status; +} /* vsi_nn_rnn_ResetBuffers() */ + +vsi_status vsi_nn_rnn_RunGraph + ( + vsi_nn_graph_t* graph + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_rnn_feed_internal_state( graph ); + + if( VSI_SUCCESS == status ) + { + status = vsi_nn_RunGraph( graph ); + } + + if( VSI_SUCCESS == status ) + { + status = vsi_nn_rnn_save_internal_state( graph ); + } + + return status; +} /* vsi_nn_rnn_RunGraph() */ diff --git a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c new file mode 100644 index 0000000..b12b5d4 --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c @@ -0,0 +1,1019 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include + +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_rnn_prv.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_rnn_helper.h" + +vsi_bool vsi_nn_rnn_find_best_kernel_size + ( + vsi_bool multi_batch, + uint32_t input_size, + uint32_t* p_kernel_h, + uint32_t* p_kernel_w + ) +{ + uint32_t kernel_h = 1; + uint32_t kernel_w = 1; + + if( multi_batch) + { + /* batch FC only be converted to 1x1 or 1xN conv */ + /* try 1xN */ + kernel_h = 7; + while( input_size % kernel_h != 0 ) + { + kernel_h--; + } + } + else + { + /* try NxN */ + if( !multi_batch ) + { + #if( !defined( _WIN32 ) ) + /* try NxN conv */ + kernel_h = 8; + while( input_size % (kernel_h * kernel_h) != 0 ) + { + kernel_h--; + } + #endif + } + + if( kernel_h > 1 ) + { + kernel_w = kernel_h; + } + else + { + /* Only 1x1 found, try 1xN */ + kernel_h = 7; + while( input_size % kernel_h != 0 ) + { + kernel_h--; + } + kernel_w = 1; + } + + } + + VSILOGD("Use kernel_h: %d, kernel_w: %d to convert FC", kernel_h, kernel_w); + if( p_kernel_h ) + { + *p_kernel_h = kernel_h; + } + + if( p_kernel_w ) + { + *p_kernel_w = kernel_w; + } + + return TRUE; +} + +vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_bool multi_batch, + uint32_t kernel_h, + uint32_t kernel_w, + int32_t use_virtual_tensor + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* tensor1 = NULL; + vsi_nn_internal_tensor_t* tensor2 = NULL; + uint32_t* reshape_in_size = NULL; + uint32_t* permute_in_perm = NULL; + vsi_nn_internal_node_t* tmp_inode = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); + tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 ); + reshape_in_size = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(uint32_t)); + + reshape_in_size[3] = input->attr.size[1]; + reshape_in_size[2] = input->attr.size[0] / (kernel_h * kernel_w); + reshape_in_size[1] = kernel_h; + reshape_in_size[0] = kernel_w; + + tmp_inode->node->nn_param.reshape.size = reshape_in_size; + tmp_inode->node->nn_param.reshape.dim_num = 4; + tmp_inode->inputs[0] = input; + tmp_inode->outputs[0] = tensor1->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + if( multi_batch ) + { + uint32_t reshape_size[4] = { 0 }; + uint32_t c = 0, h = 0; + vsi_nn_internal_tensor_t* tensor0 = NULL; + h = tensor1->t->attr.size[2]; + c = tensor1->t->attr.size[1]; + + reshape_size[2] = tensor1->t->attr.size[3]; + reshape_size[1] = -1; + reshape_size[0] = tensor1->t->attr.size[0]; + tensor0 = vsi_nn_rnn_create_reshape(self, tensor1->t, NULL, reshape_size, 3, use_virtual_tensor); + + tensor2 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 ); + permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, 3 * sizeof(uint32_t)); + permute_in_perm[0] = 2; + permute_in_perm[1] = 1; + permute_in_perm[2] = 0; + tmp_inode->node->nn_param.permute.perm = permute_in_perm; + tmp_inode->node->nn_param.permute.dim_num = 3; + tmp_inode->inputs[0] = tensor0->t; + tmp_inode->outputs[0] = tensor2->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + reshape_size[3] = 
tensor2->t->attr.size[2]; + reshape_size[2] = h; + reshape_size[1] = c; + reshape_size[0] = tensor2->t->attr.size[0]; + tensor0 = vsi_nn_rnn_create_reshape(self, tensor2->t, NULL, reshape_size, 4, use_virtual_tensor); + + tensor1 = tensor0; + } + + return tensor1; +} + +vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_bool multi_batch, + uint32_t kernel_h, + uint32_t kernel_w, + int32_t use_virtual_tensor + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* tensor1 = NULL; + vsi_nn_internal_tensor_t* tensor2 = NULL; + uint32_t* reshape_in_size = NULL; + uint32_t* permute_in_perm = NULL; + vsi_nn_internal_node_t* tmp_inode = NULL; + vsi_nn_tensor_t* tensor = input; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); + + if( multi_batch ) + { + uint32_t reshape_size[4] = { 0 }; + uint32_t c = 0, h = 0; + vsi_nn_internal_tensor_t* tensor0 = NULL; + h = tensor->attr.size[2]; + c = tensor->attr.size[1]; + + reshape_size[2] = tensor->attr.size[3]; + reshape_size[1] = -1; + reshape_size[0] = tensor->attr.size[0]; + tensor0 = vsi_nn_rnn_create_reshape(self, tensor, NULL, reshape_size, 3, use_virtual_tensor); + + tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 ); + permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, 3 * sizeof(uint32_t)); + + permute_in_perm[0] = 2; + permute_in_perm[1] = 1; + permute_in_perm[2] = 0; + + tmp_inode->node->nn_param.permute.perm = permute_in_perm; + tmp_inode->node->nn_param.permute.dim_num = 3; + tmp_inode->inputs[0] = tensor0->t; + tmp_inode->outputs[0] = tensor1->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + reshape_size[3] = tensor1->t->attr.size[2]; + reshape_size[2] = h; + reshape_size[1] = c; + reshape_size[0] = tensor1->t->attr.size[0]; + tensor0 = vsi_nn_rnn_create_reshape(self, tensor1->t, NULL, reshape_size, 4, use_virtual_tensor); + + tensor = tensor0->t; + } + + tensor2 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 ); + reshape_in_size = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(uint32_t)); + + reshape_in_size[1] = tensor->attr.size[3]; + reshape_in_size[0] = tensor->attr.size[2]; + + tmp_inode->node->nn_param.reshape.size = reshape_in_size; + tmp_inode->node->nn_param.reshape.dim_num = 2; + tmp_inode->inputs[0] = tensor; + tmp_inode->outputs[0] = tensor2->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + return tensor2; +} + +vsi_bool vsi_nn_rnn_process_output_for_nn_fc2 + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output, + vsi_bool multi_batch, + uint32_t kernel_h, + uint32_t kernel_w, + int32_t use_virtual_tensor + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* tensor1 = NULL; + uint32_t* reshape_in_size = NULL; + uint32_t* permute_in_perm = NULL; + vsi_nn_internal_node_t* tmp_inode = NULL; + vsi_nn_tensor_t* tensor = input; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); + + if( multi_batch ) + { + uint32_t reshape_size[4] = { 0 }; + uint32_t c = 0, h = 0; + vsi_nn_internal_tensor_t* tensor0 = NULL; + h = tensor->attr.size[2]; + c = tensor->attr.size[1]; + + reshape_size[2] = tensor->attr.size[3]; + reshape_size[1] = -1; + reshape_size[0] = 
tensor->attr.size[0]; + tensor0 = vsi_nn_rnn_create_reshape(self, tensor, NULL, reshape_size, 3, use_virtual_tensor); + + tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 ); + permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, 3 * sizeof(uint32_t)); + + permute_in_perm[0] = 2; + permute_in_perm[1] = 1; + permute_in_perm[2] = 0; + + tmp_inode->node->nn_param.permute.perm = permute_in_perm; + tmp_inode->node->nn_param.permute.dim_num = 3; + tmp_inode->inputs[0] = tensor0->t; + tmp_inode->outputs[0] = tensor1->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + reshape_size[3] = tensor1->t->attr.size[2]; + reshape_size[2] = h; + reshape_size[1] = c; + reshape_size[0] = tensor1->t->attr.size[0]; + tensor0 = vsi_nn_rnn_create_reshape(self, tensor1->t, NULL, reshape_size, 4, use_virtual_tensor); + + tensor = tensor0->t; + } + + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 ); + reshape_in_size = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(uint32_t)); + + reshape_in_size[1] = tensor->attr.size[3]; + reshape_in_size[0] = tensor->attr.size[2]; + + tmp_inode->node->nn_param.reshape.size = reshape_in_size; + tmp_inode->node->nn_param.reshape.dim_num = 2; + tmp_inode->inputs[0] = tensor; + tmp_inode->outputs[0] = output; + vsi_nn_internal_setup_node(self, tmp_inode); + + return TRUE; +} + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tp_fc + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias, + const vsi_nn_dtype_t* output_dtype, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* tensor = NULL; + vsi_nn_internal_tensor_t* tensor1 = NULL; + vsi_nn_internal_tensor_t* tensor2 = NULL; + vsi_nn_internal_node_t* tmp_inode = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + tensor = bias; + if( !bias ) + { + /* create zero bias for NN/TP */ + tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr); + tensor = tensor1->t; + } + vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); + tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_FCL, 0, 0 ); + tmp_inode->node->nn_param.fcl.axis = 0; + tmp_inode->node->nn_param.fcl.weights = weight->attr.size[1]; + + tmp_inode->inputs[0] = input; + tmp_inode->inputs[1] = weight; + tmp_inode->inputs[2] = tensor; + tmp_inode->outputs[0] = tensor2->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + return tensor2; +} + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias, + uint32_t kernel_h, + uint32_t kernel_w, + const vsi_nn_dtype_t* output_dtype, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* tensor = NULL; + vsi_nn_internal_tensor_t* tensor1 = NULL; + vsi_nn_internal_tensor_t* tensor2 = NULL; + vsi_nn_internal_tensor_t* reshaped_weight_tensor = NULL; + vsi_nn_internal_node_t* tmp_inode = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + tensor = bias; + if( !bias ) + { + /* create zero bias for NN/TP */ + tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr); + tensor = tensor1->t; + } + + vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); + tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + 
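+    /* Reshape the 2-D FC weight [input_size, num_units] into a 4-D convolution
+     * kernel [kernel_w, kernel_h, input_size / (kernel_h * kernel_w), num_units]
+     * (see vsi_nn_rnn_prepare_weight_for_nn_fc below) so the fully-connected
+     * layer can be issued as a CONV2D node. */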
reshaped_weight_tensor = vsi_nn_rnn_prepare_weight_for_nn_fc(self, weight, kernel_h, kernel_w); + + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + tmp_inode->node->nn_param.conv2d.ksize[0] = kernel_w; + tmp_inode->node->nn_param.conv2d.ksize[1] = kernel_h; + tmp_inode->node->nn_param.conv2d.stride[0] = 1; + tmp_inode->node->nn_param.conv2d.stride[1] = 1; + tmp_inode->node->nn_param.conv2d.pad[0] = 0; + tmp_inode->node->nn_param.conv2d.pad[1] = 0; + tmp_inode->node->nn_param.conv2d.pad[2] = 0; + tmp_inode->node->nn_param.conv2d.pad[3] = 0; + tmp_inode->node->nn_param.conv2d.group = 1; + tmp_inode->node->nn_param.conv2d.dilation[0] = 1; + tmp_inode->node->nn_param.conv2d.dilation[1] = 1; + tmp_inode->node->nn_param.conv2d.weights = weight->attr.size[1]; + + tmp_inode->inputs[0] = input; + tmp_inode->inputs[1] = reshaped_weight_tensor->t; + tmp_inode->inputs[2] = tensor; + tmp_inode->outputs[0] = tensor2->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + return tensor2; +} + +vsi_nn_internal_tensor_t* vsi_nn_rnn_prepare_weight_for_nn_fc + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * weight, + uint32_t kernel_h, + uint32_t kernel_w + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* reshaped_weight_tensor = NULL; + uint32_t reshaped_weight_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + + reshaped_weight_shape[3] = weight->attr.size[1]; + reshaped_weight_shape[2] = weight->attr.size[0] / ( kernel_h * kernel_w ); + reshaped_weight_shape[1] = kernel_h; + reshaped_weight_shape[0] = kernel_w; + + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = weight->attr.vtl; + attr.is_const = FALSE; + memcpy( &attr.dtype, &weight->attr.dtype, sizeof(attr.dtype)); + memcpy( &attr.size, &reshaped_weight_shape, sizeof(attr.size)); + reshaped_weight_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + vsi_nn_ReshapeTensor( self->graph, weight, reshaped_weight_tensor->t, reshaped_weight_shape, 4 ); + + reshaped_weight_tensor->t->attr.is_const = weight->attr.is_const; + if(reshaped_weight_tensor->t->attr.is_const) + { + vsi_nn_SetTensorAttr(reshaped_weight_tensor->t, VSI_NN_TENSOR_ATTR_CONST); + } + + return reshaped_weight_tensor; +} + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc_relu + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias, + uint32_t kernel_h, + uint32_t kernel_w, + vsi_bool has_relu, + const vsi_nn_dtype_t* output_dtype, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* tensor = NULL; + vsi_nn_internal_tensor_t* tensor1 = NULL; + vsi_nn_internal_tensor_t* tensor2 = NULL; + vsi_nn_internal_tensor_t* reshaped_weight_tensor = NULL; + vsi_nn_internal_node_t* tmp_inode = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + tensor = bias; + if( !bias ) + { + /* create zero bias for NN/TP */ + tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr); + tensor = tensor1->t; + } + + vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); + tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + reshaped_weight_tensor = vsi_nn_rnn_prepare_weight_for_nn_fc(self, weight, kernel_h, kernel_w); + + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV_RELU, 0, 0 ); + tmp_inode->node->nn_param.conv2d.ksize[0] = kernel_w; + tmp_inode->node->nn_param.conv2d.ksize[1] = kernel_h; + tmp_inode->node->nn_param.conv2d.stride[0] = 1; + 
tmp_inode->node->nn_param.conv2d.stride[1] = 1; + tmp_inode->node->nn_param.conv2d.pad[0] = 0; + tmp_inode->node->nn_param.conv2d.pad[1] = 0; + tmp_inode->node->nn_param.conv2d.pad[2] = 0; + tmp_inode->node->nn_param.conv2d.pad[3] = 0; + tmp_inode->node->nn_param.conv2d.group = 1; + tmp_inode->node->nn_param.conv2d.dilation[0] = 1; + tmp_inode->node->nn_param.conv2d.dilation[1] = 1; + tmp_inode->node->nn_param.conv2d.weights = weight->attr.size[1]; + tmp_inode->node->vx_param.overflow_policy = VX_CONVERT_POLICY_WRAP; + tmp_inode->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO; + tmp_inode->node->vx_param.has_relu = has_relu; + tmp_inode->node->vx_param.down_scale_size_rounding = + VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR; + + tmp_inode->inputs[0] = input; + tmp_inode->inputs[1] = reshaped_weight_tensor->t; + tmp_inode->inputs[2] = tensor; + tmp_inode->outputs[0] = tensor2->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + return tensor2; +} + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_add + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input1, + vsi_nn_tensor_t * input2, + const vsi_nn_dtype_t* output_dtype, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* tensor1 = NULL; + vsi_nn_internal_node_t* tmp_inode = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); + tensor1 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_ADD, 0, 0 ); + + tmp_inode->inputs[0] = input1; + tmp_inode->inputs[1] = input2; + tmp_inode->outputs[0] = tensor1->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + return tensor1; +} + +vsi_nn_op_t vsi_nn_rnn_get_act_op_type + ( + vsi_nn_activation_e type + ) +{ + switch (type) + { + case VSI_NN_ACT_RELU: + return VSI_NN_OP_RELU; + case VSI_NN_ACT_RELU6: + return VSI_NN_OP_RELU6; + case VSI_NN_ACT_TANH: + return VSI_NN_OP_TANH; + case VSI_NN_ACT_SIGMOID: + return VSI_NN_OP_SIGMOID; + case VSI_NN_ACT_HARD_SIGMOID: + return VSI_NN_OP_HARD_SIGMOID; + default: + VSILOGE("error activation type %d", type); + break; + } + + return VSI_NN_OP_TANH; +} + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_activation + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_activation_e act_type, + const vsi_nn_dtype_t* output_dtype, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* tensor1 = NULL; + vsi_nn_internal_node_t* tmp_inode = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); + tensor1 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + tmp_inode = vsi_nn_internal_new_node(self, vsi_nn_rnn_get_act_op_type(act_type), 0, 0 ); + + tmp_inode->inputs[0] = input; + tmp_inode->node->nn_param.tanh.scale_a = 1.0f; + tmp_inode->node->nn_param.tanh.scale_b = 1.0f; + tmp_inode->outputs[0] = tensor1->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + return tensor1; +} + +vsi_nn_internal_tensor_t* vsi_nn_rnn_transpose_time_major + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_tensor_attr_t attr; + uint32_t* permute_in_perm = NULL; + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_nn_internal_node_t* curr = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + if (output == NULL) + { + vsi_nn_internal_init_tensor_attr(&attr, + &input->attr.dtype, 
use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + } + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + permute_in_perm[0] = 0; + permute_in_perm[1] = 2; + permute_in_perm[2] = 1; + + curr->node->nn_param.permute.perm = permute_in_perm; + curr->node->nn_param.permute.dim_num = 3; + curr->inputs[0] = input; + + if (output == NULL) + { + curr->outputs[0] = output_tensor->t; + } + else + { + curr->outputs[0] = output; + } + vsi_nn_internal_setup_node(self, curr); + + return output_tensor; +} + +void vsi_nn_rnn_split_input_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t ** output, + uint32_t time_step, + vsi_bool use_virtual_tensor + ) +{ + uint32_t* slices = NULL; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + uint32_t i = 0; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, time_step ); + slices = (uint32_t *)vsi_nn_internal_new_node_param(curr, time_step * sizeof(uint32_t)); + curr->node->nn_param.split.axis = 2; /* timestep axis */ + curr->node->nn_param.split.slices_num = time_step; + curr->inputs[0] = input; + + curr->node->nn_param.split.slices = slices; + for( i = 0; i < time_step; i++ ) + { + slices[i] = 1; + vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + curr->outputs[i] = output_tensor->t; + output[i] = output_tensor->t; + } + vsi_nn_internal_setup_node( self, curr ); +} + +void vsi_nn_rnn_data_check_aligned + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** input, + uint32_t time_step, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + uint32_t i = 0; + uint32_t ofst = 0; + ofst = 0; + for( i = 0; i < time_step; i++ ) + { + uint32_t tensor_size = vsi_nn_GetTensorSize( input[i]->attr.size, + input[i]->attr.dim_num, input[i]->attr.dtype.vx_type ); + + if( ofst & 0x3f ) + { + vsi_nn_internal_init_tensor_attr(&attr, &input[i]->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = input[i]; + curr->outputs[0] = output_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + + input[i] = output_tensor->t; + } + + ofst += tensor_size; + } +} + +vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_split_output + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + uint32_t batch_size, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + uint32_t *reshape_split_size = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + /* reshape for split output */ + vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + reshape_split_size = (uint32_t *)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + reshape_split_size[0] = -1; + reshape_split_size[1] = batch_size; + + curr->node->nn_param.reshape.size = reshape_split_size; + 
curr->node->nn_param.reshape.dim_num = 2; + curr->inputs[0] = input; + curr->outputs[0] = output_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + + return output_tensor; +} + +vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_cell_output + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + uint32_t batch_size, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + uint32_t* reshape_grucell_output_size = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + /* reshape output to 3-dims */ + vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + reshape_grucell_output_size = (uint32_t *)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + reshape_grucell_output_size[0] = -1; + reshape_grucell_output_size[1] = batch_size; + reshape_grucell_output_size[2] = 1; + + curr->node->nn_param.reshape.size = reshape_grucell_output_size; + curr->node->nn_param.reshape.dim_num = 3; + curr->inputs[0] = input; + curr->outputs[0] = output_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + + return output_tensor; +} + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_binary_operator + ( + vsi_nn_node_t* self, + vsi_nn_op_t op, + vsi_nn_tensor_t* operand1, + vsi_nn_tensor_t* operand2, + const vsi_nn_dtype_t* output_dtype, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_nn_internal_node_t* tmp_inode = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + tmp_inode = vsi_nn_internal_new_node(self, op, 0, 0 ); + tmp_inode->node->nn_param.multiply.scale = 1.0f; + tmp_inode->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + tmp_inode->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; + tmp_inode->inputs[0] = operand1; + tmp_inode->inputs[1] = operand2; + tmp_inode->outputs[0] = output_tensor->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + return output_tensor; +} + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_concat_impl + ( + vsi_nn_node_t* self, + uint32_t axis, + vsi_bool use_virtual_tensor, + vsi_nn_tensor_t* tensor, + ... 
+ ) +{ + va_list args; + vsi_nn_tensor_t* next; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* tmp_tensor = NULL; + vsi_nn_internal_node_t* inode = NULL; + int tensor_count = 1; + + va_start(args, tensor); + + FOREACH_ARGS(args, next, vsi_nn_tensor_t*) + { + tensor_count++; + } + va_end(args); + + memset(&attr, 0x00, sizeof(attr)); + memcpy(&attr.dtype, &tensor->attr.dtype, sizeof(attr.dtype)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + inode = vsi_nn_internal_new_node(self, VSI_NN_OP_CONCAT, tensor_count, 1); + inode->inputs[0] = tensor; + tensor_count = 0; + va_start(args, tensor); + + FOREACH_ARGS(args, next, vsi_nn_tensor_t*) + { + inode->inputs[1 + tensor_count++] = next; + } + va_end(args); + inode->outputs[0] = tmp_tensor->t; + + vsi_nn_internal_setup_node(self, inode); + + return tmp_tensor; +} + +vsi_nn_internal_tensor_t** vsi_nn_create_split + ( + vsi_nn_node_t* self, + vsi_nn_tensor_t* tensor, + uint32_t axis, + uint32_t slices_num, + uint32_t* slices, + vsi_bool use_virtual_tensor + ) +{ + uint32_t i = 0; + uint32_t num_per_output = 0; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t** output_tensors = NULL; + + if(!slices_num) + { + VSILOGE("slices_num must be set!"); + return NULL; + } + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, slices_num ); + if(!slices) + { + slices = (uint32_t *)vsi_nn_internal_new_node_param(curr, slices_num * sizeof(uint32_t)); + num_per_output = tensor->attr.size[axis] / slices_num; + for( i = 0; i < slices_num; i++ ) + { + slices[i] = num_per_output; + } + } + output_tensors = (vsi_nn_internal_tensor_t**)vsi_nn_internal_new_node_param(curr, + slices_num * sizeof(vsi_nn_internal_tensor_t*)); + curr->node->nn_param.split.axis = axis; + curr->node->nn_param.split.slices_num = slices_num; + curr->node->nn_param.split.slices = slices; + curr->inputs[0] = tensor; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &tensor->attr.dtype, use_virtual_tensor); + for( i = 0; i < slices_num; i++ ) + { + output_tensors[i] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + curr->outputs[i] = output_tensors[i]->t; + } + vsi_nn_internal_setup_node( self, curr ); + + return output_tensors; +} + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_reshape + ( + vsi_nn_node_t* self, + vsi_nn_tensor_t* input_tensor, + vsi_nn_tensor_t* output_tensor, + uint32_t* size, + uint32_t dim_num, + vsi_bool use_virtual_tensor + ) +{ + + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* tensor0 = NULL; + uint32_t* reshape_in_size = NULL; + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 ); + reshape_in_size = (uint32_t *)vsi_nn_internal_new_node_param(curr, dim_num * sizeof(uint32_t)); + memcpy(reshape_in_size, size, dim_num * sizeof(uint32_t)); + curr->node->nn_param.reshape.size = reshape_in_size; + curr->node->nn_param.reshape.dim_num = dim_num; + curr->inputs[0] = input_tensor; + curr->outputs[0] = output_tensor; + + if(output_tensor) + { + curr->outputs[0] = output_tensor; + } + else + { + vsi_nn_tensor_attr_t attr; + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &input_tensor->attr.dtype, use_virtual_tensor); + tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + curr->outputs[0] = tensor0->t; + } + vsi_nn_internal_setup_node(self, curr); + + return tensor0; +} + 
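+/* Illustrative usage of vsi_nn_rnn_create_reshape() (example only, not part of
+ * the library), where "self" is the owning node and "input" is a 4-D
+ * vsi_nn_tensor_t* whose last dimension is the batch. Passing -1 lets the
+ * flattened dimension be inferred, and passing NULL as the output tensor makes
+ * the helper allocate and return a new internal tensor:
+ *
+ *     uint32_t shape[2] = { (uint32_t)-1, input->attr.size[3] };
+ *     vsi_nn_internal_tensor_t* flat =
+ *         vsi_nn_rnn_create_reshape(self, input, NULL, shape, 2, TRUE);
+ */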
+vsi_nn_internal_tensor_t* vsi_nn_rnn_create_permute + ( + vsi_nn_node_t* self, + vsi_nn_tensor_t* input_tensor, + vsi_nn_tensor_t* output_tensor, + uint32_t* perm, + uint32_t dim_num, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* tensor0 = NULL; + uint32_t* permute_in_perm = NULL; + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0); + permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(curr, + dim_num * sizeof(uint32_t)); + memcpy(permute_in_perm, perm, dim_num * sizeof(uint32_t)); + curr->node->nn_param.permute.perm = permute_in_perm; + curr->node->nn_param.permute.dim_num = dim_num; + curr->inputs[0] = input_tensor; + + if(output_tensor) + { + curr->outputs[0] = output_tensor; + } + else + { + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_init_tensor_attr(&attr, &input_tensor->attr.dtype, use_virtual_tensor); + tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + curr->outputs[0] = tensor0->t; + } + vsi_nn_internal_setup_node(self, curr); + + return tensor0; +} + +vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_copy + ( + vsi_nn_node_t* self, + vsi_nn_tensor_t* input_tensor, + vsi_nn_tensor_t* output_tensor, + vsi_nn_dtype_t* dtype, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* tensor0 = NULL; + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); + curr->inputs[0] = input_tensor; + if(!dtype) + { + dtype = &input_tensor->attr.dtype; + } + + if(output_tensor) + { + curr->outputs[0] = output_tensor; + } + else + { + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_init_tensor_attr(&attr, dtype, use_virtual_tensor); + tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + curr->outputs[0] = tensor0->t; + } + vsi_nn_internal_setup_node(self, curr); + + return tensor0; +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c new file mode 100644 index 0000000..d9d0158 --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -0,0 +1,2333 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include + +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_types.h" +#include "vsi_nn_test.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" +#include "utils/vsi_nn_tensor_op.h" + +static vsi_bool _try_set_const_tensor + ( + vsi_nn_tensor_t *tensor + ); + +static vsi_bool _auto_cal_shape + ( + uint32_t * input_shape, + uint32_t input_dim, + uint32_t * shape, + uint32_t * dim_num + ); + +static vsi_bool _init_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + uint8_t * data + ); + +static vsi_nn_tensor_t * _create_tensor + ( + vsi_nn_graph_t * graph, + uint8_t * data, + vsi_nn_tensor_attr_t * attr + ); + +static uint32_t get_tensor_elements_num + ( + const uint32_t * shape, + uint32_t dim_num, + vsi_nn_type_e type + ) +{ + uint32_t num; + uint32_t sz; + uint32_t dsize; + + sz = vsi_nn_GetTensorSize( shape, + dim_num, type ); + dsize = vsi_nn_GetTypeBytes( type ); + num = (uint32_t)(sz / dsize); + return num; +} /* get_tensor_elements_num() */ + +static void print_tensor + ( + vsi_nn_tensor_t *tensor, + vsi_nn_tensor_id_t id, + char *ext_str + ) +{ +#define _SHAPE_BUF_SZ (64) +#define _EXT_ATTR_BUF_SZ (64) +#define _ATTR_BUF_SZ (64) + int count; + char shape[_SHAPE_BUF_SZ] = { 0 }; + char ext_attr[_EXT_ATTR_BUF_SZ] = { 0 }; + char format[_ATTR_BUF_SZ] = {0}; + + if( !tensor ) + { + VSILOGD("%s None", ext_str); + return; + } + vsi_nn_ShapeToString( tensor->attr.size, tensor->attr.dim_num, + shape, _SHAPE_BUF_SZ, TRUE ); + vsi_nn_FormatToString( tensor, format, _SHAPE_BUF_SZ ); + + /* Process quantize parameters */ + switch( tensor->attr.dtype.qnt_type ) + { + case VSI_NN_QNT_TYPE_DFP: + count = snprintf( &ext_attr[0], _EXT_ATTR_BUF_SZ, + "DFP fl=%3d", tensor->attr.dtype.fl ); + ext_attr[count] = 0; + break; + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + count = snprintf( &ext_attr[0], _EXT_ATTR_BUF_SZ, + "ASM zp=%3d, scale=%.6f", + tensor->attr.dtype.zero_point, tensor->attr.dtype.scale ); + ext_attr[count] = 0; + break; +#ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: + count = snprintf( &ext_attr[0], _EXT_ATTR_BUF_SZ, + "SYM PERCHANNEL axis=%d, count=%d", + tensor->attr.dtype.channel_dim, tensor->attr.dtype.scale_dim ); + ext_attr[count] = 0; + break; +#endif + default: + strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ); + break; + } + + if(ext_str) + { + VSILOGD("%s id[%4u] vtl[%d] const[%d] shape[%-18s] fmt[%s] qnt[%s]", + ext_str, + id, + tensor->attr.vtl, + tensor->attr.is_const, + shape, + format, + ext_attr); + } + else + { + VSILOGD("id[%4u] vtl[%d] const[%d] shape[%-18s] fmt[%s] qnt[%s]", + id, + tensor->attr.vtl, + tensor->attr.is_const, + shape, + format, + ext_attr); + } +} + +static vsi_nn_tensor_rel_t *_init_tensor_rel_buffer + ( + vsi_nn_graph_t *graph, + uint32_t max_io + ) +{ + uint32_t i,tensor_num; + vsi_nn_tensor_rel_t *tensor_ref; + + tensor_num = graph->tensor_num; + tensor_ref = (vsi_nn_tensor_rel_t *)malloc(tensor_num * sizeof(vsi_nn_tensor_rel_t)); + if(NULL == tensor_ref) + { + return NULL; + } + memset(tensor_ref, 0, sizeof(vsi_nn_tensor_rel_t) * tensor_num); + + for(i = 0; i < tensor_num; i++) + { + tensor_ref[i].input.num = 0; + tensor_ref[i].output.num = 0; + tensor_ref[i].input.table = 
(vsi_nn_tensor_rel_table_t *)malloc( + max_io * sizeof(vsi_nn_tensor_rel_table_t)); + tensor_ref[i].output.table = (vsi_nn_tensor_rel_table_t *)malloc( + max_io * sizeof(vsi_nn_tensor_rel_table_t)); + if(NULL == tensor_ref[i].input.table || NULL == tensor_ref[i].output.table) + { + goto error; + } + memset(tensor_ref[i].input.table, 0, max_io * sizeof(vsi_nn_tensor_rel_table_t)); + memset(tensor_ref[i].output.table, 0, max_io * sizeof(vsi_nn_tensor_rel_table_t)); + } + + return tensor_ref; +error: + if(tensor_ref) + { + for(i = 0; i < tensor_num; i++) + { + if(tensor_ref[i].input.table) + { + free(tensor_ref[i].input.table); + tensor_ref[i].input.table = NULL; + } + if(tensor_ref[i].output.table) + { + free(tensor_ref[i].output.table); + tensor_ref[i].output.table = NULL; + } + } + free(tensor_ref); + tensor_ref = NULL; + } + return NULL; +} /* _init_tensor_rel_buffer() */ + +static vsi_bool _try_set_const_tensor + ( + vsi_nn_tensor_t *tensor + ) +{ + vsi_status status; + vsi_bool ret; + vsi_nn_vxtensor_attr_t attr; + + ret = TRUE; + status = VSI_SUCCESS; + if( TRUE == tensor->attr.is_const ) + { + attr = VSI_NN_TENSOR_ATTR_CONST; + status = vsi_nn_SetTensorAttr(tensor, attr); + } + if( VSI_FAILURE == status ) + { + ret = FALSE; + } + + return ret; +} /* _set_const_tensor() */ + +static vsi_bool _auto_cal_shape + ( + uint32_t * input_shape, + uint32_t input_dim, + uint32_t * shape, + uint32_t * dim_num + ) +{ + vsi_bool ret; + int32_t neg_idx; + uint32_t i; + uint32_t total_size; + + ret = TRUE; + neg_idx = -1; + total_size = vsi_nn_ShapeProduct( input_shape, input_dim ); + if (-1 == *dim_num) + { + *dim_num = 1; + shape[0] = total_size; + return ret; + } + + for( i = 0; i < *dim_num; i ++ ) + { + if( -1 != (int32_t)shape[i] ) + { + if (0 == shape[i]) + { + if (i >= input_dim) + { + VSILOGE( "Wrong shape '%d' ", (int32_t)shape[i] ); + ret = FALSE; + break; + } + shape[i] = input_shape[i]; + } + total_size /= shape[i]; + } + else if( -1 == neg_idx ) + { + neg_idx = i; + } + else + { + VSILOGE( "Wrong shape '%d' ", (int32_t)shape[i] ); + ret = FALSE; + break; + } + } + if( FALSE == ret ) + { + shape[neg_idx] = -1; + } + else if(neg_idx != -1) + { + shape[neg_idx] = total_size; + } + return ret; +} /* _auto_cal_shape() */ + +static vsi_bool _init_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + uint8_t * data + ) +{ + vsi_bool ret; + vx_tensor_create_params_t params; + float * scales = NULL; + int32_t * null_zp = NULL; + + ret = TRUE; + + memset( ¶ms, 0, sizeof( vx_tensor_create_params_t ) ); + params.num_of_dims = tensor->attr.dim_num; + params.sizes = tensor->attr.size; + params.data_format = (vsi_enum)tensor->attr.dtype.vx_type; + params.quant_format = (vsi_enum)tensor->attr.dtype.qnt_type; + switch( tensor->attr.dtype.qnt_type ) + { + case VSI_NN_QNT_TYPE_DFP: + params.quant_data.dfp.fixed_point_pos = (uint8_t)tensor->attr.dtype.fl; + break; + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + params.quant_data.affine.scale = tensor->attr.dtype.scale; + params.quant_data.affine.zeroPoint = (int32_t)tensor->attr.dtype.zero_point; + break; + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: +#ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT + // This is a hack that driver doesn't support const scales + scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim); + memcpy(scales, tensor->attr.dtype.scales, tensor->attr.dtype.scale_dim * sizeof(float)); + params.quant_data.affinePerChannel.channelDim = tensor->attr.dtype.channel_dim; + params.quant_data.affinePerChannel.scaleCount 
= tensor->attr.dtype.scale_dim; + params.quant_data.affinePerChannel.scales = scales; + params.quant_data.affinePerChannel.zeroPoint = NULL; + params.quant_data.affinePerChannel.zeroPointCount = 0; + // TODO: This is a hack since driver will access a NULL pointer and cause a crash. + // Remove me in the future. + { + null_zp = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.scale_dim); + memset(null_zp, 0, sizeof(int32_t) * tensor->attr.dtype.scale_dim); + params.quant_data.affinePerChannel.zeroPoint = null_zp; + } + break; +#else + VSILOGE( "can't support qnt_type VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC." ); +#endif + default: + break; + } + + if( NULL != tensor->t ) + { + vxReleaseTensor( &tensor->t ); + } + if( NULL != tensor->wb ) + { + vxReleaseWeightsBiasesParameter( &tensor->wb ); + } + + if( TRUE == tensor->attr.is_created_from_handle ) + { + vx_tensor_addressing addr; + uint32_t stride_size[VSI_NN_MAX_DIM_NUM]; + uint32_t buf_sz; + + buf_sz = vsi_nn_GetStrideSize( &tensor->attr, stride_size ); + if( buf_sz > 0 ) + { + uint32_t align_start_size = graph->handle_manager.align_start_size; + uint32_t align_block_size = graph->handle_manager.align_block_size; + if (data == NULL) + { + data = vsi_nn_MallocAlignedBuffer(buf_sz, align_start_size, + align_block_size); + tensor->attr.is_handle_malloc_by_ovxlib = TRUE; +#ifdef VX_CREATE_TENSOR_SUPPORT_PHYSICAL + tensor->attr.vsi_memory_type = VSI_MEMORY_TYPE_HOST; +#endif + } + else + { + tensor->attr.is_handle_malloc_by_ovxlib = FALSE; + if (!vsi_nn_IsBufferAligned(data, align_start_size)) + { + VSILOGE( "vsi_nn_IsBufferAligned is FALSE." ); + if( scales ) + { + free(scales); + } + if(null_zp) + { + free(null_zp); + null_zp = NULL; + } + return FALSE; + } + } + if( data ) + { + addr = vxCreateTensorAddressing(graph->ctx->c, + tensor->attr.size, stride_size, (uint8_t)tensor->attr.dim_num); +#ifdef VX_CREATE_TENSOR_SUPPORT_PHYSICAL +#ifdef VX_13_NN_COMPATIBLITY + tensor->t = vxCreateTensorFromHandle2(graph->ctx->c, + ¶ms, sizeof(vx_tensor_create_params_t), + addr, data, tensor->attr.vsi_memory_type); +#else + tensor->t = vxCreateTensorFromHandle(graph->ctx->c, + ¶ms, sizeof(vx_tensor_create_params_t), + addr, data, tensor->attr.vsi_memory_type); +#endif +#else +#ifdef VX_13_NN_COMPATIBLITY + tensor->t = vxCreateTensorFromHandle2(graph->ctx->c, + ¶ms, sizeof(vx_tensor_create_params_t), + addr, data, VX_MEMORY_TYPE_HOST); +#else + tensor->t = vxCreateTensorFromHandle(graph->ctx->c, + ¶ms, sizeof(vx_tensor_create_params_t), + addr, data, VX_MEMORY_TYPE_HOST); +#endif + +#endif + //memset(data, 0x5A, buf_sz); + vxReleaseTensorAddressing( &addr ); + vxFlushHandle( (vx_reference)tensor->t ); + } + } + } + else if( FALSE == tensor->attr.vtl ) + { + tensor->t = vxCreateTensor2( graph->ctx->c, + ¶ms, sizeof( vx_tensor_create_params_t ) ); + } + else + { + tensor->t = vxCreateVirtualTensor2( graph->g, + ¶ms, sizeof( vx_tensor_create_params_t ) ); + } + if( NULL == tensor->t ) + { + VSILOGE( "Create vx tensor fail." 
); + ret = FALSE; + } + + if( !tensor->attr.vtl && !tensor->attr.is_const ) + { + //norm tensor need to fill initial value + if( ( !tensor->attr.is_created_from_handle ) || tensor->attr.is_handle_malloc_by_ovxlib ) + { + vsi_nn_FillTensorWithValue( graph, tensor, 0.0f ); + if(tensor->attr.is_created_from_handle) + { + vxFlushHandle( (vx_reference)tensor->t ); + } + } + } + + ret = _try_set_const_tensor( tensor ); + + if( scales ) + { + free(scales); + } + if(null_zp) + { + free(null_zp); + null_zp = NULL; + } + return ret; +} /* _init_tensor() */ + +vsi_bool vsi_nn_TensorReinit + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor + ) +{ + vsi_bool ret; + ret = TRUE; + + if( NULL == graph || NULL == tensor ) + { + return FALSE; + } + if( tensor->attr.dim_num != VSI_NN_DIM_AUTO ) + { + ret = _init_tensor( graph, tensor, NULL ); + } + return ret; +} /* vsi_nn_TensorReinit() */ + +static vsi_nn_tensor_t * _create_tensor + ( + vsi_nn_graph_t * graph, + uint8_t * data, + vsi_nn_tensor_attr_t * attr + ) +{ + vsi_nn_tensor_t * tensor; + + tensor = NULL; + if( NULL == graph || NULL == graph->g || NULL == attr ) + { + return tensor; + } + + tensor = (vsi_nn_tensor_t *)malloc( sizeof( vsi_nn_tensor_t ) ); + //vsi_nn_UpdateTensorDims( attr ); + + if( NULL != tensor ) + { + memset( tensor, 0, sizeof( vsi_nn_tensor_t ) ); + memcpy( &tensor->attr, attr, sizeof( vsi_nn_tensor_attr_t ) ); + tensor->is_swapped = FALSE; + if( attr->dim_num != VSI_NN_DIM_AUTO ) + { + _init_tensor( graph, tensor, data); + if( NULL == tensor->t ) + { + VSILOGE( "Create vx tensor fail." ); + free( tensor ); + tensor = NULL; + } + } + } + return tensor; +} + +vsi_nn_tensor_t * vsi_nn_CreateTensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_attr_t * attr + ) +{ + attr->is_created_from_handle = FALSE; + return _create_tensor(graph, NULL, attr); +} /* vsi_nn_CreateTensor() */ + +vsi_nn_tensor_t * vsi_nn_CreateTensorFromHandle + ( + vsi_nn_graph_t * graph, + uint8_t * data, + vsi_nn_tensor_attr_t * attr + ) +{ + attr->is_created_from_handle = TRUE; +#ifdef VX_CREATE_TENSOR_SUPPORT_PHYSICAL + if(attr->vsi_memory_type == VSI_MEMORY_TYPE_NONE || attr->vsi_memory_type == 0) + { + attr->vsi_memory_type = VSI_MEMORY_TYPE_HOST; + } +#endif + return _create_tensor(graph, data, attr); +} /* vsi_nn_CreateTensorFromHandle() */ + +vsi_nn_tensor_t * vsi_nn_CreateTensorWithDefault + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_attr_t * attr, + float defualt_value + ) +{ + vsi_nn_tensor_t* t = vsi_nn_CreateTensor( graph, attr ); + if( t ) + { + uint32_t size = 0; + uint32_t stride[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint8_t* data = NULL; + + size = vsi_nn_GetStrideSize( &t->attr, stride ); + data = (uint8_t *)malloc( size ); + if( data ) + { + uint32_t i = 0; + uint32_t elements = size / stride[0]; + vsi_status status = VSI_SUCCESS; + + for( i = 0; i < elements; i ++ ) + { + status = vsi_nn_Float32ToDtype( defualt_value, &data[stride[0] * i], &t->attr.dtype ); + if( VSI_FAILURE == status ) + { + VSILOGE("Convert default_value to dtype fail"); + break; + } + } + + status = vsi_nn_CopyDataToTensor( graph, t, data ); + free( data ); + data = NULL; + if( VSI_FAILURE == status ) + { + VSILOGE("Copy data to tensor fail"); + } + } + } + + return t; +} /* vsi_nn_CreateTensorWithDefault() */ + +vsi_status vsi_nn_FillTensorWithValue + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + float value + ) +{ + vsi_status status = VSI_FAILURE; + + if( tensor ) + { + uint32_t size = 0; + uint32_t stride[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint8_t* data = NULL; + 
+ size = vsi_nn_GetStrideSize( &tensor->attr, stride ); + data = (uint8_t *)malloc( size ); + if( data ) + { + uint32_t i = 0; + uint32_t elements = size / stride[0]; + + for( i = 0; i < elements; i ++ ) + { + status = vsi_nn_Float32ToDtype( value, &data[stride[0] * i], &tensor->attr.dtype ); + if( VSI_FAILURE == status ) + { + VSILOGE("Convert value to dtype fail"); + break; + } + } + + status = vsi_nn_CopyDataToTensor( graph, tensor, data ); + free( data ); + data = NULL; + if( VSI_FAILURE == status ) + { + VSILOGE("Copy data to tensor fail"); + } + } + } + + return status; +} /* vsi_nn_FillTensorWithValue() */ + +void vsi_nn_ReleaseTensor + ( + vsi_nn_tensor_t ** tensor + ) +{ + vsi_nn_tensor_t * ptr; + ptr = *tensor; + if( NULL != tensor && NULL != *tensor ) + { + uint8_t * handle = NULL; + if( NULL != ptr->t ) + { + if (ptr->attr.is_created_from_handle && + ptr->attr.is_handle_malloc_by_ovxlib) + { + vxSwapTensorHandle( ptr->t, NULL, (void**)&handle); + if ( handle == NULL ) + { + VSILOGE("vxSwapTensorHandle fail."); + return; + } + } + vxReleaseTensor( &ptr->t ); + if (handle) vsi_nn_FreeAlignedBuffer(handle); + } + + if (ptr->wb) { + vxReleaseWeightsBiasesParameter(&ptr->wb); + } + + free( ptr ); + *tensor = NULL; + } +} /* vsi_nn_ReleaseTensor() */ + +vsi_status vsi_nn_SetTensorAttr + ( + vsi_nn_tensor_t * tensor, + const vsi_nn_vxtensor_attr_t attrs + ) +{ + vsi_status status; + + status = VSI_SUCCESS; + if( NULL == tensor ) + { + return VSI_FAILURE; + } + + if( VSI_SUCCESS == status && vsi_nn_hasattr( attrs, VSI_NN_TENSOR_ATTR_CONST ) ) + { + vx_enum data_lifetime; + if(tensor->attr.is_const == TRUE) + { + data_lifetime = VX_TENSOR_LIFE_TIME_STATIC; + } + else + { + data_lifetime = VX_TENSOR_LIFE_TIME_DYNAMIC; + } + status = vxSetTensorAttribute(tensor->t, + VX_TENSOR_LIFETIME, + &data_lifetime, + sizeof(vx_enum)); + } + if( VSI_SUCCESS == status && vsi_nn_hasattr( attrs, VSI_NN_TENSOR_ATTR_HIGH_PRECISION ) ) + { + vx_enum precision = VX_TENSOR_PRECISION_HIGH; + status = vxSetTensorAttribute(tensor->t, + VX_TENSOR_PRECISION, + &precision, + sizeof(vx_enum)); + } + + return status; +} + +vsi_status vsi_nn_QueryTensorAttr + ( + vsi_nn_tensor_t * tensor, + const vsi_nn_vxtensor_attr_t attrs + ) +{ + vsi_status status; + + status = VSI_SUCCESS; + if( NULL == tensor ) + { + return VSI_FAILURE; + } + + if( VSI_SUCCESS == status && vsi_nn_hasattr( attrs, VSI_NN_TENSOR_ATTR_DIM_NUM ) ) + { + status = vxQueryTensor( tensor->t, VX_TENSOR_NUM_OF_DIMS, + &tensor->attr.dim_num, sizeof( tensor->attr.dim_num ) ); + } + + if( VSI_SUCCESS == status && vsi_nn_hasattr( attrs, VSI_NN_TENSOR_ATTR_DTYPE ) ) + { + status = vxQueryTensor( tensor->t, VX_TENSOR_DATA_TYPE, + &tensor->attr.dtype.vx_type, sizeof( tensor->attr.dtype.vx_type ) ); + } + + if( VSI_SUCCESS == status && vsi_nn_hasattr( attrs, VSI_NN_TENSOR_ATTR_SIZE ) ) + { + status = vxQueryTensor( tensor->t, VX_TENSOR_DIMS, + &tensor->attr.size, sizeof( tensor->attr.size ) ); + } + + if( VSI_SUCCESS == status && vsi_nn_hasattr( attrs, VSI_NN_TENSOR_ATTR_FIXED_POINT_POS ) ) + { + status = vxQueryTensor( tensor->t, VX_TENSOR_FIXED_POINT_POS, + &tensor->attr.dtype.fl, sizeof( tensor->attr.dtype.fl ) ); + } + + return status; +} /* vsi_nn_QueryTensorAttr() */ + +uint32_t vsi_nn_CopyTensorToBuffer + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + void * buffer + ) +{ + uint32_t sz; + uint32_t stride_size[VSI_NN_MAX_DIM_NUM]; + vsi_status status; + if( NULL == tensor || NULL == buffer ) + { + return 0; + } + sz = 0; + status = VSI_FAILURE; + 
+ status = vsi_nn_copy_tensor_patch(tensor->t, &tensor->attr, buffer, VX_READ_ONLY); + if(VSI_SUCCESS == status) + { + sz = vsi_nn_GetStrideSize( &tensor->attr, stride_size ); + } + return sz; +} /* vsi_nn_CopyTensorToData() */ + +float * vsi_nn_ConvertTensorToFloat32Data + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_t *tensor + ) +{ + vsi_status status; + uint8_t *tensor_data = NULL; + uint32_t elements; + uint32_t i,stride; + float *data; + + if(NULL == graph || NULL == tensor) + { + return NULL; + } + + elements = vsi_nn_GetElementNum(tensor); + stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type); + + data = NULL; + data = (float *)malloc(elements * sizeof(float)); + + if( tensor->attr.is_created_from_handle ) + { + vxSwapTensorHandle(tensor->t, NULL, (void**)&tensor_data); + if ( tensor_data == NULL ) + { + VSILOGE("vxSwapTensorHandle fail."); + if( data ) + { + free( data ); + data = NULL; + } + return NULL; + } + } + else + { + tensor_data = vsi_nn_ConvertTensorToData(graph, tensor); + } + for(i = 0; i < elements; i++) + { + status = dtype_to_float32(&tensor_data[stride * i], &data[i], &tensor->attr.dtype); + if(status != VSI_SUCCESS) + { + free(data); + data = NULL; + break; + } + } + + if( !tensor->attr.is_created_from_handle ) + { + if(tensor_data)free(tensor_data); + } + return data; +} /* vsi_nn_ConvertTensorToFloat32Data() */ + +uint8_t * vsi_nn_ConvertTensorToData + ( + const vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor + ) +{ + uint8_t * data; + uint32_t buf_sz; + uint32_t stride_size[VSI_NN_MAX_DIM_NUM]; + vsi_status status; + if( NULL == tensor ) + { + return NULL; + } + + status = VSI_FAILURE; + data = NULL; + + buf_sz = vsi_nn_GetStrideSize( &tensor->attr, stride_size ); + // TODO: Fix this to use copy tensor to buffer + if( buf_sz > 0 ) + { + data = (uint8_t *)malloc( buf_sz ); + } + if( data && tensor->attr.is_created_from_handle ) + { + uint8_t* tensor_data = NULL; + vxSwapTensorHandle( tensor->t, NULL, (void **)&tensor_data ); + if ( tensor_data == NULL ) + { + VSILOGE("vxSwapTensorHandle fail."); + if( data ) + { + free( data ); + data = NULL; + } + return NULL; + } + memcpy( data, tensor_data, buf_sz); + } + else + { + if( NULL != data ) + { + status = vsi_nn_copy_tensor_patch(tensor->t, &tensor->attr, data, VX_READ_ONLY); + } + if(VSI_SUCCESS != status) + { + VSILOGE("Read tensor data fail"); + free(data); + data = NULL; + } + } + return data; + +} /* vsi_nn_ConvertTensorToData() */ + +/* +* Deprecated: Use vsi_nn_ConvertRawTensorToData2() instead +* WARNING: This is a bad API, +* please add a new API for WRITE_ONLY accessor. 
+*/ +uint8_t * vsi_nn_ConvertRawTensorToData + ( + vx_context context, + vx_tensor tensor, + uint32_t * dim, + vx_enum * data_format, + uint32_t * size, + uint32_t * stride_size, + vx_tensor_addressing * addr, + vx_enum accessor + ) +{ + uint8_t * data; + uint32_t buf_sz; + vsi_status status; + vsi_nn_tensor_attr_t attr; + if( NULL == tensor || NULL == context ) + { + return NULL; + } + + status = VSI_FAILURE; + data = NULL; + + status = vxQueryTensor(tensor, VX_TENSOR_NUM_OF_DIMS, dim, sizeof(uint32_t)); + status = vxQueryTensor(tensor, VX_TENSOR_DIMS, size, sizeof(uint32_t) * (*dim)); + status = vxQueryTensor(tensor, VX_TENSOR_DATA_TYPE, data_format, sizeof(vsi_enum)); + attr.dim_num = *dim; + memcpy(attr.size, size, sizeof(uint32_t) * attr.dim_num); + + buf_sz = vsi_nn_GetStrideSizeBySize(size, *dim, *data_format, stride_size); + // TODO: Fix this to use copy tensor to buffer + if( buf_sz > 0 ) + { + data = (uint8_t *)malloc( buf_sz ); + } + if( NULL != data ) + { + if (accessor != VX_READ_ONLY) + { + return data; + } + status = vsi_nn_copy_tensor_patch(tensor, &attr, data, VX_READ_ONLY); + if( VSI_SUCCESS != status ) + { + VSILOGE("Read tensor data fail"); + free(data); + data = NULL; + } + } + return data; +} /* vsi_nn_ConvertRawTensorToData() */ + +/* +* WARNING: This is a bad API, +* please add the new APIs for WRITE_ONLY and READ_ONLY. +* Then deprecate this function. +*/ +uint8_t * vsi_nn_ConvertRawTensorToData2 + ( + vx_context context, + vx_tensor tensor, + vsi_nn_tensor_attr_t * attr, + uint32_t * stride_size, + vx_tensor_addressing * addr, + vx_enum accessor + ) +{ + uint8_t * data; + uint32_t buf_sz; + vsi_status status; + + if( NULL == tensor || NULL == context ) + { + return NULL; + } + + status = VSI_FAILURE; + data = NULL; + + status = vxQueryTensor(tensor, VX_TENSOR_NUM_OF_DIMS, + &(attr->dim_num), sizeof(uint32_t)); + status = vxQueryTensor(tensor, VX_TENSOR_DIMS, + attr->size, sizeof(uint32_t) * (attr->dim_num)); + status = vxQueryTensor(tensor, VX_TENSOR_DATA_TYPE, + &(attr->dtype.vx_type), sizeof(vsi_enum)); + status = vxQueryTensor(tensor, VX_TENSOR_QUANT_FORMAT, + &(attr->dtype.qnt_type), sizeof(uint32_t)); + switch( attr->dtype.qnt_type ) + { + case VSI_NN_QNT_TYPE_DFP: + status = vxQueryTensor(tensor, VX_TENSOR_FIXED_POINT_POS, + &(attr->dtype.fl), sizeof(int8_t)); + break; + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + status = vxQueryTensor(tensor, VX_TENSOR_ZERO_POINT, + &(attr->dtype.zero_point), sizeof(int32_t)); + status = vxQueryTensor(tensor, VX_TENSOR_SCALE, + &(attr->dtype.scale), sizeof(float)); + break; + default: + break; + } + + buf_sz = vsi_nn_GetStrideSize( attr, stride_size ); + // TODO: Fix this to use copy tensor to buffer + if( buf_sz > 0 ) + { + data = (uint8_t *)malloc( buf_sz ); + } + if( NULL != data ) + { + if (accessor != VX_READ_ONLY) + { + return data; + } + status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_READ_ONLY); + if( VSI_SUCCESS != status ) + { + VSILOGE("Read tensor data fail"); + free(data); + data = NULL; + } + } + return data; +} /* vsi_nn_ConvertRawTensorToData2() */ + +void vsi_nn_SaveTensorToTextByFp32 + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + const char * filename, + char * seperator + ) +{ +#define _TENSOR_TMPBUF_SZ (512) + const float c_flush_th = 0.7f; + uint8_t * data; + uint8_t * ptr; + uint32_t type_bytes; + uint8_t buf[_TENSOR_TMPBUF_SZ]; + FILE * fp; + float write_data; + uint32_t sz; + uint32_t i; + uint32_t count; + + if( NULL == graph || NULL == tensor || NULL == filename ) + { + return; 
+ } + if( NULL == seperator ) + { + seperator = "\n"; + } + + data = vsi_nn_ConvertTensorToData( graph, tensor ); + if( NULL == data ) + { + VSILOGE( "Convert data fail." ); + return; + } + + fp = fopen( filename, "w" ); + sz = vsi_nn_GetElementNum( tensor ); + + ptr = data; + type_bytes = vsi_nn_TypeGetBytes( tensor->attr.dtype.vx_type ); + count = 0; + for( i = 0; i < sz; i ++ ) + { + vsi_nn_DtypeToFloat32( ptr, &write_data, &tensor->attr.dtype ); + ptr += type_bytes; + + count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, + "%f%s", write_data, seperator ); + if( ((float)count / _TENSOR_TMPBUF_SZ) > c_flush_th ) + { + fwrite( buf, count, 1, fp ); + count = 0; + } + } + fwrite( buf, count, 1, fp ); + fclose( fp ); + free( data ); +} /* vsi_nn_SaveTensorToTextByFp32() */ + +void vsi_nn_SaveTensorToText + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + const char * filename, + char * seperator + ) +{ + uint8_t * data; + uint32_t sz; + + if( NULL == graph || NULL == tensor || NULL == filename ) + { + return; + } + + data = vsi_nn_ConvertTensorToData( graph, tensor ); + if( NULL == data ) + { + VSILOGE( "Convert data fail." ); + return; + } + + sz = vsi_nn_GetElementNum( tensor ); + vsi_nn_SaveDataToText( filename, data, sz, + tensor->attr.dtype.vx_type, seperator ); + free( data ); +} /* vsi_nn_SaveTensorToText() */ + +void vsi_nn_SaveDataToText + ( + const char * filename, + uint8_t * data, + uint32_t data_size, + vsi_nn_type_e type, + char * seperator + ) +{ +#define _TENSOR_TMPBUF_SZ (512) + const float c_flush_th = 0.7f; + uint8_t buf[_TENSOR_TMPBUF_SZ]; + FILE * fp; + float write_data; + uint32_t type_bytes; + uint32_t i; + uint32_t count; + + if( NULL == filename ) + { + return; + } + if( NULL == seperator ) + { + seperator = "\n"; + } + + if( NULL == data ) + { + return; + } + + fp = fopen( filename, "w" ); + type_bytes = vsi_nn_GetTypeBytes( type ); + + count = 0; + for( i = 0; i < data_size; i ++ ) + { + write_data = vsi_nn_DataAsFloat32( &data[type_bytes * i], + type ); + if( type == VSI_NN_TYPE_UINT8 || type == VSI_NN_TYPE_INT8 ) + { + count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, + "%d%s", (int32_t)write_data, seperator ); + } + else + { + count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, + "%f%s", write_data, seperator ); + } + if( ((float) count / _TENSOR_TMPBUF_SZ ) > c_flush_th ) + { + fwrite( buf, count, 1, fp ); + count = 0; + } + } + fwrite( buf, count, 1, fp ); + fclose( fp ); +} /* vsi_nn_SaveDataToText() */ + +void vsi_nn_SaveTensorToBinary + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + const char * filename + ) +{ + uint8_t * data; + FILE * fp; + uint32_t sz; + uint32_t i; + + if( NULL == graph || NULL == tensor || NULL == filename ) + { + return; + } + + data = vsi_nn_ConvertTensorToData( graph, tensor ); + if( NULL == data ) + { + VSILOGE( "Convert data fail." 
); + return; + } + + fp = fopen( filename, "wb" ); + sz = vsi_nn_GetTypeBytes( tensor->attr.dtype.vx_type ); + for( i = 0; i < tensor->attr.dim_num; i ++ ) + { + sz *= tensor->attr.size[i]; + } + fwrite( data, sz, 1, fp ); + fclose( fp ); + free( data ); +} /* vsi_nn_SaveTensorToBinary() */ + +vsi_nn_tensor_t * vsi_nn_CreateTensorFromData + ( + vsi_nn_graph_t * graph, + uint8_t * data, + vsi_nn_tensor_attr_t * attr + ) +{ + vsi_status status; + vsi_nn_tensor_t * tensor; + + status = VSI_FAILURE; + tensor = NULL; + + if( NULL == graph || NULL == data || NULL == attr ) + { + return NULL; + } + + tensor = vsi_nn_CreateTensor( graph, attr ); + + status = vsi_nn_CopyDataToTensor( graph, tensor, data ); + + if( VSI_SUCCESS != status ) + { + VSILOGE("Create tensor from data fail."); + if( NULL != tensor ) + { + vsi_nn_ReleaseTensor( &tensor ); + } + } + return tensor; +} /* vsi_nn_CreateTensorFromData() */ + +vsi_status vsi_nn_CopyDataToTensor + ( + const vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + void * data + ) +{ + vsi_status status = VSI_FAILURE; + if( NULL == data || NULL == tensor ) + { + return status; + } + + if( tensor->attr.is_created_from_handle ) + { + uint8_t* ptr = NULL; + vxSwapTensorHandle( tensor->t, NULL, (void **)&ptr); + if ( ptr == NULL ) + { + VSILOGE("vxSwapTensorHandle fail."); + return VSI_FAILURE; + } + memcpy( ptr, data, vsi_nn_GetTensorSize(tensor->attr.size, tensor->attr.dim_num, + tensor->attr.dtype.vx_type)); + status = vxSwapTensorHandle( tensor->t, ptr, NULL ); + status |= vxFlushHandle( (vx_reference)tensor->t ); + } + else + { + status = vsi_nn_copy_tensor_patch(tensor->t, &tensor->attr, data, VX_WRITE_ONLY); + } + return status; +} /* vsi_nn_CopyDataToTensor() */ + +vsi_status vsi_nn_FlushHandle + ( + const vsi_nn_tensor_t * tensor + ) +{ + if ( NULL == tensor || NULL == tensor->t ) + { + return VSI_FAILURE; + } + else + { + return vxFlushHandle( (vx_reference)tensor->t ); + } +} /* vsi_nn_FlushHandle() */ + +vsi_status vsi_nn_GetTensorHandle + ( + vsi_nn_tensor_t * tensor, + void** ptr + ) +{ + if ( NULL == tensor || NULL == tensor->t ) + { + return VSI_FAILURE; + } + else + { + return vxSwapTensorHandle(tensor->t, NULL, ptr); + } +} /* vsi_nn_GetTensorHandle() */ + +vsi_status vsi_nn_CopyRawDataToTensor + ( + vsi_nn_graph_t* graph, + uint8_t* src_data, + const vsi_nn_dtype_t* src_dtype, + vsi_nn_tensor_t* tensor + ) +{ + vsi_status status = VSI_FAILURE; + uint32_t src_data_sz = 0; + uint8_t* buffer = NULL; + uint32_t target_tensor_size = 0; /* in bytes */ + + src_data_sz = vsi_nn_GetElementNum(tensor) * vsi_nn_GetTypeBytes(src_dtype->vx_type); + target_tensor_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, tensor->attr.dtype.vx_type ); + buffer = (uint8_t *)malloc(target_tensor_size); + + vsi_nn_DtypeConvertRawData(src_data, src_data_sz, src_dtype, buffer, target_tensor_size, &tensor->attr.dtype); + status = vsi_nn_CopyDataToTensor(graph, tensor, buffer); + + if( NULL != buffer ) + { + free( buffer ); + buffer = NULL; + } + return status; +} /* vsi_nn_CopyRawDataToTensor */ + +vsi_bool vsi_nn_CalcReshapeTensor + ( + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output, + uint32_t * shape, + uint32_t dim_num + ) +{ + vsi_bool ret; + uint32_t i; + uint32_t total_size; + uint32_t dst_size; + + if( NULL == input || NULL == output + || NULL == shape || 0 == dim_num ) + { + VSILOGE( "Wrong reshape parameters." 
); + return FALSE; + } + + ret = _auto_cal_shape( input->attr.size, input->attr.dim_num, shape, &dim_num ); + if( FALSE == ret ) + { + return ret; + } + + /* Check total size */ + total_size = vsi_nn_ShapeProduct( input->attr.size, input->attr.dim_num ); + dst_size = vsi_nn_ShapeProduct( shape, dim_num ); + if( total_size != dst_size ) + { + VSILOGE( "Cannot calculate the reshape tensor %u to %u.", + total_size, dst_size ); + return FALSE; + } + + if( TRUE == ret ) + { + if( VSI_NN_DIM_AUTO == output->attr.dim_num ) + { + for( i = 0; i < dim_num; i ++ ) + { + output->attr.size[i] = shape[i]; + } + output->attr.dim_num = dim_num; + } + } + + return ret; +} /* vsi_nn_CalcReshapeTensor() */ + +/* + This function will create a new tensor, + and reshape input to output. +*/ +vsi_nn_tensor_t *vsi_nn_reshape_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + uint32_t * shape, + uint32_t dim_num + ) +{ + vsi_bool ret; + vsi_nn_tensor_t *output = NULL; + vsi_nn_tensor_attr_t attr; + if (NULL == graph || NULL == input || NULL == shape) + { + return NULL; + } + /* New a ovxlib tensor struct */ + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + memcpy(&attr, &input->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + output = vsi_nn_CreateTensor(graph, &attr); + if (NULL == output) + { + VSILOGW("Create tensor fail."); + return NULL; + } + + ret = vsi_nn_ReshapeTensor(graph, input, output, shape, dim_num); + if (FALSE == ret) + { + VSILOGW("Reshape tensor fail."); + vsi_nn_ReleaseTensor(&output); + output = NULL; + } + + return output; +} /* vsi_nn_reshape_tensor() */ + +vsi_bool vsi_nn_ReshapeTensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output, + const uint32_t * shape, + uint32_t dim_num + ) +{ + vsi_bool ret; + uint32_t new_shape[VSI_NN_MAX_DIM_NUM] = {0}; + memcpy(new_shape, shape, sizeof(uint32_t) * dim_num); + + ret = TRUE; + ret = vsi_nn_CalcReshapeTensor(input, output, new_shape, dim_num); + if( FALSE == ret ) + { + return FALSE; + } + + /* Create a openvx tensor if it is not exist */ + if( NULL == input->t ) + { + ret = vsi_nn_TensorReinit( graph, input ); + } + + /* We can not reshape input to output if output->t is already exist */ + if( NULL != output->t ) + { + VSILOGW( "Free tensor." ); + } + + /* Create reshape tensor */ + output->t = vxReshapeTensor( input->t, (int32_t *)new_shape, dim_num ); + if( NULL == output->t ) + { + ret = FALSE; + } + + if( FALSE == ret ) + { + VSILOGW( "Reshape tensor error." ); + } + + return ret; +} /* vsi_nn_ReshapeTensor() */ + +void vsi_nn_TransposeTensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + uint32_t * perm, + uint32_t dim_num, + uint32_t * as_shape + ) +{ + uint8_t * buf; + uint8_t * dst; + uint32_t buf_sz; + uint32_t tensor_sz; + uint32_t * shape_ptr; + vsi_status status; + + if( NULL == tensor || NULL == perm || 0 == dim_num ) + { + VSILOGE( "Wrong perm dims." ); + return; + } + tensor_sz = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, + tensor->attr.dtype.vx_type ); + shape_ptr = tensor->attr.size; + + if( NULL != as_shape ) + { + buf_sz = vsi_nn_GetTensorSize( as_shape, dim_num, tensor->attr.dtype.vx_type ); + if( buf_sz != tensor_sz ) + { + VSILOGW( "The shape does not match origin tensor's shape." ); + return; + } + shape_ptr = as_shape; + } + buf = vsi_nn_ConvertTensorToData( graph, tensor ); + + if( NULL == buf ) + { + VSILOGE( "Create tensor buf fail." 
); + return; + } + dst = (uint8_t *)malloc( tensor_sz * sizeof( uint8_t ) ); + // TODO: Check memory allocate. + + vsi_nn_Transpose( dst, buf, shape_ptr, dim_num, perm, tensor->attr.dtype.vx_type ); + status = vsi_nn_CopyDataToTensor( graph, tensor, dst ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Copy transpose data fail with code %#x.", status ); + } + + free( buf ); + free( dst ); +} /* vsi_nn_TransposeTensor() */ + +void vsi_nn_PermuteTensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor, + uint32_t * perm, + uint32_t dim_num + ) +{ + uint8_t * buf = NULL; + uint8_t * dst = NULL; + uint32_t tensor_sz; + uint32_t * shape_ptr; + uint32_t dst_shape[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t i; + vsi_status status; + + if( NULL == tensor || NULL == perm || 0 == dim_num ) + { + VSILOGE( "Wrong perm parameters." ); + return; + } + tensor_sz = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, + tensor->attr.dtype.vx_type ); + shape_ptr = tensor->attr.size; + + buf = vsi_nn_ConvertTensorToData( graph, tensor ); + + if( NULL == buf ) + { + VSILOGE( "Create tensor buf fail." ); + return; + } + dst = (uint8_t *)malloc( tensor_sz * sizeof( uint8_t ) ); + if ( NULL == dst) + { + VSILOGE( "Malloc dst buf fail." ); + if( buf ) { free(buf); buf = NULL; } + return; + } + + for ( i = 0; i < dim_num; i++) + { + if( perm[i] >= dim_num ) + { + VSILOGW( "Incorrect perm %d", perm[i] ); + if( buf ) { free(buf); buf = NULL; } + if( dst ) { free(dst); dst = NULL; } + return; + } + dst_shape[i] = shape_ptr[perm[i]]; + } + vsi_nn_Permute( dst, buf, shape_ptr, dim_num, perm, tensor->attr.dtype.vx_type ); + memcpy(tensor->attr.size, dst_shape, sizeof(dst_shape)); + tensor->t = vxReshapeTensor(tensor->t, (int32_t *)tensor->attr.size, tensor->attr.dim_num); + status = vsi_nn_CopyDataToTensor( graph, tensor, dst ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Copy permute data fail with code %#x.", status ); + } + + if( buf ) { free(buf); buf = NULL; } + if( dst ) { free(dst); dst = NULL; } +} /* vsi_nn_PermuteTensor() */ + +uint32_t vsi_nn_GetElementNum + ( + const vsi_nn_tensor_t * tensor + ) +{ + if( NULL == tensor ) + { + return 0; + } + + return get_tensor_elements_num(tensor->attr.size, + tensor->attr.dim_num, tensor->attr.dtype.vx_type); +} /* vsi_nn_GetElementNum() */ + +uint32_t vsi_nn_GetTensorSize + ( + const uint32_t * shape, + uint32_t dim_num, + vsi_nn_type_e type + ) +{ + uint32_t sz; + uint32_t i; + sz = 0; + if( NULL == shape || 0 == dim_num ) + { + return sz; + } + sz = 1; + for( i = 0; i < dim_num; i ++ ) + { + sz *= shape[i]; + } + sz *= vsi_nn_GetTypeBytes( type ); + return sz; +} /* vsi_nn_GetTensorSize() */ + +vsi_nn_tensor_t * vsi_nn_VariableToTensor + ( + vsi_nn_node_t * self, + uint8_t * data, + vsi_nn_type_e type + ) +{ + vsi_nn_tensor_t * tensor; + vsi_nn_tensor_attr_t attr; + + if(NULL == data || NULL == self) + { + return NULL; + } + + memset( &attr, 0, sizeof( attr ) ); + attr.size[0] = 1; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = type; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + tensor = vsi_nn_CreateTensorFromData( + self->graph, + data, + &attr); + if(NULL == tensor) + { + return NULL; + } + + return tensor; +} /* vsi_nn_VariableToTensor() */ + +/* + type 0x01: input + type 0x02: output + type 0x03: all +*/ +void vsi_nn_print_node_io + ( + vsi_nn_graph_t *graph, + vsi_nn_node_t *node, + int32_t type + ) +{ + uint32_t i; + vsi_nn_tensor_id_t id; + vsi_nn_tensor_t *tensor; + char index[32]; +#define _TYPE_INPUT 0x01 +#define 
_TYPE_OUTPUT 0x02 + if (!(type & _TYPE_INPUT) && !(type & _TYPE_OUTPUT)) + { + VSILOGW("Can't handle this node io type %d", type); + return; + } + + if (type & _TYPE_INPUT) + { + for (i = 0; i < node->input.num; i++) + { + id = node->input.tensors[i]; + tensor = vsi_nn_GetTensor(graph, id); + snprintf(index, 32, "in(%d) :", i); + print_tensor(tensor, id, index); + } + } + if (type & _TYPE_OUTPUT) + { + for (i = 0; i < node->output.num; i++) + { + id = node->output.tensors[i]; + tensor = vsi_nn_GetTensor(graph, id); + snprintf(index, 32, "out(%d):", i); + print_tensor(tensor, id, index); + } + } +} + +void vsi_nn_PrintNodeIO + ( + vsi_nn_graph_t *graph, + vsi_nn_node_t *node + ) +{ + vsi_nn_print_node_io(graph, node, 0x03); +} /* vsi_nn_PrintNodeIO() */ + +void vsi_nn_PrintTensor + ( + vsi_nn_tensor_t * tensor, + vsi_nn_tensor_id_t id + ) +{ + print_tensor(tensor, id, NULL); +} /* vsi_nn_PrintTensor() */ + +vx_tensor vsi_nn_CreateViewTensor + ( + vsi_nn_graph_t *graph, + uint32_t *start, + uint32_t *end, + vsi_nn_tensor_t *tensor + ) +{ + size_t i,view_dim; + size_t view_start[VSI_NN_MAX_DIM_NUM] = {0}; + size_t view_end[VSI_NN_MAX_DIM_NUM] = {0}; + vx_tensor view_tensor; + if(NULL == graph + || NULL == start + || NULL == end + || NULL == tensor) + { + return NULL; + } + + view_dim = (size_t)tensor->attr.dim_num; + for(i = 0; i < view_dim; i++) + { + view_start[i] = (size_t)start[i]; + view_end[i] = (size_t)end[i]; + } + view_tensor = vxCreateTensorFromView( tensor->t, view_dim, view_start, view_end ); + if( NULL == view_tensor ) + { + VSILOGE("Call vxCreateTensorFromView fail."); + return NULL; + } + + return view_tensor; +} /* vsi_nn_CreateViewTensor() */ + +void *vsi_nn_Malloc + ( + size_t size + ) +{ + void *mem = malloc(size); + return mem; +} /* vsi_nn_Malloc() */ + +void vsi_nn_Free + ( + void * data + ) +{ + if(NULL != data) + { + free(data); + data = NULL; + } +} /* vsi_nn_Free() */ + +void vsi_nn_ReleaseTensorRelevance + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_rel_t *tensor_ref + ) +{ + uint32_t i; + if(NULL == tensor_ref || NULL == graph) + { + return ; + } + + for(i = 0; i < graph->tensor_num; i++) + { + if(tensor_ref[i].input.table) + { + free(tensor_ref[i].input.table); + tensor_ref[i].input.table = NULL; + } + if(tensor_ref[i].output.table) + { + free(tensor_ref[i].output.table); + tensor_ref[i].output.table = NULL; + } + } + + if(tensor_ref) + { + free(tensor_ref); + tensor_ref = NULL; + } +} /* vsi_nn_ReleaseTensorRelevance() */ + +vsi_nn_tensor_rel_t *vsi_nn_CreateTensorRelevance + ( + vsi_nn_graph_t *graph + ) +{ + uint32_t i,j,k; + uint32_t in_num,out_num; + uint32_t max_io,tensor_num; + vsi_nn_tensor_rel_t *tensor_ref; + vsi_nn_node_t *node; + +#define _MAX_TENSOR_IO 32 + max_io = _MAX_TENSOR_IO; + tensor_num = graph->tensor_num; + tensor_ref = _init_tensor_rel_buffer(graph, max_io); + if(NULL == tensor_ref) + { + VSILOGE("init tensor_ref buffer fail"); + return NULL; + } + + for (i = 0; i < tensor_num; i++) + { + in_num = 0; + out_num = 0; + + for(j = 0; j < graph->node_num; j++) + { + node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)j ); + for(k = 0; k < node->output.num; k++) + { + if(node->output.tensors[k] == i) + { + if(in_num > max_io) + { + VSILOGW("tensor ref input num > max_io %u, stop build", max_io); + break; + } + tensor_ref[i].input.table[in_num].node = j; + tensor_ref[i].input.table[in_num].index = k; + in_num++; + } + } + for(k = 0; k < node->input.num; k++) + { + if(node->input.tensors[k] == i) + { + if(out_num > max_io) + { + VSILOGW("tensor ref 
output num > max_io %u, stop build", max_io); + break; + } + tensor_ref[i].output.table[out_num].node = j; + tensor_ref[i].output.table[out_num].index = k; + out_num++; + } + } + } + tensor_ref[i].input.num = in_num; + tensor_ref[i].output.num = out_num; + } + + return tensor_ref; +} /* vsi_nn_CreateTensorRelevance() */ + +vsi_status vsi_nn_SwapTensorHandle + ( + vsi_nn_tensor_t * tensor0, + vsi_nn_tensor_t * tensor1 + ) +{ + uint32_t stride_size[VSI_NN_MAX_DIM_NUM]; + uint32_t buf_sz0, buf_sz1; + vsi_status status = VSI_FAILURE; + + if( NULL == tensor0 || NULL == tensor1 ) + { + VSILOGE("tensor0 or tensor1 is NULL."); + return VSI_FAILURE; + } + + if( !tensor0->attr.is_created_from_handle || !tensor1->attr.is_created_from_handle ) + { + VSILOGE("tensor0 or tensor1 is not created form handle."); + return VSI_FAILURE; + } + + buf_sz0 = vsi_nn_GetStrideSize( &tensor0->attr, stride_size ); + buf_sz1 = vsi_nn_GetStrideSize( &tensor0->attr, stride_size ); + + if( buf_sz0 != buf_sz1 ) + { + VSILOGE("The memory size of tensor0 and tensor1 are not equal."); + return VSI_FAILURE; + } + + status = vxSwapTensor( tensor0->t, tensor1->t ); + if( VX_SUCCESS == status ) + { + tensor0->is_swapped = TRUE; + tensor1->is_swapped = TRUE; + } + + return status; +} /* vsi_nn_SwapTensorHandle() */ + +uint32_t vsi_nn_vxGetTensorElementNum + ( + vsi_nn_tensor_attr_t *attr + ) +{ + if( NULL == attr ) + { + return 0; + } + + return get_tensor_elements_num(attr->size, + attr->dim_num, attr->dtype.vx_type); +} + +vsi_status vsi_nn_vxGetTensorAttr + ( + vx_tensor tensor, + vsi_nn_tensor_attr_t *attr + ) +{ + vsi_status status = VSI_FAILURE; + + if(NULL == tensor || NULL == attr) + { + return status; + } + + status = vxQueryTensor(tensor, VX_TENSOR_NUM_OF_DIMS, + &(attr->dim_num), sizeof(uint32_t)); + TEST_CHECK_STATUS( status, final ); + status = vxQueryTensor(tensor, VX_TENSOR_DIMS, + attr->size, sizeof(uint32_t) * (attr->dim_num)); + TEST_CHECK_STATUS( status, final ); + status = vxQueryTensor(tensor, VX_TENSOR_DATA_TYPE, + &(attr->dtype.vx_type), sizeof(vsi_enum)); + TEST_CHECK_STATUS( status, final ); + status = vxQueryTensor(tensor, VX_TENSOR_QUANT_FORMAT, + &(attr->dtype.qnt_type), sizeof(uint32_t)); + TEST_CHECK_STATUS( status, final ); + switch( attr->dtype.qnt_type ) + { + case VSI_NN_QNT_TYPE_DFP: + status = vxQueryTensor(tensor, VX_TENSOR_FIXED_POINT_POS, + &(attr->dtype.fl), sizeof(int8_t)); + TEST_CHECK_STATUS( status, final ); + break; + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + status = vxQueryTensor(tensor, VX_TENSOR_ZERO_POINT, + &(attr->dtype.zero_point), sizeof(int32_t)); + TEST_CHECK_STATUS( status, final ); + status = vxQueryTensor(tensor, VX_TENSOR_SCALE, + &(attr->dtype.scale), sizeof(float)); + TEST_CHECK_STATUS( status, final ); + break; + default: + break; + } + +final: + return status; +} /* vsi_nn_vxGetTensorAttr() */ + +uint8_t *vsi_nn_vxCopyTensorToData + ( + vx_context context, + vx_tensor tensor, + vsi_nn_tensor_attr_t *attr + ) +{ + uint8_t *data; + vsi_status status; + uint32_t buf_sz; + uint32_t stride_size[VSI_NN_MAX_DIM_NUM]; + + memset(stride_size, 0, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + if(NULL == tensor || NULL == context || NULL == attr) + { + return NULL; + } + data = NULL; + status = VSI_FAILURE; + + buf_sz = vsi_nn_GetStrideSize( attr, stride_size ); + if(0 < buf_sz) + { + data = (uint8_t *)malloc( buf_sz ); + if(NULL == data) + { + return NULL; + } + } + + status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_READ_ONLY); + if(VSI_SUCCESS != status) + { + 
VSILOGE("Copy tensor to data fail"); + free(data); + data = NULL; + } + return data; +} /* vsi_nn_vxCopyTensorToData() */ + +vsi_status vsi_nn_vxCopyDataToTensor + ( + vx_context context, + vx_tensor tensor, + vsi_nn_tensor_attr_t *attr, + uint8_t *data + ) +{ + vsi_status status; + uint32_t stride_size[VSI_NN_MAX_DIM_NUM]; + + status = VSI_FAILURE; + if(NULL == tensor || NULL == attr || + NULL == context || NULL == data) + { + return status; + } + + memset(stride_size, 0, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + vsi_nn_GetStrideSize(attr, stride_size); + status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_WRITE_ONLY); + if(VSI_SUCCESS != status) + { + VSILOGE("Copy data to tensor fail"); + } + return status; +} /* vsi_nn_vxCopyDataToTensor() */ + +vsi_status vsi_nn_copy_tensor_veiw_patch + ( + vx_tensor tensor, + vsi_nn_tensor_attr_t *attr, + void *user_ptr, + uint32_t *start, + uint32_t *end, + uint32_t *stride, + vsi_enum usage, + vsi_enum user_memory_type + ) +{ +#define USE_OPENVX_1_2 + size_t dim,i; + size_t vstart[VSI_NN_MAX_DIM_NUM],vend[VSI_NN_MAX_DIM_NUM],vstride[VSI_NN_MAX_DIM_NUM]; + vsi_status status = VSI_FAILURE; + if(NULL == tensor || NULL == user_ptr || NULL == start || NULL == end || NULL == stride) + { + VSILOGE("Invalid parameter"); + return status; + } + dim = (size_t)attr->dim_num; + for(i = 0; i < dim; i++) + { + vstart[i] = (size_t)start[i]; + vend[i] = (size_t)end[i]; + vstride[i] = (size_t)stride[i]; + } + +#ifdef USE_OPENVX_1_2 + status = vxCopyTensorPatch(tensor, dim, vstart, vend, vstride, user_ptr, usage, user_memory_type); +#else + { + vx_context context = NULL; + vx_tensor_addressing addr = NULL; + uint32_t stride_size[VSI_NN_MAX_DIM_NUM]; + vsi_nn_tensor_attr_t t; + + memset(vstart, 0, sizeof(size_t) * VSI_NN_MAX_DIM_NUM); + memset(vend, 0, sizeof(size_t) * VSI_NN_MAX_DIM_NUM); + memset(vstride, 0, sizeof(size_t) * VSI_NN_MAX_DIM_NUM); + status = vsi_nn_vxGetTensorAttr(tensor, &t); + vsi_nn_GetStrideSize( attr, stride_size ); + context = vxGetContext((vx_reference)tensor); + if( NULL == context ) + { + VSILOGE("Call vxGetContext fail"); + return status; + } + addr = vxCreateTensorAddressing( context, attr->size, + stride_size, attr->dim_num ); + if( NULL == addr ) + { + VSILOGE("Call vxCreateTensorAddressing fail"); + return status; + } + status = vxCopyTensorPatch_11( tensor, + NULL, + addr, + user_ptr, + usage, + user_memory_type + ); + vxReleaseTensorAddressing( &addr ); + if( VSI_SUCCESS != status ) + { + VSILOGE("Call vxCopyTensorPatch_11 fail"); + return status; + } + } +#endif + return status; +} /* vsi_nn_copy_tensor_veiw_patch() */ + +vsi_status vsi_nn_copy_tensor_patch + ( + vx_tensor tensor, + vsi_nn_tensor_attr_t *attr, + void * user_ptr, + vsi_enum usage + ) +{ + uint32_t start[VSI_NN_MAX_DIM_NUM],end[VSI_NN_MAX_DIM_NUM],stride[VSI_NN_MAX_DIM_NUM]; + vsi_status status = VSI_FAILURE; + if(NULL == tensor || NULL == user_ptr) + { + VSILOGE("Invalid parameter"); + return status; + } + vsi_nn_GetStrideSize(attr, stride); + memset(start, 0, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + memcpy(end, attr->size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + status = vsi_nn_copy_tensor_veiw_patch(tensor, attr, user_ptr, start, end, stride, usage, 0); + return status; +} /* vsi_nn_copy_tensor_patch() */ + +uint32_t vsi_nn_GetOffsetByCoords + ( + vsi_nn_tensor_attr_t *attr, + uint32_t *coords + ) +{ + uint32_t i, res = 0, strides = 1; + for (i = 0; i < attr->dim_num; i++) + { + res += coords[i] * strides; + strides *= attr->size[i]; + } + return res; +} + 
+void vsi_nn_reshuffle_weight_data + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * weights + ) +{ + int32_t b, sy, sx, c, h, w; + uint8_t* weight_data = NULL; + uint8_t* reshuffled_weights = NULL; + uint8_t* buffer = NULL; + int32_t kernel_size_x = weights->attr.size[0]; + int32_t kernel_size_y = weights->attr.size[1]; + int32_t weight_size_c = weights->attr.size[2]; + int32_t weight_size_b = weights->attr.size[3]; + int32_t slice_size = kernel_size_x * kernel_size_y; + int32_t item_size = vsi_nn_TypeGetBytes(weights->attr.dtype.vx_type); + + weight_data = vsi_nn_ConvertTensorToData(graph, weights); + buffer = (uint8_t*)malloc(item_size * slice_size * weight_size_c * weight_size_b); + memset(buffer, 0x00, item_size * slice_size * weight_size_c * weight_size_b); + memcpy(buffer, weight_data, item_size * slice_size * weight_size_c * weight_size_b); +#if 0 // transpose whnc to whcn if need + for (b = 0; b < weight_size_b; b++) + { + for (c = 0; c < weight_size_c; c++) + { + memcpy(buffer + kernel_size_x * kernel_size_y * (c * weight_size_b + b) * item_size, + weight_data + kernel_size_x * kernel_size_y * (b * weight_size_c + c) * item_size, + item_size * slice_size); + } + } +#endif + reshuffled_weights = weight_data; + for (b = 0; b < weight_size_b; b++) + { + for (sy = 0; sy < 1; sy++) + { + for (sx = 0; sx < 1; sx++) + { + for (c = 0; c < weight_size_c; c++) + { + uint8_t* weight_output = reshuffled_weights + + (b * slice_size * weight_size_c + slice_size * c) * item_size; + + uint8_t* data = buffer + (b * slice_size * weight_size_c + slice_size * c) * item_size; + + for (h = 0; h < kernel_size_y; h++) + { + for (w = 0; w < kernel_size_x; w++) + { + uint8_t* reshuffled_output = weight_output + (h * kernel_size_x + w) * item_size; + int32_t input_index = ((kernel_size_y - 1 - h) + sy) * kernel_size_x + + ((kernel_size_x - 1 - w) + sx); + + memcpy(reshuffled_output, data + input_index * item_size, item_size); + } + } + } + } + } + } + vsi_nn_CopyDataToTensor( graph, weights, weight_data ); + vsi_nn_Free( buffer ); + vsi_nn_Free( weight_data ); +} + +vsi_nn_tensor_t* vsi_nn_ConcatTensor_impl + ( + vsi_nn_graph_t* graph, + uint32_t axis, + ... + ) +{ + va_list args; + vsi_nn_tensor_t* next = NULL; + vsi_nn_tensor_t** tensors = NULL; + int tensor_count = 0; + + va_start(args, axis); + + FOREACH_ARGS(args, next, vsi_nn_tensor_t*) + { + tensor_count++; + } + va_end(args); + + tensors = (vsi_nn_tensor_t**)malloc(sizeof(vsi_nn_tensor_t*) * tensor_count); + tensor_count = 0; + va_start(args, axis); + + FOREACH_ARGS(args, next, vsi_nn_tensor_t*) + { + tensors[tensor_count++] = next; + } + va_end(args); + + next = vsi_nn_Concat(graph, tensors, tensor_count, axis); + + vsi_nn_safe_free(tensors); + + return next; +} + +vsi_nn_tensor_t* vsi_nn_ConstTensorAdd_impl + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_attr_t output_attr, + ... 
+ ) +{ + va_list args; + vsi_nn_tensor_t* next = NULL; + vsi_nn_tensor_t** tensors = NULL; + int tensor_count = 0; + + va_start(args, output_attr); + FOREACH_ARGS(args, next, vsi_nn_tensor_t*) + { + tensor_count++; + } + va_end(args); + + tensors = (vsi_nn_tensor_t**)malloc(sizeof(vsi_nn_tensor_t*) * tensor_count); + tensor_count = 0; + va_start(args, output_attr); + FOREACH_ARGS(args, next, vsi_nn_tensor_t*) + { + tensors[tensor_count++] = next; + } + va_end(args); + + next = vsi_nn_TensorAdd(graph, tensors, tensor_count, output_attr); + + vsi_nn_safe_free(tensors); + + return next; +} + +vsi_status vsi_nn_SwapHandle + ( + vsi_nn_tensor_t * tensor, + void * new_ptr, + void ** old_ptr + ) +{ + if(!tensor) + { + return VSI_FAILURE; + } + vxSwapTensorHandle(tensor->t, new_ptr, old_ptr); + return VSI_SUCCESS; +} /* vsi_nn_SwapHandle() */ + diff --git a/src/tim/vx/internal/src/vsi_nn_version.c b/src/tim/vx/internal/src/vsi_nn_version.c new file mode 100644 index 0000000..d7abca3 --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_version.c @@ -0,0 +1,53 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include +#include "vsi_nn_version.h" + +#define MACRO_TO_STRING(M) #M +#define VERSION_PREFIX "OVXLIB_VERSION==" +#define DEF_VERSION(a,b,c) VERSION_PREFIX MACRO_TO_STRING(a)"." MACRO_TO_STRING(b)"." 
MACRO_TO_STRING(c) +#define DEF_VERSION_STR DEF_VERSION(VSI_NN_VERSION_MAJOR,VSI_NN_VERSION_MINOR,VSI_NN_VERSION_PATCH) + +const char *vsi_nn_GetVersion(void) +{ + static const char *version = DEF_VERSION_STR; + return version; +} + +uint32_t vsi_nn_GetVersionMajor(void) +{ + return VSI_NN_VERSION_MAJOR; +} + +uint32_t vsi_nn_GetVersionMinor(void) +{ + return VSI_NN_VERSION_MINOR; +} + +uint32_t vsi_nn_GetVersionPatch(void) +{ + return VSI_NN_VERSION_PATCH; +} \ No newline at end of file diff --git a/src/tim/vx/operation.cc b/src/tim/vx/operation.cc new file mode 100644 index 0000000..2295441 --- /dev/null +++ b/src/tim/vx/operation.cc @@ -0,0 +1,126 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include "tim/vx/operation.h" + +#include + +#include "graph_private.h" +#include "operation_private.h" +#include "type_utils.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { + +// OperationImpl implementation +OperationImpl::OperationImpl(Graph* graph, uint32_t operation_id, int input_cnt, + int output_cnt) + : graph_(reinterpret_cast(graph)), + operation_id_(operation_id), + input_cnt_(input_cnt), + output_cnt_(output_cnt), + node_(vsi_nn_AddNode(graph_->graph(), operation_id_, input_cnt_, + output_cnt_, NULL)) { + SetRoundingPolicy(); + node_->uid = graph_->graph()->cur_nid; +} + +OperationImpl& OperationImpl::BindInput(const std::shared_ptr& tensor) { + uint32_t tensor_id = tensor->GetId(); + node_->input.tensors[input_tensor_index++] = tensor_id; + if (tensor->GetSpec().attr_ == TensorAttribute::INPUT) { + graph_->AddInput(tensor_id); + } + return *this; +} + +OperationImpl& OperationImpl::BindOutput( + const std::shared_ptr& tensor) { + uint32_t tensor_id = tensor->GetId(); + node_->output.tensors[output_tensor_index++] = tensor_id; + if (tensor->GetSpec().attr_ == TensorAttribute::OUTPUT) { + graph_->AddOutput(tensor_id); + } + return *this; +} + +OperationImpl& OperationImpl::SetRoundingPolicy( + OverflowPolicy overflow_policy, RoundingPolicy rounding_policy, + DownScaleSizeRounding down_scale_size_rounding, uint32_t accumulator_bits) { + node_->vx_param.overflow_policy = TranslateOverflowPolicy(overflow_policy); + node_->vx_param.rounding_policy = TranslateRoundingPolicy(rounding_policy); + node_->vx_param.down_scale_size_rounding = + TranslateDownScaleSizeRounding(down_scale_size_rounding); + node_->vx_param.accumulator_bits = accumulator_bits; + + return *this; +} + +// Operation implementation +Operation::Operation(Graph* graph, uint32_t operation_id, int input_cnt, + int output_cnt) { + impl_ = std::make_unique(graph, operation_id, input_cnt, + output_cnt); +} + +Operation::~Operation() {} + +std::unique_ptr& Operation::impl() { return impl_; } + +Operation& Operation::BindInput(const std::shared_ptr& tensor) { + impl_->BindInput(tensor); + return *this; +} + +Operation& Operation::BindOutput(const std::shared_ptr& tensor) { + impl_->BindOutput(tensor); + return *this; +} + +Operation& Operation::SetRoundingPolicy( + OverflowPolicy overflow_policy, RoundingPolicy rounding_policy, + DownScaleSizeRounding down_scale_size_rounding, uint32_t accumulator_bits) { + impl_->SetRoundingPolicy(overflow_policy, rounding_policy, + down_scale_size_rounding, accumulator_bits); + return *this; +} + +Operation& Operation::BindInputs( + const std::vector>& tensors) { + for (auto& t : tensors) { + BindInput(t); + } + return *this; +} + +Operation& Operation::BindOutputs( + const std::vector>& tensors) { + for (auto& t : tensors) { + BindOutput(t); + } + return *this; +} + +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/operation_private.h b/src/tim/vx/operation_private.h new file mode 100644 index 0000000..60c0be0 --- /dev/null +++ b/src/tim/vx/operation_private.h @@ -0,0 +1,60 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, 
copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_OPERATION_PRIVATE_H_ +#define TIM_VX_OPERATION_PRIVATE_H_ +#include "graph_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +class OperationImpl { + public: + OperationImpl(Graph* graph, uint32_t operation_id, int input_cnt = 0, + int output_cnt = 0); + ~OperationImpl() {} + + OperationImpl& BindInput(const std::shared_ptr& tensor); + OperationImpl& BindOutput(const std::shared_ptr& tensor); + OperationImpl& SetRoundingPolicy( + OverflowPolicy overflow_policy = OverflowPolicy::SATURATE, + RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO, + DownScaleSizeRounding down_scale_size_rounding = + DownScaleSizeRounding::FLOOR, + uint32_t accumulator_bits = 0); + + vsi_nn_node_t* node() { return this->node_; } + + GraphImpl* graph_; + uint32_t operation_id_{0}; + int32_t input_cnt_{0}; + int32_t output_cnt_{0}; + vsi_nn_node_t* node_{nullptr}; + int32_t input_tensor_index{0}; + int32_t output_tensor_index{0}; +}; + +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_OPERATION_PRIVATE_H_ */ \ No newline at end of file diff --git a/src/tim/vx/ops/activations.cc b/src/tim/vx/ops/activations.cc new file mode 100644 index 0000000..f3a7e39 --- /dev/null +++ b/src/tim/vx/ops/activations.cc @@ -0,0 +1,61 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include "tim/vx/ops/activations.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +#define DEFINE_NO_PARAMETER_ACTIVATION(NAME, VSI_OP_CODE) \ + NAME::NAME(Graph* graph) : Operation(graph, VSI_OP_CODE) {} + +DEFINE_NO_PARAMETER_ACTIVATION(Relu, VSI_NN_OP_RELU); +DEFINE_NO_PARAMETER_ACTIVATION(Relu1, VSI_NN_OP_RELU1); +DEFINE_NO_PARAMETER_ACTIVATION(Relu6, VSI_NN_OP_RELU6); +DEFINE_NO_PARAMETER_ACTIVATION(Elu, VSI_NN_OP_ELU); +DEFINE_NO_PARAMETER_ACTIVATION(Sigmoid, VSI_NN_OP_SIGMOID); + +#undef DEFINE_NO_PARAMETER_ACTIVATION + +HardSwish::HardSwish(Graph* graph) : Operation(graph, VSI_NN_OP_SWISH) { + this->impl()->node()->nn_param.swish.type = VSI_NN_HSWISH; + this->impl()->node()->nn_param.swish.beta = 1.0f; +} + +Prelu::Prelu(Graph* graph, int axis) + : Operation(graph, VSI_NN_OP_PRELU), axis_(axis) { + this->impl()->node()->nn_param.prelu.axis = axis_; +} + +Tanh::Tanh(Graph* graph) : Operation(graph, VSI_NN_OP_TANH) { + this->impl()->node()->nn_param.tanh.scale_a = 1.0; + this->impl()->node()->nn_param.tanh.scale_b = 1.0; +} + +} // namespace ops +} // namespace vx +} // namespace tim diff --git a/src/tim/vx/ops/addn.cc b/src/tim/vx/ops/addn.cc new file mode 100644 index 0000000..1428ebc --- /dev/null +++ b/src/tim/vx/ops/addn.cc @@ -0,0 +1,38 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
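The activation wrappers in src/tim/vx/ops/activations.cc above are thin subclasses of Operation, so they are driven through the fluent BindInput/BindOutput interface from src/tim/vx/operation.cc. A minimal usage sketch, assuming `graph`, `in`, and `out` come from the Graph/Tensor APIs declared in include/tim/vx (not part of this excerpt):

```cpp
// Sketch only: graph, in and out are assumed to be created through the
// tim::vx::Graph / Tensor APIs added elsewhere in this commit.
#include <memory>

#include "tim/vx/graph.h"
#include "tim/vx/ops/activations.h"
#include "tim/vx/tensor.h"

void AttachHardSwish(tim::vx::Graph* graph,
                     const std::shared_ptr<tim::vx::Tensor>& in,
                     const std::shared_ptr<tim::vx::Tensor>& out) {
  // HardSwish maps onto VSI_NN_OP_SWISH with type VSI_NN_HSWISH and beta 1.0f.
  tim::vx::ops::HardSwish hswish(graph);
  // BindInput/BindOutput return Operation&, so the calls can be chained;
  // parameter-free activations such as Relu or Sigmoid are used the same way.
  hswish.BindInput(in).BindOutput(out);
}
```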
+* +*****************************************************************************/ +#include "tim/vx/ops/addn.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +AddN::AddN(Graph* graph, uint32_t num_inputs) + : Operation(graph, VSI_NN_OP_ADDN, num_inputs, 1) {} + +} // namespace ops +} // namespace vx +} // namespace tim diff --git a/src/tim/vx/ops/batch2space.cc b/src/tim/vx/ops/batch2space.cc new file mode 100644 index 0000000..a567336 --- /dev/null +++ b/src/tim/vx/ops/batch2space.cc @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include "tim/vx/ops/batch2space.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +Batch2Space::Batch2Space(Graph* graph, const std::vector& block_size, + const std::vector& crop) + : Operation(graph, VSI_NN_OP_BATCH2SPACE), + block_size_(block_size), + crop_(crop) { + this->impl()->node()->nn_param.batch2space.block_size = block_size_.data(); + this->impl()->node()->nn_param.batch2space.block_size_num = block_size.size(); + // the size of crop_ should be 4. + for (size_t i = 0; i < crop_.size(); i++) { + this->impl()->node()->nn_param.batch2space.crop[i] = crop_[i]; + } +} +} // namespace ops +} // namespace vx +} // namespace tim diff --git a/src/tim/vx/ops/concat.cc b/src/tim/vx/ops/concat.cc new file mode 100644 index 0000000..18e5337 --- /dev/null +++ b/src/tim/vx/ops/concat.cc @@ -0,0 +1,40 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include "tim/vx/ops/concat.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +Concat::Concat(Graph* graph, uint32_t axis, int input_cnt) + : Operation(graph, VSI_NN_OP_CONCAT, input_cnt, 1), axis_(axis) { + this->impl()->node()->nn_param.concat.axis = axis_; +} + +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/ops/conv2d.cc b/src/tim/vx/ops/conv2d.cc new file mode 100644 index 0000000..b043cf6 --- /dev/null +++ b/src/tim/vx/ops/conv2d.cc @@ -0,0 +1,72 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include "tim/vx/ops/conv2d.h" + +#include "operation_private.h" +#include "type_utils.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +Conv2d::Conv2d(Graph* graph, int32_t weights, PadType padding, + const std::array& ksize, + const std::array& stride, + const std::array& dilation, int32_t multiplier) + : Conv2d(graph, weights, padding, ksize, stride, dilation, {0, 0, 0, 0}, + multiplier) {} + +Conv2d::Conv2d(Graph* graph, int32_t weights, PadType padding, + const std::array& ksize, + const std::array& stride, + const std::array& dilation, + const std::array& pad, int32_t multiplier) + : Operation(graph, VSI_NN_OP_CONV2D), + weights_(weights), + padding_(padding), + ksize_(ksize), + stride_(stride), + dilation_(dilation), + pad_(pad), + multiplier_(multiplier) { + this->impl()->node()->nn_param.conv2d.ksize[0] = ksize_[0]; + this->impl()->node()->nn_param.conv2d.ksize[1] = ksize_[1]; + this->impl()->node()->nn_param.conv2d.stride[0] = stride_[0]; + this->impl()->node()->nn_param.conv2d.stride[1] = stride_[1]; + this->impl()->node()->nn_param.conv2d.pad_type = TranslatePadType(padding_); + this->impl()->node()->nn_param.conv2d.weights = weights; + this->impl()->node()->nn_param.conv2d.group = 1; + this->impl()->node()->nn_param.conv2d.dilation[0] = dilation_[0]; + this->impl()->node()->nn_param.conv2d.dilation[1] = dilation_[1]; + this->impl()->node()->nn_param.conv2d.pad[0] = pad_[0]; + this->impl()->node()->nn_param.conv2d.pad[1] = pad_[1]; + this->impl()->node()->nn_param.conv2d.pad[2] = pad_[2]; + this->impl()->node()->nn_param.conv2d.pad[3] = pad_[3]; + this->impl()->node()->nn_param.conv2d.multiplier = multiplier_; +} + +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/ops/depth2space.cc b/src/tim/vx/ops/depth2space.cc new file mode 100644 index 0000000..b2d5abf --- /dev/null +++ b/src/tim/vx/ops/depth2space.cc @@ -0,0 +1,40 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
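For reference, a hedged sketch of how the Conv2d wrapper from src/tim/vx/ops/conv2d.cc above might be instantiated. The enum value PadType::SAME and the (input, weight, bias) binding order are assumptions drawn from common ovxlib usage rather than from this excerpt; the tensors are again presumed to come from the graph API.

```cpp
// Sketch only: illustrative parameters; PadType::SAME and the binding order
// (input, weight, bias) are assumptions, not shown in this diff.
#include <memory>

#include "tim/vx/graph.h"
#include "tim/vx/ops/conv2d.h"
#include "tim/vx/tensor.h"

void AttachConv2d(tim::vx::Graph* graph,
                  const std::shared_ptr<tim::vx::Tensor>& input,
                  const std::shared_ptr<tim::vx::Tensor>& weight,
                  const std::shared_ptr<tim::vx::Tensor>& bias,
                  const std::shared_ptr<tim::vx::Tensor>& output) {
  // 32 output channels, 3x3 kernel, stride 1, dilation 1, multiplier 0
  // (a regular, non-depthwise convolution; group is fixed to 1 above).
  tim::vx::ops::Conv2d conv(graph, /*weights=*/32, tim::vx::PadType::SAME,
                            /*ksize=*/{3, 3}, /*stride=*/{1, 1},
                            /*dilation=*/{1, 1}, /*multiplier=*/0);
  conv.BindInputs({input, weight, bias}).BindOutput(output);
}
```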
+* +*****************************************************************************/ +#include "tim/vx/ops/depth2space.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { + +namespace ops { + +DepthToSpace::DepthToSpace(Graph* graph, int block_size) + : Operation(graph, VSI_NN_OP_DEPTH2SPACE), block_size_(block_size) { + this->impl()->node()->nn_param.depth2space.block_size = block_size_; +} +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/ops/elementwise.cc b/src/tim/vx/ops/elementwise.cc new file mode 100644 index 0000000..4ee564f --- /dev/null +++ b/src/tim/vx/ops/elementwise.cc @@ -0,0 +1,64 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include "tim/vx/ops/elementwise.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +#define DEFINE_ELEMENTWISE_UNARY_OP(NAME, VSI_OP_CODE) \ + NAME::NAME(Graph* graph) : Operation(graph, VSI_OP_CODE) {} + +DEFINE_ELEMENTWISE_UNARY_OP(Abs, VSI_NN_OP_ABS); +DEFINE_ELEMENTWISE_UNARY_OP(Sin, VSI_NN_OP_SIN); +// TODO(jiangbo): enable it when ovxlib supports `Cos` +//DEFINE_ELEMENTWISE_UNARY_OP(Cos, VSI_NN_OP_COS); +DEFINE_ELEMENTWISE_UNARY_OP(Exp, VSI_NN_OP_EXP); +DEFINE_ELEMENTWISE_UNARY_OP(Log, VSI_NN_OP_LOG); +DEFINE_ELEMENTWISE_UNARY_OP(Sqrt, VSI_NN_OP_SQRT); +DEFINE_ELEMENTWISE_UNARY_OP(Rsqrt, VSI_NN_OP_RSQRT); +DEFINE_ELEMENTWISE_UNARY_OP(Square, VSI_NN_OP_SQUARE); +DEFINE_ELEMENTWISE_UNARY_OP(LogicalNot, VSI_NN_OP_LOGICAL_NOT); + +#undef DEFINE_ELEMENTWISE_UNARY_OP + +#define DEFINE_ELEMENTWISE_BINARY_OP(NAME, VSI_OP_CODE) \ + NAME::NAME(Graph* graph) : Operation(graph, VSI_OP_CODE, 2, 1) {} + +DEFINE_ELEMENTWISE_BINARY_OP(Minimum, VSI_NN_OP_MINIMUM); +DEFINE_ELEMENTWISE_BINARY_OP(Maximum, VSI_NN_OP_MAXIMUM); +DEFINE_ELEMENTWISE_BINARY_OP(Add, VSI_NN_OP_ADD); +DEFINE_ELEMENTWISE_BINARY_OP(Sub, VSI_NN_OP_SUBTRACT); +DEFINE_ELEMENTWISE_BINARY_OP(Div, VSI_NN_OP_DIVIDE); +DEFINE_ELEMENTWISE_BINARY_OP(Multiply, VSI_NN_OP_MULTIPLY); +DEFINE_ELEMENTWISE_BINARY_OP(Pow, VSI_NN_OP_POW); + +#undef DEFINE_ELEMENTWISE_BINARY_OP + +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/ops/fullyconnected.cc b/src/tim/vx/ops/fullyconnected.cc new file mode 100644 index 0000000..c6147a3 --- /dev/null +++ b/src/tim/vx/ops/fullyconnected.cc @@ -0,0 +1,41 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
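The binary element-wise wrappers above are registered with exactly two inputs and one output, so BindInputs with a two-element list is the natural call pattern. A minimal sketch, with the graph and tensors again assumed to come from the graph API:

```cpp
// Sketch only: binary element-wise ops take exactly 2 inputs and 1 output.
#include <memory>

#include "tim/vx/graph.h"
#include "tim/vx/ops/elementwise.h"
#include "tim/vx/tensor.h"

void AttachAdd(tim::vx::Graph* graph,
               const std::shared_ptr<tim::vx::Tensor>& a,
               const std::shared_ptr<tim::vx::Tensor>& b,
               const std::shared_ptr<tim::vx::Tensor>& sum) {
  tim::vx::ops::Add add(graph);
  add.BindInputs({a, b}).BindOutput(sum);
  // Unary ops (Abs, Exp, Sqrt, ...) are constructed the same way but bind a
  // single input.
}
```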
+* +*****************************************************************************/ +#include "tim/vx/ops/fullyconnected.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +FullyConnected::FullyConnected(Graph* graph, uint32_t axis, uint32_t weights) + : Operation(graph, VSI_NN_OP_FCL) { + this->impl()->node()->nn_param.fcl.axis = axis; + this->impl()->node()->nn_param.fcl.weights = weights; +} + +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/ops/gather.cc b/src/tim/vx/ops/gather.cc new file mode 100644 index 0000000..302b2c1 --- /dev/null +++ b/src/tim/vx/ops/gather.cc @@ -0,0 +1,40 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include "tim/vx/ops/gather.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { + +namespace ops { + +Gather::Gather(Graph* graph, int axis) + : Operation(graph, VSI_NN_OP_GATHER), axis_(axis) { + this->impl()->node()->nn_param.gather.axis = axis_; +} +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/ops/l2normalization.cc b/src/tim/vx/ops/l2normalization.cc new file mode 100644 index 0000000..4b882a7 --- /dev/null +++ b/src/tim/vx/ops/l2normalization.cc @@ -0,0 +1,39 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include "tim/vx/ops/l2normalization.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { +L2Normalization::L2Normalization(Graph* graph, int32_t axis) + : Operation(graph, VSI_NN_OP_L2_NORMALIZE), axis_(axis) { + this->impl()->node()->nn_param.l2_normalize.axis = axis_; +} + +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/ops/localresponsenormalization.cc b/src/tim/vx/ops/localresponsenormalization.cc new file mode 100644 index 0000000..a4bcaa2 --- /dev/null +++ b/src/tim/vx/ops/localresponsenormalization.cc @@ -0,0 +1,52 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include "tim/vx/ops/localresponsenormalization.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { +LocalResponseNormalization::LocalResponseNormalization(Graph* graph, + uint32_t size, + float alpha, float beta, + float bias, int32_t axis) + : Operation(graph, VSI_NN_OP_LRN2), + size_(size), + alpha_(alpha), + beta_(beta), + bias_(bias), + axis_(axis) { + this->impl()->node()->nn_param.lrn.size = size_ * 2 + 1; + this->impl()->node()->nn_param.lrn.alpha = alpha_; + this->impl()->node()->nn_param.lrn.beta = beta_; + this->impl()->node()->nn_param.lrn.bias = bias_; + this->impl()->node()->nn_param.lrn.axis = axis_; + this->impl()->node()->nn_param.lrn.type = + VX_CONVOLUTIONAL_NETWORK_NORM_ACROSS_MAPS; +} +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/ops/pad.cc b/src/tim/vx/ops/pad.cc new file mode 100644 index 0000000..eb17e39 --- /dev/null +++ b/src/tim/vx/ops/pad.cc @@ -0,0 +1,46 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
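Note that the LocalResponseNormalization constructor above passes `size * 2 + 1` to ovxlib, so the argument behaves like a per-side radius: `size = 2` yields a five-element normalization window. A hedged sketch with purely illustrative alpha/beta/bias/axis values:

```cpp
// Sketch only: size 2 becomes an ovxlib window of 2 * 2 + 1 = 5 elements;
// the remaining values are illustrative, not taken from this diff.
#include <memory>

#include "tim/vx/graph.h"
#include "tim/vx/ops/localresponsenormalization.h"
#include "tim/vx/tensor.h"

void AttachLrn(tim::vx::Graph* graph,
               const std::shared_ptr<tim::vx::Tensor>& in,
               const std::shared_ptr<tim::vx::Tensor>& out) {
  tim::vx::ops::LocalResponseNormalization lrn(
      graph, /*size=*/2, /*alpha=*/1e-4f, /*beta=*/0.75f, /*bias=*/1.0f,
      /*axis=*/2);
  lrn.BindInput(in).BindOutput(out);
}
```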
+* +*****************************************************************************/ +#include "tim/vx/ops/pad.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { +Pad::Pad(Graph* graph, const std::vector& front_size, + const std::vector& back_size, int32_t const_val) + : Operation(graph, VSI_NN_OP_PAD), + front_size_(front_size), + back_size_(back_size), + const_val_(const_val) { + this->impl()->node()->nn_param.pad.front_size = front_size_.data(); + this->impl()->node()->nn_param.pad.back_size = back_size_.data(); + this->impl()->node()->nn_param.pad.dim_num = front_size_.size(); + this->impl()->node()->nn_param.pad.const_val = const_val_; + this->impl()->node()->nn_param.pad.mode = VSI_NN_PAD_MODE_CONSTANT; +} +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/ops/permute.cc b/src/tim/vx/ops/permute.cc new file mode 100644 index 0000000..228e92b --- /dev/null +++ b/src/tim/vx/ops/permute.cc @@ -0,0 +1,41 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include "tim/vx/ops/permute.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +Permute::Permute(Graph* graph, const std::vector& perm) + : Operation(graph, VSI_NN_OP_PERMUTE), perm_(std::move(perm)) { + this->impl()->node()->nn_param.permute.perm = perm_.data(); + this->impl()->node()->nn_param.permute.dim_num = perm_.size(); +} + +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/ops/pool2d.cc b/src/tim/vx/ops/pool2d.cc new file mode 100644 index 0000000..3d13305 --- /dev/null +++ b/src/tim/vx/ops/pool2d.cc @@ -0,0 +1,55 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include "tim/vx/ops/pool2d.h" + +#include "operation_private.h" +#include "type_utils.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +Pool2d::Pool2d(Graph* graph, PoolType type, PadType padding, + const std::array& ksize, + const std::array& stride, RoundType round_type) + : Operation(graph, VSI_NN_OP_POOL, 1, 1), + type_(type), + padding_(padding), + ksize_(ksize), + stride_(stride), + round_type_(round_type) { + this->impl()->node()->nn_param.pool.type = TranslatePoolType(type_); + this->impl()->node()->nn_param.pool.round_type = + TranslateRoundType(round_type_); + this->impl()->node()->nn_param.pool.ksize[0] = ksize_[0]; + this->impl()->node()->nn_param.pool.ksize[1] = ksize_[1]; + this->impl()->node()->nn_param.pool.stride[0] = stride_[0]; + this->impl()->node()->nn_param.pool.stride[1] = stride_[1]; + this->impl()->node()->nn_param.pool.pad_type = TranslatePadType(padding_); +} + +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/ops/reduce.cc b/src/tim/vx/ops/reduce.cc new file mode 100644 index 0000000..9af8421 --- /dev/null +++ b/src/tim/vx/ops/reduce.cc @@ -0,0 +1,55 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
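A hedged sketch of the Pool2d wrapper above; PoolType::MAX, PadType::VALID, and RoundType::FLOOR are assumed to be among the enum values declared in include/tim/vx/types.h, which is not part of this excerpt.

```cpp
// Sketch only: a 2x2 max pooling with stride 2; the enum values used here
// are assumptions about types.h, not shown in this diff.
#include <memory>

#include "tim/vx/graph.h"
#include "tim/vx/ops/pool2d.h"
#include "tim/vx/tensor.h"

void AttachMaxPool(tim::vx::Graph* graph,
                   const std::shared_ptr<tim::vx::Tensor>& in,
                   const std::shared_ptr<tim::vx::Tensor>& out) {
  tim::vx::ops::Pool2d pool(graph, tim::vx::PoolType::MAX,
                            tim::vx::PadType::VALID,
                            /*ksize=*/{2, 2}, /*stride=*/{2, 2},
                            tim::vx::RoundType::FLOOR);
  pool.BindInput(in).BindOutput(out);
}
```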
+* +*****************************************************************************/ +#include "tim/vx/ops/reduce.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +#define DEFINE_REDUCE_OP(NAME, VSI_OP_CODE) \ + Reduce##NAME::Reduce##NAME(Graph* graph, const std::vector& axis, \ + bool keep_dims) \ + : Operation(graph, VSI_NN_OP_REDUCE), \ + axis_(std::move(axis)), \ + keep_dims_(keep_dims) { \ + this->impl()->node()->nn_param.reduce.type = VSI_OP_CODE; \ + this->impl()->node()->nn_param.reduce.axis = axis_.data(); \ + this->impl()->node()->nn_param.reduce.axis_num = axis_.size(); \ + this->impl()->node()->nn_param.reduce.keep_dim = keep_dims_; \ + } + +DEFINE_REDUCE_OP(Min, VSI_NN_REDUCE_MIN); +DEFINE_REDUCE_OP(Max, VSI_NN_REDUCE_MAX); +DEFINE_REDUCE_OP(Any, VSI_NN_REDUCE_ANY); +DEFINE_REDUCE_OP(Prod, VSI_NN_REDUCE_PROD); +DEFINE_REDUCE_OP(Mean, VSI_NN_REDUCE_MEAN); + +#undef DEFINE_REDUCE_OP + +} // namespace ops +} // namespace vx +} // namespace tim diff --git a/src/tim/vx/ops/reshape.cc b/src/tim/vx/ops/reshape.cc new file mode 100644 index 0000000..63cf948 --- /dev/null +++ b/src/tim/vx/ops/reshape.cc @@ -0,0 +1,41 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
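The DEFINE_REDUCE_OP macro above stamps out ReduceMin, ReduceMax, ReduceAny, ReduceProd, and ReduceMean, each storing its own copy of the axis vector so the pointer handed to ovxlib stays valid. A minimal sketch:

```cpp
// Sketch only: mean-reduce over the first two dimensions, keeping their rank.
#include <memory>

#include "tim/vx/graph.h"
#include "tim/vx/ops/reduce.h"
#include "tim/vx/tensor.h"

void AttachReduceMean(tim::vx::Graph* graph,
                      const std::shared_ptr<tim::vx::Tensor>& in,
                      const std::shared_ptr<tim::vx::Tensor>& out) {
  tim::vx::ops::ReduceMean mean(graph, /*axis=*/{0, 1}, /*keep_dims=*/true);
  mean.BindInput(in).BindOutput(out);
}
```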
+* +*****************************************************************************/ +#include "tim/vx/ops/reshape.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +Reshape::Reshape(Graph* graph, const std::vector& size) + : Operation(graph, VSI_NN_OP_RESHAPE), size_(std::move(size)) { + this->impl()->node()->nn_param.reshape.size = size_.data(); + this->impl()->node()->nn_param.reshape.dim_num = size_.size(); +} + +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/ops/resize.cc b/src/tim/vx/ops/resize.cc new file mode 100644 index 0000000..a06b1c6 --- /dev/null +++ b/src/tim/vx/ops/resize.cc @@ -0,0 +1,54 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include "tim/vx/ops/resize.h" + +#include "operation_private.h" +#include "type_utils.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +Resize::Resize(Graph* graph, ResizeType type, float factor, bool align_corners, + bool half_pixel_centers, int target_height, int target_width) + : Operation(graph, VSI_NN_OP_RESIZE), + type_(type), + factor_(factor), + align_corners_(align_corners), + half_pixel_centers_(half_pixel_centers), + target_height_(target_height), + target_width_(target_width) { + impl()->node()->nn_param.resize.type = TranslateResizeType(type); + impl()->node()->nn_param.resize.factor = factor; + impl()->node()->nn_param.resize.align_corners = ToVxBool(align_corners); + impl()->node()->nn_param.resize.half_pixel_centers = + ToVxBool(half_pixel_centers); + impl()->node()->nn_param.resize.size[0] = target_width; + impl()->node()->nn_param.resize.size[1] = target_height; +} + +} // namespace ops +} // namespace vx +} // namespace tim diff --git a/src/tim/vx/ops/simple_operations.cc b/src/tim/vx/ops/simple_operations.cc new file mode 100644 index 0000000..558dc33 --- /dev/null +++ b/src/tim/vx/ops/simple_operations.cc @@ -0,0 +1,42 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include "tim/vx/ops/simple_operations.h" + +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +#define DEFINE_SIMPLE_OP(NAME, VSI_OP_CODE) \ + NAME::NAME(Graph* graph) : Operation(graph, VSI_OP_CODE) {} + +DEFINE_SIMPLE_OP(DataConvert, VSI_NN_OP_DATACONVERT); +DEFINE_SIMPLE_OP(Neg, VSI_NN_OP_NEG); + +#undef DEFINE_SIMPLE_OP + +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/ops/softmax.cc b/src/tim/vx/ops/softmax.cc new file mode 100644 index 0000000..53c935c --- /dev/null +++ b/src/tim/vx/ops/softmax.cc @@ -0,0 +1,41 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include "tim/vx/ops/softmax.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +Softmax::Softmax(Graph* graph, float beta, int32_t axis) + : Operation(graph, VSI_NN_OP_SOFTMAX), beta_(beta), axis_(axis) { + this->impl()->node()->nn_param.softmax.beta = beta_; + this->impl()->node()->nn_param.softmax.axis = axis_; +} + +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/ops/space2batch.cc b/src/tim/vx/ops/space2batch.cc new file mode 100644 index 0000000..2c990b9 --- /dev/null +++ b/src/tim/vx/ops/space2batch.cc @@ -0,0 +1,48 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include "tim/vx/ops/space2batch.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +Space2Batch::Space2Batch(Graph* graph, const std::vector& block_size, + const std::vector& pad) + : Operation(graph, VSI_NN_OP_SPACE2BATCH), + block_size_(block_size), + pad_(pad) { + this->impl()->node()->nn_param.space2batch.block_size = block_size_.data(); + this->impl()->node()->nn_param.space2batch.block_size_num = block_size.size(); + // the size of pad_ should be 4. + for (size_t i = 0; i < pad_.size(); i++) { + this->impl()->node()->nn_param.space2batch.pad[i] = pad_[i]; + } +} + +} // namespace ops +} // namespace vx +} // namespace tim diff --git a/src/tim/vx/ops/space2depth.cc b/src/tim/vx/ops/space2depth.cc new file mode 100644 index 0000000..5009ef8 --- /dev/null +++ b/src/tim/vx/ops/space2depth.cc @@ -0,0 +1,41 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include "tim/vx/ops/space2depth.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { + +namespace ops { + +SpaceToDepth::SpaceToDepth(Graph* graph, std::vector block_size) + : Operation(graph, VSI_NN_OP_SPACE2DEPTH), block_size_(block_size) { + this->impl()->node()->nn_param.space2depth.block_size[0] = block_size_[0]; + this->impl()->node()->nn_param.space2depth.block_size[1] = block_size_[1]; +} +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/ops/split.cc b/src/tim/vx/ops/split.cc new file mode 100644 index 0000000..900ba29 --- /dev/null +++ b/src/tim/vx/ops/split.cc @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include "tim/vx/ops/split.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +Split::Split(Graph* graph, uint32_t axis, std::vector slices) + : Operation(graph, VSI_NN_OP_SPLIT, 1, slices.size()), + axis_(axis), + slices_(std::move(slices)) { + this->impl()->node()->nn_param.split.axis = axis_; + this->impl()->node()->nn_param.split.slices = slices_.data(); + this->impl()->node()->nn_param.split.slices_num = slices_.size(); +} + +} // namespace ops +} // namespace vx +} // namespace tim diff --git a/src/tim/vx/ops/stridedslice.cc b/src/tim/vx/ops/stridedslice.cc new file mode 100644 index 0000000..0068e81 --- /dev/null +++ b/src/tim/vx/ops/stridedslice.cc @@ -0,0 +1,62 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include "tim/vx/ops/stridedslice.h" + +#include "operation_private.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +namespace ops { + +StridedSlice::StridedSlice(Graph* graph, const std::vector begin_dims, + const std::vector end_dims, + const std::vector stride_dims, + int32_t begin_mask, int32_t end_mask, + int32_t shrink_axis_mask) + : Operation(graph, VSI_NN_OP_STRIDED_SLICE), + begin_dims_(std::move(begin_dims)), + end_dims_(std::move(end_dims)), + stride_dims_(std::move(stride_dims)), + begin_mask_(begin_mask), + end_mask_(end_mask), + shrink_axis_mask_(shrink_axis_mask) { + this->impl()->node()->nn_param.strided_slice.begin_mask = begin_mask_; + this->impl()->node()->nn_param.strided_slice.end_mask = end_mask_; + this->impl()->node()->nn_param.strided_slice.shrink_axis_mask = + shrink_axis_mask_; + this->impl()->node()->nn_param.strided_slice.begin_dims = begin_dims_.data(); + this->impl()->node()->nn_param.strided_slice.begin_dims_num = + begin_dims_.size(); + this->impl()->node()->nn_param.strided_slice.end_dims = end_dims_.data(); + this->impl()->node()->nn_param.strided_slice.end_dims_num = end_dims_.size(); + this->impl()->node()->nn_param.strided_slice.stride_dims = + stride_dims_.data(); + this->impl()->node()->nn_param.strided_slice.stride_dims_num = + stride_dims_.size(); +} + +} // namespace ops +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/tensor.cc b/src/tim/vx/tensor.cc new file mode 100644 index 0000000..c59e5ba --- /dev/null +++ b/src/tim/vx/tensor.cc @@ -0,0 +1,208 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
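A hedged sketch of the StridedSlice wrapper above, using illustrative begin/end/stride values for a 2-D tensor and leaving every mask at zero:

```cpp
// Sketch only: take [1, 3) along dim 0 and [0, 4) along dim 1 of an
// illustrative 2-D tensor with unit strides; all masks are left at 0.
#include <memory>

#include "tim/vx/graph.h"
#include "tim/vx/ops/stridedslice.h"
#include "tim/vx/tensor.h"

void AttachSlice(tim::vx::Graph* graph,
                 const std::shared_ptr<tim::vx::Tensor>& in,
                 const std::shared_ptr<tim::vx::Tensor>& out) {
  tim::vx::ops::StridedSlice slice(graph,
                                   /*begin_dims=*/{1, 0},
                                   /*end_dims=*/{3, 4},
                                   /*stride_dims=*/{1, 1},
                                   /*begin_mask=*/0, /*end_mask=*/0,
                                   /*shrink_axis_mask=*/0);
  slice.BindInput(in).BindOutput(out);
}
```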
+* +*****************************************************************************/ +#include "tim/vx/tensor.h" + +#include + +#include + +#include "graph_private.h" +#include "tensor_private.h" +#include "tim/vx/graph.h" +#include "type_utils.h" +#include "vsi_nn_pub.h" + +namespace { + +void PackTensorDtype(tim::vx::TensorSpec& spec, vsi_nn_dtype_t* dtype) { + dtype->vx_type = TranslateDataType(spec.datatype_); + dtype->qnt_type = TranslateQuantType(spec.quantization_.Type()); + switch (spec.quantization_.Type()) { + case tim::vx::QuantType::NONE: + break; + case tim::vx::QuantType::ASYMMETRIC: + dtype->scale = spec.quantization_.Scales()[0]; + dtype->zero_point = spec.quantization_.ZeroPoints()[0]; + //note:temporarily ignore the Uint8 weight case in conv. + // if (dtype->vx_type == VSI_NN_TYPE_UINT8 && dtype->zero_point == 0) { + // dtype->vx_type = VSI_NN_TYPE_INT8; + // } + break; + case tim::vx::QuantType::SYMMETRIC_PER_CHANNEL: { + dtype->scales = spec.quantization_.Scales().data(); + dtype->scale_dim = spec.quantization_.ZeroPoints().size(); +#if (VSI_NN_VERSION_MAJOR == 1 && VSI_NN_VERSION_MINOR == 1 && \ + VSI_NN_VERSION_PATCH <= 18) + { + std::vector zps(spec.quantization_.ZeroPoints().size()); + std::transform(spec.quantization_.ZeroPoints().begin(), + spec.quantization_.ZeroPoints().end(), zps.begin(), + [](const int& it) { return static_cast(it); }); + dtype->zero_points = zps.data(); + } +#else + dtype->zero_points = spec.quantization_.ZeroPoints().data(); +#endif + dtype->zero_points_dim = spec.quantization_.ZeroPoints().size(); + dtype->channel_dim = spec.quantization_.ChannelDim(); + break; + } + default: + break; + } +} + +} // namespace +namespace tim { +namespace vx { + +TensorImpl::TensorImpl(Graph* graph, const TensorSpec& spec, const void* data) + : graph_(reinterpret_cast(graph)), + id_(VSI_NN_TENSOR_ID_NA), + spec_(spec), + data_(data) { + Init(); +} + +TensorImpl::~TensorImpl() {} + +bool TensorImpl::CopyDataToTensor(const void* data, uint32_t size) { + if (!IsWriteable()) { + return false; + } + + bool retn = true; + if (data && VSI_NN_TENSOR_ID_NA != id_) { + retn = false; + vsi_nn_tensor_t* tensor = vsi_nn_GetTensor(graph_->graph(), id_); + if (tensor) { + /* + argument `data` of vsi_nn_CopyDataToTensor is non-const + convert it from const data to non-const, will be fixed in ovxlib + */ + uint32_t tensor_bytes = vsi_nn_GetTensorSize( + tensor->attr.size, tensor->attr.dim_num, tensor->attr.dtype.vx_type); + const uint8_t* end = static_cast(data) + tensor_bytes; + std::vector data_copy(static_cast(data), end); + + retn = VSI_SUCCESS == + vsi_nn_CopyDataToTensor(graph_->graph(), tensor, data_copy.data()); + } + } + return retn; +} + +bool TensorImpl::CopyDataFromTensor(void* data) { + if (!IsReadable()) { + return false; + } + + bool retn = true; + if (data && VSI_NN_TENSOR_ID_NA != id_) { + retn = false; + vsi_nn_tensor_t* tensor = vsi_nn_GetTensor(graph_->graph(), id_); + if (tensor) { + if (tensor->attr.is_created_from_handle) { + void* old_ptr = NULL; + // TODO(jiangbo): current ovxlib didn't wrap this API + // use driver API directly + vxSwapTensorHandle(tensor->t, NULL, &old_ptr); + if (old_ptr) { + uint32_t tensor_bytes = + vsi_nn_GetTensorSize(tensor->attr.size, tensor->attr.dim_num, + tensor->attr.dtype.vx_type); + + memcpy(data, old_ptr, tensor_bytes); + retn = true; + } + } else { + vsi_nn_CopyTensorToBuffer(graph_->graph(), tensor, + static_cast(data)); + retn = true; + } + } + } + return retn; +} + +bool TensorImpl::Init() { + vsi_nn_tensor_attr_t 
attr; + + memset(&attr, 0x00, sizeof(attr)); + attr.dim_num = spec_.shape_.size(); + attr.is_const = FALSE; + attr.vtl = FALSE; + switch (spec_.attr_) { + case TensorAttribute::CONSTANT: + attr.is_const = TRUE; + break; + case TensorAttribute::TRANSIENT: + attr.vtl = TRUE; + break; + default: + break; + } + + for (ShapeType::size_type i = 0; i < spec_.shape_.size(); i++) { + attr.size[i] = spec_.shape_[i]; + } + + PackTensorDtype(spec_, &attr.dtype); + + if (spec_.attr_ == TensorAttribute::INPUT || + spec_.attr_ == TensorAttribute::OUTPUT) { + id_ = vsi_nn_AddTensorFromHandle(graph_->graph(), VSI_NN_TENSOR_ID_AUTO, + &attr, nullptr); + } else { + id_ = vsi_nn_AddTensor(graph_->graph(), VSI_NN_TENSOR_ID_AUTO, &attr, + nullptr); + } + + if (VSI_NN_TENSOR_ID_NA == id_) { + VSILOGE("Create tensor fail!"); + return false; + } + + if (data_) { + if (!CopyDataToTensor(data_, 0)) { + VSILOGE("Copy data to tensor fail!"); + return false; + } + } + + return true; +} + +uint32_t TensorImpl::GetId() { return id_; } + +bool TensorImpl::IsWriteable() { + return spec_.attr_ != TensorAttribute::TRANSIENT; +} + +bool TensorImpl::IsReadable() { + return spec_.attr_ != TensorAttribute::TRANSIENT; +} + +} // namespace vx +} // namespace tim \ No newline at end of file diff --git a/src/tim/vx/tensor_private.h b/src/tim/vx/tensor_private.h new file mode 100644 index 0000000..b021281 --- /dev/null +++ b/src/tim/vx/tensor_private.h @@ -0,0 +1,84 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef TIM_VX_TENSOR_PRIVATE_H_ +#define TIM_VX_TENSOR_PRIVATE_H_ +#include "graph_private.h" +#include "tim/vx/tensor.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { + +class TensorImpl : public Tensor { + public: + TensorImpl(Graph* graph, const TensorSpec& spec, const void* data = nullptr); + ~TensorImpl(); + + bool Init(); + bool IsWriteable(); + bool IsReadable(); + + const ShapeType& GetShape() { return spec_.shape_; } + DataType GetDataType() { return spec_.datatype_; } + const Quantization& GetQuantization() { return spec_.quantization_; } + const TensorSpec& GetSpec() { return spec_; } + uint32_t GetId(); + bool CopyDataToTensor(const void* data, uint32_t size = 0); + bool CopyDataFromTensor(void* data); + bool IsPlaceHolder() { return false; } + bool IsConstTensor() { + return spec_.attr_ == tim::vx::TensorAttribute::CONSTANT; + } + + GraphImpl* graph_; + vsi_nn_tensor_id_t id_; + TensorSpec spec_; + const void* data_; +}; + +class TensorPlaceholder : public Tensor { + public: + TensorPlaceholder(Graph* graph) : id_(VSI_NN_TENSOR_ID_NA) {} + ~TensorPlaceholder(){}; + + const ShapeType& GetShape() { return spec_.shape_; } + DataType GetDataType() { return spec_.datatype_; } + const Quantization& GetQuantization() { return spec_.quantization_; } + const TensorSpec& GetSpec() { return spec_; } + uint32_t GetId() { return id_; }; + bool CopyDataToTensor(const void* data, uint32_t size = 0) { return false; } + bool CopyDataFromTensor(void* data) { return false; } + bool IsPlaceHolder() { return true; } + bool IsConstTensor() { + return spec_.attr_ == tim::vx::TensorAttribute::CONSTANT; + } + + vsi_nn_tensor_id_t id_; + TensorSpec spec_; +}; + +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_TENSOR_PRIVATE_H_ */ \ No newline at end of file diff --git a/src/tim/vx/type_utils.cc b/src/tim/vx/type_utils.cc new file mode 100644 index 0000000..21077fe --- /dev/null +++ b/src/tim/vx/type_utils.cc @@ -0,0 +1,162 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include "type_utils.h" + +namespace tim { +namespace vx { +vsi_nn_type_e TranslateDataType(DataType dtype) { + switch (dtype) { + case DataType::INT8: + return VSI_NN_TYPE_INT8; + case DataType::UINT8: + return VSI_NN_TYPE_UINT8; + case DataType::INT16: + return VSI_NN_TYPE_INT16; + case DataType::UINT16: + return VSI_NN_TYPE_UINT16; + case DataType::INT32: + return VSI_NN_TYPE_INT32; + case DataType::UINT32: + return VSI_NN_TYPE_UINT32; + case DataType::FLOAT16: + return VSI_NN_TYPE_FLOAT16; + case DataType::FLOAT32: + return VSI_NN_TYPE_FLOAT32; + default: + break; + } + return VSI_NN_TYPE_FLOAT16; +} + +vsi_nn_qnt_type_e TranslateQuantType(QuantType qtype) { + switch (qtype) { + case QuantType::NONE: + return VSI_NN_QNT_TYPE_NONE; + case QuantType::ASYMMETRIC: + return VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; + case QuantType::SYMMETRIC_PER_CHANNEL: + return VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC; + default: + break; + } + return VSI_NN_QNT_TYPE_NONE; +} + +vsi_nn_pad_e TranslatePadType(PadType pad) { + switch (pad) { + case PadType::AUTO: + return VSI_NN_PAD_AUTO; + case PadType::VALID: + return VSI_NN_PAD_VALID; + case PadType::SAME: + return VSI_NN_PAD_SAME; + + default: + break; + } + return VSI_NN_PAD_AUTO; +} + +vsi_enum TranslatePoolType(PoolType type) { + switch (type) { + case PoolType::MAX: + return VX_CONVOLUTIONAL_NETWORK_POOLING_MAX; + case PoolType::AVG: + return VX_CONVOLUTIONAL_NETWORK_POOLING_AVG; + case PoolType::L2: + return VX_CONVOLUTIONAL_NETWORK_POOLING_L2; + case PoolType::AVG_ANDROID: + return VX_CONVOLUTIONAL_NETWORK_POOLING_AVG_ANDROID; + + default: + break; + } + return VX_CONVOLUTIONAL_NETWORK_POOLING_MAX; +} + +vsi_nn_round_type_e TranslateRoundType(RoundType type) { + switch (type) { + case RoundType::CEILING: + return VSI_NN_ROUND_CEIL; + case RoundType::FLOOR: + return VSI_NN_ROUND_FLOOR; + + default: + break; + } + return VSI_NN_ROUND_CEIL; +} + +vsi_enum TranslateOverflowPolicy(OverflowPolicy type) { + switch (type) { + case OverflowPolicy::WRAP: + return VX_CONVERT_POLICY_WRAP; + case OverflowPolicy::SATURATE: + return VX_CONVERT_POLICY_SATURATE; + default: + break; + } + return VX_CONVERT_POLICY_SATURATE; +} + +vsi_enum TranslateRoundingPolicy(RoundingPolicy type) { + switch (type) { + case RoundingPolicy::TO_ZERO: + return VX_ROUND_POLICY_TO_ZERO; + case RoundingPolicy::RTNE: + return VX_ROUND_POLICY_TO_NEAREST_EVEN; + default: + break; + } + return VX_ROUND_POLICY_TO_ZERO; +} + +vsi_enum TranslateDownScaleSizeRounding(DownScaleSizeRounding type) { + switch (type) { + case DownScaleSizeRounding::FLOOR: + return VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR; + case DownScaleSizeRounding::CEILING: + return VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_CEILING; + default: + break; + } + return VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR; +} + +vsi_enum TranslateResizeType(ResizeType type) { + switch (type) { + case ResizeType::NEAREST_NEIGHBOR: + return VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR; + case ResizeType::BILINEAR: + return VSI_NN_INTERPOLATION_BILINEAR; + case ResizeType::AREA: + return VSI_NN_INTERPOLATION_AREA; + } + return VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR; +} + +vx_bool_e ToVxBool(bool val) { return val ? 
vx_true_e : vx_false_e; } + +} // namespace vx +} // namespace tim diff --git a/src/tim/vx/type_utils.h b/src/tim/vx/type_utils.h new file mode 100644 index 0000000..140e1e2 --- /dev/null +++ b/src/tim/vx/type_utils.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_TYPE_UTILS_H_ +#define TIM_VX_TYPE_UTILS_H_ + +#include "tim/vx/types.h" +#include "vsi_nn_pub.h" + +namespace tim { +namespace vx { +vsi_nn_type_e TranslateDataType(DataType dtype); +vsi_nn_qnt_type_e TranslateQuantType(QuantType qtype); +vsi_nn_pad_e TranslatePadType(PadType pad); +vsi_enum TranslatePoolType(PoolType type); +vsi_nn_round_type_e TranslateRoundType(RoundType type); +vsi_enum TranslateOverflowPolicy(OverflowPolicy type); +vsi_enum TranslateRoundingPolicy(RoundingPolicy type); +vsi_enum TranslateDownScaleSizeRounding(DownScaleSizeRounding type); +vsi_enum TranslateResizeType(ResizeType type); +vx_bool_e ToVxBool(bool val); +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_TYPE_UTILS_H_ */ diff --git a/toolchains/BUILD.bazel b/toolchains/BUILD.bazel new file mode 100644 index 0000000..5979947 --- /dev/null +++ b/toolchains/BUILD.bazel @@ -0,0 +1 @@ +package(default_visibility = ['//visibility:public']) diff --git a/toolchains/WORKSPACE b/toolchains/WORKSPACE new file mode 100644 index 0000000..711b03b --- /dev/null +++ b/toolchains/WORKSPACE @@ -0,0 +1 @@ +workspace(name = "TOOLCHAINS") \ No newline at end of file diff --git a/toolchains/cc_toolchain_base.bzl b/toolchains/cc_toolchain_base.bzl new file mode 100644 index 0000000..0a555a5 --- /dev/null +++ b/toolchains/cc_toolchain_base.bzl @@ -0,0 +1,119 @@ +# +# Vivante Cross Toolchain configuration +# +load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES") +load("@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl", + "tool_path", + "feature", + "with_feature_set", + "flag_group", + "flag_set") + +all_compile_actions = [ + ACTION_NAMES.assemble, + ACTION_NAMES.preprocess_assemble, + ACTION_NAMES.linkstamp_compile, + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.cpp_module_compile, + ACTION_NAMES.cpp_module_codegen, + ACTION_NAMES.lto_backend, + ACTION_NAMES.clif_match, +] + +all_cpp_compile_actions = [ + ACTION_NAMES.linkstamp_compile, + 
ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.cpp_module_compile, + ACTION_NAMES.cpp_module_codegen, + ACTION_NAMES.clif_match, +] + +all_link_actions = [ + ACTION_NAMES.cpp_link_executable, + ACTION_NAMES.cpp_link_dynamic_library, + ACTION_NAMES.cpp_link_nodeps_dynamic_library, +] + +def build_cc_toolchain_config(impl): + return rule( + implementation = impl, + attrs = { + "toolchain_name": attr.string(mandatory=True), + "target_cpu": attr.string(mandatory=True), + "compiler": attr.string(mandatory=True), + }, + provides = [CcToolchainConfigInfo], + ) + +# construct tool path for local wrappers +def toolchain_tool_path(toolchain_name, tool_name): + return "//{toolchain_name}/bin:{tool_name}".format(toolchain_name=toolchain_name, tool_name=tool_name) + +# construct package path for remote archive tools +def toolchain_package_path(toolchain_name, target): + return "@{toolchain_name}//:{target}".format(toolchain_name=toolchain_name, target=target) + +def register_toolchain(toolchain_name, target_cpu, compiler, cc_toolchain_config): + native.filegroup( + name = "all_files", + srcs = [ + toolchain_tool_path(toolchain_name, "tool-wrappers"), + toolchain_package_path(toolchain_name, "compiler_pieces"), + ], + ) + + native.filegroup( + name = "linker_files", + srcs = [ + toolchain_tool_path(toolchain_name, "ar"), + toolchain_tool_path(toolchain_name, "gcc"), + toolchain_tool_path(toolchain_name, "ld"), + toolchain_package_path(toolchain_name, "compiler_pieces"), + ], + ) + + native.filegroup( + name = "compiler_files", + srcs = [ + toolchain_tool_path(toolchain_name, "as"), + toolchain_tool_path(toolchain_name, "gcc"), + toolchain_tool_path(toolchain_name, "ld"), + ], + ) + + native.filegroup( + name = "empty", + srcs = [], + ) + + cc_toolchain_config( + name = toolchain_name + "_config", + toolchain_name = toolchain_name, + target_cpu = target_cpu, + compiler = compiler) + + native.cc_toolchain( + name = "cc-compiler", + toolchain_identifier = toolchain_name, + toolchain_config = toolchain_name + "_config", + + all_files = ":all_files", + compiler_files = ":compiler_files", + dwp_files = ":empty", + linker_files = ":linker_files", + objcopy_files = toolchain_tool_path(toolchain_name, "objcopy"), + strip_files = toolchain_tool_path(toolchain_name, "strip"), + supports_param_files = 1, + ) + + native.cc_toolchain_suite( + name = "toolchain", + toolchains = { + # target_cpu | compiler + "{target_cpu}|{compiler}".format(target_cpu=target_cpu, compiler=compiler) : "cc-compiler", + }, + visibility = ["//visibility:public"], + ) diff --git a/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/BUILD.bazel b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/BUILD.bazel new file mode 100644 index 0000000..6056e1c --- /dev/null +++ b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/BUILD.bazel @@ -0,0 +1,36 @@ +# This is the entry point for --crosstool_top. Toolchains are found +# by lopping off the name of --crosstool_top and searching for +# 'cc-compiler-${CPU}' in this BUILD file, where CPU is the target CPU +# specified in --cpu. 
+# +# Toochain directory structure +# └── +#    ├── toolchain.BUILD +#    ├── BUILD.bazel +#    ├── CROSSTOOL +#    ├── cc_toolchain_config.bzl +#    └── bin +#       ├── BUILD.bazel +#       ├── wrapper-ar +#       ├── wrapper-as +#       ├── wrapper-gcc +#       ├── wrapper-gcov +#       ├── wrapper-ld +#       ├── wrapper-nm +#       ├── wrapper-objcopy +#       ├── wrapper-objdump +#       └── wrapper-strip +# + +load("//:cc_toolchain_base.bzl", "register_toolchain") +load(":cc_toolchain_config.bzl", "cc_toolchain_config") + +# Register toolchain +# will generate a `cc_toolchain_suite` named `toolchain` +# `toolchain_name` must be identical with toolchain directory name and http_archive name in `toolchains.bzl` +register_toolchain( + toolchain_name = "gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu", + target_cpu = "armv8-a", + compiler = "gcc", + cc_toolchain_config = cc_toolchain_config, +) diff --git a/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/BUILD.bazel b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/BUILD.bazel new file mode 100644 index 0000000..21ca3ef --- /dev/null +++ b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/BUILD.bazel @@ -0,0 +1,85 @@ +package(default_visibility = ["//gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu:__pkg__"]) + +filegroup( + name = "srcs", + srcs = glob(["**"]), + visibility = ["//visibility:public"], +) + +filegroup( + name = "gcc", + srcs = [ + "wrapper-gcc", + "@gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu//:gcc", + ], +) + +filegroup( + name = "ar", + srcs = [ + "wrapper-ar", + "@gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu//:ar", + ], +) + +filegroup( + name = "ld", + srcs = [ + "wrapper-ld", + "@gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu//:ld", + ], +) + +filegroup( + name = "nm", + srcs = [ + "wrapper-nm", + "@gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu//:nm", + ], +) + +filegroup( + name = "objcopy", + srcs = [ + "wrapper-objcopy", + "@gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu//:objcopy", + ], +) + +filegroup( + name = "objdump", + srcs = [ + "wrapper-objdump", + "@gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu//:objdump", + ], +) + +filegroup( + name = "strip", + srcs = [ + "wrapper-strip", + "@gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu//:strip", + ], +) + +filegroup( + name = "as", + srcs = [ + "wrapper-as", + "@gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu//:as", + ], +) + +filegroup( + name = "tool-wrappers", + srcs = [ + ":ar", + ":as", + ":gcc", + ":ld", + ":nm", + ":objcopy", + ":objdump", + ":strip", + ], +) diff --git a/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-ar b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-ar new file mode 100755 index 0000000..50c4200 --- /dev/null +++ b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-ar @@ -0,0 +1,5 @@ +#!/bin/bash --norc + +exec -a aarch64-linux-gnu-ar \ + external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-ar \ + "$@" diff --git a/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-as b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-as new file mode 100755 index 0000000..f8a06a5 --- /dev/null +++ b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-as @@ -0,0 +1,5 @@ +#!/bin/bash --norc + +exec -a aarch64-linux-gnu-as \ + external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-as \ 
+ "$@" diff --git a/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-cpp b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-cpp new file mode 100755 index 0000000..288d97b --- /dev/null +++ b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-cpp @@ -0,0 +1,6 @@ +#!/bin/bash --norc + +PATH="external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin:$PATH" \ + exec \ + external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-g++ \ + "$@" diff --git a/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-gcc b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-gcc new file mode 100755 index 0000000..008b745 --- /dev/null +++ b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-gcc @@ -0,0 +1,6 @@ +#!/bin/bash --norc + +PATH="external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin:$PATH" \ + exec \ + external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-gcc \ + "$@" diff --git a/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-gcov b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-gcov new file mode 100755 index 0000000..6c4e8d9 --- /dev/null +++ b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-gcov @@ -0,0 +1,5 @@ +#!/bin/bash --norc + +exec -a aarch64-linux-gnu-gcov \ + external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-gcov \ + "$@" diff --git a/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-ld b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-ld new file mode 100755 index 0000000..3600daf --- /dev/null +++ b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-ld @@ -0,0 +1,5 @@ +#!/bin/bash --norc + +exec -a aarch64-linux-gnu-ld \ + external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-ld \ + "$@" diff --git a/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-nm b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-nm new file mode 100755 index 0000000..ef404cc --- /dev/null +++ b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-nm @@ -0,0 +1,5 @@ +#!/bin/bash --norc + +exec -a aarch64-linux-gnu-nm \ + external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-nm \ + "$@" diff --git a/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-objcopy b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-objcopy new file mode 100755 index 0000000..cc72443 --- /dev/null +++ b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-objcopy @@ -0,0 +1,5 @@ +#!/bin/bash --norc + +exec -a aarch64-linux-gnu-objcopy \ + external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-objcopy \ + "$@" diff --git a/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-objdump b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-objdump new file mode 100755 index 0000000..ca63ec8 --- /dev/null +++ b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-objdump @@ -0,0 +1,5 @@ +#!/bin/bash --norc + +exec -a aarch64-linux-gnu-objdump \ + external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-objdump \ + "$@" diff --git 
a/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-strip b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-strip new file mode 100755 index 0000000..f028324 --- /dev/null +++ b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/wrapper-strip @@ -0,0 +1,5 @@ +#!/bin/bash --norc + +exec -a aarch64-linux-gnu-strip \ + external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/bin/aarch64-linux-gnu-strip \ + "$@" diff --git a/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/cc_toolchain_config.bzl b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/cc_toolchain_config.bzl new file mode 100644 index 0000000..967d318 --- /dev/null +++ b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/cc_toolchain_config.bzl @@ -0,0 +1,188 @@ +# +# Vivante Cross Toolchain configuration +# +load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES") +load("@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl", + "tool_path", + "feature", + "with_feature_set", + "flag_group", + "flag_set") +load("//:cc_toolchain_base.bzl", + "build_cc_toolchain_config", + "all_compile_actions", + "all_cpp_compile_actions", + "all_link_actions") + +tool_paths = [ + tool_path(name = "ar", path = "bin/wrapper-ar",), + tool_path(name = "compat-ld", path = "bin/wrapper-ld",), + tool_path(name = "cpp", path = "bin/wrapper-cpp",), + tool_path(name = "dwp", path = "bin/wrapper-dwp",), + tool_path(name = "gcc", path = "bin/wrapper-gcc",), + tool_path(name = "gcov", path = "bin/wrapper-gcov",), + tool_path(name = "ld", path = "bin/wrapper-ld",), + tool_path(name = "nm", path = "bin/wrapper-nm",), + tool_path(name = "objcopy", path = "bin/wrapper-objcopy",), + tool_path(name = "objdump", path = "bin/wrapper-objdump",), + tool_path(name = "strip", path = "bin/wrapper-strip",), +] + +def _impl(ctx): + builtin_sysroot = "external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/aarch64-linux-gnu/libc" + + compile_flags_feature = feature( + name = "compile_flags", + enabled = True, + flag_sets = [ + flag_set( + actions = all_compile_actions, + flag_groups = [ + flag_group( + flags = [ + "-idirafter", "external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/aarch64-linux-gnu/libc/usr/include", + "-idirafter", "external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/lib/gcc/aarch64-linux-gnu/7.3.1/include", + "-idirafter", "external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/lib/gcc/aarch64-linux-gnu/7.3.1/include-fixed", + "-idirafter", "external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/lib/gcc/aarch64-linux-gnu/7.3.1/install-tools/include", + "-idirafter", "external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/lib/gcc/aarch64-linux-gnu/7.3.1/plugin/include", + ], + ), + flag_group( + flags = [ + "-D__arm64", + "-Wall", # All warnings are enabled. + "-Wunused-but-set-parameter", # Enable a few more warnings that aren't part of -Wall. + "-Wno-free-nonheap-object", # Disable some that are problematic, has false positives + "-fno-omit-frame-pointer", # Keep stack frames for debugging, even in opt mode. 
+ "-no-canonical-prefixes", + "-fstack-protector", + "-fPIE", + "-fPIC", + ], + ), + ], + ), + flag_set( + actions = all_cpp_compile_actions + [ACTION_NAMES.lto_backend], + flag_groups = [ + flag_group( + flags = [ + "-isystem" , "external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/aarch64-linux-gnu/include/c++/7.3.1", + "-isystem" , "external/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/aarch64-linux-gnu/include/c++/7.3.1/aarch64-linux-gnu", + ] + ), + ], + ), + flag_set( + actions = all_compile_actions, + flag_groups = [ + flag_group( + flags = [ + "-g", + ], + ), + ], + with_features = [with_feature_set(features = ["dbg"])], + ), + flag_set( + actions = all_compile_actions, + flag_groups = [ + flag_group( + flags = [ + "-g0", + "-O2", + "-DNDEBUG", + "-ffunction-sections", + "-fdata-sections", + ], + ), + ], + with_features = [with_feature_set(features = ["opt"])], + ), + flag_set( + actions = all_link_actions, + flag_groups = [ + flag_group( + flags = [ + "-lstdc++", + ], + ), + ], + ), + flag_set( + actions = all_link_actions, + flag_groups = [ + flag_group( + flags = [ + "-Wl,--gc-sections", + ], + ), + ], + with_features = [with_feature_set(features = ["opt"])], + ), + flag_set( + actions = [ + ACTION_NAMES.cpp_link_executable, + ], + flag_groups = [ + flag_group( + flags = [ + "-pie", + ], + ), + ], + ), + ], + ) + + cxx_builtin_include_directories = [ + "%package(@gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu//aarch64-linux-gnu/libc/usr/include)%", + "%package(@gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu//lib/gcc/aarch64-linux-gnu/7.3.1/include)%", + "%package(@gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu//lib/gcc/aarch64-linux-gnu/7.3.1/include-fixed)%", + "%package(@gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu//lib/gcc/aarch64-linux-gnu/7.3.1/install-tools/include)%", + "%package(@gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu//lib/gcc/aarch64-linux-gnu/7.3.1/plugin/include)%", + "%package(@gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu//aarch64-linux-gnu/include/c++/7.3.1)%", + "%package(@gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu//aarch64-linux-gnu/include/c++/7.3.1/aarch64-linux-gnu)%", + ] + + objcopy_embed_flags_feature = feature( + name = "objcopy_embed_flags", + enabled = True, + flag_sets = [ + flag_set( + actions = ["objcopy_embed_data"], + flag_groups = [ + flag_group( + flags = [ + "-I", + "binary", + ], + ), + ], + ), + ] + ) + + dbg_feature = feature(name = "dbg") + opt_feature = feature(name = "opt") + + return cc_common.create_cc_toolchain_config_info( + ctx = ctx, + toolchain_identifier = ctx.attr.toolchain_name, + host_system_name = "", + target_system_name = "linux", + target_cpu = ctx.attr.target_cpu, + target_libc = ctx.attr.target_cpu, + compiler = ctx.attr.compiler, + abi_version = ctx.attr.compiler, + abi_libc_version = ctx.attr.compiler, + tool_paths = tool_paths, + features = [compile_flags_feature, objcopy_embed_flags_feature, dbg_feature, opt_feature], + cxx_builtin_include_directories = cxx_builtin_include_directories, + builtin_sysroot = builtin_sysroot, + ) + +# DON'T MODIFY +cc_toolchain_config = build_cc_toolchain_config(_impl) + +# EOF diff --git a/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/toolchain.BUILD b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/toolchain.BUILD new file mode 100644 index 0000000..8a4d59c --- /dev/null +++ b/toolchains/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu/toolchain.BUILD @@ -0,0 +1,82 @@ +package(default_visibility = 
["//visibility:public"]) + +filegroup( + name = "gcc", + srcs = [ + "bin/aarch64-linux-gnu-gcc", + ], +) + +filegroup( + name = "ar", + srcs = [ + "bin/aarch64-linux-gnu-ar", + ], +) + +filegroup( + name = "ld", + srcs = [ + "bin/aarch64-linux-gnu-ld", + ], +) + +filegroup( + name = "nm", + srcs = [ + "bin/aarch64-linux-gnu-nm", + ], +) + +filegroup( + name = "objcopy", + srcs = [ + "bin/aarch64-linux-gnu-objcopy", + ], +) + +filegroup( + name = "objdump", + srcs = [ + "bin/aarch64-linux-gnu-objdump", + ], +) + +filegroup( + name = "strip", + srcs = [ + "bin/aarch64-linux-gnu-strip", + ], +) + +filegroup( + name = "as", + srcs = [ + "bin/aarch64-linux-gnu-as", + ], +) + +filegroup( + name = "compiler_pieces", + srcs = glob([ + "aarch64-linux-gnu/**", + "libexec/**", + "lib/gcc/aarch64-linux-gnu/**", + "include/**", + ]), +) + +filegroup( + name = "compiler_components", + srcs = [ + ":ar", + ":as", + ":gcc", + ":ld", + ":nm", + ":objcopy", + ":objdump", + ":strip", + ], +) + diff --git a/toolchains/toolchains.bzl b/toolchains/toolchains.bzl new file mode 100644 index 0000000..064e081 --- /dev/null +++ b/toolchains/toolchains.bzl @@ -0,0 +1,12 @@ +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +def init_toolchains(name='TOOLCHAINS'): + http_archive( + name = "gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu", + build_file = "@TOOLCHAINS//gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu:toolchain.BUILD", + sha256 = "73eed74e593e2267504efbcf3678918bb22409ab7afa3dc7c135d2c6790c2345", + strip_prefix = "gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu", + urls = [ + "https://cnbj1.fds.api.xiaomi.com/mace/third-party/gcc-linaro/gcc-linaro-7.3.1-2018.05-x86_64_aarch64-linux-gnu.tar.xz", + ], + )